Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/fid.h | 15
-rw-r--r--  fs/9p/v9fs.c | 57
-rw-r--r--  fs/9p/v9fs.h | 85
-rw-r--r--  fs/9p/v9fs_vfs.h | 2
-rw-r--r--  fs/9p/vfs_addr.c | 2
-rw-r--r--  fs/9p/vfs_dir.c | 2
-rw-r--r--  fs/9p/vfs_file.c | 15
-rw-r--r--  fs/9p/vfs_inode.c | 66
-rw-r--r--  fs/9p/vfs_super.c | 35
-rw-r--r--  fs/Kconfig | 4
-rw-r--r--  fs/Kconfig.binfmt | 2
-rw-r--r--  fs/Makefile | 1
-rw-r--r--  fs/afs/callback.c | 2
-rw-r--r--  fs/afs/inode.c | 2
-rw-r--r--  fs/afs/super.c | 2
-rw-r--r--  fs/aio.c | 4
-rw-r--r--  fs/befs/endian.h | 2
-rw-r--r--  fs/binfmt_elf.c | 12
-rw-r--r--  fs/binfmt_elf_fdpic.c | 2
-rw-r--r--  fs/binfmt_flat.c | 8
-rw-r--r--  fs/bio-integrity.c | 719
-rw-r--r--  fs/bio.c | 88
-rw-r--r--  fs/block_dev.c | 16
-rw-r--r--  fs/buffer.c | 32
-rw-r--r--  fs/char_dev.c | 7
-rw-r--r--  fs/cifs/AUTHORS | 1
-rw-r--r--  fs/cifs/CHANGES | 10
-rw-r--r--  fs/cifs/README | 5
-rw-r--r--  fs/cifs/TODO | 15
-rw-r--r--  fs/cifs/asn1.c | 14
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 49
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 1
-rw-r--r--  fs/cifs/cifs_spnego.c | 3
-rw-r--r--  fs/cifs/cifsacl.c | 10
-rw-r--r--  fs/cifs/cifsfs.c | 67
-rw-r--r--  fs/cifs/cifsfs.h | 3
-rw-r--r--  fs/cifs/cifsglob.h | 4
-rw-r--r--  fs/cifs/cifspdu.h | 48
-rw-r--r--  fs/cifs/cifsproto.h | 13
-rw-r--r--  fs/cifs/cifssmb.c | 339
-rw-r--r--  fs/cifs/connect.c | 73
-rw-r--r--  fs/cifs/dir.c | 34
-rw-r--r--  fs/cifs/dns_resolve.c | 9
-rw-r--r--  fs/cifs/file.c | 13
-rw-r--r--  fs/cifs/inode.c | 573
-rw-r--r--  fs/cifs/ioctl.c | 4
-rw-r--r--  fs/cifs/link.c | 43
-rw-r--r--  fs/cifs/misc.c | 3
-rw-r--r--  fs/cifs/netmisc.c | 6
-rw-r--r--  fs/cifs/ntlmssp.h | 4
-rw-r--r--  fs/cifs/readdir.c | 84
-rw-r--r--  fs/compat.c | 4
-rw-r--r--  fs/dcache.c | 68
-rw-r--r--  fs/dlm/lowcomms.c | 27
-rw-r--r--  fs/dlm/netlink.c | 2
-rw-r--r--  fs/dlm/plock.c | 2
-rw-r--r--  fs/dlm/user.c | 9
-rw-r--r--  fs/dquot.c | 10
-rw-r--r--  fs/ecryptfs/crypto.c | 2
-rw-r--r--  fs/ecryptfs/ecryptfs_kernel.h | 2
-rw-r--r--  fs/ecryptfs/file.c | 3
-rw-r--r--  fs/ecryptfs/inode.c | 13
-rw-r--r--  fs/ecryptfs/miscdev.c | 30
-rw-r--r--  fs/ecryptfs/read_write.c | 22
-rw-r--r--  fs/exec.c | 25
-rw-r--r--  fs/ext3/resize.c | 3
-rw-r--r--  fs/ext3/super.c | 4
-rw-r--r--  fs/ext3/xattr.c | 5
-rw-r--r--  fs/ext4/balloc.c | 295
-rw-r--r--  fs/ext4/dir.c | 17
-rw-r--r--  fs/ext4/ext4.h | 61
-rw-r--r--  fs/ext4/ext4_extents.h | 1
-rw-r--r--  fs/ext4/ext4_i.h | 10
-rw-r--r--  fs/ext4/ext4_jbd2.h | 21
-rw-r--r--  fs/ext4/ext4_sb.h | 5
-rw-r--r--  fs/ext4/extents.c | 111
-rw-r--r--  fs/ext4/file.c | 20
-rw-r--r--  fs/ext4/fsync.c | 4
-rw-r--r--  fs/ext4/group.h | 2
-rw-r--r--  fs/ext4/ialloc.c | 113
-rw-r--r--  fs/ext4/inode.c | 1591
-rw-r--r--  fs/ext4/mballoc.c | 531
-rw-r--r--  fs/ext4/namei.c | 45
-rw-r--r--  fs/ext4/resize.c | 58
-rw-r--r--  fs/ext4/super.c | 260
-rw-r--r--  fs/ext4/xattr.c | 7
-rw-r--r--  fs/ext4/xattr_trusted.c | 4
-rw-r--r--  fs/ext4/xattr_user.c | 4
-rw-r--r--  fs/fat/cache.c | 2
-rw-r--r--  fs/fat/dir.c | 4
-rw-r--r--  fs/fat/file.c | 50
-rw-r--r--  fs/fat/inode.c | 26
-rw-r--r--  fs/fcntl.c | 3
-rw-r--r--  fs/file.c | 152
-rw-r--r--  fs/fuse/file.c | 2
-rw-r--r--  fs/fuse/fuse_i.h | 3
-rw-r--r--  fs/fuse/inode.c | 16
-rw-r--r--  fs/gfs2/Kconfig | 18
-rw-r--r--  fs/gfs2/Makefile | 1
-rw-r--r--  fs/gfs2/bmap.c | 23
-rw-r--r--  fs/gfs2/gfs2.h | 5
-rw-r--r--  fs/gfs2/glock.c | 1643
-rw-r--r--  fs/gfs2/glock.h | 11
-rw-r--r--  fs/gfs2/glops.c | 72
-rw-r--r--  fs/gfs2/incore.h | 39
-rw-r--r--  fs/gfs2/inode.c | 21
-rw-r--r--  fs/gfs2/inode.h | 2
-rw-r--r--  fs/gfs2/locking.c | 52
-rw-r--r--  fs/gfs2/locking/dlm/lock.c | 368
-rw-r--r--  fs/gfs2/locking/dlm/lock_dlm.h | 18
-rw-r--r--  fs/gfs2/locking/dlm/mount.c | 14
-rw-r--r--  fs/gfs2/locking/dlm/sysfs.c | 13
-rw-r--r--  fs/gfs2/locking/dlm/thread.c | 331
-rw-r--r--  fs/gfs2/locking/nolock/Makefile | 3
-rw-r--r--  fs/gfs2/locking/nolock/main.c | 238
-rw-r--r--  fs/gfs2/log.c | 2
-rw-r--r--  fs/gfs2/log.h | 2
-rw-r--r--  fs/gfs2/main.c | 2
-rw-r--r--  fs/gfs2/meta_io.c | 20
-rw-r--r--  fs/gfs2/meta_io.h | 1
-rw-r--r--  fs/gfs2/ops_address.c | 40
-rw-r--r--  fs/gfs2/ops_file.c | 42
-rw-r--r--  fs/gfs2/ops_fstype.c | 12
-rw-r--r--  fs/gfs2/ops_inode.c | 25
-rw-r--r--  fs/gfs2/ops_super.c | 20
-rw-r--r--  fs/gfs2/quota.c | 2
-rw-r--r--  fs/gfs2/recovery.c | 5
-rw-r--r--  fs/gfs2/rgrp.c | 112
-rw-r--r--  fs/gfs2/super.c | 4
-rw-r--r--  fs/gfs2/sys.c | 16
-rw-r--r--  fs/hfsplus/inode.c | 2
-rw-r--r--  fs/hppfs/Makefile | 5
-rw-r--r--  fs/hppfs/hppfs.c (renamed from fs/hppfs/hppfs_kern.c) | 82
-rw-r--r--  fs/jbd/commit.c | 2
-rw-r--r--  fs/jbd2/checkpoint.c | 1
-rw-r--r--  fs/jbd2/commit.c | 297
-rw-r--r--  fs/jbd2/journal.c | 57
-rw-r--r--  fs/jbd2/recovery.c | 12
-rw-r--r--  fs/jbd2/transaction.c | 365
-rw-r--r--  fs/libfs.c | 46
-rw-r--r--  fs/locks.c | 6
-rw-r--r--  fs/mpage.c | 14
-rw-r--r--  fs/msdos/namei.c | 35
-rw-r--r--  fs/namei.c | 38
-rw-r--r--  fs/namespace.c | 14
-rw-r--r--  fs/ncpfs/file.c | 12
-rw-r--r--  fs/nfs/callback.c | 2
-rw-r--r--  fs/nfs/callback_proc.c | 4
-rw-r--r--  fs/nfs/callback_xdr.c | 40
-rw-r--r--  fs/nfs/client.c | 8
-rw-r--r--  fs/nfs/delegation.c | 4
-rw-r--r--  fs/nfs/dir.c | 20
-rw-r--r--  fs/nfs/file.c | 8
-rw-r--r--  fs/nfs/inode.c | 14
-rw-r--r--  fs/nfs/mount_clnt.c | 5
-rw-r--r--  fs/nfs/namespace.c | 11
-rw-r--r--  fs/nfs/nfs3proc.c | 6
-rw-r--r--  fs/nfs/nfs4_fs.h | 1
-rw-r--r--  fs/nfs/nfs4namespace.c | 12
-rw-r--r--  fs/nfs/nfs4proc.c | 38
-rw-r--r--  fs/nfs/nfs4renewd.c | 10
-rw-r--r--  fs/nfs/nfs4state.c | 8
-rw-r--r--  fs/nfs/nfs4xdr.c | 100
-rw-r--r--  fs/nfs/proc.c | 8
-rw-r--r--  fs/nfs/read.c | 2
-rw-r--r--  fs/nfs/super.c | 86
-rw-r--r--  fs/nfs/write.c | 9
-rw-r--r--  fs/nfsd/nfs4callback.c | 2
-rw-r--r--  fs/ntfs/upcase.c | 5
-rw-r--r--  fs/ocfs2/alloc.c | 4
-rw-r--r--  fs/ocfs2/cluster/nodemanager.c | 74
-rw-r--r--  fs/ocfs2/cluster/nodemanager.h | 4
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 28
-rw-r--r--  fs/ocfs2/cluster/tcp.h | 12
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 32
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.h | 12
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 2
-rw-r--r--  fs/ocfs2/dlmglue.c | 14
-rw-r--r--  fs/ocfs2/stack_o2cb.c | 41
-rw-r--r--  fs/ocfs2/stack_user.c | 40
-rw-r--r--  fs/ocfs2/stackglue.c | 119
-rw-r--r--  fs/ocfs2/stackglue.h | 19
-rw-r--r--  fs/open.c | 37
-rw-r--r--  fs/pipe.c | 10
-rw-r--r--  fs/proc/array.c | 3
-rw-r--r--  fs/proc/base.c | 44
-rw-r--r--  fs/proc/inode.c | 3
-rw-r--r--  fs/proc/proc_misc.c | 24
-rw-r--r--  fs/proc/task_mmu.c | 181
-rw-r--r--  fs/proc/task_nommu.c | 2
-rw-r--r--  fs/ramfs/file-mmu.c | 1
-rw-r--r--  fs/ramfs/file-nommu.c | 1
-rw-r--r--  fs/read_write.c | 38
-rw-r--r--  fs/reiserfs/inode.c | 2
-rw-r--r--  fs/reiserfs/super.c | 4
-rw-r--r--  fs/select.c | 2
-rw-r--r--  fs/smbfs/file.c | 11
-rw-r--r--  fs/splice.c | 34
-rw-r--r--  fs/sysfs/dir.c | 6
-rw-r--r--  fs/udf/super.c | 57
-rw-r--r--  fs/udf/udfdecl.h | 2
-rw-r--r--  fs/ufs/ufs.h | 1
-rw-r--r--  fs/utimes.c | 59
-rw-r--r--  fs/vfat/namei.c | 35
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 24
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.h | 19
-rw-r--r--  fs/xfs/linux-2.6/xfs_file.c | 17
-rw-r--r--  fs/xfs/linux-2.6/xfs_vnode.h | 8
-rw-r--r--  fs/xfs/xfs_inode.c | 9
-rw-r--r--  fs/xfs/xfs_log.c | 15
-rw-r--r--  fs/xfs/xfs_vnodeops.c | 112
-rw-r--r--  fs/xfs/xfs_vnodeops.h | 3
212 files changed, 7176 insertions, 4816 deletions
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 26e07df783b9..c3bbd6af996d 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -22,6 +22,21 @@
#include <linux/list.h>
+/**
+ * struct v9fs_dentry - 9p private data stored in dentry d_fsdata
+ * @lock: protects the fidlist
+ * @fidlist: list of FIDs currently associated with this dentry
+ *
+ * This structure defines the 9p private data associated with
+ * a particular dentry. In particular, this private data is used
+ * to look up which 9P FID handle should be used for a particular VFS
+ * operation. FID handles are associated with dentries instead of
+ * inodes in order to more closely map functionality to the Plan 9
+ * expected behavior for FID reclamation and tracking.
+ *
+ * See Also: Mapping FIDs to Linux VFS model in
+ * Design and Implementation of the Linux 9P File System documentation
+ */
struct v9fs_dentry {
spinlock_t lock; /* protect fidlist */
struct list_head fidlist;
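The kernel-doc above describes the d_fsdata layout; a minimal sketch of how such a per-dentry FID list is searched (example_fid_find is a hypothetical helper, and it assumes the p9_fid uid field and dlist linkage that fs/9p/fid.c of this era uses):

	static struct p9_fid *example_fid_find(struct dentry *dentry, uid_t uid)
	{
		struct v9fs_dentry *dent = dentry->d_fsdata;
		struct p9_fid *fid, *ret = NULL;

		if (!dent)
			return NULL;

		spin_lock(&dent->lock);		/* ->lock protects ->fidlist */
		list_for_each_entry(fid, &dent->fidlist, dlist) {
			if (fid->uid == uid) {
				ret = fid;
				break;
			}
		}
		spin_unlock(&dent->lock);
		return ret;
	}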
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 9b0f0222e8bb..047c791427aa 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -71,19 +71,19 @@ static match_table_t tokens = {
/**
* v9fs_parse_options - parse mount options into session structure
- * @options: options string passed from mount
* @v9ses: existing v9fs session information
*
+ * Return 0 upon success, -ERRNO upon failure.
*/
-static void v9fs_parse_options(struct v9fs_session_info *v9ses)
+static int v9fs_parse_options(struct v9fs_session_info *v9ses)
{
char *options;
substring_t args[MAX_OPT_ARGS];
char *p;
int option = 0;
char *s, *e;
- int ret;
+ int ret = 0;
/* setup defaults */
v9ses->afid = ~0;
@@ -91,19 +91,26 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses)
v9ses->cache = 0;
if (!v9ses->options)
- return;
+ return 0;
options = kstrdup(v9ses->options, GFP_KERNEL);
+ if (!options) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "failed to allocate copy of option string\n");
+ return -ENOMEM;
+ }
+
while ((p = strsep(&options, ",")) != NULL) {
int token;
if (!*p)
continue;
token = match_token(p, tokens, args);
if (token < Opt_uname) {
- ret = match_int(&args[0], &option);
- if (ret < 0) {
+ int r = match_int(&args[0], &option);
+ if (r < 0) {
P9_DPRINTK(P9_DEBUG_ERROR,
"integer field, but no integer?\n");
+ ret = r;
continue;
}
}
@@ -125,10 +132,10 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses)
v9ses->afid = option;
break;
case Opt_uname:
- match_strcpy(v9ses->uname, &args[0]);
+ match_strlcpy(v9ses->uname, &args[0], PATH_MAX);
break;
case Opt_remotename:
- match_strcpy(v9ses->aname, &args[0]);
+ match_strlcpy(v9ses->aname, &args[0], PATH_MAX);
break;
case Opt_nodevmap:
v9ses->nodev = 1;
@@ -139,6 +146,13 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses)
case Opt_access:
s = match_strdup(&args[0]);
+ if (!s) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "failed to allocate copy"
+ " of option argument\n");
+ ret = -ENOMEM;
+ break;
+ }
v9ses->flags &= ~V9FS_ACCESS_MASK;
if (strcmp(s, "user") == 0)
v9ses->flags |= V9FS_ACCESS_USER;
@@ -158,6 +172,7 @@ static void v9fs_parse_options(struct v9fs_session_info *v9ses)
}
}
kfree(options);
+ return ret;
}
/**
@@ -173,6 +188,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
{
int retval = -EINVAL;
struct p9_fid *fid;
+ int rc;
v9ses->uname = __getname();
if (!v9ses->uname)
@@ -190,8 +206,21 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
v9ses->uid = ~0;
v9ses->dfltuid = V9FS_DEFUID;
v9ses->dfltgid = V9FS_DEFGID;
- v9ses->options = kstrdup(data, GFP_KERNEL);
- v9fs_parse_options(v9ses);
+ if (data) {
+ v9ses->options = kstrdup(data, GFP_KERNEL);
+ if (!v9ses->options) {
+ P9_DPRINTK(P9_DEBUG_ERROR,
+ "failed to allocate copy of option string\n");
+ retval = -ENOMEM;
+ goto error;
+ }
+ }
+
+ rc = v9fs_parse_options(v9ses);
+ if (rc < 0) {
+ retval = rc;
+ goto error;
+ }
v9ses->clnt = p9_client_create(dev_name, v9ses->options);
@@ -233,7 +262,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
return fid;
error:
- v9fs_session_close(v9ses);
return ERR_PTR(retval);
}
@@ -256,9 +284,12 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
}
/**
- * v9fs_session_cancel - mark transport as disconnected
- * and cancel all pending requests.
+ * v9fs_session_cancel - terminate a session
+ * @v9ses: session to terminate
+ *
+ * Mark the transport as disconnected and cancel all pending requests.
*/
+
void v9fs_session_cancel(struct v9fs_session_info *v9ses) {
P9_DPRINTK(P9_DEBUG_ERROR, "cancel session %p\n", v9ses);
p9_client_disconnect(v9ses->clnt);
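The conversion of v9fs_parse_options() to return an errno illustrates a common parser shape: record the failure but keep scanning, so one bad token does not discard the remaining options. A condensed sketch of the same pattern (the two-entry token table is hypothetical; match_strlcpy() bounds the copy where the old match_strcpy() could overrun):

	enum { Opt_afid, Opt_uname, Opt_err };

	static match_table_t example_tokens = {
		{Opt_afid, "afid=%u"},
		{Opt_uname, "uname=%s"},
		{Opt_err, NULL}
	};

	static int example_parse(char *opts, struct v9fs_session_info *v9ses)
	{
		substring_t args[MAX_OPT_ARGS];
		char *p;
		int option, ret = 0;

		while ((p = strsep(&opts, ",")) != NULL) {
			if (!*p)
				continue;
			switch (match_token(p, example_tokens, args)) {
			case Opt_afid:
				if (match_int(&args[0], &option) < 0) {
					ret = -EINVAL;	/* remember, keep going */
					continue;
				}
				v9ses->afid = option;
				break;
			case Opt_uname:
				/* bounded copy into the __getname() buffer */
				match_strlcpy(v9ses->uname, &args[0], PATH_MAX);
				break;
			}
		}
		return ret;	/* 0, or the last error seen */
	}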
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 7d3a1018db52..a7d567192998 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -21,18 +21,69 @@
*
*/
-/*
- * Session structure provides information for an opened session
- *
- */
+/**
+ * enum p9_session_flags - option flags for each 9P session
+ * @V9FS_EXTENDED: whether or not to use 9P2000.u extensions
+ * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
+ * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
+ * @V9FS_ACCESS_ANY: use a single attach for all users
+ * @V9FS_ACCESS_MASK: bit mask of different ACCESS options
+ *
+ * Session flags reflect options selected by users at mount time
+ */
+enum p9_session_flags {
+ V9FS_EXTENDED = 0x01,
+ V9FS_ACCESS_SINGLE = 0x02,
+ V9FS_ACCESS_USER = 0x04,
+ V9FS_ACCESS_ANY = 0x06,
+ V9FS_ACCESS_MASK = 0x06,
+};
+
+/* possible values of ->cache */
+/**
+ * enum p9_cache_modes - user specified cache preferences
+ * @CACHE_NONE: do not cache data, dentries, or directory contents (default)
+ * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency
+ *
+ * eventually support loose, tight, time, session, default always none
+ */
+
+enum p9_cache_modes {
+ CACHE_NONE,
+ CACHE_LOOSE,
+};
+
+/**
+ * struct v9fs_session_info - per-instance session information
+ * @flags: session options of type &p9_session_flags
+ * @nodev: set to 1 to disable device mapping
+ * @debug: debug level
+ * @afid: authentication handle
+ * @cache: cache mode of type &p9_cache_modes
+ * @options: copy of options string given by user
+ * @uname: string user name to mount hierarchy as
+ * @aname: mount specifier for remote hierarchy
+ * @maxdata: maximum data to be sent/recvd per protocol message
+ * @dfltuid: default numeric userid to mount hierarchy as
+ * @dfltgid: default numeric groupid to mount hierarchy as
+ * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy
+ * @clnt: reference to 9P network client instantiated for this session
+ * @debugfs_dir: reference to debugfs_dir which can be used for add'l debug
+ *
+ * This structure holds state for each session instance established during
+ * a sys_mount().
+ *
+ * Bugs: there seems to be a lot of state which could be condensed and/or
+ * removed.
+ */
struct v9fs_session_info {
/* options */
- unsigned char flags; /* session flags */
- unsigned char nodev; /* set to 1 if no disable device mapping */
- unsigned short debug; /* debug level */
- unsigned int afid; /* authentication fid */
- unsigned int cache; /* cache mode */
+ unsigned char flags;
+ unsigned char nodev;
+ unsigned short debug;
+ unsigned int afid;
+ unsigned int cache;
char *options; /* copy of mount options */
char *uname; /* user name to mount as */
@@ -45,22 +96,6 @@ struct v9fs_session_info {
struct dentry *debugfs_dir;
};
-/* session flags */
-enum {
- V9FS_EXTENDED = 0x01, /* 9P2000.u */
- V9FS_ACCESS_MASK = 0x06, /* access mask */
- V9FS_ACCESS_SINGLE = 0x02, /* only one user can access the files */
- V9FS_ACCESS_USER = 0x04, /* attache per user */
- V9FS_ACCESS_ANY = 0x06, /* use the same attach for all users */
-};
-
-/* possible values of ->cache */
-/* eventually support loose, tight, time, session, default always none */
-enum {
- CACHE_NONE, /* default */
- CACHE_LOOSE, /* no consistency */
-};
-
extern struct dentry *v9fs_debugfs_root;
struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
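Note that V9FS_ACCESS_MASK and V9FS_ACCESS_ANY share the value 0x06: the two access bits form a field holding one of SINGLE (0x02), USER (0x04) or ANY (0x06). A small sketch of decoding that field (illustrative helper only, not part of the patch):

	static const char *example_access_name(unsigned char flags)
	{
		switch (flags & V9FS_ACCESS_MASK) {
		case V9FS_ACCESS_SINGLE:
			return "single";	/* only the mounting user */
		case V9FS_ACCESS_USER:
			return "user";		/* one attach per user */
		case V9FS_ACCESS_ANY:
			return "any";		/* one attach shared by all */
		default:
			return "unset";
		}
	}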
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index fd01d90cada5..57997fa14e69 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -51,4 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp);
int v9fs_file_open(struct inode *inode, struct file *file);
void v9fs_inode2stat(struct inode *inode, struct p9_stat *stat);
void v9fs_dentry_release(struct dentry *);
-int v9fs_uflags2omode(int uflags);
+int v9fs_uflags2omode(int uflags, int extended);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 6248f0e727a3..97d3aed57983 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -43,7 +43,7 @@
/**
* v9fs_vfs_readpage - read an entire page in from 9P
*
- * @file: file being read
+ * @filp: file being read
* @page: structure to page
*
*/
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0924d4477da3..88e3787c6ea9 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -60,7 +60,7 @@ static inline int dt_type(struct p9_stat *mistat)
/**
* v9fs_dir_readdir - read a directory
- * @filep: opened file structure
+ * @filp: opened file structure
* @dirent: directory structure ???
* @filldir: function to populate directory structure ???
*
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index a616fff8906d..52944d2249a4 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -59,7 +59,7 @@ int v9fs_file_open(struct inode *inode, struct file *file)
P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p \n", inode, file);
v9ses = v9fs_inode2v9ses(inode);
- omode = v9fs_uflags2omode(file->f_flags);
+ omode = v9fs_uflags2omode(file->f_flags, v9fs_extended(v9ses));
fid = file->private_data;
if (!fid) {
fid = v9fs_fid_clone(file->f_path.dentry);
@@ -75,6 +75,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
inode->i_size = 0;
inode->i_blocks = 0;
}
+ if ((file->f_flags & O_APPEND) && (!v9fs_extended(v9ses)))
+ generic_file_llseek(file, 0, SEEK_END);
}
file->private_data = fid;
@@ -90,10 +92,11 @@ int v9fs_file_open(struct inode *inode, struct file *file)
/**
* v9fs_file_lock - lock a file (or directory)
- * @inode: inode to be opened
- * @file: file being opened
+ * @filp: file to be locked
+ * @cmd: lock command
+ * @fl: file lock structure
*
- * XXX - this looks like a local only lock, we should extend into 9P
+ * Bugs: this looks like a local only lock, we should extend into 9P
* by using open exclusive
*/
@@ -118,7 +121,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
/**
* v9fs_file_read - read from a file
- * @filep: file pointer to read
+ * @filp: file pointer to read
* @data: data buffer to read data into
* @count: size of buffer
* @offset: offset at which to read data
@@ -142,7 +145,7 @@ v9fs_file_read(struct file *filp, char __user * data, size_t count,
/**
* v9fs_file_write - write to a file
- * @filep: file pointer to write
+ * @filp: file pointer to write
* @data: data buffer to write data from
* @count: size of buffer
* @offset: offset at which to write data
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 6a28842052ea..c95295c65045 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -129,7 +129,13 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode)
return res;
}
-int v9fs_uflags2omode(int uflags)
+/**
+ * v9fs_uflags2omode - convert POSIX open flags to Plan 9 mode bits
+ * @uflags: flags to convert
+ * @extended: if .u extensions are active
+ */
+
+int v9fs_uflags2omode(int uflags, int extended)
{
int ret;
@@ -149,14 +155,16 @@ int v9fs_uflags2omode(int uflags)
break;
}
- if (uflags & O_EXCL)
- ret |= P9_OEXCL;
-
if (uflags & O_TRUNC)
ret |= P9_OTRUNC;
- if (uflags & O_APPEND)
- ret |= P9_OAPPEND;
+ if (extended) {
+ if (uflags & O_EXCL)
+ ret |= P9_OEXCL;
+
+ if (uflags & O_APPEND)
+ ret |= P9_OAPPEND;
+ }
return ret;
}
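With .u extensions inactive, O_EXCL and O_APPEND now simply drop out of the wire mode; the earlier hunk in vfs_file.c compensates for O_APPEND by seeking to EOF at open time instead. The expected mappings under this change, sketched (P9_* values per the protocol headers of this tree):

	/* extended == 1 (9P2000.u): append travels in the open mode */
	v9fs_uflags2omode(O_RDWR | O_APPEND, 1);	/* P9_ORDWR | P9_OAPPEND */

	/* extended == 0 (legacy 9P2000): the server cannot honor the bit,
	 * so the client seeks to EOF at open instead */
	v9fs_uflags2omode(O_RDWR | O_APPEND, 0);	/* P9_ORDWR */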
@@ -312,6 +320,14 @@ error:
}
*/
+/**
+ * v9fs_inode_from_fid - populate an inode by issuing an attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+
static struct inode *
v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
struct super_block *sb)
@@ -384,9 +400,12 @@ v9fs_open_created(struct inode *inode, struct file *file)
/**
* v9fs_create - Create a file
+ * @v9ses: session information
+ * @dir: directory that dentry is being created in
* @dentry: dentry that is being created
* @perm: create permissions
* @mode: open mode
+ * @extension: 9p2000.u extension string to support devices, etc.
*
*/
static struct p9_fid *
@@ -461,7 +480,7 @@ error:
/**
* v9fs_vfs_create - VFS hook to create files
- * @inode: directory inode that is being created
+ * @dir: directory inode that the file is being created in
* @dentry: dentry of the file being created
* @mode: create permissions
* @nd: path information
@@ -489,7 +508,7 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
flags = O_RDWR;
fid = v9fs_create(v9ses, dir, dentry, NULL, perm,
- v9fs_uflags2omode(flags));
+ v9fs_uflags2omode(flags, v9fs_extended(v9ses)));
if (IS_ERR(fid)) {
err = PTR_ERR(fid);
fid = NULL;
@@ -519,7 +538,7 @@ error:
/**
* v9fs_vfs_mkdir - VFS mkdir hook to create a directory
- * @inode: inode that is being unlinked
+ * @dir: inode of the directory the new directory is created in
* @dentry: dentry of the directory being created
* @mode: mode for new directory
*
@@ -703,9 +722,9 @@ done:
/**
* v9fs_vfs_getattr - retrieve file metadata
- * @mnt - mount information
- * @dentry - file to get attributes on
- * @stat - metadata structure to populate
+ * @mnt: mount information
+ * @dentry: file to get attributes on
+ * @stat: metadata structure to populate
*
*/
@@ -928,7 +947,7 @@ done:
/**
* v9fs_vfs_readlink - read a symlink's location
* @dentry: dentry for symlink
- * @buf: buffer to load symlink location into
+ * @buffer: buffer to load symlink location into
* @buflen: length of buffer
*
*/
@@ -996,10 +1015,12 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
* v9fs_vfs_put_link - release a symlink path
* @dentry: dentry for symlink
* @nd: nameidata
+ * @p: unused
*
*/
-static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+static void
+v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
{
char *s = nd_get_link(nd);
@@ -1008,6 +1029,15 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void
__putname(s);
}
+/**
+ * v9fs_vfs_mkspecial - create a special file
+ * @dir: inode to create special file in
+ * @dentry: dentry to create
+ * @mode: mode to create special file
+ * @extension: 9p2000.u format extension string representing special file
+ *
+ */
+
static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
int mode, const char *extension)
{
@@ -1037,7 +1067,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
* @dentry: dentry for symlink
* @symname: symlink data
*
- * See 9P2000.u RFC for more information
+ * See Also: 9P2000.u RFC for more information
*
*/
@@ -1058,10 +1088,6 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
*
*/
-/* XXX - lots of code dup'd from symlink and creates,
- * figure out a better reuse strategy
- */
-
static int
v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
@@ -1098,7 +1124,7 @@ clunk_fid:
* @dir: inode destination for new link
* @dentry: dentry for file
* @mode: mode for creation
- * @dev_t: device associated with special file
+ * @rdev: device associated with special file
*
*/
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index a452ac67fc94..bf59c3960494 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -75,6 +75,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
* v9fs_fill_super - populate superblock with info
* @sb: superblock
* @v9ses: session information
+ * @flags: flags propagated from v9fs_get_sb()
*
*/
@@ -127,29 +128,26 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
fid = v9fs_session_init(v9ses, dev_name, data);
if (IS_ERR(fid)) {
retval = PTR_ERR(fid);
- fid = NULL;
- kfree(v9ses);
- v9ses = NULL;
- goto error;
+ goto close_session;
}
st = p9_client_stat(fid);
if (IS_ERR(st)) {
retval = PTR_ERR(st);
- goto error;
+ goto clunk_fid;
}
sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
if (IS_ERR(sb)) {
retval = PTR_ERR(sb);
- goto error;
+ goto free_stat;
}
v9fs_fill_super(sb, v9ses, flags);
inode = v9fs_get_inode(sb, S_IFDIR | mode);
if (IS_ERR(inode)) {
retval = PTR_ERR(inode);
- goto error;
+ goto release_sb;
}
inode->i_uid = uid;
@@ -158,7 +156,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
root = d_alloc_root(inode);
if (!root) {
retval = -ENOMEM;
- goto error;
+ goto release_sb;
}
sb->s_root = root;
@@ -169,21 +167,22 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
return simple_set_mnt(mnt, sb);
-error:
- kfree(st);
- if (fid)
- p9_client_clunk(fid);
-
- if (v9ses) {
- v9fs_session_close(v9ses);
- kfree(v9ses);
- }
-
+release_sb:
if (sb) {
up_write(&sb->s_umount);
deactivate_super(sb);
}
+free_stat:
+ kfree(st);
+
+clunk_fid:
+ p9_client_clunk(fid);
+
+close_session:
+ v9fs_session_close(v9ses);
+ kfree(v9ses);
+
return retval;
}
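The rewritten failure path in v9fs_get_sb() replaces a single catch-all "error:" label (which had to guess which resources existed) with the standard kernel unwind idiom: one label per acquired resource, placed in reverse order, so a failure at step N falls through and releases exactly steps N-1..1. A generic sketch of the shape (take_*/undo_* are hypothetical):

	static int example_setup(void)
	{
		int err;

		err = take_a();
		if (err)
			goto out;
		err = take_b();
		if (err)
			goto drop_a;
		err = take_c();
		if (err)
			goto drop_b;
		return 0;

	drop_b:
		undo_b();
	drop_a:
		undo_a();
	out:
		return err;
	}

This also explains why v9fs_session_init() stopped calling v9fs_session_close() on its own error path above: the caller's labels now own that cleanup.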
diff --git a/fs/Kconfig b/fs/Kconfig
index cf12c403b8c7..313b2e06ded5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -830,7 +830,7 @@ config NTFS_FS
from the project web site.
For more information see <file:Documentation/filesystems/ntfs.txt>
- and <http://linux-ntfs.sourceforge.net/>.
+ and <http://www.linux-ntfs.org/>.
To compile this file system support as a module, choose M here: the
module will be called ntfs.
@@ -930,7 +930,7 @@ config PROC_KCORE
config PROC_VMCORE
bool "/proc/vmcore support (EXPERIMENTAL)"
- depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP
+ depends on PROC_FS && CRASH_DUMP
default y
help
Exports the dump image of crashed kernel in ELF format.
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 55e8ee1900a5..3263084eef9e 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -42,7 +42,7 @@ config BINFMT_ELF_FDPIC
config BINFMT_FLAT
bool "Kernel support for flat binaries"
- depends on !MMU
+ depends on !MMU && (!FRV || BROKEN)
help
Support uClinux FLAT format binaries.
diff --git a/fs/Makefile b/fs/Makefile
index 1e7a11bd4da1..277b079dec9e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -19,6 +19,7 @@ else
obj-y += no-block.o
endif
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
obj-$(CONFIG_EPOLL) += eventpoll.o
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index a78d5b236bb1..587ef5123cd8 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -8,7 +8,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
- * Authors: David Woodhouse <dwmw2@cambridge.redhat.com>
+ * Authors: David Woodhouse <dwmw2@infradead.org>
* David Howells <dhowells@redhat.com>
*
*/
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 08db82e1343a..bb47217f6a18 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -8,7 +8,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
- * Authors: David Woodhouse <dwmw2@cambridge.redhat.com>
+ * Authors: David Woodhouse <dwmw2@infradead.org>
* David Howells <dhowells@redhat.com>
*
*/
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 4b572b801d8d..7e3faeef6818 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -10,7 +10,7 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Authors: David Howells <dhowells@redhat.com>
- * David Woodhouse <dwmw2@redhat.com>
+ * David Woodhouse <dwmw2@infradead.org>
*
*/
diff --git a/fs/aio.c b/fs/aio.c
index b5253e77eb2f..0fb3117ddd93 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -591,10 +591,6 @@ static void use_mm(struct mm_struct *mm)
atomic_inc(&mm->mm_count);
tsk->mm = mm;
tsk->active_mm = mm;
- /*
- * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise
- * it won't work. Update it accordingly if you change it here
- */
switch_mm(active_mm, mm, tsk);
task_unlock(tsk);
diff --git a/fs/befs/endian.h b/fs/befs/endian.h
index e254a20869f4..6cb84d896d05 100644
--- a/fs/befs/endian.h
+++ b/fs/befs/endian.h
@@ -9,7 +9,7 @@
#ifndef LINUX_BEFS_ENDIAN
#define LINUX_BEFS_ENDIAN
-#include <linux/byteorder/generic.h>
+#include <asm/byteorder.h>
static inline u64
fs64_to_cpu(const struct super_block *sb, fs64 n)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b25707fee2cc..d48ff5f370f4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -16,7 +16,6 @@
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/mman.h>
-#include <linux/a.out.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/binfmts.h>
@@ -256,7 +255,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
return -EFAULT;
len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
if (!len || len > MAX_ARG_STRLEN)
- return 0;
+ return -EINVAL;
p += len;
}
if (__put_user(0, argv))
@@ -268,7 +267,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec,
return -EFAULT;
len = strnlen_user((void __user *)p, MAX_ARG_STRLEN);
if (!len || len > MAX_ARG_STRLEN)
- return 0;
+ return -EINVAL;
p += len;
}
if (__put_user(0, envp))
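Returning 0 here was effectively silent success: the exec path treats only negative returns from create_elf_tables() as failure. A condensed sketch of the caller-side check this fix targets (hypothetical condensation of load_elf_binary()):

	static int example_exec_step(struct linux_binprm *bprm, struct elfhdr *ex,
				     unsigned long load_addr,
				     unsigned long interp_load_addr)
	{
		int retval = create_elf_tables(bprm, ex, load_addr,
					       interp_load_addr);
		if (retval < 0)		/* -EINVAL from a bad argv string */
			return retval;
		/* a zero return would have sailed past as "success" */
		return 0;
	}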
@@ -548,7 +547,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
struct {
struct elfhdr elf_ex;
struct elfhdr interp_elf_ex;
- struct exec interp_ex;
} *loc;
loc = kmalloc(sizeof(*loc), GFP_KERNEL);
@@ -680,7 +678,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
}
/* Get the exec headers */
- loc->interp_ex = *((struct exec *)bprm->buf);
loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
break;
}
@@ -1900,7 +1897,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un
/* alloc memory for large data structures: too large to be on stack */
elf = kmalloc(sizeof(*elf), GFP_KERNEL);
if (!elf)
- goto cleanup;
+ goto out;
segs = current->mm->map_count;
#ifdef ELF_CORE_EXTRA_PHDRS
@@ -2034,8 +2031,9 @@ end_coredump:
set_fs(fs);
cleanup:
- kfree(elf);
free_note_info(&info);
+ kfree(elf);
+out:
return has_dumped;
}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index ddd35d873391..d051a32e6270 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -390,7 +390,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
}
/* expand the stack mapping to use up the entire allocation granule */
- fullsize = ksize((char *) current->mm->start_brk);
+ fullsize = kobjsize((char *) current->mm->start_brk);
if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size,
fullsize, 0, 0)))
stack_size = fullsize;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 3b40d45a3a16..2cb1acda3a82 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -548,7 +548,7 @@ static int load_flat_file(struct linux_binprm * bprm,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
/* Remap to use all available slack region space */
if (realdatastart && (realdatastart < (unsigned long)-4096)) {
- reallen = ksize((void *)realdatastart);
+ reallen = kobjsize((void *)realdatastart);
if (reallen > len) {
realdatastart = do_mremap(realdatastart, len,
reallen, MREMAP_FIXED, realdatastart);
@@ -600,7 +600,7 @@ static int load_flat_file(struct linux_binprm * bprm,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
/* Remap to use all available slack region space */
if (textpos && (textpos < (unsigned long) -4096)) {
- reallen = ksize((void *)textpos);
+ reallen = kobjsize((void *)textpos);
if (reallen > len) {
textpos = do_mremap(textpos, len, reallen,
MREMAP_FIXED, textpos);
@@ -683,7 +683,7 @@ static int load_flat_file(struct linux_binprm * bprm,
*/
current->mm->start_brk = datapos + data_len + bss_len;
current->mm->brk = (current->mm->start_brk + 3) & ~3;
- current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
+ current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len;
}
if (flags & FLAT_FLAG_KTRACE)
@@ -790,7 +790,7 @@ static int load_flat_file(struct linux_binprm * bprm,
/* zero the BSS, BRK and stack areas */
memset((void*)(datapos + data_len), 0, bss_len +
- (memp + ksize((void *) memp) - stack_len - /* end brk */
+ (memp + kobjsize((void *) memp) - stack_len - /* end brk */
libinfo->lib_list[id].start_brk) + /* start brk */
stack_len);
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
new file mode 100644
index 000000000000..63e2ee63058d
--- /dev/null
+++ b/fs/bio-integrity.c
@@ -0,0 +1,719 @@
+/*
+ * bio-integrity.c - bio data integrity extensions
+ *
+ * Copyright (C) 2007, 2008 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+
+static struct kmem_cache *bio_integrity_slab __read_mostly;
+static struct workqueue_struct *kintegrityd_wq;
+
+/**
+ * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * @bio: bio to attach integrity metadata to
+ * @gfp_mask: Memory allocation mask
+ * @nr_vecs: Number of integrity metadata scatter-gather elements
+ * @bs: bio_set to allocate from
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata. nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.
+ */
+struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
+ gfp_t gfp_mask,
+ unsigned int nr_vecs,
+ struct bio_set *bs)
+{
+ struct bio_integrity_payload *bip;
+ struct bio_vec *iv;
+ unsigned long idx;
+
+ BUG_ON(bio == NULL);
+
+ bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+ if (unlikely(bip == NULL)) {
+ printk(KERN_ERR "%s: could not alloc bip\n", __func__);
+ return NULL;
+ }
+
+ memset(bip, 0, sizeof(*bip));
+
+ iv = bvec_alloc_bs(gfp_mask, nr_vecs, &idx, bs);
+ if (unlikely(iv == NULL)) {
+ printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
+ mempool_free(bip, bs->bio_integrity_pool);
+ return NULL;
+ }
+
+ bip->bip_pool = idx;
+ bip->bip_vec = iv;
+ bip->bip_bio = bio;
+ bio->bi_integrity = bip;
+
+ return bip;
+}
+EXPORT_SYMBOL(bio_integrity_alloc_bioset);
+
+/**
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * @bio: bio to attach integrity metadata to
+ * @gfp_mask: Memory allocation mask
+ * @nr_vecs: Number of integrity metadata scatter-gather elements
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata. nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.
+ */
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+ gfp_t gfp_mask,
+ unsigned int nr_vecs)
+{
+ return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
+}
+EXPORT_SYMBOL(bio_integrity_alloc);
+
+/**
+ * bio_integrity_free - Free bio integrity payload
+ * @bio: bio containing bip to be freed
+ * @bs: bio_set this bio was allocated from
+ *
+ * Description: Used to free the integrity portion of a bio. Usually
+ * called from bio_free().
+ */
+void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+
+ BUG_ON(bip == NULL);
+
+ /* A cloned bio doesn't own the integrity metadata */
+ if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL)
+ kfree(bip->bip_buf);
+
+ mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
+ mempool_free(bip, bs->bio_integrity_pool);
+
+ bio->bi_integrity = NULL;
+}
+EXPORT_SYMBOL(bio_integrity_free);
+
+/**
+ * bio_integrity_add_page - Attach integrity metadata
+ * @bio: bio to update
+ * @page: page containing integrity metadata
+ * @len: number of bytes of integrity metadata in page
+ * @offset: start offset within page
+ *
+ * Description: Attach a page containing integrity metadata to bio.
+ */
+int bio_integrity_add_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct bio_vec *iv;
+
+ if (bip->bip_vcnt >= bvec_nr_vecs(bip->bip_pool)) {
+ printk(KERN_ERR "%s: bip_vec full\n", __func__);
+ return 0;
+ }
+
+ iv = bip_vec_idx(bip, bip->bip_vcnt);
+ BUG_ON(iv == NULL);
+ BUG_ON(iv->bv_page != NULL);
+
+ iv->bv_page = page;
+ iv->bv_len = len;
+ iv->bv_offset = offset;
+ bip->bip_vcnt++;
+
+ return len;
+}
+EXPORT_SYMBOL(bio_integrity_add_page);
+
+/**
+ * bio_integrity_enabled - Check whether integrity can be passed
+ * @bio: bio to check
+ *
+ * Description: Determines whether bio_integrity_prep() can be called
+ * on this bio or not. bio data direction and target device must be
+ * set prior to calling. The function honors the write_generate and
+ * read_verify flags in sysfs.
+ */
+int bio_integrity_enabled(struct bio *bio)
+{
+ /* Already protected? */
+ if (bio_integrity(bio))
+ return 0;
+
+ return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
+}
+EXPORT_SYMBOL(bio_integrity_enabled);
+
+/**
+ * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
+ * @bi: blk_integrity profile for device
+ * @sectors: Number of 512-byte sectors to convert
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the hardware
+ * sector size of the storage device. Convert the block layer sectors
+ * to physical sectors.
+ */
+static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
+ unsigned int sectors)
+{
+ /* At this point there are only 512b or 4096b DIF/EPP devices */
+ if (bi->sector_size == 4096)
+ return sectors >>= 3;
+
+ return sectors;
+}
+
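+/*
+ * Example: a 16 KiB request is 32 block-layer sectors of 512 bytes;
+ * on a 4096-byte-sector DIF device that converts to 32 >> 3 = 4
+ * hardware sectors, i.e. 4 integrity tuples to generate or verify.
+ */
+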
+/**
+ * bio_integrity_tag_size - Retrieve integrity tag space
+ * @bio: bio to inspect
+ *
+ * Description: Returns the maximum number of tag bytes that can be
+ * attached to this bio. Filesystems can use this to determine how
+ * much metadata to attach to an I/O.
+ */
+unsigned int bio_integrity_tag_size(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+
+ BUG_ON(bio->bi_size == 0);
+
+ return bi->tag_size * (bio->bi_size / bi->sector_size);
+}
+EXPORT_SYMBOL(bio_integrity_tag_size);
+
+int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ unsigned int nr_sectors;
+
+ BUG_ON(bip->bip_buf == NULL);
+
+ if (bi->tag_size == 0)
+ return -1;
+
+ nr_sectors = bio_integrity_hw_sectors(bi,
+ DIV_ROUND_UP(len, bi->tag_size));
+
+ if (nr_sectors * bi->tuple_size > bip->bip_size) {
+ printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
+ __func__, nr_sectors * bi->tuple_size, bip->bip_size);
+ return -1;
+ }
+
+ if (set)
+ bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+ else
+ bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+
+ return 0;
+}
+
+/**
+ * bio_integrity_set_tag - Attach a tag buffer to a bio
+ * @bio: bio to attach buffer to
+ * @tag_buf: Pointer to a buffer containing tag data
+ * @len: Length of the included buffer
+ *
+ * Description: Use this function to tag a bio by leveraging the extra
+ * space provided by devices formatted with integrity protection. The
+ * size of the integrity buffer must be <= to the size reported by
+ * bio_integrity_tag_size().
+ */
+int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+ BUG_ON(bio_data_dir(bio) != WRITE);
+
+ return bio_integrity_tag(bio, tag_buf, len, 1);
+}
+EXPORT_SYMBOL(bio_integrity_set_tag);
+
+/**
+ * bio_integrity_get_tag - Retrieve a tag buffer from a bio
+ * @bio: bio to retrieve buffer from
+ * @tag_buf: Pointer to a buffer for the tag data
+ * @len: Length of the target buffer
+ *
+ * Description: Use this function to retrieve the tag buffer from a
+ * completed I/O. The size of the integrity buffer must be <= to the
+ * size reported by bio_integrity_tag_size().
+ */
+int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+ BUG_ON(bio_data_dir(bio) != READ);
+
+ return bio_integrity_tag(bio, tag_buf, len, 0);
+}
+EXPORT_SYMBOL(bio_integrity_get_tag);
+
+/**
+ * bio_integrity_generate - Generate integrity metadata for a bio
+ * @bio: bio to generate integrity metadata for
+ *
+ * Description: Generates integrity metadata for a bio by calling the
+ * block device's generation callback function. The bio must have a
+ * bip attached with enough room to accommodate the generated
+ * integrity metadata.
+ */
+static void bio_integrity_generate(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity_exchg bix;
+ struct bio_vec *bv;
+ sector_t sector = bio->bi_sector;
+ unsigned int i, sectors, total;
+ void *prot_buf = bio->bi_integrity->bip_buf;
+
+ total = 0;
+ bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+ bix.sector_size = bi->sector_size;
+
+ bio_for_each_segment(bv, bio, i) {
+ void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+ bix.data_buf = kaddr + bv->bv_offset;
+ bix.data_size = bv->bv_len;
+ bix.prot_buf = prot_buf;
+ bix.sector = sector;
+
+ bi->generate_fn(&bix);
+
+ sectors = bv->bv_len / bi->sector_size;
+ sector += sectors;
+ prot_buf += sectors * bi->tuple_size;
+ total += sectors * bi->tuple_size;
+ BUG_ON(total > bio->bi_integrity->bip_size);
+
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+}
+
+/**
+ * bio_integrity_prep - Prepare bio for integrity I/O
+ * @bio: bio to prepare
+ *
+ * Description: Allocates a buffer for integrity metadata, maps the
+ * pages and attaches them to a bio. The bio must have data
+ * direction, target device and start sector set prior to calling. In
+ * the WRITE case, integrity metadata will be generated using the
+ * block device's integrity function. In the READ case, the buffer
+ * will be prepared for DMA and a suitable end_io handler set up.
+ */
+int bio_integrity_prep(struct bio *bio)
+{
+ struct bio_integrity_payload *bip;
+ struct blk_integrity *bi;
+ struct request_queue *q;
+ void *buf;
+ unsigned long start, end;
+ unsigned int len, nr_pages;
+ unsigned int bytes, offset, i;
+ unsigned int sectors;
+
+ bi = bdev_get_integrity(bio->bi_bdev);
+ q = bdev_get_queue(bio->bi_bdev);
+ BUG_ON(bi == NULL);
+ BUG_ON(bio_integrity(bio));
+
+ sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
+
+ /* Allocate kernel buffer for protection data */
+ len = sectors * blk_integrity_tuple_size(bi);
+ buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
+ if (unlikely(buf == NULL)) {
+ printk(KERN_ERR "could not allocate integrity buffer\n");
+ return -EIO;
+ }
+
+ end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ start = ((unsigned long) buf) >> PAGE_SHIFT;
+ nr_pages = end - start;
+
+ /* Allocate bio integrity payload and integrity vectors */
+ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
+ if (unlikely(bip == NULL)) {
+ printk(KERN_ERR "could not allocate data integrity bioset\n");
+ kfree(buf);
+ return -EIO;
+ }
+
+ bip->bip_buf = buf;
+ bip->bip_size = len;
+ bip->bip_sector = bio->bi_sector;
+
+ /* Map it */
+ offset = offset_in_page(buf);
+ for (i = 0 ; i < nr_pages ; i++) {
+ int ret;
+ bytes = PAGE_SIZE - offset;
+
+ if (len <= 0)
+ break;
+
+ if (bytes > len)
+ bytes = len;
+
+ ret = bio_integrity_add_page(bio, virt_to_page(buf),
+ bytes, offset);
+
+ if (ret == 0)
+ return 0;
+
+ if (ret < bytes)
+ break;
+
+ buf += bytes;
+ len -= bytes;
+ offset = 0;
+ }
+
+ /* Install custom I/O completion handler if read verify is enabled */
+ if (bio_data_dir(bio) == READ) {
+ bip->bip_end_io = bio->bi_end_io;
+ bio->bi_end_io = bio_integrity_endio;
+ }
+
+ /* Auto-generate integrity metadata if this is a write */
+ if (bio_data_dir(bio) == WRITE)
+ bio_integrity_generate(bio);
+
+ return 0;
+}
+EXPORT_SYMBOL(bio_integrity_prep);
+
+/**
+ * bio_integrity_verify - Verify integrity metadata for a bio
+ * @bio: bio to verify
+ *
+ * Description: This function is called to verify the integrity of a
+ * bio. The data in the bio io_vec is compared to the integrity
+ * metadata returned by the HBA.
+ */
+static int bio_integrity_verify(struct bio *bio)
+{
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ struct blk_integrity_exchg bix;
+ struct bio_vec *bv;
+ sector_t sector = bio->bi_integrity->bip_sector;
+ unsigned int i, sectors, total, ret;
+ void *prot_buf = bio->bi_integrity->bip_buf;
+
+ ret = total = 0;
+ bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+ bix.sector_size = bi->sector_size;
+
+ bio_for_each_segment(bv, bio, i) {
+ void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+ bix.data_buf = kaddr + bv->bv_offset;
+ bix.data_size = bv->bv_len;
+ bix.prot_buf = prot_buf;
+ bix.sector = sector;
+
+ ret = bi->verify_fn(&bix);
+
+ if (ret) {
+ kunmap_atomic(kaddr, KM_USER0);
+ break;
+ }
+
+ sectors = bv->bv_len / bi->sector_size;
+ sector += sectors;
+ prot_buf += sectors * bi->tuple_size;
+ total += sectors * bi->tuple_size;
+ BUG_ON(total > bio->bi_integrity->bip_size);
+
+ kunmap_atomic(kaddr, KM_USER0);
+ }
+
+ return ret;
+}
+
+/**
+ * bio_integrity_verify_fn - Integrity I/O completion worker
+ * @work: Work struct stored in bio to be verified
+ *
+ * Description: This workqueue function is called to complete a READ
+ * request. The function verifies the transferred integrity metadata
+ * and then calls the original bio end_io function.
+ */
+static void bio_integrity_verify_fn(struct work_struct *work)
+{
+ struct bio_integrity_payload *bip =
+ container_of(work, struct bio_integrity_payload, bip_work);
+ struct bio *bio = bip->bip_bio;
+ int error = bip->bip_error;
+
+ if (bio_integrity_verify(bio)) {
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ error = -EIO;
+ }
+
+ /* Restore original bio completion handler */
+ bio->bi_end_io = bip->bip_end_io;
+
+ if (bio->bi_end_io)
+ bio->bi_end_io(bio, error);
+}
+
+/**
+ * bio_integrity_endio - Integrity I/O completion function
+ * @bio: Protected bio
+ * @error: Pointer to errno
+ *
+ * Description: Completion for integrity I/O
+ *
+ * Normally I/O completion is done in interrupt context. However,
+ * verifying I/O integrity is a time-consuming task which must be run
+ * in process context. This function postpones completion
+ * accordingly.
+ */
+void bio_integrity_endio(struct bio *bio, int error)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+
+ BUG_ON(bip->bip_bio != bio);
+
+ bip->bip_error = error;
+ INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
+ queue_work(kintegrityd_wq, &bip->bip_work);
+}
+EXPORT_SYMBOL(bio_integrity_endio);
+
+/**
+ * bio_integrity_mark_head - Advance bip_vec skip bytes
+ * @bip: Integrity vector to advance
+ * @skip: Number of bytes to advance it
+ */
+void bio_integrity_mark_head(struct bio_integrity_payload *bip,
+ unsigned int skip)
+{
+ struct bio_vec *iv;
+ unsigned int i;
+
+ bip_for_each_vec(iv, bip, i) {
+ if (skip == 0) {
+ bip->bip_idx = i;
+ return;
+ } else if (skip >= iv->bv_len) {
+ skip -= iv->bv_len;
+ } else { /* skip < iv->bv_len */
+ iv->bv_offset += skip;
+ iv->bv_len -= skip;
+ bip->bip_idx = i;
+ return;
+ }
+ }
+}
+
+/**
+ * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
+ * @bip: Integrity vector to truncate
+ * @len: New length of integrity vector
+ */
+void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
+ unsigned int len)
+{
+ struct bio_vec *iv;
+ unsigned int i;
+
+ bip_for_each_vec(iv, bip, i) {
+ if (len == 0) {
+ bip->bip_vcnt = i;
+ return;
+ } else if (len >= iv->bv_len) {
+ len -= iv->bv_len;
+ } else { /* len < iv->bv_len */
+ iv->bv_len = len;
+ len = 0;
+ }
+ }
+}
+
+/**
+ * bio_integrity_advance - Advance integrity vector
+ * @bio: bio whose integrity vector to update
+ * @bytes_done: number of data bytes that have been completed
+ *
+ * Description: This function calculates how many integrity bytes the
+ * number of completed data bytes correspond to and advances the
+ * integrity vector accordingly.
+ */
+void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ unsigned int nr_sectors;
+
+ BUG_ON(bip == NULL);
+ BUG_ON(bi == NULL);
+
+ nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9);
+ bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_advance);
+
+/**
+ * bio_integrity_trim - Trim integrity vector
+ * @bio: bio whose integrity vector to update
+ * @offset: offset to first data sector
+ * @sectors: number of data sectors
+ *
+ * Description: Used to trim the integrity vector in a cloned bio.
+ * The ivec will be advanced corresponding to 'offset' data sectors
+ * and the length will be truncated corresponding to 'len' data
+ * sectors.
+ */
+void bio_integrity_trim(struct bio *bio, unsigned int offset,
+ unsigned int sectors)
+{
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+ unsigned int nr_sectors;
+
+ BUG_ON(bip == NULL);
+ BUG_ON(bi == NULL);
+ BUG_ON(!bio_flagged(bio, BIO_CLONED));
+
+ nr_sectors = bio_integrity_hw_sectors(bi, sectors);
+ bip->bip_sector = bip->bip_sector + offset;
+ bio_integrity_mark_head(bip, offset * bi->tuple_size);
+ bio_integrity_mark_tail(bip, sectors * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_trim);
+
+/**
+ * bio_integrity_split - Split integrity metadata
+ * @bio: Protected bio
+ * @bp: Resulting bio_pair
+ * @sectors: Offset at which to split (in 512-byte data sectors)
+ *
+ * Description: Splits an integrity page into a bio_pair.
+ */
+void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
+{
+ struct blk_integrity *bi;
+ struct bio_integrity_payload *bip = bio->bi_integrity;
+ unsigned int nr_sectors;
+
+ if (bio_integrity(bio) == 0)
+ return;
+
+ bi = bdev_get_integrity(bio->bi_bdev);
+ BUG_ON(bi == NULL);
+ BUG_ON(bip->bip_vcnt != 1);
+
+ nr_sectors = bio_integrity_hw_sectors(bi, sectors);
+
+ bp->bio1.bi_integrity = &bp->bip1;
+ bp->bio2.bi_integrity = &bp->bip2;
+
+ bp->iv1 = bip->bip_vec[0];
+ bp->iv2 = bip->bip_vec[0];
+
+ bp->bip1.bip_vec = &bp->iv1;
+ bp->bip2.bip_vec = &bp->iv2;
+
+ bp->iv1.bv_len = sectors * bi->tuple_size;
+ bp->iv2.bv_offset += sectors * bi->tuple_size;
+ bp->iv2.bv_len -= sectors * bi->tuple_size;
+
+ bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
+ bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
+
+ bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
+ bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
+}
+EXPORT_SYMBOL(bio_integrity_split);
+
+/**
+ * bio_integrity_clone - Callback for cloning bios with integrity metadata
+ * @bio: New bio
+ * @bio_src: Original bio
+ * @bs: bio_set to allocate bip from
+ *
+ * Description: Called to allocate a bip when cloning a bio
+ */
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
+ struct bio_set *bs)
+{
+ struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
+ struct bio_integrity_payload *bip;
+
+ BUG_ON(bip_src == NULL);
+
+ bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs);
+
+ if (bip == NULL)
+ return -EIO;
+
+ memcpy(bip->bip_vec, bip_src->bip_vec,
+ bip_src->bip_vcnt * sizeof(struct bio_vec));
+
+ bip->bip_sector = bip_src->bip_sector;
+ bip->bip_vcnt = bip_src->bip_vcnt;
+ bip->bip_idx = bip_src->bip_idx;
+
+ return 0;
+}
+EXPORT_SYMBOL(bio_integrity_clone);
+
+int bioset_integrity_create(struct bio_set *bs, int pool_size)
+{
+ bs->bio_integrity_pool = mempool_create_slab_pool(pool_size,
+ bio_integrity_slab);
+ if (!bs->bio_integrity_pool)
+ return -1;
+
+ return 0;
+}
+EXPORT_SYMBOL(bioset_integrity_create);
+
+void bioset_integrity_free(struct bio_set *bs)
+{
+ if (bs->bio_integrity_pool)
+ mempool_destroy(bs->bio_integrity_pool);
+}
+EXPORT_SYMBOL(bioset_integrity_free);
+
+void __init bio_integrity_init_slab(void)
+{
+ bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+}
+EXPORT_SYMBOL(bio_integrity_init_slab);
+
+static int __init integrity_init(void)
+{
+ kintegrityd_wq = create_workqueue("kintegrityd");
+
+ if (!kintegrityd_wq)
+ panic("Failed to create kintegrityd\n");
+
+ return 0;
+}
+subsys_initcall(integrity_init);
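Taken together, the exported calls above give submitters a two-step protocol: test, then prepare. A sketch of how a submission path might drive it (modelled loosely on the block layer's own use; not the only possible flow):

	static void example_submit(struct bio *bio)
	{
		/* true only if the target has an integrity profile and
		 * the bio is not already carrying a bip */
		if (bio_integrity_enabled(bio)) {
			if (bio_integrity_prep(bio) < 0) {
				bio_endio(bio, -EIO);	/* buffer alloc failed */
				return;
			}
			/* WRITE: protection data was just generated.
			 * READ: bi_end_io now points at bio_integrity_endio(),
			 * which verifies in kintegrityd and then invokes the
			 * original completion handler. */
		}
		generic_make_request(bio);
	}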
diff --git a/fs/bio.c b/fs/bio.c
index 78562574cb52..88322b066acb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,25 +28,10 @@
#include <linux/blktrace_api.h>
#include <scsi/sg.h> /* for struct sg_iovec */
-#define BIO_POOL_SIZE 2
-
static struct kmem_cache *bio_slab __read_mostly;
-#define BIOVEC_NR_POOLS 6
-
-/*
- * a small number of entries is fine, not going to be performance critical.
- * basically we just need to survive
- */
-#define BIO_SPLIT_ENTRIES 2
mempool_t *bio_split_pool __read_mostly;
-struct biovec_slab {
- int nr_vecs;
- char *name;
- struct kmem_cache *slab;
-};
-
/*
* if you change this list, also change bvec_alloc or things will
* break badly! cannot be bigger than what you can fit into an
@@ -60,23 +45,17 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
#undef BV
/*
- * bio_set is used to allow other portions of the IO system to
- * allocate their own private memory pools for bio and iovec structures.
- * These memory pools in turn all allocate from the bio_slab
- * and the bvec_slabs[].
- */
-struct bio_set {
- mempool_t *bio_pool;
- mempool_t *bvec_pools[BIOVEC_NR_POOLS];
-};
-
-/*
* fs_bio_set is the bio_set containing bio and iovec memory pools used by
* IO code that does not need private memory pools.
*/
-static struct bio_set *fs_bio_set;
+struct bio_set *fs_bio_set;
+
+unsigned int bvec_nr_vecs(unsigned short idx)
+{
+ return bvec_slabs[idx].nr_vecs;
+}
-static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
+struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
{
struct bio_vec *bvl;
@@ -117,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set)
mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
}
+ if (bio_integrity(bio))
+ bio_integrity_free(bio, bio_set);
+
mempool_free(bio, bio_set->bio_pool);
}
@@ -275,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
{
struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
- if (b) {
- b->bi_destructor = bio_fs_destructor;
- __bio_clone(b, bio);
+ if (!b)
+ return NULL;
+
+ b->bi_destructor = bio_fs_destructor;
+ __bio_clone(b, bio);
+
+ if (bio_integrity(bio)) {
+ int ret;
+
+ ret = bio_integrity_clone(b, bio, fs_bio_set);
+
+ if (ret < 0)
+ return NULL;
}
return b;
@@ -333,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
if (page == prev->bv_page &&
offset == prev->bv_offset + prev->bv_len) {
prev->bv_len += len;
- if (q->merge_bvec_fn &&
- q->merge_bvec_fn(q, bio, prev) < len) {
- prev->bv_len -= len;
- return 0;
+
+ if (q->merge_bvec_fn) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_sector,
+ .bi_size = bio->bi_size,
+ .bi_rw = bio->bi_rw,
+ };
+
+ if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+ prev->bv_len -= len;
+ return 0;
+ }
}
goto done;
@@ -377,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
* queue to get further control
*/
if (q->merge_bvec_fn) {
+ struct bvec_merge_data bvm = {
+ .bi_bdev = bio->bi_bdev,
+ .bi_sector = bio->bi_sector,
+ .bi_size = bio->bi_size,
+ .bi_rw = bio->bi_rw,
+ };
+
/*
* merge_bvec_fn() returns number of bytes it can accept
* at this offset
*/
- if (q->merge_bvec_fn(q, bio, bvec) < len) {
+ if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
bvec->bv_page = NULL;
bvec->bv_len = 0;
bvec->bv_offset = 0;
@@ -1249,6 +1257,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
bp->bio1.bi_private = bi;
bp->bio2.bi_private = pool;
+ if (bio_integrity(bi))
+ bio_integrity_split(bi, bp, first_sectors);
+
return bp;
}
@@ -1290,6 +1301,7 @@ void bioset_free(struct bio_set *bs)
if (bs->bio_pool)
mempool_destroy(bs->bio_pool);
+ bioset_integrity_free(bs);
biovec_free_pools(bs);
kfree(bs);
@@ -1306,6 +1318,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
if (!bs->bio_pool)
goto bad;
+ if (bioset_integrity_create(bs, bio_pool_size))
+ goto bad;
+
if (!biovec_create_pools(bs, bvec_pool_size))
return bs;
@@ -1332,6 +1347,7 @@ static int __init init_bio(void)
{
bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+ bio_integrity_init_slab();
biovec_init_slabs();
fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
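The bvec_merge_data change means a device's merge_bvec_fn now receives a snapshot of the proposed bio geometry rather than the bio itself, so it can be consulted before a bio even exists. A hedged sketch of a callback that refuses to let a segment cross a hypothetical 64 KiB device boundary:

	static int example_merge_bvec(struct request_queue *q,
				      struct bvec_merge_data *bvm,
				      struct bio_vec *biovec)
	{
		/* end of the bio as currently proposed, in 512 B sectors */
		sector_t sector = bvm->bi_sector + (bvm->bi_size >> 9);
		unsigned int max;

		/* bytes remaining before the 64 KiB (128-sector) boundary */
		max = (128 - ((unsigned int)sector & 127)) << 9;
		if (max < biovec->bv_len)
			return max;		/* accept only part of it */
		return biovec->bv_len;		/* accept the whole segment */
	}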
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7d822fae7765..10d8a0aa871a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -12,6 +12,7 @@
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/smp_lock.h>
+#include <linux/device_cgroup.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
@@ -928,9 +929,22 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part)
{
struct module *owner = NULL;
struct gendisk *disk;
- int ret = -ENXIO;
+ int ret;
int part;
+ int perm = 0;
+
+ if (file->f_mode & FMODE_READ)
+ perm |= MAY_READ;
+ if (file->f_mode & FMODE_WRITE)
+ perm |= MAY_WRITE;
+ /*
+ * hooks: /n/, see "layering violations".
+ */
+ ret = devcgroup_inode_permission(bdev->bd_inode, perm);
+ if (ret != 0)
+ return ret;
+ ret = -ENXIO;
file->f_mapping = bdev->bd_inode->i_mapping;
lock_kernel();
disk = get_gendisk(bdev->bd_dev, &part);
diff --git a/fs/buffer.c b/fs/buffer.c
index a073f3f4f013..5fa1512cd9a2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -821,7 +821,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
* contents - it is a noop if I/O is still in
* flight on potentially older contents.
*/
- ll_rw_block(SWRITE, 1, &bh);
+ ll_rw_block(SWRITE_SYNC, 1, &bh);
brelse(bh);
spin_lock(lock);
}
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+ } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+ buffer_dirty(bh)) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, block, bh, 1);
if (err)
goto recover;
+ clear_buffer_delay(bh);
if (buffer_new(bh)) {
/* blockdev mappings never come here */
clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
bh = head;
/* Recovery: lock and submit the mapped buffers */
do {
- if (buffer_mapped(bh) && buffer_dirty(bh)) {
+ if (buffer_mapped(bh) && buffer_dirty(bh) &&
+ !buffer_delay(bh)) {
lock_buffer(bh);
mark_buffer_async_write(bh);
} else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
+ int i_size_changed = 0;
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
*/
if (pos+copied > inode->i_size) {
i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
+ i_size_changed = 1;
}
unlock_page(page);
page_cache_release(page);
+ /*
+ * Don't mark the inode dirty under page lock. First, it unnecessarily
+ * makes the holding time of page lock longer. Second, it forces lock
+ * ordering of page lock and transaction start for journaling
+ * filesystems.
+ */
+ if (i_size_changed)
+ mark_inode_dirty(inode);
+
return copied;
}
EXPORT_SYMBOL(generic_write_end);
@@ -2940,16 +2953,19 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
for (i = 0; i < nr; i++) {
struct buffer_head *bh = bhs[i];
- if (rw == SWRITE)
+ if (rw == SWRITE || rw == SWRITE_SYNC)
lock_buffer(bh);
else if (test_set_buffer_locked(bh))
continue;
- if (rw == WRITE || rw == SWRITE) {
+ if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
- submit_bh(WRITE, bh);
+ if (rw == SWRITE_SYNC)
+ submit_bh(WRITE_SYNC, bh);
+ else
+ submit_bh(WRITE, bh);
continue;
}
} else {
@@ -2978,7 +2994,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
if (test_clear_buffer_dirty(bh)) {
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
- ret = submit_bh(WRITE, bh);
+ ret = submit_bh(WRITE_SYNC, bh);
wait_on_buffer(bh);
if (buffer_eopnotsupp(bh)) {
clear_buffer_eopnotsupp(bh);
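
SWRITE_SYNC gives callers that immediately wait on the buffer a way to tag the write as synchronous for the I/O scheduler, as fsync_buffers_list() now does. The expected pattern, sketched for a single dirty buffer head with error handling abbreviated:

    ll_rw_block(SWRITE_SYNC, 1, &bh);   /* locks bh, submits WRITE_SYNC */
    wait_on_buffer(bh);
    if (!buffer_uptodate(bh))
            err = -EIO;
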
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 68e510b88457..3cb7cda3d780 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
return -ENXIO;
new = container_of(kobj, struct cdev, kobj);
spin_lock(&cdev_lock);
+ /* Check i_cdev again in case somebody beat us to it while
+ we dropped the lock. */
p = inode->i_cdev;
if (!p) {
inode->i_cdev = p = new;
@@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
cdev_put(p);
return -ENXIO;
}
- if (filp->f_op->open) {
- lock_kernel();
+ if (filp->f_op->open)
ret = filp->f_op->open(inode,filp);
- unlock_kernel();
- }
if (ret)
cdev_put(p);
return ret;
diff --git a/fs/cifs/AUTHORS b/fs/cifs/AUTHORS
index 8848e4dfa026..9c136d7803d9 100644
--- a/fs/cifs/AUTHORS
+++ b/fs/cifs/AUTHORS
@@ -36,6 +36,7 @@ Miklos Szeredi
Kazeon team for various fixes especially for 2.4 version.
Asser Ferno (Change Notify support)
Shaggy (Dave Kleikamp) for innumerable small fs suggestions and some good cleanup
+Igor Mammedov (DFS support)
Test case and Bug Report contributors
-------------------------------------
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 8355e918fddf..1f3465201fdf 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,5 +1,12 @@
Version 1.53
------------
+DFS support added (Microsoft Distributed File System client support needed
+for referrals which enable a hierarchical name space among servers).
+Disable temporary caching of mode bits to servers which do not support
+storing of mode (e.g. Windows servers, when the client mounts without the
+cifsacl mount option) and add a new "dynperm" mount option to re-enable
+temporary caching of mode (the old behavior). Fix hang on mount caused
+when the server drops the tcp session during protocol negotiation.
Version 1.52
------------
@@ -12,7 +19,8 @@ Add ability to modify cifs acls for handling chmod (when mounted with
cifsacl flag). Fix prefixpath path separator so we can handle mounts
with prefixpaths longer than one directory (one path component) when
mounted to Windows servers. Fix slow file open when cifsacl
-enabled.
+enabled. Fix memory leak in FindNext when the SMB call returns -EBADF.
+
Version 1.51
------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 621aa1a85971..2bd6fe556f88 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -483,6 +483,11 @@ A partial list of the supported mount options follows:
sign Must use packet signing (helps avoid unwanted data modification
by intermediate systems in the route). Note that signing
does not work with lanman or plaintext authentication.
+ seal Must seal (encrypt) all data on this mounted share before
+ sending on the network. Requires support for Unix Extensions.
+ Note that this differs from the sign mount option in that it
+ causes encryption of data sent over this mounted share but other
+ shares mounted to the same server are unaffected.
sec Security mode. Allowed values are:
none attempt to connect as a null user (no name)
krb5 Use Kerberos version 5 authentication
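
As an illustration of the new option (share name, mount point, and credentials are hypothetical, and a real mount normally goes through the mount.cifs helper, which adds further options such as the server address), a program could request sealing via the options string passed to mount(2):

    #include <sys/mount.h>

    mount("//server/share", "/mnt/cifs", "cifs", 0,
          "user=guest,seal");
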
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
index 92c9feac440f..5aff46c61e52 100644
--- a/fs/cifs/TODO
+++ b/fs/cifs/TODO
@@ -1,4 +1,4 @@
-Version 1.52 January 3, 2008
+Version 1.53 May 20, 2008
A Partial List of Missing Features
==================================
@@ -20,20 +20,21 @@ d) Cleanup now unneeded SessSetup code in
fs/cifs/connect.c and add back in NTLMSSP code if any servers
need it
-e) ms-dfs and ms-dfs host name resolution cleanup
-
-f) fix NTLMv2 signing when two mounts with different users to same
+e) fix NTLMv2 signing when two mounts with different users to same
server.
-g) Directory entry caching relies on a 1 second timer, rather than
+f) Directory entry caching relies on a 1 second timer, rather than
using FindNotify or equivalent. - (started)
-h) quota support (needs minor kernel change since quota calls
+g) quota support (needs minor kernel change since quota calls
to make it to network filesystems or deviceless filesystems)
-i) investigate sync behavior (including syncpage) and check
+h) investigate sync behavior (including syncpage) and check
for proper behavior of intr/nointr
+i) improve support for very old servers (OS/2 and Win9x for example),
+including support for changing the time remotely (utimes command).
+
j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
extra copy in/out of the socket buffers in some cases.
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index cb52cbbe45ff..f58e41d3ba48 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -186,6 +186,11 @@ asn1_length_decode(struct asn1_ctx *ctx, unsigned int *def, unsigned int *len)
}
}
}
+
+ /* don't trust len bigger than ctx buffer */
+ if (*len > ctx->end - ctx->pointer)
+ return 0;
+
return 1;
}
@@ -203,6 +208,10 @@ asn1_header_decode(struct asn1_ctx *ctx,
if (!asn1_length_decode(ctx, &def, &len))
return 0;
+ /* primitive shall be definite, indefinite shall be constructed */
+ if (*con == ASN1_PRI && !def)
+ return 0;
+
if (def)
*eoc = ctx->pointer + len;
else
@@ -389,6 +398,11 @@ asn1_oid_decode(struct asn1_ctx *ctx,
unsigned long *optr;
size = eoc - ctx->pointer + 1;
+
+ /* first subid actually encodes first two subids */
+ if (size < 2 || size > ULONG_MAX/sizeof(unsigned long))
+ return 0;
+
*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
if (*oid == NULL)
return 0;
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index f6fdecf6598c..d82374c9e329 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -219,53 +219,6 @@ static struct vfsmount *cifs_dfs_do_refmount(const struct vfsmount *mnt_parent,
}
-static char *build_full_dfs_path_from_dentry(struct dentry *dentry)
-{
- char *full_path = NULL;
- char *search_path;
- char *tmp_path;
- size_t l_max_len;
- struct cifs_sb_info *cifs_sb;
-
- if (dentry->d_inode == NULL)
- return NULL;
-
- cifs_sb = CIFS_SB(dentry->d_inode->i_sb);
-
- if (cifs_sb->tcon == NULL)
- return NULL;
-
- search_path = build_path_from_dentry(dentry);
- if (search_path == NULL)
- return NULL;
-
- if (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS) {
- int i;
- /* we should use full path name for correct working with DFS */
- l_max_len = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE+1) +
- strnlen(search_path, MAX_PATHCONF) + 1;
- tmp_path = kmalloc(l_max_len, GFP_KERNEL);
- if (tmp_path == NULL) {
- kfree(search_path);
- return NULL;
- }
- strncpy(tmp_path, cifs_sb->tcon->treeName, l_max_len);
- tmp_path[l_max_len-1] = 0;
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
- for (i = 0; i < l_max_len; i++) {
- if (tmp_path[i] == '\\')
- tmp_path[i] = '/';
- }
- strncat(tmp_path, search_path, l_max_len - strlen(tmp_path));
-
- full_path = tmp_path;
- kfree(search_path);
- } else {
- full_path = search_path;
- }
- return full_path;
-}
-
static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
struct list_head *mntlist)
{
@@ -333,7 +286,7 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
goto out_err;
}
- full_path = build_full_dfs_path_from_dentry(dentry);
+ full_path = build_path_from_dentry(dentry);
if (full_path == NULL) {
rc = -ENOMEM;
goto out_err;
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 8ad2330ba061..877c85409f1f 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -30,6 +30,7 @@
#define CIFS_MOUNT_CIFS_ACL 0x200 /* send ACL requests to non-POSIX srv */
#define CIFS_MOUNT_OVERR_UID 0x400 /* override uid returned from server */
#define CIFS_MOUNT_OVERR_GID 0x800 /* override gid returned from server */
+#define CIFS_MOUNT_DYNPERM 0x1000 /* allow in-memory only mode setting */
struct cifs_sb_info {
struct cifsTconInfo *tcon; /* primary mount */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 6653e29637a7..7013aaff6aed 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -119,6 +119,9 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
dp = description + strlen(description);
sprintf(dp, ";uid=0x%x", sesInfo->linux_uid);
+ dp = description + strlen(description);
+ sprintf(dp, ";user=%s", sesInfo->userName);
+
cFYI(1, ("key description = %s", description));
spnego_key = request_key(&cifs_spnego_key_type, description, "");
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 34902cff5400..0e9fc2ba90ee 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -34,11 +34,11 @@
static struct cifs_wksid wksidarr[NUM_WK_SIDS] = {
{{1, 0, {0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0} }, "null user"},
{{1, 1, {0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 0} }, "nobody"},
- {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
- {{1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(544), 0, 0, 0} }, "root"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(545), 0, 0, 0} }, "users"},
- {{1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(32), cpu_to_le32(546), 0, 0, 0} }, "guest"} }
+ {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11), 0, 0, 0, 0} }, "net-users"},
+ {{1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(18), 0, 0, 0, 0} }, "sys"},
+ {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(544), 0, 0, 0} }, "root"},
+ {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(545), 0, 0, 0} }, "users"},
+ {{1, 2, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(32), __constant_cpu_to_le32(546), 0, 0, 0} }, "guest"} }
;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 427a7c695896..22857c639df5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1,7 +1,7 @@
/*
* fs/cifs/cifsfs.c
*
- * Copyright (C) International Business Machines Corp., 2002,2007
+ * Copyright (C) International Business Machines Corp., 2002,2008
* Author(s): Steve French (sfrench@us.ibm.com)
*
* Common Internet FileSystem (CIFS) client
@@ -97,9 +97,6 @@ cifs_read_super(struct super_block *sb, void *data,
{
struct inode *inode;
struct cifs_sb_info *cifs_sb;
-#ifdef CONFIG_CIFS_DFS_UPCALL
- int len;
-#endif
int rc = 0;
/* BB should we make this contingent on mount parm? */
@@ -117,15 +114,17 @@ cifs_read_super(struct super_block *sb, void *data,
* complex operation (mount), and in case of fail
* just exit instead of doing mount and attempting
* undo it if this copy fails?*/
- len = strlen(data);
- cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
- if (cifs_sb->mountdata == NULL) {
- kfree(sb->s_fs_info);
- sb->s_fs_info = NULL;
- return -ENOMEM;
+ if (data) {
+ int len = strlen(data);
+ cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL);
+ if (cifs_sb->mountdata == NULL) {
+ kfree(sb->s_fs_info);
+ sb->s_fs_info = NULL;
+ return -ENOMEM;
+ }
+ strncpy(cifs_sb->mountdata, data, len + 1);
+ cifs_sb->mountdata[len] = '\0';
}
- strncpy(cifs_sb->mountdata, data, len + 1);
- cifs_sb->mountdata[len] = '\0';
#endif
rc = cifs_mount(sb, cifs_sb, data, devname);
@@ -353,9 +352,41 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) ||
!(cifs_sb->tcon->unix_ext))
seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
+ if (!cifs_sb->tcon->unix_ext) {
+ seq_printf(s, ",file_mode=0%o,dir_mode=0%o",
+ cifs_sb->mnt_file_mode,
+ cifs_sb->mnt_dir_mode);
+ }
+ if (cifs_sb->tcon->seal)
+ seq_printf(s, ",seal");
+ if (cifs_sb->tcon->nocase)
+ seq_printf(s, ",nocase");
+ if (cifs_sb->tcon->retry)
+ seq_printf(s, ",hard");
}
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
seq_printf(s, ",posixpaths");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)
+ seq_printf(s, ",setuids");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+ seq_printf(s, ",serverino");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DIRECT_IO)
+ seq_printf(s, ",directio");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
+ seq_printf(s, ",nouser_xattr");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
+ seq_printf(s, ",mapchars");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)
+ seq_printf(s, ",sfu");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+ seq_printf(s, ",nobrl");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
+ seq_printf(s, ",cifsacl");
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)
+ seq_printf(s, ",dynperm");
+ if (m->mnt_sb->s_flags & MS_POSIXACL)
+ seq_printf(s, ",acl");
+
seq_printf(s, ",rsize=%d", cifs_sb->rsize);
seq_printf(s, ",wsize=%d", cifs_sb->wsize);
}
@@ -581,7 +612,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
if (retval < 0)
return (loff_t)retval;
}
- return remote_llseek(file, offset, origin);
+ return generic_file_llseek_unlocked(file, offset, origin);
}
struct file_system_type cifs_fs_type = {
@@ -657,7 +688,7 @@ const struct file_operations cifs_file_ops = {
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
#ifdef CONFIG_CIFS_POSIX
- .ioctl = cifs_ioctl,
+ .unlocked_ioctl = cifs_ioctl,
#endif /* CONFIG_CIFS_POSIX */
#ifdef CONFIG_CIFS_EXPERIMENTAL
@@ -677,7 +708,7 @@ const struct file_operations cifs_file_direct_ops = {
.flush = cifs_flush,
.splice_read = generic_file_splice_read,
#ifdef CONFIG_CIFS_POSIX
- .ioctl = cifs_ioctl,
+ .unlocked_ioctl = cifs_ioctl,
#endif /* CONFIG_CIFS_POSIX */
.llseek = cifs_llseek,
#ifdef CONFIG_CIFS_EXPERIMENTAL
@@ -697,7 +728,7 @@ const struct file_operations cifs_file_nobrl_ops = {
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
#ifdef CONFIG_CIFS_POSIX
- .ioctl = cifs_ioctl,
+ .unlocked_ioctl = cifs_ioctl,
#endif /* CONFIG_CIFS_POSIX */
#ifdef CONFIG_CIFS_EXPERIMENTAL
@@ -716,7 +747,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.flush = cifs_flush,
.splice_read = generic_file_splice_read,
#ifdef CONFIG_CIFS_POSIX
- .ioctl = cifs_ioctl,
+ .unlocked_ioctl = cifs_ioctl,
#endif /* CONFIG_CIFS_POSIX */
.llseek = cifs_llseek,
#ifdef CONFIG_CIFS_EXPERIMENTAL
@@ -731,7 +762,7 @@ const struct file_operations cifs_dir_ops = {
#ifdef CONFIG_CIFS_EXPERIMENTAL
.dir_notify = cifs_dir_notify,
#endif /* CONFIG_CIFS_EXPERIMENTAL */
- .ioctl = cifs_ioctl,
+ .unlocked_ioctl = cifs_ioctl,
};
static void
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index cd1301a09b3b..25a6cbd15529 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -95,8 +95,7 @@ extern int cifs_setxattr(struct dentry *, const char *, const void *,
size_t, int);
extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t);
extern ssize_t cifs_listxattr(struct dentry *, char *, size_t);
-extern int cifs_ioctl(struct inode *inode, struct file *filep,
- unsigned int command, unsigned long arg);
+extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
#ifdef CONFIG_CIFS_EXPERIMENTAL
extern const struct export_operations cifs_export_ops;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b7d9f698e63e..9cfcf326ead3 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -281,6 +281,7 @@ struct cifsTconInfo {
bool ipc:1; /* set if connection to IPC$ eg for RPC/PIPES */
bool retry:1;
bool nocase:1;
+ bool seal:1; /* transport encryption for this mounted share */
bool unix_ext:1; /* if false disable Linux extensions to CIFS protocol
for this mount even if server would support */
/* BB add field for back pointer to sb struct(s)? */
@@ -332,7 +333,6 @@ struct cifsFileInfo {
bool messageMode:1; /* for pipes: message vs byte mode */
atomic_t wrtPending; /* handle in use - defer close */
struct semaphore fh_sem; /* prevents reopen race after dead ses*/
- char *search_resume_name; /* BB removeme BB */
struct cifs_search_info srch_inf;
};
@@ -625,7 +625,7 @@ GLOBAL_EXTERN atomic_t tcpSesAllocCount;
GLOBAL_EXTERN atomic_t tcpSesReconnectCount;
GLOBAL_EXTERN atomic_t tconInfoReconnectCount;
-/* Various Debug counters to remove someday (BB) */
+/* Various Debug counters */
GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */
#ifdef CONFIG_CIFS_STATS2
GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index c43bf4b7a556..0f327c224da3 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -79,6 +79,19 @@
#define TRANS2_GET_DFS_REFERRAL 0x10
#define TRANS2_REPORT_DFS_INCOSISTENCY 0x11
+/* SMB Transact (Named Pipe) subcommand codes */
+#define TRANS_SET_NMPIPE_STATE 0x0001
+#define TRANS_RAW_READ_NMPIPE 0x0011
+#define TRANS_QUERY_NMPIPE_STATE 0x0021
+#define TRANS_QUERY_NMPIPE_INFO 0x0022
+#define TRANS_PEEK_NMPIPE 0x0023
+#define TRANS_TRANSACT_NMPIPE 0x0026
+#define TRANS_RAW_WRITE_NMPIPE 0x0031
+#define TRANS_READ_NMPIPE 0x0036
+#define TRANS_WRITE_NMPIPE 0x0037
+#define TRANS_WAIT_NMPIPE 0x0053
+#define TRANS_CALL_NMPIPE 0x0054
+
/* NT Transact subcommand codes */
#define NT_TRANSACT_CREATE 0x01
#define NT_TRANSACT_IOCTL 0x02
@@ -328,12 +341,13 @@
#define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */
#define CREATE_NO_EA_KNOWLEDGE 0x00000200
#define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete
- open for recovery flag - should
- be zero */
+ "open for recovery" flag - should
+ be zero in any case */
+#define CREATE_OPEN_FOR_RECOVERY 0x00000400
#define CREATE_RANDOM_ACCESS 0x00000800
#define CREATE_DELETE_ON_CLOSE 0x00001000
#define CREATE_OPEN_BY_ID 0x00002000
-#define CREATE_OPEN_BACKUP_INTN 0x00004000
+#define CREATE_OPEN_BACKUP_INTENT 0x00004000
#define CREATE_NO_COMPRESSION 0x00008000
#define CREATE_RESERVE_OPFILTER 0x00100000 /* should be zero */
#define OPEN_REPARSE_POINT 0x00200000
@@ -722,7 +736,6 @@ typedef struct smb_com_tconx_rsp_ext {
#define SMB_CSC_CACHE_AUTO_REINT 0x0004
#define SMB_CSC_CACHE_VDO 0x0008
#define SMB_CSC_NO_CACHING 0x000C
-
#define SMB_UNIQUE_FILE_NAME 0x0010
#define SMB_EXTENDED_SIGNATURES 0x0020
@@ -806,7 +819,7 @@ typedef struct smb_com_findclose_req {
#define ICOUNT_MASK 0x00FF
#define PIPE_READ_MODE 0x0100
#define NAMED_PIPE_TYPE 0x0400
-#define PIPE_END_POINT 0x0800
+#define PIPE_END_POINT 0x4000
#define BLOCKING_NAMED_PIPE 0x8000
typedef struct smb_com_open_req { /* also handles create */
@@ -1904,19 +1917,26 @@ typedef struct smb_com_transaction2_get_dfs_refer_req {
char RequestFileName[1];
} __attribute__((packed)) TRANSACTION2_GET_DFS_REFER_REQ;
+#define DFS_VERSION cpu_to_le16(0x0003)
+
+/* DFS server target type */
+#define DFS_TYPE_LINK 0x0000 /* also for sysvol targets */
+#define DFS_TYPE_ROOT 0x0001
+
+/* Referral Entry Flags */
+#define DFS_NAME_LIST_REF 0x0200
+
typedef struct dfs_referral_level_3 {
__le16 VersionNumber;
- __le16 ReferralSize;
- __le16 ServerType; /* 0x0001 = CIFS server */
- __le16 ReferralFlags; /* or proximity - not clear which since it is
- always set to zero - SNIA spec says 0x01
- means strip off PathConsumed chars before
- submitting RequestFileName to remote node */
- __le16 TimeToLive;
- __le16 Proximity;
+ __le16 Size;
+ __le16 ServerType; /* 0x0001 = root targets; 0x0000 = link targets */
+ __le16 ReferralEntryFlags; /* 0x0200 bit set only for domain
+ or DC referral response */
+ __le32 TimeToLive;
__le16 DfsPathOffset;
__le16 DfsAlternatePathOffset;
- __le16 NetworkAddressOffset;
+ __le16 NetworkAddressOffset; /* offset of the link target */
+ __le16 ServiceSiteGuid;
} __attribute__((packed)) REFERRAL3;
typedef struct smb_com_transaction_get_dfs_refer_rsp {
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index d481f6c5a2be..b9f5e935f821 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -93,7 +93,7 @@ extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
extern int cifs_get_inode_info(struct inode **pinode,
const unsigned char *search_path,
- FILE_ALL_INFO * pfile_info,
+ FILE_ALL_INFO *pfile_info,
struct super_block *sb, int xid, const __u16 *pfid);
extern int cifs_get_inode_info_unix(struct inode **pinode,
const unsigned char *search_path,
@@ -130,7 +130,7 @@ extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
const unsigned char *searchName,
- FILE_ALL_INFO * findData,
+ FILE_ALL_INFO *findData,
int legacy /* whether to use old info level */,
const struct nls_table *nls_codepage, int remap);
extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
@@ -141,18 +141,15 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
extern int CIFSSMBUnixQPathInfo(const int xid,
struct cifsTconInfo *tcon,
const unsigned char *searchName,
- FILE_UNIX_BASIC_INFO * pFindData,
+ FILE_UNIX_BASIC_INFO *pFindData,
const struct nls_table *nls_codepage, int remap);
extern int CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
const unsigned char *searchName,
- unsigned char **targetUNCs,
- unsigned int *number_of_UNC_in_array,
+ struct dfs_info3_param **target_nodes,
+ unsigned int *number_of_nodes_in_array,
const struct nls_table *nls_codepage, int remap);
-extern int connect_to_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
- const char *old_path,
- const struct nls_table *nls_codepage, int remap);
extern int get_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
const char *old_path,
const struct nls_table *nls_codepage,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 95fbba4ea7d4..4511b708f0f3 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -81,6 +81,40 @@ static struct {
#endif /* CONFIG_CIFS_WEAK_PW_HASH */
#endif /* CIFS_POSIX */
+/* Allocates a buffer into dst and copies an smb string from src to it.
+ * The caller is responsible for freeing dst if the function returns 0.
+ * returns:
+ * on success - 0
+ * on failure - errno
+ */
+static int
+cifs_strncpy_to_host(char **dst, const char *src, const int maxlen,
+ const bool is_unicode, const struct nls_table *nls_codepage)
+{
+ int plen;
+
+ if (is_unicode) {
+ plen = UniStrnlen((wchar_t *)src, maxlen);
+ *dst = kmalloc(plen + 2, GFP_KERNEL);
+ if (!*dst)
+ goto cifs_strncpy_to_host_ErrExit;
+ cifs_strfromUCS_le(*dst, (__le16 *)src, plen, nls_codepage);
+ } else {
+ plen = strnlen(src, maxlen);
+ *dst = kmalloc(plen + 2, GFP_KERNEL);
+ if (!*dst)
+ goto cifs_strncpy_to_host_ErrExit;
+ strncpy(*dst, src, plen);
+ }
+ (*dst)[plen] = 0;
+ (*dst)[plen+1] = 0; /* harmless for ASCII case, needed for Unicode */
+ return 0;
+
+cifs_strncpy_to_host_ErrExit:
+ cERROR(1, ("Failed to allocate buffer for string\n"));
+ return -ENOMEM;
+}
+
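
Given the contract documented above (the caller frees *dst only when 0 is returned), the typical calling pattern looks like this sketch, where src, max_len, is_unicode and nls_codepage stand in for data pulled from a parsed response:

    char *host = NULL;

    if (cifs_strncpy_to_host(&host, src, max_len,
                             is_unicode, nls_codepage) == 0) {
            /* ... use the NUL-terminated copy ... */
            kfree(host);
    }
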
/* Mark as invalid, all open files on tree connections since they
were closed when session to server was lost */
@@ -1166,6 +1200,20 @@ static __u16 convert_disposition(int disposition)
return ofun;
}
+static int
+access_flags_to_smbopen_mode(const int access_flags)
+{
+ int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE);
+
+ if (masked_flags == GENERIC_READ)
+ return SMBOPEN_READ;
+ else if (masked_flags == GENERIC_WRITE)
+ return SMBOPEN_WRITE;
+
+ /* just go for read/write */
+ return SMBOPEN_READWRITE;
+}
+
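
A quick illustration of the mapping the new helper performs (any combination other than a pure read or pure write falls through to read/write):

    mode = access_flags_to_smbopen_mode(GENERIC_READ);   /* SMBOPEN_READ */
    mode = access_flags_to_smbopen_mode(GENERIC_WRITE);  /* SMBOPEN_WRITE */
    mode = access_flags_to_smbopen_mode(GENERIC_READ |
                                        GENERIC_WRITE);  /* SMBOPEN_READWRITE */
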
int
SMBLegacyOpen(const int xid, struct cifsTconInfo *tcon,
const char *fileName, const int openDisposition,
@@ -1207,13 +1255,7 @@ OldOpenRetry:
pSMB->OpenFlags = cpu_to_le16(REQ_BATCHOPLOCK);
pSMB->OpenFlags |= cpu_to_le16(REQ_MORE_INFO);
- /* BB fixme add conversion for access_flags to bits 0 - 2 of mode */
- /* 0 = read
- 1 = write
- 2 = rw
- 3 = execute
- */
- pSMB->Mode = cpu_to_le16(2);
+ pSMB->Mode = cpu_to_le16(access_flags_to_smbopen_mode(access_flags));
pSMB->Mode |= cpu_to_le16(0x40); /* deny none */
/* set file as system file if special file such
as fifo and server expecting SFU style and
@@ -1247,7 +1289,7 @@ OldOpenRetry:
} else {
/* BB verify if wct == 15 */
-/* *pOplock = pSMBr->OplockLevel; */ /* BB take from action field BB */
+/* *pOplock = pSMBr->OplockLevel; */ /* BB take from action field*/
*netfid = pSMBr->Fid; /* cifs fid stays in le */
/* Let caller know file was created so we can set the mode. */
@@ -1686,7 +1728,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
{
int rc = 0;
LOCK_REQ *pSMB = NULL;
- LOCK_RSP *pSMBr = NULL;
+/* LOCK_RSP *pSMBr = NULL; */ /* No response data other than rc to parse */
int bytes_returned;
int timeout = 0;
__u16 count;
@@ -1697,8 +1739,6 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
if (rc)
return rc;
- pSMBr = (LOCK_RSP *)pSMB; /* BB removeme BB */
-
if (lockType == LOCKING_ANDX_OPLOCK_RELEASE) {
timeout = CIFS_ASYNC_OP; /* no response expected */
pSMB->Timeout = 0;
@@ -1732,7 +1772,7 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon,
if (waitFlag) {
rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
- (struct smb_hdr *) pSMBr, &bytes_returned);
+ (struct smb_hdr *) pSMB, &bytes_returned);
cifs_small_buf_release(pSMB);
} else {
rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *)pSMB,
@@ -1767,7 +1807,7 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
cFYI(1, ("Posix Lock"));
if (pLockData == NULL)
- return EINVAL;
+ return -EINVAL;
rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB);
@@ -1944,7 +1984,7 @@ renameRetry:
/* protocol requires ASCII signature byte on Unicode string */
pSMB->OldFileName[name_len + 1] = 0x00;
name_len2 =
- cifsConvertToUCS((__le16 *) &pSMB->OldFileName[name_len + 2],
+ cifsConvertToUCS((__le16 *)&pSMB->OldFileName[name_len + 2],
toName, PATH_MAX, nls_codepage, remap);
name_len2 += 1 /* trailing null */ + 1 /* Signature word */ ;
name_len2 *= 2; /* convert to bytes */
@@ -2117,8 +2157,7 @@ copyRetry:
cFYI(1, ("Send error in copy = %d with %d files copied",
rc, le16_to_cpu(pSMBr->CopyCount)));
}
- if (pSMB)
- cifs_buf_release(pSMB);
+ cifs_buf_release(pSMB);
if (rc == -EAGAIN)
goto copyRetry;
@@ -2207,8 +2246,7 @@ createSymLinkRetry:
if (rc)
cFYI(1, ("Send error in SetPathInfo create symlink = %d", rc));
- if (pSMB)
- cifs_buf_release(pSMB);
+ cifs_buf_release(pSMB);
if (rc == -EAGAIN)
goto createSymLinkRetry;
@@ -2925,7 +2963,8 @@ setAclRetry:
}
params = 6 + name_len;
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB size from sess */
+ /* BB find max SMB size from sess */
+ pSMB->MaxDataCount = cpu_to_le16(1000);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -3322,7 +3361,8 @@ QPathInfoRetry:
params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
pSMB->TotalDataCount = 0;
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(4000); /* BB find exact max SMB PDU from sess structure BB */
+ /* BB find exact max SMB PDU from sess structure BB */
+ pSMB->MaxDataCount = cpu_to_le16(4000);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -3388,7 +3428,7 @@ QPathInfoRetry:
int
CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
const unsigned char *searchName,
- FILE_UNIX_BASIC_INFO * pFindData,
+ FILE_UNIX_BASIC_INFO *pFindData,
const struct nls_table *nls_codepage, int remap)
{
/* SMB_QUERY_FILE_UNIX_BASIC */
@@ -3679,6 +3719,7 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
if (rc) {
if (rc == -EBADF) {
psrch_inf->endOfSearch = true;
+ cifs_buf_release(pSMB);
rc = 0; /* search probably was closed at end of search*/
} else
cFYI(1, ("FindNext returned = %d", rc));
@@ -3856,25 +3897,112 @@ GetInodeNumOut:
return rc;
}
+/* Parses a DFS referral V3 structure.
+ * The caller is responsible for freeing target_nodes.
+ * returns:
+ * on success - 0
+ * on failure - errno
+ */
+static int
+parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr,
+ unsigned int *num_of_nodes,
+ struct dfs_info3_param **target_nodes,
+ const struct nls_table *nls_codepage)
+{
+ int i, rc = 0;
+ char *data_end;
+ bool is_unicode;
+ struct dfs_referral_level_3 *ref;
+
+ is_unicode = pSMBr->hdr.Flags2 & SMBFLG2_UNICODE;
+ *num_of_nodes = le16_to_cpu(pSMBr->NumberOfReferrals);
+
+ if (*num_of_nodes < 1) {
+ cERROR(1, ("num_referrals: must be at least > 0,"
+ "but we get num_referrals = %d\n", *num_of_nodes));
+ rc = -EINVAL;
+ goto parse_DFS_referrals_exit;
+ }
+
+ ref = (struct dfs_referral_level_3 *) &(pSMBr->referrals);
+ if (ref->VersionNumber != cpu_to_le16(3)) {
+ cERROR(1, ("Referrals of V%d version are not supported,"
+ "should be V3", le16_to_cpu(ref->VersionNumber)));
+ rc = -EINVAL;
+ goto parse_DFS_referrals_exit;
+ }
+
+ /* get the upper boundary of the resp buffer */
+ data_end = (char *)(&(pSMBr->PathConsumed)) +
+ le16_to_cpu(pSMBr->t2.DataCount);
+
+ cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n",
+ *num_of_nodes,
+ le16_to_cpu(pSMBr->DFSFlags)));
+
+ *target_nodes = kzalloc(sizeof(struct dfs_info3_param) *
+ *num_of_nodes, GFP_KERNEL);
+ if (*target_nodes == NULL) {
+ cERROR(1, ("Failed to allocate buffer for target_nodes\n"));
+ rc = -ENOMEM;
+ goto parse_DFS_referrals_exit;
+ }
+
+ /* collect necessary data from referrals */
+ for (i = 0; i < *num_of_nodes; i++) {
+ char *temp;
+ int max_len;
+ struct dfs_info3_param *node = (*target_nodes)+i;
+
+ node->flags = le16_to_cpu(pSMBr->DFSFlags);
+ node->path_consumed = le16_to_cpu(pSMBr->PathConsumed);
+ node->server_type = le16_to_cpu(ref->ServerType);
+ node->ref_flag = le16_to_cpu(ref->ReferralEntryFlags);
+
+ /* copy DfsPath */
+ temp = (char *)ref + le16_to_cpu(ref->DfsPathOffset);
+ max_len = data_end - temp;
+ rc = cifs_strncpy_to_host(&(node->path_name), temp,
+ max_len, is_unicode, nls_codepage);
+ if (rc)
+ goto parse_DFS_referrals_exit;
+
+ /* copy link target UNC */
+ temp = (char *)ref + le16_to_cpu(ref->NetworkAddressOffset);
+ max_len = data_end - temp;
+ rc = cifs_strncpy_to_host(&(node->node_name), temp,
+ max_len, is_unicode, nls_codepage);
+ if (rc)
+ goto parse_DFS_referrals_exit;
+
+ ref += le16_to_cpu(ref->Size);
+ }
+
+parse_DFS_referrals_exit:
+ if (rc) {
+ free_dfs_info_array(*target_nodes, *num_of_nodes);
+ *target_nodes = NULL;
+ *num_of_nodes = 0;
+ }
+ return rc;
+}
+
int
CIFSGetDFSRefer(const int xid, struct cifsSesInfo *ses,
const unsigned char *searchName,
- unsigned char **targetUNCs,
- unsigned int *number_of_UNC_in_array,
+ struct dfs_info3_param **target_nodes,
+ unsigned int *num_of_nodes,
const struct nls_table *nls_codepage, int remap)
{
/* TRANS2_GET_DFS_REFERRAL */
TRANSACTION2_GET_DFS_REFER_REQ *pSMB = NULL;
TRANSACTION2_GET_DFS_REFER_RSP *pSMBr = NULL;
- struct dfs_referral_level_3 *referrals = NULL;
int rc = 0;
int bytes_returned;
int name_len;
- unsigned int i;
- char *temp;
__u16 params, byte_count;
- *number_of_UNC_in_array = 0;
- *targetUNCs = NULL;
+ *num_of_nodes = 0;
+ *target_nodes = NULL;
cFYI(1, ("In GetDFSRefer the path %s", searchName));
if (ses == NULL)
@@ -3921,7 +4049,8 @@ getDFSRetry:
pSMB->DataCount = 0;
pSMB->DataOffset = 0;
pSMB->MaxParameterCount = 0;
- pSMB->MaxDataCount = cpu_to_le16(4000); /* BB find exact max SMB PDU from sess structure BB */
+ /* BB find exact max SMB PDU from sess structure BB */
+ pSMB->MaxDataCount = cpu_to_le16(4000);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -3943,103 +4072,26 @@ getDFSRetry:
(struct smb_hdr *) pSMBr, &bytes_returned, 0);
if (rc) {
cFYI(1, ("Send error in GetDFSRefer = %d", rc));
- } else { /* decode response */
-/* BB Add logic to parse referrals here */
- rc = validate_t2((struct smb_t2_rsp *)pSMBr);
+ goto GetDFSRefExit;
+ }
+ rc = validate_t2((struct smb_t2_rsp *)pSMBr);
- /* BB Also check if enough total bytes returned? */
- if (rc || (pSMBr->ByteCount < 17))
- rc = -EIO; /* bad smb */
- else {
- __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
- __u16 data_count = le16_to_cpu(pSMBr->t2.DataCount);
+ /* BB Also check if enough total bytes returned? */
+ if (rc || (pSMBr->ByteCount < 17)) {
+ rc = -EIO; /* bad smb */
+ goto GetDFSRefExit;
+ }
- cFYI(1,
- ("Decoding GetDFSRefer response BCC: %d Offset %d",
- pSMBr->ByteCount, data_offset));
- referrals =
- (struct dfs_referral_level_3 *)
- (8 /* sizeof start of data block */ +
- data_offset +
- (char *) &pSMBr->hdr.Protocol);
- cFYI(1, ("num_referrals: %d dfs flags: 0x%x ... \n"
- "for referral one refer size: 0x%x srv "
- "type: 0x%x refer flags: 0x%x ttl: 0x%x",
- le16_to_cpu(pSMBr->NumberOfReferrals),
- le16_to_cpu(pSMBr->DFSFlags),
- le16_to_cpu(referrals->ReferralSize),
- le16_to_cpu(referrals->ServerType),
- le16_to_cpu(referrals->ReferralFlags),
- le16_to_cpu(referrals->TimeToLive)));
- /* BB This field is actually two bytes in from start of
- data block so we could do safety check that DataBlock
- begins at address of pSMBr->NumberOfReferrals */
- *number_of_UNC_in_array =
- le16_to_cpu(pSMBr->NumberOfReferrals);
-
- /* BB Fix below so can return more than one referral */
- if (*number_of_UNC_in_array > 1)
- *number_of_UNC_in_array = 1;
-
- /* get the length of the strings describing refs */
- name_len = 0;
- for (i = 0; i < *number_of_UNC_in_array; i++) {
- /* make sure that DfsPathOffset not past end */
- __u16 offset =
- le16_to_cpu(referrals->DfsPathOffset);
- if (offset > data_count) {
- /* if invalid referral, stop here and do
- not try to copy any more */
- *number_of_UNC_in_array = i;
- break;
- }
- temp = ((char *)referrals) + offset;
+ cFYI(1, ("Decoding GetDFSRefer response BCC: %d Offset %d",
+ pSMBr->ByteCount,
+ le16_to_cpu(pSMBr->t2.DataOffset)));
- if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) {
- name_len += UniStrnlen((wchar_t *)temp,
- data_count);
- } else {
- name_len += strnlen(temp, data_count);
- }
- referrals++;
- /* BB add check that referral pointer does
- not fall off end PDU */
- }
- /* BB add check for name_len bigger than bcc */
- *targetUNCs =
- kmalloc(name_len+1+(*number_of_UNC_in_array),
- GFP_KERNEL);
- if (*targetUNCs == NULL) {
- rc = -ENOMEM;
- goto GetDFSRefExit;
- }
- /* copy the ref strings */
- referrals = (struct dfs_referral_level_3 *)
- (8 /* sizeof data hdr */ + data_offset +
- (char *) &pSMBr->hdr.Protocol);
-
- for (i = 0; i < *number_of_UNC_in_array; i++) {
- temp = ((char *)referrals) +
- le16_to_cpu(referrals->DfsPathOffset);
- if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) {
- cifs_strfromUCS_le(*targetUNCs,
- (__le16 *) temp,
- name_len,
- nls_codepage);
- } else {
- strncpy(*targetUNCs, temp, name_len);
- }
- /* BB update target_uncs pointers */
- referrals++;
- }
- temp = *targetUNCs;
- temp[name_len] = 0;
- }
+ /* parse returned result into more usable form */
+ rc = parse_DFS_referrals(pSMBr, num_of_nodes,
+ target_nodes, nls_codepage);
- }
GetDFSRefExit:
- if (pSMB)
- cifs_buf_release(pSMB);
+ cifs_buf_release(pSMB);
if (rc == -EAGAIN)
goto getDFSRetry;
@@ -4229,7 +4281,8 @@ QFSAttributeRetry:
params = 2; /* level */
pSMB->TotalDataCount = 0;
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find exact max SMB PDU from sess structure BB */
+ /* BB find exact max SMB PDU from sess structure BB */
+ pSMB->MaxDataCount = cpu_to_le16(1000);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -4298,7 +4351,8 @@ QFSDeviceRetry:
params = 2; /* level */
pSMB->TotalDataCount = 0;
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find exact max SMB PDU from sess structure BB */
+ /* BB find exact max SMB PDU from sess structure BB */
+ pSMB->MaxDataCount = cpu_to_le16(1000);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -4369,7 +4423,8 @@ QFSUnixRetry:
pSMB->DataCount = 0;
pSMB->DataOffset = 0;
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(100); /* BB find exact max SMB PDU from sess structure BB */
+ /* BB find exact max SMB PDU from sess structure BB */
+ pSMB->MaxDataCount = cpu_to_le16(100);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -4444,7 +4499,8 @@ SETFSUnixRetry:
offset = param_offset + params;
pSMB->MaxParameterCount = cpu_to_le16(4);
- pSMB->MaxDataCount = cpu_to_le16(100); /* BB find exact max SMB PDU from sess structure BB */
+ /* BB find exact max SMB PDU from sess structure BB */
+ pSMB->MaxDataCount = cpu_to_le16(100);
pSMB->SetupCount = 1;
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FS_INFORMATION);
@@ -4512,7 +4568,8 @@ QFSPosixRetry:
pSMB->DataCount = 0;
pSMB->DataOffset = 0;
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(100); /* BB find exact max SMB PDU from sess structure BB */
+ /* BB find exact max SMB PDU from sess structure BB */
+ pSMB->MaxDataCount = cpu_to_le16(100);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -4702,7 +4759,8 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
count = sizeof(struct file_end_of_file_info);
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB PDU from sess */
+ /* BB find exact max SMB PDU from sess structure BB */
+ pSMB->MaxDataCount = cpu_to_le16(1000);
pSMB->SetupCount = 1;
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
@@ -4789,7 +4847,8 @@ CIFSSMBSetFileTimes(const int xid, struct cifsTconInfo *tcon,
count = sizeof(FILE_BASIC_INFO);
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB PDU from sess */
+ /* BB find max SMB PDU from sess */
+ pSMB->MaxDataCount = cpu_to_le16(1000);
pSMB->SetupCount = 1;
pSMB->Reserved3 = 0;
pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION);
@@ -4856,7 +4915,8 @@ SetTimesRetry:
params = 6 + name_len;
count = sizeof(FILE_BASIC_INFO);
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find exact max SMB PDU from sess structure BB */
+ /* BB find max SMB PDU from sess structure BB */
+ pSMB->MaxDataCount = cpu_to_le16(1000);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -4986,7 +5046,8 @@ setPermsRetry:
params = 6 + name_len;
count = sizeof(FILE_UNIX_BASIC_INFO);
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find exact max SMB PDU from sess structure BB */
+ /* BB find max SMB PDU from sess structure BB */
+ pSMB->MaxDataCount = cpu_to_le16(1000);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -5051,8 +5112,7 @@ setPermsRetry:
if (rc)
cFYI(1, ("SetPathInfo (perms) returned %d", rc));
- if (pSMB)
- cifs_buf_release(pSMB);
+ cifs_buf_release(pSMB);
if (rc == -EAGAIN)
goto setPermsRetry;
return rc;
@@ -5169,7 +5229,8 @@ QAllEAsRetry:
params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
pSMB->TotalDataCount = 0;
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(4000); /* BB find exact max SMB PDU from sess structure BB */
+ /* BB find exact max SMB PDU from sess structure BB */
+ pSMB->MaxDataCount = cpu_to_le16(4000);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -5273,8 +5334,7 @@ QAllEAsRetry:
}
}
}
- if (pSMB)
- cifs_buf_release(pSMB);
+ cifs_buf_release(pSMB);
if (rc == -EAGAIN)
goto QAllEAsRetry;
@@ -5317,7 +5377,8 @@ QEARetry:
params = 2 /* level */ + 4 /* reserved */ + name_len /* includes NUL */;
pSMB->TotalDataCount = 0;
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(4000); /* BB find exact max SMB PDU from sess structure BB */
+ /* BB find exact max SMB PDU from sess structure BB */
+ pSMB->MaxDataCount = cpu_to_le16(4000);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
@@ -5422,8 +5483,7 @@ QEARetry:
}
}
}
- if (pSMB)
- cifs_buf_release(pSMB);
+ cifs_buf_release(pSMB);
if (rc == -EAGAIN)
goto QEARetry;
@@ -5475,7 +5535,8 @@ SetEARetry:
count = sizeof(*parm_data) + ea_value_len + name_len;
pSMB->MaxParameterCount = cpu_to_le16(2);
- pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB size from sess */
+ /* BB find max SMB PDU from sess */
+ pSMB->MaxDataCount = cpu_to_le16(1000);
pSMB->MaxSetupCount = 0;
pSMB->Reserved = 0;
pSMB->Flags = 0;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f428bf3bf1a9..e8fa46c7cff2 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -60,7 +60,7 @@ struct smb_vol {
char *domainname;
char *UNC;
char *UNCip;
- char *in6_addr; /* ipv6 address as human readable form of in6_addr */
+ char *in6_addr; /* ipv6 address as human readable form of in6_addr */
char *iocharset; /* local code page for mapping to and from Unicode */
char source_rfc1001_name[16]; /* netbios name of client */
char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
@@ -75,19 +75,21 @@ struct smb_vol {
bool setuids:1;
bool override_uid:1;
bool override_gid:1;
+ bool dynperm:1;
bool noperm:1;
bool no_psx_acl:1; /* set if posix acl support should be disabled */
bool cifs_acl:1;
bool no_xattr:1; /* set if xattr (EA) support should be disabled*/
bool server_ino:1; /* use inode numbers from server ie UniqueId */
bool direct_io:1;
- bool remap:1; /* set to remap seven reserved chars in filenames */
- bool posix_paths:1; /* unset to not ask for posix pathnames. */
+ bool remap:1; /* set to remap seven reserved chars in filenames */
+ bool posix_paths:1; /* unset to not ask for posix pathnames. */
bool no_linux_ext:1;
bool sfu_emul:1;
- bool nullauth:1; /* attempt to authenticate with null user */
- unsigned nocase; /* request case insensitive filenames */
- unsigned nobrl; /* disable sending byte range locks to srv */
+ bool nullauth:1; /* attempt to authenticate with null user */
+ bool nocase:1; /* request case insensitive filenames */
+ bool nobrl:1; /* disable sending byte range locks to srv */
+ bool seal:1; /* request transport encryption on share */
unsigned int rsize;
unsigned int wsize;
unsigned int sockopt;
@@ -651,6 +653,7 @@ multi_t2_fnd:
spin_lock(&GlobalMid_Lock);
server->tcpStatus = CifsExiting;
spin_unlock(&GlobalMid_Lock);
+ wake_up_all(&server->response_q);
/* don't exit until kthread_stop is called */
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1246,6 +1249,10 @@ cifs_parse_mount_options(char *options, const char *devname,
vol->setuids = 1;
} else if (strnicmp(data, "nosetuids", 9) == 0) {
vol->setuids = 0;
+ } else if (strnicmp(data, "dynperm", 7) == 0) {
+ vol->dynperm = true;
+ } else if (strnicmp(data, "nodynperm", 9) == 0) {
+ vol->dynperm = false;
} else if (strnicmp(data, "nohard", 6) == 0) {
vol->retry = 0;
} else if (strnicmp(data, "nosoft", 6) == 0) {
@@ -1268,8 +1275,12 @@ cifs_parse_mount_options(char *options, const char *devname,
vol->no_psx_acl = 1;
} else if (strnicmp(data, "sign", 4) == 0) {
vol->secFlg |= CIFSSEC_MUST_SIGN;
-/* } else if (strnicmp(data, "seal",4) == 0) {
- vol->secFlg |= CIFSSEC_MUST_SEAL; */
+ } else if (strnicmp(data, "seal", 4) == 0) {
+ /* we do not do the following in secFlags because seal
+ is a per-tree-connection (mount) option, not a per-socket
+ or per-smb-connection option in the protocol */
+ /* vol->secFlg |= CIFSSEC_MUST_SEAL; */
+ vol->seal = 1;
} else if (strnicmp(data, "direct", 6) == 0) {
vol->direct_io = 1;
} else if (strnicmp(data, "forcedirectio", 13) == 0) {
@@ -1414,34 +1425,12 @@ find_unc(__be32 new_target_ip_addr, char *uncName, char *userName)
}
int
-connect_to_dfs_path(int xid, struct cifsSesInfo *pSesInfo,
- const char *old_path, const struct nls_table *nls_codepage,
- int remap)
-{
- struct dfs_info3_param *referrals = NULL;
- unsigned int num_referrals;
- int rc = 0;
-
- rc = get_dfs_path(xid, pSesInfo, old_path, nls_codepage,
- &num_referrals, &referrals, remap);
-
- /* BB Add in code to: if valid refrl, if not ip address contact
- the helper that resolves tcp names, mount to it, try to
- tcon to it unmount it if fail */
-
- kfree(referrals);
-
- return rc;
-}
-
-int
get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
const struct nls_table *nls_codepage, unsigned int *pnum_referrals,
struct dfs_info3_param **preferrals, int remap)
{
char *temp_unc;
int rc = 0;
- unsigned char *targetUNCs;
*pnum_referrals = 0;
*preferrals = NULL;
@@ -1464,7 +1453,7 @@ get_dfs_path(int xid, struct cifsSesInfo *pSesInfo, const char *old_path,
kfree(temp_unc);
}
if (rc == 0)
- rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, &targetUNCs,
+ rc = CIFSGetDFSRefer(xid, pSesInfo, old_path, preferrals,
pnum_referrals, nls_codepage, remap);
/* BB map targetUNCs to dfs_info3 structures, here or
in CIFSGetDFSRefer BB */
@@ -1815,7 +1804,7 @@ convert_delimiter(char *path, char delim)
if (path == NULL)
return;
- if (delim == '/')
+ if (delim == '/')
old_delim = '\\';
else
old_delim = '/';
@@ -2125,11 +2114,17 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
if (volume_info.override_gid)
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID;
+ if (volume_info.dynperm)
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM;
if (volume_info.direct_io) {
cFYI(1, ("mounting share using direct i/o"));
cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO;
}
+ if ((volume_info.cifs_acl) && (volume_info.dynperm))
+ cERROR(1, ("mount option dynperm ignored if cifsacl "
+ "mount option supported"));
+
tcon =
find_unc(sin_server.sin_addr.s_addr, volume_info.UNC,
volume_info.username);
@@ -2141,6 +2136,9 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
for the retry flag is used */
tcon->retry = volume_info.retry;
tcon->nocase = volume_info.nocase;
+ if (tcon->seal != volume_info.seal)
+ cERROR(1, ("transport encryption setting "
+ "conflicts with existing tid"));
} else {
tcon = tconInfoAlloc();
if (tcon == NULL)
@@ -2154,10 +2152,11 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
if ((strchr(volume_info.UNC + 3, '\\') == NULL)
&& (strchr(volume_info.UNC + 3, '/') ==
NULL)) {
- rc = connect_to_dfs_path(xid, pSesInfo,
+/* rc = connect_to_dfs_path(xid, pSesInfo,
"", cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
+ CIFS_MOUNT_MAP_SPECIAL_CHR);*/
+ cFYI(1, ("DFS root not supported"));
rc = -ENODEV;
goto out;
} else {
@@ -2173,6 +2172,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb,
atomic_inc(&pSesInfo->inUse);
tcon->retry = volume_info.retry;
tcon->nocase = volume_info.nocase;
+ tcon->seal = volume_info.seal;
}
}
}
@@ -2314,9 +2314,10 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
user = ses->userName;
domain = ses->domainName;
smb_buffer = cifs_buf_get();
- if (smb_buffer == NULL) {
+
+ if (smb_buffer == NULL)
return -ENOMEM;
- }
+
smb_buffer_response = smb_buffer;
pSMBr = pSMB = (SESSION_SETUP_ANDX *) smb_buffer;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index e4e0078a0526..fb69c1fa85c9 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -49,18 +49,25 @@ build_path_from_dentry(struct dentry *direntry)
struct dentry *temp;
int namelen;
int pplen;
+ int dfsplen;
char *full_path;
char dirsep;
+ struct cifs_sb_info *cifs_sb;
if (direntry == NULL)
return NULL; /* not much we can do if dentry is freed and
we need to reopen the file after it was closed implicitly
when the server crashed */
- dirsep = CIFS_DIR_SEP(CIFS_SB(direntry->d_sb));
- pplen = CIFS_SB(direntry->d_sb)->prepathlen;
+ cifs_sb = CIFS_SB(direntry->d_sb);
+ dirsep = CIFS_DIR_SEP(cifs_sb);
+ pplen = cifs_sb->prepathlen;
+ if (cifs_sb->tcon && (cifs_sb->tcon->Flags & SMB_SHARE_IS_IN_DFS))
+ dfsplen = strnlen(cifs_sb->tcon->treeName, MAX_TREE_SIZE + 1);
+ else
+ dfsplen = 0;
cifs_bp_rename_retry:
- namelen = pplen;
+ namelen = pplen + dfsplen;
for (temp = direntry; !IS_ROOT(temp);) {
namelen += (1 + temp->d_name.len);
temp = temp->d_parent;
@@ -91,7 +98,7 @@ cifs_bp_rename_retry:
return NULL;
}
}
- if (namelen != pplen) {
+ if (namelen != pplen + dfsplen) {
cERROR(1,
("did not end path lookup where expected namelen is %d",
namelen));
@@ -107,7 +114,18 @@ cifs_bp_rename_retry:
since the '\' is a valid posix character so we can not switch
those safely to '/' if any are found in the middle of the prepath */
/* BB test paths to Windows with '/' in the midst of prepath */
- strncpy(full_path, CIFS_SB(direntry->d_sb)->prepath, pplen);
+
+ if (dfsplen) {
+ strncpy(full_path, cifs_sb->tcon->treeName, dfsplen);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
+ int i;
+ for (i = 0; i < dfsplen; i++) {
+ if (full_path[i] == '\\')
+ full_path[i] = '/';
+ }
+ }
+ }
+ strncpy(full_path + dfsplen, CIFS_SB(direntry->d_sb)->prepath, pplen);
return full_path;
}
@@ -242,7 +260,9 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
buf, inode->i_sb, xid,
&fileHandle);
if (newinode) {
- newinode->i_mode = mode;
+ if (cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_DYNPERM)
+ newinode->i_mode = mode;
if ((oplock & CIFS_CREATE_ACTION) &&
(cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_SET_UID)) {
@@ -590,7 +610,7 @@ static int cifs_ci_compare(struct dentry *dentry, struct qstr *a,
* case take precedence. If a is not a negative dentry, this
* should have no side effects
*/
- memcpy(a->name, b->name, a->len);
+ memcpy((void *)a->name, b->name, a->len);
return 0;
}
return 1;
diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c
index 939e256f8497..f730ef35499e 100644
--- a/fs/cifs/dns_resolve.c
+++ b/fs/cifs/dns_resolve.c
@@ -134,10 +134,6 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr)
rkey = request_key(&key_type_dns_resolver, name, "");
if (!IS_ERR(rkey)) {
data = rkey->payload.data;
- cFYI(1, ("%s: resolved: %s to %s", __func__,
- rkey->description,
- *ip_addr
- ));
} else {
cERROR(1, ("%s: unable to resolve: %s", __func__, name));
goto out;
@@ -150,6 +146,11 @@ skip_upcall:
if (*ip_addr) {
memcpy(*ip_addr, data, len);
(*ip_addr)[len] = '\0';
+ if (!IS_ERR(rkey))
+ cFYI(1, ("%s: resolved: %s to %s", __func__,
+ name,
+ *ip_addr
+ ));
rc = 0;
} else {
rc = -ENOMEM;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 31a0a33b9d95..0aac824371a5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -75,7 +75,11 @@ static inline int cifs_convert_flags(unsigned int flags)
return (GENERIC_READ | GENERIC_WRITE);
}
- return 0x20197;
+ return (READ_CONTROL | FILE_WRITE_ATTRIBUTES | FILE_READ_ATTRIBUTES |
+ FILE_WRITE_EA | FILE_APPEND_DATA | FILE_WRITE_DATA |
+ FILE_READ_DATA);
}
static inline int cifs_get_disposition(unsigned int flags)
@@ -542,7 +546,6 @@ int cifs_close(struct inode *inode, struct file *file)
msleep(timeout);
timeout *= 8;
}
- kfree(pSMBFile->search_resume_name);
kfree(file->private_data);
file->private_data = NULL;
} else
@@ -601,12 +604,6 @@ int cifs_closedir(struct inode *inode, struct file *file)
else
cifs_buf_release(ptmp);
}
- ptmp = pCFileStruct->search_resume_name;
- if (ptmp) {
- cFYI(1, ("closedir free resume name"));
- pCFileStruct->search_resume_name = NULL;
- kfree(ptmp);
- }
kfree(file->private_data);
file->private_data = NULL;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index fcbdbb6ad7bf..2e904bd111c8 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -161,118 +161,115 @@ static void cifs_unix_info_to_inode(struct inode *inode,
spin_unlock(&inode->i_lock);
}
-static const unsigned char *cifs_get_search_path(struct cifs_sb_info *cifs_sb,
- const char *search_path)
-{
- int tree_len;
- int path_len;
- int i;
- char *tmp_path;
- struct cifsTconInfo *pTcon = cifs_sb->tcon;
-
- if (!(pTcon->Flags & SMB_SHARE_IS_IN_DFS))
- return search_path;
-
- /* use full path name for working with DFS */
- tree_len = strnlen(pTcon->treeName, MAX_TREE_SIZE + 1);
- path_len = strnlen(search_path, MAX_PATHCONF);
- tmp_path = kmalloc(tree_len+path_len+1, GFP_KERNEL);
- if (tmp_path == NULL)
- return search_path;
+/*
+ * Needed to set up inode data for the directory which is the
+ * junction to the new submount (i.e. to set up the fake directory
+ * which represents a DFS referral)
+ */
+static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat,
+ struct super_block *sb)
+{
+ struct inode *pinode = NULL;
+
+ memset(pfnd_dat, 0, sizeof(FILE_UNIX_BASIC_INFO));
+
+/* __le64 pfnd_dat->EndOfFile = cpu_to_le64(0);
+ __le64 pfnd_dat->NumOfBytes = cpu_to_le64(0);
+ __u64 UniqueId = 0; */
+ pfnd_dat->LastStatusChange =
+ cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ pfnd_dat->LastAccessTime =
+ cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ pfnd_dat->LastModificationTime =
+ cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ pfnd_dat->Type = cpu_to_le32(UNIX_DIR);
+ pfnd_dat->Permissions = cpu_to_le64(S_IXUGO | S_IRWXU);
+ pfnd_dat->Nlinks = cpu_to_le64(2);
+ if (sb->s_root)
+ pinode = sb->s_root->d_inode;
+ if (pinode == NULL)
+ return;
- strncpy(tmp_path, pTcon->treeName, tree_len);
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)
- for (i = 0; i < tree_len; i++) {
- if (tmp_path[i] == '\\')
- tmp_path[i] = '/';
- }
- strncpy(tmp_path+tree_len, search_path, path_len);
- tmp_path[tree_len+path_len] = 0;
- return tmp_path;
+ /* fill in default values for the remaining fields based on the root
+ inode since we cannot query the server for this inode info */
+ pfnd_dat->DevMajor = cpu_to_le64(MAJOR(pinode->i_rdev));
+ pfnd_dat->DevMinor = cpu_to_le64(MINOR(pinode->i_rdev));
+ pfnd_dat->Uid = cpu_to_le64(pinode->i_uid);
+ pfnd_dat->Gid = cpu_to_le64(pinode->i_gid);
}
int cifs_get_inode_info_unix(struct inode **pinode,
- const unsigned char *search_path, struct super_block *sb, int xid)
+ const unsigned char *full_path, struct super_block *sb, int xid)
{
int rc = 0;
- FILE_UNIX_BASIC_INFO findData;
+ FILE_UNIX_BASIC_INFO find_data;
struct cifsTconInfo *pTcon;
struct inode *inode;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- const unsigned char *full_path;
bool is_dfs_referral = false;
+ struct cifsInodeInfo *cifsInfo;
+ __u64 num_of_bytes;
+ __u64 end_of_file;
pTcon = cifs_sb->tcon;
- cFYI(1, ("Getting info on %s", search_path));
-
- full_path = cifs_get_search_path(cifs_sb, search_path);
+ cFYI(1, ("Getting info on %s", full_path));
-try_again_CIFSSMBUnixQPathInfo:
/* could have done a find first instead but this returns more info */
- rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &findData,
+ rc = CIFSSMBUnixQPathInfo(xid, pTcon, full_path, &find_data,
cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
-/* dump_mem("\nUnixQPathInfo return data", &findData,
- sizeof(findData)); */
- if (rc) {
- if (rc == -EREMOTE && !is_dfs_referral) {
- is_dfs_referral = true;
- if (full_path != search_path) {
- kfree(full_path);
- full_path = search_path;
- }
- goto try_again_CIFSSMBUnixQPathInfo;
- }
+ if (rc == -EREMOTE && !is_dfs_referral) {
+ is_dfs_referral = true;
+ cFYI(DBG2, ("DFS ref"));
+ /* for DFS, server does not give us real inode data */
+ fill_fake_finddataunix(&find_data, sb);
+ rc = 0;
+ } else if (rc)
goto cgiiu_exit;
- } else {
- struct cifsInodeInfo *cifsInfo;
- __u64 num_of_bytes = le64_to_cpu(findData.NumOfBytes);
- __u64 end_of_file = le64_to_cpu(findData.EndOfFile);
- /* get new inode */
- if (*pinode == NULL) {
- *pinode = new_inode(sb);
- if (*pinode == NULL) {
- rc = -ENOMEM;
- goto cgiiu_exit;
- }
- /* Is an i_ino of zero legal? */
- /* Are there sanity checks we can use to ensure that
- the server is really filling in that field? */
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
- (*pinode)->i_ino =
- (unsigned long)findData.UniqueId;
- } /* note ino incremented to unique num in new_inode */
- if (sb->s_flags & MS_NOATIME)
- (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
+ num_of_bytes = le64_to_cpu(find_data.NumOfBytes);
+ end_of_file = le64_to_cpu(find_data.EndOfFile);
- insert_inode_hash(*pinode);
+ /* get new inode */
+ if (*pinode == NULL) {
+ *pinode = new_inode(sb);
+ if (*pinode == NULL) {
+ rc = -ENOMEM;
+ goto cgiiu_exit;
}
+ /* Is an i_ino of zero legal? */
+ /* note ino incremented to unique num in new_inode */
+ /* Are there sanity checks we can use to ensure that
+ the server is really filling in that field? */
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)
+ (*pinode)->i_ino = (unsigned long)find_data.UniqueId;
- inode = *pinode;
- cifsInfo = CIFS_I(inode);
+ if (sb->s_flags & MS_NOATIME)
+ (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
- cFYI(1, ("Old time %ld", cifsInfo->time));
- cifsInfo->time = jiffies;
- cFYI(1, ("New time %ld", cifsInfo->time));
- /* this is ok to set on every inode revalidate */
- atomic_set(&cifsInfo->inUse, 1);
+ insert_inode_hash(*pinode);
+ }
- cifs_unix_info_to_inode(inode, &findData, 0);
+ inode = *pinode;
+ cifsInfo = CIFS_I(inode);
+ cFYI(1, ("Old time %ld", cifsInfo->time));
+ cifsInfo->time = jiffies;
+ cFYI(1, ("New time %ld", cifsInfo->time));
+ /* this is ok to set on every inode revalidate */
+ atomic_set(&cifsInfo->inUse, 1);
- if (num_of_bytes < end_of_file)
- cFYI(1, ("allocation size less than end of file"));
- cFYI(1, ("Size %ld and blocks %llu",
- (unsigned long) inode->i_size,
- (unsigned long long)inode->i_blocks));
+ cifs_unix_info_to_inode(inode, &find_data, 0);
- cifs_set_ops(inode, is_dfs_referral);
- }
+ if (num_of_bytes < end_of_file)
+ cFYI(1, ("allocation size less than end of file"));
+ cFYI(1, ("Size %ld and blocks %llu",
+ (unsigned long) inode->i_size,
+ (unsigned long long)inode->i_blocks));
+
+ cifs_set_ops(inode, is_dfs_referral);
cgiiu_exit:
- if (full_path != search_path)
- kfree(full_path);
return rc;
}
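An aside on the -EREMOTE handling above: instead of rebuilding the path and retrying the query, the client now treats the referral point as a local, synthetic directory. A rough userspace sketch of that fallback shape, with hypothetical names (query_path_info and fake_dir_info are illustrative, not kernel API):

#include <errno.h>
#include <string.h>

struct fake_dir_info { long mode; long nlinks; };

/* On EREMOTE (a DFS junction) the server has no real inode data,
 * so fabricate plausible directory metadata locally. */
static int query_path_info(const char *path, struct fake_dir_info *out,
                int (*server_query)(const char *, struct fake_dir_info *))
{
        int rc = server_query(path, out);

        if (rc == -EREMOTE) {
                memset(out, 0, sizeof(*out));
                out->mode = 0700 | 0111;  /* S_IRWXU | S_IXUGO, as the patch uses */
                out->nlinks = 2;          /* "." and ".." */
                rc = 0;                   /* success; submount is set up at traversal */
        }
        return rc;
}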
@@ -379,21 +376,52 @@ static int get_sfu_mode(struct inode *inode,
#endif
}
+/*
+ * Needed to set up inode data for the directory which is the
+ * junction to the new submount (i.e. to set up the fake directory
+ * which represents a DFS referral)
+ */
+static void fill_fake_finddata(FILE_ALL_INFO *pfnd_dat,
+ struct super_block *sb)
+{
+ memset(pfnd_dat, 0, sizeof(FILE_ALL_INFO));
+
+/* __le64 pfnd_dat->AllocationSize = cpu_to_le64(0);
+ __le64 pfnd_dat->EndOfFile = cpu_to_le64(0);
+ __u8 pfnd_dat->DeletePending = 0;
+ __u8 pfnd_dat->Directory = 0;
+ __le32 pfnd_dat->EASize = 0;
+ __u64 pfnd_dat->IndexNumber = 0;
+ __u64 pfnd_dat->IndexNumber1 = 0; */
+ pfnd_dat->CreationTime =
+ cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ pfnd_dat->LastAccessTime =
+ cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ pfnd_dat->LastWriteTime =
+ cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ pfnd_dat->ChangeTime =
+ cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ pfnd_dat->Attributes = cpu_to_le32(ATTR_DIRECTORY);
+ pfnd_dat->NumberOfLinks = cpu_to_le32(2);
+}
+
int cifs_get_inode_info(struct inode **pinode,
- const unsigned char *search_path, FILE_ALL_INFO *pfindData,
+ const unsigned char *full_path, FILE_ALL_INFO *pfindData,
struct super_block *sb, int xid, const __u16 *pfid)
{
int rc = 0;
+ __u32 attr;
+ struct cifsInodeInfo *cifsInfo;
struct cifsTconInfo *pTcon;
struct inode *inode;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
- const unsigned char *full_path = NULL;
char *buf = NULL;
bool adjustTZ = false;
bool is_dfs_referral = false;
+ umode_t default_mode;
pTcon = cifs_sb->tcon;
- cFYI(1, ("Getting info on %s", search_path));
+ cFYI(1, ("Getting info on %s", full_path));
if ((pfindData == NULL) && (*pinode != NULL)) {
if (CIFS_I(*pinode)->clientCanCacheRead) {
@@ -409,9 +437,6 @@ int cifs_get_inode_info(struct inode **pinode,
return -ENOMEM;
pfindData = (FILE_ALL_INFO *)buf;
- full_path = cifs_get_search_path(cifs_sb, search_path);
-
-try_again_CIFSSMBQPathInfo:
/* could do find first instead but this returns more info */
rc = CIFSSMBQPathInfo(xid, pTcon, full_path, pfindData,
0 /* not legacy */,
@@ -429,178 +454,163 @@ try_again_CIFSSMBQPathInfo:
}
}
/* dump_mem("\nQPathInfo return data",&findData, sizeof(findData)); */
- if (rc) {
- if (rc == -EREMOTE && !is_dfs_referral) {
- is_dfs_referral = true;
- if (full_path != search_path) {
- kfree(full_path);
- full_path = search_path;
- }
- goto try_again_CIFSSMBQPathInfo;
- }
+ if (rc == -EREMOTE) {
+ is_dfs_referral = true;
+ fill_fake_finddata(pfindData, sb);
+ rc = 0;
+ } else if (rc)
goto cgii_exit;
- } else {
- struct cifsInodeInfo *cifsInfo;
- __u32 attr = le32_to_cpu(pfindData->Attributes);
-
- /* get new inode */
- if (*pinode == NULL) {
- *pinode = new_inode(sb);
- if (*pinode == NULL) {
- rc = -ENOMEM;
- goto cgii_exit;
- }
- /* Is an i_ino of zero legal? Can we use that to check
- if the server supports returning inode numbers? Are
- there other sanity checks we can use to ensure that
- the server is really filling in that field? */
-
- /* We can not use the IndexNumber field by default from
- Windows or Samba (in ALL_INFO buf) but we can request
- it explicitly. It may not be unique presumably if
- the server has multiple devices mounted under one
- share */
-
- /* There may be higher info levels that work but are
- there Windows server or network appliances for which
- IndexNumber field is not guaranteed unique? */
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
- int rc1 = 0;
- __u64 inode_num;
+ attr = le32_to_cpu(pfindData->Attributes);
- rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
- search_path, &inode_num,
+ /* get new inode */
+ if (*pinode == NULL) {
+ *pinode = new_inode(sb);
+ if (*pinode == NULL) {
+ rc = -ENOMEM;
+ goto cgii_exit;
+ }
+ /* Is an i_ino of zero legal? Can we use that to check
+ if the server supports returning inode numbers? Are
+ there other sanity checks we can use to ensure that
+ the server is really filling in that field? */
+
+ /* We can not use the IndexNumber field by default from
+ Windows or Samba (in ALL_INFO buf) but we can request
+ it explicitly. It may not be unique presumably if
+ the server has multiple devices mounted under one share */
+
+ /* There may be higher info levels that work but are
+ there Windows server or network appliances for which
+ IndexNumber field is not guaranteed unique? */
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
+ int rc1 = 0;
+ __u64 inode_num;
+
+ rc1 = CIFSGetSrvInodeNumber(xid, pTcon,
+ full_path, &inode_num,
cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_MAP_SPECIAL_CHR);
- if (rc1) {
- cFYI(1, ("GetSrvInodeNum rc %d", rc1));
- /* BB EOPNOSUPP disable SERVER_INUM? */
- } else /* do we need cast or hash to ino? */
- (*pinode)->i_ino = inode_num;
- } /* else ino incremented to unique num in new_inode*/
- if (sb->s_flags & MS_NOATIME)
- (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
- insert_inode_hash(*pinode);
- }
- inode = *pinode;
- cifsInfo = CIFS_I(inode);
- cifsInfo->cifsAttrs = attr;
- cFYI(1, ("Old time %ld", cifsInfo->time));
- cifsInfo->time = jiffies;
- cFYI(1, ("New time %ld", cifsInfo->time));
-
- /* blksize needs to be multiple of two. So safer to default to
- blksize and blkbits set in superblock so 2**blkbits and blksize
- will match rather than setting to:
- (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
-
- /* Linux can not store file creation time so ignore it */
- if (pfindData->LastAccessTime)
- inode->i_atime = cifs_NTtimeToUnix
- (le64_to_cpu(pfindData->LastAccessTime));
- else /* do not need to use current_fs_time - time not stored */
- inode->i_atime = CURRENT_TIME;
- inode->i_mtime =
+ if (rc1) {
+ cFYI(1, ("GetSrvInodeNum rc %d", rc1));
+ /* BB EOPNOSUPP disable SERVER_INUM? */
+ } else /* do we need cast or hash to ino? */
+ (*pinode)->i_ino = inode_num;
+ } /* else ino incremented to unique num in new_inode*/
+ if (sb->s_flags & MS_NOATIME)
+ (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME;
+ insert_inode_hash(*pinode);
+ }
+ inode = *pinode;
+ cifsInfo = CIFS_I(inode);
+ cifsInfo->cifsAttrs = attr;
+ cFYI(1, ("Old time %ld", cifsInfo->time));
+ cifsInfo->time = jiffies;
+ cFYI(1, ("New time %ld", cifsInfo->time));
+
+ /* blksize needs to be multiple of two. So safer to default to
+ blksize and blkbits set in superblock so 2**blkbits and blksize
+ will match rather than setting to:
+ (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/
+
+ /* Linux can not store file creation time so ignore it */
+ if (pfindData->LastAccessTime)
+ inode->i_atime = cifs_NTtimeToUnix
+ (le64_to_cpu(pfindData->LastAccessTime));
+ else /* do not need to use current_fs_time - time not stored */
+ inode->i_atime = CURRENT_TIME;
+ inode->i_mtime =
cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
- inode->i_ctime =
- cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
- cFYI(0, ("Attributes came in as 0x%x", attr));
- if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
- inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
- inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj;
- }
+ inode->i_ctime =
+ cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
+ cFYI(DBG2, ("Attributes came in as 0x%x", attr));
+ if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
+ inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
+ inode->i_mtime.tv_sec += pTcon->ses->server->timeAdj;
+ }
- /* set default mode. will override for dirs below */
- if (atomic_read(&cifsInfo->inUse) == 0)
- /* new inode, can safely set these fields */
- inode->i_mode = cifs_sb->mnt_file_mode;
- else /* since we set the inode type below we need to mask off
- to avoid strange results if type changes and both
- get orred in */
+ /* get default inode mode */
+ if (attr & ATTR_DIRECTORY)
+ default_mode = cifs_sb->mnt_dir_mode;
+ else
+ default_mode = cifs_sb->mnt_file_mode;
+
+ /* set permission bits */
+ if (atomic_read(&cifsInfo->inUse) == 0 ||
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
+ inode->i_mode = default_mode;
+ else {
+ /* just reenable write bits if !ATTR_READONLY */
+ if ((inode->i_mode & S_IWUGO) == 0 &&
+ (attr & ATTR_READONLY) == 0)
+ inode->i_mode |= (S_IWUGO & default_mode);
inode->i_mode &= ~S_IFMT;
-/* if (attr & ATTR_REPARSE) */
- /* We no longer handle these as symlinks because we could not
- follow them due to the absolute path with drive letter */
- if (attr & ATTR_DIRECTORY) {
- /* override default perms since we do not do byte range locking
- on dirs */
- inode->i_mode = cifs_sb->mnt_dir_mode;
- inode->i_mode |= S_IFDIR;
- } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) &&
- (cifsInfo->cifsAttrs & ATTR_SYSTEM) &&
- /* No need to le64 convert size of zero */
- (pfindData->EndOfFile == 0)) {
- inode->i_mode = cifs_sb->mnt_file_mode;
+ }
+ /* clear write bits if ATTR_READONLY is set */
+ if (attr & ATTR_READONLY)
+ inode->i_mode &= ~S_IWUGO;
+
+ /* set inode type */
+ if ((attr & ATTR_SYSTEM) &&
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) {
+ /* no need to fix endianness on 0 */
+ if (pfindData->EndOfFile == 0)
inode->i_mode |= S_IFIFO;
-/* BB Finish for SFU style symlinks and devices */
- } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) &&
- (cifsInfo->cifsAttrs & ATTR_SYSTEM)) {
- if (decode_sfu_inode(inode,
- le64_to_cpu(pfindData->EndOfFile),
- search_path,
- cifs_sb, xid))
- cFYI(1, ("Unrecognized sfu inode type"));
-
- cFYI(1, ("sfu mode 0%o", inode->i_mode));
- } else {
+ else if (decode_sfu_inode(inode,
+ le64_to_cpu(pfindData->EndOfFile),
+ full_path, cifs_sb, xid))
+ cFYI(1, ("unknown SFU file type\n"));
+ } else {
+ if (attr & ATTR_DIRECTORY)
+ inode->i_mode |= S_IFDIR;
+ else
inode->i_mode |= S_IFREG;
- /* treat the dos attribute of read-only as read-only
- mode e.g. 555 */
- if (cifsInfo->cifsAttrs & ATTR_READONLY)
- inode->i_mode &= ~(S_IWUGO);
- else if ((inode->i_mode & S_IWUGO) == 0)
- /* the ATTR_READONLY flag may have been */
- /* changed on server -- set any w bits */
- /* allowed by mnt_file_mode */
- inode->i_mode |= (S_IWUGO &
- cifs_sb->mnt_file_mode);
- /* BB add code here -
- validate if device or weird share or device type? */
- }
+ }
- spin_lock(&inode->i_lock);
- if (is_size_safe_to_change(cifsInfo,
- le64_to_cpu(pfindData->EndOfFile))) {
- /* can not safely shrink the file size here if the
- client is writing to it due to potential races */
- i_size_write(inode, le64_to_cpu(pfindData->EndOfFile));
-
- /* 512 bytes (2**9) is the fake blocksize that must be
- used for this calculation */
- inode->i_blocks = (512 - 1 + le64_to_cpu(
- pfindData->AllocationSize)) >> 9;
- }
- spin_unlock(&inode->i_lock);
+ spin_lock(&inode->i_lock);
+ if (is_size_safe_to_change(cifsInfo,
+ le64_to_cpu(pfindData->EndOfFile))) {
+ /* can not safely shrink the file size here if the
+ client is writing to it due to potential races */
+ i_size_write(inode, le64_to_cpu(pfindData->EndOfFile));
+
+ /* 512 bytes (2**9) is the fake blocksize that must be
+ used for this calculation */
+ inode->i_blocks = (512 - 1 + le64_to_cpu(
+ pfindData->AllocationSize)) >> 9;
+ }
+ spin_unlock(&inode->i_lock);
- inode->i_nlink = le32_to_cpu(pfindData->NumberOfLinks);
+ inode->i_nlink = le32_to_cpu(pfindData->NumberOfLinks);
- /* BB fill in uid and gid here? with help from winbind?
- or retrieve from NTFS stream extended attribute */
+ /* BB fill in uid and gid here? with help from winbind?
+ or retrieve from NTFS stream extended attribute */
#ifdef CONFIG_CIFS_EXPERIMENTAL
- /* fill in 0777 bits from ACL */
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
- cFYI(1, ("Getting mode bits from ACL"));
- acl_to_uid_mode(inode, search_path, pfid);
- }
+ /* fill in 0777 bits from ACL */
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
+ cFYI(1, ("Getting mode bits from ACL"));
+ acl_to_uid_mode(inode, full_path, pfid);
+ }
#endif
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
- /* fill in remaining high mode bits e.g. SUID, VTX */
- get_sfu_mode(inode, search_path, cifs_sb, xid);
- } else if (atomic_read(&cifsInfo->inUse) == 0) {
- inode->i_uid = cifs_sb->mnt_uid;
- inode->i_gid = cifs_sb->mnt_gid;
- /* set so we do not keep refreshing these fields with
- bad data after user has changed them in memory */
- atomic_set(&cifsInfo->inUse, 1);
- }
-
- cifs_set_ops(inode, is_dfs_referral);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+ /* fill in remaining high mode bits e.g. SUID, VTX */
+ get_sfu_mode(inode, full_path, cifs_sb, xid);
+ } else if (atomic_read(&cifsInfo->inUse) == 0) {
+ inode->i_uid = cifs_sb->mnt_uid;
+ inode->i_gid = cifs_sb->mnt_gid;
+ /* set so we do not keep refreshing these fields with
+ bad data after user has changed them in memory */
+ atomic_set(&cifsInfo->inUse, 1);
}
+
+ cifs_set_ops(inode, is_dfs_referral);
+
cgii_exit:
- if (full_path != search_path)
- kfree(full_path);
kfree(buf);
return rc;
}
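The permission logic above reduces to a small pure function: without CIFS_MOUNT_DYNPERM (or on a fresh inode) the mount-wide default mode is taken wholesale; with DYNPERM the cached bits are kept, but the write bits track the server's ATTR_READONLY; and ATTR_READONLY always wins in the end. A compilable sketch of that decision, with stand-in flag values (ATTR_RO, MNT_DYNPERM, MY_S_IWUGO are illustrative):

#include <sys/stat.h>

#define ATTR_RO     0x01  /* stand-in for the SMB ATTR_READONLY bit */
#define MNT_DYNPERM 0x01  /* stand-in for CIFS_MOUNT_DYNPERM */
#define MY_S_IWUGO  (S_IWUSR | S_IWGRP | S_IWOTH)  /* kernel's S_IWUGO */

static mode_t pick_mode(mode_t cur, mode_t def, unsigned attr,
                        unsigned mnt_flags, int first_use)
{
        mode_t mode;

        if (first_use || !(mnt_flags & MNT_DYNPERM)) {
                mode = def;                         /* take mount default wholesale */
        } else {
                mode = cur;
                if (!(mode & MY_S_IWUGO) && !(attr & ATTR_RO))
                        mode |= (MY_S_IWUGO & def); /* server cleared read-only */
                mode &= ~S_IFMT;                    /* type bits are set separately */
        }
        if (attr & ATTR_RO)
                mode &= ~MY_S_IWUGO;                /* read-only on server wins */
        return mode;
}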
@@ -1005,8 +1015,11 @@ mkdir_get_info:
CIFS_MOUNT_MAP_SPECIAL_CHR);
}
if (direntry->d_inode) {
- direntry->d_inode->i_mode = mode;
- direntry->d_inode->i_mode |= S_IFDIR;
+ if (cifs_sb->mnt_cifs_flags &
+ CIFS_MOUNT_DYNPERM)
+ direntry->d_inode->i_mode =
+ (mode | S_IFDIR);
+
if (cifs_sb->mnt_cifs_flags &
CIFS_MOUNT_SET_UID) {
direntry->d_inode->i_uid =
@@ -1502,8 +1515,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
int oplock = 0;
rc = SMBLegacyOpen(xid, pTcon, full_path,
- FILE_OPEN,
- SYNCHRONIZE | FILE_WRITE_ATTRIBUTES,
+ FILE_OPEN, GENERIC_WRITE,
CREATE_NOT_DIR, &netfid, &oplock,
NULL, cifs_sb->local_nls,
cifs_sb->mnt_cifs_flags &
@@ -1534,13 +1546,26 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
} else
goto cifs_setattr_exit;
}
- if (attrs->ia_valid & ATTR_UID) {
- cFYI(1, ("UID changed to %d", attrs->ia_uid));
- uid = attrs->ia_uid;
- }
- if (attrs->ia_valid & ATTR_GID) {
- cFYI(1, ("GID changed to %d", attrs->ia_gid));
- gid = attrs->ia_gid;
+
+ /*
+ * Without unix extensions we can't send ownership changes to the
+ * server, so silently ignore them. This is consistent with how
+ * local DOS/Windows filesystems behave (VFAT, NTFS, etc). With
+ * CIFSACL support + proper Windows to Unix idmapping, we may be
+ * able to support this in the future.
+ */
+ if (!pTcon->unix_ext &&
+ !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) {
+ attrs->ia_valid &= ~(ATTR_UID | ATTR_GID);
+ } else {
+ if (attrs->ia_valid & ATTR_UID) {
+ cFYI(1, ("UID changed to %d", attrs->ia_uid));
+ uid = attrs->ia_uid;
+ }
+ if (attrs->ia_valid & ATTR_GID) {
+ cFYI(1, ("GID changed to %d", attrs->ia_gid));
+ gid = attrs->ia_gid;
+ }
}
time_buf.Attributes = 0;
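A note on the ownership hunk above: rather than returning an error, unsupported chown requests are stripped from ia_valid before the attributes are pushed to the server. The filtering step on its own, sketched with stand-in flag values (MY_ATTR_UID/MY_ATTR_GID are illustrative):

#define MY_ATTR_UID 0x2  /* stand-in for ATTR_UID */
#define MY_ATTR_GID 0x4  /* stand-in for ATTR_GID */

/* Without unix extensions (and without a setuid-style mount option)
 * the server cannot store ownership, so drop the request silently,
 * as VFAT/NTFS do locally. */
static unsigned filter_ia_valid(unsigned ia_valid, int unix_ext, int set_uid_mount)
{
        if (!unix_ext && !set_uid_mount)
                ia_valid &= ~(MY_ATTR_UID | MY_ATTR_GID);
        return ia_valid;
}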
@@ -1550,7 +1575,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
attrs->ia_valid &= ~ATTR_MODE;
if (attrs->ia_valid & ATTR_MODE) {
- cFYI(1, ("Mode changed to 0x%x", attrs->ia_mode));
+ cFYI(1, ("Mode changed to 0%o", attrs->ia_mode));
mode = attrs->ia_mode;
}
@@ -1565,18 +1590,18 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
#ifdef CONFIG_CIFS_EXPERIMENTAL
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
rc = mode_to_acl(inode, full_path, mode);
- else if ((mode & S_IWUGO) == 0) {
-#else
- if ((mode & S_IWUGO) == 0) {
+ else
#endif
- /* not writeable */
- if ((cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
- set_dosattr = true;
- time_buf.Attributes =
- cpu_to_le32(cifsInode->cifsAttrs |
- ATTR_READONLY);
- }
- } else if (cifsInode->cifsAttrs & ATTR_READONLY) {
+ if (((mode & S_IWUGO) == 0) &&
+ (cifsInode->cifsAttrs & ATTR_READONLY) == 0) {
+ set_dosattr = true;
+ time_buf.Attributes = cpu_to_le32(cifsInode->cifsAttrs |
+ ATTR_READONLY);
+ /* fix up mode if we're not using dynperm */
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
+ attrs->ia_mode = inode->i_mode & ~S_IWUGO;
+ } else if ((mode & S_IWUGO) &&
+ (cifsInode->cifsAttrs & ATTR_READONLY)) {
/* If file is readonly on server, we would
not be able to write to it - so if any write
bit is enabled for user or group or other we
@@ -1587,6 +1612,20 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs)
/* Windows ignores set to zero */
if (time_buf.Attributes == 0)
time_buf.Attributes |= cpu_to_le32(ATTR_NORMAL);
+
+ /* reset local inode permissions to normal */
+ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) {
+ attrs->ia_mode &= ~(S_IALLUGO);
+ if (S_ISDIR(inode->i_mode))
+ attrs->ia_mode |=
+ cifs_sb->mnt_dir_mode;
+ else
+ attrs->ia_mode |=
+ cifs_sb->mnt_file_mode;
+ }
+ } else if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM)) {
+ /* ignore mode change - ATTR_READONLY hasn't changed */
+ attrs->ia_valid &= ~ATTR_MODE;
}
}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 5c792df13d62..0088a5b52564 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -30,9 +30,9 @@
#define CIFS_IOC_CHECKUMOUNT _IO(0xCF, 2)
-int cifs_ioctl(struct inode *inode, struct file *filep,
- unsigned int command, unsigned long arg)
+long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
{
+ struct inode *inode = filep->f_dentry->d_inode;
int rc = -ENOTTY; /* strange error - but the precedent */
int xid;
struct cifs_sb_info *cifs_sb;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 1c2c3ce5020b..63f644000ce5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -234,7 +234,6 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
struct cifs_sb_info *cifs_sb;
struct cifsTconInfo *pTcon;
char *full_path = NULL;
- char *tmp_path = NULL;
char *tmpbuffer;
int len;
__u16 fid;
@@ -295,45 +294,9 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
cFYI(1, ("Error closing junction point "
"(open for ioctl)"));
}
- /* BB unwind this long, nested function, or remove BB */
- if (rc == -EIO) {
- /* Query if DFS Junction */
- unsigned int num_referrals = 0;
- struct dfs_info3_param *refs = NULL;
- tmp_path =
- kmalloc(MAX_TREE_SIZE + MAX_PATHCONF + 1,
- GFP_KERNEL);
- if (tmp_path) {
- strncpy(tmp_path, pTcon->treeName,
- MAX_TREE_SIZE);
- strncat(tmp_path, full_path,
- MAX_PATHCONF);
- rc = get_dfs_path(xid, pTcon->ses,
- tmp_path,
- cifs_sb->local_nls,
- &num_referrals, &refs,
- cifs_sb->mnt_cifs_flags &
- CIFS_MOUNT_MAP_SPECIAL_CHR);
- cFYI(1, ("Get DFS for %s rc = %d ",
- tmp_path, rc));
- if ((num_referrals == 0) && (rc == 0))
- rc = -EACCES;
- else {
- cFYI(1, ("num referral: %d",
- num_referrals));
- if (refs && refs->path_name) {
- strncpy(tmpbuffer,
- refs->path_name,
- len-1);
- }
- }
- kfree(refs);
- kfree(tmp_path);
-}
- /* BB add code like else decode referrals
- then memcpy to tmpbuffer and free referrals
- string array BB */
- }
+ /* If it is a DFS junction, we would have gotten
+ PATH_NOT_COVERED returned from the server earlier,
+ so we do not need to request the DFS info here */
}
}
/* BB Anything else to do to handle recursive links? */
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1d69b8014e0b..4b17f8fe3157 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -519,8 +519,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
pnotify = (struct file_notify_information *)
((char *)&pSMBr->hdr.Protocol + data_offset);
cFYI(1, ("dnotify on %s Action: 0x%x",
- pnotify->FileName,
- pnotify->Action)); /* BB removeme BB */
+ pnotify->FileName, pnotify->Action));
/* cifs_dump_mem("Rcvd notify Data: ",buf,
sizeof(struct smb_hdr)+60); */
return true;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 00f4cff400b3..8703d68f5b20 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -141,11 +141,11 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst)
int ret = 0;
/* calculate length by finding first slash or NULL */
- if (address_family == AF_INET) {
+ if (address_family == AF_INET)
ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL);
- } else if (address_family == AF_INET6) {
+ else if (address_family == AF_INET6)
ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL);
- }
+
cFYI(DBG2, ("address conversion returned %d for %s", ret, cp));
if (ret > 0)
ret = 1;
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 7170a9b70f1e..c377d8065d99 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -64,7 +64,7 @@ typedef struct _SECURITY_BUFFER {
} __attribute__((packed)) SECURITY_BUFFER;
typedef struct _NEGOTIATE_MESSAGE {
- __u8 Signature[sizeof (NTLMSSP_SIGNATURE)];
+ __u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
__le32 MessageType; /* 1 */
__le32 NegotiateFlags;
SECURITY_BUFFER DomainName; /* RFC 1001 style and ASCII */
@@ -74,7 +74,7 @@ typedef struct _NEGOTIATE_MESSAGE {
} __attribute__((packed)) NEGOTIATE_MESSAGE, *PNEGOTIATE_MESSAGE;
typedef struct _CHALLENGE_MESSAGE {
- __u8 Signature[sizeof (NTLMSSP_SIGNATURE)];
+ __u8 Signature[sizeof(NTLMSSP_SIGNATURE)];
__le32 MessageType; /* 2 */
SECURITY_BUFFER TargetName;
__le32 NegotiateFlags;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 34ec32100c72..83f306954883 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -132,6 +132,7 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
__u32 attr;
__u64 allocation_size;
__u64 end_of_file;
+ umode_t default_mode;
/* save mtime and size */
local_mtime = tmp_inode->i_mtime;
@@ -187,48 +188,54 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
if (atomic_read(&cifsInfo->inUse) == 0) {
tmp_inode->i_uid = cifs_sb->mnt_uid;
tmp_inode->i_gid = cifs_sb->mnt_gid;
- /* set default mode. will override for dirs below */
- tmp_inode->i_mode = cifs_sb->mnt_file_mode;
- } else {
- /* mask off the type bits since it gets set
- below and we do not want to get two type
- bits set */
+ }
+
+ if (attr & ATTR_DIRECTORY)
+ default_mode = cifs_sb->mnt_dir_mode;
+ else
+ default_mode = cifs_sb->mnt_file_mode;
+
+ /* set initial permissions */
+ if ((atomic_read(&cifsInfo->inUse) == 0) ||
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) == 0)
+ tmp_inode->i_mode = default_mode;
+ else {
+ /* just reenable write bits if !ATTR_READONLY */
+ if ((tmp_inode->i_mode & S_IWUGO) == 0 &&
+ (attr & ATTR_READONLY) == 0)
+ tmp_inode->i_mode |= (S_IWUGO & default_mode);
+
tmp_inode->i_mode &= ~S_IFMT;
}
- if (attr & ATTR_DIRECTORY) {
- *pobject_type = DT_DIR;
- /* override default perms since we do not lock dirs */
- if (atomic_read(&cifsInfo->inUse) == 0)
- tmp_inode->i_mode = cifs_sb->mnt_dir_mode;
- tmp_inode->i_mode |= S_IFDIR;
- } else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) &&
- (attr & ATTR_SYSTEM)) {
+ /* clear write bits if ATTR_READONLY is set */
+ if (attr & ATTR_READONLY)
+ tmp_inode->i_mode &= ~S_IWUGO;
+
+ /* set inode type */
+ if ((attr & ATTR_SYSTEM) &&
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) {
if (end_of_file == 0) {
- *pobject_type = DT_FIFO;
tmp_inode->i_mode |= S_IFIFO;
+ *pobject_type = DT_FIFO;
} else {
- /* rather than get the type here, we mark the
- inode as needing revalidate and get the real type
- (blk vs chr vs. symlink) later ie in lookup */
- *pobject_type = DT_REG;
+ /*
+ * trying to get the type can be slow, so just call
+ * this a regular file for now, and mark for reval
+ */
tmp_inode->i_mode |= S_IFREG;
+ *pobject_type = DT_REG;
cifsInfo->time = 0;
}
-/* we no longer mark these because we could not follow them */
-/* } else if (attr & ATTR_REPARSE) {
- *pobject_type = DT_LNK;
- tmp_inode->i_mode |= S_IFLNK; */
} else {
- *pobject_type = DT_REG;
- tmp_inode->i_mode |= S_IFREG;
- if (attr & ATTR_READONLY)
- tmp_inode->i_mode &= ~(S_IWUGO);
- else if ((tmp_inode->i_mode & S_IWUGO) == 0)
- /* the ATTR_READONLY flag may have been changed on */
- /* server -- set any w bits allowed by mnt_file_mode */
- tmp_inode->i_mode |= (S_IWUGO & cifs_sb->mnt_file_mode);
- } /* could add code here - to validate if device or weird share type? */
+ if (attr & ATTR_DIRECTORY) {
+ tmp_inode->i_mode |= S_IFDIR;
+ *pobject_type = DT_DIR;
+ } else {
+ tmp_inode->i_mode |= S_IFREG;
+ *pobject_type = DT_REG;
+ }
+ }
/* can not fill in nlink here as in qpathinfo version and Unix search */
if (atomic_read(&cifsInfo->inUse) == 0)
@@ -670,10 +677,11 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon,
(index_to_find < first_entry_in_buffer)) {
/* close and restart search */
cFYI(1, ("search backing up - close and restart search"));
- cifsFile->invalidHandle = true;
- CIFSFindClose(xid, pTcon, cifsFile->netfid);
- kfree(cifsFile->search_resume_name);
- cifsFile->search_resume_name = NULL;
+ if (!cifsFile->srch_inf.endOfSearch &&
+ !cifsFile->invalidHandle) {
+ cifsFile->invalidHandle = true;
+ CIFSFindClose(xid, pTcon, cifsFile->netfid);
+ }
if (cifsFile->srch_inf.ntwrk_buf_start) {
cFYI(1, ("freeing SMB ff cache buf on search rewind"));
if (cifsFile->srch_inf.smallBuf)
@@ -1040,9 +1048,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
} /* else {
cifsFile->invalidHandle = true;
CIFSFindClose(xid, pTcon, cifsFile->netfid);
- }
- kfree(cifsFile->search_resume_name);
- cifsFile->search_resume_name = NULL; */
+ } */
rc = find_cifs_entry(xid, pTcon, file,
&current_entry, &num_to_fill);
diff --git a/fs/compat.c b/fs/compat.c
index 332a869d2c53..ed43e17a5dc6 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1405,7 +1405,7 @@ int compat_do_execve(char * filename,
/* execve success */
security_bprm_free(bprm);
acct_update_integrals(current);
- kfree(bprm);
+ free_bprm(bprm);
return retval;
}
@@ -1424,7 +1424,7 @@ out_file:
}
out_kfree:
- kfree(bprm);
+ free_bprm(bprm);
out_ret:
return retval;
diff --git a/fs/dcache.c b/fs/dcache.c
index 3ee588d5f585..6068c25b393c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -17,6 +17,7 @@
#include <linux/syscalls.h>
#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
@@ -106,9 +107,10 @@ static void dentry_lru_remove(struct dentry *dentry)
/*
* Release the dentry's inode, using the filesystem
* d_iput() operation if defined.
- * Called with dcache_lock and per dentry lock held, drops both.
*/
static void dentry_iput(struct dentry * dentry)
+ __releases(dentry->d_lock)
+ __releases(dcache_lock)
{
struct inode *inode = dentry->d_inode;
if (inode) {
@@ -132,12 +134,13 @@ static void dentry_iput(struct dentry * dentry)
* d_kill - kill dentry and return parent
* @dentry: dentry to kill
*
- * Called with dcache_lock and d_lock, releases both. The dentry must
- * already be unhashed and removed from the LRU.
+ * The dentry must already be unhashed and removed from the LRU.
*
* If this is the root of the dentry tree, return NULL.
*/
static struct dentry *d_kill(struct dentry *dentry)
+ __releases(dentry->d_lock)
+ __releases(dcache_lock)
{
struct dentry *parent;
@@ -383,11 +386,11 @@ restart:
* Try to prune ancestors as well. This is necessary to prevent
* quadratic behavior of shrink_dcache_parent(), but is also expected
* to be beneficial in reducing dentry cache fragmentation.
- *
- * Called with dcache_lock, drops it and then regains.
- * Called with dentry->d_lock held, drops it.
*/
static void prune_one_dentry(struct dentry * dentry)
+ __releases(dentry->d_lock)
+ __releases(dcache_lock)
+ __acquires(dcache_lock)
{
__d_drop(dentry);
dentry = d_kill(dentry);
@@ -1604,10 +1607,9 @@ static int d_isparent(struct dentry *p1, struct dentry *p2)
*
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
- *
- * On return, dcache_lock will have been unlocked.
*/
static struct dentry *__d_unalias(struct dentry *dentry, struct dentry *alias)
+ __releases(dcache_lock)
{
struct mutex *m1 = NULL, *m2 = NULL;
struct dentry *ret;
@@ -1743,11 +1745,9 @@ out_nolock:
shouldnt_be_hashed:
spin_unlock(&dcache_lock);
BUG();
- goto shouldnt_be_hashed;
}
-static int prepend(char **buffer, int *buflen, const char *str,
- int namelen)
+static int prepend(char **buffer, int *buflen, const char *str, int namelen)
{
*buflen -= namelen;
if (*buflen < 0)
@@ -1757,8 +1757,13 @@ static int prepend(char **buffer, int *buflen, const char *str,
return 0;
}
+static int prepend_name(char **buffer, int *buflen, struct qstr *name)
+{
+ return prepend(buffer, buflen, name->name, name->len);
+}
+
/**
- * d_path - return the path of a dentry
+ * __d_path - return the path of a dentry
* @path: the dentry/vfsmount to report
* @root: root vfsmnt/dentry (may be modified by this function)
* @buffer: buffer to return value in
@@ -1779,9 +1784,10 @@ char *__d_path(const struct path *path, struct path *root,
{
struct dentry *dentry = path->dentry;
struct vfsmount *vfsmnt = path->mnt;
- char * end = buffer+buflen;
- char * retval;
+ char *end = buffer + buflen;
+ char *retval;
+ spin_lock(&vfsmount_lock);
prepend(&end, &buflen, "\0", 1);
if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
(prepend(&end, &buflen, " (deleted)", 10) != 0))
@@ -1800,38 +1806,37 @@ char *__d_path(const struct path *path, struct path *root,
break;
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
/* Global root? */
- spin_lock(&vfsmount_lock);
if (vfsmnt->mnt_parent == vfsmnt) {
- spin_unlock(&vfsmount_lock);
goto global_root;
}
dentry = vfsmnt->mnt_mountpoint;
vfsmnt = vfsmnt->mnt_parent;
- spin_unlock(&vfsmount_lock);
continue;
}
parent = dentry->d_parent;
prefetch(parent);
- if ((prepend(&end, &buflen, dentry->d_name.name,
- dentry->d_name.len) != 0) ||
+ if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
(prepend(&end, &buflen, "/", 1) != 0))
goto Elong;
retval = end;
dentry = parent;
}
+out:
+ spin_unlock(&vfsmount_lock);
return retval;
global_root:
retval += 1; /* hit the slash */
- if (prepend(&retval, &buflen, dentry->d_name.name,
- dentry->d_name.len) != 0)
+ if (prepend_name(&retval, &buflen, &dentry->d_name) != 0)
goto Elong;
root->mnt = vfsmnt;
root->dentry = dentry;
- return retval;
+ goto out;
+
Elong:
- return ERR_PTR(-ENAMETOOLONG);
+ retval = ERR_PTR(-ENAMETOOLONG);
+ goto out;
}
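The __d_path rework keeps the long-standing trick visible above: the path is assembled right-to-left, starting at the end of the caller's buffer, so each component (including the new prepend_name helper) only ever prepends and no lengths need to be known up front. A standalone, runnable model of the technique:

#include <stdio.h>
#include <string.h>

static int prepend(char **buffer, int *buflen, const char *str, int namelen)
{
        *buflen -= namelen;
        if (*buflen < 0)
                return -1;      /* the kernel returns -ENAMETOOLONG here */
        *buffer -= namelen;
        memcpy(*buffer, str, namelen);
        return 0;
}

int main(void)
{
        char buf[64];
        int buflen = sizeof(buf);
        char *end = buf + buflen;
        const char *parts[] = { "file", "dir", "home" }; /* leaf first */
        int i;

        prepend(&end, &buflen, "\0", 1);
        for (i = 0; i < 3; i++) {
                if (prepend(&end, &buflen, parts[i], strlen(parts[i])) ||
                    prepend(&end, &buflen, "/", 1))
                        return 1;
        }
        printf("%s\n", end);    /* prints /home/dir/file */
        return 0;
}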
/**
@@ -1845,9 +1850,9 @@ Elong:
*
* Returns the buffer or an error code if the path was too long.
*
- * "buflen" should be positive. Caller holds the dcache_lock.
+ * "buflen" should be positive.
*/
-char *d_path(struct path *path, char *buf, int buflen)
+char *d_path(const struct path *path, char *buf, int buflen)
{
char *res;
struct path root;
@@ -1915,16 +1920,11 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
retval = end-1;
*retval = '/';
- for (;;) {
- struct dentry *parent;
- if (IS_ROOT(dentry))
- break;
+ while (!IS_ROOT(dentry)) {
+ struct dentry *parent = dentry->d_parent;
- parent = dentry->d_parent;
prefetch(parent);
-
- if ((prepend(&end, &buflen, dentry->d_name.name,
- dentry->d_name.len) != 0) ||
+ if ((prepend_name(&end, &buflen, &dentry->d_name) != 0) ||
(prepend(&end, &buflen, "/", 1) != 0))
goto Elong;
@@ -1975,7 +1975,7 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
error = -ENOENT;
/* Has the current directory been unlinked? */
spin_lock(&dcache_lock);
- if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) {
+ if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) {
unsigned long len;
struct path tmp = root;
char * cwd;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 7c1e5e5cccd8..637018c891ef 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -50,6 +50,7 @@
#include <linux/pagemap.h>
#include <linux/idr.h>
#include <linux/file.h>
+#include <linux/mutex.h>
#include <linux/sctp.h>
#include <net/sctp/user.h>
@@ -138,7 +139,7 @@ static struct workqueue_struct *recv_workqueue;
static struct workqueue_struct *send_workqueue;
static DEFINE_IDR(connections_idr);
-static DECLARE_MUTEX(connections_lock);
+static DEFINE_MUTEX(connections_lock);
static int max_nodeid;
static struct kmem_cache *con_cache;
@@ -205,9 +206,9 @@ static struct connection *nodeid2con(int nodeid, gfp_t allocation)
{
struct connection *con;
- down(&connections_lock);
+ mutex_lock(&connections_lock);
con = __nodeid2con(nodeid, allocation);
- up(&connections_lock);
+ mutex_unlock(&connections_lock);
return con;
}
@@ -218,15 +219,15 @@ static struct connection *assoc2con(int assoc_id)
int i;
struct connection *con;
- down(&connections_lock);
+ mutex_lock(&connections_lock);
for (i=0; i<=max_nodeid; i++) {
con = __nodeid2con(i, 0);
if (con && con->sctp_assoc == assoc_id) {
- up(&connections_lock);
+ mutex_unlock(&connections_lock);
return con;
}
}
- up(&connections_lock);
+ mutex_unlock(&connections_lock);
return NULL;
}
@@ -381,7 +382,7 @@ static void sctp_init_failed(void)
int i;
struct connection *con;
- down(&connections_lock);
+ mutex_lock(&connections_lock);
for (i=1; i<=max_nodeid; i++) {
con = __nodeid2con(i, 0);
if (!con)
@@ -393,7 +394,7 @@ static void sctp_init_failed(void)
}
}
}
- up(&connections_lock);
+ mutex_unlock(&connections_lock);
}
/* Something happened to an association */
@@ -930,7 +931,7 @@ out_err:
* errors we try again until the max number of retries is reached.
*/
if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
- result != -ENETDOWN && result != EINVAL
+ result != -ENETDOWN && result != -EINVAL
&& result != -EPROTONOSUPPORT) {
lowcomms_connect_sock(con);
result = 0;
@@ -1417,7 +1418,7 @@ void dlm_lowcomms_stop(void)
/* Set all the flags to prevent any
socket activity.
*/
- down(&connections_lock);
+ mutex_lock(&connections_lock);
for (i = 0; i <= max_nodeid; i++) {
con = __nodeid2con(i, 0);
if (con) {
@@ -1426,11 +1427,11 @@ void dlm_lowcomms_stop(void)
con->sock->sk->sk_user_data = NULL;
}
}
- up(&connections_lock);
+ mutex_unlock(&connections_lock);
work_stop();
- down(&connections_lock);
+ mutex_lock(&connections_lock);
clean_writequeues();
for (i = 0; i <= max_nodeid; i++) {
@@ -1443,7 +1444,7 @@ void dlm_lowcomms_stop(void)
}
}
max_nodeid = 0;
- up(&connections_lock);
+ mutex_unlock(&connections_lock);
kmem_cache_destroy(con_cache);
idr_init(&connections_idr);
}
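On the lowcomms conversion above: DECLARE_MUTEX, despite its name, declared a semaphore, and binary semaphores used as locks were being replaced by real mutexes, which have a recorded owner and lockdep coverage. A runnable userspace analogue of the locking shape (pthread stands in for the kernel mutex; this is a sketch, not the kernel API):

#include <pthread.h>

/* A mutex has an owner and must be unlocked by the locking thread,
 * unlike a semaphore used as a binary lock. */
static pthread_mutex_t connections_lock = PTHREAD_MUTEX_INITIALIZER;

static void with_connections_locked(void (*fn)(void))
{
        pthread_mutex_lock(&connections_lock);   /* was down(&connections_lock) */
        fn();
        pthread_mutex_unlock(&connections_lock); /* was up(&connections_lock) */
}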
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 714593621f4f..18bda83cc892 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -95,7 +95,7 @@ int __init dlm_netlink_init(void)
return rv;
}
-void __exit dlm_netlink_exit(void)
+void dlm_netlink_exit(void)
{
genl_unregister_ops(&family, &dlm_nl_ops);
genl_unregister_family(&family);
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d6d6e370f89c..78878c5781ca 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -379,7 +379,7 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
struct plock_xop *xop;
xop = (struct plock_xop *)op;
if (xop->callback)
- count = dlm_plock_callback(op);
+ dlm_plock_callback(op);
else
wake_up(&recv_wq);
} else
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebbcf38fd33b..f976f303c196 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -15,6 +15,7 @@
#include <linux/poll.h>
#include <linux/signal.h>
#include <linux/spinlock.h>
+#include <linux/smp_lock.h>
#include <linux/dlm.h>
#include <linux/dlm_device.h>
@@ -618,13 +619,17 @@ static int device_open(struct inode *inode, struct file *file)
struct dlm_user_proc *proc;
struct dlm_ls *ls;
+ lock_kernel();
ls = dlm_find_lockspace_device(iminor(inode));
- if (!ls)
+ if (!ls) {
+ unlock_kernel();
return -ENOENT;
+ }
proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
if (!proc) {
dlm_put_lockspace(ls);
+ unlock_kernel();
return -ENOMEM;
}
@@ -636,6 +641,7 @@ static int device_open(struct inode *inode, struct file *file)
spin_lock_init(&proc->locks_spin);
init_waitqueue_head(&proc->wait);
file->private_data = proc;
+ unlock_kernel();
return 0;
}
@@ -870,6 +876,7 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
static int ctl_device_open(struct inode *inode, struct file *file)
{
+ cycle_kernel_lock();
file->private_data = NULL;
return 0;
}
diff --git a/fs/dquot.c b/fs/dquot.c
index dfba1623cccb..5ac77da19959 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -1491,6 +1491,16 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
/* We need to serialize quota_off() for device */
mutex_lock(&dqopt->dqonoff_mutex);
+
+ /*
+ * Skip everything if there's nothing to do. We have to do this because
+ * sometimes we are called when fill_super() failed and calling
+ * sync_fs() in such cases does no good.
+ */
+ if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) {
+ mutex_unlock(&dqopt->dqonoff_mutex);
+ return 0;
+ }
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
toputinode[cnt] = NULL;
if (type != -1 && cnt != type)
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index cd62d75b2cc0..e2832bc7869a 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1906,9 +1906,9 @@ int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm,
goto out;
}
}
- mutex_unlock(&key_tfm_list_mutex);
(*tfm) = key_tfm->key_tfm;
(*tfm_mutex) = &key_tfm->key_tfm_mutex;
out:
+ mutex_unlock(&key_tfm_list_mutex);
return rc;
}
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 951ee33a022d..c15c25745e05 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -660,8 +660,6 @@ int ecryptfs_get_tfm_and_mutex_for_cipher_name(struct crypto_blkcipher **tfm,
int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
struct ecryptfs_auth_tok **auth_tok,
char *sig);
-int ecryptfs_write_zeros(struct file *file, pgoff_t index, int start,
- int num_zeros);
int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
loff_t offset, size_t size);
int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2258b8f654a6..24749bf0668f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -30,6 +30,7 @@
#include <linux/security.h>
#include <linux/compat.h>
#include <linux/fs_stack.h>
+#include <linux/smp_lock.h>
#include "ecryptfs_kernel.h"
/**
@@ -277,9 +278,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
int rc = 0;
struct file *lower_file = NULL;
+ lock_kernel();
lower_file = ecryptfs_file_to_lower(file);
if (lower_file->f_op && lower_file->f_op->fasync)
rc = lower_file->f_op->fasync(fd, lower_file, flag);
+ unlock_kernel();
return rc;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 0a1397335a8e..c92cc1c00aae 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -37,17 +37,11 @@ static struct dentry *lock_parent(struct dentry *dentry)
{
struct dentry *dir;
- dir = dget(dentry->d_parent);
+ dir = dget_parent(dentry);
mutex_lock_nested(&(dir->d_inode->i_mutex), I_MUTEX_PARENT);
return dir;
}
-static void unlock_parent(struct dentry *dentry)
-{
- mutex_unlock(&(dentry->d_parent->d_inode->i_mutex));
- dput(dentry->d_parent);
-}
-
static void unlock_dir(struct dentry *dir)
{
mutex_unlock(&dir->d_inode->i_mutex);
@@ -426,8 +420,9 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
int rc = 0;
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
+ struct dentry *lower_dir_dentry;
- lock_parent(lower_dentry);
+ lower_dir_dentry = lock_parent(lower_dentry);
rc = vfs_unlink(lower_dir_inode, lower_dentry);
if (rc) {
printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
@@ -439,7 +434,7 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry)
dentry->d_inode->i_ctime = dir->i_ctime;
d_drop(dentry);
out_unlock:
- unlock_parent(lower_dentry);
+ unlock_dir(lower_dir_dentry);
return rc;
}
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 788995efd1d3..09a4522f65e6 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -243,7 +243,6 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
struct ecryptfs_daemon *daemon;
struct ecryptfs_msg_ctx *msg_ctx;
size_t packet_length_size;
- u32 counter_nbo;
char packet_length[3];
size_t i;
size_t total_length;
@@ -257,12 +256,14 @@ ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count,
mutex_lock(&daemon->mux);
if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) {
rc = 0;
+ mutex_unlock(&ecryptfs_daemon_hash_mux);
printk(KERN_WARNING "%s: Attempt to read from zombified "
"daemon\n", __func__);
goto out_unlock_daemon;
}
if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) {
rc = 0;
+ mutex_unlock(&ecryptfs_daemon_hash_mux);
goto out_unlock_daemon;
}
/* This daemon will not go away so long as this flag is set */
@@ -326,20 +327,18 @@ check_list:
"pending message\n", __func__, count, total_length);
goto out_unlock_msg_ctx;
}
- i = 0;
- buf[i++] = msg_ctx->type;
- counter_nbo = cpu_to_be32(msg_ctx->counter);
- memcpy(&buf[i], (char *)&counter_nbo, 4);
- i += 4;
+ rc = -EFAULT;
+ if (put_user(msg_ctx->type, buf))
+ goto out_unlock_msg_ctx;
+ if (put_user(cpu_to_be32(msg_ctx->counter), (__be32 __user *)(buf + 1)))
+ goto out_unlock_msg_ctx;
+ i = 5;
if (msg_ctx->msg) {
- memcpy(&buf[i], packet_length, packet_length_size);
+ if (copy_to_user(&buf[i], packet_length, packet_length_size))
+ goto out_unlock_msg_ctx;
i += packet_length_size;
- rc = copy_to_user(&buf[i], msg_ctx->msg, msg_ctx->msg_size);
- if (rc) {
- printk(KERN_ERR "%s: copy_to_user returned error "
- "[%d]\n", __func__, rc);
+ if (copy_to_user(&buf[i], msg_ctx->msg, msg_ctx->msg_size))
goto out_unlock_msg_ctx;
- }
i += msg_ctx->msg_size;
}
rc = i;
@@ -450,7 +449,8 @@ static ssize_t
ecryptfs_miscdev_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- u32 counter_nbo, seq;
+ __be32 counter_nbo;
+ u32 seq;
size_t packet_size, packet_size_length, i;
ssize_t sz = 0;
char *data;
@@ -483,7 +483,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf,
count);
goto out_free;
}
- memcpy((char *)&counter_nbo, &data[i], 4);
+ memcpy(&counter_nbo, &data[i], 4);
seq = be32_to_cpu(counter_nbo);
i += 4;
rc = ecryptfs_parse_packet_length(&data[i], &packet_size,
@@ -575,13 +575,11 @@ int ecryptfs_init_ecryptfs_miscdev(void)
int rc;
atomic_set(&ecryptfs_num_miscdev_opens, 0);
- mutex_lock(&ecryptfs_daemon_hash_mux);
rc = misc_register(&ecryptfs_miscdev);
if (rc)
printk(KERN_ERR "%s: Failed to register miscellaneous device "
"for communications with userspace daemons; rc = [%d]\n",
__func__, rc);
- mutex_unlock(&ecryptfs_daemon_hash_mux);
return rc;
}
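The miscdev read fix above is about where the bytes land: buf is a user-space pointer, so each header field must go through put_user/copy_to_user rather than a direct memcpy into buf. The wire layout itself is unchanged; a small userspace sketch of the 5-byte header the daemon reads back (this models the format only, not the kernel copy helpers):

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* Wire format handed to the daemon: 1 byte type, 4 byte big-endian
 * counter, then the packet-length-prefixed message payload. */
static size_t build_header(uint8_t *dst, uint8_t type, uint32_t counter)
{
        uint32_t be = htonl(counter);

        dst[0] = type;
        memcpy(dst + 1, &be, 4);
        return 5;               /* matches the kernel's "i = 5" */
}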
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index ebf55150be56..75c2ea9fee35 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -157,20 +157,6 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
ecryptfs_page_idx, rc);
goto out;
}
- if (start_offset_in_page) {
- /* Read in the page from the lower
- * into the eCryptfs inode page cache,
- * decrypting */
- rc = ecryptfs_decrypt_page(ecryptfs_page);
- if (rc) {
- printk(KERN_ERR "%s: Error decrypting "
- "page; rc = [%d]\n",
- __func__, rc);
- ClearPageUptodate(ecryptfs_page);
- page_cache_release(ecryptfs_page);
- goto out;
- }
- }
ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
/*
@@ -349,14 +335,6 @@ int ecryptfs_read(char *data, loff_t offset, size_t size,
ecryptfs_page_idx, rc);
goto out;
}
- rc = ecryptfs_decrypt_page(ecryptfs_page);
- if (rc) {
- printk(KERN_ERR "%s: Error decrypting "
- "page; rc = [%d]\n", __func__, rc);
- ClearPageUptodate(ecryptfs_page);
- page_cache_release(ecryptfs_page);
- goto out;
- }
ecryptfs_page_virt = kmap_atomic(ecryptfs_page, KM_USER0);
memcpy((data + data_offset),
((char *)ecryptfs_page_virt + start_offset_in_page),
diff --git a/fs/exec.c b/fs/exec.c
index aeaa9791d8be..fd9234379e8d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,7 +26,6 @@
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mman.h>
-#include <linux/a.out.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/smp_lock.h>
@@ -61,6 +60,11 @@
#include <linux/kmod.h>
#endif
+#ifdef __alpha__
+/* for /sbin/loader handling in search_binary_handler() */
+#include <linux/a.out.h>
+#endif
+
int core_uses_pid;
char core_pattern[CORENAME_MAX_SIZE] = "core";
int suid_dumpable = 0;
@@ -606,7 +610,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
bprm->exec -= stack_shift;
down_write(&mm->mmap_sem);
- vm_flags = vma->vm_flags;
+ vm_flags = VM_STACK_FLAGS;
/*
* Adjust stack execute permissions; explicitly enable for
@@ -736,7 +740,7 @@ static int exec_mmap(struct mm_struct *mm)
tsk->active_mm = mm;
activate_mm(active_mm, mm);
task_unlock(tsk);
- mm_update_next_owner(mm);
+ mm_update_next_owner(old_mm);
arch_pick_mmap_layout(mm);
if (old_mm) {
up_read(&old_mm->mmap_sem);
@@ -860,6 +864,7 @@ static int de_thread(struct task_struct *tsk)
no_thread_group:
exit_itimers(sig);
+ flush_itimer_signals();
if (leader)
release_task(leader);
@@ -1154,7 +1159,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
{
int try,retval;
struct linux_binfmt *fmt;
-#if defined(__alpha__) && defined(CONFIG_ARCH_SUPPORTS_AOUT)
+#ifdef __alpha__
/* handle /sbin/loader.. */
{
struct exec * eh = (struct exec *) bprm->buf;
@@ -1251,6 +1256,12 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
EXPORT_SYMBOL(search_binary_handler);
+void free_bprm(struct linux_binprm *bprm)
+{
+ free_arg_pages(bprm);
+ kfree(bprm);
+}
+
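free_bprm pairs free_arg_pages() with kfree() so that every exit path of do_execve (and compat_do_execve, switched over in the fs/compat.c hunk earlier) releases the argument pages; previously the success and error paths had to remember both steps. The general single-teardown-helper pattern, as a portable sketch (struct binprm here is illustrative, not the kernel's linux_binprm):

#include <stdlib.h>

struct binprm { char **arg_pages; /* ... */ };

/* Single teardown helper: every exit path calls this instead of
 * hand-rolling free(arg_pages) + free(bprm), so nothing leaks. */
static void free_binprm(struct binprm *bprm)
{
        if (!bprm)
                return;
        free(bprm->arg_pages);
        free(bprm);
}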
/*
* sys_execve() executes a new program.
*/
@@ -1320,17 +1331,15 @@ int do_execve(char * filename,
retval = search_binary_handler(bprm,regs);
if (retval >= 0) {
/* execve success */
- free_arg_pages(bprm);
security_bprm_free(bprm);
acct_update_integrals(current);
- kfree(bprm);
+ free_bprm(bprm);
if (displaced)
put_files_struct(displaced);
return retval;
}
out:
- free_arg_pages(bprm);
if (bprm->security)
security_bprm_free(bprm);
@@ -1344,7 +1353,7 @@ out_file:
fput(bprm->file);
}
out_kfree:
- kfree(bprm);
+ free_bprm(bprm);
out_files:
if (displaced)
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 28cfd0b40527..77278e947e94 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -580,7 +580,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
}
blk = EXT3_SB(sb)->s_sbh->b_blocknr + 1 + EXT3_SB(sb)->s_gdb_count;
- data = (__le32 *)dind->b_data + EXT3_SB(sb)->s_gdb_count;
+ data = (__le32 *)dind->b_data + (EXT3_SB(sb)->s_gdb_count %
+ EXT3_ADDR_PER_BLOCK(sb));
end = (__le32 *)dind->b_data + EXT3_ADDR_PER_BLOCK(sb);
/* Get each reserved primary GDT block and verify it holds backups */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index fe3119a71ada..2845425077e8 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2875,8 +2875,10 @@ static ssize_t ext3_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite)
+ if (len == towrite) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if (inode->i_size < off+len-towrite) {
i_size_write(inode, off+len-towrite);
EXT3_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index d4a4f0e9ff69..175414ac2210 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -1000,6 +1000,11 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
i.value = NULL;
error = ext3_xattr_block_set(handle, inode, &i, &bs);
} else if (error == -ENOSPC) {
+ if (EXT3_I(inode)->i_file_acl && !bs.s.base) {
+ error = ext3_xattr_block_find(inode, &i, &bs);
+ if (error)
+ goto cleanup;
+ }
error = ext3_xattr_block_set(handle, inode, &i, &bs);
if (error)
goto cleanup;
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index da994374ec3b..495ab21b9832 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -43,6 +43,46 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
}
+static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
+ ext4_group_t block_group)
+{
+ ext4_group_t actual_group;
+ ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
+ if (actual_group == block_group)
+ return 1;
+ return 0;
+}
+
+static int ext4_group_used_meta_blocks(struct super_block *sb,
+ ext4_group_t block_group)
+{
+ ext4_fsblk_t tmp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ /* block bitmap, inode bitmap, and inode table blocks */
+ int used_blocks = sbi->s_itb_per_group + 2;
+
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ struct ext4_group_desc *gdp;
+ struct buffer_head *bh;
+
+ gdp = ext4_get_group_desc(sb, block_group, &bh);
+ if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
+ block_group))
+ used_blocks--;
+
+ if (!ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp),
+ block_group))
+ used_blocks--;
+
+ tmp = ext4_inode_table(sb, gdp);
+ for (; tmp < ext4_inode_table(sb, gdp) +
+ sbi->s_itb_per_group; tmp++) {
+ if (!ext4_block_in_group(sb, tmp, block_group))
+ used_blocks -= 1;
+ }
+ }
+ return used_blocks;
+}
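The helper pair above corrects a FLEX_BG assumption: a group's block bitmap, inode bitmap, and inode table need not live inside the group itself, so the fixed overhead of s_itb_per_group + 2 must be reduced by every metadata block that actually resides elsewhere. A standalone sketch of that count, using an explicit block range where the kernel compares group numbers via ext4_get_group_no_and_offset:

/* Count the metadata blocks that really fall inside [first, last];
 * with FLEX_BG some may be packed into another group. Illustrative
 * types and parameter names. */
static int used_meta_blocks(unsigned long first, unsigned long last,
                            unsigned long block_bitmap, unsigned long inode_bitmap,
                            unsigned long inode_table, int itb_per_group)
{
        int used = itb_per_group + 2;   /* assume all metadata is local */
        unsigned long b;

        if (block_bitmap < first || block_bitmap > last)
                used--;
        if (inode_bitmap < first || inode_bitmap > last)
                used--;
        for (b = inode_table; b < inode_table + itb_per_group; b++)
                if (b < first || b > last)
                        used--;
        return used;
}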
/* Initializes an uninitialized block bitmap if given, and returns the
* number of blocks free in the group. */
unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
@@ -81,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
}
} else { /* For META_BG_BLOCK_GROUPS */
- int group_rel = (block_group -
- le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
- EXT4_DESC_PER_BLOCK(sb);
- if (group_rel == 0 || group_rel == 1 ||
- (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
- bit_max += 1;
+ bit_max += ext4_bg_num_gdb(sb, block_group);
}
if (block_group == sbi->s_groups_count - 1) {
@@ -105,20 +140,34 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
free_blocks = group_blocks - bit_max;
if (bh) {
- ext4_fsblk_t start;
+ ext4_fsblk_t start, tmp;
+ int flex_bg = 0;
for (bit = 0; bit < bit_max; bit++)
ext4_set_bit(bit, bh->b_data);
start = ext4_group_first_block_no(sb, block_group);
- /* Set bits for block and inode bitmaps, and inode table */
- ext4_set_bit(ext4_block_bitmap(sb, gdp) - start, bh->b_data);
- ext4_set_bit(ext4_inode_bitmap(sb, gdp) - start, bh->b_data);
- for (bit = (ext4_inode_table(sb, gdp) - start),
- bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++)
- ext4_set_bit(bit, bh->b_data);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ flex_bg = 1;
+ /* Set bits for block and inode bitmaps, and inode table */
+ tmp = ext4_block_bitmap(sb, gdp);
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(tmp - start, bh->b_data);
+
+ tmp = ext4_inode_bitmap(sb, gdp);
+ if (!flex_bg || ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(tmp - start, bh->b_data);
+
+ tmp = ext4_inode_table(sb, gdp);
+ for (; tmp < ext4_inode_table(sb, gdp) +
+ sbi->s_itb_per_group; tmp++) {
+ if (!flex_bg ||
+ ext4_block_in_group(sb, tmp, block_group))
+ ext4_set_bit(tmp - start, bh->b_data);
+ }
/*
* Also if the number of blocks within the group is
* less than the blocksize * 8 ( which is the size
@@ -126,8 +175,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
*/
mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
}
-
- return free_blocks - sbi->s_itb_per_group - 2;
+ return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
}
@@ -242,7 +290,7 @@ err_out:
return 0;
}
/**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
* @sb: super block
* @block_group: given block group
*
@@ -252,7 +300,7 @@ err_out:
* Return buffer_head on success or NULL in case of failure.
*/
struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct ext4_group_desc * desc;
struct buffer_head * bh = NULL;
@@ -287,11 +335,11 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
(int)block_group, (unsigned long long)bitmap_blk);
return NULL;
}
- if (!ext4_valid_block_bitmap(sb, desc, block_group, bh)) {
- put_bh(bh);
- return NULL;
- }
-
+ ext4_valid_block_bitmap(sb, desc, block_group, bh);
+ /*
+ * file system was mounted not to panic on error,
+ * so continue with the corrupt bitmap
+ */
return bh;
}
/*
@@ -356,8 +404,7 @@ restart:
prev = rsv;
}
printk("Window map complete.\n");
- if (bad)
- BUG();
+ BUG_ON(bad);
}
#define rsv_window_dump(root, verbose) \
__rsv_window_dump((root), (verbose), __func__)
@@ -641,7 +688,7 @@ do_more:
count -= overflow;
}
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -757,6 +804,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1545,23 +1599,35 @@ out:
/**
* ext4_has_free_blocks()
- * @sbi: in-core super block structure.
+ * @sbi: in-core super block structure.
+ * @nblocks: number of needed blocks
*
- * Check if filesystem has at least 1 free block available for allocation.
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks available for allocation for this request.
+ * On success, return nblocks.
*/
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks)
{
- ext4_fsblk_t free_blocks, root_blocks;
+ ext4_fsblk_t free_blocks;
+ ext4_fsblk_t root_blocks = 0;
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
- root_blocks = ext4_r_blocks_count(sbi->s_es);
- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+
+ if (!capable(CAP_SYS_RESOURCE) &&
sbi->s_resuid != current->fsuid &&
- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
- return 0;
- }
- return 1;
-}
+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+ root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+ if (free_blocks - root_blocks < FBC_BATCH)
+ free_blocks =
+ percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+ if (free_blocks - root_blocks < nblocks)
+ return free_blocks - root_blocks;
+ return nblocks;
+ }
+
/**
* ext4_should_retry_alloc()
@@ -1577,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
- if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+ if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
return 0;
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
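ext4_has_free_blocks now answers "how many of the requested blocks can be granted" instead of yes/no, and it only pays for an exact percpu-counter sum when the cheap approximate read is within FBC_BATCH of the reservation limit. The accounting, modelled as a standalone function (BATCH and the parameter names are illustrative):

#include <stdint.h>

#define BATCH 32  /* stand-in for FBC_BATCH, the percpu counter's slack */

/* Returns min(nblocks, free - reserved); callers get a partial grant
 * instead of a flat yes/no, which the delalloc path needs. */
static uint64_t has_free_blocks(uint64_t approx_free, uint64_t reserved,
                                uint64_t nblocks,
                                uint64_t (*exact_sum)(void))
{
        uint64_t free_blocks = approx_free;

        if (free_blocks - reserved < BATCH)
                free_blocks = exact_sum();  /* too close to the limit: resync */
        if (free_blocks - reserved < nblocks)
                return free_blocks - reserved;
        return nblocks;
}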
@@ -1586,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
}
/**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block-bitmap-based block allocation function
+ *
* @handle: handle to this transaction
* @inode: file inode
* @goal: given target block(filesystem wide)
* @count: target number of blocks to allocate
* @errp: error code
*
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and looks up
+ * the block bitmap directly to do block allocation. It tries to
+ * allocate block(s) from the block group that contains the goal block first.
+ * If that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when the -o nomballoc mount option is enabled
*
*/
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
struct buffer_head *bitmap_bh = NULL;
@@ -1623,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
ext4_group_t ngroups;
unsigned long num = *count;
- *errp = -ENOSPC;
sb = inode->i_sb;
if (!sb) {
+ *errp = -ENODEV;
printk("ext4_new_block: nonexistent device");
return 0;
}
+ sbi = EXT4_SB(sb);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ *count = ext4_has_free_blocks(sbi, *count);
+ }
+ if (*count == 0) {
+ *errp = -ENOSPC;
+ return 0; /* return with ENOSPC error */
+ }
+ num = *count;
+
/*
* Check quota for allocation of this block.
*/
@@ -1653,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
my_rsv = &block_i->rsv_window_node;
- if (!ext4_has_free_blocks(sbi)) {
- *errp = -ENOSPC;
- goto out;
- }
-
/*
* First, test whether the goal block is free.
*/
@@ -1681,7 +1759,7 @@ retry_alloc:
my_rsv = NULL;
if (free_blocks > 0) {
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1717,7 +1795,7 @@ retry_alloc:
continue;
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, group_no);
+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
if (!bitmap_bh)
goto io_error;
/*
@@ -1770,7 +1848,12 @@ allocated:
"Allocating block in system zone - "
"blocks from %llu, length %lu",
ret_block, num);
- goto out;
+ /*
+ * claim_block marked the blocks we allocated
+ * as in use, so we may want to selectively
+ * mark some of those blocks as free again
+ */
+ goto retry_alloc;
}
performed_allocation = 1;
@@ -1824,7 +1907,15 @@ allocated:
le16_add_cpu(&gdp->bg_free_blocks_count, -num);
gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
spin_unlock(sb_bgl_lock(sbi, group_no));
- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+ if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= num;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1857,46 +1948,104 @@ out:
return 0;
}
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int *errp)
+#define EXT4_META_BLOCK 0x1
+
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp, int flags)
{
struct ext4_allocation_request ar;
ext4_fsblk_t ret;
if (!test_opt(inode->i_sb, MBALLOC)) {
- unsigned long count = 1;
- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
- return ret;
+ return ext4_old_new_blocks(handle, inode, goal, count, errp);
}
memset(&ar, 0, sizeof(ar));
+ /* Fill with neighbour allocated blocks */
+
ar.inode = inode;
ar.goal = goal;
- ar.len = 1;
+ ar.len = *count;
+ ar.logical = iblock;
+
+ if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+ /* enable in-core preallocation for data block allocation */
+ ar.flags = EXT4_MB_HINT_DATA;
+ else
+ /* disable in-core preallocation for non-regular files */
+ ar.flags = 0;
+
ret = ext4_mb_new_blocks(handle, &ar, errp);
+ *count = ar.len;
return ret;
}
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate block(s) for metadata (indexing) blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block (filesystem wide)
+ * @count: total number of blocks needed
+ * @errp: error code
+ *
+ * Return the 1st allocated block number on success; *count stores the total
+ * number of blocks allocated, and any error is stored in *errp.
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp)
{
- struct ext4_allocation_request ar;
ext4_fsblk_t ret;
-
- if (!test_opt(inode->i_sb, MBALLOC)) {
- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
- return ret;
+ ret = do_blk_alloc(handle, inode, 0, goal,
+ count, errp, EXT4_META_BLOCK);
+ /*
+ * Account for the allocated meta blocks
+ */
+ if (!(*errp)) {
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ EXT4_I(inode)->i_allocated_meta_blocks += *count;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
}
-
- memset(&ar, 0, sizeof(ar));
- ar.inode = inode;
- ar.goal = goal;
- ar.len = *count;
- ret = ext4_mb_new_blocks(handle, &ar, errp);
- *count = ar.len;
return ret;
}
+/*
+ * ext4_new_meta_block() -- allocate a single metadata (indexing) block
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @goal: given target block(filesystem wide)
+ * @errp: error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+ ext4_fsblk_t goal, int *errp)
+{
+ unsigned long count = 1;
+ return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle: handle to this transaction
+ * @inode: file inode
+ * @iblock: logical block of the file
+ * @goal: given target block (filesystem wide)
+ * @count: total number of blocks needed
+ * @errp: error code
+ *
+ * Return the 1st allocated block number on success; *count stores the total
+ * number of blocks allocated, and any error is stored in *errp.
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp)
+{
+ return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
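To make the split between the two entry points concrete, a hedged sketch of
how a caller chooses between them (the surrounding variable names are
illustrative only, not from this patch):

	/* metadata (e.g. extent index) blocks: no logical block, no delalloc */
	newblock = ext4_new_meta_block(handle, inode, goal, &err);

	/* data blocks: pass the file-relative logical block so mballoc can
	   use it as a hint, and delalloc reservations are honoured */
	count = needed;
	first = ext4_new_blocks(handle, inode, iblock, goal, &count, &err);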
/**
* ext4_count_free_blocks() -- count filesystem free blocks
@@ -1928,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
continue;
desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
brelse(bitmap_bh);
- bitmap_bh = read_block_bitmap(sb, i);
+ bitmap_bh = ext4_read_block_bitmap(sb, i);
if (bitmap_bh == NULL)
continue;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea194..d3d23d73c08b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
struct buffer_head *bh = NULL;
map_bh.b_state = 0;
- err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+ err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+ 0, 0, 0);
if (err > 0) {
pgoff_t index = map_bh.b_blocknr >>
(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
while (n) {
/* Do the node's children first */
- if ((n)->rb_left) {
+ if (n->rb_left) {
n = n->rb_left;
continue;
}
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
parent->rb_right = NULL;
n = parent;
}
- root->rb_node = NULL;
}
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
{
struct dir_private_info *p;
- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
if (!p)
return NULL;
- p->root.rb_node = NULL;
- p->curr_node = NULL;
- p->extra_fname = NULL;
- p->last_pos = 0;
p->curr_hash = pos2maj_hash(pos);
p->curr_minor_hash = pos2min_hash(pos);
- p->next_hash = 0;
return p;
}
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
int ret;
if (!info) {
- info = create_dir_info(filp->f_pos);
+ info = ext4_htree_create_dir_info(filp->f_pos);
if (!info)
return -ENOMEM;
filp->private_data = info;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac0..303e41cf7b14 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
#include "ext4_i.h"
/*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
*/
/*
@@ -45,7 +45,7 @@
#define ext4_debug(f, a...) \
do { \
printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
- __FILE__, __LINE__, __FUNCTION__); \
+ __FILE__, __LINE__, __func__); \
printk (KERN_DEBUG f, ## a); \
} while (0)
#else
@@ -74,6 +74,9 @@
#define EXT4_MB_HINT_GOAL_ONLY 256
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL 512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED 1024
+
struct ext4_allocation_request {
/* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
__u32 bg_reserved2[3];
};
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+ __u32 free_inodes;
+ __u32 free_blocks;
+};
+
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do { \
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
__le16 s_mmp_interval; /* # seconds to wait in MMP checking */
__le64 s_mmp_block; /* Block for multi-mount protection */
__le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
- __u32 s_reserved[163]; /* Padding to the end of the block */
+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
+ __u8 s_reserved_char_pad2;
+ __le16 s_reserved_pad;
+ __u32 s_reserved[162]; /* Padding to the end of the block */
};
#ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, int *errp);
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+ ext4_fsblk_t nblocks);
extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
ext4_fsblk_t block, unsigned long count, int metadata);
extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
extern void exit_ext4_mballoc(void);
extern void ext4_mb_free_blocks(handle_t *, struct inode *,
unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+ ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
+ ext4_grpblk_t add);
/* inode.c */
+void ext4_da_release_space(struct inode *inode, int used, int to_free);
int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
struct buffer_head *bh, ext4_fsblk_t blocknr);
struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
extern struct inode *ext4_iget(struct super_block *, unsigned long);
extern int ext4_write_inode (struct inode *, int);
extern int ext4_setattr (struct dentry *, struct iattr *);
+extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
extern void ext4_delete_inode (struct inode *);
extern int ext4_sync_inode (handle_t *, struct inode *);
extern void ext4_discard_reservation (struct inode *);
extern void ext4_dirty_inode(struct inode *);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate (struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
/* ioctl.c */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
}
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+ ext4_group_t block_group)
+{
+ return block_group >> sbi->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+ return 1 << sbi->s_log_groups_per_flex;
+}
+
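A short worked example of the two helpers above, assuming an illustrative
s_log_groups_per_flex of 4:

	ext4_flex_bg_size(sbi);		/* 1 << 4  == 16 groups per flex bg */
	ext4_flex_group(sbi, 35);	/* 35 >> 4 == flex group 2 */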
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
- __ext4_std_error((sb), __FUNCTION__, (errno)); \
+ __ext4_std_error((sb), __func__, (errno)); \
} while (0)
/*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_lblk_t iblock,
unsigned long max_blocks, struct buffer_head *bh_result,
int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
sector_t block, unsigned long max_blocks,
struct buffer_head *bh, int create,
- int extend_disksize);
+ int extend_disksize, int flag);
#endif /* __KERNEL__ */
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fab..6c166c0a54b7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d79..ef7409f0e7e4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
};
/*
- * third extended file system inode data in memory
+ * fourth extended file system inode data in memory
*/
struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
*/
struct rw_semaphore i_data_sem;
struct inode vfs_inode;
+ struct jbd2_inode jinode;
unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
/* mballoc */
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+
+ /* allocation reservation info for delalloc */
+ unsigned long i_reserved_data_blocks;
+ unsigned long i_reserved_meta_blocks;
+ unsigned long i_allocated_meta_blocks;
+ unsigned short i_delalloc_reserved_flag;
+ spinlock_t i_block_reservation_lock;
};
#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b24..eb8bc3afe6e9 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
handle_t *handle, struct buffer_head *bh);
#define ext4_journal_get_undo_access(handle, bh) \
- __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_undo_access(__func__, (handle), (bh))
#define ext4_journal_get_write_access(handle, bh) \
- __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_write_access(__func__, (handle), (bh))
#define ext4_journal_revoke(handle, blocknr, bh) \
- __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+ __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
#define ext4_journal_get_create_access(handle, bh) \
- __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
+ __ext4_journal_get_create_access(__func__, (handle), (bh))
#define ext4_journal_dirty_metadata(handle, bh) \
- __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+ __ext4_journal_dirty_metadata(__func__, (handle), (bh))
#define ext4_journal_forget(handle, bh) \
- __ext4_journal_forget(__FUNCTION__, (handle), (bh))
-
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+ __ext4_journal_forget(__func__, (handle), (bh))
handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
}
#define ext4_journal_stop(handle) \
- __ext4_journal_stop(__FUNCTION__, (handle))
+ __ext4_journal_stop(__func__, (handle))
static inline handle_t *ext4_journal_current_handle(void)
{
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
return jbd2_journal_force_commit(journal);
}
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+ return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+}
+
/* super.c */
int ext4_force_commit(struct super_block *sb);
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f2191..6300226d5531 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
#include <linux/rbtree.h>
/*
- * third extended-fs super-block data in memory
+ * fourth extended-fs super-block data in memory
*/
struct ext4_sb_info {
unsigned long s_desc_size; /* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
/* locality groups */
struct ext4_locality_group *s_locality_groups;
+
+ unsigned int s_log_groups_per_flex;
+ struct flex_groups *s_flex_groups;
};
#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 47929c4e3dae..42c4c0c892ed 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
}
-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
+static int ext4_ext_journal_restart(handle_t *handle, int needed)
{
int err;
if (handle->h_buffer_credits > needed)
- return handle;
- if (!ext4_journal_extend(handle, needed))
- return handle;
- err = ext4_journal_restart(handle, needed);
-
- return handle;
+ return 0;
+ err = ext4_journal_extend(handle, needed);
+ if (err)
+ return err;
+ return ext4_journal_restart(handle, needed);
}
/*
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
return bg_start + colour + block;
}
+/*
+ * Allocation for a meta data block
+ */
static ext4_fsblk_t
-ext4_ext_new_block(handle_t *handle, struct inode *inode,
+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
struct ext4_ext_path *path,
struct ext4_extent *ex, int *err)
{
ext4_fsblk_t goal, newblock;
goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
- newblock = ext4_new_block(handle, inode, goal, err);
+ newblock = ext4_new_meta_block(handle, inode, goal, err);
return newblock;
}
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
return size;
}
+/*
+ * Calculate the number of metadata blocks needed
+ * to allocate @blocks
+ * Worst case is one block per extent
+ */
+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int lcap, icap, rcap, leafs, idxs, num;
+ int newextents = blocks;
+
+ rcap = ext4_ext_space_root_idx(inode);
+ lcap = ext4_ext_space_block(inode);
+ icap = ext4_ext_space_block_idx(inode);
+
+ /* number of new leaf blocks needed */
+ num = leafs = (newextents + lcap - 1) / lcap;
+
+ /*
+ * Worst case, we need separate index block(s)
+ * to link all new leaf blocks
+ */
+ idxs = (leafs + icap - 1) / icap;
+ do {
+ num += idxs;
+ idxs = (idxs + icap - 1) / icap;
+ } while (idxs > rcap);
+
+ return num;
+}
+
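A worked example of the arithmetic above, assuming 4KiB blocks where a leaf
or index block holds about 340 twelve-byte entries (lcap == icap == 340) and
the in-inode root holds about 4 (rcap == 4); these capacities are typical
values, not guaranteed:

	/*
	 * ext4_ext_calc_metadata_amount(inode, 1000):
	 *	leafs = (1000 + 339) / 340 = 3	-> num = 3
	 *	idxs  = (3 + 339) / 340    = 1	-> num = 4, next idxs = 1
	 * the loop exits since 1 <= rcap, so 4 metadata blocks are needed.
	 */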
static int
ext4_ext_max_entries(struct inode *inode, int depth)
{
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
alloc = 1;
}
path[0].p_hdr = eh;
+ path[0].p_bh = NULL;
i = depth;
/* walk through the tree */
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
}
path[ppos].p_depth = i;
- path[ppos].p_hdr = eh;
path[ppos].p_ext = NULL;
path[ppos].p_idx = NULL;
/* find extent */
ext4_ext_binsearch(inode, path + ppos, block);
+ /* if not an empty leaf */
+ if (path[ppos].p_ext)
+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
ext4_ext_show_path(inode, path);
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
/* allocate all needed blocks */
ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
for (a = 0; a < depth - at; a++) {
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path,
+ newext, &err);
if (newblock == 0)
goto cleanup;
ablocks[a] = newblock;
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
ext4_fsblk_t newblock;
int err = 0;
- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
+ newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
if (newblock == 0)
return err;
@@ -981,6 +1017,8 @@ repeat:
/* if we found index with free entry, then use that
* entry: create all needed subtree and add new leaf */
err = ext4_ext_split(handle, inode, path, newext, i);
+ if (err)
+ goto out;
/* refill path */
ext4_ext_drop_refs(path);
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
#endif
- handle = ext4_ext_journal_restart(handle, credits);
- if (IS_ERR(handle)) {
- err = PTR_ERR(handle);
+ err = ext4_ext_journal_restart(handle, credits);
+ if (err)
goto out;
- }
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret;
unsigned long allocated = 0;
struct ext4_allocation_request ar;
+ loff_t disksize;
__clear_bit(BH_New, &bh_result->b_state);
ext_debug("blocks %u/%lu requested for inode %u\n",
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
*/
if (allocated > max_blocks)
allocated = max_blocks;
- /* mark the buffer unwritten */
- __set_bit(BH_Unwritten, &bh_result->b_state);
+ set_buffer_unwritten(bh_result);
goto out2;
}
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
goto out2;
}
- if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = inode->i_size;
-
/* previous routine could use block we allocated */
newblock = ext_pblock(&newex);
allocated = ext4_ext_get_actual_len(&newex);
outnew:
- __set_bit(BH_New, &bh_result->b_state);
+ if (extend_disksize) {
+ disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ }
+
+ set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */
if (create != EXT4_CREATE_UNINITIALIZED_EXT)
@@ -2733,7 +2774,7 @@ out:
if (allocated > max_blocks)
allocated = max_blocks;
ext4_ext_show_leaf(inode, path);
- __set_bit(BH_Mapped, &bh_result->b_state);
+ set_buffer_mapped(bh_result);
bh_result->b_bdev = inode->i_sb->s_bdev;
bh_result->b_blocknr = newblock;
out2:
@@ -2744,7 +2785,7 @@ out2:
return err ? err : allocated;
}
-void ext4_ext_truncate(struct inode * inode, struct page *page)
+void ext4_ext_truncate(struct inode *inode)
{
struct address_space *mapping = inode->i_mapping;
struct super_block *sb = inode->i_sb;
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
*/
err = ext4_writepage_trans_blocks(inode) + 3;
handle = ext4_journal_start(inode, err);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return;
- }
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle, mapping, inode->i_size);
+
+ if (ext4_orphan_add(handle, inode))
+ goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
ext4_ext_invalidate_cache(inode);
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
* Probably we need not scan at all,
* because page truncation is enough.
*/
- if (ext4_orphan_add(handle, inode))
- goto out_stop;
/* we have to know where to truncate from in crash case */
EXT4_I(inode)->i_disksize = inode->i_size;
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
handle->h_sync = 1;
out_stop:
+ up_write(&EXT4_I(inode)->i_data_sem);
/*
* If this was a simple ftruncate() and the file will remain alive,
* then we need to clear up the orphan record which we created above.
@@ -2808,7 +2844,6 @@ out_stop:
if (inode->i_nlink)
ext4_orphan_del(handle, inode);
- up_write(&EXT4_I(inode)->i_data_sem);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
@@ -2911,7 +2946,7 @@ retry:
}
ret = ext4_get_blocks_wrap(handle, inode, block,
max_blocks, &map_bh,
- EXT4_CREATE_UNINITIALIZED_EXT, 0);
+ EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
if (ret <= 0) {
#ifdef EXT4FS_DEBUG
WARN_ON(ret <= 0);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4159be6366ab..430eb7978db4 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -123,6 +123,23 @@ force_commit:
return ret;
}
+static struct vm_operations_struct ext4_file_vm_ops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ext4_page_mkwrite,
+};
+
+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ext4_file_vm_ops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
+
const struct file_operations ext4_file_operations = {
.llseek = generic_file_llseek,
.read = do_sync_read,
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ext4_file_mmap,
.open = generic_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
const struct inode_operations ext4_file_inode_operations = {
.truncate = ext4_truncate,
.setattr = ext4_setattr,
+ .getattr = ext4_getattr,
#ifdef CONFIG_EXT4DEV_FS_XATTR
.setxattr = generic_setxattr,
.getxattr = generic_getxattr,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 1c8ba48d4f8d..a45c3737ad31 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -27,6 +27,7 @@
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/jbd2.h>
+#include <linux/blkdev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
@@ -45,6 +46,7 @@
int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
{
struct inode *inode = dentry->d_inode;
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int ret = 0;
J_ASSERT(ext4_journal_current_handle() == NULL);
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
.nr_to_write = 0, /* sys_fsync did this */
};
ret = sync_inode(inode, &wbc);
+ if (journal && (journal->j_flags & JBD2_BARRIER))
+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
}
out:
return ret;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
index 7eb0604e7eea..c2c0a8d06d0e 100644
--- a/fs/ext4/group.h
+++ b/fs/ext4/group.h
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
struct ext4_group_desc *gdp);
-struct buffer_head *read_block_bitmap(struct super_block *sb,
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
ext4_group_t block_group);
extern unsigned ext4_init_block_bitmap(struct super_block *sb,
struct buffer_head *bh,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index c6efbab0c801..a92eb305344f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
struct ext4_super_block * es;
struct ext4_sb_info *sbi;
int fatal = 0, err;
+ ext4_group_t flex_group;
if (atomic_read(&inode->i_count) > 1) {
printk ("ext4_free_inode: inode has count=%d\n",
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
if (is_directory)
percpu_counter_dec(&sbi->s_dirs_counter);
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes++;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
}
BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
err = ext4_journal_dirty_metadata(handle, bh2);
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
return ret;
}
+#define free_block_ratio 10
+
+static int find_group_flex(struct super_block *sb, struct inode *parent,
+ ext4_group_t *best_group)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *desc;
+ struct buffer_head *bh;
+ struct flex_groups *flex_group = sbi->s_flex_groups;
+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
+ ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
+ ext4_group_t ngroups = sbi->s_groups_count;
+ int flex_size = ext4_flex_bg_size(sbi);
+ ext4_group_t best_flex = parent_fbg_group;
+ int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
+ int flexbg_free_blocks;
+ int flex_freeb_ratio;
+ ext4_group_t n_fbg_groups;
+ ext4_group_t i;
+
+ n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+
+find_close_to_parent:
+ flexbg_free_blocks = flex_group[best_flex].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+ if (flex_group[best_flex].free_inodes &&
+ flex_freeb_ratio > free_block_ratio)
+ goto found_flexbg;
+
+ if (best_flex && best_flex == parent_fbg_group) {
+ best_flex--;
+ goto find_close_to_parent;
+ }
+
+ for (i = 0; i < n_fbg_groups; i++) {
+ if (i == parent_fbg_group || i == parent_fbg_group - 1)
+ continue;
+
+ flexbg_free_blocks = flex_group[i].free_blocks;
+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
+
+ if (flex_freeb_ratio > free_block_ratio &&
+ flex_group[i].free_inodes) {
+ best_flex = i;
+ goto found_flexbg;
+ }
+
+ if (best_flex < 0 ||
+ (flex_group[i].free_blocks >
+ flex_group[best_flex].free_blocks &&
+ flex_group[i].free_inodes))
+ best_flex = i;
+ }
+
+ if (!flex_group[best_flex].free_inodes ||
+ !flex_group[best_flex].free_blocks)
+ return -1;
+
+found_flexbg:
+ for (i = best_flex * flex_size; i < ngroups &&
+ i < (best_flex + 1) * flex_size; i++) {
+ desc = ext4_get_group_desc(sb, i, &bh);
+ if (le16_to_cpu(desc->bg_free_inodes_count)) {
+ *best_group = i;
+ goto out;
+ }
+ }
+
+ return -1;
+out:
+ return 0;
+}
+
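A worked threshold for the search above, under assumed geometry of 16 groups
per flex bg and 32768 blocks per group (blocks_per_flex = 524288):

	/*
	 * flex_freeb_ratio = free * 100 / 524288 (integer division), so the
	 * ratio first exceeds free_block_ratio (10) at free >= 57672 blocks,
	 * where it becomes 11; a flex group qualifies once it reaches that
	 * and still has free inodes, otherwise the fallback is the flex
	 * group with the most free blocks that has free inodes.
	 */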
/*
* Orlov's allocator for directories.
*
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
struct inode *ret;
ext4_group_t i;
int free = 0;
+ ext4_group_t flex_group;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;
+
+ if (sbi->s_log_groups_per_flex) {
+ ret2 = find_group_flex(sb, dir, &group);
+ goto got_group;
+ }
+
if (S_ISDIR(mode)) {
if (test_opt (sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
} else
ret2 = find_group_other(sb, dir, &group);
+got_group:
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -600,7 +689,7 @@ got:
/* We may have to initialize the block bitmap if it isn't already */
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- struct buffer_head *block_bh = read_block_bitmap(sb, group);
+ struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
BUFFER_TRACE(block_bh, "get block bitmap access");
err = ext4_journal_get_write_access(handle, block_bh);
@@ -676,6 +765,13 @@ got:
percpu_counter_inc(&sbi->s_dirs_counter);
sb->s_dirt = 1;
+ if (sbi->s_log_groups_per_flex) {
+ flex_group = ext4_flex_group(sbi, group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_inodes--;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
inode->i_uid = current->fsuid;
if (test_opt (sb, GRPID))
inode->i_gid = dir->i_gid;
@@ -740,14 +836,10 @@ got:
goto fail_free_drop;
if (test_opt(sb, EXTENTS)) {
- /* set extent flag only for diretory, file and normal symlink*/
+ /* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
ext4_ext_tree_init(handle, inode);
- err = ext4_update_incompat_feature(handle, sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS);
- if (err)
- goto fail_free_drop;
}
}
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
if (IS_ERR(inode))
goto iget_failed;
+ /*
+ * If the orphan has i_nlink > 0 then it should be able to be
+ * truncated, otherwise it won't be removed from the orphan list
+ * during processing and an infinite loop will result.
+ */
+ if (inode->i_nlink && !ext4_can_truncate(inode))
+ goto bad_orphan;
+
if (NEXT_ORPHAN(inode) > max_ino)
goto bad_orphan;
brelse(bitmap_bh);
@@ -838,6 +938,7 @@ bad_orphan:
printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
NEXT_ORPHAN(inode));
printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
+ printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
/* Avoid freeing blocks if we got a bad deleted inode */
if (inode->i_nlink == 0)
inode->i_blocks = 0;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d9707746413..8ca2763df091 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -32,12 +32,23 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
+#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
+#include "ext4_extents.h"
+
+static inline int ext4_begin_ordered_truncate(struct inode *inode,
+ loff_t new_size)
+{
+ return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
+ new_size);
+}
+
+static void ext4_invalidatepage(struct page *page, unsigned long offset);
/*
* Test whether an inode is a fast symlink.
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode)
{
handle_t *handle;
+ if (ext4_should_order_data(inode))
+ ext4_begin_ordered_truncate(inode, 0);
truncate_inode_pages(&inode->i_data, 0);
if (is_bad_inode(inode))
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
* direct blocks
*/
static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
- ext4_fsblk_t goal, int indirect_blks, int blks,
- ext4_fsblk_t new_blocks[4], int *err)
+ ext4_lblk_t iblock, ext4_fsblk_t goal,
+ int indirect_blks, int blks,
+ ext4_fsblk_t new_blocks[4], int *err)
{
int target, i;
- unsigned long count = 0;
+ unsigned long count = 0, blk_allocated = 0;
int index = 0;
ext4_fsblk_t current_block = 0;
int ret = 0;
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
* the first direct block of this branch. That's the
* minimum number of blocks need to allocate(required)
*/
- target = blks + indirect_blks;
-
- while (1) {
+ /* first we try to allocate the indirect blocks */
+ target = indirect_blks;
+ while (target > 0) {
count = target;
/* allocating blocks for indirect blocks and direct blocks */
- current_block = ext4_new_blocks(handle,inode,goal,&count,err);
+ current_block = ext4_new_meta_blocks(handle, inode,
+ goal, &count, err);
if (*err)
goto failed_out;
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
new_blocks[index++] = current_block++;
count--;
}
-
- if (count > 0)
+ if (count > 0) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ printk(KERN_INFO "%s returned more blocks than "
+ "requested\n", __func__);
+ WARN_ON(1);
break;
+ }
}
- /* save the new block number for the first direct block */
- new_blocks[index] = current_block;
-
+ target = blks - count;
+ blk_allocated = count;
+ if (!target)
+ goto allocated;
+ /* Now allocate data blocks */
+ count = target;
+ /* allocating blocks for data blocks */
+ current_block = ext4_new_blocks(handle, inode, iblock,
+ goal, &count, err);
+ if (*err && (target == blks)) {
+ /*
+ * if the allocation failed and we didn't allocate
+ * any blocks before, fail the whole request
+ */
+ goto failed_out;
+ }
+ if (!*err) {
+ if (target == blks) {
+ /*
+ * save the new block number
+ * for the first direct block
+ */
+ new_blocks[index] = current_block;
+ }
+ blk_allocated += count;
+ }
+allocated:
/* total number of blocks allocated for direct blocks */
- ret = count;
+ ret = blk_allocated;
*err = 0;
return ret;
failed_out:
@@ -584,8 +631,9 @@ failed_out:
* as described above and return 0.
*/
static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
- int indirect_blks, int *blks, ext4_fsblk_t goal,
- ext4_lblk_t *offsets, Indirect *branch)
+ ext4_lblk_t iblock, int indirect_blks,
+ int *blks, ext4_fsblk_t goal,
+ ext4_lblk_t *offsets, Indirect *branch)
{
int blocksize = inode->i_sb->s_blocksize;
int i, n = 0;
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
ext4_fsblk_t new_blocks[4];
ext4_fsblk_t current_block;
- num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
+ num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
*blks, new_blocks, &err);
if (err)
return err;
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
struct ext4_inode_info *ei = EXT4_I(inode);
int count = 0;
ext4_fsblk_t first_block = 0;
+ loff_t disksize;
J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
/*
* Block out ext4_truncate while we alter the tree
*/
- err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
- offsets + (partial - chain), partial);
+ err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+ &count, goal,
+ offsets + (partial - chain), partial);
/*
* The ext4_splice_branch call will free and forget any buffers
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
* protect it if you're about to implement concurrent
* ext4_get_block() -bzzz
*/
- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
- ei->i_disksize = inode->i_size;
+ if (!err && extend_disksize) {
+ disksize = ((loff_t) iblock + count) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > ei->i_disksize)
+ ei->i_disksize = disksize;
+ }
if (err)
goto cleanup;
@@ -934,7 +989,7 @@ out:
*/
int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
unsigned long max_blocks, struct buffer_head *bh,
- int create, int extend_disksize)
+ int create, int extend_disksize, int flag)
{
int retval;
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
* with create == 1 flag.
*/
down_write((&EXT4_I(inode)->i_data_sem));
+
+ /*
+ * if the caller is from the delayed allocation writeout path,
+ * we have already reserved fs blocks for the allocation;
+ * let the underlying get_block() function know to
+ * avoid double accounting
+ */
+ if (flag)
+ EXT4_I(inode)->i_delalloc_reserved_flag = 1;
/*
* We need to check for EXT4 here because migrate
* could have changed the inode type in between
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
~EXT4_EXT_MIGRATE;
}
}
+
+ if (flag) {
+ EXT4_I(inode)->i_delalloc_reserved_flag = 0;
+ /*
+ * Update reserved blocks/metadata blocks
+ * after successful block allocation
+ * which were deferred till now
+ */
+ if ((retval > 0) && buffer_delay(bh))
+ ext4_da_release_space(inode, retval, 0);
+ }
+
up_write((&EXT4_I(inode)->i_data_sem));
return retval;
}
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
}
ret = ext4_get_blocks_wrap(handle, inode, iblock,
- max_blocks, bh_result, create, 0);
+ max_blocks, bh_result, create, 0, 0);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
ret = 0;
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history);
err = ext4_get_blocks_wrap(handle, inode, block, 1,
- &dummy, create, 1);
+ &dummy, create, 1, 0);
/*
* ext4_get_blocks_handle() returns number of blocks
* mapped. 0 in case of a HOLE.
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
to = from + len;
retry:
- page = __grab_cache_page(mapping, index);
- if (!page)
- return -ENOMEM;
- *pagep = page;
-
handle = ext4_journal_start(inode, needed_blocks);
if (IS_ERR(handle)) {
- unlock_page(page);
- page_cache_release(page);
ret = PTR_ERR(handle);
goto out;
}
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ ext4_journal_stop(handle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ *pagep = page;
+
ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
ext4_get_block);
@@ -1225,8 +1302,8 @@ retry:
}
if (ret) {
- ext4_journal_stop(handle);
unlock_page(page);
+ ext4_journal_stop(handle);
page_cache_release(page);
}
@@ -1236,15 +1313,6 @@ out:
return ret;
}
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- int err = jbd2_journal_dirty_data(handle, bh);
- if (err)
- ext4_journal_abort_handle(__func__, __func__,
- bh, handle, err);
- return err;
-}
-
/* For write_end() in data=journal mode */
static int write_end_fn(handle_t *handle, struct buffer_head *bh)
{
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
}
/*
- * Generic write_end handler for ordered and writeback ext4 journal modes.
- * We can't use generic_write_end, because that unlocks the page and we need to
- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
- * after block_write_end.
- */
-static int ext4_generic_write_end(struct file *file,
- struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- struct inode *inode = file->f_mapping->host;
-
- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
-
- if (pos+copied > inode->i_size) {
- i_size_write(inode, pos+copied);
- mark_inode_dirty(inode);
- }
-
- return copied;
-}
-
-/*
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
unsigned from, to;
int ret = 0, ret2;
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, NULL, ext4_journal_dirty_data);
+ ret = ext4_jbd2_file_inode(handle, inode);
if (ret == 0) {
/*
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file,
new_i_size = pos + copied;
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file,
struct page *page, void *fsdata)
{
handle_t *handle = ext4_journal_current_handle();
- struct inode *inode = file->f_mapping->host;
+ struct inode *inode = mapping->host;
int ret = 0, ret2;
loff_t new_i_size;
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file,
if (new_i_size > EXT4_I(inode)->i_disksize)
EXT4_I(inode)->i_disksize = new_i_size;
- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
page, fsdata);
copied = ret2;
if (ret2 < 0)
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file,
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
- page_cache_release(page);
return ret ? ret : copied;
}
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file,
ret = ret2;
}
+ unlock_page(page);
ret2 = ext4_journal_stop(handle);
if (!ret)
ret = ret2;
- unlock_page(page);
page_cache_release(page);
return ret ? ret : copied;
}
+/*
+ * Calculate the number of metadata blocks needed to reserve
+ * in order to allocate @blocks for a non-extent-based file
+ */
+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
+ int ind_blks, dind_blks, tind_blks;
+
+ /* number of new indirect blocks needed */
+ ind_blks = (blocks + icap - 1) / icap;
+
+ dind_blks = (ind_blks + icap - 1) / icap;
+
+ tind_blks = 1;
+
+ return ind_blks + dind_blks + tind_blks;
+}
+
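A worked example, assuming 4KiB blocks so that icap == EXT4_ADDR_PER_BLOCK
== 1024 (4096 bytes / 4 bytes per block address):

	/*
	 * ext4_indirect_calc_metadata_amount(inode, 2048):
	 *	ind_blks  = (2048 + 1023) / 1024 = 2
	 *	dind_blks = (2 + 1023) / 1024    = 1
	 *	tind_blks = 1 (always reserved pessimistically)
	 * so 4 metadata blocks are reserved for the worst case.
	 */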
+/*
+ * Calculate the number of metadata blocks needed to reserve
+ * in order to allocate the given number of blocks
+ */
+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
+{
+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+ return ext4_ext_calc_metadata_amount(inode, blocks);
+
+ return ext4_indirect_calc_metadata_amount(inode, blocks);
+}
+
+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned long md_needed, mdblocks, total = 0;
+
+ /*
+ * recalculate the number of metadata blocks to reserve
+ * in order to allocate nrblocks;
+ * worst case is one extent per block
+ */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
+ mdblocks = ext4_calc_metadata_amount(inode, total);
+ BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
+
+ md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
+ total = md_needed + nrblocks;
+
+ if (ext4_has_free_blocks(sbi, total) < total) {
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return -ENOSPC;
+ }
+
+ /* reduce fs free blocks counter */
+ percpu_counter_sub(&sbi->s_freeblocks_counter, total);
+
+ EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
+ EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
+
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+ return 0; /* success */
+}
+
+void ext4_da_release_space(struct inode *inode, int used, int to_free)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ int total, mdb, mdb_free, release;
+
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ /* recalculate the number of metadata blocks that still need to be reserved */
+ total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free;
+ mdb = ext4_calc_metadata_amount(inode, total);
+
+ /* figure out how many metablocks to release */
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
+
+ /* Account for allocated meta_blocks */
+ mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+ release = to_free + mdb_free;
+
+ /* update fs free blocks counter for truncate case */
+ percpu_counter_add(&sbi->s_freeblocks_counter, release);
+
+ /* update per-inode reservations */
+ BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks);
+ EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free);
+
+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+ EXT4_I(inode)->i_allocated_meta_blocks = 0;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+}
+
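How the two routines pair up in the delalloc lifecycle, sketched under the
assumption that callers look roughly like the ones later in this patch:

	if (ext4_da_reserve_space(inode, 1))	/* at write_begin time */
		return -ENOSPC;
	/* later, once 'retval' delayed blocks are really allocated: */
	ext4_da_release_space(inode, retval, 0);
	/* or when a page with 'to_release' delayed buffers is invalidated: */
	ext4_da_release_space(inode, 0, to_release);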
+static void ext4_da_page_release_reservation(struct page *page,
+ unsigned long offset)
+{
+ int to_release = 0;
+ struct buffer_head *head, *bh;
+ unsigned int curr_off = 0;
+
+ head = page_buffers(page);
+ bh = head;
+ do {
+ unsigned int next_off = curr_off + bh->b_size;
+
+ if ((offset <= curr_off) && (buffer_delay(bh))) {
+ to_release++;
+ clear_buffer_delay(bh);
+ }
+ curr_off = next_off;
+ } while ((bh = bh->b_this_page) != head);
+ ext4_da_release_space(page->mapping->host, 0, to_release);
+}
+
+/*
+ * Delayed allocation stuff
+ */
+
+struct mpage_da_data {
+ struct inode *inode;
+ struct buffer_head lbh; /* extent of blocks */
+ unsigned long first_page, next_page; /* extent of pages */
+ get_block_t *get_block;
+ struct writeback_control *wbc;
+};
+
+/*
+ * mpage_da_submit_io - walks through the extent of pages and tries to write
+ * them with __mpage_writepage()
+ *
+ * @mpd->inode: inode
+ * @mpd->first_page: first page of the extent
+ * @mpd->next_page: page after the last page of the extent
+ * @mpd->get_block: the filesystem's block mapper function
+ *
+ * By the time mpage_da_submit_io() is called we expect all blocks
+ * to be allocated; this may be wrong if allocation failed.
+ *
+ * As pages are already locked by write_cache_pages(), we can't use it
+ */
+static int mpage_da_submit_io(struct mpage_da_data *mpd)
+{
+ struct address_space *mapping = mpd->inode->i_mapping;
+ struct mpage_data mpd_pp = {
+ .bio = NULL,
+ .last_block_in_bio = 0,
+ .get_block = mpd->get_block,
+ .use_writepage = 1,
+ };
+ int ret = 0, err, nr_pages, i;
+ unsigned long index, end;
+ struct pagevec pvec;
+
+ BUG_ON(mpd->next_page <= mpd->first_page);
+
+ pagevec_init(&pvec, 0);
+ index = mpd->first_page;
+ end = mpd->next_page - 1;
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ err = __mpage_writepage(page, mpd->wbc, &mpd_pp);
+
+ /*
+ * In error case, we have to continue because
+ * remaining pages are still locked
+ * XXX: unlock and re-dirty them?
+ */
+ if (ret == 0)
+ ret = err;
+ }
+ pagevec_release(&pvec);
+ }
+ if (mpd_pp.bio)
+ mpage_bio_submit(WRITE, mpd_pp.bio);
+
+ return ret;
+}
+
+/*
+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
+ *
+ * @mpd->inode - inode to walk through
+ * @exbh->b_blocknr - first block on a disk
+ * @exbh->b_size - amount of space in bytes
+ * @logical - first logical block to start assignment with
+ *
+ * the function goes through all passed space and puts actual disk
+ * block numbers into buffer heads, dropping BH_Delay
+ */
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
+ struct buffer_head *exbh)
+{
+ struct inode *inode = mpd->inode;
+ struct address_space *mapping = inode->i_mapping;
+ int blocks = exbh->b_size >> inode->i_blkbits;
+ sector_t pblock = exbh->b_blocknr, cur_logical;
+ struct buffer_head *head, *bh;
+ unsigned long index, end;
+ struct pagevec pvec;
+ int nr_pages, i;
+
+ index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+ cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ pagevec_init(&pvec, 0);
+
+ while (index <= end) {
+ /* XXX: optimize tail */
+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
+ if (nr_pages == 0)
+ break;
+ for (i = 0; i < nr_pages; i++) {
+ struct page *page = pvec.pages[i];
+
+ index = page->index;
+ if (index > end)
+ break;
+ index++;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(PageWriteback(page));
+ BUG_ON(!page_has_buffers(page));
+
+ bh = page_buffers(page);
+ head = bh;
+
+ /* skip blocks out of the range */
+ do {
+ if (cur_logical >= logical)
+ break;
+ cur_logical++;
+ } while ((bh = bh->b_this_page) != head);
+
+ do {
+ if (cur_logical >= logical + blocks)
+ break;
+ if (buffer_delay(bh)) {
+ bh->b_blocknr = pblock;
+ clear_buffer_delay(bh);
+ } else if (buffer_mapped(bh))
+ BUG_ON(bh->b_blocknr != pblock);
+
+ cur_logical++;
+ pblock++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+ pagevec_release(&pvec);
+ }
+}
+
+
+/*
+ * __unmap_underlying_blocks - just a helper function to unmap
+ * set of blocks described by @bh
+ */
+static inline void __unmap_underlying_blocks(struct inode *inode,
+ struct buffer_head *bh)
+{
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ int blocks, i;
+
+ blocks = bh->b_size >> inode->i_blkbits;
+ for (i = 0; i < blocks; i++)
+ unmap_underlying_metadata(bdev, bh->b_blocknr + i);
+}
+
+/*
+ * mpage_da_map_blocks - go through given space
+ *
+ * @mpd->lbh - bh describing space
+ * @mpd->get_block - the filesystem's block mapper function
+ *
+ * The function skips space we know is already mapped to disk blocks.
+ *
+ * The function ignores errors that ->get_block() returns, so real
+ * error handling is postponed to __mpage_writepage()
+ */
+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
+{
+ struct buffer_head *lbh = &mpd->lbh;
+ int err = 0, remain = lbh->b_size;
+ sector_t next = lbh->b_blocknr;
+ struct buffer_head new;
+
+ /*
+ * We consider only non-mapped and non-allocated blocks
+ */
+ if (buffer_mapped(lbh) && !buffer_delay(lbh))
+ return;
+
+ while (remain) {
+ new.b_state = lbh->b_state;
+ new.b_blocknr = 0;
+ new.b_size = remain;
+ err = mpd->get_block(mpd->inode, next, &new, 1);
+ if (err) {
+ /*
+ * Rather than implement own error handling
+ * here, we just leave remaining blocks
+ * unallocated and try again with ->writepage()
+ */
+ break;
+ }
+ BUG_ON(new.b_size == 0);
+
+ if (buffer_new(&new))
+ __unmap_underlying_blocks(mpd->inode, &new);
+
+ /*
+ * If blocks are delayed marked, we need to
+ * put actual blocknr and drop delayed bit
+ */
+ if (buffer_delay(lbh))
+ mpage_put_bnr_to_bhs(mpd, next, &new);
+
+ /* go for the remaining blocks */
+ next += new.b_size >> mpd->inode->i_blkbits;
+ remain -= new.b_size;
+ }
+}
+
+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay))
+
+/*
+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
+ *
+ * @mpd->lbh - extent of blocks
+ * @logical - logical number of the block in the file
+ * @bh - bh of the block (used to access block's state)
+ *
+ * the function is used to collect contiguous blocks in the same state
+ */
+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
+ sector_t logical, struct buffer_head *bh)
+{
+ struct buffer_head *lbh = &mpd->lbh;
+ sector_t next;
+
+ next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits);
+
+ /*
+ * First block in the extent
+ */
+ if (lbh->b_size == 0) {
+ lbh->b_blocknr = logical;
+ lbh->b_size = bh->b_size;
+ lbh->b_state = bh->b_state & BH_FLAGS;
+ return;
+ }
+
+ /*
+ * Can we merge the block to our big extent?
+ */
+ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+ lbh->b_size += bh->b_size;
+ return;
+ }
+
+ /*
+ * We couldn't merge the block to our extent, so we
+ * need to flush current extent and start new one
+ */
+ mpage_da_map_blocks(mpd);
+
+ /*
+ * Now start a new extent
+ */
+ lbh->b_size = bh->b_size;
+ lbh->b_state = bh->b_state & BH_FLAGS;
+ lbh->b_blocknr = logical;
+}
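
The merge test above reduces to two conditions: the new block must be logically adjacent (logical == next) and carry the same BH_FLAGS state. A minimal userspace model of that predicate (the types here are hypothetical stand-ins, not the kernel structures):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct extent {			/* stand-in for mpd->lbh */
	unsigned long start;	/* first logical block */
	size_t size;		/* bytes collected so far */
	unsigned state;		/* BH_FLAGS subset */
};

static bool can_merge(const struct extent *ext, unsigned long logical,
		      unsigned state, unsigned blkbits)
{
	unsigned long next = ext->start + (ext->size >> blkbits);

	if (ext->size == 0)	/* empty extent: the block starts it */
		return true;
	return logical == next && state == ext->state;
}

int main(void)
{
	struct extent e = { .start = 10, .size = 2048, .state = 0x3 };

	/* block 12 continues a 2-block extent at 10 (1024-byte blocks);
	 * block 13 does not */
	printf("%d %d\n", can_merge(&e, 12, 0x3, 10),
	       can_merge(&e, 13, 0x3, 10));
	return 0;
}
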
+
+/*
+ * __mpage_da_writepage - finds extent of pages and blocks
+ *
+ * @page: page to consider
+ * @wbc: not used, we just follow rules
+ * @data: context
+ *
+ * The function finds extents of pages and scans them for all blocks.
+ */
+static int __mpage_da_writepage(struct page *page,
+ struct writeback_control *wbc, void *data)
+{
+ struct mpage_da_data *mpd = data;
+ struct inode *inode = mpd->inode;
+ struct buffer_head *bh, *head, fake;
+ sector_t logical;
+
+ /*
+ * Can we merge this page to current extent?
+ */
+ if (mpd->next_page != page->index) {
+ /*
+ * Nope, we can't. So, we map non-allocated blocks
+ * and start IO on them using __mpage_writepage()
+ */
+ if (mpd->next_page != mpd->first_page) {
+ mpage_da_map_blocks(mpd);
+ mpage_da_submit_io(mpd);
+ }
+
+ /*
+ * Start next extent of pages ...
+ */
+ mpd->first_page = page->index;
+
+ /*
+ * ... and blocks
+ */
+ mpd->lbh.b_size = 0;
+ mpd->lbh.b_state = 0;
+ mpd->lbh.b_blocknr = 0;
+ }
+
+ mpd->next_page = page->index + 1;
+ logical = (sector_t) page->index <<
+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+ if (!page_has_buffers(page)) {
+ /*
+ * There are no buffer heads attached yet (mmap?),
+ * so we treat the page as full of dirty blocks
+ */
+ bh = &fake;
+ bh->b_size = PAGE_CACHE_SIZE;
+ bh->b_state = 0;
+ set_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ } else {
+ /*
+ * Page with regular buffer heads, just add all dirty ones
+ */
+ head = page_buffers(page);
+ bh = head;
+ do {
+ BUG_ON(buffer_locked(bh));
+ if (buffer_dirty(bh))
+ mpage_add_bh_to_extent(mpd, logical, bh);
+ logical++;
+ } while ((bh = bh->b_this_page) != head);
+ }
+
+ return 0;
+}
+
+/*
+ * mpage_da_writepages - walk the list of dirty pages of the given
+ * address space, allocate non-allocated blocks, map newly-allocated
+ * blocks to existing bhs and issue IO on them
+ *
+ * @mapping: address space structure to write
+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
+ * @get_block: the filesystem's block mapper function.
+ *
+ * This is a library function, which implements the writepages()
+ * address_space_operation.
+ *
+ * In order to avoid duplication of logic that deals with partial pages,
+ * multiple bios per page, etc, we find non-allocated blocks, allocate
+ * them with minimal calls to ->get_block() and re-use __mpage_writepage()
+ *
+ * It's important that we call __mpage_writepage() only once for each
+ * involved page, otherwise we'd have to implement more complicated logic
+ * to deal with pages w/o PG_lock or w/ PG_writeback and so on.
+ *
+ * See comments to mpage_writepages()
+ */
+static int mpage_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc,
+ get_block_t get_block)
+{
+ struct mpage_da_data mpd;
+ int ret;
+
+ if (!get_block)
+ return generic_writepages(mapping, wbc);
+
+ mpd.wbc = wbc;
+ mpd.inode = mapping->host;
+ mpd.lbh.b_size = 0;
+ mpd.lbh.b_state = 0;
+ mpd.lbh.b_blocknr = 0;
+ mpd.first_page = 0;
+ mpd.next_page = 0;
+ mpd.get_block = get_block;
+
+ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
+
+ /*
+ * Handle last extent of pages
+ */
+ if (mpd.next_page != mpd.first_page) {
+ mpage_da_map_blocks(&mpd);
+ mpage_da_submit_io(&mpd);
+ }
+
+ return ret;
+}
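
The shape of this function is a classic collect-then-flush walk: the per-page callback extends or flushes a pending range, and the caller flushes whatever remains once write_cache_pages() returns. A self-contained illustration of the pattern (the names below are illustrative, not the kernel API):

#include <stdio.h>

struct ctx { int first, next; };	/* pending page range */

static int visit(int page_index, struct ctx *c)
{
	if (c->next != page_index) {	/* gap: flush the pending range */
		if (c->next != c->first)
			printf("flush pages %d..%d\n", c->first, c->next - 1);
		c->first = page_index;	/* start a new range */
	}
	c->next = page_index + 1;
	return 0;
}

int main(void)
{
	int dirty[] = { 2, 3, 4, 9, 10 };
	struct ctx c = { 0, 0 };

	for (unsigned i = 0; i < sizeof(dirty) / sizeof(dirty[0]); i++)
		visit(dirty[i], &c);
	if (c.next != c.first)		/* mirror the tail flush above */
		printf("flush pages %d..%d\n", c.first, c.next - 1);
	return 0;
}
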
+
+/*
+ * This is a special callback for ->write_begin() only;
+ * its intention is to return a mapped block or reserve space.
+ */
+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+
+ BUG_ON(create == 0);
+ BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
+ /*
+ * First, we need to know whether the block is already allocated;
+ * preallocated blocks are unmapped but should be treated
+ * the same as allocated blocks.
+ */
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
+ if ((ret == 0) && !buffer_delay(bh_result)) {
+ /* the block isn't (pre)allocated yet, let's reserve space */
+ /*
+ * XXX: __block_prepare_write() unmaps passed block,
+ * is it OK?
+ */
+ ret = ext4_da_reserve_space(inode, 1);
+ if (ret)
+ /* not enough space to reserve */
+ return ret;
+
+ map_bh(bh_result, inode->i_sb, 0);
+ set_buffer_new(bh_result);
+ set_buffer_delay(bh_result);
+ } else if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+
+ return ret;
+}
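
The callback leaves the buffer in one of three states: already mapped (just report the blocks), already carrying a delayed reservation (nothing to do), or in need of a fresh one-block reservation, in which case it is mapped to block 0 with BH_New and BH_Delay set. A compact model of that decision, offered as a sketch of the logic rather than kernel code:

#include <stdio.h>

enum outcome { ALREADY_MAPPED, ALREADY_DELAYED, NEW_RESERVATION };

/* mapped_blocks: blocks the lookup found already allocated
 * delayed: the buffer already carries a reservation */
static enum outcome classify(int mapped_blocks, int delayed)
{
	if (mapped_blocks > 0)
		return ALREADY_MAPPED;		/* just report the mapping */
	if (delayed)
		return ALREADY_DELAYED;		/* reservation already made */
	return NEW_RESERVATION;			/* reserve one block now */
}

int main(void)
{
	printf("%d %d %d\n", classify(2, 0), classify(0, 1), classify(0, 0));
	return 0;
}
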
+#define EXT4_DELALLOC_RSVED 1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ loff_t disksize = EXT4_I(inode)->i_disksize;
+ handle_t *handle = NULL;
+
+ handle = ext4_journal_current_handle();
+ if (!handle) {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, 0, 0, 0);
+ BUG_ON(!ret);
+ } else {
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
+ }
+
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+
+ /*
+ * Update the on-disk size along with the block allocation;
+ * we don't use 'extend_disksize' as the size may change
+ * within an already allocated block -bzzz
+ */
+ disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ /*
+ * XXX: replace with spinlock if seen contended -bzzz
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (disksize > EXT4_I(inode)->i_disksize)
+ EXT4_I(inode)->i_disksize = disksize;
+ up_write(&EXT4_I(inode)->i_data_sem);
+
+ if (EXT4_I(inode)->i_disksize == disksize) {
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
+ }
+ }
+ ret = 0;
+ }
+ return ret;
+}
+
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+ /*
+ * An unmapped buffer is possible for holes.
+ * A delay buffer is possible with delayed allocation.
+ */
+ return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
+}
+
+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret = 0;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+
+ /*
+ * we don't want to do block allocation in writepage
+ * so call ext4_get_blocks_wrap with create = 0
+ */
+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
+ bh_result, 0, 0, 0);
+ if (ret > 0) {
+ bh_result->b_size = (ret << inode->i_blkbits);
+ ret = 0;
+ }
+ return ret;
+}
+
+/*
+ * gets called via ext4_da_writepages after taking the page lock (with a journal handle)
+ * gets called via journal_submit_inode_data_buffers (no journal handle)
+ * gets called via shrink_page_list via pdflush (no journal handle)
+ * or via grab_cache_page when doing write_begin (with a journal handle)
+ */
+static int ext4_da_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ int ret = 0;
+ loff_t size;
+ unsigned long len;
+ struct buffer_head *page_bufs;
+ struct inode *inode = page->mapping->host;
+
+ size = i_size_read(inode);
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ page_bufs = page_buffers(page);
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ /*
+ * We don't want to do block allocation,
+ * so redirty the page and return.
+ * We may reach here when we do a journal commit
+ * via journal_submit_inode_data_buffers.
+ * If we don't have the blocks mapped we just ignore
+ * them. We can also reach here via shrink_page_list.
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * The test for page_has_buffers() is subtle:
+ * We know the page is dirty but it lost its buffers. That
+ * means that at some moment in time after write_begin()/
+ * write_end() was called all buffers were clean and thus
+ * they must have been written at least once. So they are
+ * all mapped and we can happily proceed with mapping them
+ * and writing the page.
+ *
+ * Try to initialize the buffer_heads and check whether
+ * all are mapped and non-delay. We don't want to
+ * do block allocation here.
+ */
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (!ret) {
+ page_bufs = page_buffers(page);
+ /* check whether all are mapped and non-delay */
+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
+ ext4_bh_unmapped_or_delay)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ } else {
+ /*
+ * We can't do block allocation here,
+ * so just redirty the page, unlock it
+ * and return
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ }
+
+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
+ ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+ else
+ ret = block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+
+ return ret;
+}
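
The partial-tail computation at the top of ext4_da_writepage() (repeated in the other writepage variants below) selects a full page unless this is the page containing EOF. The arithmetic in isolation (4096-byte pages assumed):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))
#define PAGE_SHIFT 12

int main(void)
{
	unsigned long long size = 10000;	/* i_size */
	unsigned long index = 2;		/* page being written */
	unsigned long len;

	if (index == size >> PAGE_SHIFT)
		len = size & ~PAGE_MASK;	/* EOF page: 10000 - 8192 = 1808 */
	else
		len = PAGE_SIZE;		/* interior page: whole page */

	printf("write %lu bytes of page %lu\n", len, index);
	return 0;
}
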
+
+/*
+ * For now just follow the DIO way to estimate the max credits
+ * needed to write out EXT4_MAX_WRITEBACK_PAGES.
+ * todo: need to calculate the max credits needed for
+ * extent-based files; currently the DIO credits are based on
+ * the indirect-block mapping.
+ *
+ * Probably should have a generic way to calculate credits
+ * for DIO, writepages, and truncate
+ */
+#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS
+#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS
+
+static int ext4_da_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ handle_t *handle = NULL;
+ int needed_blocks;
+ int ret = 0;
+ long to_write;
+ loff_t range_start = 0;
+
+ /*
+ * No pages to write? This is mainly a kludge to avoid starting
+ * a transaction for special inodes like journal inode on last iput()
+ * because that could violate lock ordering on umount
+ */
+ if (!mapping->nrpages)
+ return 0;
+
+ /*
+ * Estimate the worst-case credits needed to write out
+ * EXT4_MAX_WRITEBACK_PAGES pages
+ */
+ needed_blocks = EXT4_MAX_WRITEBACK_CREDITS;
+
+ to_write = wbc->nr_to_write;
+ if (!wbc->range_cyclic) {
+ /*
+ * If range_cyclic is not set, force range_cont
+ * and save the original range_start
+ */
+ wbc->range_cont = 1;
+ range_start = wbc->range_start;
+ }
+
+ while (!ret && to_write) {
+ /* start a new transaction*/
+ handle = ext4_journal_start(inode, needed_blocks);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_writepages;
+ }
+ if (ext4_should_order_data(inode)) {
+ /*
+ * With ordered mode we need to add
+ * the inode to the journal handle
+ * when we do block allocation.
+ */
+ ret = ext4_jbd2_file_inode(handle, inode);
+ if (ret) {
+ ext4_journal_stop(handle);
+ goto out_writepages;
+ }
+
+ }
+ /*
+ * set the max dirty pages that can be written at a time
+ * to fit into the reserved transaction credits
+ */
+ if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES)
+ wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES;
+
+ to_write -= wbc->nr_to_write;
+ ret = mpage_da_writepages(mapping, wbc,
+ ext4_da_get_block_write);
+ ext4_journal_stop(handle);
+ if (wbc->nr_to_write) {
+ /*
+ * There is no more writeout needed,
+ * or we requested a non-blocking writeout
+ * and found the device congested
+ */
+ to_write += wbc->nr_to_write;
+ break;
+ }
+ wbc->nr_to_write = to_write;
+ }
+
+out_writepages:
+ wbc->nr_to_write = to_write;
+ if (range_start)
+ wbc->range_start = range_start;
+ return ret;
+}
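
The transaction-per-chunk loop above clamps wbc->nr_to_write so that one transaction's reserved credits cover the pages written, and keeps the running remainder in to_write. Stripped of the journaling, the bookkeeping looks roughly like this (MAX_PAGES_PER_TXN is an illustrative stand-in for the credit-derived limit):

#include <stdio.h>

#define MAX_PAGES_PER_TXN 64	/* stand-in for the credit-derived limit */

int main(void)
{
	long to_write = 150;	/* caller's wbc->nr_to_write */

	while (to_write) {
		long chunk = to_write > MAX_PAGES_PER_TXN ?
				MAX_PAGES_PER_TXN : to_write;

		/* ...start a transaction, write 'chunk' pages, stop it... */
		to_write -= chunk;
		printf("transaction wrote %ld pages, %ld left\n",
		       chunk, to_write);
		/* the kernel loop also breaks out early when the lower
		 * layer did not consume its whole allowance */
	}
	return 0;
}
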
+
+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ int ret, retries = 0;
+ struct page *page;
+ pgoff_t index;
+ unsigned from, to;
+ struct inode *inode = mapping->host;
+ handle_t *handle;
+
+ index = pos >> PAGE_CACHE_SHIFT;
+ from = pos & (PAGE_CACHE_SIZE - 1);
+ to = from + len;
+
+retry:
+ /*
+ * With delayed allocation, we don't log the i_disksize update
+ * if there is delayed block allocation. But we still need
+ * to journal the i_disksize update if we write to the end
+ * of a file whose tail buffer is already mapped.
+ */
+ handle = ext4_journal_start(inode, 1);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out;
+ }
+
+ page = __grab_cache_page(mapping, index);
+ if (!page) {
+ ext4_journal_stop(handle);
+ ret = -ENOMEM;
+ goto out;
+ }
+ *pagep = page;
+
+ ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+ ext4_da_get_block_prep);
+ if (ret < 0) {
+ unlock_page(page);
+ ext4_journal_stop(handle);
+ page_cache_release(page);
+ }
+
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+out:
+ return ret;
+}
+
+/*
+ * Check if we should update i_disksize
+ * when writing to the end of the file without requiring block allocation
+ */
+static int ext4_da_should_update_i_disksize(struct page *page,
+ unsigned long offset)
+{
+ struct buffer_head *bh;
+ struct inode *inode = page->mapping->host;
+ unsigned int idx;
+ int i;
+
+ bh = page_buffers(page);
+ idx = offset >> inode->i_blkbits;
+
+ for (i = 0; i < idx; i++)
+ bh = bh->b_this_page;
+
+ if (!buffer_mapped(bh) || (buffer_delay(bh)))
+ return 0;
+ return 1;
+}
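
Finding the buffer covering the end of the write is a shift plus a walk around the page's circular buffer list. The offset-to-index step in isolation (1024-byte blocks assumed):

#include <stdio.h>

int main(void)
{
	unsigned blkbits = 10;		/* assumed 1024-byte blocks */
	unsigned long offset = 3500;	/* last written byte within the page */
	unsigned idx = offset >> blkbits;

	/* advance idx times via bh->b_this_page from the head;
	 * i_disksize is updated only if that buffer is mapped and
	 * not delayed, i.e. the write needed no new allocation */
	printf("buffer index within page: %u\n", idx);	/* 3 */
	return 0;
}
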
+
+static int ext4_da_write_end(struct file *file,
+ struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ int ret = 0, ret2;
+ handle_t *handle = ext4_journal_current_handle();
+ loff_t new_i_size;
+ unsigned long start, end;
+
+ start = pos & (PAGE_CACHE_SIZE - 1);
+ end = start + copied - 1;
+
+ /*
+ * generic_write_end() will run mark_inode_dirty() if i_size
+ * changes. So let's piggyback the i_disksize mark_inode_dirty
+ * into that.
+ */
+
+ new_i_size = pos + copied;
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ if (ext4_da_should_update_i_disksize(page, end)) {
+ down_write(&EXT4_I(inode)->i_data_sem);
+ if (new_i_size > EXT4_I(inode)->i_disksize) {
+ /*
+ * Updating i_disksize when extending file
+ * without needing block allocation
+ */
+ if (ext4_should_order_data(inode))
+ ret = ext4_jbd2_file_inode(handle,
+ inode);
+
+ EXT4_I(inode)->i_disksize = new_i_size;
+ }
+ up_write(&EXT4_I(inode)->i_data_sem);
+ }
+ }
+ ret2 = generic_write_end(file, mapping, pos, len, copied,
+ page, fsdata);
+ copied = ret2;
+ if (ret2 < 0)
+ ret = ret2;
+ ret2 = ext4_journal_stop(handle);
+ if (!ret)
+ ret = ret2;
+
+ return ret ? ret : copied;
+}
+
+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
+{
+ /*
+ * Drop reserved blocks
+ */
+ BUG_ON(!PageLocked(page));
+ if (!page_has_buffers(page))
+ goto out;
+
+ ext4_da_page_release_reservation(page, offset);
+
+out:
+ ext4_invalidatepage(page, offset);
+
+ return;
+}
+
/*
* bmap() is special. It gets used by applications such as lilo and by
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
journal_t *journal;
int err;
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
+ test_opt(inode->i_sb, DELALLOC)) {
+ /*
+ * With delalloc we want to sync the file
+ * so that we can make sure we allocate
+ * blocks for the file
+ */
+ filemap_write_and_wait(mapping);
+ }
+
if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
/*
* This is a REALLY heavyweight approach, but the use of
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
-{
- if (buffer_mapped(bh))
- return ext4_journal_dirty_data(handle, bh);
- return 0;
-}
-
/*
- * Note that we always start a transaction even if we're not journalling
- * data. This is to preserve ordering: any hole instantiation within
- * __block_write_full_page -> ext4_get_block() should be journalled
- * along with the data so we don't crash and then get metadata which
- * refers to old data.
+ * Note that we don't need to start a transaction unless we're journaling data
+ * because we should have holes filled from ext4_page_mkwrite(). We don't even
+ * need to attach the inode to the transaction's list in ordered mode because
+ * if we are writing back data added by write(), the inode is already there and
+ * if we are writing back data modified via mmap(), no one guarantees in which
+ * transaction the data will hit the disk. In case we are journaling data, we
+ * cannot start a transaction directly because transaction start ranks above
+ * the page lock so we have to do some magic.
*
- * In all journalling modes block_write_full_page() will start the I/O.
+ * In all journaling modes block_write_full_page() will start the I/O.
*
* Problem:
*
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
* disastrous. Any write() or metadata operation will sync the fs for
* us.
*
- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
- * we don't need to open a transaction here.
*/
-static int ext4_ordered_writepage(struct page *page,
+static int __ext4_normal_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- struct buffer_head *page_bufs;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
-
- J_ASSERT(PageLocked(page));
-
- /*
- * We give up here if we're reentered, because it might be for a
- * different filesystem.
- */
- if (ext4_journal_current_handle())
- goto out_fail;
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
+ if (test_opt(inode->i_sb, NOBH))
+ return nobh_writepage(page,
+ ext4_normal_get_block_write, wbc);
+ else
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
+}
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_fail;
- }
+static int ext4_normal_writepage(struct page *page,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = page->mapping->host;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, inode->i_sb->s_blocksize,
- (1 << BH_Dirty)|(1 << BH_Uptodate));
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* if the page has buffers they should all be mapped
+ * and allocated. If there are no buffers attached
+ * to the page we know the page is dirty but it lost
+ * its buffers. That means that at some moment in time
+ * after write_begin() / write_end() was called all
+ * buffers were clean and thus they must have been
+ * written at least once. So they are all mapped and we
+ * can happily proceed with mapping them and writing
+ * the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
}
- page_bufs = page_buffers(page);
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bget_one);
-
- ret = block_write_full_page(page, ext4_get_block, wbc);
- /*
- * The page can become unlocked at any point now, and
- * truncate can then come in and change things. So we
- * can't touch *page from now on. But *page_bufs is
- * safe due to elevated refcount.
- */
+ if (!ext4_journal_current_handle())
+ return __ext4_normal_writepage(page, wbc);
- /*
- * And attach them to the current transaction. But only if
- * block_write_full_page() succeeded. Otherwise they are unmapped,
- * and generally junk.
- */
- if (ret == 0) {
- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
- NULL, jbd2_journal_dirty_data_fn);
- if (!ret)
- ret = err;
- }
- walk_page_buffers(handle, page_bufs, 0,
- PAGE_CACHE_SIZE, NULL, bput_one);
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
- return ret;
-
-out_fail:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
- return ret;
+ return 0;
}
-static int ext4_writeback_writepage(struct page *page,
+static int __ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct buffer_head *page_bufs;
handle_t *handle = NULL;
int ret = 0;
int err;
- if (ext4_journal_current_handle())
- goto out_fail;
+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
+ ext4_normal_get_block_write);
+ if (ret != 0)
+ goto out_unlock;
+
+ page_bufs = page_buffers(page);
+ walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
+ bget_one);
+ /* As soon as we unlock the page, it can go away, but we have
+ * references to buffers so we are safe */
+ unlock_page(page);
handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- goto out_fail;
+ goto out;
}
- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
- ret = nobh_writepage(page, ext4_get_block, wbc);
- else
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ ret = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
+ err = walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, write_end_fn);
+ if (ret == 0)
+ ret = err;
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
- return ret;
-out_fail:
- redirty_page_for_writepage(wbc, page);
+ walk_page_buffers(handle, page_bufs, 0,
+ PAGE_CACHE_SIZE, NULL, bput_one);
+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
+ goto out;
+
+out_unlock:
unlock_page(page);
+out:
return ret;
}
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page,
struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
- handle_t *handle = NULL;
- int ret = 0;
- int err;
+ loff_t size = i_size_read(inode);
+ loff_t len;
- if (ext4_journal_current_handle())
- goto no_write;
+ J_ASSERT(PageLocked(page));
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* if the page has buffers they should all be mapped
+ * and allocated. If there are no buffers attached
+ * to the page we know the page is dirty but it lost
+ * its buffers. That means that at some moment in time
+ * after write_begin() / write_end() was called all
+ * buffers were clean and thus they must have been
+ * written at least once. So they are all mapped and we
+ * can happily proceed with mapping them and writing
+ * the page.
+ */
+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped_or_delay));
+ }
- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
+ if (ext4_journal_current_handle())
goto no_write;
- }
- if (!page_has_buffers(page) || PageChecked(page)) {
+ if (PageChecked(page)) {
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
*/
ClearPageChecked(page);
- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
- ext4_get_block);
- if (ret != 0) {
- ext4_journal_stop(handle);
- goto out_unlock;
- }
- ret = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
-
- err = walk_page_buffers(handle, page_buffers(page), 0,
- PAGE_CACHE_SIZE, NULL, write_end_fn);
- if (ret == 0)
- ret = err;
- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
- unlock_page(page);
+ return __ext4_journalled_writepage(page, wbc);
} else {
/*
* It may be a page full of checkpoint-mode buffers. We don't
* really know unless we go poke around in the buffer_heads.
* But block_write_full_page will do the right thing.
*/
- ret = block_write_full_page(page, ext4_get_block, wbc);
+ return block_write_full_page(page,
+ ext4_normal_get_block_write,
+ wbc);
}
- err = ext4_journal_stop(handle);
- if (!ret)
- ret = err;
-out:
- return ret;
-
no_write:
redirty_page_for_writepage(wbc, page);
-out_unlock:
unlock_page(page);
- goto out;
+ return 0;
}
static int ext4_readpage(struct file *file, struct page *page)
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
static const struct address_space_operations ext4_ordered_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_ordered_writepage,
+ .writepage = ext4_normal_writepage,
.sync_page = block_sync_page,
.write_begin = ext4_write_begin,
.write_end = ext4_ordered_write_end,
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = {
static const struct address_space_operations ext4_writeback_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
- .writepage = ext4_writeback_writepage,
+ .writepage = ext4_normal_writepage,
.sync_page = block_sync_page,
.write_begin = ext4_write_begin,
.write_end = ext4_writeback_write_end,
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = {
.releasepage = ext4_releasepage,
};
+static const struct address_space_operations ext4_da_aops = {
+ .readpage = ext4_readpage,
+ .readpages = ext4_readpages,
+ .writepage = ext4_da_writepage,
+ .writepages = ext4_da_writepages,
+ .sync_page = block_sync_page,
+ .write_begin = ext4_da_write_begin,
+ .write_end = ext4_da_write_end,
+ .bmap = ext4_bmap,
+ .invalidatepage = ext4_da_invalidatepage,
+ .releasepage = ext4_releasepage,
+ .direct_IO = ext4_direct_IO,
+ .migratepage = buffer_migrate_page,
+};
+
void ext4_set_aops(struct inode *inode)
{
- if (ext4_should_order_data(inode))
+ if (ext4_should_order_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
+ else if (ext4_should_order_data(inode))
inode->i_mapping->a_ops = &ext4_ordered_aops;
+ else if (ext4_should_writeback_data(inode) &&
+ test_opt(inode->i_sb, DELALLOC))
+ inode->i_mapping->a_ops = &ext4_da_aops;
else if (ext4_should_writeback_data(inode))
inode->i_mapping->a_ops = &ext4_writeback_aops;
else
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode)
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
*/
-int ext4_block_truncate_page(handle_t *handle, struct page *page,
+int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
+ struct page *page;
int err = 0;
+ page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
+ if (!page)
+ return -EINVAL;
+
blocksize = inode->i_sb->s_blocksize;
length = blocksize - (offset & (blocksize - 1));
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
err = ext4_journal_dirty_metadata(handle, bh);
} else {
if (ext4_should_order_data(inode))
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
if (this_bh) {
BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
- ext4_journal_dirty_metadata(handle, this_bh);
+
+ /*
+ * The buffer head should have an attached journal head at this
+ * point. However, if the data is corrupted and an indirect
+ * block pointed to itself, it would have been detached when
+ * the block was cleared. Check for this instead of OOPSing.
+ */
+ if (bh2jh(this_bh))
+ ext4_journal_dirty_metadata(handle, this_bh);
+ else
+ ext4_error(inode->i_sb, __func__,
+ "circular indirect block detected, "
+ "inode=%lu, block=%llu",
+ inode->i_ino,
+ (unsigned long long) this_bh->b_blocknr);
}
}
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
}
}
+int ext4_can_truncate(struct inode *inode)
+{
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return 0;
+ if (S_ISREG(inode->i_mode))
+ return 1;
+ if (S_ISDIR(inode->i_mode))
+ return 1;
+ if (S_ISLNK(inode->i_mode))
+ return !ext4_inode_is_fast_symlink(inode);
+ return 0;
+}
+
/*
* ext4_truncate()
*
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode)
int n;
ext4_lblk_t last_block;
unsigned blocksize = inode->i_sb->s_blocksize;
- struct page *page;
- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
- S_ISLNK(inode->i_mode)))
- return;
- if (ext4_inode_is_fast_symlink(inode))
- return;
- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ if (!ext4_can_truncate(inode))
return;
- /*
- * We have to lock the EOF page here, because lock_page() nests
- * outside jbd2_journal_start().
- */
- if ((inode->i_size & (blocksize - 1)) == 0) {
- /* Block boundary? Nothing to do */
- page = NULL;
- } else {
- page = grab_cache_page(mapping,
- inode->i_size >> PAGE_CACHE_SHIFT);
- if (!page)
- return;
- }
-
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
- ext4_ext_truncate(inode, page);
+ ext4_ext_truncate(inode);
return;
}
handle = start_transaction(inode);
- if (IS_ERR(handle)) {
- if (page) {
- clear_highpage(page);
- flush_dcache_page(page);
- unlock_page(page);
- page_cache_release(page);
- }
+ if (IS_ERR(handle))
return; /* AKPM: return what? */
- }
last_block = (inode->i_size + blocksize-1)
>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
- if (page)
- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
+ if (inode->i_size & (blocksize - 1))
+ if (ext4_block_truncate_page(handle, mapping, inode->i_size))
+ goto out_stop;
n = ext4_block_to_path(inode, last_block, offsets, NULL);
if (n == 0)
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode)
goto out_stop;
/*
+ * From here we block out all ext4_get_block() callers who want to
+ * modify the block allocation tree.
+ */
+ down_write(&ei->i_data_sem);
+ /*
* The orphan list entry will now protect us from any crash which
* occurs before the truncate completes, so it is now safe to propagate
* the new, shorter inode size (held for now in i_size) into the
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode)
*/
ei->i_disksize = inode->i_size;
- /*
- * From here we block out all ext4_get_block() callers who want to
- * modify the block allocation tree.
- */
- down_write(&ei->i_data_sem);
-
if (n == 1) { /* direct blocks */
ext4_free_data(handle, inode, NULL, i_data+offsets[0],
i_data + EXT4_NDIR_BLOCKS);
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait)
* be freed, so we have a strong guarantee that no future commit will
* leave these blocks visible to the user.)
*
- * Called with inode->sem down.
+ * Another thing we have to assure is that if we are in ordered mode
+ * and the inode is still attached to the committing transaction, we
+ * must start writeout of all the dirty pages which are being truncated.
+ * This way we are sure that all the data written in the previous
+ * transaction are already on disk (truncate waits for pages under
+ * writeback).
+ *
+ * Called with inode->i_mutex down.
*/
int ext4_setattr(struct dentry *dentry, struct iattr *attr)
{
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
if (!error)
error = rc;
ext4_journal_stop(handle);
+
+ if (ext4_should_order_data(inode)) {
+ error = ext4_begin_ordered_truncate(inode,
+ attr->ia_size);
+ if (error) {
+ /* Do as much error cleanup as possible */
+ handle = ext4_journal_start(inode, 3);
+ if (IS_ERR(handle)) {
+ ext4_orphan_del(NULL, inode);
+ goto err_out;
+ }
+ ext4_orphan_del(handle, inode);
+ ext4_journal_stop(handle);
+ goto err_out;
+ }
+ }
}
rc = inode_setattr(inode, attr);
@@ -3193,6 +4231,32 @@ err_out:
return error;
}
+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode;
+ unsigned long delalloc_blocks;
+
+ inode = dentry->d_inode;
+ generic_fillattr(inode, stat);
+
+ /*
+ * We can't update i_blocks if the block allocation is delayed,
+ * otherwise in the case of a system crash before the real block
+ * allocation is done we would have i_blocks inconsistent with
+ * the on-disk file blocks.
+ * We always keep i_blocks updated together with the real
+ * allocation. But so as not to confuse userspace, stat
+ * will return the blocks that include the delayed allocation
+ * blocks for this file.
+ */
+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+ delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+
+ stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
+ return 0;
+}
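
Since generic_fillattr() reports st_blocks in 512-byte units, the reserved-but-unallocated blocks have to be converted from filesystem blocks before being added. The conversion in isolation (a 4096-byte block size is assumed):

#include <stdio.h>

int main(void)
{
	unsigned blkbits = 12;			/* assumed 4096-byte fs blocks */
	unsigned long delalloc_blocks = 5;	/* reserved, not yet on disk */
	unsigned long long extra_sectors =
		((unsigned long long)delalloc_blocks << blkbits) >> 9;

	printf("stat adds %llu 512-byte sectors\n", extra_sectors);	/* 40 */
	return 0;
}
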
/*
* How many blocks doth make a writepage()?
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
return err;
}
+
+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
+{
+ return !buffer_mapped(bh);
+}
+
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+{
+ loff_t size;
+ unsigned long len;
+ int ret = -EINVAL;
+ struct file *file = vma->vm_file;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct address_space *mapping = inode->i_mapping;
+
+ /*
+ * Get i_alloc_sem to stop truncates messing with the inode. We cannot
+ * get i_mutex because we are already holding mmap_sem.
+ */
+ down_read(&inode->i_alloc_sem);
+ size = i_size_read(inode);
+ if (page->mapping != mapping || size <= page_offset(page)
+ || !PageUptodate(page)) {
+ /* page got truncated from under us? */
+ goto out_unlock;
+ }
+ ret = 0;
+ if (PageMappedToDisk(page))
+ goto out_unlock;
+
+ if (page->index == size >> PAGE_CACHE_SHIFT)
+ len = size & ~PAGE_CACHE_MASK;
+ else
+ len = PAGE_CACHE_SIZE;
+
+ if (page_has_buffers(page)) {
+ /* return if we have all the buffers mapped */
+ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
+ ext4_bh_unmapped))
+ goto out_unlock;
+ }
+ /*
+ * OK, we need to fill the hole... Do write_begin write_end
+ * to do block allocation/reservation.We are not holding
+ * inode.i__mutex here. That allow * parallel write_begin,
+ * write_end call. lock_page prevent this from happening
+ * on the same page though
+ */
+ ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
+ len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+ if (ret < 0)
+ goto out_unlock;
+ ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
+ len, len, page, NULL);
+ if (ret < 0)
+ goto out_unlock;
+ ret = 0;
+out_unlock:
+ up_read(&inode->i_alloc_sem);
+ return ret;
+}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index fbec2ef93797..8d141a25bbee 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
static inline int mb_find_next_zero_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_zero_bit(addr, max, start) - fix;
+ ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
static inline int mb_find_next_bit(void *addr, int max, int start)
{
- int fix = 0;
+ int fix = 0, ret, tmpmax;
addr = mb_correct_addr_and_bit(&fix, addr);
- max += fix;
+ tmpmax = max + fix;
start += fix;
- return ext4_find_next_bit(addr, max, start) - fix;
+ ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
+ if (ret > max)
+ return max;
+ return ret;
}
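
The clamp added above matters because mb_correct_addr_and_bit() realigns the bitmap pointer and folds the displacement into fix, so the underlying search runs over max + fix bits; a word-granular "not found" answer can then land past the caller's max once fix is subtracted. A userspace model of the overshoot (the word-rounding behaviour is an assumption standing in for the arch bit-search helpers):

#include <stdio.h>

/* model a word-granular search that can overshoot the requested size
 * when the range is not word-aligned (an assumed behaviour standing
 * in for the arch bit-search helpers) */
static int model_find_next_zero_bit(unsigned size, unsigned start)
{
	(void)start;
	return ((size + 31) / 32) * 32;	/* "not found", word-rounded */
}

int main(void)
{
	int max = 100, fix = 3;
	int ret = model_find_next_zero_bit(max + fix, 0) - fix;

	if (ret > max)		/* 125 > 100: normalize to the caller's range */
		ret = max;
	printf("ret = %d\n", ret);
	return 0;
}
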
static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
if (!buffer_uptodate(bh[i]))
goto out;
+ err = 0;
first_block = page->index * blocks_per_page;
for (i = 0; i < blocks_per_page; i++) {
int group;
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
int pnum;
int poff;
struct page *page;
+ int ret;
mb_debug("load group %lu\n", group);
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
if (!PageUptodate(page)) {
- ext4_mb_init_cache(page, NULL);
+ ret = ext4_mb_init_cache(page, NULL);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
mb_cmp_bitmaps(e4b, page_address(page) +
(poff * sb->s_blocksize));
}
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_bitmap_page = page;
e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
if (page) {
BUG_ON(page->mapping != inode->i_mapping);
- if (!PageUptodate(page))
- ext4_mb_init_cache(page, e4b->bd_bitmap);
-
+ if (!PageUptodate(page)) {
+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
+ if (ret) {
+ unlock_page(page);
+ goto err;
+ }
+ }
unlock_page(page);
}
}
- if (page == NULL || !PageUptodate(page))
+ if (page == NULL || !PageUptodate(page)) {
+ ret = -EIO;
goto err;
+ }
e4b->bd_buddy_page = page;
e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
mark_page_accessed(page);
@@ -962,7 +982,7 @@ err:
page_cache_release(e4b->bd_buddy_page);
e4b->bd_buddy = NULL;
e4b->bd_bitmap = NULL;
- return -EIO;
+ return ret;
}
static void ext4_mb_release_desc(struct ext4_buddy *e4b)
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
}
}
-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int first, int count)
{
int block = 0;
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
blocknr += block;
blocknr +=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
-
+ ext4_unlock_group(sb, e4b->bd_group);
ext4_error(sb, __func__, "double-free of inode"
" %lu's block %llu(bit %u in group %lu)\n",
inode ? inode->i_ino : 0, blocknr, block,
e4b->bd_group);
+ ext4_lock_group(sb, e4b->bd_group);
}
mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
e4b->bd_info->bb_counters[order]++;
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
} while (1);
}
mb_check_buddy(e4b);
-
- return 0;
}
static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
spin_unlock(&sbi->s_md_lock);
}
-
- /* searching for the right group start from the goal value specified */
- group = ac->ac_g_ex.fe_group;
-
/* Let's just scan groups to find more-less suitable blocks */
cr = ac->ac_2order ? 0 : 1;
/*
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
repeat:
for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
+ /*
+ * searching for the right group start
+ * from the goal value specified
+ */
+ group = ac->ac_g_ex.fe_group;
+
for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
struct ext4_group_info *grp;
struct ext4_group_desc *desc;
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
int rc;
int size;
+ if (unlikely(sbi->s_mb_history == NULL))
+ return -ENOMEM;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (s == NULL)
return -ENOMEM;
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb)
sbi->s_mb_history_cur = 0;
spin_lock_init(&sbi->s_mb_history_lock);
i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
- sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
- if (likely(sbi->s_mb_history != NULL))
- memset(sbi->s_mb_history, 0, i);
+ sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
/* if we can't allocate history, then we simply won't use it */
}
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
#define ext4_mb_history_init(sb)
#endif
+
+/* Create and initialize ext4_group_info data for the given group. */
+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ int i, len;
+ int metalen = 0;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_info **meta_group_info;
+
+ /*
+ * First check whether this group is the first of a reserved block.
+ * If it is, we have to allocate a new table of pointers
+ * to ext4_group_info structures
+ */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
+ metalen = sizeof(*meta_group_info) <<
+ EXT4_DESC_PER_BLOCK_BITS(sb);
+ meta_group_info = kmalloc(metalen, GFP_KERNEL);
+ if (meta_group_info == NULL) {
+ printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
+ "buddy group\n");
+ goto exit_meta_group_info;
+ }
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
+ meta_group_info;
+ }
+
+ /*
+ * calculate the needed size. if you change the bb_counters size,
+ * don't forget about ext4_mb_generate_buddy()
+ */
+ len = offsetof(typeof(**meta_group_info),
+ bb_counters[sb->s_blocksize_bits + 2]);
+
+ meta_group_info =
+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+ i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
+
+ meta_group_info[i] = kzalloc(len, GFP_KERNEL);
+ if (meta_group_info[i] == NULL) {
+ printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
+ goto exit_group_info;
+ }
+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
+ &(meta_group_info[i]->bb_state));
+
+ /*
+ * initialize bb_free to be able to skip
+ * empty groups without initialization
+ */
+ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ meta_group_info[i]->bb_free =
+ ext4_free_blocks_after_init(sb, group, desc);
+ } else {
+ meta_group_info[i]->bb_free =
+ le16_to_cpu(desc->bg_free_blocks_count);
+ }
+
+ INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
+
+#ifdef DOUBLE_CHECK
+ {
+ struct buffer_head *bh;
+ meta_group_info[i]->bb_bitmap =
+ kmalloc(sb->s_blocksize, GFP_KERNEL);
+ BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
+ bh = ext4_read_block_bitmap(sb, group);
+ BUG_ON(bh == NULL);
+ memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
+ sb->s_blocksize);
+ put_bh(bh);
+ }
+#endif
+
+ return 0;
+
+exit_group_info:
+ /* If a meta_group_info table has been allocated, release it now */
+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
+ kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
+exit_meta_group_info:
+ return -ENOMEM;
+} /* ext4_mb_add_groupinfo */
+
+/*
+ * Add a group to the existing groups.
+ * This function is used for online resize
+ */
+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
+ struct ext4_group_desc *desc)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+ int err;
+
+ /* Add group based on group descriptor*/
+ err = ext4_mb_add_groupinfo(sb, group, desc);
+ if (err)
+ return err;
+
+ /*
+ * Mark the cached pages containing dynamic mb_alloc data (buddy
+ * and bitmap) as not up to date so that they will be re-initialized
+ * during the next call to ext4_mb_load_buddy
+ */
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ return 0;
+}
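
Each group owns two consecutive blocks in the buddy cache inode, so invalidating a group is a matter of locating the page holding each block. The arithmetic in isolation (4096-byte pages and blocks assumed, i.e. one block per page):

#include <stdio.h>

int main(void)
{
	unsigned blocks_per_page = 4096 / 4096;	/* PAGE_CACHE_SIZE / blocksize */
	unsigned long group = 37;
	unsigned long block = group * 2;	/* buddy data, then bitmap */

	printf("buddy page %lu, bitmap page %lu\n",
	       block / blocks_per_page, (block + 1) / blocks_per_page);
	return 0;
}
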
+
+/*
+ * Update an existing group.
+ * This function is used for online resize
+ */
+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
+{
+ grp->bb_free += add;
+}
+
static int ext4_mb_init_backend(struct super_block *sb)
{
ext4_group_t i;
- int j, len, metalen;
+ int metalen;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- int num_meta_group_infos =
- (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
- EXT4_DESC_PER_BLOCK_BITS(sb);
+ struct ext4_super_block *es = sbi->s_es;
+ int num_meta_group_infos;
+ int num_meta_group_infos_max;
+ int array_size;
struct ext4_group_info **meta_group_info;
+ struct ext4_group_desc *desc;
+
+ /* This is the number of blocks used by GDT */
+ num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+ 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
+ /*
+ * This is the total number of blocks used by GDT including
+ * the number of reserved blocks for GDT.
+ * The s_group_info array is allocated with this value
+ * to allow a clean online resize without complex
+ * manipulation of pointers.
+ * The drawback is the unused memory when no resize
+ * occurs, but it's very low in terms of pages
+ * (see comments below)
+ * Need to handle this properly when META_BG resizing is allowed
+ */
+ num_meta_group_infos_max = num_meta_group_infos +
+ le16_to_cpu(es->s_reserved_gdt_blocks);
+
+ /*
+ * array_size is the size of the s_group_info array. We round it
+ * to the next power of two because this approximation is done
+ * internally by kmalloc so we can have some more memory
+ * for free here (e.g. it may be used for a META_BG resize).
+ */
+ array_size = 1;
+ while (array_size < sizeof(*sbi->s_group_info) *
+ num_meta_group_infos_max)
+ array_size = array_size << 1;
/* An 8TB filesystem with 64-bit pointers requires a 4096 byte
* kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
* So a two level scheme suffices for now. */
- sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
- num_meta_group_infos, GFP_KERNEL);
+ sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
if (sbi->s_group_info == NULL) {
printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
return -ENOMEM;
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
sbi->s_group_info[i] = meta_group_info;
}
- /*
- * calculate needed size. if change bb_counters size,
- * don't forget about ext4_mb_generate_buddy()
- */
- len = sizeof(struct ext4_group_info);
- len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
for (i = 0; i < sbi->s_groups_count; i++) {
- struct ext4_group_desc *desc;
-
- meta_group_info =
- sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
- j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
-
- meta_group_info[j] = kzalloc(len, GFP_KERNEL);
- if (meta_group_info[j] == NULL) {
- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
- goto err_freebuddy;
- }
desc = ext4_get_group_desc(sb, i, NULL);
if (desc == NULL) {
printk(KERN_ERR
"EXT4-fs: can't read descriptor %lu\n", i);
- i++;
goto err_freebuddy;
}
- memset(meta_group_info[j], 0, len);
- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
- &(meta_group_info[j]->bb_state));
-
- /*
- * initialize bb_free to be able to skip
- * empty groups without initialization
- */
- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- meta_group_info[j]->bb_free =
- ext4_free_blocks_after_init(sb, i, desc);
- } else {
- meta_group_info[j]->bb_free =
- le16_to_cpu(desc->bg_free_blocks_count);
- }
-
- INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
-
-#ifdef DOUBLE_CHECK
- {
- struct buffer_head *bh;
- meta_group_info[j]->bb_bitmap =
- kmalloc(sb->s_blocksize, GFP_KERNEL);
- BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
- bh = read_block_bitmap(sb, i);
- BUG_ON(bh == NULL);
- memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
- sb->s_blocksize);
- put_bh(bh);
- }
-#endif
-
+ if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
+ goto err_freebuddy;
}
return 0;
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
unsigned i;
unsigned offset;
unsigned max;
+ int ret;
if (!test_opt(sb, MBALLOC))
return 0;
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
} while (i <= sb->s_blocksize_bits + 1);
/* init file for buddy data */
- i = ext4_mb_init_backend(sb);
- if (i) {
+ ret = ext4_mb_init_backend(sb);
+ if (ret != 0) {
clear_opt(sbi->s_mount_opt, MBALLOC);
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
- return i;
+ return ret;
}
spin_lock_init(&sbi->s_md_lock);
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
ext4_lock_group(sb, md->group);
for (i = 0; i < md->num; i++) {
mb_debug(" %u", md->blocks[i]);
- err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
- BUG_ON(err != 0);
+ mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
}
mb_debug("\n");
ext4_unlock_group(sb, md->group);
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
-#define MB_PROC_VALUE_READ(name) \
-static int ext4_mb_read_##name(char *page, char **start, \
- off_t off, int count, int *eof, void *data) \
+#define MB_PROC_FOPS(name) \
+static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
{ \
- struct ext4_sb_info *sbi = data; \
- int len; \
- *eof = 1; \
- if (off != 0) \
- return 0; \
- len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
- *start = page; \
- return len; \
-}
-
-#define MB_PROC_VALUE_WRITE(name) \
-static int ext4_mb_write_##name(struct file *file, \
- const char __user *buf, unsigned long cnt, void *data) \
+ struct ext4_sb_info *sbi = m->private; \
+ \
+ seq_printf(m, "%ld\n", sbi->s_mb_##name); \
+ return 0; \
+} \
+ \
+static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
{ \
- struct ext4_sb_info *sbi = data; \
+ return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
+} \
+ \
+static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
+ const char __user *buf, size_t cnt, loff_t *ppos) \
+{ \
+ struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
char str[32]; \
long value; \
if (cnt >= sizeof(str)) \
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file, \
return -ERANGE; \
sbi->s_mb_##name = value; \
return cnt; \
-}
+} \
+ \
+static const struct file_operations ext4_mb_##name##_proc_fops = { \
+ .owner = THIS_MODULE, \
+ .open = ext4_mb_##name##_proc_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+ .write = ext4_mb_##name##_proc_write, \
+};
-MB_PROC_VALUE_READ(stats);
-MB_PROC_VALUE_WRITE(stats);
-MB_PROC_VALUE_READ(max_to_scan);
-MB_PROC_VALUE_WRITE(max_to_scan);
-MB_PROC_VALUE_READ(min_to_scan);
-MB_PROC_VALUE_WRITE(min_to_scan);
-MB_PROC_VALUE_READ(order2_reqs);
-MB_PROC_VALUE_WRITE(order2_reqs);
-MB_PROC_VALUE_READ(stream_request);
-MB_PROC_VALUE_WRITE(stream_request);
-MB_PROC_VALUE_READ(group_prealloc);
-MB_PROC_VALUE_WRITE(group_prealloc);
+MB_PROC_FOPS(stats);
+MB_PROC_FOPS(max_to_scan);
+MB_PROC_FOPS(min_to_scan);
+MB_PROC_FOPS(order2_reqs);
+MB_PROC_FOPS(stream_request);
+MB_PROC_FOPS(group_prealloc);
#define MB_PROC_HANDLER(name, var) \
do { \
- proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
+ proc = proc_create_data(name, mode, sbi->s_mb_proc, \
+ &ext4_mb_##var##_proc_fops, sbi); \
if (proc == NULL) { \
printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \
goto err_out; \
} \
- proc->data = sbi; \
- proc->read_proc = ext4_mb_read_##var ; \
- proc->write_proc = ext4_mb_write_##var; \
} while (0)
static int ext4_mb_init_per_dev_proc(struct super_block *sb)
@@ -2639,8 +2783,11 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
struct proc_dir_entry *proc;
char devname[64];
- snprintf(devname, sizeof(devname) - 1, "%s",
- bdevname(sb->s_bdev, devname));
+ if (proc_root_ext4 == NULL) {
+ sbi->s_mb_proc = NULL;
+ return -EINVAL;
+ }
+ bdevname(sb->s_bdev, devname);
sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats);
@@ -2674,8 +2821,7 @@ static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
if (sbi->s_mb_proc == NULL)
return -EINVAL;
- snprintf(devname, sizeof(devname) - 1, "%s",
- bdevname(sb->s_bdev, devname));
+ bdevname(sb->s_bdev, devname);
remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc);
remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc);
remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc);
@@ -2738,7 +2884,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block;
- int err;
+ int err, len;
BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(ac->ac_b_ex.fe_len <= 0);
@@ -2747,11 +2893,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
sbi = EXT4_SB(sb);
es = sbi->s_es;
- ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
- gdp->bg_free_blocks_count);
err = -EIO;
- bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
if (!bitmap_bh)
goto out_err;
@@ -2764,6 +2908,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (!gdp)
goto out_err;
+ ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group,
+ gdp->bg_free_blocks_count);
+
err = ext4_journal_get_write_access(handle, gdp_bh);
if (err)
goto out_err;
@@ -2772,14 +2919,27 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
+ ac->ac_b_ex.fe_start
+ le32_to_cpu(es->s_first_data_block);
- if (block == ext4_block_bitmap(sb, gdp) ||
- block == ext4_inode_bitmap(sb, gdp) ||
- in_range(block, ext4_inode_table(sb, gdp),
- EXT4_SB(sb)->s_itb_per_group)) {
-
+ len = ac->ac_b_ex.fe_len;
+ if (in_range(ext4_block_bitmap(sb, gdp), block, len) ||
+ in_range(ext4_inode_bitmap(sb, gdp), block, len) ||
+ in_range(block, ext4_inode_table(sb, gdp),
+ EXT4_SB(sb)->s_itb_per_group) ||
+ in_range(block + len - 1, ext4_inode_table(sb, gdp),
+ EXT4_SB(sb)->s_itb_per_group)) {
ext4_error(sb, __func__,
"Allocating block in system zone - block = %llu",
block);
+ /* The filesystem is mounted not to panic on error:
+ * fix the bitmap and repeat the block allocation.
+ * We leak some of the blocks here.
+ */
+ mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
+ bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len);
+ err = ext4_journal_dirty_metadata(handle, bitmap_bh);
+ if (!err)
+ err = -EAGAIN;
+ goto out_err;
}
#ifdef AGGRESSIVE_CHECK
{
@@ -2804,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
+
+ /*
+ * The free blocks count has already been reduced/reserved
+ * at write_begin() time for delayed allocation;
+ * do not account for it twice
+ */
+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
+ percpu_counter_sub(&sbi->s_freeblocks_counter,
+ ac->ac_b_ex.fe_len);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi,
+ ac->ac_b_ex.fe_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
err = ext4_journal_dirty_metadata(handle, bitmap_bh);
if (err)
@@ -2882,12 +3058,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
if (size < i_size_read(ac->ac_inode))
size = i_size_read(ac->ac_inode);
- /* max available blocks in a free group */
- max = EXT4_BLOCKS_PER_GROUP(ac->ac_sb) - 1 - 1 -
- EXT4_SB(ac->ac_sb)->s_itb_per_group;
+ /* max size of free chunks */
+ max = 2 << bsbits;
-#define NRL_CHECK_SIZE(req, size, max,bits) \
- (req <= (size) || max <= ((size) >> bits))
+#define NRL_CHECK_SIZE(req, size, max, chunk_size) \
+ (req <= (size) || max <= (chunk_size))
/* first, try to predict filesize */
/* XXX: should this table be tunable? */
@@ -2906,16 +3081,16 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
size = 512 * 1024;
} else if (size <= 1024 * 1024) {
size = 1024 * 1024;
- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, bsbits)) {
+ } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
- (20 - bsbits)) << 20;
- size = 1024 * 1024;
- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, bsbits)) {
+ (21 - bsbits)) << 21;
+ size = 2 * 1024 * 1024;
+ } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
(22 - bsbits)) << 22;
size = 4 * 1024 * 1024;
} else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
- (8<<20)>>bsbits, max, bsbits)) {
+ (8<<20)>>bsbits, max, 8 * 1024)) {
start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
(23 - bsbits)) << 23;
size = 8 * 1024 * 1024;
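To make the reworked macro concrete, here is a hypothetical expansion for a 4 KiB block size (bsbits = 12, so max = 2 << 12 = 8192 blocks):

/* 4 MiB branch: NRL_CHECK_SIZE(size, 4 * 1024 * 1024, 8192, 2 * 1024)
 *   -> (size <= 4194304 || 8192 <= 2048): only the size test can fire.
 * 8 MiB branch: NRL_CHECK_SIZE(fe_len, (8 << 20) >> 12, 8192, 8 * 1024)
 *   -> (fe_len <= 2048 || 8192 <= 8192): the chunk test always holds,
 *      so any request not matched earlier normalizes to 8 MiB here.
 */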
@@ -3084,8 +3259,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
struct ext4_prealloc_space *pa)
{
- unsigned len = ac->ac_o_ex.fe_len;
-
+ unsigned int len = ac->ac_o_ex.fe_len;
ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
&ac->ac_b_ex.fe_group,
&ac->ac_b_ex.fe_start);
@@ -3463,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
if (bit >= end)
break;
next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
- if (next > end)
- next = end;
start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
le32_to_cpu(sbi->s_es->s_first_data_block);
mb_debug(" free preallocated %u/%u in group %u\n",
@@ -3559,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
if (list_empty(&grp->bb_prealloc_list))
return 0;
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
/* error handling here */
ext4_mb_release_desc(&e4b);
@@ -3733,7 +3905,7 @@ repeat:
err = ext4_mb_load_buddy(sb, group, &e4b);
BUG_ON(err != 0); /* error handling here */
- bitmap_bh = read_block_bitmap(sb, group);
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
if (bitmap_bh == NULL) {
/* error handling here */
ext4_mb_release_desc(&e4b);
@@ -4001,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
sbi = EXT4_SB(sb);
if (!test_opt(sb, MBALLOC)) {
- block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
+ block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
&(ar->len), errp);
return block;
}
+ if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
+ /*
+ * With delalloc we already reserved the blocks
+ */
+ ar->len = ext4_has_free_blocks(sbi, ar->len);
+ }
+
+ if (ar->len == 0) {
+ *errp = -ENOSPC;
+ return 0;
+ }
while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
ar->flags |= EXT4_MB_HINT_NOPREALLOC;
@@ -4016,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
}
inquota = ar->len;
+ if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
+ ar->flags |= EXT4_MB_DELALLOC_RESERVED;
+
ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
+ ar->len = 0;
*errp = -ENOMEM;
- return 0;
+ goto out1;
}
ext4_mb_poll_new_transaction(sb, handle);
@@ -4027,15 +4214,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
- goto out;
+ goto out2;
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
if (!ext4_mb_use_preallocated(ac)) {
-
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar);
-
repeat:
/* allocate space in core */
ext4_mb_regular_allocator(ac);
@@ -4049,10 +4234,21 @@ repeat:
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
- ext4_mb_mark_diskspace_used(ac, handle);
- *errp = 0;
- block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
- ar->len = ac->ac_b_ex.fe_len;
+ *errp = ext4_mb_mark_diskspace_used(ac, handle);
+ if (*errp == -EAGAIN) {
+ ac->ac_b_ex.fe_group = 0;
+ ac->ac_b_ex.fe_start = 0;
+ ac->ac_b_ex.fe_len = 0;
+ ac->ac_status = AC_STATUS_CONTINUE;
+ goto repeat;
+ } else if (*errp) {
+ ac->ac_b_ex.fe_len = 0;
+ ar->len = 0;
+ ext4_mb_show_ac(ac);
+ } else {
+ block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
+ ar->len = ac->ac_b_ex.fe_len;
+ }
} else {
freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
if (freed)
@@ -4065,11 +4261,12 @@ repeat:
ext4_mb_release_context(ac);
-out:
+out2:
+ kmem_cache_free(ext4_ac_cachep, ac);
+out1:
if (ar->len < inquota)
DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
- kmem_cache_free(ext4_ac_cachep, ac);
return block;
}
static void ext4_mb_poll_new_transaction(struct super_block *sb,
@@ -4222,7 +4419,7 @@ do_more:
overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
count -= overflow;
}
- bitmap_bh = read_block_bitmap(sb, block_group);
+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
if (!bitmap_bh)
goto error_return;
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4239,6 +4436,8 @@ do_more:
ext4_error(sb, __func__,
"Freeing blocks in system zone - "
"Block = %lu, count = %lu", block, count);
+		/* err = 0; ext4_std_error() should be a no-op */
+ goto error_return;
}
BUFFER_TRACE(bitmap_bh, "getting write access");
@@ -4287,10 +4486,9 @@ do_more:
ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
} else {
ext4_lock_group(sb, block_group);
- err = mb_free_blocks(inode, &e4b, bit, count);
+ mb_free_blocks(inode, &e4b, bit, count);
ext4_mb_return_to_preallocation(inode, &e4b, block, count);
ext4_unlock_group(sb, block_group);
- BUG_ON(err != 0);
}
spin_lock(sb_bgl_lock(sbi, block_group));
@@ -4299,6 +4497,13 @@ do_more:
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_add(&sbi->s_freeblocks_counter, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+ spin_lock(sb_bgl_lock(sbi, flex_group));
+ sbi->s_flex_groups[flex_group].free_blocks += count;
+ spin_unlock(sb_bgl_lock(sbi, flex_group));
+ }
+
ext4_mb_release_desc(&e4b);
*freed += count;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index ab16beaa830d..387ad98350c3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
/*
+ * p is at least 6 bytes before the end of the page
+ */
+static inline struct ext4_dir_entry_2 *
+ext4_next_entry(struct ext4_dir_entry_2 *p)
+{
+ return (struct ext4_dir_entry_2 *)((char *)p +
+ ext4_rec_len_from_disk(p->rec_len));
+}
+
+/*
* Future: use high four bits of block for coalesce-on-delete flags
* Mask them off for now.
*/
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
EXT4_DIR_REC_LEN(2) - infosize;
- return 0? 20: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
static inline unsigned dx_node_limit (struct inode *dir)
{
unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
- return 0? 22: entry_space / sizeof(struct dx_entry);
+ return entry_space / sizeof(struct dx_entry);
}
/*
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
/*
- * p is at least 6 bytes before the end of page
- */
-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
-{
- return (struct ext4_dir_entry_2 *)((char *)p +
- ext4_rec_len_from_disk(p->rec_len));
-}
-
-/*
* This function fills a red-black tree with information from a
* directory block. It returns the number of directory entries loaded
* into the tree. If there is an error it is returned in err.
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de))
- if (ext4_match (namelen, name, de)) {
- if (!ext4_check_dir_entry("ext4_find_entry",
- dir, de, bh,
- (block<<EXT4_BLOCK_SIZE_BITS(sb))
- +((char *)de - bh->b_data))) {
- brelse (bh);
+ for (; de < top; de = ext4_next_entry(de)) {
+ int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
+ + ((char *) de - bh->b_data);
+
+ if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
+ brelse(bh);
*err = ERR_BAD_DX_DIR;
goto errout;
}
- *res_dir = de;
- dx_release (frames);
- return bh;
+
+ if (ext4_match(namelen, name, de)) {
+ *res_dir = de;
+ dx_release(frames);
+ return bh;
+ }
}
brelse (bh);
/* Check to see if we should continue to search */
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 9f086a6a472b..f000fbe2cd93 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -563,7 +563,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
}
blk = EXT4_SB(sb)->s_sbh->b_blocknr + 1 + EXT4_SB(sb)->s_gdb_count;
- data = (__le32 *)dind->b_data + EXT4_SB(sb)->s_gdb_count;
+ data = (__le32 *)dind->b_data + (EXT4_SB(sb)->s_gdb_count %
+ EXT4_ADDR_PER_BLOCK(sb));
end = (__le32 *)dind->b_data + EXT4_ADDR_PER_BLOCK(sb);
/* Get each reserved primary GDT block and verify it holds backups */
@@ -854,7 +855,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
*/
/* Update group descriptor block for new group */
- gdp = (struct ext4_group_desc *)primary->b_data + gdb_off;
+ gdp = (struct ext4_group_desc *)((char *)primary->b_data +
+ gdb_off * EXT4_DESC_SIZE(sb));
ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */
ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */
@@ -864,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
/*
+ * We can allocate memory for mb_alloc based on the new group
+ * descriptor
+ */
+ if (test_opt(sb, MBALLOC)) {
+ err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
+ if (err)
+ goto exit_journal;
+ }
+ /*
* Make the new blocks and inodes valid next. We do this before
* increasing the group count so that once the group is enabled,
* all of its blocks and inodes are already valid.
@@ -955,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
handle_t *handle;
int err;
unsigned long freed_blocks;
+ ext4_group_t group;
+ struct ext4_group_info *grp;
/* We don't need to worry about locking wrt other resizers just
* yet: we're going to revalidate es->s_blocks_count after
@@ -986,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
}
/* Handle the remaining blocks in the last group only. */
- ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
if (last == 0) {
ext4_warning(sb, __func__,
@@ -1058,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
o_blocks_count + add);
if ((err = ext4_journal_stop(handle)))
goto exit_put;
+
+ /*
+ * Mark mballoc pages as not up to date so that they will be updated
+ * next time they are loaded by ext4_mb_load_buddy.
+ */
+ if (test_opt(sb, MBALLOC)) {
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct inode *inode = sbi->s_buddy_cache;
+ int blocks_per_page;
+ int block;
+ int pnum;
+ struct page *page;
+
+ /* Set buddy page as not up to date */
+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
+ block = group * 2;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Set bitmap page as not up to date */
+ block++;
+ pnum = block / blocks_per_page;
+ page = find_get_page(inode->i_mapping, pnum);
+ if (page != NULL) {
+ ClearPageUptodate(page);
+ page_cache_release(page);
+ }
+
+ /* Get the info on the last group */
+ grp = ext4_get_group_info(sb, group);
+
+ /* Update free blocks in group info */
+ ext4_mb_update_group_info(grp, add);
+ }
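For clarity, the page-index arithmetic above, worked through for two hypothetical geometries:

/* blocks_per_page = PAGE_CACHE_SIZE / blocksize; buddy data for group g
 * sits at logical block 2*g, its bitmap at block 2*g + 1.
 * 4 KiB pages, 4 KiB blocks (blocks_per_page = 1):
 *   group g -> buddy page 2*g, bitmap page 2*g + 1.
 * 4 KiB pages, 1 KiB blocks (blocks_per_page = 4), g = 10:
 *   buddy block 20 -> page 5, bitmap block 21 -> page 5
 *   (both land on the same page, which is simply cleared twice).
 */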
+
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
ext4_blocks_count(es));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 52dd0679a4e2..1cb371dcd609 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb)
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
if (!(sb->s_flags & MS_RDONLY)) {
EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
+ kfree(sbi->s_flex_groups);
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
+ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
+ ei->i_reserved_data_blocks = 0;
+ ei->i_reserved_meta_blocks = 0;
+ ei->i_allocated_meta_blocks = 0;
+ ei->i_delalloc_reserved_flag = 0;
+ spin_lock_init(&(ei->i_block_reservation_lock));
return &ei->vfs_inode;
}
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode)
EXT4_I(inode)->i_block_alloc_info = NULL;
if (unlikely(rsv))
kfree(rsv);
+ jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
+ &EXT4_I(inode)->jinode);
}
static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
@@ -729,8 +739,15 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_printf(seq, ",commit=%u",
(unsigned) (sbi->s_commit_interval / HZ));
}
- if (test_opt(sb, BARRIER))
- seq_puts(seq, ",barrier=1");
+	/*
+	 * We're changing the default of the barrier mount option, so
+	 * always display its state to make the current setting clear.
+	 */
+ seq_puts(seq, ",barrier=");
+ seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+ seq_puts(seq, ",journal_async_commit");
if (test_opt(sb, NOBH))
seq_puts(seq, ",nobh");
if (!test_opt(sb, EXTENTS))
@@ -739,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",nomballoc");
if (test_opt(sb, I_VERSION))
seq_puts(seq, ",i_version");
+ if (!test_opt(sb, DELALLOC))
+ seq_puts(seq, ",nodelalloc");
+
if (sbi->s_stripe)
seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
@@ -886,7 +906,7 @@ enum {
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
- Opt_mballoc, Opt_nomballoc, Opt_stripe,
+ Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
};
static match_table_t tokens = {
@@ -945,6 +965,8 @@ static match_table_t tokens = {
{Opt_nomballoc, "nomballoc"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
+ {Opt_delalloc, "delalloc"},
+ {Opt_nodelalloc, "nodelalloc"},
{Opt_err, NULL},
};
@@ -979,9 +1001,10 @@ static int parse_options (char *options, struct super_block *sb,
int data_opt = 0;
int option;
#ifdef CONFIG_QUOTA
- int qtype;
+ int qtype, qfmt;
char *qname;
#endif
+ ext4_fsblk_t last_block;
if (!options)
return 1;
@@ -1162,9 +1185,11 @@ static int parse_options (char *options, struct super_block *sb,
case Opt_grpjquota:
qtype = GRPQUOTA;
set_qf_name:
- if (sb_any_quota_enabled(sb)) {
+ if ((sb_any_quota_enabled(sb) ||
+ sb_any_quota_suspended(sb)) &&
+ !sbi->s_qf_names[qtype]) {
printk(KERN_ERR
- "EXT4-fs: Cannot change journalled "
+ "EXT4-fs: Cannot change journaled "
"quota options when quota turned on.\n");
return 0;
}
@@ -1200,9 +1225,11 @@ set_qf_name:
case Opt_offgrpjquota:
qtype = GRPQUOTA;
clear_qf_name:
- if (sb_any_quota_enabled(sb)) {
+ if ((sb_any_quota_enabled(sb) ||
+ sb_any_quota_suspended(sb)) &&
+ sbi->s_qf_names[qtype]) {
printk(KERN_ERR "EXT4-fs: Cannot change "
- "journalled quota options when "
+ "journaled quota options when "
"quota turned on.\n");
return 0;
}
@@ -1213,10 +1240,20 @@ clear_qf_name:
sbi->s_qf_names[qtype] = NULL;
break;
case Opt_jqfmt_vfsold:
- sbi->s_jquota_fmt = QFMT_VFS_OLD;
- break;
+ qfmt = QFMT_VFS_OLD;
+ goto set_qf_format;
case Opt_jqfmt_vfsv0:
- sbi->s_jquota_fmt = QFMT_VFS_V0;
+ qfmt = QFMT_VFS_V0;
+set_qf_format:
+ if ((sb_any_quota_enabled(sb) ||
+ sb_any_quota_suspended(sb)) &&
+ sbi->s_jquota_fmt != qfmt) {
+ printk(KERN_ERR "EXT4-fs: Cannot change "
+ "journaled quota options when "
+ "quota turned on.\n");
+ return 0;
+ }
+ sbi->s_jquota_fmt = qfmt;
break;
case Opt_quota:
case Opt_usrquota:
@@ -1241,6 +1278,9 @@ clear_qf_name:
case Opt_quota:
case Opt_usrquota:
case Opt_grpquota:
+ printk(KERN_ERR
+ "EXT4-fs: quota options not supported.\n");
+ break;
case Opt_usrjquota:
case Opt_grpjquota:
case Opt_offusrjquota:
@@ -1248,7 +1288,7 @@ clear_qf_name:
case Opt_jqfmt_vfsold:
case Opt_jqfmt_vfsv0:
printk(KERN_ERR
- "EXT4-fs: journalled quota options not "
+ "EXT4-fs: journaled quota options not "
"supported.\n");
break;
case Opt_noquota:
@@ -1284,15 +1324,39 @@ clear_qf_name:
clear_opt(sbi->s_mount_opt, NOBH);
break;
case Opt_extents:
+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ ext4_warning(sb, __func__,
+ "extents feature not enabled "
+ "on this filesystem, use tune2fs\n");
+ return 0;
+ }
set_opt (sbi->s_mount_opt, EXTENTS);
break;
case Opt_noextents:
+			/*
+			 * Once e2fsprogs supports resizing an existing ext3
+			 * file system beyond 2**32 blocks, the block allocator
+			 * will need support for growing existing block-mapped
+			 * inodes so that the blocks allocated for them still
+			 * fall below 2**32.
+			 */
+ last_block = ext4_blocks_count(sbi->s_es) - 1;
+ if (last_block > 0xffffffffULL) {
+ printk(KERN_ERR "EXT4-fs: Filesystem too "
+ "large to mount with "
+ "-o noextents options\n");
+ return 0;
+ }
clear_opt (sbi->s_mount_opt, EXTENTS);
break;
case Opt_i_version:
set_opt(sbi->s_mount_opt, I_VERSION);
sb->s_flags |= MS_I_VERSION;
break;
+ case Opt_nodelalloc:
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ break;
case Opt_mballoc:
set_opt(sbi->s_mount_opt, MBALLOC);
break;
@@ -1306,6 +1370,9 @@ clear_qf_name:
return 0;
sbi->s_stripe = option;
break;
+ case Opt_delalloc:
+ set_opt(sbi->s_mount_opt, DELALLOC);
+ break;
default:
printk (KERN_ERR
"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1333,14 +1400,14 @@ clear_qf_name:
}
if (!sbi->s_jquota_fmt) {
- printk(KERN_ERR "EXT4-fs: journalled quota format "
+ printk(KERN_ERR "EXT4-fs: journaled quota format "
"not specified.\n");
return 0;
}
} else {
if (sbi->s_jquota_fmt) {
- printk(KERN_ERR "EXT4-fs: journalled quota format "
- "specified with no journalling "
+ printk(KERN_ERR "EXT4-fs: journaled quota format "
+ "specified with no journaling "
"enabled.\n");
return 0;
}
@@ -1418,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
return res;
}
+static int ext4_fill_flex_info(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_group_desc *gdp = NULL;
+ struct buffer_head *bh;
+ ext4_group_t flex_group_count;
+ ext4_group_t flex_group;
+ int groups_per_flex = 0;
+ __u64 block_bitmap = 0;
+ int i;
+
+ if (!sbi->s_es->s_log_groups_per_flex) {
+ sbi->s_log_groups_per_flex = 0;
+ return 1;
+ }
+
+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
+
+ flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
+ groups_per_flex;
+ sbi->s_flex_groups = kmalloc(flex_group_count *
+ sizeof(struct flex_groups), GFP_KERNEL);
+ if (sbi->s_flex_groups == NULL) {
+ printk(KERN_ERR "EXT4-fs: not enough memory\n");
+ goto failed;
+ }
+ memset(sbi->s_flex_groups, 0, flex_group_count *
+ sizeof(struct flex_groups));
+
+ gdp = ext4_get_group_desc(sb, 1, &bh);
+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
+
+ for (i = 0; i < sbi->s_groups_count; i++) {
+ gdp = ext4_get_group_desc(sb, i, &bh);
+
+ flex_group = ext4_flex_group(sbi, i);
+ sbi->s_flex_groups[flex_group].free_inodes +=
+ le16_to_cpu(gdp->bg_free_inodes_count);
+ sbi->s_flex_groups[flex_group].free_blocks +=
+ le16_to_cpu(gdp->bg_free_blocks_count);
+ }
+
+ return 1;
+failed:
+ return 0;
+}
+
__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
struct ext4_group_desc *gdp)
{
@@ -1581,7 +1696,7 @@ static void ext4_orphan_cleanup (struct super_block * sb,
int ret = ext4_quota_on_mount(sb, i);
if (ret < 0)
printk(KERN_ERR
- "EXT4-fs: Cannot turn on journalled "
+ "EXT4-fs: Cannot turn on journaled "
"quota: error %d\n", ret);
}
}
@@ -1785,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
}
static int ext4_fill_super (struct super_block *sb, void *data, int silent)
- __releases(kernel_sem)
- __acquires(kernel_sem)
+ __releases(kernel_lock)
+ __acquires(kernel_lock)
{
struct buffer_head * bh;
@@ -1826,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
goto out_fail;
}
- if (!sb_set_blocksize(sb, blocksize)) {
- printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
- goto out_fail;
- }
-
/*
* The ext4 superblock will not be buffer aligned for other than 1kB
* block sizes. We need to calculate the offset from buffer start.
@@ -1890,18 +2000,32 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
sbi->s_resgid = le16_to_cpu(es->s_def_resgid);
set_opt(sbi->s_mount_opt, RESERVATION);
+ set_opt(sbi->s_mount_opt, BARRIER);
/*
* turn on extents feature by default in ext4 filesystem
- * User -o noextents to turn it off
+ * only if the feature flag was already set by mkfs or tune2fs.
+ * Use -o noextents to turn it off
*/
- set_opt(sbi->s_mount_opt, EXTENTS);
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ set_opt(sbi->s_mount_opt, EXTENTS);
+ else
+ ext4_warning(sb, __func__,
+ "extents feature not enabled on this filesystem, "
+ "use tune2fs.\n");
/*
- * turn on mballoc feature by default in ext4 filesystem
- * User -o nomballoc to turn it off
+ * turn on mballoc code by default in ext4 filesystem
+ * Use -o nomballoc to turn it off
*/
set_opt(sbi->s_mount_opt, MBALLOC);
+ /*
+ * enable delayed allocation by default
+ * Use -o nodelalloc to turn it off
+ */
+ set_opt(sbi->s_mount_opt, DELALLOC);
+
if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
NULL, 0))
goto failed_mount;
@@ -2112,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
goto failed_mount2;
}
+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (!ext4_fill_flex_info(sb)) {
+ printk(KERN_ERR
+ "EXT4-fs: unable to initialize "
+ "flex_bg meta info!\n");
+ goto failed_mount2;
+ }
+
sbi->s_gdb_count = db_count;
get_random_bytes(&sbi->s_next_generation, sizeof(u32));
spin_lock_init(&sbi->s_next_gen_lock);
@@ -2172,6 +2304,29 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
if (ext4_load_journal(sb, es, journal_devnum))
goto failed_mount3;
+ if (!(sb->s_flags & MS_RDONLY) &&
+ EXT4_SB(sb)->s_journal->j_failed_commit) {
+ printk(KERN_CRIT "EXT4-fs error (device %s): "
+ "ext4_fill_super: Journal transaction "
+ "%u is corrupt\n", sb->s_id,
+ EXT4_SB(sb)->s_journal->j_failed_commit);
+ if (test_opt (sb, ERRORS_RO)) {
+ printk (KERN_CRIT
+ "Mounting filesystem read-only\n");
+ sb->s_flags |= MS_RDONLY;
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+ }
+ if (test_opt(sb, ERRORS_PANIC)) {
+ EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
+ es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
+ ext4_commit_super(sb, es, 1);
+ printk(KERN_CRIT
+ "EXT4-fs (device %s): mount failed\n",
+ sb->s_id);
+ goto failed_mount4;
+ }
+ }
} else if (journal_inum) {
if (ext4_create_journal(sb, es, journal_inum))
goto failed_mount3;
@@ -2309,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
+ "requested data journaling mode\n");
+ clear_opt(sbi->s_mount_opt, DELALLOC);
+ } else if (test_opt(sb, DELALLOC))
+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+
ext4_ext_init(sb);
ext4_mb_init(sb, needs_recovery);
@@ -2323,6 +2485,7 @@ cantfind_ext4:
failed_mount4:
jbd2_journal_destroy(sbi->s_journal);
+ sbi->s_journal = NULL;
failed_mount3:
percpu_counter_destroy(&sbi->s_freeblocks_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
@@ -3106,7 +3269,7 @@ static int ext4_release_dquot(struct dquot *dquot)
static int ext4_mark_dquot_dirty(struct dquot *dquot)
{
- /* Are we journalling quotas? */
+ /* Are we journaling quotas? */
if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
dquot_mark_dquot_dirty(dquot);
@@ -3153,23 +3316,42 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
if (!test_opt(sb, QUOTA))
return -EINVAL;
- /* Not journalling quota? */
- if ((!EXT4_SB(sb)->s_qf_names[USRQUOTA] &&
- !EXT4_SB(sb)->s_qf_names[GRPQUOTA]) || remount)
+ /* When remounting, no checks are needed and in fact, path is NULL */
+ if (remount)
return vfs_quota_on(sb, type, format_id, path, remount);
+
err = path_lookup(path, LOOKUP_FOLLOW, &nd);
if (err)
return err;
+
/* Quotafile not on the same filesystem? */
if (nd.path.mnt->mnt_sb != sb) {
path_put(&nd.path);
return -EXDEV;
}
- /* Quotafile not of fs root? */
- if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
- printk(KERN_WARNING
- "EXT4-fs: Quota file not on filesystem root. "
- "Journalled quota will not work.\n");
+ /* Journaling quota? */
+ if (EXT4_SB(sb)->s_qf_names[type]) {
+ /* Quotafile not of fs root? */
+ if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
+ printk(KERN_WARNING
+ "EXT4-fs: Quota file not on filesystem root. "
+ "Journaled quota will not work.\n");
+ }
+
+	/*
+	 * When we journal data on the quota file, we have to flush the
+	 * journal to see all updates to the file when we bypass the
+	 * pagecache...
+	 */
+ if (ext4_should_journal_data(nd.path.dentry->d_inode)) {
+ /*
+ * We don't need to lock updates but journal_flush() could
+ * otherwise be livelocked...
+ */
+ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+ jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+ jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+ }
+
path_put(&nd.path);
return vfs_quota_on(sb, type, format_id, path, remount);
}
@@ -3257,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
err = ext4_journal_dirty_metadata(handle, bh);
else {
/* Always do at least ordered writes for quotas */
- err = ext4_journal_dirty_data(handle, bh);
+ err = ext4_jbd2_file_inode(handle, inode);
mark_buffer_dirty(bh);
}
brelse(bh);
@@ -3269,8 +3451,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite)
+ if (len == towrite) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if (inode->i_size < off+len-towrite) {
i_size_write(inode, off+len-towrite);
EXT4_I(inode)->i_disksize = inode->i_size;
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3fbc2c6c3d0e..93c5fdcdad2e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -810,7 +810,7 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
- ext4_fsblk_t block = ext4_new_block(handle, inode,
+ ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
goal, &error);
if (error)
goto cleanup;
@@ -1009,6 +1009,11 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
i.value = NULL;
error = ext4_xattr_block_set(handle, inode, &i, &bs);
} else if (error == -ENOSPC) {
+ if (EXT4_I(inode)->i_file_acl && !bs.s.base) {
+ error = ext4_xattr_block_find(inode, &i, &bs);
+ if (error)
+ goto cleanup;
+ }
error = ext4_xattr_block_set(handle, inode, &i, &bs);
if (error)
goto cleanup;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index fff33382cadc..ac1a52cf2a37 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,13 +13,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_TRUSTED_PREFIX "trusted."
-
static size_t
ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!capable(CAP_SYS_ADMIN))
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 67be723fcc4e..d91aa61b42aa 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,13 +12,11 @@
#include "ext4.h"
#include "xattr.h"
-#define XATTR_USER_PREFIX "user."
-
static size_t
ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
const char *name, size_t name_len)
{
- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
if (!test_opt(inode->i_sb, XATTR_USER))
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index fda25479af26..3a9ecac8d61f 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -61,7 +61,7 @@ void fat_cache_destroy(void)
static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
{
- return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL);
+ return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS);
}
static inline void fat_cache_free(struct fat_cache *cache)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 486725ee99ae..34541d06e626 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -472,7 +472,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
loff_t cpos;
int ret = 0;
- lock_kernel();
+ lock_super(sb);
cpos = filp->f_pos;
/* Fake . and .. for the root directory. */
@@ -654,7 +654,7 @@ FillFailed:
if (unicode)
__putname(unicode);
out:
- unlock_kernel();
+ unlock_super(sb);
return ret;
}
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 27cc1164ec36..c672df4036e9 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,7 +11,6 @@
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/msdos_fs.h>
-#include <linux/smp_lock.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
@@ -242,9 +241,7 @@ void fat_truncate(struct inode *inode)
nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
- lock_kernel();
fat_free(inode, nr_clusters);
- unlock_kernel();
fat_flush_inodes(inode->i_sb, inode, NULL);
}
@@ -257,26 +254,34 @@ int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
}
EXPORT_SYMBOL_GPL(fat_getattr);
-static int fat_check_mode(const struct msdos_sb_info *sbi, struct inode *inode,
- mode_t mode)
+static int fat_sanitize_mode(const struct msdos_sb_info *sbi,
+ struct inode *inode, umode_t *mode_ptr)
{
- mode_t mask, req = mode & ~S_IFMT;
+ mode_t mask, perm;
- if (S_ISREG(mode))
+	/*
+	 * Note: the basic validity check, (attr->ia_mode &
+	 * ~MSDOS_VALID_MODE), has already been done by the caller.
+	 */
+
+ if (S_ISREG(inode->i_mode))
mask = sbi->options.fs_fmask;
else
mask = sbi->options.fs_dmask;
+ perm = *mode_ptr & ~(S_IFMT | mask);
+
/*
* Of the r and x bits, all (subject to umask) must be present. Of the
* w bits, either all (subject to umask) or none must be present.
*/
- req &= ~mask;
- if ((req & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO)))
+ if ((perm & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO)))
return -EPERM;
- if ((req & S_IWUGO) && ((req & S_IWUGO) != (S_IWUGO & ~mask)))
+ if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
return -EPERM;
+ *mode_ptr &= S_IFMT | perm;
+
return 0;
}
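A worked example of the rules above, with hypothetical values fs_fmask = 022 and an existing i_mode of 0755:

/* chmod 0644: perm = 0644; its r/x bits (0444) differ from the
 *   inode's (0555), so the request fails with -EPERM -- execute
 *   bits cannot be dropped.
 * chmod 0775: perm = 0775 & ~022 = 0755; the r/x bits match and
 *   the w bits equal S_IWUGO & ~mask (0200), so the request is
 *   accepted and the mode is stored as 0755.
 */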
@@ -299,11 +304,9 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
{
struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb);
struct inode *inode = dentry->d_inode;
- int mask, error = 0;
+ int error = 0;
unsigned int ia_valid;
- lock_kernel();
-
/*
* Expand the file. Since inode_setattr() updates ->i_size
* before calling the ->truncate(), but FAT needs to fill the
@@ -332,12 +335,13 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
error = 0;
goto out;
}
+
if (((attr->ia_valid & ATTR_UID) &&
(attr->ia_uid != sbi->options.fs_uid)) ||
((attr->ia_valid & ATTR_GID) &&
(attr->ia_gid != sbi->options.fs_gid)) ||
((attr->ia_valid & ATTR_MODE) &&
- fat_check_mode(sbi, inode, attr->ia_mode) < 0))
+ (attr->ia_mode & ~MSDOS_VALID_MODE)))
error = -EPERM;
if (error) {
@@ -346,17 +350,17 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
goto out;
}
- error = inode_setattr(inode, attr);
- if (error)
- goto out;
+	/*
+	 * We don't return -EPERM here. Strange, but this preserves
+	 * long-standing behavior.
+	 */
+ if (attr->ia_valid & ATTR_MODE) {
+ if (fat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0)
+ attr->ia_valid &= ~ATTR_MODE;
+ }
- if (S_ISDIR(inode->i_mode))
- mask = sbi->options.fs_dmask;
- else
- mask = sbi->options.fs_fmask;
- inode->i_mode &= S_IFMT | (S_IRWXUGO & ~mask);
+ error = inode_setattr(inode, attr);
out:
- unlock_kernel();
return error;
}
EXPORT_SYMBOL_GPL(fat_setattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4e0a3dd9d677..46a4508ffd2e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -440,14 +440,13 @@ static void fat_delete_inode(struct inode *inode)
static void fat_clear_inode(struct inode *inode)
{
- struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+ struct super_block *sb = inode->i_sb;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
- lock_kernel();
spin_lock(&sbi->inode_hash_lock);
fat_cache_inval_inode(inode);
hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
spin_unlock(&sbi->inode_hash_lock);
- unlock_kernel();
}
static void fat_write_super(struct super_block *sb)
@@ -485,7 +484,7 @@ static struct kmem_cache *fat_inode_cachep;
static struct inode *fat_alloc_inode(struct super_block *sb)
{
struct msdos_inode_info *ei;
- ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL);
+ ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
return &ei->vfs_inode;
@@ -567,7 +566,7 @@ retry:
if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
return 0;
- lock_kernel();
+ lock_super(sb);
bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
if (!bh) {
printk(KERN_ERR "FAT: unable to read inode block "
@@ -579,7 +578,7 @@ retry:
if (i_pos != MSDOS_I(inode)->i_pos) {
spin_unlock(&sbi->inode_hash_lock);
brelse(bh);
- unlock_kernel();
+ unlock_super(sb);
goto retry;
}
@@ -606,7 +605,7 @@ retry:
err = sync_dirty_buffer(bh);
brelse(bh);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -736,6 +735,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
static struct dentry *fat_get_parent(struct dentry *child)
{
+ struct super_block *sb = child->d_sb;
struct buffer_head *bh;
struct msdos_dir_entry *de;
loff_t i_pos;
@@ -743,14 +743,14 @@ static struct dentry *fat_get_parent(struct dentry *child)
struct inode *inode;
int err;
- lock_kernel();
+ lock_super(sb);
err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
if (err) {
parent = ERR_PTR(err);
goto out;
}
- inode = fat_build_inode(child->d_sb, de, i_pos);
+ inode = fat_build_inode(sb, de, i_pos);
brelse(bh);
if (IS_ERR(inode)) {
parent = ERR_CAST(inode);
@@ -762,7 +762,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
parent = ERR_PTR(-ENOMEM);
}
out:
- unlock_kernel();
+ unlock_super(sb);
return parent;
}
@@ -1172,6 +1172,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
long error;
char buf[50];
+ /*
+ * GFP_KERNEL is ok here, because while we do hold the
+	 * superblock lock, memory pressure can't call back into
+ * the filesystem, since we're only just about to mount
+ * it and have no inodes etc active!
+ */
sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bfd776509a72..330a7d782591 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
#include <linux/fdtable.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
-#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/security.h>
@@ -227,7 +226,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
if (error)
return error;
- lock_kernel();
if ((arg ^ filp->f_flags) & FASYNC) {
if (filp->f_op && filp->f_op->fasync) {
error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
@@ -238,7 +236,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
out:
- unlock_kernel();
return error;
}
diff --git a/fs/file.c b/fs/file.c
index 4c6f0ea12c41..7b3887e054d0 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -26,6 +26,8 @@ struct fdtable_defer {
};
int sysctl_nr_open __read_mostly = 1024*1024;
+int sysctl_nr_open_min = BITS_PER_LONG;
+int sysctl_nr_open_max = 1024 * 1024; /* raised later */
/*
* We use this list to defer free fdtables that have vmalloced
@@ -119,8 +121,6 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
unsigned int cpy, set;
BUG_ON(nfdt->max_fds < ofdt->max_fds);
- if (ofdt->max_fds == 0)
- return;
cpy = ofdt->max_fds * sizeof(struct file *);
set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
@@ -261,6 +261,139 @@ int expand_files(struct files_struct *files, int nr)
return expand_fdtable(files, nr);
}
+static int count_open_files(struct fdtable *fdt)
+{
+ int size = fdt->max_fds;
+ int i;
+
+ /* Find the last open fd */
+ for (i = size/(8*sizeof(long)); i > 0; ) {
+ if (fdt->open_fds->fds_bits[--i])
+ break;
+ }
+ i = (i+1) * 8 * sizeof(long);
+ return i;
+}
+
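A hypothetical worked example of the rounding above, on a 64-bit host (8 * sizeof(long) = 64):

/* max_fds = 256, highest open fd = 70:
 *   i starts at 256/64 = 4; words 3 and 2 are zero, word 1
 *   (fds 64..127) is non-zero, so the loop breaks with i = 1 and
 *   the function returns (1 + 1) * 64 = 128 -- the count rounded
 *   up to a whole word, which keeps the bitmap copies in dup_fd()
 *   below word-aligned.
 */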
+/*
+ * Allocate a new files structure and copy contents from the
+ * passed in files structure.
+ * errorp will be valid only when the returned files_struct is NULL.
+ */
+struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
+{
+ struct files_struct *newf;
+ struct file **old_fds, **new_fds;
+ int open_files, size, i;
+ struct fdtable *old_fdt, *new_fdt;
+
+ *errorp = -ENOMEM;
+ newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
+ if (!newf)
+ goto out;
+
+ atomic_set(&newf->count, 1);
+
+ spin_lock_init(&newf->file_lock);
+ newf->next_fd = 0;
+ new_fdt = &newf->fdtab;
+ new_fdt->max_fds = NR_OPEN_DEFAULT;
+ new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
+ new_fdt->open_fds = (fd_set *)&newf->open_fds_init;
+ new_fdt->fd = &newf->fd_array[0];
+ INIT_RCU_HEAD(&new_fdt->rcu);
+ new_fdt->next = NULL;
+
+ spin_lock(&oldf->file_lock);
+ old_fdt = files_fdtable(oldf);
+ open_files = count_open_files(old_fdt);
+
+ /*
+ * Check whether we need to allocate a larger fd array and fd set.
+ */
+ while (unlikely(open_files > new_fdt->max_fds)) {
+ spin_unlock(&oldf->file_lock);
+
+ if (new_fdt != &newf->fdtab) {
+ free_fdarr(new_fdt);
+ free_fdset(new_fdt);
+ kfree(new_fdt);
+ }
+
+ new_fdt = alloc_fdtable(open_files - 1);
+ if (!new_fdt) {
+ *errorp = -ENOMEM;
+ goto out_release;
+ }
+
+ /* beyond sysctl_nr_open; nothing to do */
+ if (unlikely(new_fdt->max_fds < open_files)) {
+ free_fdarr(new_fdt);
+ free_fdset(new_fdt);
+ kfree(new_fdt);
+ *errorp = -EMFILE;
+ goto out_release;
+ }
+
+		/*
+		 * Reacquire the oldf lock and a fresh pointer to its fd
+		 * table; it may have grown a new, bigger fd table in the
+		 * meantime, and we need the latest pointer.
+		 */
+ spin_lock(&oldf->file_lock);
+ old_fdt = files_fdtable(oldf);
+ open_files = count_open_files(old_fdt);
+ }
+
+ old_fds = old_fdt->fd;
+ new_fds = new_fdt->fd;
+
+ memcpy(new_fdt->open_fds->fds_bits,
+ old_fdt->open_fds->fds_bits, open_files/8);
+ memcpy(new_fdt->close_on_exec->fds_bits,
+ old_fdt->close_on_exec->fds_bits, open_files/8);
+
+ for (i = open_files; i != 0; i--) {
+ struct file *f = *old_fds++;
+ if (f) {
+ get_file(f);
+ } else {
+ /*
+ * The fd may be claimed in the fd bitmap but not yet
+ * instantiated in the files array if a sibling thread
+ * is partway through open(). So make sure that this
+ * fd is available to the new process.
+ */
+ FD_CLR(open_files - i, new_fdt->open_fds);
+ }
+ rcu_assign_pointer(*new_fds++, f);
+ }
+ spin_unlock(&oldf->file_lock);
+
+ /* compute the remainder to be cleared */
+ size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
+
+	/* This is long-word aligned, thus could use an optimized version */
+ memset(new_fds, 0, size);
+
+ if (new_fdt->max_fds > open_files) {
+ int left = (new_fdt->max_fds-open_files)/8;
+ int start = open_files / (8 * sizeof(unsigned long));
+
+ memset(&new_fdt->open_fds->fds_bits[start], 0, left);
+ memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
+ }
+
+ rcu_assign_pointer(newf->fdt, new_fdt);
+
+ return newf;
+
+out_release:
+ kmem_cache_free(files_cachep, newf);
+out:
+ return NULL;
+}
+
static void __devinit fdtable_defer_list_init(int cpu)
{
struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
@@ -274,4 +407,19 @@ void __init files_defer_init(void)
int i;
for_each_possible_cpu(i)
fdtable_defer_list_init(i);
+ sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) &
+ -BITS_PER_LONG;
}
+
+struct files_struct init_files = {
+ .count = ATOMIC_INIT(1),
+ .fdt = &init_files.fdtab,
+ .fdtab = {
+ .max_fds = NR_OPEN_DEFAULT,
+ .fd = &init_files.fd_array[0],
+ .close_on_exec = (fd_set *)&init_files.close_on_exec_init,
+ .open_fds = (fd_set *)&init_files.open_fds_init,
+ .rcu = RCU_HEAD_INIT,
+ },
+ .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
+};
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f28cf8b46f80..8092f0d9fd1f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -804,6 +804,8 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
if (offset == PAGE_CACHE_SIZE)
offset = 0;
+ if (!fc->big_writes)
+ break;
} while (iov_iter_count(ii) && count < fc->max_write &&
req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index dadffa21a206..bae948657c4f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -404,6 +404,9 @@ struct fuse_conn {
/** Is bmap not implemented by fs? */
unsigned no_bmap : 1;
+ /** Do multi-page cached writes */
+ unsigned big_writes : 1;
+
/** The number of requests waiting for completion */
atomic_t num_waiting;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 79b615873838..3141690558c8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -488,7 +488,12 @@ static struct fuse_conn *new_conn(struct super_block *sb)
err = bdi_init(&fc->bdi);
if (err)
goto error_kfree;
- err = bdi_register_dev(&fc->bdi, fc->dev);
+ if (sb->s_bdev) {
+ err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
+ MAJOR(fc->dev), MINOR(fc->dev));
+ } else {
+ err = bdi_register_dev(&fc->bdi, fc->dev);
+ }
if (err)
goto error_bdi_destroy;
/*
@@ -576,6 +581,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->no_lock = 1;
if (arg->flags & FUSE_ATOMIC_O_TRUNC)
fc->atomic_o_trunc = 1;
+ if (arg->flags & FUSE_BIG_WRITES)
+ fc->big_writes = 1;
} else {
ra_pages = fc->max_read / PAGE_CACHE_SIZE;
fc->no_lock = 1;
@@ -584,7 +591,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
fc->minor = arg->minor;
fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
- fc->max_write = min_t(unsigned, 4096, fc->max_write);
+ fc->max_write = max_t(unsigned, 4096, fc->max_write);
fc->conn_init = 1;
}
fuse_put_request(fc, req);
@@ -599,7 +606,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
arg->major = FUSE_KERNEL_VERSION;
arg->minor = FUSE_KERNEL_MINOR_VERSION;
arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
- arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC;
+ arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
+ FUSE_BIG_WRITES;
req->in.h.opcode = FUSE_INIT;
req->in.numargs = 1;
req->in.args[0].size = sizeof(*arg);
@@ -659,7 +667,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
fc->flags = d.flags;
fc->user_id = d.user_id;
fc->group_id = d.group_id;
- fc->max_read = min_t(unsigned, 4096, d.max_read);
+ fc->max_read = max_t(unsigned, 4096, d.max_read);
/* Used by get_root_inode() */
sb->s_fs_info = fc;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 7f7947e3dfbb..ab2f57e3fb87 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -14,23 +14,11 @@ config GFS2_FS
GFS is perfect consistency -- changes made to the filesystem on one
machine show up immediately on all other machines in the cluster.
- To use the GFS2 filesystem, you will need to enable one or more of
- the below locking modules. Documentation and utilities for GFS2 can
+ To use the GFS2 filesystem in a cluster, you will need to enable
+ the locking module below. Documentation and utilities for GFS2 can
be found here: http://sources.redhat.com/cluster
-config GFS2_FS_LOCKING_NOLOCK
- tristate "GFS2 \"nolock\" locking module"
- depends on GFS2_FS
- help
- Single node locking module for GFS2.
-
- Use this module if you want to use GFS2 on a single node without
- its clustering features. You can still take advantage of the
- large file support, and upgrade to running a full cluster later on
- if required.
-
- If you will only be using GFS2 in cluster mode, you do not need this
- module.
+ The "nolock" lock module is now built into GFS2 by default.
config GFS2_FS_LOCKING_DLM
tristate "GFS2 DLM locking module"
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index e2350df02a07..ec65851ec80a 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -5,6 +5,5 @@ gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
ops_fstype.o ops_inode.o ops_super.o quota.o \
recovery.o rgrp.o super.o sys.o trans.o util.o
-obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += locking/nolock/
obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += locking/dlm/
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index c19184f2e70e..bec76b1c2bb0 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -246,15 +246,11 @@ static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
}
-static inline unsigned int zero_metapath_length(const struct metapath *mp,
- unsigned height)
+static inline unsigned int metapath_branch_start(const struct metapath *mp)
{
- unsigned int i;
- for (i = 0; i < height - 1; i++) {
- if (mp->mp_list[i] != 0)
- return i;
- }
- return height;
+ if (mp->mp_list[0] == 0)
+ return 2;
+ return 1;
}
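Two hypothetical metapaths illustrate the simplified helper:

/* mp_list = {0, 3, 5}: the top index is 0, so the existing top
 *   pointer can be reused at level 1 and the new branch starts at
 *   level 2 (return 2).
 * mp_list = {1, 0, 0}: the top index is non-zero, so a fresh
 *   pointer is needed from level 1 down (return 1).
 */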
/**
@@ -436,7 +432,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct buffer_head *dibh = mp->mp_bh[0];
u64 bn, dblock = 0;
- unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0;
+ unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
unsigned dblks = 0;
unsigned ptrs_per_blk;
const unsigned end_of_metadata = height - 1;
@@ -471,9 +467,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
/* Building up tree height */
state = ALLOC_GROW_HEIGHT;
iblks = height - ip->i_height;
- zmpl = zero_metapath_length(mp, height);
- iblks -= zmpl;
- iblks += height;
+ branch_start = metapath_branch_start(mp);
+ iblks += (height - branch_start);
}
}
@@ -509,13 +504,13 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
sizeof(struct gfs2_meta_header));
*ptr = zero_bn;
state = ALLOC_GROW_DEPTH;
- for(i = zmpl; i < height; i++) {
+ for(i = branch_start; i < height; i++) {
if (mp->mp_bh[i] == NULL)
break;
brelse(mp->mp_bh[i]);
mp->mp_bh[i] = NULL;
}
- i = zmpl;
+ i = branch_start;
}
if (n == 0)
break;
diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h
index 3bb11c0f8b56..ef606e3a5cf4 100644
--- a/fs/gfs2/gfs2.h
+++ b/fs/gfs2/gfs2.h
@@ -16,11 +16,6 @@ enum {
};
enum {
- NO_WAIT = 0,
- WAIT = 1,
-};
-
-enum {
NO_FORCE = 0,
FORCE = 1,
};
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d636b3e80f5d..13391e546616 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -45,21 +45,19 @@ struct gfs2_gl_hash_bucket {
struct hlist_head hb_list;
};
-struct glock_iter {
- int hash; /* hash bucket index */
- struct gfs2_sbd *sdp; /* incore superblock */
- struct gfs2_glock *gl; /* current glock struct */
- struct seq_file *seq; /* sequence file for debugfs */
- char string[512]; /* scratch space */
+struct gfs2_glock_iter {
+ int hash; /* hash bucket index */
+ struct gfs2_sbd *sdp; /* incore superblock */
+ struct gfs2_glock *gl; /* current glock struct */
+ char string[512]; /* scratch space */
};
typedef void (*glock_examiner) (struct gfs2_glock * gl);
static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
-static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl);
-static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh);
-static void gfs2_glock_drop_th(struct gfs2_glock *gl);
-static void run_queue(struct gfs2_glock *gl);
+static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
+#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
static DECLARE_RWSEM(gfs2_umount_flush_sem);
static struct dentry *gfs2_root;
@@ -123,33 +121,6 @@ static inline rwlock_t *gl_lock_addr(unsigned int x)
#endif
/**
- * relaxed_state_ok - is a requested lock compatible with the current lock mode?
- * @actual: the current state of the lock
- * @requested: the lock state that was requested by the caller
- * @flags: the modifier flags passed in by the caller
- *
- * Returns: 1 if the locks are compatible, 0 otherwise
- */
-
-static inline int relaxed_state_ok(unsigned int actual, unsigned requested,
- int flags)
-{
- if (actual == requested)
- return 1;
-
- if (flags & GL_EXACT)
- return 0;
-
- if (actual == LM_ST_EXCLUSIVE && requested == LM_ST_SHARED)
- return 1;
-
- if (actual != LM_ST_UNLOCKED && (flags & LM_FLAG_ANY))
- return 1;
-
- return 0;
-}
-
-/**
* gl_hash() - Turn glock number into hash bucket number
* @lock: The glock number
*
@@ -182,7 +153,7 @@ static void glock_free(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_sbd;
struct inode *aspace = gl->gl_aspace;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ if (sdp->sd_lockstruct.ls_ops->lm_put_lock)
sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
if (aspace)
@@ -211,17 +182,14 @@ static void gfs2_glock_hold(struct gfs2_glock *gl)
int gfs2_glock_put(struct gfs2_glock *gl)
{
int rv = 0;
- struct gfs2_sbd *sdp = gl->gl_sbd;
write_lock(gl_lock_addr(gl->gl_hash));
if (atomic_dec_and_test(&gl->gl_ref)) {
hlist_del(&gl->gl_list);
write_unlock(gl_lock_addr(gl->gl_hash));
- gfs2_assert(sdp, gl->gl_state == LM_ST_UNLOCKED);
- gfs2_assert(sdp, list_empty(&gl->gl_reclaim));
- gfs2_assert(sdp, list_empty(&gl->gl_holders));
- gfs2_assert(sdp, list_empty(&gl->gl_waiters1));
- gfs2_assert(sdp, list_empty(&gl->gl_waiters3));
+ GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED);
+ GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim));
+ GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
glock_free(gl);
rv = 1;
goto out;
@@ -281,22 +249,401 @@ static struct gfs2_glock *gfs2_glock_find(const struct gfs2_sbd *sdp,
return gl;
}
+/**
+ * may_grant - check if it's ok to grant a new lock
+ * @gl: The glock
+ * @gh: The lock request which we wish to grant
+ *
+ * Returns: true if it's ok to grant the lock
+ */
+
+static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
+{
+ const struct gfs2_holder *gh_head = list_entry(gl->gl_holders.next, const struct gfs2_holder, gh_list);
+ if ((gh->gh_state == LM_ST_EXCLUSIVE ||
+ gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head)
+ return 0;
+ if (gl->gl_state == gh->gh_state)
+ return 1;
+ if (gh->gh_flags & GL_EXACT)
+ return 0;
+ if (gl->gl_state == LM_ST_EXCLUSIVE) {
+ if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
+ return 1;
+ if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
+ return 1;
+ }
+ if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
+ return 1;
+ return 0;
+}
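A few hypothetical requests run through the rules above, in the order the code tests them:

/* - EX requested (or EX at the head) and gh is not the head -> deny;
 * - glock already in the requested state -> grant;
 * - GL_EXACT set and the states differ -> deny;
 * - glock EX, head holder SH, new request SH -> grant (SH and DF
 *   requests share an EX glock only when the head asked for the
 *   same mode);
 * - LM_FLAG_ANY set and the glock is not unlocked -> grant.
 */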
+
+static void gfs2_holder_wake(struct gfs2_holder *gh)
+{
+ clear_bit(HIF_WAIT, &gh->gh_iflags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&gh->gh_iflags, HIF_WAIT);
+}
+
+/**
+ * do_promote - promote as many requests as possible on the current queue
+ * @gl: The glock
+ *
+ * Returns: true if there is a blocked holder at the head of the list
+ */
+
+static int do_promote(struct gfs2_glock *gl)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh, *tmp;
+ int ret;
+
+restart:
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ continue;
+ if (may_grant(gl, gh)) {
+ if (gh->gh_list.prev == &gl->gl_holders &&
+ glops->go_lock) {
+ spin_unlock(&gl->gl_spin);
+ /* FIXME: eliminate this eventually */
+ ret = glops->go_lock(gh);
+ spin_lock(&gl->gl_spin);
+ if (ret) {
+ gh->gh_error = ret;
+ list_del_init(&gh->gh_list);
+ gfs2_holder_wake(gh);
+ goto restart;
+ }
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ gfs2_holder_wake(gh);
+ goto restart;
+ }
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ gfs2_holder_wake(gh);
+ continue;
+ }
+ if (gh->gh_list.prev == &gl->gl_holders)
+ return 1;
+ break;
+ }
+ return 0;
+}
+
+/**
+ * do_error - Something unexpected has happened during a lock request
+ * @gl: The glock
+ * @ret: The status from the lock module
+ *
+ */
+
+static inline void do_error(struct gfs2_glock *gl, const int ret)
+{
+ struct gfs2_holder *gh, *tmp;
+
+ list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ continue;
+ if (ret & LM_OUT_ERROR)
+ gh->gh_error = -EIO;
+ else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
+ gh->gh_error = GLR_TRYFAILED;
+ else
+ continue;
+ list_del_init(&gh->gh_list);
+ gfs2_holder_wake(gh);
+ }
+}
+
+/**
+ * find_first_waiter - find the first gh that's waiting for the glock
+ * @gl: the glock
+ */
+
+static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/**
+ * state_change - record that the glock is now in a different state
+ * @gl: the glock
+ * @new_state: the new state
+ *
+ */
+
+static void state_change(struct gfs2_glock *gl, unsigned int new_state)
+{
+ int held1, held2;
+
+ held1 = (gl->gl_state != LM_ST_UNLOCKED);
+ held2 = (new_state != LM_ST_UNLOCKED);
+
+ if (held1 != held2) {
+ if (held2)
+ gfs2_glock_hold(gl);
+ else
+ gfs2_glock_put(gl);
+ }
+
+ gl->gl_state = new_state;
+ gl->gl_tchange = jiffies;
+}
+
+static void gfs2_demote_wake(struct gfs2_glock *gl)
+{
+ gl->gl_demote_state = LM_ST_EXCLUSIVE;
+ clear_bit(GLF_DEMOTE, &gl->gl_flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
+}
+
+/**
+ * finish_xmote - The DLM has replied to one of our lock requests
+ * @gl: The glock
+ * @ret: The status from the DLM
+ *
+ */
+
+static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_holder *gh;
+ unsigned state = ret & LM_OUT_ST_MASK;
+
+ spin_lock(&gl->gl_spin);
+ state_change(gl, state);
+ gh = find_first_waiter(gl);
+
+ /* Demote to UN request arrived during demote to SH or DF */
+ if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+ state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED)
+ gl->gl_target = LM_ST_UNLOCKED;
+
+ /* Check for state != intended state */
+ if (unlikely(state != gl->gl_target)) {
+ if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) {
+ /* move to back of queue and try next entry */
+ if (ret & LM_OUT_CANCELED) {
+ if ((gh->gh_flags & LM_FLAG_PRIORITY) == 0)
+ list_move_tail(&gh->gh_list, &gl->gl_holders);
+ gh = find_first_waiter(gl);
+ gl->gl_target = gh->gh_state;
+ goto retry;
+ }
+ /* Some error or failed "try lock" - report it */
+ if ((ret & LM_OUT_ERROR) ||
+ (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
+ gl->gl_target = gl->gl_state;
+ do_error(gl, ret);
+ goto out;
+ }
+ }
+ switch(state) {
+ /* Unlocked due to conversion deadlock, try again */
+ case LM_ST_UNLOCKED:
+retry:
+ do_xmote(gl, gh, gl->gl_target);
+ break;
+ /* Conversion fails, unlock and try again */
+ case LM_ST_SHARED:
+ case LM_ST_DEFERRED:
+ do_xmote(gl, gh, LM_ST_UNLOCKED);
+ break;
+ default: /* Everything else */
+ printk(KERN_ERR "GFS2: wanted %u got %u\n", gl->gl_target, state);
+ GLOCK_BUG_ON(gl, 1);
+ }
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_put(gl);
+ return;
+ }
+
+ /* Fast path - we got what we asked for */
+ if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
+ gfs2_demote_wake(gl);
+ if (state != LM_ST_UNLOCKED) {
+ if (glops->go_xmote_bh) {
+ int rv;
+ spin_unlock(&gl->gl_spin);
+ rv = glops->go_xmote_bh(gl, gh);
+ if (rv == -EAGAIN)
+ return;
+ spin_lock(&gl->gl_spin);
+ if (rv) {
+ do_error(gl, rv);
+ goto out;
+ }
+ }
+ do_promote(gl);
+ }
+out:
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_put(gl);
+}
+
+static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
+ unsigned int cur_state, unsigned int req_state,
+ unsigned int flags)
+{
+ int ret = LM_OUT_ERROR;
+
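+	/* No lock module (e.g. the built-in lock_nolock, which has no
+	 * lm_lock operation): grant the request immediately by
+	 * reflecting the requested state back to the caller. */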
+ if (!sdp->sd_lockstruct.ls_ops->lm_lock)
+ return req_state == LM_ST_UNLOCKED ? 0 : req_state;
+
+ if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
+ req_state, flags);
+ return ret;
+}
+
+/**
+ * do_xmote - Calls the DLM to change the state of a lock
+ * @gl: The glock
+ * @gh: The holder (only for promotes)
+ * @target: The target lock state
+ *
+ */
+
+static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
+{
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ unsigned int lck_flags = gh ? gh->gh_flags : 0;
+ int ret;
+
+ lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
+ LM_FLAG_PRIORITY);
+ BUG_ON(gl->gl_state == target);
+ BUG_ON(gl->gl_state == gl->gl_target);
+ if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) &&
+ glops->go_inval) {
+ set_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+ do_error(gl, 0); /* Fail queued try locks */
+ }
+ spin_unlock(&gl->gl_spin);
+ if (glops->go_xmote_th)
+ glops->go_xmote_th(gl);
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+ glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA);
+ clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags);
+
+ gfs2_glock_hold(gl);
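+	/* Converting directly from SH or DF to another held state can
+	 * deadlock in the lock manager, so (presumably to avoid that)
+	 * request a try-lock with one callback; a failed conversion is
+	 * then retried from finish_xmote() instead of blocking. */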
+ if (target != LM_ST_UNLOCKED && (gl->gl_state == LM_ST_SHARED ||
+ gl->gl_state == LM_ST_DEFERRED) &&
+ !(lck_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+ lck_flags |= LM_FLAG_TRY_1CB;
+ ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, target, lck_flags);
+
+ if (!(ret & LM_OUT_ASYNC)) {
+ finish_xmote(gl, ret);
+ gfs2_glock_hold(gl);
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
+ } else {
+ GLOCK_BUG_ON(gl, ret != LM_OUT_ASYNC);
+ }
+ spin_lock(&gl->gl_spin);
+}
+
+/**
+ * find_first_holder - find the first "holder" gh
+ * @gl: the glock
+ */
+
+static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ if (!list_empty(&gl->gl_holders)) {
+ gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+ if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+ return gh;
+ }
+ return NULL;
+}
+
+/**
+ * run_queue - do all outstanding tasks related to a glock
+ * @gl: The glock in question
+ * @nonblock: True if we must not block in run_queue
+ *
+ */
+
+static void run_queue(struct gfs2_glock *gl, const int nonblock)
+{
+ struct gfs2_holder *gh = NULL;
+
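+	/* GLF_LOCK serialises the glock state machine; if another
+	 * context already holds it, leave the work to that context. */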
+ if (test_and_set_bit(GLF_LOCK, &gl->gl_flags))
+ return;
+
+ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags));
+
+ if (test_bit(GLF_DEMOTE, &gl->gl_flags) &&
+ gl->gl_demote_state != gl->gl_state) {
+ if (find_first_holder(gl))
+ goto out;
+ if (nonblock)
+ goto out_sched;
+ set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
+ GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE);
+ gl->gl_target = gl->gl_demote_state;
+ } else {
+ if (test_bit(GLF_DEMOTE, &gl->gl_flags))
+ gfs2_demote_wake(gl);
+ if (do_promote(gl) == 0)
+ goto out;
+ gh = find_first_waiter(gl);
+ gl->gl_target = gh->gh_state;
+ if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)))
+ do_error(gl, 0); /* Fail queued try locks */
+ }
+ do_xmote(gl, gh, gl->gl_target);
+ return;
+
+out_sched:
+ gfs2_glock_hold(gl);
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
+out:
+ clear_bit(GLF_LOCK, &gl->gl_flags);
+}
+
static void glock_work_func(struct work_struct *work)
{
+ unsigned long delay = 0;
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+ if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+ finish_xmote(gl, gl->gl_reply);
spin_lock(&gl->gl_spin);
- if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags))
- set_bit(GLF_DEMOTE, &gl->gl_flags);
- run_queue(gl);
+ if (test_and_clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+ gl->gl_state != LM_ST_UNLOCKED &&
+ gl->gl_demote_state != LM_ST_EXCLUSIVE) {
+ unsigned long holdtime, now = jiffies;
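+		/* Honour go_min_hold_time: if the glock changed state
+		 * recently, defer the demote rather than thrash it. */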
+ holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
+ if (time_before(now, holdtime))
+ delay = holdtime - now;
+ set_bit(delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE, &gl->gl_flags);
+ }
+ run_queue(gl, 0);
spin_unlock(&gl->gl_spin);
- gfs2_glock_put(gl);
+ if (!delay ||
+ queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
+ gfs2_glock_put(gl);
}
static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
void **lockp)
{
int error = -EIO;
+ if (!sdp->sd_lockstruct.ls_ops->lm_get_lock)
+ return 0;
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
sdp->sd_lockstruct.ls_lockspace, name, lockp);
@@ -342,12 +689,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
gl->gl_name = name;
atomic_set(&gl->gl_ref, 1);
gl->gl_state = LM_ST_UNLOCKED;
+ gl->gl_target = LM_ST_UNLOCKED;
gl->gl_demote_state = LM_ST_EXCLUSIVE;
gl->gl_hash = hash;
- gl->gl_owner_pid = NULL;
- gl->gl_ip = 0;
gl->gl_ops = glops;
- gl->gl_req_gh = NULL;
gl->gl_stamp = jiffies;
gl->gl_tchange = jiffies;
gl->gl_object = NULL;
@@ -447,13 +792,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
gh->gh_ip = 0;
}
-static void gfs2_holder_wake(struct gfs2_holder *gh)
-{
- clear_bit(HIF_WAIT, &gh->gh_iflags);
- smp_mb__after_clear_bit();
- wake_up_bit(&gh->gh_iflags, HIF_WAIT);
-}
-
static int just_schedule(void *word)
{
schedule();
@@ -466,14 +804,6 @@ static void wait_on_holder(struct gfs2_holder *gh)
wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
}
-static void gfs2_demote_wake(struct gfs2_glock *gl)
-{
- gl->gl_demote_state = LM_ST_EXCLUSIVE;
- clear_bit(GLF_DEMOTE, &gl->gl_flags);
- smp_mb__after_clear_bit();
- wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
-}
-
static void wait_on_demote(struct gfs2_glock *gl)
{
might_sleep();
@@ -481,217 +811,6 @@ static void wait_on_demote(struct gfs2_glock *gl)
}
/**
- * rq_mutex - process a mutex request in the queue
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-
-static int rq_mutex(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- list_del_init(&gh->gh_list);
- /* gh->gh_error never examined. */
- set_bit(GLF_LOCK, &gl->gl_flags);
- clear_bit(HIF_WAIT, &gh->gh_iflags);
- smp_mb();
- wake_up_bit(&gh->gh_iflags, HIF_WAIT);
-
- return 1;
-}
-
-/**
- * rq_promote - process a promote request in the queue
- * @gh: the glock holder
- *
- * Acquire a new inter-node lock, or change a lock state to more restrictive.
- *
- * Returns: 1 if the queue is blocked
- */
-
-static int rq_promote(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
-
- if (!relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
- if (list_empty(&gl->gl_holders)) {
- gl->gl_req_gh = gh;
- set_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- gfs2_glock_xmote_th(gh->gh_gl, gh);
- spin_lock(&gl->gl_spin);
- }
- return 1;
- }
-
- if (list_empty(&gl->gl_holders)) {
- set_bit(HIF_FIRST, &gh->gh_iflags);
- set_bit(GLF_LOCK, &gl->gl_flags);
- } else {
- struct gfs2_holder *next_gh;
- if (gh->gh_state == LM_ST_EXCLUSIVE)
- return 1;
- next_gh = list_entry(gl->gl_holders.next, struct gfs2_holder,
- gh_list);
- if (next_gh->gh_state == LM_ST_EXCLUSIVE)
- return 1;
- }
-
- list_move_tail(&gh->gh_list, &gl->gl_holders);
- gh->gh_error = 0;
- set_bit(HIF_HOLDER, &gh->gh_iflags);
-
- gfs2_holder_wake(gh);
-
- return 0;
-}
-
-/**
- * rq_demote - process a demote request in the queue
- * @gh: the glock holder
- *
- * Returns: 1 if the queue is blocked
- */
-
-static int rq_demote(struct gfs2_glock *gl)
-{
- if (!list_empty(&gl->gl_holders))
- return 1;
-
- if (gl->gl_state == gl->gl_demote_state ||
- gl->gl_state == LM_ST_UNLOCKED) {
- gfs2_demote_wake(gl);
- return 0;
- }
-
- set_bit(GLF_LOCK, &gl->gl_flags);
- set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
-
- if (gl->gl_demote_state == LM_ST_UNLOCKED ||
- gl->gl_state != LM_ST_EXCLUSIVE) {
- spin_unlock(&gl->gl_spin);
- gfs2_glock_drop_th(gl);
- } else {
- spin_unlock(&gl->gl_spin);
- gfs2_glock_xmote_th(gl, NULL);
- }
-
- spin_lock(&gl->gl_spin);
- clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags);
-
- return 0;
-}
-
-/**
- * run_queue - process holder structures on a glock
- * @gl: the glock
- *
- */
-static void run_queue(struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh;
- int blocked = 1;
-
- for (;;) {
- if (test_bit(GLF_LOCK, &gl->gl_flags))
- break;
-
- if (!list_empty(&gl->gl_waiters1)) {
- gh = list_entry(gl->gl_waiters1.next,
- struct gfs2_holder, gh_list);
- blocked = rq_mutex(gh);
- } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
- blocked = rq_demote(gl);
- if (test_bit(GLF_WAITERS2, &gl->gl_flags) &&
- !blocked) {
- set_bit(GLF_DEMOTE, &gl->gl_flags);
- gl->gl_demote_state = LM_ST_UNLOCKED;
- }
- clear_bit(GLF_WAITERS2, &gl->gl_flags);
- } else if (!list_empty(&gl->gl_waiters3)) {
- gh = list_entry(gl->gl_waiters3.next,
- struct gfs2_holder, gh_list);
- blocked = rq_promote(gh);
- } else
- break;
-
- if (blocked)
- break;
- }
-}
-
-/**
- * gfs2_glmutex_lock - acquire a local lock on a glock
- * @gl: the glock
- *
- * Gives caller exclusive access to manipulate a glock structure.
- */
-
-static void gfs2_glmutex_lock(struct gfs2_glock *gl)
-{
- spin_lock(&gl->gl_spin);
- if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- struct gfs2_holder gh;
-
- gfs2_holder_init(gl, 0, 0, &gh);
- set_bit(HIF_WAIT, &gh.gh_iflags);
- list_add_tail(&gh.gh_list, &gl->gl_waiters1);
- spin_unlock(&gl->gl_spin);
- wait_on_holder(&gh);
- gfs2_holder_uninit(&gh);
- } else {
- gl->gl_owner_pid = get_pid(task_pid(current));
- gl->gl_ip = (unsigned long)__builtin_return_address(0);
- spin_unlock(&gl->gl_spin);
- }
-}
-
-/**
- * gfs2_glmutex_trylock - try to acquire a local lock on a glock
- * @gl: the glock
- *
- * Returns: 1 if the glock is acquired
- */
-
-static int gfs2_glmutex_trylock(struct gfs2_glock *gl)
-{
- int acquired = 1;
-
- spin_lock(&gl->gl_spin);
- if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- acquired = 0;
- } else {
- gl->gl_owner_pid = get_pid(task_pid(current));
- gl->gl_ip = (unsigned long)__builtin_return_address(0);
- }
- spin_unlock(&gl->gl_spin);
-
- return acquired;
-}
-
-/**
- * gfs2_glmutex_unlock - release a local lock on a glock
- * @gl: the glock
- *
- */
-
-static void gfs2_glmutex_unlock(struct gfs2_glock *gl)
-{
- struct pid *pid;
-
- spin_lock(&gl->gl_spin);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- pid = gl->gl_owner_pid;
- gl->gl_owner_pid = NULL;
- gl->gl_ip = 0;
- run_queue(gl);
- spin_unlock(&gl->gl_spin);
-
- put_pid(pid);
-}
-
-/**
* handle_callback - process a demote request
* @gl: the glock
* @state: the state the caller wants us to change to
@@ -705,398 +824,45 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
{
int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE;
- spin_lock(&gl->gl_spin);
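+	/* gl_demote_state == LM_ST_EXCLUSIVE is the sentinel for "no
+	 * demote pending", since demote requests are only ever to SH,
+	 * DF or UN (see gfs2_demote_wake, which resets it to EX). */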
set_bit(bit, &gl->gl_flags);
if (gl->gl_demote_state == LM_ST_EXCLUSIVE) {
gl->gl_demote_state = state;
gl->gl_demote_time = jiffies;
if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN &&
- gl->gl_object) {
+ gl->gl_object)
gfs2_glock_schedule_for_reclaim(gl);
- spin_unlock(&gl->gl_spin);
- return;
- }
} else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != state) {
- if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
- set_bit(GLF_WAITERS2, &gl->gl_flags);
- else
- gl->gl_demote_state = LM_ST_UNLOCKED;
- }
- spin_unlock(&gl->gl_spin);
-}
-
-/**
- * state_change - record that the glock is now in a different state
- * @gl: the glock
- * @new_state the new state
- *
- */
-
-static void state_change(struct gfs2_glock *gl, unsigned int new_state)
-{
- int held1, held2;
-
- held1 = (gl->gl_state != LM_ST_UNLOCKED);
- held2 = (new_state != LM_ST_UNLOCKED);
-
- if (held1 != held2) {
- if (held2)
- gfs2_glock_hold(gl);
- else
- gfs2_glock_put(gl);
+ gl->gl_demote_state = LM_ST_UNLOCKED;
}
-
- gl->gl_state = new_state;
- gl->gl_tchange = jiffies;
}
/**
- * drop_bh - Called after a lock module unlock completes
- * @gl: the glock
- * @ret: the return status
- *
- * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
- * Doesn't drop the reference on the glock the top half took out
- *
- */
-
-static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- struct gfs2_holder *gh = gl->gl_req_gh;
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, !ret);
-
- state_change(gl, LM_ST_UNLOCKED);
-
- if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) {
- spin_lock(&gl->gl_spin);
- gh->gh_error = 0;
- spin_unlock(&gl->gl_spin);
- gfs2_glock_xmote_th(gl, gl->gl_req_gh);
- gfs2_glock_put(gl);
- return;
- }
-
- spin_lock(&gl->gl_spin);
- gfs2_demote_wake(gl);
- clear_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- gfs2_glock_put(gl);
-}
-
-/**
- * xmote_bh - Called after the lock module is done acquiring a lock
- * @gl: The glock in question
- * @ret: the int returned from the lock module
- *
- */
-
-static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- struct gfs2_holder *gh = gl->gl_req_gh;
- int op_done = 1;
-
- if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
- drop_bh(gl, ret);
- return;
- }
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
-
- state_change(gl, ret & LM_OUT_ST_MASK);
-
- /* Deal with each possible exit condition */
-
- if (!gh) {
- gl->gl_stamp = jiffies;
- if (ret & LM_OUT_CANCELED) {
- op_done = 0;
- } else {
- spin_lock(&gl->gl_spin);
- if (gl->gl_state != gl->gl_demote_state) {
- spin_unlock(&gl->gl_spin);
- gfs2_glock_drop_th(gl);
- gfs2_glock_put(gl);
- return;
- }
- gfs2_demote_wake(gl);
- spin_unlock(&gl->gl_spin);
- }
- } else {
- spin_lock(&gl->gl_spin);
- if (ret & LM_OUT_CONV_DEADLK) {
- gh->gh_error = 0;
- set_bit(GLF_CONV_DEADLK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- gfs2_glock_drop_th(gl);
- gfs2_glock_put(gl);
- return;
- }
- list_del_init(&gh->gh_list);
- gh->gh_error = -EIO;
- if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- goto out;
- gh->gh_error = GLR_CANCELED;
- if (ret & LM_OUT_CANCELED)
- goto out;
- if (relaxed_state_ok(gl->gl_state, gh->gh_state, gh->gh_flags)) {
- list_add_tail(&gh->gh_list, &gl->gl_holders);
- gh->gh_error = 0;
- set_bit(HIF_HOLDER, &gh->gh_iflags);
- set_bit(HIF_FIRST, &gh->gh_iflags);
- op_done = 0;
- goto out;
- }
- gh->gh_error = GLR_TRYFAILED;
- if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))
- goto out;
- gh->gh_error = -EINVAL;
- if (gfs2_assert_withdraw(sdp, 0) == -1)
- fs_err(sdp, "ret = 0x%.8X\n", ret);
-out:
- spin_unlock(&gl->gl_spin);
- }
-
- if (glops->go_xmote_bh)
- glops->go_xmote_bh(gl);
-
- if (op_done) {
- spin_lock(&gl->gl_spin);
- gl->gl_req_gh = NULL;
- clear_bit(GLF_LOCK, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
- }
-
- gfs2_glock_put(gl);
-
- if (gh)
- gfs2_holder_wake(gh);
-}
-
-static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
- unsigned int cur_state, unsigned int req_state,
- unsigned int flags)
-{
- int ret = 0;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
- req_state, flags);
- return ret;
-}
-
-/**
- * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
- * @gl: The glock in question
- * @state: the requested state
- * @flags: modifier flags to the lock call
- *
- */
-
-static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- int flags = gh ? gh->gh_flags : 0;
- unsigned state = gh ? gh->gh_state : gl->gl_demote_state;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- int lck_flags = flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB |
- LM_FLAG_NOEXP | LM_FLAG_ANY |
- LM_FLAG_PRIORITY);
- unsigned int lck_ret;
-
- if (glops->go_xmote_th)
- glops->go_xmote_th(gl);
- if (state == LM_ST_DEFERRED && glops->go_inval)
- glops->go_inval(gl, DIO_METADATA);
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, state != LM_ST_UNLOCKED);
- gfs2_assert_warn(sdp, state != gl->gl_state);
-
- gfs2_glock_hold(gl);
-
- lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
-
- if (gfs2_assert_withdraw(sdp, !(lck_ret & LM_OUT_ERROR)))
- return;
-
- if (lck_ret & LM_OUT_ASYNC)
- gfs2_assert_warn(sdp, lck_ret == LM_OUT_ASYNC);
- else
- xmote_bh(gl, lck_ret);
-}
-
-static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
- unsigned int cur_state)
-{
- int ret = 0;
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
- return ret;
-}
-
-/**
- * gfs2_glock_drop_th - call into the lock module to unlock a lock
- * @gl: the glock
- *
- */
-
-static void gfs2_glock_drop_th(struct gfs2_glock *gl)
-{
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
- unsigned int ret;
-
- if (glops->go_xmote_th)
- glops->go_xmote_th(gl);
- if (glops->go_inval)
- glops->go_inval(gl, DIO_METADATA);
-
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
- gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
- gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
-
- gfs2_glock_hold(gl);
-
- ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
-
- if (gfs2_assert_withdraw(sdp, !(ret & LM_OUT_ERROR)))
- return;
-
- if (!ret)
- drop_bh(gl, ret);
- else
- gfs2_assert_warn(sdp, ret == LM_OUT_ASYNC);
-}
-
-/**
- * do_cancels - cancel requests for locks stuck waiting on an expire flag
- * @gh: the LM_FLAG_PRIORITY holder waiting to acquire the lock
- *
- * Don't cancel GL_NOCANCEL requests.
- */
-
-static void do_cancels(struct gfs2_holder *gh)
-{
- struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
-
- spin_lock(&gl->gl_spin);
-
- while (gl->gl_req_gh != gh &&
- !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
- !list_empty(&gh->gh_list)) {
- if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
- spin_unlock(&gl->gl_spin);
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
- sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
- msleep(100);
- spin_lock(&gl->gl_spin);
- } else {
- spin_unlock(&gl->gl_spin);
- msleep(100);
- spin_lock(&gl->gl_spin);
- }
- }
-
- spin_unlock(&gl->gl_spin);
-}
-
-/**
- * glock_wait_internal - wait on a glock acquisition
+ * gfs2_glock_wait - wait on a glock acquisition
* @gh: the glock holder
*
* Returns: 0 on success
*/
-static int glock_wait_internal(struct gfs2_holder *gh)
+int gfs2_glock_wait(struct gfs2_holder *gh)
{
- struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_sbd *sdp = gl->gl_sbd;
- const struct gfs2_glock_operations *glops = gl->gl_ops;
-
- if (test_bit(HIF_ABORTED, &gh->gh_iflags))
- return -EIO;
-
- if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
- spin_lock(&gl->gl_spin);
- if (gl->gl_req_gh != gh &&
- !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
- !list_empty(&gh->gh_list)) {
- list_del_init(&gh->gh_list);
- gh->gh_error = GLR_TRYFAILED;
- run_queue(gl);
- spin_unlock(&gl->gl_spin);
- return gh->gh_error;
- }
- spin_unlock(&gl->gl_spin);
- }
-
- if (gh->gh_flags & LM_FLAG_PRIORITY)
- do_cancels(gh);
-
wait_on_holder(gh);
- if (gh->gh_error)
- return gh->gh_error;
-
- gfs2_assert_withdraw(sdp, test_bit(HIF_HOLDER, &gh->gh_iflags));
- gfs2_assert_withdraw(sdp, relaxed_state_ok(gl->gl_state, gh->gh_state,
- gh->gh_flags));
-
- if (test_bit(HIF_FIRST, &gh->gh_iflags)) {
- gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
-
- if (glops->go_lock) {
- gh->gh_error = glops->go_lock(gh);
- if (gh->gh_error) {
- spin_lock(&gl->gl_spin);
- list_del_init(&gh->gh_list);
- spin_unlock(&gl->gl_spin);
- }
- }
-
- spin_lock(&gl->gl_spin);
- gl->gl_req_gh = NULL;
- clear_bit(GLF_LOCK, &gl->gl_flags);
- run_queue(gl);
- spin_unlock(&gl->gl_spin);
- }
-
return gh->gh_error;
}
-static inline struct gfs2_holder *
-find_holder_by_owner(struct list_head *head, struct pid *pid)
-{
- struct gfs2_holder *gh;
-
- list_for_each_entry(gh, head, gh_list) {
- if (gh->gh_owner_pid == pid)
- return gh;
- }
-
- return NULL;
-}
-
-static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
{
va_list args;
va_start(args, fmt);
- if (gi) {
+ if (seq) {
+ struct gfs2_glock_iter *gi = seq->private;
vsprintf(gi->string, fmt, args);
- seq_printf(gi->seq, gi->string);
- }
- else
+ seq_printf(seq, gi->string);
+ } else {
+ printk(KERN_ERR " ");
vprintk(fmt, args);
+ }
va_end(args);
}
@@ -1104,50 +870,76 @@ static void print_dbg(struct glock_iter *gi, const char *fmt, ...)
* add_to_queue - Add a holder to the wait queue (but look for recursion)
* @gh: the holder structure to add
*
+ * Eventually we should move the recursive locking trap to a
+ * debugging option or something like that. This is the fast
+ * path and needs to have the minimum number of distractions.
+ *
*/
-static void add_to_queue(struct gfs2_holder *gh)
+static inline void add_to_queue(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- struct gfs2_holder *existing;
+ struct gfs2_sbd *sdp = gl->gl_sbd;
+ struct list_head *insert_pt = NULL;
+ struct gfs2_holder *gh2;
+ int try_lock = 0;
BUG_ON(gh->gh_owner_pid == NULL);
if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags))
BUG();
- if (!(gh->gh_flags & GL_FLOCK)) {
- existing = find_holder_by_owner(&gl->gl_holders,
- gh->gh_owner_pid);
- if (existing) {
- print_symbol(KERN_WARNING "original: %s\n",
- existing->gh_ip);
- printk(KERN_INFO "pid : %d\n",
- pid_nr(existing->gh_owner_pid));
- printk(KERN_INFO "lock type : %d lock state : %d\n",
- existing->gh_gl->gl_name.ln_type,
- existing->gh_gl->gl_state);
- print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
- printk(KERN_INFO "pid : %d\n",
- pid_nr(gh->gh_owner_pid));
- printk(KERN_INFO "lock type : %d lock state : %d\n",
- gl->gl_name.ln_type, gl->gl_state);
- BUG();
- }
-
- existing = find_holder_by_owner(&gl->gl_waiters3,
- gh->gh_owner_pid);
- if (existing) {
- print_symbol(KERN_WARNING "original: %s\n",
- existing->gh_ip);
- print_symbol(KERN_WARNING "new: %s\n", gh->gh_ip);
- BUG();
+ if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+ if (test_bit(GLF_LOCK, &gl->gl_flags))
+ try_lock = 1;
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+ goto fail;
+ }
+
+ list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
+ if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
+ (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
+ goto trap_recursive;
+ if (try_lock &&
+ !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) &&
+ !may_grant(gl, gh)) {
+fail:
+ gh->gh_error = GLR_TRYFAILED;
+ gfs2_holder_wake(gh);
+ return;
}
+ if (test_bit(HIF_HOLDER, &gh2->gh_iflags))
+ continue;
+ if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
+ insert_pt = &gh2->gh_list;
+ }
+ if (likely(insert_pt == NULL)) {
+ list_add_tail(&gh->gh_list, &gl->gl_holders);
+ if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
+ goto do_cancel;
+ return;
+ }
+ list_add_tail(&gh->gh_list, insert_pt);
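+	/* A priority holder was queued ahead of ordinary waiters:
+	 * cancel any request already pending with the lock module so
+	 * that the priority request can be serviced sooner. */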
+do_cancel:
+ gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
+ if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
+ spin_unlock(&gl->gl_spin);
+ if (sdp->sd_lockstruct.ls_ops->lm_cancel)
+ sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
+ spin_lock(&gl->gl_spin);
}
+ return;
- if (gh->gh_flags & LM_FLAG_PRIORITY)
- list_add(&gh->gh_list, &gl->gl_waiters3);
- else
- list_add_tail(&gh->gh_list, &gl->gl_waiters3);
+trap_recursive:
+ print_symbol(KERN_ERR "original: %s\n", gh2->gh_ip);
+ printk(KERN_ERR "pid: %d\n", pid_nr(gh2->gh_owner_pid));
+ printk(KERN_ERR "lock type: %d req lock state : %d\n",
+ gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
+ print_symbol(KERN_ERR "new: %s\n", gh->gh_ip);
+ printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid));
+ printk(KERN_ERR "lock type: %d req lock state : %d\n",
+ gh->gh_gl->gl_name.ln_type, gh->gh_state);
+ __dump_glock(NULL, gl);
+ BUG();
}
/**
@@ -1165,24 +957,16 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
struct gfs2_sbd *sdp = gl->gl_sbd;
int error = 0;
-restart:
- if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
- set_bit(HIF_ABORTED, &gh->gh_iflags);
+ if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
return -EIO;
- }
spin_lock(&gl->gl_spin);
add_to_queue(gh);
- run_queue(gl);
+ run_queue(gl, 1);
spin_unlock(&gl->gl_spin);
- if (!(gh->gh_flags & GL_ASYNC)) {
- error = glock_wait_internal(gh);
- if (error == GLR_CANCELED) {
- msleep(100);
- goto restart;
- }
- }
+ if (!(gh->gh_flags & GL_ASYNC))
+ error = gfs2_glock_wait(gh);
return error;
}
@@ -1196,48 +980,7 @@ restart:
int gfs2_glock_poll(struct gfs2_holder *gh)
{
- struct gfs2_glock *gl = gh->gh_gl;
- int ready = 0;
-
- spin_lock(&gl->gl_spin);
-
- if (test_bit(HIF_HOLDER, &gh->gh_iflags))
- ready = 1;
- else if (list_empty(&gh->gh_list)) {
- if (gh->gh_error == GLR_CANCELED) {
- spin_unlock(&gl->gl_spin);
- msleep(100);
- if (gfs2_glock_nq(gh))
- return 1;
- return 0;
- } else
- ready = 1;
- }
-
- spin_unlock(&gl->gl_spin);
-
- return ready;
-}
-
-/**
- * gfs2_glock_wait - wait for a lock acquisition that ended in a GLR_ASYNC
- * @gh: the holder structure
- *
- * Returns: 0, GLR_TRYFAILED, or errno on failure
- */
-
-int gfs2_glock_wait(struct gfs2_holder *gh)
-{
- int error;
-
- error = glock_wait_internal(gh);
- if (error == GLR_CANCELED) {
- msleep(100);
- gh->gh_flags &= ~GL_ASYNC;
- error = gfs2_glock_nq(gh);
- }
-
- return error;
+ return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
}
/**
@@ -1251,26 +994,30 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
struct gfs2_glock *gl = gh->gh_gl;
const struct gfs2_glock_operations *glops = gl->gl_ops;
unsigned delay = 0;
+ int fast_path = 0;
+ spin_lock(&gl->gl_spin);
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- gfs2_glmutex_lock(gl);
-
- spin_lock(&gl->gl_spin);
list_del_init(&gh->gh_list);
-
- if (list_empty(&gl->gl_holders)) {
+ if (find_first_holder(gl) == NULL) {
if (glops->go_unlock) {
+ GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
spin_unlock(&gl->gl_spin);
glops->go_unlock(gh);
spin_lock(&gl->gl_spin);
+ clear_bit(GLF_LOCK, &gl->gl_flags);
}
gl->gl_stamp = jiffies;
+ if (list_empty(&gl->gl_holders) &&
+ !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+ !test_bit(GLF_DEMOTE, &gl->gl_flags))
+ fast_path = 1;
}
-
- clear_bit(GLF_LOCK, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
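+	/* Fast path: no holders left and no demote pending, so there
+	 * is nothing for the glock workqueue to do on our behalf. */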
+ if (likely(fast_path))
+ return;
gfs2_glock_hold(gl);
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
@@ -1454,6 +1201,8 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
{
int error = -EIO;
+ if (!sdp->sd_lockstruct.ls_ops->lm_hold_lvb)
+ return 0;
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
return error;
@@ -1469,20 +1218,14 @@ int gfs2_lvb_hold(struct gfs2_glock *gl)
{
int error;
- gfs2_glmutex_lock(gl);
-
if (!atomic_read(&gl->gl_lvb_count)) {
error = gfs2_lm_hold_lvb(gl->gl_sbd, gl->gl_lock, &gl->gl_lvb);
- if (error) {
- gfs2_glmutex_unlock(gl);
+ if (error)
return error;
- }
gfs2_glock_hold(gl);
}
atomic_inc(&gl->gl_lvb_count);
- gfs2_glmutex_unlock(gl);
-
return 0;
}
@@ -1497,17 +1240,13 @@ void gfs2_lvb_unhold(struct gfs2_glock *gl)
struct gfs2_sbd *sdp = gl->gl_sbd;
gfs2_glock_hold(gl);
- gfs2_glmutex_lock(gl);
-
gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
if (atomic_dec_and_test(&gl->gl_lvb_count)) {
- if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+ if (sdp->sd_lockstruct.ls_ops->lm_unhold_lvb)
sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
gl->gl_lvb = NULL;
gfs2_glock_put(gl);
}
-
- gfs2_glmutex_unlock(gl);
gfs2_glock_put(gl);
}
@@ -1527,7 +1266,9 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name,
if (time_before(now, holdtime))
delay = holdtime - now;
+ spin_lock(&gl->gl_spin);
handle_callback(gl, state, 1, delay);
+ spin_unlock(&gl->gl_spin);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
gfs2_glock_put(gl);
}
@@ -1568,7 +1309,8 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
gl = gfs2_glock_find(sdp, &async->lc_name);
if (gfs2_assert_warn(sdp, gl))
return;
- xmote_bh(gl, async->lc_ret);
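+		/* Stash the reply and let the glock workqueue run the
+		 * state machine, rather than doing it from the
+		 * callback context. */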
+ gl->gl_reply = async->lc_ret;
+ set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
up_read(&gfs2_umount_flush_sem);
@@ -1581,11 +1323,6 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
wake_up_process(sdp->sd_recoverd_process);
return;
- case LM_CB_DROPLOCKS:
- gfs2_gl_hash_clear(sdp, NO_WAIT);
- gfs2_quota_scan(sdp);
- return;
-
default:
gfs2_assert_warn(sdp, 0);
return;
@@ -1646,6 +1383,7 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
{
struct gfs2_glock *gl;
+ int done_callback = 0;
spin_lock(&sdp->sd_reclaim_lock);
if (list_empty(&sdp->sd_reclaim_list)) {
@@ -1660,14 +1398,16 @@ void gfs2_reclaim_glock(struct gfs2_sbd *sdp)
atomic_dec(&sdp->sd_reclaim_count);
atomic_inc(&sdp->sd_reclaimed);
- if (gfs2_glmutex_trylock(gl)) {
- if (list_empty(&gl->gl_holders) &&
- gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
- handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- gfs2_glmutex_unlock(gl);
+ spin_lock(&gl->gl_spin);
+ if (find_first_holder(gl) == NULL &&
+ gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) {
+ handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+ done_callback = 1;
}
-
- gfs2_glock_put(gl);
+ spin_unlock(&gl->gl_spin);
+ if (!done_callback ||
+ queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
}
/**
@@ -1724,18 +1464,14 @@ static void scan_glock(struct gfs2_glock *gl)
{
if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object)
return;
+ if (test_bit(GLF_LOCK, &gl->gl_flags))
+ return;
- if (gfs2_glmutex_trylock(gl)) {
- if (list_empty(&gl->gl_holders) &&
- gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
- goto out_schedule;
- gfs2_glmutex_unlock(gl);
- }
- return;
-
-out_schedule:
- gfs2_glmutex_unlock(gl);
- gfs2_glock_schedule_for_reclaim(gl);
+ spin_lock(&gl->gl_spin);
+ if (find_first_holder(gl) == NULL &&
+ gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl))
+ gfs2_glock_schedule_for_reclaim(gl);
+ spin_unlock(&gl->gl_spin);
}
/**
@@ -1760,12 +1496,13 @@ static void clear_glock(struct gfs2_glock *gl)
spin_unlock(&sdp->sd_reclaim_lock);
}
- if (gfs2_glmutex_trylock(gl)) {
- if (list_empty(&gl->gl_holders) &&
- gl->gl_state != LM_ST_UNLOCKED)
- handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
- gfs2_glmutex_unlock(gl);
- }
+ spin_lock(&gl->gl_spin);
+ if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED)
+ handle_callback(gl, LM_ST_UNLOCKED, 0, 0);
+ spin_unlock(&gl->gl_spin);
+ gfs2_glock_hold(gl);
+ if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
+ gfs2_glock_put(gl);
}
/**
@@ -1773,11 +1510,10 @@ static void clear_glock(struct gfs2_glock *gl)
* @sdp: the filesystem
- * @wait: wait until it's all gone
*
- * Called when unmounting the filesystem, or when inter-node lock manager
- * requests DROPLOCKS because it is running out of capacity.
+ * Called when unmounting the filesystem.
*/
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
{
unsigned long t;
unsigned int x;
@@ -1792,7 +1528,7 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
cont = 1;
}
- if (!wait || !cont)
+ if (!cont)
break;
if (time_after_eq(jiffies,
@@ -1810,180 +1546,164 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait)
}
}
-/*
- * Diagnostic routines to help debug distributed deadlock
- */
-
-static void gfs2_print_symbol(struct glock_iter *gi, const char *fmt,
- unsigned long address)
+static const char *state2str(unsigned state)
{
- char buffer[KSYM_SYMBOL_LEN];
-
- sprint_symbol(buffer, address);
- print_dbg(gi, fmt, buffer);
+ switch(state) {
+ case LM_ST_UNLOCKED:
+ return "UN";
+ case LM_ST_SHARED:
+ return "SH";
+ case LM_ST_DEFERRED:
+ return "DF";
+ case LM_ST_EXCLUSIVE:
+ return "EX";
+ }
+ return "??";
+}
+
+static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
+{
+ char *p = buf;
+ if (flags & LM_FLAG_TRY)
+ *p++ = 't';
+ if (flags & LM_FLAG_TRY_1CB)
+ *p++ = 'T';
+ if (flags & LM_FLAG_NOEXP)
+ *p++ = 'e';
+ if (flags & LM_FLAG_ANY)
+ *p++ = 'a';
+ if (flags & LM_FLAG_PRIORITY)
+ *p++ = 'p';
+ if (flags & GL_ASYNC)
+ *p++ = 'a';
+ if (flags & GL_EXACT)
+ *p++ = 'E';
+ if (flags & GL_ATIME)
+ *p++ = 'a';
+ if (flags & GL_NOCACHE)
+ *p++ = 'c';
+ if (test_bit(HIF_HOLDER, &iflags))
+ *p++ = 'H';
+ if (test_bit(HIF_WAIT, &iflags))
+ *p++ = 'W';
+ if (test_bit(HIF_FIRST, &iflags))
+ *p++ = 'F';
+ *p = 0;
+ return buf;
}
/**
* dump_holder - print information about a glock holder
- * @str: a string naming the type of holder
+ * @seq: the seq_file struct
* @gh: the glock holder
*
* Returns: 0 on success, -ENOBUFS when we run out of space
*/
-static int dump_holder(struct glock_iter *gi, char *str,
- struct gfs2_holder *gh)
+static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
{
- unsigned int x;
- struct task_struct *gh_owner;
+ struct task_struct *gh_owner = NULL;
+ char buffer[KSYM_SYMBOL_LEN];
+ char flags_buf[32];
- print_dbg(gi, " %s\n", str);
- if (gh->gh_owner_pid) {
- print_dbg(gi, " owner = %ld ",
- (long)pid_nr(gh->gh_owner_pid));
+ sprint_symbol(buffer, gh->gh_ip);
+ if (gh->gh_owner_pid)
gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
- if (gh_owner)
- print_dbg(gi, "(%s)\n", gh_owner->comm);
- else
- print_dbg(gi, "(ended)\n");
- } else
- print_dbg(gi, " owner = -1\n");
- print_dbg(gi, " gh_state = %u\n", gh->gh_state);
- print_dbg(gi, " gh_flags =");
- for (x = 0; x < 32; x++)
- if (gh->gh_flags & (1 << x))
- print_dbg(gi, " %u", x);
- print_dbg(gi, " \n");
- print_dbg(gi, " error = %d\n", gh->gh_error);
- print_dbg(gi, " gh_iflags =");
- for (x = 0; x < 32; x++)
- if (test_bit(x, &gh->gh_iflags))
- print_dbg(gi, " %u", x);
- print_dbg(gi, " \n");
- gfs2_print_symbol(gi, " initialized at: %s\n", gh->gh_ip);
-
+ gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %s\n",
+ state2str(gh->gh_state),
+ hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags),
+ gh->gh_error,
+ gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
+ gh_owner ? gh_owner->comm : "(ended)", buffer);
return 0;
}
-/**
- * dump_inode - print information about an inode
- * @ip: the inode
- *
- * Returns: 0 on success, -ENOBUFS when we run out of space
- */
-
-static int dump_inode(struct glock_iter *gi, struct gfs2_inode *ip)
-{
- unsigned int x;
-
- print_dbg(gi, " Inode:\n");
- print_dbg(gi, " num = %llu/%llu\n",
- (unsigned long long)ip->i_no_formal_ino,
- (unsigned long long)ip->i_no_addr);
- print_dbg(gi, " type = %u\n", IF2DT(ip->i_inode.i_mode));
- print_dbg(gi, " i_flags =");
- for (x = 0; x < 32; x++)
- if (test_bit(x, &ip->i_flags))
- print_dbg(gi, " %u", x);
- print_dbg(gi, " \n");
- return 0;
+static const char *gflags2str(char *buf, const unsigned long *gflags)
+{
+ char *p = buf;
+ if (test_bit(GLF_LOCK, gflags))
+ *p++ = 'l';
+ if (test_bit(GLF_STICKY, gflags))
+ *p++ = 's';
+ if (test_bit(GLF_DEMOTE, gflags))
+ *p++ = 'D';
+ if (test_bit(GLF_PENDING_DEMOTE, gflags))
+ *p++ = 'd';
+ if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags))
+ *p++ = 'p';
+ if (test_bit(GLF_DIRTY, gflags))
+ *p++ = 'y';
+ if (test_bit(GLF_LFLUSH, gflags))
+ *p++ = 'f';
+ if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags))
+ *p++ = 'i';
+ if (test_bit(GLF_REPLY_PENDING, gflags))
+ *p++ = 'r';
+ *p = 0;
+ return buf;
}
/**
- * dump_glock - print information about a glock
+ * __dump_glock - print information about a glock
+ * @seq: The seq_file struct
* @gl: the glock
- * @count: where we are in the buffer
+ *
+ * The file format is as follows:
+ * One line per object; capital letters indicate the object type:
+ * G = glock, I = inode, R = rgrp, H = holder. Glocks are not indented;
+ * other objects are indented by a single space and follow the glock
+ * to which they relate. Fields are indicated by lower case letters
+ * followed by a colon and the field value, except for strings, which
+ * are enclosed in [] so that it's possible to see whether they are
+ * composed of spaces. The fields are n = number (id of the object), f = flags,
+ * t = type, s = state, r = refcount, e = error, p = pid.
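+ *
+ * For example (illustrative values only), an inode glock held in
+ * shared mode by a single process might appear as:
+ *
+ *   G: s:SH n:2/8837 f: t:SH d:EX/0 l:0 a:0 r:3
+ *    H: s:SH f:H e:0 p:1223 [bash] gfs2_lookupi+0x14e/0x3dc [gfs2]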
*
* Returns: 0 on success, -ENOBUFS when we run out of space
*/
-static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
+static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
{
- struct gfs2_holder *gh;
- unsigned int x;
- int error = -ENOBUFS;
- struct task_struct *gl_owner;
+ const struct gfs2_glock_operations *glops = gl->gl_ops;
+ unsigned long long dtime;
+ const struct gfs2_holder *gh;
+ char gflags_buf[32];
+ int error = 0;
- spin_lock(&gl->gl_spin);
+ dtime = jiffies - gl->gl_demote_time;
+ dtime *= 1000000/HZ; /* demote time in uSec */
+ if (!test_bit(GLF_DEMOTE, &gl->gl_flags))
+ dtime = 0;
+ gfs2_print_dbg(seq, "G: s:%s n:%u/%llu f:%s t:%s d:%s/%llu l:%d a:%d r:%d\n",
+ state2str(gl->gl_state),
+ gl->gl_name.ln_type,
+ (unsigned long long)gl->gl_name.ln_number,
+ gflags2str(gflags_buf, &gl->gl_flags),
+ state2str(gl->gl_target),
+ state2str(gl->gl_demote_state), dtime,
+ atomic_read(&gl->gl_lvb_count),
+ atomic_read(&gl->gl_ail_count),
+ atomic_read(&gl->gl_ref));
- print_dbg(gi, "Glock 0x%p (%u, 0x%llx)\n", gl, gl->gl_name.ln_type,
- (unsigned long long)gl->gl_name.ln_number);
- print_dbg(gi, " gl_flags =");
- for (x = 0; x < 32; x++) {
- if (test_bit(x, &gl->gl_flags))
- print_dbg(gi, " %u", x);
- }
- if (!test_bit(GLF_LOCK, &gl->gl_flags))
- print_dbg(gi, " (unlocked)");
- print_dbg(gi, " \n");
- print_dbg(gi, " gl_ref = %d\n", atomic_read(&gl->gl_ref));
- print_dbg(gi, " gl_state = %u\n", gl->gl_state);
- if (gl->gl_owner_pid) {
- gl_owner = pid_task(gl->gl_owner_pid, PIDTYPE_PID);
- if (gl_owner)
- print_dbg(gi, " gl_owner = pid %d (%s)\n",
- pid_nr(gl->gl_owner_pid), gl_owner->comm);
- else
- print_dbg(gi, " gl_owner = %d (ended)\n",
- pid_nr(gl->gl_owner_pid));
- } else
- print_dbg(gi, " gl_owner = -1\n");
- print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip);
- print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
- print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
- print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
- print_dbg(gi, " reclaim = %s\n",
- (list_empty(&gl->gl_reclaim)) ? "no" : "yes");
- if (gl->gl_aspace)
- print_dbg(gi, " aspace = 0x%p nrpages = %lu\n", gl->gl_aspace,
- gl->gl_aspace->i_mapping->nrpages);
- else
- print_dbg(gi, " aspace = no\n");
- print_dbg(gi, " ail = %d\n", atomic_read(&gl->gl_ail_count));
- if (gl->gl_req_gh) {
- error = dump_holder(gi, "Request", gl->gl_req_gh);
- if (error)
- goto out;
- }
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
- error = dump_holder(gi, "Holder", gh);
+ error = dump_holder(seq, gh);
if (error)
goto out;
}
- list_for_each_entry(gh, &gl->gl_waiters1, gh_list) {
- error = dump_holder(gi, "Waiter1", gh);
- if (error)
- goto out;
- }
- list_for_each_entry(gh, &gl->gl_waiters3, gh_list) {
- error = dump_holder(gi, "Waiter3", gh);
- if (error)
- goto out;
- }
- if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
- print_dbg(gi, " Demotion req to state %u (%llu uS ago)\n",
- gl->gl_demote_state, (unsigned long long)
- (jiffies - gl->gl_demote_time)*(1000000/HZ));
- }
- if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) {
- if (!test_bit(GLF_LOCK, &gl->gl_flags) &&
- list_empty(&gl->gl_holders)) {
- error = dump_inode(gi, gl->gl_object);
- if (error)
- goto out;
- } else {
- error = -ENOBUFS;
- print_dbg(gi, " Inode: busy\n");
- }
- }
-
- error = 0;
-
+ if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
+ error = glops->go_dump(seq, gl);
out:
- spin_unlock(&gl->gl_spin);
return error;
}
+static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
+{
+ int ret;
+ spin_lock(&gl->gl_spin);
+ ret = __dump_glock(seq, gl);
+ spin_unlock(&gl->gl_spin);
+ return ret;
+}
+
/**
* gfs2_dump_lockstate - print out the current lockstate
* @sdp: the filesystem
@@ -2086,7 +1806,7 @@ void gfs2_glock_exit(void)
module_param(scand_secs, uint, S_IRUGO|S_IWUSR);
MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs");
-static int gfs2_glock_iter_next(struct glock_iter *gi)
+static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
{
struct gfs2_glock *gl;
@@ -2104,7 +1824,7 @@ restart:
gfs2_glock_put(gl);
if (gl && gi->gl == NULL)
gi->hash++;
- while(gi->gl == NULL) {
+ while (gi->gl == NULL) {
if (gi->hash >= GFS2_GL_HASH_SIZE)
return 1;
read_lock(gl_lock_addr(gi->hash));
@@ -2122,58 +1842,34 @@ restart:
return 0;
}
-static void gfs2_glock_iter_free(struct glock_iter *gi)
+static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
{
if (gi->gl)
gfs2_glock_put(gi->gl);
- kfree(gi);
-}
-
-static struct glock_iter *gfs2_glock_iter_init(struct gfs2_sbd *sdp)
-{
- struct glock_iter *gi;
-
- gi = kmalloc(sizeof (*gi), GFP_KERNEL);
- if (!gi)
- return NULL;
-
- gi->sdp = sdp;
- gi->hash = 0;
- gi->seq = NULL;
gi->gl = NULL;
- memset(gi->string, 0, sizeof(gi->string));
-
- if (gfs2_glock_iter_next(gi)) {
- gfs2_glock_iter_free(gi);
- return NULL;
- }
-
- return gi;
}
-static void *gfs2_glock_seq_start(struct seq_file *file, loff_t *pos)
+static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct glock_iter *gi;
+ struct gfs2_glock_iter *gi = seq->private;
loff_t n = *pos;
- gi = gfs2_glock_iter_init(file->private);
- if (!gi)
- return NULL;
+ gi->hash = 0;
- while(n--) {
+ do {
if (gfs2_glock_iter_next(gi)) {
gfs2_glock_iter_free(gi);
return NULL;
}
- }
+ } while (n--);
- return gi;
+ return gi->gl;
}
-static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
+static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
loff_t *pos)
{
- struct glock_iter *gi = iter_ptr;
+ struct gfs2_glock_iter *gi = seq->private;
(*pos)++;
@@ -2182,24 +1878,18 @@ static void *gfs2_glock_seq_next(struct seq_file *file, void *iter_ptr,
return NULL;
}
- return gi;
+ return gi->gl;
}
-static void gfs2_glock_seq_stop(struct seq_file *file, void *iter_ptr)
+static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
{
- struct glock_iter *gi = iter_ptr;
- if (gi)
- gfs2_glock_iter_free(gi);
+ struct gfs2_glock_iter *gi = seq->private;
+ gfs2_glock_iter_free(gi);
}
-static int gfs2_glock_seq_show(struct seq_file *file, void *iter_ptr)
+static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
{
- struct glock_iter *gi = iter_ptr;
-
- gi->seq = file;
- dump_glock(gi, gi->gl);
-
- return 0;
+ return dump_glock(seq, iter_ptr);
}
static const struct seq_operations gfs2_glock_seq_ops = {
@@ -2211,17 +1901,14 @@ static const struct seq_operations gfs2_glock_seq_ops = {
static int gfs2_debugfs_open(struct inode *inode, struct file *file)
{
- struct seq_file *seq;
- int ret;
-
- ret = seq_open(file, &gfs2_glock_seq_ops);
- if (ret)
- return ret;
-
- seq = file->private_data;
- seq->private = inode->i_private;
-
- return 0;
+ int ret = seq_open_private(file, &gfs2_glock_seq_ops,
+ sizeof(struct gfs2_glock_iter));
+ if (ret == 0) {
+ struct seq_file *seq = file->private_data;
+ struct gfs2_glock_iter *gi = seq->private;
+ gi->sdp = inode->i_private;
+ }
+ return ret;
}
static const struct file_operations gfs2_debug_fops = {
@@ -2229,7 +1916,7 @@ static const struct file_operations gfs2_debug_fops = {
.open = gfs2_debugfs_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release
+ .release = seq_release_private,
};
int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index cdad3e6f8150..971d92af70fc 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -26,11 +26,8 @@
#define GL_SKIP 0x00000100
#define GL_ATIME 0x00000200
#define GL_NOCACHE 0x00000400
-#define GL_FLOCK 0x00000800
-#define GL_NOCANCEL 0x00001000
#define GLR_TRYFAILED 13
-#define GLR_CANCELED 14
static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
{
@@ -41,6 +38,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
spin_lock(&gl->gl_spin);
pid = task_pid(current);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+ if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+ break;
if (gh->gh_owner_pid == pid)
goto out;
}
@@ -70,7 +69,7 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
{
int ret;
spin_lock(&gl->gl_spin);
- ret = test_bit(GLF_DEMOTE, &gl->gl_flags) || !list_empty(&gl->gl_waiters3);
+ ret = test_bit(GLF_DEMOTE, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
return ret;
}
@@ -98,6 +97,7 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
/**
 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
@@ -130,10 +130,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl);
void gfs2_lvb_unhold(struct gfs2_glock *gl);
void gfs2_glock_cb(void *cb_data, unsigned int type, void *data);
-
void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
-void gfs2_gl_hash_clear(struct gfs2_sbd *sdp, int wait);
+void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
int __init gfs2_glock_init(void);
void gfs2_glock_exit(void);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index d31badadef8f..c6c318c2a0f6 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
#include <linux/buffer_head.h>
#include <linux/gfs2_ondisk.h>
#include <linux/lm_interface.h>
+#include <linux/bio.h>
#include "gfs2.h"
#include "incore.h"
@@ -172,26 +173,6 @@ static void inode_go_sync(struct gfs2_glock *gl)
}
/**
- * inode_go_xmote_bh - After promoting/demoting a glock
- * @gl: the glock
- *
- */
-
-static void inode_go_xmote_bh(struct gfs2_glock *gl)
-{
- struct gfs2_holder *gh = gl->gl_req_gh;
- struct buffer_head *bh;
- int error;
-
- if (gl->gl_state != LM_ST_UNLOCKED &&
- (!gh || !(gh->gh_flags & GL_SKIP))) {
- error = gfs2_meta_read(gl, gl->gl_name.ln_number, 0, &bh);
- if (!error)
- brelse(bh);
- }
-}
-
-/**
 * inode_go_inval - prepare an inode glock to be released
* @gl: the glock
* @flags:
@@ -249,7 +230,7 @@ static int inode_go_lock(struct gfs2_holder *gh)
struct gfs2_inode *ip = gl->gl_object;
int error = 0;
- if (!ip)
+ if (!ip || (gh->gh_flags & GL_SKIP))
return 0;
if (test_bit(GIF_INVALID, &ip->i_flags)) {
@@ -267,6 +248,26 @@ static int inode_go_lock(struct gfs2_holder *gh)
}
/**
+ * inode_go_dump - print information about an inode
+ * @seq: The iterator
+ * @gl: The glock
+ *
+ * Returns: 0 on success, -ENOBUFS when we run out of space
+ */
+
+static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+ const struct gfs2_inode *ip = gl->gl_object;
+ if (ip == NULL)
+ return 0;
+ gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n",
+ (unsigned long long)ip->i_no_formal_ino,
+ (unsigned long long)ip->i_no_addr,
+ IF2DT(ip->i_inode.i_mode), ip->i_flags);
+ return 0;
+}
+
+/**
* rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
* @gl: the glock
*
@@ -306,6 +307,22 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
}
/**
+ * rgrp_go_dump - print out an rgrp
+ * @seq: The iterator
+ * @gl: The glock in question
+ *
+ */
+
+static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+ const struct gfs2_rgrpd *rgd = gl->gl_object;
+ if (rgd == NULL)
+ return 0;
+ gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr);
+ return 0;
+}
+
+/**
* trans_go_sync - promote/demote the transaction glock
* @gl: the glock
* @state: the requested state
@@ -330,7 +347,7 @@ static void trans_go_sync(struct gfs2_glock *gl)
*
*/
-static void trans_go_xmote_bh(struct gfs2_glock *gl)
+static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
{
struct gfs2_sbd *sdp = gl->gl_sbd;
struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
@@ -338,8 +355,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
struct gfs2_log_header_host head;
int error;
- if (gl->gl_state != LM_ST_UNLOCKED &&
- test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -354,6 +370,7 @@ static void trans_go_xmote_bh(struct gfs2_glock *gl)
gfs2_log_pointers_init(sdp, head.lh_blkno);
}
}
+ return 0;
}
/**
@@ -375,12 +392,12 @@ const struct gfs2_glock_operations gfs2_meta_glops = {
const struct gfs2_glock_operations gfs2_inode_glops = {
.go_xmote_th = inode_go_sync,
- .go_xmote_bh = inode_go_xmote_bh,
.go_inval = inode_go_inval,
.go_demote_ok = inode_go_demote_ok,
.go_lock = inode_go_lock,
+ .go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
- .go_min_hold_time = HZ / 10,
+ .go_min_hold_time = HZ / 5,
};
const struct gfs2_glock_operations gfs2_rgrp_glops = {
@@ -389,8 +406,9 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
.go_demote_ok = rgrp_go_demote_ok,
.go_lock = rgrp_go_lock,
.go_unlock = rgrp_go_unlock,
+ .go_dump = rgrp_go_dump,
.go_type = LM_TYPE_RGRP,
- .go_min_hold_time = HZ / 10,
+ .go_min_hold_time = HZ / 5,
};
const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 9c2c0b90b22a..448697a5c462 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -77,7 +77,6 @@ struct gfs2_rgrp_host {
struct gfs2_rgrpd {
struct list_head rd_list; /* Link with superblock */
struct list_head rd_list_mru;
- struct list_head rd_recent; /* Recently used rgrps */
struct gfs2_glock *rd_gl; /* Glock for this rgrp */
u64 rd_addr; /* grp block disk address */
u64 rd_data0; /* first data location */
@@ -128,20 +127,20 @@ struct gfs2_bufdata {
struct gfs2_glock_operations {
void (*go_xmote_th) (struct gfs2_glock *gl);
- void (*go_xmote_bh) (struct gfs2_glock *gl);
+ int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh);
void (*go_inval) (struct gfs2_glock *gl, int flags);
int (*go_demote_ok) (struct gfs2_glock *gl);
int (*go_lock) (struct gfs2_holder *gh);
void (*go_unlock) (struct gfs2_holder *gh);
+ int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
const int go_type;
const unsigned long go_min_hold_time;
};
enum {
/* States */
- HIF_HOLDER = 6,
+ HIF_HOLDER = 6, /* Set for gh that "holds" the glock */
HIF_FIRST = 7,
- HIF_ABORTED = 9,
HIF_WAIT = 10,
};
@@ -154,20 +153,20 @@ struct gfs2_holder {
unsigned gh_flags;
int gh_error;
- unsigned long gh_iflags;
+ unsigned long gh_iflags; /* HIF_... */
unsigned long gh_ip;
};
enum {
- GLF_LOCK = 1,
- GLF_STICKY = 2,
- GLF_DEMOTE = 3,
- GLF_PENDING_DEMOTE = 4,
- GLF_DIRTY = 5,
- GLF_DEMOTE_IN_PROGRESS = 6,
- GLF_LFLUSH = 7,
- GLF_WAITERS2 = 8,
- GLF_CONV_DEADLK = 9,
+ GLF_LOCK = 1,
+ GLF_STICKY = 2,
+ GLF_DEMOTE = 3,
+ GLF_PENDING_DEMOTE = 4,
+ GLF_DEMOTE_IN_PROGRESS = 5,
+ GLF_DIRTY = 6,
+ GLF_LFLUSH = 7,
+ GLF_INVALIDATE_IN_PROGRESS = 8,
+ GLF_REPLY_PENDING = 9,
};
struct gfs2_glock {
@@ -179,19 +178,14 @@ struct gfs2_glock {
spinlock_t gl_spin;
unsigned int gl_state;
+ unsigned int gl_target;
+ unsigned int gl_reply;
unsigned int gl_hash;
unsigned int gl_demote_state; /* state requested by remote node */
unsigned long gl_demote_time; /* time of first demote request */
- struct pid *gl_owner_pid;
- unsigned long gl_ip;
struct list_head gl_holders;
- struct list_head gl_waiters1; /* HIF_MUTEX */
- struct list_head gl_waiters3; /* HIF_PROMOTE */
const struct gfs2_glock_operations *gl_ops;
-
- struct gfs2_holder *gl_req_gh;
-
void *gl_lock;
char *gl_lvb;
atomic_t gl_lvb_count;
@@ -236,6 +230,7 @@ enum {
GIF_INVALID = 0,
GIF_QD_LOCKED = 1,
GIF_SW_PAGED = 3,
+ GIF_USER = 4, /* user inode, not metadata addr space */
};
struct gfs2_dinode_host {
@@ -426,7 +421,6 @@ struct gfs2_tune {
unsigned int gt_quota_quantum; /* Secs between syncs to quota file */
unsigned int gt_atime_quantum; /* Min secs between atime updates */
unsigned int gt_new_files_jdata;
- unsigned int gt_new_files_directio;
unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */
unsigned int gt_stall_secs; /* Detects trouble! */
unsigned int gt_complain_secs;
@@ -533,7 +527,6 @@ struct gfs2_sbd {
struct mutex sd_rindex_mutex;
struct list_head sd_rindex_list;
struct list_head sd_rindex_mru_list;
- struct list_head sd_rindex_recent_list;
struct gfs2_rgrpd *sd_rindex_forward;
unsigned int sd_rgrps;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 3a9ef526c308..6da0ab355b8a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -47,8 +47,7 @@ static int iget_test(struct inode *inode, void *opaque)
struct gfs2_inode *ip = GFS2_I(inode);
u64 *no_addr = opaque;
- if (ip->i_no_addr == *no_addr &&
- inode->i_private != NULL)
+ if (ip->i_no_addr == *no_addr && test_bit(GIF_USER, &ip->i_flags))
return 1;
return 0;
@@ -61,6 +60,7 @@ static int iget_set(struct inode *inode, void *opaque)
inode->i_ino = (unsigned long)*no_addr;
ip->i_no_addr = *no_addr;
+ set_bit(GIF_USER, &ip->i_flags);
return 0;
}
@@ -86,7 +86,7 @@ static int iget_skip_test(struct inode *inode, void *opaque)
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_skip_data *data = opaque;
- if (ip->i_no_addr == data->no_addr && inode->i_private != NULL){
+ if (ip->i_no_addr == data->no_addr && test_bit(GIF_USER, &ip->i_flags)){
if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)){
data->skipped = 1;
return 0;
@@ -105,6 +105,7 @@ static int iget_skip_set(struct inode *inode, void *opaque)
return 1;
inode->i_ino = (unsigned long)(data->no_addr);
ip->i_no_addr = data->no_addr;
+ set_bit(GIF_USER, &ip->i_flags);
return 0;
}
@@ -166,7 +167,7 @@ void gfs2_set_iop(struct inode *inode)
* Returns: A VFS inode, or an error
*/
-struct inode *gfs2_inode_lookup(struct super_block *sb,
+struct inode *gfs2_inode_lookup(struct super_block *sb,
unsigned int type,
u64 no_addr,
u64 no_formal_ino, int skip_freeing)
@@ -187,7 +188,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
if (inode->i_state & I_NEW) {
struct gfs2_sbd *sdp = GFS2_SB(inode);
- inode->i_private = ip;
ip->i_no_formal_ino = no_formal_ino;
error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
@@ -504,7 +504,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
}
if (!is_root) {
- error = permission(dir, MAY_EXEC, NULL);
+ error = gfs2_permission(dir, MAY_EXEC);
if (error)
goto out;
}
@@ -667,7 +667,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
{
int error;
- error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
if (error)
return error;
@@ -789,13 +789,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) ||
gfs2_tune_get(sdp, gt_new_files_jdata))
di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA);
- if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_DIRECTIO) ||
- gfs2_tune_get(sdp, gt_new_files_directio))
- di->di_flags |= cpu_to_be32(GFS2_DIF_DIRECTIO);
} else if (S_ISDIR(mode)) {
di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
- GFS2_DIF_INHERIT_DIRECTIO);
- di->di_flags |= cpu_to_be32(dip->i_di.di_flags &
GFS2_DIF_INHERIT_JDATA);
}
@@ -1134,7 +1129,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
if (IS_APPEND(&dip->i_inode))
return -EPERM;
- error = permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
if (error)
return error;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 580da454b38f..6074c2506f75 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -72,7 +72,6 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
}
-void gfs2_inode_attr_in(struct gfs2_inode *ip);
void gfs2_set_iop(struct inode *inode);
struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
u64 no_addr, u64 no_formal_ino,
@@ -91,6 +90,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
struct gfs2_inode *ip);
int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
const struct gfs2_inode *ip);
+int gfs2_permission(struct inode *inode, int mask);
int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to);
int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
int gfs2_glock_nq_atime(struct gfs2_holder *gh);
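
Editor's note: the hunks above replace the old inode->i_private != NULL test with an
explicit GIF_USER flag, set in the iget5_locked() set-callback. For context, a minimal
sketch of how such test/set callbacks are consumed; the helper name example_ilookup and
its error handling are illustrative assumptions, not code from this patch.

#include <linux/err.h>
#include <linux/fs.h>

static struct inode *example_ilookup(struct super_block *sb, u64 no_addr)
{
        struct inode *inode;

        /* iget_test() matches existing inodes, iget_set() initialises
         * new ones (and is where GIF_USER now gets set) */
        inode = iget5_locked(sb, (unsigned long)no_addr,
                             iget_test, iget_set, &no_addr);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        if (inode->i_state & I_NEW) {
                /* first instantiation: fill in fields, then publish */
                unlock_new_inode(inode);
        }
        return inode;
}
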
diff --git a/fs/gfs2/locking.c b/fs/gfs2/locking.c
index 663fee728783..523243a13a21 100644
--- a/fs/gfs2/locking.c
+++ b/fs/gfs2/locking.c
@@ -23,12 +23,54 @@ struct lmh_wrapper {
const struct lm_lockops *lw_ops;
};
+static int nolock_mount(char *table_name, char *host_data,
+ lm_callback_t cb, void *cb_data,
+ unsigned int min_lvb_size, int flags,
+ struct lm_lockstruct *lockstruct,
+ struct kobject *fskobj);
+
/* List of registered low-level locking protocols. A file system selects one
of them by name at mount time, e.g. lock_nolock, lock_dlm. */
+static const struct lm_lockops nolock_ops = {
+ .lm_proto_name = "lock_nolock",
+ .lm_mount = nolock_mount,
+};
+
+static struct lmh_wrapper nolock_proto = {
+ .lw_list = LIST_HEAD_INIT(nolock_proto.lw_list),
+ .lw_ops = &nolock_ops,
+};
+
static LIST_HEAD(lmh_list);
static DEFINE_MUTEX(lmh_lock);
+static int nolock_mount(char *table_name, char *host_data,
+ lm_callback_t cb, void *cb_data,
+ unsigned int min_lvb_size, int flags,
+ struct lm_lockstruct *lockstruct,
+ struct kobject *fskobj)
+{
+ char *c;
+ unsigned int jid;
+
+ c = strstr(host_data, "jid=");
+ if (!c)
+ jid = 0;
+ else {
+ c += 4;
+ sscanf(c, "%u", &jid);
+ }
+
+ lockstruct->ls_jid = jid;
+ lockstruct->ls_first = 1;
+ lockstruct->ls_lvb_size = min_lvb_size;
+ lockstruct->ls_ops = &nolock_ops;
+ lockstruct->ls_flags = LM_LSFLAG_LOCAL;
+
+ return 0;
+}
+
/**
* gfs2_register_lockproto - Register a low-level locking protocol
* @proto: the protocol definition
@@ -116,9 +158,13 @@ int gfs2_mount_lockproto(char *proto_name, char *table_name, char *host_data,
int try = 0;
int error, found;
+
retry:
mutex_lock(&lmh_lock);
+ if (list_empty(&nolock_proto.lw_list))
+ list_add(&nolock_proto.lw_list, &lmh_list);
+
found = 0;
list_for_each_entry(lw, &lmh_list, lw_list) {
if (!strcmp(lw->lw_ops->lm_proto_name, proto_name)) {
@@ -139,7 +185,8 @@ retry:
goto out;
}
- if (!try_module_get(lw->lw_ops->lm_owner)) {
+ if (lw->lw_ops->lm_owner &&
+ !try_module_get(lw->lw_ops->lm_owner)) {
try = 0;
mutex_unlock(&lmh_lock);
msleep(1000);
@@ -158,7 +205,8 @@ out:
void gfs2_unmount_lockproto(struct lm_lockstruct *lockstruct)
{
mutex_lock(&lmh_lock);
- lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
+ if (lockstruct->ls_ops->lm_unmount)
+ lockstruct->ls_ops->lm_unmount(lockstruct->ls_lockspace);
if (lockstruct->ls_ops->lm_owner)
module_put(lockstruct->ls_ops->lm_owner);
mutex_unlock(&lmh_lock);
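
Editor's note: lock_nolock is now built into the core and added to the protocol list
lazily on first mount, and because its lm_lockops has lm_owner == NULL (no owning
module), the module-refcount and lm_unmount calls become conditional. A sketch of the
lookup pattern this implies; the function name is illustrative and the caller is
assumed to hold lmh_lock.

#include <linux/list.h>
#include <linux/module.h>
#include <linux/string.h>

/* caller holds lmh_lock */
static struct lmh_wrapper *example_find_proto(const char *name)
{
        struct lmh_wrapper *lw;

        list_for_each_entry(lw, &lmh_list, lw_list) {
                if (strcmp(lw->lw_ops->lm_proto_name, name))
                        continue;
                /* built-in protocols have no module to pin */
                if (lw->lw_ops->lm_owner &&
                    !try_module_get(lw->lw_ops->lm_owner))
                        return NULL;    /* module is being unloaded */
                return lw;
        }
        return NULL;
}
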
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index cf7ea8abec87..2482c9047505 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -11,46 +11,60 @@
static char junk_lvb[GDLM_LVB_SIZE];
-static void queue_complete(struct gdlm_lock *lp)
+
+/* convert dlm lock-mode to gfs lock-state */
+
+static s16 gdlm_make_lmstate(s16 dlmmode)
{
- struct gdlm_ls *ls = lp->ls;
+ switch (dlmmode) {
+ case DLM_LOCK_IV:
+ case DLM_LOCK_NL:
+ return LM_ST_UNLOCKED;
+ case DLM_LOCK_EX:
+ return LM_ST_EXCLUSIVE;
+ case DLM_LOCK_CW:
+ return LM_ST_DEFERRED;
+ case DLM_LOCK_PR:
+ return LM_ST_SHARED;
+ }
+ gdlm_assert(0, "unknown DLM mode %d", dlmmode);
+ return -1;
+}
- clear_bit(LFL_ACTIVE, &lp->flags);
+/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
+ thread gets to it. */
+
+static void queue_submit(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
spin_lock(&ls->async_lock);
- list_add_tail(&lp->clist, &ls->complete);
+ list_add_tail(&lp->delay_list, &ls->submit);
spin_unlock(&ls->async_lock);
wake_up(&ls->thread_wait);
}
-static inline void gdlm_ast(void *astarg)
+static void wake_up_ast(struct gdlm_lock *lp)
{
- queue_complete(astarg);
+ clear_bit(LFL_AST_WAIT, &lp->flags);
+ smp_mb__after_clear_bit();
+ wake_up_bit(&lp->flags, LFL_AST_WAIT);
}
-static inline void gdlm_bast(void *astarg, int mode)
+static void gdlm_delete_lp(struct gdlm_lock *lp)
{
- struct gdlm_lock *lp = astarg;
struct gdlm_ls *ls = lp->ls;
- if (!mode) {
- printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- return;
- }
-
spin_lock(&ls->async_lock);
- if (!lp->bast_mode) {
- list_add_tail(&lp->blist, &ls->blocking);
- lp->bast_mode = mode;
- } else if (lp->bast_mode < mode)
- lp->bast_mode = mode;
+ if (!list_empty(&lp->delay_list))
+ list_del_init(&lp->delay_list);
+ ls->all_locks_count--;
spin_unlock(&ls->async_lock);
- wake_up(&ls->thread_wait);
+
+ kfree(lp);
}
-void gdlm_queue_delayed(struct gdlm_lock *lp)
+static void gdlm_queue_delayed(struct gdlm_lock *lp)
{
struct gdlm_ls *ls = lp->ls;
@@ -59,6 +73,235 @@ void gdlm_queue_delayed(struct gdlm_lock *lp)
spin_unlock(&ls->async_lock);
}
+static void process_complete(struct gdlm_lock *lp)
+{
+ struct gdlm_ls *ls = lp->ls;
+ struct lm_async_cb acb;
+
+ memset(&acb, 0, sizeof(acb));
+
+ if (lp->lksb.sb_status == -DLM_ECANCEL) {
+ log_info("complete dlm cancel %x,%llx flags %lx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+
+ lp->req = lp->cur;
+ acb.lc_ret |= LM_OUT_CANCELED;
+ if (lp->cur == DLM_LOCK_IV)
+ lp->lksb.sb_lkid = 0;
+ goto out;
+ }
+
+ if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
+ if (lp->lksb.sb_status != -DLM_EUNLOCK) {
+ log_info("unlock sb_status %d %x,%llx flags %lx",
+ lp->lksb.sb_status, lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+ return;
+ }
+
+ lp->cur = DLM_LOCK_IV;
+ lp->req = DLM_LOCK_IV;
+ lp->lksb.sb_lkid = 0;
+
+ if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
+ gdlm_delete_lp(lp);
+ return;
+ }
+ goto out;
+ }
+
+ if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
+ memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
+
+ if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
+ if (lp->req == DLM_LOCK_PR)
+ lp->req = DLM_LOCK_CW;
+ else if (lp->req == DLM_LOCK_CW)
+ lp->req = DLM_LOCK_PR;
+ }
+
+ /*
+ * A canceled lock request. The lock was just taken off the delayed
+ * list and was never even submitted to dlm.
+ */
+
+ if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
+ log_info("complete internal cancel %x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ lp->req = lp->cur;
+ acb.lc_ret |= LM_OUT_CANCELED;
+ goto out;
+ }
+
+ /*
+ * An error occurred.
+ */
+
+ if (lp->lksb.sb_status) {
+ /* a "normal" error */
+ if ((lp->lksb.sb_status == -EAGAIN) &&
+ (lp->lkf & DLM_LKF_NOQUEUE)) {
+ lp->req = lp->cur;
+ if (lp->cur == DLM_LOCK_IV)
+ lp->lksb.sb_lkid = 0;
+ goto out;
+ }
+
+ /* this could only happen with cancels I think */
+ log_info("ast sb_status %d %x,%llx flags %lx",
+ lp->lksb.sb_status, lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->flags);
+ return;
+ }
+
+ /*
+ * This is an AST for an EX->EX conversion for sync_lvb from GFS.
+ */
+
+ if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
+ wake_up_ast(lp);
+ return;
+ }
+
+ /*
+ * A lock has been demoted to NL because it initially completed during
+ * BLOCK_LOCKS. Now it must be requested in the originally requested
+ * mode.
+ */
+
+ if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
+ gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+
+ lp->cur = DLM_LOCK_NL;
+ lp->req = lp->prev_req;
+ lp->prev_req = DLM_LOCK_IV;
+ lp->lkf &= ~DLM_LKF_CONVDEADLK;
+
+ set_bit(LFL_NOCACHE, &lp->flags);
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+ !test_bit(LFL_NOBLOCK, &lp->flags))
+ gdlm_queue_delayed(lp);
+ else
+ queue_submit(lp);
+ return;
+ }
+
+ /*
+ * A request is granted during dlm recovery. It may be granted
+ * because the locks of a failed node were cleared. In that case,
+ * there may be inconsistent data beneath this lock and we must wait
+ * for recovery to complete to use it. When gfs recovery is done this
+ * granted lock will be converted to NL and then reacquired in this
+ * granted state.
+ */
+
+ if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
+ !test_bit(LFL_NOBLOCK, &lp->flags) &&
+ lp->req != DLM_LOCK_NL) {
+
+ lp->cur = lp->req;
+ lp->prev_req = lp->req;
+ lp->req = DLM_LOCK_NL;
+ lp->lkf |= DLM_LKF_CONVERT;
+ lp->lkf &= ~DLM_LKF_CONVDEADLK;
+
+ log_debug("rereq %x,%llx id %x %d,%d",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number,
+ lp->lksb.sb_lkid, lp->cur, lp->req);
+
+ set_bit(LFL_REREQUEST, &lp->flags);
+ queue_submit(lp);
+ return;
+ }
+
+ /*
+ * DLM demoted the lock to NL before it was granted so GFS must be
+ * told it cannot cache data for this lock.
+ */
+
+ if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
+ set_bit(LFL_NOCACHE, &lp->flags);
+
+out:
+ /*
+ * This is an internal lock_dlm lock
+ */
+
+ if (test_bit(LFL_INLOCK, &lp->flags)) {
+ clear_bit(LFL_NOBLOCK, &lp->flags);
+ lp->cur = lp->req;
+ wake_up_ast(lp);
+ return;
+ }
+
+ /*
+ * Normal completion of a lock request. Tell GFS it now has the lock.
+ */
+
+ clear_bit(LFL_NOBLOCK, &lp->flags);
+ lp->cur = lp->req;
+
+ acb.lc_name = lp->lockname;
+ acb.lc_ret |= gdlm_make_lmstate(lp->cur);
+
+ ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
+}
+
+static void gdlm_ast(void *astarg)
+{
+ struct gdlm_lock *lp = astarg;
+ clear_bit(LFL_ACTIVE, &lp->flags);
+ process_complete(lp);
+}
+
+static void process_blocking(struct gdlm_lock *lp, int bast_mode)
+{
+ struct gdlm_ls *ls = lp->ls;
+ unsigned int cb = 0;
+
+ switch (gdlm_make_lmstate(bast_mode)) {
+ case LM_ST_EXCLUSIVE:
+ cb = LM_CB_NEED_E;
+ break;
+ case LM_ST_DEFERRED:
+ cb = LM_CB_NEED_D;
+ break;
+ case LM_ST_SHARED:
+ cb = LM_CB_NEED_S;
+ break;
+ default:
+ gdlm_assert(0, "unknown bast mode %u", bast_mode);
+ }
+
+ ls->fscb(ls->sdp, cb, &lp->lockname);
+}
+
+static void gdlm_bast(void *astarg, int mode)
+{
+ struct gdlm_lock *lp = astarg;
+
+ if (!mode) {
+ printk(KERN_INFO "lock_dlm: bast mode zero %x,%llx\n",
+ lp->lockname.ln_type,
+ (unsigned long long)lp->lockname.ln_number);
+ return;
+ }
+
+ process_blocking(lp, mode);
+}
+
/* convert gfs lock-state to dlm lock-mode */
static s16 make_mode(s16 lmstate)
@@ -77,24 +321,6 @@ static s16 make_mode(s16 lmstate)
return -1;
}
-/* convert dlm lock-mode to gfs lock-state */
-
-s16 gdlm_make_lmstate(s16 dlmmode)
-{
- switch (dlmmode) {
- case DLM_LOCK_IV:
- case DLM_LOCK_NL:
- return LM_ST_UNLOCKED;
- case DLM_LOCK_EX:
- return LM_ST_EXCLUSIVE;
- case DLM_LOCK_CW:
- return LM_ST_DEFERRED;
- case DLM_LOCK_PR:
- return LM_ST_SHARED;
- }
- gdlm_assert(0, "unknown DLM mode %d", dlmmode);
- return -1;
-}
/* verify agreement with GFS on the current lock state, NB: DLM_LOCK_NL and
DLM_LOCK_IV are both considered LM_ST_UNLOCKED by GFS. */
@@ -134,14 +360,6 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
if (lp->lksb.sb_lkid != 0) {
lkf |= DLM_LKF_CONVERT;
-
- /* Conversion deadlock avoidance by DLM */
-
- if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) &&
- !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
- !(lkf & DLM_LKF_NOQUEUE) &&
- cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
- lkf |= DLM_LKF_CONVDEADLK;
}
if (lp->lvb)
@@ -173,14 +391,9 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
make_strname(name, &lp->strname);
lp->ls = ls;
lp->cur = DLM_LOCK_IV;
- lp->lvb = NULL;
- lp->hold_null = NULL;
- INIT_LIST_HEAD(&lp->clist);
- INIT_LIST_HEAD(&lp->blist);
INIT_LIST_HEAD(&lp->delay_list);
spin_lock(&ls->async_lock);
- list_add(&lp->all_list, &ls->all_locks);
ls->all_locks_count++;
spin_unlock(&ls->async_lock);
@@ -188,26 +401,6 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
return 0;
}
-void gdlm_delete_lp(struct gdlm_lock *lp)
-{
- struct gdlm_ls *ls = lp->ls;
-
- spin_lock(&ls->async_lock);
- if (!list_empty(&lp->clist))
- list_del_init(&lp->clist);
- if (!list_empty(&lp->blist))
- list_del_init(&lp->blist);
- if (!list_empty(&lp->delay_list))
- list_del_init(&lp->delay_list);
- gdlm_assert(!list_empty(&lp->all_list), "%x,%llx", lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- list_del_init(&lp->all_list);
- ls->all_locks_count--;
- spin_unlock(&ls->async_lock);
-
- kfree(lp);
-}
-
int gdlm_get_lock(void *lockspace, struct lm_lockname *name,
void **lockp)
{
@@ -261,7 +454,7 @@ unsigned int gdlm_do_lock(struct gdlm_lock *lp)
if ((error == -EAGAIN) && (lp->lkf & DLM_LKF_NOQUEUE)) {
lp->lksb.sb_status = -EAGAIN;
- queue_complete(lp);
+ gdlm_ast(lp);
error = 0;
}
@@ -308,6 +501,9 @@ unsigned int gdlm_lock(void *lock, unsigned int cur_state,
{
struct gdlm_lock *lp = lock;
+ if (req_state == LM_ST_UNLOCKED)
+ return gdlm_unlock(lock, cur_state);
+
clear_bit(LFL_DLM_CANCEL, &lp->flags);
if (flags & LM_FLAG_NOEXP)
set_bit(LFL_NOBLOCK, &lp->flags);
@@ -351,7 +550,7 @@ void gdlm_cancel(void *lock)
if (delay_list) {
set_bit(LFL_CANCEL, &lp->flags);
set_bit(LFL_ACTIVE, &lp->flags);
- queue_complete(lp);
+ gdlm_ast(lp);
return;
}
@@ -507,22 +706,3 @@ void gdlm_submit_delayed(struct gdlm_ls *ls)
wake_up(&ls->thread_wait);
}
-int gdlm_release_all_locks(struct gdlm_ls *ls)
-{
- struct gdlm_lock *lp, *safe;
- int count = 0;
-
- spin_lock(&ls->async_lock);
- list_for_each_entry_safe(lp, safe, &ls->all_locks, all_list) {
- list_del_init(&lp->all_list);
-
- if (lp->lvb && lp->lvb != junk_lvb)
- kfree(lp->lvb);
- kfree(lp);
- count++;
- }
- spin_unlock(&ls->async_lock);
-
- return count;
-}
-
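
Editor's note: with process_complete() and process_blocking() moved into this file,
gdlm_ast() and gdlm_bast() now do their work directly in DLM callback context rather
than queueing to the lock_dlm threads, which is what lets the completion/blocking
lists and the second thread go away. A condensed restatement of how these callbacks
reach the DLM, mirroring the existing gdlm_do_lock() call (not new API; the function
name is illustrative):

#include <linux/dlm.h>

static int example_submit(struct gdlm_ls *ls, struct gdlm_lock *lp)
{
        return dlm_lock(ls->dlm_lockspace, lp->req, &lp->lksb, lp->lkf,
                        lp->strname.name, lp->strname.namelen,
                        0,              /* no parent lock */
                        gdlm_ast, lp,   /* completion AST */
                        gdlm_bast);     /* blocking AST */
}
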
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index a243cf69c54e..3c98e7c6f93b 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -72,19 +72,12 @@ struct gdlm_ls {
int recover_jid_done;
int recover_jid_status;
spinlock_t async_lock;
- struct list_head complete;
- struct list_head blocking;
struct list_head delayed;
struct list_head submit;
- struct list_head all_locks;
u32 all_locks_count;
wait_queue_head_t wait_control;
- struct task_struct *thread1;
- struct task_struct *thread2;
+ struct task_struct *thread;
wait_queue_head_t thread_wait;
- unsigned long drop_time;
- int drop_locks_count;
- int drop_locks_period;
};
enum {
@@ -117,12 +110,7 @@ struct gdlm_lock {
u32 lkf; /* dlm flags DLM_LKF_ */
unsigned long flags; /* lock_dlm flags LFL_ */
- int bast_mode; /* protected by async_lock */
-
- struct list_head clist; /* complete */
- struct list_head blist; /* blocking */
struct list_head delay_list; /* delayed */
- struct list_head all_list; /* all locks for the fs */
struct gdlm_lock *hold_null; /* NL lock for hold_lvb */
};
@@ -159,11 +147,7 @@ void gdlm_release_threads(struct gdlm_ls *);
/* lock.c */
-s16 gdlm_make_lmstate(s16);
-void gdlm_queue_delayed(struct gdlm_lock *);
void gdlm_submit_delayed(struct gdlm_ls *);
-int gdlm_release_all_locks(struct gdlm_ls *);
-void gdlm_delete_lp(struct gdlm_lock *);
unsigned int gdlm_do_lock(struct gdlm_lock *);
int gdlm_get_lock(void *, struct lm_lockname *, void **);
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index 470bdf650b50..09d78c216f48 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -22,22 +22,14 @@ static struct gdlm_ls *init_gdlm(lm_callback_t cb, struct gfs2_sbd *sdp,
if (!ls)
return NULL;
- ls->drop_locks_count = GDLM_DROP_COUNT;
- ls->drop_locks_period = GDLM_DROP_PERIOD;
ls->fscb = cb;
ls->sdp = sdp;
ls->fsflags = flags;
spin_lock_init(&ls->async_lock);
- INIT_LIST_HEAD(&ls->complete);
- INIT_LIST_HEAD(&ls->blocking);
INIT_LIST_HEAD(&ls->delayed);
INIT_LIST_HEAD(&ls->submit);
- INIT_LIST_HEAD(&ls->all_locks);
init_waitqueue_head(&ls->thread_wait);
init_waitqueue_head(&ls->wait_control);
- ls->thread1 = NULL;
- ls->thread2 = NULL;
- ls->drop_time = jiffies;
ls->jid = -1;
strncpy(buf, table_name, 256);
@@ -180,7 +172,6 @@ out:
static void gdlm_unmount(void *lockspace)
{
struct gdlm_ls *ls = lockspace;
- int rv;
log_debug("unmount flags %lx", ls->flags);
@@ -194,9 +185,7 @@ static void gdlm_unmount(void *lockspace)
gdlm_kobject_release(ls);
dlm_release_lockspace(ls->dlm_lockspace, 2);
gdlm_release_threads(ls);
- rv = gdlm_release_all_locks(ls);
- if (rv)
- log_info("gdlm_unmount: %d stray locks freed", rv);
+ BUG_ON(ls->all_locks_count);
out:
kfree(ls);
}
@@ -232,7 +221,6 @@ static void gdlm_withdraw(void *lockspace)
dlm_release_lockspace(ls->dlm_lockspace, 2);
gdlm_release_threads(ls);
- gdlm_release_all_locks(ls);
gdlm_kobject_release(ls);
}
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a4ff271df9ee..4ec571c3d8a9 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -114,17 +114,6 @@ static ssize_t recover_status_show(struct gdlm_ls *ls, char *buf)
return sprintf(buf, "%d\n", ls->recover_jid_status);
}
-static ssize_t drop_count_show(struct gdlm_ls *ls, char *buf)
-{
- return sprintf(buf, "%d\n", ls->drop_locks_count);
-}
-
-static ssize_t drop_count_store(struct gdlm_ls *ls, const char *buf, size_t len)
-{
- ls->drop_locks_count = simple_strtol(buf, NULL, 0);
- return len;
-}
-
struct gdlm_attr {
struct attribute attr;
ssize_t (*show)(struct gdlm_ls *, char *);
@@ -144,7 +133,6 @@ GDLM_ATTR(first_done, 0444, first_done_show, NULL);
GDLM_ATTR(recover, 0644, recover_show, recover_store);
GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
-GDLM_ATTR(drop_count, 0644, drop_count_show, drop_count_store);
static struct attribute *gdlm_attrs[] = {
&gdlm_attr_proto_name.attr,
@@ -157,7 +145,6 @@ static struct attribute *gdlm_attrs[] = {
&gdlm_attr_recover.attr,
&gdlm_attr_recover_done.attr,
&gdlm_attr_recover_status.attr,
- &gdlm_attr_drop_count.attr,
NULL,
};
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index e53db6fd28ab..38823efd698c 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -9,367 +9,60 @@
#include "lock_dlm.h"
-/* A lock placed on this queue is re-submitted to DLM as soon as the lock_dlm
- thread gets to it. */
-
-static void queue_submit(struct gdlm_lock *lp)
-{
- struct gdlm_ls *ls = lp->ls;
-
- spin_lock(&ls->async_lock);
- list_add_tail(&lp->delay_list, &ls->submit);
- spin_unlock(&ls->async_lock);
- wake_up(&ls->thread_wait);
-}
-
-static void process_blocking(struct gdlm_lock *lp, int bast_mode)
-{
- struct gdlm_ls *ls = lp->ls;
- unsigned int cb = 0;
-
- switch (gdlm_make_lmstate(bast_mode)) {
- case LM_ST_EXCLUSIVE:
- cb = LM_CB_NEED_E;
- break;
- case LM_ST_DEFERRED:
- cb = LM_CB_NEED_D;
- break;
- case LM_ST_SHARED:
- cb = LM_CB_NEED_S;
- break;
- default:
- gdlm_assert(0, "unknown bast mode %u", lp->bast_mode);
- }
-
- ls->fscb(ls->sdp, cb, &lp->lockname);
-}
-
-static void wake_up_ast(struct gdlm_lock *lp)
-{
- clear_bit(LFL_AST_WAIT, &lp->flags);
- smp_mb__after_clear_bit();
- wake_up_bit(&lp->flags, LFL_AST_WAIT);
-}
-
-static void process_complete(struct gdlm_lock *lp)
-{
- struct gdlm_ls *ls = lp->ls;
- struct lm_async_cb acb;
- s16 prev_mode = lp->cur;
-
- memset(&acb, 0, sizeof(acb));
-
- if (lp->lksb.sb_status == -DLM_ECANCEL) {
- log_info("complete dlm cancel %x,%llx flags %lx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->flags);
-
- lp->req = lp->cur;
- acb.lc_ret |= LM_OUT_CANCELED;
- if (lp->cur == DLM_LOCK_IV)
- lp->lksb.sb_lkid = 0;
- goto out;
- }
-
- if (test_and_clear_bit(LFL_DLM_UNLOCK, &lp->flags)) {
- if (lp->lksb.sb_status != -DLM_EUNLOCK) {
- log_info("unlock sb_status %d %x,%llx flags %lx",
- lp->lksb.sb_status, lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->flags);
- return;
- }
-
- lp->cur = DLM_LOCK_IV;
- lp->req = DLM_LOCK_IV;
- lp->lksb.sb_lkid = 0;
-
- if (test_and_clear_bit(LFL_UNLOCK_DELETE, &lp->flags)) {
- gdlm_delete_lp(lp);
- return;
- }
- goto out;
- }
-
- if (lp->lksb.sb_flags & DLM_SBF_VALNOTVALID)
- memset(lp->lksb.sb_lvbptr, 0, GDLM_LVB_SIZE);
-
- if (lp->lksb.sb_flags & DLM_SBF_ALTMODE) {
- if (lp->req == DLM_LOCK_PR)
- lp->req = DLM_LOCK_CW;
- else if (lp->req == DLM_LOCK_CW)
- lp->req = DLM_LOCK_PR;
- }
-
- /*
- * A canceled lock request. The lock was just taken off the delayed
- * list and was never even submitted to dlm.
- */
-
- if (test_and_clear_bit(LFL_CANCEL, &lp->flags)) {
- log_info("complete internal cancel %x,%llx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- lp->req = lp->cur;
- acb.lc_ret |= LM_OUT_CANCELED;
- goto out;
- }
-
- /*
- * An error occured.
- */
-
- if (lp->lksb.sb_status) {
- /* a "normal" error */
- if ((lp->lksb.sb_status == -EAGAIN) &&
- (lp->lkf & DLM_LKF_NOQUEUE)) {
- lp->req = lp->cur;
- if (lp->cur == DLM_LOCK_IV)
- lp->lksb.sb_lkid = 0;
- goto out;
- }
-
- /* this could only happen with cancels I think */
- log_info("ast sb_status %d %x,%llx flags %lx",
- lp->lksb.sb_status, lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->flags);
- if (lp->lksb.sb_status == -EDEADLOCK &&
- lp->ls->fsflags & LM_MFLAG_CONV_NODROP) {
- lp->req = lp->cur;
- acb.lc_ret |= LM_OUT_CONV_DEADLK;
- if (lp->cur == DLM_LOCK_IV)
- lp->lksb.sb_lkid = 0;
- goto out;
- } else
- return;
- }
-
- /*
- * This is an AST for an EX->EX conversion for sync_lvb from GFS.
- */
-
- if (test_and_clear_bit(LFL_SYNC_LVB, &lp->flags)) {
- wake_up_ast(lp);
- return;
- }
-
- /*
- * A lock has been demoted to NL because it initially completed during
- * BLOCK_LOCKS. Now it must be requested in the originally requested
- * mode.
- */
-
- if (test_and_clear_bit(LFL_REREQUEST, &lp->flags)) {
- gdlm_assert(lp->req == DLM_LOCK_NL, "%x,%llx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
- gdlm_assert(lp->prev_req > DLM_LOCK_NL, "%x,%llx",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number);
-
- lp->cur = DLM_LOCK_NL;
- lp->req = lp->prev_req;
- lp->prev_req = DLM_LOCK_IV;
- lp->lkf &= ~DLM_LKF_CONVDEADLK;
-
- set_bit(LFL_NOCACHE, &lp->flags);
-
- if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
- !test_bit(LFL_NOBLOCK, &lp->flags))
- gdlm_queue_delayed(lp);
- else
- queue_submit(lp);
- return;
- }
-
- /*
- * A request is granted during dlm recovery. It may be granted
- * because the locks of a failed node were cleared. In that case,
- * there may be inconsistent data beneath this lock and we must wait
- * for recovery to complete to use it. When gfs recovery is done this
- * granted lock will be converted to NL and then reacquired in this
- * granted state.
- */
-
- if (test_bit(DFL_BLOCK_LOCKS, &ls->flags) &&
- !test_bit(LFL_NOBLOCK, &lp->flags) &&
- lp->req != DLM_LOCK_NL) {
-
- lp->cur = lp->req;
- lp->prev_req = lp->req;
- lp->req = DLM_LOCK_NL;
- lp->lkf |= DLM_LKF_CONVERT;
- lp->lkf &= ~DLM_LKF_CONVDEADLK;
-
- log_debug("rereq %x,%llx id %x %d,%d",
- lp->lockname.ln_type,
- (unsigned long long)lp->lockname.ln_number,
- lp->lksb.sb_lkid, lp->cur, lp->req);
-
- set_bit(LFL_REREQUEST, &lp->flags);
- queue_submit(lp);
- return;
- }
-
- /*
- * DLM demoted the lock to NL before it was granted so GFS must be
- * told it cannot cache data for this lock.
- */
-
- if (lp->lksb.sb_flags & DLM_SBF_DEMOTED)
- set_bit(LFL_NOCACHE, &lp->flags);
-
-out:
- /*
- * This is an internal lock_dlm lock
- */
-
- if (test_bit(LFL_INLOCK, &lp->flags)) {
- clear_bit(LFL_NOBLOCK, &lp->flags);
- lp->cur = lp->req;
- wake_up_ast(lp);
- return;
- }
-
- /*
- * Normal completion of a lock request. Tell GFS it now has the lock.
- */
-
- clear_bit(LFL_NOBLOCK, &lp->flags);
- lp->cur = lp->req;
-
- acb.lc_name = lp->lockname;
- acb.lc_ret |= gdlm_make_lmstate(lp->cur);
-
- if (!test_and_clear_bit(LFL_NOCACHE, &lp->flags) &&
- (lp->cur > DLM_LOCK_NL) && (prev_mode > DLM_LOCK_NL))
- acb.lc_ret |= LM_OUT_CACHEABLE;
-
- ls->fscb(ls->sdp, LM_CB_ASYNC, &acb);
-}
-
-static inline int no_work(struct gdlm_ls *ls, int blocking)
+static inline int no_work(struct gdlm_ls *ls)
{
int ret;
spin_lock(&ls->async_lock);
- ret = list_empty(&ls->complete) && list_empty(&ls->submit);
- if (ret && blocking)
- ret = list_empty(&ls->blocking);
+ ret = list_empty(&ls->submit);
spin_unlock(&ls->async_lock);
return ret;
}
-static inline int check_drop(struct gdlm_ls *ls)
-{
- if (!ls->drop_locks_count)
- return 0;
-
- if (time_after(jiffies, ls->drop_time + ls->drop_locks_period * HZ)) {
- ls->drop_time = jiffies;
- if (ls->all_locks_count >= ls->drop_locks_count)
- return 1;
- }
- return 0;
-}
-
-static int gdlm_thread(void *data, int blist)
+static int gdlm_thread(void *data)
{
struct gdlm_ls *ls = (struct gdlm_ls *) data;
struct gdlm_lock *lp = NULL;
- uint8_t complete, blocking, submit, drop;
-
- /* Only thread1 is allowed to do blocking callbacks since gfs
- may wait for a completion callback within a blocking cb. */
while (!kthread_should_stop()) {
wait_event_interruptible(ls->thread_wait,
- !no_work(ls, blist) || kthread_should_stop());
-
- complete = blocking = submit = drop = 0;
+ !no_work(ls) || kthread_should_stop());
spin_lock(&ls->async_lock);
- if (blist && !list_empty(&ls->blocking)) {
- lp = list_entry(ls->blocking.next, struct gdlm_lock,
- blist);
- list_del_init(&lp->blist);
- blocking = lp->bast_mode;
- lp->bast_mode = 0;
- } else if (!list_empty(&ls->complete)) {
- lp = list_entry(ls->complete.next, struct gdlm_lock,
- clist);
- list_del_init(&lp->clist);
- complete = 1;
- } else if (!list_empty(&ls->submit)) {
+ if (!list_empty(&ls->submit)) {
lp = list_entry(ls->submit.next, struct gdlm_lock,
delay_list);
list_del_init(&lp->delay_list);
- submit = 1;
+ spin_unlock(&ls->async_lock);
+ gdlm_do_lock(lp);
+ spin_lock(&ls->async_lock);
}
-
- drop = check_drop(ls);
spin_unlock(&ls->async_lock);
-
- if (complete)
- process_complete(lp);
-
- else if (blocking)
- process_blocking(lp, blocking);
-
- else if (submit)
- gdlm_do_lock(lp);
-
- if (drop)
- ls->fscb(ls->sdp, LM_CB_DROPLOCKS, NULL);
-
- schedule();
}
return 0;
}
-static int gdlm_thread1(void *data)
-{
- return gdlm_thread(data, 1);
-}
-
-static int gdlm_thread2(void *data)
-{
- return gdlm_thread(data, 0);
-}
-
int gdlm_init_threads(struct gdlm_ls *ls)
{
struct task_struct *p;
int error;
- p = kthread_run(gdlm_thread1, ls, "lock_dlm1");
- error = IS_ERR(p);
- if (error) {
- log_error("can't start lock_dlm1 thread %d", error);
- return error;
- }
- ls->thread1 = p;
-
- p = kthread_run(gdlm_thread2, ls, "lock_dlm2");
+ p = kthread_run(gdlm_thread, ls, "lock_dlm");
error = IS_ERR(p);
if (error) {
- log_error("can't start lock_dlm2 thread %d", error);
- kthread_stop(ls->thread1);
+ log_error("can't start lock_dlm thread %d", error);
return error;
}
- ls->thread2 = p;
+ ls->thread = p;
return 0;
}
void gdlm_release_threads(struct gdlm_ls *ls)
{
- kthread_stop(ls->thread1);
- kthread_stop(ls->thread2);
+ kthread_stop(ls->thread);
}
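
Editor's note: one pre-existing wart survives the simplification above: error =
IS_ERR(p) appears to store 1 rather than the encoded errno, so a failed kthread_run()
is reported as 1 instead of, say, -ENOMEM. A sketch of the conventional idiom, under
the assumption that callers want a real errno:

#include <linux/err.h>
#include <linux/kthread.h>

static int example_init_thread(struct gdlm_ls *ls)
{
        struct task_struct *p;

        p = kthread_run(gdlm_thread, ls, "lock_dlm");
        if (IS_ERR(p))
                return PTR_ERR(p);      /* the encoded errno, not 1 */
        ls->thread = p;
        return 0;
}
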
diff --git a/fs/gfs2/locking/nolock/Makefile b/fs/gfs2/locking/nolock/Makefile
deleted file mode 100644
index 35e9730bc3a8..000000000000
--- a/fs/gfs2/locking/nolock/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_GFS2_FS_LOCKING_NOLOCK) += lock_nolock.o
-lock_nolock-y := main.o
-
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
deleted file mode 100644
index 284a5ece8d94..000000000000
--- a/fs/gfs2/locking/nolock/main.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/lm_interface.h>
-
-struct nolock_lockspace {
- unsigned int nl_lvb_size;
-};
-
-static const struct lm_lockops nolock_ops;
-
-static int nolock_mount(char *table_name, char *host_data,
- lm_callback_t cb, void *cb_data,
- unsigned int min_lvb_size, int flags,
- struct lm_lockstruct *lockstruct,
- struct kobject *fskobj)
-{
- char *c;
- unsigned int jid;
- struct nolock_lockspace *nl;
-
- c = strstr(host_data, "jid=");
- if (!c)
- jid = 0;
- else {
- c += 4;
- sscanf(c, "%u", &jid);
- }
-
- nl = kzalloc(sizeof(struct nolock_lockspace), GFP_KERNEL);
- if (!nl)
- return -ENOMEM;
-
- nl->nl_lvb_size = min_lvb_size;
-
- lockstruct->ls_jid = jid;
- lockstruct->ls_first = 1;
- lockstruct->ls_lvb_size = min_lvb_size;
- lockstruct->ls_lockspace = nl;
- lockstruct->ls_ops = &nolock_ops;
- lockstruct->ls_flags = LM_LSFLAG_LOCAL;
-
- return 0;
-}
-
-static void nolock_others_may_mount(void *lockspace)
-{
-}
-
-static void nolock_unmount(void *lockspace)
-{
- struct nolock_lockspace *nl = lockspace;
- kfree(nl);
-}
-
-static void nolock_withdraw(void *lockspace)
-{
-}
-
-/**
- * nolock_get_lock - get a lm_lock_t given a descripton of the lock
- * @lockspace: the lockspace the lock lives in
- * @name: the name of the lock
- * @lockp: return the lm_lock_t here
- *
- * Returns: 0 on success, -EXXX on failure
- */
-
-static int nolock_get_lock(void *lockspace, struct lm_lockname *name,
- void **lockp)
-{
- *lockp = lockspace;
- return 0;
-}
-
-/**
- * nolock_put_lock - get rid of a lock structure
- * @lock: the lock to throw away
- *
- */
-
-static void nolock_put_lock(void *lock)
-{
-}
-
-/**
- * nolock_lock - acquire a lock
- * @lock: the lock to manipulate
- * @cur_state: the current state
- * @req_state: the requested state
- * @flags: modifier flags
- *
- * Returns: A bitmap of LM_OUT_*
- */
-
-static unsigned int nolock_lock(void *lock, unsigned int cur_state,
- unsigned int req_state, unsigned int flags)
-{
- return req_state | LM_OUT_CACHEABLE;
-}
-
-/**
- * nolock_unlock - unlock a lock
- * @lock: the lock to manipulate
- * @cur_state: the current state
- *
- * Returns: 0
- */
-
-static unsigned int nolock_unlock(void *lock, unsigned int cur_state)
-{
- return 0;
-}
-
-static void nolock_cancel(void *lock)
-{
-}
-
-/**
- * nolock_hold_lvb - hold on to a lock value block
- * @lock: the lock the LVB is associated with
- * @lvbp: return the lm_lvb_t here
- *
- * Returns: 0 on success, -EXXX on failure
- */
-
-static int nolock_hold_lvb(void *lock, char **lvbp)
-{
- struct nolock_lockspace *nl = lock;
- int error = 0;
-
- *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS);
- if (!*lvbp)
- error = -ENOMEM;
-
- return error;
-}
-
-/**
- * nolock_unhold_lvb - release a LVB
- * @lock: the lock the LVB is associated with
- * @lvb: the lock value block
- *
- */
-
-static void nolock_unhold_lvb(void *lock, char *lvb)
-{
- kfree(lvb);
-}
-
-static int nolock_plock_get(void *lockspace, struct lm_lockname *name,
- struct file *file, struct file_lock *fl)
-{
- posix_test_lock(file, fl);
-
- return 0;
-}
-
-static int nolock_plock(void *lockspace, struct lm_lockname *name,
- struct file *file, int cmd, struct file_lock *fl)
-{
- int error;
- error = posix_lock_file_wait(file, fl);
- return error;
-}
-
-static int nolock_punlock(void *lockspace, struct lm_lockname *name,
- struct file *file, struct file_lock *fl)
-{
- int error;
- error = posix_lock_file_wait(file, fl);
- return error;
-}
-
-static void nolock_recovery_done(void *lockspace, unsigned int jid,
- unsigned int message)
-{
-}
-
-static const struct lm_lockops nolock_ops = {
- .lm_proto_name = "lock_nolock",
- .lm_mount = nolock_mount,
- .lm_others_may_mount = nolock_others_may_mount,
- .lm_unmount = nolock_unmount,
- .lm_withdraw = nolock_withdraw,
- .lm_get_lock = nolock_get_lock,
- .lm_put_lock = nolock_put_lock,
- .lm_lock = nolock_lock,
- .lm_unlock = nolock_unlock,
- .lm_cancel = nolock_cancel,
- .lm_hold_lvb = nolock_hold_lvb,
- .lm_unhold_lvb = nolock_unhold_lvb,
- .lm_plock_get = nolock_plock_get,
- .lm_plock = nolock_plock,
- .lm_punlock = nolock_punlock,
- .lm_recovery_done = nolock_recovery_done,
- .lm_owner = THIS_MODULE,
-};
-
-static int __init init_nolock(void)
-{
- int error;
-
- error = gfs2_register_lockproto(&nolock_ops);
- if (error) {
- printk(KERN_WARNING
- "lock_nolock: can't register protocol: %d\n", error);
- return error;
- }
-
- printk(KERN_INFO
- "Lock_Nolock (built %s %s) installed\n", __DATE__, __TIME__);
- return 0;
-}
-
-static void __exit exit_nolock(void)
-{
- gfs2_unregister_lockproto(&nolock_ops);
-}
-
-module_init(init_nolock);
-module_exit(exit_nolock);
-
-MODULE_DESCRIPTION("GFS Nolock Locking Module");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 548264b1836d..6c6af9f5e3ab 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -87,6 +87,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
*/
static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+__releases(&sdp->sd_log_lock)
+__acquires(&sdp->sd_log_lock)
{
struct gfs2_bufdata *bd, *s;
struct buffer_head *bh;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 771152816508..7c64510ccfd2 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -21,6 +21,7 @@
*/
static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
+__acquires(&sdp->sd_log_lock)
{
spin_lock(&sdp->sd_log_lock);
}
@@ -32,6 +33,7 @@ static inline void gfs2_log_lock(struct gfs2_sbd *sdp)
*/
static inline void gfs2_log_unlock(struct gfs2_sbd *sdp)
+__releases(&sdp->sd_log_lock)
{
spin_unlock(&sdp->sd_log_lock);
}
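
Editor's note: __acquires() and __releases() are sparse lock-context annotations (see
Documentation/sparse.txt): they compile to nothing under gcc, but "make C=1" checks
that a function's net lock state matches its declaration, which is what these helpers
and gfs2_ail1_start_one() needed documented. In isolation, with hypothetical names:

#include <linux/spinlock.h>

static void example_lock(spinlock_t *lock)
__acquires(lock)
{
        spin_lock(lock);
}

static void example_unlock(spinlock_t *lock)
__releases(lock)
{
        spin_unlock(lock);
}
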
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 053e2ebbbd50..bcc668d0fadd 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -40,8 +40,6 @@ static void gfs2_init_glock_once(struct kmem_cache *cachep, void *foo)
INIT_HLIST_NODE(&gl->gl_list);
spin_lock_init(&gl->gl_spin);
INIT_LIST_HEAD(&gl->gl_holders);
- INIT_LIST_HEAD(&gl->gl_waiters1);
- INIT_LIST_HEAD(&gl->gl_waiters3);
gl->gl_lvb = NULL;
atomic_set(&gl->gl_lvb_count, 0);
INIT_LIST_HEAD(&gl->gl_reclaim);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 85aea27b4a86..09853620c951 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -69,13 +69,15 @@ static const struct address_space_operations aspace_aops = {
struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp)
{
struct inode *aspace;
+ struct gfs2_inode *ip;
aspace = new_inode(sdp->sd_vfs);
if (aspace) {
mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS);
aspace->i_mapping->a_ops = &aspace_aops;
aspace->i_size = ~0ULL;
- aspace->i_private = NULL;
+ ip = GFS2_I(aspace);
+ clear_bit(GIF_USER, &ip->i_flags);
insert_inode_hash(aspace);
}
return aspace;
@@ -127,7 +129,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
}
/**
- * getbuf - Get a buffer with a given address space
+ * gfs2_getbuf - Get a buffer with a given address space
* @gl: the glock
* @blkno: the block number (filesystem scope)
* @create: 1 if the buffer should be created
@@ -135,7 +137,7 @@ void gfs2_meta_sync(struct gfs2_glock *gl)
* Returns: the buffer
*/
-static struct buffer_head *getbuf(struct gfs2_glock *gl, u64 blkno, int create)
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
{
struct address_space *mapping = gl->gl_aspace->i_mapping;
struct gfs2_sbd *sdp = gl->gl_sbd;
@@ -203,7 +205,7 @@ static void meta_prep_new(struct buffer_head *bh)
struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
{
struct buffer_head *bh;
- bh = getbuf(gl, blkno, CREATE);
+ bh = gfs2_getbuf(gl, blkno, CREATE);
meta_prep_new(bh);
return bh;
}
@@ -221,7 +223,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
struct buffer_head **bhp)
{
- *bhp = getbuf(gl, blkno, CREATE);
+ *bhp = gfs2_getbuf(gl, blkno, CREATE);
if (!buffer_uptodate(*bhp)) {
ll_rw_block(READ_META, 1, bhp);
if (flags & DIO_WAIT) {
@@ -344,7 +346,7 @@ void gfs2_meta_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen)
struct buffer_head *bh;
while (blen) {
- bh = getbuf(ip->i_gl, bstart, NO_CREATE);
+ bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE);
if (bh) {
lock_buffer(bh);
gfs2_log_lock(sdp);
@@ -419,7 +421,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
if (extlen > max_ra)
extlen = max_ra;
- first_bh = getbuf(gl, dblock, CREATE);
+ first_bh = gfs2_getbuf(gl, dblock, CREATE);
if (buffer_uptodate(first_bh))
goto out;
@@ -430,7 +432,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
extlen--;
while (extlen) {
- bh = getbuf(gl, dblock, CREATE);
+ bh = gfs2_getbuf(gl, dblock, CREATE);
if (!buffer_uptodate(bh) && !buffer_locked(bh))
ll_rw_block(READA, 1, &bh);
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 73e3b1c76fe1..b1a5f3674d43 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -47,6 +47,7 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
int flags, struct buffer_head **bhp);
int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
+struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create);
void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh,
int meta);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index f55394e57cb2..e64a1b04117a 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -499,34 +499,34 @@ static int __gfs2_readpage(void *file, struct page *page)
* @file: The file to read
* @page: The page of the file
*
- * This deals with the locking required. We use a trylock in order to
- * avoid the page lock / glock ordering problems returning AOP_TRUNCATED_PAGE
- * in the event that we are unable to get the lock.
+ * This deals with the locking required. We have to unlock and
+ * relock the page in order to get the locking in the right
+ * order.
*/
static int gfs2_readpage(struct file *file, struct page *page)
{
- struct gfs2_inode *ip = GFS2_I(page->mapping->host);
- struct gfs2_holder *gh;
+ struct address_space *mapping = page->mapping;
+ struct gfs2_inode *ip = GFS2_I(mapping->host);
+ struct gfs2_holder gh;
int error;
- gh = gfs2_glock_is_locked_by_me(ip->i_gl);
- if (!gh) {
- gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS);
- if (!gh)
- return -ENOBUFS;
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh);
+ unlock_page(page);
+ gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+ error = gfs2_glock_nq_atime(&gh);
+ if (unlikely(error))
+ goto out;
+ error = AOP_TRUNCATED_PAGE;
+ lock_page(page);
+ if (page->mapping == mapping && !PageUptodate(page))
+ error = __gfs2_readpage(file, page);
+ else
unlock_page(page);
- error = gfs2_glock_nq_atime(gh);
- if (likely(error != 0))
- goto out;
- return AOP_TRUNCATED_PAGE;
- }
- error = __gfs2_readpage(file, page);
- gfs2_glock_dq(gh);
+ gfs2_glock_dq(&gh);
out:
- gfs2_holder_uninit(gh);
- kfree(gh);
+ gfs2_holder_uninit(&gh);
+ if (error && error != AOP_TRUNCATED_PAGE)
+ lock_page(page);
return error;
}
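
Editor's note: the ordering in the rewritten gfs2_readpage() is easy to get wrong, so
schematically: the page arrives locked, the glock ranks above the page lock, hence
unlock, take the glock, relock, then re-validate the page because it may have been
truncated or filled in the window. A stubbed sketch of that flow, with the glock calls
elided as comments:

#include <linux/fs.h>
#include <linux/pagemap.h>

static int example_readpage(struct file *file, struct page *page)
{
        struct address_space *mapping = page->mapping;
        int error;

        unlock_page(page);
        /* ... acquire the glock here; bail out on failure ... */
        error = AOP_TRUNCATED_PAGE;     /* tells the VFS to retry */
        lock_page(page);
        if (page->mapping == mapping && !PageUptodate(page))
                error = __gfs2_readpage(file, page);    /* unlocks page */
        else
                unlock_page(page);      /* page went away: retry lookup */
        /* ... drop the glock here ... */
        return error;
}
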
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e1b7d525a066..e9a366d4411c 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -15,6 +15,7 @@
#include <linux/uio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
+#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/gfs2_ondisk.h>
#include <linux/ext2_fs.h>
@@ -62,11 +63,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
&i_gh);
if (!error) {
- error = remote_llseek(file, offset, origin);
+ error = generic_file_llseek_unlocked(file, offset, origin);
gfs2_glock_dq_uninit(&i_gh);
}
} else
- error = remote_llseek(file, offset, origin);
+ error = generic_file_llseek_unlocked(file, offset, origin);
return error;
}
@@ -133,7 +134,6 @@ static const u32 fsflags_to_gfs2[32] = {
[7] = GFS2_DIF_NOATIME,
[12] = GFS2_DIF_EXHASH,
[14] = GFS2_DIF_INHERIT_JDATA,
- [20] = GFS2_DIF_INHERIT_DIRECTIO,
};
static const u32 gfs2_to_fsflags[32] = {
@@ -142,7 +142,6 @@ static const u32 gfs2_to_fsflags[32] = {
[gfs2fl_AppendOnly] = FS_APPEND_FL,
[gfs2fl_NoAtime] = FS_NOATIME_FL,
[gfs2fl_ExHash] = FS_INDEX_FL,
- [gfs2fl_InheritDirectio] = FS_DIRECTIO_FL,
[gfs2fl_InheritJdata] = FS_JOURNAL_DATA_FL,
};
@@ -160,12 +159,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr)
return error;
fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags);
- if (!S_ISDIR(inode->i_mode)) {
- if (ip->i_di.di_flags & GFS2_DIF_JDATA)
- fsflags |= FS_JOURNAL_DATA_FL;
- if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
- fsflags |= FS_DIRECTIO_FL;
- }
+ if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA)
+ fsflags |= FS_JOURNAL_DATA_FL;
if (put_user(fsflags, ptr))
error = -EFAULT;
@@ -194,13 +189,11 @@ void gfs2_set_inode_flags(struct inode *inode)
/* Flags that can be set by user space */
#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \
- GFS2_DIF_DIRECTIO| \
GFS2_DIF_IMMUTABLE| \
GFS2_DIF_APPENDONLY| \
GFS2_DIF_NOATIME| \
GFS2_DIF_SYNC| \
GFS2_DIF_SYSTEM| \
- GFS2_DIF_INHERIT_DIRECTIO| \
GFS2_DIF_INHERIT_JDATA)
/**
@@ -220,10 +213,14 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
int error;
u32 new_flags, flags;
- error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ error = mnt_want_write(filp->f_path.mnt);
if (error)
return error;
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+ if (error)
+ goto out_drop_write;
+
flags = ip->i_di.di_flags;
new_flags = (flags & ~mask) | (reqflags & mask);
if ((new_flags ^ flags) == 0)
@@ -242,7 +239,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
!capable(CAP_LINUX_IMMUTABLE))
goto out;
if (!IS_IMMUTABLE(inode)) {
- error = permission(inode, MAY_WRITE, NULL);
+ error = gfs2_permission(inode, MAY_WRITE);
if (error)
goto out;
}
@@ -272,6 +269,8 @@ out_trans_end:
gfs2_trans_end(sdp);
out:
gfs2_glock_dq_uninit(&gh);
+out_drop_write:
+ mnt_drop_write(filp->f_path.mnt);
return error;
}
@@ -285,8 +284,6 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
if (!S_ISDIR(inode->i_mode)) {
if (gfsflags & GFS2_DIF_INHERIT_JDATA)
gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
- if (gfsflags & GFS2_DIF_INHERIT_DIRECTIO)
- gfsflags ^= (GFS2_DIF_DIRECTIO | GFS2_DIF_INHERIT_DIRECTIO);
return do_gfs2_set_flags(filp, gfsflags, ~0);
}
return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
@@ -487,11 +484,6 @@ static int gfs2_open(struct inode *inode, struct file *file)
goto fail_gunlock;
}
- /* Listen to the Direct I/O flag */
-
- if (ip->i_di.di_flags & GFS2_DIF_DIRECTIO)
- file->f_flags |= O_DIRECT;
-
gfs2_glock_dq_uninit(&i_gh);
}
@@ -669,8 +661,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
int error = 0;
state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
- flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE
- | GL_FLOCK;
+ flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT | GL_NOCACHE;
mutex_lock(&fp->f_fl_mutex);
@@ -683,9 +674,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
gfs2_glock_dq_wait(fl_gh);
gfs2_holder_reinit(state, flags, fl_gh);
} else {
- error = gfs2_glock_get(GFS2_SB(&ip->i_inode),
- ip->i_no_addr, &gfs2_flock_glops,
- CREATE, &gl);
+ error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr,
+ &gfs2_flock_glops, CREATE, &gl);
if (error)
goto out;
gfs2_holder_init(gl, state, flags, fl_gh);
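
Editor's note: the mnt_want_write()/mnt_drop_write() bracket added to
do_gfs2_set_flags() is the standard pattern for any ioctl that modifies the
filesystem: it takes a write reference on the vfsmount so read-only remounts and
freezes are honoured. The bare pattern, with the body elided and an illustrative name:

#include <linux/fs.h>
#include <linux/mount.h>

static int example_write_ioctl(struct file *filp)
{
        int error;

        error = mnt_want_write(filp->f_path.mnt);
        if (error)
                return error;   /* e.g. -EROFS */

        /* ... take locks, run the transaction, change flags ... */

        mnt_drop_write(filp->f_path.mnt);
        return 0;
}
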
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ef9c6c4f80f6..b4d1d6490633 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -64,7 +64,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mutex_init(&sdp->sd_rindex_mutex);
INIT_LIST_HEAD(&sdp->sd_rindex_list);
INIT_LIST_HEAD(&sdp->sd_rindex_mru_list);
- INIT_LIST_HEAD(&sdp->sd_rindex_recent_list);
INIT_LIST_HEAD(&sdp->sd_jindex_list);
spin_lock_init(&sdp->sd_jindex_spin);
@@ -142,8 +141,8 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
if (!table[0])
table = sdp->sd_vfs->s_id;
- snprintf(sdp->sd_proto_name, GFS2_FSNAME_LEN, "%s", proto);
- snprintf(sdp->sd_table_name, GFS2_FSNAME_LEN, "%s", table);
+ strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN);
+ strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN);
table = sdp->sd_table_name;
while ((table = strchr(table, '/')))
@@ -364,6 +363,8 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
{
+ if (!sdp->sd_lockstruct.ls_ops->lm_others_may_mount)
+ return;
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
sdp->sd_lockstruct.ls_lockspace);
@@ -741,8 +742,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
goto out;
}
- if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
- gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
+ if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
GFS2_MIN_LVB_SIZE)) {
gfs2_unmount_lockproto(&sdp->sd_lockstruct);
@@ -873,7 +873,7 @@ fail_sb:
fail_locking:
init_locking(sdp, &mount_gh, UNDO);
fail_lm:
- gfs2_gl_hash_clear(sdp, WAIT);
+ gfs2_gl_hash_clear(sdp);
gfs2_lm_unmount(sdp);
while (invalidate_inodes(sb))
yield();
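
Editor's note: because the built-in nolock protocol fills in only lm_proto_name and
lm_mount, every call through the lm_lockops table must now tolerate NULL hooks; the
same guard appears again in recovery.c further down. The pattern in isolation
(illustrative function name):

static void example_others_may_mount(struct gfs2_sbd *sdp)
{
        const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops;

        if (ops->lm_others_may_mount)
                ops->lm_others_may_mount(sdp->sd_lockstruct.ls_lockspace);
}
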
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 2686ad4c0029..1e252dfc5294 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -163,7 +163,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
if (error)
goto out;
- error = permission(dir, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
if (error)
goto out_gunlock;
@@ -669,7 +669,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
}
}
} else {
- error = permission(ndir, MAY_WRITE | MAY_EXEC, NULL);
+ error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
if (error)
goto out_gunlock;
@@ -704,7 +704,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
/* Check out the dir to be renamed */
if (dir_rename) {
- error = permission(odentry->d_inode, MAY_WRITE, NULL);
+ error = gfs2_permission(odentry->d_inode, MAY_WRITE);
if (error)
goto out_gunlock;
}
@@ -891,7 +891,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
* Returns: errno
*/
-static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
+int gfs2_permission(struct inode *inode, int mask)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder i_gh;
@@ -905,13 +905,22 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
unlock = 1;
}
- error = generic_permission(inode, mask, gfs2_check_acl);
+ if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
+ error = -EACCES;
+ else
+ error = generic_permission(inode, mask, gfs2_check_acl);
if (unlock)
gfs2_glock_dq_uninit(&i_gh);
return error;
}
+static int gfs2_iop_permission(struct inode *inode, int mask,
+ struct nameidata *nd)
+{
+ return gfs2_permission(inode, mask);
+}
+
static int setattr_size(struct inode *inode, struct iattr *attr)
{
struct gfs2_inode *ip = GFS2_I(inode);
@@ -1141,7 +1150,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
}
const struct inode_operations gfs2_file_iops = {
- .permission = gfs2_permission,
+ .permission = gfs2_iop_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
.setxattr = gfs2_setxattr,
@@ -1160,7 +1169,7 @@ const struct inode_operations gfs2_dir_iops = {
.rmdir = gfs2_rmdir,
.mknod = gfs2_mknod,
.rename = gfs2_rename,
- .permission = gfs2_permission,
+ .permission = gfs2_iop_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
.setxattr = gfs2_setxattr,
@@ -1172,7 +1181,7 @@ const struct inode_operations gfs2_dir_iops = {
const struct inode_operations gfs2_symlink_iops = {
.readlink = gfs2_readlink,
.follow_link = gfs2_follow_link,
- .permission = gfs2_permission,
+ .permission = gfs2_iop_permission,
.setattr = gfs2_setattr,
.getattr = gfs2_getattr,
.setxattr = gfs2_setxattr,
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 2278c68b7e35..f66ea0f7a356 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -1,6 +1,6 @@
/*
* Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
*
* This copyrighted material is made available to anyone wishing to use,
* modify, copy, or redistribute it subject to the terms and conditions
@@ -52,7 +52,7 @@ static int gfs2_write_inode(struct inode *inode, int sync)
struct gfs2_inode *ip = GFS2_I(inode);
/* Check this is a "normal" inode */
- if (inode->i_private) {
+ if (test_bit(GIF_USER, &ip->i_flags)) {
if (current->flags & PF_MEMALLOC)
return 0;
if (sync)
@@ -126,7 +126,7 @@ static void gfs2_put_super(struct super_block *sb)
gfs2_clear_rgrpd(sdp);
gfs2_jindex_free(sdp);
/* Take apart glock structures and buffer lists */
- gfs2_gl_hash_clear(sdp, WAIT);
+ gfs2_gl_hash_clear(sdp);
/* Unmount the locking protocol */
gfs2_lm_unmount(sdp);
@@ -155,7 +155,7 @@ static void gfs2_write_super(struct super_block *sb)
static int gfs2_sync_fs(struct super_block *sb, int wait)
{
sb->s_dirt = 0;
- if (wait)
+ if (wait && sb->s_fs_info)
gfs2_log_flush(sb->s_fs_info, NULL);
return 0;
}
@@ -297,8 +297,9 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
*/
static void gfs2_drop_inode(struct inode *inode)
{
- if (inode->i_private && inode->i_nlink) {
- struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_inode *ip = GFS2_I(inode);
+
+ if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
clear_nlink(inode);
@@ -314,12 +315,13 @@ static void gfs2_drop_inode(struct inode *inode)
static void gfs2_clear_inode(struct inode *inode)
{
+ struct gfs2_inode *ip = GFS2_I(inode);
+
/* This tells us its a "real" inode and not one which only
* serves to contain an address space (see rgrp.c, meta_io.c)
* which therefore doesn't have its own glocks.
*/
- if (inode->i_private) {
- struct gfs2_inode *ip = GFS2_I(inode);
+ if (test_bit(GIF_USER, &ip->i_flags)) {
ip->i_gl->gl_object = NULL;
gfs2_glock_schedule_for_reclaim(ip->i_gl);
gfs2_glock_put(ip->i_gl);
@@ -419,7 +421,7 @@ static void gfs2_delete_inode(struct inode *inode)
struct gfs2_holder gh;
int error;
- if (!inode->i_private)
+ if (!test_bit(GIF_USER, &ip->i_flags))
goto out;
error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 56aaf915c59a..3e073f5144fa 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -904,7 +904,7 @@ static int need_sync(struct gfs2_quota_data *qd)
do_sync = 0;
else {
value *= gfs2_jindex_size(sdp) * num;
- do_div(value, den);
+ value = div_s64(value, den);
value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
do_sync = 0;
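
Editor's note: do_div() is the unsigned 64-by-32 helper; it rewrites its dividend in
place and treats it as u64, so a negative quota delta came out garbled. div_s64() from
<linux/math64.h> performs the signed division and returns the quotient. The corrected
pattern in isolation, with illustrative parameter names:

#include <linux/math64.h>

static s64 example_scale(s64 value, s32 num, s32 den)
{
        value *= num;                   /* may legitimately be negative */
        return div_s64(value, den);     /* signed 64-bit / 32-bit */
}
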
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 2888e4b4b1c5..d5e91f4f6a0b 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -428,6 +428,9 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
unsigned int message)
{
+ if (!sdp->sd_lockstruct.ls_ops->lm_recovery_done)
+ return;
+
if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
sdp->sd_lockstruct.ls_ops->lm_recovery_done(
sdp->sd_lockstruct.ls_lockspace, jid, message);
@@ -505,7 +508,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
- GL_NOCANCEL | GL_NOCACHE, &t_gh);
+ GL_NOCACHE, &t_gh);
if (error)
goto fail_gunlock_ji;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7e8f0b1d6c6e..2d90fb253505 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -195,7 +195,7 @@ ulong_aligned:
depending on architecture. I've experimented with several ways
of writing this section such as using an else before the goto
but this one seems to be the fastest. */
- while ((unsigned char *)plong < end - 1) {
+ while ((unsigned char *)plong < end - sizeof(unsigned long)) {
prefetch(plong + 1);
if (((*plong) & LBITMASK) != lskipval)
break;
@@ -371,11 +371,6 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
spin_lock(&sdp->sd_rindex_spin);
sdp->sd_rindex_forward = NULL;
- head = &sdp->sd_rindex_recent_list;
- while (!list_empty(head)) {
- rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
- list_del(&rgd->rd_recent);
- }
spin_unlock(&sdp->sd_rindex_spin);
head = &sdp->sd_rindex_list;
@@ -945,107 +940,30 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
}
/**
- * recent_rgrp_first - get first RG from "recent" list
- * @sdp: The GFS2 superblock
- * @rglast: address of the rgrp used last
- *
- * Returns: The first rgrp in the recent list
- */
-
-static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
- u64 rglast)
-{
- struct gfs2_rgrpd *rgd;
-
- spin_lock(&sdp->sd_rindex_spin);
-
- if (rglast) {
- list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
- if (rgrp_contains_block(rgd, rglast))
- goto out;
- }
- }
- rgd = NULL;
- if (!list_empty(&sdp->sd_rindex_recent_list))
- rgd = list_entry(sdp->sd_rindex_recent_list.next,
- struct gfs2_rgrpd, rd_recent);
-out:
- spin_unlock(&sdp->sd_rindex_spin);
- return rgd;
-}
-
-/**
* recent_rgrp_next - get next RG from "recent" list
* @cur_rgd: current rgrp
- * @remove:
*
* Returns: The next rgrp in the recent list
*/
-static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd,
- int remove)
+static struct gfs2_rgrpd *recent_rgrp_next(struct gfs2_rgrpd *cur_rgd)
{
struct gfs2_sbd *sdp = cur_rgd->rd_sbd;
struct list_head *head;
struct gfs2_rgrpd *rgd;
spin_lock(&sdp->sd_rindex_spin);
-
- head = &sdp->sd_rindex_recent_list;
-
- list_for_each_entry(rgd, head, rd_recent) {
- if (rgd == cur_rgd) {
- if (cur_rgd->rd_recent.next != head)
- rgd = list_entry(cur_rgd->rd_recent.next,
- struct gfs2_rgrpd, rd_recent);
- else
- rgd = NULL;
-
- if (remove)
- list_del(&cur_rgd->rd_recent);
-
- goto out;
- }
+ head = &sdp->sd_rindex_mru_list;
+ if (unlikely(cur_rgd->rd_list_mru.next == head)) {
+ spin_unlock(&sdp->sd_rindex_spin);
+ return NULL;
}
-
- rgd = NULL;
- if (!list_empty(head))
- rgd = list_entry(head->next, struct gfs2_rgrpd, rd_recent);
-
-out:
+ rgd = list_entry(cur_rgd->rd_list_mru.next, struct gfs2_rgrpd, rd_list_mru);
spin_unlock(&sdp->sd_rindex_spin);
return rgd;
}
/**
- * recent_rgrp_add - add an RG to tail of "recent" list
- * @new_rgd: The rgrp to add
- *
- */
-
-static void recent_rgrp_add(struct gfs2_rgrpd *new_rgd)
-{
- struct gfs2_sbd *sdp = new_rgd->rd_sbd;
- struct gfs2_rgrpd *rgd;
- unsigned int count = 0;
- unsigned int max = sdp->sd_rgrps / gfs2_jindex_size(sdp);
-
- spin_lock(&sdp->sd_rindex_spin);
-
- list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
- if (rgd == new_rgd)
- goto out;
-
- if (++count >= max)
- goto out;
- }
- list_add_tail(&new_rgd->rd_recent, &sdp->sd_rindex_recent_list);
-
-out:
- spin_unlock(&sdp->sd_rindex_spin);
-}
-
-/**
* forward_rgrp_get - get an rgrp to try next from full list
* @sdp: The GFS2 superblock
*
@@ -1112,9 +1030,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
int loops = 0;
int error, rg_locked;
- /* Try recently successful rgrps */
-
- rgd = recent_rgrp_first(sdp, ip->i_goal);
+ rgd = gfs2_blk2rgrpd(sdp, ip->i_goal);
while (rgd) {
rg_locked = 0;
@@ -1136,11 +1052,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
gfs2_glock_dq_uninit(&al->al_rgd_gh);
if (inode)
return inode;
- rgd = recent_rgrp_next(rgd, 1);
- break;
-
+ /* fall through */
case GLR_TRYFAILED:
- rgd = recent_rgrp_next(rgd, 0);
+ rgd = recent_rgrp_next(rgd);
break;
default:
@@ -1199,7 +1113,9 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
out:
if (begin) {
- recent_rgrp_add(rgd);
+ spin_lock(&sdp->sd_rindex_spin);
+ list_move(&rgd->rd_list_mru, &sdp->sd_rindex_mru_list);
+ spin_unlock(&sdp->sd_rindex_spin);
rgd = gfs2_rgrpd_get_next(rgd);
if (!rgd)
rgd = gfs2_rgrpd_get_first(sdp);
@@ -1495,7 +1411,7 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
al->al_alloced += *n;
- gfs2_statfs_change(sdp, 0, -*n, 0);
+ gfs2_statfs_change(sdp, 0, -(s64)*n, 0);
gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid);
spin_lock(&sdp->sd_rindex_spin);
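
The rgrp hunks above collapse GFS2's bespoke "recent" list into a plain most-recently-used list: a successful allocation bumps the rgrp to the head with list_move() under the rindex spinlock, and the search walks forward from the current entry. A minimal sketch of that pattern, assuming only the kernel's list/spinlock primitives; struct rg, mru_list and mru_lock are invented names, not the kernel's:

#include <linux/list.h>
#include <linux/spinlock.h>

struct rg {
	struct list_head mru;
};

static LIST_HEAD(mru_list);
static DEFINE_SPINLOCK(mru_lock);

static void rg_mark_used(struct rg *rg)
{
	spin_lock(&mru_lock);
	list_move(&rg->mru, &mru_list);	/* head == most recently used */
	spin_unlock(&mru_lock);
}

static struct rg *rg_next_candidate(struct rg *cur)
{
	struct rg *next = NULL;

	spin_lock(&mru_lock);
	if (cur->mru.next != &mru_list)	/* stop at the end of the list */
		next = list_entry(cur->mru.next, struct rg, mru);
	spin_unlock(&mru_lock);
	return next;
}
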
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 7aeacbc65f35..63a8a902d9db 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -65,7 +65,6 @@ void gfs2_tune_init(struct gfs2_tune *gt)
gt->gt_quota_quantum = 60;
gt->gt_atime_quantum = 3600;
gt->gt_new_files_jdata = 0;
- gt->gt_new_files_directio = 0;
gt->gt_max_readahead = 1 << 18;
gt->gt_stall_secs = 600;
gt->gt_complain_secs = 10;
@@ -941,8 +940,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
}
error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
- LM_FLAG_PRIORITY | GL_NOCACHE,
- t_gh);
+ GL_NOCACHE, t_gh);
list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
error = gfs2_jdesc_check(jd);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 9ab9fc85ecd0..74846559fc3f 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -110,18 +110,6 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
return len;
}
-static ssize_t shrink_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-{
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- if (simple_strtol(buf, NULL, 0) != 1)
- return -EINVAL;
-
- gfs2_gl_hash_clear(sdp, NO_WAIT);
- return len;
-}
-
static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
{
@@ -175,7 +163,6 @@ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
GFS2_ATTR(id, 0444, id_show, NULL);
GFS2_ATTR(fsname, 0444, fsname_show, NULL);
GFS2_ATTR(freeze, 0644, freeze_show, freeze_store);
-GFS2_ATTR(shrink, 0200, NULL, shrink_store);
GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store);
GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store);
GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store);
@@ -186,7 +173,6 @@ static struct attribute *gfs2_attrs[] = {
&gfs2_attr_id.attr,
&gfs2_attr_fsname.attr,
&gfs2_attr_freeze.attr,
- &gfs2_attr_shrink.attr,
&gfs2_attr_withdraw.attr,
&gfs2_attr_statfs_sync.attr,
&gfs2_attr_quota_sync.attr,
@@ -426,7 +412,6 @@ TUNE_ATTR(max_readahead, 0);
TUNE_ATTR(complain_secs, 0);
TUNE_ATTR(statfs_slow, 0);
TUNE_ATTR(new_files_jdata, 0);
-TUNE_ATTR(new_files_directio, 0);
TUNE_ATTR(quota_simul_sync, 1);
TUNE_ATTR(quota_cache_secs, 1);
TUNE_ATTR(stall_secs, 1);
@@ -455,7 +440,6 @@ static struct attribute *tune_attrs[] = {
&tune_attr_quotad_secs.attr,
&tune_attr_quota_scale.attr,
&tune_attr_new_files_jdata.attr,
- &tune_attr_new_files_directio.attr,
NULL,
};
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index d53b2af91c25..67e1c8b467c4 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -65,6 +65,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
BUG();
return 0;
}
+ if (!tree)
+ return 0;
if (tree->node_size >= PAGE_CACHE_SIZE) {
nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT);
spin_lock(&tree->hash_lock);
diff --git a/fs/hppfs/Makefile b/fs/hppfs/Makefile
index 6890433f7595..3a982bd975d2 100644
--- a/fs/hppfs/Makefile
+++ b/fs/hppfs/Makefile
@@ -1,9 +1,6 @@
#
-# Copyright (C) 2002, 2003 Jeff Dike (jdike@karaya.com)
+# Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com)
# Licensed under the GPL
#
-hppfs-objs := hppfs_kern.o
-
-obj-y =
obj-$(CONFIG_HPPFS) += hppfs.o
diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs.c
index 8601d8ef3b55..65077aa90f0a 100644
--- a/fs/hppfs/hppfs_kern.c
+++ b/fs/hppfs/hppfs.c
@@ -33,7 +33,7 @@ struct hppfs_private {
};
struct hppfs_inode_info {
- struct dentry *proc_dentry;
+ struct dentry *proc_dentry;
struct inode vfs_inode;
};
@@ -52,7 +52,7 @@ static int is_pid(struct dentry *dentry)
int i;
sb = dentry->d_sb;
- if ((sb->s_op != &hppfs_sbops) || (dentry->d_parent != sb->s_root))
+ if (dentry->d_parent != sb->s_root)
return 0;
for (i = 0; i < dentry->d_name.len; i++) {
@@ -136,7 +136,7 @@ static int file_removed(struct dentry *dentry, const char *file)
}
static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry,
- struct nameidata *nd)
+ struct nameidata *nd)
{
struct dentry *proc_dentry, *new, *parent;
struct inode *inode;
@@ -254,6 +254,8 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
int err;
if (hppfs->contents != NULL) {
+ int rem;
+
if (*ppos >= hppfs->len)
return 0;
@@ -267,8 +269,10 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
if (off + count > hppfs->len)
count = hppfs->len - off;
- copy_to_user(buf, &data->contents[off], count);
- *ppos += count;
+ rem = copy_to_user(buf, &data->contents[off], count);
+ *ppos += count - rem;
+ if (rem > 0)
+ return -EFAULT;
} else if (hppfs->host_fd != -1) {
err = os_seek_file(hppfs->host_fd, *ppos);
if (err) {
@@ -285,21 +289,15 @@ static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count,
return count;
}
-static ssize_t hppfs_write(struct file *file, const char __user *buf, size_t len,
- loff_t *ppos)
+static ssize_t hppfs_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
{
struct hppfs_private *data = file->private_data;
struct file *proc_file = data->proc_file;
ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *);
- int err;
write = proc_file->f_path.dentry->d_inode->i_fop->write;
-
- proc_file->f_pos = file->f_pos;
- err = (*write)(proc_file, buf, len, &proc_file->f_pos);
- file->f_pos = proc_file->f_pos;
-
- return err;
+ return (*write)(proc_file, buf, len, ppos);
}
static int open_host_sock(char *host_file, int *filter_out)
@@ -357,7 +355,7 @@ static struct hppfs_data *hppfs_get_data(int fd, int filter,
if (filter) {
while ((n = read_proc(proc_file, data->contents,
- sizeof(data->contents), NULL, 0)) > 0)
+ sizeof(data->contents), NULL, 0)) > 0)
os_write_file(fd, data->contents, n);
err = os_shutdown_socket(fd, 0, 1);
if (err) {
@@ -429,8 +427,8 @@ static int file_mode(int fmode)
static int hppfs_open(struct inode *inode, struct file *file)
{
struct hppfs_private *data;
- struct dentry *proc_dentry;
struct vfsmount *proc_mnt;
+ struct dentry *proc_dentry;
char *host_file;
int err, fd, type, filter;
@@ -492,8 +490,8 @@ static int hppfs_open(struct inode *inode, struct file *file)
static int hppfs_dir_open(struct inode *inode, struct file *file)
{
struct hppfs_private *data;
- struct dentry *proc_dentry;
struct vfsmount *proc_mnt;
+ struct dentry *proc_dentry;
int err;
err = -ENOMEM;
@@ -620,6 +618,9 @@ static struct inode *hppfs_alloc_inode(struct super_block *sb)
void hppfs_delete_inode(struct inode *ino)
{
+ dput(HPPFS_I(ino)->proc_dentry);
+ mntput(ino->i_sb->s_fs_info);
+
clear_inode(ino);
}
@@ -628,69 +629,46 @@ static void hppfs_destroy_inode(struct inode *inode)
kfree(HPPFS_I(inode));
}
-static void hppfs_put_super(struct super_block *sb)
-{
- mntput(sb->s_fs_info);
-}
-
static const struct super_operations hppfs_sbops = {
.alloc_inode = hppfs_alloc_inode,
.destroy_inode = hppfs_destroy_inode,
.delete_inode = hppfs_delete_inode,
.statfs = hppfs_statfs,
- .put_super = hppfs_put_super,
};
static int hppfs_readlink(struct dentry *dentry, char __user *buffer,
int buflen)
{
- struct file *proc_file;
struct dentry *proc_dentry;
- struct vfsmount *proc_mnt;
- int ret;
proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
- proc_mnt = dentry->d_sb->s_fs_info;
-
- proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), O_RDONLY);
- if (IS_ERR(proc_file))
- return PTR_ERR(proc_file);
-
- ret = proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, buflen);
-
- fput(proc_file);
-
- return ret;
+ return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer,
+ buflen);
}
-static void* hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd)
{
- struct file *proc_file;
struct dentry *proc_dentry;
- struct vfsmount *proc_mnt;
- void *ret;
proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry;
- proc_mnt = dentry->d_sb->s_fs_info;
-
- proc_file = dentry_open(dget(proc_dentry), mntget(proc_mnt), O_RDONLY);
- if (IS_ERR(proc_file))
- return proc_file;
-
- ret = proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd);
- fput(proc_file);
+ return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd);
+}
- return ret;
+int hppfs_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+ return generic_permission(inode, mask, NULL);
}
static const struct inode_operations hppfs_dir_iops = {
.lookup = hppfs_lookup,
+ .permission = hppfs_permission,
};
static const struct inode_operations hppfs_link_iops = {
.readlink = hppfs_readlink,
.follow_link = hppfs_follow_link,
+ .permission = hppfs_permission,
};
static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
@@ -712,7 +690,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
inode->i_fop = &hppfs_file_fops;
}
- HPPFS_I(inode)->proc_dentry = dentry;
+ HPPFS_I(inode)->proc_dentry = dget(dentry);
inode->i_uid = proc_ino->i_uid;
inode->i_gid = proc_ino->i_gid;
@@ -725,7 +703,7 @@ static struct inode *get_inode(struct super_block *sb, struct dentry *dentry)
inode->i_size = proc_ino->i_size;
inode->i_blocks = proc_ino->i_blocks;
- return 0;
+ return inode;
}
static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
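
Among the hppfs fixes above, the hppfs_read() change is worth isolating: copy_to_user() returns the number of bytes it could NOT copy, so the file position must advance only by what actually reached user space, with -EFAULT reported for any shortfall. A hedged sketch of that pattern; buf_read() and its arguments are invented:

#include <linux/uaccess.h>
#include <linux/errno.h>

static ssize_t buf_read(char __user *buf, const char *src, size_t count,
			loff_t *ppos)
{
	unsigned long rem = copy_to_user(buf, src + *ppos, count);

	*ppos += count - rem;		/* advance past copied bytes only */
	if (rem > 0)
		return -EFAULT;		/* partial copy: fault in user buffer */
	return count;
}
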
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index cd931ef1f000..5a8ca61498ca 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -470,7 +470,9 @@ void journal_commit_transaction(journal_t *journal)
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
*/
+ spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_COMMIT;
+ spin_unlock(&journal->j_state_lock);
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
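
The jbd hunk above takes j_state_lock around the T_COMMIT transition because other code samples t_state under that lock. The reader side of the same rule might look like this sketch (the helper name is invented; journal_t, transaction_t and T_COMMIT are the jbd types used above):

static int transaction_is_committing(journal_t *journal, transaction_t *t)
{
	int ret;

	spin_lock(&journal->j_state_lock);
	ret = (t->t_state == T_COMMIT);	/* consistent with the locked write */
	spin_unlock(&journal->j_state_lock);
	return ret;
}
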
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 6914598022ce..91389c8aee8a 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
J_ASSERT(transaction->t_state == T_FINISHED);
J_ASSERT(transaction->t_buffers == NULL);
- J_ASSERT(transaction->t_sync_datalist == NULL);
J_ASSERT(transaction->t_forget == NULL);
J_ASSERT(transaction->t_iobuf_list == NULL);
J_ASSERT(transaction->t_shadow_list == NULL);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index e0139786f717..f8b3be873226 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -22,6 +22,8 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
}
/*
- * When an ext3-ordered file is truncated, it is possible that many pages are
- * not sucessfully freed, because they are attached to a committing transaction.
+ * When an ext4 file is truncated, it is possible that some pages are not
+ * successfully freed, because they are attached to a committing transaction.
* After the transaction commits, these pages are left on the LRU, with no
* ->mapping, and with attached buffers. These pages are trivially reclaimable
* by the VM, but their apparent absence upsets the VM accounting, and it makes
@@ -80,21 +82,6 @@ nope:
}
/*
- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
- * held. For ranking reasons we must trylock. If we lose, schedule away and
- * return 0. j_list_lock is dropped in this case.
- */
-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
-{
- if (!jbd_trylock_bh_state(bh)) {
- spin_unlock(&journal->j_list_lock);
- schedule();
- return 0;
- }
- return 1;
-}
-
-/*
* Done it all: now submit the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
* mode we can now just skip the rest of the journal write
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
struct buffer_head *bh;
int ret;
int barrier_done = 0;
+ struct timespec now = current_kernel_time();
if (is_journal_aborted(journal))
return 0;
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+ tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
+ tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
if (JBD2_HAS_COMPAT_FEATURE(journal,
JBD2_FEATURE_COMPAT_CHECKSUM)) {
@@ -168,6 +158,7 @@ static int journal_submit_commit_record(journal_t *journal,
spin_unlock(&journal->j_state_lock);
/* And try again, without the barrier */
+ lock_buffer(bh);
set_buffer_uptodate(bh);
set_buffer_dirty(bh);
ret = submit_bh(WRITE, bh);
@@ -196,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
}
/*
- * Wait for all submitted IO to complete.
+ * write the filemap data using writepage() address_space_operations.
+ * We don't do block allocation here even for delalloc. We don't
+ * use writepages() because with delayed allocation we may be doing
+ * block allocation in writepages().
*/
-static int journal_wait_on_locked_list(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
- int ret = 0;
- struct journal_head *jh;
-
- while (commit_transaction->t_locked_list) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_locked_list->b_tprev;
- bh = jh2bh(jh);
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- ret = -EIO;
- spin_lock(&journal->j_list_lock);
- }
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
- spin_lock(&journal->j_list_lock);
- continue;
- }
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- jbd2_journal_remove_journal_head(bh);
- put_bh(bh);
- } else {
- jbd_unlock_bh_state(bh);
- }
- put_bh(bh);
- cond_resched_lock(&journal->j_list_lock);
- }
+ int ret;
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_ALL,
+ .nr_to_write = mapping->nrpages * 2,
+ .range_start = 0,
+ .range_end = i_size_read(mapping->host),
+ .for_writepages = 1,
+ };
+
+ ret = generic_writepages(mapping, &wbc);
return ret;
- }
+}
-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
+/*
+ * Submit the data buffers of all inodes associated with the transaction
+ * to disk.
+ *
+ * We are in a committing transaction. Therefore no new inode can be added to
+ * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
+ * are currently operating on from being released while we write out its pages.
+ */
+static int journal_submit_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- int i;
+ struct jbd2_inode *jinode;
+ int err, ret = 0;
+ struct address_space *mapping;
- for (i = 0; i < bufs; i++) {
- wbuf[i]->b_end_io = end_buffer_write_sync;
- /* We use-up our safety reference in submit_bh() */
- submit_bh(WRITE, wbuf[i]);
+ spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ mapping = jinode->i_vfs_inode->i_mapping;
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ /*
+ * Submit the inode data buffers. We use writepage
+ * instead of writepages because writepages can do
+ * block allocation with delalloc, and we need to write
+ * only already-allocated blocks here.
+ */
+ err = journal_submit_inode_data_buffers(mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ J_ASSERT(jinode->i_transaction == commit_transaction);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
}
+ spin_unlock(&journal->j_list_lock);
+ return ret;
}
/*
- * Submit all the data buffers to disk
+ * Wait for data submitted for writeout, and refile inodes to the
+ * proper transaction if needed.
+ *
*/
-static void journal_submit_data_buffers(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_finish_inode_data_buffers(journal_t *journal,
+ transaction_t *commit_transaction)
{
- struct journal_head *jh;
- struct buffer_head *bh;
- int locked;
- int bufs = 0;
- struct buffer_head **wbuf = journal->j_wbuf;
+ struct jbd2_inode *jinode, *next_i;
+ int err, ret = 0;
- /*
- * Whenever we unlock the journal and sleep, things can get added
- * onto ->t_sync_datalist, so we have to keep looping back to
- * write_out_data until we *know* that the list is empty.
- *
- * Cleanup any flushed data buffers from the data list. Even in
- * abort mode, we want to flush this out as soon as possible.
- */
-write_out_data:
- cond_resched();
+ /* For locking, see the comment in journal_submit_data_buffers() */
spin_lock(&journal->j_list_lock);
+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
+ jinode->i_flags |= JI_COMMIT_RUNNING;
+ spin_unlock(&journal->j_list_lock);
+ err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
+ if (!ret)
+ ret = err;
+ spin_lock(&journal->j_list_lock);
+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ }
- while (commit_transaction->t_sync_datalist) {
- jh = commit_transaction->t_sync_datalist;
- bh = jh2bh(jh);
- locked = 0;
-
- /* Get reference just to make sure buffer does not disappear
- * when we are forced to drop various locks */
- get_bh(bh);
- /* If the buffer is dirty, we need to submit IO and hence
- * we need the buffer lock. We try to lock the buffer without
- * blocking. If we fail, we need to drop j_list_lock and do
- * blocking lock_buffer().
- */
- if (buffer_dirty(bh)) {
- if (test_set_buffer_locked(bh)) {
- BUFFER_TRACE(bh, "needs blocking lock");
- spin_unlock(&journal->j_list_lock);
- /* Write out all data to prevent deadlocks */
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- lock_buffer(bh);
- spin_lock(&journal->j_list_lock);
- }
- locked = 1;
- }
- /* We have to get bh_state lock. Again out of order, sigh. */
- if (!inverted_lock(journal, bh)) {
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- }
- /* Someone already cleaned up the buffer? */
- if (!buffer_jbd(bh)
- || jh->b_transaction != commit_transaction
- || jh->b_jlist != BJ_SyncData) {
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- BUFFER_TRACE(bh, "already cleaned up");
- put_bh(bh);
- continue;
- }
- if (locked && test_clear_buffer_dirty(bh)) {
- BUFFER_TRACE(bh, "needs writeout, adding to array");
- wbuf[bufs++] = bh;
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- if (bufs == journal->j_wbufsize) {
- spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
- bufs = 0;
- goto write_out_data;
- }
- } else if (!locked && buffer_locked(bh)) {
- __jbd2_journal_file_buffer(jh, commit_transaction,
- BJ_Locked);
- jbd_unlock_bh_state(bh);
- put_bh(bh);
+ /* Now refile inode to proper lists */
+ list_for_each_entry_safe(jinode, next_i,
+ &commit_transaction->t_inode_list, i_list) {
+ list_del(&jinode->i_list);
+ if (jinode->i_next_transaction) {
+ jinode->i_transaction = jinode->i_next_transaction;
+ jinode->i_next_transaction = NULL;
+ list_add(&jinode->i_list,
+ &jinode->i_transaction->t_inode_list);
} else {
- BUFFER_TRACE(bh, "writeout complete: unfile");
- __jbd2_journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- if (locked)
- unlock_buffer(bh);
- jbd2_journal_remove_journal_head(bh);
- /* Once for our safety reference, once for
- * jbd2_journal_remove_journal_head() */
- put_bh(bh);
- put_bh(bh);
- }
-
- if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
- spin_unlock(&journal->j_list_lock);
- goto write_out_data;
+ jinode->i_transaction = NULL;
}
}
spin_unlock(&journal->j_list_lock);
- journal_do_submit_data(wbuf, bufs);
+
+ return ret;
}
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
@@ -523,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
* Now start flushing things to disk, in the order they appear
* on the transaction lists. Data blocks go first.
*/
- err = 0;
- journal_submit_data_buffers(journal, commit_transaction);
-
- /*
- * Wait for all previously submitted IO to complete if commit
- * record is to be written synchronously.
- */
- spin_lock(&journal->j_list_lock);
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
-
- spin_unlock(&journal->j_list_lock);
-
+ err = journal_submit_data_buffers(journal, commit_transaction);
if (err)
jbd2_journal_abort(journal, err);
@@ -546,21 +468,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
jbd_debug(3, "JBD: commit phase 2\n");
/*
- * If we found any dirty or locked buffers, then we should have
- * looped back up to the write_out_data label. If there weren't
- * any then journal_clean_data_list should have wiped the list
- * clean by now, so check that it is in fact empty.
- */
- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
-
- jbd_debug (3, "JBD: commit phase 3\n");
-
- /*
* Way to go: we have now written out all of the data for a
* transaction! Now comes the tricky part: we need to write out
* metadata. Loop over the transaction's entire buffer list:
*/
+ spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_COMMIT;
+ spin_unlock(&journal->j_state_lock);
stats.u.run.rs_logging = jiffies;
stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
@@ -571,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
J_ASSERT(commit_transaction->t_nr_buffers <=
commit_transaction->t_outstanding_credits);
+ err = 0;
descriptor = NULL;
bufs = 0;
while (commit_transaction->t_buffers) {
@@ -745,15 +660,19 @@ start_journal_io:
&cbh, crc32_sum);
if (err)
__jbd2_journal_abort_hard(journal);
-
- spin_lock(&journal->j_list_lock);
- err = journal_wait_on_locked_list(journal,
- commit_transaction);
- spin_unlock(&journal->j_list_lock);
- if (err)
- __jbd2_journal_abort_hard(journal);
}
+ /*
+ * This is the right place to wait for data buffers both for ASYNC
+ * and !ASYNC commit. If commit is ASYNC, we need to wait only after
+ * the commit block went to disk (which happens above). If commit is
+ * SYNC, we need to wait for data buffers before we start writing the
+ * commit block, which happens below in that case.
+ */
+ err = journal_finish_inode_data_buffers(journal, commit_transaction);
+ if (err)
+ jbd2_journal_abort(journal, err);
+
/* Lo and behold: we have just managed to send a transaction to
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
@@ -765,7 +684,7 @@ start_journal_io:
so we incur less scheduling load.
*/
- jbd_debug(3, "JBD: commit phase 4\n");
+ jbd_debug(3, "JBD: commit phase 3\n");
/*
* akpm: these are BJ_IO, and j_list_lock is not needed.
@@ -824,7 +743,7 @@ wait_for_iobuf:
J_ASSERT (commit_transaction->t_shadow_list == NULL);
- jbd_debug(3, "JBD: commit phase 5\n");
+ jbd_debug(3, "JBD: commit phase 4\n");
/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
@@ -851,7 +770,7 @@ wait_for_iobuf:
/* AKPM: bforget here */
}
- jbd_debug(3, "JBD: commit phase 6\n");
+ jbd_debug(3, "JBD: commit phase 5\n");
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
@@ -871,9 +790,9 @@ wait_for_iobuf:
transaction can be removed from any checkpoint list it was on
before. */
- jbd_debug(3, "JBD: commit phase 7\n");
+ jbd_debug(3, "JBD: commit phase 6\n");
- J_ASSERT(commit_transaction->t_sync_datalist == NULL);
+ J_ASSERT(list_empty(&commit_transaction->t_inode_list));
J_ASSERT(commit_transaction->t_buffers == NULL);
J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
J_ASSERT(commit_transaction->t_iobuf_list == NULL);
@@ -994,7 +913,7 @@ restart_loop:
/* Done with this transaction! */
- jbd_debug(3, "JBD: commit phase 8\n");
+ jbd_debug(3, "JBD: commit phase 7\n");
J_ASSERT(commit_transaction->t_state == T_COMMIT);
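
Condensing the two new jbd2 helpers above, the per-inode ordered-data flow is: flag the inode, submit its already-allocated pages with WB_SYNC_ALL writeback, wait for them, then clear the flag and wake any waiters. A hedged sketch under the patch's own types, with error handling trimmed and the j_list_lock protection of i_flags (present in the real code) elided:

#include <linux/writeback.h>

static int order_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};
	int err;

	jinode->i_flags |= JI_COMMIT_RUNNING;	/* pin against release */
	err = generic_writepages(mapping, &wbc);	/* no new allocation */
	if (!err)
		err = filemap_fdatawait(mapping);
	jinode->i_flags &= ~JI_COMMIT_RUNNING;
	wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	return err;
}
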
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 53632e3e8457..b26c6d9fe6ae 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
-EXPORT_SYMBOL(jbd2_journal_dirty_data);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_release_buffer);
EXPORT_SYMBOL(jbd2_journal_forget);
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
EXPORT_SYMBOL(jbd2_journal_invalidatepage);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
+EXPORT_SYMBOL(jbd2_journal_file_inode);
+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
static void __journal_abort_soft (journal_t *journal, int errno);
@@ -901,7 +904,7 @@ static void jbd2_stats_proc_init(journal_t *journal)
{
char name[BDEVNAME_SIZE];
- snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name));
+ bdevname(journal->j_dev, name);
journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats);
if (journal->j_proc_entry) {
proc_create_data("history", S_IRUGO, journal->j_proc_entry,
@@ -915,7 +918,7 @@ static void jbd2_stats_proc_exit(journal_t *journal)
{
char name[BDEVNAME_SIZE];
- snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name));
+ bdevname(journal->j_dev, name);
remove_proc_entry("info", journal->j_proc_entry);
remove_proc_entry("history", journal->j_proc_entry);
remove_proc_entry(name, proc_jbd2_stats);
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
}
/*
+ * Initialize jbd inode head
+ */
+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
+{
+ jinode->i_transaction = NULL;
+ jinode->i_next_transaction = NULL;
+ jinode->i_vfs_inode = inode;
+ jinode->i_flags = 0;
+ INIT_LIST_HEAD(&jinode->i_list);
+}
+
+/*
+ * Function to be called before we start removing inode from memory (i.e.,
+ * clear_inode() is a fine place to be called from). It removes inode from
+ * transaction's lists.
+ */
+void jbd2_journal_release_jbd_inode(journal_t *journal,
+ struct jbd2_inode *jinode)
+{
+ int writeout = 0;
+
+ if (!journal)
+ return;
+restart:
+ spin_lock(&journal->j_list_lock);
+ /* Is commit writing out inode - we have to wait */
+ if (jinode->i_flags & JI_COMMIT_RUNNING) {
+ wait_queue_head_t *wq;
+ DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
+ wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&journal->j_list_lock);
+ schedule();
+ finish_wait(wq, &wait.wait);
+ goto restart;
+ }
+
+ /* Do we need to wait for data writeback? */
+ if (journal->j_committing_transaction == jinode->i_transaction)
+ writeout = 1;
+ if (jinode->i_transaction) {
+ list_del(&jinode->i_list);
+ jinode->i_transaction = NULL;
+ }
+ spin_unlock(&journal->j_list_lock);
+}
+
+/*
* debugfs tunables
*/
#ifdef CONFIG_JBD2_DEBUG
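
A client filesystem would be expected to pair the two lifecycle hooks added above roughly as follows: embed a jbd2_inode in its own inode, initialize it when the inode is set up, and release it before the inode goes away. The myfs_* names are invented; this is a sketch, not ext4's actual call sites:

struct myfs_inode {
	struct jbd2_inode jinode;	/* ordered-data bookkeeping */
	struct inode vfs_inode;
};

static void myfs_inode_init(struct myfs_inode *mi)
{
	jbd2_journal_init_jbd_inode(&mi->jinode, &mi->vfs_inode);
}

static void myfs_clear_inode(struct inode *inode, journal_t *journal)
{
	struct myfs_inode *mi = container_of(inode, struct myfs_inode,
					     vfs_inode);

	/* waits for JI_COMMIT_RUNNING and unlinks from transaction lists */
	jbd2_journal_release_jbd_inode(journal, &mi->jinode);
}
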
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 5d0405a9e7ca..058f50f65b76 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -344,6 +344,7 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
*crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
obh->b_size);
}
+ put_bh(obh);
}
return 0;
}
@@ -610,9 +611,8 @@ static int do_one_pass(journal_t *journal,
chksum_err = chksum_seen = 0;
if (info->end_transaction) {
- printk(KERN_ERR "JBD: Transaction %u "
- "found to be corrupt.\n",
- next_commit_ID - 1);
+ journal->j_failed_commit =
+ info->end_transaction;
brelse(bh);
break;
}
@@ -643,10 +643,8 @@ static int do_one_pass(journal_t *journal,
if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
- printk(KERN_ERR
- "JBD: Transaction %u "
- "found to be corrupt.\n",
- next_commit_ID);
+ journal->j_failed_commit =
+ next_commit_ID;
brelse(bh);
break;
}
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e67804..4f7cadbb19fa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
* new transaction and we can't block without protecting against other
* processes trying to touch the journal while it is in transition.
*
- * Called under j_state_lock
*/
static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
transaction->t_tid = journal->j_transaction_sequence++;
transaction->t_expires = jiffies + journal->j_commit_interval;
spin_lock_init(&transaction->t_handle_lock);
+ INIT_LIST_HEAD(&transaction->t_inode_list);
/* Set up the commit timer for the new transaction. */
journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -943,183 +943,6 @@ out:
}
/**
- * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
- * needs to be flushed before we can commit the
- * current transaction.
- * @handle: transaction
- * @bh: bufferhead to mark
- *
- * The buffer is placed on the transaction's data list and is marked as
- * belonging to the transaction.
- *
- * Returns error number or 0 on success.
- *
- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
- * by kswapd.
- */
-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
-{
- journal_t *journal = handle->h_transaction->t_journal;
- int need_brelse = 0;
- struct journal_head *jh;
-
- if (is_handle_aborted(handle))
- return 0;
-
- jh = jbd2_journal_add_journal_head(bh);
- JBUFFER_TRACE(jh, "entry");
-
- /*
- * The buffer could *already* be dirty. Writeout can start
- * at any time.
- */
- jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
-
- /*
- * What if the buffer is already part of a running transaction?
- *
- * There are two cases:
- * 1) It is part of the current running transaction. Refile it,
- * just in case we have allocated it as metadata, deallocated
- * it, then reallocated it as data.
- * 2) It is part of the previous, still-committing transaction.
- * If all we want to do is to guarantee that the buffer will be
- * written to disk before this new transaction commits, then
- * being sure that the *previous* transaction has this same
- * property is sufficient for us! Just leave it on its old
- * transaction.
- *
- * In case (2), the buffer must not already exist as metadata
- * --- that would violate write ordering (a transaction is free
- * to write its data at any point, even before the previous
- * committing transaction has committed). The caller must
- * never, ever allow this to happen: there's nothing we can do
- * about it in this layer.
- */
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
-
- /* Now that we have bh_state locked, are we really still mapped? */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
- goto no_journal;
- }
-
- if (jh->b_transaction) {
- JBUFFER_TRACE(jh, "has transaction");
- if (jh->b_transaction != handle->h_transaction) {
- JBUFFER_TRACE(jh, "belongs to older transaction");
- J_ASSERT_JH(jh, jh->b_transaction ==
- journal->j_committing_transaction);
-
- /* @@@ IS THIS TRUE ? */
- /*
- * Not any more. Scenario: someone does a write()
- * in data=journal mode. The buffer's transaction has
- * moved into commit. Then someone does another
- * write() to the file. We do the frozen data copyout
- * and set b_next_transaction to point to j_running_t.
- * And while we're in that state, someone does a
- * writepage() in an attempt to pageout the same area
- * of the file via a shared mapping. At present that
- * calls jbd2_journal_dirty_data(), and we get right here.
- * It may be too late to journal the data. Simply
- * falling through to the next test will suffice: the
- * data will be dirty and wil be checkpointed. The
- * ordering comments in the next comment block still
- * apply.
- */
- //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
-
- /*
- * If we're journalling data, and this buffer was
- * subject to a write(), it could be metadata, forget
- * or shadow against the committing transaction. Now,
- * someone has dirtied the same darn page via a mapping
- * and it is being writepage()'d.
- * We *could* just steal the page from commit, with some
- * fancy locking there. Instead, we just skip it -
- * don't tie the page's buffers to the new transaction
- * at all.
- * Implication: if we crash before the writepage() data
- * is written into the filesystem, recovery will replay
- * the write() data.
- */
- if (jh->b_jlist != BJ_None &&
- jh->b_jlist != BJ_SyncData &&
- jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "Not stealing");
- goto no_journal;
- }
-
- /*
- * This buffer may be undergoing writeout in commit. We
- * can't return from here and let the caller dirty it
- * again because that can cause the write-out loop in
- * commit to never terminate.
- */
- if (buffer_dirty(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- need_brelse = 1;
- sync_dirty_buffer(bh);
- jbd_lock_bh_state(bh);
- spin_lock(&journal->j_list_lock);
- /* Since we dropped the lock... */
- if (!buffer_mapped(bh)) {
- JBUFFER_TRACE(jh, "buffer got unmapped");
- goto no_journal;
- }
- /* The buffer may become locked again at any
- time if it is redirtied */
- }
-
- /* journal_clean_data_list() may have got there first */
- if (jh->b_transaction != NULL) {
- JBUFFER_TRACE(jh, "unfile from commit");
- __jbd2_journal_temp_unlink_buffer(jh);
- /* It still points to the committing
- * transaction; move it to this one so
- * that the refile assert checks are
- * happy. */
- jh->b_transaction = handle->h_transaction;
- }
- /* The buffer will be refiled below */
-
- }
- /*
- * Special case --- the buffer might actually have been
- * allocated and then immediately deallocated in the previous,
- * committing transaction, so might still be left on that
- * transaction's metadata lists.
- */
- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
- JBUFFER_TRACE(jh, "not on correct data list: unfile");
- J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
- __jbd2_journal_temp_unlink_buffer(jh);
- jh->b_transaction = handle->h_transaction;
- JBUFFER_TRACE(jh, "file as data");
- __jbd2_journal_file_buffer(jh, handle->h_transaction,
- BJ_SyncData);
- }
- } else {
- JBUFFER_TRACE(jh, "not on a transaction");
- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
- }
-no_journal:
- spin_unlock(&journal->j_list_lock);
- jbd_unlock_bh_state(bh);
- if (need_brelse) {
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- }
- JBUFFER_TRACE(jh, "exit");
- jbd2_journal_put_journal_head(jh);
- return 0;
-}
-
-/**
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
* Remove a buffer from the appropriate transaction list.
*
* Note that this function can *change* the value of
- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
- * is holding onto a copy of one of thee pointers, it could go bad.
- * Generally the caller needs to re-read the pointer from the transaction_t.
+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
+ * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
+ * of these pointers, it could go bad. Generally the caller needs to re-read
+ * the pointer from the transaction_t.
*
* Called under j_list_lock. The journal may not be locked.
*/
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
switch (jh->b_jlist) {
case BJ_None:
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers--;
J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
goto out;
spin_lock(&journal->j_list_lock);
- if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
- /* A written-back ordered data buffer */
- JBUFFER_TRACE(jh, "release data");
- __jbd2_journal_unfile_buffer(jh);
- jbd2_journal_remove_journal_head(bh);
- __brelse(bh);
- }
- } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
+ if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
return;
}
+/*
+ * jbd2_journal_try_to_free_buffers() could race with
+ * jbd2_journal_commit_transaction(). The latter might still hold the
+ * reference count to the buffers when inspecting them on
+ * t_sync_datalist or t_locked_list.
+ *
+ * jbd2_journal_try_to_free_buffers() will call this function to
+ * wait for the current transaction to finish syncing data buffers, before
+ * trying to free those buffers.
+ *
+ * Called without journal->j_state_lock held; it is taken internally.
+ */
+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
+{
+ transaction_t *transaction;
+ tid_t tid;
+
+ spin_lock(&journal->j_state_lock);
+ transaction = journal->j_committing_transaction;
+
+ if (!transaction) {
+ spin_unlock(&journal->j_state_lock);
+ return;
+ }
+
+ tid = transaction->t_tid;
+ spin_unlock(&journal->j_state_lock);
+ jbd2_log_wait_commit(journal, tid);
+}
/**
* int jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
- * @unused_gfp_mask: unused
+ * @gfp_mask: we use the mask to detect how hard we should try to release
+ * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for commit code to
+ * release the buffers.
*
*
* For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
+ *
+ * Return 0 on failure, 1 on success
*/
int jbd2_journal_try_to_free_buffers(journal_t *journal,
- struct page *page, gfp_t unused_gfp_mask)
+ struct page *page, gfp_t gfp_mask)
{
struct buffer_head *head;
struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
/*
* We take our own ref against the journal_head here to avoid
* having to add tons of locking around each instance of
- * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
+ * jbd2_journal_remove_journal_head() and
+ * jbd2_journal_put_journal_head().
*/
jh = jbd2_journal_grab_journal_head(bh);
if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
if (buffer_jbd(bh))
goto busy;
} while ((bh = bh->b_this_page) != head);
+
ret = try_to_free_buffers(page);
+
+ /*
+ * There are a number of places where jbd2_journal_try_to_free_buffers()
+ * could race with jbd2_journal_commit_transaction(); the latter still
+ * holds the reference to the buffers to free while processing them.
+ * try_to_free_buffers() then fails to free those buffers. Some callers
+ * of releasepage() need the page buffers to be dropped and otherwise
+ * treat the failure to free as an error (such as generic_file_direct_IO())
+ *
+ * So, if the caller of try_to_release_page() wants the synchronous
+ * behaviour (i.e., make sure buffers are dropped upon return),
+ * let's wait for the current transaction to finish flushing its
+ * dirty data buffers, then try to free those buffers again,
+ * with the journal locked.
+ */
+ if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
+ jbd2_journal_wait_for_transaction_sync_data(journal);
+ ret = try_to_free_buffers(page);
+ }
+
busy:
return ret;
}
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
if (!buffer_jbd(bh))
goto zap_buffer_unlocked;
+ /* OK, we have a data buffer in journaled mode */
spin_lock(&journal->j_state_lock);
jbd_lock_bh_state(bh);
spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
}
} else if (transaction == journal->j_committing_transaction) {
JBUFFER_TRACE(jh, "on committing transaction");
- if (jh->b_jlist == BJ_Locked) {
- /*
- * The buffer is on the committing transaction's locked
- * list. We have the buffer locked, so I/O has
- * completed. So we can nail the buffer now.
- */
- may_free = __dispose_buffer(jh, transaction);
- goto zap_buffer;
- }
/*
* If it is committing, we simply cannot touch it. We
* can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
J_ASSERT_JH(jh, !jh->b_committed_data);
J_ASSERT_JH(jh, !jh->b_frozen_data);
return;
- case BJ_SyncData:
- list = &transaction->t_sync_datalist;
- break;
case BJ_Metadata:
transaction->t_nr_buffers++;
list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
case BJ_Reserved:
list = &transaction->t_reserved_list;
break;
- case BJ_Locked:
- list = &transaction->t_locked_list;
- break;
}
__blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
spin_unlock(&journal->j_list_lock);
__brelse(bh);
}
+
+/*
+ * File inode in the inode list of the handle's transaction
+ */
+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
+{
+ transaction_t *transaction = handle->h_transaction;
+ journal_t *journal = transaction->t_journal;
+
+ if (is_handle_aborted(handle))
+ return -EIO;
+
+ jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
+ transaction->t_tid);
+
+ /*
+ * First check whether inode isn't already on the transaction's
+ * lists without taking the lock. Note that this check is safe
+ * without the lock as we cannot race with somebody removing inode
+ * from the transaction. The reason is that we remove inode from the
+ * transaction only in jbd2_journal_release_jbd_inode() and when we commit
+ * the transaction. We are guarded from the first case by holding
+ * a reference to the inode. We are safe against the second case
+ * because if jinode->i_transaction == transaction, commit code
+ * cannot touch the transaction because we hold reference to it,
+ * and if jinode->i_next_transaction == transaction, commit code
+ * will only file the inode where we want it.
+ */
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ return 0;
+
+ spin_lock(&journal->j_list_lock);
+
+ if (jinode->i_transaction == transaction ||
+ jinode->i_next_transaction == transaction)
+ goto done;
+
+ /* On some different transaction's list - should be
+ * the committing one */
+ if (jinode->i_transaction) {
+ J_ASSERT(jinode->i_next_transaction == NULL);
+ J_ASSERT(jinode->i_transaction ==
+ journal->j_committing_transaction);
+ jinode->i_next_transaction = transaction;
+ goto done;
+ }
+ /* Not on any transaction list... */
+ J_ASSERT(!jinode->i_next_transaction);
+ jinode->i_transaction = transaction;
+ list_add(&jinode->i_list, &transaction->t_inode_list);
+done:
+ spin_unlock(&journal->j_list_lock);
+
+ return 0;
+}
+
+/*
+ * This function must be called when an inode is journaled in ordered mode,
+ * before truncation happens. It starts writeout of the truncated part in
+ * case it is in the committing transaction, so that we uphold the ordered
+ * mode consistency guarantees.
+ */
+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
+ loff_t new_size)
+{
+ journal_t *journal;
+ transaction_t *commit_trans;
+ int ret = 0;
+
+ if (!inode->i_transaction && !inode->i_next_transaction)
+ goto out;
+ journal = inode->i_transaction->t_journal;
+ spin_lock(&journal->j_state_lock);
+ commit_trans = journal->j_committing_transaction;
+ spin_unlock(&journal->j_state_lock);
+ if (inode->i_transaction == commit_trans) {
+ ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
+ new_size, LLONG_MAX);
+ if (ret)
+ jbd2_journal_abort(journal, ret);
+ }
+out:
+ return ret;
+}
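
The two new entry points added above slot into a filesystem's write and truncate paths. A hedged sketch of the call sites, with invented myfs_* wrappers:

static int myfs_dirty_ordered_data(handle_t *handle, struct jbd2_inode *ji)
{
	/* idempotent: returns 0 if already on this transaction's list */
	return jbd2_journal_file_inode(handle, ji);
}

static int myfs_begin_truncate(struct jbd2_inode *ji, loff_t new_size)
{
	/* flush the to-be-truncated tail if commit is writing it out */
	return jbd2_journal_begin_ordered_truncate(ji, new_size);
}
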
diff --git a/fs/libfs.c b/fs/libfs.c
index b004dfadd891..baeb71ee1cde 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -512,6 +512,20 @@ void simple_release_fs(struct vfsmount **mount, int *count)
mntput(mnt);
}
+/**
+ * simple_read_from_buffer - copy data from the buffer to user space
+ * @to: the user space buffer to read to
+ * @count: the maximum number of bytes to read
+ * @ppos: the current position in the buffer
+ * @from: the buffer to read from
+ * @available: the size of the buffer
+ *
+ * The simple_read_from_buffer() function reads up to @count bytes from the
+ * buffer @from at offset @ppos into the user space address starting at @to.
+ *
+ * On success, the number of bytes read is returned and the offset @ppos is
+ * advanced by this number, or a negative value is returned on error.
+ **/
ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
const void *from, size_t available)
{
@@ -528,6 +542,37 @@ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
return count;
}
+/**
+ * memory_read_from_buffer - copy data from the buffer
+ * @to: the kernel space buffer to read to
+ * @count: the maximum number of bytes to read
+ * @ppos: the current position in the buffer
+ * @from: the buffer to read from
+ * @available: the size of the buffer
+ *
+ * The memory_read_from_buffer() function reads up to @count bytes from the
+ * buffer @from at offset @ppos into the kernel space address starting at @to.
+ *
+ * On success, the number of bytes read is returned and the offset @ppos is
+ * advanced by this number, or a negative value is returned on error.
+ **/
+ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
+ const void *from, size_t available)
+{
+ loff_t pos = *ppos;
+
+ if (pos < 0)
+ return -EINVAL;
+ if (pos >= available)
+ return 0;
+ if (count > available - pos)
+ count = available - pos;
+ memcpy(to, from + pos, count);
+ *ppos = pos + count;
+
+ return count;
+}
+
/*
* Transaction based IO.
* The file expects a single write which triggers the transaction, and then
@@ -800,6 +845,7 @@ EXPORT_SYMBOL(simple_statfs);
EXPORT_SYMBOL(simple_sync_file);
EXPORT_SYMBOL(simple_unlink);
EXPORT_SYMBOL(simple_read_from_buffer);
+EXPORT_SYMBOL(memory_read_from_buffer);
EXPORT_SYMBOL(simple_transaction_get);
EXPORT_SYMBOL(simple_transaction_read);
EXPORT_SYMBOL(simple_transaction_release);
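
An illustrative caller of the new memory_read_from_buffer() helper, serving a kernel-resident blob through a read-style interface that tracks its own loff_t; the blob and wrapper names are made up:

static const char fw_blob[] = "example firmware image";

static ssize_t fw_read(char *dst, size_t count, loff_t *ppos)
{
	/* clamps count to the blob, copies, and advances *ppos */
	return memory_read_from_buffer(dst, count, ppos,
				       fw_blob, sizeof(fw_blob));
}
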
diff --git a/fs/locks.c b/fs/locks.c
index 11dbf08651b7..dce8c747371c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -561,9 +561,6 @@ static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
/* insert into file's list */
fl->fl_next = *pos;
*pos = fl;
-
- if (fl->fl_ops && fl->fl_ops->fl_insert)
- fl->fl_ops->fl_insert(fl);
}
/*
@@ -586,9 +583,6 @@ static void locks_delete_lock(struct file_lock **thisfl_p)
fl->fl_fasync = NULL;
}
- if (fl->fl_ops && fl->fl_ops->fl_remove)
- fl->fl_ops->fl_remove(fl);
-
if (fl->fl_nspid) {
put_pid(fl->fl_nspid);
fl->fl_nspid = NULL;
diff --git a/fs/mpage.c b/fs/mpage.c
index 235e4d3873a8..dbcc7af76a15 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
bio_put(bio);
}
-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
+struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
bio->bi_end_io = mpage_end_io_read;
if (rw == WRITE)
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
submit_bio(rw, bio);
return NULL;
}
+EXPORT_SYMBOL(mpage_bio_submit);
static struct bio *
mpage_alloc(struct block_device *bdev,
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
* written, so it can intelligently allocate a suitably-sized BIO. For now,
* just allocate full-size (16-page) BIOs.
*/
-struct mpage_data {
- struct bio *bio;
- sector_t last_block_in_bio;
- get_block_t *get_block;
- unsigned use_writepage;
-};
-static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
- void *data)
+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
+ void *data)
{
struct mpage_data *mpd = data;
struct bio *bio = mpd->bio;
@@ -651,6 +646,7 @@ out:
mpd->bio = bio;
return ret;
}
+EXPORT_SYMBOL(__mpage_writepage);
/**
* mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 05ff4f1d7026..1f7f2956412a 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -214,7 +214,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
dentry->d_op = &msdos_dentry_operations;
- lock_kernel();
+ lock_super(sb);
res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
if (res == -ENOENT)
goto add;
@@ -232,7 +232,7 @@ add:
if (dentry)
dentry->d_op = &msdos_dentry_operations;
out:
- unlock_kernel();
+ unlock_super(sb);
if (!res)
return dentry;
return ERR_PTR(res);
@@ -286,7 +286,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
unsigned char msdos_name[MSDOS_NAME];
int err, is_hid;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
msdos_name, &MSDOS_SB(sb)->options);
@@ -315,7 +315,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
d_instantiate(dentry, inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
err = fat_flush_inodes(sb, dir, inode);
return err;
@@ -324,11 +324,12 @@ out:
/***** Remove a directory */
static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
{
+ struct super_block *sb = dir->i_sb;
struct inode *inode = dentry->d_inode;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
/*
* Check whether the directory is not in use, then check
* whether it is empty.
@@ -349,9 +350,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_ctime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(inode->i_sb, dir, inode);
+ err = fat_flush_inodes(sb, dir, inode);
return err;
}
@@ -366,7 +367,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct timespec ts;
int err, is_hid, cluster;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
msdos_name, &MSDOS_SB(sb)->options);
@@ -404,14 +405,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
d_instantiate(dentry, inode);
- unlock_kernel();
+ unlock_super(sb);
fat_flush_inodes(sb, dir, inode);
return 0;
out_free:
fat_free_clusters(dir, cluster);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -419,10 +420,11 @@ out:
static int msdos_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb = inode->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
if (err)
goto out;
@@ -434,9 +436,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
inode->i_ctime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(inode->i_sb, dir, inode);
+ err = fat_flush_inodes(sb, dir, inode);
return err;
}
@@ -618,10 +620,11 @@ error_inode:
static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
+ struct super_block *sb = old_dir->i_sb;
unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
int err, is_hid;
- lock_kernel();
+ lock_super(sb);
err = msdos_format_name(old_dentry->d_name.name,
old_dentry->d_name.len, old_msdos_name,
@@ -640,9 +643,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
new_dir, new_msdos_name, new_dentry, is_hid);
out:
- unlock_kernel();
+ unlock_super(sb);
if (!err)
- err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir);
+ err = fat_flush_inodes(sb, old_dir, new_dir);
return err;
}
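
Every msdos/namei.c hunk above follows the same conversion shape: resolve the superblock up front so all exit paths can unlock it, then swap the BKL pair for the per-superblock lock. Sketched once with an invented do_op():

static int msdos_op(struct inode *dir, struct dentry *dentry)
{
	struct super_block *sb = dir->i_sb;
	int err;

	lock_super(sb);		/* was: lock_kernel() */
	err = do_op(dir, dentry);
	unlock_super(sb);	/* was: unlock_kernel() */
	return err;
}
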
diff --git a/fs/namei.c b/fs/namei.c
index 32fd9655485b..01e67dddcc3d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -581,15 +581,13 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd
int result;
/* make sure the stuff we saved doesn't go away */
- dget(save.dentry);
- mntget(save.mnt);
+ path_get(&save);
result = __link_path_walk(name, nd);
if (result == -ESTALE) {
/* nd->path had been dropped */
nd->path = save;
- dget(nd->path.dentry);
- mntget(nd->path.mnt);
+ path_get(&nd->path);
nd->flags |= LOOKUP_REVAL;
result = __link_path_walk(name, nd);
}
@@ -1216,8 +1214,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
nd->flags = flags;
nd->depth = 0;
- nd->path.mnt = mntget(mnt);
- nd->path.dentry = dget(dentry);
+ nd->path.dentry = dentry;
+ nd->path.mnt = mnt;
+ path_get(&nd->path);
retval = path_walk(name, nd);
if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
@@ -2003,18 +2002,22 @@ struct dentry *lookup_create(struct nameidata *nd, int is_dir)
if (IS_ERR(dentry))
goto fail;
+ if (dentry->d_inode)
+ goto eexist;
/*
* Special case - lookup gave negative, but... we had foo/bar/
* From the vfs_mknod() POV we just have a negative dentry -
* all is fine. Let's be bastards - you had / on the end, you've
* been asking for (non-existent) directory. -ENOENT for you.
*/
- if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
- goto enoent;
+ if (unlikely(!is_dir && nd->last.name[nd->last.len])) {
+ dput(dentry);
+ dentry = ERR_PTR(-ENOENT);
+ }
return dentry;
-enoent:
+eexist:
dput(dentry);
- dentry = ERR_PTR(-ENOENT);
+ dentry = ERR_PTR(-EEXIST);
fail:
return dentry;
}
@@ -2853,16 +2856,17 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
struct nameidata nd;
void *cookie;
+ int res;
nd.depth = 0;
cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
- if (!IS_ERR(cookie)) {
- int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
- if (dentry->d_inode->i_op->put_link)
- dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
- cookie = ERR_PTR(res);
- }
- return PTR_ERR(cookie);
+ if (IS_ERR(cookie))
+ return PTR_ERR(cookie);
+
+ res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
+ if (dentry->d_inode->i_op->put_link)
+ dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
+ return res;
}
int vfs_follow_link(struct nameidata *nd, const char *link)
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fc302c2a0e0..4f6f7635b59c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -750,7 +750,7 @@ struct proc_fs_info {
const char *str;
};
-static void show_sb_opts(struct seq_file *m, struct super_block *sb)
+static int show_sb_opts(struct seq_file *m, struct super_block *sb)
{
static const struct proc_fs_info fs_info[] = {
{ MS_SYNCHRONOUS, ",sync" },
@@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb)
if (sb->s_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
}
+
+ return security_sb_show_options(m, sb);
}
static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
@@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v)
seq_putc(m, ' ');
show_type(m, mnt->mnt_sb);
seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
- show_sb_opts(m, mnt->mnt_sb);
+ err = show_sb_opts(m, mnt->mnt_sb);
+ if (err)
+ goto out;
show_mnt_opts(m, mnt);
if (mnt->mnt_sb->s_op->show_options)
err = mnt->mnt_sb->s_op->show_options(m, mnt);
seq_puts(m, " 0 0\n");
+out:
return err;
}
@@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v)
seq_putc(m, ' ');
mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
- show_sb_opts(m, sb);
+ err = show_sb_opts(m, sb);
+ if (err)
+ goto out;
if (sb->s_op->show_options)
err = sb->s_op->show_options(m, mnt);
seq_putc(m, '\n');
+out:
return err;
}
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 2b145de45b39..6a7d901f1936 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
+#include <linux/smp_lock.h>
#include <linux/ncp_fs.h>
#include "ncplib_kernel.h"
@@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) {
return 0;
}
+static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t ret;
+ lock_kernel();
+ ret = generic_file_llseek_unlocked(file, offset, origin);
+ unlock_kernel();
+ return ret;
+}
+
const struct file_operations ncp_file_operations =
{
- .llseek = remote_llseek,
+ .llseek = ncp_remote_llseek,
.read = ncp_file_read,
.write = ncp_file_write,
.ioctl = ncp_ioctl,
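
remote_llseek() is being removed from the VFS; each network filesystem now provides its own wrapper around generic_file_llseek_unlocked() and decides for itself whether it still wants the big kernel lock across the seek, which is why the hunk adds <linux/smp_lock.h> and an ncp-local wrapper. The same pattern for a hypothetical filesystem (the "myfs" name is invented for illustration):

#include <linux/fs.h>
#include <linux/smp_lock.h>	/* lock_kernel() / unlock_kernel() */

/* Sketch: per-filesystem llseek wrapper keeping the old BKL coverage. */
static loff_t myfs_remote_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t ret;

	lock_kernel();
	ret = generic_file_llseek_unlocked(file, offset, origin);
	unlock_kernel();
	return ret;
}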
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 5606ae3d72d3..c1e7c8300629 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -182,7 +182,7 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
if (clp == NULL)
return SVC_DROP;
- dprintk("%s: %s NFSv4 callback!\n", __FUNCTION__,
+ dprintk("%s: %s NFSv4 callback!\n", __func__,
svc_print_addr(rqstp, buf, sizeof(buf)));
nfs_put_client(clp);
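
The __FUNCTION__ to __func__ conversions that dominate the rest of the NFS hunks are mechanical: __FUNCTION__ is a gcc extension, while __func__ is the predefined identifier standardized by C99, so the latter is preferred for kernel code. A trivial standalone example:

#include <stdio.h>

/* Sketch: __func__ expands to the enclosing function's name. */
static void demo(void)
{
	printf("%s: standard since C99; __FUNCTION__ is a gcc-ism\n", __func__);
}

int main(void)
{
	demo();
	return 0;
}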
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 15f7785048d3..f7e83e23cf9f 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -57,7 +57,7 @@ out_iput:
out_putclient:
nfs_put_client(clp);
out:
- dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res->status));
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(res->status));
return res->status;
}
@@ -98,6 +98,6 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy)
nfs_put_client(prev);
} while (clp != NULL);
out:
- dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(res));
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
return res;
}
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 13619d24f023..dd0ef34b5845 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -141,7 +141,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
/* We do not like overly long tags! */
if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
printk("NFSv4 CALLBACK %s: client sent tag of length %u\n",
- __FUNCTION__, hdr->taglen);
+ __func__, hdr->taglen);
return htonl(NFS4ERR_RESOURCE);
}
p = read_buf(xdr, 12);
@@ -151,7 +151,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
/* Check minor version is zero. */
if (minor_version != 0) {
printk(KERN_WARNING "%s: NFSv4 server callback with illegal minor version %u!\n",
- __FUNCTION__, minor_version);
+ __func__, minor_version);
return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
}
hdr->callback_ident = ntohl(*p++);
@@ -179,7 +179,7 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
args->addr = svc_addr(rqstp);
status = decode_bitmap(xdr, args->bitmap);
out:
- dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status));
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
}
@@ -200,7 +200,7 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
args->truncate = ntohl(*p);
status = decode_fh(xdr, &args->fh);
out:
- dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status));
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
}
@@ -349,7 +349,7 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr,
status = encode_attr_mtime(xdr, res->bitmap, &res->mtime);
*savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1)));
out:
- dprintk("%s: exit with status = %d\n", __FUNCTION__, ntohl(status));
+ dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
}
@@ -363,7 +363,7 @@ static __be32 process_op(struct svc_rqst *rqstp,
long maxlen;
__be32 res;
- dprintk("%s: start\n", __FUNCTION__);
+ dprintk("%s: start\n", __func__);
status = decode_op_hdr(xdr_in, &op_nr);
if (likely(status == 0)) {
switch (op_nr) {
@@ -392,7 +392,7 @@ static __be32 process_op(struct svc_rqst *rqstp,
status = res;
if (op->encode_res != NULL && status == 0)
status = op->encode_res(rqstp, xdr_out, resp);
- dprintk("%s: done, status = %d\n", __FUNCTION__, ntohl(status));
+ dprintk("%s: done, status = %d\n", __func__, ntohl(status));
return status;
}
@@ -401,37 +401,37 @@ static __be32 process_op(struct svc_rqst *rqstp,
*/
static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *resp)
{
- struct cb_compound_hdr_arg hdr_arg;
- struct cb_compound_hdr_res hdr_res;
+ struct cb_compound_hdr_arg hdr_arg = { 0 };
+ struct cb_compound_hdr_res hdr_res = { NULL };
struct xdr_stream xdr_in, xdr_out;
__be32 *p;
__be32 status;
- unsigned int nops = 1;
+ unsigned int nops = 0;
- dprintk("%s: start\n", __FUNCTION__);
+ dprintk("%s: start\n", __func__);
xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
- decode_compound_hdr_arg(&xdr_in, &hdr_arg);
+ status = decode_compound_hdr_arg(&xdr_in, &hdr_arg);
+ if (status == __constant_htonl(NFS4ERR_RESOURCE))
+ return rpc_garbage_args;
+
hdr_res.taglen = hdr_arg.taglen;
hdr_res.tag = hdr_arg.tag;
- hdr_res.nops = NULL;
- encode_compound_hdr_res(&xdr_out, &hdr_res);
+ if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0)
+ return rpc_system_err;
- for (;;) {
+ while (status == 0 && nops != hdr_arg.nops) {
status = process_op(rqstp, &xdr_in, argp, &xdr_out, resp);
- if (status != 0)
- break;
- if (nops == hdr_arg.nops)
- break;
nops++;
}
+
*hdr_res.status = status;
*hdr_res.nops = htonl(nops);
- dprintk("%s: done, status = %u\n", __FUNCTION__, ntohl(status));
+ dprintk("%s: done, status = %u\n", __func__, ntohl(status));
return rpc_success;
}
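
The rewritten nfs4_callback_compound() tightens the header handling in three ways visible in the hunk: the result of decode_compound_hdr_arg() is finally checked (an oversized tag now yields rpc_garbage_args instead of being ignored), an encode failure maps to rpc_system_err, and nops starts at 0 so the reply header carries the number of operations whose results were actually encoded rather than one too many. A toy userspace model of the loop's contract, with a hypothetical process_op() that fails on its third call:

#include <stdio.h>

/* Toy model: process ops in order, stop after the first failure. */
static int process_op(unsigned idx)
{
	return idx == 2 ? -1 : 0;	/* pretend op #2 fails */
}

int main(void)
{
	unsigned nops = 0, requested = 4;
	int status = 0;

	while (status == 0 && nops != requested) {
		status = process_op(nops);
		nops++;	/* a failing op is still counted: its status is encoded */
	}
	printf("encoded %u results, final status %d\n", nops, status);
	return 0;	/* prints: encoded 3 results, final status -1 */
}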
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 89ac5bb0401c..f2a092ca69b5 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -488,7 +488,7 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
clnt = rpc_create(&args);
if (IS_ERR(clnt)) {
dprintk("%s: cannot create RPC client. Error = %ld\n",
- __FUNCTION__, PTR_ERR(clnt));
+ __func__, PTR_ERR(clnt));
return PTR_ERR(clnt);
}
@@ -576,7 +576,7 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
server->client = rpc_clone_client(clp->cl_rpcclient);
if (IS_ERR(server->client)) {
- dprintk("%s: couldn't create rpc_client!\n", __FUNCTION__);
+ dprintk("%s: couldn't create rpc_client!\n", __func__);
return PTR_ERR(server->client);
}
@@ -590,7 +590,7 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
auth = rpcauth_create(pseudoflavour, server->client);
if (IS_ERR(auth)) {
- dprintk("%s: couldn't create credcache!\n", __FUNCTION__);
+ dprintk("%s: couldn't create credcache!\n", __func__);
return PTR_ERR(auth);
}
}
@@ -985,7 +985,7 @@ static int nfs4_init_client(struct nfs_client *clp,
error = nfs_idmap_new(clp);
if (error < 0) {
dprintk("%s: failed to create idmapper. Error = %d\n",
- __FUNCTION__, error);
+ __func__, error);
goto error;
}
__set_bit(NFS_CS_IDMAP, &clp->cl_res_state);
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 00a5e4405e16..cc563cfa6940 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -60,7 +60,7 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
switch (status) {
default:
printk(KERN_ERR "%s: unhandled error %d.\n",
- __FUNCTION__, status);
+ __func__, status);
case -NFS4ERR_EXPIRED:
/* kill_proc(fl->fl_pid, SIGLOST, 1); */
case -NFS4ERR_STALE_CLIENTID:
@@ -186,7 +186,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
*/
dfprintk(FILE, "%s: server %s handed out "
"a duplicate delegation!\n",
- __FUNCTION__, clp->cl_hostname);
+ __func__, clp->cl_hostname);
if (delegation->type <= nfsi->delegation->type) {
freeme = delegation;
delegation = NULL;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index f288b3ecab4a..982a2064fe4c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -180,7 +180,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
int error;
dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n",
- __FUNCTION__, (long long)desc->entry->cookie,
+ __func__, (long long)desc->entry->cookie,
page->index);
again:
@@ -204,7 +204,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
* Note: assumes we have exclusive access to this mapping either
* through inode->i_mutex or some other mechanism.
*/
- if (page->index == 0 && invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1) < 0) {
+ if (invalidate_inode_pages2_range(inode->i_mapping, page->index + 1, -1) < 0) {
/* Should never happen */
nfs_zap_mapping(inode, inode->i_mapping);
}
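
The readdir hunk fixes two mistakes at once: invalidation used to happen only when filling page 0, and it passed PAGE_CACHE_SIZE where invalidate_inode_pages2_range() expects a page index, not a byte offset. The replacement drops every cached page strictly after the one just filled, whichever page that is. A minimal sketch of the corrected call, assuming the 2.6-era pagecache API:

#include <linux/pagemap.h>

/* Sketch: invalidate page indices [filled + 1, EOF] after filling "filled". */
static void drop_following_pages(struct address_space *mapping, pgoff_t filled)
{
	if (invalidate_inode_pages2_range(mapping, filled + 1, -1) < 0) {
		/* should not happen; the real caller zaps the whole mapping */
	}
}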
@@ -256,7 +256,7 @@ int find_dirent(nfs_readdir_descriptor_t *desc)
while((status = dir_decode(desc)) == 0) {
dfprintk(DIRCACHE, "NFS: %s: examining cookie %Lu\n",
- __FUNCTION__, (unsigned long long)entry->cookie);
+ __func__, (unsigned long long)entry->cookie);
if (entry->prev_cookie == *desc->dir_cookie)
break;
if (loop_count++ > 200) {
@@ -315,7 +315,7 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
int status;
dfprintk(DIRCACHE, "NFS: %s: searching page %ld for target %Lu\n",
- __FUNCTION__, desc->page_index,
+ __func__, desc->page_index,
(long long) *desc->dir_cookie);
/* If we find the page in the page_cache, we cannot be sure
@@ -339,7 +339,7 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
if (status < 0)
dir_page_release(desc);
out:
- dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, status);
+ dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, status);
return status;
}
@@ -380,7 +380,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
}
}
- dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __FUNCTION__, res);
+ dfprintk(DIRCACHE, "NFS: %s: returns %d\n", __func__, res);
return res;
}
@@ -506,7 +506,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
desc->entry->eof = 0;
out:
dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
- __FUNCTION__, status);
+ __func__, status);
return status;
out_release:
dir_page_release(desc);
@@ -780,7 +780,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
if (is_bad_inode(inode)) {
dfprintk(LOOKUPCACHE, "%s: %s/%s has dud inode\n",
- __FUNCTION__, dentry->d_parent->d_name.name,
+ __func__, dentry->d_parent->d_name.name,
dentry->d_name.name);
goto out_bad;
}
@@ -808,7 +808,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
unlock_kernel();
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is valid\n",
- __FUNCTION__, dentry->d_parent->d_name.name,
+ __func__, dentry->d_parent->d_name.name,
dentry->d_name.name);
return 1;
out_zap_parent:
@@ -827,7 +827,7 @@ out_zap_parent:
unlock_kernel();
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
- __FUNCTION__, dentry->d_parent->d_name.name,
+ __func__, dentry->d_parent->d_name.name,
dentry->d_name.name);
return 0;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 3536b01164f9..4e98a56a1777 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -170,6 +170,7 @@ force_reval:
static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
{
+ loff_t loff;
/* origin == SEEK_END => we must revalidate the cached file length */
if (origin == SEEK_END) {
struct inode *inode = filp->f_mapping->host;
@@ -177,7 +178,10 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
if (retval < 0)
return (loff_t)retval;
}
- return remote_llseek(filp, offset, origin);
+ lock_kernel(); /* BKL needed? */
+ loff = generic_file_llseek_unlocked(filp, offset, origin);
+ unlock_kernel();
+ return loff;
}
/*
@@ -526,7 +530,7 @@ static int do_vfs_lock(struct file *file, struct file_lock *fl)
if (res < 0)
dprintk(KERN_WARNING "%s: VFS is out of sync with lock manager"
" - error %d!\n",
- __FUNCTION__, res);
+ __func__, res);
return res;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5cb3345eb694..596c5d8e86f4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -541,8 +541,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait)
}
if (ctx->cred != NULL)
put_rpccred(ctx->cred);
- dput(ctx->path.dentry);
- mntput(ctx->path.mnt);
+ path_put(&ctx->path);
kfree(ctx);
}
@@ -707,6 +706,13 @@ int nfs_attribute_timeout(struct inode *inode)
if (nfs_have_delegation(inode, FMODE_READ))
return 0;
+ /*
+ * Special case: if the attribute timeout is set to 0, then always
+ * treat the cache as having expired (unless holding
+ * a delegation).
+ */
+ if (nfsi->attrtimeo == 0)
+ return 1;
return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
}
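
nfs_attribute_timeout() needs the attrtimeo == 0 special case because time_in_range() tests a closed interval: time_in_range(jiffies, t, t + 0) is still true at the instant t, so a zero timeout would otherwise report the cache as fresh for the remainder of that jiffy. time_in_range() itself is the wraparound-safe way to compare jiffies values. Sketched:

#include <linux/jiffies.h>

/* Sketch: returns 1 when the attribute cache has expired. */
static int attr_cache_expired(unsigned long read_jiffies,
			      unsigned long attrtimeo)
{
	if (attrtimeo == 0)	/* zero timeout: never trust the cache */
		return 1;
	return !time_in_range(jiffies, read_jiffies,
			      read_jiffies + attrtimeo);
}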
@@ -995,7 +1001,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
unsigned long now = jiffies;
dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
- __FUNCTION__, inode->i_sb->s_id, inode->i_ino,
+ __func__, inode->i_sb->s_id, inode->i_ino,
atomic_read(&inode->i_count), fattr->valid);
if (nfsi->fileid != fattr->fileid)
@@ -1119,7 +1125,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
* Big trouble! The inode has become a different object.
*/
printk(KERN_DEBUG "%s: inode %ld mode changed, %07o to %07o\n",
- __FUNCTION__, inode->i_ino, inode->i_mode, fattr->mode);
+ __func__, inode->i_ino, inode->i_mode, fattr->mode);
out_err:
/*
* No need to worry about unhashing the dentry, as the
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 49c7cd0502cc..779d2eb649c5 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -130,10 +130,11 @@ static int xdr_decode_fhstatus3(struct rpc_rqst *req, __be32 *p,
struct mnt_fhstatus *res)
{
struct nfs_fh *fh = res->fh;
+ unsigned size;
if ((res->status = ntohl(*p++)) == 0) {
- int size = ntohl(*p++);
- if (size <= NFS3_FHSIZE) {
+ size = ntohl(*p++);
+ if (size <= NFS3_FHSIZE && size != 0) {
fh->size = size;
memcpy(fh->data, p, size);
} else
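
xdr_decode_fhstatus3() now rejects a zero-length filehandle as well as an oversized one; size comes off the wire from the server and feeds straight into memcpy(), so both bounds must hold before fh->data is touched. The guard in isolation:

#include <string.h>

#define NFS3_FHSIZE 64	/* NFSv3 protocol maximum, as in the kernel headers */

/* Sketch: validate a server-supplied length before copying. */
static int copy_fh(unsigned char *dst, const unsigned char *src, unsigned size)
{
	if (size == 0 || size > NFS3_FHSIZE)
		return -1;	/* reject: empty or oversized handle */
	memcpy(dst, src, size);
	return 0;
}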
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index af4d0f1e402c..2f285ef76399 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -106,7 +106,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
dprintk("--> nfs_follow_mountpoint()\n");
BUG_ON(IS_ROOT(dentry));
- dprintk("%s: enter\n", __FUNCTION__);
+ dprintk("%s: enter\n", __func__);
dput(nd->path.dentry);
nd->path.dentry = dget(dentry);
@@ -137,13 +137,12 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
goto out_follow;
goto out_err;
}
- mntput(nd->path.mnt);
- dput(nd->path.dentry);
+ path_put(&nd->path);
nd->path.mnt = mnt;
nd->path.dentry = dget(mnt->mnt_root);
schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
out:
- dprintk("%s: done, returned %d\n", __FUNCTION__, err);
+ dprintk("%s: done, returned %d\n", __func__, err);
dprintk("<-- nfs_follow_mountpoint() = %d\n", err);
return ERR_PTR(err);
@@ -230,7 +229,7 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
dprintk("--> nfs_do_submount()\n");
- dprintk("%s: submounting on %s/%s\n", __FUNCTION__,
+ dprintk("%s: submounting on %s/%s\n", __func__,
dentry->d_parent->d_name.name,
dentry->d_name.name);
if (page == NULL)
@@ -243,7 +242,7 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
free_page:
free_page((unsigned long)page);
out:
- dprintk("%s: done\n", __FUNCTION__);
+ dprintk("%s: done\n", __func__);
dprintk("<-- nfs_do_submount() = %p\n", mnt);
return mnt;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 549dbce714a4..c3523ad03ed1 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -63,15 +63,15 @@ do_proc_get_root(struct rpc_clnt *client, struct nfs_fh *fhandle,
};
int status;
- dprintk("%s: call fsinfo\n", __FUNCTION__);
+ dprintk("%s: call fsinfo\n", __func__);
nfs_fattr_init(info->fattr);
status = rpc_call_sync(client, &msg, 0);
- dprintk("%s: reply fsinfo: %d\n", __FUNCTION__, status);
+ dprintk("%s: reply fsinfo: %d\n", __func__, status);
if (!(info->fattr->valid & NFS_ATTR_FATTR)) {
msg.rpc_proc = &nfs3_procedures[NFS3PROC_GETATTR];
msg.rpc_resp = info->fattr;
status = rpc_call_sync(client, &msg, 0);
- dprintk("%s: reply getattr: %d\n", __FUNCTION__, status);
+ dprintk("%s: reply getattr: %d\n", __func__, status);
}
return status;
}
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index bd1b9d663fb9..ea790645fda6 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -206,7 +206,6 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp);
extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
extern void nfs4_put_state_owner(struct nfs4_state_owner *);
-extern void nfs4_drop_state_owner(struct nfs4_state_owner *);
extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
extern void nfs4_put_open_state(struct nfs4_state *);
extern void nfs4_close_state(struct path *, struct nfs4_state *, mode_t);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 5f9ba41ed5bf..b112857301f7 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -86,7 +86,7 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent,
if (strncmp(path, fs_path, strlen(fs_path)) != 0) {
dprintk("%s: path %s does not begin with fsroot %s\n",
- __FUNCTION__, path, fs_path);
+ __func__, path, fs_path);
return -ENOENT;
}
@@ -134,7 +134,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
if (locations == NULL || locations->nlocations <= 0)
goto out;
- dprintk("%s: referral at %s/%s\n", __FUNCTION__,
+ dprintk("%s: referral at %s/%s\n", __func__,
dentry->d_parent->d_name.name, dentry->d_name.name);
page = (char *) __get_free_page(GFP_USER);
@@ -204,7 +204,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
out:
free_page((unsigned long) page);
free_page((unsigned long) page2);
- dprintk("%s: done\n", __FUNCTION__);
+ dprintk("%s: done\n", __func__);
return mnt;
}
@@ -223,7 +223,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
int err;
/* BUG_ON(IS_ROOT(dentry)); */
- dprintk("%s: enter\n", __FUNCTION__);
+ dprintk("%s: enter\n", __func__);
page = alloc_page(GFP_KERNEL);
if (page == NULL)
@@ -238,7 +238,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
parent = dget_parent(dentry);
dprintk("%s: getting locations for %s/%s\n",
- __FUNCTION__, parent->d_name.name, dentry->d_name.name);
+ __func__, parent->d_name.name, dentry->d_name.name);
err = nfs4_proc_fs_locations(parent->d_inode, &dentry->d_name, fs_locations, page);
dput(parent);
@@ -252,6 +252,6 @@ out_free:
__free_page(page);
kfree(fs_locations);
out:
- dprintk("%s: done\n", __FUNCTION__);
+ dprintk("%s: done\n", __func__);
return mnt;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index dbc09271af02..1293e0acd82b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -73,7 +73,7 @@ int nfs4_map_errors(int err)
{
if (err < -1000) {
dprintk("%s could not handle NFSv4 error %d\n",
- __FUNCTION__, -err);
+ __func__, -err);
return -EIO;
}
return err;
@@ -306,8 +306,7 @@ static void nfs4_opendata_free(struct kref *kref)
nfs4_put_open_state(p->state);
nfs4_put_state_owner(p->owner);
dput(p->dir);
- dput(p->path.dentry);
- mntput(p->path.mnt);
+ path_put(&p->path);
kfree(p);
}
@@ -1210,8 +1209,7 @@ static void nfs4_free_closedata(void *data)
nfs4_put_open_state(calldata->state);
nfs_free_seqid(calldata->arg.seqid);
nfs4_put_state_owner(sp);
- dput(calldata->path.dentry);
- mntput(calldata->path.mnt);
+ path_put(&calldata->path);
kfree(calldata);
}
@@ -1578,7 +1576,7 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct
goto out;
/* Make sure server returned a different fsid for the referral */
if (nfs_fsid_equal(&NFS_SERVER(dir)->fsid, &locations->fattr.fsid)) {
- dprintk("%s: server did not return a different fsid for a referral at %s\n", __FUNCTION__, name->name);
+ dprintk("%s: server did not return a different fsid for a referral at %s\n", __func__, name->name);
status = -EIO;
goto out;
}
@@ -2211,7 +2209,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
};
int status;
- dprintk("%s: dentry = %s/%s, cookie = %Lu\n", __FUNCTION__,
+ dprintk("%s: dentry = %s/%s, cookie = %Lu\n", __func__,
dentry->d_parent->d_name.name,
dentry->d_name.name,
(unsigned long long)cookie);
@@ -2223,7 +2221,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
nfs_invalidate_atime(dir);
- dprintk("%s: returns %d\n", __FUNCTION__, status);
+ dprintk("%s: returns %d\n", __func__, status);
return status;
}
@@ -3342,7 +3340,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
struct nfs4_lockdata *data = calldata;
struct nfs4_state *state = data->lsp->ls_state;
- dprintk("%s: begin!\n", __FUNCTION__);
+ dprintk("%s: begin!\n", __func__);
if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
return;
/* Do we need to do an open_to_lock_owner? */
@@ -3356,14 +3354,14 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
data->arg.new_lock_owner = 0;
data->timestamp = jiffies;
rpc_call_start(task);
- dprintk("%s: done!, ret = %d\n", __FUNCTION__, data->rpc_status);
+ dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
}
static void nfs4_lock_done(struct rpc_task *task, void *calldata)
{
struct nfs4_lockdata *data = calldata;
- dprintk("%s: begin!\n", __FUNCTION__);
+ dprintk("%s: begin!\n", __func__);
data->rpc_status = task->tk_status;
if (RPC_ASSASSINATED(task))
@@ -3381,14 +3379,14 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp);
}
out:
- dprintk("%s: done, ret = %d!\n", __FUNCTION__, data->rpc_status);
+ dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
}
static void nfs4_lock_release(void *calldata)
{
struct nfs4_lockdata *data = calldata;
- dprintk("%s: begin!\n", __FUNCTION__);
+ dprintk("%s: begin!\n", __func__);
nfs_free_seqid(data->arg.open_seqid);
if (data->cancelled != 0) {
struct rpc_task *task;
@@ -3396,13 +3394,13 @@ static void nfs4_lock_release(void *calldata)
data->arg.lock_seqid);
if (!IS_ERR(task))
rpc_put_task(task);
- dprintk("%s: cancelling lock!\n", __FUNCTION__);
+ dprintk("%s: cancelling lock!\n", __func__);
} else
nfs_free_seqid(data->arg.lock_seqid);
nfs4_put_lock_state(data->lsp);
put_nfs_open_context(data->ctx);
kfree(data);
- dprintk("%s: done!\n", __FUNCTION__);
+ dprintk("%s: done!\n", __func__);
}
static const struct rpc_call_ops nfs4_lock_ops = {
@@ -3428,7 +3426,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
};
int ret;
- dprintk("%s: begin!\n", __FUNCTION__);
+ dprintk("%s: begin!\n", __func__);
data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
fl->fl_u.nfs4_fl.owner);
if (data == NULL)
@@ -3451,7 +3449,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
} else
data->cancelled = 1;
rpc_put_task(task);
- dprintk("%s: done, ret = %d!\n", __FUNCTION__, ret);
+ dprintk("%s: done, ret = %d!\n", __func__, ret);
return ret;
}
@@ -3527,7 +3525,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
/* Note: we always want to sleep here! */
request->fl_flags = fl_flags | FL_SLEEP;
if (do_vfs_lock(request->fl_file, request) < 0)
- printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__);
+ printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
out_unlock:
up_read(&clp->cl_sem);
out:
@@ -3665,12 +3663,12 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
};
int status;
- dprintk("%s: start\n", __FUNCTION__);
+ dprintk("%s: start\n", __func__);
nfs_fattr_init(&fs_locations->fattr);
fs_locations->server = server;
fs_locations->nlocations = 0;
status = rpc_call_sync(server->client, &msg, 0);
- dprintk("%s: returned status = %d\n", __FUNCTION__, status);
+ dprintk("%s: returned status = %d\n", __func__, status);
return status;
}
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 5e2e4af1a0e6..3305acbbe2ae 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -66,7 +66,7 @@ nfs4_renew_state(struct work_struct *work)
unsigned long last, now;
down_read(&clp->cl_sem);
- dprintk("%s: start\n", __FUNCTION__);
+ dprintk("%s: start\n", __func__);
/* Are there any active superblocks? */
if (list_empty(&clp->cl_superblocks))
goto out;
@@ -92,17 +92,17 @@ nfs4_renew_state(struct work_struct *work)
spin_lock(&clp->cl_lock);
} else
dprintk("%s: failed to call renewd. Reason: lease not expired \n",
- __FUNCTION__);
+ __func__);
if (timeout < 5 * HZ) /* safeguard */
timeout = 5 * HZ;
dprintk("%s: requeueing work. Lease period = %ld\n",
- __FUNCTION__, (timeout + HZ - 1) / HZ);
+ __func__, (timeout + HZ - 1) / HZ);
cancel_delayed_work(&clp->cl_renewd);
schedule_delayed_work(&clp->cl_renewd, timeout);
spin_unlock(&clp->cl_lock);
out:
up_read(&clp->cl_sem);
- dprintk("%s: done\n", __FUNCTION__);
+ dprintk("%s: done\n", __func__);
}
/* Must be called with clp->cl_sem locked for writes */
@@ -117,7 +117,7 @@ nfs4_schedule_state_renewal(struct nfs_client *clp)
if (timeout < 5 * HZ)
timeout = 5 * HZ;
dprintk("%s: requeueing work. Lease period = %ld\n",
- __FUNCTION__, (timeout + HZ - 1) / HZ);
+ __func__, (timeout + HZ - 1) / HZ);
cancel_delayed_work(&clp->cl_renewd);
schedule_delayed_work(&clp->cl_renewd, timeout);
set_bit(NFS_CS_RENEWD, &clp->cl_res_state);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 46eb624e4f16..856a8934f610 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -282,7 +282,7 @@ nfs4_alloc_state_owner(void)
return sp;
}
-void
+static void
nfs4_drop_state_owner(struct nfs4_state_owner *sp)
{
if (!RB_EMPTY_NODE(&sp->so_client_node)) {
@@ -828,7 +828,7 @@ static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_s
switch (status) {
default:
printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
- __FUNCTION__, status);
+ __func__, status);
case -NFS4ERR_EXPIRED:
case -NFS4ERR_NO_GRACE:
case -NFS4ERR_RECLAIM_BAD:
@@ -869,14 +869,14 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n
list_for_each_entry(lock, &state->lock_states, ls_locks) {
if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
printk("%s: Lock reclaim failed!\n",
- __FUNCTION__);
+ __func__);
}
continue;
}
switch (status) {
default:
printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
- __FUNCTION__, status);
+ __func__, status);
case -ENOENT:
case -NFS4ERR_RECLAIM_BAD:
case -NFS4ERR_RECLAIM_CONFLICT:
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5a2d64927b35..b916297d2334 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1831,7 +1831,7 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages,
args->pgbase, args->count);
dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
- __FUNCTION__, replen, args->pages,
+ __func__, replen, args->pages,
args->pgbase, args->count);
out:
@@ -2192,9 +2192,9 @@ out:
p = xdr_inline_decode(xdr, nbytes); \
if (unlikely(!p)) { \
dprintk("nfs: %s: prematurely hit end of receive" \
- " buffer\n", __FUNCTION__); \
+ " buffer\n", __func__); \
dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \
- __FUNCTION__, xdr->p, nbytes, xdr->end); \
+ __func__, xdr->p, nbytes, xdr->end); \
return -EIO; \
} \
} while (0)
@@ -2306,12 +2306,12 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
READ_BUF(4);
READ32(*type);
if (*type < NF4REG || *type > NF4NAMEDATTR) {
- dprintk("%s: bad type %d\n", __FUNCTION__, *type);
+ dprintk("%s: bad type %d\n", __func__, *type);
return -EIO;
}
bitmap[0] &= ~FATTR4_WORD0_TYPE;
}
- dprintk("%s: type=0%o\n", __FUNCTION__, nfs_type2fmt[*type].nfs2type);
+ dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type);
return 0;
}
@@ -2327,7 +2327,7 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
READ64(*change);
bitmap[0] &= ~FATTR4_WORD0_CHANGE;
}
- dprintk("%s: change attribute=%Lu\n", __FUNCTION__,
+ dprintk("%s: change attribute=%Lu\n", __func__,
(unsigned long long)*change);
return 0;
}
@@ -2344,7 +2344,7 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
READ64(*size);
bitmap[0] &= ~FATTR4_WORD0_SIZE;
}
- dprintk("%s: file size=%Lu\n", __FUNCTION__, (unsigned long long)*size);
+ dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
return 0;
}
@@ -2360,7 +2360,7 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui
READ32(*res);
bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT;
}
- dprintk("%s: link support=%s\n", __FUNCTION__, *res == 0 ? "false" : "true");
+ dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true");
return 0;
}
@@ -2376,7 +2376,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
READ32(*res);
bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT;
}
- dprintk("%s: symlink support=%s\n", __FUNCTION__, *res == 0 ? "false" : "true");
+ dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true");
return 0;
}
@@ -2394,7 +2394,7 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
READ64(fsid->minor);
bitmap[0] &= ~FATTR4_WORD0_FSID;
}
- dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __FUNCTION__,
+ dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
(unsigned long long)fsid->major,
(unsigned long long)fsid->minor);
return 0;
@@ -2412,7 +2412,7 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint
READ32(*res);
bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME;
}
- dprintk("%s: file size=%u\n", __FUNCTION__, (unsigned int)*res);
+ dprintk("%s: file size=%u\n", __func__, (unsigned int)*res);
return 0;
}
@@ -2428,7 +2428,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
READ32(*res);
bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT;
}
- dprintk("%s: ACLs supported=%u\n", __FUNCTION__, (unsigned int)*res);
+ dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res);
return 0;
}
@@ -2444,7 +2444,7 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
READ64(*fileid);
bitmap[0] &= ~FATTR4_WORD0_FILEID;
}
- dprintk("%s: fileid=%Lu\n", __FUNCTION__, (unsigned long long)*fileid);
+ dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
return 0;
}
@@ -2460,7 +2460,7 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
READ64(*fileid);
bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
}
- dprintk("%s: fileid=%Lu\n", __FUNCTION__, (unsigned long long)*fileid);
+ dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
return 0;
}
@@ -2477,7 +2477,7 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
READ64(*res);
bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL;
}
- dprintk("%s: files avail=%Lu\n", __FUNCTION__, (unsigned long long)*res);
+ dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res);
return status;
}
@@ -2494,7 +2494,7 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
READ64(*res);
bitmap[0] &= ~FATTR4_WORD0_FILES_FREE;
}
- dprintk("%s: files free=%Lu\n", __FUNCTION__, (unsigned long long)*res);
+ dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res);
return status;
}
@@ -2511,7 +2511,7 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
READ64(*res);
bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL;
}
- dprintk("%s: files total=%Lu\n", __FUNCTION__, (unsigned long long)*res);
+ dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res);
return status;
}
@@ -2569,7 +2569,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
status = 0;
if (unlikely(!(bitmap[0] & FATTR4_WORD0_FS_LOCATIONS)))
goto out;
- dprintk("%s: fsroot ", __FUNCTION__);
+ dprintk("%s: fsroot ", __func__);
status = decode_pathname(xdr, &res->fs_path);
if (unlikely(status != 0))
goto out;
@@ -2586,7 +2586,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
READ32(m);
loc->nservers = 0;
- dprintk("%s: servers ", __FUNCTION__);
+ dprintk("%s: servers ", __func__);
while (loc->nservers < m) {
struct nfs4_string *server = &loc->servers[loc->nservers];
status = decode_opaque_inline(xdr, &server->len, &server->data);
@@ -2599,7 +2599,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
unsigned int i;
dprintk("%s: using first %u of %u servers "
"returned for location %u\n",
- __FUNCTION__,
+ __func__,
NFS4_FS_LOCATION_MAXSERVERS,
m, res->nlocations);
for (i = loc->nservers; i < m; i++) {
@@ -2618,7 +2618,7 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
res->nlocations++;
}
out:
- dprintk("%s: fs_locations done, error = %d\n", __FUNCTION__, status);
+ dprintk("%s: fs_locations done, error = %d\n", __func__, status);
return status;
out_eio:
status = -EIO;
@@ -2638,7 +2638,7 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin
READ64(*res);
bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE;
}
- dprintk("%s: maxfilesize=%Lu\n", __FUNCTION__, (unsigned long long)*res);
+ dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res);
return status;
}
@@ -2655,7 +2655,7 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
READ32(*maxlink);
bitmap[0] &= ~FATTR4_WORD0_MAXLINK;
}
- dprintk("%s: maxlink=%u\n", __FUNCTION__, *maxlink);
+ dprintk("%s: maxlink=%u\n", __func__, *maxlink);
return status;
}
@@ -2672,7 +2672,7 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
READ32(*maxname);
bitmap[0] &= ~FATTR4_WORD0_MAXNAME;
}
- dprintk("%s: maxname=%u\n", __FUNCTION__, *maxname);
+ dprintk("%s: maxname=%u\n", __func__, *maxname);
return status;
}
@@ -2693,7 +2693,7 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_
*res = (uint32_t)maxread;
bitmap[0] &= ~FATTR4_WORD0_MAXREAD;
}
- dprintk("%s: maxread=%lu\n", __FUNCTION__, (unsigned long)*res);
+ dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res);
return status;
}
@@ -2714,7 +2714,7 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
*res = (uint32_t)maxwrite;
bitmap[0] &= ~FATTR4_WORD0_MAXWRITE;
}
- dprintk("%s: maxwrite=%lu\n", __FUNCTION__, (unsigned long)*res);
+ dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res);
return status;
}
@@ -2731,7 +2731,7 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
*mode &= ~S_IFMT;
bitmap[1] &= ~FATTR4_WORD1_MODE;
}
- dprintk("%s: file mode=0%o\n", __FUNCTION__, (unsigned int)*mode);
+ dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
return 0;
}
@@ -2747,7 +2747,7 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
READ32(*nlink);
bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
}
- dprintk("%s: nlink=%u\n", __FUNCTION__, (unsigned int)*nlink);
+ dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
return 0;
}
@@ -2766,13 +2766,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
if (len < XDR_MAX_NETOBJ) {
if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0)
dprintk("%s: nfs_map_name_to_uid failed!\n",
- __FUNCTION__);
+ __func__);
} else
dprintk("%s: name too long (%u)!\n",
- __FUNCTION__, len);
+ __func__, len);
bitmap[1] &= ~FATTR4_WORD1_OWNER;
}
- dprintk("%s: uid=%d\n", __FUNCTION__, (int)*uid);
+ dprintk("%s: uid=%d\n", __func__, (int)*uid);
return 0;
}
@@ -2791,13 +2791,13 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
if (len < XDR_MAX_NETOBJ) {
if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0)
dprintk("%s: nfs_map_group_to_gid failed!\n",
- __FUNCTION__);
+ __func__);
} else
dprintk("%s: name too long (%u)!\n",
- __FUNCTION__, len);
+ __func__, len);
bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
}
- dprintk("%s: gid=%d\n", __FUNCTION__, (int)*gid);
+ dprintk("%s: gid=%d\n", __func__, (int)*gid);
return 0;
}
@@ -2820,7 +2820,7 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
*rdev = tmp;
bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
}
- dprintk("%s: rdev=(0x%x:0x%x)\n", __FUNCTION__, major, minor);
+ dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
return 0;
}
@@ -2837,7 +2837,7 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin
READ64(*res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL;
}
- dprintk("%s: space avail=%Lu\n", __FUNCTION__, (unsigned long long)*res);
+ dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res);
return status;
}
@@ -2854,7 +2854,7 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint
READ64(*res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE;
}
- dprintk("%s: space free=%Lu\n", __FUNCTION__, (unsigned long long)*res);
+ dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res);
return status;
}
@@ -2871,7 +2871,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
READ64(*res);
bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL;
}
- dprintk("%s: space total=%Lu\n", __FUNCTION__, (unsigned long long)*res);
+ dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res);
return status;
}
@@ -2887,7 +2887,7 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
READ64(*used);
bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
}
- dprintk("%s: space used=%Lu\n", __FUNCTION__,
+ dprintk("%s: space used=%Lu\n", __func__,
(unsigned long long)*used);
return 0;
}
@@ -2918,7 +2918,7 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
status = decode_attr_time(xdr, time);
bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
}
- dprintk("%s: atime=%ld\n", __FUNCTION__, (long)time->tv_sec);
+ dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
return status;
}
@@ -2934,7 +2934,7 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
status = decode_attr_time(xdr, time);
bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
}
- dprintk("%s: ctime=%ld\n", __FUNCTION__, (long)time->tv_sec);
+ dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
return status;
}
@@ -2950,7 +2950,7 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
status = decode_attr_time(xdr, time);
bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
}
- dprintk("%s: mtime=%ld\n", __FUNCTION__, (long)time->tv_sec);
+ dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
return status;
}
@@ -2962,7 +2962,7 @@ static int verify_attr_len(struct xdr_stream *xdr, __be32 *savep, uint32_t attrl
if (unlikely(attrwords != nwords)) {
dprintk("%s: server returned incorrect attribute length: "
"%u %c %u\n",
- __FUNCTION__,
+ __func__,
attrwords << 2,
(attrwords < nwords) ? '<' : '>',
nwords << 2);
@@ -3067,7 +3067,7 @@ static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_re
goto xdr_error;
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
- dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status);
+ dprintk("%s: xdr returned %d!\n", __func__, -status);
return status;
}
@@ -3100,7 +3100,7 @@ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
- dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status);
+ dprintk("%s: xdr returned %d!\n", __func__, -status);
return status;
}
@@ -3125,7 +3125,7 @@ static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
- dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status);
+ dprintk("%s: xdr returned %d!\n", __func__, -status);
return status;
}
@@ -3193,7 +3193,7 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
xdr_error:
- dprintk("%s: xdr returned %d\n", __FUNCTION__, -status);
+ dprintk("%s: xdr returned %d\n", __func__, -status);
return status;
}
@@ -3226,7 +3226,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
- dprintk("%s: xdr returned %d!\n", __FUNCTION__, -status);
+ dprintk("%s: xdr returned %d!\n", __func__, -status);
return status;
}
@@ -3418,7 +3418,7 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
return decode_delegation(xdr, res);
xdr_error:
- dprintk("%s: Bitmap too large! Length = %u\n", __FUNCTION__, bmlen);
+ dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen);
return -EIO;
}
@@ -3575,7 +3575,7 @@ short_pkt:
* the call was successful, but incomplete. The caller can retry the
* readdir starting at the last cookie.
*/
- dprintk("%s: short packet at entry %d\n", __FUNCTION__, nr);
+ dprintk("%s: short packet at entry %d\n", __func__, nr);
entry[0] = entry[1] = 0;
if (nr)
goto out;
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 5ccf7faee19c..03599bfe81cf 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -63,17 +63,17 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
};
int status;
- dprintk("%s: call getattr\n", __FUNCTION__);
+ dprintk("%s: call getattr\n", __func__);
nfs_fattr_init(fattr);
status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
- dprintk("%s: reply getattr: %d\n", __FUNCTION__, status);
+ dprintk("%s: reply getattr: %d\n", __func__, status);
if (status)
return status;
- dprintk("%s: call statfs\n", __FUNCTION__);
+ dprintk("%s: call statfs\n", __func__);
msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS];
msg.rpc_resp = &fsinfo;
status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0);
- dprintk("%s: reply statfs: %d\n", __FUNCTION__, status);
+ dprintk("%s: reply statfs: %d\n", __func__, status);
if (status)
return status;
info->rtmax = NFS_MAXDATA;
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 16f57e0af999..40d17987d0e8 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -329,7 +329,7 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data)
{
int status;
- dprintk("NFS: %s: %5u, (status %d)\n", __FUNCTION__, task->tk_pid,
+ dprintk("NFS: %s: %5u, (status %d)\n", __func__, task->tk_pid,
task->tk_status);
status = NFS_PROTO(data->inode)->read_done(task, data);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 7226a506f3ca..614efeed5437 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -405,7 +405,7 @@ static int nfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
out_err:
- dprintk("%s: statfs error = %d\n", __FUNCTION__, -error);
+ dprintk("%s: statfs error = %d\n", __func__, -error);
unlock_kernel();
return error;
}
@@ -1216,8 +1216,6 @@ static int nfs_validate_mount_data(void *options,
{
struct nfs_mount_data *data = (struct nfs_mount_data *)options;
- memset(args, 0, sizeof(*args));
-
if (data == NULL)
goto out_no_data;
@@ -1251,13 +1249,13 @@ static int nfs_validate_mount_data(void *options,
case 5:
memset(data->context, 0, sizeof(data->context));
case 6:
- if (data->flags & NFS_MOUNT_VER3)
+ if (data->flags & NFS_MOUNT_VER3) {
+ if (data->root.size > NFS3_FHSIZE || data->root.size == 0)
+ goto out_invalid_fh;
mntfh->size = data->root.size;
- else
+ } else
mntfh->size = NFS2_FHSIZE;
- if (mntfh->size > sizeof(mntfh->data))
- goto out_invalid_fh;
memcpy(mntfh->data, data->root.data, mntfh->size);
if (mntfh->size < sizeof(mntfh->data))
@@ -1585,24 +1583,29 @@ static int nfs_get_sb(struct file_system_type *fs_type,
{
struct nfs_server *server = NULL;
struct super_block *s;
- struct nfs_fh mntfh;
- struct nfs_parsed_mount_data data;
+ struct nfs_parsed_mount_data *data;
+ struct nfs_fh *mntfh;
struct dentry *mntroot;
int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
struct nfs_sb_mountdata sb_mntdata = {
.mntflags = flags,
};
- int error;
+ int error = -ENOMEM;
- security_init_mnt_opts(&data.lsm_opts);
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+ if (data == NULL || mntfh == NULL)
+ goto out_free_fh;
+
+ security_init_mnt_opts(&data->lsm_opts);
/* Validate the mount data */
- error = nfs_validate_mount_data(raw_data, &data, &mntfh, dev_name);
+ error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
if (error < 0)
goto out;
/* Get a volume representation */
- server = nfs_create_server(&data, &mntfh);
+ server = nfs_create_server(data, mntfh);
if (IS_ERR(server)) {
error = PTR_ERR(server);
goto out;
@@ -1630,16 +1633,16 @@ static int nfs_get_sb(struct file_system_type *fs_type,
if (!s->s_root) {
/* initial superblock/root creation */
- nfs_fill_super(s, &data);
+ nfs_fill_super(s, data);
}
- mntroot = nfs_get_root(s, &mntfh);
+ mntroot = nfs_get_root(s, mntfh);
if (IS_ERR(mntroot)) {
error = PTR_ERR(mntroot);
goto error_splat_super;
}
- error = security_sb_set_mnt_opts(s, &data.lsm_opts);
+ error = security_sb_set_mnt_opts(s, &data->lsm_opts);
if (error)
goto error_splat_root;
@@ -1649,9 +1652,12 @@ static int nfs_get_sb(struct file_system_type *fs_type,
error = 0;
out:
- kfree(data.nfs_server.hostname);
- kfree(data.mount_server.hostname);
- security_free_mnt_opts(&data.lsm_opts);
+ kfree(data->nfs_server.hostname);
+ kfree(data->mount_server.hostname);
+ security_free_mnt_opts(&data->lsm_opts);
+out_free_fh:
+ kfree(mntfh);
+ kfree(data);
return error;
out_err_nosb:
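
The surrounding nfs_get_sb()/nfs4_get_sb() changes move struct nfs_parsed_mount_data and struct nfs_fh off the kernel stack: combined they are large enough to matter on 4K/8K stacks, so both are now kzalloc()ed and released on every exit path. Because kfree(NULL) is a no-op, a single out_free_fh label covers the allocation-failure and normal paths alike. The idiom, sketched with invented stand-in types and sizes:

#include <linux/errno.h>
#include <linux/slab.h>

struct big_parse { char buf[2048]; };		/* stand-ins, sizes invented */
struct big_fh { unsigned char data[128]; };

static int consume(struct big_parse *d, struct big_fh *f)
{
	return 0;	/* placeholder for the real mount work */
}

/* Sketch: heap-allocate large locals; kfree(NULL) keeps cleanup uniform. */
static int do_mount_sketch(void)
{
	struct big_parse *data;
	struct big_fh *fh;
	int error = -ENOMEM;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	fh = kzalloc(sizeof(*fh), GFP_KERNEL);
	if (data == NULL || fh == NULL)
		goto out_free;

	error = consume(data, fh);
out_free:
	kfree(fh);
	kfree(data);
	return error;
}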
@@ -1800,8 +1806,6 @@ static int nfs4_validate_mount_data(void *options,
struct nfs4_mount_data *data = (struct nfs4_mount_data *)options;
char *c;
- memset(args, 0, sizeof(*args));
-
if (data == NULL)
goto out_no_data;
@@ -1959,26 +1963,31 @@ out_no_client_address:
static int nfs4_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt)
{
- struct nfs_parsed_mount_data data;
+ struct nfs_parsed_mount_data *data;
struct super_block *s;
struct nfs_server *server;
- struct nfs_fh mntfh;
+ struct nfs_fh *mntfh;
struct dentry *mntroot;
int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
struct nfs_sb_mountdata sb_mntdata = {
.mntflags = flags,
};
- int error;
+ int error = -ENOMEM;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ mntfh = kzalloc(sizeof(*mntfh), GFP_KERNEL);
+ if (data == NULL || mntfh == NULL)
+ goto out_free_fh;
- security_init_mnt_opts(&data.lsm_opts);
+ security_init_mnt_opts(&data->lsm_opts);
/* Validate the mount data */
- error = nfs4_validate_mount_data(raw_data, &data, dev_name);
+ error = nfs4_validate_mount_data(raw_data, data, dev_name);
if (error < 0)
goto out;
/* Get a volume representation */
- server = nfs4_create_server(&data, &mntfh);
+ server = nfs4_create_server(data, mntfh);
if (IS_ERR(server)) {
error = PTR_ERR(server);
goto out;
@@ -2009,28 +2018,37 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
nfs4_fill_super(s);
}
- mntroot = nfs4_get_root(s, &mntfh);
+ mntroot = nfs4_get_root(s, mntfh);
if (IS_ERR(mntroot)) {
error = PTR_ERR(mntroot);
goto error_splat_super;
}
+ error = security_sb_set_mnt_opts(s, &data->lsm_opts);
+ if (error)
+ goto error_splat_root;
+
s->s_flags |= MS_ACTIVE;
mnt->mnt_sb = s;
mnt->mnt_root = mntroot;
error = 0;
out:
- kfree(data.client_address);
- kfree(data.nfs_server.export_path);
- kfree(data.nfs_server.hostname);
- security_free_mnt_opts(&data.lsm_opts);
+ kfree(data->client_address);
+ kfree(data->nfs_server.export_path);
+ kfree(data->nfs_server.hostname);
+ security_free_mnt_opts(&data->lsm_opts);
+out_free_fh:
+ kfree(mntfh);
+ kfree(data);
return error;
out_free:
nfs_free_server(server);
goto out;
+error_splat_root:
+ dput(mntroot);
error_splat_super:
up_write(&s->s_umount);
deactivate_super(s);
@@ -2114,6 +2132,8 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
mnt->mnt_sb = s;
mnt->mnt_root = mntroot;
+ security_sb_clone_mnt_opts(data->sb, s);
+
dprintk("<-- nfs4_xdev_get_sb() = 0\n");
return 0;
@@ -2197,6 +2217,8 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags,
mnt->mnt_sb = s;
mnt->mnt_root = mntroot;
+ security_sb_clone_mnt_opts(data->sb, s);
+
dprintk("<-- nfs4_referral_get_sb() = 0\n");
return 0;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1ade11d1ba07..f333848fd3be 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -415,7 +415,7 @@ nfs_dirty_request(struct nfs_page *req)
if (page == NULL || test_bit(PG_NEED_COMMIT, &req->wb_flags))
return 0;
- return !PageWriteback(req->wb_page);
+ return !PageWriteback(page);
}
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -739,12 +739,13 @@ int nfs_updatepage(struct file *file, struct page *page,
}
status = nfs_writepage_setup(ctx, page, offset, count);
- __set_page_dirty_nobuffers(page);
+ if (status < 0)
+ nfs_set_pageerror(page);
+ else
+ __set_page_dirty_nobuffers(page);
dprintk("NFS: nfs_updatepage returns %d (isize %Ld)\n",
status, (long long)i_size_read(inode));
- if (status < 0)
- nfs_set_pageerror(page);
return status;
}
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 0b3ffa9840c2..4d4760e687c3 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -419,9 +419,9 @@ static int do_probe_callback(void *data)
out_release_client:
rpc_shutdown_client(client);
out_err:
- put_nfs4_client(clp);
dprintk("NFSD: warning: no callback path to client %.*s\n",
(int)clp->cl_name.len, clp->cl_name.data);
+ put_nfs4_client(clp);
return status;
}
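
The nfsd hunk is a use-after-free fix rather than a cosmetic move: put_nfs4_client() may drop the final reference, after which clp->cl_name must not be read, so the dprintk() that dereferences clp has to run before the put. The general rule, shown with a hypothetical refcounted object:

#include <stdio.h>
#include <stdlib.h>

/* Sketch: drop a reference only after the last dereference. */
struct obj {
	int refcount;
	char name[16];
};

static void obj_put(struct obj *o)
{
	if (--o->refcount == 0)
		free(o);	/* o is dead memory past this point */
}

static void report_and_release(struct obj *o)
{
	printf("releasing %s\n", o->name);	/* last use of o ... */
	obj_put(o);				/* ... then the put */
}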
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
index 9101807dc81a..e2f72ca98037 100644
--- a/fs/ntfs/upcase.c
+++ b/fs/ntfs/upcase.c
@@ -77,11 +77,10 @@ ntfschar *generate_default_upcase(void)
uc[i] = cpu_to_le16(i);
for (r = 0; uc_run_table[r][0]; r++)
for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
- uc[i] = cpu_to_le16(le16_to_cpu(uc[i]) +
- uc_run_table[r][2]);
+ le16_add_cpu(&uc[i], uc_run_table[r][2]);
for (r = 0; uc_dup_table[r][0]; r++)
for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
- uc[i + 1] = cpu_to_le16(le16_to_cpu(uc[i + 1]) - 1);
+ le16_add_cpu(&uc[i + 1], -1);
for (r = 0; uc_word_table[r][0]; r++)
uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]);
return uc;
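
le16_add_cpu(&x, v) is shorthand for the usual read-convert-add-convert-write sequence, x = cpu_to_le16(le16_to_cpu(x) + v), so the ntfs hunk changes readability only, not behaviour. The equivalence:

#include <linux/types.h>
#include <asm/byteorder.h>	/* le16_to_cpu(), cpu_to_le16() */

/* Sketch: the open-coded form and the helper do the same thing. */
static void bump_le16(__le16 *var, u16 val)
{
	*var = cpu_to_le16(le16_to_cpu(*var) + val);
	/* equivalently: le16_add_cpu(var, val); */
}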
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 41f84c92094f..10bfb466e068 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2788,7 +2788,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
left_rec = &el->l_recs[index];
- if (index == le16_to_cpu(el->l_next_free_rec - 1) &&
+ if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
/* we meet with a cross extent block merge. */
ret = ocfs2_get_right_path(inode, left_path, &right_path);
@@ -2802,7 +2802,7 @@ static int ocfs2_merge_rec_right(struct inode *inode,
BUG_ON(next_free <= 0);
right_rec = &right_el->l_recs[0];
if (ocfs2_is_empty_extent(right_rec)) {
- BUG_ON(le16_to_cpu(next_free) <= 1);
+ BUG_ON(next_free <= 1);
right_rec = &right_el->l_recs[1];
}
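
Both ocfs2 hunks fix misplaced endianness conversions: le16_to_cpu(el->l_next_free_rec - 1) byte-swaps the wrong value because the subtraction is applied to the raw little-endian bytes, and wrapping the already-native next_free in le16_to_cpu() corrupts it for the same reason. On little-endian hosts the conversion is the identity, so both bugs are invisible there, which is how they escape testing. A userspace model of the failure mode:

#include <stdint.h>

/* Portable stand-in for le16_to_cpu(): identity on LE, byte swap on BE. */
static inline uint16_t le16_to_cpu16(uint16_t le)
{
	const union { uint16_t v; uint8_t b[2]; } probe = { .v = 1 };
	return probe.b[0] ? le : (uint16_t)((le << 8) | (le >> 8));
}

/* Wrong: arithmetic on the raw on-disk bytes, then the swap. */
static uint16_t last_index_wrong(uint16_t next_free_le)
{
	return le16_to_cpu16(next_free_le - 1);
}

/* Right: convert to CPU order first, then do the arithmetic. */
static uint16_t last_index_right(uint16_t next_free_le)
{
	return le16_to_cpu16(next_free_le) - 1;
}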
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index cf9401e8cd0b..cfdb08b484ed 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -21,7 +21,6 @@
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/sysctl.h>
#include <linux/configfs.h>
#include "tcp.h"
@@ -36,65 +35,6 @@
* cluster references throughout where nodes are looked up */
struct o2nm_cluster *o2nm_single_cluster = NULL;
-#define OCFS2_MAX_HB_CTL_PATH 256
-static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
-
-static ctl_table ocfs2_nm_table[] = {
- {
- .ctl_name = 1,
- .procname = "hb_ctl_path",
- .data = ocfs2_hb_ctl_path,
- .maxlen = OCFS2_MAX_HB_CTL_PATH,
- .mode = 0644,
- .proc_handler = &proc_dostring,
- .strategy = &sysctl_string,
- },
- { .ctl_name = 0 }
-};
-
-static ctl_table ocfs2_mod_table[] = {
- {
- .ctl_name = FS_OCFS2_NM,
- .procname = "nm",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_nm_table
- },
- { .ctl_name = 0}
-};
-
-static ctl_table ocfs2_kern_table[] = {
- {
- .ctl_name = FS_OCFS2,
- .procname = "ocfs2",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_mod_table
- },
- { .ctl_name = 0}
-};
-
-static ctl_table ocfs2_root_table[] = {
- {
- .ctl_name = CTL_FS,
- .procname = "fs",
- .data = NULL,
- .maxlen = 0,
- .mode = 0555,
- .child = ocfs2_kern_table
- },
- { .ctl_name = 0 }
-};
-
-static struct ctl_table_header *ocfs2_table_header = NULL;
-
-const char *o2nm_get_hb_ctl_path(void)
-{
- return ocfs2_hb_ctl_path;
-}
-EXPORT_SYMBOL_GPL(o2nm_get_hb_ctl_path);
struct o2nm_node *o2nm_get_node_by_num(u8 node_num)
{
@@ -941,9 +881,6 @@ void o2nm_undepend_this_node(void)
static void __exit exit_o2nm(void)
{
- if (ocfs2_table_header)
- unregister_sysctl_table(ocfs2_table_header);
-
/* XXX sync with hb callbacks and shut down hb? */
o2net_unregister_hb_callbacks();
configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
@@ -964,16 +901,9 @@ static int __init init_o2nm(void)
if (ret)
goto out;
- ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
- if (!ocfs2_table_header) {
- printk(KERN_ERR "nodemanager: unable to register sysctl\n");
- ret = -ENOMEM; /* or something. */
- goto out_o2net;
- }
-
ret = o2net_register_hb_callbacks();
if (ret)
- goto out_sysctl;
+ goto out_o2net;
config_group_init(&o2nm_cluster_group.cs_subsys.su_group);
mutex_init(&o2nm_cluster_group.cs_subsys.su_mutex);
@@ -990,8 +920,6 @@ static int __init init_o2nm(void)
configfs_unregister_subsystem(&o2nm_cluster_group.cs_subsys);
out_callbacks:
o2net_unregister_hb_callbacks();
-out_sysctl:
- unregister_sysctl_table(ocfs2_table_header);
out_o2net:
o2net_exit();
out:
diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h
index 7c860361b8dd..c992ea0da4ad 100644
--- a/fs/ocfs2/cluster/nodemanager.h
+++ b/fs/ocfs2/cluster/nodemanager.h
@@ -33,10 +33,6 @@
#include <linux/configfs.h>
#include <linux/rbtree.h>
-#define FS_OCFS2_NM 1
-
-const char *o2nm_get_hb_ctl_path(void);
-
struct o2nm_node {
spinlock_t nd_lock;
struct config_item nd_item;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 1e44ad14881a..a27d61581bd6 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -142,53 +142,43 @@ static void o2net_idle_timer(unsigned long data);
static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
-static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
- u32 msgkey, struct task_struct *task, u8 node)
-{
#ifdef CONFIG_DEBUG_FS
+void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+ u32 msgkey, struct task_struct *task, u8 node)
+{
INIT_LIST_HEAD(&nst->st_net_debug_item);
nst->st_task = task;
nst->st_msg_type = msgtype;
nst->st_msg_key = msgkey;
nst->st_node = node;
-#endif
}
-static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
{
-#ifdef CONFIG_DEBUG_FS
do_gettimeofday(&nst->st_sock_time);
-#endif
}
-static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
{
-#ifdef CONFIG_DEBUG_FS
do_gettimeofday(&nst->st_send_time);
-#endif
}
-static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
{
-#ifdef CONFIG_DEBUG_FS
do_gettimeofday(&nst->st_status_time);
-#endif
}
-static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
struct o2net_sock_container *sc)
{
-#ifdef CONFIG_DEBUG_FS
nst->st_sc = sc;
-#endif
}
-static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
+void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
{
-#ifdef CONFIG_DEBUG_FS
nst->st_id = msg_id;
-#endif
}
+#endif /* CONFIG_DEBUG_FS */
static inline int o2net_reconnect_delay(void)
{
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index a705d5d19036..fd6179eb26d4 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -128,23 +128,23 @@ void o2net_debug_del_nst(struct o2net_send_tracking *nst);
void o2net_debug_add_sc(struct o2net_sock_container *sc);
void o2net_debug_del_sc(struct o2net_sock_container *sc);
#else
-static int o2net_debugfs_init(void)
+static inline int o2net_debugfs_init(void)
{
return 0;
}
-static void o2net_debugfs_exit(void)
+static inline void o2net_debugfs_exit(void)
{
}
-static void o2net_debug_add_nst(struct o2net_send_tracking *nst)
+static inline void o2net_debug_add_nst(struct o2net_send_tracking *nst)
{
}
-static void o2net_debug_del_nst(struct o2net_send_tracking *nst)
+static inline void o2net_debug_del_nst(struct o2net_send_tracking *nst)
{
}
-static void o2net_debug_add_sc(struct o2net_sock_container *sc)
+static inline void o2net_debug_add_sc(struct o2net_sock_container *sc)
{
}
-static void o2net_debug_del_sc(struct o2net_sock_container *sc)
+static inline void o2net_debug_del_sc(struct o2net_sock_container *sc)
{
}
#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 8d58cfe410b1..18307ff81b77 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -224,10 +224,42 @@ struct o2net_send_tracking {
struct timeval st_send_time;
struct timeval st_status_time;
};
+
+void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+ u32 msgkey, struct task_struct *task, u8 node);
+void o2net_set_nst_sock_time(struct o2net_send_tracking *nst);
+void o2net_set_nst_send_time(struct o2net_send_tracking *nst);
+void o2net_set_nst_status_time(struct o2net_send_tracking *nst);
+void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc);
+void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id);
+
#else
struct o2net_send_tracking {
u32 dummy;
};
+
+static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
+ u32 msgkey, struct task_struct *task, u8 node)
+{
+}
+static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
+{
+}
+static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
+ struct o2net_sock_container *sc)
+{
+}
+static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst,
+ u32 msg_id)
+{
+}
#endif /* CONFIG_DEBUG_FS */
#endif /* O2CLUSTER_TCP_INTERNAL_H */
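The reshuffle above settles on the usual pattern: real prototypes when CONFIG_DEBUG_FS is set, matching static-inline no-ops otherwise, so call sites stay free of #ifdefs and the compiler discards the calls entirely in non-debug builds. A minimal sketch of the idiom, using a hypothetical trace_event() hook rather than the o2net names:

/* debug_hooks.h -- sketch of the stub-when-disabled idiom; trace_event()
 * is a hypothetical hook, not part of the ocfs2 code above. */
#ifdef CONFIG_DEBUG_FS
void trace_event(int id);		/* real version lives in a .c file */
#else
static inline void trace_event(int id)
{
	/* compiles away entirely when debugfs support is off */
}
#endif /* CONFIG_DEBUG_FS */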
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index d34a62a3a625..8c686d22f9c7 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -60,25 +60,25 @@ void dlm_destroy_debugfs_root(void);
#else
-static int dlm_debug_init(struct dlm_ctxt *dlm)
+static inline int dlm_debug_init(struct dlm_ctxt *dlm)
{
return 0;
}
-static void dlm_debug_shutdown(struct dlm_ctxt *dlm)
+static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
{
}
-static int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
+static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
{
return 0;
}
-static void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
+static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
}
-static int dlm_create_debugfs_root(void)
+static inline int dlm_create_debugfs_root(void)
{
return 0;
}
-static void dlm_destroy_debugfs_root(void)
+static inline void dlm_destroy_debugfs_root(void)
{
}
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index efc015c6128a..44f87caf3683 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -606,7 +606,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->last_used = 0;
+ spin_lock(&dlm->spinlock);
list_add_tail(&res->tracking, &dlm->tracking_list);
+ spin_unlock(&dlm->spinlock);
memset(res->lvb, 0, DLM_LVB_LEN);
memset(res->refmap, 0, sizeof(res->refmap));
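The two added lines close a race: dlm->tracking_list is walked by the debugfs code, so insertions must happen under dlm->spinlock like every other access to the list. A minimal sketch of the rule (my_ctx and its fields are illustrative, and the lock is assumed to have been set up with spin_lock_init()):

/* Any list another thread may traverse must be modified under the
 * same lock that the readers take. */
#include <linux/list.h>
#include <linux/spinlock.h>

struct my_ctx {
	spinlock_t lock;
	struct list_head tracked;
};

static void track_item(struct my_ctx *ctx, struct list_head *item)
{
	spin_lock(&ctx->lock);
	list_add_tail(item, &ctx->tracked);
	spin_unlock(&ctx->lock);
}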
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 394d25a131a5..80e20d9f2780 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1554,8 +1554,8 @@ out:
*/
int ocfs2_file_lock(struct file *file, int ex, int trylock)
{
- int ret, level = ex ? LKM_EXMODE : LKM_PRMODE;
- unsigned int lkm_flags = trylock ? LKM_NOQUEUE : 0;
+ int ret, level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ unsigned int lkm_flags = trylock ? DLM_LKF_NOQUEUE : 0;
unsigned long flags;
struct ocfs2_file_private *fp = file->private_data;
struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1582,7 +1582,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
* Get the lock at NLMODE to start - that way we
* can cancel the upconvert request if need be.
*/
- ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
+ ret = ocfs2_lock_create(osb, lockres, DLM_LOCK_NL, 0);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1597,7 +1597,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
}
lockres->l_action = OCFS2_AST_CONVERT;
- lkm_flags |= LKM_CONVERT;
+ lkm_flags |= DLM_LKF_CONVERT;
lockres->l_requested = level;
lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
@@ -1664,7 +1664,7 @@ void ocfs2_file_unlock(struct file *file)
if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED))
return;
- if (lockres->l_level == LKM_NLMODE)
+ if (lockres->l_level == DLM_LOCK_NL)
return;
mlog(0, "Unlock: \"%s\" flags: 0x%lx, level: %d, act: %d\n",
@@ -1678,11 +1678,11 @@ void ocfs2_file_unlock(struct file *file)
lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
lockres->l_blocking = DLM_LOCK_EX;
- gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
+ gen = ocfs2_prepare_downconvert(lockres, DLM_LOCK_NL);
lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
spin_unlock_irqrestore(&lockres->l_lock, flags);
- ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen);
+ ret = ocfs2_downconvert_lock(osb, lockres, DLM_LOCK_NL, 0, gen);
if (ret) {
mlog_errno(ret);
return;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index bbd1667aa7d3..fcd120f1493a 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -317,8 +317,7 @@ out:
return rc;
}
-static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn,
- int hangup_pending)
+static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn)
{
struct dlm_ctxt *dlm = conn->cc_lockspace;
struct o2dlm_private *priv = conn->cc_private;
@@ -333,43 +332,6 @@ static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn,
return 0;
}
-static void o2hb_stop(const char *group)
-{
- int ret;
- char *argv[5], *envp[3];
-
- argv[0] = (char *)o2nm_get_hb_ctl_path();
- argv[1] = "-K";
- argv[2] = "-u";
- argv[3] = (char *)group;
- argv[4] = NULL;
-
- mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
-
- /* minimal command environment taken from cpu_run_sbin_hotplug */
- envp[0] = "HOME=/";
- envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[2] = NULL;
-
- ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
- if (ret < 0)
- mlog_errno(ret);
-}
-
-/*
- * Hangup is a hack for tools compatibility. Older ocfs2-tools software
- * expects the filesystem to call "ocfs2_hb_ctl" during unmount. This
- * happens regardless of whether the DLM got started, so we can't do it
- * in ocfs2_cluster_disconnect(). We bring the o2hb_stop() function into
- * the glue and provide a "hangup" API for super.c to call.
- *
- * Other stacks will eventually provide a NULL ->hangup() pointer.
- */
-static void o2cb_cluster_hangup(const char *group, int grouplen)
-{
- o2hb_stop(group);
-}
-
static int o2cb_cluster_this_node(unsigned int *node)
{
int node_num;
@@ -388,7 +350,6 @@ static int o2cb_cluster_this_node(unsigned int *node)
static struct ocfs2_stack_operations o2cb_stack_ops = {
.connect = o2cb_cluster_connect,
.disconnect = o2cb_cluster_disconnect,
- .hangup = o2cb_cluster_hangup,
.this_node = o2cb_cluster_this_node,
.dlm_lock = o2cb_dlm_lock,
.dlm_unlock = o2cb_dlm_unlock,
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index b503772cd0ec..bd7e0f3acfc7 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/mutex.h>
+#include <linux/smp_lock.h>
#include <linux/reboot.h>
#include <asm/uaccess.h>
@@ -61,7 +62,7 @@
* negotiated by the client. The client negotiates based on the maximum
* version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
* number from the "SETV" message must match
- * user_stack.sp_proto->lp_max_version.pv_major, and the minor number
+ * ocfs2_user_plugin.sp_proto->lp_max_version.pv_major, and the minor number
* must be less than or equal to ...->lp_max_version.pv_minor.
*
* Once this information has been set, mounts will be allowed. From this
@@ -153,7 +154,7 @@ union ocfs2_control_message {
struct ocfs2_control_message_down u_down;
};
-static struct ocfs2_stack_plugin user_stack;
+static struct ocfs2_stack_plugin ocfs2_user_plugin;
static atomic_t ocfs2_control_opened;
static int ocfs2_control_this_node = -1;
@@ -399,7 +400,7 @@ static int ocfs2_control_do_setversion_msg(struct file *file,
char *ptr = NULL;
struct ocfs2_control_private *p = file->private_data;
struct ocfs2_protocol_version *max =
- &user_stack.sp_proto->lp_max_version;
+ &ocfs2_user_plugin.sp_proto->lp_max_version;
if (ocfs2_control_get_handshake_state(file) !=
OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
@@ -619,10 +620,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
return -ENOMEM;
p->op_this_node = -1;
+ lock_kernel();
mutex_lock(&ocfs2_control_lock);
file->private_data = p;
list_add(&p->op_list, &ocfs2_control_private_list);
mutex_unlock(&ocfs2_control_lock);
+ unlock_kernel();
return 0;
}
@@ -680,7 +683,7 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg);
int status = lksb->sb_status;
- BUG_ON(user_stack.sp_proto == NULL);
+ BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
/*
* For now we're punting on the issue of other non-standard errors
@@ -693,16 +696,16 @@ static void fsdlm_lock_ast_wrapper(void *astarg)
*/
if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
- user_stack.sp_proto->lp_unlock_ast(astarg, 0);
+ ocfs2_user_plugin.sp_proto->lp_unlock_ast(astarg, 0);
else
- user_stack.sp_proto->lp_lock_ast(astarg);
+ ocfs2_user_plugin.sp_proto->lp_lock_ast(astarg);
}
static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
{
- BUG_ON(user_stack.sp_proto == NULL);
+ BUG_ON(ocfs2_user_plugin.sp_proto == NULL);
- user_stack.sp_proto->lp_blocking_ast(astarg, level);
+ ocfs2_user_plugin.sp_proto->lp_blocking_ast(astarg, level);
}
static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
@@ -816,8 +819,7 @@ out:
return rc;
}
-static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn,
- int hangup_pending)
+static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn)
{
dlm_release_lockspace(conn->cc_lockspace, 2);
conn->cc_lockspace = NULL;
@@ -838,7 +840,7 @@ static int user_cluster_this_node(unsigned int *this_node)
return 0;
}
-static struct ocfs2_stack_operations user_stack_ops = {
+static struct ocfs2_stack_operations ocfs2_user_plugin_ops = {
.connect = user_cluster_connect,
.disconnect = user_cluster_disconnect,
.this_node = user_cluster_this_node,
@@ -849,20 +851,20 @@ static struct ocfs2_stack_operations user_stack_ops = {
.dump_lksb = user_dlm_dump_lksb,
};
-static struct ocfs2_stack_plugin user_stack = {
+static struct ocfs2_stack_plugin ocfs2_user_plugin = {
.sp_name = "user",
- .sp_ops = &user_stack_ops,
+ .sp_ops = &ocfs2_user_plugin_ops,
.sp_owner = THIS_MODULE,
};
-static int __init user_stack_init(void)
+static int __init ocfs2_user_plugin_init(void)
{
int rc;
rc = ocfs2_control_init();
if (!rc) {
- rc = ocfs2_stack_glue_register(&user_stack);
+ rc = ocfs2_stack_glue_register(&ocfs2_user_plugin);
if (rc)
ocfs2_control_exit();
}
@@ -870,14 +872,14 @@ static int __init user_stack_init(void)
return rc;
}
-static void __exit user_stack_exit(void)
+static void __exit ocfs2_user_plugin_exit(void)
{
- ocfs2_stack_glue_unregister(&user_stack);
+ ocfs2_stack_glue_unregister(&ocfs2_user_plugin);
ocfs2_control_exit();
}
MODULE_AUTHOR("Oracle");
MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
MODULE_LICENSE("GPL");
-module_init(user_stack_init);
-module_exit(user_stack_exit);
+module_init(ocfs2_user_plugin_init);
+module_exit(ocfs2_user_plugin_exit);
The rename from "user_stack" to "ocfs2_user_plugin" touches every reference consistently, which keeps the global symbol namespace unambiguous now that more than one stack plugin can be built into the same kernel.
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 119f60cea9cc..10e149ae5e3a 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -26,6 +26,7 @@
#include <linux/fs.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+#include <linux/sysctl.h>
#include "ocfs2_fs.h"
@@ -33,11 +34,13 @@
#define OCFS2_STACK_PLUGIN_O2CB "o2cb"
#define OCFS2_STACK_PLUGIN_USER "user"
+#define OCFS2_MAX_HB_CTL_PATH 256
static struct ocfs2_locking_protocol *lproto;
static DEFINE_SPINLOCK(ocfs2_stack_lock);
static LIST_HEAD(ocfs2_stack_list);
static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
+static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
/*
* The stack currently in use. If not null, active_stack->sp_count > 0,
@@ -349,7 +352,7 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
BUG_ON(conn == NULL);
- ret = active_stack->sp_ops->disconnect(conn, hangup_pending);
+ ret = active_stack->sp_ops->disconnect(conn);
/* XXX Should we free it anyway? */
if (!ret) {
@@ -362,13 +365,48 @@ int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
}
EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect);
+/*
+ * Leave the group for this filesystem. This is executed by a userspace
+ * program (stored in ocfs2_hb_ctl_path).
+ */
+static void ocfs2_leave_group(const char *group)
+{
+ int ret;
+ char *argv[5], *envp[3];
+
+ argv[0] = ocfs2_hb_ctl_path;
+ argv[1] = "-K";
+ argv[2] = "-u";
+ argv[3] = (char *)group;
+ argv[4] = NULL;
+
+ /* minimal command environment taken from cpu_run_sbin_hotplug */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ if (ret < 0) {
+ printk(KERN_ERR
+ "ocfs2: Error %d running user helper "
+ "\"%s %s %s %s\"\n",
+ ret, argv[0], argv[1], argv[2], argv[3]);
+ }
+}
+
+/*
+ * Hangup is a required post-umount step. ocfs2-tools software expects the
+ * filesystem to call "ocfs2_hb_ctl" during unmount. This happens
+ * regardless of whether the DLM got started, so we can't do it
+ * in ocfs2_cluster_disconnect(). The ocfs2_leave_group() function does
+ * the actual work.
+ */
void ocfs2_cluster_hangup(const char *group, int grouplen)
{
BUG_ON(group == NULL);
BUG_ON(group[grouplen] != '\0');
- if (active_stack->sp_ops->hangup)
- active_stack->sp_ops->hangup(group, grouplen);
+ ocfs2_leave_group(group);
/* cluster_disconnect() was called with hangup_pending==1 */
ocfs2_stack_driver_put();
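ocfs2_leave_group() above is the stock call_usermodehelper() shape: a NULL-terminated argv, a minimal environment, and UMH_WAIT_PROC to collect the helper's exit status. A stripped-down sketch of the same call (the helper path and its argument are made up for illustration):

/* Minimal call_usermodehelper() sketch, shaped like ocfs2_leave_group();
 * "/sbin/my_helper" is a hypothetical program, not a real kernel helper. */
#include <linux/kmod.h>

static int run_helper(const char *arg)
{
	char *argv[] = { "/sbin/my_helper", (char *)arg, NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
			 NULL };

	/* UMH_WAIT_PROC: wait for the helper to exit, return its status */
	return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
}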
@@ -548,10 +586,83 @@ error:
return ret;
}
+/*
+ * Sysctl bits
+ *
+ * The sysctl lives at /proc/sys/fs/ocfs2/nm/hb_ctl_path. The 'nm' doesn't
+ * make as much sense in a multiple cluster stack world, but it's safer
+ * and easier to preserve the name.
+ */
+
+#define FS_OCFS2_NM 1
+
+static ctl_table ocfs2_nm_table[] = {
+ {
+ .ctl_name = 1,
+ .procname = "hb_ctl_path",
+ .data = ocfs2_hb_ctl_path,
+ .maxlen = OCFS2_MAX_HB_CTL_PATH,
+ .mode = 0644,
+ .proc_handler = &proc_dostring,
+ .strategy = &sysctl_string,
+ },
+ { .ctl_name = 0 }
+};
+
+static ctl_table ocfs2_mod_table[] = {
+ {
+ .ctl_name = FS_OCFS2_NM,
+ .procname = "nm",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = ocfs2_nm_table
+ },
+ { .ctl_name = 0}
+};
+
+static ctl_table ocfs2_kern_table[] = {
+ {
+ .ctl_name = FS_OCFS2,
+ .procname = "ocfs2",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = ocfs2_mod_table
+ },
+ { .ctl_name = 0}
+};
+
+static ctl_table ocfs2_root_table[] = {
+ {
+ .ctl_name = CTL_FS,
+ .procname = "fs",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = ocfs2_kern_table
+ },
+ { .ctl_name = 0 }
+};
+
+static struct ctl_table_header *ocfs2_table_header = NULL;
+
+
+/*
+ * Initialization
+ */
+
static int __init ocfs2_stack_glue_init(void)
{
strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
+ ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
+ if (!ocfs2_table_header) {
+ printk(KERN_ERR
+ "ocfs2 stack glue: unable to register sysctl\n");
+ return -ENOMEM; /* or something. */
+ }
+
return ocfs2_sysfs_init();
}
@@ -559,6 +670,8 @@ static void __exit ocfs2_stack_glue_exit(void)
{
lproto = NULL;
ocfs2_sysfs_exit();
+ if (ocfs2_table_header)
+ unregister_sysctl_table(ocfs2_table_header);
}
MODULE_AUTHOR("Oracle");
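With the four nested ctl_table levels above registered, the helper path becomes tunable at /proc/sys/fs/ocfs2/nm/hb_ctl_path. A short userspace sketch that reads the current value, assuming procfs is mounted and the stack glue module is loaded:

/* Userspace sketch: read the knob the tables above register. */
#include <stdio.h>

int main(void)
{
	char path[256];
	FILE *f = fopen("/proc/sys/fs/ocfs2/nm/hb_ctl_path", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(path, sizeof(path), f))
		printf("heartbeat control helper: %s", path);
	fclose(f);
	return 0;
}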
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 005e4f170e0f..db56281dd1be 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -134,22 +134,10 @@ struct ocfs2_stack_operations {
* be freed. Thus, a stack must not return from ->disconnect()
* until it will no longer reference the conn pointer.
*
- * If hangup_pending is zero, ocfs2_cluster_disconnect() will also
- * be dropping the reference on the module.
+ * Once this call returns, the stack glue will be dropping this
+ * connection's reference on the module.
*/
- int (*disconnect)(struct ocfs2_cluster_connection *conn,
- int hangup_pending);
-
- /*
- * ocfs2_cluster_hangup() exists for compatibility with older
- * ocfs2 tools. Only the classic stack really needs it. As such
- * ->hangup() is not required of all stacks. See the comment by
- * ocfs2_cluster_hangup() for more details.
- *
- * Note that ocfs2_cluster_hangup() can only be called if
- * hangup_pending was passed to ocfs2_cluster_disconnect().
- */
- void (*hangup)(const char *group, int grouplen);
+ int (*disconnect)(struct ocfs2_cluster_connection *conn);
/*
* ->this_node() returns the cluster's unique identifier for the
@@ -258,4 +246,5 @@ void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
/* Used by stack plugins */
int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
+
#endif /* STACKGLUE_H */
diff --git a/fs/open.c b/fs/open.c
index a1450086e92f..a99ad09c3197 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -16,6 +16,7 @@
#include <linux/namei.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
+#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/vfs.h>
@@ -425,7 +426,7 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
{
struct nameidata nd;
int old_fsuid, old_fsgid;
- kernel_cap_t old_cap;
+ kernel_cap_t uninitialized_var(old_cap); /* !SECURE_NO_SETUID_FIXUP */
int res;
if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
@@ -433,23 +434,27 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
old_fsuid = current->fsuid;
old_fsgid = current->fsgid;
- old_cap = current->cap_effective;
current->fsuid = current->uid;
current->fsgid = current->gid;
- /*
- * Clear the capabilities if we switch to a non-root user
- *
- * FIXME: There is a race here against sys_capset. The
- * capabilities can change yet we will restore the old
- * value below. We should hold task_capabilities_lock,
- * but we cannot because user_path_walk can sleep.
- */
- if (current->uid)
- cap_clear(current->cap_effective);
- else
- current->cap_effective = current->cap_permitted;
+ if (!issecure(SECURE_NO_SETUID_FIXUP)) {
+ /*
+ * Clear the capabilities if we switch to a non-root user
+ */
+#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
+ /*
+ * FIXME: There is a race here against sys_capset. The
+ * capabilities can change yet we will restore the old
+ * value below. We should hold task_capabilities_lock,
+ * but we cannot because user_path_walk can sleep.
+ */
+#endif /* ndef CONFIG_SECURITY_FILE_CAPABILITIES */
+ if (current->uid)
+ old_cap = cap_set_effective(__cap_empty_set);
+ else
+ old_cap = cap_set_effective(current->cap_permitted);
+ }
res = __user_walk_fd(dfd, filename, LOOKUP_FOLLOW|LOOKUP_ACCESS, &nd);
if (res)
@@ -478,7 +483,9 @@ out_path_release:
out:
current->fsuid = old_fsuid;
current->fsgid = old_fsgid;
- current->cap_effective = old_cap;
+
+ if (!issecure(SECURE_NO_SETUID_FIXUP))
+ cap_set_effective(old_cap);
return res;
}
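cap_set_effective() returns the previous capability set, which lets sys_faccessat() restore it on every exit, and the issecure(SECURE_NO_SETUID_FIXUP) test skips the fixup entirely when securebits disable it. A self-contained sketch of the save-and-restore shape (set_mode() and friends are made-up names, not kernel API):

/* The mutator returns the previous value so every exit path can undo
 * the temporary change. */
static int current_mode;

static int set_mode(int mode)
{
	int old = current_mode;

	current_mode = mode;
	return old;
}

static int with_temporary_mode(int tmp, int (*work)(void))
{
	int old = set_mode(tmp);
	int ret = work();

	set_mode(old);		/* restore even if work() failed */
	return ret;
}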
diff --git a/fs/pipe.c b/fs/pipe.c
index ec228bc9f882..700f4e0d9572 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1003,8 +1003,7 @@ struct file *create_write_pipe(void)
void free_write_pipe(struct file *f)
{
free_pipe_info(f->f_dentry->d_inode);
- dput(f->f_path.dentry);
- mntput(f->f_path.mnt);
+ path_put(&f->f_path);
put_filp(f);
}
@@ -1015,8 +1014,8 @@ struct file *create_read_pipe(struct file *wrf)
return ERR_PTR(-ENFILE);
/* Grab pipe from the writer */
- f->f_path.mnt = mntget(wrf->f_path.mnt);
- f->f_path.dentry = dget(wrf->f_path.dentry);
+ f->f_path = wrf->f_path;
+ path_get(&wrf->f_path);
f->f_mapping = wrf->f_path.dentry->d_inode->i_mapping;
f->f_pos = 0;
@@ -1068,8 +1067,7 @@ int do_pipe(int *fd)
err_fdr:
put_unused_fd(fdr);
err_read_pipe:
- dput(fr->f_dentry);
- mntput(fr->f_vfsmnt);
+ path_put(&fr->f_path);
put_filp(fr);
err_write_pipe:
free_write_pipe(fw);
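The pipe.c hunks adopt the struct path helpers: path_get() pins the dentry and vfsmount together, and path_put() drops both, replacing the hand-rolled dget()/mntget() and dput()/mntput() pairs. A minimal sketch of the pairing (clone_path() and drop_path() are illustrative names):

#include <linux/path.h>
#include <linux/namei.h>

static void clone_path(struct path *dst, const struct path *src)
{
	*dst = *src;		/* copy dentry + vfsmount pointers */
	path_get(dst);		/* one call pins both references */
}

static void drop_path(struct path *p)
{
	path_put(p);		/* drops dentry and vfsmount together */
}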
diff --git a/fs/proc/array.c b/fs/proc/array.c
index dca997a93bff..797d775e0354 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -288,7 +288,7 @@ static void render_cap_t(struct seq_file *m, const char *header,
seq_printf(m, "%s", header);
CAP_FOR_EACH_U32(__capi) {
seq_printf(m, "%08x",
- a->cap[(_LINUX_CAPABILITY_U32S-1) - __capi]);
+ a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]);
}
seq_printf(m, "\n");
}
@@ -298,6 +298,7 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
render_cap_t(m, "CapInh:\t", &p->cap_inheritable);
render_cap_t(m, "CapPrm:\t", &p->cap_permitted);
render_cap_t(m, "CapEff:\t", &p->cap_effective);
+ render_cap_t(m, "CapBnd:\t", &p->cap_bset);
}
static inline void task_context_switch_counts(struct seq_file *m,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 808cbdc193d3..58c3e6a8e15e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -127,6 +127,25 @@ struct pid_entry {
NULL, &proc_single_file_operations, \
{ .proc_show = &proc_##OTYPE } )
+/*
+ * Count the number of hardlinks for the pid_entry table, excluding the .
+ * and .. links.
+ */
+static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
+ unsigned int n)
+{
+ unsigned int i;
+ unsigned int count;
+
+ count = 0;
+ for (i = 0; i < n; ++i) {
+ if (S_ISDIR(entries[i].mode))
+ ++count;
+ }
+
+ return count;
+}
+
int maps_protect;
EXPORT_SYMBOL(maps_protect);
@@ -214,7 +233,7 @@ static int check_mem_permission(struct task_struct *task)
*/
if (task->parent == current && (task->ptrace & PT_PTRACED) &&
task_is_stopped_or_traced(task) &&
- ptrace_may_attach(task))
+ ptrace_may_access(task, PTRACE_MODE_ATTACH))
return 0;
/*
@@ -232,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
task_lock(task);
if (task->mm != mm)
goto out;
- if (task->mm != current->mm && __ptrace_may_attach(task) < 0)
+ if (task->mm != current->mm &&
+ __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
goto out;
task_unlock(task);
return mm;
@@ -499,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode)
*/
task = get_proc_task(inode);
if (task) {
- allowed = ptrace_may_attach(task);
+ allowed = ptrace_may_access(task, PTRACE_MODE_READ);
put_task_struct(task);
}
return allowed;
@@ -885,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
if (!task)
goto out_no_task;
- if (!ptrace_may_attach(task))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out;
ret = -ENOMEM;
@@ -2441,7 +2461,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, loginuid),
- REG("sessionid", S_IRUSR, sessionid),
+ REG("sessionid", S_IRUGO, sessionid),
#endif
#ifdef CONFIG_FAULT_INJECTION
REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
@@ -2585,10 +2605,9 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
inode->i_op = &proc_tgid_base_inode_operations;
inode->i_fop = &proc_tgid_base_operations;
inode->i_flags|=S_IMMUTABLE;
- inode->i_nlink = 5;
-#ifdef CONFIG_SECURITY
- inode->i_nlink += 1;
-#endif
+
+ inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
+ ARRAY_SIZE(tgid_base_stuff));
dentry->d_op = &pid_dentry_operations;
@@ -2816,10 +2835,9 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
inode->i_op = &proc_tid_base_inode_operations;
inode->i_fop = &proc_tid_base_operations;
inode->i_flags|=S_IMMUTABLE;
- inode->i_nlink = 4;
-#ifdef CONFIG_SECURITY
- inode->i_nlink += 1;
-#endif
+
+ inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
+ ARRAY_SIZE(tid_base_stuff));
dentry->d_op = &pid_dentry_operations;
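The replacement computes i_nlink instead of hard-coding 4 or 5: a directory carries two base links ("." and its name in the parent) plus one per child directory, since each child's ".." points back at it. A runnable userspace sketch of the same arithmetic over a made-up entry table:

#include <stdio.h>
#include <sys/stat.h>

struct entry { const char *name; mode_t mode; };

static unsigned count_dirs(const struct entry *e, unsigned n)
{
	unsigned i, count = 0;

	for (i = 0; i < n; i++)
		if (S_ISDIR(e[i].mode))
			count++;
	return count;
}

int main(void)
{
	static const struct entry tbl[] = {	/* illustrative table */
		{ "task", S_IFDIR | 0555 },
		{ "fd",   S_IFDIR | 0500 },
		{ "stat", S_IFREG | 0444 },
	};

	printf("i_nlink = %u\n",
	       2 + count_dirs(tbl, sizeof(tbl) / sizeof(tbl[0])));
	return 0;
}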
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 6f4e8dc97da1..b08d10017911 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -425,7 +425,8 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino,
}
}
unlock_new_inode(inode);
- }
+ } else
+ module_put(de->owner);
return inode;
out_ino:
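The one-line else branch balances a module reference: proc_get_inode() pins de->owner before iget_locked(), and when an already-initialized inode comes back that pin is never handed off, so it must be dropped. A tiny sketch of the rule, with grab()/release() standing in for try_module_get()/module_put():

/* A reference taken on entry must be dropped on any path that does not
 * transfer ownership of it. */
static int refcount;

static void grab(void)    { refcount++; }
static void release(void) { refcount--; }

static int init_or_reuse(int is_new)
{
	grab();			/* pin the owner for a new object */
	if (!is_new)
		release();	/* a reused object already holds a pin */
	return 0;
}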
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 74a323d2b850..c652d469dc08 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off,
return proc_calc_metrics(page, start, off, count, eof, len);
}
+int __attribute__((weak)) arch_report_meminfo(char *page)
+{
+ return 0;
+}
+
static int meminfo_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
@@ -139,7 +144,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
#define K(x) ((x) << (PAGE_SHIFT - 10))
si_meminfo(&i);
si_swapinfo(&i);
- committed = atomic_read(&vm_committed_space);
+ committed = atomic_long_read(&vm_committed_space);
allowed = ((totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100) + total_swap_pages;
@@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
len += hugetlb_report_meminfo(page + len);
+ len += arch_report_meminfo(page + len);
+
return proc_calc_metrics(page, start, off, count, eof, len);
#undef K
}
@@ -472,6 +479,13 @@ static const struct file_operations proc_vmalloc_operations = {
};
#endif
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+#ifndef arch_irq_stat
+#define arch_irq_stat() 0
+#endif
+
static int show_stat(struct seq_file *p, void *v)
{
int i;
@@ -509,7 +523,9 @@ static int show_stat(struct seq_file *p, void *v)
sum += temp;
per_irq_sum[j] += temp;
}
+ sum += arch_irq_stat_cpu(i);
}
+ sum += arch_irq_stat();
seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
(unsigned long long)cputime64_to_clock_t(user),
@@ -716,7 +732,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
pfn = src / KPMSIZE;
count = min_t(size_t, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
- return -EIO;
+ return -EINVAL;
while (count > 0) {
ppage = NULL;
@@ -726,7 +742,7 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
if (!ppage)
pcount = 0;
else
- pcount = atomic_read(&ppage->_count);
+ pcount = page_mapcount(ppage);
if (put_user(pcount, out++)) {
ret = -EFAULT;
@@ -782,7 +798,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
pfn = src / KPMSIZE;
count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
if (src & KPMMASK || count & KPMMASK)
- return -EIO;
+ return -EINVAL;
while (count > 0) {
ppage = NULL;
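proc_misc.c supplies overridable defaults two ways here: a weak arch_report_meminfo() that an architecture's strong definition replaces at link time, and #ifndef fallbacks for the irq-stat macros. A runnable userspace demo of the weak-symbol half (arch_report() is an illustrative name; compile this file alone to see the default fire):

#include <stdio.h>

int __attribute__((weak)) arch_report(char *buf)
{
	return sprintf(buf, "arch: nothing to report\n");
}

int main(void)
{
	char buf[64];

	arch_report(buf);	/* weak default unless overridden */
	fputs(buf, stdout);
	return 0;
}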
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 88717c0f941b..164bd9f9ede3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v)
dev_t dev = 0;
int len;
- if (maps_protect && !ptrace_may_attach(task))
+ if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
return -EACCES;
if (file) {
@@ -315,9 +315,9 @@ struct mem_size_stats {
};
static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- void *private)
+ struct mm_walk *walk)
{
- struct mem_size_stats *mss = private;
+ struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = mss->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
@@ -365,19 +365,21 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return 0;
}
-static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range };
-
static int show_smap(struct seq_file *m, void *v)
{
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
int ret;
+ struct mm_walk smaps_walk = {
+ .pmd_entry = smaps_pte_range,
+ .mm = vma->vm_mm,
+ .private = &mss,
+ };
memset(&mss, 0, sizeof mss);
mss.vma = vma;
if (vma->vm_mm && !is_vm_hugetlb_page(vma))
- walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end,
- &smaps_walk, &mss);
+ walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
ret = show_map(m, v);
if (ret)
@@ -426,9 +428,9 @@ const struct file_operations proc_smaps_operations = {
};
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, void *private)
+ unsigned long end, struct mm_walk *walk)
{
- struct vm_area_struct *vma = private;
+ struct vm_area_struct *vma = walk->private;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
@@ -452,8 +454,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
-static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range };
-
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -476,11 +476,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
return -ESRCH;
mm = get_task_mm(task);
if (mm) {
+ struct mm_walk clear_refs_walk = {
+ .pmd_entry = clear_refs_pte_range,
+ .mm = mm,
+ };
down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next)
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ clear_refs_walk.private = vma;
if (!is_vm_hugetlb_page(vma))
- walk_page_range(mm, vma->vm_start, vma->vm_end,
- &clear_refs_walk, vma);
+ walk_page_range(vma->vm_start, vma->vm_end,
+ &clear_refs_walk);
+ }
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
mmput(mm);
@@ -496,7 +502,7 @@ const struct file_operations proc_clear_refs_operations = {
};
struct pagemapread {
- char __user *out, *end;
+ u64 __user *out, *end;
};
#define PM_ENTRY_BYTES sizeof(u64)
@@ -519,28 +525,18 @@ struct pagemapread {
static int add_to_pagemap(unsigned long addr, u64 pfn,
struct pagemapread *pm)
{
- /*
- * Make sure there's room in the buffer for an
- * entire entry. Otherwise, only copy part of
- * the pfn.
- */
- if (pm->out + PM_ENTRY_BYTES >= pm->end) {
- if (copy_to_user(pm->out, &pfn, pm->end - pm->out))
- return -EFAULT;
- pm->out = pm->end;
- return PM_END_OF_BUFFER;
- }
-
if (put_user(pfn, pm->out))
return -EFAULT;
- pm->out += PM_ENTRY_BYTES;
+ pm->out++;
+ if (pm->out >= pm->end)
+ return PM_END_OF_BUFFER;
return 0;
}
static int pagemap_pte_hole(unsigned long start, unsigned long end,
- void *private)
+ struct mm_walk *walk)
{
- struct pagemapread *pm = private;
+ struct pagemapread *pm = walk->private;
unsigned long addr;
int err = 0;
for (addr = start; addr < end; addr += PAGE_SIZE) {
@@ -557,24 +553,45 @@ static u64 swap_pte_to_pagemap_entry(pte_t pte)
return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
}
+static unsigned long pte_to_pagemap_entry(pte_t pte)
+{
+ unsigned long pme = 0;
+ if (is_swap_pte(pte))
+ pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte))
+ | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
+ else if (pte_present(pte))
+ pme = PM_PFRAME(pte_pfn(pte))
+ | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
+ return pme;
+}
+
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
- void *private)
+ struct mm_walk *walk)
{
- struct pagemapread *pm = private;
+ struct vm_area_struct *vma;
+ struct pagemapread *pm = walk->private;
pte_t *pte;
int err = 0;
+ /* find the first VMA at or above 'addr' */
+ vma = find_vma(walk->mm, addr);
for (; addr != end; addr += PAGE_SIZE) {
u64 pfn = PM_NOT_PRESENT;
- pte = pte_offset_map(pmd, addr);
- if (is_swap_pte(*pte))
- pfn = PM_PFRAME(swap_pte_to_pagemap_entry(*pte))
- | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
- else if (pte_present(*pte))
- pfn = PM_PFRAME(pte_pfn(*pte))
- | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
- /* unmap so we're not in atomic when we copy to userspace */
- pte_unmap(pte);
+
+ /* check to see if we've left 'vma' behind
+ * and need a new, higher one */
+ if (vma && (addr >= vma->vm_end))
+ vma = find_vma(walk->mm, addr);
+
+ /* check that 'vma' actually covers this address,
+ * and that it isn't a huge page vma */
+ if (vma && (vma->vm_start <= addr) &&
+ !is_vm_hugetlb_page(vma)) {
+ pte = pte_offset_map(pmd, addr);
+ pfn = pte_to_pagemap_entry(*pte);
+ /* unmap before userspace copy */
+ pte_unmap(pte);
+ }
err = add_to_pagemap(addr, pfn, pm);
if (err)
return err;
@@ -585,11 +602,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return err;
}
-static struct mm_walk pagemap_walk = {
- .pmd_entry = pagemap_pte_range,
- .pte_hole = pagemap_pte_hole
-};
-
/*
* /proc/pid/pagemap - an array mapping virtual pages to pfns
*
@@ -624,17 +636,22 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
struct pagemapread pm;
int pagecount;
int ret = -ESRCH;
+ struct mm_walk pagemap_walk;
+ unsigned long src;
+ unsigned long svpfn;
+ unsigned long start_vaddr;
+ unsigned long end_vaddr;
if (!task)
goto out;
ret = -EACCES;
- if (!ptrace_may_attach(task))
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
goto out_task;
ret = -EINVAL;
/* file position must be aligned */
- if (*ppos % PM_ENTRY_BYTES)
+ if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
goto out_task;
ret = 0;
@@ -642,11 +659,15 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
if (!mm)
goto out_task;
- ret = -ENOMEM;
+
uaddr = (unsigned long)buf & PAGE_MASK;
uend = (unsigned long)(buf + count);
pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
- pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
+ ret = 0;
+ if (pagecount == 0)
+ goto out_mm;
+ pages = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL);
+ ret = -ENOMEM;
if (!pages)
goto out_mm;
@@ -664,36 +685,36 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
goto out_pages;
}
- pm.out = buf;
- pm.end = buf + count;
+ pm.out = (u64 *)buf;
+ pm.end = (u64 *)(buf + count);
- if (!ptrace_may_attach(task)) {
- ret = -EIO;
- } else {
- unsigned long src = *ppos;
- unsigned long svpfn = src / PM_ENTRY_BYTES;
- unsigned long start_vaddr = svpfn << PAGE_SHIFT;
- unsigned long end_vaddr = TASK_SIZE_OF(task);
-
- /* watch out for wraparound */
- if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
- start_vaddr = end_vaddr;
-
- /*
- * The odds are that this will stop walking way
- * before end_vaddr, because the length of the
- * user buffer is tracked in "pm", and the walk
- * will stop when we hit the end of the buffer.
- */
- ret = walk_page_range(mm, start_vaddr, end_vaddr,
- &pagemap_walk, &pm);
- if (ret == PM_END_OF_BUFFER)
- ret = 0;
- /* don't need mmap_sem for these, but this looks cleaner */
- *ppos += pm.out - buf;
- if (!ret)
- ret = pm.out - buf;
- }
+ pagemap_walk.pmd_entry = pagemap_pte_range;
+ pagemap_walk.pte_hole = pagemap_pte_hole;
+ pagemap_walk.mm = mm;
+ pagemap_walk.private = &pm;
+
+ src = *ppos;
+ svpfn = src / PM_ENTRY_BYTES;
+ start_vaddr = svpfn << PAGE_SHIFT;
+ end_vaddr = TASK_SIZE_OF(task);
+
+ /* watch out for wraparound */
+ if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+ start_vaddr = end_vaddr;
+
+ /*
+ * The odds are that this will stop walking way
+ * before end_vaddr, because the length of the
+ * user buffer is tracked in "pm", and the walk
+ * will stop when we hit the end of the buffer.
+ */
+ ret = walk_page_range(start_vaddr, end_vaddr, &pagemap_walk);
+ if (ret == PM_END_OF_BUFFER)
+ ret = 0;
+ /* don't need mmap_sem for these, but this looks cleaner */
+ *ppos += (char *)pm.out - buf;
+ if (!ret)
+ ret = (char *)pm.out - buf;
out_pages:
for (; pagecount; pagecount--) {
@@ -726,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v)
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
- if (maps_protect && !ptrace_may_attach(task))
+ if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
return -EACCES;
return show_numa_map(m, v);
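With the new API every walk parameter travels inside struct mm_walk -- .mm and .private included -- so the descriptors move off file scope and onto the stack, which also makes concurrent walks safe. A minimal sketch of a walker in the new convention (count_pte_range() and my_stats are illustrative):

#include <linux/mm.h>

struct my_stats { unsigned long ptes; };

static int count_pte_range(pmd_t *pmd, unsigned long addr,
			   unsigned long end, struct mm_walk *walk)
{
	struct my_stats *st = walk->private;

	st->ptes += (end - addr) >> PAGE_SHIFT;
	return 0;
}

static void count_vma(struct vm_area_struct *vma)
{
	struct my_stats st = { 0 };
	struct mm_walk w = {
		.pmd_entry = count_pte_range,
		.mm = vma->vm_mm,
		.private = &st,
	};

	walk_page_range(vma->vm_start, vma->vm_end, &w);
}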
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4b4f9cc2f186..5d84e7121df8 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml)
struct proc_maps_private *priv = m->private;
struct task_struct *task = priv->task;
- if (maps_protect && !ptrace_may_attach(task))
+ if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
return -EACCES;
return nommu_vma_show(m, vml->vma);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 9590b9024300..78f613cb9c76 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = {
.mmap = generic_file_mmap,
.fsync = simple_sync_file,
.splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 0989bc2c2f69..52312ec93ff4 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = {
.aio_write = generic_file_aio_write,
.fsync = simple_sync_file,
.splice_read = generic_file_splice_read,
+ .splice_write = generic_file_splice_write,
.llseek = generic_file_llseek,
};
diff --git a/fs/read_write.c b/fs/read_write.c
index f0d1240a5c69..9ba495d5a29b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = {
EXPORT_SYMBOL(generic_ro_fops);
-loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
+loff_t
+generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
{
loff_t retval;
struct inode *inode = file->f_mapping->host;
- mutex_lock(&inode->i_mutex);
switch (origin) {
case SEEK_END:
offset += inode->i_size;
@@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
}
retval = -EINVAL;
if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
+ /* Special lock needed here? */
if (offset != file->f_pos) {
file->f_pos = offset;
file->f_version = 0;
}
retval = offset;
}
- mutex_unlock(&inode->i_mutex);
return retval;
}
+EXPORT_SYMBOL(generic_file_llseek_unlocked);
-EXPORT_SYMBOL(generic_file_llseek);
-
-loff_t remote_llseek(struct file *file, loff_t offset, int origin)
+loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
{
- loff_t retval;
-
- lock_kernel();
- switch (origin) {
- case SEEK_END:
- offset += i_size_read(file->f_path.dentry->d_inode);
- break;
- case SEEK_CUR:
- offset += file->f_pos;
- }
- retval = -EINVAL;
- if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) {
- if (offset != file->f_pos) {
- file->f_pos = offset;
- file->f_version = 0;
- }
- retval = offset;
- }
- unlock_kernel();
- return retval;
+ loff_t n;
+ mutex_lock(&file->f_dentry->d_inode->i_mutex);
+ n = generic_file_llseek_unlocked(file, offset, origin);
+ mutex_unlock(&file->f_dentry->d_inode->i_mutex);
+ return n;
}
-EXPORT_SYMBOL(remote_llseek);
+EXPORT_SYMBOL(generic_file_llseek);
loff_t no_llseek(struct file *file, loff_t offset, int origin)
{
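The split above leaves the seek logic in an exported _unlocked helper and makes generic_file_llseek() a thin i_mutex wrapper; callers with their own serialisation pick the core directly, as the smbfs conversion further below does under lock_kernel(). A sketch of the wrapper shape (the my_llseek*() names are illustrative):

#include <linux/fs.h>
#include <linux/mutex.h>

static loff_t my_llseek_unlocked(struct file *file, loff_t off, int origin)
{
	return off;	/* the real seek logic would live here */
}

static loff_t my_llseek(struct file *file, loff_t off, int origin)
{
	loff_t ret;

	mutex_lock(&file->f_dentry->d_inode->i_mutex);
	ret = my_llseek_unlocked(file, off, origin);
	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
	return ret;
}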
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 57917932212e..192269698a8a 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -45,6 +45,8 @@ void reiserfs_delete_inode(struct inode *inode)
goto out;
reiserfs_update_inode_transaction(inode);
+ reiserfs_discard_prealloc(&th, inode);
+
err = reiserfs_delete_object(&th, inode);
/* Do quota update inside a transaction for journaled quotas. We must do that
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index ed424d708e69..1d40f2bd1970 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -2165,8 +2165,10 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
blk++;
}
out:
- if (len == towrite)
+ if (len == towrite) {
+ mutex_unlock(&inode->i_mutex);
return err;
+ }
if (inode->i_size < off + len - towrite)
i_size_write(inode, off + len - towrite);
inode->i_version++;
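The reiserfs fix shows the classic hazard: the early "len == towrite" return escaped with i_mutex still held. A minimal sketch of the single-exit idiom that avoids such imbalances (the errno is illustrative):

#include <linux/errno.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(m);

static int do_write(int towrite, int len)
{
	int err = 0;

	mutex_lock(&m);
	if (len == towrite) {	/* nothing was written */
		err = -EIO;	/* illustrative errno */
		goto out;
	}
	/* ... update sizes, mark inode dirty ... */
out:
	mutex_unlock(&m);	/* every path funnels through here */
	return err;
}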
diff --git a/fs/select.c b/fs/select.c
index 8dda969614a9..da0e88201c3a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -249,7 +249,6 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
retval++;
}
}
- cond_resched();
}
if (res_in)
*rinp = res_in;
@@ -257,6 +256,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
*routp = res_out;
if (res_ex)
*rexp = res_ex;
+ cond_resched();
}
wait = NULL;
if (retval || !*timeout || signal_pending(current))
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index efbe29af3d7a..2294783320cb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
return error;
}
+static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
+{
+ loff_t ret;
+ lock_kernel();
+ ret = generic_file_llseek_unlocked(file, offset, origin);
+ unlock_kernel();
+ return ret;
+}
+
const struct file_operations smb_file_operations =
{
- .llseek = remote_llseek,
+ .llseek = smb_remote_llseek,
.read = do_sync_read,
.aio_read = smb_file_aio_read,
.write = do_sync_write,
diff --git a/fs/splice.c b/fs/splice.c
index 78150038b584..399442179d89 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -58,8 +58,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
*/
wait_on_page_writeback(page);
- if (PagePrivate(page))
- try_to_release_page(page, GFP_KERNEL);
+ if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
+ goto out_unlock;
/*
* If we succeeded in removing the mapping, set LRU flag
@@ -75,6 +75,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
* Raced with truncate or failed to remove page from current
* address space, unlock and return failure.
*/
+out_unlock:
unlock_page(page);
return 1;
}
@@ -378,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
lock_page(page);
/*
- * page was truncated, stop here. if this isn't the
- * first page, we'll just complete what we already
- * added
+ * Page was truncated, or invalidated by the
+ * filesystem. Redo the find/create, but this time the
+ * page is kept locked, so there's no chance of another
+ * race with truncate/invalidate.
*/
if (!page->mapping) {
unlock_page(page);
- break;
+ page = find_or_create_page(mapping, index,
+ mapping_gfp_mask(mapping));
+
+ if (!page) {
+ error = -ENOMEM;
+ break;
+ }
+ page_cache_release(pages[page_nr]);
+ pages[page_nr] = page;
}
/*
* page was already under io and is now done, great
@@ -983,7 +993,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
while (len) {
size_t read_len;
- loff_t pos = sd->pos;
+ loff_t pos = sd->pos, prev_pos = pos;
ret = do_splice_to(in, &pos, pipe, len, flags);
if (unlikely(ret <= 0))
@@ -998,15 +1008,19 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
* could get stuck data in the internal pipe:
*/
ret = actor(pipe, sd);
- if (unlikely(ret <= 0))
+ if (unlikely(ret <= 0)) {
+ sd->pos = prev_pos;
goto out_release;
+ }
bytes += ret;
len -= ret;
sd->pos = pos;
- if (ret < read_len)
+ if (ret < read_len) {
+ sd->pos = prev_pos + ret;
goto out_release;
+ }
}
done:
@@ -1072,7 +1086,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
if (ret > 0)
- *ppos += ret;
+ *ppos = sd.pos;
return ret;
}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index a1c3a1fab7f0..8c0e4b92574f 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -419,12 +419,8 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt,
*/
int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
{
- if (sysfs_find_dirent(acxt->parent_sd, sd->s_name)) {
- printk(KERN_WARNING "sysfs: duplicate filename '%s' "
- "can not be created\n", sd->s_name);
- WARN_ON(1);
+ if (sysfs_find_dirent(acxt->parent_sd, sd->s_name))
return -EEXIST;
- }
sd->s_parent = sysfs_get(acxt->parent_sd);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 7a5f69be6ac2..44cc702f96cc 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -682,38 +682,26 @@ static int udf_vrs(struct super_block *sb, int silent)
/*
* Check whether there is an anchor block in the given block
*/
-static int udf_check_anchor_block(struct super_block *sb, sector_t block,
- bool varconv)
+static int udf_check_anchor_block(struct super_block *sb, sector_t block)
{
- struct buffer_head *bh = NULL;
- tag *t;
+ struct buffer_head *bh;
uint16_t ident;
- uint32_t location;
- if (varconv) {
- if (udf_fixed_to_variable(block) >=
- sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
- return 0;
- bh = sb_bread(sb, udf_fixed_to_variable(block));
- }
- else
- bh = sb_bread(sb, block);
+ if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
+ udf_fixed_to_variable(block) >=
+ sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
+ return 0;
+ bh = udf_read_tagged(sb, block, block, &ident);
if (!bh)
return 0;
-
- t = (tag *)bh->b_data;
- ident = le16_to_cpu(t->tagIdent);
- location = le32_to_cpu(t->tagLocation);
brelse(bh);
- if (ident != TAG_IDENT_AVDP)
- return 0;
- return location == block;
+
+ return ident == TAG_IDENT_AVDP;
}
/* Search for an anchor volume descriptor pointer */
-static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
- sector_t lastblock)
+static sector_t udf_scan_anchors(struct super_block *sb, sector_t lastblock)
{
sector_t last[6];
int i;
@@ -739,7 +727,7 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
sb->s_blocksize_bits)
continue;
- if (udf_check_anchor_block(sb, last[i], varconv)) {
+ if (udf_check_anchor_block(sb, last[i])) {
sbi->s_anchor[0] = last[i];
sbi->s_anchor[1] = last[i] - 256;
return last[i];
@@ -748,17 +736,17 @@ static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
if (last[i] < 256)
continue;
- if (udf_check_anchor_block(sb, last[i] - 256, varconv)) {
+ if (udf_check_anchor_block(sb, last[i] - 256)) {
sbi->s_anchor[1] = last[i] - 256;
return last[i];
}
}
- if (udf_check_anchor_block(sb, sbi->s_session + 256, varconv)) {
+ if (udf_check_anchor_block(sb, sbi->s_session + 256)) {
sbi->s_anchor[0] = sbi->s_session + 256;
return last[0];
}
- if (udf_check_anchor_block(sb, sbi->s_session + 512, varconv)) {
+ if (udf_check_anchor_block(sb, sbi->s_session + 512)) {
sbi->s_anchor[0] = sbi->s_session + 512;
return last[0];
}
@@ -780,23 +768,24 @@ static void udf_find_anchor(struct super_block *sb)
int i;
struct udf_sb_info *sbi = UDF_SB(sb);
- lastblock = udf_scan_anchors(sb, 0, sbi->s_last_block);
+ lastblock = udf_scan_anchors(sb, sbi->s_last_block);
if (lastblock)
goto check_anchor;
/* No anchor found? Try VARCONV conversion of block numbers */
+ UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
/* Firstly, we try to not convert number of the last block */
- lastblock = udf_scan_anchors(sb, 1,
+ lastblock = udf_scan_anchors(sb,
udf_variable_to_fixed(sbi->s_last_block));
- if (lastblock) {
- UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+ if (lastblock)
goto check_anchor;
- }
/* Secondly, we try with converted number of the last block */
- lastblock = udf_scan_anchors(sb, 1, sbi->s_last_block);
- if (lastblock)
- UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
+ lastblock = udf_scan_anchors(sb, sbi->s_last_block);
+ if (!lastblock) {
+ /* VARCONV didn't help. Clear it. */
+ UDF_CLEAR_FLAG(sb, UDF_FLAG_VARCONV);
+ }
check_anchor:
/*
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 8fa9c2d70911..8ec865de5f13 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -16,7 +16,7 @@
#define UDF_PREALLOCATE
#define UDF_DEFAULT_PREALLOC_BLOCKS 8
-#define UDFFS_DEBUG
+#undef UDFFS_DEBUG
#ifdef UDFFS_DEBUG
#define udf_debug(f, a...) \
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 244a1aaa940e..11c035168ea6 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -107,7 +107,6 @@ extern struct inode * ufs_new_inode (struct inode *, int);
/* inode.c */
extern struct inode *ufs_iget(struct super_block *, unsigned long);
-extern void ufs_put_inode (struct inode *);
extern int ufs_write_inode (struct inode *, int);
extern int ufs_sync_inode (struct inode *);
extern void ufs_delete_inode (struct inode *);
diff --git a/fs/utimes.c b/fs/utimes.c
index af059d5cb485..b6b664e7145e 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -40,14 +40,9 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times)
#endif
-static bool nsec_special(long nsec)
-{
- return nsec == UTIME_OMIT || nsec == UTIME_NOW;
-}
-
static bool nsec_valid(long nsec)
{
- if (nsec_special(nsec))
+ if (nsec == UTIME_OMIT || nsec == UTIME_NOW)
return true;
return nsec >= 0 && nsec <= 999999999;
@@ -102,7 +97,11 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
if (error)
goto dput_and_out;
- /* Don't worry, the checks are done in inode_change_ok() */
+ if (times && times[0].tv_nsec == UTIME_NOW &&
+ times[1].tv_nsec == UTIME_NOW)
+ times = NULL;
+
+ /* In most cases, the checks are done in inode_change_ok() */
newattrs.ia_valid = ATTR_CTIME | ATTR_MTIME | ATTR_ATIME;
if (times) {
error = -EPERM;
@@ -124,28 +123,34 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
newattrs.ia_mtime.tv_nsec = times[1].tv_nsec;
newattrs.ia_valid |= ATTR_MTIME_SET;
}
- }
- /*
- * If times is NULL or both times are either UTIME_OMIT or
- * UTIME_NOW, then need to check permissions, because
- * inode_change_ok() won't do it.
- */
- if (!times || (nsec_special(times[0].tv_nsec) &&
- nsec_special(times[1].tv_nsec))) {
+ /*
+ * For the UTIME_OMIT/UTIME_NOW and UTIME_NOW/UTIME_OMIT
+ * cases, we need to make an extra check that is not done by
+ * inode_change_ok().
+ */
+ if (((times[0].tv_nsec == UTIME_NOW &&
+ times[1].tv_nsec == UTIME_OMIT)
+ ||
+ (times[0].tv_nsec == UTIME_OMIT &&
+ times[1].tv_nsec == UTIME_NOW))
+ && !is_owner_or_cap(inode))
+ goto mnt_drop_write_and_out;
+ } else {
+
+ /*
+ * If times is NULL (or both times are UTIME_NOW),
+ * then we need to check permissions, because
+ * inode_change_ok() won't do it.
+ */
error = -EACCES;
if (IS_IMMUTABLE(inode))
goto mnt_drop_write_and_out;
if (!is_owner_or_cap(inode)) {
- if (f) {
- if (!(f->f_mode & FMODE_WRITE))
- goto mnt_drop_write_and_out;
- } else {
- error = vfs_permission(&nd, MAY_WRITE);
- if (error)
- goto mnt_drop_write_and_out;
- }
+ error = permission(inode, MAY_WRITE, NULL);
+ if (error)
+ goto mnt_drop_write_and_out;
}
}
mutex_lock(&inode->i_mutex);
@@ -169,14 +174,6 @@ asmlinkage long sys_utimensat(int dfd, char __user *filename, struct timespec __
if (utimes) {
if (copy_from_user(&tstimes, utimes, sizeof(tstimes)))
return -EFAULT;
- if ((tstimes[0].tv_nsec == UTIME_OMIT ||
- tstimes[0].tv_nsec == UTIME_NOW) &&
- tstimes[0].tv_sec != 0)
- return -EINVAL;
- if ((tstimes[1].tv_nsec == UTIME_OMIT ||
- tstimes[1].tv_nsec == UTIME_NOW) &&
- tstimes[1].tv_sec != 0)
- return -EINVAL;
/* Nothing to do, we must not even check the path. */
if (tstimes[0].tv_nsec == UTIME_OMIT &&
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a3522727ea5b..b546ba69be82 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
if (len == 0)
return -ENOENT;
- slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL);
+ slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
if (slots == NULL)
return -ENOMEM;
@@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
struct dentry *alias;
int err, table;
- lock_kernel();
+ lock_super(sb);
table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
dentry->d_op = &vfat_dentry_ops[table];
@@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
brelse(sinfo.bh);
if (IS_ERR(inode)) {
- unlock_kernel();
+ unlock_super(sb);
return ERR_CAST(inode);
}
alias = d_find_alias(inode);
@@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
dput(alias);
else {
iput(inode);
- unlock_kernel();
+ unlock_super(sb);
return alias;
}
}
error:
- unlock_kernel();
+ unlock_super(sb);
dentry->d_op = &vfat_dentry_ops[table];
dentry->d_time = dentry->d_parent->d_inode->i_version;
dentry = d_splice_alias(inode, dentry);
@@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
struct timespec ts;
int err;
- lock_kernel();
+ lock_super(sb);
ts = CURRENT_TIME_SEC;
err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
dentry->d_time = dentry->d_parent->d_inode->i_version;
d_instantiate(dentry, inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = fat_dir_empty(inode);
if (err)
@@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -791,10 +792,11 @@ out:
static int vfat_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
+ struct super_block *sb = dir->i_sb;
struct fat_slot_info sinfo;
int err;
- lock_kernel();
+ lock_super(sb);
err = vfat_find(dir, &dentry->d_name, &sinfo);
if (err)
@@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
fat_detach(inode);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct timespec ts;
int err, cluster;
- lock_kernel();
+ lock_super(sb);
ts = CURRENT_TIME_SEC;
cluster = fat_alloc_new_dir(dir, &ts);
@@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
dentry->d_time = dentry->d_parent->d_inode->i_version;
d_instantiate(dentry, inode);
- unlock_kernel();
+ unlock_super(sb);
return 0;
out_free:
fat_free_clusters(dir, cluster);
out:
- unlock_kernel();
+ unlock_super(sb);
return err;
}
@@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
struct timespec ts;
loff_t dotdot_i_pos, new_i_pos;
int err, is_dir, update_dotdot, corrupt = 0;
+ struct super_block *sb = old_dir->i_sb;
old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
old_inode = old_dentry->d_inode;
new_inode = new_dentry->d_inode;
- lock_kernel();
+ lock_super(sb);
err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
if (err)
goto out;
@@ -951,7 +954,7 @@ out:
brelse(sinfo.bh);
brelse(dotdot_bh);
brelse(old_sinfo.bh);
- unlock_kernel();
+ unlock_super(sb);
return err;
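The conversions above swap the global BKL for the per-superblock lock, so namespace operations on different FAT volumes no longer serialise against each other. A minimal sketch of the shape each converted operation now takes (my_op() is an illustrative name):

#include <linux/fs.h>

static int my_op(struct inode *dir)
{
	struct super_block *sb = dir->i_sb;
	int err;

	lock_super(sb);		/* was: lock_kernel() */
	err = 0;		/* ... directory work on this sb ... */
	unlock_super(sb);
	return err;
}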
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 5105015a75ad..98e0e86093b4 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -387,6 +387,8 @@ _xfs_buf_lookup_pages(
if (unlikely(page == NULL)) {
if (flags & XBF_READ_AHEAD) {
bp->b_page_count = i;
+ for (i = 0; i < bp->b_page_count; i++)
+ unlock_page(bp->b_pages[i]);
return -ENOMEM;
}
@@ -416,17 +418,24 @@ _xfs_buf_lookup_pages(
ASSERT(!PagePrivate(page));
if (!PageUptodate(page)) {
page_count--;
- if (blocksize < PAGE_CACHE_SIZE && !PagePrivate(page)) {
+ if (blocksize >= PAGE_CACHE_SIZE) {
+ if (flags & XBF_READ)
+ bp->b_flags |= _XBF_PAGE_LOCKED;
+ } else if (!PagePrivate(page)) {
if (test_page_region(page, offset, nbytes))
page_count++;
}
}
- unlock_page(page);
bp->b_pages[i] = page;
offset = 0;
}
+ if (!(bp->b_flags & _XBF_PAGE_LOCKED)) {
+ for (i = 0; i < bp->b_page_count; i++)
+ unlock_page(bp->b_pages[i]);
+ }
+
if (page_count == bp->b_page_count)
bp->b_flags |= XBF_DONE;
@@ -746,6 +755,7 @@ xfs_buf_associate_memory(
bp->b_count_desired = len;
bp->b_buffer_length = buflen;
bp->b_flags |= XBF_MAPPED;
+ bp->b_flags &= ~_XBF_PAGE_LOCKED;
return 0;
}
@@ -1093,8 +1103,10 @@ _xfs_buf_ioend(
xfs_buf_t *bp,
int schedule)
{
- if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
+ bp->b_flags &= ~_XBF_PAGE_LOCKED;
xfs_buf_ioend(bp, schedule);
+ }
}
STATIC void
@@ -1125,6 +1137,9 @@ xfs_buf_bio_end_io(
if (--bvec >= bio->bi_io_vec)
prefetchw(&bvec->bv_page->flags);
+
+ if (bp->b_flags & _XBF_PAGE_LOCKED)
+ unlock_page(page);
} while (bvec >= bio->bi_io_vec);
_xfs_buf_ioend(bp, 1);
@@ -1163,7 +1178,8 @@ _xfs_buf_ioapply(
* filesystem block size is not smaller than the page size.
*/
if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
- (bp->b_flags & XBF_READ) &&
+ ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
+ (XBF_READ|_XBF_PAGE_LOCKED)) &&
(blocksize >= PAGE_CACHE_SIZE)) {
bio = bio_alloc(GFP_NOIO, 1);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 841d7883528d..f948ec7ba9a4 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -66,6 +66,25 @@ typedef enum {
_XBF_PAGES = (1 << 18), /* backed by refcounted pages */
_XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
_XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */
+
+ /*
+ * Special flag for supporting metadata blocks smaller than a FSB.
+ *
+ * In this case we can have multiple xfs_buf_t on a single page and
+ * need to lock out concurrent xfs_buf_t readers, as the buffer lock
+ * only serialises access to the buffer, not to the underlying page.
+ *
+ * In the FSB size >= PAGE_CACHE_SIZE case, there is no serialisation
+ * between reads of the page. Hence we can have one thread read the
+ * page and modify it, but then race with another thread that thinks
+ * the page is not up-to-date and hence reads it again.
+ *
+ * The result is that the first modification to the page is lost.
+ * This sort of AGF/AGI reading race can happen when unlinking inodes
+ * that require truncation and results in the AGI unlinked list
+ * modifications being lost.
+ */
+ _XBF_PAGE_LOCKED = (1 << 22),
} xfs_buf_flags_t;
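
Taken together, the xfs_buf.c hunks above implement a small protocol
around this flag; a condensed sketch (helper loops inlined, error
paths elided, not a drop-in implementation):

	/* at setup: find_or_create_page() returns the pages locked */
	if (blocksize >= PAGE_CACHE_SIZE && (flags & XBF_READ))
		bp->b_flags |= _XBF_PAGE_LOCKED;	/* hold page locks across I/O */

	if (!(bp->b_flags & _XBF_PAGE_LOCKED))
		for (i = 0; i < bp->b_page_count; i++)
			unlock_page(bp->b_pages[i]);	/* old behaviour: unlock early */

	/* at I/O completion, xfs_buf_bio_end_io() drops each page lock: */
	if (bp->b_flags & _XBF_PAGE_LOCKED)
		unlock_page(page);
	/* ...and _xfs_buf_ioend() clears the flag once b_io_remaining
	 * reaches zero. */

The idea is that a racing reader now blocks on the page lock in
find_or_create_page() until the first read has fully completed, rather
than seeing a half-initialised page and issuing a second, overwriting
read -- the AGF/AGI race described in the comment.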
typedef enum {
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 65e78c13d4ae..5f60363b9343 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -184,19 +184,24 @@ xfs_file_release(
return -xfs_release(XFS_I(inode));
}
+/*
+ * We ignore the datasync flag here because a datasync is effectively
+ * identical to an fsync. That is, datasync implies that we need to write
+ * only the metadata needed to be able to access the data that is written
+ * if we crash after the call completes. Hence if we are writing beyond
+ * EOF we have to log the inode size change as well, which makes it a
+ * full fsync. If we don't write beyond EOF, the inode core will be
+ * clean in memory and so we don't need to log the inode, just like
+ * fsync.
+ */
STATIC int
xfs_file_fsync(
struct file *filp,
struct dentry *dentry,
int datasync)
{
- int flags = FSYNC_WAIT;
-
- if (datasync)
- flags |= FSYNC_DATA;
xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED);
- return -xfs_fsync(XFS_I(dentry->d_inode), flags,
- (xfs_off_t)0, (xfs_off_t)-1);
+ return -xfs_fsync(XFS_I(dentry->d_inode));
}
/*
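
For reference, a simplified sketch of how both syscalls arrive here on
the VFS side (roughly fs/sync.c of this era; not part of this patch):

	/* fsync(2) passes datasync = 0, fdatasync(2) passes datasync = 1;
	 * both funnel into the same method, which XFS now treats
	 * identically per the comment above */
	static int vfs_fsync_sketch(struct file *file, int datasync)
	{
		return file->f_op->fsync(file, file->f_path.dentry, datasync);
	}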
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 9d73cb5c0fc7..25eb2a9e8d9b 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -230,14 +230,6 @@ static inline void vn_atime_to_time_t(bhv_vnode_t *vp, time_t *tt)
#define ATTR_NOSIZETOK 0x400 /* Don't get the SIZE token */
/*
- * Flags to vop_fsync/reclaim.
- */
-#define FSYNC_NOWAIT 0 /* asynchronous flush */
-#define FSYNC_WAIT 0x1 /* synchronous fsync or forced reclaim */
-#define FSYNC_INVAL 0x2 /* flush and invalidate cached data */
-#define FSYNC_DATA 0x4 /* synchronous fsync of data only */
-
-/*
* Tracking vnode activity.
*/
#if defined(XFS_INODE_TRACE)
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index cf0bb9c1d621..e569bf5d6cf0 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2974,6 +2974,7 @@ xfs_iflush_cluster(
xfs_mount_t *mp = ip->i_mount;
xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
unsigned long first_index, mask;
+ unsigned long inodes_per_cluster;
int ilist_size;
xfs_inode_t **ilist;
xfs_inode_t *iq;
@@ -2985,8 +2986,9 @@ xfs_iflush_cluster(
ASSERT(pag->pagi_inodeok);
ASSERT(pag->pag_ici_init);
- ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *);
- ilist = kmem_alloc(ilist_size, KM_MAYFAIL);
+ inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
+ ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
+ ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
if (!ilist)
return 0;
@@ -2995,8 +2997,7 @@ xfs_iflush_cluster(
read_lock(&pag->pag_ici_lock);
/* really need a gang lookup range call here */
nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
- first_index,
- XFS_INODE_CLUSTER_SIZE(mp));
+ first_index, inodes_per_cluster);
if (nr_found == 0)
goto out_free;
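
A worked example of the corrected sizing (numbers assumed purely for
illustration): with XFS_INODE_CLUSTER_SIZE(mp) = 8192 bytes and
256-byte inodes, sb_inodelog = 8, so

	inodes_per_cluster = 8192 >> 8 = 32;
	ilist_size = 32 * sizeof(xfs_inode_t *);	/* 256 bytes on 64-bit */

The old code sized the array from the cluster size in bytes, i.e.
8192 pointers, and asked the radix tree for up to 8192 inodes per
cluster. The added KM_NOFS keeps the allocation from recursing back
into the filesystem under memory pressure.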
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index afaee301b0ee..ad3d26ddfe31 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2427,13 +2427,20 @@ restart:
if (iclog->ic_size - iclog->ic_offset < 2*sizeof(xlog_op_header_t)) {
xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
- /* If I'm the only one writing to this iclog, sync it to disk */
- if (atomic_read(&iclog->ic_refcnt) == 1) {
+ /*
+ * If I'm the only one writing to this iclog, sync it to disk.
+ * We need to do an atomic compare and decrement here to avoid
+ * racing with concurrent atomic_dec_and_lock() calls in
+ * xlog_state_release_iclog() when there is more than one
+ * reference to the iclog.
+ */
+ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
+ /* we are the only one */
spin_unlock(&log->l_icloglock);
- if ((error = xlog_state_release_iclog(log, iclog)))
+ error = xlog_state_release_iclog(log, iclog);
+ if (error)
return error;
} else {
- atomic_dec(&iclog->ic_refcnt);
spin_unlock(&log->l_icloglock);
}
goto restart;
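
The idiom deserves spelling out; a standalone sketch of the
compare-and-decrement (release_iclog_sketch() is a hypothetical
stand-in for xlog_state_release_iclog()):

	if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) {
		/* atomic_add_unless() refused to add: the count was
		 * exactly 1, so we are the only holder.  Leave the
		 * final reference for the release path to drop. */
		release_iclog_sketch(log, iclog);
	} else {
		/* count was > 1: we decremented atomically; another
		 * holder is responsible for the final release */
	}

The old read-then-decrement pair could race: two holders might both
read ic_refcnt == 2, both take the "not last" branch, and both
decrement, leaving the count at zero with nobody ever syncing the
iclog.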
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 70702a60b4bb..e475e3717eb3 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -856,18 +856,14 @@ xfs_readlink(
/*
* xfs_fsync
*
- * This is called to sync the inode and its data out to disk.
- * We need to hold the I/O lock while flushing the data, and
- * the inode lock while flushing the inode. The inode lock CANNOT
- * be held while flushing the data, so acquire after we're done
- * with that.
+ * This is called to sync the inode and its data out to disk. We need to hold
+ * the I/O lock while flushing the data, and the inode lock while flushing the
+ * inode. The inode lock CANNOT be held while flushing the data, so acquire
+ * after we're done with that.
*/
int
xfs_fsync(
- xfs_inode_t *ip,
- int flag,
- xfs_off_t start,
- xfs_off_t stop)
+ xfs_inode_t *ip)
{
xfs_trans_t *tp;
int error;
@@ -875,103 +871,79 @@ xfs_fsync(
xfs_itrace_entry(ip);
- ASSERT(start >= 0 && stop >= -1);
-
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return XFS_ERROR(EIO);
- if (flag & FSYNC_DATA)
- filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+ /* capture size updates in I/O completion before writing the inode. */
+ error = filemap_fdatawait(vn_to_inode(XFS_ITOV(ip))->i_mapping);
+ if (error)
+ return XFS_ERROR(error);
/*
- * We always need to make sure that the required inode state
- * is safe on disk. The vnode might be clean but because
- * of committed transactions that haven't hit the disk yet.
- * Likewise, there could be unflushed non-transactional
- * changes to the inode core that have to go to disk.
+ * We always need to make sure that the required inode state is safe on
+ * disk. The vnode might be clean but we still might need to force the
+ * log because of committed transactions that haven't hit the disk yet.
+ * Likewise, there could be unflushed non-transactional changes to the
+ * inode core that have to go to disk and this requires us to issue
+ * a synchronous transaction to capture these changes correctly.
*
- * The following code depends on one assumption: that
- * any transaction that changes an inode logs the core
- * because it has to change some field in the inode core
- * (typically nextents or nblocks). That assumption
- * implies that any transactions against an inode will
- * catch any non-transactional updates. If inode-altering
- * transactions exist that violate this assumption, the
- * code breaks. Right now, it figures that if the involved
- * update_* field is clear and the inode is unpinned, the
- * inode is clean. Either it's been flushed or it's been
- * committed and the commit has hit the disk unpinning the inode.
- * (Note that xfs_inode_item_format() called at commit clears
- * the update_* fields.)
+ * This code relies on the assumption that if the update_* fields
+ * of the inode are clear and the inode is unpinned then it is clean
+ * and no action is required.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
- /* If we are flushing data then we care about update_size
- * being set, otherwise we care about update_core
- */
- if ((flag & FSYNC_DATA) ?
- (ip->i_update_size == 0) :
- (ip->i_update_core == 0)) {
+ if (!(ip->i_update_size || ip->i_update_core)) {
/*
- * Timestamps/size haven't changed since last inode
- * flush or inode transaction commit. That means
- * either nothing got written or a transaction
- * committed which caught the updates. If the
- * latter happened and the transaction hasn't
- * hit the disk yet, the inode will be still
- * be pinned. If it is, force the log.
+ * Timestamps/size haven't changed since last inode flush or
+ * inode transaction commit. That means either nothing got
+ * written or a transaction committed which caught the updates.
+ * If the latter happened and the transaction hasn't hit the
+ * disk yet, the inode will still be pinned. If it is,
+ * force the log.
*/
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip)) {
- _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
- XFS_LOG_FORCE |
- ((flag & FSYNC_WAIT)
- ? XFS_LOG_SYNC : 0),
+ error = _xfs_log_force(ip->i_mount, (xfs_lsn_t)0,
+ XFS_LOG_FORCE | XFS_LOG_SYNC,
&log_flushed);
} else {
/*
- * If the inode is not pinned and nothing
- * has changed we don't need to flush the
- * cache.
+ * If the inode is not pinned and nothing has changed
+ * we don't need to flush the cache.
*/
changed = 0;
}
- error = 0;
} else {
/*
- * Kick off a transaction to log the inode
- * core to get the updates. Make it
- * sync if FSYNC_WAIT is passed in (which
- * is done by everybody but specfs). The
- * sync transaction will also force the log.
+ * Kick off a transaction to log the inode core to get the
+ * updates. The sync transaction will also force the log.
*/
xfs_iunlock(ip, XFS_ILOCK_SHARED);
tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_FSYNC_TS);
- if ((error = xfs_trans_reserve(tp, 0,
- XFS_FSYNC_TS_LOG_RES(ip->i_mount),
- 0, 0, 0))) {
+ error = xfs_trans_reserve(tp, 0,
+ XFS_FSYNC_TS_LOG_RES(ip->i_mount), 0, 0, 0);
+ if (error) {
xfs_trans_cancel(tp, 0);
return error;
}
xfs_ilock(ip, XFS_ILOCK_EXCL);
/*
- * Note - it's possible that we might have pushed
- * ourselves out of the way during trans_reserve
- * which would flush the inode. But there's no
- * guarantee that the inode buffer has actually
- * gone out yet (it's delwri). Plus the buffer
- * could be pinned anyway if it's part of an
- * inode in another recent transaction. So we
- * play it safe and fire off the transaction anyway.
+ * Note - it's possible that we might have pushed ourselves out
+ * of the way during trans_reserve which would flush the inode.
+ * But there's no guarantee that the inode buffer has actually
+ * gone out yet (it's delwri). Plus the buffer could be pinned
+ * anyway if it's part of an inode in another recent
+ * transaction. So we play it safe and fire off the
+ * transaction anyway.
*/
xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
xfs_trans_ihold(tp, ip);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- if (flag & FSYNC_WAIT)
- xfs_trans_set_sync(tp);
+ xfs_trans_set_sync(tp);
error = _xfs_trans_commit(tp, 0, &log_flushed);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
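
Condensed, the rewritten function follows a single policy; a
control-flow sketch (helper names are descriptive placeholders for the
calls shown above, with locking and error handling elided):

	int xfs_fsync_sketch(xfs_inode_t *ip)
	{
		if (XFS_FORCED_SHUTDOWN(ip->i_mount))
			return XFS_ERROR(EIO);

		wait_for_data_io(ip);		/* filemap_fdatawait() */

		if (!(ip->i_update_size || ip->i_update_core)) {
			if (xfs_ipincount(ip))
				force_log_sync(ip);	/* committed, not yet on disk */
			/* else: inode is clean, nothing to do */
		} else {
			log_inode_core_sync(ip);	/* sync XFS_TRANS_FSYNC_TS */
		}
		return 0;
	}

Every caller now gets synchronous semantics, which is why the
FSYNC_WAIT/FSYNC_DATA flags and their definitions in xfs_vnode.h can
go away entirely.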
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 8abe8f186e20..57335ba4ce53 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,8 +18,7 @@ int xfs_open(struct xfs_inode *ip);
int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags,
struct cred *credp);
int xfs_readlink(struct xfs_inode *ip, char *link);
-int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
- xfs_off_t stop);
+int xfs_fsync(struct xfs_inode *ip);
int xfs_release(struct xfs_inode *ip);
int xfs_inactive(struct xfs_inode *ip);
int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,