115 files changed, 4050 insertions, 3510 deletions
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8f76b13d5549..d5990eb160bd 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -419,7 +419,7 @@ error_do_abort:
 	call->state = AFS_CALL_COMPLETE;
 	if (ret != -ECONNABORTED) {
 		rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT,
-					-ret, "KSD");
+					ret, "KSD");
 	} else {
 		abort_code = 0;
 		offset = 0;
@@ -478,12 +478,12 @@ static void afs_deliver_to_call(struct afs_call *call)
 		case -ENOTCONN:
 			abort_code = RX_CALL_DEAD;
 			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
-						abort_code, -ret, "KNC");
+						abort_code, ret, "KNC");
 			goto save_error;
 		case -ENOTSUPP:
 			abort_code = RXGEN_OPCODE;
 			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
-						abort_code, -ret, "KIV");
+						abort_code, ret, "KIV");
 			goto save_error;
 		case -ENODATA:
 		case -EBADMSG:
@@ -493,7 +493,7 @@ static void afs_deliver_to_call(struct afs_call *call)
 			if (call->state != AFS_CALL_AWAIT_REPLY)
 				abort_code = RXGEN_SS_UNMARSHAL;
 			rxrpc_kernel_abort_call(afs_socket, call->rxcall,
-						abort_code, EBADMSG, "KUM");
+						abort_code, -EBADMSG, "KUM");
 			goto save_error;
 		}
 	}
@@ -754,7 +754,7 @@ void afs_send_empty_reply(struct afs_call *call)
 	case -ENOMEM:
 		_debug("oom");
 		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
-					RX_USER_ABORT, ENOMEM, "KOO");
+					RX_USER_ABORT, -ENOMEM, "KOO");
 	default:
 		_leave(" [error]");
 		return;
@@ -792,7 +792,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	if (n == -ENOMEM) {
 		_debug("oom");
 		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
-					RX_USER_ABORT, ENOMEM, "KOO");
+					RX_USER_ABORT, -ENOMEM, "KOO");
 	}
 	_leave(" [error]");
 }
diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig
index 1204d6384d39..44727bf18297 100644
--- a/fs/autofs4/Kconfig
+++ b/fs/autofs4/Kconfig
@@ -7,7 +7,7 @@ config AUTOFS4_FS
 	  automounter (amd), which is a pure user space daemon.
 
 	  To use the automounter you need the user-space tools from
-	  <ftp://ftp.kernel.org/pub/linux/daemons/autofs/v4/>; you also
+	  <https://www.kernel.org/pub/linux/daemons/autofs/v4/>; you also
 	  want to answer Y to "NFS file system support", below.
 
 	  To compile this support as a module, choose M here: the module will be
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index c500e954debb..63e7c4760bfb 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -58,6 +58,7 @@ static struct dentry *befs_fh_to_dentry(struct super_block *sb,
 				struct fid *fid, int fh_len, int fh_type);
 static struct dentry *befs_fh_to_parent(struct super_block *sb,
 				struct fid *fid, int fh_len, int fh_type);
+static struct dentry *befs_get_parent(struct dentry *child);
 
 static const struct super_operations befs_sops = {
 	.alloc_inode	= befs_alloc_inode,	/* allocate a new inode */
@@ -93,6 +94,7 @@ static const struct address_space_operations befs_symlink_aops = {
 static const struct export_operations befs_export_operations = {
 	.fh_to_dentry	= befs_fh_to_dentry,
 	.fh_to_parent	= befs_fh_to_parent,
+	.get_parent	= befs_get_parent,
 };
 
 /*
@@ -667,6 +669,19 @@ static struct dentry *befs_fh_to_parent(struct super_block *sb,
 				    befs_nfs_get_inode);
 }
 
+/* export_operations ->get_parent: return a dentry for @child's parent inode. */
+static struct dentry *befs_get_parent(struct dentry *child)
+{
+	struct befs_inode_info *info = BEFS_I(d_inode(child));
+	unsigned long parent_ino = (unsigned long)info->i_parent.start;
+	struct inode *dir = befs_iget(child->d_sb, parent_ino);
+
+	if (IS_ERR(dir))
+		return ERR_CAST(dir);
+
+	return d_obtain_alias(dir);
+}
+
 enum {
 	Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
 };
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9ccabe3bb7de..2a305c1a2d88 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -18,6 +18,7 @@
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/magic.h>
+#include <linux/dax.h>
 #include <linux/buffer_head.h>
 #include <linux/swap.h>
 #include <linux/pagevec.h>
@@ -103,12 +104,11 @@ void invalidate_bdev(struct block_device *bdev)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-	if (mapping->nrpages == 0)
-		return;
-
-	invalidate_bh_lrus();
-	lru_add_drain_all();	/* make sure all lru add caches are flushed */
-	invalidate_mapping_pages(mapping, 0, -1);
+	if (mapping->nrpages) {
+		invalidate_bh_lrus();
+		lru_add_drain_all();	/* make sure all lru add caches are flushed */
+		invalidate_mapping_pages(mapping, 0, -1);
+	}
 	/* 99% of the time, we don't need to flush the cleancache on the bdev.
 	 * But, for the strange corners, lets be cautious
 	 */
@@ -717,50 +717,18 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL_GPL(bdev_write_page);
 
-/**
- * bdev_direct_access() - Get the address for directly-accessibly memory
- * @bdev: The device containing the memory
- * @dax: control and output parameters for ->direct_access
- *
- * If a block device is made up of directly addressable memory, this function
- * will tell the caller the PFN and the address of the memory.  The address
- * may be directly dereferenced within the kernel without the need to call
- * ioremap(), kmap() or similar.  The PFN is suitable for inserting into
- * page tables.
- *
- * Return: negative errno if an error occurs, otherwise the number of bytes
- * accessible at this address.
- */
-long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
+int bdev_dax_pgoff(struct block_device *bdev, sector_t sector, size_t size,
+		pgoff_t *pgoff)
 {
-	sector_t sector = dax->sector;
-	long avail, size = dax->size;
-	const struct block_device_operations *ops = bdev->bd_disk->fops;
-
-	/*
-	 * The device driver is allowed to sleep, in order to make the
-	 * memory directly accessible.
-	 */
-	might_sleep();
+	phys_addr_t phys_off = (get_start_sect(bdev) + sector) * 512;
 
-	if (size < 0)
-		return size;
-	if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access)
-		return -EOPNOTSUPP;
-	if ((sector + DIV_ROUND_UP(size, 512)) >
-					part_nr_sects_read(bdev->bd_part))
-		return -ERANGE;
-	sector += get_start_sect(bdev);
-	if (sector % (PAGE_SIZE / 512))
+	if (pgoff)
+		*pgoff = PHYS_PFN(phys_off);
+	if (phys_off % PAGE_SIZE || size % PAGE_SIZE)
 		return -EINVAL;
-	avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size);
-	if (!avail)
-		return -ERANGE;
-	if (avail > 0 && avail & ~PAGE_MASK)
-		return -ENXIO;
-	return min(avail, size);
+	return 0;
 }
-EXPORT_SYMBOL_GPL(bdev_direct_access);
+EXPORT_SYMBOL(bdev_dax_pgoff);
 
 /**
  * bdev_dax_supported() - Check if the device supports dax for filesystem
@@ -774,62 +742,46 @@ EXPORT_SYMBOL_GPL(bdev_direct_access);
  */
 int bdev_dax_supported(struct super_block *sb, int blocksize)
 {
-	struct blk_dax_ctl dax = {
-		.sector = 0,
-		.size = PAGE_SIZE,
-	};
-	int err;
+	struct block_device *bdev = sb->s_bdev;
+	struct dax_device *dax_dev;
+	pgoff_t pgoff;
+	int err, id;
+	void *kaddr;
+	pfn_t pfn;
+	long len;
 
 	if (blocksize != PAGE_SIZE) {
 		vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax");
 		return -EINVAL;
 	}
 
-	err = bdev_direct_access(sb->s_bdev, &dax);
-	if (err < 0) {
-		switch (err) {
-		case -EOPNOTSUPP:
-			vfs_msg(sb, KERN_ERR,
-				"error: device does not support dax");
-			break;
-		case -EINVAL:
-			vfs_msg(sb, KERN_ERR,
-				"error: unaligned partition for dax");
-			break;
-		default:
-			vfs_msg(sb, KERN_ERR,
-				"error: dax access failed (%d)", err);
-		}
+	err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
+	if (err) {
+		vfs_msg(sb, KERN_ERR, "error: unaligned partition for dax");
 		return err;
 	}
 
-	return 0;
-}
-EXPORT_SYMBOL_GPL(bdev_dax_supported);
-
-/**
- * bdev_dax_capable() - Return if the raw device is capable for dax
- * @bdev: The device for raw block device access
- */
-bool bdev_dax_capable(struct block_device *bdev)
-{
-	struct blk_dax_ctl dax = {
-		.size = PAGE_SIZE,
-	};
+	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	if (!dax_dev) {
+		vfs_msg(sb, KERN_ERR, "error: device does not support dax");
+		return -EOPNOTSUPP;
+	}
 
-	if (!IS_ENABLED(CONFIG_FS_DAX))
-		return false;
+	id = dax_read_lock();
+	len = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
+	dax_read_unlock(id);
 
-	dax.sector = 0;
-	if (bdev_direct_access(bdev, &dax) < 0)
-		return false;
+	put_dax(dax_dev);
 
-	dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512);
-	if (bdev_direct_access(bdev, &dax) < 0)
-		return false;
+	if (len < 1) {
+		vfs_msg(sb, KERN_ERR,
+				"error: dax access failed (%ld)", len);
+		return len < 0 ? len : -EIO;
+	}
 
-	return true;
+	return 0;
 }
+EXPORT_SYMBOL_GPL(bdev_dax_supported);
 
 /*
  * pseudo-fs
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 44a240c4bb65..fb8507f521b2 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -471,6 +471,85 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count)
 	return 0;
 }
 
+/**
+ * cdev_set_parent() - set the parent kobject for a char device
+ * @p: the cdev structure
+ * @kobj: the kobject to take a reference to
+ *
+ * cdev_set_parent() sets a parent kobject which will be referenced
+ * appropriately so the parent is not freed before the cdev. This should
+ * be called before cdev_add. Warns if @kobj is not yet initialized.
+ */
+void cdev_set_parent(struct cdev *p, struct kobject *kobj)
+{
+	WARN_ON(!kobj->state_initialized);
+	p->kobj.parent = kobj;
+}
+
+/**
+ * cdev_device_add() - add a char device and its corresponding struct device
+ * @cdev: the cdev structure
+ * @dev: the device structure
+ *
+ * cdev_device_add() adds the char device represented by @cdev to the system,
+ * just as cdev_add does. It then adds @dev to the system using device_add.
+ * The dev_t for the char device will be taken from the struct device which
+ * needs to be initialized first. This helper function correctly takes a
+ * reference to the parent device so the parent will not get released until
+ * all references to the cdev are released.
+ *
+ * This helper uses dev->devt for the device number. If it is not set
+ * it will not add the cdev and it will be equivalent to device_add.
+ *
+ * This function should be used whenever the struct cdev and the struct device
+ * are members of the same structure whose lifetime is managed by that device.
+ *
+ * NOTE: Callers must assume that userspace was able to open the cdev and
+ * can call cdev fops callbacks at any time, even if this function fails.
+ *
+ * Return: 0 on success, otherwise the error from cdev_add() or device_add().
+ */
+int cdev_device_add(struct cdev *cdev, struct device *dev)
+{
+	int rc = 0;
+
+	if (dev->devt) {
+		cdev_set_parent(cdev, &dev->kobj);
+		rc = cdev_add(cdev, dev->devt, 1);
+		if (rc)
+			return rc;
+	}
+
+	rc = device_add(dev);
+	/* Only undo cdev_add() if it actually ran (i.e. devt is set). */
+	if (rc && dev->devt)
+		cdev_del(cdev);
+
+	return rc;
+}
+
+/**
+ * cdev_device_del() - inverse of cdev_device_add
+ * @cdev: the cdev structure
+ * @dev: the device structure
+ *
+ * cdev_device_del() is a helper function to call device_del and then
+ * cdev_del. It should be used whenever cdev_device_add is used.
+ *
+ * If dev->devt is not set it will not remove the cdev and will be equivalent
+ * to device_del.
+ *
+ * NOTE: This guarantees that associated sysfs callbacks are not running
+ * or runnable, however any cdevs already open will remain and their fops
+ * will still be callable even after this function returns.
+ */
+void cdev_device_del(struct cdev *cdev, struct device *dev)
+{
+	device_del(dev);
+	if (dev->devt)
+		cdev_del(cdev);
+}
+
 static void cdev_unmap(dev_t dev, unsigned count)
 {
 	kobj_unmap(cdev_map, dev, count);
@@ -482,6 +561,10 @@ static void cdev_unmap(dev_t dev, unsigned count)
  *
  * cdev_del() removes @p from the system, possibly freeing the structure
  * itself.
+ *
+ * NOTE: This guarantees that the cdev can no longer be opened; however,
+ * any cdevs already open will remain and their fops will still be
+ * callable even after cdev_del returns.
  */
 void cdev_del(struct cdev *p)
 {
@@ -570,5 +653,8 @@ EXPORT_SYMBOL(cdev_init);
 EXPORT_SYMBOL(cdev_alloc);
 EXPORT_SYMBOL(cdev_del);
 EXPORT_SYMBOL(cdev_add);
+EXPORT_SYMBOL(cdev_set_parent);
+EXPORT_SYMBOL(cdev_device_add);
+EXPORT_SYMBOL(cdev_device_del);
 EXPORT_SYMBOL(__register_chrdev);
 EXPORT_SYMBOL(__unregister_chrdev);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 34fee9fb7e4f..d0d11b73b2af 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -37,6 +37,7 @@
 #include <linux/freezer.h>
 #include <linux/namei.h>
 #include <linux/random.h>
+#include <linux/uuid.h>
 #include <linux/xattr.h>
 #include <net/ipv6.h>
 #include "cifsfs.h"
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index b3c9d8c310f2..9bc0b4d6d065 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -35,6 +35,7 @@
 #include <linux/pagevec.h>
 #include <linux/freezer.h>
 #include <linux/namei.h>
+#include <linux/uuid.h>
 #include <linux/uaccess.h>
 #include <asm/processor.h>
 #include <linux/inet.h>
@@ -2912,16 +2913,14 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data)
 {
 	struct cifs_sb_info *old = CIFS_SB(sb);
 	struct cifs_sb_info *new = mnt_data->cifs_sb;
+	bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
+	bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
 
-	if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) {
-		if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH))
-			return 0;
-		/* The prepath should be null terminated strings */
-		if (strcmp(new->prepath, old->prepath))
-			return 0;
-
+	if (old_set && new_set && !strcmp(new->prepath, old->prepath))
 		return 1;
-	}
+	else if (!old_set && !new_set)
+		return 1;
+
 	return 0;
 }
 
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 265c45fe4ea5..7f4bba574930 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -74,7 +74,8 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
 
 	rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0,
 					src_inode->i_size, 0);
-
+	if (rc > 0)
+		rc = 0;
 out_fput:
 	fdput(src_file);
 out_drop_write:
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index d3fb11529ed9..843787850435 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -167,13 +167,11 @@ cifs_buf_get(void)
 
 	/* clear the first few header bytes */
 	/* for most paths, more is cleared in header_assemble */
-	if (ret_buf) {
-		memset(ret_buf, 0, buf_size + 3);
-		atomic_inc(&bufAllocCount);
+	memset(ret_buf, 0, buf_size + 3);
+	atomic_inc(&bufAllocCount);
 #ifdef CONFIG_CIFS_STATS2
-		atomic_inc(&totBufAllocCount);
+	atomic_inc(&totBufAllocCount);
 #endif /* CONFIG_CIFS_STATS2 */
-	}
 
 	return ret_buf;
 }
@@ -201,15 +199,13 @@ cifs_small_buf_get(void)
    albeit slightly larger than necessary and maxbuffersize
    defaults to this and can not be bigger */
 	ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS);
-	if (ret_buf) {
 	/* No need to clear memory here, cleared in header assemble */
 	/*	memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/
-		atomic_inc(&smBufAllocCount);
+	atomic_inc(&smBufAllocCount);
 #ifdef CONFIG_CIFS_STATS2
-		atomic_inc(&totSmBufAllocCount);
+	atomic_inc(&totSmBufAllocCount);
 #endif /* CONFIG_CIFS_STATS2 */
 
-	}
 	return ret_buf;
 }
 
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 02da648041fc..fb0da096c2ce 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -33,6 +33,7 @@
 #include <linux/vfs.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/uaccess.h>
+#include <linux/uuid.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
 #include "smb2pdu.h"
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 506b67fc93d9..c69ec96e92ac 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -538,23 +538,19 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
 	}
 
 	temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
-	if (temp == NULL)
-		return temp;
-	else {
-		memset(temp, 0, sizeof(struct mid_q_entry));
-		temp->mid = le64_to_cpu(shdr->MessageId);
-		temp->pid = current->pid;
-		temp->command = shdr->Command; /* Always LE */
-		temp->when_alloc = jiffies;
-		temp->server = server;
-
-		/*
-		 * The default is for the mid to be synchronous, so the
-		 * default callback just wakes up the current task.
-		 */
-		temp->callback = cifs_wake_up_task;
-		temp->callback_data = current;
-	}
+	memset(temp, 0, sizeof(struct mid_q_entry));
+	temp->mid = le64_to_cpu(shdr->MessageId);
+	temp->pid = current->pid;
+	temp->command = shdr->Command; /* Always LE */
+	temp->when_alloc = jiffies;
+	temp->server = server;
+
+	/*
+	 * The default is for the mid to be synchronous, so the
+	 * default callback just wakes up the current task.
+	 */
+	temp->callback = cifs_wake_up_task;
+	temp->callback_data = current;
 
 	atomic_inc(&midCount);
 	temp->mid_state = MID_REQUEST_ALLOCATED;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index f6e13a977fc8..4d64b5b8fc9c 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -55,26 +55,22 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
 	}
 
 	temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
-	if (temp == NULL)
-		return temp;
-	else {
-		memset(temp, 0, sizeof(struct mid_q_entry));
-		temp->mid = get_mid(smb_buffer);
-		temp->pid = current->pid;
-		temp->command = cpu_to_le16(smb_buffer->Command);
-		cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
+	memset(temp, 0, sizeof(struct mid_q_entry));
+	temp->mid = get_mid(smb_buffer);
+	temp->pid = current->pid;
+	temp->command = cpu_to_le16(smb_buffer->Command);
+	cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command);
 	/*	do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */
-		/* when mid allocated can be before when sent */
-		temp->when_alloc = jiffies;
-		temp->server = server;
+	/* when mid allocated can be before when sent */
+	temp->when_alloc = jiffies;
+	temp->server = server;
 
-		/*
-		 * The default is for the mid to be synchronous, so the
-		 * default callback just wakes up the current task.
-		 */
-		temp->callback = cifs_wake_up_task;
-		temp->callback_data = current;
-	}
+	/*
+	 * The default is for the mid to be synchronous, so the
+	 * default callback just wakes up the current task.
+	 */
+	temp->callback = cifs_wake_up_task;
+	temp->callback_data = current;
 
 	atomic_inc(&midCount);
 	temp->mid_state = MID_REQUEST_ALLOCATED;
diff --git a/fs/compat.c b/fs/compat.c
index c61b506f5bc9..190b38b39d9e 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -15,555 +15,14 @@
  *  published by the Free Software Foundation.
  */
 
-#include <linux/stddef.h>
-#include <linux/kernel.h>
-#include <linux/linkage.h>
 #include <linux/compat.h>
-#include <linux/errno.h>
-#include <linux/time.h>
-#include <linux/cred.h>
-#include <linux/fs.h>
-#include <linux/fcntl.h>
-#include <linux/namei.h>
-#include <linux/file.h>
-#include <linux/fdtable.h>
-#include <linux/vfs.h>
-#include <linux/ioctl.h>
-#include <linux/init.h>
 #include <linux/ncp_mount.h>
 #include <linux/nfs4_mount.h>
 #include <linux/syscalls.h>
-#include <linux/ctype.h>
-#include <linux/dirent.h>
-#include <linux/fsnotify.h>
-#include <linux/highuid.h>
-#include <linux/personality.h>
-#include <linux/rwsem.h>
-#include <linux/tsacct_kern.h>
-#include <linux/security.h>
-#include <linux/highmem.h>
-#include <linux/signal.h>
-#include <linux/poll.h>
-#include <linux/mm.h>
-#include <linux/fs_struct.h>
 #include <linux/slab.h>
-#include <linux/pagemap.h>
-#include <linux/aio.h>
-
 #include <linux/uaccess.h>
-#include <asm/mmu_context.h>
-#include <asm/ioctls.h>
 #include "internal.h"
 
-/*
- * Not all architectures have sys_utime, so implement this in terms
- * of sys_utimes.
- */
-COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
-		       struct compat_utimbuf __user *, t)
-{
-	struct timespec tv[2];
-
-	if (t) {
-		if (get_user(tv[0].tv_sec, &t->actime) ||
-		    get_user(tv[1].tv_sec, &t->modtime))
-			return -EFAULT;
-		tv[0].tv_nsec = 0;
-		tv[1].tv_nsec = 0;
-	}
-	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
-}
-
-COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
-{
-	struct timespec tv[2];
-
-	if  (t) {
-		if (compat_get_timespec(&tv[0], &t[0]) ||
-		    compat_get_timespec(&tv[1], &t[1]))
-			return -EFAULT;
-
-		if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
-			return 0;
-	}
-	return do_utimes(dfd, filename, t ? tv : NULL, flags);
-}
-
-COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
-{
-	struct timespec tv[2];
-
-	if (t) {
-		if (get_user(tv[0].tv_sec, &t[0].tv_sec) ||
-		    get_user(tv[0].tv_nsec, &t[0].tv_usec) ||
-		    get_user(tv[1].tv_sec, &t[1].tv_sec) ||
-		    get_user(tv[1].tv_nsec, &t[1].tv_usec))
-			return -EFAULT;
-		if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 ||
-		    tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0)
-			return -EINVAL;
-		tv[0].tv_nsec *= 1000;
-		tv[1].tv_nsec *= 1000;
-	}
-	return do_utimes(dfd, filename, t ? tv : NULL, 0);
-}
-
-COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
-{
-	return compat_sys_futimesat(AT_FDCWD, filename, t);
-}
-
-static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
-{
-	struct compat_stat tmp;
-
-	if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
-		return -EOVERFLOW;
-
-	memset(&tmp, 0, sizeof(tmp));
-	tmp.st_dev = old_encode_dev(stat->dev);
-	tmp.st_ino = stat->ino;
-	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
-		return -EOVERFLOW;
-	tmp.st_mode = stat->mode;
-	tmp.st_nlink = stat->nlink;
-	if (tmp.st_nlink != stat->nlink)
-		return -EOVERFLOW;
-	SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
-	SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
-	tmp.st_rdev = old_encode_dev(stat->rdev);
-	if ((u64) stat->size > MAX_NON_LFS)
-		return -EOVERFLOW;
-	tmp.st_size = stat->size;
-	tmp.st_atime = stat->atime.tv_sec;
-	tmp.st_atime_nsec = stat->atime.tv_nsec;
-	tmp.st_mtime = stat->mtime.tv_sec;
-	tmp.st_mtime_nsec = stat->mtime.tv_nsec;
-	tmp.st_ctime = stat->ctime.tv_sec;
-	tmp.st_ctime_nsec = stat->ctime.tv_nsec;
-	tmp.st_blocks = stat->blocks;
-	tmp.st_blksize = stat->blksize;
-	return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
-}
-
-COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
-		       struct compat_stat __user *, statbuf)
-{
-	struct kstat stat;
-	int error;
-
-	error = vfs_stat(filename, &stat);
-	if (error)
-		return error;
-	return cp_compat_stat(&stat, statbuf);
-}
-
-COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
-		       struct compat_stat __user *, statbuf)
-{
-	struct kstat stat;
-	int error;
-
-	error = vfs_lstat(filename, &stat);
-	if (error)
-		return error;
-	return cp_compat_stat(&stat, statbuf);
-}
-
-#ifndef __ARCH_WANT_STAT64
-COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
-		       const char __user *, filename,
-		       struct compat_stat __user *, statbuf, int, flag)
-{
-	struct kstat stat;
-	int error;
-
-	error = vfs_fstatat(dfd, filename, &stat, flag);
-	if (error)
-		return error;
-	return cp_compat_stat(&stat, statbuf);
-}
-#endif
-
-COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
-		       struct compat_stat __user *, statbuf)
-{
-	struct kstat stat;
-	int error = vfs_fstat(fd, &stat);
-
-	if (!error)
-		error = cp_compat_stat(&stat, statbuf);
-	return error;
-}
-
-static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf)
-{
-	
-	if (sizeof ubuf->f_blocks == 4) {
-		if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
-		     kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
-			return -EOVERFLOW;
-		/* f_files and f_ffree may be -1; it's okay
-		 * to stuff that into 32 bits */
-		if (kbuf->f_files != 0xffffffffffffffffULL
-		 && (kbuf->f_files & 0xffffffff00000000ULL))
-			return -EOVERFLOW;
-		if (kbuf->f_ffree != 0xffffffffffffffffULL
-		 && (kbuf->f_ffree & 0xffffffff00000000ULL))
-			return -EOVERFLOW;
-	}
-	if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
-	    __put_user(kbuf->f_type, &ubuf->f_type) ||
-	    __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
-	    __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
-	    __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
-	    __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
-	    __put_user(kbuf->f_files, &ubuf->f_files) ||
-	    __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
-	    __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
-	    __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
-	    __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
-	    __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
-	    __put_user(kbuf->f_flags, &ubuf->f_flags) ||
-	    __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
-		return -EFAULT;
-	return 0;
-}
-
-/*
- * The following statfs calls are copies of code from fs/statfs.c and
- * should be checked against those from time to time
- */
-COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
-{
-	struct kstatfs tmp;
-	int error = user_statfs(pathname, &tmp);
-	if (!error)
-		error = put_compat_statfs(buf, &tmp);
-	return error;
-}
-
-COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
-{
-	struct kstatfs tmp;
-	int error = fd_statfs(fd, &tmp);
-	if (!error)
-		error = put_compat_statfs(buf, &tmp);
-	return error;
-}
-
-static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
-{
-	if (sizeof(ubuf->f_bsize) == 4) {
-		if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
-		     kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
-			return -EOVERFLOW;
-		/* f_files and f_ffree may be -1; it's okay
-		 * to stuff that into 32 bits */
-		if (kbuf->f_files != 0xffffffffffffffffULL
-		 && (kbuf->f_files & 0xffffffff00000000ULL))
-			return -EOVERFLOW;
-		if (kbuf->f_ffree != 0xffffffffffffffffULL
-		 && (kbuf->f_ffree & 0xffffffff00000000ULL))
-			return -EOVERFLOW;
-	}
-	if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
-	    __put_user(kbuf->f_type, &ubuf->f_type) ||
-	    __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
-	    __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
-	    __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
-	    __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
-	    __put_user(kbuf->f_files, &ubuf->f_files) ||
-	    __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
-	    __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
-	    __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
-	    __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
-	    __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
-	    __put_user(kbuf->f_flags, &ubuf->f_flags) ||
-	    __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
-		return -EFAULT;
-	return 0;
-}
-
-COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
-{
-	struct kstatfs tmp;
-	int error;
-
-	if (sz != sizeof(*buf))
-		return -EINVAL;
-
-	error = user_statfs(pathname, &tmp);
-	if (!error)
-		error = put_compat_statfs64(buf, &tmp);
-	return error;
-}
-
-COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
-{
-	struct kstatfs tmp;
-	int error;
-
-	if (sz != sizeof(*buf))
-		return -EINVAL;
-
-	error = fd_statfs(fd, &tmp);
-	if (!error)
-		error = put_compat_statfs64(buf, &tmp);
-	return error;
-}
-
-/*
- * This is a copy of sys_ustat, just dealing with a structure layout.
- * Given how simple this syscall is that apporach is more maintainable
- * than the various conversion hacks.
- */
-COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
-{
-	struct compat_ustat tmp;
-	struct kstatfs sbuf;
-	int err = vfs_ustat(new_decode_dev(dev), &sbuf);
-	if (err)
-		return err;
-
-	memset(&tmp, 0, sizeof(struct compat_ustat));
-	tmp.f_tfree = sbuf.f_bfree;
-	tmp.f_tinode = sbuf.f_ffree;
-	if (copy_to_user(u, &tmp, sizeof(struct compat_ustat)))
-		return -EFAULT;
-	return 0;
-}
-
-static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
-{
-	if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
-	    __get_user(kfl->l_type, &ufl->l_type) ||
-	    __get_user(kfl->l_whence, &ufl->l_whence) ||
-	    __get_user(kfl->l_start, &ufl->l_start) ||
-	    __get_user(kfl->l_len, &ufl->l_len) ||
-	    __get_user(kfl->l_pid, &ufl->l_pid))
-		return -EFAULT;
-	return 0;
-}
-
-static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
-{
-	if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
-	    __put_user(kfl->l_type, &ufl->l_type) ||
-	    __put_user(kfl->l_whence, &ufl->l_whence) ||
-	    __put_user(kfl->l_start, &ufl->l_start) ||
-	    __put_user(kfl->l_len, &ufl->l_len) ||
-	    __put_user(kfl->l_pid, &ufl->l_pid))
-		return -EFAULT;
-	return 0;
-}
-
-#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64
-static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
-{
-	if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
-	    __get_user(kfl->l_type, &ufl->l_type) ||
-	    __get_user(kfl->l_whence, &ufl->l_whence) ||
-	    __get_user(kfl->l_start, &ufl->l_start) ||
-	    __get_user(kfl->l_len, &ufl->l_len) ||
-	    __get_user(kfl->l_pid, &ufl->l_pid))
-		return -EFAULT;
-	return 0;
-}
-#endif
-
-#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64
-static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
-{
-	if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
-	    __put_user(kfl->l_type, &ufl->l_type) ||
-	    __put_user(kfl->l_whence, &ufl->l_whence) ||
-	    __put_user(kfl->l_start, &ufl->l_start) ||
-	    __put_user(kfl->l_len, &ufl->l_len) ||
-	    __put_user(kfl->l_pid, &ufl->l_pid))
-		return -EFAULT;
-	return 0;
-}
-#endif
-
-static unsigned int
-convert_fcntl_cmd(unsigned int cmd)
-{
-	switch (cmd) {
-	case F_GETLK64:
-		return F_GETLK;
-	case F_SETLK64:
-		return F_SETLK;
-	case F_SETLKW64:
-		return F_SETLKW;
-	}
-
-	return cmd;
-}
-
-COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
-		       compat_ulong_t, arg)
-{
-	mm_segment_t old_fs;
-	struct flock f;
-	long ret;
-	unsigned int conv_cmd;
-
-	switch (cmd) {
-	case F_GETLK:
-	case F_SETLK:
-	case F_SETLKW:
-		ret = get_compat_flock(&f, compat_ptr(arg));
-		if (ret != 0)
-			break;
-		old_fs = get_fs();
-		set_fs(KERNEL_DS);
-		ret = sys_fcntl(fd, cmd, (unsigned long)&f);
-		set_fs(old_fs);
-		if (cmd == F_GETLK && ret == 0) {
-			/* GETLK was successful and we need to return the data...
-			 * but it needs to fit in the compat structure.
-			 * l_start shouldn't be too big, unless the original
-			 * start + end is greater than COMPAT_OFF_T_MAX, in which
-			 * case the app was asking for trouble, so we return
-			 * -EOVERFLOW in that case.
-			 * l_len could be too big, in which case we just truncate it,
-			 * and only allow the app to see that part of the conflicting
-			 * lock that might make sense to it anyway
-			 */
-
-			if (f.l_start > COMPAT_OFF_T_MAX)
-				ret = -EOVERFLOW;
-			if (f.l_len > COMPAT_OFF_T_MAX)
-				f.l_len = COMPAT_OFF_T_MAX;
-			if (ret == 0)
-				ret = put_compat_flock(&f, compat_ptr(arg));
-		}
-		break;
-
-	case F_GETLK64:
-	case F_SETLK64:
-	case F_SETLKW64:
-	case F_OFD_GETLK:
-	case F_OFD_SETLK:
-	case F_OFD_SETLKW:
-		ret = get_compat_flock64(&f, compat_ptr(arg));
-		if (ret != 0)
-			break;
-		old_fs = get_fs();
-		set_fs(KERNEL_DS);
-		conv_cmd = convert_fcntl_cmd(cmd);
-		ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
-		set_fs(old_fs);
-		if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
-			/* need to return lock information - see above for commentary */
-			if (f.l_start > COMPAT_LOFF_T_MAX)
-				ret = -EOVERFLOW;
-			if (f.l_len > COMPAT_LOFF_T_MAX)
-				f.l_len = COMPAT_LOFF_T_MAX;
-			if (ret == 0)
-				ret = put_compat_flock64(&f, compat_ptr(arg));
-		}
-		break;
-
-	default:
-		ret = sys_fcntl(fd, cmd, arg);
-		break;
-	}
-	return ret;
-}
-
-COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
-		       compat_ulong_t, arg)
-{
-	switch (cmd) {
-	case F_GETLK64:
-	case F_SETLK64:
-	case F_SETLKW64:
-	case F_OFD_GETLK:
-	case F_OFD_SETLK:
-	case F_OFD_SETLKW:
-		return -EINVAL;
-	}
-	return compat_sys_fcntl64(fd, cmd, arg);
-}
-
-/* A write operation does a read from user space and vice versa */
-#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
-
-ssize_t compat_rw_copy_check_uvector(int type,
-		const struct compat_iovec __user *uvector, unsigned long nr_segs,
-		unsigned long fast_segs, struct iovec *fast_pointer,
-		struct iovec **ret_pointer)
-{
-	compat_ssize_t tot_len;
-	struct iovec *iov = *ret_pointer = fast_pointer;
-	ssize_t ret = 0;
-	int seg;
-
-	/*
-	 * SuS says "The readv() function *may* fail if the iovcnt argument
-	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
-	 * traditionally returned zero for zero segments, so...
-	 */
-	if (nr_segs == 0)
-		goto out;
-
-	ret = -EINVAL;
-	if (nr_segs > UIO_MAXIOV)
-		goto out;
-	if (nr_segs > fast_segs) {
-		ret = -ENOMEM;
-		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
-		if (iov == NULL)
-			goto out;
-	}
-	*ret_pointer = iov;
-
-	ret = -EFAULT;
-	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
-		goto out;
-
-	/*
-	 * Single unix specification:
-	 * We should -EINVAL if an element length is not >= 0 and fitting an
-	 * ssize_t.
-	 *
-	 * In Linux, the total length is limited to MAX_RW_COUNT, there is
-	 * no overflow possibility.
-	 */
-	tot_len = 0;
-	ret = -EINVAL;
-	for (seg = 0; seg < nr_segs; seg++) {
-		compat_uptr_t buf;
-		compat_ssize_t len;
-
-		if (__get_user(len, &uvector->iov_len) ||
-		   __get_user(buf, &uvector->iov_base)) {
-			ret = -EFAULT;
-			goto out;
-		}
-		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
-			goto out;
-		if (type >= 0 &&
-		    !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
-			ret = -EFAULT;
-			goto out;
-		}
-		if (len > MAX_RW_COUNT - tot_len)
-			len = MAX_RW_COUNT - tot_len;
-		tot_len += len;
-		iov->iov_base = compat_ptr(buf);
-		iov->iov_len = (compat_size_t) len;
-		uvector++;
-		iov++;
-	}
-	ret = tot_len;
-
-out:
-	return ret;
-}
-
 struct compat_ncp_mount_data {
 	compat_int_t version;
 	compat_uint_t ncp_fd;
@@ -744,653 +203,3 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
  out:
 	return retval;
 }
-
-struct compat_old_linux_dirent {
-	compat_ulong_t	d_ino;
-	compat_ulong_t	d_offset;
-	unsigned short	d_namlen;
-	char		d_name[1];
-};
-
-struct compat_readdir_callback {
-	struct dir_context ctx;
-	struct compat_old_linux_dirent __user *dirent;
-	int result;
-};
-
-static int compat_fillonedir(struct dir_context *ctx, const char *name,
-			     int namlen, loff_t offset, u64 ino,
-			     unsigned int d_type)
-{
-	struct compat_readdir_callback *buf =
-		container_of(ctx, struct compat_readdir_callback, ctx);
-	struct compat_old_linux_dirent __user *dirent;
-	compat_ulong_t d_ino;
-
-	if (buf->result)
-		return -EINVAL;
-	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
-		buf->result = -EOVERFLOW;
-		return -EOVERFLOW;
-	}
-	buf->result++;
-	dirent = buf->dirent;
-	if (!access_ok(VERIFY_WRITE, dirent,
-			(unsigned long)(dirent->d_name + namlen + 1) -
-				(unsigned long)dirent))
-		goto efault;
-	if (	__put_user(d_ino, &dirent->d_ino) ||
-		__put_user(offset, &dirent->d_offset) ||
-		__put_user(namlen, &dirent->d_namlen) ||
-		__copy_to_user(dirent->d_name, name, namlen) ||
-		__put_user(0, dirent->d_name + namlen))
-		goto efault;
-	return 0;
-efault:
-	buf->result = -EFAULT;
-	return -EFAULT;
-}
-
-COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
-		struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
-{
-	int error;
-	struct fd f = fdget_pos(fd);
-	struct compat_readdir_callback buf = {
-		.ctx.actor = compat_fillonedir,
-		.dirent = dirent
-	};
-
-	if (!f.file)
-		return -EBADF;
-
-	error = iterate_dir(f.file, &buf.ctx);
-	if (buf.result)
-		error = buf.result;
-
-	fdput_pos(f);
-	return error;
-}
-
-struct compat_linux_dirent {
-	compat_ulong_t	d_ino;
-	compat_ulong_t	d_off;
-	unsigned short	d_reclen;
-	char		d_name[1];
-};
-
-struct compat_getdents_callback {
-	struct dir_context ctx;
-	struct compat_linux_dirent __user *current_dir;
-	struct compat_linux_dirent __user *previous;
-	int count;
-	int error;
-};
-
-static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
-		loff_t offset, u64 ino, unsigned int d_type)
-{
-	struct compat_linux_dirent __user * dirent;
-	struct compat_getdents_callback *buf =
-		container_of(ctx, struct compat_getdents_callback, ctx);
-	compat_ulong_t d_ino;
-	int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
-		namlen + 2, sizeof(compat_long_t));
-
-	buf->error = -EINVAL;	/* only used if we fail.. */
-	if (reclen > buf->count)
-		return -EINVAL;
-	d_ino = ino;
-	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
-		buf->error = -EOVERFLOW;
-		return -EOVERFLOW;
-	}
-	dirent = buf->previous;
-	if (dirent) {
-		if (signal_pending(current))
-			return -EINTR;
-		if (__put_user(offset, &dirent->d_off))
-			goto efault;
-	}
-	dirent = buf->current_dir;
-	if (__put_user(d_ino, &dirent->d_ino))
-		goto efault;
-	if (__put_user(reclen, &dirent->d_reclen))
-		goto efault;
-	if (copy_to_user(dirent->d_name, name, namlen))
-		goto efault;
-	if (__put_user(0, dirent->d_name + namlen))
-		goto efault;
-	if (__put_user(d_type, (char  __user *) dirent + reclen - 1))
-		goto efault;
-	buf->previous = dirent;
-	dirent = (void __user *)dirent + reclen;
-	buf->current_dir = dirent;
-	buf->count -= reclen;
-	return 0;
-efault:
-	buf->error = -EFAULT;
-	return -EFAULT;
-}
-
-COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
-		struct compat_linux_dirent __user *, dirent, unsigned int, count)
-{
-	struct fd f;
-	struct compat_linux_dirent __user * lastdirent;
-	struct compat_getdents_callback buf = {
-		.ctx.actor = compat_filldir,
-		.current_dir = dirent,
-		.count = count
-	};
-	int error;
-
-	if (!access_ok(VERIFY_WRITE, dirent, count))
-		return -EFAULT;
-
-	f = fdget_pos(fd);
-	if (!f.file)
-		return -EBADF;
-
-	error = iterate_dir(f.file, &buf.ctx);
-	if (error >= 0)
-		error = buf.error;
-	lastdirent = buf.previous;
-	if (lastdirent) {
-		if (put_user(buf.ctx.pos, &lastdirent->d_off))
-			error = -EFAULT;
-		else
-			error = count - buf.count;
-	}
-	fdput_pos(f);
-	return error;
-}
-
-#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64
-
-struct compat_getdents_callback64 {
-	struct dir_context ctx;
-	struct linux_dirent64 __user *current_dir;
-	struct linux_dirent64 __user *previous;
-	int count;
-	int error;
-};
-
-static int compat_filldir64(struct dir_context *ctx, const char *name,
-			    int namlen, loff_t offset, u64 ino,
-			    unsigned int d_type)
-{
-	struct linux_dirent64 __user *dirent;
-	struct compat_getdents_callback64 *buf =
-		container_of(ctx, struct compat_getdents_callback64, ctx);
-	int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
-		sizeof(u64));
-	u64 off;
-
-	buf->error = -EINVAL;	/* only used if we fail.. */
-	if (reclen > buf->count)
-		return -EINVAL;
-	dirent = buf->previous;
-
-	if (dirent) {
-		if (signal_pending(current))
-			return -EINTR;
-		if (__put_user_unaligned(offset, &dirent->d_off))
-			goto efault;
-	}
-	dirent = buf->current_dir;
-	if (__put_user_unaligned(ino, &dirent->d_ino))
-		goto efault;
-	off = 0;
-	if (__put_user_unaligned(off, &dirent->d_off))
-		goto efault;
-	if (__put_user(reclen, &dirent->d_reclen))
-		goto efault;
-	if (__put_user(d_type, &dirent->d_type))
-		goto efault;
-	if (copy_to_user(dirent->d_name, name, namlen))
-		goto efault;
-	if (__put_user(0, dirent->d_name + namlen))
-		goto efault;
-	buf->previous = dirent;
-	dirent = (void __user *)dirent + reclen;
-	buf->current_dir = dirent;
-	buf->count -= reclen;
-	return 0;
-efault:
-	buf->error = -EFAULT;
-	return -EFAULT;
-}
-
-COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd,
-		struct linux_dirent64 __user *, dirent, unsigned int, count)
-{
-	struct fd f;
-	struct linux_dirent64 __user * lastdirent;
-	struct compat_getdents_callback64 buf = {
-		.ctx.actor = compat_filldir64,
-		.current_dir = dirent,
-		.count = count
-	};
-	int error;
-
-	if (!access_ok(VERIFY_WRITE, dirent, count))
-		return -EFAULT;
-
-	f = fdget_pos(fd);
-	if (!f.file)
-		return -EBADF;
-
-	error = iterate_dir(f.file, &buf.ctx);
-	if (error >= 0)
-		error = buf.error;
-	lastdirent = buf.previous;
-	if (lastdirent) {
-		typeof(lastdirent->d_off) d_off = buf.ctx.pos;
-		if (__put_user_unaligned(d_off, &lastdirent->d_off))
-			error = -EFAULT;
-		else
-			error = count - buf.count;
-	}
-	fdput_pos(f);
-	return error;
-}
-#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */
-
-/*
- * Exactly like fs/open.c:sys_open(), except that it doesn't set the
- * O_LARGEFILE flag.
- */
-COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
-{
-	return do_sys_open(AT_FDCWD, filename, flags, mode);
-}
-
-/*
- * Exactly like fs/open.c:sys_openat(), except that it doesn't set the
- * O_LARGEFILE flag.
- */
-COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
-{
-	return do_sys_open(dfd, filename, flags, mode);
-}
-
-#define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
-
-static int poll_select_copy_remaining(struct timespec *end_time, void __user *p,
-				      int timeval, int ret)
-{
-	struct timespec ts;
-
-	if (!p)
-		return ret;
-
-	if (current->personality & STICKY_TIMEOUTS)
-		goto sticky;
-
-	/* No update for zero timeout */
-	if (!end_time->tv_sec && !end_time->tv_nsec)
-		return ret;
-
-	ktime_get_ts(&ts);
-	ts = timespec_sub(*end_time, ts);
-	if (ts.tv_sec < 0)
-		ts.tv_sec = ts.tv_nsec = 0;
-
-	if (timeval) {
-		struct compat_timeval rtv;
-
-		rtv.tv_sec = ts.tv_sec;
-		rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
-
-		if (!copy_to_user(p, &rtv, sizeof(rtv)))
-			return ret;
-	} else {
-		struct compat_timespec rts;
-
-		rts.tv_sec = ts.tv_sec;
-		rts.tv_nsec = ts.tv_nsec;
-
-		if (!copy_to_user(p, &rts, sizeof(rts)))
-			return ret;
-	}
-	/*
-	 * If an application puts its timeval in read-only memory, we
-	 * don't want the Linux-specific update to the timeval to
-	 * cause a fault after the select has completed
-	 * successfully. However, because we're not updating the
-	 * timeval, we can't restart the system call.
-	 */
-
-sticky:
-	if (ret == -ERESTARTNOHAND)
-		ret = -EINTR;
-	return ret;
-}
-
-/*
- * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
- * 64-bit unsigned longs.
- */
-static
-int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
-			unsigned long *fdset)
-{
-	nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
-	if (ufdset) {
-		unsigned long odd;
-
-		if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t)))
-			return -EFAULT;
-
-		odd = nr & 1UL;
-		nr &= ~1UL;
-		while (nr) {
-			unsigned long h, l;
-			if (__get_user(l, ufdset) || __get_user(h, ufdset+1))
-				return -EFAULT;
-			ufdset += 2;
-			*fdset++ = h << 32 | l;
-			nr -= 2;
-		}
-		if (odd && __get_user(*fdset, ufdset))
-			return -EFAULT;
-	} else {
-		/* Tricky, must clear full unsigned long in the
-		 * kernel fdset at the end, this makes sure that
-		 * actually happens.
-		 */
-		memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t));
-	}
-	return 0;
-}
-
-static
-int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
-		      unsigned long *fdset)
-{
-	unsigned long odd;
-	nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
-
-	if (!ufdset)
-		return 0;
-
-	odd = nr & 1UL;
-	nr &= ~1UL;
-	while (nr) {
-		unsigned long h, l;
-		l = *fdset++;
-		h = l >> 32;
-		if (__put_user(l, ufdset) || __put_user(h, ufdset+1))
-			return -EFAULT;
-		ufdset += 2;
-		nr -= 2;
-	}
-	if (odd && __put_user(*fdset, ufdset))
-		return -EFAULT;
-	return 0;
-}
-
-
-/*
- * This is a virtual copy of sys_select from fs/select.c and probably
- * should be compared to it from time to time
- */
-
-/*
- * We can actually return ERESTARTSYS instead of EINTR, but I'd
- * like to be certain this leads to no problems. So I return
- * EINTR just for safety.
- *
- * Update: ERESTARTSYS breaks at least the xview clock binary, so
- * I'm trying ERESTARTNOHAND which restart only when you want to.
- */
-int compat_core_sys_select(int n, compat_ulong_t __user *inp,
-	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
-	struct timespec *end_time)
-{
-	fd_set_bits fds;
-	void *bits;
-	int size, max_fds, ret = -EINVAL;
-	struct fdtable *fdt;
-	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
-
-	if (n < 0)
-		goto out_nofds;
-
-	/* max_fds can increase, so grab it once to avoid race */
-	rcu_read_lock();
-	fdt = files_fdtable(current->files);
-	max_fds = fdt->max_fds;
-	rcu_read_unlock();
-	if (n > max_fds)
-		n = max_fds;
-
-	/*
-	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
-	 * since we used fdset we need to allocate memory in units of
-	 * long-words.
-	 */
-	size = FDS_BYTES(n);
-	bits = stack_fds;
-	if (size > sizeof(stack_fds) / 6) {
-		bits = kmalloc(6 * size, GFP_KERNEL);
-		ret = -ENOMEM;
-		if (!bits)
-			goto out_nofds;
-	}
-	fds.in      = (unsigned long *)  bits;
-	fds.out     = (unsigned long *) (bits +   size);
-	fds.ex      = (unsigned long *) (bits + 2*size);
-	fds.res_in  = (unsigned long *) (bits + 3*size);
-	fds.res_out = (unsigned long *) (bits + 4*size);
-	fds.res_ex  = (unsigned long *) (bits + 5*size);
-
-	if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
-	    (ret = compat_get_fd_set(n, outp, fds.out)) ||
-	    (ret = compat_get_fd_set(n, exp, fds.ex)))
-		goto out;
-	zero_fd_set(n, fds.res_in);
-	zero_fd_set(n, fds.res_out);
-	zero_fd_set(n, fds.res_ex);
-
-	ret = do_select(n, &fds, end_time);
-
-	if (ret < 0)
-		goto out;
-	if (!ret) {
-		ret = -ERESTARTNOHAND;
-		if (signal_pending(current))
-			goto out;
-		ret = 0;
-	}
-
-	if (compat_set_fd_set(n, inp, fds.res_in) ||
-	    compat_set_fd_set(n, outp, fds.res_out) ||
-	    compat_set_fd_set(n, exp, fds.res_ex))
-		ret = -EFAULT;
-out:
-	if (bits != stack_fds)
-		kfree(bits);
-out_nofds:
-	return ret;
-}
-
-COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
-	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
-	struct compat_timeval __user *, tvp)
-{
-	struct timespec end_time, *to = NULL;
-	struct compat_timeval tv;
-	int ret;
-
-	if (tvp) {
-		if (copy_from_user(&tv, tvp, sizeof(tv)))
-			return -EFAULT;
-
-		to = &end_time;
-		if (poll_select_set_timeout(to,
-				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
-				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
-			return -EINVAL;
-	}
-
-	ret = compat_core_sys_select(n, inp, outp, exp, to);
-	ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);
-
-	return ret;
-}
-
-struct compat_sel_arg_struct {
-	compat_ulong_t n;
-	compat_uptr_t inp;
-	compat_uptr_t outp;
-	compat_uptr_t exp;
-	compat_uptr_t tvp;
-};
-
-COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
-{
-	struct compat_sel_arg_struct a;
-
-	if (copy_from_user(&a, arg, sizeof(a)))
-		return -EFAULT;
-	return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
-				 compat_ptr(a.exp), compat_ptr(a.tvp));
-}
-
-static long do_compat_pselect(int n, compat_ulong_t __user *inp,
-	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
-	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
-	compat_size_t sigsetsize)
-{
-	compat_sigset_t ss32;
-	sigset_t ksigmask, sigsaved;
-	struct compat_timespec ts;
-	struct timespec end_time, *to = NULL;
-	int ret;
-
-	if (tsp) {
-		if (copy_from_user(&ts, tsp, sizeof(ts)))
-			return -EFAULT;
-
-		to = &end_time;
-		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
-			return -EINVAL;
-	}
-
-	if (sigmask) {
-		if (sigsetsize != sizeof(compat_sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
-			return -EFAULT;
-		sigset_from_compat(&ksigmask, &ss32);
-
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
-
-	ret = compat_core_sys_select(n, inp, outp, exp, to);
-	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
-
-	if (ret == -ERESTARTNOHAND) {
-		/*
-		 * Don't restore the signal mask yet. Let do_signal() deliver
-		 * the signal on the way back to userspace, before the signal
-		 * mask is restored.
-		 */
-		if (sigmask) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-					sizeof(sigsaved));
-			set_restore_sigmask();
-		}
-	} else if (sigmask)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
-	return ret;
-}
-
-COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
-	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
-	struct compat_timespec __user *, tsp, void __user *, sig)
-{
-	compat_size_t sigsetsize = 0;
-	compat_uptr_t up = 0;
-
-	if (sig) {
-		if (!access_ok(VERIFY_READ, sig,
-				sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
-		    	__get_user(up, (compat_uptr_t __user *)sig) ||
-		    	__get_user(sigsetsize,
-				(compat_size_t __user *)(sig+sizeof(up))))
-			return -EFAULT;
-	}
-	return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
-				 sigsetsize);
-}
-
-COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
-	unsigned int,  nfds, struct compat_timespec __user *, tsp,
-	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
-{
-	compat_sigset_t ss32;
-	sigset_t ksigmask, sigsaved;
-	struct compat_timespec ts;
-	struct timespec end_time, *to = NULL;
-	int ret;
-
-	if (tsp) {
-		if (copy_from_user(&ts, tsp, sizeof(ts)))
-			return -EFAULT;
-
-		to = &end_time;
-		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
-			return -EINVAL;
-	}
-
-	if (sigmask) {
-		if (sigsetsize != sizeof(compat_sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
-			return -EFAULT;
-		sigset_from_compat(&ksigmask, &ss32);
-
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
-
-	ret = do_sys_poll(ufds, nfds, to);
-
-	/* We can restart this syscall, usually */
-	if (ret == -EINTR) {
-		/*
-		 * Don't restore the signal mask yet. Let do_signal() deliver
-		 * the signal on the way back to userspace, before the signal
-		 * mask is restored.
-		 */
-		if (sigmask) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-				sizeof(sigsaved));
-			set_restore_sigmask();
-		}
-		ret = -ERESTARTNOHAND;
-	} else if (sigmask)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
-	ret = poll_select_copy_remaining(&end_time, tsp, 0, ret);
-
-	return ret;
-}
-
-#ifdef CONFIG_FHANDLE
-/*
- * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
- * doesn't set the O_LARGEFILE flag.
- */
-COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
-			     struct file_handle __user *, handle, int, flags)
-{
-	return do_handle_open(mountdirfd, handle, flags);
-}
-#endif
diff --git a/fs/dax.c b/fs/dax.c
index 6433650be833..43bbd6d1037d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -55,32 +55,6 @@ static int __init init_dax_wait_table(void)
 }
 fs_initcall(init_dax_wait_table);
 
-static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
-{
-	struct request_queue *q = bdev->bd_queue;
-	long rc = -EIO;
-
-	dax->addr = ERR_PTR(-EIO);
-	if (blk_queue_enter(q, true) != 0)
-		return rc;
-
-	rc = bdev_direct_access(bdev, dax);
-	if (rc < 0) {
-		dax->addr = ERR_PTR(rc);
-		blk_queue_exit(q);
-		return rc;
-	}
-	return rc;
-}
-
-static void dax_unmap_atomic(struct block_device *bdev,
-		const struct blk_dax_ctl *dax)
-{
-	if (IS_ERR(dax->addr))
-		return;
-	blk_queue_exit(bdev->bd_queue);
-}
-
 static int dax_is_pmd_entry(void *entry)
 {
 	return (unsigned long)entry & RADIX_DAX_PMD;
@@ -101,26 +75,6 @@ static int dax_is_empty_entry(void *entry)
 	return (unsigned long)entry & RADIX_DAX_EMPTY;
 }
 
-struct page *read_dax_sector(struct block_device *bdev, sector_t n)
-{
-	struct page *page = alloc_pages(GFP_KERNEL, 0);
-	struct blk_dax_ctl dax = {
-		.size = PAGE_SIZE,
-		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
-	};
-	long rc;
-
-	if (!page)
-		return ERR_PTR(-ENOMEM);
-
-	rc = dax_map_atomic(bdev, &dax);
-	if (rc < 0)
-		return ERR_PTR(rc);
-	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
-	dax_unmap_atomic(bdev, &dax);
-	return page;
-}
-
 /*
  * DAX radix tree locking
  */
@@ -582,21 +536,30 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
 	return ret;
 }
 
-static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
-		struct page *to, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
+		sector_t sector, size_t size, struct page *to,
+		unsigned long vaddr)
 {
-	struct blk_dax_ctl dax = {
-		.sector = sector,
-		.size = size,
-	};
-	void *vto;
-
-	if (dax_map_atomic(bdev, &dax) < 0)
-		return PTR_ERR(dax.addr);
+	void *vto, *kaddr;
+	pgoff_t pgoff;
+	pfn_t pfn;
+	long rc;
+	int id;
+
+	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	if (rc)
+		return rc;
+
+	id = dax_read_lock();
+	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+	if (rc < 0) {
+		dax_read_unlock(id);
+		return rc;
+	}
 	vto = kmap_atomic(to);
-	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
+	copy_user_page(vto, (void __force *)kaddr, vaddr, to);
 	kunmap_atomic(vto);
-	dax_unmap_atomic(bdev, &dax);
+	dax_read_unlock(id);
 	return 0;
 }
 
@@ -764,12 +727,16 @@ unlock_pte:
 }
 
 static int dax_writeback_one(struct block_device *bdev,
-		struct address_space *mapping, pgoff_t index, void *entry)
+		struct dax_device *dax_dev, struct address_space *mapping,
+		pgoff_t index, void *entry)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	struct blk_dax_ctl dax;
-	void *entry2, **slot;
-	int ret = 0;
+	void *entry2, **slot, *kaddr;
+	long ret = 0, id;
+	sector_t sector;
+	pgoff_t pgoff;
+	size_t size;
+	pfn_t pfn;
 
 	/*
 	 * A page got tagged dirty in DAX mapping? Something is seriously
@@ -818,26 +785,29 @@ static int dax_writeback_one(struct block_device *bdev,
 	 * 'entry'.  This allows us to flush for PMD_SIZE and not have to
 	 * worry about partial PMD writebacks.
 	 */
-	dax.sector = dax_radix_sector(entry);
-	dax.size = PAGE_SIZE << dax_radix_order(entry);
+	sector = dax_radix_sector(entry);
+	size = PAGE_SIZE << dax_radix_order(entry);
+
+	id = dax_read_lock();
+	ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	if (ret)
+		goto dax_unlock;
 
 	/*
-	 * We cannot hold tree_lock while calling dax_map_atomic() because it
-	 * eventually calls cond_resched().
+	 * dax_direct_access() may sleep, so cannot hold tree_lock over
+	 * its invocation.
 	 */
-	ret = dax_map_atomic(bdev, &dax);
-	if (ret < 0) {
-		put_locked_mapping_entry(mapping, index, entry);
-		return ret;
-	}
+	ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
+	if (ret < 0)
+		goto dax_unlock;
 
-	if (WARN_ON_ONCE(ret < dax.size)) {
+	if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
 		ret = -EIO;
-		goto unmap;
+		goto dax_unlock;
 	}
 
-	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
-	wb_cache_pmem(dax.addr, dax.size);
+	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
+	wb_cache_pmem(kaddr, size);
 	/*
 	 * After we have flushed the cache, we can clear the dirty tag. There
 	 * cannot be new dirty data in the pfn after the flush has completed as
@@ -847,8 +817,8 @@ static int dax_writeback_one(struct block_device *bdev,
 	spin_lock_irq(&mapping->tree_lock);
 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
 	spin_unlock_irq(&mapping->tree_lock);
- unmap:
-	dax_unmap_atomic(bdev, &dax);
+ dax_unlock:
+	dax_read_unlock(id);
 	put_locked_mapping_entry(mapping, index, entry);
 	return ret;
 
@@ -869,6 +839,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	pgoff_t start_index, end_index;
 	pgoff_t indices[PAGEVEC_SIZE];
+	struct dax_device *dax_dev;
 	struct pagevec pvec;
 	bool done = false;
 	int i, ret = 0;
@@ -879,6 +850,10 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
 		return 0;
 
+	dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	if (!dax_dev)
+		return -EIO;
+
 	start_index = wbc->range_start >> PAGE_SHIFT;
 	end_index = wbc->range_end >> PAGE_SHIFT;
 
@@ -899,38 +874,49 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 				break;
 			}
 
-			ret = dax_writeback_one(bdev, mapping, indices[i],
-					pvec.pages[i]);
-			if (ret < 0)
+			ret = dax_writeback_one(bdev, dax_dev, mapping,
+					indices[i], pvec.pages[i]);
+			if (ret < 0) {
+				put_dax(dax_dev);
 				return ret;
+			}
 		}
 	}
+	put_dax(dax_dev);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
 static int dax_insert_mapping(struct address_space *mapping,
-		struct block_device *bdev, sector_t sector, size_t size,
-		void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
+		struct block_device *bdev, struct dax_device *dax_dev,
+		sector_t sector, size_t size, void **entryp,
+		struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = vmf->address;
-	struct blk_dax_ctl dax = {
-		.sector = sector,
-		.size = size,
-	};
-	void *ret;
 	void *entry = *entryp;
+	void *ret, *kaddr;
+	pgoff_t pgoff;
+	int id, rc;
+	pfn_t pfn;
 
-	if (dax_map_atomic(bdev, &dax) < 0)
-		return PTR_ERR(dax.addr);
-	dax_unmap_atomic(bdev, &dax);
+	rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+	if (rc)
+		return rc;
 
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
+	id = dax_read_lock();
+	rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+	if (rc < 0) {
+		dax_read_unlock(id);
+		return rc;
+	}
+	dax_read_unlock(id);
+
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
 	if (IS_ERR(ret))
 		return PTR_ERR(ret);
 	*entryp = ret;
 
-	return vm_insert_mixed(vma, vaddr, dax.pfn);
+	return vm_insert_mixed(vma, vaddr, pfn);
 }
 
 /**
@@ -979,24 +965,34 @@ static bool dax_range_is_aligned(struct block_device *bdev,
 	return true;
 }
 
-int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
-		unsigned int offset, unsigned int length)
+int __dax_zero_page_range(struct block_device *bdev,
+		struct dax_device *dax_dev, sector_t sector,
+		unsigned int offset, unsigned int size)
 {
-	struct blk_dax_ctl dax = {
-		.sector		= sector,
-		.size		= PAGE_SIZE,
-	};
-
-	if (dax_range_is_aligned(bdev, offset, length)) {
-		sector_t start_sector = dax.sector + (offset >> 9);
+	if (dax_range_is_aligned(bdev, offset, size)) {
+		sector_t start_sector = sector + (offset >> 9);
 
 		return blkdev_issue_zeroout(bdev, start_sector,
-				length >> 9, GFP_NOFS, 0);
+				size >> 9, GFP_NOFS, 0);
 	} else {
-		if (dax_map_atomic(bdev, &dax) < 0)
-			return PTR_ERR(dax.addr);
-		clear_pmem(dax.addr + offset, length);
-		dax_unmap_atomic(bdev, &dax);
+		pgoff_t pgoff;
+		long rc, id;
+		void *kaddr;
+		pfn_t pfn;
+
+		/* offset/size are bytes inside the single page at sector */
+		rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff);
+		if (rc)
+			return rc;
+
+		id = dax_read_lock();
+		rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
+		if (rc < 0) {
+			dax_read_unlock(id);
+			return rc;
+		}
+		clear_pmem(kaddr + offset, size);
+		dax_read_unlock(id);
 	}
 	return 0;
 }
@@ -1011,9 +1007,12 @@ static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
 {
+	struct block_device *bdev = iomap->bdev;
+	struct dax_device *dax_dev = iomap->dax_dev;
 	struct iov_iter *iter = data;
 	loff_t end = pos + length, done = 0;
 	ssize_t ret = 0;
+	int id;
 
 	if (iov_iter_rw(iter) == READ) {
 		end = min(end, i_size_read(inode));
@@ -1038,34 +1037,42 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 					      (end - 1) >> PAGE_SHIFT);
 	}
 
+	id = dax_read_lock();
 	while (pos < end) {
 		unsigned offset = pos & (PAGE_SIZE - 1);
-		struct blk_dax_ctl dax = { 0 };
+		const size_t size = ALIGN(length + offset, PAGE_SIZE);
+		const sector_t sector = dax_iomap_sector(iomap, pos);
 		ssize_t map_len;
+		pgoff_t pgoff;
+		void *kaddr;
+		pfn_t pfn;
 
 		if (fatal_signal_pending(current)) {
 			ret = -EINTR;
 			break;
 		}
 
-		dax.sector = dax_iomap_sector(iomap, pos);
-		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
-		map_len = dax_map_atomic(iomap->bdev, &dax);
+		ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+		if (ret)
+			break;
+
+		map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
+				&kaddr, &pfn);
 		if (map_len < 0) {
 			ret = map_len;
 			break;
 		}
 
-		dax.addr += offset;
+		map_len = PFN_PHYS(map_len);
+		kaddr += offset;
 		map_len -= offset;
 		if (map_len > end - pos)
 			map_len = end - pos;
 
 		if (iov_iter_rw(iter) == WRITE)
-			map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+			map_len = copy_from_iter_pmem(kaddr, map_len, iter);
 		else
-			map_len = copy_to_iter(dax.addr, map_len, iter);
-		dax_unmap_atomic(iomap->bdev, &dax);
+			map_len = copy_to_iter(kaddr, map_len, iter);
 		if (map_len <= 0) {
 			ret = map_len ? map_len : -EFAULT;
 			break;
@@ -1075,6 +1082,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		length -= map_len;
 		done += map_len;
 	}
+	dax_read_unlock(id);
 
 	return done ? done : ret;
 }
@@ -1181,8 +1189,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 			clear_user_highpage(vmf->cow_page, vaddr);
 			break;
 		case IOMAP_MAPPED:
-			error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
-					vmf->cow_page, vaddr);
+			error = copy_user_dax(iomap.bdev, iomap.dax_dev,
+					sector, PAGE_SIZE, vmf->cow_page, vaddr);
 			break;
 		default:
 			WARN_ON_ONCE(1);
@@ -1207,8 +1215,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
 			mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
 			major = VM_FAULT_MAJOR;
 		}
-		error = dax_insert_mapping(mapping, iomap.bdev, sector,
-				PAGE_SIZE, &entry, vmf->vma, vmf);
+		error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
+				sector, PAGE_SIZE, &entry, vmf->vma, vmf);
 		/* -EBUSY is fine, somebody else faulted on the same PTE */
 		if (error == -EBUSY)
 			error = 0;
@@ -1258,41 +1266,48 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
 		loff_t pos, void **entryp)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	const sector_t sector = dax_iomap_sector(iomap, pos);
+	struct dax_device *dax_dev = iomap->dax_dev;
 	struct block_device *bdev = iomap->bdev;
 	struct inode *inode = mapping->host;
-	struct blk_dax_ctl dax = {
-		.sector = dax_iomap_sector(iomap, pos),
-		.size = PMD_SIZE,
-	};
-	long length = dax_map_atomic(bdev, &dax);
-	void *ret = NULL;
-
-	if (length < 0) /* dax_map_atomic() failed */
+	const size_t size = PMD_SIZE;
+	void *ret = NULL, *kaddr;
+	long length = 0;
+	pgoff_t pgoff;
+	pfn_t pfn = { 0 };	/* read by the fallback tracepoint */
+	int id;
+
+	if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
 		goto fallback;
-	if (length < PMD_SIZE)
-		goto unmap_fallback;
-	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
-		goto unmap_fallback;
-	if (!pfn_t_devmap(dax.pfn))
-		goto unmap_fallback;
-
-	dax_unmap_atomic(bdev, &dax);
 
-	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
+	id = dax_read_lock();
+	length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+	if (length < 0)
+		goto unlock_fallback;
+	length = PFN_PHYS(length);
+
+	if (length < size)
+		goto unlock_fallback;
+	if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
+		goto unlock_fallback;
+	if (!pfn_t_devmap(pfn))
+		goto unlock_fallback;
+	dax_read_unlock(id);
+
+	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
 			RADIX_DAX_PMD);
 	if (IS_ERR(ret))
 		goto fallback;
 	*entryp = ret;
 
-	trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
+	trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
 	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-			dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
+			pfn, vmf->flags & FAULT_FLAG_WRITE);
 
- unmap_fallback:
-	dax_unmap_atomic(bdev, &dax);
+unlock_fallback:
+	dax_read_unlock(id);
 fallback:
-	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
-			dax.pfn, ret);
+	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
 	return VM_FAULT_FALLBACK;
 }
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 341251421ced..5420767c9b68 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -42,6 +42,7 @@
 #include <linux/seq_file.h>
 #include <linux/compat.h>
 #include <linux/rculist.h>
+#include <net/busy_poll.h>
 
 /*
  * LOCKING:
@@ -224,6 +225,11 @@ struct eventpoll {
 	/* used to optimize loop detection check */
 	int visited;
 	struct list_head visited_list_link;
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	/* used to track busy poll napi_id */
+	unsigned int napi_id;
+#endif
 };
 
 /* Wait structure used by the poll hooks */
@@ -384,6 +390,77 @@ static inline int ep_events_available(struct eventpoll *ep)
 	return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
 }
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static bool ep_busy_loop_end(void *p, unsigned long start_time)
+{
+	struct eventpoll *ep = p;
+
+	return ep_events_available(ep) || busy_loop_timeout(start_time);
+}
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+/*
+ * Busy poll if globally on and supporting sockets found && no events,
+ * busy loop will return if need_resched or ep_events_available.
+ *
+ * we must do our busy polling with irqs enabled
+ */
+static void ep_busy_loop(struct eventpoll *ep, int nonblock)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	unsigned int napi_id = READ_ONCE(ep->napi_id);
+
+	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
+		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep);
+#endif
+}
+
+static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (ep->napi_id)
+		ep->napi_id = 0;
+#endif
+}
+
+/*
+ * Set epoll busy poll NAPI ID from sk.
+ */
+static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
+{
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	struct eventpoll *ep;
+	unsigned int napi_id;
+	struct socket *sock;
+	struct sock *sk;
+	int err;
+
+	if (!net_busy_loop_on())
+		return;
+
+	sock = sock_from_file(epi->ffd.file, &err);
+	if (!sock)
+		return;
+
+	sk = sock->sk;
+	if (!sk)
+		return;
+
+	napi_id = READ_ONCE(sk->sk_napi_id);
+	ep = epi->ep;
+
+	/* Non-NAPI IDs can be rejected
+	 *	or
+	 * Nothing to do if we already have this ID
+	 */
+	if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id)
+		return;
+
+	/* record NAPI ID for use in next busy poll */
+	ep->napi_id = napi_id;
+#endif
+}
+
 /**
  * ep_call_nested - Perform a bound (possibly) nested call, by checking
  *                  that the recursion limit is not exceeded, and that
@@ -1022,6 +1099,8 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 
 	spin_lock_irqsave(&ep->lock, flags);
 
+	ep_set_busy_poll_napi_id(epi);
+
 	/*
 	 * If the event mask does not contain any poll(2) event, we consider the
 	 * descriptor to be disabled. This condition is likely the effect of the
@@ -1363,6 +1442,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	/* We have to drop the new item inside our item list to keep track of it */
 	spin_lock_irqsave(&ep->lock, flags);
 
+	/* record NAPI ID of new item if present */
+	ep_set_busy_poll_napi_id(epi);
+
 	/* If the file is already "ready" we drop it inside the ready list */
 	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
 		list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1637,10 +1719,21 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 	}
 
 fetch_events:
+
+	if (!ep_events_available(ep))
+		ep_busy_loop(ep, timed_out);
+
 	spin_lock_irqsave(&ep->lock, flags);
 
 	if (!ep_events_available(ep)) {
 		/*
+		 * Busy poll timed out.  Drop NAPI ID for now, we can add
+		 * it back in when we have moved a socket with a valid NAPI
+		 * ID onto the ready list.
+		 */
+		ep_reset_busy_poll_napi_id(ep);
+
+		/*
 		 * We don't have any available event to return to the caller.
 		 * We need to sleep here, and we will be wake up by
 		 * ep_poll_callback() when events will become available.
diff --git a/fs/exec.c b/fs/exec.c
index 65145a3df065..72934df68471 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1320,6 +1320,7 @@ void setup_new_exec(struct linux_binprm * bprm)
 	else
 		set_dumpable(current->mm, suid_dumpable);
 
+	arch_setup_new_exec();
 	perf_event_exec();
 	__set_task_comm(current, kbasename(bprm->filename), true);
 
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 5e64de9c5093..03f5ce1d3dbe 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -779,7 +779,6 @@ extern void ext2_evict_inode(struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
-extern void ext2_get_inode_flags(struct ext2_inode_info *);
 extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len);
 
@@ -796,7 +795,8 @@ void ext2_error(struct super_block *, const char *, const char *, ...);
 extern __printf(3, 4)
 void ext2_msg(struct super_block *, const char *, const char *, ...);
 extern void ext2_update_dynamic_rev (struct super_block *sb);
-extern void ext2_write_super (struct super_block *);
+extern void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
+			    int wait);
 
 /*
  * Inodes and files operations
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 128cce540645..26d77f9f8c12 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -799,6 +799,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
 static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		unsigned flags, struct iomap *iomap)
 {
+	struct block_device *bdev;
 	unsigned int blkbits = inode->i_blkbits;
 	unsigned long first_block = offset >> blkbits;
 	unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits;
@@ -812,8 +813,13 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 		return ret;
 
 	iomap->flags = 0;
-	iomap->bdev = inode->i_sb->s_bdev;
+	bdev = inode->i_sb->s_bdev;
+	iomap->bdev = bdev;
 	iomap->offset = (u64)first_block << blkbits;
+	if (blk_queue_dax(bdev->bd_queue))
+		iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	else
+		iomap->dax_dev = NULL;
 
 	if (ret == 0) {
 		iomap->type = IOMAP_HOLE;
@@ -835,6 +841,7 @@ static int
 ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 		ssize_t written, unsigned flags, struct iomap *iomap)
 {
+	put_dax(iomap->dax_dev);
 	if (iomap->type == IOMAP_MAPPED &&
 	    written < length &&
 	    (flags & IOMAP_WRITE))
@@ -1384,25 +1391,6 @@ void ext2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DAX;
 }
 
-/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
-void ext2_get_inode_flags(struct ext2_inode_info *ei)
-{
-	unsigned int flags = ei->vfs_inode.i_flags;
-
-	ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL|
-			EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL);
-	if (flags & S_SYNC)
-		ei->i_flags |= EXT2_SYNC_FL;
-	if (flags & S_APPEND)
-		ei->i_flags |= EXT2_APPEND_FL;
-	if (flags & S_IMMUTABLE)
-		ei->i_flags |= EXT2_IMMUTABLE_FL;
-	if (flags & S_NOATIME)
-		ei->i_flags |= EXT2_NOATIME_FL;
-	if (flags & S_DIRSYNC)
-		ei->i_flags |= EXT2_DIRSYNC_FL;
-}
-
 struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 {
 	struct ext2_inode_info *ei;
@@ -1563,7 +1551,6 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
 	if (ei->i_state & EXT2_STATE_NEW)
 		memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size);
 
-	ext2_get_inode_flags(ei);
 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
 	if (!(test_opt(sb, NO_UID32))) {
 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid));
@@ -1615,7 +1602,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
 				EXT2_SET_RO_COMPAT_FEATURE(sb,
 					EXT2_FEATURE_RO_COMPAT_LARGE_FILE);
 				spin_unlock(&EXT2_SB(sb)->s_lock);
-				ext2_write_super(sb);
+				ext2_sync_super(sb, EXT2_SB(sb)->s_es, 1);
 			}
 		}
 	}
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 191e02b28ce8..087f122cca42 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -29,7 +29,6 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 	switch (cmd) {
 	case EXT2_IOC_GETFLAGS:
-		ext2_get_inode_flags(ei);
 		flags = ei->i_flags & EXT2_FL_USER_VISIBLE;
 		return put_user(flags, (int __user *) arg);
 	case EXT2_IOC_SETFLAGS: {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 9e25a71fe1a2..8ac673c71a36 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -36,8 +36,7 @@
 #include "xattr.h"
 #include "acl.h"
 
-static void ext2_sync_super(struct super_block *sb,
-			    struct ext2_super_block *es, int wait);
+static void ext2_write_super(struct super_block *sb);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext2_sync_fs(struct super_block *sb, int wait);
@@ -123,13 +122,29 @@ void ext2_update_dynamic_rev(struct super_block *sb)
 	 */
 }
 
+#ifdef CONFIG_QUOTA
+static int ext2_quota_off(struct super_block *sb, int type);
+
+static void ext2_quota_off_umount(struct super_block *sb)
+{
+	int type;
+
+	for (type = 0; type < MAXQUOTAS; type++)
+		ext2_quota_off(sb, type);
+}
+#else
+static inline void ext2_quota_off_umount(struct super_block *sb)
+{
+}
+#endif
+
 static void ext2_put_super (struct super_block * sb)
 {
 	int db_count;
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
-	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+	ext2_quota_off_umount(sb);
 
 	if (sbi->s_mb_cache) {
 		ext2_xattr_destroy_cache(sbi->s_mb_cache);
@@ -314,10 +329,23 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
 #ifdef CONFIG_QUOTA
 static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off);
 static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
+static int ext2_quota_on(struct super_block *sb, int type, int format_id,
+			 const struct path *path);
 static struct dquot **ext2_get_dquots(struct inode *inode)
 {
 	return EXT2_I(inode)->i_dquot;
 }
+
+static const struct quotactl_ops ext2_quotactl_ops = {
+	.quota_on	= ext2_quota_on,
+	.quota_off	= ext2_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_state	= dquot_get_state,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk,
+	.get_nextdqblk	= dquot_get_next_dqblk,
+};
 #endif
 
 static const struct super_operations ext2_sops = {
@@ -1117,7 +1145,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 
 #ifdef CONFIG_QUOTA
 	sb->dq_op = &dquot_operations;
-	sb->s_qcop = &dquot_quotactl_ops;
+	sb->s_qcop = &ext2_quotactl_ops;
 	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
 #endif
 
@@ -1194,8 +1222,8 @@ static void ext2_clear_super_error(struct super_block *sb)
 	}
 }
 
-static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
-			    int wait)
+void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
+		     int wait)
 {
 	ext2_clear_super_error(sb);
 	spin_lock(&EXT2_SB(sb)->s_lock);
@@ -1270,7 +1298,7 @@ static int ext2_unfreeze(struct super_block *sb)
 	return 0;
 }
 
-void ext2_write_super(struct super_block *sb)
+static void ext2_write_super(struct super_block *sb)
 {
 	if (!(sb->s_flags & MS_RDONLY))
 		ext2_sync_fs(sb, 1);
@@ -1548,6 +1576,51 @@ out:
 	return len - towrite;
 }
 
+static int ext2_quota_on(struct super_block *sb, int type, int format_id,
+			 const struct path *path)
+{
+	int err;
+	struct inode *inode;
+
+	err = dquot_quota_on(sb, type, format_id, path);
+	if (err)
+		return err;
+
+	inode = d_inode(path->dentry);
+	inode_lock(inode);
+	EXT2_I(inode)->i_flags |= EXT2_NOATIME_FL | EXT2_IMMUTABLE_FL;
+	inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
+			S_NOATIME | S_IMMUTABLE);
+	inode_unlock(inode);
+	mark_inode_dirty(inode);
+
+	return 0;
+}
+
+static int ext2_quota_off(struct super_block *sb, int type)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	int err;
+
+	if (!inode || !igrab(inode))
+		goto out;
+
+	err = dquot_quota_off(sb, type);
+	if (err)
+		goto out_put;
+
+	inode_lock(inode);
+	EXT2_I(inode)->i_flags &= ~(EXT2_NOATIME_FL | EXT2_IMMUTABLE_FL);
+	inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
+	inode_unlock(inode);
+	mark_inode_dirty(inode);
+out_put:
+	iput(inode);
+	return err;
+out:
+	return dquot_quota_off(sb, type);
+}
+
 #endif
 
 static struct file_system_type ext2_fs_type = {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fb69ee2388db..f7b465b4fb69 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2477,7 +2477,6 @@ extern int ext4_truncate(struct inode *);
 extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
-extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b9ffa9f4191f..f0729b0705c7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3305,6 +3305,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
 static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 			    unsigned flags, struct iomap *iomap)
 {
+	struct block_device *bdev;
 	unsigned int blkbits = inode->i_blkbits;
 	unsigned long first_block = offset >> blkbits;
 	unsigned long last_block = (offset + length - 1) >> blkbits;
@@ -3373,7 +3374,12 @@ retry:
 	}
 
 	iomap->flags = 0;
-	iomap->bdev = inode->i_sb->s_bdev;
+	bdev = inode->i_sb->s_bdev;
+	iomap->bdev = bdev;
+	if (blk_queue_dax(bdev->bd_queue))
+		iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	else
+		iomap->dax_dev = NULL;
 	iomap->offset = first_block << blkbits;
 
 	if (ret == 0) {
@@ -3406,6 +3412,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 	int blkbits = inode->i_blkbits;
 	bool truncate = false;
 
+	put_dax(iomap->dax_dev);
 	if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT))
 		return 0;
 
@@ -4502,31 +4509,6 @@ void ext4_set_inode_flags(struct inode *inode)
 			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
 }
 
-/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
-void ext4_get_inode_flags(struct ext4_inode_info *ei)
-{
-	unsigned int vfs_fl;
-	unsigned long old_fl, new_fl;
-
-	do {
-		vfs_fl = ei->vfs_inode.i_flags;
-		old_fl = ei->i_flags;
-		new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
-				EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
-				EXT4_DIRSYNC_FL);
-		if (vfs_fl & S_SYNC)
-			new_fl |= EXT4_SYNC_FL;
-		if (vfs_fl & S_APPEND)
-			new_fl |= EXT4_APPEND_FL;
-		if (vfs_fl & S_IMMUTABLE)
-			new_fl |= EXT4_IMMUTABLE_FL;
-		if (vfs_fl & S_NOATIME)
-			new_fl |= EXT4_NOATIME_FL;
-		if (vfs_fl & S_DIRSYNC)
-			new_fl |= EXT4_DIRSYNC_FL;
-	} while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
-}
-
 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
 				  struct ext4_inode_info *ei)
 {
@@ -4963,7 +4945,6 @@ static int ext4_do_update_inode(handle_t *handle,
 	if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
 		memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
 
-	ext4_get_inode_flags(ei);
 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
 	i_uid = i_uid_read(inode);
 	i_gid = i_gid_read(inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a4273ddb9922..184e74eb3004 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -500,7 +500,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 	switch (cmd) {
 	case EXT4_IOC_GETFLAGS:
-		ext4_get_inode_flags(ei);
 		flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
 		return put_user(flags, (int __user *) arg);
 	case EXT4_IOC_SETFLAGS: {
@@ -888,7 +887,6 @@ resizefs_out:
 		struct fsxattr fa;
 
 		memset(&fa, 0, sizeof(struct fsxattr));
-		ext4_get_inode_flags(ei);
 		fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);
 
 		if (ext4_has_feature_project(inode->i_sb)) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a9448db1cf7e..a9c72e39a4ee 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -839,6 +839,28 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
 	}
 }
 
+#ifdef CONFIG_QUOTA
+static int ext4_quota_off(struct super_block *sb, int type);
+
+static inline void ext4_quota_off_umount(struct super_block *sb)
+{
+	int type;
+
+	if (ext4_has_feature_quota(sb)) {
+		dquot_disable(sb, -1,
+			      DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+	} else {
+		/* Use our quota_off function to clear inode flags etc. */
+		for (type = 0; type < EXT4_MAXQUOTAS; type++)
+			ext4_quota_off(sb, type);
+	}
+}
+#else
+static inline void ext4_quota_off_umount(struct super_block *sb)
+{
+}
+#endif
+
 static void ext4_put_super(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -847,7 +869,7 @@ static void ext4_put_super(struct super_block *sb)
 	int i, err;
 
 	ext4_unregister_li_request(sb);
-	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+	ext4_quota_off_umount(sb);
 
 	flush_workqueue(sbi->rsv_conversion_wq);
 	destroy_workqueue(sbi->rsv_conversion_wq);
@@ -1218,7 +1240,6 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
 static int ext4_write_info(struct super_block *sb, int type);
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 			 const struct path *path);
-static int ext4_quota_off(struct super_block *sb, int type);
 static int ext4_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 			       size_t len, loff_t off);
@@ -5344,11 +5365,33 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 		if (err)
 			return err;
 	}
+
 	lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA);
 	err = dquot_quota_on(sb, type, format_id, path);
-	if (err)
+	if (err) {
 		lockdep_set_quota_inode(path->dentry->d_inode,
 					     I_DATA_SEM_NORMAL);
+	} else {
+		struct inode *inode = d_inode(path->dentry);
+		handle_t *handle;
+
+		/*
+		 * Set inode flags to prevent userspace from messing with quota
+		 * files. If this fails, we return success anyway since quotas
+		 * are already enabled and this is not a hard failure.
+		 */
+		inode_lock(inode);
+		handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
+		if (IS_ERR(handle))
+			goto unlock_inode;
+		EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL;
+		inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
+				S_NOATIME | S_IMMUTABLE);
+		ext4_mark_inode_dirty(handle, inode);
+		ext4_journal_stop(handle);
+	unlock_inode:
+		inode_unlock(inode);
+	}
 	return err;
 }
 
@@ -5422,24 +5465,39 @@ static int ext4_quota_off(struct super_block *sb, int type)
 {
 	struct inode *inode = sb_dqopt(sb)->files[type];
 	handle_t *handle;
+	int err;
 
 	/* Force all delayed allocation blocks to be allocated.
 	 * Caller already holds s_umount sem */
 	if (test_opt(sb, DELALLOC))
 		sync_filesystem(sb);
 
-	if (!inode)
+	if (!inode || !igrab(inode))
 		goto out;
 
-	/* Update modification times of quota files when userspace can
-	 * start looking at them */
+	err = dquot_quota_off(sb, type);
+	if (err)
+		goto out_put;
+
+	inode_lock(inode);
+	/*
+	 * Update modification times of quota files when userspace can
+	 * start looking at them. If we fail, we return success anyway since
+	 * this is not a hard failure and quotas are already disabled.
+	 */
 	handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1);
 	if (IS_ERR(handle))
-		goto out;
+		goto out_unlock;
+	EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL);
+	inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
 	inode->i_mtime = inode->i_ctime = current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 	ext4_journal_stop(handle);
-
+out_unlock:
+	inode_unlock(inode);
+out_put:
+	iput(inode);
+	return err;
 out:
 	return dquot_quota_off(sb, type);
 }
diff --git a/fs/fcntl.c b/fs/fcntl.c
index be8fbe289087..8bd81c2e89b2 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -23,6 +23,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
 #include <linux/shmem_fs.h>
+#include <linux/compat.h>
 
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -420,6 +421,162 @@ out:
 }
 #endif
 
+#ifdef CONFIG_COMPAT
+static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
+{
+	if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
+	    __get_user(kfl->l_type, &ufl->l_type) ||
+	    __get_user(kfl->l_whence, &ufl->l_whence) ||
+	    __get_user(kfl->l_start, &ufl->l_start) ||
+	    __get_user(kfl->l_len, &ufl->l_len) ||
+	    __get_user(kfl->l_pid, &ufl->l_pid))
+		return -EFAULT;
+	return 0;
+}
+
+static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl)
+{
+	if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
+	    __put_user(kfl->l_type, &ufl->l_type) ||
+	    __put_user(kfl->l_whence, &ufl->l_whence) ||
+	    __put_user(kfl->l_start, &ufl->l_start) ||
+	    __put_user(kfl->l_len, &ufl->l_len) ||
+	    __put_user(kfl->l_pid, &ufl->l_pid))
+		return -EFAULT;
+	return 0;
+}
+
+#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64
+static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
+{
+	if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) ||
+	    __get_user(kfl->l_type, &ufl->l_type) ||
+	    __get_user(kfl->l_whence, &ufl->l_whence) ||
+	    __get_user(kfl->l_start, &ufl->l_start) ||
+	    __get_user(kfl->l_len, &ufl->l_len) ||
+	    __get_user(kfl->l_pid, &ufl->l_pid))
+		return -EFAULT;
+	return 0;
+}
+#endif
+
+#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64
+static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl)
+{
+	if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) ||
+	    __put_user(kfl->l_type, &ufl->l_type) ||
+	    __put_user(kfl->l_whence, &ufl->l_whence) ||
+	    __put_user(kfl->l_start, &ufl->l_start) ||
+	    __put_user(kfl->l_len, &ufl->l_len) ||
+	    __put_user(kfl->l_pid, &ufl->l_pid))
+		return -EFAULT;
+	return 0;
+}
+#endif
+
+static unsigned int
+convert_fcntl_cmd(unsigned int cmd)
+{
+	switch (cmd) {
+	case F_GETLK64:
+		return F_GETLK;
+	case F_SETLK64:
+		return F_SETLK;
+	case F_SETLKW64:
+		return F_SETLKW;
+	}
+
+	return cmd;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg)
+{
+	mm_segment_t old_fs;
+	struct flock f;
+	long ret;
+	unsigned int conv_cmd;
+
+	switch (cmd) {
+	case F_GETLK:
+	case F_SETLK:
+	case F_SETLKW:
+		ret = get_compat_flock(&f, compat_ptr(arg));
+		if (ret != 0)
+			break;
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		ret = sys_fcntl(fd, cmd, (unsigned long)&f);
+		set_fs(old_fs);
+		if (cmd == F_GETLK && ret == 0) {
+			/* GETLK was successful and we need to return the data...
+			 * but it needs to fit in the compat structure.
+			 * l_start shouldn't be too big, unless the original
+			 * start + end is greater than COMPAT_OFF_T_MAX, in which
+			 * case the app was asking for trouble, so we return
+			 * -EOVERFLOW in that case.
+			 * l_len could be too big, in which case we just truncate it,
+			 * and only allow the app to see that part of the conflicting
+			 * lock that might make sense to it anyway
+			 */
+
+			if (f.l_start > COMPAT_OFF_T_MAX)
+				ret = -EOVERFLOW;
+			if (f.l_len > COMPAT_OFF_T_MAX)
+				f.l_len = COMPAT_OFF_T_MAX;
+			if (ret == 0)
+				ret = put_compat_flock(&f, compat_ptr(arg));
+		}
+		break;
+
+	case F_GETLK64:
+	case F_SETLK64:
+	case F_SETLKW64:
+	case F_OFD_GETLK:
+	case F_OFD_SETLK:
+	case F_OFD_SETLKW:
+		ret = get_compat_flock64(&f, compat_ptr(arg));
+		if (ret != 0)
+			break;
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		conv_cmd = convert_fcntl_cmd(cmd);
+		ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f);
+		set_fs(old_fs);
+		if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) {
+			/* need to return lock information - see above for commentary */
+			if (f.l_start > COMPAT_LOFF_T_MAX)
+				ret = -EOVERFLOW;
+			if (f.l_len > COMPAT_LOFF_T_MAX)
+				f.l_len = COMPAT_LOFF_T_MAX;
+			if (ret == 0)
+				ret = put_compat_flock64(&f, compat_ptr(arg));
+		}
+		break;
+
+	default:
+		ret = sys_fcntl(fd, cmd, arg);
+		break;
+	}
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
+		       compat_ulong_t, arg)
+{
+	switch (cmd) {
+	case F_GETLK64:
+	case F_SETLK64:
+	case F_SETLKW64:
+	case F_OFD_GETLK:
+	case F_OFD_SETLK:
+	case F_OFD_SETLKW:
+		return -EINVAL;
+	}
+	return compat_sys_fcntl64(fd, cmd, arg);
+}
+#endif
+
 /* Table to convert sigio signal codes into poll band bitmaps */
 
 static const long band_table[NSIGPOLL] = {
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 5559168d5637..58a61f55e0d0 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -9,6 +9,7 @@
 #include <linux/fsnotify.h>
 #include <linux/personality.h>
 #include <linux/uaccess.h>
+#include <linux/compat.h>
 #include "internal.h"
 #include "mount.h"
 
@@ -264,3 +265,15 @@ SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
 	ret = do_handle_open(mountdirfd, handle, flags);
 	return ret;
 }
+
+#ifdef CONFIG_COMPAT
+/*
+ * Exactly like sys_open_by_handle_at() above, except that it
+ * doesn't set the O_LARGEFILE flag.
+ */
+COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+			     struct file_handle __user *, handle, int, flags)
+{
+	return do_handle_open(mountdirfd, handle, flags);
+}
+#endif
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 01b97c012c6e..3814a60e0aea 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -38,11 +38,6 @@ struct metapath {
 	__u16 mp_list[GFS2_MAX_META_HEIGHT];
 };
 
-struct strip_mine {
-	int sm_first;
-	unsigned int sm_height;
-};
-
 /**
  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
  * @ip: the inode
@@ -253,6 +248,19 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)
 }
 
 /**
+ * metaptr1 - Return the first possible metadata pointer in a metapath buffer
+ * @height: The metadata height (0 = dinode)
+ * @mp: The metapath
+ */
+static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
+{
+	struct buffer_head *bh = mp->mp_bh[height];
+	if (height == 0)
+		return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
+	return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
+}
+
+/**
  * metapointer - Return pointer to start of metadata in a buffer
  * @height: The metadata height (0 = dinode)
  * @mp: The metapath
@@ -264,10 +272,8 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)
 
 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
 {
-	struct buffer_head *bh = mp->mp_bh[height];
-	unsigned int head_size = (height > 0) ?
-		sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
-	return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
+	__be64 *p = metaptr1(height, mp);
+	return p + mp->mp_list[height];
 }
 
 static void gfs2_metapath_ra(struct gfs2_glock *gl,
@@ -296,6 +302,23 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
 }
 
 /**
+ * lookup_mp_height - helper function for lookup_metapath
+ * @ip: the inode
+ * @mp: the metapath
+ * @h: the height which needs looking up
+ */
+static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h)
+{
+	__be64 *ptr = metapointer(h, mp);
+	u64 dblock = be64_to_cpu(*ptr);
+
+	if (!dblock)
+		return h + 1;
+
+	return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]);
+}
+
+/**
  * lookup_metapath - Walk the metadata tree to a specific point
  * @ip: The inode
  * @mp: The metapath
@@ -316,17 +339,10 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 {
 	unsigned int end_of_metadata = ip->i_height - 1;
 	unsigned int x;
-	__be64 *ptr;
-	u64 dblock;
 	int ret;
 
 	for (x = 0; x < end_of_metadata; x++) {
-		ptr = metapointer(x, mp);
-		dblock = be64_to_cpu(*ptr);
-		if (!dblock)
-			return x + 1;
-
-		ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]);
+		ret = lookup_mp_height(ip, mp, x);
 		if (ret)
 			return ret;
 	}
@@ -334,6 +350,35 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 	return ip->i_height;
 }
 
+/**
+ * fillup_metapath - fill up buffers for the metadata path to a specific height
+ * @ip: The inode
+ * @mp: The metapath
+ * @h: The height to which it should be mapped
+ *
+ * Similar to lookup_metapath, but does lookups for a range of heights
+ *
+ * Returns: error or height of metadata tree
+ */
+
+static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
+{
+	unsigned int start_h = h - 1;
+	int ret;
+
+	if (h) {
+		/* find the first buffer we need to look up. */
+		while (start_h > 0 && mp->mp_bh[start_h] == NULL)
+			start_h--;
+		for (; start_h < h; start_h++) {
+			ret = lookup_mp_height(ip, mp, start_h);
+			if (ret)
+				return ret;
+		}
+	}
+	return ip->i_height;
+}
+
 static inline void release_metapath(struct metapath *mp)
 {
 	int i;
@@ -422,6 +467,13 @@ enum alloc_state {
 	/* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
 };
 
+static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
+{
+	if (hgt)
+		return sdp->sd_inptrs;
+	return sdp->sd_diptrs;
+}
+
 /**
  * gfs2_bmap_alloc - Build a metadata tree of the requested height
  * @inode: The GFS2 inode
@@ -620,7 +672,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 
 	BUG_ON(maxlen == 0);
 
-	memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
+	memset(&mp, 0, sizeof(mp));
 	bmap_lock(ip, create);
 	clear_buffer_mapped(bh_map);
 	clear_buffer_new(bh_map);
@@ -702,252 +754,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
 }
 
 /**
- * do_strip - Look for a layer a particular layer of the file and strip it off
- * @ip: the inode
- * @dibh: the dinode buffer
- * @bh: A buffer of pointers
- * @top: The first pointer in the buffer
- * @bottom: One more than the last pointer
- * @height: the height this buffer is at
- * @sm: a pointer to a struct strip_mine
- *
- * Returns: errno
- */
-
-static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
-		    struct buffer_head *bh, __be64 *top, __be64 *bottom,
-		    unsigned int height, struct strip_mine *sm)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct gfs2_rgrp_list rlist;
-	struct gfs2_trans *tr;
-	u64 bn, bstart;
-	u32 blen, btotal;
-	__be64 *p;
-	unsigned int rg_blocks = 0;
-	int metadata;
-	unsigned int revokes = 0;
-	int x;
-	int error;
-	int jblocks_rqsted;
-
-	error = gfs2_rindex_update(sdp);
-	if (error)
-		return error;
-
-	if (!*top)
-		sm->sm_first = 0;
-
-	if (height != sm->sm_height)
-		return 0;
-
-	if (sm->sm_first) {
-		top++;
-		sm->sm_first = 0;
-	}
-
-	metadata = (height != ip->i_height - 1);
-	if (metadata)
-		revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
-	else if (ip->i_depth)
-		revokes = sdp->sd_inptrs;
-
-	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
-	bstart = 0;
-	blen = 0;
-
-	for (p = top; p < bottom; p++) {
-		if (!*p)
-			continue;
-
-		bn = be64_to_cpu(*p);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart)
-				gfs2_rlist_add(ip, &rlist, bstart);
-
-			bstart = bn;
-			blen = 1;
-		}
-	}
-
-	if (bstart)
-		gfs2_rlist_add(ip, &rlist, bstart);
-	else
-		goto out; /* Nothing to do */
-
-	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
-
-	for (x = 0; x < rlist.rl_rgrps; x++) {
-		struct gfs2_rgrpd *rgd;
-		rgd = rlist.rl_ghs[x].gh_gl->gl_object;
-		rg_blocks += rgd->rd_length;
-	}
-
-	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
-	if (error)
-		goto out_rlist;
-
-	if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
-		gfs2_rs_deltree(&ip->i_res);
-
-restart:
-	jblocks_rqsted = rg_blocks + RES_DINODE +
-		RES_INDIRECT + RES_STATFS + RES_QUOTA +
-		gfs2_struct2blk(sdp, revokes, sizeof(u64));
-	if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2))
-		jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2);
-	error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
-	if (error)
-		goto out_rg_gunlock;
-
-	tr = current->journal_info;
-	down_write(&ip->i_rw_mutex);
-
-	gfs2_trans_add_meta(ip->i_gl, dibh);
-	gfs2_trans_add_meta(ip->i_gl, bh);
-
-	bstart = 0;
-	blen = 0;
-	btotal = 0;
-
-	for (p = top; p < bottom; p++) {
-		if (!*p)
-			continue;
-
-		/* check for max reasonable journal transaction blocks */
-		if (tr->tr_num_buf_new + RES_STATFS +
-		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
-			if (rg_blocks >= tr->tr_num_buf_new)
-				rg_blocks -= tr->tr_num_buf_new;
-			else
-				rg_blocks = 0;
-			break;
-		}
-
-		bn = be64_to_cpu(*p);
-
-		if (bstart + blen == bn)
-			blen++;
-		else {
-			if (bstart) {
-				__gfs2_free_blocks(ip, bstart, blen, metadata);
-				btotal += blen;
-			}
-
-			bstart = bn;
-			blen = 1;
-		}
-
-		*p = 0;
-		gfs2_add_inode_blocks(&ip->i_inode, -1);
-	}
-	if (p == bottom)
-		rg_blocks = 0;
-
-	if (bstart) {
-		__gfs2_free_blocks(ip, bstart, blen, metadata);
-		btotal += blen;
-	}
-
-	gfs2_statfs_change(sdp, 0, +btotal, 0);
-	gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
-			  ip->i_inode.i_gid);
-
-	ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
-
-	gfs2_dinode_out(ip, dibh->b_data);
-
-	up_write(&ip->i_rw_mutex);
-
-	gfs2_trans_end(sdp);
-
-	if (rg_blocks)
-		goto restart;
-
-out_rg_gunlock:
-	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
-out_rlist:
-	gfs2_rlist_free(&rlist);
-out:
-	return error;
-}
-
-/**
- * recursive_scan - recursively scan through the end of a file
- * @ip: the inode
- * @dibh: the dinode buffer
- * @mp: the path through the metadata to the point to start
- * @height: the height the recursion is at
- * @block: the indirect block to look at
- * @first: 1 if this is the first block
- * @sm: data opaque to this function to pass to @bc
- *
- * When this is first called @height and @block should be zero and
- * @first should be 1.
- *
- * Returns: errno
- */
-
-static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
-			  struct metapath *mp, unsigned int height,
-			  u64 block, int first, struct strip_mine *sm)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	struct buffer_head *bh = NULL;
-	__be64 *top, *bottom;
-	u64 bn;
-	int error;
-	int mh_size = sizeof(struct gfs2_meta_header);
-
-	if (!height) {
-		error = gfs2_meta_inode_buffer(ip, &bh);
-		if (error)
-			return error;
-		dibh = bh;
-
-		top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
-		bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
-	} else {
-		error = gfs2_meta_indirect_buffer(ip, height, block, &bh);
-		if (error)
-			return error;
-
-		top = (__be64 *)(bh->b_data + mh_size) +
-				  (first ? mp->mp_list[height] : 0);
-
-		bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
-	}
-
-	error = do_strip(ip, dibh, bh, top, bottom, height, sm);
-	if (error)
-		goto out;
-
-	if (height < ip->i_height - 1) {
-
-		gfs2_metapath_ra(ip->i_gl, bh, top);
-
-		for (; top < bottom; top++, first = 0) {
-			if (!*top)
-				continue;
-
-			bn = be64_to_cpu(*top);
-
-			error = recursive_scan(ip, dibh, mp, height + 1, bn,
-					       first, sm);
-			if (error)
-				break;
-		}
-	}
-out:
-	brelse(bh);
-	return error;
-}
-
-
-/**
  * gfs2_block_truncate_page - Deal with zeroing out data for truncate
  *
  * This is partly borrowed from ext3.
@@ -1106,41 +912,406 @@ out:
 	return error;
 }
 
-static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
+/**
+ * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
+ * @ip: inode
+ * @rd_gh: holder of resource group glock
+ * @mp: current metapath fully populated with buffers
+ * @btotal: place to keep count of total blocks freed
+ * @hgt: height we're processing
+ * @preserve1: true if the first metapath pointer must be preserved (skipped)
+ *
+ * We sweep a metadata buffer (provided by the metapath) for blocks we need to
+ * free, and free them all. However, we do it one rgrp at a time. If this
+ * block has references to multiple rgrps, we break it into individual
+ * transactions. This allows other processes to use the rgrps while we're
+ * focused on a single one, for better concurrency / performance.
+ * At every transaction boundary, we rewrite the inode into the journal.
+ * That way the bitmaps are kept consistent with the inode and we can recover
+ * if we're interrupted by power-outages.
+ *
+ * Returns: 0, or return code if an error occurred.
+ *          *btotal has the total number of blocks freed
+ */
+static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
+			      const struct metapath *mp, u32 *btotal, int hgt,
+			      bool preserve1)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	unsigned int height = ip->i_height;
-	u64 lblock;
-	struct metapath mp;
-	int error;
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_trans *tr;
+	struct buffer_head *bh = mp->mp_bh[hgt];
+	__be64 *top, *bottom, *p;
+	int blks_outside_rgrp;
+	u64 bn, bstart, isize_blks;
+	s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
+	int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
+	int ret = 0;
+	bool buf_in_tr = false; /* buffer was added to transaction */
+
+	if (gfs2_metatype_check(sdp, bh,
+				(hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
+		return -EIO;
+
+more_rgrps:
+	blks_outside_rgrp = 0;
+	bstart = 0;
+	blen = 0;
+	top = metapointer(hgt, mp); /* first ptr from metapath */
+	/* If we're keeping some data at the truncation point, we've got to
+	   preserve the metadata tree by adding 1 to the starting metapath. */
+	if (preserve1)
+		top++;
+
+	bottom = (__be64 *)(bh->b_data + bh->b_size);
+
+	for (p = top; p < bottom; p++) {
+		if (!*p)
+			continue;
+		bn = be64_to_cpu(*p);
+		if (gfs2_holder_initialized(rd_gh)) {
+			rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object;
+			gfs2_assert_withdraw(sdp,
+				     gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
+		} else {
+			rgd = gfs2_blk2rgrpd(sdp, bn, false);
+			ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+						 0, rd_gh);
+			if (ret)
+				goto out;
+
+			/* Must be done with the rgrp glock held: */
+			if (gfs2_rs_active(&ip->i_res) &&
+			    rgd == ip->i_res.rs_rbm.rgd)
+				gfs2_rs_deltree(&ip->i_res);
+		}
+
+		if (!rgrp_contains_block(rgd, bn)) {
+			blks_outside_rgrp++;
+			continue;
+		}
+
+		/* The size of our transactions will be unknown until we
+		   actually process all the metadata blocks that relate to
+		   the rgrp. So we estimate. We know it can't be more than
+		   the dinode's i_blocks and we don't want to exceed the
+		   journal flush threshold, sd_log_thresh2. */
+		if (current->journal_info == NULL) {
+			unsigned int jblocks_rqsted, revokes;
+
+			jblocks_rqsted = rgd->rd_length + RES_DINODE +
+				RES_INDIRECT;
+			isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
+			if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
+				jblocks_rqsted +=
+					atomic_read(&sdp->sd_log_thresh2);
+			else
+				jblocks_rqsted += isize_blks;
+			revokes = jblocks_rqsted;
+			if (meta)
+				revokes += hptrs(sdp, hgt);
+			else if (ip->i_depth)
+				revokes += sdp->sd_inptrs;
+			ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
+			if (ret)
+				goto out_unlock;
+			down_write(&ip->i_rw_mutex);
+		}
+		/* check if we will exceed the transaction blocks requested */
+		tr = current->journal_info;
+		if (tr->tr_num_buf_new + RES_STATFS +
+		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
+			/* We set blks_outside_rgrp to ensure the loop will
+			   be repeated for the same rgrp, but with a new
+			   transaction. */
+			blks_outside_rgrp++;
+			/* This next part is tricky. If the buffer was added
+			   to the transaction, we've already set some block
+			   pointers to 0, so we better follow through and free
+			   them, or we will introduce corruption (so break).
+			   This may be impossible, or at least rare, but I
+			   decided to cover the case regardless.
+
+			   If the buffer was not added to the transaction
+			   (this call), doing so would exceed our transaction
+			   size, so we need to end the transaction and start a
+			   new one (so goto). */
+
+			if (buf_in_tr)
+				break;
+			goto out_unlock;
+		}
+
+		gfs2_trans_add_meta(ip->i_gl, bh);
+		buf_in_tr = true;
+		*p = 0;
+		if (bstart + blen == bn) {
+			blen++;
+			continue;
+		}
+		if (bstart) {
+			__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+			(*btotal) += blen;
+			gfs2_add_inode_blocks(&ip->i_inode, -blen);
+		}
+		bstart = bn;
+		blen = 1;
+	}
+	if (bstart) {
+		__gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+		(*btotal) += blen;
+		gfs2_add_inode_blocks(&ip->i_inode, -blen);
+	}
+out_unlock:
+	if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
+					    outside the rgrp we just processed,
+					    do it all over again. */
+		if (current->journal_info) {
+			struct buffer_head *dibh = mp->mp_bh[0];
+
+			/* Every transaction boundary, we rewrite the dinode
+			   to keep its di_blocks current in case of failure. */
+			ip->i_inode.i_mtime = ip->i_inode.i_ctime =
+				CURRENT_TIME;
+			gfs2_trans_add_meta(ip->i_gl, dibh);
+			gfs2_dinode_out(ip, dibh->b_data);
+			up_write(&ip->i_rw_mutex);
+			gfs2_trans_end(sdp);
+		}
+		gfs2_glock_dq_uninit(rd_gh);
+		cond_resched();
+		goto more_rgrps;
+	}
+out:
+	return ret;
+}
+
+/**
+ * find_nonnull_ptr - find a non-null pointer given a metapath and height
+ * assumes the metapath is valid (with buffers) out to height h
+ * @mp: starting metapath
+ * @h: desired height to search
+ *
+ * Returns: true if a non-null pointer was found in the metapath buffer
+ *          false if all remaining pointers are NULL in the buffer
+ */
+static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
+			     unsigned int h)
+{
+	__be64 *ptr;
+	unsigned int ptrs = hptrs(sdp, h) - 1;
+
+	while (true) {
+		ptr = metapointer(h, mp);
+		if (*ptr) /* if we have a non-null pointer */
+			return true;
+
+		if (mp->mp_list[h] < ptrs)
+			mp->mp_list[h]++;
+		else
+			return false; /* no more pointers in this buffer */
+	}
+}
+
+enum dealloc_states {
+	DEALLOC_MP_FULL = 0,    /* Strip a metapath with all buffers read in */
+	DEALLOC_MP_LOWER = 1,   /* lower the metapath strip height */
+	DEALLOC_FILL_MP = 2,  /* Fill in the metapath to the given height. */
+	DEALLOC_DONE = 3,       /* process complete */
+};
 
-	if (!size)
+/**
+ * trunc_dealloc - truncate a file down to a desired size
+ * @ip: inode to truncate
+ * @newsize: The desired size of the file
+ *
+ * This function truncates a file to newsize. It works from the
+ * bottom up, and from the right to the left. In other words, it strips off
+ * the highest layer (data) before stripping any of the metadata. Doing it
+ * this way is best in case the operation is interrupted by power failure, etc.
+ * The dinode is rewritten in every transaction to guarantee integrity.
+ */
+static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct metapath mp;
+	struct buffer_head *dibh, *bh;
+	struct gfs2_holder rd_gh;
+	u64 lblock;
+	__u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
+	unsigned int strip_h = ip->i_height - 1;
+	u32 btotal = 0;
+	int ret, state;
+	int mp_h; /* metapath buffers are read in to this height */
+	sector_t last_ra = 0;
+	u64 prev_bnr = 0;
+	bool preserve1; /* need to preserve the first meta pointer? */
+
+	if (!newsize)
 		lblock = 0;
 	else
-		lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
+		lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
 
+	memset(&mp, 0, sizeof(mp));
 	find_metapath(sdp, lblock, &mp, ip->i_height);
-	error = gfs2_rindex_update(sdp);
-	if (error)
-		return error;
 
-	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
-	if (error)
-		return error;
+	memcpy(&nbof, &mp.mp_list, sizeof(nbof));
+
+	ret = gfs2_meta_inode_buffer(ip, &dibh);
+	if (ret)
+		return ret;
 
-	while (height--) {
-		struct strip_mine sm;
-		sm.sm_first = !!size;
-		sm.sm_height = height;
+	mp.mp_bh[0] = dibh;
+	ret = lookup_metapath(ip, &mp);
+	if (ret == ip->i_height)
+		state = DEALLOC_MP_FULL; /* We have a complete metapath */
+	else
+		state = DEALLOC_FILL_MP; /* deal with partial metapath */
 
-		error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm);
-		if (error)
+	ret = gfs2_rindex_update(sdp);
+	if (ret)
+		goto out_metapath;
+
+	ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (ret)
+		goto out_metapath;
+	gfs2_holder_mark_uninitialized(&rd_gh);
+
+	mp_h = strip_h;
+
+	while (state != DEALLOC_DONE) {
+		switch (state) {
+		/* Truncate a full metapath at the given strip height.
+		 * Note that strip_h == mp_h in order to be in this state. */
+		case DEALLOC_MP_FULL:
+			if (mp_h > 0) { /* issue read-ahead on metadata */
+				__be64 *top;
+
+				bh = mp.mp_bh[mp_h - 1];
+				if (bh->b_blocknr != last_ra) {
+					last_ra = bh->b_blocknr;
+					top = metaptr1(mp_h - 1, &mp);
+					gfs2_metapath_ra(ip->i_gl, bh, top);
+				}
+			}
+			/* If we're truncating to a non-zero size and the mp is
+			   at the beginning of file for the strip height, we
+			   need to preserve the first metadata pointer. */
+			preserve1 = (newsize &&
+				     (mp.mp_list[mp_h] == nbof[mp_h]));
+			bh = mp.mp_bh[mp_h];
+			gfs2_assert_withdraw(sdp, bh);
+			if (gfs2_assert_withdraw(sdp,
+						 prev_bnr != bh->b_blocknr)) {
+				printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
+				       "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
+				       sdp->sd_fsname,
+				       (unsigned long long)ip->i_no_addr,
+				       prev_bnr, ip->i_height, strip_h, mp_h);
+			}
+			prev_bnr = bh->b_blocknr;
+			ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
+						 mp_h, preserve1);
+			/* If we hit an error or just swept dinode buffer,
+			   just exit. */
+			if (ret || !mp_h) {
+				state = DEALLOC_DONE;
+				break;
+			}
+			state = DEALLOC_MP_LOWER;
+			break;
+
+		/* lower the metapath strip height */
+		case DEALLOC_MP_LOWER:
+			/* We're done with the current buffer, so release it,
+			   unless it's the dinode buffer. Then back up to the
+			   previous pointer. */
+			if (mp_h) {
+				brelse(mp.mp_bh[mp_h]);
+				mp.mp_bh[mp_h] = NULL;
+			}
+			/* If we can't get any lower in height, we've stripped
+			   off all we can. Next step is to back up and start
+			   stripping the previous level of metadata. */
+			if (mp_h == 0) {
+				strip_h--;
+				memcpy(&mp.mp_list, &nbof, sizeof(nbof));
+				mp_h = strip_h;
+				state = DEALLOC_FILL_MP;
+				break;
+			}
+			mp.mp_list[mp_h] = 0;
+			mp_h--; /* search one metadata height down */
+			if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
+				break; /* loop around in the same state */
+			mp.mp_list[mp_h]++;
+			/* Here we've found a part of the metapath that is not
+			 * allocated. We need to search at that height for the
+			 * next non-null pointer. */
+			if (find_nonnull_ptr(sdp, &mp, mp_h)) {
+				state = DEALLOC_FILL_MP;
+				mp_h++;
+			}
+			/* No more non-null pointers at this height. Back up
+			   to the previous height and try again. */
+			break; /* loop around in the same state */
+
+		/* Fill the metapath with buffers to the given height. */
+		case DEALLOC_FILL_MP:
+			/* Fill the buffers out to the current height. */
+			ret = fillup_metapath(ip, &mp, mp_h);
+			if (ret < 0)
+				goto out;
+
+			/* If buffers found for the entire strip height */
+			if ((ret == ip->i_height) && (mp_h == strip_h)) {
+				state = DEALLOC_MP_FULL;
+				break;
+			}
+			if (ret < ip->i_height) /* We have a partial height */
+				mp_h = ret - 1;
+
+			/* If we find a non-null block pointer, crawl a bit
+			   higher up in the metapath and try again, otherwise
+			   we need to look lower for a new starting point. */
+			if (find_nonnull_ptr(sdp, &mp, mp_h))
+				mp_h++;
+			else
+				state = DEALLOC_MP_LOWER;
 			break;
+		}
 	}
 
-	gfs2_quota_unhold(ip);
+	if (btotal) {
+		if (current->journal_info == NULL) {
+			ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
+					       RES_QUOTA, 0);
+			if (ret)
+				goto out;
+			down_write(&ip->i_rw_mutex);
+		}
+		gfs2_statfs_change(sdp, 0, +btotal, 0);
+		gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
+				  ip->i_inode.i_gid);
+		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+		gfs2_dinode_out(ip, dibh->b_data);
+		up_write(&ip->i_rw_mutex);
+		gfs2_trans_end(sdp);
+	}
 
-	return error;
+out:
+	if (gfs2_holder_initialized(&rd_gh))
+		gfs2_glock_dq_uninit(&rd_gh);
+	if (current->journal_info) {
+		up_write(&ip->i_rw_mutex);
+		gfs2_trans_end(sdp);
+		cond_resched();
+	}
+	gfs2_quota_unhold(ip);
+out_metapath:
+	release_metapath(&mp);
+	return ret;
 }
 
 static int trunc_end(struct gfs2_inode *ip)
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6fe2a59c6a9a..c2062a108d19 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -911,11 +911,15 @@ out_qunlock:
 static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 {
 	struct inode *inode = file_inode(file);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int ret;
 
-	if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+	/* fallocate is needed by gfs2_grow to reserve space in the rindex */
+	if (gfs2_is_jdata(ip) && inode != sdp->sd_rindex)
 		return -EOPNOTSUPP;
 
 	inode_lock(inode);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ec0848fcca02..959a19ced4d5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(lru_lock);
 
 static struct rhashtable_params ht_parms = {
 	.nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4,
-	.key_len = sizeof(struct lm_lockname),
+	.key_len = offsetofend(struct lm_lockname, ln_type),
 	.key_offset = offsetof(struct gfs2_glock, gl_name),
 	.head_offset = offsetof(struct gfs2_glock, gl_node),
 };
@@ -449,6 +449,9 @@ __acquires(&gl->gl_lockref.lock)
 	unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
 	int ret;
 
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) &&
+	    target != LM_ST_UNLOCKED)
+		return;
 	lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
 		      LM_FLAG_PRIORITY);
 	GLOCK_BUG_ON(gl, gl->gl_state == target);
@@ -484,7 +487,8 @@ __acquires(&gl->gl_lockref.lock)
 		}
 		else if (ret) {
 			pr_err("lm_lock ret %d\n", ret);
-			GLOCK_BUG_ON(gl, 1);
+			GLOCK_BUG_ON(gl, !test_bit(SDF_SHUTDOWN,
+						   &sdp->sd_flags));
 		}
 	} else { /* lock_nolock */
 		finish_xmote(gl, target);
@@ -653,10 +657,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	struct lm_lockname name = { .ln_number = number,
 				    .ln_type = glops->go_type,
 				    .ln_sbd = sdp };
-	struct gfs2_glock *gl, *tmp = NULL;
+	struct gfs2_glock *gl, *tmp;
 	struct address_space *mapping;
 	struct kmem_cache *cachep;
-	int ret, tries = 0;
+	int ret = 0;
 
 	rcu_read_lock();
 	gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
@@ -721,35 +725,32 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	}
 
 again:
-	ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node,
-					    ht_parms);
-	if (ret == 0) {
+	rcu_read_lock();
+	tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node,
+						ht_parms);
+	if (!tmp) {
 		*glp = gl;
-		return 0;
+		goto out;
 	}
-
-	if (ret == -EEXIST) {
-		ret = 0;
-		rcu_read_lock();
-		tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
-		if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) {
-			if (++tries < 100) {
-				rcu_read_unlock();
-				cond_resched();
-				goto again;
-			}
-			tmp = NULL;
-			ret = -ENOMEM;
-		}
-		rcu_read_unlock();
-	} else {
-		WARN_ON_ONCE(ret);
+	if (IS_ERR(tmp)) {
+		ret = PTR_ERR(tmp);
+		goto out_free;
 	}
+	if (lockref_get_not_dead(&tmp->gl_lockref)) {
+		*glp = tmp;
+		goto out_free;
+	}
+	rcu_read_unlock();
+	cond_resched();
+	goto again;
+
+out_free:
 	kfree(gl->gl_lksb.sb_lvbptr);
 	kmem_cache_free(cachep, gl);
 	atomic_dec(&sdp->sd_glock_disposal);
-	*glp = tmp;
 
+out:
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -1918,10 +1919,10 @@ static const struct seq_operations gfs2_sbstats_seq_ops = {
 
 #define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL)
 
-static int gfs2_glocks_open(struct inode *inode, struct file *file)
+static int __gfs2_glocks_open(struct inode *inode, struct file *file,
+			      const struct seq_operations *ops)
 {
-	int ret = seq_open_private(file, &gfs2_glock_seq_ops,
-				   sizeof(struct gfs2_glock_iter));
+	int ret = seq_open_private(file, ops, sizeof(struct gfs2_glock_iter));
 	if (ret == 0) {
 		struct seq_file *seq = file->private_data;
 		struct gfs2_glock_iter *gi = seq->private;
@@ -1932,11 +1933,16 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file)
 		if (seq->buf)
 			seq->size = GFS2_SEQ_GOODSIZE;
 		gi->gl = NULL;
-		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
+		rhashtable_walk_enter(&gl_hash_table, &gi->hti);
 	}
 	return ret;
 }
 
+static int gfs2_glocks_open(struct inode *inode, struct file *file)
+{
+	return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops);
+}
+
 static int gfs2_glocks_release(struct inode *inode, struct file *file)
 {
 	struct seq_file *seq = file->private_data;
@@ -1949,20 +1955,7 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file)
 
 static int gfs2_glstats_open(struct inode *inode, struct file *file)
 {
-	int ret = seq_open_private(file, &gfs2_glstats_seq_ops,
-				   sizeof(struct gfs2_glock_iter));
-	if (ret == 0) {
-		struct seq_file *seq = file->private_data;
-		struct gfs2_glock_iter *gi = seq->private;
-		gi->sdp = inode->i_private;
-		gi->last_pos = 0;
-		seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN);
-		if (seq->buf)
-			seq->size = GFS2_SEQ_GOODSIZE;
-		gi->gl = NULL;
-		ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL);
-	}
-	return ret;
+	return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops);
 }
 
 static int gfs2_sbstats_open(struct inode *inode, struct file *file)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 511e1ed7e2de..b7cf65d13561 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -203,11 +203,15 @@ enum {
 	DFL_DLM_RECOVERY	= 6,
 };
 
+/*
+ * We are using struct lm_lockname as an rhashtable key.  Avoid holes within
+ * the struct; padding at the end is fine.
+ */
 struct lm_lockname {
-	struct gfs2_sbd *ln_sbd;
 	u64 ln_number;
+	struct gfs2_sbd *ln_sbd;
 	unsigned int ln_type;
-} __packed __aligned(sizeof(int));
+};
 
 #define lm_name_equal(name1, name2) \
         (((name1)->ln_number == (name2)->ln_number) &&	\
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e279c3ce27be..9f605ea4810c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -202,8 +202,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 fail_refresh:
 	ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 	ip->i_iopen_gh.gh_gl->gl_object = NULL;
-	gfs2_glock_dq_wait(&ip->i_iopen_gh);
-	gfs2_holder_uninit(&ip->i_iopen_gh);
+	gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 fail_put:
 	if (io_gl)
 		gfs2_glock_put(io_gl);
@@ -667,6 +666,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	ip->i_height = 0;
 	ip->i_depth = 0;
 	ip->i_entries = 0;
+	ip->i_no_addr = 0; /* Temporarily zero until real addr is assigned */
 
 	switch(mode & S_IFMT) {
 	case S_IFREG:
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 86ccc0159393..83c9909ff14a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -483,13 +483,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
 	}
 }
 
-static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
-{
-	u64 first = rgd->rd_data0;
-	u64 last = first + rgd->rd_data;
-	return first <= block && block < last;
-}
-
 /**
  * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
  * @sdp: The GFS2 superblock
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 66b51cf66dfa..e90478e2f545 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -83,5 +83,12 @@ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
 	return rs && !RB_EMPTY_NODE(&rs->rs_node);
 }
 
+static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
+{
+	u64 first = rgd->rd_data0;
+	u64 last = first + rgd->rd_data;
+	return first <= block && block < last;
+}
+
 extern void check_and_update_goal(struct gfs2_inode *ip);
 #endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 361796a84fce..29b0473f6e74 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -793,7 +793,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
 
 	if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC)))
 		return;
-
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return;
 	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
 		ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 		if (ret) {
@@ -1538,8 +1539,7 @@ static void gfs2_evict_inode(struct inode *inode)
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
 	if (unlikely(error)) {
 		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
-		gfs2_glock_dq_wait(&ip->i_iopen_gh);
-		gfs2_holder_uninit(&ip->i_iopen_gh);
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 		goto out;
 	}
 
@@ -1617,7 +1617,7 @@ out_unlock:
 	if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
 		if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
 			ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
-			gfs2_glock_dq_wait(&ip->i_iopen_gh);
+			gfs2_glock_dq(&ip->i_iopen_gh);
 		}
 		gfs2_holder_uninit(&ip->i_iopen_gh);
 	}
@@ -1639,8 +1639,7 @@ out:
 	if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
 		ip->i_iopen_gh.gh_gl->gl_object = NULL;
 		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
-		gfs2_glock_dq_wait(&ip->i_iopen_gh);
-		gfs2_holder_uninit(&ip->i_iopen_gh);
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
 	}
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index 88110fd0b282..131b2bcebc48 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -371,9 +371,6 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_lru);
 	address_space_init_once(&inode->i_data);
 	i_size_ordered_init(inode);
-#ifdef CONFIG_FSNOTIFY
-	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
-#endif
 }
 EXPORT_SYMBOL(inode_init_once);
 
diff --git a/fs/internal.h b/fs/internal.h
index 11c6d89dce9c..076751d90ba2 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -108,8 +108,6 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
 extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 		const char *, const struct open_flags *);
 
-extern long do_handle_open(int mountdirfd,
-			   struct file_handle __user *ufh, int open_flag);
 extern int open_check_o_direct(struct file *f);
 extern int vfs_open(const struct path *, struct file *, const struct cred *);
 extern struct file *filp_clone_open(struct file *);
diff --git a/fs/iomap.c b/fs/iomap.c
index 141c3cd55a8b..4add7d4ad006 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -360,7 +360,8 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
 	sector_t sector = iomap->blkno +
 		(((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);
 
-	return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
+	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
+			offset, bytes);
 }
 
 static loff_t
@@ -887,16 +888,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		flags |= IOMAP_WRITE;
 	}
 
-	if (mapping->nrpages) {
-		ret = filemap_write_and_wait_range(mapping, start, end);
-		if (ret)
-			goto out_free_dio;
+	ret = filemap_write_and_wait_range(mapping, start, end);
+	if (ret)
+		goto out_free_dio;
 
-		ret = invalidate_inode_pages2_range(mapping,
-				start >> PAGE_SHIFT, end >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-		ret = 0;
-	}
+	ret = invalidate_inode_pages2_range(mapping,
+			start >> PAGE_SHIFT, end >> PAGE_SHIFT);
+	WARN_ON_ONCE(ret);
+	ret = 0;
 
 	inode_dio_begin(inode);
 
@@ -951,7 +950,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
 	 * this invalidation fails, tough, the write still worked...
 	 */
-	if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
+	if (iov_iter_rw(iter) == WRITE) {
 		int err = invalidate_inode_pages2_range(mapping,
 				start >> PAGE_SHIFT, end >> PAGE_SHIFT);
 		WARN_ON_ONCE(err);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 5adc2fb62b0f..c43fe83ee708 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -43,6 +43,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bitops.h>
 #include <linux/ratelimit.h>
+#include <linux/sched/mm.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/jbd2.h>
@@ -206,6 +207,14 @@ static int kjournald2(void *arg)
 	wake_up(&journal->j_wait_done_commit);
 
 	/*
+	 * Make sure that no allocations from this kernel thread will ever
+	 * recurse to the fs layer because we are responsible for the
+	 * transaction commit and any fs involvement might get stuck waiting for
+	 * the transaction commit.
+	 */
+	memalloc_nofs_save();
+
+	/*
 	 * And now, wait forever for commit wakeup events.
 	 */
 	write_lock(&journal->j_state_lock);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 5e659ee08d6a..9ee4832b6f8b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -29,6 +29,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bug.h>
 #include <linux/module.h>
+#include <linux/sched/mm.h>
 
 #include <trace/events/jbd2.h>
 
@@ -388,6 +389,11 @@ repeat:
 
 	rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_);
 	jbd2_journal_free_transaction(new_transaction);
+	/*
+	 * Ensure that no allocations done while the transaction is open are
+	 * going to recurse back to the fs layer.
+	 */
+	handle->saved_alloc_context = memalloc_nofs_save();
 	return 0;
 }
 
@@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
 	trace_jbd2_handle_start(journal->j_fs_dev->bd_dev,
 				handle->h_transaction->t_tid, type,
 				line_no, nblocks);
+
 	return handle;
 }
 EXPORT_SYMBOL(jbd2__journal_start);
@@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle)
 	if (handle->h_rsv_handle)
 		jbd2_journal_free_reserved(handle->h_rsv_handle);
 free_and_exit:
+	/*
+	 * Scope of the GFP_NOFS context is over here and so we can restore the
+	 * original alloc context.
+	 */
+	memalloc_nofs_restore(handle->saved_alloc_context);
 	jbd2_free_handle(handle);
 	return err;
 }
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index fc89f9436784..5c5ac5b3aec3 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -64,7 +64,6 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 	switch (cmd) {
 	case JFS_IOC_GETFLAGS:
-		jfs_get_inode_flags(jfs_inode);
 		flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE;
 		flags = jfs_map_ext2(flags, 0);
 		return put_user(flags, (int __user *) arg);
@@ -98,7 +97,6 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		/* Lock against other parallel changes of flags */
 		inode_lock(inode);
 
-		jfs_get_inode_flags(jfs_inode);
 		oldflags = jfs_inode->mode2;
 
 		/*
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 6aca224a5d68..f36ef68905a7 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -3148,7 +3148,6 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip)
 	else
 		dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns,
 						    jfs_ip->saved_gid));
-	jfs_get_inode_flags(jfs_ip);
 	/*
 	 * mode2 is only needed for storing the higher order bits.
 	 * Trust i_mode for the lower order ones
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 375dd257a34f..5e9b7bb3aabf 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -45,24 +45,6 @@ void jfs_set_inode_flags(struct inode *inode)
 			S_DIRSYNC | S_SYNC);
 }
 
-void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip)
-{
-	unsigned int flags = jfs_ip->vfs_inode.i_flags;
-
-	jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL |
-			   JFS_DIRSYNC_FL | JFS_SYNC_FL);
-	if (flags & S_IMMUTABLE)
-		jfs_ip->mode2 |= JFS_IMMUTABLE_FL;
-	if (flags & S_APPEND)
-		jfs_ip->mode2 |= JFS_APPEND_FL;
-	if (flags & S_NOATIME)
-		jfs_ip->mode2 |= JFS_NOATIME_FL;
-	if (flags & S_DIRSYNC)
-		jfs_ip->mode2 |= JFS_DIRSYNC_FL;
-	if (flags & S_SYNC)
-		jfs_ip->mode2 |= JFS_SYNC_FL;
-}
-
 /*
  * NAME:	ialloc()
  *
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 9271cfe4a149..7b0b3a40788f 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -33,7 +33,6 @@ extern void jfs_truncate(struct inode *);
 extern void jfs_truncate_nolock(struct inode *, loff_t);
 extern void jfs_free_zero_link(struct inode *);
 extern struct dentry *jfs_get_parent(struct dentry *dentry);
-extern void jfs_get_inode_flags(struct jfs_inode_info *);
 extern struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
 	int fh_len, int fh_type);
 extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid,
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index c64c2574a0aa..e8aad7d87b8c 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -45,6 +45,7 @@
 #include "jfs_acl.h"
 #include "jfs_debug.h"
 #include "jfs_xattr.h"
+#include "jfs_dinode.h"
 
 MODULE_DESCRIPTION("The Journaled Filesystem (JFS)");
 MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM");
@@ -181,6 +182,35 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
+#ifdef CONFIG_QUOTA
+static int jfs_quota_off(struct super_block *sb, int type);
+static int jfs_quota_on(struct super_block *sb, int type, int format_id,
+			const struct path *path);
+
+static void jfs_quota_off_umount(struct super_block *sb)
+{
+	int type;
+
+	for (type = 0; type < MAXQUOTAS; type++)
+		jfs_quota_off(sb, type);
+}
+
+static const struct quotactl_ops jfs_quotactl_ops = {
+	.quota_on	= jfs_quota_on,
+	.quota_off	= jfs_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_state	= dquot_get_state,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk,
+	.get_nextdqblk	= dquot_get_next_dqblk,
+};
+#else
+static inline void jfs_quota_off_umount(struct super_block *sb)
+{
+}
+#endif
+
 static void jfs_put_super(struct super_block *sb)
 {
 	struct jfs_sb_info *sbi = JFS_SBI(sb);
@@ -188,7 +218,7 @@ static void jfs_put_super(struct super_block *sb)
 
 	jfs_info("In jfs_put_super");
 
-	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+	jfs_quota_off_umount(sb);
 
 	rc = jfs_umount(sb);
 	if (rc)
@@ -536,7 +566,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_xattr = jfs_xattr_handlers;
 #ifdef CONFIG_QUOTA
 	sb->dq_op = &dquot_operations;
-	sb->s_qcop = &dquot_quotactl_ops;
+	sb->s_qcop = &jfs_quotactl_ops;
 	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
 #endif
 
@@ -840,6 +870,51 @@ static struct dquot **jfs_get_dquots(struct inode *inode)
 {
 	return JFS_IP(inode)->i_dquot;
 }
+
+static int jfs_quota_on(struct super_block *sb, int type, int format_id,
+			const struct path *path)
+{
+	int err;
+	struct inode *inode; /* inode behind @path's dentry */
+
+	err = dquot_quota_on(sb, type, format_id, path);
+	if (err)
+		return err; /* failed: inode flags are left untouched */
+
+	inode = d_inode(path->dentry);
+	inode_lock(inode); /* both flag updates run under the inode lock */
+	JFS_IP(inode)->mode2 |= JFS_NOATIME_FL | JFS_IMMUTABLE_FL;
+	inode_set_flags(inode, S_NOATIME | S_IMMUTABLE,
+			S_NOATIME | S_IMMUTABLE);
+	inode_unlock(inode);
+	mark_inode_dirty(inode); /* called after the lock is released */
+
+	return 0;
+}
+
+static int jfs_quota_off(struct super_block *sb, int type)
+{
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	int err; /* assigned only once igrab() has succeeded */
+
+	if (!inode || !igrab(inode))
+		goto out; /* no inode reference held: just turn quota off */
+
+	err = dquot_quota_off(sb, type);
+	if (err)
+		goto out_put; /* failed: keep flags, drop igrab() ref */
+
+	inode_lock(inode);
+	JFS_IP(inode)->mode2 &= ~(JFS_NOATIME_FL | JFS_IMMUTABLE_FL);
+	inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE);
+	inode_unlock(inode);
+	mark_inode_dirty(inode);
+out_put:
+	iput(inode); /* pairs with the igrab() above */
+	return err; /* 0 on success, else dquot_quota_off()'s error */
+out:
+	return dquot_quota_off(sb, type);
+}
 #endif
 
 static const struct super_operations jfs_super_operations = {
diff --git a/fs/mount.h b/fs/mount.h
index 2826543a131d..bf1fda6eed8f 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -59,7 +59,7 @@ struct mount {
 	struct mountpoint *mnt_mp;	/* where is it mounted */
 	struct hlist_node mnt_mp_list;	/* list mounts with the same mountpoint */
 #ifdef CONFIG_FSNOTIFY
-	struct hlist_head mnt_fsnotify_marks;
+	struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
 #endif
 	int mnt_id;			/* mount identifier */
diff --git a/fs/namei.c b/fs/namei.c
index 19dcf62133cc..9a7f8bd748d8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -340,22 +340,14 @@ int generic_permission(struct inode *inode, int mask)
 
 	if (S_ISDIR(inode->i_mode)) {
 		/* DACs are overridable for directories */
-		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
-			return 0;
 		if (!(mask & MAY_WRITE))
 			if (capable_wrt_inode_uidgid(inode,
 						     CAP_DAC_READ_SEARCH))
 				return 0;
-		return -EACCES;
-	}
-	/*
-	 * Read/write DACs are always overridable.
-	 * Executable DACs are overridable when there is
-	 * at least one exec bit set.
-	 */
-	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
 		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
 			return 0;
+		return -EACCES;
+	}
 
 	/*
 	 * Searching includes executable on directories, else just read.
@@ -364,6 +356,14 @@ int generic_permission(struct inode *inode, int mask)
 	if (mask == MAY_READ)
 		if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
 			return 0;
+	/*
+	 * Read/write DACs are always overridable.
+	 * Executable DACs are overridable when there is
+	 * at least one exec bit set.
+	 */
+	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
+		if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
+			return 0;
 
 	return -EACCES;
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index cc1375eff88c..b3b115bd4e1e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -236,9 +236,6 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
 		INIT_HLIST_NODE(&mnt->mnt_mp_list);
-#ifdef CONFIG_FSNOTIFY
-		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
-#endif
 		init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
 	}
 	return mnt;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 19d50f600e8d..9aaf6ca77569 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1004,7 +1004,7 @@ out_nfserr:
 	else
 		err = nfserrno(host_err);
 	if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
-		tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
+		current_restore_flags(pflags, PF_LESS_THROTTLE);
 	return err;
 }
 
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 96d3420d0242..3e969ae91b60 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,5 +1,5 @@
-obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o inode_mark.o \
-				   mark.o vfsmount_mark.o fdinfo.o
+obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o mark.o \
+				   fdinfo.o
 
 obj-y			+= dnotify/
 obj-y			+= inotify/
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 5a4ec309e283..2430a0415995 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -52,7 +52,7 @@ struct dnotify_mark {
  */
 static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
 {
-	__u32 new_mask, old_mask;
+	__u32 new_mask = 0;
 	struct dnotify_struct *dn;
 	struct dnotify_mark *dn_mark  = container_of(fsn_mark,
 						     struct dnotify_mark,
@@ -60,17 +60,13 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
 
 	assert_spin_locked(&fsn_mark->lock);
 
-	old_mask = fsn_mark->mask;
-	new_mask = 0;
 	for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next)
 		new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
-	fsnotify_set_mark_mask_locked(fsn_mark, new_mask);
-
-	if (old_mask == new_mask)
+	if (fsn_mark->mask == new_mask)
 		return;
+	fsn_mark->mask = new_mask;
 
-	if (fsn_mark->inode)
-		fsnotify_recalc_inode_mask(fsn_mark->inode);
+	fsnotify_recalc_mask(fsn_mark->connector);
 }
 
 /*
@@ -86,7 +82,8 @@ static int dnotify_handle_event(struct fsnotify_group *group,
 				struct fsnotify_mark *inode_mark,
 				struct fsnotify_mark *vfsmount_mark,
 				u32 mask, const void *data, int data_type,
-				const unsigned char *file_name, u32 cookie)
+				const unsigned char *file_name, u32 cookie,
+				struct fsnotify_iter_info *iter_info)
 {
 	struct dnotify_mark *dn_mark;
 	struct dnotify_struct *dn;
@@ -138,6 +135,7 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
 
 static struct fsnotify_ops dnotify_fsnotify_ops = {
 	.handle_event = dnotify_handle_event,
+	.free_mark = dnotify_free_mark,
 };
 
 /*
@@ -160,7 +158,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 	if (!S_ISDIR(inode->i_mode))
 		return;
 
-	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+	fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
 	if (!fsn_mark)
 		return;
 	dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
@@ -308,7 +306,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 
 	/* set up the new_fsn_mark and new_dn_mark */
 	new_fsn_mark = &new_dn_mark->fsn_mark;
-	fsnotify_init_mark(new_fsn_mark, dnotify_free_mark);
+	fsnotify_init_mark(new_fsn_mark, dnotify_group);
 	new_fsn_mark->mask = mask;
 	new_dn_mark->dn = NULL;
 
@@ -316,13 +314,12 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	mutex_lock(&dnotify_group->mark_mutex);
 
 	/* add the new_fsn_mark or find an old one. */
-	fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode);
+	fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
 	if (fsn_mark) {
 		dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
 		spin_lock(&fsn_mark->lock);
 	} else {
-		fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode,
-					 NULL, 0);
+		fsnotify_add_mark_locked(new_fsn_mark, inode, NULL, 0);
 		spin_lock(&new_fsn_mark->lock);
 		fsn_mark = new_fsn_mark;
 		dn_mark = new_dn_mark;
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index e5f7e47de68e..2fa99aeaa095 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -57,14 +57,26 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
 
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 static int fanotify_get_response(struct fsnotify_group *group,
-				 struct fanotify_perm_event_info *event)
+				 struct fanotify_perm_event_info *event,
+				 struct fsnotify_iter_info *iter_info)
 {
 	int ret;
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
+	/*
+	 * fsnotify_prepare_user_wait() fails if we race with mark deletion.
+	 * Just let the operation pass in that case.
+	 */
+	if (!fsnotify_prepare_user_wait(iter_info)) {
+		event->response = FAN_ALLOW;
+		goto out;
+	}
+
 	wait_event(group->fanotify_data.access_waitq, event->response);
 
+	fsnotify_finish_user_wait(iter_info);
+out:
 	/* userspace responded, convert to something usable */
 	switch (event->response) {
 	case FAN_ALLOW:
@@ -174,7 +186,8 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 				 struct fsnotify_mark *inode_mark,
 				 struct fsnotify_mark *fanotify_mark,
 				 u32 mask, const void *data, int data_type,
-				 const unsigned char *file_name, u32 cookie)
+				 const unsigned char *file_name, u32 cookie,
+				 struct fsnotify_iter_info *iter_info)
 {
 	int ret = 0;
 	struct fanotify_event_info *event;
@@ -215,7 +228,8 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	if (mask & FAN_ALL_PERM_EVENTS) {
-		ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event));
+		ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event),
+					    iter_info);
 		fsnotify_destroy_event(group, fsn_event);
 	}
 #endif
@@ -248,8 +262,14 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
 	kmem_cache_free(fanotify_event_cachep, event);
 }
 
+static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
+{
+	kmem_cache_free(fanotify_mark_cache, fsn_mark); /* .free_mark hook */
+}
+
 const struct fsnotify_ops fanotify_fsnotify_ops = {
 	.handle_event = fanotify_handle_event,
 	.free_group_priv = fanotify_free_group_priv,
 	.free_event = fanotify_free_event,
+	.free_mark = fanotify_free_mark,
 };
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 4500a74f8d38..4eb6f5efa282 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -2,6 +2,7 @@
 #include <linux/path.h>
 #include <linux/slab.h>
 
+extern struct kmem_cache *fanotify_mark_cache;
 extern struct kmem_cache *fanotify_event_cachep;
 extern struct kmem_cache *fanotify_perm_event_cachep;
 
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 2b37f2785834..907a481ac781 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -41,7 +41,7 @@
 
 extern const struct fsnotify_ops fanotify_fsnotify_ops;
 
-static struct kmem_cache *fanotify_mark_cache __read_mostly;
+struct kmem_cache *fanotify_mark_cache __read_mostly;
 struct kmem_cache *fanotify_event_cachep __read_mostly;
 struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
 
@@ -295,27 +295,37 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
 		}
 
 		ret = copy_event_to_user(group, kevent, buf);
+		if (unlikely(ret == -EOPENSTALE)) {
+			/*
+			 * We cannot report events with stale fd so drop it.
+			 * Setting ret to 0 will continue the event loop and
+			 * do the right thing if there are no more events to
+			 * read (i.e. return bytes read, -EAGAIN or wait).
+			 */
+			ret = 0;
+		}
+
 		/*
 		 * Permission events get queued to wait for response.  Other
 		 * events can be destroyed now.
 		 */
 		if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) {
 			fsnotify_destroy_event(group, kevent);
-			if (ret < 0)
-				break;
 		} else {
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
-			if (ret < 0) {
+			if (ret <= 0) {
 				FANOTIFY_PE(kevent)->response = FAN_DENY;
 				wake_up(&group->fanotify_data.access_waitq);
-				break;
+			} else {
+				spin_lock(&group->notification_lock);
+				list_add_tail(&kevent->list,
+					&group->fanotify_data.access_list);
+				spin_unlock(&group->notification_lock);
 			}
-			spin_lock(&group->notification_lock);
-			list_add_tail(&kevent->list,
-				      &group->fanotify_data.access_list);
-			spin_unlock(&group->notification_lock);
 #endif
 		}
+		if (ret < 0)
+			break;
 		buf += ret;
 		count -= ret;
 	}
@@ -445,11 +455,6 @@ static const struct file_operations fanotify_fops = {
 	.llseek		= noop_llseek,
 };
 
-static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
-{
-	kmem_cache_free(fanotify_mark_cache, fsn_mark);
-}
-
 static int fanotify_find_path(int dfd, const char __user *filename,
 			      struct path *path, unsigned int flags)
 {
@@ -511,13 +516,12 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
 			tmask &= ~FAN_ONDIR;
 
 		oldmask = fsn_mark->mask;
-		fsnotify_set_mark_mask_locked(fsn_mark, tmask);
+		fsn_mark->mask = tmask;
 	} else {
 		__u32 tmask = fsn_mark->ignored_mask & ~mask;
 		if (flags & FAN_MARK_ONDIR)
 			tmask &= ~FAN_ONDIR;
-
-		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
+		fsn_mark->ignored_mask = tmask;
 	}
 	*destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
 	spin_unlock(&fsn_mark->lock);
@@ -534,7 +538,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 	int destroy_mark;
 
 	mutex_lock(&group->mark_mutex);
-	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+	fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks,
+				      group);
 	if (!fsn_mark) {
 		mutex_unlock(&group->mark_mutex);
 		return -ENOENT;
@@ -542,6 +547,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
+	if (removed & real_mount(mnt)->mnt_fsnotify_mask)
+		fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks);
 	if (destroy_mark)
 		fsnotify_detach_mark(fsn_mark);
 	mutex_unlock(&group->mark_mutex);
@@ -549,9 +556,6 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group,
 		fsnotify_free_mark(fsn_mark);
 
 	fsnotify_put_mark(fsn_mark);
-	if (removed & real_mount(mnt)->mnt_fsnotify_mask)
-		fsnotify_recalc_vfsmount_mask(mnt);
-
 	return 0;
 }
 
@@ -564,7 +568,7 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 	int destroy_mark;
 
 	mutex_lock(&group->mark_mutex);
-	fsn_mark = fsnotify_find_inode_mark(group, inode);
+	fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
 	if (!fsn_mark) {
 		mutex_unlock(&group->mark_mutex);
 		return -ENOENT;
@@ -572,16 +576,16 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 
 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
 						 &destroy_mark);
+	if (removed & inode->i_fsnotify_mask)
+		fsnotify_recalc_mask(inode->i_fsnotify_marks);
 	if (destroy_mark)
 		fsnotify_detach_mark(fsn_mark);
 	mutex_unlock(&group->mark_mutex);
 	if (destroy_mark)
 		fsnotify_free_mark(fsn_mark);
 
-	/* matches the fsnotify_find_inode_mark() */
+	/* matches the fsnotify_find_mark() */
 	fsnotify_put_mark(fsn_mark);
-	if (removed & inode->i_fsnotify_mask)
-		fsnotify_recalc_inode_mask(inode);
 
 	return 0;
 }
@@ -600,13 +604,13 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 			tmask |= FAN_ONDIR;
 
 		oldmask = fsn_mark->mask;
-		fsnotify_set_mark_mask_locked(fsn_mark, tmask);
+		fsn_mark->mask = tmask;
 	} else {
 		__u32 tmask = fsn_mark->ignored_mask | mask;
 		if (flags & FAN_MARK_ONDIR)
 			tmask |= FAN_ONDIR;
 
-		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
+		fsn_mark->ignored_mask = tmask;
 		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
 			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
 	}
@@ -629,8 +633,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 	if (!mark)
 		return ERR_PTR(-ENOMEM);
 
-	fsnotify_init_mark(mark, fanotify_free_mark);
-	ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0);
+	fsnotify_init_mark(mark, group);
+	ret = fsnotify_add_mark_locked(mark, inode, mnt, 0);
 	if (ret) {
 		fsnotify_put_mark(mark);
 		return ERR_PTR(ret);
@@ -648,7 +652,8 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 	__u32 added;
 
 	mutex_lock(&group->mark_mutex);
-	fsn_mark = fsnotify_find_vfsmount_mark(group, mnt);
+	fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks,
+				      group);
 	if (!fsn_mark) {
 		fsn_mark = fanotify_add_new_mark(group, NULL, mnt);
 		if (IS_ERR(fsn_mark)) {
@@ -657,10 +662,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
 		}
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
-	mutex_unlock(&group->mark_mutex);
-
 	if (added & ~real_mount(mnt)->mnt_fsnotify_mask)
-		fsnotify_recalc_vfsmount_mask(mnt);
+		fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks);
+	mutex_unlock(&group->mark_mutex);
 
 	fsnotify_put_mark(fsn_mark);
 	return 0;
@@ -686,7 +690,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 		return 0;
 
 	mutex_lock(&group->mark_mutex);
-	fsn_mark = fsnotify_find_inode_mark(group, inode);
+	fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
 	if (!fsn_mark) {
 		fsn_mark = fanotify_add_new_mark(group, inode, NULL);
 		if (IS_ERR(fsn_mark)) {
@@ -695,10 +699,9 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
 		}
 	}
 	added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
-	mutex_unlock(&group->mark_mutex);
-
 	if (added & ~inode->i_fsnotify_mask)
-		fsnotify_recalc_inode_mask(inode);
+		fsnotify_recalc_mask(inode->i_fsnotify_marks);
+	mutex_unlock(&group->mark_mutex);
 
 	fsnotify_put_mark(fsn_mark);
 	return 0;
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index fd98e5100cab..dd63aa9a6f9a 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -76,12 +76,11 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 	struct inotify_inode_mark *inode_mark;
 	struct inode *inode;
 
-	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
-	    !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
+	if (!(mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE))
 		return;
 
 	inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
-	inode = igrab(mark->inode);
+	inode = igrab(mark->connector->inode);
 	if (inode) {
 		/*
 		 * IN_ALL_EVENTS represents all of the mask bits
@@ -113,14 +112,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 	unsigned int mflags = 0;
 	struct inode *inode;
 
-	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE))
-		return;
-
 	if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
 		mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
 
-	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
-		inode = igrab(mark->inode);
+	if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) {
+		inode = igrab(mark->connector->inode);
 		if (!inode)
 			return;
 		seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
@@ -129,8 +125,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 		show_mark_fhandle(m, inode);
 		seq_putc(m, '\n');
 		iput(inode);
-	} else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) {
-		struct mount *mnt = real_mount(mark->mnt);
+	} else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
+		struct mount *mnt = real_mount(mark->connector->mnt);
 
 		seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
 			   mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index b41515d3f081..01a9f0f007d4 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -41,6 +41,63 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
 	fsnotify_clear_marks_by_mount(mnt);
 }
 
+/**
+ * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
+ * @sb: superblock being unmounted.
+ *
+ * Called during unmount with no locks held, so needs to be safe against
+ * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
+ */
+void fsnotify_unmount_inodes(struct super_block *sb)
+{
+	struct inode *inode, *iput_inode = NULL; /* iput_inode: deferred put */
+
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		/*
+		 * We cannot __iget() an inode in state I_FREEING,
+		 * I_WILL_FREE, or I_NEW which is fine because by that point
+		 * the inode cannot have any associated watches.
+		 */
+		spin_lock(&inode->i_lock);
+		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+
+		/*
+		 * If i_count is zero, the inode cannot have any watches and
+		 * doing an __iget/iput with MS_ACTIVE clear would actually
+		 * evict all inodes with zero i_count from icache which is
+		 * unnecessarily violent and may in fact be illegal to do.
+		 */
+		if (!atomic_read(&inode->i_count)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+
+		__iget(inode); /* pin inode across the unlocked section */
+		spin_unlock(&inode->i_lock);
+		spin_unlock(&sb->s_inode_list_lock);
+
+		if (iput_inode)
+			iput(iput_inode); /* previous pass's inode */
+
+		/* for each watch, send FS_UNMOUNT and then remove it */
+		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+
+		fsnotify_inode_delete(inode);
+
+		iput_inode = inode; /* put on next pass or after the loop */
+
+		spin_lock(&sb->s_inode_list_lock);
+	}
+	spin_unlock(&sb->s_inode_list_lock);
+
+	if (iput_inode)
+		iput(iput_inode); /* last inode visited */
+}
+
 /*
  * Given an inode, first check if we care what happens to our children.  Inotify
  * and dnotify both tell their parents about events.  If we care about any event
@@ -127,7 +184,8 @@ static int send_to_group(struct inode *to_tell,
 			 struct fsnotify_mark *vfsmount_mark,
 			 __u32 mask, const void *data,
 			 int data_is, u32 cookie,
-			 const unsigned char *file_name)
+			 const unsigned char *file_name,
+			 struct fsnotify_iter_info *iter_info)
 {
 	struct fsnotify_group *group = NULL;
 	__u32 inode_test_mask = 0;
@@ -178,7 +236,7 @@ static int send_to_group(struct inode *to_tell,
 
 	return group->ops->handle_event(group, to_tell, inode_mark,
 					vfsmount_mark, mask, data, data_is,
-					file_name, cookie);
+					file_name, cookie, iter_info);
 }
 
 /*
@@ -193,8 +251,10 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
 	struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
 	struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
 	struct fsnotify_group *inode_group, *vfsmount_group;
+	struct fsnotify_mark_connector *inode_conn, *vfsmount_conn;
+	struct fsnotify_iter_info iter_info;
 	struct mount *mnt;
-	int idx, ret = 0;
+	int ret = 0;
 	/* global tests shouldn't care about events on child only the specific event */
 	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
@@ -210,8 +270,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
 	 * SRCU because we have no references to any objects and do not
 	 * need SRCU to keep them "alive".
 	 */
-	if (hlist_empty(&to_tell->i_fsnotify_marks) &&
-	    (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks)))
+	if (!to_tell->i_fsnotify_marks &&
+	    (!mnt || !mnt->mnt_fsnotify_marks))
 		return 0;
 	/*
 	 * if this is a modify event we may need to clear the ignored masks
@@ -223,19 +283,30 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
 	    !(mnt && test_mask & mnt->mnt_fsnotify_mask))
 		return 0;
 
-	idx = srcu_read_lock(&fsnotify_mark_srcu);
+	iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
 
 	if ((mask & FS_MODIFY) ||
-	    (test_mask & to_tell->i_fsnotify_mask))
-		inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+	    (test_mask & to_tell->i_fsnotify_mask)) {
+		inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
 					      &fsnotify_mark_srcu);
+		if (inode_conn)
+			inode_node = srcu_dereference(inode_conn->list.first,
+						      &fsnotify_mark_srcu);
+	}
 
 	if (mnt && ((mask & FS_MODIFY) ||
 		    (test_mask & mnt->mnt_fsnotify_mask))) {
-		vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
-						 &fsnotify_mark_srcu);
-		inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+		inode_conn = srcu_dereference(to_tell->i_fsnotify_marks,
 					      &fsnotify_mark_srcu);
+		if (inode_conn)
+			inode_node = srcu_dereference(inode_conn->list.first,
+						      &fsnotify_mark_srcu);
+		vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks,
+					         &fsnotify_mark_srcu);
+		if (vfsmount_conn)
+			vfsmount_node = srcu_dereference(
+						vfsmount_conn->list.first,
+						&fsnotify_mark_srcu);
 	}
 
 	/*
@@ -272,8 +343,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
 				vfsmount_mark = NULL;
 			}
 		}
+
+		iter_info.inode_mark = inode_mark;
+		iter_info.vfsmount_mark = vfsmount_mark;
+
 		ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask,
-				    data, data_is, cookie, file_name);
+				    data, data_is, cookie, file_name,
+				    &iter_info);
 
 		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
 			goto out;
@@ -287,12 +363,14 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
 	}
 	ret = 0;
 out:
-	srcu_read_unlock(&fsnotify_mark_srcu, idx);
+	srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx);
 
 	return ret;
 }
 EXPORT_SYMBOL_GPL(fsnotify);
 
+extern struct kmem_cache *fsnotify_mark_connector_cachep;
+
 static __init int fsnotify_init(void)
 {
 	int ret;
@@ -303,6 +381,9 @@ static __init int fsnotify_init(void)
 	if (ret)
 		panic("initializing fsnotify_mark_srcu");
 
+	fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector,
+						    SLAB_PANIC);
+
 	return 0;
 }
 core_initcall(fsnotify_init);
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 0a3bc2cf192c..bf012e8ecd14 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -8,60 +8,36 @@
 
 #include "../mount.h"
 
+struct fsnotify_iter_info {
+	struct fsnotify_mark *inode_mark; /* filled in by fsnotify() */
+	struct fsnotify_mark *vfsmount_mark; /* filled in by fsnotify() */
+	int srcu_idx; /* srcu_read_lock(&fsnotify_mark_srcu) result */
+};
+
 /* destroy all events sitting in this groups notification queue */
 extern void fsnotify_flush_notify(struct fsnotify_group *group);
 
 /* protects reads of inode and vfsmount marks list */
 extern struct srcu_struct fsnotify_mark_srcu;
 
-/* Calculate mask of events for a list of marks */
-extern u32 fsnotify_recalc_mask(struct hlist_head *head);
-
 /* compare two groups for sorting of marks lists */
 extern int fsnotify_compare_groups(struct fsnotify_group *a,
 				   struct fsnotify_group *b);
 
-extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark,
-						__u32 mask);
-/* Add mark to a proper place in mark list */
-extern int fsnotify_add_mark_list(struct hlist_head *head,
-				  struct fsnotify_mark *mark,
-				  int allow_dups);
-/* add a mark to an inode */
-extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
-				   struct fsnotify_group *group, struct inode *inode,
-				   int allow_dups);
-/* add a mark to a vfsmount */
-extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
-				      struct fsnotify_group *group, struct vfsmount *mnt,
-				      int allow_dups);
-
-/* vfsmount specific destruction of a mark */
-extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
-/* inode specific destruction of a mark */
-extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark);
-/* Find mark belonging to given group in the list of marks */
-extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
-						struct fsnotify_group *group);
-/* Destroy all marks in the given list protected by 'lock' */
-extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock);
+/* Destroy all marks connected via given connector */
+extern void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp);
 /* run the list of all marks associated with inode and destroy them */
 static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
 {
-	fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock);
+	fsnotify_destroy_marks(&inode->i_fsnotify_marks);
 }
 /* run the list of all marks associated with vfsmount and destroy them */
 static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
 {
-	fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks,
-			       &mnt->mnt_root->d_lock);
+	fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks);
 }
-/* prepare for freeing all marks associated with given group */
-extern void fsnotify_detach_group_marks(struct fsnotify_group *group);
-/*
- * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list
- */
-extern void fsnotify_mark_destroy_list(void);
+/* Wait until all marks queued for destruction are destroyed */
+extern void fsnotify_wait_marks_destroyed(void);
 
 /*
  * update the dentry->d_flags of all of inode's children to indicate if inode cares
diff --git a/fs/notify/group.c b/fs/notify/group.c
index fbe3cbebec16..32357534de18 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -66,14 +66,23 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
 	 */
 	fsnotify_group_stop_queueing(group);
 
-	/* clear all inode marks for this group, attach them to destroy_list */
-	fsnotify_detach_group_marks(group);
+	/* Clear all marks for this group and queue them for destruction */
+	fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES);
 
 	/*
-	 * Wait for fsnotify_mark_srcu period to end and free all marks in
-	 * destroy_list
+	 * Some marks can still be pinned when waiting for response from
+	 * userspace. Wait for those now. fsnotify_prepare_user_wait() will
+	 * not succeed now so this wait is race-free.
 	 */
-	fsnotify_mark_destroy_list();
+	wait_event(group->notification_waitq, !atomic_read(&group->user_waits));
+
+	/*
+	 * Wait until all marks get really destroyed. We could actually destroy
+	 * them ourselves instead of waiting for worker to do it, however that
+	 * would be racy as worker can already be processing some marks before
+	 * we even entered fsnotify_destroy_group().
+	 */
+	fsnotify_wait_marks_destroyed();
 
 	/*
 	 * Since we have waited for fsnotify_mark_srcu in
@@ -124,6 +133,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 	/* set to 0 when there a no external references to this group */
 	atomic_set(&group->refcnt, 1);
 	atomic_set(&group->num_marks, 0);
+	atomic_set(&group->user_waits, 0);
 
 	spin_lock_init(&group->notification_lock);
 	INIT_LIST_HEAD(&group->notification_list);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
deleted file mode 100644
index a3645249f7ec..000000000000
--- a/fs/notify/inode_mark.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2, or (at your option)
- *  any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; see the file COPYING.  If not, write to
- *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include <linux/atomic.h>
-
-#include <linux/fsnotify_backend.h>
-#include "fsnotify.h"
-
-#include "../internal.h"
-
-/*
- * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
- * any notifier is interested in hearing for this inode.
- */
-void fsnotify_recalc_inode_mask(struct inode *inode)
-{
-	spin_lock(&inode->i_lock);
-	inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
-	spin_unlock(&inode->i_lock);
-
-	__fsnotify_update_child_dentry_flags(inode);
-}
-
-void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark)
-{
-	struct inode *inode = mark->inode;
-
-	BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
-	assert_spin_locked(&mark->lock);
-
-	spin_lock(&inode->i_lock);
-
-	hlist_del_init_rcu(&mark->obj_list);
-	mark->inode = NULL;
-
-	/*
-	 * this mark is now off the inode->i_fsnotify_marks list and we
-	 * hold the inode->i_lock, so this is the perfect time to update the
-	 * inode->i_fsnotify_mask
-	 */
-	inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
-	spin_unlock(&inode->i_lock);
-}
-
-/*
- * Given a group clear all of the inode marks associated with that group.
- */
-void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
-{
-	fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE);
-}
-
-/*
- * given a group and inode, find the mark associated with that combination.
- * if found take a reference to that mark and return it, else return NULL
- */
-struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group,
-					       struct inode *inode)
-{
-	struct fsnotify_mark *mark;
-
-	spin_lock(&inode->i_lock);
-	mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
-	spin_unlock(&inode->i_lock);
-
-	return mark;
-}
-
-/*
- * If we are setting a mark mask on an inode mark we should pin the inode
- * in memory.
- */
-void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark,
-					 __u32 mask)
-{
-	struct inode *inode;
-
-	assert_spin_locked(&mark->lock);
-
-	if (mask &&
-	    mark->inode &&
-	    !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) {
-		mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED;
-		inode = igrab(mark->inode);
-		/*
-		 * we shouldn't be able to get here if the inode wasn't
-		 * already safely held in memory.  But bug in case it
-		 * ever is wrong.
-		 */
-		BUG_ON(!inode);
-	}
-}
-
-/*
- * Attach an initialized mark to a given inode.
- * These marks may be used for the fsnotify backend to determine which
- * event types should be delivered to which group and for which inodes.  These
- * marks are ordered according to priority, highest number first, and then by
- * the group's location in memory.
- */
-int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
-			    struct fsnotify_group *group, struct inode *inode,
-			    int allow_dups)
-{
-	int ret;
-
-	mark->flags |= FSNOTIFY_MARK_FLAG_INODE;
-
-	BUG_ON(!mutex_is_locked(&group->mark_mutex));
-	assert_spin_locked(&mark->lock);
-
-	spin_lock(&inode->i_lock);
-	mark->inode = inode;
-	ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark,
-				     allow_dups);
-	inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks);
-	spin_unlock(&inode->i_lock);
-
-	return ret;
-}
-
-/**
- * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
- * @sb: superblock being unmounted.
- *
- * Called during unmount with no locks held, so needs to be safe against
- * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
- */
-void fsnotify_unmount_inodes(struct super_block *sb)
-{
-	struct inode *inode, *iput_inode = NULL;
-
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
-		/*
-		 * We cannot __iget() an inode in state I_FREEING,
-		 * I_WILL_FREE, or I_NEW which is fine because by that point
-		 * the inode cannot have any associated watches.
-		 */
-		spin_lock(&inode->i_lock);
-		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-
-		/*
-		 * If i_count is zero, the inode cannot have any watches and
-		 * doing an __iget/iput with MS_ACTIVE clear would actually
-		 * evict all inodes with zero i_count from icache which is
-		 * unnecessarily violent and may in fact be illegal to do.
-		 */
-		if (!atomic_read(&inode->i_count)) {
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-
-		__iget(inode);
-		spin_unlock(&inode->i_lock);
-		spin_unlock(&sb->s_inode_list_lock);
-
-		if (iput_inode)
-			iput(iput_inode);
-
-		/* for each watch, send FS_UNMOUNT and then remove it */
-		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
-
-		fsnotify_inode_delete(inode);
-
-		iput_inode = inode;
-
-		spin_lock(&sb->s_inode_list_lock);
-	}
-	spin_unlock(&sb->s_inode_list_lock);
-
-	if (iput_inode)
-		iput(iput_inode);
-}
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 7c461fd49c4c..9ff67b61da8a 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -27,9 +27,11 @@ extern int inotify_handle_event(struct fsnotify_group *group,
 				struct fsnotify_mark *inode_mark,
 				struct fsnotify_mark *vfsmount_mark,
 				u32 mask, const void *data, int data_type,
-				const unsigned char *file_name, u32 cookie);
+				const unsigned char *file_name, u32 cookie,
+				struct fsnotify_iter_info *iter_info);
 
 extern const struct fsnotify_ops inotify_fsnotify_ops;
+extern struct kmem_cache *inotify_inode_mark_cachep;
 
 #ifdef CONFIG_INOTIFY_USER
 static inline void dec_inotify_instances(struct ucounts *ucounts)
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 1aeb837ae414..8b73332735ba 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -68,7 +68,8 @@ int inotify_handle_event(struct fsnotify_group *group,
 			 struct fsnotify_mark *inode_mark,
 			 struct fsnotify_mark *vfsmount_mark,
 			 u32 mask, const void *data, int data_type,
-			 const unsigned char *file_name, u32 cookie)
+			 const unsigned char *file_name, u32 cookie,
+			 struct fsnotify_iter_info *iter_info)
 {
 	struct inotify_inode_mark *i_mark;
 	struct inotify_event_info *event;
@@ -156,8 +157,8 @@ static int idr_callback(int id, void *p, void *data)
 	 * BUG() that was here.
 	 */
 	if (fsn_mark)
-		printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n",
-			fsn_mark->group, fsn_mark->inode, i_mark->wd);
+		printk(KERN_WARNING "fsn_mark->group=%p wd=%d\n",
+			fsn_mark->group, i_mark->wd);
 	return 0;
 }
 
@@ -175,9 +176,20 @@ static void inotify_free_event(struct fsnotify_event *fsn_event)
 	kfree(INOTIFY_E(fsn_event));
 }
 
+/* Free an inotify mark: hand the enclosing inotify_inode_mark back to its slab */
+static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
+{
+	struct inotify_inode_mark *i_mark;
+
+	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
+
+	kmem_cache_free(inotify_inode_mark_cachep, i_mark);
+}
+
 const struct fsnotify_ops inotify_fsnotify_ops = {
 	.handle_event = inotify_handle_event,
 	.free_group_priv = inotify_free_group_priv,
 	.free_event = inotify_free_event,
 	.freeing_mark = inotify_freeing_mark,
+	.free_mark = inotify_free_mark,
 };
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 498d609b26c7..7cc7d3fb1862 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -47,7 +47,7 @@
 /* configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_queued_events __read_mostly;
 
-static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
+struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 
 #ifdef CONFIG_SYSCTL
 
@@ -395,21 +395,6 @@ static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group,
 	return i_mark;
 }
 
-static void do_inotify_remove_from_idr(struct fsnotify_group *group,
-				       struct inotify_inode_mark *i_mark)
-{
-	struct idr *idr = &group->inotify_data.idr;
-	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
-	int wd = i_mark->wd;
-
-	assert_spin_locked(idr_lock);
-
-	idr_remove(idr, wd);
-
-	/* removed from the idr, drop that ref */
-	fsnotify_put_mark(&i_mark->fsn_mark);
-}
-
 /*
  * Remove the mark from the idr (if present) and drop the reference
  * on the mark because it was in the idr.
@@ -417,6 +402,7 @@ static void do_inotify_remove_from_idr(struct fsnotify_group *group,
 static void inotify_remove_from_idr(struct fsnotify_group *group,
 				    struct inotify_inode_mark *i_mark)
 {
+	struct idr *idr = &group->inotify_data.idr;
 	spinlock_t *idr_lock = &group->inotify_data.idr_lock;
 	struct inotify_inode_mark *found_i_mark = NULL;
 	int wd;
@@ -429,18 +415,16 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
 	 * if it wasn't....
 	 */
 	if (wd == -1) {
-		WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
-			" i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
-			i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
+		WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n",
+			__func__, i_mark, i_mark->wd, i_mark->fsn_mark.group);
 		goto out;
 	}
 
 	/* Lets look in the idr to see if we find it */
 	found_i_mark = inotify_idr_find_locked(group, wd);
 	if (unlikely(!found_i_mark)) {
-		WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
-			" i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
-			i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
+		WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n",
+			__func__, i_mark, i_mark->wd, i_mark->fsn_mark.group);
 		goto out;
 	}
 
@@ -451,35 +435,33 @@ static void inotify_remove_from_idr(struct fsnotify_group *group,
 	 */
 	if (unlikely(found_i_mark != i_mark)) {
 		WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p "
-			"mark->inode=%p found_i_mark=%p found_i_mark->wd=%d "
-			"found_i_mark->group=%p found_i_mark->inode=%p\n",
-			__func__, i_mark, i_mark->wd, i_mark->fsn_mark.group,
-			i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd,
-			found_i_mark->fsn_mark.group,
-			found_i_mark->fsn_mark.inode);
+			"found_i_mark=%p found_i_mark->wd=%d "
+			"found_i_mark->group=%p\n", __func__, i_mark,
+			i_mark->wd, i_mark->fsn_mark.group, found_i_mark,
+			found_i_mark->wd, found_i_mark->fsn_mark.group);
 		goto out;
 	}
 
 	/*
 	 * One ref for being in the idr
-	 * one ref held by the caller trying to kill us
 	 * one ref grabbed by inotify_idr_find
 	 */
-	if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) {
-		printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p"
-			" i_mark->inode=%p\n", __func__, i_mark, i_mark->wd,
-			i_mark->fsn_mark.group, i_mark->fsn_mark.inode);
+	if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 2)) {
+		printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n",
+			 __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group);
 		/* we can't really recover with bad ref cnting.. */
 		BUG();
 	}
 
-	do_inotify_remove_from_idr(group, i_mark);
+	idr_remove(idr, wd);
+	/* Removed from the idr, drop that ref. */
+	fsnotify_put_mark(&i_mark->fsn_mark);
 out:
+	i_mark->wd = -1;
+	spin_unlock(idr_lock);
 	/* match the ref taken by inotify_idr_find_locked() */
 	if (found_i_mark)
 		fsnotify_put_mark(&found_i_mark->fsn_mark);
-	i_mark->wd = -1;
-	spin_unlock(idr_lock);
 }
 
 /*
@@ -492,7 +474,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 
 	/* Queue ignore event for the watch */
 	inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED,
-			     NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
+			     NULL, FSNOTIFY_EVENT_NONE, NULL, 0, NULL);
 
 	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
 	/* remove this mark from the idr */
@@ -501,16 +483,6 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 	dec_inotify_watches(group->inotify_data.ucounts);
 }
 
-/* ding dong the mark is dead */
-static void inotify_free_mark(struct fsnotify_mark *fsn_mark)
-{
-	struct inotify_inode_mark *i_mark;
-
-	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
-
-	kmem_cache_free(inotify_inode_mark_cachep, i_mark);
-}
-
 static int inotify_update_existing_watch(struct fsnotify_group *group,
 					 struct inode *inode,
 					 u32 arg)
@@ -524,21 +496,19 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 
 	mask = inotify_arg_to_mask(arg);
 
-	fsn_mark = fsnotify_find_inode_mark(group, inode);
+	fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
 	if (!fsn_mark)
 		return -ENOENT;
 
 	i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark);
 
 	spin_lock(&fsn_mark->lock);
-
 	old_mask = fsn_mark->mask;
 	if (add)
-		fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask));
+		fsn_mark->mask |= mask;
 	else
-		fsnotify_set_mark_mask_locked(fsn_mark, mask);
+		fsn_mark->mask = mask;
 	new_mask = fsn_mark->mask;
-
 	spin_unlock(&fsn_mark->lock);
 
 	if (old_mask != new_mask) {
@@ -549,7 +519,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
 
 		/* update the inode with this new fsn_mark */
 		if (dropped || do_inode)
-			fsnotify_recalc_inode_mask(inode);
+			fsnotify_recalc_mask(inode->i_fsnotify_marks);
 
 	}
 
@@ -578,7 +548,7 @@ static int inotify_new_watch(struct fsnotify_group *group,
 	if (unlikely(!tmp_i_mark))
 		return -ENOMEM;
 
-	fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark);
+	fsnotify_init_mark(&tmp_i_mark->fsn_mark, group);
 	tmp_i_mark->fsn_mark.mask = mask;
 	tmp_i_mark->wd = -1;
 
@@ -594,8 +564,7 @@ static int inotify_new_watch(struct fsnotify_group *group,
 	}
 
 	/* we are on the idr, now get on the inode */
-	ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
-				       NULL, 0);
+	ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, inode, NULL, 0);
 	if (ret) {
 		/* we failed to get on the inode, get off the idr */
 		inotify_remove_from_idr(group, tmp_i_mark);
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 6043306e8e21..9991f8826734 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -33,7 +33,7 @@
  *
  * group->mark_mutex
  * mark->lock
- * inode->i_lock
+ * mark->connector->lock
  *
  * group->mark_mutex protects the marks_list anchored inside a given group and
  * each mark is hooked via the g_list.  It also protects the groups private
@@ -44,14 +44,22 @@
  * is assigned to as well as the access to a reference of the inode/vfsmount
  * that is being watched by the mark.
  *
- * inode->i_lock protects the i_fsnotify_marks list anchored inside a
- * given inode and each mark is hooked via the i_list. (and sorta the
- * free_i_list)
+ * mark->connector->lock protects the list of marks anchored inside an
+ * inode / vfsmount and each mark is hooked via the i_list.
  *
+ * A list of notification marks relating to inode / mnt is contained in
+ * fsnotify_mark_connector. That structure is alive as long as there are any
+ * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets
+ * detached from fsnotify_mark_connector when last reference to the mark is
+ * dropped.  Thus having mark reference is enough to protect mark->connector
+ * pointer and to make sure fsnotify_mark_connector cannot disappear. Also
+ * because we remove mark from g_list before dropping mark reference associated
+ * with that, any mark found through g_list is guaranteed to have
+ * mark->connector set until we drop group->mark_mutex.
  *
  * LIFETIME:
  * Inode marks survive between when they are added to an inode and when their
- * refcnt==0.
+ * refcnt==0. Marks are also protected by fsnotify_mark_srcu.
  *
  * The inode mark can be cleared for a number of different reasons including:
  * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
@@ -61,17 +69,6 @@
  * - The fsnotify_group associated with the mark is going away and all such marks
  *   need to be cleaned up. (fsnotify_clear_marks_by_group)
  *
- * Worst case we are given an inode and need to clean up all the marks on that
- * inode.  We take i_lock and walk the i_fsnotify_marks safely.  For each
- * mark on the list we take a reference (so the mark can't disappear under us).
- * We remove that mark form the inode's list of marks and we add this mark to a
- * private list anchored on the stack using i_free_list; we walk i_free_list
- * and before we destroy the mark we make sure that we dont race with a
- * concurrent destroy_group by getting a ref to the marks group and taking the
- * groups mutex.
-
- * Very similarly for freeing by group, except we use free_g_list.
- *
  * This has the very interesting property of being able to run concurrently with
  * any (or all) other directions.
  */
@@ -94,94 +91,281 @@
 #define FSNOTIFY_REAPER_DELAY	(1)	/* 1 jiffy */
 
 struct srcu_struct fsnotify_mark_srcu;
+struct kmem_cache *fsnotify_mark_connector_cachep;
+
 static DEFINE_SPINLOCK(destroy_lock);
 static LIST_HEAD(destroy_list);
+static struct fsnotify_mark_connector *connector_destroy_list;
 
 static void fsnotify_mark_destroy_workfn(struct work_struct *work);
 static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);
 
+static void fsnotify_connector_destroy_workfn(struct work_struct *work);
+static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn);
+
 void fsnotify_get_mark(struct fsnotify_mark *mark)
 {
+	WARN_ON_ONCE(!atomic_read(&mark->refcnt));
 	atomic_inc(&mark->refcnt);
 }
 
-void fsnotify_put_mark(struct fsnotify_mark *mark)
+/*
+ * Get mark reference when we found the mark via lockless traversal of object
+ * list. Mark can be already removed from the list by now and on its way to be
+ * destroyed once SRCU period ends.
+ */
+static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
 {
-	if (atomic_dec_and_test(&mark->refcnt)) {
-		if (mark->group)
-			fsnotify_put_group(mark->group);
-		mark->free_mark(mark);
-	}
+	return atomic_inc_not_zero(&mark->refcnt);
 }
 
-/* Calculate mask of events for a list of marks */
-u32 fsnotify_recalc_mask(struct hlist_head *head)
+static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
 {
 	u32 new_mask = 0;
 	struct fsnotify_mark *mark;
 
-	hlist_for_each_entry(mark, head, obj_list)
-		new_mask |= mark->mask;
-	return new_mask;
+	assert_spin_locked(&conn->lock);
+	hlist_for_each_entry(mark, &conn->list, obj_list) {
+		if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)
+			new_mask |= mark->mask;
+	}
+	if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE)
+		conn->inode->i_fsnotify_mask = new_mask;
+	else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT)
+		real_mount(conn->mnt)->mnt_fsnotify_mask = new_mask;
 }
 
 /*
- * Remove mark from inode / vfsmount list, group list, drop inode reference
- * if we got one.
- *
- * Must be called with group->mark_mutex held.
+ * Calculate mask of events for a list of marks. The caller must make sure
+ * connector and connector->inode cannot disappear under us.  Callers achieve
+ * this by holding a mark->lock or mark->group->mark_mutex for a mark on this
+ * list.
  */
-void fsnotify_detach_mark(struct fsnotify_mark *mark)
+void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
+{
+	if (!conn)
+		return;
+
+	spin_lock(&conn->lock);
+	__fsnotify_recalc_mask(conn);
+	spin_unlock(&conn->lock);
+	if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE)
+		__fsnotify_update_child_dentry_flags(conn->inode);
+}
+
+/* Work item: take the queued connectors, wait out SRCU readers, free them */
+static void fsnotify_connector_destroy_workfn(struct work_struct *work)
+{
+	struct fsnotify_mark_connector *conn, *free;
+
+	spin_lock(&destroy_lock);
+	conn = connector_destroy_list;
+	connector_destroy_list = NULL;	/* list is now private to us */
+	spin_unlock(&destroy_lock);
+
+	synchronize_srcu(&fsnotify_mark_srcu);
+	while (conn) {
+		free = conn;
+		conn = conn->destroy_next;	/* read link before freeing */
+		kmem_cache_free(fsnotify_mark_connector_cachep, free);
+	}
+}
+
+static struct inode *fsnotify_detach_connector_from_object(
+					struct fsnotify_mark_connector *conn)
 {
 	struct inode *inode = NULL;
+
+	if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) {
+		inode = conn->inode;
+		rcu_assign_pointer(inode->i_fsnotify_marks, NULL);
+		inode->i_fsnotify_mask = 0;
+		conn->inode = NULL;
+		conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE;
+	} else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
+		rcu_assign_pointer(real_mount(conn->mnt)->mnt_fsnotify_marks,
+				   NULL);
+		real_mount(conn->mnt)->mnt_fsnotify_mask = 0;
+		conn->mnt = NULL;
+		conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT;
+	}
+
+	return inode;
+}
+
+static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
+{
 	struct fsnotify_group *group = mark->group;
 
-	BUG_ON(!mutex_is_locked(&group->mark_mutex));
+	if (WARN_ON_ONCE(!group))
+		return;
+	group->ops->free_mark(mark);
+	fsnotify_put_group(group);
+}
 
-	spin_lock(&mark->lock);
+void fsnotify_put_mark(struct fsnotify_mark *mark)
+{
+	struct fsnotify_mark_connector *conn;
+	struct inode *inode = NULL;
+	bool free_conn = false;
 
-	/* something else already called this function on this mark */
-	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
-		spin_unlock(&mark->lock);
+	/* Catch marks that were actually never attached to object */
+	if (!mark->connector) {
+		if (atomic_dec_and_test(&mark->refcnt))
+			fsnotify_final_mark_destroy(mark);
 		return;
 	}
 
-	mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
+	/*
+	 * We have to be careful so that traversals of obj_list under lock can
+	 * safely grab mark reference.
+	 */
+	if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock))
+		return;
 
-	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
-		inode = mark->inode;
-		fsnotify_destroy_inode_mark(mark);
-	} else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT)
-		fsnotify_destroy_vfsmount_mark(mark);
-	else
-		BUG();
+	conn = mark->connector;
+	hlist_del_init_rcu(&mark->obj_list);
+	if (hlist_empty(&conn->list)) {
+		inode = fsnotify_detach_connector_from_object(conn);
+		free_conn = true;
+	} else {
+		__fsnotify_recalc_mask(conn);
+	}
+	mark->connector = NULL;
+	spin_unlock(&conn->lock);
+
+	iput(inode);
+
+	if (free_conn) {
+		spin_lock(&destroy_lock);
+		conn->destroy_next = connector_destroy_list;
+		connector_destroy_list = conn;
+		spin_unlock(&destroy_lock);
+		queue_work(system_unbound_wq, &connector_reaper_work);
+	}
 	/*
 	 * Note that we didn't update flags telling whether inode cares about
 	 * what's happening with children. We update these flags from
 	 * __fsnotify_parent() lazily when next event happens on one of our
 	 * children.
 	 */
+	spin_lock(&destroy_lock);
+	list_add(&mark->g_list, &destroy_list);
+	spin_unlock(&destroy_lock);
+	queue_delayed_work(system_unbound_wq, &reaper_work,
+			   FSNOTIFY_REAPER_DELAY);
+}
 
-	list_del_init(&mark->g_list);
+bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info)
+{
+	struct fsnotify_group *group;
 
-	spin_unlock(&mark->lock);
+	if (WARN_ON_ONCE(!iter_info->inode_mark && !iter_info->vfsmount_mark))
+		return false;
+
+	if (iter_info->inode_mark)
+		group = iter_info->inode_mark->group;
+	else
+		group = iter_info->vfsmount_mark->group;
+
+	/*
+	 * Since acquisition of mark reference is an atomic op as well, we can
+	 * be sure this inc is seen before any effect of refcount increment.
+	 */
+	atomic_inc(&group->user_waits);
+
+	if (iter_info->inode_mark) {
+		/* This can fail if mark is being removed */
+		if (!fsnotify_get_mark_safe(iter_info->inode_mark))
+			goto out_wait;
+	}
+	if (iter_info->vfsmount_mark) {
+		if (!fsnotify_get_mark_safe(iter_info->vfsmount_mark))
+			goto out_inode;
+	}
 
-	if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED))
-		iput(inode);
+	/*
+	 * Now that both marks are pinned by refcount in the inode / vfsmount
+	 * lists, we can drop SRCU lock, and safely resume the list iteration
+	 * once userspace returns.
+	 */
+	srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx);
+
+	return true;
+out_inode:
+	if (iter_info->inode_mark)
+		fsnotify_put_mark(iter_info->inode_mark);
+out_wait:
+	if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
+		wake_up(&group->notification_waitq);
+	return false;
+}
+
+void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
+{
+	struct fsnotify_group *group = NULL;	/* NULL if no mark is set */
+
+	iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
+	if (iter_info->inode_mark) {
+		group = iter_info->inode_mark->group;
+		fsnotify_put_mark(iter_info->inode_mark);
+	}
+	if (iter_info->vfsmount_mark) {
+		group = iter_info->vfsmount_mark->group;
+		fsnotify_put_mark(iter_info->vfsmount_mark);
+	}
+	/*
+	 * Group shutdown waits on notification_waitq for marks pinned during
+	 * a userspace wait; wake it when the last such wait ends.
+	 */
+	if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
+		wake_up(&group->notification_waitq);
+}
+
+/*
+ * Mark mark as detached, remove it from group list. Mark still stays in object
+ * list until its last reference is dropped. Note that we rely on mark being
+ * removed from group list before corresponding reference to it is dropped. In
+ * particular we rely on mark->connector being valid while we hold
+ * group->mark_mutex if we found the mark through g_list.
+ *
+ * Must be called with group->mark_mutex held. The caller must either hold
+ * reference to the mark or be protected by fsnotify_mark_srcu.
+ */
+void fsnotify_detach_mark(struct fsnotify_mark *mark)
+{
+	struct fsnotify_group *group = mark->group;
+
+	WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));
+	WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
+		     atomic_read(&mark->refcnt) < 1 +
+			!!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));
+
+	spin_lock(&mark->lock);
+	/* something else already called this function on this mark */
+	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
+		spin_unlock(&mark->lock);
+		return;
+	}
+	mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
+	list_del_init(&mark->g_list);
+	spin_unlock(&mark->lock);
 
 	atomic_dec(&group->num_marks);
+
+	/* Drop mark reference acquired in fsnotify_add_mark_locked() */
+	fsnotify_put_mark(mark);
 }
 
 /*
- * Prepare mark for freeing and add it to the list of marks prepared for
- * freeing. The actual freeing must happen after SRCU period ends and the
- * caller is responsible for this.
+ * Free fsnotify mark. This only flags the mark as no longer alive; the
+ * memory is released once the last reference to the mark is dropped, from
+ * a workqueue which first waits for the end of the SRCU period.
  *
- * The function returns true if the mark was added to the list of marks for
- * freeing. The function returns false if someone else has already called
- * __fsnotify_free_mark() for the mark.
+ * Caller must have a reference to the mark or be protected by
+ * fsnotify_mark_srcu.
  */
-static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
+void fsnotify_free_mark(struct fsnotify_mark *mark)
 {
 	struct fsnotify_group *group = mark->group;
 
@@ -189,7 +373,7 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
 	/* something else already called this function on this mark */
 	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
 		spin_unlock(&mark->lock);
-		return false;
+		return;
 	}
 	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
 	spin_unlock(&mark->lock);
@@ -201,25 +385,6 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark)
 	 */
 	if (group->ops->freeing_mark)
 		group->ops->freeing_mark(mark, group);
-
-	spin_lock(&destroy_lock);
-	list_add(&mark->g_list, &destroy_list);
-	spin_unlock(&destroy_lock);
-
-	return true;
-}
-
-/*
- * Free fsnotify mark. The freeing is actually happening from a workqueue which
- * first waits for srcu period end. Caller must have a reference to the mark
- * or be protected by fsnotify_mark_srcu.
- */
-void fsnotify_free_mark(struct fsnotify_mark *mark)
-{
-	if (__fsnotify_free_mark(mark)) {
-		queue_delayed_work(system_unbound_wq, &reaper_work,
-				   FSNOTIFY_REAPER_DELAY);
-	}
 }
 
 void fsnotify_destroy_mark(struct fsnotify_mark *mark,
@@ -231,54 +396,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
 	fsnotify_free_mark(mark);
 }
 
-void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
-{
-	struct fsnotify_mark *mark;
-
-	while (1) {
-		/*
-		 * We have to be careful since we can race with e.g.
-		 * fsnotify_clear_marks_by_group() and once we drop 'lock',
-		 * mark can get removed from the obj_list and destroyed. But
-		 * we are holding mark reference so mark cannot be freed and
-		 * calling fsnotify_destroy_mark() more than once is fine.
-		 */
-		spin_lock(lock);
-		if (hlist_empty(head)) {
-			spin_unlock(lock);
-			break;
-		}
-		mark = hlist_entry(head->first, struct fsnotify_mark, obj_list);
-		/*
-		 * We don't update i_fsnotify_mask / mnt_fsnotify_mask here
-		 * since inode / mount is going away anyway. So just remove
-		 * mark from the list.
-		 */
-		hlist_del_init_rcu(&mark->obj_list);
-		fsnotify_get_mark(mark);
-		spin_unlock(lock);
-		fsnotify_destroy_mark(mark, mark->group);
-		fsnotify_put_mark(mark);
-	}
-}
-
-void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask)
-{
-	assert_spin_locked(&mark->lock);
-
-	mark->mask = mask;
-
-	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE)
-		fsnotify_set_inode_mark_mask_locked(mark, mask);
-}
-
-void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask)
-{
-	assert_spin_locked(&mark->lock);
-
-	mark->ignored_mask = mask;
-}
-
 /*
  * Sorting function for lists of fsnotify marks.
  *
@@ -315,37 +432,133 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
 	return -1;
 }
 
-/* Add mark into proper place in given list of marks */
-int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
-			   int allow_dups)
+static int fsnotify_attach_connector_to_object(
+				struct fsnotify_mark_connector __rcu **connp,
+				struct inode *inode,
+				struct vfsmount *mnt)
+{
+	struct fsnotify_mark_connector *conn;
+
+	conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
+	if (!conn)
+		return -ENOMEM;
+	spin_lock_init(&conn->lock);
+	INIT_HLIST_HEAD(&conn->list);
+	if (inode) {
+		conn->flags = FSNOTIFY_OBJ_TYPE_INODE;
+		conn->inode = igrab(inode);
+	} else {
+		conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
+		conn->mnt = mnt;
+	}
+	/*
+	 * cmpxchg() provides the barrier so that readers of *connp can see
+	 * only an initialized structure
+	 */
+	if (cmpxchg(connp, NULL, conn)) {
+		/* Someone else created list structure for us */
+		if (inode)
+			iput(inode);
+		kmem_cache_free(fsnotify_mark_connector_cachep, conn);
+	}
+
+	return 0;
+}
+
+/*
+ * Get mark connector, make sure it is alive and return with its lock held.
+ * This is for users that get connector pointer from inode or mount. Users that
+ * hold reference to a mark on the list may directly lock connector->lock as
+ * they are sure list cannot go away under them.
+ */
+static struct fsnotify_mark_connector *fsnotify_grab_connector(
+				struct fsnotify_mark_connector __rcu **connp)
+{
+	struct fsnotify_mark_connector *conn;
+	int idx;
+
+	idx = srcu_read_lock(&fsnotify_mark_srcu);
+	conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
+	if (!conn)
+		goto out;
+	spin_lock(&conn->lock);
+	if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE |
+			     FSNOTIFY_OBJ_TYPE_VFSMOUNT))) {
+		spin_unlock(&conn->lock);
+		srcu_read_unlock(&fsnotify_mark_srcu, idx);
+		return NULL;
+	}
+out:
+	srcu_read_unlock(&fsnotify_mark_srcu, idx);
+	return conn;
+}
+
+/*
+ * Add mark into proper place in given list of marks. These marks may be used
+ * for the fsnotify backend to determine which event types should be delivered
+ * to which group and for which inodes. These marks are ordered according to
+ * priority, highest number first, and then by the group's location in memory.
+ */
+static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
+				  struct inode *inode, struct vfsmount *mnt,
+				  int allow_dups)
 {
 	struct fsnotify_mark *lmark, *last = NULL;
+	struct fsnotify_mark_connector *conn;
+	struct fsnotify_mark_connector __rcu **connp;
 	int cmp;
+	int err = 0;
+
+	if (WARN_ON(!inode && !mnt))
+		return -EINVAL;
+	if (inode)
+		connp = &inode->i_fsnotify_marks;
+	else
+		connp = &real_mount(mnt)->mnt_fsnotify_marks;
+restart:
+	spin_lock(&mark->lock);
+	conn = fsnotify_grab_connector(connp);
+	if (!conn) {
+		spin_unlock(&mark->lock);
+		err = fsnotify_attach_connector_to_object(connp, inode, mnt);
+		if (err)
+			return err;
+		goto restart;
+	}
 
 	/* is mark the first mark? */
-	if (hlist_empty(head)) {
-		hlist_add_head_rcu(&mark->obj_list, head);
-		return 0;
+	if (hlist_empty(&conn->list)) {
+		hlist_add_head_rcu(&mark->obj_list, &conn->list);
+		goto added;
 	}
 
 	/* should mark be in the middle of the current list? */
-	hlist_for_each_entry(lmark, head, obj_list) {
+	hlist_for_each_entry(lmark, &conn->list, obj_list) {
 		last = lmark;
 
-		if ((lmark->group == mark->group) && !allow_dups)
-			return -EEXIST;
+		if ((lmark->group == mark->group) &&
+		    (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) &&
+		    !allow_dups) {
+			err = -EEXIST;
+			goto out_err;
+		}
 
 		cmp = fsnotify_compare_groups(lmark->group, mark->group);
 		if (cmp >= 0) {
 			hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
-			return 0;
+			goto added;
 		}
 	}
 
 	BUG_ON(last == NULL);
 	/* mark should be the last entry.  last is the current last entry */
 	hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
-	return 0;
+added:
+	mark->connector = conn;
+out_err:
+	spin_unlock(&conn->lock);
+	spin_unlock(&mark->lock);
+	return err;
 }
 
 /*
@@ -353,10 +566,10 @@ int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark,
  * These marks may be used for the fsnotify backend to determine which
  * event types should be delivered to which group.
  */
-int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
-			     struct fsnotify_group *group, struct inode *inode,
+int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct inode *inode,
 			     struct vfsmount *mnt, int allow_dups)
 {
+	struct fsnotify_group *group = mark->group;
 	int ret = 0;
 
 	BUG_ON(inode && mnt);
@@ -367,61 +580,42 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
 	 * LOCKING ORDER!!!!
 	 * group->mark_mutex
 	 * mark->lock
-	 * inode->i_lock
+	 * mark->connector->lock
 	 */
 	spin_lock(&mark->lock);
 	mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
 
-	fsnotify_get_group(group);
-	mark->group = group;
 	list_add(&mark->g_list, &group->marks_list);
 	atomic_inc(&group->num_marks);
-	fsnotify_get_mark(mark); /* for i_list and g_list */
-
-	if (inode) {
-		ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups);
-		if (ret)
-			goto err;
-	} else if (mnt) {
-		ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups);
-		if (ret)
-			goto err;
-	} else {
-		BUG();
-	}
-
-	/* this will pin the object if appropriate */
-	fsnotify_set_mark_mask_locked(mark, mark->mask);
+	fsnotify_get_mark(mark); /* for g_list */
 	spin_unlock(&mark->lock);
 
-	if (inode)
-		__fsnotify_update_child_dentry_flags(inode);
+	ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups);
+	if (ret)
+		goto err;
+
+	if (mark->mask)
+		fsnotify_recalc_mask(mark->connector);
 
 	return ret;
 err:
-	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
+	mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE |
+			 FSNOTIFY_MARK_FLAG_ATTACHED);
 	list_del_init(&mark->g_list);
-	fsnotify_put_group(group);
-	mark->group = NULL;
 	atomic_dec(&group->num_marks);
 
-	spin_unlock(&mark->lock);
-
-	spin_lock(&destroy_lock);
-	list_add(&mark->g_list, &destroy_list);
-	spin_unlock(&destroy_lock);
-	queue_delayed_work(system_unbound_wq, &reaper_work,
-				FSNOTIFY_REAPER_DELAY);
-
+	fsnotify_put_mark(mark);
 	return ret;
 }
 
-int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
-		      struct inode *inode, struct vfsmount *mnt, int allow_dups)
+int fsnotify_add_mark(struct fsnotify_mark *mark, struct inode *inode,
+		      struct vfsmount *mnt, int allow_dups)
 {
 	int ret;
+	struct fsnotify_group *group = mark->group;
+
 	mutex_lock(&group->mark_mutex);
-	ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups);
+	ret = fsnotify_add_mark_locked(mark, inode, mnt, allow_dups);
 	mutex_unlock(&group->mark_mutex);
 	return ret;
 }
@@ -430,29 +624,42 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
  * Given a list of marks, find the mark associated with given group. If found
  * take a reference to that mark and return it, else return NULL.
  */
-struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head,
-					 struct fsnotify_group *group)
+struct fsnotify_mark *fsnotify_find_mark(
+				struct fsnotify_mark_connector __rcu **connp,
+				struct fsnotify_group *group)
 {
+	struct fsnotify_mark_connector *conn;
 	struct fsnotify_mark *mark;
 
-	hlist_for_each_entry(mark, head, obj_list) {
-		if (mark->group == group) {
+	conn = fsnotify_grab_connector(connp);
+	if (!conn)
+		return NULL;
+
+	hlist_for_each_entry(mark, &conn->list, obj_list) {
+		if (mark->group == group &&
+		    (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
 			fsnotify_get_mark(mark);
+			spin_unlock(&conn->lock);
 			return mark;
 		}
 	}
+	spin_unlock(&conn->lock);
 	return NULL;
 }
 
-/*
- * clear any marks in a group in which mark->flags & flags is true
- */
-void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
-					 unsigned int flags)
+/* Clear any marks in a group with given type */
+void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
+				   unsigned int type)
 {
 	struct fsnotify_mark *lmark, *mark;
 	LIST_HEAD(to_free);
+	struct list_head *head = &to_free;
 
+	/* Skip selection step if we want to clear all marks. */
+	if (type == FSNOTIFY_OBJ_ALL_TYPES) {
+		head = &group->marks_list;
+		goto clear;
+	}
 	/*
 	 * We have to be really careful here. Anytime we drop mark_mutex, e.g.
 	 * fsnotify_clear_marks_by_inode() can come and free marks. Even in our
@@ -464,18 +671,19 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
 	 */
 	mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
 	list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
-		if (mark->flags & flags)
+		if (mark->connector->flags & type)
 			list_move(&mark->g_list, &to_free);
 	}
 	mutex_unlock(&group->mark_mutex);
 
+clear:
 	while (1) {
 		mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
-		if (list_empty(&to_free)) {
+		if (list_empty(head)) {
 			mutex_unlock(&group->mark_mutex);
 			break;
 		}
-		mark = list_first_entry(&to_free, struct fsnotify_mark, g_list);
+		mark = list_first_entry(head, struct fsnotify_mark, g_list);
 		fsnotify_get_mark(mark);
 		fsnotify_detach_mark(mark);
 		mutex_unlock(&group->mark_mutex);
@@ -484,49 +692,62 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group,
 	}
 }
 
-/*
- * Given a group, prepare for freeing all the marks associated with that group.
- * The marks are attached to the list of marks prepared for destruction, the
- * caller is responsible for freeing marks in that list after SRCU period has
- * ended.
- */
-void fsnotify_detach_group_marks(struct fsnotify_group *group)
+/* Destroy all marks attached to inode / vfsmount */
+void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp)
 {
-	struct fsnotify_mark *mark;
+	struct fsnotify_mark_connector *conn;
+	struct fsnotify_mark *mark, *old_mark = NULL;
+	struct inode *inode;
 
-	while (1) {
-		mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
-		if (list_empty(&group->marks_list)) {
-			mutex_unlock(&group->mark_mutex);
-			break;
-		}
-		mark = list_first_entry(&group->marks_list,
-					struct fsnotify_mark, g_list);
+	conn = fsnotify_grab_connector(connp);
+	if (!conn)
+		return;
+	/*
+	 * We have to be careful since we can race with e.g.
+	 * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the
+	 * list can get modified. However we are holding mark reference and
+	 * thus our mark cannot be removed from obj_list so we can continue
+	 * iteration after regaining conn->lock.
+	 */
+	hlist_for_each_entry(mark, &conn->list, obj_list) {
 		fsnotify_get_mark(mark);
-		fsnotify_detach_mark(mark);
-		mutex_unlock(&group->mark_mutex);
-		__fsnotify_free_mark(mark);
-		fsnotify_put_mark(mark);
+		spin_unlock(&conn->lock);
+		if (old_mark)
+			fsnotify_put_mark(old_mark);
+		old_mark = mark;
+		fsnotify_destroy_mark(mark, mark->group);
+		spin_lock(&conn->lock);
 	}
+	/*
+	 * Detach list from object now so that we don't pin inode until all
+	 * mark references get dropped. It would lead to strange results such
+	 * as delaying inode deletion or blocking unmount.
+	 */
+	inode = fsnotify_detach_connector_from_object(conn);
+	spin_unlock(&conn->lock);
+	if (old_mark)
+		fsnotify_put_mark(old_mark);
+	iput(inode);
 }
 
 /*
  * Nothing fancy, just initialize lists and locks and counters.
  */
 void fsnotify_init_mark(struct fsnotify_mark *mark,
-			void (*free_mark)(struct fsnotify_mark *mark))
+			struct fsnotify_group *group)
 {
 	memset(mark, 0, sizeof(*mark));
 	spin_lock_init(&mark->lock);
 	atomic_set(&mark->refcnt, 1);
-	mark->free_mark = free_mark;
+	fsnotify_get_group(group);
+	mark->group = group;
 }
 
 /*
  * Destroy all marks in destroy_list, waits for SRCU period to finish before
  * actually freeing marks.
  */
-void fsnotify_mark_destroy_list(void)
+static void fsnotify_mark_destroy_workfn(struct work_struct *work)
 {
 	struct fsnotify_mark *mark, *next;
 	struct list_head private_destroy_list;
@@ -540,11 +761,12 @@ void fsnotify_mark_destroy_list(void)
 
 	list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
 		list_del_init(&mark->g_list);
-		fsnotify_put_mark(mark);
+		fsnotify_final_mark_destroy(mark);
 	}
 }
 
-static void fsnotify_mark_destroy_workfn(struct work_struct *work)
+/* Wait for all marks queued for destruction to be actually destroyed */
+void fsnotify_wait_marks_destroyed(void)
 {
-	fsnotify_mark_destroy_list();
+	flush_delayed_work(&reaper_work);
 }
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
deleted file mode 100644
index a8fcab68faef..000000000000
--- a/fs/notify/vfsmount_mark.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2, or (at your option)
- *  any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; see the file COPYING.  If not, write to
- *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mount.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include <linux/atomic.h>
-
-#include <linux/fsnotify_backend.h>
-#include "fsnotify.h"
-
-void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
-{
-	fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
-}
-
-/*
- * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types
- * any notifier is interested in hearing for this mount point
- */
-void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
-{
-	struct mount *m = real_mount(mnt);
-
-	spin_lock(&mnt->mnt_root->d_lock);
-	m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
-	spin_unlock(&mnt->mnt_root->d_lock);
-}
-
-void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
-{
-	struct vfsmount *mnt = mark->mnt;
-	struct mount *m = real_mount(mnt);
-
-	BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
-	assert_spin_locked(&mark->lock);
-
-	spin_lock(&mnt->mnt_root->d_lock);
-
-	hlist_del_init_rcu(&mark->obj_list);
-	mark->mnt = NULL;
-
-	m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
-	spin_unlock(&mnt->mnt_root->d_lock);
-}
-
-/*
- * given a group and vfsmount, find the mark associated with that combination.
- * if found take a reference to that mark and return it, else return NULL
- */
-struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
-						  struct vfsmount *mnt)
-{
-	struct mount *m = real_mount(mnt);
-	struct fsnotify_mark *mark;
-
-	spin_lock(&mnt->mnt_root->d_lock);
-	mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group);
-	spin_unlock(&mnt->mnt_root->d_lock);
-
-	return mark;
-}
-
-/*
- * Attach an initialized mark to a given group and vfsmount.
- * These marks may be used for the fsnotify backend to determine which
- * event types should be delivered to which groups.
- */
-int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
-			       struct fsnotify_group *group, struct vfsmount *mnt,
-			       int allow_dups)
-{
-	struct mount *m = real_mount(mnt);
-	int ret;
-
-	mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT;
-
-	BUG_ON(!mutex_is_locked(&group->mark_mutex));
-	assert_spin_locked(&mark->lock);
-
-	spin_lock(&mnt->mnt_root->d_lock);
-	mark->mnt = mnt;
-	ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups);
-	m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
-	spin_unlock(&mnt->mnt_root->d_lock);
-
-	return ret;
-}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f6e871760f8d..0da0332725aa 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2242,13 +2242,13 @@ unlock:
 	spin_unlock(&o2hb_live_lock);
 }
 
-static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item,
+static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item,
 		char *page)
 {
 	return sprintf(page, "%u\n", o2hb_dead_threshold);
 }
 
-static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item,
+static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item,
 		const char *page, size_t count)
 {
 	unsigned long tmp;
@@ -2297,11 +2297,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item,
 
 }
 
-CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold);
+CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold);
 CONFIGFS_ATTR(o2hb_heartbeat_group_, mode);
 
 static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = {
-	&o2hb_heartbeat_group_attr_threshold,
+	&o2hb_heartbeat_group_attr_dead_threshold,
 	&o2hb_heartbeat_group_attr_mode,
 	NULL,
 };
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d0ab7e56d0b4..8d779227370a 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -450,9 +450,8 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
 	INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc);
 	INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req);
 
-	init_timer(&sc->sc_idle_timeout);
-	sc->sc_idle_timeout.function = o2net_idle_timer;
-	sc->sc_idle_timeout.data = (unsigned long)sc;
+	setup_timer(&sc->sc_idle_timeout, o2net_idle_timer,
+		    (unsigned long)sc);
 
 	sclog(sc, "alloced\n");
 
@@ -956,7 +955,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc,
 		mutex_lock(&sc->sc_send_lock);
 		ret = sc->sc_sock->ops->sendpage(sc->sc_sock,
 						 virt_to_page(kmalloced_virt),
-						 (long)kmalloced_virt & ~PAGE_MASK,
+						 offset_in_page(kmalloced_virt),
 						 size, MSG_DONTWAIT);
 		mutex_unlock(&sc->sc_send_lock);
 		if (ret == size)
@@ -1460,27 +1459,10 @@ static void o2net_rx_until_empty(struct work_struct *work)
 
 static int o2net_set_nodelay(struct socket *sock)
 {
-	int ret, val = 1;
-	mm_segment_t oldfs;
+	int val = 1;
 
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-
-	/*
-	 * Dear unsuspecting programmer,
-	 *
-	 * Don't use sock_setsockopt() for SOL_TCP.  It doesn't check its level
-	 * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will
-	 * silently turn into SO_DEBUG.
-	 *
-	 * Yours,
-	 * Keeper of hilariously fragile interfaces.
-	 */
-	ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY,
-				    (char __user *)&val, sizeof(val));
-
-	set_fs(oldfs);
-	return ret;
+	return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+				    (void *)&val, sizeof(val));
 }
 
 static int o2net_set_usertimeout(struct socket *sock)
@@ -1488,7 +1470,7 @@ static int o2net_set_usertimeout(struct socket *sock)
 	int user_timeout = O2NET_TCP_USER_TIMEOUT;
 
 	return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
-				(char *)&user_timeout, sizeof(user_timeout));
+				(void *)&user_timeout, sizeof(user_timeout));
 }
 
 static void o2net_initialize_handshake(void)
diff --git a/fs/open.c b/fs/open.c
index 949cef29c3bb..4d23f729dcc6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1078,6 +1078,26 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
 	return do_sys_open(dfd, filename, flags, mode);
 }
 
+#ifdef CONFIG_COMPAT
+/*
+ * Exactly like sys_open(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
+{
+	return do_sys_open(AT_FDCWD, filename, flags, mode);
+}
+
+/*
+ * Exactly like sys_openat(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
+{
+	return do_sys_open(dfd, filename, flags, mode);
+}
+#endif
+
 #ifndef __alpha__
 
 /*
diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index e1534c9bab16..c19f0787c9c6 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -180,6 +180,10 @@ static ssize_t orangefs_devreq_read(struct file *file,
 		return -EINVAL;
 	}
 
+	/* Check for an empty list before locking. */
+	if (list_empty(&orangefs_request_list))
+		return -EAGAIN;
+
 restart:
 	/* Get next op (if any) from top of list. */
 	spin_lock(&orangefs_request_list_lock);
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index 284373a57a08..d327cbd17756 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -1,396 +1,404 @@
 /*
- * (C) 2001 Clemson University and The University of Chicago
- *
- * See COPYING in top-level directory.
+ * Copyright 2017 Omnibond Systems, L.L.C.
  */
 
 #include "protocol.h"
 #include "orangefs-kernel.h"
 #include "orangefs-bufmap.h"
 
+struct orangefs_dir_part {
+	struct orangefs_dir_part *next;
+	size_t len;
+};
+
+struct orangefs_dir {
+	__u64 token;
+	struct orangefs_dir_part *part;
+	loff_t end;
+	int error;
+};
+
+#define PART_SHIFT (24)
+#define PART_SIZE (1<<24)
+#define PART_MASK (~(PART_SIZE - 1))
+
 /*
- * decode routine used by kmod to deal with the blob sent from
- * userspace for readdirs. The blob contains zero or more of these
- * sub-blobs:
- *   __u32 - represents length of the character string that follows.
- *   string - between 1 and ORANGEFS_NAME_MAX bytes long.
- *   padding - (if needed) to cause the __u32 plus the string to be
- *             eight byte aligned.
- *   khandle - sizeof(khandle) bytes.
+ * There can be up to 512 directory entries.  Each entry is encoded as
+ * follows:
+ * 4 bytes: string size (n)
+ * n bytes: string
+ * 1 byte: trailing zero
+ * padding to 8 bytes
+ * 16 bytes: khandle
+ * padding to 8 bytes
+ *
+ * The trailer_buf starts with a struct orangefs_readdir_response_s
+ * which must be skipped to get to the directory data.
+ *
+ * The data which is received from the userspace daemon is termed a
+ * part and is stored in a linked list in case more than one part is
+ * needed for a large directory.
+ *
+ * The position pointer (ctx->pos) encodes the part and offset at which
+ * to begin reading.  Bits above PART_SHIFT encode the part and bits
+ * below PART_SHIFT encode the offset.  Parts are stored in a linked
+ * list which grows as data is received from the server.  The overhead
+ * associated with managing the list is presumed to be small compared to
+ * the overhead of communicating with the server.
+ *
+ * As data is received from the server, it is placed at the end of the
+ * part list.  Data is parsed from the current position as it is needed.
+ * When data is determined to be corrupt, it is either because the
+ * userspace component has sent back corrupt data or because the file
+ * pointer has been moved to an invalid location.  Since the two cannot
+ * be differentiated, return EIO.
+ *
+ * Part zero is synthesized to contain `.' and `..'.  Part one is the
+ * first part of the part list.
  */
-static long decode_dirents(char *ptr, size_t size,
-                           struct orangefs_readdir_response_s *readdir)
+
+static int do_readdir(struct orangefs_inode_s *oi,
+    struct orangefs_dir *od, struct dentry *dentry,
+    struct orangefs_kernel_op_s *op)
 {
-	int i;
-	struct orangefs_readdir_response_s *rd =
-		(struct orangefs_readdir_response_s *) ptr;
-	char *buf = ptr;
-	int khandle_size = sizeof(struct orangefs_khandle);
-	size_t offset = offsetof(struct orangefs_readdir_response_s,
-				dirent_array);
-	/* 8 reflects eight byte alignment */
-	int smallest_blob = khandle_size + 8;
-	__u32 len;
-	int aligned_len;
-	int sizeof_u32 = sizeof(__u32);
-	long ret;
-
-	gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size);
-
-	/* size is = offset on empty dirs, > offset on non-empty dirs... */
-	if (size < offset) {
-		gossip_err("%s: size:%zu: offset:%zu:\n",
-			   __func__,
-			   size,
-			   offset);
-		ret = -EINVAL;
-		goto out;
-	}
+	struct orangefs_readdir_response_s *resp;
+	int bufi, r;
 
-        if ((size == offset) && (readdir->orangefs_dirent_outcount != 0)) {
-		gossip_err("%s: size:%zu: dirent_outcount:%d:\n",
-			   __func__,
-			   size,
-			   readdir->orangefs_dirent_outcount);
-		ret = -EINVAL;
-		goto out;
-	}
+	/*
+	 * Despite the badly named field, readdir does not use shared
+	 * memory.  However, there are a limited number of readdir
+	 * slots, which must be allocated here.  This flag simply tells
+	 * the op scheduler to return the op here for retry.
+	 */
+	op->uses_shared_memory = 1;
+	op->upcall.req.readdir.refn = oi->refn;
+	op->upcall.req.readdir.token = od->token;
+	op->upcall.req.readdir.max_dirent_count =
+	    ORANGEFS_MAX_DIRENT_COUNT_READDIR;
 
-	readdir->token = rd->token;
-	readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount;
-	readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount,
-					sizeof(*readdir->dirent_array),
-					GFP_KERNEL);
-	if (readdir->dirent_array == NULL) {
-		gossip_err("%s: kcalloc failed.\n", __func__);
-		ret = -ENOMEM;
-		goto out;
+again:
+	bufi = orangefs_readdir_index_get();
+	if (bufi < 0) {
+		od->error = bufi;
+		return bufi;
 	}
 
-	buf += offset;
-	size -= offset;
-
-	for (i = 0; i < readdir->orangefs_dirent_outcount; i++) {
-		if (size < smallest_blob) {
-			gossip_err("%s: size:%zu: smallest_blob:%d:\n",
-				   __func__,
-				   size,
-				   smallest_blob);
-			ret = -EINVAL;
-			goto free;
-		}
+	op->upcall.req.readdir.buf_index = bufi;
 
-		len = *(__u32 *)buf;
-		if ((len < 1) || (len > ORANGEFS_NAME_MAX)) {
-			gossip_err("%s: len:%d:\n", __func__, len);
-			ret = -EINVAL;
-			goto free;
-		}
+	r = service_operation(op, "orangefs_readdir",
+	    get_interruptible_flag(dentry->d_inode));
 
-		gossip_debug(GOSSIP_DIR_DEBUG,
-			     "%s: size:%zu: len:%d:\n",
-			     __func__,
-			     size,
-			     len);
+	orangefs_readdir_index_put(bufi);
 
-		readdir->dirent_array[i].d_name = buf + sizeof_u32;
-		readdir->dirent_array[i].d_length = len;
+	if (op_state_purged(op)) {
+		if (r == -EAGAIN) {
+			vfree(op->downcall.trailer_buf);
+			goto again;
+		} else if (r == -EIO) {
+			vfree(op->downcall.trailer_buf);
+			od->error = r;
+			return r;
+		}
+	}
 
-		/*
-		 * Calculate "aligned" length of this string and its
-		 * associated __u32 descriptor.
-		 */
-		aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7;
-		gossip_debug(GOSSIP_DIR_DEBUG,
-			     "%s: aligned_len:%d:\n",
-			     __func__,
-			     aligned_len);
+	if (r < 0) {
+		vfree(op->downcall.trailer_buf);
+		od->error = r;
+		return r;
+	} else if (op->downcall.status) {
+		vfree(op->downcall.trailer_buf);
+		od->error = op->downcall.status;
+		return op->downcall.status;
+	}
 
-		/*
-		 * The end of the blob should coincide with the end
-		 * of the last sub-blob.
-		 */
-		if (size < aligned_len + khandle_size) {
-			gossip_err("%s: ran off the end of the blob.\n",
-				   __func__);
-			ret = -EINVAL;
-			goto free;
-		}
-		size -= aligned_len + khandle_size;
+	/*
+	 * The maximum size is size per entry times the 512 entries plus
+	 * the header.  This is well under the limit.
+	 */
+	if (op->downcall.trailer_size > PART_SIZE) {
+		vfree(op->downcall.trailer_buf);
+		od->error = -EIO;
+		return -EIO;
+	}
 
-		buf += aligned_len;
+	resp = (struct orangefs_readdir_response_s *)
+	    op->downcall.trailer_buf;
+	od->token = resp->token;
+	return 0;
+}
 
-		readdir->dirent_array[i].khandle =
-			*(struct orangefs_khandle *) buf;
-		buf += khandle_size;
+static int parse_readdir(struct orangefs_dir *od,
+    struct orangefs_kernel_op_s *op)
+{
+	struct orangefs_dir_part *part, *new;
+	size_t count;
+
+	count = 1;
+	part = od->part;
+	while (part) {
+		count++;
+		if (part->next)
+			part = part->next;
+		else
+			break;
 	}
-	ret = buf - ptr;
-	gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret);
-	goto out;
 
-free:
-	kfree(readdir->dirent_array);
-	readdir->dirent_array = NULL;
+	new = (void *)op->downcall.trailer_buf;
+	new->next = NULL;
+	new->len = op->downcall.trailer_size -
+	    sizeof(struct orangefs_readdir_response_s);
+	if (!od->part)
+		od->part = new;
+	else
+		part->next = new;
+	count++;
+	od->end = count << PART_SHIFT;
 
-out:
-	return ret;
+	return 0;
 }
 
-/*
- * Read directory entries from an instance of an open directory.
- */
-static int orangefs_readdir(struct file *file, struct dir_context *ctx)
+static int orangefs_dir_more(struct orangefs_inode_s *oi,
+    struct orangefs_dir *od, struct dentry *dentry)
 {
-	int ret = 0;
-	int buffer_index;
-	/*
-	 * ptoken supports Orangefs' distributed directory logic, added
-	 * in 2.9.2.
-	 */
-	__u64 *ptoken = file->private_data;
-	__u64 pos = 0;
-	ino_t ino = 0;
-	struct dentry *dentry = file->f_path.dentry;
-	struct orangefs_kernel_op_s *new_op = NULL;
-	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode);
-	struct orangefs_readdir_response_s readdir_response;
-	void *dents_buf;
-	int i = 0;
-	int len = 0;
-	ino_t current_ino = 0;
-	char *current_entry = NULL;
-	long bytes_decoded;
-
-	gossip_debug(GOSSIP_DIR_DEBUG,
-		     "%s: ctx->pos:%lld, ptoken = %llu\n",
-		     __func__,
-		     lld(ctx->pos),
-		     llu(*ptoken));
-
-	pos = (__u64) ctx->pos;
-
-	/* are we done? */
-	if (pos == ORANGEFS_READDIR_END) {
-		gossip_debug(GOSSIP_DIR_DEBUG,
-			     "Skipping to termination path\n");
-		return 0;
+	struct orangefs_kernel_op_s *op;
+	int r;
+
+	op = op_alloc(ORANGEFS_VFS_OP_READDIR);
+	if (!op) {
+		od->error = -ENOMEM;
+		return -ENOMEM;
+	}
+	r = do_readdir(oi, od, dentry, op);
+	if (r) {
+		od->error = r;
+		goto out;
+	}
+	r = parse_readdir(od, op);
+	if (r) {
+		od->error = r;
+		goto out;
 	}
 
-	gossip_debug(GOSSIP_DIR_DEBUG,
-		     "orangefs_readdir called on %pd (pos=%llu)\n",
-		     dentry, llu(pos));
+	od->error = 0;
+out:
+	op_release(op);
+	return od->error;
+}
 
-	memset(&readdir_response, 0, sizeof(readdir_response));
+static int fill_from_part(struct orangefs_dir_part *part,
+    struct dir_context *ctx)
+{
+	const int offset = sizeof(struct orangefs_readdir_response_s);
+	struct orangefs_khandle *khandle;
+	__u32 *len, padlen;
+	loff_t i;
+	char *s;
+	i = ctx->pos & ~PART_MASK;
 
-	new_op = op_alloc(ORANGEFS_VFS_OP_READDIR);
-	if (!new_op)
-		return -ENOMEM;
+	/* The file offset from userspace is too large. */
+	if (i > part->len)
+		return 1;
 
 	/*
-	 * Only the indices are shared. No memory is actually shared, but the
-	 * mechanism is used.
+	 * If the seek pointer is positioned just before an entry it
+	 * should find the next entry.
 	 */
-	new_op->uses_shared_memory = 1;
-	new_op->upcall.req.readdir.refn = orangefs_inode->refn;
-	new_op->upcall.req.readdir.max_dirent_count =
-	    ORANGEFS_MAX_DIRENT_COUNT_READDIR;
-
-	gossip_debug(GOSSIP_DIR_DEBUG,
-		     "%s: upcall.req.readdir.refn.khandle: %pU\n",
-		     __func__,
-		     &new_op->upcall.req.readdir.refn.khandle);
+	if (i % 8)
+		i = i + (8 - i%8)%8;
 
-	new_op->upcall.req.readdir.token = *ptoken;
-
-get_new_buffer_index:
-	buffer_index = orangefs_readdir_index_get();
-	if (buffer_index < 0) {
-		ret = buffer_index;
-		gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n",
-			    ret);
-		goto out_free_op;
+	while (i < part->len) {
+		if (part->len < i + sizeof *len)
+			break;
+		len = (void *)part + offset + i;
+		/*
+		 * len is the size of the string itself.  padlen is the
+		 * total size of the encoded string.
+		 */
+		padlen = (sizeof *len + *len + 1) +
+		    (8 - (sizeof *len + *len + 1)%8)%8;
+		if (part->len < i + padlen + sizeof *khandle)
+			goto next;
+		s = (void *)part + offset + i + sizeof *len;
+		if (s[*len] != 0)
+			goto next;
+		khandle = (void *)part + offset + i + padlen;
+		if (!dir_emit(ctx, s, *len,
+		    orangefs_khandle_to_ino(khandle),
+		    DT_UNKNOWN))
+			return 0;
+		i += padlen + sizeof *khandle;
+		i = i + (8 - i%8)%8;
+		BUG_ON(i > part->len);
+		ctx->pos = (ctx->pos & PART_MASK) | i;
+		continue;
+next:
+		i += 8;
 	}
-	new_op->upcall.req.readdir.buf_index = buffer_index;
-
-	ret = service_operation(new_op,
-				"orangefs_readdir",
-				get_interruptible_flag(dentry->d_inode));
+	return 1;
+}
 
-	gossip_debug(GOSSIP_DIR_DEBUG,
-		     "Readdir downcall status is %d.  ret:%d\n",
-		     new_op->downcall.status,
-		     ret);
+static int orangefs_dir_fill(struct orangefs_inode_s *oi,
+    struct orangefs_dir *od, struct dentry *dentry,
+    struct dir_context *ctx)
+{
+	struct orangefs_dir_part *part;
+	size_t count;
 
-	orangefs_readdir_index_put(buffer_index);
+	count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1;
 
-	if (ret == -EAGAIN && op_state_purged(new_op)) {
-		/* Client-core indices are invalid after it restarted. */
-		gossip_debug(GOSSIP_DIR_DEBUG,
-			"%s: Getting new buffer_index for retry of readdir..\n",
-			 __func__);
-		goto get_new_buffer_index;
+	part = od->part;
+	while (part->next && count) {
+		count--;
+		part = part->next;
 	}
-
-	if (ret == -EIO && op_state_purged(new_op)) {
-		gossip_err("%s: Client is down. Aborting readdir call.\n",
-			__func__);
-		goto out_free_op;
+	/* This means the userspace file offset is invalid. */
+	if (count) {
+		od->error = -EIO;
+		return -EIO;
 	}
 
-	if (ret < 0 || new_op->downcall.status != 0) {
-		gossip_debug(GOSSIP_DIR_DEBUG,
-			     "Readdir request failed.  Status:%d\n",
-			     new_op->downcall.status);
-		if (ret >= 0)
-			ret = new_op->downcall.status;
-		goto out_free_op;
+	while (part && part->len) {
+		int r;
+		r = fill_from_part(part, ctx);
+		if (r < 0) {
+			od->error = r;
+			return r;
+		} else if (r == 0) {
+			/* Userspace buffer is full. */
+			break;
+		} else {
+			/*
+			 * The part ran out of data.  Move to the next
+			 * part. */
+			ctx->pos = (ctx->pos & PART_MASK) +
+			    (1 << PART_SHIFT);
+			part = part->next;
+		}
 	}
+	return 0;
+}
 
-	dents_buf = new_op->downcall.trailer_buf;
-	if (dents_buf == NULL) {
-		gossip_err("Invalid NULL buffer in readdir response\n");
-		ret = -ENOMEM;
-		goto out_free_op;
+static loff_t orangefs_dir_llseek(struct file *file, loff_t offset,
+    int whence)
+{
+	struct orangefs_dir *od = file->private_data;
+	/*
+	 * Delete the stored data so userspace sees new directory
+	 * entries.
+	 */
+	if (!whence && offset < od->end) {
+		struct orangefs_dir_part *part = od->part;
+		while (part) {
+			struct orangefs_dir_part *next = part->next;
+			vfree(part);
+			part = next;
+		}
+		od->token = ORANGEFS_ITERATE_START;
+		od->part = NULL;
+		od->end = 1 << PART_SHIFT;
 	}
+	return default_llseek(file, offset, whence);
+}
 
-	bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size,
-					&readdir_response);
-	if (bytes_decoded < 0) {
-		ret = bytes_decoded;
-		gossip_err("Could not decode readdir from buffer %d\n", ret);
-		goto out_vfree;
-	}
+static int orangefs_dir_iterate(struct file *file,
+    struct dir_context *ctx)
+{
+	struct orangefs_inode_s *oi;
+	struct orangefs_dir *od;
+	struct dentry *dentry;
+	int r;
 
-	if (bytes_decoded != new_op->downcall.trailer_size) {
-		gossip_err("orangefs_readdir: # bytes decoded (%ld) "
-			   "!= trailer size (%ld)\n",
-			   bytes_decoded,
-			   (long)new_op->downcall.trailer_size);
-		ret = -EINVAL;
-		goto out_destroy_handle;
-	}
+	dentry = file->f_path.dentry;
+	oi = ORANGEFS_I(dentry->d_inode);
+	od = file->private_data;
 
-	/*
-	 *  orangefs doesn't actually store dot and dot-dot, but
-	 *  we need to have them represented.
-	 */
-	if (pos == 0) {
-		ino = get_ino_from_khandle(dentry->d_inode);
-		gossip_debug(GOSSIP_DIR_DEBUG,
-			     "%s: calling dir_emit of \".\" with pos = %llu\n",
-			     __func__,
-			     llu(pos));
-		ret = dir_emit(ctx, ".", 1, ino, DT_DIR);
-		pos += 1;
-	}
+	if (od->error)
+		return od->error;
 
-	if (pos == 1) {
-		ino = get_parent_ino_from_dentry(dentry);
-		gossip_debug(GOSSIP_DIR_DEBUG,
-			     "%s: calling dir_emit of \"..\" with pos = %llu\n",
-			     __func__,
-			     llu(pos));
-		ret = dir_emit(ctx, "..", 2, ino, DT_DIR);
-		pos += 1;
+	if (ctx->pos == 0) {
+		if (!dir_emit_dot(file, ctx))
+			return 0;
+		ctx->pos++;
+	}
+	if (ctx->pos == 1) {
+		if (!dir_emit_dotdot(file, ctx))
+			return 0;
+		ctx->pos = 1 << PART_SHIFT;
 	}
 
 	/*
-	 * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around
-	 * to prevent "finding" dot and dot-dot on any iteration
-	 * other than the first.
+	 * The seek position is in the first synthesized part but is not
+	 * valid.
 	 */
-	if (ctx->pos == ORANGEFS_ITERATE_NEXT)
-		ctx->pos = 0;
-
-	gossip_debug(GOSSIP_DIR_DEBUG,
-		     "%s: dirent_outcount:%d:\n",
-		     __func__,
-		     readdir_response.orangefs_dirent_outcount);
-	for (i = ctx->pos;
-	     i < readdir_response.orangefs_dirent_outcount;
-	     i++) {
-		len = readdir_response.dirent_array[i].d_length;
-		current_entry = readdir_response.dirent_array[i].d_name;
-		current_ino = orangefs_khandle_to_ino(
-			&readdir_response.dirent_array[i].khandle);
-
-		gossip_debug(GOSSIP_DIR_DEBUG,
-			     "calling dir_emit for %s with len %d"
-			     ", ctx->pos %ld\n",
-			     current_entry,
-			     len,
-			     (unsigned long)ctx->pos);
-		/*
-		 * type is unknown. We don't return object type
-		 * in the dirent_array. This leaves getdents
-		 * clueless about type.
-		 */
-		ret =
-		    dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN);
-		if (!ret)
-			break;
-		ctx->pos++;
-		gossip_debug(GOSSIP_DIR_DEBUG,
-			      "%s: ctx->pos:%lld\n",
-			      __func__,
-			      lld(ctx->pos));
+	if ((ctx->pos & PART_MASK) == 0)
+		return -EIO;
 
-	}
+	r = 0;
 
 	/*
-	 * we ran all the way through the last batch, set up for
-	 * getting another batch...
+	 * Must read more if the user has sought past what has been read
+	 * so far.  Stop a user who has sought past the end.
 	 */
-	if (ret) {
-		*ptoken = readdir_response.token;
-		ctx->pos = ORANGEFS_ITERATE_NEXT;
+	while (od->token != ORANGEFS_ITERATE_END &&
+	    ctx->pos > od->end) {
+		r = orangefs_dir_more(oi, od, dentry);
+		if (r)
+			return r;
+	}
+	if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end)
+		return -EIO;
+
+	/* Then try to fill if there's any left in the buffer. */
+	if (ctx->pos < od->end) {
+		r = orangefs_dir_fill(oi, od, dentry, ctx);
+		if (r)
+			return r;
 	}
 
-	/*
-	 * Did we hit the end of the directory?
-	 */
-	if (readdir_response.token == ORANGEFS_READDIR_END) {
-		gossip_debug(GOSSIP_DIR_DEBUG,
-		"End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n");
-		ctx->pos = ORANGEFS_READDIR_END;
+	/* Finally get some more and try to fill. */
+	if (od->token != ORANGEFS_ITERATE_END) {
+		r = orangefs_dir_more(oi, od, dentry);
+		if (r)
+			return r;
+		r = orangefs_dir_fill(oi, od, dentry, ctx);
 	}
 
-out_destroy_handle:
-	/* kfree(NULL) is safe */
-	kfree(readdir_response.dirent_array);
-out_vfree:
-	gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf);
-	vfree(dents_buf);
-out_free_op:
-	op_release(new_op);
-	gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret);
-	return ret;
+	return r;
 }
 
 static int orangefs_dir_open(struct inode *inode, struct file *file)
 {
-	__u64 *ptoken;
-
-	file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL);
+	struct orangefs_dir *od;
+	file->private_data = kmalloc(sizeof(struct orangefs_dir),
+	    GFP_KERNEL);
 	if (!file->private_data)
 		return -ENOMEM;
-
-	ptoken = file->private_data;
-	*ptoken = ORANGEFS_READDIR_START;
+	od = file->private_data;
+	od->token = ORANGEFS_ITERATE_START;
+	od->part = NULL;
+	od->end = 1 << PART_SHIFT;
+	od->error = 0;
 	return 0;
 }
 
 static int orangefs_dir_release(struct inode *inode, struct file *file)
 {
+	struct orangefs_dir *od = file->private_data;
+	struct orangefs_dir_part *part = od->part;
 	orangefs_flush_inode(inode);
-	kfree(file->private_data);
+	while (part) {
+		struct orangefs_dir_part *next = part->next;
+		vfree(part);
+		part = next;
+	}
+	kfree(od);
 	return 0;
 }
 
-/** ORANGEFS implementation of VFS directory operations */
 const struct file_operations orangefs_dir_operations = {
+	.llseek = orangefs_dir_llseek,
 	.read = generic_read_dir,
-	.iterate = orangefs_readdir,
+	.iterate = orangefs_dir_iterate,
 	.open = orangefs_dir_open,
-	.release = orangefs_dir_release,
+	.release = orangefs_dir_release
 };
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
index 3b8923f8bf21..163001c95501 100644
--- a/fs/orangefs/downcall.h
+++ b/fs/orangefs/downcall.h
@@ -40,16 +40,6 @@ struct orangefs_mkdir_response {
 	struct orangefs_object_kref refn;
 };
 
-/*
- * duplication of some system interface structures so that I don't have
- * to allocate extra memory
- */
-struct orangefs_dirent {
-	char *d_name;
-	int d_length;
-	struct orangefs_khandle khandle;
-};
-
 struct orangefs_statfs_response {
 	__s64 block_size;
 	__s64 blocks_total;
@@ -131,12 +121,16 @@ struct orangefs_downcall_s {
 	} resp;
 };
 
+/*
+ * The readdir response comes in the trailer.  It is followed by the
+ * directory entries as described in dir.c.
+ */
+
 struct orangefs_readdir_response_s {
 	__u64 token;
 	__u64 directory_version;
 	__u32 __pad2;
 	__u32 orangefs_dirent_outcount;
-	struct orangefs_dirent *dirent_array;
 };
 
 #endif /* __DOWNCALL_H */
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index e6bbc8083d77..28f38d813ad2 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -114,7 +114,6 @@ static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inod
 	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
 	struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
 	struct orangefs_kernel_op_s *new_op = NULL;
-	struct iov_iter saved = *iter;
 	int buffer_index = -1;
 	ssize_t ret;
 
@@ -193,7 +192,7 @@ populate_shared_memory:
 		orangefs_bufmap_put(buffer_index);
 		buffer_index = -1;
 		if (type == ORANGEFS_IO_WRITE)
-			*iter = saved;
+			iov_iter_revert(iter, total_size);
 		gossip_debug(GOSSIP_FILE_DEBUG,
 			     "%s:going to repopulate_shared_memory.\n",
 			     __func__);
@@ -475,7 +474,8 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite
 
 	/* Make sure generic_write_checks sees an up to date inode size. */
 	if (file->f_flags & O_APPEND) {
-		rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
+		rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
+		    STATX_SIZE);
 		if (rc == -ESTALE)
 			rc = -EIO;
 		if (rc) {
@@ -693,7 +693,8 @@ static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
 		 * NOTE: We are only interested in file size here,
 		 * so we set mask accordingly.
 		 */
-		ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1);
+		ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
+		    STATX_SIZE);
 		if (ret == -ESTALE)
 			ret = -EIO;
 		if (ret) {
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index a304bf34b212..9428ea0aac16 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -161,7 +161,7 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
 		     iattr->ia_size);
 
 	/* Ensure that we have a up to date size, so we know if it changed. */
-	ret = orangefs_inode_getattr(inode, 0, 1);
+	ret = orangefs_inode_getattr(inode, 0, 1, STATX_SIZE);
 	if (ret == -ESTALE)
 		ret = -EIO;
 	if (ret) {
@@ -218,8 +218,7 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (ret)
 		goto out;
 
-	if ((iattr->ia_valid & ATTR_SIZE) &&
-	    iattr->ia_size != i_size_read(inode)) {
+	if (iattr->ia_valid & ATTR_SIZE) {
 		ret = orangefs_setattr_size(inode, iattr);
 		if (ret)
 			goto out;
@@ -256,13 +255,19 @@ int orangefs_getattr(const struct path *path, struct kstat *stat,
 		     "orangefs_getattr: called on %pd\n",
 		     path->dentry);
 
-	ret = orangefs_inode_getattr(inode, 0, 0);
+	ret = orangefs_inode_getattr(inode, 0, 0, request_mask);
 	if (ret == 0) {
 		generic_fillattr(inode, stat);
 
 		/* override block size reported to stat */
 		orangefs_inode = ORANGEFS_I(inode);
 		stat->blksize = orangefs_inode->blksize;
+
+		if (request_mask & STATX_SIZE)
+			stat->result_mask = STATX_BASIC_STATS;
+		else
+			stat->result_mask = STATX_BASIC_STATS &
+			    ~STATX_SIZE;
 	}
 	return ret;
 }
@@ -277,7 +282,7 @@ int orangefs_permission(struct inode *inode, int mask)
 	gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__);
 
 	/* Make sure the permission (and other common attrs) are up to date. */
-	ret = orangefs_inode_getattr(inode, 0, 0);
+	ret = orangefs_inode_getattr(inode, 0, 0, STATX_MODE);
 	if (ret < 0)
 		return ret;
 
@@ -375,7 +380,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref
 	if (!inode || !(inode->i_state & I_NEW))
 		return inode;
 
-	error = orangefs_inode_getattr(inode, 1, 1);
+	error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
 	if (error) {
 		iget_failed(inode);
 		return ERR_PTR(error);
@@ -420,7 +425,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
 	orangefs_set_inode(inode, ref);
 	inode->i_ino = hash;	/* needed for stat etc */
 
-	error = orangefs_inode_getattr(inode, 1, 1);
+	error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
 	if (error)
 		goto out_iput;
 
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index a290ff6ec756..478e88bd7f9d 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -74,6 +74,7 @@ static int orangefs_create(struct inode *dir,
 	unlock_new_inode(inode);
 	orangefs_set_timeout(dentry);
 	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+	ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
 
 	gossip_debug(GOSSIP_NAME_DEBUG,
 		     "%s: dentry instantiated for %pd\n",
@@ -193,8 +194,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
 		goto out;
 	}
 
-	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
-
 	gossip_debug(GOSSIP_NAME_DEBUG,
 		     "%s:%s:%d "
 		     "Found good inode [%lu] with count [%d]\n",
@@ -324,6 +323,7 @@ static int orangefs_symlink(struct inode *dir,
 	unlock_new_inode(inode);
 	orangefs_set_timeout(dentry);
 	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+	ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
 
 	gossip_debug(GOSSIP_NAME_DEBUG,
 		     "Inode (Symlink) %pU -> %pd\n",
@@ -388,6 +388,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
 	unlock_new_inode(inode);
 	orangefs_set_timeout(dentry);
 	ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+	ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS;
 
 	gossip_debug(GOSSIP_NAME_DEBUG,
 		     "Inode (Directory) %pU -> %pd\n",
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 791912da97d7..716ed337f166 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -440,6 +440,9 @@ static ssize_t orangefs_debug_write(struct file *file,
 		"orangefs_debug_write: %pD\n",
 		file);
 
+	if (count == 0)
+		return 0;
+
 	/*
 	 * Thwart users who try to jamb a ridiculous number
 	 * of bytes into the debug file...
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
index f380f9ed1b28..efe08c763e56 100644
--- a/fs/orangefs/orangefs-dev-proto.h
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -52,12 +52,7 @@
  */
 #define ORANGEFS_MAX_DEBUG_STRING_LEN	0x00000800
 
-/*
- * The maximum number of directory entries in a single request is 96.
- * XXX: Why can this not be higher. The client-side code can handle up to 512.
- * XXX: What happens if we expect more than the client can return?
- */
-#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96
+#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 512
 
 #include "upcall.h"
 #include "downcall.h"
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 8afac46fcc87..ea0ce507a6ab 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -215,6 +215,7 @@ struct orangefs_inode_s {
 	unsigned long pinode_flags;
 
 	unsigned long getattr_time;
+	u32 getattr_mask;
 };
 
 #define P_ATIME_FLAG 0
@@ -340,11 +341,6 @@ static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode)
 	return &(ORANGEFS_I(inode)->refn.khandle);
 }
 
-static inline __s32 get_fsid_from_ino(struct inode *inode)
-{
-	return ORANGEFS_I(inode)->refn.fs_id;
-}
-
 static inline ino_t get_ino_from_khandle(struct inode *inode)
 {
 	struct orangefs_khandle *khandle;
@@ -500,7 +496,8 @@ int orangefs_inode_setxattr(struct inode *inode,
 			 size_t size,
 			 int flags);
 
-int orangefs_inode_getattr(struct inode *inode, int new, int bypass);
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
+    u32 request_mask);
 
 int orangefs_inode_check_changed(struct inode *inode);
 
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index 9b96b99539d6..aab6f1842963 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -251,7 +251,8 @@ static int orangefs_inode_is_stale(struct inode *inode, int new,
 	return 0;
 }
 
-int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass,
+    u32 request_mask)
 {
 	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
 	struct orangefs_kernel_op_s *new_op;
@@ -262,7 +263,13 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
 	    get_khandle_from_ino(inode));
 
 	if (!new && !bypass) {
-		if (time_before(jiffies, orangefs_inode->getattr_time))
+		/*
+		 * Must have all the attributes in the mask and be within cache
+		 * time.
+		 */
+		if ((request_mask & orangefs_inode->getattr_mask) ==
+		    request_mask &&
+		    time_before(jiffies, orangefs_inode->getattr_time))
 			return 0;
 	}
 
@@ -270,7 +277,15 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
 	if (!new_op)
 		return -ENOMEM;
 	new_op->upcall.req.getattr.refn = orangefs_inode->refn;
-	new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
+	/*
+	 * Size is the hardest attribute to get.  The incremental cost of any
+	 * other attribute is essentially zero.
+	 */
+	if (request_mask & STATX_SIZE || new)
+		new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
+	else
+		new_op->upcall.req.getattr.mask =
+		    ORANGEFS_ATTR_SYS_ALL_NOHINT & ~ORANGEFS_ATTR_SYS_SIZE;
 
 	ret = service_operation(new_op, __func__,
 	    get_interruptible_flag(inode));
@@ -291,25 +306,29 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
 	case S_IFREG:
 		inode->i_flags = orangefs_inode_flags(&new_op->
 		    downcall.resp.getattr.attributes);
-		inode_size = (loff_t)new_op->
-		    downcall.resp.getattr.attributes.size;
-		rounded_up_size =
-		    (inode_size + (4096 - (inode_size % 4096)));
-		inode->i_size = inode_size;
-		orangefs_inode->blksize =
-		    new_op->downcall.resp.getattr.attributes.blksize;
-		spin_lock(&inode->i_lock);
-		inode->i_bytes = inode_size;
-		inode->i_blocks =
-		    (unsigned long)(rounded_up_size / 512);
-		spin_unlock(&inode->i_lock);
+		if (request_mask & STATX_SIZE || new) {
+			inode_size = (loff_t)new_op->
+			    downcall.resp.getattr.attributes.size;
+			rounded_up_size =
+			    (inode_size + (4096 - (inode_size % 4096)));
+			inode->i_size = inode_size;
+			orangefs_inode->blksize =
+			    new_op->downcall.resp.getattr.attributes.blksize;
+			spin_lock(&inode->i_lock);
+			inode->i_bytes = inode_size;
+			inode->i_blocks =
+			    (unsigned long)(rounded_up_size / 512);
+			spin_unlock(&inode->i_lock);
+		}
 		break;
 	case S_IFDIR:
-		inode->i_size = PAGE_SIZE;
-		orangefs_inode->blksize = i_blocksize(inode);
-		spin_lock(&inode->i_lock);
-		inode_set_bytes(inode, inode->i_size);
-		spin_unlock(&inode->i_lock);
+		if (request_mask & STATX_SIZE || new) {
+			inode->i_size = PAGE_SIZE;
+			orangefs_inode->blksize = i_blocksize(inode);
+			spin_lock(&inode->i_lock);
+			inode_set_bytes(inode, inode->i_size);
+			spin_unlock(&inode->i_lock);
+		}
 		set_nlink(inode, 1);
 		break;
 	case S_IFLNK:
@@ -349,6 +368,10 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
 
 	orangefs_inode->getattr_time = jiffies +
 	    orangefs_getattr_timeout_msecs*HZ/1000;
+	if (request_mask & STATX_SIZE || new)
+		orangefs_inode->getattr_mask = STATX_BASIC_STATS;
+	else
+		orangefs_inode->getattr_mask = STATX_BASIC_STATS & ~STATX_SIZE;
 	ret = 0;
 out:
 	op_release(new_op);
@@ -500,41 +523,6 @@ int orangefs_flush_inode(struct inode *inode)
 	return ret;
 }
 
-int orangefs_unmount_sb(struct super_block *sb)
-{
-	int ret = -EINVAL;
-	struct orangefs_kernel_op_s *new_op = NULL;
-
-	gossip_debug(GOSSIP_UTILS_DEBUG,
-		     "orangefs_unmount_sb called on sb %p\n",
-		     sb);
-
-	new_op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT);
-	if (!new_op)
-		return -ENOMEM;
-	new_op->upcall.req.fs_umount.id = ORANGEFS_SB(sb)->id;
-	new_op->upcall.req.fs_umount.fs_id = ORANGEFS_SB(sb)->fs_id;
-	strncpy(new_op->upcall.req.fs_umount.orangefs_config_server,
-		ORANGEFS_SB(sb)->devname,
-		ORANGEFS_MAX_SERVER_ADDR_LEN);
-
-	gossip_debug(GOSSIP_UTILS_DEBUG,
-		     "Attempting ORANGEFS Unmount via host %s\n",
-		     new_op->upcall.req.fs_umount.orangefs_config_server);
-
-	ret = service_operation(new_op, "orangefs_fs_umount", 0);
-
-	gossip_debug(GOSSIP_UTILS_DEBUG,
-		     "orangefs_unmount: got return value of %d\n", ret);
-	if (ret)
-		sb = ERR_PTR(ret);
-	else
-		ORANGEFS_SB(sb)->mount_pending = 1;
-
-	op_release(new_op);
-	return ret;
-}
-
 void orangefs_make_bad_inode(struct inode *inode)
 {
 	if (is_root_handle(inode)) {
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
index 971307ad69be..48bcc1bbe415 100644
--- a/fs/orangefs/protocol.h
+++ b/fs/orangefs/protocol.h
@@ -138,13 +138,8 @@ typedef __s64 ORANGEFS_offset;
 #define ORANGEFS_G_SGID    (1 << 10)
 #define ORANGEFS_U_SUID    (1 << 11)
 
-/* definition taken from stdint.h */
-#define INT32_MAX (2147483647)
-#define ORANGEFS_ITERATE_START    (INT32_MAX - 1)
-#define ORANGEFS_ITERATE_END      (INT32_MAX - 2)
-#define ORANGEFS_ITERATE_NEXT     (INT32_MAX - 3)
-#define ORANGEFS_READDIR_START ORANGEFS_ITERATE_START
-#define ORANGEFS_READDIR_END   ORANGEFS_ITERATE_END
+#define ORANGEFS_ITERATE_START    2147483646
+#define ORANGEFS_ITERATE_END      2147483645
 #define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL
 #define ORANGEFS_APPEND_FL    FS_APPEND_FL
 #define ORANGEFS_NOATIME_FL   FS_NOATIME_FL
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 629d8c917fa6..5c7c273e17ec 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -376,6 +376,25 @@ static const struct export_operations orangefs_export_ops = {
 	.fh_to_dentry = orangefs_fh_to_dentry,
 };
 
+static int orangefs_unmount(int id, __s32 fs_id, const char *devname)
+{
+	struct orangefs_kernel_op_s *op;
+	int r;
+	op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT);
+	if (!op)
+		return -ENOMEM;
+	op->upcall.req.fs_umount.id = id;
+	op->upcall.req.fs_umount.fs_id = fs_id;
+	strncpy(op->upcall.req.fs_umount.orangefs_config_server,
+	    devname, ORANGEFS_MAX_SERVER_ADDR_LEN);
+	r = service_operation(op, "orangefs_fs_umount", 0);
+	/* Not much to do about an error here. */
+	if (r)
+		gossip_err("orangefs_unmount: service_operation %d\n", r);
+	op_release(op);
+	return r;
+}
+
 static int orangefs_fill_sb(struct super_block *sb,
 		struct orangefs_fs_mount_response *fs_mount,
 		void *data, int silent)
@@ -484,6 +503,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
 
 	if (IS_ERR(sb)) {
 		d = ERR_CAST(sb);
+		orangefs_unmount(new_op->downcall.resp.fs_mount.id,
+		    new_op->downcall.resp.fs_mount.fs_id, devname);
 		goto free_op;
 	}
 
@@ -539,6 +560,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
 free_sb_and_op:
 	/* Will call orangefs_kill_sb with sb not in list. */
 	ORANGEFS_SB(sb)->no_list = 1;
+	/* ORANGEFS_VFS_OP_FS_UMOUNT is done by orangefs_kill_sb. */
 	deactivate_locked_super(sb);
 free_op:
 	gossip_err("orangefs_mount: mount request failed with %d\n", ret);
@@ -554,6 +576,7 @@ free_op:
 
 void orangefs_kill_sb(struct super_block *sb)
 {
+	int r;
 	gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n");
 
 	/* provided sb cleanup */
@@ -563,7 +586,10 @@ void orangefs_kill_sb(struct super_block *sb)
 	 * issue the unmount to userspace to tell it to remove the
 	 * dynamic mount info it has for this superblock
 	 */
-	 orangefs_unmount_sb(sb);
+	r = orangefs_unmount(ORANGEFS_SB(sb)->id, ORANGEFS_SB(sb)->fs_id,
+	    ORANGEFS_SB(sb)->devname);
+	if (!r)
+		ORANGEFS_SB(sb)->mount_pending = 1;
 
 	if (!ORANGEFS_SB(sb)->no_list) {
 		/* remove the sb from our list of orangefs specific sb's */
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index abcfa3fa9992..61e2ca7fec55 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -124,7 +124,14 @@ retry_servicing:
 		gossip_debug(GOSSIP_WAIT_DEBUG,
 			     "%s:client core is NOT in service.\n",
 			     __func__);
-		timeout = op_timeout_secs * HZ;
+		/*
+		 * Don't wait for the userspace component to return if
+		 * the filesystem is being umounted anyway.
+		 */
+		if (op->upcall.type == ORANGEFS_VFS_OP_FS_UMOUNT)
+			timeout = 0;
+		else
+			timeout = op_timeout_secs * HZ;
 	}
 	spin_unlock(&orangefs_request_list_lock);
 
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 74a81b1daaac..237c9c04dc3b 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -76,11 +76,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name,
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
 
-	if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
-		gossip_err("Invalid key length (%d)\n",
-			   (int)strlen(name));
+	if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN)
 		return -EINVAL;
-	}
 
 	fsuid = from_kuid(&init_user_ns, current_fsuid());
 	fsgid = from_kgid(&init_user_ns, current_fsgid());
@@ -172,6 +169,9 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name,
 	struct orangefs_kernel_op_s *new_op = NULL;
 	int ret = -ENOMEM;
 
+	if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN)
+		return -EINVAL;
+
 	down_write(&orangefs_inode->xattr_sem);
 	new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR);
 	if (!new_op)
@@ -231,23 +231,13 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name,
 		     "%s: name %s, buffer_size %zd\n",
 		     __func__, name, size);
 
-	if (size >= ORANGEFS_MAX_XATTR_VALUELEN ||
-	    flags < 0) {
-		gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n",
-			   (int)size,
-			   flags);
+	if (size > ORANGEFS_MAX_XATTR_VALUELEN)
+		return -EINVAL;
+	if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN)
 		return -EINVAL;
-	}
 
 	internal_flag = convert_to_internal_xattr_flags(flags);
 
-	if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) {
-		gossip_err
-		    ("orangefs_inode_setxattr: bogus key size (%d)\n",
-		     (int)(strlen(name)));
-		return -EINVAL;
-	}
-
 	/* This is equivalent to a removexattr */
 	if (size == 0 && value == NULL) {
 		gossip_debug(GOSSIP_XATTR_DEBUG,
@@ -358,7 +348,7 @@ try_again:
 
 	returned_count = new_op->downcall.resp.listxattr.returned_count;
 	if (returned_count < 0 ||
-	    returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) {
+	    returned_count > ORANGEFS_MAX_XATTR_LISTLEN) {
 		gossip_err("%s: impossible value for returned_count:%d:\n",
 		__func__,
 		returned_count);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c87b6b9a8a76..9e3ac5c11780 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2834,6 +2834,15 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
 	return err;
 }
 
+#ifdef CONFIG_LIVEPATCH
+static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
+				struct pid *pid, struct task_struct *task)
+{
+	seq_printf(m, "%d\n", task->patch_state);
+	return 0;
+}
+#endif /* CONFIG_LIVEPATCH */
+
 /*
  * Thread groups
  */
@@ -2933,6 +2942,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("timers",	  S_IRUGO, proc_timers_operations),
 #endif
 	REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
+#ifdef CONFIG_LIVEPATCH
+	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
+#endif
 };
 
 static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
@@ -3315,6 +3327,9 @@ static const struct pid_entry tid_base_stuff[] = {
 	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
 	REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
 #endif
+#ifdef CONFIG_LIVEPATCH
+	ONE("patch_state",  S_IRUSR, proc_pid_patch_state),
+#endif
 };
 
 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index ee27feb34cf4..9425c0d97262 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -472,6 +472,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name)
 		ent->data = NULL;
 		ent->proc_fops = NULL;
 		ent->proc_iops = NULL;
+		parent->nlink++;
 		if (proc_register(parent, ent) < 0) {
 			kfree(ent);
 			parent->nlink--;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d04ea4349909..67985a7233c2 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -408,10 +408,6 @@ static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentr
 	*pentry = entry;
 }
 
-void register_sysctl_root(struct ctl_table_root *root)
-{
-}
-
 /*
  * sysctl_perm does NOT grant the superuser all rights automatically, because
  * some sysctl variables are readonly even to root.
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 312578089544..f0c8b33d99b1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -441,6 +441,7 @@ struct mem_size_stats {
 	unsigned long private_dirty;
 	unsigned long referenced;
 	unsigned long anonymous;
+	unsigned long lazyfree;
 	unsigned long anonymous_thp;
 	unsigned long shmem_thp;
 	unsigned long swap;
@@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 	int i, nr = compound ? 1 << compound_order(page) : 1;
 	unsigned long size = nr * PAGE_SIZE;
 
-	if (PageAnon(page))
+	if (PageAnon(page)) {
 		mss->anonymous += size;
+		if (!PageSwapBacked(page) && !dirty && !PageDirty(page))
+			mss->lazyfree += size;
+	}
 
 	mss->resident += size;
 	/* Accumulate the size in pages that have been accessed. */
@@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 		   "Private_Dirty:  %8lu kB\n"
 		   "Referenced:     %8lu kB\n"
 		   "Anonymous:      %8lu kB\n"
+		   "LazyFree:       %8lu kB\n"
 		   "AnonHugePages:  %8lu kB\n"
 		   "ShmemPmdMapped: %8lu kB\n"
 		   "Shared_Hugetlb: %8lu kB\n"
@@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 		   mss.private_dirty >> 10,
 		   mss.referenced >> 10,
 		   mss.anonymous >> 10,
+		   mss.lazyfree >> 10,
 		   mss.anonymous_thp >> 10,
 		   mss.shmem_thp >> 10,
 		   mss.shared_hugetlb >> 10,
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 899d0ba0bd6c..06aab07b6bb7 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -37,6 +37,12 @@ static void notrace pstore_ftrace_call(unsigned long ip,
 {
 	unsigned long flags;
 	struct pstore_ftrace_record rec = {};
+	struct pstore_record record = {
+		.type = PSTORE_TYPE_FTRACE,
+		.buf = (char *)&rec,
+		.size = sizeof(rec),
+		.psi = psinfo,
+	};
 
 	if (unlikely(oops_in_progress))
 		return;
@@ -47,8 +53,7 @@ static void notrace pstore_ftrace_call(unsigned long ip,
 	rec.parent_ip = parent_ip;
 	pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++);
 	pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id());
-	psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec,
-			  0, sizeof(rec), psinfo);
+	psinfo->write(&record);
 
 	local_irq_restore(flags);
 }
@@ -117,7 +122,7 @@ void pstore_register_ftrace(void)
 {
 	struct dentry *file;
 
-	if (!psinfo->write_buf)
+	if (!psinfo->write)
 		return;
 
 	pstore_ftrace_dir = debugfs_create_dir("pstore", NULL);
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 57c0646479f5..792a4e5f9226 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -47,12 +47,8 @@ static LIST_HEAD(allpstore);
 
 struct pstore_private {
 	struct list_head list;
-	struct pstore_info *psi;
-	enum pstore_type_id type;
-	u64	id;
-	int	count;
-	ssize_t	size;
-	char	data[];
+	struct pstore_record *record;
+	size_t total_size;
 };
 
 struct pstore_ftrace_seq_data {
@@ -63,6 +59,17 @@ struct pstore_ftrace_seq_data {
 
 #define REC_SIZE sizeof(struct pstore_ftrace_record)
 
+static void free_pstore_private(struct pstore_private *private)
+{
+	if (!private)
+		return;
+	if (private->record) {
+		kfree(private->record->buf);
+		kfree(private->record);
+	}
+	kfree(private);
+}
+
 static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos)
 {
 	struct pstore_private *ps = s->private;
@@ -72,9 +79,9 @@ static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos)
 	if (!data)
 		return NULL;
 
-	data->off = ps->size % REC_SIZE;
+	data->off = ps->total_size % REC_SIZE;
 	data->off += *pos * REC_SIZE;
-	if (data->off + REC_SIZE > ps->size) {
+	if (data->off + REC_SIZE > ps->total_size) {
 		kfree(data);
 		return NULL;
 	}
@@ -94,7 +101,7 @@ static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos)
 	struct pstore_ftrace_seq_data *data = v;
 
 	data->off += REC_SIZE;
-	if (data->off + REC_SIZE > ps->size)
+	if (data->off + REC_SIZE > ps->total_size)
 		return NULL;
 
 	(*pos)++;
@@ -105,7 +112,9 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
 {
 	struct pstore_private *ps = s->private;
 	struct pstore_ftrace_seq_data *data = v;
-	struct pstore_ftrace_record *rec = (void *)(ps->data + data->off);
+	struct pstore_ftrace_record *rec;
+
+	rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off);
 
 	seq_printf(s, "CPU:%d ts:%llu %08lx  %08lx  %pf <- %pF\n",
 		   pstore_ftrace_decode_cpu(rec),
@@ -125,7 +134,7 @@ static const struct seq_operations pstore_ftrace_seq_ops = {
 
 static int pstore_check_syslog_permissions(struct pstore_private *ps)
 {
-	switch (ps->type) {
+	switch (ps->record->type) {
 	case PSTORE_TYPE_DMESG:
 	case PSTORE_TYPE_CONSOLE:
 		return check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
@@ -141,9 +150,10 @@ static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
 	struct seq_file *sf = file->private_data;
 	struct pstore_private *ps = sf->private;
 
-	if (ps->type == PSTORE_TYPE_FTRACE)
+	if (ps->record->type == PSTORE_TYPE_FTRACE)
 		return seq_read(file, userbuf, count, ppos);
-	return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size);
+	return simple_read_from_buffer(userbuf, count, ppos,
+				       ps->record->buf, ps->total_size);
 }
 
 static int pstore_file_open(struct inode *inode, struct file *file)
@@ -157,7 +167,7 @@ static int pstore_file_open(struct inode *inode, struct file *file)
 	if (err)
 		return err;
 
-	if (ps->type == PSTORE_TYPE_FTRACE)
+	if (ps->record->type == PSTORE_TYPE_FTRACE)
 		sops = &pstore_ftrace_seq_ops;
 
 	err = seq_open(file, sops);
@@ -193,20 +203,19 @@ static const struct file_operations pstore_file_operations = {
 static int pstore_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct pstore_private *p = d_inode(dentry)->i_private;
+	struct pstore_record *record = p->record;
 	int err;
 
 	err = pstore_check_syslog_permissions(p);
 	if (err)
 		return err;
 
-	if (p->psi->erase) {
-		mutex_lock(&p->psi->read_mutex);
-		p->psi->erase(p->type, p->id, p->count,
-			      d_inode(dentry)->i_ctime, p->psi);
-		mutex_unlock(&p->psi->read_mutex);
-	} else {
+	if (!record->psi->erase)
 		return -EPERM;
-	}
+
+	mutex_lock(&record->psi->read_mutex);
+	record->psi->erase(record);
+	mutex_unlock(&record->psi->read_mutex);
 
 	return simple_unlink(dir, dentry);
 }
@@ -221,7 +230,7 @@ static void pstore_evict_inode(struct inode *inode)
 		spin_lock_irqsave(&allpstore_lock, flags);
 		list_del(&p->list);
 		spin_unlock_irqrestore(&allpstore_lock, flags);
-		kfree(p);
+		free_pstore_private(p);
 	}
 }
 
@@ -302,23 +311,23 @@ bool pstore_is_mounted(void)
  * Load it up with "size" bytes of data from "buf".
  * Set the mtime & ctime to the date that this record was originally stored.
  */
-int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
-		  char *data, bool compressed, size_t size,
-		  struct timespec time, struct pstore_info *psi)
+int pstore_mkfile(struct dentry *root, struct pstore_record *record)
 {
-	struct dentry		*root = pstore_sb->s_root;
 	struct dentry		*dentry;
 	struct inode		*inode;
 	int			rc = 0;
 	char			name[PSTORE_NAMELEN];
 	struct pstore_private	*private, *pos;
 	unsigned long		flags;
+	size_t			size = record->size + record->ecc_notice_size;
+
+	WARN_ON(!inode_is_locked(d_inode(root)));
 
 	spin_lock_irqsave(&allpstore_lock, flags);
 	list_for_each_entry(pos, &allpstore, list) {
-		if (pos->type == type &&
-		    pos->id == id &&
-		    pos->psi == psi) {
+		if (pos->record->type == record->type &&
+		    pos->record->id == record->id &&
+		    pos->record->psi == record->psi) {
 			rc = -EEXIST;
 			break;
 		}
@@ -328,72 +337,74 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
 		return rc;
 
 	rc = -ENOMEM;
-	inode = pstore_get_inode(pstore_sb);
+	inode = pstore_get_inode(root->d_sb);
 	if (!inode)
 		goto fail;
 	inode->i_mode = S_IFREG | 0444;
 	inode->i_fop = &pstore_file_operations;
-	private = kmalloc(sizeof *private + size, GFP_KERNEL);
+	private = kzalloc(sizeof(*private), GFP_KERNEL);
 	if (!private)
 		goto fail_alloc;
-	private->type = type;
-	private->id = id;
-	private->count = count;
-	private->psi = psi;
+	private->record = record;
 
-	switch (type) {
+	switch (record->type) {
 	case PSTORE_TYPE_DMESG:
 		scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
-			  psname, id, compressed ? ".enc.z" : "");
+			  record->psi->name, record->id,
+			  record->compressed ? ".enc.z" : "");
 		break;
 	case PSTORE_TYPE_CONSOLE:
-		scnprintf(name, sizeof(name), "console-%s-%lld", psname, id);
+		scnprintf(name, sizeof(name), "console-%s-%lld",
+			  record->psi->name, record->id);
 		break;
 	case PSTORE_TYPE_FTRACE:
-		scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id);
+		scnprintf(name, sizeof(name), "ftrace-%s-%lld",
+			  record->psi->name, record->id);
 		break;
 	case PSTORE_TYPE_MCE:
-		scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id);
+		scnprintf(name, sizeof(name), "mce-%s-%lld",
+			  record->psi->name, record->id);
 		break;
 	case PSTORE_TYPE_PPC_RTAS:
-		scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id);
+		scnprintf(name, sizeof(name), "rtas-%s-%lld",
+			  record->psi->name, record->id);
 		break;
 	case PSTORE_TYPE_PPC_OF:
 		scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld",
-			  psname, id);
+			  record->psi->name, record->id);
 		break;
 	case PSTORE_TYPE_PPC_COMMON:
 		scnprintf(name, sizeof(name), "powerpc-common-%s-%lld",
-			  psname, id);
+			  record->psi->name, record->id);
 		break;
 	case PSTORE_TYPE_PMSG:
-		scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id);
+		scnprintf(name, sizeof(name), "pmsg-%s-%lld",
+			  record->psi->name, record->id);
 		break;
 	case PSTORE_TYPE_PPC_OPAL:
-		sprintf(name, "powerpc-opal-%s-%lld", psname, id);
+		scnprintf(name, sizeof(name), "powerpc-opal-%s-%lld",
+			  record->psi->name, record->id);
 		break;
 	case PSTORE_TYPE_UNKNOWN:
-		scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id);
+		scnprintf(name, sizeof(name), "unknown-%s-%lld",
+			  record->psi->name, record->id);
 		break;
 	default:
 		scnprintf(name, sizeof(name), "type%d-%s-%lld",
-			  type, psname, id);
+			  record->type, record->psi->name, record->id);
 		break;
 	}
 
-	inode_lock(d_inode(root));
-
 	dentry = d_alloc_name(root, name);
 	if (!dentry)
-		goto fail_lockedalloc;
+		goto fail_private;
 
-	memcpy(private->data, data, size);
-	inode->i_size = private->size = size;
+	inode->i_size = private->total_size = size;
 
 	inode->i_private = private;
 
-	if (time.tv_sec)
-		inode->i_mtime = inode->i_ctime = time;
+	if (record->time.tv_sec)
+		inode->i_mtime = inode->i_ctime = record->time;
 
 	d_add(dentry, inode);
 
@@ -401,13 +412,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
 	list_add(&private->list, &allpstore);
 	spin_unlock_irqrestore(&allpstore_lock, flags);
 
-	inode_unlock(d_inode(root));
-
 	return 0;
 
-fail_lockedalloc:
-	inode_unlock(d_inode(root));
-	kfree(private);
+fail_private:
+	kfree(private);	/* caller still owns (and frees) record */
 fail_alloc:
 	iput(inode);
 
@@ -415,6 +423,27 @@ fail:
 	return rc;
 }
 
+/*
+ * Read all the records from the persistent store. Create
+ * files in our filesystem.  Don't warn about -EEXIST errors
+ * when we are re-scanning the backing store looking to add new
+ * error records.
+ */
+void pstore_get_records(int quiet)
+{
+	struct pstore_info *psi = psinfo;
+	struct dentry *root;
+
+	if (!psi || !pstore_sb)
+		return;
+
+	root = pstore_sb->s_root;
+
+	inode_lock(d_inode(root));
+	pstore_get_backend_records(psi, root, quiet);
+	inode_unlock(d_inode(root));
+}
+
 static int pstore_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct inode *inode;
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index da416e6591c9..c416e653dc4f 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -25,10 +25,10 @@ extern struct pstore_info *psinfo;
 
 extern void	pstore_set_kmsg_bytes(int);
 extern void	pstore_get_records(int);
-extern int	pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
-			      int count, char *data, bool compressed,
-			      size_t size, struct timespec time,
-			      struct pstore_info *psi);
+extern void	pstore_get_backend_records(struct pstore_info *psi,
+					   struct dentry *root, int quiet);
+extern int	pstore_mkfile(struct dentry *root,
+			      struct pstore_record *record);
 extern bool	pstore_is_mounted(void);
 
 #endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index efab7b64925b..d468eec9b8a6 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -267,7 +267,7 @@ static void free_zlib(void)
 	big_oops_buf_sz = 0;
 }
 
-static struct pstore_zbackend backend_zlib = {
+static const struct pstore_zbackend backend_zlib = {
 	.compress	= compress_zlib,
 	.decompress	= decompress_zlib,
 	.allocate	= allocate_zlib,
@@ -328,7 +328,7 @@ static void free_lzo(void)
 	big_oops_buf_sz = 0;
 }
 
-static struct pstore_zbackend backend_lzo = {
+static const struct pstore_zbackend backend_lzo = {
 	.compress	= compress_lzo,
 	.decompress	= decompress_lzo,
 	.allocate	= allocate_lzo,
@@ -393,7 +393,7 @@ static void free_lz4(void)
 	big_oops_buf_sz = 0;
 }
 
-static struct pstore_zbackend backend_lz4 = {
+static const struct pstore_zbackend backend_lz4 = {
 	.compress	= compress_lz4,
 	.decompress	= decompress_lz4,
 	.allocate	= allocate_lz4,
@@ -402,7 +402,7 @@ static struct pstore_zbackend backend_lz4 = {
 };
 #endif
 
-static struct pstore_zbackend *zbackend =
+static const struct pstore_zbackend *zbackend =
 #if defined(CONFIG_PSTORE_ZLIB_COMPRESS)
 	&backend_zlib;
 #elif defined(CONFIG_PSTORE_LZO_COMPRESS)
@@ -484,7 +484,6 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 {
 	unsigned long	total = 0;
 	const char	*why;
-	u64		id;
 	unsigned int	part = 1;
 	unsigned long	flags = 0;
 	int		is_locked;
@@ -506,48 +505,59 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 	oopscount++;
 	while (total < kmsg_bytes) {
 		char *dst;
-		unsigned long size;
-		int hsize;
+		size_t dst_size;
+		int header_size;
 		int zipped_len = -1;
-		size_t len;
-		bool compressed = false;
-		size_t total_len;
+		size_t dump_size;
+		struct pstore_record record = {
+			.type = PSTORE_TYPE_DMESG,
+			.count = oopscount,
+			.reason = reason,
+			.part = part,
+			.compressed = false,
+			.buf = psinfo->buf,
+			.psi = psinfo,
+		};
 
 		if (big_oops_buf && is_locked) {
 			dst = big_oops_buf;
-			size = big_oops_buf_sz;
+			dst_size = big_oops_buf_sz;
 		} else {
 			dst = psinfo->buf;
-			size = psinfo->bufsize;
+			dst_size = psinfo->bufsize;
 		}
 
-		hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part);
-		size -= hsize;
+		/* Write dump header. */
+		header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why,
+				 oopscount, part);
+		dst_size -= header_size;
 
-		if (!kmsg_dump_get_buffer(dumper, true, dst + hsize,
-					  size, &len))
+		/* Write dump contents. */
+		if (!kmsg_dump_get_buffer(dumper, true, dst + header_size,
+					  dst_size, &dump_size))
 			break;
 
 		if (big_oops_buf && is_locked) {
 			zipped_len = pstore_compress(dst, psinfo->buf,
-						hsize + len, psinfo->bufsize);
+						header_size + dump_size,
+						psinfo->bufsize);
 
 			if (zipped_len > 0) {
-				compressed = true;
-				total_len = zipped_len;
+				record.compressed = true;
+				record.size = zipped_len;
 			} else {
-				total_len = copy_kmsg_to_buffer(hsize, len);
+				record.size = copy_kmsg_to_buffer(header_size,
+								  dump_size);
 			}
 		} else {
-			total_len = hsize + len;
+			record.size = header_size + dump_size;
 		}
 
-		ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
-				    oopscount, compressed, total_len, psinfo);
+		ret = psinfo->write(&record);
 		if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
 			pstore_new_entry = 1;
 
-		total += total_len;
+		total += record.size;
 		part++;
 	}
 	if (is_locked)
@@ -577,8 +587,11 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
 	const char *e = s + c;
 
 	while (s < e) {
+		struct pstore_record record = {
+			.type = PSTORE_TYPE_CONSOLE,
+			.psi = psinfo,
+		};
 		unsigned long flags;
-		u64 id;
 
 		if (c > psinfo->bufsize)
 			c = psinfo->bufsize;
@@ -589,8 +602,9 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
 		} else {
 			spin_lock_irqsave(&psinfo->buf_lock, flags);
 		}
-		psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0,
-				  s, 0, c, psinfo);
+		record.buf = (char *)s;
+		record.size = c;
+		psinfo->write(&record);
 		spin_unlock_irqrestore(&psinfo->buf_lock, flags);
 		s += c;
 		c = e - s;
@@ -618,48 +632,30 @@ static void pstore_register_console(void) {}
 static void pstore_unregister_console(void) {}
 #endif
 
-static int pstore_write_compat(enum pstore_type_id type,
-			       enum kmsg_dump_reason reason,
-			       u64 *id, unsigned int part, int count,
-			       bool compressed, size_t size,
-			       struct pstore_info *psi)
-{
-	return psi->write_buf(type, reason, id, part, psinfo->buf, compressed,
-			     size, psi);
-}
-
-static int pstore_write_buf_user_compat(enum pstore_type_id type,
-			       enum kmsg_dump_reason reason,
-			       u64 *id, unsigned int part,
-			       const char __user *buf,
-			       bool compressed, size_t size,
-			       struct pstore_info *psi)
-{
-	unsigned long flags = 0;
-	size_t i, bufsize = size;
-	long ret = 0;
-
-	if (unlikely(!access_ok(VERIFY_READ, buf, size)))
-		return -EFAULT;
-	if (bufsize > psinfo->bufsize)
-		bufsize = psinfo->bufsize;
-	spin_lock_irqsave(&psinfo->buf_lock, flags);
-	for (i = 0; i < size; ) {
-		size_t c = min(size - i, bufsize);
-
-		ret = __copy_from_user(psinfo->buf, buf + i, c);
-		if (unlikely(ret != 0)) {
-			ret = -EFAULT;
-			break;
-		}
-		ret = psi->write_buf(type, reason, id, part, psinfo->buf,
-				     compressed, c, psi);
-		if (unlikely(ret < 0))
-			break;
-		i += c;
+static int pstore_write_user_compat(struct pstore_record *record,
+				    const char __user *buf)
+{
+	int ret = 0;
+
+	if (record->buf)
+		return -EINVAL;
+
+	record->buf = kmalloc(record->size, GFP_KERNEL);
+	if (!record->buf)
+		return -ENOMEM;
+
+	if (unlikely(copy_from_user(record->buf, buf, record->size))) {
+		ret = -EFAULT;
+		goto out;
 	}
-	spin_unlock_irqrestore(&psinfo->buf_lock, flags);
-	return unlikely(ret < 0) ? ret : size;
+
+	ret = record->psi->write(record);
+
+out:
+	kfree(record->buf);
+	record->buf = NULL;
+
+	return unlikely(ret < 0) ? ret : record->size;
 }
 
 /*
@@ -673,19 +669,35 @@ int pstore_register(struct pstore_info *psi)
 {
 	struct module *owner = psi->owner;
 
-	if (backend && strcmp(backend, psi->name))
+	if (backend && strcmp(backend, psi->name)) {
+		pr_warn("ignoring unexpected backend '%s'\n", psi->name);
 		return -EPERM;
+	}
+
+	/* Sanity check flags. */
+	if (!psi->flags) {
+		pr_warn("backend '%s' must support at least one frontend\n",
+			psi->name);
+		return -EINVAL;
+	}
+
+	/* Check for required functions. */
+	if (!psi->read || !psi->write) {
+		pr_warn("backend '%s' must implement read() and write()\n",
+			psi->name);
+		return -EINVAL;
+	}
 
 	spin_lock(&pstore_lock);
 	if (psinfo) {
+		pr_warn("backend '%s' already loaded: ignoring '%s'\n",
+			psinfo->name, psi->name);
 		spin_unlock(&pstore_lock);
 		return -EBUSY;
 	}
 
-	if (!psi->write)
-		psi->write = pstore_write_compat;
-	if (!psi->write_buf_user)
-		psi->write_buf_user = pstore_write_buf_user_compat;
+	if (!psi->write_user)
+		psi->write_user = pstore_write_user_compat;
 	psinfo = psi;
 	mutex_init(&psinfo->read_mutex);
 	spin_unlock(&pstore_lock);
@@ -709,6 +721,7 @@ int pstore_register(struct pstore_info *psi)
 	if (psi->flags & PSTORE_FLAGS_PMSG)
 		pstore_register_pmsg();
 
+	/* Start watching for new records, if desired. */
 	if (pstore_update_ms >= 0) {
 		pstore_timer.expires = jiffies +
 			msecs_to_jiffies(pstore_update_ms);
@@ -721,16 +734,21 @@ int pstore_register(struct pstore_info *psi)
 	 */
 	backend = psi->name;
 
-	module_put(owner);
-
 	pr_info("Registered %s as persistent store backend\n", psi->name);
 
+	module_put(owner);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(pstore_register);
 
 void pstore_unregister(struct pstore_info *psi)
 {
+	/* Stop timer and make sure all work has finished. */
+	pstore_update_ms = -1;
+	del_timer_sync(&pstore_timer);
+	flush_work(&pstore_work);
+
 	if (psi->flags & PSTORE_FLAGS_PMSG)
 		pstore_unregister_pmsg();
 	if (psi->flags & PSTORE_FLAGS_FTRACE)
@@ -747,66 +765,99 @@ void pstore_unregister(struct pstore_info *psi)
 }
 EXPORT_SYMBOL_GPL(pstore_unregister);
 
+static void decompress_record(struct pstore_record *record)
+{
+	int unzipped_len;
+	char *decompressed;
+
+	/* Uncompressed records need no work; only DMESG may be compressed. */
+	if (!record->compressed)
+		return;
+	if (record->type != PSTORE_TYPE_DMESG) {
+		pr_warn("ignored compressed record type %d\n", record->type);
+		return;
+	}
+
+	/* No compression method has created the common buffer. */
+	if (!big_oops_buf) {
+		pr_warn("no decompression buffer allocated\n");
+		return;
+	}
+
+	unzipped_len = pstore_decompress(record->buf, big_oops_buf,
+					 record->size, big_oops_buf_sz);
+	if (unzipped_len <= 0) {
+		pr_err("decompression failed: %d\n", unzipped_len);
+		return;
+	}
+
+	/* Build new buffer for decompressed contents plus the ECC notice. */
+	decompressed = kmalloc(unzipped_len + record->ecc_notice_size,
+			       GFP_KERNEL);
+	if (!decompressed) {
+		pr_err("decompression ran out of memory\n");
+		return;
+	}
+	memcpy(decompressed, big_oops_buf, unzipped_len);
+	memcpy(decompressed + unzipped_len, record->buf + record->size,
+	       record->ecc_notice_size);
+
+	/* Swap out compressed contents with decompressed contents. */
+	kfree(record->buf);
+	record->buf = decompressed;
+	record->size = unzipped_len;
+	record->compressed = false;
+}
+
 /*
- * Read all the records from the persistent store. Create
+ * Read all the records from one persistent store backend. Create
  * files in our filesystem.  Don't warn about -EEXIST errors
  * when we are re-scanning the backing store looking to add new
  * error records.
  */
-void pstore_get_records(int quiet)
-{
-	struct pstore_info *psi = psinfo;
-	char			*buf = NULL;
-	ssize_t			size;
-	u64			id;
-	int			count;
-	enum pstore_type_id	type;
-	struct timespec		time;
-	int			failed = 0, rc;
-	bool			compressed;
-	int			unzipped_len = -1;
-	ssize_t			ecc_notice_size = 0;
-
-	if (!psi)
+void pstore_get_backend_records(struct pstore_info *psi,
+				struct dentry *root, int quiet)
+{
+	int failed = 0;
+
+	if (!psi || !root)
 		return;
 
 	mutex_lock(&psi->read_mutex);
 	if (psi->open && psi->open(psi))
 		goto out;
 
-	while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed,
-				 &ecc_notice_size, psi)) > 0) {
-		if (compressed && (type == PSTORE_TYPE_DMESG)) {
-			if (big_oops_buf)
-				unzipped_len = pstore_decompress(buf,
-							big_oops_buf, size,
-							big_oops_buf_sz);
-
-			if (unzipped_len > 0) {
-				if (ecc_notice_size)
-					memcpy(big_oops_buf + unzipped_len,
-					       buf + size, ecc_notice_size);
-				kfree(buf);
-				buf = big_oops_buf;
-				size = unzipped_len;
-				compressed = false;
-			} else {
-				pr_err("decompression failed;returned %d\n",
-				       unzipped_len);
-				compressed = true;
-			}
+	/*
+	 * Backend callback read() allocates record.buf. decompress_record()
+	 * may reallocate record.buf. On success, pstore_mkfile() will keep
+	 * the record.buf, so free it only on failure.
+	 */
+	for (;;) {
+		struct pstore_record *record;
+		int rc;
+
+		record = kzalloc(sizeof(*record), GFP_KERNEL);
+		if (!record) {
+			pr_err("out of memory creating record\n");
+			break;
+		}
+		record->psi = psi;
+		record->size = psi->read(record);
+		/* No more records left in backend? Free the unused record. */
+		if (record->size <= 0) {
+			kfree(record);
+			break;
+		}
+
+		decompress_record(record);
+		rc = pstore_mkfile(root, record);
+		if (rc) {
+			/* pstore_mkfile() did not take record, so free it. */
+			kfree(record->buf);
+			kfree(record);
+			if (rc != -EEXIST || !quiet)
+				failed++;
 		}
-		rc = pstore_mkfile(type, psi->name, id, count, buf,
-				   compressed, size + ecc_notice_size,
-				   time, psi);
-		if (unzipped_len < 0) {
-			/* Free buffer other than big oops */
-			kfree(buf);
-			buf = NULL;
-		} else
-			unzipped_len = -1;
-		if (rc && (rc != -EEXIST || !quiet))
-			failed++;
 	}
 	if (psi->close)
 		psi->close(psi);
@@ -830,7 +881,9 @@ static void pstore_timefunc(unsigned long dummy)
 		schedule_work(&pstore_work);
 	}
 
-	mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
+	if (pstore_update_ms >= 0)
+		mod_timer(&pstore_timer,
+			  jiffies + msecs_to_jiffies(pstore_update_ms));
 }
 
 module_param(backend, charp, 0444);
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index 78f6176c020f..209755e0d7c8 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -15,7 +15,6 @@
 #include <linux/device.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
-#include <linux/vmalloc.h>
 #include "internal.h"
 
 static DEFINE_MUTEX(pmsg_lock);
@@ -23,19 +22,22 @@ static DEFINE_MUTEX(pmsg_lock);
 static ssize_t write_pmsg(struct file *file, const char __user *buf,
 			  size_t count, loff_t *ppos)
 {
-	u64 id;
+	struct pstore_record record = {
+		.type = PSTORE_TYPE_PMSG,
+		.size = count,
+		.psi = psinfo,
+	};
 	int ret;
 
 	if (!count)
 		return 0;
 
-	/* check outside lock, page in any data. write_buf_user also checks */
+	/* check outside lock, page in any data. write_user also checks */
 	if (!access_ok(VERIFY_READ, buf, count))
 		return -EFAULT;
 
 	mutex_lock(&pmsg_lock);
-	ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count,
-				     psinfo);
+	ret = psinfo->write_user(&record, buf);
 	mutex_unlock(&pmsg_lock);
 	return ret ? ret : count;
 }
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 11f918d34b1e..5523df7f17ef 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -235,35 +235,34 @@ static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest,
 	return 0;
 }
 
-static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
-				   int *count, struct timespec *time,
-				   char **buf, bool *compressed,
-				   ssize_t *ecc_notice_size,
-				   struct pstore_info *psi)
+static ssize_t ramoops_pstore_read(struct pstore_record *record)
 {
 	ssize_t size = 0;
-	struct ramoops_context *cxt = psi->data;
+	struct ramoops_context *cxt = record->psi->data;
 	struct persistent_ram_zone *prz = NULL;
 	int header_length = 0;
 	bool free_prz = false;
 
-	/* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but
+	/*
+	 * Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but
 	 * PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have
 	 * valid time stamps, so it is initialized to zero.
 	 */
-	time->tv_sec = 0;
-	time->tv_nsec = 0;
-	*compressed = false;
+	record->time.tv_sec = 0;
+	record->time.tv_nsec = 0;
+	record->compressed = false;
 
 	/* Find the next valid persistent_ram_zone for DMESG */
 	while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) {
 		prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt,
-					   cxt->max_dump_cnt, id, type,
+					   cxt->max_dump_cnt, &record->id,
+					   &record->type,
 					   PSTORE_TYPE_DMESG, 1);
 		if (!prz_ok(prz))
 			continue;
 		header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz),
-						      time, compressed);
+						      &record->time,
+						      &record->compressed);
 		/* Clear and skip this DMESG record if it has no valid header */
 		if (!header_length) {
 			persistent_ram_free_old(prz);
@@ -274,18 +273,20 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
 
 	if (!prz_ok(prz))
 		prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
-					   1, id, type, PSTORE_TYPE_CONSOLE, 0);
+					   1, &record->id, &record->type,
+					   PSTORE_TYPE_CONSOLE, 0);
 
 	if (!prz_ok(prz))
 		prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
-					   1, id, type, PSTORE_TYPE_PMSG, 0);
+					   1, &record->id, &record->type,
+					   PSTORE_TYPE_PMSG, 0);
 
 	/* ftrace is last since it may want to dynamically allocate memory. */
 	if (!prz_ok(prz)) {
 		if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) {
 			prz = ramoops_get_next_prz(cxt->fprzs,
-					&cxt->ftrace_read_cnt, 1, id, type,
-					PSTORE_TYPE_FTRACE, 0);
+					&cxt->ftrace_read_cnt, 1, &record->id,
+					&record->type, PSTORE_TYPE_FTRACE, 0);
 		} else {
 			/*
 			 * Build a new dummy record which combines all the
@@ -302,8 +303,10 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
 			while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) {
 				prz_next = ramoops_get_next_prz(cxt->fprzs,
 						&cxt->ftrace_read_cnt,
-						cxt->max_ftrace_cnt, id,
-						type, PSTORE_TYPE_FTRACE, 0);
+						cxt->max_ftrace_cnt,
+						&record->id,
+						&record->type,
+						PSTORE_TYPE_FTRACE, 0);
 
 				if (!prz_ok(prz_next))
 					continue;
@@ -316,7 +319,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
 				if (size)
 					goto out;
 			}
-			*id = 0;
+			record->id = 0;
 			prz = tmp_prz;
 		}
 	}
@@ -329,17 +332,19 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
 	size = persistent_ram_old_size(prz) - header_length;
 
 	/* ECC correction notice */
-	*ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
+	record->ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0);
 
-	*buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL);
-	if (*buf == NULL) {
+	record->buf = kmalloc(size + record->ecc_notice_size + 1, GFP_KERNEL);
+	if (record->buf == NULL) {
 		size = -ENOMEM;
 		goto out;
 	}
 
-	memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size);
+	memcpy(record->buf, (char *)persistent_ram_old(prz) + header_length,
+	       size);
 
-	persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1);
+	persistent_ram_ecc_string(prz, record->buf + size,
+				  record->ecc_notice_size + 1);
 
 out:
 	if (free_prz) {
@@ -373,23 +378,18 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz,
 	return len;
 }
 
-static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
-					    enum kmsg_dump_reason reason,
-					    u64 *id, unsigned int part,
-					    const char *buf,
-					    bool compressed, size_t size,
-					    struct pstore_info *psi)
+static int notrace ramoops_pstore_write(struct pstore_record *record)
 {
-	struct ramoops_context *cxt = psi->data;
+	struct ramoops_context *cxt = record->psi->data;
 	struct persistent_ram_zone *prz;
-	size_t hlen;
+	size_t size, hlen;
 
-	if (type == PSTORE_TYPE_CONSOLE) {
+	if (record->type == PSTORE_TYPE_CONSOLE) {
 		if (!cxt->cprz)
 			return -ENOMEM;
-		persistent_ram_write(cxt->cprz, buf, size);
+		persistent_ram_write(cxt->cprz, record->buf, record->size);
 		return 0;
-	} else if (type == PSTORE_TYPE_FTRACE) {
+	} else if (record->type == PSTORE_TYPE_FTRACE) {
 		int zonenum;
 
 		if (!cxt->fprzs)
@@ -402,33 +402,36 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
 		else
 			zonenum = 0;
 
-		persistent_ram_write(cxt->fprzs[zonenum], buf, size);
+		persistent_ram_write(cxt->fprzs[zonenum], record->buf,
+				     record->size);
 		return 0;
-	} else if (type == PSTORE_TYPE_PMSG) {
+	} else if (record->type == PSTORE_TYPE_PMSG) {
 		pr_warn_ratelimited("PMSG shouldn't call %s\n", __func__);
 		return -EINVAL;
 	}
 
-	if (type != PSTORE_TYPE_DMESG)
+	if (record->type != PSTORE_TYPE_DMESG)
 		return -EINVAL;
 
-	/* Out of the various dmesg dump types, ramoops is currently designed
+	/*
+	 * Out of the various dmesg dump types, ramoops is currently designed
 	 * to only store crash logs, rather than storing general kernel logs.
 	 */
-	if (reason != KMSG_DUMP_OOPS &&
-	    reason != KMSG_DUMP_PANIC)
+	if (record->reason != KMSG_DUMP_OOPS &&
+	    record->reason != KMSG_DUMP_PANIC)
 		return -EINVAL;
 
 	/* Skip Oopes when configured to do so. */
-	if (reason == KMSG_DUMP_OOPS && !cxt->dump_oops)
+	if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops)
 		return -EINVAL;
 
-	/* Explicitly only take the first part of any new crash.
+	/*
+	 * Explicitly only take the first part of any new crash.
 	 * If our buffer is larger than kmsg_bytes, this can never happen,
 	 * and if our buffer is smaller than kmsg_bytes, we don't want the
 	 * report split across multiple records.
 	 */
-	if (part != 1)
+	if (record->part != 1)
 		return -ENOSPC;
 
 	if (!cxt->dprzs)
@@ -436,53 +439,50 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
 
 	prz = cxt->dprzs[cxt->dump_write_cnt];
 
-	hlen = ramoops_write_kmsg_hdr(prz, compressed);
+	/* Build header and append record contents. */
+	hlen = ramoops_write_kmsg_hdr(prz, record->compressed);
+	size = record->size;
 	if (size + hlen > prz->buffer_size)
 		size = prz->buffer_size - hlen;
-	persistent_ram_write(prz, buf, size);
+	persistent_ram_write(prz, record->buf, size);
 
 	cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt;
 
 	return 0;
 }
 
-static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type,
-						 enum kmsg_dump_reason reason,
-						 u64 *id, unsigned int part,
-						 const char __user *buf,
-						 bool compressed, size_t size,
-						 struct pstore_info *psi)
+static int notrace ramoops_pstore_write_user(struct pstore_record *record,
+					     const char __user *buf)
 {
-	if (type == PSTORE_TYPE_PMSG) {
-		struct ramoops_context *cxt = psi->data;
+	if (record->type == PSTORE_TYPE_PMSG) {
+		struct ramoops_context *cxt = record->psi->data;
 
 		if (!cxt->mprz)
 			return -ENOMEM;
-		return persistent_ram_write_user(cxt->mprz, buf, size);
+		return persistent_ram_write_user(cxt->mprz, buf, record->size);
 	}
 
 	return -EINVAL;
 }
 
-static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
-				struct timespec time, struct pstore_info *psi)
+static int ramoops_pstore_erase(struct pstore_record *record)
 {
-	struct ramoops_context *cxt = psi->data;
+	struct ramoops_context *cxt = record->psi->data;
 	struct persistent_ram_zone *prz;
 
-	switch (type) {
+	switch (record->type) {
 	case PSTORE_TYPE_DMESG:
-		if (id >= cxt->max_dump_cnt)
+		if (record->id >= cxt->max_dump_cnt)
 			return -EINVAL;
-		prz = cxt->dprzs[id];
+		prz = cxt->dprzs[record->id];
 		break;
 	case PSTORE_TYPE_CONSOLE:
 		prz = cxt->cprz;
 		break;
 	case PSTORE_TYPE_FTRACE:
-		if (id >= cxt->max_ftrace_cnt)
+		if (record->id >= cxt->max_ftrace_cnt)
 			return -EINVAL;
-		prz = cxt->fprzs[id];
+		prz = cxt->fprzs[record->id];
 		break;
 	case PSTORE_TYPE_PMSG:
 		prz = cxt->mprz;
@@ -503,8 +503,8 @@ static struct ramoops_context oops_cxt = {
 		.name	= "ramoops",
 		.open	= ramoops_pstore_open,
 		.read	= ramoops_pstore_read,
-		.write_buf	= ramoops_pstore_write_buf,
-		.write_buf_user	= ramoops_pstore_write_buf_user,
+		.write	= ramoops_pstore_write,
+		.write_user	= ramoops_pstore_write_user,
 		.erase	= ramoops_pstore_erase,
 	},
 };
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index bc927e30bdcc..e11672aa4575 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -532,7 +532,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
 	}
 
 	/* Initialize general buffer state. */
-	prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock);
+	raw_spin_lock_init(&prz->buffer_lock);
 	prz->flags = flags;
 
 	ret = persistent_ram_buffer_map(start, size, prz, memtype);
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 74b489e3714d..ebf80c7739e1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2188,8 +2188,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
 		/* This can happen when suspending quotas on remount-ro... */
 		if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) {
 			inode_lock(toputinode[cnt]);
-			toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
-				  S_NOATIME | S_NOQUOTA);
+			toputinode[cnt]->i_flags &= ~S_NOQUOTA;
 			truncate_inode_pages(&toputinode[cnt]->i_data, 0);
 			inode_unlock(toputinode[cnt]);
 			mark_inode_dirty_sync(toputinode[cnt]);
@@ -2237,7 +2236,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 	struct super_block *sb = inode->i_sb;
 	struct quota_info *dqopt = sb_dqopt(sb);
 	int error;
-	int oldflags = -1;
 
 	if (!fmt)
 		return -ESRCH;
@@ -2285,9 +2283,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		 * possible) Also nobody should write to the file - we use
 		 * special IO operations which ignore the immutable bit. */
 		inode_lock(inode);
-		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
-					     S_NOQUOTA);
-		inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+		inode->i_flags |= S_NOQUOTA;
 		inode_unlock(inode);
 		/*
 		 * When S_NOQUOTA is set, remove dquot references as no more
@@ -2329,14 +2325,9 @@ out_file_init:
 	dqopt->files[type] = NULL;
 	iput(inode);
 out_file_flags:
-	if (oldflags != -1) {
-		inode_lock(inode);
-		/* Set the flags back (in the case of accidental quotaon()
-		 * on a wrong file we don't want to mess up the flags) */
-		inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
-		inode->i_flags |= oldflags;
-		inode_unlock(inode);
-	}
+	inode_lock(inode);
+	inode->i_flags &= ~S_NOQUOTA;
+	inode_unlock(inode);
 out_fmt:
 	put_quota_format(fmt);
 
@@ -2780,18 +2771,6 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii)
 }
 EXPORT_SYMBOL(dquot_set_dqinfo);
 
-const struct quotactl_ops dquot_quotactl_ops = {
-	.quota_on	= dquot_quota_on,
-	.quota_off	= dquot_quota_off,
-	.quota_sync	= dquot_quota_sync,
-	.get_state	= dquot_get_state,
-	.set_info	= dquot_set_dqinfo,
-	.get_dqblk	= dquot_get_dqblk,
-	.get_nextdqblk	= dquot_get_next_dqblk,
-	.set_dqblk	= dquot_set_dqblk
-};
-EXPORT_SYMBOL(dquot_quotactl_ops);
-
 const struct quotactl_ops dquot_quotactl_sysfile_ops = {
 	.quota_enable	= dquot_quota_enable,
 	.quota_disable	= dquot_quota_disable,
diff --git a/fs/read_write.c b/fs/read_write.c
index c4f88afbc67f..47c1d4484df9 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -841,6 +841,81 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_COMPAT
+ssize_t compat_rw_copy_check_uvector(int type,
+		const struct compat_iovec __user *uvector, unsigned long nr_segs,
+		unsigned long fast_segs, struct iovec *fast_pointer,
+		struct iovec **ret_pointer)
+{
+	compat_ssize_t tot_len;
+	struct iovec *iov = *ret_pointer = fast_pointer;
+	ssize_t ret = 0;
+	int seg;
+
+	/*
+	 * SuS says "The readv() function *may* fail if the iovcnt argument
+	 * was less than or equal to 0, or greater than {IOV_MAX}."  Linux has
+	 * traditionally returned zero for zero segments, so...
+	 */
+	if (nr_segs == 0)
+		goto out;
+
+	ret = -EINVAL;
+	if (nr_segs > UIO_MAXIOV)
+		goto out;
+	if (nr_segs > fast_segs) {
+		ret = -ENOMEM;
+		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+		if (iov == NULL)
+			goto out;
+	}
+	*ret_pointer = iov;
+
+	ret = -EFAULT;
+	if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
+		goto out;
+
+	/*
+	 * Single UNIX Specification:
+	 * We should return -EINVAL if an element length is negative or does
+	 * not fit in an ssize_t.
+	 *
+	 * In Linux, the total length is limited to MAX_RW_COUNT, so there is
+	 * no overflow possibility.
+	 */
+	tot_len = 0;
+	ret = -EINVAL;
+	for (seg = 0; seg < nr_segs; seg++) {
+		compat_uptr_t buf;
+		compat_ssize_t len;
+
+		if (__get_user(len, &uvector->iov_len) ||
+		   __get_user(buf, &uvector->iov_base)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (len < 0)	/* size_t not fitting in compat_ssize_t .. */
+			goto out;
+		if (type >= 0 &&
+		    !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		if (len > MAX_RW_COUNT - tot_len)
+			len = MAX_RW_COUNT - tot_len;
+		tot_len += len;
+		iov->iov_base = compat_ptr(buf);
+		iov->iov_len = (compat_size_t) len;
+		uvector++;
+		iov++;
+	}
+	ret = tot_len;
+
+out:
+	return ret;
+}
+#endif
+
 static ssize_t __do_readv_writev(int type, struct file *file,
 				 struct iov_iter *iter, loff_t *pos, int flags)
 {
diff --git a/fs/readdir.c b/fs/readdir.c
index 0e8a7f355f7a..89659549c09d 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -18,6 +18,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/unistd.h>
+#include <linux/compat.h>
 
 #include <linux/uaccess.h>
 
@@ -324,3 +325,167 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 	fdput_pos(f);
 	return error;
 }
+
+#ifdef CONFIG_COMPAT
+struct compat_old_linux_dirent {
+	compat_ulong_t	d_ino;
+	compat_ulong_t	d_offset;
+	unsigned short	d_namlen;
+	char		d_name[1];
+};
+
+struct compat_readdir_callback {
+	struct dir_context ctx;
+	struct compat_old_linux_dirent __user *dirent;
+	int result;
+};
+
+static int compat_fillonedir(struct dir_context *ctx, const char *name,
+			     int namlen, loff_t offset, u64 ino,
+			     unsigned int d_type)
+{
+	struct compat_readdir_callback *buf =
+		container_of(ctx, struct compat_readdir_callback, ctx);
+	struct compat_old_linux_dirent __user *dirent;
+	compat_ulong_t d_ino;
+
+	if (buf->result)
+		return -EINVAL;
+	d_ino = ino;
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->result = -EOVERFLOW;
+		return -EOVERFLOW;
+	}
+	buf->result++;
+	dirent = buf->dirent;
+	if (!access_ok(VERIFY_WRITE, dirent,
+			(unsigned long)(dirent->d_name + namlen + 1) -
+				(unsigned long)dirent))
+		goto efault;
+	if (	__put_user(d_ino, &dirent->d_ino) ||
+		__put_user(offset, &dirent->d_offset) ||
+		__put_user(namlen, &dirent->d_namlen) ||
+		__copy_to_user(dirent->d_name, name, namlen) ||
+		__put_user(0, dirent->d_name + namlen))
+		goto efault;
+	return 0;
+efault:
+	buf->result = -EFAULT;
+	return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
+		struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
+{
+	int error;
+	struct fd f = fdget_pos(fd);
+	struct compat_readdir_callback buf = {
+		.ctx.actor = compat_fillonedir,
+		.dirent = dirent
+	};
+
+	if (!f.file)
+		return -EBADF;
+
+	error = iterate_dir(f.file, &buf.ctx);
+	if (buf.result)
+		error = buf.result;
+
+	fdput_pos(f);
+	return error;
+}
+
+struct compat_linux_dirent {
+	compat_ulong_t	d_ino;
+	compat_ulong_t	d_off;
+	unsigned short	d_reclen;
+	char		d_name[1];
+};
+
+struct compat_getdents_callback {
+	struct dir_context ctx;
+	struct compat_linux_dirent __user *current_dir;
+	struct compat_linux_dirent __user *previous;
+	int count;
+	int error;
+};
+
+static int compat_filldir(struct dir_context *ctx, const char *name, int namlen,
+		loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct compat_linux_dirent __user * dirent;
+	struct compat_getdents_callback *buf =
+		container_of(ctx, struct compat_getdents_callback, ctx);
+	compat_ulong_t d_ino;
+	int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) +
+		namlen + 2, sizeof(compat_long_t));
+
+	buf->error = -EINVAL;	/* only used if we fail.. */
+	if (reclen > buf->count)
+		return -EINVAL;
+	d_ino = ino;
+	if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) {
+		buf->error = -EOVERFLOW;
+		return -EOVERFLOW;
+	}
+	dirent = buf->previous;
+	if (dirent) {
+		if (signal_pending(current))
+			return -EINTR;
+		if (__put_user(offset, &dirent->d_off))
+			goto efault;
+	}
+	dirent = buf->current_dir;
+	if (__put_user(d_ino, &dirent->d_ino))
+		goto efault;
+	if (__put_user(reclen, &dirent->d_reclen))
+		goto efault;
+	if (copy_to_user(dirent->d_name, name, namlen))
+		goto efault;
+	if (__put_user(0, dirent->d_name + namlen))
+		goto efault;
+	if (__put_user(d_type, (char  __user *) dirent + reclen - 1))
+		goto efault;
+	buf->previous = dirent;
+	dirent = (void __user *)dirent + reclen;
+	buf->current_dir = dirent;
+	buf->count -= reclen;
+	return 0;
+efault:
+	buf->error = -EFAULT;
+	return -EFAULT;
+}
+
+COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
+		struct compat_linux_dirent __user *, dirent, unsigned int, count)
+{
+	struct fd f;
+	struct compat_linux_dirent __user * lastdirent;
+	struct compat_getdents_callback buf = {
+		.ctx.actor = compat_filldir,
+		.current_dir = dirent,
+		.count = count
+	};
+	int error;
+
+	if (!access_ok(VERIFY_WRITE, dirent, count))
+		return -EFAULT;
+
+	f = fdget_pos(fd);
+	if (!f.file)
+		return -EBADF;
+
+	error = iterate_dir(f.file, &buf.ctx);
+	if (error >= 0)
+		error = buf.error;
+	lastdirent = buf.previous;
+	if (lastdirent) {
+		if (put_user(buf.ctx.pos, &lastdirent->d_off))
+			error = -EFAULT;
+		else
+			error = count - buf.count;
+	}
+	fdput_pos(f);
+	return error;
+}
+#endif
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a6ab9d64ea1b..873fc04e9403 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1375,7 +1375,6 @@ static void init_inode(struct inode *inode, struct treepath *path)
 static void inode2sd(void *sd, struct inode *inode, loff_t size)
 {
 	struct stat_data *sd_v2 = (struct stat_data *)sd;
-	__u16 flags;
 
 	set_sd_v2_mode(sd_v2, inode->i_mode);
 	set_sd_v2_nlink(sd_v2, inode->i_nlink);
@@ -1390,9 +1389,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size)
 		set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
 	else
 		set_sd_v2_generation(sd_v2, inode->i_generation);
-	flags = REISERFS_I(inode)->i_attrs;
-	i_attrs_to_sd_attrs(inode, &flags);
-	set_sd_v2_attrs(sd_v2, flags);
+	set_sd_v2_attrs(sd_v2, REISERFS_I(inode)->i_attrs);
 }
 
 /* used to copy inode's fields to old stat data */
@@ -2002,10 +1999,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
 
 	/* uid and gid must already be set by the caller for quota init */
 
-	/* symlink cannot be immutable or append only, right? */
-	if (S_ISLNK(inode->i_mode))
-		inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
-
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	inode->i_size = i_size;
 	inode->i_blocks = 0;
@@ -3095,28 +3088,6 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
 	}
 }
 
-void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
-{
-	if (reiserfs_attrs(inode->i_sb)) {
-		if (inode->i_flags & S_IMMUTABLE)
-			*sd_attrs |= REISERFS_IMMUTABLE_FL;
-		else
-			*sd_attrs &= ~REISERFS_IMMUTABLE_FL;
-		if (inode->i_flags & S_SYNC)
-			*sd_attrs |= REISERFS_SYNC_FL;
-		else
-			*sd_attrs &= ~REISERFS_SYNC_FL;
-		if (inode->i_flags & S_NOATIME)
-			*sd_attrs |= REISERFS_NOATIME_FL;
-		else
-			*sd_attrs &= ~REISERFS_NOATIME_FL;
-		if (REISERFS_I(inode)->i_flags & i_nopack_mask)
-			*sd_attrs |= REISERFS_NOTAIL_FL;
-		else
-			*sd_attrs &= ~REISERFS_NOTAIL_FL;
-	}
-}
-
 /*
  * decide if this buffer needs to stay around for data logging or ordered
  * write purposes
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 1f4692a505a0..acbbaf7a0bb2 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -47,7 +47,6 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		}
 
 		flags = REISERFS_I(inode)->i_attrs;
-		i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
 		err = put_user(flags, (int __user *)arg);
 		break;
 	case REISERFS_IOC_SETFLAGS:{
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index aa40c242f1db..da01f497180a 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1961,7 +1961,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
 	 * will be requeued because superblock is being shutdown and doesn't
 	 * have MS_ACTIVE set.
 	 */
-	cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
+	reiserfs_cancel_old_flush(sb);
 	/* wait for all commits to finish */
 	cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
 
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 249594a821e0..f5cebd70d903 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -475,7 +475,7 @@ static void leaf_item_bottle(struct buffer_info *dest_bi,
 			 * 'cpy_bytes'; create new item header;
 			 * n_ih = new item_header;
 			 */
-			memcpy(&n_ih, ih, SHORT_KEY_SIZE);
+			memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE);
 
 			/* Endian safe, both le */
 			n_ih.ih_version = ih->ih_version;
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 2adcde137c3f..1d34377fef97 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -1326,7 +1326,6 @@ struct cpu_key {
 #define KEY_NOT_FOUND 0
 
 #define KEY_SIZE (sizeof(struct reiserfs_key))
-#define SHORT_KEY_SIZE (sizeof (__u32) + sizeof (__u32))
 
 /* return values for search_by_key and clones */
 #define ITEM_FOUND 1
@@ -2949,6 +2948,7 @@ int reiserfs_allocate_list_bitmaps(struct super_block *s,
 				   struct reiserfs_list_bitmap *, unsigned int);
 
 void reiserfs_schedule_old_flush(struct super_block *s);
+void reiserfs_cancel_old_flush(struct super_block *s);
 void add_save_link(struct reiserfs_transaction_handle *th,
 		   struct inode *inode, int truncate);
 int remove_save_link(struct inode *inode, int truncate);
@@ -3099,7 +3099,6 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
 }
 
 void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
-void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs);
 int reiserfs_setattr(struct dentry *dentry, struct iattr *attr);
 
 int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index feabcde0290d..685f1e056998 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -89,11 +89,27 @@ static void flush_old_commits(struct work_struct *work)
 	sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
 	s = sbi->s_journal->j_work_sb;
 
+	/*
+	 * We need s_umount for protecting quota writeback. We have to use
+	 * trylock as reiserfs_cancel_old_flush() may be waiting for this work
+	 * to complete with s_umount held.
+	 */
+	if (!down_read_trylock(&s->s_umount)) {
+		/* Requeue work if we are not cancelling it */
+		spin_lock(&sbi->old_work_lock);
+		if (sbi->work_queued == 1)
+			queue_delayed_work(system_long_wq, &sbi->old_work, HZ);
+		spin_unlock(&sbi->old_work_lock);
+		return;
+	}
 	spin_lock(&sbi->old_work_lock);
-	sbi->work_queued = 0;
+	/* Avoid clobbering the cancel state... */
+	if (sbi->work_queued == 1)
+		sbi->work_queued = 0;
 	spin_unlock(&sbi->old_work_lock);
 
 	reiserfs_sync_fs(s, 1);
+	up_read(&s->s_umount);
 }
 
 void reiserfs_schedule_old_flush(struct super_block *s)
@@ -117,21 +133,22 @@ void reiserfs_schedule_old_flush(struct super_block *s)
 	spin_unlock(&sbi->old_work_lock);
 }
 
-static void cancel_old_flush(struct super_block *s)
+void reiserfs_cancel_old_flush(struct super_block *s)
 {
 	struct reiserfs_sb_info *sbi = REISERFS_SB(s);
 
-	cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
 	spin_lock(&sbi->old_work_lock);
-	sbi->work_queued = 0;
+	/* Make sure no new flushes will be queued */
+	sbi->work_queued = 2;
 	spin_unlock(&sbi->old_work_lock);
+	cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
 }
 
 static int reiserfs_freeze(struct super_block *s)
 {
 	struct reiserfs_transaction_handle th;
 
-	cancel_old_flush(s);
+	reiserfs_cancel_old_flush(s);
 
 	reiserfs_write_lock(s);
 	if (!(s->s_flags & MS_RDONLY)) {
@@ -152,7 +169,13 @@ static int reiserfs_freeze(struct super_block *s)
 
 static int reiserfs_unfreeze(struct super_block *s)
 {
+	struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+
 	reiserfs_allow_writes(s);
+	spin_lock(&sbi->old_work_lock);
+	/* Allow old_work to run again */
+	sbi->work_queued = 0;
+	spin_unlock(&sbi->old_work_lock);
 	return 0;
 }
 
@@ -547,12 +570,28 @@ static void reiserfs_kill_sb(struct super_block *s)
 	kill_block_super(s);
 }
 
+#ifdef CONFIG_QUOTA
+static int reiserfs_quota_off(struct super_block *sb, int type);
+
+static void reiserfs_quota_off_umount(struct super_block *s)
+{
+	int type;
+
+	for (type = 0; type < REISERFS_MAXQUOTAS; type++)
+		reiserfs_quota_off(s, type);
+}
+#else
+static inline void reiserfs_quota_off_umount(struct super_block *s)
+{
+}
+#endif
+
 static void reiserfs_put_super(struct super_block *s)
 {
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
-	dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+	reiserfs_quota_off_umount(s);
 
 	reiserfs_write_lock(s);
 
@@ -817,7 +856,7 @@ static const struct dquot_operations reiserfs_quota_operations = {
 
 static const struct quotactl_ops reiserfs_qctl_operations = {
 	.quota_on = reiserfs_quota_on,
-	.quota_off = dquot_quota_off,
+	.quota_off = reiserfs_quota_off,
 	.quota_sync = dquot_quota_sync,
 	.get_state = dquot_get_state,
 	.set_info = dquot_set_dqinfo,
@@ -2194,7 +2233,7 @@ error_unlocked:
 	if (sbi->commit_wq)
 		destroy_workqueue(sbi->commit_wq);
 
-	cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
+	reiserfs_cancel_old_flush(s);
 
 	reiserfs_free_bitmap_cache(s);
 	if (SB_BUFFER_WITH_SB(s))
@@ -2405,12 +2444,47 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 			goto out;
 	}
 	reiserfs_write_unlock(sb);
-	return dquot_quota_on(sb, type, format_id, path);
+	err = dquot_quota_on(sb, type, format_id, path);
+	if (!err) {
+		inode_lock(inode);
+		REISERFS_I(inode)->i_attrs |= REISERFS_IMMUTABLE_FL |
+					      REISERFS_NOATIME_FL;
+		inode_set_flags(inode, S_IMMUTABLE | S_NOATIME,
+				S_IMMUTABLE | S_NOATIME);
+		inode_unlock(inode);
+		mark_inode_dirty(inode);
+	}
+	return err;
 out:
 	reiserfs_write_unlock(sb);
 	return err;
 }
 
+static int reiserfs_quota_off(struct super_block *sb, int type)
+{
+	int err;
+	struct inode *inode = sb_dqopt(sb)->files[type];
+
+	if (!inode || !igrab(inode))
+		goto out;
+
+	err = dquot_quota_off(sb, type);
+	if (err)
+		goto out_put;
+
+	inode_lock(inode);
+	REISERFS_I(inode)->i_attrs &= ~(REISERFS_IMMUTABLE_FL |
+					REISERFS_NOATIME_FL);
+	inode_set_flags(inode, 0, S_IMMUTABLE | S_NOATIME);
+	inode_unlock(inode);
+	mark_inode_dirty(inode);
+out_put:
+	iput(inode);
+	return err;
+out:
+	return dquot_quota_off(sb, type);
+}
+
 /*
  * Read data from quotafile - avoid pagecache and such because we cannot afford
  * acquiring the locks... As quota files are never truncated and quota code
diff --git a/fs/select.c b/fs/select.c
index e2112270d75a..bd4b2ccfd346 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -338,6 +338,53 @@ sticky:
 	return ret;
 }
 
+/*
+ * Scalable version of the fd_set.
+ */
+
+typedef struct {
+	unsigned long *in, *out, *ex;
+	unsigned long *res_in, *res_out, *res_ex;
+} fd_set_bits;
+
+/*
+ * How many longwords for "nr" bits?
+ */
+#define FDS_BITPERLONG	(8*sizeof(long))
+#define FDS_LONGS(nr)	(((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
+#define FDS_BYTES(nr)	(FDS_LONGS(nr)*sizeof(long))
+
+/*
+ * The user fd_set is only read here, but note that set_fd_set() writes
+ * to it later using __copy_to_user().
+ *
+ * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
+ */
+static inline
+int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
+{
+	nr = FDS_BYTES(nr);
+	if (ufdset)
+		return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0;
+
+	memset(fdset, 0, nr);
+	return 0;
+}
+
+static inline unsigned long __must_check
+set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
+{
+	if (ufdset)
+		return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
+	return 0;
+}
+
+static inline
+void zero_fd_set(unsigned long nr, unsigned long *fdset)
+{
+	memset(fdset, 0, FDS_BYTES(nr));
+}
+
 #define FDS_IN(fds, n)		(fds->in + n)
 #define FDS_OUT(fds, n)		(fds->out + n)
 #define FDS_EX(fds, n)		(fds->ex + n)
@@ -401,7 +448,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
 		wait->_key |= POLLOUT_SET;
 }
 
-int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
+static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 {
 	ktime_t expire, *to = NULL;
 	struct poll_wqueues table;
@@ -409,7 +456,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 	int retval, i, timed_out = 0;
 	u64 slack = 0;
 	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
-	unsigned long busy_end = 0;
+	unsigned long busy_start = 0;
 
 	rcu_read_lock();
 	retval = max_select_fd(n, fds);
@@ -512,11 +559,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 
 		/* only if found POLL_BUSY_LOOP sockets && not out of time */
 		if (can_busy_loop && !need_resched()) {
-			if (!busy_end) {
-				busy_end = busy_loop_end_time();
+			if (!busy_start) {
+				busy_start = busy_loop_current_time();
 				continue;
 			}
-			if (!busy_loop_timeout(busy_end))
+			if (!busy_loop_timeout(busy_start))
 				continue;
 		}
 		busy_flag = 0;
@@ -800,7 +847,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
 	int timed_out = 0, count = 0;
 	u64 slack = 0;
 	unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
-	unsigned long busy_end = 0;
+	unsigned long busy_start = 0;
 
 	/* Optimise the no-wait case */
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
@@ -853,11 +900,11 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
 
 		/* only if found POLL_BUSY_LOOP sockets && not out of time */
 		if (can_busy_loop && !need_resched()) {
-			if (!busy_end) {
-				busy_end = busy_loop_end_time();
+			if (!busy_start) {
+				busy_start = busy_loop_current_time();
 				continue;
 			}
-			if (!busy_loop_timeout(busy_end))
+			if (!busy_loop_timeout(busy_start))
 				continue;
 		}
 		busy_flag = 0;
@@ -881,7 +928,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
 #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
 			sizeof(struct pollfd))
 
-int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
+static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 		struct timespec64 *end_time)
 {
 	struct poll_wqueues table;
@@ -1053,3 +1100,373 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
 
 	return ret;
 }
+
+#ifdef CONFIG_COMPAT
+#define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))
+
+static
+int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p,
+				      int timeval, int ret)
+{
+	struct timespec ts;
+
+	if (!p)
+		return ret;
+
+	if (current->personality & STICKY_TIMEOUTS)
+		goto sticky;
+
+	/* No update for zero timeout */
+	if (!end_time->tv_sec && !end_time->tv_nsec)
+		return ret;
+
+	ktime_get_ts(&ts);
+	ts = timespec_sub(*end_time, ts);
+	if (ts.tv_sec < 0)
+		ts.tv_sec = ts.tv_nsec = 0;
+
+	if (timeval) {
+		struct compat_timeval rtv;
+
+		rtv.tv_sec = ts.tv_sec;
+		rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC;
+
+		if (!copy_to_user(p, &rtv, sizeof(rtv)))
+			return ret;
+	} else {
+		struct compat_timespec rts;
+
+		rts.tv_sec = ts.tv_sec;
+		rts.tv_nsec = ts.tv_nsec;
+
+		if (!copy_to_user(p, &rts, sizeof(rts)))
+			return ret;
+	}
+	/*
+	 * If an application puts its timeval in read-only memory, we
+	 * don't want the Linux-specific update to the timeval to
+	 * cause a fault after the select has completed
+	 * successfully. However, because we're not updating the
+	 * timeval, we can't restart the system call.
+	 */
+
+sticky:
+	if (ret == -ERESTARTNOHAND)
+		ret = -EINTR;
+	return ret;
+}
+
+/*
+ * Ooo, nasty.  We need to pack pairs of 32-bit compat unsigned longs
+ * from userspace into 64-bit kernel unsigned longs.
+ */
+static
+int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
+			unsigned long *fdset)
+{
+	nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
+	if (ufdset) {
+		unsigned long odd;
+
+		if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t)))
+			return -EFAULT;
+
+		odd = nr & 1UL;
+		nr &= ~1UL;
+		while (nr) {
+			unsigned long h, l;
+			if (__get_user(l, ufdset) || __get_user(h, ufdset+1))
+				return -EFAULT;
+			ufdset += 2;
+			*fdset++ = h << 32 | l;
+			nr -= 2;
+		}
+		if (odd && __get_user(*fdset, ufdset))
+			return -EFAULT;
+	} else {
+		/* Tricky: the final unsigned long of the kernel fdset
+		 * must be cleared in full, so round the compat word
+		 * count up to an even number before clearing.
+		 */
+		memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t));
+	}
+	return 0;
+}
+
+static
+int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
+		      unsigned long *fdset)
+{
+	unsigned long odd;
+	nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS);
+
+	if (!ufdset)
+		return 0;
+
+	odd = nr & 1UL;
+	nr &= ~1UL;
+	while (nr) {
+		unsigned long h, l;
+		l = *fdset++;
+		h = l >> 32;
+		if (__put_user(l, ufdset) || __put_user(h, ufdset+1))
+			return -EFAULT;
+		ufdset += 2;
+		nr -= 2;
+	}
+	if (odd && __put_user(*fdset, ufdset))
+		return -EFAULT;
+	return 0;
+}
+
+
+/*
+ * This is a virtual copy of sys_select earlier in this file and probably
+ * should be compared to it from time to time.
+ */
+
+/*
+ * We can actually return ERESTARTSYS instead of EINTR, but I'd
+ * like to be certain this leads to no problems. So I return
+ * EINTR just for safety.
+ *
+ * Update: ERESTARTSYS breaks at least the xview clock binary, so
+ * I'm trying ERESTARTNOHAND which restarts only when you want to.
+ */
+static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct timespec *end_time)
+{
+	fd_set_bits fds;
+	void *bits;
+	int size, max_fds, ret = -EINVAL;
+	struct fdtable *fdt;
+	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];
+
+	if (n < 0)
+		goto out_nofds;
+
+	/* max_fds can increase, so grab it once to avoid race */
+	rcu_read_lock();
+	fdt = files_fdtable(current->files);
+	max_fds = fdt->max_fds;
+	rcu_read_unlock();
+	if (n > max_fds)
+		n = max_fds;
+
+	/*
+	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing).
+	 * Since they are accessed as arrays of unsigned long, we need to
+	 * allocate memory in units of long-words.
+	 */
+	size = FDS_BYTES(n);
+	bits = stack_fds;
+	if (size > sizeof(stack_fds) / 6) {
+		bits = kmalloc(6 * size, GFP_KERNEL);
+		ret = -ENOMEM;
+		if (!bits)
+			goto out_nofds;
+	}
+	fds.in      = (unsigned long *)  bits;
+	fds.out     = (unsigned long *) (bits +   size);
+	fds.ex      = (unsigned long *) (bits + 2*size);
+	fds.res_in  = (unsigned long *) (bits + 3*size);
+	fds.res_out = (unsigned long *) (bits + 4*size);
+	fds.res_ex  = (unsigned long *) (bits + 5*size);
+
+	if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
+	    (ret = compat_get_fd_set(n, outp, fds.out)) ||
+	    (ret = compat_get_fd_set(n, exp, fds.ex)))
+		goto out;
+	zero_fd_set(n, fds.res_in);
+	zero_fd_set(n, fds.res_out);
+	zero_fd_set(n, fds.res_ex);
+
+	ret = do_select(n, &fds, end_time);
+
+	if (ret < 0)
+		goto out;
+	if (!ret) {
+		ret = -ERESTARTNOHAND;
+		if (signal_pending(current))
+			goto out;
+		ret = 0;
+	}
+
+	if (compat_set_fd_set(n, inp, fds.res_in) ||
+	    compat_set_fd_set(n, outp, fds.res_out) ||
+	    compat_set_fd_set(n, exp, fds.res_ex))
+		ret = -EFAULT;
+out:
+	if (bits != stack_fds)
+		kfree(bits);
+out_nofds:
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
+	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+	struct compat_timeval __user *, tvp)
+{
+	struct timespec end_time, *to = NULL;
+	struct compat_timeval tv;
+	int ret;
+
+	if (tvp) {
+		if (copy_from_user(&tv, tvp, sizeof(tv)))
+			return -EFAULT;
+
+		to = &end_time;
+		if (poll_select_set_timeout(to,
+				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
+				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
+			return -EINVAL;
+	}
+
+	ret = compat_core_sys_select(n, inp, outp, exp, to);
+	ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret);
+
+	return ret;
+}
+
+struct compat_sel_arg_struct {
+	compat_ulong_t n;
+	compat_uptr_t inp;
+	compat_uptr_t outp;
+	compat_uptr_t exp;
+	compat_uptr_t tvp;
+};
+
+COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
+{
+	struct compat_sel_arg_struct a;
+
+	if (copy_from_user(&a, arg, sizeof(a)))
+		return -EFAULT;
+	return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
+				 compat_ptr(a.exp), compat_ptr(a.tvp));
+}
+
+static long do_compat_pselect(int n, compat_ulong_t __user *inp,
+	compat_ulong_t __user *outp, compat_ulong_t __user *exp,
+	struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask,
+	compat_size_t sigsetsize)
+{
+	compat_sigset_t ss32;
+	sigset_t ksigmask, sigsaved;
+	struct compat_timespec ts;
+	struct timespec end_time, *to = NULL;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
+	}
+
+	if (sigmask) {
+		if (sigsetsize != sizeof(compat_sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+			return -EFAULT;
+		sigset_from_compat(&ksigmask, &ss32);
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = compat_core_sys_select(n, inp, outp, exp, to);
+	ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
+
+	if (ret == -ERESTARTNOHAND) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+					sizeof(sigsaved));
+			set_restore_sigmask();
+		}
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	return ret;
+}
+
+COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp,
+	compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
+	struct compat_timespec __user *, tsp, void __user *, sig)
+{
+	compat_size_t sigsetsize = 0;
+	compat_uptr_t up = 0;
+
+	if (sig) {
+		if (!access_ok(VERIFY_READ, sig,
+				sizeof(compat_uptr_t)+sizeof(compat_size_t)) ||
+		    	__get_user(up, (compat_uptr_t __user *)sig) ||
+		    	__get_user(sigsetsize,
+				(compat_size_t __user *)(sig+sizeof(up))))
+			return -EFAULT;
+	}
+	return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up),
+				 sigsetsize);
+}
+
+COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds,
+	unsigned int,  nfds, struct compat_timespec __user *, tsp,
+	const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
+{
+	compat_sigset_t ss32;
+	sigset_t ksigmask, sigsaved;
+	struct compat_timespec ts;
+	struct timespec end_time, *to = NULL;
+	int ret;
+
+	if (tsp) {
+		if (copy_from_user(&ts, tsp, sizeof(ts)))
+			return -EFAULT;
+
+		to = &end_time;
+		if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec))
+			return -EINVAL;
+	}
+
+	if (sigmask) {
+		if (sigsetsize != sizeof(compat_sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
+			return -EFAULT;
+		sigset_from_compat(&ksigmask, &ss32);
+
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	ret = do_sys_poll(ufds, nfds, to);
+
+	/* We can restart this syscall, usually */
+	if (ret == -EINTR) {
+		/*
+		 * Don't restore the signal mask yet. Let do_signal() deliver
+		 * the signal on the way back to userspace, before the signal
+		 * mask is restored.
+		 */
+		if (sigmask) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+				sizeof(sigsaved));
+			set_restore_sigmask();
+		}
+		ret = -ERESTARTNOHAND;
+	} else if (sigmask)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret);
+
+	return ret;
+}
+#endif
diff --git a/fs/splice.c b/fs/splice.c
index 006ba50f4ece..540c4a44756c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -247,11 +247,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 }
 EXPORT_SYMBOL(add_to_pipe);
 
-void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
-{
-	put_page(spd->pages[i]);
-}
-
 /*
  * Check if we need to grow the arrays holding pages and partial page
  * descriptions.
@@ -393,7 +388,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	struct iov_iter to;
 	struct page **pages;
 	unsigned int nr_pages;
-	size_t offset, dummy, copied = 0;
+	size_t offset, base, copied = 0;
 	ssize_t res;
 	int i;
 
@@ -408,12 +403,11 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 
 	iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset);
 
-	res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy);
+	res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base);
 	if (res <= 0)
 		return -ENOMEM;
 
-	BUG_ON(dummy);
-	nr_pages = DIV_ROUND_UP(res, PAGE_SIZE);
+	nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE);
 
 	vec = __vec;
 	if (nr_pages > PIPE_DEF_BUFFERS) {
@@ -1359,6 +1353,8 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
 	struct fd f;
 	long error;
 
+	if (unlikely(flags & ~SPLICE_F_ALL))
+		return -EINVAL;
 	if (unlikely(nr_segs > UIO_MAXIOV))
 		return -EINVAL;
 	else if (unlikely(!nr_segs))
@@ -1409,6 +1405,9 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
 	if (unlikely(!len))
 		return 0;
 
+	if (unlikely(flags & ~SPLICE_F_ALL))
+		return -EINVAL;
+
 	error = -EBADF;
 	in = fdget(fd_in);
 	if (in.file) {
@@ -1737,6 +1736,9 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
 	struct fd in;
 	int error;
 
+	if (unlikely(flags & ~SPLICE_F_ALL))
+		return -EINVAL;
+
 	if (unlikely(!len))
 		return 0;
 
diff --git a/fs/stat.c b/fs/stat.c
index a257b872a53d..f494b182c7c7 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -15,6 +15,7 @@
 #include <linux/cred.h>
 #include <linux/syscalls.h>
 #include <linux/pagemap.h>
+#include <linux/compat.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -575,6 +576,91 @@ SYSCALL_DEFINE5(statx,
 	return cp_statx(&stat, buffer);
 }
 
+#ifdef CONFIG_COMPAT
+static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
+{
+	struct compat_stat tmp;
+
+	if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+		return -EOVERFLOW;
+
+	memset(&tmp, 0, sizeof(tmp));
+	tmp.st_dev = old_encode_dev(stat->dev);
+	tmp.st_ino = stat->ino;
+	if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
+		return -EOVERFLOW;
+	tmp.st_mode = stat->mode;
+	tmp.st_nlink = stat->nlink;
+	if (tmp.st_nlink != stat->nlink)
+		return -EOVERFLOW;
+	SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
+	SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
+	tmp.st_rdev = old_encode_dev(stat->rdev);
+	if ((u64) stat->size > MAX_NON_LFS)
+		return -EOVERFLOW;
+	tmp.st_size = stat->size;
+	tmp.st_atime = stat->atime.tv_sec;
+	tmp.st_atime_nsec = stat->atime.tv_nsec;
+	tmp.st_mtime = stat->mtime.tv_sec;
+	tmp.st_mtime_nsec = stat->mtime.tv_nsec;
+	tmp.st_ctime = stat->ctime.tv_sec;
+	tmp.st_ctime_nsec = stat->ctime.tv_nsec;
+	tmp.st_blocks = stat->blocks;
+	tmp.st_blksize = stat->blksize;
+	return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0;
+}
+
+COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
+		       struct compat_stat __user *, statbuf)
+{
+	struct kstat stat;
+	int error;
+
+	error = vfs_stat(filename, &stat);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
+}
+
+COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
+		       struct compat_stat __user *, statbuf)
+{
+	struct kstat stat;
+	int error;
+
+	error = vfs_lstat(filename, &stat);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
+}
+
+#ifndef __ARCH_WANT_STAT64
+COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
+		       const char __user *, filename,
+		       struct compat_stat __user *, statbuf, int, flag)
+{
+	struct kstat stat;
+	int error;
+
+	error = vfs_fstatat(dfd, filename, &stat, flag);
+	if (error)
+		return error;
+	return cp_compat_stat(&stat, statbuf);
+}
+#endif
+
+COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
+		       struct compat_stat __user *, statbuf)
+{
+	struct kstat stat;
+	int error = vfs_fstat(fd, &stat);
+
+	if (!error)
+		error = cp_compat_stat(&stat, statbuf);
+	return error;
+}
+#endif
+
 /* Caller is here responsible for sufficient locking (ie. inode->i_lock) */
 void __inode_add_bytes(struct inode *inode, loff_t bytes)
 {
diff --git a/fs/statfs.c b/fs/statfs.c
index 13ae259d4879..4e4623c7a126 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -7,6 +7,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/uaccess.h>
+#include <linux/compat.h>
 #include "internal.h"
 
 static int flags_by_mnt(int mnt_flags)
@@ -239,3 +240,142 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
 
 	return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
 }
+
+#ifdef CONFIG_COMPAT
+static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf)
+{
+	if (sizeof ubuf->f_blocks == 4) {
+		if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail |
+		     kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL)
+			return -EOVERFLOW;
+		/* f_files and f_ffree may be -1; it's okay
+		 * to stuff that into 32 bits */
+		if (kbuf->f_files != 0xffffffffffffffffULL
+		 && (kbuf->f_files & 0xffffffff00000000ULL))
+			return -EOVERFLOW;
+		if (kbuf->f_ffree != 0xffffffffffffffffULL
+		 && (kbuf->f_ffree & 0xffffffff00000000ULL))
+			return -EOVERFLOW;
+	}
+	if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
+	    __put_user(kbuf->f_type, &ubuf->f_type) ||
+	    __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
+	    __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
+	    __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
+	    __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
+	    __put_user(kbuf->f_files, &ubuf->f_files) ||
+	    __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
+	    __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
+	    __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
+	    __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
+	    __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+	    __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+	    __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * The following compat statfs calls are copies of the native code in
+ * this file and should be checked against it from time to time
+ */
+COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf)
+{
+	struct kstatfs tmp;
+	int error = user_statfs(pathname, &tmp);
+	if (!error)
+		error = put_compat_statfs(buf, &tmp);
+	return error;
+}
+
+COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf)
+{
+	struct kstatfs tmp;
+	int error = fd_statfs(fd, &tmp);
+	if (!error)
+		error = put_compat_statfs(buf, &tmp);
+	return error;
+}
+
+static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf)
+{
+	if (sizeof(ubuf->f_bsize) == 4) {
+		if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen |
+		     kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL)
+			return -EOVERFLOW;
+		/* f_files and f_ffree may be -1; it's okay
+		 * to stuff that into 32 bits */
+		if (kbuf->f_files != 0xffffffffffffffffULL
+		 && (kbuf->f_files & 0xffffffff00000000ULL))
+			return -EOVERFLOW;
+		if (kbuf->f_ffree != 0xffffffffffffffffULL
+		 && (kbuf->f_ffree & 0xffffffff00000000ULL))
+			return -EOVERFLOW;
+	}
+	if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) ||
+	    __put_user(kbuf->f_type, &ubuf->f_type) ||
+	    __put_user(kbuf->f_bsize, &ubuf->f_bsize) ||
+	    __put_user(kbuf->f_blocks, &ubuf->f_blocks) ||
+	    __put_user(kbuf->f_bfree, &ubuf->f_bfree) ||
+	    __put_user(kbuf->f_bavail, &ubuf->f_bavail) ||
+	    __put_user(kbuf->f_files, &ubuf->f_files) ||
+	    __put_user(kbuf->f_ffree, &ubuf->f_ffree) ||
+	    __put_user(kbuf->f_namelen, &ubuf->f_namelen) ||
+	    __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) ||
+	    __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) ||
+	    __put_user(kbuf->f_frsize, &ubuf->f_frsize) ||
+	    __put_user(kbuf->f_flags, &ubuf->f_flags) ||
+	    __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare)))
+		return -EFAULT;
+	return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf)
+{
+	struct kstatfs tmp;
+	int error;
+
+	if (sz != sizeof(*buf))
+		return -EINVAL;
+
+	error = user_statfs(pathname, &tmp);
+	if (!error)
+		error = put_compat_statfs64(buf, &tmp);
+	return error;
+}
+
+COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf)
+{
+	struct kstatfs tmp;
+	int error;
+
+	if (sz != sizeof(*buf))
+		return -EINVAL;
+
+	error = fd_statfs(fd, &tmp);
+	if (!error)
+		error = put_compat_statfs64(buf, &tmp);
+	return error;
+}
+
+/*
+ * This is a copy of sys_ustat, just dealing with a structure layout.
+ * Given how simple this syscall is that approach is more maintainable
+ * than the various conversion hacks.
+ */
+COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u)
+{
+	struct compat_ustat tmp;
+	struct kstatfs sbuf;
+	int err = vfs_ustat(new_decode_dev(dev), &sbuf);
+	if (err)
+		return err;
+
+	memset(&tmp, 0, sizeof(struct compat_ustat));
+	tmp.f_tfree = sbuf.f_bfree;
+	tmp.f_tinode = sbuf.f_ffree;
+	if (copy_to_user(u, &tmp, sizeof(struct compat_ustat)))
+		return -EFAULT;
+	return 0;
+}
+#endif
diff --git a/fs/udf/file.c b/fs/udf/file.c
index e04cc0cdca9d..f5eb2d5b3bac 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -44,12 +44,12 @@ static void __udf_adinicb_readpage(struct page *page)
 	char *kaddr;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
-	kaddr = kmap(page);
+	kaddr = kmap_atomic(page);
 	memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size);
 	memset(kaddr + inode->i_size, 0, PAGE_SIZE - inode->i_size);
 	flush_dcache_page(page);
 	SetPageUptodate(page);
-	kunmap(page);
+	kunmap_atomic(kaddr);
 }
 
 static int udf_adinicb_readpage(struct file *file, struct page *page)
@@ -70,11 +70,11 @@ static int udf_adinicb_writepage(struct page *page,
 
 	BUG_ON(!PageLocked(page));
 
-	kaddr = kmap(page);
+	kaddr = kmap_atomic(page);
 	memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size);
-	mark_inode_dirty(inode);
 	SetPageUptodate(page);
-	kunmap(page);
+	kunmap_atomic(kaddr);
+	mark_inode_dirty(inode);
 	unlock_page(page);
 
 	return 0;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a8d8f71ef8bd..98c510e17203 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -276,14 +276,14 @@ int udf_expand_file_adinicb(struct inode *inode)
 		return -ENOMEM;
 
 	if (!PageUptodate(page)) {
-		kaddr = kmap(page);
+		kaddr = kmap_atomic(page);
 		memset(kaddr + iinfo->i_lenAlloc, 0x00,
 		       PAGE_SIZE - iinfo->i_lenAlloc);
 		memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr,
 			iinfo->i_lenAlloc);
 		flush_dcache_page(page);
 		SetPageUptodate(page);
-		kunmap(page);
+		kunmap_atomic(kaddr);
 	}
 	down_write(&iinfo->i_data_sem);
 	memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00,
@@ -300,11 +300,11 @@ int udf_expand_file_adinicb(struct inode *inode)
 	if (err) {
 		/* Restore everything back so that we don't lose data... */
 		lock_page(page);
-		kaddr = kmap(page);
 		down_write(&iinfo->i_data_sem);
+		kaddr = kmap_atomic(page);
 		memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
 		       inode->i_size);
-		kunmap(page);
+		kunmap_atomic(kaddr);
 		unlock_page(page);
 		iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
 		inode->i_data.a_ops = &udf_adinicb_aops;
@@ -1535,7 +1535,7 @@ reread:
 		inode->i_data.a_ops = &udf_symlink_aops;
 		inode->i_op = &udf_symlink_inode_operations;
 		inode_nohighmem(inode);
-		inode->i_mode = S_IFLNK | S_IRWXUGO;
+		inode->i_mode = S_IFLNK | 0777;
 		break;
 	case ICBTAG_FILE_TYPE_MAIN:
 		udf_debug("METADATA FILE-----\n");
@@ -1591,9 +1591,9 @@ static umode_t udf_convert_permissions(struct fileEntry *fe)
 	permissions = le32_to_cpu(fe->permissions);
 	flags = le16_to_cpu(fe->icbTag.flags);
 
-	mode =	((permissions) & S_IRWXO) |
-		((permissions >> 2) & S_IRWXG) |
-		((permissions >> 4) & S_IRWXU) |
+	mode =	((permissions) & 0007) |
+		((permissions >> 2) & 0070) |
+		((permissions >> 4) & 0700) |
 		((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) |
 		((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) |
 		((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0);
@@ -1669,9 +1669,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 	else
 		fe->gid = cpu_to_le32(i_gid_read(inode));
 
-	udfperms = ((inode->i_mode & S_IRWXO)) |
-		   ((inode->i_mode & S_IRWXG) << 2) |
-		   ((inode->i_mode & S_IRWXU) << 4);
+	udfperms = ((inode->i_mode & 0007)) |
+		   ((inode->i_mode & 0070) << 2) |
+		   ((inode->i_mode & 0700) << 4);
 
 	udfperms |= (le32_to_cpu(fe->permissions) &
 		    (FE_PERM_O_DELETE | FE_PERM_O_CHATTR |
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index babf48d0e553..385ee89d5824 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -906,7 +906,7 @@ out:
 static int udf_symlink(struct inode *dir, struct dentry *dentry,
 		       const char *symname)
 {
-	struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777);
 	struct pathComponent *pc;
 	const char *compstart;
 	struct extent_position epos = {};
diff --git a/fs/utimes.c b/fs/utimes.c
index 32b15b3f6629..6571d8c848a0 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -1,14 +1,10 @@
-#include <linux/compiler.h>
 #include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/linkage.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
-#include <linux/sched.h>
-#include <linux/stat.h>
 #include <linux/utime.h>
 #include <linux/syscalls.h>
 #include <linux/uaccess.h>
+#include <linux/compat.h>
 #include <asm/unistd.h>
 
 #ifdef __ARCH_WANT_SYS_UTIME
@@ -219,3 +215,63 @@ SYSCALL_DEFINE2(utimes, char __user *, filename,
 {
 	return sys_futimesat(AT_FDCWD, filename, utimes);
 }
+
+#ifdef CONFIG_COMPAT
+/*
+ * Not all architectures have sys_utime, so implement this in terms
+ * of do_utimes().
+ */
+COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename,
+		       struct compat_utimbuf __user *, t)
+{
+	struct timespec tv[2];
+
+	if (t) {
+		if (get_user(tv[0].tv_sec, &t->actime) ||
+		    get_user(tv[1].tv_sec, &t->modtime))
+			return -EFAULT;
+		tv[0].tv_nsec = 0;
+		tv[1].tv_nsec = 0;
+	}
+	return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0);
+}
+
+COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags)
+{
+	struct timespec tv[2];
+
+	if  (t) {
+		if (compat_get_timespec(&tv[0], &t[0]) ||
+		    compat_get_timespec(&tv[1], &t[1]))
+			return -EFAULT;
+
+		if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT)
+			return 0;
+	}
+	return do_utimes(dfd, filename, t ? tv : NULL, flags);
+}
+
+COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t)
+{
+	struct timespec tv[2];
+
+	if (t) {
+		if (get_user(tv[0].tv_sec, &t[0].tv_sec) ||
+		    get_user(tv[0].tv_nsec, &t[0].tv_usec) ||
+		    get_user(tv[1].tv_sec, &t[1].tv_sec) ||
+		    get_user(tv[1].tv_nsec, &t[1].tv_usec))
+			return -EFAULT;
+		if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 ||
+		    tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0)
+			return -EINVAL;
+		tv[0].tv_nsec *= 1000;
+		tv[1].tv_nsec *= 1000;
+	}
+	return do_utimes(dfd, filename, t ? tv : NULL, 0);
+}
+
+COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t)
+{
+	return compat_sys_futimesat(AT_FDCWD, filename, t);
+}
+#endif
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 70a5b55e0870..780fc8986dab 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -48,7 +48,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 void *
 kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 {
-	unsigned noio_flag = 0;
+	unsigned nofs_flag = 0;
 	void	*ptr;
 	gfp_t	lflags;
 
@@ -60,17 +60,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 	 * __vmalloc() will allocate data pages and auxillary structures (e.g.
 	 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
 	 * here. Hence we need to tell memory reclaim that we are in such a
-	 * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+	 * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
 	 * the filesystem here and potentially deadlocking.
 	 */
-	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
-		noio_flag = memalloc_noio_save();
+	if (flags & KM_NOFS)
+		nofs_flag = memalloc_nofs_save();
 
 	lflags = kmem_flags_convert(flags);
 	ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
 
-	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
-		memalloc_noio_restore(noio_flag);
+	if (flags & KM_NOFS)
+		memalloc_nofs_restore(nofs_flag);
 
 	return ptr;
 }
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index f0fc84fcaac2..d6ea520162b2 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
 		lflags = GFP_ATOMIC | __GFP_NOWARN;
 	} else {
 		lflags = GFP_KERNEL | __GFP_NOWARN;
-		if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		if (flags & KM_NOFS)
 			lflags &= ~__GFP_FS;
 	}
 
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c3decedc9455..3059a3ec7ecb 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -2886,7 +2886,7 @@ xfs_btree_split_worker(
 	struct xfs_btree_split_args	*args = container_of(work,
 						struct xfs_btree_split_args, work);
 	unsigned long		pflags;
-	unsigned long		new_pflags = PF_FSTRANS;
+	unsigned long		new_pflags = PF_MEMALLOC_NOFS;
 
 	/*
 	 * we are in a transaction context here, but may also be doing work
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 61494295d92f..05eca126c688 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc(
 	 * We hand off the transaction to the completion thread now, so
 	 * clear the flag here.
 	 */
-	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 	return 0;
 }
 
@@ -252,7 +252,7 @@ xfs_setfilesize_ioend(
 	 * thus we need to mark ourselves as being in a transaction manually.
 	 * Similarly for freeze protection.
 	 */
-	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 
 	/* we abort the update if there was an IO error */
@@ -1016,7 +1016,7 @@ xfs_do_writepage(
 	 * Given that we do not allow direct reclaim to call us, we should
 	 * never be called while in a filesystem transaction.
 	 */
-	if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
+	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
 		goto redirty;
 
 	/*
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b6208728ba39..ca09061369cb 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -443,17 +443,17 @@ _xfs_buf_map_pages(
 		bp->b_addr = NULL;
 	} else {
 		int retried = 0;
-		unsigned noio_flag;
+		unsigned nofs_flag;
 
 		/*
 		 * vm_map_ram() will allocate auxillary structures (e.g.
 		 * pagetables) with GFP_KERNEL, yet we are likely to be under
 		 * GFP_NOFS context here. Hence we need to tell memory reclaim
-		 * that we are in such a context via PF_MEMALLOC_NOIO to prevent
+		 * that we are in such a context via PF_MEMALLOC_NOFS to prevent
 		 * memory reclaim re-entering the filesystem here and
 		 * potentially deadlocking.
 		 */
-		noio_flag = memalloc_noio_save();
+		nofs_flag = memalloc_nofs_save();
 		do {
 			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
 						-1, PAGE_KERNEL);
@@ -461,7 +461,7 @@ _xfs_buf_map_pages(
 				break;
 			vm_unmap_aliases();
 		} while (retried++ <= 1);
-		memalloc_noio_restore(noio_flag);
+		memalloc_nofs_restore(nofs_flag);
 
 		if (!bp->b_addr)
 			return -ENOMEM;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 288ee5b840d7..4b47403f8089 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -976,6 +976,7 @@ xfs_file_iomap_begin(
 	int			nimaps = 1, error = 0;
 	bool			shared = false, trimmed = false;
 	unsigned		lockmode;
+	struct block_device	*bdev;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
@@ -1063,6 +1064,14 @@ xfs_file_iomap_begin(
 	}
 
 	xfs_bmbt_to_iomap(ip, iomap, &imap);
+
+	/* optionally associate a dax device with the iomap bdev */
+	bdev = iomap->bdev;
+	if (blk_queue_dax(bdev->bd_queue))
+		iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+	else
+		iomap->dax_dev = NULL;
+
 	if (shared)
 		iomap->flags |= IOMAP_F_SHARED;
 	return 0;
@@ -1140,6 +1149,7 @@ xfs_file_iomap_end(
 	unsigned		flags,
 	struct iomap		*iomap)
 {
+	put_dax(iomap->dax_dev);
 	if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC)
 		return xfs_file_iomap_end_delalloc(XFS_I(inode), offset,
 				length, written, iomap);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 70f42ea86dfb..f5969c8274fc 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -134,7 +134,7 @@ xfs_trans_reserve(
 	bool		rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
 
 	/* Mark this thread as being in a transaction */
-	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 
 	/*
 	 * Attempt to reserve the needed disk blocks by decrementing
@@ -144,7 +144,7 @@ xfs_trans_reserve(
 	if (blocks > 0) {
 		error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
 		if (error != 0) {
-			current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+			current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 			return -ENOSPC;
 		}
 		tp->t_blk_res += blocks;
@@ -221,7 +221,7 @@ undo_blocks:
 		tp->t_blk_res = 0;
 	}
 
-	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 
 	return error;
 }
@@ -914,7 +914,7 @@ __xfs_trans_commit(
 
 	xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
 
-	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 	xfs_trans_free(tp);
 
 	/*
@@ -944,7 +944,7 @@ out_unreserve:
 		if (commit_lsn == -1 && !error)
 			error = -EIO;
 	}
-	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 	xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
 	xfs_trans_free(tp);
 
@@ -998,7 +998,7 @@ xfs_trans_cancel(
 		xfs_log_done(mp, tp->t_ticket, NULL, false);
 
 	/* mark this thread as no longer being in a transaction */
-	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 
 	xfs_trans_free_items(tp, NULLCOMMITLSN, dirty);
 	xfs_trans_free(tp);