257 files changed, 6457 insertions, 3410 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index b3c2cc79c20d..082d227fa56b 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -277,6 +277,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 	case ACL_TYPE_ACCESS:
 		if (acl) {
 			struct iattr iattr;
+			struct posix_acl *old_acl = acl;
 
 			retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
 			if (retval)
@@ -287,6 +288,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 				 * by the mode bits. So don't
 				 * update ACL.
 				 */
+				posix_acl_release(old_acl);
 				value = NULL;
 				size = 0;
 			}
diff --git a/fs/Kconfig b/fs/Kconfig
index c2a377cdda2b..83eab52fb3f6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -38,6 +38,7 @@ config FS_DAX
 	bool "Direct Access (DAX) support"
 	depends on MMU
 	depends on !(ARM || MIPS || SPARC)
+	select FS_IOMAP
 	help
 	  Direct Access (DAX) can be used on memory-backed block devices.
 	  If the block device supports DAX and the filesystem supports DAX,
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 1e9d2f84e5b5..b29447e03ede 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -343,7 +343,7 @@ void afs_dispatch_give_up_callbacks(struct work_struct *work)
 	 *   had callbacks entirely, and the server will call us later to break
 	 *   them
 	 */
-	afs_fs_give_up_callbacks(server, &afs_async_call);
+	afs_fs_give_up_callbacks(server, true);
 }
 
 /*
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index d764236072b1..2edbdcbf6432 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -24,65 +24,86 @@ static int afs_deliver_cb_callback(struct afs_call *);
 static int afs_deliver_cb_probe_uuid(struct afs_call *);
 static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *);
 static void afs_cm_destructor(struct afs_call *);
+static void SRXAFSCB_CallBack(struct work_struct *);
+static void SRXAFSCB_InitCallBackState(struct work_struct *);
+static void SRXAFSCB_Probe(struct work_struct *);
+static void SRXAFSCB_ProbeUuid(struct work_struct *);
+static void SRXAFSCB_TellMeAboutYourself(struct work_struct *);
+
+#define CM_NAME(name) \
+	const char afs_SRXCB##name##_name[] __tracepoint_string =	\
+		"CB." #name
 
 /*
  * CB.CallBack operation type
  */
+static CM_NAME(CallBack);
 static const struct afs_call_type afs_SRXCBCallBack = {
-	.name		= "CB.CallBack",
+	.name		= afs_SRXCBCallBack_name,
 	.deliver	= afs_deliver_cb_callback,
 	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_cm_destructor,
+	.work		= SRXAFSCB_CallBack,
 };
 
 /*
  * CB.InitCallBackState operation type
  */
+static CM_NAME(InitCallBackState);
 static const struct afs_call_type afs_SRXCBInitCallBackState = {
-	.name		= "CB.InitCallBackState",
+	.name		= afs_SRXCBInitCallBackState_name,
 	.deliver	= afs_deliver_cb_init_call_back_state,
 	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_cm_destructor,
+	.work		= SRXAFSCB_InitCallBackState,
 };
 
 /*
  * CB.InitCallBackState3 operation type
  */
+static CM_NAME(InitCallBackState3);
 static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
-	.name		= "CB.InitCallBackState3",
+	.name		= afs_SRXCBInitCallBackState3_name,
 	.deliver	= afs_deliver_cb_init_call_back_state3,
 	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_cm_destructor,
+	.work		= SRXAFSCB_InitCallBackState,
 };
 
 /*
  * CB.Probe operation type
  */
+static CM_NAME(Probe);
 static const struct afs_call_type afs_SRXCBProbe = {
-	.name		= "CB.Probe",
+	.name		= afs_SRXCBProbe_name,
 	.deliver	= afs_deliver_cb_probe,
 	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_cm_destructor,
+	.work		= SRXAFSCB_Probe,
 };
 
 /*
  * CB.ProbeUuid operation type
  */
+static CM_NAME(ProbeUuid);
 static const struct afs_call_type afs_SRXCBProbeUuid = {
-	.name		= "CB.ProbeUuid",
+	.name		= afs_SRXCBProbeUuid_name,
 	.deliver	= afs_deliver_cb_probe_uuid,
 	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_cm_destructor,
+	.work		= SRXAFSCB_ProbeUuid,
 };
 
 /*
  * CB.TellMeAboutYourself operation type
  */
+static CM_NAME(TellMeAboutYourself);
 static const struct afs_call_type afs_SRXCBTellMeAboutYourself = {
-	.name		= "CB.TellMeAboutYourself",
+	.name		= afs_SRXCBTellMeAboutYourself_name,
 	.deliver	= afs_deliver_cb_tell_me_about_yourself,
 	.abort_to_error	= afs_abort_to_error,
 	.destructor	= afs_cm_destructor,
+	.work		= SRXAFSCB_TellMeAboutYourself,
 };
 
 /*
@@ -153,6 +174,7 @@ static void SRXAFSCB_CallBack(struct work_struct *work)
 	afs_send_empty_reply(call);
 
 	afs_break_callbacks(call->server, call->count, call->request);
+	afs_put_call(call);
 	_leave("");
 }
 
@@ -274,9 +296,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 		return -ENOTCONN;
 	call->server = server;
 
-	INIT_WORK(&call->work, SRXAFSCB_CallBack);
-	queue_work(afs_wq, &call->work);
-	return 0;
+	return afs_queue_call_work(call);
 }
 
 /*
@@ -290,6 +310,7 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
 
 	afs_init_callback_state(call->server);
 	afs_send_empty_reply(call);
+	afs_put_call(call);
 	_leave("");
 }
 
@@ -320,9 +341,7 @@ static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 		return -ENOTCONN;
 	call->server = server;
 
-	INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
-	queue_work(afs_wq, &call->work);
-	return 0;
+	return afs_queue_call_work(call);
 }
 
 /*
@@ -332,7 +351,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 {
 	struct sockaddr_rxrpc srx;
 	struct afs_server *server;
-	struct afs_uuid *r;
+	struct uuid_v1 *r;
 	unsigned loop;
 	__be32 *b;
 	int ret;
@@ -362,15 +381,15 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 		}
 
 		_debug("unmarshall UUID");
-		call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL);
+		call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL);
 		if (!call->request)
 			return -ENOMEM;
 
 		b = call->buffer;
 		r = call->request;
-		r->time_low			= ntohl(b[0]);
-		r->time_mid			= ntohl(b[1]);
-		r->time_hi_and_version		= ntohl(b[2]);
+		r->time_low			= b[0];
+		r->time_mid			= htons(ntohl(b[1]));
+		r->time_hi_and_version		= htons(ntohl(b[2]));
 		r->clock_seq_hi_and_reserved 	= ntohl(b[3]);
 		r->clock_seq_low		= ntohl(b[4]);
 
@@ -394,9 +413,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 		return -ENOTCONN;
 	call->server = server;
 
-	INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
-	queue_work(afs_wq, &call->work);
-	return 0;
+	return afs_queue_call_work(call);
 }
 
 /*
@@ -408,6 +425,7 @@ static void SRXAFSCB_Probe(struct work_struct *work)
 
 	_enter("");
 	afs_send_empty_reply(call);
+	afs_put_call(call);
 	_leave("");
 }
 
@@ -427,9 +445,7 @@ static int afs_deliver_cb_probe(struct afs_call *call)
 	/* no unmarshalling required */
 	call->state = AFS_CALL_REPLYING;
 
-	INIT_WORK(&call->work, SRXAFSCB_Probe);
-	queue_work(afs_wq, &call->work);
-	return 0;
+	return afs_queue_call_work(call);
 }
 
 /*
@@ -438,7 +454,7 @@ static int afs_deliver_cb_probe(struct afs_call *call)
 static void SRXAFSCB_ProbeUuid(struct work_struct *work)
 {
 	struct afs_call *call = container_of(work, struct afs_call, work);
-	struct afs_uuid *r = call->request;
+	struct uuid_v1 *r = call->request;
 
 	struct {
 		__be32	match;
@@ -452,6 +468,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
 		reply.match = htonl(1);
 
 	afs_send_simple_reply(call, &reply, sizeof(reply));
+	afs_put_call(call);
 	_leave("");
 }
 
@@ -460,7 +477,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
  */
 static int afs_deliver_cb_probe_uuid(struct afs_call *call)
 {
-	struct afs_uuid *r;
+	struct uuid_v1 *r;
 	unsigned loop;
 	__be32 *b;
 	int ret;
@@ -486,15 +503,15 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
 		}
 
 		_debug("unmarshall UUID");
-		call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL);
+		call->request = kmalloc(sizeof(struct uuid_v1), GFP_KERNEL);
 		if (!call->request)
 			return -ENOMEM;
 
 		b = call->buffer;
 		r = call->request;
-		r->time_low			= ntohl(b[0]);
-		r->time_mid			= ntohl(b[1]);
-		r->time_hi_and_version		= ntohl(b[2]);
+		r->time_low			= b[0];
+		r->time_mid			= htons(ntohl(b[1]));
+		r->time_hi_and_version		= htons(ntohl(b[2]));
 		r->clock_seq_hi_and_reserved 	= ntohl(b[3]);
 		r->clock_seq_low		= ntohl(b[4]);
 
@@ -510,9 +527,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
 
 	call->state = AFS_CALL_REPLYING;
 
-	INIT_WORK(&call->work, SRXAFSCB_ProbeUuid);
-	queue_work(afs_wq, &call->work);
-	return 0;
+	return afs_queue_call_work(call);
 }
 
 /*
@@ -554,9 +569,9 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 	memset(&reply, 0, sizeof(reply));
 	reply.ia.nifs = htonl(nifs);
 
-	reply.ia.uuid[0] = htonl(afs_uuid.time_low);
-	reply.ia.uuid[1] = htonl(afs_uuid.time_mid);
-	reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version);
+	reply.ia.uuid[0] = afs_uuid.time_low;
+	reply.ia.uuid[1] = htonl(ntohs(afs_uuid.time_mid));
+	reply.ia.uuid[2] = htonl(ntohs(afs_uuid.time_hi_and_version));
 	reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved);
 	reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low);
 	for (loop = 0; loop < 6; loop++)
@@ -574,7 +589,7 @@ static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work)
 	reply.cap.capcount = htonl(1);
 	reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION);
 	afs_send_simple_reply(call, &reply, sizeof(reply));
-
+	afs_put_call(call);
 	_leave("");
 }
 
@@ -594,7 +609,5 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call)
 	/* no unmarshalling required */
 	call->state = AFS_CALL_REPLYING;
 
-	INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself);
-	queue_work(afs_wq, &call->work);
-	return 0;
+	return afs_queue_call_work(call);
 }
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 6344aee4ac4b..ba7b71fba34b 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -16,6 +16,7 @@
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/gfp.h>
+#include <linux/task_io_accounting_ops.h>
 #include "internal.h"
 
 static int afs_readpage(struct file *file, struct page *page);
@@ -101,6 +102,21 @@ int afs_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+/*
+ * Dispose of a ref to a read record.
+ */
+void afs_put_read(struct afs_read *req)
+{
+	int i;
+
+	if (atomic_dec_and_test(&req->usage)) {
+		for (i = 0; i < req->nr_pages; i++)
+			if (req->pages[i])
+				put_page(req->pages[i]);
+		kfree(req);
+	}
+}
+
 #ifdef CONFIG_AFS_FSCACHE
 /*
  * deal with notification that a page was read from the cache
@@ -126,9 +142,8 @@ int afs_page_filler(void *data, struct page *page)
 {
 	struct inode *inode = page->mapping->host;
 	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct afs_read *req;
 	struct key *key = data;
-	size_t len;
-	off_t offset;
 	int ret;
 
 	_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
@@ -164,12 +179,23 @@ int afs_page_filler(void *data, struct page *page)
 		_debug("cache said ENOBUFS");
 	default:
 	go_on:
-		offset = page->index << PAGE_SHIFT;
-		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
+		req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *),
+			      GFP_KERNEL);
+		if (!req)
+			goto enomem;
+
+		atomic_set(&req->usage, 1);
+		req->pos = (loff_t)page->index << PAGE_SHIFT;
+		req->len = min_t(size_t, i_size_read(inode) - req->pos,
+				 PAGE_SIZE);
+		req->nr_pages = 1;
+		req->pages[0] = page;
+		get_page(page);
 
 		/* read the contents of the file from the server into the
 		 * page */
-		ret = afs_vnode_fetch_data(vnode, key, offset, len, page);
+		ret = afs_vnode_fetch_data(vnode, key, req);
+		afs_put_read(req);
 		if (ret < 0) {
 			if (ret == -ENOENT) {
 				_debug("got NOENT from server"
@@ -201,6 +227,8 @@ int afs_page_filler(void *data, struct page *page)
 	_leave(" = 0");
 	return 0;
 
+enomem:
+	ret = -ENOMEM;
 error:
 	SetPageError(page);
 	unlock_page(page);
@@ -235,6 +263,131 @@ static int afs_readpage(struct file *file, struct page *page)
 }
 
 /*
+ * Make pages available as they're filled.
+ */
+static void afs_readpages_page_done(struct afs_call *call, struct afs_read *req)
+{
+#ifdef CONFIG_AFS_FSCACHE
+	struct afs_vnode *vnode = call->reply;
+#endif
+	struct page *page = req->pages[req->index];
+
+	req->pages[req->index] = NULL;
+	SetPageUptodate(page);
+
+	/* send the page to the cache */
+#ifdef CONFIG_AFS_FSCACHE
+	if (PageFsCache(page) &&
+	    fscache_write_page(vnode->cache, page, GFP_KERNEL) != 0) {
+		fscache_uncache_page(vnode->cache, page);
+		BUG_ON(PageFsCache(page));
+	}
+#endif
+	unlock_page(page);
+	put_page(page);
+}
+
+/*
+ * Read a contiguous set of pages.
+ */
+static int afs_readpages_one(struct file *file, struct address_space *mapping,
+			     struct list_head *pages)
+{
+	struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+	struct afs_read *req;
+	struct list_head *p;
+	struct page *first, *page;
+	struct key *key = file->private_data;
+	pgoff_t index;
+	int ret, n, i;
+
+	/* Count the number of contiguous pages at the front of the list.  Note
+	 * that the list goes prev-wards rather than next-wards.
+	 */
+	first = list_entry(pages->prev, struct page, lru);
+	index = first->index + 1;
+	n = 1;
+	for (p = first->lru.prev; p != pages; p = p->prev) {
+		page = list_entry(p, struct page, lru);
+		if (page->index != index)
+			break;
+		index++;
+		n++;
+	}
+
+	req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *) * n,
+		      GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	atomic_set(&req->usage, 1);
+	req->page_done = afs_readpages_page_done;
+	req->pos = first->index;
+	req->pos <<= PAGE_SHIFT;
+
+	/* Transfer the pages to the request.  We add them in until one fails
+	 * to add to the LRU and then we stop (as that'll make a hole in the
+	 * contiguous run.
+	 *
+	 * Note that it's possible for the file size to change whilst we're
+	 * doing this, but we rely on the server returning less than we asked
+	 * for if the file shrank.  We also rely on this to deal with a partial
+	 * page at the end of the file.
+	 */
+	do {
+		page = list_entry(pages->prev, struct page, lru);
+		list_del(&page->lru);
+		index = page->index;
+		if (add_to_page_cache_lru(page, mapping, index,
+					  readahead_gfp_mask(mapping))) {
+#ifdef CONFIG_AFS_FSCACHE
+			fscache_uncache_page(vnode->cache, page);
+#endif
+			put_page(page);
+			break;
+		}
+
+		req->pages[req->nr_pages++] = page;
+		req->len += PAGE_SIZE;
+	} while (req->nr_pages < n);
+
+	if (req->nr_pages == 0) {
+		kfree(req);
+		return 0;
+	}
+
+	ret = afs_vnode_fetch_data(vnode, key, req);
+	if (ret < 0)
+		goto error;
+
+	task_io_account_read(PAGE_SIZE * req->nr_pages);
+	afs_put_read(req);
+	return 0;
+
+error:
+	if (ret == -ENOENT) {
+		_debug("got NOENT from server"
+		       " - marking file deleted and stale");
+		set_bit(AFS_VNODE_DELETED, &vnode->flags);
+		ret = -ESTALE;
+	}
+
+	for (i = 0; i < req->nr_pages; i++) {
+		page = req->pages[i];
+		if (page) {
+#ifdef CONFIG_AFS_FSCACHE
+			fscache_uncache_page(vnode->cache, page);
+#endif
+			SetPageError(page);
+			unlock_page(page);
+		}
+	}
+
+	afs_put_read(req);
+	return ret;
+}
+
+/*
  * read a set of pages
  */
 static int afs_readpages(struct file *file, struct address_space *mapping,
@@ -287,8 +440,11 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
 		return ret;
 	}
 
-	/* load the missing pages from the network */
-	ret = read_cache_pages(mapping, pages, afs_page_filler, key);
+	while (!list_empty(pages)) {
+		ret = afs_readpages_one(file, mapping, pages);
+		if (ret < 0)
+			break;
+	}
 
 	_leave(" = %d [netting]", ret);
 	return ret;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 31c616ab9b40..ac8e766978dc 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -275,7 +275,7 @@ int afs_fs_fetch_file_status(struct afs_server *server,
 			     struct key *key,
 			     struct afs_vnode *vnode,
 			     struct afs_volsync *volsync,
-			     const struct afs_wait_mode *wait_mode)
+			     bool async)
 {
 	struct afs_call *call;
 	__be32 *bp;
@@ -300,7 +300,7 @@ int afs_fs_fetch_file_status(struct afs_server *server,
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -309,15 +309,19 @@ int afs_fs_fetch_file_status(struct afs_server *server,
 static int afs_deliver_fs_fetch_data(struct afs_call *call)
 {
 	struct afs_vnode *vnode = call->reply;
+	struct afs_read *req = call->reply3;
 	const __be32 *bp;
-	struct page *page;
+	unsigned int size;
 	void *buffer;
 	int ret;
 
-	_enter("{%u}", call->unmarshall);
+	_enter("{%u,%zu/%u;%u/%llu}",
+	       call->unmarshall, call->offset, call->count,
+	       req->remain, req->actual_len);
 
 	switch (call->unmarshall) {
 	case 0:
+		req->actual_len = 0;
 		call->offset = 0;
 		call->unmarshall++;
 		if (call->operation_ID != FSFETCHDATA64) {
@@ -334,10 +338,8 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		if (ret < 0)
 			return ret;
 
-		call->count = ntohl(call->tmp);
-		_debug("DATA length MSW: %u", call->count);
-		if (call->count > 0)
-			return -EBADMSG;
+		req->actual_len = ntohl(call->tmp);
+		req->actual_len <<= 32;
 		call->offset = 0;
 		call->unmarshall++;
 
@@ -349,26 +351,52 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 		if (ret < 0)
 			return ret;
 
-		call->count = ntohl(call->tmp);
-		_debug("DATA length: %u", call->count);
-		if (call->count > PAGE_SIZE)
+		req->actual_len |= ntohl(call->tmp);
+		_debug("DATA length: %llu", req->actual_len);
+		/* Check that the server didn't want to send us extra.  We
+		 * might want to just discard instead, but that requires
+		 * cooperation from AF_RXRPC.
+		 */
+		if (req->actual_len > req->len)
 			return -EBADMSG;
-		call->offset = 0;
+
+		req->remain = req->actual_len;
+		call->offset = req->pos & (PAGE_SIZE - 1);
+		req->index = 0;
+		if (req->actual_len == 0)
+			goto no_more_data;
 		call->unmarshall++;
 
+	begin_page:
+		if (req->remain > PAGE_SIZE - call->offset)
+			size = PAGE_SIZE - call->offset;
+		else
+			size = req->remain;
+		call->count = call->offset + size;
+		ASSERTCMP(call->count, <=, PAGE_SIZE);
+		req->remain -= size;
+
 		/* extract the returned data */
 	case 3:
-		_debug("extract data");
-		if (call->count > 0) {
-			page = call->reply3;
-			buffer = kmap(page);
-			ret = afs_extract_data(call, buffer,
-					       call->count, true);
-			kunmap(page);
-			if (ret < 0)
-				return ret;
+		_debug("extract data %u/%llu %zu/%u",
+		       req->remain, req->actual_len, call->offset, call->count);
+
+		buffer = kmap(req->pages[req->index]);
+		ret = afs_extract_data(call, buffer, call->count, true);
+		kunmap(req->pages[req->index]);
+		if (ret < 0)
+			return ret;
+		if (call->offset == PAGE_SIZE) {
+			if (req->page_done)
+				req->page_done(call, req);
+			if (req->remain > 0) {
+				req->index++;
+				call->offset = 0;
+				goto begin_page;
+			}
 		}
 
+	no_more_data:
 		call->offset = 0;
 		call->unmarshall++;
 
@@ -393,17 +421,25 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
 	}
 
 	if (call->count < PAGE_SIZE) {
-		_debug("clear");
-		page = call->reply3;
-		buffer = kmap(page);
+		buffer = kmap(req->pages[req->index]);
 		memset(buffer + call->count, 0, PAGE_SIZE - call->count);
-		kunmap(page);
+		kunmap(req->pages[req->index]);
+		if (req->page_done)
+			req->page_done(call, req);
 	}
 
 	_leave(" = 0 [done]");
 	return 0;
 }
 
+static void afs_fetch_data_destructor(struct afs_call *call)
+{
+	struct afs_read *req = call->reply3;
+
+	afs_put_read(req);
+	afs_flat_call_destructor(call);
+}
+
 /*
  * FS.FetchData operation type
  */
@@ -411,14 +447,14 @@ static const struct afs_call_type afs_RXFSFetchData = {
 	.name		= "FS.FetchData",
 	.deliver	= afs_deliver_fs_fetch_data,
 	.abort_to_error	= afs_abort_to_error,
-	.destructor	= afs_flat_call_destructor,
+	.destructor	= afs_fetch_data_destructor,
 };
 
 static const struct afs_call_type afs_RXFSFetchData64 = {
 	.name		= "FS.FetchData64",
 	.deliver	= afs_deliver_fs_fetch_data,
 	.abort_to_error	= afs_abort_to_error,
-	.destructor	= afs_flat_call_destructor,
+	.destructor	= afs_fetch_data_destructor,
 };
 
 /*
@@ -427,17 +463,14 @@ static const struct afs_call_type afs_RXFSFetchData64 = {
 static int afs_fs_fetch_data64(struct afs_server *server,
 			       struct key *key,
 			       struct afs_vnode *vnode,
-			       off_t offset, size_t length,
-			       struct page *buffer,
-			       const struct afs_wait_mode *wait_mode)
+			       struct afs_read *req,
+			       bool async)
 {
 	struct afs_call *call;
 	__be32 *bp;
 
 	_enter("");
 
-	ASSERTCMP(length, <, ULONG_MAX);
-
 	call = afs_alloc_flat_call(&afs_RXFSFetchData64, 32, (21 + 3 + 6) * 4);
 	if (!call)
 		return -ENOMEM;
@@ -445,7 +478,7 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 	call->key = key;
 	call->reply = vnode;
 	call->reply2 = NULL; /* volsync */
-	call->reply3 = buffer;
+	call->reply3 = req;
 	call->service_id = FS_SERVICE;
 	call->port = htons(AFS_FS_PORT);
 	call->operation_ID = FSFETCHDATA64;
@@ -456,12 +489,13 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 	bp[1] = htonl(vnode->fid.vid);
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);
-	bp[4] = htonl(upper_32_bits(offset));
-	bp[5] = htonl((u32) offset);
+	bp[4] = htonl(upper_32_bits(req->pos));
+	bp[5] = htonl(lower_32_bits(req->pos));
 	bp[6] = 0;
-	bp[7] = htonl((u32) length);
+	bp[7] = htonl(lower_32_bits(req->len));
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	atomic_inc(&req->usage);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -470,16 +504,16 @@ static int afs_fs_fetch_data64(struct afs_server *server,
 int afs_fs_fetch_data(struct afs_server *server,
 		      struct key *key,
 		      struct afs_vnode *vnode,
-		      off_t offset, size_t length,
-		      struct page *buffer,
-		      const struct afs_wait_mode *wait_mode)
+		      struct afs_read *req,
+		      bool async)
 {
 	struct afs_call *call;
 	__be32 *bp;
 
-	if (upper_32_bits(offset) || upper_32_bits(offset + length))
-		return afs_fs_fetch_data64(server, key, vnode, offset, length,
-					   buffer, wait_mode);
+	if (upper_32_bits(req->pos) ||
+	    upper_32_bits(req->len) ||
+	    upper_32_bits(req->pos + req->len))
+		return afs_fs_fetch_data64(server, key, vnode, req, async);
 
 	_enter("");
 
@@ -490,7 +524,7 @@ int afs_fs_fetch_data(struct afs_server *server,
 	call->key = key;
 	call->reply = vnode;
 	call->reply2 = NULL; /* volsync */
-	call->reply3 = buffer;
+	call->reply3 = req;
 	call->service_id = FS_SERVICE;
 	call->port = htons(AFS_FS_PORT);
 	call->operation_ID = FSFETCHDATA;
@@ -501,10 +535,11 @@ int afs_fs_fetch_data(struct afs_server *server,
 	bp[1] = htonl(vnode->fid.vid);
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);
-	bp[4] = htonl(offset);
-	bp[5] = htonl(length);
+	bp[4] = htonl(lower_32_bits(req->pos));
+	bp[5] = htonl(lower_32_bits(req->len));
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	atomic_inc(&req->usage);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -533,7 +568,7 @@ static const struct afs_call_type afs_RXFSGiveUpCallBacks = {
  * - the callbacks are held in the server->cb_break ring
  */
 int afs_fs_give_up_callbacks(struct afs_server *server,
-			     const struct afs_wait_mode *wait_mode)
+			     bool async)
 {
 	struct afs_call *call;
 	size_t ncallbacks;
@@ -587,7 +622,7 @@ int afs_fs_give_up_callbacks(struct afs_server *server,
 	ASSERT(ncallbacks > 0);
 	wake_up_nr(&server->cb_break_waitq, ncallbacks);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -638,7 +673,7 @@ int afs_fs_create(struct afs_server *server,
 		  struct afs_fid *newfid,
 		  struct afs_file_status *newstatus,
 		  struct afs_callback *newcb,
-		  const struct afs_wait_mode *wait_mode)
+		  bool async)
 {
 	struct afs_call *call;
 	size_t namesz, reqsz, padsz;
@@ -683,7 +718,7 @@ int afs_fs_create(struct afs_server *server,
 	*bp++ = htonl(mode & S_IALLUGO); /* unix mode */
 	*bp++ = 0; /* segment size */
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -728,7 +763,7 @@ int afs_fs_remove(struct afs_server *server,
 		  struct afs_vnode *vnode,
 		  const char *name,
 		  bool isdir,
-		  const struct afs_wait_mode *wait_mode)
+		  bool async)
 {
 	struct afs_call *call;
 	size_t namesz, reqsz, padsz;
@@ -763,7 +798,7 @@ int afs_fs_remove(struct afs_server *server,
 		bp = (void *) bp + padsz;
 	}
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -809,7 +844,7 @@ int afs_fs_link(struct afs_server *server,
 		struct afs_vnode *dvnode,
 		struct afs_vnode *vnode,
 		const char *name,
-		const struct afs_wait_mode *wait_mode)
+		bool async)
 {
 	struct afs_call *call;
 	size_t namesz, reqsz, padsz;
@@ -848,7 +883,7 @@ int afs_fs_link(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -897,7 +932,7 @@ int afs_fs_symlink(struct afs_server *server,
 		   const char *contents,
 		   struct afs_fid *newfid,
 		   struct afs_file_status *newstatus,
-		   const struct afs_wait_mode *wait_mode)
+		   bool async)
 {
 	struct afs_call *call;
 	size_t namesz, reqsz, padsz, c_namesz, c_padsz;
@@ -952,7 +987,7 @@ int afs_fs_symlink(struct afs_server *server,
 	*bp++ = htonl(S_IRWXUGO); /* unix mode */
 	*bp++ = 0; /* segment size */
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -1001,7 +1036,7 @@ int afs_fs_rename(struct afs_server *server,
 		  const char *orig_name,
 		  struct afs_vnode *new_dvnode,
 		  const char *new_name,
-		  const struct afs_wait_mode *wait_mode)
+		  bool async)
 {
 	struct afs_call *call;
 	size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
@@ -1055,7 +1090,7 @@ int afs_fs_rename(struct afs_server *server,
 		bp = (void *) bp + n_padsz;
 	}
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -1110,7 +1145,7 @@ static int afs_fs_store_data64(struct afs_server *server,
 			       pgoff_t first, pgoff_t last,
 			       unsigned offset, unsigned to,
 			       loff_t size, loff_t pos, loff_t i_size,
-			       const struct afs_wait_mode *wait_mode)
+			       bool async)
 {
 	struct afs_vnode *vnode = wb->vnode;
 	struct afs_call *call;
@@ -1159,7 +1194,7 @@ static int afs_fs_store_data64(struct afs_server *server,
 	*bp++ = htonl(i_size >> 32);
 	*bp++ = htonl((u32) i_size);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -1168,7 +1203,7 @@ static int afs_fs_store_data64(struct afs_server *server,
 int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 		      pgoff_t first, pgoff_t last,
 		      unsigned offset, unsigned to,
-		      const struct afs_wait_mode *wait_mode)
+		      bool async)
 {
 	struct afs_vnode *vnode = wb->vnode;
 	struct afs_call *call;
@@ -1194,7 +1229,7 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 
 	if (pos >> 32 || i_size >> 32 || size >> 32 || (pos + size) >> 32)
 		return afs_fs_store_data64(server, wb, first, last, offset, to,
-					   size, pos, i_size, wait_mode);
+					   size, pos, i_size, async);
 
 	call = afs_alloc_flat_call(&afs_RXFSStoreData,
 				   (4 + 6 + 3) * 4,
@@ -1233,7 +1268,7 @@ int afs_fs_store_data(struct afs_server *server, struct afs_writeback *wb,
 	*bp++ = htonl(size);
 	*bp++ = htonl(i_size);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -1295,7 +1330,7 @@ static const struct afs_call_type afs_RXFSStoreData64_as_Status = {
  */
 static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
 				 struct afs_vnode *vnode, struct iattr *attr,
-				 const struct afs_wait_mode *wait_mode)
+				 bool async)
 {
 	struct afs_call *call;
 	__be32 *bp;
@@ -1334,7 +1369,7 @@ static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
 	*bp++ = htonl(attr->ia_size >> 32);	/* new file length */
 	*bp++ = htonl((u32) attr->ia_size);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -1343,7 +1378,7 @@ static int afs_fs_setattr_size64(struct afs_server *server, struct key *key,
  */
 static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
 			       struct afs_vnode *vnode, struct iattr *attr,
-			       const struct afs_wait_mode *wait_mode)
+			       bool async)
 {
 	struct afs_call *call;
 	__be32 *bp;
@@ -1354,7 +1389,7 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
 	ASSERT(attr->ia_valid & ATTR_SIZE);
 	if (attr->ia_size >> 32)
 		return afs_fs_setattr_size64(server, key, vnode, attr,
-					     wait_mode);
+					     async);
 
 	call = afs_alloc_flat_call(&afs_RXFSStoreData_as_Status,
 				   (4 + 6 + 3) * 4,
@@ -1382,7 +1417,7 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
 	*bp++ = 0;				/* size of write */
 	*bp++ = htonl(attr->ia_size);		/* new file length */
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -1391,14 +1426,14 @@ static int afs_fs_setattr_size(struct afs_server *server, struct key *key,
  */
 int afs_fs_setattr(struct afs_server *server, struct key *key,
 		   struct afs_vnode *vnode, struct iattr *attr,
-		   const struct afs_wait_mode *wait_mode)
+		   bool async)
 {
 	struct afs_call *call;
 	__be32 *bp;
 
 	if (attr->ia_valid & ATTR_SIZE)
 		return afs_fs_setattr_size(server, key, vnode, attr,
-					   wait_mode);
+					   async);
 
 	_enter(",%x,{%x:%u},,",
 	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
@@ -1424,7 +1459,7 @@ int afs_fs_setattr(struct afs_server *server, struct key *key,
 
 	xdr_encode_AFS_StoreStatus(&bp, attr);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -1626,7 +1661,7 @@ int afs_fs_get_volume_status(struct afs_server *server,
 			     struct key *key,
 			     struct afs_vnode *vnode,
 			     struct afs_volume_status *vs,
-			     const struct afs_wait_mode *wait_mode)
+			     bool async)
 {
 	struct afs_call *call;
 	__be32 *bp;
@@ -1656,7 +1691,7 @@ int afs_fs_get_volume_status(struct afs_server *server,
 	bp[0] = htonl(FSGETVOLUMESTATUS);
 	bp[1] = htonl(vnode->fid.vid);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -1718,7 +1753,7 @@ int afs_fs_set_lock(struct afs_server *server,
 		    struct key *key,
 		    struct afs_vnode *vnode,
 		    afs_lock_type_t type,
-		    const struct afs_wait_mode *wait_mode)
+		    bool async)
 {
 	struct afs_call *call;
 	__be32 *bp;
@@ -1742,7 +1777,7 @@ int afs_fs_set_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.unique);
 	*bp++ = htonl(type);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -1751,7 +1786,7 @@ int afs_fs_set_lock(struct afs_server *server,
 int afs_fs_extend_lock(struct afs_server *server,
 		       struct key *key,
 		       struct afs_vnode *vnode,
-		       const struct afs_wait_mode *wait_mode)
+		       bool async)
 {
 	struct afs_call *call;
 	__be32 *bp;
@@ -1774,7 +1809,7 @@ int afs_fs_extend_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
 
 /*
@@ -1783,7 +1818,7 @@ int afs_fs_extend_lock(struct afs_server *server,
 int afs_fs_release_lock(struct afs_server *server,
 			struct key *key,
 			struct afs_vnode *vnode,
-			const struct afs_wait_mode *wait_mode)
+			bool async)
 {
 	struct afs_call *call;
 	__be32 *bp;
@@ -1806,5 +1841,5 @@ int afs_fs_release_lock(struct afs_server *server,
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);
 
-	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+	return afs_make_call(&server->addr, call, GFP_NOFS, async);
 }
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 535a38d2c1d0..8acf3670e756 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/fscache.h>
 #include <linux/backing-dev.h>
+#include <linux/uuid.h>
 #include <net/af_rxrpc.h>
 
 #include "afs.h"
@@ -51,31 +52,22 @@ struct afs_mount_params {
 	struct key		*key;		/* key to use for secure mounting */
 };
 
-/*
- * definition of how to wait for the completion of an operation
- */
-struct afs_wait_mode {
-	/* RxRPC received message notification */
-	rxrpc_notify_rx_t notify_rx;
-
-	/* synchronous call waiter and call dispatched notification */
-	int (*wait)(struct afs_call *call);
-
-	/* asynchronous call completion */
-	void (*async_complete)(void *reply, int error);
+enum afs_call_state {
+	AFS_CALL_REQUESTING,	/* request is being sent for outgoing call */
+	AFS_CALL_AWAIT_REPLY,	/* awaiting reply to outgoing call */
+	AFS_CALL_AWAIT_OP_ID,	/* awaiting op ID on incoming call */
+	AFS_CALL_AWAIT_REQUEST,	/* awaiting request data on incoming call */
+	AFS_CALL_REPLYING,	/* replying to incoming call */
+	AFS_CALL_AWAIT_ACK,	/* awaiting final ACK of incoming call */
+	AFS_CALL_COMPLETE,	/* Completed or failed */
 };
-
-extern const struct afs_wait_mode afs_sync_call;
-extern const struct afs_wait_mode afs_async_call;
-
 /*
  * a record of an in-progress RxRPC call
  */
 struct afs_call {
 	const struct afs_call_type *type;	/* type of call */
-	const struct afs_wait_mode *wait_mode;	/* completion wait mode */
 	wait_queue_head_t	waitq;		/* processes awaiting completion */
-	struct work_struct	async_work;	/* asynchronous work processor */
+	struct work_struct	async_work;	/* async I/O processor */
 	struct work_struct	work;		/* actual work processor */
 	struct rxrpc_call	*rxcall;	/* RxRPC call handle */
 	struct key		*key;		/* security for this call */
@@ -91,15 +83,8 @@ struct afs_call {
 	pgoff_t			first;		/* first page in mapping to deal with */
 	pgoff_t			last;		/* last page in mapping to deal with */
 	size_t			offset;		/* offset into received data store */
-	enum {					/* call state */
-		AFS_CALL_REQUESTING,	/* request is being sent for outgoing call */
-		AFS_CALL_AWAIT_REPLY,	/* awaiting reply to outgoing call */
-		AFS_CALL_AWAIT_OP_ID,	/* awaiting op ID on incoming call */
-		AFS_CALL_AWAIT_REQUEST,	/* awaiting request data on incoming call */
-		AFS_CALL_REPLYING,	/* replying to incoming call */
-		AFS_CALL_AWAIT_ACK,	/* awaiting final ACK of incoming call */
-		AFS_CALL_COMPLETE,	/* Completed or failed */
-	}			state;
+	atomic_t		usage;
+	enum afs_call_state	state;
 	int			error;		/* error code */
 	u32			abort_code;	/* Remote abort ID or 0 */
 	unsigned		request_size;	/* size of request data */
@@ -110,6 +95,7 @@ struct afs_call {
 	bool			incoming;	/* T if incoming call */
 	bool			send_pages;	/* T if data from mapping should be sent */
 	bool			need_attention;	/* T if RxRPC poked us */
+	bool			async;		/* T if asynchronous */
 	u16			service_id;	/* RxRPC service ID to call */
 	__be16			port;		/* target UDP port */
 	u32			operation_ID;	/* operation ID for an incoming call */
@@ -131,6 +117,25 @@ struct afs_call_type {
 
 	/* clean up a call */
 	void (*destructor)(struct afs_call *call);
+
+	/* Work function */
+	void (*work)(struct work_struct *work);
+};
+
+/*
+ * Record of an outstanding read operation on a vnode.
+ */
+struct afs_read {
+	loff_t			pos;		/* Where to start reading */
+	loff_t			len;		/* How much to read */
+	loff_t			actual_len;	/* How much we're actually getting */
+	atomic_t		usage;
+	unsigned int		remain;		/* Amount remaining */
+	unsigned int		index;		/* Which page we're reading into */
+	unsigned int		pg_offset;	/* Offset in page we're at */
+	unsigned int		nr_pages;
+	void (*page_done)(struct afs_call *, struct afs_read *);
+	struct page		*pages[];
 };
 
 /*
@@ -403,30 +408,6 @@ struct afs_interface {
 	unsigned	mtu;		/* MTU of interface */
 };
 
-/*
- * UUID definition [internet draft]
- * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns
- *   increments since midnight 15th October 1582
- *   - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID
- *     time
- * - the clock sequence is a 14-bit counter to avoid duplicate times
- */
-struct afs_uuid {
-	u32		time_low;			/* low part of timestamp */
-	u16		time_mid;			/* mid part of timestamp */
-	u16		time_hi_and_version;		/* high part of timestamp and version  */
-#define AFS_UUID_TO_UNIX_TIME	0x01b21dd213814000ULL
-#define AFS_UUID_TIMEHI_MASK	0x0fff
-#define AFS_UUID_VERSION_TIME	0x1000	/* time-based UUID */
-#define AFS_UUID_VERSION_NAME	0x3000	/* name-based UUID */
-#define AFS_UUID_VERSION_RANDOM	0x4000	/* (pseudo-)random generated UUID */
-	u8		clock_seq_hi_and_reserved;	/* clock seq hi and variant */
-#define AFS_UUID_CLOCKHI_MASK	0x3f
-#define AFS_UUID_VARIANT_STD	0x80
-	u8		clock_seq_low;			/* clock seq low */
-	u8		node[6];			/* spatially unique node ID (MAC addr) */
-};
-
 /*****************************************************************************/
 /*
  * cache.c
@@ -494,6 +475,7 @@ extern const struct file_operations afs_file_operations;
 extern int afs_open(struct inode *, struct file *);
 extern int afs_release(struct inode *, struct file *);
 extern int afs_page_filler(void *, struct page *);
+extern void afs_put_read(struct afs_read *);
 
 /*
  * flock.c
@@ -509,50 +491,37 @@ extern int afs_flock(struct file *, int, struct file_lock *);
  */
 extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
 				    struct afs_vnode *, struct afs_volsync *,
-				    const struct afs_wait_mode *);
-extern int afs_fs_give_up_callbacks(struct afs_server *,
-				    const struct afs_wait_mode *);
+				    bool);
+extern int afs_fs_give_up_callbacks(struct afs_server *, bool);
 extern int afs_fs_fetch_data(struct afs_server *, struct key *,
-			     struct afs_vnode *, off_t, size_t, struct page *,
-			     const struct afs_wait_mode *);
+			     struct afs_vnode *, struct afs_read *, bool);
 extern int afs_fs_create(struct afs_server *, struct key *,
 			 struct afs_vnode *, const char *, umode_t,
 			 struct afs_fid *, struct afs_file_status *,
-			 struct afs_callback *,
-			 const struct afs_wait_mode *);
+			 struct afs_callback *, bool);
 extern int afs_fs_remove(struct afs_server *, struct key *,
-			 struct afs_vnode *, const char *, bool,
-			 const struct afs_wait_mode *);
+			 struct afs_vnode *, const char *, bool, bool);
 extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *,
-		       struct afs_vnode *, const char *,
-		       const struct afs_wait_mode *);
+		       struct afs_vnode *, const char *, bool);
 extern int afs_fs_symlink(struct afs_server *, struct key *,
 			  struct afs_vnode *, const char *, const char *,
-			  struct afs_fid *, struct afs_file_status *,
-			  const struct afs_wait_mode *);
+			  struct afs_fid *, struct afs_file_status *, bool);
 extern int afs_fs_rename(struct afs_server *, struct key *,
 			 struct afs_vnode *, const char *,
-			 struct afs_vnode *, const char *,
-			 const struct afs_wait_mode *);
+			 struct afs_vnode *, const char *, bool);
 extern int afs_fs_store_data(struct afs_server *, struct afs_writeback *,
-			     pgoff_t, pgoff_t, unsigned, unsigned,
-			     const struct afs_wait_mode *);
+			     pgoff_t, pgoff_t, unsigned, unsigned, bool);
 extern int afs_fs_setattr(struct afs_server *, struct key *,
-			  struct afs_vnode *, struct iattr *,
-			  const struct afs_wait_mode *);
+			  struct afs_vnode *, struct iattr *, bool);
 extern int afs_fs_get_volume_status(struct afs_server *, struct key *,
 				    struct afs_vnode *,
-				    struct afs_volume_status *,
-				    const struct afs_wait_mode *);
+				    struct afs_volume_status *, bool);
 extern int afs_fs_set_lock(struct afs_server *, struct key *,
-			   struct afs_vnode *, afs_lock_type_t,
-			   const struct afs_wait_mode *);
+			   struct afs_vnode *, afs_lock_type_t, bool);
 extern int afs_fs_extend_lock(struct afs_server *, struct key *,
-			      struct afs_vnode *,
-			      const struct afs_wait_mode *);
+			      struct afs_vnode *, bool);
 extern int afs_fs_release_lock(struct afs_server *, struct key *,
-			       struct afs_vnode *,
-			       const struct afs_wait_mode *);
+			       struct afs_vnode *, bool);
 
 /*
  * inode.c
@@ -573,7 +542,7 @@ extern int afs_drop_inode(struct inode *);
  * main.c
  */
 extern struct workqueue_struct *afs_wq;
-extern struct afs_uuid afs_uuid;
+extern struct uuid_v1 afs_uuid;
 
 /*
  * misc.c
@@ -592,6 +561,11 @@ extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
 extern void afs_mntpt_kill_timer(void);
 
 /*
+ * netdevices.c
+ */
+extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool);
+
+/*
  * proc.c
  */
 extern int afs_proc_init(void);
@@ -603,11 +577,13 @@ extern void afs_proc_cell_remove(struct afs_cell *);
  * rxrpc.c
  */
 extern struct socket *afs_socket;
+extern atomic_t afs_outstanding_calls;
 
 extern int afs_open_socket(void);
 extern void afs_close_socket(void);
-extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
-			 const struct afs_wait_mode *);
+extern void afs_put_call(struct afs_call *);
+extern int afs_queue_call_work(struct afs_call *);
+extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t, bool);
 extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
 					    size_t, size_t);
 extern void afs_flat_call_destructor(struct afs_call *);
@@ -653,21 +629,14 @@ extern int afs_fs_init(void);
 extern void afs_fs_exit(void);
 
 /*
- * use-rtnetlink.c
- */
-extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool);
-extern int afs_get_MAC_address(u8 *, size_t);
-
-/*
  * vlclient.c
  */
 extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
 				    const char *, struct afs_cache_vlocation *,
-				    const struct afs_wait_mode *);
+				    bool);
 extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *,
 				  afs_volid_t, afs_voltype_t,
-				  struct afs_cache_vlocation *,
-				  const struct afs_wait_mode *);
+				  struct afs_cache_vlocation *, bool);
 
 /*
  * vlocation.c
@@ -699,7 +668,7 @@ extern void afs_vnode_finalise_status_update(struct afs_vnode *,
 extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *,
 				  struct key *);
 extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *,
-				off_t, size_t, struct page *);
+				struct afs_read *);
 extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *,
 			    umode_t, struct afs_fid *, struct afs_file_status *,
 			    struct afs_callback *, struct afs_server **);
@@ -756,6 +725,8 @@ extern int afs_fsync(struct file *, loff_t, loff_t, int);
 /*
  * debug tracing
  */
+#include <trace/events/afs.h>
+
 extern unsigned afs_debug;
 
 #define dbgprintk(FMT,...) \
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 0b187ef3b5b7..51d7d17bca57 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -15,6 +15,7 @@
 #include <linux/completion.h>
 #include <linux/sched.h>
 #include <linux/random.h>
+#define CREATE_TRACE_POINTS
 #include "internal.h"
 
 MODULE_DESCRIPTION("AFS Client File System");
@@ -30,53 +31,10 @@ static char *rootcell;
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
-struct afs_uuid afs_uuid;
+struct uuid_v1 afs_uuid;
 struct workqueue_struct *afs_wq;
 
 /*
- * get a client UUID
- */
-static int __init afs_get_client_UUID(void)
-{
-	struct timespec ts;
-	u64 uuidtime;
-	u16 clockseq;
-	int ret;
-
-	/* read the MAC address of one of the external interfaces and construct
-	 * a UUID from it */
-	ret = afs_get_MAC_address(afs_uuid.node, sizeof(afs_uuid.node));
-	if (ret < 0)
-		return ret;
-
-	getnstimeofday(&ts);
-	uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10;
-	uuidtime += ts.tv_nsec / 100;
-	uuidtime += AFS_UUID_TO_UNIX_TIME;
-	afs_uuid.time_low = uuidtime;
-	afs_uuid.time_mid = uuidtime >> 32;
-	afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK;
-	afs_uuid.time_hi_and_version |= AFS_UUID_VERSION_TIME;
-
-	get_random_bytes(&clockseq, 2);
-	afs_uuid.clock_seq_low = clockseq;
-	afs_uuid.clock_seq_hi_and_reserved =
-		(clockseq >> 8) & AFS_UUID_CLOCKHI_MASK;
-	afs_uuid.clock_seq_hi_and_reserved |= AFS_UUID_VARIANT_STD;
-
-	_debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
-	       afs_uuid.time_low,
-	       afs_uuid.time_mid,
-	       afs_uuid.time_hi_and_version,
-	       afs_uuid.clock_seq_hi_and_reserved,
-	       afs_uuid.clock_seq_low,
-	       afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2],
-	       afs_uuid.node[3], afs_uuid.node[4], afs_uuid.node[5]);
-
-	return 0;
-}
-
-/*
  * initialise the AFS client FS module
  */
 static int __init afs_init(void)
@@ -85,9 +43,7 @@ static int __init afs_init(void)
 
 	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
 
-	ret = afs_get_client_UUID();
-	if (ret < 0)
-		return ret;
+	generate_random_uuid((unsigned char *)&afs_uuid);
 
 	/* create workqueue */
 	ret = -ENOMEM;
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 81dd075356b9..d4fb0afc0097 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -202,7 +202,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 
 	/* try and do the mount */
 	_debug("--- attempting mount %s -o %s ---", devname, options);
-	mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
+	mnt = vfs_submount(mntpt, &afs_fs_type, devname, options);
 	_debug("--- mount result %p ---", mnt);
 
 	free_page((unsigned long) devname);
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
index 7ad36506c256..40b2bab3e401 100644
--- a/fs/afs/netdevices.c
+++ b/fs/afs/netdevices.c
@@ -12,27 +12,6 @@
 #include "internal.h"
 
 /*
- * get a MAC address from a random ethernet interface that has a real one
- * - the buffer will normally be 6 bytes in size
- */
-int afs_get_MAC_address(u8 *mac, size_t maclen)
-{
-	struct net_device *dev;
-	int ret = -ENODEV;
-
-	BUG_ON(maclen != ETH_ALEN);
-
-	rtnl_lock();
-	dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER);
-	if (dev) {
-		memcpy(mac, dev->dev_addr, maclen);
-		ret = 0;
-	}
-	rtnl_unlock();
-	return ret;
-}
-
-/*
  * get a list of this system's interface IPv4 addresses, netmasks and MTUs
  * - maxbufs must be at least 1
  * - returns the number of interface records in the buffer
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 25f05a8d21b1..95f42872b787 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -19,35 +19,16 @@
 struct socket *afs_socket; /* my RxRPC socket */
 static struct workqueue_struct *afs_async_calls;
 static struct afs_call *afs_spare_incoming_call;
-static atomic_t afs_outstanding_calls;
+atomic_t afs_outstanding_calls;
 
-static void afs_free_call(struct afs_call *);
 static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
 static int afs_wait_for_call_to_complete(struct afs_call *);
 static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
-static int afs_dont_wait_for_call_to_complete(struct afs_call *);
 static void afs_process_async_call(struct work_struct *);
 static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
 static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
 static int afs_deliver_cm_op_id(struct afs_call *);
 
-/* synchronous call management */
-const struct afs_wait_mode afs_sync_call = {
-	.notify_rx	= afs_wake_up_call_waiter,
-	.wait		= afs_wait_for_call_to_complete,
-};
-
-/* asynchronous call management */
-const struct afs_wait_mode afs_async_call = {
-	.notify_rx	= afs_wake_up_async_call,
-	.wait		= afs_dont_wait_for_call_to_complete,
-};
-
-/* asynchronous incoming call management */
-static const struct afs_wait_mode afs_async_incoming_call = {
-	.notify_rx	= afs_wake_up_async_call,
-};
-
 /* asynchronous incoming call initial processing */
 static const struct afs_call_type afs_RXCMxxxx = {
 	.name		= "CB.xxxx",
@@ -130,9 +111,11 @@ void afs_close_socket(void)
 {
 	_enter("");
 
+	kernel_listen(afs_socket, 0);
+	flush_workqueue(afs_async_calls);
+
 	if (afs_spare_incoming_call) {
-		atomic_inc(&afs_outstanding_calls);
-		afs_free_call(afs_spare_incoming_call);
+		afs_put_call(afs_spare_incoming_call);
 		afs_spare_incoming_call = NULL;
 	}
 
@@ -141,7 +124,6 @@ void afs_close_socket(void)
 			 TASK_UNINTERRUPTIBLE);
 	_debug("no outstanding calls");
 
-	flush_workqueue(afs_async_calls);
 	kernel_sock_shutdown(afs_socket, SHUT_RDWR);
 	flush_workqueue(afs_async_calls);
 	sock_release(afs_socket);
@@ -152,44 +134,79 @@ void afs_close_socket(void)
 }
 
 /*
- * free a call
+ * Allocate a call.
  */
-static void afs_free_call(struct afs_call *call)
+static struct afs_call *afs_alloc_call(const struct afs_call_type *type,
+				       gfp_t gfp)
 {
-	_debug("DONE %p{%s} [%d]",
-	       call, call->type->name, atomic_read(&afs_outstanding_calls));
+	struct afs_call *call;
+	int o;
 
-	ASSERTCMP(call->rxcall, ==, NULL);
-	ASSERT(!work_pending(&call->async_work));
-	ASSERT(call->type->name != NULL);
+	call = kzalloc(sizeof(*call), gfp);
+	if (!call)
+		return NULL;
 
-	kfree(call->request);
-	kfree(call);
+	call->type = type;
+	atomic_set(&call->usage, 1);
+	INIT_WORK(&call->async_work, afs_process_async_call);
+	init_waitqueue_head(&call->waitq);
 
-	if (atomic_dec_and_test(&afs_outstanding_calls))
-		wake_up_atomic_t(&afs_outstanding_calls);
+	o = atomic_inc_return(&afs_outstanding_calls);
+	trace_afs_call(call, afs_call_trace_alloc, 1, o,
+		       __builtin_return_address(0));
+	return call;
 }
 
 /*
- * End a call but do not free it
+ * Dispose of a reference on a call.
  */
-static void afs_end_call_nofree(struct afs_call *call)
+void afs_put_call(struct afs_call *call)
 {
-	if (call->rxcall) {
-		rxrpc_kernel_end_call(afs_socket, call->rxcall);
-		call->rxcall = NULL;
+	int n = atomic_dec_return(&call->usage);
+	int o = atomic_read(&afs_outstanding_calls);
+
+	trace_afs_call(call, afs_call_trace_put, n + 1, o,
+		       __builtin_return_address(0));
+
+	ASSERTCMP(n, >=, 0);
+	if (n == 0) {
+		ASSERT(!work_pending(&call->async_work));
+		ASSERT(call->type->name != NULL);
+
+		if (call->rxcall) {
+			rxrpc_kernel_end_call(afs_socket, call->rxcall);
+			call->rxcall = NULL;
+		}
+		if (call->type->destructor)
+			call->type->destructor(call);
+
+		kfree(call->request);
+		kfree(call);
+
+		o = atomic_dec_return(&afs_outstanding_calls);
+		trace_afs_call(call, afs_call_trace_free, 0, o,
+			       __builtin_return_address(0));
+		if (o == 0)
+			wake_up_atomic_t(&afs_outstanding_calls);
 	}
-	if (call->type->destructor)
-		call->type->destructor(call);
 }
 
 /*
- * End a call and free it
+ * Queue the call for actual work.  Returns 0 unconditionally for convenience.
  */
-static void afs_end_call(struct afs_call *call)
+int afs_queue_call_work(struct afs_call *call)
 {
-	afs_end_call_nofree(call);
-	afs_free_call(call);
+	int u = atomic_inc_return(&call->usage);
+
+	trace_afs_call(call, afs_call_trace_work, u,
+		       atomic_read(&afs_outstanding_calls),
+		       __builtin_return_address(0));
+
+	INIT_WORK(&call->work, call->type->work);
+
+	if (!queue_work(afs_wq, &call->work))
+		afs_put_call(call);
+	return 0;
 }
 
 /*
@@ -200,25 +217,19 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
 {
 	struct afs_call *call;
 
-	call = kzalloc(sizeof(*call), GFP_NOFS);
+	call = afs_alloc_call(type, GFP_NOFS);
 	if (!call)
 		goto nomem_call;
 
-	_debug("CALL %p{%s} [%d]",
-	       call, type->name, atomic_read(&afs_outstanding_calls));
-	atomic_inc(&afs_outstanding_calls);
-
-	call->type = type;
-	call->request_size = request_size;
-	call->reply_max = reply_max;
-
 	if (request_size) {
+		call->request_size = request_size;
 		call->request = kmalloc(request_size, GFP_NOFS);
 		if (!call->request)
 			goto nomem_free;
 	}
 
 	if (reply_max) {
+		call->reply_max = reply_max;
 		call->buffer = kmalloc(reply_max, GFP_NOFS);
 		if (!call->buffer)
 			goto nomem_free;
@@ -228,7 +239,7 @@ struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
 	return call;
 
 nomem_free:
-	afs_free_call(call);
+	afs_put_call(call);
 nomem_call:
 	return NULL;
 }
@@ -315,7 +326,7 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
  * initiate a call
  */
 int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
-		  const struct afs_wait_mode *wait_mode)
+		  bool async)
 {
 	struct sockaddr_rxrpc srx;
 	struct rxrpc_call *rxcall;
@@ -332,8 +343,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	       call, call->type->name, key_serial(call->key),
 	       atomic_read(&afs_outstanding_calls));
 
-	call->wait_mode = wait_mode;
-	INIT_WORK(&call->async_work, afs_process_async_call);
+	call->async = async;
 
 	memset(&srx, 0, sizeof(srx));
 	srx.srx_family = AF_RXRPC;
@@ -347,7 +357,9 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	/* create a call */
 	rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
 					 (unsigned long) call, gfp,
-					 wait_mode->notify_rx);
+					 (async ?
+					  afs_wake_up_async_call :
+					  afs_wake_up_call_waiter));
 	call->key = NULL;
 	if (IS_ERR(rxcall)) {
 		ret = PTR_ERR(rxcall);
@@ -386,12 +398,15 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 
 	/* at this point, an async call may no longer exist as it may have
 	 * already completed */
-	return wait_mode->wait(call);
+	if (call->async)
+		return -EINPROGRESS;
+
+	return afs_wait_for_call_to_complete(call);
 
 error_do_abort:
 	rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, -ret, "KSD");
 error_kill_call:
-	afs_end_call(call);
+	afs_put_call(call);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -416,6 +431,8 @@ static void afs_deliver_to_call(struct afs_call *call)
 			ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall,
 						     NULL, 0, &offset, false,
 						     &call->abort_code);
+			trace_afs_recv_data(call, 0, offset, false, ret);
+
 			if (ret == -EINPROGRESS || ret == -EAGAIN)
 				return;
 			if (ret == 1 || ret < 0) {
@@ -459,7 +476,7 @@ static void afs_deliver_to_call(struct afs_call *call)
 
 done:
 	if (call->state == AFS_CALL_COMPLETE && call->incoming)
-		afs_end_call(call);
+		afs_put_call(call);
 out:
 	_leave("");
 	return;
@@ -516,7 +533,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
 	}
 
 	_debug("call complete");
-	afs_end_call(call);
+	afs_put_call(call);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -540,24 +557,25 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
 				   unsigned long call_user_ID)
 {
 	struct afs_call *call = (struct afs_call *)call_user_ID;
+	int u;
 
+	trace_afs_notify_call(rxcall, call);
 	call->need_attention = true;
-	queue_work(afs_async_calls, &call->async_work);
-}
 
-/*
- * put a call into asynchronous mode
- * - mustn't touch the call descriptor as the call my have completed by the
- *   time we get here
- */
-static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
-{
-	_enter("");
-	return -EINPROGRESS;
+	u = __atomic_add_unless(&call->usage, 1, 0);
+	if (u != 0) {
+		trace_afs_call(call, afs_call_trace_wake, u,
+			       atomic_read(&afs_outstanding_calls),
+			       __builtin_return_address(0));
+
+		if (!queue_work(afs_async_calls, &call->async_work))
+			afs_put_call(call);
+	}
 }
 
 /*
- * delete an asynchronous call
+ * Delete an asynchronous call.  The work item carries a ref to the call struct
+ * that we need to release.
  */
 static void afs_delete_async_call(struct work_struct *work)
 {
@@ -565,13 +583,14 @@ static void afs_delete_async_call(struct work_struct *work)
 
 	_enter("");
 
-	afs_free_call(call);
+	afs_put_call(call);
 
 	_leave("");
 }
 
 /*
- * perform processing on an asynchronous call
+ * Perform I/O processing on an asynchronous call.  The work item carries a ref
+ * to the call struct that we either need to release or to pass on.
  */
 static void afs_process_async_call(struct work_struct *work)
 {
@@ -584,21 +603,19 @@ static void afs_process_async_call(struct work_struct *work)
 		afs_deliver_to_call(call);
 	}
 
-	if (call->state == AFS_CALL_COMPLETE && call->wait_mode) {
-		if (call->wait_mode->async_complete)
-			call->wait_mode->async_complete(call->reply,
-							call->error);
+	if (call->state == AFS_CALL_COMPLETE) {
 		call->reply = NULL;
 
-		/* kill the call */
-		afs_end_call_nofree(call);
-
-		/* we can't just delete the call because the work item may be
-		 * queued */
+		/* We have two refs to release - one from the alloc and one
+		 * queued with the work item - and we can't just deallocate the
+		 * call because the work item may be queued again.
+		 */
 		call->async_work.func = afs_delete_async_call;
-		queue_work(afs_async_calls, &call->async_work);
+		if (!queue_work(afs_async_calls, &call->async_work))
+			afs_put_call(call);
 	}
 
+	afs_put_call(call);
 	_leave("");
 }
 
@@ -618,15 +635,13 @@ static void afs_charge_preallocation(struct work_struct *work)
 
 	for (;;) {
 		if (!call) {
-			call = kzalloc(sizeof(struct afs_call), GFP_KERNEL);
+			call = afs_alloc_call(&afs_RXCMxxxx, GFP_KERNEL);
 			if (!call)
 				break;
 
-			INIT_WORK(&call->async_work, afs_process_async_call);
-			call->wait_mode = &afs_async_incoming_call;
-			call->type = &afs_RXCMxxxx;
-			init_waitqueue_head(&call->waitq);
+			call->async = true;
 			call->state = AFS_CALL_AWAIT_OP_ID;
+			init_waitqueue_head(&call->waitq);
 		}
 
 		if (rxrpc_kernel_charge_accept(afs_socket,
@@ -648,9 +663,8 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall,
 {
 	struct afs_call *call = (struct afs_call *)user_call_ID;
 
-	atomic_inc(&afs_outstanding_calls);
 	call->rxcall = NULL;
-	afs_free_call(call);
+	afs_put_call(call);
 }
 
 /*
@@ -659,7 +673,6 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall,
 static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall,
 			    unsigned long user_call_ID)
 {
-	atomic_inc(&afs_outstanding_calls);
 	queue_work(afs_wq, &afs_charge_preallocation_work);
 }
 
@@ -689,6 +702,8 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
 	if (!afs_cm_incoming_call(call))
 		return -ENOTSUPP;
 
+	trace_afs_cb_call(call);
+
 	/* pass responsibility for the remainer of this message off to the
 	 * cache manager op */
 	return call->type->deliver(call);
@@ -721,7 +736,6 @@ void afs_send_empty_reply(struct afs_call *call)
 		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
 					RX_USER_ABORT, ENOMEM, "KOO");
 	default:
-		afs_end_call(call);
 		_leave(" [error]");
 		return;
 	}
@@ -760,7 +774,6 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 		rxrpc_kernel_abort_call(afs_socket, call->rxcall,
 					RX_USER_ABORT, ENOMEM, "KOO");
 	}
-	afs_end_call(call);
 	_leave(" [error]");
 }
 
@@ -780,6 +793,7 @@ int afs_extract_data(struct afs_call *call, void *buf, size_t count,
 	ret = rxrpc_kernel_recv_data(afs_socket, call->rxcall,
 				     buf, count, &call->offset,
 				     want_more, &call->abort_code);
+	trace_afs_recv_data(call, count, call->offset, want_more, ret);
 	if (ret == 0 || ret == -EAGAIN)
 		return ret;
 
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 94bcd97d22b8..a5e4cc561b6c 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -147,7 +147,7 @@ int afs_vl_get_entry_by_name(struct in_addr *addr,
 			     struct key *key,
 			     const char *volname,
 			     struct afs_cache_vlocation *entry,
-			     const struct afs_wait_mode *wait_mode)
+			     bool async)
 {
 	struct afs_call *call;
 	size_t volnamesz, reqsz, padsz;
@@ -177,7 +177,7 @@ int afs_vl_get_entry_by_name(struct in_addr *addr,
 		memset((void *) bp + volnamesz, 0, padsz);
 
 	/* initiate the call */
-	return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
+	return afs_make_call(addr, call, GFP_KERNEL, async);
 }
 
 /*
@@ -188,7 +188,7 @@ int afs_vl_get_entry_by_id(struct in_addr *addr,
 			   afs_volid_t volid,
 			   afs_voltype_t voltype,
 			   struct afs_cache_vlocation *entry,
-			   const struct afs_wait_mode *wait_mode)
+			   bool async)
 {
 	struct afs_call *call;
 	__be32 *bp;
@@ -211,5 +211,5 @@ int afs_vl_get_entry_by_id(struct in_addr *addr,
 	*bp   = htonl(voltype);
 
 	/* initiate the call */
-	return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
+	return afs_make_call(addr, call, GFP_KERNEL, async);
 }
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 45a86396fd2d..d7d8dd8c0b31 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -53,7 +53,7 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
 
 		/* attempt to access the VL server */
 		ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb,
-					       &afs_sync_call);
+					       false);
 		switch (ret) {
 		case 0:
 			goto out;
@@ -111,7 +111,7 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
 
 		/* attempt to access the VL server */
 		ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb,
-					     &afs_sync_call);
+					     false);
 		switch (ret) {
 		case 0:
 			goto out;
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 25cf4c3f4ff7..dcb956143c86 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -358,7 +358,7 @@ get_anyway:
 		       server, ntohl(server->addr.s_addr));
 
 		ret = afs_fs_fetch_file_status(server, key, vnode, NULL,
-					       &afs_sync_call);
+					       false);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
@@ -393,7 +393,7 @@ no_server:
  * - TODO implement caching
  */
 int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
-			 off_t offset, size_t length, struct page *page)
+			 struct afs_read *desc)
 {
 	struct afs_server *server;
 	int ret;
@@ -420,8 +420,8 @@ int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
 
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
-		ret = afs_fs_fetch_data(server, key, vnode, offset, length,
-					page, &afs_sync_call);
+		ret = afs_fs_fetch_data(server, key, vnode, desc,
+					false);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
@@ -477,7 +477,7 @@ int afs_vnode_create(struct afs_vnode *vnode, struct key *key,
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
 		ret = afs_fs_create(server, key, vnode, name, mode, newfid,
-				    newstatus, newcb, &afs_sync_call);
+				    newstatus, newcb, false);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
@@ -533,7 +533,7 @@ int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name,
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
 		ret = afs_fs_remove(server, key, vnode, name, isdir,
-				    &afs_sync_call);
+				    false);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
@@ -595,7 +595,7 @@ int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
 		ret = afs_fs_link(server, key, dvnode, vnode, name,
-				  &afs_sync_call);
+				  false);
 
 	} while (!afs_volume_release_fileserver(dvnode, server, ret));
 
@@ -659,7 +659,7 @@ int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key,
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
 		ret = afs_fs_symlink(server, key, vnode, name, content,
-				     newfid, newstatus, &afs_sync_call);
+				     newfid, newstatus, false);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
@@ -729,7 +729,7 @@ int afs_vnode_rename(struct afs_vnode *orig_dvnode,
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
 		ret = afs_fs_rename(server, key, orig_dvnode, orig_name,
-				    new_dvnode, new_name, &afs_sync_call);
+				    new_dvnode, new_name, false);
 
 	} while (!afs_volume_release_fileserver(orig_dvnode, server, ret));
 
@@ -795,7 +795,7 @@ int afs_vnode_store_data(struct afs_writeback *wb, pgoff_t first, pgoff_t last,
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
 		ret = afs_fs_store_data(server, wb, first, last, offset, to,
-					&afs_sync_call);
+					false);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
@@ -847,7 +847,7 @@ int afs_vnode_setattr(struct afs_vnode *vnode, struct key *key,
 
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
-		ret = afs_fs_setattr(server, key, vnode, attr, &afs_sync_call);
+		ret = afs_fs_setattr(server, key, vnode, attr, false);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
@@ -894,7 +894,7 @@ int afs_vnode_get_volume_status(struct afs_vnode *vnode, struct key *key,
 
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
-		ret = afs_fs_get_volume_status(server, key, vnode, vs, &afs_sync_call);
+		ret = afs_fs_get_volume_status(server, key, vnode, vs, false);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
@@ -933,7 +933,7 @@ int afs_vnode_set_lock(struct afs_vnode *vnode, struct key *key,
 
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
-		ret = afs_fs_set_lock(server, key, vnode, type, &afs_sync_call);
+		ret = afs_fs_set_lock(server, key, vnode, type, false);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
@@ -971,7 +971,7 @@ int afs_vnode_extend_lock(struct afs_vnode *vnode, struct key *key)
 
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
-		ret = afs_fs_extend_lock(server, key, vnode, &afs_sync_call);
+		ret = afs_fs_extend_lock(server, key, vnode, false);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
@@ -1009,7 +1009,7 @@ int afs_vnode_release_lock(struct afs_vnode *vnode, struct key *key)
 
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
-		ret = afs_fs_release_lock(server, key, vnode, &afs_sync_call);
+		ret = afs_fs_release_lock(server, key, vnode, false);
 
 	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index d142a2449e65..546f9d01710b 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,6 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	volume->cell		= params->cell;
 	volume->vid		= vlocation->vldb.vid[params->type];
 
+	volume->bdi.ra_pages	= VM_MAX_READAHEAD*1024/PAGE_SIZE; 
 	ret = bdi_setup_and_register(&volume->bdi, "afs");
 	if (ret)
 		goto error_bdi;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index f865c3f05bea..c83c1a0e851f 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -86,19 +86,30 @@ void afs_put_writeback(struct afs_writeback *wb)
 static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
 			 loff_t pos, struct page *page)
 {
+	struct afs_read *req;
 	loff_t i_size;
 	int ret;
-	int len;
 
 	_enter(",,%llu", (unsigned long long)pos);
 
+	req = kzalloc(sizeof(struct afs_read) + sizeof(struct page *),
+		      GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	atomic_set(&req->usage, 1);
+	req->pos = pos;
+	req->nr_pages = 1;
+	req->pages[0] = page;
+
 	i_size = i_size_read(&vnode->vfs_inode);
 	if (pos + PAGE_SIZE > i_size)
-		len = i_size - pos;
+		req->len = i_size - pos;
 	else
-		len = PAGE_SIZE;
+		req->len = PAGE_SIZE;
 
-	ret = afs_vnode_fetch_data(vnode, key, pos, len, page);
+	ret = afs_vnode_fetch_data(vnode, key, req);
+	afs_put_read(req);
 	if (ret < 0) {
 		if (ret == -ENOENT) {
 			_debug("got NOENT from server"
diff --git a/fs/aio.c b/fs/aio.c
index 9f4a9a2e7ae4..63e97f3168e7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1085,7 +1085,8 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
 		 * Tell lockdep we inherited freeze protection from submission
 		 * thread.
 		 */
-		__sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+		if (S_ISREG(file_inode(file)->i_mode))
+			__sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
 		file_end_write(file);
 	}
 
@@ -1525,7 +1526,8 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
 		 * by telling it the lock got released so that it doesn't
 		 * complain about held lock when we return to userspace.
 		 */
-		__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+		if (S_ISREG(file_inode(file)->i_mode))
+			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
 	}
 	kfree(iovec);
 	return ret;
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 1278335ce366..79fbd85db4ba 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -436,8 +436,8 @@ int autofs4_wait(struct autofs_sb_info *sbi,
 		memcpy(&wq->name, &qstr, sizeof(struct qstr));
 		wq->dev = autofs4_get_dev(sbi);
 		wq->ino = autofs4_get_ino(sbi);
-		wq->uid = current_real_cred()->uid;
-		wq->gid = current_real_cred()->gid;
+		wq->uid = current_cred()->uid;
+		wq->gid = current_cred()->gid;
 		wq->pid = pid;
 		wq->tgid = tgid;
 		wq->status = -EINTR; /* Status return if interrupted */
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 29a02daf08a9..443a6f537d56 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -91,12 +91,18 @@ static struct linux_binfmt elf_format = {
 
 #define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
 
-static int set_brk(unsigned long start, unsigned long end)
+static int set_brk(unsigned long start, unsigned long end, int prot)
 {
 	start = ELF_PAGEALIGN(start);
 	end = ELF_PAGEALIGN(end);
 	if (end > start) {
-		int error = vm_brk(start, end - start);
+		/*
+		 * Map the last of the bss segment.
+		 * If the header is requesting these pages to be
+		 * executable, honour that (ppc32 needs this).
+		 */
+		int error = vm_brk_flags(start, end - start,
+				prot & PROT_EXEC ? VM_EXEC : 0);
 		if (error)
 			return error;
 	}
@@ -524,6 +530,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 	unsigned long load_addr = 0;
 	int load_addr_set = 0;
 	unsigned long last_bss = 0, elf_bss = 0;
+	int bss_prot = 0;
 	unsigned long error = ~0UL;
 	unsigned long total_size;
 	int i;
@@ -606,8 +613,10 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 			 * elf_bss and last_bss is the bss section.
 			 */
 			k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
-			if (k > last_bss)
+			if (k > last_bss) {
 				last_bss = k;
+				bss_prot = elf_prot;
+			}
 		}
 	}
 
@@ -623,13 +632,14 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 	/*
 	 * Next, align both the file and mem bss up to the page size,
 	 * since this is where elf_bss was just zeroed up to, and where
-	 * last_bss will end after the vm_brk() below.
+	 * last_bss will end after the vm_brk_flags() below.
 	 */
 	elf_bss = ELF_PAGEALIGN(elf_bss);
 	last_bss = ELF_PAGEALIGN(last_bss);
 	/* Finally, if there is still more bss to allocate, do it. */
 	if (last_bss > elf_bss) {
-		error = vm_brk(elf_bss, last_bss - elf_bss);
+		error = vm_brk_flags(elf_bss, last_bss - elf_bss,
+				bss_prot & PROT_EXEC ? VM_EXEC : 0);
 		if (error)
 			goto out;
 	}
@@ -674,6 +684,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
 	unsigned long elf_bss, elf_brk;
+	int bss_prot = 0;
 	int retval, i;
 	unsigned long elf_entry;
 	unsigned long interp_load_addr = 0;
@@ -882,7 +893,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
 			   before this one. Map anonymous pages, if needed,
 			   and clear the area.  */
 			retval = set_brk(elf_bss + load_bias,
-					 elf_brk + load_bias);
+					 elf_brk + load_bias,
+					 bss_prot);
 			if (retval)
 				goto out_free_dentry;
 			nbyte = ELF_PAGEOFFSET(elf_bss);
@@ -976,8 +988,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 		if (end_data < k)
 			end_data = k;
 		k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
-		if (k > elf_brk)
+		if (k > elf_brk) {
+			bss_prot = elf_prot;
 			elf_brk = k;
+		}
 	}
 
 	loc->elf_ex.e_entry += load_bias;
@@ -993,7 +1007,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	 * mapping in the interpreter, to make sure it doesn't wind
 	 * up getting placed where the bss needs to go.
 	 */
-	retval = set_brk(elf_bss, elf_brk);
+	retval = set_brk(elf_bss, elf_brk, bss_prot);
 	if (retval)
 		goto out_free_dentry;
 	if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
@@ -1428,17 +1442,18 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 		 * group-wide total, not its individual thread total.
 		 */
 		thread_group_cputime(p, &cputime);
-		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
-		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
+		prstatus->pr_utime = ns_to_timeval(cputime.utime);
+		prstatus->pr_stime = ns_to_timeval(cputime.stime);
 	} else {
-		cputime_t utime, stime;
+		u64 utime, stime;
 
 		task_cputime(p, &utime, &stime);
-		cputime_to_timeval(utime, &prstatus->pr_utime);
-		cputime_to_timeval(stime, &prstatus->pr_stime);
+		prstatus->pr_utime = ns_to_timeval(utime);
+		prstatus->pr_stime = ns_to_timeval(stime);
 	}
-	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
-	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
+
+	prstatus->pr_cutime = ns_to_timeval(p->signal->cutime);
+	prstatus->pr_cstime = ns_to_timeval(p->signal->cstime);
 }
 
 static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
@@ -2298,6 +2313,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 				goto end_coredump;
 		}
 	}
+	dump_truncate(cprm);
 
 	if (!elf_core_write_extra_data(cprm))
 		goto end_coredump;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d2e36f82c35d..ffca4bbc3d63 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1349,17 +1349,17 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 		 * group-wide total, not its individual thread total.
 		 */
 		thread_group_cputime(p, &cputime);
-		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
-		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
+		prstatus->pr_utime = ns_to_timeval(cputime.utime);
+		prstatus->pr_stime = ns_to_timeval(cputime.stime);
 	} else {
-		cputime_t utime, stime;
+		u64 utime, stime;
 
 		task_cputime(p, &utime, &stime);
-		cputime_to_timeval(utime, &prstatus->pr_utime);
-		cputime_to_timeval(stime, &prstatus->pr_stime);
+		prstatus->pr_utime = ns_to_timeval(utime);
+		prstatus->pr_stime = ns_to_timeval(stime);
 	}
-	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
-	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
+	prstatus->pr_cutime = ns_to_timeval(p->signal->cutime);
+	prstatus->pr_cstime = ns_to_timeval(p->signal->cstime);
 
 	prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap;
 	prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6254cee8f8f3..73031ec54a7b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -328,9 +328,10 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
 	struct block_device *bdev = I_BDEV(inode);
+	struct blk_plug plug;
 	struct blkdev_dio *dio;
 	struct bio *bio;
-	bool is_read = (iov_iter_rw(iter) == READ);
+	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
 	loff_t pos = iocb->ki_pos;
 	blk_qc_t qc = BLK_QC_T_NONE;
 	int ret;
@@ -343,7 +344,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	bio_get(bio); /* extra ref for the completion handler */
 
 	dio = container_of(bio, struct blkdev_dio, bio);
-	dio->is_sync = is_sync_kiocb(iocb);
+	dio->is_sync = is_sync = is_sync_kiocb(iocb);
 	if (dio->is_sync)
 		dio->waiter = current;
 	else
@@ -353,6 +354,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	dio->multi_bio = false;
 	dio->should_dirty = is_read && (iter->type == ITER_IOVEC);
 
+	blk_start_plug(&plug);
 	for (;;) {
 		bio->bi_bdev = bdev;
 		bio->bi_iter.bi_sector = pos >> 9;
@@ -394,8 +396,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		submit_bio(bio);
 		bio = bio_alloc(GFP_KERNEL, nr_pages);
 	}
+	blk_finish_plug(&plug);
 
-	if (!dio->is_sync)
+	if (!is_sync)
 		return -EIOCBQUEUED;
 
 	for (;;) {
@@ -881,6 +884,8 @@ static void bdev_evict_inode(struct inode *inode)
 	spin_lock(&bdev_lock);
 	list_del_init(&bdev->bd_list);
 	spin_unlock(&bdev_lock);
+	if (bdev->bd_bdi != &noop_backing_dev_info)
+		bdi_put(bdev->bd_bdi);
 }
 
 static const struct super_operations bdev_sops = {
@@ -951,6 +956,21 @@ static int bdev_set(struct inode *inode, void *data)
 
 static LIST_HEAD(all_bdevs);
 
+/*
+ * If there is a bdev inode for this device, unhash it so that it gets evicted
+ * as soon as last inode reference is dropped.
+ */
+void bdev_unhash_inode(dev_t dev)
+{
+	struct inode *inode;
+
+	inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev);
+	if (inode) {
+		remove_inode_hash(inode);
+		iput(inode);
+	}
+}
+
 struct block_device *bdget(dev_t dev)
 {
 	struct block_device *bdev;
@@ -968,6 +988,7 @@ struct block_device *bdget(dev_t dev)
 		bdev->bd_contains = NULL;
 		bdev->bd_super = NULL;
 		bdev->bd_inode = inode;
+		bdev->bd_bdi = &noop_backing_dev_info;
 		bdev->bd_block_size = (1 << inode->i_blkbits);
 		bdev->bd_part_count = 0;
 		bdev->bd_invalidated = 0;
@@ -1524,6 +1545,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_disk = disk;
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
+		if (bdev->bd_bdi == &noop_backing_dev_info)
+			bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);
 
 		if (!partno) {
 			ret = -ENXIO;
@@ -1619,6 +1642,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
 	bdev->bd_queue = NULL;
+	bdi_put(bdev->bd_bdi);
+	bdev->bd_bdi = &noop_backing_dev_info;
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 63d197724519..ff0b0be92d61 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -273,6 +273,8 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
 	unsigned long flags;
 
 	while (1) {
+		void *wtag;
+
 		spin_lock_irqsave(lock, flags);
 		if (list_empty(list))
 			break;
@@ -299,11 +301,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
 		spin_unlock_irqrestore(lock, flags);
 
 		/*
-		 * we don't want to call the ordered free functions
-		 * with the lock held though
+		 * We don't want to call the ordered free functions with the
+		 * lock held though. Save the work as tag for the trace event,
+		 * because the callback could free the structure.
 		 */
+		wtag = work;
 		work->ordered_free(work);
-		trace_btrfs_all_work_done(work);
+		trace_btrfs_all_work_done(wq->fs_info, wtag);
 	}
 	spin_unlock_irqrestore(lock, flags);
 }
@@ -311,6 +315,7 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
 static void normal_work_helper(struct btrfs_work *work)
 {
 	struct __btrfs_workqueue *wq;
+	void *wtag;
 	int need_order = 0;
 
 	/*
@@ -324,6 +329,8 @@ static void normal_work_helper(struct btrfs_work *work)
 	if (work->ordered_func)
 		need_order = 1;
 	wq = work->wq;
+	/* Safe for tracepoints in case work gets freed by the callback */
+	wtag = work;
 
 	trace_btrfs_work_sched(work);
 	thresh_exec_hook(wq);
@@ -333,7 +340,7 @@ static void normal_work_helper(struct btrfs_work *work)
 		run_ordered_work(wq);
 	}
 	if (!need_order)
-		trace_btrfs_all_work_done(work);
+		trace_btrfs_all_work_done(wq->fs_info, wtag);
 }
 
 void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7f390849343b..c4444d6f439f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1024,6 +1024,7 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
 	unsigned long buf_offset;
 	unsigned long current_buf_start;
 	unsigned long start_byte;
+	unsigned long prev_start_byte;
 	unsigned long working_bytes = total_out - buf_start;
 	unsigned long bytes;
 	char *kaddr;
@@ -1071,26 +1072,34 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
 		if (!bio->bi_iter.bi_size)
 			return 0;
 		bvec = bio_iter_iovec(bio, bio->bi_iter);
-
+		prev_start_byte = start_byte;
 		start_byte = page_offset(bvec.bv_page) - disk_start;
 
 		/*
-		 * make sure our new page is covered by this
-		 * working buffer
+		 * We need to make sure we're only adjusting
+		 * our offset into compression working buffer when
+		 * we're switching pages.  Otherwise we can incorrectly
+		 * keep copying when we were actually done.
 		 */
-		if (total_out <= start_byte)
-			return 1;
+		if (start_byte != prev_start_byte) {
+			/*
+			 * make sure our new page is covered by this
+			 * working buffer
+			 */
+			if (total_out <= start_byte)
+				return 1;
 
-		/*
-		 * the next page in the biovec might not be adjacent
-		 * to the last page, but it might still be found
-		 * inside this working buffer. bump our offset pointer
-		 */
-		if (total_out > start_byte &&
-		    current_buf_start < start_byte) {
-			buf_offset = start_byte - buf_start;
-			working_bytes = total_out - start_byte;
-			current_buf_start = buf_start + buf_offset;
+			/*
+			 * the next page in the biovec might not be adjacent
+			 * to the last page, but it might still be found
+			 * inside this working buffer. bump our offset pointer
+			 */
+			if (total_out > start_byte &&
+			    current_buf_start < start_byte) {
+				buf_offset = start_byte - buf_start;
+				working_bytes = total_out - start_byte;
+				current_buf_start = buf_start + buf_offset;
+			}
 		}
 	}
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 18004169552c..37a31b12bb0c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1800,7 +1800,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
 		if (!device->bdev)
 			continue;
-		bdi = blk_get_backing_dev_info(device->bdev);
+		bdi = device->bdev->bd_bdi;
 		if (bdi_congested(bdi, bdi_bits)) {
 			ret = 1;
 			break;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e97302f437a1..dcd2e798767e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2522,11 +2522,11 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		if (ref && ref->seq &&
 		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
 			spin_unlock(&locked_ref->lock);
-			btrfs_delayed_ref_unlock(locked_ref);
 			spin_lock(&delayed_refs->lock);
 			locked_ref->processing = 0;
 			delayed_refs->num_heads_ready++;
 			spin_unlock(&delayed_refs->lock);
+			btrfs_delayed_ref_unlock(locked_ref);
 			locked_ref = NULL;
 			cond_resched();
 			count++;
@@ -2572,7 +2572,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 					 */
 					if (must_insert_reserved)
 						locked_ref->must_insert_reserved = 1;
+					spin_lock(&delayed_refs->lock);
 					locked_ref->processing = 0;
+					delayed_refs->num_heads_ready++;
+					spin_unlock(&delayed_refs->lock);
 					btrfs_debug(fs_info,
 						    "run_delayed_extent_op returned %d",
 						    ret);
@@ -7384,7 +7387,8 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
 
 		spin_unlock(&cluster->refill_lock);
 
-		down_read(&used_bg->data_rwsem);
+		/* We should only have one-level nested. */
+		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
 
 		spin_lock(&cluster->refill_lock);
 		if (used_bg == cluster->block_group)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f2b281ad7af6..1e861a063721 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3835,10 +3835,7 @@ cache_acl:
 		break;
 	case S_IFDIR:
 		inode->i_fop = &btrfs_dir_file_operations;
-		if (root == fs_info->tree_root)
-			inode->i_op = &btrfs_dir_ro_inode_operations;
-		else
-			inode->i_op = &btrfs_dir_inode_operations;
+		inode->i_op = &btrfs_dir_inode_operations;
 		break;
 	case S_IFLNK:
 		inode->i_op = &btrfs_symlink_inode_operations;
@@ -4505,8 +4502,19 @@ search_again:
 		if (found_type > min_type) {
 			del_item = 1;
 		} else {
-			if (item_end < new_size)
+			if (item_end < new_size) {
+				/*
+				 * With NO_HOLES mode, for the following mapping
+				 *
+				 * [0-4k][hole][8k-12k]
+				 *
+				 * if truncating isize down to 6k, it ends up
+				 * isize being 8k.
+				 */
+				if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+					last_size = new_size;
 				break;
+			}
 			if (found_key.offset >= new_size)
 				del_item = 1;
 			else
@@ -5710,6 +5718,7 @@ static struct inode *new_simple_dir(struct super_block *s,
 
 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
 	inode->i_op = &btrfs_dir_ro_inode_operations;
+	inode->i_opflags &= ~IOP_XATTR;
 	inode->i_fop = &simple_dir_operations;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
 	inode->i_mtime = current_time(inode);
@@ -7059,7 +7068,7 @@ insert:
 	write_unlock(&em_tree->lock);
 out:
 
-	trace_btrfs_get_extent(root, em);
+	trace_btrfs_get_extent(root, inode, em);
 
 	btrfs_free_path(path);
 	if (trans) {
@@ -7215,7 +7224,6 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
 	struct extent_map *em = NULL;
 	int ret;
 
-	down_read(&BTRFS_I(inode)->dio_sem);
 	if (type != BTRFS_ORDERED_NOCOW) {
 		em = create_pinned_em(inode, start, len, orig_start,
 				      block_start, block_len, orig_block_len,
@@ -7234,7 +7242,6 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
 		em = ERR_PTR(ret);
 	}
  out:
-	up_read(&BTRFS_I(inode)->dio_sem);
 
 	return em;
 }
@@ -7623,11 +7630,18 @@ static void adjust_dio_outstanding_extents(struct inode *inode,
 	 * within our reservation, otherwise we need to adjust our inode
 	 * counter appropriately.
 	 */
-	if (dio_data->outstanding_extents) {
+	if (dio_data->outstanding_extents >= num_extents) {
 		dio_data->outstanding_extents -= num_extents;
 	} else {
+		/*
+		 * If dio write length has been split due to no large enough
+		 * contiguous space, we need to compensate our inode counter
+		 * appropriately.
+		 */
+		u64 num_needed = num_extents - dio_data->outstanding_extents;
+
 		spin_lock(&BTRFS_I(inode)->lock);
-		BTRFS_I(inode)->outstanding_extents += num_extents;
+		BTRFS_I(inode)->outstanding_extents += num_needed;
 		spin_unlock(&BTRFS_I(inode)->lock);
 	}
 }
@@ -8685,6 +8699,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 		dio_data.unsubmitted_oe_range_start = (u64)offset;
 		dio_data.unsubmitted_oe_range_end = (u64)offset;
 		current->journal_info = &dio_data;
+		down_read(&BTRFS_I(inode)->dio_sem);
 	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
 				     &BTRFS_I(inode)->runtime_flags)) {
 		inode_dio_end(inode);
@@ -8697,6 +8712,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 				   iter, btrfs_get_blocks_direct, NULL,
 				   btrfs_submit_direct, flags);
 	if (iov_iter_rw(iter) == WRITE) {
+		up_read(&BTRFS_I(inode)->dio_sem);
 		current->journal_info = NULL;
 		if (ret < 0 && ret != -EIOCBQUEUED) {
 			if (dio_data.reserve)
@@ -9205,6 +9221,7 @@ static int btrfs_truncate(struct inode *inode)
 			break;
 		}
 
+		btrfs_block_rsv_release(fs_info, rsv, -1);
 		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
 					      rsv, min_size, 0);
 		BUG_ON(ret);	/* shouldn't happen */
@@ -10572,8 +10589,6 @@ static const struct inode_operations btrfs_dir_inode_operations = {
 static const struct inode_operations btrfs_dir_ro_inode_operations = {
 	.lookup		= btrfs_lookup,
 	.permission	= btrfs_permission,
-	.get_acl	= btrfs_get_acl,
-	.set_acl	= btrfs_set_acl,
 	.update_time	= btrfs_update_time,
 };
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 33f967d30b2a..21e51b0ba188 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -5653,6 +5653,10 @@ long btrfs_ioctl(struct file *file, unsigned int
 #ifdef CONFIG_COMPAT
 long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
+	/*
+	 * These all access 32-bit values anyway so no further
+	 * handling is necessary.
+	 */
 	switch (cmd) {
 	case FS_IOC32_GETFLAGS:
 		cmd = FS_IOC_GETFLAGS;
@@ -5663,8 +5667,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case FS_IOC32_GETVERSION:
 		cmd = FS_IOC_GETVERSION;
 		break;
-	default:
-		return -ENOIOCTLCMD;
 	}
 
 	return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index f10bf5213ed8..eeffff84f280 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -37,6 +37,7 @@
  */
 #define LOG_INODE_ALL 0
 #define LOG_INODE_EXISTS 1
+#define LOG_OTHER_INODE 2
 
 /*
  * directory trouble cases
@@ -4641,7 +4642,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	if (S_ISDIR(inode->i_mode) ||
 	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 		       &BTRFS_I(inode)->runtime_flags) &&
-	     inode_only == LOG_INODE_EXISTS))
+	     inode_only >= LOG_INODE_EXISTS))
 		max_key.type = BTRFS_XATTR_ITEM_KEY;
 	else
 		max_key.type = (u8)-1;
@@ -4665,7 +4666,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
-	mutex_lock(&BTRFS_I(inode)->log_mutex);
+	if (inode_only == LOG_OTHER_INODE) {
+		inode_only = LOG_INODE_EXISTS;
+		mutex_lock_nested(&BTRFS_I(inode)->log_mutex,
+				  SINGLE_DEPTH_NESTING);
+	} else {
+		mutex_lock(&BTRFS_I(inode)->log_mutex);
+	}
 
 	/*
 	 * a brute force approach to making sure we get the most uptodate
@@ -4817,7 +4824,7 @@ again:
 				 * unpin it.
 				 */
 				err = btrfs_log_inode(trans, root, other_inode,
-						      LOG_INODE_EXISTS,
+						      LOG_OTHER_INODE,
 						      0, LLONG_MAX, ctx);
 				iput(other_inode);
 				if (err)
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 161342b73ce5..726f928238d0 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -352,7 +352,5 @@ skip:
 
 out:
 	btrfs_free_path(path);
-	if (ret)
-		btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret);
-	return 0;
+	return ret;
 }
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3c3c69c0eee4..b2e70073a10d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,7 +366,7 @@ static noinline void run_scheduled_bios(struct btrfs_device *device)
 	 */
 	blk_start_plug(&plug);
 
-	bdi = blk_get_backing_dev_info(device->bdev);
+	bdi = device->bdev->bd_bdi;
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
diff --git a/fs/buffer.c b/fs/buffer.c
index d21771fcf7d3..0e87401cf335 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1660,7 +1660,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
 			head = page_buffers(page);
 			bh = head;
 			do {
-				if (!buffer_mapped(bh))
+				if (!buffer_mapped(bh) || (bh->b_blocknr < block))
 					goto next;
 				if (bh->b_blocknr >= block + len)
 					break;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9cd0c0ea7cdb..e4b066cd912a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -502,9 +502,9 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
 		dout(" head snapc %p has %d dirty pages\n",
 		     snapc, ci->i_wrbuffer_ref_head);
 		if (truncate_size)
-			*truncate_size = capsnap->truncate_size;
+			*truncate_size = ci->i_truncate_size;
 		if (truncate_seq)
-			*truncate_seq = capsnap->truncate_seq;
+			*truncate_seq = ci->i_truncate_seq;
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	return snapc;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index baea866a6751..94fd76d04683 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2591,8 +2591,13 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
 			add_wait_queue(&ci->i_cap_wq, &wait);
 
 			while (!try_get_cap_refs(ci, need, want, endoff,
-						 true, &_got, &err))
+						 true, &_got, &err)) {
+				if (signal_pending(current)) {
+					ret = -ERESTARTSYS;
+					break;
+				}
 				wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+			}
 
 			remove_wait_queue(&ci->i_cap_wq, &wait);
 
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index d7a93696663b..8ab1fdf0bd49 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1230,7 +1230,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
 		struct ceph_mds_client *mdsc =
 			ceph_sb_to_client(dir->i_sb)->mdsc;
 		struct ceph_mds_request *req;
-		int op, mask, err;
+		int op, err;
+		u32 mask;
 
 		if (flags & LOOKUP_RCU)
 			return -ECHILD;
@@ -1245,7 +1246,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
 			mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
 			if (ceph_security_xattr_wanted(dir))
 				mask |= CEPH_CAP_XATTR_SHARED;
-			req->r_args.getattr.mask = mask;
+			req->r_args.getattr.mask = cpu_to_le32(mask);
 
 			err = ceph_mdsc_do_request(mdsc, NULL, req);
 			switch (err) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 398e5328b309..5e659d054b40 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -305,7 +305,8 @@ static int frag_tree_split_cmp(const void *l, const void *r)
 {
 	struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l;
 	struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r;
-	return ceph_frag_compare(ls->frag, rs->frag);
+	return ceph_frag_compare(le32_to_cpu(ls->frag),
+				 le32_to_cpu(rs->frag));
 }
 
 static bool is_frag_child(u32 f, struct ceph_inode_frag *frag)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4f49253387a0..c9d2e553a6c4 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -288,12 +288,13 @@ static int parse_reply_info_extra(void **p, void *end,
 				  struct ceph_mds_reply_info_parsed *info,
 				  u64 features)
 {
-	if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
+	u32 op = le32_to_cpu(info->head->op);
+
+	if (op == CEPH_MDS_OP_GETFILELOCK)
 		return parse_reply_info_filelock(p, end, info, features);
-	else if (info->head->op == CEPH_MDS_OP_READDIR ||
-		 info->head->op == CEPH_MDS_OP_LSSNAP)
+	else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP)
 		return parse_reply_info_dir(p, end, info, features);
-	else if (info->head->op == CEPH_MDS_OP_CREATE)
+	else if (op == CEPH_MDS_OP_CREATE)
 		return parse_reply_info_create(p, end, info, features);
 	else
 		return -EIO;
@@ -2106,6 +2107,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
 			dout("do_request mdsmap err %d\n", err);
 			goto finish;
 		}
+		if (mdsc->mdsmap->m_epoch == 0) {
+			dout("do_request no mdsmap, waiting for map\n");
+			list_add(&req->r_wait, &mdsc->waiting_for_map);
+			goto finish;
+		}
 		if (!(mdsc->fsc->mount_options->flags &
 		      CEPH_MOUNT_OPT_MOUNTWAIT) &&
 		    !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index e7b478b49985..034f00f21390 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -9,8 +9,6 @@ config CIFS
 	select CRYPTO_ARC4
 	select CRYPTO_ECB
 	select CRYPTO_DES
-	select CRYPTO_SHA256
-	select CRYPTO_CMAC
 	help
 	  This is the client VFS module for the Common Internet File System
 	  (CIFS) protocol which is the successor to the Server Message Block
@@ -169,11 +167,15 @@ config CIFS_NFSD_EXPORT
 
 config CIFS_SMB2
 	bool "SMB2 and SMB3 network file system support"
-	depends on CIFS && INET
-	select NLS
+	depends on CIFS
 	select KEYS
 	select FSCACHE
 	select DNS_RESOLVER
+	select CRYPTO_AES
+	select CRYPTO_SHA256
+	select CRYPTO_CMAC
+	select CRYPTO_AEAD2
+	select CRYPTO_CCM
 
 	help
 	  This enables support for the Server Message Block version 2
@@ -194,7 +196,7 @@ config CIFS_SMB2
 
 config CIFS_SMB311
 	bool "SMB3.1.1 network file system support (Experimental)"
-	depends on CIFS_SMB2 && INET
+	depends on CIFS_SMB2
 
 	help
 	  This enables experimental support for the newest, SMB3.1.1, dialect.
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index ec9dbbcca3b9..9156be545b0f 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -245,7 +245,8 @@ compose_mount_options_err:
  * @fullpath:		full path in UNC format
  * @ref:		server's referral
  */
-static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
+static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
+		struct cifs_sb_info *cifs_sb,
 		const char *fullpath, const struct dfs_info3_param *ref)
 {
 	struct vfsmount *mnt;
@@ -259,7 +260,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct cifs_sb_info *cifs_sb,
 	if (IS_ERR(mountdata))
 		return (struct vfsmount *)mountdata;
 
-	mnt = vfs_kern_mount(&cifs_fs_type, 0, devname, mountdata);
+	mnt = vfs_submount(mntpt, &cifs_fs_type, devname, mountdata);
 	kfree(mountdata);
 	kfree(devname);
 	return mnt;
@@ -334,7 +335,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
 			mnt = ERR_PTR(-EINVAL);
 			break;
 		}
-		mnt = cifs_dfs_do_refmount(cifs_sb,
+		mnt = cifs_dfs_do_refmount(mntpt, cifs_sb,
 				full_path, referrals + i);
 		cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n",
 			 __func__, referrals[i].node_name, mnt);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 66bd7fa9b7a6..058ac9b36f04 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -34,6 +34,7 @@
 #include <linux/random.h>
 #include <linux/highmem.h>
 #include <crypto/skcipher.h>
+#include <crypto/aead.h>
 
 static int
 cifs_crypto_shash_md5_allocate(struct TCP_Server_Info *server)
@@ -75,24 +76,20 @@ int __cifs_calc_signature(struct smb_rqst *rqst,
 	struct kvec *iov = rqst->rq_iov;
 	int n_vec = rqst->rq_nvec;
 
-	for (i = 0; i < n_vec; i++) {
+	if (n_vec < 2 || iov[0].iov_len != 4)
+		return -EIO;
+
+	for (i = 1; i < n_vec; i++) {
 		if (iov[i].iov_len == 0)
 			continue;
 		if (iov[i].iov_base == NULL) {
 			cifs_dbg(VFS, "null iovec entry\n");
 			return -EIO;
 		}
-		/* The first entry includes a length field (which does not get
-		   signed that occupies the first 4 bytes before the header */
-		if (i == 0) {
-			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
-				break; /* nothing to sign or corrupt header */
-			rc = crypto_shash_update(shash,
-				iov[i].iov_base + 4, iov[i].iov_len - 4);
-		} else {
-			rc = crypto_shash_update(shash,
-				iov[i].iov_base, iov[i].iov_len);
-		}
+		if (i == 1 && iov[1].iov_len <= 4)
+			break; /* nothing to sign or corrupt header */
+		rc = crypto_shash_update(shash,
+					 iov[i].iov_base, iov[i].iov_len);
 		if (rc) {
 			cifs_dbg(VFS, "%s: Could not update with payload\n",
 				 __func__);
@@ -168,6 +165,10 @@ int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server,
 	char smb_signature[20];
 	struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
 
+	if (rqst->rq_iov[0].iov_len != 4 ||
+	    rqst->rq_iov[0].iov_base + 4 != rqst->rq_iov[1].iov_base)
+		return -EIO;
+
 	if ((cifs_pdu == NULL) || (server == NULL))
 		return -EINVAL;
 
@@ -209,12 +210,14 @@ int cifs_sign_smbv(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 		  __u32 *pexpected_response_sequence_number)
 {
-	struct kvec iov;
+	struct kvec iov[2];
 
-	iov.iov_base = cifs_pdu;
-	iov.iov_len = be32_to_cpu(cifs_pdu->smb_buf_length) + 4;
+	iov[0].iov_base = cifs_pdu;
+	iov[0].iov_len = 4;
+	iov[1].iov_base = (char *)cifs_pdu + 4;
+	iov[1].iov_len = be32_to_cpu(cifs_pdu->smb_buf_length);
 
-	return cifs_sign_smbv(&iov, 1, server,
+	return cifs_sign_smbv(iov, 2, server,
 			      pexpected_response_sequence_number);
 }
 
@@ -227,6 +230,10 @@ int cifs_verify_signature(struct smb_rqst *rqst,
 	char what_we_think_sig_should_be[20];
 	struct smb_hdr *cifs_pdu = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
 
+	if (rqst->rq_iov[0].iov_len != 4 ||
+	    rqst->rq_iov[0].iov_base + 4 != rqst->rq_iov[1].iov_base)
+		return -EIO;
+
 	if (cifs_pdu == NULL || server == NULL)
 		return -EINVAL;
 
@@ -868,7 +875,7 @@ out:
 }
 
 void
-cifs_crypto_shash_release(struct TCP_Server_Info *server)
+cifs_crypto_secmech_release(struct TCP_Server_Info *server)
 {
 	if (server->secmech.cmacaes) {
 		crypto_free_shash(server->secmech.cmacaes);
@@ -890,6 +897,16 @@ cifs_crypto_shash_release(struct TCP_Server_Info *server)
 		server->secmech.hmacmd5 = NULL;
 	}
 
+	if (server->secmech.ccmaesencrypt) {
+		crypto_free_aead(server->secmech.ccmaesencrypt);
+		server->secmech.ccmaesencrypt = NULL;
+	}
+
+	if (server->secmech.ccmaesdecrypt) {
+		crypto_free_aead(server->secmech.ccmaesdecrypt);
+		server->secmech.ccmaesdecrypt = NULL;
+	}
+
 	kfree(server->secmech.sdesccmacaes);
 	server->secmech.sdesccmacaes = NULL;
 	kfree(server->secmech.sdeschmacsha256);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 70f4e65fced2..15e1db8738ae 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1365,5 +1365,19 @@ MODULE_DESCRIPTION
     ("VFS to access servers complying with the SNIA CIFS Specification "
      "e.g. Samba and Windows");
 MODULE_VERSION(CIFS_VERSION);
+MODULE_SOFTDEP("pre: arc4");
+MODULE_SOFTDEP("pre: des");
+MODULE_SOFTDEP("pre: ecb");
+MODULE_SOFTDEP("pre: hmac");
+MODULE_SOFTDEP("pre: md4");
+MODULE_SOFTDEP("pre: md5");
+MODULE_SOFTDEP("pre: nls");
+#ifdef CONFIG_CIFS_SMB2
+MODULE_SOFTDEP("pre: aes");
+MODULE_SOFTDEP("pre: cmac");
+MODULE_SOFTDEP("pre: sha256");
+MODULE_SOFTDEP("pre: aead2");
+MODULE_SOFTDEP("pre: ccm");
+#endif /* CONFIG_CIFS_SMB2 */
 module_init(init_cifs)
 module_exit(exit_cifs)
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 7ea8a3393936..1a90bb3e2986 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -136,6 +136,8 @@ struct cifs_secmech {
 	struct sdesc *sdescmd5; /* ctxt to generate cifs/smb signature */
 	struct sdesc *sdeschmacsha256;  /* ctxt to generate smb2 signature */
 	struct sdesc *sdesccmacaes;  /* ctxt to generate smb3 signature */
+	struct crypto_aead *ccmaesencrypt; /* smb3 encryption aead */
+	struct crypto_aead *ccmaesdecrypt; /* smb3 decryption aead */
 };
 
 /* per smb session structure/fields */
@@ -208,7 +210,7 @@ struct cifsInodeInfo;
 struct cifs_open_parms;
 
 struct smb_version_operations {
-	int (*send_cancel)(struct TCP_Server_Info *, void *,
+	int (*send_cancel)(struct TCP_Server_Info *, struct smb_rqst *,
 			   struct mid_q_entry *);
 	bool (*compare_fids)(struct cifsFileInfo *, struct cifsFileInfo *);
 	/* setup request: allocate mid, sign message */
@@ -433,6 +435,14 @@ struct smb_version_operations {
 	bool (*dir_needs_close)(struct cifsFileInfo *);
 	long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
 			  loff_t);
+	/* init transform request - used for encryption for now */
+	int (*init_transform_rq)(struct TCP_Server_Info *, struct smb_rqst *,
+				 struct smb_rqst *);
+	/* free transform request */
+	void (*free_transform_rq)(struct smb_rqst *);
+	int (*is_transform_hdr)(void *buf);
+	int (*receive_transform)(struct TCP_Server_Info *,
+				 struct mid_q_entry **);
 };
 
 struct smb_version_values {
@@ -1119,7 +1129,10 @@ struct cifs_readdata {
 	int (*read_into_pages)(struct TCP_Server_Info *server,
 				struct cifs_readdata *rdata,
 				unsigned int len);
-	struct kvec			iov;
+	int (*copy_into_pages)(struct TCP_Server_Info *server,
+				struct cifs_readdata *rdata,
+				struct iov_iter *iter);
+	struct kvec			iov[2];
 	unsigned int			pagesz;
 	unsigned int			tailsz;
 	unsigned int			credits;
@@ -1302,6 +1315,13 @@ typedef int (mid_receive_t)(struct TCP_Server_Info *server,
  */
 typedef void (mid_callback_t)(struct mid_q_entry *mid);
 
+/*
+ * This is the protopyte for mid handle function. This is called once the mid
+ * has been recognized after decryption of the message.
+ */
+typedef int (mid_handle_t)(struct TCP_Server_Info *server,
+			    struct mid_q_entry *mid);
+
 /* one of these for every pending CIFS request to the server */
 struct mid_q_entry {
 	struct list_head qhead;	/* mids waiting on reply from this server */
@@ -1316,6 +1336,7 @@ struct mid_q_entry {
 #endif
 	mid_receive_t *receive; /* call receive callback */
 	mid_callback_t *callback; /* call completion callback */
+	mid_handle_t *handle; /* call handle mid callback */
 	void *callback_data;	  /* general purpose pointer for callback */
 	void *resp_buf;		/* pointer to received SMB header */
 	int mid_state;	/* wish this were enum but can not pass to wait_event */
@@ -1323,6 +1344,7 @@ struct mid_q_entry {
 	bool large_buf:1;	/* if valid response, is pointer to large buf */
 	bool multiRsp:1;	/* multiple trans2 responses for one request  */
 	bool multiEnd:1;	/* both received */
+	bool decrypted:1;	/* decrypted entry */
 };
 
 /*	Make code in transport.c a little cleaner by moving
@@ -1475,7 +1497,9 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
 #define   CIFS_OBREAK_OP   0x0100    /* oplock break request */
 #define   CIFS_NEG_OP      0x0200    /* negotiate request */
 #define   CIFS_OP_MASK     0x0380    /* mask request type */
+
 #define   CIFS_HAS_CREDITS 0x0400    /* already has credits */
+#define   CIFS_TRANSFORM_REQ 0x0800    /* transform request before sending */
 
 /* Security Flags: indicate type of session setup needed */
 #define   CIFSSEC_MAY_SIGN	0x00001
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index c7b3c841e660..406d2c10ba78 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -75,10 +75,16 @@ extern struct mid_q_entry *AllocMidQEntry(const struct smb_hdr *smb_buffer,
 extern void DeleteMidQEntry(struct mid_q_entry *midEntry);
 extern void cifs_delete_mid(struct mid_q_entry *mid);
 extern void cifs_wake_up_task(struct mid_q_entry *mid);
+extern int cifs_handle_standard(struct TCP_Server_Info *server,
+				struct mid_q_entry *mid);
+extern int cifs_discard_remaining_data(struct TCP_Server_Info *server);
 extern int cifs_call_async(struct TCP_Server_Info *server,
 			struct smb_rqst *rqst,
 			mid_receive_t *receive, mid_callback_t *callback,
-			void *cbdata, const int flags);
+			mid_handle_t *handle, void *cbdata, const int flags);
+extern int cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
+			  struct smb_rqst *rqst, int *resp_buf_type,
+			  const int flags, struct kvec *resp_iov);
 extern int SendReceive(const unsigned int /* xid */ , struct cifs_ses *,
 			struct smb_hdr * /* input */ ,
 			struct smb_hdr * /* out */ ,
@@ -96,7 +102,8 @@ extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
 				 unsigned int *credits);
 extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
 			struct kvec *, int /* nvec to send */,
-			int * /* type of buf returned */ , const int flags);
+			int * /* type of buf returned */, const int flags,
+			struct kvec * /* resp vec */);
 extern int SendReceiveBlockingLock(const unsigned int xid,
 			struct cifs_tcon *ptcon,
 			struct smb_hdr *in_buf ,
@@ -441,7 +448,7 @@ extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *,
 			const struct nls_table *);
 extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
 extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
-extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
+extern void cifs_crypto_secmech_release(struct TCP_Server_Info *server);
 extern int calc_seckey(struct cifs_ses *);
 extern int generate_smb30signingkey(struct cifs_ses *);
 extern int generate_smb311signingkey(struct cifs_ses *);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index b47261858e6d..f5099fb8a22f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -673,6 +673,7 @@ CIFSSMBTDis(const unsigned int xid, struct cifs_tcon *tcon)
 		return rc;
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *)smb_buffer, 0);
+	cifs_small_buf_release(smb_buffer);
 	if (rc)
 		cifs_dbg(FYI, "Tree disconnect failed %d\n", rc);
 
@@ -707,9 +708,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
 {
 	ECHO_REQ *smb;
 	int rc = 0;
-	struct kvec iov;
-	struct smb_rqst rqst = { .rq_iov = &iov,
-				 .rq_nvec = 1 };
+	struct kvec iov[2];
+	struct smb_rqst rqst = { .rq_iov = iov,
+				 .rq_nvec = 2 };
 
 	cifs_dbg(FYI, "In echo request\n");
 
@@ -724,10 +725,13 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
 	put_bcc(1, &smb->hdr);
 	smb->Data[0] = 'a';
 	inc_rfc1001_len(smb, 3);
-	iov.iov_base = smb;
-	iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
 
-	rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback,
+	iov[0].iov_len = 4;
+	iov[0].iov_base = smb;
+	iov[1].iov_len = get_rfc1002_length(smb);
+	iov[1].iov_base = (char *)smb + 4;
+
+	rc = cifs_call_async(server, &rqst, NULL, cifs_echo_callback, NULL,
 			     server, CIFS_ASYNC_OP | CIFS_ECHO_OP);
 	if (rc)
 		cifs_dbg(FYI, "Echo request failed: %d\n", rc);
@@ -772,6 +776,7 @@ CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses)
 
 	pSMB->AndXCommand = 0xFF;
 	rc = SendReceiveNoRsp(xid, ses, (char *) pSMB, 0);
+	cifs_small_buf_release(pSMB);
 session_already_dead:
 	mutex_unlock(&ses->session_mutex);
 
@@ -1394,8 +1399,8 @@ openRetry:
  * Discard any remaining data in the current SMB. To do this, we borrow the
  * current bigbuf.
  */
-static int
-discard_remaining_data(struct TCP_Server_Info *server)
+int
+cifs_discard_remaining_data(struct TCP_Server_Info *server)
 {
 	unsigned int rfclen = get_rfc1002_length(server->smallbuf);
 	int remaining = rfclen + 4 - server->total_read;
@@ -1421,7 +1426,7 @@ cifs_readv_discard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	int length;
 	struct cifs_readdata *rdata = mid->callback_data;
 
-	length = discard_remaining_data(server);
+	length = cifs_discard_remaining_data(server);
 	dequeue_mid(mid, rdata->result);
 	return length;
 }
@@ -1454,7 +1459,7 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 
 	if (server->ops->is_status_pending &&
 	    server->ops->is_status_pending(buf, server, 0)) {
-		discard_remaining_data(server);
+		cifs_discard_remaining_data(server);
 		return -1;
 	}
 
@@ -1507,10 +1512,12 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	}
 
 	/* set up first iov for signature check */
-	rdata->iov.iov_base = buf;
-	rdata->iov.iov_len = server->total_read;
-	cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
-		 rdata->iov.iov_base, rdata->iov.iov_len);
+	rdata->iov[0].iov_base = buf;
+	rdata->iov[0].iov_len = 4;
+	rdata->iov[1].iov_base = buf + 4;
+	rdata->iov[1].iov_len = server->total_read - 4;
+	cifs_dbg(FYI, "0: iov_base=%p iov_len=%u\n",
+		 rdata->iov[0].iov_base, server->total_read);
 
 	/* how much data is in the response? */
 	data_len = server->ops->read_data_length(buf);
@@ -1543,8 +1550,8 @@ cifs_readv_callback(struct mid_q_entry *mid)
 	struct cifs_readdata *rdata = mid->callback_data;
 	struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
 	struct TCP_Server_Info *server = tcon->ses->server;
-	struct smb_rqst rqst = { .rq_iov = &rdata->iov,
-				 .rq_nvec = 1,
+	struct smb_rqst rqst = { .rq_iov = rdata->iov,
+				 .rq_nvec = 2,
 				 .rq_pages = rdata->pages,
 				 .rq_npages = rdata->nr_pages,
 				 .rq_pagesz = rdata->pagesz,
@@ -1599,8 +1606,8 @@ cifs_async_readv(struct cifs_readdata *rdata)
 	READ_REQ *smb = NULL;
 	int wct;
 	struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
-	struct smb_rqst rqst = { .rq_iov = &rdata->iov,
-				 .rq_nvec = 1 };
+	struct smb_rqst rqst = { .rq_iov = rdata->iov,
+				 .rq_nvec = 2 };
 
 	cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
 		 __func__, rdata->offset, rdata->bytes);
@@ -1640,12 +1647,14 @@ cifs_async_readv(struct cifs_readdata *rdata)
 	}
 
 	/* 4 for RFC1001 length + 1 for BCC */
-	rdata->iov.iov_base = smb;
-	rdata->iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4;
+	rdata->iov[0].iov_base = smb;
+	rdata->iov[0].iov_len = 4;
+	rdata->iov[1].iov_base = (char *)smb + 4;
+	rdata->iov[1].iov_len = get_rfc1002_length(smb);
 
 	kref_get(&rdata->refcount);
 	rc = cifs_call_async(tcon->ses->server, &rqst, cifs_readv_receive,
-			     cifs_readv_callback, rdata, 0);
+			     cifs_readv_callback, NULL, rdata, 0);
 
 	if (rc == 0)
 		cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
@@ -1667,6 +1676,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
 	int wct;
 	int resp_buf_type = 0;
 	struct kvec iov[1];
+	struct kvec rsp_iov;
 	__u32 pid = io_parms->pid;
 	__u16 netfid = io_parms->netfid;
 	__u64 offset = io_parms->offset;
@@ -1716,10 +1726,11 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
 
 	iov[0].iov_base = (char *)pSMB;
 	iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
-	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
-			 &resp_buf_type, CIFS_LOG_ERROR);
+	rc = SendReceive2(xid, tcon->ses, iov, 1, &resp_buf_type,
+			  CIFS_LOG_ERROR, &rsp_iov);
+	cifs_small_buf_release(pSMB);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_reads);
-	pSMBr = (READ_RSP *)iov[0].iov_base;
+	pSMBr = (READ_RSP *)rsp_iov.iov_base;
 	if (rc) {
 		cifs_dbg(VFS, "Send error in read = %d\n", rc);
 	} else {
@@ -1747,12 +1758,11 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
 		}
 	}
 
-/*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
 	if (*buf) {
-		free_rsp_buf(resp_buf_type, iov[0].iov_base);
+		free_rsp_buf(resp_buf_type, rsp_iov.iov_base);
 	} else if (resp_buf_type != CIFS_NO_BUFFER) {
 		/* return buffer to caller to free */
-		*buf = iov[0].iov_base;
+		*buf = rsp_iov.iov_base;
 		if (resp_buf_type == CIFS_SMALL_BUFFER)
 			*pbuf_type = CIFS_SMALL_BUFFER;
 		else if (resp_buf_type == CIFS_LARGE_BUFFER)
@@ -2093,7 +2103,7 @@ cifs_async_writev(struct cifs_writedata *wdata,
 	WRITE_REQ *smb = NULL;
 	int wct;
 	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
-	struct kvec iov;
+	struct kvec iov[2];
 	struct smb_rqst rqst = { };
 
 	if (tcon->ses->capabilities & CAP_LARGE_FILES) {
@@ -2126,11 +2136,13 @@ cifs_async_writev(struct cifs_writedata *wdata,
 	    cpu_to_le16(offsetof(struct smb_com_write_req, Data) - 4);
 
 	/* 4 for RFC1001 length + 1 for BCC */
-	iov.iov_len = be32_to_cpu(smb->hdr.smb_buf_length) + 4 + 1;
-	iov.iov_base = smb;
+	iov[0].iov_len = 4;
+	iov[0].iov_base = smb;
+	iov[1].iov_len = get_rfc1002_length(smb) + 1;
+	iov[1].iov_base = (char *)smb + 4;
 
-	rqst.rq_iov = &iov;
-	rqst.rq_nvec = 1;
+	rqst.rq_iov = iov;
+	rqst.rq_nvec = 2;
 	rqst.rq_pages = wdata->pages;
 	rqst.rq_npages = wdata->nr_pages;
 	rqst.rq_pagesz = wdata->pagesz;
@@ -2151,12 +2163,12 @@ cifs_async_writev(struct cifs_writedata *wdata,
 				(struct smb_com_writex_req *)smb;
 		inc_rfc1001_len(&smbw->hdr, wdata->bytes + 5);
 		put_bcc(wdata->bytes + 5, &smbw->hdr);
-		iov.iov_len += 4; /* pad bigger by four bytes */
+		iov[1].iov_len += 4; /* pad bigger by four bytes */
 	}
 
 	kref_get(&wdata->refcount);
 	rc = cifs_call_async(tcon->ses->server, &rqst, NULL,
-				cifs_writev_callback, wdata, 0);
+				cifs_writev_callback, NULL, wdata, 0);
 
 	if (rc == 0)
 		cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
@@ -2182,6 +2194,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
 	__u64 offset = io_parms->offset;
 	struct cifs_tcon *tcon = io_parms->tcon;
 	unsigned int count = io_parms->length;
+	struct kvec rsp_iov;
 
 	*nbytes = 0;
 
@@ -2240,8 +2253,9 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
 	else /* wct == 12 pad bigger by four bytes */
 		iov[0].iov_len = smb_hdr_len + 8;
 
-
-	rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 0);
+	rc = SendReceive2(xid, tcon->ses, iov, n_vec + 1, &resp_buf_type, 0,
+			  &rsp_iov);
+	cifs_small_buf_release(pSMB);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_writes);
 	if (rc) {
 		cifs_dbg(FYI, "Send error Write2 = %d\n", rc);
@@ -2249,7 +2263,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
 		/* presumably this can not happen, but best to be safe */
 		rc = -EIO;
 	} else {
-		WRITE_RSP *pSMBr = (WRITE_RSP *)iov[0].iov_base;
+		WRITE_RSP *pSMBr = (WRITE_RSP *)rsp_iov.iov_base;
 		*nbytes = le16_to_cpu(pSMBr->CountHigh);
 		*nbytes = (*nbytes) << 16;
 		*nbytes += le16_to_cpu(pSMBr->Count);
@@ -2263,8 +2277,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
 			*nbytes &= 0xFFFF;
 	}
 
-/*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
-	free_rsp_buf(resp_buf_type, iov[0].iov_base);
+	free_rsp_buf(resp_buf_type, rsp_iov.iov_base);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 		since file handle passed in no longer valid */
@@ -2279,6 +2292,7 @@ int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	LOCK_REQ *pSMB = NULL;
 	struct kvec iov[2];
+	struct kvec rsp_iov;
 	int resp_buf_type;
 	__u16 count;
 
@@ -2307,7 +2321,9 @@ int cifs_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	iov[1].iov_len = (num_unlock + num_lock) * sizeof(LOCKING_ANDX_RANGE);
 
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
-	rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
+	rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP,
+			  &rsp_iov);
+	cifs_small_buf_release(pSMB);
 	if (rc)
 		cifs_dbg(FYI, "Send error in cifs_lockv = %d\n", rc);
 
@@ -2368,14 +2384,12 @@ CIFSSMBLock(const unsigned int xid, struct cifs_tcon *tcon,
 	inc_rfc1001_len(pSMB, count);
 	pSMB->ByteCount = cpu_to_le16(count);
 
-	if (waitFlag) {
+	if (waitFlag)
 		rc = SendReceiveBlockingLock(xid, tcon, (struct smb_hdr *) pSMB,
 			(struct smb_hdr *) pSMB, &bytes_returned);
-		cifs_small_buf_release(pSMB);
-	} else {
+	else
 		rc = SendReceiveNoRsp(xid, tcon->ses, (char *)pSMB, flags);
-		/* SMB buffer freed by function above */
-	}
+	cifs_small_buf_release(pSMB);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
 	if (rc)
 		cifs_dbg(FYI, "Send error in Lock = %d\n", rc);
@@ -2401,6 +2415,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
 	int resp_buf_type = 0;
 	__u16 params, param_offset, offset, byte_count, count;
 	struct kvec iov[1];
+	struct kvec rsp_iov;
 
 	cifs_dbg(FYI, "Posix Lock\n");
 
@@ -2462,11 +2477,10 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
 		iov[0].iov_base = (char *)pSMB;
 		iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
 		rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovecs */,
-				&resp_buf_type, timeout);
-		pSMB = NULL; /* request buf already freed by SendReceive2. Do
-				not try to free it twice below on exit */
-		pSMBr = (struct smb_com_transaction2_sfi_rsp *)iov[0].iov_base;
+				&resp_buf_type, timeout, &rsp_iov);
+		pSMBr = (struct smb_com_transaction2_sfi_rsp *)rsp_iov.iov_base;
 	}
+	cifs_small_buf_release(pSMB);
 
 	if (rc) {
 		cifs_dbg(FYI, "Send error in Posix Lock = %d\n", rc);
@@ -2506,10 +2520,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
 	}
 
 plk_err_exit:
-	if (pSMB)
-		cifs_small_buf_release(pSMB);
-
-	free_rsp_buf(resp_buf_type, iov[0].iov_base);
+	free_rsp_buf(resp_buf_type, rsp_iov.iov_base);
 
 	/* Note: On -EAGAIN error only caller can retry on handle based calls
 	   since file handle passed in no longer valid */
@@ -2536,6 +2547,7 @@ CIFSSMBClose(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
 	pSMB->LastWriteTime = 0xFFFFFFFF;
 	pSMB->ByteCount = 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	cifs_small_buf_release(pSMB);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_closes);
 	if (rc) {
 		if (rc != -EINTR) {
@@ -2565,6 +2577,7 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
 	pSMB->FileID = (__u16) smb_file_id;
 	pSMB->ByteCount = 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	cifs_small_buf_release(pSMB);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_flushes);
 	if (rc)
 		cifs_dbg(VFS, "Send error in Flush = %d\n", rc);
@@ -3820,6 +3833,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
 	int buf_type = 0;
 	QUERY_SEC_DESC_REQ *pSMB;
 	struct kvec iov[1];
+	struct kvec rsp_iov;
 
 	cifs_dbg(FYI, "GetCifsACL\n");
 
@@ -3843,7 +3857,8 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
 	iov[0].iov_len = be32_to_cpu(pSMB->hdr.smb_buf_length) + 4;
 
 	rc = SendReceive2(xid, tcon->ses, iov, 1 /* num iovec */, &buf_type,
-			 0);
+			  0, &rsp_iov);
+	cifs_small_buf_release(pSMB);
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_acl_get);
 	if (rc) {
 		cifs_dbg(FYI, "Send error in QuerySecDesc = %d\n", rc);
@@ -3855,11 +3870,11 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
 		char *pdata;
 
 /* validate_nttransact */
-		rc = validate_ntransact(iov[0].iov_base, (char **)&parm,
+		rc = validate_ntransact(rsp_iov.iov_base, (char **)&parm,
 					&pdata, &parm_len, pbuflen);
 		if (rc)
 			goto qsec_out;
-		pSMBr = (struct smb_com_ntransact_rsp *)iov[0].iov_base;
+		pSMBr = (struct smb_com_ntransact_rsp *)rsp_iov.iov_base;
 
 		cifs_dbg(FYI, "smb %p parm %p data %p\n",
 			 pSMBr, parm, *acl_inf);
@@ -3896,8 +3911,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
 		}
 	}
 qsec_out:
-	free_rsp_buf(buf_type, iov[0].iov_base);
-/*	cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
+	free_rsp_buf(buf_type, rsp_iov.iov_base);
 	return rc;
 }
 
@@ -4666,6 +4680,7 @@ CIFSFindClose(const unsigned int xid, struct cifs_tcon *tcon,
 	pSMB->FileID = searchHandle;
 	pSMB->ByteCount = 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	cifs_small_buf_release(pSMB);
 	if (rc)
 		cifs_dbg(VFS, "Send error in FindClose = %d\n", rc);
 
@@ -5687,6 +5702,7 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
 	inc_rfc1001_len(pSMB, byte_count);
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	cifs_small_buf_release(pSMB);
 	if (rc) {
 		cifs_dbg(FYI, "Send error in SetFileInfo (SetFileSize) = %d\n",
 			 rc);
@@ -5758,6 +5774,7 @@ CIFSSMBSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	memcpy(data_offset, data, sizeof(FILE_BASIC_INFO));
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	cifs_small_buf_release(pSMB);
 	if (rc)
 		cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n",
 			 rc);
@@ -5818,6 +5835,7 @@ CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon,
 	pSMB->ByteCount = cpu_to_le16(byte_count);
 	*data_offset = delete_file ? 1 : 0;
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	cifs_small_buf_release(pSMB);
 	if (rc)
 		cifs_dbg(FYI, "Send error in SetFileDisposition = %d\n", rc);
 
@@ -6057,6 +6075,7 @@ CIFSSMBUnixSetFileInfo(const unsigned int xid, struct cifs_tcon *tcon,
 	cifs_fill_unix_set_info((FILE_UNIX_BASIC_INFO *)data_offset, args);
 
 	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) pSMB, 0);
+	cifs_small_buf_release(pSMB);
 	if (rc)
 		cifs_dbg(FYI, "Send error in Set Time (SetFileInfo) = %d\n",
 			 rc);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 35ae49ed1f76..777ad9f4fc3c 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -787,6 +787,15 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 
 	dump_smb(buf, server->total_read);
 
+	return cifs_handle_standard(server, mid);
+}
+
+int
+cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+	char *buf = server->large_buf ? server->bigbuf : server->smallbuf;
+	int length;
+
 	/*
 	 * We know that we received enough to get to the MID as we
 	 * checked the pdu_length earlier. Now check to see
@@ -872,12 +881,19 @@ cifs_demultiplex_thread(void *p)
 			continue;
 		server->total_read += length;
 
-		mid_entry = server->ops->find_mid(server, buf);
+		if (server->ops->is_transform_hdr &&
+		    server->ops->receive_transform &&
+		    server->ops->is_transform_hdr(buf)) {
+			length = server->ops->receive_transform(server,
+								&mid_entry);
+		} else {
+			mid_entry = server->ops->find_mid(server, buf);
 
-		if (!mid_entry || !mid_entry->receive)
-			length = standard_receive3(server, mid_entry);
-		else
-			length = mid_entry->receive(server, mid_entry);
+			if (!mid_entry || !mid_entry->receive)
+				length = standard_receive3(server, mid_entry);
+			else
+				length = mid_entry->receive(server, mid_entry);
+		}
 
 		if (length < 0)
 			continue;
@@ -2154,7 +2170,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
 	server->tcpStatus = CifsExiting;
 	spin_unlock(&GlobalMid_Lock);
 
-	cifs_crypto_shash_release(server);
+	cifs_crypto_secmech_release(server);
 	cifs_fscache_release_client_cookie(server);
 
 	kfree(server->session_key.response);
@@ -2273,7 +2289,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	return tcp_ses;
 
 out_err_crypto_release:
-	cifs_crypto_shash_release(tcp_ses);
+	cifs_crypto_secmech_release(tcp_ses);
 
 	put_net(cifs_net_ns(tcp_ses));
 
@@ -2614,12 +2630,18 @@ get_ses_fail:
 	return ERR_PTR(rc);
 }
 
-static int match_tcon(struct cifs_tcon *tcon, const char *unc)
+static int match_tcon(struct cifs_tcon *tcon, struct smb_vol *volume_info)
 {
 	if (tcon->tidStatus == CifsExiting)
 		return 0;
-	if (strncmp(tcon->treeName, unc, MAX_TREE_SIZE))
+	if (strncmp(tcon->treeName, volume_info->UNC, MAX_TREE_SIZE))
 		return 0;
+	if (tcon->seal != volume_info->seal)
+		return 0;
+#ifdef CONFIG_CIFS_SMB2
+	if (tcon->snapshot_time != volume_info->snapshot_time)
+		return 0;
+#endif /* CONFIG_CIFS_SMB2 */
 	return 1;
 }
 
@@ -2632,14 +2654,8 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each(tmp, &ses->tcon_list) {
 		tcon = list_entry(tmp, struct cifs_tcon, tcon_list);
-		if (!match_tcon(tcon, volume_info->UNC))
-			continue;
-
-#ifdef CONFIG_CIFS_SMB2
-		if (tcon->snapshot_time != volume_info->snapshot_time)
+		if (!match_tcon(tcon, volume_info))
 			continue;
-#endif /* CONFIG_CIFS_SMB2 */
-
 		++tcon->tc_count;
 		spin_unlock(&cifs_tcp_ses_lock);
 		return tcon;
@@ -2685,8 +2701,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 		cifs_dbg(FYI, "Found match on UNC path\n");
 		/* existing tcon already has a reference */
 		cifs_put_smb_ses(ses);
-		if (tcon->seal != volume_info->seal)
-			cifs_dbg(VFS, "transport encryption setting conflicts with existing tid\n");
 		return tcon;
 	}
 
@@ -2742,7 +2756,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 		tcon->Flags &= ~SMB_SHARE_IS_IN_DFS;
 		cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
 	}
-	tcon->seal = volume_info->seal;
 	tcon->use_persistent = false;
 	/* check if SMB2 or later, CIFS does not support persistent handles */
 	if (volume_info->persistent) {
@@ -2779,6 +2792,24 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
 		tcon->use_resilient = true;
 	}
 
+	if (volume_info->seal) {
+		if (ses->server->vals->protocol_id == 0) {
+			cifs_dbg(VFS,
+				 "SMB3 or later required for encryption\n");
+			rc = -EOPNOTSUPP;
+			goto out_fail;
+#ifdef CONFIG_CIFS_SMB2
+		} else if (tcon->ses->server->capabilities &
+					SMB2_GLOBAL_CAP_ENCRYPTION)
+			tcon->seal = true;
+		else {
+			cifs_dbg(VFS, "Encryption is not supported on share\n");
+			rc = -EOPNOTSUPP;
+			goto out_fail;
+#endif /* CONFIG_CIFS_SMB2 */
+		}
+	}
+
 	/*
 	 * We can have only one retry value for a connection to a share so for
 	 * resources mounted more than once to the same server share the last
@@ -2910,7 +2941,7 @@ cifs_match_super(struct super_block *sb, void *data)
 
 	if (!match_server(tcp_srv, volume_info) ||
 	    !match_session(ses, volume_info) ||
-	    !match_tcon(tcon, volume_info->UNC) ||
+	    !match_tcon(tcon, volume_info) ||
 	    !match_prepath(sb, mnt_data)) {
 		rc = 0;
 		goto out;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 18a1e1d6671f..98dc842e7245 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2884,7 +2884,15 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
 	for (i = 0; i < rdata->nr_pages; i++) {
 		struct page *page = rdata->pages[i];
 		size_t copy = min_t(size_t, remaining, PAGE_SIZE);
-		size_t written = copy_page_to_iter(page, 0, copy, iter);
+		size_t written;
+
+		if (unlikely(iter->type & ITER_PIPE)) {
+			void *addr = kmap_atomic(page);
+
+			written = copy_to_iter(addr, copy, iter);
+			kunmap_atomic(addr);
+		} else
+			written = copy_page_to_iter(page, 0, copy, iter);
 		remaining -= written;
 		if (written < copy && iov_iter_count(iter) > 0)
 			break;
@@ -2903,8 +2911,9 @@ cifs_uncached_readv_complete(struct work_struct *work)
 }
 
 static int
-cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
-			struct cifs_readdata *rdata, unsigned int len)
+uncached_fill_pages(struct TCP_Server_Info *server,
+		    struct cifs_readdata *rdata, struct iov_iter *iter,
+		    unsigned int len)
 {
 	int result = 0;
 	unsigned int i;
@@ -2933,7 +2942,10 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
 			rdata->tailsz = len;
 			len = 0;
 		}
-		result = cifs_read_page_from_socket(server, page, n);
+		if (iter)
+			result = copy_page_from_iter(page, 0, n, iter);
+		else
+			result = cifs_read_page_from_socket(server, page, n);
 		if (result < 0)
 			break;
 
@@ -2945,6 +2957,21 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
 }
 
 static int
+cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
+			      struct cifs_readdata *rdata, unsigned int len)
+{
+	return uncached_fill_pages(server, rdata, NULL, len);
+}
+
+static int
+cifs_uncached_copy_into_pages(struct TCP_Server_Info *server,
+			      struct cifs_readdata *rdata,
+			      struct iov_iter *iter)
+{
+	return uncached_fill_pages(server, rdata, iter, iter->count);
+}
+
+static int
 cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 		     struct cifs_sb_info *cifs_sb, struct list_head *rdata_list)
 {
@@ -2991,6 +3018,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
 		rdata->pid = pid;
 		rdata->pagesz = PAGE_SIZE;
 		rdata->read_into_pages = cifs_uncached_read_into_pages;
+		rdata->copy_into_pages = cifs_uncached_copy_into_pages;
 		rdata->credits = credits;
 
 		if (!rdata->cfile->invalidHandle ||
@@ -3341,8 +3369,9 @@ cifs_readv_complete(struct work_struct *work)
 }
 
 static int
-cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
-			struct cifs_readdata *rdata, unsigned int len)
+readpages_fill_pages(struct TCP_Server_Info *server,
+		     struct cifs_readdata *rdata, struct iov_iter *iter,
+		     unsigned int len)
 {
 	int result = 0;
 	unsigned int i;
@@ -3396,7 +3425,10 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
 			continue;
 		}
 
-		result = cifs_read_page_from_socket(server, page, n);
+		if (iter)
+			result = copy_page_from_iter(page, 0, n, iter);
+		else
+			result = cifs_read_page_from_socket(server, page, n);
 		if (result < 0)
 			break;
 
@@ -3408,6 +3440,21 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
 }
 
 static int
+cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
+			       struct cifs_readdata *rdata, unsigned int len)
+{
+	return readpages_fill_pages(server, rdata, NULL, len);
+}
+
+static int
+cifs_readpages_copy_into_pages(struct TCP_Server_Info *server,
+			       struct cifs_readdata *rdata,
+			       struct iov_iter *iter)
+{
+	return readpages_fill_pages(server, rdata, iter, iter->count);
+}
+
+static int
 readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
 		    unsigned int rsize, struct list_head *tmplist,
 		    unsigned int *nr_pages, loff_t *offset, unsigned int *bytes)
@@ -3561,6 +3608,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 		rdata->pid = pid;
 		rdata->pagesz = PAGE_SIZE;
 		rdata->read_into_pages = cifs_readpages_read_into_pages;
+		rdata->copy_into_pages = cifs_readpages_copy_into_pages;
 		rdata->credits = credits;
 
 		list_for_each_entry_safe(page, tpage, &tmplist, lru) {
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 8f6a2a5863b9..a27fc8791551 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -285,6 +285,7 @@ initiate_cifs_search(const unsigned int xid, struct file *file)
 			rc = -ENOMEM;
 			goto error_exit;
 		}
+		spin_lock_init(&cifsFile->file_info_lock);
 		file->private_data = cifsFile;
 		cifsFile->tlink = cifs_get_tlink(tlink);
 		tcon = tlink_tcon(tlink);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 538d9b55699a..dcbcc927399a 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -344,13 +344,12 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 	/* BB is NTLMV2 session security format easier to use here? */
 	flags = NTLMSSP_NEGOTIATE_56 |	NTLMSSP_REQUEST_TARGET |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
-	if (ses->server->sign) {
+		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC |
+		NTLMSSP_NEGOTIATE_SEAL;
+	if (ses->server->sign)
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
-		if (!ses->server->session_estab ||
-				ses->ntlmssp->sesskey_per_smbsess)
-			flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
-	}
+	if (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess)
+		flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
 
 	sec_blob->NegotiateFlags = cpu_to_le32(flags);
 
@@ -407,13 +406,12 @@ int build_ntlmssp_auth_blob(unsigned char **pbuffer,
 	flags = NTLMSSP_NEGOTIATE_56 |
 		NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_TARGET_INFO |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
-	if (ses->server->sign) {
+		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC |
+		NTLMSSP_NEGOTIATE_SEAL;
+	if (ses->server->sign)
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
-		if (!ses->server->session_estab ||
-				ses->ntlmssp->sesskey_per_smbsess)
-			flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
-	}
+	if (!ses->server->session_estab || ses->ntlmssp->sesskey_per_smbsess)
+		flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
 
 	tmp = *pbuffer + sizeof(AUTHENTICATE_MESSAGE);
 	sec_blob->NegotiateFlags = cpu_to_le32(flags);
@@ -652,6 +650,7 @@ sess_sendreceive(struct sess_data *sess_data)
 	int rc;
 	struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base;
 	__u16 count;
+	struct kvec rsp_iov = { NULL, 0 };
 
 	count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len;
 	smb_buf->smb_buf_length =
@@ -661,7 +660,9 @@ sess_sendreceive(struct sess_data *sess_data)
 	rc = SendReceive2(sess_data->xid, sess_data->ses,
 			  sess_data->iov, 3 /* num_iovecs */,
 			  &sess_data->buf0_type,
-			  CIFS_LOG_ERROR);
+			  CIFS_LOG_ERROR, &rsp_iov);
+	cifs_small_buf_release(sess_data->iov[0].iov_base);
+	memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec));
 
 	return rc;
 }
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index fc537c29044e..67a987e4d026 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -36,11 +36,11 @@
  * SMB_COM_NT_CANCEL request and then sends it.
  */
 static int
-send_nt_cancel(struct TCP_Server_Info *server, void *buf,
+send_nt_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst,
 	       struct mid_q_entry *mid)
 {
 	int rc = 0;
-	struct smb_hdr *in_buf = (struct smb_hdr *)buf;
+	struct smb_hdr *in_buf = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
 
 	/* -4 for RFC1001 length and +2 for BCC field */
 	in_buf->smb_buf_length = cpu_to_be32(sizeof(struct smb_hdr) - 4  + 2);
diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h
index 0ffa18094335..401a5d856636 100644
--- a/fs/cifs/smb2glob.h
+++ b/fs/cifs/smb2glob.h
@@ -61,4 +61,9 @@
 /* Maximum buffer size value we can send with 1 credit */
 #define SMB2_MAX_BUFFER_SIZE 65536
 
+static inline struct smb2_sync_hdr *get_sync_hdr(void *buf)
+{
+	return &(((struct smb2_hdr *)buf)->sync_hdr);
+}
+
 #endif	/* _SMB2_GLOB_H */
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 8257a5a97cc0..3030a9dfb0dd 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -26,6 +26,7 @@
 #include "smb2pdu.h"
 #include "smb2proto.h"
 #include "smb2status.h"
+#include "smb2glob.h"
 
 struct status_to_posix_error {
 	__le32 smb2_status;
@@ -2449,10 +2450,10 @@ smb2_print_status(__le32 status)
 int
 map_smb2_to_linux_error(char *buf, bool log_err)
 {
-	struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
 	unsigned int i;
 	int rc = -EIO;
-	__le32 smb2err = hdr->Status;
+	__le32 smb2err = shdr->Status;
 
 	if (smb2err == 0)
 		return 0;
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 3d383489b9cf..fd516ea8b8f8 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -28,31 +28,32 @@
 #include "cifs_debug.h"
 #include "cifs_unicode.h"
 #include "smb2status.h"
+#include "smb2glob.h"
 
 static int
-check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
+check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid)
 {
-	__u64 wire_mid = le64_to_cpu(hdr->MessageId);
+	__u64 wire_mid = le64_to_cpu(shdr->MessageId);
 
 	/*
 	 * Make sure that this really is an SMB, that it is a response,
 	 * and that the message ids match.
 	 */
-	if ((hdr->ProtocolId == SMB2_PROTO_NUMBER) &&
+	if ((shdr->ProtocolId == SMB2_PROTO_NUMBER) &&
 	    (mid == wire_mid)) {
-		if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
+		if (shdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
 			return 0;
 		else {
 			/* only one valid case where server sends us request */
-			if (hdr->Command == SMB2_OPLOCK_BREAK)
+			if (shdr->Command == SMB2_OPLOCK_BREAK)
 				return 0;
 			else
 				cifs_dbg(VFS, "Received Request not response\n");
 		}
 	} else { /* bad signature or mid */
-		if (hdr->ProtocolId != SMB2_PROTO_NUMBER)
+		if (shdr->ProtocolId != SMB2_PROTO_NUMBER)
 			cifs_dbg(VFS, "Bad protocol string signature header %x\n",
-				 le32_to_cpu(hdr->ProtocolId));
+				 le32_to_cpu(shdr->ProtocolId));
 		if (mid != wire_mid)
 			cifs_dbg(VFS, "Mids do not match: %llu and %llu\n",
 				 mid, wire_mid);
@@ -95,8 +96,9 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
 int
 smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
 {
-	struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
-	struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
+	struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
+	struct smb2_hdr *hdr = &pdu->hdr;
+	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
 	__u64 mid;
 	__u32 len = get_rfc1002_length(buf);
 	__u32 clc_len;  /* calculated length */
@@ -111,7 +113,7 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
 	 * ie Validate the wct via smb2_struct_sizes table above
 	 */
 
-	if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
+	if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
 		struct smb2_transform_hdr *thdr =
 			(struct smb2_transform_hdr *)buf;
 		struct cifs_ses *ses = NULL;
@@ -133,10 +135,10 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
 		}
 	}
 
-
-	mid = le64_to_cpu(hdr->MessageId);
+	mid = le64_to_cpu(shdr->MessageId);
 	if (length < sizeof(struct smb2_pdu)) {
-		if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) {
+		if ((length >= sizeof(struct smb2_hdr))
+		    && (shdr->Status != 0)) {
 			pdu->StructureSize2 = 0;
 			/*
 			 * As with SMB/CIFS, on some error cases servers may
@@ -154,29 +156,30 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
 		return 1;
 	}
 
-	if (check_smb2_hdr(hdr, mid))
+	if (check_smb2_hdr(shdr, mid))
 		return 1;
 
-	if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
+	if (shdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) {
 		cifs_dbg(VFS, "Illegal structure size %u\n",
-			 le16_to_cpu(hdr->StructureSize));
+			 le16_to_cpu(shdr->StructureSize));
 		return 1;
 	}
 
-	command = le16_to_cpu(hdr->Command);
+	command = le16_to_cpu(shdr->Command);
 	if (command >= NUMBER_OF_SMB2_COMMANDS) {
 		cifs_dbg(VFS, "Illegal SMB2 command %d\n", command);
 		return 1;
 	}
 
 	if (smb2_rsp_struct_sizes[command] != pdu->StructureSize2) {
-		if (command != SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0 ||
+		if (command != SMB2_OPLOCK_BREAK_HE && (shdr->Status == 0 ||
 		    pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2)) {
 			/* error packets have 9 byte structure size */
 			cifs_dbg(VFS, "Illegal response size %u for command %d\n",
 				 le16_to_cpu(pdu->StructureSize2), command);
 			return 1;
-		} else if (command == SMB2_OPLOCK_BREAK_HE && (hdr->Status == 0)
+		} else if (command == SMB2_OPLOCK_BREAK_HE
+			   && (shdr->Status == 0)
 			   && (le16_to_cpu(pdu->StructureSize2) != 44)
 			   && (le16_to_cpu(pdu->StructureSize2) != 36)) {
 			/* special case for SMB2.1 lease break message */
@@ -199,7 +202,7 @@ smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
 			 clc_len, 4 + len, mid);
 		/* create failed on symlink */
 		if (command == SMB2_CREATE_HE &&
-		    hdr->Status == STATUS_STOPPED_ON_SYMLINK)
+		    shdr->Status == STATUS_STOPPED_ON_SYMLINK)
 			return 0;
 		/* Windows 7 server returns 24 bytes more */
 		if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
@@ -261,11 +264,12 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
 char *
 smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
 {
+	struct smb2_sync_hdr *shdr = get_sync_hdr(hdr);
 	*off = 0;
 	*len = 0;
 
 	/* error responses do not have data area */
-	if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
+	if (shdr->Status && shdr->Status != STATUS_MORE_PROCESSING_REQUIRED &&
 	    (((struct smb2_err_rsp *)hdr)->StructureSize) ==
 						SMB2_ERROR_STRUCTURE_SIZE2)
 		return NULL;
@@ -275,7 +279,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
 	 * of the data buffer offset and data buffer length for the particular
 	 * command.
 	 */
-	switch (hdr->Command) {
+	switch (shdr->Command) {
 	case SMB2_NEGOTIATE:
 		*off = le16_to_cpu(
 		    ((struct smb2_negotiate_rsp *)hdr)->SecurityBufferOffset);
@@ -346,7 +350,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
 
 	/* return pointer to beginning of data area, ie offset from SMB start */
 	if ((*off != 0) && (*len != 0))
-		return (char *)(&hdr->ProtocolId) + *off;
+		return (char *)shdr + *off;
 	else
 		return NULL;
 }
@@ -358,12 +362,13 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
 unsigned int
 smb2_calc_size(void *buf)
 {
-	struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
-	struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
+	struct smb2_pdu *pdu = (struct smb2_pdu *)buf;
+	struct smb2_hdr *hdr = &pdu->hdr;
+	struct smb2_sync_hdr *shdr = get_sync_hdr(hdr);
 	int offset; /* the offset from the beginning of SMB to data area */
 	int data_length; /* the length of the variable length data area */
 	/* Structure Size has already been checked to make sure it is 64 */
-	int len = 4 + le16_to_cpu(pdu->hdr.StructureSize);
+	int len = 4 + le16_to_cpu(shdr->StructureSize);
 
 	/*
 	 * StructureSize2, ie length of fixed parameter area has already
@@ -371,7 +376,7 @@ smb2_calc_size(void *buf)
 	 */
 	len += le16_to_cpu(pdu->StructureSize2);
 
-	if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false)
+	if (has_smb2_data_area[le16_to_cpu(shdr->Command)] == false)
 		goto calc_size_exit;
 
 	smb2_get_data_area_len(&offset, &data_length, hdr);
@@ -582,7 +587,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 
 	cifs_dbg(FYI, "Checking for oplock break\n");
 
-	if (rsp->hdr.Command != SMB2_OPLOCK_BREAK)
+	if (rsp->hdr.sync_hdr.Command != SMB2_OPLOCK_BREAK)
 		return false;
 
 	if (rsp->StructureSize !=
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 5d456ebb3813..a44b4dbe4aae 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -20,6 +20,8 @@
 #include <linux/pagemap.h>
 #include <linux/vfs.h>
 #include <linux/falloc.h>
+#include <linux/scatterlist.h>
+#include <crypto/aead.h>
 #include "cifsglob.h"
 #include "smb2pdu.h"
 #include "smb2proto.h"
@@ -119,7 +121,9 @@ smb2_get_credits_field(struct TCP_Server_Info *server, const int optype)
 static unsigned int
 smb2_get_credits(struct mid_q_entry *mid)
 {
-	return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest);
+	struct smb2_sync_hdr *shdr = get_sync_hdr(mid->resp_buf);
+
+	return le16_to_cpu(shdr->CreditRequest);
 }
 
 static int
@@ -184,10 +188,10 @@ static struct mid_q_entry *
 smb2_find_mid(struct TCP_Server_Info *server, char *buf)
 {
 	struct mid_q_entry *mid;
-	struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
-	__u64 wire_mid = le64_to_cpu(hdr->MessageId);
+	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+	__u64 wire_mid = le64_to_cpu(shdr->MessageId);
 
-	if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
+	if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
 		cifs_dbg(VFS, "encrypted frame parsing not supported yet");
 		return NULL;
 	}
@@ -196,7 +200,7 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)
 	list_for_each_entry(mid, &server->pending_mid_q, qhead) {
 		if ((mid->mid == wire_mid) &&
 		    (mid->mid_state == MID_REQUEST_SUBMITTED) &&
-		    (mid->command == hdr->Command)) {
+		    (mid->command == shdr->Command)) {
 			spin_unlock(&GlobalMid_Lock);
 			return mid;
 		}
@@ -209,12 +213,12 @@ static void
 smb2_dump_detail(void *buf)
 {
 #ifdef CONFIG_CIFS_DEBUG2
-	struct smb2_hdr *smb = (struct smb2_hdr *)buf;
+	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
 
 	cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n",
-		 smb->Command, smb->Status, smb->Flags, smb->MessageId,
-		 smb->ProcessId);
-	cifs_dbg(VFS, "smb buf %p len %u\n", smb, smb2_calc_size(smb));
+		 shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId,
+		 shdr->ProcessId);
+	cifs_dbg(VFS, "smb buf %p len %u\n", buf, smb2_calc_size(buf));
 #endif
 }
 
@@ -1002,14 +1006,14 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon,
 static bool
 smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length)
 {
-	struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
 
-	if (hdr->Status != STATUS_PENDING)
+	if (shdr->Status != STATUS_PENDING)
 		return false;
 
 	if (!length) {
 		spin_lock(&server->req_lock);
-		server->credits += le16_to_cpu(hdr->CreditRequest);
+		server->credits += le16_to_cpu(shdr->CreditRequest);
 		spin_unlock(&server->req_lock);
 		wake_up(&server->request_q);
 	}
@@ -1545,6 +1549,633 @@ smb2_dir_needs_close(struct cifsFileInfo *cfile)
 	return !cfile->invalidHandle;
 }
 
+static void
+fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, struct smb_rqst *old_rq)
+{
+	struct smb2_sync_hdr *shdr =
+			(struct smb2_sync_hdr *)old_rq->rq_iov[1].iov_base;
+	unsigned int orig_len = get_rfc1002_length(old_rq->rq_iov[0].iov_base);
+
+	memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr));
+	tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM;
+	tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len);
+	tr_hdr->Flags = cpu_to_le16(0x01);
+	get_random_bytes(&tr_hdr->Nonce, SMB3_AES128CMM_NONCE);
+	memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8);
+	inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - 4);
+	inc_rfc1001_len(tr_hdr, orig_len);
+}
+
+static struct scatterlist *
+init_sg(struct smb_rqst *rqst, u8 *sign)
+{
+	unsigned int sg_len = rqst->rq_nvec + rqst->rq_npages + 1;
+	unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24;
+	struct scatterlist *sg;
+	unsigned int i;
+	unsigned int j;
+
+	sg = kmalloc_array(sg_len, sizeof(struct scatterlist), GFP_KERNEL);
+	if (!sg)
+		return NULL;
+
+	sg_init_table(sg, sg_len);
+	sg_set_buf(&sg[0], rqst->rq_iov[0].iov_base + 24, assoc_data_len);
+	for (i = 1; i < rqst->rq_nvec; i++)
+		sg_set_buf(&sg[i], rqst->rq_iov[i].iov_base,
+						rqst->rq_iov[i].iov_len);
+	for (j = 0; i < sg_len - 1; i++, j++) {
+		unsigned int len = (j < rqst->rq_npages - 1) ? rqst->rq_pagesz
+							: rqst->rq_tailsz;
+		sg_set_page(&sg[i], rqst->rq_pages[j], len, 0);
+	}
+	sg_set_buf(&sg[sg_len - 1], sign, SMB2_SIGNATURE_SIZE);
+	return sg;
+}
+
+struct cifs_crypt_result {
+	int err;
+	struct completion completion;
+};
+
+static void cifs_crypt_complete(struct crypto_async_request *req, int err)
+{
+	struct cifs_crypt_result *res = req->data;
+
+	if (err == -EINPROGRESS)
+		return;
+
+	res->err = err;
+	complete(&res->completion);
+}
+
+/*
+ * Encrypt or decrypt @rqst message. @rqst has the following format:
+ * iov[0] - transform header (associate data),
+ * iov[1-N] and pages - data to encrypt.
+ * On success return encrypted data in iov[1-N] and pages, leave iov[0]
+ * untouched.
+ */
+static int
+crypt_message(struct TCP_Server_Info *server, struct smb_rqst *rqst, int enc)
+{
+	struct smb2_transform_hdr *tr_hdr =
+			(struct smb2_transform_hdr *)rqst->rq_iov[0].iov_base;
+	unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24;
+	struct cifs_ses *ses;
+	int rc = 0;
+	struct scatterlist *sg;
+	u8 sign[SMB2_SIGNATURE_SIZE] = {};
+	struct aead_request *req;
+	char *iv;
+	unsigned int iv_len;
+	struct cifs_crypt_result result = {0, };
+	struct crypto_aead *tfm;
+	unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
+
+	init_completion(&result.completion);
+
+	ses = smb2_find_smb_ses(server, tr_hdr->SessionId);
+	if (!ses) {
+		cifs_dbg(VFS, "%s: Could not find session\n", __func__);
+		return 0;
+	}
+
+	rc = smb3_crypto_aead_allocate(server);
+	if (rc) {
+		cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
+		return rc;
+	}
+
+	tfm = enc ? server->secmech.ccmaesencrypt :
+						server->secmech.ccmaesdecrypt;
+	rc = crypto_aead_setkey(tfm, enc ? ses->smb3encryptionkey :
+				ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Failed to set aead key %d\n", __func__, rc);
+		return rc;
+	}
+
+	rc = crypto_aead_setauthsize(tfm, SMB2_SIGNATURE_SIZE);
+	if (rc) {
+		cifs_dbg(VFS, "%s: Failed to set authsize %d\n", __func__, rc);
+		return rc;
+	}
+
+	req = aead_request_alloc(tfm, GFP_KERNEL);
+	if (!req) {
+		cifs_dbg(VFS, "%s: Failed to alloc aead request", __func__);
+		return -ENOMEM;
+	}
+
+	if (!enc) {
+		memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE);
+		crypt_len += SMB2_SIGNATURE_SIZE;
+	}
+
+	sg = init_sg(rqst, sign);
+	if (!sg) {
+		cifs_dbg(VFS, "%s: Failed to init sg %d", __func__, rc);
+		goto free_req;
+	}
+
+	iv_len = crypto_aead_ivsize(tfm);
+	iv = kzalloc(iv_len, GFP_KERNEL);
+	if (!iv) {
+		cifs_dbg(VFS, "%s: Failed to alloc IV", __func__);
+		goto free_sg;
+	}
+	iv[0] = 3;
+	memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CMM_NONCE);
+
+	aead_request_set_crypt(req, sg, sg, crypt_len, iv);
+	aead_request_set_ad(req, assoc_data_len);
+
+	aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+				  cifs_crypt_complete, &result);
+
+	rc = enc ? crypto_aead_encrypt(req) : crypto_aead_decrypt(req);
+
+	if (rc == -EINPROGRESS || rc == -EBUSY) {
+		wait_for_completion(&result.completion);
+		rc = result.err;
+	}
+
+	if (!rc && enc)
+		memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
+
+	kfree(iv);
+free_sg:
+	kfree(sg);
+free_req:
+	kfree(req);
+	return rc;
+}
+
+static int
+smb3_init_transform_rq(struct TCP_Server_Info *server, struct smb_rqst *new_rq,
+		       struct smb_rqst *old_rq)
+{
+	struct kvec *iov;
+	struct page **pages;
+	struct smb2_transform_hdr *tr_hdr;
+	unsigned int npages = old_rq->rq_npages;
+	int i;
+	int rc = -ENOMEM;
+
+	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		return rc;
+
+	new_rq->rq_pages = pages;
+	new_rq->rq_npages = old_rq->rq_npages;
+	new_rq->rq_pagesz = old_rq->rq_pagesz;
+	new_rq->rq_tailsz = old_rq->rq_tailsz;
+
+	for (i = 0; i < npages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+		if (!pages[i])
+			goto err_free_pages;
+	}
+
+	iov = kmalloc_array(old_rq->rq_nvec, sizeof(struct kvec), GFP_KERNEL);
+	if (!iov)
+		goto err_free_pages;
+
+	/* copy all iovs from the old except the 1st one (rfc1002 length) */
+	memcpy(&iov[1], &old_rq->rq_iov[1],
+				sizeof(struct kvec) * (old_rq->rq_nvec - 1));
+	new_rq->rq_iov = iov;
+	new_rq->rq_nvec = old_rq->rq_nvec;
+
+	tr_hdr = kmalloc(sizeof(struct smb2_transform_hdr), GFP_KERNEL);
+	if (!tr_hdr)
+		goto err_free_iov;
+
+	/* fill the 1st iov with a transform header */
+	fill_transform_hdr(tr_hdr, old_rq);
+	new_rq->rq_iov[0].iov_base = tr_hdr;
+	new_rq->rq_iov[0].iov_len = sizeof(struct smb2_transform_hdr);
+
+	/* copy pages form the old */
+	for (i = 0; i < npages; i++) {
+		char *dst = kmap(new_rq->rq_pages[i]);
+		char *src = kmap(old_rq->rq_pages[i]);
+		unsigned int len = (i < npages - 1) ? new_rq->rq_pagesz :
+							new_rq->rq_tailsz;
+		memcpy(dst, src, len);
+		kunmap(new_rq->rq_pages[i]);
+		kunmap(old_rq->rq_pages[i]);
+	}
+
+	rc = crypt_message(server, new_rq, 1);
+	cifs_dbg(FYI, "encrypt message returned %d", rc);
+	if (rc)
+		goto err_free_tr_hdr;
+
+	return rc;
+
+err_free_tr_hdr:
+	kfree(tr_hdr);
+err_free_iov:
+	kfree(iov);
+err_free_pages:
+	for (i = i - 1; i >= 0; i--)
+		put_page(pages[i]);
+	kfree(pages);
+	return rc;
+}
+
+static void
+smb3_free_transform_rq(struct smb_rqst *rqst)
+{
+	int i = rqst->rq_npages - 1;
+
+	for (; i >= 0; i--)
+		put_page(rqst->rq_pages[i]);
+	kfree(rqst->rq_pages);
+	/* free transform header */
+	kfree(rqst->rq_iov[0].iov_base);
+	kfree(rqst->rq_iov);
+}
+
+static int
+smb3_is_transform_hdr(void *buf)
+{
+	struct smb2_transform_hdr *trhdr = buf;
+
+	return trhdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM;
+}
+
+static int
+decrypt_raw_data(struct TCP_Server_Info *server, char *buf,
+		 unsigned int buf_data_size, struct page **pages,
+		 unsigned int npages, unsigned int page_data_size)
+{
+	struct kvec iov[2];
+	struct smb_rqst rqst = {NULL};
+	struct smb2_hdr *hdr;
+	int rc;
+
+	iov[0].iov_base = buf;
+	iov[0].iov_len = sizeof(struct smb2_transform_hdr);
+	iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr);
+	iov[1].iov_len = buf_data_size;
+
+	rqst.rq_iov = iov;
+	rqst.rq_nvec = 2;
+	rqst.rq_pages = pages;
+	rqst.rq_npages = npages;
+	rqst.rq_pagesz = PAGE_SIZE;
+	rqst.rq_tailsz = (page_data_size % PAGE_SIZE) ? : PAGE_SIZE;
+
+	rc = crypt_message(server, &rqst, 0);
+	cifs_dbg(FYI, "decrypt message returned %d\n", rc);
+
+	if (rc)
+		return rc;
+
+	memmove(buf + 4, iov[1].iov_base, buf_data_size);
+	hdr = (struct smb2_hdr *)buf;
+	hdr->smb2_buf_length = cpu_to_be32(buf_data_size + page_data_size);
+	server->total_read = buf_data_size + page_data_size + 4;
+
+	return rc;
+}
+
+static int
+read_data_into_pages(struct TCP_Server_Info *server, struct page **pages,
+		     unsigned int npages, unsigned int len)
+{
+	int i;
+	int length;
+
+	for (i = 0; i < npages; i++) {
+		struct page *page = pages[i];
+		size_t n;
+
+		n = len;
+		if (len >= PAGE_SIZE) {
+			/* enough data to fill the page */
+			n = PAGE_SIZE;
+			len -= n;
+		} else {
+			zero_user(page, len, PAGE_SIZE - len);
+			len = 0;
+		}
+		length = cifs_read_page_from_socket(server, page, n);
+		if (length < 0)
+			return length;
+		server->total_read += length;
+	}
+
+	return 0;
+}
+
+static int
+init_read_bvec(struct page **pages, unsigned int npages, unsigned int data_size,
+	       unsigned int cur_off, struct bio_vec **page_vec)
+{
+	struct bio_vec *bvec;
+	int i;
+
+	bvec = kcalloc(npages, sizeof(struct bio_vec), GFP_KERNEL);
+	if (!bvec)
+		return -ENOMEM;
+
+	for (i = 0; i < npages; i++) {
+		bvec[i].bv_page = pages[i];
+		bvec[i].bv_offset = (i == 0) ? cur_off : 0;
+		bvec[i].bv_len = min_t(unsigned int, PAGE_SIZE, data_size);
+		data_size -= bvec[i].bv_len;
+	}
+
+	if (data_size != 0) {
+		cifs_dbg(VFS, "%s: something went wrong\n", __func__);
+		kfree(bvec);
+		return -EIO;
+	}
+
+	*page_vec = bvec;
+	return 0;
+}
+
+static int
+handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid,
+		 char *buf, unsigned int buf_len, struct page **pages,
+		 unsigned int npages, unsigned int page_data_size)
+{
+	unsigned int data_offset;
+	unsigned int data_len;
+	unsigned int cur_off;
+	unsigned int cur_page_idx;
+	unsigned int pad_len;
+	struct cifs_readdata *rdata = mid->callback_data;
+	struct smb2_sync_hdr *shdr = get_sync_hdr(buf);
+	struct bio_vec *bvec = NULL;
+	struct iov_iter iter;
+	struct kvec iov;
+	int length;
+
+	if (shdr->Command != SMB2_READ) {
+		cifs_dbg(VFS, "only big read responses are supported\n");
+		return -ENOTSUPP;
+	}
+
+	if (server->ops->is_status_pending &&
+			server->ops->is_status_pending(buf, server, 0))
+		return -1;
+
+	rdata->result = server->ops->map_error(buf, false);
+	if (rdata->result != 0) {
+		cifs_dbg(FYI, "%s: server returned error %d\n",
+			 __func__, rdata->result);
+		dequeue_mid(mid, rdata->result);
+		return 0;
+	}
+
+	data_offset = server->ops->read_data_offset(buf) + 4;
+	data_len = server->ops->read_data_length(buf);
+
+	if (data_offset < server->vals->read_rsp_size) {
+		/*
+		 * win2k8 sometimes sends an offset of 0 when the read
+		 * is beyond the EOF. Treat it as if the data starts just after
+		 * the header.
+		 */
+		cifs_dbg(FYI, "%s: data offset (%u) inside read response header\n",
+			 __func__, data_offset);
+		data_offset = server->vals->read_rsp_size;
+	} else if (data_offset > MAX_CIFS_SMALL_BUFFER_SIZE) {
+		/* data_offset is beyond the end of smallbuf */
+		cifs_dbg(FYI, "%s: data offset (%u) beyond end of smallbuf\n",
+			 __func__, data_offset);
+		rdata->result = -EIO;
+		dequeue_mid(mid, rdata->result);
+		return 0;
+	}
+
+	pad_len = data_offset - server->vals->read_rsp_size;
+
+	if (buf_len <= data_offset) {
+		/* read response payload is in pages */
+		cur_page_idx = pad_len / PAGE_SIZE;
+		cur_off = pad_len % PAGE_SIZE;
+
+		if (cur_page_idx != 0) {
+			/* data offset is beyond the 1st page of response */
+			cifs_dbg(FYI, "%s: data offset (%u) beyond 1st page of response\n",
+				 __func__, data_offset);
+			rdata->result = -EIO;
+			dequeue_mid(mid, rdata->result);
+			return 0;
+		}
+
+		if (data_len > page_data_size - pad_len) {
+			/* data_len is corrupt -- discard frame */
+			rdata->result = -EIO;
+			dequeue_mid(mid, rdata->result);
+			return 0;
+		}
+
+		rdata->result = init_read_bvec(pages, npages, page_data_size,
+					       cur_off, &bvec);
+		if (rdata->result != 0) {
+			dequeue_mid(mid, rdata->result);
+			return 0;
+		}
+
+		iov_iter_bvec(&iter, WRITE | ITER_BVEC, bvec, npages, data_len);
+	} else if (buf_len >= data_offset + data_len) {
+		/* read response payload is in buf */
+		WARN_ONCE(npages > 0, "read data can be either in buf or in pages");
+		iov.iov_base = buf + data_offset;
+		iov.iov_len = data_len;
+		iov_iter_kvec(&iter, WRITE | ITER_KVEC, &iov, 1, data_len);
+	} else {
+		/* read response payload cannot be in both buf and pages */
+		WARN_ONCE(1, "buf can not contain only a part of read data");
+		rdata->result = -EIO;
+		dequeue_mid(mid, rdata->result);
+		return 0;
+	}
+
+	/* set up first iov for signature check */
+	rdata->iov[0].iov_base = buf;
+	rdata->iov[0].iov_len = 4;
+	rdata->iov[1].iov_base = buf + 4;
+	rdata->iov[1].iov_len = server->vals->read_rsp_size - 4;
+	cifs_dbg(FYI, "0: iov_base=%p iov_len=%zu\n",
+		 rdata->iov[0].iov_base, server->vals->read_rsp_size);
+
+	length = rdata->copy_into_pages(server, rdata, &iter);
+
+	kfree(bvec);
+
+	if (length < 0)
+		return length;
+
+	dequeue_mid(mid, false);
+	return length;
+}
+
+static int
+receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid)
+{
+	char *buf = server->smallbuf;
+	struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
+	unsigned int npages;
+	struct page **pages;
+	unsigned int len;
+	unsigned int buflen = get_rfc1002_length(buf) + 4;
+	int rc;
+	int i = 0;
+
+	len = min_t(unsigned int, buflen, server->vals->read_rsp_size - 4 +
+		sizeof(struct smb2_transform_hdr)) - HEADER_SIZE(server) + 1;
+
+	rc = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1, len);
+	if (rc < 0)
+		return rc;
+	server->total_read += rc;
+
+	len = le32_to_cpu(tr_hdr->OriginalMessageSize) + 4 -
+						server->vals->read_rsp_size;
+	npages = DIV_ROUND_UP(len, PAGE_SIZE);
+
+	pages = kmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		rc = -ENOMEM;
+		goto discard_data;
+	}
+
+	for (; i < npages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+		if (!pages[i]) {
+			rc = -ENOMEM;
+			goto discard_data;
+		}
+	}
+
+	/* read read data into pages */
+	rc = read_data_into_pages(server, pages, npages, len);
+	if (rc)
+		goto free_pages;
+
+	rc = cifs_discard_remaining_data(server);
+	if (rc)
+		goto free_pages;
+
+	rc = decrypt_raw_data(server, buf, server->vals->read_rsp_size - 4,
+			      pages, npages, len);
+	if (rc)
+		goto free_pages;
+
+	*mid = smb2_find_mid(server, buf);
+	if (*mid == NULL)
+		cifs_dbg(FYI, "mid not found\n");
+	else {
+		cifs_dbg(FYI, "mid found\n");
+		(*mid)->decrypted = true;
+		rc = handle_read_data(server, *mid, buf,
+				      server->vals->read_rsp_size,
+				      pages, npages, len);
+	}
+
+free_pages:
+	for (i = i - 1; i >= 0; i--)
+		put_page(pages[i]);
+	kfree(pages);
+	return rc;
+discard_data:
+	cifs_discard_remaining_data(server);
+	goto free_pages;
+}
+
+static int
+receive_encrypted_standard(struct TCP_Server_Info *server,
+			   struct mid_q_entry **mid)
+{
+	int length;
+	char *buf = server->smallbuf;
+	unsigned int pdu_length = get_rfc1002_length(buf);
+	unsigned int buf_size;
+	struct mid_q_entry *mid_entry;
+
+	/* switch to large buffer if too big for a small one */
+	if (pdu_length + 4 > MAX_CIFS_SMALL_BUFFER_SIZE) {
+		server->large_buf = true;
+		memcpy(server->bigbuf, buf, server->total_read);
+		buf = server->bigbuf;
+	}
+
+	/* now read the rest */
+	length = cifs_read_from_socket(server, buf + HEADER_SIZE(server) - 1,
+				pdu_length - HEADER_SIZE(server) + 1 + 4);
+	if (length < 0)
+		return length;
+	server->total_read += length;
+
+	buf_size = pdu_length + 4 - sizeof(struct smb2_transform_hdr);
+	length = decrypt_raw_data(server, buf, buf_size, NULL, 0, 0);
+	if (length)
+		return length;
+
+	mid_entry = smb2_find_mid(server, buf);
+	if (mid_entry == NULL)
+		cifs_dbg(FYI, "mid not found\n");
+	else {
+		cifs_dbg(FYI, "mid found\n");
+		mid_entry->decrypted = true;
+	}
+
+	*mid = mid_entry;
+
+	if (mid_entry && mid_entry->handle)
+		return mid_entry->handle(server, mid_entry);
+
+	return cifs_handle_standard(server, mid_entry);
+}
+
+static int
+smb3_receive_transform(struct TCP_Server_Info *server, struct mid_q_entry **mid)
+{
+	char *buf = server->smallbuf;
+	unsigned int pdu_length = get_rfc1002_length(buf);
+	struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf;
+	unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
+
+	if (pdu_length + 4 < sizeof(struct smb2_transform_hdr) +
+						sizeof(struct smb2_sync_hdr)) {
+		cifs_dbg(VFS, "Transform message is too small (%u)\n",
+			 pdu_length);
+		cifs_reconnect(server);
+		wake_up(&server->response_q);
+		return -ECONNABORTED;
+	}
+
+	if (pdu_length + 4 < orig_len + sizeof(struct smb2_transform_hdr)) {
+		cifs_dbg(VFS, "Transform message is broken\n");
+		cifs_reconnect(server);
+		wake_up(&server->response_q);
+		return -ECONNABORTED;
+	}
+
+	if (pdu_length + 4 > CIFSMaxBufSize + MAX_HEADER_SIZE(server))
+		return receive_encrypted_read(server, mid);
+
+	return receive_encrypted_standard(server, mid);
+}
+
+int
+smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid)
+{
+	char *buf = server->large_buf ? server->bigbuf : server->smallbuf;
+
+	return handle_read_data(server, mid, buf, get_rfc1002_length(buf) + 4,
+				NULL, 0, 0);
+}
+
 struct smb_version_operations smb20_operations = {
 	.compare_fids = smb2_compare_fids,
 	.setup_request = smb2_setup_request,
@@ -1791,6 +2422,10 @@ struct smb_version_operations smb30_operations = {
 	.dir_needs_close = smb2_dir_needs_close,
 	.fallocate = smb3_fallocate,
 	.enum_snapshots = smb3_enum_snapshots,
+	.init_transform_rq = smb3_init_transform_rq,
+	.free_transform_rq = smb3_free_transform_rq,
+	.is_transform_hdr = smb3_is_transform_hdr,
+	.receive_transform = smb3_receive_transform,
 };
 
 #ifdef CONFIG_CIFS_SMB311
@@ -1879,6 +2514,10 @@ struct smb_version_operations smb311_operations = {
 	.dir_needs_close = smb2_dir_needs_close,
 	.fallocate = smb3_fallocate,
 	.enum_snapshots = smb3_enum_snapshots,
+	.init_transform_rq = smb3_init_transform_rq,
+	.free_transform_rq = smb3_free_transform_rq,
+	.is_transform_hdr = smb3_is_transform_hdr,
+	.receive_transform = smb3_receive_transform,
 };
 #endif /* CIFS_SMB311 */
 
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 87457227812c..ad83b3db2840 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -77,45 +77,42 @@ static const int smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
 	/* SMB2_OPLOCK_BREAK */ 24 /* BB this is 36 for LEASE_BREAK variant */
 };
 
+static int encryption_required(const struct cifs_tcon *tcon)
+{
+	if (!tcon)
+		return 0;
+	if ((tcon->ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) ||
+	    (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA))
+		return 1;
+	if (tcon->seal &&
+	    (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION))
+		return 1;
+	return 0;
+}
 
 static void
-smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
+smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd,
 		  const struct cifs_tcon *tcon)
 {
-	struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
-	char *temp = (char *)hdr;
-	/* lookup word count ie StructureSize from table */
-	__u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_cmd)];
-
-	/*
-	 * smaller than SMALL_BUFFER_SIZE but bigger than fixed area of
-	 * largest operations (Create)
-	 */
-	memset(temp, 0, 256);
-
-	/* Note this is only network field converted to big endian */
-	hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr)
-			- 4 /*  RFC 1001 length field itself not counted */);
-
-	hdr->ProtocolId = SMB2_PROTO_NUMBER;
-	hdr->StructureSize = cpu_to_le16(64);
-	hdr->Command = smb2_cmd;
+	shdr->ProtocolId = SMB2_PROTO_NUMBER;
+	shdr->StructureSize = cpu_to_le16(64);
+	shdr->Command = smb2_cmd;
 	if (tcon && tcon->ses && tcon->ses->server) {
 		struct TCP_Server_Info *server = tcon->ses->server;
 
 		spin_lock(&server->req_lock);
 		/* Request up to 2 credits but don't go over the limit. */
 		if (server->credits >= server->max_credits)
-			hdr->CreditRequest = cpu_to_le16(0);
+			shdr->CreditRequest = cpu_to_le16(0);
 		else
-			hdr->CreditRequest = cpu_to_le16(
+			shdr->CreditRequest = cpu_to_le16(
 				min_t(int, server->max_credits -
 						server->credits, 2));
 		spin_unlock(&server->req_lock);
 	} else {
-		hdr->CreditRequest = cpu_to_le16(2);
+		shdr->CreditRequest = cpu_to_le16(2);
 	}
-	hdr->ProcessId = cpu_to_le32((__u16)current->tgid);
+	shdr->ProcessId = cpu_to_le32((__u16)current->tgid);
 
 	if (!tcon)
 		goto out;
@@ -124,13 +121,13 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
 	/* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
 	if ((tcon->ses) && (tcon->ses->server) &&
 	    (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
-		hdr->CreditCharge = cpu_to_le16(1);
+		shdr->CreditCharge = cpu_to_le16(1);
 	/* else CreditCharge MBZ */
 
-	hdr->TreeId = tcon->tid;
+	shdr->TreeId = tcon->tid;
 	/* Uid is not converted */
 	if (tcon->ses)
-		hdr->SessionId = tcon->ses->Suid;
+		shdr->SessionId = tcon->ses->Suid;
 
 	/*
 	 * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have
@@ -143,12 +140,12 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
 	 * but it is safer to net set it for now.
 	 */
 /*	if (tcon->share_flags & SHI1005_FLAGS_DFS)
-		hdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
+		shdr->Flags |= SMB2_FLAGS_DFS_OPERATIONS; */
 
-	if (tcon->ses && tcon->ses->server && tcon->ses->server->sign)
-		hdr->Flags |= SMB2_FLAGS_SIGNED;
+	if (tcon->ses && tcon->ses->server && tcon->ses->server->sign &&
+	    !encryption_required(tcon))
+		shdr->Flags |= SMB2_FLAGS_SIGNED;
 out:
-	pdu->StructureSize2 = cpu_to_le16(parmsize);
 	return;
 }
 
@@ -289,16 +286,74 @@ out:
 	return rc;
 }
 
+static void
+fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf,
+	       unsigned int *total_len)
+{
+	struct smb2_sync_pdu *spdu = (struct smb2_sync_pdu *)buf;
+	/* lookup word count ie StructureSize from table */
+	__u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)];
+
+	/*
+	 * smaller than SMALL_BUFFER_SIZE but bigger than fixed area of
+	 * largest operations (Create)
+	 */
+	memset(buf, 0, 256);
+
+	smb2_hdr_assemble(&spdu->sync_hdr, smb2_command, tcon);
+	spdu->StructureSize2 = cpu_to_le16(parmsize);
+
+	*total_len = parmsize + sizeof(struct smb2_sync_hdr);
+}
+
+/* init request without RFC1001 length at the beginning */
+static int
+smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
+		    void **request_buf, unsigned int *total_len)
+{
+	int rc;
+	struct smb2_sync_hdr *shdr;
+
+	rc = smb2_reconnect(smb2_command, tcon);
+	if (rc)
+		return rc;
+
+	/* BB eventually switch this to SMB2 specific small buf size */
+	*request_buf = cifs_small_buf_get();
+	if (*request_buf == NULL) {
+		/* BB should we add a retry in here if not a writepage? */
+		return -ENOMEM;
+	}
+
+	shdr = (struct smb2_sync_hdr *)(*request_buf);
+
+	fill_small_buf(smb2_command, tcon, shdr, total_len);
+
+	if (tcon != NULL) {
+#ifdef CONFIG_CIFS_STATS2
+		uint16_t com_code = le16_to_cpu(smb2_command);
+
+		cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_sent[com_code]);
+#endif
+		cifs_stats_inc(&tcon->num_smbs_sent);
+	}
+
+	return rc;
+}
+
 /*
  * Allocate and return pointer to an SMB request hdr, and set basic
  * SMB information in the SMB header. If the return code is zero, this
- * function must have filled in request_buf pointer.
+ * function must have filled in request_buf pointer. The returned buffer
+ * has RFC1001 length at the beginning.
  */
 static int
 small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon,
 		void **request_buf)
 {
-	int rc = 0;
+	int rc;
+	unsigned int total_len;
+	struct smb2_pdu *pdu;
 
 	rc = smb2_reconnect(smb2_command, tcon);
 	if (rc)
@@ -311,7 +366,12 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon,
 		return -ENOMEM;
 	}
 
-	smb2_hdr_assemble((struct smb2_hdr *) *request_buf, smb2_command, tcon);
+	pdu = (struct smb2_pdu *)(*request_buf);
+
+	fill_small_buf(smb2_command, tcon, get_sync_hdr(pdu), &total_len);
+
+	/* Note this is only network field converted to big endian */
+	pdu->hdr.smb2_buf_length = cpu_to_be32(total_len);
 
 	if (tcon != NULL) {
 #ifdef CONFIG_CIFS_STATS2
@@ -376,7 +436,6 @@ static void assemble_neg_contexts(struct smb2_negotiate_req *req)
 }
 #endif /* SMB311 */
 
-
 /*
  *
  *	SMB2 Worker functions follow:
@@ -398,6 +457,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	struct smb2_negotiate_req *req;
 	struct smb2_negotiate_rsp *rsp;
 	struct kvec iov[1];
+	struct kvec rsp_iov;
 	int rc = 0;
 	int resp_buftype;
 	struct TCP_Server_Info *server = ses->server;
@@ -416,7 +476,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	if (rc)
 		return rc;
 
-	req->hdr.SessionId = 0;
+	req->hdr.sync_hdr.SessionId = 0;
 
 	req->Dialects[0] = cpu_to_le16(ses->server->vals->protocol_id);
 
@@ -446,9 +506,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
 	/* 4 for rfc1002 length field */
 	iov[0].iov_len = get_rfc1002_length(req) + 4;
 
-	rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags);
-
-	rsp = (struct smb2_negotiate_rsp *)iov[0].iov_base;
+	rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(req);
+	rsp = (struct smb2_negotiate_rsp *)rsp_iov.iov_base;
 	/*
 	 * No tcon so can't do
 	 * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
@@ -627,14 +687,15 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data)
 	if (rc)
 		return rc;
 
-	req->hdr.SessionId = 0; /* First session, not a reauthenticate */
+	/* First session, not a reauthenticate */
+	req->hdr.sync_hdr.SessionId = 0;
 
 	/* if reconnect, we need to send previous sess id, otherwise it is 0 */
 	req->PreviousSessionId = sess_data->previous_session;
 
 	req->Flags = 0; /* MBZ */
 	/* to enable echos and oplocks */
-	req->hdr.CreditRequest = cpu_to_le16(3);
+	req->hdr.sync_hdr.CreditRequest = cpu_to_le16(3);
 
 	/* only one of SMB2 signing flags may be set in SMB2 request */
 	if (server->sign)
@@ -671,6 +732,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data)
 {
 	int rc;
 	struct smb2_sess_setup_req *req = sess_data->iov[0].iov_base;
+	struct kvec rsp_iov = { NULL, 0 };
 
 	/* Testing shows that buffer offset must be at location of Buffer[0] */
 	req->SecurityBufferOffset =
@@ -685,7 +747,9 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data)
 	rc = SendReceive2(sess_data->xid, sess_data->ses,
 				sess_data->iov, 2,
 				&sess_data->buf0_type,
-				CIFS_LOG_ERROR | CIFS_NEG_OP);
+				CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov);
+	cifs_small_buf_release(sess_data->iov[0].iov_base);
+	memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec));
 
 	return rc;
 }
@@ -697,15 +761,13 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
 	struct cifs_ses *ses = sess_data->ses;
 
 	mutex_lock(&ses->server->srv_mutex);
-	if (ses->server->sign && ses->server->ops->generate_signingkey) {
+	if (ses->server->ops->generate_signingkey) {
 		rc = ses->server->ops->generate_signingkey(ses);
-		kfree(ses->auth_key.response);
-		ses->auth_key.response = NULL;
 		if (rc) {
 			cifs_dbg(FYI,
 				"SMB3 session key generation failed\n");
 			mutex_unlock(&ses->server->srv_mutex);
-			goto keygen_exit;
+			return rc;
 		}
 	}
 	if (!ses->server->session_estab) {
@@ -719,12 +781,6 @@ SMB2_sess_establish_session(struct SMB2_sess_data *sess_data)
 	ses->status = CifsGood;
 	ses->need_reconnect = false;
 	spin_unlock(&GlobalMid_Lock);
-
-keygen_exit:
-	if (!ses->server->sign) {
-		kfree(ses->auth_key.response);
-		ses->auth_key.response = NULL;
-	}
 	return rc;
 }
 
@@ -781,11 +837,9 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
 		goto out_put_spnego_key;
 
 	rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
-	ses->Suid = rsp->hdr.SessionId;
+	ses->Suid = rsp->hdr.sync_hdr.SessionId;
 
 	ses->session_flags = le16_to_cpu(rsp->SessionFlags);
-	if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
-		cifs_dbg(VFS, "SMB3 encryption not supported yet\n");
 
 	rc = SMB2_sess_establish_session(sess_data);
 out_put_spnego_key:
@@ -859,7 +913,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
 
 	/* If true, rc here is expected and not an error */
 	if (sess_data->buf0_type != CIFS_NO_BUFFER &&
-		rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
+		rsp->hdr.sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED)
 		rc = 0;
 
 	if (rc)
@@ -880,10 +934,8 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data)
 	cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
 
 
-	ses->Suid = rsp->hdr.SessionId;
+	ses->Suid = rsp->hdr.sync_hdr.SessionId;
 	ses->session_flags = le16_to_cpu(rsp->SessionFlags);
-	if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
-		cifs_dbg(VFS, "SMB3 encryption not supported yet\n");
 
 out:
 	kfree(ntlmssp_blob);
@@ -916,7 +968,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
 		goto out;
 
 	req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base;
-	req->hdr.SessionId = ses->Suid;
+	req->hdr.sync_hdr.SessionId = ses->Suid;
 
 	rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses,
 					sess_data->nls_cp);
@@ -940,10 +992,8 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data)
 
 	rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base;
 
-	ses->Suid = rsp->hdr.SessionId;
+	ses->Suid = rsp->hdr.sync_hdr.SessionId;
 	ses->session_flags = le16_to_cpu(rsp->SessionFlags);
-	if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
-		cifs_dbg(VFS, "SMB3 encryption not supported yet\n");
 
 	rc = SMB2_sess_establish_session(sess_data);
 out:
@@ -1018,6 +1068,7 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
 	struct smb2_logoff_req *req; /* response is also trivial struct */
 	int rc = 0;
 	struct TCP_Server_Info *server;
+	int flags = 0;
 
 	cifs_dbg(FYI, "disconnect session %p\n", ses);
 
@@ -1035,11 +1086,15 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses)
 		return rc;
 
 	 /* since no tcon, smb2_init can not do this, so do here */
-	req->hdr.SessionId = ses->Suid;
-	if (server->sign)
-		req->hdr.Flags |= SMB2_FLAGS_SIGNED;
+	req->hdr.sync_hdr.SessionId = ses->Suid;
+
+	if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA)
+		flags |= CIFS_TRANSFORM_REQ;
+	else if (server->sign)
+		req->hdr.sync_hdr.Flags |= SMB2_FLAGS_SIGNED;
 
-	rc = SendReceiveNoRsp(xid, ses, (char *) &req->hdr, 0);
+	rc = SendReceiveNoRsp(xid, ses, (char *) req, flags);
+	cifs_small_buf_release(req);
 	/*
 	 * No tcon so can't do
 	 * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]);
@@ -1071,11 +1126,13 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	struct smb2_tree_connect_req *req;
 	struct smb2_tree_connect_rsp *rsp = NULL;
 	struct kvec iov[2];
+	struct kvec rsp_iov;
 	int rc = 0;
 	int resp_buftype;
 	int unc_path_len;
 	struct TCP_Server_Info *server;
 	__le16 *unc_path = NULL;
+	int flags = 0;
 
 	cifs_dbg(FYI, "TCON\n");
 
@@ -1087,12 +1144,6 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	if (tcon && tcon->bad_network_name)
 		return -ENOENT;
 
-	if ((tcon && tcon->seal) &&
-	    ((ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) == 0)) {
-		cifs_dbg(VFS, "encryption requested but no server support");
-		return -EOPNOTSUPP;
-	}
-
 	unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL);
 	if (unc_path == NULL)
 		return -ENOMEM;
@@ -1111,11 +1162,15 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	}
 
 	if (tcon == NULL) {
+		if ((ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA))
+			flags |= CIFS_TRANSFORM_REQ;
+
 		/* since no tcon, smb2_init can not do this, so do here */
-		req->hdr.SessionId = ses->Suid;
+		req->hdr.sync_hdr.SessionId = ses->Suid;
 		/* if (ses->server->sec_mode & SECMODE_SIGN_REQUIRED)
 			req->hdr.Flags |= SMB2_FLAGS_SIGNED; */
-	}
+	} else if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
 
 	iov[0].iov_base = (char *)req;
 	/* 4 for rfc1002 length field and 1 for pad */
@@ -1130,8 +1185,9 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 
 	inc_rfc1001_len(req, unc_path_len - 1 /* pad */);
 
-	rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, 0);
-	rsp = (struct smb2_tree_connect_rsp *)iov[0].iov_base;
+	rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(req);
+	rsp = (struct smb2_tree_connect_rsp *)rsp_iov.iov_base;
 
 	if (rc != 0) {
 		if (tcon) {
@@ -1142,7 +1198,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	}
 
 	if (tcon == NULL) {
-		ses->ipc_tid = rsp->hdr.TreeId;
+		ses->ipc_tid = rsp->hdr.sync_hdr.TreeId;
 		goto tcon_exit;
 	}
 
@@ -1165,15 +1221,18 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess);
 	tcon->tidStatus = CifsGood;
 	tcon->need_reconnect = false;
-	tcon->tid = rsp->hdr.TreeId;
+	tcon->tid = rsp->hdr.sync_hdr.TreeId;
 	strlcpy(tcon->treeName, tree, sizeof(tcon->treeName));
 
 	if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) &&
 	    ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
 		cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
+
+	if (tcon->seal &&
+	    !(tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION))
+		cifs_dbg(VFS, "Encryption is requested but not supported\n");
+
 	init_copy_chunk_defaults(tcon);
-	if (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA)
-		cifs_dbg(VFS, "Encrypted shares not supported");
 	if (tcon->ses->server->ops->validate_negotiate)
 		rc = tcon->ses->server->ops->validate_negotiate(xid, tcon);
 tcon_exit:
@@ -1182,7 +1241,7 @@ tcon_exit:
 	return rc;
 
 tcon_error_exit:
-	if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) {
+	if (rsp->hdr.sync_hdr.Status == STATUS_BAD_NETWORK_NAME) {
 		cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
 		if (tcon)
 			tcon->bad_network_name = true;
@@ -1197,6 +1256,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
 	int rc = 0;
 	struct TCP_Server_Info *server;
 	struct cifs_ses *ses = tcon->ses;
+	int flags = 0;
 
 	cifs_dbg(FYI, "Tree Disconnect\n");
 
@@ -1212,7 +1272,11 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
 	if (rc)
 		return rc;
 
-	rc = SendReceiveNoRsp(xid, ses, (char *)&req->hdr, 0);
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	rc = SendReceiveNoRsp(xid, ses, (char *)req, flags);
+	cifs_small_buf_release(req);
 	if (rc)
 		cifs_stats_fail_inc(tcon, SMB2_TREE_DISCONNECT_HE);
 
@@ -1474,14 +1538,16 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	struct cifs_tcon *tcon = oparms->tcon;
 	struct cifs_ses *ses = tcon->ses;
 	struct kvec iov[4];
+	struct kvec rsp_iov;
 	int resp_buftype;
 	int uni_path_len;
 	__le16 *copy_path = NULL;
 	int copy_size;
 	int rc = 0;
-	unsigned int num_iovecs = 2;
+	unsigned int n_iov = 2;
 	__u32 file_attributes = 0;
 	char *dhc_buf = NULL, *lc_buf = NULL;
+	int flags = 0;
 
 	cifs_dbg(FYI, "create/open\n");
 
@@ -1494,6 +1560,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	if (rc)
 		return rc;
 
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
 	if (oparms->create_options & CREATE_OPTION_READONLY)
 		file_attributes |= ATTR_READONLY;
 	if (oparms->create_options & CREATE_OPTION_SPECIAL)
@@ -1544,25 +1613,25 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	    *oplock == SMB2_OPLOCK_LEVEL_NONE)
 		req->RequestedOplockLevel = *oplock;
 	else {
-		rc = add_lease_context(server, iov, &num_iovecs, oplock);
+		rc = add_lease_context(server, iov, &n_iov, oplock);
 		if (rc) {
 			cifs_small_buf_release(req);
 			kfree(copy_path);
 			return rc;
 		}
-		lc_buf = iov[num_iovecs-1].iov_base;
+		lc_buf = iov[n_iov-1].iov_base;
 	}
 
 	if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
 		/* need to set Next field of lease context if we request it */
 		if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
 			struct create_context *ccontext =
-			    (struct create_context *)iov[num_iovecs-1].iov_base;
+			    (struct create_context *)iov[n_iov-1].iov_base;
 			ccontext->Next =
 				cpu_to_le32(server->vals->create_lease_size);
 		}
 
-		rc = add_durable_context(iov, &num_iovecs, oparms,
+		rc = add_durable_context(iov, &n_iov, oparms,
 					tcon->use_persistent);
 		if (rc) {
 			cifs_small_buf_release(req);
@@ -1570,11 +1639,12 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 			kfree(lc_buf);
 			return rc;
 		}
-		dhc_buf = iov[num_iovecs-1].iov_base;
+		dhc_buf = iov[n_iov-1].iov_base;
 	}
 
-	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
-	rsp = (struct smb2_create_rsp *)iov[0].iov_base;
+	rc = SendReceive2(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(req);
+	rsp = (struct smb2_create_rsp *)rsp_iov.iov_base;
 
 	if (rc != 0) {
 		cifs_stats_fail_inc(tcon, SMB2_CREATE_HE);
@@ -1618,12 +1688,15 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 {
 	struct smb2_ioctl_req *req;
 	struct smb2_ioctl_rsp *rsp;
+	struct smb2_sync_hdr *shdr;
 	struct TCP_Server_Info *server;
 	struct cifs_ses *ses;
 	struct kvec iov[2];
+	struct kvec rsp_iov;
 	int resp_buftype;
-	int num_iovecs;
+	int n_iov;
 	int rc = 0;
+	int flags = 0;
 
 	cifs_dbg(FYI, "SMB2 IOCTL\n");
 
@@ -1648,6 +1721,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	if (rc)
 		return rc;
 
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
 	req->CtlCode = cpu_to_le32(opcode);
 	req->PersistentFileId = persistent_fid;
 	req->VolatileFileId = volatile_fid;
@@ -1659,9 +1735,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 		       cpu_to_le32(offsetof(struct smb2_ioctl_req, Buffer) - 4);
 		iov[1].iov_base = in_data;
 		iov[1].iov_len = indatalen;
-		num_iovecs = 2;
+		n_iov = 2;
 	} else
-		num_iovecs = 1;
+		n_iov = 1;
 
 	req->OutputOffset = 0;
 	req->OutputCount = 0; /* MBZ */
@@ -1698,8 +1774,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 		iov[0].iov_len = get_rfc1002_length(req) + 4;
 
 
-	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
-	rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
+	rc = SendReceive2(xid, ses, iov, n_iov, &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(req);
+	rsp = (struct smb2_ioctl_rsp *)rsp_iov.iov_base;
 
 	if ((rc != 0) && (rc != -EINVAL)) {
 		cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
@@ -1742,9 +1819,8 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 		goto ioctl_exit;
 	}
 
-	memcpy(*out_data,
-	       (char *)&rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
-	       *plen);
+	shdr = get_sync_hdr(rsp);
+	memcpy(*out_data, (char *)shdr + le32_to_cpu(rsp->OutputOffset), *plen);
 ioctl_exit:
 	free_rsp_buf(resp_buftype, rsp);
 	return rc;
@@ -1784,8 +1860,10 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	struct TCP_Server_Info *server;
 	struct cifs_ses *ses = tcon->ses;
 	struct kvec iov[1];
+	struct kvec rsp_iov;
 	int resp_buftype;
 	int rc = 0;
+	int flags = 0;
 
 	cifs_dbg(FYI, "Close\n");
 
@@ -1798,6 +1876,9 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
 	req->PersistentFileId = persistent_fid;
 	req->VolatileFileId = volatile_fid;
 
@@ -1805,8 +1886,9 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	/* 4 for rfc1002 length field */
 	iov[0].iov_len = get_rfc1002_length(req) + 4;
 
-	rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
-	rsp = (struct smb2_close_rsp *)iov[0].iov_base;
+	rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(req);
+	rsp = (struct smb2_close_rsp *)rsp_iov.iov_base;
 
 	if (rc != 0) {
 		cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
@@ -1885,10 +1967,12 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
 	struct smb2_query_info_req *req;
 	struct smb2_query_info_rsp *rsp = NULL;
 	struct kvec iov[2];
+	struct kvec rsp_iov;
 	int rc = 0;
 	int resp_buftype;
 	struct TCP_Server_Info *server;
 	struct cifs_ses *ses = tcon->ses;
+	int flags = 0;
 
 	cifs_dbg(FYI, "Query Info\n");
 
@@ -1901,6 +1985,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
 	req->InfoType = SMB2_O_INFO_FILE;
 	req->FileInfoClass = info_class;
 	req->PersistentFileId = persistent_fid;
@@ -1914,8 +2001,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
 	/* 4 for rfc1002 length field */
 	iov[0].iov_len = get_rfc1002_length(req) + 4;
 
-	rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
-	rsp = (struct smb2_query_info_rsp *)iov[0].iov_base;
+	rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(req);
+	rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
 
 	if (rc) {
 		cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
@@ -1963,11 +2051,11 @@ static void
 smb2_echo_callback(struct mid_q_entry *mid)
 {
 	struct TCP_Server_Info *server = mid->callback_data;
-	struct smb2_echo_rsp *smb2 = (struct smb2_echo_rsp *)mid->resp_buf;
+	struct smb2_echo_rsp *rsp = (struct smb2_echo_rsp *)mid->resp_buf;
 	unsigned int credits_received = 1;
 
 	if (mid->mid_state == MID_RESPONSE_RECEIVED)
-		credits_received = le16_to_cpu(smb2->hdr.CreditRequest);
+		credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest);
 
 	mutex_lock(&server->srv_mutex);
 	DeleteMidQEntry(mid);
@@ -2029,9 +2117,9 @@ SMB2_echo(struct TCP_Server_Info *server)
 {
 	struct smb2_echo_req *req;
 	int rc = 0;
-	struct kvec iov;
-	struct smb_rqst rqst = { .rq_iov = &iov,
-				 .rq_nvec = 1 };
+	struct kvec iov[2];
+	struct smb_rqst rqst = { .rq_iov = iov,
+				 .rq_nvec = 2 };
 
 	cifs_dbg(FYI, "In echo request\n");
 
@@ -2045,14 +2133,16 @@ SMB2_echo(struct TCP_Server_Info *server)
 	if (rc)
 		return rc;
 
-	req->hdr.CreditRequest = cpu_to_le16(1);
+	req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1);
 
-	iov.iov_base = (char *)req;
 	/* 4 for rfc1002 length field */
-	iov.iov_len = get_rfc1002_length(req) + 4;
+	iov[0].iov_len = 4;
+	iov[0].iov_base = (char *)req;
+	iov[1].iov_len = get_rfc1002_length(req);
+	iov[1].iov_base = (char *)req + 4;
 
-	rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, server,
-			     CIFS_ECHO_OP);
+	rc = cifs_call_async(server, &rqst, NULL, smb2_echo_callback, NULL,
+			     server, CIFS_ECHO_OP);
 	if (rc)
 		cifs_dbg(FYI, "Echo request failed: %d\n", rc);
 
@@ -2068,8 +2158,10 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	struct TCP_Server_Info *server;
 	struct cifs_ses *ses = tcon->ses;
 	struct kvec iov[1];
+	struct kvec rsp_iov;
 	int resp_buftype;
 	int rc = 0;
+	int flags = 0;
 
 	cifs_dbg(FYI, "Flush\n");
 
@@ -2082,6 +2174,9 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	if (rc)
 		return rc;
 
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
 	req->PersistentFileId = persistent_fid;
 	req->VolatileFileId = volatile_fid;
 
@@ -2089,12 +2184,13 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	/* 4 for rfc1002 length field */
 	iov[0].iov_len = get_rfc1002_length(req) + 4;
 
-	rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, 0);
+	rc = SendReceive2(xid, ses, iov, 1, &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(req);
 
 	if (rc != 0)
 		cifs_stats_fail_inc(tcon, SMB2_FLUSH_HE);
 
-	free_rsp_buf(resp_buftype, iov[0].iov_base);
+	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
 	return rc;
 }
 
@@ -2103,19 +2199,23 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
  * have the end_of_chain boolean set to true.
  */
 static int
-smb2_new_read_req(struct kvec *iov, struct cifs_io_parms *io_parms,
-		  unsigned int remaining_bytes, int request_type)
+smb2_new_read_req(void **buf, unsigned int *total_len,
+		  struct cifs_io_parms *io_parms, unsigned int remaining_bytes,
+		  int request_type)
 {
 	int rc = -EACCES;
-	struct smb2_read_req *req = NULL;
+	struct smb2_read_plain_req *req = NULL;
+	struct smb2_sync_hdr *shdr;
 
-	rc = small_smb2_init(SMB2_READ, io_parms->tcon, (void **) &req);
+	rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, (void **) &req,
+				 total_len);
 	if (rc)
 		return rc;
 	if (io_parms->tcon->ses->server == NULL)
 		return -ECONNABORTED;
 
-	req->hdr.ProcessId = cpu_to_le32(io_parms->pid);
+	shdr = &req->sync_hdr;
+	shdr->ProcessId = cpu_to_le32(io_parms->pid);
 
 	req->PersistentFileId = io_parms->persistent_fid;
 	req->VolatileFileId = io_parms->volatile_fid;
@@ -2128,19 +2228,19 @@ smb2_new_read_req(struct kvec *iov, struct cifs_io_parms *io_parms,
 
 	if (request_type & CHAINED_REQUEST) {
 		if (!(request_type & END_OF_CHAIN)) {
-			/* 4 for rfc1002 length field */
-			req->hdr.NextCommand =
-				cpu_to_le32(get_rfc1002_length(req) + 4);
+			/* next 8-byte aligned request */
+			*total_len = DIV_ROUND_UP(*total_len, 8) * 8;
+			shdr->NextCommand = cpu_to_le32(*total_len);
 		} else /* END_OF_CHAIN */
-			req->hdr.NextCommand = 0;
+			shdr->NextCommand = 0;
 		if (request_type & RELATED_REQUEST) {
-			req->hdr.Flags |= SMB2_FLAGS_RELATED_OPERATIONS;
+			shdr->Flags |= SMB2_FLAGS_RELATED_OPERATIONS;
 			/*
 			 * Related requests use info from previous read request
 			 * in chain.
 			 */
-			req->hdr.SessionId = 0xFFFFFFFF;
-			req->hdr.TreeId = 0xFFFFFFFF;
+			shdr->SessionId = 0xFFFFFFFF;
+			shdr->TreeId = 0xFFFFFFFF;
 			req->PersistentFileId = 0xFFFFFFFF;
 			req->VolatileFileId = 0xFFFFFFFF;
 		}
@@ -2150,9 +2250,7 @@ smb2_new_read_req(struct kvec *iov, struct cifs_io_parms *io_parms,
 	else
 		req->RemainingBytes = 0;
 
-	iov[0].iov_base = (char *)req;
-	/* 4 for rfc1002 length field */
-	iov[0].iov_len = get_rfc1002_length(req) + 4;
+	*buf = req;
 	return rc;
 }
 
@@ -2162,10 +2260,11 @@ smb2_readv_callback(struct mid_q_entry *mid)
 	struct cifs_readdata *rdata = mid->callback_data;
 	struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink);
 	struct TCP_Server_Info *server = tcon->ses->server;
-	struct smb2_hdr *buf = (struct smb2_hdr *)rdata->iov.iov_base;
+	struct smb2_sync_hdr *shdr =
+				(struct smb2_sync_hdr *)rdata->iov[1].iov_base;
 	unsigned int credits_received = 1;
-	struct smb_rqst rqst = { .rq_iov = &rdata->iov,
-				 .rq_nvec = 1,
+	struct smb_rqst rqst = { .rq_iov = rdata->iov,
+				 .rq_nvec = 2,
 				 .rq_pages = rdata->pages,
 				 .rq_npages = rdata->nr_pages,
 				 .rq_pagesz = rdata->pagesz,
@@ -2177,9 +2276,9 @@ smb2_readv_callback(struct mid_q_entry *mid)
 
 	switch (mid->mid_state) {
 	case MID_RESPONSE_RECEIVED:
-		credits_received = le16_to_cpu(buf->CreditRequest);
+		credits_received = le16_to_cpu(shdr->CreditRequest);
 		/* result already set, check signature */
-		if (server->sign) {
+		if (server->sign && !mid->decrypted) {
 			int rc;
 
 			rc = smb2_verify_signature(&rqst, server);
@@ -2216,16 +2315,19 @@ smb2_readv_callback(struct mid_q_entry *mid)
 	add_credits(server, credits_received, 0);
 }
 
-/* smb2_async_readv - send an async write, and set up mid to handle result */
+/* smb2_async_readv - send an async read, and set up mid to handle result */
 int
 smb2_async_readv(struct cifs_readdata *rdata)
 {
 	int rc, flags = 0;
-	struct smb2_hdr *buf;
+	char *buf;
+	struct smb2_sync_hdr *shdr;
 	struct cifs_io_parms io_parms;
-	struct smb_rqst rqst = { .rq_iov = &rdata->iov,
-				 .rq_nvec = 1 };
+	struct smb_rqst rqst = { .rq_iov = rdata->iov,
+				 .rq_nvec = 2 };
 	struct TCP_Server_Info *server;
+	unsigned int total_len;
+	__be32 req_len;
 
 	cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
 		 __func__, rdata->offset, rdata->bytes);
@@ -2239,7 +2341,7 @@ smb2_async_readv(struct cifs_readdata *rdata)
 
 	server = io_parms.tcon->ses->server;
 
-	rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0);
+	rc = smb2_new_read_req((void **) &buf, &total_len, &io_parms, 0, 0);
 	if (rc) {
 		if (rc == -EAGAIN && rdata->credits) {
 			/* credits was reset by reconnect */
@@ -2252,26 +2354,34 @@ smb2_async_readv(struct cifs_readdata *rdata)
 		return rc;
 	}
 
-	buf = (struct smb2_hdr *)rdata->iov.iov_base;
-	/* 4 for rfc1002 length field */
-	rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4;
+	if (encryption_required(io_parms.tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	req_len = cpu_to_be32(total_len);
+
+	rdata->iov[0].iov_base = &req_len;
+	rdata->iov[0].iov_len = sizeof(__be32);
+	rdata->iov[1].iov_base = buf;
+	rdata->iov[1].iov_len = total_len;
+
+	shdr = (struct smb2_sync_hdr *)buf;
 
 	if (rdata->credits) {
-		buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
+		shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
 						SMB2_MAX_BUFFER_SIZE));
-		buf->CreditRequest = buf->CreditCharge;
+		shdr->CreditRequest = shdr->CreditCharge;
 		spin_lock(&server->req_lock);
 		server->credits += rdata->credits -
-						le16_to_cpu(buf->CreditCharge);
+						le16_to_cpu(shdr->CreditCharge);
 		spin_unlock(&server->req_lock);
 		wake_up(&server->request_q);
-		flags = CIFS_HAS_CREDITS;
+		flags |= CIFS_HAS_CREDITS;
 	}
 
 	kref_get(&rdata->refcount);
 	rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
 			     cifs_readv_receive, smb2_readv_callback,
-			     rdata, flags);
+			     smb3_handle_read_data, rdata, flags);
 	if (rc) {
 		kref_put(&rdata->refcount, cifs_readdata_release);
 		cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
@@ -2286,21 +2396,41 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 	  unsigned int *nbytes, char **buf, int *buf_type)
 {
 	int resp_buftype, rc = -EACCES;
+	struct smb2_read_plain_req *req = NULL;
 	struct smb2_read_rsp *rsp = NULL;
-	struct kvec iov[1];
+	struct smb2_sync_hdr *shdr;
+	struct kvec iov[2];
+	struct kvec rsp_iov;
+	unsigned int total_len;
+	__be32 req_len;
+	struct smb_rqst rqst = { .rq_iov = iov,
+				 .rq_nvec = 2 };
+	int flags = CIFS_LOG_ERROR;
+	struct cifs_ses *ses = io_parms->tcon->ses;
 
 	*nbytes = 0;
-	rc = smb2_new_read_req(iov, io_parms, 0, 0);
+	rc = smb2_new_read_req((void **)&req, &total_len, io_parms, 0, 0);
 	if (rc)
 		return rc;
 
-	rc = SendReceive2(xid, io_parms->tcon->ses, iov, 1,
-			  &resp_buftype, CIFS_LOG_ERROR);
+	if (encryption_required(io_parms->tcon))
+		flags |= CIFS_TRANSFORM_REQ;
 
-	rsp = (struct smb2_read_rsp *)iov[0].iov_base;
+	req_len = cpu_to_be32(total_len);
 
-	if (rsp->hdr.Status == STATUS_END_OF_FILE) {
-		free_rsp_buf(resp_buftype, iov[0].iov_base);
+	iov[0].iov_base = &req_len;
+	iov[0].iov_len = sizeof(__be32);
+	iov[1].iov_base = req;
+	iov[1].iov_len = total_len;
+
+	rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(req);
+
+	rsp = (struct smb2_read_rsp *)rsp_iov.iov_base;
+	shdr = get_sync_hdr(rsp);
+
+	if (shdr->Status == STATUS_END_OF_FILE) {
+		free_rsp_buf(resp_buftype, rsp_iov.iov_base);
 		return 0;
 	}
 
@@ -2319,11 +2449,10 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 	}
 
 	if (*buf) {
-		memcpy(*buf, (char *)&rsp->hdr.ProtocolId + rsp->DataOffset,
-		       *nbytes);
-		free_rsp_buf(resp_buftype, iov[0].iov_base);
+		memcpy(*buf, (char *)shdr + rsp->DataOffset, *nbytes);
+		free_rsp_buf(resp_buftype, rsp_iov.iov_base);
 	} else if (resp_buftype != CIFS_NO_BUFFER) {
-		*buf = iov[0].iov_base;
+		*buf = rsp_iov.iov_base;
 		if (resp_buftype == CIFS_SMALL_BUFFER)
 			*buf_type = CIFS_SMALL_BUFFER;
 		else if (resp_buftype == CIFS_LARGE_BUFFER)
@@ -2348,7 +2477,7 @@ smb2_writev_callback(struct mid_q_entry *mid)
 
 	switch (mid->mid_state) {
 	case MID_RESPONSE_RECEIVED:
-		credits_received = le16_to_cpu(rsp->hdr.CreditRequest);
+		credits_received = le16_to_cpu(rsp->hdr.sync_hdr.CreditRequest);
 		wdata->result = smb2_check_receive(mid, tcon->ses->server, 0);
 		if (wdata->result != 0)
 			break;
@@ -2394,10 +2523,11 @@ smb2_async_writev(struct cifs_writedata *wdata,
 {
 	int rc = -EACCES, flags = 0;
 	struct smb2_write_req *req = NULL;
+	struct smb2_sync_hdr *shdr;
 	struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
 	struct TCP_Server_Info *server = tcon->ses->server;
-	struct kvec iov;
-	struct smb_rqst rqst;
+	struct kvec iov[2];
+	struct smb_rqst rqst = { };
 
 	rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req);
 	if (rc) {
@@ -2412,7 +2542,11 @@ smb2_async_writev(struct cifs_writedata *wdata,
 		goto async_writev_out;
 	}
 
-	req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid);
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	shdr = get_sync_hdr(req);
+	shdr->ProcessId = cpu_to_le32(wdata->cfile->pid);
 
 	req->PersistentFileId = wdata->cfile->fid.persistent_fid;
 	req->VolatileFileId = wdata->cfile->fid.volatile_fid;
@@ -2426,11 +2560,13 @@ smb2_async_writev(struct cifs_writedata *wdata,
 	req->RemainingBytes = 0;
 
 	/* 4 for rfc1002 length field and 1 for Buffer */
-	iov.iov_len = get_rfc1002_length(req) + 4 - 1;
-	iov.iov_base = req;
+	iov[0].iov_len = 4;
+	iov[0].iov_base = req;
+	iov[1].iov_len = get_rfc1002_length(req) - 1;
+	iov[1].iov_base = (char *)req + 4;
 
-	rqst.rq_iov = &iov;
-	rqst.rq_nvec = 1;
+	rqst.rq_iov = iov;
+	rqst.rq_nvec = 2;
 	rqst.rq_pages = wdata->pages;
 	rqst.rq_npages = wdata->nr_pages;
 	rqst.rq_pagesz = wdata->pagesz;
@@ -2444,20 +2580,20 @@ smb2_async_writev(struct cifs_writedata *wdata,
 	inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);
 
 	if (wdata->credits) {
-		req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
+		shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
 						    SMB2_MAX_BUFFER_SIZE));
-		req->hdr.CreditRequest = req->hdr.CreditCharge;
+		shdr->CreditRequest = shdr->CreditCharge;
 		spin_lock(&server->req_lock);
 		server->credits += wdata->credits -
-					le16_to_cpu(req->hdr.CreditCharge);
+						le16_to_cpu(shdr->CreditCharge);
 		spin_unlock(&server->req_lock);
 		wake_up(&server->request_q);
-		flags = CIFS_HAS_CREDITS;
+		flags |= CIFS_HAS_CREDITS;
 	}
 
 	kref_get(&wdata->refcount);
-	rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata,
-			     flags);
+	rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, NULL,
+			     wdata, flags);
 
 	if (rc) {
 		kref_put(&wdata->refcount, release);
@@ -2483,6 +2619,9 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
 	struct smb2_write_req *req = NULL;
 	struct smb2_write_rsp *rsp = NULL;
 	int resp_buftype;
+	struct kvec rsp_iov;
+	int flags = 0;
+
 	*nbytes = 0;
 
 	if (n_vec < 1)
@@ -2495,7 +2634,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
 	if (io_parms->tcon->ses->server == NULL)
 		return -ECONNABORTED;
 
-	req->hdr.ProcessId = cpu_to_le32(io_parms->pid);
+	if (encryption_required(io_parms->tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	req->hdr.sync_hdr.ProcessId = cpu_to_le32(io_parms->pid);
 
 	req->PersistentFileId = io_parms->persistent_fid;
 	req->VolatileFileId = io_parms->volatile_fid;
@@ -2517,8 +2659,9 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
 	inc_rfc1001_len(req, io_parms->length - 1 /* Buffer */);
 
 	rc = SendReceive2(xid, io_parms->tcon->ses, iov, n_vec + 1,
-			  &resp_buftype, 0);
-	rsp = (struct smb2_write_rsp *)iov[0].iov_base;
+			  &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(req);
+	rsp = (struct smb2_write_rsp *)rsp_iov.iov_base;
 
 	if (rc) {
 		cifs_stats_fail_inc(io_parms->tcon, SMB2_WRITE_HE);
@@ -2581,6 +2724,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 	struct smb2_query_directory_req *req;
 	struct smb2_query_directory_rsp *rsp = NULL;
 	struct kvec iov[2];
+	struct kvec rsp_iov;
 	int rc = 0;
 	int len;
 	int resp_buftype = CIFS_NO_BUFFER;
@@ -2591,6 +2735,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 	char *end_of_smb;
 	unsigned int output_size = CIFSMaxBufSize;
 	size_t info_buf_size;
+	int flags = 0;
 
 	if (ses && (ses->server))
 		server = ses->server;
@@ -2601,6 +2746,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
 	switch (srch_inf->info_level) {
 	case SMB_FIND_FILE_DIRECTORY_INFO:
 		req->FileInformationClass = FILE_DIRECTORY_INFORMATION;
@@ -2645,11 +2793,13 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
 
 	inc_rfc1001_len(req, len - 1 /* Buffer */);
 
-	rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, 0);
-	rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base;
+	rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(req);
+	rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
 
 	if (rc) {
-		if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) {
+		if (rc == -ENODATA &&
+		    rsp->hdr.sync_hdr.Status == STATUS_NO_MORE_FILES) {
 			srch_inf->endOfSearch = true;
 			rc = 0;
 		}
@@ -2705,11 +2855,13 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
 	struct smb2_set_info_req *req;
 	struct smb2_set_info_rsp *rsp = NULL;
 	struct kvec *iov;
+	struct kvec rsp_iov;
 	int rc = 0;
 	int resp_buftype;
 	unsigned int i;
 	struct TCP_Server_Info *server;
 	struct cifs_ses *ses = tcon->ses;
+	int flags = 0;
 
 	if (ses && (ses->server))
 		server = ses->server;
@@ -2729,7 +2881,10 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
 		return rc;
 	}
 
-	req->hdr.ProcessId = cpu_to_le32(pid);
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	req->hdr.sync_hdr.ProcessId = cpu_to_le32(pid);
 
 	req->InfoType = SMB2_O_INFO_FILE;
 	req->FileInfoClass = info_class;
@@ -2756,8 +2911,9 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
 		iov[i].iov_len = size[i];
 	}
 
-	rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0);
-	rsp = (struct smb2_set_info_rsp *)iov[0].iov_base;
+	rc = SendReceive2(xid, ses, iov, num, &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(req);
+	rsp = (struct smb2_set_info_rsp *)rsp_iov.iov_base;
 
 	if (rc != 0)
 		cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE);
@@ -2885,20 +3041,23 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
 {
 	int rc;
 	struct smb2_oplock_break *req = NULL;
+	int flags = CIFS_OBREAK_OP;
 
 	cifs_dbg(FYI, "SMB2_oplock_break\n");
 	rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
-
 	if (rc)
 		return rc;
 
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
 	req->VolatileFid = volatile_fid;
 	req->PersistentFid = persistent_fid;
 	req->OplockLevel = oplock_level;
-	req->hdr.CreditRequest = cpu_to_le16(1);
+	req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1);
 
-	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP);
-	/* SMB2 buffer freed by function above */
+	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, flags);
+	cifs_small_buf_release(req);
 
 	if (rc) {
 		cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
@@ -2958,10 +3117,12 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 {
 	struct smb2_query_info_rsp *rsp = NULL;
 	struct kvec iov;
+	struct kvec rsp_iov;
 	int rc = 0;
 	int resp_buftype;
 	struct cifs_ses *ses = tcon->ses;
 	struct smb2_fs_full_size_info *info = NULL;
+	int flags = 0;
 
 	rc = build_qfs_info_req(&iov, tcon, FS_FULL_SIZE_INFORMATION,
 				sizeof(struct smb2_fs_full_size_info),
@@ -2969,12 +3130,16 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(iov.iov_base);
 	if (rc) {
 		cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
 		goto qfsinf_exit;
 	}
-	rsp = (struct smb2_query_info_rsp *)iov.iov_base;
+	rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
 
 	info = (struct smb2_fs_full_size_info *)(4 /* RFC1001 len */ +
 		le16_to_cpu(rsp->OutputBufferOffset) + (char *)&rsp->hdr);
@@ -2985,7 +3150,7 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
 		copy_fs_info_to_kstatfs(info, fsdata);
 
 qfsinf_exit:
-	free_rsp_buf(resp_buftype, iov.iov_base);
+	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
 	return rc;
 }
 
@@ -2995,10 +3160,12 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
 {
 	struct smb2_query_info_rsp *rsp = NULL;
 	struct kvec iov;
+	struct kvec rsp_iov;
 	int rc = 0;
 	int resp_buftype, max_len, min_len;
 	struct cifs_ses *ses = tcon->ses;
 	unsigned int rsp_len, offset;
+	int flags = 0;
 
 	if (level == FS_DEVICE_INFORMATION) {
 		max_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
@@ -3019,12 +3186,16 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, 0);
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	rc = SendReceive2(xid, ses, &iov, 1, &resp_buftype, flags, &rsp_iov);
+	cifs_small_buf_release(iov.iov_base);
 	if (rc) {
 		cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE);
 		goto qfsattr_exit;
 	}
-	rsp = (struct smb2_query_info_rsp *)iov.iov_base;
+	rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
 
 	rsp_len = le32_to_cpu(rsp->OutputBufferLength);
 	offset = le16_to_cpu(rsp->OutputBufferOffset);
@@ -3048,7 +3219,7 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
 	}
 
 qfsattr_exit:
-	free_rsp_buf(resp_buftype, iov.iov_base);
+	free_rsp_buf(resp_buftype, rsp_iov.iov_base);
 	return rc;
 }
 
@@ -3060,8 +3231,10 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc = 0;
 	struct smb2_lock_req *req = NULL;
 	struct kvec iov[2];
+	struct kvec rsp_iov;
 	int resp_buf_type;
 	unsigned int count;
+	int flags = CIFS_NO_RESP;
 
 	cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock);
 
@@ -3069,7 +3242,10 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	req->hdr.ProcessId = cpu_to_le32(pid);
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	req->hdr.sync_hdr.ProcessId = cpu_to_le32(pid);
 	req->LockCount = cpu_to_le16(num_lock);
 
 	req->PersistentFileId = persist_fid;
@@ -3085,7 +3261,9 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 	iov[1].iov_len = count;
 
 	cifs_stats_inc(&tcon->stats.cifs_stats.num_locks);
-	rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, CIFS_NO_RESP);
+	rc = SendReceive2(xid, tcon->ses, iov, 2, &resp_buf_type, flags,
+			  &rsp_iov);
+	cifs_small_buf_release(req);
 	if (rc) {
 		cifs_dbg(FYI, "Send error in smb2_lockv = %d\n", rc);
 		cifs_stats_fail_inc(tcon, SMB2_LOCK_HE);
@@ -3117,22 +3295,25 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
 {
 	int rc;
 	struct smb2_lease_ack *req = NULL;
+	int flags = CIFS_OBREAK_OP;
 
 	cifs_dbg(FYI, "SMB2_lease_break\n");
 	rc = small_smb2_init(SMB2_OPLOCK_BREAK, tcon, (void **) &req);
-
 	if (rc)
 		return rc;
 
-	req->hdr.CreditRequest = cpu_to_le16(1);
+	if (encryption_required(tcon))
+		flags |= CIFS_TRANSFORM_REQ;
+
+	req->hdr.sync_hdr.CreditRequest = cpu_to_le16(1);
 	req->StructureSize = cpu_to_le16(36);
 	inc_rfc1001_len(req, 12);
 
 	memcpy(req->LeaseKey, lease_key, 16);
 	req->LeaseState = lease_state;
 
-	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, CIFS_OBREAK_OP);
-	/* SMB2 buffer freed by function above */
+	rc = SendReceiveNoRsp(xid, tcon->ses, (char *) req, flags);
+	cifs_small_buf_release(req);
 
 	if (rc) {
 		cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index dc0d141f33e2..c03b252501a1 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -101,10 +101,7 @@
 
 #define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64)
 
-struct smb2_hdr {
-	__be32 smb2_buf_length;	/* big endian on wire */
-				/* length is only two or three bytes - with
-				 one or two byte type preceding it that MBZ */
+struct smb2_sync_hdr {
 	__le32 ProtocolId;	/* 0xFE 'S' 'M' 'B' */
 	__le16 StructureSize;	/* 64 */
 	__le16 CreditCharge;	/* MBZ */
@@ -120,16 +117,31 @@ struct smb2_hdr {
 	__u8   Signature[16];
 } __packed;
 
+struct smb2_sync_pdu {
+	struct smb2_sync_hdr sync_hdr;
+	__le16 StructureSize2; /* size of wct area (varies, request specific) */
+} __packed;
+
+struct smb2_hdr {
+	__be32 smb2_buf_length;	/* big endian on wire */
+				/* length is only two or three bytes - with */
+				/* one or two byte type preceding it that MBZ */
+	struct smb2_sync_hdr sync_hdr;
+} __packed;
+
 struct smb2_pdu {
 	struct smb2_hdr hdr;
 	__le16 StructureSize2; /* size of wct area (varies, request specific) */
 } __packed;
 
+#define SMB3_AES128CMM_NONCE 11
+#define SMB3_AES128GCM_NONCE 12
+
 struct smb2_transform_hdr {
 	__be32 smb2_buf_length;	/* big endian on wire */
 				/* length is only two or three bytes - with
 				 one or two byte type preceding it that MBZ */
-	__u8   ProtocolId[4];	/* 0xFD 'S' 'M' 'B' */
+	__le32 ProtocolId;	/* 0xFD 'S' 'M' 'B' */
 	__u8   Signature[16];
 	__u8   Nonce[16];
 	__le32 OriginalMessageSize;
@@ -814,8 +826,9 @@ struct smb2_flush_rsp {
 #define SMB2_CHANNEL_RDMA_V1		0x00000001 /* SMB3 or later */
 #define SMB2_CHANNEL_RDMA_V1_INVALIDATE 0x00000001 /* SMB3.02 or later */
 
-struct smb2_read_req {
-	struct smb2_hdr hdr;
+/* SMB2 read request without RFC1001 length at the beginning */
+struct smb2_read_plain_req {
+	struct smb2_sync_hdr sync_hdr;
 	__le16 StructureSize; /* Must be 49 */
 	__u8   Padding; /* offset from start of SMB2 header to place read */
 	__u8   Flags; /* MBZ unless SMB3.02 or later */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index f2d511a6971b..85fc7a789334 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -56,6 +56,10 @@ extern void smb2_echo_request(struct work_struct *work);
 extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
 extern bool smb2_is_valid_oplock_break(char *buffer,
 				       struct TCP_Server_Info *srv);
+extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server,
+					  __u64 ses_id);
+extern int smb3_handle_read_data(struct TCP_Server_Info *server,
+				 struct mid_q_entry *mid);
 
 extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
 				   struct smb2_file_all_info *src);
@@ -97,6 +101,7 @@ extern int smb2_unlock_range(struct cifsFileInfo *cfile,
 			     struct file_lock *flock, const unsigned int xid);
 extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile);
 extern void smb2_reconnect_server(struct work_struct *work);
+extern int smb3_crypto_aead_allocate(struct TCP_Server_Info *server);
 
 /*
  * SMB2 Worker functions - most of protocol specific implementation details
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index bc9a7b634643..7c3bb1bd7eed 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -31,6 +31,7 @@
 #include <asm/processor.h>
 #include <linux/mempool.h>
 #include <linux/highmem.h>
+#include <crypto/aead.h>
 #include "smb2pdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
@@ -114,14 +115,14 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
 	return 0;
 }
 
-static struct cifs_ses *
-smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server)
+struct cifs_ses *
+smb2_find_smb_ses(struct TCP_Server_Info *server, __u64 ses_id)
 {
 	struct cifs_ses *ses;
 
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
-		if (ses->Suid != smb2hdr->SessionId)
+		if (ses->Suid != ses_id)
 			continue;
 		spin_unlock(&cifs_tcp_ses_lock);
 		return ses;
@@ -131,7 +132,6 @@ smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server)
 	return NULL;
 }
 
-
 int
 smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 {
@@ -139,17 +139,17 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	unsigned char smb2_signature[SMB2_HMACSHA256_SIZE];
 	unsigned char *sigptr = smb2_signature;
 	struct kvec *iov = rqst->rq_iov;
-	struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
+	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[1].iov_base;
 	struct cifs_ses *ses;
 
-	ses = smb2_find_smb_ses(smb2_pdu, server);
+	ses = smb2_find_smb_ses(server, shdr->SessionId);
 	if (!ses) {
 		cifs_dbg(VFS, "%s: Could not find session\n", __func__);
 		return 0;
 	}
 
 	memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
-	memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
+	memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
 
 	rc = smb2_crypto_shash_allocate(server);
 	if (rc) {
@@ -174,7 +174,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 		&server->secmech.sdeschmacsha256->shash);
 
 	if (!rc)
-		memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+		memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE);
 
 	return rc;
 }
@@ -356,17 +356,17 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	unsigned char smb3_signature[SMB2_CMACAES_SIZE];
 	unsigned char *sigptr = smb3_signature;
 	struct kvec *iov = rqst->rq_iov;
-	struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
+	struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[1].iov_base;
 	struct cifs_ses *ses;
 
-	ses = smb2_find_smb_ses(smb2_pdu, server);
+	ses = smb2_find_smb_ses(server, shdr->SessionId);
 	if (!ses) {
 		cifs_dbg(VFS, "%s: Could not find session\n", __func__);
 		return 0;
 	}
 
 	memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
-	memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
+	memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE);
 
 	rc = crypto_shash_setkey(server->secmech.cmacaes,
 		ses->smb3signingkey, SMB2_CMACAES_SIZE);
@@ -391,7 +391,7 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 				   &server->secmech.sdesccmacaes->shash);
 
 	if (!rc)
-		memcpy(smb2_pdu->Signature, sigptr, SMB2_SIGNATURE_SIZE);
+		memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE);
 
 	return rc;
 }
@@ -401,14 +401,15 @@ static int
 smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 {
 	int rc = 0;
-	struct smb2_hdr *smb2_pdu = rqst->rq_iov[0].iov_base;
+	struct smb2_sync_hdr *shdr =
+			(struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base;
 
-	if (!(smb2_pdu->Flags & SMB2_FLAGS_SIGNED) ||
+	if (!(shdr->Flags & SMB2_FLAGS_SIGNED) ||
 	    server->tcpStatus == CifsNeedNegotiate)
 		return rc;
 
 	if (!server->session_estab) {
-		strncpy(smb2_pdu->Signature, "BSRSPYL", 8);
+		strncpy(shdr->Signature, "BSRSPYL", 8);
 		return rc;
 	}
 
@@ -422,11 +423,12 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 {
 	unsigned int rc;
 	char server_response_sig[16];
-	struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
+	struct smb2_sync_hdr *shdr =
+			(struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base;
 
-	if ((smb2_pdu->Command == SMB2_NEGOTIATE) ||
-	    (smb2_pdu->Command == SMB2_SESSION_SETUP) ||
-	    (smb2_pdu->Command == SMB2_OPLOCK_BREAK) ||
+	if ((shdr->Command == SMB2_NEGOTIATE) ||
+	    (shdr->Command == SMB2_SESSION_SETUP) ||
+	    (shdr->Command == SMB2_OPLOCK_BREAK) ||
 	    (!server->session_estab))
 		return 0;
 
@@ -436,17 +438,17 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	 */
 
 	/* Do not need to verify session setups with signature "BSRSPYL " */
-	if (memcmp(smb2_pdu->Signature, "BSRSPYL ", 8) == 0)
+	if (memcmp(shdr->Signature, "BSRSPYL ", 8) == 0)
 		cifs_dbg(FYI, "dummy signature received for smb command 0x%x\n",
-			 smb2_pdu->Command);
+			 shdr->Command);
 
 	/*
 	 * Save off the origiginal signature so we can modify the smb and check
 	 * our calculated signature against what the server sent.
 	 */
-	memcpy(server_response_sig, smb2_pdu->Signature, SMB2_SIGNATURE_SIZE);
+	memcpy(server_response_sig, shdr->Signature, SMB2_SIGNATURE_SIZE);
 
-	memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE);
+	memset(shdr->Signature, 0, SMB2_SIGNATURE_SIZE);
 
 	mutex_lock(&server->srv_mutex);
 	rc = server->ops->calc_signature(rqst, server);
@@ -455,8 +457,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	if (rc)
 		return rc;
 
-	if (memcmp(server_response_sig, smb2_pdu->Signature,
-		   SMB2_SIGNATURE_SIZE))
+	if (memcmp(server_response_sig, shdr->Signature, SMB2_SIGNATURE_SIZE))
 		return -EACCES;
 	else
 		return 0;
@@ -467,18 +468,19 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
  * and when srv_mutex is held.
  */
 static inline void
-smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr)
+smb2_seq_num_into_buf(struct TCP_Server_Info *server,
+		      struct smb2_sync_hdr *shdr)
 {
-	unsigned int i, num = le16_to_cpu(hdr->CreditCharge);
+	unsigned int i, num = le16_to_cpu(shdr->CreditCharge);
 
-	hdr->MessageId = get_next_mid64(server);
+	shdr->MessageId = get_next_mid64(server);
 	/* skip message numbers according to CreditCharge field */
 	for (i = 1; i < num; i++)
 		get_next_mid(server);
 }
 
 static struct mid_q_entry *
-smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer,
+smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
 		     struct TCP_Server_Info *server)
 {
 	struct mid_q_entry *temp;
@@ -493,9 +495,9 @@ smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer,
 		return temp;
 	else {
 		memset(temp, 0, sizeof(struct mid_q_entry));
-		temp->mid = le64_to_cpu(smb_buffer->MessageId);
+		temp->mid = le64_to_cpu(shdr->MessageId);
 		temp->pid = current->pid;
-		temp->command = smb_buffer->Command;	/* Always LE */
+		temp->command = shdr->Command; /* Always LE */
 		temp->when_alloc = jiffies;
 		temp->server = server;
 
@@ -513,7 +515,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer,
 }
 
 static int
-smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf,
+smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_sync_hdr *shdr,
 		   struct mid_q_entry **mid)
 {
 	if (ses->server->tcpStatus == CifsExiting)
@@ -525,19 +527,19 @@ smb2_get_mid_entry(struct cifs_ses *ses, struct smb2_hdr *buf,
 	}
 
 	if (ses->status == CifsNew) {
-		if ((buf->Command != SMB2_SESSION_SETUP) &&
-		    (buf->Command != SMB2_NEGOTIATE))
+		if ((shdr->Command != SMB2_SESSION_SETUP) &&
+		    (shdr->Command != SMB2_NEGOTIATE))
 			return -EAGAIN;
 		/* else ok - we are setting up session */
 	}
 
 	if (ses->status == CifsExiting) {
-		if (buf->Command != SMB2_LOGOFF)
+		if (shdr->Command != SMB2_LOGOFF)
 			return -EAGAIN;
 		/* else ok - we are shutting down the session */
 	}
 
-	*mid = smb2_mid_entry_alloc(buf, ses->server);
+	*mid = smb2_mid_entry_alloc(shdr, ses->server);
 	if (*mid == NULL)
 		return -ENOMEM;
 	spin_lock(&GlobalMid_Lock);
@@ -551,16 +553,18 @@ smb2_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
 		   bool log_error)
 {
 	unsigned int len = get_rfc1002_length(mid->resp_buf);
-	struct kvec iov;
-	struct smb_rqst rqst = { .rq_iov = &iov,
-				 .rq_nvec = 1 };
+	struct kvec iov[2];
+	struct smb_rqst rqst = { .rq_iov = iov,
+				 .rq_nvec = 2 };
 
-	iov.iov_base = (char *)mid->resp_buf;
-	iov.iov_len = get_rfc1002_length(mid->resp_buf) + 4;
+	iov[0].iov_base = (char *)mid->resp_buf;
+	iov[0].iov_len = 4;
+	iov[1].iov_base = (char *)mid->resp_buf + 4;
+	iov[1].iov_len = len;
 
 	dump_smb(mid->resp_buf, min_t(u32, 80, len));
 	/* convert the length into a more usable form */
-	if (len > 24 && server->sign) {
+	if (len > 24 && server->sign && !mid->decrypted) {
 		int rc;
 
 		rc = smb2_verify_signature(&rqst, server);
@@ -576,12 +580,13 @@ struct mid_q_entry *
 smb2_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
 {
 	int rc;
-	struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
+	struct smb2_sync_hdr *shdr =
+			(struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base;
 	struct mid_q_entry *mid;
 
-	smb2_seq_num_into_buf(ses->server, hdr);
+	smb2_seq_num_into_buf(ses->server, shdr);
 
-	rc = smb2_get_mid_entry(ses, hdr, &mid);
+	rc = smb2_get_mid_entry(ses, shdr, &mid);
 	if (rc)
 		return ERR_PTR(rc);
 	rc = smb2_sign_rqst(rqst, ses->server);
@@ -596,12 +601,13 @@ struct mid_q_entry *
 smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
 {
 	int rc;
-	struct smb2_hdr *hdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
+	struct smb2_sync_hdr *shdr =
+			(struct smb2_sync_hdr *)rqst->rq_iov[1].iov_base;
 	struct mid_q_entry *mid;
 
-	smb2_seq_num_into_buf(server, hdr);
+	smb2_seq_num_into_buf(server, shdr);
 
-	mid = smb2_mid_entry_alloc(hdr, server);
+	mid = smb2_mid_entry_alloc(shdr, server);
 	if (mid == NULL)
 		return ERR_PTR(-ENOMEM);
 
@@ -613,3 +619,33 @@ smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
 
 	return mid;
 }
+
+int
+smb3_crypto_aead_allocate(struct TCP_Server_Info *server)
+{
+	struct crypto_aead *tfm;
+
+	if (!server->secmech.ccmaesencrypt) {
+		tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
+		if (IS_ERR(tfm)) {
+			cifs_dbg(VFS, "%s: Failed to alloc encrypt aead\n",
+				 __func__);
+			return PTR_ERR(tfm);
+		}
+		server->secmech.ccmaesencrypt = tfm;
+	}
+
+	if (!server->secmech.ccmaesdecrypt) {
+		tfm = crypto_alloc_aead("ccm(aes)", 0, 0);
+		if (IS_ERR(tfm)) {
+			crypto_free_aead(server->secmech.ccmaesencrypt);
+			server->secmech.ccmaesencrypt = NULL;
+			cifs_dbg(VFS, "%s: Failed to alloc decrypt aead\n",
+				 __func__);
+			return PTR_ERR(tfm);
+		}
+		server->secmech.ccmaesdecrypt = tfm;
+	}
+
+	return 0;
+}
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index fbb84c08e3cd..526f0533cb4e 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -221,7 +221,7 @@ rqst_len(struct smb_rqst *rqst)
 }
 
 static int
-smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+__smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
 {
 	int rc;
 	struct kvec *iov = rqst->rq_iov;
@@ -245,8 +245,12 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst)
 		return -EIO;
 	}
 
+	if (n_vec < 2)
+		return -EIO;
+
 	cifs_dbg(FYI, "Sending smb: smb_len=%u\n", smb_buf_length);
 	dump_smb(iov[0].iov_base, iov[0].iov_len);
+	dump_smb(iov[1].iov_base, iov[1].iov_len);
 
 	/* cork the socket */
 	kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
@@ -309,24 +313,43 @@ uncork:
 }
 
 static int
-smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
+smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst, int flags)
 {
-	struct smb_rqst rqst = { .rq_iov = iov,
-				 .rq_nvec = n_vec };
+	struct smb_rqst cur_rqst;
+	int rc;
+
+	if (!(flags & CIFS_TRANSFORM_REQ))
+		return __smb_send_rqst(server, rqst);
+
+	if (!server->ops->init_transform_rq ||
+	    !server->ops->free_transform_rq) {
+		cifs_dbg(VFS, "Encryption requested but transform callbacks are missed\n");
+		return -EIO;
+	}
+
+	rc = server->ops->init_transform_rq(server, &cur_rqst, rqst);
+	if (rc)
+		return rc;
 
-	return smb_send_rqst(server, &rqst);
+	rc = __smb_send_rqst(server, &cur_rqst);
+	server->ops->free_transform_rq(&cur_rqst);
+	return rc;
 }
 
 int
 smb_send(struct TCP_Server_Info *server, struct smb_hdr *smb_buffer,
 	 unsigned int smb_buf_length)
 {
-	struct kvec iov;
+	struct kvec iov[2];
+	struct smb_rqst rqst = { .rq_iov = iov,
+				 .rq_nvec = 2 };
 
-	iov.iov_base = smb_buffer;
-	iov.iov_len = smb_buf_length + 4;
+	iov[0].iov_base = smb_buffer;
+	iov[0].iov_len = 4;
+	iov[1].iov_base = (char *)smb_buffer + 4;
+	iov[1].iov_len = smb_buf_length;
 
-	return smb_sendv(server, &iov, 1);
+	return __smb_send_rqst(server, &rqst);
 }
 
 static int
@@ -454,6 +477,10 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
 	struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
 	struct mid_q_entry *mid;
 
+	if (rqst->rq_iov[0].iov_len != 4 ||
+	    rqst->rq_iov[0].iov_base + 4 != rqst->rq_iov[1].iov_base)
+		return ERR_PTR(-EIO);
+
 	/* enable signing if server requires it */
 	if (server->sign)
 		hdr->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
@@ -478,7 +505,7 @@ cifs_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst)
 int
 cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
 		mid_receive_t *receive, mid_callback_t *callback,
-		void *cbdata, const int flags)
+		mid_handle_t *handle, void *cbdata, const int flags)
 {
 	int rc, timeout, optype;
 	struct mid_q_entry *mid;
@@ -505,6 +532,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
 	mid->receive = receive;
 	mid->callback = callback;
 	mid->callback_data = cbdata;
+	mid->handle = handle;
 	mid->mid_state = MID_REQUEST_SUBMITTED;
 
 	/* put it on the pending_mid_q */
@@ -514,7 +542,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
 
 
 	cifs_in_send_inc(server);
-	rc = smb_send_rqst(server, rqst);
+	rc = smb_send_rqst(server, rqst, flags);
 	cifs_in_send_dec(server);
 	cifs_save_when_sent(mid);
 
@@ -547,12 +575,13 @@ SendReceiveNoRsp(const unsigned int xid, struct cifs_ses *ses,
 {
 	int rc;
 	struct kvec iov[1];
+	struct kvec rsp_iov;
 	int resp_buf_type;
 
 	iov[0].iov_base = in_buf;
 	iov[0].iov_len = get_rfc1002_length(in_buf) + 4;
 	flags |= CIFS_NO_RESP;
-	rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags);
+	rc = SendReceive2(xid, ses, iov, 1, &resp_buf_type, flags, &rsp_iov);
 	cifs_dbg(NOISY, "SendRcvNoRsp flags %d rc %d\n", flags, rc);
 
 	return rc;
@@ -595,10 +624,11 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
 }
 
 static inline int
-send_cancel(struct TCP_Server_Info *server, void *buf, struct mid_q_entry *mid)
+send_cancel(struct TCP_Server_Info *server, struct smb_rqst *rqst,
+	    struct mid_q_entry *mid)
 {
 	return server->ops->send_cancel ?
-				server->ops->send_cancel(server, buf, mid) : 0;
+				server->ops->send_cancel(server, rqst, mid) : 0;
 }
 
 int
@@ -611,13 +641,15 @@ cifs_check_receive(struct mid_q_entry *mid, struct TCP_Server_Info *server,
 
 	/* convert the length into a more usable form */
 	if (server->sign) {
-		struct kvec iov;
+		struct kvec iov[2];
 		int rc = 0;
-		struct smb_rqst rqst = { .rq_iov = &iov,
-					 .rq_nvec = 1 };
+		struct smb_rqst rqst = { .rq_iov = iov,
+					 .rq_nvec = 2 };
 
-		iov.iov_base = mid->resp_buf;
-		iov.iov_len = len;
+		iov[0].iov_base = mid->resp_buf;
+		iov[0].iov_len = 4;
+		iov[1].iov_base = (char *)mid->resp_buf + 4;
+		iov[1].iov_len = len - 4;
 		/* FIXME: add code to kill session */
 		rc = cifs_verify_signature(&rqst, server,
 					   mid->sequence_number);
@@ -637,6 +669,10 @@ cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
 	struct smb_hdr *hdr = (struct smb_hdr *)rqst->rq_iov[0].iov_base;
 	struct mid_q_entry *mid;
 
+	if (rqst->rq_iov[0].iov_len != 4 ||
+	    rqst->rq_iov[0].iov_base + 4 != rqst->rq_iov[1].iov_base)
+		return ERR_PTR(-EIO);
+
 	rc = allocate_mid(ses, hdr, &mid);
 	if (rc)
 		return ERR_PTR(rc);
@@ -649,17 +685,15 @@ cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
 }
 
 int
-SendReceive2(const unsigned int xid, struct cifs_ses *ses,
-	     struct kvec *iov, int n_vec, int *resp_buf_type /* ret */,
-	     const int flags)
+cifs_send_recv(const unsigned int xid, struct cifs_ses *ses,
+	       struct smb_rqst *rqst, int *resp_buf_type, const int flags,
+	       struct kvec *resp_iov)
 {
 	int rc = 0;
 	int timeout, optype;
 	struct mid_q_entry *midQ;
-	char *buf = iov[0].iov_base;
 	unsigned int credits = 1;
-	struct smb_rqst rqst = { .rq_iov = iov,
-				 .rq_nvec = n_vec };
+	char *buf;
 
 	timeout = flags & CIFS_TIMEOUT_MASK;
 	optype = flags & CIFS_OP_MASK;
@@ -667,15 +701,12 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 	*resp_buf_type = CIFS_NO_BUFFER;  /* no response buf yet */
 
 	if ((ses == NULL) || (ses->server == NULL)) {
-		cifs_small_buf_release(buf);
 		cifs_dbg(VFS, "Null session\n");
 		return -EIO;
 	}
 
-	if (ses->server->tcpStatus == CifsExiting) {
-		cifs_small_buf_release(buf);
+	if (ses->server->tcpStatus == CifsExiting)
 		return -ENOENT;
-	}
 
 	/*
 	 * Ensure that we do not send more than 50 overlapping requests
@@ -684,10 +715,8 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 	 */
 
 	rc = wait_for_free_request(ses->server, timeout, optype);
-	if (rc) {
-		cifs_small_buf_release(buf);
+	if (rc)
 		return rc;
-	}
 
 	/*
 	 * Make sure that we sign in the same order that we send on this socket
@@ -697,10 +726,9 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 
 	mutex_lock(&ses->server->srv_mutex);
 
-	midQ = ses->server->ops->setup_request(ses, &rqst);
+	midQ = ses->server->ops->setup_request(ses, rqst);
 	if (IS_ERR(midQ)) {
 		mutex_unlock(&ses->server->srv_mutex);
-		cifs_small_buf_release(buf);
 		/* Update # of requests on wire to server */
 		add_credits(ses->server, 1, optype);
 		return PTR_ERR(midQ);
@@ -708,7 +736,7 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 
 	midQ->mid_state = MID_REQUEST_SUBMITTED;
 	cifs_in_send_inc(ses->server);
-	rc = smb_sendv(ses->server, iov, n_vec);
+	rc = smb_send_rqst(ses->server, rqst, flags);
 	cifs_in_send_dec(ses->server);
 	cifs_save_when_sent(midQ);
 
@@ -716,32 +744,25 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 		ses->server->sequence_number -= 2;
 	mutex_unlock(&ses->server->srv_mutex);
 
-	if (rc < 0) {
-		cifs_small_buf_release(buf);
+	if (rc < 0)
 		goto out;
-	}
 
-	if (timeout == CIFS_ASYNC_OP) {
-		cifs_small_buf_release(buf);
+	if (timeout == CIFS_ASYNC_OP)
 		goto out;
-	}
 
 	rc = wait_for_response(ses->server, midQ);
 	if (rc != 0) {
-		send_cancel(ses->server, buf, midQ);
+		send_cancel(ses->server, rqst, midQ);
 		spin_lock(&GlobalMid_Lock);
 		if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
 			midQ->callback = DeleteMidQEntry;
 			spin_unlock(&GlobalMid_Lock);
-			cifs_small_buf_release(buf);
 			add_credits(ses->server, 1, optype);
 			return rc;
 		}
 		spin_unlock(&GlobalMid_Lock);
 	}
 
-	cifs_small_buf_release(buf);
-
 	rc = cifs_sync_mid_result(midQ, ses->server);
 	if (rc != 0) {
 		add_credits(ses->server, 1, optype);
@@ -755,8 +776,8 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
 	}
 
 	buf = (char *)midQ->resp_buf;
-	iov[0].iov_base = buf;
-	iov[0].iov_len = get_rfc1002_length(buf) + 4;
+	resp_iov->iov_base = buf;
+	resp_iov->iov_len = get_rfc1002_length(buf) + 4;
 	if (midQ->large_buf)
 		*resp_buf_type = CIFS_LARGE_BUFFER;
 	else
@@ -778,12 +799,45 @@ out:
 }
 
 int
+SendReceive2(const unsigned int xid, struct cifs_ses *ses,
+	     struct kvec *iov, int n_vec, int *resp_buf_type /* ret */,
+	     const int flags, struct kvec *resp_iov)
+{
+	struct smb_rqst rqst;
+	struct kvec *new_iov;
+	int rc;
+
+	new_iov = kmalloc(sizeof(struct kvec) * (n_vec + 1), GFP_KERNEL);
+	if (!new_iov)
+		return -ENOMEM;
+
+	/* 1st iov is a RFC1001 length followed by the rest of the packet */
+	memcpy(new_iov + 1, iov, (sizeof(struct kvec) * n_vec));
+
+	new_iov[0].iov_base = new_iov[1].iov_base;
+	new_iov[0].iov_len = 4;
+	new_iov[1].iov_base += 4;
+	new_iov[1].iov_len -= 4;
+
+	memset(&rqst, 0, sizeof(struct smb_rqst));
+	rqst.rq_iov = new_iov;
+	rqst.rq_nvec = n_vec + 1;
+
+	rc = cifs_send_recv(xid, ses, &rqst, resp_buf_type, flags, resp_iov);
+	kfree(new_iov);
+	return rc;
+}
+
+int
 SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	    struct smb_hdr *in_buf, struct smb_hdr *out_buf,
 	    int *pbytes_returned, const int timeout)
 {
 	int rc = 0;
 	struct mid_q_entry *midQ;
+	unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
+	struct kvec iov = { .iov_base = in_buf, .iov_len = len };
+	struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
 
 	if (ses == NULL) {
 		cifs_dbg(VFS, "Null smb session\n");
@@ -801,10 +855,9 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	   to the same server. We may make this configurable later or
 	   use ses->maxReq */
 
-	if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
-			MAX_CIFS_HDR_SIZE - 4) {
+	if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
 		cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
-			 be32_to_cpu(in_buf->smb_buf_length));
+			 len);
 		return -EIO;
 	}
 
@@ -835,7 +888,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 	midQ->mid_state = MID_REQUEST_SUBMITTED;
 
 	cifs_in_send_inc(ses->server);
-	rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+	rc = smb_send(ses->server, in_buf, len);
 	cifs_in_send_dec(ses->server);
 	cifs_save_when_sent(midQ);
 
@@ -852,7 +905,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
 
 	rc = wait_for_response(ses->server, midQ);
 	if (rc != 0) {
-		send_cancel(ses->server, in_buf, midQ);
+		send_cancel(ses->server, &rqst, midQ);
 		spin_lock(&GlobalMid_Lock);
 		if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
 			/* no longer considered to be "in-flight" */
@@ -921,6 +974,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 	int rstart = 0;
 	struct mid_q_entry *midQ;
 	struct cifs_ses *ses;
+	unsigned int len = be32_to_cpu(in_buf->smb_buf_length);
+	struct kvec iov = { .iov_base = in_buf, .iov_len = len };
+	struct smb_rqst rqst = { .rq_iov = &iov, .rq_nvec = 1 };
 
 	if (tcon == NULL || tcon->ses == NULL) {
 		cifs_dbg(VFS, "Null smb session\n");
@@ -940,10 +996,9 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 	   to the same server. We may make this configurable later or
 	   use ses->maxReq */
 
-	if (be32_to_cpu(in_buf->smb_buf_length) > CIFSMaxBufSize +
-			MAX_CIFS_HDR_SIZE - 4) {
+	if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) {
 		cifs_dbg(VFS, "Illegal length, greater than maximum frame, %d\n",
-			 be32_to_cpu(in_buf->smb_buf_length));
+			 len);
 		return -EIO;
 	}
 
@@ -972,7 +1027,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 
 	midQ->mid_state = MID_REQUEST_SUBMITTED;
 	cifs_in_send_inc(ses->server);
-	rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
+	rc = smb_send(ses->server, in_buf, len);
 	cifs_in_send_dec(ses->server);
 	cifs_save_when_sent(midQ);
 
@@ -1001,7 +1056,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 		if (in_buf->Command == SMB_COM_TRANSACTION2) {
 			/* POSIX lock. We send a NT_CANCEL SMB to cause the
 			   blocking lock to return. */
-			rc = send_cancel(ses->server, in_buf, midQ);
+			rc = send_cancel(ses->server, &rqst, midQ);
 			if (rc) {
 				cifs_delete_mid(midQ);
 				return rc;
@@ -1022,7 +1077,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
 
 		rc = wait_for_response(ses->server, midQ);
 		if (rc) {
-			send_cancel(ses->server, in_buf, midQ);
+			send_cancel(ses->server, &rqst, midQ);
 			spin_lock(&GlobalMid_Lock);
 			if (midQ->mid_state == MID_REQUEST_SUBMITTED) {
 				/* no longer considered to be "in-flight" */
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 4d24d17bcfc1..504b3c3539dc 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -51,22 +51,8 @@
 #define elf_prstatus	compat_elf_prstatus
 #define elf_prpsinfo	compat_elf_prpsinfo
 
-/*
- * Compat version of cputime_to_compat_timeval, perhaps this
- * should be an inline in <linux/compat.h>.
- */
-static void cputime_to_compat_timeval(const cputime_t cputime,
-				      struct compat_timeval *value)
-{
-	struct timeval tv;
-	cputime_to_timeval(cputime, &tv);
-	value->tv_sec = tv.tv_sec;
-	value->tv_usec = tv.tv_usec;
-}
-
-#undef cputime_to_timeval
-#define cputime_to_timeval cputime_to_compat_timeval
-
+#undef ns_to_timeval
+#define ns_to_timeval ns_to_compat_timeval
 
 /*
  * To use this file, asm/elf.h must define compat_elf_check_arch.
diff --git a/fs/coredump.c b/fs/coredump.c
index e525b6017cdf..ae6b05629ca1 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -833,3 +833,21 @@ int dump_align(struct coredump_params *cprm, int align)
 	return mod ? dump_skip(cprm, align - mod) : 1;
 }
 EXPORT_SYMBOL(dump_align);
+
+/*
+ * Ensures that file size is big enough to contain the current file
+ * postion. This prevents gdb from complaining about a truncated file
+ * if the last "write" to the file was dump_skip.
+ */
+void dump_truncate(struct coredump_params *cprm)
+{
+	struct file *file = cprm->file;
+	loff_t offset;
+
+	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+		offset = file->f_op->llseek(file, 0, SEEK_CUR);
+		if (i_size_read(file->f_mapping->host) < offset)
+			do_truncate(file->f_path.dentry, offset, 0, file);
+	}
+}
+EXPORT_SYMBOL(dump_truncate);
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index f514978f6688..08b46e6e3995 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -1,6 +1,5 @@
 config FS_ENCRYPTION
 	tristate "FS Encryption (Per-file encryption)"
-	depends on BLOCK
 	select CRYPTO
 	select CRYPTO_AES
 	select CRYPTO_CBC
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
index f17684c48739..9f6607f17b53 100644
--- a/fs/crypto/Makefile
+++ b/fs/crypto/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_FS_ENCRYPTION)	+= fscrypto.o
 
 fscrypto-y := crypto.o fname.o policy.o keyinfo.o
+fscrypto-$(CONFIG_BLOCK) += bio.o
diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c
new file mode 100644
index 000000000000..a409a84f1bca
--- /dev/null
+++ b/fs/crypto/bio.c
@@ -0,0 +1,145 @@
+/*
+ * This contains encryption functions for per-file encryption.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ *	Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ *	Ildar Muslukhov, 2014
+ * Add fscrypt_pullback_bio_page()
+ *	Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/namei.h>
+#include "fscrypt_private.h"
+
+/*
+ * Call fscrypt_decrypt_page on every single page, reusing the encryption
+ * context.
+ */
+static void completion_pages(struct work_struct *work)
+{
+	struct fscrypt_ctx *ctx =
+		container_of(work, struct fscrypt_ctx, r.work);
+	struct bio *bio = ctx->r.bio;
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_segment_all(bv, bio, i) {
+		struct page *page = bv->bv_page;
+		int ret = fscrypt_decrypt_page(page->mapping->host, page,
+				PAGE_SIZE, 0, page->index);
+
+		if (ret) {
+			WARN_ON_ONCE(1);
+			SetPageError(page);
+		} else {
+			SetPageUptodate(page);
+		}
+		unlock_page(page);
+	}
+	fscrypt_release_ctx(ctx);
+	bio_put(bio);
+}
+
+void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio)
+{
+	INIT_WORK(&ctx->r.work, completion_pages);
+	ctx->r.bio = bio;
+	queue_work(fscrypt_read_workqueue, &ctx->r.work);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_bio_pages);
+
+void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+	struct fscrypt_ctx *ctx;
+	struct page *bounce_page;
+
+	/* The bounce data pages are unmapped. */
+	if ((*page)->mapping)
+		return;
+
+	/* The bounce data page is unmapped. */
+	bounce_page = *page;
+	ctx = (struct fscrypt_ctx *)page_private(bounce_page);
+
+	/* restore control page */
+	*page = ctx->w.control_page;
+
+	if (restore)
+		fscrypt_restore_control_page(bounce_page);
+}
+EXPORT_SYMBOL(fscrypt_pullback_bio_page);
+
+int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
+				sector_t pblk, unsigned int len)
+{
+	struct fscrypt_ctx *ctx;
+	struct page *ciphertext_page = NULL;
+	struct bio *bio;
+	int ret, err = 0;
+
+	BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
+
+	ctx = fscrypt_get_ctx(inode, GFP_NOFS);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT);
+	if (IS_ERR(ciphertext_page)) {
+		err = PTR_ERR(ciphertext_page);
+		goto errout;
+	}
+
+	while (len--) {
+		err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk,
+					     ZERO_PAGE(0), ciphertext_page,
+					     PAGE_SIZE, 0, GFP_NOFS);
+		if (err)
+			goto errout;
+
+		bio = bio_alloc(GFP_NOWAIT, 1);
+		if (!bio) {
+			err = -ENOMEM;
+			goto errout;
+		}
+		bio->bi_bdev = inode->i_sb->s_bdev;
+		bio->bi_iter.bi_sector =
+			pblk << (inode->i_sb->s_blocksize_bits - 9);
+		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+		ret = bio_add_page(bio, ciphertext_page,
+					inode->i_sb->s_blocksize, 0);
+		if (ret != inode->i_sb->s_blocksize) {
+			/* should never happen! */
+			WARN_ON(1);
+			bio_put(bio);
+			err = -EIO;
+			goto errout;
+		}
+		err = submit_bio_wait(bio);
+		if ((err == 0) && bio->bi_error)
+			err = -EIO;
+		bio_put(bio);
+		if (err)
+			goto errout;
+		lblk++;
+		pblk++;
+	}
+	err = 0;
+errout:
+	fscrypt_release_ctx(ctx);
+	return err;
+}
+EXPORT_SYMBOL(fscrypt_zeroout_range);
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
index ac8e4f6a3773..02a7a9286449 100644
--- a/fs/crypto/crypto.c
+++ b/fs/crypto/crypto.c
@@ -24,7 +24,6 @@
 #include <linux/module.h>
 #include <linux/scatterlist.h>
 #include <linux/ratelimit.h>
-#include <linux/bio.h>
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include "fscrypt_private.h"
@@ -44,7 +43,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL;
 static LIST_HEAD(fscrypt_free_ctxs);
 static DEFINE_SPINLOCK(fscrypt_ctx_lock);
 
-static struct workqueue_struct *fscrypt_read_workqueue;
+struct workqueue_struct *fscrypt_read_workqueue;
 static DEFINE_MUTEX(fscrypt_init_mutex);
 
 static struct kmem_cache *fscrypt_ctx_cachep;
@@ -141,16 +140,10 @@ static void page_crypt_complete(struct crypto_async_request *req, int res)
 	complete(&ecr->completion);
 }
 
-typedef enum {
-	FS_DECRYPT = 0,
-	FS_ENCRYPT,
-} fscrypt_direction_t;
-
-static int do_page_crypto(const struct inode *inode,
-			fscrypt_direction_t rw, u64 lblk_num,
-			struct page *src_page, struct page *dest_page,
-			unsigned int len, unsigned int offs,
-			gfp_t gfp_flags)
+int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
+			   u64 lblk_num, struct page *src_page,
+			   struct page *dest_page, unsigned int len,
+			   unsigned int offs, gfp_t gfp_flags)
 {
 	struct {
 		__le64 index;
@@ -205,7 +198,8 @@ static int do_page_crypto(const struct inode *inode,
 	return 0;
 }
 
-static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags)
+struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
+				       gfp_t gfp_flags)
 {
 	ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags);
 	if (ctx->w.bounce_page == NULL)
@@ -260,9 +254,9 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 
 	if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) {
 		/* with inplace-encryption we just encrypt the page */
-		err = do_page_crypto(inode, FS_ENCRYPT, lblk_num,
-					page, ciphertext_page,
-					len, offs, gfp_flags);
+		err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page,
+					     ciphertext_page, len, offs,
+					     gfp_flags);
 		if (err)
 			return ERR_PTR(err);
 
@@ -276,14 +270,14 @@ struct page *fscrypt_encrypt_page(const struct inode *inode,
 		return (struct page *)ctx;
 
 	/* The encryption operation will require a bounce page. */
-	ciphertext_page = alloc_bounce_page(ctx, gfp_flags);
+	ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags);
 	if (IS_ERR(ciphertext_page))
 		goto errout;
 
 	ctx->w.control_page = page;
-	err = do_page_crypto(inode, FS_ENCRYPT, lblk_num,
-					page, ciphertext_page,
-					len, offs, gfp_flags);
+	err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num,
+				     page, ciphertext_page, len, offs,
+				     gfp_flags);
 	if (err) {
 		ciphertext_page = ERR_PTR(err);
 		goto errout;
@@ -320,72 +314,11 @@ int fscrypt_decrypt_page(const struct inode *inode, struct page *page,
 	if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES))
 		BUG_ON(!PageLocked(page));
 
-	return do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, len,
-			offs, GFP_NOFS);
+	return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page,
+				      len, offs, GFP_NOFS);
 }
 EXPORT_SYMBOL(fscrypt_decrypt_page);
 
-int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
-				sector_t pblk, unsigned int len)
-{
-	struct fscrypt_ctx *ctx;
-	struct page *ciphertext_page = NULL;
-	struct bio *bio;
-	int ret, err = 0;
-
-	BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE);
-
-	ctx = fscrypt_get_ctx(inode, GFP_NOFS);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
-	ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT);
-	if (IS_ERR(ciphertext_page)) {
-		err = PTR_ERR(ciphertext_page);
-		goto errout;
-	}
-
-	while (len--) {
-		err = do_page_crypto(inode, FS_ENCRYPT, lblk,
-					ZERO_PAGE(0), ciphertext_page,
-					PAGE_SIZE, 0, GFP_NOFS);
-		if (err)
-			goto errout;
-
-		bio = bio_alloc(GFP_NOWAIT, 1);
-		if (!bio) {
-			err = -ENOMEM;
-			goto errout;
-		}
-		bio->bi_bdev = inode->i_sb->s_bdev;
-		bio->bi_iter.bi_sector =
-			pblk << (inode->i_sb->s_blocksize_bits - 9);
-		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-		ret = bio_add_page(bio, ciphertext_page,
-					inode->i_sb->s_blocksize, 0);
-		if (ret != inode->i_sb->s_blocksize) {
-			/* should never happen! */
-			WARN_ON(1);
-			bio_put(bio);
-			err = -EIO;
-			goto errout;
-		}
-		err = submit_bio_wait(bio);
-		if ((err == 0) && bio->bi_error)
-			err = -EIO;
-		bio_put(bio);
-		if (err)
-			goto errout;
-		lblk++;
-		pblk++;
-	}
-	err = 0;
-errout:
-	fscrypt_release_ctx(ctx);
-	return err;
-}
-EXPORT_SYMBOL(fscrypt_zeroout_range);
-
 /*
  * Validate dentries for encrypted directories to make sure we aren't
  * potentially caching stale data after a key has been added or
@@ -442,64 +375,6 @@ const struct dentry_operations fscrypt_d_ops = {
 };
 EXPORT_SYMBOL(fscrypt_d_ops);
 
-/*
- * Call fscrypt_decrypt_page on every single page, reusing the encryption
- * context.
- */
-static void completion_pages(struct work_struct *work)
-{
-	struct fscrypt_ctx *ctx =
-		container_of(work, struct fscrypt_ctx, r.work);
-	struct bio *bio = ctx->r.bio;
-	struct bio_vec *bv;
-	int i;
-
-	bio_for_each_segment_all(bv, bio, i) {
-		struct page *page = bv->bv_page;
-		int ret = fscrypt_decrypt_page(page->mapping->host, page,
-				PAGE_SIZE, 0, page->index);
-
-		if (ret) {
-			WARN_ON_ONCE(1);
-			SetPageError(page);
-		} else {
-			SetPageUptodate(page);
-		}
-		unlock_page(page);
-	}
-	fscrypt_release_ctx(ctx);
-	bio_put(bio);
-}
-
-void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio)
-{
-	INIT_WORK(&ctx->r.work, completion_pages);
-	ctx->r.bio = bio;
-	queue_work(fscrypt_read_workqueue, &ctx->r.work);
-}
-EXPORT_SYMBOL(fscrypt_decrypt_bio_pages);
-
-void fscrypt_pullback_bio_page(struct page **page, bool restore)
-{
-	struct fscrypt_ctx *ctx;
-	struct page *bounce_page;
-
-	/* The bounce data pages are unmapped. */
-	if ((*page)->mapping)
-		return;
-
-	/* The bounce data page is unmapped. */
-	bounce_page = *page;
-	ctx = (struct fscrypt_ctx *)page_private(bounce_page);
-
-	/* restore control page */
-	*page = ctx->w.control_page;
-
-	if (restore)
-		fscrypt_restore_control_page(bounce_page);
-}
-EXPORT_SYMBOL(fscrypt_pullback_bio_page);
-
 void fscrypt_restore_control_page(struct page *page)
 {
 	struct fscrypt_ctx *ctx;
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 56ad9d195f18..13052b85c393 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -332,7 +332,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode,
 	 * in a directory. Consequently, a user space name cannot be mapped to
 	 * a disk-space name
 	 */
-	return -EACCES;
+	return -ENOKEY;
 }
 EXPORT_SYMBOL(fscrypt_fname_usr_to_disk);
 
@@ -367,7 +367,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
 		return 0;
 	}
 	if (!lookup)
-		return -EACCES;
+		return -ENOKEY;
 
 	/*
 	 * We don't have the key and we are doing a lookup; decode the
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index aeab032d7d35..fdbb8af32eaf 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -11,7 +11,7 @@
 #ifndef _FSCRYPT_PRIVATE_H
 #define _FSCRYPT_PRIVATE_H
 
-#include <linux/fscrypto.h>
+#include <linux/fscrypt_supp.h>
 
 #define FS_FNAME_CRYPTO_DIGEST_SIZE	32
 
@@ -71,6 +71,11 @@ struct fscrypt_info {
 	u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
 };
 
+typedef enum {
+	FS_DECRYPT = 0,
+	FS_ENCRYPT,
+} fscrypt_direction_t;
+
 #define FS_CTX_REQUIRES_FREE_ENCRYPT_FL		0x00000001
 #define FS_CTX_HAS_BOUNCE_BUFFER_FL		0x00000002
 
@@ -81,11 +86,20 @@ struct fscrypt_completion_result {
 
 #define DECLARE_FS_COMPLETION_RESULT(ecr) \
 	struct fscrypt_completion_result ecr = { \
-		COMPLETION_INITIALIZER((ecr).completion), 0 }
+		COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 }
 
 
 /* crypto.c */
-int fscrypt_initialize(unsigned int cop_flags);
+extern int fscrypt_initialize(unsigned int cop_flags);
+extern struct workqueue_struct *fscrypt_read_workqueue;
+extern int fscrypt_do_page_crypto(const struct inode *inode,
+				  fscrypt_direction_t rw, u64 lblk_num,
+				  struct page *src_page,
+				  struct page *dest_page,
+				  unsigned int len, unsigned int offs,
+				  gfp_t gfp_flags);
+extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx,
+					      gfp_t gfp_flags);
 
 /* keyinfo.c */
 extern int fscrypt_get_crypt_info(struct inode *);
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
index 6eeea1dcba41..02eb6b9e4438 100644
--- a/fs/crypto/keyinfo.c
+++ b/fs/crypto/keyinfo.c
@@ -77,26 +77,22 @@ out:
 
 static int validate_user_key(struct fscrypt_info *crypt_info,
 			struct fscrypt_context *ctx, u8 *raw_key,
-			u8 *prefix, int prefix_size)
+			const char *prefix)
 {
-	u8 *full_key_descriptor;
+	char *description;
 	struct key *keyring_key;
 	struct fscrypt_key *master_key;
 	const struct user_key_payload *ukp;
-	int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1;
 	int res;
 
-	full_key_descriptor = kmalloc(full_key_len, GFP_NOFS);
-	if (!full_key_descriptor)
+	description = kasprintf(GFP_NOFS, "%s%*phN", prefix,
+				FS_KEY_DESCRIPTOR_SIZE,
+				ctx->master_key_descriptor);
+	if (!description)
 		return -ENOMEM;
 
-	memcpy(full_key_descriptor, prefix, prefix_size);
-	sprintf(full_key_descriptor + prefix_size,
-			"%*phN", FS_KEY_DESCRIPTOR_SIZE,
-			ctx->master_key_descriptor);
-	full_key_descriptor[full_key_len - 1] = '\0';
-	keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
-	kfree(full_key_descriptor);
+	keyring_key = request_key(&key_type_logon, description, NULL);
+	kfree(description);
 	if (IS_ERR(keyring_key))
 		return PTR_ERR(keyring_key);
 
@@ -206,12 +202,15 @@ retry:
 
 	res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
 	if (res < 0) {
-		if (!fscrypt_dummy_context_enabled(inode))
+		if (!fscrypt_dummy_context_enabled(inode) ||
+		    inode->i_sb->s_cop->is_encrypted(inode))
 			return res;
+		/* Fake up a context for an unencrypted directory */
+		memset(&ctx, 0, sizeof(ctx));
 		ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
 		ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
 		ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
-		ctx.flags = 0;
+		memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
 	} else if (res != sizeof(ctx)) {
 		return -EINVAL;
 	}
@@ -247,20 +246,10 @@ retry:
 	if (!raw_key)
 		goto out;
 
-	if (fscrypt_dummy_context_enabled(inode)) {
-		memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
-		goto got_key;
-	}
-
-	res = validate_user_key(crypt_info, &ctx, raw_key,
-			FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE);
+	res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX);
 	if (res && inode->i_sb->s_cop->key_prefix) {
-		u8 *prefix = NULL;
-		int prefix_size, res2;
-
-		prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix);
-		res2 = validate_user_key(crypt_info, &ctx, raw_key,
-							prefix, prefix_size);
+		int res2 = validate_user_key(crypt_info, &ctx, raw_key,
+					     inode->i_sb->s_cop->key_prefix);
 		if (res2) {
 			if (res2 == -ENOKEY)
 				res = -ENOKEY;
@@ -269,7 +258,6 @@ retry:
 	} else if (res) {
 		goto out;
 	}
-got_key:
 	ctfm = crypto_alloc_skcipher(cipher_str, 0, 0);
 	if (!ctfm || IS_ERR(ctfm)) {
 		res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index 6ed7c2eebeec..14b76da71269 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -13,37 +13,20 @@
 #include <linux/mount.h>
 #include "fscrypt_private.h"
 
-static int inode_has_encryption_context(struct inode *inode)
-{
-	if (!inode->i_sb->s_cop->get_context)
-		return 0;
-	return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0);
-}
-
 /*
- * check whether the policy is consistent with the encryption context
- * for the inode
+ * check whether an encryption policy is consistent with an encryption context
  */
-static int is_encryption_context_consistent_with_policy(struct inode *inode,
+static bool is_encryption_context_consistent_with_policy(
+				const struct fscrypt_context *ctx,
 				const struct fscrypt_policy *policy)
 {
-	struct fscrypt_context ctx;
-	int res;
-
-	if (!inode->i_sb->s_cop->get_context)
-		return 0;
-
-	res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
-	if (res != sizeof(ctx))
-		return 0;
-
-	return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
-			FS_KEY_DESCRIPTOR_SIZE) == 0 &&
-			(ctx.flags == policy->flags) &&
-			(ctx.contents_encryption_mode ==
-			 policy->contents_encryption_mode) &&
-			(ctx.filenames_encryption_mode ==
-			 policy->filenames_encryption_mode));
+	return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor,
+		      FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+		(ctx->flags == policy->flags) &&
+		(ctx->contents_encryption_mode ==
+		 policy->contents_encryption_mode) &&
+		(ctx->filenames_encryption_mode ==
+		 policy->filenames_encryption_mode);
 }
 
 static int create_encryption_context_from_policy(struct inode *inode,
@@ -66,20 +49,12 @@ static int create_encryption_context_from_policy(struct inode *inode,
 					FS_KEY_DESCRIPTOR_SIZE);
 
 	if (!fscrypt_valid_contents_enc_mode(
-				policy->contents_encryption_mode)) {
-		printk(KERN_WARNING
-		       "%s: Invalid contents encryption mode %d\n", __func__,
-			policy->contents_encryption_mode);
+				policy->contents_encryption_mode))
 		return -EINVAL;
-	}
 
 	if (!fscrypt_valid_filenames_enc_mode(
-				policy->filenames_encryption_mode)) {
-		printk(KERN_WARNING
-			"%s: Invalid filenames encryption mode %d\n", __func__,
-			policy->filenames_encryption_mode);
+				policy->filenames_encryption_mode))
 		return -EINVAL;
-	}
 
 	if (policy->flags & ~FS_POLICY_FLAGS_VALID)
 		return -EINVAL;
@@ -98,6 +73,7 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
 	struct fscrypt_policy policy;
 	struct inode *inode = file_inode(filp);
 	int ret;
+	struct fscrypt_context ctx;
 
 	if (copy_from_user(&policy, arg, sizeof(policy)))
 		return -EFAULT;
@@ -114,9 +90,10 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
 
 	inode_lock(inode);
 
-	if (!inode_has_encryption_context(inode)) {
+	ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+	if (ret == -ENODATA) {
 		if (!S_ISDIR(inode->i_mode))
-			ret = -EINVAL;
+			ret = -ENOTDIR;
 		else if (!inode->i_sb->s_cop->empty_dir)
 			ret = -EOPNOTSUPP;
 		else if (!inode->i_sb->s_cop->empty_dir(inode))
@@ -124,12 +101,14 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
 		else
 			ret = create_encryption_context_from_policy(inode,
 								    &policy);
-	} else if (!is_encryption_context_consistent_with_policy(inode,
-								 &policy)) {
-		printk(KERN_WARNING
-		       "%s: Policy inconsistent with encryption context\n",
-		       __func__);
-		ret = -EINVAL;
+	} else if (ret == sizeof(ctx) &&
+		   is_encryption_context_consistent_with_policy(&ctx,
+								&policy)) {
+		/* The file already uses the same encryption policy. */
+		ret = 0;
+	} else if (ret >= 0 || ret == -ERANGE) {
+		/* The file already uses a different encryption policy. */
+		ret = -EEXIST;
 	}
 
 	inode_unlock(inode);
@@ -151,8 +130,10 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
 		return -ENODATA;
 
 	res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+	if (res < 0 && res != -ERANGE)
+		return res;
 	if (res != sizeof(ctx))
-		return -ENODATA;
+		return -EINVAL;
 	if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
 		return -EINVAL;
 
@@ -179,6 +160,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
 		BUG_ON(1);
 	}
 
+	/* No restrictions on file types which are never encrypted */
+	if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) &&
+	    !S_ISLNK(child->i_mode))
+		return 1;
+
 	/* no restrictions if the parent directory is not encrypted */
 	if (!parent->i_sb->s_cop->is_encrypted(parent))
 		return 1;
@@ -212,9 +198,9 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context);
  * @parent: Parent inode from which the context is inherited.
  * @child:  Child inode that inherits the context from @parent.
  * @fs_data:  private data given by FS.
- * @preload:  preload child i_crypt_info
+ * @preload:  preload child i_crypt_info if true
  *
- * Return: Zero on success, non-zero otherwise
+ * Return: 0 on success, -errno on failure
  */
 int fscrypt_inherit_context(struct inode *parent, struct inode *child,
 						void *fs_data, bool preload)
@@ -235,19 +221,11 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child,
 		return -ENOKEY;
 
 	ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
-	if (fscrypt_dummy_context_enabled(parent)) {
-		ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
-		ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
-		ctx.flags = 0;
-		memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
-		res = 0;
-	} else {
-		ctx.contents_encryption_mode = ci->ci_data_mode;
-		ctx.filenames_encryption_mode = ci->ci_filename_mode;
-		ctx.flags = ci->ci_flags;
-		memcpy(ctx.master_key_descriptor, ci->ci_master_key,
-				FS_KEY_DESCRIPTOR_SIZE);
-	}
+	ctx.contents_encryption_mode = ci->ci_data_mode;
+	ctx.filenames_encryption_mode = ci->ci_filename_mode;
+	ctx.flags = ci->ci_flags;
+	memcpy(ctx.master_key_descriptor, ci->ci_master_key,
+	       FS_KEY_DESCRIPTOR_SIZE);
 	get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
 	res = parent->i_sb->s_cop->set_context(child, &ctx,
 						sizeof(ctx), fs_data);
diff --git a/fs/dax.c b/fs/dax.c
index a8732fbed381..3f1181563fb1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -35,6 +35,9 @@
 #include <linux/iomap.h>
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/fs_dax.h>
+
 /* We choose 4096 entries - same as per-zone page wait tables */
 #define DAX_WAIT_TABLE_BITS 12
 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
@@ -451,16 +454,37 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
+static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+					  pgoff_t index, bool trunc)
+{
+	int ret = 0;
+	void *entry;
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	if (!entry || !radix_tree_exceptional_entry(entry))
+		goto out;
+	if (!trunc &&
+	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
+		goto out;
+	radix_tree_delete(page_tree, index);
+	mapping->nrexceptional--;
+	ret = 1;
+out:
+	put_unlocked_mapping_entry(mapping, index, entry);
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
 /*
  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  * entry to get unlocked before deleting it.
  */
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
-	void *entry;
+	int ret = __dax_invalidate_mapping_entry(mapping, index, true);
 
-	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
 	/*
 	 * This gets called from truncate / punch_hole path. As such, the caller
 	 * must hold locks protecting against concurrent modifications of the
@@ -468,16 +492,46 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 	 * caller has seen exceptional entry for this index, we better find it
 	 * at that index as well...
 	 */
-	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
-		spin_unlock_irq(&mapping->tree_lock);
-		return 0;
-	}
-	radix_tree_delete(&mapping->page_tree, index);
+	WARN_ON_ONCE(!ret);
+	return ret;
+}
+
+/*
+ * Invalidate exceptional DAX entry if easily possible. This handles DAX
+ * entries for invalidate_inode_pages() so we evict the entry only if we can
+ * do so without blocking.
+ */
+int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	int ret = 0;
+	void *entry, **slot;
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
+	if (!entry || !radix_tree_exceptional_entry(entry) ||
+	    slot_locked(mapping, slot))
+		goto out;
+	if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+	    radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+		goto out;
+	radix_tree_delete(page_tree, index);
 	mapping->nrexceptional--;
+	ret = 1;
+out:
 	spin_unlock_irq(&mapping->tree_lock);
-	dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+	if (ret)
+		dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+	return ret;
+}
 
-	return 1;
+/*
+ * Invalidate exceptional DAX entry if it is clean.
+ */
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+				      pgoff_t index)
+{
+	return __dax_invalidate_mapping_entry(mapping, index, false);
 }
 
 /*
@@ -488,15 +542,16 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  * otherwise it will simply fall out of the page cache under memory
  * pressure without ever having been dirtied.
  */
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static int dax_load_hole(struct address_space *mapping, void **entry,
 			 struct vm_fault *vmf)
 {
 	struct page *page;
+	int ret;
 
 	/* Hole page already exists? Return it...  */
-	if (!radix_tree_exceptional_entry(entry)) {
-		vmf->page = entry;
-		return VM_FAULT_LOCKED;
+	if (!radix_tree_exceptional_entry(*entry)) {
+		page = *entry;
+		goto out;
 	}
 
 	/* This will replace locked radix tree entry with a hole page */
@@ -504,8 +559,17 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 				   vmf->gfp_mask | __GFP_ZERO);
 	if (!page)
 		return VM_FAULT_OOM;
+ out:
 	vmf->page = page;
-	return VM_FAULT_LOCKED;
+	ret = finish_fault(vmf);
+	vmf->page = NULL;
+	*entry = page;
+	if (!ret) {
+		/* Grab reference for PTE that is now referencing the page */
+		get_page(page);
+		return VM_FAULT_NOPAGE;
+	}
+	return ret;
 }
 
 static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
@@ -630,8 +694,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
 				      pgoff_t index, unsigned long pfn)
 {
 	struct vm_area_struct *vma;
-	pte_t *ptep;
-	pte_t pte;
+	pte_t pte, *ptep = NULL;
+	pmd_t *pmdp = NULL;
 	spinlock_t *ptl;
 	bool changed;
 
@@ -646,21 +710,42 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
 
 		address = pgoff_address(index, vma);
 		changed = false;
-		if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+		if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
 			continue;
-		if (pfn != pte_pfn(*ptep))
-			goto unlock;
-		if (!pte_dirty(*ptep) && !pte_write(*ptep))
-			goto unlock;
 
-		flush_cache_page(vma, address, pfn);
-		pte = ptep_clear_flush(vma, address, ptep);
-		pte = pte_wrprotect(pte);
-		pte = pte_mkclean(pte);
-		set_pte_at(vma->vm_mm, address, ptep, pte);
-		changed = true;
-unlock:
-		pte_unmap_unlock(ptep, ptl);
+		if (pmdp) {
+#ifdef CONFIG_FS_DAX_PMD
+			pmd_t pmd;
+
+			if (pfn != pmd_pfn(*pmdp))
+				goto unlock_pmd;
+			if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
+				goto unlock_pmd;
+
+			flush_cache_page(vma, address, pfn);
+			pmd = pmdp_huge_clear_flush(vma, address, pmdp);
+			pmd = pmd_wrprotect(pmd);
+			pmd = pmd_mkclean(pmd);
+			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+			changed = true;
+unlock_pmd:
+			spin_unlock(ptl);
+#endif
+		} else {
+			if (pfn != pte_pfn(*ptep))
+				goto unlock_pte;
+			if (!pte_dirty(*ptep) && !pte_write(*ptep))
+				goto unlock_pte;
+
+			flush_cache_page(vma, address, pfn);
+			pte = ptep_clear_flush(vma, address, ptep);
+			pte = pte_wrprotect(pte);
+			pte = pte_mkclean(pte);
+			set_pte_at(vma->vm_mm, address, ptep, pte);
+			changed = true;
+unlock_pte:
+			pte_unmap_unlock(ptep, ptl);
+		}
 
 		if (changed)
 			mmu_notifier_invalidate_page(vma->vm_mm, address);
@@ -908,7 +993,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
-#ifdef CONFIG_FS_IOMAP
 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
 {
 	return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9);
@@ -934,11 +1018,27 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
 		return -EIO;
 
+	/*
+	 * Write can allocate block for an area which has a hole page mapped
+	 * into page tables. We have to tear down these mappings so that data
+	 * written by write(2) is visible in mmap.
+	 */
+	if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+		invalidate_inode_pages2_range(inode->i_mapping,
+					      pos >> PAGE_SHIFT,
+					      (end - 1) >> PAGE_SHIFT);
+	}
+
 	while (pos < end) {
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		struct blk_dax_ctl dax = { 0 };
 		ssize_t map_len;
 
+		if (fatal_signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
 		dax.sector = dax_iomap_sector(iomap, pos);
 		dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
 		map_len = dax_map_atomic(iomap->bdev, &dax);
@@ -982,31 +1082,18 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
  */
 ssize_t
 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = mapping->host;
 	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
 	unsigned flags = 0;
 
-	if (iov_iter_rw(iter) == WRITE)
+	if (iov_iter_rw(iter) == WRITE) {
+		lockdep_assert_held_exclusive(&inode->i_rwsem);
 		flags |= IOMAP_WRITE;
-
-	/*
-	 * Yes, even DAX files can have page cache attached to them:  A zeroed
-	 * page is inserted into the pagecache when we have to serve a write
-	 * fault on a hole.  It should never be dirtied and can simply be
-	 * dropped from the pagecache once we get real data for the page.
-	 *
-	 * XXX: This is racy against mmap, and there's nothing we can do about
-	 * it. We'll eventually need to shift this down even further so that
-	 * we can check if we allocated blocks over a hole first.
-	 */
-	if (mapping->nrpages) {
-		ret = invalidate_inode_pages2_range(mapping,
-				pos >> PAGE_SHIFT,
-				(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
+	} else {
+		lockdep_assert_held(&inode->i_rwsem);
 	}
 
 	while (iov_iter_count(iter)) {
@@ -1023,6 +1110,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+	if (error == 0)
+		return VM_FAULT_NOPAGE;
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1034,7 +1130,7 @@ EXPORT_SYMBOL_GPL(dax_iomap_rw);
  * necessary locking for the page fault to proceed successfully.
  */
 int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			struct iomap_ops *ops)
+			const struct iomap_ops *ops)
 {
 	struct address_space *mapping = vma->vm_file->f_mapping;
 	struct inode *inode = mapping->host;
@@ -1055,12 +1151,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
@@ -1071,9 +1161,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 */
 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		return dax_fault_return(error);
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-		error = -EIO;		/* fs corruption? */
+		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
+		goto finish_iomap;
+	}
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+	if (IS_ERR(entry)) {
+		vmf_ret = dax_fault_return(PTR_ERR(entry));
 		goto finish_iomap;
 	}
 
@@ -1096,13 +1192,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto finish_iomap;
+			goto error_unlock_entry;
 
 		__SetPageUptodate(vmf->cow_page);
 		vmf_ret = finish_fault(vmf);
 		if (!vmf_ret)
 			vmf_ret = VM_FAULT_DONE_COW;
-		goto finish_iomap;
+		goto unlock_entry;
 	}
 
 	switch (iomap.type) {
@@ -1114,12 +1210,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
 				PAGE_SIZE, &entry, vma, vmf);
+		/* -EBUSY is fine, somebody else faulted on the same PTE */
+		if (error == -EBUSY)
+			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-			vmf_ret = dax_load_hole(mapping, entry, vmf);
-			break;
+			vmf_ret = dax_load_hole(mapping, &entry, vmf);
+			goto unlock_entry;
 		}
 		/*FALLTHRU*/
 	default:
@@ -1128,31 +1227,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	}
 
+ error_unlock_entry:
+	vmf_ret = dax_fault_return(error) | major;
+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (error || (vmf_ret & VM_FAULT_ERROR)) {
-			/* keep previous error */
-			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PAGE_SIZE,
-					PAGE_SIZE, flags, &iomap);
-		}
-	}
- unlock_entry:
-	if (vmf_ret != VM_FAULT_LOCKED || error)
-		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if (error < 0 && error != -EBUSY)
-		return VM_FAULT_SIGBUS | major;
-	if (vmf_ret) {
-		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-		return vmf_ret;
+		int copied = PAGE_SIZE;
+
+		if (vmf_ret & VM_FAULT_ERROR)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PTE we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
-	return VM_FAULT_NOPAGE | major;
+	return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
@@ -1163,21 +1256,21 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault);
  */
 #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
-static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
-		struct vm_fault *vmf, unsigned long address,
-		struct iomap *iomap, loff_t pos, bool write, void **entryp)
+static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
+		loff_t pos, void **entryp)
 {
-	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
 	struct block_device *bdev = iomap->bdev;
+	struct inode *inode = mapping->host;
 	struct blk_dax_ctl dax = {
 		.sector = dax_iomap_sector(iomap, pos),
 		.size = PMD_SIZE,
 	};
 	long length = dax_map_atomic(bdev, &dax);
-	void *ret;
+	void *ret = NULL;
 
 	if (length < 0) /* dax_map_atomic() failed */
-		return VM_FAULT_FALLBACK;
+		goto fallback;
 	if (length < PMD_SIZE)
 		goto unmap_fallback;
 	if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
@@ -1190,67 +1283,86 @@ static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd,
 	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
 			RADIX_DAX_PMD);
 	if (IS_ERR(ret))
-		return VM_FAULT_FALLBACK;
+		goto fallback;
 	*entryp = ret;
 
-	return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write);
+	trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
+	return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
+			dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
 
  unmap_fallback:
 	dax_unmap_atomic(bdev, &dax);
+fallback:
+	trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
+			dax.pfn, ret);
 	return VM_FAULT_FALLBACK;
 }
 
-static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd,
-		struct vm_fault *vmf, unsigned long address,
-		struct iomap *iomap, void **entryp)
+static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
+		void **entryp)
 {
-	struct address_space *mapping = vma->vm_file->f_mapping;
-	unsigned long pmd_addr = address & PMD_MASK;
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
+	struct inode *inode = mapping->host;
 	struct page *zero_page;
+	void *ret = NULL;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
-	void *ret;
 
-	zero_page = mm_get_huge_zero_page(vma->vm_mm);
+	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
 
 	if (unlikely(!zero_page))
-		return VM_FAULT_FALLBACK;
+		goto fallback;
 
 	ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0,
 			RADIX_DAX_PMD | RADIX_DAX_HZP);
 	if (IS_ERR(ret))
-		return VM_FAULT_FALLBACK;
+		goto fallback;
 	*entryp = ret;
 
-	ptl = pmd_lock(vma->vm_mm, pmd);
-	if (!pmd_none(*pmd)) {
+	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
+	if (!pmd_none(*(vmf->pmd))) {
 		spin_unlock(ptl);
-		return VM_FAULT_FALLBACK;
+		goto fallback;
 	}
 
-	pmd_entry = mk_pmd(zero_page, vma->vm_page_prot);
+	pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
 	pmd_entry = pmd_mkhuge(pmd_entry);
-	set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry);
+	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
 	spin_unlock(ptl);
+	trace_dax_pmd_load_hole(inode, vmf, zero_page, ret);
 	return VM_FAULT_NOPAGE;
+
+fallback:
+	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, ret);
+	return VM_FAULT_FALLBACK;
 }
 
-int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
-		pmd_t *pmd, unsigned int flags, struct iomap_ops *ops)
+int dax_iomap_pmd_fault(struct vm_fault *vmf, const struct iomap_ops *ops)
 {
+	struct vm_area_struct *vma = vmf->vma;
 	struct address_space *mapping = vma->vm_file->f_mapping;
-	unsigned long pmd_addr = address & PMD_MASK;
-	bool write = flags & FAULT_FLAG_WRITE;
+	unsigned long pmd_addr = vmf->address & PMD_MASK;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 	unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
 	struct inode *inode = mapping->host;
 	int result = VM_FAULT_FALLBACK;
 	struct iomap iomap = { 0 };
 	pgoff_t max_pgoff, pgoff;
-	struct vm_fault vmf;
 	void *entry;
 	loff_t pos;
 	int error;
 
+	/*
+	 * Check whether offset isn't beyond end of file now. Caller is
+	 * supposed to hold locks serializing us with truncate / punch hole so
+	 * this is a reliable test.
+	 */
+	pgoff = linear_page_index(vma, pmd_addr);
+	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+
+	trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
+
 	/* Fall back to PTEs if we're going to COW */
 	if (write && !(vma->vm_flags & VM_SHARED))
 		goto fallback;
@@ -1261,32 +1373,16 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
 		goto fallback;
 
-	/*
-	 * Check whether offset isn't beyond end of file now. Caller is
-	 * supposed to hold locks serializing us with truncate / punch hole so
-	 * this is a reliable test.
-	 */
-	pgoff = linear_page_index(vma, pmd_addr);
-	max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT;
-
-	if (pgoff > max_pgoff)
-		return VM_FAULT_SIGBUS;
+	if (pgoff > max_pgoff) {
+		result = VM_FAULT_SIGBUS;
+		goto out;
+	}
 
 	/* If the PMD would extend beyond the file size */
 	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto fallback;
-
-	/*
 	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
 	 * to look up our filesystem block.
@@ -1294,52 +1390,61 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		goto fallback;
+
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
-	vmf.pgoff = pgoff;
-	vmf.flags = flags;
-	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto finish_iomap;
 
 	switch (iomap.type) {
 	case IOMAP_MAPPED:
-		result = dax_pmd_insert_mapping(vma, pmd, &vmf, address,
-				&iomap, pos, write, &entry);
+		result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto finish_iomap;
-		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
-				&entry);
+			goto unlock_entry;
+		result = dax_pmd_load_hole(vmf, &iomap, &entry);
 		break;
 	default:
 		WARN_ON_ONCE(1);
 		break;
 	}
 
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (result == VM_FAULT_FALLBACK) {
-			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-					iomap_flags, &iomap);
-			if (error)
-				result = VM_FAULT_FALLBACK;
-		}
+		int copied = PMD_SIZE;
+
+		if (result == VM_FAULT_FALLBACK)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PMD we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+				&iomap);
 	}
- unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
-		split_huge_pmd(vma, pmd, address);
+		split_huge_pmd(vma, vmf->pmd, vmf->address);
 		count_vm_event(THP_FAULT_FALLBACK);
 	}
+out:
+	trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result);
 	return result;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault);
 #endif /* CONFIG_FS_DAX_PMD */
-#endif /* CONFIG_FS_IOMAP */
diff --git a/fs/dcache.c b/fs/dcache.c
index 769903dbc19d..95d71eda8142 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1336,8 +1336,11 @@ int d_set_mounted(struct dentry *dentry)
 	}
 	spin_lock(&dentry->d_lock);
 	if (!d_unlinked(dentry)) {
-		dentry->d_flags |= DCACHE_MOUNTED;
-		ret = 0;
+		ret = -EBUSY;
+		if (!d_mountpoint(dentry)) {
+			dentry->d_flags |= DCACHE_MOUNTED;
+			ret = 0;
+		}
 	}
  	spin_unlock(&dentry->d_lock);
 out:
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index f17fcf89e18e..7fd4ec4bb214 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -187,9 +187,9 @@ static const struct super_operations debugfs_super_operations = {
 
 static struct vfsmount *debugfs_automount(struct path *path)
 {
-	struct vfsmount *(*f)(void *);
-	f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
-	return f(d_inode(path->dentry)->i_private);
+	debugfs_automount_t f;
+	f = (debugfs_automount_t)path->dentry->d_fsdata;
+	return f(path->dentry, d_inode(path->dentry)->i_private);
 }
 
 static const struct dentry_operations debugfs_dops = {
@@ -248,6 +248,42 @@ static struct file_system_type debug_fs_type = {
 };
 MODULE_ALIAS_FS("debugfs");
 
+/**
+ * debugfs_lookup() - look up an existing debugfs file
+ * @name: a pointer to a string containing the name of the file to look up.
+ * @parent: a pointer to the parent dentry of the file.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  If the file
+ * doesn't exist or an error occurs, %NULL will be returned.  The returned
+ * dentry must be passed to dput() when it is no longer needed.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
+{
+	struct dentry *dentry;
+
+	if (IS_ERR(parent))
+		return NULL;
+
+	if (!parent)
+		parent = debugfs_mount->mnt_root;
+
+	inode_lock(d_inode(parent));
+	dentry = lookup_one_len(name, parent, strlen(name));
+	inode_unlock(d_inode(parent));
+
+	if (IS_ERR(dentry))
+		return NULL;
+	if (!d_really_is_positive(dentry)) {
+		dput(dentry);
+		return NULL;
+	}
+	return dentry;
+}
+EXPORT_SYMBOL_GPL(debugfs_lookup);
+
 static struct dentry *start_creating(const char *name, struct dentry *parent)
 {
 	struct dentry *dentry;
@@ -504,7 +540,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
  */
 struct dentry *debugfs_create_automount(const char *name,
 					struct dentry *parent,
-					struct vfsmount *(*f)(void *),
+					debugfs_automount_t f,
 					void *data)
 {
 	struct dentry *dentry = start_creating(name, parent);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index aeae8c063451..c87bae4376b8 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -906,6 +906,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 			struct buffer_head *map_bh)
 {
 	const unsigned blkbits = sdio->blkbits;
+	const unsigned i_blkbits = blkbits + sdio->blkfactor;
 	int ret = 0;
 
 	while (sdio->block_in_file < sdio->final_block_in_request) {
@@ -949,7 +950,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
 					clean_bdev_aliases(
 						map_bh->b_bdev,
 						map_bh->b_blocknr,
-						map_bh->b_size >> blkbits);
+						map_bh->b_size >> i_blkbits);
 				}
 
 				if (!sdio->blkfactor)
diff --git a/fs/exec.c b/fs/exec.c
index e57946610733..698a86094f76 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1426,12 +1426,8 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
 	struct task_struct *p = current, *t;
 	unsigned n_fs;
 
-	if (p->ptrace) {
-		if (ptracer_capable(p, current_user_ns()))
-			bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
-		else
-			bprm->unsafe |= LSM_UNSAFE_PTRACE;
-	}
+	if (p->ptrace)
+		bprm->unsafe |= LSM_UNSAFE_PTRACE;
 
 	/*
 	 * This isn't strictly necessary, but it makes it harder for LSMs to
@@ -1479,7 +1475,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
 	if (task_no_new_privs(current))
 		return;
 
-	inode = file_inode(bprm->file);
+	inode = bprm->file->f_path.dentry->d_inode;
 	mode = READ_ONCE(inode->i_mode);
 	if (!(mode & (S_ISUID|S_ISGID)))
 		return;
diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c
index 5e6a2c0a1f0b..1f7d5e46cdda 100644
--- a/fs/exofs/sys.c
+++ b/fs/exofs/sys.c
@@ -122,7 +122,7 @@ void exofs_sysfs_dbg_print(void)
 	list_for_each_entry_safe(k_name, k_tmp, &exofs_kset->list, entry) {
 		printk(KERN_INFO "%s: name %s ref %d\n",
 			__func__, kobject_name(k_name),
-			(int)atomic_read(&k_name->kref.refcount));
+			(int)kref_read(&k_name->kref));
 	}
 #endif
 }
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 36bea5adcaba..c634874e12d9 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,6 +1,5 @@
 config EXT2_FS
 	tristate "Second extended fs support"
-	select FS_IOMAP if FS_DAX
 	help
 	  Ext2 is a standard Linux file system for hard disks.
 
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 37e2be784ac7..5e64de9c5093 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,7 +814,7 @@ extern const struct file_operations ext2_file_operations;
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
-extern struct iomap_ops ext2_iomap_ops;
+extern const struct iomap_ops ext2_iomap_ops;
 
 /* namei.c */
 extern const struct inode_operations ext2_dir_inode_operations;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0093ea2512a8..128cce540645 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -751,9 +751,8 @@ static int ext2_get_blocks(struct inode *inode,
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
 		}
-	} else {
-		*new = true;
 	}
+	*new = true;
 
 	ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
 	mutex_unlock(&ei->truncate_mutex);
@@ -843,13 +842,13 @@ ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length,
 	return 0;
 }
 
-struct iomap_ops ext2_iomap_ops = {
+const struct iomap_ops ext2_iomap_ops = {
 	.iomap_begin		= ext2_iomap_begin,
 	.iomap_end		= ext2_iomap_end,
 };
 #else
 /* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */
-struct iomap_ops ext2_iomap_ops;
+const struct iomap_ops ext2_iomap_ops;
 #endif /* CONFIG_FS_DAX */
 
 int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 7b90691e98c4..e38039fd96ff 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -37,7 +37,6 @@ config EXT4_FS
 	select CRC16
 	select CRYPTO
 	select CRYPTO_CRC32C
-	select FS_IOMAP if FS_DAX
 	help
 	  This is the next generation of the ext3 filesystem.
 
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2163c1e69f2a..cee23b684f47 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -32,7 +32,11 @@
 #include <linux/percpu_counter.h>
 #include <linux/ratelimit.h>
 #include <crypto/hash.h>
-#include <linux/fscrypto.h>
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
+#include <linux/fscrypt_supp.h>
+#else
+#include <linux/fscrypt_notsupp.h>
+#endif
 #include <linux/falloc.h>
 #include <linux/percpu-rwsem.h>
 #ifdef __KERNEL__
@@ -679,6 +683,16 @@ struct fsxattr {
 #define EXT4_IOC_FSGETXATTR		FS_IOC_FSGETXATTR
 #define EXT4_IOC_FSSETXATTR		FS_IOC_FSSETXATTR
 
+#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
+
+/*
+ * Flags for going down operation
+ */
+#define EXT4_GOING_FLAGS_DEFAULT		0x0	/* going down */
+#define EXT4_GOING_FLAGS_LOGFLUSH		0x1	/* flush log but not data */
+#define EXT4_GOING_FLAGS_NOLOGFLUSH		0x2	/* don't flush log nor data */
+
+
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
@@ -1343,11 +1357,6 @@ struct ext4_super_block {
 /* Number of quota types we support */
 #define EXT4_MAXQUOTAS 3
 
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-#define EXT4_KEY_DESC_PREFIX "ext4:"
-#define EXT4_KEY_DESC_PREFIX_SIZE 5
-#endif
-
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -1404,8 +1413,7 @@ struct ext4_sb_info {
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
-	unsigned long s_resize_flags;		/* Flags indicating if there
-						   is a resizer */
+	unsigned long s_ext4_flags;		/* Ext4 superblock flags */
 	unsigned long s_commit_interval;
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
@@ -1517,12 +1525,6 @@ struct ext4_sb_info {
 
 	/* Barrier between changing inodes' journal flags and writepages ops. */
 	struct percpu_rw_semaphore s_journal_flag_rwsem;
-
-	/* Encryption support */
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	u8 key_prefix[EXT4_KEY_DESC_PREFIX_SIZE];
-	u8 key_prefix_size;
-#endif
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1845,6 +1847,18 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
 }
 
 /*
+ * Superblock flags
+ */
+#define EXT4_FLAGS_RESIZING	0
+#define EXT4_FLAGS_SHUTDOWN	1
+
+static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
+{
+	return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+}
+
+
+/*
  * Default values for user and/or group using reserved blocks
  */
 #define	EXT4_DEF_RESUID		0
@@ -2320,28 +2334,6 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
 }
 static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
 
-#define fscrypt_set_d_op(i)
-#define fscrypt_get_ctx			fscrypt_notsupp_get_ctx
-#define fscrypt_release_ctx		fscrypt_notsupp_release_ctx
-#define fscrypt_encrypt_page		fscrypt_notsupp_encrypt_page
-#define fscrypt_decrypt_page		fscrypt_notsupp_decrypt_page
-#define fscrypt_decrypt_bio_pages	fscrypt_notsupp_decrypt_bio_pages
-#define fscrypt_pullback_bio_page	fscrypt_notsupp_pullback_bio_page
-#define fscrypt_restore_control_page	fscrypt_notsupp_restore_control_page
-#define fscrypt_zeroout_range		fscrypt_notsupp_zeroout_range
-#define fscrypt_ioctl_set_policy	fscrypt_notsupp_ioctl_set_policy
-#define fscrypt_ioctl_get_policy	fscrypt_notsupp_ioctl_get_policy
-#define fscrypt_has_permitted_context	fscrypt_notsupp_has_permitted_context
-#define fscrypt_inherit_context		fscrypt_notsupp_inherit_context
-#define fscrypt_get_encryption_info	fscrypt_notsupp_get_encryption_info
-#define fscrypt_put_encryption_info	fscrypt_notsupp_put_encryption_info
-#define fscrypt_setup_filename		fscrypt_notsupp_setup_filename
-#define fscrypt_free_filename		fscrypt_notsupp_free_filename
-#define fscrypt_fname_encrypted_size	fscrypt_notsupp_fname_encrypted_size
-#define fscrypt_fname_alloc_buffer	fscrypt_notsupp_fname_alloc_buffer
-#define fscrypt_fname_free_buffer	fscrypt_notsupp_fname_free_buffer
-#define fscrypt_fname_disk_to_usr	fscrypt_notsupp_fname_disk_to_usr
-#define fscrypt_fname_usr_to_disk	fscrypt_notsupp_fname_usr_to_disk
 #endif
 
 /* dir.c */
@@ -3034,7 +3026,7 @@ extern int ext4_inline_data_fiemap(struct inode *inode,
 extern int ext4_try_to_evict_inline_data(handle_t *handle,
 					 struct inode *inode,
 					 int needed);
-extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
+extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline);
 
 extern int ext4_convert_inline_data(struct inode *inode);
 
@@ -3228,7 +3220,6 @@ static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
 					    EXT4_WQ_HASH_SZ])
 extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
 
-#define EXT4_RESIZING	0
 extern int ext4_resize_begin(struct super_block *sb);
 extern void ext4_resize_end(struct super_block *sb);
 
@@ -3253,7 +3244,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 	}
 }
 
-extern struct iomap_ops ext4_iomap_ops;
+extern const struct iomap_ops ext4_iomap_ops;
 
 #endif	/* __KERNEL__ */
 
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index e770c1ee4613..dd106b1d5d89 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -43,6 +43,10 @@ static int ext4_journal_check_start(struct super_block *sb)
 	journal_t *journal;
 
 	might_sleep();
+
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+		return -EIO;
+
 	if (sb->s_flags & MS_RDONLY)
 		return -EROFS;
 	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
@@ -161,6 +165,13 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
 	might_sleep();
 
 	if (ext4_handle_valid(handle)) {
+		struct super_block *sb;
+
+		sb = handle->h_transaction->t_journal->j_private;
+		if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) {
+			jbd2_journal_abort_handle(handle);
+			return -EIO;
+		}
 		err = jbd2_journal_get_write_access(handle, bh);
 		if (err)
 			ext4_journal_abort_handle(where, line, __func__, bh,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 3e295d3350a9..2a97dff87b96 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5334,7 +5334,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	ext4_lblk_t stop, *iterator, ex_start, ex_end;
 
 	/* Let path point to the last extent */
-	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
+	path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL,
+				EXT4_EX_NOCACHE);
 	if (IS_ERR(path))
 		return PTR_ERR(path);
 
@@ -5343,15 +5344,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	if (!extent)
 		goto out;
 
-	stop = le32_to_cpu(extent->ee_block) +
-			ext4_ext_get_actual_len(extent);
+	stop = le32_to_cpu(extent->ee_block);
 
        /*
 	 * In case of left shift, Don't start shifting extents until we make
 	 * sure the hole is big enough to accommodate the shift.
 	*/
 	if (SHIFT == SHIFT_LEFT) {
-		path = ext4_find_extent(inode, start - 1, &path, 0);
+		path = ext4_find_extent(inode, start - 1, &path,
+					EXT4_EX_NOCACHE);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
 		depth = path->p_depth;
@@ -5383,9 +5384,14 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	else
 		iterator = &stop;
 
-	/* Its safe to start updating extents */
-	while (start < stop) {
-		path = ext4_find_extent(inode, *iterator, &path, 0);
+	/*
+	 * Its safe to start updating extents.  Start and stop are unsigned, so
+	 * in case of right shift if extent with 0 block is reached, iterator
+	 * becomes NULL to indicate the end of the loop.
+	 */
+	while (iterator && start <= stop) {
+		path = ext4_find_extent(inode, *iterator, &path,
+					EXT4_EX_NOCACHE);
 		if (IS_ERR(path))
 			return PTR_ERR(path);
 		depth = path->p_depth;
@@ -5412,8 +5418,11 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 					ext4_ext_get_actual_len(extent);
 		} else {
 			extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
-			*iterator =  le32_to_cpu(extent->ee_block) > 0 ?
-				le32_to_cpu(extent->ee_block) - 1 : 0;
+			if (le32_to_cpu(extent->ee_block) > 0)
+				*iterator = le32_to_cpu(extent->ee_block) - 1;
+			else
+				/* Beginning is reached, end of the loop */
+				iterator = NULL;
 			/* Update path extent in case we need to stop */
 			while (le32_to_cpu(extent->ee_block) < start)
 				extent++;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b5f184493c57..13021a054fc0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -57,6 +57,9 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
 static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(file_inode(iocb->ki_filp)->i_sb))))
+		return -EIO;
+
 	if (!iov_iter_count(to))
 		return 0; /* skip atime */
 
@@ -175,7 +178,6 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 	ssize_t ret;
-	bool overwrite = false;
 
 	inode_lock(inode);
 	ret = ext4_write_checks(iocb, from);
@@ -188,16 +190,9 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret)
 		goto out;
 
-	if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
-		overwrite = true;
-		downgrade_write(&inode->i_rwsem);
-	}
 	ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
 out:
-	if (!overwrite)
-		inode_unlock(inode);
-	else
-		inode_unlock_shared(inode);
+	inode_unlock(inode);
 	if (ret > 0)
 		ret = generic_write_sync(iocb, ret);
 	return ret;
@@ -213,6 +208,9 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	int overwrite = 0;
 	ssize_t ret;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 #ifdef CONFIG_FS_DAX
 	if (IS_DAX(inode))
 		return ext4_dax_write_iter(iocb, from);
@@ -258,7 +256,6 @@ out:
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int result;
-	handle_t *handle = NULL;
 	struct inode *inode = file_inode(vma->vm_file);
 	struct super_block *sb = inode->i_sb;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -266,61 +263,33 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
-						EXT4_DATA_TRANS_BLOCKS(sb));
-	} else
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-
-	if (IS_ERR(handle))
-		result = VM_FAULT_SIGBUS;
-	else
-		result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
-
-	if (write) {
-		if (!IS_ERR(handle))
-			ext4_journal_stop(handle);
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+	}
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+	if (write)
 		sb_end_pagefault(sb);
-	} else
-		up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return result;
 }
 
-static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
-						pmd_t *pmd, unsigned int flags)
+static int
+ext4_dax_pmd_fault(struct vm_fault *vmf)
 {
 	int result;
-	handle_t *handle = NULL;
-	struct inode *inode = file_inode(vma->vm_file);
+	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct super_block *sb = inode->i_sb;
-	bool write = flags & FAULT_FLAG_WRITE;
+	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	if (write) {
 		sb_start_pagefault(sb);
-		file_update_time(vma->vm_file);
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
-				ext4_chunk_trans_blocks(inode,
-							PMD_SIZE / PAGE_SIZE));
-	} else
-		down_read(&EXT4_I(inode)->i_mmap_sem);
-
-	if (IS_ERR(handle))
-		result = VM_FAULT_SIGBUS;
-	else {
-		result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
-					     &ext4_iomap_ops);
+		file_update_time(vmf->vma->vm_file);
 	}
-
-	if (write) {
-		if (!IS_ERR(handle))
-			ext4_journal_stop(handle);
-		up_read(&EXT4_I(inode)->i_mmap_sem);
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	result = dax_iomap_pmd_fault(vmf, &ext4_iomap_ops);
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+	if (write)
 		sb_end_pagefault(sb);
-	} else
-		up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return result;
 }
@@ -376,6 +345,9 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_mapping->host;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	if (ext4_encrypted_inode(inode)) {
 		int err = fscrypt_get_encryption_info(inode);
 		if (err)
@@ -403,6 +375,9 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 	char buf[64], *cp;
 	int ret;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
 		     !(sb->s_flags & MS_RDONLY))) {
 		sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 88effb1053c7..9d549608fd30 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -100,6 +100,9 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	tid_t commit_tid;
 	bool needs_barrier = false;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	J_ASSERT(ext4_journal_current_handle() == NULL);
 
 	trace_ext4_sync_file_enter(file, datasync);
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index e026aa941fd5..38b8a96eb97c 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -10,7 +10,8 @@
  */
 
 #include <linux/fs.h>
-#include <linux/cryptohash.h>
+#include <linux/compiler.h>
+#include <linux/bitops.h>
 #include "ext4.h"
 
 #define DELTA 0x9E3779B9
@@ -32,6 +33,74 @@ static void TEA_transform(__u32 buf[4], __u32 const in[])
 	buf[1] += b1;
 }
 
+/* F, G and H are basic MD4 functions: selection, majority, parity */
+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+
+/*
+ * The generic round function.  The application is so specific that
+ * we don't bother protecting all the arguments with parens, as is generally
+ * good macro practice, in favor of extra legibility.
+ * Rotation is separate from addition to prevent recomputation
+ */
+#define ROUND(f, a, b, c, d, x, s)	\
+	(a += f(b, c, d) + x, a = rol32(a, s))
+#define K1 0
+#define K2 013240474631UL
+#define K3 015666365641UL
+
+/*
+ * Basic cut-down MD4 transform.  Returns only 32 bits of result.
+ */
+static __u32 half_md4_transform(__u32 buf[4], __u32 const in[8])
+{
+	__u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
+
+	/* Round 1 */
+	ROUND(F, a, b, c, d, in[0] + K1,  3);
+	ROUND(F, d, a, b, c, in[1] + K1,  7);
+	ROUND(F, c, d, a, b, in[2] + K1, 11);
+	ROUND(F, b, c, d, a, in[3] + K1, 19);
+	ROUND(F, a, b, c, d, in[4] + K1,  3);
+	ROUND(F, d, a, b, c, in[5] + K1,  7);
+	ROUND(F, c, d, a, b, in[6] + K1, 11);
+	ROUND(F, b, c, d, a, in[7] + K1, 19);
+
+	/* Round 2 */
+	ROUND(G, a, b, c, d, in[1] + K2,  3);
+	ROUND(G, d, a, b, c, in[3] + K2,  5);
+	ROUND(G, c, d, a, b, in[5] + K2,  9);
+	ROUND(G, b, c, d, a, in[7] + K2, 13);
+	ROUND(G, a, b, c, d, in[0] + K2,  3);
+	ROUND(G, d, a, b, c, in[2] + K2,  5);
+	ROUND(G, c, d, a, b, in[4] + K2,  9);
+	ROUND(G, b, c, d, a, in[6] + K2, 13);
+
+	/* Round 3 */
+	ROUND(H, a, b, c, d, in[3] + K3,  3);
+	ROUND(H, d, a, b, c, in[7] + K3,  9);
+	ROUND(H, c, d, a, b, in[2] + K3, 11);
+	ROUND(H, b, c, d, a, in[6] + K3, 15);
+	ROUND(H, a, b, c, d, in[1] + K3,  3);
+	ROUND(H, d, a, b, c, in[5] + K3,  9);
+	ROUND(H, c, d, a, b, in[0] + K3, 11);
+	ROUND(H, b, c, d, a, in[4] + K3, 15);
+
+	buf[0] += a;
+	buf[1] += b;
+	buf[2] += c;
+	buf[3] += d;
+
+	return buf[1]; /* "most hashed" word */
+}
+#undef ROUND
+#undef K1
+#undef K2
+#undef K3
+#undef F
+#undef G
+#undef H
 
 /* The old legacy hash */
 static __u32 dx_hack_hash_unsigned(const char *name, int len)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e57e8d90ea54..b14bae2598bc 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -764,6 +764,9 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 	if (!dir || !dir->i_nlink)
 		return ERR_PTR(-EPERM);
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+		return ERR_PTR(-EIO);
+
 	if ((ext4_encrypted_inode(dir) ||
 	     DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) &&
 	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
@@ -771,7 +774,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		if (err)
 			return ERR_PTR(err);
 		if (!fscrypt_has_encryption_key(dir))
-			return ERR_PTR(-EPERM);
+			return ERR_PTR(-ENOKEY);
 		if (!handle)
 			nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb);
 		encrypt = 1;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 437df6a1a841..30a9f210d1e3 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -215,6 +215,9 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
 	struct ext4_inode *raw_inode;
 	int cp_len = 0;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return;
+
 	BUG_ON(!EXT4_I(inode)->i_inline_off);
 	BUG_ON(pos + len > EXT4_I(inode)->i_inline_size);
 
@@ -381,7 +384,7 @@ out:
 static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
 				    unsigned int len)
 {
-	int ret, size;
+	int ret, size, no_expand;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
 	if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
@@ -391,15 +394,14 @@ static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
 	if (size < len)
 		return -ENOSPC;
 
-	down_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_lock_xattr(inode, &no_expand);
 
 	if (ei->i_inline_off)
 		ret = ext4_update_inline_data(handle, inode, len);
 	else
 		ret = ext4_create_inline_data(handle, inode, len);
 
-	up_write(&EXT4_I(inode)->xattr_sem);
-
+	ext4_write_unlock_xattr(inode, &no_expand);
 	return ret;
 }
 
@@ -533,7 +535,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
 					      struct inode *inode,
 					      unsigned flags)
 {
-	int ret, needed_blocks;
+	int ret, needed_blocks, no_expand;
 	handle_t *handle = NULL;
 	int retries = 0, sem_held = 0;
 	struct page *page = NULL;
@@ -573,7 +575,7 @@ retry:
 		goto out;
 	}
 
-	down_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_lock_xattr(inode, &no_expand);
 	sem_held = 1;
 	/* If some one has already done this for us, just exit. */
 	if (!ext4_has_inline_data(inode)) {
@@ -610,7 +612,7 @@ retry:
 		put_page(page);
 		page = NULL;
 		ext4_orphan_add(handle, inode);
-		up_write(&EXT4_I(inode)->xattr_sem);
+		ext4_write_unlock_xattr(inode, &no_expand);
 		sem_held = 0;
 		ext4_journal_stop(handle);
 		handle = NULL;
@@ -636,7 +638,7 @@ out:
 		put_page(page);
 	}
 	if (sem_held)
-		up_write(&EXT4_I(inode)->xattr_sem);
+		ext4_write_unlock_xattr(inode, &no_expand);
 	if (handle)
 		ext4_journal_stop(handle);
 	brelse(iloc.bh);
@@ -729,7 +731,7 @@ convert:
 int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
 			       unsigned copied, struct page *page)
 {
-	int ret;
+	int ret, no_expand;
 	void *kaddr;
 	struct ext4_iloc iloc;
 
@@ -747,7 +749,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
 		goto out;
 	}
 
-	down_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_lock_xattr(inode, &no_expand);
 	BUG_ON(!ext4_has_inline_data(inode));
 
 	kaddr = kmap_atomic(page);
@@ -757,7 +759,7 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
 	/* clear page dirty so that writepages wouldn't work for us. */
 	ClearPageDirty(page);
 
-	up_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_unlock_xattr(inode, &no_expand);
 	brelse(iloc.bh);
 out:
 	return copied;
@@ -768,7 +770,7 @@ ext4_journalled_write_inline_data(struct inode *inode,
 				  unsigned len,
 				  struct page *page)
 {
-	int ret;
+	int ret, no_expand;
 	void *kaddr;
 	struct ext4_iloc iloc;
 
@@ -778,11 +780,11 @@ ext4_journalled_write_inline_data(struct inode *inode,
 		return NULL;
 	}
 
-	down_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_lock_xattr(inode, &no_expand);
 	kaddr = kmap_atomic(page);
 	ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
 	kunmap_atomic(kaddr);
-	up_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_unlock_xattr(inode, &no_expand);
 
 	return iloc.bh;
 }
@@ -944,8 +946,15 @@ int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
 				  struct page *page)
 {
 	int i_size_changed = 0;
+	int ret;
 
-	copied = ext4_write_inline_data_end(inode, pos, len, copied, page);
+	ret = ext4_write_inline_data_end(inode, pos, len, copied, page);
+	if (ret < 0) {
+		unlock_page(page);
+		put_page(page);
+		return ret;
+	}
+	copied = ret;
 
 	/*
 	 * No need to use i_size_read() here, the i_size
@@ -1043,7 +1052,6 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
 	dir->i_mtime = dir->i_ctime = current_time(dir);
 	ext4_update_dx_flag(dir);
 	dir->i_version++;
-	ext4_mark_inode_dirty(handle, dir);
 	return 1;
 }
 
@@ -1259,7 +1267,7 @@ out:
 int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
 			      struct inode *dir, struct inode *inode)
 {
-	int ret, inline_size;
+	int ret, inline_size, no_expand;
 	void *inline_start;
 	struct ext4_iloc iloc;
 
@@ -1267,7 +1275,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
 	if (ret)
 		return ret;
 
-	down_write(&EXT4_I(dir)->xattr_sem);
+	ext4_write_lock_xattr(dir, &no_expand);
 	if (!ext4_has_inline_data(dir))
 		goto out;
 
@@ -1312,8 +1320,8 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
 	ret = ext4_convert_inline_data_nolock(handle, dir, &iloc);
 
 out:
+	ext4_write_unlock_xattr(dir, &no_expand);
 	ext4_mark_inode_dirty(handle, dir);
-	up_write(&EXT4_I(dir)->xattr_sem);
 	brelse(iloc.bh);
 	return ret;
 }
@@ -1673,7 +1681,7 @@ int ext4_delete_inline_entry(handle_t *handle,
 			     struct buffer_head *bh,
 			     int *has_inline_data)
 {
-	int err, inline_size;
+	int err, inline_size, no_expand;
 	struct ext4_iloc iloc;
 	void *inline_start;
 
@@ -1681,7 +1689,7 @@ int ext4_delete_inline_entry(handle_t *handle,
 	if (err)
 		return err;
 
-	down_write(&EXT4_I(dir)->xattr_sem);
+	ext4_write_lock_xattr(dir, &no_expand);
 	if (!ext4_has_inline_data(dir)) {
 		*has_inline_data = 0;
 		goto out;
@@ -1709,13 +1717,11 @@ int ext4_delete_inline_entry(handle_t *handle,
 	if (err)
 		goto out;
 
-	err = ext4_mark_inode_dirty(handle, dir);
-	if (unlikely(err))
-		goto out;
-
 	ext4_show_inline_dir(dir, iloc.bh, inline_start, inline_size);
 out:
-	up_write(&EXT4_I(dir)->xattr_sem);
+	ext4_write_unlock_xattr(dir, &no_expand);
+	if (likely(err == 0))
+		err = ext4_mark_inode_dirty(handle, dir);
 	brelse(iloc.bh);
 	if (err != -ENOENT)
 		ext4_std_error(dir->i_sb, err);
@@ -1814,11 +1820,11 @@ out:
 
 int ext4_destroy_inline_data(handle_t *handle, struct inode *inode)
 {
-	int ret;
+	int ret, no_expand;
 
-	down_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_lock_xattr(inode, &no_expand);
 	ret = ext4_destroy_inline_data_nolock(handle, inode);
-	up_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_unlock_xattr(inode, &no_expand);
 
 	return ret;
 }
@@ -1900,10 +1906,10 @@ out:
 	return error;
 }
 
-void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
+int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 {
 	handle_t *handle;
-	int inline_size, value_len, needed_blocks;
+	int inline_size, value_len, needed_blocks, no_expand, err = 0;
 	size_t i_size;
 	void *value = NULL;
 	struct ext4_xattr_ibody_find is = {
@@ -1918,19 +1924,19 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 	needed_blocks = ext4_writepage_trans_blocks(inode);
 	handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks);
 	if (IS_ERR(handle))
-		return;
+		return PTR_ERR(handle);
 
-	down_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_lock_xattr(inode, &no_expand);
 	if (!ext4_has_inline_data(inode)) {
 		*has_inline = 0;
 		ext4_journal_stop(handle);
-		return;
+		return 0;
 	}
 
-	if (ext4_orphan_add(handle, inode))
+	if ((err = ext4_orphan_add(handle, inode)) != 0)
 		goto out;
 
-	if (ext4_get_inode_loc(inode, &is.iloc))
+	if ((err = ext4_get_inode_loc(inode, &is.iloc)) != 0)
 		goto out;
 
 	down_write(&EXT4_I(inode)->i_data_sem);
@@ -1941,24 +1947,29 @@ void ext4_inline_data_truncate(struct inode *inode, int *has_inline)
 	if (i_size < inline_size) {
 		/* Clear the content in the xattr space. */
 		if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
-			if (ext4_xattr_ibody_find(inode, &i, &is))
+			if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0)
 				goto out_error;
 
 			BUG_ON(is.s.not_found);
 
 			value_len = le32_to_cpu(is.s.here->e_value_size);
 			value = kmalloc(value_len, GFP_NOFS);
-			if (!value)
+			if (!value) {
+				err = -ENOMEM;
 				goto out_error;
+			}
 
-			if (ext4_xattr_ibody_get(inode, i.name_index, i.name,
-						value, value_len))
+			err = ext4_xattr_ibody_get(inode, i.name_index,
+						   i.name, value, value_len);
+			if (err <= 0)
 				goto out_error;
 
 			i.value = value;
 			i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
 					i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
-			if (ext4_xattr_ibody_inline_set(handle, inode, &i, &is))
+			err = ext4_xattr_ibody_inline_set(handle, inode,
+							  &i, &is);
+			if (err)
 				goto out_error;
 		}
 
@@ -1978,23 +1989,24 @@ out_error:
 	up_write(&EXT4_I(inode)->i_data_sem);
 out:
 	brelse(is.iloc.bh);
-	up_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_unlock_xattr(inode, &no_expand);
 	kfree(value);
 	if (inode->i_nlink)
 		ext4_orphan_del(handle, inode);
 
-	inode->i_mtime = inode->i_ctime = current_time(inode);
-	ext4_mark_inode_dirty(handle, inode);
-	if (IS_SYNC(inode))
-		ext4_handle_sync(handle);
-
+	if (err == 0) {
+		inode->i_mtime = inode->i_ctime = current_time(inode);
+		err = ext4_mark_inode_dirty(handle, inode);
+		if (IS_SYNC(inode))
+			ext4_handle_sync(handle);
+	}
 	ext4_journal_stop(handle);
-	return;
+	return err;
 }
 
 int ext4_convert_inline_data(struct inode *inode)
 {
-	int error, needed_blocks;
+	int error, needed_blocks, no_expand;
 	handle_t *handle;
 	struct ext4_iloc iloc;
 
@@ -2016,15 +2028,10 @@ int ext4_convert_inline_data(struct inode *inode)
 		goto out_free;
 	}
 
-	down_write(&EXT4_I(inode)->xattr_sem);
-	if (!ext4_has_inline_data(inode)) {
-		up_write(&EXT4_I(inode)->xattr_sem);
-		goto out;
-	}
-
-	error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
-	up_write(&EXT4_I(inode)->xattr_sem);
-out:
+	ext4_write_lock_xattr(inode, &no_expand);
+	if (ext4_has_inline_data(inode))
+		error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+	ext4_write_unlock_xattr(inode, &no_expand);
 	ext4_journal_stop(handle);
 out_free:
 	brelse(iloc.bh);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 88d57af1b516..75212a6e69f8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1189,6 +1189,9 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 	pgoff_t index;
 	unsigned from, to;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	trace_ext4_write_begin(inode, pos, len, flags);
 	/*
 	 * Reserve one block more for addition to orphan list in case
@@ -1330,8 +1333,11 @@ static int ext4_write_end(struct file *file,
 	if (ext4_has_inline_data(inode)) {
 		ret = ext4_write_inline_data_end(inode, pos, len,
 						 copied, page);
-		if (ret < 0)
+		if (ret < 0) {
+			unlock_page(page);
+			put_page(page);
 			goto errout;
+		}
 		copied = ret;
 	} else
 		copied = block_write_end(file, mapping, pos,
@@ -1385,7 +1391,9 @@ errout:
  * set the buffer to be dirty, since in data=journalled mode we need
  * to call ext4_handle_dirty_metadata() instead.
  */
-static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
+static void ext4_journalled_zero_new_buffers(handle_t *handle,
+					    struct page *page,
+					    unsigned from, unsigned to)
 {
 	unsigned int block_start = 0, block_end;
 	struct buffer_head *head, *bh;
@@ -1402,7 +1410,7 @@ static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
 					size = min(to, block_end) - start;
 
 					zero_user(page, start, size);
-					set_buffer_uptodate(bh);
+					write_end_fn(handle, bh);
 				}
 				clear_buffer_new(bh);
 			}
@@ -1431,18 +1439,25 @@ static int ext4_journalled_write_end(struct file *file,
 
 	BUG_ON(!ext4_handle_valid(handle));
 
-	if (ext4_has_inline_data(inode))
-		copied = ext4_write_inline_data_end(inode, pos, len,
-						    copied, page);
-	else {
-		if (copied < len) {
-			if (!PageUptodate(page))
-				copied = 0;
-			zero_new_buffers(page, from+copied, to);
+	if (ext4_has_inline_data(inode)) {
+		ret = ext4_write_inline_data_end(inode, pos, len,
+						 copied, page);
+		if (ret < 0) {
+			unlock_page(page);
+			put_page(page);
+			goto errout;
 		}
-
+		copied = ret;
+	} else if (unlikely(copied < len) && !PageUptodate(page)) {
+		copied = 0;
+		ext4_journalled_zero_new_buffers(handle, page, from, to);
+	} else {
+		if (unlikely(copied < len))
+			ext4_journalled_zero_new_buffers(handle, page,
+							 from + copied, to);
 		ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
-					     to, &partial, write_end_fn);
+					     from + copied, &partial,
+					     write_end_fn);
 		if (!partial)
 			SetPageUptodate(page);
 	}
@@ -1468,6 +1483,7 @@ static int ext4_journalled_write_end(struct file *file,
 		 */
 		ext4_orphan_add(handle, inode);
 
+errout:
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
@@ -2034,6 +2050,12 @@ static int ext4_writepage(struct page *page,
 	struct ext4_io_submit io_submit;
 	bool keep_towrite = false;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
+		ext4_invalidatepage(page, 0, PAGE_SIZE);
+		unlock_page(page);
+		return -EIO;
+	}
+
 	trace_ext4_writepage(page);
 	size = i_size_read(inode);
 	if (page->index == size >> PAGE_SHIFT)
@@ -2409,7 +2431,8 @@ static int mpage_map_and_submit_extent(handle_t *handle,
 		if (err < 0) {
 			struct super_block *sb = inode->i_sb;
 
-			if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			if (ext4_forced_shutdown(EXT4_SB(sb)) ||
+			    EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
 				goto invalidate_dirty_pages;
 			/*
 			 * Let the uper layers retry transient errors.
@@ -2464,8 +2487,8 @@ update_disksize:
 			disksize = i_size;
 		if (disksize > EXT4_I(inode)->i_disksize)
 			EXT4_I(inode)->i_disksize = disksize;
-		err2 = ext4_mark_inode_dirty(handle, inode);
 		up_write(&EXT4_I(inode)->i_data_sem);
+		err2 = ext4_mark_inode_dirty(handle, inode);
 		if (err2)
 			ext4_error(inode->i_sb,
 				   "Failed to mark inode %lu dirty",
@@ -2631,6 +2654,9 @@ static int ext4_writepages(struct address_space *mapping,
 	struct blk_plug plug;
 	bool give_up_on_write = false;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	percpu_down_read(&sbi->s_journal_flag_rwsem);
 	trace_ext4_writepages(inode, wbc);
 
@@ -2667,7 +2693,8 @@ static int ext4_writepages(struct address_space *mapping,
 	 * *never* be called, so if that ever happens, we would want
 	 * the stack trace.
 	 */
-	if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) ||
+		     sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) {
 		ret = -EROFS;
 		goto out_writepages;
 	}
@@ -2892,6 +2919,9 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	handle_t *handle;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	index = pos >> PAGE_SHIFT;
 
 	if (ext4_nonda_switch(inode->i_sb) ||
@@ -3420,7 +3450,7 @@ orphan_del:
 	return ret;
 }
 
-struct iomap_ops ext4_iomap_ops = {
+const struct iomap_ops ext4_iomap_ops = {
 	.iomap_begin		= ext4_iomap_begin,
 	.iomap_end		= ext4_iomap_end,
 };
@@ -3914,6 +3944,10 @@ static int ext4_block_truncate_page(handle_t *handle,
 	unsigned blocksize;
 	struct inode *inode = mapping->host;
 
+	/* If we are processing an encrypted inode during orphan list handling */
+	if (ext4_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode))
+		return 0;
+
 	blocksize = inode->i_sb->s_blocksize;
 	length = blocksize - (offset & (blocksize - 1));
 
@@ -4222,7 +4256,9 @@ int ext4_truncate(struct inode *inode)
 	if (ext4_has_inline_data(inode)) {
 		int has_inline = 1;
 
-		ext4_inline_data_truncate(inode, &has_inline);
+		err = ext4_inline_data_truncate(inode, &has_inline);
+		if (err)
+			return err;
 		if (has_inline)
 			return 0;
 	}
@@ -5197,6 +5233,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	int orphan = 0;
 	const unsigned int ia_valid = attr->ia_valid;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	error = setattr_prepare(dentry, attr);
 	if (error)
 		return error;
@@ -5483,6 +5522,9 @@ int ext4_mark_iloc_dirty(handle_t *handle,
 {
 	int err = 0;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	if (IS_I_VERSION(inode))
 		inode_inc_iversion(inode);
 
@@ -5506,6 +5548,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
 {
 	int err;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	err = ext4_get_inode_loc(inode, iloc);
 	if (!err) {
 		BUFFER_TRACE(iloc->bh, "get_write_access");
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d534399cf607..a4273ddb9922 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -16,6 +16,7 @@
 #include <linux/quotaops.h>
 #include <linux/uuid.h>
 #include <linux/uaccess.h>
+#include <linux/delay.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
 
@@ -442,6 +443,52 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
 	return iflags;
 }
 
+int ext4_shutdown(struct super_block *sb, unsigned long arg)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	__u32 flags;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (get_user(flags, (__u32 __user *)arg))
+		return -EFAULT;
+
+	if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH)
+		return -EINVAL;
+
+	if (ext4_forced_shutdown(sbi))
+		return 0;
+
+	ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags);
+
+	switch (flags) {
+	case EXT4_GOING_FLAGS_DEFAULT:
+		freeze_bdev(sb->s_bdev);
+		set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+		thaw_bdev(sb->s_bdev, sb);
+		break;
+	case EXT4_GOING_FLAGS_LOGFLUSH:
+		set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+		if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) {
+			(void) ext4_force_commit(sb);
+			jbd2_journal_abort(sbi->s_journal, 0);
+		}
+		break;
+	case EXT4_GOING_FLAGS_NOLOGFLUSH:
+		set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
+		if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) {
+			msleep(100);
+			jbd2_journal_abort(sbi->s_journal, 0);
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+	clear_opt(sb, DISCARD);
+	return 0;
+}
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -893,6 +940,8 @@ resizefs_out:
 
 		return 0;
 	}
+	case EXT4_IOC_SHUTDOWN:
+		return ext4_shutdown(sb, arg);
 	default:
 		return -ENOTTY;
 	}
@@ -959,6 +1008,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_SET_ENCRYPTION_POLICY:
 	case EXT4_IOC_GET_ENCRYPTION_PWSALT:
 	case EXT4_IOC_GET_ENCRYPTION_POLICY:
+	case EXT4_IOC_SHUTDOWN:
 		break;
 	default:
 		return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 7ae43c59bc79..10c62de642c6 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1556,7 +1556,17 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
 		ex->fe_len += 1 << order;
 	}
 
-	BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
+	if (ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))) {
+		/* Should never happen! (but apparently sometimes does?!?) */
+		WARN_ON(1);
+		ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent "
+			   "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
+			   block, order, needed, ex->fe_group, ex->fe_start,
+			   ex->fe_len, ex->fe_logical);
+		ex->fe_len = 0;
+		ex->fe_start = 0;
+		ex->fe_group = 0;
+	}
 	return ex->fe_len;
 }
 
@@ -2136,8 +2146,10 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	 * We search using buddy data only if the order of the request
 	 * is greater than equal to the sbi_s_mb_order2_reqs
 	 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
+	 * We also support searching for power-of-two requests only for
+	 * requests upto maximum buddy size we have constructed.
 	 */
-	if (i >= sbi->s_mb_order2_reqs) {
+	if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
 		/*
 		 * This should tell if fe_len is exactly power of 2
 		 */
@@ -2207,7 +2219,7 @@ repeat:
 			}
 
 			ac->ac_groups_scanned++;
-			if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
+			if (cr == 0)
 				ext4_mb_simple_scan_group(ac, &e4b);
 			else if (cr == 1 && sbi->s_stripe &&
 					!(ac->ac_g_ex.fe_len % sbi->s_stripe))
@@ -3123,6 +3135,13 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	if (ar->pright && start + size - 1 >= ar->lright)
 		size -= start + size - ar->lright;
 
+	/*
+	 * Trim allocation request for filesystems with artificially small
+	 * groups.
+	 */
+	if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
+		size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
+
 	end = start + size;
 
 	/* check we don't cross already preallocated blocks */
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index eadba919f26b..6ad612c576fc 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1378,6 +1378,8 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
 		return NULL;
 
 	retval = ext4_fname_setup_filename(dir, d_name, 1, &fname);
+	if (retval == -ENOENT)
+		return NULL;
 	if (retval)
 		return ERR_PTR(retval);
 
@@ -1616,13 +1618,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
 		    !fscrypt_has_permitted_context(dir, inode)) {
 			int nokey = ext4_encrypted_inode(inode) &&
 				!fscrypt_has_encryption_key(inode);
-			iput(inode);
-			if (nokey)
+			if (nokey) {
+				iput(inode);
 				return ERR_PTR(-ENOKEY);
+			}
 			ext4_warning(inode->i_sb,
 				     "Inconsistent encryption contexts: %lu/%lu",
 				     (unsigned long) dir->i_ino,
 				     (unsigned long) inode->i_ino);
+			iput(inode);
 			return ERR_PTR(-EPERM);
 		}
 	}
@@ -2935,6 +2939,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
 	struct ext4_dir_entry_2 *de;
 	handle_t *handle = NULL;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+		return -EIO;
+
 	/* Initialize quotas before so that eventual writes go in
 	 * separate transaction */
 	retval = dquot_initialize(dir);
@@ -3008,6 +3015,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 	struct ext4_dir_entry_2 *de;
 	handle_t *handle = NULL;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+		return -EIO;
+
 	trace_ext4_unlink_enter(dir, dentry);
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
@@ -3078,6 +3088,9 @@ static int ext4_symlink(struct inode *dir,
 	struct fscrypt_str disk_link;
 	struct fscrypt_symlink_data *sd = NULL;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb))))
+		return -EIO;
+
 	disk_link.len = len + 1;
 	disk_link.name = (char *) symname;
 
@@ -3088,7 +3101,7 @@ static int ext4_symlink(struct inode *dir,
 		if (err)
 			return err;
 		if (!fscrypt_has_encryption_key(dir))
-			return -EPERM;
+			return -ENOKEY;
 		disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
 				 sizeof(struct fscrypt_symlink_data));
 		sd = kzalloc(disk_link.len, GFP_KERNEL);
@@ -3525,6 +3538,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 			EXT4_I(old_dentry->d_inode)->i_projid)))
 		return -EXDEV;
 
+	if ((ext4_encrypted_inode(old_dir) &&
+	     !fscrypt_has_encryption_key(old_dir)) ||
+	    (ext4_encrypted_inode(new_dir) &&
+	     !fscrypt_has_encryption_key(new_dir)))
+		return -ENOKEY;
+
 	retval = dquot_initialize(old.dir);
 	if (retval)
 		return retval;
@@ -3725,6 +3744,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int retval;
 	struct timespec ctime;
 
+	if ((ext4_encrypted_inode(old_dir) &&
+	     !fscrypt_has_encryption_key(old_dir)) ||
+	    (ext4_encrypted_inode(new_dir) &&
+	     !fscrypt_has_encryption_key(new_dir)))
+		return -ENOKEY;
+
 	if ((ext4_encrypted_inode(old_dir) ||
 	     ext4_encrypted_inode(new_dir)) &&
 	    (old_dir != new_dir) &&
@@ -3858,6 +3883,9 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry,
 			struct inode *new_dir, struct dentry *new_dentry,
 			unsigned int flags)
 {
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb))))
+		return -EIO;
+
 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		return -EINVAL;
 
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index d83b0f3c5fe9..208241b06662 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -24,7 +24,6 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/backing-dev.h>
-#include <linux/fscrypto.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -158,7 +157,7 @@ static int ext4_end_io(ext4_io_end_t *io)
 
 	io->handle = NULL;	/* Following call will use up the handle */
 	ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
-	if (ret < 0) {
+	if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) {
 		ext4_msg(inode->i_sb, KERN_EMERG,
 			 "failed to convert unwritten extents to written "
 			 "extents -- potential data loss!  "
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index cf681004b196..c3ed9021b781 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -45,7 +45,8 @@ int ext4_resize_begin(struct super_block *sb)
 		return -EPERM;
 	}
 
-	if (test_and_set_bit_lock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags))
+	if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
+				  &EXT4_SB(sb)->s_ext4_flags))
 		ret = -EBUSY;
 
 	return ret;
@@ -53,7 +54,7 @@ int ext4_resize_begin(struct super_block *sb)
 
 void ext4_resize_end(struct super_block *sb)
 {
-	clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
+	clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags);
 	smp_mb__after_atomic();
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 66845a08a87a..2e03a0a88d92 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -438,6 +438,9 @@ void __ext4_error(struct super_block *sb, const char *function,
 	struct va_format vaf;
 	va_list args;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+		return;
+
 	if (ext4_error_ratelimit(sb)) {
 		va_start(args, fmt);
 		vaf.fmt = fmt;
@@ -459,6 +462,9 @@ void __ext4_error_inode(struct inode *inode, const char *function,
 	struct va_format vaf;
 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return;
+
 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 	es->s_last_error_block = cpu_to_le64(block);
 	if (ext4_error_ratelimit(inode->i_sb)) {
@@ -491,6 +497,9 @@ void __ext4_error_file(struct file *file, const char *function,
 	struct inode *inode = file_inode(file);
 	char pathname[80], *path;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return;
+
 	es = EXT4_SB(inode->i_sb)->s_es;
 	es->s_last_error_ino = cpu_to_le32(inode->i_ino);
 	if (ext4_error_ratelimit(inode->i_sb)) {
@@ -567,6 +576,9 @@ void __ext4_std_error(struct super_block *sb, const char *function,
 	char nbuf[16];
 	const char *errstr;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+		return;
+
 	/* Special case: if the error is EROFS, and we're not already
 	 * inside a transaction, then there's really no point in logging
 	 * an error. */
@@ -600,6 +612,9 @@ void __ext4_abort(struct super_block *sb, const char *function,
 	struct va_format vaf;
 	va_list args;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+		return;
+
 	save_error_info(sb, function, line);
 	va_start(args, fmt);
 	vaf.fmt = fmt;
@@ -695,6 +710,9 @@ __acquires(bitlock)
 	va_list args;
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+		return;
+
 	es->s_last_error_ino = cpu_to_le32(ino);
 	es->s_last_error_block = cpu_to_le64(block);
 	__save_error_info(sb, function, line);
@@ -825,6 +843,7 @@ static void ext4_put_super(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
+	int aborted = 0;
 	int i, err;
 
 	ext4_unregister_li_request(sb);
@@ -834,9 +853,10 @@ static void ext4_put_super(struct super_block *sb)
 	destroy_workqueue(sbi->rsv_conversion_wq);
 
 	if (sbi->s_journal) {
+		aborted = is_journal_aborted(sbi->s_journal);
 		err = jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
-		if (err < 0)
+		if ((err < 0) && !aborted)
 			ext4_abort(sb, "Couldn't clean up the journal");
 	}
 
@@ -847,7 +867,7 @@ static void ext4_put_super(struct super_block *sb)
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 
-	if (!(sb->s_flags & MS_RDONLY)) {
+	if (!(sb->s_flags & MS_RDONLY) && !aborted) {
 		ext4_clear_feature_journal_needs_recovery(sb);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
 	}
@@ -1100,12 +1120,6 @@ static int ext4_get_context(struct inode *inode, void *ctx, size_t len)
 				 EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len);
 }
 
-static int ext4_key_prefix(struct inode *inode, u8 **key)
-{
-	*key = EXT4_SB(inode->i_sb)->key_prefix;
-	return EXT4_SB(inode->i_sb)->key_prefix_size;
-}
-
 static int ext4_prepare_context(struct inode *inode)
 {
 	return ext4_convert_inline_data(inode);
@@ -1179,9 +1193,9 @@ static unsigned ext4_max_namelen(struct inode *inode)
 		EXT4_NAME_LEN;
 }
 
-static struct fscrypt_operations ext4_cryptops = {
+static const struct fscrypt_operations ext4_cryptops = {
+	.key_prefix		= "ext4:",
 	.get_context		= ext4_get_context,
-	.key_prefix		= ext4_key_prefix,
 	.prepare_context	= ext4_prepare_context,
 	.set_context		= ext4_set_context,
 	.dummy_context		= ext4_dummy_context,
@@ -1190,7 +1204,7 @@ static struct fscrypt_operations ext4_cryptops = {
 	.max_namelen		= ext4_max_namelen,
 };
 #else
-static struct fscrypt_operations ext4_cryptops = {
+static const struct fscrypt_operations ext4_cryptops = {
 	.is_encrypted		= ext4_encrypted_inode,
 };
 #endif
@@ -1290,7 +1304,7 @@ enum {
 	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
 	Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
-	Opt_lazytime, Opt_nolazytime,
+	Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
 	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
@@ -1358,6 +1372,7 @@ static const match_table_t tokens = {
 	{Opt_delalloc, "delalloc"},
 	{Opt_lazytime, "lazytime"},
 	{Opt_nolazytime, "nolazytime"},
+	{Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"},
 	{Opt_nodelalloc, "nodelalloc"},
 	{Opt_removed, "mblk_io_submit"},
 	{Opt_removed, "nomblk_io_submit"},
@@ -1563,6 +1578,7 @@ static const struct mount_opts {
 #endif
 	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
 	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
+	{Opt_debug_want_extra_isize, 0, MOPT_GTE0},
 	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
 	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
 							MOPT_SET | MOPT_Q},
@@ -1676,6 +1692,8 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		if (arg == 0)
 			arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
 		sbi->s_commit_interval = HZ * arg;
+	} else if (token == Opt_debug_want_extra_isize) {
+		sbi->s_want_extra_isize = arg;
 	} else if (token == Opt_max_batch_time) {
 		sbi->s_max_batch_time = arg;
 	} else if (token == Opt_min_batch_time) {
@@ -2619,9 +2637,9 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 
 	if (sbi->s_stripe && sbi->s_stripe <= sbi->s_blocks_per_group)
 		ret = sbi->s_stripe;
-	else if (stripe_width <= sbi->s_blocks_per_group)
+	else if (stripe_width && stripe_width <= sbi->s_blocks_per_group)
 		ret = stripe_width;
-	else if (stride <= sbi->s_blocks_per_group)
+	else if (stride && stride <= sbi->s_blocks_per_group)
 		ret = stride;
 	else
 		ret = 0;
@@ -3842,7 +3860,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
 		   EXT4_DESC_PER_BLOCK(sb);
 	if (ext4_has_feature_meta_bg(sb)) {
-		if (le32_to_cpu(es->s_first_meta_bg) >= db_count) {
+		if (le32_to_cpu(es->s_first_meta_bg) > db_count) {
 			ext4_msg(sb, KERN_WARNING,
 				 "first meta block group too large: %u "
 				 "(group descriptor block count %u)",
@@ -3925,7 +3943,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * root first: it may be modified in the journal!
 	 */
 	if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
-		if (ext4_load_journal(sb, es, journal_devnum))
+		err = ext4_load_journal(sb, es, journal_devnum);
+		if (err)
 			goto failed_mount3a;
 	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
 		   ext4_has_feature_journal_needs_recovery(sb)) {
@@ -4087,7 +4106,8 @@ no_journal:
 		sb->s_flags |= MS_RDONLY;
 
 	/* determine the minimum size of new large inodes, if present */
-	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
+	if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
+	    sbi->s_want_extra_isize == 0) {
 		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
 						     EXT4_GOOD_OLD_INODE_SIZE;
 		if (ext4_has_feature_extra_isize(sb)) {
@@ -4218,11 +4238,6 @@ no_journal:
 	ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10);
 
 	kfree(orig_data);
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-	memcpy(sbi->key_prefix, EXT4_KEY_DESC_PREFIX,
-				EXT4_KEY_DESC_PREFIX_SIZE);
-	sbi->key_prefix_size = EXT4_KEY_DESC_PREFIX_SIZE;
-#endif
 	return 0;
 
 cantfind_ext4:
@@ -4720,6 +4735,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 	bool needs_barrier = false;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(sb))))
+		return 0;
+
 	trace_ext4_sync_fs(sb, wait);
 	flush_workqueue(sbi->rsv_conversion_wq);
 	/*
@@ -4803,7 +4821,7 @@ out:
  */
 static int ext4_unfreeze(struct super_block *sb)
 {
-	if (sb->s_flags & MS_RDONLY)
+	if ((sb->s_flags & MS_RDONLY) || ext4_forced_shutdown(EXT4_SB(sb)))
 		return 0;
 
 	if (EXT4_SB(sb)->s_journal) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 5a94fa52b74f..67636acf7624 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -411,6 +411,9 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name,
 {
 	int error;
 
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
 	if (strlen(name) > 255)
 		return -ERANGE;
 
@@ -1188,16 +1191,14 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 	struct ext4_xattr_block_find bs = {
 		.s = { .not_found = -ENODATA, },
 	};
-	unsigned long no_expand;
+	int no_expand;
 	int error;
 
 	if (!name)
 		return -EINVAL;
 	if (strlen(name) > 255)
 		return -ERANGE;
-	down_write(&EXT4_I(inode)->xattr_sem);
-	no_expand = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
-	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	ext4_write_lock_xattr(inode, &no_expand);
 
 	error = ext4_reserve_inode_write(handle, inode, &is.iloc);
 	if (error)
@@ -1264,7 +1265,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 		ext4_xattr_update_super_block(handle, inode->i_sb);
 		inode->i_ctime = current_time(inode);
 		if (!value)
-			ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+			no_expand = 0;
 		error = ext4_mark_iloc_dirty(handle, inode, &is.iloc);
 		/*
 		 * The bh is consumed by ext4_mark_iloc_dirty, even with
@@ -1278,9 +1279,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
 cleanup:
 	brelse(is.iloc.bh);
 	brelse(bs.bh);
-	if (no_expand == 0)
-		ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
-	up_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_unlock_xattr(inode, &no_expand);
 	return error;
 }
 
@@ -1497,12 +1496,11 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 	int error = 0, tried_min_extra_isize = 0;
 	int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize);
 	int isize_diff;	/* How much do we need to grow i_extra_isize */
+	int no_expand;
+
+	if (ext4_write_trylock_xattr(inode, &no_expand) == 0)
+		return 0;
 
-	down_write(&EXT4_I(inode)->xattr_sem);
-	/*
-	 * Set EXT4_STATE_NO_EXPAND to avoid recursion when marking inode dirty
-	 */
-	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
 retry:
 	isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize;
 	if (EXT4_I(inode)->i_extra_isize >= new_extra_isize)
@@ -1584,17 +1582,16 @@ shift:
 	EXT4_I(inode)->i_extra_isize = new_extra_isize;
 	brelse(bh);
 out:
-	ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
-	up_write(&EXT4_I(inode)->xattr_sem);
+	ext4_write_unlock_xattr(inode, &no_expand);
 	return 0;
 
 cleanup:
 	brelse(bh);
 	/*
-	 * We deliberately leave EXT4_STATE_NO_EXPAND set here since inode
-	 * size expansion failed.
+	 * Inode size expansion failed; don't try again
 	 */
-	up_write(&EXT4_I(inode)->xattr_sem);
+	no_expand = 1;
+	ext4_write_unlock_xattr(inode, &no_expand);
 	return error;
 }
 
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index a92e783fa057..099c8b670ef5 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -102,6 +102,38 @@ extern const struct xattr_handler ext4_xattr_security_handler;
 
 #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c"
 
+/*
+ * The EXT4_STATE_NO_EXPAND is overloaded and used for two purposes.
+ * The first is to signal that there the inline xattrs and data are
+ * taking up so much space that we might as well not keep trying to
+ * expand it.  The second is that xattr_sem is taken for writing, so
+ * we shouldn't try to recurse into the inode expansion.  For this
+ * second case, we need to make sure that we take save and restore the
+ * NO_EXPAND state flag appropriately.
+ */
+static inline void ext4_write_lock_xattr(struct inode *inode, int *save)
+{
+	down_write(&EXT4_I(inode)->xattr_sem);
+	*save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
+}
+
+static inline int ext4_write_trylock_xattr(struct inode *inode, int *save)
+{
+	if (down_write_trylock(&EXT4_I(inode)->xattr_sem) == 0)
+		return 0;
+	*save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	return 1;
+}
+
+static inline void ext4_write_unlock_xattr(struct inode *inode, int *save)
+{
+	if (*save == 0)
+		ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND);
+	up_write(&EXT4_I(inode)->xattr_sem);
+}
+
 extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
 
 extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 827c5daef4fc..18607fc5240d 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -268,7 +268,10 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 
 	err = fscrypt_setup_filename(dir, child, 1, &fname);
 	if (err) {
-		*res_page = ERR_PTR(err);
+		if (err == -ENOENT)
+			*res_page = NULL;
+		else
+			*res_page = ERR_PTR(err);
 		return NULL;
 	}
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 2da8c3aa0ce5..069fc7277d8d 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -22,7 +22,11 @@
 #include <linux/vmalloc.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
-#include <linux/fscrypto.h>
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+#include <linux/fscrypt_supp.h>
+#else
+#include <linux/fscrypt_notsupp.h>
+#endif
 #include <crypto/hash.h>
 
 #ifdef CONFIG_F2FS_CHECK_FS
@@ -760,10 +764,6 @@ enum {
 	MAX_TIME,
 };
 
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-#define F2FS_KEY_DESC_PREFIX "f2fs:"
-#define F2FS_KEY_DESC_PREFIX_SIZE 5
-#endif
 struct f2fs_sb_info {
 	struct super_block *sb;			/* pointer to VFS super block */
 	struct proc_dir_entry *s_proc;		/* proc entry */
@@ -771,11 +771,6 @@ struct f2fs_sb_info {
 	int valid_super_block;			/* valid super block no */
 	unsigned long s_flag;				/* flags for sbi */
 
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-	u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE];
-	u8 key_prefix_size;
-#endif
-
 #ifdef CONFIG_BLK_DEV_ZONED
 	unsigned int blocks_per_blkz;		/* F2FS blocks per zone */
 	unsigned int log_blocks_per_blkz;	/* log2 F2FS blocks per zone */
@@ -2510,28 +2505,4 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
 #endif
 }
 
-#ifndef CONFIG_F2FS_FS_ENCRYPTION
-#define fscrypt_set_d_op(i)
-#define fscrypt_get_ctx			fscrypt_notsupp_get_ctx
-#define fscrypt_release_ctx		fscrypt_notsupp_release_ctx
-#define fscrypt_encrypt_page		fscrypt_notsupp_encrypt_page
-#define fscrypt_decrypt_page		fscrypt_notsupp_decrypt_page
-#define fscrypt_decrypt_bio_pages	fscrypt_notsupp_decrypt_bio_pages
-#define fscrypt_pullback_bio_page	fscrypt_notsupp_pullback_bio_page
-#define fscrypt_restore_control_page	fscrypt_notsupp_restore_control_page
-#define fscrypt_zeroout_range		fscrypt_notsupp_zeroout_range
-#define fscrypt_ioctl_set_policy	fscrypt_notsupp_ioctl_set_policy
-#define fscrypt_ioctl_get_policy	fscrypt_notsupp_ioctl_get_policy
-#define fscrypt_has_permitted_context	fscrypt_notsupp_has_permitted_context
-#define fscrypt_inherit_context		fscrypt_notsupp_inherit_context
-#define fscrypt_get_encryption_info	fscrypt_notsupp_get_encryption_info
-#define fscrypt_put_encryption_info	fscrypt_notsupp_put_encryption_info
-#define fscrypt_setup_filename		fscrypt_notsupp_setup_filename
-#define fscrypt_free_filename		fscrypt_notsupp_free_filename
-#define fscrypt_fname_encrypted_size	fscrypt_notsupp_fname_encrypted_size
-#define fscrypt_fname_alloc_buffer	fscrypt_notsupp_fname_alloc_buffer
-#define fscrypt_fname_free_buffer	fscrypt_notsupp_fname_free_buffer
-#define fscrypt_fname_disk_to_usr	fscrypt_notsupp_fname_disk_to_usr
-#define fscrypt_fname_usr_to_disk	fscrypt_notsupp_fname_usr_to_disk
-#endif
 #endif
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 56c19b0610a8..11cabcadb1a3 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -403,7 +403,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 			return err;
 
 		if (!fscrypt_has_encryption_key(dir))
-			return -EPERM;
+			return -ENOKEY;
 
 		disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
 				sizeof(struct fscrypt_symlink_data));
@@ -447,7 +447,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 			goto err_out;
 
 		if (!fscrypt_has_encryption_key(inode)) {
-			err = -EPERM;
+			err = -ENOKEY;
 			goto err_out;
 		}
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 0738f48293cc..0d8802453758 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -713,8 +713,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
 	}
 	sector = SECTOR_FROM_BLOCK(blkstart);
 
-	if (sector & (bdev_zone_size(bdev) - 1) ||
-				nr_sects != bdev_zone_size(bdev)) {
+	if (sector & (bdev_zone_sectors(bdev) - 1) ||
+	    nr_sects != bdev_zone_sectors(bdev)) {
 		f2fs_msg(sbi->sb, KERN_INFO,
 			"(%d) %s: Unaligned discard attempted (block %x + %x)",
 			devi, sbi->s_ndevs ? FDEV(devi).path: "",
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 702638e21c76..a831303bb777 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1156,12 +1156,6 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
 				ctx, len, NULL);
 }
 
-static int f2fs_key_prefix(struct inode *inode, u8 **key)
-{
-	*key = F2FS_I_SB(inode)->key_prefix;
-	return F2FS_I_SB(inode)->key_prefix_size;
-}
-
 static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
 							void *fs_data)
 {
@@ -1176,16 +1170,16 @@ static unsigned f2fs_max_namelen(struct inode *inode)
 			inode->i_sb->s_blocksize : F2FS_NAME_LEN;
 }
 
-static struct fscrypt_operations f2fs_cryptops = {
+static const struct fscrypt_operations f2fs_cryptops = {
+	.key_prefix	= "f2fs:",
 	.get_context	= f2fs_get_context,
-	.key_prefix	= f2fs_key_prefix,
 	.set_context	= f2fs_set_context,
 	.is_encrypted	= f2fs_encrypted_inode,
 	.empty_dir	= f2fs_empty_dir,
 	.max_namelen	= f2fs_max_namelen,
 };
 #else
-static struct fscrypt_operations f2fs_cryptops = {
+static const struct fscrypt_operations f2fs_cryptops = {
 	.is_encrypted	= f2fs_encrypted_inode,
 };
 #endif
@@ -1518,12 +1512,6 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 	mutex_init(&sbi->wio_mutex[NODE]);
 	mutex_init(&sbi->wio_mutex[DATA]);
 	spin_lock_init(&sbi->cp_lock);
-
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-	memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX,
-				F2FS_KEY_DESC_PREFIX_SIZE);
-	sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE;
-#endif
 }
 
 static int init_percpu_info(struct f2fs_sb_info *sbi)
@@ -1553,16 +1541,16 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
 		return 0;
 
 	if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
-				SECTOR_TO_BLOCK(bdev_zone_size(bdev)))
+				SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)))
 		return -EINVAL;
-	sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev));
+	sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev));
 	if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz !=
 				__ilog2_u32(sbi->blocks_per_blkz))
 		return -EINVAL;
 	sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz);
 	FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >>
 					sbi->log_blocks_per_blkz;
-	if (nr_sectors & (bdev_zone_size(bdev) - 1))
+	if (nr_sectors & (bdev_zone_sectors(bdev) - 1))
 		FDEV(devi).nr_blkz++;
 
 	FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL);
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 4304072161aa..40d61077bead 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -542,6 +542,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
 		hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) {
 			if (invalidate)
 				set_bit(FSCACHE_OBJECT_RETIRED, &object->flags);
+			clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
 			fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL);
 		}
 	} else {
@@ -560,6 +561,10 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
 		wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t,
 				 TASK_UNINTERRUPTIBLE);
 
+	/* Make sure any pending writes are cancelled. */
+	if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
+		fscache_invalidate_writes(cookie);
+
 	/* Reset the cookie state if it wasn't relinquished */
 	if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) {
 		atomic_inc(&cookie->n_active);
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index 9b28649df3a1..a8aa00be4444 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -48,6 +48,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
 	cookie->flags		= 1 << FSCACHE_COOKIE_ENABLED;
 
 	spin_lock_init(&cookie->lock);
+	spin_lock_init(&cookie->stores_lock);
 	INIT_HLIST_HEAD(&cookie->backing_objects);
 
 	/* check the netfs type is not already present */
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 9e792e30f4db..7a182c87f378 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -30,6 +30,7 @@ static const struct fscache_state *fscache_look_up_object(struct fscache_object
 static const struct fscache_state *fscache_object_available(struct fscache_object *, int);
 static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int);
 static const struct fscache_state *fscache_update_object(struct fscache_object *, int);
+static const struct fscache_state *fscache_object_dead(struct fscache_object *, int);
 
 #define __STATE_NAME(n) fscache_osm_##n
 #define STATE(n) (&__STATE_NAME(n))
@@ -91,7 +92,7 @@ static WORK_STATE(LOOKUP_FAILURE,	"LCFL", fscache_lookup_failure);
 static WORK_STATE(KILL_OBJECT,		"KILL", fscache_kill_object);
 static WORK_STATE(KILL_DEPENDENTS,	"KDEP", fscache_kill_dependents);
 static WORK_STATE(DROP_OBJECT,		"DROP", fscache_drop_object);
-static WORK_STATE(OBJECT_DEAD,		"DEAD", (void*)2UL);
+static WORK_STATE(OBJECT_DEAD,		"DEAD", fscache_object_dead);
 
 static WAIT_STATE(WAIT_FOR_INIT,	"?INI",
 		  TRANSIT_TO(INIT_OBJECT,	1 << FSCACHE_OBJECT_EV_NEW_CHILD));
@@ -229,6 +230,10 @@ execute_work_state:
 	event = -1;
 	if (new_state == NO_TRANSIT) {
 		_debug("{OBJ%x} %s notrans", object->debug_id, state->name);
+		if (unlikely(state == STATE(OBJECT_DEAD))) {
+			_leave(" [dead]");
+			return;
+		}
 		fscache_enqueue_object(object);
 		event_mask = object->oob_event_mask;
 		goto unmask_events;
@@ -239,7 +244,7 @@ execute_work_state:
 	object->state = state = new_state;
 
 	if (state->work) {
-		if (unlikely(state->work == ((void *)2UL))) {
+		if (unlikely(state == STATE(OBJECT_DEAD))) {
 			_leave(" [dead]");
 			return;
 		}
@@ -645,6 +650,12 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob
 	fscache_mark_object_dead(object);
 	object->oob_event_mask = 0;
 
+	if (test_bit(FSCACHE_OBJECT_RETIRED, &object->flags)) {
+		/* Reject any new read/write ops and abort any that are pending. */
+		clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags);
+		fscache_cancel_all_ops(object);
+	}
+
 	if (list_empty(&object->dependents) &&
 	    object->n_ops == 0 &&
 	    object->n_children == 0)
@@ -1077,3 +1088,20 @@ void fscache_object_mark_killed(struct fscache_object *object,
 	}
 }
 EXPORT_SYMBOL(fscache_object_mark_killed);
+
+/*
+ * The object is dead.  We can get here if an object gets queued by an event
+ * that would lead to its death (such as EV_KILL) when the dispatcher is
+ * already running (and so can be requeued) but hasn't yet cleared the event
+ * mask.
+ */
+static const struct fscache_state *fscache_object_dead(struct fscache_object *object,
+						       int event)
+{
+	if (!test_and_set_bit(FSCACHE_OBJECT_RUN_AFTER_DEAD,
+			      &object->flags))
+		return NO_TRANSIT;
+
+	WARN(true, "FS-Cache object redispatched after death");
+	return NO_TRANSIT;
+}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 70ea57c7b6bb..f11792672977 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -399,6 +399,10 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
 {
 	spin_lock(&fiq->waitq.lock);
+	if (test_bit(FR_FINISHED, &req->flags)) {
+		spin_unlock(&fiq->waitq.lock);
+		return;
+	}
 	if (list_empty(&req->intr_entry)) {
 		list_add_tail(&req->intr_entry, &fiq->interrupts);
 		wake_up_locked(&fiq->waitq);
@@ -1372,6 +1376,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
 		 * code can Oops if the buffer persists after module unload.
 		 */
 		bufs[page_nr].ops = &nosteal_pipe_buf_ops;
+		bufs[page_nr].flags = 0;
 		ret = add_to_pipe(pipe, &bufs[page_nr++]);
 		if (unlikely(ret < 0))
 			break;
@@ -2025,7 +2030,6 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
 		struct fuse_req *req;
 		req = list_entry(head->next, struct fuse_req, list);
 		req->out.h.error = -ECONNABORTED;
-		clear_bit(FR_PENDING, &req->flags);
 		clear_bit(FR_SENT, &req->flags);
 		list_del_init(&req->list);
 		request_end(fc, req);
@@ -2103,6 +2107,8 @@ void fuse_abort_conn(struct fuse_conn *fc)
 		spin_lock(&fiq->waitq.lock);
 		fiq->connected = 0;
 		list_splice_init(&fiq->pending, &to_end2);
+		list_for_each_entry(req, &to_end2, list)
+			clear_bit(FR_PENDING, &req->flags);
 		while (forget_pending(fiq))
 			kfree(dequeue_forget(fiq, 1, NULL));
 		wake_up_all_locked(&fiq->waitq);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 1f7c732f32b0..811fd8929a18 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -68,7 +68,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec)
 	if (sec || nsec) {
 		struct timespec64 ts = {
 			sec,
-			max_t(u32, nsec, NSEC_PER_SEC - 1)
+			min_t(u32, nsec, NSEC_PER_SEC - 1)
 		};
 
 		return get_jiffies_64() + timespec64_to_jiffies(&ts);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 91307940c8ac..052f8d3c41cb 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -256,7 +256,7 @@ struct fuse_io_priv {
 
 #define FUSE_IO_PRIV_SYNC(f) \
 {					\
-	.refcnt = { ATOMIC_INIT(1) },	\
+	.refcnt = KREF_INIT(1),		\
 	.async = 0,			\
 	.file = f,			\
 }
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 6b039d7ce160..ed7a2e252ad8 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -143,8 +143,8 @@ static int gfs2_writepage(struct page *page, struct writeback_control *wbc)
 /* This is the same as calling block_write_full_page, but it also
  * writes pages outside of i_size
  */
-int gfs2_write_full_page(struct page *page, get_block_t *get_block,
-			 struct writeback_control *wbc)
+static int gfs2_write_full_page(struct page *page, get_block_t *get_block,
+				struct writeback_control *wbc)
 {
 	struct inode * const inode = page->mapping->host;
 	loff_t i_size = i_size_read(inode);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index fc5da4cbe88c..01b97c012c6e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -720,6 +720,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrp_list rlist;
+	struct gfs2_trans *tr;
 	u64 bn, bstart;
 	u32 blen, btotal;
 	__be64 *p;
@@ -728,6 +729,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	unsigned int revokes = 0;
 	int x;
 	int error;
+	int jblocks_rqsted;
 
 	error = gfs2_rindex_update(sdp);
 	if (error)
@@ -791,12 +793,17 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
 		gfs2_rs_deltree(&ip->i_res);
 
-	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
-				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
-				 revokes);
+restart:
+	jblocks_rqsted = rg_blocks + RES_DINODE +
+		RES_INDIRECT + RES_STATFS + RES_QUOTA +
+		gfs2_struct2blk(sdp, revokes, sizeof(u64));
+	if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2))
+		jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2);
+	error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
 	if (error)
 		goto out_rg_gunlock;
 
+	tr = current->journal_info;
 	down_write(&ip->i_rw_mutex);
 
 	gfs2_trans_add_meta(ip->i_gl, dibh);
@@ -810,6 +817,16 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 		if (!*p)
 			continue;
 
+		/* check for max reasonable journal transaction blocks */
+		if (tr->tr_num_buf_new + RES_STATFS +
+		    RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
+			if (rg_blocks >= tr->tr_num_buf_new)
+				rg_blocks -= tr->tr_num_buf_new;
+			else
+				rg_blocks = 0;
+			break;
+		}
+
 		bn = be64_to_cpu(*p);
 
 		if (bstart + blen == bn)
@@ -827,6 +844,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 		*p = 0;
 		gfs2_add_inode_blocks(&ip->i_inode, -1);
 	}
+	if (p == bottom)
+		rg_blocks = 0;
+
 	if (bstart) {
 		__gfs2_free_blocks(ip, bstart, blen, metadata);
 		btotal += blen;
@@ -844,6 +864,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 
 	gfs2_trans_end(sdp);
 
+	if (rg_blocks)
+		goto restart;
+
 out_rg_gunlock:
 	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
 out_rlist:
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 94f50cac91c6..ec0848fcca02 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -658,9 +658,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 	struct kmem_cache *cachep;
 	int ret, tries = 0;
 
+	rcu_read_lock();
 	gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
 	if (gl && !lockref_get_not_dead(&gl->gl_lockref))
 		gl = NULL;
+	rcu_read_unlock();
 
 	*glp = gl;
 	if (gl)
@@ -728,15 +730,18 @@ again:
 
 	if (ret == -EEXIST) {
 		ret = 0;
+		rcu_read_lock();
 		tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms);
 		if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) {
 			if (++tries < 100) {
+				rcu_read_unlock();
 				cond_resched();
 				goto again;
 			}
 			tmp = NULL;
 			ret = -ENOMEM;
 		}
+		rcu_read_unlock();
 	} else {
 		WARN_ON_ONCE(ret);
 	}
@@ -1420,26 +1425,32 @@ static struct shrinker glock_shrinker = {
  * @sdp: the filesystem
  * @bucket: the bucket
  *
+ * Note that the function can be called multiple times on the same
+ * object.  So the user must ensure that the function can cope with
+ * that.
  */
 
 static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
 {
 	struct gfs2_glock *gl;
-	struct rhash_head *pos;
-	const struct bucket_table *tbl;
-	int i;
+	struct rhashtable_iter iter;
 
-	rcu_read_lock();
-	tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table);
-	for (i = 0; i < tbl->size; i++) {
-		rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) {
+	rhashtable_walk_enter(&gl_hash_table, &iter);
+
+	do {
+		gl = ERR_PTR(rhashtable_walk_start(&iter));
+		if (gl)
+			continue;
+
+		while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl))
 			if ((gl->gl_name.ln_sbd == sdp) &&
 			    lockref_get_not_dead(&gl->gl_lockref))
 				examiner(gl);
-		}
-	}
-	rcu_read_unlock();
-	cond_resched();
+
+		rhashtable_walk_stop(&iter);
+	} while (cond_resched(), gl == ERR_PTR(-EAGAIN));
+
+	rhashtable_walk_exit(&iter);
 }
 
 /**
@@ -1802,16 +1813,18 @@ void gfs2_glock_exit(void)
 
 static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
 {
-	do {
-		gi->gl = rhashtable_walk_next(&gi->hti);
+	while ((gi->gl = rhashtable_walk_next(&gi->hti))) {
 		if (IS_ERR(gi->gl)) {
 			if (PTR_ERR(gi->gl) == -EAGAIN)
 				continue;
 			gi->gl = NULL;
+			return;
 		}
-	/* Skip entries for other sb and dead entries */
-	} while ((gi->gl) && ((gi->sdp != gi->gl->gl_name.ln_sbd) ||
-			      __lockref_is_dead(&gi->gl->gl_lockref)));
+		/* Skip entries for other sb and dead entries */
+		if (gi->sdp == gi->gl->gl_name.ln_sbd &&
+		    !__lockref_is_dead(&gi->gl->gl_lockref))
+			return;
+	}
 }
 
 static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a6a3389a07fc..c45084ac642d 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -470,15 +470,19 @@ struct gfs2_quota_data {
 	struct rcu_head qd_rcu;
 };
 
+enum {
+	TR_TOUCHED = 1,
+	TR_ATTACHED = 2,
+	TR_ALLOCED = 3,
+};
+
 struct gfs2_trans {
 	unsigned long tr_ip;
 
 	unsigned int tr_blocks;
 	unsigned int tr_revokes;
 	unsigned int tr_reserved;
-	unsigned int tr_touched:1;
-	unsigned int tr_attached:1;
-	unsigned int tr_alloced:1;
+	unsigned long tr_flags;
 
 	unsigned int tr_num_buf_new;
 	unsigned int tr_num_databuf_new;
@@ -794,6 +798,7 @@ struct gfs2_sbd {
 	atomic_t sd_log_thresh1;
 	atomic_t sd_log_thresh2;
 	atomic_t sd_log_blks_free;
+	atomic_t sd_log_blks_needed;
 	wait_queue_head_t sd_log_waitq;
 	wait_queue_head_t sd_logd_waitq;
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 27c00a16def0..f865b96374df 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -349,6 +349,7 @@ int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
 	if (gfs2_assert_warn(sdp, blks) ||
 	    gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
 		return -EINVAL;
+	atomic_add(blks, &sdp->sd_log_blks_needed);
 retry:
 	free_blocks = atomic_read(&sdp->sd_log_blks_free);
 	if (unlikely(free_blocks <= wanted)) {
@@ -370,6 +371,7 @@ retry:
 			wake_up(&sdp->sd_reserving_log_wait);
 		goto retry;
 	}
+	atomic_sub(blks, &sdp->sd_log_blks_needed);
 	trace_gfs2_log_blocks(sdp, -blks);
 
 	/*
@@ -797,7 +799,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
 
 static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new)
 {
-	WARN_ON_ONCE(old->tr_attached != 1);
+	WARN_ON_ONCE(!test_bit(TR_ATTACHED, &old->tr_flags));
 
 	old->tr_num_buf_new	+= new->tr_num_buf_new;
 	old->tr_num_databuf_new	+= new->tr_num_databuf_new;
@@ -821,9 +823,9 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	if (sdp->sd_log_tr) {
 		gfs2_merge_trans(sdp->sd_log_tr, tr);
 	} else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
-		gfs2_assert_withdraw(sdp, tr->tr_alloced);
+		gfs2_assert_withdraw(sdp, test_bit(TR_ALLOCED, &tr->tr_flags));
 		sdp->sd_log_tr = tr;
-		tr->tr_attached = 1;
+		set_bit(TR_ATTACHED, &tr->tr_flags);
 	}
 
 	sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
@@ -891,13 +893,16 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
 
 static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
 {
-	return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
+	return (atomic_read(&sdp->sd_log_pinned) +
+		atomic_read(&sdp->sd_log_blks_needed) >=
+		atomic_read(&sdp->sd_log_thresh1));
 }
 
 static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
 {
 	unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
-	return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
+	return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >=
+		atomic_read(&sdp->sd_log_thresh2);
 }
 
 /**
@@ -913,12 +918,15 @@ int gfs2_logd(void *data)
 	struct gfs2_sbd *sdp = data;
 	unsigned long t = 1;
 	DEFINE_WAIT(wait);
+	bool did_flush;
 
 	while (!kthread_should_stop()) {
 
+		did_flush = false;
 		if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
 			gfs2_ail1_empty(sdp);
 			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+			did_flush = true;
 		}
 
 		if (gfs2_ail_flush_reqd(sdp)) {
@@ -926,9 +934,10 @@ int gfs2_logd(void *data)
 			gfs2_ail1_wait(sdp);
 			gfs2_ail1_empty(sdp);
 			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
+			did_flush = true;
 		}
 
-		if (!gfs2_ail_flush_reqd(sdp))
+		if (!gfs2_ail_flush_reqd(sdp) || did_flush)
 			wake_up(&sdp->sd_log_waitq);
 
 		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 49db8ef13fdf..663ffc135ef3 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -292,7 +292,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 	wait_on_buffer(bh);
 	if (unlikely(!buffer_uptodate(bh))) {
 		struct gfs2_trans *tr = current->journal_info;
-		if (tr && tr->tr_touched)
+		if (tr && test_bit(TR_TOUCHED, &tr->tr_flags))
 			gfs2_io_error_bh(sdp, bh);
 		brelse(bh);
 		*bhp = NULL;
@@ -319,7 +319,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
 
 	if (!buffer_uptodate(bh)) {
 		struct gfs2_trans *tr = current->journal_info;
-		if (tr && tr->tr_touched)
+		if (tr && test_bit(TR_TOUCHED, &tr->tr_flags))
 			gfs2_io_error_bh(sdp, bh);
 		return -EIO;
 	}
@@ -345,7 +345,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, int meta)
 			tr->tr_num_buf_rm++;
 		else
 			tr->tr_num_databuf_rm++;
-		tr->tr_touched = 1;
+		set_bit(TR_TOUCHED, &tr->tr_flags);
 		was_pinned = 1;
 		brelse(bh);
 	}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index a34308df927f..b108e7ba81af 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -683,6 +683,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 		goto fail_jindex;
 	}
 
+	atomic_set(&sdp->sd_log_blks_needed, 0);
 	if (sdp->sd_args.ar_spectator) {
 		sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0);
 		atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks);
@@ -1226,7 +1227,7 @@ static int set_gfs2_super(struct super_block *s, void *data)
 	 * We set the bdi here to the queue backing, file systems can
 	 * overwrite this in ->fill_super()
 	 */
-	s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
+	s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
 	return 0;
 }
 
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 0c1bde395062..affef3c066e0 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -48,7 +48,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 	tr->tr_blocks = blocks;
 	tr->tr_revokes = revokes;
 	tr->tr_reserved = 1;
-	tr->tr_alloced = 1;
+	set_bit(TR_ALLOCED, &tr->tr_flags);
 	if (blocks)
 		tr->tr_reserved += 6 + blocks;
 	if (revokes)
@@ -78,7 +78,8 @@ static void gfs2_print_trans(const struct gfs2_trans *tr)
 {
 	pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip);
 	pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n",
-		tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, tr->tr_touched);
+		tr->tr_blocks, tr->tr_revokes, tr->tr_reserved,
+		test_bit(TR_TOUCHED, &tr->tr_flags));
 	pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
 		tr->tr_num_buf_new, tr->tr_num_buf_rm,
 		tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
@@ -89,12 +90,12 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 {
 	struct gfs2_trans *tr = current->journal_info;
 	s64 nbuf;
-	int alloced = tr->tr_alloced;
+	int alloced = test_bit(TR_ALLOCED, &tr->tr_flags);
 
 	BUG_ON(!tr);
 	current->journal_info = NULL;
 
-	if (!tr->tr_touched) {
+	if (!test_bit(TR_TOUCHED, &tr->tr_flags)) {
 		gfs2_log_release(sdp, tr->tr_reserved);
 		if (alloced) {
 			kfree(tr);
@@ -112,8 +113,8 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 		gfs2_print_trans(tr);
 
 	gfs2_log_commit(sdp, tr);
-	if (alloced && !tr->tr_attached)
-			kfree(tr);
+	if (alloced && !test_bit(TR_ATTACHED, &tr->tr_flags))
+		kfree(tr);
 	up_read(&sdp->sd_log_flush_lock);
 
 	if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
@@ -169,6 +170,10 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
 	}
 
 	lock_buffer(bh);
+	if (buffer_pinned(bh)) {
+		set_bit(TR_TOUCHED, &tr->tr_flags);
+		goto out;
+	}
 	gfs2_log_lock(sdp);
 	bd = bh->b_private;
 	if (bd == NULL) {
@@ -182,7 +187,7 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
 		gfs2_log_lock(sdp);
 	}
 	gfs2_assert(sdp, bd->bd_gl == gl);
-	tr->tr_touched = 1;
+	set_bit(TR_TOUCHED, &tr->tr_flags);
 	if (list_empty(&bd->bd_list)) {
 		set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
 		set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
@@ -191,45 +196,24 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
 		list_add_tail(&bd->bd_list, &tr->tr_databuf);
 	}
 	gfs2_log_unlock(sdp);
+out:
 	unlock_buffer(bh);
 }
 
-static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
-{
-	struct gfs2_meta_header *mh;
-	struct gfs2_trans *tr;
-	enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
-
-	tr = current->journal_info;
-	tr->tr_touched = 1;
-	if (!list_empty(&bd->bd_list))
-		return;
-	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
-	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
-	mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
-	if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
-		pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n",
-		       (unsigned long long)bd->bd_bh->b_blocknr);
-		BUG();
-	}
-	if (unlikely(state == SFS_FROZEN)) {
-		printk(KERN_INFO "GFS2:adding buf while frozen\n");
-		gfs2_assert_withdraw(sdp, 0);
-	}
-	gfs2_pin(sdp, bd->bd_bh);
-	mh->__pad0 = cpu_to_be64(0);
-	mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
-	list_add(&bd->bd_list, &tr->tr_buf);
-	tr->tr_num_buf_new++;
-}
-
 void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
 {
 
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
 	struct gfs2_bufdata *bd;
+	struct gfs2_meta_header *mh;
+	struct gfs2_trans *tr = current->journal_info;
+	enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state);
 
 	lock_buffer(bh);
+	if (buffer_pinned(bh)) {
+		set_bit(TR_TOUCHED, &tr->tr_flags);
+		goto out;
+	}
 	gfs2_log_lock(sdp);
 	bd = bh->b_private;
 	if (bd == NULL) {
@@ -245,8 +229,29 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
 		gfs2_log_lock(sdp);
 	}
 	gfs2_assert(sdp, bd->bd_gl == gl);
-	meta_lo_add(sdp, bd);
+	set_bit(TR_TOUCHED, &tr->tr_flags);
+	if (!list_empty(&bd->bd_list))
+		goto out_unlock;
+	set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
+	set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
+	mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
+	if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
+		pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n",
+		       (unsigned long long)bd->bd_bh->b_blocknr);
+		BUG();
+	}
+	if (unlikely(state == SFS_FROZEN)) {
+		printk(KERN_INFO "GFS2:adding buf while frozen\n");
+		gfs2_assert_withdraw(sdp, 0);
+	}
+	gfs2_pin(sdp, bd->bd_bh);
+	mh->__pad0 = cpu_to_be64(0);
+	mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
+	list_add(&bd->bd_list, &tr->tr_buf);
+	tr->tr_num_buf_new++;
+out_unlock:
 	gfs2_log_unlock(sdp);
+out:
 	unlock_buffer(bh);
 }
 
@@ -256,7 +261,7 @@ void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 
 	BUG_ON(!list_empty(&bd->bd_list));
 	gfs2_add_revoke(sdp, bd);
-	tr->tr_touched = 1;
+	set_bit(TR_TOUCHED, &tr->tr_flags);
 	tr->tr_num_revoke++;
 }
 
diff --git a/fs/internal.h b/fs/internal.h
index b63cf3af2dc2..11c6d89dce9c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -182,7 +182,7 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
 		void *data, struct iomap *iomap);
 
 loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
-		unsigned flags, struct iomap_ops *ops, void *data,
+		unsigned flags, const struct iomap_ops *ops, void *data,
 		iomap_actor_t actor);
 
 /* direct-io.c: */
diff --git a/fs/iomap.c b/fs/iomap.c
index 354a123f170e..d89f70bbb952 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -41,7 +41,7 @@
  */
 loff_t
 iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
-		struct iomap_ops *ops, void *data, iomap_actor_t actor)
+		const struct iomap_ops *ops, void *data, iomap_actor_t actor)
 {
 	struct iomap iomap = { 0 };
 	loff_t written = 0, ret;
@@ -114,6 +114,9 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
 
 	BUG_ON(pos + len > iomap->offset + iomap->length);
 
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
 	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
@@ -232,7 +235,7 @@ again:
 
 ssize_t
 iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
@@ -315,7 +318,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 
 int
 iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	loff_t ret;
 
@@ -395,7 +398,7 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
 
 int
 iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	loff_t ret;
 
@@ -415,7 +418,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
 
 int
 iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	unsigned blocksize = (1 << inode->i_blkbits);
 	unsigned off = pos & (blocksize - 1);
@@ -443,7 +446,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
 }
 
 int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
-		struct iomap_ops *ops)
+		const struct iomap_ops *ops)
 {
 	struct page *page = vmf->page;
 	struct inode *inode = file_inode(vma->vm_file);
@@ -542,7 +545,7 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 }
 
 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
-		loff_t start, loff_t len, struct iomap_ops *ops)
+		loff_t start, loff_t len, const struct iomap_ops *ops)
 {
 	struct fiemap_ctx ctx;
 	loff_t ret;
@@ -836,8 +839,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
 }
 
 ssize_t
-iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
-		iomap_dio_end_io_t end_io)
+iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+		const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = file_inode(iocb->ki_filp);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8c514367ba5a..b6b194ec1b4f 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -393,7 +393,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
 	if (journal->j_flags & JBD2_FLUSHED) {
 		jbd_debug(3, "super block updated\n");
-		mutex_lock(&journal->j_checkpoint_mutex);
+		mutex_lock_io(&journal->j_checkpoint_mutex);
 		/*
 		 * We hold j_checkpoint_mutex so tail cannot change under us.
 		 * We don't need any special data guarantees for writing sb
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index a097048ed1a3..a1a359bfcc9c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -276,11 +276,11 @@ loop:
 	goto loop;
 
 end_loop:
-	write_unlock(&journal->j_state_lock);
 	del_timer_sync(&journal->j_commit_timer);
 	journal->j_task = NULL;
 	wake_up(&journal->j_wait_done_commit);
 	jbd_debug(1, "Journal thread exiting.\n");
+	write_unlock(&journal->j_state_lock);
 	return 0;
 }
 
@@ -944,7 +944,7 @@ out:
  */
 void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
 {
-	mutex_lock(&journal->j_checkpoint_mutex);
+	mutex_lock_io(&journal->j_checkpoint_mutex);
 	if (tid_gt(tid, journal->j_tail_sequence))
 		__jbd2_update_log_tail(journal, tid, block);
 	mutex_unlock(&journal->j_checkpoint_mutex);
@@ -1304,7 +1304,7 @@ static int journal_reset(journal_t *journal)
 		journal->j_flags |= JBD2_FLUSHED;
 	} else {
 		/* Lock here to make assertions happy... */
-		mutex_lock(&journal->j_checkpoint_mutex);
+		mutex_lock_io(&journal->j_checkpoint_mutex);
 		/*
 		 * Update log tail information. We use REQ_FUA since new
 		 * transaction will start reusing journal space and so we
@@ -1691,7 +1691,7 @@ int jbd2_journal_destroy(journal_t *journal)
 	spin_lock(&journal->j_list_lock);
 	while (journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
-		mutex_lock(&journal->j_checkpoint_mutex);
+		mutex_lock_io(&journal->j_checkpoint_mutex);
 		err = jbd2_log_do_checkpoint(journal);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 		/*
@@ -1713,7 +1713,7 @@ int jbd2_journal_destroy(journal_t *journal)
 
 	if (journal->j_sb_buffer) {
 		if (!is_journal_aborted(journal)) {
-			mutex_lock(&journal->j_checkpoint_mutex);
+			mutex_lock_io(&journal->j_checkpoint_mutex);
 
 			write_lock(&journal->j_state_lock);
 			journal->j_tail_sequence =
@@ -1955,7 +1955,7 @@ int jbd2_journal_flush(journal_t *journal)
 	spin_lock(&journal->j_list_lock);
 	while (!err && journal->j_checkpoint_transactions != NULL) {
 		spin_unlock(&journal->j_list_lock);
-		mutex_lock(&journal->j_checkpoint_mutex);
+		mutex_lock_io(&journal->j_checkpoint_mutex);
 		err = jbd2_log_do_checkpoint(journal);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 		spin_lock(&journal->j_list_lock);
@@ -1965,7 +1965,7 @@ int jbd2_journal_flush(journal_t *journal)
 	if (is_journal_aborted(journal))
 		return -EIO;
 
-	mutex_lock(&journal->j_checkpoint_mutex);
+	mutex_lock_io(&journal->j_checkpoint_mutex);
 	if (!err) {
 		err = jbd2_cleanup_journal_tail(journal);
 		if (err < 0) {
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e1652665bd93..5e659ee08d6a 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1863,7 +1863,9 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
 
 	__blist_del_buffer(list, jh);
 	jh->b_jlist = BJ_None;
-	if (test_clear_buffer_jbddirty(bh))
+	if (transaction && is_journal_aborted(transaction->t_journal))
+		clear_buffer_jbddirty(bh);
+	else if (test_clear_buffer_jbddirty(bh))
 		mark_buffer_dirty(bh);	/* Expose it to the VM */
 }
 
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index cf4c636ff4da..439b946c4808 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -41,6 +41,9 @@ static bool kernfs_lockdep(struct kernfs_node *kn)
 
 static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
 {
+	if (!kn)
+		return strlcpy(buf, "(null)", buflen);
+
 	return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
 }
 
@@ -110,6 +113,8 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
  * kn_to:   /n1/n2/n3         [depth=3]
  * result:  /../..
  *
+ * [3] when @kn_to is NULL result will be "(null)"
+ *
  * Returns the length of the full path.  If the full length is equal to or
  * greater than @buflen, @buf contains the truncated path with the trailing
  * '\0'.  On error, -errno is returned.
@@ -123,6 +128,9 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
 	size_t depth_from, depth_to, len = 0;
 	int i, j;
 
+	if (!kn_to)
+		return strlcpy(buf, "(null)", buflen);
+
 	if (!kn_from)
 		kn_from = kernfs_root(kn_to)->kn;
 
@@ -166,6 +174,8 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
  * similar to strlcpy().  It returns the length of @kn's name and if @buf
  * isn't long enough, it's filled upto @buflen-1 and nul terminated.
  *
+ * Fills buffer with "(null)" if @kn is NULL.
+ *
  * This function can be called from any context.
  */
 int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
diff --git a/fs/libfs.c b/fs/libfs.c
index e973cd51f126..28d6f35feed6 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -245,7 +245,8 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name,
 	struct inode *root;
 	struct qstr d_name = QSTR_INIT(name, strlen(name));
 
-	s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL);
+	s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER,
+			&init_user_ns, NULL);
 	if (IS_ERR(s))
 		return ERR_CAST(s);
 
diff --git a/fs/mount.h b/fs/mount.h
index 2c856fc47ae3..2826543a131d 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -89,7 +89,6 @@ static inline int is_mounted(struct vfsmount *mnt)
 }
 
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);
-extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
 
 extern int __legitimize_mnt(struct vfsmount *, unsigned);
 extern bool legitimize_mnt(struct vfsmount *, unsigned);
diff --git a/fs/namei.c b/fs/namei.c
index 7d87699c3e2e..d41fab78798b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -672,17 +672,15 @@ static bool legitimize_links(struct nameidata *nd)
 /**
  * unlazy_walk - try to switch to ref-walk mode.
  * @nd: nameidata pathwalk data
- * @dentry: child of nd->path.dentry or NULL
- * @seq: seq number to check dentry against
  * Returns: 0 on success, -ECHILD on failure
  *
- * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
- * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
- * @nd or NULL.  Must be called from rcu-walk context.
+ * unlazy_walk attempts to legitimize the current nd->path and nd->root
+ * for ref-walk mode.
+ * Must be called from rcu-walk context.
  * Nothing should touch nameidata between unlazy_walk() failure and
  * terminate_walk().
  */
-static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq)
+static int unlazy_walk(struct nameidata *nd)
 {
 	struct dentry *parent = nd->path.dentry;
 
@@ -691,33 +689,66 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq
 	nd->flags &= ~LOOKUP_RCU;
 	if (unlikely(!legitimize_links(nd)))
 		goto out2;
+	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
+		goto out1;
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+		if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq)))
+			goto out;
+	}
+	rcu_read_unlock();
+	BUG_ON(nd->inode != parent->d_inode);
+	return 0;
+
+out2:
+	nd->path.mnt = NULL;
+	nd->path.dentry = NULL;
+out1:
+	if (!(nd->flags & LOOKUP_ROOT))
+		nd->root.mnt = NULL;
+out:
+	rcu_read_unlock();
+	return -ECHILD;
+}
+
+/**
+ * unlazy_child - try to switch to ref-walk mode.
+ * @nd: nameidata pathwalk data
+ * @dentry: child of nd->path.dentry
+ * @seq: seq number to check dentry against
+ * Returns: 0 on success, -ECHILD on failure
+ *
+ * unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
+ * for ref-walk mode.  @dentry must be a path found by a do_lookup call on
+ * @nd.  Must be called from rcu-walk context.
+ * Nothing should touch nameidata between unlazy_child() failure and
+ * terminate_walk().
+ */
+static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned seq)
+{
+	BUG_ON(!(nd->flags & LOOKUP_RCU));
+
+	nd->flags &= ~LOOKUP_RCU;
+	if (unlikely(!legitimize_links(nd)))
+		goto out2;
 	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
 		goto out2;
-	if (unlikely(!lockref_get_not_dead(&parent->d_lockref)))
+	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
 		goto out1;
 
 	/*
-	 * For a negative lookup, the lookup sequence point is the parents
-	 * sequence point, and it only needs to revalidate the parent dentry.
-	 *
-	 * For a positive lookup, we need to move both the parent and the
-	 * dentry from the RCU domain to be properly refcounted. And the
-	 * sequence number in the dentry validates *both* dentry counters,
-	 * since we checked the sequence number of the parent after we got
-	 * the child sequence number. So we know the parent must still
-	 * be valid if the child sequence number is still valid.
+	 * We need to move both the parent and the dentry from the RCU domain
+	 * to be properly refcounted. And the sequence number in the dentry
+	 * validates *both* dentry counters, since we checked the sequence
+	 * number of the parent after we got the child sequence number. So we
+	 * know the parent must still be valid if the child sequence number is
 	 */
-	if (!dentry) {
-		if (read_seqcount_retry(&parent->d_seq, nd->seq))
-			goto out;
-		BUG_ON(nd->inode != parent->d_inode);
-	} else {
-		if (!lockref_get_not_dead(&dentry->d_lockref))
-			goto out;
-		if (read_seqcount_retry(&dentry->d_seq, seq))
-			goto drop_dentry;
+	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
+		goto out;
+	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) {
+		rcu_read_unlock();
+		dput(dentry);
+		goto drop_root_mnt;
 	}
-
 	/*
 	 * Sequence counts matched. Now make sure that the root is
 	 * still valid and get it if required.
@@ -733,10 +764,6 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq
 	rcu_read_unlock();
 	return 0;
 
-drop_dentry:
-	rcu_read_unlock();
-	dput(dentry);
-	goto drop_root_mnt;
 out2:
 	nd->path.mnt = NULL;
 out1:
@@ -749,27 +776,12 @@ drop_root_mnt:
 	return -ECHILD;
 }
 
-static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq)
-{
-	if (unlikely(!legitimize_path(nd, link, seq))) {
-		drop_links(nd);
-		nd->depth = 0;
-		nd->flags &= ~LOOKUP_RCU;
-		nd->path.mnt = NULL;
-		nd->path.dentry = NULL;
-		if (!(nd->flags & LOOKUP_ROOT))
-			nd->root.mnt = NULL;
-		rcu_read_unlock();
-	} else if (likely(unlazy_walk(nd, NULL, 0)) == 0) {
-		return 0;
-	}
-	path_put(link);
-	return -ECHILD;
-}
-
 static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
 {
-	return dentry->d_op->d_revalidate(dentry, flags);
+	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
+		return dentry->d_op->d_revalidate(dentry, flags);
+	else
+		return 1;
 }
 
 /**
@@ -790,7 +802,7 @@ static int complete_walk(struct nameidata *nd)
 	if (nd->flags & LOOKUP_RCU) {
 		if (!(nd->flags & LOOKUP_ROOT))
 			nd->root.mnt = NULL;
-		if (unlikely(unlazy_walk(nd, NULL, 0)))
+		if (unlikely(unlazy_walk(nd)))
 			return -ECHILD;
 	}
 
@@ -1016,7 +1028,7 @@ const char *get_link(struct nameidata *nd)
 		touch_atime(&last->link);
 		cond_resched();
 	} else if (atime_needs_update_rcu(&last->link, inode)) {
-		if (unlikely(unlazy_walk(nd, NULL, 0)))
+		if (unlikely(unlazy_walk(nd)))
 			return ERR_PTR(-ECHILD);
 		touch_atime(&last->link);
 	}
@@ -1035,7 +1047,7 @@ const char *get_link(struct nameidata *nd)
 		if (nd->flags & LOOKUP_RCU) {
 			res = get(NULL, inode, &last->done);
 			if (res == ERR_PTR(-ECHILD)) {
-				if (unlikely(unlazy_walk(nd, NULL, 0)))
+				if (unlikely(unlazy_walk(nd)))
 					return ERR_PTR(-ECHILD);
 				res = get(dentry, inode, &last->done);
 			}
@@ -1100,7 +1112,6 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 			    bool *need_mntput)
 {
 	struct vfsmount *mnt;
-	const struct cred *old_cred;
 	int err;
 
 	if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
@@ -1129,9 +1140,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
 	if (nd->total_link_count >= 40)
 		return -ELOOP;
 
-	old_cred = override_creds(&init_cred);
 	mnt = path->dentry->d_op->d_automount(path);
-	revert_creds(old_cred);
 	if (IS_ERR(mnt)) {
 		/*
 		 * The filesystem is allowed to return -EISDIR here to indicate
@@ -1472,19 +1481,14 @@ static struct dentry *lookup_dcache(const struct qstr *name,
 				    struct dentry *dir,
 				    unsigned int flags)
 {
-	struct dentry *dentry;
-	int error;
-
-	dentry = d_lookup(dir, name);
+	struct dentry *dentry = d_lookup(dir, name);
 	if (dentry) {
-		if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
-			error = d_revalidate(dentry, flags);
-			if (unlikely(error <= 0)) {
-				if (!error)
-					d_invalidate(dentry);
-				dput(dentry);
-				return ERR_PTR(error);
-			}
+		int error = d_revalidate(dentry, flags);
+		if (unlikely(error <= 0)) {
+			if (!error)
+				d_invalidate(dentry);
+			dput(dentry);
+			return ERR_PTR(error);
 		}
 	}
 	return dentry;
@@ -1549,7 +1553,7 @@ static int lookup_fast(struct nameidata *nd,
 		bool negative;
 		dentry = __d_lookup_rcu(parent, &nd->last, &seq);
 		if (unlikely(!dentry)) {
-			if (unlazy_walk(nd, NULL, 0))
+			if (unlazy_walk(nd))
 				return -ECHILD;
 			return 0;
 		}
@@ -1574,14 +1578,8 @@ static int lookup_fast(struct nameidata *nd,
 			return -ECHILD;
 
 		*seqp = seq;
-		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
-			status = d_revalidate(dentry, nd->flags);
-		if (unlikely(status <= 0)) {
-			if (unlazy_walk(nd, dentry, seq))
-				return -ECHILD;
-			if (status == -ECHILD)
-				status = d_revalidate(dentry, nd->flags);
-		} else {
+		status = d_revalidate(dentry, nd->flags);
+		if (likely(status > 0)) {
 			/*
 			 * Note: do negative dentry check after revalidation in
 			 * case that drops it.
@@ -1592,15 +1590,17 @@ static int lookup_fast(struct nameidata *nd,
 			path->dentry = dentry;
 			if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
 				return 1;
-			if (unlazy_walk(nd, dentry, seq))
-				return -ECHILD;
 		}
+		if (unlazy_child(nd, dentry, seq))
+			return -ECHILD;
+		if (unlikely(status == -ECHILD))
+			/* we'd been told to redo it in non-rcu mode */
+			status = d_revalidate(dentry, nd->flags);
 	} else {
 		dentry = __d_lookup(parent, &nd->last);
 		if (unlikely(!dentry))
 			return 0;
-		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
-			status = d_revalidate(dentry, nd->flags);
+		status = d_revalidate(dentry, nd->flags);
 	}
 	if (unlikely(status <= 0)) {
 		if (!status)
@@ -1639,8 +1639,7 @@ again:
 	if (IS_ERR(dentry))
 		goto out;
 	if (unlikely(!d_in_lookup(dentry))) {
-		if ((dentry->d_flags & DCACHE_OP_REVALIDATE) &&
-		    !(flags & LOOKUP_NO_REVAL)) {
+		if (!(flags & LOOKUP_NO_REVAL)) {
 			int error = d_revalidate(dentry, flags);
 			if (unlikely(error <= 0)) {
 				if (!error) {
@@ -1671,7 +1670,7 @@ static inline int may_lookup(struct nameidata *nd)
 		int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
 		if (err != -ECHILD)
 			return err;
-		if (unlazy_walk(nd, NULL, 0))
+		if (unlazy_walk(nd))
 			return -ECHILD;
 	}
 	return inode_permission(nd->inode, MAY_EXEC);
@@ -1706,9 +1705,17 @@ static int pick_link(struct nameidata *nd, struct path *link,
 	error = nd_alloc_stack(nd);
 	if (unlikely(error)) {
 		if (error == -ECHILD) {
-			if (unlikely(unlazy_link(nd, link, seq)))
-				return -ECHILD;
-			error = nd_alloc_stack(nd);
+			if (unlikely(!legitimize_path(nd, link, seq))) {
+				drop_links(nd);
+				nd->depth = 0;
+				nd->flags &= ~LOOKUP_RCU;
+				nd->path.mnt = NULL;
+				nd->path.dentry = NULL;
+				if (!(nd->flags & LOOKUP_ROOT))
+					nd->root.mnt = NULL;
+				rcu_read_unlock();
+			} else if (likely(unlazy_walk(nd)) == 0)
+				error = nd_alloc_stack(nd);
 		}
 		if (error) {
 			path_put(link);
@@ -2125,7 +2132,7 @@ OK:
 		}
 		if (unlikely(!d_can_lookup(nd->path.dentry))) {
 			if (nd->flags & LOOKUP_RCU) {
-				if (unlazy_walk(nd, NULL, 0))
+				if (unlazy_walk(nd))
 					return -ECHILD;
 			}
 			return -ENOTDIR;
@@ -2582,7 +2589,7 @@ mountpoint_last(struct nameidata *nd)
 
 	/* If we're in rcuwalk, drop out of it to handle last component */
 	if (nd->flags & LOOKUP_RCU) {
-		if (unlazy_walk(nd, NULL, 0))
+		if (unlazy_walk(nd))
 			return -ECHILD;
 	}
 
@@ -2941,10 +2948,16 @@ static inline int open_to_namei_flags(int flag)
 
 static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode)
 {
+	struct user_namespace *s_user_ns;
 	int error = security_path_mknod(dir, dentry, mode, 0);
 	if (error)
 		return error;
 
+	s_user_ns = dir->dentry->d_sb->s_user_ns;
+	if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+	    !kgid_has_mapping(s_user_ns, current_fsgid()))
+		return -EOVERFLOW;
+
 	error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
 	if (error)
 		return error;
@@ -3069,9 +3082,6 @@ static int lookup_open(struct nameidata *nd, struct path *path,
 		if (d_in_lookup(dentry))
 			break;
 
-		if (!(dentry->d_flags & DCACHE_OP_REVALIDATE))
-			break;
-
 		error = d_revalidate(dentry, nd->flags);
 		if (likely(error > 0))
 			break;
diff --git a/fs/namespace.c b/fs/namespace.c
index b5b1259e064f..8bfad42c1ccf 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -637,28 +637,6 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
 }
 
 /*
- * find the last mount at @dentry on vfsmount @mnt.
- * mount_lock must be held.
- */
-struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
-{
-	struct mount *p, *res = NULL;
-	p = __lookup_mnt(mnt, dentry);
-	if (!p)
-		goto out;
-	if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-		res = p;
-	hlist_for_each_entry_continue(p, mnt_hash) {
-		if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
-			break;
-		if (!(p->mnt.mnt_flags & MNT_UMOUNT))
-			res = p;
-	}
-out:
-	return res;
-}
-
-/*
  * lookup_mnt - Return the first child mount mounted at path
  *
  * "First" means first mounted chronologically.  If you create the
@@ -742,26 +720,50 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
 	return NULL;
 }
 
-static struct mountpoint *new_mountpoint(struct dentry *dentry)
+static struct mountpoint *get_mountpoint(struct dentry *dentry)
 {
-	struct hlist_head *chain = mp_hash(dentry);
-	struct mountpoint *mp;
+	struct mountpoint *mp, *new = NULL;
 	int ret;
 
-	mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
-	if (!mp)
+	if (d_mountpoint(dentry)) {
+mountpoint:
+		read_seqlock_excl(&mount_lock);
+		mp = lookup_mountpoint(dentry);
+		read_sequnlock_excl(&mount_lock);
+		if (mp)
+			goto done;
+	}
+
+	if (!new)
+		new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+	if (!new)
 		return ERR_PTR(-ENOMEM);
 
+
+	/* Exactly one processes may set d_mounted */
 	ret = d_set_mounted(dentry);
-	if (ret) {
-		kfree(mp);
-		return ERR_PTR(ret);
-	}
 
-	mp->m_dentry = dentry;
-	mp->m_count = 1;
-	hlist_add_head(&mp->m_hash, chain);
-	INIT_HLIST_HEAD(&mp->m_list);
+	/* Someone else set d_mounted? */
+	if (ret == -EBUSY)
+		goto mountpoint;
+
+	/* The dentry is not available as a mountpoint? */
+	mp = ERR_PTR(ret);
+	if (ret)
+		goto done;
+
+	/* Add the new mountpoint to the hash table */
+	read_seqlock_excl(&mount_lock);
+	new->m_dentry = dentry;
+	new->m_count = 1;
+	hlist_add_head(&new->m_hash, mp_hash(dentry));
+	INIT_HLIST_HEAD(&new->m_list);
+	read_sequnlock_excl(&mount_lock);
+
+	mp = new;
+	new = NULL;
+done:
+	kfree(new);
 	return mp;
 }
 
@@ -854,6 +856,13 @@ void mnt_set_mountpoint(struct mount *mnt,
 	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
 }
 
+static void __attach_mnt(struct mount *mnt, struct mount *parent)
+{
+	hlist_add_head_rcu(&mnt->mnt_hash,
+			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
+	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+}
+
 /*
  * vfsmount lock must be held for write
  */
@@ -862,28 +871,45 @@ static void attach_mnt(struct mount *mnt,
 			struct mountpoint *mp)
 {
 	mnt_set_mountpoint(parent, mp, mnt);
-	hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
-	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	__attach_mnt(mnt, parent);
 }
 
-static void attach_shadowed(struct mount *mnt,
-			struct mount *parent,
-			struct mount *shadows)
+void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
 {
-	if (shadows) {
-		hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
-		list_add(&mnt->mnt_child, &shadows->mnt_child);
-	} else {
-		hlist_add_head_rcu(&mnt->mnt_hash,
-				m_hash(&parent->mnt, mnt->mnt_mountpoint));
-		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
-	}
+	struct mountpoint *old_mp = mnt->mnt_mp;
+	struct dentry *old_mountpoint = mnt->mnt_mountpoint;
+	struct mount *old_parent = mnt->mnt_parent;
+
+	list_del_init(&mnt->mnt_child);
+	hlist_del_init(&mnt->mnt_mp_list);
+	hlist_del_init_rcu(&mnt->mnt_hash);
+
+	attach_mnt(mnt, parent, mp);
+
+	put_mountpoint(old_mp);
+
+	/*
+	 * Safely avoid even the suggestion this code might sleep or
+	 * lock the mount hash by taking advantage of the knowledge that
+	 * mnt_change_mountpoint will not release the final reference
+	 * to a mountpoint.
+	 *
+	 * During mounting, the mount passed in as the parent mount will
+	 * continue to use the old mountpoint and during unmounting, the
+	 * old mountpoint will continue to exist until namespace_unlock,
+	 * which happens well after mnt_change_mountpoint.
+	 */
+	spin_lock(&old_mountpoint->d_lock);
+	old_mountpoint->d_lockref.count--;
+	spin_unlock(&old_mountpoint->d_lock);
+
+	mnt_add_count(old_parent, -1);
 }
 
 /*
  * vfsmount lock must be held for write
  */
-static void commit_tree(struct mount *mnt, struct mount *shadows)
+static void commit_tree(struct mount *mnt)
 {
 	struct mount *parent = mnt->mnt_parent;
 	struct mount *m;
@@ -901,7 +927,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
 	n->mounts += n->pending_mounts;
 	n->pending_mounts = 0;
 
-	attach_shadowed(mnt, parent, shadows);
+	__attach_mnt(mnt, parent);
 	touch_mnt_namespace(n);
 }
 
@@ -965,6 +991,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
 
+struct vfsmount *
+vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
+	     const char *name, void *data)
+{
+	/* Until it is worked out how to pass the user namespace
+	 * through from the parent mount to the submount don't support
+	 * unprivileged mounts with submounts.
+	 */
+	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
+		return ERR_PTR(-EPERM);
+
+	return vfs_kern_mount(type, MS_SUBMOUNT, name, data);
+}
+EXPORT_SYMBOL_GPL(vfs_submount);
+
 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 					int flag)
 {
@@ -1595,11 +1636,11 @@ void __detach_mounts(struct dentry *dentry)
 	struct mount *mnt;
 
 	namespace_lock();
+	lock_mount_hash();
 	mp = lookup_mountpoint(dentry);
 	if (IS_ERR_OR_NULL(mp))
 		goto out_unlock;
 
-	lock_mount_hash();
 	event++;
 	while (!hlist_empty(&mp->m_list)) {
 		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
@@ -1609,9 +1650,9 @@ void __detach_mounts(struct dentry *dentry)
 		}
 		else umount_tree(mnt, UMOUNT_CONNECTED);
 	}
-	unlock_mount_hash();
 	put_mountpoint(mp);
 out_unlock:
+	unlock_mount_hash();
 	namespace_unlock();
 }
 
@@ -1740,7 +1781,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 			continue;
 
 		for (s = r; s; s = next_mnt(s, r)) {
-			struct mount *t = NULL;
 			if (!(flag & CL_COPY_UNBINDABLE) &&
 			    IS_MNT_UNBINDABLE(s)) {
 				s = skip_mnt_tree(s);
@@ -1762,14 +1802,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
 				goto out;
 			lock_mount_hash();
 			list_add_tail(&q->mnt_list, &res->mnt_list);
-			mnt_set_mountpoint(parent, p->mnt_mp, q);
-			if (!list_empty(&parent->mnt_mounts)) {
-				t = list_last_entry(&parent->mnt_mounts,
-					struct mount, mnt_child);
-				if (t->mnt_mp != p->mnt_mp)
-					t = NULL;
-			}
-			attach_shadowed(q, parent, t);
+			attach_mnt(q, parent, p->mnt_mp);
 			unlock_mount_hash();
 		}
 	}
@@ -1968,10 +2001,18 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 {
 	HLIST_HEAD(tree_list);
 	struct mnt_namespace *ns = dest_mnt->mnt_ns;
+	struct mountpoint *smp;
 	struct mount *child, *p;
 	struct hlist_node *n;
 	int err;
 
+	/* Preallocate a mountpoint in case the new mounts need
+	 * to be tucked under other mounts.
+	 */
+	smp = get_mountpoint(source_mnt->mnt.mnt_root);
+	if (IS_ERR(smp))
+		return PTR_ERR(smp);
+
 	/* Is there space to add these mounts to the mount namespace? */
 	if (!parent_path) {
 		err = count_mounts(ns, source_mnt);
@@ -1998,16 +2039,19 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
-		commit_tree(source_mnt, NULL);
+		commit_tree(source_mnt);
 	}
 
 	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
 		struct mount *q;
 		hlist_del_init(&child->mnt_hash);
-		q = __lookup_mnt_last(&child->mnt_parent->mnt,
-				      child->mnt_mountpoint);
-		commit_tree(child, q);
+		q = __lookup_mnt(&child->mnt_parent->mnt,
+				 child->mnt_mountpoint);
+		if (q)
+			mnt_change_mountpoint(child, smp, q);
+		commit_tree(child);
 	}
+	put_mountpoint(smp);
 	unlock_mount_hash();
 
 	return 0;
@@ -2022,6 +2066,11 @@ static int attach_recursive_mnt(struct mount *source_mnt,
 	cleanup_group_ids(source_mnt, NULL);
  out:
 	ns->pending_mounts = 0;
+
+	read_seqlock_excl(&mount_lock);
+	put_mountpoint(smp);
+	read_sequnlock_excl(&mount_lock);
+
 	return err;
 }
 
@@ -2038,9 +2087,7 @@ retry:
 	namespace_lock();
 	mnt = lookup_mnt(path);
 	if (likely(!mnt)) {
-		struct mountpoint *mp = lookup_mountpoint(dentry);
-		if (!mp)
-			mp = new_mountpoint(dentry);
+		struct mountpoint *mp = get_mountpoint(dentry);
 		if (IS_ERR(mp)) {
 			namespace_unlock();
 			inode_unlock(dentry->d_inode);
@@ -2059,7 +2106,11 @@ retry:
 static void unlock_mount(struct mountpoint *where)
 {
 	struct dentry *dentry = where->m_dentry;
+
+	read_seqlock_excl(&mount_lock);
 	put_mountpoint(where);
+	read_sequnlock_excl(&mount_lock);
+
 	namespace_unlock();
 	inode_unlock(dentry->d_inode);
 }
@@ -2768,7 +2819,7 @@ long do_mount(const char *dev_name, const char __user *dir_name,
 
 	flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
 		   MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
-		   MS_STRICTATIME | MS_NOREMOTELOCK);
+		   MS_STRICTATIME | MS_NOREMOTELOCK | MS_SUBMOUNT);
 
 	if (flags & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
@@ -3135,9 +3186,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
 	/* A moved mount should not expire automatically */
 	list_del_init(&new_mnt->mnt_expire);
+	put_mountpoint(root_mp);
 	unlock_mount_hash();
 	chroot_fs_refs(&root, &new);
-	put_mountpoint(root_mp);
 	error = 0;
 out4:
 	unlock_mount(old_mp);
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 5551e8ef67fd..e49d831c4e85 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -226,7 +226,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
 					   const char *devname,
 					   struct nfs_clone_mount *mountdata)
 {
-	return vfs_kern_mount(&nfs_xdev_fs_type, 0, devname, mountdata);
+	return vfs_submount(mountdata->dentry, &nfs_xdev_fs_type, devname, mountdata);
 }
 
 /**
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index d21104912676..d8b040bd9814 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -279,7 +279,7 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
 				mountdata->hostname,
 				mountdata->mnt_path);
 
-		mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata);
+		mnt = vfs_submount(mountdata->dentry, &nfs4_referral_fs_type, page, mountdata);
 		if (!IS_ERR(mnt))
 			break;
 	}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6dcbc5defb7a..0a0eaecf9676 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -38,7 +38,6 @@
 #include <linux/mm.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
-#include <linux/file.h>
 #include <linux/string.h>
 #include <linux/ratelimit.h>
 #include <linux/printk.h>
@@ -1083,7 +1082,8 @@ int nfs4_call_sync(struct rpc_clnt *clnt,
 	return nfs4_call_sync_sequence(clnt, server, msg, args, res);
 }
 
-static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
+static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo,
+		unsigned long timestamp)
 {
 	struct nfs_inode *nfsi = NFS_I(dir);
 
@@ -1099,6 +1099,7 @@ static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
 				NFS_INO_INVALID_ACL;
 	}
 	dir->i_version = cinfo->after;
+	nfsi->read_cache_jiffies = timestamp;
 	nfsi->attr_gencount = nfs_inc_attr_generation_counter();
 	nfs_fscache_invalidate(dir);
 	spin_unlock(&dir->i_lock);
@@ -2391,11 +2392,13 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 	nfs_fattr_map_and_free_names(server, &data->f_attr);
 
 	if (o_arg->open_flags & O_CREAT) {
-		update_changeattr(dir, &o_res->cinfo);
 		if (o_arg->open_flags & O_EXCL)
 			data->file_created = 1;
 		else if (o_res->cinfo.before != o_res->cinfo.after)
 			data->file_created = 1;
+		if (data->file_created || dir->i_version != o_res->cinfo.after)
+			update_changeattr(dir, &o_res->cinfo,
+					o_res->f_attr->time_start);
 	}
 	if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0)
 		server->caps &= ~NFS_CAP_POSIX_LOCK;
@@ -2697,7 +2700,8 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata,
 		sattr->ia_valid |= ATTR_MTIME;
 
 	/* Except MODE, it seems harmless of setting twice. */
-	if ((attrset[1] & FATTR4_WORD1_MODE))
+	if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE &&
+		attrset[1] & FATTR4_WORD1_MODE)
 		sattr->ia_valid &= ~ATTR_MODE;
 
 	if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL)
@@ -4073,11 +4077,12 @@ static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name)
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
+	unsigned long timestamp = jiffies;
 	int status;
 
 	status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
 	if (status == 0)
-		update_changeattr(dir, &res.cinfo);
+		update_changeattr(dir, &res.cinfo, timestamp);
 	return status;
 }
 
@@ -4125,7 +4130,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
 	if (nfs4_async_handle_error(task, res->server, NULL,
 				    &data->timeout) == -EAGAIN)
 		return 0;
-	update_changeattr(dir, &res->cinfo);
+	if (task->tk_status == 0)
+		update_changeattr(dir, &res->cinfo, res->dir_attr->time_start);
 	return 1;
 }
 
@@ -4159,8 +4165,11 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
 	if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
 		return 0;
 
-	update_changeattr(old_dir, &res->old_cinfo);
-	update_changeattr(new_dir, &res->new_cinfo);
+	if (task->tk_status == 0) {
+		update_changeattr(old_dir, &res->old_cinfo, res->old_fattr->time_start);
+		if (new_dir != old_dir)
+			update_changeattr(new_dir, &res->new_cinfo, res->new_fattr->time_start);
+	}
 	return 1;
 }
 
@@ -4197,7 +4206,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct
 
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (!status) {
-		update_changeattr(dir, &res.cinfo);
+		update_changeattr(dir, &res.cinfo, res.fattr->time_start);
 		status = nfs_post_op_update_inode(inode, res.fattr);
 		if (!status)
 			nfs_setsecurity(inode, res.fattr, res.label);
@@ -4272,7 +4281,8 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_
 	int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
 				    &data->arg.seq_args, &data->res.seq_res, 1);
 	if (status == 0) {
-		update_changeattr(dir, &data->res.dir_cinfo);
+		update_changeattr(dir, &data->res.dir_cinfo,
+				data->res.fattr->time_start);
 		status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label);
 	}
 	return status;
@@ -6127,7 +6137,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 	p->server = server;
 	atomic_inc(&lsp->ls_count);
 	p->ctx = get_nfs_open_context(ctx);
-	get_file(fl->fl_file);
 	memcpy(&p->fl, fl, sizeof(p->fl));
 	return p;
 out_free_seqid:
@@ -6240,7 +6249,6 @@ static void nfs4_lock_release(void *calldata)
 		nfs_free_seqid(data->arg.lock_seqid);
 	nfs4_put_lock_state(data->lsp);
 	put_nfs_open_context(data->ctx);
-	fput(data->fl.fl_file);
 	kfree(data);
 	dprintk("%s: done!\n", __func__);
 }
@@ -8483,6 +8491,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
 		goto out;
 	}
 
+	nfs4_sequence_free_slot(&lgp->res.seq_res);
 	err = nfs4_handle_exception(server, nfs4err, exception);
 	if (!status) {
 		if (exception->retry)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1d152f4470cd..daeb94e3acd4 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1091,6 +1091,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
 		case -NFS4ERR_BADXDR:
 		case -NFS4ERR_RESOURCE:
 		case -NFS4ERR_NOFILEHANDLE:
+		case -NFS4ERR_MOVED:
 			/* Non-seqid mutating errors */
 			return;
 	};
@@ -1729,7 +1730,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error)
 			break;
 		case -NFS4ERR_STALE_CLIENTID:
 			set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
-			nfs4_state_clear_reclaim_reboot(clp);
 			nfs4_state_start_reclaim_reboot(clp);
 			break;
 		case -NFS4ERR_EXPIRED:
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 59554f3adf29..dd042498ce7c 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1200,10 +1200,10 @@ _pnfs_return_layout(struct inode *ino)
 
 	send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
 	spin_unlock(&ino->i_lock);
-	pnfs_free_lseg_list(&tmp_list);
 	if (send)
 		status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
 out_put_layout_hdr:
+	pnfs_free_lseg_list(&tmp_list);
 	pnfs_put_layout_hdr(lo);
 out:
 	dprintk("<-- %s status: %d\n", __func__, status);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b00d53d13d47..006068526542 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -728,8 +728,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 		if (likely(head->wb_page && !PageSwapCache(head->wb_page))) {
 			set_page_private(head->wb_page, 0);
 			ClearPagePrivate(head->wb_page);
-			smp_mb__after_atomic();
-			wake_up_page(head->wb_page, PG_private);
 			clear_bit(PG_MAPPED, &head->wb_flags);
 		}
 		nfsi->nrequests--;
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 47febcf99185..20b1c17320d5 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -104,6 +104,7 @@ config NFSD_SCSILAYOUT
 	depends on NFSD_V4 && BLOCK
 	select NFSD_PNFS
 	select EXPORTFS_BLOCK_OPS
+	select BLK_SCSI_REQUEST
 	help
 	  This option enables support for the exporting pNFS SCSI layouts
 	  in the kernel's NFS server. The pNFS SCSI layout enables NFS
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 0780ff864539..a06115e31612 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -10,6 +10,7 @@
 #include <linux/nfsd/debug.h>
 #include <scsi/scsi_proto.h>
 #include <scsi/scsi_common.h>
+#include <scsi/scsi_request.h>
 
 #include "blocklayoutxdr.h"
 #include "pnfs.h"
@@ -213,6 +214,7 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
 {
 	struct request_queue *q = bdev->bd_disk->queue;
 	struct request *rq;
+	struct scsi_request *req;
 	size_t bufflen = 252, len, id_len;
 	u8 *buf, *d, type, assoc;
 	int error;
@@ -221,23 +223,24 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev,
 	if (!buf)
 		return -ENOMEM;
 
-	rq = blk_get_request(q, READ, GFP_KERNEL);
+	rq = blk_get_request(q, REQ_OP_SCSI_IN, GFP_KERNEL);
 	if (IS_ERR(rq)) {
 		error = -ENOMEM;
 		goto out_free_buf;
 	}
-	blk_rq_set_block_pc(rq);
+	req = scsi_req(rq);
+	scsi_req_init(rq);
 
 	error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL);
 	if (error)
 		goto out_put_request;
 
-	rq->cmd[0] = INQUIRY;
-	rq->cmd[1] = 1;
-	rq->cmd[2] = 0x83;
-	rq->cmd[3] = bufflen >> 8;
-	rq->cmd[4] = bufflen & 0xff;
-	rq->cmd_len = COMMAND_SIZE(INQUIRY);
+	req->cmd[0] = INQUIRY;
+	req->cmd[1] = 1;
+	req->cmd[2] = 0x83;
+	req->cmd[3] = bufflen >> 8;
+	req->cmd[4] = bufflen & 0xff;
+	req->cmd_len = COMMAND_SIZE(INQUIRY);
 
 	error = blk_execute_rq(rq->q, NULL, rq, 1);
 	if (error) {
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 596205d939a1..e122da696f1b 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -223,10 +223,11 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
 	struct nfs4_layout_stateid *ls;
 	struct nfs4_stid *stp;
 
-	stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
+	stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache,
+					nfsd4_free_layout_stateid);
 	if (!stp)
 		return NULL;
-	stp->sc_free = nfsd4_free_layout_stateid;
+
 	get_nfs4_file(fp);
 	stp->sc_file = fp;
 
@@ -613,6 +614,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
 {
 	struct nfs4_client *clp = ls->ls_stid.sc_client;
 	char addr_str[INET6_ADDRSTRLEN];
+	static char const nfsd_recall_failed[] = "/sbin/nfsd-recall-failed";
 	static char *envp[] = {
 		"HOME=/",
 		"TERM=linux",
@@ -628,12 +630,13 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
 		"nfsd: client %s failed to respond to layout recall. "
 		"  Fencing..\n", addr_str);
 
-	argv[0] = "/sbin/nfsd-recall-failed";
+	argv[0] = (char *)nfsd_recall_failed;
 	argv[1] = addr_str;
 	argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
 	argv[3] = NULL;
 
-	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	error = call_usermodehelper(nfsd_recall_failed, argv, envp,
+				    UMH_WAIT_PROC);
 	if (error) {
 		printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
 			addr_str, error);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 4b4beaaa4eaa..a0dee8ae9f97 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -633,8 +633,8 @@ out:
 	return co;
 }
 
-struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
-					 struct kmem_cache *slab)
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
+				  void (*sc_free)(struct nfs4_stid *))
 {
 	struct nfs4_stid *stid;
 	int new_id;
@@ -650,6 +650,8 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
 	idr_preload_end();
 	if (new_id < 0)
 		goto out_free;
+
+	stid->sc_free = sc_free;
 	stid->sc_client = cl;
 	stid->sc_stateid.si_opaque.so_id = new_id;
 	stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
@@ -675,15 +677,12 @@ out_free:
 static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
 {
 	struct nfs4_stid *stid;
-	struct nfs4_ol_stateid *stp;
 
-	stid = nfs4_alloc_stid(clp, stateid_slab);
+	stid = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_ol_stateid);
 	if (!stid)
 		return NULL;
 
-	stp = openlockstateid(stid);
-	stp->st_stid.sc_free = nfs4_free_ol_stateid;
-	return stp;
+	return openlockstateid(stid);
 }
 
 static void nfs4_free_deleg(struct nfs4_stid *stid)
@@ -781,11 +780,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh,
 		goto out_dec;
 	if (delegation_blocked(&current_fh->fh_handle))
 		goto out_dec;
-	dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
+	dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg));
 	if (dp == NULL)
 		goto out_dec;
 
-	dp->dl_stid.sc_free = nfs4_free_deleg;
 	/*
 	 * delegation seqid's are never incremented.  The 4.1 special
 	 * meaning of seqid 0 isn't meaningful, really, but let's avoid
@@ -5580,7 +5578,6 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
 	stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner);
 	get_nfs4_file(fp);
 	stp->st_stid.sc_file = fp;
-	stp->st_stid.sc_free = nfs4_free_lock_stateid;
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = open_stp->st_deny_bmap;
 	stp->st_openstp = open_stp;
@@ -5623,7 +5620,7 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi,
 	lst = find_lock_stateid(lo, fi);
 	if (lst == NULL) {
 		spin_unlock(&clp->cl_lock);
-		ns = nfs4_alloc_stid(clp, stateid_slab);
+		ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid);
 		if (ns == NULL)
 			return NULL;
 
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 7ecf16be4a44..8fae53ce21d1 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2440,7 +2440,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
 	p++;                /* to be backfilled later */
 
 	if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
-		u32 *supp = nfsd_suppattrs[minorversion];
+		u32 supp[3];
+
+		memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp));
 
 		if (!IS_POSIXACL(dentry->d_inode))
 			supp[0] &= ~FATTR4_WORD0_ACL;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index c9399366f9df..4516e8b7d776 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -603,8 +603,8 @@ extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
 __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 		     stateid_t *stateid, unsigned char typemask,
 		     struct nfs4_stid **s, struct nfsd_net *nn);
-struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
-		struct kmem_cache *slab);
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
+				  void (*sc_free)(struct nfs4_stid *));
 void nfs4_unhash_stid(struct nfs4_stid *s);
 void nfs4_put_stid(struct nfs4_stid *s);
 void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 12eeae62a2b1..e1872f36147f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1068,7 +1068,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_time_gran = 1;
 	sb->s_max_links = NILFS_LINK_MAX;
 
-	sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
+	sb->s_bdi = bdev_get_queue(sb->s_bdev)->backing_dev_info;
 
 	err = load_nilfs(nilfs, sb);
 	if (err)
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index bbc175d4213d..a4c46221755e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -31,7 +31,6 @@ static bool should_merge(struct fsnotify_event *old_fsn,
 static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
 {
 	struct fsnotify_event *test_event;
-	bool do_merge = false;
 
 	pr_debug("%s: list=%p event=%p\n", __func__, list, event);
 
@@ -47,16 +46,12 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
 
 	list_for_each_entry_reverse(test_event, list, list) {
 		if (should_merge(test_event, event)) {
-			do_merge = true;
-			break;
+			test_event->mask |= event->mask;
+			return 1;
 		}
 	}
 
-	if (!do_merge)
-		return 0;
-
-	test_event->mask |= event->mask;
-	return 1;
+	return 0;
 }
 
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index a6f5907a3fee..7c461fd49c4c 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -30,3 +30,20 @@ extern int inotify_handle_event(struct fsnotify_group *group,
 				const unsigned char *file_name, u32 cookie);
 
 extern const struct fsnotify_ops inotify_fsnotify_ops;
+
+#ifdef CONFIG_INOTIFY_USER
+static inline void dec_inotify_instances(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES);
+}
+
+static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts)
+{
+	return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES);
+}
+
+static inline void dec_inotify_watches(struct ucounts *ucounts)
+{
+	dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES);
+}
+#endif
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 19e7ec109a75..f36c29398de3 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -165,10 +165,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
 	/* ideally the idr is empty and we won't hit the BUG in the callback */
 	idr_for_each(&group->inotify_data.idr, idr_callback, group);
 	idr_destroy(&group->inotify_data.idr);
-	if (group->inotify_data.user) {
-		atomic_dec(&group->inotify_data.user->inotify_devs);
-		free_uid(group->inotify_data.user);
-	}
+	if (group->inotify_data.ucounts)
+		dec_inotify_instances(group->inotify_data.ucounts);
 }
 
 static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 69d1ea3d292a..1cf41c623be1 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -44,10 +44,8 @@
 
 #include <asm/ioctls.h>
 
-/* these are configurable via /proc/sys/fs/inotify/ */
-static int inotify_max_user_instances __read_mostly;
+/* configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_queued_events __read_mostly;
-static int inotify_max_user_watches __read_mostly;
 
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 
@@ -60,7 +58,7 @@ static int zero;
 struct ctl_table inotify_table[] = {
 	{
 		.procname	= "max_user_instances",
-		.data		= &inotify_max_user_instances,
+		.data		= &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -68,7 +66,7 @@ struct ctl_table inotify_table[] = {
 	},
 	{
 		.procname	= "max_user_watches",
-		.data		= &inotify_max_user_watches,
+		.data		= &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES],
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -500,7 +498,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
 	/* remove this mark from the idr */
 	inotify_remove_from_idr(group, i_mark);
 
-	atomic_dec(&group->inotify_data.user->inotify_watches);
+	dec_inotify_watches(group->inotify_data.ucounts);
 }
 
 /* ding dong the mark is dead */
@@ -584,14 +582,17 @@ static int inotify_new_watch(struct fsnotify_group *group,
 	tmp_i_mark->fsn_mark.mask = mask;
 	tmp_i_mark->wd = -1;
 
-	ret = -ENOSPC;
-	if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
-		goto out_err;
-
 	ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
 	if (ret)
 		goto out_err;
 
+	/* increment the number of watches the user has */
+	if (!inc_inotify_watches(group->inotify_data.ucounts)) {
+		inotify_remove_from_idr(group, tmp_i_mark);
+		ret = -ENOSPC;
+		goto out_err;
+	}
+
 	/* we are on the idr, now get on the inode */
 	ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode,
 				       NULL, 0);
@@ -601,8 +602,6 @@ static int inotify_new_watch(struct fsnotify_group *group,
 		goto out_err;
 	}
 
-	/* increment the number of watches the user has */
-	atomic_inc(&group->inotify_data.user->inotify_watches);
 
 	/* return the watch descriptor for this new mark */
 	ret = tmp_i_mark->wd;
@@ -653,10 +652,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
 
 	spin_lock_init(&group->inotify_data.idr_lock);
 	idr_init(&group->inotify_data.idr);
-	group->inotify_data.user = get_current_user();
+	group->inotify_data.ucounts = inc_ucount(current_user_ns(),
+						 current_euid(),
+						 UCOUNT_INOTIFY_INSTANCES);
 
-	if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
-	    inotify_max_user_instances) {
+	if (!group->inotify_data.ucounts) {
 		fsnotify_destroy_group(group);
 		return ERR_PTR(-EMFILE);
 	}
@@ -819,8 +819,8 @@ static int __init inotify_user_setup(void)
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
 
 	inotify_max_queued_events = 16384;
-	inotify_max_user_instances = 128;
-	inotify_max_user_watches = 8192;
+	init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
+	init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192;
 
 	return 0;
 }
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d3fea0bd89e2..6043306e8e21 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -510,18 +510,6 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group)
 	}
 }
 
-void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old)
-{
-	assert_spin_locked(&old->lock);
-	new->inode = old->inode;
-	new->mnt = old->mnt;
-	if (old->group)
-		fsnotify_get_group(old->group);
-	new->group = old->group;
-	new->mask = old->mask;
-	new->free_mark = old->free_mark;
-}
-
 /*
  * Nothing fancy, just initialize lists and locks and counters.
  */
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 8c9fb29c6673..1656843e87d2 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -7,6 +7,7 @@
 #include <linux/seq_file.h>
 #include <linux/user_namespace.h>
 #include <linux/nsfs.h>
+#include <linux/uaccess.h>
 
 static struct vfsmount *nsfs_mnt;
 
@@ -163,7 +164,10 @@ int open_related_ns(struct ns_common *ns,
 static long ns_ioctl(struct file *filp, unsigned int ioctl,
 			unsigned long arg)
 {
+	struct user_namespace *user_ns;
 	struct ns_common *ns = get_proc_ns(file_inode(filp));
+	uid_t __user *argp;
+	uid_t uid;
 
 	switch (ioctl) {
 	case NS_GET_USERNS:
@@ -172,6 +176,15 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
 		if (!ns->ops->get_parent)
 			return -EINVAL;
 		return open_related_ns(ns, ns->ops->get_parent);
+	case NS_GET_NSTYPE:
+		return ns->ops->type;
+	case NS_GET_OWNER_UID:
+		if (ns->ops->type != CLONE_NEWUSER)
+			return -EINVAL;
+		user_ns = container_of(ns, struct user_namespace, ns);
+		argp = (uid_t __user *) arg;
+		uid = from_kuid_munged(current_user_ns(), user_ns->owner);
+		return put_user(uid, argp);
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index bed1fcb63088..dc22ba8c710f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -283,16 +283,14 @@ int ocfs2_set_acl(handle_t *handle,
 int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
 	struct buffer_head *bh = NULL;
-	int status = 0;
+	int status, had_lock;
+	struct ocfs2_lock_holder oh;
 
-	status = ocfs2_inode_lock(inode, &bh, 1);
-	if (status < 0) {
-		if (status != -ENOENT)
-			mlog_errno(status);
-		return status;
-	}
+	had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
+	if (had_lock < 0)
+		return had_lock;
 	status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
-	ocfs2_inode_unlock(inode, 1);
+	ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
 	brelse(bh);
 	return status;
 }
@@ -302,21 +300,20 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type)
 	struct ocfs2_super *osb;
 	struct buffer_head *di_bh = NULL;
 	struct posix_acl *acl;
-	int ret;
+	int had_lock;
+	struct ocfs2_lock_holder oh;
 
 	osb = OCFS2_SB(inode->i_sb);
 	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
 		return NULL;
-	ret = ocfs2_inode_lock(inode, &di_bh, 0);
-	if (ret < 0) {
-		if (ret != -ENOENT)
-			mlog_errno(ret);
-		return ERR_PTR(ret);
-	}
+
+	had_lock = ocfs2_inode_lock_tracker(inode, &di_bh, 0, &oh);
+	if (had_lock < 0)
+		return ERR_PTR(had_lock);
 
 	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
 
-	ocfs2_inode_unlock(inode, 0);
+	ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
 	brelse(di_bh);
 	return acl;
 }
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 27d1242c8383..564c504d6efd 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -349,7 +349,7 @@ static void sc_show_sock_container(struct seq_file *seq,
 		   "  func key:        0x%08x\n"
 		   "  func type:       %u\n",
 		   sc,
-		   atomic_read(&sc->sc_kref.refcount),
+		   kref_read(&sc->sc_kref),
 		   &saddr, inet ? ntohs(sport) : 0,
 		   &daddr, inet ? ntohs(dport) : 0,
 		   sc->sc_node->nd_name,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index d4b5c81f0445..ec000575e863 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -97,7 +97,7 @@
 	typeof(sc) __sc = (sc);						\
 	mlog(ML_SOCKET, "[sc %p refs %d sock %p node %u page %p "	\
 	     "pg_off %zu] " fmt, __sc,					\
-	     atomic_read(&__sc->sc_kref.refcount), __sc->sc_sock,	\
+	     kref_read(&__sc->sc_kref), __sc->sc_sock,	\
 	    __sc->sc_node->nd_num, __sc->sc_page, __sc->sc_page_off ,	\
 	    ##args);							\
 } while (0)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index e7b760deefae..9b984cae4c4e 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -81,7 +81,7 @@ static void __dlm_print_lock(struct dlm_lock *lock)
 	       lock->ml.type, lock->ml.convert_type, lock->ml.node,
 	       dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
 	       dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-	       atomic_read(&lock->lock_refs.refcount),
+	       kref_read(&lock->lock_refs),
 	       (list_empty(&lock->ast_list) ? 'y' : 'n'),
 	       (lock->ast_pending ? 'y' : 'n'),
 	       (list_empty(&lock->bast_list) ? 'y' : 'n'),
@@ -106,7 +106,7 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
 	printk("lockres: %s, owner=%u, state=%u\n",
 	       buf, res->owner, res->state);
 	printk("  last used: %lu, refcnt: %u, on purge list: %s\n",
-	       res->last_used, atomic_read(&res->refs.refcount),
+	       res->last_used, kref_read(&res->refs),
 	       list_empty(&res->purge) ? "no" : "yes");
 	printk("  on dirty list: %s, on reco list: %s, "
 	       "migrating pending: %s\n",
@@ -298,7 +298,7 @@ static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
 			mle_type, mle->master, mle->new_master,
 			!list_empty(&mle->hb_events),
 			!!mle->inuse,
-			atomic_read(&mle->mle_refs.refcount));
+			kref_read(&mle->mle_refs));
 
 	out += snprintf(buf + out, len - out, "Maybe=");
 	out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
@@ -494,7 +494,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
 		       lock->ast_pending, lock->bast_pending,
 		       lock->convert_pending, lock->lock_pending,
 		       lock->cancel_pending, lock->unlock_pending,
-		       atomic_read(&lock->lock_refs.refcount));
+		       kref_read(&lock->lock_refs));
 	spin_unlock(&lock->spinlock);
 
 	return out;
@@ -521,7 +521,7 @@ static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
 			!list_empty(&res->recovering),
 			res->inflight_locks, res->migration_pending,
 			atomic_read(&res->asts_reserved),
-			atomic_read(&res->refs.refcount));
+			kref_read(&res->refs));
 
 	/* refmap */
 	out += snprintf(buf + out, len - out, "RMAP:");
@@ -777,7 +777,7 @@ static int debug_state_print(struct dlm_ctxt *dlm, char *buf, int len)
 	/* Purge Count: xxx  Refs: xxx */
 	out += snprintf(buf + out, len - out,
 			"Purge Count: %d  Refs: %d\n", dlm->purge_count,
-			atomic_read(&dlm->dlm_refs.refcount));
+			kref_read(&dlm->dlm_refs));
 
 	/* Dead Node: xxx */
 	out += snprintf(buf + out, len - out,
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 733e4e79c8e2..32fd261ae13d 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -2072,7 +2072,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	INIT_LIST_HEAD(&dlm->dlm_eviction_callbacks);
 
 	mlog(0, "context init: refcount %u\n",
-		  atomic_read(&dlm->dlm_refs.refcount));
+		  kref_read(&dlm->dlm_refs));
 
 leave:
 	if (ret < 0 && dlm) {
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a464c8088170..7025d8c27999 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -233,7 +233,7 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle)
 
 	assert_spin_locked(&dlm->spinlock);
 	assert_spin_locked(&dlm->master_lock);
-	if (!atomic_read(&mle->mle_refs.refcount)) {
+	if (!kref_read(&mle->mle_refs)) {
 		/* this may or may not crash, but who cares.
 		 * it's a BUG. */
 		mlog(ML_ERROR, "bad mle: %p\n", mle);
@@ -1124,9 +1124,9 @@ recheck:
 		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
 
 		/*
-		if (atomic_read(&mle->mle_refs.refcount) < 2)
+		if (kref_read(&mle->mle_refs) < 2)
 			mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
-			atomic_read(&mle->mle_refs.refcount),
+			kref_read(&mle->mle_refs),
 			res->lockname.len, res->lockname.name);
 		*/
 		atomic_set(&mle->woken, 0);
@@ -1979,7 +1979,7 @@ ok:
 		 * on this mle. */
 		spin_lock(&dlm->master_lock);
 
-		rr = atomic_read(&mle->mle_refs.refcount);
+		rr = kref_read(&mle->mle_refs);
 		if (mle->inuse > 0) {
 			if (extra_ref && rr < 3)
 				err = 1;
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 1082b2c3014b..63d701cd1e2e 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -251,7 +251,7 @@ leave:
 		mlog(0, "lock %u:%llu should be gone now! refs=%d\n",
 		     dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
 		     dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
-		     atomic_read(&lock->lock_refs.refcount)-1);
+		     kref_read(&lock->lock_refs)-1);
 		dlm_lock_put(lock);
 	}
 	if (actions & DLM_UNLOCK_CALL_AST)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 83d576f6a287..8dce4099a6ca 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
 	init_waitqueue_head(&res->l_event);
 	INIT_LIST_HEAD(&res->l_blocked_list);
 	INIT_LIST_HEAD(&res->l_mask_waiters);
+	INIT_LIST_HEAD(&res->l_holders);
 }
 
 void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
@@ -749,6 +750,50 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 	res->l_flags = 0UL;
 }
 
+/*
+ * Keep a list of processes who have interest in a lockres.
+ * Note: this is now only uesed for check recursive cluster locking.
+ */
+static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres,
+				   struct ocfs2_lock_holder *oh)
+{
+	INIT_LIST_HEAD(&oh->oh_list);
+	oh->oh_owner_pid = get_pid(task_pid(current));
+
+	spin_lock(&lockres->l_lock);
+	list_add_tail(&oh->oh_list, &lockres->l_holders);
+	spin_unlock(&lockres->l_lock);
+}
+
+static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres,
+				       struct ocfs2_lock_holder *oh)
+{
+	spin_lock(&lockres->l_lock);
+	list_del(&oh->oh_list);
+	spin_unlock(&lockres->l_lock);
+
+	put_pid(oh->oh_owner_pid);
+}
+
+static inline int ocfs2_is_locked_by_me(struct ocfs2_lock_res *lockres)
+{
+	struct ocfs2_lock_holder *oh;
+	struct pid *pid;
+
+	/* look in the list of holders for one with the current task as owner */
+	spin_lock(&lockres->l_lock);
+	pid = task_pid(current);
+	list_for_each_entry(oh, &lockres->l_holders, oh_list) {
+		if (oh->oh_owner_pid == pid) {
+			spin_unlock(&lockres->l_lock);
+			return 1;
+		}
+	}
+	spin_unlock(&lockres->l_lock);
+
+	return 0;
+}
+
 static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
 				     int level)
 {
@@ -2333,8 +2378,9 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
 		goto getbh;
 	}
 
-	if (ocfs2_mount_local(osb))
-		goto local;
+	if ((arg_flags & OCFS2_META_LOCK_GETBH) ||
+	    ocfs2_mount_local(osb))
+		goto update;
 
 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
 		ocfs2_wait_for_recovery(osb);
@@ -2363,7 +2409,7 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
 		ocfs2_wait_for_recovery(osb);
 
-local:
+update:
 	/*
 	 * We only see this flag if we're being called from
 	 * ocfs2_read_locked_inode(). It means we're locking an inode
@@ -2497,6 +2543,59 @@ void ocfs2_inode_unlock(struct inode *inode,
 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
 }
 
+/*
+ * This _tracker variantes are introduced to deal with the recursive cluster
+ * locking issue. The idea is to keep track of a lock holder on the stack of
+ * the current process. If there's a lock holder on the stack, we know the
+ * task context is already protected by cluster locking. Currently, they're
+ * used in some VFS entry routines.
+ *
+ * return < 0 on error, return == 0 if there's no lock holder on the stack
+ * before this call, return == 1 if this call would be a recursive locking.
+ */
+int ocfs2_inode_lock_tracker(struct inode *inode,
+			     struct buffer_head **ret_bh,
+			     int ex,
+			     struct ocfs2_lock_holder *oh)
+{
+	int status;
+	int arg_flags = 0, has_locked;
+	struct ocfs2_lock_res *lockres;
+
+	lockres = &OCFS2_I(inode)->ip_inode_lockres;
+	has_locked = ocfs2_is_locked_by_me(lockres);
+	/* Just get buffer head if the cluster lock has been taken */
+	if (has_locked)
+		arg_flags = OCFS2_META_LOCK_GETBH;
+
+	if (likely(!has_locked || ret_bh)) {
+		status = ocfs2_inode_lock_full(inode, ret_bh, ex, arg_flags);
+		if (status < 0) {
+			if (status != -ENOENT)
+				mlog_errno(status);
+			return status;
+		}
+	}
+	if (!has_locked)
+		ocfs2_add_holder(lockres, oh);
+
+	return has_locked;
+}
+
+void ocfs2_inode_unlock_tracker(struct inode *inode,
+				int ex,
+				struct ocfs2_lock_holder *oh,
+				int had_lock)
+{
+	struct ocfs2_lock_res *lockres;
+
+	lockres = &OCFS2_I(inode)->ip_inode_lockres;
+	if (!had_lock) {
+		ocfs2_remove_holder(lockres, oh);
+		ocfs2_inode_unlock(inode, ex);
+	}
+}
+
 int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
 {
 	struct ocfs2_lock_res *lockres;
@@ -3303,6 +3402,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 	mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
 	     lockres->l_level, new_level);
 
+	/*
+	 * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always
+	 * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that
+	 * we can recover correctly from node failure. Otherwise, we may get
+	 * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set.
+	 */
+	if (!ocfs2_is_o2cb_active() &&
+	    lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
+		lvb = 1;
+
 	if (lvb)
 		dlm_flags |= DLM_LKF_VALBLK;
 
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index d293a22c32c5..a7fc18ba0dc1 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -70,6 +70,11 @@ struct ocfs2_orphan_scan_lvb {
 	__be32	lvb_os_seqno;
 };
 
+struct ocfs2_lock_holder {
+	struct list_head oh_list;
+	struct pid *oh_owner_pid;
+};
+
 /* ocfs2_inode_lock_full() 'arg_flags' flags */
 /* don't wait on recovery. */
 #define OCFS2_META_LOCK_RECOVERY	(0x01)
@@ -77,6 +82,8 @@ struct ocfs2_orphan_scan_lvb {
 #define OCFS2_META_LOCK_NOQUEUE		(0x02)
 /* don't block waiting for the downconvert thread, instead return -EAGAIN */
 #define OCFS2_LOCK_NONBLOCK		(0x04)
+/* just get back disk inode bh if we've got cluster lock. */
+#define OCFS2_META_LOCK_GETBH		(0x08)
 
 /* Locking subclasses of inode cluster lock */
 enum {
@@ -170,4 +177,15 @@ void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
 
 /* To set the locking protocol on module initialization */
 void ocfs2_set_locking_protocol(void);
+
+/* The _tracker pair is used to avoid cluster recursive locking */
+int ocfs2_inode_lock_tracker(struct inode *inode,
+			     struct buffer_head **ret_bh,
+			     int ex,
+			     struct ocfs2_lock_holder *oh);
+void ocfs2_inode_unlock_tracker(struct inode *inode,
+				int ex,
+				struct ocfs2_lock_holder *oh,
+				int had_lock);
+
 #endif	/* DLMGLUE_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c4889655d32b..7b6a146327d7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1138,6 +1138,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	handle_t *handle = NULL;
 	struct dquot *transfer_to[MAXQUOTAS] = { };
 	int qtype;
+	int had_lock;
+	struct ocfs2_lock_holder oh;
 
 	trace_ocfs2_setattr(inode, dentry,
 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -1173,11 +1175,30 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 	}
 
-	status = ocfs2_inode_lock(inode, &bh, 1);
-	if (status < 0) {
-		if (status != -ENOENT)
-			mlog_errno(status);
+	had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
+	if (had_lock < 0) {
+		status = had_lock;
 		goto bail_unlock_rw;
+	} else if (had_lock) {
+		/*
+		 * As far as we know, ocfs2_setattr() could only be the first
+		 * VFS entry point in the call chain of recursive cluster
+		 * locking issue.
+		 *
+		 * For instance:
+		 * chmod_common()
+		 *  notify_change()
+		 *   ocfs2_setattr()
+		 *    posix_acl_chmod()
+		 *     ocfs2_iop_get_acl()
+		 *
+		 * But, we're not 100% sure if it's always true, because the
+		 * ordering of the VFS entry points in the call chain is out
+		 * of our control. So, we'd better dump the stack here to
+		 * catch the other cases of recursive locking.
+		 */
+		mlog(ML_ERROR, "Another case of recursive locking:\n");
+		dump_stack();
 	}
 	inode_locked = 1;
 
@@ -1260,8 +1281,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 bail_commit:
 	ocfs2_commit_trans(osb, handle);
 bail_unlock:
-	if (status) {
-		ocfs2_inode_unlock(inode, 1);
+	if (status && inode_locked) {
+		ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
 		inode_locked = 0;
 	}
 bail_unlock_rw:
@@ -1279,7 +1300,7 @@ bail:
 			mlog_errno(status);
 	}
 	if (inode_locked)
-		ocfs2_inode_unlock(inode, 1);
+		ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
 
 	brelse(bh);
 	return status;
@@ -1320,21 +1341,32 @@ bail:
 
 int ocfs2_permission(struct inode *inode, int mask)
 {
-	int ret;
+	int ret, had_lock;
+	struct ocfs2_lock_holder oh;
 
 	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
-	ret = ocfs2_inode_lock(inode, NULL, 0);
-	if (ret) {
-		if (ret != -ENOENT)
-			mlog_errno(ret);
+	had_lock = ocfs2_inode_lock_tracker(inode, NULL, 0, &oh);
+	if (had_lock < 0) {
+		ret = had_lock;
 		goto out;
+	} else if (had_lock) {
+		/* See comments in ocfs2_setattr() for details.
+		 * The call chain of this case could be:
+		 * do_sys_open()
+		 *  may_open()
+		 *   inode_permission()
+		 *    ocfs2_permission()
+		 *     ocfs2_iop_get_acl()
+		 */
+		mlog(ML_ERROR, "Another case of recursive locking:\n");
+		dump_stack();
 	}
 
 	ret = generic_permission(inode, mask);
 
-	ocfs2_inode_unlock(inode, 0);
+	ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock);
 out:
 	return ret;
 }
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7e5958b0be6b..0c39d71c67a1 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -172,6 +172,7 @@ struct ocfs2_lock_res {
 
 	struct list_head         l_blocked_list;
 	struct list_head         l_mask_waiters;
+	struct list_head	 l_holders;
 
 	unsigned long		 l_flags;
 	char                     l_name[OCFS2_LOCK_ID_MAX_LEN];
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 52c07346bea3..820359096c7a 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -48,6 +48,12 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl";
  */
 static struct ocfs2_stack_plugin *active_stack;
 
+inline int ocfs2_is_o2cb_active(void)
+{
+	return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB);
+}
+EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active);
+
 static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
 {
 	struct ocfs2_stack_plugin *p;
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index f2dce10fae54..e3036e1790e8 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -298,6 +298,9 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p
 int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
 void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
 
+/* In ocfs2_downconvert_lock(), we need to know which stack we are using */
+int ocfs2_is_o2cb_active(void);
+
 extern struct kset *ocfs2_kset;
 
 #endif  /* STACKGLUE_H */
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index c48859f16e7b..67c24351a67f 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -115,6 +115,13 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb)
 	return &orangefs_inode->vfs_inode;
 }
 
+static void orangefs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+	kmem_cache_free(orangefs_inode_cache, orangefs_inode);
+}
+
 static void orangefs_destroy_inode(struct inode *inode)
 {
 	struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
@@ -123,7 +130,7 @@ static void orangefs_destroy_inode(struct inode *inode)
 			"%s: deallocated %p destroying inode %pU\n",
 			__func__, orangefs_inode, get_khandle_from_ino(inode));
 
-	kmem_cache_free(orangefs_inode_cache, orangefs_inode);
+	call_rcu(&inode->i_rcu, orangefs_i_callback);
 }
 
 /*
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 9ad48d9202a9..023bb0b03352 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -154,29 +154,38 @@ out_err:
 static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d,
 			    struct dentry **ret)
 {
-	const char *s = d->name.name;
+	/* Counting down from the end, since the prefix can change */
+	size_t rem = d->name.len - 1;
 	struct dentry *dentry = NULL;
 	int err;
 
-	if (*s != '/')
+	if (d->name.name[0] != '/')
 		return ovl_lookup_single(base, d, d->name.name, d->name.len,
 					 0, "", ret);
 
-	while (*s++ == '/' && !IS_ERR_OR_NULL(base) && d_can_lookup(base)) {
+	while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) {
+		const char *s = d->name.name + d->name.len - rem;
 		const char *next = strchrnul(s, '/');
-		size_t slen = strlen(s);
+		size_t thislen = next - s;
+		bool end = !next[0];
 
-		if (WARN_ON(slen > d->name.len) ||
-		    WARN_ON(strcmp(d->name.name + d->name.len - slen, s)))
+		/* Verify we did not go off the rails */
+		if (WARN_ON(s[-1] != '/'))
 			return -EIO;
 
-		err = ovl_lookup_single(base, d, s, next - s,
-					d->name.len - slen, next, &base);
+		err = ovl_lookup_single(base, d, s, thislen,
+					d->name.len - rem, next, &base);
 		dput(dentry);
 		if (err)
 			return err;
 		dentry = base;
-		s = next;
+		if (end)
+			break;
+
+		rem -= thislen + 1;
+
+		if (WARN_ON(rem >= d->name.len))
+			return -EIO;
 	}
 	*ret = dentry;
 	return 0;
diff --git a/fs/pnode.c b/fs/pnode.c
index 06a793f4ae38..5bc7896d122a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -322,6 +322,21 @@ out:
 	return ret;
 }
 
+static struct mount *find_topper(struct mount *mnt)
+{
+	/* If there is exactly one mount covering mnt completely return it. */
+	struct mount *child;
+
+	if (!list_is_singular(&mnt->mnt_mounts))
+		return NULL;
+
+	child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child);
+	if (child->mnt_mountpoint != mnt->mnt.mnt_root)
+		return NULL;
+
+	return child;
+}
+
 /*
  * return true if the refcount is greater than count
  */
@@ -342,9 +357,8 @@ static inline int do_refcount_check(struct mount *mnt, int count)
  */
 int propagate_mount_busy(struct mount *mnt, int refcnt)
 {
-	struct mount *m, *child;
+	struct mount *m, *child, *topper;
 	struct mount *parent = mnt->mnt_parent;
-	int ret = 0;
 
 	if (mnt == parent)
 		return do_refcount_check(mnt, refcnt);
@@ -359,12 +373,24 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
 
 	for (m = propagation_next(parent, parent); m;
 	     		m = propagation_next(m, parent)) {
-		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
-		if (child && list_empty(&child->mnt_mounts) &&
-		    (ret = do_refcount_check(child, 1)))
-			break;
+		int count = 1;
+		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
+		if (!child)
+			continue;
+
+		/* Is there exactly one mount on the child that covers
+		 * it completely whose reference should be ignored?
+		 */
+		topper = find_topper(child);
+		if (topper)
+			count += 1;
+		else if (!list_empty(&child->mnt_mounts))
+			continue;
+
+		if (do_refcount_check(child, count))
+			return 1;
 	}
-	return ret;
+	return 0;
 }
 
 /*
@@ -381,7 +407,7 @@ void propagate_mount_unlock(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-		child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+		child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint);
 		if (child)
 			child->mnt.mnt_flags &= ~MNT_LOCKED;
 	}
@@ -399,9 +425,11 @@ static void mark_umount_candidates(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-		struct mount *child = __lookup_mnt_last(&m->mnt,
+		struct mount *child = __lookup_mnt(&m->mnt,
 						mnt->mnt_mountpoint);
-		if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+		if (!child || (child->mnt.mnt_flags & MNT_UMOUNT))
+			continue;
+		if (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m)) {
 			SET_MNT_MARK(child);
 		}
 	}
@@ -420,8 +448,8 @@ static void __propagate_umount(struct mount *mnt)
 
 	for (m = propagation_next(parent, parent); m;
 			m = propagation_next(m, parent)) {
-
-		struct mount *child = __lookup_mnt_last(&m->mnt,
+		struct mount *topper;
+		struct mount *child = __lookup_mnt(&m->mnt,
 						mnt->mnt_mountpoint);
 		/*
 		 * umount the child only if the child has no children
@@ -430,6 +458,15 @@ static void __propagate_umount(struct mount *mnt)
 		if (!child || !IS_MNT_MARKED(child))
 			continue;
 		CLEAR_MNT_MARK(child);
+
+		/* If there is exactly one mount covering all of child
+		 * replace child with that mount.
+		 */
+		topper = find_topper(child);
+		if (topper)
+			mnt_change_mountpoint(child->mnt_parent, child->mnt_mp,
+					      topper);
+
 		if (list_empty(&child->mnt_mounts)) {
 			list_del_init(&child->mnt_child);
 			child->mnt.mnt_flags |= MNT_UMOUNT;
diff --git a/fs/pnode.h b/fs/pnode.h
index 550f5a8b4fcf..dc87e65becd2 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -49,6 +49,8 @@ int get_dominating_id(struct mount *mnt, const struct path *root);
 unsigned int mnt_get_count(struct mount *mnt);
 void mnt_set_mountpoint(struct mount *, struct mountpoint *,
 			struct mount *);
+void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp,
+			   struct mount *mnt);
 struct mount *copy_tree(struct mount *, struct dentry *, int);
 bool is_path_reachable(struct mount *, struct dentry *,
 			 const struct path *root);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 595522022aca..c9d48dc78495 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -922,11 +922,10 @@ int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	int error;
 
 	if (type == ACL_TYPE_ACCESS) {
-		error = posix_acl_equiv_mode(acl, &inode->i_mode);
-		if (error < 0)
-			return 0;
-		if (error == 0)
-			acl = NULL;
+		error = posix_acl_update_mode(inode,
+				&inode->i_mode, &acl);
+		if (error)
+			return error;
 	}
 
 	inode->i_ctime = current_time(inode);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 51a4213afa2e..fe12b519d09b 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -401,8 +401,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long long start_time;
 	unsigned long cmin_flt = 0, cmaj_flt = 0;
 	unsigned long  min_flt = 0,  maj_flt = 0;
-	cputime_t cutime, cstime, utime, stime;
-	cputime_t cgtime, gtime;
+	u64 cutime, cstime, utime, stime;
+	u64 cgtime, gtime;
 	unsigned long rsslim = 0;
 	char tcomm[sizeof(task->comm)];
 	unsigned long flags;
@@ -497,10 +497,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	seq_put_decimal_ull(m, " ", cmin_flt);
 	seq_put_decimal_ull(m, " ", maj_flt);
 	seq_put_decimal_ull(m, " ", cmaj_flt);
-	seq_put_decimal_ull(m, " ", cputime_to_clock_t(utime));
-	seq_put_decimal_ull(m, " ", cputime_to_clock_t(stime));
-	seq_put_decimal_ll(m, " ", cputime_to_clock_t(cutime));
-	seq_put_decimal_ll(m, " ", cputime_to_clock_t(cstime));
+	seq_put_decimal_ull(m, " ", nsec_to_clock_t(utime));
+	seq_put_decimal_ull(m, " ", nsec_to_clock_t(stime));
+	seq_put_decimal_ll(m, " ", nsec_to_clock_t(cutime));
+	seq_put_decimal_ll(m, " ", nsec_to_clock_t(cstime));
 	seq_put_decimal_ll(m, " ", priority);
 	seq_put_decimal_ll(m, " ", nice);
 	seq_put_decimal_ll(m, " ", num_threads);
@@ -542,8 +542,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	seq_put_decimal_ull(m, " ", task->rt_priority);
 	seq_put_decimal_ull(m, " ", task->policy);
 	seq_put_decimal_ull(m, " ", delayacct_blkio_ticks(task));
-	seq_put_decimal_ull(m, " ", cputime_to_clock_t(gtime));
-	seq_put_decimal_ll(m, " ", cputime_to_clock_t(cgtime));
+	seq_put_decimal_ull(m, " ", nsec_to_clock_t(gtime));
+	seq_put_decimal_ll(m, " ", nsec_to_clock_t(cgtime));
 
 	if (mm && permitted) {
 		seq_put_decimal_ull(m, " ", mm->start_data);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8e7e61b28f31..b73b4de8fb36 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1667,12 +1667,63 @@ const struct inode_operations proc_pid_link_inode_operations = {
 
 /* building an inode */
 
+void task_dump_owner(struct task_struct *task, mode_t mode,
+		     kuid_t *ruid, kgid_t *rgid)
+{
+	/* Depending on the state of dumpable compute who should own a
+	 * proc file for a task.
+	 */
+	const struct cred *cred;
+	kuid_t uid;
+	kgid_t gid;
+
+	/* Default to the tasks effective ownership */
+	rcu_read_lock();
+	cred = __task_cred(task);
+	uid = cred->euid;
+	gid = cred->egid;
+	rcu_read_unlock();
+
+	/*
+	 * Before the /proc/pid/status file was created the only way to read
+	 * the effective uid of a /process was to stat /proc/pid.  Reading
+	 * /proc/pid/status is slow enough that procps and other packages
+	 * kept stating /proc/pid.  To keep the rules in /proc simple I have
+	 * made this apply to all per process world readable and executable
+	 * directories.
+	 */
+	if (mode != (S_IFDIR|S_IRUGO|S_IXUGO)) {
+		struct mm_struct *mm;
+		task_lock(task);
+		mm = task->mm;
+		/* Make non-dumpable tasks owned by some root */
+		if (mm) {
+			if (get_dumpable(mm) != SUID_DUMP_USER) {
+				struct user_namespace *user_ns = mm->user_ns;
+
+				uid = make_kuid(user_ns, 0);
+				if (!uid_valid(uid))
+					uid = GLOBAL_ROOT_UID;
+
+				gid = make_kgid(user_ns, 0);
+				if (!gid_valid(gid))
+					gid = GLOBAL_ROOT_GID;
+			}
+		} else {
+			uid = GLOBAL_ROOT_UID;
+			gid = GLOBAL_ROOT_GID;
+		}
+		task_unlock(task);
+	}
+	*ruid = uid;
+	*rgid = gid;
+}
+
 struct inode *proc_pid_make_inode(struct super_block * sb,
 				  struct task_struct *task, umode_t mode)
 {
 	struct inode * inode;
 	struct proc_inode *ei;
-	const struct cred *cred;
 
 	/* We need a new inode */
 
@@ -1694,13 +1745,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
 	if (!ei->pid)
 		goto out_unlock;
 
-	if (task_dumpable(task)) {
-		rcu_read_lock();
-		cred = __task_cred(task);
-		inode->i_uid = cred->euid;
-		inode->i_gid = cred->egid;
-		rcu_read_unlock();
-	}
+	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 	security_task_to_inode(task, inode);
 
 out:
@@ -1715,7 +1760,6 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = d_inode(dentry);
 	struct task_struct *task;
-	const struct cred *cred;
 	struct pid_namespace *pid = dentry->d_sb->s_fs_info;
 
 	generic_fillattr(inode, stat);
@@ -1733,12 +1777,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 			 */
 			return -ENOENT;
 		}
-		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
-		    task_dumpable(task)) {
-			cred = __task_cred(task);
-			stat->uid = cred->euid;
-			stat->gid = cred->egid;
-		}
+		task_dump_owner(task, inode->i_mode, &stat->uid, &stat->gid);
 	}
 	rcu_read_unlock();
 	return 0;
@@ -1754,18 +1793,11 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
  * Rewrite the inode's ownerships here because the owning task may have
  * performed a setuid(), etc.
  *
- * Before the /proc/pid/status file was created the only way to read
- * the effective uid of a /process was to stat /proc/pid.  Reading
- * /proc/pid/status is slow enough that procps and other packages
- * kept stating /proc/pid.  To keep the rules in /proc simple I have
- * made this apply to all per process world readable and executable
- * directories.
  */
 int pid_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct inode *inode;
 	struct task_struct *task;
-	const struct cred *cred;
 
 	if (flags & LOOKUP_RCU)
 		return -ECHILD;
@@ -1774,17 +1806,8 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
 	task = get_proc_task(inode);
 
 	if (task) {
-		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
-		    task_dumpable(task)) {
-			rcu_read_lock();
-			cred = __task_cred(task);
-			inode->i_uid = cred->euid;
-			inode->i_gid = cred->egid;
-			rcu_read_unlock();
-		} else {
-			inode->i_uid = GLOBAL_ROOT_UID;
-			inode->i_gid = GLOBAL_ROOT_GID;
-		}
+		task_dump_owner(task, inode->i_mode, &inode->i_uid, &inode->i_gid);
+
 		inode->i_mode &= ~(S_ISUID | S_ISGID);
 		security_task_to_inode(task, inode);
 		put_task_struct(task);
@@ -1881,7 +1904,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
 	bool exact_vma_exists = false;
 	struct mm_struct *mm = NULL;
 	struct task_struct *task;
-	const struct cred *cred;
 	struct inode *inode;
 	int status = 0;
 
@@ -1906,16 +1928,8 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
 	mmput(mm);
 
 	if (exact_vma_exists) {
-		if (task_dumpable(task)) {
-			rcu_read_lock();
-			cred = __task_cred(task);
-			inode->i_uid = cred->euid;
-			inode->i_gid = cred->egid;
-			rcu_read_unlock();
-		} else {
-			inode->i_uid = GLOBAL_ROOT_UID;
-			inode->i_gid = GLOBAL_ROOT_GID;
-		}
+		task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
+
 		security_task_to_inode(task, inode);
 		status = 1;
 	}
@@ -2179,7 +2193,7 @@ static const struct file_operations proc_map_files_operations = {
 	.llseek		= generic_file_llseek,
 };
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
 struct timers_private {
 	struct pid *pid;
 	struct task_struct *task;
@@ -2488,6 +2502,12 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	length = -ESRCH;
 	if (!task)
 		goto out_no_task;
+
+	/* A task may only write its own attributes. */
+	length = -EACCES;
+	if (current != task)
+		goto out;
+
 	if (count > PAGE_SIZE)
 		count = PAGE_SIZE;
 
@@ -2503,14 +2523,13 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	}
 
 	/* Guard against adverse ptrace interaction */
-	length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
+	length = mutex_lock_interruptible(&current->signal->cred_guard_mutex);
 	if (length < 0)
 		goto out_free;
 
-	length = security_setprocattr(task,
-				      (char*)file->f_path.dentry->d_name.name,
+	length = security_setprocattr(file->f_path.dentry->d_name.name,
 				      page, count);
-	mutex_unlock(&task->signal->cred_guard_mutex);
+	mutex_unlock(&current->signal->cred_guard_mutex);
 out_free:
 	kfree(page);
 out:
@@ -2936,7 +2955,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
 	REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
 #endif
-#ifdef CONFIG_CHECKPOINT_RESTORE
+#if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS)
 	REG("timers",	  S_IRUGO, proc_timers_operations),
 #endif
 	REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations),
@@ -3179,6 +3198,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
 	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
 		char name[PROC_NUMBUF];
 		int len;
+
+		cond_resched();
 		if (!has_pid_permissions(ns, iter.task, 2))
 			continue;
 
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 4274f83bf100..00ce1531b2f5 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -84,7 +84,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct files_struct *files;
 	struct task_struct *task;
-	const struct cred *cred;
 	struct inode *inode;
 	unsigned int fd;
 
@@ -108,16 +107,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 				rcu_read_unlock();
 				put_files_struct(files);
 
-				if (task_dumpable(task)) {
-					rcu_read_lock();
-					cred = __task_cred(task);
-					inode->i_uid = cred->euid;
-					inode->i_gid = cred->egid;
-					rcu_read_unlock();
-				} else {
-					inode->i_uid = GLOBAL_ROOT_UID;
-					inode->i_gid = GLOBAL_ROOT_GID;
-				}
+				task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 
 				if (S_ISLNK(inode->i_mode)) {
 					unsigned i_mode = S_IFLNK;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 842a5ff5b85c..7ad9ed7958af 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
 	de = PDE(inode);
 	if (de)
 		pde_put(de);
+
 	head = PROC_I(inode)->sysctl;
 	if (head) {
 		RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
-		sysctl_head_put(head);
+		proc_sys_evict_inode(inode, head);
 	}
 }
 
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 2de5194ba378..5d6960f5f1c0 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -65,6 +65,7 @@ struct proc_inode {
 	struct proc_dir_entry *pde;
 	struct ctl_table_header *sysctl;
 	struct ctl_table *sysctl_entry;
+	struct list_head sysctl_inodes;
 	const struct proc_ns_operations *ns_ops;
 	struct inode vfs_inode;
 };
@@ -97,20 +98,8 @@ static inline struct task_struct *get_proc_task(struct inode *inode)
 	return get_pid_task(proc_pid(inode), PIDTYPE_PID);
 }
 
-static inline int task_dumpable(struct task_struct *task)
-{
-	int dumpable = 0;
-	struct mm_struct *mm;
-
-	task_lock(task);
-	mm = task->mm;
-	if (mm)
-		dumpable = get_dumpable(mm);
-	task_unlock(task);
-	if (dumpable == SUID_DUMP_USER)
-		return 1;
-	return 0;
-}
+void task_dump_owner(struct task_struct *task, mode_t mode,
+		     kuid_t *ruid, kgid_t *rgid);
 
 static inline unsigned name_to_int(const struct qstr *qstr)
 {
@@ -249,10 +238,12 @@ extern void proc_thread_self_init(void);
  */
 #ifdef CONFIG_PROC_SYSCTL
 extern int proc_sys_init(void);
-extern void sysctl_head_put(struct ctl_table_header *);
+extern void proc_sys_evict_inode(struct inode *inode,
+				 struct ctl_table_header *head);
 #else
 static inline void proc_sys_init(void) { }
-static inline void sysctl_head_put(struct ctl_table_header *head) { }
+static inline void proc_sys_evict_inode(struct  inode *inode,
+					struct ctl_table_header *head) { }
 #endif
 
 /*
diff --git a/fs/proc/page.c b/fs/proc/page.c
index a2066e6dee90..2726536489b1 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -173,7 +173,8 @@ u64 stable_page_flags(struct page *page)
 	u |= kpf_copy_bit(k, KPF_ACTIVE,	PG_active);
 	u |= kpf_copy_bit(k, KPF_RECLAIM,	PG_reclaim);
 
-	u |= kpf_copy_bit(k, KPF_SWAPCACHE,	PG_swapcache);
+	if (PageSwapCache(page))
+		u |= 1 << KPF_SWAPCACHE;
 	u |= kpf_copy_bit(k, KPF_SWAPBACKED,	PG_swapbacked);
 
 	u |= kpf_copy_bit(k, KPF_UNEVICTABLE,	PG_unevictable);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 55313d994895..3e64c6502dc8 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
 	head->set = set;
 	head->parent = NULL;
 	head->node = node;
+	INIT_LIST_HEAD(&head->inodes);
 	if (node) {
 		struct ctl_table *entry;
 		for (entry = table; entry->procname; entry++, node++)
@@ -259,6 +260,27 @@ static void unuse_table(struct ctl_table_header *p)
 			complete(p->unregistering);
 }
 
+/* called under sysctl_lock */
+static void proc_sys_prune_dcache(struct ctl_table_header *head)
+{
+	struct inode *inode, *prev = NULL;
+	struct proc_inode *ei;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ei, &head->inodes, sysctl_inodes) {
+		inode = igrab(&ei->vfs_inode);
+		if (inode) {
+			rcu_read_unlock();
+			iput(prev);
+			prev = inode;
+			d_prune_aliases(inode);
+			rcu_read_lock();
+		}
+	}
+	rcu_read_unlock();
+	iput(prev);
+}
+
 /* called under sysctl_lock, will reacquire if has to wait */
 static void start_unregistering(struct ctl_table_header *p)
 {
@@ -272,31 +294,22 @@ static void start_unregistering(struct ctl_table_header *p)
 		p->unregistering = &wait;
 		spin_unlock(&sysctl_lock);
 		wait_for_completion(&wait);
-		spin_lock(&sysctl_lock);
 	} else {
 		/* anything non-NULL; we'll never dereference it */
 		p->unregistering = ERR_PTR(-EINVAL);
+		spin_unlock(&sysctl_lock);
 	}
 	/*
+	 * Prune dentries for unregistered sysctls: namespaced sysctls
+	 * can have duplicate names and contaminate dcache very badly.
+	 */
+	proc_sys_prune_dcache(p);
+	/*
 	 * do not remove from the list until nobody holds it; walking the
 	 * list in do_sysctl() relies on that.
 	 */
-	erase_header(p);
-}
-
-static void sysctl_head_get(struct ctl_table_header *head)
-{
-	spin_lock(&sysctl_lock);
-	head->count++;
-	spin_unlock(&sysctl_lock);
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
-{
 	spin_lock(&sysctl_lock);
-	if (!--head->count)
-		kfree_rcu(head, rcu);
-	spin_unlock(&sysctl_lock);
+	erase_header(p);
 }
 
 static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
@@ -440,10 +453,20 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 
 	inode->i_ino = get_next_ino();
 
-	sysctl_head_get(head);
 	ei = PROC_I(inode);
+
+	spin_lock(&sysctl_lock);
+	if (unlikely(head->unregistering)) {
+		spin_unlock(&sysctl_lock);
+		iput(inode);
+		inode = NULL;
+		goto out;
+	}
 	ei->sysctl = head;
 	ei->sysctl_entry = table;
+	list_add_rcu(&ei->sysctl_inodes, &head->inodes);
+	head->count++;
+	spin_unlock(&sysctl_lock);
 
 	inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
 	inode->i_mode = table->mode;
@@ -466,6 +489,15 @@ out:
 	return inode;
 }
 
+void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
+{
+	spin_lock(&sysctl_lock);
+	list_del_rcu(&PROC_I(inode)->sysctl_inodes);
+	if (!--head->count)
+		kfree_rcu(head, rcu);
+	spin_unlock(&sysctl_lock);
+}
+
 static struct ctl_table_header *grab_header(struct inode *inode)
 {
 	struct ctl_table_header *head = PROC_I(inode)->sysctl;
@@ -709,7 +741,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
 	ctl_dir = container_of(head, struct ctl_dir, header);
 
 	if (!dir_emit_dots(file, ctx))
-		return 0;
+		goto out;
 
 	pos = 2;
 
@@ -719,6 +751,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx)
 			break;
 		}
 	}
+out:
 	sysctl_head_finish(head);
 	return 0;
 }
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index d700c42b3572..e47c3e8c4dfe 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -21,9 +21,9 @@
 
 #ifdef arch_idle_time
 
-static cputime64_t get_idle_time(int cpu)
+static u64 get_idle_time(int cpu)
 {
-	cputime64_t idle;
+	u64 idle;
 
 	idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
 	if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
@@ -31,9 +31,9 @@ static cputime64_t get_idle_time(int cpu)
 	return idle;
 }
 
-static cputime64_t get_iowait_time(int cpu)
+static u64 get_iowait_time(int cpu)
 {
-	cputime64_t iowait;
+	u64 iowait;
 
 	iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
 	if (cpu_online(cpu) && nr_iowait_cpu(cpu))
@@ -45,32 +45,32 @@ static cputime64_t get_iowait_time(int cpu)
 
 static u64 get_idle_time(int cpu)
 {
-	u64 idle, idle_time = -1ULL;
+	u64 idle, idle_usecs = -1ULL;
 
 	if (cpu_online(cpu))
-		idle_time = get_cpu_idle_time_us(cpu, NULL);
+		idle_usecs = get_cpu_idle_time_us(cpu, NULL);
 
-	if (idle_time == -1ULL)
+	if (idle_usecs == -1ULL)
 		/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
 		idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
 	else
-		idle = usecs_to_cputime64(idle_time);
+		idle = idle_usecs * NSEC_PER_USEC;
 
 	return idle;
 }
 
 static u64 get_iowait_time(int cpu)
 {
-	u64 iowait, iowait_time = -1ULL;
+	u64 iowait, iowait_usecs = -1ULL;
 
 	if (cpu_online(cpu))
-		iowait_time = get_cpu_iowait_time_us(cpu, NULL);
+		iowait_usecs = get_cpu_iowait_time_us(cpu, NULL);
 
-	if (iowait_time == -1ULL)
+	if (iowait_usecs == -1ULL)
 		/* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
 		iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
 	else
-		iowait = usecs_to_cputime64(iowait_time);
+		iowait = iowait_usecs * NSEC_PER_USEC;
 
 	return iowait;
 }
@@ -115,16 +115,16 @@ static int show_stat(struct seq_file *p, void *v)
 	}
 	sum += arch_irq_stat();
 
-	seq_put_decimal_ull(p, "cpu  ", cputime64_to_clock_t(user));
-	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice));
-	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system));
-	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle));
-	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait));
-	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq));
-	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq));
-	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal));
-	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest));
-	seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice));
+	seq_put_decimal_ull(p, "cpu  ", nsec_to_clock_t(user));
+	seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
+	seq_put_decimal_ull(p, " ", nsec_to_clock_t(system));
+	seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
+	seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait));
+	seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq));
+	seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq));
+	seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
+	seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
+	seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
 	seq_putc(p, '\n');
 
 	for_each_online_cpu(i) {
@@ -140,16 +140,16 @@ static int show_stat(struct seq_file *p, void *v)
 		guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
 		guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
 		seq_printf(p, "cpu%d", i);
-		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(user));
-		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(nice));
-		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(system));
-		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(idle));
-		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(iowait));
-		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(irq));
-		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(softirq));
-		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(steal));
-		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest));
-		seq_put_decimal_ull(p, " ", cputime64_to_clock_t(guest_nice));
+		seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
+		seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
+		seq_put_decimal_ull(p, " ", nsec_to_clock_t(system));
+		seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
+		seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait));
+		seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq));
+		seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq));
+		seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
+		seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
+		seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
 		seq_putc(p, '\n');
 	}
 	seq_put_decimal_ull(p, "intr ", (unsigned long long)sum);
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 33de567c25af..7981c4ffe787 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -5,23 +5,20 @@
 #include <linux/seq_file.h>
 #include <linux/time.h>
 #include <linux/kernel_stat.h>
-#include <linux/cputime.h>
 
 static int uptime_proc_show(struct seq_file *m, void *v)
 {
 	struct timespec uptime;
 	struct timespec idle;
-	u64 idletime;
 	u64 nsec;
 	u32 rem;
 	int i;
 
-	idletime = 0;
+	nsec = 0;
 	for_each_possible_cpu(i)
-		idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
+		nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
 
 	get_monotonic_boottime(&uptime);
-	nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
 	idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
 	idle.tv_nsec = rem;
 	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 27c059e1760a..11f918d34b1e 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -133,7 +133,8 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
 	struct persistent_ram_zone *prz;
 	int i = (*c)++;
 
-	if (i >= max)
+	/* Give up if we never existed or have hit the end. */
+	if (!przs || i >= max)
 		return NULL;
 
 	prz = przs[i];
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index a857338b7dab..bc927e30bdcc 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -467,8 +467,7 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
 }
 
 static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
-				    struct persistent_ram_ecc_info *ecc_info,
-				    unsigned long flags)
+				    struct persistent_ram_ecc_info *ecc_info)
 {
 	int ret;
 
@@ -494,10 +493,9 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
 			 prz->buffer->sig);
 	}
 
+	/* Rewind missing or invalid memory area. */
 	prz->buffer->sig = sig;
 	persistent_ram_zap(prz);
-	prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock);
-	prz->flags = flags;
 
 	return 0;
 }
@@ -533,11 +531,15 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
 		goto err;
 	}
 
+	/* Initialize general buffer state. */
+	prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock);
+	prz->flags = flags;
+
 	ret = persistent_ram_buffer_map(start, size, prz, memtype);
 	if (ret)
 		goto err;
 
-	ret = persistent_ram_post_init(prz, sig, ecc_info, flags);
+	ret = persistent_ram_post_init(prz, sig, ecc_info);
 	if (ret)
 		goto err;
 
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index d0f8a38dfafa..0186fe6d39f3 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -74,6 +74,7 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <linux/uaccess.h>
+#include <linux/major.h>
 #include "internal.h"
 
 static struct kmem_cache *romfs_inode_cachep;
@@ -416,7 +417,22 @@ static void romfs_destroy_inode(struct inode *inode)
 static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
-	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+	u64 id = 0;
+
+	/* When calling huge_encode_dev(),
+	 * use sb->s_bdev->bd_dev when,
+	 *   - CONFIG_ROMFS_ON_BLOCK defined
+	 * use sb->s_dev when,
+	 *   - CONFIG_ROMFS_ON_BLOCK undefined and
+	 *   - CONFIG_ROMFS_ON_MTD defined
+	 * leave id as 0 when,
+	 *   - CONFIG_ROMFS_ON_BLOCK undefined and
+	 *   - CONFIG_ROMFS_ON_MTD undefined
+	 */
+	if (sb->s_bdev)
+		id = huge_encode_dev(sb->s_bdev->bd_dev);
+	else if (sb->s_dev)
+		id = huge_encode_dev(sb->s_dev);
 
 	buf->f_type = ROMFS_MAGIC;
 	buf->f_namelen = ROMFS_MAXFN;
@@ -489,6 +505,11 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_flags |= MS_RDONLY | MS_NOATIME;
 	sb->s_op = &romfs_super_ops;
 
+#ifdef CONFIG_ROMFS_ON_MTD
+	/* Use same dev ID from the underlying mtdblock device */
+	if (sb->s_mtd)
+		sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, sb->s_mtd->index);
+#endif
 	/* read the image superblock and check it */
 	rsb = kmalloc(512, GFP_KERNEL);
 	if (!rsb)
diff --git a/fs/splice.c b/fs/splice.c
index 6518f058bd7f..eaafa3d8869a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -204,6 +204,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
 		buf->len = spd->partial[page_nr].len;
 		buf->private = spd->partial[page_nr].private;
 		buf->ops = spd->ops;
+		buf->flags = 0;
 
 		pipe->nrbufs++;
 		page_nr++;
diff --git a/fs/super.c b/fs/super.c
index 1709ed029a2c..b8b6a086c03b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -469,7 +469,7 @@ struct super_block *sget_userns(struct file_system_type *type,
 	struct super_block *old;
 	int err;
 
-	if (!(flags & MS_KERNMOUNT) &&
+	if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) &&
 	    !(type->fs_flags & FS_USERNS_MOUNT) &&
 	    !capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
@@ -499,7 +499,7 @@ retry:
 	}
 	if (!s) {
 		spin_unlock(&sb_lock);
-		s = alloc_super(type, flags, user_ns);
+		s = alloc_super(type, (flags & ~MS_SUBMOUNT), user_ns);
 		if (!s)
 			return ERR_PTR(-ENOMEM);
 		goto retry;
@@ -540,8 +540,15 @@ struct super_block *sget(struct file_system_type *type,
 {
 	struct user_namespace *user_ns = current_user_ns();
 
+	/* We don't yet pass the user namespace of the parent
+	 * mount through to here so always use &init_user_ns
+	 * until that changes.
+	 */
+	if (flags & MS_SUBMOUNT)
+		user_ns = &init_user_ns;
+
 	/* Ensure the requestor has permissions over the target filesystem */
-	if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+	if (!(flags & (MS_KERNMOUNT|MS_SUBMOUNT)) && !ns_capable(user_ns, CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
 	return sget_userns(type, test, set, flags, user_ns, data);
@@ -1047,7 +1054,7 @@ static int set_bdev_super(struct super_block *s, void *data)
 	 * We set the bdi here to the queue backing, file systems can
 	 * overwrite this in ->fill_super()
 	 */
-	s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info;
+	s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info;
 	return 0;
 }
 
diff --git a/fs/timerfd.c b/fs/timerfd.c
index c173cc196175..384fa759a563 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -40,6 +40,7 @@ struct timerfd_ctx {
 	short unsigned settime_flags;	/* to show in fdinfo */
 	struct rcu_head rcu;
 	struct list_head clist;
+	spinlock_t cancel_lock;
 	bool might_cancel;
 };
 
@@ -112,7 +113,7 @@ void timerfd_clock_was_set(void)
 	rcu_read_unlock();
 }
 
-static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
+static void __timerfd_remove_cancel(struct timerfd_ctx *ctx)
 {
 	if (ctx->might_cancel) {
 		ctx->might_cancel = false;
@@ -122,6 +123,13 @@ static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
 	}
 }
 
+static void timerfd_remove_cancel(struct timerfd_ctx *ctx)
+{
+	spin_lock(&ctx->cancel_lock);
+	__timerfd_remove_cancel(ctx);
+	spin_unlock(&ctx->cancel_lock);
+}
+
 static bool timerfd_canceled(struct timerfd_ctx *ctx)
 {
 	if (!ctx->might_cancel || ctx->moffs != KTIME_MAX)
@@ -132,6 +140,7 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx)
 
 static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
 {
+	spin_lock(&ctx->cancel_lock);
 	if ((ctx->clockid == CLOCK_REALTIME ||
 	     ctx->clockid == CLOCK_REALTIME_ALARM) &&
 	    (flags & TFD_TIMER_ABSTIME) && (flags & TFD_TIMER_CANCEL_ON_SET)) {
@@ -141,9 +150,10 @@ static void timerfd_setup_cancel(struct timerfd_ctx *ctx, int flags)
 			list_add_rcu(&ctx->clist, &cancel_list);
 			spin_unlock(&cancel_lock);
 		}
-	} else if (ctx->might_cancel) {
-		timerfd_remove_cancel(ctx);
+	} else {
+		__timerfd_remove_cancel(ctx);
 	}
+	spin_unlock(&ctx->cancel_lock);
 }
 
 static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx)
@@ -400,6 +410,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 		return -ENOMEM;
 
 	init_waitqueue_head(&ctx->wqh);
+	spin_lock_init(&ctx->cancel_lock);
 	ctx->clockid = clockid;
 
 	if (isalarm(ctx))
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 0a908ae7af13..b0d0623c83ed 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -53,7 +53,7 @@ config UBIFS_ATIME_SUPPORT
 
 config UBIFS_FS_ENCRYPTION
 	bool "UBIFS Encryption"
-	depends on UBIFS_FS
+	depends on UBIFS_FS && BLOCK
 	select FS_ENCRYPTION
 	default n
 	help
diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c
index 3402720f2b28..382ed428cfd2 100644
--- a/fs/ubifs/crypto.c
+++ b/fs/ubifs/crypto.c
@@ -26,15 +26,6 @@ static unsigned int ubifs_crypt_max_namelen(struct inode *inode)
 		return UBIFS_MAX_NLEN;
 }
 
-static int ubifs_key_prefix(struct inode *inode, u8 **key)
-{
-	static char prefix[] = "ubifs:";
-
-	*key = prefix;
-
-	return sizeof(prefix) - 1;
-}
-
 int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn,
 		  unsigned int in_len, unsigned int *out_len, int block)
 {
@@ -86,12 +77,12 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
 	return 0;
 }
 
-struct fscrypt_operations ubifs_crypt_operations = {
+const struct fscrypt_operations ubifs_crypt_operations = {
 	.flags			= FS_CFLG_OWN_PAGES,
+	.key_prefix		= "ubifs:",
 	.get_context		= ubifs_crypt_get_context,
 	.set_context		= ubifs_crypt_set_context,
 	.is_encrypted		= __ubifs_crypt_is_encrypted,
 	.empty_dir		= ubifs_crypt_empty_dir,
 	.max_namelen		= ubifs_crypt_max_namelen,
-	.key_prefix		= ubifs_key_prefix,
 };
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 1c5331ac9614..528369f3e472 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -390,16 +390,6 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry,
 	dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
 		dentry, mode, dir->i_ino);
 
-	if (ubifs_crypt_is_encrypted(dir)) {
-		err = fscrypt_get_encryption_info(dir);
-		if (err)
-			return err;
-
-		if (!fscrypt_has_encryption_key(dir)) {
-			return -EPERM;
-		}
-	}
-
 	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
 	if (err)
 		return err;
@@ -741,17 +731,9 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	ubifs_assert(inode_is_locked(dir));
 	ubifs_assert(inode_is_locked(inode));
 
-	if (ubifs_crypt_is_encrypted(dir)) {
-		if (!fscrypt_has_permitted_context(dir, inode))
-			return -EPERM;
-
-		err = fscrypt_get_encryption_info(inode);
-		if (err)
-			return err;
-
-		if (!fscrypt_has_encryption_key(inode))
-			return -EPERM;
-	}
+	if (ubifs_crypt_is_encrypted(dir) &&
+	    !fscrypt_has_permitted_context(dir, inode))
+		return -EPERM;
 
 	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
 	if (err)
@@ -1000,17 +982,6 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	if (err)
 		return err;
 
-	if (ubifs_crypt_is_encrypted(dir)) {
-		err = fscrypt_get_encryption_info(dir);
-		if (err)
-			goto out_budg;
-
-		if (!fscrypt_has_encryption_key(dir)) {
-			err = -EPERM;
-			goto out_budg;
-		}
-	}
-
 	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
 	if (err)
 		goto out_budg;
@@ -1096,17 +1067,6 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
 		return err;
 	}
 
-	if (ubifs_crypt_is_encrypted(dir)) {
-		err = fscrypt_get_encryption_info(dir);
-		if (err)
-			goto out_budg;
-
-		if (!fscrypt_has_encryption_key(dir)) {
-			err = -EPERM;
-			goto out_budg;
-		}
-	}
-
 	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm);
 	if (err)
 		goto out_budg;
@@ -1231,18 +1191,6 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
 			goto out_inode;
 		}
 
-		err = fscrypt_get_encryption_info(inode);
-		if (err) {
-			kfree(sd);
-			goto out_inode;
-		}
-
-		if (!fscrypt_has_encryption_key(inode)) {
-			kfree(sd);
-			err = -EPERM;
-			goto out_inode;
-		}
-
 		ostr.name = sd->encrypted_path;
 		ostr.len = disk_link.len;
 
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 78d713644df3..da519ba205f6 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -217,6 +217,9 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case FS_IOC32_SETFLAGS:
 		cmd = FS_IOC_SETFLAGS;
 		break;
+	case FS_IOC_SET_ENCRYPTION_POLICY:
+	case FS_IOC_GET_ENCRYPTION_POLICY:
+		break;
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a459211a1c21..294519b98874 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -744,6 +744,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
 
 	} else {
 		data->compr_size = 0;
+		out_len = compr_len;
 	}
 
 	dlen = UBIFS_DATA_NODE_SZ + out_len;
@@ -1319,6 +1320,7 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in
 	dn->compr_type = cpu_to_le16(compr_type);
 	dn->size = cpu_to_le32(*new_len);
 	*new_len = UBIFS_DATA_NODE_SZ + out_len;
+	err = 0;
 out:
 	kfree(buf);
 	return err;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index e08aa04fc835..b73811bd7676 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2000,7 +2000,7 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
 }
 
 #ifndef CONFIG_UBIFS_FS_ENCRYPTION
-struct fscrypt_operations ubifs_crypt_operations = {
+const struct fscrypt_operations ubifs_crypt_operations = {
 	.is_encrypted		= __ubifs_crypt_is_encrypted,
 };
 #endif
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 74ae2de949df..709aa098dd46 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -34,6 +34,11 @@
 #include <linux/slab.h>
 #include "ubifs.h"
 
+static int try_read_node(const struct ubifs_info *c, void *buf, int type,
+			 int len, int lnum, int offs);
+static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key,
+			      struct ubifs_zbranch *zbr, void *node);
+
 /*
  * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions.
  * @NAME_LESS: name corresponding to the first argument is less than second
@@ -402,7 +407,19 @@ static int tnc_read_hashed_node(struct ubifs_info *c, struct ubifs_zbranch *zbr,
 		return 0;
 	}
 
-	err = ubifs_tnc_read_node(c, zbr, node);
+	if (c->replaying) {
+		err = fallible_read_node(c, &zbr->key, zbr, node);
+		/*
+		 * When the node was not found, return -ENOENT, 0 otherwise.
+		 * Negative return codes stay as-is.
+		 */
+		if (err == 0)
+			err = -ENOENT;
+		else if (err == 1)
+			err = 0;
+	} else {
+		err = ubifs_tnc_read_node(c, zbr, node);
+	}
 	if (err)
 		return err;
 
@@ -2857,7 +2874,11 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c,
 	if (fname_len(nm) > 0) {
 		if (err) {
 			/* Handle collisions */
-			err = resolve_collision(c, key, &znode, &n, nm);
+			if (c->replaying)
+				err = fallible_resolve_collision(c, key, &znode, &n,
+							 nm, 0);
+			else
+				err = resolve_collision(c, key, &znode, &n, nm);
 			dbg_tnc("rc returned %d, znode %p, n %d",
 				err, znode, n);
 			if (unlikely(err < 0))
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index ca72382ce6cc..f0c86f076535 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -38,7 +38,11 @@
 #include <linux/backing-dev.h>
 #include <linux/security.h>
 #include <linux/xattr.h>
-#include <linux/fscrypto.h>
+#ifdef CONFIG_UBIFS_FS_ENCRYPTION
+#include <linux/fscrypt_supp.h>
+#else
+#include <linux/fscrypt_notsupp.h>
+#endif
 #include <linux/random.h>
 #include "ubifs-media.h"
 
@@ -1797,28 +1801,6 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len,
 #include "key.h"
 
 #ifndef CONFIG_UBIFS_FS_ENCRYPTION
-#define fscrypt_set_d_op(i)
-#define fscrypt_get_ctx                 fscrypt_notsupp_get_ctx
-#define fscrypt_release_ctx             fscrypt_notsupp_release_ctx
-#define fscrypt_encrypt_page            fscrypt_notsupp_encrypt_page
-#define fscrypt_decrypt_page            fscrypt_notsupp_decrypt_page
-#define fscrypt_decrypt_bio_pages       fscrypt_notsupp_decrypt_bio_pages
-#define fscrypt_pullback_bio_page       fscrypt_notsupp_pullback_bio_page
-#define fscrypt_restore_control_page    fscrypt_notsupp_restore_control_page
-#define fscrypt_zeroout_range           fscrypt_notsupp_zeroout_range
-#define fscrypt_ioctl_set_policy	fscrypt_notsupp_ioctl_set_policy
-#define fscrypt_ioctl_get_policy	fscrypt_notsupp_ioctl_get_policy
-#define fscrypt_has_permitted_context   fscrypt_notsupp_has_permitted_context
-#define fscrypt_inherit_context         fscrypt_notsupp_inherit_context
-#define fscrypt_get_encryption_info     fscrypt_notsupp_get_encryption_info
-#define fscrypt_put_encryption_info     fscrypt_notsupp_put_encryption_info
-#define fscrypt_setup_filename          fscrypt_notsupp_setup_filename
-#define fscrypt_free_filename           fscrypt_notsupp_free_filename
-#define fscrypt_fname_encrypted_size    fscrypt_notsupp_fname_encrypted_size
-#define fscrypt_fname_alloc_buffer      fscrypt_notsupp_fname_alloc_buffer
-#define fscrypt_fname_free_buffer       fscrypt_notsupp_fname_free_buffer
-#define fscrypt_fname_disk_to_usr       fscrypt_notsupp_fname_disk_to_usr
-#define fscrypt_fname_usr_to_disk       fscrypt_notsupp_fname_usr_to_disk
 static inline int ubifs_encrypt(const struct inode *inode,
 				struct ubifs_data_node *dn,
 				unsigned int in_len, unsigned int *out_len,
@@ -1842,7 +1824,7 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
 		  unsigned int *out_len, int block);
 #endif
 
-extern struct fscrypt_operations ubifs_crypt_operations;
+extern const struct fscrypt_operations ubifs_crypt_operations;
 
 static inline bool __ubifs_crypt_is_encrypted(struct inode *inode)
 {
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index 4792b771aa80..9f24bd1a9f44 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -41,7 +41,7 @@
 struct charspec {
 	uint8_t		charSetType;
 	uint8_t		charSetInfo[63];
-} __attribute__ ((packed));
+} __packed;
 
 /* Character Set Type (ECMA 167r3 1/7.2.1.1) */
 #define CHARSPEC_TYPE_CS0		0x00	/* (1/7.2.2) */
@@ -68,7 +68,7 @@ struct timestamp {
 	uint8_t		centiseconds;
 	uint8_t		hundredsOfMicroseconds;
 	uint8_t		microseconds;
-} __attribute__ ((packed));
+} __packed;
 
 /* Type and Time Zone (ECMA 167r3 1/7.3.1) */
 #define TIMESTAMP_TYPE_MASK		0xF000
@@ -82,7 +82,7 @@ struct regid {
 	uint8_t		flags;
 	uint8_t		ident[23];
 	uint8_t		identSuffix[8];
-} __attribute__ ((packed));
+} __packed;
 
 /* Flags (ECMA 167r3 1/7.4.1) */
 #define ENTITYID_FLAGS_DIRTY		0x00
@@ -95,7 +95,7 @@ struct volStructDesc {
 	uint8_t		stdIdent[VSD_STD_ID_LEN];
 	uint8_t		structVersion;
 	uint8_t		structData[2041];
-} __attribute__ ((packed));
+} __packed;
 
 /* Standard Identifier (EMCA 167r2 2/9.1.2) */
 #define VSD_STD_ID_NSR02		"NSR02"	/* (3/9.1) */
@@ -114,7 +114,7 @@ struct beginningExtendedAreaDesc {
 	uint8_t		stdIdent[VSD_STD_ID_LEN];
 	uint8_t		structVersion;
 	uint8_t		structData[2041];
-} __attribute__ ((packed));
+} __packed;
 
 /* Terminating Extended Area Descriptor (ECMA 167r3 2/9.3) */
 struct terminatingExtendedAreaDesc {
@@ -122,7 +122,7 @@ struct terminatingExtendedAreaDesc {
 	uint8_t		stdIdent[VSD_STD_ID_LEN];
 	uint8_t		structVersion;
 	uint8_t		structData[2041];
-} __attribute__ ((packed));
+} __packed;
 
 /* Boot Descriptor (ECMA 167r3 2/9.4) */
 struct bootDesc {
@@ -140,7 +140,7 @@ struct bootDesc {
 	__le16			flags;
 	uint8_t			reserved2[32];
 	uint8_t			bootUse[1906];
-} __attribute__ ((packed));
+} __packed;
 
 /* Flags (ECMA 167r3 2/9.4.12) */
 #define BOOT_FLAGS_ERASE		0x01
@@ -149,7 +149,7 @@ struct bootDesc {
 struct extent_ad {
 	__le32		extLength;
 	__le32		extLocation;
-} __attribute__ ((packed));
+} __packed;
 
 struct kernel_extent_ad {
 	uint32_t	extLength;
@@ -166,7 +166,7 @@ struct tag {
 	__le16		descCRC;
 	__le16		descCRCLength;
 	__le32		tagLocation;
-} __attribute__ ((packed));
+} __packed;
 
 /* Tag Identifier (ECMA 167r3 3/7.2.1) */
 #define TAG_IDENT_PVD			0x0001
@@ -186,7 +186,7 @@ struct NSRDesc {
 	uint8_t		structVersion;
 	uint8_t		reserved;
 	uint8_t		structData[2040];
-} __attribute__ ((packed));
+} __packed;
 
 /* Primary Volume Descriptor (ECMA 167r3 3/10.1) */
 struct primaryVolDesc {
@@ -212,7 +212,7 @@ struct primaryVolDesc {
 	__le32			predecessorVolDescSeqLocation;
 	__le16			flags;
 	uint8_t			reserved[22];
-} __attribute__ ((packed));
+} __packed;
 
 /* Flags (ECMA 167r3 3/10.1.21) */
 #define PVD_FLAGS_VSID_COMMON		0x0001
@@ -223,7 +223,7 @@ struct anchorVolDescPtr {
 	struct extent_ad	mainVolDescSeqExt;
 	struct extent_ad	reserveVolDescSeqExt;
 	uint8_t	 		reserved[480];
-} __attribute__ ((packed));
+} __packed;
 
 /* Volume Descriptor Pointer (ECMA 167r3 3/10.3) */
 struct volDescPtr {
@@ -231,7 +231,7 @@ struct volDescPtr {
 	__le32			volDescSeqNum;
 	struct extent_ad	nextVolDescSeqExt;
 	uint8_t			reserved[484];
-} __attribute__ ((packed));
+} __packed;
 
 /* Implementation Use Volume Descriptor (ECMA 167r3 3/10.4) */
 struct impUseVolDesc {
@@ -239,7 +239,7 @@ struct impUseVolDesc {
 	__le32		volDescSeqNum;
 	struct regid	impIdent;
 	uint8_t		impUse[460];
-} __attribute__ ((packed));
+} __packed;
 
 /* Partition Descriptor (ECMA 167r3 3/10.5) */
 struct partitionDesc {
@@ -255,7 +255,7 @@ struct partitionDesc {
 	struct regid impIdent;
 	uint8_t impUse[128];
 	uint8_t reserved[156];
-} __attribute__ ((packed));
+} __packed;
 
 /* Partition Flags (ECMA 167r3 3/10.5.3) */
 #define PD_PARTITION_FLAGS_ALLOC	0x0001
@@ -291,14 +291,14 @@ struct logicalVolDesc {
 	uint8_t			impUse[128];
 	struct extent_ad	integritySeqExt;
 	uint8_t			partitionMaps[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Generic Partition Map (ECMA 167r3 3/10.7.1) */
 struct genericPartitionMap {
 	uint8_t		partitionMapType;
 	uint8_t		partitionMapLength;
 	uint8_t		partitionMapping[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Partition Map Type (ECMA 167r3 3/10.7.1.1) */
 #define GP_PARTITION_MAP_TYPE_UNDEF	0x00
@@ -311,14 +311,14 @@ struct genericPartitionMap1 {
 	uint8_t		partitionMapLength;
 	__le16		volSeqNum;
 	__le16		partitionNum;
-} __attribute__ ((packed));
+} __packed;
 
 /* Type 2 Partition Map (ECMA 167r3 3/10.7.3) */
 struct genericPartitionMap2 {
 	uint8_t		partitionMapType;
 	uint8_t		partitionMapLength;
 	uint8_t		partitionIdent[62];
-} __attribute__ ((packed));
+} __packed;
 
 /* Unallocated Space Descriptor (ECMA 167r3 3/10.8) */
 struct unallocSpaceDesc {
@@ -326,13 +326,13 @@ struct unallocSpaceDesc {
 	__le32			volDescSeqNum;
 	__le32			numAllocDescs;
 	struct extent_ad	allocDescs[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Terminating Descriptor (ECMA 167r3 3/10.9) */
 struct terminatingDesc {
 	struct tag	descTag;
 	uint8_t		reserved[496];
-} __attribute__ ((packed));
+} __packed;
 
 /* Logical Volume Integrity Descriptor (ECMA 167r3 3/10.10) */
 struct logicalVolIntegrityDesc {
@@ -346,7 +346,7 @@ struct logicalVolIntegrityDesc {
 	__le32			freeSpaceTable[0];
 	__le32			sizeTable[0];
 	uint8_t			impUse[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Integrity Type (ECMA 167r3 3/10.10.3) */
 #define LVID_INTEGRITY_TYPE_OPEN	0x00000000
@@ -356,7 +356,7 @@ struct logicalVolIntegrityDesc {
 struct lb_addr {
 	__le32		logicalBlockNum;
 	__le16	 	partitionReferenceNum;
-} __attribute__ ((packed));
+} __packed;
 
 /* ... and its in-core analog */
 struct kernel_lb_addr {
@@ -368,14 +368,14 @@ struct kernel_lb_addr {
 struct short_ad {
         __le32		extLength;
         __le32		extPosition;
-} __attribute__ ((packed));
+} __packed;
 
 /* Long Allocation Descriptor (ECMA 167r3 4/14.14.2) */
 struct long_ad {
 	__le32		extLength;
 	struct lb_addr	extLocation;
 	uint8_t		impUse[6];
-} __attribute__ ((packed));
+} __packed;
 
 struct kernel_long_ad {
 	uint32_t		extLength;
@@ -389,7 +389,7 @@ struct ext_ad {
 	__le32		recordedLength;
 	__le32		informationLength;
 	struct lb_addr	extLocation;
-} __attribute__ ((packed));
+} __packed;
 
 struct kernel_ext_ad {
 	uint32_t		extLength;
@@ -434,7 +434,7 @@ struct fileSetDesc {
 	struct long_ad		nextExt;
 	struct long_ad		streamDirectoryICB;
 	uint8_t			reserved[32];
-} __attribute__ ((packed));
+} __packed;
 
 /* Partition Header Descriptor (ECMA 167r3 4/14.3) */
 struct partitionHeaderDesc {
@@ -444,7 +444,7 @@ struct partitionHeaderDesc {
 	struct short_ad	freedSpaceTable;
 	struct short_ad	freedSpaceBitmap;
 	uint8_t		reserved[88];
-} __attribute__ ((packed));
+} __packed;
 
 /* File Identifier Descriptor (ECMA 167r3 4/14.4) */
 struct fileIdentDesc {
@@ -457,7 +457,7 @@ struct fileIdentDesc {
 	uint8_t		impUse[0];
 	uint8_t		fileIdent[0];
 	uint8_t		padding[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* File Characteristics (ECMA 167r3 4/14.4.3) */
 #define FID_FILE_CHAR_HIDDEN		0x01
@@ -471,7 +471,7 @@ struct allocExtDesc {
 	struct tag	descTag;
 	__le32		previousAllocExtLocation;
 	__le32		lengthAllocDescs;
-} __attribute__ ((packed));
+} __packed;
 
 /* ICB Tag (ECMA 167r3 4/14.6) */
 struct icbtag {
@@ -483,7 +483,7 @@ struct icbtag {
 	uint8_t		fileType;
 	struct lb_addr	parentICBLocation;
 	__le16		flags;
-} __attribute__ ((packed));
+} __packed;
 
 /* Strategy Type (ECMA 167r3 4/14.6.2) */
 #define ICBTAG_STRATEGY_TYPE_UNDEF	0x0000
@@ -531,13 +531,13 @@ struct indirectEntry {
 	struct tag	descTag;
 	struct icbtag	icbTag;
 	struct long_ad	indirectICB;
-} __attribute__ ((packed));
+} __packed;
 
 /* Terminal Entry (ECMA 167r3 4/14.8) */
 struct terminalEntry {
 	struct tag	descTag;
 	struct icbtag	icbTag;
-} __attribute__ ((packed));
+} __packed;
 
 /* File Entry (ECMA 167r3 4/14.9) */
 struct fileEntry {
@@ -563,7 +563,7 @@ struct fileEntry {
 	__le32			lengthAllocDescs;
 	uint8_t			extendedAttr[0];
 	uint8_t			allocDescs[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Permissions (ECMA 167r3 4/14.9.5) */
 #define FE_PERM_O_EXEC			0x00000001U
@@ -607,7 +607,7 @@ struct extendedAttrHeaderDesc {
 	struct tag	descTag;
 	__le32		impAttrLocation;
 	__le32		appAttrLocation;
-} __attribute__ ((packed));
+} __packed;
 
 /* Generic Format (ECMA 167r3 4/14.10.2) */
 struct genericFormat {
@@ -616,7 +616,7 @@ struct genericFormat {
 	uint8_t		reserved[3];
 	__le32		attrLength;
 	uint8_t		attrData[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Character Set Information (ECMA 167r3 4/14.10.3) */
 struct charSetInfo {
@@ -627,7 +627,7 @@ struct charSetInfo {
 	__le32		escapeSeqLength;
 	uint8_t		charSetType;
 	uint8_t		escapeSeq[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Alternate Permissions (ECMA 167r3 4/14.10.4) */
 struct altPerms {
@@ -638,7 +638,7 @@ struct altPerms {
 	__le16		ownerIdent;
 	__le16		groupIdent;
 	__le16		permission;
-} __attribute__ ((packed));
+} __packed;
 
 /* File Times Extended Attribute (ECMA 167r3 4/14.10.5) */
 struct fileTimesExtAttr {
@@ -649,7 +649,7 @@ struct fileTimesExtAttr {
 	__le32		dataLength;
 	__le32		fileTimeExistence;
 	uint8_t		fileTimes;
-} __attribute__ ((packed));
+} __packed;
 
 /* FileTimeExistence (ECMA 167r3 4/14.10.5.6) */
 #define FTE_CREATION			0x00000001
@@ -666,7 +666,7 @@ struct infoTimesExtAttr {
 	__le32		dataLength;
 	__le32		infoTimeExistence;
 	uint8_t		infoTimes[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Device Specification (ECMA 167r3 4/14.10.7) */
 struct deviceSpec {
@@ -678,7 +678,7 @@ struct deviceSpec {
 	__le32		majorDeviceIdent;
 	__le32		minorDeviceIdent;
 	uint8_t		impUse[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Implementation Use Extended Attr (ECMA 167r3 4/14.10.8) */
 struct impUseExtAttr {
@@ -689,7 +689,7 @@ struct impUseExtAttr {
 	__le32		impUseLength;
 	struct regid	impIdent;
 	uint8_t		impUse[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Application Use Extended Attribute (ECMA 167r3 4/14.10.9) */
 struct appUseExtAttr {
@@ -700,7 +700,7 @@ struct appUseExtAttr {
 	__le32		appUseLength;
 	struct regid	appIdent;
 	uint8_t		appUse[0];
-} __attribute__ ((packed));
+} __packed;
 
 #define EXTATTR_CHAR_SET		1
 #define EXTATTR_ALT_PERMS		3
@@ -716,7 +716,7 @@ struct unallocSpaceEntry {
 	struct icbtag	icbTag;
 	__le32		lengthAllocDescs;
 	uint8_t		allocDescs[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */
 struct spaceBitmapDesc {
@@ -724,7 +724,7 @@ struct spaceBitmapDesc {
 	__le32		numOfBits;
 	__le32		numOfBytes;
 	uint8_t		bitmap[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Partition Integrity Entry (ECMA 167r3 4/14.13) */
 struct partitionIntegrityEntry {
@@ -735,7 +735,7 @@ struct partitionIntegrityEntry {
 	uint8_t			reserved[175];
 	struct regid		impIdent;
 	uint8_t			impUse[256];
-} __attribute__ ((packed));
+} __packed;
 
 /* Short Allocation Descriptor (ECMA 167r3 4/14.14.1) */
 
@@ -753,7 +753,7 @@ struct partitionIntegrityEntry {
 struct logicalVolHeaderDesc {
 	__le64		uniqueID;
 	uint8_t		reserved[24];
-} __attribute__ ((packed));
+} __packed;
 
 /* Path Component (ECMA 167r3 4/14.16.1) */
 struct pathComponent {
@@ -761,7 +761,7 @@ struct pathComponent {
 	uint8_t		lengthComponentIdent;
 	__le16		componentFileVersionNum;
 	dstring		componentIdent[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* File Entry (ECMA 167r3 4/14.17) */
 struct extendedFileEntry {
@@ -791,6 +791,6 @@ struct extendedFileEntry {
 	__le32			lengthAllocDescs;
 	uint8_t			extendedAttr[0];
 	uint8_t			allocDescs[0];
-} __attribute__ ((packed));
+} __packed;
 
 #endif /* _ECMA_167_H */
diff --git a/fs/udf/file.c b/fs/udf/file.c
index dbcb3a4a0cb9..e04cc0cdca9d 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -176,54 +176,46 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
 	long old_block, new_block;
-	int result = -EINVAL;
+	int result;
 
 	if (inode_permission(inode, MAY_READ) != 0) {
 		udf_debug("no permission to access inode %lu\n", inode->i_ino);
-		result = -EPERM;
-		goto out;
+		return -EPERM;
 	}
 
-	if (!arg) {
+	if (!arg && ((cmd == UDF_GETVOLIDENT) || (cmd == UDF_GETEASIZE) ||
+		     (cmd == UDF_RELOCATE_BLOCKS) || (cmd == UDF_GETEABLOCK))) {
 		udf_debug("invalid argument to udf_ioctl\n");
-		result = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
 	switch (cmd) {
 	case UDF_GETVOLIDENT:
 		if (copy_to_user((char __user *)arg,
 				 UDF_SB(inode->i_sb)->s_volume_ident, 32))
-			result = -EFAULT;
-		else
-			result = 0;
-		goto out;
+			return -EFAULT;
+		return 0;
 	case UDF_RELOCATE_BLOCKS:
-		if (!capable(CAP_SYS_ADMIN)) {
-			result = -EPERM;
-			goto out;
-		}
-		if (get_user(old_block, (long __user *)arg)) {
-			result = -EFAULT;
-			goto out;
-		}
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		if (get_user(old_block, (long __user *)arg))
+			return -EFAULT;
 		result = udf_relocate_blocks(inode->i_sb,
 						old_block, &new_block);
 		if (result == 0)
 			result = put_user(new_block, (long __user *)arg);
-		goto out;
+		return result;
 	case UDF_GETEASIZE:
-		result = put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg);
-		goto out;
+		return put_user(UDF_I(inode)->i_lenEAttr, (int __user *)arg);
 	case UDF_GETEABLOCK:
-		result = copy_to_user((char __user *)arg,
-				      UDF_I(inode)->i_ext.i_data,
-				      UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0;
-		goto out;
+		return copy_to_user((char __user *)arg,
+				    UDF_I(inode)->i_ext.i_data,
+				    UDF_I(inode)->i_lenEAttr) ? -EFAULT : 0;
+	default:
+		return -ENOIOCTLCMD;
 	}
 
-out:
-	return result;
+	return 0;
 }
 
 static int udf_release_file(struct inode *inode, struct file *filp)
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 0f3db71753aa..8ec6b3df0bc7 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -43,10 +43,6 @@
 #include "udf_i.h"
 #include "udf_sb.h"
 
-MODULE_AUTHOR("Ben Fennema");
-MODULE_DESCRIPTION("Universal Disk Format Filesystem");
-MODULE_LICENSE("GPL");
-
 #define EXTENT_MERGE_SIZE 5
 
 static umode_t udf_convert_permissions(struct fileEntry *);
@@ -57,14 +53,12 @@ static sector_t inode_getblk(struct inode *, sector_t, int *, int *);
 static int8_t udf_insert_aext(struct inode *, struct extent_position,
 			      struct kernel_lb_addr, uint32_t);
 static void udf_split_extents(struct inode *, int *, int, int,
-			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
+			      struct kernel_long_ad *, int *);
 static void udf_prealloc_extents(struct inode *, int, int,
-				 struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
-static void udf_merge_extents(struct inode *,
-			      struct kernel_long_ad[EXTENT_MERGE_SIZE], int *);
-static void udf_update_extents(struct inode *,
-			       struct kernel_long_ad[EXTENT_MERGE_SIZE], int, int,
-			       struct extent_position *);
+				 struct kernel_long_ad *, int *);
+static void udf_merge_extents(struct inode *, struct kernel_long_ad *, int *);
+static void udf_update_extents(struct inode *, struct kernel_long_ad *, int,
+			       int, struct extent_position *);
 static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 
 static void __udf_clear_extent_cache(struct inode *inode)
@@ -111,7 +105,7 @@ static int udf_read_extent_cache(struct inode *inode, loff_t bcount,
 
 /* Add extent to extent cache */
 static void udf_update_extent_cache(struct inode *inode, loff_t estart,
-				    struct extent_position *pos, int next_epos)
+				    struct extent_position *pos)
 {
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
@@ -120,19 +114,16 @@ static void udf_update_extent_cache(struct inode *inode, loff_t estart,
 	__udf_clear_extent_cache(inode);
 	if (pos->bh)
 		get_bh(pos->bh);
-	memcpy(&iinfo->cached_extent.epos, pos,
-	       sizeof(struct extent_position));
+	memcpy(&iinfo->cached_extent.epos, pos, sizeof(struct extent_position));
 	iinfo->cached_extent.lstart = estart;
-	if (next_epos)
-		switch (iinfo->i_alloc_type) {
-		case ICBTAG_FLAG_AD_SHORT:
-			iinfo->cached_extent.epos.offset -=
-			sizeof(struct short_ad);
-			break;
-		case ICBTAG_FLAG_AD_LONG:
-			iinfo->cached_extent.epos.offset -=
-			sizeof(struct long_ad);
-		}
+	switch (iinfo->i_alloc_type) {
+	case ICBTAG_FLAG_AD_SHORT:
+		iinfo->cached_extent.epos.offset -= sizeof(struct short_ad);
+		break;
+	case ICBTAG_FLAG_AD_LONG:
+		iinfo->cached_extent.epos.offset -= sizeof(struct long_ad);
+		break;
+	}
 	spin_unlock(&iinfo->i_extent_cache_lock);
 }
 
@@ -747,11 +738,8 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
 				 ~(inode->i_sb->s_blocksize - 1));
 			udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
 		}
-		brelse(prev_epos.bh);
-		brelse(cur_epos.bh);
-		brelse(next_epos.bh);
 		newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset);
-		return newblock;
+		goto out_free;
 	}
 
 	/* Are we beyond EOF? */
@@ -774,11 +762,9 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
 		/* Create extents for the hole between EOF and offset */
 		ret = udf_do_extend_file(inode, &prev_epos, laarr, offset);
 		if (ret < 0) {
-			brelse(prev_epos.bh);
-			brelse(cur_epos.bh);
-			brelse(next_epos.bh);
 			*err = ret;
-			return 0;
+			newblock = 0;
+			goto out_free;
 		}
 		c = 0;
 		offset = 0;
@@ -841,11 +827,9 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
 				iinfo->i_location.partitionReferenceNum,
 				goal, err);
 		if (!newblocknum) {
-			brelse(prev_epos.bh);
-			brelse(cur_epos.bh);
-			brelse(next_epos.bh);
 			*err = -ENOSPC;
-			return 0;
+			newblock = 0;
+			goto out_free;
 		}
 		if (isBeyondEOF)
 			iinfo->i_lenExtents += inode->i_sb->s_blocksize;
@@ -857,14 +841,12 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
 	 * block */
 	udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum);
 
-#ifdef UDF_PREALLOCATE
 	/* We preallocate blocks only for regular files. It also makes sense
 	 * for directories but there's a problem when to drop the
 	 * preallocation. We might use some delayed work for that but I feel
 	 * it's overengineering for a filesystem like UDF. */
 	if (S_ISREG(inode->i_mode))
 		udf_prealloc_extents(inode, c, lastblock, laarr, &endnum);
-#endif
 
 	/* merge any continuous blocks in laarr */
 	udf_merge_extents(inode, laarr, &endnum);
@@ -874,15 +856,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
 	 * the new number of extents is less than the old number */
 	udf_update_extents(inode, laarr, startnum, endnum, &prev_epos);
 
-	brelse(prev_epos.bh);
-	brelse(cur_epos.bh);
-	brelse(next_epos.bh);
-
 	newblock = udf_get_pblock(inode->i_sb, newblocknum,
 				iinfo->i_location.partitionReferenceNum, 0);
 	if (!newblock) {
 		*err = -EIO;
-		return 0;
+		goto out_free;
 	}
 	*new = 1;
 	iinfo->i_next_alloc_block = block;
@@ -893,13 +871,15 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
 		udf_sync_inode(inode);
 	else
 		mark_inode_dirty(inode);
-
+out_free:
+	brelse(prev_epos.bh);
+	brelse(cur_epos.bh);
+	brelse(next_epos.bh);
 	return newblock;
 }
 
 static void udf_split_extents(struct inode *inode, int *c, int offset,
-			      int newblocknum,
-			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+			      int newblocknum, struct kernel_long_ad *laarr,
 			      int *endnum)
 {
 	unsigned long blocksize = inode->i_sb->s_blocksize;
@@ -963,7 +943,7 @@ static void udf_split_extents(struct inode *inode, int *c, int offset,
 }
 
 static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
-				 struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+				 struct kernel_long_ad *laarr,
 				 int *endnum)
 {
 	int start, length = 0, currlength = 0, i;
@@ -1058,8 +1038,7 @@ static void udf_prealloc_extents(struct inode *inode, int c, int lastblock,
 	}
 }
 
-static void udf_merge_extents(struct inode *inode,
-			      struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr,
 			      int *endnum)
 {
 	int i;
@@ -1158,8 +1137,7 @@ static void udf_merge_extents(struct inode *inode,
 	}
 }
 
-static void udf_update_extents(struct inode *inode,
-			       struct kernel_long_ad laarr[EXTENT_MERGE_SIZE],
+static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr,
 			       int startnum, int endnum,
 			       struct extent_position *epos)
 {
@@ -1299,6 +1277,12 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
 	int ret = -EIO;
 
 reread:
+	if (iloc->partitionReferenceNum >= sbi->s_partitions) {
+		udf_debug("partition reference: %d > logical volume partitions: %d\n",
+			  iloc->partitionReferenceNum, sbi->s_partitions);
+		return -EIO;
+	}
+
 	if (iloc->logicalBlockNum >=
 	    sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) {
 		udf_debug("block=%d, partition=%d out of range\n",
@@ -1549,7 +1533,7 @@ reread:
 		break;
 	case ICBTAG_FILE_TYPE_SYMLINK:
 		inode->i_data.a_ops = &udf_symlink_aops;
-		inode->i_op = &page_symlink_inode_operations;
+		inode->i_op = &udf_symlink_inode_operations;
 		inode_nohighmem(inode);
 		inode->i_mode = S_IFLNK | S_IRWXUGO;
 		break;
@@ -1627,6 +1611,14 @@ static int udf_sync_inode(struct inode *inode)
 	return udf_update_inode(inode, 1);
 }
 
+static void udf_adjust_time(struct udf_inode_info *iinfo, struct timespec time)
+{
+	if (iinfo->i_crtime.tv_sec > time.tv_sec ||
+	    (iinfo->i_crtime.tv_sec == time.tv_sec &&
+	     iinfo->i_crtime.tv_nsec > time.tv_nsec))
+		iinfo->i_crtime = time;
+}
+
 static int udf_update_inode(struct inode *inode, int do_sync)
 {
 	struct buffer_head *bh = NULL;
@@ -1753,20 +1745,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		efe->objectSize = cpu_to_le64(inode->i_size);
 		efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded);
 
-		if (iinfo->i_crtime.tv_sec > inode->i_atime.tv_sec ||
-		    (iinfo->i_crtime.tv_sec == inode->i_atime.tv_sec &&
-		     iinfo->i_crtime.tv_nsec > inode->i_atime.tv_nsec))
-			iinfo->i_crtime = inode->i_atime;
-
-		if (iinfo->i_crtime.tv_sec > inode->i_mtime.tv_sec ||
-		    (iinfo->i_crtime.tv_sec == inode->i_mtime.tv_sec &&
-		     iinfo->i_crtime.tv_nsec > inode->i_mtime.tv_nsec))
-			iinfo->i_crtime = inode->i_mtime;
-
-		if (iinfo->i_crtime.tv_sec > inode->i_ctime.tv_sec ||
-		    (iinfo->i_crtime.tv_sec == inode->i_ctime.tv_sec &&
-		     iinfo->i_crtime.tv_nsec > inode->i_ctime.tv_nsec))
-			iinfo->i_crtime = inode->i_ctime;
+		udf_adjust_time(iinfo, inode->i_atime);
+		udf_adjust_time(iinfo, inode->i_mtime);
+		udf_adjust_time(iinfo, inode->i_ctime);
 
 		udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime);
 		udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime);
@@ -2286,8 +2267,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
 		  uint32_t *elen, sector_t *offset)
 {
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
-	loff_t lbcount = 0, bcount =
-	    (loff_t) block << blocksize_bits;
+	loff_t lbcount = 0, bcount = (loff_t) block << blocksize_bits;
 	int8_t etype;
 	struct udf_inode_info *iinfo;
 
@@ -2308,7 +2288,7 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
 		lbcount += *elen;
 	} while (lbcount <= bcount);
 	/* update extent cache */
-	udf_update_extent_cache(inode, lbcount - *elen, pos, 1);
+	udf_update_extent_cache(inode, lbcount - *elen, pos);
 	*offset = (bcount + *elen - lbcount) >> blocksize_bits;
 
 	return etype;
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 6ad5a453af97..5c7ec121990d 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -58,7 +58,7 @@ unsigned long udf_get_last_block(struct super_block *sb)
 	 */
 	if (ioctl_by_bdev(bdev, CDROM_LAST_WRITTEN, (unsigned long) &lblock) ||
 	    lblock == 0)
-		lblock = bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+		lblock = i_size_read(bdev->bd_inode) >> sb->s_blocksize_bits;
 
 	if (lblock)
 		return lblock - 1;
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index 71d1c25f360d..3949c4bec3a3 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -141,8 +141,6 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
 		iinfo->i_lenEAttr += size;
 		return (struct genericFormat *)&ea[offset];
 	}
-	if (loc & 0x02)
-		;
 
 	return NULL;
 }
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 2d65e280748b..babf48d0e553 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -931,7 +931,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	}
 
 	inode->i_data.a_ops = &udf_symlink_aops;
-	inode->i_op = &page_symlink_inode_operations;
+	inode->i_op = &udf_symlink_inode_operations;
 	inode_nohighmem(inode);
 
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h
index fbff74654df2..a4da59e38b7f 100644
--- a/fs/udf/osta_udf.h
+++ b/fs/udf/osta_udf.h
@@ -70,17 +70,17 @@ struct UDFIdentSuffix {
 	uint8_t		OSClass;
 	uint8_t		OSIdentifier;
 	uint8_t		reserved[4];
-} __attribute__ ((packed));
+} __packed;
 
 struct impIdentSuffix {
 	uint8_t		OSClass;
 	uint8_t		OSIdentifier;
 	uint8_t		reserved[6];
-} __attribute__ ((packed));
+} __packed;
 
 struct appIdentSuffix {
 	uint8_t		impUse[8];
-} __attribute__ ((packed));
+} __packed;
 
 /* Logical Volume Integrity Descriptor (UDF 2.50 2.2.6) */
 /* Implementation Use (UDF 2.50 2.2.6.4) */
@@ -92,7 +92,7 @@ struct logicalVolIntegrityDescImpUse {
 	__le16		minUDFWriteRev;
 	__le16		maxUDFWriteRev;
 	uint8_t		impUse[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Implementation Use Volume Descriptor (UDF 2.50 2.2.7) */
 /* Implementation Use (UDF 2.50 2.2.7.2) */
@@ -104,7 +104,7 @@ struct impUseVolDescImpUse {
 	dstring		LVInfo3[36];
 	struct regid	impIdent;
 	uint8_t		impUse[128];
-} __attribute__ ((packed));
+} __packed;
 
 struct udfPartitionMap2 {
 	uint8_t		partitionMapType;
@@ -113,7 +113,7 @@ struct udfPartitionMap2 {
 	struct regid	partIdent;
 	__le16		volSeqNum;
 	__le16		partitionNum;
-} __attribute__ ((packed));
+} __packed;
 
 /* Virtual Partition Map (UDF 2.50 2.2.8) */
 struct virtualPartitionMap {
@@ -124,7 +124,7 @@ struct virtualPartitionMap {
 	__le16		volSeqNum;
 	__le16		partitionNum;
 	uint8_t		reserved2[24];
-} __attribute__ ((packed));
+} __packed;
 
 /* Sparable Partition Map (UDF 2.50 2.2.9) */
 struct sparablePartitionMap {
@@ -139,7 +139,7 @@ struct sparablePartitionMap {
 	uint8_t reserved2[1];
 	__le32 sizeSparingTable;
 	__le32 locSparingTable[4];
-} __attribute__ ((packed));
+} __packed;
 
 /* Metadata Partition Map (UDF 2.4.0 2.2.10) */
 struct metadataPartitionMap {
@@ -156,14 +156,14 @@ struct metadataPartitionMap {
 	__le16		alignUnitSize;
 	uint8_t		flags;
 	uint8_t		reserved2[5];
-} __attribute__ ((packed));
+} __packed;
 
 /* Virtual Allocation Table (UDF 1.5 2.2.10) */
 struct virtualAllocationTable15 {
 	__le32		VirtualSector[0];
 	struct regid	vatIdent;
 	__le32		previousVATICBLoc;
-} __attribute__ ((packed));
+} __packed;
 
 #define ICBTAG_FILE_TYPE_VAT15		0x00U
 
@@ -181,7 +181,7 @@ struct virtualAllocationTable20 {
 	__le16		reserved;
 	uint8_t		impUse[0];
 	__le32		vatEntry[0];
-} __attribute__ ((packed));
+} __packed;
 
 #define ICBTAG_FILE_TYPE_VAT20		0xF8U
 
@@ -189,7 +189,7 @@ struct virtualAllocationTable20 {
 struct sparingEntry {
 	__le32		origLocation;
 	__le32		mappedLocation;
-} __attribute__ ((packed));
+} __packed;
 
 struct sparingTable {
 	struct tag	descTag;
@@ -199,7 +199,7 @@ struct sparingTable {
 	__le32		sequenceNum;
 	struct sparingEntry
 			mapEntry[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* Metadata File (and Metadata Mirror File) (UDF 2.50 2.2.13.1) */
 #define ICBTAG_FILE_TYPE_MAIN		0xFA
@@ -210,7 +210,7 @@ struct sparingTable {
 struct allocDescImpUse {
 	__le16		flags;
 	uint8_t		impUse[4];
-} __attribute__ ((packed));
+} __packed;
 
 #define AD_IU_EXT_ERASED		0x0001
 
@@ -222,7 +222,7 @@ struct allocDescImpUse {
 struct freeEaSpace {
 	__le16		headerChecksum;
 	uint8_t		freeEASpace[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* DVD Copyright Management Information (UDF 2.50 3.3.4.5.1.2) */
 struct DVDCopyrightImpUse {
@@ -230,14 +230,14 @@ struct DVDCopyrightImpUse {
 	uint8_t		CGMSInfo;
 	uint8_t		dataType;
 	uint8_t		protectionSystemInfo[4];
-} __attribute__ ((packed));
+} __packed;
 
 /* Application Use Extended Attribute (UDF 2.50 3.3.4.6) */
 /* FreeAppEASpace (UDF 2.50 3.3.4.6.1) */
 struct freeAppEASpace {
 	__le16		headerChecksum;
 	uint8_t		freeEASpace[0];
-} __attribute__ ((packed));
+} __packed;
 
 /* UDF Defined System Stream (UDF 2.50 3.3.7) */
 #define UDF_ID_UNIQUE_ID		"*UDF Unique ID Mapping Data"
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 4942549e7dc8..14b4bc1f6801 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -264,9 +264,6 @@ static void __exit exit_udf_fs(void)
 	destroy_inodecache();
 }
 
-module_init(init_udf_fs)
-module_exit(exit_udf_fs)
-
 static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count)
 {
 	struct udf_sb_info *sbi = UDF_SB(sb);
@@ -1216,7 +1213,8 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
 	struct udf_inode_info *vati;
 	uint32_t pos;
 	struct virtualAllocationTable20 *vat20;
-	sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+	sector_t blocks = i_size_read(sb->s_bdev->bd_inode) >>
+			  sb->s_blocksize_bits;
 
 	udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block);
 	if (!sbi->s_vat_inode &&
@@ -1806,7 +1804,7 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block,
 
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) &&
 	    udf_fixed_to_variable(block) >=
-	    sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
+	    i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits)
 		return -EAGAIN;
 
 	bh = udf_read_tagged(sb, block, block, &ident);
@@ -1868,7 +1866,7 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock,
 		last[last_count++] = *lastblock - 152;
 
 	for (i = 0; i < last_count; i++) {
-		if (last[i] >= sb->s_bdev->bd_inode->i_size >>
+		if (last[i] >= i_size_read(sb->s_bdev->bd_inode) >>
 				sb->s_blocksize_bits)
 			continue;
 		ret = udf_check_anchor_block(sb, last[i], fileset);
@@ -1957,7 +1955,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
 		if (!nsr_off) {
 			if (!silent)
 				udf_warn(sb, "No VRS found\n");
-			return 0;
+			return -EINVAL;
 		}
 		if (nsr_off == -1)
 			udf_debug("Failed to read sector at offset %d. "
@@ -1986,6 +1984,7 @@ static void udf_open_lvid(struct super_block *sb)
 	struct buffer_head *bh = sbi->s_lvid_bh;
 	struct logicalVolIntegrityDesc *lvid;
 	struct logicalVolIntegrityDescImpUse *lvidiu;
+	struct timespec ts;
 
 	if (!bh)
 		return;
@@ -1997,8 +1996,8 @@ static void udf_open_lvid(struct super_block *sb)
 	mutex_lock(&sbi->s_alloc_mutex);
 	lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
-	udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
-				CURRENT_TIME);
+	ktime_get_real_ts(&ts);
+	udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
 	lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_OPEN);
 
 	lvid->descTag.descCRC = cpu_to_le16(
@@ -2019,6 +2018,7 @@ static void udf_close_lvid(struct super_block *sb)
 	struct buffer_head *bh = sbi->s_lvid_bh;
 	struct logicalVolIntegrityDesc *lvid;
 	struct logicalVolIntegrityDescImpUse *lvidiu;
+	struct timespec ts;
 
 	if (!bh)
 		return;
@@ -2030,7 +2030,8 @@ static void udf_close_lvid(struct super_block *sb)
 	mutex_lock(&sbi->s_alloc_mutex);
 	lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
 	lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
-	udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME);
+	ktime_get_real_ts(&ts);
+	udf_time_to_disk_stamp(&lvid->recordingDateAndTime, ts);
 	if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
 		lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
 	if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
@@ -2158,15 +2159,25 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
 	} else {
 		uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
-		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
-		if (ret == -EAGAIN && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
-			if (!silent)
-				pr_notice("Rescanning with blocksize %d\n",
-					  UDF_DEFAULT_BLOCKSIZE);
-			brelse(sbi->s_lvid_bh);
-			sbi->s_lvid_bh = NULL;
-			uopt.blocksize = UDF_DEFAULT_BLOCKSIZE;
+		while (uopt.blocksize <= 4096) {
 			ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+			if (ret < 0) {
+				if (!silent && ret != -EACCES) {
+					pr_notice("Scanning with blocksize %d failed\n",
+						  uopt.blocksize);
+				}
+				brelse(sbi->s_lvid_bh);
+				sbi->s_lvid_bh = NULL;
+				/*
+				 * EACCES is special - we want to propagate to
+				 * upper layers that we cannot handle RW mount.
+				 */
+				if (ret == -EACCES)
+					break;
+			} else
+				break;
+
+			uopt.blocksize <<= 1;
 		}
 	}
 	if (ret < 0) {
@@ -2497,3 +2508,9 @@ static unsigned int udf_count_free(struct super_block *sb)
 
 	return accum;
 }
+
+MODULE_AUTHOR("Ben Fennema");
+MODULE_DESCRIPTION("Universal Disk Format Filesystem");
+MODULE_LICENSE("GPL");
+module_init(init_udf_fs)
+module_exit(exit_udf_fs)
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 8d619773056b..f7dfef53f739 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -152,9 +152,39 @@ out_unmap:
 	return err;
 }
 
+static int udf_symlink_getattr(struct vfsmount *mnt, struct dentry *dentry,
+			       struct kstat *stat)
+{
+	struct inode *inode = d_backing_inode(dentry);
+	struct page *page;
+
+	generic_fillattr(inode, stat);
+	page = read_mapping_page(inode->i_mapping, 0, NULL);
+	if (IS_ERR(page))
+		return PTR_ERR(page);
+	/*
+	 * UDF uses non-trivial encoding of symlinks so i_size does not match
+	 * number of characters reported by readlink(2) which apparently some
+	 * applications expect. Also POSIX says that "The value returned in the
+	 * st_size field shall be the length of the contents of the symbolic
+	 * link, and shall not count a trailing null if one is present." So
+	 * let's report the length of string returned by readlink(2) for
+	 * st_size.
+	 */
+	stat->size = strlen(page_address(page));
+	put_page(page);
+
+	return 0;
+}
+
 /*
  * symlinks can't do much...
  */
 const struct address_space_operations udf_symlink_aops = {
 	.readpage		= udf_symlink_filler,
 };
+
+const struct inode_operations udf_symlink_inode_operations = {
+	.get_link	= page_get_link,
+	.getattr	= udf_symlink_getattr,
+};
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 263829ef1873..63b034984378 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -15,7 +15,6 @@
 #include "udfend.h"
 #include "udf_i.h"
 
-#define UDF_PREALLOCATE
 #define UDF_DEFAULT_PREALLOC_BLOCKS	8
 
 extern __printf(3, 4) void _udf_err(struct super_block *sb,
@@ -85,6 +84,7 @@ extern const struct inode_operations udf_dir_inode_operations;
 extern const struct file_operations udf_dir_operations;
 extern const struct inode_operations udf_file_inode_operations;
 extern const struct file_operations udf_file_operations;
+extern const struct inode_operations udf_symlink_inode_operations;
 extern const struct address_space_operations udf_aops;
 extern const struct address_space_operations udf_adinicb_aops;
 extern const struct address_space_operations udf_symlink_aops;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index d96e2f30084b..18406158e13f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -12,6 +12,7 @@
  *  mm/ksm.c (mm hashing).
  */
 
+#include <linux/list.h>
 #include <linux/hashtable.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
@@ -26,6 +27,7 @@
 #include <linux/mempolicy.h>
 #include <linux/ioctl.h>
 #include <linux/security.h>
+#include <linux/hugetlb.h>
 
 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
 
@@ -45,12 +47,16 @@ struct userfaultfd_ctx {
 	wait_queue_head_t fault_wqh;
 	/* waitqueue head for the pseudo fd to wakeup poll/read */
 	wait_queue_head_t fd_wqh;
+	/* waitqueue head for events */
+	wait_queue_head_t event_wqh;
 	/* a refile sequence protected by fault_pending_wqh lock */
 	struct seqcount refile_seq;
 	/* pseudo fd refcounting */
 	atomic_t refcount;
 	/* userfaultfd syscall flags */
 	unsigned int flags;
+	/* features requested from the userspace */
+	unsigned int features;
 	/* state machine */
 	enum userfaultfd_state state;
 	/* released */
@@ -59,10 +65,17 @@ struct userfaultfd_ctx {
 	struct mm_struct *mm;
 };
 
+struct userfaultfd_fork_ctx {
+	struct userfaultfd_ctx *orig;
+	struct userfaultfd_ctx *new;
+	struct list_head list;
+};
+
 struct userfaultfd_wait_queue {
 	struct uffd_msg msg;
 	wait_queue_t wq;
 	struct userfaultfd_ctx *ctx;
+	bool waken;
 };
 
 struct userfaultfd_wake_range {
@@ -86,6 +99,12 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
 	if (len && (start > uwq->msg.arg.pagefault.address ||
 		    start + len <= uwq->msg.arg.pagefault.address))
 		goto out;
+	WRITE_ONCE(uwq->waken, true);
+	/*
+	 * The implicit smp_mb__before_spinlock in try_to_wake_up()
+	 * renders uwq->waken visible to other CPUs before the task is
+	 * waken.
+	 */
 	ret = wake_up_state(wq->private, mode);
 	if (ret)
 		/*
@@ -135,6 +154,8 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
 		VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));
 		VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
 		VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));
+		VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
+		VM_BUG_ON(waitqueue_active(&ctx->event_wqh));
 		VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
 		VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));
 		mmdrop(ctx->mm);
@@ -162,7 +183,7 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
 	msg.arg.pagefault.address = address;
 	if (flags & FAULT_FLAG_WRITE)
 		/*
-		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WRITE was set in the
+		 * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
 		 * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
 		 * was not set in a UFFD_EVENT_PAGEFAULT, it means it
 		 * was a read fault, otherwise if set it means it's
@@ -181,6 +202,49 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
 	return msg;
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Same functionality as userfaultfd_must_wait below with modifications for
+ * hugepmd ranges.
+ */
+static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+					 unsigned long address,
+					 unsigned long flags,
+					 unsigned long reason)
+{
+	struct mm_struct *mm = ctx->mm;
+	pte_t *pte;
+	bool ret = true;
+
+	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+	pte = huge_pte_offset(mm, address);
+	if (!pte)
+		goto out;
+
+	ret = false;
+
+	/*
+	 * Lockless access: we're in a wait_event so it's ok if it
+	 * changes under us.
+	 */
+	if (huge_pte_none(*pte))
+		ret = true;
+	if (!huge_pte_write(*pte) && (reason & VM_UFFD_WP))
+		ret = true;
+out:
+	return ret;
+}
+#else
+static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
+					 unsigned long address,
+					 unsigned long flags,
+					 unsigned long reason)
+{
+	return false;	/* should never get here */
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
 /*
  * Verify the pagetables are still not ok after having reigstered into
  * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
@@ -264,6 +328,7 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	struct userfaultfd_wait_queue uwq;
 	int ret;
 	bool must_wait, return_to_userland;
+	long blocking_state;
 
 	BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
 
@@ -334,10 +399,13 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	uwq.wq.private = current;
 	uwq.msg = userfault_msg(vmf->address, vmf->flags, reason);
 	uwq.ctx = ctx;
+	uwq.waken = false;
 
 	return_to_userland =
 		(vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
 		(FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
+	blocking_state = return_to_userland ? TASK_INTERRUPTIBLE :
+			 TASK_KILLABLE;
 
 	spin_lock(&ctx->fault_pending_wqh.lock);
 	/*
@@ -350,12 +418,15 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 	 * following the spin_unlock to happen before the list_add in
 	 * __add_wait_queue.
 	 */
-	set_current_state(return_to_userland ? TASK_INTERRUPTIBLE :
-			  TASK_KILLABLE);
+	set_current_state(blocking_state);
 	spin_unlock(&ctx->fault_pending_wqh.lock);
 
-	must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
-					  reason);
+	if (!is_vm_hugetlb_page(vmf->vma))
+		must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags,
+						  reason);
+	else
+		must_wait = userfaultfd_huge_must_wait(ctx, vmf->address,
+						       vmf->flags, reason);
 	up_read(&mm->mmap_sem);
 
 	if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
@@ -364,6 +435,29 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason)
 		wake_up_poll(&ctx->fd_wqh, POLLIN);
 		schedule();
 		ret |= VM_FAULT_MAJOR;
+
+		/*
+		 * False wakeups can orginate even from rwsem before
+		 * up_read() however userfaults will wait either for a
+		 * targeted wakeup on the specific uwq waitqueue from
+		 * wake_userfault() or for signals or for uffd
+		 * release.
+		 */
+		while (!READ_ONCE(uwq.waken)) {
+			/*
+			 * This needs the full smp_store_mb()
+			 * guarantee as the state write must be
+			 * visible to other CPUs before reading
+			 * uwq.waken from other CPUs.
+			 */
+			set_current_state(blocking_state);
+			if (READ_ONCE(uwq.waken) ||
+			    READ_ONCE(ctx->released) ||
+			    (return_to_userland ? signal_pending(current) :
+			     fatal_signal_pending(current)))
+				break;
+			schedule();
+		}
 	}
 
 	__set_current_state(TASK_RUNNING);
@@ -425,6 +519,196 @@ out:
 	return ret;
 }
 
+static int userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
+					     struct userfaultfd_wait_queue *ewq)
+{
+	int ret = 0;
+
+	ewq->ctx = ctx;
+	init_waitqueue_entry(&ewq->wq, current);
+
+	spin_lock(&ctx->event_wqh.lock);
+	/*
+	 * After the __add_wait_queue the uwq is visible to userland
+	 * through poll/read().
+	 */
+	__add_wait_queue(&ctx->event_wqh, &ewq->wq);
+	for (;;) {
+		set_current_state(TASK_KILLABLE);
+		if (ewq->msg.event == 0)
+			break;
+		if (ACCESS_ONCE(ctx->released) ||
+		    fatal_signal_pending(current)) {
+			ret = -1;
+			__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
+			break;
+		}
+
+		spin_unlock(&ctx->event_wqh.lock);
+
+		wake_up_poll(&ctx->fd_wqh, POLLIN);
+		schedule();
+
+		spin_lock(&ctx->event_wqh.lock);
+	}
+	__set_current_state(TASK_RUNNING);
+	spin_unlock(&ctx->event_wqh.lock);
+
+	/*
+	 * ctx may go away after this if the userfault pseudo fd is
+	 * already released.
+	 */
+
+	userfaultfd_ctx_put(ctx);
+	return ret;
+}
+
+static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
+				       struct userfaultfd_wait_queue *ewq)
+{
+	ewq->msg.event = 0;
+	wake_up_locked(&ctx->event_wqh);
+	__remove_wait_queue(&ctx->event_wqh, &ewq->wq);
+}
+
+int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
+{
+	struct userfaultfd_ctx *ctx = NULL, *octx;
+	struct userfaultfd_fork_ctx *fctx;
+
+	octx = vma->vm_userfaultfd_ctx.ctx;
+	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
+		return 0;
+	}
+
+	list_for_each_entry(fctx, fcs, list)
+		if (fctx->orig == octx) {
+			ctx = fctx->new;
+			break;
+		}
+
+	if (!ctx) {
+		fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
+		if (!fctx)
+			return -ENOMEM;
+
+		ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
+		if (!ctx) {
+			kfree(fctx);
+			return -ENOMEM;
+		}
+
+		atomic_set(&ctx->refcount, 1);
+		ctx->flags = octx->flags;
+		ctx->state = UFFD_STATE_RUNNING;
+		ctx->features = octx->features;
+		ctx->released = false;
+		ctx->mm = vma->vm_mm;
+		atomic_inc(&ctx->mm->mm_count);
+
+		userfaultfd_ctx_get(octx);
+		fctx->orig = octx;
+		fctx->new = ctx;
+		list_add_tail(&fctx->list, fcs);
+	}
+
+	vma->vm_userfaultfd_ctx.ctx = ctx;
+	return 0;
+}
+
+static int dup_fctx(struct userfaultfd_fork_ctx *fctx)
+{
+	struct userfaultfd_ctx *ctx = fctx->orig;
+	struct userfaultfd_wait_queue ewq;
+
+	msg_init(&ewq.msg);
+
+	ewq.msg.event = UFFD_EVENT_FORK;
+	ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;
+
+	return userfaultfd_event_wait_completion(ctx, &ewq);
+}
+
+void dup_userfaultfd_complete(struct list_head *fcs)
+{
+	int ret = 0;
+	struct userfaultfd_fork_ctx *fctx, *n;
+
+	list_for_each_entry_safe(fctx, n, fcs, list) {
+		if (!ret)
+			ret = dup_fctx(fctx);
+		list_del(&fctx->list);
+		kfree(fctx);
+	}
+}
+
+void mremap_userfaultfd_prep(struct vm_area_struct *vma,
+			     struct vm_userfaultfd_ctx *vm_ctx)
+{
+	struct userfaultfd_ctx *ctx;
+
+	ctx = vma->vm_userfaultfd_ctx.ctx;
+	if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
+		vm_ctx->ctx = ctx;
+		userfaultfd_ctx_get(ctx);
+	}
+}
+
+void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
+				 unsigned long from, unsigned long to,
+				 unsigned long len)
+{
+	struct userfaultfd_ctx *ctx = vm_ctx->ctx;
+	struct userfaultfd_wait_queue ewq;
+
+	if (!ctx)
+		return;
+
+	if (to & ~PAGE_MASK) {
+		userfaultfd_ctx_put(ctx);
+		return;
+	}
+
+	msg_init(&ewq.msg);
+
+	ewq.msg.event = UFFD_EVENT_REMAP;
+	ewq.msg.arg.remap.from = from;
+	ewq.msg.arg.remap.to = to;
+	ewq.msg.arg.remap.len = len;
+
+	userfaultfd_event_wait_completion(ctx, &ewq);
+}
+
+void madvise_userfault_dontneed(struct vm_area_struct *vma,
+				struct vm_area_struct **prev,
+				unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct userfaultfd_ctx *ctx;
+	struct userfaultfd_wait_queue ewq;
+
+	ctx = vma->vm_userfaultfd_ctx.ctx;
+	if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_MADVDONTNEED))
+		return;
+
+	userfaultfd_ctx_get(ctx);
+	up_read(&mm->mmap_sem);
+
+	*prev = NULL; /* We wait for ACK w/o the mmap semaphore */
+
+	msg_init(&ewq.msg);
+
+	ewq.msg.event = UFFD_EVENT_MADVDONTNEED;
+	ewq.msg.arg.madv_dn.start = start;
+	ewq.msg.arg.madv_dn.end = end;
+
+	userfaultfd_event_wait_completion(ctx, &ewq);
+
+	down_read(&mm->mmap_sem);
+}
+
 static int userfaultfd_release(struct inode *inode, struct file *file)
 {
 	struct userfaultfd_ctx *ctx = file->private_data;
@@ -489,25 +773,36 @@ wakeup:
 }
 
 /* fault_pending_wqh.lock must be hold by the caller */
-static inline struct userfaultfd_wait_queue *find_userfault(
-	struct userfaultfd_ctx *ctx)
+static inline struct userfaultfd_wait_queue *find_userfault_in(
+		wait_queue_head_t *wqh)
 {
 	wait_queue_t *wq;
 	struct userfaultfd_wait_queue *uwq;
 
-	VM_BUG_ON(!spin_is_locked(&ctx->fault_pending_wqh.lock));
+	VM_BUG_ON(!spin_is_locked(&wqh->lock));
 
 	uwq = NULL;
-	if (!waitqueue_active(&ctx->fault_pending_wqh))
+	if (!waitqueue_active(wqh))
 		goto out;
 	/* walk in reverse to provide FIFO behavior to read userfaults */
-	wq = list_last_entry(&ctx->fault_pending_wqh.task_list,
-			     typeof(*wq), task_list);
+	wq = list_last_entry(&wqh->task_list, typeof(*wq), task_list);
 	uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
 out:
 	return uwq;
 }
 
+static inline struct userfaultfd_wait_queue *find_userfault(
+		struct userfaultfd_ctx *ctx)
+{
+	return find_userfault_in(&ctx->fault_pending_wqh);
+}
+
+static inline struct userfaultfd_wait_queue *find_userfault_evt(
+		struct userfaultfd_ctx *ctx)
+{
+	return find_userfault_in(&ctx->event_wqh);
+}
+
 static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
 {
 	struct userfaultfd_ctx *ctx = file->private_data;
@@ -539,18 +834,59 @@ static unsigned int userfaultfd_poll(struct file *file, poll_table *wait)
 		smp_mb();
 		if (waitqueue_active(&ctx->fault_pending_wqh))
 			ret = POLLIN;
+		else if (waitqueue_active(&ctx->event_wqh))
+			ret = POLLIN;
+
 		return ret;
 	default:
-		BUG();
+		WARN_ON_ONCE(1);
+		return POLLERR;
 	}
 }
 
+static const struct file_operations userfaultfd_fops;
+
+static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
+				  struct userfaultfd_ctx *new,
+				  struct uffd_msg *msg)
+{
+	int fd;
+	struct file *file;
+	unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS;
+
+	fd = get_unused_fd_flags(flags);
+	if (fd < 0)
+		return fd;
+
+	file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new,
+				  O_RDWR | flags);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		return PTR_ERR(file);
+	}
+
+	fd_install(fd, file);
+	msg->arg.reserved.reserved1 = 0;
+	msg->arg.fork.ufd = fd;
+
+	return 0;
+}
+
 static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 				    struct uffd_msg *msg)
 {
 	ssize_t ret;
 	DECLARE_WAITQUEUE(wait, current);
 	struct userfaultfd_wait_queue *uwq;
+	/*
+	 * Handling fork event requires sleeping operations, so
+	 * we drop the event_wqh lock, then do these ops, then
+	 * lock it back and wake up the waiter. While the lock is
+	 * dropped the ewq may go away so we keep track of it
+	 * carefully.
+	 */
+	LIST_HEAD(fork_event);
+	struct userfaultfd_ctx *fork_nctx = NULL;
 
 	/* always take the fd_wqh lock before the fault_pending_wqh lock */
 	spin_lock(&ctx->fd_wqh.lock);
@@ -602,6 +938,29 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 			break;
 		}
 		spin_unlock(&ctx->fault_pending_wqh.lock);
+
+		spin_lock(&ctx->event_wqh.lock);
+		uwq = find_userfault_evt(ctx);
+		if (uwq) {
+			*msg = uwq->msg;
+
+			if (uwq->msg.event == UFFD_EVENT_FORK) {
+				fork_nctx = (struct userfaultfd_ctx *)
+					(unsigned long)
+					uwq->msg.arg.reserved.reserved1;
+				list_move(&uwq->wq.task_list, &fork_event);
+				spin_unlock(&ctx->event_wqh.lock);
+				ret = 0;
+				break;
+			}
+
+			userfaultfd_event_complete(ctx, uwq);
+			spin_unlock(&ctx->event_wqh.lock);
+			ret = 0;
+			break;
+		}
+		spin_unlock(&ctx->event_wqh.lock);
+
 		if (signal_pending(current)) {
 			ret = -ERESTARTSYS;
 			break;
@@ -618,6 +977,23 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
 	__set_current_state(TASK_RUNNING);
 	spin_unlock(&ctx->fd_wqh.lock);
 
+	if (!ret && msg->event == UFFD_EVENT_FORK) {
+		ret = resolve_userfault_fork(ctx, fork_nctx, msg);
+
+		if (!ret) {
+			spin_lock(&ctx->event_wqh.lock);
+			if (!list_empty(&fork_event)) {
+				uwq = list_first_entry(&fork_event,
+						       typeof(*uwq),
+						       wq.task_list);
+				list_del(&uwq->wq.task_list);
+				__add_wait_queue(&ctx->event_wqh, &uwq->wq);
+				userfaultfd_event_complete(ctx, uwq);
+			}
+			spin_unlock(&ctx->event_wqh.lock);
+		}
+	}
+
 	return ret;
 }
 
@@ -720,6 +1096,12 @@ static __always_inline int validate_range(struct mm_struct *mm,
 	return 0;
 }
 
+static inline bool vma_can_userfault(struct vm_area_struct *vma)
+{
+	return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
+		vma_is_shmem(vma);
+}
+
 static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 				unsigned long arg)
 {
@@ -730,6 +1112,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	struct uffdio_register __user *user_uffdio_register;
 	unsigned long vm_flags, new_flags;
 	bool found;
+	bool non_anon_pages;
 	unsigned long start, end, vma_end;
 
 	user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -781,13 +1164,21 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		goto out_unlock;
 
 	/*
+	 * If the first vma contains huge pages, make sure start address
+	 * is aligned to huge page size.
+	 */
+	if (is_vm_hugetlb_page(vma)) {
+		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
+
+		if (start & (vma_hpagesize - 1))
+			goto out_unlock;
+	}
+
+	/*
 	 * Search for not compatible vmas.
-	 *
-	 * FIXME: this shall be relaxed later so that it doesn't fail
-	 * on tmpfs backed vmas (in addition to the current allowance
-	 * on anonymous vmas).
 	 */
 	found = false;
+	non_anon_pages = false;
 	for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
 		cond_resched();
 
@@ -796,8 +1187,21 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 
 		/* check not compatible vmas */
 		ret = -EINVAL;
-		if (cur->vm_ops)
+		if (!vma_can_userfault(cur))
 			goto out_unlock;
+		/*
+		 * If this vma contains ending address, and huge pages
+		 * check alignment.
+		 */
+		if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
+		    end > cur->vm_start) {
+			unsigned long vma_hpagesize = vma_kernel_pagesize(cur);
+
+			ret = -EINVAL;
+
+			if (end & (vma_hpagesize - 1))
+				goto out_unlock;
+		}
 
 		/*
 		 * Check that this vma isn't already owned by a
@@ -810,6 +1214,12 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 		    cur->vm_userfaultfd_ctx.ctx != ctx)
 			goto out_unlock;
 
+		/*
+		 * Note vmas containing huge pages
+		 */
+		if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur))
+			non_anon_pages = true;
+
 		found = true;
 	}
 	BUG_ON(!found);
@@ -821,7 +1231,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
 	do {
 		cond_resched();
 
-		BUG_ON(vma->vm_ops);
+		BUG_ON(!vma_can_userfault(vma));
 		BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
 		       vma->vm_userfaultfd_ctx.ctx != ctx);
 
@@ -879,7 +1289,8 @@ out_unlock:
 		 * userland which ioctls methods are guaranteed to
 		 * succeed on this range.
 		 */
-		if (put_user(UFFD_API_RANGE_IOCTLS,
+		if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC :
+			     UFFD_API_RANGE_IOCTLS,
 			     &user_uffdio_register->ioctls))
 			ret = -EFAULT;
 	}
@@ -926,11 +1337,18 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		goto out_unlock;
 
 	/*
+	 * If the first vma contains huge pages, make sure start address
+	 * is aligned to huge page size.
+	 */
+	if (is_vm_hugetlb_page(vma)) {
+		unsigned long vma_hpagesize = vma_kernel_pagesize(vma);
+
+		if (start & (vma_hpagesize - 1))
+			goto out_unlock;
+	}
+
+	/*
 	 * Search for not compatible vmas.
-	 *
-	 * FIXME: this shall be relaxed later so that it doesn't fail
-	 * on tmpfs backed vmas (in addition to the current allowance
-	 * on anonymous vmas).
 	 */
 	found = false;
 	ret = -EINVAL;
@@ -947,7 +1365,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 		 * provides for more strict behavior to notice
 		 * unregistration errors.
 		 */
-		if (cur->vm_ops)
+		if (!vma_can_userfault(cur))
 			goto out_unlock;
 
 		found = true;
@@ -961,7 +1379,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 	do {
 		cond_resched();
 
-		BUG_ON(vma->vm_ops);
+		BUG_ON(!vma_can_userfault(vma));
 
 		/*
 		 * Nothing to do: this vma is already registered into this
@@ -974,6 +1392,19 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
 			start = vma->vm_start;
 		vma_end = min(end, vma->vm_end);
 
+		if (userfaultfd_missing(vma)) {
+			/*
+			 * Wake any concurrent pending userfault while
+			 * we unregister, so they will not hang
+			 * permanently and it avoids userland to call
+			 * UFFDIO_WAKE explicitly.
+			 */
+			struct userfaultfd_wake_range range;
+			range.start = start;
+			range.len = vma_end - start;
+			wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
+		}
+
 		new_flags = vma->vm_flags & ~(VM_UFFD_MISSING | VM_UFFD_WP);
 		prev = vma_merge(mm, prev, start, vma_end, new_flags,
 				 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
@@ -1145,6 +1576,14 @@ out:
 	return ret;
 }
 
+static inline unsigned int uffd_ctx_features(__u64 user_features)
+{
+	/*
+	 * For the current set of features the bits just coincide
+	 */
+	return (unsigned int)user_features;
+}
+
 /*
  * userland asks for a certain API version and we return which bits
  * and ioctl commands are implemented in this kernel for such API
@@ -1156,6 +1595,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 	struct uffdio_api uffdio_api;
 	void __user *buf = (void __user *)arg;
 	int ret;
+	__u64 features;
 
 	ret = -EINVAL;
 	if (ctx->state != UFFD_STATE_WAIT_API)
@@ -1163,19 +1603,23 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 	ret = -EFAULT;
 	if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
 		goto out;
-	if (uffdio_api.api != UFFD_API || uffdio_api.features) {
+	features = uffdio_api.features;
+	if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
 		memset(&uffdio_api, 0, sizeof(uffdio_api));
 		if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
 			goto out;
 		ret = -EINVAL;
 		goto out;
 	}
+	/* report all available features and ioctls to userland */
 	uffdio_api.features = UFFD_API_FEATURES;
 	uffdio_api.ioctls = UFFD_API_IOCTLS;
 	ret = -EFAULT;
 	if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
 		goto out;
 	ctx->state = UFFD_STATE_RUNNING;
+	/* only enable the requested features for this uffd context */
+	ctx->features = uffd_ctx_features(features);
 	ret = 0;
 out:
 	return ret;
@@ -1262,6 +1706,7 @@ static void init_once_userfaultfd_ctx(void *mem)
 
 	init_waitqueue_head(&ctx->fault_pending_wqh);
 	init_waitqueue_head(&ctx->fault_wqh);
+	init_waitqueue_head(&ctx->event_wqh);
 	init_waitqueue_head(&ctx->fd_wqh);
 	seqcount_init(&ctx->refile_seq);
 }
@@ -1302,6 +1747,7 @@ static struct file *userfaultfd_file_create(int flags)
 
 	atomic_set(&ctx->refcount, 1);
 	ctx->flags = flags;
+	ctx->features = 0;
 	ctx->state = UFFD_STATE_WAIT_API;
 	ctx->released = false;
 	ctx->mm = current->mm;
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e5ebc3770460..33db69be4832 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -39,6 +39,7 @@
 #include "xfs_rmap_btree.h"
 #include "xfs_btree.h"
 #include "xfs_refcount_btree.h"
+#include "xfs_ialloc_btree.h"
 
 /*
  * Per-AG Block Reservations
@@ -200,22 +201,30 @@ __xfs_ag_resv_init(
 	struct xfs_mount		*mp = pag->pag_mount;
 	struct xfs_ag_resv		*resv;
 	int				error;
+	xfs_extlen_t			reserved;
 
-	resv = xfs_perag_resv(pag, type);
 	if (used > ask)
 		ask = used;
-	resv->ar_asked = ask;
-	resv->ar_reserved = resv->ar_orig_reserved = ask - used;
-	mp->m_ag_max_usable -= ask;
+	reserved = ask - used;
 
-	trace_xfs_ag_resv_init(pag, type, ask);
-
-	error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true);
-	if (error)
+	error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true);
+	if (error) {
 		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
 				error, _RET_IP_);
+		xfs_warn(mp,
+"Per-AG reservation for AG %u failed.  Filesystem may run out of space.",
+				pag->pag_agno);
+		return error;
+	}
 
-	return error;
+	mp->m_ag_max_usable -= ask;
+
+	resv = xfs_perag_resv(pag, type);
+	resv->ar_asked = ask;
+	resv->ar_reserved = resv->ar_orig_reserved = reserved;
+
+	trace_xfs_ag_resv_init(pag, type, ask);
+	return 0;
 }
 
 /* Create a per-AG block reservation. */
@@ -223,6 +232,8 @@ int
 xfs_ag_resv_init(
 	struct xfs_perag		*pag)
 {
+	struct xfs_mount		*mp = pag->pag_mount;
+	xfs_agnumber_t			agno = pag->pag_agno;
 	xfs_extlen_t			ask;
 	xfs_extlen_t			used;
 	int				error = 0;
@@ -231,23 +242,45 @@ xfs_ag_resv_init(
 	if (pag->pag_meta_resv.ar_asked == 0) {
 		ask = used = 0;
 
-		error = xfs_refcountbt_calc_reserves(pag->pag_mount,
-				pag->pag_agno, &ask, &used);
+		error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
 		if (error)
 			goto out;
 
-		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
-				ask, used);
+		error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
 		if (error)
 			goto out;
+
+		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+				ask, used);
+		if (error) {
+			/*
+			 * Because we didn't have per-AG reservations when the
+			 * finobt feature was added we might not be able to
+			 * reserve all needed blocks.  Warn and fall back to the
+			 * old and potentially buggy code in that case, but
+			 * ensure we do have the reservation for the refcountbt.
+			 */
+			ask = used = 0;
+
+			mp->m_inotbt_nores = true;
+
+			error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
+					&used);
+			if (error)
+				goto out;
+
+			error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
+					ask, used);
+			if (error)
+				goto out;
+		}
 	}
 
 	/* Create the AGFL metadata reservation */
 	if (pag->pag_agfl_resv.ar_asked == 0) {
 		ask = used = 0;
 
-		error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
-				&ask, &used);
+		error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
 		if (error)
 			goto out;
 
@@ -256,6 +289,16 @@ xfs_ag_resv_init(
 			goto out;
 	}
 
+#ifdef DEBUG
+	/* need to read in the AGF for the ASSERT below to work */
+	error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
+	if (error)
+		return error;
+
+	ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
+	       xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
+	       pag->pagf_freeblks + pag->pagf_flcount);
+#endif
 out:
 	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 5050056a0b06..369adcc18c02 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -95,10 +95,7 @@ unsigned int
 xfs_alloc_set_aside(
 	struct xfs_mount	*mp)
 {
-	unsigned int		blocks;
-
-	blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE);
-	return blocks;
+	return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
 }
 
 /*
@@ -224,20 +221,22 @@ xfs_alloc_get_rec(
  * Compute aligned version of the found extent.
  * Takes alignment and min length into account.
  */
-STATIC void
+STATIC bool
 xfs_alloc_compute_aligned(
 	xfs_alloc_arg_t	*args,		/* allocation argument structure */
 	xfs_agblock_t	foundbno,	/* starting block in found extent */
 	xfs_extlen_t	foundlen,	/* length in found extent */
 	xfs_agblock_t	*resbno,	/* result block number */
-	xfs_extlen_t	*reslen)	/* result length */
+	xfs_extlen_t	*reslen,	/* result length */
+	unsigned	*busy_gen)
 {
-	xfs_agblock_t	bno;
-	xfs_extlen_t	len;
+	xfs_agblock_t	bno = foundbno;
+	xfs_extlen_t	len = foundlen;
 	xfs_extlen_t	diff;
+	bool		busy;
 
 	/* Trim busy sections out of found extent */
-	xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len);
+	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
 
 	/*
 	 * If we have a largish extent that happens to start before min_agbno,
@@ -262,6 +261,8 @@ xfs_alloc_compute_aligned(
 		*resbno = bno;
 		*reslen = len;
 	}
+
+	return busy;
 }
 
 /*
@@ -365,36 +366,12 @@ xfs_alloc_fix_len(
 		return;
 	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
 	ASSERT(rlen % args->prod == args->mod);
+	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
+		rlen + args->minleft);
 	args->len = rlen;
 }
 
 /*
- * Fix up length if there is too little space left in the a.g.
- * Return 1 if ok, 0 if too little, should give up.
- */
-STATIC int
-xfs_alloc_fix_minleft(
-	xfs_alloc_arg_t	*args)		/* allocation argument structure */
-{
-	xfs_agf_t	*agf;		/* a.g. freelist header */
-	int		diff;		/* free space difference */
-
-	if (args->minleft == 0)
-		return 1;
-	agf = XFS_BUF_TO_AGF(args->agbp);
-	diff = be32_to_cpu(agf->agf_freeblks)
-		- args->len - args->minleft;
-	if (diff >= 0)
-		return 1;
-	args->len += diff;		/* shrink the allocated space */
-	/* casts to (int) catch length underflows */
-	if ((int)args->len >= (int)args->minlen)
-		return 1;
-	args->agbno = NULLAGBLOCK;
-	return 0;
-}
-
-/*
  * Update the two btrees, logically removing from freespace the extent
  * starting at rbno, rlen blocks.  The extent is contained within the
  * actual (current) free extent fbno for flen blocks.
@@ -689,8 +666,6 @@ xfs_alloc_ag_vextent(
 	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
 {
 	int		error=0;
-	xfs_extlen_t	reservation;
-	xfs_extlen_t	oldmax;
 
 	ASSERT(args->minlen > 0);
 	ASSERT(args->maxlen > 0);
@@ -699,20 +674,6 @@ xfs_alloc_ag_vextent(
 	ASSERT(args->alignment > 0);
 
 	/*
-	 * Clamp maxlen to the amount of free space minus any reservations
-	 * that have been made.
-	 */
-	oldmax = args->maxlen;
-	reservation = xfs_ag_resv_needed(args->pag, args->resv);
-	if (args->maxlen > args->pag->pagf_freeblks - reservation)
-		args->maxlen = args->pag->pagf_freeblks - reservation;
-	if (args->maxlen == 0) {
-		args->agbno = NULLAGBLOCK;
-		args->maxlen = oldmax;
-		return 0;
-	}
-
-	/*
 	 * Branch to correct routine based on the type.
 	 */
 	args->wasfromfl = 0;
@@ -731,8 +692,6 @@ xfs_alloc_ag_vextent(
 		/* NOTREACHED */
 	}
 
-	args->maxlen = oldmax;
-
 	if (error || args->agbno == NULLAGBLOCK)
 		return error;
 
@@ -782,10 +741,11 @@ xfs_alloc_ag_vextent_exact(
 	int		error;
 	xfs_agblock_t	fbno;	/* start block of found extent */
 	xfs_extlen_t	flen;	/* length of found extent */
-	xfs_agblock_t	tbno;	/* start block of trimmed extent */
-	xfs_extlen_t	tlen;	/* length of trimmed extent */
-	xfs_agblock_t	tend;	/* end block of trimmed extent */
+	xfs_agblock_t	tbno;	/* start block of busy extent */
+	xfs_extlen_t	tlen;	/* length of busy extent */
+	xfs_agblock_t	tend;	/* end block of busy extent */
 	int		i;	/* success/failure of operation */
+	unsigned	busy_gen;
 
 	ASSERT(args->alignment == 1);
 
@@ -818,7 +778,9 @@ xfs_alloc_ag_vextent_exact(
 	/*
 	 * Check for overlapping busy extents.
 	 */
-	xfs_extent_busy_trim(args, fbno, flen, &tbno, &tlen);
+	tbno = fbno;
+	tlen = flen;
+	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
 
 	/*
 	 * Give up if the start of the extent is busy, or the freespace isn't
@@ -841,9 +803,6 @@ xfs_alloc_ag_vextent_exact(
 	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
 						- args->agbno;
 	xfs_alloc_fix_len(args);
-	if (!xfs_alloc_fix_minleft(args))
-		goto not_found;
-
 	ASSERT(args->agbno + args->len <= tend);
 
 	/*
@@ -901,6 +860,7 @@ xfs_alloc_find_best_extent(
 	xfs_agblock_t		sdiff;
 	int			error;
 	int			i;
+	unsigned		busy_gen;
 
 	/* The good extent is perfect, no need to  search. */
 	if (!gdiff)
@@ -914,7 +874,8 @@ xfs_alloc_find_best_extent(
 		if (error)
 			goto error0;
 		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-		xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena);
+		xfs_alloc_compute_aligned(args, *sbno, *slen,
+				sbnoa, slena, &busy_gen);
 
 		/*
 		 * The good extent is closer than this one.
@@ -1003,7 +964,8 @@ xfs_alloc_ag_vextent_near(
 	xfs_extlen_t	ltlena;		/* aligned ... */
 	xfs_agblock_t	ltnew;		/* useful start bno of left side */
 	xfs_extlen_t	rlen;		/* length of returned extent */
-	int		forced = 0;
+	bool		busy;
+	unsigned	busy_gen;
 #ifdef DEBUG
 	/*
 	 * Randomly don't execute the first algorithm.
@@ -1030,6 +992,7 @@ restart:
 	ltlen = 0;
 	gtlena = 0;
 	ltlena = 0;
+	busy = false;
 
 	/*
 	 * Get a cursor for the by-size btree.
@@ -1112,8 +1075,8 @@ restart:
 			if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-			xfs_alloc_compute_aligned(args, ltbno, ltlen,
-						  &ltbnoa, &ltlena);
+			busy = xfs_alloc_compute_aligned(args, ltbno, ltlen,
+					&ltbnoa, &ltlena, &busy_gen);
 			if (ltlena < args->minlen)
 				continue;
 			if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno)
@@ -1149,12 +1112,7 @@ restart:
 		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 		ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
 		args->len = blen;
-		if (!xfs_alloc_fix_minleft(args)) {
-			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-			trace_xfs_alloc_near_nominleft(args);
-			return 0;
-		}
-		blen = args->len;
+
 		/*
 		 * We are allocating starting at bnew for blen blocks.
 		 */
@@ -1236,8 +1194,8 @@ restart:
 			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-			xfs_alloc_compute_aligned(args, ltbno, ltlen,
-						  &ltbnoa, &ltlena);
+			busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen,
+					&ltbnoa, &ltlena, &busy_gen);
 			if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
 				break;
 			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -1252,8 +1210,8 @@ restart:
 			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
-			xfs_alloc_compute_aligned(args, gtbno, gtlen,
-						  &gtbnoa, &gtlena);
+			busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen,
+					&gtbnoa, &gtlena, &busy_gen);
 			if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
 				break;
 			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -1314,9 +1272,9 @@ restart:
 	if (bno_cur_lt == NULL && bno_cur_gt == NULL) {
 		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 
-		if (!forced++) {
+		if (busy) {
 			trace_xfs_alloc_near_busy(args);
-			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
 			goto restart;
 		}
 		trace_xfs_alloc_size_neither(args);
@@ -1346,12 +1304,6 @@ restart:
 	 */
 	args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 	xfs_alloc_fix_len(args);
-	if (!xfs_alloc_fix_minleft(args)) {
-		trace_xfs_alloc_near_nominleft(args);
-		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		return 0;
-	}
 	rlen = args->len;
 	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
 				     args->datatype, ltbnoa, ltlena, &ltnew);
@@ -1403,7 +1355,8 @@ xfs_alloc_ag_vextent_size(
 	int		i;		/* temp status variable */
 	xfs_agblock_t	rbno;		/* returned block number */
 	xfs_extlen_t	rlen;		/* length of returned extent */
-	int		forced = 0;
+	bool		busy;
+	unsigned	busy_gen;
 
 restart:
 	/*
@@ -1412,6 +1365,7 @@ restart:
 	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
 		args->agno, XFS_BTNUM_CNT);
 	bno_cur = NULL;
+	busy = false;
 
 	/*
 	 * Look for an entry >= maxlen+alignment-1 blocks.
@@ -1421,14 +1375,13 @@ restart:
 		goto error0;
 
 	/*
-	 * If none or we have busy extents that we cannot allocate from, then
-	 * we have to settle for a smaller extent. In the case that there are
-	 * no large extents, this will return the last entry in the tree unless
-	 * the tree is empty. In the case that there are only busy large
-	 * extents, this will return the largest small extent unless there
+	 * If none then we have to settle for a smaller extent. In the case that
+	 * there are no large extents, this will return the last entry in the
+	 * tree unless the tree is empty. In the case that there are only busy
+	 * large extents, this will return the largest small extent unless there
 	 * are no smaller extents available.
 	 */
-	if (!i || forced > 1) {
+	if (!i) {
 		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
 						   &fbno, &flen, &i);
 		if (error)
@@ -1439,13 +1392,11 @@ restart:
 			return 0;
 		}
 		ASSERT(i == 1);
-		xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
+		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
+				&rlen, &busy_gen);
 	} else {
 		/*
 		 * Search for a non-busy extent that is large enough.
-		 * If we are at low space, don't check, or if we fall of
-		 * the end of the btree, turn off the busy check and
-		 * restart.
 		 */
 		for (;;) {
 			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
@@ -1453,8 +1404,8 @@ restart:
 				goto error0;
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 
-			xfs_alloc_compute_aligned(args, fbno, flen,
-						  &rbno, &rlen);
+			busy = xfs_alloc_compute_aligned(args, fbno, flen,
+					&rbno, &rlen, &busy_gen);
 
 			if (rlen >= args->maxlen)
 				break;
@@ -1466,18 +1417,13 @@ restart:
 				/*
 				 * Our only valid extents must have been busy.
 				 * Make it unbusy by forcing the log out and
-				 * retrying. If we've been here before, forcing
-				 * the log isn't making the extents available,
-				 * which means they have probably been freed in
-				 * this transaction.  In that case, we have to
-				 * give up on them and we'll attempt a minlen
-				 * allocation the next time around.
+				 * retrying.
 				 */
 				xfs_btree_del_cursor(cnt_cur,
 						     XFS_BTREE_NOERROR);
 				trace_xfs_alloc_size_busy(args);
-				if (!forced++)
-					xfs_log_force(args->mp, XFS_LOG_SYNC);
+				xfs_extent_busy_flush(args->mp,
+							args->pag, busy_gen);
 				goto restart;
 			}
 		}
@@ -1513,8 +1459,8 @@ restart:
 			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 			if (flen < bestrlen)
 				break;
-			xfs_alloc_compute_aligned(args, fbno, flen,
-						  &rbno, &rlen);
+			busy = xfs_alloc_compute_aligned(args, fbno, flen,
+					&rbno, &rlen, &busy_gen);
 			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
 			XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 ||
 				(rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1543,18 +1489,16 @@ restart:
 	 */
 	args->len = rlen;
 	if (rlen < args->minlen) {
-		if (!forced++) {
+		if (busy) {
 			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
 			trace_xfs_alloc_size_busy(args);
-			xfs_log_force(args->mp, XFS_LOG_SYNC);
+			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
 			goto restart;
 		}
 		goto out_nominleft;
 	}
 	xfs_alloc_fix_len(args);
 
-	if (!xfs_alloc_fix_minleft(args))
-		goto out_nominleft;
 	rlen = args->len;
 	XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
 	/*
@@ -2056,7 +2000,7 @@ xfs_alloc_space_available(
 	int			flags)
 {
 	struct xfs_perag	*pag = args->pag;
-	xfs_extlen_t		longest;
+	xfs_extlen_t		alloc_len, longest;
 	xfs_extlen_t		reservation; /* blocks that are still reserved */
 	int			available;
 
@@ -2066,17 +2010,28 @@ xfs_alloc_space_available(
 	reservation = xfs_ag_resv_needed(pag, args->resv);
 
 	/* do we have enough contiguous free space for the allocation? */
+	alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
 	longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
 			reservation);
-	if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
+	if (longest < alloc_len)
 		return false;
 
 	/* do we have enough free space remaining for the allocation? */
 	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
-			  reservation - min_free - args->total);
-	if (available < (int)args->minleft || available <= 0)
+			  reservation - min_free - args->minleft);
+	if (available < (int)max(args->total, alloc_len))
 		return false;
 
+	/*
+	 * Clamp maxlen to the amount of free space available for the actual
+	 * extent allocation.
+	 */
+	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
+		args->maxlen = available;
+		ASSERT(args->maxlen > 0);
+		ASSERT(args->maxlen >= args->minlen);
+	}
+
 	return true;
 }
 
@@ -2122,7 +2077,8 @@ xfs_alloc_fix_freelist(
 	}
 
 	need = xfs_alloc_min_freelist(mp, pag);
-	if (!xfs_alloc_space_available(args, need, flags))
+	if (!xfs_alloc_space_available(args, need, flags |
+			XFS_ALLOC_FLAG_CHECK))
 		goto out_agbp_relse;
 
 	/*
@@ -2638,12 +2594,10 @@ xfs_alloc_vextent(
 	xfs_agblock_t	agsize;	/* allocation group size */
 	int		error;
 	int		flags;	/* XFS_ALLOC_FLAG_... locking flags */
-	xfs_extlen_t	minleft;/* minimum left value, temp copy */
 	xfs_mount_t	*mp;	/* mount structure pointer */
 	xfs_agnumber_t	sagno;	/* starting allocation group number */
 	xfs_alloctype_t	type;	/* input allocation type */
 	int		bump_rotor = 0;
-	int		no_min = 0;
 	xfs_agnumber_t	rotorstep = xfs_rotorstep; /* inode32 agf stepper */
 
 	mp = args->mp;
@@ -2672,7 +2626,6 @@ xfs_alloc_vextent(
 		trace_xfs_alloc_vextent_badargs(args);
 		return 0;
 	}
-	minleft = args->minleft;
 
 	switch (type) {
 	case XFS_ALLOCTYPE_THIS_AG:
@@ -2683,9 +2636,7 @@ xfs_alloc_vextent(
 		 */
 		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
 		args->pag = xfs_perag_get(mp, args->agno);
-		args->minleft = 0;
 		error = xfs_alloc_fix_freelist(args, 0);
-		args->minleft = minleft;
 		if (error) {
 			trace_xfs_alloc_vextent_nofix(args);
 			goto error0;
@@ -2713,21 +2664,11 @@ xfs_alloc_vextent(
 		args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
 		args->type = XFS_ALLOCTYPE_NEAR_BNO;
 		/* FALLTHROUGH */
-	case XFS_ALLOCTYPE_ANY_AG:
-	case XFS_ALLOCTYPE_START_AG:
 	case XFS_ALLOCTYPE_FIRST_AG:
 		/*
 		 * Rotate through the allocation groups looking for a winner.
 		 */
-		if (type == XFS_ALLOCTYPE_ANY_AG) {
-			/*
-			 * Start with the last place we left off.
-			 */
-			args->agno = sagno = (mp->m_agfrotor / rotorstep) %
-					mp->m_sb.sb_agcount;
-			args->type = XFS_ALLOCTYPE_THIS_AG;
-			flags = XFS_ALLOC_FLAG_TRYLOCK;
-		} else if (type == XFS_ALLOCTYPE_FIRST_AG) {
+		if (type == XFS_ALLOCTYPE_FIRST_AG) {
 			/*
 			 * Start with allocation group given by bno.
 			 */
@@ -2736,8 +2677,6 @@ xfs_alloc_vextent(
 			sagno = 0;
 			flags = 0;
 		} else {
-			if (type == XFS_ALLOCTYPE_START_AG)
-				args->type = XFS_ALLOCTYPE_THIS_AG;
 			/*
 			 * Start with the given allocation group.
 			 */
@@ -2750,9 +2689,7 @@ xfs_alloc_vextent(
 		 */
 		for (;;) {
 			args->pag = xfs_perag_get(mp, args->agno);
-			if (no_min) args->minleft = 0;
 			error = xfs_alloc_fix_freelist(args, flags);
-			args->minleft = minleft;
 			if (error) {
 				trace_xfs_alloc_vextent_nofix(args);
 				goto error0;
@@ -2792,25 +2729,22 @@ xfs_alloc_vextent(
 			 * or switch to non-trylock mode.
 			 */
 			if (args->agno == sagno) {
-				if (no_min == 1) {
+				if (flags == 0) {
 					args->agbno = NULLAGBLOCK;
 					trace_xfs_alloc_vextent_allfailed(args);
 					break;
 				}
-				if (flags == 0) {
-					no_min = 1;
-				} else {
-					flags = 0;
-					if (type == XFS_ALLOCTYPE_START_BNO) {
-						args->agbno = XFS_FSB_TO_AGBNO(mp,
-							args->fsbno);
-						args->type = XFS_ALLOCTYPE_NEAR_BNO;
-					}
+
+				flags = 0;
+				if (type == XFS_ALLOCTYPE_START_BNO) {
+					args->agbno = XFS_FSB_TO_AGBNO(mp,
+						args->fsbno);
+					args->type = XFS_ALLOCTYPE_NEAR_BNO;
 				}
 			}
 			xfs_perag_put(args->pag);
 		}
-		if (bump_rotor || (type == XFS_ALLOCTYPE_ANY_AG)) {
+		if (bump_rotor) {
 			if (args->agno == sagno)
 				mp->m_agfrotor = (mp->m_agfrotor + 1) %
 					(mp->m_sb.sb_agcount * rotorstep);
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 7c404a6b0ae3..2a8d0fa6fbbe 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -29,9 +29,7 @@ extern struct workqueue_struct *xfs_alloc_wq;
 /*
  * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
  */
-#define XFS_ALLOCTYPE_ANY_AG	0x01	/* allocate anywhere, use rotor */
 #define XFS_ALLOCTYPE_FIRST_AG	0x02	/* ... start at ag 0 */
-#define XFS_ALLOCTYPE_START_AG	0x04	/* anywhere, start in this a.g. */
 #define XFS_ALLOCTYPE_THIS_AG	0x08	/* anywhere in this a.g. */
 #define XFS_ALLOCTYPE_START_BNO	0x10	/* near this block else anywhere */
 #define XFS_ALLOCTYPE_NEAR_BNO	0x20	/* in this a.g. and near this block */
@@ -41,9 +39,7 @@ extern struct workqueue_struct *xfs_alloc_wq;
 typedef unsigned int xfs_alloctype_t;
 
 #define XFS_ALLOC_TYPES \
-	{ XFS_ALLOCTYPE_ANY_AG,		"ANY_AG" }, \
 	{ XFS_ALLOCTYPE_FIRST_AG,	"FIRST_AG" }, \
-	{ XFS_ALLOCTYPE_START_AG,	"START_AG" }, \
 	{ XFS_ALLOCTYPE_THIS_AG,	"THIS_AG" }, \
 	{ XFS_ALLOCTYPE_START_BNO,	"START_BNO" }, \
 	{ XFS_ALLOCTYPE_NEAR_BNO,	"NEAR_BNO" }, \
@@ -56,7 +52,7 @@ typedef unsigned int xfs_alloctype_t;
 #define	XFS_ALLOC_FLAG_FREEING	0x00000002  /* indicate caller is freeing extents*/
 #define	XFS_ALLOC_FLAG_NORMAP	0x00000004  /* don't modify the rmapbt */
 #define	XFS_ALLOC_FLAG_NOSHRINK	0x00000008  /* don't shrink the freelist */
-
+#define	XFS_ALLOC_FLAG_CHECK	0x00000010  /* test only, don't modify args */
 
 /*
  * Argument structure for xfs_alloc routines.
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index af1ecb19121e..6622d46ddec3 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -131,9 +131,6 @@ xfs_attr_get(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	if (!xfs_inode_hasattr(ip))
-		return -ENOATTR;
-
 	error = xfs_attr_args_init(&args, ip, name, flags);
 	if (error)
 		return error;
@@ -392,9 +389,6 @@ xfs_attr_remove(
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
 		return -EIO;
 
-	if (!xfs_inode_hasattr(dp))
-		return -ENOATTR;
-
 	error = xfs_attr_args_init(&args, dp, name, flags);
 	if (error)
 		return error;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 2760bc3b2536..a9c66d47757a 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -740,15 +740,9 @@ xfs_bmap_extents_to_btree(
 	 * Fill in the root.
 	 */
 	block = ifp->if_broot;
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
-				 XFS_BMAP_CRC_MAGIC, 1, 1, ip->i_ino,
-				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-	else
-		xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
-				 XFS_BMAP_MAGIC, 1, 1, ip->i_ino,
+	xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
+				 XFS_BTNUM_BMAP, 1, 1, ip->i_ino,
 				 XFS_BTREE_LONG_PTRS);
-
 	/*
 	 * Need a cursor.  Can't allocate until bb_level is filled in.
 	 */
@@ -804,9 +798,7 @@ try_another_ag:
 	 */
 	ASSERT(args.fsbno != NULLFSBLOCK);
 	ASSERT(*firstblock == NULLFSBLOCK ||
-	       args.agno == XFS_FSB_TO_AGNO(mp, *firstblock) ||
-	       (dfops->dop_low &&
-		args.agno > XFS_FSB_TO_AGNO(mp, *firstblock)));
+	       args.agno >= XFS_FSB_TO_AGNO(mp, *firstblock));
 	*firstblock = cur->bc_private.b.firstblock = args.fsbno;
 	cur->bc_private.b.allocated++;
 	ip->i_d.di_nblocks++;
@@ -817,13 +809,8 @@ try_another_ag:
 	 */
 	abp->b_ops = &xfs_bmbt_buf_ops;
 	ablock = XFS_BUF_TO_BLOCK(abp);
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
-				XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
-				XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-	else
-		xfs_btree_init_block_int(mp, ablock, abp->b_bn,
-				XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+	xfs_btree_init_block_int(mp, ablock, abp->b_bn,
+				XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
 				XFS_BTREE_LONG_PTRS);
 
 	arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
@@ -1278,7 +1265,6 @@ xfs_bmap_read_extents(
 	/* REFERENCED */
 	xfs_extnum_t		room;	/* number of entries there's room for */
 
-	bno = NULLFSBLOCK;
 	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
@@ -1291,9 +1277,7 @@ xfs_bmap_read_extents(
 	ASSERT(level > 0);
 	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
 	bno = be64_to_cpu(*pp);
-	ASSERT(bno != NULLFSBLOCK);
-	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
-	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
+
 	/*
 	 * Go down the tree until leaf level is reached, following the first
 	 * pointer (leftmost) at each level.
@@ -1864,6 +1848,7 @@ xfs_bmap_add_extent_delay_real(
 		 */
 		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
+		xfs_bmbt_set_state(ep, new->br_state);
 		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
 		(*nextents)++;
@@ -2202,6 +2187,7 @@ STATIC int				/* error */
 xfs_bmap_add_extent_unwritten_real(
 	struct xfs_trans	*tp,
 	xfs_inode_t		*ip,	/* incore inode pointer */
+	int			whichfork,
 	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_btree_cur_t		**curp,	/* if *curp is null, not a btree */
 	xfs_bmbt_irec_t		*new,	/* new data to add to file extents */
@@ -2221,12 +2207,14 @@ xfs_bmap_add_extent_unwritten_real(
 					/* left is 0, right is 1, prev is 2 */
 	int			rval=0;	/* return value (logging flags) */
 	int			state = 0;/* state bits, accessed thru macros */
-	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_mount	*mp = ip->i_mount;
 
 	*logflagsp = 0;
 
 	cur = *curp;
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
+	if (whichfork == XFS_COW_FORK)
+		state |= BMAP_COWFORK;
 
 	ASSERT(*idx >= 0);
 	ASSERT(*idx <= xfs_iext_count(ifp));
@@ -2285,7 +2273,7 @@ xfs_bmap_add_extent_unwritten_real(
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (*idx < xfs_iext_count(&ip->i_df) - 1) {
+	if (*idx < xfs_iext_count(ifp) - 1) {
 		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT);
 		if (isnullstartblock(RIGHT.br_startblock))
@@ -2325,7 +2313,8 @@ xfs_bmap_add_extent_unwritten_real(
 		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
 		xfs_iext_remove(ip, *idx + 1, 2, state);
-		ip->i_d.di_nextents -= 2;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) - 2);
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2368,7 +2357,8 @@ xfs_bmap_add_extent_unwritten_real(
 		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
 		xfs_iext_remove(ip, *idx + 1, 1, state);
-		ip->i_d.di_nextents--;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2403,7 +2393,8 @@ xfs_bmap_add_extent_unwritten_real(
 		xfs_bmbt_set_state(ep, newext);
 		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 		xfs_iext_remove(ip, *idx + 1, 1, state);
-		ip->i_d.di_nextents--;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2515,7 +2506,8 @@ xfs_bmap_add_extent_unwritten_real(
 		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
 		xfs_iext_insert(ip, *idx, 1, new, state);
-		ip->i_d.di_nextents++;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2593,7 +2585,8 @@ xfs_bmap_add_extent_unwritten_real(
 		++*idx;
 		xfs_iext_insert(ip, *idx, 1, new, state);
 
-		ip->i_d.di_nextents++;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2641,7 +2634,8 @@ xfs_bmap_add_extent_unwritten_real(
 		++*idx;
 		xfs_iext_insert(ip, *idx, 2, &r[0], state);
 
-		ip->i_d.di_nextents += 2;
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+				XFS_IFORK_NEXTENTS(ip, whichfork) + 2);
 		if (cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2695,17 +2689,17 @@ xfs_bmap_add_extent_unwritten_real(
 	}
 
 	/* update reverse mappings */
-	error = xfs_rmap_convert_extent(mp, dfops, ip, XFS_DATA_FORK, new);
+	error = xfs_rmap_convert_extent(mp, dfops, ip, whichfork, new);
 	if (error)
 		goto done;
 
 	/* convert to a btree if necessary */
-	if (xfs_bmap_needs_btree(ip, XFS_DATA_FORK)) {
+	if (xfs_bmap_needs_btree(ip, whichfork)) {
 		int	tmp_logflags;	/* partial log flag return val */
 
 		ASSERT(cur == NULL);
 		error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, &cur,
-				0, &tmp_logflags, XFS_DATA_FORK);
+				0, &tmp_logflags, whichfork);
 		*logflagsp |= tmp_logflags;
 		if (error)
 			goto done;
@@ -2717,7 +2711,7 @@ xfs_bmap_add_extent_unwritten_real(
 		*curp = cur;
 	}
 
-	xfs_bmap_check_leaf_extents(*curp, ip, XFS_DATA_FORK);
+	xfs_bmap_check_leaf_extents(*curp, ip, whichfork);
 done:
 	*logflagsp |= rval;
 	return error;
@@ -2809,7 +2803,8 @@ xfs_bmap_add_extent_hole_delay(
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock) +
 			startblockval(right.br_startblock);
-		newlen = xfs_bmap_worst_indlen(ip, temp);
+		newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+					 oldlen);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
 			nullstartblock((int)newlen));
 		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
@@ -2830,7 +2825,8 @@ xfs_bmap_add_extent_hole_delay(
 		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), temp);
 		oldlen = startblockval(left.br_startblock) +
 			startblockval(new->br_startblock);
-		newlen = xfs_bmap_worst_indlen(ip, temp);
+		newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+					 oldlen);
 		xfs_bmbt_set_startblock(xfs_iext_get_ext(ifp, *idx),
 			nullstartblock((int)newlen));
 		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
@@ -2846,7 +2842,8 @@ xfs_bmap_add_extent_hole_delay(
 		temp = new->br_blockcount + right.br_blockcount;
 		oldlen = startblockval(new->br_startblock) +
 			startblockval(right.br_startblock);
-		newlen = xfs_bmap_worst_indlen(ip, temp);
+		newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+					 oldlen);
 		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
 			new->br_startoff,
 			nullstartblock((int)newlen), temp, right.br_state);
@@ -2899,13 +2896,14 @@ xfs_bmap_add_extent_hole_real(
 	ASSERT(!isnullstartblock(new->br_startblock));
 	ASSERT(!bma->cur ||
 	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
-	ASSERT(whichfork != XFS_COW_FORK);
 
 	XFS_STATS_INC(mp, xs_add_exlist);
 
 	state = 0;
 	if (whichfork == XFS_ATTR_FORK)
 		state |= BMAP_ATTRFORK;
+	if (whichfork == XFS_COW_FORK)
+		state |= BMAP_COWFORK;
 
 	/*
 	 * Check and set flags if this segment has a left neighbor.
@@ -3629,7 +3627,7 @@ xfs_bmap_btalloc(
 		align = xfs_get_cowextsz_hint(ap->ip);
 	else if (xfs_alloc_is_userdata(ap->datatype))
 		align = xfs_get_extsz_hint(ap->ip);
-	if (unlikely(align)) {
+	if (align) {
 		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
 						align, 0, ap->eof, 0, ap->conv,
 						&ap->offset, &ap->length);
@@ -3701,7 +3699,7 @@ xfs_bmap_btalloc(
 		args.minlen = ap->minlen;
 	}
 	/* apply extent size hints if obtained earlier */
-	if (unlikely(align)) {
+	if (align) {
 		args.prod = align;
 		if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod)))
 			args.mod = (xfs_extlen_t)(args.prod - args.mod);
@@ -3812,7 +3810,6 @@ xfs_bmap_btalloc(
 		args.fsbno = 0;
 		args.type = XFS_ALLOCTYPE_FIRST_AG;
 		args.total = ap->minlen;
-		args.minleft = 0;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 		ap->dfops->dop_low = true;
@@ -3823,17 +3820,13 @@ xfs_bmap_btalloc(
 		 * the first block that was allocated.
 		 */
 		ASSERT(*ap->firstblock == NULLFSBLOCK ||
-		       XFS_FSB_TO_AGNO(mp, *ap->firstblock) ==
-		       XFS_FSB_TO_AGNO(mp, args.fsbno) ||
-		       (ap->dfops->dop_low &&
-			XFS_FSB_TO_AGNO(mp, *ap->firstblock) <
-			XFS_FSB_TO_AGNO(mp, args.fsbno)));
+		       XFS_FSB_TO_AGNO(mp, *ap->firstblock) <=
+		       XFS_FSB_TO_AGNO(mp, args.fsbno));
 
 		ap->blkno = args.fsbno;
 		if (*ap->firstblock == NULLFSBLOCK)
 			*ap->firstblock = args.fsbno;
-		ASSERT(nullfb || fb_agno == args.agno ||
-		       (ap->dfops->dop_low && fb_agno < args.agno));
+		ASSERT(nullfb || fb_agno <= args.agno);
 		ap->length = args.len;
 		if (!(ap->flags & XFS_BMAPI_COWFORK))
 			ap->ip->i_d.di_nblocks += args.len;
@@ -4344,8 +4337,6 @@ xfs_bmapi_allocate(
 	if (error)
 		return error;
 
-	if (bma->dfops->dop_low)
-		bma->minleft = 0;
 	if (bma->cur)
 		bma->cur->bc_private.b.firstblock = *bma->firstblock;
 	if (bma->blkno == NULLFSBLOCK)
@@ -4371,10 +4362,16 @@ xfs_bmapi_allocate(
 	bma->got.br_state = XFS_EXT_NORM;
 
 	/*
-	 * A wasdelay extent has been initialized, so shouldn't be flagged
-	 * as unwritten.
+	 * In the data fork, a wasdelay extent has been initialized, so
+	 * shouldn't be flagged as unwritten.
+	 *
+	 * For the cow fork, however, we convert delalloc reservations
+	 * (extents allocated for speculative preallocation) to
+	 * allocated unwritten extents, and only convert the unwritten
+	 * extents to real extents when we're about to write the data.
 	 */
-	if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
+	if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
+	    (bma->flags & XFS_BMAPI_PREALLOC) &&
 	    xfs_sb_version_hasextflgbit(&mp->m_sb))
 		bma->got.br_state = XFS_EXT_UNWRITTEN;
 
@@ -4425,8 +4422,6 @@ xfs_bmapi_convert_unwritten(
 			(XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
 		return 0;
 
-	ASSERT(whichfork != XFS_COW_FORK);
-
 	/*
 	 * Modify (by adding) the state flag, if writing.
 	 */
@@ -4451,8 +4446,8 @@ xfs_bmapi_convert_unwritten(
 			return error;
 	}
 
-	error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
-			&bma->cur, mval, bma->firstblock, bma->dfops,
+	error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, whichfork,
+			&bma->idx, &bma->cur, mval, bma->firstblock, bma->dfops,
 			&tmp_logflags);
 	/*
 	 * Log the inode core unconditionally in the unwritten extent conversion
@@ -4461,8 +4456,12 @@ xfs_bmapi_convert_unwritten(
 	 * in the transaction for the sake of fsync(), even if nothing has
 	 * changed, because fsync() will not force the log for this transaction
 	 * unless it sees the inode pinned.
+	 *
+	 * Note: If we're only converting cow fork extents, there aren't
+	 * any on-disk updates to make, so we don't need to log anything.
 	 */
-	bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
+	if (whichfork != XFS_COW_FORK)
+		bma->logflags |= tmp_logflags | XFS_ILOG_CORE;
 	if (error)
 		return error;
 
@@ -4517,8 +4516,6 @@ xfs_bmapi_write(
 	int			n;		/* current extent index */
 	xfs_fileoff_t		obno;		/* old block number (offset) */
 	int			whichfork;	/* data or attr fork */
-	char			inhole;		/* current location is hole in file */
-	char			wasdelay;	/* old extent was delayed */
 
 #ifdef DEBUG
 	xfs_fileoff_t		orig_bno;	/* original block number value */
@@ -4538,15 +4535,15 @@ xfs_bmapi_write(
 	ASSERT(*nmap >= 1);
 	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
 	ASSERT(!(flags & XFS_BMAPI_IGSTATE));
-	ASSERT(tp != NULL);
+	ASSERT(tp != NULL ||
+	       (flags & (XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK)) ==
+			(XFS_BMAPI_CONVERT | XFS_BMAPI_COWFORK));
 	ASSERT(len > 0);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
 	ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
 	ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
-	ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK);
-	ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK);
 
 	/* zeroing is for currently only for data extents, not metadata */
 	ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
@@ -4606,22 +4603,44 @@ xfs_bmapi_write(
 	bma.firstblock = firstblock;
 
 	while (bno < end && n < *nmap) {
-		inhole = eof || bma.got.br_startoff > bno;
-		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
+		bool			need_alloc = false, wasdelay = false;
 
-		/*
-		 * Make sure we only reflink into a hole.
-		 */
-		if (flags & XFS_BMAPI_REMAP)
-			ASSERT(inhole);
-		if (flags & XFS_BMAPI_COWFORK)
-			ASSERT(!inhole);
+		/* in hole or beyoned EOF? */
+		if (eof || bma.got.br_startoff > bno) {
+			if (flags & XFS_BMAPI_DELALLOC) {
+				/*
+				 * For the COW fork we can reasonably get a
+				 * request for converting an extent that races
+				 * with other threads already having converted
+				 * part of it, as there converting COW to
+				 * regular blocks is not protected using the
+				 * IOLOCK.
+				 */
+				ASSERT(flags & XFS_BMAPI_COWFORK);
+				if (!(flags & XFS_BMAPI_COWFORK)) {
+					error = -EIO;
+					goto error0;
+				}
+
+				if (eof || bno >= end)
+					break;
+			} else {
+				need_alloc = true;
+			}
+		} else {
+			/*
+			 * Make sure we only reflink into a hole.
+			 */
+			ASSERT(!(flags & XFS_BMAPI_REMAP));
+			if (isnullstartblock(bma.got.br_startblock))
+				wasdelay = true;
+		}
 
 		/*
 		 * First, deal with the hole before the allocated space
 		 * that we found, if any.
 		 */
-		if (inhole || wasdelay) {
+		if (need_alloc || wasdelay) {
 			bma.eof = eof;
 			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
 			bma.wasdel = wasdelay;
@@ -4729,13 +4748,9 @@ error0:
 	if (bma.cur) {
 		if (!error) {
 			ASSERT(*firstblock == NULLFSBLOCK ||
-			       XFS_FSB_TO_AGNO(mp, *firstblock) ==
+			       XFS_FSB_TO_AGNO(mp, *firstblock) <=
 			       XFS_FSB_TO_AGNO(mp,
-				       bma.cur->bc_private.b.firstblock) ||
-			       (dfops->dop_low &&
-				XFS_FSB_TO_AGNO(mp, *firstblock) <
-				XFS_FSB_TO_AGNO(mp,
-					bma.cur->bc_private.b.firstblock)));
+				       bma.cur->bc_private.b.firstblock));
 			*firstblock = bma.cur->bc_private.b.firstblock;
 		}
 		xfs_btree_del_cursor(bma.cur,
@@ -4770,34 +4785,59 @@ xfs_bmap_split_indlen(
 	xfs_filblks_t			len2 = *indlen2;
 	xfs_filblks_t			nres = len1 + len2; /* new total res. */
 	xfs_filblks_t			stolen = 0;
+	xfs_filblks_t			resfactor;
 
 	/*
 	 * Steal as many blocks as we can to try and satisfy the worst case
 	 * indlen for both new extents.
 	 */
-	while (nres > ores && avail) {
-		nres--;
-		avail--;
-		stolen++;
-	}
+	if (ores < nres && avail)
+		stolen = XFS_FILBLKS_MIN(nres - ores, avail);
+	ores += stolen;
+
+	 /* nothing else to do if we've satisfied the new reservation */
+	if (ores >= nres)
+		return stolen;
+
+	/*
+	 * We can't meet the total required reservation for the two extents.
+	 * Calculate the percent of the overall shortage between both extents
+	 * and apply this percentage to each of the requested indlen values.
+	 * This distributes the shortage fairly and reduces the chances that one
+	 * of the two extents is left with nothing when extents are repeatedly
+	 * split.
+	 */
+	resfactor = (ores * 100);
+	do_div(resfactor, nres);
+	len1 *= resfactor;
+	do_div(len1, 100);
+	len2 *= resfactor;
+	do_div(len2, 100);
+	ASSERT(len1 + len2 <= ores);
+	ASSERT(len1 < *indlen1 && len2 < *indlen2);
 
 	/*
-	 * The only blocks available are those reserved for the original
-	 * extent and what we can steal from the extent being removed.
-	 * If this still isn't enough to satisfy the combined
-	 * requirements for the two new extents, skim blocks off of each
-	 * of the new reservations until they match what is available.
+	 * Hand out the remainder to each extent. If one of the two reservations
+	 * is zero, we want to make sure that one gets a block first. The loop
+	 * below starts with len1, so hand len2 a block right off the bat if it
+	 * is zero.
 	 */
-	while (nres > ores) {
-		if (len1) {
-			len1--;
-			nres--;
+	ores -= (len1 + len2);
+	ASSERT((*indlen1 - len1) + (*indlen2 - len2) >= ores);
+	if (ores && !len2 && *indlen2) {
+		len2++;
+		ores--;
+	}
+	while (ores) {
+		if (len1 < *indlen1) {
+			len1++;
+			ores--;
 		}
-		if (nres == ores)
+		if (!ores)
 			break;
-		if (len2) {
-			len2--;
-			nres--;
+		if (len2 < *indlen2) {
+			len2++;
+			ores--;
 		}
 	}
 
@@ -5539,8 +5579,8 @@ __xfs_bunmapi(
 			}
 			del.br_state = XFS_EXT_UNWRITTEN;
 			error = xfs_bmap_add_extent_unwritten_real(tp, ip,
-					&lastx, &cur, &del, firstblock, dfops,
-					&logflags);
+					whichfork, &lastx, &cur, &del,
+					firstblock, dfops, &logflags);
 			if (error)
 				goto error0;
 			goto nodelete;
@@ -5593,8 +5633,9 @@ __xfs_bunmapi(
 				prev.br_state = XFS_EXT_UNWRITTEN;
 				lastx--;
 				error = xfs_bmap_add_extent_unwritten_real(tp,
-						ip, &lastx, &cur, &prev,
-						firstblock, dfops, &logflags);
+						ip, whichfork, &lastx, &cur,
+						&prev, firstblock, dfops,
+						&logflags);
 				if (error)
 					goto error0;
 				goto nodelete;
@@ -5602,8 +5643,9 @@ __xfs_bunmapi(
 				ASSERT(del.br_state == XFS_EXT_NORM);
 				del.br_state = XFS_EXT_UNWRITTEN;
 				error = xfs_bmap_add_extent_unwritten_real(tp,
-						ip, &lastx, &cur, &del,
-						firstblock, dfops, &logflags);
+						ip, whichfork, &lastx, &cur,
+						&del, firstblock, dfops,
+						&logflags);
 				if (error)
 					goto error0;
 				goto nodelete;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index cecd094404cc..cdef87db5262 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -110,6 +110,9 @@ struct xfs_extent_free_item
 /* Map something in the CoW fork. */
 #define XFS_BMAPI_COWFORK	0x200
 
+/* Only convert delalloc space, don't allocate entirely new extents */
+#define XFS_BMAPI_DELALLOC	0x400
+
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
@@ -120,7 +123,8 @@ struct xfs_extent_free_item
 	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
 	{ XFS_BMAPI_ZERO,	"ZERO" }, \
 	{ XFS_BMAPI_REMAP,	"REMAP" }, \
-	{ XFS_BMAPI_COWFORK,	"COWFORK" }
+	{ XFS_BMAPI_COWFORK,	"COWFORK" }, \
+	{ XFS_BMAPI_DELALLOC,	"DELALLOC" }
 
 
 static inline int xfs_bmapi_aflag(int w)
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index d6330c297ca0..f93072b58a58 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -71,15 +71,9 @@ xfs_bmdr_to_bmbt(
 	xfs_bmbt_key_t		*tkp;
 	__be64			*tpp;
 
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
-				 XFS_BMAP_CRC_MAGIC, 0, 0, ip->i_ino,
-				 XFS_BTREE_LONG_PTRS | XFS_BTREE_CRC_BLOCKS);
-	else
-		xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
-				 XFS_BMAP_MAGIC, 0, 0, ip->i_ino,
+	xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
+				 XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
 				 XFS_BTREE_LONG_PTRS);
-
 	rblock->bb_level = dblock->bb_level;
 	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
 	rblock->bb_numrecs = dblock->bb_numrecs;
@@ -502,12 +496,11 @@ try_another_ag:
 	if (args.fsbno == NULLFSBLOCK && args.minleft) {
 		/*
 		 * Could not find an AG with enough free space to satisfy
-		 * a full btree split.  Try again without minleft and if
+		 * a full btree split.  Try again and if
 		 * successful activate the lowspace algorithm.
 		 */
 		args.fsbno = 0;
 		args.type = XFS_ALLOCTYPE_FIRST_AG;
-		args.minleft = 0;
 		error = xfs_alloc_vextent(&args);
 		if (error)
 			goto error0;
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 21e6a6ab6b9a..c3decedc9455 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -50,8 +50,18 @@ static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
 	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
 	  XFS_REFC_CRC_MAGIC }
 };
-#define xfs_btree_magic(cur) \
-	xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
+
+__uint32_t
+xfs_btree_magic(
+	int			crc,
+	xfs_btnum_t		btnum)
+{
+	__uint32_t		magic = xfs_magics[crc][btnum];
+
+	/* Ensure we asked for crc for crc-only magics. */
+	ASSERT(magic != 0);
+	return magic;
+}
 
 STATIC int				/* error (0 or EFSCORRUPTED) */
 xfs_btree_check_lblock(
@@ -62,10 +72,13 @@ xfs_btree_check_lblock(
 {
 	int			lblock_ok = 1; /* block passes checks */
 	struct xfs_mount	*mp;	/* file system mount point */
+	xfs_btnum_t		btnum = cur->bc_btnum;
+	int			crc;
 
 	mp = cur->bc_mp;
+	crc = xfs_sb_version_hascrc(&mp->m_sb);
 
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+	if (crc) {
 		lblock_ok = lblock_ok &&
 			uuid_equal(&block->bb_u.l.bb_uuid,
 				   &mp->m_sb.sb_meta_uuid) &&
@@ -74,7 +87,7 @@ xfs_btree_check_lblock(
 	}
 
 	lblock_ok = lblock_ok &&
-		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+		be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) &&
 		be16_to_cpu(block->bb_level) == level &&
 		be16_to_cpu(block->bb_numrecs) <=
 			cur->bc_ops->get_maxrecs(cur, level) &&
@@ -110,13 +123,16 @@ xfs_btree_check_sblock(
 	struct xfs_agf		*agf;	/* ag. freespace structure */
 	xfs_agblock_t		agflen;	/* native ag. freespace length */
 	int			sblock_ok = 1; /* block passes checks */
+	xfs_btnum_t		btnum = cur->bc_btnum;
+	int			crc;
 
 	mp = cur->bc_mp;
+	crc = xfs_sb_version_hascrc(&mp->m_sb);
 	agbp = cur->bc_private.a.agbp;
 	agf = XFS_BUF_TO_AGF(agbp);
 	agflen = be32_to_cpu(agf->agf_length);
 
-	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+	if (crc) {
 		sblock_ok = sblock_ok &&
 			uuid_equal(&block->bb_u.s.bb_uuid,
 				   &mp->m_sb.sb_meta_uuid) &&
@@ -125,7 +141,7 @@ xfs_btree_check_sblock(
 	}
 
 	sblock_ok = sblock_ok &&
-		be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
+		be32_to_cpu(block->bb_magic) == xfs_btree_magic(crc, btnum) &&
 		be16_to_cpu(block->bb_level) == level &&
 		be16_to_cpu(block->bb_numrecs) <=
 			cur->bc_ops->get_maxrecs(cur, level) &&
@@ -810,7 +826,8 @@ xfs_btree_read_bufl(
 	xfs_daddr_t		d;		/* real disk block address */
 	int			error;
 
-	ASSERT(fsbno != NULLFSBLOCK);
+	if (!XFS_FSB_SANITY_CHECK(mp, fsbno))
+		return -EFSCORRUPTED;
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
 				   mp->m_bsize, lock, &bp, ops);
@@ -1084,12 +1101,15 @@ xfs_btree_init_block_int(
 	struct xfs_mount	*mp,
 	struct xfs_btree_block	*buf,
 	xfs_daddr_t		blkno,
-	__u32			magic,
+	xfs_btnum_t		btnum,
 	__u16			level,
 	__u16			numrecs,
 	__u64			owner,
 	unsigned int		flags)
 {
+	int			crc = xfs_sb_version_hascrc(&mp->m_sb);
+	__u32			magic = xfs_btree_magic(crc, btnum);
+
 	buf->bb_magic = cpu_to_be32(magic);
 	buf->bb_level = cpu_to_be16(level);
 	buf->bb_numrecs = cpu_to_be16(numrecs);
@@ -1097,7 +1117,7 @@ xfs_btree_init_block_int(
 	if (flags & XFS_BTREE_LONG_PTRS) {
 		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
 		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
-		if (flags & XFS_BTREE_CRC_BLOCKS) {
+		if (crc) {
 			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
 			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
 			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
@@ -1110,7 +1130,7 @@ xfs_btree_init_block_int(
 
 		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
 		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-		if (flags & XFS_BTREE_CRC_BLOCKS) {
+		if (crc) {
 			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
 			buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
 			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
@@ -1123,14 +1143,14 @@ void
 xfs_btree_init_block(
 	struct xfs_mount *mp,
 	struct xfs_buf	*bp,
-	__u32		magic,
+	xfs_btnum_t	btnum,
 	__u16		level,
 	__u16		numrecs,
 	__u64		owner,
 	unsigned int	flags)
 {
 	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
-				 magic, level, numrecs, owner, flags);
+				 btnum, level, numrecs, owner, flags);
 }
 
 STATIC void
@@ -1140,7 +1160,7 @@ xfs_btree_init_block_cur(
 	int			level,
 	int			numrecs)
 {
-	__u64 owner;
+	__u64			owner;
 
 	/*
 	 * we can pull the owner from the cursor right now as the different
@@ -1154,7 +1174,7 @@ xfs_btree_init_block_cur(
 		owner = cur->bc_private.a.agno;
 
 	xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
-				 xfs_btree_magic(cur), level, numrecs,
+				 cur->bc_btnum, level, numrecs,
 				 owner, cur->bc_flags);
 }
 
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index b69b947c4c1b..4bb62580a7fd 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -76,6 +76,8 @@ union xfs_btree_rec {
 #define	XFS_BTNUM_RMAP	((xfs_btnum_t)XFS_BTNUM_RMAPi)
 #define	XFS_BTNUM_REFC	((xfs_btnum_t)XFS_BTNUM_REFCi)
 
+__uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
+
 /*
  * For logging record fields.
  */
@@ -378,7 +380,7 @@ void
 xfs_btree_init_block(
 	struct xfs_mount *mp,
 	struct xfs_buf	*bp,
-	__u32		magic,
+	xfs_btnum_t	btnum,
 	__u16		level,
 	__u16		numrecs,
 	__u64		owner,
@@ -389,7 +391,7 @@ xfs_btree_init_block_int(
 	struct xfs_mount	*mp,
 	struct xfs_btree_block	*buf,
 	xfs_daddr_t		blkno,
-	__u32			magic,
+	xfs_btnum_t		btnum,
 	__u16			level,
 	__u16			numrecs,
 	__u64			owner,
@@ -456,7 +458,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
 #define	XFS_FILBLKS_MAX(a,b)	max_t(xfs_filblks_t, (a), (b))
 
 #define	XFS_FSB_SANITY_CHECK(mp,fsb)	\
-	(XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
+	(fsb && XFS_FSB_TO_AGNO(mp, fsb) < mp->m_sb.sb_agcount && \
 		XFS_FSB_TO_AGBNO(mp, fsb) < mp->m_sb.sb_agblocks)
 
 /*
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index f2dc1a950c85..1bdf2888295b 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2633,7 +2633,7 @@ out_free:
 /*
  * Readahead the dir/attr block.
  */
-xfs_daddr_t
+int
 xfs_da_reada_buf(
 	struct xfs_inode	*dp,
 	xfs_dablk_t		bno,
@@ -2664,7 +2664,5 @@ out_free:
 	if (mapp != &map)
 		kmem_free(mapp);
 
-	if (error)
-		return -1;
-	return mappedbno;
+	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 98c75cbe6ac2..4e29cb6a3627 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -201,7 +201,7 @@ int	xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
 			       xfs_dablk_t bno, xfs_daddr_t mappedbno,
 			       struct xfs_buf **bpp, int whichfork,
 			       const struct xfs_buf_ops *ops);
-xfs_daddr_t	xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
+int	xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
 				xfs_daddr_t mapped_bno, int whichfork,
 				const struct xfs_buf_ops *ops);
 int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index c58d72c220f5..2f389d366e93 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -36,21 +36,29 @@
 struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR };
 
 /*
- * @mode, if set, indicates that the type field needs to be set up.
- * This uses the transformation from file mode to DT_* as defined in linux/fs.h
- * for file type specification. This will be propagated into the directory
- * structure if appropriate for the given operation and filesystem config.
+ * Convert inode mode to directory entry filetype
  */
-const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = {
-	[0]			= XFS_DIR3_FT_UNKNOWN,
-	[S_IFREG >> S_SHIFT]    = XFS_DIR3_FT_REG_FILE,
-	[S_IFDIR >> S_SHIFT]    = XFS_DIR3_FT_DIR,
-	[S_IFCHR >> S_SHIFT]    = XFS_DIR3_FT_CHRDEV,
-	[S_IFBLK >> S_SHIFT]    = XFS_DIR3_FT_BLKDEV,
-	[S_IFIFO >> S_SHIFT]    = XFS_DIR3_FT_FIFO,
-	[S_IFSOCK >> S_SHIFT]   = XFS_DIR3_FT_SOCK,
-	[S_IFLNK >> S_SHIFT]    = XFS_DIR3_FT_SYMLINK,
-};
+unsigned char xfs_mode_to_ftype(int mode)
+{
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		return XFS_DIR3_FT_REG_FILE;
+	case S_IFDIR:
+		return XFS_DIR3_FT_DIR;
+	case S_IFCHR:
+		return XFS_DIR3_FT_CHRDEV;
+	case S_IFBLK:
+		return XFS_DIR3_FT_BLKDEV;
+	case S_IFIFO:
+		return XFS_DIR3_FT_FIFO;
+	case S_IFSOCK:
+		return XFS_DIR3_FT_SOCK;
+	case S_IFLNK:
+		return XFS_DIR3_FT_SYMLINK;
+	default:
+		return XFS_DIR3_FT_UNKNOWN;
+	}
+}
 
 /*
  * ASCII case-insensitive (ie. A-Z) support for directories that was
@@ -631,7 +639,8 @@ xfs_dir2_isblock(
 	if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
 		return rval;
 	rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
-	ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize);
+	if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize)
+		return -EFSCORRUPTED;
 	*vp = rval;
 	return 0;
 }
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 0197590fa7d7..d6e6d9d16f6c 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -18,6 +18,9 @@
 #ifndef __XFS_DIR2_H__
 #define __XFS_DIR2_H__
 
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+
 struct xfs_defer_ops;
 struct xfs_da_args;
 struct xfs_inode;
@@ -32,10 +35,9 @@ struct xfs_dir2_data_unused;
 extern struct xfs_name	xfs_name_dotdot;
 
 /*
- * directory filetype conversion tables.
+ * Convert inode mode to directory entry filetype
  */
-#define S_SHIFT 12
-extern const unsigned char xfs_mode_to_ftype[];
+extern unsigned char xfs_mode_to_ftype(int mode);
 
 /*
  * directory operations vector for encode/decode routines
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 75a557432d0f..bbd1238852b3 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -155,6 +155,42 @@ const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
 	.verify_write = xfs_dir3_free_write_verify,
 };
 
+/* Everything ok in the free block header? */
+static bool
+xfs_dir3_free_header_check(
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+	unsigned int		firstdb;
+	int			maxbests;
+
+	maxbests = dp->d_ops->free_max_bests(mp->m_dir_geo);
+	firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) -
+		   xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) *
+			maxbests;
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		struct xfs_dir3_free_hdr *hdr3 = bp->b_addr;
+
+		if (be32_to_cpu(hdr3->firstdb) != firstdb)
+			return false;
+		if (be32_to_cpu(hdr3->nvalid) > maxbests)
+			return false;
+		if (be32_to_cpu(hdr3->nvalid) < be32_to_cpu(hdr3->nused))
+			return false;
+	} else {
+		struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+
+		if (be32_to_cpu(hdr->firstdb) != firstdb)
+			return false;
+		if (be32_to_cpu(hdr->nvalid) > maxbests)
+			return false;
+		if (be32_to_cpu(hdr->nvalid) < be32_to_cpu(hdr->nused))
+			return false;
+	}
+	return true;
+}
 
 static int
 __xfs_dir3_free_read(
@@ -168,11 +204,22 @@ __xfs_dir3_free_read(
 
 	err = xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
 				XFS_DATA_FORK, &xfs_dir3_free_buf_ops);
+	if (err || !*bpp)
+		return err;
+
+	/* Check things that we can't do in the verifier. */
+	if (!xfs_dir3_free_header_check(dp, fbno, *bpp)) {
+		xfs_buf_ioerror(*bpp, -EFSCORRUPTED);
+		xfs_verifier_error(*bpp);
+		xfs_trans_brelse(tp, *bpp);
+		return -EFSCORRUPTED;
+	}
 
 	/* try read returns without an error or *bpp if it lands in a hole */
-	if (!err && tp && *bpp)
+	if (tp)
 		xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_FREE_BUF);
-	return err;
+
+	return 0;
 }
 
 int
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index f272abff11e1..d41ade5d293e 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -51,8 +51,7 @@ xfs_ialloc_cluster_alignment(
 	struct xfs_mount	*mp)
 {
 	if (xfs_sb_version_hasalign(&mp->m_sb) &&
-	    mp->m_sb.sb_inoalignmt >=
-			XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
+	    mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
 		return mp->m_sb.sb_inoalignmt;
 	return 1;
 }
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 0fd086d03d41..7c471881c9a6 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -82,11 +82,12 @@ xfs_finobt_set_root(
 }
 
 STATIC int
-xfs_inobt_alloc_block(
+__xfs_inobt_alloc_block(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_ptr	*start,
 	union xfs_btree_ptr	*new,
-	int			*stat)
+	int			*stat,
+	enum xfs_ag_resv_type	resv)
 {
 	xfs_alloc_arg_t		args;		/* block allocation args */
 	int			error;		/* error return value */
@@ -103,6 +104,7 @@ xfs_inobt_alloc_block(
 	args.maxlen = 1;
 	args.prod = 1;
 	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	args.resv = resv;
 
 	error = xfs_alloc_vextent(&args);
 	if (error) {
@@ -123,6 +125,27 @@ xfs_inobt_alloc_block(
 }
 
 STATIC int
+xfs_inobt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
+}
+
+STATIC int
+xfs_finobt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	return __xfs_inobt_alloc_block(cur, start, new, stat,
+			XFS_AG_RESV_METADATA);
+}
+
+STATIC int
 xfs_inobt_free_block(
 	struct xfs_btree_cur	*cur,
 	struct xfs_buf		*bp)
@@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
 
 	.dup_cursor		= xfs_inobt_dup_cursor,
 	.set_root		= xfs_finobt_set_root,
-	.alloc_block		= xfs_inobt_alloc_block,
+	.alloc_block		= xfs_finobt_alloc_block,
 	.free_block		= xfs_inobt_free_block,
 	.get_minrecs		= xfs_inobt_get_minrecs,
 	.get_maxrecs		= xfs_inobt_get_maxrecs,
@@ -480,3 +503,64 @@ xfs_inobt_rec_check_count(
 	return 0;
 }
 #endif	/* DEBUG */
+
+static xfs_extlen_t
+xfs_inobt_max_size(
+	struct xfs_mount	*mp)
+{
+	/* Bail out if we're uninitialized, which can happen in mkfs. */
+	if (mp->m_inobt_mxr[0] == 0)
+		return 0;
+
+	return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
+		(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
+				XFS_INODES_PER_CHUNK);
+}
+
+static int
+xfs_inobt_count_blocks(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_btnum_t		btnum,
+	xfs_extlen_t		*tree_blocks)
+{
+	struct xfs_buf		*agbp;
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
+	if (error)
+		return error;
+
+	cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
+	error = xfs_btree_count_blocks(cur, tree_blocks);
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	xfs_buf_relse(agbp);
+
+	return error;
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_finobt_calc_reserves(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_extlen_t		*ask,
+	xfs_extlen_t		*used)
+{
+	xfs_extlen_t		tree_len = 0;
+	int			error;
+
+	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
+		return 0;
+
+	error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
+	if (error)
+		return error;
+
+	*ask += xfs_inobt_max_size(mp);
+	*used += tree_len;
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index bd88453217ce..aa81e2e63f3f 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
 #define xfs_inobt_rec_check_count(mp, rec)	0
 #endif	/* DEBUG */
 
+int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
+		xfs_extlen_t *ask, xfs_extlen_t *used);
+
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index dd483e2767f7..d93f9d918cfc 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -29,6 +29,7 @@
 #include "xfs_icache.h"
 #include "xfs_trans.h"
 #include "xfs_ialloc.h"
+#include "xfs_dir2.h"
 
 /*
  * Check that none of the inode's in the buffer have a next
@@ -386,6 +387,7 @@ xfs_dinode_verify(
 	xfs_ino_t		ino,
 	struct xfs_dinode	*dip)
 {
+	uint16_t		mode;
 	uint16_t		flags;
 	uint64_t		flags2;
 
@@ -396,8 +398,12 @@ xfs_dinode_verify(
 	if (be64_to_cpu(dip->di_size) & (1ULL << 63))
 		return false;
 
-	/* No zero-length symlinks. */
-	if (S_ISLNK(be16_to_cpu(dip->di_mode)) && dip->di_size == 0)
+	mode = be16_to_cpu(dip->di_mode);
+	if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN)
+		return false;
+
+	/* No zero-length symlinks/dirs. */
+	if ((S_ISLNK(mode) || S_ISDIR(mode)) && dip->di_size == 0)
 		return false;
 
 	/* only version 3 or greater inodes are extensively verified here */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 222e103356c6..25c1e078aef6 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -26,6 +26,7 @@
 #include "xfs_inode.h"
 #include "xfs_trans.h"
 #include "xfs_inode_item.h"
+#include "xfs_btree.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_bmap.h"
 #include "xfs_error.h"
@@ -429,11 +430,13 @@ xfs_iformat_btree(
 	/* REFERENCED */
 	int			nrecs;
 	int			size;
+	int			level;
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
 	size = XFS_BMAP_BROOT_SPACE(mp, dfp);
 	nrecs = be16_to_cpu(dfp->bb_numrecs);
+	level = be16_to_cpu(dfp->bb_level);
 
 	/*
 	 * blow out if -- fork has less extents than can fit in
@@ -446,7 +449,8 @@ xfs_iformat_btree(
 					XFS_IFORK_MAXEXT(ip, whichfork) ||
 		     XFS_BMDR_SPACE_CALC(nrecs) >
 					XFS_DFORK_SIZE(dip, mp, whichfork) ||
-		     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
+		     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks) ||
+		     level == 0 || level > XFS_BTREE_MAXLEVELS) {
 		xfs_warn(mp, "corrupt inode %Lu (btree).",
 					(unsigned long long) ip->i_ino);
 		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
@@ -497,15 +501,14 @@ xfs_iread_extents(
 	 * We know that the size is valid (it's checked in iformat_btree)
 	 */
 	ifp->if_bytes = ifp->if_real_bytes = 0;
-	ifp->if_flags |= XFS_IFEXTENTS;
 	xfs_iext_add(ifp, 0, nextents);
 	error = xfs_bmap_read_extents(tp, ip, whichfork);
 	if (error) {
 		xfs_iext_destroy(ifp);
-		ifp->if_flags &= ~XFS_IFEXTENTS;
 		return error;
 	}
 	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
+	ifp->if_flags |= XFS_IFEXTENTS;
 	return 0;
 }
 /*
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index d9f65e2d5cc8..29a01ec89dd0 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -42,7 +42,6 @@ typedef struct xlog_recover_item {
 	xfs_log_iovec_t		*ri_buf;	/* ptr to regions buffer */
 } xlog_recover_item_t;
 
-struct xlog_tid;
 typedef struct xlog_recover {
 	struct hlist_node	r_list;
 	xlog_tid_t		r_log_tid;	/* log's transaction id */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 6fb2215f8ff7..50add5272807 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -409,13 +409,14 @@ xfs_refcountbt_calc_size(
  */
 xfs_extlen_t
 xfs_refcountbt_max_size(
-	struct xfs_mount	*mp)
+	struct xfs_mount	*mp,
+	xfs_agblock_t		agblocks)
 {
 	/* Bail out if we're uninitialized, which can happen in mkfs. */
 	if (mp->m_refc_mxr[0] == 0)
 		return 0;
 
-	return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks);
+	return xfs_refcountbt_calc_size(mp, agblocks);
 }
 
 /*
@@ -430,22 +431,24 @@ xfs_refcountbt_calc_reserves(
 {
 	struct xfs_buf		*agbp;
 	struct xfs_agf		*agf;
+	xfs_agblock_t		agblocks;
 	xfs_extlen_t		tree_len;
 	int			error;
 
 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
 		return 0;
 
-	*ask += xfs_refcountbt_max_size(mp);
 
 	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
 	if (error)
 		return error;
 
 	agf = XFS_BUF_TO_AGF(agbp);
+	agblocks = be32_to_cpu(agf->agf_length);
 	tree_len = be32_to_cpu(agf->agf_refcount_blocks);
 	xfs_buf_relse(agbp);
 
+	*ask += xfs_refcountbt_max_size(mp, agblocks);
 	*used += tree_len;
 
 	return error;
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index 3be7768bd51a..9db008b955b7 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -66,7 +66,8 @@ extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
 
 extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
-extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp);
+extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp,
+		xfs_agblock_t agblocks);
 
 extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
 		xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index de25771764ba..74e5a54bc428 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -550,13 +550,14 @@ xfs_rmapbt_calc_size(
  */
 xfs_extlen_t
 xfs_rmapbt_max_size(
-	struct xfs_mount	*mp)
+	struct xfs_mount	*mp,
+	xfs_agblock_t		agblocks)
 {
 	/* Bail out if we're uninitialized, which can happen in mkfs. */
 	if (mp->m_rmap_mxr[0] == 0)
 		return 0;
 
-	return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks);
+	return xfs_rmapbt_calc_size(mp, agblocks);
 }
 
 /*
@@ -571,25 +572,24 @@ xfs_rmapbt_calc_reserves(
 {
 	struct xfs_buf		*agbp;
 	struct xfs_agf		*agf;
-	xfs_extlen_t		pool_len;
+	xfs_agblock_t		agblocks;
 	xfs_extlen_t		tree_len;
 	int			error;
 
 	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
 		return 0;
 
-	/* Reserve 1% of the AG or enough for 1 block per record. */
-	pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp));
-	*ask += pool_len;
-
 	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
 	if (error)
 		return error;
 
 	agf = XFS_BUF_TO_AGF(agbp);
+	agblocks = be32_to_cpu(agf->agf_length);
 	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
 	xfs_buf_relse(agbp);
 
+	/* Reserve 1% of the AG or enough for 1 block per record. */
+	*ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
 	*used += tree_len;
 
 	return error;
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 2a9ac472fb15..19c08e933049 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -60,7 +60,8 @@ extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
 
 extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
-extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp);
+extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp,
+		xfs_agblock_t agblocks);
 
 extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp,
 		xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 2580262e4ea0..584ec896a533 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -242,7 +242,7 @@ xfs_mount_validate_sb(
 	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG			||
 	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG			||
 	    sbp->sb_blocksize != (1 << sbp->sb_blocklog)		||
-	    sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG			||
+	    sbp->sb_dirblklog + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
 	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE			||
 	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE			||
 	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG			||
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 0f56fcd3a5d5..1ff9df7a3ce8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -481,6 +481,12 @@ xfs_submit_ioend(
 	struct xfs_ioend	*ioend,
 	int			status)
 {
+	/* Convert CoW extents to regular */
+	if (!status && ioend->io_type == XFS_IO_COW) {
+		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
+				ioend->io_offset, ioend->io_size);
+	}
+
 	/* Reserve log space if we might write beyond the on-disk inode size. */
 	if (!status &&
 	    ioend->io_type != XFS_IO_UNWRITTEN &&
@@ -1152,19 +1158,22 @@ xfs_vm_releasepage(
 	 * block_invalidatepage() can send pages that are still marked dirty
 	 * but otherwise have invalidated buffers.
 	 *
-	 * We've historically freed buffers on the latter. Instead, quietly
-	 * filter out all dirty pages to avoid spurious buffer state warnings.
-	 * This can likely be removed once shrink_active_list() is fixed.
+	 * We want to release the latter to avoid unnecessary buildup of the
+	 * LRU, skip the former and warn if we've left any lingering
+	 * delalloc/unwritten buffers on clean pages. Skip pages with delalloc
+	 * or unwritten buffers and warn if the page is not dirty. Otherwise
+	 * try to release the buffers.
 	 */
-	if (PageDirty(page))
-		return 0;
-
 	xfs_count_page_state(page, &delalloc, &unwritten);
 
-	if (WARN_ON_ONCE(delalloc))
+	if (delalloc) {
+		WARN_ON_ONCE(!PageDirty(page));
 		return 0;
-	if (WARN_ON_ONCE(unwritten))
+	}
+	if (unwritten) {
+		WARN_ON_ONCE(!PageDirty(page));
 		return 0;
+	}
 
 	return try_to_free_buffers(page);
 }
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index b9abce524c33..8b75dcea5966 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -88,7 +88,6 @@ int
 xfs_bmap_rtalloc(
 	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
 {
-	xfs_alloctype_t	atype = 0;	/* type for allocation routines */
 	int		error;		/* error return value */
 	xfs_mount_t	*mp;		/* mount point structure */
 	xfs_extlen_t	prod = 0;	/* product factor for allocators */
@@ -155,18 +154,14 @@ xfs_bmap_rtalloc(
 	/*
 	 * Realtime allocation, done through xfs_rtallocate_extent.
 	 */
-	atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
 	do_div(ap->blkno, mp->m_sb.sb_rextsize);
 	rtb = ap->blkno;
 	ap->length = ralen;
-	if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
-				&ralen, atype, ap->wasdel, prod, &rtb)))
-		return error;
-	if (rtb == NULLFSBLOCK && prod > 1 &&
-	    (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
-					   ap->length, &ralen, atype,
-					   ap->wasdel, 1, &rtb)))
+	error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
+				&ralen, ap->wasdel, prod, &rtb);
+	if (error)
 		return error;
+
 	ap->blkno = rtb;
 	if (ap->blkno != NULLFSBLOCK) {
 		ap->blkno *= mp->m_sb.sb_rextsize;
@@ -528,7 +523,6 @@ xfs_getbmap(
 	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
 	xfs_mount_t		*mp;		/* file system mount point */
 	int			nex;		/* # of user extents can do */
-	int			nexleft;	/* # of user extents left */
 	int			subnex;		/* # of bmapi's can do */
 	int			nmap;		/* number of map entries */
 	struct getbmapx		*out;		/* output structure */
@@ -686,10 +680,8 @@ xfs_getbmap(
 		goto out_free_map;
 	}
 
-	nexleft = nex;
-
 	do {
-		nmap = (nexleft > subnex) ? subnex : nexleft;
+		nmap = (nex> subnex) ? subnex : nex;
 		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
 				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
 				       map, &nmap, bmapi_flags);
@@ -697,8 +689,8 @@ xfs_getbmap(
 			goto out_free_map;
 		ASSERT(nmap <= subnex);
 
-		for (i = 0; i < nmap && nexleft && bmv->bmv_length &&
-				cur_ext < bmv->bmv_count; i++) {
+		for (i = 0; i < nmap && bmv->bmv_length &&
+				cur_ext < bmv->bmv_count - 1; i++) {
 			out[cur_ext].bmv_oflags = 0;
 			if (map[i].br_state == XFS_EXT_UNWRITTEN)
 				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
@@ -760,16 +752,27 @@ xfs_getbmap(
 				continue;
 			}
 
+			/*
+			 * In order to report shared extents accurately,
+			 * we report each distinct shared/unshared part
+			 * of a single bmbt record using multiple bmap
+			 * extents.  To make that happen, we iterate the
+			 * same map array item multiple times, each
+			 * time trimming out the subextent that we just
+			 * reported.
+			 *
+			 * Because of this, we must check the out array
+			 * index (cur_ext) directly against bmv_count-1
+			 * to avoid overflows.
+			 */
 			if (inject_map.br_startblock != NULLFSBLOCK) {
 				map[i] = inject_map;
 				i--;
-			} else
-				nexleft--;
+			}
 			bmv->bmv_entries++;
 			cur_ext++;
 		}
-	} while (nmap && nexleft && bmv->bmv_length &&
-		 cur_ext < bmv->bmv_count);
+	} while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1);
 
  out_free_map:
 	kmem_free(map);
@@ -779,11 +782,9 @@ xfs_getbmap(
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
 	for (i = 0; i < cur_ext; i++) {
-		int full = 0;	/* user array is full */
-
 		/* format results & advance arg */
-		error = formatter(&arg, &out[i], &full);
-		if (error || full)
+		error = formatter(&arg, &out[i]);
+		if (error)
 			break;
 	}
 
@@ -909,17 +910,18 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
  */
 int
 xfs_free_eofblocks(
-	xfs_mount_t	*mp,
-	xfs_inode_t	*ip,
-	bool		need_iolock)
+	struct xfs_inode	*ip)
 {
-	xfs_trans_t	*tp;
-	int		error;
-	xfs_fileoff_t	end_fsb;
-	xfs_fileoff_t	last_fsb;
-	xfs_filblks_t	map_len;
-	int		nimaps;
-	xfs_bmbt_irec_t	imap;
+	struct xfs_trans	*tp;
+	int			error;
+	xfs_fileoff_t		end_fsb;
+	xfs_fileoff_t		last_fsb;
+	xfs_filblks_t		map_len;
+	int			nimaps;
+	struct xfs_bmbt_irec	imap;
+	struct xfs_mount	*mp = ip->i_mount;
+
+	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 
 	/*
 	 * Figure out if there are any blocks beyond the end
@@ -936,6 +938,10 @@ xfs_free_eofblocks(
 	error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
+	/*
+	 * If there are blocks after the end of file, truncate the file to its
+	 * current size to free them up.
+	 */
 	if (!error && (nimaps != 0) &&
 	    (imap.br_startblock != HOLESTARTBLOCK ||
 	     ip->i_delayed_blks)) {
@@ -946,22 +952,13 @@ xfs_free_eofblocks(
 		if (error)
 			return error;
 
-		/*
-		 * There are blocks after the end of file.
-		 * Free them up now by truncating the file to
-		 * its current size.
-		 */
-		if (need_iolock) {
-			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
-				return -EAGAIN;
-		}
+		/* wait on dio to ensure i_size has settled */
+		inode_dio_wait(VFS_I(ip));
 
 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
 				&tp);
 		if (error) {
 			ASSERT(XFS_FORCED_SHUTDOWN(mp));
-			if (need_iolock)
-				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 			return error;
 		}
 
@@ -989,8 +986,6 @@ xfs_free_eofblocks(
 		}
 
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		if (need_iolock)
-			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 	}
 	return error;
 }
@@ -1385,10 +1380,16 @@ xfs_shift_file_space(
 	xfs_fileoff_t		stop_fsb;
 	xfs_fileoff_t		next_fsb;
 	xfs_fileoff_t		shift_fsb;
+	uint			resblks;
 
 	ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
 
 	if (direction == SHIFT_LEFT) {
+		/*
+		 * Reserve blocks to cover potential extent merges after left
+		 * shift operations.
+		 */
+		resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
 		next_fsb = XFS_B_TO_FSB(mp, offset + len);
 		stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
 	} else {
@@ -1396,6 +1397,7 @@ xfs_shift_file_space(
 		 * If right shift, delegate the work of initialization of
 		 * next_fsb to xfs_bmap_shift_extent as it has ilock held.
 		 */
+		resblks = 0;
 		next_fsb = NULLFSBLOCK;
 		stop_fsb = XFS_B_TO_FSB(mp, offset);
 	}
@@ -1407,7 +1409,7 @@ xfs_shift_file_space(
 	 * into the accessible region of the file.
 	 */
 	if (xfs_can_free_eofblocks(ip, true)) {
-		error = xfs_free_eofblocks(mp, ip, false);
+		error = xfs_free_eofblocks(ip);
 		if (error)
 			return error;
 	}
@@ -1437,21 +1439,14 @@ xfs_shift_file_space(
 	}
 
 	while (!error && !done) {
-		/*
-		 * We would need to reserve permanent block for transaction.
-		 * This will come into picture when after shifting extent into
-		 * hole we found that adjacent extents can be merged which
-		 * may lead to freeing of a block during record update.
-		 */
-		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
-				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
+					&tp);
 		if (error)
 			break;
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
-				ip->i_gdquot, ip->i_pdquot,
-				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
+				ip->i_gdquot, ip->i_pdquot, resblks, 0,
 				XFS_QMOPT_RES_REGBLKS);
 		if (error)
 			goto out_trans_cancel;
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 68a621a8e0c0..135d8267e284 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -35,7 +35,7 @@ int	xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
 		xfs_fileoff_t start_fsb, xfs_fileoff_t length);
 
 /* bmap to userspace formatter - copy to user & advance pointer */
-typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
+typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *);
 int	xfs_getbmap(struct xfs_inode *ip, struct getbmapx *bmv,
 		xfs_bmap_format_t formatter, void *arg);
 
@@ -63,8 +63,7 @@ int	xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
 
 /* EOF block manipulation functions */
 bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
-int	xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip,
-			   bool need_iolock);
+int	xfs_free_eofblocks(struct xfs_inode *ip);
 
 int	xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
 			 struct xfs_swapext *sx);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 7f0a01f7b592..8c7d01b75922 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -422,6 +422,7 @@ retry:
 out_free_pages:
 	for (i = 0; i < bp->b_page_count; i++)
 		__free_page(bp->b_pages[i]);
+	bp->b_flags &= ~_XBF_PAGES;
 	return error;
 }
 
@@ -757,7 +758,7 @@ xfs_buf_readahead_map(
 	int			nmaps,
 	const struct xfs_buf_ops *ops)
 {
-	if (bdi_read_congested(target->bt_bdi))
+	if (bdi_read_congested(target->bt_bdev->bd_bdi))
 		return;
 
 	xfs_buf_read_map(target, map, nmaps,
@@ -1790,7 +1791,6 @@ xfs_alloc_buftarg(
 	btp->bt_mount = mp;
 	btp->bt_dev =  bdev->bd_dev;
 	btp->bt_bdev = bdev;
-	btp->bt_bdi = blk_get_backing_dev_info(bdev);
 
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 8a9d3a9599f0..3c867e5a63e1 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -109,7 +109,6 @@ typedef unsigned int xfs_buf_flags_t;
 typedef struct xfs_buftarg {
 	dev_t			bt_dev;
 	struct block_device	*bt_bdev;
-	struct backing_dev_info	*bt_bdi;
 	struct xfs_mount	*bt_mount;
 	unsigned int		bt_meta_sectorsize;
 	size_t			bt_meta_sectormask;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 2975cb2319f4..0306168af332 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -1162,6 +1162,7 @@ xfs_buf_iodone_callbacks(
 	 */
 	bp->b_last_error = 0;
 	bp->b_retries = 0;
+	bp->b_first_retry_time = 0;
 
 	xfs_buf_do_callbacks(bp);
 	bp->b_fspriv = NULL;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 4ff499aa7338..d796ffac7296 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -208,32 +208,3 @@ xfs_ioc_trim(
 		return -EFAULT;
 	return 0;
 }
-
-int
-xfs_discard_extents(
-	struct xfs_mount	*mp,
-	struct list_head	*list)
-{
-	struct xfs_extent_busy	*busyp;
-	int			error = 0;
-
-	list_for_each_entry(busyp, list, list) {
-		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
-					 busyp->length);
-
-		error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
-				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
-				XFS_FSB_TO_BB(mp, busyp->length),
-				GFP_NOFS, 0);
-		if (error && error != -EOPNOTSUPP) {
-			xfs_info(mp,
-	 "discard failed for extent [0x%llx,%u], error %d",
-				 (unsigned long long)busyp->bno,
-				 busyp->length,
-				 error);
-			return error;
-		}
-	}
-
-	return 0;
-}
diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h
index 344879aea646..0f070f9e44e1 100644
--- a/fs/xfs/xfs_discard.h
+++ b/fs/xfs/xfs_discard.h
@@ -5,6 +5,5 @@ struct fstrim_range;
 struct list_head;
 
 extern int	xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
-extern int	xfs_discard_extents(struct xfs_mount *, struct list_head *);
 
 #endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 7a30b8f11db7..9d06cc30e875 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -710,6 +710,10 @@ xfs_dq_get_next_id(
 	/* Simple advance */
 	next_id = *id + 1;
 
+	/* If we'd wrap past the max ID, stop */
+	if (next_id < *id)
+		return -ENOENT;
+
 	/* If new ID is within the current chunk, advancing it sufficed */
 	if (next_id % mp->m_quotainfo->qi_dqperchunk) {
 		*id = next_id;
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 162dc186cf04..77760dbf0242 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -45,18 +45,7 @@ xfs_extent_busy_insert(
 	struct rb_node		**rbp;
 	struct rb_node		*parent = NULL;
 
-	new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
-	if (!new) {
-		/*
-		 * No Memory!  Since it is now not possible to track the free
-		 * block, make this a synchronous transaction to insure that
-		 * the block is not reused before this transaction commits.
-		 */
-		trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
-		xfs_trans_set_sync(tp);
-		return;
-	}
-
+	new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_SLEEP);
 	new->agno = agno;
 	new->bno = bno;
 	new->length = len;
@@ -345,25 +334,31 @@ restart:
  * subset of the extent that is not busy.  If *rlen is smaller than
  * args->minlen no suitable extent could be found, and the higher level
  * code needs to force out the log and retry the allocation.
+ *
+ * Return the current busy generation for the AG if the extent is busy. This
+ * value can be used to wait for at least one of the currently busy extents
+ * to be cleared. Note that the busy list is not guaranteed to be empty after
+ * the gen is woken. The state of a specific extent must always be confirmed
+ * with another call to xfs_extent_busy_trim() before it can be used.
  */
-void
+bool
 xfs_extent_busy_trim(
 	struct xfs_alloc_arg	*args,
-	xfs_agblock_t		bno,
-	xfs_extlen_t		len,
-	xfs_agblock_t		*rbno,
-	xfs_extlen_t		*rlen)
+	xfs_agblock_t		*bno,
+	xfs_extlen_t		*len,
+	unsigned		*busy_gen)
 {
 	xfs_agblock_t		fbno;
 	xfs_extlen_t		flen;
 	struct rb_node		*rbp;
+	bool			ret = false;
 
-	ASSERT(len > 0);
+	ASSERT(*len > 0);
 
 	spin_lock(&args->pag->pagb_lock);
 restart:
-	fbno = bno;
-	flen = len;
+	fbno = *bno;
+	flen = *len;
 	rbp = args->pag->pagb_tree.rb_node;
 	while (rbp && flen >= args->minlen) {
 		struct xfs_extent_busy *busyp =
@@ -515,24 +510,25 @@ restart:
 
 		flen = fend - fbno;
 	}
-	spin_unlock(&args->pag->pagb_lock);
+out:
 
-	if (fbno != bno || flen != len) {
-		trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
+	if (fbno != *bno || flen != *len) {
+		trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
 					  fbno, flen);
+		*bno = fbno;
+		*len = flen;
+		*busy_gen = args->pag->pagb_gen;
+		ret = true;
 	}
-	*rbno = fbno;
-	*rlen = flen;
-	return;
+	spin_unlock(&args->pag->pagb_lock);
+	return ret;
 fail:
 	/*
 	 * Return a zero extent length as failure indications.  All callers
 	 * re-check if the trimmed extent satisfies the minlen requirement.
 	 */
-	spin_unlock(&args->pag->pagb_lock);
-	trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
-	*rbno = fbno;
-	*rlen = 0;
+	flen = 0;
+	goto out;
 }
 
 STATIC void
@@ -551,6 +547,21 @@ xfs_extent_busy_clear_one(
 	kmem_free(busyp);
 }
 
+static void
+xfs_extent_busy_put_pag(
+	struct xfs_perag	*pag,
+	bool			wakeup)
+		__releases(pag->pagb_lock)
+{
+	if (wakeup) {
+		pag->pagb_gen++;
+		wake_up_all(&pag->pagb_wait);
+	}
+
+	spin_unlock(&pag->pagb_lock);
+	xfs_perag_put(pag);
+}
+
 /*
  * Remove all extents on the passed in list from the busy extents tree.
  * If do_discard is set skip extents that need to be discarded, and mark
@@ -565,27 +576,76 @@ xfs_extent_busy_clear(
 	struct xfs_extent_busy	*busyp, *n;
 	struct xfs_perag	*pag = NULL;
 	xfs_agnumber_t		agno = NULLAGNUMBER;
+	bool			wakeup = false;
 
 	list_for_each_entry_safe(busyp, n, list, list) {
 		if (busyp->agno != agno) {
-			if (pag) {
-				spin_unlock(&pag->pagb_lock);
-				xfs_perag_put(pag);
-			}
-			pag = xfs_perag_get(mp, busyp->agno);
-			spin_lock(&pag->pagb_lock);
+			if (pag)
+				xfs_extent_busy_put_pag(pag, wakeup);
 			agno = busyp->agno;
+			pag = xfs_perag_get(mp, agno);
+			spin_lock(&pag->pagb_lock);
+			wakeup = false;
 		}
 
 		if (do_discard && busyp->length &&
-		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
+		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
 			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
-		else
+		} else {
 			xfs_extent_busy_clear_one(mp, pag, busyp);
+			wakeup = true;
+		}
 	}
 
-	if (pag) {
-		spin_unlock(&pag->pagb_lock);
+	if (pag)
+		xfs_extent_busy_put_pag(pag, wakeup);
+}
+
+/*
+ * Flush out all busy extents for this AG.
+ */
+void
+xfs_extent_busy_flush(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	unsigned		busy_gen)
+{
+	DEFINE_WAIT		(wait);
+	int			log_flushed = 0, error;
+
+	trace_xfs_log_force(mp, 0, _THIS_IP_);
+	error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
+	if (error)
+		return;
+
+	do {
+		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
+		if  (busy_gen != READ_ONCE(pag->pagb_gen))
+			break;
+		schedule();
+	} while (1);
+
+	finish_wait(&pag->pagb_wait, &wait);
+}
+
+void
+xfs_extent_busy_wait_all(
+	struct xfs_mount	*mp)
+{
+	DEFINE_WAIT		(wait);
+	xfs_agnumber_t		agno;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		struct xfs_perag *pag = xfs_perag_get(mp, agno);
+
+		do {
+			prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
+			if  (RB_EMPTY_ROOT(&pag->pagb_tree))
+				break;
+			schedule();
+		} while (1);
+		finish_wait(&pag->pagb_wait, &wait);
+
 		xfs_perag_put(pag);
 	}
 }
@@ -596,9 +656,17 @@ xfs_extent_busy_clear(
 int
 xfs_extent_busy_ag_cmp(
 	void			*priv,
-	struct list_head	*a,
-	struct list_head	*b)
+	struct list_head	*l1,
+	struct list_head	*l2)
 {
-	return container_of(a, struct xfs_extent_busy, list)->agno -
-		container_of(b, struct xfs_extent_busy, list)->agno;
+	struct xfs_extent_busy	*b1 =
+		container_of(l1, struct xfs_extent_busy, list);
+	struct xfs_extent_busy	*b2 =
+		container_of(l2, struct xfs_extent_busy, list);
+	s32 diff;
+
+	diff = b1->agno - b2->agno;
+	if (!diff)
+		diff = b1->bno - b2->bno;
+	return diff;
 }
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index bfff284d2dcc..60195ea1b84a 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -58,9 +58,16 @@ void
 xfs_extent_busy_reuse(struct xfs_mount *mp, xfs_agnumber_t agno,
 	xfs_agblock_t fbno, xfs_extlen_t flen, bool userdata);
 
+bool
+xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno,
+		xfs_extlen_t *len, unsigned *busy_gen);
+
+void
+xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag,
+	unsigned busy_gen);
+
 void
-xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t bno,
-	xfs_extlen_t len, xfs_agblock_t *rbno, xfs_extlen_t *rlen);
+xfs_extent_busy_wait_all(struct xfs_mount *mp);
 
 int
 xfs_extent_busy_ag_cmp(void *priv, struct list_head *a, struct list_head *b);
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index bbb9eb6811b2..022014016d80 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -527,6 +527,15 @@ xfs_file_dio_aio_write(
 	if ((iocb->ki_pos & mp->m_blockmask) ||
 	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
 		unaligned_io = 1;
+
+		/*
+		 * We can't properly handle unaligned direct I/O to reflink
+		 * files yet, as we can't unshare a partial block.
+		 */
+		if (xfs_is_reflink_inode(ip)) {
+			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
+			return -EREMCHG;
+		}
 		iolock = XFS_IOLOCK_EXCL;
 	} else {
 		iolock = XFS_IOLOCK_SHARED;
@@ -552,14 +561,6 @@ xfs_file_dio_aio_write(
 	}
 
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
-
-	/* If this is a block-aligned directio CoW, remap immediately. */
-	if (xfs_is_reflink_inode(ip) && !unaligned_io) {
-		ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
-		if (ret)
-			goto out;
-	}
-
 	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
 out:
 	xfs_iunlock(ip, iolock);
@@ -614,8 +615,10 @@ xfs_file_buffered_aio_write(
 	struct xfs_inode	*ip = XFS_I(inode);
 	ssize_t			ret;
 	int			enospc = 0;
-	int			iolock = XFS_IOLOCK_EXCL;
+	int			iolock;
 
+write_retry:
+	iolock = XFS_IOLOCK_EXCL;
 	xfs_ilock(ip, iolock);
 
 	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
@@ -625,7 +628,6 @@ xfs_file_buffered_aio_write(
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
 
-write_retry:
 	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
 	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
 	if (likely(ret >= 0))
@@ -641,18 +643,21 @@ write_retry:
 	 * running at the same time.
 	 */
 	if (ret == -EDQUOT && !enospc) {
+		xfs_iunlock(ip, iolock);
 		enospc = xfs_inode_free_quota_eofblocks(ip);
 		if (enospc)
 			goto write_retry;
 		enospc = xfs_inode_free_quota_cowblocks(ip);
 		if (enospc)
 			goto write_retry;
+		iolock = 0;
 	} else if (ret == -ENOSPC && !enospc) {
 		struct xfs_eofblocks eofb = {0};
 
 		enospc = 1;
 		xfs_flush_inodes(ip->i_mount);
-		eofb.eof_scan_owner = ip->i_ino; /* for locking */
+
+		xfs_iunlock(ip, iolock);
 		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
 		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
 		goto write_retry;
@@ -660,7 +665,8 @@ write_retry:
 
 	current->backing_dev_info = NULL;
 out:
-	xfs_iunlock(ip, iolock);
+	if (iolock)
+		xfs_iunlock(ip, iolock);
 	return ret;
 }
 
@@ -908,9 +914,9 @@ xfs_dir_open(
 	 */
 	mode = xfs_ilock_data_map_shared(ip);
 	if (ip->i_d.di_nextents > 0)
-		xfs_dir3_data_readahead(ip, 0, -1);
+		error = xfs_dir3_data_readahead(ip, 0, -1);
 	xfs_iunlock(ip, mode);
-	return 0;
+	return error;
 }
 
 STATIC int
@@ -1431,12 +1437,9 @@ xfs_filemap_fault(
  */
 STATIC int
 xfs_filemap_pmd_fault(
-	struct vm_area_struct	*vma,
-	unsigned long		addr,
-	pmd_t			*pmd,
-	unsigned int		flags)
+	struct vm_fault		*vmf)
 {
-	struct inode		*inode = file_inode(vma->vm_file);
+	struct inode		*inode = file_inode(vmf->vma->vm_file);
 	struct xfs_inode	*ip = XFS_I(inode);
 	int			ret;
 
@@ -1445,16 +1448,16 @@ xfs_filemap_pmd_fault(
 
 	trace_xfs_filemap_pmd_fault(ip);
 
-	if (flags & FAULT_FLAG_WRITE) {
+	if (vmf->flags & FAULT_FLAG_WRITE) {
 		sb_start_pagefault(inode->i_sb);
-		file_update_time(vma->vm_file);
+		file_update_time(vmf->vma->vm_file);
 	}
 
 	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-	ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops);
+	ret = dax_iomap_pmd_fault(vmf, &xfs_iomap_ops);
 	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
-	if (flags & FAULT_FLAG_WRITE)
+	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(inode->i_sb);
 
 	return ret;
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 93d12fa2670d..6ccaae9eb0ee 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -352,12 +352,7 @@ xfs_growfs_data_private(
 			goto error0;
 		}
 
-		if (xfs_sb_version_hascrc(&mp->m_sb))
-			xfs_btree_init_block(mp, bp, XFS_ABTB_CRC_MAGIC, 0, 1,
-						agno, XFS_BTREE_CRC_BLOCKS);
-		else
-			xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1,
-						agno, 0);
+		xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0);
 
 		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
 		arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
@@ -381,12 +376,7 @@ xfs_growfs_data_private(
 			goto error0;
 		}
 
-		if (xfs_sb_version_hascrc(&mp->m_sb))
-			xfs_btree_init_block(mp, bp, XFS_ABTC_CRC_MAGIC, 0, 1,
-						agno, XFS_BTREE_CRC_BLOCKS);
-		else
-			xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1,
-						agno, 0);
+		xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0);
 
 		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
 		arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks);
@@ -413,8 +403,8 @@ xfs_growfs_data_private(
 				goto error0;
 			}
 
-			xfs_btree_init_block(mp, bp, XFS_RMAP_CRC_MAGIC, 0, 0,
-						agno, XFS_BTREE_CRC_BLOCKS);
+			xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0,
+						agno, 0);
 			block = XFS_BUF_TO_BLOCK(bp);
 
 
@@ -488,12 +478,7 @@ xfs_growfs_data_private(
 			goto error0;
 		}
 
-		if (xfs_sb_version_hascrc(&mp->m_sb))
-			xfs_btree_init_block(mp, bp, XFS_IBT_CRC_MAGIC, 0, 0,
-						agno, XFS_BTREE_CRC_BLOCKS);
-		else
-			xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0,
-						agno, 0);
+		xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0);
 
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
@@ -513,13 +498,8 @@ xfs_growfs_data_private(
 				goto error0;
 			}
 
-			if (xfs_sb_version_hascrc(&mp->m_sb))
-				xfs_btree_init_block(mp, bp, XFS_FIBT_CRC_MAGIC,
-						     0, 0, agno,
-						     XFS_BTREE_CRC_BLOCKS);
-			else
-				xfs_btree_init_block(mp, bp, XFS_FIBT_MAGIC, 0,
-						     0, agno, 0);
+			xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO,
+						     0, 0, agno, 0);
 
 			error = xfs_bwrite(bp);
 			xfs_buf_relse(bp);
@@ -540,9 +520,8 @@ xfs_growfs_data_private(
 				goto error0;
 			}
 
-			xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC,
-					     0, 0, agno,
-					     XFS_BTREE_CRC_BLOCKS);
+			xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC,
+					     0, 0, agno, 0);
 
 			error = xfs_bwrite(bp);
 			xfs_buf_relse(bp);
@@ -631,6 +610,20 @@ xfs_growfs_data_private(
 	xfs_set_low_space_thresholds(mp);
 	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 
+	/*
+	 * If we expanded the last AG, free the per-AG reservation
+	 * so we can reinitialize it with the new size.
+	 */
+	if (new) {
+		struct xfs_perag	*pag;
+
+		pag = xfs_perag_get(mp, agno);
+		error = xfs_ag_resv_free(pag);
+		xfs_perag_put(pag);
+		if (error)
+			goto out;
+	}
+
 	/* Reserve AG metadata blocks. */
 	error = xfs_fs_reserve_ag_blocks(mp);
 	if (error && error != -ENOSPC)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index ff4d6311c7f4..7234b9748c36 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1322,13 +1322,10 @@ xfs_inode_free_eofblocks(
 	int			flags,
 	void			*args)
 {
-	int ret;
+	int ret = 0;
 	struct xfs_eofblocks *eofb = args;
-	bool need_iolock = true;
 	int match;
 
-	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
-
 	if (!xfs_can_free_eofblocks(ip, false)) {
 		/* inode could be preallocated or append-only */
 		trace_xfs_inode_free_eofblocks_invalid(ip);
@@ -1356,21 +1353,19 @@ xfs_inode_free_eofblocks(
 		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
 		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
 			return 0;
-
-		/*
-		 * A scan owner implies we already hold the iolock. Skip it in
-		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
-		 * the possibility of EAGAIN being returned.
-		 */
-		if (eofb->eof_scan_owner == ip->i_ino)
-			need_iolock = false;
 	}
 
-	ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
-
-	/* don't revisit the inode if we're not waiting */
-	if (ret == -EAGAIN && !(flags & SYNC_WAIT))
-		ret = 0;
+	/*
+	 * If the caller is waiting, return -EAGAIN to keep the background
+	 * scanner moving and revisit the inode in a subsequent pass.
+	 */
+	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+		if (flags & SYNC_WAIT)
+			ret = -EAGAIN;
+		return ret;
+	}
+	ret = xfs_free_eofblocks(ip);
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
 	return ret;
 }
@@ -1417,15 +1412,10 @@ __xfs_inode_free_quota_eofblocks(
 	struct xfs_eofblocks eofb = {0};
 	struct xfs_dquot *dq;
 
-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-
 	/*
-	 * Set the scan owner to avoid a potential livelock. Otherwise, the scan
-	 * can repeatedly trylock on the inode we're currently processing. We
-	 * run a sync scan to increase effectiveness and use the union filter to
+	 * Run a sync scan to increase effectiveness and use the union filter to
 	 * cover all applicable quotas in a single scan.
 	 */
-	eofb.eof_scan_owner = ip->i_ino;
 	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
 
 	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
@@ -1577,12 +1567,9 @@ xfs_inode_free_cowblocks(
 {
 	int ret;
 	struct xfs_eofblocks *eofb = args;
-	bool need_iolock = true;
 	int match;
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
 
-	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
-
 	/*
 	 * Just clear the tag if we have an empty cow fork or none at all. It's
 	 * possible the inode was fully unshared since it was originally tagged.
@@ -1597,7 +1584,8 @@ xfs_inode_free_cowblocks(
 	 * If the mapping is dirty or under writeback we cannot touch the
 	 * CoW fork.  Leave it alone if we're in the midst of a directio.
 	 */
-	if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
+	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
+	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
 	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
 	    atomic_read(&VFS_I(ip)->i_dio_count))
 		return 0;
@@ -1614,28 +1602,16 @@ xfs_inode_free_cowblocks(
 		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
 		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
 			return 0;
-
-		/*
-		 * A scan owner implies we already hold the iolock. Skip it in
-		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
-		 * the possibility of EAGAIN being returned.
-		 */
-		if (eofb->eof_scan_owner == ip->i_ino)
-			need_iolock = false;
 	}
 
 	/* Free the CoW blocks */
-	if (need_iolock) {
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
-	}
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
 
 	ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
 
-	if (need_iolock) {
-		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	}
+	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 
 	return ret;
 }
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index a1e02f4708ab..8a7c849b4dea 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -27,7 +27,6 @@ struct xfs_eofblocks {
 	kgid_t		eof_gid;
 	prid_t		eof_prid;
 	__u64		eof_min_file_size;
-	xfs_ino_t	eof_scan_owner;
 };
 
 #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
@@ -102,7 +101,6 @@ xfs_fs_eofblocks_from_user(
 	dst->eof_flags = src->eof_flags;
 	dst->eof_prid = src->eof_prid;
 	dst->eof_min_file_size = src->eof_min_file_size;
-	dst->eof_scan_owner = NULLFSINO;
 
 	dst->eof_uid = INVALID_UID;
 	if (src->eof_flags & XFS_EOF_FLAGS_UID) {
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index b9557795eb74..edfa6a55b064 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1692,32 +1692,34 @@ xfs_release(
 	if (xfs_can_free_eofblocks(ip, false)) {
 
 		/*
+		 * Check if the inode is being opened, written and closed
+		 * frequently and we have delayed allocation blocks outstanding
+		 * (e.g. streaming writes from the NFS server), truncating the
+		 * blocks past EOF will cause fragmentation to occur.
+		 *
+		 * In this case don't do the truncation, but we have to be
+		 * careful how we detect this case. Blocks beyond EOF show up as
+		 * i_delayed_blks even when the inode is clean, so we need to
+		 * truncate them away first before checking for a dirty release.
+		 * Hence on the first dirty close we will still remove the
+		 * speculative allocation, but after that we will leave it in
+		 * place.
+		 */
+		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
+			return 0;
+		/*
 		 * If we can't get the iolock just skip truncating the blocks
 		 * past EOF because we could deadlock with the mmap_sem
-		 * otherwise.  We'll get another chance to drop them once the
+		 * otherwise. We'll get another chance to drop them once the
 		 * last reference to the inode is dropped, so we'll never leak
 		 * blocks permanently.
-		 *
-		 * Further, check if the inode is being opened, written and
-		 * closed frequently and we have delayed allocation blocks
-		 * outstanding (e.g. streaming writes from the NFS server),
-		 * truncating the blocks past EOF will cause fragmentation to
-		 * occur.
-		 *
-		 * In this case don't do the truncation, either, but we have to
-		 * be careful how we detect this case. Blocks beyond EOF show
-		 * up as i_delayed_blks even when the inode is clean, so we
-		 * need to truncate them away first before checking for a dirty
-		 * release. Hence on the first dirty close we will still remove
-		 * the speculative allocation, but after that we will leave it
-		 * in place.
 		 */
-		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
-			return 0;
-
-		error = xfs_free_eofblocks(mp, ip, true);
-		if (error && error != -EAGAIN)
-			return error;
+		if (xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
+			error = xfs_free_eofblocks(ip);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+			if (error)
+				return error;
+		}
 
 		/* delalloc blocks after truncation means it really is dirty */
 		if (ip->i_delayed_blks)
@@ -1792,22 +1794,23 @@ xfs_inactive_ifree(
 	int			error;
 
 	/*
-	 * The ifree transaction might need to allocate blocks for record
-	 * insertion to the finobt. We don't want to fail here at ENOSPC, so
-	 * allow ifree to dip into the reserved block pool if necessary.
-	 *
-	 * Freeing large sets of inodes generally means freeing inode chunks,
-	 * directory and file data blocks, so this should be relatively safe.
-	 * Only under severe circumstances should it be possible to free enough
-	 * inodes to exhaust the reserve block pool via finobt expansion while
-	 * at the same time not creating free space in the filesystem.
+	 * We try to use a per-AG reservation for any block needed by the finobt
+	 * tree, but as the finobt feature predates the per-AG reservation
+	 * support a degraded file system might not have enough space for the
+	 * reservation at mount time.  In that case try to dip into the reserved
+	 * pool and pray.
 	 *
 	 * Send a warning if the reservation does happen to fail, as the inode
 	 * now remains allocated and sits on the unlinked list until the fs is
 	 * repaired.
 	 */
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
-			XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
+	if (unlikely(mp->m_inotbt_nores)) {
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
+				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
+				&tp);
+	} else {
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
+	}
 	if (error) {
 		if (error == -ENOSPC) {
 			xfs_warn_ratelimited(mp,
@@ -1903,8 +1906,11 @@ xfs_inactive(
 		 * cache. Post-eof blocks must be freed, lest we end up with
 		 * broken free space accounting.
 		 */
-		if (xfs_can_free_eofblocks(ip, true))
-			xfs_free_eofblocks(mp, ip, false);
+		if (xfs_can_free_eofblocks(ip, true)) {
+			xfs_ilock(ip, XFS_IOLOCK_EXCL);
+			xfs_free_eofblocks(ip);
+			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		}
 
 		return;
 	}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c67cfb451fd3..cf1363dbf32b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1524,7 +1524,7 @@ out_drop_write:
 }
 
 STATIC int
-xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
+xfs_getbmap_format(void **ap, struct getbmapx *bmv)
 {
 	struct getbmap __user	*base = (struct getbmap __user *)*ap;
 
@@ -1567,7 +1567,7 @@ xfs_ioc_getbmap(
 }
 
 STATIC int
-xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
+xfs_getbmapx_format(void **ap, struct getbmapx *bmv)
 {
 	struct getbmapx __user	*base = (struct getbmapx __user *)*ap;
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 0d147428971e..41662fb14e87 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -162,7 +162,7 @@ xfs_iomap_write_direct(
 	xfs_fileoff_t	last_fsb;
 	xfs_filblks_t	count_fsb, resaligned;
 	xfs_fsblock_t	firstfsb;
-	xfs_extlen_t	extsz, temp;
+	xfs_extlen_t	extsz;
 	int		nimaps;
 	int		quota_flag;
 	int		rt;
@@ -203,14 +203,7 @@ xfs_iomap_write_direct(
 	}
 	count_fsb = last_fsb - offset_fsb;
 	ASSERT(count_fsb > 0);
-
-	resaligned = count_fsb;
-	if (unlikely(extsz)) {
-		if ((temp = do_mod(offset_fsb, extsz)))
-			resaligned += temp;
-		if ((temp = do_mod(resaligned, extsz)))
-			resaligned += extsz - temp;
-	}
+	resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, extsz);
 
 	if (unlikely(rt)) {
 		resrtextents = qblocks = resaligned;
@@ -681,11 +674,11 @@ xfs_iomap_write_allocate(
 	xfs_trans_t	*tp;
 	int		nimaps;
 	int		error = 0;
-	int		flags = 0;
+	int		flags = XFS_BMAPI_DELALLOC;
 	int		nres;
 
 	if (whichfork == XFS_COW_FORK)
-		flags |= XFS_BMAPI_COWFORK;
+		flags |= XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC;
 
 	/*
 	 * Make sure that the dquots are there.
@@ -1002,47 +995,31 @@ xfs_file_iomap_begin(
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	end_fsb = XFS_B_TO_FSB(mp, offset + length);
 
-	if (xfs_is_reflink_inode(ip) &&
-	    (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) {
-		shared = xfs_reflink_find_cow_mapping(ip, offset, &imap);
-		if (shared) {
-			xfs_iunlock(ip, lockmode);
-			goto alloc_done;
-		}
-		ASSERT(!isnullstartblock(imap.br_startblock));
-	}
-
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
 			       &nimaps, 0);
 	if (error)
 		goto out_unlock;
 
-	if ((flags & IOMAP_REPORT) ||
-	    (xfs_is_reflink_inode(ip) &&
-	     (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) {
+	if (flags & IOMAP_REPORT) {
 		/* Trim the mapping to the nearest shared extent boundary. */
 		error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
 				&trimmed);
 		if (error)
 			goto out_unlock;
-
-		/*
-		 * We're here because we're trying to do a directio write to a
-		 * region that isn't aligned to a filesystem block.  If the
-		 * extent is shared, fall back to buffered mode to handle the
-		 * RMW.
-		 */
-		if (!(flags & IOMAP_REPORT) && shared) {
-			trace_xfs_reflink_bounce_dio_write(ip, &imap);
-			error = -EREMCHG;
-			goto out_unlock;
-		}
 	}
 
 	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
-		error = xfs_reflink_reserve_cow(ip, &imap, &shared);
-		if (error)
-			goto out_unlock;
+		if (flags & IOMAP_DIRECT) {
+			/* may drop and re-acquire the ilock */
+			error = xfs_reflink_allocate_cow(ip, &imap, &shared,
+					&lockmode);
+			if (error)
+				goto out_unlock;
+		} else {
+			error = xfs_reflink_reserve_cow(ip, &imap, &shared);
+			if (error)
+				goto out_unlock;
+		}
 
 		end_fsb = imap.br_startoff + imap.br_blockcount;
 		length = XFS_FSB_TO_B(mp, end_fsb) - offset;
@@ -1071,7 +1048,6 @@ xfs_file_iomap_begin(
 		if (error)
 			return error;
 
-alloc_done:
 		iomap->flags = IOMAP_F_NEW;
 		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
 	} else {
@@ -1102,7 +1078,19 @@ xfs_file_iomap_end_delalloc(
 	xfs_fileoff_t		end_fsb;
 	int			error = 0;
 
-	start_fsb = XFS_B_TO_FSB(mp, offset + written);
+	/* behave as if the write failed if drop writes is enabled */
+	if (xfs_mp_drop_writes(mp))
+		written = 0;
+
+	/*
+	 * start_fsb refers to the first unused block after a short write. If
+	 * nothing was written, round offset down to point at the first block in
+	 * the range.
+	 */
+	if (unlikely(!written))
+		start_fsb = XFS_B_TO_FSBT(mp, offset);
+	else
+		start_fsb = XFS_B_TO_FSB(mp, offset + written);
 	end_fsb = XFS_B_TO_FSB(mp, offset + length);
 
 	/*
@@ -1114,6 +1102,9 @@ xfs_file_iomap_end_delalloc(
 	 * blocks in the range, they are ours.
 	 */
 	if (start_fsb < end_fsb) {
+		truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
+					 XFS_FSB_TO_B(mp, end_fsb) - 1);
+
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
 		error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
 					       end_fsb - start_fsb);
@@ -1144,7 +1135,7 @@ xfs_file_iomap_end(
 	return 0;
 }
 
-struct iomap_ops xfs_iomap_ops = {
+const struct iomap_ops xfs_iomap_ops = {
 	.iomap_begin		= xfs_file_iomap_begin,
 	.iomap_end		= xfs_file_iomap_end,
 };
@@ -1190,6 +1181,6 @@ out_unlock:
 	return error;
 }
 
-struct iomap_ops xfs_xattr_iomap_ops = {
+const struct iomap_ops xfs_xattr_iomap_ops = {
 	.iomap_begin		= xfs_xattr_iomap_begin,
 };
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 6d45cf01fcff..00db3ecea084 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -33,7 +33,27 @@ void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
 		struct xfs_bmbt_irec *);
 xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
 
-extern struct iomap_ops xfs_iomap_ops;
-extern struct iomap_ops xfs_xattr_iomap_ops;
+static inline xfs_filblks_t
+xfs_aligned_fsb_count(
+	xfs_fileoff_t		offset_fsb,
+	xfs_filblks_t		count_fsb,
+	xfs_extlen_t		extsz)
+{
+	if (extsz) {
+		xfs_extlen_t	align;
+
+		align = do_mod(offset_fsb, extsz);
+		if (align)
+			count_fsb += align;
+		align = do_mod(count_fsb, extsz);
+		if (align)
+			count_fsb += extsz - align;
+	}
+
+	return count_fsb;
+}
+
+extern const struct iomap_ops xfs_iomap_ops;
+extern const struct iomap_ops xfs_xattr_iomap_ops;
 
 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 308bebb6dfd2..22c16155f1b4 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -98,12 +98,27 @@ xfs_init_security(
 static void
 xfs_dentry_to_name(
 	struct xfs_name	*namep,
+	struct dentry	*dentry)
+{
+	namep->name = dentry->d_name.name;
+	namep->len = dentry->d_name.len;
+	namep->type = XFS_DIR3_FT_UNKNOWN;
+}
+
+static int
+xfs_dentry_mode_to_name(
+	struct xfs_name	*namep,
 	struct dentry	*dentry,
 	int		mode)
 {
 	namep->name = dentry->d_name.name;
 	namep->len = dentry->d_name.len;
-	namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT];
+	namep->type = xfs_mode_to_ftype(mode);
+
+	if (unlikely(namep->type == XFS_DIR3_FT_UNKNOWN))
+		return -EFSCORRUPTED;
+
+	return 0;
 }
 
 STATIC void
@@ -119,7 +134,7 @@ xfs_cleanup_inode(
 	 * xfs_init_security we must back out.
 	 * ENOSPC can hit here, among other things.
 	 */
-	xfs_dentry_to_name(&teardown, dentry, 0);
+	xfs_dentry_to_name(&teardown, dentry);
 
 	xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
 }
@@ -154,8 +169,12 @@ xfs_generic_create(
 	if (error)
 		return error;
 
+	/* Verify mode is valid also for tmpfile case */
+	error = xfs_dentry_mode_to_name(&name, dentry, mode);
+	if (unlikely(error))
+		goto out_free_acl;
+
 	if (!tmpfile) {
-		xfs_dentry_to_name(&name, dentry, mode);
 		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
 	} else {
 		error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
@@ -248,7 +267,7 @@ xfs_vn_lookup(
 	if (dentry->d_name.len >= MAXNAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	xfs_dentry_to_name(&name, dentry, 0);
+	xfs_dentry_to_name(&name, dentry);
 	error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
 	if (unlikely(error)) {
 		if (unlikely(error != -ENOENT))
@@ -275,7 +294,7 @@ xfs_vn_ci_lookup(
 	if (dentry->d_name.len >= MAXNAMELEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	xfs_dentry_to_name(&xname, dentry, 0);
+	xfs_dentry_to_name(&xname, dentry);
 	error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
 	if (unlikely(error)) {
 		if (unlikely(error != -ENOENT))
@@ -310,7 +329,9 @@ xfs_vn_link(
 	struct xfs_name	name;
 	int		error;
 
-	xfs_dentry_to_name(&name, dentry, inode->i_mode);
+	error = xfs_dentry_mode_to_name(&name, dentry, inode->i_mode);
+	if (unlikely(error))
+		return error;
 
 	error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
 	if (unlikely(error))
@@ -329,7 +350,7 @@ xfs_vn_unlink(
 	struct xfs_name	name;
 	int		error;
 
-	xfs_dentry_to_name(&name, dentry, 0);
+	xfs_dentry_to_name(&name, dentry);
 
 	error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry)));
 	if (error)
@@ -359,7 +380,9 @@ xfs_vn_symlink(
 
 	mode = S_IFLNK |
 		(irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
-	xfs_dentry_to_name(&name, dentry, mode);
+	error = xfs_dentry_mode_to_name(&name, dentry, mode);
+	if (unlikely(error))
+		goto out;
 
 	error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
 	if (unlikely(error))
@@ -395,6 +418,7 @@ xfs_vn_rename(
 {
 	struct inode	*new_inode = d_inode(ndentry);
 	int		omode = 0;
+	int		error;
 	struct xfs_name	oname;
 	struct xfs_name	nname;
 
@@ -405,8 +429,14 @@ xfs_vn_rename(
 	if (flags & RENAME_EXCHANGE)
 		omode = d_inode(ndentry)->i_mode;
 
-	xfs_dentry_to_name(&oname, odentry, omode);
-	xfs_dentry_to_name(&nname, ndentry, d_inode(odentry)->i_mode);
+	error = xfs_dentry_mode_to_name(&oname, odentry, omode);
+	if (omode && unlikely(error))
+		return error;
+
+	error = xfs_dentry_mode_to_name(&nname, ndentry,
+					d_inode(odentry)->i_mode);
+	if (unlikely(error))
+		return error;
 
 	return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)),
 			  XFS_I(ndir), &nname,
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index e467218c0098..7a989de224f4 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -331,11 +331,11 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
 }
 
 #define ASSERT_ALWAYS(expr)	\
-	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+	(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
 
 #ifdef DEBUG
 #define ASSERT(expr)	\
-	(unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+	(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
 
 #ifndef STATIC
 # define STATIC noinline
@@ -346,7 +346,7 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
 #ifdef XFS_WARN
 
 #define ASSERT(expr)	\
-	(unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__))
+	(likely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__))
 
 #ifndef STATIC
 # define STATIC static noinline
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c39ac14ff540..b1469f0a91a6 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3317,12 +3317,8 @@ xfs_log_force(
 	xfs_mount_t	*mp,
 	uint		flags)
 {
-	int	error;
-
 	trace_xfs_log_force(mp, 0, _RET_IP_);
-	error = _xfs_log_force(mp, flags, NULL);
-	if (error)
-		xfs_warn(mp, "%s: error %d returned.", __func__, error);
+	_xfs_log_force(mp, flags, NULL);
 }
 
 /*
@@ -3466,12 +3462,8 @@ xfs_log_force_lsn(
 	xfs_lsn_t	lsn,
 	uint		flags)
 {
-	int	error;
-
 	trace_xfs_log_force(mp, lsn, _RET_IP_);
-	error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
-	if (error)
-		xfs_warn(mp, "%s: error %d returned.", __func__, error);
+	_xfs_log_force_lsn(mp, lsn, flags, NULL);
 }
 
 /*
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index b5e71072fde5..cc5a9f1574e7 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -124,7 +124,6 @@ struct xlog_ticket;
 struct xfs_log_item;
 struct xfs_item_ops;
 struct xfs_trans;
-struct xfs_log_callback;
 
 xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
 		       struct xlog_ticket *ticket,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index a4ab192e1792..82f1cbcc4de1 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -30,6 +30,9 @@
 #include "xfs_trans_priv.h"
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
+#include "xfs_trace.h"
+
+struct workqueue_struct *xfs_discard_wq;
 
 /*
  * Allocate a new ticket. Failing to get a new ticket makes it really hard to
@@ -491,6 +494,75 @@ xlog_cil_free_logvec(
 	}
 }
 
+static void
+xlog_discard_endio_work(
+	struct work_struct	*work)
+{
+	struct xfs_cil_ctx	*ctx =
+		container_of(work, struct xfs_cil_ctx, discard_endio_work);
+	struct xfs_mount	*mp = ctx->cil->xc_log->l_mp;
+
+	xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
+	kmem_free(ctx);
+}
+
+/*
+ * Queue up the actual completion to a thread to avoid IRQ-safe locking for
+ * pagb_lock.  Note that we need a unbounded workqueue, otherwise we might
+ * get the execution delayed up to 30 seconds for weird reasons.
+ */
+static void
+xlog_discard_endio(
+	struct bio		*bio)
+{
+	struct xfs_cil_ctx	*ctx = bio->bi_private;
+
+	INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work);
+	queue_work(xfs_discard_wq, &ctx->discard_endio_work);
+}
+
+static void
+xlog_discard_busy_extents(
+	struct xfs_mount	*mp,
+	struct xfs_cil_ctx	*ctx)
+{
+	struct list_head	*list = &ctx->busy_extents;
+	struct xfs_extent_busy	*busyp;
+	struct bio		*bio = NULL;
+	struct blk_plug		plug;
+	int			error = 0;
+
+	ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
+
+	blk_start_plug(&plug);
+	list_for_each_entry(busyp, list, list) {
+		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+					 busyp->length);
+
+		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+				XFS_FSB_TO_BB(mp, busyp->length),
+				GFP_NOFS, 0, &bio);
+		if (error && error != -EOPNOTSUPP) {
+			xfs_info(mp,
+	 "discard failed for extent [0x%llx,%u], error %d",
+				 (unsigned long long)busyp->bno,
+				 busyp->length,
+				 error);
+			break;
+		}
+	}
+
+	if (bio) {
+		bio->bi_private = ctx;
+		bio->bi_end_io = xlog_discard_endio;
+		submit_bio(bio);
+	} else {
+		xlog_discard_endio_work(&ctx->discard_endio_work);
+	}
+	blk_finish_plug(&plug);
+}
+
 /*
  * Mark all items committed and clear busy extents. We free the log vector
  * chains in a separate pass so that we unpin the log items as quickly as
@@ -525,14 +597,10 @@ xlog_cil_committed(
 
 	xlog_cil_free_logvec(ctx->lv_chain);
 
-	if (!list_empty(&ctx->busy_extents)) {
-		ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
-
-		xfs_discard_extents(mp, &ctx->busy_extents);
-		xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
-	}
-
-	kmem_free(ctx);
+	if (!list_empty(&ctx->busy_extents))
+		xlog_discard_busy_extents(mp, ctx);
+	else
+		kmem_free(ctx);
 }
 
 /*
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2b6eec52178e..c2604a5366f2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -257,6 +257,7 @@ struct xfs_cil_ctx {
 	struct xfs_log_vec	*lv_chain;	/* logvecs being pushed */
 	struct xfs_log_callback	log_cb;		/* completion callback hook. */
 	struct list_head	committing;	/* ctx committing list */
+	struct work_struct	discard_endio_work;
 };
 
 /*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 9b9540db17a6..450bde68bb75 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,6 +45,7 @@
 #include "xfs_rmap_btree.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_reflink.h"
+#include "xfs_extent_busy.h"
 
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -187,7 +188,7 @@ xfs_initialize_perag(
 	xfs_agnumber_t	*maxagi)
 {
 	xfs_agnumber_t	index;
-	xfs_agnumber_t	first_initialised = 0;
+	xfs_agnumber_t	first_initialised = NULLAGNUMBER;
 	xfs_perag_t	*pag;
 	int		error = -ENOMEM;
 
@@ -202,22 +203,21 @@ xfs_initialize_perag(
 			xfs_perag_put(pag);
 			continue;
 		}
-		if (!first_initialised)
-			first_initialised = index;
 
 		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
 		if (!pag)
-			goto out_unwind;
+			goto out_unwind_new_pags;
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
 		spin_lock_init(&pag->pag_ici_lock);
 		mutex_init(&pag->pag_ici_reclaim_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 		if (xfs_buf_hash_init(pag))
-			goto out_unwind;
+			goto out_free_pag;
+		init_waitqueue_head(&pag->pagb_wait);
 
 		if (radix_tree_preload(GFP_NOFS))
-			goto out_unwind;
+			goto out_hash_destroy;
 
 		spin_lock(&mp->m_perag_lock);
 		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
@@ -225,10 +225,13 @@ xfs_initialize_perag(
 			spin_unlock(&mp->m_perag_lock);
 			radix_tree_preload_end();
 			error = -EEXIST;
-			goto out_unwind;
+			goto out_hash_destroy;
 		}
 		spin_unlock(&mp->m_perag_lock);
 		radix_tree_preload_end();
+		/* first new pag is fully initialized */
+		if (first_initialised == NULLAGNUMBER)
+			first_initialised = index;
 	}
 
 	index = xfs_set_inode_alloc(mp, agcount);
@@ -239,11 +242,16 @@ xfs_initialize_perag(
 	mp->m_ag_prealloc_blocks = xfs_prealloc_blocks(mp);
 	return 0;
 
-out_unwind:
+out_hash_destroy:
 	xfs_buf_hash_destroy(pag);
+out_free_pag:
 	kmem_free(pag);
-	for (; index > first_initialised; index--) {
+out_unwind_new_pags:
+	/* unwind any prior newly initialized pags */
+	for (index = first_initialised; index < agcount; index++) {
 		pag = radix_tree_delete(&mp->m_perag_tree, index);
+		if (!pag)
+			break;
 		xfs_buf_hash_destroy(pag);
 		kmem_free(pag);
 	}
@@ -1073,6 +1081,13 @@ xfs_unmountfs(
 	xfs_log_force(mp, XFS_LOG_SYNC);
 
 	/*
+	 * Wait for all busy extents to be freed, including completion of
+	 * any discard operation.
+	 */
+	xfs_extent_busy_wait_all(mp);
+	flush_workqueue(xfs_discard_wq);
+
+	/*
 	 * We now need to tell the world we are unmounting. This will allow
 	 * us to detect that the filesystem is going away and we should error
 	 * out anything that we have been retrying in the background. This will
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 84f785218907..6db6fd6b82b0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -140,6 +140,7 @@ typedef struct xfs_mount {
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_dmevmask;	/* DMI events for this FS */
 	__uint64_t		m_flags;	/* global mount flags */
+	bool			m_inotbt_nores; /* no per-AG finobt resv. */
 	int			m_ialloc_inos;	/* inodes in inode allocation */
 	int			m_ialloc_blks;	/* blocks in inode allocation */
 	int			m_ialloc_min_blks;/* min blocks in sparse inode
@@ -199,11 +200,12 @@ typedef struct xfs_mount {
 	/*
 	 * DEBUG mode instrumentation to test and/or trigger delayed allocation
 	 * block killing in the event of failed writes. When enabled, all
-	 * buffered writes are forced to fail. All delalloc blocks in the range
-	 * of the write (including pre-existing delalloc blocks!) are tossed as
-	 * part of the write failure error handling sequence.
+	 * buffered writes are silenty dropped and handled as if they failed.
+	 * All delalloc blocks in the range of the write (including pre-existing
+	 * delalloc blocks!) are tossed as part of the write failure error
+	 * handling sequence.
 	 */
-	bool			m_fail_writes;
+	bool			m_drop_writes;
 #endif
 } xfs_mount_t;
 
@@ -324,13 +326,13 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 
 #ifdef DEBUG
 static inline bool
-xfs_mp_fail_writes(struct xfs_mount *mp)
+xfs_mp_drop_writes(struct xfs_mount *mp)
 {
-	return mp->m_fail_writes;
+	return mp->m_drop_writes;
 }
 #else
 static inline bool
-xfs_mp_fail_writes(struct xfs_mount *mp)
+xfs_mp_drop_writes(struct xfs_mount *mp)
 {
 	return 0;
 }
@@ -383,6 +385,8 @@ typedef struct xfs_perag {
 	xfs_agino_t	pagl_rightrec;
 	spinlock_t	pagb_lock;	/* lock for pagb_tree */
 	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
+	unsigned int	pagb_gen;	/* generation count for pagb_tree */
+	wait_queue_head_t pagb_wait;	/* woken when pagb_gen changes */
 
 	atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
 
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 45e50ea90769..b669b123287b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1177,7 +1177,8 @@ xfs_qm_dqusage_adjust(
 	 * the case in all other instances. It's OK that we do this because
 	 * quotacheck is done only at mount time.
 	 */
-	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
+	error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL,
+			 &ip);
 	if (error) {
 		*res = BULKSTAT_RV_NOTHING;
 		return error;
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index fe86a668a57e..6e4c7446c3d4 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -526,13 +526,14 @@ xfs_cui_recover(
 	xfs_refcount_finish_one_cleanup(tp, rcur, error);
 	error = xfs_defer_finish(&tp, &dfops, NULL);
 	if (error)
-		goto abort_error;
+		goto abort_defer;
 	set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
 	error = xfs_trans_commit(tp);
 	return error;
 
 abort_error:
 	xfs_refcount_finish_one_cleanup(tp, rcur, error);
+abort_defer:
 	xfs_defer_cancel(&dfops);
 	xfs_trans_cancel(tp);
 	return error;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 07593a362cd0..da6d08fb359c 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -82,11 +82,22 @@
  * mappings are a reservation against the free space in the filesystem;
  * adjacent mappings can also be combined into fewer larger mappings.
  *
+ * As an optimization, the CoW extent size hint (cowextsz) creates
+ * outsized aligned delalloc reservations in the hope of landing out of
+ * order nearby CoW writes in a single extent on disk, thereby reducing
+ * fragmentation and improving future performance.
+ *
+ * D: --RRRRRRSSSRRRRRRRR--- (data fork)
+ * C: ------DDDDDDD--------- (CoW fork)
+ *
  * When dirty pages are being written out (typically in writepage), the
- * delalloc reservations are converted into real mappings by allocating
- * blocks and replacing the delalloc mapping with real ones.  A delalloc
- * mapping can be replaced by several real ones if the free space is
- * fragmented.
+ * delalloc reservations are converted into unwritten mappings by
+ * allocating blocks and replacing the delalloc mapping with real ones.
+ * A delalloc mapping can be replaced by several unwritten ones if the
+ * free space is fragmented.
+ *
+ * D: --RRRRRRSSSRRRRRRRR---
+ * C: ------UUUUUUU---------
  *
  * We want to adapt the delalloc mechanism for copy-on-write, since the
  * write paths are similar.  The first two steps (creating the reservation
@@ -101,13 +112,29 @@
  * Block-aligned directio writes will use the same mechanism as buffered
  * writes.
  *
+ * Just prior to submitting the actual disk write requests, we convert
+ * the extents representing the range of the file actually being written
+ * (as opposed to extra pieces created for the cowextsize hint) to real
+ * extents.  This will become important in the next step:
+ *
+ * D: --RRRRRRSSSRRRRRRRR---
+ * C: ------UUrrUUU---------
+ *
  * CoW remapping must be done after the data block write completes,
  * because we don't want to destroy the old data fork map until we're sure
  * the new block has been written.  Since the new mappings are kept in a
  * separate fork, we can simply iterate these mappings to find the ones
  * that cover the file blocks that we just CoW'd.  For each extent, simply
  * unmap the corresponding range in the data fork, map the new range into
- * the data fork, and remove the extent from the CoW fork.
+ * the data fork, and remove the extent from the CoW fork.  Because of
+ * the presence of the cowextsize hint, however, we must be careful
+ * only to remap the blocks that we've actually written out --  we must
+ * never remap delalloc reservations nor CoW staging blocks that have
+ * yet to be written.  This corresponds exactly to the real extents in
+ * the CoW fork:
+ *
+ * D: --RRRRRRrrSRRRRRRRR---
+ * C: ------UU--UUU---------
  *
  * Since the remapping operation can be applied to an arbitrary file
  * range, we record the need for the remap step as a flag in the ioend
@@ -296,103 +323,165 @@ xfs_reflink_reserve_cow(
 	return 0;
 }
 
-/* Allocate all CoW reservations covering a range of blocks in a file. */
-static int
-__xfs_reflink_allocate_cow(
-	struct xfs_inode	*ip,
-	xfs_fileoff_t		*offset_fsb,
-	xfs_fileoff_t		end_fsb)
+/* Convert part of an unwritten CoW extent to a real one. */
+STATIC int
+xfs_reflink_convert_cow_extent(
+	struct xfs_inode		*ip,
+	struct xfs_bmbt_irec		*imap,
+	xfs_fileoff_t			offset_fsb,
+	xfs_filblks_t			count_fsb,
+	struct xfs_defer_ops		*dfops)
 {
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_bmbt_irec	imap;
-	struct xfs_defer_ops	dfops;
-	struct xfs_trans	*tp;
-	xfs_fsblock_t		first_block;
-	int			nimaps = 1, error;
-	bool			shared;
-
-	xfs_defer_init(&dfops, &first_block);
+	xfs_fsblock_t			first_block;
+	int				nimaps = 1;
 
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
-			XFS_TRANS_RESERVE, &tp);
-	if (error)
-		return error;
+	if (imap->br_state == XFS_EXT_NORM)
+		return 0;
 
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trim_extent(imap, offset_fsb, count_fsb);
+	trace_xfs_reflink_convert_cow(ip, imap);
+	if (imap->br_blockcount == 0)
+		return 0;
+	return xfs_bmapi_write(NULL, ip, imap->br_startoff, imap->br_blockcount,
+			XFS_BMAPI_COWFORK | XFS_BMAPI_CONVERT, &first_block,
+			0, imap, &nimaps, dfops);
+}
 
-	/* Read extent from the source file. */
-	nimaps = 1;
-	error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb,
-			&imap, &nimaps, 0);
-	if (error)
-		goto out_unlock;
-	ASSERT(nimaps == 1);
+/* Convert all of the unwritten CoW extents in a file's range to real ones. */
+int
+xfs_reflink_convert_cow(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		count)
+{
+	struct xfs_bmbt_irec	got;
+	struct xfs_defer_ops	dfops;
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
+	xfs_extnum_t		idx;
+	bool			found;
+	int			error = 0;
 
-	error = xfs_reflink_reserve_cow(ip, &imap, &shared);
-	if (error)
-		goto out_trans_cancel;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
 
-	if (!shared) {
-		*offset_fsb = imap.br_startoff + imap.br_blockcount;
-		goto out_trans_cancel;
+	/* Convert all the extents to real from unwritten. */
+	for (found = xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got);
+	     found && got.br_startoff < end_fsb;
+	     found = xfs_iext_get_extent(ifp, ++idx, &got)) {
+		error = xfs_reflink_convert_cow_extent(ip, &got, offset_fsb,
+				end_fsb - offset_fsb, &dfops);
+		if (error)
+			break;
 	}
 
-	xfs_trans_ijoin(tp, ip, 0);
-	error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount,
-			XFS_BMAPI_COWFORK, &first_block,
-			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
-			&imap, &nimaps, &dfops);
-	if (error)
-		goto out_trans_cancel;
-
-	error = xfs_defer_finish(&tp, &dfops, NULL);
-	if (error)
-		goto out_trans_cancel;
-
-	error = xfs_trans_commit(tp);
-
-	*offset_fsb = imap.br_startoff + imap.br_blockcount;
-out_unlock:
+	/* Finish up. */
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	return error;
-out_trans_cancel:
-	xfs_defer_cancel(&dfops);
-	xfs_trans_cancel(tp);
-	goto out_unlock;
 }
 
-/* Allocate all CoW reservations covering a part of a file. */
+/* Allocate all CoW reservations covering a range of blocks in a file. */
 int
-xfs_reflink_allocate_cow_range(
+xfs_reflink_allocate_cow(
 	struct xfs_inode	*ip,
-	xfs_off_t		offset,
-	xfs_off_t		count)
+	struct xfs_bmbt_irec	*imap,
+	bool			*shared,
+	uint			*lockmode)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
-	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
-	int			error;
+	xfs_fileoff_t		offset_fsb = imap->br_startoff;
+	xfs_filblks_t		count_fsb = imap->br_blockcount;
+	struct xfs_bmbt_irec	got;
+	struct xfs_defer_ops	dfops;
+	struct xfs_trans	*tp = NULL;
+	xfs_fsblock_t		first_block;
+	int			nimaps, error = 0;
+	bool			trimmed;
+	xfs_filblks_t		resaligned;
+	xfs_extlen_t		resblks = 0;
+	xfs_extnum_t		idx;
 
+retry:
 	ASSERT(xfs_is_reflink_inode(ip));
-
-	trace_xfs_reflink_allocate_cow_range(ip, offset, count);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
 
 	/*
-	 * Make sure that the dquots are there.
+	 * Even if the extent is not shared we might have a preallocation for
+	 * it in the COW fork.  If so use it.
 	 */
-	error = xfs_qm_dqattach(ip, 0);
-	if (error)
-		return error;
+	if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &idx, &got) &&
+	    got.br_startoff <= offset_fsb) {
+		*shared = true;
 
-	while (offset_fsb < end_fsb) {
-		error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb);
-		if (error) {
-			trace_xfs_reflink_allocate_cow_range_error(ip, error,
-					_RET_IP_);
-			break;
+		/* If we have a real allocation in the COW fork we're done. */
+		if (!isnullstartblock(got.br_startblock)) {
+			xfs_trim_extent(&got, offset_fsb, count_fsb);
+			*imap = got;
+			goto convert;
 		}
+
+		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+	} else {
+		error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
+		if (error || !*shared)
+			goto out;
+	}
+
+	if (!tp) {
+		resaligned = xfs_aligned_fsb_count(imap->br_startoff,
+			imap->br_blockcount, xfs_get_cowextsz_hint(ip));
+		resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+
+		xfs_iunlock(ip, *lockmode);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+		*lockmode = XFS_ILOCK_EXCL;
+		xfs_ilock(ip, *lockmode);
+
+		if (error)
+			return error;
+
+		error = xfs_qm_dqattach_locked(ip, 0);
+		if (error)
+			goto out;
+		goto retry;
 	}
 
+	error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
+			XFS_QMOPT_RES_REGBLKS);
+	if (error)
+		goto out;
+
+	xfs_trans_ijoin(tp, ip, 0);
+
+	xfs_defer_init(&dfops, &first_block);
+	nimaps = 1;
+
+	/* Allocate the entire reservation as unwritten blocks. */
+	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
+			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block,
+			resblks, imap, &nimaps, &dfops);
+	if (error)
+		goto out_bmap_cancel;
+
+	/* Finish up. */
+	error = xfs_defer_finish(&tp, &dfops, NULL);
+	if (error)
+		goto out_bmap_cancel;
+
+	error = xfs_trans_commit(tp);
+	if (error)
+		return error;
+convert:
+	return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb,
+			&dfops);
+out_bmap_cancel:
+	xfs_defer_cancel(&dfops);
+	xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
+			XFS_QMOPT_RES_REGBLKS);
+out:
+	if (tp)
+		xfs_trans_cancel(tp);
 	return error;
 }
 
@@ -641,6 +730,16 @@ xfs_reflink_end_cow(
 
 		ASSERT(!isnullstartblock(got.br_startblock));
 
+		/*
+		 * Don't remap unwritten extents; these are
+		 * speculatively preallocated CoW extents that have been
+		 * allocated but have not yet been involved in a write.
+		 */
+		if (got.br_state == XFS_EXT_UNWRITTEN) {
+			idx--;
+			goto next_extent;
+		}
+
 		/* Unmap the old blocks in the data fork. */
 		xfs_defer_init(&dfops, &firstfsb);
 		rlen = del.br_blockcount;
@@ -855,13 +954,14 @@ STATIC int
 xfs_reflink_update_dest(
 	struct xfs_inode	*dest,
 	xfs_off_t		newlen,
-	xfs_extlen_t		cowextsize)
+	xfs_extlen_t		cowextsize,
+	bool			is_dedupe)
 {
 	struct xfs_mount	*mp = dest->i_mount;
 	struct xfs_trans	*tp;
 	int			error;
 
-	if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+	if (is_dedupe && newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
 		return 0;
 
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
@@ -882,6 +982,10 @@ xfs_reflink_update_dest(
 		dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
 	}
 
+	if (!is_dedupe) {
+		xfs_trans_ichgtime(tp, dest,
+				   XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	}
 	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
 
 	error = xfs_trans_commit(tp);
@@ -1195,7 +1299,8 @@ xfs_reflink_remap_range(
 	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
 		cowextsize = src->i_d.di_cowextsize;
 
-	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize);
+	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
+			is_dedupe);
 
 out_unlock:
 	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index aa6a4d64bd35..33ac9b8db683 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -28,8 +28,10 @@ extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
 
 extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
 		struct xfs_bmbt_irec *imap, bool *shared);
-extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
-		xfs_off_t offset, xfs_off_t count);
+extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
+		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
+extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t count);
 extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
 		struct xfs_bmbt_irec *imap);
 extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 802bcc326d9f..c57aa7f18087 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1093,7 +1093,6 @@ xfs_rtallocate_extent(
 	xfs_extlen_t	minlen,		/* minimum length to allocate */
 	xfs_extlen_t	maxlen,		/* maximum length to allocate */
 	xfs_extlen_t	*len,		/* out: actual length allocated */
-	xfs_alloctype_t	type,		/* allocation type XFS_ALLOCTYPE... */
 	int		wasdel,		/* was a delayed allocation extent */
 	xfs_extlen_t	prod,		/* extent product factor */
 	xfs_rtblock_t	*rtblock)	/* out: start block allocated */
@@ -1123,27 +1122,16 @@ xfs_rtallocate_extent(
 		}
 	}
 
+retry:
 	sumbp = NULL;
-	/*
-	 * Allocate by size, or near another block, or exactly at some block.
-	 */
-	switch (type) {
-	case XFS_ALLOCTYPE_ANY_AG:
+	if (bno == 0) {
 		error = xfs_rtallocate_extent_size(mp, tp, minlen, maxlen, len,
 				&sumbp,	&sb, prod, &r);
-		break;
-	case XFS_ALLOCTYPE_NEAR_BNO:
+	} else {
 		error = xfs_rtallocate_extent_near(mp, tp, bno, minlen, maxlen,
 				len, &sumbp, &sb, prod, &r);
-		break;
-	case XFS_ALLOCTYPE_THIS_BNO:
-		error = xfs_rtallocate_extent_exact(mp, tp, bno, minlen, maxlen,
-				len, &sumbp, &sb, prod, &r);
-		break;
-	default:
-		error = -EIO;
-		ASSERT(0);
 	}
+
 	if (error)
 		return error;
 
@@ -1158,7 +1146,11 @@ xfs_rtallocate_extent(
 			xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
 		else
 			xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
+	} else if (prod > 1) {
+		prod = 1;
+		goto retry;
 	}
+
 	*rtblock = r;
 	return 0;
 }
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 355dd9e1cb64..51dd3c726608 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -40,7 +40,6 @@ xfs_rtallocate_extent(
 	xfs_extlen_t		minlen,	/* minimum length to allocate */
 	xfs_extlen_t		maxlen,	/* maximum length to allocate */
 	xfs_extlen_t		*len,	/* out: actual length allocated */
-	xfs_alloctype_t		type,	/* allocation type XFS_ALLOCTYPE... */
 	int			wasdel,	/* was a delayed allocation extent */
 	xfs_extlen_t		prod,	/* extent product factor */
 	xfs_rtblock_t		*rtblock); /* out: start block allocated */
@@ -122,7 +121,7 @@ int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
 
 
 #else
-# define xfs_rtallocate_extent(t,b,min,max,l,a,f,p,rb)  (ENOSYS)
+# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb)    (ENOSYS)
 # define xfs_rtfree_extent(t,b,l)                       (ENOSYS)
 # define xfs_rtpick_extent(m,t,l,rb)                    (ENOSYS)
 # define xfs_growfs_rt(mp,in)                           (ENOSYS)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index eecbaac08eba..890862f2447c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1956,12 +1956,20 @@ xfs_init_workqueues(void)
 	if (!xfs_alloc_wq)
 		return -ENOMEM;
 
+	xfs_discard_wq = alloc_workqueue("xfsdiscard", WQ_UNBOUND, 0);
+	if (!xfs_discard_wq)
+		goto out_free_alloc_wq;
+
 	return 0;
+out_free_alloc_wq:
+	destroy_workqueue(xfs_alloc_wq);
+	return -ENOMEM;
 }
 
 STATIC void
 xfs_destroy_workqueues(void)
 {
+	destroy_workqueue(xfs_discard_wq);
 	destroy_workqueue(xfs_alloc_wq);
 }
 
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index b6418abd85ad..5f2f32408011 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -73,6 +73,8 @@ extern const struct quotactl_ops xfs_quotactl_operations;
 
 extern void xfs_reinit_percpu_counters(struct xfs_mount *mp);
 
+extern struct workqueue_struct *xfs_discard_wq;
+
 #define XFS_M(sb)		((struct xfs_mount *)((sb)->s_fs_info))
 
 #endif	/* __XFS_SUPER_H__ */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 276d3023d60f..80ac15fb9638 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -93,7 +93,7 @@ to_mp(struct kobject *kobject)
 #ifdef DEBUG
 
 STATIC ssize_t
-fail_writes_store(
+drop_writes_store(
 	struct kobject		*kobject,
 	const char		*buf,
 	size_t			count)
@@ -107,9 +107,9 @@ fail_writes_store(
 		return ret;
 
 	if (val == 1)
-		mp->m_fail_writes = true;
+		mp->m_drop_writes = true;
 	else if (val == 0)
-		mp->m_fail_writes = false;
+		mp->m_drop_writes = false;
 	else
 		return -EINVAL;
 
@@ -117,21 +117,21 @@ fail_writes_store(
 }
 
 STATIC ssize_t
-fail_writes_show(
+drop_writes_show(
 	struct kobject		*kobject,
 	char			*buf)
 {
 	struct xfs_mount	*mp = to_mp(kobject);
 
-	return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_writes ? 1 : 0);
+	return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_drop_writes ? 1 : 0);
 }
-XFS_SYSFS_ATTR_RW(fail_writes);
+XFS_SYSFS_ATTR_RW(drop_writes);
 
 #endif /* DEBUG */
 
 static struct attribute *xfs_mp_attrs[] = {
 #ifdef DEBUG
-	ATTR_LIST(fail_writes),
+	ATTR_LIST(drop_writes),
 #endif
 	NULL,
 };
@@ -396,7 +396,7 @@ max_retries_show(
 	int		retries;
 	struct xfs_error_cfg *cfg = to_error_cfg(kobject);
 
-	if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER)
+	if (cfg->max_retries == XFS_ERR_RETRY_FOREVER)
 		retries = -1;
 	else
 		retries = cfg->max_retries;
@@ -422,7 +422,7 @@ max_retries_store(
 		return -EINVAL;
 
 	if (val == -1)
-		cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+		cfg->max_retries = XFS_ERR_RETRY_FOREVER;
 	else
 		cfg->max_retries = val;
 	return count;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 69c5bcd9a51b..fb7555e73a62 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2245,7 +2245,6 @@ DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range);
 
 /* deferred ops */
 struct xfs_defer_pending;
-struct xfs_defer_intake;
 struct xfs_defer_ops;
 
 DECLARE_EVENT_CLASS(xfs_defer_class,
@@ -3089,6 +3088,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class,
 		__field(xfs_fileoff_t, lblk)
 		__field(xfs_extlen_t, len)
 		__field(xfs_fsblock_t, pblk)
+		__field(int, state)
 	),
 	TP_fast_assign(
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
@@ -3096,13 +3096,15 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class,
 		__entry->lblk = irec->br_startoff;
 		__entry->len = irec->br_blockcount;
 		__entry->pblk = irec->br_startblock;
+		__entry->state = irec->br_state;
 	),
-	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu",
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __entry->lblk,
 		  __entry->len,
-		  __entry->pblk)
+		  __entry->pblk,
+		  __entry->state)
 );
 #define DEFINE_INODE_IREC_EVENT(name) \
 DEFINE_EVENT(xfs_inode_irec_class, name, \
@@ -3242,11 +3244,11 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
 
 DEFINE_RW_EVENT(xfs_reflink_reserve_cow);
-DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
 
-DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write);
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_bounce_dio_write);
 DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec);
 
@@ -3254,7 +3256,6 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
 
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
 
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 61b7fbdd3ebd..1646f659b60f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -32,7 +32,6 @@ struct xfs_mount;
 struct xfs_trans;
 struct xfs_trans_res;
 struct xfs_dquot_acct;
-struct xfs_busy_extent;
 struct xfs_rud_log_item;
 struct xfs_rui_log_item;
 struct xfs_btree_cur;