From 322b2b9032f4beba6f1c4158852a5a5b9ab841d7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 11 Jan 2013 16:39:51 -0500 Subject: Revert "NFS: add nfs_sb_deactive_async to avoid deadlock" This reverts commit 324d003b0cd82151adbaecefef57b73f7959a469. The deadlock turned out to be caused by a workqueue limitation that has now been worked around in the RPC code (see comment in rpc_free_task). Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 5 +---- fs/nfs/internal.h | 1 - fs/nfs/nfs4proc.c | 2 +- fs/nfs/super.c | 49 ------------------------------------------------- fs/nfs/unlink.c | 2 +- 5 files changed, 3 insertions(+), 56 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ebeb94ce1b0b..6acc73c80d7f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -694,10 +694,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) if (ctx->cred != NULL) put_rpccred(ctx->cred); dput(ctx->dentry); - if (is_sync) - nfs_sb_deactive(sb); - else - nfs_sb_deactive_async(sb); + nfs_sb_deactive(sb); kfree(ctx->mdsthreshold); kfree(ctx); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f0e6c7df1a07..541c9ebdbc5a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -329,7 +329,6 @@ extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern void nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); -extern void nfs_sb_deactive_async(struct super_block *sb); /* namespace.c */ #define NFS_PATH_CANONICAL 1 diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cf747ef86650..62fa4001e47a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2088,7 +2088,7 @@ static void nfs4_free_closedata(void *data) nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); - nfs_sb_deactive_async(sb); + nfs_sb_deactive(sb); kfree(calldata); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b056b1628722..befbae0cce41 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -54,7 +54,6 @@ #include #include #include -#include #include @@ -418,54 +417,6 @@ void nfs_sb_deactive(struct super_block *sb) } EXPORT_SYMBOL_GPL(nfs_sb_deactive); -static int nfs_deactivate_super_async_work(void *ptr) -{ - struct super_block *sb = ptr; - - deactivate_super(sb); - module_put_and_exit(0); - return 0; -} - -/* - * same effect as deactivate_super, but will do final unmount in kthread - * context - */ -static void nfs_deactivate_super_async(struct super_block *sb) -{ - struct task_struct *task; - char buf[INET6_ADDRSTRLEN + 1]; - struct nfs_server *server = NFS_SB(sb); - struct nfs_client *clp = server->nfs_client; - - if (!atomic_add_unless(&sb->s_active, -1, 1)) { - rcu_read_lock(); - snprintf(buf, sizeof(buf), - rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); - rcu_read_unlock(); - - __module_get(THIS_MODULE); - task = kthread_run(nfs_deactivate_super_async_work, sb, - "%s-deactivate-super", buf); - if (IS_ERR(task)) { - pr_err("%s: kthread_run: %ld\n", - __func__, PTR_ERR(task)); - /* make synchronous call and hope for the best */ - deactivate_super(sb); - module_put(THIS_MODULE); - } - } -} - -void nfs_sb_deactive_async(struct super_block *sb) -{ - struct nfs_server *server = NFS_SB(sb); - - if (atomic_dec_and_test(&server->active)) - nfs_deactivate_super_async(sb); -} -EXPORT_SYMBOL_GPL(nfs_sb_deactive_async); - /* * Deliver file system statistics to userspace */ diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 
3f79c77153b8..13cea637eff8 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata) nfs_dec_sillycount(data->dir); nfs_free_unlinkdata(data); - nfs_sb_deactive_async(sb); + nfs_sb_deactive(sb); } static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) -- cgit v1.2.3 From c137afabe330f64eddcd4dd281258807e27fd430 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 7 Feb 2013 14:41:11 -0500 Subject: NFSv4: Allow the state manager to mark an open_owner as being recovered This patch adds a seqcount_t lock for use by the state manager to signal that an open owner has been recovered. This mechanism will be used by the delegation, open and byte range lock code in order to figure out if they need to replay requests due to collisions with lock recovery. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 3 +++ fs/nfs/nfs4state.c | 10 +++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a3f488b074a2..b12b73472020 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -13,6 +13,8 @@ #define NFS4_MAX_LOOP_ON_RECOVER (10) +#include + struct idmap; enum nfs4_client_state { @@ -90,6 +92,7 @@ struct nfs4_state_owner { unsigned long so_flags; struct list_head so_states; struct nfs_seqid_counter so_seqid; + seqcount_t so_reclaim_seqcount; }; enum { diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e61f68d5ef21..fff97228cdec 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -518,6 +518,7 @@ nfs4_alloc_state_owner(struct nfs_server *server, nfs4_init_seqid_counter(&sp->so_seqid); atomic_set(&sp->so_count, 1); INIT_LIST_HEAD(&sp->so_lru); + seqcount_init(&sp->so_reclaim_seqcount); return sp; } @@ -1390,8 +1391,9 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs * recovering after a network partition or a reboot from a * server that doesn't support a grace period. */ -restart: spin_lock(&sp->so_lock); + write_seqcount_begin(&sp->so_reclaim_seqcount); +restart: list_for_each_entry(state, &sp->so_states, open_states) { if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) continue; @@ -1412,6 +1414,7 @@ restart: } spin_unlock(&state->state_lock); nfs4_put_open_state(state); + spin_lock(&sp->so_lock); goto restart; } } @@ -1449,12 +1452,17 @@ restart: goto out_err; } nfs4_put_open_state(state); + spin_lock(&sp->so_lock); goto restart; } + write_seqcount_end(&sp->so_reclaim_seqcount); spin_unlock(&sp->so_lock); return 0; out_err: nfs4_put_open_state(state); + spin_lock(&sp->so_lock); + write_seqcount_end(&sp->so_reclaim_seqcount); + spin_unlock(&sp->so_lock); return status; } -- cgit v1.2.3 From 9a99af494bd7141d567d00b5ef94b141821e158c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Feb 2013 20:17:49 -0500 Subject: NFSv4.1: Prevent deadlocks between state recovery and file locking We currently have a deadlock in which the state recovery thread ends up blocking due to one of the locks which it is trying to recover holding the nfs_inode->rwsem. The situation is as follows: the state recovery thread is scheduled in order to recover from a reboot. It immediately drains the session, forcing all ordinary NFSv4.1 calls to nfs41_setup_sequence() to be put to sleep. This includes the file locking process that holds the nfs_inode->rwsem. When the thread gets to nfs4_reclaim_locks(), it tries to grab a write lock on nfs_inode->rwsem, and boom... 
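The so_reclaim_seqcount added by the previous patch is the tool used to get out of this. The state manager bumps the sequence count around open-owner reclaim, so a path that had to drop its locks while an RPC was in flight can cheaply detect that recovery ran in the meantime and retry. A minimal sketch of that reader/writer pattern, assuming only the standard kernel seqcount helpers; the RPC call shown is a hypothetical stand-in, not a real function:

    #include <linux/seqlock.h>

    /* Writer side: the state manager marks the reclaim window. */
    static void mark_reclaim(struct nfs4_state_owner *sp)
    {
            spin_lock(&sp->so_lock);
            write_seqcount_begin(&sp->so_reclaim_seqcount);
            /* ... replay the OPEN/LOCK state held by this open owner ... */
            write_seqcount_end(&sp->so_reclaim_seqcount);
            spin_unlock(&sp->so_lock);
    }

    /* Reader side: sample before dropping locks, re-check afterwards. */
    static int locked_operation(struct nfs4_state_owner *sp)
    {
            unsigned int seq;
            int status;

            seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
            status = send_rpc_with_locks_dropped();  /* hypothetical stand-in */
            if (status == 0 &&
                read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
                    status = -EAGAIN;  /* recovery ran; the caller must retry */
            return status;
    }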
Fix is to have the lock drop the nfs_inode->rwsem while it is doing RPC calls. We use a sequence lock in order to signal to the locking process whether or not a state recovery thread has run on that inode, in which case it should retry the lock. Reported-by: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 62fa4001e47a..92584c1ea725 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4813,8 +4813,10 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) { + struct nfs4_state_owner *sp = state->owner; struct nfs_inode *nfsi = NFS_I(state->inode); unsigned char fl_flags = request->fl_flags; + unsigned int seq; int status = -ENOLCK; if ((fl_flags & FL_POSIX) && @@ -4836,9 +4838,16 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock status = do_vfs_lock(request->fl_file, request); goto out_unlock; } + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + up_read(&nfsi->rwsem); status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); if (status != 0) + goto out; + down_read(&nfsi->rwsem); + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { + status = -NFS4ERR_DELAY; goto out_unlock; + } /* Note: we always want to sleep here! */ request->fl_flags = fl_flags | FL_SLEEP; if (do_vfs_lock(request->fl_file, request) < 0) -- cgit v1.2.3 From d25be546a86fcb2ea919fb909a42b79aa40b816c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Feb 2013 11:43:28 -0500 Subject: NFSv4.1: Don't lose locks when a server reboots during delegation return If the server reboots while we are converting a delegation into OPEN/LOCK stateids as part of a delegation return, the current code will simply exit with an error. This causes us to lose both delegation state and locking state (i.e. locking atomicity). Deal with this by exposing the delegation stateid during delegation return, so that we can recover the delegation, and then resume open/lock recovery. Note that not having to hold the nfs_inode->rwsem across the calls to nfs_delegation_claim_opens() also fixes a deadlock against the NFSv4.1 reboot recovery code. 
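The resulting control flow for a delegation return is a small state machine: mark the delegation as RETURNING before touching any open or lock state, retry the claims while reboot recovery is running, and only detach the delegation once the claims have succeeded; on failure the flag is cleared and the work is handed back to the state manager. A simplified sketch of that flow, condensed from the patch below (the real function is nfs_end_delegation_return; locking and reference-counting details are omitted here):

    static int return_delegation(struct inode *inode, bool issync)
    {
            struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
            struct nfs_delegation *delegation;
            int err;

            /* Sets NFS_DELEGATION_RETURNING so new opens stop using it. */
            delegation = nfs_start_delegation_return(NFS_I(inode));
            if (delegation == NULL)
                    return 0;
            do {
                    err = nfs_delegation_claim_opens(inode, &delegation->stateid);
                    if (!issync || err != -EAGAIN)
                            break;
                    /* A server reboot interrupted the claims: wait for
                     * recovery to finish, then replay them. */
                    err = nfs4_wait_clnt_recover(clp);
            } while (err == 0);
            if (err) {
                    /* Give up for now; the state manager will retry later. */
                    nfs_abort_delegation_return(delegation, clp);
                    return err;
            }
            if (!nfs_detach_delegation(NFS_I(inode), delegation, NFS_SERVER(inode)))
                    return 0;
            return nfs_do_return_delegation(inode, delegation, issync);
    }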
Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 147 +++++++++++++++++++++++++++++++++++++--------------- fs/nfs/delegation.h | 1 + fs/nfs/nfs4proc.c | 3 ++ 3 files changed, 108 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 81c5eec3cf38..2542cdaa1116 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -55,7 +55,8 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags) flags &= FMODE_READ|FMODE_WRITE; rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (delegation != NULL && (delegation->type & flags) == flags) { + if (delegation != NULL && (delegation->type & flags) == flags && + !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { nfs_mark_delegation_referenced(delegation); ret = 1; } @@ -94,7 +95,9 @@ static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *s { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *ctx; + struct nfs4_state_owner *sp; struct nfs4_state *state; + unsigned int seq; int err; again: @@ -109,9 +112,13 @@ again: continue; get_nfs_open_context(ctx); spin_unlock(&inode->i_lock); + sp = state->owner; + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid); - if (err >= 0) + if (!err) err = nfs_delegation_claim_locks(ctx, state); + if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) + err = -EAGAIN; put_nfs_open_context(ctx); if (err != 0) return err; @@ -181,40 +188,92 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation return inode; } +static struct nfs_delegation * +nfs_start_delegation_return_locked(struct nfs_inode *nfsi) +{ + struct nfs_delegation *ret = NULL; + struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); + + if (delegation == NULL) + goto out; + spin_lock(&delegation->lock); + if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + ret = delegation; + spin_unlock(&delegation->lock); +out: + return ret; +} + +static struct nfs_delegation * +nfs_start_delegation_return(struct nfs_inode *nfsi) +{ + struct nfs_delegation *delegation; + + rcu_read_lock(); + delegation = nfs_start_delegation_return_locked(nfsi); + rcu_read_unlock(); + return delegation; +} + +static void +nfs_abort_delegation_return(struct nfs_delegation *delegation, + struct nfs_client *clp) +{ + + spin_lock(&delegation->lock); + clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + spin_unlock(&delegation->lock); + set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); +} + static struct nfs_delegation * nfs_detach_delegation_locked(struct nfs_inode *nfsi, - struct nfs_server *server) + struct nfs_delegation *delegation, + struct nfs_client *clp) { - struct nfs_delegation *delegation = + struct nfs_delegation *deleg_cur = rcu_dereference_protected(nfsi->delegation, - lockdep_is_held(&server->nfs_client->cl_lock)); + lockdep_is_held(&clp->cl_lock)); - if (delegation == NULL) - goto nomatch; + if (deleg_cur == NULL || delegation != deleg_cur) + return NULL; spin_lock(&delegation->lock); + set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); list_del_rcu(&delegation->super_list); delegation->inode = NULL; nfsi->delegation_state = 0; rcu_assign_pointer(nfsi->delegation, NULL); spin_unlock(&delegation->lock); return delegation; -nomatch: - return NULL; } static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, - struct nfs_server *server) + 
struct nfs_delegation *delegation, + struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; - struct nfs_delegation *delegation; spin_lock(&clp->cl_lock); - delegation = nfs_detach_delegation_locked(nfsi, server); + delegation = nfs_detach_delegation_locked(nfsi, delegation, clp); spin_unlock(&clp->cl_lock); return delegation; } +static struct nfs_delegation * +nfs_inode_detach_delegation(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_delegation *delegation; + + delegation = nfs_start_delegation_return(nfsi); + if (delegation == NULL) + return NULL; + return nfs_detach_delegation(nfsi, delegation, server); +} + /** * nfs_inode_set_delegation - set up a delegation on an inode * @inode: inode to which delegation applies @@ -268,7 +327,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct delegation = NULL; goto out; } - freeme = nfs_detach_delegation_locked(nfsi, server); + freeme = nfs_detach_delegation_locked(nfsi, + old_delegation, clp); + if (freeme == NULL) + goto out; } list_add_rcu(&delegation->super_list, &server->delegations); nfsi->delegation_state = delegation->type; @@ -292,19 +354,29 @@ out: /* * Basic procedure for returning a delegation to the server */ -static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) +static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { + struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_inode *nfsi = NFS_I(inode); int err; - /* - * Guard against new delegated open/lock/unlock calls and against - * state recovery - */ - down_write(&nfsi->rwsem); - err = nfs_delegation_claim_opens(inode, &delegation->stateid); - up_write(&nfsi->rwsem); - if (err) + if (delegation == NULL) + return 0; + do { + err = nfs_delegation_claim_opens(inode, &delegation->stateid); + if (!issync || err != -EAGAIN) + break; + /* + * Guard against state recovery + */ + err = nfs4_wait_clnt_recover(clp); + } while (err == 0); + + if (err) { + nfs_abort_delegation_return(delegation, clp); + goto out; + } + if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode))) goto out; err = nfs_do_return_delegation(inode, delegation, issync); @@ -340,13 +412,10 @@ restart: inode = nfs_delegation_grab_inode(delegation); if (inode == NULL) continue; - delegation = nfs_detach_delegation(NFS_I(inode), - server); + delegation = nfs_start_delegation_return_locked(NFS_I(inode)); rcu_read_unlock(); - if (delegation != NULL) - err = __nfs_inode_return_delegation(inode, - delegation, 0); + err = nfs_end_delegation_return(inode, delegation, 0); iput(inode); if (!err) goto restart; @@ -367,15 +436,11 @@ restart: */ void nfs_inode_return_delegation_noreclaim(struct inode *inode) { - struct nfs_server *server = NFS_SERVER(inode); - struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - if (rcu_access_pointer(nfsi->delegation) != NULL) { - delegation = nfs_detach_delegation(nfsi, server); - if (delegation != NULL) - nfs_do_return_delegation(inode, delegation, 0); - } + delegation = nfs_inode_detach_delegation(inode); + if (delegation != NULL) + nfs_do_return_delegation(inode, delegation, 0); } /** @@ -390,18 +455,14 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) */ int nfs4_inode_return_delegation(struct inode *inode) { - struct nfs_server *server = NFS_SERVER(inode); struct nfs_inode *nfsi = NFS_I(inode); struct 
nfs_delegation *delegation; int err = 0; nfs_wb_all(inode); - if (rcu_access_pointer(nfsi->delegation) != NULL) { - delegation = nfs_detach_delegation(nfsi, server); - if (delegation != NULL) { - err = __nfs_inode_return_delegation(inode, delegation, 1); - } - } + delegation = nfs_start_delegation_return(nfsi); + if (delegation != NULL) + err = nfs_end_delegation_return(inode, delegation, 1); return err; } @@ -471,7 +532,7 @@ void nfs_remove_bad_delegation(struct inode *inode) { struct nfs_delegation *delegation; - delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode)); + delegation = nfs_inode_detach_delegation(inode); if (delegation) { nfs_inode_find_state_and_recover(inode, &delegation->stateid); nfs_free_delegation(delegation); @@ -649,7 +710,7 @@ restart: if (inode == NULL) continue; delegation = nfs_detach_delegation(NFS_I(inode), - server); + delegation, server); rcu_read_unlock(); if (delegation != NULL) diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index bbc6a4dba0d8..d54d4fca6793 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -29,6 +29,7 @@ enum { NFS_DELEGATION_NEED_RECLAIM = 0, NFS_DELEGATION_RETURN, NFS_DELEGATION_REFERENCED, + NFS_DELEGATION_RETURNING, }; int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 92584c1ea725..a1e89e270ad8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -896,6 +896,8 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) return 0; if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) return 0; + if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) + return 0; nfs_mark_delegation_referenced(delegation); return 1; } @@ -973,6 +975,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat spin_lock(&deleg_cur->lock); if (nfsi->delegation != deleg_cur || + test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) || (deleg_cur->type & fmode) != fmode) goto no_delegation_unlock; -- cgit v1.2.3 From 37380e4264dbda9753e470a30d4322097aab7152 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Feb 2013 12:35:16 -0500 Subject: NFSv4: Fix up the return values of nfs4_open_delegation_recall Adjust the return values so that they return EAGAIN to the caller in cases where we might want to retry the delegation recall after the state recovery has run. Note that we can't wait and retry in this routine, because the caller may be the state manager thread. If delegation recall fails due to a session or reboot related issue, also ensure that we mark the stateid as delegated so that nfs_delegation_claim_opens can find it again later. 
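The pattern is to separate "this stateid is gone, give up" from "retry once recovery has run": errors that lease or session recovery will fix are converted to -EAGAIN after re-marking the state as delegated, so that nfs_delegation_claim_opens can find the stateid and replay the recall later. A condensed view of that classification (the full list of error codes is in the diff below; an expired delegation is deliberately not re-marked, since it was lost):

    switch (err) {
    case -NFS4ERR_BADSESSION:
    case -NFS4ERR_DEADSESSION:
            /* Session trouble: keep the stateid marked as delegated,
             * kick session recovery, and ask the caller to retry. */
            set_bit(NFS_DELEGATED_STATE, &state->flags);
            nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
            err = -EAGAIN;
            break;
    case -NFS4ERR_STALE_CLIENTID:
    case -NFS4ERR_STALE_STATEID:
            set_bit(NFS_DELEGATED_STATE, &state->flags);
            /* fall through */
    case -NFS4ERR_EXPIRED:
            /* Lease trouble: recover the lease, then retry the recall. */
            nfs4_schedule_lease_recovery(server->nfs_client);
            err = -EAGAIN;
            break;
    case -NFS4ERR_BAD_STATEID:
            /* The delegation itself has been revoked; nothing to recall. */
            err = 0;
            break;
    }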
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index a1e89e270ad8..f7e05ade5572 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1355,19 +1355,18 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + err = -EAGAIN; goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); case -NFS4ERR_EXPIRED: /* Don't recall a delegation if it was lost */ nfs4_schedule_lease_recovery(server->nfs_client); + err = -EAGAIN; goto out; - case -ERESTARTSYS: - /* - * The show must go on: exit, but mark the - * stateid as needing recovery. - */ case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: @@ -1378,6 +1377,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state err = 0; goto out; } + set_bit(NFS_DELEGATED_STATE, &state->flags); err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); out: @@ -4957,24 +4957,22 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) case 0: case -ESTALE: goto out; - case -NFS4ERR_EXPIRED: - nfs4_schedule_stateid_recovery(server, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: + set_bit(NFS_DELEGATED_STATE, &state->flags); + case -NFS4ERR_EXPIRED: nfs4_schedule_lease_recovery(server->nfs_client); + err = -EAGAIN; goto out; case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: case -NFS4ERR_DEADSESSION: + set_bit(NFS_DELEGATED_STATE, &state->flags); nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); + err = -EAGAIN; goto out; - case -ERESTARTSYS: - /* - * The show must go on: exit, but mark the - * stateid as needing recovery. - */ case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_BAD_STATEID: @@ -4987,9 +4985,8 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) /* kill_proc(fl->fl_pid, SIGLOST, 1); */ err = 0; goto out; - case -NFS4ERR_DELAY: - break; } + set_bit(NFS_DELEGATED_STATE, &state->flags); err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); out: -- cgit v1.2.3 From 65b62a29f719e937b5be1df472287f4c61e53ac6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 7 Feb 2013 10:54:07 -0500 Subject: NFSv4: Ensure delegation recall and byte range lock removal don't conflict Add a mutex to the struct nfs4_state_owner to ensure that delegation recall doesn't conflict with byte range lock removal. Note that we nest the new mutex _outside_ the state manager reclaim protection (nfsi->rwsem) in order to avoid deadlocks. 
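The lock ordering is the important invariant here: so_delegreturn_mutex is always taken before nfsi->rwsem, so the unlock path and the delegation-recall path exclude each other without ever acquiring the two in opposite orders. In outline (fragments only; declarations and error handling are in the diff below):

    /* Unlock path (nfs4_proc_unlck): mutex first, then the reclaim rwsem. */
    mutex_lock(&sp->so_delegreturn_mutex);
    down_read(&nfsi->rwsem);
    /* ... release the VFS lock before sending LOCKU ... */
    up_read(&nfsi->rwsem);
    mutex_unlock(&sp->so_delegreturn_mutex);

    /* Recall path (nfs_delegation_claim_opens): the same mutex keeps
     * nfs4_proc_unlck out, and the reclaim seqcount detects state
     * recovery, so nfsi->rwsem is not needed here at all. */
    mutex_lock(&sp->so_delegreturn_mutex);
    seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
    /* ... reclaim opens and locks under the delegation stateid ... */
    if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
            err = -EAGAIN;
    mutex_unlock(&sp->so_delegreturn_mutex);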
Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 7 ++++++- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 9 ++++++++- fs/nfs/nfs4state.c | 1 + 4 files changed, 16 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 2542cdaa1116..6390a4b5fee7 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -71,8 +71,10 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ int status = 0; if (inode->i_flock == NULL) - goto out; + return 0; + if (inode->i_flock == NULL) + goto out; /* Protect inode->i_flock using the file locks lock */ lock_flocks(); for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { @@ -113,12 +115,15 @@ again: get_nfs_open_context(ctx); spin_unlock(&inode->i_lock); sp = state->owner; + /* Block nfs4_proc_unlck */ + mutex_lock(&sp->so_delegreturn_mutex); seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); err = nfs4_open_delegation_recall(ctx, state, stateid); if (!err) err = nfs_delegation_claim_locks(ctx, state); if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) err = -EAGAIN; + mutex_unlock(&sp->so_delegreturn_mutex); put_nfs_open_context(ctx); if (err != 0) return err; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index b12b73472020..944c9a5c1039 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -93,6 +93,7 @@ struct nfs4_state_owner { struct list_head so_states; struct nfs_seqid_counter so_seqid; seqcount_t so_reclaim_seqcount; + struct mutex so_delegreturn_mutex; }; enum { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f7e05ade5572..d51227371c67 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4485,7 +4485,9 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) { - struct nfs_inode *nfsi = NFS_I(state->inode); + struct inode *inode = state->inode; + struct nfs4_state_owner *sp = state->owner; + struct nfs_inode *nfsi = NFS_I(inode); struct nfs_seqid *seqid; struct nfs4_lock_state *lsp; struct rpc_task *task; @@ -4495,12 +4497,17 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = nfs4_set_lock_state(state, request); /* Unlock _before_ we do the RPC call */ request->fl_flags |= FL_EXISTS; + /* Exclude nfs_delegation_claim_locks() */ + mutex_lock(&sp->so_delegreturn_mutex); + /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ down_read(&nfsi->rwsem); if (do_vfs_lock(request->fl_file, request) == -ENOENT) { up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); goto out; } up_read(&nfsi->rwsem); + mutex_unlock(&sp->so_delegreturn_mutex); if (status != 0) goto out; /* Is this a delegated lock? */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index fff97228cdec..6ace365c6334 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -519,6 +519,7 @@ nfs4_alloc_state_owner(struct nfs_server *server, atomic_set(&sp->so_count, 1); INIT_LIST_HEAD(&sp->so_lru); seqcount_init(&sp->so_reclaim_seqcount); + mutex_init(&sp->so_delegreturn_mutex); return sp; } -- cgit v1.2.3 From c21443c2c792cd9b463646d982b0fe48aa6feb0f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 7 Feb 2013 14:26:21 -0500 Subject: NFSv4: Fix a reboot recovery race when opening a file If the server reboots after it has replied to our OPEN, but before we call nfs4_opendata_to_nfs4_state(), then the reboot recovery thread will not see a stateid for this open, and so will fail to recover it. 
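The fix closes the window by bracketing the OPEN with the open owner's reclaim seqcount: if the state manager ran between sending the OPEN and hashing the resulting nfs4_state, the new state is handed to stateid recovery explicitly instead of being silently left behind. In outline (simplified from the patch that follows):

    seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
    ret = _nfs4_proc_open(opendata);        /* send the OPEN */
    if (ret == 0) {
            state = nfs4_opendata_to_nfs4_state(opendata);
            if (!IS_ERR(state) &&
                read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) {
                    /* A reboot recovery ran while the OPEN was in flight;
                     * it could not see this stateid, so recover it now. */
                    nfs4_schedule_stateid_recovery(server, state);
                    nfs4_wait_clnt_recover(server->nfs_client);
            }
    }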
Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 50 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d51227371c67..7cbf7aa6c634 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1848,6 +1848,43 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct sattr->ia_valid |= ATTR_MTIME; } +static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, + fmode_t fmode, + int flags, + struct nfs4_state **res) +{ + struct nfs4_state_owner *sp = opendata->owner; + struct nfs_server *server = sp->so_server; + struct nfs4_state *state; + unsigned int seq; + int ret; + + seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); + + ret = _nfs4_proc_open(opendata); + if (ret != 0) + goto out; + + state = nfs4_opendata_to_nfs4_state(opendata); + ret = PTR_ERR(state); + if (IS_ERR(state)) + goto out; + if (server->caps & NFS_CAP_POSIX_LOCK) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); + + ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); + if (ret != 0) + goto out; + + if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { + nfs4_schedule_stateid_recovery(server, state); + nfs4_wait_clnt_recover(server->nfs_client); + } + *res = state; +out: + return ret; +} + /* * Returns a referenced nfs4_state */ @@ -1892,18 +1929,7 @@ static int _nfs4_do_open(struct inode *dir, if (dentry->d_inode != NULL) opendata->state = nfs4_get_open_state(dentry->d_inode, sp); - status = _nfs4_proc_open(opendata); - if (status != 0) - goto err_opendata_put; - - state = nfs4_opendata_to_nfs4_state(opendata); - status = PTR_ERR(state); - if (IS_ERR(state)) - goto err_opendata_put; - if (server->caps & NFS_CAP_POSIX_LOCK) - set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); - - status = nfs4_opendata_access(cred, opendata, state, fmode, flags); + status = _nfs4_open_and_get_state(opendata, fmode, flags, &state); if (status != 0) goto err_opendata_put; -- cgit v1.2.3 From c8da19b9866ea84e9ad1c369393ea95d54ee7845 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 11 Feb 2013 19:01:21 -0500 Subject: NFSv4.1: Fix an ABBA locking issue with session and state serialisation Ensure that if nfs_wait_on_sequence() causes our rpc task to wait for an NFSv4 state serialisation lock, then we also drop the session slot. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7cbf7aa6c634..cf252dfd91f8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1466,7 +1466,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) struct nfs4_state_owner *sp = data->owner; if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) - return; + goto out_wait; /* * Check if we still need to send an OPEN call, or if we can use * a delegation instead. 
@@ -1501,6 +1501,7 @@ unlock_no_action: rcu_read_unlock(); out_no_action: task->tk_action = NULL; +out_wait: nfs4_sequence_done(task, &data->o_res.seq_res); } @@ -2179,7 +2180,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) - return; + goto out_wait; task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; calldata->arg.fmode = FMODE_READ|FMODE_WRITE; @@ -2201,16 +2202,14 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) if (!call_close) { /* Note: exit _without_ calling nfs4_close_done */ - task->tk_action = NULL; - nfs4_sequence_done(task, &calldata->res.seq_res); - goto out; + goto out_no_action; } if (calldata->arg.fmode == 0) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; if (calldata->roc && pnfs_roc_drain(inode, &calldata->roc_barrier, task)) - goto out; + goto out_wait; } nfs_fattr_init(calldata->res.fattr); @@ -2220,8 +2219,12 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); -out: dprintk("%s: done!\n", __func__); + return; +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &calldata->res.seq_res); } static const struct rpc_call_ops nfs4_close_ops = { @@ -4452,12 +4455,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) struct nfs4_unlockdata *calldata = data; if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) - return; + goto out_wait; if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ - task->tk_action = NULL; - nfs4_sequence_done(task, &calldata->res.seq_res); - return; + goto out_no_action; } calldata->timestamp = jiffies; if (nfs4_setup_sequence(calldata->server, @@ -4465,6 +4466,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) &calldata->res.seq_res, task) != 0) nfs_release_seqid(calldata->arg.seqid); + return; +out_no_action: + task->tk_action = NULL; +out_wait: + nfs4_sequence_done(task, &calldata->res.seq_res); } static const struct rpc_call_ops nfs4_locku_ops = { @@ -4612,7 +4618,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) dprintk("%s: begin!\n", __func__); if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) - return; + goto out_wait; /* Do we need to do an open_to_lock_owner? */ if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { @@ -4632,6 +4638,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) nfs_release_seqid(data->arg.open_seqid); out_release_lock_seqid: nfs_release_seqid(data->arg.lock_seqid); +out_wait: + nfs4_sequence_done(task, &data->res.seq_res); dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); } -- cgit v1.2.3 From fd9a8d7160937f94aad36ac80d7255b4988740ac Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Feb 2013 09:48:42 -0500 Subject: NFSv4.1: Fix bulk recall and destroy of layouts The current code in pnfs_destroy_all_layouts() assumes that removing the layout from the server->layouts list is sufficient to make it invisible to other processes. This ignores the fact that most users access the layout through the nfs_inode->layout... There is further breakage due to lack of reference counting of the layouts, meaning that the whole thing Oopses at the drop of a hat. 
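The safe pattern, which the rework described below adopts, is a two-pass walk: while holding clp->cl_lock and the RCU read lock, pin each layout (igrab() on the inode plus a reference on the layout header) and move it to a private list; only after all locks are dropped are the layouts invalidated and the references released. Because iput() can sleep, any reference that has to be dropped mid-walk forces the walk to restart. A condensed sketch of the first pass (pnfs_layout_add_bulk_destroy_list is the helper the patch introduces):

    /* Caller holds clp->cl_lock and rcu_read_lock(). */
    list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
            inode = igrab(lo->plh_inode);            /* pin the inode */
            if (inode == NULL)
                    continue;
            list_del_init(&lo->plh_layouts);
            /* Takes a reference on the layout header and queues it. */
            if (pnfs_layout_add_bulk_destroy_list(inode, layout_list))
                    continue;
            /* Already queued by someone else: drop our inode reference
             * outside the locks and ask the caller to restart the walk. */
            rcu_read_unlock();
            spin_unlock(&clp->cl_lock);
            iput(inode);
            spin_lock(&clp->cl_lock);
            rcu_read_lock();
            return -EAGAIN;
    }
    return 0;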
The code in initiate_bulk_draining() is almost correct, and can be used as a model for pnfs_destroy_all_layouts(), so move that code to pnfs.c, and refactor the code to allow us to choose between a single filesystem bulk recall, and a recall of all layouts. Also note that initiate_bulk_draining() currently calls iput() while holding locks. Fix that too. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/callback_proc.c | 61 +++----------------- fs/nfs/pnfs.c | 150 ++++++++++++++++++++++++++++++++++++++++++------- fs/nfs/pnfs.h | 7 ++- 3 files changed, 144 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 264d1aa935f2..2960512792c2 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -183,60 +183,15 @@ static u32 initiate_file_draining(struct nfs_client *clp, static u32 initiate_bulk_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { - struct nfs_server *server; - struct pnfs_layout_hdr *lo; - struct inode *ino; - u32 rv = NFS4ERR_NOMATCHING_LAYOUT; - struct pnfs_layout_hdr *tmp; - LIST_HEAD(recall_list); - LIST_HEAD(free_me_list); - struct pnfs_layout_range range = { - .iomode = IOMODE_ANY, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; - - spin_lock(&clp->cl_lock); - rcu_read_lock(); - list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - if ((args->cbl_recall_type == RETURN_FSID) && - memcmp(&server->fsid, &args->cbl_fsid, - sizeof(struct nfs_fsid))) - continue; + int stat; - list_for_each_entry(lo, &server->layouts, plh_layouts) { - ino = igrab(lo->plh_inode); - if (!ino) - continue; - spin_lock(&ino->i_lock); - /* Is this layout in the process of being freed? */ - if (NFS_I(ino)->layout != lo) { - spin_unlock(&ino->i_lock); - iput(ino); - continue; - } - pnfs_get_layout_hdr(lo); - spin_unlock(&ino->i_lock); - list_add(&lo->plh_bulk_recall, &recall_list); - } - } - rcu_read_unlock(); - spin_unlock(&clp->cl_lock); - - list_for_each_entry_safe(lo, tmp, - &recall_list, plh_bulk_recall) { - ino = lo->plh_inode; - spin_lock(&ino->i_lock); - set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range)) - rv = NFS4ERR_DELAY; - list_del_init(&lo->plh_bulk_recall); - spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&free_me_list); - pnfs_put_layout_hdr(lo); - iput(ino); - } - return rv; + if (args->cbl_recall_type == RETURN_FSID) + stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true); + else + stat = pnfs_destroy_layouts_byclid(clp, true); + if (stat != 0) + return NFS4ERR_DELAY; + return NFS4ERR_NOMATCHING_LAYOUT; } static u32 do_callback_layoutrecall(struct nfs_client *clp, diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d00260b08103..6be70f622b62 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -505,37 +505,147 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) } EXPORT_SYMBOL_GPL(pnfs_destroy_layout); -/* - * Called by the state manger to remove all layouts established under an - * expired lease. 
- */ -void -pnfs_destroy_all_layouts(struct nfs_client *clp) +static bool +pnfs_layout_add_bulk_destroy_list(struct inode *inode, + struct list_head *layout_list) { - struct nfs_server *server; struct pnfs_layout_hdr *lo; - LIST_HEAD(tmp_list); + bool ret = false; - nfs4_deviceid_mark_client_invalid(clp); - nfs4_deviceid_purge_client(clp); + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) { + pnfs_get_layout_hdr(lo); + list_add(&lo->plh_bulk_destroy, layout_list); + ret = true; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* Caller must hold rcu_read_lock and clp->cl_lock */ +static int +pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, + struct nfs_server *server, + struct list_head *layout_list) +{ + struct pnfs_layout_hdr *lo, *next; + struct inode *inode; + + list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { + inode = igrab(lo->plh_inode); + if (inode == NULL) + continue; + list_del_init(&lo->plh_layouts); + if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) + continue; + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); + rcu_read_lock(); + return -EAGAIN; + } + return 0; +} + +static int +pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, + bool is_bulk_recall) +{ + struct pnfs_layout_hdr *lo; + struct inode *inode; + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + LIST_HEAD(lseg_list); + int ret = 0; + + while (!list_empty(layout_list)) { + lo = list_entry(layout_list->next, struct pnfs_layout_hdr, + plh_bulk_destroy); + dprintk("%s freeing layout for inode %lu\n", __func__, + lo->plh_inode->i_ino); + inode = lo->plh_inode; + spin_lock(&inode->i_lock); + list_del_init(&lo->plh_bulk_destroy); + lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ + if (is_bulk_recall) + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); + if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) + ret = -EAGAIN; + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&lseg_list); + pnfs_put_layout_hdr(lo); + iput(inode); + } + return ret; +} + +int +pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); spin_lock(&clp->cl_lock); rcu_read_lock(); +restart: list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { - if (!list_empty(&server->layouts)) - list_splice_init(&server->layouts, &tmp_list); + if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) + continue; + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; } rcu_read_unlock(); spin_unlock(&clp->cl_lock); - while (!list_empty(&tmp_list)) { - lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, - plh_layouts); - dprintk("%s freeing layout for inode %lu\n", __func__, - lo->plh_inode->i_ino); - list_del_init(&lo->plh_layouts); - pnfs_destroy_layout(NFS_I(lo->plh_inode)); + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +int +pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall) +{ + struct nfs_server *server; + LIST_HEAD(layout_list); + + spin_lock(&clp->cl_lock); + rcu_read_lock(); +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (pnfs_layout_bulk_destroy_byserver_locked(clp, + server, + &layout_list) != 0) + goto restart; } + 
rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + + if (list_empty(&layout_list)) + return 0; + return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); +} + +/* + * Called by the state manger to remove all layouts established under an + * expired lease. + */ +void +pnfs_destroy_all_layouts(struct nfs_client *clp) +{ + nfs4_deviceid_mark_client_invalid(clp); + nfs4_deviceid_purge_client(clp); + + pnfs_destroy_layouts_byclid(clp, false); } /* @@ -888,7 +998,7 @@ alloc_init_layout_hdr(struct inode *ino, atomic_set(&lo->plh_refcount, 1); INIT_LIST_HEAD(&lo->plh_layouts); INIT_LIST_HEAD(&lo->plh_segs); - INIT_LIST_HEAD(&lo->plh_bulk_recall); + INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); return lo; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index dbf7bba52da0..97cb358bb882 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -132,7 +132,7 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_hdr { atomic_t plh_refcount; struct list_head plh_layouts; /* other client layouts */ - struct list_head plh_bulk_recall; /* clnt list of bulk recalls */ + struct list_head plh_bulk_destroy; struct list_head plh_segs; /* layout segments list */ nfs4_stateid plh_stateid; atomic_t plh_outstanding; /* number of RPCs out */ @@ -196,6 +196,11 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); void pnfs_free_lseg_list(struct list_head *tmp_list); void pnfs_destroy_layout(struct nfs_inode *); void pnfs_destroy_all_layouts(struct nfs_client *); +int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, + struct nfs_fsid *fsid, + bool is_recall); +int pnfs_destroy_layouts_byclid(struct nfs_client *clp, + bool is_recall); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, -- cgit v1.2.3 From 085b7a45c63d3da5be155faab9249a5cab224561 Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Fri, 15 Feb 2013 16:03:46 -0500 Subject: NFSv4.1: Don't decode skipped layoutgets layoutget's prepare hook can call rpc_exit with status = NFS4_OK (0). Because of this, nfs4_proc_layoutget can't depend on a 0 status to mean that the RPC was successfully sent, received and parsed. To fix this, use the result's len member to see if parsing took place. This fixes the following OOPS -- calling xdr_init_decode() with a buffer length 0 doesn't set the stream's 'p' member and ends up using uninitialized memory in filelayout_decode_layout. BUG: unable to handle kernel paging request at 0000000000008050 IP: [] memcpy+0x18/0x120 PGD 0 Oops: 0000 [#1] SMP last sysfs file: /sys/devices/pci0000:00/0000:00:11.0/0000:02:01.0/irq CPU 1 Modules linked in: nfs_layout_nfsv41_files nfs lockd fscache auth_rpcgss nfs_acl autofs4 sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables ipv6 dm_mirror dm_region_hash dm_log dm_mod ppdev parport_pc parport snd_ens1371 snd_rawmidi snd_ac97_codec ac97_bus snd_seq snd_seq_device snd_pcm snd_timer snd soundcore snd_page_alloc e1000 microcode vmware_balloon i2c_piix4 i2c_core sg shpchp ext4 mbcache jbd2 sr_mod cdrom sd_mod crc_t10dif pata_acpi ata_generic ata_piix mptspi mptscsih mptbase scsi_transport_spi [last unloaded: speedstep_lib] Pid: 1665, comm: flush-0:22 Not tainted 2.6.32-356-test-2 #2 VMware, Inc. 
VMware Virtual Platform/440BX Desktop Reference Platform RIP: 0010:[] [] memcpy+0x18/0x120 RSP: 0018:ffff88003dfab588 EFLAGS: 00010206 RAX: ffff88003dc42000 RBX: ffff88003dfab610 RCX: 0000000000000009 RDX: 000000003f807ff0 RSI: 0000000000008050 RDI: ffff88003dc42000 RBP: ffff88003dfab5b0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000080 R12: 0000000000000024 R13: ffff88003dc42000 R14: ffff88003f808030 R15: ffff88003dfab6a0 FS: 0000000000000000(0000) GS:ffff880003420000(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 0000000000008050 CR3: 000000003bc92000 CR4: 00000000001407e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process flush-0:22 (pid: 1665, threadinfo ffff88003dfaa000, task ffff880037f77540) Stack: ffffffffa0398ac1 ffff8800397c5940 ffff88003dfab610 ffff88003dfab6a0 ffff88003dfab5d0 ffff88003dfab680 ffffffffa01c150b ffffea0000d82e70 000000508116713b 0000000000000000 0000000000000000 0000000000000000 Call Trace: [] ? xdr_inline_decode+0xb1/0x120 [sunrpc] [] filelayout_decode_layout+0xeb/0x350 [nfs_layout_nfsv41_files] [] filelayout_alloc_lseg+0x8c/0x3c0 [nfs_layout_nfsv41_files] [] ? __wait_on_bit+0x7e/0x90 Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index cf252dfd91f8..eae83bf96c6d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6184,7 +6184,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) status = nfs4_wait_for_completion_rpc_task(task); if (status == 0) status = task->tk_status; - if (status == 0) + /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ + if (status == 0 && lgp->res.layoutp->len) lseg = pnfs_layout_process(lgp); rpc_put_task(task); dprintk("<-- %s status=%d\n", __func__, status); -- cgit v1.2.3 From 96aa1549afa6c79ae4a4f099de861efd218c38d8 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Tue, 12 Feb 2013 13:03:42 -0700 Subject: nfs: remove kfree() redundant null checks smatch analysis: fs/nfs/getroot.c:130 nfs_get_root() info: redundant null check on name calling kfree() fs/nfs/unlink.c:272 nfs_async_unlink() info: redundant null check on devname_garbage calling kfree() Cc: Trond Myklebust Cc: linux-nfs@vger.kernel.org Signed-off-by: Tim Gardner Signed-off-by: Trond Myklebust --- fs/nfs/getroot.c | 3 +-- fs/nfs/unlink.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 033803c36644..44efaa8c5f78 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -126,8 +126,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, } spin_unlock(&ret->d_lock); out: - if (name) - kfree(name); + kfree(name); nfs_free_fattr(fsinfo.fattr); return ret; } diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 13cea637eff8..d26a32f5b53b 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -268,8 +268,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) * point dentry is definitely not a root, so we won't need * that anymore. 
*/ - if (devname_garbage) - kfree(devname_garbage); + kfree(devname_garbage); return 0; out_unlock: spin_unlock(&dentry->d_lock); -- cgit v1.2.3 From 5a12cca697aca5dfba42a7d4c3356acc0445a2b0 Mon Sep 17 00:00:00 2001 From: fanchaoting Date: Mon, 4 Feb 2013 21:15:02 +0800 Subject: umount oops when remove blocklayoutdriver first now pnfs client uses block layout, maybe we can remove blocklayoutdriver first. if we umount later, it can cause oops in unset_pnfs_layoutdriver. because nfss->pnfs_curr_ld->clear_layoutdriver is invalid. reproduce it: modprobe blocklayoutdriver mount -t nfs4 -o minorversion=1 pnfsip:/ /mnt/ rmmod blocklayoutdriver umount /mnt then you can see following CPU 0 Pid: 17023, comm: umount.nfs4 Tainted: GF O 3.7.0-rc6-pnfs #1 VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform RIP: 0010:[] [] unset_pnfs_layoutdriver+0x1d/0x70 [nfsv4] RSP: 0018:ffff8800022d9e48 EFLAGS: 00010286 RAX: ffffffffa04a1b00 RBX: ffff88000b013800 RCX: 0000000000000001 RDX: ffffffff81ae8ee0 RSI: ffff880001ee94b8 RDI: ffff88000b013800 RBP: ffff8800022d9e58 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff880001ee9400 R13: ffff8800105978c0 R14: 00007fff25846c08 R15: 0000000001bba550 FS: 00007f45ae7f0700(0000) GS:ffff880012c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: ffffffffa04a1b38 CR3: 0000000002c0c000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process umount.nfs4 (pid: 17023, threadinfo ffff8800022d8000, task ffff880006e48aa0) Stack: ffff8800105978c0 ffff88000b013800 ffff8800022d9e78 ffffffffa04cd0ce ffff8800022d9e78 ffff88000b013800 ffff8800022d9ea8 ffffffffa04755a7 ffff8800022d9ea8 ffff880002f96400 ffff88000b013800 ffff880002f96400 Call Trace: [] nfs4_destroy_server+0x1e/0x30 [nfsv4] [] nfs_free_server+0xb7/0x150 [nfs] [] nfs_kill_super+0x35/0x40 [nfs] [] deactivate_locked_super+0x45/0x70 [] deactivate_super+0x4a/0x70 [] mntput_no_expire+0xd2/0x130 [] sys_umount+0x72/0xe0 [] system_call_fastpath+0x16/0x1b Code: 06 e1 b8 ea ff ff ff eb 9e 0f 1f 44 00 00 55 48 89 e5 53 48 83 ec 08 66 66 66 66 90 48 8b 87 80 03 00 00 48 89 fb 48 85 c0 74 29 <48> 8b 40 38 48 85 c0 74 02 ff d0 48 8b 03 3e ff 48 04 0f 94 c2 RIP [] unset_pnfs_layoutdriver+0x1d/0x70 [nfsv4] RSP CR2: ffffffffa04a1b38 ---[ end trace 29f75aaedda058bf ]--- Signed-off-by: fanchaoting Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/blocklayout/blocklayout.c | 1 + fs/nfs/objlayout/objio_osd.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 4fa788c93f46..434b93ec0970 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -1273,6 +1273,7 @@ static const struct nfs_pageio_ops bl_pg_write_ops = { static struct pnfs_layoutdriver_type blocklayout_type = { .id = LAYOUT_BLOCK_VOLUME, .name = "LAYOUT_BLOCK_VOLUME", + .owner = THIS_MODULE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, .alloc_layout_hdr = bl_alloc_layout_hdr, diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index c6f990656f89..88f9611a945c 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -647,6 +647,7 @@ static struct pnfs_layoutdriver_type objlayout_type = { .flags = PNFS_LAYOUTRET_ON_SETATTR | PNFS_LAYOUTRET_ON_ERROR, + .owner = THIS_MODULE, 
.alloc_layout_hdr = objlayout_alloc_layout_hdr, .free_layout_hdr = objlayout_free_layout_hdr, -- cgit v1.2.3 From 666b3d803a511fbc9bc5e5ea8ce66010cf03ea13 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 19 Feb 2013 12:04:42 -0500 Subject: NLM: Ensure that we resend all pending blocking locks after a reclaim Currently, nlmclnt_lock will break out of the for(;;) loop when the reclaimer wakes up the blocking lock thread by setting nlm_lck_denied_grace_period. This causes the lock request to fail with an ENOLCK error. The intention was always to ensure that we resend the lock request after the grace period has expired. Reported-by: Wangyuan Zhang Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/lockd/clntproc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 54f9e6ce0430..52e5120bb159 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -550,6 +550,9 @@ again: status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); if (status < 0) break; + /* Resend the blocking lock request after a server reboot */ + if (resp->status == nlm_lck_denied_grace_period) + continue; if (resp->status != nlm_lck_blocked) break; } -- cgit v1.2.3
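The effect of that last hunk is easiest to see against the shape of the blocking-lock loop in nlmclnt_lock(), condensed here for illustration (the surrounding reboot-protection and unlock handling are unchanged):

    for (;;) {
            status = nlmclnt_call(cred, req, NLMPROC_LOCK);
            if (status < 0)
                    break;
            if (resp->status != nlm_lck_blocked)
                    break;
            /* The server said "blocked": wait for the GRANTED callback
             * or for the polling timeout. */
            status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT);
            if (status < 0)
                    break;
            /* The reclaimer sets nlm_lck_denied_grace_period to signal a
             * server reboot.  With this patch the lock request is resent
             * instead of the call failing with ENOLCK. */
            if (resp->status == nlm_lck_denied_grace_period)
                    continue;
            if (resp->status != nlm_lck_blocked)
                    break;
    }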