From 64a38e840ce5940253208eaba40265c73decc4ee Mon Sep 17 00:00:00 2001
From: Dave Wysochanski <dwysocha@redhat.com>
Date: Fri, 26 Jul 2019 18:33:01 -0400
Subject: SUNRPC: Track writers of the 'channel' file to improve
 cache_listeners_exist

The sunrpc cache interface is susceptible to being fooled by a rogue
process just reading a 'channel' file.  If this happens the kernel
may think a valid daemon exists to service the cache when it does not.
For example, the following may fool the kernel:
cat /proc/net/rpc/auth.unix.gid/channel

Change the tracking of readers to writers when considering whether a
listener exists as all valid daemon processes either open a channel
file O_RDWR or O_WRONLY.  While this does not prevent a rogue process
from "stealing" a message from the kernel, it does at least improve
the kernels perception of whether a valid process servicing the cache
exists.

Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/cache.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index c7f38e897174..f7d086b77a21 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -107,9 +107,9 @@ struct cache_detail {
 	/* fields for communication over channel */
 	struct list_head	queue;
 
-	atomic_t		readers;		/* how many time is /chennel open */
-	time_t			last_close;		/* if no readers, when did last close */
-	time_t			last_warn;		/* when we last warned about no readers */
+	atomic_t		writers;		/* how many time is /channel open */
+	time_t			last_close;		/* if no writers, when did last close */
+	time_t			last_warn;		/* when we last warned about no writers */
 
 	union {
 		struct proc_dir_entry	*procfs;
-- 
cgit v1.2.3


From d6dfe43ec6062beea5ba1172b957e74a13c95b86 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 16 Aug 2019 17:48:36 -0400
Subject: svcrdma: Remove svc_rdma_wq

Clean up: the system workqueue will work just as well.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          | 1 -
 net/sunrpc/xprtrdma/svc_rdma.c           | 7 -------
 net/sunrpc/xprtrdma/svc_rdma_transport.c | 3 ++-
 3 files changed, 2 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index 981f0d726ad4..edb39900fe04 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -200,7 +200,6 @@ extern struct svc_xprt_class svc_rdma_bc_class;
 #endif
 
 /* svc_rdma.c */
-extern struct workqueue_struct *svc_rdma_wq;
 extern int svc_rdma_init(void);
 extern void svc_rdma_cleanup(void);
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index abdb3004a1e3..97bca509a391 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -73,8 +73,6 @@ atomic_t rdma_stat_rq_prod;
 atomic_t rdma_stat_sq_poll;
 atomic_t rdma_stat_sq_prod;
 
-struct workqueue_struct *svc_rdma_wq;
-
 /*
  * This function implements reading and resetting an atomic_t stat
  * variable through read/write to a proc file. Any write to the file
@@ -230,7 +228,6 @@ static struct ctl_table svcrdma_root_table[] = {
 void svc_rdma_cleanup(void)
 {
 	dprintk("SVCRDMA Module Removed, deregister RPC RDMA transport\n");
-	destroy_workqueue(svc_rdma_wq);
 	if (svcrdma_table_header) {
 		unregister_sysctl_table(svcrdma_table_header);
 		svcrdma_table_header = NULL;
@@ -246,10 +243,6 @@ int svc_rdma_init(void)
 	dprintk("\tmax_bc_requests  : %u\n", svcrdma_max_bc_requests);
 	dprintk("\tmax_inline       : %d\n", svcrdma_max_req_size);
 
-	svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
-	if (!svc_rdma_wq)
-		return -ENOMEM;
-
 	if (!svcrdma_table_header)
 		svcrdma_table_header =
 			register_sysctl_table(svcrdma_root_table);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 3fe665152d95..18d6eb3686e7 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -630,8 +630,9 @@ static void svc_rdma_free(struct svc_xprt *xprt)
 {
 	struct svcxprt_rdma *rdma =
 		container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
 	INIT_WORK(&rdma->sc_work, __svc_rdma_free);
-	queue_work(svc_rdma_wq, &rdma->sc_work);
+	schedule_work(&rdma->sc_work);
 }
 
 static int svc_rdma_has_wspace(struct svc_xprt *xprt)
-- 
cgit v1.2.3


From 4866073e6ddf03066c925d3237903d7f4ca68982 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 16 Aug 2019 17:49:38 -0400
Subject: svcrdma: Use llist for managing cache of recv_ctxts

Use a wait-free mechanism for managing the svc_rdma_recv_ctxts free
list. Subsequently, sc_recv_lock can be eliminated.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/svc_rdma.h          |  5 +++--
 net/sunrpc/xprtrdma/svc_rdma_recvfrom.c  | 24 ++++++++++--------------
 net/sunrpc/xprtrdma/svc_rdma_transport.c |  3 +--
 3 files changed, 14 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index edb39900fe04..40f65888dd38 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -42,6 +42,7 @@
 
 #ifndef SVC_RDMA_H
 #define SVC_RDMA_H
+#include <linux/llist.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/sunrpc/rpc_rdma.h>
@@ -107,8 +108,7 @@ struct svcxprt_rdma {
 	struct list_head     sc_read_complete_q;
 	struct work_struct   sc_work;
 
-	spinlock_t	     sc_recv_lock;
-	struct list_head     sc_recv_ctxts;
+	struct llist_head    sc_recv_ctxts;
 };
 /* sc_flags */
 #define RDMAXPRT_CONN_PENDING	3
@@ -125,6 +125,7 @@ enum {
 #define RPCSVC_MAXPAYLOAD_RDMA	RPCSVC_MAXPAYLOAD
 
 struct svc_rdma_recv_ctxt {
+	struct llist_node	rc_node;
 	struct list_head	rc_list;
 	struct ib_recv_wr	rc_recv_wr;
 	struct ib_cqe		rc_cqe;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 65e2fb9aac65..96bccd398469 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -172,9 +172,10 @@ static void svc_rdma_recv_ctxt_destroy(struct svcxprt_rdma *rdma,
 void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_recv_ctxt *ctxt;
+	struct llist_node *node;
 
-	while ((ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts))) {
-		list_del(&ctxt->rc_list);
+	while ((node = llist_del_first(&rdma->sc_recv_ctxts))) {
+		ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
 		svc_rdma_recv_ctxt_destroy(rdma, ctxt);
 	}
 }
@@ -183,21 +184,18 @@ static struct svc_rdma_recv_ctxt *
 svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
 {
 	struct svc_rdma_recv_ctxt *ctxt;
+	struct llist_node *node;
 
-	spin_lock(&rdma->sc_recv_lock);
-	ctxt = svc_rdma_next_recv_ctxt(&rdma->sc_recv_ctxts);
-	if (!ctxt)
+	node = llist_del_first(&rdma->sc_recv_ctxts);
+	if (!node)
 		goto out_empty;
-	list_del(&ctxt->rc_list);
-	spin_unlock(&rdma->sc_recv_lock);
+	ctxt = llist_entry(node, struct svc_rdma_recv_ctxt, rc_node);
 
 out:
 	ctxt->rc_page_count = 0;
 	return ctxt;
 
 out_empty:
-	spin_unlock(&rdma->sc_recv_lock);
-
 	ctxt = svc_rdma_recv_ctxt_alloc(rdma);
 	if (!ctxt)
 		return NULL;
@@ -218,11 +216,9 @@ void svc_rdma_recv_ctxt_put(struct svcxprt_rdma *rdma,
 	for (i = 0; i < ctxt->rc_page_count; i++)
 		put_page(ctxt->rc_pages[i]);
 
-	if (!ctxt->rc_temp) {
-		spin_lock(&rdma->sc_recv_lock);
-		list_add(&ctxt->rc_list, &rdma->sc_recv_ctxts);
-		spin_unlock(&rdma->sc_recv_lock);
-	} else
+	if (!ctxt->rc_temp)
+		llist_add(&ctxt->rc_node, &rdma->sc_recv_ctxts);
+	else
 		svc_rdma_recv_ctxt_destroy(rdma, ctxt);
 }
 
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 18d6eb3686e7..4182d569b5cf 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -140,14 +140,13 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv,
 	INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
 	INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts);
-	INIT_LIST_HEAD(&cma_xprt->sc_recv_ctxts);
+	init_llist_head(&cma_xprt->sc_recv_ctxts);
 	INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts);
 	init_waitqueue_head(&cma_xprt->sc_send_wait);
 
 	spin_lock_init(&cma_xprt->sc_lock);
 	spin_lock_init(&cma_xprt->sc_rq_dto_lock);
 	spin_lock_init(&cma_xprt->sc_send_lock);
-	spin_lock_init(&cma_xprt->sc_recv_lock);
 	spin_lock_init(&cma_xprt->sc_rw_ctxt_lock);
 
 	/*
-- 
cgit v1.2.3


From f69d6d8eef7807f8d937b81da24bebd2e926e4d2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jeff.layton@primarydata.com>
Date: Sun, 18 Aug 2019 14:18:44 -0400
Subject: sunrpc: add a new cache_detail operation for when a cache is flushed

When the exports table is changed, exportfs will usually write a new
time to the "flush" file in the nfsd.export cache procfile. This tells
the kernel to flush any entries that are older than that value.

This gives us a mechanism to tell whether an unexport might have
occurred. Add a new ->flush cache_detail operation that is called after
flushing the cache whenever someone writes to a "flush" file.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 include/linux/sunrpc/cache.h | 1 +
 net/sunrpc/cache.c           | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/sunrpc/cache.h b/include/linux/sunrpc/cache.h
index f7d086b77a21..f8603724fbee 100644
--- a/include/linux/sunrpc/cache.h
+++ b/include/linux/sunrpc/cache.h
@@ -87,6 +87,7 @@ struct cache_detail {
 					      int has_died);
 
 	struct cache_head *	(*alloc)(void);
+	void			(*flush)(void);
 	int			(*match)(struct cache_head *orig, struct cache_head *new);
 	void			(*init)(struct cache_head *orig, struct cache_head *new);
 	void			(*update)(struct cache_head *orig, struct cache_head *new);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index a6a6190ad37a..a349094f6fb7 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1524,6 +1524,9 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
 	cd->nextcheck = now;
 	cache_flush();
 
+	if (cd->flush)
+		cd->flush();
+
 	*ppos += count;
 	return count;
 }
-- 
cgit v1.2.3


From 18f6622ebbdea56a83f8e553c159ce2d62d3ad0c Mon Sep 17 00:00:00 2001
From: Jeff Layton <jeff.layton@primarydata.com>
Date: Sun, 18 Aug 2019 14:18:45 -0400
Subject: locks: create a new notifier chain for lease attempts

With the new file caching infrastructure in nfsd, we can end up holding
files open for an indefinite period of time, even when they are still
idle. This may prevent the kernel from handing out leases on the file,
which is something we don't want to block.

Fix this by running a SRCU notifier call chain whenever on any
lease attempt. nfsd can then purge the cache for that inode before
returning.

Since SRCU is only conditionally compiled in, we must only define the
new chain if it's enabled, and users of the chain must ensure that
SRCU is enabled.

Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/locks.c         | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  5 +++++
 2 files changed, 66 insertions(+)

(limited to 'include')

diff --git a/fs/locks.c b/fs/locks.c
index 686eae21daf6..1913481bfbf7 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1990,6 +1990,64 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
 }
 EXPORT_SYMBOL(generic_setlease);
 
+#if IS_ENABLED(CONFIG_SRCU)
+/*
+ * Kernel subsystems can register to be notified on any attempt to set
+ * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd
+ * to close files that it may have cached when there is an attempt to set a
+ * conflicting lease.
+ */
+static struct srcu_notifier_head lease_notifier_chain;
+
+static inline void
+lease_notifier_chain_init(void)
+{
+	srcu_init_notifier_head(&lease_notifier_chain);
+}
+
+static inline void
+setlease_notifier(long arg, struct file_lock *lease)
+{
+	if (arg != F_UNLCK)
+		srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
+}
+
+int lease_register_notifier(struct notifier_block *nb)
+{
+	return srcu_notifier_chain_register(&lease_notifier_chain, nb);
+}
+EXPORT_SYMBOL_GPL(lease_register_notifier);
+
+void lease_unregister_notifier(struct notifier_block *nb)
+{
+	srcu_notifier_chain_unregister(&lease_notifier_chain, nb);
+}
+EXPORT_SYMBOL_GPL(lease_unregister_notifier);
+
+#else /* !IS_ENABLED(CONFIG_SRCU) */
+static inline void
+lease_notifier_chain_init(void)
+{
+}
+
+static inline void
+setlease_notifier(long arg, struct file_lock *lease)
+{
+}
+
+int lease_register_notifier(struct notifier_block *nb)
+{
+	return 0;
+}
+EXPORT_SYMBOL_GPL(lease_register_notifier);
+
+void lease_unregister_notifier(struct notifier_block *nb)
+{
+}
+EXPORT_SYMBOL_GPL(lease_unregister_notifier);
+
+#endif /* IS_ENABLED(CONFIG_SRCU) */
+
 /**
  * vfs_setlease        -       sets a lease on an open file
  * @filp:	file pointer
@@ -2010,6 +2068,8 @@ EXPORT_SYMBOL(generic_setlease);
 int
 vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
 {
+	if (lease)
+		setlease_notifier(arg, *lease);
 	if (filp->f_op->setlease)
 		return filp->f_op->setlease(filp, arg, lease, priv);
 	else
@@ -2923,6 +2983,7 @@ static int __init filelock_init(void)
 		INIT_HLIST_HEAD(&fll->hlist);
 	}
 
+	lease_notifier_chain_init();
 	return 0;
 }
 core_initcall(filelock_init);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 56b8e358af5c..0f106c7f4bb9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1155,6 +1155,11 @@ extern void lease_get_mtime(struct inode *, struct timespec64 *time);
 extern int generic_setlease(struct file *, long, struct file_lock **, void **priv);
 extern int vfs_setlease(struct file *, long, struct file_lock **, void **);
 extern int lease_modify(struct file_lock *, int, struct list_head *);
+
+struct notifier_block;
+extern int lease_register_notifier(struct notifier_block *);
+extern void lease_unregister_notifier(struct notifier_block *);
+
 struct files_struct;
 extern void show_fd_locks(struct seq_file *f,
 			 struct file *filp, struct files_struct *files);
-- 
cgit v1.2.3


From b72679ee89a0a0ecd26f7b6fcae96cdaababff94 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Sun, 18 Aug 2019 14:18:46 -0400
Subject: notify: export symbols for use by the knfsd file cache

The knfsd file cache will need to detect when files are unlinked, so that
it can close the associated cached files. Export a minimal set of notifier
functions to allow it to do so.

Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/notify/fsnotify.h             | 2 --
 fs/notify/group.c                | 2 ++
 fs/notify/mark.c                 | 6 ++++++
 include/linux/fsnotify_backend.h | 2 ++
 4 files changed, 10 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 5a00121fb219..f3462828a0e2 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -54,8 +54,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
 {
 	fsnotify_destroy_marks(&sb->s_fsnotify_marks);
 }
-/* Wait until all marks queued for destruction are destroyed */
-extern void fsnotify_wait_marks_destroyed(void);
 
 /*
  * update the dentry->d_flags of all of inode's children to indicate if inode cares
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 0391190305cc..133f723aca07 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -108,6 +108,7 @@ void fsnotify_put_group(struct fsnotify_group *group)
 	if (refcount_dec_and_test(&group->refcnt))
 		fsnotify_final_destroy_group(group);
 }
+EXPORT_SYMBOL_GPL(fsnotify_put_group);
 
 /*
  * Create a new fsnotify_group and hold a reference for the group returned.
@@ -137,6 +138,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 
 	return group;
 }
+EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
 
 int fsnotify_fasync(int fd, struct file *file, int on)
 {
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 99ddd126f6f0..1d96216dffd1 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -276,6 +276,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
 	queue_delayed_work(system_unbound_wq, &reaper_work,
 			   FSNOTIFY_REAPER_DELAY);
 }
+EXPORT_SYMBOL_GPL(fsnotify_put_mark);
 
 /*
  * Get mark reference when we found the mark via lockless traversal of object
@@ -430,6 +431,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
 	mutex_unlock(&group->mark_mutex);
 	fsnotify_free_mark(mark);
 }
+EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);
 
 /*
  * Sorting function for lists of fsnotify marks.
@@ -685,6 +687,7 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
 	mutex_unlock(&group->mark_mutex);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(fsnotify_add_mark);
 
 /*
  * Given a list of marks, find the mark associated with given group. If found
@@ -711,6 +714,7 @@ struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp,
 	spin_unlock(&conn->lock);
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(fsnotify_find_mark);
 
 /* Clear any marks in a group with given type mask */
 void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
@@ -809,6 +813,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
 	mark->group = group;
 	WRITE_ONCE(mark->connector, NULL);
 }
+EXPORT_SYMBOL_GPL(fsnotify_init_mark);
 
 /*
  * Destroy all marks in destroy_list, waits for SRCU period to finish before
@@ -837,3 +842,4 @@ void fsnotify_wait_marks_destroyed(void)
 {
 	flush_delayed_work(&reaper_work);
 }
+EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 2de3b2ddd19a..1915bdba2fad 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -475,6 +475,8 @@ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
 extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
 /* free mark */
 extern void fsnotify_free_mark(struct fsnotify_mark *mark);
+/* Wait until all marks queued for destruction are destroyed */
+extern void fsnotify_wait_marks_destroyed(void);
 /* run all the marks in a group, and clear all of the marks attached to given object type */
 extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type);
 /* run all the marks in a group, and clear all of the vfsmount marks */
-- 
cgit v1.2.3


From 11a60d159259dbadf9188534b508e5003769af61 Mon Sep 17 00:00:00 2001
From: Scott Mayhew <smayhew@redhat.com>
Date: Mon, 9 Sep 2019 16:10:30 -0400
Subject: nfsd: add a "GetVersion" upcall for nfsdcld

Add a "GetVersion" upcall to allow nfsd to determine the maximum upcall
version that the nfsdcld userspace daemon supports.  If the daemon
responds with -EOPNOTSUPP, then we know it only supports v1.

Signed-off-by: Scott Mayhew <smayhew@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4recover.c         | 167 +++++++++++++++++++++++++++++-------------
 include/uapi/linux/nfsd/cld.h |  11 ++-
 2 files changed, 127 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 87679557d0d6..58a61339d40c 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -59,8 +59,12 @@ struct nfsd4_client_tracking_ops {
 	void (*remove)(struct nfs4_client *);
 	int (*check)(struct nfs4_client *);
 	void (*grace_done)(struct nfsd_net *);
+	uint8_t version;
+	size_t msglen;
 };
 
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops;
+
 /* Globals */
 static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
 
@@ -718,6 +722,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
 	.remove		= nfsd4_remove_clid_dir,
 	.check		= nfsd4_check_legacy_client,
 	.grace_done	= nfsd4_recdir_purge_old,
+	.version	= 1,
+	.msglen		= 0,
 };
 
 /* Globals */
@@ -737,19 +743,24 @@ struct cld_upcall {
 	struct list_head	 cu_list;
 	struct cld_net		*cu_net;
 	struct completion	 cu_done;
-	struct cld_msg		 cu_msg;
+	union {
+		struct cld_msg_hdr	 cu_hdr;
+		struct cld_msg		 cu_msg;
+	} cu_u;
 };
 
 static int
-__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
+__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg)
 {
 	int ret;
 	struct rpc_pipe_msg msg;
-	struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_msg);
+	struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u);
+	struct nfsd_net *nn = net_generic(pipe->dentry->d_sb->s_fs_info,
+					  nfsd_net_id);
 
 	memset(&msg, 0, sizeof(msg));
 	msg.data = cmsg;
-	msg.len = sizeof(*cmsg);
+	msg.len = nn->client_tracking_ops->msglen;
 
 	ret = rpc_queue_upcall(pipe, &msg);
 	if (ret < 0) {
@@ -765,7 +776,7 @@ out:
 }
 
 static int
-cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
+cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg)
 {
 	int ret;
 
@@ -809,7 +820,7 @@ __cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg,
 			kfree(name.data);
 			return -EFAULT;
 		}
-		return sizeof(*cmsg);
+		return nn->client_tracking_ops->msglen;
 	}
 	return -EFAULT;
 }
@@ -818,6 +829,7 @@ static ssize_t
 cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
 	struct cld_upcall *tmp, *cup;
+	struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src;
 	struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
 	uint32_t xid;
 	struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
@@ -825,14 +837,14 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	struct cld_net *cn = nn->cld_net;
 	int16_t status;
 
-	if (mlen != sizeof(*cmsg)) {
+	if (mlen != nn->client_tracking_ops->msglen) {
 		dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen,
-			sizeof(*cmsg));
+			nn->client_tracking_ops->msglen);
 		return -EINVAL;
 	}
 
 	/* copy just the xid so we can try to find that */
-	if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) {
+	if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) {
 		dprintk("%s: error when copying xid from userspace", __func__);
 		return -EFAULT;
 	}
@@ -842,7 +854,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	 * list (for -EINPROGRESS, we just want to make sure the xid is
 	 * valid, not remove the upcall from the list)
 	 */
-	if (get_user(status, &cmsg->cm_status)) {
+	if (get_user(status, &hdr->cm_status)) {
 		dprintk("%s: error when copying status from userspace", __func__);
 		return -EFAULT;
 	}
@@ -851,7 +863,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	cup = NULL;
 	spin_lock(&cn->cn_lock);
 	list_for_each_entry(tmp, &cn->cn_list, cu_list) {
-		if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) {
+		if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) {
 			cup = tmp;
 			if (status != -EINPROGRESS)
 				list_del_init(&cup->cu_list);
@@ -869,7 +881,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	if (status == -EINPROGRESS)
 		return __cld_pipe_inprogress_downcall(cmsg, nn);
 
-	if (copy_from_user(&cup->cu_msg, src, mlen) != 0)
+	if (copy_from_user(&cup->cu_u.cu_msg, src, mlen) != 0)
 		return -EFAULT;
 
 	complete(&cup->cu_done);
@@ -881,7 +893,7 @@ cld_pipe_destroy_msg(struct rpc_pipe_msg *msg)
 {
 	struct cld_msg *cmsg = msg->data;
 	struct cld_upcall *cup = container_of(cmsg, struct cld_upcall,
-						 cu_msg);
+						 cu_u.cu_msg);
 
 	/* errno >= 0 means we got a downcall */
 	if (msg->errno >= 0)
@@ -1012,9 +1024,10 @@ nfsd4_remove_cld_pipe(struct net *net)
 }
 
 static struct cld_upcall *
-alloc_cld_upcall(struct cld_net *cn)
+alloc_cld_upcall(struct nfsd_net *nn)
 {
 	struct cld_upcall *new, *tmp;
+	struct cld_net *cn = nn->cld_net;
 
 	new = kzalloc(sizeof(*new), GFP_KERNEL);
 	if (!new)
@@ -1024,20 +1037,20 @@ alloc_cld_upcall(struct cld_net *cn)
 restart_search:
 	spin_lock(&cn->cn_lock);
 	list_for_each_entry(tmp, &cn->cn_list, cu_list) {
-		if (tmp->cu_msg.cm_xid == cn->cn_xid) {
+		if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) {
 			cn->cn_xid++;
 			spin_unlock(&cn->cn_lock);
 			goto restart_search;
 		}
 	}
 	init_completion(&new->cu_done);
-	new->cu_msg.cm_vers = CLD_UPCALL_VERSION;
-	put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid);
+	new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version;
+	put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid);
 	new->cu_net = cn;
 	list_add(&new->cu_list, &cn->cn_list);
 	spin_unlock(&cn->cn_lock);
 
-	dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid);
+	dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid);
 
 	return new;
 }
@@ -1066,20 +1079,20 @@ nfsd4_cld_create(struct nfs4_client *clp)
 	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
 		return;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_Create;
-	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
-	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+	cup->cu_u.cu_msg.cm_cmd = Cld_Create;
+	cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
 			clp->cl_name.len);
 
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret) {
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
 	}
 
@@ -1103,20 +1116,20 @@ nfsd4_cld_remove(struct nfs4_client *clp)
 	if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
 		return;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_Remove;
-	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
-	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+	cup->cu_u.cu_msg.cm_cmd = Cld_Remove;
+	cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
 			clp->cl_name.len);
 
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret) {
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 		clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
 	}
 
@@ -1145,21 +1158,21 @@ nfsd4_cld_check_v0(struct nfs4_client *clp)
 	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
 		return 0;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		printk(KERN_ERR "NFSD: Unable to check client record on "
 				"stable storage: %d\n", -ENOMEM);
 		return -ENOMEM;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_Check;
-	cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
-	memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
+	cup->cu_u.cu_msg.cm_cmd = Cld_Check;
+	cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len;
+	memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data,
 			clp->cl_name.len);
 
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret) {
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
 	}
 
@@ -1223,16 +1236,16 @@ nfsd4_cld_grace_start(struct nfsd_net *nn)
 	struct cld_upcall *cup;
 	struct cld_net *cn = nn->cld_net;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_GraceStart;
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret)
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 
 	free_cld_upcall(cup);
 out_err:
@@ -1250,17 +1263,17 @@ nfsd4_cld_grace_done_v0(struct nfsd_net *nn)
 	struct cld_upcall *cup;
 	struct cld_net *cn = nn->cld_net;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_GraceDone;
-	cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone;
+	cup->cu_u.cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret)
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 
 	free_cld_upcall(cup);
 out_err:
@@ -1279,16 +1292,16 @@ nfsd4_cld_grace_done(struct nfsd_net *nn)
 	struct cld_upcall *cup;
 	struct cld_net *cn = nn->cld_net;
 
-	cup = alloc_cld_upcall(cn);
+	cup = alloc_cld_upcall(nn);
 	if (!cup) {
 		ret = -ENOMEM;
 		goto out_err;
 	}
 
-	cup->cu_msg.cm_cmd = Cld_GraceDone;
-	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
+	cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
 	if (!ret)
-		ret = cup->cu_msg.cm_status;
+		ret = cup->cu_u.cu_msg.cm_status;
 
 	free_cld_upcall(cup);
 out_err:
@@ -1336,6 +1349,50 @@ cld_running(struct nfsd_net *nn)
 	return pipe->nreaders || pipe->nwriters;
 }
 
+static int
+nfsd4_cld_get_version(struct nfsd_net *nn)
+{
+	int ret = 0;
+	struct cld_upcall *cup;
+	struct cld_net *cn = nn->cld_net;
+	uint8_t version;
+
+	cup = alloc_cld_upcall(nn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+	cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion;
+	ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg);
+	if (!ret) {
+		ret = cup->cu_u.cu_msg.cm_status;
+		if (ret)
+			goto out_free;
+		version = cup->cu_u.cu_msg.cm_u.cm_version;
+		dprintk("%s: userspace returned version %u\n",
+				__func__, version);
+		if (version < 1)
+			version = 1;
+		else if (version > CLD_UPCALL_VERSION)
+			version = CLD_UPCALL_VERSION;
+
+		switch (version) {
+		case 1:
+			nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+			break;
+		default:
+			break;
+		}
+	}
+out_free:
+	free_cld_upcall(cup);
+out_err:
+	if (ret)
+		dprintk("%s: Unable to get version from userspace: %d\n",
+			__func__, ret);
+	return ret;
+}
+
 static int
 nfsd4_cld_tracking_init(struct net *net)
 {
@@ -1368,10 +1425,14 @@ nfsd4_cld_tracking_init(struct net *net)
 		goto err_remove;
 	}
 
+	status = nfsd4_cld_get_version(nn);
+	if (status == -EOPNOTSUPP)
+		pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n");
+
 	status = nfsd4_cld_grace_start(nn);
 	if (status) {
 		if (status == -EOPNOTSUPP)
-			printk(KERN_WARNING "NFSD: Please upgrade nfsdcld.\n");
+			pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n");
 		nfs4_release_reclaim(nn);
 		goto err_remove;
 	} else
@@ -1403,6 +1464,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = {
 	.remove		= nfsd4_cld_remove,
 	.check		= nfsd4_cld_check_v0,
 	.grace_done	= nfsd4_cld_grace_done_v0,
+	.version	= 1,
+	.msglen		= sizeof(struct cld_msg),
 };
 
 /* For newer nfsdcld's */
@@ -1413,6 +1476,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
 	.remove		= nfsd4_cld_remove,
 	.check		= nfsd4_cld_check,
 	.grace_done	= nfsd4_cld_grace_done,
+	.version	= 1,
+	.msglen		= sizeof(struct cld_msg),
 };
 
 /* upcall via usermodehelper */
@@ -1760,6 +1825,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
 	.remove		= nfsd4_umh_cltrack_remove,
 	.check		= nfsd4_umh_cltrack_check,
 	.grace_done	= nfsd4_umh_cltrack_grace_done,
+	.version	= 1,
+	.msglen		= 0,
 };
 
 int
diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h
index b1e9de4f07d5..c5aad16d10c0 100644
--- a/include/uapi/linux/nfsd/cld.h
+++ b/include/uapi/linux/nfsd/cld.h
@@ -36,7 +36,8 @@ enum cld_command {
 	Cld_Remove,		/* remove record of this cm_id */
 	Cld_Check,		/* is this cm_id allowed? */
 	Cld_GraceDone,		/* grace period is complete */
-	Cld_GraceStart,
+	Cld_GraceStart,		/* grace start (upload client records) */
+	Cld_GetVersion,		/* query max supported upcall version */
 };
 
 /* representation of long-form NFSv4 client ID */
@@ -54,7 +55,15 @@ struct cld_msg {
 	union {
 		__s64		cm_gracetime;	/* grace period start time */
 		struct cld_name	cm_name;
+		__u8		cm_version;	/* for getting max version */
 	} __attribute__((packed)) cm_u;
 } __attribute__((packed));
 
+struct cld_msg_hdr {
+	__u8		cm_vers;		/* upcall version */
+	__u8		cm_cmd;			/* upcall command */
+	__s16		cm_status;		/* return code */
+	__u32		cm_xid;			/* transaction id */
+} __attribute__((packed));
+
 #endif /* !_NFSD_CLD_H */
-- 
cgit v1.2.3


From 6ee95d1c899186c0798cafd25998d436bcdb9618 Mon Sep 17 00:00:00 2001
From: Scott Mayhew <smayhew@redhat.com>
Date: Mon, 9 Sep 2019 16:10:31 -0400
Subject: nfsd: add support for upcall version 2

Version 2 upcalls will allow the nfsd to include a hash of the kerberos
principal string in the Cld_Create upcall.  If a principal is present in
the svc_cred, then the hash will be included in the Cld_Create upcall.
We attempt to use the svc_cred.cr_raw_principal (which is returned by
gssproxy) first, and then fall back to using the svc_cred.cr_principal
(which is returned by both gssproxy and rpc.svcgssd).  Upon a subsequent
restart, the hash will be returned in the Cld_Gracestart downcall and
stored in the reclaim_str_hashtbl so it can be used when handling
reclaim opens.

Signed-off-by: Scott Mayhew <smayhew@redhat.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4recover.c         | 223 +++++++++++++++++++++++++++++++++++++++---
 fs/nfsd/nfs4state.c           |   6 +-
 fs/nfsd/state.h               |   3 +-
 include/uapi/linux/nfsd/cld.h |  30 +++++-
 4 files changed, 245 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 58a61339d40c..cdc75ad4438b 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -64,6 +64,7 @@ struct nfsd4_client_tracking_ops {
 };
 
 static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops;
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2;
 
 /* Globals */
 static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
@@ -177,6 +178,7 @@ __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp,
 		const char *dname, int len, struct nfsd_net *nn)
 {
 	struct xdr_netobj name;
+	struct xdr_netobj princhash = { .len = 0, .data = NULL };
 	struct nfs4_client_reclaim *crp;
 
 	name.data = kmemdup(dname, len, GFP_KERNEL);
@@ -186,7 +188,7 @@ __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp,
 		return;
 	}
 	name.len = len;
-	crp = nfs4_client_to_reclaim(name, nn);
+	crp = nfs4_client_to_reclaim(name, princhash, nn);
 	if (!crp) {
 		kfree(name.data);
 		return;
@@ -486,6 +488,7 @@ static int
 load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 {
 	struct xdr_netobj name;
+	struct xdr_netobj princhash = { .len = 0, .data = NULL };
 
 	if (child->d_name.len != HEXDIR_LEN - 1) {
 		printk("%s: illegal name %pd in recovery directory\n",
@@ -500,7 +503,7 @@ load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 		goto out;
 	}
 	name.len = HEXDIR_LEN;
-	if (!nfs4_client_to_reclaim(name, nn))
+	if (!nfs4_client_to_reclaim(name, princhash, nn))
 		kfree(name.data);
 out:
 	return 0;
@@ -737,6 +740,7 @@ struct cld_net {
 	struct list_head	 cn_list;
 	unsigned int		 cn_xid;
 	bool			 cn_has_legacy;
+	struct crypto_shash	*cn_tfm;
 };
 
 struct cld_upcall {
@@ -746,6 +750,7 @@ struct cld_upcall {
 	union {
 		struct cld_msg_hdr	 cu_hdr;
 		struct cld_msg		 cu_msg;
+		struct cld_msg_v2	 cu_msg_v2;
 	} cu_u;
 };
 
@@ -792,11 +797,11 @@ cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg)
 }
 
 static ssize_t
-__cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg,
+__cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
 		struct nfsd_net *nn)
 {
-	uint8_t cmd;
-	struct xdr_netobj name;
+	uint8_t cmd, princhashlen;
+	struct xdr_netobj name, princhash = { .len = 0, .data = NULL };
 	uint16_t namelen;
 	struct cld_net *cn = nn->cld_net;
 
@@ -805,19 +810,45 @@ __cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg,
 		return -EFAULT;
 	}
 	if (cmd == Cld_GraceStart) {
-		if (get_user(namelen, &cmsg->cm_u.cm_name.cn_len))
-			return -EFAULT;
-		name.data = memdup_user(&cmsg->cm_u.cm_name.cn_id, namelen);
-		if (IS_ERR_OR_NULL(name.data))
-			return -EFAULT;
-		name.len = namelen;
+		if (nn->client_tracking_ops->version >= 2) {
+			const struct cld_clntinfo __user *ci;
+
+			ci = &cmsg->cm_u.cm_clntinfo;
+			if (get_user(namelen, &ci->cc_name.cn_len))
+				return -EFAULT;
+			name.data = memdup_user(&ci->cc_name.cn_id, namelen);
+			if (IS_ERR_OR_NULL(name.data))
+				return -EFAULT;
+			name.len = namelen;
+			get_user(princhashlen, &ci->cc_princhash.cp_len);
+			if (princhashlen > 0) {
+				princhash.data = memdup_user(
+						&ci->cc_princhash.cp_data,
+						princhashlen);
+				if (IS_ERR_OR_NULL(princhash.data))
+					return -EFAULT;
+				princhash.len = princhashlen;
+			} else
+				princhash.len = 0;
+		} else {
+			const struct cld_name __user *cnm;
+
+			cnm = &cmsg->cm_u.cm_name;
+			if (get_user(namelen, &cnm->cn_len))
+				return -EFAULT;
+			name.data = memdup_user(&cnm->cn_id, namelen);
+			if (IS_ERR_OR_NULL(name.data))
+				return -EFAULT;
+			name.len = namelen;
+		}
 		if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) {
 			name.len = name.len - 5;
 			memmove(name.data, name.data + 5, name.len);
 			cn->cn_has_legacy = true;
 		}
-		if (!nfs4_client_to_reclaim(name, nn)) {
+		if (!nfs4_client_to_reclaim(name, princhash, nn)) {
 			kfree(name.data);
+			kfree(princhash.data);
 			return -EFAULT;
 		}
 		return nn->client_tracking_ops->msglen;
@@ -830,7 +861,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
 	struct cld_upcall *tmp, *cup;
 	struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src;
-	struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
+	struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src;
 	uint32_t xid;
 	struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
 						nfsd_net_id);
@@ -881,7 +912,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	if (status == -EINPROGRESS)
 		return __cld_pipe_inprogress_downcall(cmsg, nn);
 
-	if (copy_from_user(&cup->cu_u.cu_msg, src, mlen) != 0)
+	if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0)
 		return -EFAULT;
 
 	complete(&cup->cu_done);
@@ -1019,6 +1050,8 @@ nfsd4_remove_cld_pipe(struct net *net)
 
 	nfsd4_cld_unregister_net(net, cn->cn_pipe);
 	rpc_destroy_pipe_data(cn->cn_pipe);
+	if (cn->cn_tfm)
+		crypto_free_shash(cn->cn_tfm);
 	kfree(nn->cld_net);
 	nn->cld_net = NULL;
 }
@@ -1103,6 +1136,75 @@ out_err:
 				"record on stable storage: %d\n", ret);
 }
 
+/* Ask daemon to create a new record */
+static void
+nfsd4_cld_create_v2(struct nfs4_client *clp)
+{
+	int ret;
+	struct cld_upcall *cup;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+	struct cld_msg_v2 *cmsg;
+	struct crypto_shash *tfm = cn->cn_tfm;
+	struct xdr_netobj cksum;
+	char *principal = NULL;
+	SHASH_DESC_ON_STACK(desc, tfm);
+
+	/* Don't upcall if it's already stored */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return;
+
+	cup = alloc_cld_upcall(nn);
+	if (!cup) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	cmsg = &cup->cu_u.cu_msg_v2;
+	cmsg->cm_cmd = Cld_Create;
+	cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len;
+	memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data,
+			clp->cl_name.len);
+	if (clp->cl_cred.cr_raw_principal)
+		principal = clp->cl_cred.cr_raw_principal;
+	else if (clp->cl_cred.cr_principal)
+		principal = clp->cl_cred.cr_principal;
+	if (principal) {
+		desc->tfm = tfm;
+		cksum.len = crypto_shash_digestsize(tfm);
+		cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+		if (cksum.data == NULL) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ret = crypto_shash_digest(desc, principal, strlen(principal),
+					  cksum.data);
+		shash_desc_zero(desc);
+		if (ret) {
+			kfree(cksum.data);
+			goto out;
+		}
+		cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len;
+		memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data,
+		       cksum.data, cksum.len);
+		kfree(cksum.data);
+	} else
+		cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0;
+
+	ret = cld_pipe_upcall(cn->cn_pipe, cmsg);
+	if (!ret) {
+		ret = cmsg->cm_status;
+		set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
+	}
+
+out:
+	free_cld_upcall(cup);
+out_err:
+	if (ret)
+		pr_err("NFSD: Unable to create client record on stable storage: %d\n",
+				ret);
+}
+
 /* Ask daemon to create a new record */
 static void
 nfsd4_cld_remove(struct nfs4_client *clp)
@@ -1229,6 +1331,79 @@ found:
 	return 0;
 }
 
+static int
+nfsd4_cld_check_v2(struct nfs4_client *clp)
+{
+	struct nfs4_client_reclaim *crp;
+	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	struct cld_net *cn = nn->cld_net;
+	int status;
+	char dname[HEXDIR_LEN];
+	struct xdr_netobj name;
+	struct crypto_shash *tfm = cn->cn_tfm;
+	struct xdr_netobj cksum;
+	char *principal = NULL;
+	SHASH_DESC_ON_STACK(desc, tfm);
+
+	/* did we already find that this client is stable? */
+	if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
+		return 0;
+
+	/* look for it in the reclaim hashtable otherwise */
+	crp = nfsd4_find_reclaim_client(clp->cl_name, nn);
+	if (crp)
+		goto found;
+
+	if (cn->cn_has_legacy) {
+		status = nfs4_make_rec_clidname(dname, &clp->cl_name);
+		if (status)
+			return -ENOENT;
+
+		name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL);
+		if (!name.data) {
+			dprintk("%s: failed to allocate memory for name.data\n",
+					__func__);
+			return -ENOENT;
+		}
+		name.len = HEXDIR_LEN;
+		crp = nfsd4_find_reclaim_client(name, nn);
+		kfree(name.data);
+		if (crp)
+			goto found;
+
+	}
+	return -ENOENT;
+found:
+	if (crp->cr_princhash.len) {
+		if (clp->cl_cred.cr_raw_principal)
+			principal = clp->cl_cred.cr_raw_principal;
+		else if (clp->cl_cred.cr_principal)
+			principal = clp->cl_cred.cr_principal;
+		if (principal == NULL)
+			return -ENOENT;
+		desc->tfm = tfm;
+		cksum.len = crypto_shash_digestsize(tfm);
+		cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+		if (cksum.data == NULL)
+			return -ENOENT;
+		status = crypto_shash_digest(desc, principal, strlen(principal),
+					     cksum.data);
+		shash_desc_zero(desc);
+		if (status) {
+			kfree(cksum.data);
+			return -ENOENT;
+		}
+		if (memcmp(crp->cr_princhash.data, cksum.data,
+				crp->cr_princhash.len)) {
+			kfree(cksum.data);
+			return -ENOENT;
+		}
+		kfree(cksum.data);
+	}
+	crp->cr_clp = clp;
+	return 0;
+}
+
 static int
 nfsd4_cld_grace_start(struct nfsd_net *nn)
 {
@@ -1380,6 +1555,9 @@ nfsd4_cld_get_version(struct nfsd_net *nn)
 		case 1:
 			nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
 			break;
+		case 2:
+			nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2;
+			break;
 		default:
 			break;
 		}
@@ -1408,6 +1586,11 @@ nfsd4_cld_tracking_init(struct net *net)
 	status = __nfsd4_init_cld_pipe(net);
 	if (status)
 		goto err_shutdown;
+	nn->cld_net->cn_tfm = crypto_alloc_shash("sha256", 0, 0);
+	if (IS_ERR(nn->cld_net->cn_tfm)) {
+		status = PTR_ERR(nn->cld_net->cn_tfm);
+		goto err_remove;
+	}
 
 	/*
 	 * rpc pipe upcalls take 30 seconds to time out, so we don't want to
@@ -1480,6 +1663,18 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
 	.msglen		= sizeof(struct cld_msg),
 };
 
+/* v2 create/check ops include the principal, if available */
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = {
+	.init		= nfsd4_cld_tracking_init,
+	.exit		= nfsd4_cld_tracking_exit,
+	.create		= nfsd4_cld_create_v2,
+	.remove		= nfsd4_cld_remove,
+	.check		= nfsd4_cld_check_v2,
+	.grace_done	= nfsd4_cld_grace_done,
+	.version	= 2,
+	.msglen		= sizeof(struct cld_msg_v2),
+};
+
 /* upcall via usermodehelper */
 static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack";
 module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog),
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 906e214dcce8..da1093dd5016 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -6887,7 +6887,8 @@ nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn)
  * will be freed in nfs4_remove_reclaim_record in the normal case).
  */
 struct nfs4_client_reclaim *
-nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn)
+nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash,
+		struct nfsd_net *nn)
 {
 	unsigned int strhashval;
 	struct nfs4_client_reclaim *crp;
@@ -6900,6 +6901,8 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn)
 		list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]);
 		crp->cr_name.data = name.data;
 		crp->cr_name.len = name.len;
+		crp->cr_princhash.data = princhash.data;
+		crp->cr_princhash.len = princhash.len;
 		crp->cr_clp = NULL;
 		nn->reclaim_str_hashtbl_size++;
 	}
@@ -6911,6 +6914,7 @@ nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn)
 {
 	list_del(&crp->cr_strhash);
 	kfree(crp->cr_name.data);
+	kfree(crp->cr_princhash.data);
 	kfree(crp);
 	nn->reclaim_str_hashtbl_size--;
 }
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index d00c86d05beb..46f56afb6cb8 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -378,6 +378,7 @@ struct nfs4_client_reclaim {
 	struct list_head	cr_strhash;	/* hash by cr_name */
 	struct nfs4_client	*cr_clp;	/* pointer to associated clp */
 	struct xdr_netobj	cr_name;	/* recovery dir name */
+	struct xdr_netobj	cr_princhash;
 };
 
 /* A reasonable value for REPLAY_ISIZE was estimated as follows:  
@@ -645,7 +646,7 @@ extern void nfsd4_shutdown_callback(struct nfs4_client *);
 extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
 extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
 extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
-							struct nfsd_net *nn);
+				struct xdr_netobj princhash, struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
 
 struct nfs4_file *find_file(struct knfsd_fh *fh);
diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h
index c5aad16d10c0..a519313af953 100644
--- a/include/uapi/linux/nfsd/cld.h
+++ b/include/uapi/linux/nfsd/cld.h
@@ -26,11 +26,15 @@
 #include <linux/types.h>
 
 /* latest upcall version available */
-#define CLD_UPCALL_VERSION 1
+#define CLD_UPCALL_VERSION 2
 
 /* defined by RFC3530 */
 #define NFS4_OPAQUE_LIMIT 1024
 
+#ifndef SHA256_DIGEST_SIZE
+#define SHA256_DIGEST_SIZE      32
+#endif
+
 enum cld_command {
 	Cld_Create,		/* create a record for this cm_id */
 	Cld_Remove,		/* remove record of this cm_id */
@@ -46,6 +50,17 @@ struct cld_name {
 	unsigned char	cn_id[NFS4_OPAQUE_LIMIT];	/* client-provided */
 } __attribute__((packed));
 
+/* sha256 hash of the kerberos principal */
+struct cld_princhash {
+	__u8		cp_len;				/* length of cp_data */
+	unsigned char	cp_data[SHA256_DIGEST_SIZE];	/* hash of principal */
+} __attribute__((packed));
+
+struct cld_clntinfo {
+	struct cld_name		cc_name;
+	struct cld_princhash	cc_princhash;
+} __attribute__((packed));
+
 /* message struct for communication with userspace */
 struct cld_msg {
 	__u8		cm_vers;		/* upcall version */
@@ -59,6 +74,19 @@ struct cld_msg {
 	} __attribute__((packed)) cm_u;
 } __attribute__((packed));
 
+/* version 2 message can include hash of kerberos principal */
+struct cld_msg_v2 {
+	__u8		cm_vers;		/* upcall version */
+	__u8		cm_cmd;			/* upcall command */
+	__s16		cm_status;		/* return code */
+	__u32		cm_xid;			/* transaction id */
+	union {
+		struct cld_name	cm_name;
+		__u8		cm_version;	/* for getting max version */
+		struct cld_clntinfo cm_clntinfo; /* name & princ hash */
+	} __attribute__((packed)) cm_u;
+} __attribute__((packed));
+
 struct cld_msg_hdr {
 	__u8		cm_vers;		/* upcall version */
 	__u8		cm_cmd;			/* upcall command */
-- 
cgit v1.2.3