From c1326210477ecc06c53221f0005c64419aba30d6 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:20 +0000 Subject: nfs,nfsd: NFSv4.2 extended attribute protocol definitions Add definitions for the new operations, errors and flags as defined in RFC 8276 (File System Extended Attributes in NFSv4). Signed-off-by: Frank van der Linden Signed-off-by: Chuck Lever --- include/linux/nfs4.h | 20 ++++++++++++++++++++ include/uapi/linux/nfs4.h | 3 +++ 2 files changed, 23 insertions(+) diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 4dba3c948932..e6ca9d1d2e76 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -150,6 +150,12 @@ enum nfs_opnum4 { OP_WRITE_SAME = 70, OP_CLONE = 71, + /* xattr support (RFC8726) */ + OP_GETXATTR = 72, + OP_SETXATTR = 73, + OP_LISTXATTRS = 74, + OP_REMOVEXATTR = 75, + OP_ILLEGAL = 10044, }; @@ -280,6 +286,10 @@ enum nfsstat4 { NFS4ERR_WRONG_LFS = 10092, NFS4ERR_BADLABEL = 10093, NFS4ERR_OFFLOAD_NO_REQS = 10094, + + /* xattr (RFC8276) */ + NFS4ERR_NOXATTR = 10095, + NFS4ERR_XATTR2BIG = 10096, }; static inline bool seqid_mutating_err(u32 err) @@ -452,6 +462,7 @@ enum change_attr_type4 { #define FATTR4_WORD2_CHANGE_ATTR_TYPE (1UL << 15) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) #define FATTR4_WORD2_MODE_UMASK (1UL << 17) +#define FATTR4_WORD2_XATTR_SUPPORT (1UL << 18) /* MDS threshold bitmap bits */ #define THRESHOLD_RD (1UL << 0) @@ -700,4 +711,13 @@ struct nl4_server { struct nfs42_netaddr nl4_addr; /* NL4_NETADDR */ } u; }; + +/* + * Options for setxattr. These match the flags for setxattr(2). + */ +enum nfs4_setxattr_options { + SETXATTR4_EITHER = 0, + SETXATTR4_CREATE = 1, + SETXATTR4_REPLACE = 2, +}; #endif diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h index 8572930cf5b0..bf197e99b98f 100644 --- a/include/uapi/linux/nfs4.h +++ b/include/uapi/linux/nfs4.h @@ -33,6 +33,9 @@ #define NFS4_ACCESS_EXTEND 0x0008 #define NFS4_ACCESS_DELETE 0x0010 #define NFS4_ACCESS_EXECUTE 0x0020 +#define NFS4_ACCESS_XAREAD 0x0040 +#define NFS4_ACCESS_XAWRITE 0x0080 +#define NFS4_ACCESS_XALIST 0x0100 #define NFS4_FH_PERSISTENT 0x0000 #define NFS4_FH_NOEXPIRE_WITH_OPEN 0x0001 -- cgit v1.2.3 From 08b5d5014a27e717826999ad20e394a8811aae92 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:18 +0000 Subject: xattr: break delegations in {set,remove}xattr set/removexattr on an exported filesystem should break NFS delegations. This is true in general, but also for the upcoming support for RFC 8726 (NFSv4 extended attribute support). Make sure that they do. Additionally, they need to grow a _locked variant, since callers might call this with i_rwsem held (like the NFS server code). Cc: stable@vger.kernel.org # v4.9+ Cc: linux-fsdevel@vger.kernel.org Cc: Al Viro Signed-off-by: Frank van der Linden Signed-off-by: Chuck Lever --- fs/xattr.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++----- include/linux/xattr.h | 2 ++ 2 files changed, 79 insertions(+), 7 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index 91608d9bfc6a..95f38f57347f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -204,10 +204,22 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, return error; } - +/** + * __vfs_setxattr_locked: set an extended attribute while holding the inode + * lock + * + * @dentry - object to perform setxattr on + * @name - xattr name to set + * @value - value to set @name to + * @size - size of @value + * @flags - flags to pass into filesystem operations + * @delegated_inode - on return, will contain an inode pointer that + * a delegation was broken on, NULL if none. + */ int -vfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +__vfs_setxattr_locked(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; @@ -216,15 +228,40 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (error) return error; - inode_lock(inode); error = security_inode_setxattr(dentry, name, value, size, flags); if (error) goto out; + error = try_break_deleg(inode, delegated_inode); + if (error) + goto out; + error = __vfs_setxattr_noperm(dentry, name, value, size, flags); out: + return error; +} +EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); + +int +vfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + struct inode *delegated_inode = NULL; + int error; + +retry_deleg: + inode_lock(inode); + error = __vfs_setxattr_locked(dentry, name, value, size, flags, + &delegated_inode); inode_unlock(inode); + + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); @@ -378,8 +415,18 @@ __vfs_removexattr(struct dentry *dentry, const char *name) } EXPORT_SYMBOL(__vfs_removexattr); +/** + * __vfs_removexattr_locked: set an extended attribute while holding the inode + * lock + * + * @dentry - object to perform setxattr on + * @name - name of xattr to remove + * @delegated_inode - on return, will contain an inode pointer that + * a delegation was broken on, NULL if none. + */ int -vfs_removexattr(struct dentry *dentry, const char *name) +__vfs_removexattr_locked(struct dentry *dentry, const char *name, + struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; @@ -388,11 +435,14 @@ vfs_removexattr(struct dentry *dentry, const char *name) if (error) return error; - inode_lock(inode); error = security_inode_removexattr(dentry, name); if (error) goto out; + error = try_break_deleg(inode, delegated_inode); + if (error) + goto out; + error = __vfs_removexattr(dentry, name); if (!error) { @@ -401,12 +451,32 @@ vfs_removexattr(struct dentry *dentry, const char *name) } out: + return error; +} +EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); + +int +vfs_removexattr(struct dentry *dentry, const char *name) +{ + struct inode *inode = dentry->d_inode; + struct inode *delegated_inode = NULL; + int error; + +retry_deleg: + inode_lock(inode); + error = __vfs_removexattr_locked(dentry, name, &delegated_inode); inode_unlock(inode); + + if (delegated_inode) { + error = break_deleg_wait(&delegated_inode); + if (!error) + goto retry_deleg; + } + return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); - /* * Extended attribute SET operations */ diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 47eaa34f8761..a2f3cd02653c 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -51,8 +51,10 @@ ssize_t vfs_getxattr(struct dentry *, const char *, void *, size_t); ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size); int __vfs_setxattr(struct dentry *, struct inode *, const char *, const void *, size_t, int); int __vfs_setxattr_noperm(struct dentry *, const char *, const void *, size_t, int); +int __vfs_setxattr_locked(struct dentry *, const char *, const void *, size_t, int, struct inode **); int vfs_setxattr(struct dentry *, const char *, const void *, size_t, int); int __vfs_removexattr(struct dentry *, const char *); +int __vfs_removexattr_locked(struct dentry *, const char *, struct inode **); int vfs_removexattr(struct dentry *, const char *); ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size); -- cgit v1.2.3 From cab8d289c5ad541a5351a651d95c4086b7f84d7c Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:19 +0000 Subject: xattr: add a function to check if a namespace is supported Add a function that checks is an extended attribute namespace is supported for an inode, meaning that a handler must be present for either the whole namespace, or at least one synthetic xattr in the namespace. To be used by the nfs server code when being queried for extended attributes support. Cc: linux-fsdevel@vger.kernel.org Cc: Al Viro Signed-off-by: Frank van der Linden Signed-off-by: Chuck Lever --- fs/xattr.c | 27 +++++++++++++++++++++++++++ include/linux/xattr.h | 2 ++ 2 files changed, 29 insertions(+) diff --git a/fs/xattr.c b/fs/xattr.c index 95f38f57347f..386b45676d7e 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -134,6 +134,33 @@ xattr_permission(struct inode *inode, const char *name, int mask) return inode_permission(inode, mask); } +/* + * Look for any handler that deals with the specified namespace. + */ +int +xattr_supported_namespace(struct inode *inode, const char *prefix) +{ + const struct xattr_handler **handlers = inode->i_sb->s_xattr; + const struct xattr_handler *handler; + size_t preflen; + + if (!(inode->i_opflags & IOP_XATTR)) { + if (unlikely(is_bad_inode(inode))) + return -EIO; + return -EOPNOTSUPP; + } + + preflen = strlen(prefix); + + for_each_xattr_handler(handlers, handler) { + if (!strncmp(xattr_prefix(handler), prefix, preflen)) + return 0; + } + + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(xattr_supported_namespace); + int __vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/include/linux/xattr.h b/include/linux/xattr.h index a2f3cd02653c..fac75810d9d3 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -61,6 +61,8 @@ ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_siz ssize_t vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, size_t size, gfp_t flags); +int xattr_supported_namespace(struct inode *inode, const char *prefix); + static inline const char *xattr_prefix(const struct xattr_handler *handler) { return handler->prefix ?: handler->name; -- cgit v1.2.3 From 874c7b8ea545c7ebc50990843a234eafbc5c0c7f Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:21 +0000 Subject: nfsd: split off the write decode code into a separate function nfs4_decode_write has code to parse incoming XDR write data in to a kvec head, and a list of pages. Put this code in to a separate function, so that it can be used later by the xattr code, for setxattr. No functional change. Signed-off-by: Frank van der Linden Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 72 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 996ac01ee977..48806b493eba 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -257,6 +257,44 @@ svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len) return p; } +static __be32 +svcxdr_construct_vector(struct nfsd4_compoundargs *argp, struct kvec *head, + struct page ***pagelist, u32 buflen) +{ + int avail; + int len; + int pages; + + /* Sorry .. no magic macros for this.. * + * READ_BUF(write->wr_buflen); + * SAVEMEM(write->wr_buf, write->wr_buflen); + */ + avail = (char *)argp->end - (char *)argp->p; + if (avail + argp->pagelen < buflen) { + dprintk("NFSD: xdr error (%s:%d)\n", + __FILE__, __LINE__); + return nfserr_bad_xdr; + } + head->iov_base = argp->p; + head->iov_len = avail; + *pagelist = argp->pagelist; + + len = XDR_QUADLEN(buflen) << 2; + if (len >= avail) { + len -= avail; + + pages = len >> PAGE_SHIFT; + argp->pagelist += pages; + argp->pagelen -= pages * PAGE_SIZE; + len -= pages * PAGE_SIZE; + + next_decode_page(argp); + } + argp->p += XDR_QUADLEN(len); + + return 0; +} + /** * savemem - duplicate a chunk of memory for later processing * @argp: NFSv4 compound argument structure to be freed with @@ -1265,8 +1303,6 @@ nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify static __be32 nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) { - int avail; - int len; DECODE_HEAD; status = nfsd4_decode_stateid(argp, &write->wr_stateid); @@ -1279,34 +1315,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) goto xdr_error; write->wr_buflen = be32_to_cpup(p++); - /* Sorry .. no magic macros for this.. * - * READ_BUF(write->wr_buflen); - * SAVEMEM(write->wr_buf, write->wr_buflen); - */ - avail = (char*)argp->end - (char*)argp->p; - if (avail + argp->pagelen < write->wr_buflen) { - dprintk("NFSD: xdr error (%s:%d)\n", - __FILE__, __LINE__); - goto xdr_error; - } - write->wr_head.iov_base = p; - write->wr_head.iov_len = avail; - write->wr_pagelist = argp->pagelist; - - len = XDR_QUADLEN(write->wr_buflen) << 2; - if (len >= avail) { - int pages; - - len -= avail; - - pages = len >> PAGE_SHIFT; - argp->pagelist += pages; - argp->pagelen -= pages * PAGE_SIZE; - len -= pages * PAGE_SIZE; - - next_decode_page(argp); - } - argp->p += XDR_QUADLEN(len); + status = svcxdr_construct_vector(argp, &write->wr_head, + &write->wr_pagelist, write->wr_buflen); + if (status) + return status; DECODE_TAIL; } -- cgit v1.2.3 From 4dd05fceb7eeceac4daeceec0d6a2e6a2528a3e4 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:22 +0000 Subject: nfsd: add defines for NFSv4.2 extended attribute support Add defines for server-side extended attribute support. Most have already been added as part of client support, but these are the network order error codes for the noxattr and xattr2big errors, and the addition of the xattr support to the supported file attributes (if configured). Signed-off-by: Frank van der Linden Signed-off-by: Chuck Lever --- fs/nfsd/nfsd.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 57c832d1b30f..cb742e17e04a 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -286,6 +286,8 @@ void nfsd_lockd_shutdown(void); #define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS) #define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) #define nfserr_file_open cpu_to_be32(NFS4ERR_FILE_OPEN) +#define nfserr_xattr2big cpu_to_be32(NFS4ERR_XATTR2BIG) +#define nfserr_noxattr cpu_to_be32(NFS4ERR_NOXATTR) /* error codes for internal use */ /* if a request fails due to kmalloc failure, it gets dropped. @@ -387,7 +389,8 @@ void nfsd_lockd_shutdown(void); (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ FATTR4_WORD2_CHANGE_ATTR_TYPE | \ FATTR4_WORD2_MODE_UMASK | \ - NFSD4_2_SECURITY_ATTRS) + NFSD4_2_SECURITY_ATTRS | \ + FATTR4_WORD2_XATTR_SUPPORT) extern const u32 nfsd_suppattrs[3][3]; -- cgit v1.2.3 From 32119446bb65da559eb6f05236086fe449d2a024 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:23 +0000 Subject: nfsd: define xattr functions to call into their vfs counterparts This adds the filehandle based functions for the xattr operations that call in to the vfs layer to do the actual work. Signed-off-by: Frank van der Linden [ cel: address checkpatch.pl complaint ] Signed-off-by: Chuck Lever --- fs/nfsd/vfs.c | 227 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfsd/vfs.h | 10 +++ 2 files changed, 237 insertions(+) diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d22a056da477..6d2955253f73 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -2065,6 +2065,233 @@ static int exp_rdonly(struct svc_rqst *rqstp, struct svc_export *exp) return nfsexp_flags(rqstp, exp) & NFSEXP_READONLY; } +#ifdef CONFIG_NFSD_V4 +/* + * Helper function to translate error numbers. In the case of xattr operations, + * some error codes need to be translated outside of the standard translations. + * + * ENODATA needs to be translated to nfserr_noxattr. + * E2BIG to nfserr_xattr2big. + * + * Additionally, vfs_listxattr can return -ERANGE. This means that the + * file has too many extended attributes to retrieve inside an + * XATTR_LIST_MAX sized buffer. This is a bug in the xattr implementation: + * filesystems will allow the adding of extended attributes until they hit + * their own internal limit. This limit may be larger than XATTR_LIST_MAX. + * So, at that point, the attributes are present and valid, but can't + * be retrieved using listxattr, since the upper level xattr code enforces + * the XATTR_LIST_MAX limit. + * + * This bug means that we need to deal with listxattr returning -ERANGE. The + * best mapping is to return TOOSMALL. + */ +static __be32 +nfsd_xattr_errno(int err) +{ + switch (err) { + case -ENODATA: + return nfserr_noxattr; + case -E2BIG: + return nfserr_xattr2big; + case -ERANGE: + return nfserr_toosmall; + } + return nfserrno(err); +} + +/* + * Retrieve the specified user extended attribute. To avoid always + * having to allocate the maximum size (since we are not getting + * a maximum size from the RPC), do a probe + alloc. Hold a reader + * lock on i_rwsem to prevent the extended attribute from changing + * size while we're doing this. + */ +__be32 +nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, + void **bufp, int *lenp) +{ + ssize_t len; + __be32 err; + char *buf; + struct inode *inode; + struct dentry *dentry; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_READ); + if (err) + return err; + + err = nfs_ok; + dentry = fhp->fh_dentry; + inode = d_inode(dentry); + + inode_lock_shared(inode); + + len = vfs_getxattr(dentry, name, NULL, 0); + + /* + * Zero-length attribute, just return. + */ + if (len == 0) { + *bufp = NULL; + *lenp = 0; + goto out; + } + + if (len < 0) { + err = nfsd_xattr_errno(len); + goto out; + } + + if (len > *lenp) { + err = nfserr_toosmall; + goto out; + } + + buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS); + if (buf == NULL) { + err = nfserr_jukebox; + goto out; + } + + len = vfs_getxattr(dentry, name, buf, len); + if (len <= 0) { + kvfree(buf); + buf = NULL; + err = nfsd_xattr_errno(len); + } + + *lenp = len; + *bufp = buf; + +out: + inode_unlock_shared(inode); + + return err; +} + +/* + * Retrieve the xattr names. Since we can't know how many are + * user extended attributes, we must get all attributes here, + * and have the XDR encode filter out the "user." ones. + * + * While this could always just allocate an XATTR_LIST_MAX + * buffer, that's a waste, so do a probe + allocate. To + * avoid any changes between the probe and allocate, wrap + * this in inode_lock. + */ +__be32 +nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char **bufp, + int *lenp) +{ + ssize_t len; + __be32 err; + char *buf; + struct inode *inode; + struct dentry *dentry; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_READ); + if (err) + return err; + + dentry = fhp->fh_dentry; + inode = d_inode(dentry); + *lenp = 0; + + inode_lock_shared(inode); + + len = vfs_listxattr(dentry, NULL, 0); + if (len <= 0) { + err = nfsd_xattr_errno(len); + goto out; + } + + if (len > XATTR_LIST_MAX) { + err = nfserr_xattr2big; + goto out; + } + + /* + * We're holding i_rwsem - use GFP_NOFS. + */ + buf = kvmalloc(len, GFP_KERNEL | GFP_NOFS); + if (buf == NULL) { + err = nfserr_jukebox; + goto out; + } + + len = vfs_listxattr(dentry, buf, len); + if (len <= 0) { + kvfree(buf); + err = nfsd_xattr_errno(len); + goto out; + } + + *lenp = len; + *bufp = buf; + + err = nfs_ok; +out: + inode_unlock_shared(inode); + + return err; +} + +/* + * Removexattr and setxattr need to call fh_lock to both lock the inode + * and set the change attribute. Since the top-level vfs_removexattr + * and vfs_setxattr calls already do their own inode_lock calls, call + * the _locked variant. Pass in a NULL pointer for delegated_inode, + * and let the client deal with NFS4ERR_DELAY (same as with e.g. + * setattr and remove). + */ +__be32 +nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) +{ + int err, ret; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE); + if (err) + return err; + + ret = fh_want_write(fhp); + if (ret) + return nfserrno(ret); + + fh_lock(fhp); + + ret = __vfs_removexattr_locked(fhp->fh_dentry, name, NULL); + + fh_unlock(fhp); + fh_drop_write(fhp); + + return nfsd_xattr_errno(ret); +} + +__be32 +nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, + void *buf, u32 len, u32 flags) +{ + int err, ret; + + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_WRITE); + if (err) + return err; + + ret = fh_want_write(fhp); + if (ret) + return nfserrno(ret); + fh_lock(fhp); + + ret = __vfs_setxattr_locked(fhp->fh_dentry, name, buf, len, flags, + NULL); + + fh_unlock(fhp); + fh_drop_write(fhp); + + return nfsd_xattr_errno(ret); +} +#endif + /* * Check for a user's access permissions to this inode. */ diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 3eb660ad80d1..a2442ebe5acf 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -76,6 +76,16 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, loff_t, unsigned long, __be32 *verf); #endif /* CONFIG_NFSD_V3 */ +#ifdef CONFIG_NFSD_V4 +__be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *name, void **bufp, int *lenp); +__be32 nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + char **bufp, int *lenp); +__be32 nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *name); +__be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, + char *name, void *buf, u32 len, u32 flags); +#endif int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -- cgit v1.2.3 From c11d7fd1b3178cc651d532bab54adca7f26ae0d0 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:24 +0000 Subject: nfsd: take xattr bits into account for permission checks Since the NFSv4.2 extended attributes extension defines 3 new access bits for xattr operations, take them in to account when validating what the client is asking for, and when checking permissions. Signed-off-by: Frank van der Linden Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 8 +++++++- fs/nfsd/vfs.c | 12 ++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a09c35f0f6f0..841aad772798 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -566,8 +566,14 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_access *access = &u->access; + u32 access_full; - if (access->ac_req_access & ~NFS3_ACCESS_FULL) + access_full = NFS3_ACCESS_FULL; + if (cstate->minorversion >= 2) + access_full |= NFS4_ACCESS_XALIST | NFS4_ACCESS_XAREAD | + NFS4_ACCESS_XAWRITE; + + if (access->ac_req_access & ~access_full) return nfserr_inval; access->ac_resp_access = access->ac_req_access; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6d2955253f73..7d2933b85b65 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -612,6 +612,12 @@ static struct accessmap nfs3_regaccess[] = { { NFS3_ACCESS_MODIFY, NFSD_MAY_WRITE|NFSD_MAY_TRUNC }, { NFS3_ACCESS_EXTEND, NFSD_MAY_WRITE }, +#ifdef CONFIG_NFSD_V4 + { NFS4_ACCESS_XAREAD, NFSD_MAY_READ }, + { NFS4_ACCESS_XAWRITE, NFSD_MAY_WRITE }, + { NFS4_ACCESS_XALIST, NFSD_MAY_READ }, +#endif + { 0, 0 } }; @@ -622,6 +628,12 @@ static struct accessmap nfs3_diraccess[] = { { NFS3_ACCESS_EXTEND, NFSD_MAY_EXEC|NFSD_MAY_WRITE }, { NFS3_ACCESS_DELETE, NFSD_MAY_REMOVE }, +#ifdef CONFIG_NFSD_V4 + { NFS4_ACCESS_XAREAD, NFSD_MAY_READ }, + { NFS4_ACCESS_XAWRITE, NFSD_MAY_WRITE }, + { NFS4_ACCESS_XALIST, NFSD_MAY_READ }, +#endif + { 0, 0 } }; -- cgit v1.2.3 From 6178713bd46b06a1115f5bc6a3ff38e95b6da9ca Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:25 +0000 Subject: nfsd: add structure definitions for xattr requests / responses Add the structures used in extended attribute request / response handling. Signed-off-by: Frank van der Linden Signed-off-by: Chuck Lever --- fs/nfsd/xdr4.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index db63d39b1507..66499fb6b567 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -224,6 +224,32 @@ struct nfsd4_putfh { bool no_verify; /* represents foreigh fh */ }; +struct nfsd4_getxattr { + char *getxa_name; /* request */ + u32 getxa_len; /* request */ + void *getxa_buf; +}; + +struct nfsd4_setxattr { + u32 setxa_flags; /* request */ + char *setxa_name; /* request */ + char *setxa_buf; /* request */ + u32 setxa_len; /* request */ + struct nfsd4_change_info setxa_cinfo; /* response */ +}; + +struct nfsd4_removexattr { + char *rmxa_name; /* request */ + struct nfsd4_change_info rmxa_cinfo; /* response */ +}; + +struct nfsd4_listxattrs { + u64 lsxa_cookie; /* request */ + u32 lsxa_maxcount; /* request */ + char *lsxa_buf; /* unfiltered buffer (reply) */ + u32 lsxa_len; /* unfiltered len (reply) */ +}; + struct nfsd4_open { u32 op_claim_type; /* request */ struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */ @@ -649,6 +675,11 @@ struct nfsd4_op { struct nfsd4_offload_status offload_status; struct nfsd4_copy_notify copy_notify; struct nfsd4_seek seek; + + struct nfsd4_getxattr getxattr; + struct nfsd4_setxattr setxattr; + struct nfsd4_listxattrs listxattrs; + struct nfsd4_removexattr removexattr; } u; struct nfs4_replay * replay; }; -- cgit v1.2.3 From 23e50fe3a5e6045a573c69d4b0e3d78aa6183323 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:26 +0000 Subject: nfsd: implement the xattr functions and en/decode logic Implement the main entry points for the *XATTR operations. Add functions to calculate the reply size for the user extended attribute operations, and implement the XDR encode / decode logic for these operations. Add the user extended attributes operations to nfsd4_ops. Signed-off-by: Frank van der Linden Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 120 ++++++++++++++ fs/nfsd/nfs4xdr.c | 450 +++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfs4.h | 2 +- 3 files changed, 571 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 841aad772798..a527da3d8052 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -2097,6 +2097,68 @@ out: } #endif /* CONFIG_NFSD_PNFS */ +static __be32 +nfsd4_getxattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_getxattr *getxattr = &u->getxattr; + + return nfsd_getxattr(rqstp, &cstate->current_fh, + getxattr->getxa_name, &getxattr->getxa_buf, + &getxattr->getxa_len); +} + +static __be32 +nfsd4_setxattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_setxattr *setxattr = &u->setxattr; + __be32 ret; + + if (opens_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + + ret = nfsd_setxattr(rqstp, &cstate->current_fh, setxattr->setxa_name, + setxattr->setxa_buf, setxattr->setxa_len, + setxattr->setxa_flags); + + if (!ret) + set_change_info(&setxattr->setxa_cinfo, &cstate->current_fh); + + return ret; +} + +static __be32 +nfsd4_listxattrs(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + /* + * Get the entire list, then copy out only the user attributes + * in the encode function. + */ + return nfsd_listxattr(rqstp, &cstate->current_fh, + &u->listxattrs.lsxa_buf, &u->listxattrs.lsxa_len); +} + +static __be32 +nfsd4_removexattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + union nfsd4_op_u *u) +{ + struct nfsd4_removexattr *removexattr = &u->removexattr; + __be32 ret; + + if (opens_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + + ret = nfsd_removexattr(rqstp, &cstate->current_fh, + removexattr->rmxa_name); + + if (!ret) + set_change_info(&removexattr->rmxa_cinfo, &cstate->current_fh); + + return ret; +} + /* * NULL call. */ @@ -2706,6 +2768,42 @@ static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) return (op_encode_hdr_size + 3) * sizeof(__be32); } +static inline u32 nfsd4_getxattr_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + u32 maxcount, rlen; + + maxcount = svc_max_payload(rqstp); + rlen = min_t(u32, XATTR_SIZE_MAX, maxcount); + + return (op_encode_hdr_size + 1 + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + +static inline u32 nfsd4_setxattr_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} +static inline u32 nfsd4_listxattrs_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + u32 maxcount, rlen; + + maxcount = svc_max_payload(rqstp); + rlen = min(op->u.listxattrs.lsxa_maxcount, maxcount); + + return (op_encode_hdr_size + 4 + XDR_QUADLEN(rlen)) * sizeof(__be32); +} + +static inline u32 nfsd4_removexattr_rsize(struct svc_rqst *rqstp, + struct nfsd4_op *op) +{ + return (op_encode_hdr_size + op_encode_change_info_maxsz) + * sizeof(__be32); +} + + static const struct nfsd4_operation nfsd4_ops[] = { [OP_ACCESS] = { .op_func = nfsd4_access, @@ -3087,6 +3185,28 @@ static const struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_COPY_NOTIFY", .op_rsize_bop = nfsd4_copy_notify_rsize, }, + [OP_GETXATTR] = { + .op_func = nfsd4_getxattr, + .op_name = "OP_GETXATTR", + .op_rsize_bop = nfsd4_getxattr_rsize, + }, + [OP_SETXATTR] = { + .op_func = nfsd4_setxattr, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_SETXATTR", + .op_rsize_bop = nfsd4_setxattr_rsize, + }, + [OP_LISTXATTRS] = { + .op_func = nfsd4_listxattrs, + .op_name = "OP_LISTXATTRS", + .op_rsize_bop = nfsd4_listxattrs_rsize, + }, + [OP_REMOVEXATTR] = { + .op_func = nfsd4_removexattr, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_REMOVEXATTR", + .op_rsize_bop = nfsd4_removexattr_rsize, + }, }; /** diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 48806b493eba..8bacc0ceae19 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include "idmap.h" #include "acl.h" @@ -1877,6 +1879,208 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) DECODE_TAIL; } +/* + * XDR data that is more than PAGE_SIZE in size is normally part of a + * read or write. However, the size of extended attributes is limited + * by the maximum request size, and then further limited by the underlying + * filesystem limits. This can exceed PAGE_SIZE (currently, XATTR_SIZE_MAX + * is 64k). Since there is no kvec- or page-based interface to xattrs, + * and we're not dealing with contiguous pages, we need to do some copying. + */ + +/* + * Decode data into buffer. Uses head and pages constructed by + * svcxdr_construct_vector. + */ +static __be32 +nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct kvec *head, + struct page **pages, char **bufp, u32 buflen) +{ + char *tmp, *dp; + u32 len; + + if (buflen <= head->iov_len) { + /* + * We're in luck, the head has enough space. Just return + * the head, no need for copying. + */ + *bufp = head->iov_base; + return 0; + } + + tmp = svcxdr_tmpalloc(argp, buflen); + if (tmp == NULL) + return nfserr_jukebox; + + dp = tmp; + memcpy(dp, head->iov_base, head->iov_len); + buflen -= head->iov_len; + dp += head->iov_len; + + while (buflen > 0) { + len = min_t(u32, buflen, PAGE_SIZE); + memcpy(dp, page_address(*pages), len); + + buflen -= len; + dp += len; + pages++; + } + + *bufp = tmp; + return 0; +} + +/* + * Get a user extended attribute name from the XDR buffer. + * It will not have the "user." prefix, so prepend it. + * Lastly, check for nul characters in the name. + */ +static __be32 +nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep) +{ + DECODE_HEAD; + char *name, *sp, *dp; + u32 namelen, cnt; + + READ_BUF(4); + namelen = be32_to_cpup(p++); + + if (namelen > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) + return nfserr_nametoolong; + + if (namelen == 0) + goto xdr_error; + + READ_BUF(namelen); + + name = svcxdr_tmpalloc(argp, namelen + XATTR_USER_PREFIX_LEN + 1); + if (!name) + return nfserr_jukebox; + + memcpy(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + + /* + * Copy the extended attribute name over while checking for 0 + * characters. + */ + sp = (char *)p; + dp = name + XATTR_USER_PREFIX_LEN; + cnt = namelen; + + while (cnt-- > 0) { + if (*sp == '\0') + goto xdr_error; + *dp++ = *sp++; + } + *dp = '\0'; + + *namep = name; + + DECODE_TAIL; +} + +/* + * A GETXATTR op request comes without a length specifier. We just set the + * maximum length for the reply based on XATTR_SIZE_MAX and the maximum + * channel reply size. nfsd_getxattr will probe the length of the xattr, + * check it against getxa_len, and allocate + return the value. + */ +static __be32 +nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp, + struct nfsd4_getxattr *getxattr) +{ + __be32 status; + u32 maxcount; + + status = nfsd4_decode_xattr_name(argp, &getxattr->getxa_name); + if (status) + return status; + + maxcount = svc_max_payload(argp->rqstp); + maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount); + + getxattr->getxa_len = maxcount; + + return status; +} + +static __be32 +nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp, + struct nfsd4_setxattr *setxattr) +{ + DECODE_HEAD; + u32 flags, maxcount, size; + struct kvec head; + struct page **pagelist; + + READ_BUF(4); + flags = be32_to_cpup(p++); + + if (flags > SETXATTR4_REPLACE) + return nfserr_inval; + setxattr->setxa_flags = flags; + + status = nfsd4_decode_xattr_name(argp, &setxattr->setxa_name); + if (status) + return status; + + maxcount = svc_max_payload(argp->rqstp); + maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount); + + READ_BUF(4); + size = be32_to_cpup(p++); + if (size > maxcount) + return nfserr_xattr2big; + + setxattr->setxa_len = size; + if (size > 0) { + status = svcxdr_construct_vector(argp, &head, &pagelist, size); + if (status) + return status; + + status = nfsd4_vbuf_from_vector(argp, &head, pagelist, + &setxattr->setxa_buf, size); + } + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp, + struct nfsd4_listxattrs *listxattrs) +{ + DECODE_HEAD; + u32 maxcount; + + READ_BUF(12); + p = xdr_decode_hyper(p, &listxattrs->lsxa_cookie); + + /* + * If the cookie is too large to have even one user.x attribute + * plus trailing '\0' left in a maximum size buffer, it's invalid. + */ + if (listxattrs->lsxa_cookie >= + (XATTR_LIST_MAX / (XATTR_USER_PREFIX_LEN + 2))) + return nfserr_badcookie; + + maxcount = be32_to_cpup(p++); + if (maxcount < 8) + /* Always need at least 2 words (length and one character) */ + return nfserr_inval; + + maxcount = min(maxcount, svc_max_payload(argp->rqstp)); + listxattrs->lsxa_maxcount = maxcount; + + DECODE_TAIL; +} + +static __be32 +nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp, + struct nfsd4_removexattr *removexattr) +{ + return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name); +} + static __be32 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) { @@ -1973,6 +2177,11 @@ static const nfsd4_dec nfsd4_dec_ops[] = { [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone, + /* RFC 8276 extended atributes operations */ + [OP_GETXATTR] = (nfsd4_dec)nfsd4_decode_getxattr, + [OP_SETXATTR] = (nfsd4_dec)nfsd4_decode_setxattr, + [OP_LISTXATTRS] = (nfsd4_dec)nfsd4_decode_listxattrs, + [OP_REMOVEXATTR] = (nfsd4_dec)nfsd4_decode_removexattr, }; static inline bool @@ -4458,6 +4667,241 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) return nfserr; } +/* + * Encode kmalloc-ed buffer in to XDR stream. + */ +static int +nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen) +{ + u32 cplen; + __be32 *p; + + cplen = min_t(unsigned long, buflen, + ((void *)xdr->end - (void *)xdr->p)); + p = xdr_reserve_space(xdr, cplen); + if (!p) + return nfserr_resource; + + memcpy(p, buf, cplen); + buf += cplen; + buflen -= cplen; + + while (buflen) { + cplen = min_t(u32, buflen, PAGE_SIZE); + p = xdr_reserve_space(xdr, cplen); + if (!p) + return nfserr_resource; + + memcpy(p, buf, cplen); + + if (cplen < PAGE_SIZE) { + /* + * We're done, with a length that wasn't page + * aligned, so possibly not word aligned. Pad + * any trailing bytes with 0. + */ + xdr_encode_opaque_fixed(p, NULL, cplen); + break; + } + + buflen -= PAGE_SIZE; + buf += PAGE_SIZE; + } + + return 0; +} + +static __be32 +nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_getxattr *getxattr) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p, err; + + p = xdr_reserve_space(xdr, 4); + if (!p) + return nfserr_resource; + + *p = cpu_to_be32(getxattr->getxa_len); + + if (getxattr->getxa_len == 0) + return 0; + + err = nfsd4_vbuf_to_stream(xdr, getxattr->getxa_buf, + getxattr->getxa_len); + + kvfree(getxattr->getxa_buf); + + return err; +} + +static __be32 +nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_setxattr *setxattr) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + + encode_cinfo(p, &setxattr->setxa_cinfo); + + return 0; +} + +/* + * See if there are cookie values that can be rejected outright. + */ +static __be32 +nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs, + u32 *offsetp) +{ + u64 cookie = listxattrs->lsxa_cookie; + + /* + * If the cookie is larger than the maximum number we can fit + * in either the buffer we just got back from vfs_listxattr, or, + * XDR-encoded, in the return buffer, it's invalid. + */ + if (cookie > (listxattrs->lsxa_len) / (XATTR_USER_PREFIX_LEN + 2)) + return nfserr_badcookie; + + if (cookie > (listxattrs->lsxa_maxcount / + (XDR_QUADLEN(XATTR_USER_PREFIX_LEN + 2) + 4))) + return nfserr_badcookie; + + *offsetp = (u32)cookie; + return 0; +} + +static __be32 +nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_listxattrs *listxattrs) +{ + struct xdr_stream *xdr = &resp->xdr; + u32 cookie_offset, count_offset, eof; + u32 left, xdrleft, slen, count; + u32 xdrlen, offset; + u64 cookie; + char *sp; + __be32 status; + __be32 *p; + u32 nuser; + + eof = 1; + + status = nfsd4_listxattr_validate_cookie(listxattrs, &offset); + if (status) + goto out; + + /* + * Reserve space for the cookie and the name array count. Record + * the offsets to save them later. + */ + cookie_offset = xdr->buf->len; + count_offset = cookie_offset + 8; + p = xdr_reserve_space(xdr, 12); + if (!p) { + status = nfserr_resource; + goto out; + } + + count = 0; + left = listxattrs->lsxa_len; + sp = listxattrs->lsxa_buf; + nuser = 0; + + xdrleft = listxattrs->lsxa_maxcount; + + while (left > 0 && xdrleft > 0) { + slen = strlen(sp); + + /* + * Check if this a user. attribute, skip it if not. + */ + if (strncmp(sp, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + goto contloop; + + slen -= XATTR_USER_PREFIX_LEN; + xdrlen = 4 + ((slen + 3) & ~3); + if (xdrlen > xdrleft) { + if (count == 0) { + /* + * Can't even fit the first attribute name. + */ + status = nfserr_toosmall; + goto out; + } + eof = 0; + goto wreof; + } + + left -= XATTR_USER_PREFIX_LEN; + sp += XATTR_USER_PREFIX_LEN; + if (nuser++ < offset) + goto contloop; + + + p = xdr_reserve_space(xdr, xdrlen); + if (!p) { + status = nfserr_resource; + goto out; + } + + p = xdr_encode_opaque(p, sp, slen); + + xdrleft -= xdrlen; + count++; +contloop: + sp += slen + 1; + left -= slen + 1; + } + + /* + * If there were user attributes to copy, but we didn't copy + * any, the offset was too large (e.g. the cookie was invalid). + */ + if (nuser > 0 && count == 0) { + status = nfserr_badcookie; + goto out; + } + +wreof: + p = xdr_reserve_space(xdr, 4); + if (!p) { + status = nfserr_resource; + goto out; + } + *p = cpu_to_be32(eof); + + cookie = offset + count; + + write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &cookie, 8); + count = htonl(count); + write_bytes_to_xdr_buf(xdr->buf, count_offset, &count, 4); +out: + if (listxattrs->lsxa_len) + kvfree(listxattrs->lsxa_buf); + return status; +} + +static __be32 +nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_removexattr *removexattr) +{ + struct xdr_stream *xdr = &resp->xdr; + __be32 *p; + + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + + p = encode_cinfo(p, &removexattr->rmxa_cinfo); + return 0; +} + typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); /* @@ -4547,6 +4991,12 @@ static const nfsd4_enc nfsd4_enc_ops[] = { [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop, + + /* RFC 8276 extended atributes operations */ + [OP_GETXATTR] = (nfsd4_enc)nfsd4_encode_getxattr, + [OP_SETXATTR] = (nfsd4_enc)nfsd4_encode_setxattr, + [OP_LISTXATTRS] = (nfsd4_enc)nfsd4_encode_listxattrs, + [OP_REMOVEXATTR] = (nfsd4_enc)nfsd4_encode_removexattr, }; /* diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index e6ca9d1d2e76..33ebe476428e 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -165,7 +165,7 @@ Needs to be updated if more operations are defined in future.*/ #define FIRST_NFS4_OP OP_ACCESS #define LAST_NFS40_OP OP_RELEASE_LOCKOWNER #define LAST_NFS41_OP OP_RECLAIM_COMPLETE -#define LAST_NFS42_OP OP_CLONE +#define LAST_NFS42_OP OP_REMOVEXATTR #define LAST_NFS4_OP LAST_NFS42_OP enum nfsstat4 { -- cgit v1.2.3 From 0e885e846d96df0c8a4a829b1ad355a82ccda656 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Tue, 23 Jun 2020 22:39:27 +0000 Subject: nfsd: add fattr support for user extended attributes Check if user extended attributes are supported for an inode, and return the answer when being queried for file attributes. An exported filesystem can now signal its RFC8276 user extended attributes capability. Signed-off-by: Frank van der Linden Signed-off-by: Chuck Lever --- fs/nfsd/nfs4xdr.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 8bacc0ceae19..259d5ad0e3f4 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3213,6 +3213,15 @@ out_acl: } #endif + if (bmval2 & FATTR4_WORD2_XATTR_SUPPORT) { + p = xdr_reserve_space(xdr, 4); + if (!p) + goto out_resource; + err = xattr_supported_namespace(d_inode(dentry), + XATTR_USER_PREFIX); + *p++ = cpu_to_be32(err == 0); + } + attrlen = htonl(xdr->buf->len - attrlen_offset - 4); write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4); status = nfs_ok; -- cgit v1.2.3 From 10b9d99a3dbbf5e9af838d1887a1047875dcafd9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 18 Apr 2020 18:30:42 -0400 Subject: SUNRPC: Augment server-side rpcgss tracepoints Add similar tracepoints to those that were recently added on the client side to track failures in the integ and priv unwrap paths. And, let's collect the seqno-specific tracepoints together with a common naming convention. Regarding the gss_check_seq_num() changes: everywhere else treats the GSS sequence number as an unsigned 32-bit integer. As far back as 2.6.12, I couldn't find a compelling reason to do things differently here. As a defensive change it's better to eliminate needless implicit sign conversions. Signed-off-by: Chuck Lever --- include/trace/events/rpcgss.h | 168 ++++++++++++++++++++++++++++++++------ net/sunrpc/auth_gss/svcauth_gss.c | 117 +++++++++++++++++--------- net/sunrpc/auth_gss/trace.c | 3 + 3 files changed, 224 insertions(+), 64 deletions(-) diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index b9b51a4b1db1..ffdbe6f85da8 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -170,55 +170,144 @@ DECLARE_EVENT_CLASS(rpcgss_ctx_class, DEFINE_CTX_EVENT(init); DEFINE_CTX_EVENT(destroy); +DECLARE_EVENT_CLASS(rpcgss_svc_gssapi_class, + TP_PROTO( + const struct svc_rqst *rqstp, + u32 maj_stat + ), + + TP_ARGS(rqstp, maj_stat), + + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, maj_stat) + __string(addr, rqstp->rq_xprt->xpt_remotebuf) + ), + + TP_fast_assign( + __entry->xid = __be32_to_cpu(rqstp->rq_xid); + __entry->maj_stat = maj_stat; + __assign_str(addr, rqstp->rq_xprt->xpt_remotebuf); + ), + + TP_printk("addr=%s xid=0x%08x maj_stat=%s", + __get_str(addr), __entry->xid, + __entry->maj_stat == 0 ? + "GSS_S_COMPLETE" : show_gss_status(__entry->maj_stat)) +); + +#define DEFINE_SVC_GSSAPI_EVENT(name) \ + DEFINE_EVENT(rpcgss_svc_gssapi_class, rpcgss_svc_##name, \ + TP_PROTO( \ + const struct svc_rqst *rqstp, \ + u32 maj_stat \ + ), \ + TP_ARGS(rqstp, maj_stat)) + +DEFINE_SVC_GSSAPI_EVENT(unwrap); +DEFINE_SVC_GSSAPI_EVENT(mic); + +TRACE_EVENT(rpcgss_svc_unwrap_failed, + TP_PROTO( + const struct svc_rqst *rqstp + ), + + TP_ARGS(rqstp), + + TP_STRUCT__entry( + __field(u32, xid) + __string(addr, rqstp->rq_xprt->xpt_remotebuf) + ), + + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __assign_str(addr, rqstp->rq_xprt->xpt_remotebuf); + ), + + TP_printk("addr=%s xid=0x%08x", __get_str(addr), __entry->xid) +); + +TRACE_EVENT(rpcgss_svc_seqno_bad, + TP_PROTO( + const struct svc_rqst *rqstp, + u32 expected, + u32 received + ), + + TP_ARGS(rqstp, expected, received), + + TP_STRUCT__entry( + __field(u32, expected) + __field(u32, received) + __field(u32, xid) + __string(addr, rqstp->rq_xprt->xpt_remotebuf) + ), + + TP_fast_assign( + __entry->expected = expected; + __entry->received = received; + __entry->xid = __be32_to_cpu(rqstp->rq_xid); + __assign_str(addr, rqstp->rq_xprt->xpt_remotebuf); + ), + + TP_printk("addr=%s xid=0x%08x expected seqno %u, received seqno %u", + __get_str(addr), __entry->xid, + __entry->expected, __entry->received) +); + TRACE_EVENT(rpcgss_svc_accept_upcall, TP_PROTO( - __be32 xid, + const struct svc_rqst *rqstp, u32 major_status, u32 minor_status ), - TP_ARGS(xid, major_status, minor_status), + TP_ARGS(rqstp, major_status, minor_status), TP_STRUCT__entry( - __field(u32, xid) __field(u32, minor_status) __field(unsigned long, major_status) + __field(u32, xid) + __string(addr, rqstp->rq_xprt->xpt_remotebuf) ), TP_fast_assign( - __entry->xid = be32_to_cpu(xid); __entry->minor_status = minor_status; __entry->major_status = major_status; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __assign_str(addr, rqstp->rq_xprt->xpt_remotebuf); ), - TP_printk("xid=0x%08x major_status=%s (0x%08lx) minor_status=%u", - __entry->xid, __entry->major_status == 0 ? "GSS_S_COMPLETE" : - show_gss_status(__entry->major_status), + TP_printk("addr=%s xid=0x%08x major_status=%s (0x%08lx) minor_status=%u", + __get_str(addr), __entry->xid, + (__entry->major_status == 0) ? "GSS_S_COMPLETE" : + show_gss_status(__entry->major_status), __entry->major_status, __entry->minor_status ) ); -TRACE_EVENT(rpcgss_svc_accept, +TRACE_EVENT(rpcgss_svc_authenticate, TP_PROTO( - __be32 xid, - size_t len + const struct svc_rqst *rqstp, + const struct rpc_gss_wire_cred *gc ), - TP_ARGS(xid, len), + TP_ARGS(rqstp, gc), TP_STRUCT__entry( + __field(u32, seqno) __field(u32, xid) - __field(size_t, len) + __string(addr, rqstp->rq_xprt->xpt_remotebuf) ), TP_fast_assign( - __entry->xid = be32_to_cpu(xid); - __entry->len = len; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->seqno = gc->gc_seq; + __assign_str(addr, rqstp->rq_xprt->xpt_remotebuf); ), - TP_printk("xid=0x%08x len=%zu", - __entry->xid, __entry->len - ) + TP_printk("addr=%s xid=0x%08x seqno=%u", __get_str(addr), + __entry->xid, __entry->seqno) ); @@ -371,11 +460,11 @@ TRACE_EVENT(rpcgss_update_slack, DECLARE_EVENT_CLASS(rpcgss_svc_seqno_class, TP_PROTO( - __be32 xid, + const struct svc_rqst *rqstp, u32 seqno ), - TP_ARGS(xid, seqno), + TP_ARGS(rqstp, seqno), TP_STRUCT__entry( __field(u32, xid) @@ -383,25 +472,52 @@ DECLARE_EVENT_CLASS(rpcgss_svc_seqno_class, ), TP_fast_assign( - __entry->xid = be32_to_cpu(xid); + __entry->xid = be32_to_cpu(rqstp->rq_xid); __entry->seqno = seqno; ), - TP_printk("xid=0x%08x seqno=%u, request discarded", + TP_printk("xid=0x%08x seqno=%u", __entry->xid, __entry->seqno) ); #define DEFINE_SVC_SEQNO_EVENT(name) \ - DEFINE_EVENT(rpcgss_svc_seqno_class, rpcgss_svc_##name, \ + DEFINE_EVENT(rpcgss_svc_seqno_class, rpcgss_svc_seqno_##name, \ TP_PROTO( \ - __be32 xid, \ + const struct svc_rqst *rqstp, \ u32 seqno \ ), \ - TP_ARGS(xid, seqno)) + TP_ARGS(rqstp, seqno)) -DEFINE_SVC_SEQNO_EVENT(large_seqno); -DEFINE_SVC_SEQNO_EVENT(old_seqno); +DEFINE_SVC_SEQNO_EVENT(large); +DEFINE_SVC_SEQNO_EVENT(seen); +TRACE_EVENT(rpcgss_svc_seqno_low, + TP_PROTO( + const struct svc_rqst *rqstp, + u32 seqno, + u32 min, + u32 max + ), + + TP_ARGS(rqstp, seqno, min, max), + + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, seqno) + __field(u32, min) + __field(u32, max) + ), + + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->seqno = seqno; + __entry->min = min; + __entry->max = max; + ), + + TP_printk("xid=0x%08x seqno=%u window=[%u..%u]", + __entry->xid, __entry->seqno, __entry->min, __entry->max) +); /** ** gssd upcall related trace events diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 46027d0c903f..7d83f54aaaa6 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -332,7 +332,7 @@ static struct rsi *rsi_update(struct cache_detail *cd, struct rsi *new, struct r struct gss_svc_seq_data { /* highest seq number seen so far: */ - int sd_max; + u32 sd_max; /* for i such that sd_max-GSS_SEQ_WIN < i <= sd_max, the i-th bit of * sd_win is nonzero iff sequence number i has been seen already: */ unsigned long sd_win[GSS_SEQ_WIN/BITS_PER_LONG]; @@ -613,16 +613,29 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle) return found; } -/* Implements sequence number algorithm as specified in RFC 2203. */ -static int -gss_check_seq_num(struct rsc *rsci, int seq_num) +/** + * gss_check_seq_num - GSS sequence number window check + * @rqstp: RPC Call to use when reporting errors + * @rsci: cached GSS context state (updated on return) + * @seq_num: sequence number to check + * + * Implements sequence number algorithm as specified in + * RFC 2203, Section 5.3.3.1. "Context Management". + * + * Return values: + * %true: @rqstp's GSS sequence number is inside the window + * %false: @rqstp's GSS sequence number is outside the window + */ +static bool gss_check_seq_num(const struct svc_rqst *rqstp, struct rsc *rsci, + u32 seq_num) { struct gss_svc_seq_data *sd = &rsci->seqdata; + bool result = false; spin_lock(&sd->sd_lock); if (seq_num > sd->sd_max) { if (seq_num >= sd->sd_max + GSS_SEQ_WIN) { - memset(sd->sd_win,0,sizeof(sd->sd_win)); + memset(sd->sd_win, 0, sizeof(sd->sd_win)); sd->sd_max = seq_num; } else while (sd->sd_max < seq_num) { sd->sd_max++; @@ -631,17 +644,25 @@ gss_check_seq_num(struct rsc *rsci, int seq_num) __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); goto ok; } else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) { - goto drop; + goto toolow; } - /* sd_max - GSS_SEQ_WIN < seq_num <= sd_max */ if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) - goto drop; + goto alreadyseen; + ok: + result = true; +out: spin_unlock(&sd->sd_lock); - return 1; -drop: - spin_unlock(&sd->sd_lock); - return 0; + return result; + +toolow: + trace_rpcgss_svc_seqno_low(rqstp, seq_num, + sd->sd_max - GSS_SEQ_WIN, + sd->sd_max); + goto out; +alreadyseen: + trace_rpcgss_svc_seqno_seen(rqstp, seq_num); + goto out; } static inline u32 round_up_to_quad(u32 i) @@ -721,14 +742,12 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, } if (gc->gc_seq > MAXSEQ) { - trace_rpcgss_svc_large_seqno(rqstp->rq_xid, gc->gc_seq); + trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq); *authp = rpcsec_gsserr_ctxproblem; return SVC_DENIED; } - if (!gss_check_seq_num(rsci, gc->gc_seq)) { - trace_rpcgss_svc_old_seqno(rqstp->rq_xid, gc->gc_seq); + if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq)) return SVC_DROP; - } return SVC_OK; } @@ -866,11 +885,13 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) static int unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) { + u32 integ_len, rseqno, maj_stat; int stat = -EINVAL; - u32 integ_len, maj_stat; struct xdr_netobj mic; struct xdr_buf integ_buf; + mic.data = NULL; + /* NFS READ normally uses splice to send data in-place. However * the data in cache can change after the reply's MIC is computed * but before the RPC reply is sent. To prevent the client from @@ -885,34 +906,44 @@ unwrap_integ_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct g integ_len = svc_getnl(&buf->head[0]); if (integ_len & 3) - return stat; + goto unwrap_failed; if (integ_len > buf->len) - return stat; - if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) { - WARN_ON_ONCE(1); - return stat; - } + goto unwrap_failed; + if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) + goto unwrap_failed; + /* copy out mic... */ if (read_u32_from_xdr_buf(buf, integ_len, &mic.len)) - return stat; + goto unwrap_failed; if (mic.len > RPC_MAX_AUTH_SIZE) - return stat; + goto unwrap_failed; mic.data = kmalloc(mic.len, GFP_KERNEL); if (!mic.data) - return stat; + goto unwrap_failed; if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) - goto out; + goto unwrap_failed; maj_stat = gss_verify_mic(ctx, &integ_buf, &mic); if (maj_stat != GSS_S_COMPLETE) - goto out; - if (svc_getnl(&buf->head[0]) != seq) - goto out; + goto bad_mic; + rseqno = svc_getnl(&buf->head[0]); + if (rseqno != seq) + goto bad_seqno; /* trim off the mic and padding at the end before returning */ xdr_buf_trim(buf, round_up_to_quad(mic.len) + 4); stat = 0; out: kfree(mic.data); return stat; + +unwrap_failed: + trace_rpcgss_svc_unwrap_failed(rqstp); + goto out; +bad_seqno: + trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno); + goto out; +bad_mic: + trace_rpcgss_svc_mic(rqstp, maj_stat); + goto out; } static inline int @@ -937,6 +968,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs { u32 priv_len, maj_stat; int pad, remaining_len, offset; + u32 rseqno; clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); @@ -951,7 +983,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs * not yet read from the head, so these two values are different: */ remaining_len = total_buf_len(buf); if (priv_len > remaining_len) - return -EINVAL; + goto unwrap_failed; pad = remaining_len - priv_len; buf->len -= pad; fix_priv_head(buf, pad); @@ -972,11 +1004,22 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs fix_priv_head(buf, pad); } if (maj_stat != GSS_S_COMPLETE) - return -EINVAL; + goto bad_unwrap; out_seq: - if (svc_getnl(&buf->head[0]) != seq) - return -EINVAL; + rseqno = svc_getnl(&buf->head[0]); + if (rseqno != seq) + goto bad_seqno; return 0; + +unwrap_failed: + trace_rpcgss_svc_unwrap_failed(rqstp); + return -EINVAL; +bad_seqno: + trace_rpcgss_svc_seqno_bad(rqstp, seq, rseqno); + return -EINVAL; +bad_unwrap: + trace_rpcgss_svc_unwrap(rqstp, maj_stat); + return -EINVAL; } struct gss_svc_data { @@ -1314,8 +1357,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, if (status) goto out; - trace_rpcgss_svc_accept_upcall(rqstp->rq_xid, ud.major_status, - ud.minor_status); + trace_rpcgss_svc_accept_upcall(rqstp, ud.major_status, ud.minor_status); switch (ud.major_status) { case GSS_S_CONTINUE_NEEDED: @@ -1490,8 +1532,6 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); - trace_rpcgss_svc_accept(rqstp->rq_xid, argv->iov_len); - *authp = rpc_autherr_badcred; if (!svcdata) svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); @@ -1608,6 +1648,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) GSS_C_QOP_DEFAULT, gc->gc_svc); ret = SVC_OK; + trace_rpcgss_svc_authenticate(rqstp, gc); goto out; } garbage_args: diff --git a/net/sunrpc/auth_gss/trace.c b/net/sunrpc/auth_gss/trace.c index 49fa583d7f91..d26036a57443 100644 --- a/net/sunrpc/auth_gss/trace.c +++ b/net/sunrpc/auth_gss/trace.c @@ -5,6 +5,9 @@ #include #include +#include +#include +#include #include #include -- cgit v1.2.3 From e814eecbe3bbeaa8b004d25a4b8974d232b765a9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Jun 2020 12:44:56 -0400 Subject: svcrdma: Fix page leak in svc_rdma_recv_read_chunk() Commit 07d0ff3b0cd2 ("svcrdma: Clean up Read chunk path") moved the page saver logic so that it gets executed event when an error occurs. In that case, the I/O is never posted, and those pages are then leaked. Errors in this path, however, are quite rare. Fixes: 07d0ff3b0cd2 ("svcrdma: Clean up Read chunk path") Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_rw.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 5eb35309ecef..83806fa94def 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -684,7 +684,6 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, struct svc_rdma_read_info *info, __be32 *p) { - unsigned int i; int ret; ret = -EINVAL; @@ -707,12 +706,6 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, info->ri_chunklen += rs_length; } - /* Pages under I/O have been copied to head->rc_pages. - * Prevent their premature release by svc_xprt_release() . - */ - for (i = 0; i < info->ri_readctxt->rc_page_count; i++) - rqstp->rq_pages[i] = NULL; - return ret; } @@ -807,6 +800,26 @@ out: return ret; } +/* Pages under I/O have been copied to head->rc_pages. Ensure they + * are not released by svc_xprt_release() until the I/O is complete. + * + * This has to be done after all Read WRs are constructed to properly + * handle a page that is part of I/O on behalf of two different RDMA + * segments. + * + * Do this only if I/O has been posted. Otherwise, we do indeed want + * svc_xprt_release() to clean things up properly. + */ +static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, + const unsigned int start, + const unsigned int num_pages) +{ + unsigned int i; + + for (i = start; i < num_pages + start; i++) + rqstp->rq_pages[i] = NULL; +} + /** * svc_rdma_recv_read_chunk - Pull a Read chunk from the client * @rdma: controlling RDMA transport @@ -860,6 +873,7 @@ int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, ret = svc_rdma_post_chunk_ctxt(&info->ri_cc); if (ret < 0) goto out_err; + svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count); return 0; out_err: -- cgit v1.2.3 From 6e9fab7073e5b6bdba4d5891cd486ffdd7de373f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 25 Mar 2020 11:15:55 -0400 Subject: svcrdma: Remove save_io_pages() call from send_error_msg() Commit 4757d90b15d8 ("svcrdma: Report Write/Reply chunk overruns") made an effort to preserve I/O pages until RDMA Write completion. In a subsequent patch, I intend to de-duplicate the two functions that send ERR_CHUNK responses. Pull the save_io_pages() call out of svc_rdma_send_error_msg() to make it more like svc_rdma_send_error(). Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 38e7c3c8c4a9..2f88d01e8d27 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -806,8 +806,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, /* Given the client-provided Write and Reply chunks, the server was not * able to form a complete reply. Return an RDMA_ERROR message so the - * client can retire this RPC transaction. As above, the Send completion - * routine releases payload pages that were part of a previous RDMA Write. + * client can retire this RPC transaction. * * Remote Invalidation is skipped for simplicity. */ @@ -834,8 +833,6 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, *p = err_chunk; trace_svcrdma_err_chunk(*rdma_argp); - svc_rdma_save_io_pages(rqstp, ctxt); - ctxt->sc_send_wr.num_sge = 1; ctxt->sc_send_wr.opcode = IB_WR_SEND; ctxt->sc_sges[0].length = ctxt->sc_hdrbuf.len; @@ -930,6 +927,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (ret != -E2BIG && ret != -EINVAL) goto err1; + /* Send completion releases payload pages that were part + * of previously posted RDMA Writes. + */ + svc_rdma_save_io_pages(rqstp, sctxt); ret = svc_rdma_send_error_msg(rdma, sctxt, rqstp); if (ret < 0) goto err1; -- cgit v1.2.3 From d1f6e2369c63f2cbc7a7441d3ee9b0eabfa9a327 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 25 Mar 2020 11:31:37 -0400 Subject: svcrdma: Add @rctxt parameter to svc_rdma_send_error() functions Another step towards making svc_rdma_send_error_msg() and svc_rdma_send_error() similar enough to eliminate one of them. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 9 +++++---- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 23 +++++++++++------------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index e426fedb9524..60d855116ae7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -715,10 +715,11 @@ static void rdma_read_complete(struct svc_rqst *rqstp, } static void svc_rdma_send_error(struct svcxprt_rdma *xprt, - __be32 *rdma_argp, int status) + struct svc_rdma_recv_ctxt *rctxt, + int status) { + __be32 *p, *rdma_argp = rctxt->rc_recv_buf; struct svc_rdma_send_ctxt *ctxt; - __be32 *p; int ret; ctxt = svc_rdma_send_ctxt_get(xprt); @@ -900,13 +901,13 @@ out_readchunk: return 0; out_err: - svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_send_error(rdma_xprt, ctxt, ret); svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return 0; out_postfail: if (ret == -EINVAL) - svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_send_error(rdma_xprt, ctxt, ret); svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return ret; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 2f88d01e8d27..47ada61411c3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -811,18 +811,17 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, * Remote Invalidation is skipped for simplicity. */ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct svc_rqst *rqstp) + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_recv_ctxt *rctxt) { - struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; __be32 *rdma_argp = rctxt->rc_recv_buf; __be32 *p; - rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0); - xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, - NULL); + rpcrdma_set_xdrlen(&sctxt->sc_hdrbuf, 0); + xdr_init_encode(&sctxt->sc_stream, &sctxt->sc_hdrbuf, + sctxt->sc_xprt_buf, NULL); - p = xdr_reserve_space(&ctxt->sc_stream, RPCRDMA_HDRLEN_ERR); + p = xdr_reserve_space(&sctxt->sc_stream, RPCRDMA_HDRLEN_ERR); if (!p) return -ENOMSG; @@ -833,10 +832,10 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, *p = err_chunk; trace_svcrdma_err_chunk(*rdma_argp); - ctxt->sc_send_wr.num_sge = 1; - ctxt->sc_send_wr.opcode = IB_WR_SEND; - ctxt->sc_sges[0].length = ctxt->sc_hdrbuf.len; - return svc_rdma_send(rdma, &ctxt->sc_send_wr); + sctxt->sc_send_wr.num_sge = 1; + sctxt->sc_send_wr.opcode = IB_WR_SEND; + sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; + return svc_rdma_send(rdma, &sctxt->sc_send_wr); } /** @@ -931,7 +930,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) * of previously posted RDMA Writes. */ svc_rdma_save_io_pages(rqstp, sctxt); - ret = svc_rdma_send_error_msg(rdma, sctxt, rqstp); + ret = svc_rdma_send_error_msg(rdma, sctxt, rctxt); if (ret < 0) goto err1; return 0; -- cgit v1.2.3 From 4f200bd8affbccc3152d497b4ce5cfaca5a7c53f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 25 Mar 2020 11:57:51 -0400 Subject: svcrdma: Add a @status parameter to svc_rdma_send_error_msg() The common "send RDMA_ERR" function should be in svc_rdma_sendto.c, since that is where the other Send-related functions are located. So from here, I will beef up svc_rdma_send_error_msg() and deprecate svc_rdma_send_error(). A generic svc_rdma_send_error_msg() will need to handle both ERR_CHUNK and ERR_VERS. Copy that logic from svc_rdma_send_error() to svc_rdma_send_error_msg(). Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 47ada61411c3..73fe7a213169 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -812,7 +812,8 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, */ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - struct svc_rdma_recv_ctxt *rctxt) + struct svc_rdma_recv_ctxt *rctxt, + int status) { __be32 *rdma_argp = rctxt->rc_recv_buf; __be32 *p; @@ -821,16 +822,35 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, xdr_init_encode(&sctxt->sc_stream, &sctxt->sc_hdrbuf, sctxt->sc_xprt_buf, NULL); - p = xdr_reserve_space(&sctxt->sc_stream, RPCRDMA_HDRLEN_ERR); + p = xdr_reserve_space(&sctxt->sc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); if (!p) return -ENOMSG; *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); *p++ = rdma->sc_fc_credits; - *p++ = rdma_error; - *p = err_chunk; - trace_svcrdma_err_chunk(*rdma_argp); + *p = rdma_error; + + switch (status) { + case -EPROTONOSUPPORT: + p = xdr_reserve_space(&sctxt->sc_stream, 3 * sizeof(*p)); + if (!p) + return -ENOMSG; + + *p++ = err_vers; + *p++ = rpcrdma_version; + *p = rpcrdma_version; + trace_svcrdma_err_vers(*rdma_argp); + break; + default: + p = xdr_reserve_space(&sctxt->sc_stream, sizeof(*p)); + if (!p) + return -ENOMSG; + + *p = err_chunk; + trace_svcrdma_err_chunk(*rdma_argp); + } sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; @@ -930,7 +950,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) * of previously posted RDMA Writes. */ svc_rdma_save_io_pages(rqstp, sctxt); - ret = svc_rdma_send_error_msg(rdma, sctxt, rctxt); + ret = svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret); if (ret < 0) goto err1; return 0; -- cgit v1.2.3 From 605c61bee5b13bc9d597192779865a1a707166ed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 25 Mar 2020 14:38:07 -0400 Subject: svcrdma: Eliminate return value for svc_rdma_send_error_msg() Like svc_rdma_send_error(), have svc_rdma_send_error_msg() handle any error conditions internally, rather than duplicating that recovery logic at every call site. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 73fe7a213169..fb548b548c4b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -810,10 +810,10 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, * * Remote Invalidation is skipped for simplicity. */ -static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *sctxt, - struct svc_rdma_recv_ctxt *rctxt, - int status) +static void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_recv_ctxt *rctxt, + int status) { __be32 *rdma_argp = rctxt->rc_recv_buf; __be32 *p; @@ -825,7 +825,7 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, p = xdr_reserve_space(&sctxt->sc_stream, rpcrdma_fixed_maxsz * sizeof(*p)); if (!p) - return -ENOMSG; + goto put_ctxt; *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); @@ -836,7 +836,7 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, case -EPROTONOSUPPORT: p = xdr_reserve_space(&sctxt->sc_stream, 3 * sizeof(*p)); if (!p) - return -ENOMSG; + goto put_ctxt; *p++ = err_vers; *p++ = rpcrdma_version; @@ -846,7 +846,7 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, default: p = xdr_reserve_space(&sctxt->sc_stream, sizeof(*p)); if (!p) - return -ENOMSG; + goto put_ctxt; *p = err_chunk; trace_svcrdma_err_chunk(*rdma_argp); @@ -855,7 +855,12 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; - return svc_rdma_send(rdma, &sctxt->sc_send_wr); + if (svc_rdma_send(rdma, &sctxt->sc_send_wr)) + goto put_ctxt; + return; + +put_ctxt: + svc_rdma_send_ctxt_put(rdma, sctxt); } /** @@ -950,9 +955,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) * of previously posted RDMA Writes. */ svc_rdma_save_io_pages(rqstp, sctxt); - ret = svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret); - if (ret < 0) - goto err1; + svc_rdma_send_error_msg(rdma, sctxt, rctxt, ret); return 0; err1: -- cgit v1.2.3 From c65b326b1eb983bca35ed43d0e453d1b15705f10 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 25 Mar 2020 14:41:46 -0400 Subject: svcrdma: Make svc_rdma_send_error_msg() a global function Prepare for svc_rdma_send_error_msg() to be invoked from another source file. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 4 ++++ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 28 +++++++++++++++++++--------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 7ed82625dc0b..1579f7a14ab4 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -195,6 +195,10 @@ extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_recv_ctxt *rctxt, struct xdr_buf *xdr); +extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_recv_ctxt *rctxt, + int status); extern int svc_rdma_sendto(struct svc_rqst *); extern int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index fb548b548c4b..57041298fe4f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -804,16 +804,25 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, return svc_rdma_send(rdma, &sctxt->sc_send_wr); } -/* Given the client-provided Write and Reply chunks, the server was not - * able to form a complete reply. Return an RDMA_ERROR message so the - * client can retire this RPC transaction. - * - * Remote Invalidation is skipped for simplicity. +/** + * svc_rdma_send_error_msg - Send an RPC/RDMA v1 error response + * @rdma: controlling transport context + * @sctxt: Send context for the response + * @rctxt: Receive context for incoming bad message + * @status: negative errno indicating error that occurred + * + * Given the client-provided Read, Write, and Reply chunks, the + * server was not able to parse the Call or form a complete Reply. + * Return an RDMA_ERROR message so the client can retire the RPC + * transaction. + * + * The caller does not have to release @sctxt. It is released by + * Send completion, or by this function on error. */ -static void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *sctxt, - struct svc_rdma_recv_ctxt *rctxt, - int status) +void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_recv_ctxt *rctxt, + int status) { __be32 *rdma_argp = rctxt->rc_recv_buf; __be32 *p; @@ -852,6 +861,7 @@ static void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, trace_svcrdma_err_chunk(*rdma_argp); } + /* Remote Invalidation is skipped for simplicity. */ sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; -- cgit v1.2.3 From ba6cc97738a15751350d3f85a6b856e4d6d4c202 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 24 Mar 2020 17:57:18 -0400 Subject: svcrdma: Consolidate send_error helper functions Final refactor: Replace internals of svc_rdma_send_error() with a simple call to svc_rdma_send_error_msg(). Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 52 ++++----------------------------- 1 file changed, 5 insertions(+), 47 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 60d855116ae7..c072ce61b393 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -714,58 +714,16 @@ static void rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_arg.buflen = head->rc_arg.buflen; } -static void svc_rdma_send_error(struct svcxprt_rdma *xprt, +static void svc_rdma_send_error(struct svcxprt_rdma *rdma, struct svc_rdma_recv_ctxt *rctxt, int status) { - __be32 *p, *rdma_argp = rctxt->rc_recv_buf; - struct svc_rdma_send_ctxt *ctxt; - int ret; + struct svc_rdma_send_ctxt *sctxt; - ctxt = svc_rdma_send_ctxt_get(xprt); - if (!ctxt) + sctxt = svc_rdma_send_ctxt_get(rdma); + if (!sctxt) return; - - p = xdr_reserve_space(&ctxt->sc_stream, - rpcrdma_fixed_maxsz * sizeof(*p)); - if (!p) - goto put_ctxt; - - *p++ = *rdma_argp; - *p++ = *(rdma_argp + 1); - *p++ = xprt->sc_fc_credits; - *p = rdma_error; - - switch (status) { - case -EPROTONOSUPPORT: - p = xdr_reserve_space(&ctxt->sc_stream, 3 * sizeof(*p)); - if (!p) - goto put_ctxt; - - *p++ = err_vers; - *p++ = rpcrdma_version; - *p = rpcrdma_version; - trace_svcrdma_err_vers(*rdma_argp); - break; - default: - p = xdr_reserve_space(&ctxt->sc_stream, sizeof(*p)); - if (!p) - goto put_ctxt; - - *p = err_chunk; - trace_svcrdma_err_chunk(*rdma_argp); - } - - ctxt->sc_send_wr.num_sge = 1; - ctxt->sc_send_wr.opcode = IB_WR_SEND; - ctxt->sc_sges[0].length = ctxt->sc_hdrbuf.len; - ret = svc_rdma_send(xprt, &ctxt->sc_send_wr); - if (ret) - goto put_ctxt; - return; - -put_ctxt: - svc_rdma_send_ctxt_put(xprt, ctxt); + svc_rdma_send_error_msg(rdma, sctxt, rctxt, status); } /* By convention, backchannel calls arrive via rdma_msg type -- cgit v1.2.3 From 3f8f25c696f9c4e352a4d705ba767af676421564 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 30 Apr 2020 14:17:40 -0400 Subject: svcrdma: Clean up trace_svcrdma_send_failed() tracepoint - Use the _err naming convention instead - Remove display of kernel memory address of the controlling xprt Signed-off-by: Chuck Lever --- include/trace/events/rpcrdma.h | 7 ++----- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 0f05a6e2b9cb..0eff80dee066 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1716,7 +1716,7 @@ TRACE_EVENT(svcrdma_send_pullup, TP_printk("len=%u", __entry->len) ); -TRACE_EVENT(svcrdma_send_failed, +TRACE_EVENT(svcrdma_send_err, TP_PROTO( const struct svc_rqst *rqst, int status @@ -1727,19 +1727,16 @@ TRACE_EVENT(svcrdma_send_failed, TP_STRUCT__entry( __field(int, status) __field(u32, xid) - __field(const void *, xprt) __string(addr, rqst->rq_xprt->xpt_remotebuf) ), TP_fast_assign( __entry->status = status; __entry->xid = __be32_to_cpu(rqst->rq_xid); - __entry->xprt = rqst->rq_xprt; __assign_str(addr, rqst->rq_xprt->xpt_remotebuf); ), - TP_printk("xprt=%p addr=%s xid=0x%08x status=%d", - __entry->xprt, __get_str(addr), + TP_printk("addr=%s xid=0x%08x status=%d", __get_str(addr), __entry->xid, __entry->status ) ); diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 57041298fe4f..f985f548346a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -971,7 +971,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) err1: svc_rdma_send_ctxt_put(rdma, sctxt); err0: - trace_svcrdma_send_failed(rqstp, ret); + trace_svcrdma_send_err(rqstp, ret); set_bit(XPT_CLOSE, &xprt->xpt_flags); return -ENOTCONN; } -- cgit v1.2.3 From 0b8dc1b69995cbd81c2c9a2f1730c46cce085f62 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 May 2020 11:34:47 -0400 Subject: svcrdma: Remove declarations for functions long removed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pavane pour une infante défunte. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 1579f7a14ab4..d28ca1b6f2eb 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -204,10 +204,6 @@ extern int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length); /* svc_rdma_transport.c */ -extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); -extern void svc_sq_reap(struct svcxprt_rdma *); -extern void svc_rq_reap(struct svcxprt_rdma *); - extern struct svc_xprt_class svc_rdma_class; #ifdef CONFIG_SUNRPC_BACKCHANNEL extern struct svc_xprt_class svc_rdma_bc_class; -- cgit v1.2.3 From 07e9a6325a35fb9655f7b52e2b9dc632da6eef51 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 28 Mar 2020 13:43:22 -0400 Subject: SUNRPC: Add helpers for decoding list discriminators symbolically Use these helpers in a few spots to demonstrate their use. The remaining open-coded discriminator checks in rpcrdma will be addressed in subsequent patches. Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 26 ++++++++++++++++++++++++++ net/sunrpc/xprtrdma/rpc_rdma.c | 12 ++++++------ net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 17 ++++++++--------- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 22c207b2425f..5a6a81b7cd9f 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -474,6 +474,32 @@ xdr_stream_encode_uint32_array(struct xdr_stream *xdr, return ret; } +/** + * xdr_item_is_absent - symbolically handle XDR discriminators + * @p: pointer to undecoded discriminator + * + * Return values: + * %true if the following XDR item is absent + * %false if the following XDR item is present + */ +static inline bool xdr_item_is_absent(const __be32 *p) +{ + return *p == xdr_zero; +} + +/** + * xdr_item_is_present - symbolically handle XDR discriminators + * @p: pointer to undecoded discriminator + * + * Return values: + * %true if the following XDR item is present + * %false if the following XDR item is absent + */ +static inline bool xdr_item_is_present(const __be32 *p) +{ + return *p != xdr_zero; +} + /** * xdr_stream_decode_u32 - Decode a 32-bit integer * @xdr: pointer to xdr_stream diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 935bbef2f7be..feecd1f55f18 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1133,11 +1133,11 @@ rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep) p = xdr_inline_decode(xdr, 0); /* Chunk lists */ - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; - if (*p++ != xdr_zero) + if (xdr_item_is_present(p++)) return false; /* RPC header */ @@ -1215,7 +1215,7 @@ static int decode_read_list(struct xdr_stream *xdr) p = xdr_inline_decode(xdr, sizeof(*p)); if (unlikely(!p)) return -EIO; - if (unlikely(*p != xdr_zero)) + if (unlikely(xdr_item_is_present(p))) return -EIO; return 0; } @@ -1234,7 +1234,7 @@ static int decode_write_list(struct xdr_stream *xdr, u32 *length) p = xdr_inline_decode(xdr, sizeof(*p)); if (unlikely(!p)) return -EIO; - if (*p == xdr_zero) + if (xdr_item_is_absent(p)) break; if (!first) return -EIO; @@ -1256,7 +1256,7 @@ static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length) return -EIO; *length = 0; - if (*p != xdr_zero) + if (xdr_item_is_present(p)) if (decode_write_chunk(xdr, length)) return -EIO; return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index c072ce61b393..5e78067889f3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -419,7 +419,7 @@ static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt) len = 0; first = true; - while (*p != xdr_zero) { + while (xdr_item_is_present(p)) { p = xdr_inline_decode(&rctxt->rc_stream, rpcrdma_readseg_maxsz * sizeof(*p)); if (!p) @@ -500,7 +500,7 @@ static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt) if (!p) return false; rctxt->rc_write_list = p; - while (*p != xdr_zero) { + while (xdr_item_is_present(p)) { if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK)) return false; ++chcount; @@ -532,12 +532,11 @@ static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt) p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) return false; - rctxt->rc_reply_chunk = p; - if (*p != xdr_zero) { + rctxt->rc_reply_chunk = NULL; + if (xdr_item_is_present(p)) { if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK)) return false; - } else { - rctxt->rc_reply_chunk = NULL; + rctxt->rc_reply_chunk = p; } return true; } @@ -568,7 +567,7 @@ static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, p += rpcrdma_fixed_maxsz; /* Read list */ - while (*p++ != xdr_zero) { + while (xdr_item_is_present(p++)) { p++; /* position */ if (inv_rkey == xdr_zero) inv_rkey = *p; @@ -578,7 +577,7 @@ static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, } /* Write list */ - while (*p++ != xdr_zero) { + while (xdr_item_is_present(p++)) { segcount = be32_to_cpup(p++); for (i = 0; i < segcount; i++) { if (inv_rkey == xdr_zero) @@ -590,7 +589,7 @@ static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, } /* Reply chunk */ - if (*p++ != xdr_zero) { + if (xdr_item_is_present(p++)) { segcount = be32_to_cpup(p++); for (i = 0; i < segcount; i++) { if (inv_rkey == xdr_zero) -- cgit v1.2.3 From f60a08697d28b138c73b14c3204947bc8e637197 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 29 Mar 2020 16:44:13 -0400 Subject: svcrdma: Add common XDR decoders for RDMA and Read segments Clean up: De-duplicate some code. Signed-off-by: Chuck Lever --- include/linux/sunrpc/rpc_rdma.h | 37 ++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/frwr_ops.c | 1 - net/sunrpc/xprtrdma/rpc_rdma.c | 5 +---- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 4 +--- net/sunrpc/xprtrdma/svc_rdma_rw.c | 37 ++++++++++++++------------------ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 5 +---- net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 - 7 files changed, 56 insertions(+), 34 deletions(-) diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 320c672d84de..db50380f64f4 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -124,4 +124,41 @@ rpcrdma_decode_buffer_size(u8 val) return ((unsigned int)val + 1) << 10; } +/** + * xdr_decode_rdma_segment - Decode contents of an RDMA segment + * @p: Pointer to the undecoded RDMA segment + * @handle: Upon return, the RDMA handle + * @length: Upon return, the RDMA length + * @offset: Upon return, the RDMA offset + * + * Return value: + * Pointer to the XDR item that follows the RDMA segment + */ +static inline __be32 *xdr_decode_rdma_segment(__be32 *p, u32 *handle, + u32 *length, u64 *offset) +{ + *handle = be32_to_cpup(p++); + *length = be32_to_cpup(p++); + return xdr_decode_hyper(p, offset); +} + +/** + * xdr_decode_read_segment - Decode contents of a Read segment + * @p: Pointer to the undecoded Read segment + * @position: Upon return, the segment's position + * @handle: Upon return, the RDMA handle + * @length: Upon return, the RDMA length + * @offset: Upon return, the RDMA offset + * + * Return value: + * Pointer to the XDR item that follows the Read segment + */ +static inline __be32 *xdr_decode_read_segment(__be32 *p, u32 *position, + u32 *handle, u32 *length, + u64 *offset) +{ + *position = be32_to_cpup(p++); + return xdr_decode_rdma_segment(p, handle, length, offset); +} + #endif /* _LINUX_SUNRPC_RPC_RDMA_H */ diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index b647562a26dd..7f94c9a19fd3 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -40,7 +40,6 @@ * New MRs are created on demand. */ -#include #include #include "xprt_rdma.h" diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index feecd1f55f18..5461f01eeca6 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1176,10 +1176,7 @@ static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length) if (unlikely(!p)) return -EIO; - handle = be32_to_cpup(p++); - *length = be32_to_cpup(p++); - xdr_decode_hyper(p, &offset); - + xdr_decode_rdma_segment(p, &handle, length, &offset); trace_xprtrdma_decode_seg(handle, *length, offset); return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 5e78067889f3..c0587d3cd389 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -466,9 +466,7 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen) if (!p) return false; - handle = be32_to_cpup(p++); - length = be32_to_cpup(p++); - xdr_decode_hyper(p, &offset); + xdr_decode_rdma_segment(p, &handle, &length, &offset); trace_svcrdma_decode_wseg(handle, length, offset); total += length; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 83806fa94def..2038b1b286dd 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -7,6 +7,7 @@ #include +#include #include #include @@ -441,34 +442,32 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; do { unsigned int write_len; - u32 seg_length, seg_handle; - u64 seg_offset; + u32 handle, length; + u64 offset; if (info->wi_seg_no >= info->wi_nsegs) goto out_overflow; - seg_handle = be32_to_cpup(seg); - seg_length = be32_to_cpup(seg + 1); - xdr_decode_hyper(seg + 2, &seg_offset); - seg_offset += info->wi_seg_off; + xdr_decode_rdma_segment(seg, &handle, &length, &offset); + offset += info->wi_seg_off; - write_len = min(remaining, seg_length - info->wi_seg_off); + write_len = min(remaining, length - info->wi_seg_off); ctxt = svc_rdma_get_rw_ctxt(rdma, (write_len >> PAGE_SHIFT) + 2); if (!ctxt) return -ENOMEM; constructor(info, write_len, ctxt); - ret = svc_rdma_rw_ctx_init(rdma, ctxt, seg_offset, seg_handle, + ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, handle, DMA_TO_DEVICE); if (ret < 0) return -EIO; - trace_svcrdma_send_wseg(seg_handle, write_len, seg_offset); + trace_svcrdma_send_wseg(handle, write_len, offset); list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; - if (write_len == seg_length - info->wi_seg_off) { + if (write_len == length - info->wi_seg_off) { seg += 4; info->wi_seg_no++; info->wi_seg_off = 0; @@ -689,21 +688,17 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, ret = -EINVAL; info->ri_chunklen = 0; while (*p++ != xdr_zero && be32_to_cpup(p++) == info->ri_position) { - u32 rs_handle, rs_length; - u64 rs_offset; + u32 handle, length; + u64 offset; - rs_handle = be32_to_cpup(p++); - rs_length = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &rs_offset); - - ret = svc_rdma_build_read_segment(info, rqstp, - rs_handle, rs_length, - rs_offset); + p = xdr_decode_rdma_segment(p, &handle, &length, &offset); + ret = svc_rdma_build_read_segment(info, rqstp, handle, length, + offset); if (ret < 0) break; - trace_svcrdma_send_rseg(rs_handle, rs_length, rs_offset); - info->ri_chunklen += rs_length; + trace_svcrdma_send_rseg(handle, length, offset); + info->ri_chunklen += length; } return ret; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index f985f548346a..a78f1d22e9bb 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -106,7 +106,6 @@ #include #include -#include #include #include "xprt_rdma.h" @@ -375,9 +374,7 @@ static ssize_t svc_rdma_encode_write_segment(__be32 *src, if (!p) return -EMSGSIZE; - handle = be32_to_cpup(src++); - length = be32_to_cpup(src++); - xdr_decode_hyper(src, &offset); + xdr_decode_rdma_segment(src, &handle, &length, &offset); *p++ = cpu_to_be32(handle); if (*remaining < length) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index d38be57b00ed..3da7901a49e6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -55,7 +55,6 @@ #include #include -#include #include #include -- cgit v1.2.3 From 379c3bc6b4eb989ee37c4ce8ab403719e06fe35f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 7 Apr 2020 15:32:14 -0400 Subject: svcrdma: Add common XDR encoders for RDMA and Read segments Clean up: De-duplicate some code. Signed-off-by: Chuck Lever --- include/linux/sunrpc/rpc_rdma.h | 37 +++++++++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/rpc_rdma.c | 14 +++---------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 4 +--- 3 files changed, 41 insertions(+), 14 deletions(-) diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index db50380f64f4..4af31bbc8802 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -124,6 +124,43 @@ rpcrdma_decode_buffer_size(u8 val) return ((unsigned int)val + 1) << 10; } +/** + * xdr_encode_rdma_segment - Encode contents of an RDMA segment + * @p: Pointer into a send buffer + * @handle: The RDMA handle to encode + * @length: The RDMA length to encode + * @offset: The RDMA offset to encode + * + * Return value: + * Pointer to the XDR position that follows the encoded RDMA segment + */ +static inline __be32 *xdr_encode_rdma_segment(__be32 *p, u32 handle, + u32 length, u64 offset) +{ + *p++ = cpu_to_be32(handle); + *p++ = cpu_to_be32(length); + return xdr_encode_hyper(p, offset); +} + +/** + * xdr_encode_read_segment - Encode contents of a Read segment + * @p: Pointer into a send buffer + * @position: The position to encode + * @handle: The RDMA handle to encode + * @length: The RDMA length to encode + * @offset: The RDMA offset to encode + * + * Return value: + * Pointer to the XDR position that follows the encoded Read segment + */ +static inline __be32 *xdr_encode_read_segment(__be32 *p, u32 position, + u32 handle, u32 length, + u64 offset) +{ + *p++ = cpu_to_be32(position); + return xdr_encode_rdma_segment(p, handle, length, offset); +} + /** * xdr_decode_rdma_segment - Decode contents of an RDMA segment * @p: Pointer to the undecoded RDMA segment diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 5461f01eeca6..73ed51893175 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -275,14 +275,6 @@ out: return n; } -static void -xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) -{ - *iptr++ = cpu_to_be32(mr->mr_handle); - *iptr++ = cpu_to_be32(mr->mr_length); - xdr_encode_hyper(iptr, mr->mr_offset); -} - static int encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) { @@ -292,7 +284,7 @@ encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr) if (unlikely(!p)) return -EMSGSIZE; - xdr_encode_rdma_segment(p, mr); + xdr_encode_rdma_segment(p, mr->mr_handle, mr->mr_length, mr->mr_offset); return 0; } @@ -307,8 +299,8 @@ encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr, return -EMSGSIZE; *p++ = xdr_one; /* Item present */ - *p++ = cpu_to_be32(position); - xdr_encode_rdma_segment(p, mr); + xdr_encode_read_segment(p, position, mr->mr_handle, mr->mr_length, + mr->mr_offset); return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index a78f1d22e9bb..38d8f0ee35ec 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -376,7 +376,6 @@ static ssize_t svc_rdma_encode_write_segment(__be32 *src, xdr_decode_rdma_segment(src, &handle, &length, &offset); - *p++ = cpu_to_be32(handle); if (*remaining < length) { /* segment only partly filled */ length = *remaining; @@ -385,8 +384,7 @@ static ssize_t svc_rdma_encode_write_segment(__be32 *src, /* entire segment was consumed */ *remaining -= length; } - *p++ = cpu_to_be32(length); - xdr_encode_hyper(p, offset); + xdr_encode_rdma_segment(p, handle, length, offset); trace_svcrdma_encode_wseg(handle, length, offset); return len; -- cgit v1.2.3 From f7bd657b55e3484cadc37a6439de23d2fd703bd6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 19 May 2020 09:30:32 -0400 Subject: svcrdma: Introduce infrastructure to support completion IDs The goal is to replace CQE kernel memory addresses in completion- related tracepoints. Each completion ID matches an incoming Send or Receive completion to a Completion Queue and to a previous ib_post_*(). The ID can then be displayed in an error message or recorded in a trace record. Signed-off-by: Chuck Lever --- include/linux/sunrpc/rpc_rdma_cid.h | 24 +++++++++++++++++++++ include/trace/events/rpcrdma.h | 43 +++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 include/linux/sunrpc/rpc_rdma_cid.h diff --git a/include/linux/sunrpc/rpc_rdma_cid.h b/include/linux/sunrpc/rpc_rdma_cid.h new file mode 100644 index 000000000000..be24ab2baa6a --- /dev/null +++ b/include/linux/sunrpc/rpc_rdma_cid.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * * Copyright (c) 2020, Oracle and/or its affiliates. + */ + +#ifndef RPC_RDMA_CID_H +#define RPC_RDMA_CID_H + +/* + * The rpc_rdma_cid struct records completion ID information. A + * completion ID matches an incoming Send or Receive completion + * to a Completion Queue and to a previous ib_post_*(). The ID + * can then be displayed in an error message or recorded in a + * trace record. + * + * This struct is shared between the server and client RPC/RDMA + * transport implementations. + */ +struct rpc_rdma_cid { + u32 ci_queue_id; + int ci_completion_id; +}; + +#endif /* RPC_RDMA_CID_H */ diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 0eff80dee066..70ab989aa3b7 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -11,6 +11,7 @@ #define _TRACE_RPCRDMA_H #include +#include #include #include @@ -18,6 +19,48 @@ ** Event classes **/ +DECLARE_EVENT_CLASS(rpcrdma_completion_class, + TP_PROTO( + const struct ib_wc *wc, + const struct rpc_rdma_cid *cid + ), + + TP_ARGS(wc, cid), + + TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) + __field(unsigned long, status) + __field(unsigned int, vendor_err) + ), + + TP_fast_assign( + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; + __entry->status = wc->status; + if (wc->status) + __entry->vendor_err = wc->vendor_err; + else + __entry->vendor_err = 0; + ), + + TP_printk("cq.id=%u cid=%d status=%s (%lu/0x%x)", + __entry->cq_id, __entry->completion_id, + rdma_show_wc_status(__entry->status), + __entry->status, __entry->vendor_err + ) +); + +#define DEFINE_COMPLETION_EVENT(name) \ + DEFINE_EVENT(rpcrdma_completion_class, name, \ + TP_PROTO( \ + const struct ib_wc *wc, \ + const struct rpc_rdma_cid *cid \ + ), \ + TP_ARGS(wc, cid)) + +DEFINE_COMPLETION_EVENT(dummy); + DECLARE_EVENT_CLASS(xprtrdma_reply_event, TP_PROTO( const struct rpcrdma_rep *rep -- cgit v1.2.3 From 9b3bcf8c5c134038e30624db5b57992ae50b80a9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 29 Apr 2020 16:22:26 -0400 Subject: svcrdma: Introduce Receive completion IDs Set up a completion ID in each svc_rdma_recv_ctxt. The ID is used to match an incoming Receive completion to a transport and to a previous ib_post_recv(). Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 4 +++ include/trace/events/rpcrdma.h | 51 +++++++++++++-------------------- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 15 ++++++++-- 3 files changed, 36 insertions(+), 34 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index d28ca1b6f2eb..c3c1e46f510f 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -109,6 +110,8 @@ struct svcxprt_rdma { struct work_struct sc_work; struct llist_head sc_recv_ctxts; + + atomic_t sc_completion_ids; }; /* sc_flags */ #define RDMAXPRT_CONN_PENDING 3 @@ -129,6 +132,7 @@ struct svc_rdma_recv_ctxt { struct list_head rc_list; struct ib_recv_wr rc_recv_wr; struct ib_cqe rc_cqe; + struct rpc_rdma_cid rc_cid; struct ib_sge rc_recv_sge; void *rc_recv_buf; struct xdr_buf rc_arg; diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 70ab989aa3b7..a0330a557e34 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -59,8 +59,6 @@ DECLARE_EVENT_CLASS(rpcrdma_completion_class, ), \ TP_ARGS(wc, cid)) -DEFINE_COMPLETION_EVENT(dummy); - DECLARE_EVENT_CLASS(xprtrdma_reply_event, TP_PROTO( const struct rpcrdma_rep *rep @@ -1849,57 +1847,48 @@ DEFINE_SENDCOMP_EVENT(send); TRACE_EVENT(svcrdma_post_recv, TP_PROTO( - const struct ib_recv_wr *wr, - int status + const struct svc_rdma_recv_ctxt *ctxt ), - TP_ARGS(wr, status), + TP_ARGS(ctxt), TP_STRUCT__entry( - __field(const void *, cqe) - __field(int, status) + __field(u32, cq_id) + __field(int, completion_id) ), TP_fast_assign( - __entry->cqe = wr->wr_cqe; - __entry->status = status; + __entry->cq_id = ctxt->rc_cid.ci_queue_id; + __entry->completion_id = ctxt->rc_cid.ci_completion_id; ), - TP_printk("cqe=%p status=%d", - __entry->cqe, __entry->status + TP_printk("cq.id=%d cid=%d", + __entry->cq_id, __entry->completion_id ) ); -TRACE_EVENT(svcrdma_wc_receive, +DEFINE_COMPLETION_EVENT(svcrdma_wc_receive); + +TRACE_EVENT(svcrdma_rq_post_err, TP_PROTO( - const struct ib_wc *wc + const struct svcxprt_rdma *rdma, + int status ), - TP_ARGS(wc), + TP_ARGS(rdma, status), TP_STRUCT__entry( - __field(const void *, cqe) - __field(u32, byte_len) - __field(unsigned int, status) - __field(u32, vendor_err) + __field(int, status) + __string(addr, rdma->sc_xprt.xpt_remotebuf) ), TP_fast_assign( - __entry->cqe = wc->wr_cqe; - __entry->status = wc->status; - if (wc->status) { - __entry->byte_len = 0; - __entry->vendor_err = wc->vendor_err; - } else { - __entry->byte_len = wc->byte_len; - __entry->vendor_err = 0; - } + __entry->status = status; + __assign_str(addr, rdma->sc_xprt.xpt_remotebuf); ), - TP_printk("cqe=%p byte_len=%u status=%s (%u/0x%x)", - __entry->cqe, __entry->byte_len, - rdma_show_wc_status(__entry->status), - __entry->status, __entry->vendor_err + TP_printk("addr=%s status=%d", + __get_str(addr), __entry->status ) ); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index c0587d3cd389..e6d7401232d2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -117,6 +117,13 @@ svc_rdma_next_recv_ctxt(struct list_head *list) rc_list); } +static void svc_rdma_recv_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_rq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + static struct svc_rdma_recv_ctxt * svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { @@ -135,6 +142,8 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) goto fail2; + svc_rdma_recv_cid_init(rdma, &ctxt->rc_cid); + ctxt->rc_recv_wr.next = NULL; ctxt->rc_recv_wr.wr_cqe = &ctxt->rc_cqe; ctxt->rc_recv_wr.sg_list = &ctxt->rc_recv_sge; @@ -249,13 +258,14 @@ static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, int ret; svc_xprt_get(&rdma->sc_xprt); + trace_svcrdma_post_recv(ctxt); ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL); - trace_svcrdma_post_recv(&ctxt->rc_recv_wr, ret); if (ret) goto err_post; return 0; err_post: + trace_svcrdma_rq_post_err(rdma, ret); svc_rdma_recv_ctxt_put(rdma, ctxt); svc_xprt_put(&rdma->sc_xprt); return ret; @@ -309,11 +319,10 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_recv_ctxt *ctxt; - trace_svcrdma_wc_receive(wc); - /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_recv_ctxt, rc_cqe); + trace_svcrdma_wc_receive(wc, &ctxt->rc_cid); if (wc->status != IB_WC_SUCCESS) goto flushed; -- cgit v1.2.3 From 007140ee9b4fc4e59538677799c916890a2f13e2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 29 Apr 2020 17:16:31 -0400 Subject: svcrdma: Record Receive completion ID in svc_rdma_decode_rqst When recording a trace event in the Receive path, tie decoding results and errors to an incoming Receive completion. Signed-off-by: Chuck Lever --- include/trace/events/rpcrdma.h | 34 ++++++++++++++++++++++++++------- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 12 ++++++------ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index a0330a557e34..df49ae5d447b 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1369,13 +1369,16 @@ TRACE_DEFINE_ENUM(RDMA_ERROR); TRACE_EVENT(svcrdma_decode_rqst, TP_PROTO( + const struct svc_rdma_recv_ctxt *ctxt, __be32 *p, unsigned int hdrlen ), - TP_ARGS(p, hdrlen), + TP_ARGS(ctxt, p, hdrlen), TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) __field(u32, xid) __field(u32, vers) __field(u32, proc) @@ -1384,6 +1387,8 @@ TRACE_EVENT(svcrdma_decode_rqst, ), TP_fast_assign( + __entry->cq_id = ctxt->rc_cid.ci_queue_id; + __entry->completion_id = ctxt->rc_cid.ci_completion_id; __entry->xid = be32_to_cpup(p++); __entry->vers = be32_to_cpup(p++); __entry->credits = be32_to_cpup(p++); @@ -1391,37 +1396,48 @@ TRACE_EVENT(svcrdma_decode_rqst, __entry->hdrlen = hdrlen; ), - TP_printk("xid=0x%08x vers=%u credits=%u proc=%s hdrlen=%u", + TP_printk("cq.id=%u cid=%d xid=0x%08x vers=%u credits=%u proc=%s hdrlen=%u", + __entry->cq_id, __entry->completion_id, __entry->xid, __entry->vers, __entry->credits, show_rpcrdma_proc(__entry->proc), __entry->hdrlen) ); TRACE_EVENT(svcrdma_decode_short_err, TP_PROTO( + const struct svc_rdma_recv_ctxt *ctxt, unsigned int hdrlen ), - TP_ARGS(hdrlen), + TP_ARGS(ctxt, hdrlen), TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) __field(unsigned int, hdrlen) ), TP_fast_assign( + __entry->cq_id = ctxt->rc_cid.ci_queue_id; + __entry->completion_id = ctxt->rc_cid.ci_completion_id; __entry->hdrlen = hdrlen; ), - TP_printk("hdrlen=%u", __entry->hdrlen) + TP_printk("cq.id=%u cid=%d hdrlen=%u", + __entry->cq_id, __entry->completion_id, + __entry->hdrlen) ); DECLARE_EVENT_CLASS(svcrdma_badreq_event, TP_PROTO( + const struct svc_rdma_recv_ctxt *ctxt, __be32 *p ), - TP_ARGS(p), + TP_ARGS(ctxt, p), TP_STRUCT__entry( + __field(u32, cq_id) + __field(int, completion_id) __field(u32, xid) __field(u32, vers) __field(u32, proc) @@ -1429,13 +1445,16 @@ DECLARE_EVENT_CLASS(svcrdma_badreq_event, ), TP_fast_assign( + __entry->cq_id = ctxt->rc_cid.ci_queue_id; + __entry->completion_id = ctxt->rc_cid.ci_completion_id; __entry->xid = be32_to_cpup(p++); __entry->vers = be32_to_cpup(p++); __entry->credits = be32_to_cpup(p++); __entry->proc = be32_to_cpup(p); ), - TP_printk("xid=0x%08x vers=%u credits=%u proc=%u", + TP_printk("cq.id=%u cid=%d xid=0x%08x vers=%u credits=%u proc=%u", + __entry->cq_id, __entry->completion_id, __entry->xid, __entry->vers, __entry->credits, __entry->proc) ); @@ -1443,9 +1462,10 @@ DECLARE_EVENT_CLASS(svcrdma_badreq_event, DEFINE_EVENT(svcrdma_badreq_event, \ svcrdma_decode_##name##_err, \ TP_PROTO( \ + const struct svc_rdma_recv_ctxt *ctxt, \ __be32 *p \ ), \ - TP_ARGS(p)) + TP_ARGS(ctxt, p)) DEFINE_BADREQ_EVENT(badvers); DEFINE_BADREQ_EVENT(drop); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index e6d7401232d2..d5ec85cb652c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -667,27 +667,27 @@ static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg, hdr_len = xdr_stream_pos(&rctxt->rc_stream); rq_arg->head[0].iov_len -= hdr_len; rq_arg->len -= hdr_len; - trace_svcrdma_decode_rqst(rdma_argp, hdr_len); + trace_svcrdma_decode_rqst(rctxt, rdma_argp, hdr_len); return hdr_len; out_short: - trace_svcrdma_decode_short_err(rq_arg->len); + trace_svcrdma_decode_short_err(rctxt, rq_arg->len); return -EINVAL; out_version: - trace_svcrdma_decode_badvers_err(rdma_argp); + trace_svcrdma_decode_badvers_err(rctxt, rdma_argp); return -EPROTONOSUPPORT; out_drop: - trace_svcrdma_decode_drop_err(rdma_argp); + trace_svcrdma_decode_drop_err(rctxt, rdma_argp); return 0; out_proc: - trace_svcrdma_decode_badproc_err(rdma_argp); + trace_svcrdma_decode_badproc_err(rctxt, rdma_argp); return -EINVAL; out_inval: - trace_svcrdma_decode_parse_err(rdma_argp); + trace_svcrdma_decode_parse_err(rctxt, rdma_argp); return -EINVAL; } -- cgit v1.2.3 From 3ac56c2fb166fea25974d8c48bb4a72ee298361b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 30 Apr 2020 13:47:07 -0400 Subject: svcrdma: Introduce Send completion IDs Set up a completion ID in each svc_rdma_send_ctxt. The ID is used to match an incoming Send completion to a transport and to a previous ib_post_send(). Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 2 ++ include/trace/events/rpcrdma.h | 2 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 15 ++++++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index c3c1e46f510f..c91e00bc937e 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -151,6 +151,8 @@ struct svc_rdma_recv_ctxt { struct svc_rdma_send_ctxt { struct list_head sc_list; + struct rpc_rdma_cid sc_cid; + struct ib_send_wr sc_send_wr; struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index df49ae5d447b..782a4d826a4b 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1863,7 +1863,7 @@ TRACE_EVENT(svcrdma_post_send, ) ); -DEFINE_SENDCOMP_EVENT(send); +DEFINE_COMPLETION_EVENT(svcrdma_wc_send); TRACE_EVENT(svcrdma_post_recv, TP_PROTO( diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 38d8f0ee35ec..c720dcf56231 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -122,6 +122,13 @@ svc_rdma_next_send_ctxt(struct list_head *list) sc_list); } +static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_sq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + static struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { @@ -144,6 +151,8 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) goto fail2; + svc_rdma_send_cid_init(rdma, &ctxt->sc_cid); + ctxt->sc_send_wr.next = NULL; ctxt->sc_send_wr.wr_cqe = &ctxt->sc_cqe; ctxt->sc_send_wr.sg_list = ctxt->sc_sges; @@ -268,14 +277,14 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { struct svcxprt_rdma *rdma = cq->cq_context; struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_send_ctxt *ctxt; + struct svc_rdma_send_ctxt *ctxt = + container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); - trace_svcrdma_wc_send(wc); + trace_svcrdma_wc_send(wc, &ctxt->sc_cid); atomic_inc(&rdma->sc_sq_avail); wake_up(&rdma->sc_send_wait); - ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); svc_rdma_send_ctxt_put(rdma, ctxt); if (unlikely(wc->status != IB_WC_SUCCESS)) { -- cgit v1.2.3 From 17f70f8dd52be3723250d21093403bb3a9f2162f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 29 Apr 2020 11:05:33 -0400 Subject: svcrdma: Record send_ctxt completion ID in trace_svcrdma_post_send() First, refactor: Dereference the svc_rdma_send_ctxt inside svc_rdma_send() instead of at every call site. Then, it can be passed into trace_svcrdma_post_send() to get the proper completion ID. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 3 ++- include/trace/events/rpcrdma.h | 18 +++++++++++------- net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 11 ++++++----- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index c91e00bc937e..9dc3a3b88391 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -196,7 +196,8 @@ extern struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma); extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr); +extern int svc_rdma_send(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_recv_ctxt *rctxt, diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 782a4d826a4b..aeeba9188ed5 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1839,27 +1839,31 @@ DECLARE_EVENT_CLASS(svcrdma_sendcomp_event, TRACE_EVENT(svcrdma_post_send, TP_PROTO( - const struct ib_send_wr *wr + const struct svc_rdma_send_ctxt *ctxt ), - TP_ARGS(wr), + TP_ARGS(ctxt), TP_STRUCT__entry( - __field(const void *, cqe) + __field(u32, cq_id) + __field(int, completion_id) __field(unsigned int, num_sge) __field(u32, inv_rkey) ), TP_fast_assign( - __entry->cqe = wr->wr_cqe; + const struct ib_send_wr *wr = &ctxt->sc_send_wr; + + __entry->cq_id = ctxt->sc_cid.ci_queue_id; + __entry->completion_id = ctxt->sc_cid.ci_completion_id; __entry->num_sge = wr->num_sge; __entry->inv_rkey = (wr->opcode == IB_WR_SEND_WITH_INV) ? wr->ex.invalidate_rkey : 0; ), - TP_printk("cqe=%p num_sge=%u inv_rkey=0x%08x", - __entry->cqe, __entry->num_sge, - __entry->inv_rkey + TP_printk("cq_id=%u cid=%d num_sge=%u inv_rkey=0x%08x", + __entry->cq_id, __entry->completion_id, + __entry->num_sge, __entry->inv_rkey ) ); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 1ee73f7cf931..5e7c4ba9e147 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -87,7 +87,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, */ get_page(virt_to_page(rqst->rq_buffer)); ctxt->sc_send_wr.opcode = IB_WR_SEND; - return svc_rdma_send(rdma, &ctxt->sc_send_wr); + return svc_rdma_send(rdma, ctxt); } /* Server-side transport endpoint wants a whole page for its send diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index c720dcf56231..73d46e8cdc16 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -298,13 +298,14 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) /** * svc_rdma_send - Post a single Send WR * @rdma: transport on which to post the WR - * @wr: prepared Send WR to post + * @ctxt: send ctxt with a Send WR ready to post * * Returns zero the Send WR was posted successfully. Otherwise, a * negative errno is returned. */ -int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) +int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) { + struct ib_send_wr *wr = &ctxt->sc_send_wr; int ret; might_sleep(); @@ -330,7 +331,7 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) } svc_xprt_get(&rdma->sc_xprt); - trace_svcrdma_post_send(wr); + trace_svcrdma_post_send(ctxt); ret = ib_post_send(rdma->sc_qp, wr, NULL); if (ret) break; @@ -805,7 +806,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, } else { sctxt->sc_send_wr.opcode = IB_WR_SEND; } - return svc_rdma_send(rdma, &sctxt->sc_send_wr); + return svc_rdma_send(rdma, sctxt); } /** @@ -869,7 +870,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; - if (svc_rdma_send(rdma, &sctxt->sc_send_wr)) + if (svc_rdma_send(rdma, sctxt)) goto put_ctxt; return; -- cgit v1.2.3 From 6787f0bea27a24e4c306616565b02234ee558cfb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 29 Apr 2020 17:25:36 -0400 Subject: svcrdma: Display chunk completion ID when posting a rw_ctxt Re-use the post_rw tracepoint (safely) to trace cc_info lifetime events, including completion IDs. Signed-off-by: Chuck Lever --- include/trace/events/rpcrdma.h | 56 +++++++++------------------------------ net/sunrpc/xprtrdma/svc_rdma_rw.c | 14 ++++++++-- 2 files changed, 24 insertions(+), 46 deletions(-) diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index aeeba9188ed5..abe942225637 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1802,41 +1802,6 @@ TRACE_EVENT(svcrdma_send_err, ) ); -DECLARE_EVENT_CLASS(svcrdma_sendcomp_event, - TP_PROTO( - const struct ib_wc *wc - ), - - TP_ARGS(wc), - - TP_STRUCT__entry( - __field(const void *, cqe) - __field(unsigned int, status) - __field(unsigned int, vendor_err) - ), - - TP_fast_assign( - __entry->cqe = wc->wr_cqe; - __entry->status = wc->status; - if (wc->status) - __entry->vendor_err = wc->vendor_err; - else - __entry->vendor_err = 0; - ), - - TP_printk("cqe=%p status=%s (%u/0x%x)", - __entry->cqe, rdma_show_wc_status(__entry->status), - __entry->status, __entry->vendor_err - ) -); - -#define DEFINE_SENDCOMP_EVENT(name) \ - DEFINE_EVENT(svcrdma_sendcomp_event, svcrdma_wc_##name, \ - TP_PROTO( \ - const struct ib_wc *wc \ - ), \ - TP_ARGS(wc)) - TRACE_EVENT(svcrdma_post_send, TP_PROTO( const struct svc_rdma_send_ctxt *ctxt @@ -1916,31 +1881,34 @@ TRACE_EVENT(svcrdma_rq_post_err, ) ); -TRACE_EVENT(svcrdma_post_rw, +TRACE_EVENT(svcrdma_post_chunk, TP_PROTO( - const void *cqe, + const struct rpc_rdma_cid *cid, int sqecount ), - TP_ARGS(cqe, sqecount), + TP_ARGS(cid, sqecount), TP_STRUCT__entry( - __field(const void *, cqe) + __field(u32, cq_id) + __field(int, completion_id) __field(int, sqecount) ), TP_fast_assign( - __entry->cqe = cqe; + __entry->cq_id = cid->ci_queue_id; + __entry->completion_id = cid->ci_completion_id; __entry->sqecount = sqecount; ), - TP_printk("cqe=%p sqecount=%d", - __entry->cqe, __entry->sqecount + TP_printk("cq.id=%u cid=%d sqecount=%d", + __entry->cq_id, __entry->completion_id, + __entry->sqecount ) ); -DEFINE_SENDCOMP_EVENT(read); -DEFINE_SENDCOMP_EVENT(write); +DEFINE_COMPLETION_EVENT(svcrdma_wc_read); +DEFINE_COMPLETION_EVENT(svcrdma_wc_write); TRACE_EVENT(svcrdma_qp_error, TP_PROTO( diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 2038b1b286dd..c16d10601d65 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -145,15 +145,24 @@ static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, * demand, and not cached. */ struct svc_rdma_chunk_ctxt { + struct rpc_rdma_cid cc_cid; struct ib_cqe cc_cqe; struct svcxprt_rdma *cc_rdma; struct list_head cc_rwctxts; int cc_sqecount; }; +static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma, + struct rpc_rdma_cid *cid) +{ + cid->ci_queue_id = rdma->sc_sq_cq->res.id; + cid->ci_completion_id = atomic_inc_return(&rdma->sc_completion_ids); +} + static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc) { + svc_rdma_cc_cid_init(rdma, &cc->cc_cid); cc->cc_rdma = rdma; svc_xprt_get(&rdma->sc_xprt); @@ -237,7 +246,7 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_write_info *info = container_of(cc, struct svc_rdma_write_info, wi_cc); - trace_svcrdma_wc_write(wc); + trace_svcrdma_wc_write(wc, &cc->cc_cid); atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wake_up(&rdma->sc_send_wait); @@ -295,7 +304,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_read_info *info = container_of(cc, struct svc_rdma_read_info, ri_cc); - trace_svcrdma_wc_read(wc); + trace_svcrdma_wc_read(wc, &cc->cc_cid); atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); wake_up(&rdma->sc_send_wait); @@ -351,6 +360,7 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) do { if (atomic_sub_return(cc->cc_sqecount, &rdma->sc_sq_avail) > 0) { + trace_svcrdma_post_chunk(&cc->cc_cid, cc->cc_sqecount); ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); if (ret) break; -- cgit v1.2.3 From 0b7cd9d9ca66e5181f3a120cb5b44433efd0ad55 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Thu, 18 Jun 2020 01:56:13 +0000 Subject: nfsd: Use seq_putc() in two functions A single character (line break) should be put into a sequence. Thus use the corresponding function "seq_putc()". Signed-off-by: Xu Wang Signed-off-by: Chuck Lever --- fs/nfsd/nfs4idmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 9460be8a8321..f92161ce1f97 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -168,7 +168,7 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) ent->id); if (test_bit(CACHE_VALID, &h->flags)) seq_printf(m, " %s", ent->name); - seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } @@ -346,7 +346,7 @@ nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) ent->name); if (test_bit(CACHE_VALID, &h->flags)) seq_printf(m, " %u", ent->id); - seq_printf(m, "\n"); + seq_putc(m, '\n'); return 0; } -- cgit v1.2.3 From 94415b06eb8aed13481646026dc995f04a3a534a Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 7 Jul 2020 09:28:05 -0400 Subject: nfsd4: a client's own opens needn't prevent delegations We recently fixed lease breaking so that a client's actions won't break its own delegations. But we still have an unnecessary self-conflict when granting delegations: a client's own write opens will prevent us from handing out a read delegation even when no other client has the file open for write. Fix that by turning off the checks for conflicting opens under vfs_setlease, and instead performing those checks in the nfsd code. We don't depend much on locks here: instead we acquire the delegation, then check for conflicts, and drop the delegation again if we find any. The check beforehand is an optimization of sorts, just to avoid acquiring the delegation unnecessarily. There's a race where the first check could cause us to deny the delegation when we could have granted it. But, that's OK, delegation grants are optional (and probably not even a good idea in that case). Signed-off-by: J. Bruce Fields Signed-off-by: Chuck Lever --- fs/locks.c | 3 +++ fs/nfsd/nfs4state.c | 54 +++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 7df0f9fa66f4..d5de9039dbd7 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1807,6 +1807,9 @@ check_conflicting_open(struct file *filp, const long arg, int flags) if (flags & FL_LAYOUT) return 0; + if (flags & FL_DELEG) + /* We leave these checks to the caller. */ + return 0; if (arg == F_RDLCK) return inode_is_open_for_write(inode) ? -EAGAIN : 0; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index cce2510b2cca..fdba971d06c3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4922,6 +4922,32 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, return fl; } +static int nfsd4_check_conflicting_opens(struct nfs4_client *clp, + struct nfs4_file *fp) +{ + struct nfs4_clnt_odstate *co; + struct file *f = fp->fi_deleg_file->nf_file; + struct inode *ino = locks_inode(f); + int writes = atomic_read(&ino->i_writecount); + + if (fp->fi_fds[O_WRONLY]) + writes--; + if (fp->fi_fds[O_RDWR]) + writes--; + WARN_ON_ONCE(writes < 0); + if (writes > 0) + return -EAGAIN; + spin_lock(&fp->fi_lock); + list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) { + if (co->co_client != clp) { + spin_unlock(&fp->fi_lock); + return -EAGAIN; + } + } + spin_unlock(&fp->fi_lock); + return 0; +} + static struct nfs4_delegation * nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) @@ -4941,9 +4967,12 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, nf = find_readable_file(fp); if (!nf) { - /* We should always have a readable file here */ - WARN_ON_ONCE(1); - return ERR_PTR(-EBADF); + /* + * We probably could attempt another open and get a read + * delegation, but for now, don't bother until the + * client actually sends us one. + */ + return ERR_PTR(-EAGAIN); } spin_lock(&state_lock); spin_lock(&fp->fi_lock); @@ -4973,11 +5002,19 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (!fl) goto out_clnt_odstate; + status = nfsd4_check_conflicting_opens(clp, fp); + if (status) { + locks_free_lock(fl); + goto out_clnt_odstate; + } status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL); if (fl) locks_free_lock(fl); if (status) goto out_clnt_odstate; + status = nfsd4_check_conflicting_opens(clp, fp); + if (status) + goto out_clnt_odstate; spin_lock(&state_lock); spin_lock(&fp->fi_lock); @@ -5059,17 +5096,6 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, goto out_no_deleg; if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) goto out_no_deleg; - /* - * Also, if the file was opened for write or - * create, there's a good chance the client's - * about to write to it, resulting in an - * immediate recall (since we don't support - * write delegations): - */ - if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - goto out_no_deleg; - if (open->op_create == NFS4_OPEN_CREATE) - goto out_no_deleg; break; default: goto out_no_deleg; -- cgit v1.2.3 From df60446cd1fb487becd1f36f4c0da9e0e523c0cf Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Fri, 10 Jul 2020 16:33:07 -0400 Subject: nfsd: avoid a NULL dereference in __cld_pipe_upcall() If the rpc_pipefs is unmounted, then the rpc_pipe->dentry becomes NULL and dereferencing the dentry->d_sb will trigger an oops. The only reason we're doing that is to determine the nfsd_net, which could instead be passed in by the caller. So do that instead. Fixes: 11a60d159259 ("nfsd: add a "GetVersion" upcall for nfsdcld") Signed-off-by: Scott Mayhew Signed-off-by: Chuck Lever --- fs/nfsd/nfs4recover.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 9e40dfecf1b1..186fa2c2c6ba 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -747,13 +747,11 @@ struct cld_upcall { }; static int -__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) +__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn) { int ret; struct rpc_pipe_msg msg; struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u); - struct nfsd_net *nn = net_generic(pipe->dentry->d_sb->s_fs_info, - nfsd_net_id); memset(&msg, 0, sizeof(msg)); msg.data = cmsg; @@ -773,7 +771,7 @@ out: } static int -cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) +cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg, struct nfsd_net *nn) { int ret; @@ -782,7 +780,7 @@ cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) * upcalls queued. */ do { - ret = __cld_pipe_upcall(pipe, cmsg); + ret = __cld_pipe_upcall(pipe, cmsg, nn); } while (ret == -EAGAIN); return ret; @@ -1115,7 +1113,7 @@ nfsd4_cld_create(struct nfs4_client *clp) memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); @@ -1180,7 +1178,7 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) } else cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; - ret = cld_pipe_upcall(cn->cn_pipe, cmsg); + ret = cld_pipe_upcall(cn->cn_pipe, cmsg, nn); if (!ret) { ret = cmsg->cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); @@ -1218,7 +1216,7 @@ nfsd4_cld_remove(struct nfs4_client *clp) memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); @@ -1261,7 +1259,7 @@ nfsd4_cld_check_v0(struct nfs4_client *clp) memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); @@ -1404,7 +1402,7 @@ nfsd4_cld_grace_start(struct nfsd_net *nn) } cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) ret = cup->cu_u.cu_msg.cm_status; @@ -1432,7 +1430,7 @@ nfsd4_cld_grace_done_v0(struct nfsd_net *nn) cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; cup->cu_u.cu_msg.cm_u.cm_gracetime = nn->boot_time; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) ret = cup->cu_u.cu_msg.cm_status; @@ -1460,7 +1458,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn) } cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) ret = cup->cu_u.cu_msg.cm_status; @@ -1524,7 +1522,7 @@ nfsd4_cld_get_version(struct nfsd_net *nn) goto out_err; } cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg, nn); if (!ret) { ret = cup->cu_u.cu_msg.cm_status; if (ret) -- cgit v1.2.3 From 986a4b63d3bc5f2c0eb4083b05aff2bf883b7b2f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 24 Jul 2020 17:08:57 -0400 Subject: SUNRPC: Fix ("SUNRPC: Add "@len" parameter to gss_unwrap()") Braino when converting "buf->len -=" to "buf->len = len -". The result is under-estimation of the ralign and rslack values. On krb5p mounts, this has caused READDIR to fail with EIO, and KASAN splats when decoding READLINK replies. As a result of fixing this oversight, the gss_unwrap method now returns a buf->len that can be shorter than priv_len for small RPC messages. The additional adjustment done in unwrap_priv_data() can underflow buf->len. This causes the nfsd_request_too_large check to fail during some NFSv3 operations. Reported-by: Marian Rainer-Harbach Reported-by: Pierre Sauter BugLink: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1886277 Fixes: 31c9590ae468 ("SUNRPC: Add "@len" parameter to gss_unwrap()") Reviewed-by: J. Bruce Fields Signed-off-by: Chuck Lever --- net/sunrpc/auth_gss/gss_krb5_wrap.c | 2 +- net/sunrpc/auth_gss/svcauth_gss.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index cf0fd170ac18..90b8329fef82 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -584,7 +584,7 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, int len, buf->head[0].iov_len); memmove(ptr, ptr + GSS_KRB5_TOK_HDR_LEN + headskip, movelen); buf->head[0].iov_len -= GSS_KRB5_TOK_HDR_LEN + headskip; - buf->len = len - GSS_KRB5_TOK_HDR_LEN + headskip; + buf->len = len - (GSS_KRB5_TOK_HDR_LEN + headskip); /* Trim off the trailing "extra count" and checksum blob */ xdr_buf_trim(buf, ec + GSS_KRB5_TOK_HDR_LEN + tailskip); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 7d83f54aaaa6..258b04372f85 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -990,7 +990,6 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs maj_stat = gss_unwrap(ctx, 0, priv_len, buf); pad = priv_len - buf->len; - buf->len -= pad; /* The upper layers assume the buffer is aligned on 4-byte boundaries. * In the krb5p case, at least, the data ends up offset, so we need to * move it around. */ -- cgit v1.2.3 From 94a4beaa6b3756dbb661bc9a748574b9bc0cb4ae Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 19 Jul 2020 17:14:03 -0700 Subject: nfsd: netns.h: delete a duplicated word Drop the repeated word "the" in a comment. Signed-off-by: Randy Dunlap Cc: "J. Bruce Fields" Cc: Chuck Lever Cc: linux-nfs@vger.kernel.org Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 9217cb64bf0e..7346acda9d76 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -171,7 +171,7 @@ struct nfsd_net { unsigned int longest_chain_cachesize; struct shrinker nfsd_reply_cache_shrinker; - /* utsname taken from the the process that starts the server */ + /* utsname taken from the process that starts the server */ char nfsd_name[UNX_MAXNODENAME+1]; }; -- cgit v1.2.3 From a68d5a502bbacfbd31f98371f777d574b3a91baf Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 24 Jul 2020 15:26:47 -0400 Subject: SUNRPC: Refresh the show_rqstp_flags() macro Ensure that show_rqstp_flags() can recognize and display the RQ_AUTHERR flag, added in commit 83dd59a0b9af ("SUNRPC/nfs: Fix return value for nfs4_callback_compound()") and the RQ_DATA flag, added in commit ff3ac5c3dc23 ("SUNRPC: Add a server side per-connection limit"). Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 6a12935b8b14..65d7dfbbc9cd 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1250,15 +1250,34 @@ DECLARE_EVENT_CLASS(svc_xdr_buf_class, DEFINE_SVCXDRBUF_EVENT(recvfrom); DEFINE_SVCXDRBUF_EVENT(sendto); +/* + * from include/linux/sunrpc/svc.h + */ +#define SVC_RQST_FLAG_LIST \ + svc_rqst_flag(SECURE) \ + svc_rqst_flag(LOCAL) \ + svc_rqst_flag(USEDEFERRAL) \ + svc_rqst_flag(DROPME) \ + svc_rqst_flag(SPLICE_OK) \ + svc_rqst_flag(VICTIM) \ + svc_rqst_flag(BUSY) \ + svc_rqst_flag(DATA) \ + svc_rqst_flag_end(AUTHERR) + +#undef svc_rqst_flag +#undef svc_rqst_flag_end +#define svc_rqst_flag(x) TRACE_DEFINE_ENUM(RQ_##x); +#define svc_rqst_flag_end(x) TRACE_DEFINE_ENUM(RQ_##x); + +SVC_RQST_FLAG_LIST + +#undef svc_rqst_flag +#undef svc_rqst_flag_end +#define svc_rqst_flag(x) { BIT(RQ_##x), #x }, +#define svc_rqst_flag_end(x) { BIT(RQ_##x), #x } + #define show_rqstp_flags(flags) \ - __print_flags(flags, "|", \ - { (1UL << RQ_SECURE), "RQ_SECURE"}, \ - { (1UL << RQ_LOCAL), "RQ_LOCAL"}, \ - { (1UL << RQ_USEDEFERRAL), "RQ_USEDEFERRAL"}, \ - { (1UL << RQ_DROPME), "RQ_DROPME"}, \ - { (1UL << RQ_SPLICE_OK), "RQ_SPLICE_OK"}, \ - { (1UL << RQ_VICTIM), "RQ_VICTIM"}, \ - { (1UL << RQ_BUSY), "RQ_BUSY"}) + __print_flags(flags, "|", SVC_RQST_FLAG_LIST) TRACE_EVENT(svc_recv, TP_PROTO(struct svc_rqst *rqst, int len), -- cgit v1.2.3 From 64d26422516b2e347b32e6d9b1d40b3c19a62aae Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 30 Jun 2020 15:55:45 -0400 Subject: svcrdma: Fix another Receive buffer leak During a connection tear down, the Receive queue is flushed before the device resources are freed. Typically, all the Receives flush with IB_WR_FLUSH_ERR. However, any pending successful Receives flush with IB_WR_SUCCESS, and the server automatically posts a fresh Receive to replace the completing one. This happens even after the connection has closed and the RQ is drained. Receives that are posted after the RQ is drained appear never to complete, causing a Receive resource leak. The leaked Receive buffer is left DMA-mapped. To prevent these late-posted recv_ctxt's from leaking, block new Receive posting after XPT_CLOSE is set. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index d5ec85cb652c..5bb97b5f4606 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -275,6 +275,8 @@ static int svc_rdma_post_recv(struct svcxprt_rdma *rdma) { struct svc_rdma_recv_ctxt *ctxt; + if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) + return 0; ctxt = svc_rdma_recv_ctxt_get(rdma); if (!ctxt) return -ENOMEM; -- cgit v1.2.3 From 365e9992b90fbbfda89f72866783fd52bbd58e64 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 30 Jun 2020 13:25:41 -0400 Subject: svcrdma: Remove transport reference counting Jason tells me that a ULP cannot rely on getting an ESTABLISHED and DISCONNECTED event pair for each connection, so transport reference counting in the CM event handler will never be reliable. Now that we have ib_drain_qp(), svcrdma should no longer need to hold transport references while Sends and Receives are posted. So remove the get/put call sites in the CM event handlers. This eliminates a significant source of locked memory bus traffic. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 6 +----- net/sunrpc/xprtrdma/svc_rdma_rw.c | 2 -- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 4 ---- net/sunrpc/xprtrdma/svc_rdma_transport.c | 17 +---------------- 4 files changed, 2 insertions(+), 27 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 5bb97b5f4606..c6ea2903c21a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -257,7 +257,6 @@ static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, { int ret; - svc_xprt_get(&rdma->sc_xprt); trace_svcrdma_post_recv(ctxt); ret = ib_post_recv(rdma->sc_qp, &ctxt->rc_recv_wr, NULL); if (ret) @@ -267,7 +266,6 @@ static int __svc_rdma_post_recv(struct svcxprt_rdma *rdma, err_post: trace_svcrdma_rq_post_err(rdma, ret); svc_rdma_recv_ctxt_put(rdma, ctxt); - svc_xprt_put(&rdma->sc_xprt); return ret; } @@ -344,15 +342,13 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) spin_unlock(&rdma->sc_rq_dto_lock); if (!test_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags)) svc_xprt_enqueue(&rdma->sc_xprt); - goto out; + return; flushed: post_err: svc_rdma_recv_ctxt_put(rdma, ctxt); set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); svc_xprt_enqueue(&rdma->sc_xprt); -out: - svc_xprt_put(&rdma->sc_xprt); } /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index c16d10601d65..fe54cbe97a46 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -164,7 +164,6 @@ static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, { svc_rdma_cc_cid_init(rdma, &cc->cc_cid); cc->cc_rdma = rdma; - svc_xprt_get(&rdma->sc_xprt); INIT_LIST_HEAD(&cc->cc_rwctxts); cc->cc_sqecount = 0; @@ -184,7 +183,6 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, ctxt->rw_nents, dir); svc_rdma_put_rw_ctxt(rdma, ctxt); } - svc_xprt_put(&rdma->sc_xprt); } /* State for sending a Write or Reply chunk. diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 73d46e8cdc16..7b94d971feb3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -291,8 +291,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); svc_xprt_enqueue(&rdma->sc_xprt); } - - svc_xprt_put(&rdma->sc_xprt); } /** @@ -330,7 +328,6 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) continue; } - svc_xprt_get(&rdma->sc_xprt); trace_svcrdma_post_send(ctxt); ret = ib_post_send(rdma->sc_qp, wr, NULL); if (ret) @@ -340,7 +337,6 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) trace_svcrdma_sq_post_err(rdma, ret); set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - svc_xprt_put(&rdma->sc_xprt); wake_up(&rdma->sc_send_wait); return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 3da7901a49e6..aa60f75c8c1d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -271,7 +271,6 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: /* Accept complete */ - svc_xprt_get(xprt); dprintk("svcrdma: Connection completed on DTO xprt=%p, " "cm_id=%p\n", xprt, cma_id); clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); @@ -282,7 +281,6 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, xprt, cma_id); set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); break; case RDMA_CM_EVENT_DEVICE_REMOVAL: dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " @@ -290,7 +288,6 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, rdma_event_msg(event->event), event->event); set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); break; default: dprintk("svcrdma: Unexpected event on DTO endpoint %p, " @@ -539,24 +536,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) return NULL; } -/* - * When connected, an svc_xprt has at least two references: - * - * - A reference held by the cm_id between the ESTABLISHED and - * DISCONNECTED events. If the remote peer disconnected first, this - * reference could be gone. - * - * - A reference held by the svc_recv code that called this function - * as part of close processing. - * - * At a minimum one references should still be held. - */ static void svc_rdma_detach(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - /* Disconnect and flush posted WQE */ rdma_disconnect(rdma->sc_cm_id); } @@ -566,6 +550,7 @@ static void __svc_rdma_free(struct work_struct *work) container_of(work, struct svcxprt_rdma, sc_work); struct svc_xprt *xprt = &rdma->sc_xprt; + /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_drain_qp(rdma->sc_qp); -- cgit v1.2.3 From b297fed699ad9e50315b27e78de42ac631c9990d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 30 Jun 2020 17:16:35 -0400 Subject: svcrdma: CM event handler clean up Now that there's a core tracepoint that reports these events, there's no need to maintain dprintk() call sites in each arm of the switch statements. We also refresh the documenting comments. Signed-off-by: Chuck Lever --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 56 ++++++++++++++------------------ 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index aa60f75c8c1d..fb044792b571 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -237,62 +237,56 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, svc_xprt_enqueue(&listen_xprt->sc_xprt); } -/* - * Handles events generated on the listening endpoint. These events will be - * either be incoming connect requests or adapter removal events. +/** + * svc_rdma_listen_handler - Handle CM events generated on a listening endpoint + * @cma_id: the server's listener rdma_cm_id + * @event: details of the event + * + * Return values: + * %0: Do not destroy @cma_id + * %1: Destroy @cma_id (never returned here) + * + * NB: There is never a DEVICE_REMOVAL event for INADDR_ANY listeners. */ -static int rdma_listen_handler(struct rdma_cm_id *cma_id, - struct rdma_cm_event *event) +static int svc_rdma_listen_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) { switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: - dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " - "event = %s (%d)\n", cma_id, cma_id->context, - rdma_event_msg(event->event), event->event); handle_connect_req(cma_id, &event->param.conn); break; default: - /* NB: No device removal upcall for INADDR_ANY listeners */ - dprintk("svcrdma: Unexpected event on listening endpoint %p, " - "event = %s (%d)\n", cma_id, - rdma_event_msg(event->event), event->event); break; } - return 0; } -static int rdma_cma_handler(struct rdma_cm_id *cma_id, - struct rdma_cm_event *event) +/** + * svc_rdma_cma_handler - Handle CM events on client connections + * @cma_id: the server's listener rdma_cm_id + * @event: details of the event + * + * Return values: + * %0: Do not destroy @cma_id + * %1: Destroy @cma_id (never returned here) + */ +static int svc_rdma_cma_handler(struct rdma_cm_id *cma_id, + struct rdma_cm_event *event) { struct svcxprt_rdma *rdma = cma_id->context; struct svc_xprt *xprt = &rdma->sc_xprt; switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: - /* Accept complete */ - dprintk("svcrdma: Connection completed on DTO xprt=%p, " - "cm_id=%p\n", xprt, cma_id); clear_bit(RDMAXPRT_CONN_PENDING, &rdma->sc_flags); svc_xprt_enqueue(xprt); break; case RDMA_CM_EVENT_DISCONNECTED: - dprintk("svcrdma: Disconnect on DTO xprt=%p, cm_id=%p\n", - xprt, cma_id); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); - break; case RDMA_CM_EVENT_DEVICE_REMOVAL: - dprintk("svcrdma: Device removal cma_id=%p, xprt = %p, " - "event = %s (%d)\n", cma_id, xprt, - rdma_event_msg(event->event), event->event); set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); break; default: - dprintk("svcrdma: Unexpected event on DTO endpoint %p, " - "event = %s (%d)\n", cma_id, - rdma_event_msg(event->event), event->event); break; } return 0; @@ -318,7 +312,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); strcpy(cma_xprt->sc_xprt.xpt_remotebuf, "listener"); - listen_id = rdma_create_id(net, rdma_listen_handler, cma_xprt, + listen_id = rdma_create_id(net, svc_rdma_listen_handler, cma_xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(listen_id)) { ret = PTR_ERR(listen_id); @@ -482,7 +476,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) goto errout; /* Swap out the handler */ - newxprt->sc_cm_id->event_handler = rdma_cma_handler; + newxprt->sc_cm_id->event_handler = svc_rdma_cma_handler; /* Construct RDMA-CM private message */ pmsg.cp_magic = rpcrdma_cmp_magic; -- cgit v1.2.3