From 5fdca6531434c1c1b2d584873afdda52e5ad448c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:04:42 -0500 Subject: svcrdma: Renovate sendto chunk list parsing The current sendto code appears to support clients that provide only one of a Read list, a Write list, or a Reply chunk. My reading of that code is that it doesn't support the following cases: - Read list + Write list - Read list + Reply chunk - Write list + Reply chunk - Read list + Write list + Reply chunk The protocol allows more than one Read or Write chunk in those lists. Some clients do send a Read list and Reply chunk simultaneously. NFSv4 WRITE uses a Read list for the data payload, and a Reply chunk because the GETATTR result in the reply can contain a large object like an ACL. Generalize one of the sendto code paths needed to support all of the above cases, and attempt to ensure that only one pass is done through the RPC Call's transport header to gather chunk list information for building the reply. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index cc3ae16eac68..6aef63b9a669 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -236,8 +236,6 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, struct svc_rdma_req_map *, bool); extern int svc_rdma_sendto(struct svc_rqst *); -extern struct rpcrdma_read_chunk * - svc_rdma_get_read_chunk(struct rpcrdma_msg *); extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, int); -- cgit v1.2.3 From e4eb42cecc6dc546aac888ee4913d59121e886ee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:04:50 -0500 Subject: svcrdma: Remove BH-disabled spin locking in svc_rdma_send() svcrdma's current SQ accounting algorithm takes sc_lock and disables bottom-halves while posting all RDMA Read, Write, and Send WRs. This is relatively heavyweight serialization. And note that Write and Send are already fully serialized by the xpt_mutex. Using a single atomic_t should be all that is necessary to guarantee that ib_post_send() is called only when there is enough space on the send queue. This is what the other RDMA-enabled storage targets do. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 +- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 7 ++++++- net/sunrpc/xprtrdma/svc_rdma_transport.c | 25 ++++++++++--------------- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 6aef63b9a669..601cb07aa746 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -139,7 +139,7 @@ struct svcxprt_rdma { int sc_max_sge_rd; /* max sge for read target */ bool sc_snd_w_inv; /* OK to use Send With Invalidate */ - atomic_t sc_sq_count; /* Number of SQ WR on queue */ + atomic_t sc_sq_avail; /* SQEs ready to be consumed */ unsigned int sc_sq_depth; /* Depth of SQ */ unsigned int sc_rq_depth; /* Depth of RQ */ u32 sc_max_requests; /* Forward credits */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 0a58d4062f2f..30eeab527bd0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -594,7 +594,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) goto err0; inline_bytes = rqstp->rq_res.len; - /* Create the RDMA response header */ + /* Create the RDMA response header. xprt->xpt_mutex, + * acquired in svc_send(), serializes RPC replies. The + * code path below that inserts the credit grant value + * into each transport header runs only inside this + * critical section. + */ ret = -ENOMEM; res_page = alloc_page(GFP_KERNEL); if (!res_page) diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 6864fb967038..da990d7f8b20 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -434,7 +434,7 @@ static void svc_rdma_send_wc_common(struct svcxprt_rdma *xprt, goto err; out: - atomic_dec(&xprt->sc_sq_count); + atomic_inc(&xprt->sc_sq_avail); wake_up(&xprt->sc_send_wait); return; @@ -1008,6 +1008,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests; newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; + atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth); if (!svc_rdma_prealloc_ctxts(newxprt)) goto errout; @@ -1333,15 +1334,13 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) /* If the SQ is full, wait until an SQ entry is available */ while (1) { - spin_lock_bh(&xprt->sc_lock); - if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) { - spin_unlock_bh(&xprt->sc_lock); + if ((atomic_sub_return(wr_count, &xprt->sc_sq_avail) < 0)) { atomic_inc(&rdma_stat_sq_starve); /* Wait until SQ WR available if SQ still full */ + atomic_add(wr_count, &xprt->sc_sq_avail); wait_event(xprt->sc_send_wait, - atomic_read(&xprt->sc_sq_count) < - xprt->sc_sq_depth); + atomic_read(&xprt->sc_sq_avail) > wr_count); if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) return -ENOTCONN; continue; @@ -1351,21 +1350,17 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) svc_xprt_get(&xprt->sc_xprt); /* Bump used SQ WR count and post */ - atomic_add(wr_count, &xprt->sc_sq_count); ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); if (ret) { set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - atomic_sub(wr_count, &xprt->sc_sq_count); for (i = 0; i < wr_count; i ++) svc_xprt_put(&xprt->sc_xprt); - dprintk("svcrdma: failed to post SQ WR rc=%d, " - "sc_sq_count=%d, sc_sq_depth=%d\n", - ret, atomic_read(&xprt->sc_sq_count), - xprt->sc_sq_depth); - } - spin_unlock_bh(&xprt->sc_lock); - if (ret) + dprintk("svcrdma: failed to post SQ WR rc=%d\n", ret); + dprintk(" sc_sq_avail=%d, sc_sq_depth=%d\n", + atomic_read(&xprt->sc_sq_avail), + xprt->sc_sq_depth); wake_up(&xprt->sc_send_wait); + } break; } return ret; -- cgit v1.2.3 From dd6fd213b05e7a1f590b470500343dd97c3a32c1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:04:58 -0500 Subject: svcrdma: Remove DMA map accounting Clean up: sc_dma_used is not required for correct operation. It is simply a debugging tool to report when svcrdma has leaked DMA maps. However, manipulating an atomic has a measurable CPU cost, and DMA map accounting specific to svcrdma will be meaningless once svcrdma is converted to use the new generic r/w API. A similar kind of debug accounting can be done simply by enabling the IOMMU or by using CONFIG_DMA_API_DEBUG, CONFIG_IOMMU_DEBUG, and CONFIG_IOMMU_LEAK. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 2 -- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 1 - net/sunrpc/xprtrdma/svc_rdma_transport.c | 13 +++---------- 3 files changed, 3 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 601cb07aa746..43d7c709d117 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -148,7 +148,6 @@ struct svcxprt_rdma { struct ib_pd *sc_pd; - atomic_t sc_dma_used; spinlock_t sc_ctxt_lock; struct list_head sc_ctxts; int sc_ctxt_used; @@ -200,7 +199,6 @@ static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma, struct svc_rdma_op_ctxt *ctxt) { ctxt->mapped_sges++; - atomic_inc(&rdma->sc_dma_used); } /* svc_rdma_backchannel.c */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 873c2a938d35..283246e0afc2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -279,7 +279,6 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, frmr->sg); return -ENOMEM; } - atomic_inc(&xprt->sc_dma_used); n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE); if (unlikely(n != frmr->sg_nents)) { diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index da990d7f8b20..a613ebd1dd85 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -224,25 +224,22 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) struct svcxprt_rdma *xprt = ctxt->xprt; struct ib_device *device = xprt->sc_cm_id->device; u32 lkey = xprt->sc_pd->local_dma_lkey; - unsigned int i, count; + unsigned int i; - for (count = 0, i = 0; i < ctxt->mapped_sges; i++) { + for (i = 0; i < ctxt->mapped_sges; i++) { /* * Unmap the DMA addr in the SGE if the lkey matches * the local_dma_lkey, otherwise, ignore it since it is * an FRMR lkey and will be unmapped later when the * last WR that uses it completes. */ - if (ctxt->sge[i].lkey == lkey) { - count++; + if (ctxt->sge[i].lkey == lkey) ib_dma_unmap_page(device, ctxt->sge[i].addr, ctxt->sge[i].length, ctxt->direction); - } } ctxt->mapped_sges = 0; - atomic_sub(count, &xprt->sc_dma_used); } void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) @@ -944,7 +941,6 @@ void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, if (frmr) { ib_dma_unmap_sg(rdma->sc_cm_id->device, frmr->sg, frmr->sg_nents, frmr->direction); - atomic_dec(&rdma->sc_dma_used); spin_lock_bh(&rdma->sc_frmr_q_lock); WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); list_add(&frmr->frmr_list, &rdma->sc_frmr_q); @@ -1256,9 +1252,6 @@ static void __svc_rdma_free(struct work_struct *work) if (rdma->sc_ctxt_used != 0) pr_err("svcrdma: ctxt still in use? (%d)\n", rdma->sc_ctxt_used); - if (atomic_read(&rdma->sc_dma_used) != 0) - pr_err("svcrdma: dma still in use? (%d)\n", - atomic_read(&rdma->sc_dma_used)); /* Final put of backchannel client transport */ if (xprt->xpt_bc_xprt) { -- cgit v1.2.3 From 96a58f9c1921f28fab5ed008be791adacb540cc6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 29 Nov 2016 11:05:07 -0500 Subject: svcrdma: Remove svc_rdma_op_ctxt::wc_status Clean up: Completion status is already reported in the individual completion handlers. Save a few bytes in struct svc_rdma_op_ctxt. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields --- include/linux/sunrpc/svc_rdma.h | 1 - net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 4 ++-- net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 43d7c709d117..757fb963696c 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -79,7 +79,6 @@ struct svc_rdma_op_ctxt { struct ib_cqe reg_cqe; struct ib_cqe inv_cqe; struct list_head dto_q; - enum ib_wc_status wc_status; u32 byte_len; u32 position; struct svcxprt_rdma *xprt; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 283246e0afc2..428ab4ef77f3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -640,8 +640,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto defer; goto out; } - dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n", - ctxt, rdma_xprt, rqstp, ctxt->wc_status); + dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p\n", + ctxt, rdma_xprt, rqstp); atomic_inc(&rdma_stat_recv); /* Build up the XDR from the receive buffers. */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index a613ebd1dd85..f9d961c00dd2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -393,7 +393,6 @@ static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wc->wr_cqe and wc->status are reliable */ ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - ctxt->wc_status = wc->status; svc_rdma_unmap_dma(ctxt); if (wc->status != IB_WC_SUCCESS) -- cgit v1.2.3 From 47057abde515155a4fee53038e7772d6b387e0aa Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 12 Jan 2016 20:24:14 +0100 Subject: nfsd: add support for the umask attribute Clients can set the umask attribute when creating files to cause the server to apply it always except when inheriting permissions from the parent directory. That way, the new files will end up with the same permissions as files created locally. See https://tools.ietf.org/html/draft-ietf-nfsv4-umask-02 for more details. Signed-off-by: Andreas Gruenbacher Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 3 +++ fs/nfsd/nfs4xdr.c | 26 +++++++++++++++++++++----- fs/nfsd/nfsd.h | 9 +++++++-- fs/nfsd/nfssvc.c | 4 ++-- include/linux/nfs4.h | 1 + 5 files changed, 34 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e901cf124e9f..74a6e573e061 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -102,6 +102,9 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_attrnotsupp; if (writable && !bmval_is_subset(bmval, writable)) return nfserr_inval; + if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) && + (bmval[1] & FATTR4_WORD1_MODE)) + return nfserr_inval; return nfs_ok; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 281739e1d477..79edde4577b2 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -33,6 +33,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include #include #include #include @@ -299,7 +300,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) static __be32 nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, struct nfs4_acl **acl, - struct xdr_netobj *label) + struct xdr_netobj *label, int *umask) { int expected_len, len = 0; u32 dummy32; @@ -457,6 +458,17 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_jukebox; } #endif + if (bmval[2] & FATTR4_WORD2_MODE_UMASK) { + if (!umask) + goto xdr_error; + READ_BUF(8); + len += 8; + dummy32 = be32_to_cpup(p++); + iattr->ia_mode = dummy32 & (S_IFMT | S_IALLUGO); + dummy32 = be32_to_cpup(p++); + *umask = dummy32 & S_IRWXUGO; + iattr->ia_valid |= ATTR_MODE; + } if (len != expected_len) goto xdr_error; @@ -651,7 +663,8 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create return status; status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, - &create->cr_acl, &create->cr_label); + &create->cr_acl, &create->cr_label, + ¤t->fs->umask); if (status) goto out; @@ -896,13 +909,15 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) case NFS4_OPEN_NOCREATE: break; case NFS4_OPEN_CREATE: + current->fs->umask = 0; READ_BUF(4); open->op_createmode = be32_to_cpup(p++); switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: case NFS4_CREATE_GUARDED: status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label); + &open->op_iattr, &open->op_acl, &open->op_label, + ¤t->fs->umask); if (status) goto out; break; @@ -916,7 +931,8 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ_BUF(NFS4_VERIFIER_SIZE); COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label); + &open->op_iattr, &open->op_acl, &open->op_label, + ¤t->fs->umask); if (status) goto out; break; @@ -1153,7 +1169,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta if (status) return status; return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, - &setattr->sa_acl, &setattr->sa_label); + &setattr->sa_acl, &setattr->sa_label, NULL); } static __be32 diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 7155239b2908..d74c8c44dc35 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -359,6 +359,7 @@ void nfsd_lockd_shutdown(void); #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS) extern u32 nfsd_suppattrs[3][3]; @@ -390,10 +391,14 @@ static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval) (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) #ifdef CONFIG_NFSD_V4_SECURITY_LABEL -#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL +#define MAYBE_FATTR4_WORD2_SECURITY_LABEL \ + FATTR4_WORD2_SECURITY_LABEL #else -#define NFSD_WRITEABLE_ATTRS_WORD2 0 +#define MAYBE_FATTR4_WORD2_SECURITY_LABEL 0 #endif +#define NFSD_WRITEABLE_ATTRS_WORD2 \ + (FATTR4_WORD2_MODE_UMASK \ + | MAYBE_FATTR4_WORD2_SECURITY_LABEL) #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ NFSD_WRITEABLE_ATTRS_WORD0 diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a2b65fc56dd6..e6bfd96734c0 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -661,8 +661,8 @@ nfsd(void *vrqstp) mutex_lock(&nfsd_mutex); /* At this point, the thread shares current->fs - * with the init process. We need to create files with a - * umask of 0 instead of init's umask. */ + * with the init process. We need to create files with the + * umask as defined by the client instead of init's umask. */ if (unshare_fs_struct() < 0) { printk("Unable to start nfsd thread: out of memory\n"); goto out; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 9094faf0699d..bca536341d1a 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -440,6 +440,7 @@ enum lock_type4 { #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4) #define FATTR4_WORD2_CLONE_BLKSIZE (1UL << 13) #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16) +#define FATTR4_WORD2_MODE_UMASK (1UL << 17) /* MDS threshold bitmap bits */ #define THRESHOLD_RD (1UL << 0) -- cgit v1.2.3