Diffstat (limited to 'net/sunrpc/xprtrdma')
-rw-r--r-- | net/sunrpc/xprtrdma/Makefile              |   4
-rw-r--r-- | net/sunrpc/xprtrdma/fmr_ops.c             |  47
-rw-r--r-- | net/sunrpc/xprtrdma/frwr_ops.c            |  69
-rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c            | 125
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_marshal.c    | 168
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_recvfrom.c   | 734
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_rw.c         | 449
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_sendto.c     |  15
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_transport.c  | 250
-rw-r--r-- | net/sunrpc/xprtrdma/transport.c           |   3
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c               |  55
-rw-r--r-- | net/sunrpc/xprtrdma/xprt_rdma.h           |  40
12 files changed, 951 insertions(+), 1008 deletions(-)
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index c1ae8142ab73..b8213ddce2f2 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -3,6 +3,6 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o \ fmr_ops.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ - svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ - svc_rdma_rw.o module.o + svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ + module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 59e64025ed96..d3f84bb1d443 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -91,7 +91,7 @@ __fmr_unmap(struct rpcrdma_mw *mw) list_add(&mw->fmr.fm_mr->list, &l); rc = ib_unmap_fmr(&l); - list_del_init(&mw->fmr.fm_mr->list); + list_del(&mw->fmr.fm_mr->list); return rc; } @@ -213,13 +213,11 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - mw->mw_nents = i; mw->mw_dir = rpcrdma_data_dir(writing); - if (i == 0) - goto out_dmamap_err; - if (!ib_dma_map_sg(r_xprt->rx_ia.ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir)) + mw->mw_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, i, mw->mw_dir); + if (!mw->mw_nents) goto out_dmamap_err; for (i = 0, dma_pages = mw->fmr.fm_physaddrs; i < mw->mw_nents; i++) @@ -237,16 +235,18 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, return mw->mw_nents; out_dmamap_err: - pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", - mw->mw_sg, mw->mw_nents); - rpcrdma_defer_mr_recovery(mw); + pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", + mw->mw_sg, i); + rpcrdma_put_mw(r_xprt, mw); return -EIO; out_maperr: pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", len, (unsigned long long)dma_pages[0], pageoff, mw->mw_nents, rc); - rpcrdma_defer_mr_recovery(mw); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); + rpcrdma_put_mw(r_xprt, mw); return -EIO; } @@ -255,24 +255,26 @@ out_maperr: * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. * - * Caller ensures that req->rl_registered is not empty. + * Caller ensures that @mws is not empty before the call. This + * function empties the list. */ static void -fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) { - struct rpcrdma_mw *mw, *tmp; + struct rpcrdma_mw *mw; LIST_HEAD(unmap_list); int rc; - dprintk("RPC: %s: req %p\n", __func__, req); - /* ORDER: Invalidate all of the req's MRs first * * ib_unmap_fmr() is slow, so use a single call instead * of one call per mapped FMR. */ - list_for_each_entry(mw, &req->rl_registered, mw_list) + list_for_each_entry(mw, mws, mw_list) { + dprintk("RPC: %s: unmapping fmr %p\n", + __func__, &mw->fmr); list_add_tail(&mw->fmr.fm_mr->list, &unmap_list); + } r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); if (rc) @@ -281,9 +283,11 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. 
*/ - list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { - list_del_init(&mw->mw_list); - list_del_init(&mw->fmr.fm_mr->list); + while (!list_empty(mws)) { + mw = rpcrdma_pop_mw(mws); + dprintk("RPC: %s: DMA unmapping fmr %p\n", + __func__, &mw->fmr); + list_del(&mw->fmr.fm_mr->list); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); rpcrdma_put_mw(r_xprt, mw); @@ -294,8 +298,9 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) out_reset: pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); - list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) { - list_del_init(&mw->fmr.fm_mr->list); + while (!list_empty(mws)) { + mw = rpcrdma_pop_mw(mws); + list_del(&mw->fmr.fm_mr->list); fmr_op_recover_mr(mw); } } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index f81dd93176c0..6aea36a38bfd 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -277,7 +277,7 @@ __frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) } /** - * frwr_wc_fastreg - Invoked by RDMA provider for each polled FastReg WC + * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC * @cq: completion queue (ignored) * @wc: completed WR * @@ -298,7 +298,7 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) } /** - * frwr_wc_localinv - Invoked by RDMA provider for each polled LocalInv WC + * frwr_wc_localinv - Invoked by RDMA provider for a flushed LocalInv WC * @cq: completion queue (ignored) * @wc: completed WR * @@ -319,7 +319,7 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) } /** - * frwr_wc_localinv - Invoked by RDMA provider for each polled LocalInv WC + * frwr_wc_localinv_wake - Invoked by RDMA provider for a signaled LocalInv WC * @cq: completion queue (ignored) * @wc: completed WR * @@ -355,7 +355,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct ib_mr *mr; struct ib_reg_wr *reg_wr; struct ib_send_wr *bad_wr; - int rc, i, n, dma_nents; + int rc, i, n; u8 key; mw = NULL; @@ -391,14 +391,10 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - mw->mw_nents = i; mw->mw_dir = rpcrdma_data_dir(writing); - if (i == 0) - goto out_dmamap_err; - dma_nents = ib_dma_map_sg(ia->ri_device, - mw->mw_sg, mw->mw_nents, mw->mw_dir); - if (!dma_nents) + mw->mw_nents = ib_dma_map_sg(ia->ri_device, mw->mw_sg, i, mw->mw_dir); + if (!mw->mw_nents) goto out_dmamap_err; n = ib_map_mr_sg(mr, mw->mw_sg, mw->mw_nents, NULL, PAGE_SIZE); @@ -436,13 +432,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, return mw->mw_nents; out_dmamap_err: - pr_err("rpcrdma: failed to dma map sg %p sg_nents %u\n", - mw->mw_sg, mw->mw_nents); - rpcrdma_defer_mr_recovery(mw); + pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", + mw->mw_sg, i); + frmr->fr_state = FRMR_IS_INVALID; + rpcrdma_put_mw(r_xprt, mw); return -EIO; out_mapmr_err: - pr_err("rpcrdma: failed to map mr %p (%u/%u)\n", + pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", frmr->fr_mr, n, mw->mw_nents); rpcrdma_defer_mr_recovery(mw); return -EIO; @@ -458,21 +455,19 @@ out_senderr: * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. * - * Caller ensures that req->rl_registered is not empty. + * Caller ensures that @mws is not empty before the call. This + * function empties the list. 
*/ static void -frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) +frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mws) { struct ib_send_wr *first, **prev, *last, *bad_wr; - struct rpcrdma_rep *rep = req->rl_reply; struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_frmr *f; struct rpcrdma_mw *mw; int count, rc; - dprintk("RPC: %s: req %p\n", __func__, req); - - /* ORDER: Invalidate all of the req's MRs first + /* ORDER: Invalidate all of the MRs first * * Chain the LOCAL_INV Work Requests and post them with * a single ib_post_send() call. @@ -480,11 +475,10 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) f = NULL; count = 0; prev = &first; - list_for_each_entry(mw, &req->rl_registered, mw_list) { + list_for_each_entry(mw, mws, mw_list) { mw->frmr.fr_state = FRMR_IS_INVALID; - if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) && - (mw->mw_handle == rep->rr_inv_rkey)) + if (mw->mw_flags & RPCRDMA_MW_F_RI) continue; f = &mw->frmr; @@ -524,18 +518,19 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless ri_id->qp is a valid pointer. */ r_xprt->rx_stats.local_inv_needed++; + bad_wr = NULL; rc = ib_post_send(ia->ri_id->qp, first, &bad_wr); + if (bad_wr != first) + wait_for_completion(&f->fr_linv_done); if (rc) goto reset_mrs; - wait_for_completion(&f->fr_linv_done); - - /* ORDER: Now DMA unmap all of the req's MRs, and return + /* ORDER: Now DMA unmap all of the MRs, and return * them to the free MW list. */ unmap: - while (!list_empty(&req->rl_registered)) { - mw = rpcrdma_pop_mw(&req->rl_registered); + while (!list_empty(mws)) { + mw = rpcrdma_pop_mw(mws); dprintk("RPC: %s: DMA unmapping frmr %p\n", __func__, &mw->frmr); ib_dma_unmap_sg(ia->ri_device, @@ -546,17 +541,19 @@ unmap: reset_mrs: pr_err("rpcrdma: FRMR invalidate ib_post_send returned %i\n", rc); - rdma_disconnect(ia->ri_id); /* Find and reset the MRs in the LOCAL_INV WRs that did not - * get posted. This is synchronous, and slow. + * get posted. 
*/ - list_for_each_entry(mw, &req->rl_registered, mw_list) { - f = &mw->frmr; - if (mw->mw_handle == bad_wr->ex.invalidate_rkey) { - __frwr_reset_mr(ia, mw); - bad_wr = bad_wr->next; - } + rpcrdma_init_cqcount(&r_xprt->rx_ep, -count); + while (bad_wr) { + f = container_of(bad_wr, struct rpcrdma_frmr, + fr_invwr); + mw = container_of(f, struct rpcrdma_mw, frmr); + + __frwr_reset_mr(ia, mw); + + bad_wr = bad_wr->next; } goto unmap; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 694e9b13ecf0..ca4d6e4528f3 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -141,7 +141,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, if (xdr->page_len) { remaining = xdr->page_len; - offset = xdr->page_base & ~PAGE_MASK; + offset = offset_in_page(xdr->page_base); count = 0; while (remaining) { remaining -= min_t(unsigned int, @@ -222,7 +222,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, len = xdrbuf->page_len; ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); - page_base = xdrbuf->page_base & ~PAGE_MASK; + page_base = offset_in_page(xdrbuf->page_base); p = 0; while (len && n < RPCRDMA_MAX_SEGS) { if (!ppages[p]) { @@ -540,7 +540,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, goto out; page = virt_to_page(xdr->tail[0].iov_base); - page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; + page_base = offset_in_page(xdr->tail[0].iov_base); /* If the content in the page list is an odd length, * xdr_write_pages() has added a pad at the beginning @@ -557,7 +557,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, */ if (xdr->page_len) { ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); - page_base = xdr->page_base & ~PAGE_MASK; + page_base = offset_in_page(xdr->page_base); remaining = xdr->page_len; while (remaining) { sge_no++; @@ -587,7 +587,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, */ if (xdr->tail[0].iov_len) { page = virt_to_page(xdr->tail[0].iov_base); - page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK; + page_base = offset_in_page(xdr->tail[0].iov_base); len = xdr->tail[0].iov_len; map_tail: @@ -734,6 +734,9 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) rpclen = 0; } + req->rl_xid = rqst->rq_xid; + rpcrdma_insert_req(&r_xprt->rx_buf, req); + /* This implementation supports the following combinations * of chunk lists in one RPC-over-RDMA Call message: * @@ -875,9 +878,9 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) srcp += curlen; copy_len -= curlen; - page_base = rqst->rq_rcv_buf.page_base; - ppages = rqst->rq_rcv_buf.pages + (page_base >> PAGE_SHIFT); - page_base &= ~PAGE_MASK; + ppages = rqst->rq_rcv_buf.pages + + (rqst->rq_rcv_buf.page_base >> PAGE_SHIFT); + page_base = offset_in_page(rqst->rq_rcv_buf.page_base); fixup_copy_count = 0; if (copy_len && rqst->rq_rcv_buf.page_len) { int pagelist_len; @@ -928,6 +931,24 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad) return fixup_copy_count; } +/* Caller must guarantee @rep remains stable during this call. 
+ */ +static void +rpcrdma_mark_remote_invalidation(struct list_head *mws, + struct rpcrdma_rep *rep) +{ + struct rpcrdma_mw *mw; + + if (!(rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)) + return; + + list_for_each_entry(mw, mws, mw_list) + if (mw->mw_handle == rep->rr_inv_rkey) { + mw->mw_flags = RPCRDMA_MW_F_RI; + break; /* only one invalidated MR per RPC */ + } +} + #if defined(CONFIG_SUNRPC_BACKCHANNEL) /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. This makes @@ -969,14 +990,16 @@ rpcrdma_reply_handler(struct work_struct *work) { struct rpcrdma_rep *rep = container_of(work, struct rpcrdma_rep, rr_work); + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rpcrdma_msg *headerp; struct rpcrdma_req *req; struct rpc_rqst *rqst; - struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; - struct rpc_xprt *xprt = &r_xprt->rx_xprt; __be32 *iptr; int rdmalen, status, rmerr; unsigned long cwnd; + struct list_head mws; dprintk("RPC: %s: incoming rep %p\n", __func__, rep); @@ -994,27 +1017,45 @@ rpcrdma_reply_handler(struct work_struct *work) /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. */ - spin_lock_bh(&xprt->transport_lock); - rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); - if (!rqst) + spin_lock(&buf->rb_lock); + req = rpcrdma_lookup_req_locked(&r_xprt->rx_buf, + headerp->rm_xid); + if (!req) goto out_nomatch; - - req = rpcr_to_rdmar(rqst); if (req->rl_reply) goto out_duplicate; - /* Sanity checking has passed. We are now committed - * to complete this transaction. + list_replace_init(&req->rl_registered, &mws); + rpcrdma_mark_remote_invalidation(&mws, rep); + + /* Avoid races with signals and duplicate replies + * by marking this req as matched. */ - list_del_init(&rqst->rq_list); - spin_unlock_bh(&xprt->transport_lock); + req->rl_reply = rep; + spin_unlock(&buf->rb_lock); + dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n", __func__, rep, req, be32_to_cpu(headerp->rm_xid)); - /* from here on, the reply is no longer an orphan */ - req->rl_reply = rep; - xprt->reestablish_timeout = 0; + /* Invalidate and unmap the data payloads before waking the + * waiting application. This guarantees the memory regions + * are properly fenced from the server before the application + * accesses the data. It also ensures proper send flow control: + * waking the next RPC waits until this RPC has relinquished + * all its Send Queue entries. + */ + if (!list_empty(&mws)) + r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, &mws); + /* Perform XID lookup, reconstruction of the RPC reply, and + * RPC completion while holding the transport lock to ensure + * the rep, rqst, and rq_task pointers remain stable. 
+ */ + spin_lock_bh(&xprt->transport_lock); + rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); + if (!rqst) + goto out_norqst; + xprt->reestablish_timeout = 0; if (headerp->rm_vers != rpcrdma_version) goto out_badversion; @@ -1024,12 +1065,9 @@ rpcrdma_reply_handler(struct work_struct *work) case rdma_msg: /* never expect read chunks */ /* never expect reply chunks (two ways to check) */ - /* never expect write chunks without having offered RDMA */ if (headerp->rm_body.rm_chunks[0] != xdr_zero || (headerp->rm_body.rm_chunks[1] == xdr_zero && - headerp->rm_body.rm_chunks[2] != xdr_zero) || - (headerp->rm_body.rm_chunks[1] != xdr_zero && - list_empty(&req->rl_registered))) + headerp->rm_body.rm_chunks[2] != xdr_zero)) goto badheader; if (headerp->rm_body.rm_chunks[1] != xdr_zero) { /* count any expected write chunks in read reply */ @@ -1066,8 +1104,7 @@ rpcrdma_reply_handler(struct work_struct *work) /* never expect read or write chunks, always reply chunks */ if (headerp->rm_body.rm_chunks[0] != xdr_zero || headerp->rm_body.rm_chunks[1] != xdr_zero || - headerp->rm_body.rm_chunks[2] != xdr_one || - list_empty(&req->rl_registered)) + headerp->rm_body.rm_chunks[2] != xdr_one) goto badheader; iptr = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); @@ -1093,17 +1130,6 @@ badheader: } out: - /* Invalidate and flush the data payloads before waking the - * waiting application. This guarantees the memory region is - * properly fenced from the server before the application - * accesses the data. It also ensures proper send flow - * control: waking the next RPC waits until this RPC has - * relinquished all its Send Queue entries. - */ - if (!list_empty(&req->rl_registered)) - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req); - - spin_lock_bh(&xprt->transport_lock); cwnd = xprt->cwnd; xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT; if (xprt->cwnd > cwnd) @@ -1112,7 +1138,7 @@ out: xprt_complete_rqst(rqst->rq_task, status); spin_unlock_bh(&xprt->transport_lock); dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", - __func__, xprt, rqst, status); + __func__, xprt, rqst, status); return; out_badstatus: @@ -1161,26 +1187,37 @@ out_rdmaerr: r_xprt->rx_stats.bad_reply_count++; goto out; -/* If no pending RPC transaction was matched, post a replacement - * receive buffer before returning. +/* The req was still available, but by the time the transport_lock + * was acquired, the rqst and task had been released. Thus the RPC + * has already been terminated. */ +out_norqst: + spin_unlock_bh(&xprt->transport_lock); + rpcrdma_buffer_put(req); + dprintk("RPC: %s: race, no rqst left for req %p\n", + __func__, req); + return; + out_shortreply: dprintk("RPC: %s: short/invalid reply\n", __func__); goto repost; out_nomatch: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&buf->rb_lock); dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n", __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len); goto repost; out_duplicate: - spin_unlock_bh(&xprt->transport_lock); + spin_unlock(&buf->rb_lock); dprintk("RPC: %s: " "duplicate reply %p to RPC request %p: xid 0x%08x\n", __func__, rep, req, be32_to_cpu(headerp->rm_xid)); +/* If no pending RPC transaction was matched, post a replacement + * receive buffer before returning. 
+ */ repost: r_xprt->rx_stats.bad_reply_count++; if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep)) diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c deleted file mode 100644 index bdcf7d85a3dc..000000000000 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) 2016 Oracle. All rights reserved. - * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the BSD-type - * license below: - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials provided - * with the distribution. - * - * Neither the name of the Network Appliance, Inc. nor the names of - * its contributors may be used to endorse or promote products - * derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * Author: Tom Tucker <tom@opengridcomputing.com> - */ - -#include <linux/sunrpc/xdr.h> -#include <linux/sunrpc/debug.h> -#include <asm/unaligned.h> -#include <linux/sunrpc/rpc_rdma.h> -#include <linux/sunrpc/svc_rdma.h> - -#define RPCDBG_FACILITY RPCDBG_SVCXPRT - -static __be32 *xdr_check_read_list(__be32 *p, __be32 *end) -{ - __be32 *next; - - while (*p++ != xdr_zero) { - next = p + rpcrdma_readchunk_maxsz - 1; - if (next > end) - return NULL; - p = next; - } - return p; -} - -static __be32 *xdr_check_write_list(__be32 *p, __be32 *end) -{ - __be32 *next; - - while (*p++ != xdr_zero) { - next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; - if (next > end) - return NULL; - p = next; - } - return p; -} - -static __be32 *xdr_check_reply_chunk(__be32 *p, __be32 *end) -{ - __be32 *next; - - if (*p++ != xdr_zero) { - next = p + 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; - if (next > end) - return NULL; - p = next; - } - return p; -} - -/** - * svc_rdma_xdr_decode_req - Parse incoming RPC-over-RDMA header - * @rq_arg: Receive buffer - * - * On entry, xdr->head[0].iov_base points to first byte in the - * RPC-over-RDMA header. - * - * On successful exit, head[0] points to first byte past the - * RPC-over-RDMA header. 
For RDMA_MSG, this is the RPC message. - * The length of the RPC-over-RDMA header is returned. - */ -int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) -{ - __be32 *p, *end, *rdma_argp; - unsigned int hdr_len; - - /* Verify that there's enough bytes for header + something */ - if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) - goto out_short; - - rdma_argp = rq_arg->head[0].iov_base; - if (*(rdma_argp + 1) != rpcrdma_version) - goto out_version; - - switch (*(rdma_argp + 3)) { - case rdma_msg: - case rdma_nomsg: - break; - - case rdma_done: - goto out_drop; - - case rdma_error: - goto out_drop; - - default: - goto out_proc; - } - - end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); - p = xdr_check_read_list(rdma_argp + 4, end); - if (!p) - goto out_inval; - p = xdr_check_write_list(p, end); - if (!p) - goto out_inval; - p = xdr_check_reply_chunk(p, end); - if (!p) - goto out_inval; - if (p > end) - goto out_inval; - - rq_arg->head[0].iov_base = p; - hdr_len = (unsigned long)p - (unsigned long)rdma_argp; - rq_arg->head[0].iov_len -= hdr_len; - return hdr_len; - -out_short: - dprintk("svcrdma: header too short = %d\n", rq_arg->len); - return -EINVAL; - -out_version: - dprintk("svcrdma: bad xprt version: %u\n", - be32_to_cpup(rdma_argp + 1)); - return -EPROTONOSUPPORT; - -out_drop: - dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n"); - return 0; - -out_proc: - dprintk("svcrdma: bad rdma procedure (%u)\n", - be32_to_cpup(rdma_argp + 3)); - return -EINVAL; - -out_inval: - dprintk("svcrdma: failed to parse transport header\n"); - return -EINVAL; -} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 27a99bf5b1a6..ad4bd62eebf1 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 2016, 2017 Oracle. All rights reserved. * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved. * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved. * @@ -40,12 +41,66 @@ * Author: Tom Tucker <tom@opengridcomputing.com> */ -#include <linux/sunrpc/debug.h> -#include <linux/sunrpc/rpc_rdma.h> -#include <linux/spinlock.h> +/* Operation + * + * The main entry point is svc_rdma_recvfrom. This is called from + * svc_recv when the transport indicates there is incoming data to + * be read. "Data Ready" is signaled when an RDMA Receive completes, + * or when a set of RDMA Reads complete. + * + * An svc_rqst is passed in. This structure contains an array of + * free pages (rq_pages) that will contain the incoming RPC message. + * + * Short messages are moved directly into svc_rqst::rq_arg, and + * the RPC Call is ready to be processed by the Upper Layer. + * svc_rdma_recvfrom returns the length of the RPC Call message, + * completing the reception of the RPC Call. + * + * However, when an incoming message has Read chunks, + * svc_rdma_recvfrom must post RDMA Reads to pull the RPC Call's + * data payload from the client. svc_rdma_recvfrom sets up the + * RDMA Reads using pages in svc_rqst::rq_pages, which are + * transferred to an svc_rdma_op_ctxt for the duration of the + * I/O. svc_rdma_recvfrom then returns zero, since the RPC message + * is still not yet ready. + * + * When the Read chunk payloads have become available on the + * server, "Data Ready" is raised again, and svc_recv calls + * svc_rdma_recvfrom again. 
This second call may use a different + * svc_rqst than the first one, thus any information that needs + * to be preserved across these two calls is kept in an + * svc_rdma_op_ctxt. + * + * The second call to svc_rdma_recvfrom performs final assembly + * of the RPC Call message, using the RDMA Read sink pages kept in + * the svc_rdma_op_ctxt. The xdr_buf is copied from the + * svc_rdma_op_ctxt to the second svc_rqst. The second call returns + * the length of the completed RPC Call message. + * + * Page Management + * + * Pages under I/O must be transferred from the first svc_rqst to an + * svc_rdma_op_ctxt before the first svc_rdma_recvfrom call returns. + * + * The first svc_rqst supplies pages for RDMA Reads. These are moved + * from rqstp::rq_pages into ctxt::pages. The consumed elements of + * the rq_pages array are set to NULL and refilled with the first + * svc_rdma_recvfrom call returns. + * + * During the second svc_rdma_recvfrom call, RDMA Read sink pages + * are transferred from the svc_rdma_op_ctxt to the second svc_rqst + * (see rdma_read_complete() below). + */ + #include <asm/unaligned.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> + +#include <linux/spinlock.h> + +#include <linux/sunrpc/xdr.h> +#include <linux/sunrpc/debug.h> +#include <linux/sunrpc/rpc_rdma.h> #include <linux/sunrpc/svc_rdma.h> #define RPCDBG_FACILITY RPCDBG_SVCXPRT @@ -59,7 +114,6 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *ctxt, u32 byte_count) { - struct rpcrdma_msg *rmsgp; struct page *page; u32 bc; int sge_no; @@ -83,20 +137,12 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.page_len = bc; rqstp->rq_arg.page_base = 0; - /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ - rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; - if (rmsgp->rm_type == rdma_nomsg) - rqstp->rq_arg.pages = &rqstp->rq_pages[0]; - else - rqstp->rq_arg.pages = &rqstp->rq_pages[1]; - sge_no = 1; while (bc && sge_no < ctxt->count) { page = ctxt->pages[sge_no]; put_page(rqstp->rq_pages[sge_no]); rqstp->rq_pages[sge_no] = page; bc -= min_t(u32, bc, ctxt->sge[sge_no].length); - rqstp->rq_arg.buflen += ctxt->sge[sge_no].length; sge_no++; } rqstp->rq_respages = &rqstp->rq_pages[sge_no]; @@ -115,406 +161,208 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0].iov_len = 0; } -/* Issue an RDMA_READ using the local lkey to map the data sink */ -int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - bool last) -{ - struct ib_rdma_wr read_wr; - int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; - struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); - int ret, read, pno; - u32 pg_off = *page_offset; - u32 pg_no = *page_no; - - ctxt->direction = DMA_FROM_DEVICE; - ctxt->read_hdr = head; - pages_needed = min_t(int, pages_needed, xprt->sc_max_sge_rd); - read = min_t(int, (pages_needed << PAGE_SHIFT) - *page_offset, - rs_length); - - for (pno = 0; pno < pages_needed; pno++) { - int len = min_t(int, rs_length, PAGE_SIZE - pg_off); - - head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; - head->arg.page_len += len; - - head->arg.len += len; - if (!pg_off) - head->count++; - rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - ctxt->sge[pno].addr = - ib_dma_map_page(xprt->sc_cm_id->device, - 
head->arg.pages[pg_no], pg_off, - PAGE_SIZE - pg_off, - DMA_FROM_DEVICE); - ret = ib_dma_mapping_error(xprt->sc_cm_id->device, - ctxt->sge[pno].addr); - if (ret) - goto err; - svc_rdma_count_mappings(xprt, ctxt); - - ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->sge[pno].length = len; - ctxt->count++; - - /* adjust offset and wrap to next page if needed */ - pg_off += len; - if (pg_off == PAGE_SIZE) { - pg_off = 0; - pg_no++; - } - rs_length -= len; - } - - if (last && rs_length == 0) - set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - else - clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - - memset(&read_wr, 0, sizeof(read_wr)); - ctxt->cqe.done = svc_rdma_wc_read; - read_wr.wr.wr_cqe = &ctxt->cqe; - read_wr.wr.opcode = IB_WR_RDMA_READ; - read_wr.wr.send_flags = IB_SEND_SIGNALED; - read_wr.rkey = rs_handle; - read_wr.remote_addr = rs_offset; - read_wr.wr.sg_list = ctxt->sge; - read_wr.wr.num_sge = pages_needed; - - ret = svc_rdma_send(xprt, &read_wr.wr); - if (ret) { - pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - goto err; - } +/* This accommodates the largest possible Write chunk, + * in one segment. + */ +#define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) - /* return current location in page array */ - *page_no = pg_no; - *page_offset = pg_off; - ret = read; - atomic_inc(&rdma_stat_read); - return ret; - err: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - return ret; -} +/* This accommodates the largest possible Position-Zero + * Read chunk or Reply chunk, in one segment. + */ +#define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) -/* Issue an RDMA_READ using an FRMR to map the data sink */ -int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head, - int *page_no, - u32 *page_offset, - u32 rs_handle, - u32 rs_length, - u64 rs_offset, - bool last) +/* Sanity check the Read list. + * + * Implementation limits: + * - This implementation supports only one Read chunk. + * + * Sanity checks: + * - Read list does not overflow buffer. + * - Segment size limited by largest NFS data payload. + * + * The segment count is limited to how many segments can + * fit in the transport header without overflowing the + * buffer. That's about 40 Read segments for a 1KB inline + * threshold. + * + * Returns pointer to the following Write list. 
+ */ +static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end) { - struct ib_rdma_wr read_wr; - struct ib_send_wr inv_wr; - struct ib_reg_wr reg_wr; - u8 key; - int nents = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT; - struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt); - struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt); - int ret, read, pno, dma_nents, n; - u32 pg_off = *page_offset; - u32 pg_no = *page_no; - - if (IS_ERR(frmr)) - return -ENOMEM; - - ctxt->direction = DMA_FROM_DEVICE; - ctxt->frmr = frmr; - nents = min_t(unsigned int, nents, xprt->sc_frmr_pg_list_len); - read = min_t(int, (nents << PAGE_SHIFT) - *page_offset, rs_length); - - frmr->direction = DMA_FROM_DEVICE; - frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE); - frmr->sg_nents = nents; - - for (pno = 0; pno < nents; pno++) { - int len = min_t(int, rs_length, PAGE_SIZE - pg_off); - - head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; - head->arg.page_len += len; - head->arg.len += len; - if (!pg_off) - head->count++; - - sg_set_page(&frmr->sg[pno], rqstp->rq_arg.pages[pg_no], - len, pg_off); - - rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - - /* adjust offset and wrap to next page if needed */ - pg_off += len; - if (pg_off == PAGE_SIZE) { - pg_off = 0; - pg_no++; + u32 position; + bool first; + + first = true; + while (*p++ != xdr_zero) { + if (first) { + position = be32_to_cpup(p++); + first = false; + } else if (be32_to_cpup(p++) != position) { + return NULL; } - rs_length -= len; - } + p++; /* handle */ + if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG) + return NULL; + p += 2; /* offset */ - if (last && rs_length == 0) - set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - else - clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - - dma_nents = ib_dma_map_sg(xprt->sc_cm_id->device, - frmr->sg, frmr->sg_nents, - frmr->direction); - if (!dma_nents) { - pr_err("svcrdma: failed to dma map sg %p\n", - frmr->sg); - return -ENOMEM; + if (p > end) + return NULL; } + return p; +} - n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE); - if (unlikely(n != frmr->sg_nents)) { - pr_err("svcrdma: failed to map mr %p (%d/%d elements)\n", - frmr->mr, n, frmr->sg_nents); - return n < 0 ? n : -EINVAL; - } +/* The segment count is limited to how many segments can + * fit in the transport header without overflowing the + * buffer. That's about 60 Write segments for a 1KB inline + * threshold. 
+ */ +static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end, + u32 maxlen) +{ + u32 i, segcount; - /* Bump the key */ - key = (u8)(frmr->mr->lkey & 0x000000FF); - ib_update_fast_reg_key(frmr->mr, ++key); - - ctxt->sge[0].addr = frmr->mr->iova; - ctxt->sge[0].lkey = frmr->mr->lkey; - ctxt->sge[0].length = frmr->mr->length; - ctxt->count = 1; - ctxt->read_hdr = head; - - /* Prepare REG WR */ - ctxt->reg_cqe.done = svc_rdma_wc_reg; - reg_wr.wr.wr_cqe = &ctxt->reg_cqe; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.send_flags = IB_SEND_SIGNALED; - reg_wr.wr.num_sge = 0; - reg_wr.mr = frmr->mr; - reg_wr.key = frmr->mr->lkey; - reg_wr.access = frmr->access_flags; - reg_wr.wr.next = &read_wr.wr; - - /* Prepare RDMA_READ */ - memset(&read_wr, 0, sizeof(read_wr)); - ctxt->cqe.done = svc_rdma_wc_read; - read_wr.wr.wr_cqe = &ctxt->cqe; - read_wr.wr.send_flags = IB_SEND_SIGNALED; - read_wr.rkey = rs_handle; - read_wr.remote_addr = rs_offset; - read_wr.wr.sg_list = ctxt->sge; - read_wr.wr.num_sge = 1; - if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) { - read_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; - read_wr.wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey; - } else { - read_wr.wr.opcode = IB_WR_RDMA_READ; - read_wr.wr.next = &inv_wr; - /* Prepare invalidate */ - memset(&inv_wr, 0, sizeof(inv_wr)); - ctxt->inv_cqe.done = svc_rdma_wc_inv; - inv_wr.wr_cqe = &ctxt->inv_cqe; - inv_wr.opcode = IB_WR_LOCAL_INV; - inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE; - inv_wr.ex.invalidate_rkey = frmr->mr->lkey; - } + segcount = be32_to_cpup(p++); + for (i = 0; i < segcount; i++) { + p++; /* handle */ + if (be32_to_cpup(p++) > maxlen) + return NULL; + p += 2; /* offset */ - /* Post the chain */ - ret = svc_rdma_send(xprt, ®_wr.wr); - if (ret) { - pr_err("svcrdma: Error %d posting RDMA_READ\n", ret); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - goto err; + if (p > end) + return NULL; } - /* return current location in page array */ - *page_no = pg_no; - *page_offset = pg_off; - ret = read; - atomic_inc(&rdma_stat_read); - return ret; - err: - svc_rdma_put_context(ctxt, 0); - svc_rdma_put_frmr(xprt, frmr); - return ret; -} - -static unsigned int -rdma_rcl_chunk_count(struct rpcrdma_read_chunk *ch) -{ - unsigned int count; - - for (count = 0; ch->rc_discrim != xdr_zero; ch++) - count++; - return count; + return p; } -/* If there was additional inline content, append it to the end of arg.pages. - * Tail copy has to be done after the reader function has determined how many - * pages are needed for RDMA READ. +/* Sanity check the Write list. + * + * Implementation limits: + * - This implementation supports only one Write chunk. + * + * Sanity checks: + * - Write list does not overflow buffer. + * - Segment size limited by largest NFS data payload. + * + * Returns pointer to the following Reply chunk. 
*/ -static int -rdma_copy_tail(struct svc_rqst *rqstp, struct svc_rdma_op_ctxt *head, - u32 position, u32 byte_count, u32 page_offset, int page_no) +static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end) { - char *srcp, *destp; - - srcp = head->arg.head[0].iov_base + position; - byte_count = head->arg.head[0].iov_len - position; - if (byte_count > PAGE_SIZE) { - dprintk("svcrdma: large tail unsupported\n"); - return 0; - } - - /* Fit as much of the tail on the current page as possible */ - if (page_offset != PAGE_SIZE) { - destp = page_address(rqstp->rq_arg.pages[page_no]); - destp += page_offset; - while (byte_count--) { - *destp++ = *srcp++; - page_offset++; - if (page_offset == PAGE_SIZE && byte_count) - goto more; - } - goto done; + u32 chcount; + + chcount = 0; + while (*p++ != xdr_zero) { + p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG); + if (!p) + return NULL; + if (chcount++ > 1) + return NULL; } - -more: - /* Fit the rest on the next page */ - page_no++; - destp = page_address(rqstp->rq_arg.pages[page_no]); - while (byte_count--) - *destp++ = *srcp++; - - rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; - rqstp->rq_next_page = rqstp->rq_respages + 1; - -done: - byte_count = head->arg.head[0].iov_len - position; - head->arg.page_len += byte_count; - head->arg.len += byte_count; - head->arg.buflen += byte_count; - return 1; + return p; } -/* Returns the address of the first read chunk or <nul> if no read chunk - * is present +/* Sanity check the Reply chunk. + * + * Sanity checks: + * - Reply chunk does not overflow buffer. + * - Segment size limited by largest NFS data payload. + * + * Returns pointer to the following RPC header. */ -static struct rpcrdma_read_chunk * -svc_rdma_get_read_chunk(struct rpcrdma_msg *rmsgp) +static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) { - struct rpcrdma_read_chunk *ch = - (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - - if (ch->rc_discrim == xdr_zero) - return NULL; - return ch; + if (*p++ != xdr_zero) { + p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG); + if (!p) + return NULL; + } + return p; } -static int rdma_read_chunks(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rmsgp, - struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head) +/* On entry, xdr->head[0].iov_base points to first byte in the + * RPC-over-RDMA header. + * + * On successful exit, head[0] points to first byte past the + * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message. + * The length of the RPC-over-RDMA header is returned. + * + * Assumptions: + * - The transport header is entirely contained in the head iovec. 
+ */ +static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) { - int page_no, ret; - struct rpcrdma_read_chunk *ch; - u32 handle, page_offset, byte_count; - u32 position; - u64 rs_offset; - bool last; - - /* If no read list is present, return 0 */ - ch = svc_rdma_get_read_chunk(rmsgp); - if (!ch) - return 0; + __be32 *p, *end, *rdma_argp; + unsigned int hdr_len; + char *proc; + + /* Verify that there's enough bytes for header + something */ + if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) + goto out_short; + + rdma_argp = rq_arg->head[0].iov_base; + if (*(rdma_argp + 1) != rpcrdma_version) + goto out_version; + + switch (*(rdma_argp + 3)) { + case rdma_msg: + proc = "RDMA_MSG"; + break; + case rdma_nomsg: + proc = "RDMA_NOMSG"; + break; + + case rdma_done: + goto out_drop; - if (rdma_rcl_chunk_count(ch) > RPCSVC_MAXPAGES) - return -EINVAL; - - /* The request is completed when the RDMA_READs complete. The - * head context keeps all the pages that comprise the - * request. - */ - head->arg.head[0] = rqstp->rq_arg.head[0]; - head->arg.tail[0] = rqstp->rq_arg.tail[0]; - head->hdr_count = head->count; - head->arg.page_base = 0; - head->arg.page_len = 0; - head->arg.len = rqstp->rq_arg.len; - head->arg.buflen = rqstp->rq_arg.buflen; - - /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ - position = be32_to_cpu(ch->rc_position); - if (position == 0) { - head->arg.pages = &head->pages[0]; - page_offset = head->byte_len; - } else { - head->arg.pages = &head->pages[head->count]; - page_offset = 0; - } + case rdma_error: + goto out_drop; - ret = 0; - page_no = 0; - for (; ch->rc_discrim != xdr_zero; ch++) { - if (be32_to_cpu(ch->rc_position) != position) - goto err; - - handle = be32_to_cpu(ch->rc_target.rs_handle), - byte_count = be32_to_cpu(ch->rc_target.rs_length); - xdr_decode_hyper((__be32 *)&ch->rc_target.rs_offset, - &rs_offset); - - while (byte_count > 0) { - last = (ch + 1)->rc_discrim == xdr_zero; - ret = xprt->sc_reader(xprt, rqstp, head, - &page_no, &page_offset, - handle, byte_count, - rs_offset, last); - if (ret < 0) - goto err; - byte_count -= ret; - rs_offset += ret; - head->arg.buflen += ret; - } + default: + goto out_proc; } - /* Read list may need XDR round-up (see RFC 5666, s. 3.7) */ - if (page_offset & 3) { - u32 pad = 4 - (page_offset & 3); - - head->arg.tail[0].iov_len += pad; - head->arg.len += pad; - head->arg.buflen += pad; - page_offset += pad; - } + end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); + p = xdr_check_read_list(rdma_argp + 4, end); + if (!p) + goto out_inval; + p = xdr_check_write_list(p, end); + if (!p) + goto out_inval; + p = xdr_check_reply_chunk(p, end); + if (!p) + goto out_inval; + if (p > end) + goto out_inval; + + rq_arg->head[0].iov_base = p; + hdr_len = (unsigned long)p - (unsigned long)rdma_argp; + rq_arg->head[0].iov_len -= hdr_len; + rq_arg->len -= hdr_len; + dprintk("svcrdma: received %s request for XID 0x%08x, hdr_len=%u\n", + proc, be32_to_cpup(rdma_argp), hdr_len); + return hdr_len; + +out_short: + dprintk("svcrdma: header too short = %d\n", rq_arg->len); + return -EINVAL; + +out_version: + dprintk("svcrdma: bad xprt version: %u\n", + be32_to_cpup(rdma_argp + 1)); + return -EPROTONOSUPPORT; - ret = 1; - if (position && position < head->arg.head[0].iov_len) - ret = rdma_copy_tail(rqstp, head, position, - byte_count, page_offset, page_no); - head->arg.head[0].iov_len = position; - head->position = position; +out_drop: + dprintk("svcrdma: dropping RDMA_DONE/ERROR message\n"); + return 0; - err: - /* Detach arg pages. 
svc_recv will replenish them */ - for (page_no = 0; - &rqstp->rq_pages[page_no] < rqstp->rq_respages; page_no++) - rqstp->rq_pages[page_no] = NULL; +out_proc: + dprintk("svcrdma: bad rdma procedure (%u)\n", + be32_to_cpup(rdma_argp + 3)); + return -EINVAL; - return ret; +out_inval: + dprintk("svcrdma: failed to parse transport header\n"); + return -EINVAL; } static void rdma_read_complete(struct svc_rqst *rqstp, @@ -528,24 +376,9 @@ static void rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_pages[page_no] = head->pages[page_no]; } - /* Adjustments made for RDMA_NOMSG type requests */ - if (head->position == 0) { - if (head->arg.len <= head->sge[0].length) { - head->arg.head[0].iov_len = head->arg.len - - head->byte_len; - head->arg.page_len = 0; - } else { - head->arg.head[0].iov_len = head->sge[0].length - - head->byte_len; - head->arg.page_len = head->arg.len - - head->sge[0].length; - } - } - /* Point rq_arg.pages past header */ rqstp->rq_arg.pages = &rqstp->rq_pages[head->hdr_count]; rqstp->rq_arg.page_len = head->arg.page_len; - rqstp->rq_arg.page_base = head->arg.page_base; /* rq_respages starts after the last arg page */ rqstp->rq_respages = &rqstp->rq_pages[page_no]; @@ -642,21 +475,44 @@ static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, return true; } -/* - * Set up the rqstp thread context to point to the RQ buffer. If - * necessary, pull additional data from the client with an RDMA_READ - * request. +/** + * svc_rdma_recvfrom - Receive an RPC call + * @rqstp: request structure into which to receive an RPC Call + * + * Returns: + * The positive number of bytes in the RPC Call message, + * %0 if there were no Calls ready to return, + * %-EINVAL if the Read chunk data is too large, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + * + * Called in a loop when XPT_DATA is set. XPT_DATA is cleared only + * when there are no remaining ctxt's to process. + * + * The next ctxt is removed from the "receive" lists. + * + * - If the ctxt completes a Read, then finish assembling the Call + * message and return the number of bytes in the message. + * + * - If the ctxt completes a Receive, then construct the Call + * message from the contents of the Receive buffer. + * + * - If there are no Read chunks in this message, then finish + * assembling the Call message and return the number of bytes + * in the message. + * + * - If there are Read chunks in this message, post Read WRs to + * pull that payload and return 0. */ int svc_rdma_recvfrom(struct svc_rqst *rqstp) { struct svc_xprt *xprt = rqstp->rq_xprt; struct svcxprt_rdma *rdma_xprt = container_of(xprt, struct svcxprt_rdma, sc_xprt); - struct svc_rdma_op_ctxt *ctxt = NULL; - struct rpcrdma_msg *rmsgp; - int ret = 0; - - dprintk("svcrdma: rqstp=%p\n", rqstp); + struct svc_rdma_op_ctxt *ctxt; + __be32 *p; + int ret; spin_lock(&rdma_xprt->sc_rq_dto_lock); if (!list_empty(&rdma_xprt->sc_read_complete_q)) { @@ -671,22 +527,14 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) struct svc_rdma_op_ctxt, list); list_del(&ctxt->list); } else { - atomic_inc(&rdma_stat_rq_starve); + /* No new incoming requests, terminate the loop */ clear_bit(XPT_DATA, &xprt->xpt_flags); - ctxt = NULL; + spin_unlock(&rdma_xprt->sc_rq_dto_lock); + return 0; } spin_unlock(&rdma_xprt->sc_rq_dto_lock); - if (!ctxt) { - /* This is the EAGAIN path. 
The svc_recv routine will - * return -EAGAIN, the nfsd thread will go to call into - * svc_recv again and we shouldn't be on the active - * transport list - */ - if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) - goto defer; - goto out; - } - dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p\n", + + dprintk("svcrdma: recvfrom: ctxt=%p on xprt=%p, rqstp=%p\n", ctxt, rdma_xprt, rqstp); atomic_inc(&rdma_stat_recv); @@ -694,7 +542,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len); /* Decode the RDMA header. */ - rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; + p = (__be32 *)rqstp->rq_arg.head[0].iov_base; ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); if (ret < 0) goto out_err; @@ -702,9 +550,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_drop; rqstp->rq_xprt_hlen = ret; - if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) { - ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, - &rmsgp->rm_xid, + if (svc_rdma_is_backchannel_reply(xprt, p)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, p, &rqstp->rq_arg); svc_rdma_put_context(ctxt, 0); if (ret) @@ -712,39 +559,34 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) return ret; } - /* Read read-list data. */ - ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); - if (ret > 0) { - /* read-list posted, defer until data received from client. */ - goto defer; - } else if (ret < 0) { - /* Post of read-list failed, free context. */ - svc_rdma_put_context(ctxt, 1); - return 0; - } + p += rpcrdma_fixed_maxsz; + if (*p != xdr_zero) + goto out_readchunk; complete: - ret = rqstp->rq_arg.head[0].iov_len - + rqstp->rq_arg.page_len - + rqstp->rq_arg.tail[0].iov_len; svc_rdma_put_context(ctxt, 0); - out: - dprintk("svcrdma: ret=%d, rq_arg.len=%u, " - "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zd\n", - ret, rqstp->rq_arg.len, - rqstp->rq_arg.head[0].iov_base, - rqstp->rq_arg.head[0].iov_len); + dprintk("svcrdma: recvfrom: xprt=%p, rqstp=%p, rq_arg.len=%u\n", + rdma_xprt, rqstp, rqstp->rq_arg.len); rqstp->rq_prot = IPPROTO_MAX; svc_xprt_copy_addrs(rqstp, xprt); - return ret; + return rqstp->rq_arg.len; + +out_readchunk: + ret = svc_rdma_recv_read_chunk(rdma_xprt, rqstp, ctxt, p); + if (ret < 0) + goto out_postfail; + return 0; out_err: - svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret); + svc_rdma_send_error(rdma_xprt, p, ret); svc_rdma_put_context(ctxt, 0); return 0; -defer: - return 0; +out_postfail: + if (ret == -EINVAL) + svc_rdma_send_error(rdma_xprt, p, ret); + svc_rdma_put_context(ctxt, 1); + return ret; out_drop: svc_rdma_put_context(ctxt, 1); diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 0cf620277693..933f79bed270 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -12,6 +12,9 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc); +static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); + /* Each R/W context contains state for one chain of RDMA Read or * Write Work Requests. 
* @@ -113,22 +116,20 @@ struct svc_rdma_chunk_ctxt { struct svcxprt_rdma *cc_rdma; struct list_head cc_rwctxts; int cc_sqecount; - enum dma_data_direction cc_dir; }; static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, - struct svc_rdma_chunk_ctxt *cc, - enum dma_data_direction dir) + struct svc_rdma_chunk_ctxt *cc) { cc->cc_rdma = rdma; svc_xprt_get(&rdma->sc_xprt); INIT_LIST_HEAD(&cc->cc_rwctxts); cc->cc_sqecount = 0; - cc->cc_dir = dir; } -static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc) +static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir) { struct svcxprt_rdma *rdma = cc->cc_rdma; struct svc_rdma_rw_ctxt *ctxt; @@ -138,7 +139,7 @@ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc) rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, ctxt->rw_sg_table.sgl, - ctxt->rw_nents, cc->cc_dir); + ctxt->rw_nents, dir); svc_rdma_put_rw_ctxt(rdma, ctxt); } svc_xprt_put(&rdma->sc_xprt); @@ -176,13 +177,14 @@ svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) info->wi_seg_no = 0; info->wi_nsegs = be32_to_cpup(++chunk); info->wi_segs = ++chunk; - svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE); + svc_rdma_cc_init(rdma, &info->wi_cc); + info->wi_cc.cc_cqe.done = svc_rdma_write_done; return info; } static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) { - svc_rdma_cc_release(&info->wi_cc); + svc_rdma_cc_release(&info->wi_cc, DMA_TO_DEVICE); kfree(info); } @@ -216,6 +218,76 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) svc_rdma_write_info_free(info); } +/* State for pulling a Read chunk. + */ +struct svc_rdma_read_info { + struct svc_rdma_op_ctxt *ri_readctxt; + unsigned int ri_position; + unsigned int ri_pageno; + unsigned int ri_pageoff; + unsigned int ri_chunklen; + + struct svc_rdma_chunk_ctxt ri_cc; +}; + +static struct svc_rdma_read_info * +svc_rdma_read_info_alloc(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_read_info *info; + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return info; + + svc_rdma_cc_init(rdma, &info->ri_cc); + info->ri_cc.cc_cqe.done = svc_rdma_wc_read_done; + return info; +} + +static void svc_rdma_read_info_free(struct svc_rdma_read_info *info) +{ + svc_rdma_cc_release(&info->ri_cc, DMA_FROM_DEVICE); + kfree(info); +} + +/** + * svc_rdma_wc_read_done - Handle completion of an RDMA Read ctx + * @cq: controlling Completion Queue + * @wc: Work Completion + * + */ +static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_read_info *info = + container_of(cc, struct svc_rdma_read_info, ri_cc); + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: read ctx: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + svc_rdma_put_context(info->ri_readctxt, 1); + } else { + spin_lock(&rdma->sc_rq_dto_lock); + list_add_tail(&info->ri_readctxt->list, + &rdma->sc_read_complete_q); + spin_unlock(&rdma->sc_rq_dto_lock); + + set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags); + svc_xprt_enqueue(&rdma->sc_xprt); + } + + svc_rdma_read_info_free(info); +} + /* This function sleeps when the transport's Send Queue is congested. 
* * Assumptions: @@ -232,6 +304,9 @@ static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) struct ib_cqe *cqe; int ret; + if (cc->cc_sqecount > rdma->sc_sq_depth) + return -EINVAL; + first_wr = NULL; cqe = &cc->cc_cqe; list_for_each(tmp, &cc->cc_rwctxts) { @@ -295,8 +370,9 @@ static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, struct scatterlist *sg; struct page **page; - page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK; - page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT; + page_off = info->wi_next_off + xdr->page_base; + page_no = page_off >> PAGE_SHIFT; + page_off = offset_in_page(page_off); page = xdr->pages + page_no; info->wi_next_off += remaining; sg = ctxt->rw_sg_table.sgl; @@ -332,7 +408,6 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, __be32 *seg; int ret; - cc->cc_cqe.done = svc_rdma_write_done; seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; do { unsigned int write_len; @@ -425,6 +500,7 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, * * Returns a non-negative number of bytes the chunk consumed, or * %-E2BIG if the payload was larger than the Write chunk, + * %-EINVAL if client provided too many segments, * %-ENOMEM if rdma_rw context pool was exhausted, * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). @@ -465,6 +541,7 @@ out_err: * * Returns a non-negative number of bytes the chunk consumed, or * %-E2BIG if the payload was larger than the Reply chunk, + * %-EINVAL if client provided too many segments, * %-ENOMEM if rdma_rw context pool was exhausted, * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). @@ -510,3 +587,353 @@ out_err: svc_rdma_write_info_free(info); return ret; } + +static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info, + struct svc_rqst *rqstp, + u32 rkey, u32 len, u64 offset) +{ + struct svc_rdma_op_ctxt *head = info->ri_readctxt; + struct svc_rdma_chunk_ctxt *cc = &info->ri_cc; + struct svc_rdma_rw_ctxt *ctxt; + unsigned int sge_no, seg_len; + struct scatterlist *sg; + int ret; + + sge_no = PAGE_ALIGN(info->ri_pageoff + len) >> PAGE_SHIFT; + ctxt = svc_rdma_get_rw_ctxt(cc->cc_rdma, sge_no); + if (!ctxt) + goto out_noctx; + ctxt->rw_nents = sge_no; + + dprintk("svcrdma: reading segment %u@0x%016llx:0x%08x (%u sges)\n", + len, offset, rkey, sge_no); + + sg = ctxt->rw_sg_table.sgl; + for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { + seg_len = min_t(unsigned int, len, + PAGE_SIZE - info->ri_pageoff); + + head->arg.pages[info->ri_pageno] = + rqstp->rq_pages[info->ri_pageno]; + if (!info->ri_pageoff) + head->count++; + + sg_set_page(sg, rqstp->rq_pages[info->ri_pageno], + seg_len, info->ri_pageoff); + sg = sg_next(sg); + + info->ri_pageoff += seg_len; + if (info->ri_pageoff == PAGE_SIZE) { + info->ri_pageno++; + info->ri_pageoff = 0; + } + len -= seg_len; + + /* Safety check */ + if (len && + &rqstp->rq_pages[info->ri_pageno + 1] > rqstp->rq_page_end) + goto out_overrun; + } + + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, cc->cc_rdma->sc_qp, + cc->cc_rdma->sc_port_num, + ctxt->rw_sg_table.sgl, ctxt->rw_nents, + 0, offset, rkey, DMA_FROM_DEVICE); + if (ret < 0) + goto out_initerr; + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + return 0; + +out_noctx: + dprintk("svcrdma: no R/W ctxs available\n"); + return -ENOMEM; + +out_overrun: + dprintk("svcrdma: request overruns rq_pages\n"); + return 
-EINVAL; + +out_initerr: + svc_rdma_put_rw_ctxt(cc->cc_rdma, ctxt); + pr_err("svcrdma: failed to map pagelist (%d)\n", ret); + return -EIO; +} + +static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info, + __be32 *p) +{ + int ret; + + info->ri_chunklen = 0; + while (*p++ != xdr_zero) { + u32 rs_handle, rs_length; + u64 rs_offset; + + if (be32_to_cpup(p++) != info->ri_position) + break; + rs_handle = be32_to_cpup(p++); + rs_length = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &rs_offset); + + ret = svc_rdma_build_read_segment(info, rqstp, + rs_handle, rs_length, + rs_offset); + if (ret < 0) + break; + + info->ri_chunklen += rs_length; + } + + return ret; +} + +/* If there is inline content following the Read chunk, append it to + * the page list immediately following the data payload. This has to + * be done after the reader function has determined how many pages + * were consumed for RDMA Read. + * + * On entry, ri_pageno and ri_pageoff point directly to the end of the + * page list. On exit, both have been updated to the new "next byte". + * + * Assumptions: + * - Inline content fits entirely in rq_pages[0] + * - Trailing content is only a handful of bytes + */ +static int svc_rdma_copy_tail(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info) +{ + struct svc_rdma_op_ctxt *head = info->ri_readctxt; + unsigned int tail_length, remaining; + u8 *srcp, *destp; + + /* Assert that all inline content fits in page 0. This is an + * implementation limit, not a protocol limit. + */ + if (head->arg.head[0].iov_len > PAGE_SIZE) { + pr_warn_once("svcrdma: too much trailing inline content\n"); + return -EINVAL; + } + + srcp = head->arg.head[0].iov_base; + srcp += info->ri_position; + tail_length = head->arg.head[0].iov_len - info->ri_position; + remaining = tail_length; + + /* If there is room on the last page in the page list, try to + * fit the trailing content there. + */ + if (info->ri_pageoff > 0) { + unsigned int len; + + len = min_t(unsigned int, remaining, + PAGE_SIZE - info->ri_pageoff); + destp = page_address(rqstp->rq_pages[info->ri_pageno]); + destp += info->ri_pageoff; + + memcpy(destp, srcp, len); + srcp += len; + destp += len; + info->ri_pageoff += len; + remaining -= len; + + if (info->ri_pageoff == PAGE_SIZE) { + info->ri_pageno++; + info->ri_pageoff = 0; + } + } + + /* Otherwise, a fresh page is needed. */ + if (remaining) { + head->arg.pages[info->ri_pageno] = + rqstp->rq_pages[info->ri_pageno]; + head->count++; + + destp = page_address(rqstp->rq_pages[info->ri_pageno]); + memcpy(destp, srcp, remaining); + info->ri_pageoff += remaining; + } + + head->arg.page_len += tail_length; + head->arg.len += tail_length; + head->arg.buflen += tail_length; + return 0; +} + +/* Construct RDMA Reads to pull over a normal Read chunk. The chunk + * data lands in the page list of head->arg.pages. + * + * Currently NFSD does not look at the head->arg.tail[0] iovec. + * Therefore, XDR round-up of the Read chunk and trailing + * inline content must both be added at the end of the pagelist. + */ +static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info, + __be32 *p) +{ + struct svc_rdma_op_ctxt *head = info->ri_readctxt; + int ret; + + dprintk("svcrdma: Reading Read chunk at position %u\n", + info->ri_position); + + info->ri_pageno = head->hdr_count; + info->ri_pageoff = 0; + + ret = svc_rdma_build_read_chunk(rqstp, info, p); + if (ret < 0) + goto out; + + /* Read chunk may need XDR round-up (see RFC 5666, s. 3.7). 
+ */ + if (info->ri_chunklen & 3) { + u32 padlen = 4 - (info->ri_chunklen & 3); + + info->ri_chunklen += padlen; + + /* NB: data payload always starts on XDR alignment, + * thus the pad can never contain a page boundary. + */ + info->ri_pageoff += padlen; + if (info->ri_pageoff == PAGE_SIZE) { + info->ri_pageno++; + info->ri_pageoff = 0; + } + } + + head->arg.page_len = info->ri_chunklen; + head->arg.len += info->ri_chunklen; + head->arg.buflen += info->ri_chunklen; + + if (info->ri_position < head->arg.head[0].iov_len) { + ret = svc_rdma_copy_tail(rqstp, info); + if (ret < 0) + goto out; + } + head->arg.head[0].iov_len = info->ri_position; + +out: + return ret; +} + +/* Construct RDMA Reads to pull over a Position Zero Read chunk. + * The start of the data lands in the first page just after + * the Transport header, and the rest lands in the page list of + * head->arg.pages. + * + * Assumptions: + * - A PZRC has an XDR-aligned length (no implicit round-up). + * - There can be no trailing inline content (IOW, we assume + * a PZRC is never sent in an RDMA_MSG message, though it's + * allowed by spec). + */ +static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, + struct svc_rdma_read_info *info, + __be32 *p) +{ + struct svc_rdma_op_ctxt *head = info->ri_readctxt; + int ret; + + dprintk("svcrdma: Reading Position Zero Read chunk\n"); + + info->ri_pageno = head->hdr_count - 1; + info->ri_pageoff = offset_in_page(head->byte_len); + + ret = svc_rdma_build_read_chunk(rqstp, info, p); + if (ret < 0) + goto out; + + head->arg.len += info->ri_chunklen; + head->arg.buflen += info->ri_chunklen; + + if (head->arg.buflen <= head->sge[0].length) { + /* Transport header and RPC message fit entirely + * in page where head iovec resides. + */ + head->arg.head[0].iov_len = info->ri_chunklen; + } else { + /* Transport header and part of RPC message reside + * in the head iovec's page. + */ + head->arg.head[0].iov_len = + head->sge[0].length - head->byte_len; + head->arg.page_len = + info->ri_chunklen - head->arg.head[0].iov_len; + } + +out: + return ret; +} + +/** + * svc_rdma_recv_read_chunk - Pull a Read chunk from the client + * @rdma: controlling RDMA transport + * @rqstp: set of pages to use as Read sink buffers + * @head: pages under I/O collect here + * @p: pointer to start of Read chunk + * + * Returns: + * %0 if all needed RDMA Reads were posted successfully, + * %-EINVAL if client provided too many segments, + * %-ENOMEM if rdma_rw context pool was exhausted, + * %-ENOTCONN if posting failed (connection is lost), + * %-EIO if rdma_rw initialization failed (DMA mapping, etc). + * + * Assumptions: + * - All Read segments in @p have the same Position value. + */ +int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head, __be32 *p) +{ + struct svc_rdma_read_info *info; + struct page **page; + int ret; + + /* The request (with page list) is constructed in + * head->arg. Pages involved with RDMA Read I/O are + * transferred there. 
+ */ + head->hdr_count = head->count; + head->arg.head[0] = rqstp->rq_arg.head[0]; + head->arg.tail[0] = rqstp->rq_arg.tail[0]; + head->arg.pages = head->pages; + head->arg.page_base = 0; + head->arg.page_len = 0; + head->arg.len = rqstp->rq_arg.len; + head->arg.buflen = rqstp->rq_arg.buflen; + + info = svc_rdma_read_info_alloc(rdma); + if (!info) + return -ENOMEM; + info->ri_readctxt = head; + + info->ri_position = be32_to_cpup(p + 1); + if (info->ri_position) + ret = svc_rdma_build_normal_read_chunk(rqstp, info, p); + else + ret = svc_rdma_build_pz_read_chunk(rqstp, info, p); + + /* Mark the start of the pages that can be used for the reply */ + if (info->ri_pageoff > 0) + info->ri_pageno++; + rqstp->rq_respages = &rqstp->rq_pages[info->ri_pageno]; + rqstp->rq_next_page = rqstp->rq_respages + 1; + + if (ret < 0) + goto out; + + ret = svc_rdma_post_chunk_ctxt(&info->ri_cc); + +out: + /* Read sink pages have been moved from rqstp->rq_pages to + * head->arg.pages. Force svc_recv to refill those slots + * in rq_pages. + */ + for (page = rqstp->rq_pages; page < rqstp->rq_respages; page++) + *page = NULL; + + if (ret < 0) + svc_rdma_read_info_free(info); + return ret; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 1736337f3a55..7c3a211e0e9a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -313,13 +313,17 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, dma_addr = ib_dma_map_page(dev, virt_to_page(base), offset, len, DMA_TO_DEVICE); if (ib_dma_mapping_error(dev, dma_addr)) - return -EIO; + goto out_maperr; ctxt->sge[sge_no].addr = dma_addr; ctxt->sge[sge_no].length = len; ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; svc_rdma_count_mappings(rdma, ctxt); return 0; + +out_maperr: + pr_err("svcrdma: failed to map buffer\n"); + return -EIO; } static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, @@ -334,13 +338,17 @@ static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); if (ib_dma_mapping_error(dev, dma_addr)) - return -EIO; + goto out_maperr; ctxt->sge[sge_no].addr = dma_addr; ctxt->sge[sge_no].length = len; ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; svc_rdma_count_mappings(rdma, ctxt); return 0; + +out_maperr: + pr_err("svcrdma: failed to map page\n"); + return -EIO; } /** @@ -547,7 +555,6 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, return 0; err: - pr_err("svcrdma: failed to post Send WR (%d)\n", ret); svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); return ret; @@ -677,7 +684,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) return 0; err2: - if (ret != -E2BIG) + if (ret != -E2BIG && ret != -EINVAL) goto err1; ret = svc_rdma_post_recv(rdma, GFP_KERNEL); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index a9d9cb1ba4c6..e660d4965b18 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -202,7 +202,6 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) out: ctxt->count = 0; ctxt->mapped_sges = 0; - ctxt->frmr = NULL; return ctxt; out_empty: @@ -226,22 +225,13 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) { struct svcxprt_rdma *xprt = ctxt->xprt; struct ib_device *device = xprt->sc_cm_id->device; - u32 lkey = xprt->sc_pd->local_dma_lkey; unsigned int i; - for (i = 0; i < ctxt->mapped_sges; i++) { - /* - * Unmap the DMA addr in 
the SGE if the lkey matches - * the local_dma_lkey, otherwise, ignore it since it is - * an FRMR lkey and will be unmapped later when the - * last WR that uses it completes. - */ - if (ctxt->sge[i].lkey == lkey) - ib_dma_unmap_page(device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); - } + for (i = 0; i < ctxt->mapped_sges; i++) + ib_dma_unmap_page(device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); ctxt->mapped_sges = 0; } @@ -346,36 +336,6 @@ out: svc_xprt_put(&xprt->sc_xprt); } -static void svc_rdma_send_wc_common(struct svcxprt_rdma *xprt, - struct ib_wc *wc, - const char *opname) -{ - if (wc->status != IB_WC_SUCCESS) - goto err; - -out: - atomic_inc(&xprt->sc_sq_avail); - wake_up(&xprt->sc_send_wait); - return; - -err: - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - if (wc->status != IB_WC_WR_FLUSH_ERR) - pr_err("svcrdma: %s: %s (%u/0x%x)\n", - opname, ib_wc_status_msg(wc->status), - wc->status, wc->vendor_err); - goto out; -} - -static void svc_rdma_send_wc_common_put(struct ib_cq *cq, struct ib_wc *wc, - const char *opname) -{ - struct svcxprt_rdma *xprt = cq->cq_context; - - svc_rdma_send_wc_common(xprt, wc, opname); - svc_xprt_put(&xprt->sc_xprt); -} - /** * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: completion queue @@ -384,73 +344,28 @@ static void svc_rdma_send_wc_common_put(struct ib_cq *cq, struct ib_wc *wc, */ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { - struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_op_ctxt *ctxt; - - svc_rdma_send_wc_common_put(cq, wc, "send"); - - ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); -} - -/** - * svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC - * @cq: completion queue - * @wc: completed WR - * - */ -void svc_rdma_wc_reg(struct ib_cq *cq, struct ib_wc *wc) -{ - svc_rdma_send_wc_common_put(cq, wc, "fastreg"); -} - -/** - * svc_rdma_wc_read - Invoked by RDMA provider for each polled Read WC - * @cq: completion queue - * @wc: completed WR - * - */ -void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc) -{ struct svcxprt_rdma *xprt = cq->cq_context; struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_op_ctxt *ctxt; - svc_rdma_send_wc_common(xprt, wc, "read"); + atomic_inc(&xprt->sc_sq_avail); + wake_up(&xprt->sc_send_wait); ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); svc_rdma_unmap_dma(ctxt); - svc_rdma_put_frmr(xprt, ctxt->frmr); - - if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { - struct svc_rdma_op_ctxt *read_hdr; - - read_hdr = ctxt->read_hdr; - spin_lock(&xprt->sc_rq_dto_lock); - list_add_tail(&read_hdr->list, - &xprt->sc_read_complete_q); - spin_unlock(&xprt->sc_rq_dto_lock); + svc_rdma_put_context(ctxt, 1); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - svc_xprt_enqueue(&xprt->sc_xprt); + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: Send: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); } - svc_rdma_put_context(ctxt, 0); svc_xprt_put(&xprt->sc_xprt); } -/** - * svc_rdma_wc_inv - Invoked by RDMA provider for each polled LOCAL_INV WC - * @cq: completion queue - * @wc: completed WR - * - */ -void svc_rdma_wc_inv(struct ib_cq *cq, struct ib_wc *wc) -{ - svc_rdma_send_wc_common_put(cq, wc, "localInv"); -} - static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, int listener) { @@ -462,14 +377,12 
@@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); - INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); INIT_LIST_HEAD(&cma_xprt->sc_ctxts); INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); - spin_lock_init(&cma_xprt->sc_frmr_q_lock); spin_lock_init(&cma_xprt->sc_ctxt_lock); spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); @@ -780,86 +693,6 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, return ERR_PTR(ret); } -static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) -{ - struct ib_mr *mr; - struct scatterlist *sg; - struct svc_rdma_fastreg_mr *frmr; - u32 num_sg; - - frmr = kmalloc(sizeof(*frmr), GFP_KERNEL); - if (!frmr) - goto err; - - num_sg = min_t(u32, RPCSVC_MAXPAGES, xprt->sc_frmr_pg_list_len); - mr = ib_alloc_mr(xprt->sc_pd, IB_MR_TYPE_MEM_REG, num_sg); - if (IS_ERR(mr)) - goto err_free_frmr; - - sg = kcalloc(RPCSVC_MAXPAGES, sizeof(*sg), GFP_KERNEL); - if (!sg) - goto err_free_mr; - - sg_init_table(sg, RPCSVC_MAXPAGES); - - frmr->mr = mr; - frmr->sg = sg; - INIT_LIST_HEAD(&frmr->frmr_list); - return frmr; - - err_free_mr: - ib_dereg_mr(mr); - err_free_frmr: - kfree(frmr); - err: - return ERR_PTR(-ENOMEM); -} - -static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt) -{ - struct svc_rdma_fastreg_mr *frmr; - - while (!list_empty(&xprt->sc_frmr_q)) { - frmr = list_entry(xprt->sc_frmr_q.next, - struct svc_rdma_fastreg_mr, frmr_list); - list_del_init(&frmr->frmr_list); - kfree(frmr->sg); - ib_dereg_mr(frmr->mr); - kfree(frmr); - } -} - -struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) -{ - struct svc_rdma_fastreg_mr *frmr = NULL; - - spin_lock(&rdma->sc_frmr_q_lock); - if (!list_empty(&rdma->sc_frmr_q)) { - frmr = list_entry(rdma->sc_frmr_q.next, - struct svc_rdma_fastreg_mr, frmr_list); - list_del_init(&frmr->frmr_list); - frmr->sg_nents = 0; - } - spin_unlock(&rdma->sc_frmr_q_lock); - if (frmr) - return frmr; - - return rdma_alloc_frmr(rdma); -} - -void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, - struct svc_rdma_fastreg_mr *frmr) -{ - if (frmr) { - ib_dma_unmap_sg(rdma->sc_cm_id->device, - frmr->sg, frmr->sg_nents, frmr->direction); - spin_lock(&rdma->sc_frmr_q_lock); - WARN_ON_ONCE(!list_empty(&frmr->frmr_list)); - list_add(&frmr->frmr_list, &rdma->sc_frmr_q); - spin_unlock(&rdma->sc_frmr_q_lock); - } -} - /* * This is the xpo_recvfrom function for listening endpoints. Its * purpose is to accept incoming connections. 
The CMA callback handler @@ -908,8 +741,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * capabilities of this particular device */ newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge, (size_t)RPCSVC_MAXPAGES); - newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd, - RPCSVC_MAXPAGES); newxprt->sc_max_req_size = svcrdma_max_req_size; newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, svcrdma_max_requests); @@ -952,7 +783,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; - qp_attr.port_num = newxprt->sc_cm_id->port_num; + qp_attr.port_num = newxprt->sc_port_num; qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; @@ -976,47 +807,12 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) } newxprt->sc_qp = newxprt->sc_cm_id->qp; - /* - * Use the most secure set of MR resources based on the - * transport type and available memory management features in - * the device. Here's the table implemented below: - * - * Fast Global DMA Remote WR - * Reg LKEY MR Access - * Sup'd Sup'd Needed Needed - * - * IWARP N N Y Y - * N Y Y Y - * Y N Y N - * Y Y N - - * - * IB N N Y N - * N Y N - - * Y N Y N - * Y Y N - - * - * NB: iWARP requires remote write access for the data sink - * of an RDMA_READ. IB does not. - */ - newxprt->sc_reader = rdma_read_chunk_lcl; - if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { - newxprt->sc_frmr_pg_list_len = - dev->attrs.max_fast_reg_page_list_len; - newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; - newxprt->sc_reader = rdma_read_chunk_frmr; - } else + if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) newxprt->sc_snd_w_inv = false; - - /* - * Determine if a DMA MR is required and if so, what privs are required - */ - if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) && - !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) + if (!rdma_protocol_iwarp(dev, newxprt->sc_port_num) && + !rdma_ib_or_roce(dev, newxprt->sc_port_num)) goto errout; - if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num)) - newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; - /* Post receive buffers */ for (i = 0; i < newxprt->sc_max_requests; i++) { ret = svc_rdma_post_recv(newxprt, GFP_KERNEL); @@ -1056,7 +852,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) sap = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; dprintk(" remote address : %pIS:%u\n", sap, rpc_get_port(sap)); dprintk(" max_sge : %d\n", newxprt->sc_max_sge); - dprintk(" max_sge_rd : %d\n", newxprt->sc_max_sge_rd); dprintk(" sq_depth : %d\n", newxprt->sc_sq_depth); dprintk(" max_requests : %d\n", newxprt->sc_max_requests); dprintk(" ord : %d\n", newxprt->sc_ord); @@ -1117,12 +912,6 @@ static void __svc_rdma_free(struct work_struct *work) pr_err("svcrdma: sc_xprt still in use? (%d)\n", kref_read(&xprt->xpt_ref)); - /* - * Destroy queued, but not processed read completions. Note - * that this cleanup has to be done before destroying the - * cm_id because the device ptr is needed to unmap the dma in - * svc_rdma_put_context. 
- */ while (!list_empty(&rdma->sc_read_complete_q)) { struct svc_rdma_op_ctxt *ctxt; ctxt = list_first_entry(&rdma->sc_read_complete_q, @@ -1130,8 +919,6 @@ static void __svc_rdma_free(struct work_struct *work) list_del(&ctxt->list); svc_rdma_put_context(ctxt, 1); } - - /* Destroy queued, but not processed recv completions */ while (!list_empty(&rdma->sc_rq_dto_q)) { struct svc_rdma_op_ctxt *ctxt; ctxt = list_first_entry(&rdma->sc_rq_dto_q, @@ -1151,7 +938,6 @@ static void __svc_rdma_free(struct work_struct *work) xprt->xpt_bc_xprt = NULL; } - rdma_dealloc_frmr_q(rdma); svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_destroy_ctxts(rdma); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 62ecbccd9748..d1c458e5ec4d 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -684,7 +684,8 @@ xprt_rdma_free(struct rpc_task *task) dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - if (unlikely(!list_empty(&req->rl_registered))) + rpcrdma_remove_req(&r_xprt->rx_buf, req); + if (!list_empty(&req->rl_registered)) ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task)); rpcrdma_unmap_sges(ia, req); rpcrdma_buffer_put(req); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3dbce9ac4327..e4171f2abe37 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -243,8 +243,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr; #endif - struct ib_qp_attr *attr = &ia->ri_qp_attr; - struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; int connstate = 0; switch (event->event) { @@ -267,7 +265,8 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) break; case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - pr_info("rpcrdma: removing device for %pIS:%u\n", + pr_info("rpcrdma: removing device %s for %pIS:%u\n", + ia->ri_device->name, sap, rpc_get_port(sap)); #endif set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); @@ -282,13 +281,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) return 1; case RDMA_CM_EVENT_ESTABLISHED: connstate = 1; - ib_query_qp(ia->ri_id->qp, attr, - IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC, - iattr); - dprintk("RPC: %s: %d responder resources" - " (%d initiator)\n", - __func__, attr->max_dest_rd_atomic, - attr->max_rd_atomic); rpcrdma_update_connect_private(xprt, &event->param.conn); goto connected; case RDMA_CM_EVENT_CONNECT_ERROR: @@ -298,11 +290,9 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) connstate = -ENETDOWN; goto connected; case RDMA_CM_EVENT_REJECTED: -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - pr_info("rpcrdma: connection to %pIS:%u on %s rejected: %s\n", - sap, rpc_get_port(sap), ia->ri_device->name, + dprintk("rpcrdma: connection to %pIS:%u rejected: %s\n", + sap, rpc_get_port(sap), rdma_reject_msg(id, event->status)); -#endif connstate = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) connstate = -EAGAIN; @@ -310,37 +300,19 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: connstate = -ECONNABORTED; connected: - dprintk("RPC: %s: %sconnected\n", - __func__, connstate > 0 ? 
"" : "dis"); atomic_set(&xprt->rx_buf.rb_credits, 1); ep->rep_connected = connstate; rpcrdma_conn_func(ep); wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ default: - dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n", - __func__, sap, rpc_get_port(sap), ep, - rdma_event_msg(event->event)); + dprintk("RPC: %s: %pIS:%u on %s/%s (ep 0x%p): %s\n", + __func__, sap, rpc_get_port(sap), + ia->ri_device->name, ia->ri_ops->ro_displayname, + ep, rdma_event_msg(event->event)); break; } -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - if (connstate == 1) { - int ird = attr->max_dest_rd_atomic; - int tird = ep->rep_remote_cma.responder_resources; - - pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n", - sap, rpc_get_port(sap), - ia->ri_device->name, - ia->ri_ops->ro_displayname, - xprt->rx_buf.rb_max_requests, - ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); - } else if (connstate < 0) { - pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n", - sap, rpc_get_port(sap), connstate); - } -#endif - return 0; } @@ -971,7 +943,6 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) if (req == NULL) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&req->rl_free); spin_lock(&buffer->rb_reqslock); list_add(&req->rl_all, &buffer->rb_allreqs); spin_unlock(&buffer->rb_reqslock); @@ -1033,6 +1004,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) spin_lock_init(&buf->rb_recovery_lock); INIT_LIST_HEAD(&buf->rb_mws); INIT_LIST_HEAD(&buf->rb_all); + INIT_LIST_HEAD(&buf->rb_pending); INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); @@ -1055,7 +1027,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) goto out; } req->rl_backchannel = false; - list_add(&req->rl_free, &buf->rb_send_bufs); + list_add(&req->rl_list, &buf->rb_send_bufs); } INIT_LIST_HEAD(&buf->rb_recv_bufs); @@ -1084,8 +1056,8 @@ rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf) struct rpcrdma_req *req; req = list_first_entry(&buf->rb_send_bufs, - struct rpcrdma_req, rl_free); - list_del(&req->rl_free); + struct rpcrdma_req, rl_list); + list_del_init(&req->rl_list); return req; } @@ -1187,6 +1159,7 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) if (!mw) goto out_nomws; + mw->mw_flags = 0; return mw; out_nomws: @@ -1267,7 +1240,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) spin_lock(&buffers->rb_lock); buffers->rb_send_count--; - list_add_tail(&req->rl_free, &buffers->rb_send_bufs); + list_add_tail(&req->rl_list, &buffers->rb_send_bufs); if (rep) { buffers->rb_recv_count--; list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 1d66acf1a723..b282d3f8cdd8 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -271,6 +271,7 @@ struct rpcrdma_mw { struct scatterlist *mw_sg; int mw_nents; enum dma_data_direction mw_dir; + unsigned long mw_flags; union { struct rpcrdma_fmr fmr; struct rpcrdma_frmr frmr; @@ -282,6 +283,11 @@ struct rpcrdma_mw { struct list_head mw_all; }; +/* mw_flags */ +enum { + RPCRDMA_MW_F_RI = 1, +}; + /* * struct rpcrdma_req -- structure central to the request/reply sequence. 
* @@ -334,7 +340,8 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { - struct list_head rl_free; + struct list_head rl_list; + __be32 rl_xid; unsigned int rl_mapped_sges; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; @@ -396,6 +403,7 @@ struct rpcrdma_buffer { int rb_send_count, rb_recv_count; struct list_head rb_send_bufs; struct list_head rb_recv_bufs; + struct list_head rb_pending; u32 rb_max_requests; atomic_t rb_credits; /* most recent credit grant */ @@ -461,7 +469,7 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mr_seg *, int, bool, struct rpcrdma_mw **); void (*ro_unmap_sync)(struct rpcrdma_xprt *, - struct rpcrdma_req *); + struct list_head *); void (*ro_unmap_safe)(struct rpcrdma_xprt *, struct rpcrdma_req *, bool); void (*ro_recover_mr)(struct rpcrdma_mw *); @@ -544,6 +552,34 @@ void rpcrdma_destroy_req(struct rpcrdma_req *); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); +static inline void +rpcrdma_insert_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) +{ + spin_lock(&buffers->rb_lock); + if (list_empty(&req->rl_list)) + list_add_tail(&req->rl_list, &buffers->rb_pending); + spin_unlock(&buffers->rb_lock); +} + +static inline struct rpcrdma_req * +rpcrdma_lookup_req_locked(struct rpcrdma_buffer *buffers, __be32 xid) +{ + struct rpcrdma_req *pos; + + list_for_each_entry(pos, &buffers->rb_pending, rl_list) + if (pos->rl_xid == xid) + return pos; + return NULL; +} + +static inline void +rpcrdma_remove_req(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) +{ + spin_lock(&buffers->rb_lock); + list_del(&req->rl_list); + spin_unlock(&buffers->rb_lock); +} + struct rpcrdma_mw *rpcrdma_get_mw(struct rpcrdma_xprt *); void rpcrdma_put_mw(struct rpcrdma_xprt *, struct rpcrdma_mw *); struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); |
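The xprt_rdma.h hunk above introduces an xid-indexed pending list (rb_pending, rl_xid) and three inline helpers so that a request holding registered memory can later be found again by its XID while rb_lock is held. The sketch below is a minimal user-space model of that pattern, not kernel code: pthread_mutex_t stands in for rb_lock, a hand-rolled singly linked list stands in for struct list_head, and every type and function name here is illustrative only. As with the kernel helpers, the lookup itself assumes the caller already holds the lock (hence the _locked suffix in the patch), while insert and remove take it themselves; the O(n) scan mirrors rpcrdma_lookup_req_locked() and is presumably acceptable because only requests still awaiting completion sit on the pending list.

/*
 * User-space model of the rb_pending / rl_xid helpers added above.
 * NOT kernel code; names and types are illustrative only.
 * Build with: gcc -pthread pending.c
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct pending_req {
	uint32_t xid;			/* stands in for rl_xid */
	struct pending_req *next;	/* stands in for rl_list linkage */
};

static struct pending_req *pending_head;	/* stands in for rb_pending */
static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER; /* rb_lock */

/* Analogue of rpcrdma_insert_req(): make the request findable by xid. */
static void insert_req(struct pending_req *req)
{
	pthread_mutex_lock(&pending_lock);
	req->next = pending_head;
	pending_head = req;
	pthread_mutex_unlock(&pending_lock);
}

/* Analogue of rpcrdma_lookup_req_locked(): caller must hold the lock. */
static struct pending_req *lookup_req_locked(uint32_t xid)
{
	struct pending_req *pos;

	for (pos = pending_head; pos; pos = pos->next)
		if (pos->xid == xid)
			return pos;
	return NULL;
}

/* Analogue of rpcrdma_remove_req(): unlink once the reply is matched. */
static void remove_req(struct pending_req *req)
{
	struct pending_req **pp;

	pthread_mutex_lock(&pending_lock);
	for (pp = &pending_head; *pp; pp = &(*pp)->next)
		if (*pp == req) {
			*pp = req->next;
			break;
		}
	pthread_mutex_unlock(&pending_lock);
}

int main(void)
{
	struct pending_req a = { .xid = 0x1234 };
	struct pending_req b = { .xid = 0x5678 };
	struct pending_req *found;

	insert_req(&a);
	insert_req(&b);

	pthread_mutex_lock(&pending_lock);	/* reply path takes rb_lock */
	found = lookup_req_locked(0x5678);
	pthread_mutex_unlock(&pending_lock);

	printf("xid 0x5678 %s\n", found ? "found" : "not found");

	remove_req(&b);
	remove_req(&a);
	return 0;
}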