diff options
author | Trond Myklebust <trond.myklebust@primarydata.com> | 2014-08-03 17:04:51 -0400 |
---|---|---|
committer | Trond Myklebust <trond.myklebust@primarydata.com> | 2014-08-03 17:04:51 -0400 |
commit | 9806755c56a9a8b3ff8bc9ce3092dcad4d562989 (patch) | |
tree | 19c1a92a65cfdee4bd2717f032a1f072cbe0c7af /net/sunrpc | |
parent | 3a505845cd58a7ff3bc75f96572045d8de34e34e (diff) | |
parent | 8079fb785e34de6dff34bd846b8b79c212861edf (diff) | |
download | linux-9806755c56a9a8b3ff8bc9ce3092dcad4d562989.tar.bz2 |
Merge branch 'nfs-rdma' of git://git.linux-nfs.org/projects/anna/nfs-rdma into linux-next
* 'nfs-rdma' of git://git.linux-nfs.org/projects/anna/nfs-rdma: (916 commits)
xprtrdma: Handle additional connection events
xprtrdma: Remove RPCRDMA_PERSISTENT_REGISTRATION macro
xprtrdma: Make rpcrdma_ep_disconnect() return void
xprtrdma: Schedule reply tasklet once per upcall
xprtrdma: Allocate each struct rpcrdma_mw separately
xprtrdma: Rename frmr_wr
xprtrdma: Disable completions for LOCAL_INV Work Requests
xprtrdma: Disable completions for FAST_REG_MR Work Requests
xprtrdma: Don't post a LOCAL_INV in rpcrdma_register_frmr_external()
xprtrdma: Reset FRMRs after a flushed LOCAL_INV Work Request
xprtrdma: Reset FRMRs when FAST_REG_MR is flushed by a disconnect
xprtrdma: Properly handle exhaustion of the rb_mws list
xprtrdma: Chain together all MWs in same buffer pool
xprtrdma: Back off rkey when FAST_REG_MR fails
xprtrdma: Unclutter struct rpcrdma_mr_seg
xprtrdma: Don't invalidate FRMRs if registration fails
xprtrdma: On disconnect, don't ignore pending CQEs
xprtrdma: Update rkeys after transport reconnect
xprtrdma: Limit data payload size for ALLPHYSICAL
xprtrdma: Protect ia->ri_id when unmapping/invalidating MRs
...
Diffstat (limited to 'net/sunrpc')
-rw-r--r-- | net/sunrpc/xprtrdma/rpc_rdma.c | 83 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/transport.c | 17 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/verbs.c | 739 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/xprt_rdma.h | 61 |
4 files changed, 604 insertions, 296 deletions
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 693966d3f33b..6166c985fe24 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -53,14 +53,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -enum rpcrdma_chunktype { - rpcrdma_noch = 0, - rpcrdma_readch, - rpcrdma_areadch, - rpcrdma_writech, - rpcrdma_replych -}; - #ifdef RPC_DEBUG static const char transfertypes[][12] = { "pure inline", /* no chunks */ @@ -279,13 +271,37 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, return (unsigned char *)iptr - (unsigned char *)headerp; out: - for (pos = 0; nchunks--;) - pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt); + if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { + for (pos = 0; nchunks--;) + pos += rpcrdma_deregister_external( + &req->rl_segments[pos], r_xprt); + } return n; } /* + * Marshal chunks. This routine returns the header length + * consumed by marshaling. + * + * Returns positive RPC/RDMA header size, or negative errno. + */ + +ssize_t +rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) +{ + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)req->rl_base; + + if (req->rl_rtype != rpcrdma_noch) + result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, + headerp, req->rl_rtype); + else if (req->rl_wtype != rpcrdma_noch) + result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, + headerp, req->rl_wtype); + return result; +} + +/* * Copy write data inline. * This function is used for "small" requests. Data which is passed * to RPC via iovecs (or page list) is copied directly into the @@ -377,7 +393,6 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) char *base; size_t rpclen, padlen; ssize_t hdrlen; - enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; /* @@ -415,13 +430,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * into pages; otherwise use reply chunks. */ if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - wtype = rpcrdma_noch; + req->rl_wtype = rpcrdma_noch; else if (rqst->rq_rcv_buf.page_len == 0) - wtype = rpcrdma_replych; + req->rl_wtype = rpcrdma_replych; else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) - wtype = rpcrdma_writech; + req->rl_wtype = rpcrdma_writech; else - wtype = rpcrdma_replych; + req->rl_wtype = rpcrdma_replych; /* * Chunks needed for arguments? @@ -438,16 +453,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * TBD check NFSv4 setacl */ if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) - rtype = rpcrdma_noch; + req->rl_rtype = rpcrdma_noch; else if (rqst->rq_snd_buf.page_len == 0) - rtype = rpcrdma_areadch; + req->rl_rtype = rpcrdma_areadch; else - rtype = rpcrdma_readch; + req->rl_rtype = rpcrdma_readch; /* The following simplification is not true forever */ - if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) - wtype = rpcrdma_noch; - if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { + if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) + req->rl_wtype = rpcrdma_noch; + if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { dprintk("RPC: %s: cannot marshal multiple chunk lists\n", __func__); return -EIO; @@ -461,7 +476,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * When padding is in use and applies to the transfer, insert * it and change the message type. */ - if (rtype == rpcrdma_noch) { + if (req->rl_rtype == rpcrdma_noch) { padlen = rpcrdma_inline_pullup(rqst, RPCRDMA_INLINE_PAD_VALUE(rqst)); @@ -476,7 +491,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (wtype != rpcrdma_noch) { + if (req->rl_wtype != rpcrdma_noch) { dprintk("RPC: %s: invalid chunk list\n", __func__); return -EIO; @@ -497,30 +512,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. */ - if (wtype == rpcrdma_noch) - wtype = rpcrdma_replych; + if (req->rl_wtype == rpcrdma_noch) + req->rl_wtype = rpcrdma_replych; } } - /* - * Marshal chunks. This routine will return the header length - * consumed by marshaling. - */ - if (rtype != rpcrdma_noch) { - hdrlen = rpcrdma_create_chunks(rqst, - &rqst->rq_snd_buf, headerp, rtype); - wtype = rtype; /* simplify dprintk */ - - } else if (wtype != rpcrdma_noch) { - hdrlen = rpcrdma_create_chunks(rqst, - &rqst->rq_rcv_buf, headerp, wtype); - } + hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); if (hdrlen < 0) return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, headerp, base, req->rl_iov.lkey); /* diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 66f91f0d071a..2faac4940563 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -296,7 +296,6 @@ xprt_setup_rdma(struct xprt_create *args) xprt->resvport = 0; /* privileged port not needed */ xprt->tsh_size = 0; /* RPC-RDMA handles framing */ - xprt->max_payload = RPCRDMA_MAX_DATA_SEGS * PAGE_SIZE; xprt->ops = &xprt_rdma_procs; /* @@ -382,6 +381,9 @@ xprt_setup_rdma(struct xprt_create *args) new_ep->rep_xprt = xprt; xprt_rdma_format_addresses(xprt); + xprt->max_payload = rpcrdma_max_payload(new_xprt); + dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", + __func__, xprt->max_payload); if (!try_module_get(THIS_MODULE)) goto out4; @@ -412,7 +414,7 @@ xprt_rdma_close(struct rpc_xprt *xprt) if (r_xprt->rx_ep.rep_connected > 0) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); - (void) rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); } static void @@ -595,13 +597,14 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - int rc; + int rc = 0; - if (req->rl_niovs == 0) { + if (req->rl_niovs == 0) rc = rpcrdma_marshal_req(rqst); - if (rc < 0) - goto failed_marshal; - } + else if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) + rc = rpcrdma_marshal_chunks(rqst, 0); + if (rc < 0) + goto failed_marshal; if (req->rl_reply == NULL) /* e.g. reconnection */ rpcrdma_recv_buffer_get(req); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 13dbd1c389ff..61c41298b4ea 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -61,6 +61,8 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +static void rpcrdma_reset_frmrs(struct rpcrdma_ia *); + /* * internal functions */ @@ -103,17 +105,6 @@ rpcrdma_run_tasklet(unsigned long data) static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); -static inline void -rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep) -{ - unsigned long flags; - - spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); - list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g); - spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); - tasklet_schedule(&rpcrdma_tasklet_g); -} - static void rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) { @@ -153,12 +144,7 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc) if (wc->wr_id == 0ULL) return; if (wc->status != IB_WC_SUCCESS) - return; - - if (wc->opcode == IB_WC_FAST_REG_MR) - frmr->r.frmr.state = FRMR_IS_VALID; - else if (wc->opcode == IB_WC_LOCAL_INV) - frmr->r.frmr.state = FRMR_IS_INVALID; + frmr->r.frmr.fr_state = FRMR_IS_STALE; } static int @@ -217,7 +203,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) } static void -rpcrdma_recvcq_process_wc(struct ib_wc *wc) +rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) { struct rpcrdma_rep *rep = (struct rpcrdma_rep *)(unsigned long)wc->wr_id; @@ -248,28 +234,38 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc) } out_schedule: - rpcrdma_schedule_tasklet(rep); + list_add_tail(&rep->rr_list, sched_list); } static int rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) { + struct list_head sched_list; struct ib_wc *wcs; int budget, count, rc; + unsigned long flags; + INIT_LIST_HEAD(&sched_list); budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; do { wcs = ep->rep_recv_wcs; rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); if (rc <= 0) - return rc; + goto out_schedule; count = rc; while (count-- > 0) - rpcrdma_recvcq_process_wc(wcs++); + rpcrdma_recvcq_process_wc(wcs++, &sched_list); } while (rc == RPCRDMA_POLLSIZE && --budget); - return 0; + rc = 0; + +out_schedule: + spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); + list_splice_tail(&sched_list, &rpcrdma_tasklets_g); + spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); + tasklet_schedule(&rpcrdma_tasklet_g); + return rc; } /* @@ -310,6 +306,13 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) rpcrdma_recvcq_poll(cq, ep); } +static void +rpcrdma_flush_cqs(struct rpcrdma_ep *ep) +{ + rpcrdma_recvcq_upcall(ep->rep_attr.recv_cq, ep); + rpcrdma_sendcq_upcall(ep->rep_attr.send_cq, ep); +} + #ifdef RPC_DEBUG static const char * const conn[] = { "address resolved", @@ -323,8 +326,16 @@ static const char * const conn[] = { "rejected", "established", "disconnected", - "device removal" + "device removal", + "multicast join", + "multicast error", + "address change", + "timewait exit", }; + +#define CONNECTION_MSG(status) \ + ((status) < ARRAY_SIZE(conn) ? \ + conn[(status)] : "unrecognized connection error") #endif static int @@ -382,23 +393,18 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DEVICE_REMOVAL: connstate = -ENODEV; connected: - dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n", - __func__, - (event->event <= 11) ? conn[event->event] : - "unknown connection error", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), - ep, event->event); atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1); dprintk("RPC: %s: %sconnected\n", __func__, connstate > 0 ? "" : "dis"); ep->rep_connected = connstate; ep->rep_func(ep); wake_up_all(&ep->rep_connect_wait); - break; + /*FALLTHROUGH*/ default: - dprintk("RPC: %s: unexpected CM event %d\n", - __func__, event->event); + dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", + __func__, &addr->sin_addr.s_addr, + ntohs(addr->sin_port), ep, + CONNECTION_MSG(event->event)); break; } @@ -558,12 +564,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (!ia->ri_id->device->alloc_fmr) { dprintk("RPC: %s: MTHCAFMR registration " "not supported by HCA\n", __func__); -#if RPCRDMA_PERSISTENT_REGISTRATION memreg = RPCRDMA_ALLPHYSICAL; -#else - rc = -ENOMEM; - goto out2; -#endif } } @@ -578,20 +579,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) switch (memreg) { case RPCRDMA_FRMR: break; -#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: mem_priv = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; goto register_setup; -#endif case RPCRDMA_MTHCAFMR: if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; -#if RPCRDMA_PERSISTENT_REGISTRATION register_setup: -#endif ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv); if (IS_ERR(ia->ri_bind_mem)) { printk(KERN_ALERT "%s: ib_get_dma_mr for " @@ -613,6 +610,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; + rwlock_init(&ia->ri_qplock); return 0; out2: rdma_destroy_id(ia->ri_id); @@ -826,10 +824,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) cancel_delayed_work_sync(&ep->rep_connect_worker); if (ia->ri_id->qp) { - rc = rpcrdma_ep_disconnect(ep, ia); - if (rc) - dprintk("RPC: %s: rpcrdma_ep_disconnect" - " returned %i\n", __func__, rc); + rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } @@ -859,7 +854,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) int rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - struct rdma_cm_id *id; + struct rdma_cm_id *id, *old; int rc = 0; int retry_count = 0; @@ -867,13 +862,12 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) struct rpcrdma_xprt *xprt; retry: dprintk("RPC: %s: reconnecting...\n", __func__); - rc = rpcrdma_ep_disconnect(ep, ia); - if (rc && rc != -ENOTCONN) - dprintk("RPC: %s: rpcrdma_ep_disconnect" - " status %i\n", __func__, rc); - rpcrdma_clean_cq(ep->rep_attr.recv_cq); - rpcrdma_clean_cq(ep->rep_attr.send_cq); + rpcrdma_ep_disconnect(ep, ia); + rpcrdma_flush_cqs(ep); + + if (ia->ri_memreg_strategy == RPCRDMA_FRMR) + rpcrdma_reset_frmrs(ia); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); id = rpcrdma_create_id(xprt, ia, @@ -905,9 +899,14 @@ retry: rc = -ENETUNREACH; goto out; } - rdma_destroy_qp(ia->ri_id); - rdma_destroy_id(ia->ri_id); + + write_lock(&ia->ri_qplock); + old = ia->ri_id; ia->ri_id = id; + write_unlock(&ia->ri_qplock); + + rdma_destroy_qp(old); + rdma_destroy_id(old); } else { dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); @@ -974,13 +973,12 @@ out: * This call is not reentrant, and must not be made in parallel * on the same endpoint. */ -int +void rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; - rpcrdma_clean_cq(ep->rep_attr.recv_cq); - rpcrdma_clean_cq(ep->rep_attr.send_cq); + rpcrdma_flush_cqs(ep); rc = rdma_disconnect(ia->ri_id); if (!rc) { /* returns without wait if not connected */ @@ -992,12 +990,93 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); ep->rep_connected = rc; } +} + +static int +rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) +{ + int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; + struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_DATA_SEGS, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + struct rpcrdma_mw *r; + int i, rc; + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (r == NULL) + return -ENOMEM; + + r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->r.fmr)) { + rc = PTR_ERR(r->r.fmr); + dprintk("RPC: %s: ib_alloc_fmr failed %i\n", + __func__, rc); + goto out_free; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + return 0; + +out_free: + kfree(r); + return rc; +} + +static int +rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_frmr *f; + struct rpcrdma_mw *r; + int i, rc; + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (r == NULL) + return -ENOMEM; + f = &r->r.frmr; + + f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + ia->ri_max_frmr_depth); + if (IS_ERR(f->fr_mr)) { + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr " + "failed %i\n", __func__, rc); + goto out_free; + } + + f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device, + ia->ri_max_frmr_depth); + if (IS_ERR(f->fr_pgl)) { + rc = PTR_ERR(f->fr_pgl); + dprintk("RPC: %s: ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + + ib_dereg_mr(f->fr_mr); + goto out_free; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + + return 0; + +out_free: + kfree(r); return rc; } -/* - * Initialize buffer memory - */ int rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) @@ -1005,7 +1084,6 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, char *p; size_t len, rlen, wlen; int i, rc; - struct rpcrdma_mw *r; buf->rb_max_requests = cdata->max_requests; spin_lock_init(&buf->rb_lock); @@ -1016,28 +1094,12 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, * 2. arrays of struct rpcrdma_req to fill in pointers * 3. array of struct rpcrdma_rep for replies * 4. padding, if any - * 5. mw's, fmr's or frmr's, if any * Send/recv buffers in req/rep need to be registered */ - len = buf->rb_max_requests * (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); len += cdata->padding; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - len += buf->rb_max_requests * RPCRDMA_MAX_SEGS * - sizeof(struct rpcrdma_mw); - break; - case RPCRDMA_MTHCAFMR: - /* TBD we are perhaps overallocating here */ - len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS * - sizeof(struct rpcrdma_mw); - break; - default: - break; - } - /* allocate 1, 4 and 5 in one shot */ p = kzalloc(len, GFP_KERNEL); if (p == NULL) { dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", @@ -1064,51 +1126,17 @@ rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep, p += cdata->padding; INIT_LIST_HEAD(&buf->rb_mws); - r = (struct rpcrdma_mw *)p; + INIT_LIST_HEAD(&buf->rb_all); switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: - for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) { - r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_mr)) { - rc = PTR_ERR(r->r.frmr.fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr" - " failed %i\n", __func__, rc); - goto out; - } - r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( - ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_pgl)) { - rc = PTR_ERR(r->r.frmr.fr_pgl); - dprintk("RPC: %s: " - "ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(r->r.frmr.fr_mr); - goto out; - } - list_add(&r->mw_list, &buf->rb_mws); - ++r; - } + rc = rpcrdma_init_frmrs(ia, buf); + if (rc) + goto out; break; case RPCRDMA_MTHCAFMR: - /* TBD we are perhaps overallocating here */ - for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) { - static struct ib_fmr_attr fa = - { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT }; - r->r.fmr = ib_alloc_fmr(ia->ri_pd, - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ, - &fa); - if (IS_ERR(r->r.fmr)) { - rc = PTR_ERR(r->r.fmr); - dprintk("RPC: %s: ib_alloc_fmr" - " failed %i\n", __func__, rc); - goto out; - } - list_add(&r->mw_list, &buf->rb_mws); - ++r; - } + rc = rpcrdma_init_fmrs(ia, buf); + if (rc) + goto out; break; default: break; @@ -1176,24 +1204,57 @@ out: return rc; } -/* - * Unregister and destroy buffer memory. Need to deal with - * partial initialization, so it's callable from failed create. - * Must be called before destroying endpoint, as registrations - * reference it. - */ +static void +rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + list_del(&r->mw_list); + + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", + __func__, rc); + + kfree(r); + } +} + +static void +rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + list_del(&r->mw_list); + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + + kfree(r); + } +} + void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - int rc, i; struct rpcrdma_ia *ia = rdmab_to_ia(buf); - struct rpcrdma_mw *r; + int i; /* clean up in reverse order from create * 1. recv mr memory (mr free, then kfree) * 2. send mr memory (mr free, then kfree) - * 3. padding (if any) [moved to rpcrdma_ep_destroy] - * 4. arrays + * 3. MWs */ dprintk("RPC: %s: entering\n", __func__); @@ -1212,34 +1273,217 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) } } + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + rpcrdma_destroy_frmrs(buf); + break; + case RPCRDMA_MTHCAFMR: + rpcrdma_destroy_fmrs(buf); + break; + default: + break; + } + + kfree(buf->rb_pool); +} + +/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in + * an unusable state. Find FRMRs in this state and dereg / reg + * each. FRMRs that are VALID and attached to an rpcrdma_req are + * also torn down. + * + * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_frmr_external(). + */ +static void +rpcrdma_reset_frmrs(struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = + container_of(ia, struct rpcrdma_xprt, rx_ia); + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct list_head *pos; + struct rpcrdma_mw *r; + int rc; + + list_for_each(pos, &buf->rb_all) { + r = list_entry(pos, struct rpcrdma_mw, mw_all); + + if (r->r.frmr.fr_state == FRMR_IS_INVALID) + continue; + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr failed %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); + + r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, + ia->ri_max_frmr_depth); + if (IS_ERR(r->r.frmr.fr_mr)) { + rc = PTR_ERR(r->r.frmr.fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr" + " failed %i\n", __func__, rc); + continue; + } + r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( + ia->ri_id->device, + ia->ri_max_frmr_depth); + if (IS_ERR(r->r.frmr.fr_pgl)) { + rc = PTR_ERR(r->r.frmr.fr_pgl); + dprintk("RPC: %s: " + "ib_alloc_fast_reg_page_list " + "failed %i\n", __func__, rc); + + ib_dereg_mr(r->r.frmr.fr_mr); + continue; + } + r->r.frmr.fr_state = FRMR_IS_INVALID; + } +} + +/* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving + * some req segments uninitialized. + */ +static void +rpcrdma_buffer_put_mr(struct rpcrdma_mw **mw, struct rpcrdma_buffer *buf) +{ + if (*mw) { + list_add_tail(&(*mw)->mw_list, &buf->rb_mws); + *mw = NULL; + } +} + +/* Cycle mw's back in reverse order, and "spin" them. + * This delays and scrambles reuse as much as possible. + */ +static void +rpcrdma_buffer_put_mrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mr_seg *seg = req->rl_segments; + struct rpcrdma_mr_seg *seg1 = seg; + int i; + + for (i = 1, seg++; i < RPCRDMA_MAX_SEGS; seg++, i++) + rpcrdma_buffer_put_mr(&seg->mr_chunk.rl_mw, buf); + rpcrdma_buffer_put_mr(&seg1->mr_chunk.rl_mw, buf); +} + +static void +rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + buf->rb_send_bufs[--buf->rb_send_index] = req; + req->rl_niovs = 0; + if (req->rl_reply) { + buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply; + req->rl_reply->rr_func = NULL; + req->rl_reply = NULL; + } +} + +/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). + * Redo only the ib_post_send(). + */ +static void +rpcrdma_retry_local_inv(struct rpcrdma_mw *r, struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = + container_of(ia, struct rpcrdma_xprt, rx_ia); + struct ib_send_wr invalidate_wr, *bad_wr; + int rc; + + dprintk("RPC: %s: FRMR %p is stale\n", __func__, r); + + /* When this FRMR is re-inserted into rb_mws, it is no longer stale */ + r->r.frmr.fr_state = FRMR_IS_INVALID; + + memset(&invalidate_wr, 0, sizeof(invalidate_wr)); + invalidate_wr.wr_id = (unsigned long)(void *)r; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.ex.invalidate_rkey = r->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + dprintk("RPC: %s: frmr %p invalidating rkey %08x\n", + __func__, r, r->r.frmr.fr_mr->rkey); + + read_lock(&ia->ri_qplock); + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); + if (rc) { + /* Force rpcrdma_buffer_get() to retry */ + r->r.frmr.fr_state = FRMR_IS_STALE; + dprintk("RPC: %s: ib_post_send failed, %i\n", + __func__, rc); + } +} + +static void +rpcrdma_retry_flushed_linv(struct list_head *stale, + struct rpcrdma_buffer *buf) +{ + struct rpcrdma_ia *ia = rdmab_to_ia(buf); + struct list_head *pos; + struct rpcrdma_mw *r; + unsigned long flags; + + list_for_each(pos, stale) { + r = list_entry(pos, struct rpcrdma_mw, mw_list); + rpcrdma_retry_local_inv(r, ia); + } + + spin_lock_irqsave(&buf->rb_lock, flags); + list_splice_tail(stale, &buf->rb_mws); + spin_unlock_irqrestore(&buf->rb_lock, flags); +} + +static struct rpcrdma_req * +rpcrdma_buffer_get_frmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf, + struct list_head *stale) +{ + struct rpcrdma_mw *r; + int i; + + i = RPCRDMA_MAX_SEGS - 1; while (!list_empty(&buf->rb_mws)) { r = list_entry(buf->rb_mws.next, - struct rpcrdma_mw, mw_list); + struct rpcrdma_mw, mw_list); list_del(&r->mw_list); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s:" - " ib_dereg_mr" - " failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - break; - case RPCRDMA_MTHCAFMR: - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s:" - " ib_dealloc_fmr" - " failed %i\n", - __func__, rc); - break; - default: - break; + if (r->r.frmr.fr_state == FRMR_IS_STALE) { + list_add(&r->mw_list, stale); + continue; } + req->rl_segments[i].mr_chunk.rl_mw = r; + if (unlikely(i-- == 0)) + return req; /* Success */ } - kfree(buf->rb_pool); + /* Not enough entries on rb_mws for this req */ + rpcrdma_buffer_put_sendbuf(req, buf); + rpcrdma_buffer_put_mrs(req, buf); + return NULL; +} + +static struct rpcrdma_req * +rpcrdma_buffer_get_fmrs(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int i; + + i = RPCRDMA_MAX_SEGS - 1; + while (!list_empty(&buf->rb_mws)) { + r = list_entry(buf->rb_mws.next, + struct rpcrdma_mw, mw_list); + list_del(&r->mw_list); + req->rl_segments[i].mr_chunk.rl_mw = r; + if (unlikely(i-- == 0)) + return req; /* Success */ + } + + /* Not enough entries on rb_mws for this req */ + rpcrdma_buffer_put_sendbuf(req, buf); + rpcrdma_buffer_put_mrs(req, buf); + return NULL; } /* @@ -1254,10 +1498,10 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { + struct rpcrdma_ia *ia = rdmab_to_ia(buffers); + struct list_head stale; struct rpcrdma_req *req; unsigned long flags; - int i; - struct rpcrdma_mw *r; spin_lock_irqsave(&buffers->rb_lock, flags); if (buffers->rb_send_index == buffers->rb_max_requests) { @@ -1277,16 +1521,21 @@ rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; } buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; - if (!list_empty(&buffers->rb_mws)) { - i = RPCRDMA_MAX_SEGS - 1; - do { - r = list_entry(buffers->rb_mws.next, - struct rpcrdma_mw, mw_list); - list_del(&r->mw_list); - req->rl_segments[i].mr_chunk.rl_mw = r; - } while (--i >= 0); + + INIT_LIST_HEAD(&stale); + switch (ia->ri_memreg_strategy) { + case RPCRDMA_FRMR: + req = rpcrdma_buffer_get_frmrs(req, buffers, &stale); + break; + case RPCRDMA_MTHCAFMR: + req = rpcrdma_buffer_get_fmrs(req, buffers); + break; + default: + break; } spin_unlock_irqrestore(&buffers->rb_lock, flags); + if (!list_empty(&stale)) + rpcrdma_retry_flushed_linv(&stale, buffers); return req; } @@ -1299,34 +1548,14 @@ rpcrdma_buffer_put(struct rpcrdma_req *req) { struct rpcrdma_buffer *buffers = req->rl_buffer; struct rpcrdma_ia *ia = rdmab_to_ia(buffers); - int i; unsigned long flags; spin_lock_irqsave(&buffers->rb_lock, flags); - buffers->rb_send_bufs[--buffers->rb_send_index] = req; - req->rl_niovs = 0; - if (req->rl_reply) { - buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply; - req->rl_reply->rr_func = NULL; - req->rl_reply = NULL; - } + rpcrdma_buffer_put_sendbuf(req, buffers); switch (ia->ri_memreg_strategy) { case RPCRDMA_FRMR: case RPCRDMA_MTHCAFMR: - /* - * Cycle mw's back in reverse order, and "spin" them. - * This delays and scrambles reuse as much as possible. - */ - i = 1; - do { - struct rpcrdma_mw **mw; - mw = &req->rl_segments[i].mr_chunk.rl_mw; - list_add_tail(&(*mw)->mw_list, &buffers->rb_mws); - *mw = NULL; - } while (++i < RPCRDMA_MAX_SEGS); - list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list, - &buffers->rb_mws); - req->rl_segments[0].mr_chunk.rl_mw = NULL; + rpcrdma_buffer_put_mrs(req, buffers); break; default: break; @@ -1388,6 +1617,9 @@ rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, */ iov->addr = ib_dma_map_single(ia->ri_id->device, va, len, DMA_BIDIRECTIONAL); + if (ib_dma_mapping_error(ia->ri_id->device, iov->addr)) + return -ENOMEM; + iov->length = len; if (ia->ri_have_dma_lkey) { @@ -1483,8 +1715,10 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, struct rpcrdma_xprt *r_xprt) { struct rpcrdma_mr_seg *seg1 = seg; - struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr; - + struct rpcrdma_mw *mw = seg1->mr_chunk.rl_mw; + struct rpcrdma_frmr *frmr = &mw->r.frmr; + struct ib_mr *mr = frmr->fr_mr; + struct ib_send_wr fastreg_wr, *bad_wr; u8 key; int len, pageoff; int i, rc; @@ -1502,8 +1736,7 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, rpcrdma_map_one(ia, seg, writing); pa = seg->mr_dma; for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { - seg1->mr_chunk.rl_mw->r.frmr.fr_pgl-> - page_list[page_no++] = pa; + frmr->fr_pgl->page_list[page_no++] = pa; pa += PAGE_SIZE; } len += seg->mr_len; @@ -1515,65 +1748,51 @@ rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, break; } dprintk("RPC: %s: Using frmr %p to map %d segments\n", - __func__, seg1->mr_chunk.rl_mw, i); - - if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) { - dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n", - __func__, - seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey); - /* Invalidate before using. */ - memset(&invalidate_wr, 0, sizeof invalidate_wr); - invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; - invalidate_wr.next = &frmr_wr; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.send_flags = IB_SEND_SIGNALED; - invalidate_wr.ex.invalidate_rkey = - seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - post_wr = &invalidate_wr; - } else - post_wr = &frmr_wr; - - /* Prepare FRMR WR */ - memset(&frmr_wr, 0, sizeof frmr_wr); - frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; - frmr_wr.opcode = IB_WR_FAST_REG_MR; - frmr_wr.send_flags = IB_SEND_SIGNALED; - frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma; - frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl; - frmr_wr.wr.fast_reg.page_list_len = page_no; - frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - frmr_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - if (frmr_wr.wr.fast_reg.length < len) { - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - return -EIO; + __func__, mw, i); + + frmr->fr_state = FRMR_IS_VALID; + + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = (unsigned long)(void *)mw; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma; + fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; + fastreg_wr.wr.fast_reg.page_list_len = page_no; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; + if (fastreg_wr.wr.fast_reg.length < len) { + rc = -EIO; + goto out_err; } /* Bump the key */ - key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF); - ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key); + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); - frmr_wr.wr.fast_reg.access_flags = (writing ? + fastreg_wr.wr.fast_reg.access_flags = (writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ); - frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + fastreg_wr.wr.fast_reg.rkey = mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); - rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); - + rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); if (rc) { dprintk("RPC: %s: failed ib_post_send for register," " status %i\n", __func__, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); + ib_update_fast_reg_key(mr, --key); + goto out_err; } else { - seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; + seg1->mr_rkey = mr->rkey; seg1->mr_base = seg1->mr_dma + pageoff; seg1->mr_nsegs = i; seg1->mr_len = len; } *nsegs = i; + return 0; +out_err: + frmr->fr_state = FRMR_IS_INVALID; + while (i--) + rpcrdma_unmap_one(ia, --seg); return rc; } @@ -1585,20 +1804,25 @@ rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, struct ib_send_wr invalidate_wr, *bad_wr; int rc; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); + seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; memset(&invalidate_wr, 0, sizeof invalidate_wr); invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw; invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.send_flags = IB_SEND_SIGNALED; invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey; DECR_CQCOUNT(&r_xprt->rx_ep); + read_lock(&ia->ri_qplock); + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(ia, seg++); rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); - if (rc) + read_unlock(&ia->ri_qplock); + if (rc) { + /* Force rpcrdma_buffer_get() to retry */ + seg1->mr_chunk.rl_mw->r.frmr.fr_state = FRMR_IS_STALE; dprintk("RPC: %s: failed ib_post_send for invalidate," " status %i\n", __func__, rc); + } return rc; } @@ -1656,8 +1880,10 @@ rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l); rc = ib_unmap_fmr(&l); + read_lock(&ia->ri_qplock); while (seg1->mr_nsegs--) rpcrdma_unmap_one(ia, seg++); + read_unlock(&ia->ri_qplock); if (rc) dprintk("RPC: %s: failed ib_unmap_fmr," " status %i\n", __func__, rc); @@ -1673,7 +1899,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, switch (ia->ri_memreg_strategy) { -#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: rpcrdma_map_one(ia, seg, writing); seg->mr_rkey = ia->ri_bind_mem->rkey; @@ -1681,7 +1906,6 @@ rpcrdma_register_external(struct rpcrdma_mr_seg *seg, seg->mr_nsegs = 1; nsegs = 1; break; -#endif /* Registration using frmr registration */ case RPCRDMA_FRMR: @@ -1711,11 +1935,11 @@ rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, switch (ia->ri_memreg_strategy) { -#if RPCRDMA_PERSISTENT_REGISTRATION case RPCRDMA_ALLPHYSICAL: + read_lock(&ia->ri_qplock); rpcrdma_unmap_one(ia, seg); + read_unlock(&ia->ri_qplock); break; -#endif case RPCRDMA_FRMR: rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); @@ -1809,3 +2033,44 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, rc); return rc; } + +/* Physical mapping means one Read/Write list entry per-page. + * All list entries must fit within an inline buffer + * + * NB: The server must return a Write list for NFS READ, + * which has the same constraint. Factor in the inline + * rsize as well. + */ +static size_t +rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + unsigned int inline_size, pages; + + inline_size = min_t(unsigned int, + cdata->inline_wsize, cdata->inline_rsize); + inline_size -= RPCRDMA_HDRLEN_MIN; + pages = inline_size / sizeof(struct rpcrdma_segment); + return pages << PAGE_SHIFT; +} + +static size_t +rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt) +{ + return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; +} + +size_t +rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt) +{ + size_t result; + + switch (r_xprt->rx_ia.ri_memreg_strategy) { + case RPCRDMA_ALLPHYSICAL: + result = rpcrdma_physical_max_payload(r_xprt); + break; + default: + result = rpcrdma_mr_max_payload(r_xprt); + } + return result; +} diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 89e7cd479705..c419498b8f46 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -59,6 +59,7 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { + rwlock_t ri_qplock; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct ib_mr *ri_bind_mem; @@ -98,6 +99,14 @@ struct rpcrdma_ep { #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -137,6 +146,40 @@ struct rpcrdma_rep { }; /* + * struct rpcrdma_mw - external memory region metadata + * + * An external memory region is any buffer or page that is registered + * on the fly (ie, not pre-registered). + * + * Each rpcrdma_buffer has a list of free MWs anchored in rb_mws. During + * call_allocate, rpcrdma_buffer_get() assigns one to each segment in + * an rpcrdma_req. Then rpcrdma_register_external() grabs these to keep + * track of registration metadata while each RPC is pending. + * rpcrdma_deregister_external() uses this metadata to unmap and + * release these resources when an RPC is complete. + */ +enum rpcrdma_frmr_state { + FRMR_IS_INVALID, /* ready to be used */ + FRMR_IS_VALID, /* in use */ + FRMR_IS_STALE, /* failed completion */ +}; + +struct rpcrdma_frmr { + struct ib_fast_reg_page_list *fr_pgl; + struct ib_mr *fr_mr; + enum rpcrdma_frmr_state fr_state; +}; + +struct rpcrdma_mw { + union { + struct ib_fmr *fmr; + struct rpcrdma_frmr frmr; + } r; + struct list_head mw_list; + struct list_head mw_all; +}; + +/* * struct rpcrdma_req -- structure central to the request/reply sequence. * * N of these are associated with a transport instance, and stored in @@ -163,17 +206,7 @@ struct rpcrdma_rep { struct rpcrdma_mr_seg { /* chunk descriptors */ union { /* chunk memory handles */ struct ib_mr *rl_mr; /* if registered directly */ - struct rpcrdma_mw { /* if registered from region */ - union { - struct ib_fmr *fmr; - struct { - struct ib_fast_reg_page_list *fr_pgl; - struct ib_mr *fr_mr; - enum { FRMR_IS_INVALID, FRMR_IS_VALID } state; - } frmr; - } r; - struct list_head mw_list; - } *rl_mw; + struct rpcrdma_mw *rl_mw; /* if registered from region */ } mr_chunk; u64 mr_base; /* registration result */ u32 mr_rkey; /* registration result */ @@ -191,6 +224,7 @@ struct rpcrdma_req { unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ unsigned int rl_connect_cookie; /* retry detection */ + enum rpcrdma_chunktype rl_rtype, rl_wtype; struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];/* chunk segments */ @@ -214,6 +248,7 @@ struct rpcrdma_buffer { atomic_t rb_credits; /* most recent server credits */ int rb_max_requests;/* client max requests */ struct list_head rb_mws; /* optional memory windows/fmrs/frmrs */ + struct list_head rb_all; int rb_send_index; struct rpcrdma_req **rb_send_bufs; int rb_recv_index; @@ -306,7 +341,7 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -int rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_req *); @@ -346,7 +381,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ +ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t); int rpcrdma_marshal_req(struct rpc_rqst *); +size_t rpcrdma_max_payload(struct rpcrdma_xprt *); /* Temporary NFS request map cache. Created in svc_rdma.c */ extern struct kmem_cache *svc_rdma_map_cachep; |