diff options
Diffstat (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c')
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_transport.c | 364 |
1 files changed, 303 insertions, 61 deletions
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 900cb69728c6..6fb493cbd29f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -100,20 +100,29 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) ctxt->xprt = xprt; INIT_LIST_HEAD(&ctxt->dto_q); ctxt->count = 0; + ctxt->frmr = NULL; atomic_inc(&xprt->sc_ctxt_used); return ctxt; } -static void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) +void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) { struct svcxprt_rdma *xprt = ctxt->xprt; int i; for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { - atomic_dec(&xprt->sc_dma_used); - ib_dma_unmap_single(xprt->sc_cm_id->device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); + /* + * Unmap the DMA addr in the SGE if the lkey matches + * the sc_dma_lkey, otherwise, ignore it since it is + * an FRMR lkey and will be unmapped later when the + * last WR that uses it completes. + */ + if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_single(xprt->sc_cm_id->device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); + } } } @@ -150,6 +159,7 @@ struct svc_rdma_req_map *svc_rdma_get_req_map(void) schedule_timeout_uninterruptible(msecs_to_jiffies(500)); } map->count = 0; + map->frmr = NULL; return map; } @@ -316,6 +326,50 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) } /* + * Processs a completion context + */ +static void process_context(struct svcxprt_rdma *xprt, + struct svc_rdma_op_ctxt *ctxt) +{ + svc_rdma_unmap_dma(ctxt); + + switch (ctxt->wr_op) { + case IB_WR_SEND: + if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) + svc_rdma_put_frmr(xprt, ctxt->frmr); + svc_rdma_put_context(ctxt, 1); + break; + + case IB_WR_RDMA_WRITE: + svc_rdma_put_context(ctxt, 0); + break; + + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; + BUG_ON(!read_hdr); + if (test_bit(RDMACTXT_F_FAST_UNREG, &ctxt->flags)) + svc_rdma_put_frmr(xprt, ctxt->frmr); + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + } + svc_rdma_put_context(ctxt, 0); + break; + + default: + printk(KERN_ERR "svcrdma: unexpected completion type, " + "opcode=%d\n", + ctxt->wr_op); + break; + } +} + +/* * Send Queue Completion Handler - potentially called on interrupt context. * * Note that caller must hold a transport reference. @@ -327,17 +381,12 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) struct ib_cq *cq = xprt->sc_sq_cq; int ret; - if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) return; ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_sq_poll); while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { - ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; - xprt = ctxt->xprt; - - svc_rdma_unmap_dma(ctxt); if (wc.status != IB_WC_SUCCESS) /* Close the transport */ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); @@ -346,35 +395,10 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) atomic_dec(&xprt->sc_sq_count); wake_up(&xprt->sc_send_wait); - switch (ctxt->wr_op) { - case IB_WR_SEND: - svc_rdma_put_context(ctxt, 1); - break; - - case IB_WR_RDMA_WRITE: - svc_rdma_put_context(ctxt, 0); - break; - - case IB_WR_RDMA_READ: - if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { - struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; - BUG_ON(!read_hdr); - spin_lock_bh(&xprt->sc_rq_dto_lock); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - list_add_tail(&read_hdr->dto_q, - &xprt->sc_read_complete_q); - spin_unlock_bh(&xprt->sc_rq_dto_lock); - svc_xprt_enqueue(&xprt->sc_xprt); - } - svc_rdma_put_context(ctxt, 0); - break; + ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; + if (ctxt) + process_context(xprt, ctxt); - default: - printk(KERN_ERR "svcrdma: unexpected completion type, " - "opcode=%d, status=%d\n", - wc.opcode, wc.status); - break; - } svc_xprt_put(&xprt->sc_xprt); } @@ -425,10 +449,12 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); + INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); + spin_lock_init(&cma_xprt->sc_frmr_q_lock); cma_xprt->sc_ord = svcrdma_ord; @@ -462,7 +488,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) struct ib_recv_wr recv_wr, *bad_recv_wr; struct svc_rdma_op_ctxt *ctxt; struct page *page; - unsigned long pa; + dma_addr_t pa; int sge_no; int buflen; int ret; @@ -474,13 +500,15 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) BUG_ON(sge_no >= xprt->sc_max_sge); page = svc_rdma_get_page(); ctxt->pages[sge_no] = page; - atomic_inc(&xprt->sc_dma_used); pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa)) + goto err_put_ctxt; + atomic_inc(&xprt->sc_dma_used); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; - ctxt->sge[sge_no].lkey = xprt->sc_phys_mr->lkey; + ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; buflen += PAGE_SIZE; } ctxt->count = sge_no; @@ -496,6 +524,10 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) svc_rdma_put_context(ctxt, 1); } return ret; + + err_put_ctxt: + svc_rdma_put_context(ctxt, 1); + return -ENOMEM; } /* @@ -566,7 +598,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " "event=%d\n", cma_id, cma_id->context, event->event); handle_connect_req(cma_id, - event->param.conn.responder_resources); + event->param.conn.initiator_depth); break; case RDMA_CM_EVENT_ESTABLISHED: @@ -686,6 +718,97 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, return ERR_PTR(ret); } +static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt) +{ + struct ib_mr *mr; + struct ib_fast_reg_page_list *pl; + struct svc_rdma_fastreg_mr *frmr; + + frmr = kmalloc(sizeof(*frmr), GFP_KERNEL); + if (!frmr) + goto err; + + mr = ib_alloc_fast_reg_mr(xprt->sc_pd, RPCSVC_MAXPAGES); + if (!mr) + goto err_free_frmr; + + pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device, + RPCSVC_MAXPAGES); + if (!pl) + goto err_free_mr; + + frmr->mr = mr; + frmr->page_list = pl; + INIT_LIST_HEAD(&frmr->frmr_list); + return frmr; + + err_free_mr: + ib_dereg_mr(mr); + err_free_frmr: + kfree(frmr); + err: + return ERR_PTR(-ENOMEM); +} + +static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_fastreg_mr *frmr; + + while (!list_empty(&xprt->sc_frmr_q)) { + frmr = list_entry(xprt->sc_frmr_q.next, + struct svc_rdma_fastreg_mr, frmr_list); + list_del_init(&frmr->frmr_list); + ib_dereg_mr(frmr->mr); + ib_free_fast_reg_page_list(frmr->page_list); + kfree(frmr); + } +} + +struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_fastreg_mr *frmr = NULL; + + spin_lock_bh(&rdma->sc_frmr_q_lock); + if (!list_empty(&rdma->sc_frmr_q)) { + frmr = list_entry(rdma->sc_frmr_q.next, + struct svc_rdma_fastreg_mr, frmr_list); + list_del_init(&frmr->frmr_list); + frmr->map_len = 0; + frmr->page_list_len = 0; + } + spin_unlock_bh(&rdma->sc_frmr_q_lock); + if (frmr) + return frmr; + + return rdma_alloc_frmr(rdma); +} + +static void frmr_unmap_dma(struct svcxprt_rdma *xprt, + struct svc_rdma_fastreg_mr *frmr) +{ + int page_no; + for (page_no = 0; page_no < frmr->page_list_len; page_no++) { + dma_addr_t addr = frmr->page_list->page_list[page_no]; + if (ib_dma_mapping_error(frmr->mr->device, addr)) + continue; + atomic_dec(&xprt->sc_dma_used); + ib_dma_unmap_single(frmr->mr->device, addr, PAGE_SIZE, + frmr->direction); + } +} + +void svc_rdma_put_frmr(struct svcxprt_rdma *rdma, + struct svc_rdma_fastreg_mr *frmr) +{ + if (frmr) { + frmr_unmap_dma(rdma, frmr); + spin_lock_bh(&rdma->sc_frmr_q_lock); + BUG_ON(!list_empty(&frmr->frmr_list)); + list_add(&frmr->frmr_list, &rdma->sc_frmr_q); + spin_unlock_bh(&rdma->sc_frmr_q_lock); + } +} + /* * This is the xpo_recvfrom function for listening endpoints. Its * purpose is to accept incoming connections. The CMA callback handler @@ -704,6 +827,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; struct ib_device_attr devattr; + int dma_mr_acc; + int need_dma_mr; int ret; int i; @@ -819,15 +944,77 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) } newxprt->sc_qp = newxprt->sc_cm_id->qp; - /* Register all of physical memory */ - newxprt->sc_phys_mr = ib_get_dma_mr(newxprt->sc_pd, - IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE); - if (IS_ERR(newxprt->sc_phys_mr)) { - dprintk("svcrdma: Failed to create DMA MR ret=%d\n", ret); + /* + * Use the most secure set of MR resources based on the + * transport type and available memory management features in + * the device. Here's the table implemented below: + * + * Fast Global DMA Remote WR + * Reg LKEY MR Access + * Sup'd Sup'd Needed Needed + * + * IWARP N N Y Y + * N Y Y Y + * Y N Y N + * Y Y N - + * + * IB N N Y N + * N Y N - + * Y N Y N + * Y Y N - + * + * NB: iWARP requires remote write access for the data sink + * of an RDMA_READ. IB does not. + */ + if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + newxprt->sc_frmr_pg_list_len = + devattr.max_fast_reg_page_list_len; + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; + } + + /* + * Determine if a DMA MR is required and if so, what privs are required + */ + switch (rdma_node_get_transport(newxprt->sc_cm_id->device->node_type)) { + case RDMA_TRANSPORT_IWARP: + newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; + if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) { + need_dma_mr = 1; + dma_mr_acc = + (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); + } else if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else + need_dma_mr = 0; + break; + case RDMA_TRANSPORT_IB: + if (!(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { + need_dma_mr = 1; + dma_mr_acc = IB_ACCESS_LOCAL_WRITE; + } else + need_dma_mr = 0; + break; + default: goto errout; } + /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ + if (need_dma_mr) { + /* Register all of physical memory */ + newxprt->sc_phys_mr = + ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); + if (IS_ERR(newxprt->sc_phys_mr)) { + dprintk("svcrdma: Failed to create DMA MR ret=%d\n", + ret); + goto errout; + } + newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; + } else + newxprt->sc_dma_lkey = + newxprt->sc_cm_id->device->local_dma_lkey; + /* Post receive buffers */ for (i = 0; i < newxprt->sc_max_requests; i++) { ret = svc_rdma_post_recv(newxprt); @@ -961,6 +1148,9 @@ static void __svc_rdma_free(struct work_struct *work) WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); WARN_ON(atomic_read(&rdma->sc_dma_used) != 0); + /* De-allocate fastreg mr */ + rdma_dealloc_frmr_q(rdma); + /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_destroy_qp(rdma->sc_qp); @@ -1014,21 +1204,59 @@ static int svc_rdma_has_wspace(struct svc_xprt *xprt) return 1; } +/* + * Attempt to register the kvec representing the RPC memory with the + * device. + * + * Returns: + * NULL : The device does not support fastreg or there were no more + * fastreg mr. + * frmr : The kvec register request was successfully posted. + * <0 : An error was encountered attempting to register the kvec. + */ +int svc_rdma_fastreg(struct svcxprt_rdma *xprt, + struct svc_rdma_fastreg_mr *frmr) +{ + struct ib_send_wr fastreg_wr; + u8 key; + + /* Bump the key */ + key = (u8)(frmr->mr->lkey & 0x000000FF); + ib_update_fast_reg_key(frmr->mr, ++key); + + /* Prepare FASTREG WR */ + memset(&fastreg_wr, 0, sizeof fastreg_wr); + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.send_flags = IB_SEND_SIGNALED; + fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva; + fastreg_wr.wr.fast_reg.page_list = frmr->page_list; + fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.length = frmr->map_len; + fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags; + fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey; + return svc_rdma_send(xprt, &fastreg_wr); +} + int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) { - struct ib_send_wr *bad_wr; + struct ib_send_wr *bad_wr, *n_wr; + int wr_count; + int i; int ret; if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) return -ENOTCONN; BUG_ON(wr->send_flags != IB_SEND_SIGNALED); - BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != - wr->opcode); + wr_count = 1; + for (n_wr = wr->next; n_wr; n_wr = n_wr->next) + wr_count++; + /* If the SQ is full, wait until an SQ entry is available */ while (1) { spin_lock_bh(&xprt->sc_lock); - if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { + if (xprt->sc_sq_depth < atomic_read(&xprt->sc_sq_count) + wr_count) { spin_unlock_bh(&xprt->sc_lock); atomic_inc(&rdma_stat_sq_starve); @@ -1043,19 +1271,26 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) return 0; continue; } - /* Bumped used SQ WR count and post */ - svc_xprt_get(&xprt->sc_xprt); + /* Take a transport ref for each WR posted */ + for (i = 0; i < wr_count; i++) + svc_xprt_get(&xprt->sc_xprt); + + /* Bump used SQ WR count and post */ + atomic_add(wr_count, &xprt->sc_sq_count); ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); - if (!ret) - atomic_inc(&xprt->sc_sq_count); - else { - svc_xprt_put(&xprt->sc_xprt); + if (ret) { + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + atomic_sub(wr_count, &xprt->sc_sq_count); + for (i = 0; i < wr_count; i ++) + svc_xprt_put(&xprt->sc_xprt); dprintk("svcrdma: failed to post SQ WR rc=%d, " "sc_sq_count=%d, sc_sq_depth=%d\n", ret, atomic_read(&xprt->sc_sq_count), xprt->sc_sq_depth); } spin_unlock_bh(&xprt->sc_lock); + if (ret) + wake_up(&xprt->sc_send_wait); break; } return ret; @@ -1079,10 +1314,14 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); /* Prepare SGE for local address */ - atomic_inc(&xprt->sc_dma_used); sge.addr = ib_dma_map_page(xprt->sc_cm_id->device, p, 0, PAGE_SIZE, DMA_FROM_DEVICE); - sge.lkey = xprt->sc_phys_mr->lkey; + if (ib_dma_mapping_error(xprt->sc_cm_id->device, sge.addr)) { + put_page(p); + return; + } + atomic_inc(&xprt->sc_dma_used); + sge.lkey = xprt->sc_dma_lkey; sge.length = length; ctxt = svc_rdma_get_context(xprt); @@ -1103,6 +1342,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, if (ret) { dprintk("svcrdma: Error %d posting send for protocol error\n", ret); + ib_dma_unmap_page(xprt->sc_cm_id->device, + sge.addr, PAGE_SIZE, + DMA_FROM_DEVICE); svc_rdma_put_context(ctxt, 1); } } |