From 6720a89933739cb8dec748cd253f7c8df2c0ae4d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Feb 2018 15:30:27 -0500 Subject: xprtrdma: Fix latency regression on NUMA NFS/RDMA clients With v4.15, on one of my NFS/RDMA clients I measured a nearly doubling in the latency of small read and write system calls. There was no change in server round trip time. The extra latency appears in the whole RPC execution path. "git bisect" settled on commit ccede7598588 ("xprtrdma: Spread reply processing over more CPUs") . After some experimentation, I found that leaving the WQ bound and allowing the scheduler to pick the dispatch CPU seems to eliminate the long latencies, and it does not introduce any new regressions. The fix is implemented by reverting only the part of commit ccede7598588 ("xprtrdma: Spread reply processing over more CPUs") that dispatches RPC replies specifically on the CPU where the matching RPC call was made. Interestingly, saving the CPU number and later queuing reply processing there was effective _only_ for a NFS READ and WRITE request. On my NUMA client, in-kernel RPC reply processing for asynchronous RPCs was dispatched on the same CPU where the RPC call was made, as expected. However synchronous RPCs seem to get their reply dispatched on some other CPU than where the call was placed, every time. Fixes: ccede7598588 ("xprtrdma: Spread reply processing over ... ") Signed-off-by: Chuck Lever Cc: stable@vger.kernel.org # v4.15+ Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/rpc_rdma.c | 2 +- net/sunrpc/xprtrdma/transport.c | 2 -- net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index f0855a959a27..4bc0f4d94a01 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -1366,7 +1366,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); - queue_work_on(req->rl_cpu, rpcrdma_receive_wq, &rep->rr_work); + queue_work(rpcrdma_receive_wq, &rep->rr_work); return; out_badstatus: diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 4b1ecfe979cf..f86021e3b853 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -52,7 +52,6 @@ #include #include #include -#include #include "xprt_rdma.h" @@ -651,7 +650,6 @@ xprt_rdma_allocate(struct rpc_task *task) if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) goto out_fail; - req->rl_cpu = smp_processor_id(); req->rl_connect_cookie = 0; /* our reserved value */ rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 69883a960a3f..430a6de8300e 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -334,7 +334,6 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; - int rl_cpu; unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; -- cgit v1.2.3 From b7e85fff52eec510eac147462615fd6bc580969a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Feb 2018 15:30:33 -0500 Subject: xprtrdma: Remove arbitrary limit on initiator depth Clean up: We need to check only that the value does not exceed the range of the u8 field it's going into. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index e6f84a6434a0..7cc346599384 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -589,11 +589,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; - if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ - ep->rep_remote_cma.responder_resources = 32; - else - ep->rep_remote_cma.responder_resources = - ia->ri_device->attrs.max_qp_rd_atom; + ep->rep_remote_cma.responder_resources = + min_t(int, U8_MAX, ia->ri_device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes quickly. RPC layer handles re-establishing -- cgit v1.2.3 From 8a14793e7aa718d16382e18cadec92e2e531e62a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Feb 2018 15:30:38 -0500 Subject: xprtrdma: Remove xprt-specific connect cookie Clean up: The generic rq_connect_cookie is sufficient to detect RPC Call retransmission. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/transport.c | 6 +----- net/sunrpc/xprtrdma/verbs.c | 2 ++ net/sunrpc/xprtrdma/xprt_rdma.h | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index f86021e3b853..47b4604e820a 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -236,8 +236,6 @@ rpcrdma_connect_worker(struct work_struct *work) struct rpc_xprt *xprt = &r_xprt->rx_xprt; spin_lock_bh(&xprt->transport_lock); - if (++xprt->connect_cookie == 0) /* maintain a reserved value */ - ++xprt->connect_cookie; if (ep->rep_connected > 0) { if (!xprt_test_and_set_connected(xprt)) xprt_wake_pending_tasks(xprt, 0); @@ -650,7 +648,6 @@ xprt_rdma_allocate(struct rpc_task *task) if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) goto out_fail; - req->rl_connect_cookie = 0; /* our reserved value */ rpcrdma_set_xprtdata(rqst, req); rqst->rq_buffer = req->rl_sendbuf->rg_base; rqst->rq_rbuffer = req->rl_recvbuf->rg_base; @@ -721,9 +718,8 @@ xprt_rdma_send_request(struct rpc_task *task) rpcrdma_recv_buffer_get(req); /* Must suppress retransmit to maintain credits */ - if (req->rl_connect_cookie == xprt->connect_cookie) + if (rqst->rq_connect_cookie == xprt->connect_cookie) goto drop_connection; - req->rl_connect_cookie = xprt->connect_cookie; __set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 7cc346599384..520e7e4fe203 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -255,6 +255,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) /* Return 1 to ensure the core destroys the id. */ return 1; case RDMA_CM_EVENT_ESTABLISHED: + ++xprt->rx_xprt.connect_cookie; connstate = 1; rpcrdma_update_connect_private(xprt, &event->param.conn); goto connected; @@ -273,6 +274,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) connstate = -EAGAIN; goto connected; case RDMA_CM_EVENT_DISCONNECTED: + ++xprt->rx_xprt.connect_cookie; connstate = -ECONNABORTED; connected: xprt->rx_buf.rb_credits = 1; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 430a6de8300e..29ea0b4e98f9 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -334,7 +334,6 @@ enum { struct rpcrdma_buffer; struct rpcrdma_req { struct list_head rl_list; - unsigned int rl_connect_cookie; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply; struct xdr_stream rl_stream; -- cgit v1.2.3 From 9e679d5e7660eb3e75255a4f583d44789ad1b743 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Feb 2018 15:30:44 -0500 Subject: xprtrdma: ->send_request returns -EAGAIN when there are no free MRs Currently, when the MR free list is exhausted during marshaling, the RPC/RDMA transport places the RPC task on the delayq, which forces a wait for HZ >> 2 before the marshal and send is retried. With this change, the transport now places such an RPC task on the pending queue, and wakes it just as soon as more MRs have been created. Creating more MRs typically takes less than a millisecond, and this waking mechanism is less deadlock-prone. Moreover, the waiting RPC task is holding the transport's write lock, which blocks the transport from sending RPCs. Therefore faster recovery from MR exhaustion is desirable. This is the same mechanism that the TCP transport utilizes when handling write buffer space exhaustion. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 2 +- net/sunrpc/xprtrdma/frwr_ops.c | 2 +- net/sunrpc/xprtrdma/rpc_rdma.c | 30 +++++++++++++++++++++--------- net/sunrpc/xprtrdma/transport.c | 3 ++- net/sunrpc/xprtrdma/verbs.c | 3 ++- 5 files changed, 27 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index d5f95bb39300..629e53973321 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -191,7 +191,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr = rpcrdma_mr_get(r_xprt); if (!mr) - return ERR_PTR(-ENOBUFS); + return ERR_PTR(-EAGAIN); pageoff = offset_in_page(seg1->mr_offset); seg1->mr_offset -= pageoff; /* start of page */ diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 90f688f19783..e21781cb20be 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -367,7 +367,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, rpcrdma_mr_defer_recovery(mr); mr = rpcrdma_mr_get(r_xprt); if (!mr) - return ERR_PTR(-ENOBUFS); + return ERR_PTR(-EAGAIN); } while (mr->frwr.fr_state != FRWR_IS_INVALID); frwr = &mr->frwr; frwr->fr_state = FRWR_IS_VALID; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 4bc0f4d94a01..e8adad33d0bb 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -365,7 +365,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false, &mr); if (IS_ERR(seg)) - return PTR_ERR(seg); + goto out_maperr; rpcrdma_mr_push(mr, &req->rl_registered); if (encode_read_segment(xdr, mr, pos) < 0) @@ -377,6 +377,11 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, } while (nsegs); return 0; + +out_maperr: + if (PTR_ERR(seg) == -EAGAIN) + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + return PTR_ERR(seg); } /* Register and XDR encode the Write list. Supports encoding a list @@ -423,7 +428,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true, &mr); if (IS_ERR(seg)) - return PTR_ERR(seg); + goto out_maperr; rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) @@ -440,6 +445,11 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, *segcount = cpu_to_be32(nchunks); return 0; + +out_maperr: + if (PTR_ERR(seg) == -EAGAIN) + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + return PTR_ERR(seg); } /* Register and XDR encode the Reply chunk. Supports encoding an array @@ -481,7 +491,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true, &mr); if (IS_ERR(seg)) - return PTR_ERR(seg); + goto out_maperr; rpcrdma_mr_push(mr, &req->rl_registered); if (encode_rdma_segment(xdr, mr) < 0) @@ -498,6 +508,11 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, *segcount = cpu_to_be32(nchunks); return 0; + +out_maperr: + if (PTR_ERR(seg) == -EAGAIN) + xprt_wait_for_buffer_space(rqst->rq_task, NULL); + return PTR_ERR(seg); } /** @@ -724,8 +739,8 @@ rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, * Returns: * %0 if the RPC was sent successfully, * %-ENOTCONN if the connection was lost, - * %-EAGAIN if not enough pages are available for on-demand reply buffer, - * %-ENOBUFS if no MRs are available to register chunks, + * %-EAGAIN if the caller should call again with the same arguments, + * %-ENOBUFS if the caller should call again after a delay, * %-EMSGSIZE if the transport header is too small, * %-EIO if a permanent problem occurred while marshaling. */ @@ -868,10 +883,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) return 0; out_err: - if (ret != -ENOBUFS) { - pr_err("rpcrdma: header marshaling failed (%d)\n", ret); - r_xprt->rx_stats.failed_marshal_count++; - } + r_xprt->rx_stats.failed_marshal_count++; return ret; } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 47b4604e820a..08196896953d 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -689,7 +689,8 @@ xprt_rdma_free(struct rpc_task *task) * Returns: * %0 if the RPC message has been sent * %-ENOTCONN if the caller should reconnect and call again - * %-ENOBUFS if the caller should call again later + * %-EAGAIN if the caller should call again + * %-ENOBUFS if the caller should call again after a delay * %-EIO if a permanent error occurred and the request was not * sent. Do not try to send this message again. */ diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 520e7e4fe203..d36c18f26cd3 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1048,8 +1048,9 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) list_splice(&all, &buf->rb_all); r_xprt->rx_stats.mrs_allocated += count; spin_unlock(&buf->rb_mrlock); - trace_xprtrdma_createmrs(r_xprt, count); + + xprt_write_space(&r_xprt->rx_xprt); } static void -- cgit v1.2.3 From ae741a855170fa97adabce7e48bdf9de71186a5f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Feb 2018 15:30:49 -0500 Subject: xprtrdma: Reduce number of MRs created by rpcrdma_mrs_create Create fewer MRs on average. Many workloads don't need as many as 32 MRs, and the transport can now quickly restock the MR free list. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index d36c18f26cd3..ab8672443c3a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1023,7 +1023,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) LIST_HEAD(free); LIST_HEAD(all); - for (count = 0; count < 32; count++) { + for (count = 0; count < 3; count++) { struct rpcrdma_mr *mr; int rc; -- cgit v1.2.3 From fb14ae8853e4f0347950f98e604fa2f4f3b3abe1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Feb 2018 15:30:54 -0500 Subject: xprtrdma: "Support" call-only RPCs RPC-over-RDMA version 1 credit accounting relies on there being a response message for every RPC Call. This means that RPC procedures that have no reply will disrupt credit accounting, just in the same way as a retransmit would (since it is sent because no reply has arrived). Deal with the "no reply" case the same way. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/clnt.h | 7 +++++++ net/sunrpc/sunrpc.h | 6 ------ net/sunrpc/xprtrdma/transport.c | 6 ++++++ 3 files changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index ed761f751ecb..9b11b6a0978c 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -217,5 +217,12 @@ void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap); void rpc_cleanup_clids(void); + +static inline int rpc_reply_expected(struct rpc_task *task) +{ + return (task->tk_msg.rpc_proc != NULL) && + (task->tk_msg.rpc_proc->p_decode != NULL); +} + #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_CLNT_H */ diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h index f2b7cb540e61..09a0315ea77b 100644 --- a/net/sunrpc/sunrpc.h +++ b/net/sunrpc/sunrpc.h @@ -37,12 +37,6 @@ struct rpc_buffer { char data[]; }; -static inline int rpc_reply_expected(struct rpc_task *task) -{ - return (task->tk_msg.rpc_proc != NULL) && - (task->tk_msg.rpc_proc->p_decode != NULL); -} - static inline int sock_is_loopback(struct sock *sk) { struct dst_entry *dst; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 08196896953d..7e39faa90c41 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -728,6 +728,12 @@ xprt_rdma_send_request(struct rpc_task *task) rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; rqst->rq_bytes_sent = 0; + + /* An RPC with no reply will throw off credit accounting, + * so drop the connection to reset the credit grant. + */ + if (!rpc_reply_expected(task)) + goto drop_connection; return 0; failed_marshal: -- cgit v1.2.3 From f2877623082b720c1424b163cf905fff8eed4126 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Feb 2018 15:30:59 -0500 Subject: xprtrdma: Chain Send to FastReg WRs With FRWR, the client transport can perform memory registration and post a Send with just a single ib_post_send. This reduces contention between the send_request path and the Send Completion handlers, and reduces the overhead of registering a chunk that has multiple segments. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/fmr_ops.c | 11 +++++++++ net/sunrpc/xprtrdma/frwr_ops.c | 51 ++++++++++++++++++++++++++++------------- net/sunrpc/xprtrdma/verbs.c | 3 +-- net/sunrpc/xprtrdma/xprt_rdma.h | 2 ++ 4 files changed, 49 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 629e53973321..5cc68a824f45 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -251,6 +251,16 @@ out_maperr: return ERR_PTR(-EIO); } +/* Post Send WR containing the RPC Call message. + */ +static int +fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + struct ib_send_wr *bad_wr; + + return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, &bad_wr); +} + /* Invalidate all memory regions that were registered for "req". * * Sleeps until it is safe for the host CPU to access the @@ -305,6 +315,7 @@ out_reset: const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, + .ro_send = fmr_op_send, .ro_unmap_sync = fmr_op_unmap_sync, .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index e21781cb20be..c5743a0960be 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -357,8 +357,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, struct rpcrdma_mr *mr; struct ib_mr *ibmr; struct ib_reg_wr *reg_wr; - struct ib_send_wr *bad_wr; - int rc, i, n; + int i, n; u8 key; mr = NULL; @@ -407,22 +406,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, ib_update_fast_reg_key(ibmr, ++key); reg_wr = &frwr->fr_regwr; - reg_wr->wr.next = NULL; - reg_wr->wr.opcode = IB_WR_REG_MR; - frwr->fr_cqe.done = frwr_wc_fastreg; - reg_wr->wr.wr_cqe = &frwr->fr_cqe; - reg_wr->wr.num_sge = 0; - reg_wr->wr.send_flags = 0; reg_wr->mr = ibmr; reg_wr->key = ibmr->rkey; reg_wr->access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ; - rc = ib_post_send(ia->ri_id->qp, ®_wr->wr, &bad_wr); - if (rc) - goto out_senderr; - mr->mr_handle = ibmr->rkey; mr->mr_length = ibmr->length; mr->mr_offset = ibmr->iova; @@ -442,11 +431,40 @@ out_mapmr_err: frwr->fr_mr, n, mr->mr_nents); rpcrdma_mr_defer_recovery(mr); return ERR_PTR(-EIO); +} -out_senderr: - pr_err("rpcrdma: FRWR registration ib_post_send returned %i\n", rc); - rpcrdma_mr_defer_recovery(mr); - return ERR_PTR(-ENOTCONN); +/* Post Send WR containing the RPC Call message. + * + * For FRMR, chain any FastReg WRs to the Send WR. Only a + * single ib_post_send call is needed to register memory + * and then post the Send WR. + */ +static int +frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +{ + struct ib_send_wr *post_wr, *bad_wr; + struct rpcrdma_mr *mr; + + post_wr = &req->rl_sendctx->sc_wr; + list_for_each_entry(mr, &req->rl_registered, mr_list) { + struct rpcrdma_frwr *frwr; + + frwr = &mr->frwr; + + frwr->fr_cqe.done = frwr_wc_fastreg; + frwr->fr_regwr.wr.next = post_wr; + frwr->fr_regwr.wr.wr_cqe = &frwr->fr_cqe; + frwr->fr_regwr.wr.num_sge = 0; + frwr->fr_regwr.wr.opcode = IB_WR_REG_MR; + frwr->fr_regwr.wr.send_flags = 0; + + post_wr = &frwr->fr_regwr.wr; + } + + /* If ib_post_send fails, the next ->send_request for + * @req will queue these MWs for recovery. + */ + return ib_post_send(ia->ri_id->qp, post_wr, &bad_wr); } /* Handle a remotely invalidated mr on the @mrs list @@ -561,6 +579,7 @@ reset_mrs: const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, + .ro_send = frwr_op_send, .ro_reminv = frwr_op_reminv, .ro_unmap_sync = frwr_op_unmap_sync, .ro_recover_mr = frwr_op_recover_mr, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index ab8672443c3a..626fd3074186 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1535,7 +1535,6 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, struct rpcrdma_req *req) { struct ib_send_wr *send_wr = &req->rl_sendctx->sc_wr; - struct ib_send_wr *send_wr_fail; int rc; if (req->rl_reply) { @@ -1554,7 +1553,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, --ep->rep_send_count; } - rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail); + rc = ia->ri_ops->ro_send(ia, req); trace_xprtrdma_post_send(req, rc); if (rc) return -ENOTCONN; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 29ea0b4e98f9..3d3b423fa9c1 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -472,6 +472,8 @@ struct rpcrdma_memreg_ops { (*ro_map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool, struct rpcrdma_mr **); + int (*ro_send)(struct rpcrdma_ia *ia, + struct rpcrdma_req *req); void (*ro_reminv)(struct rpcrdma_rep *rep, struct list_head *mrs); void (*ro_unmap_sync)(struct rpcrdma_xprt *, -- cgit v1.2.3 From 2dd4a012d9e73c423a8c48d7e0f2e427caecce3d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Wed, 28 Feb 2018 15:31:05 -0500 Subject: xprtrdma: Move creation of rl_rdmabuf to rpcrdma_create_req Refactor: Both rpcrdma_create_req call sites have to allocate the buffer where the transport header is built, so just move that allocation into rpcrdma_create_req. This buffer is a fixed size. There's no needed information available in call_allocate that is not also available when the transport is created. The original purpose for allocating these buffers on demand was to reduce the possibility that an allocation failure during transport creation will hork the mount operation during low memory scenarios. Some relief for this rare possibility is coming up in the next few patches. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/backchannel.c | 7 ------- net/sunrpc/xprtrdma/transport.c | 25 ------------------------- net/sunrpc/xprtrdma/verbs.c | 14 ++++++++++++-- 3 files changed, 12 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index ed1a4a3065ee..47ebac949769 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -44,13 +44,6 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, if (IS_ERR(req)) return PTR_ERR(req); - rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, - DMA_TO_DEVICE, GFP_KERNEL); - if (IS_ERR(rb)) - goto out_fail; - req->rl_rdmabuf = rb; - xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); - size = r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); if (IS_ERR(rb)) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 7e39faa90c41..67e438612c18 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -537,29 +537,6 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) } } -/* Allocate a fixed-size buffer in which to construct and send the - * RPC-over-RDMA header for this request. - */ -static bool -rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, - gfp_t flags) -{ - size_t size = RPCRDMA_HDRBUF_SIZE; - struct rpcrdma_regbuf *rb; - - if (req->rl_rdmabuf) - return true; - - rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags); - if (IS_ERR(rb)) - return false; - - r_xprt->rx_stats.hardway_register_count += size; - req->rl_rdmabuf = rb; - xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); - return true; -} - static bool rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, size_t size, gfp_t flags) @@ -641,8 +618,6 @@ xprt_rdma_allocate(struct rpc_task *task) if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; - if (!rpcrdma_get_rdmabuf(r_xprt, req, flags)) - goto out_fail; if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags)) goto out_fail; if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags)) diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 626fd3074186..6a7a5a277e75 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1068,17 +1068,27 @@ struct rpcrdma_req * rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; + struct rpcrdma_regbuf *rb; struct rpcrdma_req *req; req = kzalloc(sizeof(*req), GFP_KERNEL); if (req == NULL) return ERR_PTR(-ENOMEM); + rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE, + DMA_TO_DEVICE, GFP_KERNEL); + if (IS_ERR(rb)) { + kfree(req); + return ERR_PTR(-ENOMEM); + } + req->rl_rdmabuf = rb; + xdr_buf_init(&req->rl_hdrbuf, rb->rg_base, rdmab_length(rb)); + req->rl_buffer = buffer; + INIT_LIST_HEAD(&req->rl_registered); + spin_lock(&buffer->rb_reqslock); list_add(&req->rl_all, &buffer->rb_allreqs); spin_unlock(&buffer->rb_reqslock); - req->rl_buffer = &r_xprt->rx_buf; - INIT_LIST_HEAD(&req->rl_registered); return req; } -- cgit v1.2.3 From ecd465ee88bb6648c06c82b1abae6ec28cf5fccb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 5 Mar 2018 15:12:57 -0500 Subject: SUNRPC: Move xprt_update_rtt callsite Since commit 33849792cbcd ("xprtrdma: Detect unreachable NFS/RDMA servers more reliably"), the xprtrdma transport now has a ->timer callout. But xprtrdma does not need to compute RTT data, only UDP needs that. Move the xprt_update_rtt call into the UDP transport implementation. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/xprt.c | 11 ++++++++--- net/sunrpc/xprtsock.c | 1 + 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 7fad83881ce1..ad322cec049a 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -373,6 +373,7 @@ void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action); void xprt_write_space(struct rpc_xprt *xprt); void xprt_adjust_cwnd(struct rpc_xprt *xprt, struct rpc_task *task, int result); struct rpc_rqst * xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid); +void xprt_update_rtt(struct rpc_task *task); void xprt_complete_rqst(struct rpc_task *task, int copied); void xprt_pin_rqst(struct rpc_rqst *req); void xprt_unpin_rqst(struct rpc_rqst *req); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 8f0ad4f268da..13fbb4849188 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -889,7 +889,13 @@ __must_hold(&req->rq_xprt->recv_lock) } } -static void xprt_update_rtt(struct rpc_task *task) +/** + * xprt_update_rtt - Update RPC RTT statistics + * @task: RPC request that recently completed + * + * Caller holds xprt->recv_lock. + */ +void xprt_update_rtt(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_rtt *rtt = task->tk_client->cl_rtt; @@ -902,6 +908,7 @@ static void xprt_update_rtt(struct rpc_task *task) rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); } } +EXPORT_SYMBOL_GPL(xprt_update_rtt); /** * xprt_complete_rqst - called when reply processing is complete @@ -921,8 +928,6 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) xprt->stat.recvs++; req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime); - if (xprt->ops->timer != NULL) - xprt_update_rtt(task); list_del_init(&req->rq_list); req->rq_private_buf.len = copied; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a6b8c1f8f92a..61df77f41304 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1060,6 +1060,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, if (!rovr) goto out_unlock; xprt_pin_rqst(rovr); + xprt_update_rtt(rovr->rq_task); spin_unlock(&xprt->recv_lock); task = rovr->rq_task; -- cgit v1.2.3 From 0b87a46b437c1629bc7d79f3c5a0ba3608c37544 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 5 Mar 2018 15:13:02 -0500 Subject: SUNRPC: Make RTT measurement more precise (Receive) Some RPC transports have more overhead in their reply handlers than others. For example, for RPC-over-RDMA: - RPC completion has to wait for memory invalidation, which is not a part of the server/network round trip - Recently a context switch was introduced into the reply handler, which further artificially inflates the measure of RPC RTT To capture just server and network latencies more precisely: when receiving a reply, compute the RTT as soon as the XID is recognized rather than at RPC completion time. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 13fbb4849188..cb7784c5b741 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -826,6 +826,7 @@ static void xprt_connect_status(struct rpc_task *task) * @xprt: transport on which the original request was transmitted * @xid: RPC XID of incoming reply * + * Caller holds xprt->recv_lock. */ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) { @@ -834,6 +835,7 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) list_for_each_entry(entry, &xprt->recv, rq_list) if (entry->rq_xid == xid) { trace_xprt_lookup_rqst(xprt, xid, 0); + entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); return entry; } @@ -915,7 +917,7 @@ EXPORT_SYMBOL_GPL(xprt_update_rtt); * @task: RPC request that recently completed * @copied: actual number of bytes received from the transport * - * Caller holds transport lock. + * Caller holds xprt->recv_lock. */ void xprt_complete_rqst(struct rpc_task *task, int copied) { @@ -927,7 +929,6 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) trace_xprt_complete_rqst(xprt, req->rq_xid, copied); xprt->stat.recvs++; - req->rq_rtt = ktime_sub(ktime_get(), req->rq_xtime); list_del_init(&req->rq_list); req->rq_private_buf.len = copied; -- cgit v1.2.3 From 78215759e20d859b8f1de7d0aebd08878fbc4eed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 5 Mar 2018 15:13:07 -0500 Subject: SUNRPC: Make RTT measurement more precise (Send) Some RPC transports have more overhead in their send_request callouts than others. For example, for RPC-over-RDMA: - Marshaling an RPC often has to DMA map the RPC arguments - Registration methods perform memory registration as part of marshaling To capture just server and network latencies more precisely: when sending a Call, capture the rq_xtime timestamp _after_ the transport header has been marshaled. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- net/sunrpc/xprt.c | 1 - net/sunrpc/xprtrdma/transport.c | 1 + net/sunrpc/xprtsock.c | 3 +++ 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index cb7784c5b741..73f05a1fbc19 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1033,7 +1033,6 @@ void xprt_transmit(struct rpc_task *task) return; connect_cookie = xprt->connect_cookie; - req->rq_xtime = ktime_get(); status = xprt->ops->send_request(task); trace_xprt_transmit(xprt, req->rq_xid, status); if (status != 0) { diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 67e438612c18..cc1aad325496 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -696,6 +696,7 @@ xprt_rdma_send_request(struct rpc_task *task) /* Must suppress retransmit to maintain credits */ if (rqst->rq_connect_cookie == xprt->connect_cookie) goto drop_connection; + rqst->rq_xtime = ktime_get(); __set_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 61df77f41304..e3c6a3df278b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -527,6 +527,7 @@ static int xs_local_send_request(struct rpc_task *task) xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); + req->rq_xtime = ktime_get(); status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent, true, &sent); dprintk("RPC: %s(%u) = %d\n", @@ -589,6 +590,7 @@ static int xs_udp_send_request(struct rpc_task *task) if (!xprt_bound(xprt)) return -ENOTCONN; + req->rq_xtime = ktime_get(); status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen, xdr, req->rq_bytes_sent, true, &sent); @@ -678,6 +680,7 @@ static int xs_tcp_send_request(struct rpc_task *task) /* Continue transmitting the packet/record. We must be careful * to cope with writespace callbacks arriving _after_ we have * called sendmsg(). */ + req->rq_xtime = ktime_get(); while (1) { sent = 0; status = xs_sendpages(transport->sock, NULL, 0, xdr, -- cgit v1.2.3 From ff699ea8269a02d977c6ee42d58f76efe83a34f9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 5 Mar 2018 15:13:13 -0500 Subject: SUNRPC: Make num_reqs a non-atomic integer If recording xprt->stat.max_slots is moved into xprt_alloc_slot, then xprt->num_reqs is never manipulated outside xprt->reserve_lock. There's no longer a need for xprt->num_reqs to be atomic. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xprt.h | 2 +- net/sunrpc/xprt.c | 17 +++++++++-------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index ad322cec049a..5fea0fb420df 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -197,7 +197,7 @@ struct rpc_xprt { struct list_head free; /* free slots */ unsigned int max_reqs; /* max number of slots */ unsigned int min_reqs; /* min number of slots */ - atomic_t num_reqs; /* total slots */ + unsigned int num_reqs; /* total slots */ unsigned long state; /* transport state */ unsigned char resvport : 1; /* use a reserved port */ atomic_t swapper; /* we're swapping over this diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 73f05a1fbc19..70f005044f06 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1009,7 +1009,7 @@ void xprt_transmit(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; unsigned int connect_cookie; - int status, numreqs; + int status; dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); @@ -1047,9 +1047,6 @@ void xprt_transmit(struct rpc_task *task) xprt->ops->set_retrans_timeout(task); - numreqs = atomic_read(&xprt->num_reqs); - if (numreqs > xprt->stat.max_slots) - xprt->stat.max_slots = numreqs; xprt->stat.sends++; xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; xprt->stat.bklog_u += xprt->backlog.qlen; @@ -1111,14 +1108,15 @@ static struct rpc_rqst *xprt_dynamic_alloc_slot(struct rpc_xprt *xprt) { struct rpc_rqst *req = ERR_PTR(-EAGAIN); - if (!atomic_add_unless(&xprt->num_reqs, 1, xprt->max_reqs)) + if (xprt->num_reqs >= xprt->max_reqs) goto out; + ++xprt->num_reqs; spin_unlock(&xprt->reserve_lock); req = kzalloc(sizeof(struct rpc_rqst), GFP_NOFS); spin_lock(&xprt->reserve_lock); if (req != NULL) goto out; - atomic_dec(&xprt->num_reqs); + --xprt->num_reqs; req = ERR_PTR(-ENOMEM); out: return req; @@ -1126,7 +1124,8 @@ out: static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { - if (atomic_add_unless(&xprt->num_reqs, -1, xprt->min_reqs)) { + if (xprt->num_reqs > xprt->min_reqs) { + --xprt->num_reqs; kfree(req); return true; } @@ -1162,6 +1161,8 @@ void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) spin_unlock(&xprt->reserve_lock); return; out_init_req: + xprt->stat.max_slots = max_t(unsigned int, xprt->stat.max_slots, + xprt->num_reqs); task->tk_status = 0; task->tk_rqstp = req; xprt_request_init(task, xprt); @@ -1229,7 +1230,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size, else xprt->max_reqs = num_prealloc; xprt->min_reqs = num_prealloc; - atomic_set(&xprt->num_reqs, num_prealloc); + xprt->num_reqs = num_prealloc; return xprt; -- cgit v1.2.3 From e671edb9428c8a61662aaf8c39f5edced7cc45c7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 16 Mar 2018 10:33:44 -0400 Subject: sunrpc: Simplify synopsis of some trace points Clean up: struct rpc_task carries a pointer to a struct rpc_clnt, and in fact task->tk_client is always what is passed into trace points that are already passing @task. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 40 +++++++++++++++++++++------------------- net/sunrpc/clnt.c | 2 +- net/sunrpc/sched.c | 10 +++++----- 3 files changed, 27 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 970c91a83173..53bbaeac08dd 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -50,9 +50,9 @@ DEFINE_EVENT(rpc_task_status, rpc_bind_status, ); TRACE_EVENT(rpc_connect_status, - TP_PROTO(struct rpc_task *task, int status), + TP_PROTO(const struct rpc_task *task), - TP_ARGS(task, status), + TP_ARGS(task), TP_STRUCT__entry( __field(unsigned int, task_id) @@ -63,7 +63,7 @@ TRACE_EVENT(rpc_connect_status, TP_fast_assign( __entry->task_id = task->tk_pid; __entry->client_id = task->tk_client->cl_clid; - __entry->status = status; + __entry->status = task->tk_status; ), TP_printk("task:%u@%u status=%d", @@ -103,9 +103,9 @@ TRACE_EVENT(rpc_request, DECLARE_EVENT_CLASS(rpc_task_running, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const void *action), + TP_PROTO(const struct rpc_task *task, const void *action), - TP_ARGS(clnt, task, action), + TP_ARGS(task, action), TP_STRUCT__entry( __field(unsigned int, task_id) @@ -117,7 +117,8 @@ DECLARE_EVENT_CLASS(rpc_task_running, ), TP_fast_assign( - __entry->client_id = clnt ? clnt->cl_clid : -1; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; __entry->task_id = task->tk_pid; __entry->action = action; __entry->runstate = task->tk_runstate; @@ -136,33 +137,33 @@ DECLARE_EVENT_CLASS(rpc_task_running, DEFINE_EVENT(rpc_task_running, rpc_task_begin, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const void *action), + TP_PROTO(const struct rpc_task *task, const void *action), - TP_ARGS(clnt, task, action) + TP_ARGS(task, action) ); DEFINE_EVENT(rpc_task_running, rpc_task_run_action, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const void *action), + TP_PROTO(const struct rpc_task *task, const void *action), - TP_ARGS(clnt, task, action) + TP_ARGS(task, action) ); DEFINE_EVENT(rpc_task_running, rpc_task_complete, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const void *action), + TP_PROTO(const struct rpc_task *task, const void *action), - TP_ARGS(clnt, task, action) + TP_ARGS(task, action) ); DECLARE_EVENT_CLASS(rpc_task_queued, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const struct rpc_wait_queue *q), + TP_PROTO(const struct rpc_task *task, const struct rpc_wait_queue *q), - TP_ARGS(clnt, task, q), + TP_ARGS(task, q), TP_STRUCT__entry( __field(unsigned int, task_id) @@ -175,7 +176,8 @@ DECLARE_EVENT_CLASS(rpc_task_queued, ), TP_fast_assign( - __entry->client_id = clnt ? clnt->cl_clid : -1; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; __entry->task_id = task->tk_pid; __entry->timeout = task->tk_timeout; __entry->runstate = task->tk_runstate; @@ -196,17 +198,17 @@ DECLARE_EVENT_CLASS(rpc_task_queued, DEFINE_EVENT(rpc_task_queued, rpc_task_sleep, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const struct rpc_wait_queue *q), + TP_PROTO(const struct rpc_task *task, const struct rpc_wait_queue *q), - TP_ARGS(clnt, task, q) + TP_ARGS(task, q) ); DEFINE_EVENT(rpc_task_queued, rpc_task_wakeup, - TP_PROTO(const struct rpc_clnt *clnt, const struct rpc_task *task, const struct rpc_wait_queue *q), + TP_PROTO(const struct rpc_task *task, const struct rpc_wait_queue *q), - TP_ARGS(clnt, task, q) + TP_ARGS(task, q) ); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 6e432ecd7f99..079f06fe2c5d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1887,7 +1887,7 @@ call_connect_status(struct rpc_task *task) dprint_status(task); - trace_rpc_connect_status(task, status); + trace_rpc_connect_status(task); task->tk_status = 0; switch (status) { case -ECONNREFUSED: diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index d9db2eab3a8d..3fe5d60ab0e2 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -276,7 +276,7 @@ static void rpc_set_active(struct rpc_task *task) { rpc_task_set_debuginfo(task); set_bit(RPC_TASK_ACTIVE, &task->tk_runstate); - trace_rpc_task_begin(task->tk_client, task, NULL); + trace_rpc_task_begin(task, NULL); } /* @@ -291,7 +291,7 @@ static int rpc_complete_task(struct rpc_task *task) unsigned long flags; int ret; - trace_rpc_task_complete(task->tk_client, task, NULL); + trace_rpc_task_complete(task, NULL); spin_lock_irqsave(&wq->lock, flags); clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate); @@ -358,7 +358,7 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, dprintk("RPC: %5u sleep_on(queue \"%s\" time %lu)\n", task->tk_pid, rpc_qname(q), jiffies); - trace_rpc_task_sleep(task->tk_client, task, q); + trace_rpc_task_sleep(task, q); __rpc_add_wait_queue(q, task, queue_priority); @@ -428,7 +428,7 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, return; } - trace_rpc_task_wakeup(task->tk_client, task, queue); + trace_rpc_task_wakeup(task, queue); __rpc_remove_wait_queue(queue, task); @@ -780,7 +780,7 @@ static void __rpc_execute(struct rpc_task *task) } if (!do_action) break; - trace_rpc_task_run_action(task->tk_client, task, do_action); + trace_rpc_task_run_action(task, do_action); do_action(task); /* -- cgit v1.2.3 From 40bf7eb304b5659991ed932c0cd5bee6a7c88f4f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 16 Mar 2018 10:33:49 -0400 Subject: sunrpc: Add static trace point to report RPC latency stats Introduce a low-overhead mechanism to report information about latencies of individual RPCs. The goal is to enable user space to filter the trace record for latency outliers, or build histograms, etc. Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 45 +++++++++++++++++++++++++++++++++++++++++++ net/sunrpc/stats.c | 16 ++++++++++----- 2 files changed, 56 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 53bbaeac08dd..fda9e726f1cc 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -212,6 +212,51 @@ DEFINE_EVENT(rpc_task_queued, rpc_task_wakeup, ); +TRACE_EVENT(rpc_stats_latency, + + TP_PROTO( + const struct rpc_task *task, + ktime_t backlog, + ktime_t rtt, + ktime_t execute + ), + + TP_ARGS(task, backlog, rtt, execute), + + TP_STRUCT__entry( + __field(u32, xid) + __field(int, version) + __string(progname, task->tk_client->cl_program->name) + __string(procname, rpc_proc_name(task)) + __field(unsigned long, backlog) + __field(unsigned long, rtt) + __field(unsigned long, execute) + __string(addr, + task->tk_xprt->address_strings[RPC_DISPLAY_ADDR]) + __string(port, + task->tk_xprt->address_strings[RPC_DISPLAY_PORT]) + ), + + TP_fast_assign( + __entry->xid = be32_to_cpu(task->tk_rqstp->rq_xid); + __entry->version = task->tk_client->cl_vers; + __assign_str(progname, task->tk_client->cl_program->name) + __assign_str(procname, rpc_proc_name(task)) + __entry->backlog = ktime_to_us(backlog); + __entry->rtt = ktime_to_us(rtt); + __entry->execute = ktime_to_us(execute); + __assign_str(addr, + task->tk_xprt->address_strings[RPC_DISPLAY_ADDR]); + __assign_str(port, + task->tk_xprt->address_strings[RPC_DISPLAY_PORT]); + ), + + TP_printk("peer=[%s]:%s xid=0x%08x %sv%d %s backlog=%lu rtt=%lu execute=%lu", + __get_str(addr), __get_str(port), __entry->xid, + __get_str(progname), __entry->version, __get_str(procname), + __entry->backlog, __entry->rtt, __entry->execute) +); + /* * First define the enums in the below macros to be exported to userspace * via TRACE_DEFINE_ENUM(). diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 1e671333c3d5..f68aa46c9dd7 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -24,6 +24,8 @@ #include #include +#include + #include "netns.h" #define RPCDBG_FACILITY RPCDBG_MISC @@ -148,7 +150,7 @@ void rpc_count_iostats_metrics(const struct rpc_task *task, struct rpc_iostats *op_metrics) { struct rpc_rqst *req = task->tk_rqstp; - ktime_t delta, now; + ktime_t backlog, execute, now; if (!op_metrics || !req) return; @@ -164,16 +166,20 @@ void rpc_count_iostats_metrics(const struct rpc_task *task, op_metrics->om_bytes_sent += req->rq_xmit_bytes_sent; op_metrics->om_bytes_recv += req->rq_reply_bytes_recvd; + backlog = 0; if (ktime_to_ns(req->rq_xtime)) { - delta = ktime_sub(req->rq_xtime, task->tk_start); - op_metrics->om_queue = ktime_add(op_metrics->om_queue, delta); + backlog = ktime_sub(req->rq_xtime, task->tk_start); + op_metrics->om_queue = ktime_add(op_metrics->om_queue, backlog); } + op_metrics->om_rtt = ktime_add(op_metrics->om_rtt, req->rq_rtt); - delta = ktime_sub(now, task->tk_start); - op_metrics->om_execute = ktime_add(op_metrics->om_execute, delta); + execute = ktime_sub(now, task->tk_start); + op_metrics->om_execute = ktime_add(op_metrics->om_execute, execute); spin_unlock(&op_metrics->om_lock); + + trace_rpc_stats_latency(req->rq_task, backlog, req->rq_rtt, execute); } EXPORT_SYMBOL_GPL(rpc_count_iostats_metrics); -- cgit v1.2.3 From a25a4cb3af177a4cf5621ffbf4fa89ae60c6d4d7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 16 Mar 2018 10:33:55 -0400 Subject: sunrpc: Add static trace point to report result of RPC ping This information can help track down local misconfiguration issues as well as network partitions and unresponsive servers. There are several ways to send a ping, and with transport multi- plexing, the exact rpc_xprt that is used is sometimes not known by the upper layer. The rpc_xprt pointer passed to the trace point call also has to be RCU-safe. I found a spot inside the client FSM where an rpc_xprt pointer is always available and safe to use. Suggested-by: Bill Baker Signed-off-by: Chuck Lever Signed-off-by: Anna Schumaker --- include/trace/events/sunrpc.h | 21 +++++++++++++++++++++ net/sunrpc/clnt.c | 6 ++++++ 2 files changed, 27 insertions(+) (limited to 'net') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index fda9e726f1cc..76887d60f0c0 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -453,6 +453,27 @@ DEFINE_EVENT(rpc_xprt_event, xprt_complete_rqst, TP_PROTO(struct rpc_xprt *xprt, __be32 xid, int status), TP_ARGS(xprt, xid, status)); +TRACE_EVENT(xprt_ping, + TP_PROTO(const struct rpc_xprt *xprt, int status), + + TP_ARGS(xprt, status), + + TP_STRUCT__entry( + __field(int, status) + __string(addr, xprt->address_strings[RPC_DISPLAY_ADDR]) + __string(port, xprt->address_strings[RPC_DISPLAY_PORT]) + ), + + TP_fast_assign( + __entry->status = status; + __assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]); + __assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]); + ), + + TP_printk("peer=[%s]:%s status=%d", + __get_str(addr), __get_str(port), __entry->status) +); + TRACE_EVENT(xs_tcp_data_ready, TP_PROTO(struct rpc_xprt *xprt, int err, unsigned int total), diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 079f06fe2c5d..166f8c1680d1 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2014,6 +2014,9 @@ call_transmit_status(struct rpc_task *task) case -EPERM: if (RPC_IS_SOFTCONN(task)) { xprt_end_transmit(task); + if (!task->tk_msg.rpc_proc->p_proc) + trace_xprt_ping(task->tk_xprt, + task->tk_status); rpc_exit(task, task->tk_status); break; } @@ -2112,6 +2115,9 @@ call_status(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; int status; + if (!task->tk_msg.rpc_proc->p_proc) + trace_xprt_ping(task->tk_xprt, task->tk_status); + if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent) task->tk_status = req->rq_reply_bytes_recvd; -- cgit v1.2.3 From 25524288631fc5b7d33259fca1e0dc38146be5d6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 19 Mar 2018 14:23:16 -0400 Subject: xprtrdma: Fix corner cases when handling device removal Michal Kalderon has found some corner cases around device unload with active NFS mounts that I didn't have the imagination to test when xprtrdma device removal was added last year. - The ULP device removal handler is responsible for deallocating the PD. That wasn't clear to me initially, and my own testing suggested it was not necessary, but that is incorrect. - The transport destruction path can no longer assume that there is a valid ID. - When destroying a transport, ensure that ib_free_cq() is not invoked on a CQ that was already released. Reported-by: Michal Kalderon Fixes: bebd031866ca ("xprtrdma: Support unplugging an HCA from ...") Signed-off-by: Chuck Lever Cc: stable@vger.kernel.org # v4.12+ Signed-off-by: Anna Schumaker --- net/sunrpc/xprtrdma/verbs.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 6a7a5a277e75..fe5eaca2d197 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -250,7 +250,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) wait_for_completion(&ia->ri_remove_done); ia->ri_id = NULL; - ia->ri_pd = NULL; ia->ri_device = NULL; /* Return 1 to ensure the core destroys the id. */ return 1; @@ -447,7 +446,9 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) ia->ri_id->qp = NULL; } ib_free_cq(ep->rep_attr.recv_cq); + ep->rep_attr.recv_cq = NULL; ib_free_cq(ep->rep_attr.send_cq); + ep->rep_attr.send_cq = NULL; /* The ULP is responsible for ensuring all DMA * mappings and MRs are gone. @@ -460,6 +461,8 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); } rpcrdma_mrs_destroy(buf); + ib_dealloc_pd(ia->ri_pd); + ia->ri_pd = NULL; /* Allow waiters to continue */ complete(&ia->ri_remove_done); @@ -627,14 +630,16 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { cancel_delayed_work_sync(&ep->rep_connect_worker); - if (ia->ri_id->qp) { + if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } - ib_free_cq(ep->rep_attr.recv_cq); - ib_free_cq(ep->rep_attr.send_cq); + if (ep->rep_attr.recv_cq) + ib_free_cq(ep->rep_attr.recv_cq); + if (ep->rep_attr.send_cq) + ib_free_cq(ep->rep_attr.send_cq); } /* Re-establish a connection after a device removal event. -- cgit v1.2.3 From 0e779aa70308462e45f7cd1a54de418dfe101694 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 20 Mar 2018 17:03:05 -0400 Subject: SUNRPC: Add helpers for decoding opaque and string types Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- include/linux/sunrpc/xdr.h | 6 ++++ net/sunrpc/xdr.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) (limited to 'net') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index d950223c64b1..7e609de34d85 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -253,6 +253,12 @@ xdr_stream_remaining(const struct xdr_stream *xdr) return xdr->nwords << 2; } +ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, + size_t size); +ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, + size_t maxlen, gfp_t gfp_flags); +ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, + size_t size); ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str, size_t maxlen, gfp_t gfp_flags); /** diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index e34f4ee7f2b6..30afbd236656 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1518,6 +1518,88 @@ out: } EXPORT_SYMBOL_GPL(xdr_process_buf); +/** + * xdr_stream_decode_opaque - Decode variable length opaque + * @xdr: pointer to xdr_stream + * @ptr: location to store opaque data + * @size: size of storage buffer @ptr + * + * Return values: + * On success, returns size of object stored in *@ptr + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE on overflow of storage buffer @ptr + */ +ssize_t xdr_stream_decode_opaque(struct xdr_stream *xdr, void *ptr, size_t size) +{ + ssize_t ret; + void *p; + + ret = xdr_stream_decode_opaque_inline(xdr, &p, size); + if (ret <= 0) + return ret; + memcpy(ptr, p, ret); + return ret; +} +EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque); + +/** + * xdr_stream_decode_opaque_dup - Decode and duplicate variable length opaque + * @xdr: pointer to xdr_stream + * @ptr: location to store pointer to opaque data + * @maxlen: maximum acceptable object size + * @gfp_flags: GFP mask to use + * + * Return values: + * On success, returns size of object stored in *@ptr + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE if the size of the object would exceed @maxlen + * %-ENOMEM on memory allocation failure + */ +ssize_t xdr_stream_decode_opaque_dup(struct xdr_stream *xdr, void **ptr, + size_t maxlen, gfp_t gfp_flags) +{ + ssize_t ret; + void *p; + + ret = xdr_stream_decode_opaque_inline(xdr, &p, maxlen); + if (ret > 0) { + *ptr = kmemdup(p, ret, gfp_flags); + if (*ptr != NULL) + return ret; + ret = -ENOMEM; + } + *ptr = NULL; + return ret; +} +EXPORT_SYMBOL_GPL(xdr_stream_decode_opaque_dup); + +/** + * xdr_stream_decode_string - Decode variable length string + * @xdr: pointer to xdr_stream + * @str: location to store string + * @size: size of storage buffer @str + * + * Return values: + * On success, returns length of NUL-terminated string stored in *@str + * %-EBADMSG on XDR buffer overflow + * %-EMSGSIZE on overflow of storage buffer @str + */ +ssize_t xdr_stream_decode_string(struct xdr_stream *xdr, char *str, size_t size) +{ + ssize_t ret; + void *p; + + ret = xdr_stream_decode_opaque_inline(xdr, &p, size); + if (ret > 0) { + memcpy(str, p, ret); + str[ret] = '\0'; + return strlen(str); + } + *str = '\0'; + return ret; +} +EXPORT_SYMBOL_GPL(xdr_stream_decode_string); + /** * xdr_stream_decode_string_dup - Decode and duplicate variable length string * @xdr: pointer to xdr_stream -- cgit v1.2.3