summaryrefslogtreecommitdiffstats
path: root/drivers/infiniband
diff options
context:
space:
mode:
authorSteve Wise <swise@opengridcomputing.com>2014-04-09 09:38:25 -0500
committerRoland Dreier <roland@purestorage.com>2014-04-11 11:36:07 -0700
commitb33bd0cbfa102b8f87702338aa72742fe3c7f220 (patch)
tree0a57d3b48b92f125c028b26b1b922c96f2c30c05 /drivers/infiniband
parentfa658a98a2d08352c514758b3394caf91360aa44 (diff)
downloadlinux-b33bd0cbfa102b8f87702338aa72742fe3c7f220.tar.bz2
RDMA/cxgb4: Endpoint timeout fixes
1) timedout endpoint processing can be starved. If there are continual CPL messages flowing into the driver, the endpoint timeout processing can be starved. This condition exposed the other bugs below. Solution: In process_work(), call process_timedout_eps() after each CPL is processed. 2) Connection events can be processed even though the endpoint is on the timeout list. If the endpoint is scheduled for timeout processing, then we must ignore MPA Start Requests and Replies. Solution: Change stop_ep_timer() to return 1 if the ep has already been queued for timeout processing. All the callers of stop_ep_timer() need to check this and act accordingly. There are just a few cases where the caller needs to do something different if stop_ep_timer() returns 1: 1) in process_mpa_reply(), ignore the reply and process_timeout() will abort the connection. 2) in process_mpa_request, ignore the request and process_timeout() will abort the connection. It is ok for callers of stop_ep_timer() to abort the connection since that will leave the state in ABORTING or DEAD, and process_timeout() now ignores timeouts when the ep is in these states. 3) Double insertion on the timeout list. Since the endpoint timers are used for connection setup and teardown, we need to guard against the possibility that an endpoint is already on the timeout list. This is a rare condition and only seen under heavy load and in the presense of the above 2 bugs. Solution: In ep_timeout(), don't queue the endpoint if it is already on the queue. Signed-off-by: Steve Wise <swise@opengridcomputing.com> Signed-off-by: Roland Dreier <roland@purestorage.com>
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--drivers/infiniband/hw/cxgb4/cm.c89
1 files changed, 56 insertions, 33 deletions
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 02436d5d0dab..185452abf32c 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -173,12 +173,15 @@ static void start_ep_timer(struct c4iw_ep *ep)
add_timer(&ep->timer);
}
-static void stop_ep_timer(struct c4iw_ep *ep)
+static int stop_ep_timer(struct c4iw_ep *ep)
{
PDBG("%s ep %p stopping\n", __func__, ep);
del_timer_sync(&ep->timer);
- if (!test_and_set_bit(TIMEOUT, &ep->com.flags))
+ if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {
c4iw_put_ep(&ep->com);
+ return 0;
+ }
+ return 1;
}
static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb,
@@ -1165,12 +1168,11 @@ static void process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
/*
- * Stop mpa timer. If it expired, then the state has
- * changed and we bail since ep_timeout already aborted
- * the connection.
+ * Stop mpa timer. If it expired, then
+ * we ignore the MPA reply. process_timeout()
+ * will abort the connection.
*/
- stop_ep_timer(ep);
- if (ep->com.state != MPA_REQ_SENT)
+ if (stop_ep_timer(ep))
return;
/*
@@ -1375,15 +1377,12 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
- if (ep->com.state != MPA_REQ_WAIT)
- return;
-
/*
* If we get more than the supported amount of private data
* then we must fail this connection.
*/
if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
- stop_ep_timer(ep);
+ (void)stop_ep_timer(ep);
abort_connection(ep, skb, GFP_KERNEL);
return;
}
@@ -1413,13 +1412,13 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
if (mpa->revision > mpa_rev) {
printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
" Received = %d\n", __func__, mpa_rev, mpa->revision);
- stop_ep_timer(ep);
+ (void)stop_ep_timer(ep);
abort_connection(ep, skb, GFP_KERNEL);
return;
}
if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
- stop_ep_timer(ep);
+ (void)stop_ep_timer(ep);
abort_connection(ep, skb, GFP_KERNEL);
return;
}
@@ -1430,7 +1429,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
* Fail if there's too much private data.
*/
if (plen > MPA_MAX_PRIVATE_DATA) {
- stop_ep_timer(ep);
+ (void)stop_ep_timer(ep);
abort_connection(ep, skb, GFP_KERNEL);
return;
}
@@ -1439,7 +1438,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
* If plen does not account for pkt size
*/
if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
- stop_ep_timer(ep);
+ (void)stop_ep_timer(ep);
abort_connection(ep, skb, GFP_KERNEL);
return;
}
@@ -1496,18 +1495,24 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,
ep->mpa_attr.p2p_type);
- __state_set(&ep->com, MPA_REQ_RCVD);
- stop_ep_timer(ep);
-
- /* drive upcall */
- mutex_lock(&ep->parent_ep->com.mutex);
- if (ep->parent_ep->com.state != DEAD) {
- if (connect_request_upcall(ep))
+ /*
+ * If the endpoint timer already expired, then we ignore
+ * the start request. process_timeout() will abort
+ * the connection.
+ */
+ if (!stop_ep_timer(ep)) {
+ __state_set(&ep->com, MPA_REQ_RCVD);
+
+ /* drive upcall */
+ mutex_lock(&ep->parent_ep->com.mutex);
+ if (ep->parent_ep->com.state != DEAD) {
+ if (connect_request_upcall(ep))
+ abort_connection(ep, skb, GFP_KERNEL);
+ } else {
abort_connection(ep, skb, GFP_KERNEL);
- } else {
- abort_connection(ep, skb, GFP_KERNEL);
+ }
+ mutex_unlock(&ep->parent_ep->com.mutex);
}
- mutex_unlock(&ep->parent_ep->com.mutex);
return;
}
@@ -2265,7 +2270,7 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
disconnect = 0;
break;
case MORIBUND:
- stop_ep_timer(ep);
+ (void)stop_ep_timer(ep);
if (ep->com.cm_id && ep->com.qp) {
attrs.next_state = C4IW_QP_STATE_IDLE;
c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
@@ -2325,10 +2330,10 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
case CONNECTING:
break;
case MPA_REQ_WAIT:
- stop_ep_timer(ep);
+ (void)stop_ep_timer(ep);
break;
case MPA_REQ_SENT:
- stop_ep_timer(ep);
+ (void)stop_ep_timer(ep);
if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1))
connect_reply_upcall(ep, -ECONNRESET);
else {
@@ -2433,7 +2438,7 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
__state_set(&ep->com, MORIBUND);
break;
case MORIBUND:
- stop_ep_timer(ep);
+ (void)stop_ep_timer(ep);
if ((ep->com.cm_id) && (ep->com.qp)) {
attrs.next_state = C4IW_QP_STATE_IDLE;
c4iw_modify_qp(ep->com.qp->rhp,
@@ -3028,7 +3033,7 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) {
close = 1;
if (abrupt) {
- stop_ep_timer(ep);
+ (void)stop_ep_timer(ep);
ep->com.state = ABORTING;
} else
ep->com.state = MORIBUND;
@@ -3462,6 +3467,16 @@ static void process_timeout(struct c4iw_ep *ep)
__state_set(&ep->com, ABORTING);
close_complete_upcall(ep, -ETIMEDOUT);
break;
+ case ABORTING:
+ case DEAD:
+
+ /*
+ * These states are expected if the ep timed out at the same
+ * time as another thread was calling stop_ep_timer().
+ * So we silently do nothing for these states.
+ */
+ abort = 0;
+ break;
default:
WARN(1, "%s unexpected state ep %p tid %u state %u\n",
__func__, ep, ep->hwtid, ep->com.state);
@@ -3483,6 +3498,8 @@ static void process_timedout_eps(void)
tmp = timeout_list.next;
list_del(tmp);
+ tmp->next = NULL;
+ tmp->prev = NULL;
spin_unlock_irq(&timeout_lock);
ep = list_entry(tmp, struct c4iw_ep, entry);
process_timeout(ep);
@@ -3499,6 +3516,7 @@ static void process_work(struct work_struct *work)
unsigned int opcode;
int ret;
+ process_timedout_eps();
while ((skb = skb_dequeue(&rxq))) {
rpl = cplhdr(skb);
dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *)));
@@ -3508,8 +3526,8 @@ static void process_work(struct work_struct *work)
ret = work_handlers[opcode](dev, skb);
if (!ret)
kfree_skb(skb);
+ process_timedout_eps();
}
- process_timedout_eps();
}
static DECLARE_WORK(skb_work, process_work);
@@ -3521,8 +3539,13 @@ static void ep_timeout(unsigned long arg)
spin_lock(&timeout_lock);
if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {
- list_add_tail(&ep->entry, &timeout_list);
- kickit = 1;
+ /*
+ * Only insert if it is not already on the list.
+ */
+ if (!ep->entry.next) {
+ list_add_tail(&ep->entry, &timeout_list);
+ kickit = 1;
+ }
}
spin_unlock(&timeout_lock);
if (kickit)