summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--drivers/infiniband/hw/hfi1/chip.c2
-rw-r--r--drivers/infiniband/hw/hfi1/chip.h1
-rw-r--r--drivers/infiniband/hw/hfi1/common.h4
-rw-r--r--drivers/infiniband/hw/hfi1/driver.c58
-rw-r--r--drivers/infiniband/hw/hfi1/hfi.h19
-rw-r--r--drivers/infiniband/hw/hfi1/init.c10
-rw-r--r--drivers/infiniband/hw/hfi1/qp.c5
-rw-r--r--drivers/infiniband/hw/hfi1/qp.h2
-rw-r--r--drivers/infiniband/hw/hfi1/rc.c542
-rw-r--r--drivers/infiniband/hw/hfi1/rc.h50
-rw-r--r--drivers/infiniband/hw/hfi1/tid_rdma.c2719
-rw-r--r--drivers/infiniband/hw/hfi1/tid_rdma.h196
-rw-r--r--drivers/infiniband/hw/hfi1/trace.c52
-rw-r--r--drivers/infiniband/hw/hfi1/trace_ibhdrs.h2
-rw-r--r--drivers/infiniband/hw/hfi1/trace_rc.h48
-rw-r--r--drivers/infiniband/hw/hfi1/trace_tid.h758
-rw-r--r--drivers/infiniband/hw/hfi1/trace_tx.h12
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.h1
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.c176
-rw-r--r--drivers/infiniband/hw/hfi1/verbs.h61
-rw-r--r--drivers/infiniband/hw/qib/qib_rc.c7
-rw-r--r--drivers/infiniband/sw/rdmavt/qp.c13
-rw-r--r--drivers/infiniband/sw/rdmavt/rc.c13
-rw-r--r--include/rdma/ib_hdrs.h9
-rw-r--r--include/rdma/rdma_vt.h12
-rw-r--r--include/rdma/rdmavt_qp.h18
-rw-r--r--include/rdma/tid_rdma_defs.h52
27 files changed, 4669 insertions, 173 deletions
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 4d40311f082e..612f04190ed8 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -4253,6 +4253,8 @@ static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
access_sw_pio_drain),
[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
access_sw_kmem_wait),
+[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL,
+ hfi1_access_sw_tid_wait),
[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
access_sw_send_schedule),
[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index ba3d99e6e33b..6c27c1c6a868 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -927,6 +927,7 @@ enum {
C_SW_PIO_WAIT,
C_SW_PIO_DRAIN,
C_SW_KMEM_WAIT,
+ C_SW_TID_WAIT,
C_SW_SEND_SCHED,
C_SDMA_DESC_FETCHED_CNT,
C_SDMA_INT_CNT,
diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
index 40d3cfb58bd1..7310a5dba420 100644
--- a/drivers/infiniband/hw/hfi1/common.h
+++ b/drivers/infiniband/hw/hfi1/common.h
@@ -340,6 +340,10 @@ struct diag_pkt {
#define HFI1_PSM_IOC_BASE_SEQ 0x0
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define HFI1_KDETH_BTH_SEQ_SHIFT 11
+#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1)
+
static inline __u64 rhf_to_cpu(const __le32 *rbuf)
{
return __le64_to_cpu(*((__le64 *)rbuf));
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
index a8ad70730203..2a9d2912f5db 100644
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -1575,25 +1575,32 @@ drop:
return -EINVAL;
}
-void handle_eflags(struct hfi1_packet *packet)
+static void show_eflags_errs(struct hfi1_packet *packet)
{
struct hfi1_ctxtdata *rcd = packet->rcd;
u32 rte = rhf_rcv_type_err(packet->rhf);
+ dd_dev_err(rcd->dd,
+ "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
+ rcd->ctxt, packet->rhf,
+ packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+ packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+ packet->rhf & RHF_DC_ERR ? "dc " : "",
+ packet->rhf & RHF_TID_ERR ? "tid " : "",
+ packet->rhf & RHF_LEN_ERR ? "len " : "",
+ packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+ packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
+ packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+ rte);
+}
+
+void handle_eflags(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+
rcv_hdrerr(rcd, rcd->ppd, packet);
if (rhf_err_flags(packet->rhf))
- dd_dev_err(rcd->dd,
- "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
- rcd->ctxt, packet->rhf,
- packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
- packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
- packet->rhf & RHF_DC_ERR ? "dc " : "",
- packet->rhf & RHF_TID_ERR ? "tid " : "",
- packet->rhf & RHF_LEN_ERR ? "len " : "",
- packet->rhf & RHF_ECC_ERR ? "ecc " : "",
- packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
- packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
- rte);
+ show_eflags_errs(packet);
}
/*
@@ -1699,11 +1706,14 @@ static int kdeth_process_expected(struct hfi1_packet *packet)
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
- if (unlikely(rhf_err_flags(packet->rhf)))
- handle_eflags(packet);
+ if (unlikely(rhf_err_flags(packet->rhf))) {
+ struct hfi1_ctxtdata *rcd = packet->rcd;
- dd_dev_err(packet->rcd->dd,
- "Unhandled expected packet received. Dropping.\n");
+ if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+ return RHF_RCV_CONTINUE;
+ }
+
+ hfi1_kdeth_expected_rcv(packet);
return RHF_RCV_CONTINUE;
}
@@ -1712,11 +1722,17 @@ static int kdeth_process_eager(struct hfi1_packet *packet)
hfi1_setup_9B_packet(packet);
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
- if (unlikely(rhf_err_flags(packet->rhf)))
- handle_eflags(packet);
- dd_dev_err(packet->rcd->dd,
- "Unhandled eager packet received. Dropping.\n");
+ trace_hfi1_rcvhdr(packet);
+ if (unlikely(rhf_err_flags(packet->rhf))) {
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+
+ show_eflags_errs(packet);
+ if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+ return RHF_RCV_CONTINUE;
+ }
+
+ hfi1_kdeth_eager_rcv(packet);
return RHF_RCV_CONTINUE;
}
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index 9aa0357e17b7..6582184cc985 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -198,6 +198,14 @@ struct exp_tid_set {
};
typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+struct tid_queue {
+ struct list_head queue_head;
+ /* queue head for QP TID resource waiters */
+ u32 enqueue; /* count of tid enqueues */
+ u32 dequeue; /* count of tid dequeues */
+};
+
struct hfi1_ctxtdata {
/* rcvhdrq base, needs mmap before useful */
void *rcvhdrq;
@@ -291,6 +299,12 @@ struct hfi1_ctxtdata {
/* PSM Specific fields */
/* lock protecting all Expected TID data */
struct mutex exp_mutex;
+ /* lock protecting all Expected TID data of kernel contexts */
+ spinlock_t exp_lock;
+ /* Queue for QP's waiting for HW TID flows */
+ struct tid_queue flow_queue;
+ /* Queue for QP's waiting for HW receive array entries */
+ struct tid_queue rarr_queue;
/* when waiting for rcv or pioavail */
wait_queue_head_t wait;
/* uuid from PSM */
@@ -323,6 +337,9 @@ struct hfi1_ctxtdata {
*/
u8 subctxt_cnt;
+ /* Bit mask to track free TID RDMA HW flows */
+ unsigned long flow_mask;
+ struct tid_flow_state flows[RXE_NUM_TID_FLOWS];
};
/**
@@ -2103,7 +2120,7 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd,
SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK |
#endif
HFI1_PKT_USER_SC_INTEGRITY;
- else
+ else if (ctxt_type != SC_KERNEL)
base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
/* turn on send-side job key checks if !A0 */
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index a8dbd0f191f5..d13304f7340d 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -370,6 +370,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
mutex_init(&rcd->exp_mutex);
+ spin_lock_init(&rcd->exp_lock);
+ INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
+ INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
@@ -472,6 +475,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
GFP_KERNEL, numa);
if (!rcd->opstats)
goto bail;
+
+ /* Initialize TID flow generations for the context */
+ hfi1_kern_init_ctxt_generations(rcd);
}
*context = rcd;
@@ -771,6 +777,8 @@ static void enable_chip(struct hfi1_devdata *dd)
rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+ if (HFI1_CAP_IS_KSET(TID_RDMA))
+ rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
hfi1_rcvctrl(dd, rcvmask, rcd);
sc_enable(rcd->sc);
hfi1_rcd_put(rcd);
@@ -1589,7 +1597,7 @@ static void cleanup_device_data(struct hfi1_devdata *dd)
struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
if (rcd) {
- hfi1_clear_tids(rcd);
+ hfi1_free_ctxt_rcv_groups(rcd);
hfi1_free_ctxt(rcd);
}
}
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index f822f92b415f..acdd9eba189b 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -319,6 +319,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send)
switch (qp->ibqp.qp_type) {
case IB_QPT_RC:
+ hfi1_setup_tid_rdma_wqe(qp, wqe);
case IB_QPT_UC:
if (wqe->length > 0x80000000U)
return -EINVAL;
@@ -738,6 +739,7 @@ void flush_qp_waiters(struct rvt_qp *qp)
{
lockdep_assert_held(&qp->s_lock);
flush_iowait(qp);
+ hfi1_tid_rdma_flush_wait(qp);
}
void stop_send_queue(struct rvt_qp *qp)
@@ -745,6 +747,8 @@ void stop_send_queue(struct rvt_qp *qp)
struct hfi1_qp_priv *priv = qp->priv;
iowait_cancel_work(&priv->s_iowait);
+ if (cancel_work_sync(&priv->tid_rdma.trigger_work))
+ rvt_put_qp(qp);
}
void quiesce_qp(struct rvt_qp *qp)
@@ -758,6 +762,7 @@ void quiesce_qp(struct rvt_qp *qp)
void notify_qp_reset(struct rvt_qp *qp)
{
+ hfi1_qp_kern_exp_rcv_clear_all(qp);
qp->r_adefered = 0;
clear_ahg(qp);
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index 7adb6dff6813..ce25a27aa4a1 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -63,11 +63,13 @@ extern const struct rvt_operation_params hfi1_post_parms[];
* HFI1_S_AHG_VALID - ahg header valid on chip
* HFI1_S_AHG_CLEAR - have send engine clear ahg state
* HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain
+ * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource
* HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1
*/
#define HFI1_S_AHG_VALID 0x80000000
#define HFI1_S_AHG_CLEAR 0x40000000
#define HFI1_S_WAIT_PIO_DRAIN 0x20000000
+#define HFI1_S_WAIT_TID_SPACE 0x10000000
#define HFI1_S_MIN_BIT_MASK 0x01000000
/*
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 092d5eba980f..6c9ef572fc69 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -51,28 +51,48 @@
#include "hfi.h"
#include "qp.h"
+#include "rc.h"
#include "verbs_txreq.h"
#include "trace.h"
-/* cut down ridiculously long IB macro names */
-#define OP(x) RC_OP(x)
-
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
- struct rvt_swqe *wqe,
- struct hfi1_ibport *ibp);
-
-static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
- u32 psn, u32 pmtu)
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+ u8 *prev_ack, bool *scheduled)
+ __must_hold(&qp->s_lock)
{
- u32 len;
-
- len = delta_psn(psn, wqe->psn) * pmtu;
- ss->sge = wqe->sg_list[0];
- ss->sg_list = wqe->sg_list + 1;
- ss->num_sge = wqe->wr.num_sge;
- ss->total_len = wqe->length;
- rvt_skip_sge(ss, len, false);
- return wqe->length - len;
+ struct rvt_ack_entry *e = NULL;
+ u8 i, p;
+ bool s = true;
+
+ for (i = qp->r_head_ack_queue; ; i = p) {
+ if (i == qp->s_tail_ack_queue)
+ s = false;
+ if (i)
+ p = i - 1;
+ else
+ p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+ if (p == qp->r_head_ack_queue) {
+ e = NULL;
+ break;
+ }
+ e = &qp->s_ack_queue[p];
+ if (!e->opcode) {
+ e = NULL;
+ break;
+ }
+ if (cmp_psn(psn, e->psn) >= 0) {
+ if (p == qp->s_tail_ack_queue &&
+ cmp_psn(psn, e->lpsn) <= 0)
+ s = false;
+ break;
+ }
+ }
+ if (prev)
+ *prev = p;
+ if (prev_ack)
+ *prev_ack = i;
+ if (scheduled)
+ *scheduled = s;
+ return e;
}
/**
@@ -92,13 +112,16 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
{
struct rvt_ack_entry *e;
u32 hwords;
- u32 len;
- u32 bth0, bth2;
+ u32 len = 0;
+ u32 bth0 = 0, bth2 = 0;
u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
int middle = 0;
u32 pmtu = qp->pmtu;
struct hfi1_qp_priv *priv = qp->priv;
+ bool last_pkt;
+ u32 delta;
+ trace_hfi1_rsp_make_rc_ack(qp, 0);
lockdep_assert_held(&qp->s_lock);
/* Don't send an ACK if we aren't supposed to. */
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
@@ -170,6 +193,26 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
hwords++;
qp->s_ack_rdma_psn = e->psn;
bth2 = mask_psn(qp->s_ack_rdma_psn++);
+ } else if (e->opcode == TID_OP(READ_REQ)) {
+ /*
+ * If a TID RDMA read response is being resent and
+ * we haven't seen the duplicate request yet,
+ * then stop sending the remaining responses the
+ * responder has seen until the requester re-sends it.
+ */
+ len = e->rdma_sge.sge_length;
+ if (len && !e->rdma_sge.mr) {
+ qp->s_tail_ack_queue = qp->r_head_ack_queue;
+ goto bail;
+ }
+ /* Copy SGE state in case we need to resend */
+ ps->s_txreq->mr = e->rdma_sge.mr;
+ if (ps->s_txreq->mr)
+ rvt_get_mr(ps->s_txreq->mr);
+ qp->s_ack_rdma_sge.sge = e->rdma_sge;
+ qp->s_ack_rdma_sge.num_sge = 1;
+ qp->s_ack_state = TID_OP(READ_RESP);
+ goto read_resp;
} else {
/* COMPARE_SWAP or FETCH_ADD */
ps->s_txreq->ss = NULL;
@@ -207,6 +250,28 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
bth2 = mask_psn(qp->s_ack_rdma_psn++);
break;
+ case TID_OP(READ_RESP):
+read_resp:
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ ps->s_txreq->ss = &qp->s_ack_rdma_sge;
+ delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0,
+ &bth1, &bth2, &len,
+ &last_pkt);
+ if (delta == 0)
+ goto error_qp;
+ hwords += delta;
+ if (last_pkt) {
+ e->sent = 1;
+ /*
+ * Increment qp->s_tail_ack_queue through s_ack_state
+ * transition.
+ */
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ }
+ break;
+ case TID_OP(READ_REQ):
+ goto bail;
+
default:
normal:
/*
@@ -236,7 +301,14 @@ normal:
ps->s_txreq->hdr_dwords = hwords;
hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
return 1;
-
+error_qp:
+ spin_unlock_irqrestore(&qp->s_lock, ps->flags);
+ spin_lock_irqsave(&qp->r_lock, ps->flags);
+ spin_lock(&qp->s_lock);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, ps->flags);
+ spin_lock_irqsave(&qp->s_lock, ps->flags);
bail:
qp->s_ack_state = OP(ACKNOWLEDGE);
/*
@@ -263,17 +335,22 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
struct ib_other_headers *ohdr;
- struct rvt_sge_state *ss;
+ struct rvt_sge_state *ss = NULL;
struct rvt_swqe *wqe;
- u32 hwords;
- u32 len;
- u32 bth0 = 0, bth2;
+ struct hfi1_swqe_priv *wpriv;
+ struct tid_rdma_request *req = NULL;
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ u32 hwords = 5;
+ u32 len = 0;
+ u32 bth0 = 0, bth2 = 0;
u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
u32 pmtu = qp->pmtu;
char newreq;
int middle = 0;
int delta;
+ struct tid_rdma_flow *flow = NULL;
+ trace_hfi1_sender_make_rc_req(qp);
lockdep_assert_held(&qp->s_lock);
ps->s_txreq = get_txreq(ps->dev, qp);
if (!ps->s_txreq)
@@ -314,8 +391,8 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
}
clear_ahg(qp);
wqe = rvt_get_swqe_ptr(qp, qp->s_last);
- rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
- IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+ hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
/* will get called again */
goto done_free_tx;
}
@@ -334,6 +411,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
/* Send a request. */
wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+check_s_state:
switch (qp->s_state) {
default:
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
@@ -355,9 +433,13 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
/*
* If a fence is requested, wait for previous
* RDMA read and atomic operations to finish.
+ * However, there is no need to guard against
+ * TID RDMA READ after TID RDMA READ.
*/
if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
- qp->s_num_rd_atomic) {
+ qp->s_num_rd_atomic &&
+ (wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
+ priv->pending_tid_r_segs < qp->s_num_rd_atomic)) {
qp->s_flags |= RVT_S_WAIT_FENCE;
goto bail;
}
@@ -402,6 +484,15 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
len = wqe->length;
ss = &qp->s_sge;
bth2 = mask_psn(qp->s_psn);
+
+ /*
+ * Interlock between various IB requests and TID RDMA
+ * if necessary.
+ */
+ if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) ||
+ hfi1_tid_rdma_wqe_interlock(qp, wqe))
+ goto bail;
+
switch (wqe->wr.opcode) {
case IB_WR_SEND:
case IB_WR_SEND_WITH_IMM:
@@ -483,16 +574,14 @@ no_flow_control:
* Don't allow more operations to be started
* than the QP limits allow.
*/
- if (newreq) {
- if (qp->s_num_rd_atomic >=
- qp->s_max_rd_atomic) {
- qp->s_flags |= RVT_S_WAIT_RDMAR;
- goto bail;
- }
- qp->s_num_rd_atomic++;
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
- qp->s_lsn++;
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
}
+ qp->s_num_rd_atomic++;
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
put_ib_reth_vaddr(
wqe->rdma_wr.remote_addr,
&ohdr->u.rc.reth);
@@ -508,20 +597,92 @@ no_flow_control:
qp->s_cur = 0;
break;
+ case IB_WR_TID_RDMA_READ:
+ trace_hfi1_tid_read_sender_make_req(qp, newreq);
+ wpriv = wqe->priv;
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_make_req_read(qp, newreq,
+ wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ delta = cmp_psn(qp->s_psn, wqe->psn);
+
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow. We could get here under
+ * three conditions; (1) It's a new request; (2) We are
+ * sending the second or later segment of a request,
+ * but the qp->s_state is set to OP(RDMA_READ_REQUEST)
+ * when the last segment of a previous request is
+ * received just before this; (3) We are re-sending a
+ * request.
+ */
+ if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
+ }
+ if (newreq) {
+ struct tid_rdma_flow *flow =
+ &req->flows[req->setup_head];
+
+ /*
+ * Set up s_sge as it is needed for TID
+ * allocation. However, if the pages have been
+ * walked and mapped, skip it. An earlier try
+ * has failed to allocate the TID entries.
+ */
+ if (!flow->npagesets) {
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ req->isge = 0;
+ req->clear_tail = req->setup_head;
+ req->flow_idx = req->setup_head;
+ req->state = TID_REQUEST_ACTIVE;
+ }
+ } else if (delta == 0) {
+ /* Re-send a request */
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_pending = 0;
+ req->flow_idx = req->clear_tail;
+ req->state = TID_REQUEST_RESEND;
+ }
+ req->s_next_psn = qp->s_psn;
+ /* Read one segment at a time */
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
+ &bth1, &bth2,
+ &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
case IB_WR_ATOMIC_CMP_AND_SWP:
case IB_WR_ATOMIC_FETCH_AND_ADD:
/*
* Don't allow more operations to be started
* than the QP limits allow.
*/
- if (newreq) {
- if (qp->s_num_rd_atomic >=
- qp->s_max_rd_atomic) {
- qp->s_flags |= RVT_S_WAIT_RDMAR;
- goto bail;
- }
- qp->s_num_rd_atomic++;
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
}
+ qp->s_num_rd_atomic++;
/* FALLTHROUGH */
case IB_WR_OPFN:
@@ -555,11 +716,13 @@ no_flow_control:
default:
goto bail;
}
- qp->s_sge.sge = wqe->sg_list[0];
- qp->s_sge.sg_list = wqe->sg_list + 1;
- qp->s_sge.num_sge = wqe->wr.num_sge;
- qp->s_sge.total_len = wqe->length;
- qp->s_len = wqe->length;
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) {
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ }
if (newreq) {
qp->s_tail++;
if (qp->s_tail >= qp->s_size)
@@ -567,6 +730,8 @@ no_flow_control:
}
if (wqe->wr.opcode == IB_WR_RDMA_READ)
qp->s_psn = wqe->lpsn + 1;
+ else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ qp->s_psn = req->s_next_psn;
else
qp->s_psn++;
break;
@@ -683,6 +848,103 @@ no_flow_control:
if (qp->s_cur == qp->s_size)
qp->s_cur = 0;
break;
+ case TID_OP(READ_RESP):
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+ goto bail;
+ /* This is used to restart a TID read request */
+ req = wqe_to_tid_req(wqe);
+ wpriv = wqe->priv;
+ /*
+ * Back down. The field qp->s_psn has been set to the psn with
+ * which the request should be restart. It's OK to use division
+ * as this is on the retry path.
+ */
+ req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;
+
+ /*
+ * The following function need to be redefined to return the
+ * status to make sure that we find the flow. At the same
+ * time, we can use the req->state change to check if the
+ * call succeeds or not.
+ */
+ req->state = TID_REQUEST_RESEND;
+ hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+ if (req->state != TID_REQUEST_ACTIVE) {
+ /*
+ * Failed to find the flow. Release all allocated tid
+ * resources.
+ */
+ hfi1_kern_exp_rcv_clear_all(req);
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
+ goto bail;
+ }
+ req->state = TID_REQUEST_RESEND;
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ flow = &req->flows[req->flow_idx];
+ len -= flow->sent;
+ req->s_next_psn = flow->flow_state.ib_lpsn + 1;
+ delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
+ &bth2, &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ qp->s_psn = req->s_next_psn;
+ trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
+ case TID_OP(READ_REQ):
+ req = wqe_to_tid_req(wqe);
+ delta = cmp_psn(qp->s_psn, wqe->psn);
+ /*
+ * If the current WR is not TID RDMA READ, or this is the start
+ * of a new request, we need to change the qp->s_state so that
+ * the request can be set up properly.
+ */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 ||
+ qp->s_cur == qp->s_tail) {
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ if (delta == 0 || qp->s_cur == qp->s_tail)
+ goto check_s_state;
+ else
+ goto bail;
+ }
+
+ /* Rate limiting */
+ if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
+ }
+
+ wpriv = wqe->priv;
+ /* Read one segment at a time */
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
+ &bth2, &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ qp->s_psn = req->s_next_psn;
+ trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
}
qp->s_sending_hpsn = bth2;
delta = delta_psn(bth2, wqe->psn);
@@ -951,6 +1213,43 @@ void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn)
}
/**
+ * update_num_rd_atomic - update the qp->s_num_rd_atomic
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ * @wqe: the wqe
+ *
+ * This is called from reset_psn() to update qp->s_num_rd_atomic
+ * for the current wqe.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn,
+ struct rvt_swqe *wqe)
+{
+ u32 opcode = wqe->wr.opcode;
+
+ if (opcode == IB_WR_RDMA_READ ||
+ opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ qp->s_num_rd_atomic++;
+ } else if (opcode == IB_WR_TID_RDMA_READ) {
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (cmp_psn(psn, wqe->lpsn) <= 0) {
+ u32 cur_seg;
+
+ cur_seg = (psn - wqe->psn) / priv->pkts_ps;
+ req->ack_pending = cur_seg - req->comp_seg;
+ priv->pending_tid_r_segs += req->ack_pending;
+ qp->s_num_rd_atomic += req->ack_pending;
+ } else {
+ priv->pending_tid_r_segs += req->total_segs;
+ qp->s_num_rd_atomic += req->total_segs;
+ }
+ }
+}
+
+/**
* reset_psn - reset the QP state to send starting from PSN
* @qp: the QP
* @psn: the packet sequence number to restart at
@@ -964,9 +1263,12 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
u32 n = qp->s_acked;
struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
u32 opcode;
+ struct hfi1_qp_priv *priv = qp->priv;
lockdep_assert_held(&qp->s_lock);
qp->s_cur = n;
+ priv->pending_tid_r_segs = 0;
+ qp->s_num_rd_atomic = 0;
/*
* If we are starting the request from the beginning,
@@ -976,9 +1278,9 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(SEND_LAST);
goto done;
}
+ update_num_rd_atomic(qp, psn, wqe);
/* Find the work request opcode corresponding to the given PSN. */
- opcode = wqe->wr.opcode;
for (;;) {
int diff;
@@ -988,8 +1290,11 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
break;
wqe = rvt_get_swqe_ptr(qp, n);
diff = cmp_psn(psn, wqe->psn);
- if (diff < 0)
+ if (diff < 0) {
+ /* Point wqe back to the previous one*/
+ wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
break;
+ }
qp->s_cur = n;
/*
* If we are starting the request from the beginning,
@@ -999,8 +1304,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(SEND_LAST);
goto done;
}
- opcode = wqe->wr.opcode;
+
+ update_num_rd_atomic(qp, psn, wqe);
}
+ opcode = wqe->wr.opcode;
/*
* Set the state to restart in the middle of a request.
@@ -1022,6 +1329,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
break;
+ case IB_WR_TID_RDMA_READ:
+ qp->s_state = TID_OP(READ_RESP);
+ break;
+
default:
/*
* This case shouldn't happen since its only
@@ -1030,6 +1341,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(SEND_LAST);
}
done:
+ priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
qp->s_psn = psn;
/*
* Set RVT_S_WAIT_PSN as rc_complete() may start the timer
@@ -1040,6 +1352,7 @@ done:
(cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
qp->s_flags |= RVT_S_WAIT_PSN;
qp->s_flags &= ~HFI1_S_AHG_VALID;
+ trace_hfi1_sender_reset_psn(qp);
}
/*
@@ -1054,6 +1367,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
lockdep_assert_held(&qp->r_lock);
lockdep_assert_held(&qp->s_lock);
+ trace_hfi1_sender_restart_rc(qp);
if (qp->s_retry == 0) {
if (qp->s_mig_state == IB_MIG_ARMED) {
hfi1_migrate_qp(qp);
@@ -1075,8 +1389,16 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
wqe = do_rc_completion(qp, wqe, ibp);
qp->s_flags &= ~RVT_S_WAIT_ACK;
} else {
- rvt_send_complete(qp, wqe,
- IB_WC_RETRY_EXC_ERR);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ struct tid_rdma_request *req;
+
+ req = wqe_to_tid_req(wqe);
+ hfi1_kern_exp_rcv_clear_all(req);
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+ }
+
+ hfi1_trdma_send_complete(qp, wqe,
+ IB_WC_RETRY_EXC_ERR);
rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
}
return;
@@ -1088,7 +1410,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
}
ibp = to_iport(qp->ibqp.device, qp->port_num);
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ)
ibp->rvp.n_rc_resends++;
else
ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
@@ -1115,7 +1438,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
for (;;) {
wqe = rvt_get_swqe_ptr(qp, n);
if (cmp_psn(psn, wqe->lpsn) <= 0) {
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ)
qp->s_sending_psn = wqe->lpsn + 1;
else
qp->s_sending_psn = psn + 1;
@@ -1164,8 +1488,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
}
opcode = ib_bth_get_opcode(ohdr);
- if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
- opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+ if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
+ opcode == TID_OP(READ_RESP)) {
WARN_ON(!qp->s_rdma_ack_cnt);
qp->s_rdma_ack_cnt--;
return;
@@ -1181,8 +1506,12 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
!(qp->s_flags &
(RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
- (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
- rvt_add_retry_timer(qp);
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ if (opcode == TID_OP(READ_REQ))
+ rvt_add_retry_timer_ext(qp, priv->timeout_shift);
+ else
+ rvt_add_retry_timer(qp);
+ }
while (qp->s_last != qp->s_acked) {
u32 s_last;
@@ -1191,6 +1520,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
break;
+ trdma_clean_swqe(qp, wqe);
rvt_qp_wqe_unreserve(qp, wqe);
s_last = qp->s_last;
trace_hfi1_qp_send_completion(qp, wqe, s_last);
@@ -1229,20 +1559,24 @@ static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
* This is similar to hfi1_send_complete but has to check to be sure
* that the SGEs are not being referenced if the SWQE is being resent.
*/
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
- struct rvt_swqe *wqe,
- struct hfi1_ibport *ibp)
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+ struct rvt_swqe *wqe,
+ struct hfi1_ibport *ibp)
{
+ struct hfi1_qp_priv *priv = qp->priv;
+
lockdep_assert_held(&qp->s_lock);
/*
* Don't decrement refcount and don't generate a
* completion if the SWQE is being resent until the send
* is finished.
*/
+ trace_hfi1_rc_completion(qp, wqe->lpsn);
if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
u32 s_last;
+ trdma_clean_swqe(qp, wqe);
rvt_put_swqe(wqe);
rvt_qp_wqe_unreserve(qp, wqe);
s_last = qp->s_last;
@@ -1300,6 +1634,10 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
qp->s_draining = 0;
wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
}
+ if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) {
+ priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
+ hfi1_schedule_send(qp);
+ }
return wqe;
}
@@ -1314,11 +1652,12 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
* May be called at interrupt level, with the QP s_lock held.
* Returns 1 if OK, 0 if current operation should be aborted (NAK).
*/
-static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
- u64 val, struct hfi1_ctxtdata *rcd)
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
+ u64 val, struct hfi1_ctxtdata *rcd)
{
struct hfi1_ibport *ibp;
enum ib_wc_status status;
+ struct hfi1_qp_priv *qpriv = qp->priv;
struct rvt_swqe *wqe;
int ret = 0;
u32 ack_psn;
@@ -1365,6 +1704,8 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
*/
if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
(opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+ (wqe->wr.opcode == IB_WR_TID_RDMA_READ &&
+ (opcode != TID_OP(READ_RESP) || diff != 0)) ||
((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
(opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
@@ -1415,10 +1756,18 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
break;
}
+ trace_hfi1_rc_ack_do(qp, aeth, psn, wqe);
+ trace_hfi1_sender_do_rc_ack(qp);
switch (aeth >> IB_AETH_NAK_SHIFT) {
case 0: /* ACK */
this_cpu_inc(*ibp->rvp.rc_acks);
- if (qp->s_acked != qp->s_tail) {
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ if (wqe_to_tid_req(wqe)->ack_pending)
+ rvt_mod_retry_timer_ext(qp,
+ qpriv->timeout_shift);
+ else
+ rvt_stop_rc_timers(qp);
+ } else if (qp->s_acked != qp->s_tail) {
/*
* We are expecting more ACKs so
* mod the retry timer.
@@ -1507,7 +1856,10 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
ibp->rvp.n_other_naks++;
class_b:
if (qp->s_last == qp->s_acked) {
- rvt_send_complete(qp, wqe, status);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ hfi1_kern_read_tid_flow_free(qp);
+
+ hfi1_trdma_send_complete(qp, wqe, status);
rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
}
break;
@@ -1548,6 +1900,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
while (cmp_psn(psn, wqe->lpsn) > 0) {
if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
break;
@@ -1754,16 +2107,6 @@ bail:
return;
}
-static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
- struct rvt_qp *qp)
-{
- if (list_empty(&qp->rspwait)) {
- qp->r_flags |= RVT_R_RSP_NAK;
- rvt_get_qp(qp);
- list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
- }
-}
-
static inline void rc_cancel_ack(struct rvt_qp *qp)
{
qp->r_adefered = 0;
@@ -1796,8 +2139,9 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
struct hfi1_ibport *ibp = rcd_to_iport(rcd);
struct rvt_ack_entry *e;
unsigned long flags;
- u8 i, prev;
- int old_req;
+ u8 prev;
+ u8 mra; /* most recent ACK */
+ bool old_req;
trace_hfi1_rcv_error(qp, psn);
if (diff > 0) {
@@ -1843,29 +2187,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
spin_lock_irqsave(&qp->s_lock, flags);
- for (i = qp->r_head_ack_queue; ; i = prev) {
- if (i == qp->s_tail_ack_queue)
- old_req = 0;
- if (i)
- prev = i - 1;
- else
- prev = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
- if (prev == qp->r_head_ack_queue) {
- e = NULL;
- break;
- }
- e = &qp->s_ack_queue[prev];
- if (!e->opcode) {
- e = NULL;
- break;
- }
- if (cmp_psn(psn, e->psn) >= 0) {
- if (prev == qp->s_tail_ack_queue &&
- cmp_psn(psn, e->lpsn) <= 0)
- old_req = 0;
- break;
- }
- }
+ e = find_prev_entry(qp, psn, &prev, &mra, &old_req);
+
switch (opcode) {
case OP(RDMA_READ_REQUEST): {
struct ib_reth *reth;
@@ -1940,7 +2263,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
* Resend the most recent ACK if this request is
* after all the previous RDMA reads and atomics.
*/
- if (i == qp->r_head_ack_queue) {
+ if (mra == qp->r_head_ack_queue) {
spin_unlock_irqrestore(&qp->s_lock, flags);
qp->r_nak_state = 0;
qp->r_ack_psn = qp->r_psn - 1;
@@ -1951,7 +2274,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
* Resend the RDMA read or atomic op which
* ACKs this duplicate request.
*/
- qp->s_tail_ack_queue = i;
+ qp->s_tail_ack_queue = mra;
break;
}
qp->s_ack_state = OP(ACKNOWLEDGE);
@@ -1968,17 +2291,6 @@ send_ack:
return 0;
}
-static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
-{
- unsigned next;
-
- next = n + 1;
- if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
- next = 0;
- qp->s_tail_ack_queue = next;
- qp->s_ack_state = OP(ACKNOWLEDGE);
-}
-
static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
u32 lqpn, u32 rqpn, u8 svc_type)
{
diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h
new file mode 100644
index 000000000000..4329eadcb3df
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/rc.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#ifndef HFI1_RC_H
+#define HFI1_RC_H
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n)
+{
+ unsigned int next;
+
+ next = n + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ qp->s_tail_ack_queue = next;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
+ struct rvt_qp *qp)
+{
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_NAK;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+}
+
+static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
+ u32 psn, u32 pmtu)
+{
+ u32 len;
+
+ len = delta_psn(psn, wqe->psn) * pmtu;
+ return rvt_restart_sge(ss, wqe, len);
+}
+
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+ u8 *prev_ack, bool *scheduled);
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val,
+ struct hfi1_ctxtdata *rcd);
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct hfi1_ibport *ibp);
+
+#endif /* HFI1_RC_H */
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index e8f57c0cd8bc..0ee79403acaf 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -5,10 +5,48 @@
*/
#include "hfi.h"
+#include "qp.h"
+#include "rc.h"
#include "verbs.h"
#include "tid_rdma.h"
+#include "exp_rcv.h"
#include "trace.h"
+/**
+ * DOC: TID RDMA READ protocol
+ *
+ * This is an end-to-end protocol at the hfi1 level between two nodes that
+ * improves performance by avoiding data copy on the requester side. It
+ * converts a qualified RDMA READ request into a TID RDMA READ request on
+ * the requester side and thereafter handles the request and response
+ * differently. To be qualified, the RDMA READ request should meet the
+ * following:
+ * -- The total data length should be greater than 256K;
+ * -- The total data length should be a multiple of 4K page size;
+ * -- Each local scatter-gather entry should be 4K page aligned;
+ * -- Each local scatter-gather entry should be a multiple of 4K page size;
+ */
+
+#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
+#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
+#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
+#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
+#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
+#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)
+
+/* Maximum number of packets within a flow generation. */
+#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
+
+#define GENERATION_MASK 0xFFFFF
+
+static u32 mask_generation(u32 a)
+{
+ return a & GENERATION_MASK;
+}
+
+/* Reserved generation value to set to unused flows for kernel contexts */
+#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
+
/*
* J_KEY for kernel contexts when TID RDMA is used.
* See generate_jkey() in hfi.h for more information.
@@ -17,8 +55,19 @@
#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
+/* Maximum number of segments in flight per QP request. */
#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6
#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
+#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
+ TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
+#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
+
+#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE)
+
+#define TID_RDMA_DESTQP_FLOW_SHIFT 11
+#define TID_RDMA_DESTQP_FLOW_MASK 0x1f
+
+#define TID_FLOW_SW_PSN BIT(0)
#define TID_OPFN_QP_CTXT_MASK 0xff
#define TID_OPFN_QP_CTXT_SHIFT 56
@@ -60,6 +109,13 @@
* C - Capcode
*/
+static void tid_rdma_trigger_resume(struct work_struct *work);
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+ gfp_t gfp);
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+ struct tid_rdma_request *req);
+
static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
{
return
@@ -210,7 +266,7 @@ int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
rcd->jkey = TID_RDMA_JKEY;
hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
- return 0;
+ return hfi1_alloc_ctxt_rcv_groups(rcd);
}
/**
@@ -246,19 +302,2676 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
struct ib_qp_init_attr *init_attr)
{
struct hfi1_qp_priv *qpriv = qp->priv;
+ int i, ret;
qpriv->rcd = qp_to_rcd(rdi, qp);
spin_lock_init(&qpriv->opfn.lock);
INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
+ INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
+ qpriv->flow_state.psn = 0;
+ qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
+ qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
+ qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
+ INIT_LIST_HEAD(&qpriv->tid_wait);
+
+ if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+ struct hfi1_devdata *dd = qpriv->rcd->dd;
+
+ qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
+ sizeof(*qpriv->pages),
+ GFP_KERNEL, dd->node);
+ if (!qpriv->pages)
+ return -ENOMEM;
+ for (i = 0; i < qp->s_size; i++) {
+ struct hfi1_swqe_priv *priv;
+ struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
+
+ priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+ dd->node);
+ if (!priv)
+ return -ENOMEM;
+
+ hfi1_init_trdma_req(qp, &priv->tid_req);
+ priv->tid_req.e.swqe = wqe;
+ wqe->priv = priv;
+ }
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ struct hfi1_ack_priv *priv;
+
+ priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+ dd->node);
+ if (!priv)
+ return -ENOMEM;
+
+ hfi1_init_trdma_req(qp, &priv->tid_req);
+ priv->tid_req.e.ack = &qp->s_ack_queue[i];
+
+ ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
+ GFP_KERNEL);
+ if (ret) {
+ kfree(priv);
+ return ret;
+ }
+ qp->s_ack_queue[i].priv = priv;
+ }
+ }
return 0;
}
void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct rvt_swqe *wqe;
+ u32 i;
+
+ if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+ for (i = 0; i < qp->s_size; i++) {
+ wqe = rvt_get_swqe_ptr(qp, i);
+ kfree(wqe->priv);
+ wqe->priv = NULL;
+ }
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
+
+ if (priv)
+ hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
+ kfree(priv);
+ qp->s_ack_queue[i].priv = NULL;
+ }
+ cancel_work_sync(&qpriv->opfn.opfn_work);
+ kfree(qpriv->pages);
+ qpriv->pages = NULL;
+ }
+}
+
+/* Flow and tid waiter functions */
+/**
+ * DOC: lock ordering
+ *
+ * There are two locks involved with the queuing
+ * routines: the qp s_lock and the exp_lock.
+ *
+ * Since the tid space allocation is called from
+ * the send engine, the qp s_lock is already held.
+ *
+ * The allocation routines will get the exp_lock.
+ *
+ * The first_qp() call is provided to allow the head of
+ * the rcd wait queue to be fetched under the exp_lock and
+ * followed by a drop of the exp_lock.
+ *
+ * Any qp in the wait list will have the qp reference count held
+ * to hold the qp in memory.
+ */
+
+/*
+ * return head of rcd wait list
+ *
+ * Must hold the exp_lock.
+ *
+ * Get a reference to the QP to hold the QP in memory.
+ *
+ * The caller must release the reference when the local
+ * is no longer being used.
+ */
+static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
+ struct tid_queue *queue)
+ __must_hold(&rcd->exp_lock)
+{
+ struct hfi1_qp_priv *priv;
+
+ lockdep_assert_held(&rcd->exp_lock);
+ priv = list_first_entry_or_null(&queue->queue_head,
+ struct hfi1_qp_priv,
+ tid_wait);
+ if (!priv)
+ return NULL;
+ rvt_get_qp(priv->owner);
+ return priv->owner;
+}
+
+/**
+ * kernel_tid_waiters - determine rcd wait
+ * @rcd: the receive context
+ * @qp: the head of the qp being processed
+ *
+ * This routine will return false IFF
+ * the list is NULL or the head of the
+ * list is the indicated qp.
+ *
+ * Must hold the qp s_lock and the exp_lock.
+ *
+ * Return:
+ * false if either of the conditions below are statisfied:
+ * 1. The list is empty or
+ * 2. The indicated qp is at the head of the list and the
+ * HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
+ * true is returned otherwise.
+ */
+static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
+ struct tid_queue *queue, struct rvt_qp *qp)
+ __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+ struct rvt_qp *fqp;
+ bool ret = true;
+
+ lockdep_assert_held(&qp->s_lock);
+ lockdep_assert_held(&rcd->exp_lock);
+ fqp = first_qp(rcd, queue);
+ if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
+ ret = false;
+ rvt_put_qp(fqp);
+ return ret;
+}
+
+/**
+ * dequeue_tid_waiter - dequeue the qp from the list
+ * @qp - the qp to remove the wait list
+ *
+ * This routine removes the indicated qp from the
+ * wait list if it is there.
+ *
+ * This should be done after the hardware flow and
+ * tid array resources have been allocated.
+ *
+ * Must hold the qp s_lock and the rcd exp_lock.
+ *
+ * It assumes the s_lock to protect the s_flags
+ * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
+ */
+static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
+ struct tid_queue *queue, struct rvt_qp *qp)
+ __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ lockdep_assert_held(&rcd->exp_lock);
+ if (list_empty(&priv->tid_wait))
+ return;
+ list_del_init(&priv->tid_wait);
+ qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+ queue->dequeue++;
+ rvt_put_qp(qp);
+}
+
+/**
+ * queue_qp_for_tid_wait - suspend QP on tid space
+ * @rcd: the receive context
+ * @qp: the qp
+ *
+ * The qp is inserted at the tail of the rcd
+ * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
+ *
+ * Must hold the qp s_lock and the exp_lock.
+ */
+static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
+ struct tid_queue *queue, struct rvt_qp *qp)
+ __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ lockdep_assert_held(&rcd->exp_lock);
+ if (list_empty(&priv->tid_wait)) {
+ qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
+ list_add_tail(&priv->tid_wait, &queue->queue_head);
+ priv->tid_enqueue = ++queue->enqueue;
+ rcd->dd->verbs_dev.n_tidwait++;
+ trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
+ rvt_get_qp(qp);
+ }
+}
+
+/**
+ * __trigger_tid_waiter - trigger tid waiter
+ * @qp: the qp
+ *
+ * This is a private entrance to schedule the qp
+ * assuming the caller is holding the qp->s_lock.
+ */
+static void __trigger_tid_waiter(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ lockdep_assert_held(&qp->s_lock);
+ if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
+ return;
+ trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
+ hfi1_schedule_send(qp);
+}
+
+/**
+ * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
+ * @qp - the qp
+ *
+ * trigger a schedule or a waiting qp in a deadlock
+ * safe manner. The qp reference is held prior
+ * to this call via first_qp().
+ *
+ * If the qp trigger was already scheduled (!rval)
+ * the the reference is dropped, otherwise the resume
+ * or the destroy cancel will dispatch the reference.
+ */
+static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv;
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd;
+ bool rval;
+
+ if (!qp)
+ return;
+
+ priv = qp->priv;
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ppd = ppd_from_ibp(ibp);
+ dd = dd_from_ibdev(qp->ibqp.device);
+
+ rval = queue_work_on(priv->s_sde ?
+ priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(dd->node)),
+ ppd->hfi1_wq,
+ &priv->tid_rdma.trigger_work);
+ if (!rval)
+ rvt_put_qp(qp);
+}
+
+/**
+ * tid_rdma_trigger_resume - field a trigger work request
+ * @work - the work item
+ *
+ * Complete the off qp trigger processing by directly
+ * calling the progress routine.
+ */
+static void tid_rdma_trigger_resume(struct work_struct *work)
+{
+ struct tid_rdma_qp_params *tr;
+ struct hfi1_qp_priv *priv;
+ struct rvt_qp *qp;
+
+ tr = container_of(work, struct tid_rdma_qp_params, trigger_work);
+ priv = container_of(tr, struct hfi1_qp_priv, tid_rdma);
+ qp = priv->owner;
+ spin_lock_irq(&qp->s_lock);
+ if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
+ spin_unlock_irq(&qp->s_lock);
+ hfi1_do_send(priv->owner, true);
+ } else {
+ spin_unlock_irq(&qp->s_lock);
+ }
+ rvt_put_qp(qp);
+}
+
+/**
+ * tid_rdma_flush_wait - unwind any tid space wait
+ *
+ * This is called when resetting a qp to
+ * allow a destroy or reset to get rid
+ * of any tid space linkage and reference counts.
+ */
+static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv;
+
+ if (!qp)
+ return;
+ lockdep_assert_held(&qp->s_lock);
+ priv = qp->priv;
+ qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+ spin_lock(&priv->rcd->exp_lock);
+ if (!list_empty(&priv->tid_wait)) {
+ list_del_init(&priv->tid_wait);
+ qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+ queue->dequeue++;
+ rvt_put_qp(qp);
+ }
+ spin_unlock(&priv->rcd->exp_lock);
+}
+
+void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue);
+ _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue);
+}
+
+/* Flow functions */
+/**
+ * kern_reserve_flow - allocate a hardware flow
+ * @rcd - the context to use for allocation
+ * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
+ * signify "don't care".
+ *
+ * Use a bit mask based allocation to reserve a hardware
+ * flow for use in receiving KDETH data packets. If a preferred flow is
+ * specified the function will attempt to reserve that flow again, if
+ * available.
+ *
+ * The exp_lock must be held.
+ *
+ * Return:
+ * On success: a value postive value between 0 and RXE_NUM_TID_FLOWS - 1
+ * On failure: -EAGAIN
+ */
+static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
+ __must_hold(&rcd->exp_lock)
+{
+ int nr;
+
+ /* Attempt to reserve the preferred flow index */
+ if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
+ !test_and_set_bit(last, &rcd->flow_mask))
+ return last;
+
+ nr = ffz(rcd->flow_mask);
+ BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
+ (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
+ if (nr > (RXE_NUM_TID_FLOWS - 1))
+ return -EAGAIN;
+ set_bit(nr, &rcd->flow_mask);
+ return nr;
+}
+
+static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
+ u32 flow_idx)
+{
+ u64 reg;
+
+ reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
+ RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
+ RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
+ RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
+ RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
+ RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;
+
+ if (generation != KERN_GENERATION_RESERVED)
+ reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;
+
+ write_uctxt_csr(rcd->dd, rcd->ctxt,
+ RCV_TID_FLOW_TABLE + 8 * flow_idx, reg);
+}
+
+static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
+ __must_hold(&rcd->exp_lock)
+{
+ u32 generation = rcd->flows[flow_idx].generation;
+
+ kern_set_hw_flow(rcd, generation, flow_idx);
+ return generation;
+}
+
+static u32 kern_flow_generation_next(u32 gen)
+{
+ u32 generation = mask_generation(gen + 1);
+
+ if (generation == KERN_GENERATION_RESERVED)
+ generation = mask_generation(generation + 1);
+ return generation;
+}
+
+static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
+ __must_hold(&rcd->exp_lock)
+{
+ rcd->flows[flow_idx].generation =
+ kern_flow_generation_next(rcd->flows[flow_idx].generation);
+ kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
+}
+
+int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ struct rvt_qp *fqp;
+ unsigned long flags;
+ int ret = 0;
+
+ /* The QP already has an allocated flow */
+ if (fs->index != RXE_NUM_TID_FLOWS)
+ return ret;
+
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+ if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
+ goto queue;
+
+ ret = kern_reserve_flow(rcd, fs->last_index);
+ if (ret < 0)
+ goto queue;
+ fs->index = ret;
+ fs->last_index = fs->index;
+
+ /* Generation received in a RESYNC overrides default flow generation */
+ if (fs->generation != KERN_GENERATION_RESERVED)
+ rcd->flows[fs->index].generation = fs->generation;
+ fs->generation = kern_setup_hw_flow(rcd, fs->index);
+ fs->psn = 0;
+ fs->flags = 0;
+ dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->flow_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+ tid_rdma_schedule_tid_wakeup(fqp);
+ return 0;
+queue:
+ queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ return -EAGAIN;
+}
+
+void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ struct rvt_qp *fqp;
+ unsigned long flags;
+
+ if (fs->index >= RXE_NUM_TID_FLOWS)
+ return;
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+ kern_clear_hw_flow(rcd, fs->index);
+ clear_bit(fs->index, &rcd->flow_mask);
+ fs->index = RXE_NUM_TID_FLOWS;
+ fs->psn = 0;
+ fs->generation = KERN_GENERATION_RESERVED;
+
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->flow_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+ if (fqp == qp) {
+ __trigger_tid_waiter(fqp);
+ rvt_put_qp(fqp);
+ } else {
+ tid_rdma_schedule_tid_wakeup(fqp);
+ }
+}
+
+void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
+{
+ int i;
+
+ for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
+ rcd->flows[i].generation = mask_generation(prandom_u32());
+ kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
+ }
+}
+
+/* TID allocation functions */
+static u8 trdma_pset_order(struct tid_rdma_pageset *s)
+{
+ u8 count = s->count;
+
+ return ilog2(count) + 1;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_4k - get groups base on mr info
+ * @npages - number of pages
+ * @pages - pointer to an array of page structs
+ * @list - page set array to return
+ *
+ * This routine returns the number of groups associated with
+ * the current sge information. This implementation is based
+ * on the expected receive find_phys_blocks() adjusted to
+ * use the MR information vs. the pfn.
+ *
+ * Return:
+ * the number of RcvArray entries
+ */
+static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
+ struct page **pages,
+ u32 npages,
+ struct tid_rdma_pageset *list)
+{
+ u32 pagecount, pageidx, setcount = 0, i;
+ void *vaddr, *this_vaddr;
+
+ if (!npages)
+ return 0;
+
+ /*
+ * Look for sets of physically contiguous pages in the user buffer.
+ * This will allow us to optimize Expected RcvArray entry usage by
+ * using the bigger supported sizes.
+ */
+ vaddr = page_address(pages[0]);
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
+ for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
+ this_vaddr = i < npages ? page_address(pages[i]) : NULL;
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
+ this_vaddr);
+ /*
+ * If the vaddr's are not sequential, pages are not physically
+ * contiguous.
+ */
+ if (this_vaddr != (vaddr + PAGE_SIZE)) {
+ /*
+ * At this point we have to loop over the set of
+ * physically contiguous pages and break them down it
+ * sizes supported by the HW.
+ * There are two main constraints:
+ * 1. The max buffer size is MAX_EXPECTED_BUFFER.
+ * If the total set size is bigger than that
+ * program only a MAX_EXPECTED_BUFFER chunk.
+ * 2. The buffer size has to be a power of two. If
+ * it is not, round down to the closes power of
+ * 2 and program that size.
+ */
+ while (pagecount) {
+ int maxpages = pagecount;
+ u32 bufsize = pagecount * PAGE_SIZE;
+
+ if (bufsize > MAX_EXPECTED_BUFFER)
+ maxpages =
+ MAX_EXPECTED_BUFFER >>
+ PAGE_SHIFT;
+ else if (!is_power_of_2(bufsize))
+ maxpages =
+ rounddown_pow_of_two(bufsize) >>
+ PAGE_SHIFT;
+
+ list[setcount].idx = pageidx;
+ list[setcount].count = maxpages;
+ trace_hfi1_tid_pageset(flow->req->qp, setcount,
+ list[setcount].idx,
+ list[setcount].count);
+ pagecount -= maxpages;
+ pageidx += maxpages;
+ setcount++;
+ }
+ pageidx = i;
+ pagecount = 1;
+ vaddr = this_vaddr;
+ } else {
+ vaddr += PAGE_SIZE;
+ pagecount++;
+ }
+ }
+ /* insure we always return an even number of sets */
+ if (setcount & 1)
+ list[setcount++].count = 0;
+ return setcount;
+}
+
+/**
+ * tid_flush_pages - dump out pages into pagesets
+ * @list - list of pagesets
+ * @idx - pointer to current page index
+ * @pages - number of pages to dump
+ * @sets - current number of pagesset
+ *
+ * This routine flushes out accumuated pages.
+ *
+ * To insure an even number of sets the
+ * code may add a filler.
+ *
+ * This can happen with when pages is not
+ * a power of 2 or pages is a power of 2
+ * less than the maximum pages.
+ *
+ * Return:
+ * The new number of sets
+ */
+
+static u32 tid_flush_pages(struct tid_rdma_pageset *list,
+ u32 *idx, u32 pages, u32 sets)
+{
+ while (pages) {
+ u32 maxpages = pages;
+
+ if (maxpages > MAX_EXPECTED_PAGES)
+ maxpages = MAX_EXPECTED_PAGES;
+ else if (!is_power_of_2(maxpages))
+ maxpages = rounddown_pow_of_two(maxpages);
+ list[sets].idx = *idx;
+ list[sets++].count = maxpages;
+ *idx += maxpages;
+ pages -= maxpages;
+ }
+ /* might need a filler */
+ if (sets & 1)
+ list[sets++].count = 0;
+ return sets;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_8k - get groups base on mr info
+ * @pages - pointer to an array of page structs
+ * @npages - number of pages
+ * @list - page set array to return
+ *
+ * This routine parses an array of pages to compute pagesets
+ * in an 8k compatible way.
+ *
+ * pages are tested two at a time, i, i + 1 for contiguous
+ * pages and i - 1 and i contiguous pages.
+ *
+ * If any condition is false, any accumlated pages are flushed and
+ * v0,v1 are emitted as separate PAGE_SIZE pagesets
+ *
+ * Otherwise, the current 8k is totaled for a future flush.
+ *
+ * Return:
+ * The number of pagesets
+ * list set with the returned number of pagesets
+ *
+ */
+static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
+ struct page **pages,
+ u32 npages,
+ struct tid_rdma_pageset *list)
+{
+ u32 idx, sets = 0, i;
+ u32 pagecnt = 0;
+ void *v0, *v1, *vm1;
+
+ if (!npages)
+ return 0;
+ for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
+ /* get a new v0 */
+ v0 = page_address(pages[i]);
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
+ v1 = i + 1 < npages ?
+ page_address(pages[i + 1]) : NULL;
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
+ /* compare i, i + 1 vaddr */
+ if (v1 != (v0 + PAGE_SIZE)) {
+ /* flush out pages */
+ sets = tid_flush_pages(list, &idx, pagecnt, sets);
+ /* output v0,v1 as two pagesets */
+ list[sets].idx = idx++;
+ list[sets++].count = 1;
+ if (v1) {
+ list[sets].count = 1;
+ list[sets++].idx = idx++;
+ } else {
+ list[sets++].count = 0;
+ }
+ vm1 = NULL;
+ pagecnt = 0;
+ continue;
+ }
+ /* i,i+1 consecutive, look at i-1,i */
+ if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
+ /* flush out pages */
+ sets = tid_flush_pages(list, &idx, pagecnt, sets);
+ pagecnt = 0;
+ }
+ /* pages will always be a multiple of 8k */
+ pagecnt += 2;
+ /* save i-1 */
+ vm1 = v1;
+ /* move to next pair */
+ }
+ /* dump residual pages at end */
+ sets = tid_flush_pages(list, &idx, npages - idx, sets);
+ /* by design cannot be odd sets */
+ WARN_ON(sets & 1);
+ return sets;
+}
+
+/**
+ * Find pages for one segment of a sge array represented by @ss. The function
+ * does not check the sge, the sge must have been checked for alignment with a
+ * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
+ * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
+ * copy maintained in @ss->sge, the original sge is not modified.
+ *
+ * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
+ * releasing the MR reference count at the same time. Otherwise, we'll "leak"
+ * references to the MR. This difference requires that we keep track of progress
+ * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
+ * structure.
+ */
+static u32 kern_find_pages(struct tid_rdma_flow *flow,
+ struct page **pages,
+ struct rvt_sge_state *ss, bool *last)
+{
+ struct tid_rdma_request *req = flow->req;
+ struct rvt_sge *sge = &ss->sge;
+ u32 length = flow->req->seg_len;
+ u32 len = PAGE_SIZE;
+ u32 i = 0;
+
+ while (length && req->isge < ss->num_sge) {
+ pages[i++] = virt_to_page(sge->vaddr);
+
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (!sge->sge_length) {
+ if (++req->isge < ss->num_sge)
+ *sge = ss->sg_list[req->isge - 1];
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= RVT_SEGSZ) {
+ ++sge->m;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+
+ flow->length = flow->req->seg_len - length;
+ *last = req->isge == ss->num_sge ? false : true;
+ return i;
+}
+
+static void dma_unmap_flow(struct tid_rdma_flow *flow)
+{
+ struct hfi1_devdata *dd;
+ int i;
+ struct tid_rdma_pageset *pset;
+
+ dd = flow->req->rcd->dd;
+ for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+ i++, pset++) {
+ if (pset->count && pset->addr) {
+ dma_unmap_page(&dd->pcidev->dev,
+ pset->addr,
+ PAGE_SIZE * pset->count,
+ DMA_FROM_DEVICE);
+ pset->mapped = 0;
+ }
+ }
+}
+
+static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
+{
+ int i;
+ struct hfi1_devdata *dd = flow->req->rcd->dd;
+ struct tid_rdma_pageset *pset;
+
+ for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+ i++, pset++) {
+ if (pset->count) {
+ pset->addr = dma_map_page(&dd->pcidev->dev,
+ pages[pset->idx],
+ 0,
+ PAGE_SIZE * pset->count,
+ DMA_FROM_DEVICE);
+
+ if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
+ dma_unmap_flow(flow);
+ return -ENOMEM;
+ }
+ pset->mapped = 1;
+ }
+ }
+ return 0;
+}
+
+static inline bool dma_mapped(struct tid_rdma_flow *flow)
+{
+ return !!flow->pagesets[0].mapped;
+}
+
+/*
+ * Get pages pointers and identify contiguous physical memory chunks for a
+ * segment. All segments are of length flow->req->seg_len.
+ */
+static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
+ struct page **pages,
+ struct rvt_sge_state *ss, bool *last)
+{
+ u8 npages;
+
+ /* Reuse previously computed pagesets, if any */
+ if (flow->npagesets) {
+ trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
+ flow);
+ if (!dma_mapped(flow))
+ return dma_map_flow(flow, pages);
+ return 0;
+ }
+
+ npages = kern_find_pages(flow, pages, ss, last);
+
+ if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
+ flow->npagesets =
+ tid_rdma_find_phys_blocks_4k(flow, pages, npages,
+ flow->pagesets);
+ else
+ flow->npagesets =
+ tid_rdma_find_phys_blocks_8k(flow, pages, npages,
+ flow->pagesets);
+
+ return dma_map_flow(flow, pages);
+}
+
+static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
+ struct hfi1_ctxtdata *rcd, char *s,
+ struct tid_group *grp, u8 cnt)
+{
+ struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];
+
+ WARN_ON_ONCE(flow->tnode_cnt >=
+ (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
+ if (WARN_ON_ONCE(cnt & 1))
+ dd_dev_err(rcd->dd,
+ "unexpected odd allocation cnt %u map 0x%x used %u",
+ cnt, grp->map, grp->used);
+
+ node->grp = grp;
+ node->map = grp->map;
+ node->cnt = cnt;
+ trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
+ grp->base, grp->map, grp->used, cnt);
+}
+
+/*
+ * Try to allocate pageset_count TID's from TID groups for a context
+ *
+ * This function allocates TID's without moving groups between lists or
+ * modifying grp->map. This is done as follows, being cogizant of the lists
+ * between which the TID groups will move:
+ * 1. First allocate complete groups of 8 TID's since this is more efficient,
+ * these groups will move from group->full without affecting used
+ * 2. If more TID's are needed allocate from used (will move from used->full or
+ * stay in used)
+ * 3. If we still don't have the required number of TID's go back and look again
+ * at a complete group (will move from group->used)
+ */
+static int kern_alloc_tids(struct tid_rdma_flow *flow)
+{
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 ngroups, pageidx = 0;
+ struct tid_group *group = NULL, *used;
+ u8 use;
+
+ flow->tnode_cnt = 0;
+ ngroups = flow->npagesets / dd->rcv_entries.group_size;
+ if (!ngroups)
+ goto used_list;
+
+ /* First look at complete groups */
+ list_for_each_entry(group, &rcd->tid_group_list.list, list) {
+ kern_add_tid_node(flow, rcd, "complete groups", group,
+ group->size);
+
+ pageidx += group->size;
+ if (!--ngroups)
+ break;
+ }
+
+ if (pageidx >= flow->npagesets)
+ goto ok;
+
+used_list:
+ /* Now look at partially used groups */
+ list_for_each_entry(used, &rcd->tid_used_list.list, list) {
+ use = min_t(u32, flow->npagesets - pageidx,
+ used->size - used->used);
+ kern_add_tid_node(flow, rcd, "used groups", used, use);
+
+ pageidx += use;
+ if (pageidx >= flow->npagesets)
+ goto ok;
+ }
+
+ /*
+ * Look again at a complete group, continuing from where we left.
+ * However, if we are at the head, we have reached the end of the
+ * complete groups list from the first loop above
+ */
+ if (group && &group->list == &rcd->tid_group_list.list)
+ goto bail_eagain;
+ group = list_prepare_entry(group, &rcd->tid_group_list.list,
+ list);
+ if (list_is_last(&group->list, &rcd->tid_group_list.list))
+ goto bail_eagain;
+ group = list_next_entry(group, list);
+ use = min_t(u32, flow->npagesets - pageidx, group->size);
+ kern_add_tid_node(flow, rcd, "complete continue", group, use);
+ pageidx += use;
+ if (pageidx >= flow->npagesets)
+ goto ok;
+bail_eagain:
+ trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
+ (u64)flow->npagesets);
+ return -EAGAIN;
+ok:
+ return 0;
+}
+
+static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
+ u32 *pset_idx)
+{
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+ struct kern_tid_node *node = &flow->tnode[grp_num];
+ struct tid_group *grp = node->grp;
+ struct tid_rdma_pageset *pset;
+ u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
+ u32 rcventry, npages = 0, pair = 0, tidctrl;
+ u8 i, cnt = 0;
+
+ for (i = 0; i < grp->size; i++) {
+ rcventry = grp->base + i;
+
+ if (node->map & BIT(i) || cnt >= node->cnt) {
+ rcv_array_wc_fill(dd, rcventry);
+ continue;
+ }
+ pset = &flow->pagesets[(*pset_idx)++];
+ if (pset->count) {
+ hfi1_put_tid(dd, rcventry, PT_EXPECTED,
+ pset->addr, trdma_pset_order(pset));
+ } else {
+ hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+ }
+ npages += pset->count;
+
+ rcventry -= rcd->expected_base;
+ tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
+ /*
+ * A single TID entry will be used to use a rcvarr pair (with
+ * tidctrl 0x3), if ALL these are true (a) the bit pos is even
+ * (b) the group map shows current and the next bits as free
+ * indicating two consecutive rcvarry entries are available (c)
+ * we actually need 2 more entries
+ */
+ pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
+ node->cnt >= cnt + 2;
+ if (!pair) {
+ if (!pset->count)
+ tidctrl = 0x1;
+ flow->tid_entry[flow->tidcnt++] =
+ EXP_TID_SET(IDX, rcventry >> 1) |
+ EXP_TID_SET(CTRL, tidctrl) |
+ EXP_TID_SET(LEN, npages);
+ trace_hfi1_tid_entry_alloc(/* entry */
+ flow->req->qp, flow->tidcnt - 1,
+ flow->tid_entry[flow->tidcnt - 1]);
+
+ /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
+ flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
+ npages = 0;
+ }
+
+ if (grp->used == grp->size - 1)
+ tid_group_move(grp, &rcd->tid_used_list,
+ &rcd->tid_full_list);
+ else if (!grp->used)
+ tid_group_move(grp, &rcd->tid_group_list,
+ &rcd->tid_used_list);
+
+ grp->used++;
+ grp->map |= BIT(i);
+ cnt++;
+ }
+}
+
+static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
+{
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+ struct kern_tid_node *node = &flow->tnode[grp_num];
+ struct tid_group *grp = node->grp;
+ u32 rcventry;
+ u8 i, cnt = 0;
+
+ for (i = 0; i < grp->size; i++) {
+ rcventry = grp->base + i;
+
+ if (node->map & BIT(i) || cnt >= node->cnt) {
+ rcv_array_wc_fill(dd, rcventry);
+ continue;
+ }
+
+ hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+
+ grp->used--;
+ grp->map &= ~BIT(i);
+ cnt++;
+
+ if (grp->used == grp->size - 1)
+ tid_group_move(grp, &rcd->tid_full_list,
+ &rcd->tid_used_list);
+ else if (!grp->used)
+ tid_group_move(grp, &rcd->tid_used_list,
+ &rcd->tid_group_list);
+ }
+ if (WARN_ON_ONCE(cnt & 1)) {
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+
+ dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
+ cnt, grp->map, grp->used);
+ }
+}
+
+static void kern_program_rcvarray(struct tid_rdma_flow *flow)
+{
+ u32 pset_idx = 0;
+ int i;
+
+ flow->npkts = 0;
+ flow->tidcnt = 0;
+ for (i = 0; i < flow->tnode_cnt; i++)
+ kern_program_rcv_group(flow, i, &pset_idx);
+ trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
+}
+
+/**
+ * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
+ * TID RDMA request
+ *
+ * @req: TID RDMA request for which the segment/flow is being set up
+ * @ss: sge state, maintains state across successive segments of a sge
+ * @last: set to true after the last sge segment has been processed
+ *
+ * This function
+ * (1) finds a free flow entry in the flow circular buffer
+ * (2) finds pages and continuous physical chunks constituing one segment
+ * of an sge
+ * (3) allocates TID group entries for those chunks
+ * (4) programs rcvarray entries in the hardware corresponding to those
+ * TID's
+ * (5) computes a tidarray with formatted TID entries which can be sent
+ * to the sender
+ * (6) Reserves and programs HW flows.
+ * (7) It also manages queing the QP when TID/flow resources are not
+ * available.
+ *
+ * @req points to struct tid_rdma_request of which the segments are a part. The
+ * function uses qp, rcd and seg_len members of @req. In the absence of errors,
+ * req->flow_idx is the index of the flow which has been prepared in this
+ * invocation of function call. With flow = &req->flows[req->flow_idx],
+ * flow->tid_entry contains the TID array which the sender can use for TID RDMA
+ * sends and flow->npkts contains number of packets required to send the
+ * segment.
+ *
+ * hfi1_check_sge_align should be called prior to calling this function and if
+ * it signals error TID RDMA cannot be used for this sge and this function
+ * should not be called.
+ *
+ * For the queuing, caller must hold the flow->req->qp s_lock from the send
+ * engine and the function will procure the exp_lock.
+ *
+ * Return:
+ * The function returns -EAGAIN if sufficient number of TID/flow resources to
+ * map the segment could not be allocated. In this case the function should be
+ * called again with previous arguments to retry the TID allocation. There are
+ * no other error returns. The function returns 0 on success.
+ */
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+ struct rvt_sge_state *ss, bool *last)
+ __must_hold(&req->qp->s_lock)
+{
+ struct tid_rdma_flow *flow = &req->flows[req->setup_head];
+ struct hfi1_ctxtdata *rcd = req->rcd;
+ struct hfi1_qp_priv *qpriv = req->qp->priv;
+ unsigned long flags;
+ struct rvt_qp *fqp;
+ u16 clear_tail = req->clear_tail;
+
+ lockdep_assert_held(&req->qp->s_lock);
+ /*
+ * We return error if either (a) we don't have space in the flow
+ * circular buffer, or (b) we already have max entries in the buffer.
+ * Max entries depend on the type of request we are processing and the
+ * negotiated TID RDMA parameters.
+ */
+ if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
+ CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
+ req->n_flows)
+ return -EINVAL;
+
+ /*
+ * Get pages, identify contiguous physical memory chunks for the segment
+ * If we can not determine a DMA address mapping we will treat it just
+ * like if we ran out of space above.
+ */
+ if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
+ hfi1_wait_kmem(flow->req->qp);
+ return -ENOMEM;
+ }
+
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+ if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
+ goto queue;
+
+ /*
+ * At this point we know the number of pagesets and hence the number of
+ * TID's to map the segment. Allocate the TID's from the TID groups. If
+ * we cannot allocate the required number we exit and try again later
+ */
+ if (kern_alloc_tids(flow))
+ goto queue;
+ /*
+ * Finally program the TID entries with the pagesets, compute the
+ * tidarray and enable the HW flow
+ */
+ kern_program_rcvarray(flow);
+
+ /*
+ * Setup the flow state with relevant information.
+ * This information is used for tracking the sequence of data packets
+ * for the segment.
+ * The flow is setup here as this is the most accurate time and place
+ * to do so. Doing at a later time runs the risk of the flow data in
+ * qpriv getting out of sync.
+ */
+ memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
+ flow->idx = qpriv->flow_state.index;
+ flow->flow_state.generation = qpriv->flow_state.generation;
+ flow->flow_state.spsn = qpriv->flow_state.psn;
+ flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
+ flow->flow_state.r_next_psn =
+ full_flow_psn(flow, flow->flow_state.spsn);
+ qpriv->flow_state.psn += flow->npkts;
+
+ dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->rarr_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ tid_rdma_schedule_tid_wakeup(fqp);
+
+ req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+ return 0;
+queue:
+ queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ return -EAGAIN;
+}
+
+static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
+{
+ flow->npagesets = 0;
+}
+
+/*
+ * This function is called after one segment has been successfully sent to
+ * release the flow and TID HW/SW resources for that segment. The segments for a
+ * TID RDMA request are setup and cleared in FIFO order which is managed using a
+ * circular buffer.
+ */
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
+ __must_hold(&req->qp->s_lock)
+{
+ struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+ struct hfi1_ctxtdata *rcd = req->rcd;
+ unsigned long flags;
+ int i;
+ struct rvt_qp *fqp;
+
+ lockdep_assert_held(&req->qp->s_lock);
+ /* Exit if we have nothing in the flow circular buffer */
+ if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
+ return -EINVAL;
+
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+
+ for (i = 0; i < flow->tnode_cnt; i++)
+ kern_unprogram_rcv_group(flow, i);
+ /* To prevent double unprogramming */
+ flow->tnode_cnt = 0;
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->rarr_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+ dma_unmap_flow(flow);
+
+ hfi1_tid_rdma_reset_flow(flow);
+ req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
+
+ if (fqp == req->qp) {
+ __trigger_tid_waiter(fqp);
+ rvt_put_qp(fqp);
+ } else {
+ tid_rdma_schedule_tid_wakeup(fqp);
+ }
+
+ return 0;
+}
+
+/*
+ * This function is called to release all the tid entries for
+ * a request.
+ */
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
+ __must_hold(&req->qp->s_lock)
+{
+ /* Use memory barrier for proper ordering */
+ while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
+ if (hfi1_kern_exp_rcv_clear(req))
+ break;
+ }
+}
+
+/**
+ * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information
+ * @req - the tid rdma request to be cleaned
+ */
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
+{
+ kfree(req->flows);
+ req->flows = NULL;
+}
+
+/**
+ * __trdma_clean_swqe - clean up for large sized QPs
+ * @qp: the queue patch
+ * @wqe: the send wqe
+ */
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct hfi1_swqe_priv *p = wqe->priv;
+
+ hfi1_kern_exp_rcv_free_flows(&p->tid_req);
+}
+
+/*
+ * This can be called at QP create time or in the data path.
+ */
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+ gfp_t gfp)
+{
+ struct tid_rdma_flow *flows;
+ int i;
+
+ if (likely(req->flows))
+ return 0;
+ flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
+ req->rcd->numa_id);
+ if (!flows)
+ return -ENOMEM;
+ /* mini init */
+ for (i = 0; i < MAX_FLOWS; i++) {
+ flows[i].req = req;
+ flows[i].npagesets = 0;
+ flows[i].pagesets[0].mapped = 0;
+ }
+ req->flows = flows;
+ return 0;
+}
+
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+ struct tid_rdma_request *req)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ /*
+ * Initialize various TID RDMA request variables.
+ * These variables are "static", which is why they
+ * can be pre-initialized here before the WRs has
+ * even been submitted.
+ * However, non-NULL values for these variables do not
+ * imply that this WQE has been enabled for TID RDMA.
+ * Drivers should check the WQE's opcode to determine
+ * if a request is a TID RDMA one or not.
+ */
+ req->qp = qp;
+ req->rcd = qpriv->rcd;
+}
+
+u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = context;
+
+ return dd->verbs_dev.n_tidwait;
+}
+
+static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
+ u32 psn, u16 *fidx)
+{
+ u16 head, tail;
+ struct tid_rdma_flow *flow;
+
+ head = req->setup_head;
+ tail = req->clear_tail;
+ for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
+ tail = CIRC_NEXT(tail, MAX_FLOWS)) {
+ flow = &req->flows[tail];
+ if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 &&
+ cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) {
+ if (fidx)
+ *fidx = tail;
+ return flow;
+ }
+ }
+ return NULL;
+}
+
+static struct tid_rdma_flow *
+__find_flow_ranged(struct tid_rdma_request *req, u16 head, u16 tail,
+ u32 psn, u16 *fidx)
+{
+ for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
+ tail = CIRC_NEXT(tail, MAX_FLOWS)) {
+ struct tid_rdma_flow *flow = &req->flows[tail];
+ u32 spsn, lpsn;
+
+ spsn = full_flow_psn(flow, flow->flow_state.spsn);
+ lpsn = full_flow_psn(flow, flow->flow_state.lpsn);
+
+ if (cmp_psn(psn, spsn) >= 0 && cmp_psn(psn, lpsn) <= 0) {
+ if (fidx)
+ *fidx = tail;
+ return flow;
+ }
+ }
+ return NULL;
+}
+
+static struct tid_rdma_flow *find_flow(struct tid_rdma_request *req,
+ u32 psn, u16 *fidx)
+{
+ return __find_flow_ranged(req, req->setup_head, req->clear_tail, psn,
+ fidx);
+}
+
+/* TID RDMA READ functions */
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u32 *len)
+{
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
+ struct rvt_qp *qp = req->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_swqe_priv *wpriv = wqe->priv;
+ struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
+ struct tid_rdma_params *remote;
+ u32 req_len = 0;
+ void *req_addr = NULL;
+
+ /* This is the IB psn used to send the request */
+ *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
+ trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);
+
+ /* TID Entries for TID RDMA READ payload */
+ req_addr = &flow->tid_entry[flow->tid_idx];
+ req_len = sizeof(*flow->tid_entry) *
+ (flow->tidcnt - flow->tid_idx);
+
+ memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
+ wpriv->ss.sge.vaddr = req_addr;
+ wpriv->ss.sge.sge_length = req_len;
+ wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
+ /*
+ * We can safely zero these out. Since the first SGE covers the
+ * entire packet, nothing else should even look at the MR.
+ */
+ wpriv->ss.sge.mr = NULL;
+ wpriv->ss.sge.m = 0;
+ wpriv->ss.sge.n = 0;
+
+ wpriv->ss.sg_list = NULL;
+ wpriv->ss.total_len = wpriv->ss.sge.sge_length;
+ wpriv->ss.num_sge = 1;
+
+ /* Construct the TID RDMA READ REQ packet header */
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+ KDETH_RESET(rreq->kdeth0, KVER, 0x1);
+ KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
+ rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
+ req->cur_seg * req->seg_len + flow->sent);
+ rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
+ rreq->reth.length = cpu_to_be32(*len);
+ rreq->tid_flow_psn =
+ cpu_to_be32((flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) |
+ ((flow->flow_state.spsn + flow->pkt) &
+ HFI1_KDETH_BTH_SEQ_MASK));
+ rreq->tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+ rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 &= ~RVT_QPN_MASK;
+ *bth1 |= remote->qp;
+ *bth2 |= IB_BTH_REQ_ACK;
+ rcu_read_unlock();
+
+ /* We are done with this segment */
+ flow->sent += *len;
+ req->cur_seg++;
+ qp->s_state = TID_OP(READ_REQ);
+ req->ack_pending++;
+ req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
+ qpriv->pending_tid_r_segs++;
+ qp->s_num_rd_atomic++;
+
+ /* Set the TID RDMA READ request payload size */
+ *len = req_len;
+
+ return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
+}
+
+/*
+ * @len: contains the data length to read upon entry and the read request
+ * payload length upon exit.
+ */
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u32 *len)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = NULL;
+ u32 hdwords = 0;
+ bool last;
+ bool retry = true;
+ u32 npkts = rvt_div_round_up_mtu(qp, *len);
+
+ trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ /*
+ * Check sync conditions. Make sure that there are no pending
+ * segments before freeing the flow.
+ */
+sync_check:
+ if (req->state == TID_REQUEST_SYNC) {
+ if (qpriv->pending_tid_r_segs)
+ goto done;
+
+ hfi1_kern_clear_hw_flow(req->rcd, qp);
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ /*
+ * If the request for this segment is resent, the tid resources should
+ * have been allocated before. In this case, req->flow_idx should
+ * fall behind req->setup_head.
+ */
+ if (req->flow_idx == req->setup_head) {
+ retry = false;
+ if (req->state == TID_REQUEST_RESEND) {
+ /*
+ * This is the first new segment for a request whose
+ * earlier segments have been re-sent. We need to
+ * set up the sge pointer correctly.
+ */
+ restart_sge(&qp->s_sge, wqe, req->s_next_psn,
+ qp->pmtu);
+ req->isge = 0;
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ /*
+ * Check sync. The last PSN of each generation is reserved for
+ * RESYNC.
+ */
+ if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
+ req->state = TID_REQUEST_SYNC;
+ goto sync_check;
+ }
+
+ /* Allocate the flow if not yet */
+ if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
+ goto done;
+
+ /*
+ * The following call will advance req->setup_head after
+ * allocating the tid entries.
+ */
+ if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
+ req->state = TID_REQUEST_QUEUED;
+
+ /*
+ * We don't have resources for this segment. The QP has
+ * already been queued.
+ */
+ goto done;
+ }
+ }
+
+ /* req->flow_idx should only be one slot behind req->setup_head */
+ flow = &req->flows[req->flow_idx];
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->sent = 0;
+ if (!retry) {
+ /* Set the first and last IB PSN for the flow in use.*/
+ flow->flow_state.ib_spsn = req->s_next_psn;
+ flow->flow_state.ib_lpsn =
+ flow->flow_state.ib_spsn + flow->npkts - 1;
+ }
+
+ /* Calculate the next segment start psn.*/
+ req->s_next_psn += flow->npkts;
+
+ /* Build the packet header */
+ hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
+done:
+ return hdwords;
+}
+
+/*
+ * Validate and accept the TID RDMA READ request parameters.
+ * Return 0 if the request is accepted successfully;
+ * Return 1 otherwise.
+ */
+static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
+ struct rvt_ack_entry *e,
+ struct hfi1_packet *packet,
+ struct ib_other_headers *ohdr,
+ u32 bth0, u32 psn, u64 vaddr, u32 len)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 flow_psn, i, tidlen = 0, pktlen, tlen;
+
+ req = ack_to_tid_req(e);
+
+ /* Validate the payload first */
+ flow = &req->flows[req->setup_head];
+
+ /* payload length = packet length - (header length + ICRC length) */
+ pktlen = packet->tlen - (packet->hlen + 4);
+ if (pktlen > sizeof(flow->tid_entry))
+ return 1;
+ memcpy(flow->tid_entry, packet->ebuf, pktlen);
+ flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
+
+ /*
+ * Walk the TID_ENTRY list to make sure we have enough space for a
+ * complete segment. Also calculate the number of required packets.
+ */
+ flow->npkts = rvt_div_round_up_mtu(qp, len);
+ for (i = 0; i < flow->tidcnt; i++) {
+ trace_hfi1_tid_entry_rcv_read_req(qp, i,
+ flow->tid_entry[i]);
+ tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
+ if (!tlen)
+ return 1;
+
+ /*
+ * For tid pair (tidctr == 3), the buffer size of the pair
+ * should be the sum of the buffer size described by each
+ * tid entry. However, only the first entry needs to be
+ * specified in the request (see WFR HAS Section 8.5.7.1).
+ */
+ tidlen += tlen;
+ }
+ if (tidlen * PAGE_SIZE < len)
+ return 1;
+
+ /* Empty the flow array */
+ req->clear_tail = req->setup_head;
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ flow->sent = 0;
+ flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
+ flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
+ TID_RDMA_DESTQP_FLOW_MASK;
+ flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
+ flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
+ flow->length = len;
+
+ flow->flow_state.lpsn = flow->flow_state.spsn +
+ flow->npkts - 1;
+ flow->flow_state.ib_spsn = psn;
+ flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1;
+
+ trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow);
+ /* Set the initial flow index to the current flow. */
+ req->flow_idx = req->setup_head;
+
+ /* advance circular buffer head */
+ req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+
+ /*
+ * Compute last PSN for request.
+ */
+ e->opcode = (bth0 >> 24) & 0xff;
+ e->psn = psn;
+ e->lpsn = psn + flow->npkts - 1;
+ e->sent = 0;
+
+ req->n_flows = qpriv->tid_rdma.local.max_read;
+ req->state = TID_REQUEST_ACTIVE;
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_seg = 0;
+ req->isge = 0;
+ req->seg_len = qpriv->tid_rdma.local.max_len;
+ req->total_len = len;
+ req->total_segs = 1;
+ req->r_flow_psn = e->psn;
+
+ trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ return 0;
+}
+
+static int tid_rdma_rcv_error(struct hfi1_packet *packet,
+ struct ib_other_headers *ohdr,
+ struct rvt_qp *qp, u32 psn, int diff)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ unsigned long flags;
+ u8 prev;
+ bool old_req;
+
+ trace_hfi1_rsp_tid_rcv_error(qp, psn);
+ trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff);
+ if (diff > 0) {
+ /* sequence error */
+ if (!qp->r_nak_state) {
+ ibp->rvp.n_rc_seqnak++;
+ qp->r_nak_state = IB_NAK_PSN_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+ rc_defered_ack(rcd, qp);
+ }
+ goto done;
+ }
+
+ ibp->rvp.n_rc_dupreq++;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
+ if (!e || e->opcode != TID_OP(READ_REQ))
+ goto unlock;
+
+ req = ack_to_tid_req(e);
+ req->r_flow_psn = psn;
+ trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req);
+ if (e->opcode == TID_OP(READ_REQ)) {
+ struct ib_reth *reth;
+ u32 offset;
+ u32 len;
+ u32 rkey;
+ u64 vaddr;
+ int ok;
+ u32 bth0;
+
+ reth = &ohdr->u.tid_rdma.r_req.reth;
+ /*
+ * The requester always restarts from the start of the original
+ * request.
+ */
+ offset = delta_psn(psn, e->psn) * qp->pmtu;
+ len = be32_to_cpu(reth->length);
+ if (psn != e->psn || len != req->total_len)
+ goto unlock;
+
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+
+ rkey = be32_to_cpu(reth->rkey);
+ vaddr = get_ib_reth_vaddr(reth);
+
+ qp->r_len = len;
+ ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+ IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto unlock;
+
+ /*
+ * If all the response packets for the current request have
+ * been sent out and this request is complete (old_request
+ * == false) and the TID flow may be unusable (the
+ * req->clear_tail is advanced). However, when an earlier
+ * request is received, this request will not be complete any
+ * more (qp->s_tail_ack_queue is moved back, see below).
+ * Consequently, we need to update the TID flow info everytime
+ * a duplicate request is received.
+ */
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
+ vaddr, len))
+ goto unlock;
+
+ /*
+ * True if the request is already scheduled (between
+ * qp->s_tail_ack_queue and qp->r_head_ack_queue);
+ */
+ if (old_req)
+ goto unlock;
+ }
+ /* Re-process old requests.*/
+ qp->s_tail_ack_queue = prev;
+ /*
+ * Since the qp->s_tail_ack_queue is modified, the
+ * qp->s_ack_state must be changed to re-initialize
+ * qp->s_ack_rdma_sge; Otherwise, we will end up in
+ * wrong memory region.
+ */
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return 1;
+}
+
+void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/
+
+ /*
+ * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
+ * (see hfi1_rc_rcv())
+ * 2. Put TID RDMA READ REQ into the response queueu (s_ack_queue)
+ * - Setup struct tid_rdma_req with request info
+ * - Initialize struct tid_rdma_flow info;
+ * - Copy TID entries;
+ * 3. Set the qp->s_ack_state.
+ * 4. Set RVT_S_RESP_PENDING in s_flags.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ unsigned long flags;
+ struct ib_reth *reth;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u32 bth0, psn, len, rkey;
+ bool is_fecn;
+ u8 next;
+ u64 vaddr;
+ int diff;
+ u8 nack_state = IB_NAK_INVALID_REQUEST;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, packet))
+ return;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+ rvt_comm_est(qp);
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto nack_inv;
+
+ reth = &ohdr->u.tid_rdma.r_req.reth;
+ vaddr = be64_to_cpu(reth->vaddr);
+ len = be32_to_cpu(reth->length);
+ /* The length needs to be in multiples of PAGE_SIZE */
+ if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len)
+ goto nack_inv;
+
+ diff = delta_psn(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
+ return;
+ goto send_ack;
+ }
+
+ /* We've verified the request, insert it into the ack queue. */
+ next = qp->r_head_ack_queue + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent) {
+ nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+ goto nack_inv_unlock;
+ }
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+
+ rkey = be32_to_cpu(reth->rkey);
+ qp->r_len = len;
+
+ if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_READ)))
+ goto nack_acc;
+
+ /* Accept the request parameters */
+ if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
+ len))
+ goto nack_inv_unlock;
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn += e->lpsn - e->psn + 1;
+
+ qp->r_head_ack_queue = next;
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ goto send_ack;
+ return;
+
+nack_inv_unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = nack_state;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+nack_acc:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+
+u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth0,
+ u32 *bth1, u32 *bth2, u32 *len, bool *last)
+{
+ struct hfi1_ack_priv *epriv = e->priv;
+ struct tid_rdma_request *req = &epriv->tid_req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+ u32 tidentry = flow->tid_entry[flow->tid_idx];
+ u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
+ struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
+ u32 next_offset, om = KDETH_OM_LARGE;
+ bool last_pkt;
+ u32 hdwords = 0;
+ struct tid_rdma_params *remote;
+
+ *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
+ flow->sent += *len;
+ next_offset = flow->tid_offset + *len;
+ last_pkt = (flow->sent >= flow->length);
+
+ trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry);
+ trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow);
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ if (!remote) {
+ rcu_read_unlock();
+ goto done;
+ }
+ KDETH_RESET(resp->kdeth0, KVER, 0x1);
+ KDETH_SET(resp->kdeth0, SH, !last_pkt);
+ KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
+ KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
+ KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
+ KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
+ KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
+ KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
+ resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ rcu_read_unlock();
+
+ resp->aeth = rvt_compute_aeth(qp);
+ resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
+ flow->pkt));
+
+ *bth0 = TID_OP(READ_RESP) << 24;
+ *bth1 = flow->tid_qpn;
+ *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
+ HFI1_KDETH_BTH_SEQ_MASK) |
+ (flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT));
+ *last = last_pkt;
+ if (last_pkt)
+ /* Advance to next flow */
+ req->clear_tail = (req->clear_tail + 1) &
+ (MAX_FLOWS - 1);
+
+ if (next_offset >= tidlen) {
+ flow->tid_offset = 0;
+ flow->tid_idx++;
+ } else {
+ flow->tid_offset = next_offset;
+ }
+
+ hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
+
+done:
+ return hdwords;
+}
+
+static inline struct tid_rdma_request *
+find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode)
+ __must_hold(&qp->s_lock)
+{
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req = NULL;
+ u32 i, end;
+
+ end = qp->s_cur + 1;
+ if (end == qp->s_size)
+ end = 0;
+ for (i = qp->s_acked; i != end;) {
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (cmp_psn(psn, wqe->psn) >= 0 &&
+ cmp_psn(psn, wqe->lpsn) <= 0) {
+ if (wqe->wr.opcode == opcode)
+ req = wqe_to_tid_req(wqe);
+ break;
+ }
+ if (++i == qp->s_size)
+ i = 0;
+ }
+
+ return req;
+}
+
+void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */
+
+ /*
+ * 1. Find matching SWQE
+ * 2. Check that the entire segment has been read.
+ * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags.
+ * 4. Free the TID flow resources.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 opcode, aeth;
+ bool is_fecn;
+ unsigned long flags;
+ u32 kpsn, ipsn;
+
+ trace_hfi1_sender_rcv_tid_read_resp(qp);
+ is_fecn = process_ecn(qp, packet);
+ kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
+ req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ);
+ if (unlikely(!req))
+ goto ack_op_err;
+
+ flow = &req->flows[req->clear_tail];
+ /* When header suppression is disabled */
+ if (cmp_psn(ipsn, flow->flow_state.ib_lpsn))
+ goto ack_done;
+ req->ack_pending--;
+ priv->pending_tid_r_segs--;
+ qp->s_num_rd_atomic--;
+ if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
+ !qp->s_num_rd_atomic) {
+ qp->s_flags &= ~(RVT_S_WAIT_FENCE |
+ RVT_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ }
+ if (qp->s_flags & RVT_S_WAIT_RDMAR) {
+ qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ }
+
+ trace_hfi1_ack(qp, ipsn);
+ trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode,
+ req->e.swqe->psn, req->e.swqe->lpsn,
+ req);
+ trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow);
+
+ /* Release the tid resources */
+ hfi1_kern_exp_rcv_clear(req);
+
+ if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd))
+ goto ack_done;
+
+ /* If not done yet, build next read request */
+ if (++req->comp_seg >= req->total_segs) {
+ priv->tid_r_comp++;
+ req->state = TID_REQUEST_COMPLETE;
+ }
+
+ /*
+ * Clear the hw flow under two conditions:
+ * 1. This request is a sync point and it is complete;
+ * 2. Current request is completed and there are no more requests.
+ */
+ if ((req->state == TID_REQUEST_SYNC &&
+ req->comp_seg == req->cur_seg) ||
+ priv->tid_r_comp == priv->tid_r_reqs) {
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+ if (req->state == TID_REQUEST_SYNC)
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ hfi1_schedule_send(qp);
+ goto ack_done;
+
+ack_op_err:
+ /*
+ * The test indicates that the send engine has finished its cleanup
+ * after sending the request and it's now safe to put the QP into error
+ * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
+ * == qp->s_head), it would be unsafe to complete the wqe pointed by
+ * qp->s_acked here. Putting the qp into error state will safely flush
+ * all remaining requests.
+ */
+ if (qp->s_last == qp->s_acked)
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+
+void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ u32 n = qp->s_acked;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ /* Free any TID entries */
+ while (n != qp->s_tail) {
+ wqe = rvt_get_swqe_ptr(qp, n);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ req = wqe_to_tid_req(wqe);
+ hfi1_kern_exp_rcv_clear_all(req);
+ }
+
+ if (++n == qp->s_size)
+ n = 0;
+ }
+ /* Free flow */
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+}
+
+static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
+ struct hfi1_packet *packet, u8 rcv_type,
+ u8 opcode)
+{
+ struct rvt_qp *qp = packet->qp;
+ u32 ipsn;
+ struct ib_other_headers *ohdr = packet->ohdr;
+
+ if (rcv_type >= RHF_RCV_TYPE_IB)
+ goto done;
+
+ spin_lock(&qp->s_lock);
+ /*
+ * For TID READ response, error out QP after freeing the tid
+ * resources.
+ */
+ if (opcode == TID_OP(READ_RESP)) {
+ ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
+ if (cmp_psn(ipsn, qp->s_last_psn) > 0 &&
+ cmp_psn(ipsn, qp->s_psn) < 0) {
+ hfi1_kern_read_tid_flow_free(qp);
+ spin_unlock(&qp->s_lock);
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ goto done;
+ }
+ }
+
+ spin_unlock(&qp->s_lock);
+done:
+ return true;
+}
+
+static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
+ struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+
+ /* Start from the right segment */
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ req = wqe_to_tid_req(wqe);
+ flow = &req->flows[req->clear_tail];
+ hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+}
+
+/*
+ * Handle the KDETH eflags for TID RDMA READ response.
+ *
+ * Return true if the last packet for a segment has been received and it is
+ * time to process the response normally; otherwise, return true.
+ *
+ * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
+ */
+static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+ struct hfi1_packet *packet, u8 rcv_type,
+ u8 rte, u32 psn, u32 ibpsn)
+ __must_hold(&packet->qp->r_lock) __must_hold(RCU)
+{
+ struct hfi1_pportdata *ppd = rcd->ppd;
+ struct hfi1_devdata *dd = ppd->dd;
+ struct hfi1_ibport *ibp;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 ack_psn;
+ struct rvt_qp *qp = packet->qp;
struct hfi1_qp_priv *priv = qp->priv;
+ bool ret = true;
+ int diff = 0;
+ u32 fpsn;
+
+ lockdep_assert_held(&qp->r_lock);
+ /* If the psn is out of valid range, drop the packet */
+ if (cmp_psn(ibpsn, qp->s_last_psn) < 0 ||
+ cmp_psn(ibpsn, qp->s_psn) > 0)
+ return ret;
+
+ spin_lock(&qp->s_lock);
+ /*
+ * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+ * requests and implicitly NAK RDMA read and atomic requests issued
+ * before the NAK'ed request.
+ */
+ ack_psn = ibpsn - 1;
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+ /* Complete WQEs that the PSN finishes. */
+ while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) {
+ /*
+ * If this request is a RDMA read or atomic, and the NACK is
+ * for a later operation, this NACK NAKs the RDMA read or
+ * atomic.
+ */
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ /* Retry this request. */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ restart_tid_rdma_read_req(rcd, qp,
+ wqe);
+ } else {
+ hfi1_restart_rc(qp, qp->s_last_psn + 1,
+ 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(/* wait */
+ &qp->rspwait,
+ &rcd->qp_wait_list);
+ }
+ }
+ }
+ /*
+ * No need to process the NAK since we are
+ * restarting an earlier request.
+ */
+ break;
+ }
+
+ wqe = do_rc_completion(qp, wqe, ibp);
+ if (qp->s_acked == qp->s_tail)
+ break;
+ }
+
+ /* Handle the eflags for the request */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+ goto s_unlock;
+
+ req = wqe_to_tid_req(wqe);
+ switch (rcv_type) {
+ case RHF_RCV_TYPE_EXPECTED:
+ switch (rte) {
+ case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
+ /*
+ * On the first occurrence of a Flow Sequence error,
+ * the flag TID_FLOW_SW_PSN is set.
+ *
+ * After that, the flow is *not* reprogrammed and the
+ * protocol falls back to SW PSN checking. This is done
+ * to prevent continuous Flow Sequence errors for any
+ * packets that could be still in the fabric.
+ */
+ flow = find_flow(req, psn, NULL);
+ if (!flow) {
+ /*
+ * We can't find the IB PSN matching the
+ * received KDETH PSN. The only thing we can
+ * do at this point is report the error to
+ * the QP.
+ */
+ hfi1_kern_read_tid_flow_free(qp);
+ spin_unlock(&qp->s_lock);
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ return ret;
+ }
+ if (priv->flow_state.flags & TID_FLOW_SW_PSN) {
+ diff = cmp_psn(psn,
+ priv->flow_state.r_next_psn);
+ if (diff > 0) {
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
+ restart_tid_rdma_read_req(rcd,
+ qp,
+ wqe);
+
+ /* Drop the packet.*/
+ goto s_unlock;
+ } else if (diff < 0) {
+ /*
+ * If a response packet for a restarted
+ * request has come back, reset the
+ * restart flag.
+ */
+ if (qp->r_flags & RVT_R_RDMAR_SEQ)
+ qp->r_flags &=
+ ~RVT_R_RDMAR_SEQ;
- if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA))
- cancel_work_sync(&priv->opfn.opfn_work);
+ /* Drop the packet.*/
+ goto s_unlock;
+ }
+
+ /*
+ * If SW PSN verification is successful and
+ * this is the last packet in the segment, tell
+ * the caller to process it as a normal packet.
+ */
+ fpsn = full_flow_psn(flow,
+ flow->flow_state.lpsn);
+ if (cmp_psn(fpsn, psn) == 0) {
+ ret = false;
+ if (qp->r_flags & RVT_R_RDMAR_SEQ)
+ qp->r_flags &=
+ ~RVT_R_RDMAR_SEQ;
+ }
+ priv->flow_state.r_next_psn++;
+ } else {
+ u64 reg;
+ u32 last_psn;
+
+ /*
+ * The only sane way to get the amount of
+ * progress is to read the HW flow state.
+ */
+ reg = read_uctxt_csr(dd, rcd->ctxt,
+ RCV_TID_FLOW_TABLE +
+ (8 * flow->idx));
+ last_psn = mask_psn(reg);
+
+ priv->flow_state.r_next_psn = last_psn;
+ priv->flow_state.flags |= TID_FLOW_SW_PSN;
+ /*
+ * If no request has been restarted yet,
+ * restart the current one.
+ */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
+ restart_tid_rdma_read_req(rcd, qp,
+ wqe);
+ }
+
+ break;
+
+ case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
+ /*
+ * Since the TID flow is able to ride through
+ * generation mismatch, drop this stale packet.
+ */
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ case RHF_RCV_TYPE_ERROR:
+ switch (rte) {
+ case RHF_RTE_ERROR_OP_CODE_ERR:
+ case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
+ case RHF_RTE_ERROR_KHDR_HCRC_ERR:
+ case RHF_RTE_ERROR_KHDR_KVER_ERR:
+ case RHF_RTE_ERROR_CONTEXT_ERR:
+ case RHF_RTE_ERROR_KHDR_TID_ERR:
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+s_unlock:
+ spin_unlock(&qp->s_lock);
+ return ret;
+}
+
+bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+ struct hfi1_pportdata *ppd,
+ struct hfi1_packet *packet)
+{
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ struct hfi1_devdata *dd = ppd->dd;
+ struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+ u8 rcv_type = rhf_rcv_type(packet->rhf);
+ u8 rte = rhf_rcv_type_err(packet->rhf);
+ struct ib_header *hdr = packet->hdr;
+ struct ib_other_headers *ohdr = NULL;
+ int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ u16 lid = be16_to_cpu(hdr->lrh[1]);
+ u8 opcode;
+ u32 qp_num, psn, ibpsn;
+ struct rvt_qp *qp;
+ unsigned long flags;
+ bool ret = true;
+
+ trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
+ packet->rhf);
+ if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+ return ret;
+
+ packet->ohdr = &hdr->u.oth;
+ ohdr = packet->ohdr;
+ trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) &
+ RVT_QPN_MASK;
+ if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
+ goto drop;
+
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ rcu_read_lock();
+ qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+ if (!qp)
+ goto rcu_unlock;
+
+ packet->qp = qp;
+
+ /* Check for valid receive state. */
+ spin_lock_irqsave(&qp->r_lock, flags);
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ ibp->rvp.n_pkt_drops++;
+ goto r_unlock;
+ }
+
+ if (packet->rhf & RHF_TID_ERR) {
+ /* For TIDERR and RC QPs preemptively schedule a NAK */
+ u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+
+ /* Sanity check packet */
+ if (tlen < 24)
+ goto r_unlock;
+
+ /*
+ * Check for GRH. We should never get packets with GRH in this
+ * path.
+ */
+ if (lnh == HFI1_LRH_GRH)
+ goto r_unlock;
+
+ if (tid_rdma_tid_err(rcd, packet, rcv_type, opcode))
+ goto r_unlock;
+ }
+
+ /* handle TID RDMA READ */
+ if (opcode == TID_OP(READ_RESP)) {
+ ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn);
+ ibpsn = mask_psn(ibpsn);
+ ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
+ ibpsn);
+ }
+
+r_unlock:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+rcu_unlock:
+ rcu_read_unlock();
+drop:
+ return ret;
+}
+
+/*
+ * "Rewind" the TID request information.
+ * This means that we reset the state back to ACTIVE,
+ * find the proper flow, set the flow index to that flow,
+ * and reset the flow information.
+ */
+void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ u32 *bth2)
+{
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow;
+ int diff;
+ u32 tididx = 0;
+ u16 fidx;
+
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ *bth2 = mask_psn(qp->s_psn);
+ flow = find_flow_ib(req, *bth2, &fidx);
+ if (!flow) {
+ trace_hfi1_msg_tid_restart_req(/* msg */
+ qp, "!!!!!! Could not find flow to restart: bth2 ",
+ (u64)*bth2);
+ trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ return;
+ }
+ } else {
+ return;
+ }
+
+ trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
+ diff = delta_psn(*bth2, flow->flow_state.ib_spsn);
+
+ flow->sent = 0;
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ if (diff) {
+ for (tididx = 0; tididx < flow->tidcnt; tididx++) {
+ u32 tidentry = flow->tid_entry[tididx], tidlen,
+ tidnpkts, npkts;
+
+ flow->tid_offset = 0;
+ tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE;
+ tidnpkts = rvt_div_round_up_mtu(qp, tidlen);
+ npkts = min_t(u32, diff, tidnpkts);
+ flow->pkt += npkts;
+ flow->sent += (npkts == tidnpkts ? tidlen :
+ npkts * qp->pmtu);
+ flow->tid_offset += npkts * qp->pmtu;
+ diff -= npkts;
+ if (!diff)
+ break;
+ }
+ }
+
+ if (flow->tid_offset ==
+ EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
+ tididx++;
+ flow->tid_offset = 0;
+ }
+ flow->tid_idx = tididx;
+ /* Move flow_idx to correct index */
+ req->flow_idx = fidx;
+
+ trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
+ trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ req->state = TID_REQUEST_ACTIVE;
+}
+
+void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
+{
+ int i, ret;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_flow_state *fs;
+
+ if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA))
+ return;
+
+ /*
+ * First, clear the flow to help prevent any delayed packets from
+ * being delivered.
+ */
+ fs = &qpriv->flow_state;
+ if (fs->index != RXE_NUM_TID_FLOWS)
+ hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+
+ for (i = qp->s_acked; i != qp->s_head;) {
+ struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
+
+ if (++i == qp->s_size)
+ i = 0;
+ /* Free only locally allocated TID entries */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+ continue;
+ do {
+ struct hfi1_swqe_priv *priv = wqe->priv;
+
+ ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
+ } while (!ret);
+ }
+}
+
+bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct rvt_swqe *prev;
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 s_prev;
+
+ s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
+ prev = rvt_get_swqe_ptr(qp, s_prev);
+
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ case IB_WR_SEND_WITH_INV:
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_READ:
+ break;
+ case IB_WR_TID_RDMA_READ:
+ switch (prev->wr.opcode) {
+ case IB_WR_RDMA_READ:
+ if (qp->s_acked != qp->s_cur)
+ goto interlock;
+ break;
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+ return false;
+
+interlock:
+ priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK;
+ return true;
+}
+
+/* Does @sge meet the alignment requirements for tid rdma? */
+static inline bool hfi1_check_sge_align(struct rvt_qp *qp,
+ struct rvt_sge *sge, int num_sge)
+{
+ int i;
+
+ for (i = 0; i < num_sge; i++, sge++) {
+ trace_hfi1_sge_check_align(qp, i, sge);
+ if ((u64)sge->vaddr & ~PAGE_MASK ||
+ sge->sge_length & ~PAGE_MASK)
+ return false;
+ }
+ return true;
+}
+
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+ struct hfi1_swqe_priv *priv = wqe->priv;
+ struct tid_rdma_params *remote;
+ enum ib_wr_opcode new_opcode;
+ bool do_tid_rdma = false;
+ struct hfi1_pportdata *ppd = qpriv->rcd->ppd;
+
+ if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) ==
+ ppd->lid)
+ return;
+ if (qpriv->hdr_type != HFI1_PKT_TYPE_9B)
+ return;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ /*
+ * If TID RDMA is disabled by the negotiation, don't
+ * use it.
+ */
+ if (!remote)
+ goto exit;
+
+ if (wqe->wr.opcode == IB_WR_RDMA_READ) {
+ if (hfi1_check_sge_align(qp, &wqe->sg_list[0],
+ wqe->wr.num_sge)) {
+ new_opcode = IB_WR_TID_RDMA_READ;
+ do_tid_rdma = true;
+ }
+ }
+
+ if (do_tid_rdma) {
+ if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC))
+ goto exit;
+ wqe->wr.opcode = new_opcode;
+ priv->tid_req.seg_len =
+ min_t(u32, remote->max_len, wqe->length);
+ priv->tid_req.total_segs =
+ DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
+ /* Compute the last PSN of the request */
+ wqe->lpsn = wqe->psn;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ priv->tid_req.n_flows = remote->max_read;
+ qpriv->tid_r_reqs++;
+ wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
+ }
+
+ priv->tid_req.cur_seg = 0;
+ priv->tid_req.comp_seg = 0;
+ priv->tid_req.ack_seg = 0;
+ priv->tid_req.state = TID_REQUEST_INACTIVE;
+ trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ &priv->tid_req);
+ }
+exit:
+ rcu_read_unlock();
}
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index ee8151558e3f..a53598ce45b2 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -6,7 +6,27 @@
#ifndef HFI1_TID_RDMA_H
#define HFI1_TID_RDMA_H
+#include <linux/circ_buf.h>
+#include "common.h"
+
+/* Add a convenience helper */
+#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1))
+#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size)
+#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size)
+
+#define TID_RDMA_MIN_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */
#define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */
+#define TID_RDMA_MAX_PAGES (BIT(18) >> PAGE_SHIFT)
+
+/*
+ * Bit definitions for priv->s_flags.
+ * These bit flags overload the bit flags defined for the QP's s_flags.
+ * Due to the fact that these bit fields are used only for the QP priv
+ * s_flags, there are no collisions.
+ *
+ * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock
+ */
+#define HFI1_S_TID_WAIT_INTERLCK BIT(5)
struct tid_rdma_params {
struct rcu_head rcu_head;
@@ -21,10 +41,128 @@ struct tid_rdma_params {
};
struct tid_rdma_qp_params {
+ struct work_struct trigger_work;
struct tid_rdma_params local;
struct tid_rdma_params __rcu *remote;
};
+/* Track state for each hardware flow */
+struct tid_flow_state {
+ u32 generation;
+ u32 psn;
+ u32 r_next_psn; /* next PSN to be received (in TID space) */
+ u8 index;
+ u8 last_index;
+ u8 flags;
+};
+
+enum tid_rdma_req_state {
+ TID_REQUEST_INACTIVE = 0,
+ TID_REQUEST_INIT,
+ TID_REQUEST_INIT_RESEND,
+ TID_REQUEST_ACTIVE,
+ TID_REQUEST_RESEND,
+ TID_REQUEST_RESEND_ACTIVE,
+ TID_REQUEST_QUEUED,
+ TID_REQUEST_SYNC,
+ TID_REQUEST_RNR_NAK,
+ TID_REQUEST_COMPLETE,
+};
+
+struct tid_rdma_request {
+ struct rvt_qp *qp;
+ struct hfi1_ctxtdata *rcd;
+ union {
+ struct rvt_swqe *swqe;
+ struct rvt_ack_entry *ack;
+ } e;
+
+ struct tid_rdma_flow *flows; /* array of tid flows */
+ u16 n_flows; /* size of the flow buffer window */
+ u16 setup_head; /* flow index we are setting up */
+ u16 clear_tail; /* flow index we are clearing */
+ u16 flow_idx; /* flow index most recently set up */
+
+ u32 seg_len;
+ u32 total_len;
+ u32 r_flow_psn; /* IB PSN of next segment start */
+ u32 s_next_psn; /* IB PSN of next segment start for read */
+
+ u32 total_segs; /* segments required to complete a request */
+ u32 cur_seg; /* index of current segment */
+ u32 comp_seg; /* index of last completed segment */
+ u32 ack_seg; /* index of last ack'ed segment */
+ u32 isge; /* index of "current" sge */
+ u32 ack_pending; /* num acks pending for this request */
+
+ enum tid_rdma_req_state state;
+};
+
+/*
+ * When header suppression is used, PSNs associated with a "flow" are
+ * relevant (and not the PSNs maintained by verbs). Track per-flow
+ * PSNs here for a TID RDMA segment.
+ *
+ */
+struct flow_state {
+ u32 flags;
+ u32 resp_ib_psn; /* The IB PSN of the response for this flow */
+ u32 generation; /* generation of flow */
+ u32 spsn; /* starting PSN in TID space */
+ u32 lpsn; /* last PSN in TID space */
+ u32 r_next_psn; /* next PSN to be received (in TID space) */
+
+ /* For tid rdma read */
+ u32 ib_spsn; /* starting PSN in Verbs space */
+ u32 ib_lpsn; /* last PSn in Verbs space */
+};
+
+struct tid_rdma_pageset {
+ dma_addr_t addr : 48; /* Only needed for the first page */
+ u8 idx: 8;
+ u8 count : 7;
+ u8 mapped: 1;
+};
+
+/**
+ * kern_tid_node - used for managing TID's in TID groups
+ *
+ * @grp_idx: rcd relative index to tid_group
+ * @map: grp->map captured prior to programming this TID group in HW
+ * @cnt: Only @cnt of available group entries are actually programmed
+ */
+struct kern_tid_node {
+ struct tid_group *grp;
+ u8 map;
+ u8 cnt;
+};
+
+/* Overall info for a TID RDMA segment */
+struct tid_rdma_flow {
+ /*
+ * While a TID RDMA segment is being transferred, it uses a QP number
+ * from the "KDETH section of QP numbers" (which is different from the
+ * QP number that originated the request). Bits 11-15 of these QP
+ * numbers identify the "TID flow" for the segment.
+ */
+ struct flow_state flow_state;
+ struct tid_rdma_request *req;
+ u32 tid_qpn;
+ u32 tid_offset;
+ u32 length;
+ u32 sent;
+ u8 tnode_cnt;
+ u8 tidcnt;
+ u8 tid_idx;
+ u8 idx;
+ u8 npagesets;
+ u8 npkts;
+ u8 pkt;
+ struct kern_tid_node tnode[TID_RDMA_MAX_PAGES];
+ struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES];
+ u32 tid_entry[TID_RDMA_MAX_PAGES];
+};
+
bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data);
bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data);
bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data);
@@ -32,9 +170,67 @@ void tid_rdma_conn_error(struct rvt_qp *qp);
void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p);
int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit);
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+ struct rvt_sge_state *ss, bool *last);
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req);
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req);
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+/**
+ * trdma_clean_swqe - clean flows for swqe if large send queue
+ * @qp: the qp
+ * @wqe: the send wqe
+ */
+static inline void trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ if (!wqe->priv)
+ return;
+ __trdma_clean_swqe(qp, wqe);
+}
+
+void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp);
int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
struct ib_qp_init_attr *init_attr);
void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
+void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp);
+
+int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp);
+void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp);
+void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd);
+
+struct cntr_entry;
+u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data);
+
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len);
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u32 *len);
+void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet);
+u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth0,
+ u32 *bth1, u32 *bth2, u32 *len, bool *last);
+void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet);
+bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+ struct hfi1_pportdata *ppd,
+ struct hfi1_packet *packet);
+void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ u32 *bth2);
+void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp);
+bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp,
+ struct rvt_swqe *wqe)
+{
+ if (wqe->priv &&
+ wqe->wr.opcode == IB_WR_RDMA_READ &&
+ wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE)
+ setup_tid_rdma_wqe(qp, wqe);
+}
+
#endif /* HFI1_TID_RDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c
index 7c8aed0ffc07..28181d711fed 100644
--- a/drivers/infiniband/hw/hfi1/trace.c
+++ b/drivers/infiniband/hw/hfi1/trace.c
@@ -46,6 +46,7 @@
*/
#define CREATE_TRACE_POINTS
#include "trace.h"
+#include "exp_rcv.h"
static u8 __get_ib_hdr_len(struct ib_header *hdr)
{
@@ -128,6 +129,10 @@ const char *hfi1_trace_get_packet_l2_str(u8 l2)
#define IETH_PRN "ieth rkey:0x%.8x"
#define ATOMICACKETH_PRN "origdata:%llx"
#define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx"
+#define TID_RDMA_KDETH "kdeth0 0x%x kdeth1 0x%x"
+#define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x"
+#define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_READ_RSP_PRN "verbs_qp 0x%x"
#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
@@ -322,6 +327,38 @@ const char *parse_everbs_hdrs(
parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
be32_to_cpu(eh->aeth) & IB_MSN_MASK);
break;
+ case OP(TID_RDMA, READ_REQ):
+ trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
+ TID_READ_REQ_PRN,
+ le32_to_cpu(eh->tid_rdma.r_req.kdeth0),
+ le32_to_cpu(eh->tid_rdma.r_req.kdeth1),
+ ib_u64_get(&eh->tid_rdma.r_req.reth.vaddr),
+ be32_to_cpu(eh->tid_rdma.r_req.reth.rkey),
+ be32_to_cpu(eh->tid_rdma.r_req.reth.length),
+ be32_to_cpu(eh->tid_rdma.r_req.tid_flow_psn),
+ be32_to_cpu(eh->tid_rdma.r_req.tid_flow_qp),
+ be32_to_cpu(eh->tid_rdma.r_req.verbs_qp));
+ break;
+ case OP(TID_RDMA, READ_RESP):
+ trace_seq_printf(p, TID_RDMA_KDETH_DATA " " AETH_PRN " "
+ TID_READ_RSP_PRN,
+ le32_to_cpu(eh->tid_rdma.r_rsp.kdeth0),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, KVER),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, SH),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, INTR),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TIDCTRL),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TID),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, OFFSET),
+ le32_to_cpu(eh->tid_rdma.r_rsp.kdeth1),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth1, JKEY),
+ be32_to_cpu(eh->tid_rdma.r_rsp.aeth) >> 24,
+ parse_syndrome(/* aeth */
+ be32_to_cpu(eh->tid_rdma.r_rsp.aeth)
+ >> 24),
+ (be32_to_cpu(eh->tid_rdma.r_rsp.aeth) &
+ IB_MSN_MASK),
+ be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp));
+ break;
/* aeth + atomicacketh */
case OP(RC, ATOMIC_ACKNOWLEDGE):
trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
@@ -394,6 +431,21 @@ const char *print_u32_array(
return ret;
}
+u8 hfi1_trace_get_tid_ctrl(u32 ent)
+{
+ return EXP_TID_GET(ent, CTRL);
+}
+
+u16 hfi1_trace_get_tid_len(u32 ent)
+{
+ return EXP_TID_GET(ent, LEN);
+}
+
+u16 hfi1_trace_get_tid_idx(u32 ent)
+{
+ return EXP_TID_GET(ent, IDX);
+}
+
__hfi1_trace_fn(AFFINITY);
__hfi1_trace_fn(PKT);
__hfi1_trace_fn(PROC);
diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
index 1dc2c28fc96e..1116238bf24d 100644
--- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
+++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
@@ -79,6 +79,8 @@ __print_symbolic(opcode, \
ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \
ib_opcode_name(RC_COMPARE_SWAP), \
ib_opcode_name(RC_FETCH_ADD), \
+ ib_opcode_name(TID_RDMA_READ_REQ), \
+ ib_opcode_name(TID_RDMA_READ_RESP), \
ib_opcode_name(UC_SEND_FIRST), \
ib_opcode_name(UC_SEND_MIDDLE), \
ib_opcode_name(UC_SEND_LAST), \
diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h
index 8ce476570462..1ebca37862e0 100644
--- a/drivers/infiniband/hw/hfi1/trace_rc.h
+++ b/drivers/infiniband/hw/hfi1/trace_rc.h
@@ -109,6 +109,54 @@ DEFINE_EVENT(hfi1_rc_template, hfi1_rcv_error,
TP_ARGS(qp, psn)
);
+DEFINE_EVENT(/* event */
+ hfi1_rc_template, hfi1_rc_completion,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DECLARE_EVENT_CLASS(/* rc_ack */
+ hfi1_rc_ack_template,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+ struct rvt_swqe *wqe),
+ TP_ARGS(qp, aeth, psn, wqe),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, aeth)
+ __field(u32, psn)
+ __field(u8, opcode)
+ __field(u32, spsn)
+ __field(u32, lpsn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->aeth = aeth;
+ __entry->psn = psn;
+ __entry->opcode = wqe->wr.opcode;
+ __entry->spsn = wqe->psn;
+ __entry->lpsn = wqe->lpsn;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x aeth 0x%x psn 0x%x opcode 0x%x spsn 0x%x lpsn 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->aeth,
+ __entry->psn,
+ __entry->opcode,
+ __entry->spsn,
+ __entry->lpsn
+ )
+);
+
+DEFINE_EVENT(/* do_rc_ack */
+ hfi1_rc_ack_template, hfi1_rc_ack_do,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+ struct rvt_swqe *wqe),
+ TP_ARGS(qp, aeth, psn, wqe)
+);
+
#endif /* __HFI1_TRACE_RC_H */
#undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h
index 57a973c97cde..b71638c22d4b 100644
--- a/drivers/infiniband/hw/hfi1/trace_tid.h
+++ b/drivers/infiniband/hw/hfi1/trace_tid.h
@@ -21,10 +21,51 @@ __print_symbolic(type, \
#undef TRACE_SYSTEM
#define TRACE_SYSTEM hfi1_tid
+u8 hfi1_trace_get_tid_ctrl(u32 ent);
+u16 hfi1_trace_get_tid_len(u32 ent);
+u16 hfi1_trace_get_tid_idx(u32 ent);
+
#define OPFN_PARAM_PRN "[%s] qpn 0x%x %s OPFN: qp 0x%x, max read %u, " \
"max write %u, max length %u, jkey 0x%x timeout %u " \
"urg %u"
+#define TID_FLOW_PRN "[%s] qpn 0x%x flow %d: idx %d resp_ib_psn 0x%x " \
+ "generation 0x%x fpsn 0x%x-%x r_next_psn 0x%x " \
+ "ib_psn 0x%x-%x npagesets %u tnode_cnt %u " \
+ "tidcnt %u tid_idx %u tid_offset %u length %u sent %u"
+
+#define TID_NODE_PRN "[%s] qpn 0x%x %s idx %u grp base 0x%x map 0x%x " \
+ "used %u cnt %u"
+
+#define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \
+ "r_psn 0x%x r_state 0x%x r_flags 0x%x " \
+ "r_head_ack_queue %u s_tail_ack_queue %u " \
+ "s_ack_state 0x%x " \
+ "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \
+ "iow_flags 0x%lx"
+
+#define SENDER_INFO_PRN "[%s] qpn 0x%x state 0x%x s_cur %u s_tail %u " \
+ "s_head %u s_acked %u s_last %u s_psn 0x%x " \
+ "s_last_psn 0x%x s_flags 0x%x ps_flags 0x%x " \
+ "iow_flags 0x%lx s_state 0x%x s_num_rd %u s_retry %u"
+
+#define TID_READ_SENDER_PRN "[%s] qpn 0x%x newreq %u tid_r_reqs %u " \
+ "tid_r_comp %u pending_tid_r_segs %u " \
+ "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \
+ "hw_flow_index %u generation 0x%x " \
+ "fpsn 0x%x flow_flags 0x%x"
+
+#define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \
+ "cur_seg %u comp_seg %u ack_seg %u " \
+ "total_segs %u setup_head %u clear_tail %u flow_idx %u " \
+ "state %u r_flow_psn 0x%x " \
+ "s_next_psn 0x%x"
+
+#define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \
+ "s_tail_ack_queue %u " \
+ "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \
+ " diff %d"
+
DECLARE_EVENT_CLASS(/* class */
hfi1_exp_tid_reg_unreg,
TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
@@ -323,6 +364,723 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, msg, more)
);
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_alloc_tids,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_tid_restart_req,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DECLARE_EVENT_CLASS(/* tid_flow_page */
+ hfi1_tid_flow_page_template,
+ TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index,
+ char mtu8k, char v1, void *vaddr),
+ TP_ARGS(qp, flow, index, mtu8k, v1, vaddr),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, mtu8k)
+ __field(char, v1)
+ __field(u32, index)
+ __field(u64, page)
+ __field(u64, vaddr)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->mtu8k = mtu8k;
+ __entry->v1 = v1;
+ __entry->index = index;
+ __entry->page = vaddr ? (u64)virt_to_page(vaddr) : 0ULL;
+ __entry->vaddr = (u64)vaddr;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x page[%u]: page 0x%llx %s 0x%llx",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->index,
+ __entry->page,
+ __entry->mtu8k ? (__entry->v1 ? "v1" : "v0") : "vaddr",
+ __entry->vaddr
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_page_template, hfi1_tid_flow_page,
+ TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index,
+ char mtu8k, char v1, void *vaddr),
+ TP_ARGS(qp, flow, index, mtu8k, v1, vaddr)
+);
+
+DECLARE_EVENT_CLASS(/* tid_pageset */
+ hfi1_tid_pageset_template,
+ TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count),
+ TP_ARGS(qp, index, idx, count),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, index)
+ __field(u16, idx)
+ __field(u16, count)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->index = index;
+ __entry->idx = idx;
+ __entry->count = count;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x list[%u]: idx %u count %u",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->index,
+ __entry->idx,
+ __entry->count
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_pageset_template, hfi1_tid_pageset,
+ TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count),
+ TP_ARGS(qp, index, idx, count)
+);
+
+DECLARE_EVENT_CLASS(/* tid_fow */
+ hfi1_tid_flow_template,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(int, index)
+ __field(int, idx)
+ __field(u32, resp_ib_psn)
+ __field(u32, generation)
+ __field(u32, fspsn)
+ __field(u32, flpsn)
+ __field(u32, r_next_psn)
+ __field(u32, ib_spsn)
+ __field(u32, ib_lpsn)
+ __field(u32, npagesets)
+ __field(u32, tnode_cnt)
+ __field(u32, tidcnt)
+ __field(u32, tid_idx)
+ __field(u32, tid_offset)
+ __field(u32, length)
+ __field(u32, sent)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->index = index;
+ __entry->idx = flow->idx;
+ __entry->resp_ib_psn = flow->flow_state.resp_ib_psn;
+ __entry->generation = flow->flow_state.generation;
+ __entry->fspsn = full_flow_psn(flow,
+ flow->flow_state.spsn);
+ __entry->flpsn = full_flow_psn(flow,
+ flow->flow_state.lpsn);
+ __entry->r_next_psn = flow->flow_state.r_next_psn;
+ __entry->ib_spsn = flow->flow_state.ib_spsn;
+ __entry->ib_lpsn = flow->flow_state.ib_lpsn;
+ __entry->npagesets = flow->npagesets;
+ __entry->tnode_cnt = flow->tnode_cnt;
+ __entry->tidcnt = flow->tidcnt;
+ __entry->tid_idx = flow->tid_idx;
+ __entry->tid_offset = flow->tid_offset;
+ __entry->length = flow->length;
+ __entry->sent = flow->sent;
+ ),
+ TP_printk(/* print */
+ TID_FLOW_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->index,
+ __entry->idx,
+ __entry->resp_ib_psn,
+ __entry->generation,
+ __entry->fspsn,
+ __entry->flpsn,
+ __entry->r_next_psn,
+ __entry->ib_spsn,
+ __entry->ib_lpsn,
+ __entry->npagesets,
+ __entry->tnode_cnt,
+ __entry->tidcnt,
+ __entry->tid_idx,
+ __entry->tid_offset,
+ __entry->length,
+ __entry->sent
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_alloc,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_build_read_pkt,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_build_read_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_req,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_restart_req,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DECLARE_EVENT_CLASS(/* tid_node */
+ hfi1_tid_node_template,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base,
+ u8 map, u8 used, u8 cnt),
+ TP_ARGS(qp, msg, index, base, map, used, cnt),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __string(msg, msg)
+ __field(u32, index)
+ __field(u32, base)
+ __field(u8, map)
+ __field(u8, used)
+ __field(u8, cnt)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __assign_str(msg, msg);
+ __entry->index = index;
+ __entry->base = base;
+ __entry->map = map;
+ __entry->used = used;
+ __entry->cnt = cnt;
+ ),
+ TP_printk(/* print */
+ TID_NODE_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __get_str(msg),
+ __entry->index,
+ __entry->base,
+ __entry->map,
+ __entry->used,
+ __entry->cnt
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_node_template, hfi1_tid_node_add,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base,
+ u8 map, u8 used, u8 cnt),
+ TP_ARGS(qp, msg, index, base, map, used, cnt)
+);
+
+DECLARE_EVENT_CLASS(/* tid_entry */
+ hfi1_tid_entry_template,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+ TP_ARGS(qp, index, ent),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(int, index)
+ __field(u8, ctrl)
+ __field(u16, idx)
+ __field(u16, len)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->index = index;
+ __entry->ctrl = hfi1_trace_get_tid_ctrl(ent);
+ __entry->idx = hfi1_trace_get_tid_idx(ent);
+ __entry->len = hfi1_trace_get_tid_len(ent);
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x TID entry %d: idx %u len %u ctrl 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->index,
+ __entry->idx,
+ __entry->len,
+ __entry->ctrl
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_entry_template, hfi1_tid_entry_alloc,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 entry),
+ TP_ARGS(qp, index, entry)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_entry_template, hfi1_tid_entry_build_read_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+ TP_ARGS(qp, index, ent)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_entry_template, hfi1_tid_entry_rcv_read_req,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+ TP_ARGS(qp, index, ent)
+);
+
+DECLARE_EVENT_CLASS(/* rsp_info */
+ hfi1_responder_info_template,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u8, state)
+ __field(u8, s_state)
+ __field(u32, psn)
+ __field(u32, r_psn)
+ __field(u8, r_state)
+ __field(u8, r_flags)
+ __field(u8, r_head_ack_queue)
+ __field(u8, s_tail_ack_queue)
+ __field(u8, s_ack_state)
+ __field(u8, s_nak_state)
+ __field(u8, r_nak_state)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->state = qp->state;
+ __entry->s_state = qp->s_state;
+ __entry->psn = psn;
+ __entry->r_psn = qp->r_psn;
+ __entry->r_state = qp->r_state;
+ __entry->r_flags = qp->r_flags;
+ __entry->r_head_ack_queue = qp->r_head_ack_queue;
+ __entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+ __entry->s_ack_state = qp->s_ack_state;
+ __entry->s_nak_state = qp->s_nak_state;
+ __entry->s_flags = qp->s_flags;
+ __entry->ps_flags = priv->s_flags;
+ __entry->iow_flags = priv->s_iowait.flags;
+ ),
+ TP_printk(/* print */
+ RSP_INFO_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->state,
+ __entry->s_state,
+ __entry->psn,
+ __entry->r_psn,
+ __entry->r_state,
+ __entry->r_flags,
+ __entry->r_head_ack_queue,
+ __entry->s_tail_ack_queue,
+ __entry->s_ack_state,
+ __entry->s_nak_state,
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_make_rc_ack,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_rcv_tid_read_req,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_tid_rcv_error,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DECLARE_EVENT_CLASS(/* sender_info */
+ hfi1_sender_info_template,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u8, state)
+ __field(u32, s_cur)
+ __field(u32, s_tail)
+ __field(u32, s_head)
+ __field(u32, s_acked)
+ __field(u32, s_last)
+ __field(u32, s_psn)
+ __field(u32, s_last_psn)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ __field(u8, s_state)
+ __field(u8, s_num_rd)
+ __field(u8, s_retry)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->state = qp->state;
+ __entry->s_cur = qp->s_cur;
+ __entry->s_tail = qp->s_tail;
+ __entry->s_head = qp->s_head;
+ __entry->s_acked = qp->s_acked;
+ __entry->s_last = qp->s_last;
+ __entry->s_psn = qp->s_psn;
+ __entry->s_last_psn = qp->s_last_psn;
+ __entry->s_flags = qp->s_flags;
+ __entry->ps_flags = ((struct hfi1_qp_priv *)qp->priv)->s_flags;
+ __entry->iow_flags =
+ ((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags;
+ __entry->s_state = qp->s_state;
+ __entry->s_num_rd = qp->s_num_rd_atomic;
+ __entry->s_retry = qp->s_retry;
+ ),
+ TP_printk(/* print */
+ SENDER_INFO_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->state,
+ __entry->s_cur,
+ __entry->s_tail,
+ __entry->s_head,
+ __entry->s_acked,
+ __entry->s_last,
+ __entry->s_psn,
+ __entry->s_last_psn,
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags,
+ __entry->s_state,
+ __entry->s_num_rd,
+ __entry->s_retry
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_make_rc_req,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_reset_psn,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_restart_rc,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_do_rc_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_rcv_tid_read_resp,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* tid_read_sender */
+ hfi1_tid_read_sender_template,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, newreq)
+ __field(u32, tid_r_reqs)
+ __field(u32, tid_r_comp)
+ __field(u32, pending_tid_r_segs)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ __field(u32, hw_flow_index)
+ __field(u32, generation)
+ __field(u32, fpsn)
+ __field(u32, flow_flags)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->newreq = newreq;
+ __entry->tid_r_reqs = priv->tid_r_reqs;
+ __entry->tid_r_comp = priv->tid_r_comp;
+ __entry->pending_tid_r_segs = priv->pending_tid_r_segs;
+ __entry->s_flags = qp->s_flags;
+ __entry->ps_flags = priv->s_flags;
+ __entry->iow_flags = priv->s_iowait.flags;
+ __entry->hw_flow_index = priv->flow_state.index;
+ __entry->generation = priv->flow_state.generation;
+ __entry->fpsn = priv->flow_state.psn;
+ __entry->flow_flags = priv->flow_state.flags;
+ ),
+ TP_printk(/* print */
+ TID_READ_SENDER_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->newreq,
+ __entry->tid_r_reqs,
+ __entry->tid_r_comp,
+ __entry->pending_tid_r_segs,
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags,
+ __entry->hw_flow_index,
+ __entry->generation,
+ __entry->fpsn,
+ __entry->flow_flags
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_read_sender_template, hfi1_tid_read_sender_make_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DECLARE_EVENT_CLASS(/* tid_rdma_request */
+ hfi1_tid_rdma_request_template,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, newreq)
+ __field(u8, opcode)
+ __field(u32, psn)
+ __field(u32, lpsn)
+ __field(u32, cur_seg)
+ __field(u32, comp_seg)
+ __field(u32, ack_seg)
+ __field(u32, total_segs)
+ __field(u16, setup_head)
+ __field(u16, clear_tail)
+ __field(u16, flow_idx)
+ __field(u32, state)
+ __field(u32, r_flow_psn)
+ __field(u32, s_next_psn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->newreq = newreq;
+ __entry->opcode = opcode;
+ __entry->psn = psn;
+ __entry->lpsn = lpsn;
+ __entry->cur_seg = req->cur_seg;
+ __entry->comp_seg = req->comp_seg;
+ __entry->ack_seg = req->ack_seg;
+ __entry->total_segs = req->total_segs;
+ __entry->setup_head = req->setup_head;
+ __entry->clear_tail = req->clear_tail;
+ __entry->flow_idx = req->flow_idx;
+ __entry->state = req->state;
+ __entry->r_flow_psn = req->r_flow_psn;
+ __entry->s_next_psn = req->s_next_psn;
+ ),
+ TP_printk(/* print */
+ TID_REQ_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->newreq,
+ __entry->opcode,
+ __entry->psn,
+ __entry->lpsn,
+ __entry->cur_seg,
+ __entry->comp_seg,
+ __entry->ack_seg,
+ __entry->total_segs,
+ __entry->setup_head,
+ __entry->clear_tail,
+ __entry->flow_idx,
+ __entry->state,
+ __entry->r_flow_psn,
+ __entry->s_next_psn
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_read,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_build_read_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_err,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_restart_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_setup_tid_wqe,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DECLARE_EVENT_CLASS(/* rc_rcv_err */
+ hfi1_rc_rcv_err_template,
+ TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff),
+ TP_ARGS(qp, opcode, psn, diff),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, s_flags)
+ __field(u8, state)
+ __field(u8, s_tail_ack_queue)
+ __field(u8, r_head_ack_queue)
+ __field(u32, opcode)
+ __field(u32, psn)
+ __field(u32, r_psn)
+ __field(int, diff)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->s_flags = qp->s_flags;
+ __entry->state = qp->state;
+ __entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+ __entry->r_head_ack_queue = qp->r_head_ack_queue;
+ __entry->opcode = opcode;
+ __entry->psn = psn;
+ __entry->r_psn = qp->r_psn;
+ __entry->diff = diff;
+ ),
+ TP_printk(/* print */
+ RCV_ERR_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->s_flags,
+ __entry->state,
+ __entry->s_tail_ack_queue,
+ __entry->r_head_ack_queue,
+ __entry->opcode,
+ __entry->psn,
+ __entry->r_psn,
+ __entry->diff
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_rc_rcv_err_template, hfi1_tid_rdma_rcv_err,
+ TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff),
+ TP_ARGS(qp, opcode, psn, diff)
+);
+
+DECLARE_EVENT_CLASS(/* sge */
+ hfi1_sge_template,
+ TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge),
+ TP_ARGS(qp, index, sge),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(int, index)
+ __field(u64, vaddr)
+ __field(u32, sge_length)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->index = index;
+ __entry->vaddr = (u64)sge->vaddr;
+ __entry->sge_length = sge->sge_length;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x sge %d: vaddr 0x%llx sge_length %u",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->index,
+ __entry->vaddr,
+ __entry->sge_length
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sge_template, hfi1_sge_check_align,
+ TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge),
+ TP_ARGS(qp, index, sge)
+);
+
#endif /* __HFI1_TRACE_TID_H */
#undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
index c57af3b31fe1..37dbb3e599c3 100644
--- a/drivers/infiniband/hw/hfi1/trace_tx.h
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -114,19 +114,27 @@ DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template,
__field(u32, qpn)
__field(u32, flags)
__field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
),
TP_fast_assign(
DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
__entry->flags = flags;
__entry->qpn = qp->ibqp.qp_num;
__entry->s_flags = qp->s_flags;
+ __entry->ps_flags =
+ ((struct hfi1_qp_priv *)qp->priv)->s_flags;
+ __entry->iow_flags =
+ ((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags;
),
TP_printk(
- "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+ "[%s] qpn 0x%x flags 0x%x s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx",
__get_str(dev),
__entry->qpn,
__entry->flags,
- __entry->s_flags
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags
)
);
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
index e383cc01a2bf..43b105de1d54 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -48,7 +48,6 @@
*/
#include "hfi.h"
-
#include "exp_rcv.h"
struct tid_pageset {
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 571bfd549c2a..88676ca79fda 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -165,6 +165,7 @@ const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
[IB_WR_SEND] = IB_WC_SEND,
[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+ [IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ,
[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
[IB_WR_SEND_WITH_INV] = IB_WC_SEND,
@@ -200,6 +201,8 @@ const u8 hdr_len_by_opcode[256] = {
[IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28,
[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = 12 + 8 + 4,
[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4,
+ [IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36,
/* UC */
[IB_OPCODE_UC_SEND_FIRST] = 12 + 8,
[IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8,
@@ -243,6 +246,11 @@ static const opcode_handler opcode_handler_tbl[256] = {
[IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv,
[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = &hfi1_rc_rcv,
[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv,
+
+ /* TID RDMA has separate handlers for different opcodes.*/
+ [IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req,
+ [IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp,
+
/* UC */
[IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv,
[IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv,
@@ -308,7 +316,7 @@ static inline opcode_handler qp_ok(struct hfi1_packet *packet)
static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
{
#ifdef CONFIG_FAULT_INJECTION
- if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP)
+ if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) {
/*
* In order to drop non-IB traffic we
* set PbcInsertHrc to NONE (0x2).
@@ -319,8 +327,9 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
* packet will not be delivered to the
* correct context.
*/
+ pbc &= ~PBC_INSERT_HCRC_SMASK;
pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT;
- else
+ } else {
/*
* In order to drop regular verbs
* traffic we set the PbcTestEbp
@@ -330,10 +339,129 @@ static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
* triggered and will be dropped.
*/
pbc |= PBC_TEST_EBP;
+ }
#endif
return pbc;
}
+static opcode_handler tid_qp_ok(int opcode, struct hfi1_packet *packet)
+{
+ if (packet->qp->ibqp.qp_type != IB_QPT_RC ||
+ !(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
+ return NULL;
+ if ((opcode & RVT_OPCODE_QP_MASK) == IB_OPCODE_TID_RDMA)
+ return opcode_handler_tbl[opcode];
+ return NULL;
+}
+
+void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct ib_header *hdr = packet->hdr;
+ u32 tlen = packet->tlen;
+ struct hfi1_pportdata *ppd = rcd->ppd;
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+ opcode_handler opcode_handler;
+ unsigned long flags;
+ u32 qp_num;
+ int lnh;
+ u8 opcode;
+
+ /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
+ if (unlikely(tlen < 15 * sizeof(u32)))
+ goto drop;
+
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh != HFI1_LRH_BTH)
+ goto drop;
+
+ packet->ohdr = &hdr->u.oth;
+ trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+ opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+ inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+ /* verbs_qp can be picked up from any tid_rdma header struct */
+ qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_req.verbs_qp) &
+ RVT_QPN_MASK;
+
+ rcu_read_lock();
+ packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+ if (!packet->qp)
+ goto drop_rcu;
+ spin_lock_irqsave(&packet->qp->r_lock, flags);
+ opcode_handler = tid_qp_ok(opcode, packet);
+ if (likely(opcode_handler))
+ opcode_handler(packet);
+ else
+ goto drop_unlock;
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+ rcu_read_unlock();
+
+ return;
+drop_unlock:
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+drop_rcu:
+ rcu_read_unlock();
+drop:
+ ibp->rvp.n_pkt_drops++;
+}
+
+void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct ib_header *hdr = packet->hdr;
+ u32 tlen = packet->tlen;
+ struct hfi1_pportdata *ppd = rcd->ppd;
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+ opcode_handler opcode_handler;
+ unsigned long flags;
+ u32 qp_num;
+ int lnh;
+ u8 opcode;
+
+ /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
+ if (unlikely(tlen < 15 * sizeof(u32)))
+ goto drop;
+
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh != HFI1_LRH_BTH)
+ goto drop;
+
+ packet->ohdr = &hdr->u.oth;
+ trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+ opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+ inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+ /* verbs_qp can be picked up from any tid_rdma header struct */
+ qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_rsp.verbs_qp) &
+ RVT_QPN_MASK;
+
+ rcu_read_lock();
+ packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+ if (!packet->qp)
+ goto drop_rcu;
+ spin_lock_irqsave(&packet->qp->r_lock, flags);
+ opcode_handler = tid_qp_ok(opcode, packet);
+ if (likely(opcode_handler))
+ opcode_handler(packet);
+ else
+ goto drop_unlock;
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+ rcu_read_unlock();
+
+ return;
+drop_unlock:
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+drop_rcu:
+ rcu_read_unlock();
+drop:
+ ibp->rvp.n_pkt_drops++;
+}
+
static int hfi1_do_pkey_check(struct hfi1_packet *packet)
{
struct hfi1_ctxtdata *rcd = packet->rcd;
@@ -504,11 +632,28 @@ static void verbs_sdma_complete(
hfi1_put_txreq(tx);
}
+void hfi1_wait_kmem(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct ib_device *ibdev = ibqp->device;
+ struct hfi1_ibdev *dev = to_idev(ibdev);
+
+ if (list_empty(&priv->s_iowait.list)) {
+ if (list_empty(&dev->memwait))
+ mod_timer(&dev->mem_timer, jiffies + 1);
+ qp->s_flags |= RVT_S_WAIT_KMEM;
+ list_add_tail(&priv->s_iowait.list, &dev->memwait);
+ priv->s_iowait.lock = &dev->iowait_lock;
+ trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
+ rvt_get_qp(qp);
+ }
+}
+
static int wait_kmem(struct hfi1_ibdev *dev,
struct rvt_qp *qp,
struct hfi1_pkt_state *ps)
{
- struct hfi1_qp_priv *priv = qp->priv;
unsigned long flags;
int ret = 0;
@@ -517,15 +662,7 @@ static int wait_kmem(struct hfi1_ibdev *dev,
write_seqlock(&dev->iowait_lock);
list_add_tail(&ps->s_txreq->txreq.list,
&ps->wait->tx_head);
- if (list_empty(&priv->s_iowait.list)) {
- if (list_empty(&dev->memwait))
- mod_timer(&dev->mem_timer, jiffies + 1);
- qp->s_flags |= RVT_S_WAIT_KMEM;
- list_add_tail(&priv->s_iowait.list, &dev->memwait);
- priv->s_iowait.lock = &dev->iowait_lock;
- trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
- rvt_get_qp(qp);
- }
+ hfi1_wait_kmem(qp);
write_sequnlock(&dev->iowait_lock);
hfi1_qp_unbusy(qp, ps->wait);
ret = -EBUSY;
@@ -674,6 +811,15 @@ bail_txadd:
return ret;
}
+static u64 update_hcrc(u8 opcode, u64 pbc)
+{
+ if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) {
+ pbc &= ~PBC_INSERT_HCRC_SMASK;
+ pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT;
+ }
+ return pbc;
+}
+
int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
u64 pbc)
{
@@ -719,6 +865,9 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
qp->srate_mbps,
vl,
plen);
+
+ /* Update HCRC based on packet opcode */
+ pbc = update_hcrc(ps->opcode, pbc);
}
tx->wqe = qp->s_wqe;
ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc);
@@ -867,6 +1016,9 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
+
+ /* Update HCRC based on packet opcode */
+ pbc = update_hcrc(ps->opcode, pbc);
}
if (cb)
iowait_pio_inc(&priv->s_iowait);
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index c8baa1e38ff6..841727a684d5 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -159,17 +159,38 @@ struct hfi1_qp_priv {
struct sdma_engine *s_sde; /* current sde */
struct send_context *s_sendcontext; /* current sendcontext */
struct hfi1_ctxtdata *rcd; /* QP's receive context */
+ struct page **pages; /* for TID page scan */
+ u32 tid_enqueue; /* saved when tid waited */
u8 s_sc; /* SC[0..4] for next packet */
struct iowait s_iowait;
+ struct list_head tid_wait; /* for queueing tid space */
struct hfi1_opfn_data opfn;
+ struct tid_flow_state flow_state;
struct tid_rdma_qp_params tid_rdma;
struct rvt_qp *owner;
u8 hdr_type; /* 9B or 16B */
unsigned long tid_timer_timeout_jiffies;
+
+ /* variables for the TID RDMA SE state machine */
+ u32 s_flags;
+
+ /* For TID RDMA READ */
+ u32 tid_r_reqs; /* Num of tid reads requested */
+ u32 tid_r_comp; /* Num of tid reads completed */
+ u32 pending_tid_r_segs; /* Num of pending tid read segments */
u16 pkts_ps; /* packets per segment */
u8 timeout_shift; /* account for number of packets per segment */
};
+struct hfi1_swqe_priv {
+ struct tid_rdma_request tid_req;
+ struct rvt_sge_state ss; /* Used for TID RDMA READ Request */
+};
+
+struct hfi1_ack_priv {
+ struct tid_rdma_request tid_req;
+};
+
/*
* This structure is used to hold commonly lookedup and computed values during
* the send engine progress.
@@ -231,6 +252,7 @@ struct hfi1_ibdev {
struct kmem_cache *verbs_txreq_cache;
u64 n_txwait;
u64 n_kmem_wait;
+ u64 n_tidwait;
/* protect iowait lists */
seqlock_t iowait_lock ____cacheline_aligned_in_smp;
@@ -318,6 +340,31 @@ static inline u32 delta_psn(u32 a, u32 b)
return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
}
+static inline struct tid_rdma_request *wqe_to_tid_req(struct rvt_swqe *wqe)
+{
+ return &((struct hfi1_swqe_priv *)wqe->priv)->tid_req;
+}
+
+static inline struct tid_rdma_request *ack_to_tid_req(struct rvt_ack_entry *e)
+{
+ return &((struct hfi1_ack_priv *)e->priv)->tid_req;
+}
+
+/*
+ * Look through all the active flows for a TID RDMA request and find
+ * the one (if it exists) that contains the specified PSN.
+ */
+static inline u32 __full_flow_psn(struct flow_state *state, u32 psn)
+{
+ return mask_psn((state->generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
+ (psn & HFI1_KDETH_BTH_SEQ_MASK));
+}
+
+static inline u32 full_flow_psn(struct tid_rdma_flow *flow, u32 psn)
+{
+ return __full_flow_psn(&flow->flow_state, psn);
+}
+
struct verbs_txreq;
void hfi1_put_txreq(struct verbs_txreq *tx);
@@ -383,6 +430,10 @@ int hfi1_register_ib_device(struct hfi1_devdata *);
void hfi1_unregister_ib_device(struct hfi1_devdata *);
+void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet);
+
+void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet);
+
void hfi1_ib_rcv(struct hfi1_packet *packet);
void hfi1_16B_rcv(struct hfi1_packet *packet);
@@ -400,6 +451,16 @@ static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr)
return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ);
}
+void hfi1_wait_kmem(struct rvt_qp *qp);
+
+static inline void hfi1_trdma_send_complete(struct rvt_qp *qp,
+ struct rvt_swqe *wqe,
+ enum ib_wc_status status)
+{
+ trdma_clean_swqe(qp, wqe);
+ rvt_send_complete(qp, wqe, status);
+}
+
extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
extern const u8 hdr_len_by_opcode[];
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c
index 6fa002940451..50dd9811b088 100644
--- a/drivers/infiniband/hw/qib/qib_rc.c
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -45,12 +45,7 @@ static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
u32 len;
len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
- ss->sge = wqe->sg_list[0];
- ss->sg_list = wqe->sg_list + 1;
- ss->num_sge = wqe->wr.num_sge;
- ss->total_len = wqe->length;
- rvt_skip_sge(ss, len, false);
- return wqe->length - len;
+ return rvt_restart_sge(ss, wqe, len);
}
/**
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index 16247d2a671d..2769ebdf89fb 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1642,11 +1642,11 @@ int rvt_destroy_qp(struct ib_qp *ibqp)
kref_put(&qp->ip->ref, rvt_release_mmap_info);
else
vfree(qp->r_rq.wq);
- vfree(qp->s_wq);
rdi->driver_f.qp_priv_free(rdi, qp);
kfree(qp->s_ack_queue);
rdma_destroy_ah_attr(&qp->remote_ah_attr);
rdma_destroy_ah_attr(&qp->alt_ah_attr);
+ vfree(qp->s_wq);
kfree(qp);
return 0;
}
@@ -2393,11 +2393,12 @@ static inline unsigned long rvt_aeth_to_usec(u32 aeth)
}
/*
- * rvt_add_retry_timer - add/start a retry timer
+ * rvt_add_retry_timer_ext - add/start a retry timer
* @qp - the QP
+ * @shift - timeout shift to wait for multiple packets
* add a retry timer on the QP
*/
-void rvt_add_retry_timer(struct rvt_qp *qp)
+void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift)
{
struct ib_qp *ibqp = &qp->ibqp;
struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
@@ -2405,11 +2406,11 @@ void rvt_add_retry_timer(struct rvt_qp *qp)
lockdep_assert_held(&qp->s_lock);
qp->s_flags |= RVT_S_TIMER;
/* 4.096 usec. * (1 << qp->timeout) */
- qp->s_timer.expires = jiffies + qp->timeout_jiffies +
- rdi->busy_jiffies;
+ qp->s_timer.expires = jiffies + rdi->busy_jiffies +
+ (qp->timeout_jiffies << shift);
add_timer(&qp->s_timer);
}
-EXPORT_SYMBOL(rvt_add_retry_timer);
+EXPORT_SYMBOL(rvt_add_retry_timer_ext);
/**
* rvt_add_rnr_timer - add/start an rnr timer
diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c
index 6131cc558bdb..8d71647820a8 100644
--- a/drivers/infiniband/sw/rdmavt/rc.c
+++ b/drivers/infiniband/sw/rdmavt/rc.c
@@ -187,3 +187,16 @@ void rvt_get_credit(struct rvt_qp *qp, u32 aeth)
}
}
EXPORT_SYMBOL(rvt_get_credit);
+
+/* rvt_restart_sge - rewind the sge state for a wqe */
+u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len)
+{
+ ss->sge = wqe->sg_list[0];
+ ss->sg_list = wqe->sg_list + 1;
+ ss->num_sge = wqe->wr.num_sge;
+ ss->total_len = wqe->length;
+ rvt_skip_sge(ss, len, false);
+ return wqe->length - len;
+}
+EXPORT_SYMBOL(rvt_restart_sge);
+
diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h
index 6e35416170a3..58a0a0f99e7f 100644
--- a/include/rdma/ib_hdrs.h
+++ b/include/rdma/ib_hdrs.h
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2016 Intel Corporation.
+ * Copyright(c) 2016 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -100,6 +100,8 @@ struct ib_atomic_eth {
__be64 compare_data; /* potentially unaligned */
} __packed;
+#include <rdma/tid_rdma_defs.h>
+
union ib_ehdrs {
struct {
__be32 deth[2];
@@ -117,6 +119,11 @@ union ib_ehdrs {
__be32 aeth;
__be32 ieth;
struct ib_atomic_eth atomic_eth;
+ /* TID RDMA headers */
+ union {
+ struct tid_rdma_read_req r_req;
+ struct tid_rdma_read_resp r_rsp;
+ } tid_rdma;
} __packed;
struct ib_other_headers {
diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h
index 168e40be183c..87d66c9630d7 100644
--- a/include/rdma/rdma_vt.h
+++ b/include/rdma/rdma_vt.h
@@ -574,9 +574,10 @@ static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi,
/**
* rvt_mod_retry_timer - mod a retry timer
* @qp - the QP
+ * @shift - timeout shift to wait for multiple packets
* Modify a potentially already running retry timer
*/
-static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
+static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift)
{
struct ib_qp *ibqp = &qp->ibqp;
struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
@@ -584,8 +585,13 @@ static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
lockdep_assert_held(&qp->s_lock);
qp->s_flags |= RVT_S_TIMER;
/* 4.096 usec. * (1 << qp->timeout) */
- mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies +
- rdi->busy_jiffies);
+ mod_timer(&qp->s_timer, jiffies + rdi->busy_jiffies +
+ (qp->timeout_jiffies << shift));
+}
+
+static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
+{
+ return rvt_mod_retry_timer_ext(qp, 0);
}
struct rvt_dev_info *rvt_alloc_device(size_t size, int nports);
diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h
index cbafb1878669..d8d88d023092 100644
--- a/include/rdma/rdmavt_qp.h
+++ b/include/rdma/rdmavt_qp.h
@@ -174,6 +174,7 @@ struct rvt_swqe {
u32 lpsn; /* last packet sequence number */
u32 ssn; /* send sequence number */
u32 length; /* total length of data in sg_list */
+ void *priv; /* driver dependent field */
struct rvt_sge sg_list[0];
};
@@ -235,6 +236,7 @@ struct rvt_ack_entry {
u32 lpsn;
u8 opcode;
u8 sent;
+ void *priv;
};
#define RC_QP_SCALING_INTERVAL 5
@@ -629,6 +631,16 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp);
void rvt_get_credit(struct rvt_qp *qp, u32 aeth);
/**
+ * rvt_restart_sge - rewind the sge state for a wqe
+ * @ss: the sge state pointer
+ * @wqe: the wqe to rewind
+ * @len: the data length from the start of the wqe in bytes
+ *
+ * Returns the remaining data length.
+ */
+u32 rvt_restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, u32 len);
+
+/**
* @qp - the qp pair
* @len - the length
*
@@ -676,7 +688,11 @@ enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t);
void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth);
void rvt_del_timers_sync(struct rvt_qp *qp);
void rvt_stop_rc_timers(struct rvt_qp *qp);
-void rvt_add_retry_timer(struct rvt_qp *qp);
+void rvt_add_retry_timer_ext(struct rvt_qp *qp, u8 shift);
+static inline void rvt_add_retry_timer(struct rvt_qp *qp)
+{
+ rvt_add_retry_timer_ext(qp, 0);
+}
void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss,
void *data, u32 length,
diff --git a/include/rdma/tid_rdma_defs.h b/include/rdma/tid_rdma_defs.h
new file mode 100644
index 000000000000..1c431ea32b52
--- /dev/null
+++ b/include/rdma/tid_rdma_defs.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#ifndef TID_RDMA_DEFS_H
+#define TID_RDMA_DEFS_H
+
+#include <rdma/ib_pack.h>
+
+struct tid_rdma_read_req {
+ __le32 kdeth0;
+ __le32 kdeth1;
+ struct ib_reth reth;
+ __be32 tid_flow_psn;
+ __be32 tid_flow_qp;
+ __be32 verbs_qp;
+};
+
+struct tid_rdma_read_resp {
+ __le32 kdeth0;
+ __le32 kdeth1;
+ __be32 aeth;
+ __be32 reserved[4];
+ __be32 verbs_psn;
+ __be32 verbs_qp;
+};
+
+/*
+ * TID RDMA Opcodes
+ */
+#define IB_OPCODE_TID_RDMA 0xe0
+enum {
+ IB_OPCODE_READ_REQ = 0x4,
+ IB_OPCODE_READ_RESP = 0x5,
+
+ IB_OPCODE(TID_RDMA, READ_REQ),
+ IB_OPCODE(TID_RDMA, READ_RESP),
+};
+
+#define TID_OP(x) IB_OPCODE_TID_RDMA_##x
+
+/*
+ * Define TID RDMA specific WR opcodes. The ib_wr_opcode
+ * enum already provides some reserved values for use by
+ * low level drivers. Two of those are used but renamed
+ * to be more descriptive.
+ */
+#define IB_WR_TID_RDMA_READ IB_WR_RESERVED2
+
+#endif /* TID_RDMA_DEFS_H */