Diffstat (limited to 'drivers/infiniband/hw')
-rw-r--r--  drivers/infiniband/hw/hfi1/init.c             1
-rw-r--r--  drivers/infiniband/hw/hfi1/iowait.c          34
-rw-r--r--  drivers/infiniband/hw/hfi1/iowait.h          99
-rw-r--r--  drivers/infiniband/hw/hfi1/opfn.c             5
-rw-r--r--  drivers/infiniband/hw/hfi1/pio.c             18
-rw-r--r--  drivers/infiniband/hw/hfi1/qp.c              57
-rw-r--r--  drivers/infiniband/hw/hfi1/qp.h               5
-rw-r--r--  drivers/infiniband/hw/hfi1/rc.c             542
-rw-r--r--  drivers/infiniband/hw/hfi1/rc.h               1
-rw-r--r--  drivers/infiniband/hw/hfi1/ruc.c             32
-rw-r--r--  drivers/infiniband/hw/hfi1/sdma.c            24
-rw-r--r--  drivers/infiniband/hw/hfi1/sdma_txreq.h       1
-rw-r--r--  drivers/infiniband/hw/hfi1/tid_rdma.c      2504
-rw-r--r--  drivers/infiniband/hw/hfi1/tid_rdma.h        88
-rw-r--r--  drivers/infiniband/hw/hfi1/trace.c           66
-rw-r--r--  drivers/infiniband/hw/hfi1/trace_ibhdrs.h     6
-rw-r--r--  drivers/infiniband/hw/hfi1/trace_tid.h      532
-rw-r--r--  drivers/infiniband/hw/hfi1/trace_tx.h         6
-rw-r--r--  drivers/infiniband/hw/hfi1/user_sdma.c        9
-rw-r--r--  drivers/infiniband/hw/hfi1/verbs.c           20
-rw-r--r--  drivers/infiniband/hw/hfi1/verbs.h           35
-rw-r--r--  drivers/infiniband/hw/hfi1/verbs_txreq.h      1
-rw-r--r--  drivers/infiniband/hw/hfi1/vnic_sdma.c        6
23 files changed, 3961 insertions, 131 deletions
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index d13304f7340d..7841a0ad7cb6 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -1512,6 +1512,7 @@ static int __init hfi1_mod_init(void)
goto bail_dev;
}
+ hfi1_compute_tid_rdma_flow_wt();
/*
* These must be called before the driver is registered with
* the PCI subsystem.
diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c
index 582f1ba136ff..adb4a1ba921b 100644
--- a/drivers/infiniband/hw/hfi1/iowait.c
+++ b/drivers/infiniband/hw/hfi1/iowait.c
@@ -6,6 +6,9 @@
#include "iowait.h"
#include "trace_iowait.h"
+/* 1 priority == 16 starve_cnt */
+#define IOWAIT_PRIORITY_STARVE_SHIFT 4
+
void iowait_set_flag(struct iowait *wait, u32 flag)
{
trace_hfi1_iowait_set(wait, flag);
@@ -44,7 +47,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
uint seq,
bool pkts_sent),
void (*wakeup)(struct iowait *wait, int reason),
- void (*sdma_drained)(struct iowait *wait))
+ void (*sdma_drained)(struct iowait *wait),
+ void (*init_priority)(struct iowait *wait))
{
int i;
@@ -58,6 +62,7 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
wait->sleep = sleep;
wait->wakeup = wakeup;
wait->sdma_drained = sdma_drained;
+ wait->init_priority = init_priority;
wait->flags = 0;
for (i = 0; i < IOWAIT_SES; i++) {
wait->wait[i].iow = wait;
@@ -92,3 +97,30 @@ int iowait_set_work_flag(struct iowait_work *w)
iowait_set_flag(w->iow, IOWAIT_PENDING_TID);
return IOWAIT_TID_SE;
}
+
+/**
+ * iowait_priority_update_top - update the top priority entry
+ * @w: the iowait struct
+ * @top: a pointer to the top priority entry
+ * @idx: the index of the current iowait in an array
+ * @top_idx: the array index for the iowait entry that has the top priority
+ *
+ * This function is called to compare the priority of a given
+ * iowait with the given top priority entry. The top index will
+ * be returned.
+ */
+uint iowait_priority_update_top(struct iowait *w,
+ struct iowait *top,
+ uint idx, uint top_idx)
+{
+ u8 cnt, tcnt;
+
+	/* Convert priority into starve_cnt and compare the total. */
+ cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + w->starved_cnt;
+ tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) +
+ top->starved_cnt;
+ if (cnt > tcnt)
+ return idx;
+ else
+ return top_idx;
+}
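
For reference, a minimal standalone sketch (not part of the patch; illustrative names) of the weighting iowait_priority_update_top() applies: one priority point is worth 16 starvation events, so a waiter that was given priority (e.g. it has an ACK pending) outranks a waiter that has merely been passed over up to 15 times.

static uint pick_top_idx(u8 prio, u8 starved, u8 top_prio, u8 top_starved,
			 uint idx, uint top_idx)
{
	/* IOWAIT_PRIORITY_STARVE_SHIFT == 4: 1 priority == 16 starve_cnt */
	uint cnt = ((uint)prio << 4) + starved;
	uint tcnt = ((uint)top_prio << 4) + top_starved;

	return cnt > tcnt ? idx : top_idx;
}

Example: priority 1 / starved_cnt 3 scores 19 and wins over priority 0 / starved_cnt 15 (score 15).
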
diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
index 23a58ac0d47c..07847cb72169 100644
--- a/drivers/infiniband/hw/hfi1/iowait.h
+++ b/drivers/infiniband/hw/hfi1/iowait.h
@@ -100,6 +100,7 @@ struct iowait_work {
* @sleep: no space callback
* @wakeup: space callback wakeup
* @sdma_drained: sdma count drained
+ * @init_priority: callback to manipulate priority
* @lock: lock protected head of wait queue
* @iowork: workqueue overhead
* @wait_dma: wait for sdma_busy == 0
@@ -109,7 +110,7 @@ struct iowait_work {
* @tx_limit: limit for overflow queuing
 * @tx_count: number of tx entries in tx_head'ed list
* @flags: wait flags (one per QP)
- * @wait: SE array
+ * @wait: SE array for multiple legs
*
* This is to be embedded in user's state structure
* (QP or PQ).
@@ -120,10 +121,13 @@ struct iowait_work {
* are callbacks for the ULP to implement
* what ever queuing/dequeuing of
* the embedded iowait and its containing struct
- * when a resource shortage like SDMA ring space is seen.
+ * when a resource shortage like SDMA ring space
+ * or PIO credit space is seen.
*
 * Both potentially have locks held
- * so sleeping is not allowed.
+ * so sleeping is not allowed and it is not
+ * supported to submit txreqs from the wakeup
+ * call directly because of lock conflicts.
*
* The wait_dma member along with the iow
*
@@ -143,6 +147,7 @@ struct iowait {
);
void (*wakeup)(struct iowait *wait, int reason);
void (*sdma_drained)(struct iowait *wait);
+ void (*init_priority)(struct iowait *wait);
seqlock_t *lock;
wait_queue_head_t wait_dma;
wait_queue_head_t wait_pio;
@@ -152,6 +157,7 @@ struct iowait {
u32 tx_limit;
u32 tx_count;
u8 starved_cnt;
+ u8 priority;
unsigned long flags;
struct iowait_work wait[IOWAIT_SES];
};
@@ -171,7 +177,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
uint seq,
bool pkts_sent),
void (*wakeup)(struct iowait *wait, int reason),
- void (*sdma_drained)(struct iowait *wait));
+ void (*sdma_drained)(struct iowait *wait),
+ void (*init_priority)(struct iowait *wait));
/**
* iowait_schedule() - schedule the default send engine work
@@ -186,6 +193,18 @@ static inline bool iowait_schedule(struct iowait *wait,
}
/**
+ * iowait_tid_schedule - schedule the tid SE
+ * @wait: the iowait structure
+ * @wq: the work queue
+ * @cpu: the cpu
+ */
+static inline bool iowait_tid_schedule(struct iowait *wait,
+ struct workqueue_struct *wq, int cpu)
+{
+ return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork);
+}
+
+/**
* iowait_sdma_drain() - wait for DMAs to drain
*
* @wait: iowait structure
@@ -327,6 +346,8 @@ static inline u16 iowait_get_desc(struct iowait_work *w)
tx = list_first_entry(&w->tx_head, struct sdma_txreq,
list);
num_desc = tx->num_desc;
+ if (tx->flags & SDMA_TXREQ_F_VIP)
+ w->iow->priority++;
}
return num_desc;
}
@@ -340,6 +361,37 @@ static inline u32 iowait_get_all_desc(struct iowait *w)
return num_desc;
}
+static inline void iowait_update_priority(struct iowait_work *w)
+{
+ struct sdma_txreq *tx = NULL;
+
+ if (!list_empty(&w->tx_head)) {
+ tx = list_first_entry(&w->tx_head, struct sdma_txreq,
+ list);
+ if (tx->flags & SDMA_TXREQ_F_VIP)
+ w->iow->priority++;
+ }
+}
+
+static inline void iowait_update_all_priority(struct iowait *w)
+{
+ iowait_update_priority(&w->wait[IOWAIT_IB_SE]);
+ iowait_update_priority(&w->wait[IOWAIT_TID_SE]);
+}
+
+static inline void iowait_init_priority(struct iowait *w)
+{
+ w->priority = 0;
+ if (w->init_priority)
+ w->init_priority(w);
+}
+
+static inline void iowait_get_priority(struct iowait *w)
+{
+ iowait_init_priority(w);
+ iowait_update_all_priority(w);
+}
+
/**
* iowait_queue - Put the iowait on a wait queue
* @pkts_sent: have some packets been sent before queuing?
@@ -356,14 +408,18 @@ static inline void iowait_queue(bool pkts_sent, struct iowait *w,
/*
* To play fair, insert the iowait at the tail of the wait queue if it
* has already sent some packets; Otherwise, put it at the head.
+ * However, if it has priority packets to send, also put it at the
+ * head.
*/
- if (pkts_sent) {
- list_add_tail(&w->list, wait_head);
+ if (pkts_sent)
w->starved_cnt = 0;
- } else {
- list_add(&w->list, wait_head);
+ else
w->starved_cnt++;
- }
+
+ if (w->priority > 0 || !pkts_sent)
+ list_add(&w->list, wait_head);
+ else
+ list_add_tail(&w->list, wait_head);
}
/**
@@ -380,27 +436,10 @@ static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w)
w->starved_cnt = 0;
}
-/**
- * iowait_starve_find_max - Find the maximum of the starve count
- * @w: the iowait struct
- * @max: a variable containing the max starve count
- * @idx: the index of the current iowait in an array
- * @max_idx: a variable containing the array index for the
- * iowait entry that has the max starve count
- *
- * This function is called to compare the starve count of a
- * given iowait with the given max starve count. The max starve
- * count and the index will be updated if the iowait's start
- * count is larger.
- */
-static inline void iowait_starve_find_max(struct iowait *w, u8 *max,
- uint idx, uint *max_idx)
-{
- if (w->starved_cnt > *max) {
- *max = w->starved_cnt;
- *max_idx = idx;
- }
-}
+/* Update the top priority index */
+uint iowait_priority_update_top(struct iowait *w,
+ struct iowait *top,
+ uint idx, uint top_idx);
/**
* iowait_packet_queued() - determine if a packet is queued
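
As a usage sketch (this mirrors what iowait_sleep() in qp.c does further down; the helper name is made up and the caller is assumed to hold the wait-list lock), a producer recomputes the priority right before parking the iowait on a wait list, so the head-vs-tail decision in iowait_queue() sees an up-to-date value:

static void park_on_waitlist(struct iowait *w, struct list_head *wait_head,
			     bool pkts_sent)
{
	/* Derive priority from QP state plus any queued VIP txreqs. */
	iowait_get_priority(w);
	/* Priority or starved waiters go to the head, the rest to the tail. */
	iowait_queue(pkts_sent, w, wait_head);
}
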
diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c
index 2ca070690b2f..370a5a8eaa71 100644
--- a/drivers/infiniband/hw/hfi1/opfn.c
+++ b/drivers/infiniband/hw/hfi1/opfn.c
@@ -245,10 +245,15 @@ void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask)
struct hfi1_qp_priv *priv = qp->priv;
unsigned long flags;
+ if (attr_mask & IB_QP_RETRY_CNT)
+ priv->s_retry = attr->retry_cnt;
+
spin_lock_irqsave(&priv->opfn.lock, flags);
if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
struct tid_rdma_params *local = &priv->tid_rdma.local;
+ if (attr_mask & IB_QP_TIMEOUT)
+ priv->tid_retry_timeout_jiffies = qp->timeout_jiffies;
if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) ||
qp->pmtu == enum_to_mtu(OPA_MTU_8192)) {
tid_rdma_opfn_init(qp, local);
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 04126d7e318d..a1de566fe95e 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -1599,8 +1599,7 @@ static void sc_piobufavail(struct send_context *sc)
struct rvt_qp *qp;
struct hfi1_qp_priv *priv;
unsigned long flags;
- uint i, n = 0, max_idx = 0;
- u8 max_starved_cnt = 0;
+ uint i, n = 0, top_idx = 0;
if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
dd->send_contexts[sc->sw_index].type != SC_VL15)
@@ -1619,11 +1618,18 @@ static void sc_piobufavail(struct send_context *sc)
if (n == ARRAY_SIZE(qps))
break;
wait = list_first_entry(list, struct iowait, list);
+ iowait_get_priority(wait);
qp = iowait_to_qp(wait);
priv = qp->priv;
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
- iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx);
+ if (n) {
+ priv = qps[top_idx]->priv;
+ top_idx = iowait_priority_update_top(wait,
+ &priv->s_iowait,
+ n, top_idx);
+ }
+
/* refcount held until actual wake up */
qps[n++] = qp;
}
@@ -1638,12 +1644,12 @@ static void sc_piobufavail(struct send_context *sc)
}
write_sequnlock_irqrestore(&sc->waitlock, flags);
- /* Wake up the most starved one first */
+ /* Wake up the top-priority one first */
if (n)
- hfi1_qp_wakeup(qps[max_idx],
+ hfi1_qp_wakeup(qps[top_idx],
RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
for (i = 0; i < n; i++)
- if (i != max_idx)
+ if (i != top_idx)
hfi1_qp_wakeup(qps[i],
RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
}
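
The same select-the-top-waiter pattern shows up again in sdma_desc_avail() below. In generic form it is roughly the following (a sketch with made-up names, not code from the patch):

static uint drain_waiters(struct list_head *list, struct iowait **out, uint max)
{
	struct iowait *wait;
	uint n = 0, top_idx = 0;

	while (!list_empty(list) && n < max) {
		wait = list_first_entry(list, struct iowait, list);
		iowait_get_priority(wait);
		/* The first entry is the provisional top; compare the rest. */
		if (n)
			top_idx = iowait_priority_update_top(wait,
							     out[top_idx],
							     n, top_idx);
		list_del_init(&wait->list);
		out[n++] = wait;
	}
	return top_idx;	/* caller wakes out[top_idx] first, then the others */
}
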
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index acdd9eba189b..d8f7add935df 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -138,6 +138,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
.flags = RVT_OPERATION_USE_RESERVE,
},
+[IB_WR_TID_RDMA_WRITE] = {
+ .length = sizeof(struct ib_rdma_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_IGN_RNR_CNT,
+},
+
};
static void flush_list_head(struct list_head *l)
@@ -431,6 +437,11 @@ static void hfi1_qp_schedule(struct rvt_qp *qp)
if (ret)
iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
}
+ if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) {
+ ret = hfi1_schedule_tid_send(qp);
+ if (ret)
+ iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ }
}
void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
@@ -450,8 +461,27 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait)
{
- if (iowait_set_work_flag(wait) == IOWAIT_IB_SE)
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) {
qp->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we are sending a first-leg packet from the second leg,
+ * we need to clear the busy flag from priv->s_flags to
+ * avoid a race condition when the qp wakes up before
+ * the call to hfi1_verbs_send() returns to the second
+ * leg. In that case, the second leg will terminate without
+ * being re-scheduled, resulting in failure to send TID RDMA
+ * WRITE DATA and TID RDMA ACK packets.
+ */
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ priv->s_flags &= ~(HFI1_S_TID_BUSY_SET |
+ RVT_S_BUSY);
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ }
+ } else {
+ priv->s_flags &= ~RVT_S_BUSY;
+ }
}
static int iowait_sleep(
@@ -488,6 +518,7 @@ static int iowait_sleep(
ibp->rvp.n_dmawait++;
qp->s_flags |= RVT_S_WAIT_DMA_DESC;
+ iowait_get_priority(&priv->s_iowait);
iowait_queue(pkts_sent, &priv->s_iowait,
&sde->dmawait);
priv->s_iowait.lock = &sde->waitlock;
@@ -537,6 +568,17 @@ static void iowait_sdma_drained(struct iowait *wait)
spin_unlock_irqrestore(&qp->s_lock, flags);
}
+static void hfi1_init_priority(struct iowait *w)
+{
+ struct rvt_qp *qp = iowait_to_qp(w);
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (qp->s_flags & RVT_S_ACK_PENDING)
+ w->priority++;
+ if (priv->s_flags & RVT_S_ACK_PENDING)
+ w->priority++;
+}
+
/**
* qp_to_sdma_engine - map a qp to a send engine
* @qp: the QP
@@ -694,10 +736,11 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp)
&priv->s_iowait,
1,
_hfi1_do_send,
- NULL,
+ _hfi1_do_tid_send,
iowait_sleep,
iowait_wakeup,
- iowait_sdma_drained);
+ iowait_sdma_drained,
+ hfi1_init_priority);
return priv;
}
@@ -755,6 +798,8 @@ void quiesce_qp(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
+ hfi1_del_tid_reap_timer(qp);
+ hfi1_del_tid_retry_timer(qp);
iowait_sdma_drain(&priv->s_iowait);
qp_pio_drain(qp);
flush_tx_list(qp);
@@ -850,7 +895,8 @@ void notify_error_qp(struct rvt_qp *qp)
if (lock) {
write_seqlock(lock);
if (!list_empty(&priv->s_iowait.list) &&
- !(qp->s_flags & RVT_S_BUSY)) {
+ !(qp->s_flags & RVT_S_BUSY) &&
+ !(priv->s_flags & RVT_S_BUSY)) {
qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
@@ -859,7 +905,8 @@ void notify_error_qp(struct rvt_qp *qp)
write_sequnlock(lock);
}
- if (!(qp->s_flags & RVT_S_BUSY)) {
+ if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) {
+ qp->s_hdrwords = 0;
if (qp->s_rdma_mr) {
rvt_put_mr(qp->s_rdma_mr);
qp->s_rdma_mr = NULL;
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index ce25a27aa4a1..b670321365d3 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -64,12 +64,16 @@ extern const struct rvt_operation_params hfi1_post_parms[];
* HFI1_S_AHG_CLEAR - have send engine clear ahg state
* HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain
* HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource
+ * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response
+ * HFI1_S_WAIT_HALT - halt the first leg send engine
* HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1
*/
#define HFI1_S_AHG_VALID 0x80000000
#define HFI1_S_AHG_CLEAR 0x40000000
#define HFI1_S_WAIT_PIO_DRAIN 0x20000000
#define HFI1_S_WAIT_TID_SPACE 0x10000000
+#define HFI1_S_WAIT_TID_RESP 0x08000000
+#define HFI1_S_WAIT_HALT 0x04000000
#define HFI1_S_MIN_BIT_MASK 0x01000000
/*
@@ -78,6 +82,7 @@ extern const struct rvt_operation_params hfi1_post_parms[];
#define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN)
#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
+#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA)
/*
* Send if not busy or waiting for I/O and either
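
A quick compile-time sanity sketch (not in the patch; needs <linux/build_bug.h>) confirming that the two new bits sit inside hfi1's private flag range and do not collide with the existing HFI1_S_* flags:

static inline void hfi1_s_flags_check(void)
{
	BUILD_BUG_ON(HFI1_S_WAIT_TID_RESP < HFI1_S_MIN_BIT_MASK);
	BUILD_BUG_ON(HFI1_S_WAIT_HALT < HFI1_S_MIN_BIT_MASK);
	BUILD_BUG_ON(HFI1_S_WAIT_TID_RESP & HFI1_S_WAIT_HALT);
	BUILD_BUG_ON((HFI1_S_WAIT_TID_RESP | HFI1_S_WAIT_HALT) &
		     (HFI1_S_AHG_VALID | HFI1_S_AHG_CLEAR |
		      HFI1_S_WAIT_PIO_DRAIN | HFI1_S_WAIT_TID_SPACE));
}
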
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 6c9ef572fc69..e6726c1ab866 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -111,15 +111,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
struct hfi1_pkt_state *ps)
{
struct rvt_ack_entry *e;
- u32 hwords;
+ u32 hwords, hdrlen;
u32 len = 0;
u32 bth0 = 0, bth2 = 0;
u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
int middle = 0;
u32 pmtu = qp->pmtu;
- struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_qp_priv *qpriv = qp->priv;
bool last_pkt;
u32 delta;
+ u8 next = qp->s_tail_ack_queue;
+ struct tid_rdma_request *req;
trace_hfi1_rsp_make_rc_ack(qp, 0);
lockdep_assert_held(&qp->s_lock);
@@ -127,7 +129,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
goto bail;
- if (priv->hdr_type == HFI1_PKT_TYPE_9B)
+ if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
/* header size in 32-bit words LRH+BTH = (8+12)/4. */
hwords = 5;
else
@@ -149,9 +151,18 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
* response has been sent instead of only being
* constructed.
*/
- if (++qp->s_tail_ack_queue >
- rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
- qp->s_tail_ack_queue = 0;
+ if (++next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ /*
+ * Only advance the s_acked_ack_queue pointer if there
+ * have been no TID RDMA requests.
+ */
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->opcode != TID_OP(WRITE_REQ) &&
+ qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = next;
+ qp->s_tail_ack_queue = next;
+ trace_hfi1_rsp_make_rc_ack(qp, e->psn);
/* FALLTHROUGH */
case OP(SEND_ONLY):
case OP(ACKNOWLEDGE):
@@ -163,6 +174,12 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
}
e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ /* Check for tid write fence */
+ if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) ||
+ hfi1_tid_rdma_ack_interlock(qp, e)) {
+ iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB);
+ goto bail;
+ }
if (e->opcode == OP(RDMA_READ_REQUEST)) {
/*
* If a RDMA read response is being resent and
@@ -172,6 +189,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
*/
len = e->rdma_sge.sge_length;
if (len && !e->rdma_sge.mr) {
+ if (qp->s_acked_ack_queue ==
+ qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue =
+ qp->r_head_ack_queue;
qp->s_tail_ack_queue = qp->r_head_ack_queue;
goto bail;
}
@@ -193,6 +214,21 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
hwords++;
qp->s_ack_rdma_psn = e->psn;
bth2 = mask_psn(qp->s_ack_rdma_psn++);
+ } else if (e->opcode == TID_OP(WRITE_REQ)) {
+ /*
+ * If a TID RDMA WRITE RESP is being resent, we have to
+ * wait for the actual request. All requests that are to
+ * be resent will have their state set to
+ * TID_REQUEST_RESEND. When the new request arrives, the
+ * state will be changed to TID_REQUEST_RESEND_ACTIVE.
+ */
+ req = ack_to_tid_req(e);
+ if (req->state == TID_REQUEST_RESEND ||
+ req->state == TID_REQUEST_INIT_RESEND)
+ goto bail;
+ qp->s_ack_state = TID_OP(WRITE_RESP);
+ qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
+ goto write_resp;
} else if (e->opcode == TID_OP(READ_REQ)) {
/*
* If a TID RDMA read response is being resent and
@@ -202,6 +238,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
*/
len = e->rdma_sge.sge_length;
if (len && !e->rdma_sge.mr) {
+ if (qp->s_acked_ack_queue ==
+ qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue =
+ qp->r_head_ack_queue;
qp->s_tail_ack_queue = qp->r_head_ack_queue;
goto bail;
}
@@ -224,6 +264,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
bth2 = mask_psn(e->psn);
e->sent = 1;
}
+ trace_hfi1_tid_write_rsp_make_rc_ack(qp);
bth0 = qp->s_ack_state << 24;
break;
@@ -250,6 +291,61 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
bth2 = mask_psn(qp->s_ack_rdma_psn++);
break;
+ case TID_OP(WRITE_RESP):
+write_resp:
+ /*
+ * 1. Check if RVT_S_ACK_PENDING is set. If yes,
+ * goto normal.
+ * 2. Attempt to allocate TID resources.
+ * 3. Remove RVT_S_RESP_PENDING flags from s_flags
+ * 4. If resources not available:
+ * 4.1 Set RVT_S_WAIT_TID_SPACE
+ * 4.2 Queue QP on RCD TID queue
+ * 4.3 Put QP on iowait list.
+ * 4.4 Build IB RNR NAK with appropriate timeout value
+ * 4.5 Return indication progress made.
+ * 5. If resources are available:
+ * 5.1 Program HW flow CSRs
+ * 5.2 Build TID RDMA WRITE RESP packet
+ * 5.3 If more resources needed, do 2.1 - 2.3.
+ * 5.4 Wake up next QP on RCD TID queue.
+ * 5.5 Return indication progress made.
+ */
+
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ req = ack_to_tid_req(e);
+
+ /*
+		 * Send scheduled RNR NAKs. RNR NAKs need to be sent at
+		 * segment boundaries, not at request boundaries. Don't change
+		 * s_ack_state because we are still in the middle of a request.
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
+ qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
+ req->cur_seg == req->alloc_seg) {
+ qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
+ goto normal_no_state;
+ }
+
+ bth2 = mask_psn(qp->s_ack_rdma_psn);
+ hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
+ bth2, &len,
+ &ps->s_txreq->ss);
+ if (!hdrlen)
+ return 0;
+
+ hwords += hdrlen;
+ bth0 = qp->s_ack_state << 24;
+ qp->s_ack_rdma_psn++;
+ trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ if (req->cur_seg != req->total_segs)
+ break;
+
+ e->sent = 1;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ break;
+
case TID_OP(READ_RESP):
read_resp:
e = &qp->s_ack_queue[qp->s_tail_ack_queue];
@@ -281,8 +377,7 @@ normal:
* (see above).
*/
qp->s_ack_state = OP(SEND_ONLY);
- qp->s_flags &= ~RVT_S_ACK_PENDING;
- ps->s_txreq->ss = NULL;
+normal_no_state:
if (qp->s_nak_state)
ohdr->u.aeth =
cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
@@ -294,9 +389,12 @@ normal:
len = 0;
bth0 = OP(ACKNOWLEDGE) << 24;
bth2 = mask_psn(qp->s_ack_psn);
+ qp->s_flags &= ~RVT_S_ACK_PENDING;
+ ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
+ ps->s_txreq->ss = NULL;
}
qp->s_rdma_ack_cnt++;
- ps->s_txreq->sde = priv->s_sde;
+ ps->s_txreq->sde = qpriv->s_sde;
ps->s_txreq->s_cur_size = len;
ps->s_txreq->hdr_dwords = hwords;
hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
@@ -349,6 +447,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
int middle = 0;
int delta;
struct tid_rdma_flow *flow = NULL;
+ struct tid_rdma_params *remote;
trace_hfi1_sender_make_rc_req(qp);
lockdep_assert_held(&qp->s_lock);
@@ -397,7 +496,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
goto done_free_tx;
}
- if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
+ if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
goto bail;
if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
@@ -569,6 +668,113 @@ no_flow_control:
qp->s_cur = 0;
break;
+ case IB_WR_TID_RDMA_WRITE:
+ if (newreq) {
+ /*
+ * Limit the number of TID RDMA WRITE requests.
+ */
+ if (atomic_read(&priv->n_tid_requests) >=
+ HFI1_TID_RDMA_WRITE_CNT)
+ goto bail;
+
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ }
+
+ hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
+ &bth1, &bth2,
+ &len);
+ ss = NULL;
+ if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
+ priv->s_tid_cur = qp->s_cur;
+ if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
+ priv->s_tid_tail = qp->s_cur;
+ priv->s_state = TID_OP(WRITE_RESP);
+ }
+ } else if (priv->s_tid_cur == priv->s_tid_head) {
+ struct rvt_swqe *__w;
+ struct tid_rdma_request *__r;
+
+ __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+ __r = wqe_to_tid_req(__w);
+
+ /*
+ * The s_tid_cur pointer is advanced to s_cur if
+ * any of the following conditions about the WQE
+			 * to which s_tid_cur currently points are
+ * satisfied:
+ * 1. The request is not a TID RDMA WRITE
+ * request,
+ * 2. The request is in the INACTIVE or
+ * COMPLETE states (TID RDMA READ requests
+ * stay at INACTIVE and TID RDMA WRITE
+ * transition to COMPLETE when done),
+ * 3. The request is in the ACTIVE or SYNC
+ * state and the number of completed
+ * segments is equal to the total segment
+ * count.
+ * (If ACTIVE, the request is waiting for
+ * ACKs. If SYNC, the request has not
+ * received any responses because it's
+ * waiting on a sync point.)
+ */
+ if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
+ __r->state == TID_REQUEST_INACTIVE ||
+ __r->state == TID_REQUEST_COMPLETE ||
+ ((__r->state == TID_REQUEST_ACTIVE ||
+ __r->state == TID_REQUEST_SYNC) &&
+ __r->comp_seg == __r->total_segs)) {
+ if (priv->s_tid_tail ==
+ priv->s_tid_cur &&
+ priv->s_state ==
+ TID_OP(WRITE_DATA_LAST)) {
+ priv->s_tid_tail = qp->s_cur;
+ priv->s_state =
+ TID_OP(WRITE_RESP);
+ }
+ priv->s_tid_cur = qp->s_cur;
+ }
+ /*
+ * A corner case: when the last TID RDMA WRITE
+ * request was completed, s_tid_head,
+ * s_tid_cur, and s_tid_tail all point to the
+ * same location. Other requests are posted and
+ * s_cur wraps around to the same location,
+ * where a new TID RDMA WRITE is posted. In
+ * this case, none of the indices need to be
+ * updated. However, the priv->s_state should.
+ */
+ if (priv->s_tid_tail == qp->s_cur &&
+ priv->s_state == TID_OP(WRITE_DATA_LAST))
+ priv->s_state = TID_OP(WRITE_RESP);
+ }
+ req = wqe_to_tid_req(wqe);
+ if (newreq) {
+ priv->s_tid_head = qp->s_cur;
+ priv->pending_tid_w_resp += req->total_segs;
+ atomic_inc(&priv->n_tid_requests);
+ atomic_dec(&priv->n_requests);
+ } else {
+ req->state = TID_REQUEST_RESEND;
+ req->comp_seg = delta_psn(bth2, wqe->psn);
+ /*
+ * Pull back any segments since we are going
+ * to re-receive them.
+ */
+ req->setup_head = req->clear_tail;
+ priv->pending_tid_w_resp +=
+ delta_psn(wqe->lpsn, bth2) + 1;
+ }
+
+ trace_hfi1_tid_write_sender_make_req(qp, newreq);
+ trace_hfi1_tid_req_make_req_write(qp, newreq,
+ wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
case IB_WR_RDMA_READ:
/*
* Don't allow more operations to be started
@@ -728,7 +934,8 @@ no_flow_control:
if (qp->s_tail >= qp->s_size)
qp->s_tail = 0;
}
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
qp->s_psn = wqe->lpsn + 1;
else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
qp->s_psn = req->s_next_psn;
@@ -848,6 +1055,35 @@ no_flow_control:
if (qp->s_cur == qp->s_size)
qp->s_cur = 0;
break;
+
+ case TID_OP(WRITE_RESP):
+ /*
+ * This value for s_state is used for restarting a TID RDMA
+ * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE
+ * for more).
+ */
+ req = wqe_to_tid_req(wqe);
+ req->state = TID_REQUEST_RESEND;
+ rcu_read_lock();
+ remote = rcu_dereference(priv->tid_rdma.remote);
+ req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
+ len = wqe->length - (req->comp_seg * remote->max_len);
+ rcu_read_unlock();
+
+ bth2 = mask_psn(qp->s_psn);
+ hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
+ &bth2, &len);
+ qp->s_psn = wqe->lpsn + 1;
+ ss = NULL;
+ qp->s_state = TID_OP(WRITE_REQ);
+ priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
+ priv->s_tid_cur = qp->s_cur;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
+
case TID_OP(READ_RESP):
if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
goto bail;
@@ -948,7 +1184,8 @@ no_flow_control:
}
qp->s_sending_hpsn = bth2;
delta = delta_psn(bth2, wqe->psn);
- if (delta && delta % HFI1_PSN_CREDIT == 0)
+ if (delta && delta % HFI1_PSN_CREDIT == 0 &&
+ wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
bth2 |= IB_BTH_REQ_ACK;
if (qp->s_flags & RVT_S_SEND_ONE) {
qp->s_flags &= ~RVT_S_SEND_ONE;
@@ -981,6 +1218,12 @@ bail:
bail_no_tx:
ps->s_txreq = NULL;
qp->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we didn't get a txreq, the QP will be woken up later to try
+ * again. Set the flags to indicate which work item to wake
+ * up.
+ */
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
return 0;
}
@@ -1268,6 +1511,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
lockdep_assert_held(&qp->s_lock);
qp->s_cur = n;
priv->pending_tid_r_segs = 0;
+ priv->pending_tid_w_resp = 0;
qp->s_num_rd_atomic = 0;
/*
@@ -1325,6 +1569,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
break;
+ case IB_WR_TID_RDMA_WRITE:
+ qp->s_state = TID_OP(WRITE_RESP);
+ break;
+
case IB_WR_RDMA_READ:
qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
break;
@@ -1389,6 +1637,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
wqe = do_rc_completion(qp, wqe, ibp);
qp->s_flags &= ~RVT_S_WAIT_ACK;
} else {
+ trace_hfi1_tid_write_sender_restart_rc(qp, 0);
if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
struct tid_rdma_request *req;
@@ -1418,7 +1667,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
- RVT_S_WAIT_ACK);
+ RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
if (wait)
qp->s_flags |= RVT_S_SEND_ONE;
reset_psn(qp, psn);
@@ -1426,7 +1675,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
/*
* Set qp->s_sending_psn to the next PSN after the given one.
- * This would be psn+1 except when RDMA reads are present.
+ * This would be psn+1 except when RDMA reads or TID RDMA ops
+ * are present.
*/
static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
{
@@ -1439,7 +1689,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
wqe = rvt_get_swqe_ptr(qp, n);
if (cmp_psn(psn, wqe->lpsn) <= 0) {
if (wqe->wr.opcode == IB_WR_RDMA_READ ||
- wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
qp->s_sending_psn = wqe->lpsn + 1;
else
qp->s_sending_psn = psn + 1;
@@ -1462,8 +1713,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
struct rvt_swqe *wqe;
struct ib_header *hdr = NULL;
struct hfi1_16b_header *hdr_16b = NULL;
- u32 opcode;
+ u32 opcode, head, tail;
u32 psn;
+ struct tid_rdma_request *req;
lockdep_assert_held(&qp->s_lock);
if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
@@ -1490,29 +1742,84 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
opcode = ib_bth_get_opcode(ohdr);
if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
- opcode == TID_OP(READ_RESP)) {
+ opcode == TID_OP(READ_RESP) ||
+ opcode == TID_OP(WRITE_RESP)) {
WARN_ON(!qp->s_rdma_ack_cnt);
qp->s_rdma_ack_cnt--;
return;
}
psn = ib_bth_get_psn(ohdr);
- reset_sending_psn(qp, psn);
+ /*
+ * Don't attempt to reset the sending PSN for packets in the
+ * KDETH PSN space since the PSN does not match anything.
+ */
+ if (opcode != TID_OP(WRITE_DATA) &&
+ opcode != TID_OP(WRITE_DATA_LAST) &&
+ opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
+ reset_sending_psn(qp, psn);
+
+ /* Handle TID RDMA WRITE packets differently */
+ if (opcode >= TID_OP(WRITE_REQ) &&
+ opcode <= TID_OP(WRITE_DATA_LAST)) {
+ head = priv->s_tid_head;
+ tail = priv->s_tid_cur;
+ /*
+		 * s_tid_cur is set to s_tid_head in the case where
+ * a new TID RDMA request is being started and all
+ * previous ones have been completed.
+ * Therefore, we need to do a secondary check in order
+ * to properly determine whether we should start the
+ * RC timer.
+ */
+ wqe = rvt_get_swqe_ptr(qp, tail);
+ req = wqe_to_tid_req(wqe);
+ if (head == tail && req->comp_seg < req->total_segs) {
+ if (tail == 0)
+ tail = qp->s_size - 1;
+ else
+ tail -= 1;
+ }
+ } else {
+ head = qp->s_tail;
+ tail = qp->s_acked;
+ }
/*
* Start timer after a packet requesting an ACK has been sent and
* there are still requests that haven't been acked.
*/
- if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+ if ((psn & IB_BTH_REQ_ACK) && tail != head &&
+ opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
+ opcode != TID_OP(RESYNC) &&
!(qp->s_flags &
- (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
- (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
if (opcode == TID_OP(READ_REQ))
rvt_add_retry_timer_ext(qp, priv->timeout_shift);
else
rvt_add_retry_timer(qp);
}
+ /* Start TID RDMA ACK timer */
+ if ((opcode == TID_OP(WRITE_DATA) ||
+ opcode == TID_OP(WRITE_DATA_LAST) ||
+ opcode == TID_OP(RESYNC)) &&
+ (psn & IB_BTH_REQ_ACK) &&
+ !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ /*
+ * The TID RDMA ACK packet could be received before this
+ * function is called. Therefore, add the timer only if TID
+ * RDMA ACK packets are actually pending.
+ */
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ req->ack_seg < req->cur_seg)
+ hfi1_add_tid_retry_timer(qp);
+ }
+
while (qp->s_last != qp->s_acked) {
u32 s_last;
@@ -1611,7 +1918,16 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
}
qp->s_retry = qp->s_retry_cnt;
- update_last_psn(qp, wqe->lpsn);
+ /*
+ * Don't update the last PSN if the request being completed is
+ * a TID RDMA WRITE request.
+	 * Completion of TID RDMA WRITE requests is done by the
+ * TID RDMA ACKs and as such could be for a request that has
+ * already been ACKed as far as the IB state machine is
+ * concerned.
+ */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ update_last_psn(qp, wqe->lpsn);
/*
* If we are completing a request which is in the process of
@@ -1641,6 +1957,54 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
return wqe;
}
+static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
+{
+ /* Retry this request. */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ }
+}
+
+/**
+ * update_qp_retry_state - Update qp retry state.
+ * @qp: the QP
+ * @psn: the packet sequence number of the TID RDMA WRITE RESP.
+ * @spsn: The start psn for the given TID RDMA WRITE swqe.
+ * @lpsn: The last psn for the given TID RDMA WRITE swqe.
+ *
+ * This function is called to update the qp retry state upon
+ * receiving a TID WRITE RESP after the qp is scheduled to retry
+ * a request.
+ */
+static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
+ u32 lpsn)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ qp->s_psn = psn + 1;
+ /*
+ * If this is the first TID RDMA WRITE RESP packet for the current
+ * request, change the s_state so that the retry will be processed
+ * correctly. Similarly, if this is the last TID RDMA WRITE RESP
+ * packet, change the s_state and advance the s_cur.
+ */
+ if (cmp_psn(psn, lpsn) >= 0) {
+ qp->s_cur = qpriv->s_tid_cur + 1;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ qp->s_state = TID_OP(WRITE_REQ);
+ } else if (!cmp_psn(psn, spsn)) {
+ qp->s_cur = qpriv->s_tid_cur;
+ qp->s_state = TID_OP(WRITE_RESP);
+ }
+}
+
/**
* do_rc_ack - process an incoming RC ACK
* @qp: the QP the ACK came in on
@@ -1662,6 +2026,7 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
int ret = 0;
u32 ack_psn;
int diff;
+ struct rvt_dev_info *rdi;
lockdep_assert_held(&qp->s_lock);
/*
@@ -1708,18 +2073,10 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
(opcode != TID_OP(READ_RESP) || diff != 0)) ||
((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
- (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
- /* Retry this request. */
- if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
- qp->r_flags |= RVT_R_RDMAR_SEQ;
- hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
- if (list_empty(&qp->rspwait)) {
- qp->r_flags |= RVT_R_RSP_SEND;
- rvt_get_qp(qp);
- list_add_tail(&qp->rspwait,
- &rcd->qp_wait_list);
- }
- }
+ (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
+ (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ (delta_psn(psn, qp->s_last_psn) != 1))) {
+ set_restart_qp(qp, rcd);
/*
* No need to process the ACK/NAK since we are
* restarting an earlier request.
@@ -1751,6 +2108,14 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
hfi1_schedule_send(qp);
}
}
+
+ /*
+ * TID RDMA WRITE requests will be completed by the TID RDMA
+ * ACK packet handler (see tid_rdma.c).
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+
wqe = do_rc_completion(qp, wqe, ibp);
if (qp->s_acked == qp->s_tail)
break;
@@ -1768,17 +2133,60 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
else
rvt_stop_rc_timers(qp);
} else if (qp->s_acked != qp->s_tail) {
+ struct rvt_swqe *__w = NULL;
+
+ if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
+ __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+
/*
- * We are expecting more ACKs so
- * mod the retry timer.
- */
- rvt_mod_retry_timer(qp);
- /*
- * We can stop re-sending the earlier packets and
- * continue with the next packet the receiver wants.
+ * Stop timers if we've received all of the TID RDMA
+		 * WRITE responses.
*/
- if (cmp_psn(qp->s_psn, psn) <= 0)
- reset_psn(qp, psn + 1);
+ if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ opcode == TID_OP(WRITE_RESP)) {
+ /*
+ * Normally, the loop above would correctly
+ * process all WQEs from s_acked onward and
+ * either complete them or check for correct
+ * PSN sequencing.
+ * However, for TID RDMA, due to pipelining,
+ * the response may not be for the request at
+			 * s_acked so the above loop would just be
+ * skipped. This does not allow for checking
+ * the PSN sequencing. It has to be done
+ * separately.
+ */
+ if (cmp_psn(psn, qp->s_last_psn + 1)) {
+ set_restart_qp(qp, rcd);
+ goto bail_stop;
+ }
+ /*
+ * If the psn is being resent, stop the
+ * resending.
+ */
+ if (qp->s_cur != qp->s_tail &&
+ cmp_psn(qp->s_psn, psn) <= 0)
+ update_qp_retry_state(qp, psn,
+ __w->psn,
+ __w->lpsn);
+ else if (--qpriv->pending_tid_w_resp)
+ rvt_mod_retry_timer(qp);
+ else
+ rvt_stop_rc_timers(qp);
+ } else {
+ /*
+ * We are expecting more ACKs so
+ * mod the retry timer.
+ */
+ rvt_mod_retry_timer(qp);
+ /*
+ * We can stop re-sending the earlier packets
+ * and continue with the next packet the
+ * receiver wants.
+ */
+ if (cmp_psn(qp->s_psn, psn) <= 0)
+ reset_psn(qp, psn + 1);
+ }
} else {
/* No more acks - kill all timers */
rvt_stop_rc_timers(qp);
@@ -1794,6 +2202,15 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
rvt_get_credit(qp, aeth);
qp->s_rnr_retry = qp->s_rnr_retry_cnt;
qp->s_retry = qp->s_retry_cnt;
+ /*
+ * If the current request is a TID RDMA WRITE request and the
+ * response is not a TID RDMA WRITE RESP packet, s_last_psn
+ * can't be advanced.
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ opcode != TID_OP(WRITE_RESP) &&
+ cmp_psn(psn, wqe->psn) >= 0)
+ return 1;
update_last_psn(qp, psn);
return 1;
@@ -1803,20 +2220,31 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
goto bail_stop;
if (qp->s_flags & RVT_S_WAIT_RNR)
goto bail_stop;
- if (qp->s_rnr_retry == 0) {
+ rdi = ib_to_rvt(qp->ibqp.device);
+ if (qp->s_rnr_retry == 0 &&
+ !((rdi->post_parms[wqe->wr.opcode].flags &
+ RVT_OPERATION_IGN_RNR_CNT) &&
+ qp->s_rnr_retry_cnt == 0)) {
status = IB_WC_RNR_RETRY_EXC_ERR;
goto class_b;
}
- if (qp->s_rnr_retry_cnt < 7)
+ if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
qp->s_rnr_retry--;
- /* The last valid PSN is the previous PSN. */
- update_last_psn(qp, psn - 1);
+ /*
+ * The last valid PSN is the previous PSN. For TID RDMA WRITE
+ * request, s_last_psn should be incremented only when a TID
+ * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
+ * WRITE RESP packets.
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ reset_psn(qp, qp->s_last_psn + 1);
+ } else {
+ update_last_psn(qp, psn - 1);
+ reset_psn(qp, psn);
+ }
ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
-
- reset_psn(qp, psn);
-
qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
rvt_stop_rc_timers(qp);
rvt_add_rnr_timer(qp, aeth);
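
The net effect of the RVT_OPERATION_IGN_RNR_CNT test above: for IB_WR_TID_RDMA_WRITE, which sets that flag in hfi1_post_parms[], an RNR retry count of 0 now means "retry indefinitely" instead of "fail on the first RNR NAK" (7 keeps its usual "infinite" meaning). A condensed sketch of that decision, using the same rvt fields as do_rc_ack() (helper name made up):

static bool rnr_retries_exhausted(struct rvt_qp *qp, struct rvt_dev_info *rdi,
				  u8 wr_opcode)
{
	/* retry_cnt == 0 with IGN_RNR_CNT set -> never give up */
	if ((rdi->post_parms[wr_opcode].flags & RVT_OPERATION_IGN_RNR_CNT) &&
	    qp->s_rnr_retry_cnt == 0)
		return false;
	/* otherwise give up once the per-QP retry budget hits zero */
	return qp->s_rnr_retry == 0;
}
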
@@ -1901,6 +2329,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
while (cmp_psn(psn, wqe->lpsn) > 0) {
if (wqe->wr.opcode == IB_WR_RDMA_READ ||
wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
break;
@@ -2235,6 +2664,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
e->psn = psn;
if (old_req)
goto unlock_done;
+ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = prev;
qp->s_tail_ack_queue = prev;
break;
}
@@ -2248,6 +2679,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
*/
if (!e || e->opcode != (u8)opcode || old_req)
goto unlock_done;
+ if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+ qp->s_acked_ack_queue = prev;
qp->s_tail_ack_queue = prev;
break;
}
@@ -2274,6 +2707,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
* Resend the RDMA read or atomic op which
* ACKs this duplicate request.
*/
+ if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+ qp->s_acked_ack_queue = mra;
qp->s_tail_ack_queue = mra;
break;
}
@@ -2388,6 +2823,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
void *data = packet->payload;
u32 tlen = packet->tlen;
struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
struct hfi1_ibport *ibp = rcd_to_iport(rcd);
struct ib_other_headers *ohdr = packet->ohdr;
u32 opcode = packet->opcode;
@@ -2646,7 +3082,7 @@ send_last:
if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
- if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (unlikely(next == qp->s_acked_ack_queue)) {
if (!qp->s_ack_queue[next].sent)
goto nack_inv_unlck;
update_ack_queue(qp, next);
@@ -2693,6 +3129,7 @@ send_last:
qp->r_state = opcode;
qp->r_nak_state = 0;
qp->r_head_ack_queue = next;
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
/* Schedule the send engine. */
qp->s_flags |= RVT_S_RESP_PENDING;
@@ -2723,7 +3160,7 @@ send_last:
if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
- if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (unlikely(next == qp->s_acked_ack_queue)) {
if (!qp->s_ack_queue[next].sent)
goto nack_inv_unlck;
update_ack_queue(qp, next);
@@ -2766,6 +3203,7 @@ ack:
qp->r_state = opcode;
qp->r_nak_state = 0;
qp->r_head_ack_queue = next;
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
/* Schedule the send engine. */
qp->s_flags |= RVT_S_RESP_PENDING;
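
Several hunks above pair qp->s_tail_ack_queue with the new qp->s_acked_ack_queue: s_tail_ack_queue still points at the entry whose response is being built, while s_acked_ack_queue trails it and only advances once the entry has no unacknowledged TID RDMA state left. Every place that rewinds the tail therefore drags the acked pointer back with it when the two are equal; in sketch form (same rvt_qp fields as above, helper name made up):

static void rewind_tail_ack_queue(struct rvt_qp *qp, u8 new_tail)
{
	/* Never let s_acked_ack_queue run ahead of s_tail_ack_queue. */
	if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
		qp->s_acked_ack_queue = new_tail;
	qp->s_tail_ack_queue = new_tail;
}
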
diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h
index 4329eadcb3df..8e0935b9bf2a 100644
--- a/drivers/infiniband/hw/hfi1/rc.h
+++ b/drivers/infiniband/hw/hfi1/rc.h
@@ -18,6 +18,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n)
if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
qp->s_tail_ack_queue = next;
+ qp->s_acked_ack_queue = next;
qp->s_ack_state = OP(ACKNOWLEDGE);
}
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index f96c0f544cb0..124a3ec1e15c 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -453,11 +453,13 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
#define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */
/**
- * schedule_send_yield - test for a yield required for QP send engine
+ * hfi1_schedule_send_yield - test for a yield required for QP
+ * send engine
* @timeout: Final time for timeout slice for jiffies
* @qp: a pointer to QP
 * @ps: a pointer to a structure with commonly looked-up values for
 *      the send engine progress
+ * @tid: true if it is the tid leg
*
* This routine checks if the time slice for the QP has expired
* for RC QPs, if so an additional work entry is queued. At this
@@ -465,8 +467,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
* returns true if a yield is required, otherwise, false
* is returned.
*/
-static bool schedule_send_yield(struct rvt_qp *qp,
- struct hfi1_pkt_state *ps)
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ bool tid)
{
ps->pkts_sent = true;
@@ -474,8 +476,24 @@ static bool schedule_send_yield(struct rvt_qp *qp,
if (!ps->in_thread ||
workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
spin_lock_irqsave(&qp->s_lock, ps->flags);
- qp->s_flags &= ~RVT_S_BUSY;
- hfi1_schedule_send(qp);
+ if (!tid) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ hfi1_schedule_send(qp);
+ } else {
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (priv->s_flags &
+ HFI1_S_TID_BUSY_SET) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ priv->s_flags &=
+ ~(HFI1_S_TID_BUSY_SET |
+ RVT_S_BUSY);
+ } else {
+ priv->s_flags &= ~RVT_S_BUSY;
+ }
+ hfi1_schedule_tid_send(qp);
+ }
+
spin_unlock_irqrestore(&qp->s_lock, ps->flags);
this_cpu_inc(*ps->ppd->dd->send_schedule);
trace_hfi1_rc_expired_time_slice(qp, true);
@@ -576,6 +594,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
do {
/* Check for a constructed packet to be sent. */
if (ps.s_txreq) {
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET)
+ qp->s_flags |= RVT_S_BUSY;
spin_unlock_irqrestore(&qp->s_lock, ps.flags);
/*
* If the packet cannot be sent now, return and
@@ -585,7 +605,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
return;
/* allow other tasks to run */
- if (schedule_send_yield(qp, &ps))
+ if (hfi1_schedule_send_yield(qp, &ps, false))
return;
spin_lock_irqsave(&qp->s_lock, ps.flags);
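
hfi1_schedule_send_yield() is now shared by both legs: hfi1_do_send() above passes tid == false, and the TID leg (hfi1_do_tid_send() in tid_rdma.c) is expected to pass tid == true from an analogous loop. A minimal sketch of such a caller, under that assumption (not the patch's code):

static void tid_send_loop(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
{
	do {
		/* ... build and send one TID RDMA packet ... */

		/*
		 * Yield if the time slice expired; tid == true makes the
		 * helper clear the TID-leg busy flags and reschedule the
		 * TID work item rather than the IB leg.
		 */
		if (hfi1_schedule_send_yield(qp, ps, true))
			return;
	} while (0 /* more TID work pending */);
}
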
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 96897a91fb0a..b0110728f541 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -1747,10 +1747,9 @@ retry:
*/
static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
{
- struct iowait *wait, *nw;
+ struct iowait *wait, *nw, *twait;
struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
- uint i, n = 0, seq, max_idx = 0;
- u8 max_starved_cnt = 0;
+ uint i, n = 0, seq, tidx = 0;
#ifdef CONFIG_SDMA_VERBOSITY
dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
@@ -1775,13 +1774,20 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
continue;
if (n == ARRAY_SIZE(waits))
break;
+ iowait_init_priority(wait);
num_desc = iowait_get_all_desc(wait);
if (num_desc > avail)
break;
avail -= num_desc;
- /* Find the most starved wait memeber */
- iowait_starve_find_max(wait, &max_starved_cnt,
- n, &max_idx);
+		/* Find the top-priority wait member */
+ if (n) {
+ twait = waits[tidx];
+ tidx =
+ iowait_priority_update_top(wait,
+ twait,
+ n,
+ tidx);
+ }
list_del_init(&wait->list);
waits[n++] = wait;
}
@@ -1790,12 +1796,12 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
}
} while (read_seqretry(&sde->waitlock, seq));
- /* Schedule the most starved one first */
+ /* Schedule the top-priority entry first */
if (n)
- waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON);
+ waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON);
for (i = 0; i < n; i++)
- if (i != max_idx)
+ if (i != tidx)
waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
}
diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h
index bf7d777d756e..514a4784566b 100644
--- a/drivers/infiniband/hw/hfi1/sdma_txreq.h
+++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h
@@ -91,6 +91,7 @@ struct sdma_desc {
#define SDMA_TXREQ_F_URGENT 0x0001
#define SDMA_TXREQ_F_AHG_COPY 0x0002
#define SDMA_TXREQ_F_USE_AHG 0x0004
+#define SDMA_TXREQ_F_VIP 0x0010
struct sdma_txreq;
typedef void (*callback_t)(struct sdma_txreq *, int);
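
SDMA_TXREQ_F_VIP is what ties the txreq layer to the new iowait priority scheme: a producer marks a request as "VIP", and iowait_get_desc()/iowait_update_priority() bump the owning iowait's priority whenever such a request sits at the head of its tx list. make_rc_ack() above does exactly this for ACK packets; in sketch form (helper name made up):

/* Sketch: flag an outgoing ACK so its QP jumps the wait queue if it
 * later has to block for SDMA descriptors (see make_rc_ack() above).
 */
static void mark_ack_as_vip(struct verbs_txreq *tx)
{
	tx->txreq.flags |= SDMA_TXREQ_F_VIP;
}
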
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 0ee79403acaf..bc2ff83026f7 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -109,12 +109,25 @@ static u32 mask_generation(u32 a)
* C - Capcode
*/
+static u32 tid_rdma_flow_wt;
+
static void tid_rdma_trigger_resume(struct work_struct *work);
static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
gfp_t gfp);
static void hfi1_init_trdma_req(struct rvt_qp *qp,
struct tid_rdma_request *req);
+static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
+static void hfi1_tid_timeout(struct timer_list *t);
+static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
+static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
+static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
+static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
+static void hfi1_tid_retry_timeout(struct timer_list *t);
+static int make_tid_rdma_ack(struct rvt_qp *qp,
+ struct ib_other_headers *ohdr,
+ struct hfi1_pkt_state *ps);
+static void hfi1_do_tid_send(struct rvt_qp *qp);
static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
{
@@ -313,6 +326,19 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
+ qpriv->s_state = TID_OP(WRITE_RESP);
+ qpriv->s_tid_cur = HFI1_QP_WQE_INVALID;
+ qpriv->s_tid_head = HFI1_QP_WQE_INVALID;
+ qpriv->s_tid_tail = HFI1_QP_WQE_INVALID;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qpriv->r_tid_head = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
+ atomic_set(&qpriv->n_requests, 0);
+ atomic_set(&qpriv->n_tid_requests, 0);
+ timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0);
+ timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0);
INIT_LIST_HEAD(&qpriv->tid_wait);
if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
@@ -1959,6 +1985,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
{
struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_qp_priv *qpriv = qp->priv;
struct rvt_ack_entry *e;
struct tid_rdma_request *req;
unsigned long flags;
@@ -1982,7 +2010,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
spin_lock_irqsave(&qp->s_lock, flags);
e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
- if (!e || e->opcode != TID_OP(READ_REQ))
+ if (!e || (e->opcode != TID_OP(READ_REQ) &&
+ e->opcode != TID_OP(WRITE_REQ)))
goto unlock;
req = ack_to_tid_req(e);
@@ -2042,8 +2071,119 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
*/
if (old_req)
goto unlock;
+ } else {
+ struct flow_state *fstate;
+ bool schedule = false;
+ u8 i;
+
+ if (req->state == TID_REQUEST_RESEND) {
+ req->state = TID_REQUEST_RESEND_ACTIVE;
+ } else if (req->state == TID_REQUEST_INIT_RESEND) {
+ req->state = TID_REQUEST_INIT;
+ schedule = true;
+ }
+
+ /*
+ * True if the request is already scheduled (between
+ * qp->s_tail_ack_queue and qp->r_head_ack_queue).
+ * Also, don't change requests, which are at the SYNC
+ * point and haven't generated any responses yet.
+ * There is nothing to retransmit for them yet.
+ */
+ if (old_req || req->state == TID_REQUEST_INIT ||
+ (req->state == TID_REQUEST_SYNC && !req->cur_seg)) {
+ for (i = prev + 1; ; i++) {
+ if (i > rvt_size_atomic(&dev->rdi))
+ i = 0;
+ if (i == qp->r_head_ack_queue)
+ break;
+ e = &qp->s_ack_queue[i];
+ req = ack_to_tid_req(e);
+ if (e->opcode == TID_OP(WRITE_REQ) &&
+ req->state == TID_REQUEST_INIT)
+ req->state = TID_REQUEST_INIT_RESEND;
+ }
+ /*
+ * If the state of the request has been changed,
+ * the first leg needs to get scheduled in order to
+ * pick up the change. Otherwise, normal response
+ * processing should take care of it.
+ */
+ if (!schedule)
+ goto unlock;
+ }
+
+ /*
+ * If there is no more allocated segment, just schedule the qp
+ * without changing any state.
+ */
+ if (req->clear_tail == req->setup_head)
+ goto schedule;
+ /*
+ * If this request has sent responses for segments, which have
+ * not received data yet (flow_idx != clear_tail), the flow_idx
+ * pointer needs to be adjusted so the same responses can be
+ * re-sent.
+ */
+ if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) {
+ fstate = &req->flows[req->clear_tail].flow_state;
+ qpriv->pending_tid_w_segs -=
+ CIRC_CNT(req->flow_idx, req->clear_tail,
+ MAX_FLOWS);
+ req->flow_idx =
+ CIRC_ADD(req->clear_tail,
+ delta_psn(psn, fstate->resp_ib_psn),
+ MAX_FLOWS);
+ qpriv->pending_tid_w_segs +=
+ delta_psn(psn, fstate->resp_ib_psn);
+ /*
+ * When flow_idx == setup_head, we've gotten a duplicate
+ * request for a segment, which has not been allocated
+ * yet. In that case, don't adjust this request.
+ * However, we still want to go through the loop below
+ * to adjust all subsequent requests.
+ */
+ if (CIRC_CNT(req->setup_head, req->flow_idx,
+ MAX_FLOWS)) {
+ req->cur_seg = delta_psn(psn, e->psn);
+ req->state = TID_REQUEST_RESEND_ACTIVE;
+ }
+ }
+
+ for (i = prev + 1; ; i++) {
+ /*
+ * Look at everything up to and including
+ * s_tail_ack_queue
+ */
+ if (i > rvt_size_atomic(&dev->rdi))
+ i = 0;
+ if (i == qp->r_head_ack_queue)
+ break;
+ e = &qp->s_ack_queue[i];
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ if (e->opcode != TID_OP(WRITE_REQ) ||
+ req->cur_seg == req->comp_seg ||
+ req->state == TID_REQUEST_INIT ||
+ req->state == TID_REQUEST_INIT_RESEND) {
+ if (req->state == TID_REQUEST_INIT)
+ req->state = TID_REQUEST_INIT_RESEND;
+ continue;
+ }
+ qpriv->pending_tid_w_segs -=
+ CIRC_CNT(req->flow_idx,
+ req->clear_tail,
+ MAX_FLOWS);
+ req->flow_idx = req->clear_tail;
+ req->state = TID_REQUEST_RESEND;
+ req->cur_seg = req->comp_seg;
+ }
+ qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
}
/* Re-process old requests.*/
+ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = prev;
qp->s_tail_ack_queue = prev;
/*
* Since the qp->s_tail_ack_queue is modified, the
@@ -2052,6 +2192,18 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
* wrong memory region.
*/
qp->s_ack_state = OP(ACKNOWLEDGE);
+schedule:
+ /*
+ * It's possible to receive a retry psn that is earlier than an RNRNAK
+ * psn. In this case, the rnrnak state should be cleared.
+ */
+ if (qpriv->rnr_nak_state) {
+ qp->s_nak_state = 0;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qp->r_psn = e->lpsn + 1;
+ hfi1_tid_write_alloc_resources(qp, true);
+ }
+
qp->r_state = e->opcode;
qp->r_nak_state = 0;
qp->s_flags |= RVT_S_RESP_PENDING;
@@ -2162,6 +2314,14 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
qp->r_head_ack_queue = next;
+ /*
+ * For all requests other than TID WRITE which are added to the ack
+ * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to
+ * do this because of interlocks between these and TID WRITE
+ * requests. The same change has also been made in hfi1_rc_rcv().
+ */
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
+
/* Schedule the send tasklet. */
qp->s_flags |= RVT_S_RESP_PENDING;
hfi1_schedule_send(qp);
@@ -2418,13 +2578,32 @@ static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
u8 opcode)
{
struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
u32 ipsn;
struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ u32 i;
if (rcv_type >= RHF_RCV_TYPE_IB)
goto done;
spin_lock(&qp->s_lock);
+
+ /*
+	 * We've run out of space in the eager buffer.
+ * Eagerly received KDETH packets which require space in the
+	 * Eager buffer (packets that have payload) are TID RDMA WRITE
+ * response packets. In this case, we have to re-transmit the
+ * TID RDMA WRITE request.
+ */
+ if (rcv_type == RHF_RCV_TYPE_EAGER) {
+ hfi1_restart_rc(qp, qp->s_last_psn + 1, 1);
+ hfi1_schedule_send(qp);
+ goto done_unlock;
+ }
+
/*
* For TID READ response, error out QP after freeing the tid
* resources.
@@ -2438,8 +2617,25 @@ static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
goto done;
}
+ goto done_unlock;
}
+ /*
+ * Error out the qp for TID RDMA WRITE
+ */
+ hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ e = &qp->s_ack_queue[i];
+ if (e->opcode == TID_OP(WRITE_REQ)) {
+ req = ack_to_tid_req(e);
+ hfi1_kern_exp_rcv_clear_all(req);
+ }
+ }
+ spin_unlock(&qp->s_lock);
+ rvt_rc_error(qp, IB_WC_LOC_LEN_ERR);
+ goto done;
+
+done_unlock:
spin_unlock(&qp->s_lock);
done:
return true;
@@ -2689,8 +2885,12 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
u8 opcode;
u32 qp_num, psn, ibpsn;
struct rvt_qp *qp;
+ struct hfi1_qp_priv *qpriv;
unsigned long flags;
bool ret = true;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
packet->rhf);
@@ -2749,14 +2949,116 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
ibpsn = mask_psn(ibpsn);
ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
ibpsn);
+ goto r_unlock;
+ }
+
+ /*
+ * qp->s_tail_ack_queue points to the rvt_ack_entry currently being
+	 * processed. These are completed sequentially, so we can be sure that
+ * the pointer will not change until the entire request has completed.
+ */
+ spin_lock(&qp->s_lock);
+ qpriv = qp->priv;
+ e = &qp->s_ack_queue[qpriv->r_tid_tail];
+ req = ack_to_tid_req(e);
+ flow = &req->flows[req->clear_tail];
+ trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn);
+ trace_hfi1_rsp_handle_kdeth_eflags(qp, psn);
+ trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp);
+ trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow);
+
+ switch (rcv_type) {
+ case RHF_RCV_TYPE_EXPECTED:
+ switch (rte) {
+ case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
+ if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
+ u64 reg;
+
+ qpriv->s_flags |= HFI1_R_TID_SW_PSN;
+ /*
+ * The only sane way to get the amount of
+ * progress is to read the HW flow state.
+ */
+ reg = read_uctxt_csr(dd, rcd->ctxt,
+ RCV_TID_FLOW_TABLE +
+ (8 * flow->idx));
+ flow->flow_state.r_next_psn = mask_psn(reg);
+ qpriv->r_next_psn_kdeth =
+ flow->flow_state.r_next_psn;
+ goto nak_psn;
+ } else {
+ /*
+ * If the received PSN does not match the next
+ * expected PSN, NAK the packet.
+				 * However, only do that if we know that a
+ * NAK has already been sent. Otherwise, this
+ * mismatch could be due to packets that were
+ * already in flight.
+ */
+ if (psn != flow->flow_state.r_next_psn) {
+ psn = flow->flow_state.r_next_psn;
+ goto nak_psn;
+ }
+
+ qpriv->s_nak_state = 0;
+ /*
+ * If SW PSN verification is successful and this
+ * is the last packet in the segment, tell the
+ * caller to process it as a normal packet.
+ */
+ if (psn == full_flow_psn(flow,
+ flow->flow_state.lpsn))
+ ret = false;
+ qpriv->r_next_psn_kdeth =
+ ++flow->flow_state.r_next_psn;
+ }
+ break;
+
+ case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
+ goto nak_psn;
+
+ default:
+ break;
+ }
+ break;
+
+ case RHF_RCV_TYPE_ERROR:
+ switch (rte) {
+ case RHF_RTE_ERROR_OP_CODE_ERR:
+ case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
+ case RHF_RTE_ERROR_KHDR_HCRC_ERR:
+ case RHF_RTE_ERROR_KHDR_KVER_ERR:
+ case RHF_RTE_ERROR_CONTEXT_ERR:
+ case RHF_RTE_ERROR_KHDR_TID_ERR:
+ default:
+ break;
+ }
+ default:
+ break;
}
+unlock:
+ spin_unlock(&qp->s_lock);
r_unlock:
spin_unlock_irqrestore(&qp->r_lock, flags);
rcu_unlock:
rcu_read_unlock();
drop:
return ret;
+nak_psn:
+ ibp->rvp.n_rc_seqnak++;
+ if (!qpriv->s_nak_state) {
+ qpriv->s_nak_state = IB_NAK_PSN_ERROR;
+ /* We are NAK'ing the next expected PSN */
+ qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn);
+ qpriv->s_flags |= RVT_S_ACK_PENDING;
+ if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID)
+ qpriv->r_tid_ack = qpriv->r_tid_tail;
+ hfi1_schedule_tid_send(qp);
+ }
+ goto unlock;
}
/*
@@ -2770,8 +3072,9 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
{
struct tid_rdma_request *req = wqe_to_tid_req(wqe);
struct tid_rdma_flow *flow;
- int diff;
- u32 tididx = 0;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ int diff, delta_pkts;
+ u32 tididx = 0, i;
u16 fidx;
if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
@@ -2787,11 +3090,20 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
return;
}
} else {
- return;
+ fidx = req->acked_tail;
+ flow = &req->flows[fidx];
+ *bth2 = mask_psn(req->r_ack_psn);
}
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn);
+ else
+ delta_pkts = delta_psn(*bth2,
+ full_flow_psn(flow,
+ flow->flow_state.spsn));
+
trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
- diff = delta_psn(*bth2, flow->flow_state.ib_spsn);
+ diff = delta_pkts + flow->resync_npkts;
flow->sent = 0;
flow->pkt = 0;
@@ -2815,6 +3127,18 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
break;
}
}
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) +
+ flow->sent, 0);
+ /*
+ * Packet PSN is based on flow_state.spsn + flow->pkt. However,
+ * during a RESYNC, the generation is incremented and the
+ * sequence is reset to 0. Since we've adjusted the npkts in the
+ * flow and the SGE has been sufficiently advanced, we have to
+ * adjust flow->pkt in order to calculate the correct PSN.
+ */
+ flow->pkt -= flow->resync_npkts;
+ }
if (flow->tid_offset ==
EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
@@ -2822,13 +3146,42 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
flow->tid_offset = 0;
}
flow->tid_idx = tididx;
- /* Move flow_idx to correct index */
- req->flow_idx = fidx;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ /* Move flow_idx to correct index */
+ req->flow_idx = fidx;
+ else
+ req->clear_tail = fidx;
trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
wqe->lpsn, req);
req->state = TID_REQUEST_ACTIVE;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ /* Reset all the flows that we are going to resend */
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS);
+ i = qpriv->s_tid_tail;
+ do {
+ for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS);
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
+ req->flows[fidx].sent = 0;
+ req->flows[fidx].pkt = 0;
+ req->flows[fidx].tid_idx = 0;
+ req->flows[fidx].tid_offset = 0;
+ req->flows[fidx].resync_npkts = 0;
+ }
+ if (i == qpriv->s_tid_cur)
+ break;
+ do {
+ i = (++i == qp->s_size ? 0 : i);
+ wqe = rvt_get_swqe_ptr(qp, i);
+ } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE);
+ req = wqe_to_tid_req(wqe);
+ req->cur_seg = req->ack_seg;
+ fidx = req->acked_tail;
+ /* Pull req->clear_tail back */
+ req->clear_tail = fidx;
+ } while (1);
+ }
}
void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
@@ -2862,6 +3215,20 @@ void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
} while (!ret);
}
+ for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) {
+ struct rvt_ack_entry *e = &qp->s_ack_queue[i];
+
+ if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device)))
+ i = 0;
+ /* Free only locally allocated TID entries */
+ if (e->opcode != TID_OP(WRITE_REQ))
+ continue;
+ do {
+ struct hfi1_ack_priv *priv = e->priv;
+
+ ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
+ } while (!ret);
+ }
}
bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
@@ -2869,6 +3236,7 @@ bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
struct rvt_swqe *prev;
struct hfi1_qp_priv *priv = qp->priv;
u32 s_prev;
+ struct tid_rdma_request *req;
s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
prev = rvt_get_swqe_ptr(qp, s_prev);
@@ -2880,14 +3248,28 @@ bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
case IB_WR_ATOMIC_CMP_AND_SWP:
case IB_WR_ATOMIC_FETCH_AND_ADD:
case IB_WR_RDMA_WRITE:
+ switch (prev->wr.opcode) {
+ case IB_WR_TID_RDMA_WRITE:
+ req = wqe_to_tid_req(prev);
+ if (req->ack_seg != req->total_segs)
+ goto interlock;
+ default:
+ break;
+ }
case IB_WR_RDMA_READ:
- break;
+ if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ break;
+ /* fall through */
case IB_WR_TID_RDMA_READ:
switch (prev->wr.opcode) {
case IB_WR_RDMA_READ:
if (qp->s_acked != qp->s_cur)
goto interlock;
break;
+ case IB_WR_TID_RDMA_WRITE:
+ req = wqe_to_tid_req(prev);
+ if (req->ack_seg != req->total_segs)
+ goto interlock;
default:
break;
}
@@ -2946,6 +3328,18 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
new_opcode = IB_WR_TID_RDMA_READ;
do_tid_rdma = true;
}
+ } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+ /*
+ * TID RDMA is enabled for this RDMA WRITE request iff:
+ * 1. The remote address is page-aligned,
+ * 2. The length is larger than the minimum segment size,
+ * 3. The length is page-multiple.
+ */
+ if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) &&
+ !(wqe->length & ~PAGE_MASK)) {
+ new_opcode = IB_WR_TID_RDMA_WRITE;
+ do_tid_rdma = true;
+ }
}
if (do_tid_rdma) {
@@ -2962,12 +3356,22 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
priv->tid_req.n_flows = remote->max_read;
qpriv->tid_r_reqs++;
wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
+ } else {
+ wqe->lpsn += priv->tid_req.total_segs - 1;
+ atomic_inc(&qpriv->n_requests);
}
priv->tid_req.cur_seg = 0;
priv->tid_req.comp_seg = 0;
priv->tid_req.ack_seg = 0;
priv->tid_req.state = TID_REQUEST_INACTIVE;
+ /*
+ * Reset acked_tail.
+ * TID RDMA READ does not have ACKs so it does not
+ * update the pointer. We have to reset it so TID RDMA
+ * WRITE does not get confused.
+ */
+ priv->tid_req.acked_tail = priv->tid_req.setup_head;
trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
wqe->psn, wqe->lpsn,
&priv->tid_req);
@@ -2975,3 +3379,2087 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
exit:
rcu_read_unlock();
}
+
+/* TID RDMA WRITE functions */
+
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_params *remote;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ /*
+	 * Set the number of flows to be used based on negotiated
+ * parameters.
+ */
+ req->n_flows = remote->max_write;
+ req->state = TID_REQUEST_ACTIVE;
+
+ KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1);
+ KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.w_req.reth.vaddr =
+ cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len));
+ ohdr->u.tid_rdma.w_req.reth.rkey =
+ cpu_to_be32(wqe->rdma_wr.rkey);
+ ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
+ ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 &= ~RVT_QPN_MASK;
+ *bth1 |= remote->qp;
+ qp->s_state = TID_OP(WRITE_REQ);
+ qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+ *bth2 |= IB_BTH_REQ_ACK;
+ *len = 0;
+
+ rcu_read_unlock();
+ return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
+}
+
+void hfi1_compute_tid_rdma_flow_wt(void)
+{
+ /*
+ * Heuristic for computing the RNR timeout when waiting on the flow
+	 * queue. Rather than a computationally expensive exact estimate of when
+ * a flow will be available, we assume that if a QP is at position N in
+ * the flow queue it has to wait approximately (N + 1) * (number of
+ * segments between two sync points), assuming PMTU of 4K. The rationale
+ * for this is that flows are released and recycled at each sync point.
+ */
+ tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) /
+ TID_RDMA_MAX_SEGMENT_SIZE;
+}
+
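+/*
+ * Approximate position of this QP in the TID wait queue: its enqueue
+ * number minus the number of entries dequeued so far.
+ */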
+static u32 position_in_queue(struct hfi1_qp_priv *qpriv,
+ struct tid_queue *queue)
+{
+ return qpriv->tid_enqueue - queue->dequeue;
+}
+
+/*
+ * @qp: points to rvt_qp context.
+ * @to_seg: desired RNR timeout in segments.
+ * Return: index of the next highest timeout in the ib_hfi1_rnr_table[]
+ */
+static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u64 timeout;
+ u32 bytes_per_us;
+ u8 i;
+
+ bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8;
+ timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us;
+ /*
+	 * Find the next value in the RNR table at or above the required
+ * timeout. This gives the responder some padding.
+ */
+ for (i = 1; i <= IB_AETH_CREDIT_MASK; i++)
+ if (rvt_rnr_tbl_to_usec(i) >= timeout)
+ return i;
+ return 0;
+}
+
+/**
+ * Central place for resource allocation at the TID write responder.
+ * It is called from the write_req and write_data interrupt handlers as
+ * well as the send thread when a queued QP is scheduled for
+ * resource allocation.
+ *
+ * Iterates over (a) segments of a request and then (b) queued requests
+ * themselves to allocate resources for up to local->max_write
+ * segments across multiple requests. Stop allocating when we
+ * hit a sync point; resume allocating after the data packets at the
+ * sync point have been received.
+ *
+ * Resource allocation and the sending of responses are decoupled. The
+ * request/segment which are being allocated and sent are as follows.
+ * Resources are allocated for:
+ * [request: qpriv->r_tid_alloc, segment: req->alloc_seg]
+ * The send thread sends:
+ * [request: qp->s_tail_ack_queue, segment: req->cur_seg]
+ */
+static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
+{
+ struct tid_rdma_request *req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = qpriv->rcd;
+ struct tid_rdma_params *local = &qpriv->tid_rdma.local;
+ struct rvt_ack_entry *e;
+ u32 npkts, to_seg;
+ bool last;
+ int ret = 0;
+
+ lockdep_assert_held(&qp->s_lock);
+
+ while (1) {
+ trace_hfi1_rsp_tid_write_alloc_res(qp, 0);
+ trace_hfi1_tid_write_rsp_alloc_res(qp);
+ /*
+		 * Don't allocate more segments if an RNR NAK has already been
+ * scheduled to avoid messing up qp->r_psn: the RNR NAK will
+ * be sent only when all allocated segments have been sent.
+ * However, if more segments are allocated before that, TID RDMA
+ * WRITE RESP packets will be sent out for these new segments
+ * before the RNR NAK packet. When the requester receives the
+ * RNR NAK packet, it will restart with qp->s_last_psn + 1,
+ * which does not match qp->r_psn and will be dropped.
+ * Consequently, the requester will exhaust its retries and
+ * put the qp into error state.
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND)
+ break;
+
+ /* No requests left to process */
+ if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
+ /* If all data has been received, clear the flow */
+ if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
+ !qpriv->alloc_w_segs)
+ hfi1_kern_clear_hw_flow(rcd, qp);
+ break;
+ }
+
+ e = &qp->s_ack_queue[qpriv->r_tid_alloc];
+ if (e->opcode != TID_OP(WRITE_REQ))
+ goto next_req;
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ /* Finished allocating for all segments of this request */
+ if (req->alloc_seg >= req->total_segs)
+ goto next_req;
+
+ /* Can allocate only a maximum of local->max_write for a QP */
+ if (qpriv->alloc_w_segs >= local->max_write)
+ break;
+
+ /* Don't allocate at a sync point with data packets pending */
+ if (qpriv->sync_pt && qpriv->alloc_w_segs)
+ break;
+
+ /* All data received at the sync point, continue */
+ if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
+ hfi1_kern_clear_hw_flow(rcd, qp);
+ qpriv->sync_pt = false;
+ if (qpriv->s_flags & HFI1_R_TID_SW_PSN)
+ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+ }
+
+ /* Allocate flow if we don't have one */
+ if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) {
+ ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp);
+ if (ret) {
+ to_seg = tid_rdma_flow_wt *
+ position_in_queue(qpriv,
+ &rcd->flow_queue);
+ break;
+ }
+ }
+
+ npkts = rvt_div_round_up_mtu(qp, req->seg_len);
+
+ /*
+ * We are at a sync point if we run out of KDETH PSN space.
+ * Last PSN of every generation is reserved for RESYNC.
+ */
+ if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) {
+ qpriv->sync_pt = true;
+ break;
+ }
+
+ /*
+		 * If overtaking req->acked_tail, send an RNR NAK. Because the
+		 * QP is not queued in this case and the issue can only be
+		 * caused by a delay in scheduling the second leg, which we
+		 * cannot estimate, we use a rather arbitrary RNR timeout of
+		 * (MAX_FLOWS / 2) segments.
+ */
+ if (!CIRC_SPACE(req->setup_head, req->acked_tail,
+ MAX_FLOWS)) {
+ ret = -EAGAIN;
+ to_seg = MAX_FLOWS >> 1;
+ qpriv->s_flags |= RVT_S_ACK_PENDING;
+ hfi1_schedule_tid_send(qp);
+ break;
+ }
+
+ /* Try to allocate rcv array / TID entries */
+ ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last);
+ if (ret == -EAGAIN)
+ to_seg = position_in_queue(qpriv, &rcd->rarr_queue);
+ if (ret)
+ break;
+
+ qpriv->alloc_w_segs++;
+ req->alloc_seg++;
+ continue;
+next_req:
+ /* Begin processing the next request */
+ if (++qpriv->r_tid_alloc >
+ rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ qpriv->r_tid_alloc = 0;
+ }
+
+ /*
+	 * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation
+	 * has failed, (b) we are called from the rcv handler interrupt context,
+	 * and (c) an RNR NAK has not already been scheduled.
+ */
+ if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state)
+ goto send_rnr_nak;
+
+ return;
+
+send_rnr_nak:
+ lockdep_assert_held(&qp->r_lock);
+
+	/* Set r_nak_state to prevent unrelated events from generating NAKs */
+ qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK;
+
+ /* Pull back r_psn to the segment being RNR NAK'd */
+ qp->r_psn = e->psn + req->alloc_seg;
+ qp->r_ack_psn = qp->r_psn;
+ /*
+ * Pull back r_head_ack_queue to the ack entry following the request
+ * being RNR NAK'd. This allows resources to be allocated to the request
+ * if the queued QP is scheduled.
+ */
+ qp->r_head_ack_queue = qpriv->r_tid_alloc + 1;
+ if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ qp->r_head_ack_queue = 0;
+ qpriv->r_tid_head = qp->r_head_ack_queue;
+ /*
+ * These send side fields are used in make_rc_ack(). They are set in
+ * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock
+ * for consistency
+ */
+ qp->s_nak_state = qp->r_nak_state;
+ qp->s_ack_psn = qp->r_ack_psn;
+ /*
+	 * Clear the ACK PENDING flag to prevent an unwanted ACK because we
+ * have modified qp->s_ack_psn here.
+ */
+ qp->s_flags &= ~(RVT_S_ACK_PENDING);
+
+ trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn);
+ /*
+ * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK
+	 * has actually been sent. The RVT_S_ACK_PENDING bit in qp->s_flags
+	 * cannot be used for this because qp->s_lock is dropped before calling
+	 * hfi1_send_rc_ack(), leading to inconsistency between the receive
+	 * interrupt handlers and the send thread in make_rc_ack().
+ */
+ qpriv->rnr_nak_state = TID_RNR_NAK_SEND;
+
+ /*
+	 * Schedule the RNR NAK to be sent. RNR NAKs are scheduled from the
+	 * receive interrupt handlers but will be sent from the send engine
+	 * behind any previous responses that may have been scheduled.
+ */
+ rc_defered_ack(rcd, qp);
+}
+
+void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
+{
+	/* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side) */
+
+ /*
+ * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST
+ * (see hfi1_rc_rcv())
+ * - Don't allow 0-length requests.
+	 * 2. Put TID RDMA WRITE REQ into the response queue (s_ack_queue)
+ * - Setup struct tid_rdma_req with request info
+ * - Prepare struct tid_rdma_flow array?
+ * 3. Set the qp->s_ack_state as state diagram in design doc.
+ * 4. Set RVT_S_RESP_PENDING in s_flags.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ unsigned long flags;
+ struct ib_reth *reth;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req;
+ u32 bth0, psn, len, rkey, num_segs;
+ bool is_fecn;
+ u8 next;
+ u64 vaddr;
+ int diff;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, packet))
+ return;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+ rvt_comm_est(qp);
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto nack_inv;
+
+ reth = &ohdr->u.tid_rdma.w_req.reth;
+ vaddr = be64_to_cpu(reth->vaddr);
+ len = be32_to_cpu(reth->length);
+
+ num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
+ diff = delta_psn(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
+ return;
+ goto send_ack;
+ }
+
+ /*
+ * The resent request which was previously RNR NAK'd is inserted at the
+ * location of the original request, which is one entry behind
+ * r_head_ack_queue
+ */
+ if (qpriv->rnr_nak_state)
+ qp->r_head_ack_queue = qp->r_head_ack_queue ?
+ qp->r_head_ack_queue - 1 :
+ rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+
+ /* We've verified the request, insert it into the ack queue. */
+ next = qp->r_head_ack_queue + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
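+	/*
+	 * The ack queue is full. We can make room only if the response for
+	 * the oldest entry has already been sent.
+	 */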
+ if (unlikely(next == qp->s_acked_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlock;
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ req = ack_to_tid_req(e);
+
+ /* Bring previously RNR NAK'd request back to life */
+ if (qpriv->rnr_nak_state) {
+ qp->r_nak_state = 0;
+ qp->s_nak_state = 0;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qp->r_psn = e->lpsn + 1;
+ req->state = TID_REQUEST_INIT;
+ goto update_head;
+ }
+
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+
+ /* The length needs to be in multiples of PAGE_SIZE */
+ if (!len || len & ~PAGE_MASK)
+ goto nack_inv_unlock;
+
+ rkey = be32_to_cpu(reth->rkey);
+ qp->r_len = len;
+
+ if (e->opcode == TID_OP(WRITE_REQ) &&
+ (req->setup_head != req->clear_tail ||
+ req->clear_tail != req->acked_tail))
+ goto nack_inv_unlock;
+
+ if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_WRITE)))
+ goto nack_acc;
+
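+	/* The TID RDMA WRITE request occupies one IB PSN per segment. */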
+ qp->r_psn += num_segs - 1;
+
+ e->opcode = (bth0 >> 24) & 0xff;
+ e->psn = psn;
+ e->lpsn = qp->r_psn;
+ e->sent = 0;
+
+ req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write);
+ req->state = TID_REQUEST_INIT;
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_seg = 0;
+ req->alloc_seg = 0;
+ req->isge = 0;
+ req->seg_len = qpriv->tid_rdma.local.max_len;
+ req->total_len = len;
+ req->total_segs = num_segs;
+ req->r_flow_psn = e->psn;
+ req->ss.sge = e->rdma_sge;
+ req->ss.num_sge = 1;
+
+ req->flow_idx = req->setup_head;
+ req->clear_tail = req->setup_head;
+ req->acked_tail = req->setup_head;
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn++;
+
+ trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+
+ if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) {
+ qpriv->r_tid_tail = qp->r_head_ack_queue;
+ } else if (qpriv->r_tid_tail == qpriv->r_tid_head) {
+ struct tid_rdma_request *ptr;
+
+ e = &qp->s_ack_queue[qpriv->r_tid_tail];
+ ptr = ack_to_tid_req(e);
+
+ if (e->opcode != TID_OP(WRITE_REQ) ||
+ ptr->comp_seg == ptr->total_segs) {
+ if (qpriv->r_tid_tail == qpriv->r_tid_ack)
+ qpriv->r_tid_ack = qp->r_head_ack_queue;
+ qpriv->r_tid_tail = qp->r_head_ack_queue;
+ }
+ }
+update_head:
+ qp->r_head_ack_queue = next;
+ qpriv->r_tid_head = qp->r_head_ack_queue;
+
+ hfi1_tid_write_alloc_resources(qp, true);
+ trace_hfi1_tid_write_rsp_rcv_req(qp);
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ goto send_ack;
+ return;
+
+nack_inv_unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+nack_acc:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+
+u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 bth2, u32 *len,
+ struct rvt_sge_state **ss)
+{
+ struct hfi1_ack_priv *epriv = e->priv;
+ struct tid_rdma_request *req = &epriv->tid_req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_flow *flow = NULL;
+ u32 resp_len = 0, hdwords = 0;
+ void *resp_addr = NULL;
+ struct tid_rdma_params *remote;
+
+ trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ trace_hfi1_tid_write_rsp_build_resp(qp);
+ trace_hfi1_rsp_build_tid_write_resp(qp, bth2);
+ flow = &req->flows[req->flow_idx];
+ switch (req->state) {
+ default:
+ /*
+		 * Try to allocate resources here in case the QP was queued and
+		 * was later scheduled when resources became available.
+ */
+ hfi1_tid_write_alloc_resources(qp, false);
+
+ /* We've already sent everything which is ready */
+ if (req->cur_seg >= req->alloc_seg)
+ goto done;
+
+ /*
+		 * Resources can be assigned, but responses cannot be sent in
+		 * the rnr_nak state until the resent request is received.
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT)
+ goto done;
+
+ req->state = TID_REQUEST_ACTIVE;
+ trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
+ req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
+ hfi1_add_tid_reap_timer(qp);
+ break;
+
+ case TID_REQUEST_RESEND_ACTIVE:
+ case TID_REQUEST_RESEND:
+ trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
+ req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
+ if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS))
+ req->state = TID_REQUEST_ACTIVE;
+
+ hfi1_mod_tid_reap_timer(qp);
+ break;
+ }
+ flow->flow_state.resp_ib_psn = bth2;
+ resp_addr = (void *)flow->tid_entry;
+ resp_len = sizeof(*flow->tid_entry) * flow->tidcnt;
+ req->cur_seg++;
+
+ memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp));
+ epriv->ss.sge.vaddr = resp_addr;
+ epriv->ss.sge.sge_length = resp_len;
+ epriv->ss.sge.length = epriv->ss.sge.sge_length;
+ /*
+ * We can safely zero these out. Since the first SGE covers the
+ * entire packet, nothing else should even look at the MR.
+ */
+ epriv->ss.sge.mr = NULL;
+ epriv->ss.sge.m = 0;
+ epriv->ss.sge.n = 0;
+
+ epriv->ss.sg_list = NULL;
+ epriv->ss.total_len = epriv->ss.sge.sge_length;
+ epriv->ss.num_sge = 1;
+
+ *ss = &epriv->ss;
+ *len = epriv->ss.total_len;
+
+ /* Construct the TID RDMA WRITE RESP packet header */
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+ KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1);
+ KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp);
+ ohdr->u.tid_rdma.w_rsp.tid_flow_psn =
+ cpu_to_be32((flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) |
+ (flow->flow_state.spsn &
+ HFI1_KDETH_BTH_SEQ_MASK));
+ ohdr->u.tid_rdma.w_rsp.tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+ ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 = remote->qp;
+ rcu_read_unlock();
+ hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32);
+ qpriv->pending_tid_w_segs++;
+done:
+ return hdwords;
+}
+
+static void hfi1_add_tid_reap_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) {
+ qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
+ qpriv->s_tid_timer.expires = jiffies +
+ qpriv->tid_timer_timeout_jiffies;
+ add_timer(&qpriv->s_tid_timer);
+ }
+}
+
+static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
+ mod_timer(&qpriv->s_tid_timer, jiffies +
+ qpriv->tid_timer_timeout_jiffies);
+}
+
+static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ int rval = 0;
+
+ lockdep_assert_held(&qp->s_lock);
+ if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
+ rval = del_timer(&qpriv->s_tid_timer);
+ qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
+ }
+ return rval;
+}
+
+void hfi1_del_tid_reap_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ del_timer_sync(&qpriv->s_tid_timer);
+ qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
+}
+
+static void hfi1_tid_timeout(struct timer_list *t)
+{
+ struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer);
+ struct rvt_qp *qp = qpriv->owner;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ unsigned long flags;
+ u32 i;
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+ spin_lock(&qp->s_lock);
+ if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
+ dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n",
+ qp->ibqp.qp_num, __func__, __LINE__);
+ trace_hfi1_msg_tid_timeout(/* msg */
+ qp, "resource timeout = ",
+ (u64)qpriv->tid_timer_timeout_jiffies);
+ hfi1_stop_tid_reap_timer(qp);
+ /*
+		 * Go through the entire ack queue and clear any outstanding
+ * HW flow and RcvArray resources.
+ */
+ hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ struct tid_rdma_request *req =
+ ack_to_tid_req(&qp->s_ack_queue[i]);
+
+ hfi1_kern_exp_rcv_clear_all(req);
+ }
+ spin_unlock(&qp->s_lock);
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_FATAL;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR);
+ goto unlock_r_lock;
+ }
+ spin_unlock(&qp->s_lock);
+unlock_r_lock:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
+{
+	/* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side) */
+
+ /*
+ * 1. Find matching SWQE
+ * 2. Check that TIDENTRY array has enough space for a complete
+ * segment. If not, put QP in error state.
+ * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow
+ * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags.
+ * 5. Set qp->s_state
+ * 6. Kick the send engine (hfi1_schedule_send())
+ */
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ enum ib_wc_status status;
+ u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen;
+ bool is_fecn;
+ unsigned long flags;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Ignore invalid responses */
+ if (cmp_psn(psn, qp->s_next_psn) >= 0)
+ goto ack_done;
+
+ /* Ignore duplicate responses. */
+ if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0))
+ goto ack_done;
+
+ if (unlikely(qp->s_acked == qp->s_tail))
+ goto ack_done;
+
+ /*
+ * If we are waiting for a particular packet sequence number
+ * due to a request being resent, check for it. Otherwise,
+ * ensure that we haven't missed anything.
+ */
+ if (qp->r_flags & RVT_R_RDMAR_SEQ) {
+ if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
+ goto ack_done;
+ qp->r_flags &= ~RVT_R_RDMAR_SEQ;
+ }
+
+ wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+ if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE))
+ goto ack_op_err;
+
+ req = wqe_to_tid_req(wqe);
+ /*
+ * If we've lost ACKs and our acked_tail pointer is too far
+ * behind, don't overwrite segments. Just drop the packet and
+ * let the reliability protocol take care of it.
+ */
+ if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS))
+ goto ack_done;
+
+ /*
+ * The call to do_rc_ack() should be last in the chain of
+ * packet checks because it will end up updating the QP state.
+ * Therefore, anything that would prevent the packet from
+ * being accepted as a successful response should be prior
+ * to it.
+ */
+ if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+ goto ack_done;
+
+ trace_hfi1_ack(qp, psn);
+
+ flow = &req->flows[req->setup_head];
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ flow->sent = 0;
+ flow->resync_npkts = 0;
+ flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp);
+ flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
+ TID_RDMA_DESTQP_FLOW_MASK;
+ flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn));
+ flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
+ flow->flow_state.resp_ib_psn = psn;
+ flow->length = min_t(u32, req->seg_len,
+ (wqe->length - (req->comp_seg * req->seg_len)));
+
+ flow->npkts = rvt_div_round_up_mtu(qp, flow->length);
+ flow->flow_state.lpsn = flow->flow_state.spsn +
+ flow->npkts - 1;
+ /* payload length = packet length - (header length + ICRC length) */
+ pktlen = packet->tlen - (packet->hlen + 4);
+ if (pktlen > sizeof(flow->tid_entry)) {
+ status = IB_WC_LOC_LEN_ERR;
+ goto ack_err;
+ }
+ memcpy(flow->tid_entry, packet->ebuf, pktlen);
+ flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
+ trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow);
+
+ req->comp_seg++;
+ trace_hfi1_tid_write_sender_rcv_resp(qp, 0);
+ /*
+ * Walk the TID_ENTRY list to make sure we have enough space for a
+ * complete segment.
+ */
+ for (i = 0; i < flow->tidcnt; i++) {
+ trace_hfi1_tid_entry_rcv_write_resp(/* entry */
+ qp, i, flow->tid_entry[i]);
+ if (!EXP_TID_GET(flow->tid_entry[i], LEN)) {
+ status = IB_WC_LOC_LEN_ERR;
+ goto ack_err;
+ }
+ tidlen += EXP_TID_GET(flow->tid_entry[i], LEN);
+ }
+ if (tidlen * PAGE_SIZE < flow->length) {
+ status = IB_WC_LOC_LEN_ERR;
+ goto ack_err;
+ }
+
+ trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ /*
+ * If this is the first response for this request, set the initial
+ * flow index to the current flow.
+ */
+ if (!cmp_psn(psn, wqe->psn)) {
+ req->r_last_acked = mask_psn(wqe->psn - 1);
+ /* Set acked flow index to head index */
+ req->acked_tail = req->setup_head;
+ }
+
+ /* advance circular buffer head */
+ req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS);
+ req->state = TID_REQUEST_ACTIVE;
+
+ /*
+	 * If all responses for this TID RDMA WRITE request have been received,
+	 * advance the pointer to the next one.
+ * Since TID RDMA requests could be mixed in with regular IB requests,
+ * they might not appear sequentially in the queue. Therefore, the
+ * next request needs to be "found".
+ */
+ if (qpriv->s_tid_cur != qpriv->s_tid_head &&
+ req->comp_seg == req->total_segs) {
+ for (i = qpriv->s_tid_cur + 1; ; i++) {
+ if (i == qp->s_size)
+ i = 0;
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (i == qpriv->s_tid_head)
+ break;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+ }
+ qpriv->s_tid_cur = i;
+ }
+ qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
+
+ hfi1_schedule_tid_send(qp);
+ goto ack_done;
+
+ack_op_err:
+ status = IB_WC_LOC_QP_OP_ERR;
+ack_err:
+ rvt_error_qp(qp, status);
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+
+bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len)
+{
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+ struct tid_rdma_params *remote;
+ struct rvt_qp *qp = req->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u32 tidentry = flow->tid_entry[flow->tid_idx];
+ u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
+ struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data;
+ u32 next_offset, om = KDETH_OM_LARGE;
+ bool last_pkt;
+
+ if (!tidlen) {
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR);
+ rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR);
+ }
+
+ *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
+ flow->sent += *len;
+ next_offset = flow->tid_offset + *len;
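+	/* Last packet if this exhausts the final TID entry or completes the segment */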
+ last_pkt = (flow->tid_idx == (flow->tidcnt - 1) &&
+ next_offset >= tidlen) || (flow->sent >= flow->length);
+ trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry);
+ trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow);
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ KDETH_RESET(wd->kdeth0, KVER, 0x1);
+ KDETH_SET(wd->kdeth0, SH, !last_pkt);
+ KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg));
+ KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
+ KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
+ KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE);
+ KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om);
+ KDETH_RESET(wd->kdeth1, JKEY, remote->jkey);
+ wd->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ rcu_read_unlock();
+
+ *bth1 = flow->tid_qpn;
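+	/* KDETH PSN: per-flow sequence in the low bits, generation above it */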
+ *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
+ HFI1_KDETH_BTH_SEQ_MASK) |
+ (flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT));
+ if (last_pkt) {
+ /* PSNs are zero-based, so +1 to count number of packets */
+ if (flow->flow_state.lpsn + 1 +
+ rvt_div_round_up_mtu(qp, req->seg_len) >
+ MAX_TID_FLOW_PSN)
+ req->state = TID_REQUEST_SYNC;
+ *bth2 |= IB_BTH_REQ_ACK;
+ }
+
+ if (next_offset >= tidlen) {
+ flow->tid_offset = 0;
+ flow->tid_idx++;
+ } else {
+ flow->tid_offset = next_offset;
+ }
+ return last_pkt;
+}
+
+void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
+{
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ctxtdata *rcd = priv->rcd;
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ unsigned long flags;
+ u32 psn, next;
+ u8 opcode;
+
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ /*
+ * All error handling should be done by now. If we are here, the packet
+	 * is either good or has been accepted by the error handler.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ e = &qp->s_ack_queue[priv->r_tid_tail];
+ req = ack_to_tid_req(e);
+ flow = &req->flows[req->clear_tail];
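+	/*
+	 * Any packet other than the last one of the segment only advances
+	 * the expected KDETH PSN (or NAKs a sequence error).
+	 */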
+ if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
+ if (cmp_psn(psn, flow->flow_state.r_next_psn))
+ goto send_nak;
+ flow->flow_state.r_next_psn++;
+ goto exit;
+ }
+ flow->flow_state.r_next_psn = mask_psn(psn + 1);
+ hfi1_kern_exp_rcv_clear(req);
+ priv->alloc_w_segs--;
+ rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK;
+ req->comp_seg++;
+ priv->s_nak_state = 0;
+
+ /*
+ * Release the flow if one of the following conditions has been met:
+ * - The request has reached a sync point AND all outstanding
+ * segments have been completed, or
+ * - The entire request is complete and there are no more requests
+ * (of any kind) in the queue.
+ */
+ trace_hfi1_rsp_rcv_tid_write_data(qp, psn);
+ trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ trace_hfi1_tid_write_rsp_rcv_data(qp);
+ if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
+ priv->r_tid_ack = priv->r_tid_tail;
+
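+	/*
+	 * On the last data packet of a request, advance r_tid_tail to the
+	 * next TID RDMA WRITE request in the ack queue and advance
+	 * s_acked_ack_queue past this entry.
+	 */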
+ if (opcode == TID_OP(WRITE_DATA_LAST)) {
+ for (next = priv->r_tid_tail + 1; ; next++) {
+ if (next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ if (next == priv->r_tid_head)
+ break;
+ e = &qp->s_ack_queue[next];
+ if (e->opcode == TID_OP(WRITE_REQ))
+ break;
+ }
+ priv->r_tid_tail = next;
+ if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi))
+ qp->s_acked_ack_queue = 0;
+ }
+
+ hfi1_tid_write_alloc_resources(qp, true);
+
+ /*
+ * If we need to generate more responses, schedule the
+ * send engine.
+ */
+ if (req->cur_seg < req->total_segs ||
+ qp->s_tail_ack_queue != qp->r_head_ack_queue) {
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+ }
+
+ priv->pending_tid_w_segs--;
+ if (priv->s_flags & HFI1_R_TID_RSC_TIMER) {
+ if (priv->pending_tid_w_segs)
+ hfi1_mod_tid_reap_timer(req->qp);
+ else
+ hfi1_stop_tid_reap_timer(req->qp);
+ }
+
+done:
+ priv->s_flags |= RVT_S_ACK_PENDING;
+ hfi1_schedule_tid_send(qp);
+exit:
+ priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return;
+
+send_nak:
+ if (!priv->s_nak_state) {
+ priv->s_nak_state = IB_NAK_PSN_ERROR;
+ priv->s_nak_psn = flow->flow_state.r_next_psn;
+ priv->s_flags |= RVT_S_ACK_PENDING;
+ if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
+ priv->r_tid_ack = priv->r_tid_tail;
+ hfi1_schedule_tid_send(qp);
+ }
+ goto done;
+}
+
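+/* A RESYNC PSN has all KDETH sequence bits set (last PSN of a generation) */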
+static bool hfi1_tid_rdma_is_resync_psn(u32 psn)
+{
+ return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) ==
+ HFI1_KDETH_BTH_SEQ_MASK);
+}
+
+u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u16 iflow,
+ u32 *bth1, u32 *bth2)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ struct tid_rdma_request *req = ack_to_tid_req(e);
+ struct tid_rdma_flow *flow = &req->flows[iflow];
+ struct tid_rdma_params *remote;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 = remote->qp;
+ rcu_read_unlock();
+
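+	/*
+	 * bth2 carries the PSN of the RESYNC being acknowledged, the NAK'd
+	 * PSN when a NAK is pending, or the last data PSN of the
+	 * acknowledged flow otherwise.
+	 */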
+ if (qpriv->resync) {
+ *bth2 = mask_psn((fs->generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
+ ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
+ } else if (qpriv->s_nak_state) {
+ *bth2 = mask_psn(qpriv->s_nak_psn);
+ ohdr->u.tid_rdma.ack.aeth =
+ cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
+ (qpriv->s_nak_state <<
+ IB_AETH_CREDIT_SHIFT));
+ } else {
+ *bth2 = full_flow_psn(flow, flow->flow_state.lpsn);
+ ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
+ }
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
+ ohdr->u.tid_rdma.ack.tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+
+ ohdr->u.tid_rdma.ack.tid_flow_psn = 0;
+ ohdr->u.tid_rdma.ack.verbs_psn =
+ cpu_to_be32(flow->flow_state.resp_ib_psn);
+
+ if (qpriv->resync) {
+ /*
+		 * If the PSN before the currently expected KDETH PSN is the
+ * RESYNC PSN, then we never received a good TID RDMA WRITE
+ * DATA packet after a previous RESYNC.
+ * In this case, the next expected KDETH PSN stays the same.
+ */
+ if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) {
+ ohdr->u.tid_rdma.ack.tid_flow_psn =
+ cpu_to_be32(qpriv->r_next_psn_kdeth_save);
+ } else {
+ /*
+ * Because the KDETH PSNs jump during a RESYNC, it's
+ * not possible to infer (or compute) the previous value
+ * of r_next_psn_kdeth in the case of back-to-back
+ * RESYNC packets. Therefore, we save it.
+ */
+ qpriv->r_next_psn_kdeth_save =
+ qpriv->r_next_psn_kdeth - 1;
+ ohdr->u.tid_rdma.ack.tid_flow_psn =
+ cpu_to_be32(qpriv->r_next_psn_kdeth_save);
+ qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1);
+ }
+ qpriv->resync = false;
+ }
+
+ return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32);
+}
+
+void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
+{
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn;
+ bool is_fecn;
+ unsigned long flags;
+ u16 fidx;
+
+ trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0);
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
+ req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
+ resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn));
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn);
+
+ /* If we are waiting for an ACK to RESYNC, drop any other packets */
+ if ((qp->s_flags & HFI1_S_WAIT_HALT) &&
+ cmp_psn(psn, qpriv->s_resync_psn))
+ goto ack_op_err;
+
+ ack_psn = req_psn;
+ if (hfi1_tid_rdma_is_resync_psn(psn))
+ ack_kpsn = resync_psn;
+ else
+ ack_kpsn = psn;
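+	/*
+	 * If this is not an ACK, the reported PSN has not been received;
+	 * count only the PSNs before it as acknowledged.
+	 */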
+ if (aeth >> 29) {
+ ack_psn--;
+ ack_kpsn--;
+ }
+
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ goto ack_op_err;
+
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ flow = &req->flows[req->acked_tail];
+ trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
+
+ /* Drop stale ACK/NAK */
+ if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0)
+ goto ack_op_err;
+
+ while (cmp_psn(ack_kpsn,
+ full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 &&
+ req->ack_seg < req->cur_seg) {
+ req->ack_seg++;
+ /* advance acked segment pointer */
+ req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS);
+ req->r_last_acked = flow->flow_state.resp_ib_psn;
+ trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ if (req->ack_seg == req->total_segs) {
+ req->state = TID_REQUEST_COMPLETE;
+ wqe = do_rc_completion(qp, wqe,
+ to_iport(qp->ibqp.device,
+ qp->port_num));
+ trace_hfi1_sender_rcv_tid_ack(qp);
+ atomic_dec(&qpriv->n_tid_requests);
+ if (qp->s_acked == qp->s_tail)
+ break;
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ break;
+ req = wqe_to_tid_req(wqe);
+ }
+ flow = &req->flows[req->acked_tail];
+ trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
+ }
+
+ trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ switch (aeth >> 29) {
+ case 0: /* ACK */
+ if (qpriv->s_flags & RVT_S_WAIT_ACK)
+ qpriv->s_flags &= ~RVT_S_WAIT_ACK;
+ if (!hfi1_tid_rdma_is_resync_psn(psn)) {
+ /* Check if there is any pending TID ACK */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ req->ack_seg < req->cur_seg)
+ hfi1_mod_tid_retry_timer(qp);
+ else
+ hfi1_stop_tid_retry_timer(qp);
+ hfi1_schedule_send(qp);
+ } else {
+ u32 spsn, fpsn, last_acked, generation;
+ struct tid_rdma_request *rptr;
+
+ /* ACK(RESYNC) */
+ hfi1_stop_tid_retry_timer(qp);
+ /* Allow new requests (see hfi1_make_tid_rdma_pkt) */
+ qp->s_flags &= ~HFI1_S_WAIT_HALT;
+ /*
+			 * Clear the RVT_S_SEND_ONE flag in case the TID RDMA
+			 * ACK is received after the TID retry timer has fired
+			 * again. In this case, do not send any more TID
+			 * RESYNC requests or wait for any more TID ACK packets.
+ */
+ qpriv->s_flags &= ~RVT_S_SEND_ONE;
+ hfi1_schedule_send(qp);
+
+ if ((qp->s_acked == qpriv->s_tid_tail &&
+ req->ack_seg == req->total_segs) ||
+ qp->s_acked == qp->s_tail) {
+ qpriv->s_state = TID_OP(WRITE_DATA_LAST);
+ goto done;
+ }
+
+ if (req->ack_seg == req->comp_seg) {
+ qpriv->s_state = TID_OP(WRITE_DATA);
+ goto done;
+ }
+
+ /*
+ * The PSN to start with is the next PSN after the
+ * RESYNC PSN.
+ */
+ psn = mask_psn(psn + 1);
+ generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ spsn = 0;
+
+ /*
+ * Update to the correct WQE when we get an ACK(RESYNC)
+ * in the middle of a request.
+ */
+ if (delta_psn(ack_psn, wqe->lpsn))
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ flow = &req->flows[req->acked_tail];
+ /*
+			 * segments. Also, PSNs start from 0 in the middle of a
+ * segments. Also, PSN's start from 0 in the middle of a
+ * segment and the first segment size is less than the
+ * default number of packets. flow->resync_npkts is used
+ * to track the number of packets from the start of the
+ * real segment to the point of 0 PSN after the RESYNC
+ * in order to later correctly rewind the SGE.
+ */
+ fpsn = full_flow_psn(flow, flow->flow_state.spsn);
+ req->r_ack_psn = psn;
+ flow->resync_npkts +=
+ delta_psn(mask_psn(resync_psn + 1), fpsn);
+ /*
+ * Renumber all packet sequence number ranges
+ * based on the new generation.
+ */
+ last_acked = qp->s_acked;
+ rptr = req;
+ while (1) {
+ /* start from last acked segment */
+ for (fidx = rptr->acked_tail;
+ CIRC_CNT(rptr->setup_head, fidx,
+ MAX_FLOWS);
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
+ u32 lpsn;
+ u32 gen;
+
+ flow = &rptr->flows[fidx];
+ gen = flow->flow_state.generation;
+ if (WARN_ON(gen == generation &&
+ flow->flow_state.spsn !=
+ spsn))
+ continue;
+ lpsn = flow->flow_state.lpsn;
+ lpsn = full_flow_psn(flow, lpsn);
+ flow->npkts =
+ delta_psn(lpsn,
+ mask_psn(resync_psn)
+ );
+ flow->flow_state.generation =
+ generation;
+ flow->flow_state.spsn = spsn;
+ flow->flow_state.lpsn =
+ flow->flow_state.spsn +
+ flow->npkts - 1;
+ flow->pkt = 0;
+ spsn += flow->npkts;
+ resync_psn += flow->npkts;
+ trace_hfi1_tid_flow_rcv_tid_ack(qp,
+ fidx,
+ flow);
+ }
+ if (++last_acked == qpriv->s_tid_cur + 1)
+ break;
+ if (last_acked == qp->s_size)
+ last_acked = 0;
+ wqe = rvt_get_swqe_ptr(qp, last_acked);
+ rptr = wqe_to_tid_req(wqe);
+ }
+ req->cur_seg = req->ack_seg;
+ qpriv->s_tid_tail = qp->s_acked;
+ qpriv->s_state = TID_OP(WRITE_REQ);
+ hfi1_schedule_tid_send(qp);
+ }
+done:
+ qpriv->s_retry = qp->s_retry_cnt;
+ break;
+
+ case 3: /* NAK */
+ hfi1_stop_tid_retry_timer(qp);
+ switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
+ IB_AETH_CREDIT_MASK) {
+ case 0: /* PSN sequence error */
+ flow = &req->flows[req->acked_tail];
+ fspsn = full_flow_psn(flow, flow->flow_state.spsn);
+ trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail,
+ flow);
+ req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ req->cur_seg = req->ack_seg;
+ qpriv->s_tid_tail = qp->s_acked;
+ qpriv->s_state = TID_OP(WRITE_REQ);
+ qpriv->s_retry = qp->s_retry_cnt;
+ hfi1_schedule_tid_send(qp);
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ack_op_err:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+void hfi1_add_tid_retry_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+ lockdep_assert_held(&qp->s_lock);
+ if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) {
+ priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
+ priv->s_tid_retry_timer.expires = jiffies +
+ priv->tid_retry_timeout_jiffies + rdi->busy_jiffies;
+ add_timer(&priv->s_tid_retry_timer);
+ }
+}
+
+static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+ lockdep_assert_held(&qp->s_lock);
+ priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
+ mod_timer(&priv->s_tid_retry_timer, jiffies +
+ priv->tid_retry_timeout_jiffies + rdi->busy_jiffies);
+}
+
+static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ int rval = 0;
+
+ lockdep_assert_held(&qp->s_lock);
+ if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
+ rval = del_timer(&priv->s_tid_retry_timer);
+ priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
+ }
+ return rval;
+}
+
+void hfi1_del_tid_retry_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ del_timer_sync(&priv->s_tid_retry_timer);
+ priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
+}
+
+static void hfi1_tid_retry_timeout(struct timer_list *t)
+{
+ struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer);
+ struct rvt_qp *qp = priv->owner;
+ struct rvt_swqe *wqe;
+ unsigned long flags;
+ struct tid_rdma_request *req;
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+ spin_lock(&qp->s_lock);
+ trace_hfi1_tid_write_sender_retry_timeout(qp, 0);
+ if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
+ hfi1_stop_tid_retry_timer(qp);
+ if (!priv->s_retry) {
+ trace_hfi1_msg_tid_retry_timeout(/* msg */
+ qp,
+ "Exhausted retries. Tid retry timeout = ",
+ (u64)priv->tid_retry_timeout_jiffies);
+
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ } else {
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_tid_retry_timeout(/* req */
+ qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req);
+
+ priv->s_flags &= ~RVT_S_WAIT_ACK;
+ /* Only send one packet (the RESYNC) */
+ priv->s_flags |= RVT_S_SEND_ONE;
+ /*
+ * No additional request shall be made by this QP until
+			 * the RESYNC has completed.
+ */
+ qp->s_flags |= HFI1_S_WAIT_HALT;
+ priv->s_state = TID_OP(RESYNC);
+ priv->s_retry--;
+ hfi1_schedule_tid_send(qp);
+ }
+ }
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u16 fidx)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_params *remote;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[fidx];
+ u32 generation;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 = remote->qp;
+ rcu_read_unlock();
+
+ generation = kern_flow_generation_next(flow->flow_state.generation);
+ *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
+ qpriv->s_resync_psn = *bth2;
+ *bth2 |= IB_BTH_REQ_ACK;
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
+
+ return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32);
+}
+
+void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
+{
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = qpriv->rcd;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ u32 psn, generation, idx, gen_next;
+ bool is_fecn;
+ unsigned long flags;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+
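+	/*
+	 * The RESYNC PSN is the last PSN of its generation, so psn + 1
+	 * carries the next generation in its upper bits.
+	 */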
+ generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ gen_next = (fs->generation == KERN_GENERATION_RESERVED) ?
+ generation : kern_flow_generation_next(fs->generation);
+ /*
+	 * The RESYNC packet contains the "next" generation and can only be
+	 * from the current or the previous generation.
+ */
+ if (generation != mask_generation(gen_next - 1) &&
+ generation != gen_next)
+ goto bail;
+ /* Already processing a resync */
+ if (qpriv->resync)
+ goto bail;
+
+ spin_lock(&rcd->exp_lock);
+ if (fs->index >= RXE_NUM_TID_FLOWS) {
+ /*
+ * If we don't have a flow, save the generation so it can be
+ * applied when a new flow is allocated
+ */
+ fs->generation = generation;
+ } else {
+ /* Reprogram the QP flow with new generation */
+ rcd->flows[fs->index].generation = generation;
+ fs->generation = kern_setup_hw_flow(rcd, fs->index);
+ }
+ fs->psn = 0;
+ /*
+ * Disable SW PSN checking since a RESYNC is equivalent to a
+	 * sync point and the flow has been/will be reprogrammed.
+ */
+ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+ trace_hfi1_tid_write_rsp_rcv_resync(qp);
+
+ /*
+ * Reset all TID flow information with the new generation.
+ * This is done for all requests and segments after the
+ * last received segment
+ */
+ for (idx = qpriv->r_tid_tail; ; idx++) {
+ u16 flow_idx;
+
+ if (idx > rvt_size_atomic(&dev->rdi))
+ idx = 0;
+ e = &qp->s_ack_queue[idx];
+ if (e->opcode == TID_OP(WRITE_REQ)) {
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+
+ /* start from last unacked segment */
+ for (flow_idx = req->clear_tail;
+ CIRC_CNT(req->setup_head, flow_idx,
+ MAX_FLOWS);
+ flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) {
+ u32 lpsn;
+ u32 next;
+
+ flow = &req->flows[flow_idx];
+ lpsn = full_flow_psn(flow,
+ flow->flow_state.lpsn);
+ next = flow->flow_state.r_next_psn;
+ flow->npkts = delta_psn(lpsn, next - 1);
+ flow->flow_state.generation = fs->generation;
+ flow->flow_state.spsn = fs->psn;
+ flow->flow_state.lpsn =
+ flow->flow_state.spsn + flow->npkts - 1;
+ flow->flow_state.r_next_psn =
+ full_flow_psn(flow,
+ flow->flow_state.spsn);
+ fs->psn += flow->npkts;
+ trace_hfi1_tid_flow_rcv_resync(qp, flow_idx,
+ flow);
+ }
+ }
+ if (idx == qp->s_tail_ack_queue)
+ break;
+ }
+
+ spin_unlock(&rcd->exp_lock);
+ qpriv->resync = true;
+ /* RESYNC request always gets a TID RDMA ACK. */
+ qpriv->s_nak_state = 0;
+ qpriv->s_flags |= RVT_S_ACK_PENDING;
+ hfi1_schedule_tid_send(qp);
+bail:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Call this function when the last TID RDMA WRITE DATA packet for a request
+ * is built.
+ */
+static void update_tid_tail(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 i;
+ struct rvt_swqe *wqe;
+
+ lockdep_assert_held(&qp->s_lock);
+ /* Can't move beyond s_tid_cur */
+ if (priv->s_tid_tail == priv->s_tid_cur)
+ return;
+ for (i = priv->s_tid_tail + 1; ; i++) {
+ if (i == qp->s_size)
+ i = 0;
+
+ if (i == priv->s_tid_cur)
+ break;
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+ }
+ priv->s_tid_tail = i;
+ priv->s_state = TID_OP(WRITE_RESP);
+}
+
+int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct rvt_swqe *wqe;
+ u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0;
+ struct ib_other_headers *ohdr;
+ struct rvt_sge_state *ss = &qp->s_sge;
+ struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ struct tid_rdma_request *req = ack_to_tid_req(e);
+ bool last = false;
+ u8 opcode = TID_OP(WRITE_DATA);
+
+ lockdep_assert_held(&qp->s_lock);
+ trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
+ /*
+ * Prioritize the sending of the requests and responses over the
+ * sending of the TID RDMA data packets.
+ */
+ if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) &&
+ atomic_read(&priv->n_requests) &&
+ !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK |
+ HFI1_S_ANY_WAIT_IO))) ||
+ (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg &&
+ !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) {
+ struct iowait_work *iowork;
+
+ iowork = iowait_get_ib_work(&priv->s_iowait);
+ ps->s_txreq = get_waiting_verbs_txreq(iowork);
+ if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) {
+ priv->s_flags |= HFI1_S_TID_BUSY_SET;
+ return 1;
+ }
+ }
+
+ ps->s_txreq = get_txreq(ps->dev, qp);
+ if (!ps->s_txreq)
+ goto bail_no_tx;
+
+ ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
+
+ if ((priv->s_flags & RVT_S_ACK_PENDING) &&
+ make_tid_rdma_ack(qp, ohdr, ps))
+ return 1;
+
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
+ if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == READ_ONCE(qp->s_head))
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (iowait_sdma_pending(&priv->s_iowait)) {
+ qp->s_flags |= RVT_S_WAIT_DMA;
+ goto bail;
+ }
+ clear_ahg(qp);
+ wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+ hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+ /* will get called again */
+ goto done_free_tx;
+ }
+
+ if (priv->s_flags & RVT_S_WAIT_ACK)
+ goto bail;
+
+ /* Check whether there is anything to do. */
+ if (priv->s_tid_tail == HFI1_QP_WQE_INVALID)
+ goto bail;
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ switch (priv->s_state) {
+ case TID_OP(WRITE_REQ):
+ case TID_OP(WRITE_RESP):
+ priv->tid_ss.sge = wqe->sg_list[0];
+ priv->tid_ss.sg_list = wqe->sg_list + 1;
+ priv->tid_ss.num_sge = wqe->wr.num_sge;
+ priv->tid_ss.total_len = wqe->length;
+
+ if (priv->s_state == TID_OP(WRITE_REQ))
+ hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+ priv->s_state = TID_OP(WRITE_DATA);
+ /* fall through */
+
+ case TID_OP(WRITE_DATA):
+ /*
+	 * 1. Check whether a TID RDMA WRITE RESP is available.
+ * 2. If no:
+	 *    2.1 If there are more segments and no TID RDMA WRITE RESP,
+ * set HFI1_S_WAIT_TID_RESP
+ * 2.2 Return indicating no progress made.
+ * 3. If yes:
+ * 3.1 Build TID RDMA WRITE DATA packet.
+ * 3.2 If last packet in segment:
+ * 3.2.1 Change KDETH header bits
+ * 3.2.2 Advance RESP pointers.
+ * 3.3 Return indicating progress made.
+ */
+ trace_hfi1_sender_make_tid_pkt(qp);
+ trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
+ req = wqe_to_tid_req(wqe);
+ len = wqe->length;
+
+ if (!req->comp_seg || req->cur_seg == req->comp_seg)
+ goto bail;
+
+ trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2,
+ &len);
+
+ if (last) {
+ /* move pointer to next flow */
+ req->clear_tail = CIRC_NEXT(req->clear_tail,
+ MAX_FLOWS);
+ if (++req->cur_seg < req->total_segs) {
+ if (!CIRC_CNT(req->setup_head, req->clear_tail,
+ MAX_FLOWS))
+ qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+ } else {
+ priv->s_state = TID_OP(WRITE_DATA_LAST);
+ opcode = TID_OP(WRITE_DATA_LAST);
+
+ /* Advance the s_tid_tail now */
+ update_tid_tail(qp);
+ }
+ }
+ hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32);
+ ss = &priv->tid_ss;
+ break;
+
+ case TID_OP(RESYNC):
+ trace_hfi1_sender_make_tid_pkt(qp);
+ /* Use generation from the most recently received response */
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+ req = wqe_to_tid_req(wqe);
+ /* If no responses for this WQE look at the previous one */
+ if (!req->comp_seg) {
+ wqe = rvt_get_swqe_ptr(qp,
+ (!priv->s_tid_cur ? qp->s_size :
+ priv->s_tid_cur) - 1);
+ req = wqe_to_tid_req(wqe);
+ }
+ hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1,
+ &bth2,
+ CIRC_PREV(req->setup_head,
+ MAX_FLOWS));
+ ss = NULL;
+ len = 0;
+ opcode = TID_OP(RESYNC);
+ break;
+
+ default:
+ goto bail;
+ }
+ if (priv->s_flags & RVT_S_SEND_ONE) {
+ priv->s_flags &= ~RVT_S_SEND_ONE;
+ priv->s_flags |= RVT_S_WAIT_ACK;
+ bth2 |= IB_BTH_REQ_ACK;
+ }
+ qp->s_len -= len;
+ ps->s_txreq->hdr_dwords = hwords;
+ ps->s_txreq->sde = priv->s_sde;
+ ps->s_txreq->ss = ss;
+ ps->s_txreq->s_cur_size = len;
+ hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2,
+ middle, ps);
+ return 1;
+done_free_tx:
+ hfi1_put_txreq(ps->s_txreq);
+ ps->s_txreq = NULL;
+ return 1;
+
+bail:
+ hfi1_put_txreq(ps->s_txreq);
+bail_no_tx:
+ ps->s_txreq = NULL;
+ priv->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we didn't get a txreq, the QP will be woken up later to try
+	 * again. Set the flags so the wakeup knows which work item to wake
+ * up.
+ * (A better algorithm should be found to do this and generalize the
+ * sleep/wakeup flags.)
+ */
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ return 0;
+}
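
The WRITE_DATA case above only makes progress when a received TID RDMA WRITE RESP covers the segment being sent (the !req->comp_seg || req->cur_seg == req->comp_seg test). A minimal sketch of that check follows; the structure and values are assumptions for the example, not driver code.

/* Illustration only -- not part of the patch. Simplified request bookkeeping. */
#include <stdbool.h>
#include <stdio.h>

struct ex_tid_req {
	unsigned int cur_seg;   /* segment the sender wants to put on the wire */
	unsigned int comp_seg;  /* segments already covered by WRITE RESPs */
};

/* mirrors the bail-out condition in the WRITE_DATA case above */
static bool ex_data_can_be_sent(const struct ex_tid_req *req)
{
	return req->comp_seg && req->cur_seg != req->comp_seg;
}

int main(void)
{
	struct ex_tid_req req = { .cur_seg = 2, .comp_seg = 2 };

	/* segment 2 has no response yet: wait for the next WRITE RESP */
	printf("can send: %s\n", ex_data_can_be_sent(&req) ? "yes" : "no");

	req.comp_seg = 3;   /* a response arrives that covers segment 2 */
	printf("can send: %s\n", ex_data_can_be_sent(&req) ? "yes" : "no");
	return 0;
}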
+
+static int make_tid_rdma_ack(struct rvt_qp *qp,
+ struct ib_other_headers *ohdr,
+ struct hfi1_pkt_state *ps)
+{
+ struct rvt_ack_entry *e;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ u32 hwords, next;
+ u32 len = 0;
+ u32 bth1 = 0, bth2 = 0;
+ int middle = 0;
+ u16 flow;
+ struct tid_rdma_request *req, *nreq;
+
+ trace_hfi1_tid_write_rsp_make_tid_ack(qp);
+ /* Don't send an ACK if we aren't supposed to. */
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
+ goto bail;
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ /*
+ * In the RESYNC case, we are exactly one segment past the
+ * previously sent ack or at the previously sent NAK. So to send
+ * the resync ack, we go back one segment (which might be part of
+ * the previous request) and let the do-while loop execute again.
+ * The advantage of executing the do-while loop is that any data
+ * received after the previous ack is automatically acked in the
+ * RESYNC ack. It turns out that for the do-while loop we only need
+ * to pull back qpriv->r_tid_ack, not the segment
+ * indices/counters. The scheme works even if the previous request
+ * was not a TID WRITE request.
+ */
+ if (qpriv->resync) {
+ if (!req->ack_seg || req->ack_seg == req->total_segs)
+ qpriv->r_tid_ack = !qpriv->r_tid_ack ?
+ rvt_size_atomic(&dev->rdi) :
+ qpriv->r_tid_ack - 1;
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ }
+
+ trace_hfi1_rsp_make_tid_ack(qp, e->psn);
+ trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ /*
+ * If we've sent all the ACKs that we can, we are done
+ * until we get more segments...
+ */
+ if (!qpriv->s_nak_state && !qpriv->resync &&
+ req->ack_seg == req->comp_seg)
+ goto bail;
+
+ do {
+ /*
+ * To deal with coalesced ACKs, the acked_tail pointer
+ * into the flow array is used. The distance between it
+ * and the clear_tail is the number of flows that are
+ * being ACK'ed.
+ */
+ req->ack_seg +=
+ /* Get up-to-date value */
+ CIRC_CNT(req->clear_tail, req->acked_tail,
+ MAX_FLOWS);
+ /* Advance acked index */
+ req->acked_tail = req->clear_tail;
+
+ /*
+ * req->clear_tail points to the segment currently being
+ * received. So, when sending an ACK, the previous
+ * segment is being ACK'ed.
+ */
+ flow = CIRC_PREV(req->acked_tail, MAX_FLOWS);
+ if (req->ack_seg != req->total_segs)
+ break;
+ req->state = TID_REQUEST_COMPLETE;
+
+ next = qpriv->r_tid_ack + 1;
+ if (next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ qpriv->r_tid_ack = next;
+ if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ))
+ break;
+ nreq = ack_to_tid_req(&qp->s_ack_queue[next]);
+ if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg)
+ break;
+
+ /* Move to the next ack entry now */
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ } while (1);
+
+ /*
+ * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and
+ * req could be pointing at the previous ack queue entry
+ */
+ if (qpriv->s_nak_state ||
+ (qpriv->resync &&
+ !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) &&
+ (cmp_psn(qpriv->r_next_psn_kdeth - 1,
+ full_flow_psn(&req->flows[flow],
+ req->flows[flow].flow_state.lpsn)) > 0))) {
+ /*
+ * A NAK will implicitly acknowledge all previous TID RDMA
+ * requests. Therefore, we NAK with the req->acked_tail
+ * segment for the request at qpriv->r_tid_ack (same at
+ * this point as the req->clear_tail segment for the
+ * qpriv->r_tid_tail request)
+ */
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ flow = req->acked_tail;
+ } else if (req->ack_seg == req->total_segs &&
+ qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK)
+ qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
+
+ trace_hfi1_tid_write_rsp_make_tid_ack(qp);
+ trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1,
+ &bth2);
+ len = 0;
+ qpriv->s_flags &= ~RVT_S_ACK_PENDING;
+ ps->s_txreq->hdr_dwords = hwords;
+ ps->s_txreq->sde = qpriv->s_sde;
+ ps->s_txreq->s_cur_size = len;
+ ps->s_txreq->ss = NULL;
+ hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle,
+ ps);
+ ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
+ return 1;
+bail:
+ /*
+ * Ensure s_rdma_ack_cnt changes are committed prior to resetting
+	 * RVT_S_ACK_PENDING
+ */
+ smp_wmb();
+ qpriv->s_flags &= ~RVT_S_ACK_PENDING;
+ return 0;
+}
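
The coalesced-ACK bookkeeping above relies on circular-buffer distance: the number of segments acknowledged in one TID ACK is the distance from acked_tail to clear_tail in the flow ring, and the flow reported in the ACK is the one just before the advanced acked_tail. Below is a small standalone sketch of that arithmetic; the ring size and index values are made up for the example.

/* Illustration only -- not part of the patch. Ring size is an assumption. */
#include <stdio.h>

#define EX_MAX_FLOWS 16                                    /* assumed power of two */
#define EX_CIRC_CNT(head, tail) (((head) - (tail)) & (EX_MAX_FLOWS - 1))
#define EX_CIRC_PREV(i)         (((i) - 1) & (EX_MAX_FLOWS - 1))

int main(void)
{
	unsigned int clear_tail = 5;   /* flow index of the segment being received */
	unsigned int acked_tail = 2;   /* flow index after the last ACK'ed segment */
	unsigned int ack_seg = 7;

	/* three segments are ACK'ed at once by the coalesced ACK */
	ack_seg += EX_CIRC_CNT(clear_tail, acked_tail);
	acked_tail = clear_tail;

	/* the flow actually reported in the ACK is the one before acked_tail */
	printf("ack_seg %u, acked_tail %u, reported flow %u\n",
	       ack_seg, acked_tail, EX_CIRC_PREV(acked_tail));
	return 0;
}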
+
+static int hfi1_send_tid_ok(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ return !(priv->s_flags & RVT_S_BUSY ||
+ qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
+ (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
+ (priv->s_flags & RVT_S_RESP_PENDING) ||
+ !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
+}
+
+void _hfi1_do_tid_send(struct work_struct *work)
+{
+ struct iowait_work *w = container_of(work, struct iowait_work, iowork);
+ struct rvt_qp *qp = iowait_to_qp(w->iow);
+
+ hfi1_do_tid_send(qp);
+}
+
+static void hfi1_do_tid_send(struct rvt_qp *qp)
+{
+ struct hfi1_pkt_state ps;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ ps.dev = to_idev(qp->ibqp.device);
+ ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ps.ppd = ppd_from_ibp(ps.ibp);
+ ps.wait = iowait_get_tid_work(&priv->s_iowait);
+ ps.in_thread = false;
+ ps.timeout_int = qp->timeout_jiffies / 8;
+
+ trace_hfi1_rc_do_tid_send(qp, false);
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
+
+ /* Return if we are already busy processing a work request. */
+ if (!hfi1_send_tid_ok(qp)) {
+ if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+ return;
+ }
+
+ priv->s_flags |= RVT_S_BUSY;
+
+ ps.timeout = jiffies + ps.timeout_int;
+ ps.cpu = priv->s_sde ? priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+ ps.pkts_sent = false;
+
+	/* ensure a pre-built packet is handled */
+ ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
+ do {
+ /* Check for a constructed packet to be sent. */
+ if (ps.s_txreq) {
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ qp->s_flags |= RVT_S_BUSY;
+ ps.wait = iowait_get_ib_work(&priv->s_iowait);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+
+ /*
+ * If the packet cannot be sent now, return and
+			 * the send engine will be woken up later.
+ */
+ if (hfi1_verbs_send(qp, &ps))
+ return;
+
+ /* allow other tasks to run */
+ if (hfi1_schedule_send_yield(qp, &ps, true))
+ return;
+
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
+ ps.wait = iowait_get_tid_work(&priv->s_iowait);
+ if (iowait_flag_set(&priv->s_iowait,
+ IOWAIT_PENDING_IB))
+ hfi1_schedule_send(qp);
+ }
+ }
+ } while (hfi1_make_tid_rdma_pkt(qp, &ps));
+ iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+}
+
+static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+ return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
+ priv->s_sde ?
+ priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(dd->node)));
+}
+
+/**
+ * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
+ * @qp: the QP
+ *
+ * This schedules qp progress on the TID RDMA state machine. Caller
+ * should hold the s_lock.
+ * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
+ * the two state machines can step on each other with respect to the
+ * RVT_S_BUSY flag.
+ * Therefore, a modified test is used.
+ * @return true if the second leg is scheduled;
+ * false if the second leg is not scheduled.
+ */
+bool hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+ lockdep_assert_held(&qp->s_lock);
+ if (hfi1_send_tid_ok(qp)) {
+ /*
+ * The following call returns true if the qp is not on the
+ * queue and false if the qp is already on the queue before
+ * this call. Either way, the qp will be on the queue when the
+ * call returns.
+ */
+ _hfi1_schedule_tid_send(qp);
+ return true;
+ }
+ if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+ iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
+ IOWAIT_PENDING_TID);
+ return false;
+}
+
+bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e)
+{
+ struct rvt_ack_entry *prev;
+ struct tid_rdma_request *req;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 s_prev;
+
+ s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) :
+ (qp->s_tail_ack_queue - 1);
+ prev = &qp->s_ack_queue[s_prev];
+
+ if ((e->opcode == TID_OP(READ_REQ) ||
+ e->opcode == OP(RDMA_READ_REQUEST)) &&
+ prev->opcode == TID_OP(WRITE_REQ)) {
+ req = ack_to_tid_req(prev);
+ if (req->ack_seg != req->total_segs) {
+ priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK;
+ return true;
+ }
+ }
+ return false;
+}
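
hfi1_tid_rdma_ack_interlock() above walks one slot back in the s_ack_queue, wrapping from index 0 to the last valid index (rvt_size_atomic()). The sketch below shows just that wrap-around; the queue size used here is an assumption for illustration.

/* Illustration only -- not part of the patch. Queue size is assumed. */
#include <stdio.h>

#define EX_RVT_SIZE_ATOMIC 16   /* stands in for rvt_size_atomic(&dev->rdi) */

/* valid slots are 0 .. EX_RVT_SIZE_ATOMIC, so the slot before 0 is the last one */
static unsigned int ex_prev_ack_slot(unsigned int cur)
{
	return cur == 0 ? EX_RVT_SIZE_ATOMIC : cur - 1;
}

int main(void)
{
	printf("prev of 0 -> %u, prev of 3 -> %u\n",
	       ex_prev_ack_slot(0), ex_prev_ack_slot(3));
	return 0;
}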
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index a53598ce45b2..53ab24ef4f02 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -25,8 +25,34 @@
* s_flags, there are no collisions.
*
* HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock
+ * HFI1_R_TID_WAIT_INTERLCK - QP is waiting for responder interlock
*/
+#define HFI1_S_TID_BUSY_SET BIT(0)
+/* BIT(1) reserved for RVT_S_BUSY. */
+#define HFI1_R_TID_RSC_TIMER BIT(2)
+/* BIT(3) reserved for RVT_S_RESP_PENDING. */
+/* BIT(4) reserved for RVT_S_ACK_PENDING. */
#define HFI1_S_TID_WAIT_INTERLCK BIT(5)
+#define HFI1_R_TID_WAIT_INTERLCK BIT(6)
+/* BIT(7) - BIT(15) reserved for RVT_S_WAIT_*. */
+/* BIT(16) reserved for RVT_S_SEND_ONE */
+#define HFI1_S_TID_RETRY_TIMER BIT(17)
+/* BIT(18) reserved for RVT_S_ECN. */
+#define HFI1_R_TID_SW_PSN BIT(19)
+/* BIT(26) reserved for HFI1_S_WAIT_HALT */
+/* BIT(27) reserved for HFI1_S_WAIT_TID_RESP */
+/* BIT(28) reserved for HFI1_S_WAIT_TID_SPACE */
+
+/*
+ * Unlike regular IB RDMA VERBS, which do not require an entry
+ * in the s_ack_queue, TID RDMA WRITE requests do because they
+ * generate responses.
+ * Therefore, the s_ack_queue needs to be extended by a certain
+ * amount. The key point is that the queue needs to be extended
+ * without letting the "user" know, so the user doesn't end up
+ * using these extra entries.
+ */
+#define HFI1_TID_RDMA_WRITE_CNT 8
struct tid_rdma_params {
struct rcu_head rcu_head;
@@ -78,20 +104,25 @@ struct tid_rdma_request {
} e;
struct tid_rdma_flow *flows; /* array of tid flows */
+ struct rvt_sge_state ss; /* SGE state for TID RDMA requests */
u16 n_flows; /* size of the flow buffer window */
u16 setup_head; /* flow index we are setting up */
u16 clear_tail; /* flow index we are clearing */
u16 flow_idx; /* flow index most recently set up */
+ u16 acked_tail;
u32 seg_len;
u32 total_len;
+ u32 r_ack_psn; /* next expected ack PSN */
u32 r_flow_psn; /* IB PSN of next segment start */
+ u32 r_last_acked; /* IB PSN of last ACK'ed packet */
u32 s_next_psn; /* IB PSN of next segment start for read */
u32 total_segs; /* segments required to complete a request */
u32 cur_seg; /* index of current segment */
u32 comp_seg; /* index of last completed segment */
u32 ack_seg; /* index of last ack'ed segment */
+ u32 alloc_seg; /* index of next segment to be allocated */
u32 isge; /* index of "current" sge */
u32 ack_pending; /* num acks pending for this request */
@@ -158,11 +189,18 @@ struct tid_rdma_flow {
u8 npagesets;
u8 npkts;
u8 pkt;
+ u8 resync_npkts;
struct kern_tid_node tnode[TID_RDMA_MAX_PAGES];
struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES];
u32 tid_entry[TID_RDMA_MAX_PAGES];
};
+enum tid_rnr_nak_state {
+ TID_RNR_NAK_INIT = 0,
+ TID_RNR_NAK_SEND,
+ TID_RNR_NAK_SENT,
+};
+
bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data);
bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data);
bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data);
@@ -228,9 +266,57 @@ static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp,
struct rvt_swqe *wqe)
{
if (wqe->priv &&
- wqe->wr.opcode == IB_WR_RDMA_READ &&
+ (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_RDMA_WRITE) &&
wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE)
setup_tid_rdma_wqe(qp, wqe);
}
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len);
+
+void hfi1_compute_tid_rdma_flow_wt(void);
+
+void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet);
+
+u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 bth2, u32 *len,
+ struct rvt_sge_state **ss);
+
+void hfi1_del_tid_reap_timer(struct rvt_qp *qp);
+
+void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet);
+
+bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len);
+
+void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet);
+
+u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u16 iflow,
+ u32 *bth1, u32 *bth2);
+
+void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet);
+
+void hfi1_add_tid_retry_timer(struct rvt_qp *qp);
+void hfi1_del_tid_retry_timer(struct rvt_qp *qp);
+
+u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u16 fidx);
+
+void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet);
+
+struct hfi1_pkt_state;
+int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+void _hfi1_do_tid_send(struct work_struct *work);
+
+bool hfi1_schedule_tid_send(struct rvt_qp *qp);
+
+bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e);
+
#endif /* HFI1_TID_RDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c
index 28181d711fed..9a3d236bcc88 100644
--- a/drivers/infiniband/hw/hfi1/trace.c
+++ b/drivers/infiniband/hw/hfi1/trace.c
@@ -133,6 +133,11 @@ const char *hfi1_trace_get_packet_l2_str(u8 l2)
#define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x"
#define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
#define TID_READ_RSP_PRN "verbs_qp 0x%x"
+#define TID_WRITE_REQ_PRN "original_qp 0x%x"
+#define TID_WRITE_RSP_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_WRITE_DATA_PRN "verbs_qp 0x%x"
+#define TID_ACK_PRN "tid_flow_psn 0x%x verbs_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_RESYNC_PRN "verbs_qp 0x%x"
#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
@@ -327,6 +332,45 @@ const char *parse_everbs_hdrs(
parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
be32_to_cpu(eh->aeth) & IB_MSN_MASK);
break;
+ case OP(TID_RDMA, WRITE_REQ):
+ trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
+ TID_WRITE_REQ_PRN,
+ le32_to_cpu(eh->tid_rdma.w_req.kdeth0),
+ le32_to_cpu(eh->tid_rdma.w_req.kdeth1),
+ ib_u64_get(&eh->tid_rdma.w_req.reth.vaddr),
+ be32_to_cpu(eh->tid_rdma.w_req.reth.rkey),
+ be32_to_cpu(eh->tid_rdma.w_req.reth.length),
+ be32_to_cpu(eh->tid_rdma.w_req.verbs_qp));
+ break;
+ case OP(TID_RDMA, WRITE_RESP):
+ trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " "
+ TID_WRITE_RSP_PRN,
+ le32_to_cpu(eh->tid_rdma.w_rsp.kdeth0),
+ le32_to_cpu(eh->tid_rdma.w_rsp.kdeth1),
+ be32_to_cpu(eh->tid_rdma.w_rsp.aeth) >> 24,
+ parse_syndrome(/* aeth */
+ be32_to_cpu(eh->tid_rdma.w_rsp.aeth)
+ >> 24),
+ (be32_to_cpu(eh->tid_rdma.w_rsp.aeth) &
+ IB_MSN_MASK),
+ be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_psn),
+ be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_qp),
+ be32_to_cpu(eh->tid_rdma.w_rsp.verbs_qp));
+ break;
+ case OP(TID_RDMA, WRITE_DATA_LAST):
+ case OP(TID_RDMA, WRITE_DATA):
+ trace_seq_printf(p, TID_RDMA_KDETH_DATA " " TID_WRITE_DATA_PRN,
+ le32_to_cpu(eh->tid_rdma.w_data.kdeth0),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, KVER),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, SH),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, INTR),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, TIDCTRL),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, TID),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, OFFSET),
+ le32_to_cpu(eh->tid_rdma.w_data.kdeth1),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth1, JKEY),
+ be32_to_cpu(eh->tid_rdma.w_data.verbs_qp));
+ break;
case OP(TID_RDMA, READ_REQ):
trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
TID_READ_REQ_PRN,
@@ -359,6 +403,28 @@ const char *parse_everbs_hdrs(
IB_MSN_MASK),
be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp));
break;
+ case OP(TID_RDMA, ACK):
+ trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " "
+ TID_ACK_PRN,
+ le32_to_cpu(eh->tid_rdma.ack.kdeth0),
+ le32_to_cpu(eh->tid_rdma.ack.kdeth1),
+ be32_to_cpu(eh->tid_rdma.ack.aeth) >> 24,
+ parse_syndrome(/* aeth */
+ be32_to_cpu(eh->tid_rdma.ack.aeth)
+ >> 24),
+ (be32_to_cpu(eh->tid_rdma.ack.aeth) &
+ IB_MSN_MASK),
+ be32_to_cpu(eh->tid_rdma.ack.tid_flow_psn),
+ be32_to_cpu(eh->tid_rdma.ack.verbs_psn),
+ be32_to_cpu(eh->tid_rdma.ack.tid_flow_qp),
+ be32_to_cpu(eh->tid_rdma.ack.verbs_qp));
+ break;
+ case OP(TID_RDMA, RESYNC):
+ trace_seq_printf(p, TID_RDMA_KDETH " " TID_RESYNC_PRN,
+ le32_to_cpu(eh->tid_rdma.resync.kdeth0),
+ le32_to_cpu(eh->tid_rdma.resync.kdeth1),
+ be32_to_cpu(eh->tid_rdma.resync.verbs_qp));
+ break;
/* aeth + atomicacketh */
case OP(RC, ATOMIC_ACKNOWLEDGE):
trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
index 1116238bf24d..d1372cc66de6 100644
--- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
+++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
@@ -79,8 +79,14 @@ __print_symbolic(opcode, \
ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \
ib_opcode_name(RC_COMPARE_SWAP), \
ib_opcode_name(RC_FETCH_ADD), \
+ ib_opcode_name(TID_RDMA_WRITE_REQ), \
+ ib_opcode_name(TID_RDMA_WRITE_RESP), \
+ ib_opcode_name(TID_RDMA_WRITE_DATA), \
+ ib_opcode_name(TID_RDMA_WRITE_DATA_LAST), \
ib_opcode_name(TID_RDMA_READ_REQ), \
ib_opcode_name(TID_RDMA_READ_RESP), \
+ ib_opcode_name(TID_RDMA_RESYNC), \
+ ib_opcode_name(TID_RDMA_ACK), \
ib_opcode_name(UC_SEND_FIRST), \
ib_opcode_name(UC_SEND_MIDDLE), \
ib_opcode_name(UC_SEND_LAST), \
diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h
index b71638c22d4b..548dfc45a407 100644
--- a/drivers/infiniband/hw/hfi1/trace_tid.h
+++ b/drivers/infiniband/hw/hfi1/trace_tid.h
@@ -40,7 +40,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent);
#define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \
"r_psn 0x%x r_state 0x%x r_flags 0x%x " \
"r_head_ack_queue %u s_tail_ack_queue %u " \
- "s_ack_state 0x%x " \
+ "s_acked_ack_queue %u s_ack_state 0x%x " \
"s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \
"iow_flags 0x%lx"
@@ -52,20 +52,37 @@ u16 hfi1_trace_get_tid_idx(u32 ent);
#define TID_READ_SENDER_PRN "[%s] qpn 0x%x newreq %u tid_r_reqs %u " \
"tid_r_comp %u pending_tid_r_segs %u " \
"s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \
- "hw_flow_index %u generation 0x%x " \
+ "s_state 0x%x hw_flow_index %u generation 0x%x " \
"fpsn 0x%x flow_flags 0x%x"
#define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \
- "cur_seg %u comp_seg %u ack_seg %u " \
+ "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \
"total_segs %u setup_head %u clear_tail %u flow_idx %u " \
- "state %u r_flow_psn 0x%x " \
- "s_next_psn 0x%x"
+ "acked_tail %u state %u r_ack_psn 0x%x r_flow_psn 0x%x " \
+ "r_last_ackd 0x%x s_next_psn 0x%x"
#define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \
- "s_tail_ack_queue %u " \
+ "s_acked_ack_queue %u s_tail_ack_queue %u " \
"r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \
" diff %d"
+#define TID_WRITE_RSPDR_PRN "[%s] qpn 0x%x r_tid_head %u r_tid_tail %u " \
+ "r_tid_ack %u r_tid_alloc %u alloc_w_segs %u " \
+ "pending_tid_w_segs %u sync_pt %s " \
+ "ps_nak_psn 0x%x ps_nak_state 0x%x " \
+ "prnr_nak_state 0x%x hw_flow_index %u generation "\
+ "0x%x fpsn 0x%x flow_flags 0x%x resync %s" \
+ "r_next_psn_kdeth 0x%x"
+
+#define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \
+ "s_tid_tail %u s_tid_head %u " \
+ "pending_tid_w_resp %u n_requests %u " \
+ "n_tid_requests %u s_flags 0x%x ps_flags 0x%x "\
+ "iow_flags 0x%lx s_state 0x%x s_retry %u"
+
+#define KDETH_EFLAGS_ERR_PRN "[%s] qpn 0x%x TID ERR: RcvType 0x%x " \
+ "RcvTypeError 0x%x PSN 0x%x"
+
DECLARE_EVENT_CLASS(/* class */
hfi1_exp_tid_reg_unreg,
TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
@@ -382,6 +399,18 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, msg, more)
);
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_tid_timeout,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_tid_retry_timeout,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
DECLARE_EVENT_CLASS(/* tid_flow_page */
hfi1_tid_flow_page_template,
TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index,
@@ -562,6 +591,42 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, index, flow)
);
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_build_write_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_write_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_build_write_data,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_resync,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
DECLARE_EVENT_CLASS(/* tid_node */
hfi1_tid_node_template,
TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base,
@@ -656,6 +721,18 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, index, ent)
);
+DEFINE_EVENT(/* event */
+ hfi1_tid_entry_template, hfi1_tid_entry_rcv_write_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 entry),
+ TP_ARGS(qp, index, entry)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_entry_template, hfi1_tid_entry_build_write_data,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 entry),
+ TP_ARGS(qp, index, entry)
+);
+
DECLARE_EVENT_CLASS(/* rsp_info */
hfi1_responder_info_template,
TP_PROTO(struct rvt_qp *qp, u32 psn),
@@ -671,6 +748,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */
__field(u8, r_flags)
__field(u8, r_head_ack_queue)
__field(u8, s_tail_ack_queue)
+ __field(u8, s_acked_ack_queue)
__field(u8, s_ack_state)
__field(u8, s_nak_state)
__field(u8, r_nak_state)
@@ -691,6 +769,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */
__entry->r_flags = qp->r_flags;
__entry->r_head_ack_queue = qp->r_head_ack_queue;
__entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+ __entry->s_acked_ack_queue = qp->s_acked_ack_queue;
__entry->s_ack_state = qp->s_ack_state;
__entry->s_nak_state = qp->s_nak_state;
__entry->s_flags = qp->s_flags;
@@ -709,6 +788,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */
__entry->r_flags,
__entry->r_head_ack_queue,
__entry->s_tail_ack_queue,
+ __entry->s_acked_ack_queue,
__entry->s_ack_state,
__entry->s_nak_state,
__entry->s_flags,
@@ -735,6 +815,42 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, psn)
);
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_tid_write_alloc_res,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_req,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_build_tid_write_resp,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_data,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_make_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
DECLARE_EVENT_CLASS(/* sender_info */
hfi1_sender_info_template,
TP_PROTO(struct rvt_qp *qp),
@@ -827,6 +943,18 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp)
);
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_make_tid_pkt,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
DECLARE_EVENT_CLASS(/* tid_read_sender */
hfi1_tid_read_sender_template,
TP_PROTO(struct rvt_qp *qp, char newreq),
@@ -841,6 +969,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
__field(u32, s_flags)
__field(u32, ps_flags)
__field(unsigned long, iow_flags)
+ __field(u8, s_state)
__field(u32, hw_flow_index)
__field(u32, generation)
__field(u32, fpsn)
@@ -858,6 +987,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
__entry->s_flags = qp->s_flags;
__entry->ps_flags = priv->s_flags;
__entry->iow_flags = priv->s_iowait.flags;
+ __entry->s_state = priv->s_state;
__entry->hw_flow_index = priv->flow_state.index;
__entry->generation = priv->flow_state.generation;
__entry->fpsn = priv->flow_state.psn;
@@ -874,6 +1004,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
__entry->s_flags,
__entry->ps_flags,
__entry->iow_flags,
+ __entry->s_state,
__entry->hw_flow_index,
__entry->generation,
__entry->fpsn,
@@ -902,12 +1033,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */
__field(u32, cur_seg)
__field(u32, comp_seg)
__field(u32, ack_seg)
+ __field(u32, alloc_seg)
__field(u32, total_segs)
__field(u16, setup_head)
__field(u16, clear_tail)
__field(u16, flow_idx)
+ __field(u16, acked_tail)
__field(u32, state)
+ __field(u32, r_ack_psn)
__field(u32, r_flow_psn)
+ __field(u32, r_last_acked)
__field(u32, s_next_psn)
),
TP_fast_assign(/* assign */
@@ -920,12 +1055,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */
__entry->cur_seg = req->cur_seg;
__entry->comp_seg = req->comp_seg;
__entry->ack_seg = req->ack_seg;
+ __entry->alloc_seg = req->alloc_seg;
__entry->total_segs = req->total_segs;
__entry->setup_head = req->setup_head;
__entry->clear_tail = req->clear_tail;
__entry->flow_idx = req->flow_idx;
+ __entry->acked_tail = req->acked_tail;
__entry->state = req->state;
+ __entry->r_ack_psn = req->r_ack_psn;
__entry->r_flow_psn = req->r_flow_psn;
+ __entry->r_last_acked = req->r_last_acked;
__entry->s_next_psn = req->s_next_psn;
),
TP_printk(/* print */
@@ -939,12 +1078,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */
__entry->cur_seg,
__entry->comp_seg,
__entry->ack_seg,
+ __entry->alloc_seg,
__entry->total_segs,
__entry->setup_head,
__entry->clear_tail,
__entry->flow_idx,
+ __entry->acked_tail,
__entry->state,
+ __entry->r_ack_psn,
__entry->r_flow_psn,
+ __entry->r_last_acked,
__entry->s_next_psn
)
);
@@ -998,6 +1141,97 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
);
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_write_alloc_res,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_build_write_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_data,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_tid_retry_timeout,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_resync,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_pkt,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_rc_ack_write,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_write,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
DECLARE_EVENT_CLASS(/* rc_rcv_err */
hfi1_rc_rcv_err_template,
TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff),
@@ -1007,6 +1241,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */
__field(u32, qpn)
__field(u32, s_flags)
__field(u8, state)
+ __field(u8, s_acked_ack_queue)
__field(u8, s_tail_ack_queue)
__field(u8, r_head_ack_queue)
__field(u32, opcode)
@@ -1019,6 +1254,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */
__entry->qpn = qp->ibqp.qp_num;
__entry->s_flags = qp->s_flags;
__entry->state = qp->state;
+ __entry->s_acked_ack_queue = qp->s_acked_ack_queue;
__entry->s_tail_ack_queue = qp->s_tail_ack_queue;
__entry->r_head_ack_queue = qp->r_head_ack_queue;
__entry->opcode = opcode;
@@ -1032,6 +1268,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */
__entry->qpn,
__entry->s_flags,
__entry->state,
+ __entry->s_acked_ack_queue,
__entry->s_tail_ack_queue,
__entry->r_head_ack_queue,
__entry->opcode,
@@ -1081,6 +1318,289 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, index, sge)
);
+DECLARE_EVENT_CLASS(/* tid_write_sp */
+ hfi1_tid_write_rsp_template,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, r_tid_head)
+ __field(u32, r_tid_tail)
+ __field(u32, r_tid_ack)
+ __field(u32, r_tid_alloc)
+ __field(u32, alloc_w_segs)
+ __field(u32, pending_tid_w_segs)
+ __field(bool, sync_pt)
+ __field(u32, ps_nak_psn)
+ __field(u8, ps_nak_state)
+ __field(u8, prnr_nak_state)
+ __field(u32, hw_flow_index)
+ __field(u32, generation)
+ __field(u32, fpsn)
+ __field(u32, flow_flags)
+ __field(bool, resync)
+ __field(u32, r_next_psn_kdeth)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->r_tid_head = priv->r_tid_head;
+ __entry->r_tid_tail = priv->r_tid_tail;
+ __entry->r_tid_ack = priv->r_tid_ack;
+ __entry->r_tid_alloc = priv->r_tid_alloc;
+ __entry->alloc_w_segs = priv->alloc_w_segs;
+ __entry->pending_tid_w_segs = priv->pending_tid_w_segs;
+ __entry->sync_pt = priv->sync_pt;
+ __entry->ps_nak_psn = priv->s_nak_psn;
+ __entry->ps_nak_state = priv->s_nak_state;
+ __entry->prnr_nak_state = priv->rnr_nak_state;
+ __entry->hw_flow_index = priv->flow_state.index;
+ __entry->generation = priv->flow_state.generation;
+ __entry->fpsn = priv->flow_state.psn;
+ __entry->flow_flags = priv->flow_state.flags;
+ __entry->resync = priv->resync;
+ __entry->r_next_psn_kdeth = priv->r_next_psn_kdeth;
+ ),
+ TP_printk(/* print */
+ TID_WRITE_RSPDR_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->r_tid_head,
+ __entry->r_tid_tail,
+ __entry->r_tid_ack,
+ __entry->r_tid_alloc,
+ __entry->alloc_w_segs,
+ __entry->pending_tid_w_segs,
+ __entry->sync_pt ? "yes" : "no",
+ __entry->ps_nak_psn,
+ __entry->ps_nak_state,
+ __entry->prnr_nak_state,
+ __entry->hw_flow_index,
+ __entry->generation,
+ __entry->fpsn,
+ __entry->flow_flags,
+ __entry->resync ? "yes" : "no",
+ __entry->r_next_psn_kdeth
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_alloc_res,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_req,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_build_resp,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_data,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_resync,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_tid_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_rc_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* tid_write_sender */
+ hfi1_tid_write_sender_template,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, newreq)
+ __field(u32, s_tid_cur)
+ __field(u32, s_tid_tail)
+ __field(u32, s_tid_head)
+ __field(u32, pending_tid_w_resp)
+ __field(u32, n_requests)
+ __field(u32, n_tid_requests)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ __field(u8, s_state)
+ __field(u8, s_retry)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->newreq = newreq;
+ __entry->s_tid_cur = priv->s_tid_cur;
+ __entry->s_tid_tail = priv->s_tid_tail;
+ __entry->s_tid_head = priv->s_tid_head;
+ __entry->pending_tid_w_resp = priv->pending_tid_w_resp;
+ __entry->n_requests = atomic_read(&priv->n_requests);
+ __entry->n_tid_requests = atomic_read(&priv->n_tid_requests);
+ __entry->s_flags = qp->s_flags;
+ __entry->ps_flags = priv->s_flags;
+ __entry->iow_flags = priv->s_iowait.flags;
+ __entry->s_state = priv->s_state;
+ __entry->s_retry = priv->s_retry;
+ ),
+ TP_printk(/* print */
+ TID_WRITE_SENDER_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->newreq,
+ __entry->s_tid_cur,
+ __entry->s_tid_tail,
+ __entry->s_tid_head,
+ __entry->pending_tid_w_resp,
+ __entry->n_requests,
+ __entry->n_tid_requests,
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags,
+ __entry->s_state,
+ __entry->s_retry
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_retry_timeout,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_tid_pkt,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_restart_rc,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DECLARE_EVENT_CLASS(/* tid_ack */
+ hfi1_tid_ack_template,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+ u32 req_psn, u32 resync_psn),
+ TP_ARGS(qp, aeth, psn, req_psn, resync_psn),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, aeth)
+ __field(u32, psn)
+ __field(u32, req_psn)
+ __field(u32, resync_psn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->aeth = aeth;
+ __entry->psn = psn;
+ __entry->req_psn = req_psn;
+ __entry->resync_psn = resync_psn;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x aeth 0x%x psn 0x%x req_psn 0x%x resync_psn 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->aeth,
+ __entry->psn,
+ __entry->req_psn,
+ __entry->resync_psn
+ )
+);
+
+DEFINE_EVENT(/* rcv_tid_ack */
+ hfi1_tid_ack_template, hfi1_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+ u32 req_psn, u32 resync_psn),
+ TP_ARGS(qp, aeth, psn, req_psn, resync_psn)
+);
+
+DECLARE_EVENT_CLASS(/* kdeth_eflags_error */
+ hfi1_kdeth_eflags_error_template,
+ TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn),
+ TP_ARGS(qp, rcv_type, rte, psn),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u8, rcv_type)
+ __field(u8, rte)
+ __field(u32, psn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->rcv_type = rcv_type;
+ __entry->rte = rte;
+ __entry->psn = psn;
+ ),
+ TP_printk(/* print */
+ KDETH_EFLAGS_ERR_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->rcv_type,
+ __entry->rte,
+ __entry->psn
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_kdeth_eflags_error_template, hfi1_eflags_err_write,
+ TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn),
+ TP_ARGS(qp, rcv_type, rte, psn)
+);
+
#endif /* __HFI1_TRACE_TID_H */
#undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
index 37dbb3e599c3..09eb0c9ada00 100644
--- a/drivers/infiniband/hw/hfi1/trace_tx.h
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -846,6 +846,12 @@ DEFINE_EVENT(
TP_ARGS(qp, flag)
);
+DEFINE_EVENT(/* event */
+ hfi1_do_send_template, hfi1_rc_do_tid_send,
+ TP_PROTO(struct rvt_qp *qp, bool flag),
+ TP_ARGS(qp, flag)
+);
+
DEFINE_EVENT(
hfi1_do_send_template, hfi1_rc_expired_time_slice,
TP_PROTO(struct rvt_qp *qp, bool flag),
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index e5e7fad09f32..8bfbc6d7ea34 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -144,8 +144,10 @@ static int defer_packet_queue(
*/
xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
write_seqlock(&sde->waitlock);
- if (list_empty(&pq->busy.list))
+ if (list_empty(&pq->busy.list)) {
+ iowait_get_priority(&pq->busy);
iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
+ }
write_sequnlock(&sde->waitlock);
return -EBUSY;
eagain:
@@ -191,7 +193,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
pq->mm = fd->mm;
iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
- activate_packet_queue, NULL);
+ activate_packet_queue, NULL, NULL);
pq->reqidx = 0;
pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
@@ -1126,7 +1128,8 @@ static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
0xffffffull),
psn = val & mask;
if (expct)
- psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
+ psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
+ ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
else
psn = psn + frags;
return psn & mask;
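
The expected-receive branch above now advances only the KDETH sequence bits of the PSN and leaves the generation bits alone. The sketch below shows the effect with an assumed sequence-mask width; the actual HFI1_KDETH_BTH_SEQ_MASK value is defined elsewhere in the driver.

/* Illustration only -- not part of the patch. Mask width is an assumption. */
#include <stdint.h>
#include <stdio.h>

#define EX_KDETH_SEQ_MASK 0x7FFu   /* assumed: low bits carry the sequence */

static uint32_t ex_advance_expected_psn(uint32_t psn, uint32_t frags)
{
	/* the sequence wraps inside its mask; the generation bits never change */
	return (psn & ~EX_KDETH_SEQ_MASK) | ((psn + frags) & EX_KDETH_SEQ_MASK);
}

int main(void)
{
	uint32_t psn = 0x37FF;   /* sequence at its maximum for this generation */

	printf("0x%x + 1 frag -> 0x%x\n", psn, ex_advance_expected_psn(psn, 1));
	return 0;
}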
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 88676ca79fda..55a56b3d7f83 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -161,6 +161,7 @@ MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the
*/
const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+ [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE,
[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
[IB_WR_SEND] = IB_WC_SEND,
[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
@@ -203,6 +204,12 @@ const u8 hdr_len_by_opcode[256] = {
[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4,
[IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36,
[IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_REQ] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_RESP] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_ACK] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_RESYNC] = 12 + 8 + 36,
/* UC */
[IB_OPCODE_UC_SEND_FIRST] = 12 + 8,
[IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8,
@@ -248,8 +255,14 @@ static const opcode_handler opcode_handler_tbl[256] = {
[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv,
/* TID RDMA has separate handlers for different opcodes.*/
+ [IB_OPCODE_TID_RDMA_WRITE_REQ] = &hfi1_rc_rcv_tid_rdma_write_req,
+ [IB_OPCODE_TID_RDMA_WRITE_RESP] = &hfi1_rc_rcv_tid_rdma_write_resp,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA] = &hfi1_rc_rcv_tid_rdma_write_data,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data,
[IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req,
[IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp,
+ [IB_OPCODE_TID_RDMA_RESYNC] = &hfi1_rc_rcv_tid_rdma_resync,
+ [IB_OPCODE_TID_RDMA_ACK] = &hfi1_rc_rcv_tid_rdma_ack,
/* UC */
[IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv,
@@ -932,6 +945,7 @@ static int pio_wait(struct rvt_qp *qp,
dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN);
qp->s_flags |= flag;
was_empty = list_empty(&sc->piowait);
+ iowait_get_priority(&priv->s_iowait);
iowait_queue(ps->pkts_sent, &priv->s_iowait,
&sc->piowait);
priv->s_iowait.lock = &sc->waitlock;
@@ -1332,7 +1346,9 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
rdi->dparms.props.max_mr_size = U64_MAX;
rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
rdi->dparms.props.max_qp = hfi1_max_qps;
- rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
+ rdi->dparms.props.max_qp_wr =
+ (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ?
+ HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs);
rdi->dparms.props.max_send_sge = hfi1_max_sges;
rdi->dparms.props.max_recv_sge = hfi1_max_sges;
rdi->dparms.props.max_sge_rd = hfi1_max_sges;
@@ -1888,7 +1904,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
dd->verbs_dev.rdi.dparms.reserved_operations = 1;
- dd->verbs_dev.rdi.dparms.extra_rdma_atomic = 1;
+ dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT;
/* post send table */
dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index 841727a684d5..62ace0b2d17a 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -163,16 +163,39 @@ struct hfi1_qp_priv {
u32 tid_enqueue; /* saved when tid waited */
u8 s_sc; /* SC[0..4] for next packet */
struct iowait s_iowait;
+ struct timer_list s_tid_timer; /* for timing tid wait */
+ struct timer_list s_tid_retry_timer; /* for timing tid ack */
struct list_head tid_wait; /* for queueing tid space */
struct hfi1_opfn_data opfn;
struct tid_flow_state flow_state;
struct tid_rdma_qp_params tid_rdma;
struct rvt_qp *owner;
u8 hdr_type; /* 9B or 16B */
+ struct rvt_sge_state tid_ss; /* SGE state pointer for 2nd leg */
+ atomic_t n_requests; /* # of TID RDMA requests in the */
+ /* queue */
+ atomic_t n_tid_requests; /* # of sent TID RDMA requests */
unsigned long tid_timer_timeout_jiffies;
+ unsigned long tid_retry_timeout_jiffies;
/* variables for the TID RDMA SE state machine */
+ u8 s_state;
+ u8 s_retry;
+ u8 rnr_nak_state; /* RNR NAK state */
+ u8 s_nak_state;
+ u32 s_nak_psn;
u32 s_flags;
+ u32 s_tid_cur;
+ u32 s_tid_head;
+ u32 s_tid_tail;
+ u32 r_tid_head; /* Most recently added TID RDMA request */
+ u32 r_tid_tail; /* the last completed TID RDMA request */
+ u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */
+ u32 r_tid_alloc; /* Request for which we are allocating resources */
+ u32 pending_tid_w_segs; /* Num of pending tid write segments */
+ u32 pending_tid_w_resp; /* Num of pending tid write responses */
+ u32 alloc_w_segs; /* Number of segments for which write */
+ /* resources have been allocated for this QP */
/* For TID RDMA READ */
u32 tid_r_reqs; /* Num of tid reads requested */
@@ -180,14 +203,23 @@ struct hfi1_qp_priv {
u32 pending_tid_r_segs; /* Num of pending tid read segments */
u16 pkts_ps; /* packets per segment */
u8 timeout_shift; /* account for number of packets per segment */
+
+ u32 r_next_psn_kdeth;
+ u32 r_next_psn_kdeth_save;
+ u32 s_resync_psn;
+ u8 sync_pt; /* Set when QP reaches sync point */
+ u8 resync;
};
+#define HFI1_QP_WQE_INVALID ((u32)-1)
+
struct hfi1_swqe_priv {
struct tid_rdma_request tid_req;
struct rvt_sge_state ss; /* Used for TID RDMA READ Request */
};
struct hfi1_ack_priv {
+ struct rvt_sge_state ss; /* used for TID WRITE RESP */
struct tid_rdma_request tid_req;
};
@@ -412,6 +444,9 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ bool tid);
+
void _hfi1_do_send(struct work_struct *work);
void hfi1_do_send_from_rvt(struct rvt_qp *qp);
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h
index 2a77af26a231..b002e96eb335 100644
--- a/drivers/infiniband/hw/hfi1/verbs_txreq.h
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h
@@ -94,6 +94,7 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
tx->txreq.num_desc = 0;
/* Set the header type */
tx->phdr.hdr.hdr_type = priv->hdr_type;
+ tx->txreq.flags = 0;
return tx;
}
diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c
index 1f81c480e028..af1b1ffcb38e 100644
--- a/drivers/infiniband/hw/hfi1/vnic_sdma.c
+++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c
@@ -240,8 +240,10 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde,
}
vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED;
- if (list_empty(&vnic_sdma->wait.list))
+ if (list_empty(&vnic_sdma->wait.list)) {
+ iowait_get_priority(wait->iow);
iowait_queue(pkts_sent, wait->iow, &sde->dmawait);
+ }
write_sequnlock(&sde->waitlock);
return -EBUSY;
}
@@ -281,7 +283,7 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo)
iowait_init(&vnic_sdma->wait, 0, NULL, NULL,
hfi1_vnic_sdma_sleep,
- hfi1_vnic_sdma_wakeup, NULL);
+ hfi1_vnic_sdma_wakeup, NULL, NULL);
vnic_sdma->sde = &vinfo->dd->per_sdma[i];
vnic_sdma->dd = vinfo->dd;
vnic_sdma->vinfo = vinfo;