Diffstat (limited to 'drivers/infiniband/hw')
-rw-r--r--  drivers/infiniband/hw/hfi1/init.c             1
-rw-r--r--  drivers/infiniband/hw/hfi1/iowait.c          34
-rw-r--r--  drivers/infiniband/hw/hfi1/iowait.h          99
-rw-r--r--  drivers/infiniband/hw/hfi1/opfn.c             5
-rw-r--r--  drivers/infiniband/hw/hfi1/pio.c             18
-rw-r--r--  drivers/infiniband/hw/hfi1/qp.c              57
-rw-r--r--  drivers/infiniband/hw/hfi1/qp.h               5
-rw-r--r--  drivers/infiniband/hw/hfi1/rc.c             542
-rw-r--r--  drivers/infiniband/hw/hfi1/rc.h               1
-rw-r--r--  drivers/infiniband/hw/hfi1/ruc.c             32
-rw-r--r--  drivers/infiniband/hw/hfi1/sdma.c            24
-rw-r--r--  drivers/infiniband/hw/hfi1/sdma_txreq.h       1
-rw-r--r--  drivers/infiniband/hw/hfi1/tid_rdma.c      2504
-rw-r--r--  drivers/infiniband/hw/hfi1/tid_rdma.h        88
-rw-r--r--  drivers/infiniband/hw/hfi1/trace.c           66
-rw-r--r--  drivers/infiniband/hw/hfi1/trace_ibhdrs.h     6
-rw-r--r--  drivers/infiniband/hw/hfi1/trace_tid.h      532
-rw-r--r--  drivers/infiniband/hw/hfi1/trace_tx.h         6
-rw-r--r--  drivers/infiniband/hw/hfi1/user_sdma.c        9
-rw-r--r--  drivers/infiniband/hw/hfi1/verbs.c           20
-rw-r--r--  drivers/infiniband/hw/hfi1/verbs.h           35
-rw-r--r--  drivers/infiniband/hw/hfi1/verbs_txreq.h      1
-rw-r--r--  drivers/infiniband/hw/hfi1/vnic_sdma.c        6
23 files changed, 3961 insertions, 131 deletions
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index d13304f7340d..7841a0ad7cb6 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -1512,6 +1512,7 @@ static int __init hfi1_mod_init(void)
goto bail_dev;
}
+ hfi1_compute_tid_rdma_flow_wt();
/*
* These must be called before the driver is registered with
* the PCI subsystem.
diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c
index 582f1ba136ff..adb4a1ba921b 100644
--- a/drivers/infiniband/hw/hfi1/iowait.c
+++ b/drivers/infiniband/hw/hfi1/iowait.c
@@ -6,6 +6,9 @@
#include "iowait.h"
#include "trace_iowait.h"
+/* 1 priority == 16 starve_cnt */
+#define IOWAIT_PRIORITY_STARVE_SHIFT 4
+
void iowait_set_flag(struct iowait *wait, u32 flag)
{
trace_hfi1_iowait_set(wait, flag);
@@ -44,7 +47,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
uint seq,
bool pkts_sent),
void (*wakeup)(struct iowait *wait, int reason),
- void (*sdma_drained)(struct iowait *wait))
+ void (*sdma_drained)(struct iowait *wait),
+ void (*init_priority)(struct iowait *wait))
{
int i;
@@ -58,6 +62,7 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
wait->sleep = sleep;
wait->wakeup = wakeup;
wait->sdma_drained = sdma_drained;
+ wait->init_priority = init_priority;
wait->flags = 0;
for (i = 0; i < IOWAIT_SES; i++) {
wait->wait[i].iow = wait;
@@ -92,3 +97,30 @@ int iowait_set_work_flag(struct iowait_work *w)
iowait_set_flag(w->iow, IOWAIT_PENDING_TID);
return IOWAIT_TID_SE;
}
+
+/**
+ * iowait_priority_update_top - update the top priority entry
+ * @w: the iowait struct
+ * @top: a pointer to the top priority entry
+ * @idx: the index of the current iowait in an array
+ * @top_idx: the array index for the iowait entry that has the top priority
+ *
+ * This function is called to compare the priority of a given
+ * iowait with the given top priority entry. The top index will
+ * be returned.
+ */
+uint iowait_priority_update_top(struct iowait *w,
+ struct iowait *top,
+ uint idx, uint top_idx)
+{
+ u8 cnt, tcnt;
+
+	/* Convert priority into starve_cnt and compare the total. */
+ cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + w->starved_cnt;
+ tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) +
+ top->starved_cnt;
+ if (cnt > tcnt)
+ return idx;
+ else
+ return top_idx;
+}
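
For reference, a minimal standalone sketch (not part of the patch; illustrative names) of the weighting iowait_priority_update_top() applies: one priority point is worth 16 starvation events, so a waiter that was given priority (e.g. it has an ACK pending) outranks a waiter that has merely been passed over up to 15 times.

static uint pick_top_idx(u8 prio, u8 starved, u8 top_prio, u8 top_starved,
			 uint idx, uint top_idx)
{
	/* IOWAIT_PRIORITY_STARVE_SHIFT == 4: 1 priority == 16 starve_cnt */
	uint cnt = ((uint)prio << 4) + starved;
	uint tcnt = ((uint)top_prio << 4) + top_starved;

	return cnt > tcnt ? idx : top_idx;
}

Example: priority 1 / starved_cnt 3 scores 19 and wins over priority 0 / starved_cnt 15 (score 15).
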
diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
index 23a58ac0d47c..07847cb72169 100644
--- a/drivers/infiniband/hw/hfi1/iowait.h
+++ b/drivers/infiniband/hw/hfi1/iowait.h
@@ -100,6 +100,7 @@ struct iowait_work {
* @sleep: no space callback
* @wakeup: space callback wakeup
* @sdma_drained: sdma count drained
+ * @init_priority: callback to manipulate priority
* @lock: lock protected head of wait queue
* @iowork: workqueue overhead
* @wait_dma: wait for sdma_busy == 0
@@ -109,7 +110,7 @@ struct iowait_work {
* @tx_limit: limit for overflow queuing
 * @tx_count: number of tx entries in tx_head'ed list
* @flags: wait flags (one per QP)
- * @wait: SE array
+ * @wait: SE array for multiple legs
*
* This is to be embedded in user's state structure
* (QP or PQ).
@@ -120,10 +121,13 @@ struct iowait_work {
* are callbacks for the ULP to implement
* what ever queuing/dequeuing of
* the embedded iowait and its containing struct
- * when a resource shortage like SDMA ring space is seen.
+ * when a resource shortage like SDMA ring space
+ * or PIO credit space is seen.
*
 * Both potentially have locks held
- * so sleeping is not allowed.
+ * so sleeping is not allowed and it is not
+ * supported to submit txreqs from the wakeup
+ * call directly because of lock conflicts.
*
* The wait_dma member along with the iow
*
@@ -143,6 +147,7 @@ struct iowait {
);
void (*wakeup)(struct iowait *wait, int reason);
void (*sdma_drained)(struct iowait *wait);
+ void (*init_priority)(struct iowait *wait);
seqlock_t *lock;
wait_queue_head_t wait_dma;
wait_queue_head_t wait_pio;
@@ -152,6 +157,7 @@ struct iowait {
u32 tx_limit;
u32 tx_count;
u8 starved_cnt;
+ u8 priority;
unsigned long flags;
struct iowait_work wait[IOWAIT_SES];
};
@@ -171,7 +177,8 @@ void iowait_init(struct iowait *wait, u32 tx_limit,
uint seq,
bool pkts_sent),
void (*wakeup)(struct iowait *wait, int reason),
- void (*sdma_drained)(struct iowait *wait));
+ void (*sdma_drained)(struct iowait *wait),
+ void (*init_priority)(struct iowait *wait));
/**
* iowait_schedule() - schedule the default send engine work
@@ -186,6 +193,18 @@ static inline bool iowait_schedule(struct iowait *wait,
}
/**
+ * iowait_tid_schedule - schedule the tid SE
+ * @wait: the iowait structure
+ * @wq: the work queue
+ * @cpu: the cpu
+ */
+static inline bool iowait_tid_schedule(struct iowait *wait,
+ struct workqueue_struct *wq, int cpu)
+{
+ return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork);
+}
+
+/**
* iowait_sdma_drain() - wait for DMAs to drain
*
* @wait: iowait structure
@@ -327,6 +346,8 @@ static inline u16 iowait_get_desc(struct iowait_work *w)
tx = list_first_entry(&w->tx_head, struct sdma_txreq,
list);
num_desc = tx->num_desc;
+ if (tx->flags & SDMA_TXREQ_F_VIP)
+ w->iow->priority++;
}
return num_desc;
}
@@ -340,6 +361,37 @@ static inline u32 iowait_get_all_desc(struct iowait *w)
return num_desc;
}
+static inline void iowait_update_priority(struct iowait_work *w)
+{
+ struct sdma_txreq *tx = NULL;
+
+ if (!list_empty(&w->tx_head)) {
+ tx = list_first_entry(&w->tx_head, struct sdma_txreq,
+ list);
+ if (tx->flags & SDMA_TXREQ_F_VIP)
+ w->iow->priority++;
+ }
+}
+
+static inline void iowait_update_all_priority(struct iowait *w)
+{
+ iowait_update_priority(&w->wait[IOWAIT_IB_SE]);
+ iowait_update_priority(&w->wait[IOWAIT_TID_SE]);
+}
+
+static inline void iowait_init_priority(struct iowait *w)
+{
+ w->priority = 0;
+ if (w->init_priority)
+ w->init_priority(w);
+}
+
+static inline void iowait_get_priority(struct iowait *w)
+{
+ iowait_init_priority(w);
+ iowait_update_all_priority(w);
+}
+
/**
* iowait_queue - Put the iowait on a wait queue
* @pkts_sent: have some packets been sent before queuing?
@@ -356,14 +408,18 @@ static inline void iowait_queue(bool pkts_sent, struct iowait *w,
/*
* To play fair, insert the iowait at the tail of the wait queue if it
* has already sent some packets; Otherwise, put it at the head.
+ * However, if it has priority packets to send, also put it at the
+ * head.
*/
- if (pkts_sent) {
- list_add_tail(&w->list, wait_head);
+ if (pkts_sent)
w->starved_cnt = 0;
- } else {
- list_add(&w->list, wait_head);
+ else
w->starved_cnt++;
- }
+
+ if (w->priority > 0 || !pkts_sent)
+ list_add(&w->list, wait_head);
+ else
+ list_add_tail(&w->list, wait_head);
}
/**
@@ -380,27 +436,10 @@ static inline void iowait_starve_clear(bool pkts_sent, struct iowait *w)
w->starved_cnt = 0;
}
-/**
- * iowait_starve_find_max - Find the maximum of the starve count
- * @w: the iowait struct
- * @max: a variable containing the max starve count
- * @idx: the index of the current iowait in an array
- * @max_idx: a variable containing the array index for the
- * iowait entry that has the max starve count
- *
- * This function is called to compare the starve count of a
- * given iowait with the given max starve count. The max starve
- * count and the index will be updated if the iowait's start
- * count is larger.
- */
-static inline void iowait_starve_find_max(struct iowait *w, u8 *max,
- uint idx, uint *max_idx)
-{
- if (w->starved_cnt > *max) {
- *max = w->starved_cnt;
- *max_idx = idx;
- }
-}
+/* Update the top priority index */
+uint iowait_priority_update_top(struct iowait *w,
+ struct iowait *top,
+ uint idx, uint top_idx);
/**
* iowait_packet_queued() - determine if a packet is queued
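
As a usage sketch (this mirrors what iowait_sleep() in qp.c does further down; the helper name is made up and the caller is assumed to hold the wait-list lock), a producer recomputes the priority right before parking the iowait on a wait list, so the head-vs-tail decision in iowait_queue() sees an up-to-date value:

static void park_on_waitlist(struct iowait *w, struct list_head *wait_head,
			     bool pkts_sent)
{
	/* Derive priority from QP state plus any queued VIP txreqs. */
	iowait_get_priority(w);
	/* Priority or starved waiters go to the head, the rest to the tail. */
	iowait_queue(pkts_sent, w, wait_head);
}
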
diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c
index 2ca070690b2f..370a5a8eaa71 100644
--- a/drivers/infiniband/hw/hfi1/opfn.c
+++ b/drivers/infiniband/hw/hfi1/opfn.c
@@ -245,10 +245,15 @@ void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask)
struct hfi1_qp_priv *priv = qp->priv;
unsigned long flags;
+ if (attr_mask & IB_QP_RETRY_CNT)
+ priv->s_retry = attr->retry_cnt;
+
spin_lock_irqsave(&priv->opfn.lock, flags);
if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
struct tid_rdma_params *local = &priv->tid_rdma.local;
+ if (attr_mask & IB_QP_TIMEOUT)
+ priv->tid_retry_timeout_jiffies = qp->timeout_jiffies;
if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) ||
qp->pmtu == enum_to_mtu(OPA_MTU_8192)) {
tid_rdma_opfn_init(qp, local);
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 04126d7e318d..a1de566fe95e 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -1599,8 +1599,7 @@ static void sc_piobufavail(struct send_context *sc)
struct rvt_qp *qp;
struct hfi1_qp_priv *priv;
unsigned long flags;
- uint i, n = 0, max_idx = 0;
- u8 max_starved_cnt = 0;
+ uint i, n = 0, top_idx = 0;
if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
dd->send_contexts[sc->sw_index].type != SC_VL15)
@@ -1619,11 +1618,18 @@ static void sc_piobufavail(struct send_context *sc)
if (n == ARRAY_SIZE(qps))
break;
wait = list_first_entry(list, struct iowait, list);
+ iowait_get_priority(wait);
qp = iowait_to_qp(wait);
priv = qp->priv;
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
- iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx);
+ if (n) {
+ priv = qps[top_idx]->priv;
+ top_idx = iowait_priority_update_top(wait,
+ &priv->s_iowait,
+ n, top_idx);
+ }
+
/* refcount held until actual wake up */
qps[n++] = qp;
}
@@ -1638,12 +1644,12 @@ static void sc_piobufavail(struct send_context *sc)
}
write_sequnlock_irqrestore(&sc->waitlock, flags);
- /* Wake up the most starved one first */
+ /* Wake up the top-priority one first */
if (n)
- hfi1_qp_wakeup(qps[max_idx],
+ hfi1_qp_wakeup(qps[top_idx],
RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
for (i = 0; i < n; i++)
- if (i != max_idx)
+ if (i != top_idx)
hfi1_qp_wakeup(qps[i],
RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
}
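
The same select-the-top-waiter pattern shows up again in sdma_desc_avail() below. In generic form it is roughly the following (a sketch with made-up names, not code from the patch):

static uint drain_waiters(struct list_head *list, struct iowait **out, uint max)
{
	struct iowait *wait;
	uint n = 0, top_idx = 0;

	while (!list_empty(list) && n < max) {
		wait = list_first_entry(list, struct iowait, list);
		iowait_get_priority(wait);
		/* The first entry is the provisional top; compare the rest. */
		if (n)
			top_idx = iowait_priority_update_top(wait,
							     out[top_idx],
							     n, top_idx);
		list_del_init(&wait->list);
		out[n++] = wait;
	}
	return top_idx;	/* caller wakes out[top_idx] first, then the others */
}
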
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index acdd9eba189b..d8f7add935df 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -138,6 +138,12 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = {
.flags = RVT_OPERATION_USE_RESERVE,
},
+[IB_WR_TID_RDMA_WRITE] = {
+ .length = sizeof(struct ib_rdma_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_IGN_RNR_CNT,
+},
+
};
static void flush_list_head(struct list_head *l)
@@ -431,6 +437,11 @@ static void hfi1_qp_schedule(struct rvt_qp *qp)
if (ret)
iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
}
+ if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) {
+ ret = hfi1_schedule_tid_send(qp);
+ if (ret)
+ iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ }
}
void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
@@ -450,8 +461,27 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait)
{
- if (iowait_set_work_flag(wait) == IOWAIT_IB_SE)
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) {
qp->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we are sending a first-leg packet from the second leg,
+ * we need to clear the busy flag from priv->s_flags to
+ * avoid a race condition when the qp wakes up before
+ * the call to hfi1_verbs_send() returns to the second
+ * leg. In that case, the second leg will terminate without
+ * being re-scheduled, resulting in failure to send TID RDMA
+ * WRITE DATA and TID RDMA ACK packets.
+ */
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ priv->s_flags &= ~(HFI1_S_TID_BUSY_SET |
+ RVT_S_BUSY);
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ }
+ } else {
+ priv->s_flags &= ~RVT_S_BUSY;
+ }
}
static int iowait_sleep(
@@ -488,6 +518,7 @@ static int iowait_sleep(
ibp->rvp.n_dmawait++;
qp->s_flags |= RVT_S_WAIT_DMA_DESC;
+ iowait_get_priority(&priv->s_iowait);
iowait_queue(pkts_sent, &priv->s_iowait,
&sde->dmawait);
priv->s_iowait.lock = &sde->waitlock;
@@ -537,6 +568,17 @@ static void iowait_sdma_drained(struct iowait *wait)
spin_unlock_irqrestore(&qp->s_lock, flags);
}
+static void hfi1_init_priority(struct iowait *w)
+{
+ struct rvt_qp *qp = iowait_to_qp(w);
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (qp->s_flags & RVT_S_ACK_PENDING)
+ w->priority++;
+ if (priv->s_flags & RVT_S_ACK_PENDING)
+ w->priority++;
+}
+
/**
* qp_to_sdma_engine - map a qp to a send engine
* @qp: the QP
@@ -694,10 +736,11 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp)
&priv->s_iowait,
1,
_hfi1_do_send,
- NULL,
+ _hfi1_do_tid_send,
iowait_sleep,
iowait_wakeup,
- iowait_sdma_drained);
+ iowait_sdma_drained,
+ hfi1_init_priority);
return priv;
}
@@ -755,6 +798,8 @@ void quiesce_qp(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
+ hfi1_del_tid_reap_timer(qp);
+ hfi1_del_tid_retry_timer(qp);
iowait_sdma_drain(&priv->s_iowait);
qp_pio_drain(qp);
flush_tx_list(qp);
@@ -850,7 +895,8 @@ void notify_error_qp(struct rvt_qp *qp)
if (lock) {
write_seqlock(lock);
if (!list_empty(&priv->s_iowait.list) &&
- !(qp->s_flags & RVT_S_BUSY)) {
+ !(qp->s_flags & RVT_S_BUSY) &&
+ !(priv->s_flags & RVT_S_BUSY)) {
qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
@@ -859,7 +905,8 @@ void notify_error_qp(struct rvt_qp *qp)
write_sequnlock(lock);
}
- if (!(qp->s_flags & RVT_S_BUSY)) {
+ if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) {
+ qp->s_hdrwords = 0;
if (qp->s_rdma_mr) {
rvt_put_mr(qp->s_rdma_mr);
qp->s_rdma_mr = NULL;
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index ce25a27aa4a1..b670321365d3 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -64,12 +64,16 @@ extern const struct rvt_operation_params hfi1_post_parms[];
* HFI1_S_AHG_CLEAR - have send engine clear ahg state
* HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain
* HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource
+ * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response
+ * HFI1_S_WAIT_HALT - halt the first leg send engine
* HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1
*/
#define HFI1_S_AHG_VALID 0x80000000
#define HFI1_S_AHG_CLEAR 0x40000000
#define HFI1_S_WAIT_PIO_DRAIN 0x20000000
#define HFI1_S_WAIT_TID_SPACE 0x10000000
+#define HFI1_S_WAIT_TID_RESP 0x08000000
+#define HFI1_S_WAIT_HALT 0x04000000
#define HFI1_S_MIN_BIT_MASK 0x01000000
/*
@@ -78,6 +82,7 @@ extern const struct rvt_operation_params hfi1_post_parms[];
#define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN)
#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
+#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA)
/*
* Send if not busy or waiting for I/O and either
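
A quick compile-time sanity sketch (not in the patch; needs <linux/build_bug.h>) confirming that the two new bits sit inside hfi1's private flag range and do not collide with the existing HFI1_S_* flags:

static inline void hfi1_s_flags_check(void)
{
	BUILD_BUG_ON(HFI1_S_WAIT_TID_RESP < HFI1_S_MIN_BIT_MASK);
	BUILD_BUG_ON(HFI1_S_WAIT_HALT < HFI1_S_MIN_BIT_MASK);
	BUILD_BUG_ON(HFI1_S_WAIT_TID_RESP & HFI1_S_WAIT_HALT);
	BUILD_BUG_ON((HFI1_S_WAIT_TID_RESP | HFI1_S_WAIT_HALT) &
		     (HFI1_S_AHG_VALID | HFI1_S_AHG_CLEAR |
		      HFI1_S_WAIT_PIO_DRAIN | HFI1_S_WAIT_TID_SPACE));
}
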
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 6c9ef572fc69..e6726c1ab866 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -111,15 +111,17 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
struct hfi1_pkt_state *ps)
{
struct rvt_ack_entry *e;
- u32 hwords;
+ u32 hwords, hdrlen;
u32 len = 0;
u32 bth0 = 0, bth2 = 0;
u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
int middle = 0;
u32 pmtu = qp->pmtu;
- struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_qp_priv *qpriv = qp->priv;
bool last_pkt;
u32 delta;
+ u8 next = qp->s_tail_ack_queue;
+ struct tid_rdma_request *req;
trace_hfi1_rsp_make_rc_ack(qp, 0);
lockdep_assert_held(&qp->s_lock);
@@ -127,7 +129,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
goto bail;
- if (priv->hdr_type == HFI1_PKT_TYPE_9B)
+ if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
/* header size in 32-bit words LRH+BTH = (8+12)/4. */
hwords = 5;
else
@@ -149,9 +151,18 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
* response has been sent instead of only being
* constructed.
*/
- if (++qp->s_tail_ack_queue >
- rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
- qp->s_tail_ack_queue = 0;
+ if (++next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ /*
+ * Only advance the s_acked_ack_queue pointer if there
+ * have been no TID RDMA requests.
+ */
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->opcode != TID_OP(WRITE_REQ) &&
+ qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = next;
+ qp->s_tail_ack_queue = next;
+ trace_hfi1_rsp_make_rc_ack(qp, e->psn);
/* FALLTHROUGH */
case OP(SEND_ONLY):
case OP(ACKNOWLEDGE):
@@ -163,6 +174,12 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
}
e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ /* Check for tid write fence */
+ if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) ||
+ hfi1_tid_rdma_ack_interlock(qp, e)) {
+ iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB);
+ goto bail;
+ }
if (e->opcode == OP(RDMA_READ_REQUEST)) {
/*
* If a RDMA read response is being resent and
@@ -172,6 +189,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
*/
len = e->rdma_sge.sge_length;
if (len && !e->rdma_sge.mr) {
+ if (qp->s_acked_ack_queue ==
+ qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue =
+ qp->r_head_ack_queue;
qp->s_tail_ack_queue = qp->r_head_ack_queue;
goto bail;
}
@@ -193,6 +214,21 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
hwords++;
qp->s_ack_rdma_psn = e->psn;
bth2 = mask_psn(qp->s_ack_rdma_psn++);
+ } else if (e->opcode == TID_OP(WRITE_REQ)) {
+ /*
+ * If a TID RDMA WRITE RESP is being resent, we have to
+ * wait for the actual request. All requests that are to
+ * be resent will have their state set to
+ * TID_REQUEST_RESEND. When the new request arrives, the
+ * state will be changed to TID_REQUEST_RESEND_ACTIVE.
+ */
+ req = ack_to_tid_req(e);
+ if (req->state == TID_REQUEST_RESEND ||
+ req->state == TID_REQUEST_INIT_RESEND)
+ goto bail;
+ qp->s_ack_state = TID_OP(WRITE_RESP);
+ qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
+ goto write_resp;
} else if (e->opcode == TID_OP(READ_REQ)) {
/*
* If a TID RDMA read response is being resent and
@@ -202,6 +238,10 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
*/
len = e->rdma_sge.sge_length;
if (len && !e->rdma_sge.mr) {
+ if (qp->s_acked_ack_queue ==
+ qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue =
+ qp->r_head_ack_queue;
qp->s_tail_ack_queue = qp->r_head_ack_queue;
goto bail;
}
@@ -224,6 +264,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
bth2 = mask_psn(e->psn);
e->sent = 1;
}
+ trace_hfi1_tid_write_rsp_make_rc_ack(qp);
bth0 = qp->s_ack_state << 24;
break;
@@ -250,6 +291,61 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
bth2 = mask_psn(qp->s_ack_rdma_psn++);
break;
+ case TID_OP(WRITE_RESP):
+write_resp:
+ /*
+ * 1. Check if RVT_S_ACK_PENDING is set. If yes,
+ * goto normal.
+ * 2. Attempt to allocate TID resources.
+ * 3. Remove RVT_S_RESP_PENDING flags from s_flags
+ * 4. If resources not available:
+ * 4.1 Set RVT_S_WAIT_TID_SPACE
+ * 4.2 Queue QP on RCD TID queue
+ * 4.3 Put QP on iowait list.
+ * 4.4 Build IB RNR NAK with appropriate timeout value
+ * 4.5 Return indication progress made.
+ * 5. If resources are available:
+ * 5.1 Program HW flow CSRs
+ * 5.2 Build TID RDMA WRITE RESP packet
+ * 5.3 If more resources needed, do 2.1 - 2.3.
+ * 5.4 Wake up next QP on RCD TID queue.
+ * 5.5 Return indication progress made.
+ */
+
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ req = ack_to_tid_req(e);
+
+ /*
+		 * Send scheduled RNR NAKs. RNR NAKs need to be sent at
+		 * segment boundaries, not at request boundaries. Don't change
+		 * s_ack_state because we are still in the middle of a request.
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
+ qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
+ req->cur_seg == req->alloc_seg) {
+ qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
+ goto normal_no_state;
+ }
+
+ bth2 = mask_psn(qp->s_ack_rdma_psn);
+ hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
+ bth2, &len,
+ &ps->s_txreq->ss);
+ if (!hdrlen)
+ return 0;
+
+ hwords += hdrlen;
+ bth0 = qp->s_ack_state << 24;
+ qp->s_ack_rdma_psn++;
+ trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ if (req->cur_seg != req->total_segs)
+ break;
+
+ e->sent = 1;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ break;
+
case TID_OP(READ_RESP):
read_resp:
e = &qp->s_ack_queue[qp->s_tail_ack_queue];
@@ -281,8 +377,7 @@ normal:
* (see above).
*/
qp->s_ack_state = OP(SEND_ONLY);
- qp->s_flags &= ~RVT_S_ACK_PENDING;
- ps->s_txreq->ss = NULL;
+normal_no_state:
if (qp->s_nak_state)
ohdr->u.aeth =
cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
@@ -294,9 +389,12 @@ normal:
len = 0;
bth0 = OP(ACKNOWLEDGE) << 24;
bth2 = mask_psn(qp->s_ack_psn);
+ qp->s_flags &= ~RVT_S_ACK_PENDING;
+ ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
+ ps->s_txreq->ss = NULL;
}
qp->s_rdma_ack_cnt++;
- ps->s_txreq->sde = priv->s_sde;
+ ps->s_txreq->sde = qpriv->s_sde;
ps->s_txreq->s_cur_size = len;
ps->s_txreq->hdr_dwords = hwords;
hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
@@ -349,6 +447,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
int middle = 0;
int delta;
struct tid_rdma_flow *flow = NULL;
+ struct tid_rdma_params *remote;
trace_hfi1_sender_make_rc_req(qp);
lockdep_assert_held(&qp->s_lock);
@@ -397,7 +496,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
goto done_free_tx;
}
- if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
+ if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
goto bail;
if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
@@ -569,6 +668,113 @@ no_flow_control:
qp->s_cur = 0;
break;
+ case IB_WR_TID_RDMA_WRITE:
+ if (newreq) {
+ /*
+ * Limit the number of TID RDMA WRITE requests.
+ */
+ if (atomic_read(&priv->n_tid_requests) >=
+ HFI1_TID_RDMA_WRITE_CNT)
+ goto bail;
+
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ }
+
+ hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
+ &bth1, &bth2,
+ &len);
+ ss = NULL;
+ if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
+ priv->s_tid_cur = qp->s_cur;
+ if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
+ priv->s_tid_tail = qp->s_cur;
+ priv->s_state = TID_OP(WRITE_RESP);
+ }
+ } else if (priv->s_tid_cur == priv->s_tid_head) {
+ struct rvt_swqe *__w;
+ struct tid_rdma_request *__r;
+
+ __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+ __r = wqe_to_tid_req(__w);
+
+ /*
+ * The s_tid_cur pointer is advanced to s_cur if
+ * any of the following conditions about the WQE
+			 * to which s_tid_cur currently points are
+ * satisfied:
+ * 1. The request is not a TID RDMA WRITE
+ * request,
+ * 2. The request is in the INACTIVE or
+ * COMPLETE states (TID RDMA READ requests
+ * stay at INACTIVE and TID RDMA WRITE
+ * transition to COMPLETE when done),
+ * 3. The request is in the ACTIVE or SYNC
+ * state and the number of completed
+ * segments is equal to the total segment
+ * count.
+ * (If ACTIVE, the request is waiting for
+ * ACKs. If SYNC, the request has not
+ * received any responses because it's
+ * waiting on a sync point.)
+ */
+ if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
+ __r->state == TID_REQUEST_INACTIVE ||
+ __r->state == TID_REQUEST_COMPLETE ||
+ ((__r->state == TID_REQUEST_ACTIVE ||
+ __r->state == TID_REQUEST_SYNC) &&
+ __r->comp_seg == __r->total_segs)) {
+ if (priv->s_tid_tail ==
+ priv->s_tid_cur &&
+ priv->s_state ==
+ TID_OP(WRITE_DATA_LAST)) {
+ priv->s_tid_tail = qp->s_cur;
+ priv->s_state =
+ TID_OP(WRITE_RESP);
+ }
+ priv->s_tid_cur = qp->s_cur;
+ }
+ /*
+ * A corner case: when the last TID RDMA WRITE
+ * request was completed, s_tid_head,
+ * s_tid_cur, and s_tid_tail all point to the
+ * same location. Other requests are posted and
+ * s_cur wraps around to the same location,
+ * where a new TID RDMA WRITE is posted. In
+ * this case, none of the indices need to be
+ * updated. However, the priv->s_state should.
+ */
+ if (priv->s_tid_tail == qp->s_cur &&
+ priv->s_state == TID_OP(WRITE_DATA_LAST))
+ priv->s_state = TID_OP(WRITE_RESP);
+ }
+ req = wqe_to_tid_req(wqe);
+ if (newreq) {
+ priv->s_tid_head = qp->s_cur;
+ priv->pending_tid_w_resp += req->total_segs;
+ atomic_inc(&priv->n_tid_requests);
+ atomic_dec(&priv->n_requests);
+ } else {
+ req->state = TID_REQUEST_RESEND;
+ req->comp_seg = delta_psn(bth2, wqe->psn);
+ /*
+ * Pull back any segments since we are going
+ * to re-receive them.
+ */
+ req->setup_head = req->clear_tail;
+ priv->pending_tid_w_resp +=
+ delta_psn(wqe->lpsn, bth2) + 1;
+ }
+
+ trace_hfi1_tid_write_sender_make_req(qp, newreq);
+ trace_hfi1_tid_req_make_req_write(qp, newreq,
+ wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
case IB_WR_RDMA_READ:
/*
* Don't allow more operations to be started
@@ -728,7 +934,8 @@ no_flow_control:
if (qp->s_tail >= qp->s_size)
qp->s_tail = 0;
}
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
qp->s_psn = wqe->lpsn + 1;
else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
qp->s_psn = req->s_next_psn;
@@ -848,6 +1055,35 @@ no_flow_control:
if (qp->s_cur == qp->s_size)
qp->s_cur = 0;
break;
+
+ case TID_OP(WRITE_RESP):
+ /*
+ * This value for s_state is used for restarting a TID RDMA
+ * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE
+ * for more).
+ */
+ req = wqe_to_tid_req(wqe);
+ req->state = TID_REQUEST_RESEND;
+ rcu_read_lock();
+ remote = rcu_dereference(priv->tid_rdma.remote);
+ req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
+ len = wqe->length - (req->comp_seg * remote->max_len);
+ rcu_read_unlock();
+
+ bth2 = mask_psn(qp->s_psn);
+ hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
+ &bth2, &len);
+ qp->s_psn = wqe->lpsn + 1;
+ ss = NULL;
+ qp->s_state = TID_OP(WRITE_REQ);
+ priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
+ priv->s_tid_cur = qp->s_cur;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
+
case TID_OP(READ_RESP):
if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
goto bail;
@@ -948,7 +1184,8 @@ no_flow_control:
}
qp->s_sending_hpsn = bth2;
delta = delta_psn(bth2, wqe->psn);
- if (delta && delta % HFI1_PSN_CREDIT == 0)
+ if (delta && delta % HFI1_PSN_CREDIT == 0 &&
+ wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
bth2 |= IB_BTH_REQ_ACK;
if (qp->s_flags & RVT_S_SEND_ONE) {
qp->s_flags &= ~RVT_S_SEND_ONE;
@@ -981,6 +1218,12 @@ bail:
bail_no_tx:
ps->s_txreq = NULL;
qp->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we didn't get a txreq, the QP will be woken up later to try
+ * again. Set the flags to indicate which work item to wake
+ * up.
+ */
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
return 0;
}
@@ -1268,6 +1511,7 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
lockdep_assert_held(&qp->s_lock);
qp->s_cur = n;
priv->pending_tid_r_segs = 0;
+ priv->pending_tid_w_resp = 0;
qp->s_num_rd_atomic = 0;
/*
@@ -1325,6 +1569,10 @@ static void reset_psn(struct rvt_qp *qp, u32 psn)
qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
break;
+ case IB_WR_TID_RDMA_WRITE:
+ qp->s_state = TID_OP(WRITE_RESP);
+ break;
+
case IB_WR_RDMA_READ:
qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
break;
@@ -1389,6 +1637,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
wqe = do_rc_completion(qp, wqe, ibp);
qp->s_flags &= ~RVT_S_WAIT_ACK;
} else {
+ trace_hfi1_tid_write_sender_restart_rc(qp, 0);
if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
struct tid_rdma_request *req;
@@ -1418,7 +1667,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
- RVT_S_WAIT_ACK);
+ RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
if (wait)
qp->s_flags |= RVT_S_SEND_ONE;
reset_psn(qp, psn);
@@ -1426,7 +1675,8 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
/*
* Set qp->s_sending_psn to the next PSN after the given one.
- * This would be psn+1 except when RDMA reads are present.
+ * This would be psn+1 except when RDMA reads or TID RDMA ops
+ * are present.
*/
static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
{
@@ -1439,7 +1689,8 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
wqe = rvt_get_swqe_ptr(qp, n);
if (cmp_psn(psn, wqe->lpsn) <= 0) {
if (wqe->wr.opcode == IB_WR_RDMA_READ ||
- wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
qp->s_sending_psn = wqe->lpsn + 1;
else
qp->s_sending_psn = psn + 1;
@@ -1462,8 +1713,9 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
struct rvt_swqe *wqe;
struct ib_header *hdr = NULL;
struct hfi1_16b_header *hdr_16b = NULL;
- u32 opcode;
+ u32 opcode, head, tail;
u32 psn;
+ struct tid_rdma_request *req;
lockdep_assert_held(&qp->s_lock);
if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
@@ -1490,29 +1742,84 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
opcode = ib_bth_get_opcode(ohdr);
if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
- opcode == TID_OP(READ_RESP)) {
+ opcode == TID_OP(READ_RESP) ||
+ opcode == TID_OP(WRITE_RESP)) {
WARN_ON(!qp->s_rdma_ack_cnt);
qp->s_rdma_ack_cnt--;
return;
}
psn = ib_bth_get_psn(ohdr);
- reset_sending_psn(qp, psn);
+ /*
+ * Don't attempt to reset the sending PSN for packets in the
+ * KDETH PSN space since the PSN does not match anything.
+ */
+ if (opcode != TID_OP(WRITE_DATA) &&
+ opcode != TID_OP(WRITE_DATA_LAST) &&
+ opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
+ reset_sending_psn(qp, psn);
+
+ /* Handle TID RDMA WRITE packets differently */
+ if (opcode >= TID_OP(WRITE_REQ) &&
+ opcode <= TID_OP(WRITE_DATA_LAST)) {
+ head = priv->s_tid_head;
+ tail = priv->s_tid_cur;
+ /*
+		 * s_tid_cur is set to s_tid_head in the case where
+ * a new TID RDMA request is being started and all
+ * previous ones have been completed.
+ * Therefore, we need to do a secondary check in order
+ * to properly determine whether we should start the
+ * RC timer.
+ */
+ wqe = rvt_get_swqe_ptr(qp, tail);
+ req = wqe_to_tid_req(wqe);
+ if (head == tail && req->comp_seg < req->total_segs) {
+ if (tail == 0)
+ tail = qp->s_size - 1;
+ else
+ tail -= 1;
+ }
+ } else {
+ head = qp->s_tail;
+ tail = qp->s_acked;
+ }
/*
* Start timer after a packet requesting an ACK has been sent and
* there are still requests that haven't been acked.
*/
- if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+ if ((psn & IB_BTH_REQ_ACK) && tail != head &&
+ opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
+ opcode != TID_OP(RESYNC) &&
!(qp->s_flags &
- (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
- (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
if (opcode == TID_OP(READ_REQ))
rvt_add_retry_timer_ext(qp, priv->timeout_shift);
else
rvt_add_retry_timer(qp);
}
+ /* Start TID RDMA ACK timer */
+ if ((opcode == TID_OP(WRITE_DATA) ||
+ opcode == TID_OP(WRITE_DATA_LAST) ||
+ opcode == TID_OP(RESYNC)) &&
+ (psn & IB_BTH_REQ_ACK) &&
+ !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ /*
+ * The TID RDMA ACK packet could be received before this
+ * function is called. Therefore, add the timer only if TID
+ * RDMA ACK packets are actually pending.
+ */
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ req->ack_seg < req->cur_seg)
+ hfi1_add_tid_retry_timer(qp);
+ }
+
while (qp->s_last != qp->s_acked) {
u32 s_last;
@@ -1611,7 +1918,16 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
}
qp->s_retry = qp->s_retry_cnt;
- update_last_psn(qp, wqe->lpsn);
+ /*
+ * Don't update the last PSN if the request being completed is
+ * a TID RDMA WRITE request.
+	 * Completion of TID RDMA WRITE requests is done by the
+ * TID RDMA ACKs and as such could be for a request that has
+ * already been ACKed as far as the IB state machine is
+ * concerned.
+ */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ update_last_psn(qp, wqe->lpsn);
/*
* If we are completing a request which is in the process of
@@ -1641,6 +1957,54 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
return wqe;
}
+static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
+{
+ /* Retry this request. */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ }
+}
+
+/**
+ * update_qp_retry_state - Update qp retry state.
+ * @qp: the QP
+ * @psn: the packet sequence number of the TID RDMA WRITE RESP.
+ * @spsn: The start psn for the given TID RDMA WRITE swqe.
+ * @lpsn: The last psn for the given TID RDMA WRITE swqe.
+ *
+ * This function is called to update the qp retry state upon
+ * receiving a TID WRITE RESP after the qp is scheduled to retry
+ * a request.
+ */
+static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
+ u32 lpsn)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ qp->s_psn = psn + 1;
+ /*
+ * If this is the first TID RDMA WRITE RESP packet for the current
+ * request, change the s_state so that the retry will be processed
+ * correctly. Similarly, if this is the last TID RDMA WRITE RESP
+ * packet, change the s_state and advance the s_cur.
+ */
+ if (cmp_psn(psn, lpsn) >= 0) {
+ qp->s_cur = qpriv->s_tid_cur + 1;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ qp->s_state = TID_OP(WRITE_REQ);
+ } else if (!cmp_psn(psn, spsn)) {
+ qp->s_cur = qpriv->s_tid_cur;
+ qp->s_state = TID_OP(WRITE_RESP);
+ }
+}
+
/**
* do_rc_ack - process an incoming RC ACK
* @qp: the QP the ACK came in on
@@ -1662,6 +2026,7 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
int ret = 0;
u32 ack_psn;
int diff;
+ struct rvt_dev_info *rdi;
lockdep_assert_held(&qp->s_lock);
/*
@@ -1708,18 +2073,10 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
(opcode != TID_OP(READ_RESP) || diff != 0)) ||
((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
- (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
- /* Retry this request. */
- if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
- qp->r_flags |= RVT_R_RDMAR_SEQ;
- hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
- if (list_empty(&qp->rspwait)) {
- qp->r_flags |= RVT_R_RSP_SEND;
- rvt_get_qp(qp);
- list_add_tail(&qp->rspwait,
- &rcd->qp_wait_list);
- }
- }
+ (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
+ (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ (delta_psn(psn, qp->s_last_psn) != 1))) {
+ set_restart_qp(qp, rcd);
/*
* No need to process the ACK/NAK since we are
* restarting an earlier request.
@@ -1751,6 +2108,14 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
hfi1_schedule_send(qp);
}
}
+
+ /*
+ * TID RDMA WRITE requests will be completed by the TID RDMA
+ * ACK packet handler (see tid_rdma.c).
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+
wqe = do_rc_completion(qp, wqe, ibp);
if (qp->s_acked == qp->s_tail)
break;
@@ -1768,17 +2133,60 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
else
rvt_stop_rc_timers(qp);
} else if (qp->s_acked != qp->s_tail) {
+ struct rvt_swqe *__w = NULL;
+
+ if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
+ __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+
/*
- * We are expecting more ACKs so
- * mod the retry timer.
- */
- rvt_mod_retry_timer(qp);
- /*
- * We can stop re-sending the earlier packets and
- * continue with the next packet the receiver wants.
+ * Stop timers if we've received all of the TID RDMA
+		 * WRITE responses.
*/
- if (cmp_psn(qp->s_psn, psn) <= 0)
- reset_psn(qp, psn + 1);
+ if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ opcode == TID_OP(WRITE_RESP)) {
+ /*
+ * Normally, the loop above would correctly
+ * process all WQEs from s_acked onward and
+ * either complete them or check for correct
+ * PSN sequencing.
+ * However, for TID RDMA, due to pipelining,
+ * the response may not be for the request at
+			 * s_acked so the above loop would just be
+ * skipped. This does not allow for checking
+ * the PSN sequencing. It has to be done
+ * separately.
+ */
+ if (cmp_psn(psn, qp->s_last_psn + 1)) {
+ set_restart_qp(qp, rcd);
+ goto bail_stop;
+ }
+ /*
+ * If the psn is being resent, stop the
+ * resending.
+ */
+ if (qp->s_cur != qp->s_tail &&
+ cmp_psn(qp->s_psn, psn) <= 0)
+ update_qp_retry_state(qp, psn,
+ __w->psn,
+ __w->lpsn);
+ else if (--qpriv->pending_tid_w_resp)
+ rvt_mod_retry_timer(qp);
+ else
+ rvt_stop_rc_timers(qp);
+ } else {
+ /*
+ * We are expecting more ACKs so
+ * mod the retry timer.
+ */
+ rvt_mod_retry_timer(qp);
+ /*
+ * We can stop re-sending the earlier packets
+ * and continue with the next packet the
+ * receiver wants.
+ */
+ if (cmp_psn(qp->s_psn, psn) <= 0)
+ reset_psn(qp, psn + 1);
+ }
} else {
/* No more acks - kill all timers */
rvt_stop_rc_timers(qp);
@@ -1794,6 +2202,15 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
rvt_get_credit(qp, aeth);
qp->s_rnr_retry = qp->s_rnr_retry_cnt;
qp->s_retry = qp->s_retry_cnt;
+ /*
+ * If the current request is a TID RDMA WRITE request and the
+ * response is not a TID RDMA WRITE RESP packet, s_last_psn
+ * can't be advanced.
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ opcode != TID_OP(WRITE_RESP) &&
+ cmp_psn(psn, wqe->psn) >= 0)
+ return 1;
update_last_psn(qp, psn);
return 1;
@@ -1803,20 +2220,31 @@ int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
goto bail_stop;
if (qp->s_flags & RVT_S_WAIT_RNR)
goto bail_stop;
- if (qp->s_rnr_retry == 0) {
+ rdi = ib_to_rvt(qp->ibqp.device);
+ if (qp->s_rnr_retry == 0 &&
+ !((rdi->post_parms[wqe->wr.opcode].flags &
+ RVT_OPERATION_IGN_RNR_CNT) &&
+ qp->s_rnr_retry_cnt == 0)) {
status = IB_WC_RNR_RETRY_EXC_ERR;
goto class_b;
}
- if (qp->s_rnr_retry_cnt < 7)
+ if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
qp->s_rnr_retry--;
- /* The last valid PSN is the previous PSN. */
- update_last_psn(qp, psn - 1);
+ /*
+ * The last valid PSN is the previous PSN. For TID RDMA WRITE
+ * request, s_last_psn should be incremented only when a TID
+ * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
+ * WRITE RESP packets.
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ reset_psn(qp, qp->s_last_psn + 1);
+ } else {
+ update_last_psn(qp, psn - 1);
+ reset_psn(qp, psn);
+ }
ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
-
- reset_psn(qp, psn);
-
qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
rvt_stop_rc_timers(qp);
rvt_add_rnr_timer(qp, aeth);
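
The net effect of the RVT_OPERATION_IGN_RNR_CNT test above: for IB_WR_TID_RDMA_WRITE, which sets that flag in hfi1_post_parms[], an RNR retry count of 0 now means "retry indefinitely" instead of "fail on the first RNR NAK" (7 keeps its usual "infinite" meaning). A condensed sketch of that decision, using the same rvt fields as do_rc_ack() (helper name made up):

static bool rnr_retries_exhausted(struct rvt_qp *qp, struct rvt_dev_info *rdi,
				  u8 wr_opcode)
{
	/* retry_cnt == 0 with IGN_RNR_CNT set -> never give up */
	if ((rdi->post_parms[wr_opcode].flags & RVT_OPERATION_IGN_RNR_CNT) &&
	    qp->s_rnr_retry_cnt == 0)
		return false;
	/* otherwise give up once the per-QP retry budget hits zero */
	return qp->s_rnr_retry == 0;
}
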
@@ -1901,6 +2329,7 @@ static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
while (cmp_psn(psn, wqe->lpsn) > 0) {
if (wqe->wr.opcode == IB_WR_RDMA_READ ||
wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
break;
@@ -2235,6 +2664,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
e->psn = psn;
if (old_req)
goto unlock_done;
+ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = prev;
qp->s_tail_ack_queue = prev;
break;
}
@@ -2248,6 +2679,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
*/
if (!e || e->opcode != (u8)opcode || old_req)
goto unlock_done;
+ if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+ qp->s_acked_ack_queue = prev;
qp->s_tail_ack_queue = prev;
break;
}
@@ -2274,6 +2707,8 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
* Resend the RDMA read or atomic op which
* ACKs this duplicate request.
*/
+ if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+ qp->s_acked_ack_queue = mra;
qp->s_tail_ack_queue = mra;
break;
}
@@ -2388,6 +2823,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet)
void *data = packet->payload;
u32 tlen = packet->tlen;
struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
struct hfi1_ibport *ibp = rcd_to_iport(rcd);
struct ib_other_headers *ohdr = packet->ohdr;
u32 opcode = packet->opcode;
@@ -2646,7 +3082,7 @@ send_last:
if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
- if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (unlikely(next == qp->s_acked_ack_queue)) {
if (!qp->s_ack_queue[next].sent)
goto nack_inv_unlck;
update_ack_queue(qp, next);
@@ -2693,6 +3129,7 @@ send_last:
qp->r_state = opcode;
qp->r_nak_state = 0;
qp->r_head_ack_queue = next;
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
/* Schedule the send engine. */
qp->s_flags |= RVT_S_RESP_PENDING;
@@ -2723,7 +3160,7 @@ send_last:
if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
- if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (unlikely(next == qp->s_acked_ack_queue)) {
if (!qp->s_ack_queue[next].sent)
goto nack_inv_unlck;
update_ack_queue(qp, next);
@@ -2766,6 +3203,7 @@ ack:
qp->r_state = opcode;
qp->r_nak_state = 0;
qp->r_head_ack_queue = next;
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
/* Schedule the send engine. */
qp->s_flags |= RVT_S_RESP_PENDING;
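
Several hunks above pair qp->s_tail_ack_queue with the new qp->s_acked_ack_queue: s_tail_ack_queue still points at the entry whose response is being built, while s_acked_ack_queue trails it and only advances once the entry has no unacknowledged TID RDMA state left. Every place that rewinds the tail therefore drags the acked pointer back with it when the two are equal; in sketch form (same rvt_qp fields as above, helper name made up):

static void rewind_tail_ack_queue(struct rvt_qp *qp, u8 new_tail)
{
	/* Never let s_acked_ack_queue run ahead of s_tail_ack_queue. */
	if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
		qp->s_acked_ack_queue = new_tail;
	qp->s_tail_ack_queue = new_tail;
}
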
diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h
index 4329eadcb3df..8e0935b9bf2a 100644
--- a/drivers/infiniband/hw/hfi1/rc.h
+++ b/drivers/infiniband/hw/hfi1/rc.h
@@ -18,6 +18,7 @@ static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n)
if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
qp->s_tail_ack_queue = next;
+ qp->s_acked_ack_queue = next;
qp->s_ack_state = OP(ACKNOWLEDGE);
}
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index f96c0f544cb0..124a3ec1e15c 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -453,11 +453,13 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
#define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */
/**
- * schedule_send_yield - test for a yield required for QP send engine
+ * hfi1_schedule_send_yield - test for a yield required for QP
+ * send engine
* @timeout: Final time for timeout slice for jiffies
* @qp: a pointer to QP
 * @ps: a pointer to a structure with commonly looked-up values for
 *      the send engine progress
+ * @tid: true if it is the tid leg
*
* This routine checks if the time slice for the QP has expired
* for RC QPs, if so an additional work entry is queued. At this
@@ -465,8 +467,8 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
* returns true if a yield is required, otherwise, false
* is returned.
*/
-static bool schedule_send_yield(struct rvt_qp *qp,
- struct hfi1_pkt_state *ps)
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ bool tid)
{
ps->pkts_sent = true;
@@ -474,8 +476,24 @@ static bool schedule_send_yield(struct rvt_qp *qp,
if (!ps->in_thread ||
workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
spin_lock_irqsave(&qp->s_lock, ps->flags);
- qp->s_flags &= ~RVT_S_BUSY;
- hfi1_schedule_send(qp);
+ if (!tid) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ hfi1_schedule_send(qp);
+ } else {
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (priv->s_flags &
+ HFI1_S_TID_BUSY_SET) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ priv->s_flags &=
+ ~(HFI1_S_TID_BUSY_SET |
+ RVT_S_BUSY);
+ } else {
+ priv->s_flags &= ~RVT_S_BUSY;
+ }
+ hfi1_schedule_tid_send(qp);
+ }
+
spin_unlock_irqrestore(&qp->s_lock, ps->flags);
this_cpu_inc(*ps->ppd->dd->send_schedule);
trace_hfi1_rc_expired_time_slice(qp, true);
@@ -576,6 +594,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
do {
/* Check for a constructed packet to be sent. */
if (ps.s_txreq) {
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET)
+ qp->s_flags |= RVT_S_BUSY;
spin_unlock_irqrestore(&qp->s_lock, ps.flags);
/*
* If the packet cannot be sent now, return and
@@ -585,7 +605,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread)
return;
/* allow other tasks to run */
- if (schedule_send_yield(qp, &ps))
+ if (hfi1_schedule_send_yield(qp, &ps, false))
return;
spin_lock_irqsave(&qp->s_lock, ps.flags);
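
hfi1_schedule_send_yield() is now shared by both legs: hfi1_do_send() above passes tid == false, and the TID leg (hfi1_do_tid_send() in tid_rdma.c) is expected to pass tid == true from an analogous loop. A minimal sketch of such a caller, under that assumption (not the patch's code):

static void tid_send_loop(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
{
	do {
		/* ... build and send one TID RDMA packet ... */

		/*
		 * Yield if the time slice expired; tid == true makes the
		 * helper clear the TID-leg busy flags and reschedule the
		 * TID work item rather than the IB leg.
		 */
		if (hfi1_schedule_send_yield(qp, ps, true))
			return;
	} while (0 /* more TID work pending */);
}
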
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 96897a91fb0a..b0110728f541 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -1747,10 +1747,9 @@ retry:
*/
static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
{
- struct iowait *wait, *nw;
+ struct iowait *wait, *nw, *twait;
struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
- uint i, n = 0, seq, max_idx = 0;
- u8 max_starved_cnt = 0;
+ uint i, n = 0, seq, tidx = 0;
#ifdef CONFIG_SDMA_VERBOSITY
dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
@@ -1775,13 +1774,20 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
continue;
if (n == ARRAY_SIZE(waits))
break;
+ iowait_init_priority(wait);
num_desc = iowait_get_all_desc(wait);
if (num_desc > avail)
break;
avail -= num_desc;
- /* Find the most starved wait memeber */
- iowait_starve_find_max(wait, &max_starved_cnt,
- n, &max_idx);
+		/* Find the top-priority wait member */
+ if (n) {
+ twait = waits[tidx];
+ tidx =
+ iowait_priority_update_top(wait,
+ twait,
+ n,
+ tidx);
+ }
list_del_init(&wait->list);
waits[n++] = wait;
}
@@ -1790,12 +1796,12 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
}
} while (read_seqretry(&sde->waitlock, seq));
- /* Schedule the most starved one first */
+ /* Schedule the top-priority entry first */
if (n)
- waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON);
+ waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON);
for (i = 0; i < n; i++)
- if (i != max_idx)
+ if (i != tidx)
waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
}
diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h
index bf7d777d756e..514a4784566b 100644
--- a/drivers/infiniband/hw/hfi1/sdma_txreq.h
+++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h
@@ -91,6 +91,7 @@ struct sdma_desc {
#define SDMA_TXREQ_F_URGENT 0x0001
#define SDMA_TXREQ_F_AHG_COPY 0x0002
#define SDMA_TXREQ_F_USE_AHG 0x0004
+#define SDMA_TXREQ_F_VIP 0x0010
struct sdma_txreq;
typedef void (*callback_t)(struct sdma_txreq *, int);
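
SDMA_TXREQ_F_VIP is what ties the txreq layer to the new iowait priority scheme: a producer marks a request as "VIP", and iowait_get_desc()/iowait_update_priority() bump the owning iowait's priority whenever such a request sits at the head of its tx list. make_rc_ack() above does exactly this for ACK packets; in sketch form (helper name made up):

/* Sketch: flag an outgoing ACK so its QP jumps the wait queue if it
 * later has to block for SDMA descriptors (see make_rc_ack() above).
 */
static void mark_ack_as_vip(struct verbs_txreq *tx)
{
	tx->txreq.flags |= SDMA_TXREQ_F_VIP;
}
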
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 0ee79403acaf..bc2ff83026f7 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -109,12 +109,25 @@ static u32 mask_generation(u32 a)
* C - Capcode
*/
+static u32 tid_rdma_flow_wt;
+
static void tid_rdma_trigger_resume(struct work_struct *work);
static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
gfp_t gfp);
static void hfi1_init_trdma_req(struct rvt_qp *qp,
struct tid_rdma_request *req);
+static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
+static void hfi1_tid_timeout(struct timer_list *t);
+static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
+static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
+static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
+static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
+static void hfi1_tid_retry_timeout(struct timer_list *t);
+static int make_tid_rdma_ack(struct rvt_qp *qp,
+ struct ib_other_headers *ohdr,
+ struct hfi1_pkt_state *ps);
+static void hfi1_do_tid_send(struct rvt_qp *qp);
static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
{
@@ -313,6 +326,19 @@ int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
+ qpriv->s_state = TID_OP(WRITE_RESP);
+ qpriv->s_tid_cur = HFI1_QP_WQE_INVALID;
+ qpriv->s_tid_head = HFI1_QP_WQE_INVALID;
+ qpriv->s_tid_tail = HFI1_QP_WQE_INVALID;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qpriv->r_tid_head = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
+ atomic_set(&qpriv->n_requests, 0);
+ atomic_set(&qpriv->n_tid_requests, 0);
+ timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0);
+ timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0);
INIT_LIST_HEAD(&qpriv->tid_wait);
if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
@@ -1959,6 +1985,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
{
struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_qp_priv *qpriv = qp->priv;
struct rvt_ack_entry *e;
struct tid_rdma_request *req;
unsigned long flags;
@@ -1982,7 +2010,8 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
spin_lock_irqsave(&qp->s_lock, flags);
e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
- if (!e || e->opcode != TID_OP(READ_REQ))
+ if (!e || (e->opcode != TID_OP(READ_REQ) &&
+ e->opcode != TID_OP(WRITE_REQ)))
goto unlock;
req = ack_to_tid_req(e);
@@ -2042,8 +2071,119 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
*/
if (old_req)
goto unlock;
+ } else {
+ struct flow_state *fstate;
+ bool schedule = false;
+ u8 i;
+
+ if (req->state == TID_REQUEST_RESEND) {
+ req->state = TID_REQUEST_RESEND_ACTIVE;
+ } else if (req->state == TID_REQUEST_INIT_RESEND) {
+ req->state = TID_REQUEST_INIT;
+ schedule = true;
+ }
+
+ /*
+ * True if the request is already scheduled (between
+ * qp->s_tail_ack_queue and qp->r_head_ack_queue).
+ * Also, don't change requests, which are at the SYNC
+ * point and haven't generated any responses yet.
+ * There is nothing to retransmit for them yet.
+ */
+ if (old_req || req->state == TID_REQUEST_INIT ||
+ (req->state == TID_REQUEST_SYNC && !req->cur_seg)) {
+ for (i = prev + 1; ; i++) {
+ if (i > rvt_size_atomic(&dev->rdi))
+ i = 0;
+ if (i == qp->r_head_ack_queue)
+ break;
+ e = &qp->s_ack_queue[i];
+ req = ack_to_tid_req(e);
+ if (e->opcode == TID_OP(WRITE_REQ) &&
+ req->state == TID_REQUEST_INIT)
+ req->state = TID_REQUEST_INIT_RESEND;
+ }
+ /*
+ * If the state of the request has been changed,
+ * the first leg needs to get scheduled in order to
+ * pick up the change. Otherwise, normal response
+ * processing should take care of it.
+ */
+ if (!schedule)
+ goto unlock;
+ }
+
+ /*
+ * If there is no more allocated segment, just schedule the qp
+ * without changing any state.
+ */
+ if (req->clear_tail == req->setup_head)
+ goto schedule;
+ /*
+ * If this request has sent responses for segments, which have
+ * not received data yet (flow_idx != clear_tail), the flow_idx
+ * pointer needs to be adjusted so the same responses can be
+ * re-sent.
+ */
+ if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) {
+ fstate = &req->flows[req->clear_tail].flow_state;
+ qpriv->pending_tid_w_segs -=
+ CIRC_CNT(req->flow_idx, req->clear_tail,
+ MAX_FLOWS);
+ req->flow_idx =
+ CIRC_ADD(req->clear_tail,
+ delta_psn(psn, fstate->resp_ib_psn),
+ MAX_FLOWS);
+ qpriv->pending_tid_w_segs +=
+ delta_psn(psn, fstate->resp_ib_psn);
+ /*
+ * When flow_idx == setup_head, we've gotten a duplicate
+ * request for a segment, which has not been allocated
+ * yet. In that case, don't adjust this request.
+ * However, we still want to go through the loop below
+ * to adjust all subsequent requests.
+ */
+ if (CIRC_CNT(req->setup_head, req->flow_idx,
+ MAX_FLOWS)) {
+ req->cur_seg = delta_psn(psn, e->psn);
+ req->state = TID_REQUEST_RESEND_ACTIVE;
+ }
+ }
+
+ for (i = prev + 1; ; i++) {
+ /*
+ * Look at everything up to and including
+ * s_tail_ack_queue
+ */
+ if (i > rvt_size_atomic(&dev->rdi))
+ i = 0;
+ if (i == qp->r_head_ack_queue)
+ break;
+ e = &qp->s_ack_queue[i];
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ if (e->opcode != TID_OP(WRITE_REQ) ||
+ req->cur_seg == req->comp_seg ||
+ req->state == TID_REQUEST_INIT ||
+ req->state == TID_REQUEST_INIT_RESEND) {
+ if (req->state == TID_REQUEST_INIT)
+ req->state = TID_REQUEST_INIT_RESEND;
+ continue;
+ }
+ qpriv->pending_tid_w_segs -=
+ CIRC_CNT(req->flow_idx,
+ req->clear_tail,
+ MAX_FLOWS);
+ req->flow_idx = req->clear_tail;
+ req->state = TID_REQUEST_RESEND;
+ req->cur_seg = req->comp_seg;
+ }
+ qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
}
/* Re-process old requests.*/
+ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = prev;
qp->s_tail_ack_queue = prev;
/*
* Since the qp->s_tail_ack_queue is modified, the
@@ -2052,6 +2192,18 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
* wrong memory region.
*/
qp->s_ack_state = OP(ACKNOWLEDGE);
+schedule:
+ /*
+ * It's possible to receive a retry psn that is earlier than an RNRNAK
+ * psn. In this case, the rnrnak state should be cleared.
+ */
+ if (qpriv->rnr_nak_state) {
+ qp->s_nak_state = 0;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qp->r_psn = e->lpsn + 1;
+ hfi1_tid_write_alloc_resources(qp, true);
+ }
+
qp->r_state = e->opcode;
qp->r_nak_state = 0;
qp->s_flags |= RVT_S_RESP_PENDING;
@@ -2162,6 +2314,14 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
qp->r_head_ack_queue = next;
+ /*
+ * For all requests other than TID WRITE which are added to the ack
+ * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to
+ * do this because of interlocks between these and TID WRITE
+ * requests. The same change has also been made in hfi1_rc_rcv().
+ */
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
+
/* Schedule the send tasklet. */
qp->s_flags |= RVT_S_RESP_PENDING;
hfi1_schedule_send(qp);
@@ -2418,13 +2578,32 @@ static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
u8 opcode)
{
struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
u32 ipsn;
struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ u32 i;
if (rcv_type >= RHF_RCV_TYPE_IB)
goto done;
spin_lock(&qp->s_lock);
+
+ /*
+	 * We've run out of space in the eager buffer.
+ * Eagerly received KDETH packets which require space in the
+	 * Eager buffer (packets that have payload) are TID RDMA WRITE
+ * response packets. In this case, we have to re-transmit the
+ * TID RDMA WRITE request.
+ */
+ if (rcv_type == RHF_RCV_TYPE_EAGER) {
+ hfi1_restart_rc(qp, qp->s_last_psn + 1, 1);
+ hfi1_schedule_send(qp);
+ goto done_unlock;
+ }
+
/*
* For TID READ response, error out QP after freeing the tid
* resources.
@@ -2438,8 +2617,25 @@ static bool tid_rdma_tid_err(struct hfi1_ctxtdata *rcd,
rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
goto done;
}
+ goto done_unlock;
}
+ /*
+ * Error out the qp for TID RDMA WRITE
+ */
+ hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ e = &qp->s_ack_queue[i];
+ if (e->opcode == TID_OP(WRITE_REQ)) {
+ req = ack_to_tid_req(e);
+ hfi1_kern_exp_rcv_clear_all(req);
+ }
+ }
+ spin_unlock(&qp->s_lock);
+ rvt_rc_error(qp, IB_WC_LOC_LEN_ERR);
+ goto done;
+
+done_unlock:
spin_unlock(&qp->s_lock);
done:
return true;
@@ -2689,8 +2885,12 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
u8 opcode;
u32 qp_num, psn, ibpsn;
struct rvt_qp *qp;
+ struct hfi1_qp_priv *qpriv;
unsigned long flags;
bool ret = true;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
packet->rhf);
@@ -2749,14 +2949,116 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
ibpsn = mask_psn(ibpsn);
ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
ibpsn);
+ goto r_unlock;
+ }
+
+ /*
+ * qp->s_tail_ack_queue points to the rvt_ack_entry currently being
+	 * processed. These are completed sequentially, so we can be sure that
+ * the pointer will not change until the entire request has completed.
+ */
+ spin_lock(&qp->s_lock);
+ qpriv = qp->priv;
+ e = &qp->s_ack_queue[qpriv->r_tid_tail];
+ req = ack_to_tid_req(e);
+ flow = &req->flows[req->clear_tail];
+ trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn);
+ trace_hfi1_rsp_handle_kdeth_eflags(qp, psn);
+ trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp);
+ trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow);
+
+ switch (rcv_type) {
+ case RHF_RCV_TYPE_EXPECTED:
+ switch (rte) {
+ case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
+ if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
+ u64 reg;
+
+ qpriv->s_flags |= HFI1_R_TID_SW_PSN;
+ /*
+ * The only sane way to get the amount of
+ * progress is to read the HW flow state.
+ */
+ reg = read_uctxt_csr(dd, rcd->ctxt,
+ RCV_TID_FLOW_TABLE +
+ (8 * flow->idx));
+ flow->flow_state.r_next_psn = mask_psn(reg);
+ qpriv->r_next_psn_kdeth =
+ flow->flow_state.r_next_psn;
+ goto nak_psn;
+ } else {
+ /*
+ * If the received PSN does not match the next
+ * expected PSN, NAK the packet.
+				 * However, only do that if we know that a
+ * NAK has already been sent. Otherwise, this
+ * mismatch could be due to packets that were
+ * already in flight.
+ */
+ if (psn != flow->flow_state.r_next_psn) {
+ psn = flow->flow_state.r_next_psn;
+ goto nak_psn;
+ }
+
+ qpriv->s_nak_state = 0;
+ /*
+ * If SW PSN verification is successful and this
+ * is the last packet in the segment, tell the
+ * caller to process it as a normal packet.
+ */
+ if (psn == full_flow_psn(flow,
+ flow->flow_state.lpsn))
+ ret = false;
+ qpriv->r_next_psn_kdeth =
+ ++flow->flow_state.r_next_psn;
+ }
+ break;
+
+ case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
+ goto nak_psn;
+
+ default:
+ break;
+ }
+ break;
+
+ case RHF_RCV_TYPE_ERROR:
+ switch (rte) {
+ case RHF_RTE_ERROR_OP_CODE_ERR:
+ case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
+ case RHF_RTE_ERROR_KHDR_HCRC_ERR:
+ case RHF_RTE_ERROR_KHDR_KVER_ERR:
+ case RHF_RTE_ERROR_CONTEXT_ERR:
+ case RHF_RTE_ERROR_KHDR_TID_ERR:
+ default:
+ break;
+ }
+ default:
+ break;
}
+unlock:
+ spin_unlock(&qp->s_lock);
r_unlock:
spin_unlock_irqrestore(&qp->r_lock, flags);
rcu_unlock:
rcu_read_unlock();
drop:
return ret;
+nak_psn:
+ ibp->rvp.n_rc_seqnak++;
+ if (!qpriv->s_nak_state) {
+ qpriv->s_nak_state = IB_NAK_PSN_ERROR;
+ /* We are NAK'ing the next expected PSN */
+ qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn);
+ qpriv->s_flags |= RVT_S_ACK_PENDING;
+ if (qpriv->r_tid_ack == HFI1_QP_WQE_INVALID)
+ qpriv->r_tid_ack = qpriv->r_tid_tail;
+ hfi1_schedule_tid_send(qp);
+ }
+ goto unlock;
}
/*
@@ -2770,8 +3072,9 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
{
struct tid_rdma_request *req = wqe_to_tid_req(wqe);
struct tid_rdma_flow *flow;
- int diff;
- u32 tididx = 0;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ int diff, delta_pkts;
+ u32 tididx = 0, i;
u16 fidx;
if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
@@ -2787,11 +3090,20 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
return;
}
} else {
- return;
+ fidx = req->acked_tail;
+ flow = &req->flows[fidx];
+ *bth2 = mask_psn(req->r_ack_psn);
}
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn);
+ else
+ delta_pkts = delta_psn(*bth2,
+ full_flow_psn(flow,
+ flow->flow_state.spsn));
+
trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
- diff = delta_psn(*bth2, flow->flow_state.ib_spsn);
+ diff = delta_pkts + flow->resync_npkts;
flow->sent = 0;
flow->pkt = 0;
@@ -2815,6 +3127,18 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
break;
}
}
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) +
+ flow->sent, 0);
+ /*
+ * Packet PSN is based on flow_state.spsn + flow->pkt. However,
+ * during a RESYNC, the generation is incremented and the
+ * sequence is reset to 0. Since we've adjusted the npkts in the
+ * flow and the SGE has been sufficiently advanced, we have to
+ * adjust flow->pkt in order to calculate the correct PSN.
+ */
+ flow->pkt -= flow->resync_npkts;
+ }
if (flow->tid_offset ==
EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
@@ -2822,13 +3146,42 @@ void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
flow->tid_offset = 0;
}
flow->tid_idx = tididx;
- /* Move flow_idx to correct index */
- req->flow_idx = fidx;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ /* Move flow_idx to correct index */
+ req->flow_idx = fidx;
+ else
+ req->clear_tail = fidx;
trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
wqe->lpsn, req);
req->state = TID_REQUEST_ACTIVE;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ /* Reset all the flows that we are going to resend */
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS);
+ i = qpriv->s_tid_tail;
+ do {
+ for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS);
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
+ req->flows[fidx].sent = 0;
+ req->flows[fidx].pkt = 0;
+ req->flows[fidx].tid_idx = 0;
+ req->flows[fidx].tid_offset = 0;
+ req->flows[fidx].resync_npkts = 0;
+ }
+ if (i == qpriv->s_tid_cur)
+ break;
+ do {
+ i = (++i == qp->s_size ? 0 : i);
+ wqe = rvt_get_swqe_ptr(qp, i);
+ } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE);
+ req = wqe_to_tid_req(wqe);
+ req->cur_seg = req->ack_seg;
+ fidx = req->acked_tail;
+ /* Pull req->clear_tail back */
+ req->clear_tail = fidx;
+ } while (1);
+ }
}
void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
@@ -2862,6 +3215,20 @@ void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
} while (!ret);
}
+ for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) {
+ struct rvt_ack_entry *e = &qp->s_ack_queue[i];
+
+ if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device)))
+ i = 0;
+ /* Free only locally allocated TID entries */
+ if (e->opcode != TID_OP(WRITE_REQ))
+ continue;
+ do {
+ struct hfi1_ack_priv *priv = e->priv;
+
+ ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
+ } while (!ret);
+ }
}
bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
@@ -2869,6 +3236,7 @@ bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
struct rvt_swqe *prev;
struct hfi1_qp_priv *priv = qp->priv;
u32 s_prev;
+ struct tid_rdma_request *req;
s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
prev = rvt_get_swqe_ptr(qp, s_prev);
@@ -2880,14 +3248,28 @@ bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
case IB_WR_ATOMIC_CMP_AND_SWP:
case IB_WR_ATOMIC_FETCH_AND_ADD:
case IB_WR_RDMA_WRITE:
+ switch (prev->wr.opcode) {
+ case IB_WR_TID_RDMA_WRITE:
+ req = wqe_to_tid_req(prev);
+ if (req->ack_seg != req->total_segs)
+ goto interlock;
+ default:
+ break;
+ }
case IB_WR_RDMA_READ:
- break;
+ if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ break;
+ /* fall through */
case IB_WR_TID_RDMA_READ:
switch (prev->wr.opcode) {
case IB_WR_RDMA_READ:
if (qp->s_acked != qp->s_cur)
goto interlock;
break;
+ case IB_WR_TID_RDMA_WRITE:
+ req = wqe_to_tid_req(prev);
+ if (req->ack_seg != req->total_segs)
+ goto interlock;
default:
break;
}
@@ -2946,6 +3328,18 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
new_opcode = IB_WR_TID_RDMA_READ;
do_tid_rdma = true;
}
+ } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+ /*
+ * TID RDMA is enabled for this RDMA WRITE request iff:
+ * 1. The remote address is page-aligned,
+ * 2. The length is larger than the minimum segment size,
+ * 3. The length is page-multiple.
+ */
+ if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) &&
+ !(wqe->length & ~PAGE_MASK)) {
+ new_opcode = IB_WR_TID_RDMA_WRITE;
+ do_tid_rdma = true;
+ }
}
if (do_tid_rdma) {
@@ -2962,12 +3356,22 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
priv->tid_req.n_flows = remote->max_read;
qpriv->tid_r_reqs++;
wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
+ } else {
+ wqe->lpsn += priv->tid_req.total_segs - 1;
+ atomic_inc(&qpriv->n_requests);
}
priv->tid_req.cur_seg = 0;
priv->tid_req.comp_seg = 0;
priv->tid_req.ack_seg = 0;
priv->tid_req.state = TID_REQUEST_INACTIVE;
+ /*
+ * Reset acked_tail.
+ * TID RDMA READ does not have ACKs so it does not
+ * update the pointer. We have to reset it so TID RDMA
+ * WRITE does not get confused.
+ */
+ priv->tid_req.acked_tail = priv->tid_req.setup_head;
trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
wqe->psn, wqe->lpsn,
&priv->tid_req);
@@ -2975,3 +3379,2087 @@ void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
exit:
rcu_read_unlock();
}
+
+/* TID RDMA WRITE functions */
+
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_params *remote;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ /*
+	 * Set the number of flows to be used based on negotiated
+ * parameters.
+ */
+ req->n_flows = remote->max_write;
+ req->state = TID_REQUEST_ACTIVE;
+
+ KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1);
+ KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.w_req.reth.vaddr =
+ cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len));
+ ohdr->u.tid_rdma.w_req.reth.rkey =
+ cpu_to_be32(wqe->rdma_wr.rkey);
+ ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
+ ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 &= ~RVT_QPN_MASK;
+ *bth1 |= remote->qp;
+ qp->s_state = TID_OP(WRITE_REQ);
+ qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+ *bth2 |= IB_BTH_REQ_ACK;
+ *len = 0;
+
+ rcu_read_unlock();
+ return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
+}
+
+void hfi1_compute_tid_rdma_flow_wt(void)
+{
+ /*
+ * Heuristic for computing the RNR timeout when waiting on the flow
+	 * queue. Rather than a computationally expensive exact estimate of when
+ * a flow will be available, we assume that if a QP is at position N in
+ * the flow queue it has to wait approximately (N + 1) * (number of
+ * segments between two sync points), assuming PMTU of 4K. The rationale
+ * for this is that flows are released and recycled at each sync point.
+ */
+ tid_rdma_flow_wt = MAX_TID_FLOW_PSN * enum_to_mtu(OPA_MTU_4096) /
+ TID_RDMA_MAX_SEGMENT_SIZE;
+}
+
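+/*
+ * Approximate position of this QP in the TID wait queue: its enqueue
+ * number minus the number of entries dequeued so far.
+ */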
+static u32 position_in_queue(struct hfi1_qp_priv *qpriv,
+ struct tid_queue *queue)
+{
+ return qpriv->tid_enqueue - queue->dequeue;
+}
+
+/*
+ * @qp: points to rvt_qp context.
+ * @to_seg: desired RNR timeout in segments.
+ * Return: index of the next highest timeout in the ib_hfi1_rnr_table[]
+ */
+static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u64 timeout;
+ u32 bytes_per_us;
+ u8 i;
+
+ bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8;
+ timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us;
+ /*
+	 * Find the next value in the RNR table at or above the required
+ * timeout. This gives the responder some padding.
+ */
+ for (i = 1; i <= IB_AETH_CREDIT_MASK; i++)
+ if (rvt_rnr_tbl_to_usec(i) >= timeout)
+ return i;
+ return 0;
+}
+
+/**
+ * Central place for resource allocation at the TID write responder.
+ * It is called from the write_req and write_data interrupt handlers as
+ * well as the send thread when a queued QP is scheduled for
+ * resource allocation.
+ *
+ * Iterates over (a) segments of a request and then (b) queued requests
+ * themselves to allocate resources for up to local->max_write
+ * segments across multiple requests. Stop allocating when we
+ * hit a sync point; resume allocating after the data packets at the
+ * sync point have been received.
+ *
+ * Resource allocation and the sending of responses are decoupled. The
+ * request/segment which are being allocated and sent are as follows.
+ * Resources are allocated for:
+ * [request: qpriv->r_tid_alloc, segment: req->alloc_seg]
+ * The send thread sends:
+ * [request: qp->s_tail_ack_queue, segment: req->cur_seg]
+ */
+static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
+{
+ struct tid_rdma_request *req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = qpriv->rcd;
+ struct tid_rdma_params *local = &qpriv->tid_rdma.local;
+ struct rvt_ack_entry *e;
+ u32 npkts, to_seg;
+ bool last;
+ int ret = 0;
+
+ lockdep_assert_held(&qp->s_lock);
+
+ while (1) {
+ trace_hfi1_rsp_tid_write_alloc_res(qp, 0);
+ trace_hfi1_tid_write_rsp_alloc_res(qp);
+ /*
+		 * Don't allocate more segments if an RNR NAK has already been
+ * scheduled to avoid messing up qp->r_psn: the RNR NAK will
+ * be sent only when all allocated segments have been sent.
+ * However, if more segments are allocated before that, TID RDMA
+ * WRITE RESP packets will be sent out for these new segments
+ * before the RNR NAK packet. When the requester receives the
+ * RNR NAK packet, it will restart with qp->s_last_psn + 1,
+ * which does not match qp->r_psn and will be dropped.
+ * Consequently, the requester will exhaust its retries and
+ * put the qp into error state.
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND)
+ break;
+
+ /* No requests left to process */
+ if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
+ /* If all data has been received, clear the flow */
+ if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
+ !qpriv->alloc_w_segs)
+ hfi1_kern_clear_hw_flow(rcd, qp);
+ break;
+ }
+
+ e = &qp->s_ack_queue[qpriv->r_tid_alloc];
+ if (e->opcode != TID_OP(WRITE_REQ))
+ goto next_req;
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ /* Finished allocating for all segments of this request */
+ if (req->alloc_seg >= req->total_segs)
+ goto next_req;
+
+ /* Can allocate only a maximum of local->max_write for a QP */
+ if (qpriv->alloc_w_segs >= local->max_write)
+ break;
+
+ /* Don't allocate at a sync point with data packets pending */
+ if (qpriv->sync_pt && qpriv->alloc_w_segs)
+ break;
+
+ /* All data received at the sync point, continue */
+ if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
+ hfi1_kern_clear_hw_flow(rcd, qp);
+ qpriv->sync_pt = false;
+ if (qpriv->s_flags & HFI1_R_TID_SW_PSN)
+ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+ }
+
+ /* Allocate flow if we don't have one */
+ if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) {
+ ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp);
+ if (ret) {
+ to_seg = tid_rdma_flow_wt *
+ position_in_queue(qpriv,
+ &rcd->flow_queue);
+ break;
+ }
+ }
+
+ npkts = rvt_div_round_up_mtu(qp, req->seg_len);
+
+ /*
+ * We are at a sync point if we run out of KDETH PSN space.
+ * Last PSN of every generation is reserved for RESYNC.
+ */
+ if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) {
+ qpriv->sync_pt = true;
+ break;
+ }
+
+ /*
+		 * If overtaking req->acked_tail, send an RNR NAK. Because the
+		 * QP is not queued in this case and the issue can only be
+		 * caused by a delay in scheduling the second leg, which we
+		 * cannot estimate, we use a rather arbitrary RNR timeout of
+		 * (MAX_FLOWS / 2) segments.
+ */
+ if (!CIRC_SPACE(req->setup_head, req->acked_tail,
+ MAX_FLOWS)) {
+ ret = -EAGAIN;
+ to_seg = MAX_FLOWS >> 1;
+ qpriv->s_flags |= RVT_S_ACK_PENDING;
+ hfi1_schedule_tid_send(qp);
+ break;
+ }
+
+ /* Try to allocate rcv array / TID entries */
+ ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last);
+ if (ret == -EAGAIN)
+ to_seg = position_in_queue(qpriv, &rcd->rarr_queue);
+ if (ret)
+ break;
+
+ qpriv->alloc_w_segs++;
+ req->alloc_seg++;
+ continue;
+next_req:
+ /* Begin processing the next request */
+ if (++qpriv->r_tid_alloc >
+ rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ qpriv->r_tid_alloc = 0;
+ }
+
+ /*
+	 * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation
+	 * has failed, (b) we are called from the rcv handler interrupt context,
+	 * and (c) an RNR NAK has not already been scheduled.
+ */
+ if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state)
+ goto send_rnr_nak;
+
+ return;
+
+send_rnr_nak:
+ lockdep_assert_held(&qp->r_lock);
+
+	/* Set r_nak_state to prevent unrelated events from generating NAKs */
+ qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK;
+
+ /* Pull back r_psn to the segment being RNR NAK'd */
+ qp->r_psn = e->psn + req->alloc_seg;
+ qp->r_ack_psn = qp->r_psn;
+ /*
+ * Pull back r_head_ack_queue to the ack entry following the request
+ * being RNR NAK'd. This allows resources to be allocated to the request
+ * if the queued QP is scheduled.
+ */
+ qp->r_head_ack_queue = qpriv->r_tid_alloc + 1;
+ if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ qp->r_head_ack_queue = 0;
+ qpriv->r_tid_head = qp->r_head_ack_queue;
+ /*
+ * These send side fields are used in make_rc_ack(). They are set in
+ * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock
+ * for consistency
+ */
+ qp->s_nak_state = qp->r_nak_state;
+ qp->s_ack_psn = qp->r_ack_psn;
+ /*
+	 * Clear the ACK PENDING flag to prevent an unwanted ACK because we
+ * have modified qp->s_ack_psn here.
+ */
+ qp->s_flags &= ~(RVT_S_ACK_PENDING);
+
+ trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn);
+ /*
+ * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK
+	 * has actually been sent. The RVT_S_ACK_PENDING bit in qp->s_flags
+	 * cannot be used for this because qp->s_lock is dropped before calling
+	 * hfi1_send_rc_ack(), leading to inconsistency between the receive
+	 * interrupt handlers and the send thread in make_rc_ack().
+ */
+ qpriv->rnr_nak_state = TID_RNR_NAK_SEND;
+
+ /*
+	 * Schedule the RNR NAK to be sent. RNR NAKs are scheduled from the
+	 * receive interrupt handlers but will be sent from the send engine
+	 * behind any previous responses that may have been scheduled.
+ */
+ rc_defered_ack(rcd, qp);
+}
+
+void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
+{
+	/* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side) */
+
+ /*
+ * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST
+ * (see hfi1_rc_rcv())
+ * - Don't allow 0-length requests.
+	 * 2. Put TID RDMA WRITE REQ into the response queue (s_ack_queue)
+ * - Setup struct tid_rdma_req with request info
+ * - Prepare struct tid_rdma_flow array?
+ * 3. Set the qp->s_ack_state as state diagram in design doc.
+ * 4. Set RVT_S_RESP_PENDING in s_flags.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ unsigned long flags;
+ struct ib_reth *reth;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req;
+ u32 bth0, psn, len, rkey, num_segs;
+ bool is_fecn;
+ u8 next;
+ u64 vaddr;
+ int diff;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, packet))
+ return;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+ rvt_comm_est(qp);
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto nack_inv;
+
+ reth = &ohdr->u.tid_rdma.w_req.reth;
+ vaddr = be64_to_cpu(reth->vaddr);
+ len = be32_to_cpu(reth->length);
+
+ num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
+ diff = delta_psn(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
+ return;
+ goto send_ack;
+ }
+
+ /*
+ * The resent request which was previously RNR NAK'd is inserted at the
+ * location of the original request, which is one entry behind
+ * r_head_ack_queue
+ */
+ if (qpriv->rnr_nak_state)
+ qp->r_head_ack_queue = qp->r_head_ack_queue ?
+ qp->r_head_ack_queue - 1 :
+ rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+
+ /* We've verified the request, insert it into the ack queue. */
+ next = qp->r_head_ack_queue + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
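+	/*
+	 * The ack queue is full. We can make room only if the response for
+	 * the oldest entry has already been sent.
+	 */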
+ if (unlikely(next == qp->s_acked_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlock;
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ req = ack_to_tid_req(e);
+
+ /* Bring previously RNR NAK'd request back to life */
+ if (qpriv->rnr_nak_state) {
+ qp->r_nak_state = 0;
+ qp->s_nak_state = 0;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qp->r_psn = e->lpsn + 1;
+ req->state = TID_REQUEST_INIT;
+ goto update_head;
+ }
+
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+
+ /* The length needs to be in multiples of PAGE_SIZE */
+ if (!len || len & ~PAGE_MASK)
+ goto nack_inv_unlock;
+
+ rkey = be32_to_cpu(reth->rkey);
+ qp->r_len = len;
+
+ if (e->opcode == TID_OP(WRITE_REQ) &&
+ (req->setup_head != req->clear_tail ||
+ req->clear_tail != req->acked_tail))
+ goto nack_inv_unlock;
+
+ if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_WRITE)))
+ goto nack_acc;
+
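+	/* The TID RDMA WRITE request occupies one IB PSN per segment. */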
+ qp->r_psn += num_segs - 1;
+
+ e->opcode = (bth0 >> 24) & 0xff;
+ e->psn = psn;
+ e->lpsn = qp->r_psn;
+ e->sent = 0;
+
+ req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write);
+ req->state = TID_REQUEST_INIT;
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_seg = 0;
+ req->alloc_seg = 0;
+ req->isge = 0;
+ req->seg_len = qpriv->tid_rdma.local.max_len;
+ req->total_len = len;
+ req->total_segs = num_segs;
+ req->r_flow_psn = e->psn;
+ req->ss.sge = e->rdma_sge;
+ req->ss.num_sge = 1;
+
+ req->flow_idx = req->setup_head;
+ req->clear_tail = req->setup_head;
+ req->acked_tail = req->setup_head;
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn++;
+
+ trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+
+ if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) {
+ qpriv->r_tid_tail = qp->r_head_ack_queue;
+ } else if (qpriv->r_tid_tail == qpriv->r_tid_head) {
+ struct tid_rdma_request *ptr;
+
+ e = &qp->s_ack_queue[qpriv->r_tid_tail];
+ ptr = ack_to_tid_req(e);
+
+ if (e->opcode != TID_OP(WRITE_REQ) ||
+ ptr->comp_seg == ptr->total_segs) {
+ if (qpriv->r_tid_tail == qpriv->r_tid_ack)
+ qpriv->r_tid_ack = qp->r_head_ack_queue;
+ qpriv->r_tid_tail = qp->r_head_ack_queue;
+ }
+ }
+update_head:
+ qp->r_head_ack_queue = next;
+ qpriv->r_tid_head = qp->r_head_ack_queue;
+
+ hfi1_tid_write_alloc_resources(qp, true);
+ trace_hfi1_tid_write_rsp_rcv_req(qp);
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ goto send_ack;
+ return;
+
+nack_inv_unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+nack_acc:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+
+u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 bth2, u32 *len,
+ struct rvt_sge_state **ss)
+{
+ struct hfi1_ack_priv *epriv = e->priv;
+ struct tid_rdma_request *req = &epriv->tid_req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_flow *flow = NULL;
+ u32 resp_len = 0, hdwords = 0;
+ void *resp_addr = NULL;
+ struct tid_rdma_params *remote;
+
+ trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ trace_hfi1_tid_write_rsp_build_resp(qp);
+ trace_hfi1_rsp_build_tid_write_resp(qp, bth2);
+ flow = &req->flows[req->flow_idx];
+ switch (req->state) {
+ default:
+ /*
+		 * Try to allocate resources here in case the QP was queued and
+		 * was later scheduled when resources became available.
+ */
+ hfi1_tid_write_alloc_resources(qp, false);
+
+ /* We've already sent everything which is ready */
+ if (req->cur_seg >= req->alloc_seg)
+ goto done;
+
+ /*
+		 * Resources can be assigned, but responses cannot be sent in
+		 * the rnr_nak state until the resent request is received.
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT)
+ goto done;
+
+ req->state = TID_REQUEST_ACTIVE;
+ trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
+ req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
+ hfi1_add_tid_reap_timer(qp);
+ break;
+
+ case TID_REQUEST_RESEND_ACTIVE:
+ case TID_REQUEST_RESEND:
+ trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
+ req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
+ if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS))
+ req->state = TID_REQUEST_ACTIVE;
+
+ hfi1_mod_tid_reap_timer(qp);
+ break;
+ }
+ flow->flow_state.resp_ib_psn = bth2;
+ resp_addr = (void *)flow->tid_entry;
+ resp_len = sizeof(*flow->tid_entry) * flow->tidcnt;
+ req->cur_seg++;
+
+ memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp));
+ epriv->ss.sge.vaddr = resp_addr;
+ epriv->ss.sge.sge_length = resp_len;
+ epriv->ss.sge.length = epriv->ss.sge.sge_length;
+ /*
+ * We can safely zero these out. Since the first SGE covers the
+ * entire packet, nothing else should even look at the MR.
+ */
+ epriv->ss.sge.mr = NULL;
+ epriv->ss.sge.m = 0;
+ epriv->ss.sge.n = 0;
+
+ epriv->ss.sg_list = NULL;
+ epriv->ss.total_len = epriv->ss.sge.sge_length;
+ epriv->ss.num_sge = 1;
+
+ *ss = &epriv->ss;
+ *len = epriv->ss.total_len;
+
+ /* Construct the TID RDMA WRITE RESP packet header */
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+ KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1);
+ KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp);
+ ohdr->u.tid_rdma.w_rsp.tid_flow_psn =
+ cpu_to_be32((flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) |
+ (flow->flow_state.spsn &
+ HFI1_KDETH_BTH_SEQ_MASK));
+ ohdr->u.tid_rdma.w_rsp.tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+ ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 = remote->qp;
+ rcu_read_unlock();
+ hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32);
+ qpriv->pending_tid_w_segs++;
+done:
+ return hdwords;
+}
+
+static void hfi1_add_tid_reap_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) {
+ qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
+ qpriv->s_tid_timer.expires = jiffies +
+ qpriv->tid_timer_timeout_jiffies;
+ add_timer(&qpriv->s_tid_timer);
+ }
+}
+
+static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
+ mod_timer(&qpriv->s_tid_timer, jiffies +
+ qpriv->tid_timer_timeout_jiffies);
+}
+
+static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ int rval = 0;
+
+ lockdep_assert_held(&qp->s_lock);
+ if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
+ rval = del_timer(&qpriv->s_tid_timer);
+ qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
+ }
+ return rval;
+}
+
+void hfi1_del_tid_reap_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ del_timer_sync(&qpriv->s_tid_timer);
+ qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
+}
+
+static void hfi1_tid_timeout(struct timer_list *t)
+{
+ struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer);
+ struct rvt_qp *qp = qpriv->owner;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ unsigned long flags;
+ u32 i;
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+ spin_lock(&qp->s_lock);
+ if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
+ dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n",
+ qp->ibqp.qp_num, __func__, __LINE__);
+ trace_hfi1_msg_tid_timeout(/* msg */
+ qp, "resource timeout = ",
+ (u64)qpriv->tid_timer_timeout_jiffies);
+ hfi1_stop_tid_reap_timer(qp);
+ /*
+		 * Go through the entire ack queue and clear any outstanding
+ * HW flow and RcvArray resources.
+ */
+ hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ struct tid_rdma_request *req =
+ ack_to_tid_req(&qp->s_ack_queue[i]);
+
+ hfi1_kern_exp_rcv_clear_all(req);
+ }
+ spin_unlock(&qp->s_lock);
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_FATAL;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR);
+ goto unlock_r_lock;
+ }
+ spin_unlock(&qp->s_lock);
+unlock_r_lock:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
+{
+	/* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side) */
+
+ /*
+ * 1. Find matching SWQE
+ * 2. Check that TIDENTRY array has enough space for a complete
+ * segment. If not, put QP in error state.
+ * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow
+ * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags.
+ * 5. Set qp->s_state
+ * 6. Kick the send engine (hfi1_schedule_send())
+ */
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ enum ib_wc_status status;
+ u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen;
+ bool is_fecn;
+ unsigned long flags;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Ignore invalid responses */
+ if (cmp_psn(psn, qp->s_next_psn) >= 0)
+ goto ack_done;
+
+ /* Ignore duplicate responses. */
+ if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0))
+ goto ack_done;
+
+ if (unlikely(qp->s_acked == qp->s_tail))
+ goto ack_done;
+
+ /*
+ * If we are waiting for a particular packet sequence number
+ * due to a request being resent, check for it. Otherwise,
+ * ensure that we haven't missed anything.
+ */
+ if (qp->r_flags & RVT_R_RDMAR_SEQ) {
+ if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
+ goto ack_done;
+ qp->r_flags &= ~RVT_R_RDMAR_SEQ;
+ }
+
+ wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+ if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE))
+ goto ack_op_err;
+
+ req = wqe_to_tid_req(wqe);
+ /*
+ * If we've lost ACKs and our acked_tail pointer is too far
+ * behind, don't overwrite segments. Just drop the packet and
+ * let the reliability protocol take care of it.
+ */
+ if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS))
+ goto ack_done;
+
+ /*
+ * The call to do_rc_ack() should be last in the chain of
+ * packet checks because it will end up updating the QP state.
+ * Therefore, anything that would prevent the packet from
+ * being accepted as a successful response should be prior
+ * to it.
+ */
+ if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+ goto ack_done;
+
+ trace_hfi1_ack(qp, psn);
+
+ flow = &req->flows[req->setup_head];
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ flow->sent = 0;
+ flow->resync_npkts = 0;
+ flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp);
+ flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
+ TID_RDMA_DESTQP_FLOW_MASK;
+ flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn));
+ flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
+ flow->flow_state.resp_ib_psn = psn;
+ flow->length = min_t(u32, req->seg_len,
+ (wqe->length - (req->comp_seg * req->seg_len)));
+
+ flow->npkts = rvt_div_round_up_mtu(qp, flow->length);
+ flow->flow_state.lpsn = flow->flow_state.spsn +
+ flow->npkts - 1;
+ /* payload length = packet length - (header length + ICRC length) */
+ pktlen = packet->tlen - (packet->hlen + 4);
+ if (pktlen > sizeof(flow->tid_entry)) {
+ status = IB_WC_LOC_LEN_ERR;
+ goto ack_err;
+ }
+ memcpy(flow->tid_entry, packet->ebuf, pktlen);
+ flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
+ trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow);
+
+ req->comp_seg++;
+ trace_hfi1_tid_write_sender_rcv_resp(qp, 0);
+ /*
+ * Walk the TID_ENTRY list to make sure we have enough space for a
+ * complete segment.
+ */
+ for (i = 0; i < flow->tidcnt; i++) {
+ trace_hfi1_tid_entry_rcv_write_resp(/* entry */
+ qp, i, flow->tid_entry[i]);
+ if (!EXP_TID_GET(flow->tid_entry[i], LEN)) {
+ status = IB_WC_LOC_LEN_ERR;
+ goto ack_err;
+ }
+ tidlen += EXP_TID_GET(flow->tid_entry[i], LEN);
+ }
+ if (tidlen * PAGE_SIZE < flow->length) {
+ status = IB_WC_LOC_LEN_ERR;
+ goto ack_err;
+ }
+
+ trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ /*
+ * If this is the first response for this request, set the initial
+ * flow index to the current flow.
+ */
+ if (!cmp_psn(psn, wqe->psn)) {
+ req->r_last_acked = mask_psn(wqe->psn - 1);
+ /* Set acked flow index to head index */
+ req->acked_tail = req->setup_head;
+ }
+
+ /* advance circular buffer head */
+ req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS);
+ req->state = TID_REQUEST_ACTIVE;
+
+ /*
+	 * If all responses for this TID RDMA WRITE request have been received,
+	 * advance the pointer to the next one.
+ * Since TID RDMA requests could be mixed in with regular IB requests,
+ * they might not appear sequentially in the queue. Therefore, the
+ * next request needs to be "found".
+ */
+ if (qpriv->s_tid_cur != qpriv->s_tid_head &&
+ req->comp_seg == req->total_segs) {
+ for (i = qpriv->s_tid_cur + 1; ; i++) {
+ if (i == qp->s_size)
+ i = 0;
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (i == qpriv->s_tid_head)
+ break;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+ }
+ qpriv->s_tid_cur = i;
+ }
+ qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
+
+ hfi1_schedule_tid_send(qp);
+ goto ack_done;
+
+ack_op_err:
+ status = IB_WC_LOC_QP_OP_ERR;
+ack_err:
+ rvt_error_qp(qp, status);
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (is_fecn)
+ hfi1_send_rc_ack(packet, is_fecn);
+}
+
+bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len)
+{
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+ struct tid_rdma_params *remote;
+ struct rvt_qp *qp = req->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u32 tidentry = flow->tid_entry[flow->tid_idx];
+ u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
+ struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data;
+ u32 next_offset, om = KDETH_OM_LARGE;
+ bool last_pkt;
+
+ if (!tidlen) {
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR);
+ rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR);
+ }
+
+ *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
+ flow->sent += *len;
+ next_offset = flow->tid_offset + *len;
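+	/* Last packet if this exhausts the final TID entry or completes the segment */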
+ last_pkt = (flow->tid_idx == (flow->tidcnt - 1) &&
+ next_offset >= tidlen) || (flow->sent >= flow->length);
+ trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry);
+ trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow);
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ KDETH_RESET(wd->kdeth0, KVER, 0x1);
+ KDETH_SET(wd->kdeth0, SH, !last_pkt);
+ KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg));
+ KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
+ KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
+ KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE);
+ KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om);
+ KDETH_RESET(wd->kdeth1, JKEY, remote->jkey);
+ wd->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ rcu_read_unlock();
+
+ *bth1 = flow->tid_qpn;
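+	/* KDETH PSN: per-flow sequence in the low bits, generation above it */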
+ *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
+ HFI1_KDETH_BTH_SEQ_MASK) |
+ (flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT));
+ if (last_pkt) {
+ /* PSNs are zero-based, so +1 to count number of packets */
+ if (flow->flow_state.lpsn + 1 +
+ rvt_div_round_up_mtu(qp, req->seg_len) >
+ MAX_TID_FLOW_PSN)
+ req->state = TID_REQUEST_SYNC;
+ *bth2 |= IB_BTH_REQ_ACK;
+ }
+
+ if (next_offset >= tidlen) {
+ flow->tid_offset = 0;
+ flow->tid_idx++;
+ } else {
+ flow->tid_offset = next_offset;
+ }
+ return last_pkt;
+}
+
+void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
+{
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ctxtdata *rcd = priv->rcd;
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ unsigned long flags;
+ u32 psn, next;
+ u8 opcode;
+
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ /*
+ * All error handling should be done by now. If we are here, the packet
+	 * is either good or has been accepted by the error handler.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ e = &qp->s_ack_queue[priv->r_tid_tail];
+ req = ack_to_tid_req(e);
+ flow = &req->flows[req->clear_tail];
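+	/*
+	 * Any packet other than the last one of the segment only advances
+	 * the expected KDETH PSN (or NAKs a sequence error).
+	 */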
+ if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
+ if (cmp_psn(psn, flow->flow_state.r_next_psn))
+ goto send_nak;
+ flow->flow_state.r_next_psn++;
+ goto exit;
+ }
+ flow->flow_state.r_next_psn = mask_psn(psn + 1);
+ hfi1_kern_exp_rcv_clear(req);
+ priv->alloc_w_segs--;
+ rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK;
+ req->comp_seg++;
+ priv->s_nak_state = 0;
+
+ /*
+ * Release the flow if one of the following conditions has been met:
+ * - The request has reached a sync point AND all outstanding
+ * segments have been completed, or
+ * - The entire request is complete and there are no more requests
+ * (of any kind) in the queue.
+ */
+ trace_hfi1_rsp_rcv_tid_write_data(qp, psn);
+ trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ trace_hfi1_tid_write_rsp_rcv_data(qp);
+ if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
+ priv->r_tid_ack = priv->r_tid_tail;
+
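+	/*
+	 * On the last data packet of a request, advance r_tid_tail to the
+	 * next TID RDMA WRITE request in the ack queue and advance
+	 * s_acked_ack_queue past this entry.
+	 */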
+ if (opcode == TID_OP(WRITE_DATA_LAST)) {
+ for (next = priv->r_tid_tail + 1; ; next++) {
+ if (next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ if (next == priv->r_tid_head)
+ break;
+ e = &qp->s_ack_queue[next];
+ if (e->opcode == TID_OP(WRITE_REQ))
+ break;
+ }
+ priv->r_tid_tail = next;
+ if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi))
+ qp->s_acked_ack_queue = 0;
+ }
+
+ hfi1_tid_write_alloc_resources(qp, true);
+
+ /*
+ * If we need to generate more responses, schedule the
+ * send engine.
+ */
+ if (req->cur_seg < req->total_segs ||
+ qp->s_tail_ack_queue != qp->r_head_ack_queue) {
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+ }
+
+ priv->pending_tid_w_segs--;
+ if (priv->s_flags & HFI1_R_TID_RSC_TIMER) {
+ if (priv->pending_tid_w_segs)
+ hfi1_mod_tid_reap_timer(req->qp);
+ else
+ hfi1_stop_tid_reap_timer(req->qp);
+ }
+
+done:
+ priv->s_flags |= RVT_S_ACK_PENDING;
+ hfi1_schedule_tid_send(qp);
+exit:
+ priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return;
+
+send_nak:
+ if (!priv->s_nak_state) {
+ priv->s_nak_state = IB_NAK_PSN_ERROR;
+ priv->s_nak_psn = flow->flow_state.r_next_psn;
+ priv->s_flags |= RVT_S_ACK_PENDING;
+ if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
+ priv->r_tid_ack = priv->r_tid_tail;
+ hfi1_schedule_tid_send(qp);
+ }
+ goto done;
+}
+
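+/* A RESYNC PSN has all KDETH sequence bits set (last PSN of a generation) */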
+static bool hfi1_tid_rdma_is_resync_psn(u32 psn)
+{
+ return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) ==
+ HFI1_KDETH_BTH_SEQ_MASK);
+}
+
+u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u16 iflow,
+ u32 *bth1, u32 *bth2)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ struct tid_rdma_request *req = ack_to_tid_req(e);
+ struct tid_rdma_flow *flow = &req->flows[iflow];
+ struct tid_rdma_params *remote;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 = remote->qp;
+ rcu_read_unlock();
+
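+	/*
+	 * bth2 carries the PSN of the RESYNC being acknowledged, the NAK'd
+	 * PSN when a NAK is pending, or the last data PSN of the
+	 * acknowledged flow otherwise.
+	 */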
+ if (qpriv->resync) {
+ *bth2 = mask_psn((fs->generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
+ ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
+ } else if (qpriv->s_nak_state) {
+ *bth2 = mask_psn(qpriv->s_nak_psn);
+ ohdr->u.tid_rdma.ack.aeth =
+ cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
+ (qpriv->s_nak_state <<
+ IB_AETH_CREDIT_SHIFT));
+ } else {
+ *bth2 = full_flow_psn(flow, flow->flow_state.lpsn);
+ ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
+ }
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
+ ohdr->u.tid_rdma.ack.tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+
+ ohdr->u.tid_rdma.ack.tid_flow_psn = 0;
+ ohdr->u.tid_rdma.ack.verbs_psn =
+ cpu_to_be32(flow->flow_state.resp_ib_psn);
+
+ if (qpriv->resync) {
+ /*
+		 * If the PSN before the currently expected KDETH PSN is the
+ * RESYNC PSN, then we never received a good TID RDMA WRITE
+ * DATA packet after a previous RESYNC.
+ * In this case, the next expected KDETH PSN stays the same.
+ */
+ if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) {
+ ohdr->u.tid_rdma.ack.tid_flow_psn =
+ cpu_to_be32(qpriv->r_next_psn_kdeth_save);
+ } else {
+ /*
+ * Because the KDETH PSNs jump during a RESYNC, it's
+ * not possible to infer (or compute) the previous value
+ * of r_next_psn_kdeth in the case of back-to-back
+ * RESYNC packets. Therefore, we save it.
+ */
+ qpriv->r_next_psn_kdeth_save =
+ qpriv->r_next_psn_kdeth - 1;
+ ohdr->u.tid_rdma.ack.tid_flow_psn =
+ cpu_to_be32(qpriv->r_next_psn_kdeth_save);
+ qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1);
+ }
+ qpriv->resync = false;
+ }
+
+ return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32);
+}
+
+void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
+{
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn;
+ bool is_fecn;
+ unsigned long flags;
+ u16 fidx;
+
+ trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0);
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
+ req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
+ resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn));
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn);
+
+ /* If we are waiting for an ACK to RESYNC, drop any other packets */
+ if ((qp->s_flags & HFI1_S_WAIT_HALT) &&
+ cmp_psn(psn, qpriv->s_resync_psn))
+ goto ack_op_err;
+
+ ack_psn = req_psn;
+ if (hfi1_tid_rdma_is_resync_psn(psn))
+ ack_kpsn = resync_psn;
+ else
+ ack_kpsn = psn;
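+	/*
+	 * If this is not an ACK, the reported PSN has not been received;
+	 * count only the PSNs before it as acknowledged.
+	 */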
+ if (aeth >> 29) {
+ ack_psn--;
+ ack_kpsn--;
+ }
+
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ goto ack_op_err;
+
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ flow = &req->flows[req->acked_tail];
+ trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
+
+ /* Drop stale ACK/NAK */
+ if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0)
+ goto ack_op_err;
+
+ while (cmp_psn(ack_kpsn,
+ full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 &&
+ req->ack_seg < req->cur_seg) {
+ req->ack_seg++;
+ /* advance acked segment pointer */
+ req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS);
+ req->r_last_acked = flow->flow_state.resp_ib_psn;
+ trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ if (req->ack_seg == req->total_segs) {
+ req->state = TID_REQUEST_COMPLETE;
+ wqe = do_rc_completion(qp, wqe,
+ to_iport(qp->ibqp.device,
+ qp->port_num));
+ trace_hfi1_sender_rcv_tid_ack(qp);
+ atomic_dec(&qpriv->n_tid_requests);
+ if (qp->s_acked == qp->s_tail)
+ break;
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ break;
+ req = wqe_to_tid_req(wqe);
+ }
+ flow = &req->flows[req->acked_tail];
+ trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
+ }
+
+ trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ switch (aeth >> 29) {
+ case 0: /* ACK */
+ if (qpriv->s_flags & RVT_S_WAIT_ACK)
+ qpriv->s_flags &= ~RVT_S_WAIT_ACK;
+ if (!hfi1_tid_rdma_is_resync_psn(psn)) {
+ /* Check if there is any pending TID ACK */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ req->ack_seg < req->cur_seg)
+ hfi1_mod_tid_retry_timer(qp);
+ else
+ hfi1_stop_tid_retry_timer(qp);
+ hfi1_schedule_send(qp);
+ } else {
+ u32 spsn, fpsn, last_acked, generation;
+ struct tid_rdma_request *rptr;
+
+ /* ACK(RESYNC) */
+ hfi1_stop_tid_retry_timer(qp);
+ /* Allow new requests (see hfi1_make_tid_rdma_pkt) */
+ qp->s_flags &= ~HFI1_S_WAIT_HALT;
+ /*
+			 * Clear the RVT_S_SEND_ONE flag in case the TID RDMA
+			 * ACK is received after the TID retry timer has fired
+			 * again. In this case, do not send any more TID
+			 * RESYNC requests or wait for any more TID ACK packets.
+ */
+ qpriv->s_flags &= ~RVT_S_SEND_ONE;
+ hfi1_schedule_send(qp);
+
+ if ((qp->s_acked == qpriv->s_tid_tail &&
+ req->ack_seg == req->total_segs) ||
+ qp->s_acked == qp->s_tail) {
+ qpriv->s_state = TID_OP(WRITE_DATA_LAST);
+ goto done;
+ }
+
+ if (req->ack_seg == req->comp_seg) {
+ qpriv->s_state = TID_OP(WRITE_DATA);
+ goto done;
+ }
+
+ /*
+ * The PSN to start with is the next PSN after the
+ * RESYNC PSN.
+ */
+ psn = mask_psn(psn + 1);
+ generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ spsn = 0;
+
+ /*
+ * Update to the correct WQE when we get an ACK(RESYNC)
+ * in the middle of a request.
+ */
+ if (delta_psn(ack_psn, wqe->lpsn))
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ flow = &req->flows[req->acked_tail];
+ /*
+			 * segments. Also, PSNs start from 0 in the middle of a
+ * segments. Also, PSN's start from 0 in the middle of a
+ * segment and the first segment size is less than the
+ * default number of packets. flow->resync_npkts is used
+ * to track the number of packets from the start of the
+ * real segment to the point of 0 PSN after the RESYNC
+ * in order to later correctly rewind the SGE.
+ */
+ fpsn = full_flow_psn(flow, flow->flow_state.spsn);
+ req->r_ack_psn = psn;
+ flow->resync_npkts +=
+ delta_psn(mask_psn(resync_psn + 1), fpsn);
+ /*
+ * Renumber all packet sequence number ranges
+ * based on the new generation.
+ */
+ last_acked = qp->s_acked;
+ rptr = req;
+ while (1) {
+ /* start from last acked segment */
+ for (fidx = rptr->acked_tail;
+ CIRC_CNT(rptr->setup_head, fidx,
+ MAX_FLOWS);
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
+ u32 lpsn;
+ u32 gen;
+
+ flow = &rptr->flows[fidx];
+ gen = flow->flow_state.generation;
+ if (WARN_ON(gen == generation &&
+ flow->flow_state.spsn !=
+ spsn))
+ continue;
+ lpsn = flow->flow_state.lpsn;
+ lpsn = full_flow_psn(flow, lpsn);
+ flow->npkts =
+ delta_psn(lpsn,
+ mask_psn(resync_psn)
+ );
+ flow->flow_state.generation =
+ generation;
+ flow->flow_state.spsn = spsn;
+ flow->flow_state.lpsn =
+ flow->flow_state.spsn +
+ flow->npkts - 1;
+ flow->pkt = 0;
+ spsn += flow->npkts;
+ resync_psn += flow->npkts;
+ trace_hfi1_tid_flow_rcv_tid_ack(qp,
+ fidx,
+ flow);
+ }
+ if (++last_acked == qpriv->s_tid_cur + 1)
+ break;
+ if (last_acked == qp->s_size)
+ last_acked = 0;
+ wqe = rvt_get_swqe_ptr(qp, last_acked);
+ rptr = wqe_to_tid_req(wqe);
+ }
+ req->cur_seg = req->ack_seg;
+ qpriv->s_tid_tail = qp->s_acked;
+ qpriv->s_state = TID_OP(WRITE_REQ);
+ hfi1_schedule_tid_send(qp);
+ }
+done:
+ qpriv->s_retry = qp->s_retry_cnt;
+ break;
+
+ case 3: /* NAK */
+ hfi1_stop_tid_retry_timer(qp);
+ switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
+ IB_AETH_CREDIT_MASK) {
+ case 0: /* PSN sequence error */
+ flow = &req->flows[req->acked_tail];
+ fspsn = full_flow_psn(flow, flow->flow_state.spsn);
+ trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail,
+ flow);
+ req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ req->cur_seg = req->ack_seg;
+ qpriv->s_tid_tail = qp->s_acked;
+ qpriv->s_state = TID_OP(WRITE_REQ);
+ qpriv->s_retry = qp->s_retry_cnt;
+ hfi1_schedule_tid_send(qp);
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ack_op_err:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+void hfi1_add_tid_retry_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+ lockdep_assert_held(&qp->s_lock);
+ if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) {
+ priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
+ priv->s_tid_retry_timer.expires = jiffies +
+ priv->tid_retry_timeout_jiffies + rdi->busy_jiffies;
+ add_timer(&priv->s_tid_retry_timer);
+ }
+}
+
+static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+ lockdep_assert_held(&qp->s_lock);
+ priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
+ mod_timer(&priv->s_tid_retry_timer, jiffies +
+ priv->tid_retry_timeout_jiffies + rdi->busy_jiffies);
+}
+
+static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ int rval = 0;
+
+ lockdep_assert_held(&qp->s_lock);
+ if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
+ rval = del_timer(&priv->s_tid_retry_timer);
+ priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
+ }
+ return rval;
+}
+
+void hfi1_del_tid_retry_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ del_timer_sync(&priv->s_tid_retry_timer);
+ priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
+}
+
+static void hfi1_tid_retry_timeout(struct timer_list *t)
+{
+ struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer);
+ struct rvt_qp *qp = priv->owner;
+ struct rvt_swqe *wqe;
+ unsigned long flags;
+ struct tid_rdma_request *req;
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+ spin_lock(&qp->s_lock);
+ trace_hfi1_tid_write_sender_retry_timeout(qp, 0);
+ if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
+ hfi1_stop_tid_retry_timer(qp);
+ if (!priv->s_retry) {
+ trace_hfi1_msg_tid_retry_timeout(/* msg */
+ qp,
+ "Exhausted retries. Tid retry timeout = ",
+ (u64)priv->tid_retry_timeout_jiffies);
+
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ } else {
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_tid_retry_timeout(/* req */
+ qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req);
+
+ priv->s_flags &= ~RVT_S_WAIT_ACK;
+ /* Only send one packet (the RESYNC) */
+ priv->s_flags |= RVT_S_SEND_ONE;
+ /*
+ * No additional request shall be made by this QP until
+			 * the RESYNC has completed.
+ */
+ qp->s_flags |= HFI1_S_WAIT_HALT;
+ priv->s_state = TID_OP(RESYNC);
+ priv->s_retry--;
+ hfi1_schedule_tid_send(qp);
+ }
+ }
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u16 fidx)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_params *remote;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[fidx];
+ u32 generation;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 = remote->qp;
+ rcu_read_unlock();
+
+ generation = kern_flow_generation_next(flow->flow_state.generation);
+ *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
+ qpriv->s_resync_psn = *bth2;
+ *bth2 |= IB_BTH_REQ_ACK;
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
+
+ return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32);
+}
+
+void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
+{
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = qpriv->rcd;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ u32 psn, generation, idx, gen_next;
+ bool is_fecn;
+ unsigned long flags;
+
+ is_fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+
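+	/*
+	 * The RESYNC PSN is the last PSN of its generation, so psn + 1
+	 * carries the next generation in its upper bits.
+	 */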
+ generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ gen_next = (fs->generation == KERN_GENERATION_RESERVED) ?
+ generation : kern_flow_generation_next(fs->generation);
+ /*
+	 * The RESYNC packet contains the "next" generation and can only be
+	 * from the current or the previous generation.
+ */
+ if (generation != mask_generation(gen_next - 1) &&
+ generation != gen_next)
+ goto bail;
+ /* Already processing a resync */
+ if (qpriv->resync)
+ goto bail;
+
+ spin_lock(&rcd->exp_lock);
+ if (fs->index >= RXE_NUM_TID_FLOWS) {
+ /*
+ * If we don't have a flow, save the generation so it can be
+ * applied when a new flow is allocated
+ */
+ fs->generation = generation;
+ } else {
+ /* Reprogram the QP flow with new generation */
+ rcd->flows[fs->index].generation = generation;
+ fs->generation = kern_setup_hw_flow(rcd, fs->index);
+ }
+ fs->psn = 0;
+ /*
+ * Disable SW PSN checking since a RESYNC is equivalent to a
+	 * sync point and the flow has been/will be reprogrammed.
+ */
+ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+ trace_hfi1_tid_write_rsp_rcv_resync(qp);
+
+ /*
+ * Reset all TID flow information with the new generation.
+ * This is done for all requests and segments after the
+ * last received segment
+ */
+ for (idx = qpriv->r_tid_tail; ; idx++) {
+ u16 flow_idx;
+
+ if (idx > rvt_size_atomic(&dev->rdi))
+ idx = 0;
+ e = &qp->s_ack_queue[idx];
+ if (e->opcode == TID_OP(WRITE_REQ)) {
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+
+ /* start from last unacked segment */
+ for (flow_idx = req->clear_tail;
+ CIRC_CNT(req->setup_head, flow_idx,
+ MAX_FLOWS);
+ flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) {
+ u32 lpsn;
+ u32 next;
+
+ flow = &req->flows[flow_idx];
+ lpsn = full_flow_psn(flow,
+ flow->flow_state.lpsn);
+ next = flow->flow_state.r_next_psn;
+ flow->npkts = delta_psn(lpsn, next - 1);
+ flow->flow_state.generation = fs->generation;
+ flow->flow_state.spsn = fs->psn;
+ flow->flow_state.lpsn =
+ flow->flow_state.spsn + flow->npkts - 1;
+ flow->flow_state.r_next_psn =
+ full_flow_psn(flow,
+ flow->flow_state.spsn);
+ fs->psn += flow->npkts;
+ trace_hfi1_tid_flow_rcv_resync(qp, flow_idx,
+ flow);
+ }
+ }
+ if (idx == qp->s_tail_ack_queue)
+ break;
+ }
+
+ spin_unlock(&rcd->exp_lock);
+ qpriv->resync = true;
+ /* RESYNC request always gets a TID RDMA ACK. */
+ qpriv->s_nak_state = 0;
+ qpriv->s_flags |= RVT_S_ACK_PENDING;
+ hfi1_schedule_tid_send(qp);
+bail:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Call this function when the last TID RDMA WRITE DATA packet for a request
+ * is built.
+ */
+static void update_tid_tail(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 i;
+ struct rvt_swqe *wqe;
+
+ lockdep_assert_held(&qp->s_lock);
+ /* Can't move beyond s_tid_cur */
+ if (priv->s_tid_tail == priv->s_tid_cur)
+ return;
+ for (i = priv->s_tid_tail + 1; ; i++) {
+ if (i == qp->s_size)
+ i = 0;
+
+ if (i == priv->s_tid_cur)
+ break;
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+ }
+ priv->s_tid_tail = i;
+ priv->s_state = TID_OP(WRITE_RESP);
+}
+
+int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct rvt_swqe *wqe;
+ u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0;
+ struct ib_other_headers *ohdr;
+ struct rvt_sge_state *ss = &qp->s_sge;
+ struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ struct tid_rdma_request *req = ack_to_tid_req(e);
+ bool last = false;
+ u8 opcode = TID_OP(WRITE_DATA);
+
+ lockdep_assert_held(&qp->s_lock);
+ trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
+ /*
+ * Prioritize the sending of the requests and responses over the
+ * sending of the TID RDMA data packets.
+ */
+ if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) &&
+ atomic_read(&priv->n_requests) &&
+ !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK |
+ HFI1_S_ANY_WAIT_IO))) ||
+ (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg &&
+ !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) {
+ struct iowait_work *iowork;
+
+ iowork = iowait_get_ib_work(&priv->s_iowait);
+ ps->s_txreq = get_waiting_verbs_txreq(iowork);
+ if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) {
+ priv->s_flags |= HFI1_S_TID_BUSY_SET;
+ return 1;
+ }
+ }
+
+ ps->s_txreq = get_txreq(ps->dev, qp);
+ if (!ps->s_txreq)
+ goto bail_no_tx;
+
+ ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
+
+ if ((priv->s_flags & RVT_S_ACK_PENDING) &&
+ make_tid_rdma_ack(qp, ohdr, ps))
+ return 1;
+
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
+ if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == READ_ONCE(qp->s_head))
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (iowait_sdma_pending(&priv->s_iowait)) {
+ qp->s_flags |= RVT_S_WAIT_DMA;
+ goto bail;
+ }
+ clear_ahg(qp);
+ wqe = rvt_get_swqe_ptr(qp, qp->s_last);
+ hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+ /* will get called again */
+ goto done_free_tx;
+ }
+
+ if (priv->s_flags & RVT_S_WAIT_ACK)
+ goto bail;
+
+ /* Check whether there is anything to do. */
+ if (priv->s_tid_tail == HFI1_QP_WQE_INVALID)
+ goto bail;
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ switch (priv->s_state) {
+ case TID_OP(WRITE_REQ):
+ case TID_OP(WRITE_RESP):
+ priv->tid_ss.sge = wqe->sg_list[0];
+ priv->tid_ss.sg_list = wqe->sg_list + 1;
+ priv->tid_ss.num_sge = wqe->wr.num_sge;
+ priv->tid_ss.total_len = wqe->length;
+
+ if (priv->s_state == TID_OP(WRITE_REQ))
+ hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+ priv->s_state = TID_OP(WRITE_DATA);
+ /* fall through */
+
+ case TID_OP(WRITE_DATA):
+ /*
+	 * 1. Check whether a TID RDMA WRITE RESP is available.
+ * 2. If no:
+	 *    2.1 If there are more segments and no TID RDMA WRITE RESP,
+ * set HFI1_S_WAIT_TID_RESP
+ * 2.2 Return indicating no progress made.
+ * 3. If yes:
+ * 3.1 Build TID RDMA WRITE DATA packet.
+ * 3.2 If last packet in segment:
+ * 3.2.1 Change KDETH header bits
+ * 3.2.2 Advance RESP pointers.
+ * 3.3 Return indicating progress made.
+ */
+ trace_hfi1_sender_make_tid_pkt(qp);
+ trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
+ req = wqe_to_tid_req(wqe);
+ len = wqe->length;
+
+ if (!req->comp_seg || req->cur_seg == req->comp_seg)
+ goto bail;
+
+ trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2,
+ &len);
+
+ if (last) {
+ /* move pointer to next flow */
+ req->clear_tail = CIRC_NEXT(req->clear_tail,
+ MAX_FLOWS);
+ if (++req->cur_seg < req->total_segs) {
+ if (!CIRC_CNT(req->setup_head, req->clear_tail,
+ MAX_FLOWS))
+ qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+ } else {
+ priv->s_state = TID_OP(WRITE_DATA_LAST);
+ opcode = TID_OP(WRITE_DATA_LAST);
+
+ /* Advance the s_tid_tail now */
+ update_tid_tail(qp);
+ }
+ }
+ hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32);
+ ss = &priv->tid_ss;
+ break;
+
+ case TID_OP(RESYNC):
+ trace_hfi1_sender_make_tid_pkt(qp);
+ /* Use generation from the most recently received response */
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+ req = wqe_to_tid_req(wqe);
+ /* If no responses for this WQE look at the previous one */
+ if (!req->comp_seg) {
+ wqe = rvt_get_swqe_ptr(qp,
+ (!priv->s_tid_cur ? qp->s_size :
+ priv->s_tid_cur) - 1);
+ req = wqe_to_tid_req(wqe);
+ }
+ hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1,
+ &bth2,
+ CIRC_PREV(req->setup_head,
+ MAX_FLOWS));
+ ss = NULL;
+ len = 0;
+ opcode = TID_OP(RESYNC);
+ break;
+
+ default:
+ goto bail;
+ }
+ if (priv->s_flags & RVT_S_SEND_ONE) {
+ priv->s_flags &= ~RVT_S_SEND_ONE;
+ priv->s_flags |= RVT_S_WAIT_ACK;
+ bth2 |= IB_BTH_REQ_ACK;
+ }
+ qp->s_len -= len;
+ ps->s_txreq->hdr_dwords = hwords;
+ ps->s_txreq->sde = priv->s_sde;
+ ps->s_txreq->ss = ss;
+ ps->s_txreq->s_cur_size = len;
+ hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2,
+ middle, ps);
+ return 1;
+done_free_tx:
+ hfi1_put_txreq(ps->s_txreq);
+ ps->s_txreq = NULL;
+ return 1;
+
+bail:
+ hfi1_put_txreq(ps->s_txreq);
+bail_no_tx:
+ ps->s_txreq = NULL;
+ priv->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we didn't get a txreq, the QP will be woken up later to try
+	 * again. Set the flags so the wakeup knows which work item to wake
+ * up.
+ * (A better algorithm should be found to do this and generalize the
+ * sleep/wakeup flags.)
+ */
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ return 0;
+}
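
The WRITE_DATA case above only makes progress when a received TID RDMA WRITE RESP covers the segment being sent (the !req->comp_seg || req->cur_seg == req->comp_seg test). A minimal sketch of that check follows; the structure and values are assumptions for the example, not driver code.

/* Illustration only -- not part of the patch. Simplified request bookkeeping. */
#include <stdbool.h>
#include <stdio.h>

struct ex_tid_req {
	unsigned int cur_seg;   /* segment the sender wants to put on the wire */
	unsigned int comp_seg;  /* segments already covered by WRITE RESPs */
};

/* mirrors the bail-out condition in the WRITE_DATA case above */
static bool ex_data_can_be_sent(const struct ex_tid_req *req)
{
	return req->comp_seg && req->cur_seg != req->comp_seg;
}

int main(void)
{
	struct ex_tid_req req = { .cur_seg = 2, .comp_seg = 2 };

	/* segment 2 has no response yet: wait for the next WRITE RESP */
	printf("can send: %s\n", ex_data_can_be_sent(&req) ? "yes" : "no");

	req.comp_seg = 3;   /* a response arrives that covers segment 2 */
	printf("can send: %s\n", ex_data_can_be_sent(&req) ? "yes" : "no");
	return 0;
}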
+
+static int make_tid_rdma_ack(struct rvt_qp *qp,
+ struct ib_other_headers *ohdr,
+ struct hfi1_pkt_state *ps)
+{
+ struct rvt_ack_entry *e;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ u32 hwords, next;
+ u32 len = 0;
+ u32 bth1 = 0, bth2 = 0;
+ int middle = 0;
+ u16 flow;
+ struct tid_rdma_request *req, *nreq;
+
+ trace_hfi1_tid_write_rsp_make_tid_ack(qp);
+ /* Don't send an ACK if we aren't supposed to. */
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
+ goto bail;
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ /*
+ * In the RESYNC case, we are exactly one segment past the
+ * previously sent ack or at the previously sent NAK. So to send
+ * the resync ack, we go back one segment (which might be part of
+ * the previous request) and let the do-while loop execute again.
+ * The advantage of executing the do-while loop is that any data
+ * received after the previous ack is automatically acked in the
+ * RESYNC ack. It turns out that for the do-while loop we only need
+ * to pull back qpriv->r_tid_ack, not the segment
+ * indices/counters. The scheme works even if the previous request
+ * was not a TID WRITE request.
+ */
+ if (qpriv->resync) {
+ if (!req->ack_seg || req->ack_seg == req->total_segs)
+ qpriv->r_tid_ack = !qpriv->r_tid_ack ?
+ rvt_size_atomic(&dev->rdi) :
+ qpriv->r_tid_ack - 1;
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ }
+
+ trace_hfi1_rsp_make_tid_ack(qp, e->psn);
+ trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ /*
+ * If we've sent all the ACKs that we can, we are done
+ * until we get more segments...
+ */
+ if (!qpriv->s_nak_state && !qpriv->resync &&
+ req->ack_seg == req->comp_seg)
+ goto bail;
+
+ do {
+ /*
+ * To deal with coalesced ACKs, the acked_tail pointer
+ * into the flow array is used. The distance between it
+ * and the clear_tail is the number of flows that are
+ * being ACK'ed.
+ */
+ req->ack_seg +=
+ /* Get up-to-date value */
+ CIRC_CNT(req->clear_tail, req->acked_tail,
+ MAX_FLOWS);
+ /* Advance acked index */
+ req->acked_tail = req->clear_tail;
+
+ /*
+ * req->clear_tail points to the segment currently being
+ * received. So, when sending an ACK, the previous
+ * segment is being ACK'ed.
+ */
+ flow = CIRC_PREV(req->acked_tail, MAX_FLOWS);
+ if (req->ack_seg != req->total_segs)
+ break;
+ req->state = TID_REQUEST_COMPLETE;
+
+ next = qpriv->r_tid_ack + 1;
+ if (next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ qpriv->r_tid_ack = next;
+ if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ))
+ break;
+ nreq = ack_to_tid_req(&qp->s_ack_queue[next]);
+ if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg)
+ break;
+
+ /* Move to the next ack entry now */
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ } while (1);
+
+ /*
+ * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and
+ * req could be pointing at the previous ack queue entry
+ */
+ if (qpriv->s_nak_state ||
+ (qpriv->resync &&
+ !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) &&
+ (cmp_psn(qpriv->r_next_psn_kdeth - 1,
+ full_flow_psn(&req->flows[flow],
+ req->flows[flow].flow_state.lpsn)) > 0))) {
+ /*
+ * A NAK will implicitly acknowledge all previous TID RDMA
+ * requests. Therefore, we NAK with the req->acked_tail
+ * segment for the request at qpriv->r_tid_ack (same at
+ * this point as the req->clear_tail segment for the
+ * qpriv->r_tid_tail request)
+ */
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ flow = req->acked_tail;
+ } else if (req->ack_seg == req->total_segs &&
+ qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK)
+ qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
+
+ trace_hfi1_tid_write_rsp_make_tid_ack(qp);
+ trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1,
+ &bth2);
+ len = 0;
+ qpriv->s_flags &= ~RVT_S_ACK_PENDING;
+ ps->s_txreq->hdr_dwords = hwords;
+ ps->s_txreq->sde = qpriv->s_sde;
+ ps->s_txreq->s_cur_size = len;
+ ps->s_txreq->ss = NULL;
+ hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle,
+ ps);
+ ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
+ return 1;
+bail:
+ /*
+ * Ensure s_rdma_ack_cnt changes are committed prior to resetting
+	 * RVT_S_ACK_PENDING
+ */
+ smp_wmb();
+ qpriv->s_flags &= ~RVT_S_ACK_PENDING;
+ return 0;
+}
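
The coalesced-ACK bookkeeping above relies on circular-buffer distance: the number of segments acknowledged in one TID ACK is the distance from acked_tail to clear_tail in the flow ring, and the flow reported in the ACK is the one just before the advanced acked_tail. Below is a small standalone sketch of that arithmetic; the ring size and index values are made up for the example.

/* Illustration only -- not part of the patch. Ring size is an assumption. */
#include <stdio.h>

#define EX_MAX_FLOWS 16                                    /* assumed power of two */
#define EX_CIRC_CNT(head, tail) (((head) - (tail)) & (EX_MAX_FLOWS - 1))
#define EX_CIRC_PREV(i)         (((i) - 1) & (EX_MAX_FLOWS - 1))

int main(void)
{
	unsigned int clear_tail = 5;   /* flow index of the segment being received */
	unsigned int acked_tail = 2;   /* flow index after the last ACK'ed segment */
	unsigned int ack_seg = 7;

	/* three segments are ACK'ed at once by the coalesced ACK */
	ack_seg += EX_CIRC_CNT(clear_tail, acked_tail);
	acked_tail = clear_tail;

	/* the flow actually reported in the ACK is the one before acked_tail */
	printf("ack_seg %u, acked_tail %u, reported flow %u\n",
	       ack_seg, acked_tail, EX_CIRC_PREV(acked_tail));
	return 0;
}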
+
+static int hfi1_send_tid_ok(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ return !(priv->s_flags & RVT_S_BUSY ||
+ qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
+ (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
+ (priv->s_flags & RVT_S_RESP_PENDING) ||
+ !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
+}
+
+void _hfi1_do_tid_send(struct work_struct *work)
+{
+ struct iowait_work *w = container_of(work, struct iowait_work, iowork);
+ struct rvt_qp *qp = iowait_to_qp(w->iow);
+
+ hfi1_do_tid_send(qp);
+}
+
+static void hfi1_do_tid_send(struct rvt_qp *qp)
+{
+ struct hfi1_pkt_state ps;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ ps.dev = to_idev(qp->ibqp.device);
+ ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ps.ppd = ppd_from_ibp(ps.ibp);
+ ps.wait = iowait_get_tid_work(&priv->s_iowait);
+ ps.in_thread = false;
+ ps.timeout_int = qp->timeout_jiffies / 8;
+
+ trace_hfi1_rc_do_tid_send(qp, false);
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
+
+ /* Return if we are already busy processing a work request. */
+ if (!hfi1_send_tid_ok(qp)) {
+ if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+ return;
+ }
+
+ priv->s_flags |= RVT_S_BUSY;
+
+ ps.timeout = jiffies + ps.timeout_int;
+ ps.cpu = priv->s_sde ? priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+ ps.pkts_sent = false;
+
+	/* ensure a pre-built packet is handled */
+ ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
+ do {
+ /* Check for a constructed packet to be sent. */
+ if (ps.s_txreq) {
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ qp->s_flags |= RVT_S_BUSY;
+ ps.wait = iowait_get_ib_work(&priv->s_iowait);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+
+ /*
+ * If the packet cannot be sent now, return and
+			 * the send engine will be woken up later.
+ */
+ if (hfi1_verbs_send(qp, &ps))
+ return;
+
+ /* allow other tasks to run */
+ if (hfi1_schedule_send_yield(qp, &ps, true))
+ return;
+
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
+ ps.wait = iowait_get_tid_work(&priv->s_iowait);
+ if (iowait_flag_set(&priv->s_iowait,
+ IOWAIT_PENDING_IB))
+ hfi1_schedule_send(qp);
+ }
+ }
+ } while (hfi1_make_tid_rdma_pkt(qp, &ps));
+ iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+}
+
+static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+ return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
+ priv->s_sde ?
+ priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(dd->node)));
+}
+
+/**
+ * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
+ * @qp: the QP
+ *
+ * This schedules qp progress on the TID RDMA state machine. Caller
+ * should hold the s_lock.
+ * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
+ * the two state machines can step on each other with respect to the
+ * RVT_S_BUSY flag.
+ * Therefore, a modified test is used.
+ * @return true if the second leg is scheduled;
+ * false if the second leg is not scheduled.
+ */
+bool hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+ lockdep_assert_held(&qp->s_lock);
+ if (hfi1_send_tid_ok(qp)) {
+ /*
+ * The following call returns true if the qp is not on the
+ * queue and false if the qp is already on the queue before
+ * this call. Either way, the qp will be on the queue when the
+ * call returns.
+ */
+ _hfi1_schedule_tid_send(qp);
+ return true;
+ }
+ if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+ iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
+ IOWAIT_PENDING_TID);
+ return false;
+}
+
+bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e)
+{
+ struct rvt_ack_entry *prev;
+ struct tid_rdma_request *req;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 s_prev;
+
+ s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) :
+ (qp->s_tail_ack_queue - 1);
+ prev = &qp->s_ack_queue[s_prev];
+
+ if ((e->opcode == TID_OP(READ_REQ) ||
+ e->opcode == OP(RDMA_READ_REQUEST)) &&
+ prev->opcode == TID_OP(WRITE_REQ)) {
+ req = ack_to_tid_req(prev);
+ if (req->ack_seg != req->total_segs) {
+ priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK;
+ return true;
+ }
+ }
+ return false;
+}
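
hfi1_tid_rdma_ack_interlock() above walks one slot back in the s_ack_queue, wrapping from index 0 to the last valid index (rvt_size_atomic()). The sketch below shows just that wrap-around; the queue size used here is an assumption for illustration.

/* Illustration only -- not part of the patch. Queue size is assumed. */
#include <stdio.h>

#define EX_RVT_SIZE_ATOMIC 16   /* stands in for rvt_size_atomic(&dev->rdi) */

/* valid slots are 0 .. EX_RVT_SIZE_ATOMIC, so the slot before 0 is the last one */
static unsigned int ex_prev_ack_slot(unsigned int cur)
{
	return cur == 0 ? EX_RVT_SIZE_ATOMIC : cur - 1;
}

int main(void)
{
	printf("prev of 0 -> %u, prev of 3 -> %u\n",
	       ex_prev_ack_slot(0), ex_prev_ack_slot(3));
	return 0;
}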
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
index a53598ce45b2..53ab24ef4f02 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.h
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -25,8 +25,34 @@
* s_flags, there are no collisions.
*
* HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock
+ * HFI1_R_TID_WAIT_INTERLCK - QP is waiting for responder interlock
*/
+#define HFI1_S_TID_BUSY_SET BIT(0)
+/* BIT(1) reserved for RVT_S_BUSY. */
+#define HFI1_R_TID_RSC_TIMER BIT(2)
+/* BIT(3) reserved for RVT_S_RESP_PENDING. */
+/* BIT(4) reserved for RVT_S_ACK_PENDING. */
#define HFI1_S_TID_WAIT_INTERLCK BIT(5)
+#define HFI1_R_TID_WAIT_INTERLCK BIT(6)
+/* BIT(7) - BIT(15) reserved for RVT_S_WAIT_*. */
+/* BIT(16) reserved for RVT_S_SEND_ONE */
+#define HFI1_S_TID_RETRY_TIMER BIT(17)
+/* BIT(18) reserved for RVT_S_ECN. */
+#define HFI1_R_TID_SW_PSN BIT(19)
+/* BIT(26) reserved for HFI1_S_WAIT_HALT */
+/* BIT(27) reserved for HFI1_S_WAIT_TID_RESP */
+/* BIT(28) reserved for HFI1_S_WAIT_TID_SPACE */
+
+/*
+ * Unlike regular IB RDMA VERBS, which do not require an entry
+ * in the s_ack_queue, TID RDMA WRITE requests do because they
+ * generate responses.
+ * Therefore, the s_ack_queue needs to be extended by a certain
+ * amount. The key point is that the queue needs to be extended
+ * without letting the "user" know, so the user doesn't end up
+ * using these extra entries.
+ */
+#define HFI1_TID_RDMA_WRITE_CNT 8
struct tid_rdma_params {
struct rcu_head rcu_head;
@@ -78,20 +104,25 @@ struct tid_rdma_request {
} e;
struct tid_rdma_flow *flows; /* array of tid flows */
+ struct rvt_sge_state ss; /* SGE state for TID RDMA requests */
u16 n_flows; /* size of the flow buffer window */
u16 setup_head; /* flow index we are setting up */
u16 clear_tail; /* flow index we are clearing */
u16 flow_idx; /* flow index most recently set up */
+ u16 acked_tail;
u32 seg_len;
u32 total_len;
+ u32 r_ack_psn; /* next expected ack PSN */
u32 r_flow_psn; /* IB PSN of next segment start */
+ u32 r_last_acked; /* IB PSN of last ACK'ed packet */
u32 s_next_psn; /* IB PSN of next segment start for read */
u32 total_segs; /* segments required to complete a request */
u32 cur_seg; /* index of current segment */
u32 comp_seg; /* index of last completed segment */
u32 ack_seg; /* index of last ack'ed segment */
+ u32 alloc_seg; /* index of next segment to be allocated */
u32 isge; /* index of "current" sge */
u32 ack_pending; /* num acks pending for this request */
@@ -158,11 +189,18 @@ struct tid_rdma_flow {
u8 npagesets;
u8 npkts;
u8 pkt;
+ u8 resync_npkts;
struct kern_tid_node tnode[TID_RDMA_MAX_PAGES];
struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES];
u32 tid_entry[TID_RDMA_MAX_PAGES];
};
+enum tid_rnr_nak_state {
+ TID_RNR_NAK_INIT = 0,
+ TID_RNR_NAK_SEND,
+ TID_RNR_NAK_SENT,
+};
+
bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data);
bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data);
bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data);
@@ -228,9 +266,57 @@ static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp,
struct rvt_swqe *wqe)
{
if (wqe->priv &&
- wqe->wr.opcode == IB_WR_RDMA_READ &&
+ (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_RDMA_WRITE) &&
wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE)
setup_tid_rdma_wqe(qp, wqe);
}
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len);
+
+void hfi1_compute_tid_rdma_flow_wt(void);
+
+void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet);
+
+u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 bth2, u32 *len,
+ struct rvt_sge_state **ss);
+
+void hfi1_del_tid_reap_timer(struct rvt_qp *qp);
+
+void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet);
+
+bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len);
+
+void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet);
+
+u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u16 iflow,
+ u32 *bth1, u32 *bth2);
+
+void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet);
+
+void hfi1_add_tid_retry_timer(struct rvt_qp *qp);
+void hfi1_del_tid_retry_timer(struct rvt_qp *qp);
+
+u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u16 fidx);
+
+void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet);
+
+struct hfi1_pkt_state;
+int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+void _hfi1_do_tid_send(struct work_struct *work);
+
+bool hfi1_schedule_tid_send(struct rvt_qp *qp);
+
+bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e);
+
#endif /* HFI1_TID_RDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c
index 28181d711fed..9a3d236bcc88 100644
--- a/drivers/infiniband/hw/hfi1/trace.c
+++ b/drivers/infiniband/hw/hfi1/trace.c
@@ -133,6 +133,11 @@ const char *hfi1_trace_get_packet_l2_str(u8 l2)
#define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x"
#define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
#define TID_READ_RSP_PRN "verbs_qp 0x%x"
+#define TID_WRITE_REQ_PRN "original_qp 0x%x"
+#define TID_WRITE_RSP_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_WRITE_DATA_PRN "verbs_qp 0x%x"
+#define TID_ACK_PRN "tid_flow_psn 0x%x verbs_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_RESYNC_PRN "verbs_qp 0x%x"
#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
@@ -327,6 +332,45 @@ const char *parse_everbs_hdrs(
parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
be32_to_cpu(eh->aeth) & IB_MSN_MASK);
break;
+ case OP(TID_RDMA, WRITE_REQ):
+ trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
+ TID_WRITE_REQ_PRN,
+ le32_to_cpu(eh->tid_rdma.w_req.kdeth0),
+ le32_to_cpu(eh->tid_rdma.w_req.kdeth1),
+ ib_u64_get(&eh->tid_rdma.w_req.reth.vaddr),
+ be32_to_cpu(eh->tid_rdma.w_req.reth.rkey),
+ be32_to_cpu(eh->tid_rdma.w_req.reth.length),
+ be32_to_cpu(eh->tid_rdma.w_req.verbs_qp));
+ break;
+ case OP(TID_RDMA, WRITE_RESP):
+ trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " "
+ TID_WRITE_RSP_PRN,
+ le32_to_cpu(eh->tid_rdma.w_rsp.kdeth0),
+ le32_to_cpu(eh->tid_rdma.w_rsp.kdeth1),
+ be32_to_cpu(eh->tid_rdma.w_rsp.aeth) >> 24,
+ parse_syndrome(/* aeth */
+ be32_to_cpu(eh->tid_rdma.w_rsp.aeth)
+ >> 24),
+ (be32_to_cpu(eh->tid_rdma.w_rsp.aeth) &
+ IB_MSN_MASK),
+ be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_psn),
+ be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_qp),
+ be32_to_cpu(eh->tid_rdma.w_rsp.verbs_qp));
+ break;
+ case OP(TID_RDMA, WRITE_DATA_LAST):
+ case OP(TID_RDMA, WRITE_DATA):
+ trace_seq_printf(p, TID_RDMA_KDETH_DATA " " TID_WRITE_DATA_PRN,
+ le32_to_cpu(eh->tid_rdma.w_data.kdeth0),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, KVER),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, SH),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, INTR),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, TIDCTRL),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, TID),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, OFFSET),
+ le32_to_cpu(eh->tid_rdma.w_data.kdeth1),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth1, JKEY),
+ be32_to_cpu(eh->tid_rdma.w_data.verbs_qp));
+ break;
case OP(TID_RDMA, READ_REQ):
trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
TID_READ_REQ_PRN,
@@ -359,6 +403,28 @@ const char *parse_everbs_hdrs(
IB_MSN_MASK),
be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp));
break;
+ case OP(TID_RDMA, ACK):
+ trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " "
+ TID_ACK_PRN,
+ le32_to_cpu(eh->tid_rdma.ack.kdeth0),
+ le32_to_cpu(eh->tid_rdma.ack.kdeth1),
+ be32_to_cpu(eh->tid_rdma.ack.aeth) >> 24,
+ parse_syndrome(/* aeth */
+ be32_to_cpu(eh->tid_rdma.ack.aeth)
+ >> 24),
+ (be32_to_cpu(eh->tid_rdma.ack.aeth) &
+ IB_MSN_MASK),
+ be32_to_cpu(eh->tid_rdma.ack.tid_flow_psn),
+ be32_to_cpu(eh->tid_rdma.ack.verbs_psn),
+ be32_to_cpu(eh->tid_rdma.ack.tid_flow_qp),
+ be32_to_cpu(eh->tid_rdma.ack.verbs_qp));
+ break;
+ case OP(TID_RDMA, RESYNC):
+ trace_seq_printf(p, TID_RDMA_KDETH " " TID_RESYNC_PRN,
+ le32_to_cpu(eh->tid_rdma.resync.kdeth0),
+ le32_to_cpu(eh->tid_rdma.resync.kdeth1),
+ be32_to_cpu(eh->tid_rdma.resync.verbs_qp));
+ break;
/* aeth + atomicacketh */
case OP(RC, ATOMIC_ACKNOWLEDGE):
trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
index 1116238bf24d..d1372cc66de6 100644
--- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
+++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
@@ -79,8 +79,14 @@ __print_symbolic(opcode, \
ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \
ib_opcode_name(RC_COMPARE_SWAP), \
ib_opcode_name(RC_FETCH_ADD), \
+ ib_opcode_name(TID_RDMA_WRITE_REQ), \
+ ib_opcode_name(TID_RDMA_WRITE_RESP), \
+ ib_opcode_name(TID_RDMA_WRITE_DATA), \
+ ib_opcode_name(TID_RDMA_WRITE_DATA_LAST), \
ib_opcode_name(TID_RDMA_READ_REQ), \
ib_opcode_name(TID_RDMA_READ_RESP), \
+ ib_opcode_name(TID_RDMA_RESYNC), \
+ ib_opcode_name(TID_RDMA_ACK), \
ib_opcode_name(UC_SEND_FIRST), \
ib_opcode_name(UC_SEND_MIDDLE), \
ib_opcode_name(UC_SEND_LAST), \
diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h
index b71638c22d4b..548dfc45a407 100644
--- a/drivers/infiniband/hw/hfi1/trace_tid.h
+++ b/drivers/infiniband/hw/hfi1/trace_tid.h
@@ -40,7 +40,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent);
#define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \
"r_psn 0x%x r_state 0x%x r_flags 0x%x " \
"r_head_ack_queue %u s_tail_ack_queue %u " \
- "s_ack_state 0x%x " \
+ "s_acked_ack_queue %u s_ack_state 0x%x " \
"s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \
"iow_flags 0x%lx"
@@ -52,20 +52,37 @@ u16 hfi1_trace_get_tid_idx(u32 ent);
#define TID_READ_SENDER_PRN "[%s] qpn 0x%x newreq %u tid_r_reqs %u " \
"tid_r_comp %u pending_tid_r_segs %u " \
"s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \
- "hw_flow_index %u generation 0x%x " \
+ "s_state 0x%x hw_flow_index %u generation 0x%x " \
"fpsn 0x%x flow_flags 0x%x"
#define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \
- "cur_seg %u comp_seg %u ack_seg %u " \
+ "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \
"total_segs %u setup_head %u clear_tail %u flow_idx %u " \
- "state %u r_flow_psn 0x%x " \
- "s_next_psn 0x%x"
+ "acked_tail %u state %u r_ack_psn 0x%x r_flow_psn 0x%x " \
+ "r_last_ackd 0x%x s_next_psn 0x%x"
#define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \
- "s_tail_ack_queue %u " \
+ "s_acked_ack_queue %u s_tail_ack_queue %u " \
"r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \
" diff %d"
+#define TID_WRITE_RSPDR_PRN "[%s] qpn 0x%x r_tid_head %u r_tid_tail %u " \
+ "r_tid_ack %u r_tid_alloc %u alloc_w_segs %u " \
+ "pending_tid_w_segs %u sync_pt %s " \
+ "ps_nak_psn 0x%x ps_nak_state 0x%x " \
+ "prnr_nak_state 0x%x hw_flow_index %u generation "\
+ "0x%x fpsn 0x%x flow_flags 0x%x resync %s" \
+ "r_next_psn_kdeth 0x%x"
+
+#define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \
+ "s_tid_tail %u s_tid_head %u " \
+ "pending_tid_w_resp %u n_requests %u " \
+ "n_tid_requests %u s_flags 0x%x ps_flags 0x%x "\
+ "iow_flags 0x%lx s_state 0x%x s_retry %u"
+
+#define KDETH_EFLAGS_ERR_PRN "[%s] qpn 0x%x TID ERR: RcvType 0x%x " \
+ "RcvTypeError 0x%x PSN 0x%x"
+
DECLARE_EVENT_CLASS(/* class */
hfi1_exp_tid_reg_unreg,
TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
@@ -382,6 +399,18 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, msg, more)
);
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_tid_timeout,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_tid_retry_timeout,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
DECLARE_EVENT_CLASS(/* tid_flow_page */
hfi1_tid_flow_page_template,
TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index,
@@ -562,6 +591,42 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, index, flow)
);
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_build_write_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_write_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_build_write_data,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_resync,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
DECLARE_EVENT_CLASS(/* tid_node */
hfi1_tid_node_template,
TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base,
@@ -656,6 +721,18 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, index, ent)
);
+DEFINE_EVENT(/* event */
+ hfi1_tid_entry_template, hfi1_tid_entry_rcv_write_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 entry),
+ TP_ARGS(qp, index, entry)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_entry_template, hfi1_tid_entry_build_write_data,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 entry),
+ TP_ARGS(qp, index, entry)
+);
+
DECLARE_EVENT_CLASS(/* rsp_info */
hfi1_responder_info_template,
TP_PROTO(struct rvt_qp *qp, u32 psn),
@@ -671,6 +748,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */
__field(u8, r_flags)
__field(u8, r_head_ack_queue)
__field(u8, s_tail_ack_queue)
+ __field(u8, s_acked_ack_queue)
__field(u8, s_ack_state)
__field(u8, s_nak_state)
__field(u8, r_nak_state)
@@ -691,6 +769,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */
__entry->r_flags = qp->r_flags;
__entry->r_head_ack_queue = qp->r_head_ack_queue;
__entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+ __entry->s_acked_ack_queue = qp->s_acked_ack_queue;
__entry->s_ack_state = qp->s_ack_state;
__entry->s_nak_state = qp->s_nak_state;
__entry->s_flags = qp->s_flags;
@@ -709,6 +788,7 @@ DECLARE_EVENT_CLASS(/* rsp_info */
__entry->r_flags,
__entry->r_head_ack_queue,
__entry->s_tail_ack_queue,
+ __entry->s_acked_ack_queue,
__entry->s_ack_state,
__entry->s_nak_state,
__entry->s_flags,
@@ -735,6 +815,42 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, psn)
);
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_tid_write_alloc_res,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_req,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_build_tid_write_resp,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_data,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_make_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
DECLARE_EVENT_CLASS(/* sender_info */
hfi1_sender_info_template,
TP_PROTO(struct rvt_qp *qp),
@@ -827,6 +943,18 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp)
);
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_make_tid_pkt,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
DECLARE_EVENT_CLASS(/* tid_read_sender */
hfi1_tid_read_sender_template,
TP_PROTO(struct rvt_qp *qp, char newreq),
@@ -841,6 +969,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
__field(u32, s_flags)
__field(u32, ps_flags)
__field(unsigned long, iow_flags)
+ __field(u8, s_state)
__field(u32, hw_flow_index)
__field(u32, generation)
__field(u32, fpsn)
@@ -858,6 +987,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
__entry->s_flags = qp->s_flags;
__entry->ps_flags = priv->s_flags;
__entry->iow_flags = priv->s_iowait.flags;
+ __entry->s_state = priv->s_state;
__entry->hw_flow_index = priv->flow_state.index;
__entry->generation = priv->flow_state.generation;
__entry->fpsn = priv->flow_state.psn;
@@ -874,6 +1004,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
__entry->s_flags,
__entry->ps_flags,
__entry->iow_flags,
+ __entry->s_state,
__entry->hw_flow_index,
__entry->generation,
__entry->fpsn,
@@ -902,12 +1033,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */
__field(u32, cur_seg)
__field(u32, comp_seg)
__field(u32, ack_seg)
+ __field(u32, alloc_seg)
__field(u32, total_segs)
__field(u16, setup_head)
__field(u16, clear_tail)
__field(u16, flow_idx)
+ __field(u16, acked_tail)
__field(u32, state)
+ __field(u32, r_ack_psn)
__field(u32, r_flow_psn)
+ __field(u32, r_last_acked)
__field(u32, s_next_psn)
),
TP_fast_assign(/* assign */
@@ -920,12 +1055,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */
__entry->cur_seg = req->cur_seg;
__entry->comp_seg = req->comp_seg;
__entry->ack_seg = req->ack_seg;
+ __entry->alloc_seg = req->alloc_seg;
__entry->total_segs = req->total_segs;
__entry->setup_head = req->setup_head;
__entry->clear_tail = req->clear_tail;
__entry->flow_idx = req->flow_idx;
+ __entry->acked_tail = req->acked_tail;
__entry->state = req->state;
+ __entry->r_ack_psn = req->r_ack_psn;
__entry->r_flow_psn = req->r_flow_psn;
+ __entry->r_last_acked = req->r_last_acked;
__entry->s_next_psn = req->s_next_psn;
),
TP_printk(/* print */
@@ -939,12 +1078,16 @@ DECLARE_EVENT_CLASS(/* tid_rdma_request */
__entry->cur_seg,
__entry->comp_seg,
__entry->ack_seg,
+ __entry->alloc_seg,
__entry->total_segs,
__entry->setup_head,
__entry->clear_tail,
__entry->flow_idx,
+ __entry->acked_tail,
__entry->state,
+ __entry->r_ack_psn,
__entry->r_flow_psn,
+ __entry->r_last_acked,
__entry->s_next_psn
)
);
@@ -998,6 +1141,97 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
);
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_write_alloc_res,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_build_write_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_data,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_tid_retry_timeout,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_resync,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_pkt,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_rc_ack_write,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_write,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
DECLARE_EVENT_CLASS(/* rc_rcv_err */
hfi1_rc_rcv_err_template,
TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff),
@@ -1007,6 +1241,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */
__field(u32, qpn)
__field(u32, s_flags)
__field(u8, state)
+ __field(u8, s_acked_ack_queue)
__field(u8, s_tail_ack_queue)
__field(u8, r_head_ack_queue)
__field(u32, opcode)
@@ -1019,6 +1254,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */
__entry->qpn = qp->ibqp.qp_num;
__entry->s_flags = qp->s_flags;
__entry->state = qp->state;
+ __entry->s_acked_ack_queue = qp->s_acked_ack_queue;
__entry->s_tail_ack_queue = qp->s_tail_ack_queue;
__entry->r_head_ack_queue = qp->r_head_ack_queue;
__entry->opcode = opcode;
@@ -1032,6 +1268,7 @@ DECLARE_EVENT_CLASS(/* rc_rcv_err */
__entry->qpn,
__entry->s_flags,
__entry->state,
+ __entry->s_acked_ack_queue,
__entry->s_tail_ack_queue,
__entry->r_head_ack_queue,
__entry->opcode,
@@ -1081,6 +1318,289 @@ DEFINE_EVENT(/* event */
TP_ARGS(qp, index, sge)
);
+DECLARE_EVENT_CLASS(/* tid_write_sp */
+ hfi1_tid_write_rsp_template,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, r_tid_head)
+ __field(u32, r_tid_tail)
+ __field(u32, r_tid_ack)
+ __field(u32, r_tid_alloc)
+ __field(u32, alloc_w_segs)
+ __field(u32, pending_tid_w_segs)
+ __field(bool, sync_pt)
+ __field(u32, ps_nak_psn)
+ __field(u8, ps_nak_state)
+ __field(u8, prnr_nak_state)
+ __field(u32, hw_flow_index)
+ __field(u32, generation)
+ __field(u32, fpsn)
+ __field(u32, flow_flags)
+ __field(bool, resync)
+ __field(u32, r_next_psn_kdeth)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->r_tid_head = priv->r_tid_head;
+ __entry->r_tid_tail = priv->r_tid_tail;
+ __entry->r_tid_ack = priv->r_tid_ack;
+ __entry->r_tid_alloc = priv->r_tid_alloc;
+ __entry->alloc_w_segs = priv->alloc_w_segs;
+ __entry->pending_tid_w_segs = priv->pending_tid_w_segs;
+ __entry->sync_pt = priv->sync_pt;
+ __entry->ps_nak_psn = priv->s_nak_psn;
+ __entry->ps_nak_state = priv->s_nak_state;
+ __entry->prnr_nak_state = priv->rnr_nak_state;
+ __entry->hw_flow_index = priv->flow_state.index;
+ __entry->generation = priv->flow_state.generation;
+ __entry->fpsn = priv->flow_state.psn;
+ __entry->flow_flags = priv->flow_state.flags;
+ __entry->resync = priv->resync;
+ __entry->r_next_psn_kdeth = priv->r_next_psn_kdeth;
+ ),
+ TP_printk(/* print */
+ TID_WRITE_RSPDR_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->r_tid_head,
+ __entry->r_tid_tail,
+ __entry->r_tid_ack,
+ __entry->r_tid_alloc,
+ __entry->alloc_w_segs,
+ __entry->pending_tid_w_segs,
+ __entry->sync_pt ? "yes" : "no",
+ __entry->ps_nak_psn,
+ __entry->ps_nak_state,
+ __entry->prnr_nak_state,
+ __entry->hw_flow_index,
+ __entry->generation,
+ __entry->fpsn,
+ __entry->flow_flags,
+ __entry->resync ? "yes" : "no",
+ __entry->r_next_psn_kdeth
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_alloc_res,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_req,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_build_resp,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_data,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_resync,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_tid_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_rc_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* tid_write_sender */
+ hfi1_tid_write_sender_template,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, newreq)
+ __field(u32, s_tid_cur)
+ __field(u32, s_tid_tail)
+ __field(u32, s_tid_head)
+ __field(u32, pending_tid_w_resp)
+ __field(u32, n_requests)
+ __field(u32, n_tid_requests)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ __field(u8, s_state)
+ __field(u8, s_retry)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->newreq = newreq;
+ __entry->s_tid_cur = priv->s_tid_cur;
+ __entry->s_tid_tail = priv->s_tid_tail;
+ __entry->s_tid_head = priv->s_tid_head;
+ __entry->pending_tid_w_resp = priv->pending_tid_w_resp;
+ __entry->n_requests = atomic_read(&priv->n_requests);
+ __entry->n_tid_requests = atomic_read(&priv->n_tid_requests);
+ __entry->s_flags = qp->s_flags;
+ __entry->ps_flags = priv->s_flags;
+ __entry->iow_flags = priv->s_iowait.flags;
+ __entry->s_state = priv->s_state;
+ __entry->s_retry = priv->s_retry;
+ ),
+ TP_printk(/* print */
+ TID_WRITE_SENDER_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->newreq,
+ __entry->s_tid_cur,
+ __entry->s_tid_tail,
+ __entry->s_tid_head,
+ __entry->pending_tid_w_resp,
+ __entry->n_requests,
+ __entry->n_tid_requests,
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags,
+ __entry->s_state,
+ __entry->s_retry
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_retry_timeout,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_tid_pkt,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_restart_rc,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DECLARE_EVENT_CLASS(/* tid_ack */
+ hfi1_tid_ack_template,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+ u32 req_psn, u32 resync_psn),
+ TP_ARGS(qp, aeth, psn, req_psn, resync_psn),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, aeth)
+ __field(u32, psn)
+ __field(u32, req_psn)
+ __field(u32, resync_psn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->aeth = aeth;
+ __entry->psn = psn;
+ __entry->req_psn = req_psn;
+ __entry->resync_psn = resync_psn;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x aeth 0x%x psn 0x%x req_psn 0x%x resync_psn 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->aeth,
+ __entry->psn,
+ __entry->req_psn,
+ __entry->resync_psn
+ )
+);
+
+DEFINE_EVENT(/* rcv_tid_ack */
+ hfi1_tid_ack_template, hfi1_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+ u32 req_psn, u32 resync_psn),
+ TP_ARGS(qp, aeth, psn, req_psn, resync_psn)
+);
+
+DECLARE_EVENT_CLASS(/* kdeth_eflags_error */
+ hfi1_kdeth_eflags_error_template,
+ TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn),
+ TP_ARGS(qp, rcv_type, rte, psn),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u8, rcv_type)
+ __field(u8, rte)
+ __field(u32, psn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->rcv_type = rcv_type;
+ __entry->rte = rte;
+ __entry->psn = psn;
+ ),
+ TP_printk(/* print */
+ KDETH_EFLAGS_ERR_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->rcv_type,
+ __entry->rte,
+ __entry->psn
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_kdeth_eflags_error_template, hfi1_eflags_err_write,
+ TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn),
+ TP_ARGS(qp, rcv_type, rte, psn)
+);
+
#endif /* __HFI1_TRACE_TID_H */
#undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
index 37dbb3e599c3..09eb0c9ada00 100644
--- a/drivers/infiniband/hw/hfi1/trace_tx.h
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -846,6 +846,12 @@ DEFINE_EVENT(
TP_ARGS(qp, flag)
);
+DEFINE_EVENT(/* event */
+ hfi1_do_send_template, hfi1_rc_do_tid_send,
+ TP_PROTO(struct rvt_qp *qp, bool flag),
+ TP_ARGS(qp, flag)
+);
+
DEFINE_EVENT(
hfi1_do_send_template, hfi1_rc_expired_time_slice,
TP_PROTO(struct rvt_qp *qp, bool flag),
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index e5e7fad09f32..8bfbc6d7ea34 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -144,8 +144,10 @@ static int defer_packet_queue(
*/
xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
write_seqlock(&sde->waitlock);
- if (list_empty(&pq->busy.list))
+ if (list_empty(&pq->busy.list)) {
+ iowait_get_priority(&pq->busy);
iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
+ }
write_sequnlock(&sde->waitlock);
return -EBUSY;
eagain:
@@ -191,7 +193,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
pq->mm = fd->mm;
iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
- activate_packet_queue, NULL);
+ activate_packet_queue, NULL, NULL);
pq->reqidx = 0;
pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
@@ -1126,7 +1128,8 @@ static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
0xffffffull),
psn = val & mask;
if (expct)
- psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
+ psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
+ ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
else
psn = psn + frags;
return psn & mask;
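
The expected-receive branch above now advances only the KDETH sequence bits of the PSN and leaves the generation bits alone. The sketch below shows the effect with an assumed sequence-mask width; the actual HFI1_KDETH_BTH_SEQ_MASK value is defined elsewhere in the driver.

/* Illustration only -- not part of the patch. Mask width is an assumption. */
#include <stdint.h>
#include <stdio.h>

#define EX_KDETH_SEQ_MASK 0x7FFu   /* assumed: low bits carry the sequence */

static uint32_t ex_advance_expected_psn(uint32_t psn, uint32_t frags)
{
	/* the sequence wraps inside its mask; the generation bits never change */
	return (psn & ~EX_KDETH_SEQ_MASK) | ((psn + frags) & EX_KDETH_SEQ_MASK);
}

int main(void)
{
	uint32_t psn = 0x37FF;   /* sequence at its maximum for this generation */

	printf("0x%x + 1 frag -> 0x%x\n", psn, ex_advance_expected_psn(psn, 1));
	return 0;
}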
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 88676ca79fda..55a56b3d7f83 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -161,6 +161,7 @@ MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the
*/
const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+ [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE,
[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
[IB_WR_SEND] = IB_WC_SEND,
[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
@@ -203,6 +204,12 @@ const u8 hdr_len_by_opcode[256] = {
[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4,
[IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36,
[IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_REQ] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_RESP] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_ACK] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_RESYNC] = 12 + 8 + 36,
/* UC */
[IB_OPCODE_UC_SEND_FIRST] = 12 + 8,
[IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8,
@@ -248,8 +255,14 @@ static const opcode_handler opcode_handler_tbl[256] = {
[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv,
/* TID RDMA has separate handlers for different opcodes.*/
+ [IB_OPCODE_TID_RDMA_WRITE_REQ] = &hfi1_rc_rcv_tid_rdma_write_req,
+ [IB_OPCODE_TID_RDMA_WRITE_RESP] = &hfi1_rc_rcv_tid_rdma_write_resp,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA] = &hfi1_rc_rcv_tid_rdma_write_data,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data,
[IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req,
[IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp,
+ [IB_OPCODE_TID_RDMA_RESYNC] = &hfi1_rc_rcv_tid_rdma_resync,
+ [IB_OPCODE_TID_RDMA_ACK] = &hfi1_rc_rcv_tid_rdma_ack,
/* UC */
[IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv,
@@ -932,6 +945,7 @@ static int pio_wait(struct rvt_qp *qp,
dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN);
qp->s_flags |= flag;
was_empty = list_empty(&sc->piowait);
+ iowait_get_priority(&priv->s_iowait);
iowait_queue(ps->pkts_sent, &priv->s_iowait,
&sc->piowait);
priv->s_iowait.lock = &sc->waitlock;
@@ -1332,7 +1346,9 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd)
rdi->dparms.props.max_mr_size = U64_MAX;
rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
rdi->dparms.props.max_qp = hfi1_max_qps;
- rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
+ rdi->dparms.props.max_qp_wr =
+ (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ?
+ HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs);
rdi->dparms.props.max_send_sge = hfi1_max_sges;
rdi->dparms.props.max_recv_sge = hfi1_max_sges;
rdi->dparms.props.max_sge_rd = hfi1_max_sges;
@@ -1888,7 +1904,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd)
dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
dd->verbs_dev.rdi.dparms.reserved_operations = 1;
- dd->verbs_dev.rdi.dparms.extra_rdma_atomic = 1;
+ dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT;
/* post send table */
dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index 841727a684d5..62ace0b2d17a 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -163,16 +163,39 @@ struct hfi1_qp_priv {
u32 tid_enqueue; /* saved when tid waited */
u8 s_sc; /* SC[0..4] for next packet */
struct iowait s_iowait;
+ struct timer_list s_tid_timer; /* for timing tid wait */
+ struct timer_list s_tid_retry_timer; /* for timing tid ack */
struct list_head tid_wait; /* for queueing tid space */
struct hfi1_opfn_data opfn;
struct tid_flow_state flow_state;
struct tid_rdma_qp_params tid_rdma;
struct rvt_qp *owner;
u8 hdr_type; /* 9B or 16B */
+ struct rvt_sge_state tid_ss; /* SGE state pointer for 2nd leg */
+ atomic_t n_requests; /* # of TID RDMA requests in the */
+ /* queue */
+ atomic_t n_tid_requests; /* # of sent TID RDMA requests */
unsigned long tid_timer_timeout_jiffies;
+ unsigned long tid_retry_timeout_jiffies;
/* variables for the TID RDMA SE state machine */
+ u8 s_state;
+ u8 s_retry;
+ u8 rnr_nak_state; /* RNR NAK state */
+ u8 s_nak_state;
+ u32 s_nak_psn;
u32 s_flags;
+ u32 s_tid_cur;
+ u32 s_tid_head;
+ u32 s_tid_tail;
+ u32 r_tid_head; /* Most recently added TID RDMA request */
+ u32 r_tid_tail; /* the last completed TID RDMA request */
+ u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */
+ u32 r_tid_alloc; /* Request for which we are allocating resources */
+ u32 pending_tid_w_segs; /* Num of pending tid write segments */
+ u32 pending_tid_w_resp; /* Num of pending tid write responses */
+ u32 alloc_w_segs; /* Number of segments for which write */
+ /* resources have been allocated for this QP */
/* For TID RDMA READ */
u32 tid_r_reqs; /* Num of tid reads requested */
@@ -180,14 +203,23 @@ struct hfi1_qp_priv {
u32 pending_tid_r_segs; /* Num of pending tid read segments */
u16 pkts_ps; /* packets per segment */
u8 timeout_shift; /* account for number of packets per segment */
+
+ u32 r_next_psn_kdeth;
+ u32 r_next_psn_kdeth_save;
+ u32 s_resync_psn;
+ u8 sync_pt; /* Set when QP reaches sync point */
+ u8 resync;
};
+#define HFI1_QP_WQE_INVALID ((u32)-1)
+
struct hfi1_swqe_priv {
struct tid_rdma_request tid_req;
struct rvt_sge_state ss; /* Used for TID RDMA READ Request */
};
struct hfi1_ack_priv {
+ struct rvt_sge_state ss; /* used for TID WRITE RESP */
struct tid_rdma_request tid_req;
};
@@ -412,6 +444,9 @@ void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ bool tid);
+
void _hfi1_do_send(struct work_struct *work);
void hfi1_do_send_from_rvt(struct rvt_qp *qp);
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h
index 2a77af26a231..b002e96eb335 100644
--- a/drivers/infiniband/hw/hfi1/verbs_txreq.h
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h
@@ -94,6 +94,7 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
tx->txreq.num_desc = 0;
/* Set the header type */
tx->phdr.hdr.hdr_type = priv->hdr_type;
+ tx->txreq.flags = 0;
return tx;
}
diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c
index 1f81c480e028..af1b1ffcb38e 100644
--- a/drivers/infiniband/hw/hfi1/vnic_sdma.c
+++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c
@@ -240,8 +240,10 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde,
}
vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED;
- if (list_empty(&vnic_sdma->wait.list))
+ if (list_empty(&vnic_sdma->wait.list)) {
+ iowait_get_priority(wait->iow);
iowait_queue(pkts_sent, wait->iow, &sde->dmawait);
+ }
write_sequnlock(&sde->waitlock);
return -EBUSY;
}
@@ -281,7 +283,7 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo)
iowait_init(&vnic_sdma->wait, 0, NULL, NULL,
hfi1_vnic_sdma_sleep,
- hfi1_vnic_sdma_wakeup, NULL);
+ hfi1_vnic_sdma_wakeup, NULL, NULL);
vnic_sdma->sde = &vinfo->dd->per_sdma[i];
vnic_sdma->dd = vinfo->dd;
vnic_sdma->vinfo = vinfo;