From 5f90677ed31963abb184ee08ebee4a4a68225dd8 Mon Sep 17 00:00:00 2001
From: Kaike Wan
Date: Fri, 7 Jun 2019 08:25:25 -0400
Subject: IB/hfi1: Validate fault injection opcode user input

The opcode range for fault injection supplied by the user should be
validated before it is applied to the fault->opcodes[] bitmap, to
avoid an out-of-bounds access.

Cc:
Fixes: a74d5307caba ("IB/hfi1: Rework fault injection machinery")
Reported-by: Dan Carpenter
Reviewed-by: Mike Marciniszyn
Signed-off-by: Kaike Wan
Signed-off-by: Dennis Dalessandro
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hfi1/fault.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c
index 3fd3315d0fb0..93613e5def9b 100644
--- a/drivers/infiniband/hw/hfi1/fault.c
+++ b/drivers/infiniband/hw/hfi1/fault.c
@@ -153,6 +153,7 @@ static ssize_t fault_opcodes_write(struct file *file, const char __user *buf,
 		char *dash;
 		unsigned long range_start, range_end, i;
 		bool remove = false;
+		unsigned long bound = 1U << BITS_PER_BYTE;
 
 		end = strchr(ptr, ',');
 		if (end)
@@ -178,6 +179,10 @@ static ssize_t fault_opcodes_write(struct file *file, const char __user *buf,
 				       BITS_PER_BYTE);
 			break;
 		}
+		/* Check the inputs */
+		if (range_start >= bound || range_end >= bound)
+			break;
+
 		for (i = range_start; i <= range_end; i++) {
 			if (remove)
 				clear_bit(i, fault->opcodes);
--
cgit v1.2.3

From da9de5f8527f4b9efc82f967d29a583318c034c7 Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn
Date: Fri, 7 Jun 2019 08:25:31 -0400
Subject: IB/hfi1: Close PSM sdma_progress sleep window

sdma_progress() is called outside the wait lock. This leaves a race
window in which sdma_progress() can return false and the sdma_engine
can then idle. If that happens, there will be no more sdma interrupts
to cause the wakeup and the user_sdma xmit will hang.

Fix by moving the lock to enclose the sdma_progress() call.

Also, delete busycount. The need for this was removed by:
commit bcad29137a97 ("IB/hfi1: Serve the most starved iowait entry first")

Cc:
Fixes: 7724105686e7 ("IB/hfi1: add driver files")
Reviewed-by: Gary Leshner
Signed-off-by: Mike Marciniszyn
Signed-off-by: Dennis Dalessandro
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hfi1/user_sdma.c | 12 ++++--------
 drivers/infiniband/hw/hfi1/user_sdma.h |  1 -
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index 8bfbc6d7ea34..fd754a16475a 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -130,20 +130,16 @@ static int defer_packet_queue(
 {
 	struct hfi1_user_sdma_pkt_q *pq =
 		container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
-	struct user_sdma_txreq *tx =
-		container_of(txreq, struct user_sdma_txreq, txreq);
 
-	if (sdma_progress(sde, seq, txreq)) {
-		if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
-			goto eagain;
-	}
+	write_seqlock(&sde->waitlock);
+	if (sdma_progress(sde, seq, txreq))
+		goto eagain;
 	/*
 	 * We are assuming that if the list is enqueued somewhere, it
 	 * is to the dmawait list since that is the only place where
 	 * it is supposed to be enqueued.
 	 */
 	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
-	write_seqlock(&sde->waitlock);
 	if (list_empty(&pq->busy.list)) {
 		iowait_get_priority(&pq->busy);
 		iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
@@ -151,6 +147,7 @@ static int defer_packet_queue(
 	write_sequnlock(&sde->waitlock);
 	return -EBUSY;
 eagain:
+	write_sequnlock(&sde->waitlock);
 	return -EAGAIN;
 }
 
@@ -804,7 +801,6 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
 
 		tx->flags = 0;
 		tx->req = req;
-		tx->busycount = 0;
 		INIT_LIST_HEAD(&tx->list);
 
 		/*
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
index 14dfd757dafd..4d8510b0fc38 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.h
+++ b/drivers/infiniband/hw/hfi1/user_sdma.h
@@ -245,7 +245,6 @@ struct user_sdma_txreq {
 	struct list_head list;
 	struct user_sdma_request *req;
 	u16 flags;
-	unsigned int busycount;
 	u16 seqnum;
 };
 
--
cgit v1.2.3

From cc78076af14e1478c1a8fb18997674b5f8cbe3c8 Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn
Date: Mon, 10 Jun 2019 12:28:18 -0400
Subject: IB/hfi1: Correct tid qp rcd to match verbs context

The qp priv rcd pointer doesn't match the context being used for verbs,
causing issues when 9B and kdeth packets are processed by different
receive contexts and hence different CPUs.

When running on different CPUs the following panic can occur:

 WARNING: CPU: 3 PID: 2584 at lib/list_debug.c:59 __list_del_entry+0xa1/0xd0
 list_del corruption. prev->next should be ffff9a7ac31f7a30, but was ffff9a7c3bc89230
 CPU: 3 PID: 2584 Comm: z_wr_iss Kdump: loaded Tainted: P OE ------------ 3.10.0-862.2.3.el7_lustre.x86_64 #1
 Call Trace:
 [] dump_stack+0x19/0x1b
 [] __warn+0xd8/0x100
 [] warn_slowpath_fmt+0x5f/0x80
 [] __list_del_entry+0xa1/0xd0
 [] process_rcv_qp_work+0xb5/0x160 [hfi1]
 [] handle_receive_interrupt_nodma_rtail+0x20b/0x2b0 [hfi1]
 [] receive_context_interrupt+0x23/0x40 [hfi1]
 [] __handle_irq_event_percpu+0x44/0x1c0
 [] handle_irq_event_percpu+0x32/0x80
 [] handle_irq_event+0x3c/0x60
 [] handle_edge_irq+0x7f/0x150
 [] handle_irq+0xe4/0x1a0
 [] do_IRQ+0x4d/0xf0
 [] common_interrupt+0x162/0x162
 [] ? memcpy+0x6/0x110
 [] ? abd_copy_from_buf_off_cb+0x1d/0x30 [zfs]
 [] ? abd_copy_to_buf_off_cb+0x30/0x30 [zfs]
 [] abd_iterate_func+0x97/0x120 [zfs]
 [] abd_copy_from_buf_off+0x39/0x60 [zfs]
 [] arc_write_ready+0x178/0x300 [zfs]
 [] ? mutex_lock+0x12/0x2f
 [] ? mutex_lock+0x12/0x2f
 [] zio_ready+0x65/0x3d0 [zfs]
 [] ? tsd_get_by_thread+0x2e/0x50 [spl]
 [] ? taskq_member+0x18/0x30 [spl]
 [] zio_execute+0xa2/0x100 [zfs]
 [] taskq_thread+0x2ac/0x4f0 [spl]
 [] ? wake_up_state+0x20/0x20
 [] ? zio_taskq_member.isra.7.constprop.10+0x80/0x80 [zfs]
 [] ? taskq_thread_spawn+0x60/0x60 [spl]
 [] kthread+0xd1/0xe0
 [] ? insert_kthread_work+0x40/0x40
 [] ret_from_fork_nospec_begin+0x21/0x21
 [] ? insert_kthread_work+0x40/0x40

Fix by reading the map entry in the same manner as the hardware so that
the kdeth and verbs contexts match.
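
Aside (illustrative, not part of the patch): RCV_QP_MAP_TABLE packs
eight 8-bit context entries into each 64-bit CSR, so the lookup added
below in hfi1_get_qp_map() is plain byte extraction. A standalone
sketch of the same arithmetic over a hypothetical packed table:

	#include <stdint.h>

	/* Hypothetical packed map: eight 8-bit entries per 64-bit word,
	 * mirroring how hfi1_get_qp_map() reads RCV_QP_MAP_TABLE. */
	static uint8_t qp_map_lookup(const uint64_t *table, uint8_t idx)
	{
		uint64_t reg = table[idx / 8];	/* word holding entries (idx & ~7)..(idx | 7) */

		reg >>= (idx % 8) * 8;		/* move the wanted byte to bits 7..0 */
		return (uint8_t)reg;		/* truncate to the single entry */
	}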

Cc:
Fixes: 5190f052a365 ("IB/hfi1: Allow the driver to initialize QP priv struct")
Reviewed-by: Kaike Wan
Signed-off-by: Mike Marciniszyn
Signed-off-by: Dennis Dalessandro
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hfi1/chip.c     | 13 +++++++++++++
 drivers/infiniband/hw/hfi1/chip.h     |  1 +
 drivers/infiniband/hw/hfi1/tid_rdma.c |  4 +---
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 4221a99ee7f4..d5b643a1d9fd 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -14031,6 +14031,19 @@ static void init_kdeth_qp(struct hfi1_devdata *dd)
 		  RCV_BTH_QP_KDETH_QP_SHIFT);
 }
 
+/**
+ * hfi1_get_qp_map
+ * @dd: device data
+ * @idx: index to read
+ */
+u8 hfi1_get_qp_map(struct hfi1_devdata *dd, u8 idx)
+{
+	u64 reg = read_csr(dd, RCV_QP_MAP_TABLE + (idx / 8) * 8);
+
+	reg >>= (idx % 8) * 8;
+	return reg;
+}
+
 /**
  * init_qpmap_table
  * @dd - device data
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index 4e6c3556ec48..b76cf81f927f 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -1445,6 +1445,7 @@ void clear_all_interrupts(struct hfi1_devdata *dd);
 void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr);
 void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr);
 void reset_interrupts(struct hfi1_devdata *dd);
+u8 hfi1_get_qp_map(struct hfi1_devdata *dd, u8 idx);
 
 /*
  * Interrupt source table.
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index 6fb93032fbef..aa9c8d3ef87b 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -312,9 +312,7 @@ static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi,
 	if (qp->ibqp.qp_num == 0)
 		ctxt = 0;
 	else
-		ctxt = ((qp->ibqp.qp_num >> dd->qos_shift) %
-			(dd->n_krcv_queues - 1)) + 1;
-
+		ctxt = hfi1_get_qp_map(dd, qp->ibqp.qp_num >> dd->qos_shift);
 	return dd->rcd[ctxt];
 }
 
--
cgit v1.2.3

From cf131a81967583ae737df6383a0893b9fee75b4e Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn
Date: Fri, 14 Jun 2019 12:32:26 -0400
Subject: IB/hfi1: Avoid hardlockup with flushlist_lock

Heavy contention of the sde flushlist_lock can cause hard lockups at
extreme scale when the flushing logic is under stress.

Mitigate by replacing the item-at-a-time copy to the local list with an
O(1) list_splice_init() and by using the high-priority work queue to do
the flushes.
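
Aside (illustrative, not from the patch): the gain comes from moving
the whole chain in constant time instead of deleting and re-adding
entries one at a time while the contended lock is held. A minimal
userspace sketch of the splice operation, using hypothetical list
nodes:

	#include <stdio.h>

	struct node { struct node *prev, *next; };

	static void list_init(struct node *h) { h->prev = h->next = h; }

	/* Move every entry of src onto dst and leave src empty, touching
	 * only four links -- the property that lets sdma_flush() shrink
	 * its flushlist_lock critical section to O(1). */
	static void splice_init(struct node *src, struct node *dst)
	{
		if (src->next == src)
			return;			/* src empty: nothing to move */
		src->next->prev = dst;
		src->prev->next = dst->next;
		dst->next->prev = src->prev;
		dst->next = src->next;
		list_init(src);
	}

	int main(void)
	{
		struct node src, dst, a, b;

		list_init(&src); list_init(&dst);
		a.prev = &src; a.next = &b; b.prev = &a; b.next = &src;
		src.next = &a; src.prev = &b;	/* src holds a, b */

		splice_init(&src, &dst);
		printf("src empty: %d, dst head is a: %d\n",
		       src.next == &src, dst.next == &a);
		return 0;
	}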

Fixes: 7724105686e7 ("IB/hfi1: add driver files")
Cc:
Reviewed-by: Dennis Dalessandro
Signed-off-by: Mike Marciniszyn
Signed-off-by: Dennis Dalessandro
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/hfi1/sdma.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index b0110728f541..70828de7436b 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -410,10 +410,7 @@ static void sdma_flush(struct sdma_engine *sde)
 	sdma_flush_descq(sde);
 	spin_lock_irqsave(&sde->flushlist_lock, flags);
 	/* copy flush list */
-	list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) {
-		list_del_init(&txp->list);
-		list_add_tail(&txp->list, &flushlist);
-	}
+	list_splice_init(&sde->flushlist, &flushlist);
 	spin_unlock_irqrestore(&sde->flushlist_lock, flags);
 	/* flush from flush list */
 	list_for_each_entry_safe(txp, txp_next, &flushlist, list)
@@ -2413,7 +2410,7 @@ unlock_noconn:
 	list_add_tail(&tx->list, &sde->flushlist);
 	spin_unlock(&sde->flushlist_lock);
 	iowait_inc_wait_count(wait, tx->num_desc);
-	schedule_work(&sde->flush_worker);
+	queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker);
 	ret = -ECOMM;
 	goto unlock;
 nodesc:
@@ -2511,7 +2508,7 @@ unlock_noconn:
 		iowait_inc_wait_count(wait, tx->num_desc);
 	}
 	spin_unlock(&sde->flushlist_lock);
-	schedule_work(&sde->flush_worker);
+	queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker);
 	ret = -ECOMM;
 	goto update_tail;
 nodesc:
--
cgit v1.2.3

From 3230f4a8d44e4a0bb7afea814b280b5129521f52 Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn
Date: Fri, 14 Jun 2019 12:32:32 -0400
Subject: IB/hfi1: Silence txreq allocation warnings

The following warning can happen when a memory shortage occurs during
txreq allocation:

 [10220.939246] SLUB: Unable to allocate memory on node -1, gfp=0xa20(GFP_ATOMIC)
 [10220.939246] Hardware name: Intel Corporation S2600WT2R/S2600WT2R, BIOS SE5C610.86B.01.01.0018.C4.072020161249 07/20/2016
 [10220.939247] cache: mnt_cache, object size: 384, buffer size: 384, default order: 2, min order: 0
 [10220.939260] Workqueue: hfi0_0 _hfi1_do_send [hfi1]
 [10220.939261] node 0: slabs: 1026568, objs: 43115856, free: 0
 [10220.939262] Call Trace:
 [10220.939262] node 1: slabs: 820872, objs: 34476624, free: 0
 [10220.939263] dump_stack+0x5a/0x73
 [10220.939265] warn_alloc+0x103/0x190
 [10220.939267] ? wake_all_kswapds+0x54/0x8b
 [10220.939268] __alloc_pages_slowpath+0x86c/0xa2e
 [10220.939270] ? __alloc_pages_nodemask+0x2fe/0x320
 [10220.939271] __alloc_pages_nodemask+0x2fe/0x320
 [10220.939273] new_slab+0x475/0x550
 [10220.939275] ___slab_alloc+0x36c/0x520
 [10220.939287] ? hfi1_make_rc_req+0x90/0x18b0 [hfi1]
 [10220.939299] ? __get_txreq+0x54/0x160 [hfi1]
 [10220.939310] ? hfi1_make_rc_req+0x90/0x18b0 [hfi1]
 [10220.939312] __slab_alloc+0x40/0x61
 [10220.939323] ? hfi1_make_rc_req+0x90/0x18b0 [hfi1]
 [10220.939325] kmem_cache_alloc+0x181/0x1b0
 [10220.939336] hfi1_make_rc_req+0x90/0x18b0 [hfi1]
 [10220.939348] ? hfi1_verbs_send_dma+0x386/0xa10 [hfi1]
 [10220.939359] ? find_prev_entry+0xb0/0xb0 [hfi1]
 [10220.939371] hfi1_do_send+0x1d9/0x3f0 [hfi1]
 [10220.939372] process_one_work+0x171/0x380
 [10220.939374] worker_thread+0x49/0x3f0
 [10220.939375] kthread+0xf8/0x130
 [10220.939377] ? max_active_store+0x80/0x80
 [10220.939378] ? kthread_bind+0x10/0x10
 [10220.939379] ret_from_fork+0x35/0x40
 [10220.939381] SLUB: Unable to allocate memory on node -1, gfp=0xa20(GFP_ATOMIC)

The shortage is handled properly, so the message isn't needed.

Silence the warning by adding the no-warn option (__GFP_NOWARN) to the
slab allocation.

Fixes: 45842abbb292 ("staging/rdma/hfi1: move txreq header code")
Cc:
Reviewed-by: Dennis Dalessandro
Signed-off-by: Mike Marciniszyn
Signed-off-by: Dennis Dalessandro
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/hfi1/verbs_txreq.c | 2 +-
 drivers/infiniband/hw/hfi1/verbs_txreq.h | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.c b/drivers/infiniband/hw/hfi1/verbs_txreq.c
index c4ab2d5b4502..8f766dd3f61c 100644
--- a/drivers/infiniband/hw/hfi1/verbs_txreq.c
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.c
@@ -100,7 +100,7 @@ struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
 	if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
 		struct hfi1_qp_priv *priv;
 
-		tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+		tx = kmem_cache_alloc(dev->verbs_txreq_cache, VERBS_TXREQ_GFP);
 		if (tx)
 			goto out;
 		priv = qp->priv;
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h
index b002e96eb335..bfa6e081cb56 100644
--- a/drivers/infiniband/hw/hfi1/verbs_txreq.h
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h
@@ -72,6 +72,7 @@ struct hfi1_ibdev;
 struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
 				struct rvt_qp *qp);
 
+#define VERBS_TXREQ_GFP (GFP_ATOMIC | __GFP_NOWARN)
 static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
 					    struct rvt_qp *qp)
 	__must_hold(&qp->slock)
@@ -79,7 +80,7 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
 	struct verbs_txreq *tx;
 	struct hfi1_qp_priv *priv = qp->priv;
 
-	tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+	tx = kmem_cache_alloc(dev->verbs_txreq_cache, VERBS_TXREQ_GFP);
 	if (unlikely(!tx)) {
 		/* call slow path to get the lock */
 		tx = __get_txreq(dev, qp);
--
cgit v1.2.3

From 9755f72496664eec70bc804104118b5797b6bf63 Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn
Date: Fri, 14 Jun 2019 12:32:38 -0400
Subject: IB/hfi1: Create inline to get extended headers

This paves the way for another patch that reacts to a flush sdma
completion for RC.
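
Aside (illustrative, not part of the patch): with the helper added to
hfi.h below, a caller such as hfi1_rc_send_complete() collapses to
something like this sketch (assuming the usual hfi1 types in scope):

	/* BTH location now resolved by the shared inline */
	ohdr = hfi1_get_rc_ohdr(opah);
	opcode = ib_bth_get_opcode(ohdr);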

Fixes: 81cd3891f021 ("IB/hfi1: Add support for 16B Management Packets")
Reviewed-by: Dennis Dalessandro
Signed-off-by: Mike Marciniszyn
Signed-off-by: Dennis Dalessandro
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/hfi1/hfi.h | 31 +++++++++++++++++++++++++++++++
 drivers/infiniband/hw/hfi1/rc.c  | 21 +--------------------
 2 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index b458c218842b..fa45350a9a1d 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -539,6 +539,37 @@ static inline void hfi1_16B_set_qpn(struct opa_16b_mgmt *mgmt,
 	mgmt->src_qpn = cpu_to_be32(src_qp & OPA_16B_MGMT_QPN_MASK);
 }
 
+/**
+ * hfi1_get_rc_ohdr - get extended header
+ * @opah - the opa header
+ */
+static inline struct ib_other_headers *
+hfi1_get_rc_ohdr(struct hfi1_opa_header *opah)
+{
+	struct ib_other_headers *ohdr;
+	struct ib_header *hdr = NULL;
+	struct hfi1_16b_header *hdr_16b = NULL;
+
+	/* Find out where the BTH is */
+	if (opah->hdr_type == HFI1_PKT_TYPE_9B) {
+		hdr = &opah->ibh;
+		if (ib_get_lnh(hdr) == HFI1_LRH_BTH)
+			ohdr = &hdr->u.oth;
+		else
+			ohdr = &hdr->u.l.oth;
+	} else {
+		u8 l4;
+
+		hdr_16b = &opah->opah;
+		l4 = hfi1_16B_get_l4(hdr_16b);
+		if (l4 == OPA_16B_L4_IB_LOCAL)
+			ohdr = &hdr_16b->u.oth;
+		else
+			ohdr = &hdr_16b->u.l.oth;
+	}
+	return ohdr;
+}
+
 struct rvt_sge_state;
 
 /*
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index a922edcf23d6..707a4c52d20e 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -1709,8 +1709,6 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
 	struct ib_other_headers *ohdr;
 	struct hfi1_qp_priv *priv = qp->priv;
 	struct rvt_swqe *wqe;
-	struct ib_header *hdr = NULL;
-	struct hfi1_16b_header *hdr_16b = NULL;
 	u32 opcode, head, tail;
 	u32 psn;
 	struct tid_rdma_request *req;
@@ -1719,24 +1717,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
 	if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
 		return;
 
-	/* Find out where the BTH is */
-	if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
-		hdr = &opah->ibh;
-		if (ib_get_lnh(hdr) == HFI1_LRH_BTH)
-			ohdr = &hdr->u.oth;
-		else
-			ohdr = &hdr->u.l.oth;
-	} else {
-		u8 l4;
-
-		hdr_16b = &opah->opah;
-		l4 = hfi1_16B_get_l4(hdr_16b);
-		if (l4 == OPA_16B_L4_IB_LOCAL)
-			ohdr = &hdr_16b->u.oth;
-		else
-			ohdr = &hdr_16b->u.l.oth;
-	}
-
+	ohdr = hfi1_get_rc_ohdr(opah);
 	opcode = ib_bth_get_opcode(ohdr);
 	if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
 	     opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
--
cgit v1.2.3

From 4bb02e9572af1383038d83ad196d7166c515f2ee Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn
Date: Fri, 14 Jun 2019 12:32:44 -0400
Subject: IB/hfi1: Use aborts to trigger RC throttling

SDMA and pio flushes will cause a lot of packets to be transmitted
after a link has gone down, using a lot of CPU to retransmit packets.

Fix for RC QPs by recognizing the flush status and:
- Forcing a timer start
- Putting the QP into a "send one" mode

Fixes: 7724105686e7 ("IB/hfi1: add driver files")
Reviewed-by: Kaike Wan
Signed-off-by: Mike Marciniszyn
Signed-off-by: Dennis Dalessandro
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/hfi1/rc.c    | 30 ++++++++++++++++++++++++++++++
 drivers/infiniband/hw/hfi1/verbs.c | 10 ++++++----
 drivers/infiniband/hw/hfi1/verbs.h |  1 +
 3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 707a4c52d20e..6684ec333525 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -1701,6 +1701,36 @@ static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
 	}
 }
 
+/**
+ * hfi1_rc_verbs_aborted - handle abort status
+ * @qp: the QP
+ * @opah: the opa header
+ *
+ * This code modifies both the ACK bit in BTH[2]
+ * and the s_flags so the QP goes into send one mode.
+ *
+ * This serves to throttle the send engine to only
+ * send a single packet in the likely case that a
+ * link has gone down.
+ */
+void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah)
+{
+	struct ib_other_headers *ohdr = hfi1_get_rc_ohdr(opah);
+	u8 opcode = ib_bth_get_opcode(ohdr);
+	u32 psn;
+
+	/* ignore responses */
+	if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+	     opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
+	    opcode == TID_OP(READ_RESP) ||
+	    opcode == TID_OP(WRITE_RESP))
+		return;
+
+	psn = ib_bth_get_psn(ohdr) | IB_BTH_REQ_ACK;
+	ohdr->bth[2] = cpu_to_be32(psn);
+	qp->s_flags |= RVT_S_SEND_ONE;
+}
+
 /*
  * This should be called with the QP s_lock held and interrupts disabled.
  */
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index a2b26a635baf..a4cbb0f3382b 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -638,6 +638,8 @@ static void verbs_sdma_complete(
 		struct hfi1_opa_header *hdr;
 
 		hdr = &tx->phdr.hdr;
+		if (unlikely(status == SDMA_TXREQ_S_ABORTED))
+			hfi1_rc_verbs_aborted(qp, hdr);
 		hfi1_rc_send_complete(qp, hdr);
 	}
 	spin_unlock(&qp->s_lock);
@@ -1095,15 +1097,15 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 			&ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));
 
 pio_bail:
+	spin_lock_irqsave(&qp->s_lock, flags);
 	if (qp->s_wqe) {
-		spin_lock_irqsave(&qp->s_lock, flags);
 		rvt_send_complete(qp, qp->s_wqe, wc_status);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
 	} else if (qp->ibqp.qp_type == IB_QPT_RC) {
-		spin_lock_irqsave(&qp->s_lock, flags);
+		if (unlikely(wc_status == IB_WC_GENERAL_ERR))
+			hfi1_rc_verbs_aborted(qp, &ps->s_txreq->phdr.hdr);
 		hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
 	}
+	spin_unlock_irqrestore(&qp->s_lock, flags);
 
 	ret = 0;
 
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index 7ecb8ed4a1d9..ae9582ddbc8f 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -416,6 +416,7 @@ void hfi1_rc_hdrerr(
 
 u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr);
 
+void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah);
 void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah);
 
 void hfi1_ud_rcv(struct hfi1_packet *packet);
--
cgit v1.2.3

From f972775b1cc0441ae22c9f8d06dd16b118463632 Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn
Date: Fri, 14 Jun 2019 12:32:50 -0400
Subject: IB/hfi1: Wakeup QPs orphaned on wait list after flush

Once an SDMA
engine is taken down due to a link failure, any waiting QPs that do not
have outstanding descriptors in the ring will stay on the dmawait list
as long as the port is down. Since there is no timer running, they will
stay there for a long time.

The fix is to wake up all iowaits linked to dmawait. The send engine
will build and post packets that get flushed back.

Fixes: 7724105686e7 ("IB/hfi1: add driver files")
Reviewed-by: Kaike Wan
Signed-off-by: Mike Marciniszyn
Signed-off-by: Dennis Dalessandro
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/hfi1/sdma.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 70828de7436b..28b66bd70b74 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -405,6 +405,7 @@ static void sdma_flush(struct sdma_engine *sde)
 	struct sdma_txreq *txp, *txp_next;
 	LIST_HEAD(flushlist);
 	unsigned long flags;
+	uint seq;
 
 	/* flush from head to tail */
 	sdma_flush_descq(sde);
@@ -415,6 +416,22 @@ static void sdma_flush(struct sdma_engine *sde)
 	/* flush from flush list */
 	list_for_each_entry_safe(txp, txp_next, &flushlist, list)
 		complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
+	/* wakeup QPs orphaned on the dmawait list */
+	do {
+		struct iowait *w, *nw;
+
+		seq = read_seqbegin(&sde->waitlock);
+		if (!list_empty(&sde->dmawait)) {
+			write_seqlock(&sde->waitlock);
+			list_for_each_entry_safe(w, nw, &sde->dmawait, list) {
+				if (w->wakeup) {
+					w->wakeup(w, SDMA_AVAIL_REASON);
+					list_del_init(&w->list);
+				}
+			}
+			write_sequnlock(&sde->waitlock);
+		}
+	} while (read_seqretry(&sde->waitlock, seq));
 }
 
 /*
--
cgit v1.2.3

From 099a884ba4c00145cef283d36e050726311c2e95 Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn
Date: Fri, 14 Jun 2019 12:33:00 -0400
Subject: IB/hfi1: Handle wakeup of orphaned QPs for pio

Once a send context is taken down due to a link failure, any QPs
waiting for pio credits will stay on the waitlist indefinitely.

Fix by waking up all QPs linked to the piowait list.

Fixes: 7724105686e7 ("IB/hfi1: add driver files")
Reviewed-by: Dennis Dalessandro
Signed-off-by: Mike Marciniszyn
Signed-off-by: Dennis Dalessandro
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/hfi1/pio.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 16ba9d52e1b9..66cf2b17e519 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -952,6 +952,22 @@ void sc_disable(struct send_context *sc)
 		}
 	}
 	spin_unlock(&sc->release_lock);
+
+	write_seqlock(&sc->waitlock);
+	while (!list_empty(&sc->piowait)) {
+		struct iowait *wait;
+		struct rvt_qp *qp;
+		struct hfi1_qp_priv *priv;
+
+		wait = list_first_entry(&sc->piowait, struct iowait, list);
+		qp = iowait_to_qp(wait);
+		priv = qp->priv;
+		list_del_init(&priv->s_iowait.list);
+		priv->s_iowait.lock = NULL;
+		hfi1_qp_wakeup(qp, RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
+	}
+	write_sequnlock(&sc->waitlock);
+
 	spin_unlock_irq(&sc->alloc_lock);
 }
 
--
cgit v1.2.3

From 942a899335707fc9cfc97cb382a60734b2ff4e03 Mon Sep 17 00:00:00 2001
From: Mike Marciniszyn
Date: Fri, 14 Jun 2019 12:33:06 -0400
Subject: IB/hfi1: Handle port down properly in pio

The call to sc_buffer_alloc currently returns NULL (no buffer) or a
buffer descriptor. There is a third case when the port is down.
Currently that also returns NULL, which prevents the caller from
properly handling the sc_buffer_alloc() failure.

A verbs code link test after the call is racy, so the indication needs
to come from the state check inside the allocation routine to be valid.

Fix by encoding the -ECOMM failure like SDMA. IS_ERR_OR_NULL() tests
are added at all call sites. For verbs send, this needs to treat any
error by returning a completion without any MMIO copy.

Fixes: 7724105686e7 ("IB/hfi1: add driver files")
Reviewed-by: Dennis Dalessandro
Signed-off-by: Mike Marciniszyn
Signed-off-by: Dennis Dalessandro
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/hfi1/pio.c   | 5 +++--
 drivers/infiniband/hw/hfi1/rc.c    | 2 +-
 drivers/infiniband/hw/hfi1/ud.c    | 4 ++--
 drivers/infiniband/hw/hfi1/verbs.c | 4 ++--
 4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 66cf2b17e519..4e5c2d1b8cfa 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -1443,7 +1443,8 @@ void sc_stop(struct send_context *sc, int flag)
  * @cb: optional callback to call when the buffer is finished sending
  * @arg: argument for cb
  *
- * Return a pointer to a PIO buffer if successful, NULL if not enough room.
+ * Return a pointer to a PIO buffer, NULL if not enough room, -ECOMM
+ * when link is down.
  */
 struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
 				pio_release_cb cb, void *arg)
@@ -1459,7 +1460,7 @@ struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
 	spin_lock_irqsave(&sc->alloc_lock, flags);
 	if (!(sc->flags & SCF_ENABLED)) {
 		spin_unlock_irqrestore(&sc->alloc_lock, flags);
-		goto done;
+		return ERR_PTR(-ECOMM);
 	}
 
 retry:
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 6684ec333525..7c8cfb149da0 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -1432,7 +1432,7 @@ void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn)
 	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps,
 			 sc_to_vlt(ppd->dd, sc5), plen);
 	pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL);
-	if (!pbuf) {
+	if (IS_ERR_OR_NULL(pbuf)) {
 		/*
 		 * We have no room to send at the moment.  Pass
 		 * responsibility for sending the ACK to the send engine
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
index f88ad425664a..4cb0fce5c096 100644
--- a/drivers/infiniband/hw/hfi1/ud.c
+++ b/drivers/infiniband/hw/hfi1/ud.c
@@ -683,7 +683,7 @@ void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp,
 	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
 	if (ctxt) {
 		pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
-		if (pbuf) {
+		if (!IS_ERR_OR_NULL(pbuf)) {
 			trace_pio_output_ibhdr(ppd->dd, &hdr, sc5);
 			ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
 						 &hdr, hwords);
@@ -738,7 +738,7 @@ void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn,
 	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
 	if (ctxt) {
 		pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
-		if (pbuf) {
+		if (!IS_ERR_OR_NULL(pbuf)) {
 			trace_pio_output_ibhdr(ppd->dd, &hdr, sc5);
 			ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
 						 &hdr, hwords);
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index a4cbb0f3382b..bad3229bad37 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -1039,10 +1039,10 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
 	if (cb)
 		iowait_pio_inc(&priv->s_iowait);
 	pbuf = sc_buffer_alloc(sc, plen, cb, qp);
-	if (unlikely(!pbuf)) {
+	if (unlikely(IS_ERR_OR_NULL(pbuf))) {
 		if (cb)
 			verbs_pio_complete(qp, 0);
-		if (ppd->host_link_state != HLS_UP_ACTIVE) {
+		if (IS_ERR(pbuf)) {
 			/*
 			 * If we have filled the PIO buffers to capacity and are
 			 * not in an active state this request is not going to
--
cgit v1.2.3

From 529254340c7f16d59b928e36568597c603bae917 Mon Sep 17 00:00:00 2001
From: Gal Pressman
Date: Wed, 12 Jun 2019 10:28:41 +0300
Subject: RDMA/efa: Fix success return value in case of error

Existing code would mistakenly return success in case of error instead
of a proper return value.
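
Aside (illustrative, not EFA code; the helper names here are
hypothetical): the bug class reduces to logging a failure and then
falling through to a success return. A minimal sketch of the
before/after shape:

	#include <stdio.h>

	struct thing { int fail; };

	static int send_destroy_cmd(struct thing *t) { return t->fail ? -5 : 0; }
	#define log_err(...) fprintf(stderr, __VA_ARGS__)

	/* Before: err is logged, then dropped -- the caller sees success. */
	static int destroy_thing_buggy(struct thing *t)
	{
		int err = send_destroy_cmd(t);

		if (err)
			log_err("destroy failed: %d\n", err);
		return 0;		/* failure silently swallowed */
	}

	/* After: propagate err once it has been logged. */
	static int destroy_thing_fixed(struct thing *t)
	{
		int err = send_destroy_cmd(t);

		if (err) {
			log_err("destroy failed: %d\n", err);
			return err;
		}
		return 0;
	}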

Fixes: e9c6c5373088 ("RDMA/efa: Add common command handlers")
Reviewed-by: Firas JahJah
Reviewed-by: Yossi Leybovich
Signed-off-by: Gal Pressman
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/efa/efa_com_cmd.c | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c
index 14227725521c..c0016648804c 100644
--- a/drivers/infiniband/hw/efa/efa_com_cmd.c
+++ b/drivers/infiniband/hw/efa/efa_com_cmd.c
@@ -139,9 +139,11 @@ int efa_com_destroy_qp(struct efa_com_dev *edev,
 			       sizeof(qp_cmd),
 			       (struct efa_admin_acq_entry *)&cmd_completion,
 			       sizeof(cmd_completion));
-	if (err)
+	if (err) {
 		ibdev_err(edev->efa_dev, "Failed to destroy qp-%u [%d]\n",
 			  qp_cmd.qp_handle, err);
+		return err;
+	}
 
 	return 0;
 }
@@ -199,9 +201,11 @@ int efa_com_destroy_cq(struct efa_com_dev *edev,
 			       (struct efa_admin_acq_entry *)&destroy_resp,
 			       sizeof(destroy_resp));
 
-	if (err)
+	if (err) {
 		ibdev_err(edev->efa_dev, "Failed to destroy CQ-%u [%d]\n",
 			  params->cq_idx, err);
+		return err;
+	}
 
 	return 0;
 }
@@ -273,10 +277,12 @@ int efa_com_dereg_mr(struct efa_com_dev *edev,
 			       sizeof(mr_cmd),
 			       (struct efa_admin_acq_entry *)&cmd_completion,
 			       sizeof(cmd_completion));
-	if (err)
+	if (err) {
 		ibdev_err(edev->efa_dev,
 			  "Failed to de-register mr(lkey-%u) [%d]\n",
 			  mr_cmd.l_key, err);
+		return err;
+	}
 
 	return 0;
 }
@@ -327,9 +333,11 @@ int efa_com_destroy_ah(struct efa_com_dev *edev,
 			       sizeof(ah_cmd),
 			       (struct efa_admin_acq_entry *)&cmd_completion,
 			       sizeof(cmd_completion));
-	if (err)
+	if (err) {
 		ibdev_err(edev->efa_dev, "Failed to destroy ah-%d pd-%d [%d]\n",
 			  ah_cmd.ah, ah_cmd.pd, err);
+		return err;
+	}
 
 	return 0;
 }
@@ -387,10 +395,12 @@ static int efa_com_get_feature_ex(struct efa_com_dev *edev,
 			       get_resp,
 			       sizeof(*get_resp));
 
-	if (err)
+	if (err) {
 		ibdev_err(edev->efa_dev,
 			  "Failed to submit get_feature command %d [%d]\n",
 			  feature_id, err);
+		return err;
+	}
 
 	return 0;
 }
@@ -534,10 +544,12 @@ static int efa_com_set_feature_ex(struct efa_com_dev *edev,
 			       (struct efa_admin_acq_entry *)set_resp,
 			       sizeof(*set_resp));
 
-	if (err)
+	if (err) {
 		ibdev_err(edev->efa_dev,
 			  "Failed to submit set_feature command %d error: %d\n",
 			  feature_id, err);
+		return err;
+	}
 
 	return 0;
 }
--
cgit v1.2.3

From 7a5834e456f7fb3eca9b63af2a6bc7f460ae482f Mon Sep 17 00:00:00 2001
From: Gal Pressman
Date: Tue, 18 Jun 2019 16:07:32 +0300
Subject: RDMA/efa: Handle mmap insertions overflow

When inserting a new mmap entry to the xarray we should check for
'mmap_page' overflow, as it is limited to 32 bits.
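
Aside (illustrative, not part of the patch): the kernel's
check_add_overflow() wraps the compiler's add-overflow builtin. A
userspace sketch of the same guard on a 32-bit page counter:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t page = 0xffffff00u;	/* stand-in for mmap_xa_page */
		uint32_t npages = 0x200u;	/* stand-in for length >> PAGE_SHIFT */
		uint32_t next;

		/* Reports whether the 32-bit sum wrapped; on overflow the
		 * insertion should be rejected rather than reusing keys. */
		if (__builtin_add_overflow(page, npages, &next))
			puts("would overflow: reject the insertion");
		else
			printf("next page: %u\n", next);
		return 0;
	}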

Fixes: 40909f664d27 ("RDMA/efa: Add EFA verbs implementation")
Signed-off-by: Gal Pressman
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/efa/efa_verbs.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index 0fea5d63fdbe..fb6115244d4c 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -204,6 +204,7 @@ static u64 mmap_entry_insert(struct efa_dev *dev, struct efa_ucontext *ucontext,
 			     void *obj, u64 address, u64 length, u8 mmap_flag)
 {
 	struct efa_mmap_entry *entry;
+	u32 next_mmap_page;
 	int err;
 
 	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -216,15 +217,19 @@ static u64 mmap_entry_insert(struct efa_dev *dev, struct efa_ucontext *ucontext,
 	entry->mmap_flag = mmap_flag;
 
 	xa_lock(&ucontext->mmap_xa);
+	if (check_add_overflow(ucontext->mmap_xa_page,
+			       (u32)(length >> PAGE_SHIFT),
+			       &next_mmap_page))
+		goto err_unlock;
+
 	entry->mmap_page = ucontext->mmap_xa_page;
-	ucontext->mmap_xa_page += DIV_ROUND_UP(length, PAGE_SIZE);
+	ucontext->mmap_xa_page = next_mmap_page;
 	err = __xa_insert(&ucontext->mmap_xa, entry->mmap_page, entry,
 			  GFP_KERNEL);
+	if (err)
+		goto err_unlock;
+
 	xa_unlock(&ucontext->mmap_xa);
-	if (err){
-		kfree(entry);
-		return EFA_MMAP_INVALID;
-	}
 
 	ibdev_dbg(
 		&dev->ibdev,
@@ -232,6 +237,12 @@ static u64 mmap_entry_insert(struct efa_dev *dev, struct efa_ucontext *ucontext,
 		entry->obj, entry->address, entry->length, get_mmap_key(entry));
 
 	return get_mmap_key(entry);
+
+err_unlock:
+	xa_unlock(&ucontext->mmap_xa);
+	kfree(entry);
+	return EFA_MMAP_INVALID;
+
 }
 
 int efa_query_device(struct ib_device *ibdev,
--
cgit v1.2.3
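
Closing aside (illustrative, not from either EFA patch): the error path
above uses the common goto-unwind shape, where every failure exit
funnels through one label that unlocks and frees. A compilable toy with
hypothetical names:

	#include <pthread.h>
	#include <stdint.h>

	#define INVALID_KEY UINT32_MAX

	struct ctx {
		pthread_mutex_t lock;
		uint32_t next_key;	/* 32-bit counter, like mmap_xa_page */
	};

	static uint32_t insert_locked(struct ctx *c, void **slot, void *obj,
				      uint32_t npages)
	{
		uint32_t key;

		pthread_mutex_lock(&c->lock);
		if (c->next_key > UINT32_MAX - npages)
			goto err_unlock;	/* reject instead of wrapping */
		key = c->next_key;
		c->next_key += npages;
		*slot = obj;			/* publish under the lock */
		pthread_mutex_unlock(&c->lock);
		return key;

	err_unlock:
		pthread_mutex_unlock(&c->lock);	/* one consistent failure exit */
		return INVALID_KEY;
	}

	int main(void)
	{
		struct ctx c = { PTHREAD_MUTEX_INITIALIZER, 0 };
		void *slot = NULL;
		int obj = 42;

		return insert_locked(&c, &slot, &obj, 16) == INVALID_KEY;
	}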