From 2ef5612391f0a7a631c42a8afc867095b49a1992 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 4 Jun 2020 15:39:02 +0100 Subject: RDMA/mlx5: Remove duplicated assignment to resp.response_length The assignment to resp.response_length is never read since it is being updated again on the next statement. The assignment is redundant so removed it. Fixes: a645a89d9a78 ("RDMA/mlx5: Return ECE DC support") Link: https://lore.kernel.org/r/20200604143902.56021-1-colin.king@canonical.com Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Acked-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 81bf6b975e0e..d61ca85033de 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4162,8 +4162,6 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (udata->outlen < min_resp_len) return -EINVAL; - resp.response_length = min_resp_len; - /* * If we don't have enough space for the ECE options, * simply indicate it with resp.response_length. -- cgit v1.2.3 From 4f5747cf8e5947479a27a3597829e45d6d8d73e0 Mon Sep 17 00:00:00 2001 From: Tom Seewald Date: Thu, 4 Jun 2020 21:30:12 -0500 Subject: RDMA/mlx5: Fix -Wformat warning in check_ucmd_data() Variables of type size_t should use %zu rather than %lu [1]. The variables "inlen", "ucmd", "last", and "size" are all size_t, so use the correct format specifiers. [1] https://www.kernel.org/doc/html/latest/core-api/printk-formats.html Fixes: e383085c2425 ("RDMA/mlx5: Set ECE options during QP create") Link: https://lore.kernel.org/r/20200605023012.9527-1-tseewald@gmail.com Signed-off-by: Tom Seewald Acked-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index d61ca85033de..dbe82cdb8d2c 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2907,7 +2907,7 @@ static int check_ucmd_data(struct mlx5_ib_dev *dev, if (!ret) mlx5_ib_dbg( dev, - "udata is not cleared, inlen = %lu, ucmd = %lu, last = %lu, size = %lu\n", + "udata is not cleared, inlen = %zu, ucmd = %zu, last = %zu, size = %zu\n", udata->inlen, params->ucmd_size, last, size); return ret ? 0 : -EINVAL; } -- cgit v1.2.3 From 0dc63bbee0fa6da20283ae6e22e99c6fde25ed8e Mon Sep 17 00:00:00 2001 From: Kieran Bingham Date: Tue, 9 Jun 2020 13:45:55 +0100 Subject: RDMA/hfi1: Fix trivial mis-spelling of 'descriptor' The word 'descriptor' is misspelled throughout the tree. Fix it up accordingly: decriptors -> descriptors Link: https://lore.kernel.org/r/20200609124610.3445662-3-kieran.bingham+renesas@ideasonboard.com Link: https://lore.kernel.org/r/20200609124610.3445662-12-kieran.bingham+renesas@ideasonboard.com Signed-off-by: Kieran Bingham Acked-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/iowait.h | 2 +- drivers/infiniband/hw/hfi1/ipoib_tx.c | 2 +- drivers/infiniband/hw/hfi1/verbs_txreq.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index 07847cb72169..d580aa17ae37 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -399,7 +399,7 @@ static inline void iowait_get_priority(struct iowait *w) * @wait_head: the wait queue * * This function is called to insert an iowait struct into a - * wait queue after a resource (eg, sdma decriptor or pio + * wait queue after a resource (eg, sdma descriptor or pio * buffer) is run out. */ static inline void iowait_queue(bool pkts_sent, struct iowait *w, diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index 883cb9d48022..175290c56db9 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -364,7 +364,7 @@ static struct ipoib_txreq *hfi1_ipoib_send_dma_common(struct net_device *dev, if (unlikely(!tx)) return ERR_PTR(-ENOMEM); - /* so that we can test if the sdma decriptors are there */ + /* so that we can test if the sdma descriptors are there */ tx->txreq.num_desc = 0; tx->priv = priv; tx->txq = txp->txq; diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h index bfa6e081cb56..d2d526c5a756 100644 --- a/drivers/infiniband/hw/hfi1/verbs_txreq.h +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -91,7 +91,7 @@ static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev, tx->mr = NULL; tx->sde = priv->s_sde; tx->psc = priv->s_sendcontext; - /* so that we can test if the sdma decriptors are there */ + /* so that we can test if the sdma descriptors are there */ tx->txreq.num_desc = 0; /* Set the header type */ tx->phdr.hdr.hdr_type = priv->hdr_type; -- cgit v1.2.3 From 6769b275a313c76ddcd7d94c632032326db5f759 Mon Sep 17 00:00:00 2001 From: Tom Seewald Date: Wed, 10 Jun 2020 12:47:17 -0500 Subject: RDMA/siw: Fix pointer-to-int-cast warning in siw_rx_pbl() The variable buf_addr is type dma_addr_t, which may not be the same size as a pointer. To ensure it is the correct size, cast to a uintptr_t. Fixes: c536277e0db1 ("RDMA/siw: Fix 64/32bit pointer inconsistency") Link: https://lore.kernel.org/r/20200610174717.15932-1-tseewald@gmail.com Signed-off-by: Tom Seewald Reviewed-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw_qp_rx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c index 650520244ed0..7271d705f4b0 100644 --- a/drivers/infiniband/sw/siw/siw_qp_rx.c +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -139,7 +139,8 @@ static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx, break; bytes = min(bytes, len); - if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) { + if (siw_rx_kva(srx, (void *)(uintptr_t)buf_addr, bytes) == + bytes) { copied += bytes; offset += bytes; len -= bytes; -- cgit v1.2.3 From 1ea7c546b8b3f27bf7da673c265b09c8f79d11bc Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Jun 2020 16:00:45 +0300 Subject: RDMA/core: Annotate CMA unlock helper routine Fix the following sparse error by adding annotation to cm_queue_work_unlock() that it releases cm_id_priv->lock lock. drivers/infiniband/core/cm.c:936:24: warning: context imbalance in 'cm_queue_work_unlock' - unexpected unlock Fixes: e83f195aa45c ("RDMA/cm: Pull duplicated code into cm_queue_work_unlock()") Link: https://lore.kernel.org/r/20200611130045.1994026-1-leon@kernel.org Reported-by: kernel test robot Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 9ce787e37e22..0d1377232933 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -918,6 +918,7 @@ static void cm_free_work(struct cm_work *work) static void cm_queue_work_unlock(struct cm_id_private *cm_id_priv, struct cm_work *work) + __releases(&cm_id_priv->lock) { bool immediate; -- cgit v1.2.3 From 90a239ee25fa3a483facec3de7c144361a3d3a51 Mon Sep 17 00:00:00 2001 From: Aditya Pakki Date: Sat, 13 Jun 2020 23:11:48 -0500 Subject: RDMA/rvt: Fix potential memory leak caused by rvt_alloc_rq In case of failure of alloc_ud_wq_attr(), the memory allocated by rvt_alloc_rq() is not freed. Fix it by calling rvt_free_rq() using the existing clean-up code. Fixes: d310c4bf8aea ("IB/{rdmavt, hfi1, qib}: Remove AH refcount for UD QPs") Link: https://lore.kernel.org/r/20200614041148.131983-1-pakki001@umn.edu Signed-off-by: Aditya Pakki Acked-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/qp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 511b72809e14..7db35dd6ad74 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1204,7 +1204,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, err = alloc_ud_wq_attr(qp, rdi->dparms.node); if (err) { ret = (ERR_PTR(err)); - goto bail_driver_priv; + goto bail_rq_rvt; } if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE) @@ -1314,9 +1314,11 @@ bail_qpn: rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num); bail_rq_wq: - rvt_free_rq(&qp->r_rq); free_ud_wq_attr(qp); +bail_rq_rvt: + rvt_free_rq(&qp->r_rq); + bail_driver_priv: rdi->driver_f.qp_priv_free(rdi, qp); -- cgit v1.2.3 From 0133654d8eb8607eacc96badfe49bf992155f4cb Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 14 Jun 2020 13:35:34 +0300 Subject: RDMA/efa: Set maximum pkeys device attribute The max_pkeys device attribute was not set in query device verb, set it to one in order to account for the default pkey (0xffff). This information is exposed to userspace and can cause malfunction Fixes: 40909f664d27 ("RDMA/efa: Add EFA verbs implementation") Link: https://lore.kernel.org/r/20200614103534.88060-1-galpress@amazon.com Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_verbs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 08313f7c73bc..7dd082441333 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -212,6 +212,7 @@ int efa_query_device(struct ib_device *ibdev, props->max_send_sge = dev_attr->max_sq_sge; props->max_recv_sge = dev_attr->max_rq_sge; props->max_sge_rd = dev_attr->max_wr_rdma_sge; + props->max_pkeys = 1; if (udata && udata->outlen) { resp.max_sq_sge = dev_attr->max_sq_sge; -- cgit v1.2.3 From 0dfbd5ecf28cbcb81674c49d34ee97366db1be44 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Tue, 16 Jun 2020 12:34:08 +0300 Subject: RDMA/qedr: Fix KASAN: use-after-free in ucma_event_handler+0x532 Private data passed to iwarp_cm_handler is copied for connection request / response, but ignored otherwise. If junk is passed, it is stored in the event and used later in the event processing. The driver passes an old junk pointer during connection close which leads to a use-after-free on event processing. Set private data to NULL for events that don 't have private data. BUG: KASAN: use-after-free in ucma_event_handler+0x532/0x560 [rdma_ucm] kernel: Read of size 4 at addr ffff8886caa71200 by task kworker/u128:1/5250 kernel: kernel: Workqueue: iw_cm_wq cm_work_handler [iw_cm] kernel: Call Trace: kernel: dump_stack+0x8c/0xc0 kernel: print_address_description.constprop.0+0x1b/0x210 kernel: ? ucma_event_handler+0x532/0x560 [rdma_ucm] kernel: ? ucma_event_handler+0x532/0x560 [rdma_ucm] kernel: __kasan_report.cold+0x1a/0x33 kernel: ? ucma_event_handler+0x532/0x560 [rdma_ucm] kernel: kasan_report+0xe/0x20 kernel: check_memory_region+0x130/0x1a0 kernel: memcpy+0x20/0x50 kernel: ucma_event_handler+0x532/0x560 [rdma_ucm] kernel: ? __rpc_execute+0x608/0x620 [sunrpc] kernel: cma_iw_handler+0x212/0x330 [rdma_cm] kernel: ? iw_conn_req_handler+0x6e0/0x6e0 [rdma_cm] kernel: ? enqueue_timer+0x86/0x140 kernel: ? _raw_write_lock_irq+0xd0/0xd0 kernel: cm_work_handler+0xd3d/0x1070 [iw_cm] Fixes: e411e0587e0d ("RDMA/qedr: Add iWARP connection management functions") Link: https://lore.kernel.org/r/20200616093408.17827-1-michal.kalderon@marvell.com Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/qedr_iw_cm.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c index 792eecd206b6..97fc7dd353b0 100644 --- a/drivers/infiniband/hw/qedr/qedr_iw_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c @@ -150,8 +150,17 @@ qedr_iw_issue_event(void *context, if (params->cm_info) { event.ird = params->cm_info->ird; event.ord = params->cm_info->ord; - event.private_data_len = params->cm_info->private_data_len; - event.private_data = (void *)params->cm_info->private_data; + /* Only connect_request and reply have valid private data + * the rest of the events this may be left overs from + * connection establishment. CONNECT_REQUEST is issued via + * qedr_iw_mpa_request + */ + if (event_type == IW_CM_EVENT_CONNECT_REPLY) { + event.private_data_len = + params->cm_info->private_data_len; + event.private_data = + (void *)params->cm_info->private_data; + } } if (ep->cm_id) -- cgit v1.2.3 From 730c8912484186d4623d0c76509066d285c3a755 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Tue, 16 Jun 2020 13:43:04 +0300 Subject: RDMA/cma: Protect bind_list and listen_list while finding matching cm id The bind_list and listen_list must be accessed under a lock, add the missing locking around the access in cm_ib_id_from_event() In addition add lockdep asserts to make it clearer what the locking semantic is here. general protection fault: 0000 [#1] SMP NOPTI CPU: 226 PID: 126135 Comm: kworker/226:1 Tainted: G OE 4.12.14-150.47-default #1 SLE15 Hardware name: Cray Inc. Windom/Windom, BIOS 0.8.7 01-10-2020 Workqueue: ib_cm cm_work_handler [ib_cm] task: ffff9c5a60a1d2c0 task.stack: ffffc1d91f554000 RIP: 0010:cma_ib_req_handler+0x3f1/0x11b0 [rdma_cm] RSP: 0018:ffffc1d91f557b40 EFLAGS: 00010286 RAX: deacffffffffff30 RBX: 0000000000000001 RCX: ffff9c2af5bb6000 RDX: 00000000000000a9 RSI: ffff9c5aa4ed2f10 RDI: ffffc1d91f557b08 RBP: ffffc1d91f557d90 R08: ffff9c340cc80000 R09: ffff9c2c0f901900 R10: 0000000000000000 R11: 0000000000000001 R12: deacffffffffff30 R13: ffff9c5a48aeec00 R14: ffffc1d91f557c30 R15: ffff9c5c2eea3688 FS: 0000000000000000(0000) GS:ffff9c5c2fa80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00002b5cc03fa320 CR3: 0000003f8500a000 CR4: 00000000003406e0 Call Trace: ? rdma_addr_cancel+0xa0/0xa0 [ib_core] ? cm_process_work+0x28/0x140 [ib_cm] cm_process_work+0x28/0x140 [ib_cm] ? cm_get_bth_pkey.isra.44+0x34/0xa0 [ib_cm] cm_work_handler+0xa06/0x1a6f [ib_cm] ? __switch_to_asm+0x34/0x70 ? __switch_to_asm+0x34/0x70 ? __switch_to_asm+0x40/0x70 ? __switch_to_asm+0x34/0x70 ? __switch_to_asm+0x40/0x70 ? __switch_to_asm+0x34/0x70 ? __switch_to_asm+0x40/0x70 ? __switch_to+0x7c/0x4b0 ? __switch_to_asm+0x40/0x70 ? __switch_to_asm+0x34/0x70 process_one_work+0x1da/0x400 worker_thread+0x2b/0x3f0 ? process_one_work+0x400/0x400 kthread+0x118/0x140 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x22/0x40 Code: 00 66 83 f8 02 0f 84 ca 05 00 00 49 8b 84 24 d0 01 00 00 48 85 c0 0f 84 68 07 00 00 48 2d d0 01 00 00 49 89 c4 0f 84 59 07 00 00 <41> 0f b7 44 24 20 49 8b 77 50 66 83 f8 0a 75 9e 49 8b 7c 24 28 Fixes: 4c21b5bcef73 ("IB/cma: Add net_dev and private data checks to RDMA CM") Link: https://lore.kernel.org/r/20200616104304.2426081-1-leon@kernel.org Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 3d7cc9f0f3d4..c30cf5307ce3 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1624,6 +1624,8 @@ static struct rdma_id_private *cma_find_listener( { struct rdma_id_private *id_priv, *id_priv_dev; + lockdep_assert_held(&lock); + if (!bind_list) return ERR_PTR(-EINVAL); @@ -1670,6 +1672,7 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, } } + mutex_lock(&lock); /* * Net namespace might be getting deleted while route lookup, * cm_id lookup is in progress. Therefore, perform netdevice @@ -1711,6 +1714,7 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); + mutex_unlock(&lock); if (IS_ERR(id_priv) && *net_dev) { dev_put(*net_dev); *net_dev = NULL; @@ -2492,6 +2496,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, struct net *net = id_priv->id.route.addr.dev_addr.net; int ret; + lockdep_assert_held(&lock); + if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1)) return; @@ -3342,6 +3348,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list, u64 sid, mask; __be16 port; + lockdep_assert_held(&lock); + addr = cma_src_addr(id_priv); port = htons(bind_list->port); @@ -3370,6 +3378,8 @@ static int cma_alloc_port(enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list; int ret; + lockdep_assert_held(&lock); + bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL); if (!bind_list) return -ENOMEM; @@ -3396,6 +3406,8 @@ static int cma_port_is_unique(struct rdma_bind_list *bind_list, struct sockaddr *saddr = cma_src_addr(id_priv); __be16 dport = cma_port(daddr); + lockdep_assert_held(&lock); + hlist_for_each_entry(cur_id, &bind_list->owners, node) { struct sockaddr *cur_daddr = cma_dst_addr(cur_id); struct sockaddr *cur_saddr = cma_src_addr(cur_id); @@ -3435,6 +3447,8 @@ static int cma_alloc_any_port(enum rdma_ucm_port_space ps, unsigned int rover; struct net *net = id_priv->id.route.addr.dev_addr.net; + lockdep_assert_held(&lock); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = prandom_u32() % remaining + low; @@ -3482,6 +3496,8 @@ static int cma_check_port(struct rdma_bind_list *bind_list, struct rdma_id_private *cur_id; struct sockaddr *addr, *cur_addr; + lockdep_assert_held(&lock); + addr = cma_src_addr(id_priv); hlist_for_each_entry(cur_id, &bind_list->owners, node) { if (id_priv == cur_id) @@ -3512,6 +3528,8 @@ static int cma_use_port(enum rdma_ucm_port_space ps, unsigned short snum; int ret; + lockdep_assert_held(&lock); + snum = ntohs(cma_port(cma_src_addr(id_priv))); if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) return -EACCES; -- cgit v1.2.3 From ab183d460daac6292cb0cfd989d88b37b2437844 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 16 Jun 2020 13:45:36 +0300 Subject: RDMA/mlx5: Add missed RST2INIT and INIT2INIT steps during ECE handshake Missed steps during ECE handshake left userspace application with less options for the ECE handshake. Pass ECE options in the additional transitions. Fixes: 50aec2c3135e ("RDMA/mlx5: Return ECE data after modify QP") Link: https://lore.kernel.org/r/20200616104536.2426384-1-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qpc.c | 8 ++++++++ include/linux/mlx5/mlx5_ifc.h | 10 ++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index c19d91d6dce8..7c3968ef9cd1 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -346,6 +346,9 @@ static int get_ece_from_mbox(void *out, u16 opcode) int ece = 0; switch (opcode) { + case MLX5_CMD_OP_INIT2INIT_QP: + ece = MLX5_GET(init2init_qp_out, out, ece); + break; case MLX5_CMD_OP_INIT2RTR_QP: ece = MLX5_GET(init2rtr_qp_out, out, ece); break; @@ -355,6 +358,9 @@ static int get_ece_from_mbox(void *out, u16 opcode) case MLX5_CMD_OP_RTS2RTS_QP: ece = MLX5_GET(rts2rts_qp_out, out, ece); break; + case MLX5_CMD_OP_RST2INIT_QP: + ece = MLX5_GET(rst2init_qp_out, out, ece); + break; default: break; } @@ -406,6 +412,7 @@ static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, return -ENOMEM; MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn, opt_param_mask, qpc, uid); + MLX5_SET(rst2init_qp_in, mbox->in, ece, ece); break; case MLX5_CMD_OP_INIT2RTR_QP: if (MBOX_ALLOC(mbox, init2rtr_qp)) @@ -439,6 +446,7 @@ static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, return -ENOMEM; MOD_QP_IN_SET_QPC(init2init_qp, mbox->in, opcode, qpn, opt_param_mask, qpc, uid); + MLX5_SET(init2init_qp_in, mbox->in, ece, ece); break; default: return -EINVAL; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 116bd9bb347f..ca1887dd0423 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -4283,7 +4283,8 @@ struct mlx5_ifc_rst2init_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_rst2init_qp_in_bits { @@ -4300,7 +4301,7 @@ struct mlx5_ifc_rst2init_qp_in_bits { u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; @@ -6619,7 +6620,8 @@ struct mlx5_ifc_init2init_qp_out_bits { u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_init2init_qp_in_bits { @@ -6636,7 +6638,7 @@ struct mlx5_ifc_init2init_qp_in_bits { u8 opt_param_mask[0x20]; - u8 reserved_at_a0[0x20]; + u8 ece[0x20]; struct mlx5_ifc_qpc_bits qpc; -- cgit v1.2.3 From 98a6151907cb5512eb6ba8b90644e3ace2d2fc46 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Tue, 16 Jun 2020 21:37:09 +0800 Subject: RDMA/hns: Fix a calltrace when registering MR from userspace ibmr.device is assigned after MR is successfully registered, but both write_mtpt() and frmr_write_mtpt() accesses it during the mr registration process, which may cause the following error when trying to register MR in userspace and pbl_hop_num is set to 0. pc : hns_roce_mtr_find+0xa0/0x200 [hns_roce] lr : set_mtpt_pbl+0x54/0x118 [hns_roce_hw_v2] sp : ffff00023e73ba20 x29: ffff00023e73ba20 x28: ffff00023e73bad8 x27: 0000000000000000 x26: 0000000000000000 x25: 0000000000000002 x24: 0000000000000000 x23: ffff00023e73bad0 x22: 0000000000000000 x21: ffff0000094d9000 x20: 0000000000000000 x19: ffff8020a6bdb2c0 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000000 x13: 0140000000000000 x12: 0040000000000041 x11: ffff000240000000 x10: 0000000000001000 x9 : 0000000000000000 x8 : ffff802fb7558480 x7 : ffff802fb7558480 x6 : 000000000003483d x5 : ffff00023e73bad0 x4 : 0000000000000002 x3 : ffff00023e73bad8 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0000094d9708 Call trace: hns_roce_mtr_find+0xa0/0x200 [hns_roce] set_mtpt_pbl+0x54/0x118 [hns_roce_hw_v2] hns_roce_v2_write_mtpt+0x14c/0x168 [hns_roce_hw_v2] hns_roce_mr_enable+0x6c/0x148 [hns_roce] hns_roce_reg_user_mr+0xd8/0x130 [hns_roce] ib_uverbs_reg_mr+0x14c/0x2e0 [ib_uverbs] ib_uverbs_write+0x27c/0x3e8 [ib_uverbs] __vfs_write+0x60/0x190 vfs_write+0xac/0x1c0 ksys_write+0x6c/0xd8 __arm64_sys_write+0x24/0x30 el0_svc_common+0x78/0x130 el0_svc_handler+0x38/0x78 el0_svc+0x8/0xc Solve above issue by adding a pointer of structure hns_roce_dev as a parameter of write_mtpt() and frmr_write_mtpt(), so that both of these functions can access it before finishing MR's registration. Fixes: 9b2cf76c9f05 ("RDMA/hns: Optimize PBL buffer allocation process") Link: https://lore.kernel.org/r/1592314629-51715-1-git-send-email-liweihang@huawei.com Signed-off-by: Yangyang Li Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 7 ++++--- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 15 ++++++++------- drivers/infiniband/hw/hns/hns_roce_mr.c | 5 +++-- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index a77fa6730b2d..479fa557993e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -898,13 +898,14 @@ struct hns_roce_hw { int (*set_mac)(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr); void (*set_mtu)(struct hns_roce_dev *hr_dev, u8 phy_port, enum ib_mtu mtu); - int (*write_mtpt)(void *mb_buf, struct hns_roce_mr *mr, - unsigned long mtpt_idx); + int (*write_mtpt)(struct hns_roce_dev *hr_dev, void *mb_buf, + struct hns_roce_mr *mr, unsigned long mtpt_idx); int (*rereg_write_mtpt)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, int flags, u32 pdn, int mr_access_flags, u64 iova, u64 size, void *mb_buf); - int (*frmr_write_mtpt)(void *mb_buf, struct hns_roce_mr *mr); + int (*frmr_write_mtpt)(struct hns_roce_dev *hr_dev, void *mb_buf, + struct hns_roce_mr *mr); int (*mw_write_mtpt)(void *mb_buf, struct hns_roce_mw *mw); void (*write_cqc)(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index d02207cd30df..cf39f560b800 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -1756,10 +1756,10 @@ static void hns_roce_v1_set_mtu(struct hns_roce_dev *hr_dev, u8 phy_port, val); } -static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, +static int hns_roce_v1_write_mtpt(struct hns_roce_dev *hr_dev, void *mb_buf, + struct hns_roce_mr *mr, unsigned long mtpt_idx) { - struct hns_roce_dev *hr_dev = to_hr_dev(mr->ibmr.device); u64 pages[HNS_ROCE_MAX_INNER_MTPT_NUM] = { 0 }; struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_v1_mpt_entry *mpt_entry; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index c597d7281629..bb86754cb94d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2529,10 +2529,10 @@ static int hns_roce_v2_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, return hns_roce_cmq_send(hr_dev, &desc, 1); } -static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry, +static int set_mtpt_pbl(struct hns_roce_dev *hr_dev, + struct hns_roce_v2_mpt_entry *mpt_entry, struct hns_roce_mr *mr) { - struct hns_roce_dev *hr_dev = to_hr_dev(mr->ibmr.device); u64 pages[HNS_ROCE_V2_MAX_INNER_MTPT_NUM] = { 0 }; struct ib_device *ibdev = &hr_dev->ib_dev; dma_addr_t pbl_ba; @@ -2571,7 +2571,8 @@ static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry, return 0; } -static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, +static int hns_roce_v2_write_mtpt(struct hns_roce_dev *hr_dev, + void *mb_buf, struct hns_roce_mr *mr, unsigned long mtpt_idx) { struct hns_roce_v2_mpt_entry *mpt_entry; @@ -2620,7 +2621,7 @@ static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, if (mr->type == MR_TYPE_DMA) return 0; - ret = set_mtpt_pbl(mpt_entry, mr); + ret = set_mtpt_pbl(hr_dev, mpt_entry, mr); return ret; } @@ -2666,15 +2667,15 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev, mr->iova = iova; mr->size = size; - ret = set_mtpt_pbl(mpt_entry, mr); + ret = set_mtpt_pbl(hr_dev, mpt_entry, mr); } return ret; } -static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr) +static int hns_roce_v2_frmr_write_mtpt(struct hns_roce_dev *hr_dev, + void *mb_buf, struct hns_roce_mr *mr) { - struct hns_roce_dev *hr_dev = to_hr_dev(mr->ibmr.device); struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_v2_mpt_entry *mpt_entry; dma_addr_t pbl_ba = 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 4c0bbb12770d..0e71ebee9e52 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -180,9 +180,10 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, } if (mr->type != MR_TYPE_FRMR) - ret = hr_dev->hw->write_mtpt(mailbox->buf, mr, mtpt_idx); + ret = hr_dev->hw->write_mtpt(hr_dev, mailbox->buf, mr, + mtpt_idx); else - ret = hr_dev->hw->frmr_write_mtpt(mailbox->buf, mr); + ret = hr_dev->hw->frmr_write_mtpt(hr_dev, mailbox->buf, mr); if (ret) { dev_err(dev, "Write mtpt fail!\n"); goto err_page; -- cgit v1.2.3 From 3ec5f54f7a0fdf706951f379d3148798325e516f Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Tue, 16 Jun 2020 21:39:38 +0800 Subject: RDMA/hns: Fix an cmd queue issue when resetting If a IMP reset caused by some hardware errors and hns RoCE driver reset occurred at the same time, there is a possiblity that the IMP will stop dealing with command and users can't use the hardware. The logs are as follows: hns3 0000:fd:00.1: cleaned 0, need to clean 1 hns3 0000:fd:00.1: firmware version query failed -11 hns3 0000:fd:00.1: Cmd queue init failed hns3 0000:fd:00.1: Upgrade reset level hns3 0000:fd:00.1: global reset interrupt The hns NIC driver divides the reset process into 3 status: initialization, hardware resetting and softwaring restting. RoCE driver gets reset status by interfaces provided by NIC driver and commands will not be sent to the IMP if the driver is in any above status. The main reason for this issue is that there is a time gap between status 1 and 2, if the RoCE driver sends commands to the IMP during this gap, the IMP will stop working because it is not ready. To eliminate the time gap, the hns NIC driver has added a new interface in commit a4de02287abb9 ("net: hns3: provide .get_cmdq_stat interface for the client"), so RoCE driver can ensure that no commands will be sent during resetting. Link: https://lore.kernel.org/r/1592314778-52822-1-git-send-email-liweihang@huawei.com Signed-off-by: Yangyang Li Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index bb86754cb94d..dd01a51816cc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -910,7 +910,7 @@ static int hns_roce_v2_rst_process_cmd(struct hns_roce_dev *hr_dev) instance_stage = handle->rinfo.instance_state; reset_stage = handle->rinfo.reset_state; reset_cnt = ops->ae_dev_reset_cnt(handle); - hw_resetting = ops->get_hw_reset_stat(handle); + hw_resetting = ops->get_cmdq_stat(handle); sw_resetting = ops->ae_dev_resetting(handle); if (reset_cnt != hr_dev->reset_cnt) -- cgit v1.2.3 From 4121fb0db68ed4de574f9bdc630b75fcc99b4835 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 17 Jun 2020 09:18:26 +0300 Subject: RDMA/core: Check that type_attrs is not NULL prior access In disassociate flow, the type_attrs is set to be NULL, which is in an implicit way is checked in alloc_uobj() by "if (!attrs->context)". Change the logic to rely on that check, to be consistent with other alloc_uobj() places that will fix the following kernel splat. BUG: kernel NULL pointer dereference, address: 0000000000000018 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] SMP PTI CPU: 3 PID: 2743 Comm: python3 Not tainted 5.7.0-rc6-for-upstream-perf-2020-05-23_19-04-38-5 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 RIP: 0010:alloc_begin_fd_uobject+0x18/0xf0 [ib_uverbs] Code: 89 43 48 eb 97 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 55 49 89 f5 41 54 55 48 89 fd 53 48 83 ec 08 48 8b 1f <48> 8b 43 18 48 8b 80 80 00 00 00 48 3d 20 10 33 a0 74 1c 48 3d 30 RSP: 0018:ffffc90001127b70 EFLAGS: 00010282 RAX: ffffffffa0339fe0 RBX: 0000000000000000 RCX: 8000000000000007 RDX: fffffffffffffffb RSI: ffffc90001127d28 RDI: ffff88843fe1f600 RBP: ffff88843fe1f600 R08: ffff888461eb06d8 R09: ffff888461eb06f8 R10: ffff888461eb0700 R11: 0000000000000000 R12: ffff88846a5f6450 R13: ffffc90001127d28 R14: ffff88845d7d6ea0 R15: ffffc90001127cb8 FS: 00007f469bff1540(0000) GS:ffff88846f980000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000018 CR3: 0000000450018003 CR4: 0000000000760ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? xa_store+0x28/0x40 rdma_alloc_begin_uobject+0x4f/0x90 [ib_uverbs] ib_uverbs_create_comp_channel+0x87/0xf0 [ib_uverbs] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0xb1/0xf0 [ib_uverbs] ib_uverbs_cmd_verbs.isra.8+0x96d/0xae0 [ib_uverbs] ? get_page_from_freelist+0x3bb/0xf70 ? _copy_to_user+0x22/0x30 ? uverbs_disassociate_api+0xd0/0xd0 [ib_uverbs] ? __wake_up_common_lock+0x87/0xc0 ib_uverbs_ioctl+0xbc/0x130 [ib_uverbs] ksys_ioctl+0x83/0xc0 ? ksys_write+0x55/0xd0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x48/0x130 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f469ac43267 Fixes: 849e149063bd ("RDMA/core: Do not allow alloc_commit to fail") Link: https://lore.kernel.org/r/20200617061826.2625359-1-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rdma_core.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 38de4942c682..3027cd2fb247 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -470,40 +470,46 @@ static struct ib_uobject * alloc_begin_fd_uobject(const struct uverbs_api_object *obj, struct uverbs_attr_bundle *attrs) { - const struct uverbs_obj_fd_type *fd_type = - container_of(obj->type_attrs, struct uverbs_obj_fd_type, type); + const struct uverbs_obj_fd_type *fd_type; int new_fd; - struct ib_uobject *uobj; + struct ib_uobject *uobj, *ret; struct file *filp; + uobj = alloc_uobj(attrs, obj); + if (IS_ERR(uobj)) + return uobj; + + fd_type = + container_of(obj->type_attrs, struct uverbs_obj_fd_type, type); if (WARN_ON(fd_type->fops->release != &uverbs_uobject_fd_release && - fd_type->fops->release != &uverbs_async_event_release)) - return ERR_PTR(-EINVAL); + fd_type->fops->release != &uverbs_async_event_release)) { + ret = ERR_PTR(-EINVAL); + goto err_fd; + } new_fd = get_unused_fd_flags(O_CLOEXEC); - if (new_fd < 0) - return ERR_PTR(new_fd); - - uobj = alloc_uobj(attrs, obj); - if (IS_ERR(uobj)) + if (new_fd < 0) { + ret = ERR_PTR(new_fd); goto err_fd; + } /* Note that uverbs_uobject_fd_release() is called during abort */ filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL, fd_type->flags); if (IS_ERR(filp)) { - uverbs_uobject_put(uobj); - uobj = ERR_CAST(filp); - goto err_fd; + ret = ERR_CAST(filp); + goto err_getfile; } uobj->object = filp; uobj->id = new_fd; return uobj; -err_fd: +err_getfile: put_unused_fd(new_fd); - return uobj; +err_fd: + uverbs_uobject_put(uobj); + return ret; } struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj, -- cgit v1.2.3 From 6c41965d647a97b51ff665c7406ec9435aab7fc1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 17 Jun 2020 16:01:48 +0300 Subject: RDMA/mlx5: Don't access ib_qp fields in internal destroy QP path destroy_qp_common is called for flows where QP is already created by HW. While it is called from IB/core, the ibqp.* fields will be fully initialized, but it is not the case if this function is called during QP creation. Don't rely on ibqp fields as much as possible and initialize send_cq/recv_cq as temporal solution till all drivers will be converted to IB/core QP allocation scheme. refcount_t: underflow; use-after-free. WARNING: CPU: 1 PID: 5372 at lib/refcount.c:28 refcount_warn_saturate+0xfe/0x1a0 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 5372 Comm: syz-executor.2 Not tainted 5.5.0-rc5 #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 Call Trace: mlx5_core_put_rsc+0x70/0x80 destroy_resource_common+0x8e/0xb0 mlx5_core_destroy_qp+0xaf/0x1d0 mlx5_ib_destroy_qp+0xeb0/0x1460 ib_destroy_qp_user+0x2d5/0x7d0 create_qp+0xed3/0x2130 ib_uverbs_create_qp+0x13e/0x190 ? ib_uverbs_ex_create_qp ib_uverbs_write+0xaa5/0xdf0 __vfs_write+0x7c/0x100 ksys_write+0xc8/0x200 do_syscall_64+0x9c/0x390 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 08d53976609a ("RDMA/mlx5: Copy response to the user in one place") Link: https://lore.kernel.org/r/20200617130148.2846643-1-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index dbe82cdb8d2c..1e567fe3a527 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2341,18 +2341,18 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, unsigned long flags; int err; - if (qp->ibqp.rwq_ind_tbl) { + if (qp->is_rss) { destroy_rss_raw_qp_tir(dev, qp); return; } - base = (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + base = (qp->type == IB_QPT_RAW_PACKET || qp->flags & IB_QP_CREATE_SOURCE_QPN) ? - &qp->raw_packet_qp.rq.base : - &qp->trans_qp.base; + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; if (qp->state != IB_QPS_RESET) { - if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET && + if (qp->type != IB_QPT_RAW_PACKET && !(qp->flags & IB_QP_CREATE_SOURCE_QPN)) { err = mlx5_core_qp_modify(dev, MLX5_CMD_OP_2RST_QP, 0, NULL, &base->mqp, NULL); @@ -2368,8 +2368,8 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, base->mqp.qpn); } - get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, - &send_cq, &recv_cq); + get_cqs(qp->type, qp->ibqp.send_cq, qp->ibqp.recv_cq, &send_cq, + &recv_cq); spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx5_ib_lock_cqs(send_cq, recv_cq); @@ -2391,7 +2391,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, mlx5_ib_unlock_cqs(send_cq, recv_cq); spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); - if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || + if (qp->type == IB_QPT_RAW_PACKET || qp->flags & IB_QP_CREATE_SOURCE_QPN) { destroy_raw_packet_qp(dev, qp); } else { @@ -3002,10 +3002,18 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, return &qp->ibqp; destroy_qp: - if (qp->type == MLX5_IB_QPT_DCT) + if (qp->type == MLX5_IB_QPT_DCT) { mlx5_ib_destroy_dct(qp); - else + } else { + /* + * The two lines below are temp solution till QP allocation + * will be moved to be under IB/core responsiblity. + */ + qp->ibqp.send_cq = attr->send_cq; + qp->ibqp.recv_cq = attr->recv_cq; destroy_qp_common(dev, qp, udata); + } + qp = NULL; free_qp: kfree(qp); -- cgit v1.2.3 From d44335572f76020ef473d7977d9fd424e6fc0021 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 18 Jun 2020 14:25:07 +0300 Subject: RDMA/mlx5: Fix remote gid value in query QP Remote gid is not copied to the right address. Fix it by using rdma_ah_set_dgid_raw to copy the remote gid value from the QP context on query QP. Fixes: 70bd7fb87625 ("RDMA/mlx5: Remove manually crafted QP context the query call") Link: https://lore.kernel.org/r/20200618112507.3453496-3-leon@kernel.org Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1e567fe3a527..264e6d228803 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4390,8 +4390,7 @@ static void to_rdma_ah_attr(struct mlx5_ib_dev *ibdev, MLX5_GET(ads, path, src_addr_index), MLX5_GET(ads, path, hop_limit), MLX5_GET(ads, path, tclass)); - memcpy(ah_attr, MLX5_ADDR_OF(ads, path, rgid_rip), - MLX5_FLD_SZ_BYTES(ads, rgid_rip)); + rdma_ah_set_dgid_raw(ah_attr, MLX5_ADDR_OF(ads, path, rgid_rip)); } } -- cgit v1.2.3 From 2c0f5292d5358c2c5576146071d641110c3c1612 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 18 Jun 2020 14:25:06 +0300 Subject: RDMA/mlx5: Remove ECE limitation from the RAW_PACKET QPs Like any other QP type, rely on FW for the RAW_PACKET QPs to decide if ECE is supported or not. This fixes an inability to create RAW_PACKET QPs with latest rdma-core with the ECE support. Fixes: e383085c2425 ("RDMA/mlx5: Set ECE options during QP create") Link: https://lore.kernel.org/r/20200618112507.3453496-2-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 264e6d228803..eb5b724b121b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2873,7 +2873,6 @@ static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) static int check_ucmd_data(struct mlx5_ib_dev *dev, struct mlx5_create_qp_params *params) { - struct ib_qp_init_attr *attr = params->attr; struct ib_udata *udata = params->udata; size_t size, last; int ret; @@ -2885,14 +2884,7 @@ static int check_ucmd_data(struct mlx5_ib_dev *dev, */ last = sizeof(struct mlx5_ib_create_qp_rss); else - /* IB_QPT_RAW_PACKET doesn't have ECE data */ - switch (attr->qp_type) { - case IB_QPT_RAW_PACKET: - last = offsetof(struct mlx5_ib_create_qp, ece_options); - break; - default: - last = offsetof(struct mlx5_ib_create_qp, reserved); - } + last = offsetof(struct mlx5_ib_create_qp, reserved); if (udata->inlen <= last) return 0; -- cgit v1.2.3 From 9e0dc7b9e1cbc9cd0dc4ab2377c0536d4b18bedc Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 17 Jun 2020 16:02:30 +0300 Subject: RDMA/mlx5: Fix integrity enabled QP creation create_flags checks was refactored and broke the creation on integrity enabled QPs and actually broke the NVMe/RDMA and iSER ULP's when using mlx5 driven devices. Fixes: 2978975ce7f1 ("RDMA/mlx5: Process create QP flags in one place") Link: https://lore.kernel.org/r/20200617130230.2846915-1-leon@kernel.org Signed-off-by: Max Gurtovoy Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index eb5b724b121b..a7fcb00e37a5 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2668,6 +2668,9 @@ static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (qp_type == IB_QPT_RAW_PACKET && attr->rwq_ind_tbl) return (create_flags) ? -EINVAL : 0; + process_create_flag(dev, &create_flags, + IB_QP_CREATE_INTEGRITY_EN, + MLX5_CAP_GEN(mdev, sho), qp); process_create_flag(dev, &create_flags, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX5_CAP_GEN(mdev, block_lb_mc), qp); -- cgit v1.2.3 From a17f4bed811c60712d8131883cdba11a105d0161 Mon Sep 17 00:00:00 2001 From: Fan Guo Date: Fri, 12 Jun 2020 14:38:24 +0800 Subject: RDMA/mad: Fix possible memory leak in ib_mad_post_receive_mads() If ib_dma_mapping_error() returns non-zero value, ib_mad_post_receive_mads() will jump out of loops and return -ENOMEM without freeing mad_priv. Fix this memory-leak problem by freeing mad_priv in this case. Fixes: 2c34e68f4261 ("IB/mad: Check and handle potential DMA mapping errors") Link: https://lore.kernel.org/r/20200612063824.180611-1-guofan5@huawei.com Signed-off-by: Fan Guo Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/mad.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 186e0d652e8b..5e080191a725 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2718,6 +2718,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, DMA_FROM_DEVICE); if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device, sg_list.addr))) { + kfree(mad_priv); ret = -ENOMEM; break; } -- cgit v1.2.3 From c1d869d64a1955817c4d6fff08ecbbe8e59d36f8 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Sun, 21 Jun 2020 14:00:00 +0300 Subject: RDMA/counter: Query a counter before release Query a dynamically-allocated counter before release it, to update it's hwcounters and log all of them into history data. Otherwise all values of these hwcounters will be lost. Fixes: f34a55e497e8 ("RDMA/core: Get sum value of all counters when perform a sysfs stat read") Link: https://lore.kernel.org/r/20200621110000.56059-1-leon@kernel.org Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/counters.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c index 2257d7f7810f..738d1faf4bba 100644 --- a/drivers/infiniband/core/counters.c +++ b/drivers/infiniband/core/counters.c @@ -202,7 +202,7 @@ static int __rdma_counter_unbind_qp(struct ib_qp *qp) return ret; } -static void counter_history_stat_update(const struct rdma_counter *counter) +static void counter_history_stat_update(struct rdma_counter *counter) { struct ib_device *dev = counter->device; struct rdma_port_counter *port_counter; @@ -212,6 +212,8 @@ static void counter_history_stat_update(const struct rdma_counter *counter) if (!port_counter->hstats) return; + rdma_counter_query_stats(counter); + for (i = 0; i < counter->stats->num_counters; i++) port_counter->hstats->value[i] += counter->stats->value[i]; } -- cgit v1.2.3 From 6eefa839c4dddf2149e9f5f6f1aa3e1191c8db9c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 21 Jun 2020 14:59:59 +0300 Subject: RDMA/mlx5: Protect from kernel crash if XRC_TGT doesn't have udata Don't deref udata if it is NULL BUG: kernel NULL pointer dereference, address: 0000000000000030 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 SMP PTI CPU: 2 PID: 1592 Comm: python3 Not tainted 5.7.0-rc6+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 RIP: 0010:create_qp+0x39e/0xae0 [mlx5_ib] Code: c0 0d 00 00 bf 10 01 00 00 e8 be a9 e4 e0 48 85 c0 49 89 c2 0f 84 0c 07 00 00 41 8b 85 74 63 01 00 0f c8 a9 00 00 00 10 74 0a <41> 8b 46 30 0f c8 41 89 42 14 41 8b 52 18 41 0f b6 4a 1c 0f ca 89 RSP: 0018:ffffc9000067f8b0 EFLAGS: 00010206 RAX: 0000000010170000 RBX: ffff888441313000 RCX: 0000000000000000 RDX: 0000000000000200 RSI: 0000000000000000 RDI: ffff88845b1d4400 RBP: ffffc9000067fa60 R08: 0000000000000200 R09: ffff88845b1d4200 R10: ffff88845b1d4200 R11: ffff888441313000 R12: ffffc9000067f950 R13: ffff88846ac00140 R14: 0000000000000000 R15: ffff88846c2bc000 FS: 00007faa1a3c0540(0000) GS:ffff88846fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000030 CR3: 0000000446dca003 CR4: 0000000000760ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __switch_to_asm+0x40/0x70 ? __switch_to_asm+0x34/0x70 mlx5_ib_create_qp+0x897/0xfa0 [mlx5_ib] ib_create_qp+0x9e/0x300 [ib_core] create_qp+0x92d/0xb20 [ib_uverbs] ? ib_uverbs_cq_event_handler+0x30/0x30 [ib_uverbs] ? release_resource+0x30/0x30 ib_uverbs_create_qp+0xc4/0xe0 [ib_uverbs] ib_uverbs_handler_UVERBS_METHOD_INVOKE_WRITE+0xc8/0xf0 [ib_uverbs] ib_uverbs_run_method+0x223/0x770 [ib_uverbs] ? track_pfn_remap+0xa7/0x100 ? uverbs_disassociate_api+0xd0/0xd0 [ib_uverbs] ? remap_pfn_range+0x358/0x490 ib_uverbs_cmd_verbs.isra.6+0x19b/0x370 [ib_uverbs] ? rdma_umap_priv_init+0x82/0xe0 [ib_core] ? vm_mmap_pgoff+0xec/0x120 ib_uverbs_ioctl+0xc0/0x120 [ib_uverbs] ksys_ioctl+0x92/0xb0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x48/0x130 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: e383085c2425 ("RDMA/mlx5: Set ECE options during QP create") Link: https://lore.kernel.org/r/20200621115959.60126-1-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a7fcb00e37a5..f939c9b769f0 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1862,7 +1862,7 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (!in) return -ENOMEM; - if (MLX5_CAP_GEN(mdev, ece_support)) + if (MLX5_CAP_GEN(mdev, ece_support) && ucmd) MLX5_SET(create_qp_in, in, ece, ucmd->ece_options); qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); -- cgit v1.2.3 From 116a1b9f1cb769b83e5adff323f977a62b1dcb2e Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 21 Jun 2020 13:47:35 +0300 Subject: IB/mad: Fix use after free when destroying MAD agent Currently, when RMPP MADs are processed while the MAD agent is destroyed, it could result in use after free of rmpp_recv, as decribed below: cpu-0 cpu-1 ----- ----- ib_mad_recv_done() ib_mad_complete_recv() ib_process_rmpp_recv_wc() unregister_mad_agent() ib_cancel_rmpp_recvs() cancel_delayed_work() process_rmpp_data() start_rmpp() queue_delayed_work(rmpp_recv->cleanup_work) destroy_rmpp_recv() free_rmpp_recv() cleanup_work()[1] spin_lock_irqsave(&rmpp_recv->agent->lock) <-- use after free [1] cleanup_work() == recv_cleanup_handler Fix it by waiting for the MAD agent reference count becoming zero before calling to ib_cancel_rmpp_recvs(). Fixes: 9a41e38a467c ("IB/mad: Use IDR for agent IDs") Link: https://lore.kernel.org/r/20200621104738.54850-2-leon@kernel.org Signed-off-by: Shay Drory Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/mad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 5e080191a725..a09f8e3c7f3f 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -509,10 +509,10 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid); flush_workqueue(port_priv->wq); - ib_cancel_rmpp_recvs(mad_agent_priv); deref_mad_agent(mad_agent_priv); wait_for_completion(&mad_agent_priv->comp); + ib_cancel_rmpp_recvs(mad_agent_priv); ib_mad_agent_security_cleanup(&mad_agent_priv->agent); -- cgit v1.2.3 From b46925a24a9ca7db03655657565e03d2de3027c8 Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Tue, 23 Jun 2020 16:32:24 -0400 Subject: IB/hfi1: Restore kfree in dummy_netdev cleanup We need to do some rework on the dummy netdev. Calling the free_netdev() would normally make sense, and that will be addressed in an upcoming patch. For now just revert the behavior to what it was before keeping the unused variable removal part of the patch. The dd->dumm_netdev is mainly used for packet receiving through alloc_netdev_mqs() for typical net devices. A a result, it should be freed with kfree instead of free_netdev() that leads to a crash when unloading the hfi1 module: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 8000000855b54067 P4D 8000000855b54067 PUD 84a4f5067 PMD 0 Oops: 0000 [#1] SMP PTI CPU: 73 PID: 10299 Comm: modprobe Not tainted 5.6.0-rc5+ #1 Hardware name: Intel Corporation S2600WT2R/S2600WT2R, BIOS SE5C610.86B.01.01.0016.033120161139 03/31/2016 RIP: 0010:__hw_addr_flush+0x12/0x80 Code: 40 00 48 83 c4 08 4c 89 e7 5b 5d 41 5c e9 76 77 18 00 66 0f 1f 44 00 00 0f 1f 44 00 00 41 54 49 89 fc 55 53 48 8b 1f 48 39 df <48> 8b 2b 75 08 eb 4a 48 89 eb 48 89 c5 48 89 df e8 99 bf d0 ff 84 RSP: 0018:ffffb40e08783db8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000002 RDX: ffffb40e00000000 RSI: 0000000000000246 RDI: ffff88ab13662298 RBP: ffff88ab13662000 R08: 0000000000001549 R09: 0000000000001549 R10: 0000000000000001 R11: 0000000000aaaaaa R12: ffff88ab13662298 R13: ffff88ab1b259e20 R14: ffff88ab1b259e42 R15: 0000000000000000 FS: 00007fb39b534740(0000) GS:ffff88b31f940000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000084d3ea004 CR4: 00000000003606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: dev_addr_flush+0x15/0x30 free_netdev+0x7e/0x130 hfi1_netdev_free+0x59/0x70 [hfi1] remove_one+0x65/0x110 [hfi1] pci_device_remove+0x3b/0xc0 device_release_driver_internal+0xec/0x1b0 driver_detach+0x46/0x90 bus_remove_driver+0x58/0xd0 pci_unregister_driver+0x26/0xa0 hfi1_mod_cleanup+0xc/0xd54 [hfi1] __x64_sys_delete_module+0x16c/0x260 ? exit_to_usermode_loop+0xa4/0xc0 do_syscall_64+0x5b/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 193ba03141bb ("IB/hfi1: Use free_netdev() in hfi1_netdev_free()") Link: https://lore.kernel.org/r/20200623203224.106975.16926.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/netdev_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c index 63688e85e8da..6d263c9749b3 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -373,7 +373,7 @@ void hfi1_netdev_free(struct hfi1_devdata *dd) { if (dd->dummy_netdev) { dd_dev_info(dd, "hfi1 netdev freed\n"); - free_netdev(dd->dummy_netdev); + kfree(dd->dummy_netdev); dd->dummy_netdev = NULL; } } -- cgit v1.2.3 From 822fbd37410639acdae368ea55477ddd3498651d Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Tue, 23 Jun 2020 16:32:30 -0400 Subject: IB/hfi1: Fix module use count flaw due to leftover module put calls When the try_module_get calls were removed from opening and closing of the i2c debugfs file, the corresponding module_put calls were missed. This results in an inaccurate module use count that requires a power cycle to fix. Fixes: 09fbca8e6240 ("IB/hfi1: No need to use try_module_get for debugfs") Link: https://lore.kernel.org/r/20200623203230.106975.76240.stgit@awfm-01.aw.intel.com Cc: Reviewed-by: Kaike Wan Reviewed-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/debugfs.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c index 4633a0ce1a8c..2ced236e1553 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.c +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -985,15 +985,10 @@ static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf, static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target) { struct hfi1_pportdata *ppd; - int ret; ppd = private2ppd(fp); - ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0); - if (ret) /* failed - release the module */ - module_put(THIS_MODULE); - - return ret; + return acquire_chip_resource(ppd->dd, i2c_target(target), 0); } static int i2c1_debugfs_open(struct inode *in, struct file *fp) @@ -1013,7 +1008,6 @@ static int __i2c_debugfs_release(struct inode *in, struct file *fp, u32 target) ppd = private2ppd(fp); release_chip_resource(ppd->dd, i2c_target(target)); - module_put(THIS_MODULE); return 0; } @@ -1031,18 +1025,10 @@ static int i2c2_debugfs_release(struct inode *in, struct file *fp) static int __qsfp_debugfs_open(struct inode *in, struct file *fp, u32 target) { struct hfi1_pportdata *ppd; - int ret; - - if (!try_module_get(THIS_MODULE)) - return -ENODEV; ppd = private2ppd(fp); - ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0); - if (ret) /* failed - release the module */ - module_put(THIS_MODULE); - - return ret; + return acquire_chip_resource(ppd->dd, i2c_target(target), 0); } static int qsfp1_debugfs_open(struct inode *in, struct file *fp) @@ -1062,7 +1048,6 @@ static int __qsfp_debugfs_release(struct inode *in, struct file *fp, u32 target) ppd = private2ppd(fp); release_chip_resource(ppd->dd, i2c_target(target)); - module_put(THIS_MODULE); return 0; } -- cgit v1.2.3 From 82172b765530f84b4b9da929f2dcf46f2b7b232b Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 23 Jun 2020 16:43:22 -0400 Subject: IB/hfi1: Correct -EBUSY handling in tx code The current code mishandles -EBUSY in two ways: - The flow change doesn't test the return from the flush and runs on to process the current packet racing with the wakeup processing - The -EBUSY handling for a single packet inserts the tx into the txlist after the submit call, racing with the same wakeup processing Fix the first by dropping the skb and returning NETDEV_TX_OK. Fix the second by insuring the the list entry within the txreq is inited when allocated. This enables the sleep routine to detect that the txreq has used the non-list api and queue the packet to the txlist. Both flaws can lead to having the flushing thread executing in causing two threads to manipulate the txlist. Fixes: d99dc602e2a5 ("IB/hfi1: Add functions to transmit datagram ipoib packets") Link: https://lore.kernel.org/r/20200623204321.108092.83898.stgit@awfm-01.aw.intel.com Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib_tx.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index 175290c56db9..76dcb5624aa6 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -369,6 +369,7 @@ static struct ipoib_txreq *hfi1_ipoib_send_dma_common(struct net_device *dev, tx->priv = priv; tx->txq = txp->txq; tx->skb = skb; + INIT_LIST_HEAD(&tx->txreq.list); hfi1_ipoib_build_ib_tx_headers(tx, txp); @@ -469,6 +470,7 @@ static int hfi1_ipoib_send_dma_single(struct net_device *dev, ret = hfi1_ipoib_submit_tx(txq, tx); if (likely(!ret)) { +tx_ok: trace_sdma_output_ibhdr(tx->priv->dd, &tx->sdma_hdr.hdr, ib_is_sc5(txp->flow.sc5)); @@ -478,20 +480,8 @@ static int hfi1_ipoib_send_dma_single(struct net_device *dev, txq->pkts_sent = false; - if (ret == -EBUSY) { - list_add_tail(&tx->txreq.list, &txq->tx_list); - - trace_sdma_output_ibhdr(tx->priv->dd, - &tx->sdma_hdr.hdr, - ib_is_sc5(txp->flow.sc5)); - hfi1_ipoib_check_queue_depth(txq); - return NETDEV_TX_OK; - } - - if (ret == -ECOMM) { - hfi1_ipoib_check_queue_depth(txq); - return NETDEV_TX_OK; - } + if (ret == -EBUSY || ret == -ECOMM) + goto tx_ok; sdma_txclean(priv->dd, &tx->txreq); dev_kfree_skb_any(skb); @@ -509,9 +499,17 @@ static int hfi1_ipoib_send_dma_list(struct net_device *dev, struct ipoib_txreq *tx; /* Has the flow change ? */ - if (txq->flow.as_int != txp->flow.as_int) - (void)hfi1_ipoib_flush_tx_list(dev, txq); - + if (txq->flow.as_int != txp->flow.as_int) { + int ret; + + ret = hfi1_ipoib_flush_tx_list(dev, txq); + if (unlikely(ret)) { + if (ret == -EBUSY) + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + } tx = hfi1_ipoib_send_dma_common(dev, skb, txp); if (IS_ERR(tx)) { int ret = PTR_ERR(tx); @@ -612,6 +610,9 @@ static int hfi1_ipoib_sdma_sleep(struct sdma_engine *sde, netif_stop_subqueue(txq->priv->netdev, txq->q_idx); + if (list_empty(&txreq->list)) + /* came from non-list submit */ + list_add_tail(&txreq->list, &txq->tx_list); if (list_empty(&txq->wait.list)) iowait_queue(pkts_sent, wait->iow, &sde->dmawait); -- cgit v1.2.3 From 38fd98afeeb79d3b148db49f81f2ec6a37a4ee00 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 23 Jun 2020 16:43:28 -0400 Subject: IB/hfi1: Add atomic triggered sleep/wakeup When running iperf in a two host configuration the following trace can occur: [ 319.728730] NETDEV WATCHDOG: ib0 (hfi1): transmit queue 0 timed out The issue happens because the current implementation relies on the netif txq being stopped to control the flushing of the tx list. There are two resources that the transmit logic can wait on and stop the txq: - SDMA descriptors - Ring space to hold completions The ring space is tested on the sending side and relieved when the ring is consumed in the napi tx reaping. Unfortunately, that reaping can run conncurrently with the workqueue flushing of the txlist. If the txq is started just before the workitem executes, the txlist will never be flushed, leading to the txq being stuck. Fix by: - Adding sleep/wakeup wrappers * Use an atomic to control the call to the netif routines inside the wrappers - Use another atomic to record ring space exhaustion * Only wakeup when the a ring space exhaustion has happened and it relieved Add additional wrappers to clarify the ring space resource handling. Fixes: d99dc602e2a5 ("IB/hfi1: Add functions to transmit datagram ipoib packets") Link: https://lore.kernel.org/r/20200623204327.108092.4024.stgit@awfm-01.aw.intel.com Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib.h | 6 ++++ drivers/infiniband/hw/hfi1/ipoib_tx.c | 67 ++++++++++++++++++++++++----------- 2 files changed, 53 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib.h b/drivers/infiniband/hw/hfi1/ipoib.h index 185c9b02c974..b8c9d0a003fb 100644 --- a/drivers/infiniband/hw/hfi1/ipoib.h +++ b/drivers/infiniband/hw/hfi1/ipoib.h @@ -67,6 +67,9 @@ struct hfi1_ipoib_circ_buf { * @sde: sdma engine * @tx_list: tx request list * @sent_txreqs: count of txreqs posted to sdma + * @stops: count of stops of queue + * @ring_full: ring has been filled + * @no_desc: descriptor shortage seen * @flow: tracks when list needs to be flushed for a flow change * @q_idx: ipoib Tx queue index * @pkts_sent: indicator packets have been sent from this queue @@ -80,6 +83,9 @@ struct hfi1_ipoib_txq { struct sdma_engine *sde; struct list_head tx_list; u64 sent_txreqs; + atomic_t stops; + atomic_t ring_full; + atomic_t no_desc; union hfi1_ipoib_flow flow; u8 q_idx; bool pkts_sent; diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index 76dcb5624aa6..9df292b51a05 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -55,23 +55,48 @@ static u64 hfi1_ipoib_txreqs(const u64 sent, const u64 completed) return sent - completed; } -static void hfi1_ipoib_check_queue_depth(struct hfi1_ipoib_txq *txq) +static u64 hfi1_ipoib_used(struct hfi1_ipoib_txq *txq) +{ + return hfi1_ipoib_txreqs(txq->sent_txreqs, + atomic64_read(&txq->complete_txreqs)); +} + +static void hfi1_ipoib_stop_txq(struct hfi1_ipoib_txq *txq) { - if (unlikely(hfi1_ipoib_txreqs(++txq->sent_txreqs, - atomic64_read(&txq->complete_txreqs)) >= - min_t(unsigned int, txq->priv->netdev->tx_queue_len, - txq->tx_ring.max_items - 1))) + if (atomic_inc_return(&txq->stops) == 1) netif_stop_subqueue(txq->priv->netdev, txq->q_idx); } +static void hfi1_ipoib_wake_txq(struct hfi1_ipoib_txq *txq) +{ + if (atomic_dec_and_test(&txq->stops)) + netif_wake_subqueue(txq->priv->netdev, txq->q_idx); +} + +static uint hfi1_ipoib_ring_hwat(struct hfi1_ipoib_txq *txq) +{ + return min_t(uint, txq->priv->netdev->tx_queue_len, + txq->tx_ring.max_items - 1); +} + +static uint hfi1_ipoib_ring_lwat(struct hfi1_ipoib_txq *txq) +{ + return min_t(uint, txq->priv->netdev->tx_queue_len, + txq->tx_ring.max_items) >> 1; +} + +static void hfi1_ipoib_check_queue_depth(struct hfi1_ipoib_txq *txq) +{ + ++txq->sent_txreqs; + if (hfi1_ipoib_used(txq) >= hfi1_ipoib_ring_hwat(txq) && + !atomic_xchg(&txq->ring_full, 1)) + hfi1_ipoib_stop_txq(txq); +} + static void hfi1_ipoib_check_queue_stopped(struct hfi1_ipoib_txq *txq) { struct net_device *dev = txq->priv->netdev; - /* If the queue is already running just return */ - if (likely(!__netif_subqueue_stopped(dev, txq->q_idx))) - return; - /* If shutting down just return as queue state is irrelevant */ if (unlikely(dev->reg_state != NETREG_REGISTERED)) return; @@ -86,11 +111,9 @@ static void hfi1_ipoib_check_queue_stopped(struct hfi1_ipoib_txq *txq) * Use the minimum of the current tx_queue_len or the rings max txreqs * to protect against ring overflow. */ - if (hfi1_ipoib_txreqs(txq->sent_txreqs, - atomic64_read(&txq->complete_txreqs)) - < min_t(unsigned int, dev->tx_queue_len, - txq->tx_ring.max_items) >> 1) - netif_wake_subqueue(dev, txq->q_idx); + if (hfi1_ipoib_used(txq) < hfi1_ipoib_ring_lwat(txq) && + atomic_xchg(&txq->ring_full, 0)) + hfi1_ipoib_wake_txq(txq); } static void hfi1_ipoib_free_tx(struct ipoib_txreq *tx, int budget) @@ -608,13 +631,14 @@ static int hfi1_ipoib_sdma_sleep(struct sdma_engine *sde, return -EAGAIN; } - netif_stop_subqueue(txq->priv->netdev, txq->q_idx); - if (list_empty(&txreq->list)) /* came from non-list submit */ list_add_tail(&txreq->list, &txq->tx_list); - if (list_empty(&txq->wait.list)) + if (list_empty(&txq->wait.list)) { + if (!atomic_xchg(&txq->no_desc, 1)) + hfi1_ipoib_stop_txq(txq); iowait_queue(pkts_sent, wait->iow, &sde->dmawait); + } write_sequnlock(&sde->waitlock); return -EBUSY; @@ -649,9 +673,9 @@ static void hfi1_ipoib_flush_txq(struct work_struct *work) struct net_device *dev = txq->priv->netdev; if (likely(dev->reg_state == NETREG_REGISTERED) && - likely(__netif_subqueue_stopped(dev, txq->q_idx)) && likely(!hfi1_ipoib_flush_tx_list(dev, txq))) - netif_wake_subqueue(dev, txq->q_idx); + if (atomic_xchg(&txq->no_desc, 0)) + hfi1_ipoib_wake_txq(txq); } int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) @@ -705,6 +729,9 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) txq->sde = NULL; INIT_LIST_HEAD(&txq->tx_list); atomic64_set(&txq->complete_txreqs, 0); + atomic_set(&txq->stops, 0); + atomic_set(&txq->ring_full, 0); + atomic_set(&txq->no_desc, 0); txq->q_idx = i; txq->flow.tx_queue = 0xff; txq->flow.sc5 = 0xff; @@ -770,7 +797,7 @@ static void hfi1_ipoib_drain_tx_list(struct hfi1_ipoib_txq *txq) atomic64_inc(complete_txreqs); } - if (hfi1_ipoib_txreqs(txq->sent_txreqs, atomic64_read(complete_txreqs))) + if (hfi1_ipoib_used(txq)) dd_dev_warn(txq->priv->dd, "txq %d not empty found %llu requests\n", txq->q_idx, -- cgit v1.2.3