From 0184afd15a141d7ce24c32c0d86a1e3ba6bc0eb3 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Mon, 6 Apr 2020 20:35:01 +0300 Subject: RDMA/rxe: Set default vendor ID The RXE driver doesn't set vendor_id and user space applications see zeros. This causes to pyverbs tests to fail with the following traceback, because the expectation is to have valid vendor_id. Traceback (most recent call last): File "tests/test_device.py", line 51, in test_query_device self.verify_device_attr(attr) File "tests/test_device.py", line 77, in verify_device_attr assert attr.vendor_id != 0 In order to fix it, we will set vendor_id 0XFFFFFF, according to the IBTA v1.4 A3.3.1 VENDOR INFORMATION section. """ A vendor that produces a generic controller (i.e., one that supports a standard I/O protocol such as SRP), which does not have vendor specific device drivers, may use the value of 0xFFFFFF in the VendorID field. """ Before: hca_id: rxe0 transport: InfiniBand (0) fw_ver: 0.0.0 node_guid: 5054:00ff:feaa:5363 sys_image_guid: 5054:00ff:feaa:5363 vendor_id: 0x0000 After: hca_id: rxe0 transport: InfiniBand (0) fw_ver: 0.0.0 node_guid: 5054:00ff:feaa:5363 sys_image_guid: 5054:00ff:feaa:5363 vendor_id: 0xffffff Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://lore.kernel.org/r/20200406173501.1466273-1-leon@kernel.org Signed-off-by: Zhu Yanjun Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 1 + drivers/infiniband/sw/rxe/rxe_param.h | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 4afdd2e20883..5642eefb4ba1 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -77,6 +77,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe) { rxe->max_inline_data = RXE_MAX_INLINE_DATA; + rxe->attr.vendor_id = RXE_VENDOR_ID; rxe->attr.max_mr_size = RXE_MAX_MR_SIZE; rxe->attr.page_size_cap = RXE_PAGE_SIZE_CAP; rxe->attr.max_qp = RXE_MAX_QP; diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index f59616b02477..99e9d8ba9767 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -127,6 +127,9 @@ enum rxe_device_param { /* Delay before calling arbiter timer */ RXE_NSEC_ARB_TIMER_DELAY = 200, + + /* IBTA v1.4 A3.3.1 VENDOR INFORMATION section */ + RXE_VENDOR_ID = 0XFFFFFF, }; /* default/initial rxe port parameters */ -- cgit v1.2.3 From cf26deff9036cd3270af562dbec545239e5c7f07 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 6 Apr 2020 20:35:40 +0300 Subject: RDMA/mlx5: Fix udata response upon SRQ creation Fix udata response upon SRQ creation to use the UAPI structure (i.e. mlx5_ib_create_srq_resp). It did not zero the reserved field in userspace. Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Link: https://lore.kernel.org/r/20200406173540.1466477-1-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/srq.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index b1a8a9175040..6d1ff13d2283 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -310,12 +310,18 @@ int mlx5_ib_create_srq(struct ib_srq *ib_srq, srq->msrq.event = mlx5_ib_srq_event; srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn; - if (udata) - if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) { + if (udata) { + struct mlx5_ib_create_srq_resp resp = { + .srqn = srq->msrq.srqn, + }; + + if (ib_copy_to_udata(udata, &resp, min(udata->outlen, + sizeof(resp)))) { mlx5_ib_dbg(dev, "copy to user failed\n"); err = -EFAULT; goto err_core; } + } init_attr->attr.max_wr = srq->msrq.max - 1; -- cgit v1.2.3 From eb356e6dc15a30af604f052cd0e170450193c254 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 6 Apr 2020 21:44:26 -0300 Subject: RDMA/uverbs: Make the event_queue fds return POLLERR when disassociated If is_closed is set, and the event list is empty, then read() will return -EIO without blocking. After setting is_closed in ib_uverbs_free_event_queue(), we do trigger a wake_up on the poll_wait, but the fops->poll() function does not check it, so poll will continue to sleep on an empty list. Fixes: 14e23bd6d221 ("RDMA/core: Fix locking in ib_uverbs_event_read") Link: https://lore.kernel.org/r/0-v1-ace813388969+48859-uverbs_poll_fix%25jgg@mellanox.com Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 2d4083bf4a04..8710a3427146 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -296,6 +296,8 @@ static __poll_t ib_uverbs_event_poll(struct ib_uverbs_event_queue *ev_queue, spin_lock_irq(&ev_queue->lock); if (!list_empty(&ev_queue->event_list)) pollflags = EPOLLIN | EPOLLRDNORM; + else if (ev_queue->is_closed) + pollflags = EPOLLERR; spin_unlock_irq(&ev_queue->lock); return pollflags; -- cgit v1.2.3 From 1587982e705db1ac090b05a7006771c78d0e8417 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 7 Apr 2020 20:20:09 -0300 Subject: RDMA: Remove a few extra calls to ib_get_client_data() These four places already have easy access to the client data, just use that instead. Link: https://lore.kernel.org/r/0-v1-fae83f600b4a+68-less_get_client_data%25jgg@mellanox.com Acked-by: Ursula Braun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/sa_query.c | 15 ++++++--------- drivers/infiniband/ulp/srpt/ib_srpt.c | 7 ++----- net/smc/smc_ib.c | 3 +-- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 74e0058fcf9e..2dd326f2beed 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1412,17 +1412,13 @@ void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute) EXPORT_SYMBOL(ib_sa_pack_path); static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client, - struct ib_device *device, + struct ib_sa_device *sa_dev, u8 port_num) { - struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; unsigned long flags; bool ret = false; - if (!sa_dev) - return ret; - port = &sa_dev->port[port_num - sa_dev->start_port]; spin_lock_irqsave(&port->classport_lock, flags); if (!port->classport_info.valid) @@ -1450,8 +1446,8 @@ enum opa_pr_supported { * query is possible. */ static int opa_pr_query_possible(struct ib_sa_client *client, - struct ib_device *device, - u8 port_num, + struct ib_sa_device *sa_dev, + struct ib_device *device, u8 port_num, struct sa_path_rec *rec) { struct ib_port_attr port_attr; @@ -1459,7 +1455,7 @@ static int opa_pr_query_possible(struct ib_sa_client *client, if (ib_query_port(device, port_num, &port_attr)) return PR_NOT_SUPPORTED; - if (ib_sa_opa_pathrecord_support(client, device, port_num)) + if (ib_sa_opa_pathrecord_support(client, sa_dev, port_num)) return PR_OPA_SUPPORTED; if (port_attr.lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) @@ -1574,7 +1570,8 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, query->sa_query.port = port; if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { - status = opa_pr_query_possible(client, device, port_num, rec); + status = opa_pr_query_possible(client, sa_dev, device, port_num, + rec); if (status == PR_NOT_SUPPORTED) { ret = -EINVAL; goto err1; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 98552749d71c..9d02d8088f1c 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -135,14 +135,11 @@ static bool srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new) static void srpt_event_handler(struct ib_event_handler *handler, struct ib_event *event) { - struct srpt_device *sdev; + struct srpt_device *sdev = + container_of(handler, struct srpt_device, event_handler); struct srpt_port *sport; u8 port_num; - sdev = ib_get_client_data(event->device, &srpt_client); - if (!sdev || sdev->device != event->device) - return; - pr_debug("ASYNC event= %d on device= %s\n", event->event, dev_name(&sdev->device->dev)); diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 04b6fefb8bce..e7e7c3c6e94a 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -588,9 +588,8 @@ static void smc_ib_add_dev(struct ib_device *ibdev) /* callback function for ib_unregister_client() */ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) { - struct smc_ib_device *smcibdev; + struct smc_ib_device *smcibdev = client_data; - smcibdev = ib_get_client_data(ibdev, &smc_ib_client); if (!smcibdev || smcibdev->ibdev != ibdev) return; ib_set_client_data(ibdev, &smc_ib_client, NULL); -- cgit v1.2.3 From 255e636df4133507254da13137e8d8524ef0794f Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 14 Apr 2020 18:48:43 +0200 Subject: IB: Fix some documentation warnings Parsing verbs.c with kernel-doc produce some warnings: ./drivers/infiniband/core/verbs.c:2579: WARNING: Unexpected indentation. ./drivers/infiniband/core/verbs.c:2581: WARNING: Block quote ends without a blank line; unexpected unindent. ./drivers/infiniband/core/verbs.c:2613: WARNING: Unexpected indentation. ./drivers/infiniband/core/verbs.c:2579: WARNING: Unexpected indentation. ./drivers/infiniband/core/verbs.c:2581: WARNING: Block quote ends without a blank line; unexpected unindent. ./drivers/infiniband/core/verbs.c:2613: WARNING: Unexpected indentation. Address them by adding an extra blank line and converting the parameters on one of the arguments to a table. Link: https://lore.kernel.org/r/4c5466d0f450c5a9952138150c3485740b37f9c5.1586881715.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 56a71337112c..3bfadd8effcc 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2574,6 +2574,7 @@ EXPORT_SYMBOL(ib_map_mr_sg_pi); * @page_size: page vector desired page size * * Constraints: + * * - The first sg element is allowed to have an offset. * - Each sg element must either be aligned to page_size or virtually * contiguous to the previous element. In case an sg element has a @@ -2607,10 +2608,12 @@ EXPORT_SYMBOL(ib_map_mr_sg); * @mr: memory region * @sgl: dma mapped scatterlist * @sg_nents: number of entries in sg - * @sg_offset_p: IN: start offset in bytes into sg - * OUT: offset in bytes for element n of the sg of the first + * @sg_offset_p: ==== ======================================================= + * IN start offset in bytes into sg + * OUT offset in bytes for element n of the sg of the first * byte that has not been processed where n is the return * value of this function. + * ==== ======================================================= * @set_page: driver page assignment function pointer * * Core service helper for drivers to convert the largest -- cgit v1.2.3 From 4f953089111d45d0e654080a1f2edec39f197c93 Mon Sep 17 00:00:00 2001 From: Zou Wei Date: Mon, 13 Apr 2020 10:42:04 +0800 Subject: IB/qib: Remove unused variable ret This patch fixes below warnings reported by coccicheck drivers/infiniband/hw/qib/qib_iba7322.c:6878:8-11: Unneeded variable: "ret". Return "0" on line 6907 drivers/infiniband/hw/qib/qib_iba7322.c:2378:5-8: Unneeded variable: "ret". Return "0" on line 2513 Link: https://lore.kernel.org/r/1586745724-107477-1-git-send-email-zou_wei@huawei.com Reported-by: Hulk Robot Signed-off-by: Zou Wei Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib_iba7322.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 91d64dd71a8a..8bcbc884e5b6 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2375,7 +2375,6 @@ static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) struct qib_devdata *dd = ppd->dd; u64 val, guid, ibc; unsigned long flags; - int ret = 0; /* * SerDes model not in Pd, but still need to @@ -2510,7 +2509,7 @@ static int qib_7322_bringup_serdes(struct qib_pportdata *ppd) val | ERR_MASK_N(IBStatusChanged)); /* Always zero until we start messing with SerDes for real */ - return ret; + return 0; } /** @@ -6875,7 +6874,7 @@ static int init_sdma_7322_regs(struct qib_pportdata *ppd) struct qib_devdata *dd = ppd->dd; unsigned lastbuf, erstbuf; u64 senddmabufmask[3] = { 0 }; - int n, ret = 0; + int n; qib_write_kreg_port(ppd, krp_senddmabase, ppd->sdma_descq_phys); qib_sdma_7322_setlengen(ppd); @@ -6904,7 +6903,7 @@ static int init_sdma_7322_regs(struct qib_pportdata *ppd) qib_write_kreg_port(ppd, krp_senddmabufmask0, senddmabufmask[0]); qib_write_kreg_port(ppd, krp_senddmabufmask1, senddmabufmask[1]); qib_write_kreg_port(ppd, krp_senddmabufmask2, senddmabufmask[2]); - return ret; + return 0; } /* sdma_lock must be held */ -- cgit v1.2.3 From 99bf84e24eb83d1612598cee1807732bd194c23c Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 2 Apr 2020 14:12:12 -0400 Subject: RDMA/bnxt_re: Reduce device page size detection code Getting rid of the repeated code in the driver when deciding on the page size of the hardware ring memory. A new common function would translate the ring page size into device specific page size. Link: https://lore.kernel.org/r/1585851136-2316-2-git-send-email-devesh.sharma@broadcom.com Signed-off-by: Devesh Sharma Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 138 ++++++++--------------------- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 72 ++++++--------- drivers/infiniband/hw/bnxt_re/qplib_res.h | 40 +++++++++ 3 files changed, 103 insertions(+), 147 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 899a5d2c100e..d3bf9f665982 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -612,6 +612,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, struct cmdq_create_srq req; struct bnxt_qplib_pbl *pbl; u16 cmd_flags = 0; + u16 pg_sz_lvl; int rc, idx; hwq_attr.res = res; @@ -638,22 +639,11 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, req.srq_size = cpu_to_le16((u16)srq->hwq.max_elements); pbl = &srq->hwq.pbl[PBL_LVL_0]; - req.pg_size_lvl = cpu_to_le16((((u16)srq->hwq.level & - CMDQ_CREATE_SRQ_LVL_MASK) << - CMDQ_CREATE_SRQ_LVL_SFT) | - (pbl->pg_size == ROCE_PG_SIZE_4K ? - CMDQ_CREATE_SRQ_PG_SIZE_PG_4K : - pbl->pg_size == ROCE_PG_SIZE_8K ? - CMDQ_CREATE_SRQ_PG_SIZE_PG_8K : - pbl->pg_size == ROCE_PG_SIZE_64K ? - CMDQ_CREATE_SRQ_PG_SIZE_PG_64K : - pbl->pg_size == ROCE_PG_SIZE_2M ? - CMDQ_CREATE_SRQ_PG_SIZE_PG_2M : - pbl->pg_size == ROCE_PG_SIZE_8M ? - CMDQ_CREATE_SRQ_PG_SIZE_PG_8M : - pbl->pg_size == ROCE_PG_SIZE_1G ? - CMDQ_CREATE_SRQ_PG_SIZE_PG_1G : - CMDQ_CREATE_SRQ_PG_SIZE_PG_4K)); + pg_sz_lvl = ((u16)bnxt_qplib_base_pg_size(&srq->hwq) << + CMDQ_CREATE_SRQ_PG_SIZE_SFT); + pg_sz_lvl |= (srq->hwq.level & CMDQ_CREATE_SRQ_LVL_MASK) << + CMDQ_CREATE_SRQ_LVL_SFT; + req.pg_size_lvl = cpu_to_le16(pg_sz_lvl); req.pbl = cpu_to_le64(pbl->pg_map_arr[0]); req.pd_id = cpu_to_le32(srq->pd->id); req.eventq_id = cpu_to_le16(srq->eventq_hw_ring_id); @@ -809,6 +799,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) struct bnxt_qplib_pbl *pbl; u16 cmd_flags = 0; u32 qp_flags = 0; + u8 pg_sz_lvl; int rc; RCFW_CMD_PREP(req, CREATE_QP1, cmd_flags); @@ -835,28 +826,13 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) } pbl = &sq->hwq.pbl[PBL_LVL_0]; req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); - req.sq_pg_size_sq_lvl = - ((sq->hwq.level & CMDQ_CREATE_QP1_SQ_LVL_MASK) - << CMDQ_CREATE_QP1_SQ_LVL_SFT) | - (pbl->pg_size == ROCE_PG_SIZE_4K ? - CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_4K : - pbl->pg_size == ROCE_PG_SIZE_8K ? - CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_8K : - pbl->pg_size == ROCE_PG_SIZE_64K ? - CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_64K : - pbl->pg_size == ROCE_PG_SIZE_2M ? - CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_2M : - pbl->pg_size == ROCE_PG_SIZE_8M ? - CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_8M : - pbl->pg_size == ROCE_PG_SIZE_1G ? - CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_1G : - CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_4K); + pg_sz_lvl = (bnxt_qplib_base_pg_size(&sq->hwq) << + CMDQ_CREATE_QP1_SQ_PG_SIZE_SFT); + pg_sz_lvl |= (sq->hwq.level & CMDQ_CREATE_QP1_SQ_LVL_MASK); + req.sq_pg_size_sq_lvl = pg_sz_lvl; if (qp->scq) req.scq_cid = cpu_to_le32(qp->scq->id); - - qp_flags |= CMDQ_CREATE_QP1_QP_FLAGS_RESERVED_LKEY_ENABLE; - /* RQ */ if (rq->max_wqe) { hwq_attr.res = res; @@ -876,32 +852,20 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) } pbl = &rq->hwq.pbl[PBL_LVL_0]; req.rq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); - req.rq_pg_size_rq_lvl = - ((rq->hwq.level & CMDQ_CREATE_QP1_RQ_LVL_MASK) << - CMDQ_CREATE_QP1_RQ_LVL_SFT) | - (pbl->pg_size == ROCE_PG_SIZE_4K ? - CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_4K : - pbl->pg_size == ROCE_PG_SIZE_8K ? - CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_8K : - pbl->pg_size == ROCE_PG_SIZE_64K ? - CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_64K : - pbl->pg_size == ROCE_PG_SIZE_2M ? - CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_2M : - pbl->pg_size == ROCE_PG_SIZE_8M ? - CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_8M : - pbl->pg_size == ROCE_PG_SIZE_1G ? - CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_1G : - CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_4K); + pg_sz_lvl = (bnxt_qplib_base_pg_size(&rq->hwq) << + CMDQ_CREATE_QP1_RQ_PG_SIZE_SFT); + pg_sz_lvl |= (rq->hwq.level & CMDQ_CREATE_QP1_RQ_LVL_MASK); + req.rq_pg_size_rq_lvl = pg_sz_lvl; if (qp->rcq) req.rcq_cid = cpu_to_le32(qp->rcq->id); } - /* Header buffer - allow hdr_buf pass in */ rc = bnxt_qplib_alloc_qp_hdr_buf(res, qp); if (rc) { rc = -ENOMEM; goto fail; } + qp_flags |= CMDQ_CREATE_QP1_QP_FLAGS_RESERVED_LKEY_ENABLE; req.qp_flags = cpu_to_le32(qp_flags); req.sq_size = cpu_to_le32(sq->hwq.max_elements); req.rq_size = cpu_to_le32(rq->hwq.max_elements); @@ -965,6 +929,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) struct cmdq_create_qp req; struct bnxt_qplib_pbl *pbl; u32 qp_flags = 0; + u8 pg_sz_lvl; u16 max_rsge; RCFW_CMD_PREP(req, CREATE_QP, cmd_flags); @@ -1025,31 +990,14 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) } pbl = &sq->hwq.pbl[PBL_LVL_0]; req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); - req.sq_pg_size_sq_lvl = - ((sq->hwq.level & CMDQ_CREATE_QP_SQ_LVL_MASK) - << CMDQ_CREATE_QP_SQ_LVL_SFT) | - (pbl->pg_size == ROCE_PG_SIZE_4K ? - CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K : - pbl->pg_size == ROCE_PG_SIZE_8K ? - CMDQ_CREATE_QP_SQ_PG_SIZE_PG_8K : - pbl->pg_size == ROCE_PG_SIZE_64K ? - CMDQ_CREATE_QP_SQ_PG_SIZE_PG_64K : - pbl->pg_size == ROCE_PG_SIZE_2M ? - CMDQ_CREATE_QP_SQ_PG_SIZE_PG_2M : - pbl->pg_size == ROCE_PG_SIZE_8M ? - CMDQ_CREATE_QP_SQ_PG_SIZE_PG_8M : - pbl->pg_size == ROCE_PG_SIZE_1G ? - CMDQ_CREATE_QP_SQ_PG_SIZE_PG_1G : - CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K); + pg_sz_lvl = (bnxt_qplib_base_pg_size(&sq->hwq) << + CMDQ_CREATE_QP_SQ_PG_SIZE_SFT); + pg_sz_lvl |= (sq->hwq.level & CMDQ_CREATE_QP_SQ_LVL_MASK); + req.sq_pg_size_sq_lvl = pg_sz_lvl; if (qp->scq) req.scq_cid = cpu_to_le32(qp->scq->id); - qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_RESERVED_LKEY_ENABLE; - qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FR_PMR_ENABLED; - if (qp->sig_type) - qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION; - /* RQ */ if (rq->max_wqe) { hwq_attr.res = res; @@ -1071,22 +1019,10 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) } pbl = &rq->hwq.pbl[PBL_LVL_0]; req.rq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); - req.rq_pg_size_rq_lvl = - ((rq->hwq.level & CMDQ_CREATE_QP_RQ_LVL_MASK) << - CMDQ_CREATE_QP_RQ_LVL_SFT) | - (pbl->pg_size == ROCE_PG_SIZE_4K ? - CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K : - pbl->pg_size == ROCE_PG_SIZE_8K ? - CMDQ_CREATE_QP_RQ_PG_SIZE_PG_8K : - pbl->pg_size == ROCE_PG_SIZE_64K ? - CMDQ_CREATE_QP_RQ_PG_SIZE_PG_64K : - pbl->pg_size == ROCE_PG_SIZE_2M ? - CMDQ_CREATE_QP_RQ_PG_SIZE_PG_2M : - pbl->pg_size == ROCE_PG_SIZE_8M ? - CMDQ_CREATE_QP_RQ_PG_SIZE_PG_8M : - pbl->pg_size == ROCE_PG_SIZE_1G ? - CMDQ_CREATE_QP_RQ_PG_SIZE_PG_1G : - CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K); + pg_sz_lvl = (bnxt_qplib_base_pg_size(&rq->hwq) << + CMDQ_CREATE_QP_RQ_PG_SIZE_SFT); + pg_sz_lvl |= (rq->hwq.level & CMDQ_CREATE_QP_RQ_LVL_MASK); + req.rq_pg_size_rq_lvl = pg_sz_lvl; } else { /* SRQ */ if (qp->srq) { @@ -1097,7 +1033,13 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (qp->rcq) req.rcq_cid = cpu_to_le32(qp->rcq->id); + + qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_RESERVED_LKEY_ENABLE; + qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FR_PMR_ENABLED; + if (qp->sig_type) + qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION; req.qp_flags = cpu_to_le32(qp_flags); + req.sq_size = cpu_to_le32(sq->hwq.max_elements); req.rq_size = cpu_to_le32(rq->hwq.max_elements); qp->sq_hdr_buf = NULL; @@ -2000,6 +1942,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) struct cmdq_create_cq req; struct bnxt_qplib_pbl *pbl; u16 cmd_flags = 0; + u32 pg_sz_lvl; int rc; hwq_attr.res = res; @@ -2020,22 +1963,13 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) } req.dpi = cpu_to_le32(cq->dpi->dpi); req.cq_handle = cpu_to_le64(cq->cq_handle); - req.cq_size = cpu_to_le32(cq->hwq.max_elements); pbl = &cq->hwq.pbl[PBL_LVL_0]; - req.pg_size_lvl = cpu_to_le32( - ((cq->hwq.level & CMDQ_CREATE_CQ_LVL_MASK) << - CMDQ_CREATE_CQ_LVL_SFT) | - (pbl->pg_size == ROCE_PG_SIZE_4K ? CMDQ_CREATE_CQ_PG_SIZE_PG_4K : - pbl->pg_size == ROCE_PG_SIZE_8K ? CMDQ_CREATE_CQ_PG_SIZE_PG_8K : - pbl->pg_size == ROCE_PG_SIZE_64K ? CMDQ_CREATE_CQ_PG_SIZE_PG_64K : - pbl->pg_size == ROCE_PG_SIZE_2M ? CMDQ_CREATE_CQ_PG_SIZE_PG_2M : - pbl->pg_size == ROCE_PG_SIZE_8M ? CMDQ_CREATE_CQ_PG_SIZE_PG_8M : - pbl->pg_size == ROCE_PG_SIZE_1G ? CMDQ_CREATE_CQ_PG_SIZE_PG_1G : - CMDQ_CREATE_CQ_PG_SIZE_PG_4K)); - + pg_sz_lvl = (bnxt_qplib_base_pg_size(&cq->hwq) << + CMDQ_CREATE_CQ_PG_SIZE_SFT); + pg_sz_lvl |= (cq->hwq.level & CMDQ_CREATE_CQ_LVL_MASK); + req.pg_size_lvl = cpu_to_le32(pg_sz_lvl); req.pbl = cpu_to_le64(pbl->pg_map_arr[0]); - req.cq_fco_cnq_id = cpu_to_le32( (cq->cnq_hw_ring_id & CMDQ_CREATE_CQ_CNQ_ID_MASK) << CMDQ_CREATE_CQ_CNQ_ID_SFT); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index f01e864bb611..fe5e06f85ffc 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -468,29 +468,13 @@ int bnxt_qplib_deinit_rcfw(struct bnxt_qplib_rcfw *rcfw) return 0; } -static int __get_pbl_pg_idx(struct bnxt_qplib_pbl *pbl) -{ - return (pbl->pg_size == ROCE_PG_SIZE_4K ? - CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_4K : - pbl->pg_size == ROCE_PG_SIZE_8K ? - CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_8K : - pbl->pg_size == ROCE_PG_SIZE_64K ? - CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_64K : - pbl->pg_size == ROCE_PG_SIZE_2M ? - CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_2M : - pbl->pg_size == ROCE_PG_SIZE_8M ? - CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_8M : - pbl->pg_size == ROCE_PG_SIZE_1G ? - CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_1G : - CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_4K); -} - int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_ctx *ctx, int is_virtfn) { - struct cmdq_initialize_fw req; struct creq_initialize_fw_resp resp; - u16 cmd_flags = 0, level; + struct cmdq_initialize_fw req; + u16 cmd_flags = 0; + u8 pgsz, lvl; int rc; RCFW_CMD_PREP(req, INITIALIZE_FW, cmd_flags); @@ -511,32 +495,30 @@ int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw, if (bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx)) goto config_vf_res; - level = ctx->qpc_tbl.level; - req.qpc_pg_size_qpc_lvl = (level << CMDQ_INITIALIZE_FW_QPC_LVL_SFT) | - __get_pbl_pg_idx(&ctx->qpc_tbl.pbl[level]); - level = ctx->mrw_tbl.level; - req.mrw_pg_size_mrw_lvl = (level << CMDQ_INITIALIZE_FW_MRW_LVL_SFT) | - __get_pbl_pg_idx(&ctx->mrw_tbl.pbl[level]); - level = ctx->srqc_tbl.level; - req.srq_pg_size_srq_lvl = (level << CMDQ_INITIALIZE_FW_SRQ_LVL_SFT) | - __get_pbl_pg_idx(&ctx->srqc_tbl.pbl[level]); - level = ctx->cq_tbl.level; - req.cq_pg_size_cq_lvl = (level << CMDQ_INITIALIZE_FW_CQ_LVL_SFT) | - __get_pbl_pg_idx(&ctx->cq_tbl.pbl[level]); - level = ctx->srqc_tbl.level; - req.srq_pg_size_srq_lvl = (level << CMDQ_INITIALIZE_FW_SRQ_LVL_SFT) | - __get_pbl_pg_idx(&ctx->srqc_tbl.pbl[level]); - level = ctx->cq_tbl.level; - req.cq_pg_size_cq_lvl = (level << CMDQ_INITIALIZE_FW_CQ_LVL_SFT) | - __get_pbl_pg_idx(&ctx->cq_tbl.pbl[level]); - level = ctx->tim_tbl.level; - req.tim_pg_size_tim_lvl = (level << CMDQ_INITIALIZE_FW_TIM_LVL_SFT) | - __get_pbl_pg_idx(&ctx->tim_tbl.pbl[level]); - level = ctx->tqm_ctx.pde.level; - req.tqm_pg_size_tqm_lvl = - (level << CMDQ_INITIALIZE_FW_TQM_LVL_SFT) | - __get_pbl_pg_idx(&ctx->tqm_ctx.pde.pbl[level]); - + lvl = ctx->qpc_tbl.level; + pgsz = bnxt_qplib_base_pg_size(&ctx->qpc_tbl); + req.qpc_pg_size_qpc_lvl = (pgsz << CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT) | + lvl; + lvl = ctx->mrw_tbl.level; + pgsz = bnxt_qplib_base_pg_size(&ctx->mrw_tbl); + req.mrw_pg_size_mrw_lvl = (pgsz << CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT) | + lvl; + lvl = ctx->srqc_tbl.level; + pgsz = bnxt_qplib_base_pg_size(&ctx->srqc_tbl); + req.srq_pg_size_srq_lvl = (pgsz << CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT) | + lvl; + lvl = ctx->cq_tbl.level; + pgsz = bnxt_qplib_base_pg_size(&ctx->cq_tbl); + req.cq_pg_size_cq_lvl = (pgsz << CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT) | + lvl; + lvl = ctx->tim_tbl.level; + pgsz = bnxt_qplib_base_pg_size(&ctx->tim_tbl); + req.tim_pg_size_tim_lvl = (pgsz << CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT) | + lvl; + lvl = ctx->tqm_ctx.pde.level; + pgsz = bnxt_qplib_base_pg_size(&ctx->tqm_ctx.pde); + req.tqm_pg_size_tqm_lvl = (pgsz << CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT) | + lvl; req.qpc_page_dir = cpu_to_le64(ctx->qpc_tbl.pbl[PBL_LVL_0].pg_map_arr[0]); req.mrw_page_dir = diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 95b645dbbc2d..79109ef6c70c 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -80,6 +80,15 @@ enum bnxt_qplib_pbl_lvl { #define ROCE_PG_SIZE_8M (8 * 1024 * 1024) #define ROCE_PG_SIZE_1G (1024 * 1024 * 1024) +enum bnxt_qplib_hwrm_pg_size { + BNXT_QPLIB_HWRM_PG_SIZE_4K = 0, + BNXT_QPLIB_HWRM_PG_SIZE_8K = 1, + BNXT_QPLIB_HWRM_PG_SIZE_64K = 2, + BNXT_QPLIB_HWRM_PG_SIZE_2M = 3, + BNXT_QPLIB_HWRM_PG_SIZE_8M = 4, + BNXT_QPLIB_HWRM_PG_SIZE_1G = 5, +}; + struct bnxt_qplib_reg_desc { u8 bar_id; resource_size_t bar_base; @@ -263,6 +272,37 @@ static inline u8 bnxt_qplib_get_ring_type(struct bnxt_qplib_chip_ctx *cctx) RING_ALLOC_REQ_RING_TYPE_ROCE_CMPL; } +static inline u8 bnxt_qplib_base_pg_size(struct bnxt_qplib_hwq *hwq) +{ + u8 pg_size = BNXT_QPLIB_HWRM_PG_SIZE_4K; + struct bnxt_qplib_pbl *pbl; + + pbl = &hwq->pbl[PBL_LVL_0]; + switch (pbl->pg_size) { + case ROCE_PG_SIZE_4K: + pg_size = BNXT_QPLIB_HWRM_PG_SIZE_4K; + break; + case ROCE_PG_SIZE_8K: + pg_size = BNXT_QPLIB_HWRM_PG_SIZE_8K; + break; + case ROCE_PG_SIZE_64K: + pg_size = BNXT_QPLIB_HWRM_PG_SIZE_64K; + break; + case ROCE_PG_SIZE_2M: + pg_size = BNXT_QPLIB_HWRM_PG_SIZE_2M; + break; + case ROCE_PG_SIZE_8M: + pg_size = BNXT_QPLIB_HWRM_PG_SIZE_8M; + break; + case ROCE_PG_SIZE_1G: + pg_size = BNXT_QPLIB_HWRM_PG_SIZE_1G; + break; + default: + break; + } + + return pg_size; +} #define to_bnxt_qplib(ptr, type, member) \ container_of(ptr, type, member) -- cgit v1.2.3 From c78671a4e65ae0b2e639ea61b4c65842c4200f2d Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 2 Apr 2020 14:12:13 -0400 Subject: RDMA/bnxt_re: Update missing hsi data structures Adding fast path support data structure into hardware HSI. These structures are header only definition of RQE/SRQE/SQE. This is to help calculating the size of hardware wqe size. Link: https://lore.kernel.org/r/1585851136-2316-3-git-send-email-devesh.sharma@broadcom.com Signed-off-by: Devesh Sharma Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/roce_hsi.h | 106 +++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index e4b09e7c2175..6f00f07420b7 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -210,6 +210,20 @@ struct sq_send { __le32 data[24]; }; +/* sq_send_hdr (size:256b/32B) */ +struct sq_send_hdr { + u8 wqe_type; + u8 flags; + u8 wqe_size; + u8 reserved8_1; + __le32 inv_key_or_imm_data; + __le32 length; + __le32 q_key; + __le32 dst_qp; + __le32 avid; + __le64 reserved64; +}; + /* Send Raw Ethernet and QP1 SQ WQE (40 bytes) */ struct sq_send_raweth_qp1 { u8 wqe_type; @@ -265,6 +279,21 @@ struct sq_send_raweth_qp1 { __le32 data[24]; }; +/* sq_send_raweth_qp1_hdr (size:256b/32B) */ +struct sq_send_raweth_qp1_hdr { + u8 wqe_type; + u8 flags; + u8 wqe_size; + u8 reserved8; + __le16 lflags; + __le16 cfa_action; + __le32 length; + __le32 reserved32_1; + __le32 cfa_meta; + __le32 reserved32_2; + __le64 reserved64; +}; + /* RDMA SQ WQE (40 bytes) */ struct sq_rdma { u8 wqe_type; @@ -288,6 +317,20 @@ struct sq_rdma { __le32 data[24]; }; +/* sq_rdma_hdr (size:256b/32B) */ +struct sq_rdma_hdr { + u8 wqe_type; + u8 flags; + u8 wqe_size; + u8 reserved8; + __le32 imm_data; + __le32 length; + __le32 reserved32_1; + __le64 remote_va; + __le32 remote_key; + __le32 reserved32_2; +}; + /* Atomic SQ WQE (40 bytes) */ struct sq_atomic { u8 wqe_type; @@ -307,6 +350,17 @@ struct sq_atomic { __le32 data[24]; }; +/* sq_atomic_hdr (size:256b/32B) */ +struct sq_atomic_hdr { + u8 wqe_type; + u8 flags; + __le16 reserved16; + __le32 remote_key; + __le64 remote_va; + __le64 swap_data; + __le64 cmp_data; +}; + /* Local Invalidate SQ WQE (40 bytes) */ struct sq_localinvalidate { u8 wqe_type; @@ -324,6 +378,16 @@ struct sq_localinvalidate { __le32 data[24]; }; +/* sq_localinvalidate_hdr (size:256b/32B) */ +struct sq_localinvalidate_hdr { + u8 wqe_type; + u8 flags; + __le16 reserved16; + __le32 inv_l_key; + __le64 reserved64; + u8 reserved128[16]; +}; + /* FR-PMR SQ WQE (40 bytes) */ struct sq_fr_pmr { u8 wqe_type; @@ -380,6 +444,21 @@ struct sq_fr_pmr { __le32 data[24]; }; +/* sq_fr_pmr_hdr (size:256b/32B) */ +struct sq_fr_pmr_hdr { + u8 wqe_type; + u8 flags; + u8 access_cntl; + u8 zero_based_page_size_log; + __le32 l_key; + u8 length[5]; + u8 reserved8_1; + u8 reserved8_2; + u8 numlevels_pbl_page_size_log; + __le64 pblptr; + __le64 va; +}; + /* Bind SQ WQE (40 bytes) */ struct sq_bind { u8 wqe_type; @@ -417,6 +496,22 @@ struct sq_bind { #define SQ_BIND_DATA_SFT 0 }; +/* sq_bind_hdr (size:256b/32B) */ +struct sq_bind_hdr { + u8 wqe_type; + u8 flags; + u8 access_cntl; + u8 reserved8_1; + u8 mw_type_zero_based; + u8 reserved8_2; + __le16 reserved16; + __le32 parent_l_key; + __le32 l_key; + __le64 va; + u8 length[5]; + u8 reserved24[3]; +}; + /* RQ/SRQ WQE Structures */ /* RQ/SRQ WQE (40 bytes) */ struct rq_wqe { @@ -435,6 +530,17 @@ struct rq_wqe { __le32 data[24]; }; +/* rq_wqe_hdr (size:256b/32B) */ +struct rq_wqe_hdr { + u8 wqe_type; + u8 flags; + u8 wqe_size; + u8 reserved8; + __le32 reserved32; + __le32 wr_id[2]; + u8 reserved128[16]; +}; + /* CQ CQE Structures */ /* Base CQE (32 bytes) */ struct cq_base { -- cgit v1.2.3 From fddcbbb02af42a5d6ec0c6ed38f823cc9dba1414 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 2 Apr 2020 14:12:14 -0400 Subject: RDMA/bnxt_re: Simplify obtaining queue entry from hw ring Restructring the data path and control path queue management code to simplify the way a queue element is extracted from the hardware ring. Introduced a new function which will give a pointer to the next ring item depending upon the current cons/prod index in the hardware queue. Further, there are hardcoding when size of queue entry is calculated, replacing it with an inline function. This function would be easier to expand if need going forward. The code section to initialize the PSN search areas has also been restructured and couple of functions has been added there. Link: https://lore.kernel.org/r/1585851136-2316-4-git-send-email-devesh.sharma@broadcom.com Signed-off-by: Devesh Sharma Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 65 +++++---- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 10 ++ drivers/infiniband/hw/bnxt_re/qplib_fp.c | 216 ++++++++++++++--------------- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 42 +----- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 16 +-- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 41 ------ drivers/infiniband/hw/bnxt_re/qplib_res.c | 1 + drivers/infiniband/hw/bnxt_re/qplib_res.h | 13 ++ 8 files changed, 176 insertions(+), 228 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 95f6d493d1b9..d98348e82422 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -856,7 +856,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) return -EFAULT; - bytes = (qplib_qp->sq.max_wqe * BNXT_QPLIB_MAX_SQE_ENTRY_SIZE); + bytes = (qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size); /* Consider mapping PSN search memory only for RC QPs. */ if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC) { psn_sz = bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ? @@ -879,7 +879,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, qplib_qp->qp_handle = ureq.qp_handle; if (!qp->qplib_qp.srq) { - bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + bytes = (qplib_qp->rq.max_wqe * qplib_qp->rq.wqe_size); bytes = PAGE_ALIGN(bytes); umem = ib_umem_get(&rdev->ibdev, ureq.qprva, bytes, IB_ACCESS_LOCAL_WRITE); @@ -976,6 +976,7 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp qp->qplib_qp.sig_type = true; /* Shadow QP SQ depth should be same as QP1 RQ depth */ + qp->qplib_qp.sq.wqe_size = bnxt_re_get_swqe_size(); qp->qplib_qp.sq.max_wqe = qp1_qp->rq.max_wqe; qp->qplib_qp.sq.max_sge = 2; /* Q full delta can be 1 since it is internal QP */ @@ -986,6 +987,7 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp qp->qplib_qp.scq = qp1_qp->scq; qp->qplib_qp.rcq = qp1_qp->rcq; + qp->qplib_qp.rq.wqe_size = bnxt_re_get_rwqe_size(); qp->qplib_qp.rq.max_wqe = qp1_qp->rq.max_wqe; qp->qplib_qp.rq.max_sge = qp1_qp->rq.max_sge; /* Q full delta can be 1 since it is internal QP */ @@ -1021,10 +1023,12 @@ static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp, struct bnxt_qplib_dev_attr *dev_attr; struct bnxt_qplib_qp *qplqp; struct bnxt_re_dev *rdev; + struct bnxt_qplib_q *rq; int entries; rdev = qp->rdev; qplqp = &qp->qplib_qp; + rq = &qplqp->rq; dev_attr = &rdev->dev_attr; if (init_attr->srq) { @@ -1036,23 +1040,21 @@ static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp, return -EINVAL; } qplqp->srq = &srq->qplib_srq; - qplqp->rq.max_wqe = 0; + rq->max_wqe = 0; } else { + rq->wqe_size = bnxt_re_get_rwqe_size(); /* Allocate 1 more than what's provided so posting max doesn't * mean empty. */ entries = roundup_pow_of_two(init_attr->cap.max_recv_wr + 1); - qplqp->rq.max_wqe = min_t(u32, entries, - dev_attr->max_qp_wqes + 1); - - qplqp->rq.q_full_delta = qplqp->rq.max_wqe - - init_attr->cap.max_recv_wr; - qplqp->rq.max_sge = init_attr->cap.max_recv_sge; - if (qplqp->rq.max_sge > dev_attr->max_qp_sges) - qplqp->rq.max_sge = dev_attr->max_qp_sges; + rq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1); + rq->q_full_delta = rq->max_wqe - init_attr->cap.max_recv_wr; + rq->max_sge = init_attr->cap.max_recv_sge; + if (rq->max_sge > dev_attr->max_qp_sges) + rq->max_sge = dev_attr->max_qp_sges; } - qplqp->rq.sg_info.pgsize = PAGE_SIZE; - qplqp->rq.sg_info.pgshft = PAGE_SHIFT; + rq->sg_info.pgsize = PAGE_SIZE; + rq->sg_info.pgshft = PAGE_SHIFT; return 0; } @@ -1080,15 +1082,18 @@ static void bnxt_re_init_sq_attr(struct bnxt_re_qp *qp, struct bnxt_qplib_dev_attr *dev_attr; struct bnxt_qplib_qp *qplqp; struct bnxt_re_dev *rdev; + struct bnxt_qplib_q *sq; int entries; rdev = qp->rdev; qplqp = &qp->qplib_qp; + sq = &qplqp->sq; dev_attr = &rdev->dev_attr; - qplqp->sq.max_sge = init_attr->cap.max_send_sge; - if (qplqp->sq.max_sge > dev_attr->max_qp_sges) - qplqp->sq.max_sge = dev_attr->max_qp_sges; + sq->wqe_size = bnxt_re_get_swqe_size(); + sq->max_sge = init_attr->cap.max_send_sge; + if (sq->max_sge > dev_attr->max_qp_sges) + sq->max_sge = dev_attr->max_qp_sges; /* * Change the SQ depth if user has requested minimum using * configfs. Only supported for kernel consumers @@ -1096,9 +1101,9 @@ static void bnxt_re_init_sq_attr(struct bnxt_re_qp *qp, entries = init_attr->cap.max_send_wr; /* Allocate 128 + 1 more than what's provided */ entries = roundup_pow_of_two(entries + BNXT_QPLIB_RESERVED_QP_WRS + 1); - qplqp->sq.max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + - BNXT_QPLIB_RESERVED_QP_WRS + 1); - qplqp->sq.q_full_delta = BNXT_QPLIB_RESERVED_QP_WRS + 1; + sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + + BNXT_QPLIB_RESERVED_QP_WRS + 1); + sq->q_full_delta = BNXT_QPLIB_RESERVED_QP_WRS + 1; /* * Reserving one slot for Phantom WQE. Application can * post one extra entry in this case. But allowing this to avoid @@ -1511,7 +1516,7 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev, if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) return -EFAULT; - bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + bytes = (qplib_srq->max_wqe * qplib_srq->wqe_size); bytes = PAGE_ALIGN(bytes); umem = ib_umem_get(&rdev->ibdev, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE); @@ -1534,15 +1539,20 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, struct ib_srq_init_attr *srq_init_attr, struct ib_udata *udata) { - struct ib_pd *ib_pd = ib_srq->pd; - struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); - struct bnxt_re_dev *rdev = pd->rdev; - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; - struct bnxt_re_srq *srq = - container_of(ib_srq, struct bnxt_re_srq, ib_srq); + struct bnxt_qplib_dev_attr *dev_attr; struct bnxt_qplib_nq *nq = NULL; + struct bnxt_re_dev *rdev; + struct bnxt_re_srq *srq; + struct bnxt_re_pd *pd; + struct ib_pd *ib_pd; int rc, entries; + ib_pd = ib_srq->pd; + pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + rdev = pd->rdev; + dev_attr = &rdev->dev_attr; + srq = container_of(ib_srq, struct bnxt_re_srq, ib_srq); + if (srq_init_attr->attr.max_wr >= dev_attr->max_srq_wqes) { ibdev_err(&rdev->ibdev, "Create CQ failed - max exceeded"); rc = -EINVAL; @@ -1563,8 +1573,9 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, entries = roundup_pow_of_two(srq_init_attr->attr.max_wr + 1); if (entries > dev_attr->max_srq_wqes + 1) entries = dev_attr->max_srq_wqes + 1; - srq->qplib_srq.max_wqe = entries; + + srq->qplib_srq.wqe_size = bnxt_re_get_rwqe_size(); srq->qplib_srq.max_sge = srq_init_attr->attr.max_sge; srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit; srq->srq_limit = srq_init_attr->attr.srq_limit; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 23d972da5652..18dd46f46cf4 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -142,6 +142,16 @@ struct bnxt_re_ucontext { spinlock_t sh_lock; /* protect shpg */ }; +static inline u16 bnxt_re_get_swqe_size(void) +{ + return sizeof(struct sq_send); +} + +static inline u16 bnxt_re_get_rwqe_size(void) +{ + return sizeof(struct rq_wqe); +} + int bnxt_re_query_device(struct ib_device *ibdev, struct ib_device_attr *ib_attr, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index d3bf9f665982..a4de56bdd6e8 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -301,9 +301,9 @@ static void bnxt_qplib_service_nq(unsigned long data) struct bnxt_qplib_nq *nq = (struct bnxt_qplib_nq *)data; struct bnxt_qplib_hwq *hwq = &nq->hwq; struct nq_base *nqe, **nq_ptr; - struct bnxt_qplib_cq *cq; - int num_cqne_processed = 0; int num_srqne_processed = 0; + int num_cqne_processed = 0; + struct bnxt_qplib_cq *cq; int budget = nq->budget; u32 sw_cons, raw_cons; uintptr_t q_handle; @@ -315,7 +315,7 @@ static void bnxt_qplib_service_nq(unsigned long data) while (budget--) { sw_cons = HWQ_CMP(raw_cons, hwq); nq_ptr = (struct nq_base **)hwq->pbl_ptr; - nqe = &nq_ptr[NQE_PG(sw_cons)][NQE_IDX(sw_cons)]; + nqe = bnxt_qplib_get_qe(hwq, sw_cons, NULL); if (!NQE_CMP_VALID(nqe, raw_cons, hwq->max_elements)) break; @@ -392,13 +392,11 @@ static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance) { struct bnxt_qplib_nq *nq = dev_instance; struct bnxt_qplib_hwq *hwq = &nq->hwq; - struct nq_base **nq_ptr; u32 sw_cons; /* Prefetch the NQ element */ sw_cons = HWQ_CMP(hwq->cons, hwq); - nq_ptr = (struct nq_base **)nq->hwq.pbl_ptr; - prefetch(&nq_ptr[NQE_PG(sw_cons)][NQE_IDX(sw_cons)]); + prefetch(bnxt_qplib_get_qe(hwq, sw_cons, NULL)); /* Fan out to CPU affinitized kthreads? */ tasklet_schedule(&nq->nq_tasklet); @@ -618,7 +616,7 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, hwq_attr.res = res; hwq_attr.sginfo = &srq->sg_info; hwq_attr.depth = srq->max_wqe; - hwq_attr.stride = BNXT_QPLIB_MAX_RQE_ENTRY_SIZE; + hwq_attr.stride = srq->wqe_size; hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&srq->hwq, &hwq_attr); if (rc) @@ -730,7 +728,7 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, struct bnxt_qplib_swqe *wqe) { struct bnxt_qplib_hwq *srq_hwq = &srq->hwq; - struct rq_wqe *srqe, **srqe_ptr; + struct rq_wqe *srqe; struct sq_sge *hw_sge; u32 sw_prod, sw_cons, count = 0; int i, rc = 0, next; @@ -748,9 +746,8 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, spin_unlock(&srq_hwq->lock); sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq); - srqe_ptr = (struct rq_wqe **)srq_hwq->pbl_ptr; - srqe = &srqe_ptr[RQE_PG(sw_prod)][RQE_IDX(sw_prod)]; - memset(srqe, 0, BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + srqe = bnxt_qplib_get_qe(srq_hwq, sw_prod, NULL); + memset(srqe, 0, srq->wqe_size); /* Calculate wqe_size16 and data_len */ for (i = 0, hw_sge = (struct sq_sge *)srqe->data; i < wqe->num_sge; i++, hw_sge++) { @@ -813,7 +810,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &sq->sg_info; hwq_attr.depth = sq->max_wqe; - hwq_attr.stride = BNXT_QPLIB_MAX_SQE_ENTRY_SIZE; + hwq_attr.stride = sq->wqe_size; hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr); if (rc) @@ -837,7 +834,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (rq->max_wqe) { hwq_attr.res = res; hwq_attr.sginfo = &rq->sg_info; - hwq_attr.stride = BNXT_QPLIB_MAX_RQE_ENTRY_SIZE; + hwq_attr.stride = rq->wqe_size; hwq_attr.depth = qp->rq.max_wqe; hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr); @@ -912,22 +909,45 @@ exit: return rc; } +static void bnxt_qplib_init_psn_ptr(struct bnxt_qplib_qp *qp, int size) +{ + struct bnxt_qplib_hwq *hwq; + struct bnxt_qplib_q *sq; + u64 fpsne, psne, psn_pg; + u16 indx_pad = 0, indx; + u16 pg_num, pg_indx; + u64 *page; + + sq = &qp->sq; + hwq = &sq->hwq; + + fpsne = (u64)bnxt_qplib_get_qe(hwq, hwq->max_elements, &psn_pg); + if (!IS_ALIGNED(fpsne, PAGE_SIZE)) + indx_pad = ALIGN(fpsne, PAGE_SIZE) / size; + + page = (u64 *)psn_pg; + for (indx = 0; indx < hwq->max_elements; indx++) { + pg_num = (indx + indx_pad) / (PAGE_SIZE / size); + pg_indx = (indx + indx_pad) % (PAGE_SIZE / size); + psne = page[pg_num] + pg_indx * size; + sq->swq[indx].psn_ext = (struct sq_psn_search_ext *)psne; + sq->swq[indx].psn_search = (struct sq_psn_search *)psne; + } +} + int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct bnxt_qplib_hwq_attr hwq_attr = {}; - unsigned long int psn_search, poff = 0; struct bnxt_qplib_sg_info sginfo = {}; - struct sq_psn_search **psn_search_ptr; struct bnxt_qplib_q *sq = &qp->sq; struct bnxt_qplib_q *rq = &qp->rq; - int i, rc, req_size, psn_sz = 0; - struct sq_send **hw_sq_send_ptr; struct creq_create_qp_resp resp; + int rc, req_size, psn_sz = 0; struct bnxt_qplib_hwq *xrrq; u16 cmd_flags = 0, max_ssge; - struct cmdq_create_qp req; struct bnxt_qplib_pbl *pbl; + struct cmdq_create_qp req; u32 qp_flags = 0; u8 pg_sz_lvl; u16 max_rsge; @@ -948,7 +968,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &sq->sg_info; - hwq_attr.stride = BNXT_QPLIB_MAX_SQE_ENTRY_SIZE; + hwq_attr.stride = sq->wqe_size; hwq_attr.depth = sq->max_wqe; hwq_attr.aux_stride = psn_sz; hwq_attr.aux_depth = hwq_attr.depth; @@ -962,32 +982,10 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) rc = -ENOMEM; goto fail_sq; } - hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr; - if (psn_sz) { - psn_search_ptr = (struct sq_psn_search **) - &hw_sq_send_ptr[get_sqe_pg - (sq->hwq.max_elements)]; - psn_search = (unsigned long int) - &hw_sq_send_ptr[get_sqe_pg(sq->hwq.max_elements)] - [get_sqe_idx(sq->hwq.max_elements)]; - if (psn_search & ~PAGE_MASK) { - /* If the psn_search does not start on a page boundary, - * then calculate the offset - */ - poff = (psn_search & ~PAGE_MASK) / - BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE; - } - for (i = 0; i < sq->hwq.max_elements; i++) { - sq->swq[i].psn_search = - &psn_search_ptr[get_psne_pg(i + poff)] - [get_psne_idx(i + poff)]; - /*psns_ext will be used only for P5 chips. */ - sq->swq[i].psn_ext = - (struct sq_psn_search_ext *) - &psn_search_ptr[get_psne_pg(i + poff)] - [get_psne_idx(i + poff)]; - } - } + + if (psn_sz) + bnxt_qplib_init_psn_ptr(qp, psn_sz); + pbl = &sq->hwq.pbl[PBL_LVL_0]; req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]); pg_sz_lvl = (bnxt_qplib_base_pg_size(&sq->hwq) << @@ -1002,7 +1000,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) if (rq->max_wqe) { hwq_attr.res = res; hwq_attr.sginfo = &rq->sg_info; - hwq_attr.stride = BNXT_QPLIB_MAX_RQE_ENTRY_SIZE; + hwq_attr.stride = rq->wqe_size; hwq_attr.depth = rq->max_wqe; hwq_attr.aux_stride = 0; hwq_attr.aux_depth = 0; @@ -1425,12 +1423,11 @@ bail: static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp) { struct bnxt_qplib_hwq *cq_hwq = &cq->hwq; - struct cq_base *hw_cqe, **hw_cqe_ptr; + struct cq_base *hw_cqe; int i; for (i = 0; i < cq_hwq->max_elements; i++) { - hw_cqe_ptr = (struct cq_base **)cq_hwq->pbl_ptr; - hw_cqe = &hw_cqe_ptr[CQE_PG(i)][CQE_IDX(i)]; + hw_cqe = bnxt_qplib_get_qe(cq_hwq, i, NULL); if (!CQE_CMP_VALID(hw_cqe, i, cq_hwq->max_elements)) continue; /* @@ -1557,6 +1554,34 @@ void *bnxt_qplib_get_qp1_rq_buf(struct bnxt_qplib_qp *qp, return NULL; } +static void bnxt_qplib_fill_psn_search(struct bnxt_qplib_qp *qp, + struct bnxt_qplib_swqe *wqe, + struct bnxt_qplib_swq *swq) +{ + struct sq_psn_search_ext *psns_ext; + struct sq_psn_search *psns; + u32 flg_npsn; + u32 op_spsn; + + psns = swq->psn_search; + psns_ext = swq->psn_ext; + + op_spsn = ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) & + SQ_PSN_SEARCH_START_PSN_MASK); + op_spsn |= ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) & + SQ_PSN_SEARCH_OPCODE_MASK); + flg_npsn = ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) & + SQ_PSN_SEARCH_NEXT_PSN_MASK); + + if (bnxt_qplib_is_chip_gen_p5(qp->cctx)) { + psns_ext->opcode_start_psn = cpu_to_le32(op_spsn); + psns_ext->flags_next_psn = cpu_to_le32(flg_npsn); + } else { + psns->opcode_start_psn = cpu_to_le32(op_spsn); + psns->flags_next_psn = cpu_to_le32(flg_npsn); + } +} + void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp) { struct bnxt_qplib_q *sq = &qp->sq; @@ -1567,16 +1592,16 @@ void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp) int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, struct bnxt_qplib_swqe *wqe) { + struct bnxt_qplib_nq_work *nq_work = NULL; + int i, rc = 0, data_len = 0, pkt_num = 0; struct bnxt_qplib_q *sq = &qp->sq; + struct sq_send *hw_sq_send_hdr; struct bnxt_qplib_swq *swq; - struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr; - struct sq_sge *hw_sge; - struct bnxt_qplib_nq_work *nq_work = NULL; bool sch_handler = false; - u32 sw_prod; + struct sq_sge *hw_sge; u8 wqe_size16; - int i, rc = 0, data_len = 0, pkt_num = 0; __le32 temp32; + u32 sw_prod; if (qp->state != CMDQ_MODIFY_QP_NEW_STATE_RTS) { if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) { @@ -1605,11 +1630,8 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, swq->flags |= SQ_SEND_FLAGS_SIGNAL_COMP; swq->start_psn = sq->psn & BTH_PSN_MASK; - hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr; - hw_sq_send_hdr = &hw_sq_send_ptr[get_sqe_pg(sw_prod)] - [get_sqe_idx(sw_prod)]; - - memset(hw_sq_send_hdr, 0, BNXT_QPLIB_MAX_SQE_ENTRY_SIZE); + hw_sq_send_hdr = bnxt_qplib_get_qe(&sq->hwq, sw_prod, NULL); + memset(hw_sq_send_hdr, 0, sq->wqe_size); if (wqe->flags & BNXT_QPLIB_SWQE_FLAGS_INLINE) { /* Copy the inline data */ @@ -1796,28 +1818,8 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, goto done; } swq->next_psn = sq->psn & BTH_PSN_MASK; - if (swq->psn_search) { - u32 opcd_spsn; - u32 flg_npsn; - - opcd_spsn = ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) & - SQ_PSN_SEARCH_START_PSN_MASK); - opcd_spsn |= ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) & - SQ_PSN_SEARCH_OPCODE_MASK); - flg_npsn = ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) & - SQ_PSN_SEARCH_NEXT_PSN_MASK); - if (bnxt_qplib_is_chip_gen_p5(qp->cctx)) { - swq->psn_ext->opcode_start_psn = - cpu_to_le32(opcd_spsn); - swq->psn_ext->flags_next_psn = - cpu_to_le32(flg_npsn); - } else { - swq->psn_search->opcode_start_psn = - cpu_to_le32(opcd_spsn); - swq->psn_search->flags_next_psn = - cpu_to_le32(flg_npsn); - } - } + if (qp->type == CMDQ_CREATE_QP_TYPE_RC) + bnxt_qplib_fill_psn_search(qp, wqe, swq); queue_err: if (sch_handler) { /* Store the ULP info in the software structures */ @@ -1860,13 +1862,13 @@ void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp) int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, struct bnxt_qplib_swqe *wqe) { - struct bnxt_qplib_q *rq = &qp->rq; - struct rq_wqe *rqe, **rqe_ptr; - struct sq_sge *hw_sge; struct bnxt_qplib_nq_work *nq_work = NULL; + struct bnxt_qplib_q *rq = &qp->rq; bool sch_handler = false; - u32 sw_prod; + struct sq_sge *hw_sge; + struct rq_wqe *rqe; int i, rc = 0; + u32 sw_prod; if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) { sch_handler = true; @@ -1883,10 +1885,8 @@ int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq); rq->swq[sw_prod].wr_id = wqe->wr_id; - rqe_ptr = (struct rq_wqe **)rq->hwq.pbl_ptr; - rqe = &rqe_ptr[RQE_PG(sw_prod)][RQE_IDX(sw_prod)]; - - memset(rqe, 0, BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + rqe = bnxt_qplib_get_qe(&rq->hwq, sw_prod, NULL); + memset(rqe, 0, rq->wqe_size); /* Calculate wqe_size16 and data_len */ for (i = 0, hw_sge = (struct sq_sge *)rqe->data; @@ -1939,8 +1939,8 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct bnxt_qplib_hwq_attr hwq_attr = {}; struct creq_create_cq_resp resp; - struct cmdq_create_cq req; struct bnxt_qplib_pbl *pbl; + struct cmdq_create_cq req; u16 cmd_flags = 0; u32 pg_sz_lvl; int rc; @@ -2128,13 +2128,13 @@ void bnxt_qplib_mark_qp_error(void *qp_handle) static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq, u32 cq_cons, u32 sw_sq_cons, u32 cqe_sq_cons) { - struct bnxt_qplib_q *sq = &qp->sq; - struct bnxt_qplib_swq *swq; u32 peek_sw_cq_cons, peek_raw_cq_cons, peek_sq_cons_idx; - struct cq_base *peek_hwcqe, **peek_hw_cqe_ptr; + struct bnxt_qplib_q *sq = &qp->sq; struct cq_req *peek_req_hwcqe; struct bnxt_qplib_qp *peek_qp; struct bnxt_qplib_q *peek_sq; + struct bnxt_qplib_swq *swq; + struct cq_base *peek_hwcqe; int i, rc = 0; /* Normal mode */ @@ -2164,9 +2164,8 @@ static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq, i = cq->hwq.max_elements; while (i--) { peek_sw_cq_cons = HWQ_CMP((peek_sw_cq_cons), &cq->hwq); - peek_hw_cqe_ptr = (struct cq_base **)cq->hwq.pbl_ptr; - peek_hwcqe = &peek_hw_cqe_ptr[CQE_PG(peek_sw_cq_cons)] - [CQE_IDX(peek_sw_cq_cons)]; + peek_hwcqe = bnxt_qplib_get_qe(&cq->hwq, + peek_sw_cq_cons, NULL); /* If the next hwcqe is VALID */ if (CQE_CMP_VALID(peek_hwcqe, peek_raw_cq_cons, cq->hwq.max_elements)) { @@ -2228,11 +2227,11 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe **pcqe, int *budget, u32 cq_cons, struct bnxt_qplib_qp **lib_qp) { - struct bnxt_qplib_qp *qp; - struct bnxt_qplib_q *sq; - struct bnxt_qplib_cqe *cqe; u32 sw_sq_cons, cqe_sq_cons; struct bnxt_qplib_swq *swq; + struct bnxt_qplib_cqe *cqe; + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *sq; int rc = 0; qp = (struct bnxt_qplib_qp *)((unsigned long) @@ -2342,10 +2341,10 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe **pcqe, int *budget) { - struct bnxt_qplib_qp *qp; - struct bnxt_qplib_q *rq; struct bnxt_qplib_srq *srq; struct bnxt_qplib_cqe *cqe; + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *rq; u32 wr_id_idx; int rc = 0; @@ -2417,10 +2416,10 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe **pcqe, int *budget) { - struct bnxt_qplib_qp *qp; - struct bnxt_qplib_q *rq; struct bnxt_qplib_srq *srq; struct bnxt_qplib_cqe *cqe; + struct bnxt_qplib_qp *qp; + struct bnxt_qplib_q *rq; u32 wr_id_idx; int rc = 0; @@ -2495,15 +2494,13 @@ done: bool bnxt_qplib_is_cq_empty(struct bnxt_qplib_cq *cq) { - struct cq_base *hw_cqe, **hw_cqe_ptr; + struct cq_base *hw_cqe; u32 sw_cons, raw_cons; bool rc = true; raw_cons = cq->hwq.cons; sw_cons = HWQ_CMP(raw_cons, &cq->hwq); - hw_cqe_ptr = (struct cq_base **)cq->hwq.pbl_ptr; - hw_cqe = &hw_cqe_ptr[CQE_PG(sw_cons)][CQE_IDX(sw_cons)]; - + hw_cqe = bnxt_qplib_get_qe(&cq->hwq, sw_cons, NULL); /* Check for Valid bit. If the CQE is valid, return false */ rc = !CQE_CMP_VALID(hw_cqe, raw_cons, cq->hwq.max_elements); return rc; @@ -2747,7 +2744,7 @@ int bnxt_qplib_process_flush_list(struct bnxt_qplib_cq *cq, int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, int num_cqes, struct bnxt_qplib_qp **lib_qp) { - struct cq_base *hw_cqe, **hw_cqe_ptr; + struct cq_base *hw_cqe; u32 sw_cons, raw_cons; int budget, rc = 0; @@ -2756,8 +2753,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, while (budget) { sw_cons = HWQ_CMP(raw_cons, &cq->hwq); - hw_cqe_ptr = (struct cq_base **)cq->hwq.pbl_ptr; - hw_cqe = &hw_cqe_ptr[CQE_PG(sw_cons)][CQE_IDX(sw_cons)]; + hw_cqe = bnxt_qplib_get_qe(&cq->hwq, sw_cons, NULL); /* Check for Valid bit */ if (!CQE_CMP_VALID(hw_cqe, raw_cons, cq->hwq.max_elements)) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 7edb70b6bb16..568ca390322c 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -45,6 +45,7 @@ struct bnxt_qplib_srq { struct bnxt_qplib_db_info dbinfo; u64 srq_handle; u32 id; + u16 wqe_size; u32 max_wqe; u32 max_sge; u32 threshold; @@ -65,38 +66,7 @@ struct bnxt_qplib_sge { u32 size; }; -#define BNXT_QPLIB_MAX_SQE_ENTRY_SIZE sizeof(struct sq_send) - -#define SQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_SQE_ENTRY_SIZE) -#define SQE_MAX_IDX_PER_PG (SQE_CNT_PER_PG - 1) - -static inline u32 get_sqe_pg(u32 val) -{ - return ((val & ~SQE_MAX_IDX_PER_PG) / SQE_CNT_PER_PG); -} - -static inline u32 get_sqe_idx(u32 val) -{ - return (val & SQE_MAX_IDX_PER_PG); -} - -#define BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE sizeof(struct sq_psn_search) - -#define PSNE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE) -#define PSNE_MAX_IDX_PER_PG (PSNE_CNT_PER_PG - 1) - -static inline u32 get_psne_pg(u32 val) -{ - return ((val & ~PSNE_MAX_IDX_PER_PG) / PSNE_CNT_PER_PG); -} - -static inline u32 get_psne_idx(u32 val) -{ - return (val & PSNE_MAX_IDX_PER_PG); -} - #define BNXT_QPLIB_QP_MAX_SGL 6 - struct bnxt_qplib_swq { u64 wr_id; int next_idx; @@ -226,19 +196,13 @@ struct bnxt_qplib_swqe { }; }; -#define BNXT_QPLIB_MAX_RQE_ENTRY_SIZE sizeof(struct rq_wqe) - -#define RQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_MAX_RQE_ENTRY_SIZE) -#define RQE_MAX_IDX_PER_PG (RQE_CNT_PER_PG - 1) -#define RQE_PG(x) (((x) & ~RQE_MAX_IDX_PER_PG) / RQE_CNT_PER_PG) -#define RQE_IDX(x) ((x) & RQE_MAX_IDX_PER_PG) - struct bnxt_qplib_q { struct bnxt_qplib_hwq hwq; struct bnxt_qplib_swq *swq; struct bnxt_qplib_db_info dbinfo; struct bnxt_qplib_sg_info sg_info; u32 max_wqe; + u16 wqe_size; u16 q_full_delta; u16 max_sge; u32 psn; @@ -256,7 +220,7 @@ struct bnxt_qplib_qp { struct bnxt_qplib_dpi *dpi; struct bnxt_qplib_chip_ctx *cctx; u64 qp_handle; -#define BNXT_QPLIB_QP_ID_INVALID 0xFFFFFFFF +#define BNXT_QPLIB_QP_ID_INVALID 0xFFFFFFFF u32 id; u8 type; u8 sig_type; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index fe5e06f85ffc..4e211162acee 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -89,10 +89,9 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, struct creq_base *resp, void *sb, u8 is_block) { struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq; - struct bnxt_qplib_cmdqe *cmdqe, **hwq_ptr; struct bnxt_qplib_hwq *hwq = &cmdq->hwq; struct bnxt_qplib_crsqe *crsqe; - u32 cmdq_depth = rcfw->cmdq_depth; + struct bnxt_qplib_cmdqe *cmdqe; u32 sw_prod, cmdq_prod; struct pci_dev *pdev; unsigned long flags; @@ -163,13 +162,11 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, BNXT_QPLIB_CMDQE_UNITS; } - hwq_ptr = (struct bnxt_qplib_cmdqe **)hwq->pbl_ptr; preq = (u8 *)req; do { /* Locate the next cmdq slot */ sw_prod = HWQ_CMP(hwq->prod, hwq); - cmdqe = &hwq_ptr[get_cmdq_pg(sw_prod, cmdq_depth)] - [get_cmdq_idx(sw_prod, cmdq_depth)]; + cmdqe = bnxt_qplib_get_qe(hwq, sw_prod, NULL); if (!cmdqe) { dev_err(&pdev->dev, "RCFW request failed with no cmdqe!\n"); @@ -378,7 +375,7 @@ static void bnxt_qplib_service_creq(unsigned long data) struct bnxt_qplib_creq_ctx *creq = &rcfw->creq; u32 type, budget = CREQ_ENTRY_POLL_BUDGET; struct bnxt_qplib_hwq *hwq = &creq->hwq; - struct creq_base *creqe, **hwq_ptr; + struct creq_base *creqe; u32 sw_cons, raw_cons; unsigned long flags; @@ -387,8 +384,7 @@ static void bnxt_qplib_service_creq(unsigned long data) raw_cons = hwq->cons; while (budget > 0) { sw_cons = HWQ_CMP(raw_cons, hwq); - hwq_ptr = (struct creq_base **)hwq->pbl_ptr; - creqe = &hwq_ptr[get_creq_pg(sw_cons)][get_creq_idx(sw_cons)]; + creqe = bnxt_qplib_get_qe(hwq, sw_cons, NULL); if (!CREQ_CMP_VALID(creqe, raw_cons, hwq->max_elements)) break; /* The valid test of the entry must be done first before @@ -434,7 +430,6 @@ static irqreturn_t bnxt_qplib_creq_irq(int irq, void *dev_instance) { struct bnxt_qplib_rcfw *rcfw = dev_instance; struct bnxt_qplib_creq_ctx *creq; - struct creq_base **creq_ptr; struct bnxt_qplib_hwq *hwq; u32 sw_cons; @@ -442,8 +437,7 @@ static irqreturn_t bnxt_qplib_creq_irq(int irq, void *dev_instance) hwq = &creq->hwq; /* Prefetch the CREQ element */ sw_cons = HWQ_CMP(hwq->cons, hwq); - creq_ptr = (struct creq_base **)creq->hwq.pbl_ptr; - prefetch(&creq_ptr[get_creq_pg(sw_cons)][get_creq_idx(sw_cons)]); + prefetch(bnxt_qplib_get_qe(hwq, sw_cons, NULL)); tasklet_schedule(&creq->creq_tasklet); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 411fce3493b6..bf384098f4b2 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -87,12 +87,6 @@ static inline u32 bnxt_qplib_cmdqe_page_size(u32 depth) return (bnxt_qplib_cmdqe_npages(depth) * PAGE_SIZE); } -static inline u32 bnxt_qplib_cmdqe_cnt_per_pg(u32 depth) -{ - return (bnxt_qplib_cmdqe_page_size(depth) / - BNXT_QPLIB_CMDQE_UNITS); -} - /* Set the cmd_size to a factor of CMDQE unit */ static inline void bnxt_qplib_set_cmd_slots(struct cmdq_base *req) { @@ -100,30 +94,12 @@ static inline void bnxt_qplib_set_cmd_slots(struct cmdq_base *req) BNXT_QPLIB_CMDQE_UNITS; } -#define MAX_CMDQ_IDX(depth) ((depth) - 1) - -static inline u32 bnxt_qplib_max_cmdq_idx_per_pg(u32 depth) -{ - return (bnxt_qplib_cmdqe_cnt_per_pg(depth) - 1); -} - #define RCFW_MAX_COOKIE_VALUE 0x7FFF #define RCFW_CMD_IS_BLOCKING 0x8000 #define RCFW_BLOCKED_CMD_WAIT_COUNT 0x4E20 #define HWRM_VERSION_RCFW_CMDQ_DEPTH_CHECK 0x1000900020011ULL -static inline u32 get_cmdq_pg(u32 val, u32 depth) -{ - return (val & ~(bnxt_qplib_max_cmdq_idx_per_pg(depth))) / - (bnxt_qplib_cmdqe_cnt_per_pg(depth)); -} - -static inline u32 get_cmdq_idx(u32 val, u32 depth) -{ - return val & (bnxt_qplib_max_cmdq_idx_per_pg(depth)); -} - /* Crsq buf is 1024-Byte */ struct bnxt_qplib_crsbe { u8 data[1024]; @@ -133,23 +109,6 @@ struct bnxt_qplib_crsbe { /* Allocate 1 per QP for async error notification for now */ #define BNXT_QPLIB_CREQE_MAX_CNT (64 * 1024) #define BNXT_QPLIB_CREQE_UNITS 16 /* 16-Bytes per prod unit */ -#define BNXT_QPLIB_CREQE_CNT_PER_PG (PAGE_SIZE / BNXT_QPLIB_CREQE_UNITS) - -#define MAX_CREQ_IDX (BNXT_QPLIB_CREQE_MAX_CNT - 1) -#define MAX_CREQ_IDX_PER_PG (BNXT_QPLIB_CREQE_CNT_PER_PG - 1) - -static inline u32 get_creq_pg(u32 val) -{ - return (val & ~MAX_CREQ_IDX_PER_PG) / BNXT_QPLIB_CREQE_CNT_PER_PG; -} - -static inline u32 get_creq_idx(u32 val) -{ - return val & MAX_CREQ_IDX_PER_PG; -} - -#define BNXT_QPLIB_CREQE_PER_PG (PAGE_SIZE / sizeof(struct creq_base)) - #define CREQ_CMP_VALID(hdr, raw_cons, cp_bit) \ (!!((hdr)->v & CREQ_BASE_V) == \ !((raw_cons) & (cp_bit))) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index cab1adf1fed9..7efa6e5dce62 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -347,6 +347,7 @@ done: hwq->depth = hwq_attr->depth; hwq->max_elements = depth; hwq->element_size = stride; + hwq->qe_ppg = pg_size / stride; /* For direct access to the elements */ lvl = hwq->level; if (hwq_attr->sginfo->nopte && hwq->level) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 79109ef6c70c..c29cbd3a2d7b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -135,6 +135,7 @@ struct bnxt_qplib_hwq { u32 max_elements; u32 depth; u16 element_size; /* Size of each entry */ + u16 qe_ppg; /* queue entry per page */ u32 prod; /* raw */ u32 cons; /* raw */ @@ -304,6 +305,18 @@ static inline u8 bnxt_qplib_base_pg_size(struct bnxt_qplib_hwq *hwq) return pg_size; } +static inline void *bnxt_qplib_get_qe(struct bnxt_qplib_hwq *hwq, + u32 indx, u64 *pg) +{ + u32 pg_num, pg_idx; + + pg_num = (indx / hwq->qe_ppg); + pg_idx = (indx % hwq->qe_ppg); + if (pg) + *pg = (u64)&hwq->pbl_ptr[pg_num]; + return (void *)(hwq->pbl_ptr[pg_num] + hwq->element_size * pg_idx); +} + #define to_bnxt_qplib(ptr, type, member) \ container_of(ptr, type, member) -- cgit v1.2.3 From 8ce111d00e64b52e7c77a5956434384e906f7394 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Thu, 2 Apr 2020 14:12:15 -0400 Subject: RDMA/bnxt_re: Remove dead code from rcfw In the previous refactoring serise there were few leftover functions which are not is use anymore. Removed them as it is a dead code. Fixes: 6f53196bc5e7 ("RDMA/bnxt_re: Refactor doorbell management functions") Link: https://lore.kernel.org/r/1585851136-2316-5-git-send-email-devesh.sharma@broadcom.com Signed-off-by: Devesh Sharma Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 50 ------------------------------ 1 file changed, 50 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index bf384098f4b2..157387636d00 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -112,56 +112,6 @@ struct bnxt_qplib_crsbe { #define CREQ_CMP_VALID(hdr, raw_cons, cp_bit) \ (!!((hdr)->v & CREQ_BASE_V) == \ !((raw_cons) & (cp_bit))) - -#define CREQ_DB_KEY_CP (0x2 << CMPL_DOORBELL_KEY_SFT) -#define CREQ_DB_IDX_VALID CMPL_DOORBELL_IDX_VALID -#define CREQ_DB_IRQ_DIS CMPL_DOORBELL_MASK -#define CREQ_DB_CP_FLAGS_REARM (CREQ_DB_KEY_CP | \ - CREQ_DB_IDX_VALID) -#define CREQ_DB_CP_FLAGS (CREQ_DB_KEY_CP | \ - CREQ_DB_IDX_VALID | \ - CREQ_DB_IRQ_DIS) - -static inline void bnxt_qplib_ring_creq_db64(void __iomem *db, u32 index, - u32 xid, bool arm) -{ - u64 val = 0; - - val = xid & DBC_DBC_XID_MASK; - val |= DBC_DBC_PATH_ROCE; - val |= arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ; - val <<= 32; - val |= index & DBC_DBC_INDEX_MASK; - - writeq(val, db); -} - -static inline void bnxt_qplib_ring_creq_db_rearm(void __iomem *db, u32 raw_cons, - u32 max_elements, u32 xid, - bool gen_p5) -{ - u32 index = raw_cons & (max_elements - 1); - - if (gen_p5) - bnxt_qplib_ring_creq_db64(db, index, xid, true); - else - writel(CREQ_DB_CP_FLAGS_REARM | (index & DBC_DBC32_XID_MASK), - db); -} - -static inline void bnxt_qplib_ring_creq_db(void __iomem *db, u32 raw_cons, - u32 max_elements, u32 xid, - bool gen_p5) -{ - u32 index = raw_cons & (max_elements - 1); - - if (gen_p5) - bnxt_qplib_ring_creq_db64(db, index, xid, true); - else - writel(CREQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK), - db); -} - #define CREQ_ENTRY_POLL_BUDGET 0x100 /* HWQ */ -- cgit v1.2.3 From dd302ee41e6ed204f8d9534d511edc72b5ce5e53 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 13 Apr 2020 16:23:23 +0300 Subject: RDMA/cma: Limit the scope of rdma_is_consumer_reject function The function is local to cma.c, so let's limit its scope. Link: https://lore.kernel.org/r/20200413132323.930869-1-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 9 +++++++-- include/rdma/rdma_cm.h | 8 -------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 26e6f7df247b..6406a597dfb6 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -91,7 +91,13 @@ const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, } EXPORT_SYMBOL(rdma_reject_msg); -bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) +/** + * rdma_is_consumer_reject - return true if the consumer rejected the connect + * request. + * @id: Communication identifier that received the REJECT event. + * @reason: Value returned in the REJECT event status field. + */ +static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) { if (rdma_ib_or_roce(id->device, id->port_num)) return reason == IB_CM_REJ_CONSUMER_DEFINED; @@ -102,7 +108,6 @@ bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason) WARN_ON_ONCE(1); return false; } -EXPORT_SYMBOL(rdma_is_consumer_reject); const void *rdma_consumer_reject_data(struct rdma_cm_id *id, struct rdma_cm_event *ev, u8 *data_len) diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 71f48cfdc24c..ea8e794785ed 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -389,14 +389,6 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr); */ const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id, int reason); -/** - * rdma_is_consumer_reject - return true if the consumer rejected the connect - * request. - * @id: Communication identifier that received the REJECT event. - * @reason: Value returned in the REJECT event status field. - */ -bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason); - /** * rdma_consumer_reject_data - return the consumer reject private data and * length, if any. -- cgit v1.2.3 From 95a776e8a6282ac19051655c55124e711318ae6e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 13 Apr 2020 16:39:05 +0300 Subject: RDMA/rw: use DIV_ROUND_UP to calculate nr_ops Don't open-code DIV_ROUND_UP() kernel macro. Link: https://lore.kernel.org/r/20200413133905.933343-1-leon@kernel.org Signed-off-by: Max Gurtovoy Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 557efbf29197..614cff89fc71 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -129,7 +129,7 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, qp->integrity_en); int i, j, ret = 0, count = 0; - ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr; + ctx->nr_ops = DIV_ROUND_UP(sg_cnt, pages_per_mr); ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL); if (!ctx->reg) { ret = -ENOMEM; -- cgit v1.2.3 From 3c873161a0d7d1e11f1ce9cc59f89a009fb65711 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Mon, 13 Apr 2020 19:58:06 +0800 Subject: RDMA/hns: Add support for addressing when hopnum is 0 Currently, WQE and EQE table have already used the mtr interface to config and access memory by multi-hop addressing when hopnum is from 1 to 3. But if hopnum is 0, each table need write its own but repetitive logic, and many duplicate code exists in the mtr interfaces invoke process. So wraps the public logic as 3 functions: hns_roce_mtr_create(), hns_roce_mtr_destroy() and hns_roce_mtr_map() to support hopnum ranges from 0 to 3. In addition, makes the mtr interfaces easier to use. Link: https://lore.kernel.org/r/1586779091-51410-2-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 46 ++- drivers/infiniband/hw/hns/hns_roce_hem.c | 9 +- drivers/infiniband/hw/hns/hns_roce_hem.h | 5 +- drivers/infiniband/hw/hns/hns_roce_mr.c | 423 ++++++++++++++++++++++++++-- 4 files changed, 450 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index f6b3cf6b95d6..4a7afec4899c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -271,6 +271,9 @@ enum { #define PAGE_ADDR_SHIFT 12 +/* The minimum page count for hardware access page directly. */ +#define HNS_HW_DIRECT_PAGE_COUNT 2 + struct hns_roce_uar { u64 pfn; unsigned long index; @@ -357,13 +360,32 @@ struct hns_roce_hem_list { struct list_head mid_bt[HNS_ROCE_MAX_BT_REGION][HNS_ROCE_MAX_BT_LEVEL]; struct list_head btm_bt; /* link all bottom bt in @mid_bt */ dma_addr_t root_ba; /* pointer to the root ba table */ - int bt_pg_shift; +}; + +struct hns_roce_buf_attr { + struct { + size_t size; /* region size */ + int hopnum; /* multi-hop addressing hop num */ + } region[HNS_ROCE_MAX_BT_REGION]; + int region_count; /* valid region count */ + int page_shift; /* buffer page shift */ + bool fixed_page; /* decide page shift is fixed-size or maximum size */ + int user_access; /* umem access flag */ + bool mtt_only; /* only alloc buffer-required MTT memory */ }; /* memory translate region */ struct hns_roce_mtr { - struct hns_roce_hem_list hem_list; - int buf_pg_shift; + struct hns_roce_hem_list hem_list; /* multi-hop addressing resource */ + struct ib_umem *umem; /* user space buffer */ + struct hns_roce_buf *kmem; /* kernel space buffer */ + struct { + dma_addr_t root_ba; /* root BA table's address */ + bool is_direct; /* addressing without BA table */ + int ba_pg_shift; /* BA table page shift */ + int buf_pg_shift; /* buffer page shift */ + int buf_pg_count; /* buffer page count */ + } hem_cfg; /* config for hardware addressing */ }; struct hns_roce_mw { @@ -1113,6 +1135,16 @@ static inline void *hns_roce_buf_offset(struct hns_roce_buf *buf, int offset) (offset & (page_size - 1)); } +static inline u64 to_hr_hw_page_addr(u64 addr) +{ + return addr >> PAGE_ADDR_SHIFT; +} + +static inline u32 to_hr_hw_page_shift(u32 page_shift) +{ + return page_shift - PAGE_ADDR_SHIFT; +} + int hns_roce_init_uar_table(struct hns_roce_dev *dev); int hns_roce_uar_alloc(struct hns_roce_dev *dev, struct hns_roce_uar *uar); void hns_roce_uar_free(struct hns_roce_dev *dev, struct hns_roce_uar *uar); @@ -1144,6 +1176,14 @@ void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev, #define MTT_MIN_COUNT 2 int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr); +int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, + struct hns_roce_buf_attr *buf_attr, int page_shift, + struct ib_udata *udata, unsigned long user_addr); +void hns_roce_mtr_destroy(struct hns_roce_dev *hr_dev, + struct hns_roce_mtr *mtr); +int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, + struct hns_roce_buf_region *regions, int region_cnt, + dma_addr_t *pages, int page_cnt); int hns_roce_init_pd_table(struct hns_roce_dev *hr_dev); int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 263338b90d7a..a245e753afe9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -1505,7 +1505,7 @@ err_exit: int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, const struct hns_roce_buf_region *regions, - int region_cnt) + int region_cnt, int bt_pg_shift) { const struct hns_roce_buf_region *r; int ofs, end; @@ -1519,7 +1519,7 @@ int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, return -EINVAL; } - unit = (1 << hem_list->bt_pg_shift) / BA_BYTE_LEN; + unit = (1 << bt_pg_shift) / BA_BYTE_LEN; for (i = 0; i < region_cnt; i++) { r = ®ions[i]; if (!r->count) @@ -1566,8 +1566,7 @@ void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev, hem_list->root_ba = 0; } -void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list, - int bt_page_order) +void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list) { int i, j; @@ -1576,8 +1575,6 @@ void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list, for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++) for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++) INIT_LIST_HEAD(&hem_list->mid_bt[i][j]); - - hem_list->bt_pg_shift = bt_page_order; } void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev, diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h index 3bb8f78fb7b0..a00b6c27735a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.h +++ b/drivers/infiniband/hw/hns/hns_roce_hem.h @@ -133,14 +133,13 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev, struct hns_roce_hem_mhop *mhop); bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type); -void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list, - int bt_page_order); +void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list); int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions, int region_cnt, int unit); int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, const struct hns_roce_buf_region *regions, - int region_cnt); + int region_cnt, int bt_pg_shift); void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list); void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev, diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 176f34692f88..b3af36938f63 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -1567,8 +1567,9 @@ int hns_roce_dealloc_mw(struct ib_mw *ibmw) void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift, int buf_pg_shift) { - hns_roce_hem_list_init(&mtr->hem_list, bt_pg_shift); - mtr->buf_pg_shift = buf_pg_shift; + hns_roce_hem_list_init(&mtr->hem_list); + mtr->hem_cfg.buf_pg_shift = buf_pg_shift; + mtr->hem_cfg.ba_pg_shift = bt_pg_shift; } void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev, @@ -1577,19 +1578,23 @@ void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev, hns_roce_hem_list_release(hr_dev, &mtr->hem_list); } -static int hns_roce_write_mtr(struct hns_roce_dev *hr_dev, - struct hns_roce_mtr *mtr, dma_addr_t *bufs, - struct hns_roce_buf_region *r) +static int mtr_map_region(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, + dma_addr_t *pages, struct hns_roce_buf_region *region) { + __le64 *mtts; int offset; int count; int npage; - u64 *mtts; + u64 addr; int end; int i; - offset = r->offset; - end = offset + r->count; + /* if hopnum is 0, buffer cannot store BAs, so skip write mtt */ + if (!region->hopnum) + return 0; + + offset = region->offset; + end = offset + region->count; npage = 0; while (offset < end) { mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list, @@ -1597,13 +1602,13 @@ static int hns_roce_write_mtr(struct hns_roce_dev *hr_dev, if (!mtts) return -ENOBUFS; - /* Save page addr, low 12 bits : 0 */ for (i = 0; i < count; i++) { if (hr_dev->hw_rev == HNS_ROCE_HW_VER1) - mtts[i] = bufs[npage] >> PAGE_ADDR_SHIFT; + addr = to_hr_hw_page_addr(pages[npage]); else - mtts[i] = bufs[npage]; + addr = pages[npage]; + mtts[i] = cpu_to_le64(addr); npage++; } offset += count; @@ -1621,13 +1626,14 @@ int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, int i; ret = hns_roce_hem_list_request(hr_dev, &mtr->hem_list, regions, - region_cnt); + region_cnt, mtr->hem_cfg.ba_pg_shift); if (ret) return ret; + mtr->hem_cfg.root_ba = mtr->hem_list.root_ba; for (i = 0; i < region_cnt; i++) { r = ®ions[i]; - ret = hns_roce_write_mtr(hr_dev, mtr, bufs[i], r); + ret = mtr_map_region(hr_dev, mtr, bufs[i], r); if (ret) { dev_err(hr_dev->dev, "write mtr[%d/%d] err %d,offset=%d.\n", @@ -1644,37 +1650,412 @@ err_write: return ret; } +static inline bool mtr_has_mtt(struct hns_roce_buf_attr *attr) +{ + int i; + + for (i = 0; i < attr->region_count; i++) + if (attr->region[i].hopnum != HNS_ROCE_HOP_NUM_0 && + attr->region[i].hopnum > 0) + return true; + + /* because the mtr only one root base address, when hopnum is 0 means + * root base address equals the first buffer address, thus all alloced + * memory must in a continuous space accessed by direct mode. + */ + return false; +} + +static inline size_t mtr_bufs_size(struct hns_roce_buf_attr *attr) +{ + size_t size = 0; + int i; + + for (i = 0; i < attr->region_count; i++) + size += attr->region[i].size; + + return size; +} + +static inline int mtr_umem_page_count(struct ib_umem *umem, int page_shift) +{ + int count = ib_umem_page_count(umem); + + if (page_shift >= PAGE_SHIFT) + count >>= page_shift - PAGE_SHIFT; + else + count <<= PAGE_SHIFT - page_shift; + + return count; +} + +static inline size_t mtr_kmem_direct_size(bool is_direct, size_t alloc_size, + int page_shift) +{ + if (is_direct) + return ALIGN(alloc_size, 1 << page_shift); + else + return HNS_HW_DIRECT_PAGE_COUNT << page_shift; +} + +/* + * check the given pages in continuous address space + * Returns 0 on success, or the error page num. + */ +static inline int mtr_check_direct_pages(dma_addr_t *pages, int page_count, + int page_shift) +{ + size_t page_size = 1 << page_shift; + int i; + + for (i = 1; i < page_count; i++) + if (pages[i] - pages[i - 1] != page_size) + return i; + + return 0; +} + +static void mtr_free_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr) +{ + /* release user buffers */ + if (mtr->umem) { + ib_umem_release(mtr->umem); + mtr->umem = NULL; + } + + /* release kernel buffers */ + if (mtr->kmem) { + hns_roce_buf_free(hr_dev, mtr->kmem); + kfree(mtr->kmem); + mtr->kmem = NULL; + } +} + +static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, + struct hns_roce_buf_attr *buf_attr, bool is_direct, + struct ib_udata *udata, unsigned long user_addr) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + int max_pg_shift = buf_attr->page_shift; + int best_pg_shift = 0; + int all_pg_count = 0; + size_t direct_size; + size_t total_size; + unsigned long tmp; + int ret = 0; + + total_size = mtr_bufs_size(buf_attr); + if (total_size < 1) { + ibdev_err(ibdev, "Failed to check mtr size\n"); + return -EINVAL; + } + + if (udata) { + mtr->kmem = NULL; + mtr->umem = ib_umem_get(ibdev, user_addr, total_size, + buf_attr->user_access); + if (IS_ERR_OR_NULL(mtr->umem)) { + ibdev_err(ibdev, "Failed to get umem, ret %ld\n", + PTR_ERR(mtr->umem)); + return -ENOMEM; + } + if (buf_attr->fixed_page) { + best_pg_shift = max_pg_shift; + } else { + tmp = GENMASK(max_pg_shift, 0); + ret = ib_umem_find_best_pgsz(mtr->umem, tmp, user_addr); + best_pg_shift = (ret <= PAGE_SIZE) ? + PAGE_SHIFT : ilog2(ret); + } + all_pg_count = mtr_umem_page_count(mtr->umem, best_pg_shift); + ret = 0; + } else { + mtr->umem = NULL; + mtr->kmem = kzalloc(sizeof(*mtr->kmem), GFP_KERNEL); + if (!mtr->kmem) { + ibdev_err(ibdev, "Failed to alloc kmem\n"); + return -ENOMEM; + } + direct_size = mtr_kmem_direct_size(is_direct, total_size, + max_pg_shift); + ret = hns_roce_buf_alloc(hr_dev, total_size, direct_size, + mtr->kmem, max_pg_shift); + if (ret) { + ibdev_err(ibdev, "Failed to alloc kmem, ret %d\n", ret); + goto err_alloc_mem; + } else { + best_pg_shift = max_pg_shift; + all_pg_count = mtr->kmem->npages; + } + } + + /* must bigger than minimum hardware page shift */ + if (best_pg_shift < PAGE_ADDR_SHIFT || all_pg_count < 1) { + ret = -EINVAL; + ibdev_err(ibdev, "Failed to check mtr page shift %d count %d\n", + best_pg_shift, all_pg_count); + goto err_alloc_mem; + } + + mtr->hem_cfg.buf_pg_shift = best_pg_shift; + mtr->hem_cfg.buf_pg_count = all_pg_count; + + return 0; +err_alloc_mem: + mtr_free_bufs(hr_dev, mtr); + return ret; +} + +static int mtr_get_pages(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, + dma_addr_t *pages, int count, int page_shift) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + int npage; + int err; + + if (mtr->umem) + npage = hns_roce_get_umem_bufs(hr_dev, pages, count, 0, + mtr->umem, page_shift); + else + npage = hns_roce_get_kmem_bufs(hr_dev, pages, count, 0, + mtr->kmem); + + if (mtr->hem_cfg.is_direct && npage > 1) { + err = mtr_check_direct_pages(pages, npage, page_shift); + if (err) { + ibdev_err(ibdev, "Failed to check %s direct page-%d\n", + mtr->umem ? "user" : "kernel", err); + npage = err; + } + } + + return npage; +} + +int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, + struct hns_roce_buf_region *regions, int region_cnt, + dma_addr_t *pages, int page_cnt) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_buf_region *r; + int err; + int i; + + for (i = 0; i < region_cnt; i++) { + r = ®ions[i]; + if (r->offset + r->count > page_cnt) { + err = -EINVAL; + ibdev_err(ibdev, + "Failed to check mtr%d end %d + %d, max %d\n", + i, r->offset, r->count, page_cnt); + return err; + } + + err = mtr_map_region(hr_dev, mtr, &pages[r->offset], r); + if (err) { + ibdev_err(ibdev, + "Failed to map mtr%d offset %d, err %d\n", + i, r->offset, err); + return err; + } + } + + return 0; +} + int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr) { - u64 *mtts = mtt_buf; int mtt_count; int total = 0; - u64 *addr; + __le64 *mtts; int npage; + u64 addr; int left; - if (mtts == NULL || mtt_max < 1) + if (!mtt_buf || mtt_max < 1) goto done; + /* no mtt memory in direct mode, so just return the buffer address */ + if (mtr->hem_cfg.is_direct) { + npage = offset; + for (total = 0; total < mtt_max; total++, npage++) { + addr = mtr->hem_cfg.root_ba + + (npage << mtr->hem_cfg.buf_pg_shift); + + if (hr_dev->hw_rev == HNS_ROCE_HW_VER1) + mtt_buf[total] = to_hr_hw_page_addr(addr); + else + mtt_buf[total] = addr; + } + + goto done; + } + left = mtt_max; while (left > 0) { mtt_count = 0; - addr = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list, + mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list, offset + total, &mtt_count, NULL); - if (!addr || !mtt_count) + if (!mtts || !mtt_count) goto done; npage = min(mtt_count, left); - memcpy(&mtts[total], addr, BA_BYTE_LEN * npage); left -= npage; - total += npage; + for (mtt_count = 0; mtt_count < npage; mtt_count++) + mtt_buf[total++] = le64_to_cpu(mtts[mtt_count]); } done: if (base_addr) - *base_addr = mtr->hem_list.root_ba; + *base_addr = mtr->hem_cfg.root_ba; return total; } + +/* convert buffer size to page index and page count */ +static int mtr_init_region(struct hns_roce_buf_attr *attr, int page_cnt, + struct hns_roce_buf_region *regions, int region_cnt, + int page_shift) +{ + unsigned int page_size = 1 << page_shift; + int max_region = attr->region_count; + struct hns_roce_buf_region *r; + int page_idx = 0; + int i = 0; + + for (; i < region_cnt && i < max_region && page_idx < page_cnt; i++) { + r = ®ions[i]; + r->hopnum = attr->region[i].hopnum == HNS_ROCE_HOP_NUM_0 ? + 0 : attr->region[i].hopnum; + r->offset = page_idx; + r->count = DIV_ROUND_UP(attr->region[i].size, page_size); + page_idx += r->count; + } + + return i; +} + +/** + * hns_roce_mtr_create - Create hns memory translate region. + * + * @mtr: memory translate region + * @init_attr: init attribute for creating mtr + * @page_shift: page shift for multi-hop base address table + * @udata: user space context, if it's NULL, means kernel space + * @user_addr: userspace virtual address to start at + * @buf_alloced: mtr has private buffer, true means need to alloc + */ +int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, + struct hns_roce_buf_attr *buf_attr, int page_shift, + struct ib_udata *udata, unsigned long user_addr) +{ + struct hns_roce_buf_region regions[HNS_ROCE_MAX_BT_REGION] = {}; + struct ib_device *ibdev = &hr_dev->ib_dev; + dma_addr_t *pages = NULL; + int region_cnt = 0; + int all_pg_cnt; + int get_pg_cnt; + bool has_mtt; + int err = 0; + + has_mtt = mtr_has_mtt(buf_attr); + /* if buffer only need mtt, just init the hem cfg */ + if (buf_attr->mtt_only) { + mtr->hem_cfg.buf_pg_shift = buf_attr->page_shift; + mtr->hem_cfg.buf_pg_count = mtr_bufs_size(buf_attr) >> + buf_attr->page_shift; + mtr->umem = NULL; + mtr->kmem = NULL; + } else { + err = mtr_alloc_bufs(hr_dev, mtr, buf_attr, !has_mtt, udata, + user_addr); + if (err) { + ibdev_err(ibdev, "Failed to alloc mtr bufs, err %d\n", + err); + return err; + } + } + + /* alloc mtt memory */ + all_pg_cnt = mtr->hem_cfg.buf_pg_count; + hns_roce_hem_list_init(&mtr->hem_list); + mtr->hem_cfg.is_direct = !has_mtt; + mtr->hem_cfg.ba_pg_shift = page_shift; + if (has_mtt) { + region_cnt = mtr_init_region(buf_attr, all_pg_cnt, + regions, ARRAY_SIZE(regions), + mtr->hem_cfg.buf_pg_shift); + if (region_cnt < 1) { + err = -ENOBUFS; + ibdev_err(ibdev, "Failed to init mtr region %d\n", + region_cnt); + goto err_alloc_bufs; + } + err = hns_roce_hem_list_request(hr_dev, &mtr->hem_list, + regions, region_cnt, + page_shift); + if (err) { + ibdev_err(ibdev, "Failed to request mtr hem, err %d\n", + err); + goto err_alloc_bufs; + } + mtr->hem_cfg.root_ba = mtr->hem_list.root_ba; + } + + /* no buffer to map */ + if (buf_attr->mtt_only) + return 0; + + /* alloc a tmp array to store buffer's dma address */ + pages = kvcalloc(all_pg_cnt, sizeof(dma_addr_t), GFP_KERNEL); + if (!pages) { + err = -ENOMEM; + ibdev_err(ibdev, "Failed to alloc mtr page list %d\n", + all_pg_cnt); + goto err_alloc_hem_list; + } + + get_pg_cnt = mtr_get_pages(hr_dev, mtr, pages, all_pg_cnt, + mtr->hem_cfg.buf_pg_shift); + if (get_pg_cnt != all_pg_cnt) { + ibdev_err(ibdev, "Failed to get mtr page %d != %d\n", + get_pg_cnt, all_pg_cnt); + err = -ENOBUFS; + goto err_alloc_page_list; + } + + if (!has_mtt) { + mtr->hem_cfg.root_ba = pages[0]; + } else { + /* write buffer's dma address to BA table */ + err = hns_roce_mtr_map(hr_dev, mtr, regions, region_cnt, pages, + all_pg_cnt); + if (err) { + ibdev_err(ibdev, "Failed to map mtr pages, err %d\n", + err); + goto err_alloc_page_list; + } + } + + /* drop tmp array */ + kvfree(pages); + return 0; +err_alloc_page_list: + kvfree(pages); +err_alloc_hem_list: + hns_roce_hem_list_release(hr_dev, &mtr->hem_list); +err_alloc_bufs: + mtr_free_bufs(hr_dev, mtr); + return err; +} + +void hns_roce_mtr_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr) +{ + /* release multi-hop addressing resource */ + hns_roce_hem_list_release(hr_dev, &mtr->hem_list); + + /* free buffers */ + mtr_free_bufs(hr_dev, mtr); +} -- cgit v1.2.3 From cc23267aedebd847f86953c67606a3f280fde201 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Mon, 13 Apr 2020 19:58:07 +0800 Subject: RDMA/hns: Optimize hns buffer allocation flow When the value of nbufs is 1, the buffer is in direct mode, which may cause confusion. So optimizes current codes to make it easier to maintain and understand. Link: https://lore.kernel.org/r/1586779091-51410-3-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_alloc.c | 103 +++++++++++++--------------- drivers/infiniband/hw/hns/hns_roce_cq.c | 18 ++--- drivers/infiniband/hw/hns/hns_roce_device.h | 32 ++++++--- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 2 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 43 +++++------- drivers/infiniband/hw/hns/hns_roce_mr.c | 8 +-- drivers/infiniband/hw/hns/hns_roce_qp.c | 6 +- drivers/infiniband/hw/hns/hns_roce_srq.c | 37 +++++----- 8 files changed, 121 insertions(+), 128 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index da574c26e063..e04e7596d979 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -157,84 +157,78 @@ void hns_roce_bitmap_cleanup(struct hns_roce_bitmap *bitmap) kfree(bitmap->table); } -void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size, - struct hns_roce_buf *buf) +void hns_roce_buf_free(struct hns_roce_dev *hr_dev, struct hns_roce_buf *buf) { - int i; struct device *dev = hr_dev->dev; + u32 size = buf->size; + int i; + + if (size == 0) + return; + + buf->size = 0; - if (buf->nbufs == 1) { + if (hns_roce_buf_is_direct(buf)) { dma_free_coherent(dev, size, buf->direct.buf, buf->direct.map); } else { - for (i = 0; i < buf->nbufs; ++i) + for (i = 0; i < buf->npages; ++i) if (buf->page_list[i].buf) dma_free_coherent(dev, 1 << buf->page_shift, buf->page_list[i].buf, buf->page_list[i].map); kfree(buf->page_list); + buf->page_list = NULL; } } int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, struct hns_roce_buf *buf, u32 page_shift) { - int i = 0; - dma_addr_t t; + struct hns_roce_buf_list *buf_list; struct device *dev = hr_dev->dev; - u32 page_size = 1 << page_shift; - u32 order; + u32 page_size; + int i; + + /* The minimum shift of the page accessed by hw is PAGE_ADDR_SHIFT */ + buf->page_shift = max_t(int, PAGE_ADDR_SHIFT, page_shift); - /* SQ/RQ buf lease than one page, SQ + RQ = 8K */ + page_size = 1 << buf->page_shift; + buf->npages = DIV_ROUND_UP(size, page_size); + + /* required size is not bigger than one trunk size */ if (size <= max_direct) { - buf->nbufs = 1; - /* Npages calculated by page_size */ - order = get_order(size); - if (order <= page_shift - PAGE_SHIFT) - order = 0; - else - order -= page_shift - PAGE_SHIFT; - buf->npages = 1 << order; - buf->page_shift = page_shift; - /* MTT PA must be recorded in 4k alignment, t is 4k aligned */ - buf->direct.buf = dma_alloc_coherent(dev, size, &t, + buf->page_list = NULL; + buf->direct.buf = dma_alloc_coherent(dev, size, + &buf->direct.map, GFP_KERNEL); if (!buf->direct.buf) return -ENOMEM; - - buf->direct.map = t; - - while (t & ((1 << buf->page_shift) - 1)) { - --buf->page_shift; - buf->npages *= 2; - } } else { - buf->nbufs = (size + page_size - 1) / page_size; - buf->npages = buf->nbufs; - buf->page_shift = page_shift; - buf->page_list = kcalloc(buf->nbufs, sizeof(*buf->page_list), - GFP_KERNEL); - - if (!buf->page_list) + buf_list = kcalloc(buf->npages, sizeof(*buf_list), GFP_KERNEL); + if (!buf_list) return -ENOMEM; - for (i = 0; i < buf->nbufs; ++i) { - buf->page_list[i].buf = dma_alloc_coherent(dev, - page_size, - &t, - GFP_KERNEL); - - if (!buf->page_list[i].buf) - goto err_free; + for (i = 0; i < buf->npages; i++) { + buf_list[i].buf = dma_alloc_coherent(dev, page_size, + &buf_list[i].map, + GFP_KERNEL); + if (!buf_list[i].buf) + break; + } - buf->page_list[i].map = t; + if (i != buf->npages && i > 0) { + while (i-- > 0) + dma_free_coherent(dev, page_size, + buf_list[i].buf, + buf_list[i].map); + kfree(buf_list); + return -ENOMEM; } + buf->page_list = buf_list; } + buf->size = size; return 0; - -err_free: - hns_roce_buf_free(hr_dev, size, buf); - return -ENOMEM; } int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, @@ -246,18 +240,14 @@ int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, end = start + buf_cnt; if (end > buf->npages) { dev_err(hr_dev->dev, - "invalid kmem region,offset %d,buf_cnt %d,total %d!\n", + "Failed to check kmem bufs, end %d + %d total %d!\n", start, buf_cnt, buf->npages); return -EINVAL; } total = 0; for (i = start; i < end; i++) - if (buf->nbufs == 1) - bufs[total++] = buf->direct.map + - ((dma_addr_t)i << buf->page_shift); - else - bufs[total++] = buf->page_list[i].map; + bufs[total++] = hns_roce_buf_page(buf, i); return total; } @@ -271,8 +261,9 @@ int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, int idx = 0; u64 addr; - if (page_shift < PAGE_SHIFT) { - dev_err(hr_dev->dev, "invalid page shift %d!\n", page_shift); + if (page_shift < PAGE_ADDR_SHIFT) { + dev_err(hr_dev->dev, "Failed to check umem page shift %d!\n", + page_shift); return -EINVAL; } diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 5bfb52ffd590..92798ff6360d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -157,13 +157,12 @@ static int get_cq_umem(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, struct hns_roce_ib_create_cq ucmd, struct ib_udata *udata) { - struct hns_roce_buf *buf = &hr_cq->buf; struct hns_roce_mtt *mtt = &hr_cq->mtt; struct ib_umem **umem = &hr_cq->umem; u32 npages; int ret; - *umem = ib_umem_get(&hr_dev->ib_dev, ucmd.buf_addr, buf->size, + *umem = ib_umem_get(&hr_dev->ib_dev, ucmd.buf_addr, hr_cq->buf_size, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(*umem)) return PTR_ERR(*umem); @@ -175,7 +174,7 @@ static int get_cq_umem(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, npages = DIV_ROUND_UP(ib_umem_page_count(*umem), 1 << hr_dev->caps.cqe_buf_pg_sz); - ret = hns_roce_mtt_init(hr_dev, npages, buf->page_shift, mtt); + ret = hns_roce_mtt_init(hr_dev, npages, hr_cq->page_shift, mtt); if (ret) goto err_buf; @@ -199,8 +198,9 @@ static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) struct hns_roce_mtt *mtt = &hr_cq->mtt; int ret; - ret = hns_roce_buf_alloc(hr_dev, buf->size, (1 << buf->page_shift) * 2, - buf, buf->page_shift); + ret = hns_roce_buf_alloc(hr_dev, hr_cq->buf_size, + (1 << hr_cq->page_shift) * 2, + buf, hr_cq->page_shift); if (ret) goto out; @@ -223,7 +223,7 @@ err_mtt: hns_roce_mtt_cleanup(hr_dev, mtt); err_buf: - hns_roce_buf_free(hr_dev, buf->size, buf); + hns_roce_buf_free(hr_dev, buf); out: return ret; @@ -231,7 +231,7 @@ out: static void free_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) { - hns_roce_buf_free(hr_dev, hr_cq->buf.size, &hr_cq->buf); + hns_roce_buf_free(hr_dev, &hr_cq->buf); } static int create_user_cq(struct hns_roce_dev *hr_dev, @@ -367,8 +367,8 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, hr_cq->ib_cq.cqe = cq_entries - 1; /* used as cqe index */ hr_cq->cq_depth = cq_entries; hr_cq->vector = vector; - hr_cq->buf.size = hr_cq->cq_depth * hr_dev->caps.cq_entry_sz; - hr_cq->buf.page_shift = PAGE_SHIFT + hr_dev->caps.cqe_buf_pg_sz; + hr_cq->buf_size = hr_cq->cq_depth * hr_dev->caps.cq_entry_sz; + hr_cq->page_shift = PAGE_SHIFT + hr_dev->caps.cqe_buf_pg_sz; spin_lock_init(&hr_cq->lock); INIT_LIST_HEAD(&hr_cq->sq_list); INIT_LIST_HEAD(&hr_cq->rq_list); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 4a7afec4899c..c37617d5200f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -468,7 +468,6 @@ struct hns_roce_buf_list { struct hns_roce_buf { struct hns_roce_buf_list direct; struct hns_roce_buf_list *page_list; - int nbufs; u32 npages; u32 size; int page_shift; @@ -510,6 +509,8 @@ struct hns_roce_cq { u8 db_en; spinlock_t lock; struct ib_umem *umem; + u32 buf_size; + int page_shift; u32 cq_depth; u32 cons_index; u32 *set_ci_db; @@ -549,6 +550,8 @@ struct hns_roce_srq { struct hns_roce_buf buf; u64 *wrid; struct ib_umem *umem; + u32 buf_size; + int page_shift; struct hns_roce_mtt mtt; struct hns_roce_idx_que idx_que; spinlock_t lock; @@ -1124,15 +1127,29 @@ static inline struct hns_roce_qp return xa_load(&hr_dev->qp_table_xa, qpn & (hr_dev->caps.num_qps - 1)); } +static inline bool hns_roce_buf_is_direct(struct hns_roce_buf *buf) +{ + if (buf->page_list) + return false; + + return true; +} + static inline void *hns_roce_buf_offset(struct hns_roce_buf *buf, int offset) { - u32 page_size = 1 << buf->page_shift; + if (hns_roce_buf_is_direct(buf)) + return (char *)(buf->direct.buf) + (offset & (buf->size - 1)); - if (buf->nbufs == 1) - return (char *)(buf->direct.buf) + offset; + return (char *)(buf->page_list[offset >> buf->page_shift].buf) + + (offset & ((1 << buf->page_shift) - 1)); +} + +static inline dma_addr_t hns_roce_buf_page(struct hns_roce_buf *buf, int idx) +{ + if (hns_roce_buf_is_direct(buf)) + return buf->direct.map + ((dma_addr_t)idx << buf->page_shift); else - return (char *)(buf->page_list[offset >> buf->page_shift].buf) + - (offset & (page_size - 1)); + return buf->page_list[idx].map; } static inline u64 to_hr_hw_page_addr(u64 addr) @@ -1240,8 +1257,7 @@ struct ib_mw *hns_roce_alloc_mw(struct ib_pd *pd, enum ib_mw_type, struct ib_udata *udata); int hns_roce_dealloc_mw(struct ib_mw *ibmw); -void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size, - struct hns_roce_buf *buf); +void hns_roce_buf_free(struct hns_roce_dev *hr_dev, struct hns_roce_buf *buf); int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, struct hns_roce_buf *buf, u32 page_shift); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 5ff028d77be3..4b5490692fbb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -3666,7 +3666,7 @@ static void hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) ib_umem_release(hr_cq->umem); if (!udata) { /* Free the buff of stored cq */ - hns_roce_buf_free(hr_dev, hr_cq->buf.size, &hr_cq->buf); + hns_roce_buf_free(hr_dev, &hr_cq->buf); } } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index c3316672b70e..998015da48ba 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4989,24 +4989,14 @@ static void set_eq_cons_index_v2(struct hns_roce_eq *eq) hns_roce_write64(hr_dev, doorbell, eq->doorbell); } -static inline void *get_eqe_buf(struct hns_roce_eq *eq, unsigned long offset) -{ - u32 buf_chk_sz; - - buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); - if (eq->buf.nbufs == 1) - return eq->buf.direct.buf + offset % buf_chk_sz; - else - return eq->buf.page_list[offset / buf_chk_sz].buf + - offset % buf_chk_sz; -} - static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) { struct hns_roce_aeqe *aeqe; - aeqe = get_eqe_buf(eq, (eq->cons_index & (eq->entries - 1)) * - HNS_ROCE_AEQ_ENTRY_SIZE); + aeqe = hns_roce_buf_offset(&eq->buf, + (eq->cons_index & (eq->entries - 1)) * + HNS_ROCE_AEQ_ENTRY_SIZE); + return (roce_get_bit(aeqe->asyn, HNS_ROCE_V2_AEQ_AEQE_OWNER_S) ^ !!(eq->cons_index & eq->entries)) ? aeqe : NULL; } @@ -5103,8 +5093,9 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) { struct hns_roce_ceqe *ceqe; - ceqe = get_eqe_buf(eq, (eq->cons_index & (eq->entries - 1)) * - HNS_ROCE_CEQ_ENTRY_SIZE); + ceqe = hns_roce_buf_offset(&eq->buf, + (eq->cons_index & (eq->entries - 1)) * + HNS_ROCE_CEQ_ENTRY_SIZE); return (!!(roce_get_bit(ceqe->comp, HNS_ROCE_V2_CEQ_CEQE_OWNER_S))) ^ (!!(eq->cons_index & eq->entries)) ? ceqe : NULL; } @@ -5265,7 +5256,7 @@ static void free_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) { if (!eq->hop_num || eq->hop_num == HNS_ROCE_HOP_NUM_0) hns_roce_mtr_cleanup(hr_dev, &eq->mtr); - hns_roce_buf_free(hr_dev, eq->buf.size, &eq->buf); + hns_roce_buf_free(hr_dev, &eq->buf); } static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, @@ -5290,7 +5281,8 @@ static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, eq->eqe_buf_pg_sz = hr_dev->caps.eqe_buf_pg_sz; eq->shift = ilog2((unsigned int)eq->entries); - /* if not muti-hop, eqe buffer only use one trunk */ + /* if not multi-hop, eqe buffer only use one trunk */ + eq->eqe_buf_pg_sz = eq->buf.page_shift - PAGE_ADDR_SHIFT; if (!eq->hop_num || eq->hop_num == HNS_ROCE_HOP_NUM_0) { eq->eqe_ba = eq->buf.direct.map; eq->cur_eqe_ba = eq->eqe_ba; @@ -5432,7 +5424,7 @@ static int map_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq, goto done; } - hns_roce_mtr_init(&eq->mtr, PAGE_SHIFT + hr_dev->caps.eqe_ba_pg_sz, + hns_roce_mtr_init(&eq->mtr, PAGE_ADDR_SHIFT + hr_dev->caps.eqe_ba_pg_sz, page_shift); ret = hns_roce_mtr_attach(hr_dev, &eq->mtr, &buf_list, ®ion, 1); if (ret) @@ -5454,23 +5446,24 @@ static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) u32 page_shift; u32 mhop_num; u32 max_size; + u32 buf_size; int ret; - page_shift = PAGE_SHIFT + hr_dev->caps.eqe_buf_pg_sz; + page_shift = PAGE_ADDR_SHIFT + hr_dev->caps.eqe_buf_pg_sz; mhop_num = hr_dev->caps.eqe_hop_num; if (!mhop_num) { max_size = 1 << page_shift; - buf->size = max_size; + buf_size = max_size; } else if (mhop_num == HNS_ROCE_HOP_NUM_0) { max_size = eq->entries * eq->eqe_size; - buf->size = max_size; + buf_size = max_size; } else { max_size = 1 << page_shift; - buf->size = PAGE_ALIGN(eq->entries * eq->eqe_size); + buf_size = round_up(eq->entries * eq->eqe_size, max_size); is_mhop = true; } - ret = hns_roce_buf_alloc(hr_dev, buf->size, max_size, buf, page_shift); + ret = hns_roce_buf_alloc(hr_dev, buf_size, max_size, buf, page_shift); if (ret) { dev_err(hr_dev->dev, "alloc eq buf error\n"); return ret; @@ -5486,7 +5479,7 @@ static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) return 0; err_alloc: - hns_roce_buf_free(hr_dev, buf->size, buf); + hns_roce_buf_free(hr_dev, buf); return ret; } diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index b3af36938f63..99e3876e712c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -900,13 +900,9 @@ int hns_roce_buf_write_mtt(struct hns_roce_dev *hr_dev, if (!page_list) return -ENOMEM; - for (i = 0; i < buf->npages; ++i) { - if (buf->nbufs == 1) - page_list[i] = buf->direct.map + (i << buf->page_shift); - else - page_list[i] = buf->page_list[i].map; + for (i = 0; i < buf->npages; ++i) + page_list[i] = hns_roce_buf_page(buf, i); - } ret = hns_roce_write_mtt(hr_dev, mtt, 0, buf->npages, page_list); kfree(page_list); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 6317901c4b4f..1667f3753f34 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -839,7 +839,7 @@ err_alloc: ib_umem_release(hr_qp->umem); hr_qp->umem = NULL; } else { - hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); + hns_roce_buf_free(hr_dev, &hr_qp->hr_buf); } ibdev_err(ibdev, "Failed to alloc WQE buffer, ret %d.\n", ret); @@ -855,8 +855,8 @@ static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) hr_qp->umem = NULL; } - if (hr_qp->hr_buf.nbufs > 0) - hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); + if (hr_qp->hr_buf.npages > 0) + hns_roce_buf_free(hr_dev, &hr_qp->hr_buf); if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) && hr_qp->rq.wqe_cnt) diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 5b3dd1a337d4..9851d76d2c14 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -180,7 +180,8 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata, { struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device); struct hns_roce_ib_create_srq ucmd; - struct hns_roce_buf *buf; + int page_shift; + int page_count; int ret; if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) @@ -191,12 +192,11 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata, if (IS_ERR(srq->umem)) return PTR_ERR(srq->umem); - buf = &srq->buf; - buf->npages = (ib_umem_page_count(srq->umem) + - (1 << hr_dev->caps.srqwqe_buf_pg_sz) - 1) / + page_count = (ib_umem_page_count(srq->umem) + + (1 << hr_dev->caps.srqwqe_buf_pg_sz) - 1) / (1 << hr_dev->caps.srqwqe_buf_pg_sz); - buf->page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz; - ret = hns_roce_mtt_init(hr_dev, buf->npages, buf->page_shift, + page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz; + ret = hns_roce_mtt_init(hr_dev, page_count, page_shift, &srq->mtt); if (ret) goto err_user_buf; @@ -214,11 +214,10 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata, goto err_user_srq_mtt; } - buf = &srq->idx_que.idx_buf; - buf->npages = DIV_ROUND_UP(ib_umem_page_count(srq->idx_que.umem), - 1 << hr_dev->caps.idx_buf_pg_sz); - buf->page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz; - ret = hns_roce_mtt_init(hr_dev, buf->npages, buf->page_shift, + page_count = DIV_ROUND_UP(ib_umem_page_count(srq->idx_que.umem), + 1 << hr_dev->caps.idx_buf_pg_sz); + page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz; + ret = hns_roce_mtt_init(hr_dev, page_count, page_shift, &srq->idx_que.mtt); if (ret) { dev_err(hr_dev->dev, "hns_roce_mtt_init error for idx que\n"); @@ -325,15 +324,14 @@ err_kernel_idx_buf: hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt); err_kernel_create_idx: - hns_roce_buf_free(hr_dev, srq->idx_que.buf_size, - &srq->idx_que.idx_buf); + hns_roce_buf_free(hr_dev, &srq->idx_que.idx_buf); kfree(srq->idx_que.bitmap); err_kernel_srq_mtt: hns_roce_mtt_cleanup(hr_dev, &srq->mtt); err_kernel_buf: - hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf); + hns_roce_buf_free(hr_dev, &srq->buf); return ret; } @@ -348,14 +346,14 @@ static void destroy_user_srq(struct hns_roce_dev *hr_dev, } static void destroy_kernel_srq(struct hns_roce_dev *hr_dev, - struct hns_roce_srq *srq, int srq_buf_size) + struct hns_roce_srq *srq) { kvfree(srq->wrid); hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt); - hns_roce_buf_free(hr_dev, srq->idx_que.buf_size, &srq->idx_que.idx_buf); + hns_roce_buf_free(hr_dev, &srq->idx_que.idx_buf); kfree(srq->idx_que.bitmap); hns_roce_mtt_cleanup(hr_dev, &srq->mtt); - hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf); + hns_roce_buf_free(hr_dev, &srq->buf); } int hns_roce_create_srq(struct ib_srq *ib_srq, @@ -437,7 +435,7 @@ err_wrid: if (udata) destroy_user_srq(hr_dev, srq); else - destroy_kernel_srq(hr_dev, srq, srq_buf_size); + destroy_kernel_srq(hr_dev, srq); err_srq: return ret; @@ -455,8 +453,7 @@ void hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt); } else { kvfree(srq->wrid); - hns_roce_buf_free(hr_dev, srq->wqe_cnt << srq->wqe_shift, - &srq->buf); + hns_roce_buf_free(hr_dev, &srq->buf); } ib_umem_release(srq->idx_que.umem); ib_umem_release(srq->umem); -- cgit v1.2.3 From 477a0a38707249227d8929648baf5abbdd58c40f Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Mon, 13 Apr 2020 19:58:08 +0800 Subject: RDMA/hns: Optimize 0 hop addressing for EQE buffer Use the new mtr interface to simple the hop 0 addressing and multihop addressing process. Link: https://lore.kernel.org/r/1586779091-51410-4-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 6 - drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 171 ++++++++-------------------- 2 files changed, 48 insertions(+), 129 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index c37617d5200f..39577c2e345e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -794,17 +794,11 @@ struct hns_roce_eq { int over_ignore; int coalesce; int arm_st; - u64 eqe_ba; - int eqe_ba_pg_sz; - int eqe_buf_pg_sz; int hop_num; struct hns_roce_mtr mtr; - struct hns_roce_buf buf; int eq_max_cnt; int eq_period; int shift; - dma_addr_t cur_eqe_ba; - dma_addr_t nxt_eqe_ba; int event_type; int sub_type; }; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 998015da48ba..335a637ab239 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4993,7 +4993,7 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) { struct hns_roce_aeqe *aeqe; - aeqe = hns_roce_buf_offset(&eq->buf, + aeqe = hns_roce_buf_offset(eq->mtr.kmem, (eq->cons_index & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE); @@ -5093,7 +5093,7 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) { struct hns_roce_ceqe *ceqe; - ceqe = hns_roce_buf_offset(&eq->buf, + ceqe = hns_roce_buf_offset(eq->mtr.kmem, (eq->cons_index & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE); return (!!(roce_get_bit(ceqe->comp, HNS_ROCE_V2_CEQ_CEQE_OWNER_S))) ^ @@ -5254,17 +5254,15 @@ static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, int eqn) static void free_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) { - if (!eq->hop_num || eq->hop_num == HNS_ROCE_HOP_NUM_0) - hns_roce_mtr_cleanup(hr_dev, &eq->mtr); - hns_roce_buf_free(hr_dev, &eq->buf); + hns_roce_mtr_destroy(hr_dev, &eq->mtr); } -static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq, - void *mb_buf) +static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq, + void *mb_buf) { + u64 eqe_ba[MTT_MIN_COUNT] = { 0 }; struct hns_roce_eq_context *eqc; - u64 ba[MTT_MIN_COUNT] = { 0 }; + u64 bt_ba = 0; int count; eqc = mb_buf; @@ -5272,32 +5270,18 @@ static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, /* init eqc */ eq->doorbell = hr_dev->reg_base + ROCEE_VF_EQ_DB_CFG0_REG; - eq->hop_num = hr_dev->caps.eqe_hop_num; eq->cons_index = 0; eq->over_ignore = HNS_ROCE_V2_EQ_OVER_IGNORE_0; eq->coalesce = HNS_ROCE_V2_EQ_COALESCE_0; eq->arm_st = HNS_ROCE_V2_EQ_ALWAYS_ARMED; - eq->eqe_ba_pg_sz = hr_dev->caps.eqe_ba_pg_sz; - eq->eqe_buf_pg_sz = hr_dev->caps.eqe_buf_pg_sz; eq->shift = ilog2((unsigned int)eq->entries); /* if not multi-hop, eqe buffer only use one trunk */ - eq->eqe_buf_pg_sz = eq->buf.page_shift - PAGE_ADDR_SHIFT; - if (!eq->hop_num || eq->hop_num == HNS_ROCE_HOP_NUM_0) { - eq->eqe_ba = eq->buf.direct.map; - eq->cur_eqe_ba = eq->eqe_ba; - if (eq->buf.npages > 1) - eq->nxt_eqe_ba = eq->eqe_ba + (1 << eq->eqe_buf_pg_sz); - else - eq->nxt_eqe_ba = eq->eqe_ba; - } else { - count = hns_roce_mtr_find(hr_dev, &eq->mtr, 0, ba, - MTT_MIN_COUNT, &eq->eqe_ba); - eq->cur_eqe_ba = ba[0]; - if (count > 1) - eq->nxt_eqe_ba = ba[1]; - else - eq->nxt_eqe_ba = ba[0]; + count = hns_roce_mtr_find(hr_dev, &eq->mtr, 0, eqe_ba, MTT_MIN_COUNT, + &bt_ba); + if (count < 1) { + dev_err(hr_dev->dev, "failed to find EQE mtr\n"); + return -ENOBUFS; } /* set eqc state */ @@ -5331,12 +5315,12 @@ static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, /* set eqe_ba_pg_sz */ roce_set_field(eqc->byte_8, HNS_ROCE_EQC_BA_PG_SZ_M, HNS_ROCE_EQC_BA_PG_SZ_S, - eq->eqe_ba_pg_sz + PG_SHIFT_OFFSET); + to_hr_hw_page_shift(eq->mtr.hem_cfg.ba_pg_shift)); /* set eqe_buf_pg_sz */ roce_set_field(eqc->byte_8, HNS_ROCE_EQC_BUF_PG_SZ_M, HNS_ROCE_EQC_BUF_PG_SZ_S, - eq->eqe_buf_pg_sz + PG_SHIFT_OFFSET); + to_hr_hw_page_shift(eq->mtr.hem_cfg.buf_pg_shift)); /* set eq_producer_idx */ roce_set_field(eqc->byte_8, HNS_ROCE_EQC_PROD_INDX_M, @@ -5355,13 +5339,13 @@ static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, HNS_ROCE_EQC_REPORT_TIMER_S, HNS_ROCE_EQ_INIT_REPORT_TIMER); - /* set eqe_ba [34:3] */ + /* set bt_ba [34:3] */ roce_set_field(eqc->eqe_ba0, HNS_ROCE_EQC_EQE_BA_L_M, - HNS_ROCE_EQC_EQE_BA_L_S, eq->eqe_ba >> 3); + HNS_ROCE_EQC_EQE_BA_L_S, bt_ba >> 3); - /* set eqe_ba [64:35] */ + /* set bt_ba [64:35] */ roce_set_field(eqc->eqe_ba1, HNS_ROCE_EQC_EQE_BA_H_M, - HNS_ROCE_EQC_EQE_BA_H_S, eq->eqe_ba >> 35); + HNS_ROCE_EQC_EQE_BA_H_S, bt_ba >> 35); /* set eq shift */ roce_set_field(eqc->byte_28, HNS_ROCE_EQC_SHIFT_M, HNS_ROCE_EQC_SHIFT_S, @@ -5373,15 +5357,15 @@ static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, /* set cur_eqe_ba [27:12] */ roce_set_field(eqc->byte_28, HNS_ROCE_EQC_CUR_EQE_BA_L_M, - HNS_ROCE_EQC_CUR_EQE_BA_L_S, eq->cur_eqe_ba >> 12); + HNS_ROCE_EQC_CUR_EQE_BA_L_S, eqe_ba[0] >> 12); /* set cur_eqe_ba [59:28] */ roce_set_field(eqc->byte_32, HNS_ROCE_EQC_CUR_EQE_BA_M_M, - HNS_ROCE_EQC_CUR_EQE_BA_M_S, eq->cur_eqe_ba >> 28); + HNS_ROCE_EQC_CUR_EQE_BA_M_S, eqe_ba[0] >> 28); /* set cur_eqe_ba [63:60] */ roce_set_field(eqc->byte_36, HNS_ROCE_EQC_CUR_EQE_BA_H_M, - HNS_ROCE_EQC_CUR_EQE_BA_H_S, eq->cur_eqe_ba >> 60); + HNS_ROCE_EQC_CUR_EQE_BA_H_S, eqe_ba[0] >> 60); /* set eq consumer idx */ roce_set_field(eqc->byte_36, HNS_ROCE_EQC_CONS_INDX_M, @@ -5389,98 +5373,38 @@ static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, /* set nex_eqe_ba[43:12] */ roce_set_field(eqc->nxt_eqe_ba0, HNS_ROCE_EQC_NXT_EQE_BA_L_M, - HNS_ROCE_EQC_NXT_EQE_BA_L_S, eq->nxt_eqe_ba >> 12); + HNS_ROCE_EQC_NXT_EQE_BA_L_S, eqe_ba[1] >> 12); /* set nex_eqe_ba[63:44] */ roce_set_field(eqc->nxt_eqe_ba1, HNS_ROCE_EQC_NXT_EQE_BA_H_M, - HNS_ROCE_EQC_NXT_EQE_BA_H_S, eq->nxt_eqe_ba >> 44); -} + HNS_ROCE_EQC_NXT_EQE_BA_H_S, eqe_ba[1] >> 44); -static int map_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq, - u32 page_shift) -{ - struct hns_roce_buf_region region = {}; - dma_addr_t *buf_list = NULL; - int ba_num; - int ret; - - ba_num = DIV_ROUND_UP(PAGE_ALIGN(eq->entries * eq->eqe_size), - 1 << page_shift); - hns_roce_init_buf_region(®ion, hr_dev->caps.eqe_hop_num, 0, ba_num); - - /* alloc a tmp list for storing eq buf address */ - ret = hns_roce_alloc_buf_list(®ion, &buf_list, 1); - if (ret) { - dev_err(hr_dev->dev, "alloc eq buf_list error\n"); - return ret; - } - - ba_num = hns_roce_get_kmem_bufs(hr_dev, buf_list, region.count, - region.offset, &eq->buf); - if (ba_num != region.count) { - dev_err(hr_dev->dev, "get eqe buf err,expect %d,ret %d.\n", - region.count, ba_num); - ret = -ENOBUFS; - goto done; - } - - hns_roce_mtr_init(&eq->mtr, PAGE_ADDR_SHIFT + hr_dev->caps.eqe_ba_pg_sz, - page_shift); - ret = hns_roce_mtr_attach(hr_dev, &eq->mtr, &buf_list, ®ion, 1); - if (ret) - dev_err(hr_dev->dev, "mtr attach error for eqe\n"); - - goto done; - - hns_roce_mtr_cleanup(hr_dev, &eq->mtr); -done: - hns_roce_free_buf_list(&buf_list, 1); - - return ret; + return 0; } static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) { - struct hns_roce_buf *buf = &eq->buf; - bool is_mhop = false; - u32 page_shift; - u32 mhop_num; - u32 max_size; - u32 buf_size; - int ret; + struct hns_roce_buf_attr buf_attr = {}; + int err; - page_shift = PAGE_ADDR_SHIFT + hr_dev->caps.eqe_buf_pg_sz; - mhop_num = hr_dev->caps.eqe_hop_num; - if (!mhop_num) { - max_size = 1 << page_shift; - buf_size = max_size; - } else if (mhop_num == HNS_ROCE_HOP_NUM_0) { - max_size = eq->entries * eq->eqe_size; - buf_size = max_size; - } else { - max_size = 1 << page_shift; - buf_size = round_up(eq->entries * eq->eqe_size, max_size); - is_mhop = true; - } + if (hr_dev->caps.eqe_hop_num == HNS_ROCE_HOP_NUM_0) + eq->hop_num = 0; + else + eq->hop_num = hr_dev->caps.eqe_hop_num; - ret = hns_roce_buf_alloc(hr_dev, buf_size, max_size, buf, page_shift); - if (ret) { - dev_err(hr_dev->dev, "alloc eq buf error\n"); - return ret; - } + buf_attr.page_shift = hr_dev->caps.eqe_buf_pg_sz + PAGE_ADDR_SHIFT; + buf_attr.region[0].size = eq->entries * eq->eqe_size; + buf_attr.region[0].hopnum = eq->hop_num; + buf_attr.region_count = 1; + buf_attr.fixed_page = true; - if (is_mhop) { - ret = map_eq_buf(hr_dev, eq, page_shift); - if (ret) { - dev_err(hr_dev->dev, "map roce buf error\n"); - goto err_alloc; - } - } + err = hns_roce_mtr_create(hr_dev, &eq->mtr, &buf_attr, + hr_dev->caps.srqwqe_ba_pg_sz + + PAGE_ADDR_SHIFT, NULL, 0); + if (err) + dev_err(hr_dev->dev, "Failed to alloc EQE mtr, err %d\n", err); - return 0; -err_alloc: - hns_roce_buf_free(hr_dev, buf); - return ret; + return err; } static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev, @@ -5492,15 +5416,16 @@ static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev, /* Allocate mailbox memory */ mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); - if (IS_ERR(mailbox)) - return PTR_ERR(mailbox); + if (IS_ERR_OR_NULL(mailbox)) + return -ENOMEM; ret = alloc_eq_buf(hr_dev, eq); - if (ret) { - ret = -ENOMEM; + if (ret) goto free_cmd_mbox; - } - hns_roce_config_eqc(hr_dev, eq, mailbox->buf); + + ret = config_eqc(hr_dev, eq, mailbox->buf); + if (ret) + goto err_cmd_mbox; ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq->eqn, 0, eq_cmd, HNS_ROCE_CMD_TIMEOUT_MSECS); -- cgit v1.2.3 From d563099e3e89c48caf9cc183ab3d39dd326c8987 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Mon, 13 Apr 2020 19:58:09 +0800 Subject: RDMA/hns: Support 0 hop addressing for WQE buffer Add the zero hop addressing support by using new mtr interface for WQE buffer and simple mtr invoking process, so WQE buffer can support hopnum between 0 to 3. Link: https://lore.kernel.org/r/1586779091-51410-5-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 4 - drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 80 ++++++------ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 36 +++--- drivers/infiniband/hw/hns/hns_roce_qp.c | 182 +++++++--------------------- 4 files changed, 106 insertions(+), 196 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 39577c2e345e..5df4ee3a1f82 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -681,7 +681,6 @@ struct hns_roce_work { struct hns_roce_qp { struct ib_qp ibqp; - struct hns_roce_buf hr_buf; struct hns_roce_wq rq; struct hns_roce_db rdb; struct hns_roce_db sdb; @@ -691,10 +690,7 @@ struct hns_roce_qp { u32 sq_signal_bits; struct hns_roce_wq sq; - struct ib_umem *umem; - struct hns_roce_mtt mtt; struct hns_roce_mtr mtr; - int wqe_bt_pg_shift; u32 buff_size; struct mutex mutex; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 4b5490692fbb..ddf2a454b525 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -2479,7 +2479,6 @@ static int hns_roce_v1_clear_hem(struct hns_roce_dev *hr_dev, } static int hns_roce_v1_qp_modify(struct hns_roce_dev *hr_dev, - struct hns_roce_mtt *mtt, enum hns_roce_qp_state cur_state, enum hns_roce_qp_state new_state, struct hns_roce_qp_context *context, @@ -2560,6 +2559,29 @@ static int hns_roce_v1_qp_modify(struct hns_roce_dev *hr_dev, return ret; } +static int find_wqe_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, + u64 *sq_ba, u64 *rq_ba, dma_addr_t *bt_ba) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + int rq_pa_start; + int count; + + count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, sq_ba, 1, bt_ba); + if (count < 1) { + ibdev_err(ibdev, "Failed to find SQ ba\n"); + return -ENOBUFS; + } + rq_pa_start = hr_qp->rq.offset >> hr_qp->mtr.hem_cfg.buf_pg_shift; + count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, rq_pa_start, rq_ba, 1, + NULL); + if (!count) { + ibdev_err(ibdev, "Failed to find RQ ba\n"); + return -ENOBUFS; + } + + return 0; +} + static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) @@ -2567,25 +2589,20 @@ static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct hns_roce_sqp_context *context; - struct device *dev = &hr_dev->pdev->dev; dma_addr_t dma_handle = 0; u32 __iomem *addr; - int rq_pa_start; + u64 sq_ba = 0; + u64 rq_ba = 0; __le32 tmp; u32 reg_val; - u64 *mtts; context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return -ENOMEM; /* Search QP buf's MTTs */ - mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table, - hr_qp->mtt.first_seg, &dma_handle); - if (!mtts) { - dev_err(dev, "qp buf pa find failed\n"); + if (find_wqe_mtt(hr_dev, hr_qp, &sq_ba, &rq_ba, &dma_handle)) goto out; - } if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { roce_set_field(context->qp1c_bytes_4, @@ -2599,11 +2616,11 @@ static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, roce_set_field(context->qp1c_bytes_4, QP1C_BYTES_4_PD_M, QP1C_BYTES_4_PD_S, to_hr_pd(ibqp->pd)->pdn); - context->sq_rq_bt_l = cpu_to_le32((u32)(dma_handle)); + context->sq_rq_bt_l = cpu_to_le32(dma_handle); roce_set_field(context->qp1c_bytes_12, QP1C_BYTES_12_SQ_RQ_BT_H_M, QP1C_BYTES_12_SQ_RQ_BT_H_S, - ((u32)(dma_handle >> 32))); + upper_32_bits(dma_handle)); roce_set_field(context->qp1c_bytes_16, QP1C_BYTES_16_RQ_HEAD_M, QP1C_BYTES_16_RQ_HEAD_S, hr_qp->rq.head); @@ -2624,14 +2641,12 @@ static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, roce_set_field(context->qp1c_bytes_20, QP1C_BYTES_20_PKEY_IDX_M, QP1C_BYTES_20_PKEY_IDX_S, attr->pkey_index); - rq_pa_start = (u32)hr_qp->rq.offset / PAGE_SIZE; - context->cur_rq_wqe_ba_l = - cpu_to_le32((u32)(mtts[rq_pa_start])); + context->cur_rq_wqe_ba_l = cpu_to_le32(rq_ba); roce_set_field(context->qp1c_bytes_28, QP1C_BYTES_28_CUR_RQ_WQE_BA_H_M, QP1C_BYTES_28_CUR_RQ_WQE_BA_H_S, - (mtts[rq_pa_start]) >> 32); + upper_32_bits(rq_ba)); roce_set_field(context->qp1c_bytes_28, QP1C_BYTES_28_RQ_CUR_IDX_M, QP1C_BYTES_28_RQ_CUR_IDX_S, 0); @@ -2645,12 +2660,12 @@ static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, QP1C_BYTES_32_TX_CQ_NUM_S, to_hr_cq(ibqp->send_cq)->cqn); - context->cur_sq_wqe_ba_l = cpu_to_le32((u32)mtts[0]); + context->cur_sq_wqe_ba_l = cpu_to_le32(sq_ba); roce_set_field(context->qp1c_bytes_40, QP1C_BYTES_40_CUR_SQ_WQE_BA_H_M, QP1C_BYTES_40_CUR_SQ_WQE_BA_H_S, - (mtts[0]) >> 32); + upper_32_bits(sq_ba)); roce_set_field(context->qp1c_bytes_40, QP1C_BYTES_40_SQ_CUR_IDX_M, QP1C_BYTES_40_SQ_CUR_IDX_S, 0); @@ -2716,10 +2731,10 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, dma_addr_t dma_handle_2 = 0; dma_addr_t dma_handle = 0; __le32 doorbell[2] = {0}; - int rq_pa_start = 0; u64 *mtts_2 = NULL; int ret = -EINVAL; - u64 *mtts = NULL; + u64 sq_ba = 0; + u64 rq_ba = 0; int port; u8 port_num; u8 *dmac; @@ -2730,12 +2745,8 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, return -ENOMEM; /* Search qp buf's mtts */ - mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table, - hr_qp->mtt.first_seg, &dma_handle); - if (mtts == NULL) { - dev_err(dev, "qp buf pa find failed\n"); + if (find_wqe_mtt(hr_dev, hr_qp, &sq_ba, &rq_ba, &dma_handle)) goto out; - } /* Search IRRL's mtts */ mtts_2 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.irrl_table, @@ -2890,11 +2901,11 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, dmac = (u8 *)attr->ah_attr.roce.dmac; - context->sq_rq_bt_l = cpu_to_le32((u32)(dma_handle)); + context->sq_rq_bt_l = cpu_to_le32(dma_handle); roce_set_field(context->qpc_bytes_24, QP_CONTEXT_QPC_BYTES_24_SQ_RQ_BT_H_M, QP_CONTEXT_QPC_BYTES_24_SQ_RQ_BT_H_S, - ((u32)(dma_handle >> 32))); + upper_32_bits(dma_handle)); roce_set_bit(context->qpc_bytes_24, QP_CONTEXT_QPC_BYTE_24_REMOTE_ENABLE_E2E_CREDITS_S, 1); @@ -2993,14 +3004,12 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, QP_CONTEXT_QPC_BYTES_68_RQ_CUR_INDEX_M, QP_CONTEXT_QPC_BYTES_68_RQ_CUR_INDEX_S, 0); - rq_pa_start = (u32)hr_qp->rq.offset / PAGE_SIZE; - context->cur_rq_wqe_ba_l = - cpu_to_le32((u32)(mtts[rq_pa_start])); + context->cur_rq_wqe_ba_l = cpu_to_le32(rq_ba); roce_set_field(context->qpc_bytes_76, QP_CONTEXT_QPC_BYTES_76_CUR_RQ_WQE_BA_H_M, QP_CONTEXT_QPC_BYTES_76_CUR_RQ_WQE_BA_H_S, - mtts[rq_pa_start] >> 32); + upper_32_bits(rq_ba)); roce_set_field(context->qpc_bytes_76, QP_CONTEXT_QPC_BYTES_76_RX_REQ_MSN_M, QP_CONTEXT_QPC_BYTES_76_RX_REQ_MSN_S, 0); @@ -3075,12 +3084,12 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, goto out; } - context->rx_cur_sq_wqe_ba_l = cpu_to_le32((u32)(mtts[0])); + context->rx_cur_sq_wqe_ba_l = cpu_to_le32(sq_ba); roce_set_field(context->qpc_bytes_120, QP_CONTEXT_QPC_BYTES_120_RX_CUR_SQ_WQE_BA_H_M, QP_CONTEXT_QPC_BYTES_120_RX_CUR_SQ_WQE_BA_H_S, - (mtts[0]) >> 32); + upper_32_bits(sq_ba)); roce_set_field(context->qpc_bytes_124, QP_CONTEXT_QPC_BYTES_124_RX_ACK_MSN_M, @@ -3223,12 +3232,12 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, QP_CONTEXT_QPC_BYTES_180_SQ_HEAD_M, QP_CONTEXT_QPC_BYTES_180_SQ_HEAD_S, 0); - context->tx_cur_sq_wqe_ba_l = cpu_to_le32((u32)(mtts[0])); + context->tx_cur_sq_wqe_ba_l = cpu_to_le32(sq_ba); roce_set_field(context->qpc_bytes_188, QP_CONTEXT_QPC_BYTES_188_TX_CUR_SQ_WQE_BA_H_M, QP_CONTEXT_QPC_BYTES_188_TX_CUR_SQ_WQE_BA_H_S, - (mtts[0]) >> 32); + upper_32_bits(sq_ba)); roce_set_bit(context->qpc_bytes_188, QP_CONTEXT_QPC_BYTES_188_PKT_RETRY_FLG_S, 0); roce_set_field(context->qpc_bytes_188, @@ -3253,8 +3262,7 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, QP_CONTEXT_QPC_BYTES_144_QP_STATE_S, new_state); /* SW pass context to HW */ - ret = hns_roce_v1_qp_modify(hr_dev, &hr_qp->mtt, - to_hns_roce_state(cur_state), + ret = hns_roce_v1_qp_modify(hr_dev, to_hns_roce_state(cur_state), to_hns_roce_state(new_state), context, hr_qp); if (ret) { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 335a637ab239..42bca0ae64ff 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -164,7 +164,7 @@ static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr, num_in_wqe = HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE; extend_sge_num = valid_num_sge - num_in_wqe; sg = wr->sg_list + num_in_wqe; - shift = qp->hr_buf.page_shift; + shift = qp->mtr.hem_cfg.buf_pg_shift; /* * Check whether wr->num_sge sges are in the same page. If not, we @@ -3757,7 +3757,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, int port; /* Search qp buf's mtts */ - page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); + page_size = 1 << hr_qp->mtr.hem_cfg.buf_pg_shift; count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->rq.offset / page_size, mtts, MTT_MIN_COUNT, &wqe_sge_ba); @@ -3831,7 +3831,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, - hr_qp->wqe_bt_pg_shift + PG_SHIFT_OFFSET); + to_hr_hw_page_shift(hr_qp->mtr.hem_cfg.ba_pg_shift)); roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, 0); @@ -3839,29 +3839,29 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, - hr_dev->caps.mtt_buf_pg_sz + PG_SHIFT_OFFSET); + to_hr_hw_page_shift(hr_qp->mtr.hem_cfg.buf_pg_shift)); roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, 0); - context->rq_cur_blk_addr = cpu_to_le32(mtts[0] >> PAGE_ADDR_SHIFT); + context->rq_cur_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[0])); qpc_mask->rq_cur_blk_addr = 0; roce_set_field(context->byte_92_srq_info, V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M, V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S, - mtts[0] >> (32 + PAGE_ADDR_SHIFT)); + upper_32_bits(to_hr_hw_page_addr(mtts[0]))); roce_set_field(qpc_mask->byte_92_srq_info, V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M, V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S, 0); - context->rq_nxt_blk_addr = cpu_to_le32(mtts[1] >> PAGE_ADDR_SHIFT); + context->rq_nxt_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[1])); qpc_mask->rq_nxt_blk_addr = 0; roce_set_field(context->byte_104_rq_sge, V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M, V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S, - mtts[1] >> (32 + PAGE_ADDR_SHIFT)); + upper_32_bits(to_hr_hw_page_addr(mtts[1]))); roce_set_field(qpc_mask->byte_104_rq_sge, V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M, V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S, 0); @@ -3995,18 +3995,18 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, /* Search qp buf's mtts */ count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, &sq_cur_blk, 1, NULL); if (count < 1) { - ibdev_err(ibdev, "failed to find buf pa of QP(0x%lx)\n", + ibdev_err(ibdev, "failed to find QP(0x%lx) SQ buf\n", hr_qp->qpn); return -EINVAL; } if (hr_qp->sge.offset) { - page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); + page_size = 1 << hr_qp->mtr.hem_cfg.buf_pg_shift; count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->sge.offset / page_size, &sge_cur_blk, 1, NULL); if (count < 1) { - ibdev_err(ibdev, "failed to find sge pa of QP(0x%lx)\n", + ibdev_err(ibdev, "failed to find QP(0x%lx) SGE buf\n", hr_qp->qpn); return -EINVAL; } @@ -4024,11 +4024,11 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, * we should set all bits of the relevant fields in context mask to * 0 at the same time, else set them to 0x1. */ - context->sq_cur_blk_addr = cpu_to_le32(sq_cur_blk >> PAGE_ADDR_SHIFT); + context->sq_cur_blk_addr = cpu_to_le32(to_hr_hw_page_addr(sq_cur_blk)); roce_set_field(context->byte_168_irrl_idx, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, - sq_cur_blk >> (32 + PAGE_ADDR_SHIFT)); + upper_32_bits(to_hr_hw_page_addr(sq_cur_blk))); qpc_mask->sq_cur_blk_addr = 0; roce_set_field(qpc_mask->byte_168_irrl_idx, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, @@ -4036,26 +4036,24 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, context->sq_cur_sge_blk_addr = ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? - cpu_to_le32(sge_cur_blk >> - PAGE_ADDR_SHIFT) : 0; + cpu_to_le32(to_hr_hw_page_addr(sge_cur_blk)) : 0; roce_set_field(context->byte_184_irrl_idx, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? - (sge_cur_blk >> - (32 + PAGE_ADDR_SHIFT)) : 0); + upper_32_bits(to_hr_hw_page_addr(sge_cur_blk)) : 0); qpc_mask->sq_cur_sge_blk_addr = 0; roce_set_field(qpc_mask->byte_184_irrl_idx, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, 0); context->rx_sq_cur_blk_addr = - cpu_to_le32(sq_cur_blk >> PAGE_ADDR_SHIFT); + cpu_to_le32(to_hr_hw_page_addr(sq_cur_blk)); roce_set_field(context->byte_232_irrl_sge, V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M, V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S, - sq_cur_blk >> (32 + PAGE_ADDR_SHIFT)); + upper_32_bits(to_hr_hw_page_addr(sq_cur_blk))); qpc_mask->rx_sq_cur_blk_addr = 0; roce_set_field(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M, diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 1667f3753f34..d05d3cb7de39 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -512,63 +512,57 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev, static int split_wqe_buf_region(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, - struct hns_roce_buf_region *regions, - int region_max, int page_shift) + struct hns_roce_buf_attr *buf_attr) { - int page_size = 1 << page_shift; bool is_extend_sge; - int region_cnt = 0; int buf_size; - int buf_cnt; + int idx = 0; - if (hr_qp->buff_size < 1 || region_max < 1) - return region_cnt; + if (hr_qp->buff_size < 1) + return -EINVAL; + + buf_attr->page_shift = PAGE_ADDR_SHIFT + hr_dev->caps.mtt_buf_pg_sz; + buf_attr->fixed_page = true; + buf_attr->region_count = 0; if (hr_qp->sge.sge_cnt > 0) is_extend_sge = true; else is_extend_sge = false; - /* sq region */ + /* SQ WQE */ if (is_extend_sge) buf_size = hr_qp->sge.offset - hr_qp->sq.offset; else buf_size = hr_qp->rq.offset - hr_qp->sq.offset; - if (buf_size > 0 && region_cnt < region_max) { - buf_cnt = DIV_ROUND_UP(buf_size, page_size); - hns_roce_init_buf_region(®ions[region_cnt], - hr_dev->caps.wqe_sq_hop_num, - hr_qp->sq.offset / page_size, - buf_cnt); - region_cnt++; + if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) { + buf_attr->region[idx].size = buf_size; + buf_attr->region[idx].hopnum = hr_dev->caps.wqe_sq_hop_num; + idx++; } - /* sge region */ - if (is_extend_sge) { - buf_size = hr_qp->rq.offset - hr_qp->sge.offset; - if (buf_size > 0 && region_cnt < region_max) { - buf_cnt = DIV_ROUND_UP(buf_size, page_size); - hns_roce_init_buf_region(®ions[region_cnt], - hr_dev->caps.wqe_sge_hop_num, - hr_qp->sge.offset / page_size, - buf_cnt); - region_cnt++; - } + /* extend SGE in SQ WQE */ + buf_size = hr_qp->rq.offset - hr_qp->sge.offset; + if (buf_size > 0 && is_extend_sge && + idx < ARRAY_SIZE(buf_attr->region)) { + buf_attr->region[idx].size = buf_size; + buf_attr->region[idx].hopnum = + hr_dev->caps.wqe_sge_hop_num; + idx++; } - /* rq region */ + /* RQ WQE */ buf_size = hr_qp->buff_size - hr_qp->rq.offset; - if (buf_size > 0) { - buf_cnt = DIV_ROUND_UP(buf_size, page_size); - hns_roce_init_buf_region(®ions[region_cnt], - hr_dev->caps.wqe_rq_hop_num, - hr_qp->rq.offset / page_size, - buf_cnt); - region_cnt++; + if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) { + buf_attr->region[idx].size = buf_size; + buf_attr->region[idx].hopnum = hr_dev->caps.wqe_rq_hop_num; + idx++; } - return region_cnt; + buf_attr->region_count = idx; + + return 0; } static int set_extend_sge_param(struct hns_roce_dev *hr_dev, @@ -731,72 +725,12 @@ static void free_rq_inline_buf(struct hns_roce_qp *hr_qp) kfree(hr_qp->rq_inl_buf.wqe_list); } -static int map_wqe_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, - u32 page_shift, bool is_user) -{ -/* WQE buffer include 3 parts: SQ, extend SGE and RQ. */ -#define HNS_ROCE_WQE_REGION_MAX 3 - struct hns_roce_buf_region regions[HNS_ROCE_WQE_REGION_MAX] = {}; - dma_addr_t *buf_list[HNS_ROCE_WQE_REGION_MAX] = {}; - struct ib_device *ibdev = &hr_dev->ib_dev; - struct hns_roce_buf_region *r; - int region_count; - int buf_count; - int ret; - int i; - - region_count = split_wqe_buf_region(hr_dev, hr_qp, regions, - ARRAY_SIZE(regions), page_shift); - - /* alloc a tmp list to store WQE buffers address */ - ret = hns_roce_alloc_buf_list(regions, buf_list, region_count); - if (ret) { - ibdev_err(ibdev, "Failed to alloc WQE buffer list\n"); - return ret; - } - - for (i = 0; i < region_count; i++) { - r = ®ions[i]; - if (is_user) - buf_count = hns_roce_get_umem_bufs(hr_dev, buf_list[i], - r->count, r->offset, hr_qp->umem, - page_shift); - else - buf_count = hns_roce_get_kmem_bufs(hr_dev, buf_list[i], - r->count, r->offset, &hr_qp->hr_buf); - - if (buf_count != r->count) { - ibdev_err(ibdev, "Failed to get %s WQE buf, expect %d = %d.\n", - is_user ? "user" : "kernel", - r->count, buf_count); - ret = -ENOBUFS; - goto done; - } - } - - hr_qp->wqe_bt_pg_shift = hr_dev->caps.mtt_ba_pg_sz; - hns_roce_mtr_init(&hr_qp->mtr, PAGE_SHIFT + hr_qp->wqe_bt_pg_shift, - page_shift); - ret = hns_roce_mtr_attach(hr_dev, &hr_qp->mtr, buf_list, regions, - region_count); - if (ret) - ibdev_err(ibdev, "Failed to attach WQE's mtr\n"); - - goto done; - - hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr); -done: - hns_roce_free_buf_list(buf_list, region_count); - - return ret; -} - static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, unsigned long addr) { - u32 page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz; struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_buf_attr buf_attr = {}; bool is_rq_buf_inline; int ret; @@ -810,54 +744,30 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, } } - if (udata) { - hr_qp->umem = ib_umem_get(ibdev, addr, hr_qp->buff_size, 0); - if (IS_ERR(hr_qp->umem)) { - ret = PTR_ERR(hr_qp->umem); - goto err_inline; - } - } else { - ret = hns_roce_buf_alloc(hr_dev, hr_qp->buff_size, - (1 << page_shift) * 2, - &hr_qp->hr_buf, page_shift); - if (ret) - goto err_inline; + ret = split_wqe_buf_region(hr_dev, hr_qp, &buf_attr); + if (ret) { + ibdev_err(ibdev, "Failed to split WQE buf, ret %d\n", ret); + goto err_inline; + } + ret = hns_roce_mtr_create(hr_dev, &hr_qp->mtr, &buf_attr, + PAGE_ADDR_SHIFT + hr_dev->caps.mtt_ba_pg_sz, + udata, addr); + if (ret) { + ibdev_err(ibdev, "Failed to create WQE mtr, ret %d\n", ret); + goto err_inline; } - - ret = map_wqe_buf(hr_dev, hr_qp, page_shift, udata); - if (ret) - goto err_alloc; return 0; - err_inline: if (is_rq_buf_inline) free_rq_inline_buf(hr_qp); -err_alloc: - if (udata) { - ib_umem_release(hr_qp->umem); - hr_qp->umem = NULL; - } else { - hns_roce_buf_free(hr_dev, &hr_qp->hr_buf); - } - - ibdev_err(ibdev, "Failed to alloc WQE buffer, ret %d.\n", ret); - return ret; } static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { - hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr); - if (hr_qp->umem) { - ib_umem_release(hr_qp->umem); - hr_qp->umem = NULL; - } - - if (hr_qp->hr_buf.npages > 0) - hns_roce_buf_free(hr_dev, &hr_qp->hr_buf); - + hns_roce_mtr_destroy(hr_dev, &hr_qp->mtr); if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) && hr_qp->rq.wqe_cnt) free_rq_inline_buf(hr_qp); @@ -1431,10 +1341,9 @@ void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq, } } -static void *get_wqe(struct hns_roce_qp *hr_qp, int offset) +static inline void *get_wqe(struct hns_roce_qp *hr_qp, int offset) { - - return hns_roce_buf_offset(&hr_qp->hr_buf, offset); + return hns_roce_buf_offset(hr_qp->mtr.kmem, offset); } void *hns_roce_get_recv_wqe(struct hns_roce_qp *hr_qp, int n) @@ -1449,8 +1358,7 @@ void *hns_roce_get_send_wqe(struct hns_roce_qp *hr_qp, int n) void *hns_roce_get_extend_sge(struct hns_roce_qp *hr_qp, int n) { - return hns_roce_buf_offset(&hr_qp->hr_buf, hr_qp->sge.offset + - (n << hr_qp->sge.sge_shift)); + return get_wqe(hr_qp, hr_qp->sge.offset + (n << hr_qp->sge.sge_shift)); } bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq, -- cgit v1.2.3 From 6fd610c5733d0b2024393e82f145180324ef55a7 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Mon, 13 Apr 2020 19:58:10 +0800 Subject: RDMA/hns: Support 0 hop addressing for SRQ buffer Add the zero hop addressing support by using mtr interface for SRQ buffer, so the hns driver can support addressing hopnum between 0 to 3 for SRQ. Link: https://lore.kernel.org/r/1586779091-51410-6-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 12 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 34 +-- drivers/infiniband/hw/hns/hns_roce_srq.c | 365 +++++++++++----------------- 3 files changed, 161 insertions(+), 250 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 5df4ee3a1f82..39af7366126a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -528,11 +528,8 @@ struct hns_roce_cq { }; struct hns_roce_idx_que { - struct hns_roce_buf idx_buf; + struct hns_roce_mtr mtr; int entry_sz; - u32 buf_size; - struct ib_umem *umem; - struct hns_roce_mtt mtt; unsigned long *bitmap; }; @@ -547,12 +544,9 @@ struct hns_roce_srq { atomic_t refcount; struct completion free; - struct hns_roce_buf buf; + struct hns_roce_mtr buf_mtr; + u64 *wrid; - struct ib_umem *umem; - u32 buf_size; - int page_shift; - struct hns_roce_mtt mtt; struct hns_roce_idx_que idx_que; spinlock_t lock; int head; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 42bca0ae64ff..1aed542fbdc5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2699,7 +2699,7 @@ static struct hns_roce_v2_cqe *next_cqe_sw_v2(struct hns_roce_cq *hr_cq) static void *get_srq_wqe(struct hns_roce_srq *srq, int n) { - return hns_roce_buf_offset(&srq->buf, n << srq->wqe_shift); + return hns_roce_buf_offset(srq->buf_mtr.kmem, n << srq->wqe_shift); } static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, int wqe_index) @@ -5699,43 +5699,45 @@ static void hns_roce_v2_write_srqc(struct hns_roce_dev *hr_dev, dma_handle_idx >> 35); srq_context->idx_cur_blk_addr = - cpu_to_le32(mtts_idx[0] >> PAGE_ADDR_SHIFT); + cpu_to_le32(to_hr_hw_page_addr(mtts_idx[0])); roce_set_field(srq_context->byte_44_idxbufpgsz_addr, SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_M, SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_S, - mtts_idx[0] >> (32 + PAGE_ADDR_SHIFT)); + upper_32_bits(to_hr_hw_page_addr(mtts_idx[0]))); roce_set_field(srq_context->byte_44_idxbufpgsz_addr, SRQC_BYTE_44_SRQ_IDX_HOP_NUM_M, SRQC_BYTE_44_SRQ_IDX_HOP_NUM_S, hr_dev->caps.idx_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : hr_dev->caps.idx_hop_num); - roce_set_field(srq_context->byte_44_idxbufpgsz_addr, - SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M, - SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_S, - hr_dev->caps.idx_ba_pg_sz + PG_SHIFT_OFFSET); - roce_set_field(srq_context->byte_44_idxbufpgsz_addr, - SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_M, - SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_S, - hr_dev->caps.idx_buf_pg_sz + PG_SHIFT_OFFSET); + roce_set_field( + srq_context->byte_44_idxbufpgsz_addr, + SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M, + SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_S, + to_hr_hw_page_shift(srq->idx_que.mtr.hem_cfg.ba_pg_shift)); + roce_set_field( + srq_context->byte_44_idxbufpgsz_addr, + SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_M, + SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_S, + to_hr_hw_page_shift(srq->idx_que.mtr.hem_cfg.buf_pg_shift)); srq_context->idx_nxt_blk_addr = - cpu_to_le32(mtts_idx[1] >> PAGE_ADDR_SHIFT); + cpu_to_le32(to_hr_hw_page_addr(mtts_idx[1])); roce_set_field(srq_context->rsv_idxnxtblkaddr, SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_M, SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_S, - mtts_idx[1] >> (32 + PAGE_ADDR_SHIFT)); + upper_32_bits(to_hr_hw_page_addr(mtts_idx[1]))); roce_set_field(srq_context->byte_56_xrc_cqn, SRQC_BYTE_56_SRQ_XRC_CQN_M, SRQC_BYTE_56_SRQ_XRC_CQN_S, cqn); roce_set_field(srq_context->byte_56_xrc_cqn, SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_M, SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_S, - hr_dev->caps.srqwqe_ba_pg_sz + PG_SHIFT_OFFSET); + to_hr_hw_page_shift(srq->buf_mtr.hem_cfg.ba_pg_shift)); roce_set_field(srq_context->byte_56_xrc_cqn, SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_M, SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_S, - hr_dev->caps.srqwqe_buf_pg_sz + PG_SHIFT_OFFSET); + to_hr_hw_page_shift(srq->buf_mtr.hem_cfg.buf_pg_shift)); roce_set_bit(srq_context->db_record_addr_record_en, SRQC_BYTE_60_SRQ_RECORD_EN_S, 0); @@ -5847,7 +5849,7 @@ static void fill_idx_queue(struct hns_roce_idx_que *idx_que, { unsigned int *addr; - addr = (unsigned int *)hns_roce_buf_offset(&idx_que->idx_buf, + addr = (unsigned int *)hns_roce_buf_offset(idx_que->mtr.kmem, cur_idx * idx_que->entry_sz); *addr = wqe_idx; } diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 9851d76d2c14..e413a9737ae6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -77,56 +77,56 @@ static int hns_roce_hw_destroy_srq(struct hns_roce_dev *dev, HNS_ROCE_CMD_TIMEOUT_MSECS); } -static int hns_roce_srq_alloc(struct hns_roce_dev *hr_dev, u32 pdn, u32 cqn, - u16 xrcd, struct hns_roce_mtt *hr_mtt, - u64 db_rec_addr, struct hns_roce_srq *srq) +static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, + u32 pdn, u32 cqn, u16 xrcd, u64 db_rec_addr) { struct hns_roce_srq_table *srq_table = &hr_dev->srq_table; + struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_cmd_mailbox *mailbox; - dma_addr_t dma_handle_wqe; - dma_addr_t dma_handle_idx; - u64 *mtts_wqe; - u64 *mtts_idx; + u64 mtts_wqe[MTT_MIN_COUNT] = { 0 }; + u64 mtts_idx[MTT_MIN_COUNT] = { 0 }; + dma_addr_t dma_handle_wqe = 0; + dma_addr_t dma_handle_idx = 0; int ret; /* Get the physical address of srq buf */ - mtts_wqe = hns_roce_table_find(hr_dev, - &hr_dev->mr_table.mtt_srqwqe_table, - srq->mtt.first_seg, - &dma_handle_wqe); - if (!mtts_wqe) { - dev_err(hr_dev->dev, "Failed to find mtt for srq buf.\n"); - return -EINVAL; + ret = hns_roce_mtr_find(hr_dev, &srq->buf_mtr, 0, mtts_wqe, + ARRAY_SIZE(mtts_wqe), &dma_handle_wqe); + if (ret < 1) { + ibdev_err(ibdev, "Failed to find mtr for SRQ WQE\n"); + return -ENOBUFS; } /* Get physical address of idx que buf */ - mtts_idx = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_idx_table, - srq->idx_que.mtt.first_seg, - &dma_handle_idx); - if (!mtts_idx) { - dev_err(hr_dev->dev, - "Failed to find mtt for srq idx queue buf.\n"); - return -EINVAL; + ret = hns_roce_mtr_find(hr_dev, &srq->idx_que.mtr, 0, mtts_idx, + ARRAY_SIZE(mtts_idx), &dma_handle_idx); + if (ret < 1) { + ibdev_err(ibdev, "Failed to find mtr for SRQ idx\n"); + return -ENOBUFS; } ret = hns_roce_bitmap_alloc(&srq_table->bitmap, &srq->srqn); if (ret) { - dev_err(hr_dev->dev, - "Failed to alloc a bit from srq bitmap.\n"); + ibdev_err(ibdev, "Failed to alloc SRQ number, err %d\n", ret); return -ENOMEM; } ret = hns_roce_table_get(hr_dev, &srq_table->table, srq->srqn); - if (ret) + if (ret) { + ibdev_err(ibdev, "Failed to get SRQC table, err %d\n", ret); goto err_out; + } ret = xa_err(xa_store(&srq_table->xa, srq->srqn, srq, GFP_KERNEL)); - if (ret) + if (ret) { + ibdev_err(ibdev, "Failed to store SRQC, err %d\n", ret); goto err_put; + } mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); - if (IS_ERR(mailbox)) { - ret = PTR_ERR(mailbox); + if (IS_ERR_OR_NULL(mailbox)) { + ret = -ENOMEM; + ibdev_err(ibdev, "Failed to alloc mailbox for SRQC\n"); goto err_xa; } @@ -136,8 +136,10 @@ static int hns_roce_srq_alloc(struct hns_roce_dev *hr_dev, u32 pdn, u32 cqn, ret = hns_roce_hw_create_srq(hr_dev, mailbox, srq->srqn); hns_roce_free_cmd_mailbox(hr_dev, mailbox); - if (ret) + if (ret) { + ibdev_err(ibdev, "Failed to config SRQC, err %d\n", ret); goto err_xa; + } atomic_set(&srq->refcount, 1); init_completion(&srq->free); @@ -154,8 +156,7 @@ err_out: return ret; } -static void hns_roce_srq_free(struct hns_roce_dev *hr_dev, - struct hns_roce_srq *srq) +static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) { struct hns_roce_srq_table *srq_table = &hr_dev->srq_table; int ret; @@ -175,185 +176,104 @@ static void hns_roce_srq_free(struct hns_roce_dev *hr_dev, hns_roce_bitmap_free(&srq_table->bitmap, srq->srqn, BITMAP_NO_RR); } -static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata, - int srq_buf_size) +static int alloc_srq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, + struct ib_udata *udata, unsigned long addr) { - struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device); - struct hns_roce_ib_create_srq ucmd; - int page_shift; - int page_count; - int ret; - - if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) - return -EFAULT; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_buf_attr buf_attr = {}; + int sge_size; + int err; - srq->umem = - ib_umem_get(srq->ibsrq.device, ucmd.buf_addr, srq_buf_size, 0); - if (IS_ERR(srq->umem)) - return PTR_ERR(srq->umem); + sge_size = roundup_pow_of_two(max(HNS_ROCE_SGE_SIZE, + HNS_ROCE_SGE_SIZE * srq->max_gs)); - page_count = (ib_umem_page_count(srq->umem) + - (1 << hr_dev->caps.srqwqe_buf_pg_sz) - 1) / - (1 << hr_dev->caps.srqwqe_buf_pg_sz); - page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz; - ret = hns_roce_mtt_init(hr_dev, page_count, page_shift, - &srq->mtt); - if (ret) - goto err_user_buf; + srq->wqe_shift = ilog2(sge_size); - ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->mtt, srq->umem); - if (ret) - goto err_user_srq_mtt; - - /* config index queue BA */ - srq->idx_que.umem = ib_umem_get(srq->ibsrq.device, ucmd.que_addr, - srq->idx_que.buf_size, 0); - if (IS_ERR(srq->idx_que.umem)) { - dev_err(hr_dev->dev, "ib_umem_get error for index queue\n"); - ret = PTR_ERR(srq->idx_que.umem); - goto err_user_srq_mtt; - } + buf_attr.page_shift = hr_dev->caps.srqwqe_buf_pg_sz + PAGE_ADDR_SHIFT; + buf_attr.region[0].size = srq->wqe_cnt * sge_size; + buf_attr.region[0].hopnum = hr_dev->caps.srqwqe_hop_num; + buf_attr.region_count = 1; + buf_attr.fixed_page = true; - page_count = DIV_ROUND_UP(ib_umem_page_count(srq->idx_que.umem), - 1 << hr_dev->caps.idx_buf_pg_sz); - page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz; - ret = hns_roce_mtt_init(hr_dev, page_count, page_shift, - &srq->idx_que.mtt); - if (ret) { - dev_err(hr_dev->dev, "hns_roce_mtt_init error for idx que\n"); - goto err_user_idx_mtt; - } - - ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->idx_que.mtt, - srq->idx_que.umem); - if (ret) { - dev_err(hr_dev->dev, - "hns_roce_ib_umem_write_mtt error for idx que\n"); - goto err_user_idx_buf; - } + err = hns_roce_mtr_create(hr_dev, &srq->buf_mtr, &buf_attr, + hr_dev->caps.srqwqe_ba_pg_sz + + PAGE_ADDR_SHIFT, udata, addr); + if (err) + ibdev_err(ibdev, "Failed to alloc SRQ buf mtr, err %d\n", err); - return 0; - -err_user_idx_buf: - hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt); - -err_user_idx_mtt: - ib_umem_release(srq->idx_que.umem); - -err_user_srq_mtt: - hns_roce_mtt_cleanup(hr_dev, &srq->mtt); - -err_user_buf: - ib_umem_release(srq->umem); + return err; +} - return ret; +static void free_srq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) +{ + hns_roce_mtr_destroy(hr_dev, &srq->buf_mtr); } -static int hns_roce_create_idx_que(struct ib_pd *pd, struct hns_roce_srq *srq, - u32 page_shift) +static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, + struct ib_udata *udata, unsigned long addr) { - struct hns_roce_dev *hr_dev = to_hr_dev(pd->device); struct hns_roce_idx_que *idx_que = &srq->idx_que; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_buf_attr buf_attr = {}; + int err; - idx_que->bitmap = bitmap_zalloc(srq->wqe_cnt, GFP_KERNEL); - if (!idx_que->bitmap) - return -ENOMEM; + srq->idx_que.entry_sz = HNS_ROCE_IDX_QUE_ENTRY_SZ; + + buf_attr.page_shift = hr_dev->caps.idx_buf_pg_sz + PAGE_ADDR_SHIFT; + buf_attr.region[0].size = srq->wqe_cnt * HNS_ROCE_IDX_QUE_ENTRY_SZ; + buf_attr.region[0].hopnum = hr_dev->caps.idx_hop_num; + buf_attr.region_count = 1; + buf_attr.fixed_page = true; + + err = hns_roce_mtr_create(hr_dev, &idx_que->mtr, &buf_attr, + hr_dev->caps.idx_ba_pg_sz + PAGE_ADDR_SHIFT, + udata, addr); + if (err) { + ibdev_err(ibdev, "Failed to alloc SRQ idx mtr, err %d\n", err); + return err; + } - idx_que->buf_size = srq->idx_que.buf_size; + if (!udata) { + idx_que->bitmap = bitmap_zalloc(srq->wqe_cnt, GFP_KERNEL); + if (!idx_que->bitmap) { + ibdev_err(ibdev, "Failed to alloc SRQ idx bitmap\n"); + err = -ENOMEM; + goto err_idx_mtr; + } - if (hns_roce_buf_alloc(hr_dev, idx_que->buf_size, (1 << page_shift) * 2, - &idx_que->idx_buf, page_shift)) { - bitmap_free(idx_que->bitmap); - return -ENOMEM; } return 0; +err_idx_mtr: + hns_roce_mtr_destroy(hr_dev, &idx_que->mtr); + + return err; } -static int create_kernel_srq(struct hns_roce_srq *srq, int srq_buf_size) +static void free_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) { - struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device); - u32 page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz; - int ret; + struct hns_roce_idx_que *idx_que = &srq->idx_que; - if (hns_roce_buf_alloc(hr_dev, srq_buf_size, (1 << page_shift) * 2, - &srq->buf, page_shift)) - return -ENOMEM; + bitmap_free(idx_que->bitmap); + idx_que->bitmap = NULL; + hns_roce_mtr_destroy(hr_dev, &idx_que->mtr); +} +static int alloc_srq_wrid(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) +{ srq->head = 0; srq->tail = srq->wqe_cnt - 1; - - ret = hns_roce_mtt_init(hr_dev, srq->buf.npages, srq->buf.page_shift, - &srq->mtt); - if (ret) - goto err_kernel_buf; - - ret = hns_roce_buf_write_mtt(hr_dev, &srq->mtt, &srq->buf); - if (ret) - goto err_kernel_srq_mtt; - - page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz; - ret = hns_roce_create_idx_que(srq->ibsrq.pd, srq, page_shift); - if (ret) { - dev_err(hr_dev->dev, "Create idx queue fail(%d)!\n", ret); - goto err_kernel_srq_mtt; - } - - /* Init mtt table for idx_que */ - ret = hns_roce_mtt_init(hr_dev, srq->idx_que.idx_buf.npages, - srq->idx_que.idx_buf.page_shift, - &srq->idx_que.mtt); - if (ret) - goto err_kernel_create_idx; - - /* Write buffer address into the mtt table */ - ret = hns_roce_buf_write_mtt(hr_dev, &srq->idx_que.mtt, - &srq->idx_que.idx_buf); - if (ret) - goto err_kernel_idx_buf; - srq->wrid = kvmalloc_array(srq->wqe_cnt, sizeof(u64), GFP_KERNEL); - if (!srq->wrid) { - ret = -ENOMEM; - goto err_kernel_idx_buf; - } + if (!srq->wrid) + return -ENOMEM; return 0; - -err_kernel_idx_buf: - hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt); - -err_kernel_create_idx: - hns_roce_buf_free(hr_dev, &srq->idx_que.idx_buf); - kfree(srq->idx_que.bitmap); - -err_kernel_srq_mtt: - hns_roce_mtt_cleanup(hr_dev, &srq->mtt); - -err_kernel_buf: - hns_roce_buf_free(hr_dev, &srq->buf); - - return ret; -} - -static void destroy_user_srq(struct hns_roce_dev *hr_dev, - struct hns_roce_srq *srq) -{ - hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt); - ib_umem_release(srq->idx_que.umem); - hns_roce_mtt_cleanup(hr_dev, &srq->mtt); - ib_umem_release(srq->umem); } -static void destroy_kernel_srq(struct hns_roce_dev *hr_dev, - struct hns_roce_srq *srq) +static void free_srq_wrid(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) { - kvfree(srq->wrid); - hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt); - hns_roce_buf_free(hr_dev, &srq->idx_que.idx_buf); - kfree(srq->idx_que.bitmap); - hns_roce_mtt_cleanup(hr_dev, &srq->mtt); - hns_roce_buf_free(hr_dev, &srq->buf); + kfree(srq->wrid); + srq->wrid = NULL; } int hns_roce_create_srq(struct ib_srq *ib_srq, @@ -363,8 +283,8 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, struct hns_roce_dev *hr_dev = to_hr_dev(ib_srq->device); struct hns_roce_ib_create_srq_resp resp = {}; struct hns_roce_srq *srq = to_hr_srq(ib_srq); - int srq_desc_size; - int srq_buf_size; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_ib_create_srq ucmd = {}; int ret = 0; u32 cqn; @@ -379,41 +299,45 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, srq->wqe_cnt = roundup_pow_of_two(init_attr->attr.max_wr + 1); srq->max_gs = init_attr->attr.max_sge; - srq_desc_size = roundup_pow_of_two(max(HNS_ROCE_SGE_SIZE, - HNS_ROCE_SGE_SIZE * srq->max_gs)); - - srq->wqe_shift = ilog2(srq_desc_size); - - srq_buf_size = srq->wqe_cnt * srq_desc_size; - - srq->idx_que.entry_sz = HNS_ROCE_IDX_QUE_ENTRY_SZ; - srq->idx_que.buf_size = srq->wqe_cnt * srq->idx_que.entry_sz; - srq->mtt.mtt_type = MTT_TYPE_SRQWQE; - srq->idx_que.mtt.mtt_type = MTT_TYPE_IDX; - if (udata) { - ret = create_user_srq(srq, udata, srq_buf_size); + ret = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); if (ret) { - dev_err(hr_dev->dev, "Create user srq failed\n"); - goto err_srq; + ibdev_err(ibdev, "Failed to copy SRQ udata, err %d\n", + ret); + return ret; } - } else { - ret = create_kernel_srq(srq, srq_buf_size); + } + + ret = alloc_srq_buf(hr_dev, srq, udata, ucmd.buf_addr); + if (ret) { + ibdev_err(ibdev, "Failed to alloc SRQ buffer, err %d\n", ret); + return ret; + } + + ret = alloc_srq_idx(hr_dev, srq, udata, ucmd.que_addr); + if (ret) { + ibdev_err(ibdev, "Failed to alloc SRQ idx, err %d\n", ret); + goto err_buf_alloc; + } + + if (!udata) { + ret = alloc_srq_wrid(hr_dev, srq); if (ret) { - dev_err(hr_dev->dev, "Create kernel srq failed\n"); - goto err_srq; + ibdev_err(ibdev, "Failed to alloc SRQ wrid, err %d\n", + ret); + goto err_idx_alloc; } } cqn = ib_srq_has_cq(init_attr->srq_type) ? to_hr_cq(init_attr->ext.cq)->cqn : 0; - srq->db_reg_l = hr_dev->reg_base + SRQ_DB_REG; - ret = hns_roce_srq_alloc(hr_dev, to_hr_pd(ib_srq->pd)->pdn, cqn, 0, - &srq->mtt, 0, srq); - if (ret) - goto err_wrid; + ret = alloc_srqc(hr_dev, srq, to_hr_pd(ib_srq->pd)->pdn, cqn, 0, 0); + if (ret) { + ibdev_err(ibdev, "Failed to alloc SRQ context, err %d\n", ret); + goto err_wrid_alloc; + } srq->event = hns_roce_ib_srq_event; resp.srqn = srq->srqn; @@ -429,15 +353,13 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, return 0; err_srqc_alloc: - hns_roce_srq_free(hr_dev, srq); - -err_wrid: - if (udata) - destroy_user_srq(hr_dev, srq); - else - destroy_kernel_srq(hr_dev, srq); - -err_srq: + free_srqc(hr_dev, srq); +err_wrid_alloc: + free_srq_wrid(hr_dev, srq); +err_idx_alloc: + free_srq_idx(hr_dev, srq); +err_buf_alloc: + free_srq_buf(hr_dev, srq); return ret; } @@ -446,17 +368,10 @@ void hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device); struct hns_roce_srq *srq = to_hr_srq(ibsrq); - hns_roce_srq_free(hr_dev, srq); - hns_roce_mtt_cleanup(hr_dev, &srq->mtt); - - if (udata) { - hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt); - } else { - kvfree(srq->wrid); - hns_roce_buf_free(hr_dev, &srq->buf); - } - ib_umem_release(srq->idx_que.umem); - ib_umem_release(srq->umem); + free_srqc(hr_dev, srq); + free_srq_idx(hr_dev, srq); + free_srq_wrid(hr_dev, srq); + free_srq_buf(hr_dev, srq); } int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev) -- cgit v1.2.3 From 744b7bdfa79edb30bb7d5f9ae43b65e0d147533a Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Mon, 13 Apr 2020 19:58:11 +0800 Subject: RDMA/hns: Support 0 hop addressing for CQE buffer Add the zero hop addressing support by using mtr interface for CQE buffer, so the hns driver can support addressing hopnum between 0 to 3 for CQE. Link: https://lore.kernel.org/r/1586779091-51410-7-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cq.c | 351 +++++++++------------------- drivers/infiniband/hw/hns/hns_roce_device.h | 8 +- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 13 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 15 +- 4 files changed, 122 insertions(+), 265 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 92798ff6360d..d2d7074bbe69 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -39,51 +39,40 @@ #include #include "hns_roce_common.h" -static int hns_roce_alloc_cqc(struct hns_roce_dev *hr_dev, - struct hns_roce_cq *hr_cq) +static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) { struct hns_roce_cmd_mailbox *mailbox; - struct hns_roce_hem_table *mtt_table; struct hns_roce_cq_table *cq_table; - struct device *dev = hr_dev->dev; + struct ib_device *ibdev = &hr_dev->ib_dev; + u64 mtts[MTT_MIN_COUNT] = { 0 }; dma_addr_t dma_handle; - u64 *mtts; int ret; - cq_table = &hr_dev->cq_table; - - /* Get the physical address of cq buf */ - if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) - mtt_table = &hr_dev->mr_table.mtt_cqe_table; - else - mtt_table = &hr_dev->mr_table.mtt_table; - - mtts = hns_roce_table_find(hr_dev, mtt_table, hr_cq->mtt.first_seg, - &dma_handle); - - if (!mtts) { - dev_err(dev, "Failed to find mtt for CQ buf.\n"); + ret = hns_roce_mtr_find(hr_dev, &hr_cq->mtr, 0, mtts, ARRAY_SIZE(mtts), + &dma_handle); + if (ret < 1) { + ibdev_err(ibdev, "Failed to find CQ mtr\n"); return -EINVAL; } + cq_table = &hr_dev->cq_table; ret = hns_roce_bitmap_alloc(&cq_table->bitmap, &hr_cq->cqn); if (ret) { - dev_err(dev, "Num of CQ out of range.\n"); + ibdev_err(ibdev, "Failed to alloc CQ bitmap, err %d\n", ret); return ret; } /* Get CQC memory HEM(Hardware Entry Memory) table */ ret = hns_roce_table_get(hr_dev, &cq_table->table, hr_cq->cqn); if (ret) { - dev_err(dev, - "Get context mem failed(%d) when CQ(0x%lx) alloc.\n", - ret, hr_cq->cqn); + ibdev_err(ibdev, "Failed to get CQ(0x%lx) context, err %d\n", + hr_cq->cqn, ret); goto err_out; } ret = xa_err(xa_store(&cq_table->array, hr_cq->cqn, hr_cq, GFP_KERNEL)); if (ret) { - dev_err(dev, "Failed to xa_store CQ.\n"); + ibdev_err(ibdev, "Failed to xa_store CQ\n"); goto err_put; } @@ -101,9 +90,9 @@ static int hns_roce_alloc_cqc(struct hns_roce_dev *hr_dev, HNS_ROCE_CMD_CREATE_CQC, HNS_ROCE_CMD_TIMEOUT_MSECS); hns_roce_free_cmd_mailbox(hr_dev, mailbox); if (ret) { - dev_err(dev, - "Send cmd mailbox failed(%d) when CQ(0x%lx) alloc.\n", - ret, hr_cq->cqn); + ibdev_err(ibdev, + "Failed to send create cmd for CQ(0x%lx), err %d\n", + hr_cq->cqn, ret); goto err_xa; } @@ -126,7 +115,7 @@ err_out: return ret; } -void hns_roce_free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) +static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) { struct hns_roce_cq_table *cq_table = &hr_dev->cq_table; struct device *dev = hr_dev->dev; @@ -153,190 +142,86 @@ void hns_roce_free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) hns_roce_bitmap_free(&cq_table->bitmap, hr_cq->cqn, BITMAP_NO_RR); } -static int get_cq_umem(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, - struct hns_roce_ib_create_cq ucmd, - struct ib_udata *udata) -{ - struct hns_roce_mtt *mtt = &hr_cq->mtt; - struct ib_umem **umem = &hr_cq->umem; - u32 npages; - int ret; - - *umem = ib_umem_get(&hr_dev->ib_dev, ucmd.buf_addr, hr_cq->buf_size, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(*umem)) - return PTR_ERR(*umem); - - if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) - mtt->mtt_type = MTT_TYPE_CQE; - else - mtt->mtt_type = MTT_TYPE_WQE; - - npages = DIV_ROUND_UP(ib_umem_page_count(*umem), - 1 << hr_dev->caps.cqe_buf_pg_sz); - ret = hns_roce_mtt_init(hr_dev, npages, hr_cq->page_shift, mtt); - if (ret) - goto err_buf; - - ret = hns_roce_ib_umem_write_mtt(hr_dev, mtt, *umem); - if (ret) - goto err_mtt; - - return 0; - -err_mtt: - hns_roce_mtt_cleanup(hr_dev, mtt); - -err_buf: - ib_umem_release(*umem); - return ret; -} - -static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) +static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, + struct ib_udata *udata, unsigned long addr) { - struct hns_roce_buf *buf = &hr_cq->buf; - struct hns_roce_mtt *mtt = &hr_cq->mtt; - int ret; - - ret = hns_roce_buf_alloc(hr_dev, hr_cq->buf_size, - (1 << hr_cq->page_shift) * 2, - buf, hr_cq->page_shift); - if (ret) - goto out; - - if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) - mtt->mtt_type = MTT_TYPE_CQE; - else - mtt->mtt_type = MTT_TYPE_WQE; - - ret = hns_roce_mtt_init(hr_dev, buf->npages, buf->page_shift, mtt); - if (ret) - goto err_buf; - - ret = hns_roce_buf_write_mtt(hr_dev, mtt, buf); - if (ret) - goto err_mtt; - - return 0; - -err_mtt: - hns_roce_mtt_cleanup(hr_dev, mtt); - -err_buf: - hns_roce_buf_free(hr_dev, buf); - -out: - return ret; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_buf_attr buf_attr = {}; + int err; + + buf_attr.page_shift = hr_dev->caps.cqe_buf_pg_sz + PAGE_ADDR_SHIFT; + buf_attr.region[0].size = hr_cq->cq_depth * hr_dev->caps.cq_entry_sz; + buf_attr.region[0].hopnum = hr_dev->caps.cqe_hop_num; + buf_attr.region_count = 1; + buf_attr.fixed_page = true; + + err = hns_roce_mtr_create(hr_dev, &hr_cq->mtr, &buf_attr, + hr_dev->caps.cqe_ba_pg_sz + PAGE_ADDR_SHIFT, + udata, addr); + if (err) + ibdev_err(ibdev, "Failed to alloc CQ mtr, err %d\n", err); + + return err; } static void free_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) { - hns_roce_buf_free(hr_dev, &hr_cq->buf); + hns_roce_mtr_destroy(hr_dev, &hr_cq->mtr); } -static int create_user_cq(struct hns_roce_dev *hr_dev, - struct hns_roce_cq *hr_cq, - struct ib_udata *udata, - struct hns_roce_ib_create_cq_resp *resp) +static int alloc_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, + struct ib_udata *udata, unsigned long addr, + struct hns_roce_ib_create_cq_resp *resp) { - struct hns_roce_ib_create_cq ucmd; - struct device *dev = hr_dev->dev; - int ret; - struct hns_roce_ucontext *context = rdma_udata_to_drv_context( - udata, struct hns_roce_ucontext, ibucontext); - - if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { - dev_err(dev, "Failed to copy_from_udata.\n"); - return -EFAULT; - } + bool has_db = hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB; + struct hns_roce_ucontext *uctx; + int err; - /* Get user space address, write it into mtt table */ - ret = get_cq_umem(hr_dev, hr_cq, ucmd, udata); - if (ret) { - dev_err(dev, "Failed to get_cq_umem.\n"); - return ret; - } - - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB && - udata->outlen >= offsetofend(typeof(*resp), cap_flags)) { - ret = hns_roce_db_map_user(context, udata, ucmd.db_addr, - &hr_cq->db); - if (ret) { - dev_err(dev, "cq record doorbell map failed!\n"); - goto err_mtt; + if (udata) { + if (has_db && + udata->outlen >= offsetofend(typeof(*resp), cap_flags)) { + uctx = rdma_udata_to_drv_context(udata, + struct hns_roce_ucontext, ibucontext); + err = hns_roce_db_map_user(uctx, udata, addr, + &hr_cq->db); + if (err) + return err; + hr_cq->db_en = 1; + resp->cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB; } - hr_cq->db_en = 1; - resp->cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB; - } - - return 0; - -err_mtt: - hns_roce_mtt_cleanup(hr_dev, &hr_cq->mtt); - ib_umem_release(hr_cq->umem); - - return ret; -} - -static int create_kernel_cq(struct hns_roce_dev *hr_dev, - struct hns_roce_cq *hr_cq) -{ - struct device *dev = hr_dev->dev; - int ret; - - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) { - ret = hns_roce_alloc_db(hr_dev, &hr_cq->db, 1); - if (ret) - return ret; - - hr_cq->set_ci_db = hr_cq->db.db_record; - *hr_cq->set_ci_db = 0; - hr_cq->db_en = 1; - } - - /* Init mtt table and write buff address to mtt table */ - ret = alloc_cq_buf(hr_dev, hr_cq); - if (ret) { - dev_err(dev, "Failed to alloc_cq_buf.\n"); - goto err_db; + } else { + if (has_db) { + err = hns_roce_alloc_db(hr_dev, &hr_cq->db, 1); + if (err) + return err; + hr_cq->set_ci_db = hr_cq->db.db_record; + *hr_cq->set_ci_db = 0; + hr_cq->db_en = 1; + } + hr_cq->cq_db_l = hr_dev->reg_base + hr_dev->odb_offset + + DB_REG_OFFSET * hr_dev->priv_uar.index; } - hr_cq->cq_db_l = hr_dev->reg_base + hr_dev->odb_offset + - DB_REG_OFFSET * hr_dev->priv_uar.index; - return 0; - -err_db: - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) - hns_roce_free_db(hr_dev, &hr_cq->db); - - return ret; } -static void destroy_user_cq(struct hns_roce_dev *hr_dev, - struct hns_roce_cq *hr_cq, - struct ib_udata *udata, - struct hns_roce_ib_create_cq_resp *resp) +static void free_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, + struct ib_udata *udata) { - struct hns_roce_ucontext *context = rdma_udata_to_drv_context( - udata, struct hns_roce_ucontext, ibucontext); + struct hns_roce_ucontext *uctx; - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB && - udata->outlen >= offsetofend(typeof(*resp), cap_flags)) - hns_roce_db_unmap_user(context, &hr_cq->db); - - hns_roce_mtt_cleanup(hr_dev, &hr_cq->mtt); - ib_umem_release(hr_cq->umem); -} - -static void destroy_kernel_cq(struct hns_roce_dev *hr_dev, - struct hns_roce_cq *hr_cq) -{ - hns_roce_mtt_cleanup(hr_dev, &hr_cq->mtt); - free_cq_buf(hr_dev, hr_cq); + if (!hr_cq->db_en) + return; - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) + hr_cq->db_en = 0; + if (udata) { + uctx = rdma_udata_to_drv_context(udata, + struct hns_roce_ucontext, + ibucontext); + hns_roce_db_unmap_user(uctx, &hr_cq->db); + } else { hns_roce_free_db(hr_dev, &hr_cq->db); + } } int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, @@ -345,20 +230,21 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device); struct hns_roce_ib_create_cq_resp resp = {}; struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq); - struct device *dev = hr_dev->dev; + struct ib_device *ibdev = &hr_dev->ib_dev; + struct hns_roce_ib_create_cq ucmd = {}; int vector = attr->comp_vector; u32 cq_entries = attr->cqe; int ret; if (cq_entries < 1 || cq_entries > hr_dev->caps.max_cqes) { - dev_err(dev, "Create CQ failed. entries=%d, max=%d\n", - cq_entries, hr_dev->caps.max_cqes); + ibdev_err(ibdev, "Failed to check CQ count %d max=%d\n", + cq_entries, hr_dev->caps.max_cqes); return -EINVAL; } if (vector >= hr_dev->caps.num_comp_vectors) { - dev_err(dev, "Create CQ failed, vector=%d, max=%d\n", - vector, hr_dev->caps.num_comp_vectors); + ibdev_err(ibdev, "Failed to check CQ vector=%d max=%d\n", + vector, hr_dev->caps.num_comp_vectors); return -EINVAL; } @@ -367,30 +253,35 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, hr_cq->ib_cq.cqe = cq_entries - 1; /* used as cqe index */ hr_cq->cq_depth = cq_entries; hr_cq->vector = vector; - hr_cq->buf_size = hr_cq->cq_depth * hr_dev->caps.cq_entry_sz; - hr_cq->page_shift = PAGE_SHIFT + hr_dev->caps.cqe_buf_pg_sz; spin_lock_init(&hr_cq->lock); INIT_LIST_HEAD(&hr_cq->sq_list); INIT_LIST_HEAD(&hr_cq->rq_list); if (udata) { - ret = create_user_cq(hr_dev, hr_cq, udata, &resp); - if (ret) { - dev_err(dev, "Create cq failed in user mode!\n"); - goto err_cq; - } - } else { - ret = create_kernel_cq(hr_dev, hr_cq); + ret = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); if (ret) { - dev_err(dev, "Create cq failed in kernel mode!\n"); - goto err_cq; + ibdev_err(ibdev, "Failed to copy CQ udata, err %d\n", + ret); + return ret; } } - ret = hns_roce_alloc_cqc(hr_dev, hr_cq); + ret = alloc_cq_buf(hr_dev, hr_cq, udata, ucmd.buf_addr); + if (ret) { + ibdev_err(ibdev, "Failed to alloc CQ buf, err %d\n", ret); + return ret; + } + + ret = alloc_cq_db(hr_dev, hr_cq, udata, ucmd.db_addr, &resp); if (ret) { - dev_err(dev, "Alloc CQ failed(%d).\n", ret); - goto err_dbmap; + ibdev_err(ibdev, "Failed to alloc CQ db, err %d\n", ret); + goto err_cq_buf; + } + + ret = alloc_cqc(hr_dev, hr_cq); + if (ret) { + ibdev_err(ibdev, "Failed to alloc CQ context, err %d\n", ret); + goto err_cq_db; } /* @@ -412,15 +303,11 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, return 0; err_cqc: - hns_roce_free_cqc(hr_dev, hr_cq); - -err_dbmap: - if (udata) - destroy_user_cq(hr_dev, hr_cq, udata, &resp); - else - destroy_kernel_cq(hr_dev, hr_cq); - -err_cq: + free_cqc(hr_dev, hr_cq); +err_cq_db: + free_cq_db(hr_dev, hr_cq, udata); +err_cq_buf: + free_cq_buf(hr_dev, hr_cq); return ret; } @@ -429,28 +316,12 @@ void hns_roce_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device); struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq); - if (hr_dev->hw->destroy_cq) { + if (hr_dev->hw->destroy_cq) hr_dev->hw->destroy_cq(ib_cq, udata); - return; - } - - hns_roce_free_cqc(hr_dev, hr_cq); - hns_roce_mtt_cleanup(hr_dev, &hr_cq->mtt); - ib_umem_release(hr_cq->umem); - if (udata) { - if (hr_cq->db_en == 1) - hns_roce_db_unmap_user(rdma_udata_to_drv_context( - udata, - struct hns_roce_ucontext, - ibucontext), - &hr_cq->db); - } else { - /* Free the buff of stored cq */ - free_cq_buf(hr_dev, hr_cq); - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) - hns_roce_free_db(hr_dev, &hr_cq->db); - } + free_cq_buf(hr_dev, hr_cq); + free_cq_db(hr_dev, hr_cq, udata); + free_cqc(hr_dev, hr_cq); } void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 39af7366126a..ecbfeb6dbdd4 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -503,14 +503,10 @@ struct hns_roce_db { struct hns_roce_cq { struct ib_cq ib_cq; - struct hns_roce_buf buf; - struct hns_roce_mtt mtt; + struct hns_roce_mtr mtr; struct hns_roce_db db; u8 db_en; spinlock_t lock; - struct ib_umem *umem; - u32 buf_size; - int page_shift; u32 cq_depth; u32 cons_index; u32 *set_ci_db; @@ -1294,8 +1290,6 @@ int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr, struct ib_udata *udata); void hns_roce_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata); -void hns_roce_free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq); - int hns_roce_db_map_user(struct hns_roce_ucontext *context, struct ib_udata *udata, unsigned long virt, struct hns_roce_db *db); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index ddf2a454b525..a1f053cd30b9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -1972,7 +1972,8 @@ static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, static void *get_cqe(struct hns_roce_cq *hr_cq, int n) { - return hns_roce_buf_offset(&hr_cq->buf, n * HNS_ROCE_V1_CQE_ENTRY_SIZE); + return hns_roce_buf_offset(hr_cq->mtr.kmem, + n * HNS_ROCE_V1_CQE_ENTRY_SIZE); } static void *get_sw_cqe(struct hns_roce_cq *hr_cq, int n) @@ -3644,8 +3645,6 @@ static void hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) u32 cqe_cnt_cur; int wait_time = 0; - hns_roce_free_cqc(hr_dev, hr_cq); - /* * Before freeing cq buffer, we need to ensure that the outstanding CQE * have been written by checking the CQE counter. @@ -3668,14 +3667,6 @@ static void hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) } wait_time++; } - - hns_roce_mtt_cleanup(hr_dev, &hr_cq->mtt); - - ib_umem_release(hr_cq->umem); - if (!udata) { - /* Free the buff of stored cq */ - hns_roce_buf_free(hr_dev, &hr_cq->buf); - } } static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 1aed542fbdc5..833e9298f0dc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2680,7 +2680,8 @@ static int hns_roce_v2_mw_write_mtpt(void *mb_buf, struct hns_roce_mw *mw) static void *get_cqe_v2(struct hns_roce_cq *hr_cq, int n) { - return hns_roce_buf_offset(&hr_cq->buf, n * HNS_ROCE_V2_CQE_ENTRY_SIZE); + return hns_roce_buf_offset(hr_cq->mtr.kmem, + n * HNS_ROCE_V2_CQE_ENTRY_SIZE); } static void *get_sw_cqe_v2(struct hns_roce_cq *hr_cq, int n) @@ -2801,30 +2802,30 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, roce_set_field(cq_context->byte_8_cqn, V2_CQC_BYTE_8_CQN_M, V2_CQC_BYTE_8_CQN_S, hr_cq->cqn); - cq_context->cqe_cur_blk_addr = cpu_to_le32(mtts[0] >> PAGE_ADDR_SHIFT); + cq_context->cqe_cur_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[0])); roce_set_field(cq_context->byte_16_hop_addr, V2_CQC_BYTE_16_CQE_CUR_BLK_ADDR_M, V2_CQC_BYTE_16_CQE_CUR_BLK_ADDR_S, - mtts[0] >> (32 + PAGE_ADDR_SHIFT)); + upper_32_bits(to_hr_hw_page_addr(mtts[0]))); roce_set_field(cq_context->byte_16_hop_addr, V2_CQC_BYTE_16_CQE_HOP_NUM_M, V2_CQC_BYTE_16_CQE_HOP_NUM_S, hr_dev->caps.cqe_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : hr_dev->caps.cqe_hop_num); - cq_context->cqe_nxt_blk_addr = cpu_to_le32(mtts[1] >> PAGE_ADDR_SHIFT); + cq_context->cqe_nxt_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[1])); roce_set_field(cq_context->byte_24_pgsz_addr, V2_CQC_BYTE_24_CQE_NXT_BLK_ADDR_M, V2_CQC_BYTE_24_CQE_NXT_BLK_ADDR_S, - mtts[1] >> (32 + PAGE_ADDR_SHIFT)); + upper_32_bits(to_hr_hw_page_addr(mtts[1]))); roce_set_field(cq_context->byte_24_pgsz_addr, V2_CQC_BYTE_24_CQE_BA_PG_SZ_M, V2_CQC_BYTE_24_CQE_BA_PG_SZ_S, - hr_dev->caps.cqe_ba_pg_sz + PG_SHIFT_OFFSET); + to_hr_hw_page_shift(hr_cq->mtr.hem_cfg.ba_pg_shift)); roce_set_field(cq_context->byte_24_pgsz_addr, V2_CQC_BYTE_24_CQE_BUF_PG_SZ_M, V2_CQC_BYTE_24_CQE_BUF_PG_SZ_S, - hr_dev->caps.cqe_buf_pg_sz + PG_SHIFT_OFFSET); + to_hr_hw_page_shift(hr_cq->mtr.hem_cfg.buf_pg_shift)); cq_context->cqe_ba = cpu_to_le32(dma_handle >> 3); -- cgit v1.2.3 From 322f3d45a17f64494152bd5583b68c8855b539c0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 19 Apr 2020 16:20:46 +0300 Subject: RDMA/bnxt: Delete 'nq_ptr' variable which is not used The variable "nq_ptr" is set but never used, this generates the following warning while compiling kernel with W=1 option. drivers/infiniband/hw/bnxt_re/qplib_fp.c: In function 'bnxt_qplib_service_nq': drivers/infiniband/hw/bnxt_re/qplib_fp.c:303:25: warning: variable 'nq_ptr' set but not used [-Wunused-but-set-variable] 303 | struct nq_base *nqe, **nq_ptr; | Fixes: fddcbbb02af4 ("RDMA/bnxt_re: Simplify obtaining queue entry from hw ring") Link: https://lore.kernel.org/r/20200419132046.123887-1-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index a4de56bdd6e8..c5e29577cd43 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -300,12 +300,12 @@ static void bnxt_qplib_service_nq(unsigned long data) { struct bnxt_qplib_nq *nq = (struct bnxt_qplib_nq *)data; struct bnxt_qplib_hwq *hwq = &nq->hwq; - struct nq_base *nqe, **nq_ptr; int num_srqne_processed = 0; int num_cqne_processed = 0; struct bnxt_qplib_cq *cq; int budget = nq->budget; u32 sw_cons, raw_cons; + struct nq_base *nqe; uintptr_t q_handle; u16 type; @@ -314,7 +314,6 @@ static void bnxt_qplib_service_nq(unsigned long data) raw_cons = hwq->cons; while (budget--) { sw_cons = HWQ_CMP(raw_cons, hwq); - nq_ptr = (struct nq_base **)hwq->pbl_ptr; nqe = bnxt_qplib_get_qe(hwq, sw_cons, NULL); if (!NQE_CMP_VALID(nqe, raw_cons, hwq->max_elements)) break; -- cgit v1.2.3 From 9976ea27b526365c4a9a6d3336a8b06f839ec26d Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Wed, 15 Apr 2020 16:14:30 +0800 Subject: RDMA/hns: Optimize hns_roce_config_link_table() Remove the unnecessary memset operation and adjust style of some lines in hns_roce_config_link_table(). Link: https://lore.kernel.org/r/1586938475-37049-2-git-send-email-liweihang@huawei.com Signed-off-by: Lijun Ou Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 53 ++++++++++++------------------ 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 833e9298f0dc..dbbc5d42d59f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2040,8 +2040,6 @@ static int hns_roce_config_link_table(struct hns_roce_dev *hr_dev, page_num = link_tbl->npages; entry = link_tbl->table.buf; - memset(req_a, 0, sizeof(*req_a)); - memset(req_b, 0, sizeof(*req_b)); for (i = 0; i < 2; i++) { hns_roce_cmq_setup_basic_desc(&desc[i], opcode, false); @@ -2050,39 +2048,30 @@ static int hns_roce_config_link_table(struct hns_roce_dev *hr_dev, desc[i].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT); else desc[i].flag &= ~cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT); - - if (i == 0) { - req_a->base_addr_l = - cpu_to_le32(link_tbl->table.map & 0xffffffff); - req_a->base_addr_h = - cpu_to_le32(link_tbl->table.map >> 32); - roce_set_field(req_a->depth_pgsz_init_en, - CFG_LLM_QUE_DEPTH_M, CFG_LLM_QUE_DEPTH_S, - link_tbl->npages); - roce_set_field(req_a->depth_pgsz_init_en, - CFG_LLM_QUE_PGSZ_M, CFG_LLM_QUE_PGSZ_S, - link_tbl->pg_sz); - req_a->head_ba_l = cpu_to_le32(entry[0].blk_ba0); - req_a->head_ba_h_nxtptr = - cpu_to_le32(entry[0].blk_ba1_nxt_ptr); - roce_set_field(req_a->head_ptr, CFG_LLM_HEAD_PTR_M, - CFG_LLM_HEAD_PTR_S, 0); - } else { - req_b->tail_ba_l = - cpu_to_le32(entry[page_num - 1].blk_ba0); - roce_set_field(req_b->tail_ba_h, CFG_LLM_TAIL_BA_H_M, - CFG_LLM_TAIL_BA_H_S, - entry[page_num - 1].blk_ba1_nxt_ptr & - HNS_ROCE_LINK_TABLE_BA1_M); - roce_set_field(req_b->tail_ptr, CFG_LLM_TAIL_PTR_M, - CFG_LLM_TAIL_PTR_S, - (entry[page_num - 2].blk_ba1_nxt_ptr & - HNS_ROCE_LINK_TABLE_NXT_PTR_M) >> - HNS_ROCE_LINK_TABLE_NXT_PTR_S); - } } + + req_a->base_addr_l = cpu_to_le32(link_tbl->table.map & 0xffffffff); + req_a->base_addr_h = cpu_to_le32(link_tbl->table.map >> 32); + roce_set_field(req_a->depth_pgsz_init_en, CFG_LLM_QUE_DEPTH_M, + CFG_LLM_QUE_DEPTH_S, link_tbl->npages); + roce_set_field(req_a->depth_pgsz_init_en, CFG_LLM_QUE_PGSZ_M, + CFG_LLM_QUE_PGSZ_S, link_tbl->pg_sz); roce_set_field(req_a->depth_pgsz_init_en, CFG_LLM_INIT_EN_M, CFG_LLM_INIT_EN_S, 1); + req_a->head_ba_l = cpu_to_le32(entry[0].blk_ba0); + req_a->head_ba_h_nxtptr = cpu_to_le32(entry[0].blk_ba1_nxt_ptr); + roce_set_field(req_a->head_ptr, CFG_LLM_HEAD_PTR_M, CFG_LLM_HEAD_PTR_S, + 0); + + req_b->tail_ba_l = cpu_to_le32(entry[page_num - 1].blk_ba0); + roce_set_field(req_b->tail_ba_h, CFG_LLM_TAIL_BA_H_M, + CFG_LLM_TAIL_BA_H_S, + entry[page_num - 1].blk_ba1_nxt_ptr & + HNS_ROCE_LINK_TABLE_BA1_M); + roce_set_field(req_b->tail_ptr, CFG_LLM_TAIL_PTR_M, CFG_LLM_TAIL_PTR_S, + (entry[page_num - 2].blk_ba1_nxt_ptr & + HNS_ROCE_LINK_TABLE_NXT_PTR_M) >> + HNS_ROCE_LINK_TABLE_NXT_PTR_S); return hns_roce_cmq_send(hr_dev, desc, 2); } -- cgit v1.2.3 From 375898e83d26d6da7414f129ed417ad7cef2728f Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Wed, 15 Apr 2020 16:14:31 +0800 Subject: RDMA/hns: Optimize hns_roce_v2_set_mac() Removes the unnecessary memset opertaion and adjust style of some lines in hns_roce_v2_set_mac(). Link: https://lore.kernel.org/r/1586938475-37049-3-git-send-email-liweihang@huawei.com Signed-off-by: Lijun Ou Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index dbbc5d42d59f..0624d5ae15d0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2427,12 +2427,9 @@ static int hns_roce_v2_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, reg_smac_l = *(u32 *)(&addr[0]); reg_smac_h = *(u16 *)(&addr[4]); - memset(smac_tb, 0, sizeof(*smac_tb)); - roce_set_field(smac_tb->tb_idx_rsv, - CFG_SMAC_TB_IDX_M, + roce_set_field(smac_tb->tb_idx_rsv, CFG_SMAC_TB_IDX_M, CFG_SMAC_TB_IDX_S, phy_port); - roce_set_field(smac_tb->vf_smac_h_rsv, - CFG_SMAC_TB_VF_SMAC_H_M, + roce_set_field(smac_tb->vf_smac_h_rsv, CFG_SMAC_TB_VF_SMAC_H_M, CFG_SMAC_TB_VF_SMAC_H_S, reg_smac_h); smac_tb->vf_smac_l = cpu_to_le32(reg_smac_l); -- cgit v1.2.3 From a3de9e83810ced41ad7dece44c03f2338e44129d Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Wed, 15 Apr 2020 16:14:32 +0800 Subject: RDMA/hns: Simplify the qp state convert code Use type map table to reduce the cyclomatic complexity. Link: https://lore.kernel.org/r/1586938475-37049-4-git-send-email-liweihang@huawei.com Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 0624d5ae15d0..a1c819db3496 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4539,19 +4539,20 @@ out: return ret; } -static inline enum ib_qp_state to_ib_qp_st(enum hns_roce_v2_qp_state state) -{ - switch (state) { - case HNS_ROCE_QP_ST_RST: return IB_QPS_RESET; - case HNS_ROCE_QP_ST_INIT: return IB_QPS_INIT; - case HNS_ROCE_QP_ST_RTR: return IB_QPS_RTR; - case HNS_ROCE_QP_ST_RTS: return IB_QPS_RTS; - case HNS_ROCE_QP_ST_SQ_DRAINING: - case HNS_ROCE_QP_ST_SQD: return IB_QPS_SQD; - case HNS_ROCE_QP_ST_SQER: return IB_QPS_SQE; - case HNS_ROCE_QP_ST_ERR: return IB_QPS_ERR; - default: return -1; - } +static int to_ib_qp_st(enum hns_roce_v2_qp_state state) +{ + static const enum ib_qp_state map[] = { + [HNS_ROCE_QP_ST_RST] = IB_QPS_RESET, + [HNS_ROCE_QP_ST_INIT] = IB_QPS_INIT, + [HNS_ROCE_QP_ST_RTR] = IB_QPS_RTR, + [HNS_ROCE_QP_ST_RTS] = IB_QPS_RTS, + [HNS_ROCE_QP_ST_SQD] = IB_QPS_SQD, + [HNS_ROCE_QP_ST_SQER] = IB_QPS_SQE, + [HNS_ROCE_QP_ST_ERR] = IB_QPS_ERR, + [HNS_ROCE_QP_ST_SQ_DRAINING] = IB_QPS_SQD + }; + + return (state < ARRAY_SIZE(map)) ? map[state] : -1; } static int hns_roce_v2_query_qpc(struct hns_roce_dev *hr_dev, -- cgit v1.2.3 From 7c044adca272768d821921f11d3da4587dcec68a Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Wed, 15 Apr 2020 16:14:33 +0800 Subject: RDMA/hns: Simplify the cqe code of poll cq Encapsulate codes to get status of cqe into a function and use map table instead of switch-case to reduce cyclomatic complexity of hns_roce_v2_poll_one(). Link: https://lore.kernel.org/r/1586938475-37049-5-git-send-email-liweihang@huawei.com Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 130 +++++++++++++---------------- 1 file changed, 57 insertions(+), 73 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index a1c819db3496..9b86c8eeccf2 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2955,6 +2955,61 @@ out: return npolled; } +static void get_cqe_status(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, + struct hns_roce_v2_cqe *cqe, struct ib_wc *wc) +{ + static const struct { + u32 cqe_status; + enum ib_wc_status wc_status; + } map[] = { + { HNS_ROCE_CQE_V2_SUCCESS, IB_WC_SUCCESS }, + { HNS_ROCE_CQE_V2_LOCAL_LENGTH_ERR, IB_WC_LOC_LEN_ERR }, + { HNS_ROCE_CQE_V2_LOCAL_QP_OP_ERR, IB_WC_LOC_QP_OP_ERR }, + { HNS_ROCE_CQE_V2_LOCAL_PROT_ERR, IB_WC_LOC_PROT_ERR }, + { HNS_ROCE_CQE_V2_WR_FLUSH_ERR, IB_WC_WR_FLUSH_ERR }, + { HNS_ROCE_CQE_V2_MW_BIND_ERR, IB_WC_MW_BIND_ERR }, + { HNS_ROCE_CQE_V2_BAD_RESP_ERR, IB_WC_BAD_RESP_ERR }, + { HNS_ROCE_CQE_V2_LOCAL_ACCESS_ERR, IB_WC_LOC_ACCESS_ERR }, + { HNS_ROCE_CQE_V2_REMOTE_INVAL_REQ_ERR, IB_WC_REM_INV_REQ_ERR }, + { HNS_ROCE_CQE_V2_REMOTE_ACCESS_ERR, IB_WC_REM_ACCESS_ERR }, + { HNS_ROCE_CQE_V2_REMOTE_OP_ERR, IB_WC_REM_OP_ERR }, + { HNS_ROCE_CQE_V2_TRANSPORT_RETRY_EXC_ERR, + IB_WC_RETRY_EXC_ERR }, + { HNS_ROCE_CQE_V2_RNR_RETRY_EXC_ERR, IB_WC_RNR_RETRY_EXC_ERR }, + { HNS_ROCE_CQE_V2_REMOTE_ABORT_ERR, IB_WC_REM_ABORT_ERR }, + }; + + u32 cqe_status = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_STATUS_M, + V2_CQE_BYTE_4_STATUS_S); + int i; + + wc->status = IB_WC_GENERAL_ERR; + for (i = 0; i < ARRAY_SIZE(map); i++) + if (cqe_status == map[i].cqe_status) { + wc->status = map[i].wc_status; + break; + } + + if (wc->status == IB_WC_SUCCESS || wc->status == IB_WC_WR_FLUSH_ERR) + return; + + ibdev_err(&hr_dev->ib_dev, "error cqe status 0x%x:\n", cqe_status); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_NONE, 16, 4, cqe, + sizeof(*cqe), false); + + /* + * Hip08 hardware cannot flush the WQEs in SQ/RQ if the QP state gets + * into errored mode. Hence, as a workaround to this hardware + * limitation, driver needs to assist in flushing. But the flushing + * operation uses mailbox to convey the QP state to the hardware and + * which can sleep due to the mutex protection around the mailbox calls. + * Hence, use the deferred flush for now. Once wc error detected, the + * flushing operation is needed. + */ + if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &qp->flush_flag)) + init_flush_work(hr_dev, qp); +} + static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, struct hns_roce_qp **cur_qp, struct ib_wc *wc) { @@ -2966,7 +3021,6 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, int is_send; u16 wqe_ctr; u32 opcode; - u32 status; int qpn; int ret; @@ -2996,7 +3050,6 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, *cur_qp = hr_qp; } - hr_qp = *cur_qp; wc->qp = &(*cur_qp)->ibqp; wc->vendor_err = 0; @@ -3031,77 +3084,8 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, ++wq->tail; } - status = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_STATUS_M, - V2_CQE_BYTE_4_STATUS_S); - switch (status & HNS_ROCE_V2_CQE_STATUS_MASK) { - case HNS_ROCE_CQE_V2_SUCCESS: - wc->status = IB_WC_SUCCESS; - break; - case HNS_ROCE_CQE_V2_LOCAL_LENGTH_ERR: - wc->status = IB_WC_LOC_LEN_ERR; - break; - case HNS_ROCE_CQE_V2_LOCAL_QP_OP_ERR: - wc->status = IB_WC_LOC_QP_OP_ERR; - break; - case HNS_ROCE_CQE_V2_LOCAL_PROT_ERR: - wc->status = IB_WC_LOC_PROT_ERR; - break; - case HNS_ROCE_CQE_V2_WR_FLUSH_ERR: - wc->status = IB_WC_WR_FLUSH_ERR; - break; - case HNS_ROCE_CQE_V2_MW_BIND_ERR: - wc->status = IB_WC_MW_BIND_ERR; - break; - case HNS_ROCE_CQE_V2_BAD_RESP_ERR: - wc->status = IB_WC_BAD_RESP_ERR; - break; - case HNS_ROCE_CQE_V2_LOCAL_ACCESS_ERR: - wc->status = IB_WC_LOC_ACCESS_ERR; - break; - case HNS_ROCE_CQE_V2_REMOTE_INVAL_REQ_ERR: - wc->status = IB_WC_REM_INV_REQ_ERR; - break; - case HNS_ROCE_CQE_V2_REMOTE_ACCESS_ERR: - wc->status = IB_WC_REM_ACCESS_ERR; - break; - case HNS_ROCE_CQE_V2_REMOTE_OP_ERR: - wc->status = IB_WC_REM_OP_ERR; - break; - case HNS_ROCE_CQE_V2_TRANSPORT_RETRY_EXC_ERR: - wc->status = IB_WC_RETRY_EXC_ERR; - break; - case HNS_ROCE_CQE_V2_RNR_RETRY_EXC_ERR: - wc->status = IB_WC_RNR_RETRY_EXC_ERR; - break; - case HNS_ROCE_CQE_V2_REMOTE_ABORT_ERR: - wc->status = IB_WC_REM_ABORT_ERR; - break; - default: - wc->status = IB_WC_GENERAL_ERR; - break; - } - - /* - * Hip08 hardware cannot flush the WQEs in SQ/RQ if the QP state gets - * into errored mode. Hence, as a workaround to this hardware - * limitation, driver needs to assist in flushing. But the flushing - * operation uses mailbox to convey the QP state to the hardware and - * which can sleep due to the mutex protection around the mailbox calls. - * Hence, use the deferred flush for now. Once wc error detected, the - * flushing operation is needed. - */ - if (wc->status != IB_WC_SUCCESS && - wc->status != IB_WC_WR_FLUSH_ERR) { - ibdev_err(&hr_dev->ib_dev, "error cqe status is: 0x%x\n", - status & HNS_ROCE_V2_CQE_STATUS_MASK); - - if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &hr_qp->flush_flag)) - init_flush_work(hr_dev, hr_qp); - - return 0; - } - - if (wc->status == IB_WC_WR_FLUSH_ERR) + get_cqe_status(hr_dev, *cur_qp, cqe, wc); + if (wc->status != IB_WC_SUCCESS) return 0; if (is_send) { -- cgit v1.2.3 From 357f342946860b323d1981cc42370516dbb209d2 Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Wed, 15 Apr 2020 16:14:34 +0800 Subject: RDMA/hns: Simplify the state judgment code of qp Use state table to make the qp state migrate code more readable. Link: https://lore.kernel.org/r/1586938475-37049-6-git-send-email-liweihang@huawei.com Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 54 +++++++++++++++--------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 9b86c8eeccf2..2a8c3893bccd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4077,21 +4077,6 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, return 0; } -static inline bool hns_roce_v2_check_qp_stat(enum ib_qp_state cur_state, - enum ib_qp_state new_state) -{ - - if ((cur_state != IB_QPS_RESET && - (new_state == IB_QPS_ERR || new_state == IB_QPS_RESET)) || - ((cur_state == IB_QPS_RTS || cur_state == IB_QPS_SQD) && - (new_state == IB_QPS_RTS || new_state == IB_QPS_SQD)) || - (cur_state == IB_QPS_SQE && new_state == IB_QPS_RTS)) - return true; - - return false; - -} - static int hns_roce_v2_set_path(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, @@ -4195,6 +4180,28 @@ static int hns_roce_v2_set_path(struct ib_qp *ibqp, return 0; } +static bool check_qp_state(enum ib_qp_state cur_state, + enum ib_qp_state new_state) +{ + static const bool sm[][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { [IB_QPS_RESET] = true, + [IB_QPS_INIT] = true }, + [IB_QPS_INIT] = { [IB_QPS_RESET] = true, + [IB_QPS_INIT] = true, + [IB_QPS_RTR] = true, + [IB_QPS_ERR] = true }, + [IB_QPS_RTR] = { [IB_QPS_RESET] = true, + [IB_QPS_RTS] = true, + [IB_QPS_ERR] = true }, + [IB_QPS_RTS] = { [IB_QPS_RESET] = true, [IB_QPS_ERR] = true }, + [IB_QPS_SQD] = {}, + [IB_QPS_SQE] = {}, + [IB_QPS_ERR] = { [IB_QPS_RESET] = true, [IB_QPS_ERR] = true } + }; + + return sm[cur_state][new_state]; +} + static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, @@ -4206,6 +4213,11 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); int ret = 0; + if (!check_qp_state(cur_state, new_state)) { + ibdev_err(&hr_dev->ib_dev, "Illegal state for QP!\n"); + return -EINVAL; + } + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { memset(qpc_mask, 0, sizeof(*qpc_mask)); modify_qp_reset_to_init(ibqp, attr, attr_mask, context, @@ -4216,23 +4228,11 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp, } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { ret = modify_qp_init_to_rtr(ibqp, attr, attr_mask, context, qpc_mask); - if (ret) - goto out; } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { ret = modify_qp_rtr_to_rts(ibqp, attr, attr_mask, context, qpc_mask); - if (ret) - goto out; - } else if (hns_roce_v2_check_qp_stat(cur_state, new_state)) { - /* Nothing */ - ; - } else { - ibdev_err(&hr_dev->ib_dev, "Illegal state for QP!\n"); - ret = -EINVAL; - goto out; } -out: return ret; } -- cgit v1.2.3 From a97bf49f824e357f1cc5d292e247d05271d32afe Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Wed, 15 Apr 2020 16:14:35 +0800 Subject: RDMA/hns: Simplify the status judgment code of hns_roce_v1_m_qp() Use status table to reduce cyclomatic complexity. Link: https://lore.kernel.org/r/1586938475-37049-7-git-send-email-liweihang@huawei.com Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 42 +++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index a1f053cd30b9..49775cda83dc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -2720,6 +2720,28 @@ out: return -EINVAL; } +static bool check_qp_state(enum ib_qp_state cur_state, + enum ib_qp_state new_state) +{ + static const bool sm[][IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = { [IB_QPS_RESET] = true, + [IB_QPS_INIT] = true }, + [IB_QPS_INIT] = { [IB_QPS_RESET] = true, + [IB_QPS_INIT] = true, + [IB_QPS_RTR] = true, + [IB_QPS_ERR] = true }, + [IB_QPS_RTR] = { [IB_QPS_RESET] = true, + [IB_QPS_RTS] = true, + [IB_QPS_ERR] = true }, + [IB_QPS_RTS] = { [IB_QPS_RESET] = true, [IB_QPS_ERR] = true }, + [IB_QPS_SQD] = {}, + [IB_QPS_SQE] = {}, + [IB_QPS_ERR] = { [IB_QPS_RESET] = true, [IB_QPS_ERR] = true } + }; + + return sm[cur_state][new_state]; +} + static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) @@ -2741,6 +2763,13 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, u8 *dmac; u8 *smac; + if (!check_qp_state(cur_state, new_state)) { + ibdev_err(ibqp->device, + "not support QP(%u) status from %d to %d\n", + ibqp->qp_num, cur_state, new_state); + return -EINVAL; + } + context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) return -ENOMEM; @@ -3072,8 +3101,7 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, QP_CONTEXT_QPC_BYTES_156_SL_S, rdma_ah_get_sl(&attr->ah_attr)); hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); - } else if (cur_state == IB_QPS_RTR && - new_state == IB_QPS_RTS) { + } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { /* If exist optional param, return error */ if ((attr_mask & IB_QP_ALT_PATH) || (attr_mask & IB_QP_ACCESS_FLAGS) || @@ -3245,16 +3273,6 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, QP_CONTEXT_QPC_BYTES_188_TX_RETRY_CUR_INDEX_M, QP_CONTEXT_QPC_BYTES_188_TX_RETRY_CUR_INDEX_S, 0); - } else if (!((cur_state == IB_QPS_INIT && new_state == IB_QPS_RESET) || - (cur_state == IB_QPS_INIT && new_state == IB_QPS_ERR) || - (cur_state == IB_QPS_RTR && new_state == IB_QPS_RESET) || - (cur_state == IB_QPS_RTR && new_state == IB_QPS_ERR) || - (cur_state == IB_QPS_RTS && new_state == IB_QPS_RESET) || - (cur_state == IB_QPS_RTS && new_state == IB_QPS_ERR) || - (cur_state == IB_QPS_ERR && new_state == IB_QPS_RESET) || - (cur_state == IB_QPS_ERR && new_state == IB_QPS_ERR))) { - dev_err(dev, "not support this status migration\n"); - goto out; } /* Every status migrate must change state */ -- cgit v1.2.3 From 6eb7edffb28558aaa3a3e625ac9dcd40fc603bc6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:01 +0300 Subject: RDMA/mlx5: Organize QP types checks in one place Perform check if QP type is supported in one place at the beginning of the create_qp function instead of current implementation with checks buried inside of the code. Link: https://lore.kernel.org/r/20200427154636.381474-2-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 129 +++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 61 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index af599c8b88aa..fdab5b6db1e5 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2677,12 +2677,42 @@ static int set_mlx_qp_type(struct mlx5_ib_dev *dev, } } - if (!MLX5_CAP_GEN(dev->mdev, dct)) { - mlx5_ib_dbg(dev, "DC transport is not supported\n"); - return -EOPNOTSUPP; + return 0; +} + +static int check_qp_type(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_DRIVER && !MLX5_CAP_GEN(dev->mdev, dct)) + goto out; + + switch (attr->qp_type) { + case IB_QPT_XRC_TGT: + case IB_QPT_XRC_INI: + if (!MLX5_CAP_GEN(dev->mdev, xrc)) + goto out; + fallthrough; + case IB_QPT_RAW_PACKET: + case IB_QPT_RC: + case IB_QPT_UC: + case IB_QPT_UD: + case IB_QPT_SMI: + case MLX5_IB_QPT_HW_GSI: + case MLX5_IB_QPT_REG_UMR: + case IB_QPT_DRIVER: + case IB_QPT_GSI: + return 0; + case IB_QPT_RAW_IPV6: + case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_MAX: + default: + goto out; } return 0; + +out: + mlx5_ib_dbg(dev, "Unsupported QP type %d\n", attr->qp_type); + return -EOPNOTSUPP; } struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, @@ -2698,9 +2728,17 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( udata, struct mlx5_ib_ucontext, ibucontext); - if (pd) { - dev = to_mdev(pd->device); + dev = pd ? to_mdev(pd->device) : + to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); + err = check_qp_type(dev, init_attr); + if (err) { + mlx5_ib_dbg(dev, "Unsupported QP type %d\n", + init_attr->qp_type); + return ERR_PTR(err); + } + + if (pd) { if (init_attr->qp_type == IB_QPT_RAW_PACKET) { if (!ucontext) { mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n"); @@ -2718,7 +2756,6 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, ib_qp_type_str(init_attr->qp_type)); return ERR_PTR(-EINVAL); } - dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); } if (init_attr->qp_type == IB_QPT_DRIVER) { @@ -2741,67 +2778,37 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, } } - switch (init_attr->qp_type) { - case IB_QPT_XRC_TGT: - case IB_QPT_XRC_INI: - if (!MLX5_CAP_GEN(dev->mdev, xrc)) { - mlx5_ib_dbg(dev, "XRC not supported\n"); - return ERR_PTR(-ENOSYS); - } - init_attr->recv_cq = NULL; - if (init_attr->qp_type == IB_QPT_XRC_TGT) { - xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; - init_attr->send_cq = NULL; - } - - /* fall through */ - case IB_QPT_RAW_PACKET: - case IB_QPT_RC: - case IB_QPT_UC: - case IB_QPT_UD: - case IB_QPT_SMI: - case MLX5_IB_QPT_HW_GSI: - case MLX5_IB_QPT_REG_UMR: - case MLX5_IB_QPT_DCI: - qp = kzalloc(sizeof(*qp), GFP_KERNEL); - if (!qp) - return ERR_PTR(-ENOMEM); - - err = create_qp_common(dev, pd, init_attr, udata, qp); - if (err) { - mlx5_ib_dbg(dev, "create_qp_common failed\n"); - kfree(qp); - return ERR_PTR(err); - } + if (init_attr->qp_type == IB_QPT_GSI) + return mlx5_ib_gsi_create_qp(pd, init_attr); - if (is_qp0(init_attr->qp_type)) - qp->ibqp.qp_num = 0; - else if (is_qp1(init_attr->qp_type)) - qp->ibqp.qp_num = 1; - else - qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; + if (init_attr->qp_type == IB_QPT_XRC_TGT) { + init_attr->recv_cq = NULL; + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + init_attr->send_cq = NULL; + } - mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", - qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn, - init_attr->recv_cq ? to_mcq(init_attr->recv_cq)->mcq.cqn : -1, - init_attr->send_cq ? to_mcq(init_attr->send_cq)->mcq.cqn : -1); + if (init_attr->qp_type == IB_QPT_XRC_INI) + init_attr->recv_cq = NULL; - qp->trans_qp.xrcdn = xrcdn; + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); - break; + err = create_qp_common(dev, pd, init_attr, udata, qp); + if (err) { + mlx5_ib_dbg(dev, "create_qp_common failed\n"); + kfree(qp); + return ERR_PTR(err); + } - case IB_QPT_GSI: - return mlx5_ib_gsi_create_qp(pd, init_attr); + if (is_qp0(init_attr->qp_type)) + qp->ibqp.qp_num = 0; + else if (is_qp1(init_attr->qp_type)) + qp->ibqp.qp_num = 1; + else + qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; - case IB_QPT_RAW_IPV6: - case IB_QPT_RAW_ETHERTYPE: - case IB_QPT_MAX: - default: - mlx5_ib_dbg(dev, "unsupported qp type %d\n", - init_attr->qp_type); - /* Don't support raw QPs */ - return ERR_PTR(-EOPNOTSUPP); - } + qp->trans_qp.xrcdn = xrcdn; if (verbs_init_attr->qp_type == IB_QPT_DRIVER) qp->qp_sub_type = init_attr->qp_type; -- cgit v1.2.3 From 1265d9f7a522423ef5234457a6d2a2f2a3ccad13 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:02 +0300 Subject: RDMA/mlx5: Delete impossible GSI port check GSI QP is created in the kernel with very strict parameters, there is no possible way that port number will be wrong in such flow. Link: https://lore.kernel.org/r/20200427154636.381474-3-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/gsi.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index 1ae6fd95acaa..1afbf03d1a98 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -123,15 +123,6 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, const int num_qps = mlx5_ib_deth_sqpn_cap(dev) ? num_pkeys : 0; int ret; - mlx5_ib_dbg(dev, "creating GSI QP\n"); - - if (port_num > ARRAY_SIZE(dev->devr.ports) || port_num < 1) { - mlx5_ib_warn(dev, - "invalid port number %d during GSI QP creation\n", - port_num); - return ERR_PTR(-EINVAL); - } - gsi = kzalloc(sizeof(*gsi), GFP_KERNEL); if (!gsi) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 2242cc25ce82058986ff7721e3d2464d775032df Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:03 +0300 Subject: RDMA/mlx5: Perform check if QP creation flow is valid Fast check that kernel and user flows provides enough data to create QP. Link: https://lore.kernel.org/r/20200427154636.381474-4-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 128 +++++++++++++++++++--------------------- 1 file changed, 61 insertions(+), 67 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index fdab5b6db1e5..91d6151c349c 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1666,9 +1666,6 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, size_t required_cmd_sz; u8 lb_flag = 0; - if (init_attr->qp_type != IB_QPT_RAW_PACKET) - return -EOPNOTSUPP; - if (init_attr->create_flags || init_attr->send_cq) return -EINVAL; @@ -2032,13 +2029,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (mlx5_st < 0) return -EINVAL; - if (init_attr->rwq_ind_tbl) { - if (!udata) - return -ENOSYS; - - err = create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata); - return err; - } + if (init_attr->rwq_ind_tbl) + return create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata); if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { if (!MLX5_CAP_GEN(mdev, block_lb_mc)) { @@ -2565,39 +2557,6 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base, udata); } -static const char *ib_qp_type_str(enum ib_qp_type type) -{ - switch (type) { - case IB_QPT_SMI: - return "IB_QPT_SMI"; - case IB_QPT_GSI: - return "IB_QPT_GSI"; - case IB_QPT_RC: - return "IB_QPT_RC"; - case IB_QPT_UC: - return "IB_QPT_UC"; - case IB_QPT_UD: - return "IB_QPT_UD"; - case IB_QPT_RAW_IPV6: - return "IB_QPT_RAW_IPV6"; - case IB_QPT_RAW_ETHERTYPE: - return "IB_QPT_RAW_ETHERTYPE"; - case IB_QPT_XRC_INI: - return "IB_QPT_XRC_INI"; - case IB_QPT_XRC_TGT: - return "IB_QPT_XRC_TGT"; - case IB_QPT_RAW_PACKET: - return "IB_QPT_RAW_PACKET"; - case MLX5_IB_QPT_REG_UMR: - return "MLX5_IB_QPT_REG_UMR"; - case IB_QPT_DRIVER: - return "IB_QPT_DRIVER"; - case IB_QPT_MAX: - default: - return "Invalid QP type"; - } -} - static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, struct ib_qp_init_attr *attr, struct mlx5_ib_create_qp *ucmd, @@ -2655,9 +2614,6 @@ static int set_mlx_qp_type(struct mlx5_ib_dev *dev, enum { MLX_QP_FLAGS = MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI }; int err; - if (!udata) - return -EINVAL; - if (udata->inlen < sizeof(*ucmd)) { mlx5_ib_dbg(dev, "create_qp user command is smaller than expected\n"); return -EINVAL; @@ -2715,6 +2671,62 @@ out: return -EOPNOTSUPP; } +static int check_valid_flow(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + + if (!udata) { + /* Kernel create_qp callers */ + if (attr->rwq_ind_tbl) + return -EOPNOTSUPP; + + switch (attr->qp_type) { + case IB_QPT_RAW_PACKET: + case IB_QPT_DRIVER: + return -EOPNOTSUPP; + default: + return 0; + } + } + + /* Userspace create_qp callers */ + if (attr->qp_type == IB_QPT_RAW_PACKET && !ucontext->cqe_version) { + mlx5_ib_dbg(dev, + "Raw Packet QP is only supported for CQE version > 0\n"); + return -EINVAL; + } + + if (attr->qp_type != IB_QPT_RAW_PACKET && attr->rwq_ind_tbl) { + mlx5_ib_dbg(dev, + "Wrong QP type %d for the RWQ indirect table\n", + attr->qp_type); + return -EINVAL; + } + + switch (attr->qp_type) { + case IB_QPT_SMI: + case MLX5_IB_QPT_HW_GSI: + case MLX5_IB_QPT_REG_UMR: + case IB_QPT_GSI: + mlx5_ib_dbg(dev, "Kernel doesn't support QP type %d\n", + attr->qp_type); + return -EINVAL; + default: + break; + } + + /* + * We don't need to see this warning, it means that kernel code + * missing ib_pd. Placed here to catch developer's mistakes. + */ + WARN_ONCE(!pd && attr->qp_type != IB_QPT_XRC_TGT, + "There is a missing PD pointer assignment\n"); + return 0; +} + struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *verbs_init_attr, struct ib_udata *udata) @@ -2725,8 +2737,6 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, int err; struct ib_qp_init_attr mlx_init_attr; struct ib_qp_init_attr *init_attr = verbs_init_attr; - struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( - udata, struct mlx5_ib_ucontext, ibucontext); dev = pd ? to_mdev(pd->device) : to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); @@ -2738,25 +2748,9 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, return ERR_PTR(err); } - if (pd) { - if (init_attr->qp_type == IB_QPT_RAW_PACKET) { - if (!ucontext) { - mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n"); - return ERR_PTR(-EINVAL); - } else if (!ucontext->cqe_version) { - mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n"); - return ERR_PTR(-EINVAL); - } - } - } else { - /* being cautious here */ - if (init_attr->qp_type != IB_QPT_XRC_TGT && - init_attr->qp_type != MLX5_IB_QPT_REG_UMR) { - pr_warn("%s: no PD for transport %s\n", __func__, - ib_qp_type_str(init_attr->qp_type)); - return ERR_PTR(-EINVAL); - } - } + err = check_valid_flow(dev, pd, init_attr, udata); + if (err) + return ERR_PTR(err); if (init_attr->qp_type == IB_QPT_DRIVER) { struct mlx5_ib_create_qp ucmd; -- cgit v1.2.3 From 9c2ba4ede4c0166d4e3bdc15e3b32c9680309ca1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:04 +0300 Subject: RDMA/mlx5: Prepare QP allocation for future removal Unify the QP memory allocation across different paths, so it will be in one place. Link: https://lore.kernel.org/r/20200427154636.381474-5-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 45 ++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 91d6151c349c..07df470e0d58 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2557,14 +2557,13 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base, udata); } -static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, +static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr, struct mlx5_ib_create_qp *ucmd, struct ib_udata *udata) { struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( udata, struct mlx5_ib_ucontext, ibucontext); - struct mlx5_ib_qp *qp; int err = 0; u32 uidx = MLX5_IB_DEFAULT_UIDX; void *dctc; @@ -2576,15 +2575,9 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, if (err) return ERR_PTR(err); - qp = kzalloc(sizeof(*qp), GFP_KERNEL); - if (!qp) - return ERR_PTR(-ENOMEM); - qp->dct.in = kzalloc(MLX5_ST_SZ_BYTES(create_dct_in), GFP_KERNEL); - if (!qp->dct.in) { - err = -ENOMEM; - goto err_free; - } + if (!qp->dct.in) + return ERR_PTR(-ENOMEM); MLX5_SET(create_dct_in, qp->dct.in, uid, to_mpd(pd)->uid); dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); @@ -2601,9 +2594,6 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, qp->state = IB_QPS_RESET; return &qp->ibqp; -err_free: - kfree(qp); - return ERR_PTR(err); } static int set_mlx_qp_type(struct mlx5_ib_dev *dev, @@ -2752,6 +2742,13 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, if (err) return ERR_PTR(err); + if (init_attr->qp_type == IB_QPT_GSI) + return mlx5_ib_gsi_create_qp(pd, init_attr); + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + if (init_attr->qp_type == IB_QPT_DRIVER) { struct mlx5_ib_create_qp ucmd; @@ -2759,22 +2756,21 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, memcpy(init_attr, verbs_init_attr, sizeof(*verbs_init_attr)); err = set_mlx_qp_type(dev, init_attr, &ucmd, udata); if (err) - return ERR_PTR(err); + goto free_qp; if (init_attr->qp_type == MLX5_IB_QPT_DCI) { if (init_attr->cap.max_recv_wr || init_attr->cap.max_recv_sge) { mlx5_ib_dbg(dev, "DCI QP requires zero size receive queue\n"); - return ERR_PTR(-EINVAL); + err = -EINVAL; + goto free_qp; } } else { - return mlx5_ib_create_dct(pd, init_attr, &ucmd, udata); + return mlx5_ib_create_dct(pd, qp, init_attr, &ucmd, + udata); } } - if (init_attr->qp_type == IB_QPT_GSI) - return mlx5_ib_gsi_create_qp(pd, init_attr); - if (init_attr->qp_type == IB_QPT_XRC_TGT) { init_attr->recv_cq = NULL; xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; @@ -2784,15 +2780,10 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, if (init_attr->qp_type == IB_QPT_XRC_INI) init_attr->recv_cq = NULL; - qp = kzalloc(sizeof(*qp), GFP_KERNEL); - if (!qp) - return ERR_PTR(-ENOMEM); - err = create_qp_common(dev, pd, init_attr, udata, qp); if (err) { mlx5_ib_dbg(dev, "create_qp_common failed\n"); - kfree(qp); - return ERR_PTR(err); + goto free_qp; } if (is_qp0(init_attr->qp_type)) @@ -2808,6 +2799,10 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, qp->qp_sub_type = init_attr->qp_type; return &qp->ibqp; + +free_qp: + kfree(qp); + return ERR_PTR(err); } static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) -- cgit v1.2.3 From c86936e6eb13bf3759e4cc0629ccc0076dd763de Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:05 +0300 Subject: RDMA/mlx5: Avoid setting redundant NULL for XRC QPs There is no need to set NULL in recv_cq and send_cq, they are already set to NULL by the IB/core logic. Link: https://lore.kernel.org/r/20200427154636.381474-6-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 07df470e0d58..86933a2023dc 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2771,14 +2771,8 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, } } - if (init_attr->qp_type == IB_QPT_XRC_TGT) { - init_attr->recv_cq = NULL; + if (init_attr->qp_type == IB_QPT_XRC_TGT) xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; - init_attr->send_cq = NULL; - } - - if (init_attr->qp_type == IB_QPT_XRC_INI) - init_attr->recv_cq = NULL; err = create_qp_common(dev, pd, init_attr, udata, qp); if (err) { -- cgit v1.2.3 From 318d2b06fbaa8fbce379a4e00901251b6368b4e3 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:06 +0300 Subject: RDMA/mlx5: Set QP subtype immediately when it is known There is no need to delay QP subtype assignment to the end of the create_qp() function and it is better to move it to be immediately after it is checked so we would be able to rewrite later checks to be based on it and not on over-written struct ib_qp_init_attr. Link: https://lore.kernel.org/r/20200427154636.381474-7-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 86933a2023dc..d991c33c4d9b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2581,7 +2581,6 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, MLX5_SET(create_dct_in, qp->dct.in, uid, to_mpd(pd)->uid); dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); - qp->qp_sub_type = MLX5_IB_QPT_DCT; MLX5_SET(dctc, dctc, pd, to_mpd(pd)->pdn); MLX5_SET(dctc, dctc, srqn_xrqn, to_msrq(attr->srq)->msrq.srqn); MLX5_SET(dctc, dctc, cqn, to_mcq(attr->recv_cq)->mcq.cqn); @@ -2765,7 +2764,9 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, err = -EINVAL; goto free_qp; } + qp->qp_sub_type = MLX5_IB_QPT_DCI; } else { + qp->qp_sub_type = MLX5_IB_QPT_DCT; return mlx5_ib_create_dct(pd, qp, init_attr, &ucmd, udata); } @@ -2789,9 +2790,6 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, qp->trans_qp.xrcdn = xrcdn; - if (verbs_init_attr->qp_type == IB_QPT_DRIVER) - qp->qp_sub_type = init_attr->qp_type; - return &qp->ibqp; free_qp: -- cgit v1.2.3 From 47c806121a515bfee3180cced40af25cbf2ac10c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:07 +0300 Subject: RDMA/mlx5: Separate create QP flows to be based on type Move driver QP creation flow to separate functions to simplify the create_qp() and allow future separation of create_qp_common() to subtypes. Link: https://lore.kernel.org/r/20200427154636.381474-8-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 54 +++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index d991c33c4d9b..ae336c1eed74 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2557,10 +2557,9 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base, udata); } -static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, - struct ib_qp_init_attr *attr, - struct mlx5_ib_create_qp *ucmd, - struct ib_udata *udata) +static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, + struct ib_qp_init_attr *attr, + struct mlx5_ib_create_qp *ucmd, struct ib_udata *udata) { struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( udata, struct mlx5_ib_ucontext, ibucontext); @@ -2568,16 +2567,13 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, u32 uidx = MLX5_IB_DEFAULT_UIDX; void *dctc; - if (!attr->srq || !attr->recv_cq) - return ERR_PTR(-EINVAL); - err = get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), &uidx); if (err) - return ERR_PTR(err); + return err; qp->dct.in = kzalloc(MLX5_ST_SZ_BYTES(create_dct_in), GFP_KERNEL); if (!qp->dct.in) - return ERR_PTR(-ENOMEM); + return -ENOMEM; MLX5_SET(create_dct_in, qp->dct.in, uid, to_mpd(pd)->uid); dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); @@ -2592,7 +2588,7 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, qp->state = IB_QPS_RESET; - return &qp->ibqp; + return 0; } static int set_mlx_qp_type(struct mlx5_ib_dev *dev, @@ -2716,10 +2712,36 @@ static int check_valid_flow(struct mlx5_ib_dev *dev, struct ib_pd *pd, return 0; } +static int create_driver_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, + struct ib_qp_init_attr *attr, + struct mlx5_ib_create_qp *ucmd, + struct ib_udata *udata) +{ + struct mlx5_ib_dev *mdev = to_mdev(pd->device); + int ret = -EINVAL; + + switch (qp->qp_sub_type) { + case MLX5_IB_QPT_DCT: + if (!attr->srq || !attr->recv_cq) + goto out; + + ret = create_dct(pd, qp, attr, ucmd, udata); + break; + case MLX5_IB_QPT_DCI: + ret = create_qp_common(mdev, pd, attr, udata, qp); + break; + default: + return -EINVAL; + } + +out: return ret; +} + struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *verbs_init_attr, struct ib_udata *udata) { + struct mlx5_ib_create_qp ucmd = {}; struct mlx5_ib_dev *dev; struct mlx5_ib_qp *qp; u16 xrcdn = 0; @@ -2749,8 +2771,6 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, return ERR_PTR(-ENOMEM); if (init_attr->qp_type == IB_QPT_DRIVER) { - struct mlx5_ib_create_qp ucmd; - init_attr = &mlx_init_attr; memcpy(init_attr, verbs_init_attr, sizeof(*verbs_init_attr)); err = set_mlx_qp_type(dev, init_attr, &ucmd, udata); @@ -2767,15 +2787,19 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, qp->qp_sub_type = MLX5_IB_QPT_DCI; } else { qp->qp_sub_type = MLX5_IB_QPT_DCT; - return mlx5_ib_create_dct(pd, qp, init_attr, &ucmd, - udata); } } if (init_attr->qp_type == IB_QPT_XRC_TGT) xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; - err = create_qp_common(dev, pd, init_attr, udata, qp); + switch (init_attr->qp_type) { + case IB_QPT_DRIVER: + err = create_driver_qp(pd, qp, init_attr, &ucmd, udata); + break; + default: + err = create_qp_common(dev, pd, init_attr, udata, qp); + } if (err) { mlx5_ib_dbg(dev, "create_qp_common failed\n"); goto free_qp; -- cgit v1.2.3 From fd9dab7edc590a52ed141265fa7c88cf938e9be0 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:08 +0300 Subject: RDMA/mlx5: Split scatter CQE configuration for DCT QP DCT QPs have separate creation flow and can be easily extracted from configure_responder_scat_cqe(), this makes both updated functions more clear. Link: https://lore.kernel.org/r/20200427154636.381474-9-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ae336c1eed74..d0e8d27305e9 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1907,13 +1907,6 @@ static void configure_responder_scat_cqe(struct ib_qp_init_attr *init_attr, rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq); - if (init_attr->qp_type == MLX5_IB_QPT_DCT) { - if (rcqe_sz == 128) - MLX5_SET(dctc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE); - - return; - } - MLX5_SET(qpc, qpc, cs_res, rcqe_sz == 128 ? MLX5_RES_SCAT_DATA64_CQE : MLX5_RES_SCAT_DATA32_CQE); @@ -2583,8 +2576,12 @@ static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key); MLX5_SET(dctc, dctc, user_index, uidx); - if (ucmd->flags & MLX5_QP_FLAG_SCATTER_CQE) - configure_responder_scat_cqe(attr, dctc); + if (ucmd->flags & MLX5_QP_FLAG_SCATTER_CQE) { + int rcqe_sz = mlx5_ib_get_cqe_size(attr->recv_cq); + + if (rcqe_sz == 128) + MLX5_SET(dctc, dctc, cs_res, MLX5_RES_SCAT_DATA64_CQE); + } qp->state = IB_QPS_RESET; -- cgit v1.2.3 From 8bde2c509e4035fb4a200a60f82f85eec914145b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:09 +0300 Subject: RDMA/mlx5: Update all DRIVER QP places to use QP subtype Instead of overwriting QP init attributes with driver QP subtype, use that subtype directly. This change will allow us to remove logic which cached QP init attributes. Link: https://lore.kernel.org/r/20200427154636.381474-10-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 48 +++++++++++++---------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index d0e8d27305e9..0b2090bcb8e8 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1232,7 +1232,7 @@ static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) { if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) || - (attr->qp_type == MLX5_IB_QPT_DCI) || + (qp->qp_sub_type == MLX5_IB_QPT_DCI) || (attr->qp_type == IB_QPT_XRC_INI)) return MLX5_SRQ_RQ; else if (!qp->has_rq) @@ -1241,15 +1241,6 @@ static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) return MLX5_NON_ZERO_RQ; } -static int is_connected(enum ib_qp_type qp_type) -{ - if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC || - qp_type == MLX5_IB_QPT_DCI) - return 1; - - return 0; -} - static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_ib_sq *sq, u32 tdn, @@ -1897,33 +1888,14 @@ err: return err; } -static void configure_responder_scat_cqe(struct ib_qp_init_attr *init_attr, - void *qpc) -{ - int rcqe_sz; - - if (init_attr->qp_type == MLX5_IB_QPT_DCI) - return; - - rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq); - - MLX5_SET(qpc, qpc, cs_res, - rcqe_sz == 128 ? MLX5_RES_SCAT_DATA64_CQE : - MLX5_RES_SCAT_DATA32_CQE); -} - static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *init_attr, struct mlx5_ib_create_qp *ucmd, void *qpc) { - enum ib_qp_type qpt = init_attr->qp_type; int scqe_sz; bool allow_scat_cqe = false; - if (qpt == IB_QPT_UC || qpt == IB_QPT_UD) - return; - if (ucmd) allow_scat_cqe = ucmd->flags & MLX5_QP_FLAG_ALLOW_SCATTER_CQE; @@ -2018,7 +1990,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); - mlx5_st = to_mlx5_st(init_attr->qp_type); + mlx5_st = to_mlx5_st((init_attr->qp_type != IB_QPT_DRIVER) ? + init_attr->qp_type : + qp->qp_sub_type); if (mlx5_st < 0) return -EINVAL; @@ -2240,12 +2214,20 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_SET(qpc, qpc, cd_slave_receive, 1); if (qp->flags & MLX5_IB_QP_PACKET_BASED_CREDIT) MLX5_SET(qpc, qpc, req_e2e_credit_mode, 1); - if (qp->scat_cqe && is_connected(init_attr->qp_type)) { - configure_responder_scat_cqe(init_attr, qpc); + if (qp->scat_cqe && (init_attr->qp_type == IB_QPT_RC || + init_attr->qp_type == IB_QPT_UC)) { + int rcqe_sz = rcqe_sz = + mlx5_ib_get_cqe_size(init_attr->recv_cq); + + MLX5_SET(qpc, qpc, cs_res, + rcqe_sz == 128 ? MLX5_RES_SCAT_DATA64_CQE : + MLX5_RES_SCAT_DATA32_CQE); + } + if (qp->scat_cqe && (qp->qp_sub_type == MLX5_IB_QPT_DCI || + init_attr->qp_type == IB_QPT_RC)) configure_requester_scat_cqe(dev, init_attr, udata ? &ucmd : NULL, qpc); - } if (qp->rq.wqe_cnt) { MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4); -- cgit v1.2.3 From 2fdddbd5c966c1ff7e35b0e4d1fa4b951d0f5542 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:10 +0300 Subject: RDMA/mlx5: Move DRIVER QP flags check into separate function Perform validation of DRIVER QP in relevant function. Link: https://lore.kernel.org/r/20200427154636.381474-11-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 91 +++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 48 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 0b2090bcb8e8..5e4c73c4a7b4 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2570,36 +2570,6 @@ static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, return 0; } -static int set_mlx_qp_type(struct mlx5_ib_dev *dev, - struct ib_qp_init_attr *init_attr, - struct mlx5_ib_create_qp *ucmd, - struct ib_udata *udata) -{ - enum { MLX_QP_FLAGS = MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI }; - int err; - - if (udata->inlen < sizeof(*ucmd)) { - mlx5_ib_dbg(dev, "create_qp user command is smaller than expected\n"); - return -EINVAL; - } - err = ib_copy_from_udata(ucmd, udata, sizeof(*ucmd)); - if (err) - return err; - - if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCI) { - init_attr->qp_type = MLX5_IB_QPT_DCI; - } else { - if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCT) { - init_attr->qp_type = MLX5_IB_QPT_DCT; - } else { - mlx5_ib_dbg(dev, "Invalid QP flags\n"); - return -EINVAL; - } - } - - return 0; -} - static int check_qp_type(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr) { if (attr->qp_type == IB_QPT_DRIVER && !MLX5_CAP_GEN(dev->mdev, dct)) @@ -2691,6 +2661,24 @@ static int check_valid_flow(struct mlx5_ib_dev *dev, struct ib_pd *pd, return 0; } +static int process_vendor_flags(struct mlx5_ib_qp *qp, + struct ib_qp_init_attr *attr, + struct mlx5_ib_create_qp *ucmd) +{ + switch (ucmd->flags & (MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI)) { + case MLX5_QP_FLAG_TYPE_DCI: + qp->qp_sub_type = MLX5_IB_QPT_DCI; + break; + case MLX5_QP_FLAG_TYPE_DCT: + qp->qp_sub_type = MLX5_IB_QPT_DCT; + break; + default: + return -EINVAL; + } + + return 0; +} + static int create_driver_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr, struct mlx5_ib_create_qp *ucmd, @@ -2707,6 +2695,9 @@ static int create_driver_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, ret = create_dct(pd, qp, attr, ucmd, udata); break; case MLX5_IB_QPT_DCI: + if (attr->cap.max_recv_wr || attr->cap.max_recv_sge) + goto out; + ret = create_qp_common(mdev, pd, attr, udata, qp); break; default: @@ -2716,8 +2707,16 @@ static int create_driver_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, out: return ret; } +static size_t process_udata_size(struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + size_t ucmd = sizeof(struct mlx5_ib_create_qp); + + return (udata->inlen < ucmd) ? 0 : ucmd; +} + struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, - struct ib_qp_init_attr *verbs_init_attr, + struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { struct mlx5_ib_create_qp ucmd = {}; @@ -2725,8 +2724,6 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp; u16 xrcdn = 0; int err; - struct ib_qp_init_attr mlx_init_attr; - struct ib_qp_init_attr *init_attr = verbs_init_attr; dev = pd ? to_mdev(pd->device) : to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); @@ -2745,28 +2742,26 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, if (init_attr->qp_type == IB_QPT_GSI) return mlx5_ib_gsi_create_qp(pd, init_attr); + if (udata && init_attr->qp_type == IB_QPT_DRIVER) { + size_t inlen = + process_udata_size(init_attr, udata); + + if (!inlen) + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&ucmd, udata, inlen); + if (err) + return ERR_PTR(err); + } + qp = kzalloc(sizeof(*qp), GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); if (init_attr->qp_type == IB_QPT_DRIVER) { - init_attr = &mlx_init_attr; - memcpy(init_attr, verbs_init_attr, sizeof(*verbs_init_attr)); - err = set_mlx_qp_type(dev, init_attr, &ucmd, udata); + err = process_vendor_flags(qp, init_attr, &ucmd); if (err) goto free_qp; - - if (init_attr->qp_type == MLX5_IB_QPT_DCI) { - if (init_attr->cap.max_recv_wr || - init_attr->cap.max_recv_sge) { - mlx5_ib_dbg(dev, "DCI QP requires zero size receive queue\n"); - err = -EINVAL; - goto free_qp; - } - qp->qp_sub_type = MLX5_IB_QPT_DCI; - } else { - qp->qp_sub_type = MLX5_IB_QPT_DCT; - } } if (init_attr->qp_type == IB_QPT_XRC_TGT) -- cgit v1.2.3 From 2dfac92dbb5d6e7607d5a4d3dc9d750f45440d98 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:11 +0300 Subject: RDMA/mlx5: Remove second copy from user for non RSS RAW QPs Change the common code to use already copied user command buffer. Link: https://lore.kernel.org/r/20200427154636.381474-12-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 56 ++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5e4c73c4a7b4..91a2c9994b59 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1967,6 +1967,7 @@ static inline bool check_flags_mask(uint64_t input, uint64_t supported) static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, + struct mlx5_ib_create_qp *ucmd, struct ib_udata *udata, struct mlx5_ib_qp *qp) { struct mlx5_ib_resources *devr = &dev->devr; @@ -1979,7 +1980,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_cq *recv_cq; unsigned long flags; u32 uidx = MLX5_IB_DEFAULT_UIDX; - struct mlx5_ib_create_qp ucmd; struct mlx5_ib_qp_base *base; int mlx5_st; void *qpc; @@ -2056,12 +2056,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, } if (udata) { - if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { - mlx5_ib_dbg(dev, "copy failed\n"); - return -EFAULT; - } - - if (!check_flags_mask(ucmd.flags, + if (!check_flags_mask(ucmd->flags, MLX5_QP_FLAG_ALLOW_SCATTER_CQE | MLX5_QP_FLAG_BFREG_INDEX | MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE | @@ -2075,14 +2070,15 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_QP_FLAG_TYPE_DCT)) return -EINVAL; - err = get_qp_user_index(ucontext, &ucmd, udata->inlen, &uidx); + err = get_qp_user_index(ucontext, ucmd, udata->inlen, &uidx); if (err) return err; - qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); + qp->wq_sig = !!(ucmd->flags & MLX5_QP_FLAG_SIGNATURE); if (MLX5_CAP_GEN(dev->mdev, sctr_data_cqe)) - qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); - if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) { + qp->scat_cqe = + !!(ucmd->flags & MLX5_QP_FLAG_SCATTER_CQE); + if (ucmd->flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) { if (init_attr->qp_type != IB_QPT_RAW_PACKET || !tunnel_offload_supported(mdev)) { mlx5_ib_dbg(dev, "Tunnel offload isn't supported\n"); @@ -2091,7 +2087,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->flags_en |= MLX5_QP_FLAG_TUNNEL_OFFLOADS; } - if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) { + if (ucmd->flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) { if (init_attr->qp_type != IB_QPT_RAW_PACKET) { mlx5_ib_dbg(dev, "Self-LB UC isn't supported\n"); return -EOPNOTSUPP; @@ -2099,7 +2095,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC; } - if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) { + if (ucmd->flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) { if (init_attr->qp_type != IB_QPT_RAW_PACKET) { mlx5_ib_dbg(dev, "Self-LB UM isn't supported\n"); return -EOPNOTSUPP; @@ -2107,7 +2103,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC; } - if (ucmd.flags & MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE) { + if (ucmd->flags & MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE) { if (init_attr->qp_type != IB_QPT_RC || !MLX5_CAP_GEN(dev->mdev, qp_packet_based)) { mlx5_ib_dbg(dev, "packet based credit mode isn't supported\n"); @@ -2138,8 +2134,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, &qp->trans_qp.base; qp->has_rq = qp_has_rq(init_attr); - err = set_rq_size(dev, &init_attr->cap, qp->has_rq, - qp, udata ? &ucmd : NULL); + err = set_rq_size(dev, &init_attr->cap, qp->has_rq, qp, ucmd); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); return err; @@ -2149,15 +2144,16 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (udata) { __u32 max_wqes = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); - mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count); - if (ucmd.rq_wqe_shift != qp->rq.wqe_shift || - ucmd.rq_wqe_count != qp->rq.wqe_cnt) { + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", + ucmd->sq_wqe_count); + if (ucmd->rq_wqe_shift != qp->rq.wqe_shift || + ucmd->rq_wqe_count != qp->rq.wqe_cnt) { mlx5_ib_dbg(dev, "invalid rq params\n"); return -EINVAL; } - if (ucmd.sq_wqe_count > max_wqes) { + if (ucmd->sq_wqe_count > max_wqes) { mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", - ucmd.sq_wqe_count, max_wqes); + ucmd->sq_wqe_count, max_wqes); return -EINVAL; } if (init_attr->create_flags & @@ -2225,9 +2221,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, } if (qp->scat_cqe && (qp->qp_sub_type == MLX5_IB_QPT_DCI || init_attr->qp_type == IB_QPT_RC)) - configure_requester_scat_cqe(dev, init_attr, - udata ? &ucmd : NULL, - qpc); + configure_requester_scat_cqe(dev, init_attr, ucmd, qpc); if (qp->rq.wqe_cnt) { MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4); @@ -2308,7 +2302,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (init_attr->qp_type == IB_QPT_RAW_PACKET || qp->flags & MLX5_IB_QP_UNDERLAY) { - qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; + qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd->sq_buf_addr; raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata, &resp); @@ -2698,7 +2692,7 @@ static int create_driver_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, if (attr->cap.max_recv_wr || attr->cap.max_recv_sge) goto out; - ret = create_qp_common(mdev, pd, attr, udata, qp); + ret = create_qp_common(mdev, pd, attr, ucmd, udata, qp); break; default: return -EINVAL; @@ -2712,7 +2706,10 @@ static size_t process_udata_size(struct ib_qp_init_attr *attr, { size_t ucmd = sizeof(struct mlx5_ib_create_qp); - return (udata->inlen < ucmd) ? 0 : ucmd; + if (attr->qp_type == IB_QPT_DRIVER) + return (udata->inlen < ucmd) ? 0 : ucmd; + + return ucmd; } struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, @@ -2742,7 +2739,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, if (init_attr->qp_type == IB_QPT_GSI) return mlx5_ib_gsi_create_qp(pd, init_attr); - if (udata && init_attr->qp_type == IB_QPT_DRIVER) { + if (udata && !init_attr->rwq_ind_tbl) { size_t inlen = process_udata_size(init_attr, udata); @@ -2772,7 +2769,8 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, err = create_driver_qp(pd, qp, init_attr, &ucmd, udata); break; default: - err = create_qp_common(dev, pd, init_attr, udata, qp); + err = create_qp_common(dev, pd, init_attr, + (udata) ? &ucmd : NULL, udata, qp); } if (err) { mlx5_ib_dbg(dev, "create_qp_common failed\n"); -- cgit v1.2.3 From 5d0dc3d96c7b3bc6bc175754abcb132a1c94d02b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:12 +0300 Subject: RDMA/mlx5: Initial separation of RAW_PACKET QP from common flow Create initial function for IB_QPT_RAW_PACKET flow. Link: https://lore.kernel.org/r/20200427154636.381474-13-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 91a2c9994b59..a514b4eca06e 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1634,13 +1634,13 @@ static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *q to_mpd(qp->ibqp.pd)->uid); } -static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - struct ib_pd *pd, +static int create_rss_raw_qp_tir(struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context( udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_create_qp_resp resp = {}; int inlen; int outlen; @@ -1996,9 +1996,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (mlx5_st < 0) return -EINVAL; - if (init_attr->rwq_ind_tbl) - return create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata); - if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { if (!MLX5_CAP_GEN(mdev, block_lb_mc)) { mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); @@ -2712,6 +2709,18 @@ static size_t process_udata_size(struct ib_qp_init_attr *attr, return ucmd; } +static int create_raw_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, + struct ib_qp_init_attr *attr, + struct mlx5_ib_create_qp *ucmd, struct ib_udata *udata) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + + if (attr->rwq_ind_tbl) + return create_rss_raw_qp_tir(pd, qp, attr, udata); + + return create_qp_common(dev, pd, attr, ucmd, udata, qp); +} + struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) @@ -2768,6 +2777,9 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, case IB_QPT_DRIVER: err = create_driver_qp(pd, qp, init_attr, &ucmd, udata); break; + case IB_QPT_RAW_PACKET: + err = create_raw_qp(pd, qp, init_attr, &ucmd, udata); + break; default: err = create_qp_common(dev, pd, init_attr, (udata) ? &ucmd : NULL, udata, qp); -- cgit v1.2.3 From 2be08c308f102eeaee7ffc4a0d08ecee82b77f9d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:13 +0300 Subject: RDMA/mlx5: Delete create QP flags obfuscation There is no point in redefinition of stable and exposed to users create flags. Their values won't be changed and it is equal to used by the mlx5. Delete the mlx5 definitions and use IB/core fields. Link: https://lore.kernel.org/r/20200427154636.381474-14-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 2 +- drivers/infiniband/hw/mlx5/flow.c | 2 +- drivers/infiniband/hw/mlx5/main.c | 6 +-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 21 +--------- drivers/infiniband/hw/mlx5/qp.c | 80 ++++++++++++++++++------------------ 5 files changed, 47 insertions(+), 64 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 35b98c2d64d5..1d7feed6d3cb 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -615,7 +615,7 @@ static bool devx_is_valid_obj_id(struct uverbs_attr_bundle *attrs, enum ib_qp_type qp_type = qp->ibqp.qp_type; if (qp_type == IB_QPT_RAW_PACKET || - (qp->flags & MLX5_IB_QP_UNDERLAY)) { + (qp->flags & IB_QP_CREATE_SOURCE_QPN)) { struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; struct mlx5_ib_rq *rq = &raw_packet_qp->rq; diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 69cb7e6e8955..08fd6a650868 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -142,7 +142,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( return -EINVAL; mqp = to_mqp(qp); - if (mqp->flags & MLX5_IB_QP_RSS) + if (mqp->is_rss) dest_id = mqp->rss_qp.tirn; else dest_id = mqp->raw_packet_qp.rq.tirn; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 65e0e24d463b..80ae8f04bfd5 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3967,7 +3967,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT; } else { dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; - if (mqp->flags & MLX5_IB_QP_RSS) + if (mqp->is_rss) dst->tir_num = mqp->rss_qp.tirn; else dst->tir_num = mqp->raw_packet_qp.rq.tirn; @@ -3978,7 +3978,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, handler = create_dont_trap_rule(dev, ft_prio, flow_attr, dst); } else { - underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ? + underlay_qpn = (mqp->flags & IB_QP_CREATE_SOURCE_QPN) ? mqp->underlay_qpn : 0; handler = _create_flow_rule(dev, ft_prio, flow_attr, dst, underlay_qpn, ucmd); @@ -4447,7 +4447,7 @@ static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) uid = ibqp->pd ? to_mpd(ibqp->pd)->uid : 0; - if (mqp->flags & MLX5_IB_QP_UNDERLAY) { + if (mqp->flags & IB_QP_CREATE_SOURCE_QPN) { mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n"); return -EOPNOTSUPP; } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index aaabb8a98eed..b144660a47e1 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -450,7 +450,8 @@ struct mlx5_ib_qp { int scat_cqe; int max_inline_data; struct mlx5_bf bf; - int has_rq; + u8 has_rq:1; + u8 is_rss:1; /* only for user space QPs. For kernel * we have it from the bf object @@ -481,24 +482,6 @@ struct mlx5_ib_cq_buf { int nent; }; -enum mlx5_ib_qp_flags { - MLX5_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, - MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, - MLX5_IB_QP_CROSS_CHANNEL = IB_QP_CREATE_CROSS_CHANNEL, - MLX5_IB_QP_MANAGED_SEND = IB_QP_CREATE_MANAGED_SEND, - MLX5_IB_QP_MANAGED_RECV = IB_QP_CREATE_MANAGED_RECV, - MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, - /* QP uses 1 as its source QP number */ - MLX5_IB_QP_SQPN_QP1 = 1 << 6, - MLX5_IB_QP_CAP_SCATTER_FCS = 1 << 7, - MLX5_IB_QP_RSS = 1 << 8, - MLX5_IB_QP_CVLAN_STRIPPING = 1 << 9, - MLX5_IB_QP_UNDERLAY = 1 << 10, - MLX5_IB_QP_PCI_WRITE_END_PADDING = 1 << 11, - MLX5_IB_QP_TUNNEL_OFFLOAD = 1 << 12, - MLX5_IB_QP_PACKET_BASED_CREDIT = 1 << 13, -}; - struct mlx5_umr_wr { struct ib_send_wr wr; u64 virt_addr; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a514b4eca06e..cdbb837138c9 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -596,7 +596,7 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev, } if (attr->qp_type == IB_QPT_RAW_PACKET || - qp->flags & MLX5_IB_QP_UNDERLAY) { + qp->flags & IB_QP_CREATE_SOURCE_QPN) { base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6; } else { @@ -951,7 +951,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, bfregn = MLX5_IB_INVALID_BFREG; break; case 0: - if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + if (qp->flags & IB_QP_CREATE_CROSS_CHANNEL) return -EINVAL; bfregn = alloc_bfreg(dev, &context->bfregi); if (bfregn < 0) @@ -1169,7 +1169,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, if (init_attr->create_flags & MLX5_IB_QP_CREATE_SQPN_QP1) { MLX5_SET(qpc, qpc, deth_sqpn, 1); - qp->flags |= MLX5_IB_QP_SQPN_QP1; + qp->flags |= MLX5_IB_QP_CREATE_SQPN_QP1; } mlx5_fill_page_frag_array(&qp->buf, @@ -1251,7 +1251,7 @@ static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, MLX5_SET(create_tis_in, in, uid, to_mpd(pd)->uid); MLX5_SET(tisc, tisc, transport_domain, tdn); - if (qp->flags & MLX5_IB_QP_UNDERLAY) + if (qp->flags & IB_QP_CREATE_SOURCE_QPN) MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn); return mlx5_core_create_tis(dev->mdev, in, &sq->tisn); @@ -1400,7 +1400,7 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index)); MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv)); - if (mqp->flags & MLX5_IB_QP_CAP_SCATTER_FCS) + if (mqp->flags & IB_QP_CREATE_SCATTER_FCS) MLX5_SET(rqc, rqc, scatter_fcs, 1); wq = MLX5_ADDR_OF(rqc, rqc, wq); @@ -1538,9 +1538,9 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (qp->rq.wqe_cnt) { rq->base.container_mibqp = qp; - if (qp->flags & MLX5_IB_QP_CVLAN_STRIPPING) + if (qp->flags & IB_QP_CREATE_CVLAN_STRIPPING) rq->flags |= MLX5_IB_RQ_CVLAN_STRIPPING; - if (qp->flags & MLX5_IB_QP_PCI_WRITE_END_PADDING) + if (qp->flags & IB_QP_CREATE_PCI_WRITE_END_PADDING) rq->flags |= MLX5_IB_RQ_PCI_WRITE_END_PADDING; err = create_raw_packet_qp_rq(dev, rq, in, inlen, pd); if (err) @@ -1878,7 +1878,7 @@ create_tir: kvfree(in); /* qpn is reserved for that QP */ qp->trans_qp.base.mqp.qpn = 0; - qp->flags |= MLX5_IB_QP_RSS; + qp->is_rss = true; return 0; err_copy: @@ -2001,7 +2001,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); return -EINVAL; } else { - qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK; + qp->flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; } } @@ -2014,11 +2014,11 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return -EINVAL; } if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) - qp->flags |= MLX5_IB_QP_CROSS_CHANNEL; + qp->flags |= IB_QP_CREATE_CROSS_CHANNEL; if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND) - qp->flags |= MLX5_IB_QP_MANAGED_SEND; + qp->flags |= IB_QP_CREATE_MANAGED_SEND; if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) - qp->flags |= MLX5_IB_QP_MANAGED_RECV; + qp->flags |= IB_QP_CREATE_MANAGED_RECV; } if (init_attr->qp_type == IB_QPT_UD && @@ -2038,7 +2038,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n"); return -EOPNOTSUPP; } - qp->flags |= MLX5_IB_QP_CAP_SCATTER_FCS; + qp->flags |= IB_QP_CREATE_SCATTER_FCS; } if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) @@ -2049,7 +2049,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_CAP_ETH(dev->mdev, vlan_cap)) || (init_attr->qp_type != IB_QPT_RAW_PACKET)) return -EOPNOTSUPP; - qp->flags |= MLX5_IB_QP_CVLAN_STRIPPING; + qp->flags |= IB_QP_CREATE_CVLAN_STRIPPING; } if (udata) { @@ -2106,7 +2106,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, mlx5_ib_dbg(dev, "packet based credit mode isn't supported\n"); return -EOPNOTSUPP; } - qp->flags |= MLX5_IB_QP_PACKET_BASED_CREDIT; + qp->flags_en |= MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE; } if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) { @@ -2118,7 +2118,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return -EOPNOTSUPP; } - qp->flags |= MLX5_IB_QP_UNDERLAY; + qp->flags |= IB_QP_CREATE_SOURCE_QPN; qp->underlay_qpn = init_attr->source_qpn; } } else { @@ -2126,7 +2126,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, } base = (init_attr->qp_type == IB_QPT_RAW_PACKET || - qp->flags & MLX5_IB_QP_UNDERLAY) ? + qp->flags & IB_QP_CREATE_SOURCE_QPN) ? &qp->raw_packet_qp.rq.base : &qp->trans_qp.base; @@ -2196,16 +2196,16 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (qp->wq_sig) MLX5_SET(qpc, qpc, wq_signature, 1); - if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + if (qp->flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) MLX5_SET(qpc, qpc, block_lb_mc, 1); - if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + if (qp->flags & IB_QP_CREATE_CROSS_CHANNEL) MLX5_SET(qpc, qpc, cd_master, 1); - if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + if (qp->flags & IB_QP_CREATE_MANAGED_SEND) MLX5_SET(qpc, qpc, cd_slave_send, 1); - if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + if (qp->flags & IB_QP_CREATE_MANAGED_RECV) MLX5_SET(qpc, qpc, cd_slave_receive, 1); - if (qp->flags & MLX5_IB_QP_PACKET_BASED_CREDIT) + if (qp->flags_en & MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE) MLX5_SET(qpc, qpc, req_e2e_credit_mode, 1); if (qp->scat_cqe && (init_attr->qp_type == IB_QPT_RC || init_attr->qp_type == IB_QPT_UC)) { @@ -2276,7 +2276,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (init_attr->qp_type == IB_QPT_UD && (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) { MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); - qp->flags |= MLX5_IB_QP_LSO; + qp->flags |= IB_QP_CREATE_IPOIB_UD_LSO; } if (init_attr->create_flags & IB_QP_CREATE_PCI_WRITE_END_PADDING) { @@ -2288,7 +2288,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_SET(qpc, qpc, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN); } else { - qp->flags |= MLX5_IB_QP_PCI_WRITE_END_PADDING; + qp->flags |= IB_QP_CREATE_PCI_WRITE_END_PADDING; } } @@ -2298,7 +2298,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, } if (init_attr->qp_type == IB_QPT_RAW_PACKET || - qp->flags & MLX5_IB_QP_UNDERLAY) { + qp->flags & IB_QP_CREATE_SOURCE_QPN) { qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd->sq_buf_addr; raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata, @@ -2463,13 +2463,13 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, } base = (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || - qp->flags & MLX5_IB_QP_UNDERLAY) ? + qp->flags & IB_QP_CREATE_SOURCE_QPN) ? &qp->raw_packet_qp.rq.base : &qp->trans_qp.base; if (qp->state != IB_QPS_RESET) { if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET && - !(qp->flags & MLX5_IB_QP_UNDERLAY)) { + !(qp->flags & IB_QP_CREATE_SOURCE_QPN)) { err = mlx5_core_qp_modify(dev, MLX5_CMD_OP_2RST_QP, 0, NULL, &base->mqp); } else { @@ -2508,7 +2508,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || - qp->flags & MLX5_IB_QP_UNDERLAY) { + qp->flags & IB_QP_CREATE_SOURCE_QPN) { destroy_raw_packet_qp(dev, qp); } else { err = mlx5_core_destroy_qp(dev, &base->mqp); @@ -3550,7 +3550,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) { if ((ibqp->qp_type == IB_QPT_RC) || (ibqp->qp_type == IB_QPT_UD && - !(qp->flags & MLX5_IB_QP_SQPN_QP1)) || + !(qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1)) || (ibqp->qp_type == IB_QPT_UC) || (ibqp->qp_type == IB_QPT_RAW_PACKET) || (ibqp->qp_type == IB_QPT_XRC_INI) || @@ -3567,7 +3567,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (is_sqp(ibqp->qp_type)) { context->mtu_msgmax = (IB_MTU_256 << 5) | 8; } else if ((ibqp->qp_type == IB_QPT_UD && - !(qp->flags & MLX5_IB_QP_UNDERLAY)) || + !(qp->flags & IB_QP_CREATE_SOURCE_QPN)) || ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; } else if (attr_mask & IB_QP_PATH_MTU) { @@ -3672,7 +3672,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, qp->port) - 1; /* Underlay port should be used - index 0 function per port */ - if (qp->flags & MLX5_IB_QP_UNDERLAY) + if (qp->flags & IB_QP_CREATE_SOURCE_QPN) port_num = 0; if (ibqp->counter) @@ -3686,7 +3686,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) context->sq_crq_size |= cpu_to_be16(1 << 4); - if (qp->flags & MLX5_IB_QP_SQPN_QP1) + if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) context->deth_sqpn = cpu_to_be32(1); mlx5_cur = to_mlx5_state(cur_state); @@ -3703,7 +3703,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || - qp->flags & MLX5_IB_QP_UNDERLAY) { + qp->flags & IB_QP_CREATE_SOURCE_QPN) { struct mlx5_modify_raw_qp_param raw_qp_param = {}; raw_qp_param.operation = op; @@ -3999,7 +3999,7 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; } - if (qp->flags & MLX5_IB_QP_UNDERLAY) { + if (qp->flags & IB_QP_CREATE_SOURCE_QPN) { if (attr_mask & ~(IB_QP_STATE | IB_QP_CUR_STATE)) { mlx5_ib_dbg(dev, "invalid attr_mask 0x%x when underlay QP is used\n", attr_mask); @@ -5831,7 +5831,7 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, mutex_lock(&qp->mutex); if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || - qp->flags & MLX5_IB_QP_UNDERLAY) { + qp->flags & IB_QP_CREATE_SOURCE_QPN) { err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state); if (err) goto out; @@ -5866,16 +5866,16 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_init_attr->cap = qp_attr->cap; qp_init_attr->create_flags = 0; - if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) + if (qp->flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; - if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + if (qp->flags & IB_QP_CREATE_CROSS_CHANNEL) qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL; - if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + if (qp->flags & IB_QP_CREATE_MANAGED_SEND) qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; - if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + if (qp->flags & IB_QP_CREATE_MANAGED_RECV) qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; - if (qp->flags & MLX5_IB_QP_SQPN_QP1) + if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) qp_init_attr->create_flags |= MLX5_IB_QP_CREATE_SQPN_QP1; qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? -- cgit v1.2.3 From 2978975ce7f16131ddf70468f0b189231e33086b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:14 +0300 Subject: RDMA/mlx5: Process create QP flags in one place create_flags is checked in too many places and scattered across all the code, consolidate all the checks inside one function, so we will be easily see the flow. As part of such change, delete unreachable code, because IB/core is responsible sanitize the input. Link: https://lore.kernel.org/r/20200427154636.381474-15-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 200 ++++++++++++++++++++-------------------- 1 file changed, 101 insertions(+), 99 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index cdbb837138c9..02eb03484b91 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1097,17 +1097,9 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, void *qpc; int err; - if (init_attr->create_flags & ~(IB_QP_CREATE_INTEGRITY_EN | - IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | - IB_QP_CREATE_IPOIB_UD_LSO | - IB_QP_CREATE_NETIF_QP | - MLX5_IB_QP_CREATE_SQPN_QP1 | - MLX5_IB_QP_CREATE_WC_TEST)) - return -EINVAL; - if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) qp->bf.bfreg = &dev->fp_bfreg; - else if (init_attr->create_flags & MLX5_IB_QP_CREATE_WC_TEST) + else if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST) qp->bf.bfreg = &dev->wc_bfreg; else qp->bf.bfreg = &dev->bfreg; @@ -1167,10 +1159,8 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, MLX5_SET(qpc, qpc, fre, 1); MLX5_SET(qpc, qpc, rlky, 1); - if (init_attr->create_flags & MLX5_IB_QP_CREATE_SQPN_QP1) { + if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) MLX5_SET(qpc, qpc, deth_sqpn, 1); - qp->flags |= MLX5_IB_QP_CREATE_SQPN_QP1; - } mlx5_fill_page_frag_array(&qp->buf, (__be64 *)MLX5_ADDR_OF(create_qp_in, @@ -1657,7 +1647,7 @@ static int create_rss_raw_qp_tir(struct ib_pd *pd, struct mlx5_ib_qp *qp, size_t required_cmd_sz; u8 lb_flag = 0; - if (init_attr->create_flags || init_attr->send_cq) + if (init_attr->send_cq) return -EINVAL; min_resp_len = offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index); @@ -1996,62 +1986,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (mlx5_st < 0) return -EINVAL; - if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { - if (!MLX5_CAP_GEN(mdev, block_lb_mc)) { - mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n"); - return -EINVAL; - } else { - qp->flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; - } - } - - if (init_attr->create_flags & - (IB_QP_CREATE_CROSS_CHANNEL | - IB_QP_CREATE_MANAGED_SEND | - IB_QP_CREATE_MANAGED_RECV)) { - if (!MLX5_CAP_GEN(mdev, cd)) { - mlx5_ib_dbg(dev, "cross-channel isn't supported\n"); - return -EINVAL; - } - if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) - qp->flags |= IB_QP_CREATE_CROSS_CHANNEL; - if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND) - qp->flags |= IB_QP_CREATE_MANAGED_SEND; - if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) - qp->flags |= IB_QP_CREATE_MANAGED_RECV; - } - - if (init_attr->qp_type == IB_QPT_UD && - (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) - if (!MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) { - mlx5_ib_dbg(dev, "ipoib UD lso qp isn't supported\n"); - return -EOPNOTSUPP; - } - - if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) { - if (init_attr->qp_type != IB_QPT_RAW_PACKET) { - mlx5_ib_dbg(dev, "Scatter FCS is supported only for Raw Packet QPs"); - return -EOPNOTSUPP; - } - if (!MLX5_CAP_GEN(dev->mdev, eth_net_offloads) || - !MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { - mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n"); - return -EOPNOTSUPP; - } - qp->flags |= IB_QP_CREATE_SCATTER_FCS; - } - if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; - if (init_attr->create_flags & IB_QP_CREATE_CVLAN_STRIPPING) { - if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && - MLX5_CAP_ETH(dev->mdev, vlan_cap)) || - (init_attr->qp_type != IB_QPT_RAW_PACKET)) - return -EOPNOTSUPP; - qp->flags |= IB_QP_CREATE_CVLAN_STRIPPING; - } - if (udata) { if (!check_flags_mask(ucmd->flags, MLX5_QP_FLAG_ALLOW_SCATTER_CQE | @@ -2108,23 +2045,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, } qp->flags_en |= MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE; } - - if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) { - if (init_attr->qp_type != IB_QPT_UD || - (MLX5_CAP_GEN(dev->mdev, port_type) != - MLX5_CAP_PORT_TYPE_IB) || - !mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) { - mlx5_ib_dbg(dev, "Source QP option isn't supported\n"); - return -EOPNOTSUPP; - } - - qp->flags |= IB_QP_CREATE_SOURCE_QPN; - qp->underlay_qpn = init_attr->source_qpn; - } } else { qp->wq_sig = !!wq_signature; } + if (qp->flags & IB_QP_CREATE_SOURCE_QPN) + qp->underlay_qpn = init_attr->source_qpn; + base = (init_attr->qp_type == IB_QPT_RAW_PACKET || qp->flags & IB_QP_CREATE_SOURCE_QPN) ? &qp->raw_packet_qp.rq.base : @@ -2153,11 +2080,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, ucmd->sq_wqe_count, max_wqes); return -EINVAL; } - if (init_attr->create_flags & - MLX5_IB_QP_CREATE_SQPN_QP1) { - mlx5_ib_dbg(dev, "user-space is not allowed to create UD QPs spoofing as QP1\n"); - return -EINVAL; - } err = create_user_qp(dev, pd, qp, udata, init_attr, &in, &resp, &inlen, base); if (err) @@ -2273,23 +2195,15 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_SET(qpc, qpc, user_index, uidx); /* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */ - if (init_attr->qp_type == IB_QPT_UD && - (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) { + if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); - qp->flags |= IB_QP_CREATE_IPOIB_UD_LSO; - } - if (init_attr->create_flags & IB_QP_CREATE_PCI_WRITE_END_PADDING) { - if (!MLX5_CAP_GEN(dev->mdev, end_pad)) { - mlx5_ib_dbg(dev, "scatter end padding is not supported\n"); - err = -EOPNOTSUPP; - goto err; - } else if (init_attr->qp_type != IB_QPT_RAW_PACKET) { - MLX5_SET(qpc, qpc, end_padding_mode, - MLX5_WQ_END_PAD_MODE_ALIGN); - } else { - qp->flags |= IB_QP_CREATE_PCI_WRITE_END_PADDING; - } + if (qp->flags & IB_QP_CREATE_PCI_WRITE_END_PADDING && + init_attr->qp_type != IB_QPT_RAW_PACKET) { + MLX5_SET(qpc, qpc, end_padding_mode, + MLX5_WQ_END_PAD_MODE_ALIGN); + /* Special case to clean flag */ + qp->flags &= ~IB_QP_CREATE_PCI_WRITE_END_PADDING; } if (inlen < 0) { @@ -2670,6 +2584,91 @@ static int process_vendor_flags(struct mlx5_ib_qp *qp, return 0; } +static void process_create_flag(struct mlx5_ib_dev *dev, int *flags, int flag, + bool cond, struct mlx5_ib_qp *qp) +{ + if (!(*flags & flag)) + return; + + if (cond) { + qp->flags |= flag; + *flags &= ~flag; + return; + } + + if (flag == MLX5_IB_QP_CREATE_WC_TEST) { + /* + * Special case, if condition didn't meet, it won't be error, + * just different in-kernel flow. + */ + *flags &= ~MLX5_IB_QP_CREATE_WC_TEST; + return; + } + mlx5_ib_dbg(dev, "Verbs create QP flag 0x%X is not supported\n", flag); +} + +static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_qp_init_attr *attr) +{ + enum ib_qp_type qp_type = attr->qp_type; + struct mlx5_core_dev *mdev = dev->mdev; + int create_flags = attr->create_flags; + bool cond; + + if (qp->qp_sub_type == MLX5_IB_QPT_DCT) + return (create_flags) ? -EINVAL : 0; + + if (qp_type == IB_QPT_RAW_PACKET && attr->rwq_ind_tbl) + return (create_flags) ? -EINVAL : 0; + + process_create_flag(dev, &create_flags, + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, + MLX5_CAP_GEN(mdev, block_lb_mc), qp); + process_create_flag(dev, &create_flags, IB_QP_CREATE_CROSS_CHANNEL, + MLX5_CAP_GEN(mdev, cd), qp); + process_create_flag(dev, &create_flags, IB_QP_CREATE_MANAGED_SEND, + MLX5_CAP_GEN(mdev, cd), qp); + process_create_flag(dev, &create_flags, IB_QP_CREATE_MANAGED_RECV, + MLX5_CAP_GEN(mdev, cd), qp); + + if (qp_type == IB_QPT_UD) { + process_create_flag(dev, &create_flags, + IB_QP_CREATE_IPOIB_UD_LSO, + MLX5_CAP_GEN(mdev, ipoib_basic_offloads), + qp); + cond = MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_IB; + process_create_flag(dev, &create_flags, IB_QP_CREATE_SOURCE_QPN, + cond, qp); + } + + if (qp_type == IB_QPT_RAW_PACKET) { + cond = MLX5_CAP_GEN(mdev, eth_net_offloads) && + MLX5_CAP_ETH(mdev, scatter_fcs); + process_create_flag(dev, &create_flags, + IB_QP_CREATE_SCATTER_FCS, cond, qp); + + cond = MLX5_CAP_GEN(mdev, eth_net_offloads) && + MLX5_CAP_ETH(mdev, vlan_cap); + process_create_flag(dev, &create_flags, + IB_QP_CREATE_CVLAN_STRIPPING, cond, qp); + } + + process_create_flag(dev, &create_flags, + IB_QP_CREATE_PCI_WRITE_END_PADDING, + MLX5_CAP_GEN(mdev, end_pad), qp); + + process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_WC_TEST, + qp_type != MLX5_IB_QPT_REG_UMR, qp); + process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_SQPN_QP1, + true, qp); + + if (create_flags) + mlx5_ib_dbg(dev, "Create QP has unsupported flags 0x%X\n", + create_flags); + + return (create_flags) ? -EINVAL : 0; +} + static int create_driver_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr, struct mlx5_ib_create_qp *ucmd, @@ -2769,6 +2768,9 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, if (err) goto free_qp; } + err = process_create_flags(dev, qp, init_attr); + if (err) + goto free_qp; if (init_attr->qp_type == IB_QPT_XRC_TGT) xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; -- cgit v1.2.3 From c95e6d53970254fa04a09c0fd79ae2cfa54cd1f5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:15 +0300 Subject: RDMA/mlx5: Use flags_en mechanism to mark QP created with WQE signature MLX5_QP_FLAG_SIGNATURE is exposed to the users but in the kernel the create_qp flow treated it differently from other MLX5_QP_FLAG_*s. Fix it by ditching wq_sig boolean variable and use general flag_en mechanism. Link: https://lore.kernel.org/r/20200427154636.381474-16-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 - drivers/infiniband/hw/mlx5/odp.c | 2 +- drivers/infiniband/hw/mlx5/qp.c | 36 ++++++++++++++++++++++-------------- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b144660a47e1..61a96c1dd125 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -446,7 +446,6 @@ struct mlx5_ib_qp { u32 flags; u8 port; u8 state; - int wq_sig; int scat_cqe; int max_inline_data; struct mlx5_bf bf; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 16af1105cfcf..e4759310c0e2 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -1190,7 +1190,7 @@ static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev, struct mlx5_ib_wq *wq = &qp->rq; int wqe_size = 1 << wq->wqe_shift; - if (qp->wq_sig) { + if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) { mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n"); return -EFAULT; } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 02eb03484b91..9d29b84242f9 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -41,9 +41,6 @@ #include "cmd.h" #include "qp.h" -/* not supported currently */ -static int wq_signature; - enum { MLX5_IB_ACK_REQ_FREQ = 8, }; @@ -392,17 +389,26 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, cap->max_recv_wr = 0; cap->max_recv_sge = 0; } else { + int wq_sig = !!(qp->flags_en & MLX5_QP_FLAG_SIGNATURE); + if (ucmd) { qp->rq.wqe_cnt = ucmd->rq_wqe_count; if (ucmd->rq_wqe_shift > BITS_PER_BYTE * sizeof(ucmd->rq_wqe_shift)) return -EINVAL; qp->rq.wqe_shift = ucmd->rq_wqe_shift; - if ((1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) < qp->wq_sig) + if ((1 << qp->rq.wqe_shift) / + sizeof(struct mlx5_wqe_data_seg) < + wq_sig) return -EINVAL; - qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_gs = + (1 << qp->rq.wqe_shift) / + sizeof(struct mlx5_wqe_data_seg) - + wq_sig; qp->rq.max_post = qp->rq.wqe_cnt; } else { - wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0; + wqe_size = + wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : + 0; wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg); wqe_size = roundup_pow_of_two(wqe_size); wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size; @@ -416,7 +422,10 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, return -EINVAL; } qp->rq.wqe_shift = ilog2(wqe_size); - qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig; + qp->rq.max_gs = + (1 << qp->rq.wqe_shift) / + sizeof(struct mlx5_wqe_data_seg) - + wq_sig; qp->rq.max_post = qp->rq.wqe_cnt; } } @@ -2008,7 +2017,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (err) return err; - qp->wq_sig = !!(ucmd->flags & MLX5_QP_FLAG_SIGNATURE); + if (ucmd->flags & MLX5_QP_FLAG_SIGNATURE) + qp->flags_en |= MLX5_QP_FLAG_SIGNATURE; if (MLX5_CAP_GEN(dev->mdev, sctr_data_cqe)) qp->scat_cqe = !!(ucmd->flags & MLX5_QP_FLAG_SCATTER_CQE); @@ -2045,8 +2055,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, } qp->flags_en |= MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE; } - } else { - qp->wq_sig = !!wq_signature; } if (qp->flags & IB_QP_CREATE_SOURCE_QPN) @@ -2115,7 +2123,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_SET(qpc, qpc, latency_sensitive, 1); - if (qp->wq_sig) + if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) MLX5_SET(qpc, qpc, wq_signature, 1); if (qp->flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) @@ -4997,7 +5005,7 @@ static void finish_wqe(struct mlx5_ib_qp *qp, mlx5_opcode | ((u32)opmod << 24)); ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8)); ctrl->fm_ce_se |= fence; - if (unlikely(qp->wq_sig)) + if (unlikely(qp->flags_en & MLX5_QP_FLAG_SIGNATURE)) ctrl->signature = wq_sig(ctrl); qp->sq.wrid[idx] = wr_id; @@ -5449,7 +5457,7 @@ static int _mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, } scat = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ind); - if (qp->wq_sig) + if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) scat++; for (i = 0; i < wr->num_sge; i++) @@ -5461,7 +5469,7 @@ static int _mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, scat[i].addr = 0; } - if (qp->wq_sig) { + if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) { sig = (struct mlx5_rwqe_sig *)scat; set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); } -- cgit v1.2.3 From 90ecb37a751b6923bee846c4e19f73b943c6ffa1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:16 +0300 Subject: RDMA/mlx5: Change scatter CQE flag to be set like other vendor flags In similar way to wqe_sig, the scat_cqe was treated differently from other create QP vendor flags. Change it to be similar to other flags and use flags_en mechanism. Link: https://lore.kernel.org/r/20200427154636.381474-17-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 - drivers/infiniband/hw/mlx5/qp.c | 17 ++++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 61a96c1dd125..b6467cadc384 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -446,7 +446,6 @@ struct mlx5_ib_qp { u32 flags; u8 port; u8 state; - int scat_cqe; int max_inline_data; struct mlx5_bf bf; u8 has_rq:1; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 9d29b84242f9..6a4b20c71b40 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2019,9 +2019,10 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (ucmd->flags & MLX5_QP_FLAG_SIGNATURE) qp->flags_en |= MLX5_QP_FLAG_SIGNATURE; - if (MLX5_CAP_GEN(dev->mdev, sctr_data_cqe)) - qp->scat_cqe = - !!(ucmd->flags & MLX5_QP_FLAG_SCATTER_CQE); + if (ucmd->flags & MLX5_QP_FLAG_SCATTER_CQE && + MLX5_CAP_GEN(dev->mdev, sctr_data_cqe)) + qp->flags_en |= MLX5_QP_FLAG_SCATTER_CQE; + if (ucmd->flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) { if (init_attr->qp_type != IB_QPT_RAW_PACKET || !tunnel_offload_supported(mdev)) { @@ -2137,8 +2138,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_SET(qpc, qpc, cd_slave_receive, 1); if (qp->flags_en & MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE) MLX5_SET(qpc, qpc, req_e2e_credit_mode, 1); - if (qp->scat_cqe && (init_attr->qp_type == IB_QPT_RC || - init_attr->qp_type == IB_QPT_UC)) { + if ((qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) && + (init_attr->qp_type == IB_QPT_RC || + init_attr->qp_type == IB_QPT_UC)) { int rcqe_sz = rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq); @@ -2146,8 +2148,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, rcqe_sz == 128 ? MLX5_RES_SCAT_DATA64_CQE : MLX5_RES_SCAT_DATA32_CQE); } - if (qp->scat_cqe && (qp->qp_sub_type == MLX5_IB_QPT_DCI || - init_attr->qp_type == IB_QPT_RC)) + if ((qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) && + (qp->qp_sub_type == MLX5_IB_QPT_DCI || + init_attr->qp_type == IB_QPT_RC)) configure_requester_scat_cqe(dev, init_attr, ucmd, qpc); if (qp->rq.wqe_cnt) { -- cgit v1.2.3 From a8f3ea61e1c826a1f882b3fffbb052390ddee310 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:17 +0300 Subject: RDMA/mlx5: Return all configured create flags through query QP The "flags" field in struct mlx5_ib_qp contains all UAPI flags configured at the create QP stage. Return all the data as is without masking. Link: https://lore.kernel.org/r/20200427154636.381474-18-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + drivers/infiniband/hw/mlx5/qp.c | 13 +------------ 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b6467cadc384..9b2baf119823 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -443,6 +443,7 @@ struct mlx5_ib_qp { /* serialize qp state modifications */ struct mutex mutex; + /* cached variant of create_flags from struct ib_qp_init_attr */ u32 flags; u8 port; u8 state; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 6a4b20c71b40..f0385965a694 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -5878,18 +5878,7 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_init_attr->cap = qp_attr->cap; - qp_init_attr->create_flags = 0; - if (qp->flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) - qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; - - if (qp->flags & IB_QP_CREATE_CROSS_CHANNEL) - qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL; - if (qp->flags & IB_QP_CREATE_MANAGED_SEND) - qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; - if (qp->flags & IB_QP_CREATE_MANAGED_RECV) - qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; - if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) - qp_init_attr->create_flags |= MLX5_IB_QP_CREATE_SQPN_QP1; + qp_init_attr->create_flags = qp->flags; qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; -- cgit v1.2.3 From 37518fa49f764516ba68fcc6ec933066bb545276 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:18 +0300 Subject: RDMA/mlx5: Process all vendor flags in one place Check that vendor flags provided through ucmd are valid. Link: https://lore.kernel.org/r/20200427154636.381474-19-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 156 ++++++++++++++++++---------------------- 1 file changed, 71 insertions(+), 85 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index f0385965a694..2673678f1899 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1430,13 +1430,6 @@ static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev, mlx5_core_destroy_rq_tracked(dev, &rq->base.mqp); } -static bool tunnel_offload_supported(struct mlx5_core_dev *dev) -{ - return (MLX5_CAP_ETH(dev, tunnel_stateless_vxlan) || - MLX5_CAP_ETH(dev, tunnel_stateless_gre) || - MLX5_CAP_ETH(dev, tunnel_stateless_geneve_rx)); -} - static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, u32 qp_flags_en, @@ -1693,27 +1686,20 @@ static int create_rss_raw_qp_tir(struct ib_pd *pd, struct mlx5_ib_qp *qp, return -EOPNOTSUPP; } - if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS && - !tunnel_offload_supported(dev->mdev)) { - mlx5_ib_dbg(dev, "tunnel offloads isn't supported\n"); - return -EOPNOTSUPP; - } - if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER && !(ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)) { mlx5_ib_dbg(dev, "Tunnel offloads must be set for inner RSS\n"); return -EOPNOTSUPP; } - if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC || dev->is_rep) { - lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + if (dev->is_rep) qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC; - } - if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) { + if (qp->flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + + if (qp->flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST; - qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC; - } err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); if (err) { @@ -1959,11 +1945,6 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, return atomic_mode; } -static inline bool check_flags_mask(uint64_t input, uint64_t supported) -{ - return (input & ~supported) == 0; -} - static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct mlx5_ib_create_qp *ucmd, @@ -1999,63 +1980,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; if (udata) { - if (!check_flags_mask(ucmd->flags, - MLX5_QP_FLAG_ALLOW_SCATTER_CQE | - MLX5_QP_FLAG_BFREG_INDEX | - MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE | - MLX5_QP_FLAG_SCATTER_CQE | - MLX5_QP_FLAG_SIGNATURE | - MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC | - MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | - MLX5_QP_FLAG_TUNNEL_OFFLOADS | - MLX5_QP_FLAG_UAR_PAGE_INDEX | - MLX5_QP_FLAG_TYPE_DCI | - MLX5_QP_FLAG_TYPE_DCT)) - return -EINVAL; - err = get_qp_user_index(ucontext, ucmd, udata->inlen, &uidx); if (err) return err; - - if (ucmd->flags & MLX5_QP_FLAG_SIGNATURE) - qp->flags_en |= MLX5_QP_FLAG_SIGNATURE; - if (ucmd->flags & MLX5_QP_FLAG_SCATTER_CQE && - MLX5_CAP_GEN(dev->mdev, sctr_data_cqe)) - qp->flags_en |= MLX5_QP_FLAG_SCATTER_CQE; - - if (ucmd->flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) { - if (init_attr->qp_type != IB_QPT_RAW_PACKET || - !tunnel_offload_supported(mdev)) { - mlx5_ib_dbg(dev, "Tunnel offload isn't supported\n"); - return -EOPNOTSUPP; - } - qp->flags_en |= MLX5_QP_FLAG_TUNNEL_OFFLOADS; - } - - if (ucmd->flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) { - if (init_attr->qp_type != IB_QPT_RAW_PACKET) { - mlx5_ib_dbg(dev, "Self-LB UC isn't supported\n"); - return -EOPNOTSUPP; - } - qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC; - } - - if (ucmd->flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) { - if (init_attr->qp_type != IB_QPT_RAW_PACKET) { - mlx5_ib_dbg(dev, "Self-LB UM isn't supported\n"); - return -EOPNOTSUPP; - } - qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC; - } - - if (ucmd->flags & MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE) { - if (init_attr->qp_type != IB_QPT_RC || - !MLX5_CAP_GEN(dev->mdev, qp_packet_based)) { - mlx5_ib_dbg(dev, "packet based credit mode isn't supported\n"); - return -EOPNOTSUPP; - } - qp->flags_en |= MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE; - } } if (qp->flags & IB_QP_CREATE_SOURCE_QPN) @@ -2474,7 +2401,7 @@ static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key); MLX5_SET(dctc, dctc, user_index, uidx); - if (ucmd->flags & MLX5_QP_FLAG_SCATTER_CQE) { + if (qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) { int rcqe_sz = mlx5_ib_get_cqe_size(attr->recv_cq); if (rcqe_sz == 128) @@ -2577,22 +2504,81 @@ static int check_valid_flow(struct mlx5_ib_dev *dev, struct ib_pd *pd, return 0; } -static int process_vendor_flags(struct mlx5_ib_qp *qp, +static void process_vendor_flag(struct mlx5_ib_dev *dev, int *flags, int flag, + bool cond, struct mlx5_ib_qp *qp) +{ + if (!(*flags & flag)) + return; + + if (cond) { + qp->flags_en |= flag; + *flags &= ~flag; + return; + } + + if (flag == MLX5_QP_FLAG_SCATTER_CQE) { + /* + * We don't return error if this flag was provided, + * and mlx5 doesn't have right capability. + */ + *flags &= ~MLX5_QP_FLAG_SCATTER_CQE; + return; + } + mlx5_ib_dbg(dev, "Vendor create QP flag 0x%X is not supported\n", flag); +} + +static int process_vendor_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr, struct mlx5_ib_create_qp *ucmd) { - switch (ucmd->flags & (MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI)) { + struct mlx5_core_dev *mdev = dev->mdev; + int flags = ucmd->flags; + bool cond; + + switch (flags & (MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI)) { case MLX5_QP_FLAG_TYPE_DCI: qp->qp_sub_type = MLX5_IB_QPT_DCI; break; case MLX5_QP_FLAG_TYPE_DCT: qp->qp_sub_type = MLX5_IB_QPT_DCT; - break; + fallthrough; default: + break; + } + + if (attr->qp_type == IB_QPT_DRIVER && !qp->qp_sub_type) return -EINVAL; + + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TYPE_DCI, true, qp); + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TYPE_DCT, true, qp); + + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_SIGNATURE, true, qp); + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_SCATTER_CQE, + MLX5_CAP_GEN(mdev, sctr_data_cqe), qp); + + if (attr->qp_type == IB_QPT_RAW_PACKET) { + cond = MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) || + MLX5_CAP_ETH(mdev, tunnel_stateless_gre) || + MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx); + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TUNNEL_OFFLOADS, + cond, qp); + process_vendor_flag(dev, &flags, + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC, true, + qp); + process_vendor_flag(dev, &flags, + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC, true, + qp); } - return 0; + if (attr->qp_type == IB_QPT_RC) + process_vendor_flag(dev, &flags, + MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE, + MLX5_CAP_GEN(mdev, qp_packet_based), qp); + + if (flags) + mlx5_ib_dbg(dev, "udata has unsupported flags 0x%X\n", flags); + + return (flags) ? -EINVAL : 0; } static void process_create_flag(struct mlx5_ib_dev *dev, int *flags, int flag, @@ -2774,8 +2760,8 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, if (!qp) return ERR_PTR(-ENOMEM); - if (init_attr->qp_type == IB_QPT_DRIVER) { - err = process_vendor_flags(qp, init_attr, &ucmd); + if (udata) { + err = process_vendor_flags(dev, qp, init_attr, &ucmd); if (err) goto free_qp; } -- cgit v1.2.3 From 3ae7e66a019e18896c46fcbb7ae28bfc343331c4 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:19 +0300 Subject: RDMA/mlx5: Delete unsupported QP types There is no need to explicitly check unsupported QP types, rely on "default" keyword in switch-case to catch them. Link: https://lore.kernel.org/r/20200427154636.381474-20-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 2673678f1899..5e156b02816a 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -760,10 +760,7 @@ static int to_mlx5_st(enum ib_qp_type type) case IB_QPT_SMI: return MLX5_QP_ST_QP0; case MLX5_IB_QPT_HW_GSI: return MLX5_QP_ST_QP1; case MLX5_IB_QPT_DCI: return MLX5_QP_ST_DCI; - case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; - case IB_QPT_RAW_PACKET: - case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; - case IB_QPT_MAX: + case IB_QPT_RAW_PACKET: return MLX5_QP_ST_RAW_ETHERTYPE; default: return -EINVAL; } } @@ -2282,14 +2279,10 @@ static void get_cqs(enum ib_qp_type qp_type, case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: - case IB_QPT_RAW_IPV6: - case IB_QPT_RAW_ETHERTYPE: case IB_QPT_RAW_PACKET: *send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL; *recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL; break; - - case IB_QPT_MAX: default: *send_cq = NULL; *recv_cq = NULL; @@ -2434,9 +2427,6 @@ static int check_qp_type(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr) case IB_QPT_DRIVER: case IB_QPT_GSI: return 0; - case IB_QPT_RAW_IPV6: - case IB_QPT_RAW_ETHERTYPE: - case IB_QPT_MAX: default: goto out; } -- cgit v1.2.3 From 7aede1a25f4b84318e8a266d7b830a5ed554e370 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:20 +0300 Subject: RDMA/mlx5: Store QP type in the vendor QP structure QP type is stored in the IB/core QP struct, but it doesn't have all the needed information, like internal QP type used in the driver itself. Update mlx5_ib to have cached QP type which includes both IBTA and Mellanox specific one. Such change allows us to make even further cleanup of QP creation flow. Link: https://lore.kernel.org/r/20200427154636.381474-21-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 8 ++- drivers/infiniband/hw/mlx5/odp.c | 3 +- drivers/infiniband/hw/mlx5/qp.c | 136 +++++++++++++++++------------------ 3 files changed, 74 insertions(+), 73 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 9b2baf119823..82ea01a211dd 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -465,8 +465,12 @@ struct mlx5_ib_qp { struct mlx5_rate_limit rl; u32 underlay_qpn; u32 flags_en; - /* storage for qp sub type when core qp type is IB_QPT_DRIVER */ - enum ib_qp_type qp_sub_type; + /* + * IB/core doesn't store low-level QP types, so + * store both MLX and IBTA types in the field below. + * IB_QPT_DRIVER will be break to DCI/DCT subtypes. + */ + enum ib_qp_type type; /* A flag to indicate if there's a new counter is configured * but not take effective */ diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index e4759310c0e2..70577d546567 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -1136,8 +1136,7 @@ static int mlx5_ib_mr_initiator_pfault_handler( if (qp->ibqp.qp_type == IB_QPT_XRC_INI) *wqe += sizeof(struct mlx5_wqe_xrc_seg); - if (qp->ibqp.qp_type == IB_QPT_UD || - qp->qp_sub_type == MLX5_IB_QPT_DCI) { + if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) { av = *wqe; if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV)) *wqe += sizeof(struct mlx5_av); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5e156b02816a..0d3f4bafe448 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1227,14 +1227,13 @@ static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) { - if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) || - (qp->qp_sub_type == MLX5_IB_QPT_DCI) || - (attr->qp_type == IB_QPT_XRC_INI)) + if (attr->srq || (qp->type == IB_QPT_XRC_TGT) || + (qp->type == MLX5_IB_QPT_DCI) || (qp->type == IB_QPT_XRC_INI)) return MLX5_SRQ_RQ; else if (!qp->has_rq) return MLX5_ZERO_LEN_RQ; - else - return MLX5_NON_ZERO_RQ; + + return MLX5_NON_ZERO_RQ; } static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, @@ -1967,9 +1966,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); - mlx5_st = to_mlx5_st((init_attr->qp_type != IB_QPT_DRIVER) ? - init_attr->qp_type : - qp->qp_sub_type); + mlx5_st = to_mlx5_st(qp->type); if (mlx5_st < 0) return -EINVAL; @@ -2073,8 +2070,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_RES_SCAT_DATA32_CQE); } if ((qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) && - (qp->qp_sub_type == MLX5_IB_QPT_DCI || - init_attr->qp_type == IB_QPT_RC)) + (qp->type == MLX5_IB_QPT_DCI || qp->type == IB_QPT_RC)) configure_requester_scat_cqe(dev, init_attr, ucmd, qpc); if (qp->rq.wqe_cnt) { @@ -2166,7 +2162,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, base->container_mibqp = qp; base->mqp.event = mlx5_ib_qp_event; - get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq, + get_cqs(qp->type, init_attr->send_cq, init_attr->recv_cq, &send_cq, &recv_cq); spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx5_ib_lock_cqs(send_cq, recv_cq); @@ -2406,7 +2402,8 @@ static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, return 0; } -static int check_qp_type(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr) +static int check_qp_type(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, + enum ib_qp_type *type) { if (attr->qp_type == IB_QPT_DRIVER && !MLX5_CAP_GEN(dev->mdev, dct)) goto out; @@ -2426,11 +2423,12 @@ static int check_qp_type(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr) case MLX5_IB_QPT_REG_UMR: case IB_QPT_DRIVER: case IB_QPT_GSI: - return 0; + break; default: goto out; } + *type = attr->qp_type; return 0; out: @@ -2518,7 +2516,6 @@ static void process_vendor_flag(struct mlx5_ib_dev *dev, int *flags, int flag, } static int process_vendor_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - struct ib_qp_init_attr *attr, struct mlx5_ib_create_qp *ucmd) { struct mlx5_core_dev *mdev = dev->mdev; @@ -2527,17 +2524,20 @@ static int process_vendor_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, switch (flags & (MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI)) { case MLX5_QP_FLAG_TYPE_DCI: - qp->qp_sub_type = MLX5_IB_QPT_DCI; + qp->type = MLX5_IB_QPT_DCI; break; case MLX5_QP_FLAG_TYPE_DCT: - qp->qp_sub_type = MLX5_IB_QPT_DCT; - fallthrough; - default: + qp->type = MLX5_IB_QPT_DCT; break; - } - - if (attr->qp_type == IB_QPT_DRIVER && !qp->qp_sub_type) + default: + if (qp->type != IB_QPT_DRIVER) + break; + /* + * It is IB_QPT_DRIVER and or no subtype or + * wrong subtype were provided. + */ return -EINVAL; + } process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TYPE_DCI, true, qp); process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TYPE_DCT, true, qp); @@ -2546,7 +2546,7 @@ static int process_vendor_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, process_vendor_flag(dev, &flags, MLX5_QP_FLAG_SCATTER_CQE, MLX5_CAP_GEN(mdev, sctr_data_cqe), qp); - if (attr->qp_type == IB_QPT_RAW_PACKET) { + if (qp->type == IB_QPT_RAW_PACKET) { cond = MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) || MLX5_CAP_ETH(mdev, tunnel_stateless_gre) || MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx); @@ -2560,7 +2560,7 @@ static int process_vendor_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, qp); } - if (attr->qp_type == IB_QPT_RC) + if (qp->type == IB_QPT_RC) process_vendor_flag(dev, &flags, MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE, MLX5_CAP_GEN(mdev, qp_packet_based), qp); @@ -2597,12 +2597,12 @@ static void process_create_flag(struct mlx5_ib_dev *dev, int *flags, int flag, static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) { - enum ib_qp_type qp_type = attr->qp_type; + enum ib_qp_type qp_type = qp->type; struct mlx5_core_dev *mdev = dev->mdev; int create_flags = attr->create_flags; bool cond; - if (qp->qp_sub_type == MLX5_IB_QPT_DCT) + if (qp_type == MLX5_IB_QPT_DCT) return (create_flags) ? -EINVAL : 0; if (qp_type == IB_QPT_RAW_PACKET && attr->rwq_ind_tbl) @@ -2656,34 +2656,6 @@ static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, return (create_flags) ? -EINVAL : 0; } -static int create_driver_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, - struct ib_qp_init_attr *attr, - struct mlx5_ib_create_qp *ucmd, - struct ib_udata *udata) -{ - struct mlx5_ib_dev *mdev = to_mdev(pd->device); - int ret = -EINVAL; - - switch (qp->qp_sub_type) { - case MLX5_IB_QPT_DCT: - if (!attr->srq || !attr->recv_cq) - goto out; - - ret = create_dct(pd, qp, attr, ucmd, udata); - break; - case MLX5_IB_QPT_DCI: - if (attr->cap.max_recv_wr || attr->cap.max_recv_sge) - goto out; - - ret = create_qp_common(mdev, pd, attr, ucmd, udata, qp); - break; - default: - return -EINVAL; - } - -out: return ret; -} - static size_t process_udata_size(struct ib_qp_init_attr *attr, struct ib_udata *udata) { @@ -2707,6 +2679,30 @@ static int create_raw_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, return create_qp_common(dev, pd, attr, ucmd, udata, qp); } +static int check_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_qp_init_attr *attr) +{ + int ret = 0; + + switch (qp->type) { + case MLX5_IB_QPT_DCT: + ret = (!attr->srq || !attr->recv_cq) ? -EINVAL : 0; + break; + case MLX5_IB_QPT_DCI: + ret = (attr->cap.max_recv_wr || attr->cap.max_recv_sge) ? + -EINVAL : + 0; + break; + default: + break; + } + + if (ret) + mlx5_ib_dbg(dev, "QP type %d has wrong attributes\n", qp->type); + + return ret; +} + struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) @@ -2714,13 +2710,14 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct mlx5_ib_create_qp ucmd = {}; struct mlx5_ib_dev *dev; struct mlx5_ib_qp *qp; + enum ib_qp_type type; u16 xrcdn = 0; int err; dev = pd ? to_mdev(pd->device) : to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); - err = check_qp_type(dev, init_attr); + err = check_qp_type(dev, init_attr, &type); if (err) { mlx5_ib_dbg(dev, "Unsupported QP type %d\n", init_attr->qp_type); @@ -2750,8 +2747,9 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, if (!qp) return ERR_PTR(-ENOMEM); + qp->type = type; if (udata) { - err = process_vendor_flags(dev, qp, init_attr, &ucmd); + err = process_vendor_flags(dev, qp, &ucmd); if (err) goto free_qp; } @@ -2759,16 +2757,20 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, if (err) goto free_qp; - if (init_attr->qp_type == IB_QPT_XRC_TGT) + if (qp->type == IB_QPT_XRC_TGT) xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; - switch (init_attr->qp_type) { - case IB_QPT_DRIVER: - err = create_driver_qp(pd, qp, init_attr, &ucmd, udata); - break; + err = check_qp_attr(dev, qp, init_attr); + if (err) + goto free_qp; + + switch (qp->type) { case IB_QPT_RAW_PACKET: err = create_raw_qp(pd, qp, init_attr, &ucmd, udata); break; + case MLX5_IB_QPT_DCT: + err = create_dct(pd, qp, init_attr, &ucmd, udata); + break; default: err = create_qp_common(dev, pd, init_attr, (udata) ? &ucmd : NULL, udata, qp); @@ -2821,7 +2823,7 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata) if (unlikely(qp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_destroy_qp(qp); - if (mqp->qp_sub_type == MLX5_IB_QPT_DCT) + if (mqp->type == MLX5_IB_QPT_DCT) return mlx5_ib_destroy_dct(mqp); destroy_qp_common(dev, mqp, udata); @@ -3508,8 +3510,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, u16 op; u8 tx_affinity = 0; - mlx5_st = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ? - qp->qp_sub_type : ibqp->qp_type); + mlx5_st = to_mlx5_st(qp->type); if (mlx5_st < 0) return -EINVAL; @@ -3970,11 +3971,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); - if (ibqp->qp_type == IB_QPT_DRIVER) - qp_type = qp->qp_sub_type; - else - qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? - IB_QPT_GSI : ibqp->qp_type; + qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? IB_QPT_GSI : + qp->type; if (qp_type == MLX5_IB_QPT_DCT) return mlx5_ib_modify_dct(ibqp, attr, attr_mask, udata); @@ -5813,7 +5811,7 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, memset(qp_init_attr, 0, sizeof(*qp_init_attr)); memset(qp_attr, 0, sizeof(*qp_attr)); - if (unlikely(qp->qp_sub_type == MLX5_IB_QPT_DCT)) + if (unlikely(qp->type == MLX5_IB_QPT_DCT)) return mlx5_ib_dct_query_qp(dev, qp, qp_attr, qp_attr_mask, qp_init_attr); -- cgit v1.2.3 From 266424eba6c90ab8b12cf73aae00f1b08c0619cf Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:21 +0300 Subject: RDMA/mlx5: Promote RSS RAW QP attribute check in higher level Perform check of attributes of RAW PACKET QP in separate function. Link: https://lore.kernel.org/r/20200427154636.381474-22-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 0d3f4bafe448..454433a18b97 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1645,9 +1645,6 @@ static int create_rss_raw_qp_tir(struct ib_pd *pd, struct mlx5_ib_qp *qp, size_t required_cmd_sz; u8 lb_flag = 0; - if (init_attr->send_cq) - return -EINVAL; - min_resp_len = offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index); if (udata->outlen < min_resp_len) return -EINVAL; @@ -2693,6 +2690,9 @@ static int check_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, -EINVAL : 0; break; + case IB_QPT_RAW_PACKET: + ret = (attr->rwq_ind_tbl && attr->send_cq) ? -EINVAL : 0; + break; default: break; } -- cgit v1.2.3 From 5ce0592b0ee56e41f1a4a164ac2f54dbfbbf5e49 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:22 +0300 Subject: RDMA/mlx5: Combine copy of create QP command in RSS RAW QP Change the create QP flow to handle all copy_from_user() operations in one place. Link: https://lore.kernel.org/r/20200427154636.381474-23-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 156 +++++++++++++++++++++------------------- 1 file changed, 82 insertions(+), 74 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 454433a18b97..4f69105f082b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1624,6 +1624,7 @@ static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *q static int create_rss_raw_qp_tir(struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_qp_init_attr *init_attr, + struct mlx5_ib_create_qp_rss *ucmd, struct ib_udata *udata) { struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context( @@ -1641,46 +1642,26 @@ static int create_rss_raw_qp_tir(struct ib_pd *pd, struct mlx5_ib_qp *qp, u32 outer_l4; size_t min_resp_len; u32 tdn = mucontext->tdn; - struct mlx5_ib_create_qp_rss ucmd = {}; - size_t required_cmd_sz; u8 lb_flag = 0; min_resp_len = offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index); if (udata->outlen < min_resp_len) return -EINVAL; - required_cmd_sz = offsetof(typeof(ucmd), flags) + sizeof(ucmd.flags); - if (udata->inlen < required_cmd_sz) { - mlx5_ib_dbg(dev, "invalid inlen\n"); - return -EINVAL; - } - - if (udata->inlen > sizeof(ucmd) && - !ib_is_udata_cleared(udata, sizeof(ucmd), - udata->inlen - sizeof(ucmd))) { - mlx5_ib_dbg(dev, "inlen is not supported\n"); - return -EOPNOTSUPP; - } - - if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { - mlx5_ib_dbg(dev, "copy failed\n"); - return -EFAULT; - } - - if (ucmd.comp_mask) { + if (ucmd->comp_mask) { mlx5_ib_dbg(dev, "invalid comp mask\n"); return -EOPNOTSUPP; } - if (ucmd.flags & ~(MLX5_QP_FLAG_TUNNEL_OFFLOADS | + if (ucmd->flags & ~(MLX5_QP_FLAG_TUNNEL_OFFLOADS | MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) { mlx5_ib_dbg(dev, "invalid flags\n"); return -EOPNOTSUPP; } - if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER && - !(ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)) { + if (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_INNER && + !(ucmd->flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)) { mlx5_ib_dbg(dev, "Tunnel offloads must be set for inner RSS\n"); return -EOPNOTSUPP; } @@ -1717,29 +1698,29 @@ static int create_rss_raw_qp_tir(struct ib_pd *pd, struct mlx5_ib_qp *qp, hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); - if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) + if (ucmd->flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) MLX5_SET(tirc, tirc, tunneled_offload_en, 1); MLX5_SET(tirc, tirc, self_lb_block, lb_flag); - if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER) + if (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_INNER) hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner); else hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer); - switch (ucmd.rx_hash_function) { + switch (ucmd->rx_hash_function) { case MLX5_RX_HASH_FUNC_TOEPLITZ: { void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key); size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key); - if (len != ucmd.rx_key_len) { + if (len != ucmd->rx_key_len) { err = -EINVAL; goto err; } MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ); - memcpy(rss_key, ucmd.rx_hash_key, len); + memcpy(rss_key, ucmd->rx_hash_key, len); break; } default: @@ -1747,7 +1728,7 @@ static int create_rss_raw_qp_tir(struct ib_pd *pd, struct mlx5_ib_qp *qp, goto err; } - if (!ucmd.rx_hash_fields_mask) { + if (!ucmd->rx_hash_fields_mask) { /* special case when this TIR serves as steering entry without hashing */ if (!init_attr->rwq_ind_tbl->log_ind_tbl_size) goto create_tir; @@ -1755,29 +1736,31 @@ static int create_rss_raw_qp_tir(struct ib_pd *pd, struct mlx5_ib_qp *qp, goto err; } - if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) && - ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) { + if (((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) && + ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) { err = -EINVAL; goto err; } /* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */ - if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) + if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4); - else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) + else if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) MLX5_SET(rx_hash_field_select, hfso, l3_prot_type, MLX5_L3_PROT_TYPE_IPV6); - outer_l4 = ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) << 0 | - ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) << 1 | - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) << 2; + outer_l4 = ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) + << 0 | + ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + << 1 | + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) << 2; /* Check that only one l4 protocol is set */ if (outer_l4 & (outer_l4 - 1)) { @@ -1786,32 +1769,32 @@ static int create_rss_raw_qp_tir(struct ib_pd *pd, struct mlx5_ib_qp *qp, } /* If none of TCP & UDP SRC/DST was set - this bit field is ignored */ - if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) + if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, MLX5_L4_PROT_TYPE_TCP); - else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + else if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) MLX5_SET(rx_hash_field_select, hfso, l4_prot_type, MLX5_L4_PROT_TYPE_UDP); - if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6)) + if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6)) selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP; - if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) + if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6)) selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP; - if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP)) + if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP)) selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT; - if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) || - (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) + if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT; - if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) + if (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) selected_fields |= MLX5_HASH_FIELD_SEL_IPSEC_SPI; MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); @@ -2513,11 +2496,16 @@ static void process_vendor_flag(struct mlx5_ib_dev *dev, int *flags, int flag, } static int process_vendor_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - struct mlx5_ib_create_qp *ucmd) + void *ucmd, struct ib_qp_init_attr *attr) { struct mlx5_core_dev *mdev = dev->mdev; - int flags = ucmd->flags; bool cond; + int flags; + + if (attr->rwq_ind_tbl) + flags = ((struct mlx5_ib_create_qp_rss *)ucmd)->flags; + else + flags = ((struct mlx5_ib_create_qp *)ucmd)->flags; switch (flags & (MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI)) { case MLX5_QP_FLAG_TYPE_DCI: @@ -2657,21 +2645,32 @@ static size_t process_udata_size(struct ib_qp_init_attr *attr, struct ib_udata *udata) { size_t ucmd = sizeof(struct mlx5_ib_create_qp); + size_t inlen = udata->inlen; if (attr->qp_type == IB_QPT_DRIVER) - return (udata->inlen < ucmd) ? 0 : ucmd; + return (inlen < ucmd) ? 0 : ucmd; + + if (!attr->rwq_ind_tbl) + return ucmd; + + if (inlen < offsetofend(struct mlx5_ib_create_qp_rss, flags)) + return 0; + + ucmd = sizeof(struct mlx5_ib_create_qp_rss); + if (inlen > ucmd && !ib_is_udata_cleared(udata, ucmd, inlen - ucmd)) + return 0; - return ucmd; + return min(ucmd, inlen); } static int create_raw_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, - struct ib_qp_init_attr *attr, - struct mlx5_ib_create_qp *ucmd, struct ib_udata *udata) + struct ib_qp_init_attr *attr, void *ucmd, + struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(pd->device); if (attr->rwq_ind_tbl) - return create_rss_raw_qp_tir(pd, qp, attr, udata); + return create_rss_raw_qp_tir(pd, qp, attr, ucmd, udata); return create_qp_common(dev, pd, attr, ucmd, udata, qp); } @@ -2707,10 +2706,10 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { - struct mlx5_ib_create_qp ucmd = {}; struct mlx5_ib_dev *dev; struct mlx5_ib_qp *qp; enum ib_qp_type type; + void *ucmd = NULL; u16 xrcdn = 0; int err; @@ -2731,25 +2730,31 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, if (init_attr->qp_type == IB_QPT_GSI) return mlx5_ib_gsi_create_qp(pd, init_attr); - if (udata && !init_attr->rwq_ind_tbl) { + if (udata) { size_t inlen = process_udata_size(init_attr, udata); if (!inlen) return ERR_PTR(-EINVAL); - err = ib_copy_from_udata(&ucmd, udata, inlen); + ucmd = kzalloc(inlen, GFP_KERNEL); + if (!ucmd) + return ERR_PTR(-ENOMEM); + + err = ib_copy_from_udata(ucmd, udata, inlen); if (err) - return ERR_PTR(err); + goto free_ucmd; } qp = kzalloc(sizeof(*qp), GFP_KERNEL); - if (!qp) - return ERR_PTR(-ENOMEM); + if (!qp) { + err = -ENOMEM; + goto free_ucmd; + } qp->type = type; if (udata) { - err = process_vendor_flags(dev, qp, &ucmd); + err = process_vendor_flags(dev, qp, ucmd, init_attr); if (err) goto free_qp; } @@ -2766,20 +2771,21 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, switch (qp->type) { case IB_QPT_RAW_PACKET: - err = create_raw_qp(pd, qp, init_attr, &ucmd, udata); + err = create_raw_qp(pd, qp, init_attr, ucmd, udata); break; case MLX5_IB_QPT_DCT: - err = create_dct(pd, qp, init_attr, &ucmd, udata); + err = create_dct(pd, qp, init_attr, ucmd, udata); break; default: - err = create_qp_common(dev, pd, init_attr, - (udata) ? &ucmd : NULL, udata, qp); + err = create_qp_common(dev, pd, init_attr, ucmd, udata, qp); } if (err) { mlx5_ib_dbg(dev, "create_qp_common failed\n"); goto free_qp; } + kfree(ucmd); + if (is_qp0(init_attr->qp_type)) qp->ibqp.qp_num = 0; else if (is_qp1(init_attr->qp_type)) @@ -2793,6 +2799,8 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, free_qp: kfree(qp); +free_ucmd: + kfree(ucmd); return ERR_PTR(err); } -- cgit v1.2.3 From 76883a6cc1459b2aa1e96e1eaa19aa37221c2406 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:23 +0300 Subject: RDMA/mlx5: Remove second user copy in create_user_qp Combine copy_from_user() from create_user_qp() and general code. Link: https://lore.kernel.org/r/20200427154636.381474-24-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 4f69105f082b..495f03905fbf 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -914,13 +914,12 @@ static int adjust_bfregn(struct mlx5_ib_dev *dev, static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_udata *udata, - struct ib_qp_init_attr *attr, - u32 **in, + struct ib_qp_init_attr *attr, u32 **in, struct mlx5_ib_create_qp_resp *resp, int *inlen, - struct mlx5_ib_qp_base *base) + struct mlx5_ib_qp_base *base, + struct mlx5_ib_create_qp *ucmd) { struct mlx5_ib_ucontext *context; - struct mlx5_ib_create_qp ucmd; struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer; int page_shift = 0; int uar_index = 0; @@ -934,24 +933,18 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, u16 uid; u32 uar_flags; - err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); - if (err) { - mlx5_ib_dbg(dev, "copy failed\n"); - return err; - } - context = rdma_udata_to_drv_context(udata, struct mlx5_ib_ucontext, ibucontext); - uar_flags = ucmd.flags & (MLX5_QP_FLAG_UAR_PAGE_INDEX | - MLX5_QP_FLAG_BFREG_INDEX); + uar_flags = qp->flags_en & + (MLX5_QP_FLAG_UAR_PAGE_INDEX | MLX5_QP_FLAG_BFREG_INDEX); switch (uar_flags) { case MLX5_QP_FLAG_UAR_PAGE_INDEX: - uar_index = ucmd.bfreg_index; + uar_index = ucmd->bfreg_index; bfregn = MLX5_IB_INVALID_BFREG; break; case MLX5_QP_FLAG_BFREG_INDEX: uar_index = bfregn_to_uar_index(dev, &context->bfregi, - ucmd.bfreg_index, true); + ucmd->bfreg_index, true); if (uar_index < 0) return uar_index; bfregn = MLX5_IB_INVALID_BFREG; @@ -976,12 +969,12 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; - err = set_user_buf_size(dev, qp, &ucmd, base, attr); + err = set_user_buf_size(dev, qp, ucmd, base, attr); if (err) goto err_bfreg; - if (ucmd.buf_addr && ubuffer->buf_size) { - ubuffer->buf_addr = ucmd.buf_addr; + if (ucmd->buf_addr && ubuffer->buf_size) { + ubuffer->buf_addr = ucmd->buf_addr; err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr, ubuffer->buf_size, &ubuffer->umem, &npages, &page_shift, &ncont, &offset); @@ -1018,7 +1011,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, resp->bfreg_index = MLX5_IB_INVALID_BFREG; qp->bfregn = bfregn; - err = mlx5_ib_db_map_user(context, udata, ucmd.db_addr, &qp->db); + err = mlx5_ib_db_map_user(context, udata, ucmd->db_addr, &qp->db); if (err) { mlx5_ib_dbg(dev, "map failed\n"); goto err_free; @@ -1991,7 +1984,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return -EINVAL; } err = create_user_qp(dev, pd, qp, udata, init_attr, &in, - &resp, &inlen, base); + &resp, &inlen, base, ucmd); if (err) mlx5_ib_dbg(dev, "err %d\n", err); } else { @@ -2550,6 +2543,9 @@ static int process_vendor_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE, MLX5_CAP_GEN(mdev, qp_packet_based), qp); + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_BFREG_INDEX, true, qp); + process_vendor_flag(dev, &flags, MLX5_QP_FLAG_UAR_PAGE_INDEX, true, qp); + if (flags) mlx5_ib_dbg(dev, "udata has unsupported flags 0x%X\n", flags); -- cgit v1.2.3 From 03c4077b284056fdb144f84aaa0ac0c80023f597 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:24 +0300 Subject: RDMA/mlx5: Rely on existence of udata to separate kernel/user flows Instead of keeping special field to separate kernel/user create/destroy flows, rely on existence of udata pointer. All allocation flows are using kzalloc() and leave uninitialized pointers as NULL which makes MLX5_QP_EMPTY and MLX5_QP_KERNEL flows to be the same. Link: https://lore.kernel.org/r/20200427154636.381474-25-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 14 -------------- drivers/infiniband/hw/mlx5/qp.c | 23 ++++++++++------------- 2 files changed, 10 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 82ea01a211dd..df375cb4efbb 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -337,7 +337,6 @@ struct mlx5_ib_rwq { struct ib_umem *umem; size_t buf_size; unsigned int page_shift; - int create_type; struct mlx5_db db; u32 user_index; u32 wqe_count; @@ -346,17 +345,6 @@ struct mlx5_ib_rwq { u32 create_flags; /* Use enum mlx5_ib_wq_flags */ }; -enum { - MLX5_QP_USER, - MLX5_QP_KERNEL, - MLX5_QP_EMPTY -}; - -enum { - MLX5_WQ_USER, - MLX5_WQ_KERNEL -}; - struct mlx5_ib_rwq_ind_table { struct ib_rwq_ind_table ib_rwq_ind_tbl; u32 rqtn; @@ -457,8 +445,6 @@ struct mlx5_ib_qp { */ int bfregn; - int create_type; - struct list_head qps_list; struct list_head cq_recv_list; struct list_head cq_send_list; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 495f03905fbf..74f09cdb4a33 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -897,7 +897,6 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd, goto err_umem; } - rwq->create_type = MLX5_WQ_USER; return 0; err_umem: @@ -1022,7 +1021,6 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, mlx5_ib_dbg(dev, "copy failed\n"); goto err_unmap; } - qp->create_type = MLX5_QP_USER; return 0; @@ -1187,7 +1185,6 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, err = -ENOMEM; goto err_wrid; } - qp->create_type = MLX5_QP_KERNEL; return 0; @@ -1214,8 +1211,10 @@ static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) kvfree(qp->sq.wrid); kvfree(qp->sq.wr_data); kvfree(qp->rq.wrid); - mlx5_db_free(dev->mdev, &qp->db); - mlx5_frag_buf_free(dev->mdev, &qp->buf); + if (qp->db.db) + mlx5_db_free(dev->mdev, &qp->db); + if (qp->buf.frags) + mlx5_frag_buf_free(dev->mdev, &qp->buf); } static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) @@ -2000,8 +1999,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, in = kvzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; - - qp->create_type = MLX5_QP_EMPTY; } if (is_sqp(init_attr->qp_type)) @@ -2155,9 +2152,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return 0; err_create: - if (qp->create_type == MLX5_QP_USER) + if (udata) destroy_qp_user(dev, pd, qp, base, udata); - else if (qp->create_type == MLX5_QP_KERNEL) + else destroy_qp_kernel(dev, qp); err: @@ -2311,7 +2308,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (recv_cq) list_del(&qp->cq_recv_list); - if (qp->create_type == MLX5_QP_KERNEL) { + if (!udata) { __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL); if (send_cq != recv_cq) @@ -2331,10 +2328,10 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, base->mqp.qpn); } - if (qp->create_type == MLX5_QP_KERNEL) - destroy_qp_kernel(dev, qp); - else if (qp->create_type == MLX5_QP_USER) + if (udata) destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base, udata); + else + destroy_qp_kernel(dev, qp); } static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, -- cgit v1.2.3 From 0ce300b15aad7d3940d0625badcfad353041f5a7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:25 +0300 Subject: RDMA/mlx5: Delete impossible inlen check The inlen is set to be above zero in all flows before and can't be negative at this stage. Link: https://lore.kernel.org/r/20200427154636.381474-26-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 74f09cdb4a33..5a43128d651b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2107,11 +2107,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->flags &= ~IB_QP_CREATE_PCI_WRITE_END_PADDING; } - if (inlen < 0) { - err = -EINVAL; - goto err; - } - if (init_attr->qp_type == IB_QPT_RAW_PACKET || qp->flags & IB_QP_CREATE_SOURCE_QPN) { qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd->sq_buf_addr; @@ -2156,8 +2151,6 @@ err_create: destroy_qp_user(dev, pd, qp, base, udata); else destroy_qp_kernel(dev, qp); - -err: kvfree(in); return err; } -- cgit v1.2.3 From 21aad80b17e6d17adf99bf17482a5314bcb0aebb Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:26 +0300 Subject: RDMA/mlx5: Globally parse DEVX UID Remove duplication in parsing of DEVX UID. Link: https://lore.kernel.org/r/20200427154636.381474-27-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 51 ++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5a43128d651b..b2174e0817f5 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1916,18 +1916,16 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct mlx5_ib_create_qp *ucmd, - struct ib_udata *udata, struct mlx5_ib_qp *qp) + struct ib_udata *udata, struct mlx5_ib_qp *qp, + u32 uidx) { struct mlx5_ib_resources *devr = &dev->devr; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_create_qp_resp resp = {}; - struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( - udata, struct mlx5_ib_ucontext, ibucontext); struct mlx5_ib_cq *send_cq; struct mlx5_ib_cq *recv_cq; unsigned long flags; - u32 uidx = MLX5_IB_DEFAULT_UIDX; struct mlx5_ib_qp_base *base; int mlx5_st; void *qpc; @@ -1945,12 +1943,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; - if (udata) { - err = get_qp_user_index(ucontext, ucmd, udata->inlen, &uidx); - if (err) - return err; - } - if (qp->flags & IB_QP_CREATE_SOURCE_QPN) qp->underlay_qpn = init_attr->source_qpn; @@ -2329,18 +2321,10 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr, - struct mlx5_ib_create_qp *ucmd, struct ib_udata *udata) + struct mlx5_ib_create_qp *ucmd, u32 uidx) { - struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( - udata, struct mlx5_ib_ucontext, ibucontext); - int err = 0; - u32 uidx = MLX5_IB_DEFAULT_UIDX; void *dctc; - err = get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), &uidx); - if (err) - return err; - qp->dct.in = kzalloc(MLX5_ST_SZ_BYTES(create_dct_in), GFP_KERNEL); if (!qp->dct.in) return -ENOMEM; @@ -2651,14 +2635,14 @@ static size_t process_udata_size(struct ib_qp_init_attr *attr, static int create_raw_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr, void *ucmd, - struct ib_udata *udata) + struct ib_udata *udata, u32 uidx) { struct mlx5_ib_dev *dev = to_mdev(pd->device); if (attr->rwq_ind_tbl) return create_rss_raw_qp_tir(pd, qp, attr, ucmd, udata); - return create_qp_common(dev, pd, attr, ucmd, udata, qp); + return create_qp_common(dev, pd, attr, ucmd, udata, qp, uidx); } static int check_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, @@ -2688,10 +2672,24 @@ static int check_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, return ret; } +static int get_qp_uidx(struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct mlx5_ib_create_qp *ucmd, + struct ib_qp_init_attr *attr, u32 *uidx) +{ + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + + if (attr->rwq_ind_tbl) + return 0; + + return get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), uidx); +} + struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata) { + u32 uidx = MLX5_IB_DEFAULT_UIDX; struct mlx5_ib_dev *dev; struct mlx5_ib_qp *qp; enum ib_qp_type type; @@ -2743,6 +2741,10 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, err = process_vendor_flags(dev, qp, ucmd, init_attr); if (err) goto free_qp; + + err = get_qp_uidx(qp, udata, ucmd, init_attr, &uidx); + if (err) + goto free_qp; } err = process_create_flags(dev, qp, init_attr); if (err) @@ -2757,13 +2759,14 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, switch (qp->type) { case IB_QPT_RAW_PACKET: - err = create_raw_qp(pd, qp, init_attr, ucmd, udata); + err = create_raw_qp(pd, qp, init_attr, ucmd, udata, uidx); break; case MLX5_IB_QPT_DCT: - err = create_dct(pd, qp, init_attr, ucmd, udata); + err = create_dct(pd, qp, init_attr, ucmd, uidx); break; default: - err = create_qp_common(dev, pd, init_attr, ucmd, udata, qp); + err = create_qp_common(dev, pd, init_attr, ucmd, udata, qp, + uidx); } if (err) { mlx5_ib_dbg(dev, "create_qp_common failed\n"); -- cgit v1.2.3 From 04bcc1c2d0d7bfa0bffa5853d9a127fb4f4cd943 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:27 +0300 Subject: RDMA/mlx5: Separate XRC_TGT QP creation from common flow XRC_TGT QP doesn't fail into kernel or user flow separation. It is initiated by the user, but is created through in-kernel verbs flow and doesn't have PD and udata in similar way to kernel QPs. So let's separate creation of that QP type from the common flow. Link: https://lore.kernel.org/r/20200427154636.381474-28-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 158 +++++++++++++++++++++++++++------------- 1 file changed, 106 insertions(+), 52 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index b2174e0817f5..8890c172f7e5 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -991,8 +991,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, goto err_umem; } - uid = (attr->qp_type != IB_QPT_XRC_TGT && - attr->qp_type != IB_QPT_XRC_INI) ? to_mpd(pd)->uid : 0; + uid = (attr->qp_type != IB_QPT_XRC_INI) ? to_mpd(pd)->uid : 0; MLX5_SET(create_qp_in, *in, uid, uid); pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas); if (ubuffer->umem) @@ -1913,6 +1912,81 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, return atomic_mode; } +static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *attr, + struct mlx5_ib_qp *qp, struct ib_udata *udata, + u32 uidx) +{ + struct mlx5_ib_resources *devr = &dev->devr; + int inlen = MLX5_ST_SZ_BYTES(create_qp_in); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp_base *base; + unsigned long flags; + void *qpc; + u32 *in; + int err; + + mutex_init(&qp->mutex); + + if (attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_XRC); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, pd, to_mpd(devr->p0)->pdn); + + if (qp->flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + MLX5_SET(qpc, qpc, block_lb_mc, 1); + if (qp->flags & IB_QP_CREATE_CROSS_CHANNEL) + MLX5_SET(qpc, qpc, cd_master, 1); + if (qp->flags & IB_QP_CREATE_MANAGED_SEND) + MLX5_SET(qpc, qpc, cd_slave_send, 1); + if (qp->flags & IB_QP_CREATE_MANAGED_RECV) + MLX5_SET(qpc, qpc, cd_slave_receive, 1); + + MLX5_SET(qpc, qpc, rq_type, MLX5_SRQ_RQ); + MLX5_SET(qpc, qpc, no_sq, 1); + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(devr->c0)->mcq.cqn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn); + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(attr->xrcd)->xrcdn); + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); + + /* 0xffffff means we ask to work with cqe version 0 */ + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) + MLX5_SET(qpc, qpc, user_index, uidx); + + if (qp->flags & IB_QP_CREATE_PCI_WRITE_END_PADDING) { + MLX5_SET(qpc, qpc, end_padding_mode, + MLX5_WQ_END_PAD_MODE_ALIGN); + /* Special case to clean flag */ + qp->flags &= ~IB_QP_CREATE_PCI_WRITE_END_PADDING; + } + + base = &qp->trans_qp.base; + err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); + kvfree(in); + if (err) { + destroy_qp_user(dev, NULL, qp, base, udata); + return err; + } + + base->container_mibqp = qp; + base->mqp.event = mlx5_ib_qp_event; + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + list_add_tail(&qp->qps_list, &dev->qp_list); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + return 0; +} + static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct mlx5_ib_create_qp *ucmd, @@ -1958,40 +2032,30 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return err; } - if (pd) { - if (udata) { - __u32 max_wqes = - 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); - mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", - ucmd->sq_wqe_count); - if (ucmd->rq_wqe_shift != qp->rq.wqe_shift || - ucmd->rq_wqe_count != qp->rq.wqe_cnt) { - mlx5_ib_dbg(dev, "invalid rq params\n"); - return -EINVAL; - } - if (ucmd->sq_wqe_count > max_wqes) { - mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n", - ucmd->sq_wqe_count, max_wqes); - return -EINVAL; - } - err = create_user_qp(dev, pd, qp, udata, init_attr, &in, - &resp, &inlen, base, ucmd); - if (err) - mlx5_ib_dbg(dev, "err %d\n", err); - } else { - err = create_kernel_qp(dev, init_attr, qp, &in, &inlen, - base); - if (err) - mlx5_ib_dbg(dev, "err %d\n", err); + if (udata) { + __u32 max_wqes = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + + mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", + ucmd->sq_wqe_count); + if (ucmd->rq_wqe_shift != qp->rq.wqe_shift || + ucmd->rq_wqe_count != qp->rq.wqe_cnt) { + mlx5_ib_dbg(dev, "invalid rq params\n"); + return -EINVAL; + } + if (ucmd->sq_wqe_count > max_wqes) { + mlx5_ib_dbg( + dev, + "requested sq_wqe_count (%d) > max allowed (%d)\n", + ucmd->sq_wqe_count, max_wqes); + return -EINVAL; } + err = create_user_qp(dev, pd, qp, udata, init_attr, &in, &resp, + &inlen, base, ucmd); + } else + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen, base); - if (err) - return err; - } else { - in = kvzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; - } + if (err) + return err; if (is_sqp(init_attr->qp_type)) qp->port = init_attr->port_num; @@ -2054,12 +2118,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, /* Set default resources */ switch (init_attr->qp_type) { - case IB_QPT_XRC_TGT: - MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); - MLX5_SET(qpc, qpc, cqn_snd, to_mcq(devr->c0)->mcq.cqn); - MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn); - MLX5_SET(qpc, qpc, xrcd, to_mxrcd(init_attr->xrcd)->xrcdn); - break; case IB_QPT_XRC_INI: MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn); MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn); @@ -2105,16 +2163,12 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata, &resp); - } else { + } else err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); - } - - if (err) { - mlx5_ib_dbg(dev, "create qp failed\n"); - goto err_create; - } kvfree(in); + if (err) + goto err_create; base->container_mibqp = qp; base->mqp.event = mlx5_ib_qp_event; @@ -2143,7 +2197,6 @@ err_create: destroy_qp_user(dev, pd, qp, base, udata); else destroy_qp_kernel(dev, qp); - kvfree(in); return err; } @@ -2750,9 +2803,6 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, if (err) goto free_qp; - if (qp->type == IB_QPT_XRC_TGT) - xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; - err = check_qp_attr(dev, qp, init_attr); if (err) goto free_qp; @@ -2764,12 +2814,16 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, case MLX5_IB_QPT_DCT: err = create_dct(pd, qp, init_attr, ucmd, uidx); break; + case IB_QPT_XRC_TGT: + xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; + err = create_xrc_tgt_qp(dev, init_attr, qp, udata, uidx); + break; default: err = create_qp_common(dev, pd, init_attr, ucmd, udata, qp, uidx); } if (err) { - mlx5_ib_dbg(dev, "create_qp_common failed\n"); + mlx5_ib_dbg(dev, "create_qp failed %d\n", err); goto free_qp; } -- cgit v1.2.3 From 98fc1126c4161450f215254409e5539314b54a04 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:28 +0300 Subject: RDMA/mlx5: Separate to user/kernel create QP flows The kernel and user create QP flows have very little common code, separate them to simplify the future work of creating per-type create_*_qp() functions. Link: https://lore.kernel.org/r/20200427154636.381474-29-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 205 ++++++++++++++++++++++++++++++---------- 1 file changed, 156 insertions(+), 49 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8890c172f7e5..d5061001217e 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -911,12 +911,12 @@ static int adjust_bfregn(struct mlx5_ib_dev *dev, bfregn % MLX5_NON_FP_BFREGS_PER_UAR; } -static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, - struct mlx5_ib_qp *qp, struct ib_udata *udata, - struct ib_qp_init_attr *attr, u32 **in, - struct mlx5_ib_create_qp_resp *resp, int *inlen, - struct mlx5_ib_qp_base *base, - struct mlx5_ib_create_qp *ucmd) +static int _create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct ib_qp_init_attr *attr, u32 **in, + struct mlx5_ib_create_qp_resp *resp, int *inlen, + struct mlx5_ib_qp_base *base, + struct mlx5_ib_create_qp *ucmd) { struct mlx5_ib_ucontext *context; struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer; @@ -1083,11 +1083,10 @@ static void *get_sq_edge(struct mlx5_ib_wq *sq, u32 idx) return fragment_end + MLX5_SEND_WQE_BB; } -static int create_kernel_qp(struct mlx5_ib_dev *dev, - struct ib_qp_init_attr *init_attr, - struct mlx5_ib_qp *qp, - u32 **in, int *inlen, - struct mlx5_ib_qp_base *base) +static int _create_kernel_qp(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_qp *qp, u32 **in, int *inlen, + struct mlx5_ib_qp_base *base) { int uar_index; void *qpc; @@ -1987,11 +1986,11 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, return 0; } -static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, - struct ib_qp_init_attr *init_attr, - struct mlx5_ib_create_qp *ucmd, - struct ib_udata *udata, struct mlx5_ib_qp *qp, - u32 uidx) +static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_create_qp *ucmd, + struct ib_udata *udata, struct mlx5_ib_qp *qp, + u32 uidx) { struct mlx5_ib_resources *devr = &dev->devr; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); @@ -2032,28 +2031,15 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return err; } - if (udata) { - __u32 max_wqes = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz); + if (ucmd->rq_wqe_shift != qp->rq.wqe_shift || + ucmd->rq_wqe_count != qp->rq.wqe_cnt) + return -EINVAL; - mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", - ucmd->sq_wqe_count); - if (ucmd->rq_wqe_shift != qp->rq.wqe_shift || - ucmd->rq_wqe_count != qp->rq.wqe_cnt) { - mlx5_ib_dbg(dev, "invalid rq params\n"); - return -EINVAL; - } - if (ucmd->sq_wqe_count > max_wqes) { - mlx5_ib_dbg( - dev, - "requested sq_wqe_count (%d) > max allowed (%d)\n", - ucmd->sq_wqe_count, max_wqes); - return -EINVAL; - } - err = create_user_qp(dev, pd, qp, udata, init_attr, &in, &resp, - &inlen, base, ucmd); - } else - err = create_kernel_qp(dev, init_attr, qp, &in, &inlen, base); + if (ucmd->sq_wqe_count > (1 << MLX5_CAP_GEN(mdev, log_max_qp_sz))) + return -EINVAL; + err = _create_user_qp(dev, pd, qp, udata, init_attr, &in, &resp, &inlen, + base, ucmd); if (err) return err; @@ -2064,12 +2050,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_SET(qpc, qpc, st, mlx5_st); MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); - - if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR) - MLX5_SET(qpc, qpc, pd, to_mpd(pd ? pd : devr->p0)->pdn); - else - MLX5_SET(qpc, qpc, latency_sensitive, 1); - + MLX5_SET(qpc, qpc, pd, to_mpd(pd)->pdn); if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) MLX5_SET(qpc, qpc, wq_signature, 1); @@ -2145,10 +2126,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) MLX5_SET(qpc, qpc, user_index, uidx); - /* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */ - if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) - MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); - if (qp->flags & IB_QP_CREATE_PCI_WRITE_END_PADDING && init_attr->qp_type != IB_QPT_RAW_PACKET) { MLX5_SET(qpc, qpc, end_padding_mode, @@ -2200,6 +2177,133 @@ err_create: return err; } +static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct ib_qp_init_attr *attr, struct mlx5_ib_qp *qp, + u32 uidx) +{ + struct mlx5_ib_resources *devr = &dev->devr; + int inlen = MLX5_ST_SZ_BYTES(create_qp_in); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_cq *send_cq; + struct mlx5_ib_cq *recv_cq; + unsigned long flags; + struct mlx5_ib_qp_base *base; + int mlx5_st; + void *qpc; + u32 *in; + int err; + + mutex_init(&qp->mutex); + spin_lock_init(&qp->sq.lock); + spin_lock_init(&qp->rq.lock); + + mlx5_st = to_mlx5_st(qp->type); + if (mlx5_st < 0) + return -EINVAL; + + if (attr->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + + base = &qp->trans_qp.base; + + qp->has_rq = qp_has_rq(attr); + err = set_rq_size(dev, &attr->cap, qp->has_rq, qp, NULL); + if (err) { + mlx5_ib_dbg(dev, "err %d\n", err); + return err; + } + + err = _create_kernel_qp(dev, attr, qp, &in, &inlen, base); + if (err) + return err; + + if (is_sqp(attr->qp_type)) + qp->port = attr->port_num; + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + + MLX5_SET(qpc, qpc, st, mlx5_st); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + + if (attr->qp_type != MLX5_IB_QPT_REG_UMR) + MLX5_SET(qpc, qpc, pd, to_mpd(pd ? pd : devr->p0)->pdn); + else + MLX5_SET(qpc, qpc, latency_sensitive, 1); + + + if (qp->flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) + MLX5_SET(qpc, qpc, block_lb_mc, 1); + + if (qp->rq.wqe_cnt) { + MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4); + MLX5_SET(qpc, qpc, log_rq_size, ilog2(qp->rq.wqe_cnt)); + } + + MLX5_SET(qpc, qpc, rq_type, get_rx_type(qp, attr)); + + if (qp->sq.wqe_cnt) + MLX5_SET(qpc, qpc, log_sq_size, ilog2(qp->sq.wqe_cnt)); + else + MLX5_SET(qpc, qpc, no_sq, 1); + + if (attr->srq) { + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x0)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, + to_msrq(attr->srq)->msrq.srqn); + } else { + MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn); + MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, + to_msrq(devr->s1)->msrq.srqn); + } + + if (attr->send_cq) + MLX5_SET(qpc, qpc, cqn_snd, to_mcq(attr->send_cq)->mcq.cqn); + + if (attr->recv_cq) + MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(attr->recv_cq)->mcq.cqn); + + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); + + /* 0xffffff means we ask to work with cqe version 0 */ + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) + MLX5_SET(qpc, qpc, user_index, uidx); + + /* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */ + if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) + MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); + + err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); + kvfree(in); + if (err) + goto err_create; + + base->container_mibqp = qp; + base->mqp.event = mlx5_ib_qp_event; + + get_cqs(qp->type, attr->send_cq, attr->recv_cq, + &send_cq, &recv_cq); + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx5_ib_lock_cqs(send_cq, recv_cq); + /* Maintain device to QPs access, needed for further handling via reset + * flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling via reset flow + */ + if (send_cq) + list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp); + if (recv_cq) + list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp); + mlx5_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + return 0; + +err_create: + destroy_qp_kernel(dev, qp); + return err; +} + static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq) __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { @@ -2695,7 +2799,7 @@ static int create_raw_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, if (attr->rwq_ind_tbl) return create_rss_raw_qp_tir(pd, qp, attr, ucmd, udata); - return create_qp_common(dev, pd, attr, ucmd, udata, qp, uidx); + return create_user_qp(dev, pd, attr, ucmd, udata, qp, uidx); } static int check_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, @@ -2819,8 +2923,11 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, err = create_xrc_tgt_qp(dev, init_attr, qp, udata, uidx); break; default: - err = create_qp_common(dev, pd, init_attr, ucmd, udata, qp, - uidx); + if (udata) + err = create_user_qp(dev, pd, init_attr, ucmd, udata, + qp, uidx); + else + err = create_kernel_qp(dev, pd, init_attr, qp, uidx); } if (err) { mlx5_ib_dbg(dev, "create_qp failed %d\n", err); -- cgit v1.2.3 From 747c519cdbe4a3f6a616d50c19bcb97413abe384 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:29 +0300 Subject: RDMA/mlx5: Reduce amount of duplication in QP destroy Delete both PD argument and checks if udata was provided, in favour of unified destroy QP functions. Link: https://lore.kernel.org/r/20200427154636.381474-30-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 70 ++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index d5061001217e..6b390b0e43af 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1038,25 +1038,36 @@ err_bfreg: return err; } -static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd, - struct mlx5_ib_qp *qp, struct mlx5_ib_qp_base *base, - struct ib_udata *udata) +static void destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct mlx5_ib_qp_base *base, struct ib_udata *udata) { - struct mlx5_ib_ucontext *context = - rdma_udata_to_drv_context( - udata, - struct mlx5_ib_ucontext, - ibucontext); + struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); - mlx5_ib_db_unmap_user(context, &qp->db); - ib_umem_release(base->ubuffer.umem); + if (udata) { + /* User QP */ + mlx5_ib_db_unmap_user(context, &qp->db); + ib_umem_release(base->ubuffer.umem); - /* - * Free only the BFREGs which are handled by the kernel. - * BFREGs of UARs allocated dynamically are handled by user. - */ - if (qp->bfregn != MLX5_IB_INVALID_BFREG) - mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn); + /* + * Free only the BFREGs which are handled by the kernel. + * BFREGs of UARs allocated dynamically are handled by user. + */ + if (qp->bfregn != MLX5_IB_INVALID_BFREG) + mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn); + return; + } + + /* Kernel QP */ + kvfree(qp->sq.wqe_head); + kvfree(qp->sq.w_list); + kvfree(qp->sq.wrid); + kvfree(qp->sq.wr_data); + kvfree(qp->rq.wrid); + if (qp->db.db) + mlx5_db_free(dev->mdev, &qp->db); + if (qp->buf.frags) + mlx5_frag_buf_free(dev->mdev, &qp->buf); } /* get_sq_edge - Get the next nearby edge. @@ -1202,19 +1213,6 @@ err_buf: return err; } -static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) -{ - kvfree(qp->sq.wqe_head); - kvfree(qp->sq.w_list); - kvfree(qp->sq.wrid); - kvfree(qp->sq.wr_data); - kvfree(qp->rq.wrid); - if (qp->db.db) - mlx5_db_free(dev->mdev, &qp->db); - if (qp->buf.frags) - mlx5_frag_buf_free(dev->mdev, &qp->buf); -} - static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) { if (attr->srq || (qp->type == IB_QPT_XRC_TGT) || @@ -1972,7 +1970,7 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); kvfree(in); if (err) { - destroy_qp_user(dev, NULL, qp, base, udata); + destroy_qp(dev, qp, base, udata); return err; } @@ -2170,10 +2168,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, return 0; err_create: - if (udata) - destroy_qp_user(dev, pd, qp, base, udata); - else - destroy_qp_kernel(dev, qp); + destroy_qp(dev, qp, base, udata); return err; } @@ -2300,7 +2295,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, return 0; err_create: - destroy_qp_kernel(dev, qp); + destroy_qp(dev, qp, base, NULL); return err; } @@ -2470,10 +2465,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, base->mqp.qpn); } - if (udata) - destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base, udata); - else - destroy_qp_kernel(dev, qp); + destroy_qp(dev, qp, base, udata); } static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, -- cgit v1.2.3 From f78d358cec9088ed77b5129c44f858cdfdb1e8c9 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:30 +0300 Subject: RDMA/mlx5: Group all create QP parameters to simplify in-kernel interfaces The amount of parameters passed in and out between internal mlx5 create QP functions is too large to easily follow the flow. Change it by grouping all create QP parameter into one structure. Link: https://lore.kernel.org/r/20200427154636.381474-31-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 148 ++++++++++++++++++++++------------------ 1 file changed, 81 insertions(+), 67 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 6b390b0e43af..3807e1687cb2 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1610,14 +1610,24 @@ static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *q to_mpd(qp->ibqp.pd)->uid); } -static int create_rss_raw_qp_tir(struct ib_pd *pd, struct mlx5_ib_qp *qp, - struct ib_qp_init_attr *init_attr, - struct mlx5_ib_create_qp_rss *ucmd, - struct ib_udata *udata) +struct mlx5_create_qp_params { + struct ib_udata *udata; + size_t inlen; + void *ucmd; + u8 is_rss_raw : 1; + struct ib_qp_init_attr *attr; + u32 uidx; +}; + +static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) { + struct ib_qp_init_attr *init_attr = params->attr; + struct mlx5_ib_create_qp_rss *ucmd = params->ucmd; + struct ib_udata *udata = params->udata; struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context( udata, struct mlx5_ib_ucontext, ibucontext); - struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_create_qp_resp resp = {}; int inlen; int outlen; @@ -1632,7 +1642,8 @@ static int create_rss_raw_qp_tir(struct ib_pd *pd, struct mlx5_ib_qp *qp, u32 tdn = mucontext->tdn; u8 lb_flag = 0; - min_resp_len = offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index); + min_resp_len = + offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index); if (udata->outlen < min_resp_len) return -EINVAL; @@ -1909,11 +1920,12 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, return atomic_mode; } -static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, - struct ib_qp_init_attr *attr, - struct mlx5_ib_qp *qp, struct ib_udata *udata, - u32 uidx) +static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) { + struct ib_qp_init_attr *attr = params->attr; + struct ib_udata *udata = params->udata; + u32 uidx = params->uidx; struct mlx5_ib_resources *devr = &dev->devr; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; @@ -1985,11 +1997,13 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, } static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, - struct ib_qp_init_attr *init_attr, - struct mlx5_ib_create_qp *ucmd, - struct ib_udata *udata, struct mlx5_ib_qp *qp, - u32 uidx) + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) { + struct ib_qp_init_attr *init_attr = params->attr; + struct mlx5_ib_create_qp *ucmd = params->ucmd; + struct ib_udata *udata = params->udata; + u32 uidx = params->uidx; struct mlx5_ib_resources *devr = &dev->devr; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; @@ -2173,9 +2187,11 @@ err_create: } static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, - struct ib_qp_init_attr *attr, struct mlx5_ib_qp *qp, - u32 uidx) + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) { + struct ib_qp_init_attr *attr = params->attr; + u32 uidx = params->uidx; struct mlx5_ib_resources *devr = &dev->devr; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; @@ -2469,9 +2485,11 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, } static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, - struct ib_qp_init_attr *attr, - struct mlx5_ib_create_qp *ucmd, u32 uidx) + struct mlx5_create_qp_params *params) { + struct ib_qp_init_attr *attr = params->attr; + struct mlx5_ib_create_qp *ucmd = params->ucmd; + u32 uidx = params->uidx; void *dctc; qp->dct.in = kzalloc(MLX5_ST_SZ_BYTES(create_dct_in), GFP_KERNEL); @@ -2782,16 +2800,14 @@ static size_t process_udata_size(struct ib_qp_init_attr *attr, return min(ucmd, inlen); } -static int create_raw_qp(struct ib_pd *pd, struct mlx5_ib_qp *qp, - struct ib_qp_init_attr *attr, void *ucmd, - struct ib_udata *udata, u32 uidx) +static int create_raw_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) { - struct mlx5_ib_dev *dev = to_mdev(pd->device); + if (params->is_rss_raw) + return create_rss_raw_qp_tir(dev, pd, qp, params); - if (attr->rwq_ind_tbl) - return create_rss_raw_qp_tir(pd, qp, attr, ucmd, udata); - - return create_user_qp(dev, pd, attr, ucmd, udata, qp, uidx); + return create_user_qp(dev, pd, qp, params); } static int check_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, @@ -2821,60 +2837,59 @@ static int check_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, return ret; } -static int get_qp_uidx(struct mlx5_ib_qp *qp, struct ib_udata *udata, - struct mlx5_ib_create_qp *ucmd, - struct ib_qp_init_attr *attr, u32 *uidx) +static int get_qp_uidx(struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) { + struct mlx5_ib_create_qp *ucmd = params->ucmd; + struct ib_udata *udata = params->udata; struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( udata, struct mlx5_ib_ucontext, ibucontext); - if (attr->rwq_ind_tbl) + if (params->is_rss_raw) return 0; - return get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), uidx); + return get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), ¶ms->uidx); } -struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, - struct ib_qp_init_attr *init_attr, +struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, struct ib_udata *udata) { - u32 uidx = MLX5_IB_DEFAULT_UIDX; + struct mlx5_create_qp_params params = {}; struct mlx5_ib_dev *dev; struct mlx5_ib_qp *qp; enum ib_qp_type type; - void *ucmd = NULL; u16 xrcdn = 0; int err; dev = pd ? to_mdev(pd->device) : - to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); + to_mdev(to_mxrcd(attr->xrcd)->ibxrcd.device); - err = check_qp_type(dev, init_attr, &type); - if (err) { - mlx5_ib_dbg(dev, "Unsupported QP type %d\n", - init_attr->qp_type); + err = check_qp_type(dev, attr, &type); + if (err) return ERR_PTR(err); - } - err = check_valid_flow(dev, pd, init_attr, udata); + err = check_valid_flow(dev, pd, attr, udata); if (err) return ERR_PTR(err); - if (init_attr->qp_type == IB_QPT_GSI) - return mlx5_ib_gsi_create_qp(pd, init_attr); + if (attr->qp_type == IB_QPT_GSI) + return mlx5_ib_gsi_create_qp(pd, attr); - if (udata) { - size_t inlen = - process_udata_size(init_attr, udata); + params.udata = udata; + params.uidx = MLX5_IB_DEFAULT_UIDX; + params.attr = attr; + params.is_rss_raw = !!attr->rwq_ind_tbl; - if (!inlen) + if (udata) { + params.inlen = process_udata_size(attr, udata); + if (!params.inlen) return ERR_PTR(-EINVAL); - ucmd = kzalloc(inlen, GFP_KERNEL); - if (!ucmd) + params.ucmd = kzalloc(params.inlen, GFP_KERNEL); + if (!params.ucmd) return ERR_PTR(-ENOMEM); - err = ib_copy_from_udata(ucmd, udata, inlen); + err = ib_copy_from_udata(params.ucmd, udata, params.inlen); if (err) goto free_ucmd; } @@ -2887,50 +2902,49 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, qp->type = type; if (udata) { - err = process_vendor_flags(dev, qp, ucmd, init_attr); + err = process_vendor_flags(dev, qp, params.ucmd, attr); if (err) goto free_qp; - err = get_qp_uidx(qp, udata, ucmd, init_attr, &uidx); + err = get_qp_uidx(qp, ¶ms); if (err) goto free_qp; } - err = process_create_flags(dev, qp, init_attr); + err = process_create_flags(dev, qp, attr); if (err) goto free_qp; - err = check_qp_attr(dev, qp, init_attr); + err = check_qp_attr(dev, qp, attr); if (err) goto free_qp; switch (qp->type) { case IB_QPT_RAW_PACKET: - err = create_raw_qp(pd, qp, init_attr, ucmd, udata, uidx); + err = create_raw_qp(dev, pd, qp, ¶ms); break; case MLX5_IB_QPT_DCT: - err = create_dct(pd, qp, init_attr, ucmd, uidx); + err = create_dct(pd, qp, ¶ms); break; case IB_QPT_XRC_TGT: - xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn; - err = create_xrc_tgt_qp(dev, init_attr, qp, udata, uidx); + xrcdn = to_mxrcd(attr->xrcd)->xrcdn; + err = create_xrc_tgt_qp(dev, qp, ¶ms); break; default: if (udata) - err = create_user_qp(dev, pd, init_attr, ucmd, udata, - qp, uidx); + err = create_user_qp(dev, pd, qp, ¶ms); else - err = create_kernel_qp(dev, pd, init_attr, qp, uidx); + err = create_kernel_qp(dev, pd, qp, ¶ms); } if (err) { - mlx5_ib_dbg(dev, "create_qp failed %d\n", err); + mlx5_ib_err(dev, "create_qp failed %d\n", err); goto free_qp; } - kfree(ucmd); + kfree(params.ucmd); - if (is_qp0(init_attr->qp_type)) + if (is_qp0(attr->qp_type)) qp->ibqp.qp_num = 0; - else if (is_qp1(init_attr->qp_type)) + else if (is_qp1(attr->qp_type)) qp->ibqp.qp_num = 1; else qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; @@ -2942,7 +2956,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, free_qp: kfree(qp); free_ucmd: - kfree(ucmd); + kfree(params.ucmd); return ERR_PTR(err); } -- cgit v1.2.3 From 5d6fffed1cfd0c368a9089acb9fcc902c649c31c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:31 +0300 Subject: RDMA/mlx5: Promote RSS RAW QP flags check to higher level Move check that user didn't supplied RSS RAW QP unsupported command flags to the function that checks all such flags. Link: https://lore.kernel.org/r/20200427154636.381474-32-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 3807e1687cb2..8daa8bc6b9c7 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1652,13 +1652,6 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct ib_pd *pd, return -EOPNOTSUPP; } - if (ucmd->flags & ~(MLX5_QP_FLAG_TUNNEL_OFFLOADS | - MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | - MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) { - mlx5_ib_dbg(dev, "invalid flags\n"); - return -EOPNOTSUPP; - } - if (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_INNER && !(ucmd->flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)) { mlx5_ib_dbg(dev, "Tunnel offloads must be set for inner RSS\n"); @@ -2687,11 +2680,20 @@ static int process_vendor_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, process_vendor_flag(dev, &flags, MLX5_QP_FLAG_BFREG_INDEX, true, qp); process_vendor_flag(dev, &flags, MLX5_QP_FLAG_UAR_PAGE_INDEX, true, qp); + cond = qp->flags_en & ~(MLX5_QP_FLAG_TUNNEL_OFFLOADS | + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC); + if (attr->rwq_ind_tbl && cond) { + mlx5_ib_dbg(dev, "RSS RAW QP has unsupported flags 0x%X\n", + cond); + return -EINVAL; + } + if (flags) mlx5_ib_dbg(dev, "udata has unsupported flags 0x%X\n", flags); return (flags) ? -EINVAL : 0; -} + } static void process_create_flag(struct mlx5_ib_dev *dev, int *flags, int flag, bool cond, struct mlx5_ib_qp *qp) -- cgit v1.2.3 From 6f2cf76e6ec7885de116cfc9c08057f2f7873629 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:32 +0300 Subject: RDMA/mlx5: Handle udate outlen checks in one place Place in one function all udata size checks. This will allow us move ib_copy_to_udata() in general place and ensure that it will be performed after call to the FW. Link: https://lore.kernel.org/r/20200427154636.381474-33-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 48 +++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8daa8bc6b9c7..0d06706e6ce1 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1613,6 +1613,7 @@ static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *q struct mlx5_create_qp_params { struct ib_udata *udata; size_t inlen; + size_t outlen; void *ucmd; u8 is_rss_raw : 1; struct ib_qp_init_attr *attr; @@ -1638,15 +1639,9 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct ib_pd *pd, void *hfso; u32 selected_fields = 0; u32 outer_l4; - size_t min_resp_len; u32 tdn = mucontext->tdn; u8 lb_flag = 0; - min_resp_len = - offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index); - if (udata->outlen < min_resp_len) - return -EINVAL; - if (ucmd->comp_mask) { mlx5_ib_dbg(dev, "invalid comp mask\n"); return -EOPNOTSUPP; @@ -2780,26 +2775,43 @@ static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, return (create_flags) ? -EINVAL : 0; } -static size_t process_udata_size(struct ib_qp_init_attr *attr, - struct ib_udata *udata) +static int process_udata_size(struct mlx5_ib_dev *dev, + struct mlx5_create_qp_params *params) { size_t ucmd = sizeof(struct mlx5_ib_create_qp); + struct ib_qp_init_attr *attr = params->attr; + struct ib_udata *udata = params->udata; + size_t outlen = udata->outlen; size_t inlen = udata->inlen; - if (attr->qp_type == IB_QPT_DRIVER) - return (inlen < ucmd) ? 0 : ucmd; + params->outlen = min(outlen, sizeof(struct mlx5_ib_create_qp_resp)); + if (attr->qp_type == IB_QPT_DRIVER) { + params->inlen = (inlen < ucmd) ? 0 : ucmd; + goto out; + } - if (!attr->rwq_ind_tbl) - return ucmd; + if (!params->is_rss_raw) { + params->inlen = ucmd; + goto out; + } + /* RSS RAW QP */ if (inlen < offsetofend(struct mlx5_ib_create_qp_rss, flags)) - return 0; + return -EINVAL; + + if (outlen < offsetofend(struct mlx5_ib_create_qp_resp, bfreg_index)) + return -EINVAL; ucmd = sizeof(struct mlx5_ib_create_qp_rss); if (inlen > ucmd && !ib_is_udata_cleared(udata, ucmd, inlen - ucmd)) - return 0; + return -EINVAL; + + params->inlen = min(ucmd, inlen); +out: + if (!params->inlen) + mlx5_ib_dbg(dev, "udata is too small or not cleared\n"); - return min(ucmd, inlen); + return (params->inlen) ? 0 : -EINVAL; } static int create_raw_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, @@ -2883,9 +2895,9 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, params.is_rss_raw = !!attr->rwq_ind_tbl; if (udata) { - params.inlen = process_udata_size(attr, udata); - if (!params.inlen) - return ERR_PTR(-EINVAL); + err = process_udata_size(dev, ¶ms); + if (err) + return ERR_PTR(err); params.ucmd = kzalloc(params.inlen, GFP_KERNEL); if (!params.ucmd) -- cgit v1.2.3 From 08d53976609aec17f28872423d3a3e86ad1a3ec4 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:33 +0300 Subject: RDMA/mlx5: Copy response to the user in one place Update all the places in create QP flows to copy response to the user in one place. Link: https://lore.kernel.org/r/20200427154636.381474-34-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 113 ++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 61 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 0d06706e6ce1..9ca742189281 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1015,17 +1015,8 @@ static int _create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, goto err_free; } - err = ib_copy_to_udata(udata, resp, min(udata->outlen, sizeof(*resp))); - if (err) { - mlx5_ib_dbg(dev, "copy failed\n"); - goto err_unmap; - } - return 0; -err_unmap: - mlx5_ib_db_unmap_user(context, &qp->db); - err_free: kvfree(*in); @@ -1551,14 +1542,8 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? sq->base.mqp.qpn : rq->base.mqp.qpn; - err = ib_copy_to_udata(udata, resp, min(udata->outlen, sizeof(*resp))); - if (err) - goto err_destroy_tir; - return 0; -err_destroy_tir: - destroy_raw_packet_qp_tir(dev, rq, qp->flags_en, pd); err_destroy_rq: destroy_raw_packet_qp_rq(dev, rq); err_destroy_sq: @@ -1618,6 +1603,7 @@ struct mlx5_create_qp_params { u8 is_rss_raw : 1; struct ib_qp_init_attr *attr; u32 uidx; + struct mlx5_ib_create_qp_resp resp; }; static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct ib_pd *pd, @@ -1629,7 +1615,6 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_udata *udata = params->udata; struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context( udata, struct mlx5_ib_ucontext, ibucontext); - struct mlx5_ib_create_qp_resp resp = {}; int inlen; int outlen; int err; @@ -1662,12 +1647,6 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (qp->flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST; - err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); - if (err) { - mlx5_ib_dbg(dev, "copy failed\n"); - return -EINVAL; - } - inlen = MLX5_ST_SZ_BYTES(create_tir_in); outlen = MLX5_ST_SZ_BYTES(create_tir_out); in = kvzalloc(inlen + outlen, GFP_KERNEL); @@ -1803,34 +1782,30 @@ create_tir: goto err; if (mucontext->devx_uid) { - resp.comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN; - resp.tirn = qp->rss_qp.tirn; + params->resp.comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN; + params->resp.tirn = qp->rss_qp.tirn; if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner)) { - resp.tir_icm_addr = + params->resp.tir_icm_addr = MLX5_GET(create_tir_out, out, icm_address_31_0); - resp.tir_icm_addr |= (u64)MLX5_GET(create_tir_out, out, - icm_address_39_32) - << 32; - resp.tir_icm_addr |= (u64)MLX5_GET(create_tir_out, out, - icm_address_63_40) - << 40; - resp.comp_mask |= + params->resp.tir_icm_addr |= + (u64)MLX5_GET(create_tir_out, out, + icm_address_39_32) + << 32; + params->resp.tir_icm_addr |= + (u64)MLX5_GET(create_tir_out, out, + icm_address_63_40) + << 40; + params->resp.comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIR_ICM_ADDR; } } - err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); - if (err) - goto err_copy; - kvfree(in); /* qpn is reserved for that QP */ qp->trans_qp.base.mqp.qpn = 0; qp->is_rss = true; return 0; -err_copy: - mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn, mucontext->devx_uid); err: kvfree(in); return err; @@ -1995,7 +1970,6 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_resources *devr = &dev->devr; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_ib_create_qp_resp resp = {}; struct mlx5_ib_cq *send_cq; struct mlx5_ib_cq *recv_cq; unsigned long flags; @@ -2038,8 +2012,8 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (ucmd->sq_wqe_count > (1 << MLX5_CAP_GEN(mdev, log_max_qp_sz))) return -EINVAL; - err = _create_user_qp(dev, pd, qp, udata, init_attr, &in, &resp, &inlen, - base, ucmd); + err = _create_user_qp(dev, pd, qp, udata, init_attr, &in, ¶ms->resp, + &inlen, base, ucmd); if (err) return err; @@ -2139,7 +2113,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd->sq_buf_addr; raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata, - &resp); + ¶ms->resp); } else err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); @@ -2865,6 +2839,25 @@ static int get_qp_uidx(struct mlx5_ib_qp *qp, return get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), ¶ms->uidx); } +static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) +{ + struct mlx5_ib_dev *dev = to_mdev(mqp->ibqp.device); + + if (mqp->state == IB_QPS_RTR) { + int err; + + err = mlx5_core_destroy_dct(dev, &mqp->dct.mdct); + if (err) { + mlx5_ib_warn(dev, "failed to destroy DCT %d\n", err); + return err; + } + } + + kfree(mqp->dct.in); + kfree(mqp); + return 0; +} + struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, struct ib_udata *udata) { @@ -2955,6 +2948,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, } kfree(params.ucmd); + params.ucmd = NULL; if (is_qp0(attr->qp_type)) qp->ibqp.qp_num = 0; @@ -2965,8 +2959,24 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, qp->trans_qp.xrcdn = xrcdn; + if (udata) + /* + * It is safe to copy response for all user create QP flows, + * including MLX5_IB_QPT_DCT, which doesn't need it. + * In that case, resp will be filled with zeros. + */ + err = ib_copy_to_udata(udata, ¶ms.resp, params.outlen); + if (err) + goto destroy_qp; + return &qp->ibqp; +destroy_qp: + if (qp->type == MLX5_IB_QPT_DCT) + mlx5_ib_destroy_dct(qp); + else + destroy_qp_common(dev, qp, udata); + qp = NULL; free_qp: kfree(qp); free_ucmd: @@ -2974,25 +2984,6 @@ free_ucmd: return ERR_PTR(err); } -static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) -{ - struct mlx5_ib_dev *dev = to_mdev(mqp->ibqp.device); - - if (mqp->state == IB_QPS_RTR) { - int err; - - err = mlx5_core_destroy_dct(dev, &mqp->dct.mdct); - if (err) { - mlx5_ib_warn(dev, "failed to destroy DCT %d\n", err); - return err; - } - } - - kfree(mqp->dct.in); - kfree(mqp); - return 0; -} - int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(qp->device); -- cgit v1.2.3 From 6367da46d3cb03ff717457875bd01dda7b02a1ff Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:34 +0300 Subject: RDMA/mlx5: Remove redundant destroy QP call After major refactoring in create QP flow, it is no needed to call to destroy QP in XRC_TGT flow. Link: https://lore.kernel.org/r/20200427154636.381474-35-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 9ca742189281..d7983a951e8d 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1887,7 +1887,6 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_create_qp_params *params) { struct ib_qp_init_attr *attr = params->attr; - struct ib_udata *udata = params->udata; u32 uidx = params->uidx; struct mlx5_ib_resources *devr = &dev->devr; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); @@ -1944,10 +1943,8 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, base = &qp->trans_qp.base; err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); kvfree(in); - if (err) { - destroy_qp(dev, qp, base, udata); + if (err) return err; - } base->container_mibqp = qp; base->mqp.event = mlx5_ib_qp_event; -- cgit v1.2.3 From 968f0b6f9c01bdf772a4c04ee1fe08971d65af14 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 27 Apr 2020 18:46:35 +0300 Subject: RDMA/mlx5: Consolidate into special function all create QP calls Finish separation to blocks of mlx5_ib_create_qp() functions, so all internal create QP implementation are located in one place. Link: https://lore.kernel.org/r/20200427154636.381474-36-leon@kernel.org Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 85 ++++++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 36 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index d7983a951e8d..18c0a25da47a 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1953,6 +1953,7 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, list_add_tail(&qp->qps_list, &dev->qp_list); spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + qp->trans_qp.xrcdn = to_mxrcd(attr->xrcd)->xrcdn; return 0; } @@ -2785,14 +2786,54 @@ out: return (params->inlen) ? 0 : -EINVAL; } -static int create_raw_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, - struct mlx5_ib_qp *qp, - struct mlx5_create_qp_params *params) +static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, + struct mlx5_create_qp_params *params) { - if (params->is_rss_raw) - return create_rss_raw_qp_tir(dev, pd, qp, params); + int err; + + if (params->is_rss_raw) { + err = create_rss_raw_qp_tir(dev, pd, qp, params); + goto out; + } + + if (qp->type == MLX5_IB_QPT_DCT) { + err = create_dct(pd, qp, params); + goto out; + } + + if (qp->type == IB_QPT_XRC_TGT) { + err = create_xrc_tgt_qp(dev, qp, params); + goto out; + } + + if (params->udata) + err = create_user_qp(dev, pd, qp, params); + else + err = create_kernel_qp(dev, pd, qp, params); + +out: + if (err) { + mlx5_ib_err(dev, "Create QP type %d failed\n", qp->type); + return err; + } + + if (is_qp0(qp->type)) + qp->ibqp.qp_num = 0; + else if (is_qp1(qp->type)) + qp->ibqp.qp_num = 1; + else + qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; + + mlx5_ib_dbg(dev, + "QP type %d, ib qpn 0x%X, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", + qp->type, qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn, + params->attr->recv_cq ? to_mcq(params->attr->recv_cq)->mcq.cqn : + -1, + params->attr->send_cq ? to_mcq(params->attr->send_cq)->mcq.cqn : + -1); - return create_user_qp(dev, pd, qp, params); + return 0; } static int check_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, @@ -2862,7 +2903,6 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, struct mlx5_ib_dev *dev; struct mlx5_ib_qp *qp; enum ib_qp_type type; - u16 xrcdn = 0; int err; dev = pd ? to_mdev(pd->device) : @@ -2922,40 +2962,13 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, if (err) goto free_qp; - switch (qp->type) { - case IB_QPT_RAW_PACKET: - err = create_raw_qp(dev, pd, qp, ¶ms); - break; - case MLX5_IB_QPT_DCT: - err = create_dct(pd, qp, ¶ms); - break; - case IB_QPT_XRC_TGT: - xrcdn = to_mxrcd(attr->xrcd)->xrcdn; - err = create_xrc_tgt_qp(dev, qp, ¶ms); - break; - default: - if (udata) - err = create_user_qp(dev, pd, qp, ¶ms); - else - err = create_kernel_qp(dev, pd, qp, ¶ms); - } - if (err) { - mlx5_ib_err(dev, "create_qp failed %d\n", err); + err = create_qp(dev, pd, qp, ¶ms); + if (err) goto free_qp; - } kfree(params.ucmd); params.ucmd = NULL; - if (is_qp0(attr->qp_type)) - qp->ibqp.qp_num = 0; - else if (is_qp1(attr->qp_type)) - qp->ibqp.qp_num = 1; - else - qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; - - qp->trans_qp.xrcdn = xrcdn; - if (udata) /* * It is safe to copy response for all user create QP flows, -- cgit v1.2.3 From 0eacc574aae7300bf46c10c7116c3ba5825505b7 Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Mon, 27 Apr 2020 18:46:36 +0300 Subject: RDMA/mlx5: Verify that QP is created with RQ or SQ RAW packet QP and underlay QP must be created with either RQ or SQ, check that. Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Link: https://lore.kernel.org/r/20200427154636.381474-37-leon@kernel.org Signed-off-by: Aharon Landau Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 18c0a25da47a..14f4f0982e4e 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1482,6 +1482,8 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, u16 uid = to_mpd(pd)->uid; u32 out[MLX5_ST_SZ_DW(create_tir_out)] = {}; + if (!qp->sq.wqe_cnt && !qp->rq.wqe_cnt) + return -EINVAL; if (qp->sq.wqe_cnt) { err = create_raw_packet_qp_tis(dev, qp, sq, tdn, pd); if (err) -- cgit v1.2.3 From fa5d010c5630b143b802e0477e87bba0656829cf Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:42 +0300 Subject: RDMA: Group create AH arguments in struct Following patch adds additional argument to the create AH function, so it make sense to group ah_attr and flags arguments in struct. Link: https://lore.kernel.org/r/20200430192146.12863-13-maorg@mellanox.com Signed-off-by: Maor Gottlieb Acked-by: Devesh Sharma Acked-by: Gal Pressman Acked-by: Weihang Li Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 5 ++++- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 8 +++++--- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 2 +- drivers/infiniband/hw/efa/efa.h | 3 +-- drivers/infiniband/hw/efa/efa_verbs.c | 6 +++--- drivers/infiniband/hw/hns/hns_roce_ah.c | 5 +++-- drivers/infiniband/hw/hns/hns_roce_device.h | 4 ++-- drivers/infiniband/hw/mlx4/ah.c | 11 +++++++---- drivers/infiniband/hw/mlx4/mlx4_ib.h | 2 +- drivers/infiniband/hw/mlx5/ah.c | 5 +++-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mthca/mthca_provider.c | 9 +++++---- drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 3 ++- drivers/infiniband/hw/ocrdma/ocrdma_ah.h | 2 +- drivers/infiniband/hw/qedr/verbs.c | 4 ++-- drivers/infiniband/hw/qedr/verbs.h | 2 +- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c | 5 +++-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 2 +- drivers/infiniband/sw/rdmavt/ah.c | 11 ++++++----- drivers/infiniband/sw/rdmavt/ah.h | 4 ++-- drivers/infiniband/sw/rxe/rxe_verbs.c | 9 +++++---- include/rdma/ib_verbs.h | 9 +++++++-- 22 files changed, 66 insertions(+), 47 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 3bfadd8effcc..86be8a54a2d6 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -502,6 +502,7 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, u32 flags, struct ib_udata *udata) { + struct rdma_ah_init_attr init_attr = {}; struct ib_device *device = pd->device; struct ib_ah *ah; int ret; @@ -521,8 +522,10 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, ah->pd = pd; ah->type = ah_attr->type; ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); + init_attr.ah_attr = ah_attr; + init_attr.flags = flags; - ret = device->ops.create_ah(ah, ah_attr, flags, udata); + ret = device->ops.create_ah(ah, &init_attr, udata); if (ret) { kfree(ah); return ERR_PTR(ret); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index d98348e82422..5a7c090204c5 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -631,11 +631,12 @@ static u8 bnxt_re_stack_to_dev_nw_type(enum rdma_network_type ntype) return nw_type; } -int bnxt_re_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr, - u32 flags, struct ib_udata *udata) +int bnxt_re_create_ah(struct ib_ah *ib_ah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) { struct ib_pd *ib_pd = ib_ah->pd; struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); struct bnxt_re_dev *rdev = pd->rdev; const struct ib_gid_attr *sgid_attr; @@ -673,7 +674,8 @@ int bnxt_re_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr, memcpy(ah->qplib_ah.dmac, ah_attr->roce.dmac, ETH_ALEN); rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah, - !(flags & RDMA_CREATE_AH_SLEEPABLE)); + !(init_attr->flags & + RDMA_CREATE_AH_SLEEPABLE)); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate HW AH"); return rc; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 18dd46f46cf4..204c0849ba28 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -170,7 +170,7 @@ enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev, u8 port_num); int bnxt_re_alloc_pd(struct ib_pd *pd, struct ib_udata *udata); void bnxt_re_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata); -int bnxt_re_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags, +int bnxt_re_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata); int bnxt_re_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int bnxt_re_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index aa7396a1588a..45d519edb4c3 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -153,8 +153,7 @@ int efa_mmap(struct ib_ucontext *ibucontext, struct vm_area_struct *vma); void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry); int efa_create_ah(struct ib_ah *ibah, - struct rdma_ah_attr *ah_attr, - u32 flags, + struct rdma_ah_init_attr *init_attr, struct ib_udata *udata); void efa_destroy_ah(struct ib_ah *ibah, u32 flags); int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 5c57098a4aee..454b01b21e6a 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1639,10 +1639,10 @@ static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah) } int efa_create_ah(struct ib_ah *ibah, - struct rdma_ah_attr *ah_attr, - u32 flags, + struct rdma_ah_init_attr *init_attr, struct ib_udata *udata) { + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; struct efa_dev *dev = to_edev(ibah->device); struct efa_com_create_ah_params params = {}; struct efa_ibv_create_ah_resp resp = {}; @@ -1650,7 +1650,7 @@ int efa_create_ah(struct ib_ah *ibah, struct efa_ah *ah = to_eah(ibah); int err; - if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) { + if (!(init_attr->flags & RDMA_CREATE_AH_SLEEPABLE)) { ibdev_dbg(&dev->ibdev, "Create address handle is not supported in atomic context\n"); err = -EOPNOTSUPP; diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 8a522e14ef62..5b2f9314edd3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -39,13 +39,14 @@ #define HNS_ROCE_VLAN_SL_BIT_MASK 7 #define HNS_ROCE_VLAN_SL_SHIFT 13 -int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr, - u32 flags, struct ib_udata *udata) +int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibah->device); const struct ib_gid_attr *gid_attr; struct device *dev = hr_dev->dev; struct hns_roce_ah *ah = to_hr_ah(ibah); + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); u16 vlan_id = 0xffff; bool vlan_en = false; diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index ecbfeb6dbdd4..e1032cec2b12 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1208,8 +1208,8 @@ void hns_roce_bitmap_free_range(struct hns_roce_bitmap *bitmap, unsigned long obj, int cnt, int rr); -int hns_roce_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, - u32 flags, struct ib_udata *udata); +int hns_roce_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); void hns_roce_destroy_ah(struct ib_ah *ah, u32 flags); diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index 02a169f8027b..5f8f8d5c0ce0 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -141,10 +141,11 @@ static int create_iboe_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr) return 0; } -int mlx4_ib_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr, - u32 flags, struct ib_udata *udata) - +int mlx4_ib_create_ah(struct ib_ah *ib_ah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) { + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) return -EINVAL; @@ -167,12 +168,14 @@ int mlx4_ib_create_ah_slave(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, int slave_sgid_index, u8 *s_mac, u16 vlan_tag) { struct rdma_ah_attr slave_attr = *ah_attr; + struct rdma_ah_init_attr init_attr = {}; struct mlx4_ib_ah *mah = to_mah(ah); int ret; slave_attr.grh.sgid_attr = NULL; slave_attr.grh.sgid_index = slave_sgid_index; - ret = mlx4_ib_create_ah(ah, &slave_attr, 0, NULL); + init_attr.ah_attr = &slave_attr; + ret = mlx4_ib_create_ah(ah, &init_attr, NULL); if (ret) return ret; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index d188573187fa..182a237b87f7 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -752,7 +752,7 @@ int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); -int mlx4_ib_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags, +int mlx4_ib_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata); int mlx4_ib_create_ah_slave(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, int slave_sgid_index, u8 *s_mac, u16 vlan_tag); diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index 80642dd359bc..9b59348d51b5 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -68,10 +68,11 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, } } -int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr, - u32 flags, struct ib_udata *udata) +int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) { + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; struct mlx5_ib_ah *ah = to_mah(ibah); struct mlx5_ib_dev *dev = to_mdev(ibah->device); enum rdma_ah_attr_type ah_type = ah_attr->type; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index df375cb4efbb..7dffc87601eb 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1153,7 +1153,7 @@ void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db) void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq); void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); -int mlx5_ib_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags, +int mlx5_ib_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata); int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); void mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 69a3e4f62fb1..bc3e3d741ca3 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -388,14 +388,15 @@ static void mthca_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) mthca_pd_free(to_mdev(pd->device), to_mpd(pd)); } -static int mthca_ah_create(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr, - u32 flags, struct ib_udata *udata) +static int mthca_ah_create(struct ib_ah *ibah, + struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) { struct mthca_ah *ah = to_mah(ibah); - return mthca_create_ah(to_mdev(ibah->device), to_mpd(ibah->pd), ah_attr, - ah); + return mthca_create_ah(to_mdev(ibah->device), to_mpd(ibah->pd), + init_attr->ah_attr, ah); } static void mthca_ah_destroy(struct ib_ah *ah, u32 flags) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index 2b7f00ac41b0..6eea02b18968 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -155,7 +155,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, return status; } -int ocrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags, +int ocrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata) { u32 *ahid_addr; @@ -165,6 +165,7 @@ int ocrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags, u16 vlan_tag = 0xffff; const struct ib_gid_attr *sgid_attr; struct ocrdma_pd *pd = get_ocrdma_pd(ibah->pd); + struct rdma_ah_attr *attr = init_attr->ah_attr; struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device); if ((attr->type != RDMA_AH_ATTR_TYPE_ROCE) || diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h index 9780afcde780..8b73b3489f3a 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h @@ -51,7 +51,7 @@ enum { OCRDMA_AH_L3_TYPE_SHIFT = 0x1D /* 29 bits */ }; -int ocrdma_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags, +int ocrdma_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata); void ocrdma_destroy_ah(struct ib_ah *ah, u32 flags); int ocrdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index a5bd3adaf90a..d6b94a713573 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -2750,12 +2750,12 @@ int qedr_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) return 0; } -int qedr_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags, +int qedr_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata) { struct qedr_ah *ah = get_qedr_ah(ibah); - rdma_copy_ah_attr(&ah->attr, attr); + rdma_copy_ah_attr(&ah->attr, init_attr->ah_attr); return 0; } diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 18027844eb87..5e02387e068d 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -70,7 +70,7 @@ int qedr_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); void qedr_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata); int qedr_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_recv_wr); -int qedr_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags, +int qedr_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata); void qedr_destroy_ah(struct ib_ah *ibah, u32 flags); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index faf7ecd7b3fa..ccbded2d26ce 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -509,9 +509,10 @@ void pvrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) * * @return: 0 on success, otherwise errno. */ -int pvrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr, - u32 flags, struct ib_udata *udata) +int pvrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) { + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; struct pvrdma_dev *dev = to_vdev(ibah->device); struct pvrdma_ah *ah = to_vah(ibah); const struct ib_global_route *grh; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index e4a48f5c0c85..267702226f10 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -414,7 +414,7 @@ int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); -int pvrdma_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags, +int pvrdma_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata); void pvrdma_destroy_ah(struct ib_ah *ah, u32 flags); diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c index ee02c6176007..40480add7dd3 100644 --- a/drivers/infiniband/sw/rdmavt/ah.c +++ b/drivers/infiniband/sw/rdmavt/ah.c @@ -98,14 +98,14 @@ EXPORT_SYMBOL(rvt_check_ah); * * Return: 0 on success */ -int rvt_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr, - u32 create_flags, struct ib_udata *udata) +int rvt_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) { struct rvt_ah *ah = ibah_to_rvtah(ibah); struct rvt_dev_info *dev = ib_to_rvt(ibah->device); unsigned long flags; - if (rvt_check_ah(ibah->device, ah_attr)) + if (rvt_check_ah(ibah->device, init_attr->ah_attr)) return -EINVAL; spin_lock_irqsave(&dev->n_ahs_lock, flags); @@ -117,10 +117,11 @@ int rvt_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr, dev->n_ahs_allocated++; spin_unlock_irqrestore(&dev->n_ahs_lock, flags); - rdma_copy_ah_attr(&ah->attr, ah_attr); + rdma_copy_ah_attr(&ah->attr, init_attr->ah_attr); if (dev->driver_f.notify_new_ah) - dev->driver_f.notify_new_ah(ibah->device, ah_attr, ah); + dev->driver_f.notify_new_ah(ibah->device, + init_attr->ah_attr, ah); return 0; } diff --git a/drivers/infiniband/sw/rdmavt/ah.h b/drivers/infiniband/sw/rdmavt/ah.h index bbb4d3bdec4e..40b7123fec76 100644 --- a/drivers/infiniband/sw/rdmavt/ah.h +++ b/drivers/infiniband/sw/rdmavt/ah.h @@ -50,8 +50,8 @@ #include -int rvt_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, - u32 create_flags, struct ib_udata *udata); +int rvt_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); void rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags); int rvt_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); int rvt_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 9dd4bd7aea92..b8a22af724e8 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -195,15 +195,16 @@ static void rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) rxe_drop_ref(pd); } -static int rxe_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, - u32 flags, struct ib_udata *udata) +static int rxe_create_ah(struct ib_ah *ibah, + struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata) { int err; struct rxe_dev *rxe = to_rdev(ibah->device); struct rxe_ah *ah = to_rah(ibah); - err = rxe_av_chk_attr(rxe, attr); + err = rxe_av_chk_attr(rxe, init_attr->ah_attr); if (err) return err; @@ -211,7 +212,7 @@ static int rxe_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, if (err) return err; - rxe_init_av(attr, &ah->av); + rxe_init_av(init_attr->ah_attr, &ah->av); return 0; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index bbc5cfb57cd2..20ea26810349 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -880,6 +880,11 @@ struct ib_mr_status { */ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult); +struct rdma_ah_init_attr { + struct rdma_ah_attr *ah_attr; + u32 flags; +}; + enum rdma_ah_attr_type { RDMA_AH_ATTR_TYPE_UNDEFINED, RDMA_AH_ATTR_TYPE_IB, @@ -2403,8 +2408,8 @@ struct ib_device_ops { void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata); void (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata); - int (*create_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, - u32 flags, struct ib_udata *udata); + int (*create_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr, + struct ib_udata *udata); int (*modify_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int (*query_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); void (*destroy_ah)(struct ib_ah *ah, u32 flags); -- cgit v1.2.3 From bd3920eac133103f0d4aa5fc62290e6df9a6c6da Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:43 +0300 Subject: RDMA/core: Add LAG functionality Add support to get the RoCE LAG xmit slave by building skb of the RoCE packet and call to master_get_xmit_slave. If driver wants to get the slave assume all slaves are available, then need to set RDMA_LAG_FLAGS_HASH_ALL_SLAVES in flags. Link: https://lore.kernel.org/r/20200430192146.12863-14-maorg@mellanox.com Signed-off-by: Maor Gottlieb Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/lag.c | 136 +++++++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 1 + include/rdma/lag.h | 23 +++++++ 4 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/core/lag.c create mode 100644 include/rdma/lag.h diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index d1b14887960e..870f0fcd54d5 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ nldev.o restrack.o counters.o ib_core_uverbs.o \ - trace.o + trace.o lag.o ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git a/drivers/infiniband/core/lag.c b/drivers/infiniband/core/lag.c new file mode 100644 index 000000000000..a29533626a7c --- /dev/null +++ b/drivers/infiniband/core/lag.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020 Mellanox Technologies. All rights reserved. + */ + +#include +#include +#include + +static struct sk_buff *rdma_build_skb(struct ib_device *device, + struct net_device *netdev, + struct rdma_ah_attr *ah_attr, + gfp_t flags) +{ + struct ipv6hdr *ip6h; + struct sk_buff *skb; + struct ethhdr *eth; + struct iphdr *iph; + struct udphdr *uh; + u8 smac[ETH_ALEN]; + bool is_ipv4; + int hdr_len; + + is_ipv4 = ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw); + hdr_len = ETH_HLEN + sizeof(struct udphdr) + LL_RESERVED_SPACE(netdev); + hdr_len += is_ipv4 ? sizeof(struct iphdr) : sizeof(struct ipv6hdr); + + skb = alloc_skb(hdr_len, flags); + if (!skb) + return NULL; + + skb->dev = netdev; + skb_reserve(skb, hdr_len); + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + uh = udp_hdr(skb); + uh->source = htons(0xC000); + uh->dest = htons(ROCE_V2_UDP_DPORT); + uh->len = htons(sizeof(struct udphdr)); + + if (is_ipv4) { + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + iph = ip_hdr(skb); + iph->frag_off = 0; + iph->version = 4; + iph->protocol = IPPROTO_UDP; + iph->ihl = 0x5; + iph->tot_len = htons(sizeof(struct udphdr) + sizeof(struct + iphdr)); + memcpy(&iph->saddr, ah_attr->grh.sgid_attr->gid.raw + 12, + sizeof(struct in_addr)); + memcpy(&iph->daddr, ah_attr->grh.dgid.raw + 12, + sizeof(struct in_addr)); + } else { + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + ip6h = ipv6_hdr(skb); + ip6h->version = 6; + ip6h->nexthdr = IPPROTO_UDP; + memcpy(&ip6h->flow_lbl, &ah_attr->grh.flow_label, + sizeof(*ip6h->flow_lbl)); + memcpy(&ip6h->saddr, ah_attr->grh.sgid_attr->gid.raw, + sizeof(struct in6_addr)); + memcpy(&ip6h->daddr, ah_attr->grh.dgid.raw, + sizeof(struct in6_addr)); + } + + skb_push(skb, sizeof(struct ethhdr)); + skb_reset_mac_header(skb); + eth = eth_hdr(skb); + skb->protocol = eth->h_proto = htons(is_ipv4 ? ETH_P_IP : ETH_P_IPV6); + rdma_read_gid_l2_fields(ah_attr->grh.sgid_attr, NULL, smac); + memcpy(eth->h_source, smac, ETH_ALEN); + memcpy(eth->h_dest, ah_attr->roce.dmac, ETH_ALEN); + + return skb; +} + +static struct net_device *rdma_get_xmit_slave_udp(struct ib_device *device, + struct net_device *master, + struct rdma_ah_attr *ah_attr, + gfp_t flags) +{ + struct net_device *slave; + struct sk_buff *skb; + + skb = rdma_build_skb(device, master, ah_attr, flags); + if (!skb) + return ERR_PTR(-ENOMEM); + + rcu_read_lock(); + slave = netdev_get_xmit_slave(master, skb, + !!(device->lag_flags & + RDMA_LAG_FLAGS_HASH_ALL_SLAVES)); + if (slave) + dev_hold(slave); + rcu_read_unlock(); + kfree_skb(skb); + return slave; +} + +void rdma_lag_put_ah_roce_slave(struct net_device *xmit_slave) +{ + if (xmit_slave) + dev_put(xmit_slave); +} + +struct net_device *rdma_lag_get_ah_roce_slave(struct ib_device *device, + struct rdma_ah_attr *ah_attr, + gfp_t flags) +{ + struct net_device *slave = NULL; + struct net_device *master; + + if (!(ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE && + ah_attr->grh.sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)) + return NULL; + + rcu_read_lock(); + master = rdma_read_gid_attr_ndev_rcu(ah_attr->grh.sgid_attr); + if (IS_ERR(master)) { + rcu_read_unlock(); + return master; + } + dev_hold(master); + rcu_read_unlock(); + + if (!netif_is_bond_master(master)) + goto put; + + slave = rdma_get_xmit_slave_udp(device, master, ah_attr, flags); +put: + dev_put(master); + return slave; +} diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 20ea26810349..e6c18ec0365a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2714,6 +2714,7 @@ struct ib_device { /* Used by iWarp CM */ char iw_ifname[IFNAMSIZ]; u32 iw_driver_flags; + u32 lag_flags; }; struct ib_client_nl_info; diff --git a/include/rdma/lag.h b/include/rdma/lag.h new file mode 100644 index 000000000000..7c06ec9b2eef --- /dev/null +++ b/include/rdma/lag.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2020 Mellanox Technologies. All rights reserved. + */ + +#ifndef _RDMA_LAG_H_ +#define _RDMA_LAG_H_ + +#include + +struct ib_device; +struct rdma_ah_attr; + +enum rdma_lag_flags { + RDMA_LAG_FLAGS_HASH_ALL_SLAVES = 1 << 0 +}; + +void rdma_lag_put_ah_roce_slave(struct net_device *xmit_slave); +struct net_device *rdma_lag_get_ah_roce_slave(struct ib_device *device, + struct rdma_ah_attr *ah_attr, + gfp_t flags); + +#endif /* _RDMA_LAG_H_ */ -- cgit v1.2.3 From 51aab12631dd7700385d275846ca49dc0b8c2124 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:44 +0300 Subject: RDMA/core: Get xmit slave for LAG Add a call to rdma_lag_get_ah_roce_slave() when the address handle is created. Lower driver can use it to select the QP's affinity port. Link: https://lore.kernel.org/r/20200430192146.12863-15-maorg@mellanox.com Signed-off-by: Maor Gottlieb Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 61 +++++++++++++++++++++++++++++------------ include/rdma/ib_verbs.h | 2 ++ 2 files changed, 45 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 86be8a54a2d6..bf0249f76ae9 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "core_priv.h" #include @@ -500,7 +501,8 @@ rdma_update_sgid_attr(struct rdma_ah_attr *ah_attr, static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags, - struct ib_udata *udata) + struct ib_udata *udata, + struct net_device *xmit_slave) { struct rdma_ah_init_attr init_attr = {}; struct ib_device *device = pd->device; @@ -524,6 +526,7 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); init_attr.ah_attr = ah_attr; init_attr.flags = flags; + init_attr.xmit_slave = xmit_slave; ret = device->ops.create_ah(ah, &init_attr, udata); if (ret) { @@ -550,15 +553,22 @@ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags) { const struct ib_gid_attr *old_sgid_attr; + struct net_device *slave; struct ib_ah *ah; int ret; ret = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr); if (ret) return ERR_PTR(ret); - - ah = _rdma_create_ah(pd, ah_attr, flags, NULL); - + slave = rdma_lag_get_ah_roce_slave(pd->device, ah_attr, + (flags & RDMA_CREATE_AH_SLEEPABLE) ? + GFP_KERNEL : GFP_ATOMIC); + if (IS_ERR(slave)) { + rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); + return (void *)slave; + } + ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave); + rdma_lag_put_ah_roce_slave(slave); rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); return ah; } @@ -597,7 +607,8 @@ struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, } } - ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE, udata); + ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE, + udata, NULL); out: rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); @@ -1636,11 +1647,35 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, const struct ib_gid_attr *old_sgid_attr_alt_av; int ret; + attr->xmit_slave = NULL; if (attr_mask & IB_QP_AV) { ret = rdma_fill_sgid_attr(qp->device, &attr->ah_attr, &old_sgid_attr_av); if (ret) return ret; + + if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && + is_qp_type_connected(qp)) { + struct net_device *slave; + + /* + * If the user provided the qp_attr then we have to + * resolve it. Kerne users have to provide already + * resolved rdma_ah_attr's. + */ + if (udata) { + ret = ib_resolve_eth_dmac(qp->device, + &attr->ah_attr); + if (ret) + goto out_av; + } + slave = rdma_lag_get_ah_roce_slave(qp->device, + &attr->ah_attr, + GFP_KERNEL); + if (IS_ERR(slave)) + goto out_av; + attr->xmit_slave = slave; + } } if (attr_mask & IB_QP_ALT_PATH) { /* @@ -1667,18 +1702,6 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, } } - /* - * If the user provided the qp_attr then we have to resolve it. Kernel - * users have to provide already resolved rdma_ah_attr's - */ - if (udata && (attr_mask & IB_QP_AV) && - attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && - is_qp_type_connected(qp)) { - ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); - if (ret) - goto out; - } - if (rdma_ib_or_roce(qp->device, port)) { if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { dev_warn(&qp->device->dev, @@ -1720,8 +1743,10 @@ out: if (attr_mask & IB_QP_ALT_PATH) rdma_unfill_sgid_attr(&attr->alt_ah_attr, old_sgid_attr_alt_av); out_av: - if (attr_mask & IB_QP_AV) + if (attr_mask & IB_QP_AV) { + rdma_lag_put_ah_roce_slave(attr->xmit_slave); rdma_unfill_sgid_attr(&attr->ah_attr, old_sgid_attr_av); + } return ret; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e6c18ec0365a..8d29f2f79da8 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -883,6 +883,7 @@ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult); struct rdma_ah_init_attr { struct rdma_ah_attr *ah_attr; u32 flags; + struct net_device *xmit_slave; }; enum rdma_ah_attr_type { @@ -1272,6 +1273,7 @@ struct ib_qp_attr { u8 alt_port_num; u8 alt_timeout; u32 rate_limit; + struct net_device *xmit_slave; }; enum ib_wr_opcode { -- cgit v1.2.3 From 5163b2743ae00bf428a8a7e06839943b2f3965ed Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:45 +0300 Subject: RDMA/mlx5: Refactor affinity related code Move affinity related code in modify qp to function. It's a preparation for next patch the extend the affinity calculation to consider the xmit slave. Link: https://lore.kernel.org/r/20200430192146.12863-16-maorg@mellanox.com Signed-off-by: Maor Gottlieb Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 90 ++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 37 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 14f4f0982e4e..14bfdfc8ab96 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3582,33 +3582,61 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, return 0; } -static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev, - struct mlx5_ib_pd *pd, - struct mlx5_ib_qp_base *qp_base, - u8 port_num, struct ib_udata *udata) +static unsigned int get_tx_affinity_rr(struct mlx5_ib_dev *dev, + struct ib_udata *udata) { struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( udata, struct mlx5_ib_ucontext, ibucontext); - unsigned int tx_port_affinity; + u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1; + atomic_t *tx_port_affinity; - if (ucontext) { - tx_port_affinity = (unsigned int)atomic_add_return( - 1, &ucontext->tx_port_affinity) % - MLX5_MAX_PORTS + - 1; + if (ucontext) + tx_port_affinity = &ucontext->tx_port_affinity; + else + tx_port_affinity = &dev->port[port_num].roce.tx_port_affinity; + + return (unsigned int)atomic_add_return(1, tx_port_affinity) % + MLX5_MAX_PORTS + 1; +} + +static bool qp_supports_affinity(struct ib_qp *qp) +{ + struct mlx5_ib_qp *mqp = to_mqp(qp); + + if ((qp->qp_type == IB_QPT_RC) || + (qp->qp_type == IB_QPT_UD && + !(mqp->flags & MLX5_IB_QP_CREATE_SQPN_QP1)) || + (qp->qp_type == IB_QPT_UC) || + (qp->qp_type == IB_QPT_RAW_PACKET) || + (qp->qp_type == IB_QPT_XRC_INI) || + (qp->qp_type == IB_QPT_XRC_TGT)) + return true; + return false; +} + +static unsigned int get_tx_affinity(struct ib_qp *qp, u8 init, + struct ib_udata *udata) +{ + struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( + udata, struct mlx5_ib_ucontext, ibucontext); + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_qp *mqp = to_mqp(qp); + struct mlx5_ib_qp_base *qp_base; + unsigned int tx_affinity; + + if (!(dev->lag_active && init && qp_supports_affinity(qp))) + return 0; + + tx_affinity = get_tx_affinity_rr(dev, udata); + + qp_base = &mqp->trans_qp.base; + if (ucontext) mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x ucontext %p\n", - tx_port_affinity, qp_base->mqp.qpn, ucontext); - } else { - tx_port_affinity = - (unsigned int)atomic_add_return( - 1, &dev->port[port_num].roce.tx_port_affinity) % - MLX5_MAX_PORTS + - 1; + tx_affinity, qp_base->mqp.qpn, ucontext); + else mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x\n", - tx_port_affinity, qp_base->mqp.qpn); - } - - return tx_port_affinity; + tx_affinity, qp_base->mqp.qpn); + return tx_affinity; } static int __mlx5_ib_qp_set_counter(struct ib_qp *qp, @@ -3718,22 +3746,10 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } } - if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) { - if ((ibqp->qp_type == IB_QPT_RC) || - (ibqp->qp_type == IB_QPT_UD && - !(qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1)) || - (ibqp->qp_type == IB_QPT_UC) || - (ibqp->qp_type == IB_QPT_RAW_PACKET) || - (ibqp->qp_type == IB_QPT_XRC_INI) || - (ibqp->qp_type == IB_QPT_XRC_TGT)) { - if (dev->lag_active) { - u8 p = mlx5_core_native_port_num(dev->mdev) - 1; - tx_affinity = get_tx_affinity(dev, pd, base, p, - udata); - context->flags |= cpu_to_be32(tx_affinity << 24); - } - } - } + tx_affinity = get_tx_affinity(ibqp, + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT, udata); + context->flags |= cpu_to_be32(tx_affinity << 24); if (is_sqp(ibqp->qp_type)) { context->mtu_msgmax = (IB_MTU_256 << 5) | 8; -- cgit v1.2.3 From cfc1a89e449c02207952c72a4c0394691fdedf43 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:46 +0300 Subject: RDMA/mlx5: Set lag tx affinity according to slave The patch sets the lag tx affinity of the data QPs and the GSI QPs according to the LAG xmit slave. For GSI QPs, in case the link layer is Ethenet (RoCE) we create two GSI QPs, one for each physical port. When the driver selects the GSI QP, it will consider the port affinity result. For connected QPs, the driver sets the affinity of the xmit slave. The above, ensures that RC QP and it's corresponding GSI QP will transmit from the same physical port. Link: https://lore.kernel.org/r/20200430192146.12863-17-maorg@mellanox.com Signed-off-by: Maor Gottlieb Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/ah.c | 9 +++++-- drivers/infiniband/hw/mlx5/gsi.c | 33 ++++++++++++++++++----- drivers/infiniband/hw/mlx5/main.c | 2 ++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + drivers/infiniband/hw/mlx5/qp.c | 52 ++++++++++++++++++++++++------------ include/linux/mlx5/mlx5_ifc.h | 4 ++- include/linux/mlx5/qp.h | 2 ++ 7 files changed, 76 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index 9b59348d51b5..cc858f658567 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -33,8 +33,9 @@ #include "mlx5_ib.h" static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, - struct rdma_ah_attr *ah_attr) + struct rdma_ah_init_attr *init_attr) { + struct rdma_ah_attr *ah_attr = init_attr->ah_attr; enum ib_gid_type gid_type; if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { @@ -51,6 +52,10 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4); if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { + if (init_attr->xmit_slave) + ah->xmit_port = + mlx5_lag_get_slave_port(dev->mdev, + init_attr->xmit_slave); gid_type = ah_attr->grh.sgid_attr->gid_type; memcpy(ah->av.rmac, ah_attr->roce.dmac, @@ -98,7 +103,7 @@ int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, return err; } - create_ib_ah(dev, ah, ah_attr); + create_ib_ah(dev, ah, init_attr); return 0; } diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index 1afbf03d1a98..40d418153891 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -119,10 +119,17 @@ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, struct mlx5_ib_gsi_qp *gsi; struct ib_qp_init_attr hw_init_attr = *init_attr; const u8 port_num = init_attr->port_num; - const int num_pkeys = pd->device->attrs.max_pkeys; - const int num_qps = mlx5_ib_deth_sqpn_cap(dev) ? num_pkeys : 0; + int num_qps = 0; int ret; + if (mlx5_ib_deth_sqpn_cap(dev)) { + if (MLX5_CAP_GEN(dev->mdev, + port_type) == MLX5_CAP_PORT_TYPE_IB) + num_qps = pd->device->attrs.max_pkeys; + else if (dev->lag_active) + num_qps = MLX5_MAX_PORTS; + } + gsi = kzalloc(sizeof(*gsi), GFP_KERNEL); if (!gsi) return ERR_PTR(-ENOMEM); @@ -261,7 +268,7 @@ static struct ib_qp *create_gsi_ud_qp(struct mlx5_ib_gsi_qp *gsi) } static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp, - u16 qp_index) + u16 pkey_index) { struct mlx5_ib_dev *dev = to_mdev(qp->device); struct ib_qp_attr attr; @@ -270,7 +277,7 @@ static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp, mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT; attr.qp_state = IB_QPS_INIT; - attr.pkey_index = qp_index; + attr.pkey_index = pkey_index; attr.qkey = IB_QP1_QKEY; attr.port_num = gsi->port_num; ret = ib_modify_qp(qp, &attr, mask); @@ -304,12 +311,17 @@ static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) { struct ib_device *device = gsi->rx_qp->device; struct mlx5_ib_dev *dev = to_mdev(device); + int pkey_index = qp_index; + struct mlx5_ib_qp *mqp; struct ib_qp *qp; unsigned long flags; u16 pkey; int ret; - ret = ib_query_pkey(device, gsi->port_num, qp_index, &pkey); + if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_IB) + pkey_index = 0; + + ret = ib_query_pkey(device, gsi->port_num, pkey_index, &pkey); if (ret) { mlx5_ib_warn(dev, "unable to read P_Key at port %d, index %d\n", gsi->port_num, qp_index); @@ -338,7 +350,10 @@ static void setup_qp(struct mlx5_ib_gsi_qp *gsi, u16 qp_index) return; } - ret = modify_to_rts(gsi, qp, qp_index); + mqp = to_mqp(qp); + if (dev->lag_active) + mqp->gsi_lag_port = qp_index + 1; + ret = modify_to_rts(gsi, qp, pkey_index); if (ret) goto err_destroy_qp; @@ -457,11 +472,15 @@ static int mlx5_ib_gsi_silent_drop(struct mlx5_ib_gsi_qp *gsi, static struct ib_qp *get_tx_qp(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr) { struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device); + struct mlx5_ib_ah *ah = to_mah(wr->ah); int qp_index = wr->pkey_index; - if (!mlx5_ib_deth_sqpn_cap(dev)) + if (!gsi->num_qps) return gsi->rx_qp; + if (dev->lag_active && ah->xmit_port) + qp_index = ah->xmit_port - 1; + if (qp_index >= gsi->num_qps) return NULL; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 80ae8f04bfd5..e7fb290c9d8d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include "mlx5_ib.h" @@ -6567,6 +6568,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_count(mdev); dev->ib_dev.dev.parent = mdev->device; + dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES; mutex_init(&dev->cap_mask_mutex); INIT_LIST_HEAD(&dev->qp_list); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 7dffc87601eb..f250753319d0 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -461,6 +461,7 @@ struct mlx5_ib_qp { * but not take effective */ u32 counter_pending; + u16 gsi_lag_port; }; struct mlx5_ib_cq_buf { diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 14bfdfc8ab96..810bbd52daec 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3218,10 +3218,12 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PKEY_INDEX | - MLX5_QP_OPTPAR_PRI_PORT, + MLX5_QP_OPTPAR_PRI_PORT | + MLX5_QP_OPTPAR_LAG_TX_AFF, [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PKEY_INDEX | - MLX5_QP_OPTPAR_PRI_PORT, + MLX5_QP_OPTPAR_PRI_PORT | + MLX5_QP_OPTPAR_LAG_TX_AFF, [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | MLX5_QP_OPTPAR_Q_KEY | MLX5_QP_OPTPAR_PRI_PORT, @@ -3229,17 +3231,20 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_PKEY_INDEX | - MLX5_QP_OPTPAR_PRI_PORT, + MLX5_QP_OPTPAR_PRI_PORT | + MLX5_QP_OPTPAR_LAG_TX_AFF, }, [MLX5_QP_STATE_RTR] = { [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | - MLX5_QP_OPTPAR_PKEY_INDEX, + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_LAG_TX_AFF, [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH | MLX5_QP_OPTPAR_RWE | - MLX5_QP_OPTPAR_PKEY_INDEX, + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_LAG_TX_AFF, [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX | MLX5_QP_OPTPAR_Q_KEY, [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX | @@ -3248,7 +3253,8 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | - MLX5_QP_OPTPAR_PKEY_INDEX, + MLX5_QP_OPTPAR_PKEY_INDEX | + MLX5_QP_OPTPAR_LAG_TX_AFF, }, }, [MLX5_QP_STATE_RTR] = { @@ -3601,11 +3607,8 @@ static unsigned int get_tx_affinity_rr(struct mlx5_ib_dev *dev, static bool qp_supports_affinity(struct ib_qp *qp) { - struct mlx5_ib_qp *mqp = to_mqp(qp); - if ((qp->qp_type == IB_QPT_RC) || - (qp->qp_type == IB_QPT_UD && - !(mqp->flags & MLX5_IB_QP_CREATE_SQPN_QP1)) || + (qp->qp_type == IB_QPT_UD) || (qp->qp_type == IB_QPT_UC) || (qp->qp_type == IB_QPT_RAW_PACKET) || (qp->qp_type == IB_QPT_XRC_INI) || @@ -3614,7 +3617,9 @@ static bool qp_supports_affinity(struct ib_qp *qp) return false; } -static unsigned int get_tx_affinity(struct ib_qp *qp, u8 init, +static unsigned int get_tx_affinity(struct ib_qp *qp, + const struct ib_qp_attr *attr, + int attr_mask, u8 init, struct ib_udata *udata) { struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context( @@ -3624,10 +3629,18 @@ static unsigned int get_tx_affinity(struct ib_qp *qp, u8 init, struct mlx5_ib_qp_base *qp_base; unsigned int tx_affinity; - if (!(dev->lag_active && init && qp_supports_affinity(qp))) + if (!(dev->lag_active && qp_supports_affinity(qp))) return 0; - tx_affinity = get_tx_affinity_rr(dev, udata); + if (mqp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) + tx_affinity = mqp->gsi_lag_port; + else if (init) + tx_affinity = get_tx_affinity_rr(dev, udata); + else if ((attr_mask & IB_QP_AV) && attr->xmit_slave) + tx_affinity = + mlx5_lag_get_slave_port(dev->mdev, attr->xmit_slave); + else + return 0; qp_base = &mqp->trans_qp.base; if (ucontext) @@ -3712,7 +3725,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, struct mlx5_qp_context *context; struct mlx5_ib_pd *pd; enum mlx5_qp_state mlx5_cur, mlx5_new; - enum mlx5_qp_optpar optpar; + enum mlx5_qp_optpar optpar = 0; u32 set_id = 0; int mlx5_st; int err; @@ -3746,10 +3759,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } } - tx_affinity = get_tx_affinity(ibqp, + tx_affinity = get_tx_affinity(ibqp, attr, attr_mask, cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT, udata); - context->flags |= cpu_to_be32(tx_affinity << 24); + if (tx_affinity) { + context->flags |= cpu_to_be32(tx_affinity << 24); + if (new_state == IB_QPS_RTR && + MLX5_CAP_GEN(dev->mdev, init2_lag_tx_port_affinity)) + optpar |= MLX5_QP_OPTPAR_LAG_TX_AFF; + } if (is_sqp(ibqp->qp_type)) { context->mtu_msgmax = (IB_MTU_256 << 5) | 8; @@ -3886,7 +3904,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } op = optab[mlx5_cur][mlx5_new]; - optpar = ib_mask_to_mlx5_opt(attr_mask); + optpar |= ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index fb243848132d..c1ba89198335 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1321,7 +1321,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 stat_rate_support[0x10]; u8 reserved_at_1f0[0x1]; u8 pci_sync_for_fw_update_event[0x1]; - u8 reserved_at_1f2[0xa]; + u8 reserved_at_1f2[0x6]; + u8 init2_lag_tx_port_affinity[0x1]; + u8 reserved_at_1fa[0x3]; u8 cqe_version[0x4]; u8 compact_address_vector[0x1]; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index f23eb18526fe..b9facdb9b9bd 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -66,6 +66,7 @@ enum mlx5_qp_optpar { MLX5_QP_OPTPAR_RETRY_COUNT = 1 << 12, MLX5_QP_OPTPAR_RNR_RETRY = 1 << 13, MLX5_QP_OPTPAR_ACK_TIMEOUT = 1 << 14, + MLX5_QP_OPTPAR_LAG_TX_AFF = 1 << 15, MLX5_QP_OPTPAR_PRI_PORT = 1 << 16, MLX5_QP_OPTPAR_SRQN = 1 << 18, MLX5_QP_OPTPAR_CQN_RCV = 1 << 19, @@ -321,6 +322,7 @@ struct mlx5_av { struct mlx5_ib_ah { struct ib_ah ibah; struct mlx5_av av; + u8 xmit_port; }; static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) -- cgit v1.2.3 From b2ea69b3b4430642c98eea2c2d08419f2f02124d Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 20 Apr 2020 09:22:11 +0300 Subject: RDMA/efa: Report create CQ error counter Create CQ errors are already being counted, report them along all other counters. Link: https://lore.kernel.org/r/20200420062213.44577-2-galpress@amazon.com Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_verbs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 454b01b21e6a..43cc93b19569 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -41,6 +41,7 @@ struct efa_user_mmap_entry { op(EFA_KEEP_ALIVE_RCVD, "keep_alive_rcvd") \ op(EFA_ALLOC_PD_ERR, "alloc_pd_err") \ op(EFA_CREATE_QP_ERR, "create_qp_err") \ + op(EFA_CREATE_CQ_ERR, "create_cq_err") \ op(EFA_REG_MR_ERR, "reg_mr_err") \ op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \ op(EFA_CREATE_AH_ERR, "create_ah_err") @@ -1753,6 +1754,7 @@ int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, stats->value[EFA_KEEP_ALIVE_RCVD] = atomic64_read(&s->keep_alive_rcvd); stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->sw_stats.alloc_pd_err); stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->sw_stats.create_qp_err); + stats->value[EFA_CREATE_CQ_ERR] = atomic64_read(&s->sw_stats.create_cq_err); stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->sw_stats.reg_mr_err); stats->value[EFA_ALLOC_UCONTEXT_ERR] = atomic64_read(&s->sw_stats.alloc_ucontext_err); stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->sw_stats.create_ah_err); -- cgit v1.2.3 From eca5757f804f046dfaab4e9d3ea39af1f2523990 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 20 Apr 2020 09:22:12 +0300 Subject: RDMA/efa: Count mmap failures Add a new stat that counts mmap failures, which might help when debugging different issues. Link: https://lore.kernel.org/r/20200420062213.44577-3-galpress@amazon.com Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa.h | 3 ++- drivers/infiniband/hw/efa/efa_verbs.c | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index 45d519edb4c3..1889dd172a25 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_H_ @@ -40,6 +40,7 @@ struct efa_sw_stats { atomic64_t reg_mr_err; atomic64_t alloc_ucontext_err; atomic64_t create_ah_err; + atomic64_t mmap_err; }; /* Don't use anything other than atomic64 */ diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 43cc93b19569..1f8162b2067d 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -44,7 +44,8 @@ struct efa_user_mmap_entry { op(EFA_CREATE_CQ_ERR, "create_cq_err") \ op(EFA_REG_MR_ERR, "reg_mr_err") \ op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \ - op(EFA_CREATE_AH_ERR, "create_ah_err") + op(EFA_CREATE_AH_ERR, "create_ah_err") \ + op(EFA_MMAP_ERR, "mmap_err") #define EFA_STATS_ENUM(ename, name) ename, #define EFA_STATS_STR(ename, name) [ename] = name, @@ -1569,6 +1570,7 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext, ibdev_dbg(&dev->ibdev, "pgoff[%#lx] does not have valid entry\n", vma->vm_pgoff); + atomic64_inc(&dev->stats.sw_stats.mmap_err); return -EINVAL; } entry = to_emmap(rdma_entry); @@ -1604,12 +1606,14 @@ static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext, err = -EINVAL; } - if (err) + if (err) { ibdev_dbg( &dev->ibdev, "Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n", entry->address, rdma_entry->npages * PAGE_SIZE, entry->mmap_flag, err); + atomic64_inc(&dev->stats.sw_stats.mmap_err); + } rdma_user_mmap_entry_put(rdma_entry); return err; @@ -1758,6 +1762,7 @@ int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->sw_stats.reg_mr_err); stats->value[EFA_ALLOC_UCONTEXT_ERR] = atomic64_read(&s->sw_stats.alloc_ucontext_err); stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->sw_stats.create_ah_err); + stats->value[EFA_MMAP_ERR] = atomic64_read(&s->sw_stats.mmap_err); return ARRAY_SIZE(efa_stats_names); } -- cgit v1.2.3 From f86e34374a05635332229d1928796d04017ddf16 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Mon, 20 Apr 2020 09:22:13 +0300 Subject: RDMA/efa: Count admin commands errors Add a new stat that counts admin commands failures, which might help when debugging different issues. Link: https://lore.kernel.org/r/20200420062213.44577-4-galpress@amazon.com Reviewed-by: Daniel Kranzdorf Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_com.c | 5 ++++- drivers/infiniband/hw/efa/efa_com.h | 3 ++- drivers/infiniband/hw/efa/efa_verbs.c | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c index 7fce69f5568f..336bc2c57bb1 100644 --- a/drivers/infiniband/hw/efa/efa_com.c +++ b/drivers/infiniband/hw/efa/efa_com.c @@ -631,17 +631,20 @@ int efa_com_cmd_exec(struct efa_com_admin_queue *aq, cmd->aq_common_descriptor.opcode, PTR_ERR(comp_ctx)); up(&aq->avail_cmds); + atomic64_inc(&aq->stats.cmd_err); return PTR_ERR(comp_ctx); } err = efa_com_wait_and_process_admin_cq(comp_ctx, aq); - if (err) + if (err) { ibdev_err_ratelimited( aq->efa_dev, "Failed to process command %s (opcode %u) comp_status %d err %d\n", efa_com_cmd_str(cmd->aq_common_descriptor.opcode), cmd->aq_common_descriptor.opcode, comp_ctx->comp_status, err); + atomic64_inc(&aq->stats.cmd_err); + } up(&aq->avail_cmds); diff --git a/drivers/infiniband/hw/efa/efa_com.h b/drivers/infiniband/hw/efa/efa_com.h index c67dd8109d1c..5e4c88877ddb 100644 --- a/drivers/infiniband/hw/efa/efa_com.h +++ b/drivers/infiniband/hw/efa/efa_com.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_COM_H_ @@ -47,6 +47,7 @@ struct efa_com_admin_sq { struct efa_com_stats_admin { atomic64_t submitted_cmd; atomic64_t completed_cmd; + atomic64_t cmd_err; atomic64_t no_completion; }; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 1f8162b2067d..08313f7c73bc 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -37,6 +37,7 @@ struct efa_user_mmap_entry { op(EFA_RX_DROPS, "rx_drops") \ op(EFA_SUBMITTED_CMDS, "submitted_cmds") \ op(EFA_COMPLETED_CMDS, "completed_cmds") \ + op(EFA_CMDS_ERR, "cmds_err") \ op(EFA_NO_COMPLETION_CMDS, "no_completion_cmds") \ op(EFA_KEEP_ALIVE_RCVD, "keep_alive_rcvd") \ op(EFA_ALLOC_PD_ERR, "alloc_pd_err") \ @@ -1752,6 +1753,7 @@ int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, as = &dev->edev.aq.stats; stats->value[EFA_SUBMITTED_CMDS] = atomic64_read(&as->submitted_cmd); stats->value[EFA_COMPLETED_CMDS] = atomic64_read(&as->completed_cmd); + stats->value[EFA_CMDS_ERR] = atomic64_read(&as->cmd_err); stats->value[EFA_NO_COMPLETION_CMDS] = atomic64_read(&as->no_completion); s = &dev->stats; -- cgit v1.2.3 From 04c349a96506961b1b31e8d03e784fe3c5413e0b Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 13 Apr 2020 16:24:08 +0300 Subject: RDMA/mad: Remove snoop interface Snoop interface is not used. Remove it. Link: https://lore.kernel.org/r/20200413132408.931084-1-leon@kernel.org Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/mad.c | 238 +----------------------------------------- include/rdma/ib_mad.h | 49 +-------- 2 files changed, 6 insertions(+), 281 deletions(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index c54db13fa9b0..e02b5c4fdf09 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -85,7 +85,6 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); -/* Client ID 0 is used for snoop-only clients */ static DEFINE_XARRAY_ALLOC1(ib_mad_clients); static u32 ib_mad_client_next; static struct list_head ib_mad_port_list; @@ -483,141 +482,12 @@ error1: } EXPORT_SYMBOL(ib_register_mad_agent); -static inline int is_snooping_sends(int mad_snoop_flags) -{ - return (mad_snoop_flags & - (/*IB_MAD_SNOOP_POSTED_SENDS | - IB_MAD_SNOOP_RMPP_SENDS |*/ - IB_MAD_SNOOP_SEND_COMPLETIONS /*| - IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/)); -} - -static inline int is_snooping_recvs(int mad_snoop_flags) -{ - return (mad_snoop_flags & - (IB_MAD_SNOOP_RECVS /*| - IB_MAD_SNOOP_RMPP_RECVS*/)); -} - -static int register_snoop_agent(struct ib_mad_qp_info *qp_info, - struct ib_mad_snoop_private *mad_snoop_priv) -{ - struct ib_mad_snoop_private **new_snoop_table; - unsigned long flags; - int i; - - spin_lock_irqsave(&qp_info->snoop_lock, flags); - /* Check for empty slot in array. */ - for (i = 0; i < qp_info->snoop_table_size; i++) - if (!qp_info->snoop_table[i]) - break; - - if (i == qp_info->snoop_table_size) { - /* Grow table. */ - new_snoop_table = krealloc(qp_info->snoop_table, - sizeof mad_snoop_priv * - (qp_info->snoop_table_size + 1), - GFP_ATOMIC); - if (!new_snoop_table) { - i = -ENOMEM; - goto out; - } - - qp_info->snoop_table = new_snoop_table; - qp_info->snoop_table_size++; - } - qp_info->snoop_table[i] = mad_snoop_priv; - atomic_inc(&qp_info->snoop_count); -out: - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - return i; -} - -struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, - u8 port_num, - enum ib_qp_type qp_type, - int mad_snoop_flags, - ib_mad_snoop_handler snoop_handler, - ib_mad_recv_handler recv_handler, - void *context) -{ - struct ib_mad_port_private *port_priv; - struct ib_mad_agent *ret; - struct ib_mad_snoop_private *mad_snoop_priv; - int qpn; - int err; - - /* Validate parameters */ - if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) || - (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) { - ret = ERR_PTR(-EINVAL); - goto error1; - } - qpn = get_spl_qp_index(qp_type); - if (qpn == -1) { - ret = ERR_PTR(-EINVAL); - goto error1; - } - port_priv = ib_get_mad_port(device, port_num); - if (!port_priv) { - ret = ERR_PTR(-ENODEV); - goto error1; - } - /* Allocate structures */ - mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL); - if (!mad_snoop_priv) { - ret = ERR_PTR(-ENOMEM); - goto error1; - } - - /* Now, fill in the various structures */ - mad_snoop_priv->qp_info = &port_priv->qp_info[qpn]; - mad_snoop_priv->agent.device = device; - mad_snoop_priv->agent.recv_handler = recv_handler; - mad_snoop_priv->agent.snoop_handler = snoop_handler; - mad_snoop_priv->agent.context = context; - mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp; - mad_snoop_priv->agent.port_num = port_num; - mad_snoop_priv->mad_snoop_flags = mad_snoop_flags; - init_completion(&mad_snoop_priv->comp); - - err = ib_mad_agent_security_setup(&mad_snoop_priv->agent, qp_type); - if (err) { - ret = ERR_PTR(err); - goto error2; - } - - mad_snoop_priv->snoop_index = register_snoop_agent( - &port_priv->qp_info[qpn], - mad_snoop_priv); - if (mad_snoop_priv->snoop_index < 0) { - ret = ERR_PTR(mad_snoop_priv->snoop_index); - goto error3; - } - - atomic_set(&mad_snoop_priv->refcount, 1); - return &mad_snoop_priv->agent; -error3: - ib_mad_agent_security_cleanup(&mad_snoop_priv->agent); -error2: - kfree(mad_snoop_priv); -error1: - return ret; -} -EXPORT_SYMBOL(ib_register_mad_snoop); - static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv) { if (atomic_dec_and_test(&mad_agent_priv->refcount)) complete(&mad_agent_priv->comp); } -static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv) -{ - if (atomic_dec_and_test(&mad_snoop_priv->refcount)) - complete(&mad_snoop_priv->comp); -} - static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) { struct ib_mad_port_private *port_priv; @@ -650,25 +520,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) kfree_rcu(mad_agent_priv, rcu); } -static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) -{ - struct ib_mad_qp_info *qp_info; - unsigned long flags; - - qp_info = mad_snoop_priv->qp_info; - spin_lock_irqsave(&qp_info->snoop_lock, flags); - qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL; - atomic_dec(&qp_info->snoop_count); - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - - deref_snoop_agent(mad_snoop_priv); - wait_for_completion(&mad_snoop_priv->comp); - - ib_mad_agent_security_cleanup(&mad_snoop_priv->agent); - - kfree(mad_snoop_priv); -} - /* * ib_unregister_mad_agent - Unregisters a client from using MAD services * @@ -677,20 +528,11 @@ static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) void ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) { struct ib_mad_agent_private *mad_agent_priv; - struct ib_mad_snoop_private *mad_snoop_priv; - - /* If the TID is zero, the agent can only snoop. */ - if (mad_agent->hi_tid) { - mad_agent_priv = container_of(mad_agent, - struct ib_mad_agent_private, - agent); - unregister_mad_agent(mad_agent_priv); - } else { - mad_snoop_priv = container_of(mad_agent, - struct ib_mad_snoop_private, - agent); - unregister_mad_snoop(mad_snoop_priv); - } + + mad_agent_priv = container_of(mad_agent, + struct ib_mad_agent_private, + agent); + unregister_mad_agent(mad_agent_priv); } EXPORT_SYMBOL(ib_unregister_mad_agent); @@ -706,57 +548,6 @@ static void dequeue_mad(struct ib_mad_list_head *mad_list) spin_unlock_irqrestore(&mad_queue->lock, flags); } -static void snoop_send(struct ib_mad_qp_info *qp_info, - struct ib_mad_send_buf *send_buf, - struct ib_mad_send_wc *mad_send_wc, - int mad_snoop_flags) -{ - struct ib_mad_snoop_private *mad_snoop_priv; - unsigned long flags; - int i; - - spin_lock_irqsave(&qp_info->snoop_lock, flags); - for (i = 0; i < qp_info->snoop_table_size; i++) { - mad_snoop_priv = qp_info->snoop_table[i]; - if (!mad_snoop_priv || - !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) - continue; - - atomic_inc(&mad_snoop_priv->refcount); - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent, - send_buf, mad_send_wc); - deref_snoop_agent(mad_snoop_priv); - spin_lock_irqsave(&qp_info->snoop_lock, flags); - } - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); -} - -static void snoop_recv(struct ib_mad_qp_info *qp_info, - struct ib_mad_recv_wc *mad_recv_wc, - int mad_snoop_flags) -{ - struct ib_mad_snoop_private *mad_snoop_priv; - unsigned long flags; - int i; - - spin_lock_irqsave(&qp_info->snoop_lock, flags); - for (i = 0; i < qp_info->snoop_table_size; i++) { - mad_snoop_priv = qp_info->snoop_table[i]; - if (!mad_snoop_priv || - !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags)) - continue; - - atomic_inc(&mad_snoop_priv->refcount); - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL, - mad_recv_wc); - deref_snoop_agent(mad_snoop_priv); - spin_lock_irqsave(&qp_info->snoop_lock, flags); - } - spin_unlock_irqrestore(&qp_info->snoop_lock, flags); -} - static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid, u16 pkey_index, u8 port_num, struct ib_wc *wc) { @@ -2289,9 +2080,6 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) recv->header.recv_wc.recv_buf.mad = (struct ib_mad *)recv->mad; recv->header.recv_wc.recv_buf.grh = &recv->grh; - if (atomic_read(&qp_info->snoop_count)) - snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS); - /* Validate MAD */ if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa)) goto out; @@ -2538,9 +2326,6 @@ retry: mad_send_wc.send_buf = &mad_send_wr->send_buf; mad_send_wc.status = wc->status; mad_send_wc.vendor_err = wc->vendor_err; - if (atomic_read(&qp_info->snoop_count)) - snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc, - IB_MAD_SNOOP_SEND_COMPLETIONS); ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); if (queued_send_wr) { @@ -2782,10 +2567,6 @@ static void local_completions(struct work_struct *work) local->mad_priv->header.recv_wc.recv_buf.grh = NULL; local->mad_priv->header.recv_wc.recv_buf.mad = (struct ib_mad *)local->mad_priv->mad; - if (atomic_read(&recv_mad_agent->qp_info->snoop_count)) - snoop_recv(recv_mad_agent->qp_info, - &local->mad_priv->header.recv_wc, - IB_MAD_SNOOP_RECVS); recv_mad_agent->agent.recv_handler( &recv_mad_agent->agent, &local->mad_send_wr->send_buf, @@ -2800,10 +2581,6 @@ local_send_completion: mad_send_wc.status = IB_WC_SUCCESS; mad_send_wc.vendor_err = 0; mad_send_wc.send_buf = &local->mad_send_wr->send_buf; - if (atomic_read(&mad_agent_priv->qp_info->snoop_count)) - snoop_send(mad_agent_priv->qp_info, - &local->mad_send_wr->send_buf, - &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS); mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); @@ -3119,10 +2896,6 @@ static void init_mad_qp(struct ib_mad_port_private *port_priv, init_mad_queue(qp_info, &qp_info->send_queue); init_mad_queue(qp_info, &qp_info->recv_queue); INIT_LIST_HEAD(&qp_info->overflow_list); - spin_lock_init(&qp_info->snoop_lock); - qp_info->snoop_table = NULL; - qp_info->snoop_table_size = 0; - atomic_set(&qp_info->snoop_count, 0); } static int create_mad_qp(struct ib_mad_qp_info *qp_info, @@ -3166,7 +2939,6 @@ static void destroy_mad_qp(struct ib_mad_qp_info *qp_info) return; ib_destroy_qp(qp_info->qp); - kfree(qp_info->snoop_table); } /* diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 4e62650e2127..8c093fc1bb9f 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -558,20 +558,6 @@ struct ib_mad_recv_wc; typedef void (*ib_mad_send_handler)(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_send_wc); -/** - * ib_mad_snoop_handler - Callback handler for snooping sent MADs. - * @mad_agent: MAD agent that snooped the MAD. - * @send_buf: send MAD data buffer. - * @mad_send_wc: Work completion information on the sent MAD. Valid - * only for snooping that occurs on a send completion. - * - * Clients snooping MADs should not modify data referenced by the @send_buf - * or @mad_send_wc. - */ -typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, - struct ib_mad_send_buf *send_buf, - struct ib_mad_send_wc *mad_send_wc); - /** * ib_mad_recv_handler - callback handler for a received MAD. * @mad_agent: MAD agent requesting the received MAD. @@ -581,8 +567,7 @@ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, * MADs received in response to a send request operation will be handed to * the user before the send operation completes. All data buffers given * to registered agents through this routine are owned by the receiving - * client, except for snooping agents. Clients snooping MADs should not - * modify the data referenced by @mad_recv_wc. + * client. */ typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, @@ -595,7 +580,6 @@ typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent, * @mr: Memory region for system memory usable for DMA. * @recv_handler: Callback handler for a received MAD. * @send_handler: Callback handler for a sent MAD. - * @snoop_handler: Callback handler for snooped sent MADs. * @context: User-specified context associated with this registration. * @hi_tid: Access layer assigned transaction ID for this client. * Unsolicited MADs sent by this client will have the upper 32-bits @@ -612,7 +596,6 @@ struct ib_mad_agent { struct ib_qp *qp; ib_mad_recv_handler recv_handler; ib_mad_send_handler send_handler; - ib_mad_snoop_handler snoop_handler; void *context; u32 hi_tid; u32 flags; @@ -720,36 +703,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, ib_mad_recv_handler recv_handler, void *context, u32 registration_flags); - -enum ib_mad_snoop_flags { - /*IB_MAD_SNOOP_POSTED_SENDS = 1,*/ - /*IB_MAD_SNOOP_RMPP_SENDS = (1<<1),*/ - IB_MAD_SNOOP_SEND_COMPLETIONS = (1<<2), - /*IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS = (1<<3),*/ - IB_MAD_SNOOP_RECVS = (1<<4) - /*IB_MAD_SNOOP_RMPP_RECVS = (1<<5),*/ - /*IB_MAD_SNOOP_REDIRECTED_QPS = (1<<6)*/ -}; - -/** - * ib_register_mad_snoop - Register to snoop sent and received MADs. - * @device: The device to register with. - * @port_num: The port on the specified device to use. - * @qp_type: Specifies which QP traffic to snoop. Must be either - * IB_QPT_SMI or IB_QPT_GSI. - * @mad_snoop_flags: Specifies information where snooping occurs. - * @send_handler: The callback routine invoked for a snooped send. - * @recv_handler: The callback routine invoked for a snooped receive. - * @context: User specified context associated with the registration. - */ -struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, - u8 port_num, - enum ib_qp_type qp_type, - int mad_snoop_flags, - ib_mad_snoop_handler snoop_handler, - ib_mad_recv_handler recv_handler, - void *context); - /** * ib_unregister_mad_agent - Unregisters a client from using MAD services. * @mad_agent: Corresponding MAD registration request to deregister. -- cgit v1.2.3 From 11a0ae4c4bff9b2a471b54dbe910fc0f60e58e62 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 21 Apr 2020 20:24:40 +0300 Subject: RDMA: Allow ib_client's to fail when add() is called When a client is added it isn't allowed to fail, but all the client's have various failure paths within their add routines. This creates the very fringe condition where the client was added, failed during add and didn't set the client_data. The core code will then still call other client_data centric ops like remove(), rename(), get_nl_info(), and get_net_dev_by_params() with NULL client_data - which is confusing and unexpected. If the add() callback fails, then do not call any more client ops for the device, even remove. Remove all the now redundant checks for NULL client_data in ops callbacks. Update all the add() callbacks to return error codes appropriately. EOPNOTSUPP is used for cases where the ULP does not support the ib_device - eg because it only works with IB. Link: https://lore.kernel.org/r/20200421172440.387069-1-leon@kernel.org Signed-off-by: Leon Romanovsky Acked-by: Ursula Braun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 24 ++++++++++++++---------- drivers/infiniband/core/cma.c | 23 ++++++++++++----------- drivers/infiniband/core/device.c | 16 ++++++++++++++-- drivers/infiniband/core/mad.c | 17 +++++++++++++---- drivers/infiniband/core/multicast.c | 12 +++++------- drivers/infiniband/core/sa_query.c | 22 ++++++++++++---------- drivers/infiniband/core/user_mad.c | 22 ++++++++++++---------- drivers/infiniband/core/uverbs_main.c | 24 ++++++++++++------------ drivers/infiniband/ulp/ipoib/ipoib_main.c | 15 +++++---------- drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c | 12 +++++------- drivers/infiniband/ulp/srp/ib_srp.c | 21 ++++++++++----------- drivers/infiniband/ulp/srpt/ib_srpt.c | 25 ++++++++++--------------- include/rdma/ib_verbs.h | 2 +- net/rds/ib.c | 21 +++++++++++++-------- net/smc/smc_ib.c | 10 ++++------ 15 files changed, 142 insertions(+), 124 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 4794113ecd59..68e1a9bba027 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -81,7 +81,7 @@ const char *__attribute_const__ ibcm_reject_msg(int reason) EXPORT_SYMBOL(ibcm_reject_msg); struct cm_id_private; -static void cm_add_one(struct ib_device *device); +static int cm_add_one(struct ib_device *device); static void cm_remove_one(struct ib_device *device, void *client_data); static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, struct ib_cm_sidr_rep_param *param); @@ -4382,7 +4382,7 @@ static void cm_remove_port_fs(struct cm_port *port) } -static void cm_add_one(struct ib_device *ib_device) +static int cm_add_one(struct ib_device *ib_device) { struct cm_device *cm_dev; struct cm_port *port; @@ -4401,7 +4401,7 @@ static void cm_add_one(struct ib_device *ib_device) cm_dev = kzalloc(struct_size(cm_dev, port, ib_device->phys_port_cnt), GFP_KERNEL); if (!cm_dev) - return; + return -ENOMEM; cm_dev->ib_device = ib_device; cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; @@ -4413,8 +4413,10 @@ static void cm_add_one(struct ib_device *ib_device) continue; port = kzalloc(sizeof *port, GFP_KERNEL); - if (!port) + if (!port) { + ret = -ENOMEM; goto error1; + } cm_dev->port[i-1] = port; port->cm_dev = cm_dev; @@ -4435,8 +4437,10 @@ static void cm_add_one(struct ib_device *ib_device) cm_recv_handler, port, 0); - if (IS_ERR(port->mad_agent)) + if (IS_ERR(port->mad_agent)) { + ret = PTR_ERR(port->mad_agent); goto error2; + } ret = ib_modify_port(ib_device, i, 0, &port_modify); if (ret) @@ -4445,15 +4449,17 @@ static void cm_add_one(struct ib_device *ib_device) count++; } - if (!count) + if (!count) { + ret = -EOPNOTSUPP; goto free; + } ib_set_client_data(ib_device, &cm_client, cm_dev); write_lock_irqsave(&cm.device_lock, flags); list_add_tail(&cm_dev->list, &cm.device_list); write_unlock_irqrestore(&cm.device_lock, flags); - return; + return 0; error3: ib_unregister_mad_agent(port->mad_agent); @@ -4475,6 +4481,7 @@ error1: } free: kfree(cm_dev); + return ret; } static void cm_remove_one(struct ib_device *ib_device, void *client_data) @@ -4489,9 +4496,6 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) unsigned long flags; int i; - if (!cm_dev) - return; - write_lock_irqsave(&cm.device_lock, flags); list_del(&cm_dev->list); write_unlock_irqrestore(&cm.device_lock, flags); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 6406a597dfb6..e8d99b71f44a 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -153,7 +153,7 @@ struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) } EXPORT_SYMBOL(rdma_res_to_id); -static void cma_add_one(struct ib_device *device); +static int cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); static struct ib_client cma_client = { @@ -4638,29 +4638,34 @@ static struct notifier_block cma_nb = { .notifier_call = cma_netdev_callback }; -static void cma_add_one(struct ib_device *device) +static int cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; struct rdma_id_private *id_priv; unsigned int i; unsigned long supported_gids = 0; + int ret; cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); if (!cma_dev) - return; + return -ENOMEM; cma_dev->device = device; cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_gid_type), GFP_KERNEL); - if (!cma_dev->default_gid_type) + if (!cma_dev->default_gid_type) { + ret = -ENOMEM; goto free_cma_dev; + } cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt, sizeof(*cma_dev->default_roce_tos), GFP_KERNEL); - if (!cma_dev->default_roce_tos) + if (!cma_dev->default_roce_tos) { + ret = -ENOMEM; goto free_gid_type; + } rdma_for_each_port (device, i) { supported_gids = roce_gid_type_mask_support(device, i); @@ -4686,15 +4691,14 @@ static void cma_add_one(struct ib_device *device) mutex_unlock(&lock); trace_cm_add_one(device); - return; + return 0; free_gid_type: kfree(cma_dev->default_gid_type); free_cma_dev: kfree(cma_dev); - - return; + return ret; } static int cma_remove_id_dev(struct rdma_id_private *id_priv) @@ -4756,9 +4760,6 @@ static void cma_remove_one(struct ib_device *device, void *client_data) trace_cm_remove_one(device); - if (!cma_dev) - return; - mutex_lock(&lock); list_del(&cma_dev->list); mutex_unlock(&lock); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index d0b3d35ad3e4..d9f565a779df 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -677,8 +677,20 @@ static int add_client_context(struct ib_device *device, if (ret) goto out; downgrade_write(&device->client_data_rwsem); - if (client->add) - client->add(device); + if (client->add) { + if (client->add(device)) { + /* + * If a client fails to add then the error code is + * ignored, but we won't call any more ops on this + * client. + */ + xa_erase(&device->client_data, client->client_id); + up_read(&device->client_data_rwsem); + ib_device_put(device); + ib_client_put(client); + return 0; + } + } /* Readers shall not see a client until add has been completed */ xa_set_mark(&device->client_data, client->client_id, diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index e02b5c4fdf09..186e0d652e8b 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -3076,9 +3076,11 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) return 0; } -static void ib_mad_init_device(struct ib_device *device) +static int ib_mad_init_device(struct ib_device *device) { int start, i; + unsigned int count = 0; + int ret; start = rdma_start_port(device); @@ -3086,17 +3088,23 @@ static void ib_mad_init_device(struct ib_device *device) if (!rdma_cap_ib_mad(device, i)) continue; - if (ib_mad_port_open(device, i)) { + ret = ib_mad_port_open(device, i); + if (ret) { dev_err(&device->dev, "Couldn't open port %d\n", i); goto error; } - if (ib_agent_port_open(device, i)) { + ret = ib_agent_port_open(device, i); + if (ret) { dev_err(&device->dev, "Couldn't open port %d for agents\n", i); goto error_agent; } + count++; } - return; + if (!count) + return -EOPNOTSUPP; + + return 0; error_agent: if (ib_mad_port_close(device, i)) @@ -3113,6 +3121,7 @@ error: if (ib_mad_port_close(device, i)) dev_err(&device->dev, "Couldn't close port %d\n", i); } + return ret; } static void ib_mad_remove_device(struct ib_device *device, void *client_data) diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 9c2d8b7f1af9..740f03ecc05d 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -42,7 +42,7 @@ #include #include "sa.h" -static void mcast_add_one(struct ib_device *device); +static int mcast_add_one(struct ib_device *device); static void mcast_remove_one(struct ib_device *device, void *client_data); static struct ib_client mcast_client = { @@ -815,7 +815,7 @@ static void mcast_event_handler(struct ib_event_handler *handler, } } -static void mcast_add_one(struct ib_device *device) +static int mcast_add_one(struct ib_device *device) { struct mcast_device *dev; struct mcast_port *port; @@ -825,7 +825,7 @@ static void mcast_add_one(struct ib_device *device) dev = kmalloc(struct_size(dev, port, device->phys_port_cnt), GFP_KERNEL); if (!dev) - return; + return -ENOMEM; dev->start_port = rdma_start_port(device); dev->end_port = rdma_end_port(device); @@ -845,7 +845,7 @@ static void mcast_add_one(struct ib_device *device) if (!count) { kfree(dev); - return; + return -EOPNOTSUPP; } dev->device = device; @@ -853,6 +853,7 @@ static void mcast_add_one(struct ib_device *device) INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler); ib_register_event_handler(&dev->event_handler); + return 0; } static void mcast_remove_one(struct ib_device *device, void *client_data) @@ -861,9 +862,6 @@ static void mcast_remove_one(struct ib_device *device, void *client_data) struct mcast_port *port; int i; - if (!dev) - return; - ib_unregister_event_handler(&dev->event_handler); flush_workqueue(mcast_wq); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 2dd326f2beed..5c878646ff62 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -174,7 +174,7 @@ static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = { }; -static void ib_sa_add_one(struct ib_device *device); +static int ib_sa_add_one(struct ib_device *device); static void ib_sa_remove_one(struct ib_device *device, void *client_data); static struct ib_client sa_client = { @@ -2322,18 +2322,19 @@ static void ib_sa_event(struct ib_event_handler *handler, } } -static void ib_sa_add_one(struct ib_device *device) +static int ib_sa_add_one(struct ib_device *device) { struct ib_sa_device *sa_dev; int s, e, i; int count = 0; + int ret; s = rdma_start_port(device); e = rdma_end_port(device); sa_dev = kzalloc(struct_size(sa_dev, port, e - s + 1), GFP_KERNEL); if (!sa_dev) - return; + return -ENOMEM; sa_dev->start_port = s; sa_dev->end_port = e; @@ -2353,8 +2354,10 @@ static void ib_sa_add_one(struct ib_device *device) ib_register_mad_agent(device, i + s, IB_QPT_GSI, NULL, 0, send_handler, recv_handler, sa_dev, 0); - if (IS_ERR(sa_dev->port[i].agent)) + if (IS_ERR(sa_dev->port[i].agent)) { + ret = PTR_ERR(sa_dev->port[i].agent); goto err; + } INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); INIT_DELAYED_WORK(&sa_dev->port[i].ib_cpi_work, @@ -2363,8 +2366,10 @@ static void ib_sa_add_one(struct ib_device *device) count++; } - if (!count) + if (!count) { + ret = -EOPNOTSUPP; goto free; + } ib_set_client_data(device, &sa_client, sa_dev); @@ -2383,7 +2388,7 @@ static void ib_sa_add_one(struct ib_device *device) update_sm_ah(&sa_dev->port[i].update_task); } - return; + return 0; err: while (--i >= 0) { @@ -2392,7 +2397,7 @@ err: } free: kfree(sa_dev); - return; + return ret; } static void ib_sa_remove_one(struct ib_device *device, void *client_data) @@ -2400,9 +2405,6 @@ static void ib_sa_remove_one(struct ib_device *device, void *client_data) struct ib_sa_device *sa_dev = client_data; int i; - if (!sa_dev) - return; - ib_unregister_event_handler(&sa_dev->event_handler); flush_workqueue(ib_wq); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index da229eab5903..b0d0b522cc76 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -142,7 +142,7 @@ static dev_t dynamic_issm_dev; static DEFINE_IDA(umad_ida); -static void ib_umad_add_one(struct ib_device *device); +static int ib_umad_add_one(struct ib_device *device); static void ib_umad_remove_one(struct ib_device *device, void *client_data); static void ib_umad_dev_free(struct kref *kref) @@ -1352,37 +1352,41 @@ static void ib_umad_kill_port(struct ib_umad_port *port) put_device(&port->dev); } -static void ib_umad_add_one(struct ib_device *device) +static int ib_umad_add_one(struct ib_device *device) { struct ib_umad_device *umad_dev; int s, e, i; int count = 0; + int ret; s = rdma_start_port(device); e = rdma_end_port(device); umad_dev = kzalloc(struct_size(umad_dev, ports, e - s + 1), GFP_KERNEL); if (!umad_dev) - return; + return -ENOMEM; kref_init(&umad_dev->kref); for (i = s; i <= e; ++i) { if (!rdma_cap_ib_mad(device, i)) continue; - if (ib_umad_init_port(device, i, umad_dev, - &umad_dev->ports[i - s])) + ret = ib_umad_init_port(device, i, umad_dev, + &umad_dev->ports[i - s]); + if (ret) goto err; count++; } - if (!count) + if (!count) { + ret = -EOPNOTSUPP; goto free; + } ib_set_client_data(device, &umad_client, umad_dev); - return; + return 0; err: while (--i >= s) { @@ -1394,6 +1398,7 @@ err: free: /* balances kref_init */ ib_umad_dev_put(umad_dev); + return ret; } static void ib_umad_remove_one(struct ib_device *device, void *client_data) @@ -1401,9 +1406,6 @@ static void ib_umad_remove_one(struct ib_device *device, void *client_data) struct ib_umad_device *umad_dev = client_data; unsigned int i; - if (!umad_dev) - return; - rdma_for_each_port (device, i) { if (rdma_cap_ib_mad(device, i)) ib_umad_kill_port( diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 8710a3427146..d52eb870533b 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -75,7 +75,7 @@ static dev_t dynamic_uverbs_dev; static struct class *uverbs_class; static DEFINE_IDA(uverbs_ida); -static void ib_uverbs_add_one(struct ib_device *device); +static int ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); /* @@ -1091,7 +1091,7 @@ static int ib_uverbs_create_uapi(struct ib_device *device, return 0; } -static void ib_uverbs_add_one(struct ib_device *device) +static int ib_uverbs_add_one(struct ib_device *device) { int devnum; dev_t base; @@ -1099,16 +1099,16 @@ static void ib_uverbs_add_one(struct ib_device *device) int ret; if (!device->ops.alloc_ucontext) - return; + return -EOPNOTSUPP; uverbs_dev = kzalloc(sizeof(*uverbs_dev), GFP_KERNEL); if (!uverbs_dev) - return; + return -ENOMEM; ret = init_srcu_struct(&uverbs_dev->disassociate_srcu); if (ret) { kfree(uverbs_dev); - return; + return -ENOMEM; } device_initialize(&uverbs_dev->dev); @@ -1128,15 +1128,18 @@ static void ib_uverbs_add_one(struct ib_device *device) devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1, GFP_KERNEL); - if (devnum < 0) + if (devnum < 0) { + ret = -ENOMEM; goto err; + } uverbs_dev->devnum = devnum; if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; else base = IB_UVERBS_BASE_DEV + devnum; - if (ib_uverbs_create_uapi(device, uverbs_dev)) + ret = ib_uverbs_create_uapi(device, uverbs_dev); + if (ret) goto err_uapi; uverbs_dev->dev.devt = base; @@ -1151,7 +1154,7 @@ static void ib_uverbs_add_one(struct ib_device *device) goto err_uapi; ib_set_client_data(device, &uverbs_client, uverbs_dev); - return; + return 0; err_uapi: ida_free(&uverbs_ida, devnum); @@ -1160,7 +1163,7 @@ err: ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); put_device(&uverbs_dev->dev); - return; + return ret; } static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, @@ -1203,9 +1206,6 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) struct ib_uverbs_device *uverbs_dev = client_data; int wait_clients = 1; - if (!uverbs_dev) - return; - cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); ida_free(&uverbs_ida, uverbs_dev->devnum); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 81b8227214f1..d4c6a97ce4c0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -86,7 +86,7 @@ struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; -static void ipoib_add_one(struct ib_device *device); +static int ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device, void *client_data); static void ipoib_neigh_reclaim(struct rcu_head *rp); static struct net_device *ipoib_get_net_dev_by_params( @@ -479,9 +479,6 @@ static struct net_device *ipoib_get_net_dev_by_params( if (ret) return NULL; - if (!dev_list) - return NULL; - /* See if we can find a unique device matching the L2 parameters */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, NULL, &net_dev); @@ -2514,7 +2511,7 @@ sysfs_failed: return ERR_PTR(-ENOMEM); } -static void ipoib_add_one(struct ib_device *device) +static int ipoib_add_one(struct ib_device *device) { struct list_head *dev_list; struct net_device *dev; @@ -2524,7 +2521,7 @@ static void ipoib_add_one(struct ib_device *device) dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL); if (!dev_list) - return; + return -ENOMEM; INIT_LIST_HEAD(dev_list); @@ -2541,10 +2538,11 @@ static void ipoib_add_one(struct ib_device *device) if (!count) { kfree(dev_list); - return; + return -EOPNOTSUPP; } ib_set_client_data(device, &ipoib_client, dev_list); + return 0; } static void ipoib_remove_one(struct ib_device *device, void *client_data) @@ -2552,9 +2550,6 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv; struct list_head *dev_list = client_data; - if (!dev_list) - return; - list_for_each_entry_safe(priv, tmp, dev_list, list) { LIST_HEAD(head); ipoib_parent_unregister_pre(priv->dev); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c index 6e8d650c17c7..874a8eb7638c 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -113,7 +113,7 @@ struct opa_vnic_vema_port { struct mutex lock; }; -static void opa_vnic_vema_add_one(struct ib_device *device); +static int opa_vnic_vema_add_one(struct ib_device *device); static void opa_vnic_vema_rem_one(struct ib_device *device, void *client_data); @@ -989,18 +989,18 @@ static void opa_vnic_ctrl_config_dev(struct opa_vnic_ctrl_port *cport, bool en) * * Allocate the vnic control port and initialize it. */ -static void opa_vnic_vema_add_one(struct ib_device *device) +static int opa_vnic_vema_add_one(struct ib_device *device) { struct opa_vnic_ctrl_port *cport; int rc, size = sizeof(*cport); if (!rdma_cap_opa_vnic(device)) - return; + return -EOPNOTSUPP; size += device->phys_port_cnt * sizeof(struct opa_vnic_vema_port); cport = kzalloc(size, GFP_KERNEL); if (!cport) - return; + return -ENOMEM; cport->num_ports = device->phys_port_cnt; cport->ibdev = device; @@ -1012,6 +1012,7 @@ static void opa_vnic_vema_add_one(struct ib_device *device) ib_set_client_data(device, &opa_vnic_client, cport); opa_vnic_ctrl_config_dev(cport, true); + return 0; } /** @@ -1026,9 +1027,6 @@ static void opa_vnic_vema_rem_one(struct ib_device *device, { struct opa_vnic_ctrl_port *cport = client_data; - if (!cport) - return; - c_info("removing VNIC client\n"); opa_vnic_ctrl_config_dev(cport, false); vema_unregister(cport); diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index cd1181c39ed2..00b4f88b113e 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -146,7 +146,7 @@ module_param(ch_count, uint, 0444); MODULE_PARM_DESC(ch_count, "Number of RDMA channels to use for communication with an SRP target. Using more than one channel improves performance if the HCA supports multiple completion vectors. The default value is the minimum of four times the number of online CPU sockets and the number of completion vectors supported by the HCA."); -static void srp_add_one(struct ib_device *device); +static int srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device, void *client_data); static void srp_rename_dev(struct ib_device *device, void *client_data); static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc); @@ -4132,7 +4132,7 @@ static void srp_rename_dev(struct ib_device *device, void *client_data) } } -static void srp_add_one(struct ib_device *device) +static int srp_add_one(struct ib_device *device) { struct srp_device *srp_dev; struct ib_device_attr *attr = &device->attrs; @@ -4144,7 +4144,7 @@ static void srp_add_one(struct ib_device *device) srp_dev = kzalloc(sizeof(*srp_dev), GFP_KERNEL); if (!srp_dev) - return; + return -ENOMEM; /* * Use the smallest page size supported by the HCA, down to a @@ -4197,8 +4197,12 @@ static void srp_add_one(struct ib_device *device) srp_dev->dev = device; srp_dev->pd = ib_alloc_pd(device, flags); - if (IS_ERR(srp_dev->pd)) - goto free_dev; + if (IS_ERR(srp_dev->pd)) { + int ret = PTR_ERR(srp_dev->pd); + + kfree(srp_dev); + return ret; + } if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) { srp_dev->global_rkey = srp_dev->pd->unsafe_global_rkey; @@ -4212,10 +4216,7 @@ static void srp_add_one(struct ib_device *device) } ib_set_client_data(device, &srp_client, srp_dev); - return; - -free_dev: - kfree(srp_dev); + return 0; } static void srp_remove_one(struct ib_device *device, void *client_data) @@ -4225,8 +4226,6 @@ static void srp_remove_one(struct ib_device *device, void *client_data) struct srp_target_port *target; srp_dev = client_data; - if (!srp_dev) - return; list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) { device_unregister(&host->dev); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 9d02d8088f1c..7ed38d1cb997 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -3101,7 +3101,7 @@ static int srpt_use_srq(struct srpt_device *sdev, bool use_srq) * srpt_add_one - InfiniBand device addition callback function * @device: Describes a HCA. */ -static void srpt_add_one(struct ib_device *device) +static int srpt_add_one(struct ib_device *device) { struct srpt_device *sdev; struct srpt_port *sport; @@ -3112,14 +3112,16 @@ static void srpt_add_one(struct ib_device *device) sdev = kzalloc(struct_size(sdev, port, device->phys_port_cnt), GFP_KERNEL); if (!sdev) - goto err; + return -ENOMEM; sdev->device = device; mutex_init(&sdev->sdev_mutex); sdev->pd = ib_alloc_pd(device, 0); - if (IS_ERR(sdev->pd)) + if (IS_ERR(sdev->pd)) { + ret = PTR_ERR(sdev->pd); goto free_dev; + } sdev->lkey = sdev->pd->local_dma_lkey; @@ -3135,6 +3137,7 @@ static void srpt_add_one(struct ib_device *device) if (IS_ERR(sdev->cm_id)) { pr_info("ib_create_cm_id() failed: %ld\n", PTR_ERR(sdev->cm_id)); + ret = PTR_ERR(sdev->cm_id); sdev->cm_id = NULL; if (!rdma_cm_id) goto err_ring; @@ -3179,7 +3182,8 @@ static void srpt_add_one(struct ib_device *device) mutex_init(&sport->port_gid_id.mutex); INIT_LIST_HEAD(&sport->port_gid_id.tpg_list); - if (srpt_refresh_port(sport)) { + ret = srpt_refresh_port(sport); + if (ret) { pr_err("MAD registration failed for %s-%d.\n", dev_name(&sdev->device->dev), i); goto err_event; @@ -3190,10 +3194,9 @@ static void srpt_add_one(struct ib_device *device) list_add_tail(&sdev->list, &srpt_dev_list); spin_unlock(&srpt_dev_lock); -out: ib_set_client_data(device, &srpt_client, sdev); pr_debug("added %s.\n", dev_name(&device->dev)); - return; + return 0; err_event: ib_unregister_event_handler(&sdev->event_handler); @@ -3205,10 +3208,8 @@ err_ring: ib_dealloc_pd(sdev->pd); free_dev: kfree(sdev); -err: - sdev = NULL; pr_info("%s(%s) failed.\n", __func__, dev_name(&device->dev)); - goto out; + return ret; } /** @@ -3221,12 +3222,6 @@ static void srpt_remove_one(struct ib_device *device, void *client_data) struct srpt_device *sdev = client_data; int i; - if (!sdev) { - pr_info("%s(%s): nothing to do.\n", __func__, - dev_name(&device->dev)); - return; - } - srpt_unregister_mad_agent(sdev); ib_unregister_event_handler(&sdev->event_handler); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 8d29f2f79da8..c3d715e2fc66 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2722,7 +2722,7 @@ struct ib_device { struct ib_client_nl_info; struct ib_client { const char *name; - void (*add) (struct ib_device *); + int (*add)(struct ib_device *ibdev); void (*remove)(struct ib_device *, void *client_data); void (*rename)(struct ib_device *dev, void *client_data); int (*get_nl_info)(struct ib_device *ibdev, void *client_data, diff --git a/net/rds/ib.c b/net/rds/ib.c index a792d8a3872a..90212ed3edf1 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -127,19 +127,20 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) queue_work(rds_wq, &rds_ibdev->free_work); } -static void rds_ib_add_one(struct ib_device *device) +static int rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; bool has_fr, has_fmr; + int ret; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) - return; + return -EOPNOTSUPP; rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, ibdev_to_node(device)); if (!rds_ibdev) - return; + return -ENOMEM; spin_lock_init(&rds_ibdev->spinlock); refcount_set(&rds_ibdev->refcount, 1); @@ -182,12 +183,14 @@ static void rds_ib_add_one(struct ib_device *device) if (!rds_ibdev->vector_load) { pr_err("RDS/IB: %s failed to allocate vector memory\n", __func__); + ret = -ENOMEM; goto put_dev; } rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device, 0); if (IS_ERR(rds_ibdev->pd)) { + ret = PTR_ERR(rds_ibdev->pd); rds_ibdev->pd = NULL; goto put_dev; } @@ -195,12 +198,15 @@ static void rds_ib_add_one(struct ib_device *device) device->dma_device, sizeof(struct rds_header), L1_CACHE_BYTES, 0); - if (!rds_ibdev->rid_hdrs_pool) + if (!rds_ibdev->rid_hdrs_pool) { + ret = -ENOMEM; goto put_dev; + } rds_ibdev->mr_1m_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_1M_POOL); if (IS_ERR(rds_ibdev->mr_1m_pool)) { + ret = PTR_ERR(rds_ibdev->mr_1m_pool); rds_ibdev->mr_1m_pool = NULL; goto put_dev; } @@ -208,6 +214,7 @@ static void rds_ib_add_one(struct ib_device *device) rds_ibdev->mr_8k_pool = rds_ib_create_mr_pool(rds_ibdev, RDS_IB_MR_8K_POOL); if (IS_ERR(rds_ibdev->mr_8k_pool)) { + ret = PTR_ERR(rds_ibdev->mr_8k_pool); rds_ibdev->mr_8k_pool = NULL; goto put_dev; } @@ -227,12 +234,13 @@ static void rds_ib_add_one(struct ib_device *device) refcount_inc(&rds_ibdev->refcount); ib_set_client_data(device, &rds_ib_client, rds_ibdev); - refcount_inc(&rds_ibdev->refcount); rds_ib_nodev_connect(); + return 0; put_dev: rds_ib_dev_put(rds_ibdev); + return ret; } /* @@ -274,9 +282,6 @@ static void rds_ib_remove_one(struct ib_device *device, void *client_data) { struct rds_ib_device *rds_ibdev = client_data; - if (!rds_ibdev) - return; - rds_ib_dev_shutdown(rds_ibdev); /* stop connection attempts from getting a reference to this device. */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index e7e7c3c6e94a..2fad5f3fe093 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -547,18 +547,18 @@ static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev) static struct ib_client smc_ib_client; /* callback function for ib_register_client() */ -static void smc_ib_add_dev(struct ib_device *ibdev) +static int smc_ib_add_dev(struct ib_device *ibdev) { struct smc_ib_device *smcibdev; u8 port_cnt; int i; if (ibdev->node_type != RDMA_NODE_IB_CA) - return; + return -EOPNOTSUPP; smcibdev = kzalloc(sizeof(*smcibdev), GFP_KERNEL); if (!smcibdev) - return; + return -ENOMEM; smcibdev->ibdev = ibdev; INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work); @@ -583,6 +583,7 @@ static void smc_ib_add_dev(struct ib_device *ibdev) smcibdev->pnetid[i]); } schedule_work(&smcibdev->port_event_work); + return 0; } /* callback function for ib_unregister_client() */ @@ -590,9 +591,6 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) { struct smc_ib_device *smcibdev = client_data; - if (!smcibdev || smcibdev->ibdev != ibdev) - return; - ib_set_client_data(ibdev, &smc_ib_client, NULL); spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ spin_unlock(&smc_ib_devices.lock); -- cgit v1.2.3 From d5665a21250efeeb73579a2f8d71ee1820f37952 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 4 May 2020 08:19:31 +0300 Subject: RDMA/core: Add hash functions to calculate RoCEv2 flowlabel and UDP source port Add two hash functions to distribute RoCE v2 UDP source and Flowlabel symmetrically. These are user visible API and any change in the implementation needs to be tested for inter-operability between old and new variant. Link: https://lore.kernel.org/r/20200504051935.269708-2-leon@kernel.org Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c3d715e2fc66..4c488cade70f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4709,4 +4709,48 @@ static inline struct ib_device *rdma_device_to_ibdev(struct device *device) bool rdma_dev_access_netns(const struct ib_device *device, const struct net *net); + +#define IB_ROCE_UDP_ENCAP_VALID_PORT_MIN (0xC000) +#define IB_GRH_FLOWLABEL_MASK (0x000FFFFF) + +/** + * rdma_flow_label_to_udp_sport - generate a RoCE v2 UDP src port value based + * on the flow_label + * + * This function will convert the 20 bit flow_label input to a valid RoCE v2 + * UDP src port 14 bit value. All RoCE V2 drivers should use this same + * convention. + */ +static inline u16 rdma_flow_label_to_udp_sport(u32 fl) +{ + u32 fl_low = fl & 0x03fff, fl_high = fl & 0xFC000; + + fl_low ^= fl_high >> 14; + return (u16)(fl_low | IB_ROCE_UDP_ENCAP_VALID_PORT_MIN); +} + +/** + * rdma_calc_flow_label - generate a RDMA symmetric flow label value based on + * local and remote qpn values + * + * This function folded the multiplication results of two qpns, 24 bit each, + * fields, and converts it to a 20 bit results. + * + * This function will create symmetric flow_label value based on the local + * and remote qpn values. this will allow both the requester and responder + * to calculate the same flow_label for a given connection. + * + * This helper function should be used by driver in case the upper layer + * provide a zero flow_label value. This is to improve entropy of RDMA + * traffic in the network. + */ +static inline u32 rdma_calc_flow_label(u32 lqpn, u32 rqpn) +{ + u64 v = (u64)lqpn * rqpn; + + v ^= v >> 20; + v ^= v >> 40; + + return (u32)(v & IB_GRH_FLOWLABEL_MASK); +} #endif /* IB_VERBS_H */ -- cgit v1.2.3 From 9611d53aa1600ba94a36cd7bfd6a95dbae76c8e6 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 4 May 2020 08:19:32 +0300 Subject: RDMA/core: Consider flow label when building skb Use rdma_flow_label_to_udp_sport to calculate the UDP source port of the RoCEV2 packet. Link: https://lore.kernel.org/r/20200504051935.269708-3-leon@kernel.org Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/lag.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/lag.c b/drivers/infiniband/core/lag.c index a29533626a7c..7063e41eaf26 100644 --- a/drivers/infiniband/core/lag.c +++ b/drivers/infiniband/core/lag.c @@ -34,7 +34,8 @@ static struct sk_buff *rdma_build_skb(struct ib_device *device, skb_push(skb, sizeof(struct udphdr)); skb_reset_transport_header(skb); uh = udp_hdr(skb); - uh->source = htons(0xC000); + uh->source = + htons(rdma_flow_label_to_udp_sport(ah_attr->grh.flow_label)); uh->dest = htons(ROCE_V2_UDP_DPORT); uh->len = htons(sizeof(struct udphdr)); @@ -114,7 +115,8 @@ struct net_device *rdma_lag_get_ah_roce_slave(struct ib_device *device, struct net_device *master; if (!(ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE && - ah_attr->grh.sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)) + ah_attr->grh.sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && + ah_attr->grh.flow_label)) return NULL; rcu_read_lock(); -- cgit v1.2.3 From 2b880b2e5e03e790a9b9fd7e3e8fcf7a36230a16 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 4 May 2020 08:19:33 +0300 Subject: RDMA/mlx5: Define RoCEv2 udp source port when set path Calculate and set UDP source port based on the flow label. If flow label is not defined in GRH then calculate it based on lqpn/rqpn. Link: https://lore.kernel.org/r/20200504051935.269708-4-leon@kernel.org Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 810bbd52daec..e624886bcf85 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3133,6 +3133,21 @@ static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, return err; } +static void mlx5_set_path_udp_sport(struct mlx5_qp_path *path, + const struct rdma_ah_attr *ah, + u32 lqpn, u32 rqpn) + +{ + u32 fl = ah->grh.flow_label; + u16 sport; + + if (!fl) + fl = rdma_calc_flow_label(lqpn, rqpn); + + sport = rdma_flow_label_to_udp_sport(fl); + path->udp_sport = cpu_to_be16(sport); +} + static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, const struct rdma_ah_attr *ah, struct mlx5_qp_path *path, u8 port, int attr_mask, @@ -3164,12 +3179,15 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, return -EINVAL; memcpy(path->rmac, ah->roce.dmac, sizeof(ah->roce.dmac)); - if (qp->ibqp.qp_type == IB_QPT_RC || - qp->ibqp.qp_type == IB_QPT_UC || - qp->ibqp.qp_type == IB_QPT_XRC_INI || - qp->ibqp.qp_type == IB_QPT_XRC_TGT) - path->udp_sport = - mlx5_get_roce_udp_sport(dev, ah->grh.sgid_attr); + if ((qp->ibqp.qp_type == IB_QPT_RC || + qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_XRC_INI || + qp->ibqp.qp_type == IB_QPT_XRC_TGT) && + (grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) && + (attr_mask & IB_QP_DEST_QPN)) + mlx5_set_path_udp_sport(path, ah, + qp->ibqp.qp_num, + attr->dest_qp_num); path->dci_cfi_prio_sl = (sl & 0x7) << 4; gid_type = ah->grh.sgid_attr->gid_type; if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) -- cgit v1.2.3 From f66534051936044728e2be9937eb408494ca4007 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 4 May 2020 08:19:34 +0300 Subject: RDMA/cma: Initialize the flow label of CM's route path record If flow label is not set by the user or it's not IPv4, initialize it with the cma src/dst based on the "Kernighan and Ritchie's hash function". Link: https://lore.kernel.org/r/20200504051935.269708-5-leon@kernel.org Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index e8d99b71f44a..432eec472164 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2909,6 +2909,24 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos) return 0; } +static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv) +{ + struct sockaddr_in6 *addr6; + u16 dport, sport; + u32 hash, fl; + + addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv); + fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK; + if ((cma_family(id_priv) != AF_INET6) || !fl) { + dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv))); + sport = be16_to_cpu(cma_port(cma_src_addr(id_priv))); + hash = (u32)sport * 31 + dport; + fl = hash & IB_GRH_FLOWLABEL_MASK; + } + + return cpu_to_be32(fl); +} + static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; @@ -2975,6 +2993,11 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err2; } + if (rdma_protocol_roce_udp_encap(id_priv->id.device, + id_priv->id.port_num)) + route->path_rec->flow_label = + cma_get_roce_udp_flow_label(id_priv); + cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); -- cgit v1.2.3 From 5ac55dfc6d92c12d5ef423cd16165eb0350f8f51 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 4 May 2020 08:19:35 +0300 Subject: RDMA/mlx5: Set UDP source port based on the grh.flow_label Calculate UDP source port based on the grh.flow_label. If grh.flow_label is not valid, we will use minimal supported UDP source port. Link: https://lore.kernel.org/r/20200504051935.269708-6-leon@kernel.org Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/ah.c | 21 +++++++++++++++++++-- drivers/infiniband/hw/mlx5/main.c | 4 ++-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 ++-- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index cc858f658567..59e5ec39b447 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -32,6 +32,24 @@ #include "mlx5_ib.h" +static __be16 mlx5_ah_get_udp_sport(const struct mlx5_ib_dev *dev, + const struct rdma_ah_attr *ah_attr) +{ + enum ib_gid_type gid_type = ah_attr->grh.sgid_attr->gid_type; + __be16 sport; + + if ((gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) && + (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) && + (ah_attr->grh.flow_label & IB_GRH_FLOWLABEL_MASK)) + sport = cpu_to_be16( + rdma_flow_label_to_udp_sport(ah_attr->grh.flow_label)); + else + sport = mlx5_get_roce_udp_sport_min(dev, + ah_attr->grh.sgid_attr); + + return sport; +} + static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, struct rdma_ah_init_attr *init_attr) { @@ -60,8 +78,7 @@ static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah, memcpy(ah->av.rmac, ah_attr->roce.dmac, sizeof(ah_attr->roce.dmac)); - ah->av.udp_sport = - mlx5_get_roce_udp_sport(dev, ah_attr->grh.sgid_attr); + ah->av.udp_sport = mlx5_ah_get_udp_sport(dev, ah_attr); ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0x7) << 1; if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) #define MLX5_ECN_ENABLED BIT(1) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e7fb290c9d8d..0b8cc219e085 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -629,8 +629,8 @@ static int mlx5_ib_del_gid(const struct ib_gid_attr *attr, attr->index, NULL, NULL); } -__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, - const struct ib_gid_attr *attr) +__be16 mlx5_get_roce_udp_sport_min(const struct mlx5_ib_dev *dev, + const struct ib_gid_attr *attr) { if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) return 0; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index f250753319d0..3041808773e6 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1356,8 +1356,8 @@ int mlx5_ib_get_vf_guid(struct ib_device *device, int vf, u8 port, int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, int type); -__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, - const struct ib_gid_attr *attr); +__be16 mlx5_get_roce_udp_sport_min(const struct mlx5_ib_dev *dev, + const struct ib_gid_attr *attr); void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num); void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num); -- cgit v1.2.3 From 9b2cf76c9f052987ae5c4ad450ebebdc7c5d7b87 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Tue, 28 Apr 2020 19:03:39 +0800 Subject: RDMA/hns: Optimize PBL buffer allocation process PBL table has its own implementation for multi-hop addressing currently, but for the hardware, all table's addressing use the same logic, there is no need to implement repeatedly. So optimize the PBL buffer allocation process by using the mtr's interfaces. Link: https://lore.kernel.org/r/1588071823-40200-2-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Reported-by: kbuild test robot Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 19 +- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 45 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 76 +-- drivers/infiniband/hw/hns/hns_roce_mr.c | 726 ++++++---------------------- 4 files changed, 197 insertions(+), 669 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index e1032cec2b12..1089db8ad0b5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -403,30 +403,17 @@ struct hns_roce_mw { struct hns_roce_mr { struct ib_mr ibmr; - struct ib_umem *umem; u64 iova; /* MR's virtual orignal addr */ u64 size; /* Address range of MR */ u32 key; /* Key of MR */ u32 pd; /* PD num of MR */ u32 access; /* Access permission of MR */ - u32 npages; int enabled; /* MR's active status */ int type; /* MR's register type */ - u64 *pbl_buf; /* MR's PBL space */ - dma_addr_t pbl_dma_addr; /* MR's PBL space PA */ - u32 pbl_size; /* PA number in the PBL */ - u64 pbl_ba; /* page table address */ - u32 l0_chunk_last_num; /* L0 last number */ - u32 l1_chunk_last_num; /* L1 last number */ - u64 **pbl_bt_l2; /* PBL BT L2 */ - u64 **pbl_bt_l1; /* PBL BT L1 */ - u64 *pbl_bt_l0; /* PBL BT L0 */ - dma_addr_t *pbl_l2_dma_addr; /* PBL BT L2 dma addr */ - dma_addr_t *pbl_l1_dma_addr; /* PBL BT L1 dma addr */ - dma_addr_t pbl_l0_dma_addr; /* PBL BT L0 dma addr */ - u32 pbl_ba_pg_sz; /* BT chunk page size */ - u32 pbl_buf_pg_sz; /* buf chunk page size */ u32 pbl_hop_num; /* multi-hop number */ + struct hns_roce_mtr pbl_mtr; + u32 npages; + dma_addr_t *page_list; }; struct hns_roce_mr_table { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 49775cda83dc..b4b98e818328 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -1099,7 +1099,6 @@ static int hns_roce_v1_dereg_mr(struct hns_roce_dev *hr_dev, struct completion comp; long end = HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS; unsigned long start = jiffies; - int npages; int ret = 0; priv = (struct hns_roce_v1_priv *)hr_dev->priv; @@ -1146,17 +1145,9 @@ free_mr: dev_dbg(dev, "Free mr 0x%x use 0x%x us.\n", mr->key, jiffies_to_usecs(jiffies) - jiffies_to_usecs(start)); - if (mr->size != ~0ULL) { - npages = ib_umem_page_count(mr->umem); - dma_free_coherent(dev, npages * 8, mr->pbl_buf, - mr->pbl_dma_addr); - } - hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap, key_to_hw_index(mr->key), 0); - - ib_umem_release(mr->umem); - + hns_roce_mtr_destroy(hr_dev, &mr->pbl_mtr); kfree(mr); return ret; @@ -1826,9 +1817,12 @@ static void hns_roce_v1_set_mtu(struct hns_roce_dev *hr_dev, u8 phy_port, static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, unsigned long mtpt_idx) { + struct hns_roce_dev *hr_dev = to_hr_dev(mr->ibmr.device); + u64 pages[HNS_ROCE_MAX_INNER_MTPT_NUM] = { 0 }; + struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_v1_mpt_entry *mpt_entry; - struct sg_dma_page_iter sg_iter; - u64 *pages; + dma_addr_t pbl_ba; + int count; int i; /* MPT filled into mailbox buf */ @@ -1878,22 +1872,15 @@ static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, if (mr->type == MR_TYPE_DMA) return 0; - pages = (u64 *) __get_free_page(GFP_KERNEL); - if (!pages) - return -ENOMEM; - - i = 0; - for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) { - pages[i] = ((u64)sg_page_iter_dma_address(&sg_iter)) >> 12; - - /* Directly record to MTPT table firstly 7 entry */ - if (i >= HNS_ROCE_MAX_INNER_MTPT_NUM) - break; - i++; + count = hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, pages, + ARRAY_SIZE(pages), &pbl_ba); + if (count < 1) { + ibdev_err(ibdev, "failed to find PBL mtr, count = %d.", count); + return -ENOBUFS; } /* Register user mr */ - for (i = 0; i < HNS_ROCE_MAX_INNER_MTPT_NUM; i++) { + for (i = 0; i < count; i++) { switch (i) { case 0: mpt_entry->pa0_l = cpu_to_le32((u32)(pages[i])); @@ -1959,13 +1946,9 @@ static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, } } - free_page((unsigned long) pages); - - mpt_entry->pbl_addr_l = cpu_to_le32((u32)(mr->pbl_dma_addr)); - + mpt_entry->pbl_addr_l = cpu_to_le32(pbl_ba); roce_set_field(mpt_entry->mpt_byte_12, MPT_BYTE_12_PBL_ADDR_H_M, - MPT_BYTE_12_PBL_ADDR_H_S, - ((u32)(mr->pbl_dma_addr >> 32))); + MPT_BYTE_12_PBL_ADDR_H_S, upper_32_bits(pbl_ba)); return 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 2a8c3893bccd..e699932e6926 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -95,6 +95,7 @@ static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, { struct hns_roce_mr *mr = to_hr_mr(wr->mr); struct hns_roce_wqe_frmr_seg *fseg = wqe; + u64 pbl_ba; /* use ib_access_flags */ roce_set_bit(rc_sq_wqe->byte_4, V2_RC_FRMR_WQE_BYTE_4_BIND_EN_S, @@ -109,19 +110,20 @@ static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, wr->access & IB_ACCESS_LOCAL_WRITE ? 1 : 0); /* Data structure reuse may lead to confusion */ - rc_sq_wqe->msg_len = cpu_to_le32(mr->pbl_ba & 0xffffffff); - rc_sq_wqe->inv_key = cpu_to_le32(mr->pbl_ba >> 32); + pbl_ba = mr->pbl_mtr.hem_cfg.root_ba; + rc_sq_wqe->msg_len = cpu_to_le32(lower_32_bits(pbl_ba)); + rc_sq_wqe->inv_key = cpu_to_le32(upper_32_bits(pbl_ba)); rc_sq_wqe->byte_16 = cpu_to_le32(wr->mr->length & 0xffffffff); rc_sq_wqe->byte_20 = cpu_to_le32(wr->mr->length >> 32); rc_sq_wqe->rkey = cpu_to_le32(wr->key); rc_sq_wqe->va = cpu_to_le64(wr->mr->iova); - fseg->pbl_size = cpu_to_le32(mr->pbl_size); + fseg->pbl_size = cpu_to_le32(mr->npages); roce_set_field(fseg->mode_buf_pg_sz, V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_M, V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_S, - mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET); + to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift)); roce_set_bit(fseg->mode_buf_pg_sz, V2_RC_FRMR_WQE_BYTE_40_BLK_MODE_S, 0); } @@ -2439,32 +2441,30 @@ static int hns_roce_v2_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry, struct hns_roce_mr *mr) { - struct sg_dma_page_iter sg_iter; - u64 page_addr; - u64 *pages; - int i; + struct hns_roce_dev *hr_dev = to_hr_dev(mr->ibmr.device); + u64 pages[HNS_ROCE_V2_MAX_INNER_MTPT_NUM] = { 0 }; + struct ib_device *ibdev = &hr_dev->ib_dev; + dma_addr_t pbl_ba; + int i, count; - mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size); - mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3)); - roce_set_field(mpt_entry->byte_48_mode_ba, - V2_MPT_BYTE_48_PBL_BA_H_M, V2_MPT_BYTE_48_PBL_BA_H_S, - upper_32_bits(mr->pbl_ba >> 3)); + count = hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, pages, + ARRAY_SIZE(pages), &pbl_ba); + if (count < 1) { + ibdev_err(ibdev, "failed to find PBL mtr, count = %d.\n", + count); + return -ENOBUFS; + } - pages = (u64 *)__get_free_page(GFP_KERNEL); - if (!pages) - return -ENOMEM; + /* Aligned to the hardware address access unit */ + for (i = 0; i < count; i++) + pages[i] >>= 6; - i = 0; - for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) { - page_addr = sg_page_iter_dma_address(&sg_iter); - pages[i] = page_addr >> 6; + mpt_entry->pbl_size = cpu_to_le32(mr->npages); + mpt_entry->pbl_ba_l = cpu_to_le32(pbl_ba >> 3); + roce_set_field(mpt_entry->byte_48_mode_ba, + V2_MPT_BYTE_48_PBL_BA_H_M, V2_MPT_BYTE_48_PBL_BA_H_S, + upper_32_bits(pbl_ba >> 3)); - /* Record the first 2 entry directly to MTPT table */ - if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1) - goto found; - i++; - } -found: mpt_entry->pa0_l = cpu_to_le32(lower_32_bits(pages[0])); roce_set_field(mpt_entry->byte_56_pa0_h, V2_MPT_BYTE_56_PA0_H_M, V2_MPT_BYTE_56_PA0_H_S, upper_32_bits(pages[0])); @@ -2475,9 +2475,7 @@ found: roce_set_field(mpt_entry->byte_64_buf_pa1, V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M, V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, - mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET); - - free_page((unsigned long)pages); + to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift)); return 0; } @@ -2499,7 +2497,7 @@ static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_BA_PG_SZ_M, V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, - mr->pbl_ba_pg_sz + PG_SHIFT_OFFSET); + to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.ba_pg_shift)); roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, V2_MPT_BYTE_4_PD_S, mr->pd); @@ -2585,11 +2583,19 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev, static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr) { + struct hns_roce_dev *hr_dev = to_hr_dev(mr->ibmr.device); + struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_v2_mpt_entry *mpt_entry; + dma_addr_t pbl_ba = 0; mpt_entry = mb_buf; memset(mpt_entry, 0, sizeof(*mpt_entry)); + if (hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, NULL, 0, &pbl_ba) < 0) { + ibdev_err(ibdev, "failed to find frmr mtr.\n"); + return -ENOBUFS; + } + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M, V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE); roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_HOP_NUM_M, @@ -2597,7 +2603,7 @@ static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr) roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_BA_PG_SZ_M, V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, - mr->pbl_ba_pg_sz + PG_SHIFT_OFFSET); + to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.ba_pg_shift)); roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, V2_MPT_BYTE_4_PD_S, mr->pd); @@ -2610,17 +2616,17 @@ static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr) roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_MR_MW_S, 0); roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BPD_S, 1); - mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size); + mpt_entry->pbl_size = cpu_to_le32(mr->npages); - mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3)); + mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(pbl_ba >> 3)); roce_set_field(mpt_entry->byte_48_mode_ba, V2_MPT_BYTE_48_PBL_BA_H_M, V2_MPT_BYTE_48_PBL_BA_H_S, - upper_32_bits(mr->pbl_ba >> 3)); + upper_32_bits(pbl_ba >> 3)); roce_set_field(mpt_entry->byte_64_buf_pa1, V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M, V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, - mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET); + to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift)); return 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 99e3876e712c..c65f1f682819 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -293,418 +293,89 @@ void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev, struct hns_roce_mtt *mtt) } } -static void hns_roce_loop_free(struct hns_roce_dev *hr_dev, - struct hns_roce_mr *mr, int err_loop_index, - int loop_i, int loop_j) +static int alloc_mr_key(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, + u32 pd, u64 iova, u64 size, u32 access) { - struct device *dev = hr_dev->dev; - u32 mhop_num; - u32 pbl_bt_sz; - u64 bt_idx; - int i, j; - - pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); - mhop_num = hr_dev->caps.pbl_hop_num; - - i = loop_i; - if (mhop_num == 3 && err_loop_index == 2) { - for (; i >= 0; i--) { - dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], - mr->pbl_l1_dma_addr[i]); - - for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) { - if (i == loop_i && j >= loop_j) - break; - - bt_idx = i * pbl_bt_sz / BA_BYTE_LEN + j; - dma_free_coherent(dev, pbl_bt_sz, - mr->pbl_bt_l2[bt_idx], - mr->pbl_l2_dma_addr[bt_idx]); - } - } - } else if (mhop_num == 3 && err_loop_index == 1) { - for (i -= 1; i >= 0; i--) { - dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], - mr->pbl_l1_dma_addr[i]); - - for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) { - bt_idx = i * pbl_bt_sz / BA_BYTE_LEN + j; - dma_free_coherent(dev, pbl_bt_sz, - mr->pbl_bt_l2[bt_idx], - mr->pbl_l2_dma_addr[bt_idx]); - } - } - } else if (mhop_num == 2 && err_loop_index == 1) { - for (i -= 1; i >= 0; i--) - dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], - mr->pbl_l1_dma_addr[i]); - } else { - dev_warn(dev, "not support: mhop_num=%d, err_loop_index=%d.", - mhop_num, err_loop_index); - return; - } - - dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l0, mr->pbl_l0_dma_addr); - mr->pbl_bt_l0 = NULL; - mr->pbl_l0_dma_addr = 0; -} -static int pbl_1hop_alloc(struct hns_roce_dev *hr_dev, int npages, - struct hns_roce_mr *mr, u32 pbl_bt_sz) -{ - struct device *dev = hr_dev->dev; + struct ib_device *ibdev = &hr_dev->ib_dev; + unsigned long obj = 0; + int err; - if (npages > pbl_bt_sz / 8) { - dev_err(dev, "npages %d is larger than buf_pg_sz!", - npages); - return -EINVAL; - } - mr->pbl_buf = dma_alloc_coherent(dev, npages * 8, - &(mr->pbl_dma_addr), - GFP_KERNEL); - if (!mr->pbl_buf) + /* Allocate a key for mr from mr_table */ + err = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &obj); + if (err) { + ibdev_err(ibdev, + "failed to alloc bitmap for MR key, ret = %d.\n", + err); return -ENOMEM; - - mr->pbl_size = npages; - mr->pbl_ba = mr->pbl_dma_addr; - mr->pbl_hop_num = 1; - mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz; - mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz; - return 0; - -} - - -static int pbl_2hop_alloc(struct hns_roce_dev *hr_dev, int npages, - struct hns_roce_mr *mr, u32 pbl_bt_sz) -{ - struct device *dev = hr_dev->dev; - int npages_allocated; - u64 pbl_last_bt_num; - u64 pbl_bt_cnt = 0; - u64 size; - int i; - - pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8); - - /* alloc L1 BT */ - for (i = 0; i < pbl_bt_sz / 8; i++) { - if (pbl_bt_cnt + 1 < pbl_last_bt_num) { - size = pbl_bt_sz; - } else { - npages_allocated = i * (pbl_bt_sz / 8); - size = (npages - npages_allocated) * 8; - } - mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, size, - &(mr->pbl_l1_dma_addr[i]), - GFP_KERNEL); - if (!mr->pbl_bt_l1[i]) { - hns_roce_loop_free(hr_dev, mr, 1, i, 0); - return -ENOMEM; - } - - *(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i]; - - pbl_bt_cnt++; - if (pbl_bt_cnt >= pbl_last_bt_num) - break; } - mr->l0_chunk_last_num = i + 1; - - return 0; -} - -static int pbl_3hop_alloc(struct hns_roce_dev *hr_dev, int npages, - struct hns_roce_mr *mr, u32 pbl_bt_sz) -{ - struct device *dev = hr_dev->dev; - int mr_alloc_done = 0; - int npages_allocated; - u64 pbl_last_bt_num; - u64 pbl_bt_cnt = 0; - u64 bt_idx; - u64 size; - int i; - int j = 0; - - pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8); - - mr->pbl_l2_dma_addr = kcalloc(pbl_last_bt_num, - sizeof(*mr->pbl_l2_dma_addr), - GFP_KERNEL); - if (!mr->pbl_l2_dma_addr) - return -ENOMEM; - - mr->pbl_bt_l2 = kcalloc(pbl_last_bt_num, - sizeof(*mr->pbl_bt_l2), - GFP_KERNEL); - if (!mr->pbl_bt_l2) - goto err_kcalloc_bt_l2; - - /* alloc L1, L2 BT */ - for (i = 0; i < pbl_bt_sz / 8; i++) { - mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, pbl_bt_sz, - &(mr->pbl_l1_dma_addr[i]), - GFP_KERNEL); - if (!mr->pbl_bt_l1[i]) { - hns_roce_loop_free(hr_dev, mr, 1, i, 0); - goto err_dma_alloc_l0; - } - - *(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i]; - - for (j = 0; j < pbl_bt_sz / 8; j++) { - bt_idx = i * pbl_bt_sz / 8 + j; - - if (pbl_bt_cnt + 1 < pbl_last_bt_num) { - size = pbl_bt_sz; - } else { - npages_allocated = bt_idx * - (pbl_bt_sz / 8); - size = (npages - npages_allocated) * 8; - } - mr->pbl_bt_l2[bt_idx] = dma_alloc_coherent( - dev, size, - &(mr->pbl_l2_dma_addr[bt_idx]), - GFP_KERNEL); - if (!mr->pbl_bt_l2[bt_idx]) { - hns_roce_loop_free(hr_dev, mr, 2, i, j); - goto err_dma_alloc_l0; - } - - *(mr->pbl_bt_l1[i] + j) = - mr->pbl_l2_dma_addr[bt_idx]; - - pbl_bt_cnt++; - if (pbl_bt_cnt >= pbl_last_bt_num) { - mr_alloc_done = 1; - break; - } - } + mr->iova = iova; /* MR va starting addr */ + mr->size = size; /* MR addr range */ + mr->pd = pd; /* MR num */ + mr->access = access; /* MR access permit */ + mr->enabled = 0; /* MR active status */ + mr->key = hw_index_to_key(obj); /* MR key */ - if (mr_alloc_done) - break; + err = hns_roce_table_get(hr_dev, &hr_dev->mr_table.mtpt_table, obj); + if (err) { + ibdev_err(ibdev, "failed to alloc mtpt, ret = %d.\n", err); + goto err_free_bitmap; } - mr->l0_chunk_last_num = i + 1; - mr->l1_chunk_last_num = j + 1; - - return 0; - -err_dma_alloc_l0: - kfree(mr->pbl_bt_l2); - mr->pbl_bt_l2 = NULL; - -err_kcalloc_bt_l2: - kfree(mr->pbl_l2_dma_addr); - mr->pbl_l2_dma_addr = NULL; - - return -ENOMEM; +err_free_bitmap: + hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap, obj, BITMAP_NO_RR); + return err; } - -/* PBL multi hop addressing */ -static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages, - struct hns_roce_mr *mr) +static void free_mr_key(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr) { - struct device *dev = hr_dev->dev; - u32 pbl_bt_sz; - u32 mhop_num; + unsigned long obj = key_to_hw_index(mr->key); - mhop_num = (mr->type == MR_TYPE_FRMR ? 1 : hr_dev->caps.pbl_hop_num); - pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); - - if (mhop_num == HNS_ROCE_HOP_NUM_0) - return 0; - - if (mhop_num == 1) - return pbl_1hop_alloc(hr_dev, npages, mr, pbl_bt_sz); - - mr->pbl_l1_dma_addr = kcalloc(pbl_bt_sz / 8, - sizeof(*mr->pbl_l1_dma_addr), - GFP_KERNEL); - if (!mr->pbl_l1_dma_addr) - return -ENOMEM; - - mr->pbl_bt_l1 = kcalloc(pbl_bt_sz / 8, sizeof(*mr->pbl_bt_l1), - GFP_KERNEL); - if (!mr->pbl_bt_l1) - goto err_kcalloc_bt_l1; - - /* alloc L0 BT */ - mr->pbl_bt_l0 = dma_alloc_coherent(dev, pbl_bt_sz, - &(mr->pbl_l0_dma_addr), - GFP_KERNEL); - if (!mr->pbl_bt_l0) - goto err_kcalloc_l2_dma; - - if (mhop_num == 2) { - if (pbl_2hop_alloc(hr_dev, npages, mr, pbl_bt_sz)) - goto err_kcalloc_l2_dma; - } - - if (mhop_num == 3) { - if (pbl_3hop_alloc(hr_dev, npages, mr, pbl_bt_sz)) - goto err_kcalloc_l2_dma; - } - - - mr->pbl_size = npages; - mr->pbl_ba = mr->pbl_l0_dma_addr; - mr->pbl_hop_num = hr_dev->caps.pbl_hop_num; - mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz; - mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz; - - return 0; - -err_kcalloc_l2_dma: - kfree(mr->pbl_bt_l1); - mr->pbl_bt_l1 = NULL; - -err_kcalloc_bt_l1: - kfree(mr->pbl_l1_dma_addr); - mr->pbl_l1_dma_addr = NULL; - - return -ENOMEM; + hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table, obj); + hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap, obj, BITMAP_NO_RR); } -static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova, - u64 size, u32 access, int npages, - struct hns_roce_mr *mr) +static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, + size_t length, struct ib_udata *udata, u64 start, + int access) { - struct device *dev = hr_dev->dev; - unsigned long index = 0; - int ret; - - /* Allocate a key for mr from mr_table */ - ret = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &index); - if (ret) - return -ENOMEM; + struct ib_device *ibdev = &hr_dev->ib_dev; + bool is_fast = mr->type == MR_TYPE_FRMR; + struct hns_roce_buf_attr buf_attr = {}; + int err; - mr->iova = iova; /* MR va starting addr */ - mr->size = size; /* MR addr range */ - mr->pd = pd; /* MR num */ - mr->access = access; /* MR access permit */ - mr->enabled = 0; /* MR active status */ - mr->key = hw_index_to_key(index); /* MR key */ - - if (size == ~0ull) { - mr->pbl_buf = NULL; - mr->pbl_dma_addr = 0; - /* PBL multi-hop addressing parameters */ - mr->pbl_bt_l2 = NULL; - mr->pbl_bt_l1 = NULL; - mr->pbl_bt_l0 = NULL; - mr->pbl_l2_dma_addr = NULL; - mr->pbl_l1_dma_addr = NULL; - mr->pbl_l0_dma_addr = 0; - } else { - if (!hr_dev->caps.pbl_hop_num) { - mr->pbl_buf = dma_alloc_coherent(dev, - npages * BA_BYTE_LEN, - &(mr->pbl_dma_addr), - GFP_KERNEL); - if (!mr->pbl_buf) - return -ENOMEM; - } else { - ret = hns_roce_mhop_alloc(hr_dev, npages, mr); - } - } + mr->pbl_hop_num = is_fast ? 1 : hr_dev->caps.pbl_hop_num; + buf_attr.page_shift = is_fast ? PAGE_SHIFT : + hr_dev->caps.pbl_buf_pg_sz + PAGE_ADDR_SHIFT; + buf_attr.region[0].size = length; + buf_attr.region[0].hopnum = mr->pbl_hop_num; + buf_attr.region_count = 1; + buf_attr.fixed_page = true; + buf_attr.user_access = access; + /* fast MR's buffer is alloced before mapping, not at creation */ + buf_attr.mtt_only = is_fast; + + err = hns_roce_mtr_create(hr_dev, &mr->pbl_mtr, &buf_attr, + hr_dev->caps.pbl_ba_pg_sz + PAGE_ADDR_SHIFT, + udata, start); + if (err) + ibdev_err(ibdev, "failed to alloc pbl mtr, ret = %d.\n", err); + else + mr->npages = mr->pbl_mtr.hem_cfg.buf_pg_count; - return ret; + return err; } -static void hns_roce_mhop_free(struct hns_roce_dev *hr_dev, - struct hns_roce_mr *mr) +static void free_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr) { - struct device *dev = hr_dev->dev; - int npages_allocated; - int npages; - int i, j; - u32 pbl_bt_sz; - u32 mhop_num; - u64 bt_idx; - - npages = mr->pbl_size; - pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); - mhop_num = (mr->type == MR_TYPE_FRMR) ? 1 : hr_dev->caps.pbl_hop_num; - - if (mhop_num == HNS_ROCE_HOP_NUM_0) - return; - - if (mhop_num == 1) { - dma_free_coherent(dev, (unsigned int)(npages * BA_BYTE_LEN), - mr->pbl_buf, mr->pbl_dma_addr); - return; - } - - dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l0, - mr->pbl_l0_dma_addr); - - if (mhop_num == 2) { - for (i = 0; i < mr->l0_chunk_last_num; i++) { - if (i == mr->l0_chunk_last_num - 1) { - npages_allocated = - i * (pbl_bt_sz / BA_BYTE_LEN); - - dma_free_coherent(dev, - (npages - npages_allocated) * BA_BYTE_LEN, - mr->pbl_bt_l1[i], - mr->pbl_l1_dma_addr[i]); - - break; - } - - dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], - mr->pbl_l1_dma_addr[i]); - } - } else if (mhop_num == 3) { - for (i = 0; i < mr->l0_chunk_last_num; i++) { - dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i], - mr->pbl_l1_dma_addr[i]); - - for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) { - bt_idx = i * (pbl_bt_sz / BA_BYTE_LEN) + j; - - if ((i == mr->l0_chunk_last_num - 1) - && j == mr->l1_chunk_last_num - 1) { - npages_allocated = bt_idx * - (pbl_bt_sz / BA_BYTE_LEN); - - dma_free_coherent(dev, - (npages - npages_allocated) * - BA_BYTE_LEN, - mr->pbl_bt_l2[bt_idx], - mr->pbl_l2_dma_addr[bt_idx]); - - break; - } - - dma_free_coherent(dev, pbl_bt_sz, - mr->pbl_bt_l2[bt_idx], - mr->pbl_l2_dma_addr[bt_idx]); - } - } - } - - kfree(mr->pbl_bt_l1); - kfree(mr->pbl_l1_dma_addr); - mr->pbl_bt_l1 = NULL; - mr->pbl_l1_dma_addr = NULL; - if (mhop_num == 3) { - kfree(mr->pbl_bt_l2); - kfree(mr->pbl_l2_dma_addr); - mr->pbl_bt_l2 = NULL; - mr->pbl_l2_dma_addr = NULL; - } + hns_roce_mtr_destroy(hr_dev, &mr->pbl_mtr); } static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr) { - struct device *dev = hr_dev->dev; - int npages = 0; + struct ib_device *ibdev = &hr_dev->ib_dev; int ret; if (mr->enabled) { @@ -712,27 +383,12 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, key_to_hw_index(mr->key) & (hr_dev->caps.num_mtpts - 1)); if (ret) - dev_warn(dev, "DESTROY_MPT failed (%d)\n", ret); - } - - if (mr->size != ~0ULL) { - if (mr->type == MR_TYPE_MR) - npages = ib_umem_page_count(mr->umem); - - if (!hr_dev->caps.pbl_hop_num) - dma_free_coherent(dev, - (unsigned int)(npages * BA_BYTE_LEN), - mr->pbl_buf, mr->pbl_dma_addr); - else - hns_roce_mhop_free(hr_dev, mr); + ibdev_warn(ibdev, "failed to destroy mpt, ret = %d.\n", + ret); } - if (mr->enabled) - hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table, - key_to_hw_index(mr->key)); - - hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap, - key_to_hw_index(mr->key), BITMAP_NO_RR); + free_mr_pbl(hr_dev, mr); + free_mr_key(hr_dev, mr); } static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, @@ -742,18 +398,12 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, unsigned long mtpt_idx = key_to_hw_index(mr->key); struct device *dev = hr_dev->dev; struct hns_roce_cmd_mailbox *mailbox; - struct hns_roce_mr_table *mr_table = &hr_dev->mr_table; - - /* Prepare HEM entry memory */ - ret = hns_roce_table_get(hr_dev, &mr_table->mtpt_table, mtpt_idx); - if (ret) - return ret; /* Allocate mailbox memory */ mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); if (IS_ERR(mailbox)) { ret = PTR_ERR(mailbox); - goto err_table; + return ret; } if (mr->type != MR_TYPE_FRMR) @@ -780,8 +430,6 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, err_page: hns_roce_free_cmd_mailbox(hr_dev, mailbox); -err_table: - hns_roce_table_put(hr_dev, &mr_table->mtpt_table, mtpt_idx); return ret; } @@ -982,18 +630,19 @@ void hns_roce_cleanup_mr_table(struct hns_roce_dev *hr_dev) struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc) { + struct hns_roce_dev *hr_dev = to_hr_dev(pd->device); struct hns_roce_mr *mr; int ret; - mr = kmalloc(sizeof(*mr), GFP_KERNEL); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (mr == NULL) return ERR_PTR(-ENOMEM); mr->type = MR_TYPE_DMA; /* Allocate memory region key */ - ret = hns_roce_mr_alloc(to_hr_dev(pd->device), to_hr_pd(pd)->pdn, 0, - ~0ULL, acc, 0, mr); + hns_roce_hem_list_init(&mr->pbl_mtr.hem_list); + ret = alloc_mr_key(hr_dev, mr, to_hr_pd(pd)->pdn, 0, 0, acc); if (ret) goto err_free; @@ -1002,12 +651,10 @@ struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc) goto err_mr; mr->ibmr.rkey = mr->ibmr.lkey = mr->key; - mr->umem = NULL; return &mr->ibmr; - err_mr: - hns_roce_mr_free(to_hr_dev(pd->device), mr); + free_mr_key(hr_dev, mr); err_free: kfree(mr); @@ -1085,120 +732,41 @@ out: return ret; } -static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev, - struct hns_roce_mr *mr, - struct ib_umem *umem) -{ - struct sg_dma_page_iter sg_iter; - int i = 0, j = 0; - u64 page_addr; - u32 pbl_bt_sz; - - if (hr_dev->caps.pbl_hop_num == HNS_ROCE_HOP_NUM_0) - return 0; - - pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); - for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { - page_addr = sg_page_iter_dma_address(&sg_iter); - if (!hr_dev->caps.pbl_hop_num) { - /* for hip06, page addr is aligned to 4K */ - mr->pbl_buf[i++] = page_addr >> 12; - } else if (hr_dev->caps.pbl_hop_num == 1) { - mr->pbl_buf[i++] = page_addr; - } else { - if (hr_dev->caps.pbl_hop_num == 2) - mr->pbl_bt_l1[i][j] = page_addr; - else if (hr_dev->caps.pbl_hop_num == 3) - mr->pbl_bt_l2[i][j] = page_addr; - - j++; - if (j >= (pbl_bt_sz / BA_BYTE_LEN)) { - i++; - j = 0; - } - } - } - - /* Memory barrier */ - mb(); - - return 0; -} - struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(pd->device); - struct device *dev = hr_dev->dev; struct hns_roce_mr *mr; - int bt_size; int ret; - int n; - int i; - mr = kmalloc(sizeof(*mr), GFP_KERNEL); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); - mr->umem = ib_umem_get(pd->device, start, length, access_flags); - if (IS_ERR(mr->umem)) { - ret = PTR_ERR(mr->umem); - goto err_free; - } - - n = ib_umem_page_count(mr->umem); - - if (!hr_dev->caps.pbl_hop_num) { - if (n > HNS_ROCE_MAX_MTPT_PBL_NUM) { - dev_err(dev, - " MR len %lld err. MR is limited to 4G at most!\n", - length); - ret = -EINVAL; - goto err_umem; - } - } else { - u64 pbl_size = 1; - - bt_size = (1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT)) / - BA_BYTE_LEN; - for (i = 0; i < hr_dev->caps.pbl_hop_num; i++) - pbl_size *= bt_size; - if (n > pbl_size) { - dev_err(dev, - " MR len %lld err. MR page num is limited to %lld!\n", - length, pbl_size); - ret = -EINVAL; - goto err_umem; - } - } - mr->type = MR_TYPE_MR; - - ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, virt_addr, length, - access_flags, n, mr); + ret = alloc_mr_key(hr_dev, mr, to_hr_pd(pd)->pdn, virt_addr, length, + access_flags); if (ret) - goto err_umem; + goto err_alloc_mr; - ret = hns_roce_ib_umem_write_mr(hr_dev, mr, mr->umem); + ret = alloc_mr_pbl(hr_dev, mr, length, udata, start, access_flags); if (ret) - goto err_mr; + goto err_alloc_key; ret = hns_roce_mr_enable(hr_dev, mr); if (ret) - goto err_mr; + goto err_alloc_pbl; mr->ibmr.rkey = mr->ibmr.lkey = mr->key; return &mr->ibmr; -err_mr: - hns_roce_mr_free(hr_dev, mr); - -err_umem: - ib_umem_release(mr->umem); - -err_free: +err_alloc_pbl: + free_mr_pbl(hr_dev, mr); +err_alloc_key: + free_mr_key(hr_dev, mr); +err_alloc_mr: kfree(mr); return ERR_PTR(ret); } @@ -1210,84 +778,36 @@ static int rereg_mr_trans(struct ib_mr *ibmr, int flags, u32 pdn, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device); + struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_mr *mr = to_hr_mr(ibmr); - struct device *dev = hr_dev->dev; - int npages; int ret; - if (mr->size != ~0ULL) { - npages = ib_umem_page_count(mr->umem); - - if (hr_dev->caps.pbl_hop_num) - hns_roce_mhop_free(hr_dev, mr); - else - dma_free_coherent(dev, npages * 8, - mr->pbl_buf, mr->pbl_dma_addr); - } - ib_umem_release(mr->umem); - - mr->umem = ib_umem_get(ibmr->device, start, length, mr_access_flags); - if (IS_ERR(mr->umem)) { - ret = PTR_ERR(mr->umem); - mr->umem = NULL; - return -ENOMEM; - } - npages = ib_umem_page_count(mr->umem); - - if (hr_dev->caps.pbl_hop_num) { - ret = hns_roce_mhop_alloc(hr_dev, npages, mr); - if (ret) - goto release_umem; - } else { - mr->pbl_buf = dma_alloc_coherent(dev, npages * 8, - &(mr->pbl_dma_addr), - GFP_KERNEL); - if (!mr->pbl_buf) { - ret = -ENOMEM; - goto release_umem; - } + free_mr_pbl(hr_dev, mr); + ret = alloc_mr_pbl(hr_dev, mr, length, udata, start, mr_access_flags); + if (ret) { + ibdev_err(ibdev, "failed to create mr PBL, ret = %d.\n", ret); + return ret; } ret = hr_dev->hw->rereg_write_mtpt(hr_dev, mr, flags, pdn, mr_access_flags, virt_addr, length, mailbox->buf); - if (ret) - goto release_umem; - - - ret = hns_roce_ib_umem_write_mr(hr_dev, mr, mr->umem); if (ret) { - if (mr->size != ~0ULL) { - npages = ib_umem_page_count(mr->umem); - - if (hr_dev->caps.pbl_hop_num) - hns_roce_mhop_free(hr_dev, mr); - else - dma_free_coherent(dev, npages * 8, - mr->pbl_buf, - mr->pbl_dma_addr); - } - - goto release_umem; + ibdev_err(ibdev, "failed to write mtpt, ret = %d.\n", ret); + free_mr_pbl(hr_dev, mr); } - return 0; - -release_umem: - ib_umem_release(mr->umem); return ret; - } - int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device); + struct ib_device *ib_dev = &hr_dev->ib_dev; struct hns_roce_mr *mr = to_hr_mr(ibmr); struct hns_roce_cmd_mailbox *mailbox; - struct device *dev = hr_dev->dev; unsigned long mtpt_idx; u32 pdn = 0; int ret; @@ -1308,7 +828,7 @@ int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length, ret = hns_roce_hw_destroy_mpt(hr_dev, NULL, mtpt_idx); if (ret) - dev_warn(dev, "DESTROY_MPT failed (%d)\n", ret); + ibdev_warn(ib_dev, "failed to destroy MPT, ret = %d.\n", ret); mr->enabled = 0; @@ -1332,8 +852,7 @@ int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length, ret = hns_roce_hw_create_mpt(hr_dev, mailbox, mtpt_idx); if (ret) { - dev_err(dev, "CREATE_MPT failed (%d)\n", ret); - ib_umem_release(mr->umem); + ibdev_err(ib_dev, "failed to create MPT, ret = %d.\n", ret); goto free_cmd_mbox; } @@ -1361,8 +880,6 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) ret = hr_dev->hw->dereg_mr(hr_dev, mr, udata); } else { hns_roce_mr_free(hr_dev, mr); - - ib_umem_release(mr->umem); kfree(mr); } @@ -1376,12 +893,8 @@ struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, struct device *dev = hr_dev->dev; struct hns_roce_mr *mr; u64 length; - u32 page_size; int ret; - page_size = 1 << (hr_dev->caps.pbl_buf_pg_sz + PAGE_SHIFT); - length = max_num_sg * page_size; - if (mr_type != IB_MR_TYPE_MEM_REG) return ERR_PTR(-EINVAL); @@ -1398,23 +911,27 @@ struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, mr->type = MR_TYPE_FRMR; /* Allocate memory region key */ - ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, 0, length, - 0, max_num_sg, mr); + length = max_num_sg * (1 << PAGE_SHIFT); + ret = alloc_mr_key(hr_dev, mr, to_hr_pd(pd)->pdn, 0, length, 0); if (ret) goto err_free; + ret = alloc_mr_pbl(hr_dev, mr, length, NULL, 0, 0); + if (ret) + goto err_key; + ret = hns_roce_mr_enable(hr_dev, mr); if (ret) - goto err_mr; + goto err_pbl; mr->ibmr.rkey = mr->ibmr.lkey = mr->key; - mr->umem = NULL; return &mr->ibmr; -err_mr: - hns_roce_mr_free(to_hr_dev(pd->device), mr); - +err_key: + free_mr_key(hr_dev, mr); +err_pbl: + free_mr_pbl(hr_dev, mr); err_free: kfree(mr); return ERR_PTR(ret); @@ -1424,19 +941,54 @@ static int hns_roce_set_page(struct ib_mr *ibmr, u64 addr) { struct hns_roce_mr *mr = to_hr_mr(ibmr); - mr->pbl_buf[mr->npages++] = addr; + if (likely(mr->npages < mr->pbl_mtr.hem_cfg.buf_pg_count)) { + mr->page_list[mr->npages++] = addr; + return 0; + } - return 0; + return -ENOBUFS; } int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset) { + struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device); + struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_mr *mr = to_hr_mr(ibmr); + struct hns_roce_buf_region region = {}; + int ret = 0; mr->npages = 0; + mr->page_list = kvcalloc(mr->pbl_mtr.hem_cfg.buf_pg_count, + sizeof(dma_addr_t), GFP_KERNEL); + if (!mr->page_list) + return ret; + + ret = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page); + if (ret < 1) { + ibdev_err(ibdev, "failed to store sg pages %d %d, cnt = %d.\n", + mr->npages, mr->pbl_mtr.hem_cfg.buf_pg_count, ret); + goto err_page_list; + } - return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page); + region.offset = 0; + region.count = mr->npages; + region.hopnum = mr->pbl_hop_num; + ret = hns_roce_mtr_map(hr_dev, &mr->pbl_mtr, ®ion, 1, mr->page_list, + mr->npages); + if (ret) { + ibdev_err(ibdev, "failed to map sg mtr, ret = %d.\n", ret); + ret = 0; + } else { + mr->pbl_mtr.hem_cfg.buf_pg_shift = ilog2(ibmr->page_size); + ret = mr->npages; + } + +err_page_list: + kvfree(mr->page_list); + mr->page_list = NULL; + + return ret; } static void hns_roce_mw_free(struct hns_roce_dev *hr_dev, -- cgit v1.2.3 From 2929c40f08a9654c4c8e35ad0a36d611deb61394 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Tue, 28 Apr 2020 19:03:40 +0800 Subject: RDMA/hns: Remove unused MTT functions The MTT (Memory Translate Table) interface is no longer used to configure the buffer address to BT (Base Address Table) that requires driver mapping. Because the MTT is not compatible with multi-hop addressing of the hip08, it is replaced by MTR (Memory Translate Region) interface, and all the MTT functions should be removed. Link: https://lore.kernel.org/r/1588071823-40200-3-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_alloc.c | 43 --- drivers/infiniband/hw/hns/hns_roce_device.h | 64 ---- drivers/infiniband/hw/hns/hns_roce_hem.c | 105 ------ drivers/infiniband/hw/hns/hns_roce_hem.h | 6 - drivers/infiniband/hw/hns/hns_roce_main.c | 70 +--- drivers/infiniband/hw/hns/hns_roce_mr.c | 521 ---------------------------- 6 files changed, 2 insertions(+), 807 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index e04e7596d979..365e7db6c498 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -283,49 +283,6 @@ done: return total; } -void hns_roce_init_buf_region(struct hns_roce_buf_region *region, int hopnum, - int offset, int buf_cnt) -{ - if (hopnum == HNS_ROCE_HOP_NUM_0) - region->hopnum = 0; - else - region->hopnum = hopnum; - - region->offset = offset; - region->count = buf_cnt; -} - -void hns_roce_free_buf_list(dma_addr_t **bufs, int region_cnt) -{ - int i; - - for (i = 0; i < region_cnt; i++) { - kfree(bufs[i]); - bufs[i] = NULL; - } -} - -int hns_roce_alloc_buf_list(struct hns_roce_buf_region *regions, - dma_addr_t **bufs, int region_cnt) -{ - struct hns_roce_buf_region *r; - int i; - - for (i = 0; i < region_cnt; i++) { - r = ®ions[i]; - bufs[i] = kcalloc(r->count, sizeof(dma_addr_t), GFP_KERNEL); - if (!bufs[i]) - goto err_alloc; - } - - return 0; - -err_alloc: - hns_roce_free_buf_list(bufs, i); - - return -ENOMEM; -} - void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev) { if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 1089db8ad0b5..da3850aeeb14 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -222,13 +222,6 @@ enum { HNS_ROCE_CAP_FLAG_ATOMIC = BIT(10), }; -enum hns_roce_mtt_type { - MTT_TYPE_WQE, - MTT_TYPE_CQE, - MTT_TYPE_SRQWQE, - MTT_TYPE_IDX -}; - #define HNS_ROCE_DB_TYPE_COUNT 2 #define HNS_ROCE_DB_UNIT_SIZE 4 @@ -267,8 +260,6 @@ enum { #define HNS_ROCE_PORT_DOWN 0 #define HNS_ROCE_PORT_UP 1 -#define HNS_ROCE_MTT_ENTRY_PER_SEG 8 - #define PAGE_ADDR_SHIFT 12 /* The minimum page count for hardware access page directly. */ @@ -303,22 +294,6 @@ struct hns_roce_bitmap { unsigned long *table; }; -/* Order bitmap length -- bit num compute formula: 1 << (max_order - order) */ -/* Order = 0: bitmap is biggest, order = max bitmap is least (only a bit) */ -/* Every bit repesent to a partner free/used status in bitmap */ -/* - * Initial, bits of other bitmap are all 0 except that a bit of max_order is 1 - * Bit = 1 represent to idle and available; bit = 0: not available - */ -struct hns_roce_buddy { - /* Members point to every order level bitmap */ - unsigned long **bits; - /* Represent to avail bits of the order level bitmap */ - u32 *num_free; - int max_order; - spinlock_t lock; -}; - /* For Hardware Entry Memory */ struct hns_roce_hem_table { /* HEM type: 0 = qpc, 1 = mtt, 2 = cqc, 3 = srq, 4 = other */ @@ -339,13 +314,6 @@ struct hns_roce_hem_table { dma_addr_t *bt_l0_dma_addr; }; -struct hns_roce_mtt { - unsigned long first_seg; - int order; - int page_shift; - enum hns_roce_mtt_type mtt_type; -}; - struct hns_roce_buf_region { int offset; /* page offset */ u32 count; /* page count */ @@ -418,15 +386,7 @@ struct hns_roce_mr { struct hns_roce_mr_table { struct hns_roce_bitmap mtpt_bitmap; - struct hns_roce_buddy mtt_buddy; - struct hns_roce_hem_table mtt_table; struct hns_roce_hem_table mtpt_table; - struct hns_roce_buddy mtt_cqe_buddy; - struct hns_roce_hem_table mtt_cqe_table; - struct hns_roce_buddy mtt_srqwqe_buddy; - struct hns_roce_hem_table mtt_srqwqe_table; - struct hns_roce_buddy mtt_idx_buddy; - struct hns_roce_hem_table mtt_idx_table; }; struct hns_roce_wq { @@ -1141,21 +1101,6 @@ void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status, int hns_roce_cmd_use_events(struct hns_roce_dev *hr_dev); void hns_roce_cmd_use_polling(struct hns_roce_dev *hr_dev); -int hns_roce_mtt_init(struct hns_roce_dev *hr_dev, int npages, int page_shift, - struct hns_roce_mtt *mtt); -void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev, - struct hns_roce_mtt *mtt); -int hns_roce_buf_write_mtt(struct hns_roce_dev *hr_dev, - struct hns_roce_mtt *mtt, struct hns_roce_buf *buf); - -void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift, - int buf_pg_shift); -int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, - dma_addr_t **bufs, struct hns_roce_buf_region *regions, - int region_cnt); -void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev, - struct hns_roce_mtr *mtr); - /* hns roce hw need current block and next block addr from mtt */ #define MTT_MIN_COUNT 2 int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, @@ -1228,15 +1173,6 @@ void hns_roce_buf_free(struct hns_roce_dev *hr_dev, struct hns_roce_buf *buf); int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, struct hns_roce_buf *buf, u32 page_shift); -int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev, - struct hns_roce_mtt *mtt, struct ib_umem *umem); - -void hns_roce_init_buf_region(struct hns_roce_buf_region *region, int hopnum, - int offset, int buf_cnt); -int hns_roce_alloc_buf_list(struct hns_roce_buf_region *regions, - dma_addr_t **bufs, int count); -void hns_roce_free_buf_list(dma_addr_t **bufs, int count); - int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, int buf_cnt, int start, struct hns_roce_buf *buf); int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index a245e753afe9..37d101eec181 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -75,18 +75,6 @@ bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type) case HEM_TYPE_CQC_TIMER: hop_num = hr_dev->caps.cqc_timer_hop_num; break; - case HEM_TYPE_CQE: - hop_num = hr_dev->caps.cqe_hop_num; - break; - case HEM_TYPE_MTT: - hop_num = hr_dev->caps.mtt_hop_num; - break; - case HEM_TYPE_SRQWQE: - hop_num = hr_dev->caps.srqwqe_hop_num; - break; - case HEM_TYPE_IDX: - hop_num = hr_dev->caps.idx_hop_num; - break; default: return false; } @@ -195,38 +183,6 @@ static int get_hem_table_config(struct hns_roce_dev *hr_dev, mhop->ba_l0_num = hr_dev->caps.srqc_bt_num; mhop->hop_num = hr_dev->caps.srqc_hop_num; break; - case HEM_TYPE_MTT: - mhop->buf_chunk_size = 1 << (hr_dev->caps.mtt_buf_pg_sz - + PAGE_SHIFT); - mhop->bt_chunk_size = 1 << (hr_dev->caps.mtt_ba_pg_sz - + PAGE_SHIFT); - mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN; - mhop->hop_num = hr_dev->caps.mtt_hop_num; - break; - case HEM_TYPE_CQE: - mhop->buf_chunk_size = 1 << (hr_dev->caps.cqe_buf_pg_sz - + PAGE_SHIFT); - mhop->bt_chunk_size = 1 << (hr_dev->caps.cqe_ba_pg_sz - + PAGE_SHIFT); - mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN; - mhop->hop_num = hr_dev->caps.cqe_hop_num; - break; - case HEM_TYPE_SRQWQE: - mhop->buf_chunk_size = 1 << (hr_dev->caps.srqwqe_buf_pg_sz - + PAGE_SHIFT); - mhop->bt_chunk_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz - + PAGE_SHIFT); - mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN; - mhop->hop_num = hr_dev->caps.srqwqe_hop_num; - break; - case HEM_TYPE_IDX: - mhop->buf_chunk_size = 1 << (hr_dev->caps.idx_buf_pg_sz - + PAGE_SHIFT); - mhop->bt_chunk_size = 1 << (hr_dev->caps.idx_ba_pg_sz - + PAGE_SHIFT); - mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN; - mhop->hop_num = hr_dev->caps.idx_hop_num; - break; default: dev_err(dev, "Table %d not support multi-hop addressing!\n", type); @@ -899,57 +855,6 @@ out: return addr; } -int hns_roce_table_get_range(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_table *table, - unsigned long start, unsigned long end) -{ - struct hns_roce_hem_mhop mhop; - unsigned long inc = table->table_chunk_size / table->obj_size; - unsigned long i = 0; - int ret; - - if (hns_roce_check_whether_mhop(hr_dev, table->type)) { - ret = hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop); - if (ret) - goto fail; - inc = mhop.bt_chunk_size / table->obj_size; - } - - /* Allocate MTT entry memory according to chunk(128K) */ - for (i = start; i <= end; i += inc) { - ret = hns_roce_table_get(hr_dev, table, i); - if (ret) - goto fail; - } - - return 0; - -fail: - while (i > start) { - i -= inc; - hns_roce_table_put(hr_dev, table, i); - } - return ret; -} - -void hns_roce_table_put_range(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_table *table, - unsigned long start, unsigned long end) -{ - struct hns_roce_hem_mhop mhop; - unsigned long inc = table->table_chunk_size / table->obj_size; - unsigned long i; - - if (hns_roce_check_whether_mhop(hr_dev, table->type)) { - if (hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop)) - return; - inc = mhop.bt_chunk_size / table->obj_size; - } - - for (i = start; i <= end; i += inc) - hns_roce_table_put(hr_dev, table, i); -} - int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, u32 type, unsigned long obj_size, unsigned long nobj, @@ -1112,12 +1017,6 @@ void hns_roce_cleanup_hem_table(struct hns_roce_dev *hr_dev, void hns_roce_cleanup_hem(struct hns_roce_dev *hr_dev) { - if ((hr_dev->caps.num_idx_segs)) - hns_roce_cleanup_hem_table(hr_dev, - &hr_dev->mr_table.mtt_idx_table); - if (hr_dev->caps.num_srqwqe_segs) - hns_roce_cleanup_hem_table(hr_dev, - &hr_dev->mr_table.mtt_srqwqe_table); if (hr_dev->caps.srqc_entry_sz) hns_roce_cleanup_hem_table(hr_dev, &hr_dev->srq_table.table); @@ -1137,10 +1036,6 @@ void hns_roce_cleanup_hem(struct hns_roce_dev *hr_dev) hns_roce_cleanup_hem_table(hr_dev, &hr_dev->qp_table.irrl_table); hns_roce_cleanup_hem_table(hr_dev, &hr_dev->qp_table.qp_table); hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtpt_table); - if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) - hns_roce_cleanup_hem_table(hr_dev, - &hr_dev->mr_table.mtt_cqe_table); - hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtt_table); } struct roce_hem_item { diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h index a00b6c27735a..1fa0bdcb1989 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.h +++ b/drivers/infiniband/hw/hns/hns_roce_hem.h @@ -115,12 +115,6 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev, void *hns_roce_table_find(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, unsigned long obj, dma_addr_t *dma_handle); -int hns_roce_table_get_range(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_table *table, - unsigned long start, unsigned long end); -void hns_roce_table_put_range(struct hns_roce_dev *hr_dev, - struct hns_roce_hem_table *table, - unsigned long start, unsigned long end); int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, u32 type, unsigned long obj_size, unsigned long nobj, diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index d0031d559213..fd3581efe9a8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -579,33 +579,12 @@ static int hns_roce_init_hem(struct hns_roce_dev *hr_dev) int ret; struct device *dev = hr_dev->dev; - ret = hns_roce_init_hem_table(hr_dev, &hr_dev->mr_table.mtt_table, - HEM_TYPE_MTT, hr_dev->caps.mtt_entry_sz, - hr_dev->caps.num_mtt_segs, 1); - if (ret) { - dev_err(dev, "Failed to init MTT context memory, aborting.\n"); - return ret; - } - - if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) { - ret = hns_roce_init_hem_table(hr_dev, - &hr_dev->mr_table.mtt_cqe_table, - HEM_TYPE_CQE, - hr_dev->caps.mtt_entry_sz, - hr_dev->caps.num_cqe_segs, 1); - if (ret) { - dev_err(dev, - "Failed to init CQE context memory, aborting.\n"); - goto err_unmap_cqe; - } - } - ret = hns_roce_init_hem_table(hr_dev, &hr_dev->mr_table.mtpt_table, HEM_TYPE_MTPT, hr_dev->caps.mtpt_entry_sz, hr_dev->caps.num_mtpts, 1); if (ret) { dev_err(dev, "Failed to init MTPT context memory, aborting.\n"); - goto err_unmap_mtt; + return ret; } ret = hns_roce_init_hem_table(hr_dev, &hr_dev->qp_table.qp_table, @@ -660,32 +639,6 @@ static int hns_roce_init_hem(struct hns_roce_dev *hr_dev) } } - if (hr_dev->caps.num_srqwqe_segs) { - ret = hns_roce_init_hem_table(hr_dev, - &hr_dev->mr_table.mtt_srqwqe_table, - HEM_TYPE_SRQWQE, - hr_dev->caps.mtt_entry_sz, - hr_dev->caps.num_srqwqe_segs, 1); - if (ret) { - dev_err(dev, - "Failed to init MTT srqwqe memory, aborting.\n"); - goto err_unmap_srq; - } - } - - if (hr_dev->caps.num_idx_segs) { - ret = hns_roce_init_hem_table(hr_dev, - &hr_dev->mr_table.mtt_idx_table, - HEM_TYPE_IDX, - hr_dev->caps.idx_entry_sz, - hr_dev->caps.num_idx_segs, 1); - if (ret) { - dev_err(dev, - "Failed to init MTT idx memory, aborting.\n"); - goto err_unmap_srqwqe; - } - } - if (hr_dev->caps.sccc_entry_sz) { ret = hns_roce_init_hem_table(hr_dev, &hr_dev->qp_table.sccc_table, @@ -695,7 +648,7 @@ static int hns_roce_init_hem(struct hns_roce_dev *hr_dev) if (ret) { dev_err(dev, "Failed to init SCC context memory, aborting.\n"); - goto err_unmap_idx; + goto err_unmap_srq; } } @@ -733,17 +686,6 @@ err_unmap_ctx: if (hr_dev->caps.sccc_entry_sz) hns_roce_cleanup_hem_table(hr_dev, &hr_dev->qp_table.sccc_table); - -err_unmap_idx: - if (hr_dev->caps.num_idx_segs) - hns_roce_cleanup_hem_table(hr_dev, - &hr_dev->mr_table.mtt_idx_table); - -err_unmap_srqwqe: - if (hr_dev->caps.num_srqwqe_segs) - hns_roce_cleanup_hem_table(hr_dev, - &hr_dev->mr_table.mtt_srqwqe_table); - err_unmap_srq: if (hr_dev->caps.srqc_entry_sz) hns_roce_cleanup_hem_table(hr_dev, &hr_dev->srq_table.table); @@ -765,14 +707,6 @@ err_unmap_qp: err_unmap_dmpt: hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtpt_table); -err_unmap_mtt: - if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) - hns_roce_cleanup_hem_table(hr_dev, - &hr_dev->mr_table.mtt_cqe_table); - -err_unmap_cqe: - hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtt_table); - return ret; } diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index c65f1f682819..ecd76759d47a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -66,233 +66,6 @@ int hns_roce_hw_destroy_mpt(struct hns_roce_dev *hr_dev, HNS_ROCE_CMD_TIMEOUT_MSECS); } -static int hns_roce_buddy_alloc(struct hns_roce_buddy *buddy, int order, - unsigned long *seg) -{ - int o; - u32 m; - - spin_lock(&buddy->lock); - - for (o = order; o <= buddy->max_order; ++o) { - if (buddy->num_free[o]) { - m = 1 << (buddy->max_order - o); - *seg = find_first_bit(buddy->bits[o], m); - if (*seg < m) - goto found; - } - } - spin_unlock(&buddy->lock); - return -EINVAL; - - found: - clear_bit(*seg, buddy->bits[o]); - --buddy->num_free[o]; - - while (o > order) { - --o; - *seg <<= 1; - set_bit(*seg ^ 1, buddy->bits[o]); - ++buddy->num_free[o]; - } - - spin_unlock(&buddy->lock); - - *seg <<= order; - return 0; -} - -static void hns_roce_buddy_free(struct hns_roce_buddy *buddy, unsigned long seg, - int order) -{ - seg >>= order; - - spin_lock(&buddy->lock); - - while (test_bit(seg ^ 1, buddy->bits[order])) { - clear_bit(seg ^ 1, buddy->bits[order]); - --buddy->num_free[order]; - seg >>= 1; - ++order; - } - - set_bit(seg, buddy->bits[order]); - ++buddy->num_free[order]; - - spin_unlock(&buddy->lock); -} - -static int hns_roce_buddy_init(struct hns_roce_buddy *buddy, int max_order) -{ - int i, s; - - buddy->max_order = max_order; - spin_lock_init(&buddy->lock); - buddy->bits = kcalloc(buddy->max_order + 1, - sizeof(*buddy->bits), - GFP_KERNEL); - buddy->num_free = kcalloc(buddy->max_order + 1, - sizeof(*buddy->num_free), - GFP_KERNEL); - if (!buddy->bits || !buddy->num_free) - goto err_out; - - for (i = 0; i <= buddy->max_order; ++i) { - s = BITS_TO_LONGS(1 << (buddy->max_order - i)); - buddy->bits[i] = kcalloc(s, sizeof(long), GFP_KERNEL | - __GFP_NOWARN); - if (!buddy->bits[i]) { - buddy->bits[i] = vzalloc(array_size(s, sizeof(long))); - if (!buddy->bits[i]) - goto err_out_free; - } - } - - set_bit(0, buddy->bits[buddy->max_order]); - buddy->num_free[buddy->max_order] = 1; - - return 0; - -err_out_free: - for (i = 0; i <= buddy->max_order; ++i) - kvfree(buddy->bits[i]); - -err_out: - kfree(buddy->bits); - kfree(buddy->num_free); - return -ENOMEM; -} - -static void hns_roce_buddy_cleanup(struct hns_roce_buddy *buddy) -{ - int i; - - for (i = 0; i <= buddy->max_order; ++i) - kvfree(buddy->bits[i]); - - kfree(buddy->bits); - kfree(buddy->num_free); -} - -static int hns_roce_alloc_mtt_range(struct hns_roce_dev *hr_dev, int order, - unsigned long *seg, u32 mtt_type) -{ - struct hns_roce_mr_table *mr_table = &hr_dev->mr_table; - struct hns_roce_hem_table *table; - struct hns_roce_buddy *buddy; - int ret; - - switch (mtt_type) { - case MTT_TYPE_WQE: - buddy = &mr_table->mtt_buddy; - table = &mr_table->mtt_table; - break; - case MTT_TYPE_CQE: - buddy = &mr_table->mtt_cqe_buddy; - table = &mr_table->mtt_cqe_table; - break; - case MTT_TYPE_SRQWQE: - buddy = &mr_table->mtt_srqwqe_buddy; - table = &mr_table->mtt_srqwqe_table; - break; - case MTT_TYPE_IDX: - buddy = &mr_table->mtt_idx_buddy; - table = &mr_table->mtt_idx_table; - break; - default: - dev_err(hr_dev->dev, "Unsupport MTT table type: %d\n", - mtt_type); - return -EINVAL; - } - - ret = hns_roce_buddy_alloc(buddy, order, seg); - if (ret) - return ret; - - ret = hns_roce_table_get_range(hr_dev, table, *seg, - *seg + (1 << order) - 1); - if (ret) { - hns_roce_buddy_free(buddy, *seg, order); - return ret; - } - - return 0; -} - -int hns_roce_mtt_init(struct hns_roce_dev *hr_dev, int npages, int page_shift, - struct hns_roce_mtt *mtt) -{ - int ret; - int i; - - /* Page num is zero, correspond to DMA memory register */ - if (!npages) { - mtt->order = -1; - mtt->page_shift = HNS_ROCE_HEM_PAGE_SHIFT; - return 0; - } - - /* Note: if page_shift is zero, FAST memory register */ - mtt->page_shift = page_shift; - - /* Compute MTT entry necessary */ - for (mtt->order = 0, i = HNS_ROCE_MTT_ENTRY_PER_SEG; i < npages; - i <<= 1) - ++mtt->order; - - /* Allocate MTT entry */ - ret = hns_roce_alloc_mtt_range(hr_dev, mtt->order, &mtt->first_seg, - mtt->mtt_type); - if (ret) - return -ENOMEM; - - return 0; -} - -void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev, struct hns_roce_mtt *mtt) -{ - struct hns_roce_mr_table *mr_table = &hr_dev->mr_table; - - if (mtt->order < 0) - return; - - switch (mtt->mtt_type) { - case MTT_TYPE_WQE: - hns_roce_buddy_free(&mr_table->mtt_buddy, mtt->first_seg, - mtt->order); - hns_roce_table_put_range(hr_dev, &mr_table->mtt_table, - mtt->first_seg, - mtt->first_seg + (1 << mtt->order) - 1); - break; - case MTT_TYPE_CQE: - hns_roce_buddy_free(&mr_table->mtt_cqe_buddy, mtt->first_seg, - mtt->order); - hns_roce_table_put_range(hr_dev, &mr_table->mtt_cqe_table, - mtt->first_seg, - mtt->first_seg + (1 << mtt->order) - 1); - break; - case MTT_TYPE_SRQWQE: - hns_roce_buddy_free(&mr_table->mtt_srqwqe_buddy, mtt->first_seg, - mtt->order); - hns_roce_table_put_range(hr_dev, &mr_table->mtt_srqwqe_table, - mtt->first_seg, - mtt->first_seg + (1 << mtt->order) - 1); - break; - case MTT_TYPE_IDX: - hns_roce_buddy_free(&mr_table->mtt_idx_buddy, mtt->first_seg, - mtt->order); - hns_roce_table_put_range(hr_dev, &mr_table->mtt_idx_table, - mtt->first_seg, - mtt->first_seg + (1 << mtt->order) - 1); - break; - default: - dev_err(hr_dev->dev, - "Unsupport mtt type %d, clean mtt failed\n", - mtt->mtt_type); - break; - } -} - static int alloc_mr_key(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, u32 pd, u64 iova, u64 size, u32 access) { @@ -433,131 +206,6 @@ err_page: return ret; } -static int hns_roce_write_mtt_chunk(struct hns_roce_dev *hr_dev, - struct hns_roce_mtt *mtt, u32 start_index, - u32 npages, u64 *page_list) -{ - struct hns_roce_hem_table *table; - dma_addr_t dma_handle; - __le64 *mtts; - u32 bt_page_size; - u32 i; - - switch (mtt->mtt_type) { - case MTT_TYPE_WQE: - table = &hr_dev->mr_table.mtt_table; - bt_page_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT); - break; - case MTT_TYPE_CQE: - table = &hr_dev->mr_table.mtt_cqe_table; - bt_page_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT); - break; - case MTT_TYPE_SRQWQE: - table = &hr_dev->mr_table.mtt_srqwqe_table; - bt_page_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT); - break; - case MTT_TYPE_IDX: - table = &hr_dev->mr_table.mtt_idx_table; - bt_page_size = 1 << (hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT); - break; - default: - return -EINVAL; - } - - /* All MTTs must fit in the same page */ - if (start_index / (bt_page_size / sizeof(u64)) != - (start_index + npages - 1) / (bt_page_size / sizeof(u64))) - return -EINVAL; - - if (start_index & (HNS_ROCE_MTT_ENTRY_PER_SEG - 1)) - return -EINVAL; - - mtts = hns_roce_table_find(hr_dev, table, - mtt->first_seg + - start_index / HNS_ROCE_MTT_ENTRY_PER_SEG, - &dma_handle); - if (!mtts) - return -ENOMEM; - - /* Save page addr, low 12 bits : 0 */ - for (i = 0; i < npages; ++i) { - if (!hr_dev->caps.mtt_hop_num) - mtts[i] = cpu_to_le64(page_list[i] >> PAGE_ADDR_SHIFT); - else - mtts[i] = cpu_to_le64(page_list[i]); - } - - return 0; -} - -static int hns_roce_write_mtt(struct hns_roce_dev *hr_dev, - struct hns_roce_mtt *mtt, u32 start_index, - u32 npages, u64 *page_list) -{ - int chunk; - int ret; - u32 bt_page_size; - - if (mtt->order < 0) - return -EINVAL; - - switch (mtt->mtt_type) { - case MTT_TYPE_WQE: - bt_page_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT); - break; - case MTT_TYPE_CQE: - bt_page_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT); - break; - case MTT_TYPE_SRQWQE: - bt_page_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT); - break; - case MTT_TYPE_IDX: - bt_page_size = 1 << (hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT); - break; - default: - dev_err(hr_dev->dev, - "Unsupport mtt type %d, write mtt failed\n", - mtt->mtt_type); - return -EINVAL; - } - - while (npages > 0) { - chunk = min_t(int, bt_page_size / sizeof(u64), npages); - - ret = hns_roce_write_mtt_chunk(hr_dev, mtt, start_index, chunk, - page_list); - if (ret) - return ret; - - npages -= chunk; - start_index += chunk; - page_list += chunk; - } - - return 0; -} - -int hns_roce_buf_write_mtt(struct hns_roce_dev *hr_dev, - struct hns_roce_mtt *mtt, struct hns_roce_buf *buf) -{ - u64 *page_list; - int ret; - u32 i; - - page_list = kmalloc_array(buf->npages, sizeof(*page_list), GFP_KERNEL); - if (!page_list) - return -ENOMEM; - - for (i = 0; i < buf->npages; ++i) - page_list[i] = hns_roce_buf_page(buf, i); - - ret = hns_roce_write_mtt(hr_dev, mtt, 0, buf->npages, page_list); - - kfree(page_list); - - return ret; -} - int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev) { struct hns_roce_mr_table *mr_table = &hr_dev->mr_table; @@ -567,50 +215,6 @@ int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev) hr_dev->caps.num_mtpts, hr_dev->caps.num_mtpts - 1, hr_dev->caps.reserved_mrws, 0); - if (ret) - return ret; - - ret = hns_roce_buddy_init(&mr_table->mtt_buddy, - ilog2(hr_dev->caps.num_mtt_segs)); - if (ret) - goto err_buddy; - - if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) { - ret = hns_roce_buddy_init(&mr_table->mtt_cqe_buddy, - ilog2(hr_dev->caps.num_cqe_segs)); - if (ret) - goto err_buddy_cqe; - } - - if (hr_dev->caps.num_srqwqe_segs) { - ret = hns_roce_buddy_init(&mr_table->mtt_srqwqe_buddy, - ilog2(hr_dev->caps.num_srqwqe_segs)); - if (ret) - goto err_buddy_srqwqe; - } - - if (hr_dev->caps.num_idx_segs) { - ret = hns_roce_buddy_init(&mr_table->mtt_idx_buddy, - ilog2(hr_dev->caps.num_idx_segs)); - if (ret) - goto err_buddy_idx; - } - - return 0; - -err_buddy_idx: - if (hr_dev->caps.num_srqwqe_segs) - hns_roce_buddy_cleanup(&mr_table->mtt_srqwqe_buddy); - -err_buddy_srqwqe: - if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) - hns_roce_buddy_cleanup(&mr_table->mtt_cqe_buddy); - -err_buddy_cqe: - hns_roce_buddy_cleanup(&mr_table->mtt_buddy); - -err_buddy: - hns_roce_bitmap_cleanup(&mr_table->mtpt_bitmap); return ret; } @@ -618,13 +222,6 @@ void hns_roce_cleanup_mr_table(struct hns_roce_dev *hr_dev) { struct hns_roce_mr_table *mr_table = &hr_dev->mr_table; - if (hr_dev->caps.num_idx_segs) - hns_roce_buddy_cleanup(&mr_table->mtt_idx_buddy); - if (hr_dev->caps.num_srqwqe_segs) - hns_roce_buddy_cleanup(&mr_table->mtt_srqwqe_buddy); - hns_roce_buddy_cleanup(&mr_table->mtt_buddy); - if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) - hns_roce_buddy_cleanup(&mr_table->mtt_cqe_buddy); hns_roce_bitmap_cleanup(&mr_table->mtpt_bitmap); } @@ -661,77 +258,6 @@ err_free: return ERR_PTR(ret); } -int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev, - struct hns_roce_mtt *mtt, struct ib_umem *umem) -{ - struct device *dev = hr_dev->dev; - struct sg_dma_page_iter sg_iter; - unsigned int order; - int npage = 0; - int ret = 0; - int i; - u64 page_addr; - u64 *pages; - u32 bt_page_size; - u32 n; - - switch (mtt->mtt_type) { - case MTT_TYPE_WQE: - order = hr_dev->caps.mtt_ba_pg_sz; - break; - case MTT_TYPE_CQE: - order = hr_dev->caps.cqe_ba_pg_sz; - break; - case MTT_TYPE_SRQWQE: - order = hr_dev->caps.srqwqe_ba_pg_sz; - break; - case MTT_TYPE_IDX: - order = hr_dev->caps.idx_ba_pg_sz; - break; - default: - dev_err(dev, "Unsupport mtt type %d, write mtt failed\n", - mtt->mtt_type); - return -EINVAL; - } - - bt_page_size = 1 << (order + PAGE_SHIFT); - - pages = (u64 *) __get_free_pages(GFP_KERNEL, order); - if (!pages) - return -ENOMEM; - - i = n = 0; - - for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) { - page_addr = sg_page_iter_dma_address(&sg_iter); - if (!(npage % (1 << (mtt->page_shift - PAGE_SHIFT)))) { - if (page_addr & ((1 << mtt->page_shift) - 1)) { - dev_err(dev, - "page_addr is not page_shift %d alignment!\n", - mtt->page_shift); - ret = -EINVAL; - goto out; - } - pages[i++] = page_addr; - } - npage++; - if (i == bt_page_size / sizeof(u64)) { - ret = hns_roce_write_mtt(hr_dev, mtt, n, i, pages); - if (ret) - goto out; - n += i; - i = 0; - } - } - - if (i) - ret = hns_roce_write_mtt(hr_dev, mtt, n, i, pages); - -out: - free_pages((unsigned long) pages, order); - return ret; -} - struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -1112,20 +638,6 @@ int hns_roce_dealloc_mw(struct ib_mw *ibmw) return 0; } -void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift, - int buf_pg_shift) -{ - hns_roce_hem_list_init(&mtr->hem_list); - mtr->hem_cfg.buf_pg_shift = buf_pg_shift; - mtr->hem_cfg.ba_pg_shift = bt_pg_shift; -} - -void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev, - struct hns_roce_mtr *mtr) -{ - hns_roce_hem_list_release(hr_dev, &mtr->hem_list); -} - static int mtr_map_region(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, dma_addr_t *pages, struct hns_roce_buf_region *region) { @@ -1165,39 +677,6 @@ static int mtr_map_region(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, return 0; } -int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, - dma_addr_t **bufs, struct hns_roce_buf_region *regions, - int region_cnt) -{ - struct hns_roce_buf_region *r; - int ret; - int i; - - ret = hns_roce_hem_list_request(hr_dev, &mtr->hem_list, regions, - region_cnt, mtr->hem_cfg.ba_pg_shift); - if (ret) - return ret; - - mtr->hem_cfg.root_ba = mtr->hem_list.root_ba; - for (i = 0; i < region_cnt; i++) { - r = ®ions[i]; - ret = mtr_map_region(hr_dev, mtr, bufs[i], r); - if (ret) { - dev_err(hr_dev->dev, - "write mtr[%d/%d] err %d,offset=%d.\n", - i, region_cnt, ret, r->offset); - goto err_write; - } - } - - return 0; - -err_write: - hns_roce_hem_list_release(hr_dev, &mtr->hem_list); - - return ret; -} - static inline bool mtr_has_mtt(struct hns_roce_buf_attr *attr) { int i; -- cgit v1.2.3 From 54d6638765b0ede9f3889af47d9d5412bef8f47d Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Tue, 28 Apr 2020 19:03:41 +0800 Subject: RDMA/hns: Optimize WQE buffer size calculating process Optimize the QP's WQE buffer parameters calculating process to make the codes more readable mainly by merging calculation of extended sge space of kernel and userspace. In addition, add some inline functions to simply codes about multi-hop addressing. Link: https://lore.kernel.org/r/1588071823-40200-4-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 25 +++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 109 ++++------ drivers/infiniband/hw/hns/hns_roce_qp.c | 313 +++++++++++----------------- 3 files changed, 182 insertions(+), 265 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index da3850aeeb14..100224358a49 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1079,6 +1079,8 @@ static inline dma_addr_t hns_roce_buf_page(struct hns_roce_buf *buf, int idx) return buf->page_list[idx].map; } +#define hr_hw_page_align(x) ALIGN(x, 1 << PAGE_ADDR_SHIFT) + static inline u64 to_hr_hw_page_addr(u64 addr) { return addr >> PAGE_ADDR_SHIFT; @@ -1089,6 +1091,29 @@ static inline u32 to_hr_hw_page_shift(u32 page_shift) return page_shift - PAGE_ADDR_SHIFT; } +static inline u32 to_hr_hem_hopnum(u32 hopnum, u32 count) +{ + if (count > 0) + return hopnum == HNS_ROCE_HOP_NUM_0 ? 0 : hopnum; + + return 0; +} + +static inline u32 to_hr_hem_entries_size(u32 count, u32 buf_shift) +{ + return hr_hw_page_align(count << buf_shift); +} + +static inline u32 to_hr_hem_entries_count(u32 count, u32 buf_shift) +{ + return hr_hw_page_align(count << buf_shift) >> buf_shift; +} + +static inline u32 to_hr_hem_entries_shift(u32 count, u32 buf_shift) +{ + return ilog2(to_hr_hem_entries_count(count, buf_shift)); +} + int hns_roce_init_uar_table(struct hns_roce_dev *dev); int hns_roce_uar_alloc(struct hns_roce_dev *dev, struct hns_roce_uar *uar); void hns_roce_uar_free(struct hns_roce_dev *dev, struct hns_roce_uar *uar); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index e699932e6926..158b8317f604 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -154,47 +154,24 @@ static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr, unsigned int *sge_ind, int valid_num_sge) { struct hns_roce_v2_wqe_data_seg *dseg; - struct ib_sge *sg; - int num_in_wqe = 0; - int extend_sge_num; - int fi_sge_num; - int se_sge_num; - int shift; - int i; + struct ib_sge *sge = wr->sg_list; + unsigned int idx = *sge_ind; + int cnt = valid_num_sge; - if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) - num_in_wqe = HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE; - extend_sge_num = valid_num_sge - num_in_wqe; - sg = wr->sg_list + num_in_wqe; - shift = qp->mtr.hem_cfg.buf_pg_shift; + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { + cnt -= HNS_ROCE_SGE_IN_WQE; + sge += HNS_ROCE_SGE_IN_WQE; + } - /* - * Check whether wr->num_sge sges are in the same page. If not, we - * should calculate how many sges in the first page and the second - * page. - */ - dseg = hns_roce_get_extend_sge(qp, (*sge_ind) & (qp->sge.sge_cnt - 1)); - fi_sge_num = (round_up((uintptr_t)dseg, 1 << shift) - - (uintptr_t)dseg) / - sizeof(struct hns_roce_v2_wqe_data_seg); - if (extend_sge_num > fi_sge_num) { - se_sge_num = extend_sge_num - fi_sge_num; - for (i = 0; i < fi_sge_num; i++) { - set_data_seg_v2(dseg++, sg + i); - (*sge_ind)++; - } - dseg = hns_roce_get_extend_sge(qp, - (*sge_ind) & (qp->sge.sge_cnt - 1)); - for (i = 0; i < se_sge_num; i++) { - set_data_seg_v2(dseg++, sg + fi_sge_num + i); - (*sge_ind)++; - } - } else { - for (i = 0; i < extend_sge_num; i++) { - set_data_seg_v2(dseg++, sg + i); - (*sge_ind)++; - } + while (cnt > 0) { + dseg = hns_roce_get_extend_sge(qp, idx & (qp->sge.sge_cnt - 1)); + set_data_seg_v2(dseg, sge); + idx++; + sge++; + cnt--; } + + *sge_ind = idx; } static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, @@ -232,7 +209,7 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_INLINE_S, 1); } else { - if (valid_num_sge <= HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) { + if (valid_num_sge <= HNS_ROCE_SGE_IN_WQE) { for (i = 0; i < wr->num_sge; i++) { if (likely(wr->sg_list[i].length)) { set_data_seg_v2(dseg, wr->sg_list + i); @@ -245,8 +222,8 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, (*sge_ind) & (qp->sge.sge_cnt - 1)); - for (i = 0; i < wr->num_sge && - j < HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE; i++) { + for (i = 0; i < wr->num_sge && j < HNS_ROCE_SGE_IN_WQE; + i++) { if (likely(wr->sg_list[i].length)) { set_data_seg_v2(dseg, wr->sg_list + i); dseg++; @@ -675,7 +652,7 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, } /* rq support inline data */ - if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) { + if (hr_qp->rq_inl_buf.wqe_cnt) { sge_list = hr_qp->rq_inl_buf.wqe_list[wqe_idx].sg_list; hr_qp->rq_inl_buf.wqe_list[wqe_idx].sge_cnt = (u32)wr->num_sge; @@ -3491,29 +3468,18 @@ static void set_qpc_wqe_cnt(struct hns_roce_qp *hr_qp, struct hns_roce_v2_qp_context *context, struct hns_roce_v2_qp_context *qpc_mask) { - if (hr_qp->ibqp.qp_type == IB_QPT_GSI) - roce_set_field(context->byte_4_sqpn_tst, - V2_QPC_BYTE_4_SGE_SHIFT_M, - V2_QPC_BYTE_4_SGE_SHIFT_S, - ilog2((unsigned int)hr_qp->sge.sge_cnt)); - else - roce_set_field(context->byte_4_sqpn_tst, - V2_QPC_BYTE_4_SGE_SHIFT_M, - V2_QPC_BYTE_4_SGE_SHIFT_S, - hr_qp->sq.max_gs > - HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE ? - ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0); + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, V2_QPC_BYTE_4_SGE_SHIFT_S, + to_hr_hem_entries_shift(hr_qp->sge.sge_cnt, + hr_qp->sge.sge_shift)); roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, - ilog2((unsigned int)hr_qp->sq.wqe_cnt)); + ilog2(hr_qp->sq.wqe_cnt)); roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, - (hr_qp->ibqp.qp_type == IB_QPT_XRC_INI || - hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT || - hr_qp->ibqp.srq) ? 0 : - ilog2((unsigned int)hr_qp->rq.wqe_cnt)); + ilog2(hr_qp->rq.wqe_cnt)); } static void modify_qp_reset_to_init(struct ib_qp *ibqp, @@ -3781,17 +3747,16 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_12_sq_hop, V2_QPC_BYTE_12_SQ_HOP_NUM_M, V2_QPC_BYTE_12_SQ_HOP_NUM_S, - hr_dev->caps.wqe_sq_hop_num == HNS_ROCE_HOP_NUM_0 ? - 0 : hr_dev->caps.wqe_sq_hop_num); + to_hr_hem_hopnum(hr_dev->caps.wqe_sq_hop_num, + hr_qp->sq.wqe_cnt)); roce_set_field(qpc_mask->byte_12_sq_hop, V2_QPC_BYTE_12_SQ_HOP_NUM_M, V2_QPC_BYTE_12_SQ_HOP_NUM_S, 0); roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SGE_HOP_NUM_M, V2_QPC_BYTE_20_SGE_HOP_NUM_S, - ((ibqp->qp_type == IB_QPT_GSI) || - hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? - hr_dev->caps.wqe_sge_hop_num : 0); + to_hr_hem_hopnum(hr_dev->caps.wqe_sge_hop_num, + hr_qp->sge.sge_cnt)); roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SGE_HOP_NUM_M, V2_QPC_BYTE_20_SGE_HOP_NUM_S, 0); @@ -3799,8 +3764,9 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQ_HOP_NUM_M, V2_QPC_BYTE_20_RQ_HOP_NUM_S, - hr_dev->caps.wqe_rq_hop_num == HNS_ROCE_HOP_NUM_0 ? - 0 : hr_dev->caps.wqe_rq_hop_num); + to_hr_hem_hopnum(hr_dev->caps.wqe_rq_hop_num, + hr_qp->rq.wqe_cnt)); + roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQ_HOP_NUM_M, V2_QPC_BYTE_20_RQ_HOP_NUM_S, 0); @@ -3977,7 +3943,7 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, return -EINVAL; } - if (hr_qp->sge.offset) { + if (hr_qp->sge.sge_cnt > 0) { page_size = 1 << hr_qp->mtr.hem_cfg.buf_pg_shift; count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->sge.offset / page_size, @@ -4011,15 +3977,12 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0); - context->sq_cur_sge_blk_addr = ((ibqp->qp_type == IB_QPT_GSI) || - hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? - cpu_to_le32(to_hr_hw_page_addr(sge_cur_blk)) : 0; + context->sq_cur_sge_blk_addr = + cpu_to_le32(to_hr_hw_page_addr(sge_cur_blk)); roce_set_field(context->byte_184_irrl_idx, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, - ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > - HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ? - upper_32_bits(to_hr_hw_page_addr(sge_cur_blk)) : 0); + upper_32_bits(to_hr_hw_page_addr(sge_cur_blk))); qpc_mask->sq_cur_sge_blk_addr = 0; roce_set_field(qpc_mask->byte_184_irrl_idx, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index d05d3cb7de39..b5707596148d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -355,16 +355,16 @@ static void free_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) hns_roce_bitmap_free_range(&qp_table->bitmap, hr_qp->qpn, 1, BITMAP_RR); } -static int set_rq_size(struct hns_roce_dev *hr_dev, - struct ib_qp_cap *cap, bool is_user, int has_rq, - struct hns_roce_qp *hr_qp) +static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, + struct hns_roce_qp *hr_qp, int has_rq) { - u32 max_cnt; + u32 cnt; /* If srq exist, set zero for relative number of rq */ if (!has_rq) { hr_qp->rq.wqe_cnt = 0; hr_qp->rq.max_gs = 0; + hr_qp->rq_inl_buf.wqe_cnt = 0; cap->max_recv_wr = 0; cap->max_recv_sge = 0; @@ -379,17 +379,14 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, return -EINVAL; } - max_cnt = max(cap->max_recv_wr, hr_dev->caps.min_wqes); - - hr_qp->rq.wqe_cnt = roundup_pow_of_two(max_cnt); - if ((u32)hr_qp->rq.wqe_cnt > hr_dev->caps.max_wqes) { + cnt = roundup_pow_of_two(max(cap->max_recv_wr, hr_dev->caps.min_wqes)); + if (cnt > hr_dev->caps.max_wqes) { ibdev_err(&hr_dev->ib_dev, "rq depth %u too large\n", cap->max_recv_wr); return -EINVAL; } - max_cnt = max(1U, cap->max_recv_sge); - hr_qp->rq.max_gs = roundup_pow_of_two(max_cnt); + hr_qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); if (hr_dev->caps.max_rq_sg <= HNS_ROCE_SGE_IN_WQE) hr_qp->rq.wqe_shift = ilog2(hr_dev->caps.max_rq_desc_sz); @@ -397,12 +394,61 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, hr_qp->rq.wqe_shift = ilog2(hr_dev->caps.max_rq_desc_sz * hr_qp->rq.max_gs); - cap->max_recv_wr = hr_qp->rq.wqe_cnt; + hr_qp->rq.wqe_cnt = cnt; + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) + hr_qp->rq_inl_buf.wqe_cnt = cnt; + else + hr_qp->rq_inl_buf.wqe_cnt = 0; + + cap->max_recv_wr = cnt; cap->max_recv_sge = hr_qp->rq.max_gs; return 0; } +static int set_extend_sge_param(struct hns_roce_dev *hr_dev, u32 sq_wqe_cnt, + struct hns_roce_qp *hr_qp, + struct ib_qp_cap *cap) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + u32 cnt; + + cnt = max(1U, cap->max_send_sge); + if (hr_dev->hw_rev == HNS_ROCE_HW_VER1) { + hr_qp->sq.max_gs = roundup_pow_of_two(cnt); + hr_qp->sge.sge_cnt = 0; + + return 0; + } + + hr_qp->sq.max_gs = cnt; + + /* UD sqwqe's sge use extend sge */ + if (hr_qp->ibqp.qp_type == IB_QPT_GSI || + hr_qp->ibqp.qp_type == IB_QPT_UD) { + cnt = roundup_pow_of_two(sq_wqe_cnt * hr_qp->sq.max_gs); + } else if (hr_qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) { + cnt = roundup_pow_of_two(sq_wqe_cnt * + (hr_qp->sq.max_gs - HNS_ROCE_SGE_IN_WQE)); + + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_A) { + if (cnt > hr_dev->caps.max_extend_sg) { + ibdev_err(ibdev, + "failed to check exSGE num, exSGE num = %d.\n", + cnt); + return -EINVAL; + } + } + } else { + cnt = 0; + } + + hr_qp->sge.sge_shift = HNS_ROCE_SGE_SHIFT; + hr_qp->sge.sge_cnt = cnt; + + return 0; +} + static int check_sq_size_with_integrity(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, struct hns_roce_ib_create_qp *ucmd) @@ -430,82 +476,27 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, struct hns_roce_qp *hr_qp, struct hns_roce_ib_create_qp *ucmd) { - u32 ex_sge_num; - u32 page_size; - u32 max_cnt; + struct ib_device *ibdev = &hr_dev->ib_dev; + u32 cnt = 0; int ret; - if (check_shl_overflow(1, ucmd->log_sq_bb_count, &hr_qp->sq.wqe_cnt) || - hr_qp->sq.wqe_cnt > hr_dev->caps.max_wqes) + if (check_shl_overflow(1, ucmd->log_sq_bb_count, &cnt) || + cnt > hr_dev->caps.max_wqes) return -EINVAL; ret = check_sq_size_with_integrity(hr_dev, cap, ucmd); if (ret) { - ibdev_err(&hr_dev->ib_dev, "Failed to check user SQ size limit\n"); + ibdev_err(ibdev, "failed to check user SQ size, ret = %d.\n", + ret); return ret; } - hr_qp->sq.wqe_shift = ucmd->log_sq_stride; - - max_cnt = max(1U, cap->max_send_sge); - if (hr_dev->hw_rev == HNS_ROCE_HW_VER1) - hr_qp->sq.max_gs = roundup_pow_of_two(max_cnt); - else - hr_qp->sq.max_gs = max_cnt; - - if (hr_qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) - hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt * - (hr_qp->sq.max_gs - 2)); - - if (hr_qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE && - hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_A) { - if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) { - ibdev_err(&hr_dev->ib_dev, - "Failed to check extended SGE size limit %d\n", - hr_qp->sge.sge_cnt); - return -EINVAL; - } - } - - hr_qp->sge.sge_shift = 4; - ex_sge_num = hr_qp->sge.sge_cnt; + ret = set_extend_sge_param(hr_dev, cnt, hr_qp, cap); + if (ret) + return ret; - /* Get buf size, SQ and RQ are aligned to page_szie */ - if (hr_dev->hw_rev == HNS_ROCE_HW_VER1) { - hr_qp->buff_size = round_up((hr_qp->rq.wqe_cnt << - hr_qp->rq.wqe_shift), PAGE_SIZE) + - round_up((hr_qp->sq.wqe_cnt << - hr_qp->sq.wqe_shift), PAGE_SIZE); - - hr_qp->sq.offset = 0; - hr_qp->rq.offset = round_up((hr_qp->sq.wqe_cnt << - hr_qp->sq.wqe_shift), PAGE_SIZE); - } else { - page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); - hr_qp->sge.sge_cnt = ex_sge_num ? - max(page_size / (1 << hr_qp->sge.sge_shift), ex_sge_num) : 0; - hr_qp->buff_size = round_up((hr_qp->rq.wqe_cnt << - hr_qp->rq.wqe_shift), page_size) + - round_up((hr_qp->sge.sge_cnt << - hr_qp->sge.sge_shift), page_size) + - round_up((hr_qp->sq.wqe_cnt << - hr_qp->sq.wqe_shift), page_size); - - hr_qp->sq.offset = 0; - if (ex_sge_num) { - hr_qp->sge.offset = round_up((hr_qp->sq.wqe_cnt << - hr_qp->sq.wqe_shift), - page_size); - hr_qp->rq.offset = hr_qp->sge.offset + - round_up((hr_qp->sge.sge_cnt << - hr_qp->sge.sge_shift), - page_size); - } else { - hr_qp->rq.offset = round_up((hr_qp->sq.wqe_cnt << - hr_qp->sq.wqe_shift), - page_size); - } - } + hr_qp->sq.wqe_shift = ucmd->log_sq_stride; + hr_qp->sq.wqe_cnt = cnt; return 0; } @@ -514,84 +505,50 @@ static int split_wqe_buf_region(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, struct hns_roce_buf_attr *buf_attr) { - bool is_extend_sge; int buf_size; int idx = 0; - if (hr_qp->buff_size < 1) - return -EINVAL; - - buf_attr->page_shift = PAGE_ADDR_SHIFT + hr_dev->caps.mtt_buf_pg_sz; - buf_attr->fixed_page = true; - buf_attr->region_count = 0; - - if (hr_qp->sge.sge_cnt > 0) - is_extend_sge = true; - else - is_extend_sge = false; + hr_qp->buff_size = 0; /* SQ WQE */ - if (is_extend_sge) - buf_size = hr_qp->sge.offset - hr_qp->sq.offset; - else - buf_size = hr_qp->rq.offset - hr_qp->sq.offset; - + hr_qp->sq.offset = 0; + buf_size = to_hr_hem_entries_size(hr_qp->sq.wqe_cnt, + hr_qp->sq.wqe_shift); if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) { buf_attr->region[idx].size = buf_size; buf_attr->region[idx].hopnum = hr_dev->caps.wqe_sq_hop_num; idx++; + hr_qp->buff_size += buf_size; } - /* extend SGE in SQ WQE */ - buf_size = hr_qp->rq.offset - hr_qp->sge.offset; - if (buf_size > 0 && is_extend_sge && - idx < ARRAY_SIZE(buf_attr->region)) { + /* extend SGE WQE in SQ */ + hr_qp->sge.offset = hr_qp->buff_size; + buf_size = to_hr_hem_entries_size(hr_qp->sge.sge_cnt, + hr_qp->sge.sge_shift); + if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) { buf_attr->region[idx].size = buf_size; - buf_attr->region[idx].hopnum = - hr_dev->caps.wqe_sge_hop_num; + buf_attr->region[idx].hopnum = hr_dev->caps.wqe_sge_hop_num; idx++; + hr_qp->buff_size += buf_size; } /* RQ WQE */ - buf_size = hr_qp->buff_size - hr_qp->rq.offset; + hr_qp->rq.offset = hr_qp->buff_size; + buf_size = to_hr_hem_entries_size(hr_qp->rq.wqe_cnt, + hr_qp->rq.wqe_shift); if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) { buf_attr->region[idx].size = buf_size; buf_attr->region[idx].hopnum = hr_dev->caps.wqe_rq_hop_num; idx++; + hr_qp->buff_size += buf_size; } - buf_attr->region_count = idx; - - return 0; -} - -static int set_extend_sge_param(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp) -{ - struct device *dev = hr_dev->dev; - - if (hr_qp->sq.max_gs > 2) { - hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt * - (hr_qp->sq.max_gs - 2)); - hr_qp->sge.sge_shift = 4; - } - - /* ud sqwqe's sge use extend sge */ - if (hr_dev->hw_rev != HNS_ROCE_HW_VER1 && - hr_qp->ibqp.qp_type == IB_QPT_GSI) { - hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt * - hr_qp->sq.max_gs); - hr_qp->sge.sge_shift = 4; - } + if (hr_qp->buff_size < 1) + return -EINVAL; - if (hr_qp->sq.max_gs > 2 && - hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08_A) { - if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) { - dev_err(dev, "The extended sge cnt error! sge_cnt=%d\n", - hr_qp->sge.sge_cnt); - return -EINVAL; - } - } + buf_attr->page_shift = PAGE_ADDR_SHIFT + hr_dev->caps.mtt_buf_pg_sz; + buf_attr->fixed_page = true; + buf_attr->region_count = idx; return 0; } @@ -599,62 +556,35 @@ static int set_extend_sge_param(struct hns_roce_dev *hr_dev, static int set_kernel_sq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, struct hns_roce_qp *hr_qp) { - u32 page_size; - u32 max_cnt; - int size; + struct ib_device *ibdev = &hr_dev->ib_dev; + u32 cnt; int ret; if (!cap->max_send_wr || cap->max_send_wr > hr_dev->caps.max_wqes || cap->max_send_sge > hr_dev->caps.max_sq_sg || cap->max_inline_data > hr_dev->caps.max_sq_inline) { - ibdev_err(&hr_dev->ib_dev, - "SQ WR or sge or inline data error!\n"); + ibdev_err(ibdev, + "failed to check SQ WR, SGE or inline num, ret = %d.\n", + -EINVAL); return -EINVAL; } - hr_qp->sq.wqe_shift = ilog2(hr_dev->caps.max_sq_desc_sz); - - max_cnt = max(cap->max_send_wr, hr_dev->caps.min_wqes); - - hr_qp->sq.wqe_cnt = roundup_pow_of_two(max_cnt); - if ((u32)hr_qp->sq.wqe_cnt > hr_dev->caps.max_wqes) { - ibdev_err(&hr_dev->ib_dev, - "while setting kernel sq size, sq.wqe_cnt too large\n"); + cnt = roundup_pow_of_two(max(cap->max_send_wr, hr_dev->caps.min_wqes)); + if (cnt > hr_dev->caps.max_wqes) { + ibdev_err(ibdev, "failed to check WQE num, WQE num = %d.\n", + cnt); return -EINVAL; } - /* Get data_seg numbers */ - max_cnt = max(1U, cap->max_send_sge); - if (hr_dev->hw_rev == HNS_ROCE_HW_VER1) - hr_qp->sq.max_gs = roundup_pow_of_two(max_cnt); - else - hr_qp->sq.max_gs = max_cnt; + hr_qp->sq.wqe_shift = ilog2(hr_dev->caps.max_sq_desc_sz); + hr_qp->sq.wqe_cnt = cnt; - ret = set_extend_sge_param(hr_dev, hr_qp); - if (ret) { - ibdev_err(&hr_dev->ib_dev, "set extend sge parameters fail\n"); + ret = set_extend_sge_param(hr_dev, cnt, hr_qp, cap); + if (ret) return ret; - } - /* Get buf size, SQ and RQ are aligned to PAGE_SIZE */ - page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); - hr_qp->sq.offset = 0; - size = round_up(hr_qp->sq.wqe_cnt << hr_qp->sq.wqe_shift, page_size); - - if (hr_dev->hw_rev != HNS_ROCE_HW_VER1 && hr_qp->sge.sge_cnt) { - hr_qp->sge.sge_cnt = max(page_size/(1 << hr_qp->sge.sge_shift), - (u32)hr_qp->sge.sge_cnt); - hr_qp->sge.offset = size; - size += round_up(hr_qp->sge.sge_cnt << hr_qp->sge.sge_shift, - page_size); - } - - hr_qp->rq.offset = size; - size += round_up((hr_qp->rq.wqe_cnt << hr_qp->rq.wqe_shift), page_size); - hr_qp->buff_size = size; - - /* Get wr and sge number which send */ - cap->max_send_wr = hr_qp->sq.wqe_cnt; + /* sync the parameters of kernel QP to user's configuration */ + cap->max_send_wr = cnt; cap->max_send_sge = hr_qp->sq.max_gs; /* We don't support inline sends for kernel QPs (yet) */ @@ -685,8 +615,8 @@ static int alloc_rq_inline_buf(struct hns_roce_qp *hr_qp, struct ib_qp_init_attr *init_attr) { u32 max_recv_sge = init_attr->cap.max_recv_sge; + u32 wqe_cnt = hr_qp->rq_inl_buf.wqe_cnt; struct hns_roce_rinl_wqe *wqe_list; - u32 wqe_cnt = hr_qp->rq.wqe_cnt; int i; /* allocate recv inline buf */ @@ -708,7 +638,6 @@ static int alloc_rq_inline_buf(struct hns_roce_qp *hr_qp, wqe_list[i].sg_list = &wqe_list[0].sg_list[i * max_recv_sge]; hr_qp->rq_inl_buf.wqe_list = wqe_list; - hr_qp->rq_inl_buf.wqe_cnt = wqe_cnt; return 0; @@ -721,7 +650,8 @@ err: static void free_rq_inline_buf(struct hns_roce_qp *hr_qp) { - kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list); + if (hr_qp->rq_inl_buf.wqe_list) + kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list); kfree(hr_qp->rq_inl_buf.wqe_list); } @@ -731,36 +661,36 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, { struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_buf_attr buf_attr = {}; - bool is_rq_buf_inline; int ret; - is_rq_buf_inline = (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) && - hns_roce_qp_has_rq(init_attr); - if (is_rq_buf_inline) { + if (!udata && hr_qp->rq_inl_buf.wqe_cnt) { ret = alloc_rq_inline_buf(hr_qp, init_attr); if (ret) { - ibdev_err(ibdev, "Failed to alloc inline RQ buffer\n"); + ibdev_err(ibdev, + "failed to alloc inline buf, ret = %d.\n", + ret); return ret; } + } else { + hr_qp->rq_inl_buf.wqe_list = NULL; } ret = split_wqe_buf_region(hr_dev, hr_qp, &buf_attr); if (ret) { - ibdev_err(ibdev, "Failed to split WQE buf, ret %d\n", ret); + ibdev_err(ibdev, "failed to split WQE buf, ret = %d.\n", ret); goto err_inline; } ret = hns_roce_mtr_create(hr_dev, &hr_qp->mtr, &buf_attr, PAGE_ADDR_SHIFT + hr_dev->caps.mtt_ba_pg_sz, udata, addr); if (ret) { - ibdev_err(ibdev, "Failed to create WQE mtr, ret %d\n", ret); + ibdev_err(ibdev, "failed to create WQE mtr, ret = %d.\n", ret); goto err_inline; } return 0; err_inline: - if (is_rq_buf_inline) - free_rq_inline_buf(hr_qp); + free_rq_inline_buf(hr_qp); return ret; } @@ -768,9 +698,7 @@ err_inline: static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp) { hns_roce_mtr_destroy(hr_dev, &hr_qp->mtr); - if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) && - hr_qp->rq.wqe_cnt) - free_rq_inline_buf(hr_qp); + free_rq_inline_buf(hr_qp); } static inline bool user_qp_has_sdb(struct hns_roce_dev *hr_dev, @@ -935,10 +863,11 @@ static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, else hr_qp->sq_signal_bits = IB_SIGNAL_REQ_WR; - ret = set_rq_size(hr_dev, &init_attr->cap, udata, - hns_roce_qp_has_rq(init_attr), hr_qp); + ret = set_rq_size(hr_dev, &init_attr->cap, hr_qp, + hns_roce_qp_has_rq(init_attr)); if (ret) { - ibdev_err(ibdev, "Failed to set user RQ size\n"); + ibdev_err(ibdev, "failed to set user RQ size, ret = %d.\n", + ret); return ret; } -- cgit v1.2.3 From ffb1308b88b6023205249f11332ebff83610899a Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Tue, 28 Apr 2020 19:03:42 +0800 Subject: RDMA/hns: Move SRQ code to the reasonable place Just move the SRQ related code to more reasonable place, and unify format of some prints. Link: https://lore.kernel.org/r/1588071823-40200-5-git-send-email-liweihang@huawei.com Signed-off-by: Yixian Liu Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 795 ++++++++++++++--------------- 1 file changed, 396 insertions(+), 399 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 158b8317f604..0b79dafe919e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -694,6 +694,129 @@ out: return ret; } +static void *get_srq_wqe(struct hns_roce_srq *srq, int n) +{ + return hns_roce_buf_offset(srq->buf_mtr.kmem, n << srq->wqe_shift); +} + +static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, int wqe_index) +{ + /* always called with interrupts disabled. */ + spin_lock(&srq->lock); + + bitmap_clear(srq->idx_que.bitmap, wqe_index, 1); + srq->tail++; + + spin_unlock(&srq->lock); +} + +static int find_empty_entry(struct hns_roce_idx_que *idx_que, + unsigned long size) +{ + int wqe_idx; + + if (unlikely(bitmap_full(idx_que->bitmap, size))) + return -ENOSPC; + + wqe_idx = find_first_zero_bit(idx_que->bitmap, size); + + bitmap_set(idx_que->bitmap, wqe_idx, 1); + + return wqe_idx; +} + +static void fill_idx_queue(struct hns_roce_idx_que *idx_que, + int cur_idx, int wqe_idx) +{ + unsigned int *addr; + + addr = (unsigned int *)hns_roce_buf_offset(idx_que->mtr.kmem, + cur_idx * idx_que->entry_sz); + *addr = wqe_idx; +} + +static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device); + struct hns_roce_srq *srq = to_hr_srq(ibsrq); + struct hns_roce_v2_wqe_data_seg *dseg; + struct hns_roce_v2_db srq_db; + unsigned long flags; + int ret = 0; + int wqe_idx; + void *wqe; + int nreq; + int ind; + int i; + + spin_lock_irqsave(&srq->lock, flags); + + ind = srq->head & (srq->wqe_cnt - 1); + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely(wr->num_sge >= srq->max_gs)) { + ret = -EINVAL; + *bad_wr = wr; + break; + } + + if (unlikely(srq->head == srq->tail)) { + ret = -ENOMEM; + *bad_wr = wr; + break; + } + + wqe_idx = find_empty_entry(&srq->idx_que, srq->wqe_cnt); + if (wqe_idx < 0) { + ret = -ENOMEM; + *bad_wr = wr; + break; + } + + fill_idx_queue(&srq->idx_que, ind, wqe_idx); + wqe = get_srq_wqe(srq, wqe_idx); + dseg = (struct hns_roce_v2_wqe_data_seg *)wqe; + + for (i = 0; i < wr->num_sge; ++i) { + dseg[i].len = cpu_to_le32(wr->sg_list[i].length); + dseg[i].lkey = cpu_to_le32(wr->sg_list[i].lkey); + dseg[i].addr = cpu_to_le64(wr->sg_list[i].addr); + } + + if (i < srq->max_gs) { + dseg[i].len = 0; + dseg[i].lkey = cpu_to_le32(0x100); + dseg[i].addr = 0; + } + + srq->wrid[wqe_idx] = wr->wr_id; + ind = (ind + 1) & (srq->wqe_cnt - 1); + } + + if (likely(nreq)) { + srq->head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + srq_db.byte_4 = + cpu_to_le32(HNS_ROCE_V2_SRQ_DB << V2_DB_BYTE_4_CMD_S | + (srq->srqn & V2_DB_BYTE_4_TAG_M)); + srq_db.parameter = cpu_to_le32(srq->head); + + hns_roce_write64(hr_dev, (__le32 *)&srq_db, srq->db_reg_l); + } + + spin_unlock_irqrestore(&srq->lock, flags); + + return ret; +} + static int hns_roce_v2_cmd_hw_reseted(struct hns_roce_dev *hr_dev, unsigned long instance_stage, unsigned long reset_stage) @@ -2667,22 +2790,6 @@ static struct hns_roce_v2_cqe *next_cqe_sw_v2(struct hns_roce_cq *hr_cq) return get_sw_cqe_v2(hr_cq, hr_cq->cons_index); } -static void *get_srq_wqe(struct hns_roce_srq *srq, int n) -{ - return hns_roce_buf_offset(srq->buf_mtr.kmem, n << srq->wqe_shift); -} - -static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, int wqe_index) -{ - /* always called with interrupts disabled. */ - spin_lock(&srq->lock); - - bitmap_clear(srq->idx_que.bitmap, wqe_index, 1); - srq->tail++; - - spin_unlock(&srq->lock); -} - static void hns_roce_v2_cq_set_ci(struct hns_roce_cq *hr_cq, u32 cons_index) { *hr_cq->set_ci_db = cons_index & V2_CQ_DB_PARAMETER_CONS_IDX_M; @@ -4777,108 +4884,288 @@ out: return ret; } -static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +static void hns_roce_v2_write_srqc(struct hns_roce_dev *hr_dev, + struct hns_roce_srq *srq, u32 pdn, u16 xrcd, + u32 cqn, void *mb_buf, u64 *mtts_wqe, + u64 *mtts_idx, dma_addr_t dma_handle_wqe, + dma_addr_t dma_handle_idx) { - struct hns_roce_dev *hr_dev = to_hr_dev(cq->device); - struct hns_roce_v2_cq_context *cq_context; - struct hns_roce_cq *hr_cq = to_hr_cq(cq); - struct hns_roce_v2_cq_context *cqc_mask; - struct hns_roce_cmd_mailbox *mailbox; - int ret; - - mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); - if (IS_ERR(mailbox)) - return PTR_ERR(mailbox); + struct hns_roce_srq_context *srq_context; - cq_context = mailbox->buf; - cqc_mask = (struct hns_roce_v2_cq_context *)mailbox->buf + 1; + srq_context = mb_buf; + memset(srq_context, 0, sizeof(*srq_context)); - memset(cqc_mask, 0xff, sizeof(*cqc_mask)); + roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQ_ST_M, + SRQC_BYTE_4_SRQ_ST_S, 1); - roce_set_field(cq_context->byte_56_cqe_period_maxcnt, - V2_CQC_BYTE_56_CQ_MAX_CNT_M, V2_CQC_BYTE_56_CQ_MAX_CNT_S, - cq_count); - roce_set_field(cqc_mask->byte_56_cqe_period_maxcnt, - V2_CQC_BYTE_56_CQ_MAX_CNT_M, V2_CQC_BYTE_56_CQ_MAX_CNT_S, - 0); - roce_set_field(cq_context->byte_56_cqe_period_maxcnt, - V2_CQC_BYTE_56_CQ_PERIOD_M, V2_CQC_BYTE_56_CQ_PERIOD_S, - cq_period); - roce_set_field(cqc_mask->byte_56_cqe_period_maxcnt, - V2_CQC_BYTE_56_CQ_PERIOD_M, V2_CQC_BYTE_56_CQ_PERIOD_S, - 0); + roce_set_field(srq_context->byte_4_srqn_srqst, + SRQC_BYTE_4_SRQ_WQE_HOP_NUM_M, + SRQC_BYTE_4_SRQ_WQE_HOP_NUM_S, + (hr_dev->caps.srqwqe_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : + hr_dev->caps.srqwqe_hop_num)); + roce_set_field(srq_context->byte_4_srqn_srqst, + SRQC_BYTE_4_SRQ_SHIFT_M, SRQC_BYTE_4_SRQ_SHIFT_S, + ilog2(srq->wqe_cnt)); - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_cq->cqn, 1, - HNS_ROCE_CMD_MODIFY_CQC, - HNS_ROCE_CMD_TIMEOUT_MSECS); - hns_roce_free_cmd_mailbox(hr_dev, mailbox); - if (ret) - ibdev_err(&hr_dev->ib_dev, - "failed to process cmd when modifying CQ, ret = %d\n", - ret); + roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQN_M, + SRQC_BYTE_4_SRQN_S, srq->srqn); - return ret; -} + roce_set_field(srq_context->byte_8_limit_wl, SRQC_BYTE_8_SRQ_LIMIT_WL_M, + SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0); -static void hns_roce_irq_work_handle(struct work_struct *work) -{ - struct hns_roce_work *irq_work = - container_of(work, struct hns_roce_work, work); - struct ib_device *ibdev = &irq_work->hr_dev->ib_dev; - u32 qpn = irq_work->qpn; - u32 cqn = irq_work->cqn; + roce_set_field(srq_context->byte_12_xrcd, SRQC_BYTE_12_SRQ_XRCD_M, + SRQC_BYTE_12_SRQ_XRCD_S, xrcd); - switch (irq_work->event_type) { - case HNS_ROCE_EVENT_TYPE_PATH_MIG: - ibdev_info(ibdev, "Path migrated succeeded.\n"); - break; - case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: - ibdev_warn(ibdev, "Path migration failed.\n"); - break; - case HNS_ROCE_EVENT_TYPE_COMM_EST: - break; - case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: - ibdev_warn(ibdev, "Send queue drained.\n"); - break; - case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: - ibdev_err(ibdev, "Local work queue 0x%x catast error, sub_event type is: %d\n", - qpn, irq_work->sub_type); - break; - case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: - ibdev_err(ibdev, "Invalid request local work queue 0x%x error.\n", - qpn); - break; - case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: - ibdev_err(ibdev, "Local access violation work queue 0x%x error, sub_event type is: %d\n", - qpn, irq_work->sub_type); - break; - case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: - ibdev_warn(ibdev, "SRQ limit reach.\n"); - break; - case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: - ibdev_warn(ibdev, "SRQ last wqe reach.\n"); - break; - case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: - ibdev_err(ibdev, "SRQ catas error.\n"); - break; - case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: - ibdev_err(ibdev, "CQ 0x%x access err.\n", cqn); - break; - case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - ibdev_warn(ibdev, "CQ 0x%x overflow\n", cqn); - break; - case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: - ibdev_warn(ibdev, "DB overflow.\n"); - break; - case HNS_ROCE_EVENT_TYPE_FLR: - ibdev_warn(ibdev, "Function level reset.\n"); - break; - default: - break; - } + srq_context->wqe_bt_ba = cpu_to_le32((u32)(dma_handle_wqe >> 3)); - kfree(irq_work); -} + roce_set_field(srq_context->byte_24_wqe_bt_ba, + SRQC_BYTE_24_SRQ_WQE_BT_BA_M, + SRQC_BYTE_24_SRQ_WQE_BT_BA_S, + dma_handle_wqe >> 35); + + roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_PD_M, + SRQC_BYTE_28_PD_S, pdn); + roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_RQWS_M, + SRQC_BYTE_28_RQWS_S, srq->max_gs <= 0 ? 0 : + fls(srq->max_gs - 1)); + + srq_context->idx_bt_ba = cpu_to_le32(dma_handle_idx >> 3); + roce_set_field(srq_context->rsv_idx_bt_ba, + SRQC_BYTE_36_SRQ_IDX_BT_BA_M, + SRQC_BYTE_36_SRQ_IDX_BT_BA_S, + dma_handle_idx >> 35); + + srq_context->idx_cur_blk_addr = + cpu_to_le32(to_hr_hw_page_addr(mtts_idx[0])); + roce_set_field(srq_context->byte_44_idxbufpgsz_addr, + SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_M, + SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_S, + upper_32_bits(to_hr_hw_page_addr(mtts_idx[0]))); + roce_set_field(srq_context->byte_44_idxbufpgsz_addr, + SRQC_BYTE_44_SRQ_IDX_HOP_NUM_M, + SRQC_BYTE_44_SRQ_IDX_HOP_NUM_S, + hr_dev->caps.idx_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : + hr_dev->caps.idx_hop_num); + + roce_set_field(srq_context->byte_44_idxbufpgsz_addr, + SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M, + SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_S, + to_hr_hw_page_shift(srq->idx_que.mtr.hem_cfg.ba_pg_shift)); + roce_set_field(srq_context->byte_44_idxbufpgsz_addr, + SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_M, + SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_S, + to_hr_hw_page_shift(srq->idx_que.mtr.hem_cfg.buf_pg_shift)); + + srq_context->idx_nxt_blk_addr = + cpu_to_le32(to_hr_hw_page_addr(mtts_idx[1])); + roce_set_field(srq_context->rsv_idxnxtblkaddr, + SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_M, + SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_S, + upper_32_bits(to_hr_hw_page_addr(mtts_idx[1]))); + roce_set_field(srq_context->byte_56_xrc_cqn, + SRQC_BYTE_56_SRQ_XRC_CQN_M, SRQC_BYTE_56_SRQ_XRC_CQN_S, + cqn); + roce_set_field(srq_context->byte_56_xrc_cqn, + SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_M, + SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_S, + to_hr_hw_page_shift(srq->buf_mtr.hem_cfg.ba_pg_shift)); + roce_set_field(srq_context->byte_56_xrc_cqn, + SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_M, + SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_S, + to_hr_hw_page_shift(srq->buf_mtr.hem_cfg.buf_pg_shift)); + + roce_set_bit(srq_context->db_record_addr_record_en, + SRQC_BYTE_60_SRQ_RECORD_EN_S, 0); +} + +static int hns_roce_v2_modify_srq(struct ib_srq *ibsrq, + struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device); + struct hns_roce_srq *srq = to_hr_srq(ibsrq); + struct hns_roce_srq_context *srq_context; + struct hns_roce_srq_context *srqc_mask; + struct hns_roce_cmd_mailbox *mailbox; + int ret; + + if (srq_attr_mask & IB_SRQ_LIMIT) { + if (srq_attr->srq_limit >= srq->wqe_cnt) + return -EINVAL; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + srq_context = mailbox->buf; + srqc_mask = (struct hns_roce_srq_context *)mailbox->buf + 1; + + memset(srqc_mask, 0xff, sizeof(*srqc_mask)); + + roce_set_field(srq_context->byte_8_limit_wl, + SRQC_BYTE_8_SRQ_LIMIT_WL_M, + SRQC_BYTE_8_SRQ_LIMIT_WL_S, srq_attr->srq_limit); + roce_set_field(srqc_mask->byte_8_limit_wl, + SRQC_BYTE_8_SRQ_LIMIT_WL_M, + SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0); + + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, srq->srqn, 0, + HNS_ROCE_CMD_MODIFY_SRQC, + HNS_ROCE_CMD_TIMEOUT_MSECS); + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + if (ret) { + ibdev_err(&hr_dev->ib_dev, + "failed to handle cmd of modifying SRQ, ret = %d.\n", + ret); + return ret; + } + } + + return 0; +} + +static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device); + struct hns_roce_srq *srq = to_hr_srq(ibsrq); + struct hns_roce_srq_context *srq_context; + struct hns_roce_cmd_mailbox *mailbox; + int limit_wl; + int ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + srq_context = mailbox->buf; + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, srq->srqn, 0, + HNS_ROCE_CMD_QUERY_SRQC, + HNS_ROCE_CMD_TIMEOUT_MSECS); + if (ret) { + ibdev_err(&hr_dev->ib_dev, + "failed to process cmd of querying SRQ, ret = %d.\n", + ret); + goto out; + } + + limit_wl = roce_get_field(srq_context->byte_8_limit_wl, + SRQC_BYTE_8_SRQ_LIMIT_WL_M, + SRQC_BYTE_8_SRQ_LIMIT_WL_S); + + attr->srq_limit = limit_wl; + attr->max_wr = srq->wqe_cnt; + attr->max_sge = srq->max_gs; + + memcpy(srq_context, mailbox->buf, sizeof(*srq_context)); + +out: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + return ret; +} + +static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(cq->device); + struct hns_roce_v2_cq_context *cq_context; + struct hns_roce_cq *hr_cq = to_hr_cq(cq); + struct hns_roce_v2_cq_context *cqc_mask; + struct hns_roce_cmd_mailbox *mailbox; + int ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + cq_context = mailbox->buf; + cqc_mask = (struct hns_roce_v2_cq_context *)mailbox->buf + 1; + + memset(cqc_mask, 0xff, sizeof(*cqc_mask)); + + roce_set_field(cq_context->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_MAX_CNT_M, V2_CQC_BYTE_56_CQ_MAX_CNT_S, + cq_count); + roce_set_field(cqc_mask->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_MAX_CNT_M, V2_CQC_BYTE_56_CQ_MAX_CNT_S, + 0); + roce_set_field(cq_context->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_PERIOD_M, V2_CQC_BYTE_56_CQ_PERIOD_S, + cq_period); + roce_set_field(cqc_mask->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_PERIOD_M, V2_CQC_BYTE_56_CQ_PERIOD_S, + 0); + + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_cq->cqn, 1, + HNS_ROCE_CMD_MODIFY_CQC, + HNS_ROCE_CMD_TIMEOUT_MSECS); + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + if (ret) + ibdev_err(&hr_dev->ib_dev, + "failed to process cmd when modifying CQ, ret = %d\n", + ret); + + return ret; +} + +static void hns_roce_irq_work_handle(struct work_struct *work) +{ + struct hns_roce_work *irq_work = + container_of(work, struct hns_roce_work, work); + struct ib_device *ibdev = &irq_work->hr_dev->ib_dev; + u32 qpn = irq_work->qpn; + u32 cqn = irq_work->cqn; + + switch (irq_work->event_type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + ibdev_info(ibdev, "Path migrated succeeded.\n"); + break; + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + ibdev_warn(ibdev, "Path migration failed.\n"); + break; + case HNS_ROCE_EVENT_TYPE_COMM_EST: + break; + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + ibdev_warn(ibdev, "Send queue drained.\n"); + break; + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + ibdev_err(ibdev, "Local work queue 0x%x catast error, sub_event type is: %d\n", + qpn, irq_work->sub_type); + break; + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + ibdev_err(ibdev, "Invalid request local work queue 0x%x error.\n", + qpn); + break; + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + ibdev_err(ibdev, "Local access violation work queue 0x%x error, sub_event type is: %d\n", + qpn, irq_work->sub_type); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: + ibdev_warn(ibdev, "SRQ limit reach.\n"); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + ibdev_warn(ibdev, "SRQ last wqe reach.\n"); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: + ibdev_err(ibdev, "SRQ catas error.\n"); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + ibdev_err(ibdev, "CQ 0x%x access err.\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + ibdev_warn(ibdev, "CQ 0x%x overflow\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: + ibdev_warn(ibdev, "DB overflow.\n"); + break; + case HNS_ROCE_EVENT_TYPE_FLR: + ibdev_warn(ibdev, "Function level reset.\n"); + break; + default: + break; + } + + kfree(irq_work); +} static void hns_roce_v2_init_irq_work(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq, @@ -5588,296 +5875,6 @@ static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev) destroy_workqueue(hr_dev->irq_workq); } -static void hns_roce_v2_write_srqc(struct hns_roce_dev *hr_dev, - struct hns_roce_srq *srq, u32 pdn, u16 xrcd, - u32 cqn, void *mb_buf, u64 *mtts_wqe, - u64 *mtts_idx, dma_addr_t dma_handle_wqe, - dma_addr_t dma_handle_idx) -{ - struct hns_roce_srq_context *srq_context; - - srq_context = mb_buf; - memset(srq_context, 0, sizeof(*srq_context)); - - roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQ_ST_M, - SRQC_BYTE_4_SRQ_ST_S, 1); - - roce_set_field(srq_context->byte_4_srqn_srqst, - SRQC_BYTE_4_SRQ_WQE_HOP_NUM_M, - SRQC_BYTE_4_SRQ_WQE_HOP_NUM_S, - (hr_dev->caps.srqwqe_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : - hr_dev->caps.srqwqe_hop_num)); - roce_set_field(srq_context->byte_4_srqn_srqst, - SRQC_BYTE_4_SRQ_SHIFT_M, SRQC_BYTE_4_SRQ_SHIFT_S, - ilog2(srq->wqe_cnt)); - - roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQN_M, - SRQC_BYTE_4_SRQN_S, srq->srqn); - - roce_set_field(srq_context->byte_8_limit_wl, SRQC_BYTE_8_SRQ_LIMIT_WL_M, - SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0); - - roce_set_field(srq_context->byte_12_xrcd, SRQC_BYTE_12_SRQ_XRCD_M, - SRQC_BYTE_12_SRQ_XRCD_S, xrcd); - - srq_context->wqe_bt_ba = cpu_to_le32((u32)(dma_handle_wqe >> 3)); - - roce_set_field(srq_context->byte_24_wqe_bt_ba, - SRQC_BYTE_24_SRQ_WQE_BT_BA_M, - SRQC_BYTE_24_SRQ_WQE_BT_BA_S, - dma_handle_wqe >> 35); - - roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_PD_M, - SRQC_BYTE_28_PD_S, pdn); - roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_RQWS_M, - SRQC_BYTE_28_RQWS_S, srq->max_gs <= 0 ? 0 : - fls(srq->max_gs - 1)); - - srq_context->idx_bt_ba = cpu_to_le32(dma_handle_idx >> 3); - roce_set_field(srq_context->rsv_idx_bt_ba, - SRQC_BYTE_36_SRQ_IDX_BT_BA_M, - SRQC_BYTE_36_SRQ_IDX_BT_BA_S, - dma_handle_idx >> 35); - - srq_context->idx_cur_blk_addr = - cpu_to_le32(to_hr_hw_page_addr(mtts_idx[0])); - roce_set_field(srq_context->byte_44_idxbufpgsz_addr, - SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_M, - SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_S, - upper_32_bits(to_hr_hw_page_addr(mtts_idx[0]))); - roce_set_field(srq_context->byte_44_idxbufpgsz_addr, - SRQC_BYTE_44_SRQ_IDX_HOP_NUM_M, - SRQC_BYTE_44_SRQ_IDX_HOP_NUM_S, - hr_dev->caps.idx_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : - hr_dev->caps.idx_hop_num); - - roce_set_field( - srq_context->byte_44_idxbufpgsz_addr, - SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M, - SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_S, - to_hr_hw_page_shift(srq->idx_que.mtr.hem_cfg.ba_pg_shift)); - roce_set_field( - srq_context->byte_44_idxbufpgsz_addr, - SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_M, - SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_S, - to_hr_hw_page_shift(srq->idx_que.mtr.hem_cfg.buf_pg_shift)); - - srq_context->idx_nxt_blk_addr = - cpu_to_le32(to_hr_hw_page_addr(mtts_idx[1])); - roce_set_field(srq_context->rsv_idxnxtblkaddr, - SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_M, - SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_S, - upper_32_bits(to_hr_hw_page_addr(mtts_idx[1]))); - roce_set_field(srq_context->byte_56_xrc_cqn, - SRQC_BYTE_56_SRQ_XRC_CQN_M, SRQC_BYTE_56_SRQ_XRC_CQN_S, - cqn); - roce_set_field(srq_context->byte_56_xrc_cqn, - SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_M, - SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_S, - to_hr_hw_page_shift(srq->buf_mtr.hem_cfg.ba_pg_shift)); - roce_set_field(srq_context->byte_56_xrc_cqn, - SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_M, - SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_S, - to_hr_hw_page_shift(srq->buf_mtr.hem_cfg.buf_pg_shift)); - - roce_set_bit(srq_context->db_record_addr_record_en, - SRQC_BYTE_60_SRQ_RECORD_EN_S, 0); -} - -static int hns_roce_v2_modify_srq(struct ib_srq *ibsrq, - struct ib_srq_attr *srq_attr, - enum ib_srq_attr_mask srq_attr_mask, - struct ib_udata *udata) -{ - struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device); - struct hns_roce_srq *srq = to_hr_srq(ibsrq); - struct hns_roce_srq_context *srq_context; - struct hns_roce_srq_context *srqc_mask; - struct hns_roce_cmd_mailbox *mailbox; - int ret; - - if (srq_attr_mask & IB_SRQ_LIMIT) { - if (srq_attr->srq_limit >= srq->wqe_cnt) - return -EINVAL; - - mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); - if (IS_ERR(mailbox)) - return PTR_ERR(mailbox); - - srq_context = mailbox->buf; - srqc_mask = (struct hns_roce_srq_context *)mailbox->buf + 1; - - memset(srqc_mask, 0xff, sizeof(*srqc_mask)); - - roce_set_field(srq_context->byte_8_limit_wl, - SRQC_BYTE_8_SRQ_LIMIT_WL_M, - SRQC_BYTE_8_SRQ_LIMIT_WL_S, srq_attr->srq_limit); - roce_set_field(srqc_mask->byte_8_limit_wl, - SRQC_BYTE_8_SRQ_LIMIT_WL_M, - SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0); - - ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, srq->srqn, 0, - HNS_ROCE_CMD_MODIFY_SRQC, - HNS_ROCE_CMD_TIMEOUT_MSECS); - hns_roce_free_cmd_mailbox(hr_dev, mailbox); - if (ret) { - ibdev_err(&hr_dev->ib_dev, - "failed to process cmd when modifying SRQ, ret = %d\n", - ret); - return ret; - } - } - - return 0; -} - -static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) -{ - struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device); - struct hns_roce_srq *srq = to_hr_srq(ibsrq); - struct hns_roce_srq_context *srq_context; - struct hns_roce_cmd_mailbox *mailbox; - int limit_wl; - int ret; - - mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); - if (IS_ERR(mailbox)) - return PTR_ERR(mailbox); - - srq_context = mailbox->buf; - ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, srq->srqn, 0, - HNS_ROCE_CMD_QUERY_SRQC, - HNS_ROCE_CMD_TIMEOUT_MSECS); - if (ret) { - ibdev_err(&hr_dev->ib_dev, - "failed to process cmd when querying SRQ, ret = %d\n", - ret); - goto out; - } - - limit_wl = roce_get_field(srq_context->byte_8_limit_wl, - SRQC_BYTE_8_SRQ_LIMIT_WL_M, - SRQC_BYTE_8_SRQ_LIMIT_WL_S); - - attr->srq_limit = limit_wl; - attr->max_wr = srq->wqe_cnt - 1; - attr->max_sge = srq->max_gs; - - memcpy(srq_context, mailbox->buf, sizeof(*srq_context)); - -out: - hns_roce_free_cmd_mailbox(hr_dev, mailbox); - return ret; -} - -static int find_empty_entry(struct hns_roce_idx_que *idx_que, - unsigned long size) -{ - int wqe_idx; - - if (unlikely(bitmap_full(idx_que->bitmap, size))) - return -ENOSPC; - - wqe_idx = find_first_zero_bit(idx_que->bitmap, size); - - bitmap_set(idx_que->bitmap, wqe_idx, 1); - - return wqe_idx; -} - -static void fill_idx_queue(struct hns_roce_idx_que *idx_que, - int cur_idx, int wqe_idx) -{ - unsigned int *addr; - - addr = (unsigned int *)hns_roce_buf_offset(idx_que->mtr.kmem, - cur_idx * idx_que->entry_sz); - *addr = wqe_idx; -} - -static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, - const struct ib_recv_wr *wr, - const struct ib_recv_wr **bad_wr) -{ - struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device); - struct hns_roce_srq *srq = to_hr_srq(ibsrq); - struct hns_roce_v2_wqe_data_seg *dseg; - struct hns_roce_v2_db srq_db; - unsigned long flags; - int ret = 0; - int wqe_idx; - void *wqe; - int nreq; - int ind; - int i; - - spin_lock_irqsave(&srq->lock, flags); - - ind = srq->head & (srq->wqe_cnt - 1); - - for (nreq = 0; wr; ++nreq, wr = wr->next) { - if (unlikely(wr->num_sge > srq->max_gs)) { - ret = -EINVAL; - *bad_wr = wr; - break; - } - - if (unlikely(srq->head == srq->tail)) { - ret = -ENOMEM; - *bad_wr = wr; - break; - } - - wqe_idx = find_empty_entry(&srq->idx_que, srq->wqe_cnt); - if (wqe_idx < 0) { - ret = -ENOMEM; - *bad_wr = wr; - break; - } - - fill_idx_queue(&srq->idx_que, ind, wqe_idx); - wqe = get_srq_wqe(srq, wqe_idx); - dseg = (struct hns_roce_v2_wqe_data_seg *)wqe; - - for (i = 0; i < wr->num_sge; ++i) { - dseg[i].len = cpu_to_le32(wr->sg_list[i].length); - dseg[i].lkey = cpu_to_le32(wr->sg_list[i].lkey); - dseg[i].addr = cpu_to_le64(wr->sg_list[i].addr); - } - - if (i < srq->max_gs) { - dseg[i].len = 0; - dseg[i].lkey = cpu_to_le32(0x100); - dseg[i].addr = 0; - } - - srq->wrid[wqe_idx] = wr->wr_id; - ind = (ind + 1) & (srq->wqe_cnt - 1); - } - - if (likely(nreq)) { - srq->head += nreq; - - /* - * Make sure that descriptors are written before - * doorbell record. - */ - wmb(); - - srq_db.byte_4 = - cpu_to_le32(HNS_ROCE_V2_SRQ_DB << V2_DB_BYTE_4_CMD_S | - (srq->srqn & V2_DB_BYTE_4_TAG_M)); - srq_db.parameter = cpu_to_le32(srq->head); - - hns_roce_write64(hr_dev, (__le32 *)&srq_db, srq->db_reg_l); - - } - - spin_unlock_irqrestore(&srq->lock, flags); - - return ret; -} - static const struct hns_roce_dfx_hw hns_roce_dfx_hw_v2 = { .query_cqc_info = hns_roce_v2_query_cqc_info, }; -- cgit v1.2.3 From 67954a6e379b4678536ef14a1d49ea78fcdc4a1f Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Tue, 28 Apr 2020 19:03:43 +0800 Subject: RDMA/hns: Optimize SRQ buffer size calculating process Optimize the SRQ's WQE buffer parameters calculating process to make the codes more readable by using new functions about multi-hop addressing to calculating capabilities of SRQ. Link: https://lore.kernel.org/r/1588071823-40200-6-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 2 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 29 ++++++++++++++--------------- drivers/infiniband/hw/hns/hns_roce_srq.c | 16 ++++++++-------- 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 100224358a49..5cac14d7be90 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -472,7 +472,7 @@ struct hns_roce_cq { struct hns_roce_idx_que { struct hns_roce_mtr mtr; - int entry_sz; + int entry_shift; unsigned long *bitmap; }; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 0b79dafe919e..f70370d24512 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -699,6 +699,12 @@ static void *get_srq_wqe(struct hns_roce_srq *srq, int n) return hns_roce_buf_offset(srq->buf_mtr.kmem, n << srq->wqe_shift); } +static void *get_idx_buf(struct hns_roce_idx_que *idx_que, int n) +{ + return hns_roce_buf_offset(idx_que->mtr.kmem, + n << idx_que->entry_shift); +} + static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, int wqe_index) { /* always called with interrupts disabled. */ @@ -725,16 +731,6 @@ static int find_empty_entry(struct hns_roce_idx_que *idx_que, return wqe_idx; } -static void fill_idx_queue(struct hns_roce_idx_que *idx_que, - int cur_idx, int wqe_idx) -{ - unsigned int *addr; - - addr = (unsigned int *)hns_roce_buf_offset(idx_que->mtr.kmem, - cur_idx * idx_que->entry_sz); - *addr = wqe_idx; -} - static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) @@ -744,6 +740,7 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, struct hns_roce_v2_wqe_data_seg *dseg; struct hns_roce_v2_db srq_db; unsigned long flags; + __le32 *srq_idx; int ret = 0; int wqe_idx; void *wqe; @@ -775,7 +772,6 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, break; } - fill_idx_queue(&srq->idx_que, ind, wqe_idx); wqe = get_srq_wqe(srq, wqe_idx); dseg = (struct hns_roce_v2_wqe_data_seg *)wqe; @@ -791,6 +787,9 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, dseg[i].addr = 0; } + srq_idx = get_idx_buf(&srq->idx_que, ind); + *srq_idx = cpu_to_le32(wqe_idx); + srq->wrid[wqe_idx] = wr->wr_id; ind = (ind + 1) & (srq->wqe_cnt - 1); } @@ -4901,8 +4900,8 @@ static void hns_roce_v2_write_srqc(struct hns_roce_dev *hr_dev, roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQ_WQE_HOP_NUM_M, SRQC_BYTE_4_SRQ_WQE_HOP_NUM_S, - (hr_dev->caps.srqwqe_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : - hr_dev->caps.srqwqe_hop_num)); + to_hr_hem_hopnum(hr_dev->caps.srqwqe_hop_num, + srq->wqe_cnt)); roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQ_SHIFT_M, SRQC_BYTE_4_SRQ_SHIFT_S, ilog2(srq->wqe_cnt)); @@ -4944,8 +4943,8 @@ static void hns_roce_v2_write_srqc(struct hns_roce_dev *hr_dev, roce_set_field(srq_context->byte_44_idxbufpgsz_addr, SRQC_BYTE_44_SRQ_IDX_HOP_NUM_M, SRQC_BYTE_44_SRQ_IDX_HOP_NUM_S, - hr_dev->caps.idx_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : - hr_dev->caps.idx_hop_num); + to_hr_hem_hopnum(hr_dev->caps.idx_hop_num, + srq->wqe_cnt)); roce_set_field(srq_context->byte_44_idxbufpgsz_addr, SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M, diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index e413a9737ae6..6e5a2adc2ab2 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -181,16 +181,15 @@ static int alloc_srq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, { struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_buf_attr buf_attr = {}; - int sge_size; int err; - sge_size = roundup_pow_of_two(max(HNS_ROCE_SGE_SIZE, - HNS_ROCE_SGE_SIZE * srq->max_gs)); - - srq->wqe_shift = ilog2(sge_size); + srq->wqe_shift = ilog2(roundup_pow_of_two(max(HNS_ROCE_SGE_SIZE, + HNS_ROCE_SGE_SIZE * + srq->max_gs))); buf_attr.page_shift = hr_dev->caps.srqwqe_buf_pg_sz + PAGE_ADDR_SHIFT; - buf_attr.region[0].size = srq->wqe_cnt * sge_size; + buf_attr.region[0].size = to_hr_hem_entries_size(srq->wqe_cnt, + srq->wqe_shift); buf_attr.region[0].hopnum = hr_dev->caps.srqwqe_hop_num; buf_attr.region_count = 1; buf_attr.fixed_page = true; @@ -217,10 +216,11 @@ static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, struct hns_roce_buf_attr buf_attr = {}; int err; - srq->idx_que.entry_sz = HNS_ROCE_IDX_QUE_ENTRY_SZ; + srq->idx_que.entry_shift = ilog2(HNS_ROCE_IDX_QUE_ENTRY_SZ); buf_attr.page_shift = hr_dev->caps.idx_buf_pg_sz + PAGE_ADDR_SHIFT; - buf_attr.region[0].size = srq->wqe_cnt * HNS_ROCE_IDX_QUE_ENTRY_SZ; + buf_attr.region[0].size = to_hr_hem_entries_size(srq->wqe_cnt, + srq->idx_que.entry_shift); buf_attr.region[0].hopnum = hr_dev->caps.idx_hop_num; buf_attr.region_count = 1; buf_attr.fixed_page = true; -- cgit v1.2.3 From 23190b8f47de2d2983d6052f6eb7781779db8eb3 Mon Sep 17 00:00:00 2001 From: Weihang Li Date: Thu, 30 Apr 2020 18:31:29 +0800 Subject: RDMA/hns: Fix comments with non-English symbols There is a comments with some chinese semicolons that cause encoding issues each time hns_roc_hw_v2.h was modified from a IDE. So fix this by using correct symbols. Link: https://lore.kernel.org/r/1588242691-12913-2-git-send-email-liweihang@huawei.com Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 82dd9f6f4845..05bfe078d537 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1241,10 +1241,9 @@ struct hns_roce_func_clear { }; #define FUNC_CLEAR_RST_FUN_DONE_S 0 -/* Each physical function manages up to 248 virtual functionsï¼› - * it takes up to 100ms for each function to execute clearï¼› - * if an abnormal reset occurs, it is executed twice at most; - * so it takes up to 249 * 2 * 100ms. +/* Each physical function manages up to 248 virtual functions, it takes up to + * 100ms for each function to execute clear. If an abnormal reset occurs, it is + * executed twice at most, so it takes up to 249 * 2 * 100ms. */ #define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS (249 * 2 * 100) #define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL 40 -- cgit v1.2.3 From b713128de7a17c7173f869323f8a5f0593e52538 Mon Sep 17 00:00:00 2001 From: Weihang Li Date: Thu, 30 Apr 2020 18:31:30 +0800 Subject: RDMA/hns: Adjust lp_pktn_ini dynamically lp_pktn_ini means the number of loopback slice packets for long messages, it should depend on MTU(fixed to 4096B currently) and max size of SQ inline. Link: https://lore.kernel.org/r/1588242691-12913-3-git-send-email-liweihang@huawei.com Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index f70370d24512..7643b066deea 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3977,7 +3977,8 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, /* mtu*(2^LP_PKTN_INI) should not bigger than 1 message length 64kb */ roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M, - V2_QPC_BYTE_56_LP_PKTN_INI_S, 0); + V2_QPC_BYTE_56_LP_PKTN_INI_S, + ilog2(hr_dev->caps.max_sq_inline / IB_MTU_4096)); roce_set_field(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M, V2_QPC_BYTE_56_LP_PKTN_INI_S, 0); -- cgit v1.2.3 From e4faa478c6b8af5fc4afe20de693417354a57e4d Mon Sep 17 00:00:00 2001 From: Wenpeng Liang Date: Thu, 30 Apr 2020 18:31:31 +0800 Subject: RDMA/hns: Remove redundant assignment of caps These caps are assigned in query_pf_caps() or set_default_caps(), and should not be assigned out of these two functions. Link: https://lore.kernel.org/r/1588242691-12913-4-git-send-email-liweihang@huawei.com Signed-off-by: Wenpeng Liang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 7643b066deea..ad9a11a2cd0d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2079,11 +2079,6 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) hr_dev->vendor_part_id = hr_dev->pci_dev->device; hr_dev->sys_image_guid = be64_to_cpu(hr_dev->ib_dev.node_guid); - caps->num_mtt_segs = HNS_ROCE_V2_MAX_MTT_SEGS; - caps->num_cqe_segs = HNS_ROCE_V2_MAX_CQE_SEGS; - caps->num_srqwqe_segs = HNS_ROCE_V2_MAX_SRQWQE_SEGS; - caps->num_idx_segs = HNS_ROCE_V2_MAX_IDX_SEGS; - caps->pbl_ba_pg_sz = HNS_ROCE_BA_PG_SZ_SUPPORTED_16K; caps->pbl_buf_pg_sz = 0; caps->pbl_hop_num = HNS_ROCE_PBL_HOP_NUM; -- cgit v1.2.3 From 31578defe4eb816439d5e3351923e90f6321b3c8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 May 2020 09:55:11 +0300 Subject: RDMA/mlx5: Update mlx5_ib to use new cmd interface Reuse newly introduced mlx5_cmd_exec_in() and mlx5_cmd_exec_inout() to reduce code duplication in mlx5_ib module. Link: https://lore.kernel.org/r/20200506065513.4668-2-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cmd.c | 114 +++++++++++----------------------- drivers/infiniband/hw/mlx5/cmd.h | 4 +- drivers/infiniband/hw/mlx5/cong.c | 4 +- drivers/infiniband/hw/mlx5/main.c | 5 +- drivers/infiniband/hw/mlx5/odp.c | 5 +- drivers/infiniband/hw/mlx5/srq_cmd.c | 115 +++++++++++++++-------------------- 6 files changed, 91 insertions(+), 156 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index a2fcbc49131e..cc24c711e92a 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -1,46 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* - * Copyright (c) 2017, Mellanox Technologies. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. + * Copyright (c) 2017-2020, Mellanox Technologies inc. All rights reserved. */ #include "cmd.h" int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey) { - u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; int err; MLX5_SET(query_special_contexts_in, in, opcode, MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out); if (!err) *mkey = MLX5_GET(query_special_contexts_out, out, dump_fill_mkey); @@ -50,12 +23,12 @@ int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey) int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey) { u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; - u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; int err; MLX5_SET(query_special_contexts_in, in, opcode, MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out); if (!err) *null_mkey = MLX5_GET(query_special_contexts_out, out, null_mkey); @@ -63,23 +36,15 @@ int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey) } int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, - void *out, int out_size) + void *out) { - u32 in[MLX5_ST_SZ_DW(query_cong_params_in)] = { }; + u32 in[MLX5_ST_SZ_DW(query_cong_params_in)] = {}; MLX5_SET(query_cong_params_in, in, opcode, MLX5_CMD_OP_QUERY_CONG_PARAMS); MLX5_SET(query_cong_params_in, in, cong_protocol, cong_point); - return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); -} - -int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *dev, - void *in, int in_size) -{ - u32 out[MLX5_ST_SZ_DW(modify_cong_params_out)] = { }; - - return mlx5_cmd_exec(dev, in, in_size, out, sizeof(out)); + return mlx5_cmd_exec_inout(dev, query_cong_params, in, out); } int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr, @@ -133,7 +98,7 @@ int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr, MLX5_SET64(alloc_memic_in, in, range_start_addr, hw_start_addr + (page_idx * PAGE_SIZE)); - ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + ret = mlx5_cmd_exec_inout(dev, alloc_memic, in, out); if (ret) { spin_lock(&dm->lock); bitmap_clear(dm->memic_alloc_pages, @@ -162,8 +127,7 @@ void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length) struct mlx5_core_dev *dev = dm->dev; u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr); u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE); - u32 out[MLX5_ST_SZ_DW(dealloc_memic_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(dealloc_memic_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_memic_in)] = {}; u64 start_page_idx; int err; @@ -174,7 +138,7 @@ void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length) MLX5_SET64(dealloc_memic_in, in, memic_start_addr, addr); MLX5_SET(dealloc_memic_in, in, memic_size, length); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_in(dev, dealloc_memic, in); if (err) return; @@ -198,49 +162,46 @@ int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out) void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid) { - u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {}; - u32 out[MLX5_ST_SZ_DW(destroy_tir_out)] = {}; + u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {}; MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR); MLX5_SET(destroy_tir_in, in, tirn, tirn); MLX5_SET(destroy_tir_in, in, uid, uid); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_tir, in); } void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid) { - u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_tis_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {}; MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS); MLX5_SET(destroy_tis_in, in, tisn, tisn); MLX5_SET(destroy_tis_in, in, uid, uid); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_tis, in); } void mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid) { - u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {}; - u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {}; + u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {}; MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); MLX5_SET(destroy_rqt_in, in, rqtn, rqtn); MLX5_SET(destroy_rqt_in, in, uid, uid); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, destroy_rqt, in); } int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn, u16 uid) { - u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {}; + u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {}; int err; MLX5_SET(alloc_transport_domain_in, in, opcode, MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN); MLX5_SET(alloc_transport_domain_in, in, uid, uid); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, alloc_transport_domain, in, out); if (!err) *tdn = MLX5_GET(alloc_transport_domain_out, out, transport_domain); @@ -251,32 +212,29 @@ int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn, void mlx5_cmd_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn, u16 uid) { - u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {}; MLX5_SET(dealloc_transport_domain_in, in, opcode, MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); MLX5_SET(dealloc_transport_domain_in, in, uid, uid); MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, dealloc_transport_domain, in); } void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid) { - u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {}; - u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {}; + u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {}; MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD); MLX5_SET(dealloc_pd_in, in, pd, pdn); MLX5_SET(dealloc_pd_in, in, uid, uid); - mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + mlx5_cmd_exec_in(dev, dealloc_pd, in); } int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn, u16 uid) { - u32 out[MLX5_ST_SZ_DW(attach_to_mcg_out)] = {}; - u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)] = {}; + u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)] = {}; void *gid; MLX5_SET(attach_to_mcg_in, in, opcode, MLX5_CMD_OP_ATTACH_TO_MCG); @@ -284,14 +242,13 @@ int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, MLX5_SET(attach_to_mcg_in, in, uid, uid); gid = MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid); memcpy(gid, mgid, sizeof(*mgid)); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, attach_to_mcg, in); } int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn, u16 uid) { - u32 out[MLX5_ST_SZ_DW(detach_from_mcg_out)] = {}; - u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)] = {}; + u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)] = {}; void *gid; MLX5_SET(detach_from_mcg_in, in, opcode, MLX5_CMD_OP_DETACH_FROM_MCG); @@ -299,18 +256,18 @@ int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, MLX5_SET(detach_from_mcg_in, in, uid, uid); gid = MLX5_ADDR_OF(detach_from_mcg_in, in, multicast_gid); memcpy(gid, mgid, sizeof(*mgid)); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, detach_from_mcg, in); } int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid) { u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {}; - u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {}; int err; MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD); MLX5_SET(alloc_xrcd_in, in, uid, uid); - err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_inout(dev, alloc_xrcd, in, out); if (!err) *xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd); return err; @@ -318,13 +275,12 @@ int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid) int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid) { - u32 out[MLX5_ST_SZ_DW(dealloc_xrcd_out)] = {}; - u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {}; + u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {}; MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD); MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn); MLX5_SET(dealloc_xrcd_in, in, uid, uid); - return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev, dealloc_xrcd, in); } int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, @@ -350,7 +306,7 @@ int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, data = MLX5_ADDR_OF(mad_ifc_in, in, mad); memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad)); - err = mlx5_cmd_exec(dev, in, inlen, out, outlen); + err = mlx5_cmd_exec_inout(dev, mad_ifc, in, out); if (err) goto out; diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 43079b18d9b4..f4d8558db434 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -40,10 +40,8 @@ int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey); int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, - void *out, int out_size); + void *out); int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out); -int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev, - void *in, int in_size); int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr, u64 length, u32 alignment); void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length); diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c index de4da92b81a6..b9291e482428 100644 --- a/drivers/infiniband/hw/mlx5/cong.c +++ b/drivers/infiniband/hw/mlx5/cong.c @@ -290,7 +290,7 @@ static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, u8 port_num, node = mlx5_ib_param_to_node(offset); - err = mlx5_cmd_query_cong_params(mdev, node, out, outlen); + err = mlx5_cmd_query_cong_params(mdev, node, out); if (err) goto free; @@ -339,7 +339,7 @@ static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u8 port_num, MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp, attr_mask); - err = mlx5_cmd_modify_cong_params(mdev, in, inlen); + err = mlx5_cmd_exec_in(dev->mdev, modify_cong_params, in); kvfree(in); alloc_err: mlx5_ib_put_native_port_mdev(dev, port_num + 1); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 0b8cc219e085..0c0fbf4160e0 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2562,7 +2562,7 @@ static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) struct mlx5_ib_alloc_pd_resp resp; int err; u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {}; - u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {}; u16 uid = 0; struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context( udata, struct mlx5_ib_ucontext, ibucontext); @@ -2570,8 +2570,7 @@ static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) uid = context ? context->devx_uid : 0; MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); MLX5_SET(alloc_pd_in, in, uid, uid); - err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in), - out, sizeof(out)); + err = mlx5_cmd_exec_inout(to_mdev(ibdev)->mdev, alloc_pd, in, out); if (err) return err; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 70577d546567..7d2ec9ee5097 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -447,8 +447,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, { int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? pfault->wqe.wq_num : pfault->token; - u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = { }; - u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = { }; + u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {}; int err; MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); @@ -457,7 +456,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, MLX5_SET(page_fault_resume_in, in, wq_number, wq_num); MLX5_SET(page_fault_resume_in, in, error, !!error); - err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); + err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in); if (err) mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n", wq_num, err); diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c index c851570791af..bc50a712bf2e 100644 --- a/drivers/infiniband/hw/mlx5/srq_cmd.c +++ b/drivers/infiniband/hw/mlx5/srq_cmd.c @@ -132,38 +132,33 @@ static int create_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, static int destroy_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq) { - u32 srq_in[MLX5_ST_SZ_DW(destroy_srq_in)] = {0}; - u32 srq_out[MLX5_ST_SZ_DW(destroy_srq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_srq_in)] = {}; - MLX5_SET(destroy_srq_in, srq_in, opcode, - MLX5_CMD_OP_DESTROY_SRQ); - MLX5_SET(destroy_srq_in, srq_in, srqn, srq->srqn); - MLX5_SET(destroy_srq_in, srq_in, uid, srq->uid); + MLX5_SET(destroy_srq_in, in, opcode, MLX5_CMD_OP_DESTROY_SRQ); + MLX5_SET(destroy_srq_in, in, srqn, srq->srqn); + MLX5_SET(destroy_srq_in, in, uid, srq->uid); - return mlx5_cmd_exec(dev->mdev, srq_in, sizeof(srq_in), srq_out, - sizeof(srq_out)); + return mlx5_cmd_exec_in(dev->mdev, destroy_srq, in); } static int arm_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, u16 lwm, int is_srq) { - u32 srq_in[MLX5_ST_SZ_DW(arm_rq_in)] = {0}; - u32 srq_out[MLX5_ST_SZ_DW(arm_rq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {}; - MLX5_SET(arm_rq_in, srq_in, opcode, MLX5_CMD_OP_ARM_RQ); - MLX5_SET(arm_rq_in, srq_in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_SRQ); - MLX5_SET(arm_rq_in, srq_in, srq_number, srq->srqn); - MLX5_SET(arm_rq_in, srq_in, lwm, lwm); - MLX5_SET(arm_rq_in, srq_in, uid, srq->uid); + MLX5_SET(arm_rq_in, in, opcode, MLX5_CMD_OP_ARM_RQ); + MLX5_SET(arm_rq_in, in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_SRQ); + MLX5_SET(arm_rq_in, in, srq_number, srq->srqn); + MLX5_SET(arm_rq_in, in, lwm, lwm); + MLX5_SET(arm_rq_in, in, uid, srq->uid); - return mlx5_cmd_exec(dev->mdev, srq_in, sizeof(srq_in), srq_out, - sizeof(srq_out)); + return mlx5_cmd_exec_in(dev->mdev, arm_rq, in); } static int query_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, struct mlx5_srq_attr *out) { - u32 srq_in[MLX5_ST_SZ_DW(query_srq_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_srq_in)] = {}; u32 *srq_out; void *srqc; int err; @@ -172,20 +167,18 @@ static int query_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, if (!srq_out) return -ENOMEM; - MLX5_SET(query_srq_in, srq_in, opcode, - MLX5_CMD_OP_QUERY_SRQ); - MLX5_SET(query_srq_in, srq_in, srqn, srq->srqn); - err = mlx5_cmd_exec(dev->mdev, srq_in, sizeof(srq_in), srq_out, - MLX5_ST_SZ_BYTES(query_srq_out)); + MLX5_SET(query_srq_in, in, opcode, MLX5_CMD_OP_QUERY_SRQ); + MLX5_SET(query_srq_in, in, srqn, srq->srqn); + err = mlx5_cmd_exec_inout(dev->mdev, query_srq, in, out); if (err) goto out; - srqc = MLX5_ADDR_OF(query_srq_out, srq_out, srq_context_entry); + srqc = MLX5_ADDR_OF(query_srq_out, out, srq_context_entry); get_srqc(srqc, out); if (MLX5_GET(srqc, srqc, state) != MLX5_SRQC_STATE_GOOD) out->flags |= MLX5_SRQ_FLAG_ERR; out: - kvfree(srq_out); + kvfree(out); return err; } @@ -234,39 +227,35 @@ out: static int destroy_xrc_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq) { - u32 xrcsrq_in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)] = {0}; - u32 xrcsrq_out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)] = {}; - MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, opcode, - MLX5_CMD_OP_DESTROY_XRC_SRQ); - MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn); - MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, uid, srq->uid); + MLX5_SET(destroy_xrc_srq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRC_SRQ); + MLX5_SET(destroy_xrc_srq_in, in, xrc_srqn, srq->srqn); + MLX5_SET(destroy_xrc_srq_in, in, uid, srq->uid); - return mlx5_cmd_exec(dev->mdev, xrcsrq_in, sizeof(xrcsrq_in), - xrcsrq_out, sizeof(xrcsrq_out)); + return mlx5_cmd_exec_in(dev->mdev, destroy_xrc_srq, in); } static int arm_xrc_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, u16 lwm) { - u32 xrcsrq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {0}; - u32 xrcsrq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {}; - MLX5_SET(arm_xrc_srq_in, xrcsrq_in, opcode, MLX5_CMD_OP_ARM_XRC_SRQ); - MLX5_SET(arm_xrc_srq_in, xrcsrq_in, op_mod, MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ); - MLX5_SET(arm_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn); - MLX5_SET(arm_xrc_srq_in, xrcsrq_in, lwm, lwm); - MLX5_SET(arm_xrc_srq_in, xrcsrq_in, uid, srq->uid); + MLX5_SET(arm_xrc_srq_in, in, opcode, MLX5_CMD_OP_ARM_XRC_SRQ); + MLX5_SET(arm_xrc_srq_in, in, op_mod, + MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ); + MLX5_SET(arm_xrc_srq_in, in, xrc_srqn, srq->srqn); + MLX5_SET(arm_xrc_srq_in, in, lwm, lwm); + MLX5_SET(arm_xrc_srq_in, in, uid, srq->uid); - return mlx5_cmd_exec(dev->mdev, xrcsrq_in, sizeof(xrcsrq_in), - xrcsrq_out, sizeof(xrcsrq_out)); + return mlx5_cmd_exec_in(dev->mdev, arm_xrc_srq, in); } static int query_xrc_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, struct mlx5_srq_attr *out) { - u32 xrcsrq_in[MLX5_ST_SZ_DW(query_xrc_srq_in)]; + u32 in[MLX5_ST_SZ_DW(query_xrc_srq_in)] = {}; u32 *xrcsrq_out; void *xrc_srqc; int err; @@ -274,14 +263,11 @@ static int query_xrc_srq_cmd(struct mlx5_ib_dev *dev, xrcsrq_out = kvzalloc(MLX5_ST_SZ_BYTES(query_xrc_srq_out), GFP_KERNEL); if (!xrcsrq_out) return -ENOMEM; - memset(xrcsrq_in, 0, sizeof(xrcsrq_in)); - MLX5_SET(query_xrc_srq_in, xrcsrq_in, opcode, - MLX5_CMD_OP_QUERY_XRC_SRQ); - MLX5_SET(query_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn); + MLX5_SET(query_xrc_srq_in, in, opcode, MLX5_CMD_OP_QUERY_XRC_SRQ); + MLX5_SET(query_xrc_srq_in, in, xrc_srqn, srq->srqn); - err = mlx5_cmd_exec(dev->mdev, xrcsrq_in, sizeof(xrcsrq_in), - xrcsrq_out, MLX5_ST_SZ_BYTES(query_xrc_srq_out)); + err = mlx5_cmd_exec_inout(dev->mdev, query_xrc_srq, in, xrcsrq_out); if (err) goto out; @@ -341,13 +327,12 @@ out: static int destroy_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq) { - u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)] = {}; - u32 out[MLX5_ST_SZ_DW(destroy_rmp_out)] = {}; + u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)] = {}; MLX5_SET(destroy_rmp_in, in, opcode, MLX5_CMD_OP_DESTROY_RMP); MLX5_SET(destroy_rmp_in, in, rmpn, srq->srqn); MLX5_SET(destroy_rmp_in, in, uid, srq->uid); - return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev->mdev, destroy_rmp, in); } static int arm_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, @@ -384,7 +369,7 @@ static int arm_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY); MLX5_SET(modify_rmp_in, in, opcode, MLX5_CMD_OP_MODIFY_RMP); - err = mlx5_cmd_exec(dev->mdev, in, inlen, out, outlen); + err = mlx5_cmd_exec_inout(dev->mdev, modify_rmp, in, out); out: kvfree(in); @@ -414,7 +399,7 @@ static int query_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, MLX5_SET(query_rmp_in, rmp_in, opcode, MLX5_CMD_OP_QUERY_RMP); MLX5_SET(query_rmp_in, rmp_in, rmpn, srq->srqn); - err = mlx5_cmd_exec(dev->mdev, rmp_in, inlen, rmp_out, outlen); + err = mlx5_cmd_exec_inout(dev->mdev, query_rmp, rmp_in, rmp_out); if (err) goto out; @@ -477,36 +462,34 @@ static int create_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, static int destroy_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq) { - u32 in[MLX5_ST_SZ_DW(destroy_xrq_in)] = {0}; - u32 out[MLX5_ST_SZ_DW(destroy_xrq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_xrq_in)] = {}; MLX5_SET(destroy_xrq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRQ); - MLX5_SET(destroy_xrq_in, in, xrqn, srq->srqn); + MLX5_SET(destroy_xrq_in, in, xrqn, srq->srqn); MLX5_SET(destroy_xrq_in, in, uid, srq->uid); - return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev->mdev, destroy_xrq, in); } static int arm_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, u16 lwm) { - u32 out[MLX5_ST_SZ_DW(arm_rq_out)] = {0}; - u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {}; - MLX5_SET(arm_rq_in, in, opcode, MLX5_CMD_OP_ARM_RQ); - MLX5_SET(arm_rq_in, in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_XRQ); + MLX5_SET(arm_rq_in, in, opcode, MLX5_CMD_OP_ARM_RQ); + MLX5_SET(arm_rq_in, in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_XRQ); MLX5_SET(arm_rq_in, in, srq_number, srq->srqn); - MLX5_SET(arm_rq_in, in, lwm, lwm); + MLX5_SET(arm_rq_in, in, lwm, lwm); MLX5_SET(arm_rq_in, in, uid, srq->uid); - return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); + return mlx5_cmd_exec_in(dev->mdev, arm_rq, in); } static int query_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, struct mlx5_srq_attr *out) { - u32 in[MLX5_ST_SZ_DW(query_xrq_in)] = {0}; + u32 in[MLX5_ST_SZ_DW(query_xrq_in)] = {}; u32 *xrq_out; int outlen = MLX5_ST_SZ_BYTES(query_xrq_out); void *xrqc; @@ -519,7 +502,7 @@ static int query_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, MLX5_SET(query_xrq_in, in, opcode, MLX5_CMD_OP_QUERY_XRQ); MLX5_SET(query_xrq_in, in, xrqn, srq->srqn); - err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), xrq_out, outlen); + err = mlx5_cmd_exec_inout(dev->mdev, query_xrq, in, xrq_out); if (err) goto out; -- cgit v1.2.3 From 6671cde83ddb9a65fd0a69e0896d089ace0e195c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 6 May 2020 09:55:12 +0300 Subject: RDMA/mlx5: Refactor mlx5_post_send() to improve readability Add small helpers in order to avoid code duplication and improve code readability. Decrease the amount of code in the gigantic post_send function and divide it to readable methods that will help in code maintenance in the future. Link: https://lore.kernel.org/r/20200506065513.4668-3-leon@kernel.org Signed-off-by: Max Gurtovoy Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 490 ++++++++++++++++++++++------------------ 1 file changed, 276 insertions(+), 214 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index e624886bcf85..1e3dcfd1b230 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -5234,18 +5234,279 @@ static void finish_wqe(struct mlx5_ib_qp *qp, cur_edge; } +static void handle_rdma_op(const struct ib_send_wr *wr, void **seg, int *size) +{ + set_raddr_seg(*seg, rdma_wr(wr)->remote_addr, rdma_wr(wr)->rkey); + *seg += sizeof(struct mlx5_wqe_raddr_seg); + *size += sizeof(struct mlx5_wqe_raddr_seg) / 16; +} + +static void handle_local_inv(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, + int *size, void **cur_edge, unsigned int idx) +{ + qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; + (*ctrl)->imm = cpu_to_be32(wr->ex.invalidate_rkey); + set_linv_wr(qp, seg, size, cur_edge); +} + +static int handle_reg_mr(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, + void **cur_edge, unsigned int idx) +{ + qp->sq.wr_data[idx] = IB_WR_REG_MR; + (*ctrl)->imm = cpu_to_be32(reg_wr(wr)->key); + return set_reg_wr(qp, reg_wr(wr), seg, size, cur_edge, true); +} + +static int handle_psv(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, + void **cur_edge, unsigned int *idx, int nreq, + struct ib_sig_domain *domain, u32 psv_index, + u8 next_fence) +{ + int err; + + /* + * SET_PSV WQEs are not signaled and solicited on error. + */ + err = __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq, + false, true); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + goto out; + } + err = set_psv_wr(domain, psv_index, seg, size); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + goto out; + } + finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, nreq, + next_fence, MLX5_OPCODE_SET_PSV); + +out: + return err; +} + +static int handle_reg_mr_integrity(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, + void **cur_edge, unsigned int *idx, int nreq, u8 fence, + u8 next_fence) +{ + struct mlx5_ib_mr *mr; + struct mlx5_ib_mr *pi_mr; + struct mlx5_ib_mr pa_pi_mr; + struct ib_sig_attrs *sig_attrs; + struct ib_reg_wr reg_pi_wr; + int err; + + qp->sq.wr_data[*idx] = IB_WR_REG_MR_INTEGRITY; + + mr = to_mmr(reg_wr(wr)->mr); + pi_mr = mr->pi_mr; + + if (pi_mr) { + memset(®_pi_wr, 0, + sizeof(struct ib_reg_wr)); + + reg_pi_wr.mr = &pi_mr->ibmr; + reg_pi_wr.access = reg_wr(wr)->access; + reg_pi_wr.key = pi_mr->ibmr.rkey; + + (*ctrl)->imm = cpu_to_be32(reg_pi_wr.key); + /* UMR for data + prot registration */ + err = set_reg_wr(qp, ®_pi_wr, seg, size, cur_edge, false); + if (unlikely(err)) + goto out; + + finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, + nreq, fence, MLX5_OPCODE_UMR); + + err = begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + goto out; + } + } else { + memset(&pa_pi_mr, 0, sizeof(struct mlx5_ib_mr)); + /* No UMR, use local_dma_lkey */ + pa_pi_mr.ibmr.lkey = mr->ibmr.pd->local_dma_lkey; + pa_pi_mr.ndescs = mr->ndescs; + pa_pi_mr.data_length = mr->data_length; + pa_pi_mr.data_iova = mr->data_iova; + if (mr->meta_ndescs) { + pa_pi_mr.meta_ndescs = mr->meta_ndescs; + pa_pi_mr.meta_length = mr->meta_length; + pa_pi_mr.pi_iova = mr->pi_iova; + } + + pa_pi_mr.ibmr.length = mr->ibmr.length; + mr->pi_mr = &pa_pi_mr; + } + (*ctrl)->imm = cpu_to_be32(mr->ibmr.rkey); + /* UMR for sig MR */ + err = set_pi_umr_wr(wr, qp, seg, size, cur_edge); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + goto out; + } + finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, nreq, + fence, MLX5_OPCODE_UMR); + + sig_attrs = mr->ibmr.sig_attrs; + err = handle_psv(dev, qp, wr, ctrl, seg, size, cur_edge, idx, nreq, + &sig_attrs->mem, mr->sig->psv_memory.psv_idx, + next_fence); + if (unlikely(err)) + goto out; + + err = handle_psv(dev, qp, wr, ctrl, seg, size, cur_edge, idx, nreq, + &sig_attrs->wire, mr->sig->psv_wire.psv_idx, + next_fence); + if (unlikely(err)) + goto out; + + qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + +out: + return err; +} + +static int handle_qpt_rc(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, + void **cur_edge, unsigned int *idx, int nreq, u8 fence, + u8 next_fence, int *num_sge) +{ + int err = 0; + + switch (wr->opcode) { + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + handle_rdma_op(wr, seg, size); + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); + err = -EOPNOTSUPP; + goto out; + + case IB_WR_LOCAL_INV: + handle_local_inv(qp, wr, ctrl, seg, size, cur_edge, *idx); + *num_sge = 0; + break; + + case IB_WR_REG_MR: + err = handle_reg_mr(qp, wr, ctrl, seg, size, cur_edge, *idx); + if (unlikely(err)) + goto out; + *num_sge = 0; + break; + + case IB_WR_REG_MR_INTEGRITY: + err = handle_reg_mr_integrity(dev, qp, wr, ctrl, seg, size, + cur_edge, idx, nreq, fence, + next_fence); + if (unlikely(err)) + goto out; + *num_sge = 0; + break; + + default: + break; + } + +out: + return err; +} + +static void handle_qpt_uc(const struct ib_send_wr *wr, void **seg, int *size) +{ + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + handle_rdma_op(wr, seg, size); + break; + default: + break; + } +} + +static void handle_qpt_hw_gsi(struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, void **seg, + int *size, void **cur_edge) +{ + set_datagram_seg(*seg, wr); + *seg += sizeof(struct mlx5_wqe_datagram_seg); + *size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); +} + +static void handle_qpt_ud(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + void **seg, int *size, void **cur_edge) +{ + set_datagram_seg(*seg, wr); + *seg += sizeof(struct mlx5_wqe_datagram_seg); + *size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + /* handle qp that supports ud offload */ + if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) { + struct mlx5_wqe_eth_pad *pad; + + pad = *seg; + memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad)); + *seg += sizeof(struct mlx5_wqe_eth_pad); + *size += sizeof(struct mlx5_wqe_eth_pad) / 16; + set_eth_seg(wr, qp, seg, size, cur_edge); + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + } +} + +static int handle_qpt_reg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, + int *size, void **cur_edge, unsigned int idx) +{ + int err = 0; + + if (unlikely(wr->opcode != MLX5_IB_WR_UMR)) { + err = -EINVAL; + mlx5_ib_warn(dev, "bad opcode %d\n", wr->opcode); + goto out; + } + + qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; + (*ctrl)->imm = cpu_to_be32(umr_wr(wr)->mkey); + err = set_reg_umr_segment(dev, *seg, wr, + !!(MLX5_CAP_GEN(dev->mdev, atomic))); + if (unlikely(err)) + goto out; + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + set_reg_mkey_segment(*seg, wr); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); +out: + return err; +} + static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr, bool drain) { struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_core_dev *mdev = dev->mdev; - struct ib_reg_wr reg_pi_wr; struct mlx5_ib_qp *qp; - struct mlx5_ib_mr *mr; - struct mlx5_ib_mr *pi_mr; - struct mlx5_ib_mr pa_pi_mr; - struct ib_sig_attrs *sig_attrs; struct mlx5_wqe_xrc_seg *xrc; struct mlx5_bf *bf; void *cur_edge; @@ -5321,186 +5582,20 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, size += sizeof(*xrc) / 16; /* fall through */ case IB_QPT_RC: - switch (wr->opcode) { - case IB_WR_RDMA_READ: - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_WRITE_WITH_IMM: - set_raddr_seg(seg, rdma_wr(wr)->remote_addr, - rdma_wr(wr)->rkey); - seg += sizeof(struct mlx5_wqe_raddr_seg); - size += sizeof(struct mlx5_wqe_raddr_seg) / 16; - break; - - case IB_WR_ATOMIC_CMP_AND_SWP: - case IB_WR_ATOMIC_FETCH_AND_ADD: - case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: - mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); - err = -ENOSYS; + err = handle_qpt_rc(dev, qp, wr, &ctrl, &seg, &size, + &cur_edge, &idx, nreq, fence, + next_fence, &num_sge); + if (unlikely(err)) { *bad_wr = wr; goto out; - - case IB_WR_LOCAL_INV: - qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; - ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey); - set_linv_wr(qp, &seg, &size, &cur_edge); - num_sge = 0; - break; - - case IB_WR_REG_MR: - qp->sq.wr_data[idx] = IB_WR_REG_MR; - ctrl->imm = cpu_to_be32(reg_wr(wr)->key); - err = set_reg_wr(qp, reg_wr(wr), &seg, &size, - &cur_edge, true); - if (err) { - *bad_wr = wr; - goto out; - } - num_sge = 0; - break; - - case IB_WR_REG_MR_INTEGRITY: - qp->sq.wr_data[idx] = IB_WR_REG_MR_INTEGRITY; - - mr = to_mmr(reg_wr(wr)->mr); - pi_mr = mr->pi_mr; - - if (pi_mr) { - memset(®_pi_wr, 0, - sizeof(struct ib_reg_wr)); - - reg_pi_wr.mr = &pi_mr->ibmr; - reg_pi_wr.access = reg_wr(wr)->access; - reg_pi_wr.key = pi_mr->ibmr.rkey; - - ctrl->imm = cpu_to_be32(reg_pi_wr.key); - /* UMR for data + prot registration */ - err = set_reg_wr(qp, ®_pi_wr, &seg, - &size, &cur_edge, - false); - if (err) { - *bad_wr = wr; - goto out; - } - finish_wqe(qp, ctrl, seg, size, - cur_edge, idx, wr->wr_id, - nreq, fence, - MLX5_OPCODE_UMR); - - err = begin_wqe(qp, &seg, &ctrl, wr, - &idx, &size, &cur_edge, - nreq); - if (err) { - mlx5_ib_warn(dev, "\n"); - err = -ENOMEM; - *bad_wr = wr; - goto out; - } - } else { - memset(&pa_pi_mr, 0, - sizeof(struct mlx5_ib_mr)); - /* No UMR, use local_dma_lkey */ - pa_pi_mr.ibmr.lkey = - mr->ibmr.pd->local_dma_lkey; - - pa_pi_mr.ndescs = mr->ndescs; - pa_pi_mr.data_length = mr->data_length; - pa_pi_mr.data_iova = mr->data_iova; - if (mr->meta_ndescs) { - pa_pi_mr.meta_ndescs = - mr->meta_ndescs; - pa_pi_mr.meta_length = - mr->meta_length; - pa_pi_mr.pi_iova = mr->pi_iova; - } - - pa_pi_mr.ibmr.length = mr->ibmr.length; - mr->pi_mr = &pa_pi_mr; - } - ctrl->imm = cpu_to_be32(mr->ibmr.rkey); - /* UMR for sig MR */ - err = set_pi_umr_wr(wr, qp, &seg, &size, - &cur_edge); - if (err) { - mlx5_ib_warn(dev, "\n"); - *bad_wr = wr; - goto out; - } - finish_wqe(qp, ctrl, seg, size, cur_edge, idx, - wr->wr_id, nreq, fence, - MLX5_OPCODE_UMR); - - /* - * SET_PSV WQEs are not signaled and solicited - * on error - */ - sig_attrs = mr->ibmr.sig_attrs; - err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, - &size, &cur_edge, nreq, false, - true); - if (err) { - mlx5_ib_warn(dev, "\n"); - err = -ENOMEM; - *bad_wr = wr; - goto out; - } - err = set_psv_wr(&sig_attrs->mem, - mr->sig->psv_memory.psv_idx, - &seg, &size); - if (err) { - mlx5_ib_warn(dev, "\n"); - *bad_wr = wr; - goto out; - } - finish_wqe(qp, ctrl, seg, size, cur_edge, idx, - wr->wr_id, nreq, next_fence, - MLX5_OPCODE_SET_PSV); - - err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, - &size, &cur_edge, nreq, false, - true); - if (err) { - mlx5_ib_warn(dev, "\n"); - err = -ENOMEM; - *bad_wr = wr; - goto out; - } - err = set_psv_wr(&sig_attrs->wire, - mr->sig->psv_wire.psv_idx, - &seg, &size); - if (err) { - mlx5_ib_warn(dev, "\n"); - *bad_wr = wr; - goto out; - } - finish_wqe(qp, ctrl, seg, size, cur_edge, idx, - wr->wr_id, nreq, next_fence, - MLX5_OPCODE_SET_PSV); - - qp->next_fence = - MLX5_FENCE_MODE_INITIATOR_SMALL; - num_sge = 0; + } else if (wr->opcode == IB_WR_REG_MR_INTEGRITY) { goto skip_psv; - - default: - break; } break; case IB_QPT_UC: - switch (wr->opcode) { - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_WRITE_WITH_IMM: - set_raddr_seg(seg, rdma_wr(wr)->remote_addr, - rdma_wr(wr)->rkey); - seg += sizeof(struct mlx5_wqe_raddr_seg); - size += sizeof(struct mlx5_wqe_raddr_seg) / 16; - break; - - default: - break; - } + handle_qpt_uc(wr, &seg, &size); break; - case IB_QPT_SMI: if (unlikely(!mdev->port_caps[qp->port - 1].has_smi)) { mlx5_ib_warn(dev, "Send SMP MADs is not allowed\n"); @@ -5510,49 +5605,16 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, } /* fall through */ case MLX5_IB_QPT_HW_GSI: - set_datagram_seg(seg, wr); - seg += sizeof(struct mlx5_wqe_datagram_seg); - size += sizeof(struct mlx5_wqe_datagram_seg) / 16; - handle_post_send_edge(&qp->sq, &seg, size, &cur_edge); - + handle_qpt_hw_gsi(qp, wr, &seg, &size, &cur_edge); break; case IB_QPT_UD: - set_datagram_seg(seg, wr); - seg += sizeof(struct mlx5_wqe_datagram_seg); - size += sizeof(struct mlx5_wqe_datagram_seg) / 16; - handle_post_send_edge(&qp->sq, &seg, size, &cur_edge); - - /* handle qp that supports ud offload */ - if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) { - struct mlx5_wqe_eth_pad *pad; - - pad = seg; - memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad)); - seg += sizeof(struct mlx5_wqe_eth_pad); - size += sizeof(struct mlx5_wqe_eth_pad) / 16; - set_eth_seg(wr, qp, &seg, &size, &cur_edge); - handle_post_send_edge(&qp->sq, &seg, size, - &cur_edge); - } + handle_qpt_ud(qp, wr, &seg, &size, &cur_edge); break; case MLX5_IB_QPT_REG_UMR: - if (wr->opcode != MLX5_IB_WR_UMR) { - err = -EINVAL; - mlx5_ib_warn(dev, "bad opcode\n"); - goto out; - } - qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; - ctrl->imm = cpu_to_be32(umr_wr(wr)->mkey); - err = set_reg_umr_segment(dev, seg, wr, !!(MLX5_CAP_GEN(mdev, atomic))); + err = handle_qpt_reg_umr(dev, qp, wr, &ctrl, &seg, + &size, &cur_edge, idx); if (unlikely(err)) goto out; - seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); - size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; - handle_post_send_edge(&qp->sq, &seg, size, &cur_edge); - set_reg_mkey_segment(seg, wr); - seg += sizeof(struct mlx5_mkey_seg); - size += sizeof(struct mlx5_mkey_seg) / 16; - handle_post_send_edge(&qp->sq, &seg, size, &cur_edge); break; default: -- cgit v1.2.3 From 029e88fd1e6142ded73f07e2baef3e8a2a87e0ed Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 6 May 2020 09:55:13 +0300 Subject: RDMA/mlx5: Move all WR logic from qp.c to separate file Split qp.c by removing all WR logic to separate file. Link: https://lore.kernel.org/r/20200506065513.4668-4-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/Makefile | 3 +- drivers/infiniband/hw/mlx5/main.c | 5 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 - drivers/infiniband/hw/mlx5/qp.c | 1550 +--------------------------------- drivers/infiniband/hw/mlx5/wr.c | 1504 +++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/wr.h | 76 ++ 6 files changed, 1589 insertions(+), 1553 deletions(-) create mode 100644 drivers/infiniband/hw/mlx5/wr.c create mode 100644 drivers/infiniband/hw/mlx5/wr.h diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 228be05fbaf8..8cca61c671f8 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -16,7 +16,8 @@ mlx5_ib-y := ah.o \ qpc.o \ restrack.o \ srq.o \ - srq_cmd.o + srq_cmd.o \ + wr.o mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 0c0fbf4160e0..40bf71efaeb0 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -61,6 +61,7 @@ #include "cmd.h" #include "srq.h" #include "qp.h" +#include "wr.h" #include #include #include @@ -6657,8 +6658,8 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .modify_qp = mlx5_ib_modify_qp, .modify_srq = mlx5_ib_modify_srq, .poll_cq = mlx5_ib_poll_cq, - .post_recv = mlx5_ib_post_recv, - .post_send = mlx5_ib_post_send, + .post_recv = mlx5_ib_post_recv_nodrain, + .post_send = mlx5_ib_post_send_nodrain, .post_srq_recv = mlx5_ib_post_srq_recv, .process_mad = mlx5_ib_process_mad, .query_ah = mlx5_ib_query_ah, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 3041808773e6..482b54eb9764 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1178,10 +1178,6 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata); void mlx5_ib_drain_sq(struct ib_qp *qp); void mlx5_ib_drain_rq(struct ib_qp *qp); -int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, - const struct ib_send_wr **bad_wr); -int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, - const struct ib_recv_wr **bad_wr); int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, size_t buflen, size_t *bc); int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1e3dcfd1b230..fb2ea3bf9be4 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -40,6 +40,7 @@ #include "ib_rep.h" #include "cmd.h" #include "qp.h" +#include "wr.h" enum { MLX5_IB_ACK_REQ_FREQ = 8, @@ -52,32 +53,6 @@ enum { MLX5_IB_LINK_TYPE_ETH = 1 }; -enum { - MLX5_IB_SQ_STRIDE = 6, - MLX5_IB_SQ_UMR_INLINE_THRESHOLD = 64, -}; - -static const u32 mlx5_ib_opcode[] = { - [IB_WR_SEND] = MLX5_OPCODE_SEND, - [IB_WR_LSO] = MLX5_OPCODE_LSO, - [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, - [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, - [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, - [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, - [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, - [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, - [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, - [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, - [IB_WR_REG_MR] = MLX5_OPCODE_UMR, - [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, - [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, - [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, -}; - -struct mlx5_wqe_eth_pad { - u8 rsvd0[16]; -}; - enum raw_qp_set_mask_map { MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID = 1UL << 0, MLX5_RAW_QP_RATE_LIMIT = 1UL << 1, @@ -1061,30 +1036,6 @@ static void destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, mlx5_frag_buf_free(dev->mdev, &qp->buf); } -/* get_sq_edge - Get the next nearby edge. - * - * An 'edge' is defined as the first following address after the end - * of the fragment or the SQ. Accordingly, during the WQE construction - * which repetitively increases the pointer to write the next data, it - * simply should check if it gets to an edge. - * - * @sq - SQ buffer. - * @idx - Stride index in the SQ buffer. - * - * Return: - * The new edge. - */ -static void *get_sq_edge(struct mlx5_ib_wq *sq, u32 idx) -{ - void *fragment_end; - - fragment_end = mlx5_frag_buf_get_wqe - (&sq->fbc, - mlx5_frag_buf_get_idx_last_contig_stride(&sq->fbc, idx)); - - return fragment_end + MLX5_SEND_WQE_BB; -} - static int _create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *init_attr, struct mlx5_ib_qp *qp, u32 **in, int *inlen, @@ -2335,11 +2286,6 @@ static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *re } } -static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp) -{ - return to_mpd(qp->ibqp.pd); -} - static void get_cqs(enum ib_qp_type qp_type, struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq, struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq) @@ -3758,7 +3704,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (!context) return -ENOMEM; - pd = get_pd(qp); + pd = to_mpd(qp->ibqp.pd); context->flags = cpu_to_be32(mlx5_st << 16); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { @@ -4286,1494 +4232,6 @@ out: return err; } -static void _handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg, - u32 wqe_sz, void **cur_edge) -{ - u32 idx; - - idx = (sq->cur_post + (wqe_sz >> 2)) & (sq->wqe_cnt - 1); - *cur_edge = get_sq_edge(sq, idx); - - *seg = mlx5_frag_buf_get_wqe(&sq->fbc, idx); -} - -/* handle_post_send_edge - Check if we get to SQ edge. If yes, update to the - * next nearby edge and get new address translation for current WQE position. - * @sq - SQ buffer. - * @seg: Current WQE position (16B aligned). - * @wqe_sz: Total current WQE size [16B]. - * @cur_edge: Updated current edge. - */ -static inline void handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg, - u32 wqe_sz, void **cur_edge) -{ - if (likely(*seg != *cur_edge)) - return; - - _handle_post_send_edge(sq, seg, wqe_sz, cur_edge); -} - -/* memcpy_send_wqe - copy data from src to WQE and update the relevant WQ's - * pointers. At the end @seg is aligned to 16B regardless the copied size. - * @sq - SQ buffer. - * @cur_edge: Updated current edge. - * @seg: Current WQE position (16B aligned). - * @wqe_sz: Total current WQE size [16B]. - * @src: Pointer to copy from. - * @n: Number of bytes to copy. - */ -static inline void memcpy_send_wqe(struct mlx5_ib_wq *sq, void **cur_edge, - void **seg, u32 *wqe_sz, const void *src, - size_t n) -{ - while (likely(n)) { - size_t leftlen = *cur_edge - *seg; - size_t copysz = min_t(size_t, leftlen, n); - size_t stride; - - memcpy(*seg, src, copysz); - - n -= copysz; - src += copysz; - stride = !n ? ALIGN(copysz, 16) : copysz; - *seg += stride; - *wqe_sz += stride >> 4; - handle_post_send_edge(sq, seg, *wqe_sz, cur_edge); - } -} - -static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq) -{ - struct mlx5_ib_cq *cq; - unsigned cur; - - cur = wq->head - wq->tail; - if (likely(cur + nreq < wq->max_post)) - return 0; - - cq = to_mcq(ib_cq); - spin_lock(&cq->lock); - cur = wq->head - wq->tail; - spin_unlock(&cq->lock); - - return cur + nreq >= wq->max_post; -} - -static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, - u64 remote_addr, u32 rkey) -{ - rseg->raddr = cpu_to_be64(remote_addr); - rseg->rkey = cpu_to_be32(rkey); - rseg->reserved = 0; -} - -static void set_eth_seg(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp, - void **seg, int *size, void **cur_edge) -{ - struct mlx5_wqe_eth_seg *eseg = *seg; - - memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg)); - - if (wr->send_flags & IB_SEND_IP_CSUM) - eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | - MLX5_ETH_WQE_L4_CSUM; - - if (wr->opcode == IB_WR_LSO) { - struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr); - size_t left, copysz; - void *pdata = ud_wr->header; - size_t stride; - - left = ud_wr->hlen; - eseg->mss = cpu_to_be16(ud_wr->mss); - eseg->inline_hdr.sz = cpu_to_be16(left); - - /* memcpy_send_wqe should get a 16B align address. Hence, we - * first copy up to the current edge and then, if needed, - * fall-through to memcpy_send_wqe. - */ - copysz = min_t(u64, *cur_edge - (void *)eseg->inline_hdr.start, - left); - memcpy(eseg->inline_hdr.start, pdata, copysz); - stride = ALIGN(sizeof(struct mlx5_wqe_eth_seg) - - sizeof(eseg->inline_hdr.start) + copysz, 16); - *size += stride / 16; - *seg += stride; - - if (copysz < left) { - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - left -= copysz; - pdata += copysz; - memcpy_send_wqe(&qp->sq, cur_edge, seg, size, pdata, - left); - } - - return; - } - - *seg += sizeof(struct mlx5_wqe_eth_seg); - *size += sizeof(struct mlx5_wqe_eth_seg) / 16; -} - -static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, - const struct ib_send_wr *wr) -{ - memcpy(&dseg->av, &to_mah(ud_wr(wr)->ah)->av, sizeof(struct mlx5_av)); - dseg->av.dqp_dct = cpu_to_be32(ud_wr(wr)->remote_qpn | MLX5_EXTENDED_UD_AV); - dseg->av.key.qkey.qkey = cpu_to_be32(ud_wr(wr)->remote_qkey); -} - -static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) -{ - dseg->byte_count = cpu_to_be32(sg->length); - dseg->lkey = cpu_to_be32(sg->lkey); - dseg->addr = cpu_to_be64(sg->addr); -} - -static u64 get_xlt_octo(u64 bytes) -{ - return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) / - MLX5_IB_UMR_OCTOWORD; -} - -static __be64 frwr_mkey_mask(bool atomic) -{ - u64 result; - - result = MLX5_MKEY_MASK_LEN | - MLX5_MKEY_MASK_PAGE_SIZE | - MLX5_MKEY_MASK_START_ADDR | - MLX5_MKEY_MASK_EN_RINVAL | - MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_LR | - MLX5_MKEY_MASK_LW | - MLX5_MKEY_MASK_RR | - MLX5_MKEY_MASK_RW | - MLX5_MKEY_MASK_SMALL_FENCE | - MLX5_MKEY_MASK_FREE; - - if (atomic) - result |= MLX5_MKEY_MASK_A; - - return cpu_to_be64(result); -} - -static __be64 sig_mkey_mask(void) -{ - u64 result; - - result = MLX5_MKEY_MASK_LEN | - MLX5_MKEY_MASK_PAGE_SIZE | - MLX5_MKEY_MASK_START_ADDR | - MLX5_MKEY_MASK_EN_SIGERR | - MLX5_MKEY_MASK_EN_RINVAL | - MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_LR | - MLX5_MKEY_MASK_LW | - MLX5_MKEY_MASK_RR | - MLX5_MKEY_MASK_RW | - MLX5_MKEY_MASK_SMALL_FENCE | - MLX5_MKEY_MASK_FREE | - MLX5_MKEY_MASK_BSF_EN; - - return cpu_to_be64(result); -} - -static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, - struct mlx5_ib_mr *mr, u8 flags, bool atomic) -{ - int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; - - memset(umr, 0, sizeof(*umr)); - - umr->flags = flags; - umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); - umr->mkey_mask = frwr_mkey_mask(atomic); -} - -static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr) -{ - memset(umr, 0, sizeof(*umr)); - umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); - umr->flags = MLX5_UMR_INLINE; -} - -static __be64 get_umr_enable_mr_mask(void) -{ - u64 result; - - result = MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_FREE; - - return cpu_to_be64(result); -} - -static __be64 get_umr_disable_mr_mask(void) -{ - u64 result; - - result = MLX5_MKEY_MASK_FREE; - - return cpu_to_be64(result); -} - -static __be64 get_umr_update_translation_mask(void) -{ - u64 result; - - result = MLX5_MKEY_MASK_LEN | - MLX5_MKEY_MASK_PAGE_SIZE | - MLX5_MKEY_MASK_START_ADDR; - - return cpu_to_be64(result); -} - -static __be64 get_umr_update_access_mask(int atomic) -{ - u64 result; - - result = MLX5_MKEY_MASK_LR | - MLX5_MKEY_MASK_LW | - MLX5_MKEY_MASK_RR | - MLX5_MKEY_MASK_RW; - - if (atomic) - result |= MLX5_MKEY_MASK_A; - - return cpu_to_be64(result); -} - -static __be64 get_umr_update_pd_mask(void) -{ - u64 result; - - result = MLX5_MKEY_MASK_PD; - - return cpu_to_be64(result); -} - -static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask) -{ - if ((mask & MLX5_MKEY_MASK_PAGE_SIZE && - MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) || - (mask & MLX5_MKEY_MASK_A && - MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))) - return -EPERM; - return 0; -} - -static int set_reg_umr_segment(struct mlx5_ib_dev *dev, - struct mlx5_wqe_umr_ctrl_seg *umr, - const struct ib_send_wr *wr, int atomic) -{ - const struct mlx5_umr_wr *umrwr = umr_wr(wr); - - memset(umr, 0, sizeof(*umr)); - - if (!umrwr->ignore_free_state) { - if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) - /* fail if free */ - umr->flags = MLX5_UMR_CHECK_FREE; - else - /* fail if not free */ - umr->flags = MLX5_UMR_CHECK_NOT_FREE; - } - - umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size)); - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) { - u64 offset = get_xlt_octo(umrwr->offset); - - umr->xlt_offset = cpu_to_be16(offset & 0xffff); - umr->xlt_offset_47_16 = cpu_to_be32(offset >> 16); - umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; - } - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) - umr->mkey_mask |= get_umr_update_translation_mask(); - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) { - umr->mkey_mask |= get_umr_update_access_mask(atomic); - umr->mkey_mask |= get_umr_update_pd_mask(); - } - if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR) - umr->mkey_mask |= get_umr_enable_mr_mask(); - if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) - umr->mkey_mask |= get_umr_disable_mr_mask(); - - if (!wr->num_sge) - umr->flags |= MLX5_UMR_INLINE; - - return umr_check_mkey_mask(dev, be64_to_cpu(umr->mkey_mask)); -} - -static u8 get_umr_flags(int acc) -{ - return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | - (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | - (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | - (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | - MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; -} - -static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, - struct mlx5_ib_mr *mr, - u32 key, int access) -{ - int ndescs = ALIGN(mr->ndescs + mr->meta_ndescs, 8) >> 1; - - memset(seg, 0, sizeof(*seg)); - - if (mr->access_mode == MLX5_MKC_ACCESS_MODE_MTT) - seg->log2_page_size = ilog2(mr->ibmr.page_size); - else if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) - /* KLMs take twice the size of MTTs */ - ndescs *= 2; - - seg->flags = get_umr_flags(access) | mr->access_mode; - seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00); - seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); - seg->start_addr = cpu_to_be64(mr->ibmr.iova); - seg->len = cpu_to_be64(mr->ibmr.length); - seg->xlt_oct_size = cpu_to_be32(ndescs); -} - -static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg) -{ - memset(seg, 0, sizeof(*seg)); - seg->status = MLX5_MKEY_STATUS_FREE; -} - -static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, - const struct ib_send_wr *wr) -{ - const struct mlx5_umr_wr *umrwr = umr_wr(wr); - - memset(seg, 0, sizeof(*seg)); - if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) - seg->status = MLX5_MKEY_STATUS_FREE; - - seg->flags = convert_access(umrwr->access_flags); - if (umrwr->pd) - seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION && - !umrwr->length) - seg->flags_pd |= cpu_to_be32(MLX5_MKEY_LEN64); - - seg->start_addr = cpu_to_be64(umrwr->virt_addr); - seg->len = cpu_to_be64(umrwr->length); - seg->log2_page_size = umrwr->page_shift; - seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | - mlx5_mkey_variant(umrwr->mkey)); -} - -static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, - struct mlx5_ib_mr *mr, - struct mlx5_ib_pd *pd) -{ - int bcount = mr->desc_size * (mr->ndescs + mr->meta_ndescs); - - dseg->addr = cpu_to_be64(mr->desc_map); - dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64)); - dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey); -} - -static __be32 send_ieth(const struct ib_send_wr *wr) -{ - switch (wr->opcode) { - case IB_WR_SEND_WITH_IMM: - case IB_WR_RDMA_WRITE_WITH_IMM: - return wr->ex.imm_data; - - case IB_WR_SEND_WITH_INV: - return cpu_to_be32(wr->ex.invalidate_rkey); - - default: - return 0; - } -} - -static u8 calc_sig(void *wqe, int size) -{ - u8 *p = wqe; - u8 res = 0; - int i; - - for (i = 0; i < size; i++) - res ^= p[i]; - - return ~res; -} - -static u8 wq_sig(void *wqe) -{ - return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); -} - -static int set_data_inl_seg(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, - void **wqe, int *wqe_sz, void **cur_edge) -{ - struct mlx5_wqe_inline_seg *seg; - size_t offset; - int inl = 0; - int i; - - seg = *wqe; - *wqe += sizeof(*seg); - offset = sizeof(*seg); - - for (i = 0; i < wr->num_sge; i++) { - size_t len = wr->sg_list[i].length; - void *addr = (void *)(unsigned long)(wr->sg_list[i].addr); - - inl += len; - - if (unlikely(inl > qp->max_inline_data)) - return -ENOMEM; - - while (likely(len)) { - size_t leftlen; - size_t copysz; - - handle_post_send_edge(&qp->sq, wqe, - *wqe_sz + (offset >> 4), - cur_edge); - - leftlen = *cur_edge - *wqe; - copysz = min_t(size_t, leftlen, len); - - memcpy(*wqe, addr, copysz); - len -= copysz; - addr += copysz; - *wqe += copysz; - offset += copysz; - } - } - - seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); - - *wqe_sz += ALIGN(inl + sizeof(seg->byte_count), 16) / 16; - - return 0; -} - -static u16 prot_field_size(enum ib_signature_type type) -{ - switch (type) { - case IB_SIG_TYPE_T10_DIF: - return MLX5_DIF_SIZE; - default: - return 0; - } -} - -static u8 bs_selector(int block_size) -{ - switch (block_size) { - case 512: return 0x1; - case 520: return 0x2; - case 4096: return 0x3; - case 4160: return 0x4; - case 1073741824: return 0x5; - default: return 0; - } -} - -static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain, - struct mlx5_bsf_inl *inl) -{ - /* Valid inline section and allow BSF refresh */ - inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID | - MLX5_BSF_REFRESH_DIF); - inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag); - inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag); - /* repeating block */ - inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK; - inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ? - MLX5_DIF_CRC : MLX5_DIF_IPCS; - - if (domain->sig.dif.ref_remap) - inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG; - - if (domain->sig.dif.app_escape) { - if (domain->sig.dif.ref_escape) - inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE; - else - inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE; - } - - inl->dif_app_bitmask_check = - cpu_to_be16(domain->sig.dif.apptag_check_mask); -} - -static int mlx5_set_bsf(struct ib_mr *sig_mr, - struct ib_sig_attrs *sig_attrs, - struct mlx5_bsf *bsf, u32 data_size) -{ - struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; - struct mlx5_bsf_basic *basic = &bsf->basic; - struct ib_sig_domain *mem = &sig_attrs->mem; - struct ib_sig_domain *wire = &sig_attrs->wire; - - memset(bsf, 0, sizeof(*bsf)); - - /* Basic + Extended + Inline */ - basic->bsf_size_sbs = 1 << 7; - /* Input domain check byte mask */ - basic->check_byte_mask = sig_attrs->check_mask; - basic->raw_data_size = cpu_to_be32(data_size); - - /* Memory domain */ - switch (sig_attrs->mem.sig_type) { - case IB_SIG_TYPE_NONE: - break; - case IB_SIG_TYPE_T10_DIF: - basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); - basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx); - mlx5_fill_inl_bsf(mem, &bsf->m_inl); - break; - default: - return -EINVAL; - } - - /* Wire domain */ - switch (sig_attrs->wire.sig_type) { - case IB_SIG_TYPE_NONE: - break; - case IB_SIG_TYPE_T10_DIF: - if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && - mem->sig_type == wire->sig_type) { - /* Same block structure */ - basic->bsf_size_sbs |= 1 << 4; - if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) - basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK; - if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) - basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK; - if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) - basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK; - } else - basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval); - - basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx); - mlx5_fill_inl_bsf(wire, &bsf->w_inl); - break; - default: - return -EINVAL; - } - - return 0; -} - -static int set_sig_data_segment(const struct ib_send_wr *send_wr, - struct ib_mr *sig_mr, - struct ib_sig_attrs *sig_attrs, - struct mlx5_ib_qp *qp, void **seg, int *size, - void **cur_edge) -{ - struct mlx5_bsf *bsf; - u32 data_len; - u32 data_key; - u64 data_va; - u32 prot_len = 0; - u32 prot_key = 0; - u64 prot_va = 0; - bool prot = false; - int ret; - int wqe_size; - struct mlx5_ib_mr *mr = to_mmr(sig_mr); - struct mlx5_ib_mr *pi_mr = mr->pi_mr; - - data_len = pi_mr->data_length; - data_key = pi_mr->ibmr.lkey; - data_va = pi_mr->data_iova; - if (pi_mr->meta_ndescs) { - prot_len = pi_mr->meta_length; - prot_key = pi_mr->ibmr.lkey; - prot_va = pi_mr->pi_iova; - prot = true; - } - - if (!prot || (data_key == prot_key && data_va == prot_va && - data_len == prot_len)) { - /** - * Source domain doesn't contain signature information - * or data and protection are interleaved in memory. - * So need construct: - * ------------------ - * | data_klm | - * ------------------ - * | BSF | - * ------------------ - **/ - struct mlx5_klm *data_klm = *seg; - - data_klm->bcount = cpu_to_be32(data_len); - data_klm->key = cpu_to_be32(data_key); - data_klm->va = cpu_to_be64(data_va); - wqe_size = ALIGN(sizeof(*data_klm), 64); - } else { - /** - * Source domain contains signature information - * So need construct a strided block format: - * --------------------------- - * | stride_block_ctrl | - * --------------------------- - * | data_klm | - * --------------------------- - * | prot_klm | - * --------------------------- - * | BSF | - * --------------------------- - **/ - struct mlx5_stride_block_ctrl_seg *sblock_ctrl; - struct mlx5_stride_block_entry *data_sentry; - struct mlx5_stride_block_entry *prot_sentry; - u16 block_size = sig_attrs->mem.sig.dif.pi_interval; - int prot_size; - - sblock_ctrl = *seg; - data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); - prot_sentry = (void *)data_sentry + sizeof(*data_sentry); - - prot_size = prot_field_size(sig_attrs->mem.sig_type); - if (!prot_size) { - pr_err("Bad block size given: %u\n", block_size); - return -EINVAL; - } - sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + - prot_size); - sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); - sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); - sblock_ctrl->num_entries = cpu_to_be16(2); - - data_sentry->bcount = cpu_to_be16(block_size); - data_sentry->key = cpu_to_be32(data_key); - data_sentry->va = cpu_to_be64(data_va); - data_sentry->stride = cpu_to_be16(block_size); - - prot_sentry->bcount = cpu_to_be16(prot_size); - prot_sentry->key = cpu_to_be32(prot_key); - prot_sentry->va = cpu_to_be64(prot_va); - prot_sentry->stride = cpu_to_be16(prot_size); - - wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + - sizeof(*prot_sentry), 64); - } - - *seg += wqe_size; - *size += wqe_size / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - - bsf = *seg; - ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); - if (ret) - return -EINVAL; - - *seg += sizeof(*bsf); - *size += sizeof(*bsf) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - - return 0; -} - -static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, - struct ib_mr *sig_mr, int access_flags, - u32 size, u32 length, u32 pdn) -{ - u32 sig_key = sig_mr->rkey; - u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; - - memset(seg, 0, sizeof(*seg)); - - seg->flags = get_umr_flags(access_flags) | MLX5_MKC_ACCESS_MODE_KLMS; - seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); - seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | - MLX5_MKEY_BSF_EN | pdn); - seg->len = cpu_to_be64(length); - seg->xlt_oct_size = cpu_to_be32(get_xlt_octo(size)); - seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); -} - -static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, - u32 size) -{ - memset(umr, 0, sizeof(*umr)); - - umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; - umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); - umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); - umr->mkey_mask = sig_mkey_mask(); -} - -static int set_pi_umr_wr(const struct ib_send_wr *send_wr, - struct mlx5_ib_qp *qp, void **seg, int *size, - void **cur_edge) -{ - const struct ib_reg_wr *wr = reg_wr(send_wr); - struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr); - struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr; - struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs; - u32 pdn = get_pd(qp)->pdn; - u32 xlt_size; - int region_len, ret; - - if (unlikely(send_wr->num_sge != 0) || - unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) || - unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) || - unlikely(!sig_mr->sig->sig_status_checked)) - return -EINVAL; - - /* length of the protected region, data + protection */ - region_len = pi_mr->ibmr.length; - - /** - * KLM octoword size - if protection was provided - * then we use strided block format (3 octowords), - * else we use single KLM (1 octoword) - **/ - if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE) - xlt_size = 0x30; - else - xlt_size = sizeof(struct mlx5_klm); - - set_sig_umr_segment(*seg, xlt_size); - *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); - *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - - set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len, - pdn); - *seg += sizeof(struct mlx5_mkey_seg); - *size += sizeof(struct mlx5_mkey_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - - ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size, - cur_edge); - if (ret) - return ret; - - sig_mr->sig->sig_status_checked = false; - return 0; -} - -static int set_psv_wr(struct ib_sig_domain *domain, - u32 psv_idx, void **seg, int *size) -{ - struct mlx5_seg_set_psv *psv_seg = *seg; - - memset(psv_seg, 0, sizeof(*psv_seg)); - psv_seg->psv_num = cpu_to_be32(psv_idx); - switch (domain->sig_type) { - case IB_SIG_TYPE_NONE: - break; - case IB_SIG_TYPE_T10_DIF: - psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | - domain->sig.dif.app_tag); - psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); - break; - default: - pr_err("Bad signature type (%d) is given.\n", - domain->sig_type); - return -EINVAL; - } - - *seg += sizeof(*psv_seg); - *size += sizeof(*psv_seg) / 16; - - return 0; -} - -static int set_reg_wr(struct mlx5_ib_qp *qp, - const struct ib_reg_wr *wr, - void **seg, int *size, void **cur_edge, - bool check_not_free) -{ - struct mlx5_ib_mr *mr = to_mmr(wr->mr); - struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); - struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device); - int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; - bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; - bool atomic = wr->access & IB_ACCESS_REMOTE_ATOMIC; - u8 flags = 0; - - if (!mlx5_ib_can_use_umr(dev, atomic, wr->access)) { - mlx5_ib_warn(to_mdev(qp->ibqp.device), - "Fast update of %s for MR is disabled\n", - (MLX5_CAP_GEN(dev->mdev, - umr_modify_entity_size_disabled)) ? - "entity size" : - "atomic access"); - return -EINVAL; - } - - if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { - mlx5_ib_warn(to_mdev(qp->ibqp.device), - "Invalid IB_SEND_INLINE send flag\n"); - return -EINVAL; - } - - if (check_not_free) - flags |= MLX5_UMR_CHECK_NOT_FREE; - if (umr_inline) - flags |= MLX5_UMR_INLINE; - - set_reg_umr_seg(*seg, mr, flags, atomic); - *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); - *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - - set_reg_mkey_seg(*seg, mr, wr->key, wr->access); - *seg += sizeof(struct mlx5_mkey_seg); - *size += sizeof(struct mlx5_mkey_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - - if (umr_inline) { - memcpy_send_wqe(&qp->sq, cur_edge, seg, size, mr->descs, - mr_list_size); - *size = ALIGN(*size, MLX5_SEND_WQE_BB >> 4); - } else { - set_reg_data_seg(*seg, mr, pd); - *seg += sizeof(struct mlx5_wqe_data_seg); - *size += (sizeof(struct mlx5_wqe_data_seg) / 16); - } - return 0; -} - -static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size, - void **cur_edge) -{ - set_linv_umr_seg(*seg); - *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); - *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - set_linv_mkey_seg(*seg); - *seg += sizeof(struct mlx5_mkey_seg); - *size += sizeof(struct mlx5_mkey_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); -} - -static void dump_wqe(struct mlx5_ib_qp *qp, u32 idx, int size_16) -{ - __be32 *p = NULL; - int i, j; - - pr_debug("dump WQE index %u:\n", idx); - for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { - if ((i & 0xf) == 0) { - p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx); - pr_debug("WQBB at %p:\n", (void *)p); - j = 0; - idx = (idx + 1) & (qp->sq.wqe_cnt - 1); - } - pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]), - be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]), - be32_to_cpu(p[j + 3])); - } -} - -static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg, - struct mlx5_wqe_ctrl_seg **ctrl, - const struct ib_send_wr *wr, unsigned int *idx, - int *size, void **cur_edge, int nreq, - bool send_signaled, bool solicited) -{ - if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) - return -ENOMEM; - - *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); - *seg = mlx5_frag_buf_get_wqe(&qp->sq.fbc, *idx); - *ctrl = *seg; - *(uint32_t *)(*seg + 8) = 0; - (*ctrl)->imm = send_ieth(wr); - (*ctrl)->fm_ce_se = qp->sq_signal_bits | - (send_signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0) | - (solicited ? MLX5_WQE_CTRL_SOLICITED : 0); - - *seg += sizeof(**ctrl); - *size = sizeof(**ctrl) / 16; - *cur_edge = qp->sq.cur_edge; - - return 0; -} - -static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, - struct mlx5_wqe_ctrl_seg **ctrl, - const struct ib_send_wr *wr, unsigned *idx, - int *size, void **cur_edge, int nreq) -{ - return __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq, - wr->send_flags & IB_SEND_SIGNALED, - wr->send_flags & IB_SEND_SOLICITED); -} - -static void finish_wqe(struct mlx5_ib_qp *qp, - struct mlx5_wqe_ctrl_seg *ctrl, - void *seg, u8 size, void *cur_edge, - unsigned int idx, u64 wr_id, int nreq, u8 fence, - u32 mlx5_opcode) -{ - u8 opmod = 0; - - ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | - mlx5_opcode | ((u32)opmod << 24)); - ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8)); - ctrl->fm_ce_se |= fence; - if (unlikely(qp->flags_en & MLX5_QP_FLAG_SIGNATURE)) - ctrl->signature = wq_sig(ctrl); - - qp->sq.wrid[idx] = wr_id; - qp->sq.w_list[idx].opcode = mlx5_opcode; - qp->sq.wqe_head[idx] = qp->sq.head + nreq; - qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); - qp->sq.w_list[idx].next = qp->sq.cur_post; - - /* We save the edge which was possibly updated during the WQE - * construction, into SQ's cache. - */ - seg = PTR_ALIGN(seg, MLX5_SEND_WQE_BB); - qp->sq.cur_edge = (unlikely(seg == cur_edge)) ? - get_sq_edge(&qp->sq, qp->sq.cur_post & - (qp->sq.wqe_cnt - 1)) : - cur_edge; -} - -static void handle_rdma_op(const struct ib_send_wr *wr, void **seg, int *size) -{ - set_raddr_seg(*seg, rdma_wr(wr)->remote_addr, rdma_wr(wr)->rkey); - *seg += sizeof(struct mlx5_wqe_raddr_seg); - *size += sizeof(struct mlx5_wqe_raddr_seg) / 16; -} - -static void handle_local_inv(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, - struct mlx5_wqe_ctrl_seg **ctrl, void **seg, - int *size, void **cur_edge, unsigned int idx) -{ - qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; - (*ctrl)->imm = cpu_to_be32(wr->ex.invalidate_rkey); - set_linv_wr(qp, seg, size, cur_edge); -} - -static int handle_reg_mr(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, - struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, - void **cur_edge, unsigned int idx) -{ - qp->sq.wr_data[idx] = IB_WR_REG_MR; - (*ctrl)->imm = cpu_to_be32(reg_wr(wr)->key); - return set_reg_wr(qp, reg_wr(wr), seg, size, cur_edge, true); -} - -static int handle_psv(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - const struct ib_send_wr *wr, - struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, - void **cur_edge, unsigned int *idx, int nreq, - struct ib_sig_domain *domain, u32 psv_index, - u8 next_fence) -{ - int err; - - /* - * SET_PSV WQEs are not signaled and solicited on error. - */ - err = __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq, - false, true); - if (unlikely(err)) { - mlx5_ib_warn(dev, "\n"); - err = -ENOMEM; - goto out; - } - err = set_psv_wr(domain, psv_index, seg, size); - if (unlikely(err)) { - mlx5_ib_warn(dev, "\n"); - goto out; - } - finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, nreq, - next_fence, MLX5_OPCODE_SET_PSV); - -out: - return err; -} - -static int handle_reg_mr_integrity(struct mlx5_ib_dev *dev, - struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, - struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, - void **cur_edge, unsigned int *idx, int nreq, u8 fence, - u8 next_fence) -{ - struct mlx5_ib_mr *mr; - struct mlx5_ib_mr *pi_mr; - struct mlx5_ib_mr pa_pi_mr; - struct ib_sig_attrs *sig_attrs; - struct ib_reg_wr reg_pi_wr; - int err; - - qp->sq.wr_data[*idx] = IB_WR_REG_MR_INTEGRITY; - - mr = to_mmr(reg_wr(wr)->mr); - pi_mr = mr->pi_mr; - - if (pi_mr) { - memset(®_pi_wr, 0, - sizeof(struct ib_reg_wr)); - - reg_pi_wr.mr = &pi_mr->ibmr; - reg_pi_wr.access = reg_wr(wr)->access; - reg_pi_wr.key = pi_mr->ibmr.rkey; - - (*ctrl)->imm = cpu_to_be32(reg_pi_wr.key); - /* UMR for data + prot registration */ - err = set_reg_wr(qp, ®_pi_wr, seg, size, cur_edge, false); - if (unlikely(err)) - goto out; - - finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, - nreq, fence, MLX5_OPCODE_UMR); - - err = begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq); - if (unlikely(err)) { - mlx5_ib_warn(dev, "\n"); - err = -ENOMEM; - goto out; - } - } else { - memset(&pa_pi_mr, 0, sizeof(struct mlx5_ib_mr)); - /* No UMR, use local_dma_lkey */ - pa_pi_mr.ibmr.lkey = mr->ibmr.pd->local_dma_lkey; - pa_pi_mr.ndescs = mr->ndescs; - pa_pi_mr.data_length = mr->data_length; - pa_pi_mr.data_iova = mr->data_iova; - if (mr->meta_ndescs) { - pa_pi_mr.meta_ndescs = mr->meta_ndescs; - pa_pi_mr.meta_length = mr->meta_length; - pa_pi_mr.pi_iova = mr->pi_iova; - } - - pa_pi_mr.ibmr.length = mr->ibmr.length; - mr->pi_mr = &pa_pi_mr; - } - (*ctrl)->imm = cpu_to_be32(mr->ibmr.rkey); - /* UMR for sig MR */ - err = set_pi_umr_wr(wr, qp, seg, size, cur_edge); - if (unlikely(err)) { - mlx5_ib_warn(dev, "\n"); - goto out; - } - finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, nreq, - fence, MLX5_OPCODE_UMR); - - sig_attrs = mr->ibmr.sig_attrs; - err = handle_psv(dev, qp, wr, ctrl, seg, size, cur_edge, idx, nreq, - &sig_attrs->mem, mr->sig->psv_memory.psv_idx, - next_fence); - if (unlikely(err)) - goto out; - - err = handle_psv(dev, qp, wr, ctrl, seg, size, cur_edge, idx, nreq, - &sig_attrs->wire, mr->sig->psv_wire.psv_idx, - next_fence); - if (unlikely(err)) - goto out; - - qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; - -out: - return err; -} - -static int handle_qpt_rc(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - const struct ib_send_wr *wr, - struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, - void **cur_edge, unsigned int *idx, int nreq, u8 fence, - u8 next_fence, int *num_sge) -{ - int err = 0; - - switch (wr->opcode) { - case IB_WR_RDMA_READ: - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_WRITE_WITH_IMM: - handle_rdma_op(wr, seg, size); - break; - - case IB_WR_ATOMIC_CMP_AND_SWP: - case IB_WR_ATOMIC_FETCH_AND_ADD: - case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: - mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); - err = -EOPNOTSUPP; - goto out; - - case IB_WR_LOCAL_INV: - handle_local_inv(qp, wr, ctrl, seg, size, cur_edge, *idx); - *num_sge = 0; - break; - - case IB_WR_REG_MR: - err = handle_reg_mr(qp, wr, ctrl, seg, size, cur_edge, *idx); - if (unlikely(err)) - goto out; - *num_sge = 0; - break; - - case IB_WR_REG_MR_INTEGRITY: - err = handle_reg_mr_integrity(dev, qp, wr, ctrl, seg, size, - cur_edge, idx, nreq, fence, - next_fence); - if (unlikely(err)) - goto out; - *num_sge = 0; - break; - - default: - break; - } - -out: - return err; -} - -static void handle_qpt_uc(const struct ib_send_wr *wr, void **seg, int *size) -{ - switch (wr->opcode) { - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_WRITE_WITH_IMM: - handle_rdma_op(wr, seg, size); - break; - default: - break; - } -} - -static void handle_qpt_hw_gsi(struct mlx5_ib_qp *qp, - const struct ib_send_wr *wr, void **seg, - int *size, void **cur_edge) -{ - set_datagram_seg(*seg, wr); - *seg += sizeof(struct mlx5_wqe_datagram_seg); - *size += sizeof(struct mlx5_wqe_datagram_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); -} - -static void handle_qpt_ud(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, - void **seg, int *size, void **cur_edge) -{ - set_datagram_seg(*seg, wr); - *seg += sizeof(struct mlx5_wqe_datagram_seg); - *size += sizeof(struct mlx5_wqe_datagram_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - - /* handle qp that supports ud offload */ - if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) { - struct mlx5_wqe_eth_pad *pad; - - pad = *seg; - memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad)); - *seg += sizeof(struct mlx5_wqe_eth_pad); - *size += sizeof(struct mlx5_wqe_eth_pad) / 16; - set_eth_seg(wr, qp, seg, size, cur_edge); - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - } -} - -static int handle_qpt_reg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - const struct ib_send_wr *wr, - struct mlx5_wqe_ctrl_seg **ctrl, void **seg, - int *size, void **cur_edge, unsigned int idx) -{ - int err = 0; - - if (unlikely(wr->opcode != MLX5_IB_WR_UMR)) { - err = -EINVAL; - mlx5_ib_warn(dev, "bad opcode %d\n", wr->opcode); - goto out; - } - - qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; - (*ctrl)->imm = cpu_to_be32(umr_wr(wr)->mkey); - err = set_reg_umr_segment(dev, *seg, wr, - !!(MLX5_CAP_GEN(dev->mdev, atomic))); - if (unlikely(err)) - goto out; - *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); - *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); - set_reg_mkey_segment(*seg, wr); - *seg += sizeof(struct mlx5_mkey_seg); - *size += sizeof(struct mlx5_mkey_seg) / 16; - handle_post_send_edge(&qp->sq, seg, *size, cur_edge); -out: - return err; -} - -static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, - const struct ib_send_wr **bad_wr, bool drain) -{ - struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ - struct mlx5_ib_dev *dev = to_mdev(ibqp->device); - struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_ib_qp *qp; - struct mlx5_wqe_xrc_seg *xrc; - struct mlx5_bf *bf; - void *cur_edge; - int uninitialized_var(size); - unsigned long flags; - unsigned idx; - int err = 0; - int num_sge; - void *seg; - int nreq; - int i; - u8 next_fence = 0; - u8 fence; - - if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && - !drain)) { - *bad_wr = wr; - return -EIO; - } - - if (unlikely(ibqp->qp_type == IB_QPT_GSI)) - return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); - - qp = to_mqp(ibqp); - bf = &qp->bf; - - spin_lock_irqsave(&qp->sq.lock, flags); - - for (nreq = 0; wr; nreq++, wr = wr->next) { - if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { - mlx5_ib_warn(dev, "\n"); - err = -EINVAL; - *bad_wr = wr; - goto out; - } - - num_sge = wr->num_sge; - if (unlikely(num_sge > qp->sq.max_gs)) { - mlx5_ib_warn(dev, "\n"); - err = -EINVAL; - *bad_wr = wr; - goto out; - } - - err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, &cur_edge, - nreq); - if (err) { - mlx5_ib_warn(dev, "\n"); - err = -ENOMEM; - *bad_wr = wr; - goto out; - } - - if (wr->opcode == IB_WR_REG_MR || - wr->opcode == IB_WR_REG_MR_INTEGRITY) { - fence = dev->umr_fence; - next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; - } else { - if (wr->send_flags & IB_SEND_FENCE) { - if (qp->next_fence) - fence = MLX5_FENCE_MODE_SMALL_AND_FENCE; - else - fence = MLX5_FENCE_MODE_FENCE; - } else { - fence = qp->next_fence; - } - } - - switch (ibqp->qp_type) { - case IB_QPT_XRC_INI: - xrc = seg; - seg += sizeof(*xrc); - size += sizeof(*xrc) / 16; - /* fall through */ - case IB_QPT_RC: - err = handle_qpt_rc(dev, qp, wr, &ctrl, &seg, &size, - &cur_edge, &idx, nreq, fence, - next_fence, &num_sge); - if (unlikely(err)) { - *bad_wr = wr; - goto out; - } else if (wr->opcode == IB_WR_REG_MR_INTEGRITY) { - goto skip_psv; - } - break; - - case IB_QPT_UC: - handle_qpt_uc(wr, &seg, &size); - break; - case IB_QPT_SMI: - if (unlikely(!mdev->port_caps[qp->port - 1].has_smi)) { - mlx5_ib_warn(dev, "Send SMP MADs is not allowed\n"); - err = -EPERM; - *bad_wr = wr; - goto out; - } - /* fall through */ - case MLX5_IB_QPT_HW_GSI: - handle_qpt_hw_gsi(qp, wr, &seg, &size, &cur_edge); - break; - case IB_QPT_UD: - handle_qpt_ud(qp, wr, &seg, &size, &cur_edge); - break; - case MLX5_IB_QPT_REG_UMR: - err = handle_qpt_reg_umr(dev, qp, wr, &ctrl, &seg, - &size, &cur_edge, idx); - if (unlikely(err)) - goto out; - break; - - default: - break; - } - - if (wr->send_flags & IB_SEND_INLINE && num_sge) { - err = set_data_inl_seg(qp, wr, &seg, &size, &cur_edge); - if (unlikely(err)) { - mlx5_ib_warn(dev, "\n"); - *bad_wr = wr; - goto out; - } - } else { - for (i = 0; i < num_sge; i++) { - handle_post_send_edge(&qp->sq, &seg, size, - &cur_edge); - if (likely(wr->sg_list[i].length)) { - set_data_ptr_seg - ((struct mlx5_wqe_data_seg *)seg, - wr->sg_list + i); - size += sizeof(struct mlx5_wqe_data_seg) / 16; - seg += sizeof(struct mlx5_wqe_data_seg); - } - } - } - - qp->next_fence = next_fence; - finish_wqe(qp, ctrl, seg, size, cur_edge, idx, wr->wr_id, nreq, - fence, mlx5_ib_opcode[wr->opcode]); -skip_psv: - if (0) - dump_wqe(qp, idx, size); - } - -out: - if (likely(nreq)) { - qp->sq.head += nreq; - - /* Make sure that descriptors are written before - * updating doorbell record and ringing the doorbell - */ - wmb(); - - qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); - - /* Make sure doorbell record is visible to the HCA before - * we hit doorbell */ - wmb(); - - mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset); - /* Make sure doorbells don't leak out of SQ spinlock - * and reach the HCA out of order. - */ - bf->offset ^= bf->buf_size; - } - - spin_unlock_irqrestore(&qp->sq.lock, flags); - - return err; -} - -int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, - const struct ib_send_wr **bad_wr) -{ - return _mlx5_ib_post_send(ibqp, wr, bad_wr, false); -} - -static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size) -{ - sig->signature = calc_sig(sig, size); -} - -static int _mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, - const struct ib_recv_wr **bad_wr, bool drain) -{ - struct mlx5_ib_qp *qp = to_mqp(ibqp); - struct mlx5_wqe_data_seg *scat; - struct mlx5_rwqe_sig *sig; - struct mlx5_ib_dev *dev = to_mdev(ibqp->device); - struct mlx5_core_dev *mdev = dev->mdev; - unsigned long flags; - int err = 0; - int nreq; - int ind; - int i; - - if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && - !drain)) { - *bad_wr = wr; - return -EIO; - } - - if (unlikely(ibqp->qp_type == IB_QPT_GSI)) - return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr); - - spin_lock_irqsave(&qp->rq.lock, flags); - - ind = qp->rq.head & (qp->rq.wqe_cnt - 1); - - for (nreq = 0; wr; nreq++, wr = wr->next) { - if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { - err = -ENOMEM; - *bad_wr = wr; - goto out; - } - - if (unlikely(wr->num_sge > qp->rq.max_gs)) { - err = -EINVAL; - *bad_wr = wr; - goto out; - } - - scat = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ind); - if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) - scat++; - - for (i = 0; i < wr->num_sge; i++) - set_data_ptr_seg(scat + i, wr->sg_list + i); - - if (i < qp->rq.max_gs) { - scat[i].byte_count = 0; - scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); - scat[i].addr = 0; - } - - if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) { - sig = (struct mlx5_rwqe_sig *)scat; - set_sig_seg(sig, (qp->rq.max_gs + 1) << 2); - } - - qp->rq.wrid[ind] = wr->wr_id; - - ind = (ind + 1) & (qp->rq.wqe_cnt - 1); - } - -out: - if (likely(nreq)) { - qp->rq.head += nreq; - - /* Make sure that descriptors are written before - * doorbell record. - */ - wmb(); - - *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); - } - - spin_unlock_irqrestore(&qp->rq.lock, flags); - - return err; -} - -int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, - const struct ib_recv_wr **bad_wr) -{ - return _mlx5_ib_post_recv(ibqp, wr, bad_wr, false); -} - static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) { switch (mlx5_state) { @@ -6808,7 +5266,7 @@ void mlx5_ib_drain_sq(struct ib_qp *qp) sdrain.cqe.done = mlx5_ib_drain_qp_done; init_completion(&sdrain.done); - ret = _mlx5_ib_post_send(qp, &swr.wr, &bad_swr, true); + ret = mlx5_ib_post_send_drain(qp, &swr.wr, &bad_swr); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); return; @@ -6838,7 +5296,7 @@ void mlx5_ib_drain_rq(struct ib_qp *qp) rdrain.cqe.done = mlx5_ib_drain_qp_done; init_completion(&rdrain.done); - ret = _mlx5_ib_post_recv(qp, &rwr, &bad_rwr, true); + ret = mlx5_ib_post_recv_drain(qp, &rwr, &bad_rwr); if (ret) { WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); return; diff --git a/drivers/infiniband/hw/mlx5/wr.c b/drivers/infiniband/hw/mlx5/wr.c new file mode 100644 index 000000000000..2c6df1c43b55 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/wr.c @@ -0,0 +1,1504 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include +#include +#include "wr.h" + +static const u32 mlx5_ib_opcode[] = { + [IB_WR_SEND] = MLX5_OPCODE_SEND, + [IB_WR_LSO] = MLX5_OPCODE_LSO, + [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM, + [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM, + [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS, + [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA, + [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL, + [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR, + [IB_WR_REG_MR] = MLX5_OPCODE_UMR, + [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS, + [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA, + [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR, +}; + +/* handle_post_send_edge - Check if we get to SQ edge. If yes, update to the + * next nearby edge and get new address translation for current WQE position. + * @sq - SQ buffer. + * @seg: Current WQE position (16B aligned). + * @wqe_sz: Total current WQE size [16B]. + * @cur_edge: Updated current edge. + */ +static inline void handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg, + u32 wqe_sz, void **cur_edge) +{ + u32 idx; + + if (likely(*seg != *cur_edge)) + return; + + idx = (sq->cur_post + (wqe_sz >> 2)) & (sq->wqe_cnt - 1); + *cur_edge = get_sq_edge(sq, idx); + + *seg = mlx5_frag_buf_get_wqe(&sq->fbc, idx); +} + +/* memcpy_send_wqe - copy data from src to WQE and update the relevant WQ's + * pointers. At the end @seg is aligned to 16B regardless the copied size. + * @sq - SQ buffer. + * @cur_edge: Updated current edge. + * @seg: Current WQE position (16B aligned). + * @wqe_sz: Total current WQE size [16B]. + * @src: Pointer to copy from. + * @n: Number of bytes to copy. + */ +static inline void memcpy_send_wqe(struct mlx5_ib_wq *sq, void **cur_edge, + void **seg, u32 *wqe_sz, const void *src, + size_t n) +{ + while (likely(n)) { + size_t leftlen = *cur_edge - *seg; + size_t copysz = min_t(size_t, leftlen, n); + size_t stride; + + memcpy(*seg, src, copysz); + + n -= copysz; + src += copysz; + stride = !n ? ALIGN(copysz, 16) : copysz; + *seg += stride; + *wqe_sz += stride >> 4; + handle_post_send_edge(sq, seg, *wqe_sz, cur_edge); + } +} + +static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, + struct ib_cq *ib_cq) +{ + struct mlx5_ib_cq *cq; + unsigned int cur; + + cur = wq->head - wq->tail; + if (likely(cur + nreq < wq->max_post)) + return 0; + + cq = to_mcq(ib_cq); + spin_lock(&cq->lock); + cur = wq->head - wq->tail; + spin_unlock(&cq->lock); + + return cur + nreq >= wq->max_post; +} + +static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, + u64 remote_addr, u32 rkey) +{ + rseg->raddr = cpu_to_be64(remote_addr); + rseg->rkey = cpu_to_be32(rkey); + rseg->reserved = 0; +} + +static void set_eth_seg(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp, + void **seg, int *size, void **cur_edge) +{ + struct mlx5_wqe_eth_seg *eseg = *seg; + + memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg)); + + if (wr->send_flags & IB_SEND_IP_CSUM) + eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM | + MLX5_ETH_WQE_L4_CSUM; + + if (wr->opcode == IB_WR_LSO) { + struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr); + size_t left, copysz; + void *pdata = ud_wr->header; + size_t stride; + + left = ud_wr->hlen; + eseg->mss = cpu_to_be16(ud_wr->mss); + eseg->inline_hdr.sz = cpu_to_be16(left); + + /* memcpy_send_wqe should get a 16B align address. Hence, we + * first copy up to the current edge and then, if needed, + * continue to memcpy_send_wqe. + */ + copysz = min_t(u64, *cur_edge - (void *)eseg->inline_hdr.start, + left); + memcpy(eseg->inline_hdr.start, pdata, copysz); + stride = ALIGN(sizeof(struct mlx5_wqe_eth_seg) - + sizeof(eseg->inline_hdr.start) + copysz, 16); + *size += stride / 16; + *seg += stride; + + if (copysz < left) { + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + left -= copysz; + pdata += copysz; + memcpy_send_wqe(&qp->sq, cur_edge, seg, size, pdata, + left); + } + + return; + } + + *seg += sizeof(struct mlx5_wqe_eth_seg); + *size += sizeof(struct mlx5_wqe_eth_seg) / 16; +} + +static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, + const struct ib_send_wr *wr) +{ + memcpy(&dseg->av, &to_mah(ud_wr(wr)->ah)->av, sizeof(struct mlx5_av)); + dseg->av.dqp_dct = + cpu_to_be32(ud_wr(wr)->remote_qpn | MLX5_EXTENDED_UD_AV); + dseg->av.key.qkey.qkey = cpu_to_be32(ud_wr(wr)->remote_qkey); +} + +static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) +{ + dseg->byte_count = cpu_to_be32(sg->length); + dseg->lkey = cpu_to_be32(sg->lkey); + dseg->addr = cpu_to_be64(sg->addr); +} + +static u64 get_xlt_octo(u64 bytes) +{ + return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) / + MLX5_IB_UMR_OCTOWORD; +} + +static __be64 frwr_mkey_mask(bool atomic) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE; + + if (atomic) + result |= MLX5_MKEY_MASK_A; + + return cpu_to_be64(result); +} + +static __be64 sig_mkey_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR | + MLX5_MKEY_MASK_EN_SIGERR | + MLX5_MKEY_MASK_EN_RINVAL | + MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW | + MLX5_MKEY_MASK_SMALL_FENCE | + MLX5_MKEY_MASK_FREE | + MLX5_MKEY_MASK_BSF_EN; + + return cpu_to_be64(result); +} + +static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, + struct mlx5_ib_mr *mr, u8 flags, bool atomic) +{ + int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; + + memset(umr, 0, sizeof(*umr)); + + umr->flags = flags; + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); + umr->mkey_mask = frwr_mkey_mask(atomic); +} + +static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr) +{ + memset(umr, 0, sizeof(*umr)); + umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE); + umr->flags = MLX5_UMR_INLINE; +} + +static __be64 get_umr_enable_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_KEY | + MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_disable_mr_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_FREE; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_translation_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_LEN | + MLX5_MKEY_MASK_PAGE_SIZE | + MLX5_MKEY_MASK_START_ADDR; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_access_mask(int atomic) +{ + u64 result; + + result = MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_RR | + MLX5_MKEY_MASK_RW; + + if (atomic) + result |= MLX5_MKEY_MASK_A; + + return cpu_to_be64(result); +} + +static __be64 get_umr_update_pd_mask(void) +{ + u64 result; + + result = MLX5_MKEY_MASK_PD; + + return cpu_to_be64(result); +} + +static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask) +{ + if ((mask & MLX5_MKEY_MASK_PAGE_SIZE && + MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) || + (mask & MLX5_MKEY_MASK_A && + MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))) + return -EPERM; + return 0; +} + +static int set_reg_umr_segment(struct mlx5_ib_dev *dev, + struct mlx5_wqe_umr_ctrl_seg *umr, + const struct ib_send_wr *wr, int atomic) +{ + const struct mlx5_umr_wr *umrwr = umr_wr(wr); + + memset(umr, 0, sizeof(*umr)); + + if (!umrwr->ignore_free_state) { + if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE) + /* fail if free */ + umr->flags = MLX5_UMR_CHECK_FREE; + else + /* fail if not free */ + umr->flags = MLX5_UMR_CHECK_NOT_FREE; + } + + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size)); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) { + u64 offset = get_xlt_octo(umrwr->offset); + + umr->xlt_offset = cpu_to_be16(offset & 0xffff); + umr->xlt_offset_47_16 = cpu_to_be32(offset >> 16); + umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; + } + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) + umr->mkey_mask |= get_umr_update_translation_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) { + umr->mkey_mask |= get_umr_update_access_mask(atomic); + umr->mkey_mask |= get_umr_update_pd_mask(); + } + if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR) + umr->mkey_mask |= get_umr_enable_mr_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) + umr->mkey_mask |= get_umr_disable_mr_mask(); + + if (!wr->num_sge) + umr->flags |= MLX5_UMR_INLINE; + + return umr_check_mkey_mask(dev, be64_to_cpu(umr->mkey_mask)); +} + +static u8 get_umr_flags(int acc) +{ + return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) | + (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) | + (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) | + (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) | + MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN; +} + +static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg, + struct mlx5_ib_mr *mr, + u32 key, int access) +{ + int ndescs = ALIGN(mr->ndescs + mr->meta_ndescs, 8) >> 1; + + memset(seg, 0, sizeof(*seg)); + + if (mr->access_mode == MLX5_MKC_ACCESS_MODE_MTT) + seg->log2_page_size = ilog2(mr->ibmr.page_size); + else if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) + /* KLMs take twice the size of MTTs */ + ndescs *= 2; + + seg->flags = get_umr_flags(access) | mr->access_mode; + seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL); + seg->start_addr = cpu_to_be64(mr->ibmr.iova); + seg->len = cpu_to_be64(mr->ibmr.length); + seg->xlt_oct_size = cpu_to_be32(ndescs); +} + +static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg) +{ + memset(seg, 0, sizeof(*seg)); + seg->status = MLX5_MKEY_STATUS_FREE; +} + +static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, + const struct ib_send_wr *wr) +{ + const struct mlx5_umr_wr *umrwr = umr_wr(wr); + + memset(seg, 0, sizeof(*seg)); + if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) + seg->status = MLX5_MKEY_STATUS_FREE; + + seg->flags = convert_access(umrwr->access_flags); + if (umrwr->pd) + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION && + !umrwr->length) + seg->flags_pd |= cpu_to_be32(MLX5_MKEY_LEN64); + + seg->start_addr = cpu_to_be64(umrwr->virt_addr); + seg->len = cpu_to_be64(umrwr->length); + seg->log2_page_size = umrwr->page_shift; + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(umrwr->mkey)); +} + +static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, + struct mlx5_ib_mr *mr, + struct mlx5_ib_pd *pd) +{ + int bcount = mr->desc_size * (mr->ndescs + mr->meta_ndescs); + + dseg->addr = cpu_to_be64(mr->desc_map); + dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64)); + dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey); +} + +static __be32 send_ieth(const struct ib_send_wr *wr) +{ + switch (wr->opcode) { + case IB_WR_SEND_WITH_IMM: + case IB_WR_RDMA_WRITE_WITH_IMM: + return wr->ex.imm_data; + + case IB_WR_SEND_WITH_INV: + return cpu_to_be32(wr->ex.invalidate_rkey); + + default: + return 0; + } +} + +static u8 calc_sig(void *wqe, int size) +{ + u8 *p = wqe; + u8 res = 0; + int i; + + for (i = 0; i < size; i++) + res ^= p[i]; + + return ~res; +} + +static u8 wq_sig(void *wqe) +{ + return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); +} + +static int set_data_inl_seg(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + void **wqe, int *wqe_sz, void **cur_edge) +{ + struct mlx5_wqe_inline_seg *seg; + size_t offset; + int inl = 0; + int i; + + seg = *wqe; + *wqe += sizeof(*seg); + offset = sizeof(*seg); + + for (i = 0; i < wr->num_sge; i++) { + size_t len = wr->sg_list[i].length; + void *addr = (void *)(unsigned long)(wr->sg_list[i].addr); + + inl += len; + + if (unlikely(inl > qp->max_inline_data)) + return -ENOMEM; + + while (likely(len)) { + size_t leftlen; + size_t copysz; + + handle_post_send_edge(&qp->sq, wqe, + *wqe_sz + (offset >> 4), + cur_edge); + + leftlen = *cur_edge - *wqe; + copysz = min_t(size_t, leftlen, len); + + memcpy(*wqe, addr, copysz); + len -= copysz; + addr += copysz; + *wqe += copysz; + offset += copysz; + } + } + + seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG); + + *wqe_sz += ALIGN(inl + sizeof(seg->byte_count), 16) / 16; + + return 0; +} + +static u16 prot_field_size(enum ib_signature_type type) +{ + switch (type) { + case IB_SIG_TYPE_T10_DIF: + return MLX5_DIF_SIZE; + default: + return 0; + } +} + +static u8 bs_selector(int block_size) +{ + switch (block_size) { + case 512: return 0x1; + case 520: return 0x2; + case 4096: return 0x3; + case 4160: return 0x4; + case 1073741824: return 0x5; + default: return 0; + } +} + +static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain, + struct mlx5_bsf_inl *inl) +{ + /* Valid inline section and allow BSF refresh */ + inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID | + MLX5_BSF_REFRESH_DIF); + inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag); + inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag); + /* repeating block */ + inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK; + inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ? + MLX5_DIF_CRC : MLX5_DIF_IPCS; + + if (domain->sig.dif.ref_remap) + inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG; + + if (domain->sig.dif.app_escape) { + if (domain->sig.dif.ref_escape) + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE; + else + inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE; + } + + inl->dif_app_bitmask_check = + cpu_to_be16(domain->sig.dif.apptag_check_mask); +} + +static int mlx5_set_bsf(struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_bsf *bsf, u32 data_size) +{ + struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig; + struct mlx5_bsf_basic *basic = &bsf->basic; + struct ib_sig_domain *mem = &sig_attrs->mem; + struct ib_sig_domain *wire = &sig_attrs->wire; + + memset(bsf, 0, sizeof(*bsf)); + + /* Basic + Extended + Inline */ + basic->bsf_size_sbs = 1 << 7; + /* Input domain check byte mask */ + basic->check_byte_mask = sig_attrs->check_mask; + basic->raw_data_size = cpu_to_be32(data_size); + + /* Memory domain */ + switch (sig_attrs->mem.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval); + basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx); + mlx5_fill_inl_bsf(mem, &bsf->m_inl); + break; + default: + return -EINVAL; + } + + /* Wire domain */ + switch (sig_attrs->wire.sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval && + mem->sig_type == wire->sig_type) { + /* Same block structure */ + basic->bsf_size_sbs |= 1 << 4; + if (mem->sig.dif.bg_type == wire->sig.dif.bg_type) + basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK; + if (mem->sig.dif.app_tag == wire->sig.dif.app_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK; + if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag) + basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK; + } else + basic->wire.bs_selector = + bs_selector(wire->sig.dif.pi_interval); + + basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx); + mlx5_fill_inl_bsf(wire, &bsf->w_inl); + break; + default: + return -EINVAL; + } + + return 0; +} + + +static int set_sig_data_segment(const struct ib_send_wr *send_wr, + struct ib_mr *sig_mr, + struct ib_sig_attrs *sig_attrs, + struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) +{ + struct mlx5_bsf *bsf; + u32 data_len; + u32 data_key; + u64 data_va; + u32 prot_len = 0; + u32 prot_key = 0; + u64 prot_va = 0; + bool prot = false; + int ret; + int wqe_size; + struct mlx5_ib_mr *mr = to_mmr(sig_mr); + struct mlx5_ib_mr *pi_mr = mr->pi_mr; + + data_len = pi_mr->data_length; + data_key = pi_mr->ibmr.lkey; + data_va = pi_mr->data_iova; + if (pi_mr->meta_ndescs) { + prot_len = pi_mr->meta_length; + prot_key = pi_mr->ibmr.lkey; + prot_va = pi_mr->pi_iova; + prot = true; + } + + if (!prot || (data_key == prot_key && data_va == prot_va && + data_len == prot_len)) { + /** + * Source domain doesn't contain signature information + * or data and protection are interleaved in memory. + * So need construct: + * ------------------ + * | data_klm | + * ------------------ + * | BSF | + * ------------------ + **/ + struct mlx5_klm *data_klm = *seg; + + data_klm->bcount = cpu_to_be32(data_len); + data_klm->key = cpu_to_be32(data_key); + data_klm->va = cpu_to_be64(data_va); + wqe_size = ALIGN(sizeof(*data_klm), 64); + } else { + /** + * Source domain contains signature information + * So need construct a strided block format: + * --------------------------- + * | stride_block_ctrl | + * --------------------------- + * | data_klm | + * --------------------------- + * | prot_klm | + * --------------------------- + * | BSF | + * --------------------------- + **/ + struct mlx5_stride_block_ctrl_seg *sblock_ctrl; + struct mlx5_stride_block_entry *data_sentry; + struct mlx5_stride_block_entry *prot_sentry; + u16 block_size = sig_attrs->mem.sig.dif.pi_interval; + int prot_size; + + sblock_ctrl = *seg; + data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl); + prot_sentry = (void *)data_sentry + sizeof(*data_sentry); + + prot_size = prot_field_size(sig_attrs->mem.sig_type); + if (!prot_size) { + pr_err("Bad block size given: %u\n", block_size); + return -EINVAL; + } + sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size + + prot_size); + sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP); + sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size); + sblock_ctrl->num_entries = cpu_to_be16(2); + + data_sentry->bcount = cpu_to_be16(block_size); + data_sentry->key = cpu_to_be32(data_key); + data_sentry->va = cpu_to_be64(data_va); + data_sentry->stride = cpu_to_be16(block_size); + + prot_sentry->bcount = cpu_to_be16(prot_size); + prot_sentry->key = cpu_to_be32(prot_key); + prot_sentry->va = cpu_to_be64(prot_va); + prot_sentry->stride = cpu_to_be16(prot_size); + + wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) + + sizeof(*prot_sentry), 64); + } + + *seg += wqe_size; + *size += wqe_size / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + bsf = *seg; + ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len); + if (ret) + return -EINVAL; + + *seg += sizeof(*bsf); + *size += sizeof(*bsf) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + return 0; +} + +static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, + struct ib_mr *sig_mr, int access_flags, + u32 size, u32 length, u32 pdn) +{ + u32 sig_key = sig_mr->rkey; + u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1; + + memset(seg, 0, sizeof(*seg)); + + seg->flags = get_umr_flags(access_flags) | MLX5_MKC_ACCESS_MODE_KLMS; + seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00); + seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | + MLX5_MKEY_BSF_EN | pdn); + seg->len = cpu_to_be64(length); + seg->xlt_oct_size = cpu_to_be32(get_xlt_octo(size)); + seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); +} + +static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, + u32 size) +{ + memset(umr, 0, sizeof(*umr)); + + umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); + umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); + umr->mkey_mask = sig_mkey_mask(); +} + +static int set_pi_umr_wr(const struct ib_send_wr *send_wr, + struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) +{ + const struct ib_reg_wr *wr = reg_wr(send_wr); + struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr); + struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr; + struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs; + u32 pdn = to_mpd(qp->ibqp.pd)->pdn; + u32 xlt_size; + int region_len, ret; + + if (unlikely(send_wr->num_sge != 0) || + unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) || + unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) || + unlikely(!sig_mr->sig->sig_status_checked)) + return -EINVAL; + + /* length of the protected region, data + protection */ + region_len = pi_mr->ibmr.length; + + /** + * KLM octoword size - if protection was provided + * then we use strided block format (3 octowords), + * else we use single KLM (1 octoword) + **/ + if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE) + xlt_size = 0x30; + else + xlt_size = sizeof(struct mlx5_klm); + + set_sig_umr_segment(*seg, xlt_size); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len, + pdn); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size, + cur_edge); + if (ret) + return ret; + + sig_mr->sig->sig_status_checked = false; + return 0; +} + +static int set_psv_wr(struct ib_sig_domain *domain, + u32 psv_idx, void **seg, int *size) +{ + struct mlx5_seg_set_psv *psv_seg = *seg; + + memset(psv_seg, 0, sizeof(*psv_seg)); + psv_seg->psv_num = cpu_to_be32(psv_idx); + switch (domain->sig_type) { + case IB_SIG_TYPE_NONE: + break; + case IB_SIG_TYPE_T10_DIF: + psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 | + domain->sig.dif.app_tag); + psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); + break; + default: + pr_err("Bad signature type (%d) is given.\n", + domain->sig_type); + return -EINVAL; + } + + *seg += sizeof(*psv_seg); + *size += sizeof(*psv_seg) / 16; + + return 0; +} + +static int set_reg_wr(struct mlx5_ib_qp *qp, + const struct ib_reg_wr *wr, + void **seg, int *size, void **cur_edge, + bool check_not_free) +{ + struct mlx5_ib_mr *mr = to_mmr(wr->mr); + struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); + struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device); + int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size; + bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; + bool atomic = wr->access & IB_ACCESS_REMOTE_ATOMIC; + u8 flags = 0; + + if (!mlx5_ib_can_use_umr(dev, atomic, wr->access)) { + mlx5_ib_warn(to_mdev(qp->ibqp.device), + "Fast update of %s for MR is disabled\n", + (MLX5_CAP_GEN(dev->mdev, + umr_modify_entity_size_disabled)) ? + "entity size" : + "atomic access"); + return -EINVAL; + } + + if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { + mlx5_ib_warn(to_mdev(qp->ibqp.device), + "Invalid IB_SEND_INLINE send flag\n"); + return -EINVAL; + } + + if (check_not_free) + flags |= MLX5_UMR_CHECK_NOT_FREE; + if (umr_inline) + flags |= MLX5_UMR_INLINE; + + set_reg_umr_seg(*seg, mr, flags, atomic); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + set_reg_mkey_seg(*seg, mr, wr->key, wr->access); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + if (umr_inline) { + memcpy_send_wqe(&qp->sq, cur_edge, seg, size, mr->descs, + mr_list_size); + *size = ALIGN(*size, MLX5_SEND_WQE_BB >> 4); + } else { + set_reg_data_seg(*seg, mr, pd); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + } + return 0; +} + +static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size, + void **cur_edge) +{ + set_linv_umr_seg(*seg); + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + set_linv_mkey_seg(*seg); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); +} + +static void dump_wqe(struct mlx5_ib_qp *qp, u32 idx, int size_16) +{ + __be32 *p = NULL; + int i, j; + + pr_debug("dump WQE index %u:\n", idx); + for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) { + if ((i & 0xf) == 0) { + p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx); + pr_debug("WQBB at %p:\n", (void *)p); + j = 0; + idx = (idx + 1) & (qp->sq.wqe_cnt - 1); + } + pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]), + be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]), + be32_to_cpu(p[j + 3])); + } +} + +static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + const struct ib_send_wr *wr, unsigned int *idx, + int *size, void **cur_edge, int nreq, + bool send_signaled, bool solicited) +{ + if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) + return -ENOMEM; + + *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); + *seg = mlx5_frag_buf_get_wqe(&qp->sq.fbc, *idx); + *ctrl = *seg; + *(uint32_t *)(*seg + 8) = 0; + (*ctrl)->imm = send_ieth(wr); + (*ctrl)->fm_ce_se = qp->sq_signal_bits | + (send_signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (solicited ? MLX5_WQE_CTRL_SOLICITED : 0); + + *seg += sizeof(**ctrl); + *size = sizeof(**ctrl) / 16; + *cur_edge = qp->sq.cur_edge; + + return 0; +} + +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + const struct ib_send_wr *wr, unsigned int *idx, int *size, + void **cur_edge, int nreq) +{ + return __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq, + wr->send_flags & IB_SEND_SIGNALED, + wr->send_flags & IB_SEND_SOLICITED); +} + +static void finish_wqe(struct mlx5_ib_qp *qp, + struct mlx5_wqe_ctrl_seg *ctrl, + void *seg, u8 size, void *cur_edge, + unsigned int idx, u64 wr_id, int nreq, u8 fence, + u32 mlx5_opcode) +{ + u8 opmod = 0; + + ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | + mlx5_opcode | ((u32)opmod << 24)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8)); + ctrl->fm_ce_se |= fence; + if (unlikely(qp->flags_en & MLX5_QP_FLAG_SIGNATURE)) + ctrl->signature = wq_sig(ctrl); + + qp->sq.wrid[idx] = wr_id; + qp->sq.w_list[idx].opcode = mlx5_opcode; + qp->sq.wqe_head[idx] = qp->sq.head + nreq; + qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB); + qp->sq.w_list[idx].next = qp->sq.cur_post; + + /* We save the edge which was possibly updated during the WQE + * construction, into SQ's cache. + */ + seg = PTR_ALIGN(seg, MLX5_SEND_WQE_BB); + qp->sq.cur_edge = (unlikely(seg == cur_edge)) ? + get_sq_edge(&qp->sq, qp->sq.cur_post & + (qp->sq.wqe_cnt - 1)) : + cur_edge; +} + +static void handle_rdma_op(const struct ib_send_wr *wr, void **seg, int *size) +{ + set_raddr_seg(*seg, rdma_wr(wr)->remote_addr, rdma_wr(wr)->rkey); + *seg += sizeof(struct mlx5_wqe_raddr_seg); + *size += sizeof(struct mlx5_wqe_raddr_seg) / 16; +} + +static void handle_local_inv(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, + int *size, void **cur_edge, unsigned int idx) +{ + qp->sq.wr_data[idx] = IB_WR_LOCAL_INV; + (*ctrl)->imm = cpu_to_be32(wr->ex.invalidate_rkey); + set_linv_wr(qp, seg, size, cur_edge); +} + +static int handle_reg_mr(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, + void **cur_edge, unsigned int idx) +{ + qp->sq.wr_data[idx] = IB_WR_REG_MR; + (*ctrl)->imm = cpu_to_be32(reg_wr(wr)->key); + return set_reg_wr(qp, reg_wr(wr), seg, size, cur_edge, true); +} + +static int handle_psv(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, + void **cur_edge, unsigned int *idx, int nreq, + struct ib_sig_domain *domain, u32 psv_index, + u8 next_fence) +{ + int err; + + /* + * SET_PSV WQEs are not signaled and solicited on error. + */ + err = __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq, + false, true); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + goto out; + } + err = set_psv_wr(domain, psv_index, seg, size); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + goto out; + } + finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, nreq, + next_fence, MLX5_OPCODE_SET_PSV); + +out: + return err; +} + +static int handle_reg_mr_integrity(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, + int *size, void **cur_edge, + unsigned int *idx, int nreq, u8 fence, + u8 next_fence) +{ + struct mlx5_ib_mr *mr; + struct mlx5_ib_mr *pi_mr; + struct mlx5_ib_mr pa_pi_mr; + struct ib_sig_attrs *sig_attrs; + struct ib_reg_wr reg_pi_wr; + int err; + + qp->sq.wr_data[*idx] = IB_WR_REG_MR_INTEGRITY; + + mr = to_mmr(reg_wr(wr)->mr); + pi_mr = mr->pi_mr; + + if (pi_mr) { + memset(®_pi_wr, 0, + sizeof(struct ib_reg_wr)); + + reg_pi_wr.mr = &pi_mr->ibmr; + reg_pi_wr.access = reg_wr(wr)->access; + reg_pi_wr.key = pi_mr->ibmr.rkey; + + (*ctrl)->imm = cpu_to_be32(reg_pi_wr.key); + /* UMR for data + prot registration */ + err = set_reg_wr(qp, ®_pi_wr, seg, size, cur_edge, false); + if (unlikely(err)) + goto out; + + finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, + nreq, fence, MLX5_OPCODE_UMR); + + err = begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + goto out; + } + } else { + memset(&pa_pi_mr, 0, sizeof(struct mlx5_ib_mr)); + /* No UMR, use local_dma_lkey */ + pa_pi_mr.ibmr.lkey = mr->ibmr.pd->local_dma_lkey; + pa_pi_mr.ndescs = mr->ndescs; + pa_pi_mr.data_length = mr->data_length; + pa_pi_mr.data_iova = mr->data_iova; + if (mr->meta_ndescs) { + pa_pi_mr.meta_ndescs = mr->meta_ndescs; + pa_pi_mr.meta_length = mr->meta_length; + pa_pi_mr.pi_iova = mr->pi_iova; + } + + pa_pi_mr.ibmr.length = mr->ibmr.length; + mr->pi_mr = &pa_pi_mr; + } + (*ctrl)->imm = cpu_to_be32(mr->ibmr.rkey); + /* UMR for sig MR */ + err = set_pi_umr_wr(wr, qp, seg, size, cur_edge); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + goto out; + } + finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, nreq, + fence, MLX5_OPCODE_UMR); + + sig_attrs = mr->ibmr.sig_attrs; + err = handle_psv(dev, qp, wr, ctrl, seg, size, cur_edge, idx, nreq, + &sig_attrs->mem, mr->sig->psv_memory.psv_idx, + next_fence); + if (unlikely(err)) + goto out; + + err = handle_psv(dev, qp, wr, ctrl, seg, size, cur_edge, idx, nreq, + &sig_attrs->wire, mr->sig->psv_wire.psv_idx, + next_fence); + if (unlikely(err)) + goto out; + + qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + +out: + return err; +} + +static int handle_qpt_rc(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size, + void **cur_edge, unsigned int *idx, int nreq, u8 fence, + u8 next_fence, int *num_sge) +{ + int err = 0; + + switch (wr->opcode) { + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + handle_rdma_op(wr, seg, size); + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + mlx5_ib_warn(dev, "Atomic operations are not supported yet\n"); + err = -EOPNOTSUPP; + goto out; + + case IB_WR_LOCAL_INV: + handle_local_inv(qp, wr, ctrl, seg, size, cur_edge, *idx); + *num_sge = 0; + break; + + case IB_WR_REG_MR: + err = handle_reg_mr(qp, wr, ctrl, seg, size, cur_edge, *idx); + if (unlikely(err)) + goto out; + *num_sge = 0; + break; + + case IB_WR_REG_MR_INTEGRITY: + err = handle_reg_mr_integrity(dev, qp, wr, ctrl, seg, size, + cur_edge, idx, nreq, fence, + next_fence); + if (unlikely(err)) + goto out; + *num_sge = 0; + break; + + default: + break; + } + +out: + return err; +} + +static void handle_qpt_uc(const struct ib_send_wr *wr, void **seg, int *size) +{ + switch (wr->opcode) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + handle_rdma_op(wr, seg, size); + break; + default: + break; + } +} + +static void handle_qpt_hw_gsi(struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, void **seg, + int *size, void **cur_edge) +{ + set_datagram_seg(*seg, wr); + *seg += sizeof(struct mlx5_wqe_datagram_seg); + *size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); +} + +static void handle_qpt_ud(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, + void **seg, int *size, void **cur_edge) +{ + set_datagram_seg(*seg, wr); + *seg += sizeof(struct mlx5_wqe_datagram_seg); + *size += sizeof(struct mlx5_wqe_datagram_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + + /* handle qp that supports ud offload */ + if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) { + struct mlx5_wqe_eth_pad *pad; + + pad = *seg; + memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad)); + *seg += sizeof(struct mlx5_wqe_eth_pad); + *size += sizeof(struct mlx5_wqe_eth_pad) / 16; + set_eth_seg(wr, qp, seg, size, cur_edge); + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + } +} + +static int handle_qpt_reg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_send_wr *wr, + struct mlx5_wqe_ctrl_seg **ctrl, void **seg, + int *size, void **cur_edge, unsigned int idx) +{ + int err = 0; + + if (unlikely(wr->opcode != MLX5_IB_WR_UMR)) { + err = -EINVAL; + mlx5_ib_warn(dev, "bad opcode %d\n", wr->opcode); + goto out; + } + + qp->sq.wr_data[idx] = MLX5_IB_WR_UMR; + (*ctrl)->imm = cpu_to_be32(umr_wr(wr)->mkey); + err = set_reg_umr_segment(dev, *seg, wr, + !!(MLX5_CAP_GEN(dev->mdev, atomic))); + if (unlikely(err)) + goto out; + *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); + *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); + set_reg_mkey_segment(*seg, wr); + *seg += sizeof(struct mlx5_mkey_seg); + *size += sizeof(struct mlx5_mkey_seg) / 16; + handle_post_send_edge(&qp->sq, seg, *size, cur_edge); +out: + return err; +} + +int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr, bool drain) +{ + struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp *qp; + struct mlx5_wqe_xrc_seg *xrc; + struct mlx5_bf *bf; + void *cur_edge; + int uninitialized_var(size); + unsigned long flags; + unsigned int idx; + int err = 0; + int num_sge; + void *seg; + int nreq; + int i; + u8 next_fence = 0; + u8 fence; + + if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && + !drain)) { + *bad_wr = wr; + return -EIO; + } + + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); + + qp = to_mqp(ibqp); + bf = &qp->bf; + + spin_lock_irqsave(&qp->sq.lock, flags); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + num_sge = wr->num_sge; + if (unlikely(num_sge > qp->sq.max_gs)) { + mlx5_ib_warn(dev, "\n"); + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, &cur_edge, + nreq); + if (err) { + mlx5_ib_warn(dev, "\n"); + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (wr->opcode == IB_WR_REG_MR || + wr->opcode == IB_WR_REG_MR_INTEGRITY) { + fence = dev->umr_fence; + next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL; + } else { + if (wr->send_flags & IB_SEND_FENCE) { + if (qp->next_fence) + fence = MLX5_FENCE_MODE_SMALL_AND_FENCE; + else + fence = MLX5_FENCE_MODE_FENCE; + } else { + fence = qp->next_fence; + } + } + + switch (ibqp->qp_type) { + case IB_QPT_XRC_INI: + xrc = seg; + seg += sizeof(*xrc); + size += sizeof(*xrc) / 16; + fallthrough; + case IB_QPT_RC: + err = handle_qpt_rc(dev, qp, wr, &ctrl, &seg, &size, + &cur_edge, &idx, nreq, fence, + next_fence, &num_sge); + if (unlikely(err)) { + *bad_wr = wr; + goto out; + } else if (wr->opcode == IB_WR_REG_MR_INTEGRITY) { + goto skip_psv; + } + break; + + case IB_QPT_UC: + handle_qpt_uc(wr, &seg, &size); + break; + case IB_QPT_SMI: + if (unlikely(!mdev->port_caps[qp->port - 1].has_smi)) { + mlx5_ib_warn(dev, "Send SMP MADs is not allowed\n"); + err = -EPERM; + *bad_wr = wr; + goto out; + } + fallthrough; + case MLX5_IB_QPT_HW_GSI: + handle_qpt_hw_gsi(qp, wr, &seg, &size, &cur_edge); + break; + case IB_QPT_UD: + handle_qpt_ud(qp, wr, &seg, &size, &cur_edge); + break; + case MLX5_IB_QPT_REG_UMR: + err = handle_qpt_reg_umr(dev, qp, wr, &ctrl, &seg, + &size, &cur_edge, idx); + if (unlikely(err)) + goto out; + break; + + default: + break; + } + + if (wr->send_flags & IB_SEND_INLINE && num_sge) { + err = set_data_inl_seg(qp, wr, &seg, &size, &cur_edge); + if (unlikely(err)) { + mlx5_ib_warn(dev, "\n"); + *bad_wr = wr; + goto out; + } + } else { + for (i = 0; i < num_sge; i++) { + handle_post_send_edge(&qp->sq, &seg, size, + &cur_edge); + if (unlikely(!wr->sg_list[i].length)) + continue; + + set_data_ptr_seg( + (struct mlx5_wqe_data_seg *)seg, + wr->sg_list + i); + size += sizeof(struct mlx5_wqe_data_seg) / 16; + seg += sizeof(struct mlx5_wqe_data_seg); + } + } + + qp->next_fence = next_fence; + finish_wqe(qp, ctrl, seg, size, cur_edge, idx, wr->wr_id, nreq, + fence, mlx5_ib_opcode[wr->opcode]); +skip_psv: + if (0) + dump_wqe(qp, idx, size); + } + +out: + if (likely(nreq)) { + qp->sq.head += nreq; + + /* Make sure that descriptors are written before + * updating doorbell record and ringing the doorbell + */ + wmb(); + + qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); + + /* Make sure doorbell record is visible to the HCA before + * we hit doorbell. + */ + wmb(); + + mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset); + /* Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + bf->offset ^= bf->buf_size; + } + + spin_unlock_irqrestore(&qp->sq.lock, flags); + + return err; +} + +static void set_sig_seg(struct mlx5_rwqe_sig *sig, int max_gs) +{ + sig->signature = calc_sig(sig, (max_gs + 1) << 2); +} + +int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr, bool drain) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_wqe_data_seg *scat; + struct mlx5_rwqe_sig *sig; + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_core_dev *mdev = dev->mdev; + unsigned long flags; + int err = 0; + int nreq; + int ind; + int i; + + if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && + !drain)) { + *bad_wr = wr; + return -EIO; + } + + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) + return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr); + + spin_lock_irqsave(&qp->rq.lock, flags); + + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); + + for (nreq = 0; wr; nreq++, wr = wr->next) { + if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) { + err = -ENOMEM; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > qp->rq.max_gs)) { + err = -EINVAL; + *bad_wr = wr; + goto out; + } + + scat = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ind); + if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) + scat++; + + for (i = 0; i < wr->num_sge; i++) + set_data_ptr_seg(scat + i, wr->sg_list + i); + + if (i < qp->rq.max_gs) { + scat[i].byte_count = 0; + scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].addr = 0; + } + + if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) { + sig = (struct mlx5_rwqe_sig *)scat; + set_sig_seg(sig, qp->rq.max_gs); + } + + qp->rq.wrid[ind] = wr->wr_id; + + ind = (ind + 1) & (qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + qp->rq.head += nreq; + + /* Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff); + } + + spin_unlock_irqrestore(&qp->rq.lock, flags); + + return err; +} diff --git a/drivers/infiniband/hw/mlx5/wr.h b/drivers/infiniband/hw/mlx5/wr.h new file mode 100644 index 000000000000..4f0057516402 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/wr.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#ifndef _MLX5_IB_WR_H +#define _MLX5_IB_WR_H + +#include "mlx5_ib.h" + +enum { + MLX5_IB_SQ_UMR_INLINE_THRESHOLD = 64, +}; + +struct mlx5_wqe_eth_pad { + u8 rsvd0[16]; +}; + + +/* get_sq_edge - Get the next nearby edge. + * + * An 'edge' is defined as the first following address after the end + * of the fragment or the SQ. Accordingly, during the WQE construction + * which repetitively increases the pointer to write the next data, it + * simply should check if it gets to an edge. + * + * @sq - SQ buffer. + * @idx - Stride index in the SQ buffer. + * + * Return: + * The new edge. + */ +static inline void *get_sq_edge(struct mlx5_ib_wq *sq, u32 idx) +{ + void *fragment_end; + + fragment_end = mlx5_frag_buf_get_wqe + (&sq->fbc, + mlx5_frag_buf_get_idx_last_contig_stride(&sq->fbc, idx)); + + return fragment_end + MLX5_SEND_WQE_BB; +} + +int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr, bool drain); +int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr, bool drain); + +static inline int mlx5_ib_post_send_nodrain(struct ib_qp *ibqp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + return mlx5_ib_post_send(ibqp, wr, bad_wr, false); +} + +static inline int mlx5_ib_post_send_drain(struct ib_qp *ibqp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + return mlx5_ib_post_send(ibqp, wr, bad_wr, true); +} + +static inline int mlx5_ib_post_recv_nodrain(struct ib_qp *ibqp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + return mlx5_ib_post_recv(ibqp, wr, bad_wr, false); +} + +static inline int mlx5_ib_post_recv_drain(struct ib_qp *ibqp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + return mlx5_ib_post_recv(ibqp, wr, bad_wr, true); +} +#endif /* _MLX5_IB_WR_H */ -- cgit v1.2.3 From 8d93efb8c5e07706651a992fc07f37a2fb69baca Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 6 May 2020 10:16:01 +0300 Subject: RDMA/mlx5: Assign profile before calling stages Assign the profile to the IB device before executing stages. This will allow to check which profile is being used from within a stage. Link: https://lore.kernel.org/r/20200506071602.7177-2-leon@kernel.org Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/ib_rep.h | 2 +- drivers/infiniband/hw/mlx5/main.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h index 3b6750cba796..5b30d3fa8f8d 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.h +++ b/drivers/infiniband/hw/mlx5/ib_rep.h @@ -9,9 +9,9 @@ #include #include "mlx5_ib.h" -#ifdef CONFIG_MLX5_ESWITCH extern const struct mlx5_ib_profile raw_eth_profile; +#ifdef CONFIG_MLX5_ESWITCH u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw); struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw, u16 vport_num); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 40bf71efaeb0..1a7464b669d5 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -7160,6 +7160,8 @@ void *__mlx5_ib_add(struct mlx5_ib_dev *dev, int err; int i; + dev->profile = profile; + for (i = 0; i < MLX5_IB_STAGE_MAX; i++) { if (profile->stage[i].init) { err = profile->stage[i].init(dev); @@ -7168,7 +7170,6 @@ void *__mlx5_ib_add(struct mlx5_ib_dev *dev, } } - dev->profile = profile; dev->ib_active = true; return dev; -- cgit v1.2.3 From 42caf9cb59370cc6f296c7a9fe39fa66963236ff Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 6 May 2020 10:16:02 +0300 Subject: RDMA/mlx5: Allow only raw Ethernet QPs when RoCE isn't enabled When operating in switchdev mode or using devlink to disable RoCE only raw Ethernet QPs are allowed to be created. When in switchdev mode this can lead to passing an invalid port number as part of the modify qp firmware cmd and will lead to a syndrome reported back to the user, such as: * mlx5_cmd_check:803:(pid 50148): RST2INIT_QP(0x502) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x177405). Internal UD QP might be used to test for write combining support (even if externally we report RoCE as disabled) check for that specific flag and allow is specifically. Fixes: b5ca15ad7e61 ("IB/mlx5: Add proper representors support") Link: https://lore.kernel.org/r/20200506071602.7177-3-leon@kernel.org Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index fb2ea3bf9be4..40150595fdbb 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2436,15 +2436,17 @@ static int check_qp_type(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, if (!MLX5_CAP_GEN(dev->mdev, xrc)) goto out; fallthrough; - case IB_QPT_RAW_PACKET: case IB_QPT_RC: case IB_QPT_UC: - case IB_QPT_UD: case IB_QPT_SMI: case MLX5_IB_QPT_HW_GSI: - case MLX5_IB_QPT_REG_UMR: case IB_QPT_DRIVER: case IB_QPT_GSI: + if (dev->profile == &raw_eth_profile) + goto out; + case IB_QPT_RAW_PACKET: + case IB_QPT_UD: + case MLX5_IB_QPT_REG_UMR: break; default: goto out; @@ -2641,6 +2643,10 @@ static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, int create_flags = attr->create_flags; bool cond; + if (qp->type == IB_QPT_UD && dev->profile == &raw_eth_profile) + if (create_flags & ~MLX5_IB_QP_CREATE_WC_TEST) + return -EINVAL; + if (qp_type == MLX5_IB_QPT_DCT) return (create_flags) ? -EINVAL : 0; -- cgit v1.2.3 From 52c81f47f0d2680f0b2e7b61c1fa4d8ad35f3020 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 7 May 2020 16:16:10 +0100 Subject: RDMA/mlx5: Remove duplicated assignment to variable rcqe_sz The variable rcqe_sz is being unnecessarily assigned twice, fix this by removing one of the duplicates. Fixes: 8bde2c509e40 ("RDMA/mlx5: Update all DRIVER QP places to use QP subtype") Link: https://lore.kernel.org/r/20200507151610.52636-1-colin.king@canonical.com Addresses-Coverity: ("Evaluation order violation") Signed-off-by: Colin Ian King Acked-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 40150595fdbb..c571b7a97f10 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1994,8 +1994,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, if ((qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) && (init_attr->qp_type == IB_QPT_RC || init_attr->qp_type == IB_QPT_UC)) { - int rcqe_sz = rcqe_sz = - mlx5_ib_get_cqe_size(init_attr->recv_cq); + int rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq); MLX5_SET(qpc, qpc, cs_res, rcqe_sz == 128 ? MLX5_RES_SCAT_DATA64_CQE : -- cgit v1.2.3 From 17793833f81ceb319be599ec09498ec0136d9acf Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 4 May 2020 16:25:41 +0300 Subject: RDMA/ucma: Return stable IB device index as identifier The librdmacm uses node_guid as identifier to correlate between IB devices and CMA devices. However FW resets cause to such "connection" to be lost and require from the user to restart its application. Extend UCMA to return IB device index, which is stable identifier. Link: https://lore.kernel.org/r/20200504132541.355710-1-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 16 +++++++++------- include/uapi/rdma/rdma_user_cm.h | 4 ++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 16b6cf57fa85..06127c800a49 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -845,7 +845,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, struct sockaddr *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_route_resp, ibdev_index)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) @@ -869,6 +869,7 @@ static ssize_t ucma_query_route(struct ucma_file *file, goto out; resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; + resp.ibdev_index = ctx->cm_id->device->index; resp.port_num = ctx->cm_id->port_num; if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) @@ -880,8 +881,8 @@ static ssize_t ucma_query_route(struct ucma_file *file, out: mutex_unlock(&ctx->mutex); - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, + min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; ucma_put_ctx(ctx); @@ -895,6 +896,7 @@ static void ucma_query_device_addr(struct rdma_cm_id *cm_id, return; resp->node_guid = (__force __u64) cm_id->device->node_guid; + resp->ibdev_index = cm_id->device->index; resp->port_num = cm_id->port_num; resp->pkey = (__force __u16) cpu_to_be16( ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); @@ -907,7 +909,7 @@ static ssize_t ucma_query_addr(struct ucma_context *ctx, struct sockaddr *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); @@ -922,7 +924,7 @@ static ssize_t ucma_query_addr(struct ucma_context *ctx, ucma_query_device_addr(ctx->cm_id, &resp); - if (copy_to_user(response, &resp, sizeof(resp))) + if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; @@ -974,7 +976,7 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, struct sockaddr_ib *addr; int ret = 0; - if (out_len < sizeof(resp)) + if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); @@ -1007,7 +1009,7 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, &ctx->cm_id->route.addr.dst_addr); } - if (copy_to_user(response, &resp, sizeof(resp))) + if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index e42940a215a3..1bb6e75d254b 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -164,6 +164,8 @@ struct rdma_ucm_query_route_resp { __u32 num_paths; __u8 port_num; __u8 reserved[3]; + __u32 ibdev_index; + __u32 reserved1; }; struct rdma_ucm_query_addr_resp { @@ -175,6 +177,8 @@ struct rdma_ucm_query_addr_resp { __u16 dst_size; struct __kernel_sockaddr_storage src_addr; struct __kernel_sockaddr_storage dst_addr; + __u32 ibdev_index; + __u32 reserved1; }; struct rdma_ucm_query_path_resp { -- cgit v1.2.3 From 30661322b8c32e020c6638dc4317034ec95a1b3f Mon Sep 17 00:00:00 2001 From: Weihang Li Date: Tue, 5 May 2020 18:30:06 +0800 Subject: RDMA/hns: Extend capability flags for HIP08_C 12 bits is not enough for HIP08_C, so extend a new field in length of 16 bits for it. Link: https://lore.kernel.org/r/1588674607-25337-3-git-send-email-liweihang@huawei.com Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 2 ++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 +++ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 5cac14d7be90..226df2040fdd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -209,6 +209,8 @@ enum { HNS_ROCE_OPCODE_RDMA_WITH_IMM_RECEIVE = 0x07, }; +#define HNS_ROCE_CAP_FLAGS_EX_SHIFT 12 + enum { HNS_ROCE_CAP_FLAG_REREG_MR = BIT(0), HNS_ROCE_CAP_FLAG_ROCE_V1_V2 = BIT(1), diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ad9a11a2cd0d..5aa56b7dc42b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1887,6 +1887,9 @@ static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev) caps->flags = roce_get_field(resp_c->cap_flags_num_pds, V2_QUERY_PF_CAPS_C_CAP_FLAGS_M, V2_QUERY_PF_CAPS_C_CAP_FLAGS_S); + caps->flags |= le16_to_cpu(resp_d->cap_flags_ex) << + HNS_ROCE_CAP_FLAGS_EX_SHIFT; + caps->num_cqs = 1 << roce_get_field(resp_c->max_gid_num_cqs, V2_QUERY_PF_CAPS_C_NUM_CQS_M, V2_QUERY_PF_CAPS_C_NUM_CQS_S); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 05bfe078d537..938b7b522faf 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1647,7 +1647,7 @@ struct hns_roce_query_pf_caps_c { struct hns_roce_query_pf_caps_d { __le32 wq_hop_num_max_srqs; __le16 srq_depth; - __le16 rsv; + __le16 cap_flags_ex; __le32 num_ceqs_ceq_depth; __le32 arm_st_aeq_depth; __le32 num_uars_rsv_pds; -- cgit v1.2.3 From 90ae0b57e4a515342fe74ffa21f6972f5145d645 Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Tue, 5 May 2020 18:30:07 +0800 Subject: RDMA/hns: Combine enable flags of qp It's easier to understand and maintain enable flags of qp using a single field in type of unsigned long than defining a field for every flags in the structure hns_roce_qp, and we can add new flags for features more conveniently in the future. Link: https://lore.kernel.org/r/1588674607-25337-4-git-send-email-liweihang@huawei.com Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 7 +++---- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- drivers/infiniband/hw/hns/hns_roce_qp.c | 22 +++++++++++----------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 226df2040fdd..4fcd608ee55f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -131,8 +131,8 @@ enum { }; enum { - HNS_ROCE_SUPPORT_RQ_RECORD_DB = 1 << 0, - HNS_ROCE_SUPPORT_SQ_RECORD_DB = 1 << 1, + HNS_ROCE_QP_CAP_RQ_RECORD_DB = BIT(0), + HNS_ROCE_QP_CAP_SQ_RECORD_DB = BIT(1), }; enum { @@ -623,8 +623,7 @@ struct hns_roce_qp { struct hns_roce_wq rq; struct hns_roce_db rdb; struct hns_roce_db sdb; - u8 rdb_en; - u8 sdb_en; + unsigned long en_flags; u32 doorbell_qpn; u32 sq_signal_bits; struct hns_roce_wq sq; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 5aa56b7dc42b..ebe570aa2323 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3619,7 +3619,7 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M, V2_QPC_BYTE_24_VLAN_ID_S, 0xfff); - if (hr_qp->rdb_en) + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB) roce_set_bit(context->byte_68_rq_db, V2_QPC_BYTE_68_RQ_RECORD_EN_S, 1); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index b5707596148d..dca979d8c345 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -750,8 +750,8 @@ static int alloc_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, "Failed to map user SQ doorbell\n"); goto err_out; } - hr_qp->sdb_en = 1; - resp->cap_flags |= HNS_ROCE_SUPPORT_SQ_RECORD_DB; + hr_qp->en_flags |= HNS_ROCE_QP_CAP_SQ_RECORD_DB; + resp->cap_flags |= HNS_ROCE_QP_CAP_SQ_RECORD_DB; } if (user_qp_has_rdb(hr_dev, init_attr, udata, resp)) { @@ -762,8 +762,8 @@ static int alloc_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, "Failed to map user RQ doorbell\n"); goto err_sdb; } - hr_qp->rdb_en = 1; - resp->cap_flags |= HNS_ROCE_SUPPORT_RQ_RECORD_DB; + hr_qp->en_flags |= HNS_ROCE_QP_CAP_RQ_RECORD_DB; + resp->cap_flags |= HNS_ROCE_QP_CAP_RQ_RECORD_DB; } } else { /* QP doorbell register address */ @@ -780,13 +780,13 @@ static int alloc_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, goto err_out; } *hr_qp->rdb.db_record = 0; - hr_qp->rdb_en = 1; + hr_qp->en_flags |= HNS_ROCE_QP_CAP_RQ_RECORD_DB; } } return 0; err_sdb: - if (udata && hr_qp->sdb_en) + if (udata && hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB) hns_roce_db_unmap_user(uctx, &hr_qp->sdb); err_out: return ret; @@ -799,12 +799,12 @@ static void free_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, udata, struct hns_roce_ucontext, ibucontext); if (udata) { - if (hr_qp->rdb_en) + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB) hns_roce_db_unmap_user(uctx, &hr_qp->rdb); - if (hr_qp->sdb_en) + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB) hns_roce_db_unmap_user(uctx, &hr_qp->sdb); } else { - if (hr_qp->rdb_en) + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB) hns_roce_free_db(hr_dev, &hr_qp->rdb); } } @@ -1178,10 +1178,10 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (ibqp->uobject && (attr_mask & IB_QP_STATE) && new_state == IB_QPS_ERR) { - if (hr_qp->sdb_en == 1) { + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB) { hr_qp->sq.head = *(int *)(hr_qp->sdb.virt_addr); - if (hr_qp->rdb_en == 1) + if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB) hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr); } else { ibdev_warn(&hr_dev->ib_dev, -- cgit v1.2.3 From f8f2a576cb0c574044c049210ea4096e5cb1d7fc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 May 2020 10:46:52 +0300 Subject: RDMA/addr: Mark addr_resolve as might_sleep() Under one path through ib_nl_fetch_ha() this calls nlmsg_new(GFP_KERNEL) which is a sleeping call. This is a very rare path, so mark fetch_ha() and the module external entry point that conditionally calls through to fetch_ha() as might_sleep(). Link: https://lore.kernel.org/r/20200506074701.9775-2-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 1753a9801b70..3a98439bba83 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -371,6 +371,8 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, (const void *)&dst_in6->sin6_addr; sa_family_t family = dst_in->sa_family; + might_sleep(); + /* If we have a gateway in IB mode then it must be an IB network */ if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB) return ib_nl_fetch_ha(dev_addr, daddr, seq, family); @@ -727,6 +729,8 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec, struct rdma_dev_addr dev_addr = {}; int ret; + might_sleep(); + if (rec->roce.route_resolved) return 0; -- cgit v1.2.3 From d3552fb65d239be4775dc9b24740efd1bd70b459 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 May 2020 10:46:53 +0300 Subject: RDMA/cm: Remove return code from add_cm_id_to_port_list This cannot happen, all callers pass in one of the two pointers. Use a WARN_ON guard instead. Link: https://lore.kernel.org/r/20200506074701.9775-3-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 68e1a9bba027..ea3910917a7b 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -474,24 +474,19 @@ static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, grh, &av->ah_attr); } -static int add_cm_id_to_port_list(struct cm_id_private *cm_id_priv, - struct cm_av *av, - struct cm_port *port) +static void add_cm_id_to_port_list(struct cm_id_private *cm_id_priv, + struct cm_av *av, struct cm_port *port) { unsigned long flags; - int ret = 0; spin_lock_irqsave(&cm.lock, flags); - if (&cm_id_priv->av == av) list_add_tail(&cm_id_priv->prim_list, &port->cm_priv_prim_list); else if (&cm_id_priv->alt_av == av) list_add_tail(&cm_id_priv->altr_list, &port->cm_priv_altr_list); else - ret = -EINVAL; - + WARN_ON(true); spin_unlock_irqrestore(&cm.lock, flags); - return ret; } static struct cm_port * @@ -572,12 +567,7 @@ static int cm_init_av_by_path(struct sa_path_rec *path, return ret; av->timeout = path->packet_life_time + 1; - - ret = add_cm_id_to_port_list(cm_id_priv, av, port); - if (ret) { - rdma_destroy_ah_attr(&new_ah_attr); - return ret; - } + add_cm_id_to_port_list(cm_id_priv, av, port); rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); return 0; } -- cgit v1.2.3 From 42113eed8f10533ba419df535b6c0ff9141a948c Mon Sep 17 00:00:00 2001 From: Danit Goldberg Date: Wed, 6 May 2020 10:46:54 +0300 Subject: RDMA/cm: Remove unused store to ret in cm_rej_handler The 'goto out' label doesn't read ret, so don't set it. Link: https://lore.kernel.org/r/20200506074701.9775-4-leon@kernel.org Signed-off-by: Danit Goldberg Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index ea3910917a7b..c12fd673678d 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3056,7 +3056,6 @@ static int cm_rej_handler(struct cm_work *work) __func__, be32_to_cpu(cm_id_priv->id.local_id), cm_id_priv->id.state); spin_unlock_irq(&cm_id_priv->lock); - ret = -EINVAL; goto out; } -- cgit v1.2.3 From e83f195aa45c1ffd73b3a950a887e41c260cf194 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 May 2020 10:46:55 +0300 Subject: RDMA/cm: Pull duplicated code into cm_queue_work_unlock() While unlocking a spinlock held by the caller is a disturbing pattern, this extensively duplicated code is even worse. Pull all the duplicates into a function and explain the purpose of the algorithm. The on creation side call in cm_req_handler() which is different has been micro-optimized on the basis that the work_count == -1 during creation, remove that and just use the normal function. Link: https://lore.kernel.org/r/20200506074701.9775-5-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 146 +++++++++++++------------------------------ 1 file changed, 44 insertions(+), 102 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c12fd673678d..f56494de3c77 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -81,8 +81,11 @@ const char *__attribute_const__ ibcm_reject_msg(int reason) EXPORT_SYMBOL(ibcm_reject_msg); struct cm_id_private; +struct cm_work; static int cm_add_one(struct ib_device *device); static void cm_remove_one(struct ib_device *device, void *client_data); +static void cm_process_work(struct cm_id_private *cm_id_priv, + struct cm_work *work); static int cm_send_sidr_rep_locked(struct cm_id_private *cm_id_priv, struct ib_cm_sidr_rep_param *param); static int cm_send_dreq_locked(struct cm_id_private *cm_id_priv, @@ -907,6 +910,35 @@ static void cm_free_work(struct cm_work *work) kfree(work); } +static void cm_queue_work_unlock(struct cm_id_private *cm_id_priv, + struct cm_work *work) +{ + bool immediate; + + /* + * To deliver the event to the user callback we have the drop the + * spinlock, however, we need to ensure that the user callback is single + * threaded and receives events in the temporal order. If there are + * already events being processed then thread new events onto a list, + * the thread currently processing will pick them up. + */ + immediate = atomic_inc_and_test(&cm_id_priv->work_count); + if (!immediate) { + list_add_tail(&work->list, &cm_id_priv->work_list); + /* + * This routine always consumes incoming reference. Once queued + * to the work_list then a reference is held by the thread + * currently running cm_process_work() and this reference is not + * needed. + */ + cm_deref_id(cm_id_priv); + } + spin_unlock_irq(&cm_id_priv->lock); + + if (immediate) + cm_process_work(cm_id_priv, work); +} + static inline int cm_convert_to_ms(int iba_time) { /* approximate conversion to ms from 4.096us x 2^iba_time */ @@ -2144,9 +2176,7 @@ static int cm_req_handler(struct cm_work *work) /* Refcount belongs to the event, pairs with cm_process_work() */ refcount_inc(&cm_id_priv->refcount); - atomic_inc(&cm_id_priv->work_count); - spin_unlock_irq(&cm_id_priv->lock); - cm_process_work(cm_id_priv, work); + cm_queue_work_unlock(cm_id_priv, work); /* * Since this ID was just created and was not made visible to other MAD * handlers until the cm_finalize_id() above we know that the @@ -2492,15 +2522,7 @@ static int cm_rep_handler(struct cm_work *work) cm_id_priv->alt_av.timeout - 1); ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; error: @@ -2511,7 +2533,6 @@ error: static int cm_establish_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; - int ret; /* See comment in cm_establish about lookup. */ cm_id_priv = cm_acquire_id(work->local_id, work->remote_id); @@ -2525,15 +2546,7 @@ static int cm_establish_handler(struct cm_work *work) } ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -2544,7 +2557,6 @@ static int cm_rtu_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rtu_msg *rtu_msg; - int ret; rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( @@ -2567,15 +2579,7 @@ static int cm_rtu_handler(struct cm_work *work) cm_id_priv->id.state = IB_CM_ESTABLISHED; ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -2768,7 +2772,6 @@ static int cm_dreq_handler(struct cm_work *work) struct cm_id_private *cm_id_priv; struct cm_dreq_msg *dreq_msg; struct ib_mad_send_buf *msg = NULL; - int ret; dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( @@ -2833,15 +2836,7 @@ static int cm_dreq_handler(struct cm_work *work) } cm_id_priv->id.state = IB_CM_DREQ_RCVD; cm_id_priv->tid = dreq_msg->hdr.tid; - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); @@ -2853,7 +2848,6 @@ static int cm_drep_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_drep_msg *drep_msg; - int ret; drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_id( @@ -2874,15 +2868,7 @@ static int cm_drep_handler(struct cm_work *work) cm_enter_timewait(cm_id_priv); ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -3010,7 +2996,6 @@ static int cm_rej_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_rej_msg *rej_msg; - int ret; rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_rejected_id(rej_msg); @@ -3059,15 +3044,7 @@ static int cm_rej_handler(struct cm_work *work) goto out; } - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -3177,7 +3154,7 @@ static int cm_mra_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_mra_msg *mra_msg; - int timeout, ret; + int timeout; mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad; cm_id_priv = cm_acquire_mraed_id(mra_msg); @@ -3237,15 +3214,7 @@ static int cm_mra_handler(struct cm_work *work) cm_id_priv->msg->context[1] = (void *) (unsigned long) cm_id_priv->id.state; - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: spin_unlock_irq(&cm_id_priv->lock); @@ -3380,15 +3349,7 @@ static int cm_lap_handler(struct cm_work *work) cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; unlock: spin_unlock_irq(&cm_id_priv->lock); @@ -3400,7 +3361,6 @@ static int cm_apr_handler(struct cm_work *work) { struct cm_id_private *cm_id_priv; struct cm_apr_msg *apr_msg; - int ret; /* Currently Alternate path messages are not supported for * RoCE link layer. @@ -3435,16 +3395,7 @@ static int cm_apr_handler(struct cm_work *work) cm_id_priv->id.lap_state = IB_CM_LAP_IDLE; ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); cm_id_priv->msg = NULL; - - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); @@ -3455,7 +3406,6 @@ static int cm_timewait_handler(struct cm_work *work) { struct cm_timewait_info *timewait_info; struct cm_id_private *cm_id_priv; - int ret; timewait_info = container_of(work, struct cm_timewait_info, work); spin_lock_irq(&cm.lock); @@ -3474,15 +3424,7 @@ static int cm_timewait_handler(struct cm_work *work) goto out; } cm_id_priv->id.state = IB_CM_IDLE; - ret = atomic_inc_and_test(&cm_id_priv->work_count); - if (!ret) - list_add_tail(&work->list, &cm_id_priv->work_list); - spin_unlock_irq(&cm_id_priv->lock); - - if (ret) - cm_process_work(cm_id_priv, work); - else - cm_deref_id(cm_id_priv); + cm_queue_work_unlock(cm_id_priv, work); return 0; out: cm_deref_id(cm_id_priv); -- cgit v1.2.3 From 9767a27e1aeb462812e9f054c313180eeff0b5c3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 May 2020 10:46:56 +0300 Subject: RDMA/cm: Pass the cm_id_private into cm_cleanup_timewait Also rename it to cm_remove_remote(). This function now removes the tracking of the remote ID/QPN in the redblack trees from a cm_id_private. Replace a open-coded version with a call. The open coded version was deleting only the remote_id, however at this call site the qpn can not have been in the RB tree either, so the cm_remove_remote() will do the same. Link: https://lore.kernel.org/r/20200506074701.9775-6-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index f56494de3c77..6a2a5f6e6d90 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -964,8 +964,10 @@ static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time) return min(31, ack_timeout); } -static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info) +static void cm_remove_remote(struct cm_id_private *cm_id_priv) { + struct cm_timewait_info *timewait_info = cm_id_priv->timewait_info; + if (timewait_info->inserted_remote_id) { rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table); timewait_info->inserted_remote_id = 0; @@ -1004,7 +1006,7 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) return; spin_lock_irqsave(&cm.lock, flags); - cm_cleanup_timewait(cm_id_priv->timewait_info); + cm_remove_remote(cm_id_priv); list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list); spin_unlock_irqrestore(&cm.lock, flags); @@ -1035,7 +1037,7 @@ static void cm_reset_to_idle(struct cm_id_private *cm_id_priv) cm_id_priv->id.state = IB_CM_IDLE; if (cm_id_priv->timewait_info) { spin_lock_irqsave(&cm.lock, flags); - cm_cleanup_timewait(cm_id_priv->timewait_info); + cm_remove_remote(cm_id_priv); spin_unlock_irqrestore(&cm.lock, flags); kfree(cm_id_priv->timewait_info); cm_id_priv->timewait_info = NULL; @@ -1136,7 +1138,7 @@ retest: spin_lock(&cm.lock); /* Required for cleanup paths related cm_req_handler() */ if (cm_id_priv->timewait_info) { - cm_cleanup_timewait(cm_id_priv->timewait_info); + cm_remove_remote(cm_id_priv); kfree(cm_id_priv->timewait_info); cm_id_priv->timewait_info = NULL; } @@ -1971,7 +1973,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, /* Check for stale connections. */ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); if (timewait_info) { - cm_cleanup_timewait(cm_id_priv->timewait_info); + cm_remove_remote(cm_id_priv); cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); @@ -1992,7 +1994,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, cm_id_priv->id.device, cpu_to_be64(IBA_GET(CM_REQ_SERVICE_ID, req_msg))); if (!listen_cm_id_priv) { - cm_cleanup_timewait(cm_id_priv->timewait_info); + cm_remove_remote(cm_id_priv); spin_unlock_irq(&cm.lock); cm_issue_rej(work->port, work->mad_recv_wc, IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ, @@ -2475,9 +2477,7 @@ static int cm_rep_handler(struct cm_work *work) /* Check for a stale connection. */ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info); if (timewait_info) { - rb_erase(&cm_id_priv->timewait_info->remote_id_node, - &cm.remote_id_table); - cm_id_priv->timewait_info->inserted_remote_id = 0; + cm_remove_remote(cm_id_priv); cur_cm_id_priv = cm_acquire_id(timewait_info->work.local_id, timewait_info->work.remote_id); -- cgit v1.2.3 From 09fb406a569b2f4b596048e1cfa2c92a35b8fc9b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 May 2020 10:46:57 +0300 Subject: RDMA/cm: Add a note explaining how the timewait is eventually freed The way the cm_timewait_info is converted into a work and then freed is very subtle and surprising, add a note clarifying the lifetime here. Link: https://lore.kernel.org/r/20200506074701.9775-7-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 6a2a5f6e6d90..74c46b0272b9 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1025,6 +1025,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) msecs_to_jiffies(wait_time)); spin_unlock_irqrestore(&cm.lock, flags); + /* + * The timewait_info is converted into a work and gets freed during + * cm_free_work() in cm_timewait_handler(). + */ + BUILD_BUG_ON(offsetof(struct cm_timewait_info, work) != 0); cm_id_priv->timewait_info = NULL; } -- cgit v1.2.3 From cfa68b0d04401b3ae733787b1c72ab1266e852ab Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 May 2020 10:46:58 +0300 Subject: RDMA/cm: Make find_remote_id() return a cm_id_private The only caller doesn't care about the timewait, so acquire and return the cm_id_private from the function. Link: https://lore.kernel.org/r/20200506074701.9775-8-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 74c46b0272b9..4f77ca2ba8fc 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -738,12 +738,14 @@ static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info return NULL; } -static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid, - __be32 remote_id) +static struct cm_id_private *cm_find_remote_id(__be64 remote_ca_guid, + __be32 remote_id) { struct rb_node *node = cm.remote_id_table.rb_node; struct cm_timewait_info *timewait_info; + struct cm_id_private *res = NULL; + spin_lock_irq(&cm.lock); while (node) { timewait_info = rb_entry(node, struct cm_timewait_info, remote_id_node); @@ -755,10 +757,14 @@ static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid, node = node->rb_left; else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid)) node = node->rb_right; - else - return timewait_info; + else { + res = cm_acquire_id(timewait_info->work.local_id, + timewait_info->work.remote_id); + break; + } } - return NULL; + spin_unlock_irq(&cm.lock); + return res; } static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info @@ -2966,24 +2972,15 @@ static void cm_format_rej_event(struct cm_work *work) static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) { - struct cm_timewait_info *timewait_info; struct cm_id_private *cm_id_priv; __be32 remote_id; remote_id = cpu_to_be32(IBA_GET(CM_REJ_LOCAL_COMM_ID, rej_msg)); if (IBA_GET(CM_REJ_REASON, rej_msg) == IB_CM_REJ_TIMEOUT) { - spin_lock_irq(&cm.lock); - timewait_info = cm_find_remote_id( + cm_id_priv = cm_find_remote_id( *((__be64 *)IBA_GET_MEM_PTR(CM_REJ_ARI, rej_msg)), remote_id); - if (!timewait_info) { - spin_unlock_irq(&cm.lock); - return NULL; - } - cm_id_priv = - cm_acquire_id(timewait_info->work.local_id, remote_id); - spin_unlock_irq(&cm.lock); } else if (IBA_GET(CM_REJ_MESSAGE_REJECTED, rej_msg) == CM_MSG_RESPONSE_REQ) cm_id_priv = cm_acquire_id( -- cgit v1.2.3 From 1cc44279f2973b413bae69d9a0bf01f051f382b0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 May 2020 10:46:59 +0300 Subject: RDMA/cm: Remove the cm_free_id() wrapper function Just call xa_erase directly during cm_destroy_id() Link: https://lore.kernel.org/r/20200506074701.9775-9-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 4f77ca2ba8fc..320fe89af5cb 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -580,11 +580,6 @@ static u32 cm_local_id(__be32 local_id) return (__force u32) (local_id ^ cm.random_id_operand); } -static void cm_free_id(__be32 local_id) -{ - xa_erase_irq(&cm.local_id_table, cm_local_id(local_id)); -} - static struct cm_id_private *cm_acquire_id(__be32 local_id, __be32 remote_id) { struct cm_id_private *cm_id_priv; @@ -1136,7 +1131,7 @@ retest: case IB_CM_TIMEWAIT: /* * The cm_acquire_id in cm_timewait_handler will stop working - * once we do cm_free_id() below, so just move to idle here for + * once we do xa_erase below, so just move to idle here for * consistency. */ cm_id->state = IB_CM_IDLE; @@ -1166,7 +1161,7 @@ retest: spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); - cm_free_id(cm_id->local_id); + xa_erase_irq(&cm.local_id_table, cm_local_id(cm_id->local_id)); cm_deref_id(cm_id_priv); wait_for_completion(&cm_id_priv->comp); while ((work = cm_dequeue_work(cm_id_priv)) != NULL) -- cgit v1.2.3 From 51e8463cfc48ca030e532c9127ee1219a95795c3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 May 2020 10:47:00 +0300 Subject: RDMA/cm: Remove needless cm_id variable Just put the expression in the only reader Link: https://lore.kernel.org/r/20200506074701.9775-10-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 320fe89af5cb..15cd5253d2c7 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1958,7 +1958,6 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv; struct cm_timewait_info *timewait_info; struct cm_req_msg *req_msg; - struct ib_cm_id *cm_id; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; @@ -1988,8 +1987,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ, NULL, 0); if (cur_cm_id_priv) { - cm_id = &cur_cm_id_priv->id; - ib_send_cm_dreq(cm_id, NULL, 0); + ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0); cm_deref_id(cur_cm_id_priv); } return NULL; @@ -2433,7 +2431,6 @@ static int cm_rep_handler(struct cm_work *work) struct cm_rep_msg *rep_msg; int ret; struct cm_id_private *cur_cm_id_priv; - struct ib_cm_id *cm_id; struct cm_timewait_info *timewait_info; rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad; @@ -2499,8 +2496,7 @@ static int cm_rep_handler(struct cm_work *work) IBA_GET(CM_REP_REMOTE_COMM_ID, rep_msg)); if (cur_cm_id_priv) { - cm_id = &cur_cm_id_priv->id; - ib_send_cm_dreq(cm_id, NULL, 0); + ib_send_cm_dreq(&cur_cm_id_priv->id, NULL, 0); cm_deref_id(cur_cm_id_priv); } -- cgit v1.2.3 From a0e46db4e764f56c61f85c235c50bf4578c51a47 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 May 2020 10:47:01 +0300 Subject: RDMA/cm: Increment the refcount inside cm_find_listen() All callers need the 'get', so do it in a central place before returning the pointer. Link: https://lore.kernel.org/r/20200506074701.9775-11-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 15cd5253d2c7..fb47cd55ce42 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -686,9 +686,10 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device, cm_id_priv = rb_entry(node, struct cm_id_private, service_node); if ((cm_id_priv->id.service_mask & service_id) == cm_id_priv->id.service_id && - (cm_id_priv->id.device == device)) + (cm_id_priv->id.device == device)) { + refcount_inc(&cm_id_priv->refcount); return cm_id_priv; - + } if (device < cm_id_priv->id.device) node = node->rb_left; else if (device > cm_id_priv->id.device) @@ -2005,7 +2006,6 @@ static struct cm_id_private * cm_match_req(struct cm_work *work, NULL, 0); return NULL; } - refcount_inc(&listen_cm_id_priv->refcount); spin_unlock_irq(&cm.lock); return listen_cm_id_priv; } @@ -3564,7 +3564,6 @@ static int cm_sidr_req_handler(struct cm_work *work) .status = IB_SIDR_UNSUPPORTED }); goto out; /* No match. */ } - refcount_inc(&listen_cm_id_priv->refcount); spin_unlock_irq(&cm.lock); cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; -- cgit v1.2.3 From 0cb9e4f9e98af4b969b9140e964c646ad6ed7d40 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Thu, 7 May 2020 13:53:42 -0500 Subject: IB/rdmavt: Replace zero-length array with flexible-array The current codebase makes use of the zero-length array language extension to the C90 standard, but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. Also, notice that, dynamic memory allocations won't be affected by this change: "Flexible array members have incomplete type, and so the sizeof operator may not be applied. As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero."[1] sizeof(flexible-array-member) triggers a warning because flexible array members have incomplete type[1]. There are some instances of code in which the sizeof operator is being incorrectly/erroneously applied to zero-length arrays and the result is zero. Such instances may be hiding some bugs. So, this work (flexible-array member conversions) will also help to get completely rid of those sorts of issues. This issue was found with the help of Coccinelle. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Link: https://lore.kernel.org/r/20200507185342.GA14476@embeddedor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jason Gunthorpe --- include/rdma/rdmavt_qp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 5fc10108703a..982bf2340840 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -440,7 +440,7 @@ struct rvt_qp { /* * This sge list MUST be last. Do not add anything below here. */ - struct rvt_sge r_sg_list[0] /* verified SGEs */ + struct rvt_sge r_sg_list[] /* verified SGEs */ ____cacheline_aligned_in_smp; }; -- cgit v1.2.3 From b9019507aa6e3e6a039573eb6743b38bf846771b Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 4 May 2020 08:30:11 +0300 Subject: RDMA/mlx5: Refactor DV create flow Move part of the code that get the destinations into function so the code will be more readable. In addition change the variables definition to be in reversed christmas tree. Link: https://lore.kernel.org/r/20200504053012.270689-4-leon@kernel.org Signed-off-by: Maor Gottlieb Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 108 +++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 49 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 08fd6a650868..5533b5083c29 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -67,40 +67,18 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { }, }; -#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2 -static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( - struct uverbs_attr_bundle *attrs) +static int get_dests(struct uverbs_attr_bundle *attrs, + struct mlx5_ib_flow_matcher *fs_matcher, int *dest_id, + int *dest_type, struct ib_qp **qp) { - struct mlx5_flow_context flow_context = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; - struct mlx5_ib_flow_handler *flow_handler; - struct mlx5_ib_flow_matcher *fs_matcher; - struct ib_uobject **arr_flow_actions; - struct ib_uflow_resources *uflow_res; - struct mlx5_flow_act flow_act = {}; - void *devx_obj; - int dest_id, dest_type; - void *cmd_in; - int inlen; bool dest_devx, dest_qp; - struct ib_qp *qp = NULL; - struct ib_uobject *uobj = - uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); - struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata); - int len, ret, i; - u32 counter_id = 0; - u32 *offset_attr; - u32 offset = 0; - - if (!capable(CAP_NET_RAW)) - return -EPERM; + void *devx_obj; - dest_devx = - uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX); + dest_devx = uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX); dest_qp = uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); - fs_matcher = uverbs_attr_get_obj(attrs, - MLX5_IB_ATTR_CREATE_FLOW_MATCHER); if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS && ((dest_devx && dest_qp) || (!dest_devx && !dest_qp))) return -EINVAL; @@ -114,43 +92,79 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( ((!dest_devx && !dest_qp) || (dest_devx && dest_qp))) return -EINVAL; + *qp = NULL; if (dest_devx) { - devx_obj = uverbs_attr_get_obj( - attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX); - if (IS_ERR(devx_obj)) - return PTR_ERR(devx_obj); + devx_obj = + uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX); /* Verify that the given DEVX object is a flow * steering destination. */ - if (!mlx5_ib_devx_is_flow_dest(devx_obj, &dest_id, &dest_type)) + if (!mlx5_ib_devx_is_flow_dest(devx_obj, dest_id, dest_type)) return -EINVAL; /* Allow only flow table as dest when inserting to FDB or RDMA_RX */ if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB || fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) && - dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) + *dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) return -EINVAL; } else if (dest_qp) { struct mlx5_ib_qp *mqp; - qp = uverbs_attr_get_obj(attrs, - MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); - if (IS_ERR(qp)) - return PTR_ERR(qp); + *qp = uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); + if (IS_ERR(*qp)) + return PTR_ERR(*qp); - if (qp->qp_type != IB_QPT_RAW_PACKET) + if ((*qp)->qp_type != IB_QPT_RAW_PACKET) return -EINVAL; - mqp = to_mqp(qp); + mqp = to_mqp(*qp); if (mqp->is_rss) - dest_id = mqp->rss_qp.tirn; + *dest_id = mqp->rss_qp.tirn; else - dest_id = mqp->raw_packet_qp.rq.tirn; - dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; - } else { - dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT; + *dest_id = mqp->raw_packet_qp.rq.tirn; + *dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; + } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) { + *dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT; } + if (*dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR && + fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) + return -EINVAL; + + return 0; +} + +#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2 +static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_flow_context flow_context = {.flow_tag = + MLX5_FS_DEFAULT_FLOW_TAG}; + u32 *offset_attr, offset = 0, counter_id = 0; + int dest_id, dest_type, inlen, len, ret, i; + struct mlx5_ib_flow_handler *flow_handler; + struct mlx5_ib_flow_matcher *fs_matcher; + struct ib_uobject **arr_flow_actions; + struct ib_uflow_resources *uflow_res; + struct mlx5_flow_act flow_act = {}; + struct ib_qp *qp = NULL; + void *devx_obj, *cmd_in; + struct ib_uobject *uobj; + struct mlx5_ib_dev *dev; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + fs_matcher = uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_MATCHER); + uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); + dev = mlx5_udata_to_mdev(&attrs->driver_udata); + + if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp)) + return -EINVAL; + len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions); if (len) { @@ -180,10 +194,6 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; } - if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR && - fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) - return -EINVAL; - cmd_in = uverbs_attr_get_alloced_ptr( attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); inlen = uverbs_attr_get_len(attrs, -- cgit v1.2.3 From 8c112a5f29a343f89072bed4b9fa176fea226798 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Mon, 4 May 2020 08:30:12 +0300 Subject: RDMA/mlx5: Add support in steering default miss User can configure default miss rule in order to skip matching in the user domain and forward the packet to the kernel steering domain. When user requests a default miss rule, we add steering rule to forward the traffic to the next namespace. Link: https://lore.kernel.org/r/20200504053012.270689-5-leon@kernel.org Signed-off-by: Maor Gottlieb Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 32 ++++++++++++++++++++++++++------ drivers/infiniband/hw/mlx5/main.c | 9 ++++----- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 5 +++++ 3 files changed, 35 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 5533b5083c29..3fa66474afa6 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -69,19 +69,32 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { static int get_dests(struct uverbs_attr_bundle *attrs, struct mlx5_ib_flow_matcher *fs_matcher, int *dest_id, - int *dest_type, struct ib_qp **qp) + int *dest_type, struct ib_qp **qp, bool *def_miss) { bool dest_devx, dest_qp; void *devx_obj; + u32 flags; + int err; dest_devx = uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX); dest_qp = uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); - if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS && - ((dest_devx && dest_qp) || (!dest_devx && !dest_qp))) - return -EINVAL; + *def_miss = false; + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS); + if (err) + return err; + *def_miss = flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS; + + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) { + if (dest_devx && (dest_qp || *def_miss)) + return -EINVAL; + else if (dest_qp && *def_miss) + return -EINVAL; + } /* Allow only DEVX object as dest when inserting to FDB */ if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !dest_devx) @@ -153,6 +166,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( void *devx_obj, *cmd_in; struct ib_uobject *uobj; struct mlx5_ib_dev *dev; + bool def_miss; if (!capable(CAP_NET_RAW)) return -EPERM; @@ -162,9 +176,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); dev = mlx5_udata_to_mdev(&attrs->driver_udata); - if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp)) + if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp, &def_miss)) return -EINVAL; + if (def_miss) + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS; + len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions); if (len) { @@ -636,7 +653,10 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, UVERBS_ATTR_MIN_SIZE(sizeof(u32)), UA_OPTIONAL, - UA_ALLOC_AND_COPY)); + UA_ALLOC_AND_COPY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_FLAGS, + enum mlx5_ib_create_flow_flags, + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_DESTROY_FLOW, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 3af57dfe7224..38bf3841741c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4200,18 +4200,17 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) { dst[dst_num].type = dest_type; - dst[dst_num].tir_num = dest_id; + dst[dst_num++].tir_num = dest_id; flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; } else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) { dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM; - dst[dst_num].ft_num = dest_id; + dst[dst_num++].ft_num = dest_id; flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - } else { - dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_PORT; + } else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_PORT) { + dst[dst_num++].type = MLX5_FLOW_DESTINATION_TYPE_PORT; flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; } - dst_num++; if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 24f3388c3182..07cf54333193 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -241,6 +241,10 @@ enum mlx5_ib_flow_type { MLX5_IB_FLOW_TYPE_MC_DEFAULT, }; +enum mlx5_ib_create_flow_flags { + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS = 1 << 0, +}; + enum mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_HANDLE = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE, @@ -251,6 +255,7 @@ enum mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_TAG, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS, }; enum mlx5_ib_destoy_flow_attrs { -- cgit v1.2.3 From f29de9eee78253d9ae57cd58a6b21eed021742c8 Mon Sep 17 00:00:00 2001 From: Daria Velikovsky Date: Mon, 4 May 2020 08:42:27 +0300 Subject: RDMA/mlx5: Add support for drop action in DV steering When drop action is used the matching packet will stop processing in steering and will be dropped. This functionality will allow users to drop matching packets. Link: https://lore.kernel.org/r/20200504054227.271486-1-leon@kernel.org Signed-off-by: Daria Velikovsky Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 35 +++++++++++++++++++------------- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 3fa66474afa6..6fa1a510c5d7 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -69,11 +69,10 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { static int get_dests(struct uverbs_attr_bundle *attrs, struct mlx5_ib_flow_matcher *fs_matcher, int *dest_id, - int *dest_type, struct ib_qp **qp, bool *def_miss) + int *dest_type, struct ib_qp **qp, u32 *flags) { bool dest_devx, dest_qp; void *devx_obj; - u32 flags; int err; dest_devx = uverbs_attr_is_valid(attrs, @@ -81,23 +80,28 @@ static int get_dests(struct uverbs_attr_bundle *attrs, dest_qp = uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); - *def_miss = false; - err = uverbs_get_flags32(&flags, attrs, - MLX5_IB_ATTR_CREATE_FLOW_FLAGS, - MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS); + *flags = 0; + err = uverbs_get_flags32(flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_FLAGS, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS | + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP); if (err) return err; - *def_miss = flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS; + + /* Both flags are not allowed */ + if (*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS && + *flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP) + return -EINVAL; if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) { - if (dest_devx && (dest_qp || *def_miss)) + if (dest_devx && (dest_qp || *flags)) return -EINVAL; - else if (dest_qp && *def_miss) + else if (dest_qp && *flags) return -EINVAL; } - /* Allow only DEVX object as dest when inserting to FDB */ - if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !dest_devx) + /* Allow only DEVX object, drop as dest for FDB */ + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !(dest_devx || + (*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP))) return -EINVAL; /* Allow only DEVX object or QP as dest when inserting to RDMA_RX */ @@ -166,7 +170,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( void *devx_obj, *cmd_in; struct ib_uobject *uobj; struct mlx5_ib_dev *dev; - bool def_miss; + u32 flags; if (!capable(CAP_NET_RAW)) return -EPERM; @@ -176,12 +180,15 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); dev = mlx5_udata_to_mdev(&attrs->driver_udata); - if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp, &def_miss)) + if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp, &flags)) return -EINVAL; - if (def_miss) + if (flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS) flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS; + if (flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP) + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_DROP; + len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions); if (len) { diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 07cf54333193..8e316ef896b5 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -243,6 +243,7 @@ enum mlx5_ib_flow_type { enum mlx5_ib_create_flow_flags { MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS = 1 << 0, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP = 1 << 1, }; enum mlx5_ib_create_flow_attrs { -- cgit v1.2.3 From 59dde4d19cf8de232c17c79c08e0db67636b022b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 13 May 2020 13:08:09 +0300 Subject: RDMA/mlx5: Fix query_srq_cmd() function The output buffer used in mlx5_cmd_exec_inout() was wrongly changed from pre-allocated srq_out pointer to an input "out" point. That leads to unpredictable results in the get_srqc() call later. Fixes: 31578defe4eb ("RDMA/mlx5: Update mlx5_ib to use new cmd interface") Link: https://lore.kernel.org/r/20200513100809.246315-1-leon@kernel.org Reported-by: Dan Carpenter Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/srq_cmd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c index bc50a712bf2e..6f5eadc4d183 100644 --- a/drivers/infiniband/hw/mlx5/srq_cmd.c +++ b/drivers/infiniband/hw/mlx5/srq_cmd.c @@ -169,16 +169,16 @@ static int query_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq, MLX5_SET(query_srq_in, in, opcode, MLX5_CMD_OP_QUERY_SRQ); MLX5_SET(query_srq_in, in, srqn, srq->srqn); - err = mlx5_cmd_exec_inout(dev->mdev, query_srq, in, out); + err = mlx5_cmd_exec_inout(dev->mdev, query_srq, in, srq_out); if (err) goto out; - srqc = MLX5_ADDR_OF(query_srq_out, out, srq_context_entry); + srqc = MLX5_ADDR_OF(query_srq_out, srq_out, srq_context_entry); get_srqc(srqc, out); if (MLX5_GET(srqc, srqc, state) != MLX5_SRQC_STATE_GOOD) out->flags |= MLX5_SRQ_FLAG_ERR; out: - kvfree(out); + kvfree(srq_out); return err; } -- cgit v1.2.3 From 9ddacff18b159fd9852734f611a6db6100432635 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:07 +0200 Subject: sysfs: export sysfs_remove_file_self() Function is going to be used in transport over RDMA module in subsequent patches, so export it to GPL modules. Link: https://lore.kernel.org/r/20200511135131.27580-2-danil.kipnis@cloud.ionos.com Signed-off-by: Roman Pen Acked-by: Tejun Heo Cc: linux-kernel@vger.kernel.org [jwang: extend the commit message] Signed-off-by: Jack Wang Reviewed-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- fs/sysfs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 26bbf960e2a2..d81f9f974a35 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -492,6 +492,7 @@ bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) kernfs_put(kn); return ret; } +EXPORT_SYMBOL_GPL(sysfs_remove_file_self); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *ptr) { -- cgit v1.2.3 From b5c27cdb094ed9cce562e98c931f53669b6409f7 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:08 +0200 Subject: RDMA/rtrs: public interface header to establish RDMA connections Introduce public header which provides set of API functions to establish RDMA connections from client to server machine using RTRS protocol, which manages RDMA connections for each session, does multipathing and load balancing. Main functions for client (active) side: rtrs_clt_open() - Creates set of RDMA connections incapsulated in IBTRS session and returns pointer on RTRS session object. rtrs_clt_close() - Closes RDMA connections associated with RTRS session. rtrs_clt_request() - Requests zero-copy RDMA transfer to/from server. Main functions for server (passive) side: rtrs_srv_open() - Starts listening for RTRS clients on specified port and invokes RTRS callbacks for incoming RDMA requests or link events. rtrs_srv_close() - Closes RTRS server context. Link: https://lore.kernel.org/r/20200511135131.27580-3-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs.h | 195 +++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 drivers/infiniband/ulp/rtrs/rtrs.h diff --git a/drivers/infiniband/ulp/rtrs/rtrs.h b/drivers/infiniband/ulp/rtrs/rtrs.h new file mode 100644 index 000000000000..9879d40467b6 --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/rtrs.h @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Transport Layer + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#ifndef RTRS_H +#define RTRS_H + +#include +#include + +struct rtrs_permit; +struct rtrs_clt; +struct rtrs_srv_ctx; +struct rtrs_srv; +struct rtrs_srv_op; + +/* + * RDMA transport (RTRS) client API + */ + +/** + * enum rtrs_clt_link_ev - Events about connectivity state of a client + * @RTRS_CLT_LINK_EV_RECONNECTED Client was reconnected. + * @RTRS_CLT_LINK_EV_DISCONNECTED Client was disconnected. + */ +enum rtrs_clt_link_ev { + RTRS_CLT_LINK_EV_RECONNECTED, + RTRS_CLT_LINK_EV_DISCONNECTED, +}; + +/** + * Source and destination address of a path to be established + */ +struct rtrs_addr { + struct sockaddr_storage *src; + struct sockaddr_storage *dst; +}; + +/** + * rtrs_clt_ops - it holds the link event callback and private pointer. + * @priv: User supplied private data. + * @link_ev: Event notification callback function for connection state changes + * @priv: User supplied data that was passed to rtrs_clt_open() + * @ev: Occurred event + */ +struct rtrs_clt_ops { + void *priv; + void (*link_ev)(void *priv, enum rtrs_clt_link_ev ev); +}; + +struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, + const char *sessname, + const struct rtrs_addr *paths, + size_t path_cnt, u16 port, + size_t pdu_sz, u8 reconnect_delay_sec, + u16 max_segments, + s16 max_reconnect_attempts); + +void rtrs_clt_close(struct rtrs_clt *sess); + +/** + * rtrs_permit_to_pdu() - converts rtrs_permit to opaque pdu pointer + * @permit: RTRS permit pointer, it associates the memory allocation for future + * RDMA operation. + */ +void *rtrs_permit_to_pdu(struct rtrs_permit *permit); + +enum { + RTRS_PERMIT_NOWAIT = 0, + RTRS_PERMIT_WAIT = 1, +}; + +/** + * enum rtrs_clt_con_type() type of ib connection to use with a given + * rtrs_permit + * @ADMIN_CON - use connection reserved for "service" messages + * @IO_CON - use a connection reserved for IO + */ +enum rtrs_clt_con_type { + RTRS_ADMIN_CON, + RTRS_IO_CON +}; + +struct rtrs_permit *rtrs_clt_get_permit(struct rtrs_clt *sess, + enum rtrs_clt_con_type con_type, + int wait); + +void rtrs_clt_put_permit(struct rtrs_clt *sess, struct rtrs_permit *permit); + +/** + * rtrs_clt_req_ops - it holds the request confirmation callback + * and a private pointer. + * @priv: User supplied private data. + * @conf_fn: callback function to be called as confirmation + * @priv: User provided data, passed back with corresponding + * @(conf) confirmation. + * @errno: error number. + */ +struct rtrs_clt_req_ops { + void *priv; + void (*conf_fn)(void *priv, int errno); +}; + +int rtrs_clt_request(int dir, struct rtrs_clt_req_ops *ops, + struct rtrs_clt *sess, struct rtrs_permit *permit, + const struct kvec *vec, size_t nr, size_t len, + struct scatterlist *sg, unsigned int sg_cnt); + +/** + * rtrs_attrs - RTRS session attributes + */ +struct rtrs_attrs { + u32 queue_depth; + u32 max_io_size; + u8 sessname[NAME_MAX]; + struct kobject *sess_kobj; +}; + +int rtrs_clt_query(struct rtrs_clt *sess, struct rtrs_attrs *attr); + +/* + * Here goes RTRS server API + */ + +/** + * enum rtrs_srv_link_ev - Server link events + * @RTRS_SRV_LINK_EV_CONNECTED: Connection from client established + * @RTRS_SRV_LINK_EV_DISCONNECTED: Connection was disconnected, all + * connection RTRS resources were freed. + */ +enum rtrs_srv_link_ev { + RTRS_SRV_LINK_EV_CONNECTED, + RTRS_SRV_LINK_EV_DISCONNECTED, +}; + +struct rtrs_srv_ops { + /** + * rdma_ev(): Event notification for RDMA operations + * If the callback returns a value != 0, an error + * message for the data transfer will be sent to + * the client. + + * @sess: Session + * @priv: Private data set by rtrs_srv_set_sess_priv() + * @id: internal RTRS operation id + * @dir: READ/WRITE + * @data: Pointer to (bidirectional) rdma memory area: + * - in case of %RTRS_SRV_RDMA_EV_RECV contains + * data sent by the client + * - in case of %RTRS_SRV_RDMA_EV_WRITE_REQ points + * to the memory area where the response is to be + * written to + * @datalen: Size of the memory area in @data + * @usr: The extra user message sent by the client (%vec) + * @usrlen: Size of the user message + */ + int (*rdma_ev)(struct rtrs_srv *sess, void *priv, + struct rtrs_srv_op *id, int dir, + void *data, size_t datalen, const void *usr, + size_t usrlen); + /** + * link_ev(): Events about connectivity state changes + * If the callback returns != 0 and the event + * %RTRS_SRV_LINK_EV_CONNECTED the corresponding + * session will be destroyed. + * @sess: Session + * @ev: event + * @priv: Private data from user if previously set with + * rtrs_srv_set_sess_priv() + */ + int (*link_ev)(struct rtrs_srv *sess, enum rtrs_srv_link_ev ev, + void *priv); +}; + +struct rtrs_srv_ctx *rtrs_srv_open(struct rtrs_srv_ops *ops, u16 port); + +void rtrs_srv_close(struct rtrs_srv_ctx *ctx); + +bool rtrs_srv_resp_rdma(struct rtrs_srv_op *id, int errno); + +void rtrs_srv_set_sess_priv(struct rtrs_srv *sess, void *priv); + +int rtrs_srv_get_sess_name(struct rtrs_srv *sess, char *sessname, size_t len); + +int rtrs_srv_get_queue_depth(struct rtrs_srv *sess); + +int rtrs_addr_to_sockaddr(const char *str, size_t len, u16 port, + struct rtrs_addr *addr); + +int sockaddr_to_str(const struct sockaddr *addr, char *buf, size_t len); +#endif -- cgit v1.2.3 From 91fddedd439c2463762275693f784abc9d9613e2 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:09 +0200 Subject: RDMA/rtrs: private headers with rtrs protocol structs and helpers These are common private headers with rtrs protocol structures, logging, sysfs and other helper functions, which are used on both client and server sides. Link: https://lore.kernel.org/r/20200511135131.27580-4-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-log.h | 28 +++ drivers/infiniband/ulp/rtrs/rtrs-pri.h | 399 +++++++++++++++++++++++++++++++++ 2 files changed, 427 insertions(+) create mode 100644 drivers/infiniband/ulp/rtrs/rtrs-log.h create mode 100644 drivers/infiniband/ulp/rtrs/rtrs-pri.h diff --git a/drivers/infiniband/ulp/rtrs/rtrs-log.h b/drivers/infiniband/ulp/rtrs/rtrs-log.h new file mode 100644 index 000000000000..53c785b992f2 --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/rtrs-log.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Transport Layer + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#ifndef RTRS_LOG_H +#define RTRS_LOG_H + +#define rtrs_log(fn, obj, fmt, ...) \ + fn("<%s>: " fmt, obj->sessname, ##__VA_ARGS__) + +#define rtrs_err(obj, fmt, ...) \ + rtrs_log(pr_err, obj, fmt, ##__VA_ARGS__) +#define rtrs_err_rl(obj, fmt, ...) \ + rtrs_log(pr_err_ratelimited, obj, fmt, ##__VA_ARGS__) +#define rtrs_wrn(obj, fmt, ...) \ + rtrs_log(pr_warn, obj, fmt, ##__VA_ARGS__) +#define rtrs_wrn_rl(obj, fmt, ...) \ + rtrs_log(pr_warn_ratelimited, obj, fmt, ##__VA_ARGS__) +#define rtrs_info(obj, fmt, ...) \ + rtrs_log(pr_info, obj, fmt, ##__VA_ARGS__) +#define rtrs_info_rl(obj, fmt, ...) \ + rtrs_log(pr_info_ratelimited, obj, fmt, ##__VA_ARGS__) + +#endif /* RTRS_LOG_H */ diff --git a/drivers/infiniband/ulp/rtrs/rtrs-pri.h b/drivers/infiniband/ulp/rtrs/rtrs-pri.h new file mode 100644 index 000000000000..0a93c87ef92b --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/rtrs-pri.h @@ -0,0 +1,399 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Transport Layer + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ + +#ifndef RTRS_PRI_H +#define RTRS_PRI_H + +#include +#include +#include +#include + +#include "rtrs.h" + +#define RTRS_PROTO_VER_MAJOR 2 +#define RTRS_PROTO_VER_MINOR 0 + +#define RTRS_PROTO_VER_STRING __stringify(RTRS_PROTO_VER_MAJOR) "." \ + __stringify(RTRS_PROTO_VER_MINOR) + +enum rtrs_imm_const { + MAX_IMM_TYPE_BITS = 4, + MAX_IMM_TYPE_MASK = ((1 << MAX_IMM_TYPE_BITS) - 1), + MAX_IMM_PAYL_BITS = 28, + MAX_IMM_PAYL_MASK = ((1 << MAX_IMM_PAYL_BITS) - 1), +}; + +enum rtrs_imm_type { + RTRS_IO_REQ_IMM = 0, /* client to server */ + RTRS_IO_RSP_IMM = 1, /* server to client */ + RTRS_IO_RSP_W_INV_IMM = 2, /* server to client */ + + RTRS_HB_MSG_IMM = 8, /* HB: HeartBeat */ + RTRS_HB_ACK_IMM = 9, + + RTRS_LAST_IMM, +}; + +enum { + SERVICE_CON_QUEUE_DEPTH = 512, + + MAX_PATHS_NUM = 128, + + /* + * With the size of struct rtrs_permit allocated on the client, 4K + * is the maximum number of rtrs_permits we can allocate. This number is + * also used on the client to allocate the IU for the user connection + * to receive the RDMA addresses from the server. + */ + MAX_SESS_QUEUE_DEPTH = 4096, + + RTRS_HB_INTERVAL_MS = 5000, + RTRS_HB_MISSED_MAX = 5, + + RTRS_MAGIC = 0x1BBD, + RTRS_PROTO_VER = (RTRS_PROTO_VER_MAJOR << 8) | RTRS_PROTO_VER_MINOR, +}; + +struct rtrs_ib_dev; + +struct rtrs_rdma_dev_pd_ops { + struct rtrs_ib_dev *(*alloc)(void); + void (*free)(struct rtrs_ib_dev *dev); + int (*init)(struct rtrs_ib_dev *dev); + void (*deinit)(struct rtrs_ib_dev *dev); +}; + +struct rtrs_rdma_dev_pd { + struct mutex mutex; + struct list_head list; + enum ib_pd_flags pd_flags; + const struct rtrs_rdma_dev_pd_ops *ops; +}; + +struct rtrs_ib_dev { + struct ib_device *ib_dev; + struct ib_pd *ib_pd; + struct kref ref; + struct list_head entry; + struct rtrs_rdma_dev_pd *pool; +}; + +struct rtrs_con { + struct rtrs_sess *sess; + struct ib_qp *qp; + struct ib_cq *cq; + struct rdma_cm_id *cm_id; + unsigned int cid; +}; + +struct rtrs_sess { + struct list_head entry; + struct sockaddr_storage dst_addr; + struct sockaddr_storage src_addr; + char sessname[NAME_MAX]; + uuid_t uuid; + struct rtrs_con **con; + unsigned int con_num; + unsigned int recon_cnt; + struct rtrs_ib_dev *dev; + int dev_ref; + struct ib_cqe *hb_cqe; + void (*hb_err_handler)(struct rtrs_con *con); + struct workqueue_struct *hb_wq; + struct delayed_work hb_dwork; + unsigned int hb_interval_ms; + unsigned int hb_missed_cnt; + unsigned int hb_missed_max; +}; + +/* rtrs information unit */ +struct rtrs_iu { + struct list_head list; + struct ib_cqe cqe; + dma_addr_t dma_addr; + void *buf; + size_t size; + enum dma_data_direction direction; +}; + +/** + * enum rtrs_msg_types - RTRS message types, see also rtrs/README + * @RTRS_MSG_INFO_REQ: Client additional info request to the server + * @RTRS_MSG_INFO_RSP: Server additional info response to the client + * @RTRS_MSG_WRITE: Client writes data per RDMA to server + * @RTRS_MSG_READ: Client requests data transfer from server + * @RTRS_MSG_RKEY_RSP: Server refreshed rkey for rbuf + */ +enum rtrs_msg_types { + RTRS_MSG_INFO_REQ, + RTRS_MSG_INFO_RSP, + RTRS_MSG_WRITE, + RTRS_MSG_READ, + RTRS_MSG_RKEY_RSP, +}; + +/** + * enum rtrs_msg_flags - RTRS message flags. + * @RTRS_NEED_INVAL: Send invalidation in response. + * @RTRS_MSG_NEW_RKEY_F: Send refreshed rkey in response. + */ +enum rtrs_msg_flags { + RTRS_MSG_NEED_INVAL_F = 1 << 0, + RTRS_MSG_NEW_RKEY_F = 1 << 1, +}; + +/** + * struct rtrs_sg_desc - RDMA-Buffer entry description + * @addr: Address of RDMA destination buffer + * @key: Authorization rkey to write to the buffer + * @len: Size of the buffer + */ +struct rtrs_sg_desc { + __le64 addr; + __le32 key; + __le32 len; +}; + +/** + * struct rtrs_msg_conn_req - Client connection request to the server + * @magic: RTRS magic + * @version: RTRS protocol version + * @cid: Current connection id + * @cid_num: Number of connections per session + * @recon_cnt: Reconnections counter + * @sess_uuid: UUID of a session (path) + * @paths_uuid: UUID of a group of sessions (paths) + * + * NOTE: max size 56 bytes, see man rdma_connect(). + */ +struct rtrs_msg_conn_req { + /* Is set to 0 by cma.c in case of AF_IB, do not touch that. + * see https://www.spinics.net/lists/linux-rdma/msg22397.html + */ + u8 __cma_version; + /* On sender side that should be set to 0, or cma_save_ip_info() + * extract garbage and will fail. + */ + u8 __ip_version; + __le16 magic; + __le16 version; + __le16 cid; + __le16 cid_num; + __le16 recon_cnt; + uuid_t sess_uuid; + uuid_t paths_uuid; + u8 reserved[12]; +}; + +/** + * struct rtrs_msg_conn_rsp - Server connection response to the client + * @magic: RTRS magic + * @version: RTRS protocol version + * @errno: If rdma_accept() then 0, if rdma_reject() indicates error + * @queue_depth: max inflight messages (queue-depth) in this session + * @max_io_size: max io size server supports + * @max_hdr_size: max msg header size server supports + * + * NOTE: size is 56 bytes, max possible is 136 bytes, see man rdma_accept(). + */ +struct rtrs_msg_conn_rsp { + __le16 magic; + __le16 version; + __le16 errno; + __le16 queue_depth; + __le32 max_io_size; + __le32 max_hdr_size; + __le32 flags; + u8 reserved[36]; +}; + +/** + * struct rtrs_msg_info_req + * @type: @RTRS_MSG_INFO_REQ + * @sessname: Session name chosen by client + */ +struct rtrs_msg_info_req { + __le16 type; + u8 sessname[NAME_MAX]; + u8 reserved[15]; +}; + +/** + * struct rtrs_msg_info_rsp + * @type: @RTRS_MSG_INFO_RSP + * @sg_cnt: Number of @desc entries + * @desc: RDMA buffers where the client can write to server + */ +struct rtrs_msg_info_rsp { + __le16 type; + __le16 sg_cnt; + u8 reserved[4]; + struct rtrs_sg_desc desc[]; +}; + +/** + * struct rtrs_msg_rkey_rsp + * @type: @RTRS_MSG_RKEY_RSP + * @buf_id: RDMA buf_id of the new rkey + * @rkey: new remote key for RDMA buffers id from server + */ +struct rtrs_msg_rkey_rsp { + __le16 type; + __le16 buf_id; + __le32 rkey; +}; + +/** + * struct rtrs_msg_rdma_read - RDMA data transfer request from client + * @type: always @RTRS_MSG_READ + * @usr_len: length of user payload + * @sg_cnt: number of @desc entries + * @desc: RDMA buffers where the server can write the result to + */ +struct rtrs_msg_rdma_read { + __le16 type; + __le16 usr_len; + __le16 flags; + __le16 sg_cnt; + struct rtrs_sg_desc desc[]; +}; + +/** + * struct_msg_rdma_write - Message transferred to server with RDMA-Write + * @type: always @RTRS_MSG_WRITE + * @usr_len: length of user payload + */ +struct rtrs_msg_rdma_write { + __le16 type; + __le16 usr_len; +}; + +/** + * struct_msg_rdma_hdr - header for read or write request + * @type: @RTRS_MSG_WRITE | @RTRS_MSG_READ + */ +struct rtrs_msg_rdma_hdr { + __le16 type; +}; + +/* rtrs.c */ + +struct rtrs_iu *rtrs_iu_alloc(u32 queue_size, size_t size, gfp_t t, + struct ib_device *dev, enum dma_data_direction, + void (*done)(struct ib_cq *cq, struct ib_wc *wc)); +void rtrs_iu_free(struct rtrs_iu *iu, enum dma_data_direction dir, + struct ib_device *dev, u32 queue_size); +int rtrs_iu_post_recv(struct rtrs_con *con, struct rtrs_iu *iu); +int rtrs_iu_post_send(struct rtrs_con *con, struct rtrs_iu *iu, size_t size, + struct ib_send_wr *head); +int rtrs_iu_post_rdma_write_imm(struct rtrs_con *con, struct rtrs_iu *iu, + struct ib_sge *sge, unsigned int num_sge, + u32 rkey, u64 rdma_addr, u32 imm_data, + enum ib_send_flags flags, + struct ib_send_wr *head); + +int rtrs_post_recv_empty(struct rtrs_con *con, struct ib_cqe *cqe); +int rtrs_post_rdma_write_imm_empty(struct rtrs_con *con, struct ib_cqe *cqe, + u32 imm_data, enum ib_send_flags flags, + struct ib_send_wr *head); + +int rtrs_cq_qp_create(struct rtrs_sess *rtrs_sess, struct rtrs_con *con, + u32 max_send_sge, int cq_vector, u16 cq_size, + u16 wr_queue_size, enum ib_poll_context poll_ctx); +void rtrs_cq_qp_destroy(struct rtrs_con *con); + +void rtrs_init_hb(struct rtrs_sess *sess, struct ib_cqe *cqe, + unsigned int interval_ms, unsigned int missed_max, + void (*err_handler)(struct rtrs_con *con), + struct workqueue_struct *wq); +void rtrs_start_hb(struct rtrs_sess *sess); +void rtrs_stop_hb(struct rtrs_sess *sess); +void rtrs_send_hb_ack(struct rtrs_sess *sess); + +void rtrs_rdma_dev_pd_init(enum ib_pd_flags pd_flags, + struct rtrs_rdma_dev_pd *pool); +void rtrs_rdma_dev_pd_deinit(struct rtrs_rdma_dev_pd *pool); + +struct rtrs_ib_dev *rtrs_ib_dev_find_or_add(struct ib_device *ib_dev, + struct rtrs_rdma_dev_pd *pool); +int rtrs_ib_dev_put(struct rtrs_ib_dev *dev); + +static inline u32 rtrs_to_imm(u32 type, u32 payload) +{ + BUILD_BUG_ON(MAX_IMM_PAYL_BITS + MAX_IMM_TYPE_BITS != 32); + BUILD_BUG_ON(RTRS_LAST_IMM > (1<> MAX_IMM_PAYL_BITS; +} + +static inline u32 rtrs_to_io_req_imm(u32 addr) +{ + return rtrs_to_imm(RTRS_IO_REQ_IMM, addr); +} + +static inline u32 rtrs_to_io_rsp_imm(u32 msg_id, int errno, bool w_inval) +{ + enum rtrs_imm_type type; + u32 payload; + + /* 9 bits for errno, 19 bits for msg_id */ + payload = (abs(errno) & 0x1ff) << 19 | (msg_id & 0x7ffff); + type = w_inval ? RTRS_IO_RSP_W_INV_IMM : RTRS_IO_RSP_IMM; + + return rtrs_to_imm(type, payload); +} + +static inline void rtrs_from_io_rsp_imm(u32 payload, u32 *msg_id, int *errno) +{ + /* 9 bits for errno, 19 bits for msg_id */ + *msg_id = payload & 0x7ffff; + *errno = -(int)((payload >> 19) & 0x1ff); +} + +#define STAT_STORE_FUNC(type, set_value, reset) \ +static ssize_t set_value##_store(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + int ret = -EINVAL; \ + type *stats = container_of(kobj, type, kobj_stats); \ + \ + if (sysfs_streq(buf, "1")) \ + ret = reset(stats, true); \ + else if (sysfs_streq(buf, "0")) \ + ret = reset(stats, false); \ + if (ret) \ + return ret; \ + \ + return count; \ +} + +#define STAT_SHOW_FUNC(type, get_value, print) \ +static ssize_t get_value##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + type *stats = container_of(kobj, type, kobj_stats); \ + \ + return print(stats, page, PAGE_SIZE); \ +} + +#define STAT_ATTR(type, stat, print, reset) \ +STAT_STORE_FUNC(type, stat, reset) \ +STAT_SHOW_FUNC(type, stat, print) \ +static struct kobj_attribute stat##_attr = __ATTR_RW(stat) + +#endif /* RTRS_PRI_H */ -- cgit v1.2.3 From c0894b3ea69d35995bd220020b20570c45a1e6b4 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:10 +0200 Subject: RDMA/rtrs: core: lib functions shared between client and server modules This is a set of library functions existing as a rtrs-core module, used by client and server modules. Mainly these functions wrap IB and RDMA calls and provide a bit higher abstraction for implementing of RTRS protocol on client or server sides. Link: https://lore.kernel.org/r/20200511135131.27580-5-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs.c | 612 +++++++++++++++++++++++++++++++++++++ 1 file changed, 612 insertions(+) create mode 100644 drivers/infiniband/ulp/rtrs/rtrs.c diff --git a/drivers/infiniband/ulp/rtrs/rtrs.c b/drivers/infiniband/ulp/rtrs/rtrs.c new file mode 100644 index 000000000000..ff1093d6e4bc --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/rtrs.c @@ -0,0 +1,612 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Transport Layer + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include + +#include "rtrs-pri.h" +#include "rtrs-log.h" + +MODULE_DESCRIPTION("RDMA Transport Core"); +MODULE_LICENSE("GPL"); + +struct rtrs_iu *rtrs_iu_alloc(u32 queue_size, size_t size, gfp_t gfp_mask, + struct ib_device *dma_dev, + enum dma_data_direction dir, + void (*done)(struct ib_cq *cq, struct ib_wc *wc)) +{ + struct rtrs_iu *ius, *iu; + int i; + + ius = kcalloc(queue_size, sizeof(*ius), gfp_mask); + if (!ius) + return NULL; + for (i = 0; i < queue_size; i++) { + iu = &ius[i]; + iu->buf = kzalloc(size, gfp_mask); + if (!iu->buf) + goto err; + + iu->dma_addr = ib_dma_map_single(dma_dev, iu->buf, size, dir); + if (ib_dma_mapping_error(dma_dev, iu->dma_addr)) + goto err; + + iu->cqe.done = done; + iu->size = size; + iu->direction = dir; + } + return ius; +err: + rtrs_iu_free(ius, dir, dma_dev, i); + return NULL; +} +EXPORT_SYMBOL_GPL(rtrs_iu_alloc); + +void rtrs_iu_free(struct rtrs_iu *ius, enum dma_data_direction dir, + struct ib_device *ibdev, u32 queue_size) +{ + struct rtrs_iu *iu; + int i; + + if (!ius) + return; + + for (i = 0; i < queue_size; i++) { + iu = &ius[i]; + ib_dma_unmap_single(ibdev, iu->dma_addr, iu->size, dir); + kfree(iu->buf); + } + kfree(ius); +} +EXPORT_SYMBOL_GPL(rtrs_iu_free); + +int rtrs_iu_post_recv(struct rtrs_con *con, struct rtrs_iu *iu) +{ + struct rtrs_sess *sess = con->sess; + struct ib_recv_wr wr; + struct ib_sge list; + + list.addr = iu->dma_addr; + list.length = iu->size; + list.lkey = sess->dev->ib_pd->local_dma_lkey; + + if (list.length == 0) { + rtrs_wrn(con->sess, + "Posting receive work request failed, sg list is empty\n"); + return -EINVAL; + } + wr = (struct ib_recv_wr) { + .wr_cqe = &iu->cqe, + .sg_list = &list, + .num_sge = 1, + }; + + return ib_post_recv(con->qp, &wr, NULL); +} +EXPORT_SYMBOL_GPL(rtrs_iu_post_recv); + +int rtrs_post_recv_empty(struct rtrs_con *con, struct ib_cqe *cqe) +{ + struct ib_recv_wr wr; + + wr = (struct ib_recv_wr) { + .wr_cqe = cqe, + }; + + return ib_post_recv(con->qp, &wr, NULL); +} +EXPORT_SYMBOL_GPL(rtrs_post_recv_empty); + +int rtrs_iu_post_send(struct rtrs_con *con, struct rtrs_iu *iu, size_t size, + struct ib_send_wr *head) +{ + struct rtrs_sess *sess = con->sess; + struct ib_send_wr wr; + struct ib_sge list; + + if (WARN_ON(size == 0)) + return -EINVAL; + + list.addr = iu->dma_addr; + list.length = size; + list.lkey = sess->dev->ib_pd->local_dma_lkey; + + wr = (struct ib_send_wr) { + .wr_cqe = &iu->cqe, + .sg_list = &list, + .num_sge = 1, + .opcode = IB_WR_SEND, + .send_flags = IB_SEND_SIGNALED, + }; + + if (head) { + struct ib_send_wr *tail = head; + + while (tail->next) + tail = tail->next; + tail->next = ≀ + } else { + head = ≀ + } + + return ib_post_send(con->qp, head, NULL); +} +EXPORT_SYMBOL_GPL(rtrs_iu_post_send); + +int rtrs_iu_post_rdma_write_imm(struct rtrs_con *con, struct rtrs_iu *iu, + struct ib_sge *sge, unsigned int num_sge, + u32 rkey, u64 rdma_addr, u32 imm_data, + enum ib_send_flags flags, + struct ib_send_wr *head) +{ + struct ib_rdma_wr wr; + int i; + + wr = (struct ib_rdma_wr) { + .wr.wr_cqe = &iu->cqe, + .wr.sg_list = sge, + .wr.num_sge = num_sge, + .rkey = rkey, + .remote_addr = rdma_addr, + .wr.opcode = IB_WR_RDMA_WRITE_WITH_IMM, + .wr.ex.imm_data = cpu_to_be32(imm_data), + .wr.send_flags = flags, + }; + + /* + * If one of the sges has 0 size, the operation will fail with a + * length error + */ + for (i = 0; i < num_sge; i++) + if (WARN_ON(sge[i].length == 0)) + return -EINVAL; + + if (head) { + struct ib_send_wr *tail = head; + + while (tail->next) + tail = tail->next; + tail->next = &wr.wr; + } else { + head = &wr.wr; + } + + return ib_post_send(con->qp, head, NULL); +} +EXPORT_SYMBOL_GPL(rtrs_iu_post_rdma_write_imm); + +int rtrs_post_rdma_write_imm_empty(struct rtrs_con *con, struct ib_cqe *cqe, + u32 imm_data, enum ib_send_flags flags, + struct ib_send_wr *head) +{ + struct ib_send_wr wr; + + wr = (struct ib_send_wr) { + .wr_cqe = cqe, + .send_flags = flags, + .opcode = IB_WR_RDMA_WRITE_WITH_IMM, + .ex.imm_data = cpu_to_be32(imm_data), + }; + + if (head) { + struct ib_send_wr *tail = head; + + while (tail->next) + tail = tail->next; + tail->next = ≀ + } else { + head = ≀ + } + + return ib_post_send(con->qp, head, NULL); +} +EXPORT_SYMBOL_GPL(rtrs_post_rdma_write_imm_empty); + +static void qp_event_handler(struct ib_event *ev, void *ctx) +{ + struct rtrs_con *con = ctx; + + switch (ev->event) { + case IB_EVENT_COMM_EST: + rtrs_info(con->sess, "QP event %s (%d) received\n", + ib_event_msg(ev->event), ev->event); + rdma_notify(con->cm_id, IB_EVENT_COMM_EST); + break; + default: + rtrs_info(con->sess, "Unhandled QP event %s (%d) received\n", + ib_event_msg(ev->event), ev->event); + break; + } +} + +static int create_cq(struct rtrs_con *con, int cq_vector, u16 cq_size, + enum ib_poll_context poll_ctx) +{ + struct rdma_cm_id *cm_id = con->cm_id; + struct ib_cq *cq; + + cq = ib_alloc_cq(cm_id->device, con, cq_size, + cq_vector, poll_ctx); + if (IS_ERR(cq)) { + rtrs_err(con->sess, "Creating completion queue failed, errno: %ld\n", + PTR_ERR(cq)); + return PTR_ERR(cq); + } + con->cq = cq; + + return 0; +} + +static int create_qp(struct rtrs_con *con, struct ib_pd *pd, + u16 wr_queue_size, u32 max_sge) +{ + struct ib_qp_init_attr init_attr = {NULL}; + struct rdma_cm_id *cm_id = con->cm_id; + int ret; + + init_attr.cap.max_send_wr = wr_queue_size; + init_attr.cap.max_recv_wr = wr_queue_size; + init_attr.cap.max_recv_sge = 1; + init_attr.event_handler = qp_event_handler; + init_attr.qp_context = con; + init_attr.cap.max_send_sge = max_sge; + + init_attr.qp_type = IB_QPT_RC; + init_attr.send_cq = con->cq; + init_attr.recv_cq = con->cq; + init_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + + ret = rdma_create_qp(cm_id, pd, &init_attr); + if (ret) { + rtrs_err(con->sess, "Creating QP failed, err: %d\n", ret); + return ret; + } + con->qp = cm_id->qp; + + return ret; +} + +int rtrs_cq_qp_create(struct rtrs_sess *sess, struct rtrs_con *con, + u32 max_send_sge, int cq_vector, u16 cq_size, + u16 wr_queue_size, enum ib_poll_context poll_ctx) +{ + int err; + + err = create_cq(con, cq_vector, cq_size, poll_ctx); + if (err) + return err; + + err = create_qp(con, sess->dev->ib_pd, wr_queue_size, max_send_sge); + if (err) { + ib_free_cq(con->cq); + con->cq = NULL; + return err; + } + con->sess = sess; + + return 0; +} +EXPORT_SYMBOL_GPL(rtrs_cq_qp_create); + +void rtrs_cq_qp_destroy(struct rtrs_con *con) +{ + if (con->qp) { + rdma_destroy_qp(con->cm_id); + con->qp = NULL; + } + if (con->cq) { + ib_free_cq(con->cq); + con->cq = NULL; + } +} +EXPORT_SYMBOL_GPL(rtrs_cq_qp_destroy); + +static void schedule_hb(struct rtrs_sess *sess) +{ + queue_delayed_work(sess->hb_wq, &sess->hb_dwork, + msecs_to_jiffies(sess->hb_interval_ms)); +} + +void rtrs_send_hb_ack(struct rtrs_sess *sess) +{ + struct rtrs_con *usr_con = sess->con[0]; + u32 imm; + int err; + + imm = rtrs_to_imm(RTRS_HB_ACK_IMM, 0); + err = rtrs_post_rdma_write_imm_empty(usr_con, sess->hb_cqe, imm, + IB_SEND_SIGNALED, NULL); + if (err) { + sess->hb_err_handler(usr_con); + return; + } +} +EXPORT_SYMBOL_GPL(rtrs_send_hb_ack); + +static void hb_work(struct work_struct *work) +{ + struct rtrs_con *usr_con; + struct rtrs_sess *sess; + u32 imm; + int err; + + sess = container_of(to_delayed_work(work), typeof(*sess), hb_dwork); + usr_con = sess->con[0]; + + if (sess->hb_missed_cnt > sess->hb_missed_max) { + sess->hb_err_handler(usr_con); + return; + } + if (sess->hb_missed_cnt++) { + /* Reschedule work without sending hb */ + schedule_hb(sess); + return; + } + imm = rtrs_to_imm(RTRS_HB_MSG_IMM, 0); + err = rtrs_post_rdma_write_imm_empty(usr_con, sess->hb_cqe, imm, + IB_SEND_SIGNALED, NULL); + if (err) { + sess->hb_err_handler(usr_con); + return; + } + + schedule_hb(sess); +} + +void rtrs_init_hb(struct rtrs_sess *sess, struct ib_cqe *cqe, + unsigned int interval_ms, unsigned int missed_max, + void (*err_handler)(struct rtrs_con *con), + struct workqueue_struct *wq) +{ + sess->hb_cqe = cqe; + sess->hb_interval_ms = interval_ms; + sess->hb_err_handler = err_handler; + sess->hb_wq = wq; + sess->hb_missed_max = missed_max; + sess->hb_missed_cnt = 0; + INIT_DELAYED_WORK(&sess->hb_dwork, hb_work); +} +EXPORT_SYMBOL_GPL(rtrs_init_hb); + +void rtrs_start_hb(struct rtrs_sess *sess) +{ + schedule_hb(sess); +} +EXPORT_SYMBOL_GPL(rtrs_start_hb); + +void rtrs_stop_hb(struct rtrs_sess *sess) +{ + cancel_delayed_work_sync(&sess->hb_dwork); + sess->hb_missed_cnt = 0; + sess->hb_missed_max = 0; +} +EXPORT_SYMBOL_GPL(rtrs_stop_hb); + +static int rtrs_str_gid_to_sockaddr(const char *addr, size_t len, + short port, struct sockaddr_storage *dst) +{ + struct sockaddr_ib *dst_ib = (struct sockaddr_ib *)dst; + int ret; + + /* + * We can use some of the IPv6 functions since GID is a valid + * IPv6 address format + */ + ret = in6_pton(addr, len, dst_ib->sib_addr.sib_raw, '\0', NULL); + if (ret == 0) + return -EINVAL; + + dst_ib->sib_family = AF_IB; + /* + * Use the same TCP server port number as the IB service ID + * on the IB port space range + */ + dst_ib->sib_sid = cpu_to_be64(RDMA_IB_IP_PS_IB | port); + dst_ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL); + dst_ib->sib_pkey = cpu_to_be16(0xffff); + + return 0; +} + +/** + * rtrs_str_to_sockaddr() - Convert rtrs address string to sockaddr + * @addr: String representation of an addr (IPv4, IPv6 or IB GID): + * - "ip:192.168.1.1" + * - "ip:fe80::200:5aee:feaa:20a2" + * - "gid:fe80::200:5aee:feaa:20a2" + * @len: String address length + * @port: Destination port + * @dst: Destination sockaddr structure + * + * Returns 0 if conversion successful. Non-zero on error. + */ +static int rtrs_str_to_sockaddr(const char *addr, size_t len, + u16 port, struct sockaddr_storage *dst) +{ + if (strncmp(addr, "gid:", 4) == 0) { + return rtrs_str_gid_to_sockaddr(addr + 4, len - 4, port, dst); + } else if (strncmp(addr, "ip:", 3) == 0) { + char port_str[8]; + char *cpy; + int err; + + snprintf(port_str, sizeof(port_str), "%u", port); + cpy = kstrndup(addr + 3, len - 3, GFP_KERNEL); + err = cpy ? inet_pton_with_scope(&init_net, AF_UNSPEC, + cpy, port_str, dst) : -ENOMEM; + kfree(cpy); + + return err; + } + return -EPROTONOSUPPORT; +} + +/** + * sockaddr_to_str() - convert sockaddr to a string. + * @addr: the sockadddr structure to be converted. + * @buf: string containing socket addr. + * @len: string length. + * + * The return value is the number of characters written into buf not + * including the trailing '\0'. If len is == 0 the function returns 0.. + */ +int sockaddr_to_str(const struct sockaddr *addr, char *buf, size_t len) +{ + + switch (addr->sa_family) { + case AF_IB: + return scnprintf(buf, len, "gid:%pI6", + &((struct sockaddr_ib *)addr)->sib_addr.sib_raw); + case AF_INET: + return scnprintf(buf, len, "ip:%pI4", + &((struct sockaddr_in *)addr)->sin_addr); + case AF_INET6: + return scnprintf(buf, len, "ip:%pI6c", + &((struct sockaddr_in6 *)addr)->sin6_addr); + } + return scnprintf(buf, len, ""); +} +EXPORT_SYMBOL(sockaddr_to_str); + +/** + * rtrs_addr_to_sockaddr() - convert path string "src,dst" or "src@dst" + * to sockaddreses + * @str: string containing source and destination addr of a path + * separated by ',' or '@' I.e. "ip:1.1.1.1,ip:1.1.1.2" or + * "ip:1.1.1.1@ip:1.1.1.2". If str contains only one address it's + * considered to be destination. + * @len: string length + * @port: Destination port number. + * @addr: will be set to the source/destination address or to NULL + * if str doesn't contain any source address. + * + * Returns zero if conversion successful. Non-zero otherwise. + */ +int rtrs_addr_to_sockaddr(const char *str, size_t len, u16 port, + struct rtrs_addr *addr) +{ + const char *d; + + d = strchr(str, ','); + if (!d) + d = strchr(str, '@'); + if (d) { + if (rtrs_str_to_sockaddr(str, d - str, 0, addr->src)) + return -EINVAL; + d += 1; + len -= d - str; + str = d; + + } else { + addr->src = NULL; + } + return rtrs_str_to_sockaddr(str, len, port, addr->dst); +} +EXPORT_SYMBOL(rtrs_addr_to_sockaddr); + +void rtrs_rdma_dev_pd_init(enum ib_pd_flags pd_flags, + struct rtrs_rdma_dev_pd *pool) +{ + WARN_ON(pool->ops && (!pool->ops->alloc ^ !pool->ops->free)); + INIT_LIST_HEAD(&pool->list); + mutex_init(&pool->mutex); + pool->pd_flags = pd_flags; +} +EXPORT_SYMBOL(rtrs_rdma_dev_pd_init); + +void rtrs_rdma_dev_pd_deinit(struct rtrs_rdma_dev_pd *pool) +{ + mutex_destroy(&pool->mutex); + WARN_ON(!list_empty(&pool->list)); +} +EXPORT_SYMBOL(rtrs_rdma_dev_pd_deinit); + +static void dev_free(struct kref *ref) +{ + struct rtrs_rdma_dev_pd *pool; + struct rtrs_ib_dev *dev; + + dev = container_of(ref, typeof(*dev), ref); + pool = dev->pool; + + mutex_lock(&pool->mutex); + list_del(&dev->entry); + mutex_unlock(&pool->mutex); + + if (pool->ops && pool->ops->deinit) + pool->ops->deinit(dev); + + ib_dealloc_pd(dev->ib_pd); + + if (pool->ops && pool->ops->free) + pool->ops->free(dev); + else + kfree(dev); +} + +int rtrs_ib_dev_put(struct rtrs_ib_dev *dev) +{ + return kref_put(&dev->ref, dev_free); +} +EXPORT_SYMBOL(rtrs_ib_dev_put); + +static int rtrs_ib_dev_get(struct rtrs_ib_dev *dev) +{ + return kref_get_unless_zero(&dev->ref); +} + +struct rtrs_ib_dev * +rtrs_ib_dev_find_or_add(struct ib_device *ib_dev, + struct rtrs_rdma_dev_pd *pool) +{ + struct rtrs_ib_dev *dev; + + mutex_lock(&pool->mutex); + list_for_each_entry(dev, &pool->list, entry) { + if (dev->ib_dev->node_guid == ib_dev->node_guid && + rtrs_ib_dev_get(dev)) + goto out_unlock; + } + mutex_unlock(&pool->mutex); + if (pool->ops && pool->ops->alloc) + dev = pool->ops->alloc(); + else + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (IS_ERR_OR_NULL(dev)) + goto out_err; + + kref_init(&dev->ref); + dev->pool = pool; + dev->ib_dev = ib_dev; + dev->ib_pd = ib_alloc_pd(ib_dev, pool->pd_flags); + if (IS_ERR(dev->ib_pd)) + goto out_free_dev; + + if (pool->ops && pool->ops->init && pool->ops->init(dev)) + goto out_free_pd; + + mutex_lock(&pool->mutex); + list_add(&dev->entry, &pool->list); +out_unlock: + mutex_unlock(&pool->mutex); + return dev; + +out_free_pd: + ib_dealloc_pd(dev->ib_pd); +out_free_dev: + if (pool->ops && pool->ops->free) + pool->ops->free(dev); + else + kfree(dev); +out_err: + return NULL; +} +EXPORT_SYMBOL(rtrs_ib_dev_find_or_add); -- cgit v1.2.3 From cb80329c9434c64493789e7ea5b1f2957021ce61 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:11 +0200 Subject: RDMA/rtrs: client: private header with client structs and functions This header describes main structs and functions used by rtrs-client module, mainly for managing rtrs sessions, creating/destroying sysfs entries, accounting statistics on client side. Link: https://lore.kernel.org/r/20200511135131.27580-6-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt.h | 251 +++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 drivers/infiniband/ulp/rtrs/rtrs-clt.h diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.h b/drivers/infiniband/ulp/rtrs/rtrs-clt.h new file mode 100644 index 000000000000..039a2ebba2f9 --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Transport Layer + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ + +#ifndef RTRS_CLT_H +#define RTRS_CLT_H + +#include +#include "rtrs-pri.h" + +/** + * enum rtrs_clt_state - Client states. + */ +enum rtrs_clt_state { + RTRS_CLT_CONNECTING, + RTRS_CLT_CONNECTING_ERR, + RTRS_CLT_RECONNECTING, + RTRS_CLT_CONNECTED, + RTRS_CLT_CLOSING, + RTRS_CLT_CLOSED, + RTRS_CLT_DEAD, +}; + +enum rtrs_mp_policy { + MP_POLICY_RR, + MP_POLICY_MIN_INFLIGHT, +}; + +/* see Documentation/ABI/testing/sysfs-class-rtrs-client for details */ +struct rtrs_clt_stats_reconnects { + int successful_cnt; + int fail_cnt; +}; + +/* see Documentation/ABI/testing/sysfs-class-rtrs-client for details */ +struct rtrs_clt_stats_cpu_migr { + atomic_t from; + int to; +}; + +/* stats for Read and write operation. + * see Documentation/ABI/testing/sysfs-class-rtrs-client for details + */ +struct rtrs_clt_stats_rdma { + struct { + u64 cnt; + u64 size_total; + } dir[2]; + + u64 failover_cnt; +}; + +struct rtrs_clt_stats_pcpu { + struct rtrs_clt_stats_cpu_migr cpu_migr; + struct rtrs_clt_stats_rdma rdma; +}; + +struct rtrs_clt_stats { + struct kobject kobj_stats; + struct rtrs_clt_stats_pcpu __percpu *pcpu_stats; + struct rtrs_clt_stats_reconnects reconnects; + atomic_t inflight; +}; + +struct rtrs_clt_con { + struct rtrs_con c; + struct rtrs_iu *rsp_ius; + u32 queue_size; + unsigned int cpu; + atomic_t io_cnt; + int cm_err; +}; + +/** + * rtrs_permit - permits the memory allocation for future RDMA operation. + * Combine with irq pinning to keep IO on same CPU. + */ +struct rtrs_permit { + enum rtrs_clt_con_type con_type; + unsigned int cpu_id; + unsigned int mem_id; + unsigned int mem_off; +}; + +/** + * rtrs_clt_io_req - describes one inflight IO request + */ +struct rtrs_clt_io_req { + struct list_head list; + struct rtrs_iu *iu; + struct scatterlist *sglist; /* list holding user data */ + unsigned int sg_cnt; + unsigned int sg_size; + unsigned int data_len; + unsigned int usr_len; + void *priv; + bool in_use; + struct rtrs_clt_con *con; + struct rtrs_sg_desc *desc; + struct ib_sge *sge; + struct rtrs_permit *permit; + enum dma_data_direction dir; + void (*conf)(void *priv, int errno); + unsigned long start_jiffies; + + struct ib_mr *mr; + struct ib_cqe inv_cqe; + struct completion inv_comp; + int inv_errno; + bool need_inv_comp; + bool need_inv; +}; + +struct rtrs_rbuf { + u64 addr; + u32 rkey; +}; + +struct rtrs_clt_sess { + struct rtrs_sess s; + struct rtrs_clt *clt; + wait_queue_head_t state_wq; + enum rtrs_clt_state state; + atomic_t connected_cnt; + struct mutex init_mutex; + struct rtrs_clt_io_req *reqs; + struct delayed_work reconnect_dwork; + struct work_struct close_work; + unsigned int reconnect_attempts; + bool established; + struct rtrs_rbuf *rbufs; + size_t max_io_size; + u32 max_hdr_size; + u32 chunk_size; + size_t queue_depth; + u32 max_pages_per_mr; + int max_send_sge; + u32 flags; + struct kobject kobj; + struct rtrs_clt_stats *stats; + /* cache hca_port and hca_name to display in sysfs */ + u8 hca_port; + char hca_name[IB_DEVICE_NAME_MAX]; + struct list_head __percpu + *mp_skip_entry; +}; + +struct rtrs_clt { + struct list_head paths_list; /* rcu protected list */ + size_t paths_num; + struct rtrs_clt_sess + __rcu * __percpu *pcpu_path; + uuid_t paths_uuid; + int paths_up; + struct mutex paths_mutex; + struct mutex paths_ev_mutex; + char sessname[NAME_MAX]; + u16 port; + unsigned int max_reconnect_attempts; + unsigned int reconnect_delay_sec; + unsigned int max_segments; + void *permits; + unsigned long *permits_map; + size_t queue_depth; + size_t max_io_size; + wait_queue_head_t permits_wait; + size_t pdu_sz; + void *priv; + void (*link_ev)(void *priv, + enum rtrs_clt_link_ev ev); + struct device dev; + struct kobject *kobj_paths; + enum rtrs_mp_policy mp_policy; +}; + +static inline struct rtrs_clt_con *to_clt_con(struct rtrs_con *c) +{ + return container_of(c, struct rtrs_clt_con, c); +} + +static inline struct rtrs_clt_sess *to_clt_sess(struct rtrs_sess *s) +{ + return container_of(s, struct rtrs_clt_sess, s); +} + +static inline int permit_size(struct rtrs_clt *clt) +{ + return sizeof(struct rtrs_permit) + clt->pdu_sz; +} + +static inline struct rtrs_permit *get_permit(struct rtrs_clt *clt, int idx) +{ + return (struct rtrs_permit *)(clt->permits + permit_size(clt) * idx); +} + +int rtrs_clt_reconnect_from_sysfs(struct rtrs_clt_sess *sess); +int rtrs_clt_disconnect_from_sysfs(struct rtrs_clt_sess *sess); +int rtrs_clt_create_path_from_sysfs(struct rtrs_clt *clt, + struct rtrs_addr *addr); +int rtrs_clt_remove_path_from_sysfs(struct rtrs_clt_sess *sess, + const struct attribute *sysfs_self); + +void rtrs_clt_set_max_reconnect_attempts(struct rtrs_clt *clt, int value); +int rtrs_clt_get_max_reconnect_attempts(const struct rtrs_clt *clt); +void free_sess(struct rtrs_clt_sess *sess); + +/* rtrs-clt-stats.c */ + +int rtrs_clt_init_stats(struct rtrs_clt_stats *stats); + +void rtrs_clt_inc_failover_cnt(struct rtrs_clt_stats *s); + +void rtrs_clt_update_wc_stats(struct rtrs_clt_con *con); +void rtrs_clt_update_all_stats(struct rtrs_clt_io_req *req, int dir); + +int rtrs_clt_reset_rdma_lat_distr_stats(struct rtrs_clt_stats *stats, + bool enable); +ssize_t rtrs_clt_stats_rdma_lat_distr_to_str(struct rtrs_clt_stats *stats, + char *page, size_t len); +int rtrs_clt_reset_cpu_migr_stats(struct rtrs_clt_stats *stats, bool enable); +int rtrs_clt_stats_migration_cnt_to_str(struct rtrs_clt_stats *stats, char *buf, + size_t len); +int rtrs_clt_reset_reconnects_stat(struct rtrs_clt_stats *stats, bool enable); +int rtrs_clt_stats_reconnects_to_str(struct rtrs_clt_stats *stats, char *buf, + size_t len); +int rtrs_clt_reset_wc_comp_stats(struct rtrs_clt_stats *stats, bool enable); +int rtrs_clt_stats_wc_completion_to_str(struct rtrs_clt_stats *stats, char *buf, + size_t len); +int rtrs_clt_reset_rdma_stats(struct rtrs_clt_stats *stats, bool enable); +ssize_t rtrs_clt_stats_rdma_to_str(struct rtrs_clt_stats *stats, + char *page, size_t len); +int rtrs_clt_reset_all_stats(struct rtrs_clt_stats *stats, bool enable); +ssize_t rtrs_clt_reset_all_help(struct rtrs_clt_stats *stats, + char *page, size_t len); + +/* rtrs-clt-sysfs.c */ + +int rtrs_clt_create_sysfs_root_files(struct rtrs_clt *clt); +void rtrs_clt_destroy_sysfs_root_folders(struct rtrs_clt *clt); +void rtrs_clt_destroy_sysfs_root_files(struct rtrs_clt *clt); + +int rtrs_clt_create_sess_files(struct rtrs_clt_sess *sess); +void rtrs_clt_destroy_sess_files(struct rtrs_clt_sess *sess, + const struct attribute *sysfs_self); + +#endif /* RTRS_CLT_H */ -- cgit v1.2.3 From 6a98d71daea186247005099758af549e6afdd244 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:12 +0200 Subject: RDMA/rtrs: client: main functionality This is main functionality of rtrs-client module, which manages set of RDMA connections for each rtrs session, does multipathing, load balancing and failover of RDMA requests. Link: https://lore.kernel.org/r/20200511135131.27580-7-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 2994 ++++++++++++++++++++++++++++++++ 1 file changed, 2994 insertions(+) create mode 100644 drivers/infiniband/ulp/rtrs/rtrs-clt.c diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c new file mode 100644 index 000000000000..468fdd0d8713 --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -0,0 +1,2994 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Transport Layer + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include /* for BLK_MAX_SEGMENT_SIZE */ + +#include "rtrs-clt.h" +#include "rtrs-log.h" + +#define RTRS_CONNECT_TIMEOUT_MS 30000 +/* + * Wait a bit before trying to reconnect after a failure + * in order to give server time to finish clean up which + * leads to "false positives" failed reconnect attempts + */ +#define RTRS_RECONNECT_BACKOFF 1000 + +MODULE_DESCRIPTION("RDMA Transport Client"); +MODULE_LICENSE("GPL"); + +static const struct rtrs_rdma_dev_pd_ops dev_pd_ops; +static struct rtrs_rdma_dev_pd dev_pd = { + .ops = &dev_pd_ops +}; + +static struct workqueue_struct *rtrs_wq; +static struct class *rtrs_clt_dev_class; + +static inline bool rtrs_clt_is_connected(const struct rtrs_clt *clt) +{ + struct rtrs_clt_sess *sess; + bool connected = false; + + rcu_read_lock(); + list_for_each_entry_rcu(sess, &clt->paths_list, s.entry) + connected |= READ_ONCE(sess->state) == RTRS_CLT_CONNECTED; + rcu_read_unlock(); + + return connected; +} + +static struct rtrs_permit * +__rtrs_get_permit(struct rtrs_clt *clt, enum rtrs_clt_con_type con_type) +{ + size_t max_depth = clt->queue_depth; + struct rtrs_permit *permit; + int bit; + + /* + * Adapted from null_blk get_tag(). Callers from different cpus may + * grab the same bit, since find_first_zero_bit is not atomic. + * But then the test_and_set_bit_lock will fail for all the + * callers but one, so that they will loop again. + * This way an explicit spinlock is not required. + */ + do { + bit = find_first_zero_bit(clt->permits_map, max_depth); + if (unlikely(bit >= max_depth)) + return NULL; + } while (unlikely(test_and_set_bit_lock(bit, clt->permits_map))); + + permit = get_permit(clt, bit); + WARN_ON(permit->mem_id != bit); + permit->cpu_id = raw_smp_processor_id(); + permit->con_type = con_type; + + return permit; +} + +static inline void __rtrs_put_permit(struct rtrs_clt *clt, + struct rtrs_permit *permit) +{ + clear_bit_unlock(permit->mem_id, clt->permits_map); +} + +/** + * rtrs_clt_get_permit() - allocates permit for future RDMA operation + * @clt: Current session + * @con_type: Type of connection to use with the permit + * @can_wait: Wait type + * + * Description: + * Allocates permit for the following RDMA operation. Permit is used + * to preallocate all resources and to propagate memory pressure + * up earlier. + * + * Context: + * Can sleep if @wait == RTRS_TAG_WAIT + */ +struct rtrs_permit *rtrs_clt_get_permit(struct rtrs_clt *clt, + enum rtrs_clt_con_type con_type, + int can_wait) +{ + struct rtrs_permit *permit; + DEFINE_WAIT(wait); + + permit = __rtrs_get_permit(clt, con_type); + if (likely(permit) || !can_wait) + return permit; + + do { + prepare_to_wait(&clt->permits_wait, &wait, + TASK_UNINTERRUPTIBLE); + permit = __rtrs_get_permit(clt, con_type); + if (likely(permit)) + break; + + io_schedule(); + } while (1); + + finish_wait(&clt->permits_wait, &wait); + + return permit; +} +EXPORT_SYMBOL(rtrs_clt_get_permit); + +/** + * rtrs_clt_put_permit() - puts allocated permit + * @clt: Current session + * @permit: Permit to be freed + * + * Context: + * Does not matter + */ +void rtrs_clt_put_permit(struct rtrs_clt *clt, struct rtrs_permit *permit) +{ + if (WARN_ON(!test_bit(permit->mem_id, clt->permits_map))) + return; + + __rtrs_put_permit(clt, permit); + + /* + * rtrs_clt_get_permit() adds itself to the &clt->permits_wait list + * before calling schedule(). So if rtrs_clt_get_permit() is sleeping + * it must have added itself to &clt->permits_wait before + * __rtrs_put_permit() finished. + * Hence it is safe to guard wake_up() with a waitqueue_active() test. + */ + if (waitqueue_active(&clt->permits_wait)) + wake_up(&clt->permits_wait); +} +EXPORT_SYMBOL(rtrs_clt_put_permit); + +void *rtrs_permit_to_pdu(struct rtrs_permit *permit) +{ + return permit + 1; +} +EXPORT_SYMBOL(rtrs_permit_to_pdu); + +/** + * rtrs_permit_to_clt_con() - returns RDMA connection pointer by the permit + * @sess: client session pointer + * @permit: permit for the allocation of the RDMA buffer + * Note: + * IO connection starts from 1. + * 0 connection is for user messages. + */ +static +struct rtrs_clt_con *rtrs_permit_to_clt_con(struct rtrs_clt_sess *sess, + struct rtrs_permit *permit) +{ + int id = 0; + + if (likely(permit->con_type == RTRS_IO_CON)) + id = (permit->cpu_id % (sess->s.con_num - 1)) + 1; + + return to_clt_con(sess->s.con[id]); +} + +/** + * __rtrs_clt_change_state() - change the session state through session state + * machine. + * + * @sess: client session to change the state of. + * @new_state: state to change to. + * + * returns true if successful, false if the requested state can not be set. + * + * Locks: + * state_wq lock must be hold. + */ +static bool __rtrs_clt_change_state(struct rtrs_clt_sess *sess, + enum rtrs_clt_state new_state) +{ + enum rtrs_clt_state old_state; + bool changed = false; + + lockdep_assert_held(&sess->state_wq.lock); + + old_state = sess->state; + switch (new_state) { + case RTRS_CLT_CONNECTING: + switch (old_state) { + case RTRS_CLT_RECONNECTING: + changed = true; + fallthrough; + default: + break; + } + break; + case RTRS_CLT_RECONNECTING: + switch (old_state) { + case RTRS_CLT_CONNECTED: + case RTRS_CLT_CONNECTING_ERR: + case RTRS_CLT_CLOSED: + changed = true; + fallthrough; + default: + break; + } + break; + case RTRS_CLT_CONNECTED: + switch (old_state) { + case RTRS_CLT_CONNECTING: + changed = true; + fallthrough; + default: + break; + } + break; + case RTRS_CLT_CONNECTING_ERR: + switch (old_state) { + case RTRS_CLT_CONNECTING: + changed = true; + fallthrough; + default: + break; + } + break; + case RTRS_CLT_CLOSING: + switch (old_state) { + case RTRS_CLT_CONNECTING: + case RTRS_CLT_CONNECTING_ERR: + case RTRS_CLT_RECONNECTING: + case RTRS_CLT_CONNECTED: + changed = true; + fallthrough; + default: + break; + } + break; + case RTRS_CLT_CLOSED: + switch (old_state) { + case RTRS_CLT_CLOSING: + changed = true; + fallthrough; + default: + break; + } + break; + case RTRS_CLT_DEAD: + switch (old_state) { + case RTRS_CLT_CLOSED: + changed = true; + fallthrough; + default: + break; + } + break; + default: + break; + } + if (changed) { + sess->state = new_state; + wake_up_locked(&sess->state_wq); + } + + return changed; +} + +static bool rtrs_clt_change_state_from_to(struct rtrs_clt_sess *sess, + enum rtrs_clt_state old_state, + enum rtrs_clt_state new_state) +{ + bool changed = false; + + spin_lock_irq(&sess->state_wq.lock); + if (sess->state == old_state) + changed = __rtrs_clt_change_state(sess, new_state); + spin_unlock_irq(&sess->state_wq.lock); + + return changed; +} + +static void rtrs_rdma_error_recovery(struct rtrs_clt_con *con) +{ + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + + if (rtrs_clt_change_state_from_to(sess, + RTRS_CLT_CONNECTED, + RTRS_CLT_RECONNECTING)) { + struct rtrs_clt *clt = sess->clt; + unsigned int delay_ms; + + /* + * Normal scenario, reconnect if we were successfully connected + */ + delay_ms = clt->reconnect_delay_sec * 1000; + queue_delayed_work(rtrs_wq, &sess->reconnect_dwork, + msecs_to_jiffies(delay_ms)); + } else { + /* + * Error can happen just on establishing new connection, + * so notify waiter with error state, waiter is responsible + * for cleaning the rest and reconnect if needed. + */ + rtrs_clt_change_state_from_to(sess, + RTRS_CLT_CONNECTING, + RTRS_CLT_CONNECTING_ERR); + } +} + +static void rtrs_clt_fast_reg_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rtrs_clt_con *con = cq->cq_context; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + rtrs_err(con->c.sess, "Failed IB_WR_REG_MR: %s\n", + ib_wc_status_msg(wc->status)); + rtrs_rdma_error_recovery(con); + } +} + +static struct ib_cqe fast_reg_cqe = { + .done = rtrs_clt_fast_reg_done +}; + +static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, + bool notify, bool can_wait); + +static void rtrs_clt_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rtrs_clt_io_req *req = + container_of(wc->wr_cqe, typeof(*req), inv_cqe); + struct rtrs_clt_con *con = cq->cq_context; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + rtrs_err(con->c.sess, "Failed IB_WR_LOCAL_INV: %s\n", + ib_wc_status_msg(wc->status)); + rtrs_rdma_error_recovery(con); + } + req->need_inv = false; + if (likely(req->need_inv_comp)) + complete(&req->inv_comp); + else + /* Complete request from INV callback */ + complete_rdma_req(req, req->inv_errno, true, false); +} + +static int rtrs_inv_rkey(struct rtrs_clt_io_req *req) +{ + struct rtrs_clt_con *con = req->con; + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .wr_cqe = &req->inv_cqe, + .send_flags = IB_SEND_SIGNALED, + .ex.invalidate_rkey = req->mr->rkey, + }; + req->inv_cqe.done = rtrs_clt_inv_rkey_done; + + return ib_post_send(con->c.qp, &wr, NULL); +} + +static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, + bool notify, bool can_wait) +{ + struct rtrs_clt_con *con = req->con; + struct rtrs_clt_sess *sess; + int err; + + if (WARN_ON(!req->in_use)) + return; + if (WARN_ON(!req->con)) + return; + sess = to_clt_sess(con->c.sess); + + if (req->sg_cnt) { + if (unlikely(req->dir == DMA_FROM_DEVICE && req->need_inv)) { + /* + * We are here to invalidate read requests + * ourselves. In normal scenario server should + * send INV for all read requests, but + * we are here, thus two things could happen: + * + * 1. this is failover, when errno != 0 + * and can_wait == 1, + * + * 2. something totally bad happened and + * server forgot to send INV, so we + * should do that ourselves. + */ + + if (likely(can_wait)) { + req->need_inv_comp = true; + } else { + /* This should be IO path, so always notify */ + WARN_ON(!notify); + /* Save errno for INV callback */ + req->inv_errno = errno; + } + + err = rtrs_inv_rkey(req); + if (unlikely(err)) { + rtrs_err(con->c.sess, "Send INV WR key=%#x: %d\n", + req->mr->rkey, err); + } else if (likely(can_wait)) { + wait_for_completion(&req->inv_comp); + } else { + /* + * Something went wrong, so request will be + * completed from INV callback. + */ + WARN_ON_ONCE(1); + + return; + } + } + ib_dma_unmap_sg(sess->s.dev->ib_dev, req->sglist, + req->sg_cnt, req->dir); + } + if (sess->clt->mp_policy == MP_POLICY_MIN_INFLIGHT) + atomic_dec(&sess->stats->inflight); + + req->in_use = false; + req->con = NULL; + + if (notify) + req->conf(req->priv, errno); +} + +static int rtrs_post_send_rdma(struct rtrs_clt_con *con, + struct rtrs_clt_io_req *req, + struct rtrs_rbuf *rbuf, u32 off, + u32 imm, struct ib_send_wr *wr) +{ + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + enum ib_send_flags flags; + struct ib_sge sge; + + if (unlikely(!req->sg_size)) { + rtrs_wrn(con->c.sess, + "Doing RDMA Write failed, no data supplied\n"); + return -EINVAL; + } + + /* user data and user message in the first list element */ + sge.addr = req->iu->dma_addr; + sge.length = req->sg_size; + sge.lkey = sess->s.dev->ib_pd->local_dma_lkey; + + /* + * From time to time we have to post signalled sends, + * or send queue will fill up and only QP reset can help. + */ + flags = atomic_inc_return(&con->io_cnt) % sess->queue_depth ? + 0 : IB_SEND_SIGNALED; + + ib_dma_sync_single_for_device(sess->s.dev->ib_dev, req->iu->dma_addr, + req->sg_size, DMA_TO_DEVICE); + + return rtrs_iu_post_rdma_write_imm(&con->c, req->iu, &sge, 1, + rbuf->rkey, rbuf->addr + off, + imm, flags, wr); +} + +static void process_io_rsp(struct rtrs_clt_sess *sess, u32 msg_id, + s16 errno, bool w_inval) +{ + struct rtrs_clt_io_req *req; + + if (WARN_ON(msg_id >= sess->queue_depth)) + return; + + req = &sess->reqs[msg_id]; + /* Drop need_inv if server responded with send with invalidation */ + req->need_inv &= !w_inval; + complete_rdma_req(req, errno, true, false); +} + +static void rtrs_clt_recv_done(struct rtrs_clt_con *con, struct ib_wc *wc) +{ + struct rtrs_iu *iu; + int err; + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + + WARN_ON(sess->flags != RTRS_MSG_NEW_RKEY_F); + iu = container_of(wc->wr_cqe, struct rtrs_iu, + cqe); + err = rtrs_iu_post_recv(&con->c, iu); + if (unlikely(err)) { + rtrs_err(con->c.sess, "post iu failed %d\n", err); + rtrs_rdma_error_recovery(con); + } +} + +static void rtrs_clt_rkey_rsp_done(struct rtrs_clt_con *con, struct ib_wc *wc) +{ + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + struct rtrs_msg_rkey_rsp *msg; + u32 imm_type, imm_payload; + bool w_inval = false; + struct rtrs_iu *iu; + u32 buf_id; + int err; + + WARN_ON(sess->flags != RTRS_MSG_NEW_RKEY_F); + + iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe); + + if (unlikely(wc->byte_len < sizeof(*msg))) { + rtrs_err(con->c.sess, "rkey response is malformed: size %d\n", + wc->byte_len); + goto out; + } + ib_dma_sync_single_for_cpu(sess->s.dev->ib_dev, iu->dma_addr, + iu->size, DMA_FROM_DEVICE); + msg = iu->buf; + if (unlikely(le16_to_cpu(msg->type) != RTRS_MSG_RKEY_RSP)) { + rtrs_err(sess->clt, "rkey response is malformed: type %d\n", + le16_to_cpu(msg->type)); + goto out; + } + buf_id = le16_to_cpu(msg->buf_id); + if (WARN_ON(buf_id >= sess->queue_depth)) + goto out; + + rtrs_from_imm(be32_to_cpu(wc->ex.imm_data), &imm_type, &imm_payload); + if (likely(imm_type == RTRS_IO_RSP_IMM || + imm_type == RTRS_IO_RSP_W_INV_IMM)) { + u32 msg_id; + + w_inval = (imm_type == RTRS_IO_RSP_W_INV_IMM); + rtrs_from_io_rsp_imm(imm_payload, &msg_id, &err); + + if (WARN_ON(buf_id != msg_id)) + goto out; + sess->rbufs[buf_id].rkey = le32_to_cpu(msg->rkey); + process_io_rsp(sess, msg_id, err, w_inval); + } + ib_dma_sync_single_for_device(sess->s.dev->ib_dev, iu->dma_addr, + iu->size, DMA_FROM_DEVICE); + return rtrs_clt_recv_done(con, wc); +out: + rtrs_rdma_error_recovery(con); +} + +static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc); + +static struct ib_cqe io_comp_cqe = { + .done = rtrs_clt_rdma_done +}; + +/* + * Post x2 empty WRs: first is for this RDMA with IMM, + * second is for RECV with INV, which happened earlier. + */ +static int rtrs_post_recv_empty_x2(struct rtrs_con *con, struct ib_cqe *cqe) +{ + struct ib_recv_wr wr_arr[2], *wr; + int i; + + memset(wr_arr, 0, sizeof(wr_arr)); + for (i = 0; i < ARRAY_SIZE(wr_arr); i++) { + wr = &wr_arr[i]; + wr->wr_cqe = cqe; + if (i) + /* Chain backwards */ + wr->next = &wr_arr[i - 1]; + } + + return ib_post_recv(con->qp, wr, NULL); +} + +static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rtrs_clt_con *con = cq->cq_context; + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + u32 imm_type, imm_payload; + bool w_inval = false; + int err; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) { + rtrs_err(sess->clt, "RDMA failed: %s\n", + ib_wc_status_msg(wc->status)); + rtrs_rdma_error_recovery(con); + } + return; + } + rtrs_clt_update_wc_stats(con); + + switch (wc->opcode) { + case IB_WC_RECV_RDMA_WITH_IMM: + /* + * post_recv() RDMA write completions of IO reqs (read/write) + * and hb + */ + if (WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done)) + return; + rtrs_from_imm(be32_to_cpu(wc->ex.imm_data), + &imm_type, &imm_payload); + if (likely(imm_type == RTRS_IO_RSP_IMM || + imm_type == RTRS_IO_RSP_W_INV_IMM)) { + u32 msg_id; + + w_inval = (imm_type == RTRS_IO_RSP_W_INV_IMM); + rtrs_from_io_rsp_imm(imm_payload, &msg_id, &err); + + process_io_rsp(sess, msg_id, err, w_inval); + } else if (imm_type == RTRS_HB_MSG_IMM) { + WARN_ON(con->c.cid); + rtrs_send_hb_ack(&sess->s); + if (sess->flags == RTRS_MSG_NEW_RKEY_F) + return rtrs_clt_recv_done(con, wc); + } else if (imm_type == RTRS_HB_ACK_IMM) { + WARN_ON(con->c.cid); + sess->s.hb_missed_cnt = 0; + if (sess->flags == RTRS_MSG_NEW_RKEY_F) + return rtrs_clt_recv_done(con, wc); + } else { + rtrs_wrn(con->c.sess, "Unknown IMM type %u\n", + imm_type); + } + if (w_inval) + /* + * Post x2 empty WRs: first is for this RDMA with IMM, + * second is for RECV with INV, which happened earlier. + */ + err = rtrs_post_recv_empty_x2(&con->c, &io_comp_cqe); + else + err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); + if (unlikely(err)) { + rtrs_err(con->c.sess, "rtrs_post_recv_empty(): %d\n", + err); + rtrs_rdma_error_recovery(con); + break; + } + break; + case IB_WC_RECV: + /* + * Key invalidations from server side + */ + WARN_ON(!(wc->wc_flags & IB_WC_WITH_INVALIDATE || + wc->wc_flags & IB_WC_WITH_IMM)); + WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done); + if (sess->flags == RTRS_MSG_NEW_RKEY_F) { + if (wc->wc_flags & IB_WC_WITH_INVALIDATE) + return rtrs_clt_recv_done(con, wc); + + return rtrs_clt_rkey_rsp_done(con, wc); + } + break; + case IB_WC_RDMA_WRITE: + /* + * post_send() RDMA write completions of IO reqs (read/write) + * and hb + */ + break; + + default: + rtrs_wrn(sess->clt, "Unexpected WC type: %d\n", wc->opcode); + return; + } +} + +static int post_recv_io(struct rtrs_clt_con *con, size_t q_size) +{ + int err, i; + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + + for (i = 0; i < q_size; i++) { + if (sess->flags == RTRS_MSG_NEW_RKEY_F) { + struct rtrs_iu *iu = &con->rsp_ius[i]; + + err = rtrs_iu_post_recv(&con->c, iu); + } else { + err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); + } + if (unlikely(err)) + return err; + } + + return 0; +} + +static int post_recv_sess(struct rtrs_clt_sess *sess) +{ + size_t q_size = 0; + int err, cid; + + for (cid = 0; cid < sess->s.con_num; cid++) { + if (cid == 0) + q_size = SERVICE_CON_QUEUE_DEPTH; + else + q_size = sess->queue_depth; + + /* + * x2 for RDMA read responses + FR key invalidations, + * RDMA writes do not require any FR registrations. + */ + q_size *= 2; + + err = post_recv_io(to_clt_con(sess->s.con[cid]), q_size); + if (unlikely(err)) { + rtrs_err(sess->clt, "post_recv_io(), err: %d\n", err); + return err; + } + } + + return 0; +} + +struct path_it { + int i; + struct list_head skip_list; + struct rtrs_clt *clt; + struct rtrs_clt_sess *(*next_path)(struct path_it *it); +}; + +#define do_each_path(path, clt, it) { \ + path_it_init(it, clt); \ + rcu_read_lock(); \ + for ((it)->i = 0; ((path) = ((it)->next_path)(it)) && \ + (it)->i < (it)->clt->paths_num; \ + (it)->i++) + +#define while_each_path(it) \ + path_it_deinit(it); \ + rcu_read_unlock(); \ + } + +/** + * list_next_or_null_rr_rcu - get next list element in round-robin fashion. + * @head: the head for the list. + * @ptr: the list head to take the next element from. + * @type: the type of the struct this is embedded in. + * @memb: the name of the list_head within the struct. + * + * Next element returned in round-robin fashion, i.e. head will be skipped, + * but if list is observed as empty, NULL will be returned. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + */ +#define list_next_or_null_rr_rcu(head, ptr, type, memb) \ +({ \ + list_next_or_null_rcu(head, ptr, type, memb) ?: \ + list_next_or_null_rcu(head, READ_ONCE((ptr)->next), \ + type, memb); \ +}) + +/** + * get_next_path_rr() - Returns path in round-robin fashion. + * @it: the path pointer + * + * Related to @MP_POLICY_RR + * + * Locks: + * rcu_read_lock() must be hold. + */ +static struct rtrs_clt_sess *get_next_path_rr(struct path_it *it) +{ + struct rtrs_clt_sess __rcu **ppcpu_path; + struct rtrs_clt_sess *path; + struct rtrs_clt *clt; + + clt = it->clt; + + /* + * Here we use two RCU objects: @paths_list and @pcpu_path + * pointer. See rtrs_clt_remove_path_from_arr() for details + * how that is handled. + */ + + ppcpu_path = this_cpu_ptr(clt->pcpu_path); + path = rcu_dereference(*ppcpu_path); + if (unlikely(!path)) + path = list_first_or_null_rcu(&clt->paths_list, + typeof(*path), s.entry); + else + path = list_next_or_null_rr_rcu(&clt->paths_list, + &path->s.entry, + typeof(*path), + s.entry); + rcu_assign_pointer(*ppcpu_path, path); + + return path; +} + +/** + * get_next_path_min_inflight() - Returns path with minimal inflight count. + * @it: the path pointer + * + * Related to @MP_POLICY_MIN_INFLIGHT + * + * Locks: + * rcu_read_lock() must be hold. + */ +static struct rtrs_clt_sess *get_next_path_min_inflight(struct path_it *it) +{ + struct rtrs_clt_sess *min_path = NULL; + struct rtrs_clt *clt = it->clt; + struct rtrs_clt_sess *sess; + int min_inflight = INT_MAX; + int inflight; + + list_for_each_entry_rcu(sess, &clt->paths_list, s.entry) { + if (unlikely(!list_empty(raw_cpu_ptr(sess->mp_skip_entry)))) + continue; + + inflight = atomic_read(&sess->stats->inflight); + + if (inflight < min_inflight) { + min_inflight = inflight; + min_path = sess; + } + } + + /* + * add the path to the skip list, so that next time we can get + * a different one + */ + if (min_path) + list_add(raw_cpu_ptr(min_path->mp_skip_entry), &it->skip_list); + + return min_path; +} + +static inline void path_it_init(struct path_it *it, struct rtrs_clt *clt) +{ + INIT_LIST_HEAD(&it->skip_list); + it->clt = clt; + it->i = 0; + + if (clt->mp_policy == MP_POLICY_RR) + it->next_path = get_next_path_rr; + else + it->next_path = get_next_path_min_inflight; +} + +static inline void path_it_deinit(struct path_it *it) +{ + struct list_head *skip, *tmp; + /* + * The skip_list is used only for the MIN_INFLIGHT policy. + * We need to remove paths from it, so that next IO can insert + * paths (->mp_skip_entry) into a skip_list again. + */ + list_for_each_safe(skip, tmp, &it->skip_list) + list_del_init(skip); +} + +/** + * rtrs_clt_init_req() Initialize an rtrs_clt_io_req holding information + * about an inflight IO. + * The user buffer holding user control message (not data) is copied into + * the corresponding buffer of rtrs_iu (req->iu->buf), which later on will + * also hold the control message of rtrs. + * @req: an io request holding information about IO. + * @sess: client session + * @conf: conformation callback function to notify upper layer. + * @permit: permit for allocation of RDMA remote buffer + * @priv: private pointer + * @vec: kernel vector containing control message + * @usr_len: length of the user message + * @sg: scater list for IO data + * @sg_cnt: number of scater list entries + * @data_len: length of the IO data + * @dir: direction of the IO. + */ +static void rtrs_clt_init_req(struct rtrs_clt_io_req *req, + struct rtrs_clt_sess *sess, + void (*conf)(void *priv, int errno), + struct rtrs_permit *permit, void *priv, + const struct kvec *vec, size_t usr_len, + struct scatterlist *sg, size_t sg_cnt, + size_t data_len, int dir) +{ + struct iov_iter iter; + size_t len; + + req->permit = permit; + req->in_use = true; + req->usr_len = usr_len; + req->data_len = data_len; + req->sglist = sg; + req->sg_cnt = sg_cnt; + req->priv = priv; + req->dir = dir; + req->con = rtrs_permit_to_clt_con(sess, permit); + req->conf = conf; + req->need_inv = false; + req->need_inv_comp = false; + req->inv_errno = 0; + + iov_iter_kvec(&iter, READ, vec, 1, usr_len); + len = _copy_from_iter(req->iu->buf, usr_len, &iter); + WARN_ON(len != usr_len); + + reinit_completion(&req->inv_comp); +} + +static struct rtrs_clt_io_req * +rtrs_clt_get_req(struct rtrs_clt_sess *sess, + void (*conf)(void *priv, int errno), + struct rtrs_permit *permit, void *priv, + const struct kvec *vec, size_t usr_len, + struct scatterlist *sg, size_t sg_cnt, + size_t data_len, int dir) +{ + struct rtrs_clt_io_req *req; + + req = &sess->reqs[permit->mem_id]; + rtrs_clt_init_req(req, sess, conf, permit, priv, vec, usr_len, + sg, sg_cnt, data_len, dir); + return req; +} + +static struct rtrs_clt_io_req * +rtrs_clt_get_copy_req(struct rtrs_clt_sess *alive_sess, + struct rtrs_clt_io_req *fail_req) +{ + struct rtrs_clt_io_req *req; + struct kvec vec = { + .iov_base = fail_req->iu->buf, + .iov_len = fail_req->usr_len + }; + + req = &alive_sess->reqs[fail_req->permit->mem_id]; + rtrs_clt_init_req(req, alive_sess, fail_req->conf, fail_req->permit, + fail_req->priv, &vec, fail_req->usr_len, + fail_req->sglist, fail_req->sg_cnt, + fail_req->data_len, fail_req->dir); + return req; +} + +static int rtrs_post_rdma_write_sg(struct rtrs_clt_con *con, + struct rtrs_clt_io_req *req, + struct rtrs_rbuf *rbuf, + u32 size, u32 imm) +{ + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + struct ib_sge *sge = req->sge; + enum ib_send_flags flags; + struct scatterlist *sg; + size_t num_sge; + int i; + + for_each_sg(req->sglist, sg, req->sg_cnt, i) { + sge[i].addr = sg_dma_address(sg); + sge[i].length = sg_dma_len(sg); + sge[i].lkey = sess->s.dev->ib_pd->local_dma_lkey; + } + sge[i].addr = req->iu->dma_addr; + sge[i].length = size; + sge[i].lkey = sess->s.dev->ib_pd->local_dma_lkey; + + num_sge = 1 + req->sg_cnt; + + /* + * From time to time we have to post signalled sends, + * or send queue will fill up and only QP reset can help. + */ + flags = atomic_inc_return(&con->io_cnt) % sess->queue_depth ? + 0 : IB_SEND_SIGNALED; + + ib_dma_sync_single_for_device(sess->s.dev->ib_dev, req->iu->dma_addr, + size, DMA_TO_DEVICE); + + return rtrs_iu_post_rdma_write_imm(&con->c, req->iu, sge, num_sge, + rbuf->rkey, rbuf->addr, imm, + flags, NULL); +} + +static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) +{ + struct rtrs_clt_con *con = req->con; + struct rtrs_sess *s = con->c.sess; + struct rtrs_clt_sess *sess = to_clt_sess(s); + struct rtrs_msg_rdma_write *msg; + + struct rtrs_rbuf *rbuf; + int ret, count = 0; + u32 imm, buf_id; + + const size_t tsize = sizeof(*msg) + req->data_len + req->usr_len; + + if (unlikely(tsize > sess->chunk_size)) { + rtrs_wrn(s, "Write request failed, size too big %zu > %d\n", + tsize, sess->chunk_size); + return -EMSGSIZE; + } + if (req->sg_cnt) { + count = ib_dma_map_sg(sess->s.dev->ib_dev, req->sglist, + req->sg_cnt, req->dir); + if (unlikely(!count)) { + rtrs_wrn(s, "Write request failed, map failed\n"); + return -EINVAL; + } + } + /* put rtrs msg after sg and user message */ + msg = req->iu->buf + req->usr_len; + msg->type = cpu_to_le16(RTRS_MSG_WRITE); + msg->usr_len = cpu_to_le16(req->usr_len); + + /* rtrs message on server side will be after user data and message */ + imm = req->permit->mem_off + req->data_len + req->usr_len; + imm = rtrs_to_io_req_imm(imm); + buf_id = req->permit->mem_id; + req->sg_size = tsize; + rbuf = &sess->rbufs[buf_id]; + + /* + * Update stats now, after request is successfully sent it is not + * safe anymore to touch it. + */ + rtrs_clt_update_all_stats(req, WRITE); + + ret = rtrs_post_rdma_write_sg(req->con, req, rbuf, + req->usr_len + sizeof(*msg), + imm); + if (unlikely(ret)) { + rtrs_err(s, "Write request failed: %d\n", ret); + if (sess->clt->mp_policy == MP_POLICY_MIN_INFLIGHT) + atomic_dec(&sess->stats->inflight); + if (req->sg_cnt) + ib_dma_unmap_sg(sess->s.dev->ib_dev, req->sglist, + req->sg_cnt, req->dir); + } + + return ret; +} + +static int rtrs_map_sg_fr(struct rtrs_clt_io_req *req, size_t count) +{ + int nr; + + /* Align the MR to a 4K page size to match the block virt boundary */ + nr = ib_map_mr_sg(req->mr, req->sglist, count, NULL, SZ_4K); + if (unlikely(nr < req->sg_cnt)) { + if (nr < 0) + return nr; + return -EINVAL; + } + ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); + + return nr; +} + +static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) +{ + struct rtrs_clt_con *con = req->con; + struct rtrs_sess *s = con->c.sess; + struct rtrs_clt_sess *sess = to_clt_sess(s); + struct rtrs_msg_rdma_read *msg; + struct rtrs_ib_dev *dev; + + struct ib_reg_wr rwr; + struct ib_send_wr *wr = NULL; + + int ret, count = 0; + u32 imm, buf_id; + + const size_t tsize = sizeof(*msg) + req->data_len + req->usr_len; + + s = &sess->s; + dev = sess->s.dev; + + if (unlikely(tsize > sess->chunk_size)) { + rtrs_wrn(s, + "Read request failed, message size is %zu, bigger than CHUNK_SIZE %d\n", + tsize, sess->chunk_size); + return -EMSGSIZE; + } + + if (req->sg_cnt) { + count = ib_dma_map_sg(dev->ib_dev, req->sglist, req->sg_cnt, + req->dir); + if (unlikely(!count)) { + rtrs_wrn(s, + "Read request failed, dma map failed\n"); + return -EINVAL; + } + } + /* put our message into req->buf after user message*/ + msg = req->iu->buf + req->usr_len; + msg->type = cpu_to_le16(RTRS_MSG_READ); + msg->usr_len = cpu_to_le16(req->usr_len); + + if (count) { + ret = rtrs_map_sg_fr(req, count); + if (ret < 0) { + rtrs_err_rl(s, + "Read request failed, failed to map fast reg. data, err: %d\n", + ret); + ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt, + req->dir); + return ret; + } + rwr = (struct ib_reg_wr) { + .wr.opcode = IB_WR_REG_MR, + .wr.wr_cqe = &fast_reg_cqe, + .mr = req->mr, + .key = req->mr->rkey, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE), + }; + wr = &rwr.wr; + + msg->sg_cnt = cpu_to_le16(1); + msg->flags = cpu_to_le16(RTRS_MSG_NEED_INVAL_F); + + msg->desc[0].addr = cpu_to_le64(req->mr->iova); + msg->desc[0].key = cpu_to_le32(req->mr->rkey); + msg->desc[0].len = cpu_to_le32(req->mr->length); + + /* Further invalidation is required */ + req->need_inv = !!RTRS_MSG_NEED_INVAL_F; + + } else { + msg->sg_cnt = 0; + msg->flags = 0; + } + /* + * rtrs message will be after the space reserved for disk data and + * user message + */ + imm = req->permit->mem_off + req->data_len + req->usr_len; + imm = rtrs_to_io_req_imm(imm); + buf_id = req->permit->mem_id; + + req->sg_size = sizeof(*msg); + req->sg_size += le16_to_cpu(msg->sg_cnt) * sizeof(struct rtrs_sg_desc); + req->sg_size += req->usr_len; + + /* + * Update stats now, after request is successfully sent it is not + * safe anymore to touch it. + */ + rtrs_clt_update_all_stats(req, READ); + + ret = rtrs_post_send_rdma(req->con, req, &sess->rbufs[buf_id], + req->data_len, imm, wr); + if (unlikely(ret)) { + rtrs_err(s, "Read request failed: %d\n", ret); + if (sess->clt->mp_policy == MP_POLICY_MIN_INFLIGHT) + atomic_dec(&sess->stats->inflight); + req->need_inv = false; + if (req->sg_cnt) + ib_dma_unmap_sg(dev->ib_dev, req->sglist, + req->sg_cnt, req->dir); + } + + return ret; +} + +/** + * rtrs_clt_failover_req() Try to find an active path for a failed request + * @clt: clt context + * @fail_req: a failed io request. + */ +static int rtrs_clt_failover_req(struct rtrs_clt *clt, + struct rtrs_clt_io_req *fail_req) +{ + struct rtrs_clt_sess *alive_sess; + struct rtrs_clt_io_req *req; + int err = -ECONNABORTED; + struct path_it it; + + do_each_path(alive_sess, clt, &it) { + if (unlikely(READ_ONCE(alive_sess->state) != + RTRS_CLT_CONNECTED)) + continue; + req = rtrs_clt_get_copy_req(alive_sess, fail_req); + if (req->dir == DMA_TO_DEVICE) + err = rtrs_clt_write_req(req); + else + err = rtrs_clt_read_req(req); + if (unlikely(err)) { + req->in_use = false; + continue; + } + /* Success path */ + rtrs_clt_inc_failover_cnt(alive_sess->stats); + break; + } while_each_path(&it); + + return err; +} + +static void fail_all_outstanding_reqs(struct rtrs_clt_sess *sess) +{ + struct rtrs_clt *clt = sess->clt; + struct rtrs_clt_io_req *req; + int i, err; + + if (!sess->reqs) + return; + for (i = 0; i < sess->queue_depth; ++i) { + req = &sess->reqs[i]; + if (!req->in_use) + continue; + + /* + * Safely (without notification) complete failed request. + * After completion this request is still useble and can + * be failovered to another path. + */ + complete_rdma_req(req, -ECONNABORTED, false, true); + + err = rtrs_clt_failover_req(clt, req); + if (unlikely(err)) + /* Failover failed, notify anyway */ + req->conf(req->priv, err); + } +} + +static void free_sess_reqs(struct rtrs_clt_sess *sess) +{ + struct rtrs_clt_io_req *req; + int i; + + if (!sess->reqs) + return; + for (i = 0; i < sess->queue_depth; ++i) { + req = &sess->reqs[i]; + if (req->mr) + ib_dereg_mr(req->mr); + kfree(req->sge); + rtrs_iu_free(req->iu, DMA_TO_DEVICE, + sess->s.dev->ib_dev, 1); + } + kfree(sess->reqs); + sess->reqs = NULL; +} + +static int alloc_sess_reqs(struct rtrs_clt_sess *sess) +{ + struct rtrs_clt_io_req *req; + struct rtrs_clt *clt = sess->clt; + int i, err = -ENOMEM; + + sess->reqs = kcalloc(sess->queue_depth, sizeof(*sess->reqs), + GFP_KERNEL); + if (!sess->reqs) + return -ENOMEM; + + for (i = 0; i < sess->queue_depth; ++i) { + req = &sess->reqs[i]; + req->iu = rtrs_iu_alloc(1, sess->max_hdr_size, GFP_KERNEL, + sess->s.dev->ib_dev, + DMA_TO_DEVICE, + rtrs_clt_rdma_done); + if (!req->iu) + goto out; + + req->sge = kmalloc_array(clt->max_segments + 1, + sizeof(*req->sge), GFP_KERNEL); + if (!req->sge) + goto out; + + req->mr = ib_alloc_mr(sess->s.dev->ib_pd, IB_MR_TYPE_MEM_REG, + sess->max_pages_per_mr); + if (IS_ERR(req->mr)) { + err = PTR_ERR(req->mr); + req->mr = NULL; + pr_err("Failed to alloc sess->max_pages_per_mr %d\n", + sess->max_pages_per_mr); + goto out; + } + + init_completion(&req->inv_comp); + } + + return 0; + +out: + free_sess_reqs(sess); + + return err; +} + +static int alloc_permits(struct rtrs_clt *clt) +{ + unsigned int chunk_bits; + int err, i; + + clt->permits_map = kcalloc(BITS_TO_LONGS(clt->queue_depth), + sizeof(long), GFP_KERNEL); + if (!clt->permits_map) { + err = -ENOMEM; + goto out_err; + } + clt->permits = kcalloc(clt->queue_depth, permit_size(clt), GFP_KERNEL); + if (!clt->permits) { + err = -ENOMEM; + goto err_map; + } + chunk_bits = ilog2(clt->queue_depth - 1) + 1; + for (i = 0; i < clt->queue_depth; i++) { + struct rtrs_permit *permit; + + permit = get_permit(clt, i); + permit->mem_id = i; + permit->mem_off = i << (MAX_IMM_PAYL_BITS - chunk_bits); + } + + return 0; + +err_map: + kfree(clt->permits_map); + clt->permits_map = NULL; +out_err: + return err; +} + +static void free_permits(struct rtrs_clt *clt) +{ + kfree(clt->permits_map); + clt->permits_map = NULL; + kfree(clt->permits); + clt->permits = NULL; +} + +static void query_fast_reg_mode(struct rtrs_clt_sess *sess) +{ + struct ib_device *ib_dev; + u64 max_pages_per_mr; + int mr_page_shift; + + ib_dev = sess->s.dev->ib_dev; + + /* + * Use the smallest page size supported by the HCA, down to a + * minimum of 4096 bytes. We're unlikely to build large sglists + * out of smaller entries. + */ + mr_page_shift = max(12, ffs(ib_dev->attrs.page_size_cap) - 1); + max_pages_per_mr = ib_dev->attrs.max_mr_size; + do_div(max_pages_per_mr, (1ull << mr_page_shift)); + sess->max_pages_per_mr = + min3(sess->max_pages_per_mr, (u32)max_pages_per_mr, + ib_dev->attrs.max_fast_reg_page_list_len); + sess->max_send_sge = ib_dev->attrs.max_send_sge; +} + +static bool rtrs_clt_change_state_get_old(struct rtrs_clt_sess *sess, + enum rtrs_clt_state new_state, + enum rtrs_clt_state *old_state) +{ + bool changed; + + spin_lock_irq(&sess->state_wq.lock); + *old_state = sess->state; + changed = __rtrs_clt_change_state(sess, new_state); + spin_unlock_irq(&sess->state_wq.lock); + + return changed; +} + +static bool rtrs_clt_change_state(struct rtrs_clt_sess *sess, + enum rtrs_clt_state new_state) +{ + enum rtrs_clt_state old_state; + + return rtrs_clt_change_state_get_old(sess, new_state, &old_state); +} + +static void rtrs_clt_hb_err_handler(struct rtrs_con *c) +{ + struct rtrs_clt_con *con = container_of(c, typeof(*con), c); + + rtrs_rdma_error_recovery(con); +} + +static void rtrs_clt_init_hb(struct rtrs_clt_sess *sess) +{ + rtrs_init_hb(&sess->s, &io_comp_cqe, + RTRS_HB_INTERVAL_MS, + RTRS_HB_MISSED_MAX, + rtrs_clt_hb_err_handler, + rtrs_wq); +} + +static void rtrs_clt_start_hb(struct rtrs_clt_sess *sess) +{ + rtrs_start_hb(&sess->s); +} + +static void rtrs_clt_stop_hb(struct rtrs_clt_sess *sess) +{ + rtrs_stop_hb(&sess->s); +} + +static void rtrs_clt_reconnect_work(struct work_struct *work); +static void rtrs_clt_close_work(struct work_struct *work); + +static struct rtrs_clt_sess *alloc_sess(struct rtrs_clt *clt, + const struct rtrs_addr *path, + size_t con_num, u16 max_segments) +{ + struct rtrs_clt_sess *sess; + int err = -ENOMEM; + int cpu; + + sess = kzalloc(sizeof(*sess), GFP_KERNEL); + if (!sess) + goto err; + + /* Extra connection for user messages */ + con_num += 1; + + sess->s.con = kcalloc(con_num, sizeof(*sess->s.con), GFP_KERNEL); + if (!sess->s.con) + goto err_free_sess; + + sess->stats = kzalloc(sizeof(*sess->stats), GFP_KERNEL); + if (!sess->stats) + goto err_free_con; + + mutex_init(&sess->init_mutex); + uuid_gen(&sess->s.uuid); + memcpy(&sess->s.dst_addr, path->dst, + rdma_addr_size((struct sockaddr *)path->dst)); + + /* + * rdma_resolve_addr() passes src_addr to cma_bind_addr, which + * checks the sa_family to be non-zero. If user passed src_addr=NULL + * the sess->src_addr will contain only zeros, which is then fine. + */ + if (path->src) + memcpy(&sess->s.src_addr, path->src, + rdma_addr_size((struct sockaddr *)path->src)); + strlcpy(sess->s.sessname, clt->sessname, sizeof(sess->s.sessname)); + sess->s.con_num = con_num; + sess->clt = clt; + sess->max_pages_per_mr = max_segments * BLK_MAX_SEGMENT_SIZE >> 12; + init_waitqueue_head(&sess->state_wq); + sess->state = RTRS_CLT_CONNECTING; + atomic_set(&sess->connected_cnt, 0); + INIT_WORK(&sess->close_work, rtrs_clt_close_work); + INIT_DELAYED_WORK(&sess->reconnect_dwork, rtrs_clt_reconnect_work); + rtrs_clt_init_hb(sess); + + sess->mp_skip_entry = alloc_percpu(typeof(*sess->mp_skip_entry)); + if (!sess->mp_skip_entry) + goto err_free_stats; + + for_each_possible_cpu(cpu) + INIT_LIST_HEAD(per_cpu_ptr(sess->mp_skip_entry, cpu)); + + err = rtrs_clt_init_stats(sess->stats); + if (err) + goto err_free_percpu; + + return sess; + +err_free_percpu: + free_percpu(sess->mp_skip_entry); +err_free_stats: + kfree(sess->stats); +err_free_con: + kfree(sess->s.con); +err_free_sess: + kfree(sess); +err: + return ERR_PTR(err); +} + +void free_sess(struct rtrs_clt_sess *sess) +{ + free_percpu(sess->mp_skip_entry); + mutex_destroy(&sess->init_mutex); + kfree(sess->s.con); + kfree(sess->rbufs); + kfree(sess); +} + +static int create_con(struct rtrs_clt_sess *sess, unsigned int cid) +{ + struct rtrs_clt_con *con; + + con = kzalloc(sizeof(*con), GFP_KERNEL); + if (!con) + return -ENOMEM; + + /* Map first two connections to the first CPU */ + con->cpu = (cid ? cid - 1 : 0) % nr_cpu_ids; + con->c.cid = cid; + con->c.sess = &sess->s; + atomic_set(&con->io_cnt, 0); + + sess->s.con[cid] = &con->c; + + return 0; +} + +static void destroy_con(struct rtrs_clt_con *con) +{ + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + + sess->s.con[con->c.cid] = NULL; + kfree(con); +} + +static int create_con_cq_qp(struct rtrs_clt_con *con) +{ + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + u16 wr_queue_size; + int err, cq_vector; + struct rtrs_msg_rkey_rsp *rsp; + + /* + * This function can fail, but still destroy_con_cq_qp() should + * be called, this is because create_con_cq_qp() is called on cm + * event path, thus caller/waiter never knows: have we failed before + * create_con_cq_qp() or after. To solve this dilemma without + * creating any additional flags just allow destroy_con_cq_qp() be + * called many times. + */ + + if (con->c.cid == 0) { + /* + * One completion for each receive and two for each send + * (send request + registration) + * + 2 for drain and heartbeat + * in case qp gets into error state + */ + wr_queue_size = SERVICE_CON_QUEUE_DEPTH * 3 + 2; + /* We must be the first here */ + if (WARN_ON(sess->s.dev)) + return -EINVAL; + + /* + * The whole session uses device from user connection. + * Be careful not to close user connection before ib dev + * is gracefully put. + */ + sess->s.dev = rtrs_ib_dev_find_or_add(con->c.cm_id->device, + &dev_pd); + if (!sess->s.dev) { + rtrs_wrn(sess->clt, + "rtrs_ib_dev_find_get_or_add(): no memory\n"); + return -ENOMEM; + } + sess->s.dev_ref = 1; + query_fast_reg_mode(sess); + } else { + /* + * Here we assume that session members are correctly set. + * This is always true if user connection (cid == 0) is + * established first. + */ + if (WARN_ON(!sess->s.dev)) + return -EINVAL; + if (WARN_ON(!sess->queue_depth)) + return -EINVAL; + + /* Shared between connections */ + sess->s.dev_ref++; + wr_queue_size = + min_t(int, sess->s.dev->ib_dev->attrs.max_qp_wr, + /* QD * (REQ + RSP + FR REGS or INVS) + drain */ + sess->queue_depth * 3 + 1); + } + /* alloc iu to recv new rkey reply when server reports flags set */ + if (sess->flags == RTRS_MSG_NEW_RKEY_F || con->c.cid == 0) { + con->rsp_ius = rtrs_iu_alloc(wr_queue_size, sizeof(*rsp), + GFP_KERNEL, sess->s.dev->ib_dev, + DMA_FROM_DEVICE, + rtrs_clt_rdma_done); + if (!con->rsp_ius) + return -ENOMEM; + con->queue_size = wr_queue_size; + } + cq_vector = con->cpu % sess->s.dev->ib_dev->num_comp_vectors; + err = rtrs_cq_qp_create(&sess->s, &con->c, sess->max_send_sge, + cq_vector, wr_queue_size, wr_queue_size, + IB_POLL_SOFTIRQ); + /* + * In case of error we do not bother to clean previous allocations, + * since destroy_con_cq_qp() must be called. + */ + + if (err) + return err; + return err; +} + +static void destroy_con_cq_qp(struct rtrs_clt_con *con) +{ + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + + /* + * Be careful here: destroy_con_cq_qp() can be called even + * create_con_cq_qp() failed, see comments there. + */ + + rtrs_cq_qp_destroy(&con->c); + if (con->rsp_ius) { + rtrs_iu_free(con->rsp_ius, DMA_FROM_DEVICE, + sess->s.dev->ib_dev, con->queue_size); + con->rsp_ius = NULL; + con->queue_size = 0; + } + if (sess->s.dev_ref && !--sess->s.dev_ref) { + rtrs_ib_dev_put(sess->s.dev); + sess->s.dev = NULL; + } +} + +static void stop_cm(struct rtrs_clt_con *con) +{ + rdma_disconnect(con->c.cm_id); + if (con->c.qp) + ib_drain_qp(con->c.qp); +} + +static void destroy_cm(struct rtrs_clt_con *con) +{ + rdma_destroy_id(con->c.cm_id); + con->c.cm_id = NULL; +} + +static int rtrs_rdma_addr_resolved(struct rtrs_clt_con *con) +{ + struct rtrs_sess *s = con->c.sess; + int err; + + err = create_con_cq_qp(con); + if (err) { + rtrs_err(s, "create_con_cq_qp(), err: %d\n", err); + return err; + } + err = rdma_resolve_route(con->c.cm_id, RTRS_CONNECT_TIMEOUT_MS); + if (err) { + rtrs_err(s, "Resolving route failed, err: %d\n", err); + destroy_con_cq_qp(con); + } + + return err; +} + +static int rtrs_rdma_route_resolved(struct rtrs_clt_con *con) +{ + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + struct rtrs_clt *clt = sess->clt; + struct rtrs_msg_conn_req msg; + struct rdma_conn_param param; + + int err; + + param = (struct rdma_conn_param) { + .retry_count = 7, + .rnr_retry_count = 7, + .private_data = &msg, + .private_data_len = sizeof(msg), + }; + + msg = (struct rtrs_msg_conn_req) { + .magic = cpu_to_le16(RTRS_MAGIC), + .version = cpu_to_le16(RTRS_PROTO_VER), + .cid = cpu_to_le16(con->c.cid), + .cid_num = cpu_to_le16(sess->s.con_num), + .recon_cnt = cpu_to_le16(sess->s.recon_cnt), + }; + uuid_copy(&msg.sess_uuid, &sess->s.uuid); + uuid_copy(&msg.paths_uuid, &clt->paths_uuid); + + err = rdma_connect(con->c.cm_id, ¶m); + if (err) + rtrs_err(clt, "rdma_connect(): %d\n", err); + + return err; +} + +static int rtrs_rdma_conn_established(struct rtrs_clt_con *con, + struct rdma_cm_event *ev) +{ + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + struct rtrs_clt *clt = sess->clt; + const struct rtrs_msg_conn_rsp *msg; + u16 version, queue_depth; + int errno; + u8 len; + + msg = ev->param.conn.private_data; + len = ev->param.conn.private_data_len; + if (len < sizeof(*msg)) { + rtrs_err(clt, "Invalid RTRS connection response\n"); + return -ECONNRESET; + } + if (le16_to_cpu(msg->magic) != RTRS_MAGIC) { + rtrs_err(clt, "Invalid RTRS magic\n"); + return -ECONNRESET; + } + version = le16_to_cpu(msg->version); + if (version >> 8 != RTRS_PROTO_VER_MAJOR) { + rtrs_err(clt, "Unsupported major RTRS version: %d, expected %d\n", + version >> 8, RTRS_PROTO_VER_MAJOR); + return -ECONNRESET; + } + errno = le16_to_cpu(msg->errno); + if (errno) { + rtrs_err(clt, "Invalid RTRS message: errno %d\n", + errno); + return -ECONNRESET; + } + if (con->c.cid == 0) { + queue_depth = le16_to_cpu(msg->queue_depth); + + if (queue_depth > MAX_SESS_QUEUE_DEPTH) { + rtrs_err(clt, "Invalid RTRS message: queue=%d\n", + queue_depth); + return -ECONNRESET; + } + if (!sess->rbufs || sess->queue_depth < queue_depth) { + kfree(sess->rbufs); + sess->rbufs = kcalloc(queue_depth, sizeof(*sess->rbufs), + GFP_KERNEL); + if (!sess->rbufs) + return -ENOMEM; + } + sess->queue_depth = queue_depth; + sess->max_hdr_size = le32_to_cpu(msg->max_hdr_size); + sess->max_io_size = le32_to_cpu(msg->max_io_size); + sess->flags = le32_to_cpu(msg->flags); + sess->chunk_size = sess->max_io_size + sess->max_hdr_size; + + /* + * Global queue depth and IO size is always a minimum. + * If while a reconnection server sends us a value a bit + * higher - client does not care and uses cached minimum. + * + * Since we can have several sessions (paths) restablishing + * connections in parallel, use lock. + */ + mutex_lock(&clt->paths_mutex); + clt->queue_depth = min_not_zero(sess->queue_depth, + clt->queue_depth); + clt->max_io_size = min_not_zero(sess->max_io_size, + clt->max_io_size); + mutex_unlock(&clt->paths_mutex); + + /* + * Cache the hca_port and hca_name for sysfs + */ + sess->hca_port = con->c.cm_id->port_num; + scnprintf(sess->hca_name, sizeof(sess->hca_name), + sess->s.dev->ib_dev->name); + sess->s.src_addr = con->c.cm_id->route.addr.src_addr; + } + + return 0; +} + +static inline void flag_success_on_conn(struct rtrs_clt_con *con) +{ + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + + atomic_inc(&sess->connected_cnt); + con->cm_err = 1; +} + +static int rtrs_rdma_conn_rejected(struct rtrs_clt_con *con, + struct rdma_cm_event *ev) +{ + struct rtrs_sess *s = con->c.sess; + const struct rtrs_msg_conn_rsp *msg; + const char *rej_msg; + int status, errno; + u8 data_len; + + status = ev->status; + rej_msg = rdma_reject_msg(con->c.cm_id, status); + msg = rdma_consumer_reject_data(con->c.cm_id, ev, &data_len); + + if (msg && data_len >= sizeof(*msg)) { + errno = (int16_t)le16_to_cpu(msg->errno); + if (errno == -EBUSY) + rtrs_err(s, + "Previous session is still exists on the server, please reconnect later\n"); + else + rtrs_err(s, + "Connect rejected: status %d (%s), rtrs errno %d\n", + status, rej_msg, errno); + } else { + rtrs_err(s, + "Connect rejected but with malformed message: status %d (%s)\n", + status, rej_msg); + } + + return -ECONNRESET; +} + +static void rtrs_clt_close_conns(struct rtrs_clt_sess *sess, bool wait) +{ + if (rtrs_clt_change_state(sess, RTRS_CLT_CLOSING)) + queue_work(rtrs_wq, &sess->close_work); + if (wait) + flush_work(&sess->close_work); +} + +static inline void flag_error_on_conn(struct rtrs_clt_con *con, int cm_err) +{ + if (con->cm_err == 1) { + struct rtrs_clt_sess *sess; + + sess = to_clt_sess(con->c.sess); + if (atomic_dec_and_test(&sess->connected_cnt)) + + wake_up(&sess->state_wq); + } + con->cm_err = cm_err; +} + +static int rtrs_clt_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *ev) +{ + struct rtrs_clt_con *con = cm_id->context; + struct rtrs_sess *s = con->c.sess; + struct rtrs_clt_sess *sess = to_clt_sess(s); + int cm_err = 0; + + switch (ev->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + cm_err = rtrs_rdma_addr_resolved(con); + break; + case RDMA_CM_EVENT_ROUTE_RESOLVED: + cm_err = rtrs_rdma_route_resolved(con); + break; + case RDMA_CM_EVENT_ESTABLISHED: + con->cm_err = rtrs_rdma_conn_established(con, ev); + if (likely(!con->cm_err)) { + /* + * Report success and wake up. Here we abuse state_wq, + * i.e. wake up without state change, but we set cm_err. + */ + flag_success_on_conn(con); + wake_up(&sess->state_wq); + return 0; + } + break; + case RDMA_CM_EVENT_REJECTED: + cm_err = rtrs_rdma_conn_rejected(con, ev); + break; + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + rtrs_wrn(s, "CM error event %d\n", ev->event); + cm_err = -ECONNRESET; + break; + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + cm_err = -EHOSTUNREACH; + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + cm_err = -ECONNRESET; + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + /* + * Device removal is a special case. Queue close and return 0. + */ + rtrs_clt_close_conns(sess, false); + return 0; + default: + rtrs_err(s, "Unexpected RDMA CM event (%d)\n", ev->event); + cm_err = -ECONNRESET; + break; + } + + if (cm_err) { + /* + * cm error makes sense only on connection establishing, + * in other cases we rely on normal procedure of reconnecting. + */ + flag_error_on_conn(con, cm_err); + rtrs_rdma_error_recovery(con); + } + + return 0; +} + +static int create_cm(struct rtrs_clt_con *con) +{ + struct rtrs_sess *s = con->c.sess; + struct rtrs_clt_sess *sess = to_clt_sess(s); + struct rdma_cm_id *cm_id; + int err; + + cm_id = rdma_create_id(&init_net, rtrs_clt_rdma_cm_handler, con, + sess->s.dst_addr.ss_family == AF_IB ? + RDMA_PS_IB : RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) { + err = PTR_ERR(cm_id); + rtrs_err(s, "Failed to create CM ID, err: %d\n", err); + + return err; + } + con->c.cm_id = cm_id; + con->cm_err = 0; + /* allow the port to be reused */ + err = rdma_set_reuseaddr(cm_id, 1); + if (err != 0) { + rtrs_err(s, "Set address reuse failed, err: %d\n", err); + goto destroy_cm; + } + err = rdma_resolve_addr(cm_id, (struct sockaddr *)&sess->s.src_addr, + (struct sockaddr *)&sess->s.dst_addr, + RTRS_CONNECT_TIMEOUT_MS); + if (err) { + rtrs_err(s, "Failed to resolve address, err: %d\n", err); + goto destroy_cm; + } + /* + * Combine connection status and session events. This is needed + * for waiting two possible cases: cm_err has something meaningful + * or session state was really changed to error by device removal. + */ + err = wait_event_interruptible_timeout( + sess->state_wq, + con->cm_err || sess->state != RTRS_CLT_CONNECTING, + msecs_to_jiffies(RTRS_CONNECT_TIMEOUT_MS)); + if (err == 0 || err == -ERESTARTSYS) { + if (err == 0) + err = -ETIMEDOUT; + /* Timedout or interrupted */ + goto errr; + } + if (con->cm_err < 0) { + err = con->cm_err; + goto errr; + } + if (READ_ONCE(sess->state) != RTRS_CLT_CONNECTING) { + /* Device removal */ + err = -ECONNABORTED; + goto errr; + } + + return 0; + +errr: + stop_cm(con); + /* Is safe to call destroy if cq_qp is not inited */ + destroy_con_cq_qp(con); +destroy_cm: + destroy_cm(con); + + return err; +} + +static void rtrs_clt_sess_up(struct rtrs_clt_sess *sess) +{ + struct rtrs_clt *clt = sess->clt; + int up; + + /* + * We can fire RECONNECTED event only when all paths were + * connected on rtrs_clt_open(), then each was disconnected + * and the first one connected again. That's why this nasty + * game with counter value. + */ + + mutex_lock(&clt->paths_ev_mutex); + up = ++clt->paths_up; + /* + * Here it is safe to access paths num directly since up counter + * is greater than MAX_PATHS_NUM only while rtrs_clt_open() is + * in progress, thus paths removals are impossible. + */ + if (up > MAX_PATHS_NUM && up == MAX_PATHS_NUM + clt->paths_num) + clt->paths_up = clt->paths_num; + else if (up == 1) + clt->link_ev(clt->priv, RTRS_CLT_LINK_EV_RECONNECTED); + mutex_unlock(&clt->paths_ev_mutex); + + /* Mark session as established */ + sess->established = true; + sess->reconnect_attempts = 0; + sess->stats->reconnects.successful_cnt++; +} + +static void rtrs_clt_sess_down(struct rtrs_clt_sess *sess) +{ + struct rtrs_clt *clt = sess->clt; + + if (!sess->established) + return; + + sess->established = false; + mutex_lock(&clt->paths_ev_mutex); + WARN_ON(!clt->paths_up); + if (--clt->paths_up == 0) + clt->link_ev(clt->priv, RTRS_CLT_LINK_EV_DISCONNECTED); + mutex_unlock(&clt->paths_ev_mutex); +} + +static void rtrs_clt_stop_and_destroy_conns(struct rtrs_clt_sess *sess) +{ + struct rtrs_clt_con *con; + unsigned int cid; + + WARN_ON(READ_ONCE(sess->state) == RTRS_CLT_CONNECTED); + + /* + * Possible race with rtrs_clt_open(), when DEVICE_REMOVAL comes + * exactly in between. Start destroying after it finishes. + */ + mutex_lock(&sess->init_mutex); + mutex_unlock(&sess->init_mutex); + + /* + * All IO paths must observe !CONNECTED state before we + * free everything. + */ + synchronize_rcu(); + + rtrs_clt_stop_hb(sess); + + /* + * The order it utterly crucial: firstly disconnect and complete all + * rdma requests with error (thus set in_use=false for requests), + * then fail outstanding requests checking in_use for each, and + * eventually notify upper layer about session disconnection. + */ + + for (cid = 0; cid < sess->s.con_num; cid++) { + if (!sess->s.con[cid]) + break; + con = to_clt_con(sess->s.con[cid]); + stop_cm(con); + } + fail_all_outstanding_reqs(sess); + free_sess_reqs(sess); + rtrs_clt_sess_down(sess); + + /* + * Wait for graceful shutdown, namely when peer side invokes + * rdma_disconnect(). 'connected_cnt' is decremented only on + * CM events, thus if other side had crashed and hb has detected + * something is wrong, here we will stuck for exactly timeout ms, + * since CM does not fire anything. That is fine, we are not in + * hurry. + */ + wait_event_timeout(sess->state_wq, !atomic_read(&sess->connected_cnt), + msecs_to_jiffies(RTRS_CONNECT_TIMEOUT_MS)); + + for (cid = 0; cid < sess->s.con_num; cid++) { + if (!sess->s.con[cid]) + break; + con = to_clt_con(sess->s.con[cid]); + destroy_con_cq_qp(con); + destroy_cm(con); + destroy_con(con); + } +} + +static inline bool xchg_sessions(struct rtrs_clt_sess __rcu **rcu_ppcpu_path, + struct rtrs_clt_sess *sess, + struct rtrs_clt_sess *next) +{ + struct rtrs_clt_sess **ppcpu_path; + + /* Call cmpxchg() without sparse warnings */ + ppcpu_path = (typeof(ppcpu_path))rcu_ppcpu_path; + return sess == cmpxchg(ppcpu_path, sess, next); +} + +static void rtrs_clt_remove_path_from_arr(struct rtrs_clt_sess *sess) +{ + struct rtrs_clt *clt = sess->clt; + struct rtrs_clt_sess *next; + bool wait_for_grace = false; + int cpu; + + mutex_lock(&clt->paths_mutex); + list_del_rcu(&sess->s.entry); + + /* Make sure everybody observes path removal. */ + synchronize_rcu(); + + /* + * At this point nobody sees @sess in the list, but still we have + * dangling pointer @pcpu_path which _can_ point to @sess. Since + * nobody can observe @sess in the list, we guarantee that IO path + * will not assign @sess to @pcpu_path, i.e. @pcpu_path can be equal + * to @sess, but can never again become @sess. + */ + + /* + * Decrement paths number only after grace period, because + * caller of do_each_path() must firstly observe list without + * path and only then decremented paths number. + * + * Otherwise there can be the following situation: + * o Two paths exist and IO is coming. + * o One path is removed: + * CPU#0 CPU#1 + * do_each_path(): rtrs_clt_remove_path_from_arr(): + * path = get_next_path() + * ^^^ list_del_rcu(path) + * [!CONNECTED path] clt->paths_num-- + * ^^^^^^^^^ + * load clt->paths_num from 2 to 1 + * ^^^^^^^^^ + * sees 1 + * + * path is observed as !CONNECTED, but do_each_path() loop + * ends, because expression i < clt->paths_num is false. + */ + clt->paths_num--; + + /* + * Get @next connection from current @sess which is going to be + * removed. If @sess is the last element, then @next is NULL. + */ + rcu_read_lock(); + next = list_next_or_null_rr_rcu(&clt->paths_list, &sess->s.entry, + typeof(*next), s.entry); + rcu_read_unlock(); + + /* + * @pcpu paths can still point to the path which is going to be + * removed, so change the pointer manually. + */ + for_each_possible_cpu(cpu) { + struct rtrs_clt_sess __rcu **ppcpu_path; + + ppcpu_path = per_cpu_ptr(clt->pcpu_path, cpu); + if (rcu_dereference_protected(*ppcpu_path, + lockdep_is_held(&clt->paths_mutex)) != sess) + /* + * synchronize_rcu() was called just after deleting + * entry from the list, thus IO code path cannot + * change pointer back to the pointer which is going + * to be removed, we are safe here. + */ + continue; + + /* + * We race with IO code path, which also changes pointer, + * thus we have to be careful not to overwrite it. + */ + if (xchg_sessions(ppcpu_path, sess, next)) + /* + * @ppcpu_path was successfully replaced with @next, + * that means that someone could also pick up the + * @sess and dereferencing it right now, so wait for + * a grace period is required. + */ + wait_for_grace = true; + } + if (wait_for_grace) + synchronize_rcu(); + + mutex_unlock(&clt->paths_mutex); +} + +static void rtrs_clt_add_path_to_arr(struct rtrs_clt_sess *sess, + struct rtrs_addr *addr) +{ + struct rtrs_clt *clt = sess->clt; + + mutex_lock(&clt->paths_mutex); + clt->paths_num++; + + list_add_tail_rcu(&sess->s.entry, &clt->paths_list); + mutex_unlock(&clt->paths_mutex); +} + +static void rtrs_clt_close_work(struct work_struct *work) +{ + struct rtrs_clt_sess *sess; + + sess = container_of(work, struct rtrs_clt_sess, close_work); + + cancel_delayed_work_sync(&sess->reconnect_dwork); + rtrs_clt_stop_and_destroy_conns(sess); + rtrs_clt_change_state(sess, RTRS_CLT_CLOSED); +} + +static int init_conns(struct rtrs_clt_sess *sess) +{ + unsigned int cid; + int err; + + /* + * On every new session connections increase reconnect counter + * to avoid clashes with previous sessions not yet closed + * sessions on a server side. + */ + sess->s.recon_cnt++; + + /* Establish all RDMA connections */ + for (cid = 0; cid < sess->s.con_num; cid++) { + err = create_con(sess, cid); + if (err) + goto destroy; + + err = create_cm(to_clt_con(sess->s.con[cid])); + if (err) { + destroy_con(to_clt_con(sess->s.con[cid])); + goto destroy; + } + } + err = alloc_sess_reqs(sess); + if (err) + goto destroy; + + rtrs_clt_start_hb(sess); + + return 0; + +destroy: + while (cid--) { + struct rtrs_clt_con *con = to_clt_con(sess->s.con[cid]); + + stop_cm(con); + destroy_con_cq_qp(con); + destroy_cm(con); + destroy_con(con); + } + /* + * If we've never taken async path and got an error, say, + * doing rdma_resolve_addr(), switch to CONNECTION_ERR state + * manually to keep reconnecting. + */ + rtrs_clt_change_state(sess, RTRS_CLT_CONNECTING_ERR); + + return err; +} + +static void rtrs_clt_info_req_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rtrs_clt_con *con = cq->cq_context; + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + struct rtrs_iu *iu; + + iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe); + rtrs_iu_free(iu, DMA_TO_DEVICE, sess->s.dev->ib_dev, 1); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + rtrs_err(sess->clt, "Sess info request send failed: %s\n", + ib_wc_status_msg(wc->status)); + rtrs_clt_change_state(sess, RTRS_CLT_CONNECTING_ERR); + return; + } + + rtrs_clt_update_wc_stats(con); +} + +static int process_info_rsp(struct rtrs_clt_sess *sess, + const struct rtrs_msg_info_rsp *msg) +{ + unsigned int sg_cnt, total_len; + int i, sgi; + + sg_cnt = le16_to_cpu(msg->sg_cnt); + if (unlikely(!sg_cnt)) + return -EINVAL; + /* + * Check if IB immediate data size is enough to hold the mem_id and + * the offset inside the memory chunk. + */ + if (unlikely((ilog2(sg_cnt - 1) + 1) + + (ilog2(sess->chunk_size - 1) + 1) > + MAX_IMM_PAYL_BITS)) { + rtrs_err(sess->clt, + "RDMA immediate size (%db) not enough to encode %d buffers of size %dB\n", + MAX_IMM_PAYL_BITS, sg_cnt, sess->chunk_size); + return -EINVAL; + } + if (unlikely(!sg_cnt || (sess->queue_depth % sg_cnt))) { + rtrs_err(sess->clt, "Incorrect sg_cnt %d, is not multiple\n", + sg_cnt); + return -EINVAL; + } + total_len = 0; + for (sgi = 0, i = 0; sgi < sg_cnt && i < sess->queue_depth; sgi++) { + const struct rtrs_sg_desc *desc = &msg->desc[sgi]; + u32 len, rkey; + u64 addr; + + addr = le64_to_cpu(desc->addr); + rkey = le32_to_cpu(desc->key); + len = le32_to_cpu(desc->len); + + total_len += len; + + if (unlikely(!len || (len % sess->chunk_size))) { + rtrs_err(sess->clt, "Incorrect [%d].len %d\n", sgi, + len); + return -EINVAL; + } + for ( ; len && i < sess->queue_depth; i++) { + sess->rbufs[i].addr = addr; + sess->rbufs[i].rkey = rkey; + + len -= sess->chunk_size; + addr += sess->chunk_size; + } + } + /* Sanity check */ + if (unlikely(sgi != sg_cnt || i != sess->queue_depth)) { + rtrs_err(sess->clt, "Incorrect sg vector, not fully mapped\n"); + return -EINVAL; + } + if (unlikely(total_len != sess->chunk_size * sess->queue_depth)) { + rtrs_err(sess->clt, "Incorrect total_len %d\n", total_len); + return -EINVAL; + } + + return 0; +} + +static void rtrs_clt_info_rsp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rtrs_clt_con *con = cq->cq_context; + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + struct rtrs_msg_info_rsp *msg; + enum rtrs_clt_state state; + struct rtrs_iu *iu; + size_t rx_sz; + int err; + + state = RTRS_CLT_CONNECTING_ERR; + + WARN_ON(con->c.cid); + iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe); + if (unlikely(wc->status != IB_WC_SUCCESS)) { + rtrs_err(sess->clt, "Sess info response recv failed: %s\n", + ib_wc_status_msg(wc->status)); + goto out; + } + WARN_ON(wc->opcode != IB_WC_RECV); + + if (unlikely(wc->byte_len < sizeof(*msg))) { + rtrs_err(sess->clt, "Sess info response is malformed: size %d\n", + wc->byte_len); + goto out; + } + ib_dma_sync_single_for_cpu(sess->s.dev->ib_dev, iu->dma_addr, + iu->size, DMA_FROM_DEVICE); + msg = iu->buf; + if (unlikely(le16_to_cpu(msg->type) != RTRS_MSG_INFO_RSP)) { + rtrs_err(sess->clt, "Sess info response is malformed: type %d\n", + le16_to_cpu(msg->type)); + goto out; + } + rx_sz = sizeof(*msg); + rx_sz += sizeof(msg->desc[0]) * le16_to_cpu(msg->sg_cnt); + if (unlikely(wc->byte_len < rx_sz)) { + rtrs_err(sess->clt, "Sess info response is malformed: size %d\n", + wc->byte_len); + goto out; + } + err = process_info_rsp(sess, msg); + if (unlikely(err)) + goto out; + + err = post_recv_sess(sess); + if (unlikely(err)) + goto out; + + state = RTRS_CLT_CONNECTED; + +out: + rtrs_clt_update_wc_stats(con); + rtrs_iu_free(iu, DMA_FROM_DEVICE, sess->s.dev->ib_dev, 1); + rtrs_clt_change_state(sess, state); +} + +static int rtrs_send_sess_info(struct rtrs_clt_sess *sess) +{ + struct rtrs_clt_con *usr_con = to_clt_con(sess->s.con[0]); + struct rtrs_msg_info_req *msg; + struct rtrs_iu *tx_iu, *rx_iu; + size_t rx_sz; + int err; + + rx_sz = sizeof(struct rtrs_msg_info_rsp); + rx_sz += sizeof(u64) * MAX_SESS_QUEUE_DEPTH; + + tx_iu = rtrs_iu_alloc(1, sizeof(struct rtrs_msg_info_req), GFP_KERNEL, + sess->s.dev->ib_dev, DMA_TO_DEVICE, + rtrs_clt_info_req_done); + rx_iu = rtrs_iu_alloc(1, rx_sz, GFP_KERNEL, sess->s.dev->ib_dev, + DMA_FROM_DEVICE, rtrs_clt_info_rsp_done); + if (unlikely(!tx_iu || !rx_iu)) { + err = -ENOMEM; + goto out; + } + /* Prepare for getting info response */ + err = rtrs_iu_post_recv(&usr_con->c, rx_iu); + if (unlikely(err)) { + rtrs_err(sess->clt, "rtrs_iu_post_recv(), err: %d\n", err); + goto out; + } + rx_iu = NULL; + + msg = tx_iu->buf; + msg->type = cpu_to_le16(RTRS_MSG_INFO_REQ); + memcpy(msg->sessname, sess->s.sessname, sizeof(msg->sessname)); + + ib_dma_sync_single_for_device(sess->s.dev->ib_dev, tx_iu->dma_addr, + tx_iu->size, DMA_TO_DEVICE); + + /* Send info request */ + err = rtrs_iu_post_send(&usr_con->c, tx_iu, sizeof(*msg), NULL); + if (unlikely(err)) { + rtrs_err(sess->clt, "rtrs_iu_post_send(), err: %d\n", err); + goto out; + } + tx_iu = NULL; + + /* Wait for state change */ + wait_event_interruptible_timeout(sess->state_wq, + sess->state != RTRS_CLT_CONNECTING, + msecs_to_jiffies( + RTRS_CONNECT_TIMEOUT_MS)); + if (unlikely(READ_ONCE(sess->state) != RTRS_CLT_CONNECTED)) { + if (READ_ONCE(sess->state) == RTRS_CLT_CONNECTING_ERR) + err = -ECONNRESET; + else + err = -ETIMEDOUT; + goto out; + } + +out: + if (tx_iu) + rtrs_iu_free(tx_iu, DMA_TO_DEVICE, sess->s.dev->ib_dev, 1); + if (rx_iu) + rtrs_iu_free(rx_iu, DMA_FROM_DEVICE, sess->s.dev->ib_dev, 1); + if (unlikely(err)) + /* If we've never taken async path because of malloc problems */ + rtrs_clt_change_state(sess, RTRS_CLT_CONNECTING_ERR); + + return err; +} + +/** + * init_sess() - establishes all session connections and does handshake + * @sess: client session. + * In case of error full close or reconnect procedure should be taken, + * because reconnect or close async works can be started. + */ +static int init_sess(struct rtrs_clt_sess *sess) +{ + int err; + + mutex_lock(&sess->init_mutex); + err = init_conns(sess); + if (err) { + rtrs_err(sess->clt, "init_conns(), err: %d\n", err); + goto out; + } + err = rtrs_send_sess_info(sess); + if (err) { + rtrs_err(sess->clt, "rtrs_send_sess_info(), err: %d\n", err); + goto out; + } + rtrs_clt_sess_up(sess); +out: + mutex_unlock(&sess->init_mutex); + + return err; +} + +static void rtrs_clt_reconnect_work(struct work_struct *work) +{ + struct rtrs_clt_sess *sess; + struct rtrs_clt *clt; + unsigned int delay_ms; + int err; + + sess = container_of(to_delayed_work(work), struct rtrs_clt_sess, + reconnect_dwork); + clt = sess->clt; + + if (READ_ONCE(sess->state) != RTRS_CLT_RECONNECTING) + return; + + if (sess->reconnect_attempts >= clt->max_reconnect_attempts) { + /* Close a session completely if max attempts is reached */ + rtrs_clt_close_conns(sess, false); + return; + } + sess->reconnect_attempts++; + + /* Stop everything */ + rtrs_clt_stop_and_destroy_conns(sess); + msleep(RTRS_RECONNECT_BACKOFF); + if (rtrs_clt_change_state(sess, RTRS_CLT_CONNECTING)) { + err = init_sess(sess); + if (err) + goto reconnect_again; + } + + return; + +reconnect_again: + if (rtrs_clt_change_state(sess, RTRS_CLT_RECONNECTING)) { + sess->stats->reconnects.fail_cnt++; + delay_ms = clt->reconnect_delay_sec * 1000; + queue_delayed_work(rtrs_wq, &sess->reconnect_dwork, + msecs_to_jiffies(delay_ms)); + } +} + +static void rtrs_clt_dev_release(struct device *dev) +{ + struct rtrs_clt *clt = container_of(dev, struct rtrs_clt, dev); + + kfree(clt); +} + +static struct rtrs_clt *alloc_clt(const char *sessname, size_t paths_num, + u16 port, size_t pdu_sz, void *priv, + void (*link_ev)(void *priv, + enum rtrs_clt_link_ev ev), + unsigned int max_segments, + unsigned int reconnect_delay_sec, + unsigned int max_reconnect_attempts) +{ + struct rtrs_clt *clt; + int err; + + if (!paths_num || paths_num > MAX_PATHS_NUM) + return ERR_PTR(-EINVAL); + + if (strlen(sessname) >= sizeof(clt->sessname)) + return ERR_PTR(-EINVAL); + + clt = kzalloc(sizeof(*clt), GFP_KERNEL); + if (!clt) + return ERR_PTR(-ENOMEM); + + clt->pcpu_path = alloc_percpu(typeof(*clt->pcpu_path)); + if (!clt->pcpu_path) { + kfree(clt); + return ERR_PTR(-ENOMEM); + } + + uuid_gen(&clt->paths_uuid); + INIT_LIST_HEAD_RCU(&clt->paths_list); + clt->paths_num = paths_num; + clt->paths_up = MAX_PATHS_NUM; + clt->port = port; + clt->pdu_sz = pdu_sz; + clt->max_segments = max_segments; + clt->reconnect_delay_sec = reconnect_delay_sec; + clt->max_reconnect_attempts = max_reconnect_attempts; + clt->priv = priv; + clt->link_ev = link_ev; + clt->mp_policy = MP_POLICY_MIN_INFLIGHT; + strlcpy(clt->sessname, sessname, sizeof(clt->sessname)); + init_waitqueue_head(&clt->permits_wait); + mutex_init(&clt->paths_ev_mutex); + mutex_init(&clt->paths_mutex); + + clt->dev.class = rtrs_clt_dev_class; + clt->dev.release = rtrs_clt_dev_release; + err = dev_set_name(&clt->dev, "%s", sessname); + if (err) { + free_percpu(clt->pcpu_path); + kfree(clt); + return ERR_PTR(err); + } + /* + * Suppress user space notification until + * sysfs files are created + */ + dev_set_uevent_suppress(&clt->dev, true); + err = device_register(&clt->dev); + if (err) { + free_percpu(clt->pcpu_path); + put_device(&clt->dev); + return ERR_PTR(err); + } + + clt->kobj_paths = kobject_create_and_add("paths", &clt->dev.kobj); + if (!clt->kobj_paths) { + free_percpu(clt->pcpu_path); + device_unregister(&clt->dev); + return NULL; + } + err = rtrs_clt_create_sysfs_root_files(clt); + if (err) { + free_percpu(clt->pcpu_path); + kobject_del(clt->kobj_paths); + kobject_put(clt->kobj_paths); + device_unregister(&clt->dev); + return ERR_PTR(err); + } + dev_set_uevent_suppress(&clt->dev, false); + kobject_uevent(&clt->dev.kobj, KOBJ_ADD); + + return clt; +} + +static void wait_for_inflight_permits(struct rtrs_clt *clt) +{ + if (clt->permits_map) { + size_t sz = clt->queue_depth; + + wait_event(clt->permits_wait, + find_first_bit(clt->permits_map, sz) >= sz); + } +} + +static void free_clt(struct rtrs_clt *clt) +{ + wait_for_inflight_permits(clt); + free_permits(clt); + free_percpu(clt->pcpu_path); + mutex_destroy(&clt->paths_ev_mutex); + mutex_destroy(&clt->paths_mutex); + /* release callback will free clt in last put */ + device_unregister(&clt->dev); +} + +/** + * rtrs_clt_open() - Open a session to an RTRS server + * @ops: holds the link event callback and the private pointer. + * @sessname: name of the session + * @paths: Paths to be established defined by their src and dst addresses + * @paths_num: Number of elements in the @paths array + * @port: port to be used by the RTRS session + * @pdu_sz: Size of extra payload which can be accessed after permit allocation. + * @reconnect_delay_sec: time between reconnect tries + * @max_segments: Max. number of segments per IO request + * @max_reconnect_attempts: Number of times to reconnect on error before giving + * up, 0 for * disabled, -1 for forever + * + * Starts session establishment with the rtrs_server. The function can block + * up to ~2000ms before it returns. + * + * Return a valid pointer on success otherwise PTR_ERR. + */ +struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, + const char *sessname, + const struct rtrs_addr *paths, + size_t paths_num, u16 port, + size_t pdu_sz, u8 reconnect_delay_sec, + u16 max_segments, + s16 max_reconnect_attempts) +{ + struct rtrs_clt_sess *sess, *tmp; + struct rtrs_clt *clt; + int err, i; + + clt = alloc_clt(sessname, paths_num, port, pdu_sz, ops->priv, + ops->link_ev, + max_segments, reconnect_delay_sec, + max_reconnect_attempts); + if (IS_ERR(clt)) { + err = PTR_ERR(clt); + goto out; + } + for (i = 0; i < paths_num; i++) { + struct rtrs_clt_sess *sess; + + sess = alloc_sess(clt, &paths[i], nr_cpu_ids, + max_segments); + if (IS_ERR(sess)) { + err = PTR_ERR(sess); + goto close_all_sess; + } + list_add_tail_rcu(&sess->s.entry, &clt->paths_list); + + err = init_sess(sess); + if (err) { + list_del_rcu(&sess->s.entry); + rtrs_clt_close_conns(sess, true); + free_sess(sess); + goto close_all_sess; + } + + err = rtrs_clt_create_sess_files(sess); + if (err) { + list_del_rcu(&sess->s.entry); + rtrs_clt_close_conns(sess, true); + free_sess(sess); + goto close_all_sess; + } + } + err = alloc_permits(clt); + if (err) + goto close_all_sess; + + return clt; + +close_all_sess: + list_for_each_entry_safe(sess, tmp, &clt->paths_list, s.entry) { + rtrs_clt_destroy_sess_files(sess, NULL); + rtrs_clt_close_conns(sess, true); + kobject_put(&sess->kobj); + } + rtrs_clt_destroy_sysfs_root_files(clt); + rtrs_clt_destroy_sysfs_root_folders(clt); + free_clt(clt); + +out: + return ERR_PTR(err); +} +EXPORT_SYMBOL(rtrs_clt_open); + +/** + * rtrs_clt_close() - Close a session + * @clt: Session handle. Session is freed upon return. + */ +void rtrs_clt_close(struct rtrs_clt *clt) +{ + struct rtrs_clt_sess *sess, *tmp; + + /* Firstly forbid sysfs access */ + rtrs_clt_destroy_sysfs_root_files(clt); + rtrs_clt_destroy_sysfs_root_folders(clt); + + /* Now it is safe to iterate over all paths without locks */ + list_for_each_entry_safe(sess, tmp, &clt->paths_list, s.entry) { + rtrs_clt_destroy_sess_files(sess, NULL); + rtrs_clt_close_conns(sess, true); + kobject_put(&sess->kobj); + } + free_clt(clt); +} +EXPORT_SYMBOL(rtrs_clt_close); + +int rtrs_clt_reconnect_from_sysfs(struct rtrs_clt_sess *sess) +{ + enum rtrs_clt_state old_state; + int err = -EBUSY; + bool changed; + + changed = rtrs_clt_change_state_get_old(sess, RTRS_CLT_RECONNECTING, + &old_state); + if (changed) { + sess->reconnect_attempts = 0; + queue_delayed_work(rtrs_wq, &sess->reconnect_dwork, 0); + } + if (changed || old_state == RTRS_CLT_RECONNECTING) { + /* + * flush_delayed_work() queues pending work for immediate + * execution, so do the flush if we have queued something + * right now or work is pending. + */ + flush_delayed_work(&sess->reconnect_dwork); + err = (READ_ONCE(sess->state) == + RTRS_CLT_CONNECTED ? 0 : -ENOTCONN); + } + + return err; +} + +int rtrs_clt_disconnect_from_sysfs(struct rtrs_clt_sess *sess) +{ + rtrs_clt_close_conns(sess, true); + + return 0; +} + +int rtrs_clt_remove_path_from_sysfs(struct rtrs_clt_sess *sess, + const struct attribute *sysfs_self) +{ + enum rtrs_clt_state old_state; + bool changed; + + /* + * Continue stopping path till state was changed to DEAD or + * state was observed as DEAD: + * 1. State was changed to DEAD - we were fast and nobody + * invoked rtrs_clt_reconnect(), which can again start + * reconnecting. + * 2. State was observed as DEAD - we have someone in parallel + * removing the path. + */ + do { + rtrs_clt_close_conns(sess, true); + changed = rtrs_clt_change_state_get_old(sess, + RTRS_CLT_DEAD, + &old_state); + } while (!changed && old_state != RTRS_CLT_DEAD); + + if (likely(changed)) { + rtrs_clt_destroy_sess_files(sess, sysfs_self); + rtrs_clt_remove_path_from_arr(sess); + kobject_put(&sess->kobj); + } + + return 0; +} + +void rtrs_clt_set_max_reconnect_attempts(struct rtrs_clt *clt, int value) +{ + clt->max_reconnect_attempts = (unsigned int)value; +} + +int rtrs_clt_get_max_reconnect_attempts(const struct rtrs_clt *clt) +{ + return (int)clt->max_reconnect_attempts; +} + +/** + * rtrs_clt_request() - Request data transfer to/from server via RDMA. + * + * @dir: READ/WRITE + * @ops: callback function to be called as confirmation, and the pointer. + * @clt: Session + * @permit: Preallocated permit + * @vec: Message that is sent to server together with the request. + * Sum of len of all @vec elements limited to <= IO_MSG_SIZE. + * Since the msg is copied internally it can be allocated on stack. + * @nr: Number of elements in @vec. + * @data_len: length of data sent to/from server + * @sg: Pages to be sent/received to/from server. + * @sg_cnt: Number of elements in the @sg + * + * Return: + * 0: Success + * <0: Error + * + * On dir=READ rtrs client will request a data transfer from Server to client. + * The data that the server will respond with will be stored in @sg when + * the user receives an %RTRS_CLT_RDMA_EV_RDMA_REQUEST_WRITE_COMPL event. + * On dir=WRITE rtrs client will rdma write data in sg to server side. + */ +int rtrs_clt_request(int dir, struct rtrs_clt_req_ops *ops, + struct rtrs_clt *clt, struct rtrs_permit *permit, + const struct kvec *vec, size_t nr, size_t data_len, + struct scatterlist *sg, unsigned int sg_cnt) +{ + struct rtrs_clt_io_req *req; + struct rtrs_clt_sess *sess; + + enum dma_data_direction dma_dir; + int err = -ECONNABORTED, i; + size_t usr_len, hdr_len; + struct path_it it; + + /* Get kvec length */ + for (i = 0, usr_len = 0; i < nr; i++) + usr_len += vec[i].iov_len; + + if (dir == READ) { + hdr_len = sizeof(struct rtrs_msg_rdma_read) + + sg_cnt * sizeof(struct rtrs_sg_desc); + dma_dir = DMA_FROM_DEVICE; + } else { + hdr_len = sizeof(struct rtrs_msg_rdma_write); + dma_dir = DMA_TO_DEVICE; + } + + do_each_path(sess, clt, &it) { + if (unlikely(READ_ONCE(sess->state) != RTRS_CLT_CONNECTED)) + continue; + + if (unlikely(usr_len + hdr_len > sess->max_hdr_size)) { + rtrs_wrn_rl(sess->clt, + "%s request failed, user message size is %zu and header length %zu, but max size is %u\n", + dir == READ ? "Read" : "Write", + usr_len, hdr_len, sess->max_hdr_size); + err = -EMSGSIZE; + break; + } + req = rtrs_clt_get_req(sess, ops->conf_fn, permit, ops->priv, + vec, usr_len, sg, sg_cnt, data_len, + dma_dir); + if (dir == READ) + err = rtrs_clt_read_req(req); + else + err = rtrs_clt_write_req(req); + if (unlikely(err)) { + req->in_use = false; + continue; + } + /* Success path */ + break; + } while_each_path(&it); + + return err; +} +EXPORT_SYMBOL(rtrs_clt_request); + +/** + * rtrs_clt_query() - queries RTRS session attributes + *@clt: session pointer + *@attr: query results for session attributes. + * Returns: + * 0 on success + * -ECOMM no connection to the server + */ +int rtrs_clt_query(struct rtrs_clt *clt, struct rtrs_attrs *attr) +{ + if (!rtrs_clt_is_connected(clt)) + return -ECOMM; + + attr->queue_depth = clt->queue_depth; + attr->max_io_size = clt->max_io_size; + attr->sess_kobj = &clt->dev.kobj; + strlcpy(attr->sessname, clt->sessname, sizeof(attr->sessname)); + + return 0; +} +EXPORT_SYMBOL(rtrs_clt_query); + +int rtrs_clt_create_path_from_sysfs(struct rtrs_clt *clt, + struct rtrs_addr *addr) +{ + struct rtrs_clt_sess *sess; + int err; + + sess = alloc_sess(clt, addr, nr_cpu_ids, clt->max_segments); + if (IS_ERR(sess)) + return PTR_ERR(sess); + + /* + * It is totally safe to add path in CONNECTING state: coming + * IO will never grab it. Also it is very important to add + * path before init, since init fires LINK_CONNECTED event. + */ + rtrs_clt_add_path_to_arr(sess, addr); + + err = init_sess(sess); + if (err) + goto close_sess; + + err = rtrs_clt_create_sess_files(sess); + if (err) + goto close_sess; + + return 0; + +close_sess: + rtrs_clt_remove_path_from_arr(sess); + rtrs_clt_close_conns(sess, true); + free_sess(sess); + + return err; +} + +static int rtrs_clt_ib_dev_init(struct rtrs_ib_dev *dev) +{ + if (!(dev->ib_dev->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS)) { + pr_err("Memory registrations not supported.\n"); + return -ENOTSUPP; + } + + return 0; +} + +static const struct rtrs_rdma_dev_pd_ops dev_pd_ops = { + .init = rtrs_clt_ib_dev_init +}; + +static int __init rtrs_client_init(void) +{ + rtrs_rdma_dev_pd_init(0, &dev_pd); + + rtrs_clt_dev_class = class_create(THIS_MODULE, "rtrs-client"); + if (IS_ERR(rtrs_clt_dev_class)) { + pr_err("Failed to create rtrs-client dev class\n"); + return PTR_ERR(rtrs_clt_dev_class); + } + rtrs_wq = alloc_workqueue("rtrs_client_wq", WQ_MEM_RECLAIM, 0); + if (!rtrs_wq) { + class_destroy(rtrs_clt_dev_class); + return -ENOMEM; + } + + return 0; +} + +static void __exit rtrs_client_exit(void) +{ + destroy_workqueue(rtrs_wq); + class_destroy(rtrs_clt_dev_class); + rtrs_rdma_dev_pd_deinit(&dev_pd); +} + +module_init(rtrs_client_init); +module_exit(rtrs_client_exit); -- cgit v1.2.3 From 89dd4c3bdc46688b1af53298890161d22f7314cb Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:13 +0200 Subject: RDMA/rtrs: client: statistics functions This introduces set of functions used on client side to account statistics of RDMA data sent/received, amount of IOs inflight, latency, cpu migrations, etc. Almost all statistics are collected using percpu variables. Link: https://lore.kernel.org/r/20200511135131.27580-8-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c | 200 +++++++++++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c b/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c new file mode 100644 index 000000000000..26bbe5d6dff5 --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-stats.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Transport Layer + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include "rtrs-clt.h" + +void rtrs_clt_update_wc_stats(struct rtrs_clt_con *con) +{ + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + struct rtrs_clt_stats *stats = sess->stats; + struct rtrs_clt_stats_pcpu *s; + int cpu; + + cpu = raw_smp_processor_id(); + s = this_cpu_ptr(stats->pcpu_stats); + if (unlikely(con->cpu != cpu)) { + s->cpu_migr.to++; + + /* Careful here, override s pointer */ + s = per_cpu_ptr(stats->pcpu_stats, con->cpu); + atomic_inc(&s->cpu_migr.from); + } +} + +void rtrs_clt_inc_failover_cnt(struct rtrs_clt_stats *stats) +{ + struct rtrs_clt_stats_pcpu *s; + + s = this_cpu_ptr(stats->pcpu_stats); + s->rdma.failover_cnt++; +} + +int rtrs_clt_stats_migration_cnt_to_str(struct rtrs_clt_stats *stats, + char *buf, size_t len) +{ + struct rtrs_clt_stats_pcpu *s; + + size_t used; + int cpu; + + used = scnprintf(buf, len, " "); + for_each_possible_cpu(cpu) + used += scnprintf(buf + used, len - used, " CPU%u", cpu); + + used += scnprintf(buf + used, len - used, "\nfrom:"); + for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + used += scnprintf(buf + used, len - used, " %d", + atomic_read(&s->cpu_migr.from)); + } + + used += scnprintf(buf + used, len - used, "\nto :"); + for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + used += scnprintf(buf + used, len - used, " %d", + s->cpu_migr.to); + } + used += scnprintf(buf + used, len - used, "\n"); + + return used; +} + +int rtrs_clt_stats_reconnects_to_str(struct rtrs_clt_stats *stats, char *buf, + size_t len) +{ + return scnprintf(buf, len, "%d %d\n", + stats->reconnects.successful_cnt, + stats->reconnects.fail_cnt); +} + +ssize_t rtrs_clt_stats_rdma_to_str(struct rtrs_clt_stats *stats, + char *page, size_t len) +{ + struct rtrs_clt_stats_rdma sum; + struct rtrs_clt_stats_rdma *r; + int cpu; + + memset(&sum, 0, sizeof(sum)); + + for_each_possible_cpu(cpu) { + r = &per_cpu_ptr(stats->pcpu_stats, cpu)->rdma; + + sum.dir[READ].cnt += r->dir[READ].cnt; + sum.dir[READ].size_total += r->dir[READ].size_total; + sum.dir[WRITE].cnt += r->dir[WRITE].cnt; + sum.dir[WRITE].size_total += r->dir[WRITE].size_total; + sum.failover_cnt += r->failover_cnt; + } + + return scnprintf(page, len, "%llu %llu %llu %llu %u %llu\n", + sum.dir[READ].cnt, sum.dir[READ].size_total, + sum.dir[WRITE].cnt, sum.dir[WRITE].size_total, + atomic_read(&stats->inflight), sum.failover_cnt); +} + +ssize_t rtrs_clt_reset_all_help(struct rtrs_clt_stats *s, + char *page, size_t len) +{ + return scnprintf(page, len, "echo 1 to reset all statistics\n"); +} + +int rtrs_clt_reset_rdma_stats(struct rtrs_clt_stats *stats, bool enable) +{ + struct rtrs_clt_stats_pcpu *s; + int cpu; + + if (!enable) + return -EINVAL; + + for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + memset(&s->rdma, 0, sizeof(s->rdma)); + } + + return 0; +} + +int rtrs_clt_reset_cpu_migr_stats(struct rtrs_clt_stats *stats, bool enable) +{ + struct rtrs_clt_stats_pcpu *s; + int cpu; + + if (!enable) + return -EINVAL; + + for_each_possible_cpu(cpu) { + s = per_cpu_ptr(stats->pcpu_stats, cpu); + memset(&s->cpu_migr, 0, sizeof(s->cpu_migr)); + } + + return 0; +} + +int rtrs_clt_reset_reconnects_stat(struct rtrs_clt_stats *stats, bool enable) +{ + if (!enable) + return -EINVAL; + + memset(&stats->reconnects, 0, sizeof(stats->reconnects)); + + return 0; +} + +int rtrs_clt_reset_all_stats(struct rtrs_clt_stats *s, bool enable) +{ + if (enable) { + rtrs_clt_reset_rdma_stats(s, enable); + rtrs_clt_reset_cpu_migr_stats(s, enable); + rtrs_clt_reset_reconnects_stat(s, enable); + atomic_set(&s->inflight, 0); + return 0; + } + + return -EINVAL; +} + +static inline void rtrs_clt_update_rdma_stats(struct rtrs_clt_stats *stats, + size_t size, int d) +{ + struct rtrs_clt_stats_pcpu *s; + + s = this_cpu_ptr(stats->pcpu_stats); + s->rdma.dir[d].cnt++; + s->rdma.dir[d].size_total += size; +} + +void rtrs_clt_update_all_stats(struct rtrs_clt_io_req *req, int dir) +{ + struct rtrs_clt_con *con = req->con; + struct rtrs_clt_sess *sess = to_clt_sess(con->c.sess); + struct rtrs_clt_stats *stats = sess->stats; + unsigned int len; + + len = req->usr_len + req->data_len; + rtrs_clt_update_rdma_stats(stats, len, dir); + if (sess->clt->mp_policy == MP_POLICY_MIN_INFLIGHT) + atomic_inc(&stats->inflight); +} + +int rtrs_clt_init_stats(struct rtrs_clt_stats *stats) +{ + stats->pcpu_stats = alloc_percpu(typeof(*stats->pcpu_stats)); + if (!stats->pcpu_stats) + return -ENOMEM; + + /* + * successful_cnt will be set to 0 after session + * is established for the first time + */ + stats->reconnects.successful_cnt = -1; + + return 0; +} -- cgit v1.2.3 From 215378b838df0019097a5266ebec1269ebd27f89 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:14 +0200 Subject: RDMA/rtrs: client: sysfs interface functions This is the sysfs interface to rtrs sessions on client side: /sys/class/rtrs-client// *** rtrs session created by rtrs_clt_open() API call | |- max_reconnect_attempts | *** number of reconnect attempts for session | |- add_path | *** adds another connection path into rtrs session | |- paths// *** established paths to server in a session | |- disconnect | *** disconnect path | |- reconnect | *** reconnect path | |- remove_path | *** remove current path | |- state | *** retrieve current path state | |- hca_port | *** HCA port number | |- hca_name | *** HCA name | |- stats/ *** current path statistics | |- cpu_migration |- rdma |- reconnects |- reset_all Link: https://lore.kernel.org/r/20200511135131.27580-9-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c | 483 +++++++++++++++++++++++++++ 1 file changed, 483 insertions(+) create mode 100644 drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c new file mode 100644 index 000000000000..298b747d0330 --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c @@ -0,0 +1,483 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Transport Layer + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include "rtrs-pri.h" +#include "rtrs-clt.h" +#include "rtrs-log.h" + +#define MIN_MAX_RECONN_ATT -1 +#define MAX_MAX_RECONN_ATT 9999 + +static void rtrs_clt_sess_release(struct kobject *kobj) +{ + struct rtrs_clt_sess *sess; + + sess = container_of(kobj, struct rtrs_clt_sess, kobj); + + free_sess(sess); +} + +static struct kobj_type ktype_sess = { + .sysfs_ops = &kobj_sysfs_ops, + .release = rtrs_clt_sess_release +}; + +static void rtrs_clt_sess_stats_release(struct kobject *kobj) +{ + struct rtrs_clt_stats *stats; + + stats = container_of(kobj, struct rtrs_clt_stats, kobj_stats); + + free_percpu(stats->pcpu_stats); + + kfree(stats); +} + +static struct kobj_type ktype_stats = { + .sysfs_ops = &kobj_sysfs_ops, + .release = rtrs_clt_sess_stats_release, +}; + +static ssize_t max_reconnect_attempts_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct rtrs_clt *clt = container_of(dev, struct rtrs_clt, dev); + + return sprintf(page, "%d\n", rtrs_clt_get_max_reconnect_attempts(clt)); +} + +static ssize_t max_reconnect_attempts_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + int value; + int ret; + struct rtrs_clt *clt = container_of(dev, struct rtrs_clt, dev); + + ret = kstrtoint(buf, 10, &value); + if (ret) { + rtrs_err(clt, "%s: failed to convert string '%s' to int\n", + attr->attr.name, buf); + return ret; + } + if (value > MAX_MAX_RECONN_ATT || + value < MIN_MAX_RECONN_ATT) { + rtrs_err(clt, + "%s: invalid range (provided: '%s', accepted: min: %d, max: %d)\n", + attr->attr.name, buf, MIN_MAX_RECONN_ATT, + MAX_MAX_RECONN_ATT); + return -EINVAL; + } + rtrs_clt_set_max_reconnect_attempts(clt, value); + + return count; +} + +static DEVICE_ATTR_RW(max_reconnect_attempts); + +static ssize_t mpath_policy_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct rtrs_clt *clt; + + clt = container_of(dev, struct rtrs_clt, dev); + + switch (clt->mp_policy) { + case MP_POLICY_RR: + return sprintf(page, "round-robin (RR: %d)\n", clt->mp_policy); + case MP_POLICY_MIN_INFLIGHT: + return sprintf(page, "min-inflight (MI: %d)\n", clt->mp_policy); + default: + return sprintf(page, "Unknown (%d)\n", clt->mp_policy); + } +} + +static ssize_t mpath_policy_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct rtrs_clt *clt; + int value; + int ret; + + clt = container_of(dev, struct rtrs_clt, dev); + + ret = kstrtoint(buf, 10, &value); + if (!ret && (value == MP_POLICY_RR || + value == MP_POLICY_MIN_INFLIGHT)) { + clt->mp_policy = value; + return count; + } + + if (!strncasecmp(buf, "round-robin", 11) || + !strncasecmp(buf, "rr", 2)) + clt->mp_policy = MP_POLICY_RR; + else if (!strncasecmp(buf, "min-inflight", 12) || + !strncasecmp(buf, "mi", 2)) + clt->mp_policy = MP_POLICY_MIN_INFLIGHT; + else + return -EINVAL; + + return count; +} + +static DEVICE_ATTR_RW(mpath_policy); + +static ssize_t add_path_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo [@] > %s\n\n*addr ::= [ ip: | gid: ]\n", + attr->attr.name); +} + +static ssize_t add_path_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct sockaddr_storage srcaddr, dstaddr; + struct rtrs_addr addr = { + .src = &srcaddr, + .dst = &dstaddr + }; + struct rtrs_clt *clt; + const char *nl; + size_t len; + int err; + + clt = container_of(dev, struct rtrs_clt, dev); + + nl = strchr(buf, '\n'); + if (nl) + len = nl - buf; + else + len = count; + err = rtrs_addr_to_sockaddr(buf, len, clt->port, &addr); + if (err) + return -EINVAL; + + err = rtrs_clt_create_path_from_sysfs(clt, &addr); + if (err) + return err; + + return count; +} + +static DEVICE_ATTR_RW(add_path); + +static ssize_t rtrs_clt_state_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rtrs_clt_sess *sess; + + sess = container_of(kobj, struct rtrs_clt_sess, kobj); + if (sess->state == RTRS_CLT_CONNECTED) + return sprintf(page, "connected\n"); + + return sprintf(page, "disconnected\n"); +} + +static struct kobj_attribute rtrs_clt_state_attr = + __ATTR(state, 0444, rtrs_clt_state_show, NULL); + +static ssize_t rtrs_clt_reconnect_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rtrs_clt_reconnect_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rtrs_clt_sess *sess; + int ret; + + sess = container_of(kobj, struct rtrs_clt_sess, kobj); + if (!sysfs_streq(buf, "1")) { + rtrs_err(sess->clt, "%s: unknown value: '%s'\n", + attr->attr.name, buf); + return -EINVAL; + } + ret = rtrs_clt_reconnect_from_sysfs(sess); + if (ret) + return ret; + + return count; +} + +static struct kobj_attribute rtrs_clt_reconnect_attr = + __ATTR(reconnect, 0644, rtrs_clt_reconnect_show, + rtrs_clt_reconnect_store); + +static ssize_t rtrs_clt_disconnect_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rtrs_clt_disconnect_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rtrs_clt_sess *sess; + int ret; + + sess = container_of(kobj, struct rtrs_clt_sess, kobj); + if (!sysfs_streq(buf, "1")) { + rtrs_err(sess->clt, "%s: unknown value: '%s'\n", + attr->attr.name, buf); + return -EINVAL; + } + ret = rtrs_clt_disconnect_from_sysfs(sess); + if (ret) + return ret; + + return count; +} + +static struct kobj_attribute rtrs_clt_disconnect_attr = + __ATTR(disconnect, 0644, rtrs_clt_disconnect_show, + rtrs_clt_disconnect_store); + +static ssize_t rtrs_clt_remove_path_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rtrs_clt_remove_path_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rtrs_clt_sess *sess; + int ret; + + sess = container_of(kobj, struct rtrs_clt_sess, kobj); + if (!sysfs_streq(buf, "1")) { + rtrs_err(sess->clt, "%s: unknown value: '%s'\n", + attr->attr.name, buf); + return -EINVAL; + } + ret = rtrs_clt_remove_path_from_sysfs(sess, &attr->attr); + if (ret) + return ret; + + return count; +} + +static struct kobj_attribute rtrs_clt_remove_path_attr = + __ATTR(remove_path, 0644, rtrs_clt_remove_path_show, + rtrs_clt_remove_path_store); + +STAT_ATTR(struct rtrs_clt_stats, cpu_migration, + rtrs_clt_stats_migration_cnt_to_str, + rtrs_clt_reset_cpu_migr_stats); + +STAT_ATTR(struct rtrs_clt_stats, reconnects, + rtrs_clt_stats_reconnects_to_str, + rtrs_clt_reset_reconnects_stat); + +STAT_ATTR(struct rtrs_clt_stats, rdma, + rtrs_clt_stats_rdma_to_str, + rtrs_clt_reset_rdma_stats); + +STAT_ATTR(struct rtrs_clt_stats, reset_all, + rtrs_clt_reset_all_help, + rtrs_clt_reset_all_stats); + +static struct attribute *rtrs_clt_stats_attrs[] = { + &cpu_migration_attr.attr, + &reconnects_attr.attr, + &rdma_attr.attr, + &reset_all_attr.attr, + NULL +}; + +static struct attribute_group rtrs_clt_stats_attr_group = { + .attrs = rtrs_clt_stats_attrs, +}; + +static ssize_t rtrs_clt_hca_port_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rtrs_clt_sess *sess; + + sess = container_of(kobj, typeof(*sess), kobj); + + return scnprintf(page, PAGE_SIZE, "%u\n", sess->hca_port); +} + +static struct kobj_attribute rtrs_clt_hca_port_attr = + __ATTR(hca_port, 0444, rtrs_clt_hca_port_show, NULL); + +static ssize_t rtrs_clt_hca_name_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rtrs_clt_sess *sess; + + sess = container_of(kobj, struct rtrs_clt_sess, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", sess->hca_name); +} + +static struct kobj_attribute rtrs_clt_hca_name_attr = + __ATTR(hca_name, 0444, rtrs_clt_hca_name_show, NULL); + +static ssize_t rtrs_clt_src_addr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rtrs_clt_sess *sess; + int cnt; + + sess = container_of(kobj, struct rtrs_clt_sess, kobj); + cnt = sockaddr_to_str((struct sockaddr *)&sess->s.src_addr, + page, PAGE_SIZE); + return cnt + scnprintf(page + cnt, PAGE_SIZE - cnt, "\n"); +} + +static struct kobj_attribute rtrs_clt_src_addr_attr = + __ATTR(src_addr, 0444, rtrs_clt_src_addr_show, NULL); + +static ssize_t rtrs_clt_dst_addr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rtrs_clt_sess *sess; + int cnt; + + sess = container_of(kobj, struct rtrs_clt_sess, kobj); + cnt = sockaddr_to_str((struct sockaddr *)&sess->s.dst_addr, + page, PAGE_SIZE); + return cnt + scnprintf(page + cnt, PAGE_SIZE - cnt, "\n"); +} + +static struct kobj_attribute rtrs_clt_dst_addr_attr = + __ATTR(dst_addr, 0444, rtrs_clt_dst_addr_show, NULL); + +static struct attribute *rtrs_clt_sess_attrs[] = { + &rtrs_clt_hca_name_attr.attr, + &rtrs_clt_hca_port_attr.attr, + &rtrs_clt_src_addr_attr.attr, + &rtrs_clt_dst_addr_attr.attr, + &rtrs_clt_state_attr.attr, + &rtrs_clt_reconnect_attr.attr, + &rtrs_clt_disconnect_attr.attr, + &rtrs_clt_remove_path_attr.attr, + NULL, +}; + +static struct attribute_group rtrs_clt_sess_attr_group = { + .attrs = rtrs_clt_sess_attrs, +}; + +int rtrs_clt_create_sess_files(struct rtrs_clt_sess *sess) +{ + struct rtrs_clt *clt = sess->clt; + char str[NAME_MAX]; + int err, cnt; + + cnt = sockaddr_to_str((struct sockaddr *)&sess->s.src_addr, + str, sizeof(str)); + cnt += scnprintf(str + cnt, sizeof(str) - cnt, "@"); + sockaddr_to_str((struct sockaddr *)&sess->s.dst_addr, + str + cnt, sizeof(str) - cnt); + + err = kobject_init_and_add(&sess->kobj, &ktype_sess, clt->kobj_paths, + "%s", str); + if (err) { + pr_err("kobject_init_and_add: %d\n", err); + return err; + } + err = sysfs_create_group(&sess->kobj, &rtrs_clt_sess_attr_group); + if (err) { + pr_err("sysfs_create_group(): %d\n", err); + goto put_kobj; + } + err = kobject_init_and_add(&sess->stats->kobj_stats, &ktype_stats, + &sess->kobj, "stats"); + if (err) { + pr_err("kobject_init_and_add: %d\n", err); + goto remove_group; + } + + err = sysfs_create_group(&sess->stats->kobj_stats, + &rtrs_clt_stats_attr_group); + if (err) { + pr_err("failed to create stats sysfs group, err: %d\n", err); + goto put_kobj_stats; + } + + return 0; + +put_kobj_stats: + kobject_del(&sess->stats->kobj_stats); + kobject_put(&sess->stats->kobj_stats); +remove_group: + sysfs_remove_group(&sess->kobj, &rtrs_clt_sess_attr_group); +put_kobj: + kobject_del(&sess->kobj); + kobject_put(&sess->kobj); + + return err; +} + +void rtrs_clt_destroy_sess_files(struct rtrs_clt_sess *sess, + const struct attribute *sysfs_self) +{ + kobject_del(&sess->stats->kobj_stats); + kobject_put(&sess->stats->kobj_stats); + if (sysfs_self) + sysfs_remove_file_self(&sess->kobj, sysfs_self); + kobject_del(&sess->kobj); +} + +static struct attribute *rtrs_clt_attrs[] = { + &dev_attr_max_reconnect_attempts.attr, + &dev_attr_mpath_policy.attr, + &dev_attr_add_path.attr, + NULL, +}; + +static struct attribute_group rtrs_clt_attr_group = { + .attrs = rtrs_clt_attrs, +}; + +int rtrs_clt_create_sysfs_root_files(struct rtrs_clt *clt) +{ + return sysfs_create_group(&clt->dev.kobj, &rtrs_clt_attr_group); +} + +void rtrs_clt_destroy_sysfs_root_folders(struct rtrs_clt *clt) +{ + if (clt->kobj_paths) { + kobject_del(clt->kobj_paths); + kobject_put(clt->kobj_paths); + } +} + +void rtrs_clt_destroy_sysfs_root_files(struct rtrs_clt *clt) +{ + sysfs_remove_group(&clt->dev.kobj, &rtrs_clt_attr_group); +} -- cgit v1.2.3 From 787f78a6b075ac7678123d5d0cac2c57d98c63e1 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:15 +0200 Subject: RDMA/rtrs: server: private header with server structs and functions This header describes main structs and functions used by rtrs-server module, mainly for accepting rtrs sessions, creating/destroying sysfs entries, accounting statistics on server side. Link: https://lore.kernel.org/r/20200511135131.27580-10-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-srv.h | 148 +++++++++++++++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 drivers/infiniband/ulp/rtrs/rtrs-srv.h diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.h b/drivers/infiniband/ulp/rtrs/rtrs-srv.h new file mode 100644 index 000000000000..dc95b0932f0d --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.h @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Transport Layer + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ + +#ifndef RTRS_SRV_H +#define RTRS_SRV_H + +#include +#include +#include "rtrs-pri.h" + +/* + * enum rtrs_srv_state - Server states. + */ +enum rtrs_srv_state { + RTRS_SRV_CONNECTING, + RTRS_SRV_CONNECTED, + RTRS_SRV_CLOSING, + RTRS_SRV_CLOSED, +}; + +/* stats for Read and write operation. + * see Documentation/ABI/testing/sysfs-class-rtrs-server for details + */ +struct rtrs_srv_stats_rdma_stats { + struct { + atomic64_t cnt; + atomic64_t size_total; + } dir[2]; +}; + +struct rtrs_srv_stats { + struct kobject kobj_stats; + struct rtrs_srv_stats_rdma_stats rdma_stats; + struct rtrs_srv_sess *sess; +}; + +struct rtrs_srv_con { + struct rtrs_con c; + atomic_t wr_cnt; + atomic_t sq_wr_avail; + struct list_head rsp_wr_wait_list; + spinlock_t rsp_wr_wait_lock; +}; + +/* IO context in rtrs_srv, each io has one */ +struct rtrs_srv_op { + struct rtrs_srv_con *con; + u32 msg_id; + u8 dir; + struct rtrs_msg_rdma_read *rd_msg; + struct ib_rdma_wr tx_wr; + struct ib_sge tx_sg; + struct list_head wait_list; + int status; +}; + +/* + * server side memory region context, when always_invalidate=Y, we need + * queue_depth of memory regrion to invalidate each memory region. + */ +struct rtrs_srv_mr { + struct ib_mr *mr; + struct sg_table sgt; + struct ib_cqe inv_cqe; /* only for always_invalidate=true */ + u32 msg_id; /* only for always_invalidate=true */ + u32 msg_off; /* only for always_invalidate=true */ + struct rtrs_iu *iu; /* send buffer for new rkey msg */ +}; + +struct rtrs_srv_sess { + struct rtrs_sess s; + struct rtrs_srv *srv; + struct work_struct close_work; + enum rtrs_srv_state state; + spinlock_t state_lock; + int cur_cq_vector; + struct rtrs_srv_op **ops_ids; + atomic_t ids_inflight; + wait_queue_head_t ids_waitq; + struct rtrs_srv_mr *mrs; + unsigned int mrs_num; + dma_addr_t *dma_addr; + bool established; + unsigned int mem_bits; + struct kobject kobj; + struct rtrs_srv_stats *stats; +}; + +struct rtrs_srv { + struct list_head paths_list; + int paths_up; + struct mutex paths_ev_mutex; + size_t paths_num; + struct mutex paths_mutex; + uuid_t paths_uuid; + refcount_t refcount; + struct rtrs_srv_ctx *ctx; + struct list_head ctx_list; + void *priv; + size_t queue_depth; + struct page **chunks; + struct device dev; + unsigned int dev_ref; + struct kobject *kobj_paths; +}; + +struct rtrs_srv_ctx { + struct rtrs_srv_ops ops; + struct rdma_cm_id *cm_id_ip; + struct rdma_cm_id *cm_id_ib; + struct mutex srv_mutex; + struct list_head srv_list; +}; + +extern struct class *rtrs_dev_class; + +void close_sess(struct rtrs_srv_sess *sess); + +static inline void rtrs_srv_update_rdma_stats(struct rtrs_srv_stats *s, + size_t size, int d) +{ + atomic64_inc(&s->rdma_stats.dir[d].cnt); + atomic64_add(size, &s->rdma_stats.dir[d].size_total); +} + +/* functions which are implemented in rtrs-srv-stats.c */ +int rtrs_srv_reset_rdma_stats(struct rtrs_srv_stats *stats, bool enable); +ssize_t rtrs_srv_stats_rdma_to_str(struct rtrs_srv_stats *stats, + char *page, size_t len); +int rtrs_srv_reset_wc_completion_stats(struct rtrs_srv_stats *stats, + bool enable); +int rtrs_srv_stats_wc_completion_to_str(struct rtrs_srv_stats *stats, char *buf, + size_t len); +int rtrs_srv_reset_all_stats(struct rtrs_srv_stats *stats, bool enable); +ssize_t rtrs_srv_reset_all_help(struct rtrs_srv_stats *stats, + char *page, size_t len); + +/* functions which are implemented in rtrs-srv-sysfs.c */ +int rtrs_srv_create_sess_files(struct rtrs_srv_sess *sess); +void rtrs_srv_destroy_sess_files(struct rtrs_srv_sess *sess); + +#endif /* RTRS_SRV_H */ -- cgit v1.2.3 From 9cb837480424e78ed585376f944088246685aec3 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:16 +0200 Subject: RDMA/rtrs: server: main functionality This is main functionality of rtrs-server module, which accepts set of RDMA connections (so called rtrs session), creates/destroys sysfs entries associated with rtrs session and notifies upper layer (user of RTRS API) about RDMA requests or link events. Link: https://lore.kernel.org/r/20200511135131.27580-11-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 2175 ++++++++++++++++++++++++++++++++ 1 file changed, 2175 insertions(+) create mode 100644 drivers/infiniband/ulp/rtrs/rtrs-srv.c diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c new file mode 100644 index 000000000000..ba8ab33b94a2 --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -0,0 +1,2175 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Transport Layer + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include + +#include "rtrs-srv.h" +#include "rtrs-log.h" + +MODULE_DESCRIPTION("RDMA Transport Server"); +MODULE_LICENSE("GPL"); + +/* Must be power of 2, see mask from mr->page_size in ib_sg_to_pages() */ +#define DEFAULT_MAX_CHUNK_SIZE (128 << 10) +#define DEFAULT_SESS_QUEUE_DEPTH 512 +#define MAX_HDR_SIZE PAGE_SIZE + +/* We guarantee to serve 10 paths at least */ +#define CHUNK_POOL_SZ 10 + +static struct rtrs_rdma_dev_pd dev_pd; +static mempool_t *chunk_pool; +struct class *rtrs_dev_class; + +static int __read_mostly max_chunk_size = DEFAULT_MAX_CHUNK_SIZE; +static int __read_mostly sess_queue_depth = DEFAULT_SESS_QUEUE_DEPTH; + +static bool always_invalidate = true; +module_param(always_invalidate, bool, 0444); +MODULE_PARM_DESC(always_invalidate, + "Invalidate memory registration for contiguous memory regions before accessing."); + +module_param_named(max_chunk_size, max_chunk_size, int, 0444); +MODULE_PARM_DESC(max_chunk_size, + "Max size for each IO request, when change the unit is in byte (default: " + __stringify(DEFAULT_MAX_CHUNK_SIZE) "KB)"); + +module_param_named(sess_queue_depth, sess_queue_depth, int, 0444); +MODULE_PARM_DESC(sess_queue_depth, + "Number of buffers for pending I/O requests to allocate per session. Maximum: " + __stringify(MAX_SESS_QUEUE_DEPTH) " (default: " + __stringify(DEFAULT_SESS_QUEUE_DEPTH) ")"); + +static cpumask_t cq_affinity_mask = { CPU_BITS_ALL }; + +static struct workqueue_struct *rtrs_wq; + +static inline struct rtrs_srv_con *to_srv_con(struct rtrs_con *c) +{ + return container_of(c, struct rtrs_srv_con, c); +} + +static inline struct rtrs_srv_sess *to_srv_sess(struct rtrs_sess *s) +{ + return container_of(s, struct rtrs_srv_sess, s); +} + +static bool __rtrs_srv_change_state(struct rtrs_srv_sess *sess, + enum rtrs_srv_state new_state) +{ + enum rtrs_srv_state old_state; + bool changed = false; + + lockdep_assert_held(&sess->state_lock); + old_state = sess->state; + switch (new_state) { + case RTRS_SRV_CONNECTED: + switch (old_state) { + case RTRS_SRV_CONNECTING: + changed = true; + fallthrough; + default: + break; + } + break; + case RTRS_SRV_CLOSING: + switch (old_state) { + case RTRS_SRV_CONNECTING: + case RTRS_SRV_CONNECTED: + changed = true; + fallthrough; + default: + break; + } + break; + case RTRS_SRV_CLOSED: + switch (old_state) { + case RTRS_SRV_CLOSING: + changed = true; + fallthrough; + default: + break; + } + break; + default: + break; + } + if (changed) + sess->state = new_state; + + return changed; +} + +static bool rtrs_srv_change_state_get_old(struct rtrs_srv_sess *sess, + enum rtrs_srv_state new_state, + enum rtrs_srv_state *old_state) +{ + bool changed; + + spin_lock_irq(&sess->state_lock); + *old_state = sess->state; + changed = __rtrs_srv_change_state(sess, new_state); + spin_unlock_irq(&sess->state_lock); + + return changed; +} + +static bool rtrs_srv_change_state(struct rtrs_srv_sess *sess, + enum rtrs_srv_state new_state) +{ + enum rtrs_srv_state old_state; + + return rtrs_srv_change_state_get_old(sess, new_state, &old_state); +} + +static void free_id(struct rtrs_srv_op *id) +{ + if (!id) + return; + kfree(id); +} + +static void rtrs_srv_free_ops_ids(struct rtrs_srv_sess *sess) +{ + struct rtrs_srv *srv = sess->srv; + int i; + + WARN_ON(atomic_read(&sess->ids_inflight)); + if (sess->ops_ids) { + for (i = 0; i < srv->queue_depth; i++) + free_id(sess->ops_ids[i]); + kfree(sess->ops_ids); + sess->ops_ids = NULL; + } +} + +static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc); + +static struct ib_cqe io_comp_cqe = { + .done = rtrs_srv_rdma_done +}; + +static int rtrs_srv_alloc_ops_ids(struct rtrs_srv_sess *sess) +{ + struct rtrs_srv *srv = sess->srv; + struct rtrs_srv_op *id; + int i; + + sess->ops_ids = kcalloc(srv->queue_depth, sizeof(*sess->ops_ids), + GFP_KERNEL); + if (!sess->ops_ids) + goto err; + + for (i = 0; i < srv->queue_depth; ++i) { + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) + goto err; + + sess->ops_ids[i] = id; + } + init_waitqueue_head(&sess->ids_waitq); + atomic_set(&sess->ids_inflight, 0); + + return 0; + +err: + rtrs_srv_free_ops_ids(sess); + return -ENOMEM; +} + +static inline void rtrs_srv_get_ops_ids(struct rtrs_srv_sess *sess) +{ + atomic_inc(&sess->ids_inflight); +} + +static inline void rtrs_srv_put_ops_ids(struct rtrs_srv_sess *sess) +{ + if (atomic_dec_and_test(&sess->ids_inflight)) + wake_up(&sess->ids_waitq); +} + +static void rtrs_srv_wait_ops_ids(struct rtrs_srv_sess *sess) +{ + wait_event(sess->ids_waitq, !atomic_read(&sess->ids_inflight)); +} + + +static void rtrs_srv_reg_mr_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rtrs_srv_con *con = cq->cq_context; + struct rtrs_sess *s = con->c.sess; + struct rtrs_srv_sess *sess = to_srv_sess(s); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + rtrs_err(s, "REG MR failed: %s\n", + ib_wc_status_msg(wc->status)); + close_sess(sess); + return; + } +} + +static struct ib_cqe local_reg_cqe = { + .done = rtrs_srv_reg_mr_done +}; + +static int rdma_write_sg(struct rtrs_srv_op *id) +{ + struct rtrs_sess *s = id->con->c.sess; + struct rtrs_srv_sess *sess = to_srv_sess(s); + dma_addr_t dma_addr = sess->dma_addr[id->msg_id]; + struct rtrs_srv_mr *srv_mr; + struct rtrs_srv *srv = sess->srv; + struct ib_send_wr inv_wr, imm_wr; + struct ib_rdma_wr *wr = NULL; + enum ib_send_flags flags; + size_t sg_cnt; + int err, offset; + bool need_inval; + u32 rkey = 0; + struct ib_reg_wr rwr; + struct ib_sge *plist; + struct ib_sge list; + + sg_cnt = le16_to_cpu(id->rd_msg->sg_cnt); + need_inval = le16_to_cpu(id->rd_msg->flags) & RTRS_MSG_NEED_INVAL_F; + if (unlikely(sg_cnt != 1)) + return -EINVAL; + + offset = 0; + + wr = &id->tx_wr; + plist = &id->tx_sg; + plist->addr = dma_addr + offset; + plist->length = le32_to_cpu(id->rd_msg->desc[0].len); + + /* WR will fail with length error + * if this is 0 + */ + if (unlikely(plist->length == 0)) { + rtrs_err(s, "Invalid RDMA-Write sg list length 0\n"); + return -EINVAL; + } + + plist->lkey = sess->s.dev->ib_pd->local_dma_lkey; + offset += plist->length; + + wr->wr.sg_list = plist; + wr->wr.num_sge = 1; + wr->remote_addr = le64_to_cpu(id->rd_msg->desc[0].addr); + wr->rkey = le32_to_cpu(id->rd_msg->desc[0].key); + if (rkey == 0) + rkey = wr->rkey; + else + /* Only one key is actually used */ + WARN_ON_ONCE(rkey != wr->rkey); + + wr->wr.opcode = IB_WR_RDMA_WRITE; + wr->wr.ex.imm_data = 0; + wr->wr.send_flags = 0; + + if (need_inval && always_invalidate) { + wr->wr.next = &rwr.wr; + rwr.wr.next = &inv_wr; + inv_wr.next = &imm_wr; + } else if (always_invalidate) { + wr->wr.next = &rwr.wr; + rwr.wr.next = &imm_wr; + } else if (need_inval) { + wr->wr.next = &inv_wr; + inv_wr.next = &imm_wr; + } else { + wr->wr.next = &imm_wr; + } + /* + * From time to time we have to post signaled sends, + * or send queue will fill up and only QP reset can help. + */ + flags = (atomic_inc_return(&id->con->wr_cnt) % srv->queue_depth) ? + 0 : IB_SEND_SIGNALED; + + if (need_inval) { + inv_wr.sg_list = NULL; + inv_wr.num_sge = 0; + inv_wr.opcode = IB_WR_SEND_WITH_INV; + inv_wr.send_flags = 0; + inv_wr.ex.invalidate_rkey = rkey; + } + + imm_wr.next = NULL; + if (always_invalidate) { + struct rtrs_msg_rkey_rsp *msg; + + srv_mr = &sess->mrs[id->msg_id]; + rwr.wr.opcode = IB_WR_REG_MR; + rwr.wr.num_sge = 0; + rwr.mr = srv_mr->mr; + rwr.wr.send_flags = 0; + rwr.key = srv_mr->mr->rkey; + rwr.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); + msg = srv_mr->iu->buf; + msg->buf_id = cpu_to_le16(id->msg_id); + msg->type = cpu_to_le16(RTRS_MSG_RKEY_RSP); + msg->rkey = cpu_to_le32(srv_mr->mr->rkey); + + list.addr = srv_mr->iu->dma_addr; + list.length = sizeof(*msg); + list.lkey = sess->s.dev->ib_pd->local_dma_lkey; + imm_wr.sg_list = &list; + imm_wr.num_sge = 1; + imm_wr.opcode = IB_WR_SEND_WITH_IMM; + ib_dma_sync_single_for_device(sess->s.dev->ib_dev, + srv_mr->iu->dma_addr, + srv_mr->iu->size, DMA_TO_DEVICE); + } else { + imm_wr.sg_list = NULL; + imm_wr.num_sge = 0; + imm_wr.opcode = IB_WR_RDMA_WRITE_WITH_IMM; + } + imm_wr.send_flags = flags; + imm_wr.ex.imm_data = cpu_to_be32(rtrs_to_io_rsp_imm(id->msg_id, + 0, need_inval)); + + imm_wr.wr_cqe = &io_comp_cqe; + ib_dma_sync_single_for_device(sess->s.dev->ib_dev, dma_addr, + offset, DMA_BIDIRECTIONAL); + + err = ib_post_send(id->con->c.qp, &id->tx_wr.wr, NULL); + if (unlikely(err)) + rtrs_err(s, + "Posting RDMA-Write-Request to QP failed, err: %d\n", + err); + + return err; +} + +/** + * send_io_resp_imm() - respond to client with empty IMM on failed READ/WRITE + * requests or on successful WRITE request. + * @con: the connection to send back result + * @id: the id associated with the IO + * @errno: the error number of the IO. + * + * Return 0 on success, errno otherwise. + */ +static int send_io_resp_imm(struct rtrs_srv_con *con, struct rtrs_srv_op *id, + int errno) +{ + struct rtrs_sess *s = con->c.sess; + struct rtrs_srv_sess *sess = to_srv_sess(s); + struct ib_send_wr inv_wr, imm_wr, *wr = NULL; + struct ib_reg_wr rwr; + struct rtrs_srv *srv = sess->srv; + struct rtrs_srv_mr *srv_mr; + bool need_inval = false; + enum ib_send_flags flags; + u32 imm; + int err; + + if (id->dir == READ) { + struct rtrs_msg_rdma_read *rd_msg = id->rd_msg; + size_t sg_cnt; + + need_inval = le16_to_cpu(rd_msg->flags) & + RTRS_MSG_NEED_INVAL_F; + sg_cnt = le16_to_cpu(rd_msg->sg_cnt); + + if (need_inval) { + if (likely(sg_cnt)) { + inv_wr.sg_list = NULL; + inv_wr.num_sge = 0; + inv_wr.opcode = IB_WR_SEND_WITH_INV; + inv_wr.send_flags = 0; + /* Only one key is actually used */ + inv_wr.ex.invalidate_rkey = + le32_to_cpu(rd_msg->desc[0].key); + } else { + WARN_ON_ONCE(1); + need_inval = false; + } + } + } + + if (need_inval && always_invalidate) { + wr = &inv_wr; + inv_wr.next = &rwr.wr; + rwr.wr.next = &imm_wr; + } else if (always_invalidate) { + wr = &rwr.wr; + rwr.wr.next = &imm_wr; + } else if (need_inval) { + wr = &inv_wr; + inv_wr.next = &imm_wr; + } else { + wr = &imm_wr; + } + /* + * From time to time we have to post signalled sends, + * or send queue will fill up and only QP reset can help. + */ + flags = (atomic_inc_return(&con->wr_cnt) % srv->queue_depth) ? + 0 : IB_SEND_SIGNALED; + imm = rtrs_to_io_rsp_imm(id->msg_id, errno, need_inval); + imm_wr.next = NULL; + if (always_invalidate) { + struct ib_sge list; + struct rtrs_msg_rkey_rsp *msg; + + srv_mr = &sess->mrs[id->msg_id]; + rwr.wr.next = &imm_wr; + rwr.wr.opcode = IB_WR_REG_MR; + rwr.wr.num_sge = 0; + rwr.wr.send_flags = 0; + rwr.mr = srv_mr->mr; + rwr.key = srv_mr->mr->rkey; + rwr.access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); + msg = srv_mr->iu->buf; + msg->buf_id = cpu_to_le16(id->msg_id); + msg->type = cpu_to_le16(RTRS_MSG_RKEY_RSP); + msg->rkey = cpu_to_le32(srv_mr->mr->rkey); + + list.addr = srv_mr->iu->dma_addr; + list.length = sizeof(*msg); + list.lkey = sess->s.dev->ib_pd->local_dma_lkey; + imm_wr.sg_list = &list; + imm_wr.num_sge = 1; + imm_wr.opcode = IB_WR_SEND_WITH_IMM; + ib_dma_sync_single_for_device(sess->s.dev->ib_dev, + srv_mr->iu->dma_addr, + srv_mr->iu->size, DMA_TO_DEVICE); + } else { + imm_wr.sg_list = NULL; + imm_wr.num_sge = 0; + imm_wr.opcode = IB_WR_RDMA_WRITE_WITH_IMM; + } + imm_wr.send_flags = flags; + imm_wr.wr_cqe = &io_comp_cqe; + + imm_wr.ex.imm_data = cpu_to_be32(imm); + + err = ib_post_send(id->con->c.qp, wr, NULL); + if (unlikely(err)) + rtrs_err_rl(s, "Posting RDMA-Reply to QP failed, err: %d\n", + err); + + return err; +} + +void close_sess(struct rtrs_srv_sess *sess) +{ + enum rtrs_srv_state old_state; + + if (rtrs_srv_change_state_get_old(sess, RTRS_SRV_CLOSING, + &old_state)) + queue_work(rtrs_wq, &sess->close_work); + WARN_ON(sess->state != RTRS_SRV_CLOSING); +} + +static inline const char *rtrs_srv_state_str(enum rtrs_srv_state state) +{ + switch (state) { + case RTRS_SRV_CONNECTING: + return "RTRS_SRV_CONNECTING"; + case RTRS_SRV_CONNECTED: + return "RTRS_SRV_CONNECTED"; + case RTRS_SRV_CLOSING: + return "RTRS_SRV_CLOSING"; + case RTRS_SRV_CLOSED: + return "RTRS_SRV_CLOSED"; + default: + return "UNKNOWN"; + } +} + +/** + * rtrs_srv_resp_rdma() - Finish an RDMA request + * + * @id: Internal RTRS operation identifier + * @status: Response Code sent to the other side for this operation. + * 0 = success, <=0 error + * Context: any + * + * Finish a RDMA operation. A message is sent to the client and the + * corresponding memory areas will be released. + */ +bool rtrs_srv_resp_rdma(struct rtrs_srv_op *id, int status) +{ + struct rtrs_srv_sess *sess; + struct rtrs_srv_con *con; + struct rtrs_sess *s; + int err; + + if (WARN_ON(!id)) + return true; + + con = id->con; + s = con->c.sess; + sess = to_srv_sess(s); + + id->status = status; + + if (unlikely(sess->state != RTRS_SRV_CONNECTED)) { + rtrs_err_rl(s, + "Sending I/O response failed, session is disconnected, sess state %s\n", + rtrs_srv_state_str(sess->state)); + goto out; + } + if (always_invalidate) { + struct rtrs_srv_mr *mr = &sess->mrs[id->msg_id]; + + ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey)); + } + if (unlikely(atomic_sub_return(1, + &con->sq_wr_avail) < 0)) { + pr_err("IB send queue full\n"); + atomic_add(1, &con->sq_wr_avail); + spin_lock(&con->rsp_wr_wait_lock); + list_add_tail(&id->wait_list, &con->rsp_wr_wait_list); + spin_unlock(&con->rsp_wr_wait_lock); + return false; + } + + if (status || id->dir == WRITE || !id->rd_msg->sg_cnt) + err = send_io_resp_imm(con, id, status); + else + err = rdma_write_sg(id); + + if (unlikely(err)) { + rtrs_err_rl(s, "IO response failed: %d\n", err); + close_sess(sess); + } +out: + rtrs_srv_put_ops_ids(sess); + return true; +} +EXPORT_SYMBOL(rtrs_srv_resp_rdma); + +/** + * rtrs_srv_set_sess_priv() - Set private pointer in rtrs_srv. + * @srv: Session pointer + * @priv: The private pointer that is associated with the session. + */ +void rtrs_srv_set_sess_priv(struct rtrs_srv *srv, void *priv) +{ + srv->priv = priv; +} +EXPORT_SYMBOL(rtrs_srv_set_sess_priv); + +static void unmap_cont_bufs(struct rtrs_srv_sess *sess) +{ + int i; + + for (i = 0; i < sess->mrs_num; i++) { + struct rtrs_srv_mr *srv_mr; + + srv_mr = &sess->mrs[i]; + rtrs_iu_free(srv_mr->iu, DMA_TO_DEVICE, + sess->s.dev->ib_dev, 1); + ib_dereg_mr(srv_mr->mr); + ib_dma_unmap_sg(sess->s.dev->ib_dev, srv_mr->sgt.sgl, + srv_mr->sgt.nents, DMA_BIDIRECTIONAL); + sg_free_table(&srv_mr->sgt); + } + kfree(sess->mrs); +} + +static int map_cont_bufs(struct rtrs_srv_sess *sess) +{ + struct rtrs_srv *srv = sess->srv; + struct rtrs_sess *ss = &sess->s; + int i, mri, err, mrs_num; + unsigned int chunk_bits; + int chunks_per_mr = 1; + + /* + * Here we map queue_depth chunks to MR. Firstly we have to + * figure out how many chunks can we map per MR. + */ + if (always_invalidate) { + /* + * in order to do invalidate for each chunks of memory, we needs + * more memory regions. + */ + mrs_num = srv->queue_depth; + } else { + chunks_per_mr = + sess->s.dev->ib_dev->attrs.max_fast_reg_page_list_len; + mrs_num = DIV_ROUND_UP(srv->queue_depth, chunks_per_mr); + chunks_per_mr = DIV_ROUND_UP(srv->queue_depth, mrs_num); + } + + sess->mrs = kcalloc(mrs_num, sizeof(*sess->mrs), GFP_KERNEL); + if (!sess->mrs) + return -ENOMEM; + + sess->mrs_num = mrs_num; + + for (mri = 0; mri < mrs_num; mri++) { + struct rtrs_srv_mr *srv_mr = &sess->mrs[mri]; + struct sg_table *sgt = &srv_mr->sgt; + struct scatterlist *s; + struct ib_mr *mr; + int nr, chunks; + + chunks = chunks_per_mr * mri; + if (!always_invalidate) + chunks_per_mr = min_t(int, chunks_per_mr, + srv->queue_depth - chunks); + + err = sg_alloc_table(sgt, chunks_per_mr, GFP_KERNEL); + if (err) + goto err; + + for_each_sg(sgt->sgl, s, chunks_per_mr, i) + sg_set_page(s, srv->chunks[chunks + i], + max_chunk_size, 0); + + nr = ib_dma_map_sg(sess->s.dev->ib_dev, sgt->sgl, + sgt->nents, DMA_BIDIRECTIONAL); + if (nr < sgt->nents) { + err = nr < 0 ? nr : -EINVAL; + goto free_sg; + } + mr = ib_alloc_mr(sess->s.dev->ib_pd, IB_MR_TYPE_MEM_REG, + sgt->nents); + if (IS_ERR(mr)) { + err = PTR_ERR(mr); + goto unmap_sg; + } + nr = ib_map_mr_sg(mr, sgt->sgl, sgt->nents, + NULL, max_chunk_size); + if (nr < sgt->nents) { + err = nr < 0 ? nr : -EINVAL; + goto dereg_mr; + } + + if (always_invalidate) { + srv_mr->iu = rtrs_iu_alloc(1, + sizeof(struct rtrs_msg_rkey_rsp), + GFP_KERNEL, sess->s.dev->ib_dev, + DMA_TO_DEVICE, rtrs_srv_rdma_done); + if (!srv_mr->iu) { + rtrs_err(ss, "rtrs_iu_alloc(), err: %d\n", + -ENOMEM); + goto free_iu; + } + } + /* Eventually dma addr for each chunk can be cached */ + for_each_sg(sgt->sgl, s, sgt->orig_nents, i) + sess->dma_addr[chunks + i] = sg_dma_address(s); + + ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); + srv_mr->mr = mr; + + continue; +err: + while (mri--) { + srv_mr = &sess->mrs[mri]; + sgt = &srv_mr->sgt; + mr = srv_mr->mr; +free_iu: + rtrs_iu_free(srv_mr->iu, DMA_TO_DEVICE, + sess->s.dev->ib_dev, 1); +dereg_mr: + ib_dereg_mr(mr); +unmap_sg: + ib_dma_unmap_sg(sess->s.dev->ib_dev, sgt->sgl, + sgt->nents, DMA_BIDIRECTIONAL); +free_sg: + sg_free_table(sgt); + } + kfree(sess->mrs); + + return err; + } + + chunk_bits = ilog2(srv->queue_depth - 1) + 1; + sess->mem_bits = (MAX_IMM_PAYL_BITS - chunk_bits); + + return 0; +} + +static void rtrs_srv_hb_err_handler(struct rtrs_con *c) +{ + close_sess(to_srv_sess(c->sess)); +} + +static void rtrs_srv_init_hb(struct rtrs_srv_sess *sess) +{ + rtrs_init_hb(&sess->s, &io_comp_cqe, + RTRS_HB_INTERVAL_MS, + RTRS_HB_MISSED_MAX, + rtrs_srv_hb_err_handler, + rtrs_wq); +} + +static void rtrs_srv_start_hb(struct rtrs_srv_sess *sess) +{ + rtrs_start_hb(&sess->s); +} + +static void rtrs_srv_stop_hb(struct rtrs_srv_sess *sess) +{ + rtrs_stop_hb(&sess->s); +} + +static void rtrs_srv_info_rsp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rtrs_srv_con *con = cq->cq_context; + struct rtrs_sess *s = con->c.sess; + struct rtrs_srv_sess *sess = to_srv_sess(s); + struct rtrs_iu *iu; + + iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe); + rtrs_iu_free(iu, DMA_TO_DEVICE, sess->s.dev->ib_dev, 1); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + rtrs_err(s, "Sess info response send failed: %s\n", + ib_wc_status_msg(wc->status)); + close_sess(sess); + return; + } + WARN_ON(wc->opcode != IB_WC_SEND); +} + +static void rtrs_srv_sess_up(struct rtrs_srv_sess *sess) +{ + struct rtrs_srv *srv = sess->srv; + struct rtrs_srv_ctx *ctx = srv->ctx; + int up; + + mutex_lock(&srv->paths_ev_mutex); + up = ++srv->paths_up; + if (up == 1) + ctx->ops.link_ev(srv, RTRS_SRV_LINK_EV_CONNECTED, NULL); + mutex_unlock(&srv->paths_ev_mutex); + + /* Mark session as established */ + sess->established = true; +} + +static void rtrs_srv_sess_down(struct rtrs_srv_sess *sess) +{ + struct rtrs_srv *srv = sess->srv; + struct rtrs_srv_ctx *ctx = srv->ctx; + + if (!sess->established) + return; + + sess->established = false; + mutex_lock(&srv->paths_ev_mutex); + WARN_ON(!srv->paths_up); + if (--srv->paths_up == 0) + ctx->ops.link_ev(srv, RTRS_SRV_LINK_EV_DISCONNECTED, srv->priv); + mutex_unlock(&srv->paths_ev_mutex); +} + +static int post_recv_sess(struct rtrs_srv_sess *sess); + +static int process_info_req(struct rtrs_srv_con *con, + struct rtrs_msg_info_req *msg) +{ + struct rtrs_sess *s = con->c.sess; + struct rtrs_srv_sess *sess = to_srv_sess(s); + struct ib_send_wr *reg_wr = NULL; + struct rtrs_msg_info_rsp *rsp; + struct rtrs_iu *tx_iu; + struct ib_reg_wr *rwr; + int mri, err; + size_t tx_sz; + + err = post_recv_sess(sess); + if (unlikely(err)) { + rtrs_err(s, "post_recv_sess(), err: %d\n", err); + return err; + } + rwr = kcalloc(sess->mrs_num, sizeof(*rwr), GFP_KERNEL); + if (unlikely(!rwr)) + return -ENOMEM; + strlcpy(sess->s.sessname, msg->sessname, sizeof(sess->s.sessname)); + + tx_sz = sizeof(*rsp); + tx_sz += sizeof(rsp->desc[0]) * sess->mrs_num; + tx_iu = rtrs_iu_alloc(1, tx_sz, GFP_KERNEL, sess->s.dev->ib_dev, + DMA_TO_DEVICE, rtrs_srv_info_rsp_done); + if (unlikely(!tx_iu)) { + err = -ENOMEM; + goto rwr_free; + } + + rsp = tx_iu->buf; + rsp->type = cpu_to_le16(RTRS_MSG_INFO_RSP); + rsp->sg_cnt = cpu_to_le16(sess->mrs_num); + + for (mri = 0; mri < sess->mrs_num; mri++) { + struct ib_mr *mr = sess->mrs[mri].mr; + + rsp->desc[mri].addr = cpu_to_le64(mr->iova); + rsp->desc[mri].key = cpu_to_le32(mr->rkey); + rsp->desc[mri].len = cpu_to_le32(mr->length); + + /* + * Fill in reg MR request and chain them *backwards* + */ + rwr[mri].wr.next = mri ? &rwr[mri - 1].wr : NULL; + rwr[mri].wr.opcode = IB_WR_REG_MR; + rwr[mri].wr.wr_cqe = &local_reg_cqe; + rwr[mri].wr.num_sge = 0; + rwr[mri].wr.send_flags = mri ? 0 : IB_SEND_SIGNALED; + rwr[mri].mr = mr; + rwr[mri].key = mr->rkey; + rwr[mri].access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE); + reg_wr = &rwr[mri].wr; + } + + err = rtrs_srv_create_sess_files(sess); + if (unlikely(err)) + goto iu_free; + kobject_get(&sess->kobj); + get_device(&sess->srv->dev); + rtrs_srv_change_state(sess, RTRS_SRV_CONNECTED); + rtrs_srv_start_hb(sess); + + /* + * We do not account number of established connections at the current + * moment, we rely on the client, which should send info request when + * all connections are successfully established. Thus, simply notify + * listener with a proper event if we are the first path. + */ + rtrs_srv_sess_up(sess); + + ib_dma_sync_single_for_device(sess->s.dev->ib_dev, tx_iu->dma_addr, + tx_iu->size, DMA_TO_DEVICE); + + /* Send info response */ + err = rtrs_iu_post_send(&con->c, tx_iu, tx_sz, reg_wr); + if (unlikely(err)) { + rtrs_err(s, "rtrs_iu_post_send(), err: %d\n", err); +iu_free: + rtrs_iu_free(tx_iu, DMA_TO_DEVICE, sess->s.dev->ib_dev, 1); + } +rwr_free: + kfree(rwr); + + return err; +} + +static void rtrs_srv_info_req_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rtrs_srv_con *con = cq->cq_context; + struct rtrs_sess *s = con->c.sess; + struct rtrs_srv_sess *sess = to_srv_sess(s); + struct rtrs_msg_info_req *msg; + struct rtrs_iu *iu; + int err; + + WARN_ON(con->c.cid); + + iu = container_of(wc->wr_cqe, struct rtrs_iu, cqe); + if (unlikely(wc->status != IB_WC_SUCCESS)) { + rtrs_err(s, "Sess info request receive failed: %s\n", + ib_wc_status_msg(wc->status)); + goto close; + } + WARN_ON(wc->opcode != IB_WC_RECV); + + if (unlikely(wc->byte_len < sizeof(*msg))) { + rtrs_err(s, "Sess info request is malformed: size %d\n", + wc->byte_len); + goto close; + } + ib_dma_sync_single_for_cpu(sess->s.dev->ib_dev, iu->dma_addr, + iu->size, DMA_FROM_DEVICE); + msg = iu->buf; + if (unlikely(le16_to_cpu(msg->type) != RTRS_MSG_INFO_REQ)) { + rtrs_err(s, "Sess info request is malformed: type %d\n", + le16_to_cpu(msg->type)); + goto close; + } + err = process_info_req(con, msg); + if (unlikely(err)) + goto close; + +out: + rtrs_iu_free(iu, DMA_FROM_DEVICE, sess->s.dev->ib_dev, 1); + return; +close: + close_sess(sess); + goto out; +} + +static int post_recv_info_req(struct rtrs_srv_con *con) +{ + struct rtrs_sess *s = con->c.sess; + struct rtrs_srv_sess *sess = to_srv_sess(s); + struct rtrs_iu *rx_iu; + int err; + + rx_iu = rtrs_iu_alloc(1, sizeof(struct rtrs_msg_info_req), + GFP_KERNEL, sess->s.dev->ib_dev, + DMA_FROM_DEVICE, rtrs_srv_info_req_done); + if (unlikely(!rx_iu)) + return -ENOMEM; + /* Prepare for getting info response */ + err = rtrs_iu_post_recv(&con->c, rx_iu); + if (unlikely(err)) { + rtrs_err(s, "rtrs_iu_post_recv(), err: %d\n", err); + rtrs_iu_free(rx_iu, DMA_FROM_DEVICE, sess->s.dev->ib_dev, 1); + return err; + } + + return 0; +} + +static int post_recv_io(struct rtrs_srv_con *con, size_t q_size) +{ + int i, err; + + for (i = 0; i < q_size; i++) { + err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); + if (unlikely(err)) + return err; + } + + return 0; +} + +static int post_recv_sess(struct rtrs_srv_sess *sess) +{ + struct rtrs_srv *srv = sess->srv; + struct rtrs_sess *s = &sess->s; + size_t q_size; + int err, cid; + + for (cid = 0; cid < sess->s.con_num; cid++) { + if (cid == 0) + q_size = SERVICE_CON_QUEUE_DEPTH; + else + q_size = srv->queue_depth; + + err = post_recv_io(to_srv_con(sess->s.con[cid]), q_size); + if (unlikely(err)) { + rtrs_err(s, "post_recv_io(), err: %d\n", err); + return err; + } + } + + return 0; +} + +static void process_read(struct rtrs_srv_con *con, + struct rtrs_msg_rdma_read *msg, + u32 buf_id, u32 off) +{ + struct rtrs_sess *s = con->c.sess; + struct rtrs_srv_sess *sess = to_srv_sess(s); + struct rtrs_srv *srv = sess->srv; + struct rtrs_srv_ctx *ctx = srv->ctx; + struct rtrs_srv_op *id; + + size_t usr_len, data_len; + void *data; + int ret; + + if (unlikely(sess->state != RTRS_SRV_CONNECTED)) { + rtrs_err_rl(s, + "Processing read request failed, session is disconnected, sess state %s\n", + rtrs_srv_state_str(sess->state)); + return; + } + if (unlikely(msg->sg_cnt != 1 && msg->sg_cnt != 0)) { + rtrs_err_rl(s, + "Processing read request failed, invalid message\n"); + return; + } + rtrs_srv_get_ops_ids(sess); + rtrs_srv_update_rdma_stats(sess->stats, off, READ); + id = sess->ops_ids[buf_id]; + id->con = con; + id->dir = READ; + id->msg_id = buf_id; + id->rd_msg = msg; + usr_len = le16_to_cpu(msg->usr_len); + data_len = off - usr_len; + data = page_address(srv->chunks[buf_id]); + ret = ctx->ops.rdma_ev(srv, srv->priv, id, READ, data, data_len, + data + data_len, usr_len); + + if (unlikely(ret)) { + rtrs_err_rl(s, + "Processing read request failed, user module cb reported for msg_id %d, err: %d\n", + buf_id, ret); + goto send_err_msg; + } + + return; + +send_err_msg: + ret = send_io_resp_imm(con, id, ret); + if (ret < 0) { + rtrs_err_rl(s, + "Sending err msg for failed RDMA-Write-Req failed, msg_id %d, err: %d\n", + buf_id, ret); + close_sess(sess); + } + rtrs_srv_put_ops_ids(sess); +} + +static void process_write(struct rtrs_srv_con *con, + struct rtrs_msg_rdma_write *req, + u32 buf_id, u32 off) +{ + struct rtrs_sess *s = con->c.sess; + struct rtrs_srv_sess *sess = to_srv_sess(s); + struct rtrs_srv *srv = sess->srv; + struct rtrs_srv_ctx *ctx = srv->ctx; + struct rtrs_srv_op *id; + + size_t data_len, usr_len; + void *data; + int ret; + + if (unlikely(sess->state != RTRS_SRV_CONNECTED)) { + rtrs_err_rl(s, + "Processing write request failed, session is disconnected, sess state %s\n", + rtrs_srv_state_str(sess->state)); + return; + } + rtrs_srv_get_ops_ids(sess); + rtrs_srv_update_rdma_stats(sess->stats, off, WRITE); + id = sess->ops_ids[buf_id]; + id->con = con; + id->dir = WRITE; + id->msg_id = buf_id; + + usr_len = le16_to_cpu(req->usr_len); + data_len = off - usr_len; + data = page_address(srv->chunks[buf_id]); + ret = ctx->ops.rdma_ev(srv, srv->priv, id, WRITE, data, data_len, + data + data_len, usr_len); + if (unlikely(ret)) { + rtrs_err_rl(s, + "Processing write request failed, user module callback reports err: %d\n", + ret); + goto send_err_msg; + } + + return; + +send_err_msg: + ret = send_io_resp_imm(con, id, ret); + if (ret < 0) { + rtrs_err_rl(s, + "Processing write request failed, sending I/O response failed, msg_id %d, err: %d\n", + buf_id, ret); + close_sess(sess); + } + rtrs_srv_put_ops_ids(sess); +} + +static void process_io_req(struct rtrs_srv_con *con, void *msg, + u32 id, u32 off) +{ + struct rtrs_sess *s = con->c.sess; + struct rtrs_srv_sess *sess = to_srv_sess(s); + struct rtrs_msg_rdma_hdr *hdr; + unsigned int type; + + ib_dma_sync_single_for_cpu(sess->s.dev->ib_dev, sess->dma_addr[id], + max_chunk_size, DMA_BIDIRECTIONAL); + hdr = msg; + type = le16_to_cpu(hdr->type); + + switch (type) { + case RTRS_MSG_WRITE: + process_write(con, msg, id, off); + break; + case RTRS_MSG_READ: + process_read(con, msg, id, off); + break; + default: + rtrs_err(s, + "Processing I/O request failed, unknown message type received: 0x%02x\n", + type); + goto err; + } + + return; + +err: + close_sess(sess); +} + +static void rtrs_srv_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rtrs_srv_mr *mr = + container_of(wc->wr_cqe, typeof(*mr), inv_cqe); + struct rtrs_srv_con *con = cq->cq_context; + struct rtrs_sess *s = con->c.sess; + struct rtrs_srv_sess *sess = to_srv_sess(s); + struct rtrs_srv *srv = sess->srv; + u32 msg_id, off; + void *data; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + rtrs_err(s, "Failed IB_WR_LOCAL_INV: %s\n", + ib_wc_status_msg(wc->status)); + close_sess(sess); + } + msg_id = mr->msg_id; + off = mr->msg_off; + data = page_address(srv->chunks[msg_id]) + off; + process_io_req(con, data, msg_id, off); +} + +static int rtrs_srv_inv_rkey(struct rtrs_srv_con *con, + struct rtrs_srv_mr *mr) +{ + struct ib_send_wr wr = { + .opcode = IB_WR_LOCAL_INV, + .wr_cqe = &mr->inv_cqe, + .send_flags = IB_SEND_SIGNALED, + .ex.invalidate_rkey = mr->mr->rkey, + }; + mr->inv_cqe.done = rtrs_srv_inv_rkey_done; + + return ib_post_send(con->c.qp, &wr, NULL); +} + +static void rtrs_rdma_process_wr_wait_list(struct rtrs_srv_con *con) +{ + spin_lock(&con->rsp_wr_wait_lock); + while (!list_empty(&con->rsp_wr_wait_list)) { + struct rtrs_srv_op *id; + int ret; + + id = list_entry(con->rsp_wr_wait_list.next, + struct rtrs_srv_op, wait_list); + list_del(&id->wait_list); + + spin_unlock(&con->rsp_wr_wait_lock); + ret = rtrs_srv_resp_rdma(id, id->status); + spin_lock(&con->rsp_wr_wait_lock); + + if (!ret) { + list_add(&id->wait_list, &con->rsp_wr_wait_list); + break; + } + } + spin_unlock(&con->rsp_wr_wait_lock); +} + +static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rtrs_srv_con *con = cq->cq_context; + struct rtrs_sess *s = con->c.sess; + struct rtrs_srv_sess *sess = to_srv_sess(s); + struct rtrs_srv *srv = sess->srv; + u32 imm_type, imm_payload; + int err; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) { + rtrs_err(s, + "%s (wr_cqe: %p, type: %d, vendor_err: 0x%x, len: %u)\n", + ib_wc_status_msg(wc->status), wc->wr_cqe, + wc->opcode, wc->vendor_err, wc->byte_len); + close_sess(sess); + } + return; + } + + switch (wc->opcode) { + case IB_WC_RECV_RDMA_WITH_IMM: + /* + * post_recv() RDMA write completions of IO reqs (read/write) + * and hb + */ + if (WARN_ON(wc->wr_cqe != &io_comp_cqe)) + return; + err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); + if (unlikely(err)) { + rtrs_err(s, "rtrs_post_recv(), err: %d\n", err); + close_sess(sess); + break; + } + rtrs_from_imm(be32_to_cpu(wc->ex.imm_data), + &imm_type, &imm_payload); + if (likely(imm_type == RTRS_IO_REQ_IMM)) { + u32 msg_id, off; + void *data; + + msg_id = imm_payload >> sess->mem_bits; + off = imm_payload & ((1 << sess->mem_bits) - 1); + if (unlikely(msg_id > srv->queue_depth || + off > max_chunk_size)) { + rtrs_err(s, "Wrong msg_id %u, off %u\n", + msg_id, off); + close_sess(sess); + return; + } + if (always_invalidate) { + struct rtrs_srv_mr *mr = &sess->mrs[msg_id]; + + mr->msg_off = off; + mr->msg_id = msg_id; + err = rtrs_srv_inv_rkey(con, mr); + if (unlikely(err)) { + rtrs_err(s, "rtrs_post_recv(), err: %d\n", + err); + close_sess(sess); + break; + } + } else { + data = page_address(srv->chunks[msg_id]) + off; + process_io_req(con, data, msg_id, off); + } + } else if (imm_type == RTRS_HB_MSG_IMM) { + WARN_ON(con->c.cid); + rtrs_send_hb_ack(&sess->s); + } else if (imm_type == RTRS_HB_ACK_IMM) { + WARN_ON(con->c.cid); + sess->s.hb_missed_cnt = 0; + } else { + rtrs_wrn(s, "Unknown IMM type %u\n", imm_type); + } + break; + case IB_WC_RDMA_WRITE: + case IB_WC_SEND: + /* + * post_send() RDMA write completions of IO reqs (read/write) + * and hb + */ + atomic_add(srv->queue_depth, &con->sq_wr_avail); + + if (unlikely(!list_empty_careful(&con->rsp_wr_wait_list))) + rtrs_rdma_process_wr_wait_list(con); + + break; + default: + rtrs_wrn(s, "Unexpected WC type: %d\n", wc->opcode); + return; + } +} + +/** + * rtrs_srv_get_sess_name() - Get rtrs_srv peer hostname. + * @srv: Session + * @sessname: Sessname buffer + * @len: Length of sessname buffer + */ +int rtrs_srv_get_sess_name(struct rtrs_srv *srv, char *sessname, size_t len) +{ + struct rtrs_srv_sess *sess; + int err = -ENOTCONN; + + mutex_lock(&srv->paths_mutex); + list_for_each_entry(sess, &srv->paths_list, s.entry) { + if (sess->state != RTRS_SRV_CONNECTED) + continue; + strlcpy(sessname, sess->s.sessname, + min_t(size_t, sizeof(sess->s.sessname), len)); + err = 0; + break; + } + mutex_unlock(&srv->paths_mutex); + + return err; +} +EXPORT_SYMBOL(rtrs_srv_get_sess_name); + +/** + * rtrs_srv_get_sess_qdepth() - Get rtrs_srv qdepth. + * @srv: Session + */ +int rtrs_srv_get_queue_depth(struct rtrs_srv *srv) +{ + return srv->queue_depth; +} +EXPORT_SYMBOL(rtrs_srv_get_queue_depth); + +static int find_next_bit_ring(struct rtrs_srv_sess *sess) +{ + struct ib_device *ib_dev = sess->s.dev->ib_dev; + int v; + + v = cpumask_next(sess->cur_cq_vector, &cq_affinity_mask); + if (v >= nr_cpu_ids || v >= ib_dev->num_comp_vectors) + v = cpumask_first(&cq_affinity_mask); + return v; +} + +static int rtrs_srv_get_next_cq_vector(struct rtrs_srv_sess *sess) +{ + sess->cur_cq_vector = find_next_bit_ring(sess); + + return sess->cur_cq_vector; +} + +static struct rtrs_srv *__alloc_srv(struct rtrs_srv_ctx *ctx, + const uuid_t *paths_uuid) +{ + struct rtrs_srv *srv; + int i; + + srv = kzalloc(sizeof(*srv), GFP_KERNEL); + if (!srv) + return NULL; + + refcount_set(&srv->refcount, 1); + INIT_LIST_HEAD(&srv->paths_list); + mutex_init(&srv->paths_mutex); + mutex_init(&srv->paths_ev_mutex); + uuid_copy(&srv->paths_uuid, paths_uuid); + srv->queue_depth = sess_queue_depth; + srv->ctx = ctx; + + srv->chunks = kcalloc(srv->queue_depth, sizeof(*srv->chunks), + GFP_KERNEL); + if (!srv->chunks) + goto err_free_srv; + + for (i = 0; i < srv->queue_depth; i++) { + srv->chunks[i] = mempool_alloc(chunk_pool, GFP_KERNEL); + if (!srv->chunks[i]) + goto err_free_chunks; + } + list_add(&srv->ctx_list, &ctx->srv_list); + + return srv; + +err_free_chunks: + while (i--) + mempool_free(srv->chunks[i], chunk_pool); + kfree(srv->chunks); + +err_free_srv: + kfree(srv); + + return NULL; +} + +static void free_srv(struct rtrs_srv *srv) +{ + int i; + + WARN_ON(refcount_read(&srv->refcount)); + for (i = 0; i < srv->queue_depth; i++) + mempool_free(srv->chunks[i], chunk_pool); + kfree(srv->chunks); + mutex_destroy(&srv->paths_mutex); + mutex_destroy(&srv->paths_ev_mutex); + /* last put to release the srv structure */ + put_device(&srv->dev); +} + +static inline struct rtrs_srv *__find_srv_and_get(struct rtrs_srv_ctx *ctx, + const uuid_t *paths_uuid) +{ + struct rtrs_srv *srv; + + list_for_each_entry(srv, &ctx->srv_list, ctx_list) { + if (uuid_equal(&srv->paths_uuid, paths_uuid) && + refcount_inc_not_zero(&srv->refcount)) + return srv; + } + + return NULL; +} + +static struct rtrs_srv *get_or_create_srv(struct rtrs_srv_ctx *ctx, + const uuid_t *paths_uuid) +{ + struct rtrs_srv *srv; + + mutex_lock(&ctx->srv_mutex); + srv = __find_srv_and_get(ctx, paths_uuid); + if (!srv) + srv = __alloc_srv(ctx, paths_uuid); + mutex_unlock(&ctx->srv_mutex); + + return srv; +} + +static void put_srv(struct rtrs_srv *srv) +{ + if (refcount_dec_and_test(&srv->refcount)) { + struct rtrs_srv_ctx *ctx = srv->ctx; + + WARN_ON(srv->dev.kobj.state_in_sysfs); + + mutex_lock(&ctx->srv_mutex); + list_del(&srv->ctx_list); + mutex_unlock(&ctx->srv_mutex); + free_srv(srv); + } +} + +static void __add_path_to_srv(struct rtrs_srv *srv, + struct rtrs_srv_sess *sess) +{ + list_add_tail(&sess->s.entry, &srv->paths_list); + srv->paths_num++; + WARN_ON(srv->paths_num >= MAX_PATHS_NUM); +} + +static void del_path_from_srv(struct rtrs_srv_sess *sess) +{ + struct rtrs_srv *srv = sess->srv; + + if (WARN_ON(!srv)) + return; + + mutex_lock(&srv->paths_mutex); + list_del(&sess->s.entry); + WARN_ON(!srv->paths_num); + srv->paths_num--; + mutex_unlock(&srv->paths_mutex); +} + +/* return true if addresses are the same, error other wise */ +static int sockaddr_cmp(const struct sockaddr *a, const struct sockaddr *b) +{ + switch (a->sa_family) { + case AF_IB: + return memcmp(&((struct sockaddr_ib *)a)->sib_addr, + &((struct sockaddr_ib *)b)->sib_addr, + sizeof(struct ib_addr)) && + (b->sa_family == AF_IB); + case AF_INET: + return memcmp(&((struct sockaddr_in *)a)->sin_addr, + &((struct sockaddr_in *)b)->sin_addr, + sizeof(struct in_addr)) && + (b->sa_family == AF_INET); + case AF_INET6: + return memcmp(&((struct sockaddr_in6 *)a)->sin6_addr, + &((struct sockaddr_in6 *)b)->sin6_addr, + sizeof(struct in6_addr)) && + (b->sa_family == AF_INET6); + default: + return -ENOENT; + } +} + +static bool __is_path_w_addr_exists(struct rtrs_srv *srv, + struct rdma_addr *addr) +{ + struct rtrs_srv_sess *sess; + + list_for_each_entry(sess, &srv->paths_list, s.entry) + if (!sockaddr_cmp((struct sockaddr *)&sess->s.dst_addr, + (struct sockaddr *)&addr->dst_addr) && + !sockaddr_cmp((struct sockaddr *)&sess->s.src_addr, + (struct sockaddr *)&addr->src_addr)) + return true; + + return false; +} + +static void free_sess(struct rtrs_srv_sess *sess) +{ + if (sess->kobj.state_in_sysfs) + kobject_put(&sess->kobj); + else + kfree(sess); +} + +static void rtrs_srv_close_work(struct work_struct *work) +{ + struct rtrs_srv_sess *sess; + struct rtrs_srv_con *con; + int i; + + sess = container_of(work, typeof(*sess), close_work); + + rtrs_srv_destroy_sess_files(sess); + rtrs_srv_stop_hb(sess); + + for (i = 0; i < sess->s.con_num; i++) { + if (!sess->s.con[i]) + continue; + con = to_srv_con(sess->s.con[i]); + rdma_disconnect(con->c.cm_id); + ib_drain_qp(con->c.qp); + } + /* Wait for all inflights */ + rtrs_srv_wait_ops_ids(sess); + + /* Notify upper layer if we are the last path */ + rtrs_srv_sess_down(sess); + + unmap_cont_bufs(sess); + rtrs_srv_free_ops_ids(sess); + + for (i = 0; i < sess->s.con_num; i++) { + if (!sess->s.con[i]) + continue; + con = to_srv_con(sess->s.con[i]); + rtrs_cq_qp_destroy(&con->c); + rdma_destroy_id(con->c.cm_id); + kfree(con); + } + rtrs_ib_dev_put(sess->s.dev); + + del_path_from_srv(sess); + put_srv(sess->srv); + sess->srv = NULL; + rtrs_srv_change_state(sess, RTRS_SRV_CLOSED); + + kfree(sess->dma_addr); + kfree(sess->s.con); + free_sess(sess); +} + +static int rtrs_rdma_do_accept(struct rtrs_srv_sess *sess, + struct rdma_cm_id *cm_id) +{ + struct rtrs_srv *srv = sess->srv; + struct rtrs_msg_conn_rsp msg; + struct rdma_conn_param param; + int err; + + param = (struct rdma_conn_param) { + .rnr_retry_count = 7, + .private_data = &msg, + .private_data_len = sizeof(msg), + }; + + msg = (struct rtrs_msg_conn_rsp) { + .magic = cpu_to_le16(RTRS_MAGIC), + .version = cpu_to_le16(RTRS_PROTO_VER), + .queue_depth = cpu_to_le16(srv->queue_depth), + .max_io_size = cpu_to_le32(max_chunk_size - MAX_HDR_SIZE), + .max_hdr_size = cpu_to_le32(MAX_HDR_SIZE), + }; + + if (always_invalidate) + msg.flags = cpu_to_le32(RTRS_MSG_NEW_RKEY_F); + + err = rdma_accept(cm_id, ¶m); + if (err) + pr_err("rdma_accept(), err: %d\n", err); + + return err; +} + +static int rtrs_rdma_do_reject(struct rdma_cm_id *cm_id, int errno) +{ + struct rtrs_msg_conn_rsp msg; + int err; + + msg = (struct rtrs_msg_conn_rsp) { + .magic = cpu_to_le16(RTRS_MAGIC), + .version = cpu_to_le16(RTRS_PROTO_VER), + .errno = cpu_to_le16(errno), + }; + + err = rdma_reject(cm_id, &msg, sizeof(msg)); + if (err) + pr_err("rdma_reject(), err: %d\n", err); + + /* Bounce errno back */ + return errno; +} + +static struct rtrs_srv_sess * +__find_sess(struct rtrs_srv *srv, const uuid_t *sess_uuid) +{ + struct rtrs_srv_sess *sess; + + list_for_each_entry(sess, &srv->paths_list, s.entry) { + if (uuid_equal(&sess->s.uuid, sess_uuid)) + return sess; + } + + return NULL; +} + +static int create_con(struct rtrs_srv_sess *sess, + struct rdma_cm_id *cm_id, + unsigned int cid) +{ + struct rtrs_srv *srv = sess->srv; + struct rtrs_sess *s = &sess->s; + struct rtrs_srv_con *con; + + u16 cq_size, wr_queue_size; + int err, cq_vector; + + con = kzalloc(sizeof(*con), GFP_KERNEL); + if (!con) { + err = -ENOMEM; + goto err; + } + + spin_lock_init(&con->rsp_wr_wait_lock); + INIT_LIST_HEAD(&con->rsp_wr_wait_list); + con->c.cm_id = cm_id; + con->c.sess = &sess->s; + con->c.cid = cid; + atomic_set(&con->wr_cnt, 0); + + if (con->c.cid == 0) { + /* + * All receive and all send (each requiring invalidate) + * + 2 for drain and heartbeat + */ + wr_queue_size = SERVICE_CON_QUEUE_DEPTH * 3 + 2; + cq_size = wr_queue_size; + } else { + /* + * If we have all receive requests posted and + * all write requests posted and each read request + * requires an invalidate request + drain + * and qp gets into error state. + */ + cq_size = srv->queue_depth * 3 + 1; + /* + * In theory we might have queue_depth * 32 + * outstanding requests if an unsafe global key is used + * and we have queue_depth read requests each consisting + * of 32 different addresses. div 3 for mlx5. + */ + wr_queue_size = sess->s.dev->ib_dev->attrs.max_qp_wr / 3; + } + atomic_set(&con->sq_wr_avail, wr_queue_size); + cq_vector = rtrs_srv_get_next_cq_vector(sess); + + /* TODO: SOFTIRQ can be faster, but be careful with softirq context */ + err = rtrs_cq_qp_create(&sess->s, &con->c, 1, cq_vector, cq_size, + wr_queue_size, IB_POLL_WORKQUEUE); + if (err) { + rtrs_err(s, "rtrs_cq_qp_create(), err: %d\n", err); + goto free_con; + } + if (con->c.cid == 0) { + err = post_recv_info_req(con); + if (err) + goto free_cqqp; + } + WARN_ON(sess->s.con[cid]); + sess->s.con[cid] = &con->c; + + /* + * Change context from server to current connection. The other + * way is to use cm_id->qp->qp_context, which does not work on OFED. + */ + cm_id->context = &con->c; + + return 0; + +free_cqqp: + rtrs_cq_qp_destroy(&con->c); +free_con: + kfree(con); + +err: + return err; +} + +static struct rtrs_srv_sess *__alloc_sess(struct rtrs_srv *srv, + struct rdma_cm_id *cm_id, + unsigned int con_num, + unsigned int recon_cnt, + const uuid_t *uuid) +{ + struct rtrs_srv_sess *sess; + int err = -ENOMEM; + + if (srv->paths_num >= MAX_PATHS_NUM) { + err = -ECONNRESET; + goto err; + } + if (__is_path_w_addr_exists(srv, &cm_id->route.addr)) { + err = -EEXIST; + pr_err("Path with same addr exists\n"); + goto err; + } + sess = kzalloc(sizeof(*sess), GFP_KERNEL); + if (!sess) + goto err; + + sess->stats = kzalloc(sizeof(*sess->stats), GFP_KERNEL); + if (!sess->stats) + goto err_free_sess; + + sess->stats->sess = sess; + + sess->dma_addr = kcalloc(srv->queue_depth, sizeof(*sess->dma_addr), + GFP_KERNEL); + if (!sess->dma_addr) + goto err_free_stats; + + sess->s.con = kcalloc(con_num, sizeof(*sess->s.con), GFP_KERNEL); + if (!sess->s.con) + goto err_free_dma_addr; + + sess->state = RTRS_SRV_CONNECTING; + sess->srv = srv; + sess->cur_cq_vector = -1; + sess->s.dst_addr = cm_id->route.addr.dst_addr; + sess->s.src_addr = cm_id->route.addr.src_addr; + sess->s.con_num = con_num; + sess->s.recon_cnt = recon_cnt; + uuid_copy(&sess->s.uuid, uuid); + spin_lock_init(&sess->state_lock); + INIT_WORK(&sess->close_work, rtrs_srv_close_work); + rtrs_srv_init_hb(sess); + + sess->s.dev = rtrs_ib_dev_find_or_add(cm_id->device, &dev_pd); + if (!sess->s.dev) { + err = -ENOMEM; + goto err_free_con; + } + err = map_cont_bufs(sess); + if (err) + goto err_put_dev; + + err = rtrs_srv_alloc_ops_ids(sess); + if (err) + goto err_unmap_bufs; + + __add_path_to_srv(srv, sess); + + return sess; + +err_unmap_bufs: + unmap_cont_bufs(sess); +err_put_dev: + rtrs_ib_dev_put(sess->s.dev); +err_free_con: + kfree(sess->s.con); +err_free_dma_addr: + kfree(sess->dma_addr); +err_free_stats: + kfree(sess->stats); +err_free_sess: + kfree(sess); +err: + return ERR_PTR(err); +} + +static int rtrs_rdma_connect(struct rdma_cm_id *cm_id, + const struct rtrs_msg_conn_req *msg, + size_t len) +{ + struct rtrs_srv_ctx *ctx = cm_id->context; + struct rtrs_srv_sess *sess; + struct rtrs_srv *srv; + + u16 version, con_num, cid; + u16 recon_cnt; + int err; + + if (len < sizeof(*msg)) { + pr_err("Invalid RTRS connection request\n"); + goto reject_w_econnreset; + } + if (le16_to_cpu(msg->magic) != RTRS_MAGIC) { + pr_err("Invalid RTRS magic\n"); + goto reject_w_econnreset; + } + version = le16_to_cpu(msg->version); + if (version >> 8 != RTRS_PROTO_VER_MAJOR) { + pr_err("Unsupported major RTRS version: %d, expected %d\n", + version >> 8, RTRS_PROTO_VER_MAJOR); + goto reject_w_econnreset; + } + con_num = le16_to_cpu(msg->cid_num); + if (con_num > 4096) { + /* Sanity check */ + pr_err("Too many connections requested: %d\n", con_num); + goto reject_w_econnreset; + } + cid = le16_to_cpu(msg->cid); + if (cid >= con_num) { + /* Sanity check */ + pr_err("Incorrect cid: %d >= %d\n", cid, con_num); + goto reject_w_econnreset; + } + recon_cnt = le16_to_cpu(msg->recon_cnt); + srv = get_or_create_srv(ctx, &msg->paths_uuid); + if (!srv) { + err = -ENOMEM; + goto reject_w_err; + } + mutex_lock(&srv->paths_mutex); + sess = __find_sess(srv, &msg->sess_uuid); + if (sess) { + struct rtrs_sess *s = &sess->s; + + /* Session already holds a reference */ + put_srv(srv); + + if (sess->state != RTRS_SRV_CONNECTING) { + rtrs_err(s, "Session in wrong state: %s\n", + rtrs_srv_state_str(sess->state)); + mutex_unlock(&srv->paths_mutex); + goto reject_w_econnreset; + } + /* + * Sanity checks + */ + if (con_num != sess->s.con_num || cid >= sess->s.con_num) { + rtrs_err(s, "Incorrect request: %d, %d\n", + cid, con_num); + mutex_unlock(&srv->paths_mutex); + goto reject_w_econnreset; + } + if (sess->s.con[cid]) { + rtrs_err(s, "Connection already exists: %d\n", + cid); + mutex_unlock(&srv->paths_mutex); + goto reject_w_econnreset; + } + } else { + sess = __alloc_sess(srv, cm_id, con_num, recon_cnt, + &msg->sess_uuid); + if (IS_ERR(sess)) { + mutex_unlock(&srv->paths_mutex); + put_srv(srv); + err = PTR_ERR(sess); + goto reject_w_err; + } + } + err = create_con(sess, cm_id, cid); + if (err) { + (void)rtrs_rdma_do_reject(cm_id, err); + /* + * Since session has other connections we follow normal way + * through workqueue, but still return an error to tell cma.c + * to call rdma_destroy_id() for current connection. + */ + goto close_and_return_err; + } + err = rtrs_rdma_do_accept(sess, cm_id); + if (err) { + (void)rtrs_rdma_do_reject(cm_id, err); + /* + * Since current connection was successfully added to the + * session we follow normal way through workqueue to close the + * session, thus return 0 to tell cma.c we call + * rdma_destroy_id() ourselves. + */ + err = 0; + goto close_and_return_err; + } + mutex_unlock(&srv->paths_mutex); + + return 0; + +reject_w_err: + return rtrs_rdma_do_reject(cm_id, err); + +reject_w_econnreset: + return rtrs_rdma_do_reject(cm_id, -ECONNRESET); + +close_and_return_err: + close_sess(sess); + mutex_unlock(&srv->paths_mutex); + + return err; +} + +static int rtrs_srv_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *ev) +{ + struct rtrs_srv_sess *sess = NULL; + struct rtrs_sess *s = NULL; + + if (ev->event != RDMA_CM_EVENT_CONNECT_REQUEST) { + struct rtrs_con *c = cm_id->context; + + s = c->sess; + sess = to_srv_sess(s); + } + + switch (ev->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + /* + * In case of error cma.c will destroy cm_id, + * see cma_process_remove() + */ + return rtrs_rdma_connect(cm_id, ev->param.conn.private_data, + ev->param.conn.private_data_len); + case RDMA_CM_EVENT_ESTABLISHED: + /* Nothing here */ + break; + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + rtrs_err(s, "CM error (CM event: %s, err: %d)\n", + rdma_event_msg(ev->event), ev->status); + close_sess(sess); + break; + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + close_sess(sess); + break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: + close_sess(sess); + break; + default: + pr_err("Ignoring unexpected CM event %s, err %d\n", + rdma_event_msg(ev->event), ev->status); + break; + } + + return 0; +} + +static struct rdma_cm_id *rtrs_srv_cm_init(struct rtrs_srv_ctx *ctx, + struct sockaddr *addr, + enum rdma_ucm_port_space ps) +{ + struct rdma_cm_id *cm_id; + int ret; + + cm_id = rdma_create_id(&init_net, rtrs_srv_rdma_cm_handler, + ctx, ps, IB_QPT_RC); + if (IS_ERR(cm_id)) { + ret = PTR_ERR(cm_id); + pr_err("Creating id for RDMA connection failed, err: %d\n", + ret); + goto err_out; + } + ret = rdma_bind_addr(cm_id, addr); + if (ret) { + pr_err("Binding RDMA address failed, err: %d\n", ret); + goto err_cm; + } + ret = rdma_listen(cm_id, 64); + if (ret) { + pr_err("Listening on RDMA connection failed, err: %d\n", + ret); + goto err_cm; + } + + return cm_id; + +err_cm: + rdma_destroy_id(cm_id); +err_out: + + return ERR_PTR(ret); +} + +static int rtrs_srv_rdma_init(struct rtrs_srv_ctx *ctx, u16 port) +{ + struct sockaddr_in6 sin = { + .sin6_family = AF_INET6, + .sin6_addr = IN6ADDR_ANY_INIT, + .sin6_port = htons(port), + }; + struct sockaddr_ib sib = { + .sib_family = AF_IB, + .sib_sid = cpu_to_be64(RDMA_IB_IP_PS_IB | port), + .sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL), + .sib_pkey = cpu_to_be16(0xffff), + }; + struct rdma_cm_id *cm_ip, *cm_ib; + int ret; + + /* + * We accept both IPoIB and IB connections, so we need to keep + * two cm id's, one for each socket type and port space. + * If the cm initialization of one of the id's fails, we abort + * everything. + */ + cm_ip = rtrs_srv_cm_init(ctx, (struct sockaddr *)&sin, RDMA_PS_TCP); + if (IS_ERR(cm_ip)) + return PTR_ERR(cm_ip); + + cm_ib = rtrs_srv_cm_init(ctx, (struct sockaddr *)&sib, RDMA_PS_IB); + if (IS_ERR(cm_ib)) { + ret = PTR_ERR(cm_ib); + goto free_cm_ip; + } + + ctx->cm_id_ip = cm_ip; + ctx->cm_id_ib = cm_ib; + + return 0; + +free_cm_ip: + rdma_destroy_id(cm_ip); + + return ret; +} + +static struct rtrs_srv_ctx *alloc_srv_ctx(struct rtrs_srv_ops *ops) +{ + struct rtrs_srv_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ctx->ops = *ops; + mutex_init(&ctx->srv_mutex); + INIT_LIST_HEAD(&ctx->srv_list); + + return ctx; +} + +static void free_srv_ctx(struct rtrs_srv_ctx *ctx) +{ + WARN_ON(!list_empty(&ctx->srv_list)); + mutex_destroy(&ctx->srv_mutex); + kfree(ctx); +} + +/** + * rtrs_srv_open() - open RTRS server context + * @ops: callback functions + * @port: port to listen on + * + * Creates server context with specified callbacks. + * + * Return a valid pointer on success otherwise PTR_ERR. + */ +struct rtrs_srv_ctx *rtrs_srv_open(struct rtrs_srv_ops *ops, u16 port) +{ + struct rtrs_srv_ctx *ctx; + int err; + + ctx = alloc_srv_ctx(ops); + if (!ctx) + return ERR_PTR(-ENOMEM); + + err = rtrs_srv_rdma_init(ctx, port); + if (err) { + free_srv_ctx(ctx); + return ERR_PTR(err); + } + + return ctx; +} +EXPORT_SYMBOL(rtrs_srv_open); + +static void close_sessions(struct rtrs_srv *srv) +{ + struct rtrs_srv_sess *sess; + + mutex_lock(&srv->paths_mutex); + list_for_each_entry(sess, &srv->paths_list, s.entry) + close_sess(sess); + mutex_unlock(&srv->paths_mutex); +} + +static void close_ctx(struct rtrs_srv_ctx *ctx) +{ + struct rtrs_srv *srv; + + mutex_lock(&ctx->srv_mutex); + list_for_each_entry(srv, &ctx->srv_list, ctx_list) + close_sessions(srv); + mutex_unlock(&ctx->srv_mutex); + flush_workqueue(rtrs_wq); +} + +/** + * rtrs_srv_close() - close RTRS server context + * @ctx: pointer to server context + * + * Closes RTRS server context with all client sessions. + */ +void rtrs_srv_close(struct rtrs_srv_ctx *ctx) +{ + rdma_destroy_id(ctx->cm_id_ip); + rdma_destroy_id(ctx->cm_id_ib); + close_ctx(ctx); + free_srv_ctx(ctx); +} +EXPORT_SYMBOL(rtrs_srv_close); + +static int check_module_params(void) +{ + if (sess_queue_depth < 1 || sess_queue_depth > MAX_SESS_QUEUE_DEPTH) { + pr_err("Invalid sess_queue_depth value %d, has to be >= %d, <= %d.\n", + sess_queue_depth, 1, MAX_SESS_QUEUE_DEPTH); + return -EINVAL; + } + if (max_chunk_size < 4096 || !is_power_of_2(max_chunk_size)) { + pr_err("Invalid max_chunk_size value %d, has to be >= %d and should be power of two.\n", + max_chunk_size, 4096); + return -EINVAL; + } + + /* + * Check if IB immediate data size is enough to hold the mem_id and the + * offset inside the memory chunk + */ + if ((ilog2(sess_queue_depth - 1) + 1) + + (ilog2(max_chunk_size - 1) + 1) > MAX_IMM_PAYL_BITS) { + pr_err("RDMA immediate size (%db) not enough to encode %d buffers of size %dB. Reduce 'sess_queue_depth' or 'max_chunk_size' parameters.\n", + MAX_IMM_PAYL_BITS, sess_queue_depth, max_chunk_size); + return -EINVAL; + } + + return 0; +} + +static int __init rtrs_server_init(void) +{ + int err; + + pr_info("Loading module %s, proto %s: (max_chunk_size: %d (pure IO %ld, headers %ld) , sess_queue_depth: %d, always_invalidate: %d)\n", + KBUILD_MODNAME, RTRS_PROTO_VER_STRING, + max_chunk_size, max_chunk_size - MAX_HDR_SIZE, MAX_HDR_SIZE, + sess_queue_depth, always_invalidate); + + rtrs_rdma_dev_pd_init(0, &dev_pd); + + err = check_module_params(); + if (err) { + pr_err("Failed to load module, invalid module parameters, err: %d\n", + err); + return err; + } + chunk_pool = mempool_create_page_pool(sess_queue_depth * CHUNK_POOL_SZ, + get_order(max_chunk_size)); + if (!chunk_pool) + return -ENOMEM; + rtrs_dev_class = class_create(THIS_MODULE, "rtrs-server"); + if (IS_ERR(rtrs_dev_class)) { + err = PTR_ERR(rtrs_dev_class); + goto out_chunk_pool; + } + rtrs_wq = alloc_workqueue("rtrs_server_wq", WQ_MEM_RECLAIM, 0); + if (!rtrs_wq) + goto out_dev_class; + + return 0; + +out_dev_class: + class_destroy(rtrs_dev_class); +out_chunk_pool: + mempool_destroy(chunk_pool); + + return err; +} + +static void __exit rtrs_server_exit(void) +{ + destroy_workqueue(rtrs_wq); + class_destroy(rtrs_dev_class); + mempool_destroy(chunk_pool); + rtrs_rdma_dev_pd_deinit(&dev_pd); +} + +module_init(rtrs_server_init); +module_exit(rtrs_server_exit); -- cgit v1.2.3 From c4f07c60bb021dd76382457e23d72ca078bb6f13 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:17 +0200 Subject: RDMA/rtrs: server: statistics functions This introduces set of functions used on server side to account statistics of RDMA data sent/received. Link: https://lore.kernel.org/r/20200511135131.27580-12-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c new file mode 100644 index 000000000000..e102b1368d0c --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-stats.c @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Transport Layer + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include "rtrs-srv.h" + +int rtrs_srv_reset_rdma_stats(struct rtrs_srv_stats *stats, bool enable) +{ + if (enable) { + struct rtrs_srv_stats_rdma_stats *r = &stats->rdma_stats; + + memset(r, 0, sizeof(*r)); + return 0; + } + + return -EINVAL; +} + +ssize_t rtrs_srv_stats_rdma_to_str(struct rtrs_srv_stats *stats, + char *page, size_t len) +{ + struct rtrs_srv_stats_rdma_stats *r = &stats->rdma_stats; + struct rtrs_srv_sess *sess = stats->sess; + + return scnprintf(page, len, "%lld %lld %lld %lld %u\n", + (s64)atomic64_read(&r->dir[READ].cnt), + (s64)atomic64_read(&r->dir[READ].size_total), + (s64)atomic64_read(&r->dir[WRITE].cnt), + (s64)atomic64_read(&r->dir[WRITE].size_total), + atomic_read(&sess->ids_inflight)); +} -- cgit v1.2.3 From 91b11610af8d61acd618ab1532cf34a4901fee1e Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:18 +0200 Subject: RDMA/rtrs: server: sysfs interface functions This is the sysfs interface to rtrs sessions on server side: /sys/class/rtrs-server// *** rtrs session accepted from a client peer | |- paths// *** established paths from a client in a session | |- disconnect | *** disconnect path | |- hca_name | *** HCA name | |- hca_port | *** HCA port | |- stats/ *** current path statistics | |- rdma Link: https://lore.kernel.org/r/20200511135131.27580-13-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c | 320 +++++++++++++++++++++++++++ 1 file changed, 320 insertions(+) create mode 100644 drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c new file mode 100644 index 000000000000..0cf015634338 --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Transport Layer + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include "rtrs-pri.h" +#include "rtrs-srv.h" +#include "rtrs-log.h" + +static void rtrs_srv_release(struct kobject *kobj) +{ + struct rtrs_srv_sess *sess; + + sess = container_of(kobj, struct rtrs_srv_sess, kobj); + kfree(sess); +} + +static struct kobj_type ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = rtrs_srv_release, +}; + +static ssize_t rtrs_srv_disconnect_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo 1 > %s\n", + attr->attr.name); +} + +static ssize_t rtrs_srv_disconnect_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rtrs_srv_sess *sess; + struct rtrs_sess *s; + char str[MAXHOSTNAMELEN]; + + sess = container_of(kobj, struct rtrs_srv_sess, kobj); + s = &sess->s; + if (!sysfs_streq(buf, "1")) { + rtrs_err(s, "%s: invalid value: '%s'\n", + attr->attr.name, buf); + return -EINVAL; + } + + sockaddr_to_str((struct sockaddr *)&sess->s.dst_addr, str, sizeof(str)); + + rtrs_info(s, "disconnect for path %s requested\n", str); + close_sess(sess); + + return count; +} + +static struct kobj_attribute rtrs_srv_disconnect_attr = + __ATTR(disconnect, 0644, + rtrs_srv_disconnect_show, rtrs_srv_disconnect_store); + +static ssize_t rtrs_srv_hca_port_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rtrs_srv_sess *sess; + struct rtrs_con *usr_con; + + sess = container_of(kobj, typeof(*sess), kobj); + usr_con = sess->s.con[0]; + + return scnprintf(page, PAGE_SIZE, "%u\n", + usr_con->cm_id->port_num); +} + +static struct kobj_attribute rtrs_srv_hca_port_attr = + __ATTR(hca_port, 0444, rtrs_srv_hca_port_show, NULL); + +static ssize_t rtrs_srv_hca_name_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rtrs_srv_sess *sess; + + sess = container_of(kobj, struct rtrs_srv_sess, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", + sess->s.dev->ib_dev->name); +} + +static struct kobj_attribute rtrs_srv_hca_name_attr = + __ATTR(hca_name, 0444, rtrs_srv_hca_name_show, NULL); + +static ssize_t rtrs_srv_src_addr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rtrs_srv_sess *sess; + int cnt; + + sess = container_of(kobj, struct rtrs_srv_sess, kobj); + cnt = sockaddr_to_str((struct sockaddr *)&sess->s.dst_addr, + page, PAGE_SIZE); + return cnt + scnprintf(page + cnt, PAGE_SIZE - cnt, "\n"); +} + +static struct kobj_attribute rtrs_srv_src_addr_attr = + __ATTR(src_addr, 0444, rtrs_srv_src_addr_show, NULL); + +static ssize_t rtrs_srv_dst_addr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rtrs_srv_sess *sess; + int cnt; + + sess = container_of(kobj, struct rtrs_srv_sess, kobj); + cnt = sockaddr_to_str((struct sockaddr *)&sess->s.src_addr, + page, PAGE_SIZE); + return cnt + scnprintf(page + cnt, PAGE_SIZE - cnt, "\n"); +} + +static struct kobj_attribute rtrs_srv_dst_addr_attr = + __ATTR(dst_addr, 0444, rtrs_srv_dst_addr_show, NULL); + +static struct attribute *rtrs_srv_sess_attrs[] = { + &rtrs_srv_hca_name_attr.attr, + &rtrs_srv_hca_port_attr.attr, + &rtrs_srv_src_addr_attr.attr, + &rtrs_srv_dst_addr_attr.attr, + &rtrs_srv_disconnect_attr.attr, + NULL, +}; + +static struct attribute_group rtrs_srv_sess_attr_group = { + .attrs = rtrs_srv_sess_attrs, +}; + +STAT_ATTR(struct rtrs_srv_stats, rdma, + rtrs_srv_stats_rdma_to_str, + rtrs_srv_reset_rdma_stats); + +static struct attribute *rtrs_srv_stats_attrs[] = { + &rdma_attr.attr, + NULL, +}; + +static struct attribute_group rtrs_srv_stats_attr_group = { + .attrs = rtrs_srv_stats_attrs, +}; + +static void rtrs_srv_dev_release(struct device *dev) +{ + struct rtrs_srv *srv = container_of(dev, struct rtrs_srv, dev); + + kfree(srv); +} + +static int rtrs_srv_create_once_sysfs_root_folders(struct rtrs_srv_sess *sess) +{ + struct rtrs_srv *srv = sess->srv; + int err = 0; + + mutex_lock(&srv->paths_mutex); + if (srv->dev_ref++) { + /* + * Device needs to be registered only on the first session + */ + goto unlock; + } + srv->dev.class = rtrs_dev_class; + srv->dev.release = rtrs_srv_dev_release; + err = dev_set_name(&srv->dev, "%s", sess->s.sessname); + if (err) + goto unlock; + + /* + * Suppress user space notification until + * sysfs files are created + */ + dev_set_uevent_suppress(&srv->dev, true); + err = device_register(&srv->dev); + if (err) { + pr_err("device_register(): %d\n", err); + goto put; + } + srv->kobj_paths = kobject_create_and_add("paths", &srv->dev.kobj); + if (!srv->kobj_paths) { + pr_err("kobject_create_and_add(): %d\n", err); + device_unregister(&srv->dev); + goto unlock; + } + dev_set_uevent_suppress(&srv->dev, false); + kobject_uevent(&srv->dev.kobj, KOBJ_ADD); + goto unlock; + +put: + put_device(&srv->dev); +unlock: + mutex_unlock(&srv->paths_mutex); + + return err; +} + +static void +rtrs_srv_destroy_once_sysfs_root_folders(struct rtrs_srv_sess *sess) +{ + struct rtrs_srv *srv = sess->srv; + + mutex_lock(&srv->paths_mutex); + if (!--srv->dev_ref) { + kobject_del(srv->kobj_paths); + kobject_put(srv->kobj_paths); + mutex_unlock(&srv->paths_mutex); + device_unregister(&srv->dev); + } else { + mutex_unlock(&srv->paths_mutex); + } +} + +static void rtrs_srv_sess_stats_release(struct kobject *kobj) +{ + struct rtrs_srv_stats *stats; + + stats = container_of(kobj, struct rtrs_srv_stats, kobj_stats); + + kfree(stats); +} + +static struct kobj_type ktype_stats = { + .sysfs_ops = &kobj_sysfs_ops, + .release = rtrs_srv_sess_stats_release, +}; + +static int rtrs_srv_create_stats_files(struct rtrs_srv_sess *sess) +{ + int err; + struct rtrs_sess *s = &sess->s; + + err = kobject_init_and_add(&sess->stats->kobj_stats, &ktype_stats, + &sess->kobj, "stats"); + if (err) { + rtrs_err(s, "kobject_init_and_add(): %d\n", err); + return err; + } + err = sysfs_create_group(&sess->stats->kobj_stats, + &rtrs_srv_stats_attr_group); + if (err) { + rtrs_err(s, "sysfs_create_group(): %d\n", err); + goto err; + } + + return 0; + +err: + kobject_del(&sess->stats->kobj_stats); + kobject_put(&sess->stats->kobj_stats); + + return err; +} + +int rtrs_srv_create_sess_files(struct rtrs_srv_sess *sess) +{ + struct rtrs_srv *srv = sess->srv; + struct rtrs_sess *s = &sess->s; + char str[NAME_MAX]; + int err, cnt; + + cnt = sockaddr_to_str((struct sockaddr *)&sess->s.dst_addr, + str, sizeof(str)); + cnt += scnprintf(str + cnt, sizeof(str) - cnt, "@"); + sockaddr_to_str((struct sockaddr *)&sess->s.src_addr, + str + cnt, sizeof(str) - cnt); + + err = rtrs_srv_create_once_sysfs_root_folders(sess); + if (err) + return err; + + err = kobject_init_and_add(&sess->kobj, &ktype, srv->kobj_paths, + "%s", str); + if (err) { + rtrs_err(s, "kobject_init_and_add(): %d\n", err); + goto destroy_root; + } + err = sysfs_create_group(&sess->kobj, &rtrs_srv_sess_attr_group); + if (err) { + rtrs_err(s, "sysfs_create_group(): %d\n", err); + goto put_kobj; + } + err = rtrs_srv_create_stats_files(sess); + if (err) + goto remove_group; + + return 0; + +remove_group: + sysfs_remove_group(&sess->kobj, &rtrs_srv_sess_attr_group); +put_kobj: + kobject_del(&sess->kobj); + kobject_put(&sess->kobj); +destroy_root: + rtrs_srv_destroy_once_sysfs_root_folders(sess); + + return err; +} + +void rtrs_srv_destroy_sess_files(struct rtrs_srv_sess *sess) +{ + if (sess->kobj.state_in_sysfs) { + kobject_del(&sess->stats->kobj_stats); + kobject_put(&sess->stats->kobj_stats); + kobject_del(&sess->kobj); + kobject_put(&sess->kobj); + + rtrs_srv_destroy_once_sysfs_root_folders(sess); + } +} -- cgit v1.2.3 From c013fbc1fd341d28269cf0a6b465925186b9a1e1 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:19 +0200 Subject: RDMA/rtrs: include client and server modules into kernel compilation Add rtrs Makefile, Kconfig and also corresponding lines into upper layer infiniband/ulp files. Link: https://lore.kernel.org/r/20200511135131.27580-14-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/Kconfig | 1 + drivers/infiniband/ulp/Makefile | 1 + drivers/infiniband/ulp/rtrs/Kconfig | 27 +++++++++++++++++++++++++++ drivers/infiniband/ulp/rtrs/Makefile | 15 +++++++++++++++ 4 files changed, 44 insertions(+) create mode 100644 drivers/infiniband/ulp/rtrs/Kconfig create mode 100644 drivers/infiniband/ulp/rtrs/Makefile diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index ade86388434f..477418b37786 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -107,6 +107,7 @@ source "drivers/infiniband/ulp/srpt/Kconfig" source "drivers/infiniband/ulp/iser/Kconfig" source "drivers/infiniband/ulp/isert/Kconfig" +source "drivers/infiniband/ulp/rtrs/Kconfig" source "drivers/infiniband/ulp/opa_vnic/Kconfig" diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile index 437813c7b481..4d0004b58377 100644 --- a/drivers/infiniband/ulp/Makefile +++ b/drivers/infiniband/ulp/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_INFINIBAND_SRPT) += srpt/ obj-$(CONFIG_INFINIBAND_ISER) += iser/ obj-$(CONFIG_INFINIBAND_ISERT) += isert/ obj-$(CONFIG_INFINIBAND_OPA_VNIC) += opa_vnic/ +obj-$(CONFIG_INFINIBAND_RTRS) += rtrs/ diff --git a/drivers/infiniband/ulp/rtrs/Kconfig b/drivers/infiniband/ulp/rtrs/Kconfig new file mode 100644 index 000000000000..9092b62e6dc8 --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/Kconfig @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config INFINIBAND_RTRS + tristate + depends on INFINIBAND_ADDR_TRANS + +config INFINIBAND_RTRS_CLIENT + tristate "RTRS client module" + depends on INFINIBAND_ADDR_TRANS + select INFINIBAND_RTRS + help + RDMA transport client module. + + RDMA Transport (RTRS) client implements a reliable transport layer + and also multipathing functionality and that it is intended to be + the base layer for a block storage initiator over RDMA. + +config INFINIBAND_RTRS_SERVER + tristate "RTRS server module" + depends on INFINIBAND_ADDR_TRANS + select INFINIBAND_RTRS + help + RDMA transport server module. + + RDMA Transport (RTRS) server module processing connection and IO + requests received from the RTRS client module, it will pass the + IO requests to its user eg. RNBD_server. diff --git a/drivers/infiniband/ulp/rtrs/Makefile b/drivers/infiniband/ulp/rtrs/Makefile new file mode 100644 index 000000000000..3898509be270 --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +rtrs-client-y := rtrs-clt.o \ + rtrs-clt-stats.o \ + rtrs-clt-sysfs.o + +rtrs-server-y := rtrs-srv.o \ + rtrs-srv-stats.o \ + rtrs-srv-sysfs.o + +rtrs-core-y := rtrs.o + +obj-$(CONFIG_INFINIBAND_RTRS) += rtrs-core.o +obj-$(CONFIG_INFINIBAND_RTRS_CLIENT) += rtrs-client.o +obj-$(CONFIG_INFINIBAND_RTRS_SERVER) += rtrs-server.o -- cgit v1.2.3 From 745b6a3d4a673c0b8de6e7c15b0620117614b75b Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:20 +0200 Subject: RDMA/rtrs: a bit of documentation README with description of major sysfs entries, sysfs documentation has been moved to ABI dir as suggested by Bart. Link: https://lore.kernel.org/r/20200511135131.27580-15-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Cc: linux-kernel@vger.kernel.org Signed-off-by: Jason Gunthorpe --- Documentation/ABI/testing/sysfs-class-rtrs-client | 131 +++++++++++++ Documentation/ABI/testing/sysfs-class-rtrs-server | 53 ++++++ drivers/infiniband/ulp/rtrs/README | 213 ++++++++++++++++++++++ 3 files changed, 397 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-class-rtrs-client create mode 100644 Documentation/ABI/testing/sysfs-class-rtrs-server create mode 100644 drivers/infiniband/ulp/rtrs/README diff --git a/Documentation/ABI/testing/sysfs-class-rtrs-client b/Documentation/ABI/testing/sysfs-class-rtrs-client new file mode 100644 index 000000000000..e7e718db8941 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-rtrs-client @@ -0,0 +1,131 @@ +What: /sys/class/rtrs-client +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: When a user of RTRS API creates a new session, a directory entry with + the name of that session is created under /sys/class/rtrs-client// + +What: /sys/class/rtrs-client//add_path +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RW, adds a new path (connection) to an existing session. Expected format is the + following: + + <[source addr,]destination addr> + *addr ::= [ ip: | gid: ] + +What: /sys/class/rtrs-client//max_reconnect_attempts +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Maximum number reconnect attempts the client should make before giving up + after connection breaks unexpectedly. + +What: /sys/class/rtrs-client//mp_policy +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Multipath policy specifies which path should be selected on each IO: + + round-robin (0): + select path in per CPU round-robin manner. + + min-inflight (1): + select path with minimum inflights. + +What: /sys/class/rtrs-client//paths/ +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Each path belonging to a given session is listed here by its source and + destination address. When a new path is added to a session by writing to + the "add_path" entry, a directory is created. + +What: /sys/class/rtrs-client//paths//state +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RO, Contains "connected" if the session is connected to the peer and fully + functional. Otherwise the file contains "disconnected" + +What: /sys/class/rtrs-client//paths//reconnect +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Write "1" to the file in order to reconnect the path. + Operation is blocking and returns 0 if reconnect was successful. + +What: /sys/class/rtrs-client//paths//disconnect +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Write "1" to the file in order to disconnect the path. + Operation blocks until RTRS path is disconnected. + +What: /sys/class/rtrs-client//paths//remove_path +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Write "1" to the file in order to disconnected and remove the path + from the session. Operation blocks until the path is disconnected + and removed from the session. + +What: /sys/class/rtrs-client//paths//hca_name +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RO, Contains the the name of HCA the connection established on. + +What: /sys/class/rtrs-client//paths//hca_port +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RO, Contains the port number of active port traffic is going through. + +What: /sys/class/rtrs-client//paths//src_addr +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RO, Contains the source address of the path + +What: /sys/class/rtrs-client//paths//dst_addr +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RO, Contains the destination address of the path + +What: /sys/class/rtrs-client//paths//stats/reset_all +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RW, Read will return usage help, write 0 will clear all the statistics. + +What: /sys/class/rtrs-client//paths//stats/cpu_migration +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RTRS expects that each HCA IRQ is pinned to a separate CPU. If it's + not the case, the processing of an I/O response could be processed on a + different CPU than where it was originally submitted. This file shows + how many interrupts where generated on a non expected CPU. + "from:" is the CPU on which the IRQ was expected, but not generated. + "to:" is the CPU on which the IRQ was generated, but not expected. + +What: /sys/class/rtrs-client//paths//stats/reconnects +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Contains 2 unsigned int values, the first one records number of successful + reconnects in the path lifetime, the second one records number of failed + reconnects in the path lifetime. + +What: /sys/class/rtrs-client//paths//stats/rdma +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Contains statistics regarding rdma operations and inflight operations. + The output consists of 6 values: + + \ + diff --git a/Documentation/ABI/testing/sysfs-class-rtrs-server b/Documentation/ABI/testing/sysfs-class-rtrs-server new file mode 100644 index 000000000000..3b6d5b067df0 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-rtrs-server @@ -0,0 +1,53 @@ +What: /sys/class/rtrs-server +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: When a user of RTRS API creates a new session on a client side, a + directory entry with the name of that session is created in here. + +What: /sys/class/rtrs-server//paths/ +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: When new path is created by writing to "add_path" entry on client side, + a directory entry named as @ is created + on server. + +What: /sys/class/rtrs-server//paths//disconnect +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: When "1" is written to the file, the RTRS session is being disconnected. + Operations is non-blocking and returns control immediately to the caller. + +What: /sys/class/rtrs-server//paths//hca_name +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RO, Contains the the name of HCA the connection established on. + +What: /sys/class/rtrs-server//paths//hca_port +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RO, Contains the port number of active port traffic is going through. + +What: /sys/class/rtrs-server//paths//src_addr +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RO, Contains the source address of the path + +What: /sys/class/rtrs-server//paths//dst_addr +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RO, Contains the destination address of the path + +What: /sys/class/rtrs-server//paths//stats/rdma +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Contains statistics regarding rdma operations and inflight operations. + The output consists of 5 values: + diff --git a/drivers/infiniband/ulp/rtrs/README b/drivers/infiniband/ulp/rtrs/README new file mode 100644 index 000000000000..5d9ea142e5dd --- /dev/null +++ b/drivers/infiniband/ulp/rtrs/README @@ -0,0 +1,213 @@ +**************************** +RDMA Transport (RTRS) +**************************** + +RTRS (RDMA Transport) is a reliable high speed transport library +which provides support to establish optimal number of connections +between client and server machines using RDMA (InfiniBand, RoCE, iWarp) +transport. It is optimized to transfer (read/write) IO blocks. + +In its core interface it follows the BIO semantics of providing the +possibility to either write data from an sg list to the remote side +or to request ("read") data transfer from the remote side into a given +sg list. + +RTRS provides I/O fail-over and load-balancing capabilities by using +multipath I/O (see "add_path" and "mp_policy" configuration entries in +Documentation/ABI/testing/sysfs-class-rtrs-client). + +RTRS is used by the RNBD (RDMA Network Block Device) modules. + +================== +Transport protocol +================== + +Overview +-------- +An established connection between a client and a server is called rtrs +session. A session is associated with a set of memory chunks reserved on the +server side for a given client for rdma transfer. A session +consists of multiple paths, each representing a separate physical link +between client and server. Those are used for load balancing and failover. +Each path consists of as many connections (QPs) as there are cpus on +the client. + +When processing an incoming write or read request, rtrs client uses memory +chunks reserved for him on the server side. Their number, size and addresses +need to be exchanged between client and server during the connection +establishment phase. Apart from the memory related information client needs to +inform the server about the session name and identify each path and connection +individually. + +On an established session client sends to server write or read messages. +Server uses immediate field to tell the client which request is being +acknowledged and for errno. Client uses immediate field to tell the server +which of the memory chunks has been accessed and at which offset the message +can be found. + +Module parameter always_invalidate is introduced for the security problem +discussed in LPC RDMA MC 2019. When always_invalidate=Y, on the server side we +invalidate each rdma buffer before we hand it over to RNBD server and +then pass it to the block layer. A new rkey is generated and registered for the +buffer after it returns back from the block layer and RNBD server. +The new rkey is sent back to the client along with the IO result. +The procedure is the default behaviour of the driver. This invalidation and +registration on each IO causes performance drop of up to 20%. A user of the +driver may choose to load the modules with this mechanism switched off +(always_invalidate=N), if he understands and can take the risk of a malicious +client being able to corrupt memory of a server it is connected to. This might +be a reasonable option in a scenario where all the clients and all the servers +are located within a secure datacenter. + + +Connection establishment +------------------------ + +1. Client starts establishing connections belonging to a path of a session one +by one via attaching RTRS_MSG_CON_REQ messages to the rdma_connect requests. +Those include uuid of the session and uuid of the path to be +established. They are used by the server to find a persisting session/path or +to create a new one when necessary. The message also contains the protocol +version and magic for compatibility, total number of connections per session +(as many as cpus on the client), the id of the current connection and +the reconnect counter, which is used to resolve the situations where +client is trying to reconnect a path, while server is still destroying the old +one. + +2. Server accepts the connection requests one by one and attaches +RTRS_MSG_CONN_RSP messages to the rdma_accept. Apart from magic and +protocol version, the messages include error code, queue depth supported by +the server (number of memory chunks which are going to be allocated for that +session) and the maximum size of one io, RTRS_MSG_NEW_RKEY_F flags is set +when always_invalidate=Y. + +3. After all connections of a path are established client sends to server the +RTRS_MSG_INFO_REQ message, containing the name of the session. This message +requests the address information from the server. + +4. Server replies to the session info request message with RTRS_MSG_INFO_RSP, +which contains the addresses and keys of the RDMA buffers allocated for that +session. + +5. Session becomes connected after all paths to be established are connected +(i.e. steps 1-4 finished for all paths requested for a session) + +6. Server and client exchange periodically heartbeat messages (empty rdma +messages with an immediate field) which are used to detect a crash on remote +side or network outage in an absence of IO. + +7. On any RDMA related error or in the case of a heartbeat timeout, the +corresponding path is disconnected, all the inflight IO are failed over to a +healthy path, if any, and the reconnect mechanism is triggered. + +CLT SRV +*for each connection belonging to a path and for each path: +RTRS_MSG_CON_REQ -------------------> + <------------------- RTRS_MSG_CON_RSP +... +*after all connections are established: +RTRS_MSG_INFO_REQ -------------------> + <------------------- RTRS_MSG_INFO_RSP +*heartbeat is started from both sides: + -------------------> [RTRS_HB_MSG_IMM] +[RTRS_HB_MSG_ACK] <------------------- +[RTRS_HB_MSG_IMM] <------------------- + -------------------> [RTRS_HB_MSG_ACK] + +IO path +------- + +* Write (always_invalidate=N) * + +1. When processing a write request client selects one of the memory chunks +on the server side and rdma writes there the user data, user header and the +RTRS_MSG_RDMA_WRITE message. Apart from the type (write), the message only +contains size of the user header. The client tells the server which chunk has +been accessed and at what offset the RTRS_MSG_RDMA_WRITE can be found by +using the IMM field. + +2. When confirming a write request server sends an "empty" rdma message with +an immediate field. The 32 bit field is used to specify the outstanding +inflight IO and for the error code. + +CLT SRV +usr_data + usr_hdr + rtrs_msg_rdma_write -----------------> [RTRS_IO_REQ_IMM] +[RTRS_IO_RSP_IMM] <----------------- (id + errno) + +* Write (always_invalidate=Y) * + +1. When processing a write request client selects one of the memory chunks +on the server side and rdma writes there the user data, user header and the +RTRS_MSG_RDMA_WRITE message. Apart from the type (write), the message only +contains size of the user header. The client tells the server which chunk has +been accessed and at what offset the RTRS_MSG_RDMA_WRITE can be found by +using the IMM field, Server invalidate rkey associated to the memory chunks +first, when it finishes, pass the IO to RNBD server module. + +2. When confirming a write request server sends an "empty" rdma message with +an immediate field. The 32 bit field is used to specify the outstanding +inflight IO and for the error code. The new rkey is sent back using +SEND_WITH_IMM WR, client When it recived new rkey message, it validates +the message and finished IO after update rkey for the rbuffer, then post +back the recv buffer for later use. + +CLT SRV +usr_data + usr_hdr + rtrs_msg_rdma_write -----------------> [RTRS_IO_REQ_IMM] +[RTRS_MSG_RKEY_RSP] <----------------- (RTRS_MSG_RKEY_RSP) +[RTRS_IO_RSP_IMM] <----------------- (id + errno) + + +* Read (always_invalidate=N)* + +1. When processing a read request client selects one of the memory chunks +on the server side and rdma writes there the user header and the +RTRS_MSG_RDMA_READ message. This message contains the type (read), size of +the user header, flags (specifying if memory invalidation is necessary) and the +list of addresses along with keys for the data to be read into. + +2. When confirming a read request server transfers the requested data first, +attaches an invalidation message if requested and finally an "empty" rdma +message with an immediate field. The 32 bit field is used to specify the +outstanding inflight IO and the error code. + +CLT SRV +usr_hdr + rtrs_msg_rdma_read --------------> [RTRS_IO_REQ_IMM] +[RTRS_IO_RSP_IMM] <-------------- usr_data + (id + errno) +or in case client requested invalidation: +[RTRS_IO_RSP_IMM_W_INV] <-------------- usr_data + (INV) + (id + errno) + +* Read (always_invalidate=Y)* + +1. When processing a read request client selects one of the memory chunks +on the server side and rdma writes there the user header and the +RTRS_MSG_RDMA_READ message. This message contains the type (read), size of +the user header, flags (specifying if memory invalidation is necessary) and the +list of addresses along with keys for the data to be read into. +Server invalidate rkey associated to the memory chunks first, when it finishes, +passes the IO to RNBD server module. + +2. When confirming a read request server transfers the requested data first, +attaches an invalidation message if requested and finally an "empty" rdma +message with an immediate field. The 32 bit field is used to specify the +outstanding inflight IO and the error code. The new rkey is sent back using +SEND_WITH_IMM WR, client When it recived new rkey message, it validates +the message and finished IO after update rkey for the rbuffer, then post +back the recv buffer for later use. + +CLT SRV +usr_hdr + rtrs_msg_rdma_read --------------> [RTRS_IO_REQ_IMM] +[RTRS_IO_RSP_IMM] <-------------- usr_data + (id + errno) +[RTRS_MSG_RKEY_RSP] <----------------- (RTRS_MSG_RKEY_RSP) +or in case client requested invalidation: +[RTRS_IO_RSP_IMM_W_INV] <-------------- usr_data + (INV) + (id + errno) +========================================= +Contributors List(in alphabetical order) +========================================= +Danil Kipnis +Fabian Holler +Guoqing Jiang +Jack Wang +Kleber Souza +Lutz Pogrell +Milind Dumbare +Roman Penyaev -- cgit v1.2.3 From 219ace60770117fbe440904f9156ab2ab8f30e7d Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:21 +0200 Subject: block/rnbd: private headers with rnbd protocol structs and helpers These are common private headers with rnbd protocol structures, logging, sysfs and other helper functions, which are used on both client and server sides. Link: https://lore.kernel.org/r/20200511135131.27580-16-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Reviewed-by: Bart Van Assche Acked-by: Jens Axboe Signed-off-by: Jason Gunthorpe --- drivers/block/rnbd/rnbd-common.c | 23 +++ drivers/block/rnbd/rnbd-log.h | 41 ++++++ drivers/block/rnbd/rnbd-proto.h | 303 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 367 insertions(+) create mode 100644 drivers/block/rnbd/rnbd-common.c create mode 100644 drivers/block/rnbd/rnbd-log.h create mode 100644 drivers/block/rnbd/rnbd-proto.h diff --git a/drivers/block/rnbd/rnbd-common.c b/drivers/block/rnbd/rnbd-common.c new file mode 100644 index 000000000000..596c3f732403 --- /dev/null +++ b/drivers/block/rnbd/rnbd-common.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#include "rnbd-proto.h" + +const char *rnbd_access_mode_str(enum rnbd_access_mode mode) +{ + switch (mode) { + case RNBD_ACCESS_RO: + return "ro"; + case RNBD_ACCESS_RW: + return "rw"; + case RNBD_ACCESS_MIGRATION: + return "migration"; + default: + return "unknown"; + } +} diff --git a/drivers/block/rnbd/rnbd-log.h b/drivers/block/rnbd/rnbd-log.h new file mode 100644 index 000000000000..136e7d6c3451 --- /dev/null +++ b/drivers/block/rnbd/rnbd-log.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#ifndef RNBD_LOG_H +#define RNBD_LOG_H + +#include "rnbd-clt.h" +#include "rnbd-srv.h" + +#define rnbd_clt_log(fn, dev, fmt, ...) ( \ + fn("<%s@%s> " fmt, (dev)->pathname, \ + (dev)->sess->sessname, \ + ##__VA_ARGS__)) +#define rnbd_srv_log(fn, dev, fmt, ...) ( \ + fn("<%s@%s>: " fmt, (dev)->pathname, \ + (dev)->sess->sessname, ##__VA_ARGS__)) + +#define rnbd_clt_err(dev, fmt, ...) \ + rnbd_clt_log(pr_err, dev, fmt, ##__VA_ARGS__) +#define rnbd_clt_err_rl(dev, fmt, ...) \ + rnbd_clt_log(pr_err_ratelimited, dev, fmt, ##__VA_ARGS__) +#define rnbd_clt_info(dev, fmt, ...) \ + rnbd_clt_log(pr_info, dev, fmt, ##__VA_ARGS__) +#define rnbd_clt_info_rl(dev, fmt, ...) \ + rnbd_clt_log(pr_info_ratelimited, dev, fmt, ##__VA_ARGS__) + +#define rnbd_srv_err(dev, fmt, ...) \ + rnbd_srv_log(pr_err, dev, fmt, ##__VA_ARGS__) +#define rnbd_srv_err_rl(dev, fmt, ...) \ + rnbd_srv_log(pr_err_ratelimited, dev, fmt, ##__VA_ARGS__) +#define rnbd_srv_info(dev, fmt, ...) \ + rnbd_srv_log(pr_info, dev, fmt, ##__VA_ARGS__) +#define rnbd_srv_info_rl(dev, fmt, ...) \ + rnbd_srv_log(pr_info_ratelimited, dev, fmt, ##__VA_ARGS__) + +#endif /* RNBD_LOG_H */ diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h new file mode 100644 index 000000000000..ca166241452c --- /dev/null +++ b/drivers/block/rnbd/rnbd-proto.h @@ -0,0 +1,303 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#ifndef RNBD_PROTO_H +#define RNBD_PROTO_H + +#include +#include +#include +#include +#include +#include +#include + +#define RNBD_PROTO_VER_MAJOR 2 +#define RNBD_PROTO_VER_MINOR 0 + +/* The default port number the RTRS server is listening on. */ +#define RTRS_PORT 1234 + +/** + * enum rnbd_msg_types - RNBD message types + * @RNBD_MSG_SESS_INFO: initial session info from client to server + * @RNBD_MSG_SESS_INFO_RSP: initial session info from server to client + * @RNBD_MSG_OPEN: open (map) device request + * @RNBD_MSG_OPEN_RSP: response to an @RNBD_MSG_OPEN + * @RNBD_MSG_IO: block IO request operation + * @RNBD_MSG_CLOSE: close (unmap) device request + */ +enum rnbd_msg_type { + RNBD_MSG_SESS_INFO, + RNBD_MSG_SESS_INFO_RSP, + RNBD_MSG_OPEN, + RNBD_MSG_OPEN_RSP, + RNBD_MSG_IO, + RNBD_MSG_CLOSE, +}; + +/** + * struct rnbd_msg_hdr - header of RNBD messages + * @type: Message type, valid values see: enum rnbd_msg_types + */ +struct rnbd_msg_hdr { + __le16 type; + __le16 __padding; +}; + +/** + * We allow to map RO many times and RW only once. We allow to map yet another + * time RW, if MIGRATION is provided (second RW export can be required for + * example for VM migration) + */ +enum rnbd_access_mode { + RNBD_ACCESS_RO, + RNBD_ACCESS_RW, + RNBD_ACCESS_MIGRATION, +}; + +/** + * struct rnbd_msg_sess_info - initial session info from client to server + * @hdr: message header + * @ver: RNBD protocol version + */ +struct rnbd_msg_sess_info { + struct rnbd_msg_hdr hdr; + u8 ver; + u8 reserved[31]; +}; + +/** + * struct rnbd_msg_sess_info_rsp - initial session info from server to client + * @hdr: message header + * @ver: RNBD protocol version + */ +struct rnbd_msg_sess_info_rsp { + struct rnbd_msg_hdr hdr; + u8 ver; + u8 reserved[31]; +}; + +/** + * struct rnbd_msg_open - request to open a remote device. + * @hdr: message header + * @access_mode: the mode to open remote device, valid values see: + * enum rnbd_access_mode + * @device_name: device path on remote side + */ +struct rnbd_msg_open { + struct rnbd_msg_hdr hdr; + u8 access_mode; + u8 resv1; + s8 dev_name[NAME_MAX]; + u8 reserved[3]; +}; + +/** + * struct rnbd_msg_close - request to close a remote device. + * @hdr: message header + * @device_id: device_id on server side to identify the device + */ +struct rnbd_msg_close { + struct rnbd_msg_hdr hdr; + __le32 device_id; +}; + +/** + * struct rnbd_msg_open_rsp - response message to RNBD_MSG_OPEN + * @hdr: message header + * @device_id: device_id on server side to identify the device + * @nsectors: number of sectors in the usual 512b unit + * @max_hw_sectors: max hardware sectors in the usual 512b unit + * @max_write_same_sectors: max sectors for WRITE SAME in the 512b unit + * @max_discard_sectors: max. sectors that can be discarded at once in 512b + * unit. + * @discard_granularity: size of the internal discard allocation unit in bytes + * @discard_alignment: offset from internal allocation assignment in bytes + * @physical_block_size: physical block size device supports in bytes + * @logical_block_size: logical block size device supports in bytes + * @max_segments: max segments hardware support in one transfer + * @secure_discard: supports secure discard + * @rotation: is a rotational disc? + */ +struct rnbd_msg_open_rsp { + struct rnbd_msg_hdr hdr; + __le32 device_id; + __le64 nsectors; + __le32 max_hw_sectors; + __le32 max_write_same_sectors; + __le32 max_discard_sectors; + __le32 discard_granularity; + __le32 discard_alignment; + __le16 physical_block_size; + __le16 logical_block_size; + __le16 max_segments; + __le16 secure_discard; + u8 rotational; + u8 reserved[11]; +}; + +/** + * struct rnbd_msg_io - message for I/O read/write + * @hdr: message header + * @device_id: device_id on server side to find the right device + * @sector: bi_sector attribute from struct bio + * @rw: valid values are defined in enum rnbd_io_flags + * @bi_size: number of bytes for I/O read/write + * @prio: priority + */ +struct rnbd_msg_io { + struct rnbd_msg_hdr hdr; + __le32 device_id; + __le64 sector; + __le32 rw; + __le32 bi_size; + __le16 prio; +}; + +#define RNBD_OP_BITS 8 +#define RNBD_OP_MASK ((1 << RNBD_OP_BITS) - 1) + +/** + * enum rnbd_io_flags - RNBD request types from rq_flag_bits + * @RNBD_OP_READ: read sectors from the device + * @RNBD_OP_WRITE: write sectors to the device + * @RNBD_OP_FLUSH: flush the volatile write cache + * @RNBD_OP_DISCARD: discard sectors + * @RNBD_OP_SECURE_ERASE: securely erase sectors + * @RNBD_OP_WRITE_SAME: write the same sectors many times + + * @RNBD_F_SYNC: request is sync (sync write or read) + * @RNBD_F_FUA: forced unit access + */ +enum rnbd_io_flags { + + /* Operations */ + + RNBD_OP_READ = 0, + RNBD_OP_WRITE = 1, + RNBD_OP_FLUSH = 2, + RNBD_OP_DISCARD = 3, + RNBD_OP_SECURE_ERASE = 4, + RNBD_OP_WRITE_SAME = 5, + + RNBD_OP_LAST, + + /* Flags */ + + RNBD_F_SYNC = 1<<(RNBD_OP_BITS + 0), + RNBD_F_FUA = 1<<(RNBD_OP_BITS + 1), + + RNBD_F_ALL = (RNBD_F_SYNC | RNBD_F_FUA) + +}; + +static inline u32 rnbd_op(u32 flags) +{ + return flags & RNBD_OP_MASK; +} + +static inline u32 rnbd_flags(u32 flags) +{ + return flags & ~RNBD_OP_MASK; +} + +static inline bool rnbd_flags_supported(u32 flags) +{ + u32 op; + + op = rnbd_op(flags); + flags = rnbd_flags(flags); + + if (op >= RNBD_OP_LAST) + return false; + if (flags & ~RNBD_F_ALL) + return false; + + return true; +} + +static inline u32 rnbd_to_bio_flags(u32 rnbd_opf) +{ + u32 bio_opf; + + switch (rnbd_op(rnbd_opf)) { + case RNBD_OP_READ: + bio_opf = REQ_OP_READ; + break; + case RNBD_OP_WRITE: + bio_opf = REQ_OP_WRITE; + break; + case RNBD_OP_FLUSH: + bio_opf = REQ_OP_FLUSH | REQ_PREFLUSH; + break; + case RNBD_OP_DISCARD: + bio_opf = REQ_OP_DISCARD; + break; + case RNBD_OP_SECURE_ERASE: + bio_opf = REQ_OP_SECURE_ERASE; + break; + case RNBD_OP_WRITE_SAME: + bio_opf = REQ_OP_WRITE_SAME; + break; + default: + WARN(1, "Unknown RNBD type: %d (flags %d)\n", + rnbd_op(rnbd_opf), rnbd_opf); + bio_opf = 0; + } + + if (rnbd_opf & RNBD_F_SYNC) + bio_opf |= REQ_SYNC; + + if (rnbd_opf & RNBD_F_FUA) + bio_opf |= REQ_FUA; + + return bio_opf; +} + +static inline u32 rq_to_rnbd_flags(struct request *rq) +{ + u32 rnbd_opf; + + switch (req_op(rq)) { + case REQ_OP_READ: + rnbd_opf = RNBD_OP_READ; + break; + case REQ_OP_WRITE: + rnbd_opf = RNBD_OP_WRITE; + break; + case REQ_OP_DISCARD: + rnbd_opf = RNBD_OP_DISCARD; + break; + case REQ_OP_SECURE_ERASE: + rnbd_opf = RNBD_OP_SECURE_ERASE; + break; + case REQ_OP_WRITE_SAME: + rnbd_opf = RNBD_OP_WRITE_SAME; + break; + case REQ_OP_FLUSH: + rnbd_opf = RNBD_OP_FLUSH; + break; + default: + WARN(1, "Unknown request type %d (flags %llu)\n", + req_op(rq), (unsigned long long)rq->cmd_flags); + rnbd_opf = 0; + } + + if (op_is_sync(rq->cmd_flags)) + rnbd_opf |= RNBD_F_SYNC; + + if (op_is_flush(rq->cmd_flags)) + rnbd_opf |= RNBD_F_FUA; + + return rnbd_opf; +} + +const char *rnbd_access_mode_str(enum rnbd_access_mode mode); + +#endif /* RNBD_PROTO_H */ -- cgit v1.2.3 From 90426e89f54dbb8f77d94604a06d0643dd0c3eb9 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:22 +0200 Subject: block/rnbd: client: private header with client structs and functions This header describes main structs and functions used by rnbd-client module, mainly for managing RNBD sessions and mapped block devices, creating and destroying sysfs entries. Link: https://lore.kernel.org/r/20200511135131.27580-17-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Reviewed-by: Bart Van Assche Acked-by: Jens Axboe Signed-off-by: Jason Gunthorpe --- drivers/block/rnbd/rnbd-clt.h | 156 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 drivers/block/rnbd/rnbd-clt.h diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h new file mode 100644 index 000000000000..ed33654aa486 --- /dev/null +++ b/drivers/block/rnbd/rnbd-clt.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ + +#ifndef RNBD_CLT_H +#define RNBD_CLT_H + +#include +#include +#include +#include +#include + +#include +#include "rnbd-proto.h" +#include "rnbd-log.h" + +/* Max. number of segments per IO request, Mellanox Connect X ~ Connect X5, + * choose minimial 30 for all, minus 1 for internal protocol, so 29. + */ +#define BMAX_SEGMENTS 29 +/* time in seconds between reconnect tries, default to 30 s */ +#define RECONNECT_DELAY 30 +/* + * Number of times to reconnect on error before giving up, 0 for * disabled, + * -1 for forever + */ +#define MAX_RECONNECTS -1 + +enum rnbd_clt_dev_state { + DEV_STATE_INIT, + DEV_STATE_MAPPED, + DEV_STATE_MAPPED_DISCONNECTED, + DEV_STATE_UNMAPPED, +}; + +struct rnbd_iu_comp { + wait_queue_head_t wait; + int errno; +}; + +struct rnbd_iu { + union { + struct request *rq; /* for block io */ + void *buf; /* for user messages */ + }; + struct rtrs_permit *permit; + union { + /* use to send msg associated with a dev */ + struct rnbd_clt_dev *dev; + /* use to send msg associated with a sess */ + struct rnbd_clt_session *sess; + }; + struct scatterlist sglist[BMAX_SEGMENTS]; + struct work_struct work; + int errno; + struct rnbd_iu_comp comp; + atomic_t refcount; +}; + +struct rnbd_cpu_qlist { + struct list_head requeue_list; + spinlock_t requeue_lock; + unsigned int cpu; +}; + +struct rnbd_clt_session { + struct list_head list; + struct rtrs_clt *rtrs; + wait_queue_head_t rtrs_waitq; + bool rtrs_ready; + struct rnbd_cpu_qlist __percpu + *cpu_queues; + DECLARE_BITMAP(cpu_queues_bm, NR_CPUS); + int __percpu *cpu_rr; /* per-cpu var for CPU round-robin */ + atomic_t busy; + int queue_depth; + u32 max_io_size; + struct blk_mq_tag_set tag_set; + struct mutex lock; /* protects state and devs_list */ + struct list_head devs_list; /* list of struct rnbd_clt_dev */ + refcount_t refcount; + char sessname[NAME_MAX]; + u8 ver; /* protocol version */ +}; + +/** + * Submission queues. + */ +struct rnbd_queue { + struct list_head requeue_list; + unsigned long in_list; + struct rnbd_clt_dev *dev; + struct blk_mq_hw_ctx *hctx; +}; + +struct rnbd_clt_dev { + struct rnbd_clt_session *sess; + struct request_queue *queue; + struct rnbd_queue *hw_queues; + u32 device_id; + /* local Idr index - used to track minor number allocations. */ + u32 clt_device_id; + struct mutex lock; + enum rnbd_clt_dev_state dev_state; + char pathname[NAME_MAX]; + enum rnbd_access_mode access_mode; + bool read_only; + bool rotational; + u32 max_hw_sectors; + u32 max_write_same_sectors; + u32 max_discard_sectors; + u32 discard_granularity; + u32 discard_alignment; + u16 secure_discard; + u16 physical_block_size; + u16 logical_block_size; + u16 max_segments; + size_t nsectors; + u64 size; /* device size in bytes */ + struct list_head list; + struct gendisk *gd; + struct kobject kobj; + char blk_symlink_name[NAME_MAX]; + refcount_t refcount; + struct work_struct unmap_on_rmmod_work; +}; + +/* rnbd-clt.c */ + +struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, + struct rtrs_addr *paths, + size_t path_cnt, u16 port_nr, + const char *pathname, + enum rnbd_access_mode access_mode); +int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force, + const struct attribute *sysfs_self); + +int rnbd_clt_remap_device(struct rnbd_clt_dev *dev); +int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize); + +/* rnbd-clt-sysfs.c */ + +int rnbd_clt_create_sysfs_files(void); + +void rnbd_clt_destroy_sysfs_files(void); +void rnbd_clt_destroy_default_group(void); + +void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev); + +#endif /* RNBD_CLT_H */ -- cgit v1.2.3 From f7a7a5c228d45efc45d6e26a199a3ea13d2f8754 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:23 +0200 Subject: block/rnbd: client: main functionality This is main functionality of rnbd-client module, which provides interface to map remote device as local block device /dev/rnbd and feeds RTRS with IO requests. Link: https://lore.kernel.org/r/20200511135131.27580-18-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Reviewed-by: Bart Van Assche Acked-by: Jens Axboe Signed-off-by: Jason Gunthorpe --- drivers/block/rnbd/rnbd-clt.c | 1729 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1729 insertions(+) create mode 100644 drivers/block/rnbd/rnbd-clt.c diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c new file mode 100644 index 000000000000..55bff3b1be71 --- /dev/null +++ b/drivers/block/rnbd/rnbd-clt.c @@ -0,0 +1,1729 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include + +#include "rnbd-clt.h" + +MODULE_DESCRIPTION("RDMA Network Block Device Client"); +MODULE_LICENSE("GPL"); + +static int rnbd_client_major; +static DEFINE_IDA(index_ida); +static DEFINE_MUTEX(ida_lock); +static DEFINE_MUTEX(sess_lock); +static LIST_HEAD(sess_list); + +/* + * Maximum number of partitions an instance can have. + * 6 bits = 64 minors = 63 partitions (one minor is used for the device itself) + */ +#define RNBD_PART_BITS 6 + +static inline bool rnbd_clt_get_sess(struct rnbd_clt_session *sess) +{ + return refcount_inc_not_zero(&sess->refcount); +} + +static void free_sess(struct rnbd_clt_session *sess); + +static void rnbd_clt_put_sess(struct rnbd_clt_session *sess) +{ + might_sleep(); + + if (refcount_dec_and_test(&sess->refcount)) + free_sess(sess); +} + +static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev) +{ + might_sleep(); + + if (!refcount_dec_and_test(&dev->refcount)) + return; + + mutex_lock(&ida_lock); + ida_simple_remove(&index_ida, dev->clt_device_id); + mutex_unlock(&ida_lock); + kfree(dev->hw_queues); + rnbd_clt_put_sess(dev->sess); + mutex_destroy(&dev->lock); + kfree(dev); +} + +static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev) +{ + return refcount_inc_not_zero(&dev->refcount); +} + +static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev, + const struct rnbd_msg_open_rsp *rsp) +{ + struct rnbd_clt_session *sess = dev->sess; + + if (!rsp->logical_block_size) + return -EINVAL; + + dev->device_id = le32_to_cpu(rsp->device_id); + dev->nsectors = le64_to_cpu(rsp->nsectors); + dev->logical_block_size = le16_to_cpu(rsp->logical_block_size); + dev->physical_block_size = le16_to_cpu(rsp->physical_block_size); + dev->max_write_same_sectors = le32_to_cpu(rsp->max_write_same_sectors); + dev->max_discard_sectors = le32_to_cpu(rsp->max_discard_sectors); + dev->discard_granularity = le32_to_cpu(rsp->discard_granularity); + dev->discard_alignment = le32_to_cpu(rsp->discard_alignment); + dev->secure_discard = le16_to_cpu(rsp->secure_discard); + dev->rotational = rsp->rotational; + + dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE; + dev->max_segments = BMAX_SEGMENTS; + + dev->max_hw_sectors = min_t(u32, dev->max_hw_sectors, + le32_to_cpu(rsp->max_hw_sectors)); + dev->max_segments = min_t(u16, dev->max_segments, + le16_to_cpu(rsp->max_segments)); + + return 0; +} + +static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, + size_t new_nsectors) +{ + int err = 0; + + rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n", + dev->nsectors, new_nsectors); + dev->nsectors = new_nsectors; + set_capacity(dev->gd, dev->nsectors); + err = revalidate_disk(dev->gd); + if (err) + rnbd_clt_err(dev, + "Failed to change device size from %zu to %zu, err: %d\n", + dev->nsectors, new_nsectors, err); + return err; +} + +static int process_msg_open_rsp(struct rnbd_clt_dev *dev, + struct rnbd_msg_open_rsp *rsp) +{ + int err = 0; + + mutex_lock(&dev->lock); + if (dev->dev_state == DEV_STATE_UNMAPPED) { + rnbd_clt_info(dev, + "Ignoring Open-Response message from server for unmapped device\n"); + err = -ENOENT; + goto out; + } + if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) { + u64 nsectors = le64_to_cpu(rsp->nsectors); + + /* + * If the device was remapped and the size changed in the + * meantime we need to revalidate it + */ + if (dev->nsectors != nsectors) + rnbd_clt_change_capacity(dev, nsectors); + rnbd_clt_info(dev, "Device online, device remapped successfully\n"); + } + err = rnbd_clt_set_dev_attr(dev, rsp); + if (err) + goto out; + dev->dev_state = DEV_STATE_MAPPED; + +out: + mutex_unlock(&dev->lock); + + return err; +} + +int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize) +{ + int ret = 0; + + mutex_lock(&dev->lock); + if (dev->dev_state != DEV_STATE_MAPPED) { + pr_err("Failed to set new size of the device, device is not opened\n"); + ret = -ENOENT; + goto out; + } + ret = rnbd_clt_change_capacity(dev, newsize); + +out: + mutex_unlock(&dev->lock); + + return ret; +} + +static inline void rnbd_clt_dev_requeue(struct rnbd_queue *q) +{ + if (WARN_ON(!q->hctx)) + return; + + /* We can come here from interrupt, thus async=true */ + blk_mq_run_hw_queue(q->hctx, true); +} + +enum { + RNBD_DELAY_IFBUSY = -1, +}; + +/** + * rnbd_get_cpu_qlist() - finds a list with HW queues to be rerun + * @sess: Session to find a queue for + * @cpu: Cpu to start the search from + * + * Description: + * Each CPU has a list of HW queues, which needs to be rerun. If a list + * is not empty - it is marked with a bit. This function finds first + * set bit in a bitmap and returns corresponding CPU list. + */ +static struct rnbd_cpu_qlist * +rnbd_get_cpu_qlist(struct rnbd_clt_session *sess, int cpu) +{ + int bit; + + /* Search from cpu to nr_cpu_ids */ + bit = find_next_bit(sess->cpu_queues_bm, nr_cpu_ids, cpu); + if (bit < nr_cpu_ids) { + return per_cpu_ptr(sess->cpu_queues, bit); + } else if (cpu != 0) { + /* Search from 0 to cpu */ + bit = find_next_bit(sess->cpu_queues_bm, cpu, 0); + if (bit < cpu) + return per_cpu_ptr(sess->cpu_queues, bit); + } + + return NULL; +} + +static inline int nxt_cpu(int cpu) +{ + return (cpu + 1) % nr_cpu_ids; +} + +/** + * rnbd_rerun_if_needed() - rerun next queue marked as stopped + * @sess: Session to rerun a queue on + * + * Description: + * Each CPU has it's own list of HW queues, which should be rerun. + * Function finds such list with HW queues, takes a list lock, picks up + * the first HW queue out of the list and requeues it. + * + * Return: + * True if the queue was requeued, false otherwise. + * + * Context: + * Does not matter. + */ +static bool rnbd_rerun_if_needed(struct rnbd_clt_session *sess) +{ + struct rnbd_queue *q = NULL; + struct rnbd_cpu_qlist *cpu_q; + unsigned long flags; + int *cpup; + + /* + * To keep fairness and not to let other queues starve we always + * try to wake up someone else in round-robin manner. That of course + * increases latency but queues always have a chance to be executed. + */ + cpup = get_cpu_ptr(sess->cpu_rr); + for (cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(*cpup)); cpu_q; + cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(cpu_q->cpu))) { + if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags)) + continue; + if (unlikely(!test_bit(cpu_q->cpu, sess->cpu_queues_bm))) + goto unlock; + q = list_first_entry_or_null(&cpu_q->requeue_list, + typeof(*q), requeue_list); + if (WARN_ON(!q)) + goto clear_bit; + list_del_init(&q->requeue_list); + clear_bit_unlock(0, &q->in_list); + + if (list_empty(&cpu_q->requeue_list)) { + /* Clear bit if nothing is left */ +clear_bit: + clear_bit(cpu_q->cpu, sess->cpu_queues_bm); + } +unlock: + spin_unlock_irqrestore(&cpu_q->requeue_lock, flags); + + if (q) + break; + } + + /** + * Saves the CPU that is going to be requeued on the per-cpu var. Just + * incrementing it doesn't work because rnbd_get_cpu_qlist() will + * always return the first CPU with something on the queue list when the + * value stored on the var is greater than the last CPU with something + * on the list. + */ + if (cpu_q) + *cpup = cpu_q->cpu; + put_cpu_var(sess->cpu_rr); + + if (q) + rnbd_clt_dev_requeue(q); + + return q; +} + +/** + * rnbd_rerun_all_if_idle() - rerun all queues left in the list if + * session is idling (there are no requests + * in-flight). + * @sess: Session to rerun the queues on + * + * Description: + * This function tries to rerun all stopped queues if there are no + * requests in-flight anymore. This function tries to solve an obvious + * problem, when number of tags < than number of queues (hctx), which + * are stopped and put to sleep. If last permit, which has been just put, + * does not wake up all left queues (hctxs), IO requests hang forever. + * + * That can happen when all number of permits, say N, have been exhausted + * from one CPU, and we have many block devices per session, say M. + * Each block device has it's own queue (hctx) for each CPU, so eventually + * we can put that number of queues (hctxs) to sleep: M x nr_cpu_ids. + * If number of permits N < M x nr_cpu_ids finally we will get an IO hang. + * + * To avoid this hang last caller of rnbd_put_permit() (last caller is the + * one who observes sess->busy == 0) must wake up all remaining queues. + * + * Context: + * Does not matter. + */ +static void rnbd_rerun_all_if_idle(struct rnbd_clt_session *sess) +{ + bool requeued; + + do { + requeued = rnbd_rerun_if_needed(sess); + } while (atomic_read(&sess->busy) == 0 && requeued); +} + +static struct rtrs_permit *rnbd_get_permit(struct rnbd_clt_session *sess, + enum rtrs_clt_con_type con_type, + int wait) +{ + struct rtrs_permit *permit; + + permit = rtrs_clt_get_permit(sess->rtrs, con_type, + wait ? RTRS_PERMIT_WAIT : + RTRS_PERMIT_NOWAIT); + if (likely(permit)) + /* We have a subtle rare case here, when all permits can be + * consumed before busy counter increased. This is safe, + * because loser will get NULL as a permit, observe 0 busy + * counter and immediately restart the queue himself. + */ + atomic_inc(&sess->busy); + + return permit; +} + +static void rnbd_put_permit(struct rnbd_clt_session *sess, + struct rtrs_permit *permit) +{ + rtrs_clt_put_permit(sess->rtrs, permit); + atomic_dec(&sess->busy); + /* Paired with rnbd_clt_dev_add_to_requeue(). Decrement first + * and then check queue bits. + */ + smp_mb__after_atomic(); + rnbd_rerun_all_if_idle(sess); +} + +static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess, + enum rtrs_clt_con_type con_type, + int wait) +{ + struct rnbd_iu *iu; + struct rtrs_permit *permit; + + permit = rnbd_get_permit(sess, con_type, + wait ? RTRS_PERMIT_WAIT : + RTRS_PERMIT_NOWAIT); + if (unlikely(!permit)) + return NULL; + iu = rtrs_permit_to_pdu(permit); + iu->permit = permit; + /* + * 1st reference is dropped after finishing sending a "user" message, + * 2nd reference is dropped after confirmation with the response is + * returned. + * 1st and 2nd can happen in any order, so the rnbd_iu should be + * released (rtrs_permit returned to ibbtrs) only leased after both + * are finished. + */ + atomic_set(&iu->refcount, 2); + init_waitqueue_head(&iu->comp.wait); + iu->comp.errno = INT_MAX; + + return iu; +} + +static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu) +{ + if (atomic_dec_and_test(&iu->refcount)) + rnbd_put_permit(sess, iu->permit); +} + +static void rnbd_softirq_done_fn(struct request *rq) +{ + struct rnbd_clt_dev *dev = rq->rq_disk->private_data; + struct rnbd_clt_session *sess = dev->sess; + struct rnbd_iu *iu; + + iu = blk_mq_rq_to_pdu(rq); + rnbd_put_permit(sess, iu->permit); + blk_mq_end_request(rq, errno_to_blk_status(iu->errno)); +} + +static void msg_io_conf(void *priv, int errno) +{ + struct rnbd_iu *iu = priv; + struct rnbd_clt_dev *dev = iu->dev; + struct request *rq = iu->rq; + int rw = rq_data_dir(rq); + + iu->errno = errno; + + blk_mq_complete_request(rq); + + if (errno) + rnbd_clt_info_rl(dev, "%s I/O failed with err: %d\n", + rw == READ ? "read" : "write", errno); +} + +static void wake_up_iu_comp(struct rnbd_iu *iu, int errno) +{ + iu->comp.errno = errno; + wake_up(&iu->comp.wait); +} + +static void msg_conf(void *priv, int errno) +{ + struct rnbd_iu *iu = priv; + + iu->errno = errno; + schedule_work(&iu->work); +} + +enum wait_type { + NO_WAIT = 0, + WAIT = 1 +}; + +static int send_usr_msg(struct rtrs_clt *rtrs, int dir, + struct rnbd_iu *iu, struct kvec *vec, size_t nr, + size_t len, struct scatterlist *sg, unsigned int sg_len, + void (*conf)(struct work_struct *work), + int *errno, enum wait_type wait) +{ + int err; + struct rtrs_clt_req_ops req_ops; + + INIT_WORK(&iu->work, conf); + req_ops = (struct rtrs_clt_req_ops) { + .priv = iu, + .conf_fn = msg_conf, + }; + err = rtrs_clt_request(dir, &req_ops, rtrs, iu->permit, + vec, nr, len, sg, sg_len); + if (!err && wait) { + wait_event(iu->comp.wait, iu->comp.errno != INT_MAX); + *errno = iu->comp.errno; + } else { + *errno = 0; + } + + return err; +} + +static void msg_close_conf(struct work_struct *work) +{ + struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work); + struct rnbd_clt_dev *dev = iu->dev; + + wake_up_iu_comp(iu, iu->errno); + rnbd_put_iu(dev->sess, iu); + rnbd_clt_put_dev(dev); +} + +static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id, bool wait) +{ + struct rnbd_clt_session *sess = dev->sess; + struct rnbd_msg_close msg; + struct rnbd_iu *iu; + struct kvec vec = { + .iov_base = &msg, + .iov_len = sizeof(msg) + }; + int err, errno; + + iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT); + if (!iu) + return -ENOMEM; + + iu->buf = NULL; + iu->dev = dev; + + sg_mark_end(&iu->sglist[0]); + + msg.hdr.type = cpu_to_le16(RNBD_MSG_CLOSE); + msg.device_id = cpu_to_le32(device_id); + + WARN_ON(!rnbd_clt_get_dev(dev)); + err = send_usr_msg(sess->rtrs, WRITE, iu, &vec, 1, 0, NULL, 0, + msg_close_conf, &errno, wait); + if (err) { + rnbd_clt_put_dev(dev); + rnbd_put_iu(sess, iu); + } else { + err = errno; + } + + rnbd_put_iu(sess, iu); + return err; +} + +static void msg_open_conf(struct work_struct *work) +{ + struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work); + struct rnbd_msg_open_rsp *rsp = iu->buf; + struct rnbd_clt_dev *dev = iu->dev; + int errno = iu->errno; + + if (errno) { + rnbd_clt_err(dev, + "Opening failed, server responded: %d\n", + errno); + } else { + errno = process_msg_open_rsp(dev, rsp); + if (errno) { + u32 device_id = le32_to_cpu(rsp->device_id); + /* + * If server thinks its fine, but we fail to process + * then be nice and send a close to server. + */ + (void)send_msg_close(dev, device_id, NO_WAIT); + } + } + kfree(rsp); + wake_up_iu_comp(iu, errno); + rnbd_put_iu(dev->sess, iu); + rnbd_clt_put_dev(dev); +} + +static void msg_sess_info_conf(struct work_struct *work) +{ + struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work); + struct rnbd_msg_sess_info_rsp *rsp = iu->buf; + struct rnbd_clt_session *sess = iu->sess; + + if (!iu->errno) + sess->ver = min_t(u8, rsp->ver, RNBD_PROTO_VER_MAJOR); + + kfree(rsp); + wake_up_iu_comp(iu, iu->errno); + rnbd_put_iu(sess, iu); + rnbd_clt_put_sess(sess); +} + +static int send_msg_open(struct rnbd_clt_dev *dev, bool wait) +{ + struct rnbd_clt_session *sess = dev->sess; + struct rnbd_msg_open_rsp *rsp; + struct rnbd_msg_open msg; + struct rnbd_iu *iu; + struct kvec vec = { + .iov_base = &msg, + .iov_len = sizeof(msg) + }; + int err, errno; + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT); + if (!iu) { + kfree(rsp); + return -ENOMEM; + } + + iu->buf = rsp; + iu->dev = dev; + + sg_init_one(iu->sglist, rsp, sizeof(*rsp)); + + msg.hdr.type = cpu_to_le16(RNBD_MSG_OPEN); + msg.access_mode = dev->access_mode; + strlcpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name)); + + WARN_ON(!rnbd_clt_get_dev(dev)); + err = send_usr_msg(sess->rtrs, READ, iu, + &vec, 1, sizeof(*rsp), iu->sglist, 1, + msg_open_conf, &errno, wait); + if (err) { + rnbd_clt_put_dev(dev); + rnbd_put_iu(sess, iu); + kfree(rsp); + } else { + err = errno; + } + + rnbd_put_iu(sess, iu); + return err; +} + +static int send_msg_sess_info(struct rnbd_clt_session *sess, bool wait) +{ + struct rnbd_msg_sess_info_rsp *rsp; + struct rnbd_msg_sess_info msg; + struct rnbd_iu *iu; + struct kvec vec = { + .iov_base = &msg, + .iov_len = sizeof(msg) + }; + int err, errno; + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT); + if (!iu) { + kfree(rsp); + return -ENOMEM; + } + + iu->buf = rsp; + iu->sess = sess; + + sg_init_one(iu->sglist, rsp, sizeof(*rsp)); + + msg.hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO); + msg.ver = RNBD_PROTO_VER_MAJOR; + + if (!rnbd_clt_get_sess(sess)) { + /* + * That can happen only in one case, when RTRS has restablished + * the connection and link_ev() is called, but session is almost + * dead, last reference on session is put and caller is waiting + * for RTRS to close everything. + */ + err = -ENODEV; + goto put_iu; + } + err = send_usr_msg(sess->rtrs, READ, iu, + &vec, 1, sizeof(*rsp), iu->sglist, 1, + msg_sess_info_conf, &errno, wait); + if (err) { + rnbd_clt_put_sess(sess); +put_iu: + rnbd_put_iu(sess, iu); + kfree(rsp); + } else { + err = errno; + } + + rnbd_put_iu(sess, iu); + return err; +} + +static void set_dev_states_to_disconnected(struct rnbd_clt_session *sess) +{ + struct rnbd_clt_dev *dev; + + mutex_lock(&sess->lock); + list_for_each_entry(dev, &sess->devs_list, list) { + rnbd_clt_err(dev, "Device disconnected.\n"); + + mutex_lock(&dev->lock); + if (dev->dev_state == DEV_STATE_MAPPED) + dev->dev_state = DEV_STATE_MAPPED_DISCONNECTED; + mutex_unlock(&dev->lock); + } + mutex_unlock(&sess->lock); +} + +static void remap_devs(struct rnbd_clt_session *sess) +{ + struct rnbd_clt_dev *dev; + struct rtrs_attrs attrs; + int err; + + /* + * Careful here: we are called from RTRS link event directly, + * thus we can't send any RTRS request and wait for response + * or RTRS will not be able to complete request with failure + * if something goes wrong (failing of outstanding requests + * happens exactly from the context where we are blocking now). + * + * So to avoid deadlocks each usr message sent from here must + * be asynchronous. + */ + + err = send_msg_sess_info(sess, NO_WAIT); + if (err) { + pr_err("send_msg_sess_info(\"%s\"): %d\n", sess->sessname, err); + return; + } + + rtrs_clt_query(sess->rtrs, &attrs); + mutex_lock(&sess->lock); + sess->max_io_size = attrs.max_io_size; + + list_for_each_entry(dev, &sess->devs_list, list) { + bool skip; + + mutex_lock(&dev->lock); + skip = (dev->dev_state == DEV_STATE_INIT); + mutex_unlock(&dev->lock); + if (skip) + /* + * When device is establishing connection for the first + * time - do not remap, it will be closed soon. + */ + continue; + + rnbd_clt_info(dev, "session reconnected, remapping device\n"); + err = send_msg_open(dev, NO_WAIT); + if (err) { + rnbd_clt_err(dev, "send_msg_open(): %d\n", err); + break; + } + } + mutex_unlock(&sess->lock); +} + +static void rnbd_clt_link_ev(void *priv, enum rtrs_clt_link_ev ev) +{ + struct rnbd_clt_session *sess = priv; + + switch (ev) { + case RTRS_CLT_LINK_EV_DISCONNECTED: + set_dev_states_to_disconnected(sess); + break; + case RTRS_CLT_LINK_EV_RECONNECTED: + remap_devs(sess); + break; + default: + pr_err("Unknown session event received (%d), session: %s\n", + ev, sess->sessname); + } +} + +static void rnbd_init_cpu_qlists(struct rnbd_cpu_qlist __percpu *cpu_queues) +{ + unsigned int cpu; + struct rnbd_cpu_qlist *cpu_q; + + for_each_possible_cpu(cpu) { + cpu_q = per_cpu_ptr(cpu_queues, cpu); + + cpu_q->cpu = cpu; + INIT_LIST_HEAD(&cpu_q->requeue_list); + spin_lock_init(&cpu_q->requeue_lock); + } +} + +static void destroy_mq_tags(struct rnbd_clt_session *sess) +{ + if (sess->tag_set.tags) + blk_mq_free_tag_set(&sess->tag_set); +} + +static inline void wake_up_rtrs_waiters(struct rnbd_clt_session *sess) +{ + sess->rtrs_ready = true; + wake_up_all(&sess->rtrs_waitq); +} + +static void close_rtrs(struct rnbd_clt_session *sess) +{ + might_sleep(); + + if (!IS_ERR_OR_NULL(sess->rtrs)) { + rtrs_clt_close(sess->rtrs); + sess->rtrs = NULL; + wake_up_rtrs_waiters(sess); + } +} + +static void free_sess(struct rnbd_clt_session *sess) +{ + WARN_ON(!list_empty(&sess->devs_list)); + + might_sleep(); + + close_rtrs(sess); + destroy_mq_tags(sess); + if (!list_empty(&sess->list)) { + mutex_lock(&sess_lock); + list_del(&sess->list); + mutex_unlock(&sess_lock); + } + free_percpu(sess->cpu_queues); + free_percpu(sess->cpu_rr); + mutex_destroy(&sess->lock); + kfree(sess); +} + +static struct rnbd_clt_session *alloc_sess(const char *sessname) +{ + struct rnbd_clt_session *sess; + int err, cpu; + + sess = kzalloc_node(sizeof(*sess), GFP_KERNEL, NUMA_NO_NODE); + if (!sess) + return ERR_PTR(-ENOMEM); + strlcpy(sess->sessname, sessname, sizeof(sess->sessname)); + atomic_set(&sess->busy, 0); + mutex_init(&sess->lock); + INIT_LIST_HEAD(&sess->devs_list); + INIT_LIST_HEAD(&sess->list); + bitmap_zero(sess->cpu_queues_bm, NR_CPUS); + init_waitqueue_head(&sess->rtrs_waitq); + refcount_set(&sess->refcount, 1); + + sess->cpu_queues = alloc_percpu(struct rnbd_cpu_qlist); + if (!sess->cpu_queues) { + err = -ENOMEM; + goto err; + } + rnbd_init_cpu_qlists(sess->cpu_queues); + + /* + * That is simple percpu variable which stores cpu indeces, which are + * incremented on each access. We need that for the sake of fairness + * to wake up queues in a round-robin manner. + */ + sess->cpu_rr = alloc_percpu(int); + if (!sess->cpu_rr) { + err = -ENOMEM; + goto err; + } + for_each_possible_cpu(cpu) + * per_cpu_ptr(sess->cpu_rr, cpu) = cpu; + + return sess; + +err: + free_sess(sess); + + return ERR_PTR(err); +} + +static int wait_for_rtrs_connection(struct rnbd_clt_session *sess) +{ + wait_event(sess->rtrs_waitq, sess->rtrs_ready); + if (IS_ERR_OR_NULL(sess->rtrs)) + return -ECONNRESET; + + return 0; +} + +static void wait_for_rtrs_disconnection(struct rnbd_clt_session *sess) + __releases(&sess_lock) + __acquires(&sess_lock) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&sess->rtrs_waitq, &wait, TASK_UNINTERRUPTIBLE); + if (IS_ERR_OR_NULL(sess->rtrs)) { + finish_wait(&sess->rtrs_waitq, &wait); + return; + } + mutex_unlock(&sess_lock); + /* loop in caller, see __find_and_get_sess(). + * You can't leave mutex locked and call schedule(), you will catch a + * deadlock with a caller of free_sess(), which has just put the last + * reference and is about to take the sess_lock in order to delete + * the session from the list. + */ + schedule(); + mutex_lock(&sess_lock); +} + +static struct rnbd_clt_session *__find_and_get_sess(const char *sessname) + __releases(&sess_lock) + __acquires(&sess_lock) +{ + struct rnbd_clt_session *sess, *sn; + int err; + +again: + list_for_each_entry_safe(sess, sn, &sess_list, list) { + if (strcmp(sessname, sess->sessname)) + continue; + + if (sess->rtrs_ready && IS_ERR_OR_NULL(sess->rtrs)) + /* + * No RTRS connection, session is dying. + */ + continue; + + if (rnbd_clt_get_sess(sess)) { + /* + * Alive session is found, wait for RTRS connection. + */ + mutex_unlock(&sess_lock); + err = wait_for_rtrs_connection(sess); + if (err) + rnbd_clt_put_sess(sess); + mutex_lock(&sess_lock); + + if (err) + /* Session is dying, repeat the loop */ + goto again; + + return sess; + } + /* + * Ref is 0, session is dying, wait for RTRS disconnect + * in order to avoid session names clashes. + */ + wait_for_rtrs_disconnection(sess); + /* + * RTRS is disconnected and soon session will be freed, + * so repeat a loop. + */ + goto again; + } + + return NULL; +} + +static struct +rnbd_clt_session *find_or_create_sess(const char *sessname, bool *first) +{ + struct rnbd_clt_session *sess = NULL; + + mutex_lock(&sess_lock); + sess = __find_and_get_sess(sessname); + if (!sess) { + sess = alloc_sess(sessname); + if (sess) { + list_add(&sess->list, &sess_list); + *first = true; + } else { + mutex_unlock(&sess_lock); + return ERR_PTR(-ENOMEM); + } + } else + *first = false; + mutex_unlock(&sess_lock); + + return sess; +} + +static int rnbd_client_open(struct block_device *block_device, fmode_t mode) +{ + struct rnbd_clt_dev *dev = block_device->bd_disk->private_data; + + if (dev->read_only && (mode & FMODE_WRITE)) + return -EPERM; + + if (dev->dev_state == DEV_STATE_UNMAPPED || + !rnbd_clt_get_dev(dev)) + return -EIO; + + return 0; +} + +static void rnbd_client_release(struct gendisk *gen, fmode_t mode) +{ + struct rnbd_clt_dev *dev = gen->private_data; + + rnbd_clt_put_dev(dev); +} + +static int rnbd_client_getgeo(struct block_device *block_device, + struct hd_geometry *geo) +{ + u64 size; + struct rnbd_clt_dev *dev; + + dev = block_device->bd_disk->private_data; + size = dev->size * (dev->logical_block_size / SECTOR_SIZE); + geo->cylinders = size >> 6; /* size/64 */ + geo->heads = 4; + geo->sectors = 16; + geo->start = 0; + + return 0; +} + +static const struct block_device_operations rnbd_client_ops = { + .owner = THIS_MODULE, + .open = rnbd_client_open, + .release = rnbd_client_release, + .getgeo = rnbd_client_getgeo +}; + +/* The amount of data that belongs to an I/O and the amount of data that + * should be read or written to the disk (bi_size) can differ. + * + * E.g. When WRITE_SAME is used, only a small amount of data is + * transferred that is then written repeatedly over a lot of sectors. + * + * Get the size of data to be transferred via RTRS by summing up the size + * of the scather-gather list entries. + */ +static size_t rnbd_clt_get_sg_size(struct scatterlist *sglist, u32 len) +{ + struct scatterlist *sg; + size_t tsize = 0; + int i; + + for_each_sg(sglist, sg, len, i) + tsize += sg->length; + return tsize; +} + +static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev, + struct request *rq, + struct rnbd_iu *iu) +{ + struct rtrs_clt *rtrs = dev->sess->rtrs; + struct rtrs_permit *permit = iu->permit; + struct rnbd_msg_io msg; + struct rtrs_clt_req_ops req_ops; + unsigned int sg_cnt = 0; + struct kvec vec; + size_t size; + int err; + + iu->rq = rq; + iu->dev = dev; + msg.sector = cpu_to_le64(blk_rq_pos(rq)); + msg.bi_size = cpu_to_le32(blk_rq_bytes(rq)); + msg.rw = cpu_to_le32(rq_to_rnbd_flags(rq)); + msg.prio = cpu_to_le16(req_get_ioprio(rq)); + + /* + * We only support discards with single segment for now. + * See queue limits. + */ + if (req_op(rq) != REQ_OP_DISCARD) + sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sglist); + + if (sg_cnt == 0) + /* Do not forget to mark the end */ + sg_mark_end(&iu->sglist[0]); + + msg.hdr.type = cpu_to_le16(RNBD_MSG_IO); + msg.device_id = cpu_to_le32(dev->device_id); + + vec = (struct kvec) { + .iov_base = &msg, + .iov_len = sizeof(msg) + }; + size = rnbd_clt_get_sg_size(iu->sglist, sg_cnt); + req_ops = (struct rtrs_clt_req_ops) { + .priv = iu, + .conf_fn = msg_io_conf, + }; + err = rtrs_clt_request(rq_data_dir(rq), &req_ops, rtrs, permit, + &vec, 1, size, iu->sglist, sg_cnt); + if (unlikely(err)) { + rnbd_clt_err_rl(dev, "RTRS failed to transfer IO, err: %d\n", + err); + return err; + } + + return 0; +} + +/** + * rnbd_clt_dev_add_to_requeue() - add device to requeue if session is busy + * @dev: Device to be checked + * @q: Queue to be added to the requeue list if required + * + * Description: + * If session is busy, that means someone will requeue us when resources + * are freed. If session is not doing anything - device is not added to + * the list and @false is returned. + */ +static bool rnbd_clt_dev_add_to_requeue(struct rnbd_clt_dev *dev, + struct rnbd_queue *q) +{ + struct rnbd_clt_session *sess = dev->sess; + struct rnbd_cpu_qlist *cpu_q; + unsigned long flags; + bool added = true; + bool need_set; + + cpu_q = get_cpu_ptr(sess->cpu_queues); + spin_lock_irqsave(&cpu_q->requeue_lock, flags); + + if (likely(!test_and_set_bit_lock(0, &q->in_list))) { + if (WARN_ON(!list_empty(&q->requeue_list))) + goto unlock; + + need_set = !test_bit(cpu_q->cpu, sess->cpu_queues_bm); + if (need_set) { + set_bit(cpu_q->cpu, sess->cpu_queues_bm); + /* Paired with rnbd_put_permit(). Set a bit first + * and then observe the busy counter. + */ + smp_mb__before_atomic(); + } + if (likely(atomic_read(&sess->busy))) { + list_add_tail(&q->requeue_list, &cpu_q->requeue_list); + } else { + /* Very unlikely, but possible: busy counter was + * observed as zero. Drop all bits and return + * false to restart the queue by ourselves. + */ + if (need_set) + clear_bit(cpu_q->cpu, sess->cpu_queues_bm); + clear_bit_unlock(0, &q->in_list); + added = false; + } + } +unlock: + spin_unlock_irqrestore(&cpu_q->requeue_lock, flags); + put_cpu_ptr(sess->cpu_queues); + + return added; +} + +static void rnbd_clt_dev_kick_mq_queue(struct rnbd_clt_dev *dev, + struct blk_mq_hw_ctx *hctx, + int delay) +{ + struct rnbd_queue *q = hctx->driver_data; + + if (delay != RNBD_DELAY_IFBUSY) + blk_mq_delay_run_hw_queue(hctx, delay); + else if (unlikely(!rnbd_clt_dev_add_to_requeue(dev, q))) + /* + * If session is not busy we have to restart + * the queue ourselves. + */ + blk_mq_delay_run_hw_queue(hctx, 10/*ms*/); +} + +static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *rq = bd->rq; + struct rnbd_clt_dev *dev = rq->rq_disk->private_data; + struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq); + int err; + + if (unlikely(dev->dev_state != DEV_STATE_MAPPED)) + return BLK_STS_IOERR; + + iu->permit = rnbd_get_permit(dev->sess, RTRS_IO_CON, + RTRS_PERMIT_NOWAIT); + if (unlikely(!iu->permit)) { + rnbd_clt_dev_kick_mq_queue(dev, hctx, RNBD_DELAY_IFBUSY); + return BLK_STS_RESOURCE; + } + + blk_mq_start_request(rq); + err = rnbd_client_xfer_request(dev, rq, iu); + if (likely(err == 0)) + return BLK_STS_OK; + if (unlikely(err == -EAGAIN || err == -ENOMEM)) { + rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/); + rnbd_put_permit(dev->sess, iu->permit); + return BLK_STS_RESOURCE; + } + + rnbd_put_permit(dev->sess, iu->permit); + return BLK_STS_IOERR; +} + +static int rnbd_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq); + + sg_init_table(iu->sglist, BMAX_SEGMENTS); + return 0; +} + +static struct blk_mq_ops rnbd_mq_ops = { + .queue_rq = rnbd_queue_rq, + .init_request = rnbd_init_request, + .complete = rnbd_softirq_done_fn, +}; + +static int setup_mq_tags(struct rnbd_clt_session *sess) +{ + struct blk_mq_tag_set *tag_set = &sess->tag_set; + + memset(tag_set, 0, sizeof(*tag_set)); + tag_set->ops = &rnbd_mq_ops; + tag_set->queue_depth = sess->queue_depth; + tag_set->numa_node = NUMA_NO_NODE; + tag_set->flags = BLK_MQ_F_SHOULD_MERGE | + BLK_MQ_F_TAG_SHARED; + tag_set->cmd_size = sizeof(struct rnbd_iu); + tag_set->nr_hw_queues = num_online_cpus(); + + return blk_mq_alloc_tag_set(tag_set); +} + +static struct rnbd_clt_session * +find_and_get_or_create_sess(const char *sessname, + const struct rtrs_addr *paths, + size_t path_cnt, u16 port_nr) +{ + struct rnbd_clt_session *sess; + struct rtrs_attrs attrs; + int err; + bool first; + struct rtrs_clt_ops rtrs_ops; + + sess = find_or_create_sess(sessname, &first); + if (sess == ERR_PTR(-ENOMEM)) + return ERR_PTR(-ENOMEM); + else if (!first) + return sess; + + rtrs_ops = (struct rtrs_clt_ops) { + .priv = sess, + .link_ev = rnbd_clt_link_ev, + }; + /* + * Nothing was found, establish rtrs connection and proceed further. + */ + sess->rtrs = rtrs_clt_open(&rtrs_ops, sessname, + paths, path_cnt, port_nr, + sizeof(struct rnbd_iu), + RECONNECT_DELAY, BMAX_SEGMENTS, + MAX_RECONNECTS); + if (IS_ERR(sess->rtrs)) { + err = PTR_ERR(sess->rtrs); + goto wake_up_and_put; + } + rtrs_clt_query(sess->rtrs, &attrs); + sess->max_io_size = attrs.max_io_size; + sess->queue_depth = attrs.queue_depth; + + err = setup_mq_tags(sess); + if (err) + goto close_rtrs; + + err = send_msg_sess_info(sess, WAIT); + if (err) + goto close_rtrs; + + wake_up_rtrs_waiters(sess); + + return sess; + +close_rtrs: + close_rtrs(sess); +put_sess: + rnbd_clt_put_sess(sess); + + return ERR_PTR(err); + +wake_up_and_put: + wake_up_rtrs_waiters(sess); + goto put_sess; +} + +static inline void rnbd_init_hw_queue(struct rnbd_clt_dev *dev, + struct rnbd_queue *q, + struct blk_mq_hw_ctx *hctx) +{ + INIT_LIST_HEAD(&q->requeue_list); + q->dev = dev; + q->hctx = hctx; +} + +static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev) +{ + int i; + struct blk_mq_hw_ctx *hctx; + struct rnbd_queue *q; + + queue_for_each_hw_ctx(dev->queue, hctx, i) { + q = &dev->hw_queues[i]; + rnbd_init_hw_queue(dev, q, hctx); + hctx->driver_data = q; + } +} + +static int setup_mq_dev(struct rnbd_clt_dev *dev) +{ + dev->queue = blk_mq_init_queue(&dev->sess->tag_set); + if (IS_ERR(dev->queue)) { + rnbd_clt_err(dev, "Initializing multiqueue queue failed, err: %ld\n", + PTR_ERR(dev->queue)); + return PTR_ERR(dev->queue); + } + rnbd_init_mq_hw_queues(dev); + return 0; +} + +static void setup_request_queue(struct rnbd_clt_dev *dev) +{ + blk_queue_logical_block_size(dev->queue, dev->logical_block_size); + blk_queue_physical_block_size(dev->queue, dev->physical_block_size); + blk_queue_max_hw_sectors(dev->queue, dev->max_hw_sectors); + blk_queue_max_write_same_sectors(dev->queue, + dev->max_write_same_sectors); + + /* + * we don't support discards to "discontiguous" segments + * in on request + */ + blk_queue_max_discard_segments(dev->queue, 1); + + blk_queue_max_discard_sectors(dev->queue, dev->max_discard_sectors); + dev->queue->limits.discard_granularity = dev->discard_granularity; + dev->queue->limits.discard_alignment = dev->discard_alignment; + if (dev->max_discard_sectors) + blk_queue_flag_set(QUEUE_FLAG_DISCARD, dev->queue); + if (dev->secure_discard) + blk_queue_flag_set(QUEUE_FLAG_SECERASE, dev->queue); + + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); + blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); + blk_queue_max_segments(dev->queue, dev->max_segments); + blk_queue_io_opt(dev->queue, dev->sess->max_io_size); + blk_queue_virt_boundary(dev->queue, SZ_4K - 1); + blk_queue_write_cache(dev->queue, true, true); + dev->queue->queuedata = dev; +} + +static void rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) +{ + dev->gd->major = rnbd_client_major; + dev->gd->first_minor = idx << RNBD_PART_BITS; + dev->gd->fops = &rnbd_client_ops; + dev->gd->queue = dev->queue; + dev->gd->private_data = dev; + snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d", + idx); + pr_debug("disk_name=%s, capacity=%zu\n", + dev->gd->disk_name, + dev->nsectors * (dev->logical_block_size / SECTOR_SIZE) + ); + + set_capacity(dev->gd, dev->nsectors); + + if (dev->access_mode == RNBD_ACCESS_RO) { + dev->read_only = true; + set_disk_ro(dev->gd, true); + } else { + dev->read_only = false; + } + + if (!dev->rotational) + blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); +} + +static int rnbd_client_setup_device(struct rnbd_clt_session *sess, + struct rnbd_clt_dev *dev, int idx) +{ + int err; + + dev->size = dev->nsectors * dev->logical_block_size; + + err = setup_mq_dev(dev); + if (err) + return err; + + setup_request_queue(dev); + + dev->gd = alloc_disk_node(1 << RNBD_PART_BITS, NUMA_NO_NODE); + if (!dev->gd) { + blk_cleanup_queue(dev->queue); + return -ENOMEM; + } + + rnbd_clt_setup_gen_disk(dev, idx); + + return 0; +} + +static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, + enum rnbd_access_mode access_mode, + const char *pathname) +{ + struct rnbd_clt_dev *dev; + int ret; + + dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, NUMA_NO_NODE); + if (!dev) + return ERR_PTR(-ENOMEM); + + dev->hw_queues = kcalloc(nr_cpu_ids, sizeof(*dev->hw_queues), + GFP_KERNEL); + if (!dev->hw_queues) { + ret = -ENOMEM; + goto out_alloc; + } + + mutex_lock(&ida_lock); + ret = ida_simple_get(&index_ida, 0, 1 << (MINORBITS - RNBD_PART_BITS), + GFP_KERNEL); + mutex_unlock(&ida_lock); + if (ret < 0) { + pr_err("Failed to initialize device '%s' from session %s, allocating idr failed, err: %d\n", + pathname, sess->sessname, ret); + goto out_queues; + } + dev->clt_device_id = ret; + dev->sess = sess; + dev->access_mode = access_mode; + strlcpy(dev->pathname, pathname, sizeof(dev->pathname)); + mutex_init(&dev->lock); + refcount_set(&dev->refcount, 1); + dev->dev_state = DEV_STATE_INIT; + + /* + * Here we called from sysfs entry, thus clt-sysfs is + * responsible that session will not disappear. + */ + WARN_ON(!rnbd_clt_get_sess(sess)); + + return dev; + +out_queues: + kfree(dev->hw_queues); +out_alloc: + kfree(dev); + return ERR_PTR(ret); +} + +static bool __exists_dev(const char *pathname) +{ + struct rnbd_clt_session *sess; + struct rnbd_clt_dev *dev; + bool found = false; + + list_for_each_entry(sess, &sess_list, list) { + mutex_lock(&sess->lock); + list_for_each_entry(dev, &sess->devs_list, list) { + if (!strncmp(dev->pathname, pathname, + sizeof(dev->pathname))) { + found = true; + break; + } + } + mutex_unlock(&sess->lock); + if (found) + break; + } + + return found; +} + +static bool exists_devpath(const char *pathname) +{ + bool found; + + mutex_lock(&sess_lock); + found = __exists_dev(pathname); + mutex_unlock(&sess_lock); + + return found; +} + +static bool insert_dev_if_not_exists_devpath(const char *pathname, + struct rnbd_clt_session *sess, + struct rnbd_clt_dev *dev) +{ + bool found; + + mutex_lock(&sess_lock); + found = __exists_dev(pathname); + if (!found) { + mutex_lock(&sess->lock); + list_add_tail(&dev->list, &sess->devs_list); + mutex_unlock(&sess->lock); + } + mutex_unlock(&sess_lock); + + return found; +} + +static void delete_dev(struct rnbd_clt_dev *dev) +{ + struct rnbd_clt_session *sess = dev->sess; + + mutex_lock(&sess->lock); + list_del(&dev->list); + mutex_unlock(&sess->lock); +} + +struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, + struct rtrs_addr *paths, + size_t path_cnt, u16 port_nr, + const char *pathname, + enum rnbd_access_mode access_mode) +{ + struct rnbd_clt_session *sess; + struct rnbd_clt_dev *dev; + int ret; + + if (exists_devpath(pathname)) + return ERR_PTR(-EEXIST); + + sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr); + if (IS_ERR(sess)) + return ERR_CAST(sess); + + dev = init_dev(sess, access_mode, pathname); + if (IS_ERR(dev)) { + pr_err("map_device: failed to map device '%s' from session %s, can't initialize device, err: %ld\n", + pathname, sess->sessname, PTR_ERR(dev)); + ret = PTR_ERR(dev); + goto put_sess; + } + if (insert_dev_if_not_exists_devpath(pathname, sess, dev)) { + ret = -EEXIST; + goto put_dev; + } + ret = send_msg_open(dev, WAIT); + if (ret) { + rnbd_clt_err(dev, + "map_device: failed, can't open remote device, err: %d\n", + ret); + goto del_dev; + } + mutex_lock(&dev->lock); + pr_debug("Opened remote device: session=%s, path='%s'\n", + sess->sessname, pathname); + ret = rnbd_client_setup_device(sess, dev, dev->clt_device_id); + if (ret) { + rnbd_clt_err(dev, + "map_device: Failed to configure device, err: %d\n", + ret); + mutex_unlock(&dev->lock); + goto del_dev; + } + + rnbd_clt_info(dev, + "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_write_same_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, rotational: %d)\n", + dev->gd->disk_name, dev->nsectors, + dev->logical_block_size, dev->physical_block_size, + dev->max_write_same_sectors, dev->max_discard_sectors, + dev->discard_granularity, dev->discard_alignment, + dev->secure_discard, dev->max_segments, + dev->max_hw_sectors, dev->rotational); + + mutex_unlock(&dev->lock); + + add_disk(dev->gd); + rnbd_clt_put_sess(sess); + + return dev; + +del_dev: + delete_dev(dev); +put_dev: + rnbd_clt_put_dev(dev); +put_sess: + rnbd_clt_put_sess(sess); + + return ERR_PTR(ret); +} + +static void destroy_gen_disk(struct rnbd_clt_dev *dev) +{ + del_gendisk(dev->gd); + blk_cleanup_queue(dev->queue); + put_disk(dev->gd); +} + +static void destroy_sysfs(struct rnbd_clt_dev *dev, + const struct attribute *sysfs_self) +{ + rnbd_clt_remove_dev_symlink(dev); + if (dev->kobj.state_initialized) { + if (sysfs_self) + /* To avoid deadlock firstly remove itself */ + sysfs_remove_file_self(&dev->kobj, sysfs_self); + kobject_del(&dev->kobj); + kobject_put(&dev->kobj); + } +} + +int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force, + const struct attribute *sysfs_self) +{ + struct rnbd_clt_session *sess = dev->sess; + int refcount, ret = 0; + bool was_mapped; + + mutex_lock(&dev->lock); + if (dev->dev_state == DEV_STATE_UNMAPPED) { + rnbd_clt_info(dev, "Device is already being unmapped\n"); + ret = -EALREADY; + goto err; + } + refcount = refcount_read(&dev->refcount); + if (!force && refcount > 1) { + rnbd_clt_err(dev, + "Closing device failed, device is in use, (%d device users)\n", + refcount - 1); + ret = -EBUSY; + goto err; + } + was_mapped = (dev->dev_state == DEV_STATE_MAPPED); + dev->dev_state = DEV_STATE_UNMAPPED; + mutex_unlock(&dev->lock); + + delete_dev(dev); + destroy_sysfs(dev, sysfs_self); + destroy_gen_disk(dev); + if (was_mapped && sess->rtrs) + send_msg_close(dev, dev->device_id, WAIT); + + rnbd_clt_info(dev, "Device is unmapped\n"); + + /* Likely last reference put */ + rnbd_clt_put_dev(dev); + + /* + * Here device and session can be vanished! + */ + + return 0; +err: + mutex_unlock(&dev->lock); + + return ret; +} + +int rnbd_clt_remap_device(struct rnbd_clt_dev *dev) +{ + int err; + + mutex_lock(&dev->lock); + if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) + err = 0; + else if (dev->dev_state == DEV_STATE_UNMAPPED) + err = -ENODEV; + else if (dev->dev_state == DEV_STATE_MAPPED) + err = -EALREADY; + else + err = -EBUSY; + mutex_unlock(&dev->lock); + if (!err) { + rnbd_clt_info(dev, "Remapping device.\n"); + err = send_msg_open(dev, WAIT); + if (err) + rnbd_clt_err(dev, "remap_device: %d\n", err); + } + + return err; +} + +static void unmap_device_work(struct work_struct *work) +{ + struct rnbd_clt_dev *dev; + + dev = container_of(work, typeof(*dev), unmap_on_rmmod_work); + rnbd_clt_unmap_device(dev, true, NULL); +} + +static void rnbd_destroy_sessions(void) +{ + struct rnbd_clt_session *sess, *sn; + struct rnbd_clt_dev *dev, *tn; + + /* Firstly forbid access through sysfs interface */ + rnbd_clt_destroy_default_group(); + rnbd_clt_destroy_sysfs_files(); + + /* + * Here at this point there is no any concurrent access to sessions + * list and devices list: + * 1. New session or device can'be be created - session sysfs files + * are removed. + * 2. Device or session can't be removed - module reference is taken + * into account in unmap device sysfs callback. + * 3. No IO requests inflight - each file open of block_dev increases + * module reference in get_disk(). + * + * But still there can be user requests inflights, which are sent by + * asynchronous send_msg_*() functions, thus before unmapping devices + * RTRS session must be explicitly closed. + */ + + list_for_each_entry_safe(sess, sn, &sess_list, list) { + WARN_ON(!rnbd_clt_get_sess(sess)); + close_rtrs(sess); + list_for_each_entry_safe(dev, tn, &sess->devs_list, list) { + /* + * Here unmap happens in parallel for only one reason: + * blk_cleanup_queue() takes around half a second, so + * on huge amount of devices the whole module unload + * procedure takes minutes. + */ + INIT_WORK(&dev->unmap_on_rmmod_work, unmap_device_work); + queue_work(system_long_wq, &dev->unmap_on_rmmod_work); + } + rnbd_clt_put_sess(sess); + } + /* Wait for all scheduled unmap works */ + flush_workqueue(system_long_wq); + WARN_ON(!list_empty(&sess_list)); +} + +static int __init rnbd_client_init(void) +{ + int err = 0; + + BUILD_BUG_ON(sizeof(struct rnbd_msg_hdr) != 4); + BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info) != 36); + BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info_rsp) != 36); + BUILD_BUG_ON(sizeof(struct rnbd_msg_open) != 264); + BUILD_BUG_ON(sizeof(struct rnbd_msg_close) != 8); + BUILD_BUG_ON(sizeof(struct rnbd_msg_open_rsp) != 56); + rnbd_client_major = register_blkdev(rnbd_client_major, "rnbd"); + if (rnbd_client_major <= 0) { + pr_err("Failed to load module, block device registration failed\n"); + return -EBUSY; + } + + err = rnbd_clt_create_sysfs_files(); + if (err) { + pr_err("Failed to load module, creating sysfs device files failed, err: %d\n", + err); + unregister_blkdev(rnbd_client_major, "rnbd"); + } + + return err; +} + +static void __exit rnbd_client_exit(void) +{ + rnbd_destroy_sessions(); + unregister_blkdev(rnbd_client_major, "rnbd"); + ida_destroy(&index_ida); +} + +module_init(rnbd_client_init); +module_exit(rnbd_client_exit); -- cgit v1.2.3 From 1eb54f8f5dd8ae09829caa37a50952d931bb79cf Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:24 +0200 Subject: block/rnbd: client: sysfs interface functions This is the sysfs interface to rnbd block devices on client side: /sys/class/rnbd-client/ctl/ |- map_device | *** maps remote device | |- devices/ *** all mapped devices /sys/block/rnbd/rnbd/ |- unmap_device | *** unmaps device | |- state | *** device state | |- session | *** session name | |- mapping_path *** path of the dev that was mapped on server Link: https://lore.kernel.org/r/20200511135131.27580-19-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Reviewed-by: Bart Van Assche Acked-by: Jens Axboe Signed-off-by: Jason Gunthorpe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 636 ++++++++++++++++++++++++++++++++++++ 1 file changed, 636 insertions(+) create mode 100644 drivers/block/rnbd/rnbd-clt-sysfs.c diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c new file mode 100644 index 000000000000..a4508fcc7ffe --- /dev/null +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -0,0 +1,636 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rnbd-clt.h" + +static struct device *rnbd_dev; +static struct class *rnbd_dev_class; +static struct kobject *rnbd_devs_kobj; + +enum { + RNBD_OPT_ERR = 0, + RNBD_OPT_DEST_PORT = 1 << 0, + RNBD_OPT_PATH = 1 << 1, + RNBD_OPT_DEV_PATH = 1 << 2, + RNBD_OPT_ACCESS_MODE = 1 << 3, + RNBD_OPT_SESSNAME = 1 << 6, +}; + +static const unsigned int rnbd_opt_mandatory[] = { + RNBD_OPT_PATH, + RNBD_OPT_DEV_PATH, + RNBD_OPT_SESSNAME, +}; + +static const match_table_t rnbd_opt_tokens = { + {RNBD_OPT_PATH, "path=%s" }, + {RNBD_OPT_DEV_PATH, "device_path=%s"}, + {RNBD_OPT_DEST_PORT, "dest_port=%d" }, + {RNBD_OPT_ACCESS_MODE, "access_mode=%s"}, + {RNBD_OPT_SESSNAME, "sessname=%s" }, + {RNBD_OPT_ERR, NULL }, +}; + +struct rnbd_map_options { + char *sessname; + struct rtrs_addr *paths; + size_t *path_cnt; + char *pathname; + u16 *dest_port; + enum rnbd_access_mode *access_mode; +}; + +static int rnbd_clt_parse_map_options(const char *buf, size_t max_path_cnt, + struct rnbd_map_options *opt) +{ + char *options, *sep_opt; + char *p; + substring_t args[MAX_OPT_ARGS]; + int opt_mask = 0; + int token; + int ret = -EINVAL; + int i, dest_port; + int p_cnt = 0; + + options = kstrdup(buf, GFP_KERNEL); + if (!options) + return -ENOMEM; + + sep_opt = strstrip(options); + while ((p = strsep(&sep_opt, " ")) != NULL) { + if (!*p) + continue; + + token = match_token(p, rnbd_opt_tokens, args); + opt_mask |= token; + + switch (token) { + case RNBD_OPT_SESSNAME: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("map_device: sessname too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strlcpy(opt->sessname, p, NAME_MAX); + kfree(p); + break; + + case RNBD_OPT_PATH: + if (p_cnt >= max_path_cnt) { + pr_err("map_device: too many (> %zu) paths provided\n", + max_path_cnt); + ret = -ENOMEM; + goto out; + } + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + ret = rtrs_addr_to_sockaddr(p, strlen(p), + *opt->dest_port, + &opt->paths[p_cnt]); + if (ret) { + pr_err("Can't parse path %s: %d\n", p, ret); + kfree(p); + goto out; + } + + p_cnt++; + + kfree(p); + break; + + case RNBD_OPT_DEV_PATH: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) > NAME_MAX) { + pr_err("map_device: Device path too long\n"); + ret = -EINVAL; + kfree(p); + goto out; + } + strlcpy(opt->pathname, p, NAME_MAX); + kfree(p); + break; + + case RNBD_OPT_DEST_PORT: + if (match_int(args, &dest_port) || dest_port < 0 || + dest_port > 65535) { + pr_err("bad destination port number parameter '%d'\n", + dest_port); + ret = -EINVAL; + goto out; + } + *opt->dest_port = dest_port; + break; + + case RNBD_OPT_ACCESS_MODE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + + if (!strcmp(p, "ro")) { + *opt->access_mode = RNBD_ACCESS_RO; + } else if (!strcmp(p, "rw")) { + *opt->access_mode = RNBD_ACCESS_RW; + } else if (!strcmp(p, "migration")) { + *opt->access_mode = RNBD_ACCESS_MIGRATION; + } else { + pr_err("map_device: Invalid access_mode: '%s'\n", + p); + ret = -EINVAL; + kfree(p); + goto out; + } + + kfree(p); + break; + + default: + pr_err("map_device: Unknown parameter or missing value '%s'\n", + p); + ret = -EINVAL; + goto out; + } + } + + for (i = 0; i < ARRAY_SIZE(rnbd_opt_mandatory); i++) { + if ((opt_mask & rnbd_opt_mandatory[i])) { + ret = 0; + } else { + pr_err("map_device: Parameters missing\n"); + ret = -EINVAL; + break; + } + } + +out: + *opt->path_cnt = p_cnt; + kfree(options); + return ret; +} + +static ssize_t state_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rnbd_clt_dev *dev; + + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + switch (dev->dev_state) { + case DEV_STATE_INIT: + return snprintf(page, PAGE_SIZE, "init\n"); + case DEV_STATE_MAPPED: + /* TODO fix cli tool before changing to proper state */ + return snprintf(page, PAGE_SIZE, "open\n"); + case DEV_STATE_MAPPED_DISCONNECTED: + /* TODO fix cli tool before changing to proper state */ + return snprintf(page, PAGE_SIZE, "closed\n"); + case DEV_STATE_UNMAPPED: + return snprintf(page, PAGE_SIZE, "unmapped\n"); + default: + return snprintf(page, PAGE_SIZE, "unknown\n"); + } +} + +static struct kobj_attribute rnbd_clt_state_attr = __ATTR_RO(state); + +static ssize_t mapping_path_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rnbd_clt_dev *dev; + + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", dev->pathname); +} + +static struct kobj_attribute rnbd_clt_mapping_path_attr = + __ATTR_RO(mapping_path); + +static ssize_t access_mode_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rnbd_clt_dev *dev; + + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + return snprintf(page, PAGE_SIZE, "%s\n", + rnbd_access_mode_str(dev->access_mode)); +} + +static struct kobj_attribute rnbd_clt_access_mode = + __ATTR_RO(access_mode); + +static ssize_t rnbd_clt_unmap_dev_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo > %s\n", + attr->attr.name); +} + +static ssize_t rnbd_clt_unmap_dev_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rnbd_clt_dev *dev; + char *opt, *options; + bool force; + int err; + + opt = kstrdup(buf, GFP_KERNEL); + if (!opt) + return -ENOMEM; + + options = strstrip(opt); + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + if (sysfs_streq(options, "normal")) { + force = false; + } else if (sysfs_streq(options, "force")) { + force = true; + } else { + rnbd_clt_err(dev, + "unmap_device: Invalid value: %s\n", + options); + err = -EINVAL; + goto out; + } + + rnbd_clt_info(dev, "Unmapping device, option: %s.\n", + force ? "force" : "normal"); + + /* + * We take explicit module reference only for one reason: do not + * race with lockless rnbd_destroy_sessions(). + */ + if (!try_module_get(THIS_MODULE)) { + err = -ENODEV; + goto out; + } + err = rnbd_clt_unmap_device(dev, force, &attr->attr); + if (err) { + if (err != -EALREADY) + rnbd_clt_err(dev, "unmap_device: %d\n", err); + goto module_put; + } + + /* + * Here device can be vanished! + */ + + err = count; + +module_put: + module_put(THIS_MODULE); +out: + kfree(opt); + + return err; +} + +static struct kobj_attribute rnbd_clt_unmap_device_attr = + __ATTR(unmap_device, 0644, rnbd_clt_unmap_dev_show, + rnbd_clt_unmap_dev_store); + +static ssize_t rnbd_clt_resize_dev_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo > %s\n", + attr->attr.name); +} + +static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret; + unsigned long sectors; + struct rnbd_clt_dev *dev; + + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + ret = kstrtoul(buf, 0, §ors); + if (ret) + return ret; + + ret = rnbd_clt_resize_disk(dev, (size_t)sectors); + if (ret) + return ret; + + return count; +} + +static struct kobj_attribute rnbd_clt_resize_dev_attr = + __ATTR(resize, 0644, rnbd_clt_resize_dev_show, + rnbd_clt_resize_dev_store); + +static ssize_t rnbd_clt_remap_dev_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return scnprintf(page, PAGE_SIZE, "Usage: echo <1> > %s\n", + attr->attr.name); +} + +static ssize_t rnbd_clt_remap_dev_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rnbd_clt_dev *dev; + char *opt, *options; + int err; + + opt = kstrdup(buf, GFP_KERNEL); + if (!opt) + return -ENOMEM; + + options = strstrip(opt); + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + if (!sysfs_streq(options, "1")) { + rnbd_clt_err(dev, + "remap_device: Invalid value: %s\n", + options); + err = -EINVAL; + goto out; + } + err = rnbd_clt_remap_device(dev); + if (likely(!err)) + err = count; + +out: + kfree(opt); + + return err; +} + +static struct kobj_attribute rnbd_clt_remap_device_attr = + __ATTR(remap_device, 0644, rnbd_clt_remap_dev_show, + rnbd_clt_remap_dev_store); + +static ssize_t session_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + struct rnbd_clt_dev *dev; + + dev = container_of(kobj, struct rnbd_clt_dev, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", dev->sess->sessname); +} + +static struct kobj_attribute rnbd_clt_session_attr = + __ATTR_RO(session); + +static struct attribute *rnbd_dev_attrs[] = { + &rnbd_clt_unmap_device_attr.attr, + &rnbd_clt_resize_dev_attr.attr, + &rnbd_clt_remap_device_attr.attr, + &rnbd_clt_mapping_path_attr.attr, + &rnbd_clt_state_attr.attr, + &rnbd_clt_session_attr.attr, + &rnbd_clt_access_mode.attr, + NULL, +}; + +void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) +{ + /* + * The module_is_live() check is crucial and helps to avoid annoying + * sysfs warning raised in sysfs_remove_link(), when the whole sysfs + * path was just removed, see rnbd_close_sessions(). + */ + if (strlen(dev->blk_symlink_name) && module_is_live(THIS_MODULE)) + sysfs_remove_link(rnbd_devs_kobj, dev->blk_symlink_name); +} + +static struct kobj_type rnbd_dev_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_attrs = rnbd_dev_attrs, +}; + +static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev) +{ + int ret; + struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj; + + ret = kobject_init_and_add(&dev->kobj, &rnbd_dev_ktype, gd_kobj, "%s", + "rnbd"); + if (ret) + rnbd_clt_err(dev, "Failed to create device sysfs dir, err: %d\n", + ret); + + return ret; +} + +static ssize_t rnbd_clt_map_device_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + return scnprintf(page, PAGE_SIZE, + "Usage: echo \"[dest_port=server port number] sessname= path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] device_path= [access_mode=]\" > %s\n\naddr ::= [ ip: | ip: | gid: ]\n", + attr->attr.name); +} + +static int rnbd_clt_get_path_name(struct rnbd_clt_dev *dev, char *buf, + size_t len) +{ + int ret; + char pathname[NAME_MAX], *s; + + strlcpy(pathname, dev->pathname, sizeof(pathname)); + while ((s = strchr(pathname, '/'))) + s[0] = '!'; + + ret = snprintf(buf, len, "%s", pathname); + if (ret >= len) + return -ENAMETOOLONG; + + return 0; +} + +static int rnbd_clt_add_dev_symlink(struct rnbd_clt_dev *dev) +{ + struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj; + int ret; + + ret = rnbd_clt_get_path_name(dev, dev->blk_symlink_name, + sizeof(dev->blk_symlink_name)); + if (ret) { + rnbd_clt_err(dev, "Failed to get /sys/block symlink path, err: %d\n", + ret); + goto out_err; + } + + ret = sysfs_create_link(rnbd_devs_kobj, gd_kobj, + dev->blk_symlink_name); + if (ret) { + rnbd_clt_err(dev, "Creating /sys/block symlink failed, err: %d\n", + ret); + goto out_err; + } + + return 0; + +out_err: + dev->blk_symlink_name[0] = '\0'; + return ret; +} + +static ssize_t rnbd_clt_map_device_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rnbd_clt_dev *dev; + struct rnbd_map_options opt; + int ret; + char pathname[NAME_MAX]; + char sessname[NAME_MAX]; + enum rnbd_access_mode access_mode = RNBD_ACCESS_RW; + u16 port_nr = RTRS_PORT; + + struct sockaddr_storage *addrs; + struct rtrs_addr paths[6]; + size_t path_cnt; + + opt.sessname = sessname; + opt.paths = paths; + opt.path_cnt = &path_cnt; + opt.pathname = pathname; + opt.dest_port = &port_nr; + opt.access_mode = &access_mode; + addrs = kcalloc(ARRAY_SIZE(paths) * 2, sizeof(*addrs), GFP_KERNEL); + if (!addrs) + return -ENOMEM; + + for (path_cnt = 0; path_cnt < ARRAY_SIZE(paths); path_cnt++) { + paths[path_cnt].src = &addrs[path_cnt * 2]; + paths[path_cnt].dst = &addrs[path_cnt * 2 + 1]; + } + + ret = rnbd_clt_parse_map_options(buf, ARRAY_SIZE(paths), &opt); + if (ret) + goto out; + + pr_info("Mapping device %s on session %s, (access_mode: %s)\n", + pathname, sessname, + rnbd_access_mode_str(access_mode)); + + dev = rnbd_clt_map_device(sessname, paths, path_cnt, port_nr, pathname, + access_mode); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto out; + } + + ret = rnbd_clt_add_dev_kobj(dev); + if (ret) + goto unmap_dev; + + ret = rnbd_clt_add_dev_symlink(dev); + if (ret) + goto unmap_dev; + + kfree(addrs); + return count; + +unmap_dev: + rnbd_clt_unmap_device(dev, true, NULL); +out: + kfree(addrs); + return ret; +} + +static struct kobj_attribute rnbd_clt_map_device_attr = + __ATTR(map_device, 0644, + rnbd_clt_map_device_show, rnbd_clt_map_device_store); + +static struct attribute *default_attrs[] = { + &rnbd_clt_map_device_attr.attr, + NULL, +}; + +static struct attribute_group default_attr_group = { + .attrs = default_attrs, +}; + +static const struct attribute_group *default_attr_groups[] = { + &default_attr_group, + NULL, +}; + +int rnbd_clt_create_sysfs_files(void) +{ + int err; + + rnbd_dev_class = class_create(THIS_MODULE, "rnbd-client"); + if (IS_ERR(rnbd_dev_class)) + return PTR_ERR(rnbd_dev_class); + + rnbd_dev = device_create_with_groups(rnbd_dev_class, NULL, + MKDEV(0, 0), NULL, + default_attr_groups, "ctl"); + if (IS_ERR(rnbd_dev)) { + err = PTR_ERR(rnbd_dev); + goto cls_destroy; + } + rnbd_devs_kobj = kobject_create_and_add("devices", &rnbd_dev->kobj); + if (!rnbd_devs_kobj) { + err = -ENOMEM; + goto dev_destroy; + } + + return 0; + +dev_destroy: + device_destroy(rnbd_dev_class, MKDEV(0, 0)); +cls_destroy: + class_destroy(rnbd_dev_class); + + return err; +} + +void rnbd_clt_destroy_default_group(void) +{ + sysfs_remove_group(&rnbd_dev->kobj, &default_attr_group); +} + +void rnbd_clt_destroy_sysfs_files(void) +{ + kobject_del(rnbd_devs_kobj); + kobject_put(rnbd_devs_kobj); + device_destroy(rnbd_dev_class, MKDEV(0, 0)); + class_destroy(rnbd_dev_class); +} -- cgit v1.2.3 From d4c6957dd001dc097e8057611093c0731c517d26 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:25 +0200 Subject: block/rnbd: server: private header with server structs and functions This header describes main structs and functions used by rnbd-server module, namely structs for managing sessions from different clients and mapped (opened) devices. Link: https://lore.kernel.org/r/20200511135131.27580-20-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Reviewed-by: Bart Van Assche Acked-by: Jens Axboe Signed-off-by: Jason Gunthorpe --- drivers/block/rnbd/rnbd-srv.h | 78 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 drivers/block/rnbd/rnbd-srv.h diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h new file mode 100644 index 000000000000..5a8544b5e74f --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#ifndef RNBD_SRV_H +#define RNBD_SRV_H + +#include +#include +#include + +#include +#include "rnbd-proto.h" +#include "rnbd-log.h" + +struct rnbd_srv_session { + /* Entry inside global sess_list */ + struct list_head list; + struct rtrs_srv *rtrs; + char sessname[NAME_MAX]; + int queue_depth; + struct bio_set sess_bio_set; + + struct xarray index_idr; + /* List of struct rnbd_srv_sess_dev */ + struct list_head sess_dev_list; + struct mutex lock; + u8 ver; +}; + +struct rnbd_srv_dev { + /* Entry inside global dev_list */ + struct list_head list; + struct kobject dev_kobj; + struct kobject *dev_sessions_kobj; + struct kref kref; + char id[NAME_MAX]; + /* List of rnbd_srv_sess_dev structs */ + struct list_head sess_dev_list; + struct mutex lock; + int open_write_cnt; +}; + +/* Structure which binds N devices and N sessions */ +struct rnbd_srv_sess_dev { + /* Entry inside rnbd_srv_dev struct */ + struct list_head dev_list; + /* Entry inside rnbd_srv_session struct */ + struct list_head sess_list; + struct rnbd_dev *rnbd_dev; + struct rnbd_srv_session *sess; + struct rnbd_srv_dev *dev; + struct kobject kobj; + u32 device_id; + fmode_t open_flags; + struct kref kref; + struct completion *destroy_comp; + char pathname[NAME_MAX]; + enum rnbd_access_mode access_mode; +}; + +/* rnbd-srv-sysfs.c */ + +int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, + struct block_device *bdev, + const char *dir_name); +void rnbd_srv_destroy_dev_sysfs(struct rnbd_srv_dev *dev); +int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); +void rnbd_srv_destroy_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); +int rnbd_srv_create_sysfs_files(void); +void rnbd_srv_destroy_sysfs_files(void); +void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev); + +#endif /* RNBD_SRV_H */ -- cgit v1.2.3 From 2de6c8de192b9341ffa5e84afe1ce6196d4eef41 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:26 +0200 Subject: block/rnbd: server: main functionality This is main functionality of rnbd-server module, which handles RTRS events and rnbd protocol requests, like map (open) or unmap (close) device. Also server side is responsible for processing incoming IBTRS IO requests and forward them to local mapped devices. Link: https://lore.kernel.org/r/20200511135131.27580-21-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Reviewed-by: Bart Van Assche Acked-by: Jens Axboe Signed-off-by: Jason Gunthorpe --- drivers/block/rnbd/rnbd-srv.c | 844 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 844 insertions(+) create mode 100644 drivers/block/rnbd/rnbd-srv.c diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c new file mode 100644 index 000000000000..86e61523907b --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv.c @@ -0,0 +1,844 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include + +#include "rnbd-srv.h" +#include "rnbd-srv-dev.h" + +MODULE_DESCRIPTION("RDMA Network Block Device Server"); +MODULE_LICENSE("GPL"); + +static u16 port_nr = RTRS_PORT; + +module_param_named(port_nr, port_nr, ushort, 0444); +MODULE_PARM_DESC(port_nr, + "The port number the server is listening on (default: " + __stringify(RTRS_PORT)")"); + +#define DEFAULT_DEV_SEARCH_PATH "/" + +static char dev_search_path[PATH_MAX] = DEFAULT_DEV_SEARCH_PATH; + +static int dev_search_path_set(const char *val, const struct kernel_param *kp) +{ + const char *p = strrchr(val, '\n') ? : val + strlen(val); + + if (strlen(val) >= sizeof(dev_search_path)) + return -EINVAL; + + snprintf(dev_search_path, sizeof(dev_search_path), "%.*s", + (int)(p - val), val); + + pr_info("dev_search_path changed to '%s'\n", dev_search_path); + + return 0; +} + +static struct kparam_string dev_search_path_kparam_str = { + .maxlen = sizeof(dev_search_path), + .string = dev_search_path +}; + +static const struct kernel_param_ops dev_search_path_ops = { + .set = dev_search_path_set, + .get = param_get_string, +}; + +module_param_cb(dev_search_path, &dev_search_path_ops, + &dev_search_path_kparam_str, 0444); +MODULE_PARM_DESC(dev_search_path, + "Sets the dev_search_path. When a device is mapped this path is prepended to the device path from the map device operation. If %SESSNAME% is specified in a path, then device will be searched in a session namespace. (default: " + DEFAULT_DEV_SEARCH_PATH ")"); + +static DEFINE_MUTEX(sess_lock); +static DEFINE_SPINLOCK(dev_lock); + +static LIST_HEAD(sess_list); +static LIST_HEAD(dev_list); + +struct rnbd_io_private { + struct rtrs_srv_op *id; + struct rnbd_srv_sess_dev *sess_dev; +}; + +static void rnbd_sess_dev_release(struct kref *kref) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kref, struct rnbd_srv_sess_dev, kref); + complete(sess_dev->destroy_comp); +} + +static inline void rnbd_put_sess_dev(struct rnbd_srv_sess_dev *sess_dev) +{ + kref_put(&sess_dev->kref, rnbd_sess_dev_release); +} + +void rnbd_endio(void *priv, int error) +{ + struct rnbd_io_private *rnbd_priv = priv; + struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev; + + rnbd_put_sess_dev(sess_dev); + + rtrs_srv_resp_rdma(rnbd_priv->id, error); + + kfree(priv); +} + +static struct rnbd_srv_sess_dev * +rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) +{ + struct rnbd_srv_sess_dev *sess_dev; + int ret = 0; + + rcu_read_lock(); + sess_dev = xa_load(&srv_sess->index_idr, dev_id); + if (likely(sess_dev)) + ret = kref_get_unless_zero(&sess_dev->kref); + rcu_read_unlock(); + + if (!sess_dev || !ret) + return ERR_PTR(-ENXIO); + + return sess_dev; +} + +static int process_rdma(struct rtrs_srv *sess, + struct rnbd_srv_session *srv_sess, + struct rtrs_srv_op *id, void *data, u32 datalen, + const void *usr, size_t usrlen) +{ + const struct rnbd_msg_io *msg = usr; + struct rnbd_io_private *priv; + struct rnbd_srv_sess_dev *sess_dev; + u32 dev_id; + int err; + + priv = kmalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + dev_id = le32_to_cpu(msg->device_id); + + sess_dev = rnbd_get_sess_dev(dev_id, srv_sess); + if (IS_ERR(sess_dev)) { + pr_err_ratelimited("Got I/O request on session %s for unknown device id %d\n", + srv_sess->sessname, dev_id); + err = -ENOTCONN; + goto err; + } + + priv->sess_dev = sess_dev; + priv->id = id; + + err = rnbd_dev_submit_io(sess_dev->rnbd_dev, le64_to_cpu(msg->sector), + data, datalen, le32_to_cpu(msg->bi_size), + le32_to_cpu(msg->rw), + srv_sess->ver < RNBD_PROTO_VER_MAJOR || + usrlen < sizeof(*msg) ? + 0 : le16_to_cpu(msg->prio), priv); + if (unlikely(err)) { + rnbd_srv_err(sess_dev, "Submitting I/O to device failed, err: %d\n", + err); + goto sess_dev_put; + } + + return 0; + +sess_dev_put: + rnbd_put_sess_dev(sess_dev); +err: + kfree(priv); + return err; +} + +static void destroy_device(struct rnbd_srv_dev *dev) +{ + WARN_ONCE(!list_empty(&dev->sess_dev_list), + "Device %s is being destroyed but still in use!\n", + dev->id); + + spin_lock(&dev_lock); + list_del(&dev->list); + spin_unlock(&dev_lock); + + mutex_destroy(&dev->lock); + if (dev->dev_kobj.state_in_sysfs) + /* + * Destroy kobj only if it was really created. + */ + rnbd_srv_destroy_dev_sysfs(dev); + else + kfree(dev); +} + +static void destroy_device_cb(struct kref *kref) +{ + struct rnbd_srv_dev *dev; + + dev = container_of(kref, struct rnbd_srv_dev, kref); + + destroy_device(dev); +} + +static void rnbd_put_srv_dev(struct rnbd_srv_dev *dev) +{ + kref_put(&dev->kref, destroy_device_cb); +} + +void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev) +{ + DECLARE_COMPLETION_ONSTACK(dc); + + xa_erase(&sess_dev->sess->index_idr, sess_dev->device_id); + synchronize_rcu(); + sess_dev->destroy_comp = &dc; + rnbd_put_sess_dev(sess_dev); + wait_for_completion(&dc); /* wait for inflights to drop to zero */ + + rnbd_dev_close(sess_dev->rnbd_dev); + list_del(&sess_dev->sess_list); + mutex_lock(&sess_dev->dev->lock); + list_del(&sess_dev->dev_list); + if (sess_dev->open_flags & FMODE_WRITE) + sess_dev->dev->open_write_cnt--; + mutex_unlock(&sess_dev->dev->lock); + + rnbd_put_srv_dev(sess_dev->dev); + + rnbd_srv_info(sess_dev, "Device closed\n"); + kfree(sess_dev); +} + +static void destroy_sess(struct rnbd_srv_session *srv_sess) +{ + struct rnbd_srv_sess_dev *sess_dev, *tmp; + + if (list_empty(&srv_sess->sess_dev_list)) + goto out; + + mutex_lock(&srv_sess->lock); + list_for_each_entry_safe(sess_dev, tmp, &srv_sess->sess_dev_list, + sess_list) + rnbd_srv_destroy_dev_session_sysfs(sess_dev); + mutex_unlock(&srv_sess->lock); + +out: + xa_destroy(&srv_sess->index_idr); + bioset_exit(&srv_sess->sess_bio_set); + + pr_info("RTRS Session %s disconnected\n", srv_sess->sessname); + + mutex_lock(&sess_lock); + list_del(&srv_sess->list); + mutex_unlock(&sess_lock); + + mutex_destroy(&srv_sess->lock); + kfree(srv_sess); +} + +static int create_sess(struct rtrs_srv *rtrs) +{ + struct rnbd_srv_session *srv_sess; + char sessname[NAME_MAX]; + int err; + + err = rtrs_srv_get_sess_name(rtrs, sessname, sizeof(sessname)); + if (err) { + pr_err("rtrs_srv_get_sess_name(%s): %d\n", sessname, err); + + return err; + } + srv_sess = kzalloc(sizeof(*srv_sess), GFP_KERNEL); + if (!srv_sess) + return -ENOMEM; + + srv_sess->queue_depth = rtrs_srv_get_queue_depth(rtrs); + err = bioset_init(&srv_sess->sess_bio_set, srv_sess->queue_depth, + offsetof(struct rnbd_dev_blk_io, bio), + BIOSET_NEED_BVECS); + if (err) { + pr_err("Allocating srv_session for session %s failed\n", + sessname); + kfree(srv_sess); + return err; + } + + xa_init_flags(&srv_sess->index_idr, XA_FLAGS_ALLOC); + INIT_LIST_HEAD(&srv_sess->sess_dev_list); + mutex_init(&srv_sess->lock); + mutex_lock(&sess_lock); + list_add(&srv_sess->list, &sess_list); + mutex_unlock(&sess_lock); + + srv_sess->rtrs = rtrs; + strlcpy(srv_sess->sessname, sessname, sizeof(srv_sess->sessname)); + + rtrs_srv_set_sess_priv(rtrs, srv_sess); + + return 0; +} + +static int rnbd_srv_link_ev(struct rtrs_srv *rtrs, + enum rtrs_srv_link_ev ev, void *priv) +{ + struct rnbd_srv_session *srv_sess = priv; + + switch (ev) { + case RTRS_SRV_LINK_EV_CONNECTED: + return create_sess(rtrs); + + case RTRS_SRV_LINK_EV_DISCONNECTED: + if (WARN_ON_ONCE(!srv_sess)) + return -EINVAL; + + destroy_sess(srv_sess); + return 0; + + default: + pr_warn("Received unknown RTRS session event %d from session %s\n", + ev, srv_sess->sessname); + return -EINVAL; + } +} + +static int process_msg_close(struct rtrs_srv *rtrs, + struct rnbd_srv_session *srv_sess, + void *data, size_t datalen, const void *usr, + size_t usrlen) +{ + const struct rnbd_msg_close *close_msg = usr; + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = rnbd_get_sess_dev(le32_to_cpu(close_msg->device_id), + srv_sess); + if (IS_ERR(sess_dev)) + return 0; + + rnbd_put_sess_dev(sess_dev); + mutex_lock(&srv_sess->lock); + rnbd_srv_destroy_dev_session_sysfs(sess_dev); + mutex_unlock(&srv_sess->lock); + return 0; +} + +static int process_msg_open(struct rtrs_srv *rtrs, + struct rnbd_srv_session *srv_sess, + const void *msg, size_t len, + void *data, size_t datalen); + +static int process_msg_sess_info(struct rtrs_srv *rtrs, + struct rnbd_srv_session *srv_sess, + const void *msg, size_t len, + void *data, size_t datalen); + +static int rnbd_srv_rdma_ev(struct rtrs_srv *rtrs, void *priv, + struct rtrs_srv_op *id, int dir, + void *data, size_t datalen, const void *usr, + size_t usrlen) +{ + struct rnbd_srv_session *srv_sess = priv; + const struct rnbd_msg_hdr *hdr = usr; + int ret = 0; + u16 type; + + if (WARN_ON_ONCE(!srv_sess)) + return -ENODEV; + + type = le16_to_cpu(hdr->type); + + switch (type) { + case RNBD_MSG_IO: + return process_rdma(rtrs, srv_sess, id, data, datalen, usr, + usrlen); + case RNBD_MSG_CLOSE: + ret = process_msg_close(rtrs, srv_sess, data, datalen, + usr, usrlen); + break; + case RNBD_MSG_OPEN: + ret = process_msg_open(rtrs, srv_sess, usr, usrlen, + data, datalen); + break; + case RNBD_MSG_SESS_INFO: + ret = process_msg_sess_info(rtrs, srv_sess, usr, usrlen, + data, datalen); + break; + default: + pr_warn("Received unexpected message type %d with dir %d from session %s\n", + type, dir, srv_sess->sessname); + return -EINVAL; + } + + rtrs_srv_resp_rdma(id, ret); + return 0; +} + +static struct rnbd_srv_sess_dev +*rnbd_sess_dev_alloc(struct rnbd_srv_session *srv_sess) +{ + struct rnbd_srv_sess_dev *sess_dev; + int error; + + sess_dev = kzalloc(sizeof(*sess_dev), GFP_KERNEL); + if (!sess_dev) + return ERR_PTR(-ENOMEM); + + error = xa_alloc(&srv_sess->index_idr, &sess_dev->device_id, sess_dev, + xa_limit_32b, GFP_NOWAIT); + if (error < 0) { + pr_warn("Allocating idr failed, err: %d\n", error); + kfree(sess_dev); + return ERR_PTR(error); + } + + return sess_dev; +} + +static struct rnbd_srv_dev *rnbd_srv_init_srv_dev(const char *id) +{ + struct rnbd_srv_dev *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + strlcpy(dev->id, id, sizeof(dev->id)); + kref_init(&dev->kref); + INIT_LIST_HEAD(&dev->sess_dev_list); + mutex_init(&dev->lock); + + return dev; +} + +static struct rnbd_srv_dev * +rnbd_srv_find_or_add_srv_dev(struct rnbd_srv_dev *new_dev) +{ + struct rnbd_srv_dev *dev; + + spin_lock(&dev_lock); + list_for_each_entry(dev, &dev_list, list) { + if (!strncmp(dev->id, new_dev->id, sizeof(dev->id))) { + if (!kref_get_unless_zero(&dev->kref)) + /* + * We lost the race, device is almost dead. + * Continue traversing to find a valid one. + */ + continue; + spin_unlock(&dev_lock); + return dev; + } + } + list_add(&new_dev->list, &dev_list); + spin_unlock(&dev_lock); + + return new_dev; +} + +static int rnbd_srv_check_update_open_perm(struct rnbd_srv_dev *srv_dev, + struct rnbd_srv_session *srv_sess, + enum rnbd_access_mode access_mode) +{ + int ret = -EPERM; + + mutex_lock(&srv_dev->lock); + + switch (access_mode) { + case RNBD_ACCESS_RO: + ret = 0; + break; + case RNBD_ACCESS_RW: + if (srv_dev->open_write_cnt == 0) { + srv_dev->open_write_cnt++; + ret = 0; + } else { + pr_err("Mapping device '%s' for session %s with RW permissions failed. Device already opened as 'RW' by %d client(s), access mode %s.\n", + srv_dev->id, srv_sess->sessname, + srv_dev->open_write_cnt, + rnbd_access_mode_str(access_mode)); + } + break; + case RNBD_ACCESS_MIGRATION: + if (srv_dev->open_write_cnt < 2) { + srv_dev->open_write_cnt++; + ret = 0; + } else { + pr_err("Mapping device '%s' for session %s with migration permissions failed. Device already opened as 'RW' by %d client(s), access mode %s.\n", + srv_dev->id, srv_sess->sessname, + srv_dev->open_write_cnt, + rnbd_access_mode_str(access_mode)); + } + break; + default: + pr_err("Received mapping request for device '%s' on session %s with invalid access mode: %d\n", + srv_dev->id, srv_sess->sessname, access_mode); + ret = -EINVAL; + } + + mutex_unlock(&srv_dev->lock); + + return ret; +} + +static struct rnbd_srv_dev * +rnbd_srv_get_or_create_srv_dev(struct rnbd_dev *rnbd_dev, + struct rnbd_srv_session *srv_sess, + enum rnbd_access_mode access_mode) +{ + int ret; + struct rnbd_srv_dev *new_dev, *dev; + + new_dev = rnbd_srv_init_srv_dev(rnbd_dev->name); + if (IS_ERR(new_dev)) + return new_dev; + + dev = rnbd_srv_find_or_add_srv_dev(new_dev); + if (dev != new_dev) + kfree(new_dev); + + ret = rnbd_srv_check_update_open_perm(dev, srv_sess, access_mode); + if (ret) { + rnbd_put_srv_dev(dev); + return ERR_PTR(ret); + } + + return dev; +} + +static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp, + struct rnbd_srv_sess_dev *sess_dev) +{ + struct rnbd_dev *rnbd_dev = sess_dev->rnbd_dev; + + rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP); + rsp->device_id = + cpu_to_le32(sess_dev->device_id); + rsp->nsectors = + cpu_to_le64(get_capacity(rnbd_dev->bdev->bd_disk)); + rsp->logical_block_size = + cpu_to_le16(bdev_logical_block_size(rnbd_dev->bdev)); + rsp->physical_block_size = + cpu_to_le16(bdev_physical_block_size(rnbd_dev->bdev)); + rsp->max_segments = + cpu_to_le16(rnbd_dev_get_max_segs(rnbd_dev)); + rsp->max_hw_sectors = + cpu_to_le32(rnbd_dev_get_max_hw_sects(rnbd_dev)); + rsp->max_write_same_sectors = + cpu_to_le32(bdev_write_same(rnbd_dev->bdev)); + rsp->max_discard_sectors = + cpu_to_le32(rnbd_dev_get_max_discard_sects(rnbd_dev)); + rsp->discard_granularity = + cpu_to_le32(rnbd_dev_get_discard_granularity(rnbd_dev)); + rsp->discard_alignment = + cpu_to_le32(rnbd_dev_get_discard_alignment(rnbd_dev)); + rsp->secure_discard = + cpu_to_le16(rnbd_dev_get_secure_discard(rnbd_dev)); + rsp->rotational = + !blk_queue_nonrot(bdev_get_queue(rnbd_dev->bdev)); +} + +static struct rnbd_srv_sess_dev * +rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess, + const struct rnbd_msg_open *open_msg, + struct rnbd_dev *rnbd_dev, fmode_t open_flags, + struct rnbd_srv_dev *srv_dev) +{ + struct rnbd_srv_sess_dev *sdev = rnbd_sess_dev_alloc(srv_sess); + + if (IS_ERR(sdev)) + return sdev; + + kref_init(&sdev->kref); + + strlcpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname)); + + sdev->rnbd_dev = rnbd_dev; + sdev->sess = srv_sess; + sdev->dev = srv_dev; + sdev->open_flags = open_flags; + sdev->access_mode = open_msg->access_mode; + + return sdev; +} + +static char *rnbd_srv_get_full_path(struct rnbd_srv_session *srv_sess, + const char *dev_name) +{ + char *full_path; + char *a, *b; + + full_path = kmalloc(PATH_MAX, GFP_KERNEL); + if (!full_path) + return ERR_PTR(-ENOMEM); + + /* + * Replace %SESSNAME% with a real session name in order to + * create device namespace. + */ + a = strnstr(dev_search_path, "%SESSNAME%", sizeof(dev_search_path)); + if (a) { + int len = a - dev_search_path; + + len = snprintf(full_path, PATH_MAX, "%.*s/%s/%s", len, + dev_search_path, srv_sess->sessname, dev_name); + if (len >= PATH_MAX) { + pr_err("Too long path: %s, %s, %s\n", + dev_search_path, srv_sess->sessname, dev_name); + kfree(full_path); + return ERR_PTR(-EINVAL); + } + } else { + snprintf(full_path, PATH_MAX, "%s/%s", + dev_search_path, dev_name); + } + + /* eliminitate duplicated slashes */ + a = strchr(full_path, '/'); + b = a; + while (*b != '\0') { + if (*b == '/' && *a == '/') { + b++; + } else { + a++; + *a = *b; + b++; + } + } + a++; + *a = '\0'; + + return full_path; +} + +static int process_msg_sess_info(struct rtrs_srv *rtrs, + struct rnbd_srv_session *srv_sess, + const void *msg, size_t len, + void *data, size_t datalen) +{ + const struct rnbd_msg_sess_info *sess_info_msg = msg; + struct rnbd_msg_sess_info_rsp *rsp = data; + + srv_sess->ver = min_t(u8, sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); + pr_debug("Session %s using protocol version %d (client version: %d, server version: %d)\n", + srv_sess->sessname, srv_sess->ver, + sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); + + rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP); + rsp->ver = srv_sess->ver; + + return 0; +} + +/** + * find_srv_sess_dev() - a dev is already opened by this name + * @srv_sess: the session to search. + * @dev_name: string containing the name of the device. + * + * Return struct rnbd_srv_sess_dev if srv_sess already opened the dev_name + * NULL if the session didn't open the device yet. + */ +static struct rnbd_srv_sess_dev * +find_srv_sess_dev(struct rnbd_srv_session *srv_sess, const char *dev_name) +{ + struct rnbd_srv_sess_dev *sess_dev; + + if (list_empty(&srv_sess->sess_dev_list)) + return NULL; + + list_for_each_entry(sess_dev, &srv_sess->sess_dev_list, sess_list) + if (!strcmp(sess_dev->pathname, dev_name)) + return sess_dev; + + return NULL; +} + +static int process_msg_open(struct rtrs_srv *rtrs, + struct rnbd_srv_session *srv_sess, + const void *msg, size_t len, + void *data, size_t datalen) +{ + int ret; + struct rnbd_srv_dev *srv_dev; + struct rnbd_srv_sess_dev *srv_sess_dev; + const struct rnbd_msg_open *open_msg = msg; + fmode_t open_flags; + char *full_path; + struct rnbd_dev *rnbd_dev; + struct rnbd_msg_open_rsp *rsp = data; + + pr_debug("Open message received: session='%s' path='%s' access_mode=%d\n", + srv_sess->sessname, open_msg->dev_name, + open_msg->access_mode); + open_flags = FMODE_READ; + if (open_msg->access_mode != RNBD_ACCESS_RO) + open_flags |= FMODE_WRITE; + + mutex_lock(&srv_sess->lock); + + srv_sess_dev = find_srv_sess_dev(srv_sess, open_msg->dev_name); + if (srv_sess_dev) + goto fill_response; + + if ((strlen(dev_search_path) + strlen(open_msg->dev_name)) + >= PATH_MAX) { + pr_err("Opening device for session %s failed, device path too long. '%s/%s' is longer than PATH_MAX (%d)\n", + srv_sess->sessname, dev_search_path, open_msg->dev_name, + PATH_MAX); + ret = -EINVAL; + goto reject; + } + if (strstr(open_msg->dev_name, "..")) { + pr_err("Opening device for session %s failed, device path %s contains relative path ..\n", + srv_sess->sessname, open_msg->dev_name); + ret = -EINVAL; + goto reject; + } + full_path = rnbd_srv_get_full_path(srv_sess, open_msg->dev_name); + if (IS_ERR(full_path)) { + ret = PTR_ERR(full_path); + pr_err("Opening device '%s' for client %s failed, failed to get device full path, err: %d\n", + open_msg->dev_name, srv_sess->sessname, ret); + goto reject; + } + + rnbd_dev = rnbd_dev_open(full_path, open_flags, + &srv_sess->sess_bio_set); + if (IS_ERR(rnbd_dev)) { + pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %ld\n", + full_path, srv_sess->sessname, PTR_ERR(rnbd_dev)); + ret = PTR_ERR(rnbd_dev); + goto free_path; + } + + srv_dev = rnbd_srv_get_or_create_srv_dev(rnbd_dev, srv_sess, + open_msg->access_mode); + if (IS_ERR(srv_dev)) { + pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %ld\n", + full_path, srv_sess->sessname, PTR_ERR(srv_dev)); + ret = PTR_ERR(srv_dev); + goto rnbd_dev_close; + } + + srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg, + rnbd_dev, open_flags, + srv_dev); + if (IS_ERR(srv_sess_dev)) { + pr_err("Opening device '%s' on session %s failed, creating sess_dev failed, err: %ld\n", + full_path, srv_sess->sessname, PTR_ERR(srv_sess_dev)); + ret = PTR_ERR(srv_sess_dev); + goto srv_dev_put; + } + + /* Create the srv_dev sysfs files if they haven't been created yet. The + * reason to delay the creation is not to create the sysfs files before + * we are sure the device can be opened. + */ + mutex_lock(&srv_dev->lock); + if (!srv_dev->dev_kobj.state_in_sysfs) { + ret = rnbd_srv_create_dev_sysfs(srv_dev, rnbd_dev->bdev, + rnbd_dev->name); + if (ret) { + mutex_unlock(&srv_dev->lock); + rnbd_srv_err(srv_sess_dev, + "Opening device failed, failed to create device sysfs files, err: %d\n", + ret); + goto free_srv_sess_dev; + } + } + + ret = rnbd_srv_create_dev_session_sysfs(srv_sess_dev); + if (ret) { + mutex_unlock(&srv_dev->lock); + rnbd_srv_err(srv_sess_dev, + "Opening device failed, failed to create dev client sysfs files, err: %d\n", + ret); + goto free_srv_sess_dev; + } + + list_add(&srv_sess_dev->dev_list, &srv_dev->sess_dev_list); + mutex_unlock(&srv_dev->lock); + + list_add(&srv_sess_dev->sess_list, &srv_sess->sess_dev_list); + + rnbd_srv_info(srv_sess_dev, "Opened device '%s'\n", srv_dev->id); + + kfree(full_path); + +fill_response: + rnbd_srv_fill_msg_open_rsp(rsp, srv_sess_dev); + mutex_unlock(&srv_sess->lock); + return 0; + +free_srv_sess_dev: + xa_erase(&srv_sess->index_idr, srv_sess_dev->device_id); + synchronize_rcu(); + kfree(srv_sess_dev); +srv_dev_put: + if (open_msg->access_mode != RNBD_ACCESS_RO) { + mutex_lock(&srv_dev->lock); + srv_dev->open_write_cnt--; + mutex_unlock(&srv_dev->lock); + } + rnbd_put_srv_dev(srv_dev); +rnbd_dev_close: + rnbd_dev_close(rnbd_dev); +free_path: + kfree(full_path); +reject: + mutex_unlock(&srv_sess->lock); + return ret; +} + +static struct rtrs_srv_ctx *rtrs_ctx; + +static struct rtrs_srv_ops rtrs_ops; +static int __init rnbd_srv_init_module(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct rnbd_msg_hdr) != 4); + BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info) != 36); + BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info_rsp) != 36); + BUILD_BUG_ON(sizeof(struct rnbd_msg_open) != 264); + BUILD_BUG_ON(sizeof(struct rnbd_msg_close) != 8); + BUILD_BUG_ON(sizeof(struct rnbd_msg_open_rsp) != 56); + rtrs_ops = (struct rtrs_srv_ops) { + .rdma_ev = rnbd_srv_rdma_ev, + .link_ev = rnbd_srv_link_ev, + }; + rtrs_ctx = rtrs_srv_open(&rtrs_ops, port_nr); + if (IS_ERR(rtrs_ctx)) { + err = PTR_ERR(rtrs_ctx); + pr_err("rtrs_srv_open(), err: %d\n", err); + return err; + } + + err = rnbd_srv_create_sysfs_files(); + if (err) { + pr_err("rnbd_srv_create_sysfs_files(), err: %d\n", err); + rtrs_srv_close(rtrs_ctx); + return err; + } + + return 0; +} + +static void __exit rnbd_srv_cleanup_module(void) +{ + rtrs_srv_close(rtrs_ctx); + WARN_ON(!list_empty(&sess_list)); + rnbd_srv_destroy_sysfs_files(); +} + +module_init(rnbd_srv_init_module); +module_exit(rnbd_srv_cleanup_module); -- cgit v1.2.3 From f0aad9baadb5b2933e6f4fbb2fd3ffbdcc35b2cf Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:27 +0200 Subject: block/rnbd: server: functionality for IO submitting to block dev This provides helper functions for IO submitting to block dev. Link: https://lore.kernel.org/r/20200511135131.27580-22-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Reviewed-by: Bart Van Assche Acked-by: Jens Axboe Signed-off-by: Jason Gunthorpe --- drivers/block/rnbd/rnbd-srv-dev.c | 134 ++++++++++++++++++++++++++++++++++++++ drivers/block/rnbd/rnbd-srv-dev.h | 92 ++++++++++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100644 drivers/block/rnbd/rnbd-srv-dev.c create mode 100644 drivers/block/rnbd/rnbd-srv-dev.h diff --git a/drivers/block/rnbd/rnbd-srv-dev.c b/drivers/block/rnbd/rnbd-srv-dev.c new file mode 100644 index 000000000000..5eddfd29ab64 --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv-dev.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include "rnbd-srv-dev.h" +#include "rnbd-log.h" + +struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags, + struct bio_set *bs) +{ + struct rnbd_dev *dev; + int ret; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + dev->blk_open_flags = flags; + dev->bdev = blkdev_get_by_path(path, flags, THIS_MODULE); + ret = PTR_ERR_OR_ZERO(dev->bdev); + if (ret) + goto err; + + dev->blk_open_flags = flags; + bdevname(dev->bdev, dev->name); + dev->ibd_bio_set = bs; + + return dev; + +err: + kfree(dev); + return ERR_PTR(ret); +} + +void rnbd_dev_close(struct rnbd_dev *dev) +{ + blkdev_put(dev->bdev, dev->blk_open_flags); + kfree(dev); +} + +static void rnbd_dev_bi_end_io(struct bio *bio) +{ + struct rnbd_dev_blk_io *io = bio->bi_private; + + rnbd_endio(io->priv, blk_status_to_errno(bio->bi_status)); + bio_put(bio); +} + +/** + * rnbd_bio_map_kern - map kernel address into bio + * @data: pointer to buffer to map + * @bs: bio_set to use. + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *rnbd_bio_map_kern(void *data, struct bio_set *bs, + unsigned int len, gfp_t gfp_mask) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + int offset, i; + struct bio *bio; + + bio = bio_alloc_bioset(gfp_mask, nr_pages, bs); + if (!bio) + return ERR_PTR(-ENOMEM); + + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + if (bio_add_page(bio, virt_to_page(data), bytes, + offset) < bytes) { + /* we don't support partial mappings */ + bio_put(bio); + return ERR_PTR(-EINVAL); + } + + data += bytes; + len -= bytes; + offset = 0; + } + + bio->bi_end_io = bio_put; + return bio; +} + +int rnbd_dev_submit_io(struct rnbd_dev *dev, sector_t sector, void *data, + size_t len, u32 bi_size, enum rnbd_io_flags flags, + short prio, void *priv) +{ + struct rnbd_dev_blk_io *io; + struct bio *bio; + + /* Generate bio with pages pointing to the rdma buffer */ + bio = rnbd_bio_map_kern(data, dev->ibd_bio_set, len, GFP_KERNEL); + if (IS_ERR(bio)) + return PTR_ERR(bio); + + io = container_of(bio, struct rnbd_dev_blk_io, bio); + + io->dev = dev; + io->priv = priv; + + bio->bi_end_io = rnbd_dev_bi_end_io; + bio->bi_private = io; + bio->bi_opf = rnbd_to_bio_flags(flags); + bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_size = bi_size; + bio_set_prio(bio, prio); + bio_set_dev(bio, dev->bdev); + + submit_bio(bio); + + return 0; +} diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h new file mode 100644 index 000000000000..0f65b09a270e --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv-dev.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#ifndef RNBD_SRV_DEV_H +#define RNBD_SRV_DEV_H + +#include +#include "rnbd-proto.h" + +struct rnbd_dev { + struct block_device *bdev; + struct bio_set *ibd_bio_set; + fmode_t blk_open_flags; + char name[BDEVNAME_SIZE]; +}; + +struct rnbd_dev_blk_io { + struct rnbd_dev *dev; + void *priv; + /* have to be last member for front_pad usage of bioset_init */ + struct bio bio; +}; + +/** + * rnbd_dev_open() - Open a device + * @flags: open flags + * @bs: bio_set to use during block io, + */ +struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags, + struct bio_set *bs); + +/** + * rnbd_dev_close() - Close a device + */ +void rnbd_dev_close(struct rnbd_dev *dev); + +void rnbd_endio(void *priv, int error); + +static inline int rnbd_dev_get_max_segs(const struct rnbd_dev *dev) +{ + return queue_max_segments(bdev_get_queue(dev->bdev)); +} + +static inline int rnbd_dev_get_max_hw_sects(const struct rnbd_dev *dev) +{ + return queue_max_hw_sectors(bdev_get_queue(dev->bdev)); +} + +static inline int rnbd_dev_get_secure_discard(const struct rnbd_dev *dev) +{ + return blk_queue_secure_erase(bdev_get_queue(dev->bdev)); +} + +static inline int rnbd_dev_get_max_discard_sects(const struct rnbd_dev *dev) +{ + if (!blk_queue_discard(bdev_get_queue(dev->bdev))) + return 0; + + return blk_queue_get_max_sectors(bdev_get_queue(dev->bdev), + REQ_OP_DISCARD); +} + +static inline int rnbd_dev_get_discard_granularity(const struct rnbd_dev *dev) +{ + return bdev_get_queue(dev->bdev)->limits.discard_granularity; +} + +static inline int rnbd_dev_get_discard_alignment(const struct rnbd_dev *dev) +{ + return bdev_get_queue(dev->bdev)->limits.discard_alignment; +} + +/** + * rnbd_dev_submit_io() - Submit an I/O to the disk + * @dev: device to that the I/O is submitted + * @sector: address to read/write data to + * @data: I/O data to write or buffer to read I/O date into + * @len: length of @data + * @bi_size: Amount of data that will be read/written + * @prio: IO priority + * @priv: private data passed to @io_fn + */ +int rnbd_dev_submit_io(struct rnbd_dev *dev, sector_t sector, void *data, + size_t len, u32 bi_size, enum rnbd_io_flags flags, + short prio, void *priv); + +#endif /* RNBD_SRV_DEV_H */ -- cgit v1.2.3 From 8cee532f469bbcdb6ac0ab161ebff36fbc6439d7 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:28 +0200 Subject: block/rnbd: server: sysfs interface functions This is the sysfs interface to rnbd mapped devices on server side: /sys/class/rnbd-server/ctl/devices// |- block_dev | *** link pointing to the corresponding block device sysfs entry | |- sessions// | *** sessions directory | |- read_only | *** is devices mapped as read only | |- mapping_path *** relative device path provided by the client during mapping Link: https://lore.kernel.org/r/20200511135131.27580-23-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Reviewed-by: Bart Van Assche Acked-by: Jens Axboe Signed-off-by: Jason Gunthorpe --- drivers/block/rnbd/rnbd-srv-sysfs.c | 215 ++++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) create mode 100644 drivers/block/rnbd/rnbd-srv-sysfs.c diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c new file mode 100644 index 000000000000..106775c074d1 --- /dev/null +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -0,0 +1,215 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * RDMA Network Block Driver + * + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved. + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved. + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved. + */ +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rnbd-srv.h" + +static struct device *rnbd_dev; +static struct class *rnbd_dev_class; +static struct kobject *rnbd_devs_kobj; + +static void rnbd_srv_dev_release(struct kobject *kobj) +{ + struct rnbd_srv_dev *dev; + + dev = container_of(kobj, struct rnbd_srv_dev, dev_kobj); + + kfree(dev); +} + +static struct kobj_type dev_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = rnbd_srv_dev_release +}; + +int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, + struct block_device *bdev, + const char *dev_name) +{ + struct kobject *bdev_kobj; + int ret; + + ret = kobject_init_and_add(&dev->dev_kobj, &dev_ktype, + rnbd_devs_kobj, dev_name); + if (ret) + return ret; + + dev->dev_sessions_kobj = kobject_create_and_add("sessions", + &dev->dev_kobj); + if (!dev->dev_sessions_kobj) + goto put_dev_kobj; + + bdev_kobj = &disk_to_dev(bdev->bd_disk)->kobj; + ret = sysfs_create_link(&dev->dev_kobj, bdev_kobj, "block_dev"); + if (ret) + goto put_sess_kobj; + + return 0; + +put_sess_kobj: + kobject_put(dev->dev_sessions_kobj); +put_dev_kobj: + kobject_put(&dev->dev_kobj); + return ret; +} + +void rnbd_srv_destroy_dev_sysfs(struct rnbd_srv_dev *dev) +{ + sysfs_remove_link(&dev->dev_kobj, "block_dev"); + kobject_del(dev->dev_sessions_kobj); + kobject_put(dev->dev_sessions_kobj); + kobject_del(&dev->dev_kobj); + kobject_put(&dev->dev_kobj); +} + +static ssize_t read_only_show(struct kobject *kobj, struct kobj_attribute *attr, + char *page) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); + + return scnprintf(page, PAGE_SIZE, "%d\n", + !(sess_dev->open_flags & FMODE_WRITE)); +} + +static struct kobj_attribute rnbd_srv_dev_session_ro_attr = + __ATTR_RO(read_only); + +static ssize_t access_mode_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *page) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", + rnbd_access_mode_str(sess_dev->access_mode)); +} + +static struct kobj_attribute rnbd_srv_dev_session_access_mode_attr = + __ATTR_RO(access_mode); + +static ssize_t mapping_path_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); + + return scnprintf(page, PAGE_SIZE, "%s\n", sess_dev->pathname); +} + +static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr = + __ATTR_RO(mapping_path); + +static struct attribute *rnbd_srv_default_dev_sessions_attrs[] = { + &rnbd_srv_dev_session_access_mode_attr.attr, + &rnbd_srv_dev_session_ro_attr.attr, + &rnbd_srv_dev_session_mapping_path_attr.attr, + NULL, +}; + +static struct attribute_group rnbd_srv_default_dev_session_attr_group = { + .attrs = rnbd_srv_default_dev_sessions_attrs, +}; + +void rnbd_srv_destroy_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev) +{ + sysfs_remove_group(&sess_dev->kobj, + &rnbd_srv_default_dev_session_attr_group); + + kobject_del(&sess_dev->kobj); + kobject_put(&sess_dev->kobj); +} + +static void rnbd_srv_sess_dev_release(struct kobject *kobj) +{ + struct rnbd_srv_sess_dev *sess_dev; + + sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj); + rnbd_destroy_sess_dev(sess_dev); +} + +static struct kobj_type rnbd_srv_sess_dev_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .release = rnbd_srv_sess_dev_release, +}; + +int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev) +{ + int ret; + + ret = kobject_init_and_add(&sess_dev->kobj, &rnbd_srv_sess_dev_ktype, + sess_dev->dev->dev_sessions_kobj, "%s", + sess_dev->sess->sessname); + if (ret) + return ret; + + ret = sysfs_create_group(&sess_dev->kobj, + &rnbd_srv_default_dev_session_attr_group); + if (ret) + goto err; + + return 0; + +err: + kobject_put(&sess_dev->kobj); + + return ret; +} + +int rnbd_srv_create_sysfs_files(void) +{ + int err; + + rnbd_dev_class = class_create(THIS_MODULE, "rnbd-server"); + if (IS_ERR(rnbd_dev_class)) + return PTR_ERR(rnbd_dev_class); + + rnbd_dev = device_create(rnbd_dev_class, NULL, + MKDEV(0, 0), NULL, "ctl"); + if (IS_ERR(rnbd_dev)) { + err = PTR_ERR(rnbd_dev); + goto cls_destroy; + } + rnbd_devs_kobj = kobject_create_and_add("devices", &rnbd_dev->kobj); + if (!rnbd_devs_kobj) { + err = -ENOMEM; + goto dev_destroy; + } + + return 0; + +dev_destroy: + device_destroy(rnbd_dev_class, MKDEV(0, 0)); +cls_destroy: + class_destroy(rnbd_dev_class); + + return err; +} + +void rnbd_srv_destroy_sysfs_files(void) +{ + kobject_del(rnbd_devs_kobj); + kobject_put(rnbd_devs_kobj); + device_destroy(rnbd_dev_class, MKDEV(0, 0)); + class_destroy(rnbd_dev_class); +} -- cgit v1.2.3 From bc01885342e193e7943d86ccbd7bc3e8fee50a68 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:29 +0200 Subject: block/rnbd: include client and server modules into kernel compilation Add rnbd Makefile, Kconfig and also corresponding lines into upper block layer files. Link: https://lore.kernel.org/r/20200511135131.27580-24-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Reviewed-by: Bart Van Assche Acked-by: Jens Axboe Signed-off-by: Jason Gunthorpe --- drivers/block/Kconfig | 2 ++ drivers/block/Makefile | 1 + drivers/block/rnbd/Kconfig | 28 ++++++++++++++++++++++++++++ drivers/block/rnbd/Makefile | 15 +++++++++++++++ 4 files changed, 46 insertions(+) create mode 100644 drivers/block/rnbd/Kconfig create mode 100644 drivers/block/rnbd/Makefile diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 025b1b77b11a..084b9efcefca 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -458,4 +458,6 @@ config BLK_DEV_RSXX To compile this driver as a module, choose M here: the module will be called rsxx. +source "drivers/block/rnbd/Kconfig" + endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 795facd8cf19..e1f63117ee94 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ obj-$(CONFIG_ZRAM) += zram/ +obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o null_blk-objs := null_blk_main.o diff --git a/drivers/block/rnbd/Kconfig b/drivers/block/rnbd/Kconfig new file mode 100644 index 000000000000..4b6d3d816d1f --- /dev/null +++ b/drivers/block/rnbd/Kconfig @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +config BLK_DEV_RNBD + bool + +config BLK_DEV_RNBD_CLIENT + tristate "RDMA Network Block Device driver client" + depends on INFINIBAND_RTRS_CLIENT + select BLK_DEV_RNBD + help + RNBD client is a network block device driver using rdma transport. + + RNBD client allows for mapping of a remote block devices over + RTRS protocol from a target system where RNBD server is running. + + If unsure, say N. + +config BLK_DEV_RNBD_SERVER + tristate "RDMA Network Block Device driver server" + depends on INFINIBAND_RTRS_SERVER + select BLK_DEV_RNBD + help + RNBD server is the server side of RNBD using rdma transport. + + RNBD server allows for exporting local block devices to a remote client + over RTRS protocol. + + If unsure, say N. diff --git a/drivers/block/rnbd/Makefile b/drivers/block/rnbd/Makefile new file mode 100644 index 000000000000..5bb1a7ad1ada --- /dev/null +++ b/drivers/block/rnbd/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +ccflags-y := -I$(srctree)/drivers/infiniband/ulp/rtrs + +rnbd-client-y := rnbd-clt.o \ + rnbd-clt-sysfs.o \ + rnbd-common.o + +rnbd-server-y := rnbd-common.o \ + rnbd-srv.o \ + rnbd-srv-dev.o \ + rnbd-srv-sysfs.o + +obj-$(CONFIG_BLK_DEV_RNBD_CLIENT) += rnbd-client.o +obj-$(CONFIG_BLK_DEV_RNBD_SERVER) += rnbd-server.o -- cgit v1.2.3 From aa4d16e44f607caccaa697fcb29f2c94672f08d5 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:30 +0200 Subject: block/rnbd: a bit of documentation README with description of major sysfs entries, sysfs documentation are moved to ABI dir as Bart suggested. Link: https://lore.kernel.org/r/20200511135131.27580-25-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Reviewed-by: Bart Van Assche Acked-by: Jens Axboe Signed-off-by: Jason Gunthorpe --- Documentation/ABI/testing/sysfs-block-rnbd | 46 +++++++++ Documentation/ABI/testing/sysfs-class-rnbd-client | 111 ++++++++++++++++++++++ Documentation/ABI/testing/sysfs-class-rnbd-server | 50 ++++++++++ drivers/block/rnbd/README | 92 ++++++++++++++++++ 4 files changed, 299 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-block-rnbd create mode 100644 Documentation/ABI/testing/sysfs-class-rnbd-client create mode 100644 Documentation/ABI/testing/sysfs-class-rnbd-server create mode 100644 drivers/block/rnbd/README diff --git a/Documentation/ABI/testing/sysfs-block-rnbd b/Documentation/ABI/testing/sysfs-block-rnbd new file mode 100644 index 000000000000..8f070b47f361 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-block-rnbd @@ -0,0 +1,46 @@ +What: /sys/block/rnbd/rnbd/unmap_device +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: To unmap a volume, "normal" or "force" has to be written to: + /sys/block/rnbd/rnbd/unmap_device + + When "normal" is used, the operation will fail with EBUSY if any process + is using the device. When "force" is used, the device is also unmapped + when device is in use. All I/Os that are in progress will fail. + + Example: + + # echo "normal" > /sys/block/rnbd0/rnbd/unmap_device + +What: /sys/block/rnbd/rnbd/state +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: The file contains the current state of the block device. The state file + returns "open" when the device is successfully mapped from the server + and accepting I/O requests. When the connection to the server gets + disconnected in case of an error (e.g. link failure), the state file + returns "closed" and all I/O requests submitted to it will fail with -EIO. + +What: /sys/block/rnbd/rnbd/session +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: RNBD uses RTRS session to transport the data between client and + server. The entry "session" contains the name of the session, that + was used to establish the RTRS session. It's the same name that + was passed as server parameter to the map_device entry. + +What: /sys/block/rnbd/rnbd/mapping_path +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Contains the path that was passed as "device_path" to the map_device + operation. + +What: /sys/block/rnbd/rnbd/access_mode +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Contains the device access mode: ro, rw or migration. diff --git a/Documentation/ABI/testing/sysfs-class-rnbd-client b/Documentation/ABI/testing/sysfs-class-rnbd-client new file mode 100644 index 000000000000..c084f203b41e --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-rnbd-client @@ -0,0 +1,111 @@ +What: /sys/class/rnbd-client +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Provide information about RNBD-client. + All sysfs files that are not read-only provide the usage information on read: + + Example: + # cat /sys/class/rnbd-client/ctl/map_device + + > Usage: echo "sessname= path=<[srcaddr,]dstaddr> + > [path=<[srcaddr,]dstaddr>] device_path= + > [access_mode=] > map_device + > + > addr ::= [ ip: | ip: | gid: ] + +What: /sys/class/rnbd-client/ctl/map_device +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Expected format is the following: + + sessname= + path=<[srcaddr,]dstaddr> [path=<[srcaddr,]dstaddr> ...] + device_path= + [access_mode=] + + Where: + + sessname: accepts a string not bigger than 256 chars, which identifies + a given session on the client and on the server. + I.e. "clt_hostname-srv_hostname" could be a natural choice. + + path: describes a connection between the client and the server by + specifying destination and, when required, the source address. + The addresses are to be provided in the following format: + + ip: + ip: + gid: + + for example: + + path=ip:10.0.0.66 + The single addr is treated as the destination. + The connection will be established to this server from any client IP address. + + path=ip:10.0.0.66,ip:10.0.1.66 + First addr is the source address and the second is the destination. + + If multiple "path=" options are specified multiple connection + will be established and data will be sent according to + the selected multipath policy (see RTRS mp_policy sysfs entry description). + + device_path: Path to the block device on the server side. Path is specified + relative to the directory on server side configured in the + 'dev_search_path' module parameter of the rnbd_server. + The rnbd_server prepends the received from client + with and tries to open the + / block device. On success, + a /dev/rnbd device file, a /sys/block/rnbd_client/rnbd/ + directory and an entry in /sys/class/rnbd-client/ctl/devices + will be created. + + If 'dev_search_path' contains '%SESSNAME%', then each session can + have different devices namespace, e.g. server was configured with + the following parameter "dev_search_path=/run/rnbd-devs/%SESSNAME%", + client has this string "sessname=blya device_path=sda", then server + will try to open: /run/rnbd-devs/blya/sda. + + access_mode: the access_mode parameter specifies if the device is to be + mapped as "ro" read-only or "rw" read-write. The server allows + a device to be exported in rw mode only once. The "migration" + access mode has to be specified if a second mapping in read-write + mode is desired. + + By default "rw" is used. + + Exit Codes: + + If the device is already mapped it will fail with EEXIST. If the input + has an invalid format it will return EINVAL. If the device path cannot + be found on the server, it will fail with ENOENT. + + Finding device file after mapping + --------------------------------- + + After mapping, the device file can be found by: + o The symlink /sys/class/rnbd-client/ctl/devices/ + points to /sys/block/. The last part of the symlink destination + is the same as the device name. By extracting the last part of the + path the path to the device /dev/ can be build. + + o /dev/block/$(cat /sys/class/rnbd-client/ctl/devices//dev) + + How to find the of the device is described on the next + section. + +What: /sys/class/rnbd-client/ctl/devices/ +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: For each device mapped on the client a new symbolic link is created as + /sys/class/rnbd-client/ctl/devices/, which points + to the block device created by rnbd (/sys/block/rnbd/). + The of each device is created as follows: + + - If the 'device_path' provided during mapping contains slashes ("/"), + they are replaced by exclamation mark ("!") and used as as the + . Otherwise, the will be the same as the + "device_path" provided. diff --git a/Documentation/ABI/testing/sysfs-class-rnbd-server b/Documentation/ABI/testing/sysfs-class-rnbd-server new file mode 100644 index 000000000000..ba60a90c0e45 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-rnbd-server @@ -0,0 +1,50 @@ +What: /sys/class/rnbd-server +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: provide information about RNBD-server. + +What: /sys/class/rnbd-server/ctl/ +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: When a client maps a device, a directory entry with the name of the + block device is created under /sys/class/rnbd-server/ctl/devices/. + +What: /sys/class/rnbd-server/ctl/devices//block_dev +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Is a symlink to the sysfs entry of the exported device. + + Example: + block_dev -> ../../../../class/block/ram0 + +What: /sys/class/rnbd-server/ctl/devices//sessions/ +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: For each client a particular device is exported to, following directory will be + created: + + /sys/class/rnbd-server/ctl/devices//sessions// + + When the device is unmapped by that client, the directory will be removed. + +What: /sys/class/rnbd-server/ctl/devices//sessions//read_only +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Contains '1' if device is mapped read-only, otherwise '0'. + +What: /sys/class/rnbd-server/ctl/devices//sessions//mapping_path +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Contains the relative device path provided by the user during mapping. + +What: /sys/class/rnbd-server/ctl/devices//sessions//access_mode +Date: Feb 2020 +KernelVersion: 5.7 +Contact: Jack Wang Danil Kipnis +Description: Contains the device access mode: ro, rw or migration. diff --git a/drivers/block/rnbd/README b/drivers/block/rnbd/README new file mode 100644 index 000000000000..1773c0aa0bd4 --- /dev/null +++ b/drivers/block/rnbd/README @@ -0,0 +1,92 @@ +******************************** +RDMA Network Block Device (RNBD) +******************************** + +Introduction +------------ + +RNBD (RDMA Network Block Device) is a pair of kernel modules +(client and server) that allow for remote access of a block device on +the server over RTRS protocol using the RDMA (InfiniBand, RoCE, iWARP) +transport. After being mapped, the remote block devices can be accessed +on the client side as local block devices. + +I/O is transferred between client and server by the RTRS transport +modules. The administration of RNBD and RTRS modules is done via +sysfs entries. + +Requirements +------------ + + RTRS kernel modules + +Quick Start +----------- + +Server side: + # modprobe rnbd_server + +Client side: + # modprobe rnbd_client + # echo "sessname=blya path=ip:10.50.100.66 device_path=/dev/ram0" > \ + /sys/devices/virtual/rnbd-client/ctl/map_device + + Where "sessname=" is a session name, a string to identify the session + on client and on server sides; "path=" is a destination IP address or + a pair of a source and a destination IPs, separated by comma. Multiple + "path=" options can be specified in order to use multipath (see RTRS + description for details); "device_path=" is the block device to be + mapped from the server side. After the session to the server machine is + established, the mapped device will appear on the client side under + /dev/rnbd. + + +RNBD-Server Module Parameters +============================= + +dev_search_path +--------------- + +When a device is mapped from the client, the server generates the path +to the block device on the server side by concatenating dev_search_path +and the "device_path" that was specified in the map_device operation. + +The default dev_search_path is: "/". + +dev_search_path option can also contain %SESSNAME% in order to provide +different device namespaces for different sessions. See "device_path" +option for details. + +============================ +Protocol (rnbd/rnbd-proto.h) +============================ + +1. Before mapping first device from a given server, client sends an +RNBD_MSG_SESS_INFO to the server. Server responds with +RNBD_MSG_SESS_INFO_RSP. Currently the messages only contain the protocol +version for backward compatibility. + +2. Client requests to open a device by sending RNBD_MSG_OPEN message. This +contains the path to the device and access mode (read-only or writable). +Server responds to the message with RNBD_MSG_OPEN_RSP. This contains +a 32 bit device id to be used for IOs and device "geometry" related +information: side, max_hw_sectors, etc. + +3. Client attaches RNBD_MSG_IO to each IO message send to a device. This +message contains device id, provided by server in his rnbd_msg_open_rsp, +sector to be accessed, read-write flags and bi_size. + +4. Client closes a device by sending RNBD_MSG_CLOSE which contains only the +device id provided by the server. + +========================================= +Contributors List(in alphabetical order) +========================================= +Danil Kipnis +Fabian Holler +Guoqing Jiang +Jack Wang +Kleber Souza +Lutz Pogrell +Milind Dumbare +Roman Penyaev -- cgit v1.2.3 From f11e0ec55f0c80ff47693af2150bad5db0e20387 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 11 May 2020 15:51:31 +0200 Subject: MAINTAINERS: Add maintainers for RNBD/RTRS modules Danil and I will maintain RNBD/RTRS modules. Link: https://lore.kernel.org/r/20200511135131.27580-26-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Signed-off-by: Jack Wang Reviewed-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- MAINTAINERS | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index e64e5db31497..8031a923e5a5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14448,6 +14448,13 @@ F: arch/riscv/ N: riscv K: riscv +RNBD BLOCK DRIVERS +M: Danil Kipnis +M: Jack Wang +L: linux-block@vger.kernel.org +S: Maintained +F: drivers/block/rnbd/ + ROCCAT DRIVERS M: Stefan Achatz S: Maintained @@ -14576,6 +14583,13 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/jes/linux.git rtl8xxxu-devel F: drivers/net/wireless/realtek/rtl8xxxu/ +RTRS TRANSPORT DRIVERS +M: Danil Kipnis +M: Jack Wang +L: linux-rdma@vger.kernel.org +S: Maintained +F: drivers/infiniband/ulp/rtrs/ + RXRPC SOCKETS (AF_RXRPC) M: David Howells L: linux-afs@lists.infradead.org -- cgit v1.2.3 From b19a530b002fabdd93da62504b9cb0778447e8e2 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 6 May 2020 11:24:38 +0300 Subject: RDMA/uverbs: Cleanup wq/srq context usage from uverbs layer Both wq_context and srq_context are some leftover from the past in uverbs layer, they are not really in use, drop them. Link: https://lore.kernel.org/r/20200506082444.14502-5-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 060b4ebbd2ba..dc8fe1a4eba3 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2954,7 +2954,6 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) wq_init_attr.cq = cq; wq_init_attr.max_sge = cmd.max_sge; wq_init_attr.max_wr = cmd.max_wr; - wq_init_attr.wq_context = attrs->ufile; wq_init_attr.wq_type = cmd.wq_type; wq_init_attr.event_handler = ib_uverbs_wq_event_handler; wq_init_attr.create_flags = cmd.create_flags; @@ -2972,7 +2971,6 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) wq->cq = cq; wq->pd = pd; wq->device = pd->device; - wq->wq_context = wq_init_attr.wq_context; atomic_set(&wq->usecnt, 0); atomic_inc(&pd->usecnt); atomic_inc(&cq->usecnt); @@ -3441,7 +3439,6 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, } attr.event_handler = ib_uverbs_srq_event_handler; - attr.srq_context = attrs->ufile; attr.srq_type = cmd->srq_type; attr.attr.max_wr = cmd->max_wr; attr.attr.max_sge = cmd->max_sge; @@ -3460,7 +3457,6 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, srq->srq_type = cmd->srq_type; srq->uobject = obj; srq->event_handler = attr.event_handler; - srq->srq_context = attr.srq_context; ret = pd->device->ops.create_srq(srq, &attr, udata); if (ret) -- cgit v1.2.3 From dbd67252869ba58d086edfa14113e10f8059b97e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 6 May 2020 11:24:42 +0300 Subject: RDMA/uverbs: Fix create WQ to use the given user handle Fix create WQ to use the given user handle, in addition dropped some duplicated code from this flow. Fixes: fd3c7904db6e ("IB/core: Change idr objects to use the new schema") Fixes: f213c0527210 ("IB/uverbs: Add WQ support") Link: https://lore.kernel.org/r/20200506082444.14502-9-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index dc8fe1a4eba3..e03f3a43996b 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2958,6 +2958,7 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) wq_init_attr.event_handler = ib_uverbs_wq_event_handler; wq_init_attr.create_flags = cmd.create_flags; INIT_LIST_HEAD(&obj->uevent.event_list); + obj->uevent.uobject.user_handle = cmd.user_handle; wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata); if (IS_ERR(wq)) { @@ -2974,8 +2975,6 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) atomic_set(&wq->usecnt, 0); atomic_inc(&pd->usecnt); atomic_inc(&cq->usecnt); - wq->uobject = obj; - obj->uevent.uobject.object = wq; memset(&resp, 0, sizeof(resp)); resp.wq_handle = obj->uevent.uobject.id; -- cgit v1.2.3 From b0810b037de0b62a3c6e3abfc123fe2734335f53 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 May 2020 11:24:39 +0300 Subject: RDMA/core: Consolidate ib_create_srq flows The uverbs layer largely duplicate the code in ib_create_srq(), with the slight difference that it passes in a udata. Move all the code together into ib_create_srq_user() and provide an inline for kernel users, similar to other create calls. Link: https://lore.kernel.org/r/20200506082444.14502-6-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 40 ++++++------------------------------ drivers/infiniband/core/verbs.c | 29 +++++++++++++++++++------- include/rdma/ib_verbs.h | 27 +++++++++++------------- 3 files changed, 40 insertions(+), 56 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index e03f3a43996b..d5642bcf93ee 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3444,38 +3444,15 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, attr.attr.srq_limit = cmd->srq_limit; INIT_LIST_HEAD(&obj->uevent.event_list); + obj->uevent.uobject.user_handle = cmd->user_handle; - srq = rdma_zalloc_drv_obj(ib_dev, ib_srq); - if (!srq) { - ret = -ENOMEM; - goto err_put; - } - - srq->device = pd->device; - srq->pd = pd; - srq->srq_type = cmd->srq_type; - srq->uobject = obj; - srq->event_handler = attr.event_handler; - - ret = pd->device->ops.create_srq(srq, &attr, udata); - if (ret) - goto err_free; - - if (ib_srq_has_cq(cmd->srq_type)) { - srq->ext.cq = attr.ext.cq; - atomic_inc(&attr.ext.cq->usecnt); - } - - if (cmd->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; - atomic_inc(&attr.ext.xrc.xrcd->usecnt); + srq = ib_create_srq_user(pd, &attr, obj, udata); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err_put_pd; } - atomic_inc(&pd->usecnt); - atomic_set(&srq->usecnt, 0); - obj->uevent.uobject.object = srq; - obj->uevent.uobject.user_handle = cmd->user_handle; memset(&resp, 0, sizeof resp); resp.srq_handle = obj->uevent.uobject.id; @@ -3501,13 +3478,8 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, err_copy: ib_destroy_srq_user(srq, uverbs_get_cleared_udata(attrs)); - /* It was released in ib_destroy_srq_user */ - srq = NULL; -err_free: - kfree(srq); -err_put: +err_put_pd: uobj_put_obj_read(pd); - err_put_cq: if (ib_srq_has_cq(cmd->srq_type)) rdma_lookup_put_uobject(&attr.ext.cq->uobject->uevent.uobject, diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index bf0249f76ae9..e2c9430a3ff1 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -981,15 +981,29 @@ EXPORT_SYMBOL(rdma_destroy_ah_user); /* Shared receive queues */ -struct ib_srq *ib_create_srq(struct ib_pd *pd, - struct ib_srq_init_attr *srq_init_attr) +/** + * ib_create_srq_user - Creates a SRQ associated with the specified protection + * domain. + * @pd: The protection domain associated with the SRQ. + * @srq_init_attr: A list of initial attributes required to create the + * SRQ. If SRQ creation succeeds, then the attributes are updated to + * the actual capabilities of the created SRQ. + * @uobject - uobject pointer if this is not a kernel SRQ + * @udata - udata pointer if this is not a kernel SRQ + * + * srq_attr->max_wr and srq_attr->max_sge are read the determine the + * requested size of the SRQ, and set to the actual values allocated + * on return. If ib_create_srq() succeeds, then max_wr and max_sge + * will always be at least as large as the requested values. + */ +struct ib_srq *ib_create_srq_user(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_usrq_object *uobject, + struct ib_udata *udata) { struct ib_srq *srq; int ret; - if (!pd->device->ops.create_srq) - return ERR_PTR(-EOPNOTSUPP); - srq = rdma_zalloc_drv_obj(pd->device, ib_srq); if (!srq) return ERR_PTR(-ENOMEM); @@ -999,6 +1013,7 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; srq->srq_type = srq_init_attr->srq_type; + srq->uobject = uobject; if (ib_srq_has_cq(srq->srq_type)) { srq->ext.cq = srq_init_attr->ext.cq; @@ -1010,7 +1025,7 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, } atomic_inc(&pd->usecnt); - ret = pd->device->ops.create_srq(srq, srq_init_attr, NULL); + ret = pd->device->ops.create_srq(srq, srq_init_attr, udata); if (ret) { atomic_dec(&srq->pd->usecnt); if (srq->srq_type == IB_SRQT_XRC) @@ -1023,7 +1038,7 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, return srq; } -EXPORT_SYMBOL(ib_create_srq); +EXPORT_SYMBOL(ib_create_srq_user); int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4c488cade70f..db58f11552f1 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3559,21 +3559,18 @@ static inline int rdma_destroy_ah(struct ib_ah *ah, u32 flags) return rdma_destroy_ah_user(ah, flags, NULL); } -/** - * ib_create_srq - Creates a SRQ associated with the specified protection - * domain. - * @pd: The protection domain associated with the SRQ. - * @srq_init_attr: A list of initial attributes required to create the - * SRQ. If SRQ creation succeeds, then the attributes are updated to - * the actual capabilities of the created SRQ. - * - * srq_attr->max_wr and srq_attr->max_sge are read the determine the - * requested size of the SRQ, and set to the actual values allocated - * on return. If ib_create_srq() succeeds, then max_wr and max_sge - * will always be at least as large as the requested values. - */ -struct ib_srq *ib_create_srq(struct ib_pd *pd, - struct ib_srq_init_attr *srq_init_attr); +struct ib_srq *ib_create_srq_user(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_usrq_object *uobject, + struct ib_udata *udata); +static inline struct ib_srq * +ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr) +{ + if (!pd->device->ops.create_srq) + return ERR_PTR(-EOPNOTSUPP); + + return ib_create_srq_user(pd, srq_init_attr, NULL, NULL); +} /** * ib_modify_srq - Modifies the attributes for the specified SRQ. -- cgit v1.2.3 From a8f5c1f1a5c80e5a4bde5eaa1de645d72c562da6 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Mon, 11 May 2020 15:37:09 +0800 Subject: RDMA/srpt: Add a newline when printing parameter 'srpt_service_guid' by sysfs When I cat module parameter 'srpt_service_guid', it displays as follows. It is better to add a newline for easy reading. [root@hulk-202 ~]# cat /sys/module/ib_srpt/parameters/srpt_service_guid 0x0205cdfffe8346b9[root@hulk-202 ~]# Link: https://lore.kernel.org/r/1589182629-27743-1-git-send-email-wangxiongfeng2@huawei.com Signed-off-by: Xiongfeng Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srpt/ib_srpt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 7ed38d1cb997..63056af5337c 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -81,7 +81,7 @@ MODULE_PARM_DESC(srpt_srq_size, static int srpt_get_u64_x(char *buffer, const struct kernel_param *kp) { - return sprintf(buffer, "0x%016llx", *(u64 *)kp->arg); + return sprintf(buffer, "0x%016llx\n", *(u64 *)kp->arg); } module_param_call(srpt_service_guid, NULL, srpt_get_u64_x, &srpt_service_guid, 0444); -- cgit v1.2.3 From daeee976904c0b7326eb4c033df7b28d4b726177 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Wed, 13 May 2020 12:53:04 +0300 Subject: RDMA/mlx5: Update mlx5_ib driver name Current description doesn't include new devices, change it by updating to have more generic description and remove DRIVER_NAME and DRIVER_VERSION defines. Link: https://lore.kernel.org/r/20200513095304.210240-1-leon@kernel.org Signed-off-by: Shay Drory Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 38bf3841741c..26f0b39c7f74 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -72,17 +72,10 @@ #define UVERBS_MODULE_NAME mlx5_ib #include -#define DRIVER_NAME "mlx5_ib" -#define DRIVER_VERSION "5.0-0" - MODULE_AUTHOR("Eli Cohen "); -MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); +MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) IB driver"); MODULE_LICENSE("Dual BSD/GPL"); -static char mlx5_version[] = - DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" - DRIVER_VERSION "\n"; - struct mlx5_ib_event_work { struct work_struct work; union { @@ -7315,8 +7308,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) int port_type_cap; int num_ports; - printk_once(KERN_INFO "%s", mlx5_version); - if (MLX5_ESWITCH_MANAGER(mdev) && mlx5_ib_eswitch_mode(mdev->priv.eswitch) == MLX5_ESWITCH_OFFLOADS) { if (!mlx5_core_mp_enabled(mdev)) -- cgit v1.2.3 From 23bbd5818e2b0d265aa1835e66f5055f63a8fa4c Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Thu, 14 May 2020 14:47:20 +0300 Subject: RDMA/srpt: Fix disabling device management Avoid disabling device management for devices that don't support Management datagrams (MADs) by checking if the "mad_agent" pointer is initialized before calling ib_modify_port, also fix the error flow in srpt_refresh_port() to disable device management if ib_register_mad_agent() fail. Fixes: 09f8a1486dca ("RDMA/srpt: Fix handling of SR-IOV and iWARP ports") Link: https://lore.kernel.org/r/20200514114720.141139-1-kamalheib1@gmail.com Signed-off-by: Kamal Heib Reviewed-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srpt/ib_srpt.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 63056af5337c..a294630f2100 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -607,6 +607,11 @@ static int srpt_refresh_port(struct srpt_port *sport) dev_name(&sport->sdev->device->dev), sport->port, PTR_ERR(sport->mad_agent)); sport->mad_agent = NULL; + memset(&port_modify, 0, sizeof(port_modify)); + port_modify.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP; + ib_modify_port(sport->sdev->device, sport->port, 0, + &port_modify); + } } @@ -630,9 +635,8 @@ static void srpt_unregister_mad_agent(struct srpt_device *sdev) for (i = 1; i <= sdev->device->phys_port_cnt; i++) { sport = &sdev->port[i - 1]; WARN_ON(sport->port != i); - if (ib_modify_port(sdev->device, i, 0, &port_modify) < 0) - pr_err("disabling MAD processing failed.\n"); if (sport->mad_agent) { + ib_modify_port(sdev->device, i, 0, &port_modify); ib_unregister_mad_agent(sport->mad_agent); sport->mad_agent = NULL; } -- cgit v1.2.3 From b386cd65d961e29710ef6ad84bc788f0a7e9d64e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 May 2020 16:32:23 +0300 Subject: RDMA/rtrs: Fix some signedness bugs in error handling The problem is that "req->sg_cnt" is an unsigned int so if "nr" is negative, it gets type promoted to a high positive value and the condition is false. This patch fixes it by handling negatives separately. Fixes: 6a98d71daea1 ("RDMA/rtrs: client: main functionality") Link: https://lore.kernel.org/r/20200519133223.GN2078@kadam Signed-off-by: Dan Carpenter Reviewed-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 7 +++---- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 468fdd0d8713..96cba06e8ba7 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1047,11 +1047,10 @@ static int rtrs_map_sg_fr(struct rtrs_clt_io_req *req, size_t count) /* Align the MR to a 4K page size to match the block virt boundary */ nr = ib_map_mr_sg(req->mr, req->sglist, count, NULL, SZ_4K); - if (unlikely(nr < req->sg_cnt)) { - if (nr < 0) - return nr; + if (nr < 0) + return nr; + if (unlikely(nr < req->sg_cnt)) return -EINVAL; - } ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey)); return nr; diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index ba8ab33b94a2..eefd149ce7a4 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -649,7 +649,7 @@ static int map_cont_bufs(struct rtrs_srv_sess *sess) } nr = ib_map_mr_sg(mr, sgt->sgl, sgt->nents, NULL, max_chunk_size); - if (nr < sgt->nents) { + if (nr < 0 || nr < sgt->nents) { err = nr < 0 ? nr : -EINVAL; goto dereg_mr; } -- cgit v1.2.3 From bf1d8edb38bbf0628c1f2de7d13ab98533c1fe60 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 May 2020 18:45:25 +0300 Subject: RDMA/rtrs: Fix a couple off by one bugs in rtrs_srv_rdma_done() These > comparisons should be >= to prevent accessing one element beyond the end of the buffer. Fixes: 9cb837480424 ("RDMA/rtrs: server: main functionality") Link: https://lore.kernel.org/r/20200519154525.GA66801@mwanda Signed-off-by: Dan Carpenter Acked-by: Danil Kipnis Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index eefd149ce7a4..863b3942e333 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -1213,8 +1213,8 @@ static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc) msg_id = imm_payload >> sess->mem_bits; off = imm_payload & ((1 << sess->mem_bits) - 1); - if (unlikely(msg_id > srv->queue_depth || - off > max_chunk_size)) { + if (unlikely(msg_id >= srv->queue_depth || + off >= max_chunk_size)) { rtrs_err(s, "Wrong msg_id %u, off %u\n", msg_id, off); close_sess(sess); -- cgit v1.2.3 From e19840867013f0f4081265fdb65b31e80b7bcb5b Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 19 May 2020 11:36:12 -0500 Subject: RDMA/rtrs: client: Fix function return on success Remove the if-statement and return the value contained in _err_, unconditionally. Link: https://lore.kernel.org/r/20200519163612.GA6043@embeddedor Addresses-Coverity-ID: 1493753 ("Identical code for different branches") Fixes: 6a98d71daea1 ("RDMA/rtrs: client: main functionality") Signed-off-by: Gustavo A. R. Silva Reviewed-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 96cba06e8ba7..1b98785fd8ac 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -1590,9 +1590,6 @@ static int create_con_cq_qp(struct rtrs_clt_con *con) * In case of error we do not bother to clean previous allocations, * since destroy_con_cq_qp() must be called. */ - - if (err) - return err; return err; } -- cgit v1.2.3 From 6b31afcef51e578e936e66c347ab333c024963da Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 19 May 2020 09:19:12 +0000 Subject: RDMA/rtrs: server: Fix some error return code Fix to return negative error code -ENOMEM from the some error handling cases instead of 0, as done elsewhere in this function. Fixes: 9cb837480424 ("RDMA/rtrs: server: main functionality") Fixes: 91b11610af8d ("RDMA/rtrs: server: sysfs interface functions") Link: https://lore.kernel.org/r/20200519091912.134358-1-weiyongjun1@huawei.com Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Reviewed-by: Danil Kipnis Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c | 1 + drivers/infiniband/ulp/rtrs/rtrs-srv.c | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c index 0cf015634338..3d7877534bcc 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c @@ -189,6 +189,7 @@ static int rtrs_srv_create_once_sysfs_root_folders(struct rtrs_srv_sess *sess) } srv->kobj_paths = kobject_create_and_add("paths", &srv->dev.kobj); if (!srv->kobj_paths) { + err = -ENOMEM; pr_err("kobject_create_and_add(): %d\n", err); device_unregister(&srv->dev); goto unlock; diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 863b3942e333..1fc6ece036ff 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -660,8 +660,8 @@ static int map_cont_bufs(struct rtrs_srv_sess *sess) GFP_KERNEL, sess->s.dev->ib_dev, DMA_TO_DEVICE, rtrs_srv_rdma_done); if (!srv_mr->iu) { - rtrs_err(ss, "rtrs_iu_alloc(), err: %d\n", - -ENOMEM); + err = -ENOMEM; + rtrs_err(ss, "rtrs_iu_alloc(), err: %d\n", err); goto free_iu; } } @@ -2150,8 +2150,10 @@ static int __init rtrs_server_init(void) goto out_chunk_pool; } rtrs_wq = alloc_workqueue("rtrs_server_wq", WQ_MEM_RECLAIM, 0); - if (!rtrs_wq) + if (!rtrs_wq) { + err = -ENOMEM; goto out_dev_class; + } return 0; -- cgit v1.2.3 From d6ea395072457153f2120e2361657e00f3c0958d Mon Sep 17 00:00:00 2001 From: Danil Kipnis Date: Tue, 19 May 2020 13:14:19 +0200 Subject: rnbd/rtrs: Pass max segment size from blk user to the rdma library When Block Device Layer is disabled, BLK_MAX_SEGMENT_SIZE is undefined. The rtrs is a transport library and should compile independently of the block layer. The desired max segment size should be passed down by the user. Introduce max_segment_size parameter for the rtrs_clt_open() call. Fixes: f7a7a5c228d4 ("block/rnbd: client: main functionality") Fixes: 6a98d71daea1 ("RDMA/rtrs: client: main functionality") Fixes: cb80329c9434 ("RDMA/rtrs: client: private header with client structs and functions") Fixes: b5c27cdb094e ("RDMA/rtrs: public interface header to establish RDMA connections") Link: https://lore.kernel.org/r/20200519111419.924170-1-danil.kipnis@cloud.ionos.com Signed-off-by: Danil Kipnis Reported-by: Randy Dunlap Acked-by: Randy Dunlap # build-tested Signed-off-by: Jason Gunthorpe --- drivers/block/rnbd/rnbd-clt.c | 1 + drivers/infiniband/ulp/rtrs/rtrs-clt.c | 17 +++++++++++------ drivers/infiniband/ulp/rtrs/rtrs-clt.h | 1 + drivers/infiniband/ulp/rtrs/rtrs.h | 1 + 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 55bff3b1be71..450a571e6a1e 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1216,6 +1216,7 @@ find_and_get_or_create_sess(const char *sessname, paths, path_cnt, port_nr, sizeof(struct rnbd_iu), RECONNECT_DELAY, BMAX_SEGMENTS, + BLK_MAX_SEGMENT_SIZE, MAX_RECONNECTS); if (IS_ERR(sess->rtrs)) { err = PTR_ERR(sess->rtrs); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 1b98785fd8ac..0ab7e5e912c0 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -12,7 +12,6 @@ #include #include -#include /* for BLK_MAX_SEGMENT_SIZE */ #include "rtrs-clt.h" #include "rtrs-log.h" @@ -1406,7 +1405,8 @@ static void rtrs_clt_close_work(struct work_struct *work); static struct rtrs_clt_sess *alloc_sess(struct rtrs_clt *clt, const struct rtrs_addr *path, - size_t con_num, u16 max_segments) + size_t con_num, u16 max_segments, + size_t max_segment_size) { struct rtrs_clt_sess *sess; int err = -ENOMEM; @@ -1443,7 +1443,7 @@ static struct rtrs_clt_sess *alloc_sess(struct rtrs_clt *clt, strlcpy(sess->s.sessname, clt->sessname, sizeof(sess->s.sessname)); sess->s.con_num = con_num; sess->clt = clt; - sess->max_pages_per_mr = max_segments * BLK_MAX_SEGMENT_SIZE >> 12; + sess->max_pages_per_mr = max_segments * max_segment_size >> 12; init_waitqueue_head(&sess->state_wq); sess->state = RTRS_CLT_CONNECTING; atomic_set(&sess->connected_cnt, 0); @@ -2526,6 +2526,7 @@ static struct rtrs_clt *alloc_clt(const char *sessname, size_t paths_num, void (*link_ev)(void *priv, enum rtrs_clt_link_ev ev), unsigned int max_segments, + size_t max_segment_size, unsigned int reconnect_delay_sec, unsigned int max_reconnect_attempts) { @@ -2555,6 +2556,7 @@ static struct rtrs_clt *alloc_clt(const char *sessname, size_t paths_num, clt->port = port; clt->pdu_sz = pdu_sz; clt->max_segments = max_segments; + clt->max_segment_size = max_segment_size; clt->reconnect_delay_sec = reconnect_delay_sec; clt->max_reconnect_attempts = max_reconnect_attempts; clt->priv = priv; @@ -2636,6 +2638,7 @@ static void free_clt(struct rtrs_clt *clt) * @pdu_sz: Size of extra payload which can be accessed after permit allocation. * @reconnect_delay_sec: time between reconnect tries * @max_segments: Max. number of segments per IO request + * @max_segment_size: Max. size of one segment * @max_reconnect_attempts: Number of times to reconnect on error before giving * up, 0 for * disabled, -1 for forever * @@ -2650,6 +2653,7 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, size_t paths_num, u16 port, size_t pdu_sz, u8 reconnect_delay_sec, u16 max_segments, + size_t max_segment_size, s16 max_reconnect_attempts) { struct rtrs_clt_sess *sess, *tmp; @@ -2658,7 +2662,7 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, clt = alloc_clt(sessname, paths_num, port, pdu_sz, ops->priv, ops->link_ev, - max_segments, reconnect_delay_sec, + max_segments, max_segment_size, reconnect_delay_sec, max_reconnect_attempts); if (IS_ERR(clt)) { err = PTR_ERR(clt); @@ -2668,7 +2672,7 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, struct rtrs_clt_sess *sess; sess = alloc_sess(clt, &paths[i], nr_cpu_ids, - max_segments); + max_segments, max_segment_size); if (IS_ERR(sess)) { err = PTR_ERR(sess); goto close_all_sess; @@ -2917,7 +2921,8 @@ int rtrs_clt_create_path_from_sysfs(struct rtrs_clt *clt, struct rtrs_clt_sess *sess; int err; - sess = alloc_sess(clt, addr, nr_cpu_ids, clt->max_segments); + sess = alloc_sess(clt, addr, nr_cpu_ids, clt->max_segments, + clt->max_segment_size); if (IS_ERR(sess)) return PTR_ERR(sess); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.h b/drivers/infiniband/ulp/rtrs/rtrs-clt.h index 039a2ebba2f9..167acd3c90fc 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.h @@ -164,6 +164,7 @@ struct rtrs_clt { unsigned int max_reconnect_attempts; unsigned int reconnect_delay_sec; unsigned int max_segments; + size_t max_segment_size; void *permits; unsigned long *permits_map; size_t queue_depth; diff --git a/drivers/infiniband/ulp/rtrs/rtrs.h b/drivers/infiniband/ulp/rtrs/rtrs.h index 9879d40467b6..9af750f4d783 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.h +++ b/drivers/infiniband/ulp/rtrs/rtrs.h @@ -58,6 +58,7 @@ struct rtrs_clt *rtrs_clt_open(struct rtrs_clt_ops *ops, size_t path_cnt, u16 port, size_t pdu_sz, u8 reconnect_delay_sec, u16 max_segments, + size_t max_segment_size, s16 max_reconnect_attempts); void rtrs_clt_close(struct rtrs_clt *sess); -- cgit v1.2.3 From bd25c8066fc2e0868228b3cb0563d6c1b65505b2 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 19 May 2020 18:30:18 -0500 Subject: RDMA/siw: Replace one-element array and use struct_size() helper The current codebase makes use of one-element arrays in the following form: struct something { int length; u8 data[1]; }; struct something *instance; instance = kmalloc(sizeof(*instance) + size, GFP_KERNEL); instance->length = size; memcpy(instance->data, source, size); but the preferred mechanism to declare variable-length types such as these ones is a flexible array member[1][2], introduced in C99: struct foo { int stuff; struct boo array[]; }; By making use of the mechanism above, we will get a compiler warning in case the flexible array does not occur last in the structure, which will help us prevent some kind of undefined behavior bugs from being inadvertently introduced[3] to the codebase from now on. So, replace the one-element array with a flexible-array member. Also, make use of the new struct_size() helper to properly calculate the size of struct siw_pbl. This issue was found with the help of Coccinelle and, audited and fixed _manually_. [1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html [2] https://github.com/KSPP/linux/issues/21 [3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour") Link: https://lore.kernel.org/r/20200519233018.GA6105@embeddedor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/siw/siw.h | 2 +- drivers/infiniband/sw/siw/siw_mem.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index af5e9f8c0fcd..5a58a1cc7a7e 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -139,7 +139,7 @@ struct siw_pble { struct siw_pbl { unsigned int num_buf; unsigned int max_buf; - struct siw_pble pbe[1]; + struct siw_pble pbe[]; }; /* diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index e2061dc0b043..87117781d637 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -349,14 +349,11 @@ dma_addr_t siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx) struct siw_pbl *siw_pbl_alloc(u32 num_buf) { struct siw_pbl *pbl; - int buf_size = sizeof(*pbl); if (num_buf == 0) return ERR_PTR(-EINVAL); - buf_size += ((num_buf - 1) * sizeof(struct siw_pble)); - - pbl = kzalloc(buf_size, GFP_KERNEL); + pbl = kzalloc(struct_size(pbl, pbe, num_buf), GFP_KERNEL); if (!pbl) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 349be276509455ac2f19fa4051ed773082c6a27e Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Fri, 8 May 2020 17:45:51 +0800 Subject: RDMA/hns: Bugfix for querying qkey The qkey queried through the query ud qp verb is a fixed value and it should be read from qp context. Fixes: 926a01dc000d ("RDMA/hns: Add QP operations support for hip08 SoC") Link: https://lore.kernel.org/r/1588931159-56875-2-git-send-email-liweihang@huawei.com Signed-off-by: Lijun Ou Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ebe570aa2323..8c24ddfb76fe 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4683,7 +4683,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_attr->path_mig_state = IB_MIG_ARMED; qp_attr->ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; if (hr_qp->ibqp.qp_type == IB_QPT_UD) - qp_attr->qkey = V2_QKEY_VAL; + qp_attr->qkey = le32_to_cpu(context.qkey_xrcd); qp_attr->rq_psn = roce_get_field(context.byte_108_rx_reqepsn, V2_QPC_BYTE_108_RX_REQ_EPSN_M, -- cgit v1.2.3 From 441c88d5b3ff80108ff536c6cf80591187015403 Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Fri, 8 May 2020 17:45:52 +0800 Subject: RDMA/hns: Fix cmdq parameter of querying pf timer resource The firmware has reduced the number of descriptions of command HNS_ROCE_OPC_QUERY_PF_TIMER_RES to 1. The driver needs to adapt, otherwise the hardware will report error 4(CMD_NEXT_ERR). Fixes: 0e40dc2f70cd ("RDMA/hns: Add timer allocation support for hip08") Link: https://lore.kernel.org/r/1588931159-56875-3-git-send-email-liweihang@huawei.com Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 32 +++++++++++------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 8c24ddfb76fe..0e488de22355 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1450,34 +1450,26 @@ static int hns_roce_query_pf_resource(struct hns_roce_dev *hr_dev) static int hns_roce_query_pf_timer_resource(struct hns_roce_dev *hr_dev) { struct hns_roce_pf_timer_res_a *req_a; - struct hns_roce_cmq_desc desc[2]; - int ret, i; + struct hns_roce_cmq_desc desc; + int ret; - for (i = 0; i < 2; i++) { - hns_roce_cmq_setup_basic_desc(&desc[i], - HNS_ROCE_OPC_QUERY_PF_TIMER_RES, - true); + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_QUERY_PF_TIMER_RES, + true); - if (i == 0) - desc[i].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT); - else - desc[i].flag &= ~cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT); - } - - ret = hns_roce_cmq_send(hr_dev, desc, 2); + ret = hns_roce_cmq_send(hr_dev, &desc, 1); if (ret) return ret; - req_a = (struct hns_roce_pf_timer_res_a *)desc[0].data; + req_a = (struct hns_roce_pf_timer_res_a *)desc.data; hr_dev->caps.qpc_timer_bt_num = - roce_get_field(req_a->qpc_timer_bt_idx_num, - PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_M, - PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_S); + roce_get_field(req_a->qpc_timer_bt_idx_num, + PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_M, + PF_RES_DATA_1_PF_QPC_TIMER_BT_NUM_S); hr_dev->caps.cqc_timer_bt_num = - roce_get_field(req_a->cqc_timer_bt_idx_num, - PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_M, - PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_S); + roce_get_field(req_a->cqc_timer_bt_idx_num, + PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_M, + PF_RES_DATA_2_PF_CQC_TIMER_BT_NUM_S); return 0; } -- cgit v1.2.3 From 053c0acf52edf97cae7d53c9f249f7c2eb565ed9 Mon Sep 17 00:00:00 2001 From: Wenpeng Liang Date: Fri, 8 May 2020 17:45:53 +0800 Subject: RDMA/hns: Fix assignment to ba_pg_sz of eqe When allocating eq buffer, the size of base address page should be defined by eqe_ba_pg_sz instead of srqwqe_ba_pg_sz. Fixes: 477a0a387072 ("RDMA/hns: Optimize 0 hop addressing for EQE buffer") Link: https://lore.kernel.org/r/1588931159-56875-4-git-send-email-liweihang@huawei.com Signed-off-by: Wenpeng Liang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 0e488de22355..96a5ff565b2f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5615,7 +5615,7 @@ static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) buf_attr.fixed_page = true; err = hns_roce_mtr_create(hr_dev, &eq->mtr, &buf_attr, - hr_dev->caps.srqwqe_ba_pg_sz + + hr_dev->caps.eqe_ba_pg_sz + PAGE_ADDR_SHIFT, NULL, 0); if (err) dev_err(hr_dev->dev, "Failed to alloc EQE mtr, err %d\n", err); -- cgit v1.2.3 From 6968aeb5aa64a46175f408fa91e49c19e9558428 Mon Sep 17 00:00:00 2001 From: Weihang Li Date: Fri, 8 May 2020 17:45:54 +0800 Subject: RDMA/hns: Fix wrong assignment of SRQ's max_wr srq's attribute max_wr should be 1 less than the total count of wqe. Fixes: ffb1308b88b6 ("RDMA/hns: Move SRQ code to the reasonable place") Link: https://lore.kernel.org/r/1588931159-56875-5-git-send-email-liweihang@huawei.com Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 96a5ff565b2f..155c658ed3eb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5044,8 +5044,8 @@ static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) SRQC_BYTE_8_SRQ_LIMIT_WL_S); attr->srq_limit = limit_wl; - attr->max_wr = srq->wqe_cnt; - attr->max_sge = srq->max_gs; + attr->max_wr = srq->wqe_cnt - 1; + attr->max_sge = srq->max_gs; memcpy(srq_context, mailbox->buf, sizeof(*srq_context)); -- cgit v1.2.3 From d4d813874156063eae6542c66da2a6971592e46f Mon Sep 17 00:00:00 2001 From: Weihang Li Date: Fri, 8 May 2020 17:45:55 +0800 Subject: RDMA/hns: Fix error with to_hr_hem_entries_count() For ilog2(x), if x is 0 and not a constant variable, it will return -1. And there will be an error as below: hns3 0000:7d:00.0 hns_0: Local work queue 0x8 catast error, sub_event type is: 2 So modify to_hr_hem_entries_shift() to return 0 if conut is 0. Fixes: 54d6638765b0 ("RDMA/hns: Optimize WQE buffer size calculating process") Link: https://lore.kernel.org/r/1588931159-56875-6-git-send-email-liweihang@huawei.com Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 4fcd608ee55f..770a6d5517d5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1112,6 +1112,9 @@ static inline u32 to_hr_hem_entries_count(u32 count, u32 buf_shift) static inline u32 to_hr_hem_entries_shift(u32 count, u32 buf_shift) { + if (!count) + return 0; + return ilog2(to_hr_hem_entries_count(count, buf_shift)); } -- cgit v1.2.3 From 7b611d2f6e8b99b699996c52b823454f4a74978f Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Fri, 8 May 2020 17:45:56 +0800 Subject: RDMA/hns: Store mr len information into mr obj The length information should be stored in the struct ib_mr object, otherwise the length value of a valid mr object would always be 0. Link: https://lore.kernel.org/r/1588931159-56875-7-git-send-email-liweihang@huawei.com Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_mr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index ecd76759d47a..f727b1875af8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -285,6 +285,7 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, goto err_alloc_pbl; mr->ibmr.rkey = mr->ibmr.lkey = mr->key; + mr->ibmr.length = length; return &mr->ibmr; @@ -451,6 +452,7 @@ struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, goto err_pbl; mr->ibmr.rkey = mr->ibmr.lkey = mr->key; + mr->ibmr.length = length; return &mr->ibmr; -- cgit v1.2.3 From 252067e95035151372f21c0c8626bf6fed9c5f0b Mon Sep 17 00:00:00 2001 From: Weihang Li Date: Fri, 8 May 2020 17:45:57 +0800 Subject: RDMA/hns: Remove redundant memcpy() srq_context is a local variables and is only used to get some fields from buffer of mailbox. It's meaningless to copy mailbox's buffer's contents back to it. Link: https://lore.kernel.org/r/1588931159-56875-8-git-send-email-liweihang@huawei.com Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 155c658ed3eb..34c183181908 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5047,8 +5047,6 @@ static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) attr->max_wr = srq->wqe_cnt - 1; attr->max_sge = srq->max_gs; - memcpy(srq_context, mailbox->buf, sizeof(*srq_context)); - out: hns_roce_free_cmd_mailbox(hr_dev, mailbox); return ret; -- cgit v1.2.3 From 9581a356ccadb24d4a18c62c5c8327997e47241e Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Fri, 8 May 2020 17:45:58 +0800 Subject: RDMA/hns: Rename macro for defining hns hardware page size Rename the PAGE_ADDR_SHIFT as HNS_HW_PAGE_SHIFT to make code more readable. Link: https://lore.kernel.org/r/1588931159-56875-9-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_alloc.c | 6 +++--- drivers/infiniband/hw/hns/hns_roce_cq.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_device.h | 10 ++++++---- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_mr.c | 6 +++--- drivers/infiniband/hw/hns/hns_roce_qp.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_srq.c | 8 ++++---- 7 files changed, 22 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index 365e7db6c498..742aee846676 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -189,8 +189,8 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, u32 page_size; int i; - /* The minimum shift of the page accessed by hw is PAGE_ADDR_SHIFT */ - buf->page_shift = max_t(int, PAGE_ADDR_SHIFT, page_shift); + /* The minimum shift of the page accessed by hw is HNS_HW_PAGE_SHIFT */ + buf->page_shift = max_t(int, HNS_HW_PAGE_SHIFT, page_shift); page_size = 1 << buf->page_shift; buf->npages = DIV_ROUND_UP(size, page_size); @@ -261,7 +261,7 @@ int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, int idx = 0; u64 addr; - if (page_shift < PAGE_ADDR_SHIFT) { + if (page_shift < HNS_HW_PAGE_SHIFT) { dev_err(hr_dev->dev, "Failed to check umem page shift %d!\n", page_shift); return -EINVAL; diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index d2d7074bbe69..6dd8deaffec8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -149,14 +149,14 @@ static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, struct hns_roce_buf_attr buf_attr = {}; int err; - buf_attr.page_shift = hr_dev->caps.cqe_buf_pg_sz + PAGE_ADDR_SHIFT; + buf_attr.page_shift = hr_dev->caps.cqe_buf_pg_sz + HNS_HW_PAGE_SHIFT; buf_attr.region[0].size = hr_cq->cq_depth * hr_dev->caps.cq_entry_sz; buf_attr.region[0].hopnum = hr_dev->caps.cqe_hop_num; buf_attr.region_count = 1; buf_attr.fixed_page = true; err = hns_roce_mtr_create(hr_dev, &hr_cq->mtr, &buf_attr, - hr_dev->caps.cqe_ba_pg_sz + PAGE_ADDR_SHIFT, + hr_dev->caps.cqe_ba_pg_sz + HNS_HW_PAGE_SHIFT, udata, addr); if (err) ibdev_err(ibdev, "Failed to alloc CQ mtr, err %d\n", err); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 770a6d5517d5..1befdbe00b89 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -262,7 +262,9 @@ enum { #define HNS_ROCE_PORT_DOWN 0 #define HNS_ROCE_PORT_UP 1 -#define PAGE_ADDR_SHIFT 12 +/* The minimum page size is 4K for hardware */ +#define HNS_HW_PAGE_SHIFT 12 +#define HNS_HW_PAGE_SIZE (1 << HNS_HW_PAGE_SHIFT) /* The minimum page count for hardware access page directly. */ #define HNS_HW_DIRECT_PAGE_COUNT 2 @@ -1080,16 +1082,16 @@ static inline dma_addr_t hns_roce_buf_page(struct hns_roce_buf *buf, int idx) return buf->page_list[idx].map; } -#define hr_hw_page_align(x) ALIGN(x, 1 << PAGE_ADDR_SHIFT) +#define hr_hw_page_align(x) ALIGN(x, 1 << HNS_HW_PAGE_SHIFT) static inline u64 to_hr_hw_page_addr(u64 addr) { - return addr >> PAGE_ADDR_SHIFT; + return addr >> HNS_HW_PAGE_SHIFT; } static inline u32 to_hr_hw_page_shift(u32 page_shift) { - return page_shift - PAGE_ADDR_SHIFT; + return page_shift - HNS_HW_PAGE_SHIFT; } static inline u32 to_hr_hem_hopnum(u32 hopnum, u32 count) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 34c183181908..d31ecaa7010c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5606,7 +5606,7 @@ static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) else eq->hop_num = hr_dev->caps.eqe_hop_num; - buf_attr.page_shift = hr_dev->caps.eqe_buf_pg_sz + PAGE_ADDR_SHIFT; + buf_attr.page_shift = hr_dev->caps.eqe_buf_pg_sz + HNS_HW_PAGE_SHIFT; buf_attr.region[0].size = eq->entries * eq->eqe_size; buf_attr.region[0].hopnum = eq->hop_num; buf_attr.region_count = 1; @@ -5614,7 +5614,7 @@ static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) err = hns_roce_mtr_create(hr_dev, &eq->mtr, &buf_attr, hr_dev->caps.eqe_ba_pg_sz + - PAGE_ADDR_SHIFT, NULL, 0); + HNS_HW_PAGE_SHIFT, NULL, 0); if (err) dev_err(hr_dev->dev, "Failed to alloc EQE mtr, err %d\n", err); diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index f727b1875af8..3075e8450cda 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -120,7 +120,7 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, mr->pbl_hop_num = is_fast ? 1 : hr_dev->caps.pbl_hop_num; buf_attr.page_shift = is_fast ? PAGE_SHIFT : - hr_dev->caps.pbl_buf_pg_sz + PAGE_ADDR_SHIFT; + hr_dev->caps.pbl_buf_pg_sz + HNS_HW_PAGE_SHIFT; buf_attr.region[0].size = length; buf_attr.region[0].hopnum = mr->pbl_hop_num; buf_attr.region_count = 1; @@ -130,7 +130,7 @@ static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, buf_attr.mtt_only = is_fast; err = hns_roce_mtr_create(hr_dev, &mr->pbl_mtr, &buf_attr, - hr_dev->caps.pbl_ba_pg_sz + PAGE_ADDR_SHIFT, + hr_dev->caps.pbl_ba_pg_sz + HNS_HW_PAGE_SHIFT, udata, start); if (err) ibdev_err(ibdev, "failed to alloc pbl mtr, ret = %d.\n", err); @@ -819,7 +819,7 @@ static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, } /* must bigger than minimum hardware page shift */ - if (best_pg_shift < PAGE_ADDR_SHIFT || all_pg_count < 1) { + if (best_pg_shift < HNS_HW_PAGE_SHIFT || all_pg_count < 1) { ret = -EINVAL; ibdev_err(ibdev, "Failed to check mtr page shift %d count %d\n", best_pg_shift, all_pg_count); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index dca979d8c345..e6ecdce4d63e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -546,7 +546,7 @@ static int split_wqe_buf_region(struct hns_roce_dev *hr_dev, if (hr_qp->buff_size < 1) return -EINVAL; - buf_attr->page_shift = PAGE_ADDR_SHIFT + hr_dev->caps.mtt_buf_pg_sz; + buf_attr->page_shift = HNS_HW_PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz; buf_attr->fixed_page = true; buf_attr->region_count = idx; @@ -681,7 +681,7 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, goto err_inline; } ret = hns_roce_mtr_create(hr_dev, &hr_qp->mtr, &buf_attr, - PAGE_ADDR_SHIFT + hr_dev->caps.mtt_ba_pg_sz, + HNS_HW_PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz, udata, addr); if (ret) { ibdev_err(ibdev, "failed to create WQE mtr, ret = %d.\n", ret); diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 6e5a2adc2ab2..03b76e69a185 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -187,7 +187,7 @@ static int alloc_srq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, HNS_ROCE_SGE_SIZE * srq->max_gs))); - buf_attr.page_shift = hr_dev->caps.srqwqe_buf_pg_sz + PAGE_ADDR_SHIFT; + buf_attr.page_shift = hr_dev->caps.srqwqe_buf_pg_sz + HNS_HW_PAGE_SHIFT; buf_attr.region[0].size = to_hr_hem_entries_size(srq->wqe_cnt, srq->wqe_shift); buf_attr.region[0].hopnum = hr_dev->caps.srqwqe_hop_num; @@ -196,7 +196,7 @@ static int alloc_srq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, err = hns_roce_mtr_create(hr_dev, &srq->buf_mtr, &buf_attr, hr_dev->caps.srqwqe_ba_pg_sz + - PAGE_ADDR_SHIFT, udata, addr); + HNS_HW_PAGE_SHIFT, udata, addr); if (err) ibdev_err(ibdev, "Failed to alloc SRQ buf mtr, err %d\n", err); @@ -218,7 +218,7 @@ static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, srq->idx_que.entry_shift = ilog2(HNS_ROCE_IDX_QUE_ENTRY_SZ); - buf_attr.page_shift = hr_dev->caps.idx_buf_pg_sz + PAGE_ADDR_SHIFT; + buf_attr.page_shift = hr_dev->caps.idx_buf_pg_sz + HNS_HW_PAGE_SHIFT; buf_attr.region[0].size = to_hr_hem_entries_size(srq->wqe_cnt, srq->idx_que.entry_shift); buf_attr.region[0].hopnum = hr_dev->caps.idx_hop_num; @@ -226,7 +226,7 @@ static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq, buf_attr.fixed_page = true; err = hns_roce_mtr_create(hr_dev, &idx_que->mtr, &buf_attr, - hr_dev->caps.idx_ba_pg_sz + PAGE_ADDR_SHIFT, + hr_dev->caps.idx_ba_pg_sz + HNS_HW_PAGE_SHIFT, udata, addr); if (err) { ibdev_err(ibdev, "Failed to alloc SRQ idx mtr, err %d\n", err); -- cgit v1.2.3 From 711195e57d341e58133d92cf8aaab1db24e4768d Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Fri, 8 May 2020 17:45:59 +0800 Subject: RDMA/hns: Reserve one sge in order to avoid local length error When rq/srq sge length is smaller than sq sge length, it will produce a local length error and may cause the bus to hang. Therefore, for rq wqe and srq wqe, one reserved sge pointing to a reserved mr is used to avoid this error. Link: https://lore.kernel.org/r/1588931159-56875-10-git-send-email-liweihang@huawei.com Signed-off-by: Lijun Ou Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 2 ++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 9 +++++---- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 4 +++- drivers/infiniband/hw/hns/hns_roce_qp.c | 5 +++-- drivers/infiniband/hw/hns/hns_roce_srq.c | 2 +- 5 files changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 1befdbe00b89..bd6e295f4669 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -66,6 +66,8 @@ #define HNS_ROCE_CQE_WCMD_EMPTY_BIT 0x2 #define HNS_ROCE_MIN_CQE_CNT 16 +#define HNS_ROCE_RESERVED_SGE 1 + #define HNS_ROCE_MAX_IRQ_NUM 128 #define HNS_ROCE_SGE_IN_WQE 2 diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index d31ecaa7010c..d2c58d395962 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -629,7 +629,7 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, wqe_idx = (hr_qp->rq.head + nreq) & (hr_qp->rq.wqe_cnt - 1); - if (unlikely(wr->num_sge > hr_qp->rq.max_gs)) { + if (unlikely(wr->num_sge >= hr_qp->rq.max_gs)) { ibdev_err(ibdev, "rq:num_sge=%d >= qp->sq.max_gs=%d\n", wr->num_sge, hr_qp->rq.max_gs); ret = -EINVAL; @@ -649,6 +649,7 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, if (i < hr_qp->rq.max_gs) { dseg->lkey = cpu_to_le32(HNS_ROCE_INVALID_LKEY); dseg->addr = 0; + dseg->len = cpu_to_le32(HNS_ROCE_INVALID_SGE_LENGTH); } /* rq support inline data */ @@ -782,8 +783,8 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, } if (i < srq->max_gs) { - dseg[i].len = 0; - dseg[i].lkey = cpu_to_le32(0x100); + dseg[i].len = cpu_to_le32(HNS_ROCE_INVALID_SGE_LENGTH); + dseg[i].lkey = cpu_to_le32(HNS_ROCE_INVALID_LKEY); dseg[i].addr = 0; } @@ -5045,7 +5046,7 @@ static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) attr->srq_limit = limit_wl; attr->max_wr = srq->wqe_cnt - 1; - attr->max_sge = srq->max_gs; + attr->max_sge = srq->max_gs - HNS_ROCE_RESERVED_SGE; out: hns_roce_free_cmd_mailbox(hr_dev, mailbox); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 938b7b522faf..532dcf6a05ff 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -92,7 +92,9 @@ #define HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ PAGE_SIZE #define HNS_ROCE_V2_PAGE_SIZE_SUPPORTED 0xFFFFF000 #define HNS_ROCE_V2_MAX_INNER_MTPT_NUM 2 -#define HNS_ROCE_INVALID_LKEY 0x100 +#define HNS_ROCE_INVALID_LKEY 0x0 +#define HNS_ROCE_INVALID_SGE_LENGTH 0x80000000 + #define HNS_ROCE_CMQ_TX_TIMEOUT 30000 #define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE 2 #define HNS_ROCE_V2_RSV_QPS 8 diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index e6ecdce4d63e..fb71755f6179 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -386,7 +386,8 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, return -EINVAL; } - hr_qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); + hr_qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge) + + HNS_ROCE_RESERVED_SGE); if (hr_dev->caps.max_rq_sg <= HNS_ROCE_SGE_IN_WQE) hr_qp->rq.wqe_shift = ilog2(hr_dev->caps.max_rq_desc_sz); @@ -401,7 +402,7 @@ static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap, hr_qp->rq_inl_buf.wqe_cnt = 0; cap->max_recv_wr = cnt; - cap->max_recv_sge = hr_qp->rq.max_gs; + cap->max_recv_sge = hr_qp->rq.max_gs - HNS_ROCE_RESERVED_SGE; return 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 03b76e69a185..3018c981f1d1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -297,7 +297,7 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, spin_lock_init(&srq->lock); srq->wqe_cnt = roundup_pow_of_two(init_attr->attr.max_wr + 1); - srq->max_gs = init_attr->attr.max_sge; + srq->max_gs = init_attr->attr.max_sge + HNS_ROCE_RESERVED_SGE; if (udata) { ret = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); -- cgit v1.2.3 From 819f7427bafd494ef7ca4942ec6322db20722d7b Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Wed, 13 May 2020 12:55:50 +0300 Subject: RDMA/mlx5: Add init2init as a modify command Missing INIT2INIT entry in the list of modify commands caused DEVX applications to be unable to modify_qp for this transition state. Add the MLX5_CMD_OP_INIT2INIT_QP opcode to the list of allowed DEVX opcodes. Fixes: e662e14d801b ("IB/mlx5: Add DEVX support for modify and query commands") Link: https://lore.kernel.org/r/20200513095550.211345-1-leon@kernel.org Signed-off-by: Aharon Landau Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 1d7feed6d3cb..c339dd5ee694 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -820,6 +820,7 @@ static bool devx_is_obj_modify_cmd(const void *in) case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: case MLX5_CMD_OP_RST2INIT_QP: case MLX5_CMD_OP_INIT2RTR_QP: + case MLX5_CMD_OP_INIT2INIT_QP: case MLX5_CMD_OP_RTR2RTS_QP: case MLX5_CMD_OP_RTS2RTS_QP: case MLX5_CMD_OP_SQERR2RTS_QP: -- cgit v1.2.3 From cc8a635e24acf2793605f243c913c51b8c3702ab Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 12 May 2020 18:22:03 +0300 Subject: RDMA/efa: Fix setting of wrong bit in get/set_feature commands When using a control buffer the ctrl_data bit should be set in order to indicate the control buffer address is valid, not ctrl_data_indirect which is used when the control buffer itself is indirect. Fixes: e9c6c5373088 ("RDMA/efa: Add common command handlers") Link: https://lore.kernel.org/r/20200512152204.93091-2-galpress@amazon.com Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_com_cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index eea5574a62e8..69f842c92ff6 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -388,7 +388,7 @@ static int efa_com_get_feature_ex(struct efa_com_dev *edev, if (control_buff_size) EFA_SET(&get_cmd.aq_common_descriptor.flags, - EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT, 1); + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); efa_com_set_dma_addr(control_buf_dma_addr, &get_cmd.control_buffer.address.mem_addr_high, @@ -540,7 +540,7 @@ static int efa_com_set_feature_ex(struct efa_com_dev *edev, if (control_buff_size) { set_cmd->aq_common_descriptor.flags = 0; EFA_SET(&set_cmd->aq_common_descriptor.flags, - EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT, 1); + EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1); efa_com_set_dma_addr(control_buf_dma_addr, &set_cmd->control_buffer.address.mem_addr_high, &set_cmd->control_buffer.address.mem_addr_low); -- cgit v1.2.3 From e1ca01a902fedebd72978f606f4be767ea4a26ea Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Tue, 12 May 2020 18:22:04 +0300 Subject: RDMA/efa: Report host information to the device The host info feature allows the driver to infrom the EFA device firmware with system configuration for debugging and troubleshooting purposes. The host info buffer is passed as an admin command DMA mapped control buffer, and is unmapped and freed once the command CQE is consumed. Currently, the setting of host info is done for each device on its probe. Failing to set the host info for the device shall not disturb the probe flow, any errors will be discarded. Link: https://lore.kernel.org/r/20200512152204.93091-3-galpress@amazon.com Reviewed-by: Firas JahJah Reviewed-by: Guy Tzalik Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/efa/efa_admin_cmds_defs.h | 63 ++++++++++++++++++++++++- drivers/infiniband/hw/efa/efa_com_cmd.c | 14 +++--- drivers/infiniband/hw/efa/efa_com_cmd.h | 11 ++++- drivers/infiniband/hw/efa/efa_main.c | 52 +++++++++++++++++++- 4 files changed, 130 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index 96b104ab5415..bef2bd291054 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -37,7 +37,7 @@ enum efa_admin_aq_feature_id { EFA_ADMIN_NETWORK_ATTR = 3, EFA_ADMIN_QUEUE_ATTR = 4, EFA_ADMIN_HW_HINTS = 5, - EFA_ADMIN_FEATURES_OPCODE_NUM = 8, + EFA_ADMIN_HOST_INFO = 6, }; /* QP transport type */ @@ -799,6 +799,54 @@ struct efa_admin_mmio_req_read_less_resp { u32 reg_val; }; +enum efa_admin_os_type { + EFA_ADMIN_OS_LINUX = 0, +}; + +struct efa_admin_host_info { + /* OS distribution string format */ + u8 os_dist_str[128]; + + /* Defined in enum efa_admin_os_type */ + u32 os_type; + + /* Kernel version string format */ + u8 kernel_ver_str[32]; + + /* Kernel version numeric format */ + u32 kernel_ver; + + /* + * 7:0 : driver_module_type + * 15:8 : driver_sub_minor + * 23:16 : driver_minor + * 31:24 : driver_major + */ + u32 driver_ver; + + /* + * Device's Bus, Device and Function + * 2:0 : function + * 7:3 : device + * 15:8 : bus + */ + u16 bdf; + + /* + * Spec version + * 7:0 : spec_minor + * 15:8 : spec_major + */ + u16 spec_ver; + + /* + * 0 : intree - Intree driver + * 1 : gdr - GPUDirect RDMA supported + * 31:2 : reserved2 + */ + u32 flags; +}; + /* create_qp_cmd */ #define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK BIT(0) #define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK BIT(1) @@ -820,4 +868,17 @@ struct efa_admin_mmio_req_read_less_resp { /* feature_device_attr_desc */ #define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK BIT(0) +/* host_info */ +#define EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE_MASK GENMASK(7, 0) +#define EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_DRIVER_MINOR_MASK GENMASK(23, 16) +#define EFA_ADMIN_HOST_INFO_DRIVER_MAJOR_MASK GENMASK(31, 24) +#define EFA_ADMIN_HOST_INFO_FUNCTION_MASK GENMASK(2, 0) +#define EFA_ADMIN_HOST_INFO_DEVICE_MASK GENMASK(7, 3) +#define EFA_ADMIN_HOST_INFO_BUS_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_SPEC_MINOR_MASK GENMASK(7, 0) +#define EFA_ADMIN_HOST_INFO_SPEC_MAJOR_MASK GENMASK(15, 8) +#define EFA_ADMIN_HOST_INFO_INTREE_MASK BIT(0) +#define EFA_ADMIN_HOST_INFO_GDR_MASK BIT(1) + #endif /* _EFA_ADMIN_CMDS_H_ */ diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index 69f842c92ff6..fabd8df2e78f 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -351,7 +351,7 @@ int efa_com_destroy_ah(struct efa_com_dev *edev, return 0; } -static bool +bool efa_com_check_supported_feature_id(struct efa_com_dev *edev, enum efa_admin_aq_feature_id feature_id) { @@ -517,12 +517,12 @@ int efa_com_get_hw_hints(struct efa_com_dev *edev, return 0; } -static int efa_com_set_feature_ex(struct efa_com_dev *edev, - struct efa_admin_set_feature_resp *set_resp, - struct efa_admin_set_feature_cmd *set_cmd, - enum efa_admin_aq_feature_id feature_id, - dma_addr_t control_buf_dma_addr, - u32 control_buff_size) +int efa_com_set_feature_ex(struct efa_com_dev *edev, + struct efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size) { struct efa_com_admin_queue *aq; int err; diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 31db5a0cbd5b..41ce4a476ee6 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_COM_CMD_H_ @@ -270,6 +270,15 @@ int efa_com_get_device_attr(struct efa_com_dev *edev, struct efa_com_get_device_attr_result *result); int efa_com_get_hw_hints(struct efa_com_dev *edev, struct efa_com_get_hw_hints_result *result); +bool +efa_com_check_supported_feature_id(struct efa_com_dev *edev, + enum efa_admin_aq_feature_id feature_id); +int efa_com_set_feature_ex(struct efa_com_dev *edev, + struct efa_admin_set_feature_resp *set_resp, + struct efa_admin_set_feature_cmd *set_cmd, + enum efa_admin_aq_feature_id feature_id, + dma_addr_t control_buf_dma_addr, + u32 control_buff_size); int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups); int efa_com_alloc_pd(struct efa_com_dev *edev, struct efa_com_alloc_pd_result *result); diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index faf3ff1bca2a..82145574c928 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -1,10 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include #include +#include +#include #include @@ -187,6 +189,52 @@ static void efa_stats_init(struct efa_dev *dev) atomic64_set(s, 0); } +static void efa_set_host_info(struct efa_dev *dev) +{ + struct efa_admin_set_feature_resp resp = {}; + struct efa_admin_set_feature_cmd cmd = {}; + struct efa_admin_host_info *hinf; + u32 bufsz = sizeof(*hinf); + dma_addr_t hinf_dma; + + if (!efa_com_check_supported_feature_id(&dev->edev, + EFA_ADMIN_HOST_INFO)) + return; + + /* Failures in host info set shall not disturb probe */ + hinf = dma_alloc_coherent(&dev->pdev->dev, bufsz, &hinf_dma, + GFP_KERNEL); + if (!hinf) + return; + + strlcpy(hinf->os_dist_str, utsname()->release, + min(sizeof(hinf->os_dist_str), sizeof(utsname()->release))); + hinf->os_type = EFA_ADMIN_OS_LINUX; + strlcpy(hinf->kernel_ver_str, utsname()->version, + min(sizeof(hinf->kernel_ver_str), sizeof(utsname()->version))); + hinf->kernel_ver = LINUX_VERSION_CODE; + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MAJOR, 0); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MINOR, 0); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR, 0); + EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE, 0); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_BUS, dev->pdev->bus->number); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_DEVICE, + PCI_SLOT(dev->pdev->devfn)); + EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_FUNCTION, + PCI_FUNC(dev->pdev->devfn)); + EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MAJOR, + EFA_COMMON_SPEC_VERSION_MAJOR); + EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MINOR, + EFA_COMMON_SPEC_VERSION_MINOR); + EFA_SET(&hinf->flags, EFA_ADMIN_HOST_INFO_INTREE, 1); + EFA_SET(&hinf->flags, EFA_ADMIN_HOST_INFO_GDR, 0); + + efa_com_set_feature_ex(&dev->edev, &resp, &cmd, EFA_ADMIN_HOST_INFO, + hinf_dma, bufsz); + + dma_free_coherent(&dev->pdev->dev, bufsz, hinf, hinf_dma); +} + static const struct ib_device_ops efa_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_EFA, @@ -251,6 +299,8 @@ static int efa_ib_device_add(struct efa_dev *dev) if (err) goto err_release_doorbell_bar; + efa_set_host_info(dev); + dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; dev->ibdev.phys_port_cnt = 1; dev->ibdev.num_comp_vectors = 1; -- cgit v1.2.3 From fe810b509c5f62b5b3d5681ea6f5d36349ced979 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 11 May 2020 12:05:41 -0400 Subject: IB/hfi1: Add accelerated IP capability bit The accelerated IP capability bit is added to allow users to control which feature is enabled and disabled. Link: https://lore.kernel.org/r/20200511160541.173205.96870.stgit@awfm-01.aw.intel.com Reviewed-by: Dennis Dalessandro Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/common.h | 5 +++-- include/uapi/rdma/hfi/hfi1_user.h | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h index 40a1ff0c8a8e..1f7107e35a43 100644 --- a/drivers/infiniband/hw/hfi1/common.h +++ b/drivers/infiniband/hw/hfi1/common.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -149,7 +149,8 @@ HFI1_CAP_NO_INTEGRITY | \ HFI1_CAP_PKEY_CHECK | \ HFI1_CAP_TID_RDMA | \ - HFI1_CAP_OPFN) << \ + HFI1_CAP_OPFN | \ + HFI1_CAP_AIP) << \ HFI1_CAP_USER_SHIFT) /* * Set of capabilities that need to be enabled for kernel context in diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index 01ac5853d9ac..d95ef9a2b032 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -6,7 +6,7 @@ * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -109,6 +109,7 @@ #define HFI1_CAP_OPFN (1UL << 16) /* Enable the OPFN protocol */ #define HFI1_CAP_SDMA_HEAD_CHECK (1UL << 17) /* SDMA head checking */ #define HFI1_CAP_EARLY_CREDIT_RETURN (1UL << 18) /* early credit return */ +#define HFI1_CAP_AIP (1UL << 19) /* Enable accelerated IP */ #define HFI1_RCVHDR_ENTSIZE_2 (1UL << 0) #define HFI1_RCVHDR_ENTSIZE_16 (1UL << 1) -- cgit v1.2.3 From d99dc602e2a55a99940ba9506a7126dfa54d54ea Mon Sep 17 00:00:00 2001 From: Gary Leshner Date: Mon, 11 May 2020 12:05:48 -0400 Subject: IB/hfi1: Add functions to transmit datagram ipoib packets This patch implements the mechanism to accelerate the transmit side of a multiple transmit queue RDMA netdev by submitting the packets to the SDMA engine directly instead of sending through the verbs layer. This patch also changes the UD/SEND_ONLY op to output the entropy value in byte 0 of deth[1]. UD/SEND_ONLY_WITH_IMMEDIATE uses the previous behavior with no entropy value being output. The code in the ipoib rdma netdev which submits tx requests upon successful submission will call trace_sdma_output_ibhdr to output the ibhdr to the trace buffer. Link: https://lore.kernel.org/r/20200511160548.173205.45616.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Gary Leshner Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/Makefile | 1 + drivers/infiniband/hw/hfi1/ipoib.h | 145 ++++++ drivers/infiniband/hw/hfi1/ipoib_tx.c | 828 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/trace.c | 10 +- include/rdma/ib_verbs.h | 1 + 5 files changed, 984 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/hfi1/ipoib.h create mode 100644 drivers/infiniband/hw/hfi1/ipoib_tx.c diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 0405d26d0833..09ef0b8b8ac7 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -22,6 +22,7 @@ hfi1-y := \ init.o \ intr.o \ iowait.o \ + ipoib_tx.o \ mad.o \ mmu_rb.o \ msix.o \ diff --git a/drivers/infiniband/hw/hfi1/ipoib.h b/drivers/infiniband/hw/hfi1/ipoib.h new file mode 100644 index 000000000000..2b541abde266 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ipoib.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +/* + * This file contains HFI1 support for IPOIB functionality + */ + +#ifndef HFI1_IPOIB_H +#define HFI1_IPOIB_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hfi.h" +#include "iowait.h" + +#include + +#define HFI1_IPOIB_ENTROPY_SHIFT 24 + +#define HFI1_IPOIB_TXREQ_NAME_LEN 32 + +#define HFI1_IPOIB_ENCAP_LEN 4 + +struct hfi1_ipoib_dev_priv; + +union hfi1_ipoib_flow { + u16 as_int; + struct { + u8 tx_queue; + u8 sc5; + } __attribute__((__packed__)); +}; + +/** + * struct hfi1_ipoib_circ_buf - List of items to be processed + * @items: ring of items + * @head: ring head + * @tail: ring tail + * @max_items: max items + 1 that the ring can contain + * @producer_lock: producer sync lock + * @consumer_lock: consumer sync lock + */ +struct hfi1_ipoib_circ_buf { + void **items; + unsigned long head; + unsigned long tail; + unsigned long max_items; + spinlock_t producer_lock; /* head sync lock */ + spinlock_t consumer_lock; /* tail sync lock */ +}; + +/** + * struct hfi1_ipoib_txq - IPOIB per Tx queue information + * @priv: private pointer + * @sde: sdma engine + * @tx_list: tx request list + * @sent_txreqs: count of txreqs posted to sdma + * @flow: tracks when list needs to be flushed for a flow change + * @q_idx: ipoib Tx queue index + * @pkts_sent: indicator packets have been sent from this queue + * @wait: iowait structure + * @complete_txreqs: count of txreqs completed by sdma + * @napi: pointer to tx napi interface + * @tx_ring: ring of ipoib txreqs to be reaped by napi callback + */ +struct hfi1_ipoib_txq { + struct hfi1_ipoib_dev_priv *priv; + struct sdma_engine *sde; + struct list_head tx_list; + u64 sent_txreqs; + union hfi1_ipoib_flow flow; + u8 q_idx; + bool pkts_sent; + struct iowait wait; + + atomic64_t ____cacheline_aligned_in_smp complete_txreqs; + struct napi_struct *napi; + struct hfi1_ipoib_circ_buf tx_ring; +}; + +struct hfi1_ipoib_dev_priv { + struct hfi1_devdata *dd; + struct net_device *netdev; + struct ib_device *device; + struct hfi1_ipoib_txq *txqs; + struct kmem_cache *txreq_cache; + struct napi_struct *tx_napis; + u16 pkey; + u16 pkey_index; + u32 qkey; + u8 port_num; + + const struct net_device_ops *netdev_ops; + struct rvt_qp *qp; + struct pcpu_sw_netstats __percpu *netstats; +}; + +/* hfi1 ipoib rdma netdev's private data structure */ +struct hfi1_ipoib_rdma_netdev { + struct rdma_netdev rn; /* keep this first */ + /* followed by device private data */ + struct hfi1_ipoib_dev_priv dev_priv; +}; + +static inline struct hfi1_ipoib_dev_priv * +hfi1_ipoib_priv(const struct net_device *dev) +{ + return &((struct hfi1_ipoib_rdma_netdev *)netdev_priv(dev))->dev_priv; +} + +static inline void +hfi1_ipoib_update_tx_netstats(struct hfi1_ipoib_dev_priv *priv, + u64 packets, + u64 bytes) +{ + struct pcpu_sw_netstats *netstats = this_cpu_ptr(priv->netstats); + + u64_stats_update_begin(&netstats->syncp); + netstats->tx_packets += packets; + netstats->tx_bytes += bytes; + u64_stats_update_end(&netstats->syncp); +} + +int hfi1_ipoib_send_dma(struct net_device *dev, + struct sk_buff *skb, + struct ib_ah *address, + u32 dqpn); + +int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv); +void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv); + +void hfi1_ipoib_napi_tx_enable(struct net_device *dev); +void hfi1_ipoib_napi_tx_disable(struct net_device *dev); + +#endif /* _IPOIB_H */ diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c new file mode 100644 index 000000000000..883cb9d48022 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -0,0 +1,828 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +/* + * This file contains HFI1 support for IPOIB SDMA functionality + */ + +#include +#include + +#include "sdma.h" +#include "verbs.h" +#include "trace_ibhdrs.h" +#include "ipoib.h" + +/* Add a convenience helper */ +#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1)) +#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size) +#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size) + +/** + * struct ipoib_txreq - IPOIB transmit descriptor + * @txreq: sdma transmit request + * @sdma_hdr: 9b ib headers + * @sdma_status: status returned by sdma engine + * @priv: ipoib netdev private data + * @txq: txq on which skb was output + * @skb: skb to send + */ +struct ipoib_txreq { + struct sdma_txreq txreq; + struct hfi1_sdma_header sdma_hdr; + int sdma_status; + struct hfi1_ipoib_dev_priv *priv; + struct hfi1_ipoib_txq *txq; + struct sk_buff *skb; +}; + +struct ipoib_txparms { + struct hfi1_devdata *dd; + struct rdma_ah_attr *ah_attr; + struct hfi1_ibport *ibp; + struct hfi1_ipoib_txq *txq; + union hfi1_ipoib_flow flow; + u32 dqpn; + u8 hdr_dwords; + u8 entropy; +}; + +static u64 hfi1_ipoib_txreqs(const u64 sent, const u64 completed) +{ + return sent - completed; +} + +static void hfi1_ipoib_check_queue_depth(struct hfi1_ipoib_txq *txq) +{ + if (unlikely(hfi1_ipoib_txreqs(++txq->sent_txreqs, + atomic64_read(&txq->complete_txreqs)) >= + min_t(unsigned int, txq->priv->netdev->tx_queue_len, + txq->tx_ring.max_items - 1))) + netif_stop_subqueue(txq->priv->netdev, txq->q_idx); +} + +static void hfi1_ipoib_check_queue_stopped(struct hfi1_ipoib_txq *txq) +{ + struct net_device *dev = txq->priv->netdev; + + /* If the queue is already running just return */ + if (likely(!__netif_subqueue_stopped(dev, txq->q_idx))) + return; + + /* If shutting down just return as queue state is irrelevant */ + if (unlikely(dev->reg_state != NETREG_REGISTERED)) + return; + + /* + * When the queue has been drained to less than half full it will be + * restarted. + * The size of the txreq ring is fixed at initialization. + * The tx queue len can be adjusted upward while the interface is + * running. + * The tx queue len can be large enough to overflow the txreq_ring. + * Use the minimum of the current tx_queue_len or the rings max txreqs + * to protect against ring overflow. + */ + if (hfi1_ipoib_txreqs(txq->sent_txreqs, + atomic64_read(&txq->complete_txreqs)) + < min_t(unsigned int, dev->tx_queue_len, + txq->tx_ring.max_items) >> 1) + netif_wake_subqueue(dev, txq->q_idx); +} + +static void hfi1_ipoib_free_tx(struct ipoib_txreq *tx, int budget) +{ + struct hfi1_ipoib_dev_priv *priv = tx->priv; + + if (likely(!tx->sdma_status)) { + hfi1_ipoib_update_tx_netstats(priv, 1, tx->skb->len); + } else { + ++priv->netdev->stats.tx_errors; + dd_dev_warn(priv->dd, + "%s: Status = 0x%x pbc 0x%llx txq = %d sde = %d\n", + __func__, tx->sdma_status, + le64_to_cpu(tx->sdma_hdr.pbc), tx->txq->q_idx, + tx->txq->sde->this_idx); + } + + napi_consume_skb(tx->skb, budget); + sdma_txclean(priv->dd, &tx->txreq); + kmem_cache_free(priv->txreq_cache, tx); +} + +static int hfi1_ipoib_drain_tx_ring(struct hfi1_ipoib_txq *txq, int budget) +{ + struct hfi1_ipoib_circ_buf *tx_ring = &txq->tx_ring; + unsigned long head; + unsigned long tail; + unsigned int max_tx; + int work_done; + int tx_count; + + spin_lock_bh(&tx_ring->consumer_lock); + + /* Read index before reading contents at that index. */ + head = smp_load_acquire(&tx_ring->head); + tail = tx_ring->tail; + max_tx = tx_ring->max_items; + + work_done = min_t(int, CIRC_CNT(head, tail, max_tx), budget); + + for (tx_count = work_done; tx_count; tx_count--) { + hfi1_ipoib_free_tx(tx_ring->items[tail], budget); + tail = CIRC_NEXT(tail, max_tx); + } + + atomic64_add(work_done, &txq->complete_txreqs); + + /* Finished freeing tx items so store the tail value. */ + smp_store_release(&tx_ring->tail, tail); + + spin_unlock_bh(&tx_ring->consumer_lock); + + hfi1_ipoib_check_queue_stopped(txq); + + return work_done; +} + +static int hfi1_ipoib_process_tx_ring(struct napi_struct *napi, int budget) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(napi->dev); + struct hfi1_ipoib_txq *txq = &priv->txqs[napi - priv->tx_napis]; + + int work_done = hfi1_ipoib_drain_tx_ring(txq, budget); + + if (work_done < budget) + napi_complete_done(napi, work_done); + + return work_done; +} + +static void hfi1_ipoib_add_tx(struct ipoib_txreq *tx) +{ + struct hfi1_ipoib_circ_buf *tx_ring = &tx->txq->tx_ring; + unsigned long head; + unsigned long tail; + size_t max_tx; + + spin_lock(&tx_ring->producer_lock); + + head = tx_ring->head; + tail = READ_ONCE(tx_ring->tail); + max_tx = tx_ring->max_items; + + if (likely(CIRC_SPACE(head, tail, max_tx))) { + tx_ring->items[head] = tx; + + /* Finish storing txreq before incrementing head. */ + smp_store_release(&tx_ring->head, CIRC_ADD(head, 1, max_tx)); + napi_schedule(tx->txq->napi); + } else { + struct hfi1_ipoib_txq *txq = tx->txq; + struct hfi1_ipoib_dev_priv *priv = tx->priv; + + /* Ring was full */ + hfi1_ipoib_free_tx(tx, 0); + atomic64_inc(&txq->complete_txreqs); + dd_dev_dbg(priv->dd, "txq %d full.\n", txq->q_idx); + } + + spin_unlock(&tx_ring->producer_lock); +} + +static void hfi1_ipoib_sdma_complete(struct sdma_txreq *txreq, int status) +{ + struct ipoib_txreq *tx = container_of(txreq, struct ipoib_txreq, txreq); + + tx->sdma_status = status; + + hfi1_ipoib_add_tx(tx); +} + +static int hfi1_ipoib_build_ulp_payload(struct ipoib_txreq *tx, + struct ipoib_txparms *txp) +{ + struct hfi1_devdata *dd = txp->dd; + struct sdma_txreq *txreq = &tx->txreq; + struct sk_buff *skb = tx->skb; + int ret = 0; + int i; + + if (skb_headlen(skb)) { + ret = sdma_txadd_kvaddr(dd, txreq, skb->data, skb_headlen(skb)); + if (unlikely(ret)) + return ret; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + ret = sdma_txadd_page(dd, + txreq, + skb_frag_page(frag), + frag->bv_offset, + skb_frag_size(frag)); + if (unlikely(ret)) + break; + } + + return ret; +} + +static int hfi1_ipoib_build_tx_desc(struct ipoib_txreq *tx, + struct ipoib_txparms *txp) +{ + struct hfi1_devdata *dd = txp->dd; + struct sdma_txreq *txreq = &tx->txreq; + struct hfi1_sdma_header *sdma_hdr = &tx->sdma_hdr; + u16 pkt_bytes = + sizeof(sdma_hdr->pbc) + (txp->hdr_dwords << 2) + tx->skb->len; + int ret; + + ret = sdma_txinit(txreq, 0, pkt_bytes, hfi1_ipoib_sdma_complete); + if (unlikely(ret)) + return ret; + + /* add pbc + headers */ + ret = sdma_txadd_kvaddr(dd, + txreq, + sdma_hdr, + sizeof(sdma_hdr->pbc) + (txp->hdr_dwords << 2)); + if (unlikely(ret)) + return ret; + + /* add the ulp payload */ + return hfi1_ipoib_build_ulp_payload(tx, txp); +} + +static void hfi1_ipoib_build_ib_tx_headers(struct ipoib_txreq *tx, + struct ipoib_txparms *txp) +{ + struct hfi1_ipoib_dev_priv *priv = tx->priv; + struct hfi1_sdma_header *sdma_hdr = &tx->sdma_hdr; + struct sk_buff *skb = tx->skb; + struct hfi1_pportdata *ppd = ppd_from_ibp(txp->ibp); + struct rdma_ah_attr *ah_attr = txp->ah_attr; + struct ib_other_headers *ohdr; + struct ib_grh *grh; + u16 dwords; + u16 slid; + u16 dlid; + u16 lrh0; + u32 bth0; + u32 sqpn = (u32)(priv->netdev->dev_addr[1] << 16 | + priv->netdev->dev_addr[2] << 8 | + priv->netdev->dev_addr[3]); + u16 payload_dwords; + u8 pad_cnt; + + pad_cnt = -skb->len & 3; + + /* Includes ICRC */ + payload_dwords = ((skb->len + pad_cnt) >> 2) + SIZE_OF_CRC; + + /* header size in dwords LRH+BTH+DETH = (8+12+8)/4. */ + txp->hdr_dwords = 7; + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + grh = &sdma_hdr->hdr.ibh.u.l.grh; + txp->hdr_dwords += + hfi1_make_grh(txp->ibp, + grh, + rdma_ah_read_grh(ah_attr), + txp->hdr_dwords - LRH_9B_DWORDS, + payload_dwords); + lrh0 = HFI1_LRH_GRH; + ohdr = &sdma_hdr->hdr.ibh.u.l.oth; + } else { + lrh0 = HFI1_LRH_BTH; + ohdr = &sdma_hdr->hdr.ibh.u.oth; + } + + lrh0 |= (rdma_ah_get_sl(ah_attr) & 0xf) << 4; + lrh0 |= (txp->flow.sc5 & 0xf) << 12; + + dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 9B); + if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } else { + u16 lid = (u16)ppd->lid; + + if (lid) { + lid |= rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1); + slid = lid; + } else { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } + } + + /* Includes ICRC */ + dwords = txp->hdr_dwords + payload_dwords; + + /* Build the lrh */ + sdma_hdr->hdr.hdr_type = HFI1_PKT_TYPE_9B; + hfi1_make_ib_hdr(&sdma_hdr->hdr.ibh, lrh0, dwords, dlid, slid); + + /* Build the bth */ + bth0 = (IB_OPCODE_UD_SEND_ONLY << 24) | (pad_cnt << 20) | priv->pkey; + + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(txp->dqpn); + ohdr->bth[2] = cpu_to_be32(mask_psn((u32)txp->txq->sent_txreqs)); + + /* Build the deth */ + ohdr->u.ud.deth[0] = cpu_to_be32(priv->qkey); + ohdr->u.ud.deth[1] = cpu_to_be32((txp->entropy << + HFI1_IPOIB_ENTROPY_SHIFT) | sqpn); + + /* Construct the pbc. */ + sdma_hdr->pbc = + cpu_to_le64(create_pbc(ppd, + ib_is_sc5(txp->flow.sc5) << + PBC_DC_INFO_SHIFT, + 0, + sc_to_vlt(priv->dd, txp->flow.sc5), + dwords - SIZE_OF_CRC + + (sizeof(sdma_hdr->pbc) >> 2))); +} + +static struct ipoib_txreq *hfi1_ipoib_send_dma_common(struct net_device *dev, + struct sk_buff *skb, + struct ipoib_txparms *txp) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + struct ipoib_txreq *tx; + int ret; + + tx = kmem_cache_alloc_node(priv->txreq_cache, + GFP_ATOMIC, + priv->dd->node); + if (unlikely(!tx)) + return ERR_PTR(-ENOMEM); + + /* so that we can test if the sdma decriptors are there */ + tx->txreq.num_desc = 0; + tx->priv = priv; + tx->txq = txp->txq; + tx->skb = skb; + + hfi1_ipoib_build_ib_tx_headers(tx, txp); + + ret = hfi1_ipoib_build_tx_desc(tx, txp); + if (likely(!ret)) { + if (txp->txq->flow.as_int != txp->flow.as_int) { + txp->txq->flow.tx_queue = txp->flow.tx_queue; + txp->txq->flow.sc5 = txp->flow.sc5; + txp->txq->sde = + sdma_select_engine_sc(priv->dd, + txp->flow.tx_queue, + txp->flow.sc5); + } + + return tx; + } + + sdma_txclean(priv->dd, &tx->txreq); + kmem_cache_free(priv->txreq_cache, tx); + + return ERR_PTR(ret); +} + +static int hfi1_ipoib_submit_tx_list(struct net_device *dev, + struct hfi1_ipoib_txq *txq) +{ + int ret; + u16 count_out; + + ret = sdma_send_txlist(txq->sde, + iowait_get_ib_work(&txq->wait), + &txq->tx_list, + &count_out); + if (likely(!ret) || ret == -EBUSY || ret == -ECOMM) + return ret; + + dd_dev_warn(txq->priv->dd, "cannot send skb tx list, err %d.\n", ret); + + return ret; +} + +static int hfi1_ipoib_flush_tx_list(struct net_device *dev, + struct hfi1_ipoib_txq *txq) +{ + int ret = 0; + + if (!list_empty(&txq->tx_list)) { + /* Flush the current list */ + ret = hfi1_ipoib_submit_tx_list(dev, txq); + + if (unlikely(ret)) + if (ret != -EBUSY) + ++dev->stats.tx_carrier_errors; + } + + return ret; +} + +static int hfi1_ipoib_submit_tx(struct hfi1_ipoib_txq *txq, + struct ipoib_txreq *tx) +{ + int ret; + + ret = sdma_send_txreq(txq->sde, + iowait_get_ib_work(&txq->wait), + &tx->txreq, + txq->pkts_sent); + if (likely(!ret)) { + txq->pkts_sent = true; + iowait_starve_clear(txq->pkts_sent, &txq->wait); + } + + return ret; +} + +static int hfi1_ipoib_send_dma_single(struct net_device *dev, + struct sk_buff *skb, + struct ipoib_txparms *txp) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + struct hfi1_ipoib_txq *txq = txp->txq; + struct ipoib_txreq *tx; + int ret; + + tx = hfi1_ipoib_send_dma_common(dev, skb, txp); + if (IS_ERR(tx)) { + int ret = PTR_ERR(tx); + + dev_kfree_skb_any(skb); + + if (ret == -ENOMEM) + ++dev->stats.tx_errors; + else + ++dev->stats.tx_carrier_errors; + + return NETDEV_TX_OK; + } + + ret = hfi1_ipoib_submit_tx(txq, tx); + if (likely(!ret)) { + trace_sdma_output_ibhdr(tx->priv->dd, + &tx->sdma_hdr.hdr, + ib_is_sc5(txp->flow.sc5)); + hfi1_ipoib_check_queue_depth(txq); + return NETDEV_TX_OK; + } + + txq->pkts_sent = false; + + if (ret == -EBUSY) { + list_add_tail(&tx->txreq.list, &txq->tx_list); + + trace_sdma_output_ibhdr(tx->priv->dd, + &tx->sdma_hdr.hdr, + ib_is_sc5(txp->flow.sc5)); + hfi1_ipoib_check_queue_depth(txq); + return NETDEV_TX_OK; + } + + if (ret == -ECOMM) { + hfi1_ipoib_check_queue_depth(txq); + return NETDEV_TX_OK; + } + + sdma_txclean(priv->dd, &tx->txreq); + dev_kfree_skb_any(skb); + kmem_cache_free(priv->txreq_cache, tx); + ++dev->stats.tx_carrier_errors; + + return NETDEV_TX_OK; +} + +static int hfi1_ipoib_send_dma_list(struct net_device *dev, + struct sk_buff *skb, + struct ipoib_txparms *txp) +{ + struct hfi1_ipoib_txq *txq = txp->txq; + struct ipoib_txreq *tx; + + /* Has the flow change ? */ + if (txq->flow.as_int != txp->flow.as_int) + (void)hfi1_ipoib_flush_tx_list(dev, txq); + + tx = hfi1_ipoib_send_dma_common(dev, skb, txp); + if (IS_ERR(tx)) { + int ret = PTR_ERR(tx); + + dev_kfree_skb_any(skb); + + if (ret == -ENOMEM) + ++dev->stats.tx_errors; + else + ++dev->stats.tx_carrier_errors; + + return NETDEV_TX_OK; + } + + list_add_tail(&tx->txreq.list, &txq->tx_list); + + hfi1_ipoib_check_queue_depth(txq); + + trace_sdma_output_ibhdr(tx->priv->dd, + &tx->sdma_hdr.hdr, + ib_is_sc5(txp->flow.sc5)); + + if (!netdev_xmit_more()) + (void)hfi1_ipoib_flush_tx_list(dev, txq); + + return NETDEV_TX_OK; +} + +static u8 hfi1_ipoib_calc_entropy(struct sk_buff *skb) +{ + if (skb_transport_header_was_set(skb)) { + u8 *hdr = (u8 *)skb_transport_header(skb); + + return (hdr[0] ^ hdr[1] ^ hdr[2] ^ hdr[3]); + } + + return (u8)skb_get_queue_mapping(skb); +} + +int hfi1_ipoib_send_dma(struct net_device *dev, + struct sk_buff *skb, + struct ib_ah *address, + u32 dqpn) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + struct ipoib_txparms txp; + struct rdma_netdev *rn = netdev_priv(dev); + + if (unlikely(skb->len > rn->mtu + HFI1_IPOIB_ENCAP_LEN)) { + dd_dev_warn(priv->dd, "packet len %d (> %d) too long to send, dropping\n", + skb->len, + rn->mtu + HFI1_IPOIB_ENCAP_LEN); + ++dev->stats.tx_dropped; + ++dev->stats.tx_errors; + dev_kfree_skb_any(skb); + return NETDEV_TX_OK; + } + + txp.dd = priv->dd; + txp.ah_attr = &ibah_to_rvtah(address)->attr; + txp.ibp = to_iport(priv->device, priv->port_num); + txp.txq = &priv->txqs[skb_get_queue_mapping(skb)]; + txp.dqpn = dqpn; + txp.flow.sc5 = txp.ibp->sl_to_sc[rdma_ah_get_sl(txp.ah_attr)]; + txp.flow.tx_queue = (u8)skb_get_queue_mapping(skb); + txp.entropy = hfi1_ipoib_calc_entropy(skb); + + if (netdev_xmit_more() || !list_empty(&txp.txq->tx_list)) + return hfi1_ipoib_send_dma_list(dev, skb, &txp); + + return hfi1_ipoib_send_dma_single(dev, skb, &txp); +} + +/* + * hfi1_ipoib_sdma_sleep - ipoib sdma sleep function + * + * This function gets called from sdma_send_txreq() when there are not enough + * sdma descriptors available to send the packet. It adds Tx queue's wait + * structure to sdma engine's dmawait list to be woken up when descriptors + * become available. + */ +static int hfi1_ipoib_sdma_sleep(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *txreq, + uint seq, + bool pkts_sent) +{ + struct hfi1_ipoib_txq *txq = + container_of(wait->iow, struct hfi1_ipoib_txq, wait); + + write_seqlock(&sde->waitlock); + + if (likely(txq->priv->netdev->reg_state == NETREG_REGISTERED)) { + if (sdma_progress(sde, seq, txreq)) { + write_sequnlock(&sde->waitlock); + return -EAGAIN; + } + + netif_stop_subqueue(txq->priv->netdev, txq->q_idx); + + if (list_empty(&txq->wait.list)) + iowait_queue(pkts_sent, wait->iow, &sde->dmawait); + + write_sequnlock(&sde->waitlock); + return -EBUSY; + } + + write_sequnlock(&sde->waitlock); + return -EINVAL; +} + +/* + * hfi1_ipoib_sdma_wakeup - ipoib sdma wakeup function + * + * This function gets called when SDMA descriptors becomes available and Tx + * queue's wait structure was previously added to sdma engine's dmawait list. + */ +static void hfi1_ipoib_sdma_wakeup(struct iowait *wait, int reason) +{ + struct hfi1_ipoib_txq *txq = + container_of(wait, struct hfi1_ipoib_txq, wait); + + if (likely(txq->priv->netdev->reg_state == NETREG_REGISTERED)) + iowait_schedule(wait, system_highpri_wq, WORK_CPU_UNBOUND); +} + +static void hfi1_ipoib_flush_txq(struct work_struct *work) +{ + struct iowait_work *ioww = + container_of(work, struct iowait_work, iowork); + struct iowait *wait = iowait_ioww_to_iow(ioww); + struct hfi1_ipoib_txq *txq = + container_of(wait, struct hfi1_ipoib_txq, wait); + struct net_device *dev = txq->priv->netdev; + + if (likely(dev->reg_state == NETREG_REGISTERED) && + likely(__netif_subqueue_stopped(dev, txq->q_idx)) && + likely(!hfi1_ipoib_flush_tx_list(dev, txq))) + netif_wake_subqueue(dev, txq->q_idx); +} + +int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) +{ + struct net_device *dev = priv->netdev; + char buf[HFI1_IPOIB_TXREQ_NAME_LEN]; + unsigned long tx_ring_size; + int i; + + /* + * Ring holds 1 less than tx_ring_size + * Round up to next power of 2 in order to hold at least tx_queue_len + */ + tx_ring_size = roundup_pow_of_two((unsigned long)dev->tx_queue_len + 1); + + snprintf(buf, sizeof(buf), "hfi1_%u_ipoib_txreq_cache", priv->dd->unit); + priv->txreq_cache = kmem_cache_create(buf, + sizeof(struct ipoib_txreq), + 0, + 0, + NULL); + if (!priv->txreq_cache) + return -ENOMEM; + + priv->tx_napis = kcalloc_node(dev->num_tx_queues, + sizeof(struct napi_struct), + GFP_ATOMIC, + priv->dd->node); + if (!priv->tx_napis) + goto free_txreq_cache; + + priv->txqs = kcalloc_node(dev->num_tx_queues, + sizeof(struct hfi1_ipoib_txq), + GFP_ATOMIC, + priv->dd->node); + if (!priv->txqs) + goto free_tx_napis; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + + iowait_init(&txq->wait, + 0, + hfi1_ipoib_flush_txq, + NULL, + hfi1_ipoib_sdma_sleep, + hfi1_ipoib_sdma_wakeup, + NULL, + NULL); + txq->priv = priv; + txq->sde = NULL; + INIT_LIST_HEAD(&txq->tx_list); + atomic64_set(&txq->complete_txreqs, 0); + txq->q_idx = i; + txq->flow.tx_queue = 0xff; + txq->flow.sc5 = 0xff; + txq->pkts_sent = false; + + netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), + priv->dd->node); + + txq->tx_ring.items = + vzalloc_node(array_size(tx_ring_size, + sizeof(struct ipoib_txreq)), + priv->dd->node); + if (!txq->tx_ring.items) + goto free_txqs; + + spin_lock_init(&txq->tx_ring.producer_lock); + spin_lock_init(&txq->tx_ring.consumer_lock); + txq->tx_ring.max_items = tx_ring_size; + + txq->napi = &priv->tx_napis[i]; + netif_tx_napi_add(dev, txq->napi, + hfi1_ipoib_process_tx_ring, + NAPI_POLL_WEIGHT); + } + + return 0; + +free_txqs: + for (i--; i >= 0; i--) { + struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + + netif_napi_del(txq->napi); + vfree(txq->tx_ring.items); + } + + kfree(priv->txqs); + priv->txqs = NULL; + +free_tx_napis: + kfree(priv->tx_napis); + priv->tx_napis = NULL; + +free_txreq_cache: + kmem_cache_destroy(priv->txreq_cache); + priv->txreq_cache = NULL; + return -ENOMEM; +} + +static void hfi1_ipoib_drain_tx_list(struct hfi1_ipoib_txq *txq) +{ + struct sdma_txreq *txreq; + struct sdma_txreq *txreq_tmp; + atomic64_t *complete_txreqs = &txq->complete_txreqs; + + list_for_each_entry_safe(txreq, txreq_tmp, &txq->tx_list, list) { + struct ipoib_txreq *tx = + container_of(txreq, struct ipoib_txreq, txreq); + + list_del(&txreq->list); + sdma_txclean(txq->priv->dd, &tx->txreq); + dev_kfree_skb_any(tx->skb); + kmem_cache_free(txq->priv->txreq_cache, tx); + atomic64_inc(complete_txreqs); + } + + if (hfi1_ipoib_txreqs(txq->sent_txreqs, atomic64_read(complete_txreqs))) + dd_dev_warn(txq->priv->dd, + "txq %d not empty found %llu requests\n", + txq->q_idx, + hfi1_ipoib_txreqs(txq->sent_txreqs, + atomic64_read(complete_txreqs))); +} + +void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv) +{ + int i; + + for (i = 0; i < priv->netdev->num_tx_queues; i++) { + struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + + iowait_cancel_work(&txq->wait); + iowait_sdma_drain(&txq->wait); + hfi1_ipoib_drain_tx_list(txq); + netif_napi_del(txq->napi); + (void)hfi1_ipoib_drain_tx_ring(txq, txq->tx_ring.max_items); + vfree(txq->tx_ring.items); + } + + kfree(priv->txqs); + priv->txqs = NULL; + + kfree(priv->tx_napis); + priv->tx_napis = NULL; + + kmem_cache_destroy(priv->txreq_cache); + priv->txreq_cache = NULL; +} + +void hfi1_ipoib_napi_tx_enable(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + + napi_enable(txq->napi); + } +} + +void hfi1_ipoib_napi_tx_disable(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct hfi1_ipoib_txq *txq = &priv->txqs[i]; + + napi_disable(txq->napi); + (void)hfi1_ipoib_drain_tx_ring(txq, txq->tx_ring.max_items); + } +} diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index 9a3d236bcc88..c8a9988d972d 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -47,6 +47,7 @@ #define CREATE_TRACE_POINTS #include "trace.h" #include "exp_rcv.h" +#include "ipoib.h" static u8 __get_ib_hdr_len(struct ib_header *hdr) { @@ -126,6 +127,7 @@ const char *hfi1_trace_get_packet_l2_str(u8 l2) #define RETH_PRN "reth vaddr:0x%.16llx rkey:0x%.8x dlen:0x%.8x" #define AETH_PRN "aeth syn:0x%.2x %s msn:0x%.8x" #define DETH_PRN "deth qkey:0x%.8x sqpn:0x%.6x" +#define DETH_ENTROPY_PRN "deth qkey:0x%.8x sqpn:0x%.6x entropy:0x%.2x" #define IETH_PRN "ieth rkey:0x%.8x" #define ATOMICACKETH_PRN "origdata:%llx" #define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx" @@ -444,6 +446,12 @@ const char *parse_everbs_hdrs( break; /* deth */ case OP(UD, SEND_ONLY): + trace_seq_printf(p, DETH_ENTROPY_PRN, + be32_to_cpu(eh->ud.deth[0]), + be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK, + be32_to_cpu(eh->ud.deth[1]) >> + HFI1_IPOIB_ENTROPY_SHIFT); + break; case OP(UD, SEND_ONLY_WITH_IMMEDIATE): trace_seq_printf(p, DETH_PRN, be32_to_cpu(eh->ud.deth[0]), diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index db58f11552f1..029541a8faeb 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2205,6 +2205,7 @@ struct rdma_netdev { void *clnt_priv; struct ib_device *hca; u8 port_num; + int mtu; /* * cleanup function must be specified. -- cgit v1.2.3 From 438d7dda9841ec42ef7d9024dc45347f9526016a Mon Sep 17 00:00:00 2001 From: Gary Leshner Date: Mon, 11 May 2020 12:05:54 -0400 Subject: IB/hfi1: Add the transmit side of a datagram ipoib RDMA netdev This implements the transmit side of the multiple transmit queue RDMA netdev used to accelerate ipoib. The receive side remains the ipoib internal implementation. The init/unint/open/stop netdev operations are saved off and called by the versions within the hfi1 netdev in order to initialize the connected mode resources present in ipoib thus allowing us to switch modes between datagram and connected. The datagram queue pair instantiated by the ipoib ulp is used by this implementation for its queue pair number and to register with multicast. The above queue pair is not used on transmit other than its qpn as the verbs layer is skipped and packets are directly submitted to the sdma engines. Link: https://lore.kernel.org/r/20200511160554.173205.1369.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Gary Leshner Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/Makefile | 1 + drivers/infiniband/hw/hfi1/ipoib.h | 5 + drivers/infiniband/hw/hfi1/ipoib_main.c | 283 ++++++++++++++++++++++++++++++++ 3 files changed, 289 insertions(+) create mode 100644 drivers/infiniband/hw/hfi1/ipoib_main.c diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 09ef0b8b8ac7..0b2571306267 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -22,6 +22,7 @@ hfi1-y := \ init.o \ intr.o \ iowait.o \ + ipoib_main.o \ ipoib_tx.o \ mad.o \ mmu_rb.o \ diff --git a/drivers/infiniband/hw/hfi1/ipoib.h b/drivers/infiniband/hw/hfi1/ipoib.h index 2b541abde266..c2e63ca57896 100644 --- a/drivers/infiniband/hw/hfi1/ipoib.h +++ b/drivers/infiniband/hw/hfi1/ipoib.h @@ -142,4 +142,9 @@ void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv); void hfi1_ipoib_napi_tx_enable(struct net_device *dev); void hfi1_ipoib_napi_tx_disable(struct net_device *dev); +int hfi1_ipoib_rn_get_params(struct ib_device *device, + u8 port_num, + enum rdma_netdev_t type, + struct rdma_netdev_alloc_params *params); + #endif /* _IPOIB_H */ diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c new file mode 100644 index 000000000000..304a5ac86f77 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ipoib_main.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +/* + * This file contains HFI1 support for ipoib functionality + */ + +#include "ipoib.h" +#include "hfi.h" + +static u32 qpn_from_mac(u8 *mac_arr) +{ + return (u32)mac_arr[1] << 16 | mac_arr[2] << 8 | mac_arr[3]; +} + +static int hfi1_ipoib_dev_init(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + + priv->netstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + + return priv->netdev_ops->ndo_init(dev); +} + +static void hfi1_ipoib_dev_uninit(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + + priv->netdev_ops->ndo_uninit(dev); +} + +static int hfi1_ipoib_dev_open(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + int ret; + + ret = priv->netdev_ops->ndo_open(dev); + if (!ret) { + struct hfi1_ibport *ibp = to_iport(priv->device, + priv->port_num); + struct rvt_qp *qp; + u32 qpn = qpn_from_mac(priv->netdev->dev_addr); + + rcu_read_lock(); + qp = rvt_lookup_qpn(ib_to_rvt(priv->device), &ibp->rvp, qpn); + if (!qp) { + rcu_read_unlock(); + priv->netdev_ops->ndo_stop(dev); + return -EINVAL; + } + rvt_get_qp(qp); + priv->qp = qp; + rcu_read_unlock(); + + hfi1_ipoib_napi_tx_enable(dev); + } + + return ret; +} + +static int hfi1_ipoib_dev_stop(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + + if (!priv->qp) + return 0; + + hfi1_ipoib_napi_tx_disable(dev); + + rvt_put_qp(priv->qp); + priv->qp = NULL; + + return priv->netdev_ops->ndo_stop(dev); +} + +static void hfi1_ipoib_dev_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *storage) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + u64 rx_packets = 0ull; + u64 rx_bytes = 0ull; + u64 tx_packets = 0ull; + u64 tx_bytes = 0ull; + int i; + + netdev_stats_to_stats64(storage, &dev->stats); + + for_each_possible_cpu(i) { + const struct pcpu_sw_netstats *stats; + unsigned int start; + u64 trx_packets; + u64 trx_bytes; + u64 ttx_packets; + u64 ttx_bytes; + + stats = per_cpu_ptr(priv->netstats, i); + do { + start = u64_stats_fetch_begin_irq(&stats->syncp); + trx_packets = stats->rx_packets; + trx_bytes = stats->rx_bytes; + ttx_packets = stats->tx_packets; + ttx_bytes = stats->tx_bytes; + } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + + rx_packets += trx_packets; + rx_bytes += trx_bytes; + tx_packets += ttx_packets; + tx_bytes += ttx_bytes; + } + + storage->rx_packets += rx_packets; + storage->rx_bytes += rx_bytes; + storage->tx_packets += tx_packets; + storage->tx_bytes += tx_bytes; +} + +static const struct net_device_ops hfi1_ipoib_netdev_ops = { + .ndo_init = hfi1_ipoib_dev_init, + .ndo_uninit = hfi1_ipoib_dev_uninit, + .ndo_open = hfi1_ipoib_dev_open, + .ndo_stop = hfi1_ipoib_dev_stop, + .ndo_get_stats64 = hfi1_ipoib_dev_get_stats64, +}; + +static int hfi1_ipoib_send(struct net_device *dev, + struct sk_buff *skb, + struct ib_ah *address, + u32 dqpn) +{ + return hfi1_ipoib_send_dma(dev, skb, address, dqpn); +} + +static int hfi1_ipoib_mcast_attach(struct net_device *dev, + struct ib_device *device, + union ib_gid *mgid, + u16 mlid, + int set_qkey, + u32 qkey) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + u32 qpn = (u32)qpn_from_mac(priv->netdev->dev_addr); + struct hfi1_ibport *ibp = to_iport(priv->device, priv->port_num); + struct rvt_qp *qp; + int ret = -EINVAL; + + rcu_read_lock(); + + qp = rvt_lookup_qpn(ib_to_rvt(priv->device), &ibp->rvp, qpn); + if (qp) { + rvt_get_qp(qp); + rcu_read_unlock(); + if (set_qkey) + priv->qkey = qkey; + + /* attach QP to multicast group */ + ret = ib_attach_mcast(&qp->ibqp, mgid, mlid); + rvt_put_qp(qp); + } else { + rcu_read_unlock(); + } + + return ret; +} + +static int hfi1_ipoib_mcast_detach(struct net_device *dev, + struct ib_device *device, + union ib_gid *mgid, + u16 mlid) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + u32 qpn = (u32)qpn_from_mac(priv->netdev->dev_addr); + struct hfi1_ibport *ibp = to_iport(priv->device, priv->port_num); + struct rvt_qp *qp; + int ret = -EINVAL; + + rcu_read_lock(); + + qp = rvt_lookup_qpn(ib_to_rvt(priv->device), &ibp->rvp, qpn); + if (qp) { + rvt_get_qp(qp); + rcu_read_unlock(); + ret = ib_detach_mcast(&qp->ibqp, mgid, mlid); + rvt_put_qp(qp); + } else { + rcu_read_unlock(); + } + return ret; +} + +static void hfi1_ipoib_netdev_dtor(struct net_device *dev) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + + hfi1_ipoib_txreq_deinit(priv); + + free_percpu(priv->netstats); +} + +static void hfi1_ipoib_free_rdma_netdev(struct net_device *dev) +{ + hfi1_ipoib_netdev_dtor(dev); + free_netdev(dev); +} + +static void hfi1_ipoib_set_id(struct net_device *dev, int id) +{ + struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + + priv->pkey_index = (u16)id; + ib_query_pkey(priv->device, + priv->port_num, + priv->pkey_index, + &priv->pkey); +} + +static int hfi1_ipoib_setup_rn(struct ib_device *device, + u8 port_num, + struct net_device *netdev, + void *param) +{ + struct hfi1_devdata *dd = dd_from_ibdev(device); + struct rdma_netdev *rn = netdev_priv(netdev); + struct hfi1_ipoib_dev_priv *priv; + int rc; + + rn->send = hfi1_ipoib_send; + rn->attach_mcast = hfi1_ipoib_mcast_attach; + rn->detach_mcast = hfi1_ipoib_mcast_detach; + rn->set_id = hfi1_ipoib_set_id; + rn->hca = device; + rn->port_num = port_num; + rn->mtu = netdev->mtu; + + priv = hfi1_ipoib_priv(netdev); + priv->dd = dd; + priv->netdev = netdev; + priv->device = device; + priv->port_num = port_num; + priv->netdev_ops = netdev->netdev_ops; + + netdev->netdev_ops = &hfi1_ipoib_netdev_ops; + + ib_query_pkey(device, port_num, priv->pkey_index, &priv->pkey); + + rc = hfi1_ipoib_txreq_init(priv); + if (rc) { + dd_dev_err(dd, "IPoIB netdev TX init - failed(%d)\n", rc); + hfi1_ipoib_free_rdma_netdev(netdev); + return rc; + } + + netdev->priv_destructor = hfi1_ipoib_netdev_dtor; + netdev->needs_free_netdev = true; + + return 0; +} + +int hfi1_ipoib_rn_get_params(struct ib_device *device, + u8 port_num, + enum rdma_netdev_t type, + struct rdma_netdev_alloc_params *params) +{ + struct hfi1_devdata *dd = dd_from_ibdev(device); + + if (type != RDMA_NETDEV_IPOIB) + return -EOPNOTSUPP; + + if (!HFI1_CAP_IS_KSET(AIP)) + return -EOPNOTSUPP; + + if (!port_num || port_num > dd->num_pports) + return -EINVAL; + + params->sizeof_priv = sizeof(struct hfi1_ipoib_rdma_netdev); + params->txqs = dd->num_sdma; + params->param = NULL; + params->initialize_rdma_netdev = hfi1_ipoib_setup_rn; + + return 0; +} -- cgit v1.2.3 From 84e3b19a27f8f37c8cf98f8b7cdf3f8674bf8e97 Mon Sep 17 00:00:00 2001 From: Gary Leshner Date: Mon, 11 May 2020 12:06:00 -0400 Subject: IB/hfi1: Remove module parameter for KDETH qpns The module parameter for KDETH qpns is being removed in favor of always using the default value of 0x80 as the qpn prefix. Defines have been added for various KDETH values including the prefix of 0x80. The reserved range now starts at the base value for KDETH qpns (0x80) and extends up to and including the last qpn for other reserved QP prefixed types. Adjust other QP prefixed define names to match KDETH defined names. Link: https://lore.kernel.org/r/20200511160600.173205.27508.stgit@awfm-01.aw.intel.com Reviewed-by: Dennis Dalessandro Reviewed-by: Mike Marciniszyn Signed-off-by: Gary Leshner Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 19 +++---------------- drivers/infiniband/hw/hfi1/common.h | 7 ------- drivers/infiniband/hw/hfi1/file_ops.c | 4 ++-- drivers/infiniband/hw/hfi1/hfi.h | 3 +-- drivers/infiniband/hw/hfi1/tid_rdma.c | 4 ++-- drivers/infiniband/hw/hfi1/verbs.c | 7 +++---- include/rdma/rdmavt_qp.h | 29 ++++++++++++++++++++++++++++- 7 files changed, 39 insertions(+), 34 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index e0b1238d31df..c08bf813d6fa 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -67,10 +67,6 @@ #include "debugfs.h" #include "fault.h" -uint kdeth_qp; -module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO); -MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix"); - uint num_vls = HFI1_MAX_VLS_SUPPORTED; module_param(num_vls, uint, S_IRUGO); MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)"); @@ -14119,21 +14115,12 @@ static void init_early_variables(struct hfi1_devdata *dd) static void init_kdeth_qp(struct hfi1_devdata *dd) { - /* user changed the KDETH_QP */ - if (kdeth_qp != 0 && kdeth_qp >= 0xff) { - /* out of range or illegal value */ - dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring"); - kdeth_qp = 0; - } - if (kdeth_qp == 0) /* not set, or failed range check */ - kdeth_qp = DEFAULT_KDETH_QP; - write_csr(dd, SEND_BTH_QP, - (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) << + (RVT_KDETH_QP_PREFIX & SEND_BTH_QP_KDETH_QP_MASK) << SEND_BTH_QP_KDETH_QP_SHIFT); write_csr(dd, RCV_BTH_QP, - (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) << + (RVT_KDETH_QP_PREFIX & RCV_BTH_QP_KDETH_QP_MASK) << RCV_BTH_QP_KDETH_QP_SHIFT); } diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h index 1f7107e35a43..606254513640 100644 --- a/drivers/infiniband/hw/hfi1/common.h +++ b/drivers/infiniband/hw/hfi1/common.h @@ -72,13 +72,6 @@ * compilation unit */ -/* - * If a packet's QP[23:16] bits match this value, then it is - * a PSM packet and the hardware will expect a KDETH header - * following the BTH. - */ -#define DEFAULT_KDETH_QP 0x80 - /* driver/hw feature set bitmask */ #define HFI1_CAP_USER_SHIFT 24 #define HFI1_CAP_MASK ((1UL << HFI1_CAP_USER_SHIFT) - 1) diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index e7fdd70c6e78..8ca51e43cf53 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2017 Intel Corporation. + * Copyright(c) 2015-2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -1264,7 +1264,7 @@ static int get_base_info(struct hfi1_filedata *fd, unsigned long arg, u32 len) memset(&binfo, 0, sizeof(binfo)); binfo.hw_version = dd->revision; binfo.sw_version = HFI1_KERN_SWVERSION; - binfo.bthqp = kdeth_qp; + binfo.bthqp = RVT_KDETH_QP_PREFIX; binfo.jkey = uctxt->jkey; /* * If more than 64 contexts are enabled the allocated credit diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index b06c2594105a..ed13051d38da 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1,7 +1,7 @@ #ifndef _HFI1_KERNEL_H #define _HFI1_KERNEL_H /* - * Copyright(c) 2015-2018 Intel Corporation. + * Copyright(c) 2015-2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -2250,7 +2250,6 @@ extern int num_user_contexts; extern unsigned long n_krcvqs; extern uint krcvqs[]; extern int krcvqsset; -extern uint kdeth_qp; extern uint loopback; extern uint quick_linkup; extern uint rcv_intr_timeout; diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c index 8a2e0d9351e9..243b4ba0b6f6 100644 --- a/drivers/infiniband/hw/hfi1/tid_rdma.c +++ b/drivers/infiniband/hw/hfi1/tid_rdma.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* - * Copyright(c) 2018 Intel Corporation. + * Copyright(c) 2018 - 2020 Intel Corporation. * */ @@ -194,7 +194,7 @@ void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p) { struct hfi1_qp_priv *priv = qp->priv; - p->qp = (kdeth_qp << 16) | priv->rcd->ctxt; + p->qp = (RVT_KDETH_QP_PREFIX << 16) | priv->rcd->ctxt; p->max_len = TID_RDMA_MAX_SEGMENT_SIZE; p->jkey = priv->rcd->jkey; p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 2f6323ad9c59..c1c6fa986cd1 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -1863,9 +1863,8 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.qpn_start = 0; dd->verbs_dev.rdi.dparms.qpn_inc = 1; dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift; - dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16; - dd->verbs_dev.rdi.dparms.qpn_res_end = - dd->verbs_dev.rdi.dparms.qpn_res_start + 65535; + dd->verbs_dev.rdi.dparms.qpn_res_start = RVT_KDETH_QP_BASE; + dd->verbs_dev.rdi.dparms.qpn_res_end = RVT_AIP_QP_MAX; dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC; dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK; dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 982bf2340840..c4369a6c2951 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -2,7 +2,7 @@ #define DEF_RDMAVT_INCQP_H /* - * Copyright(c) 2016 - 2019 Intel Corporation. + * Copyright(c) 2016 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -68,6 +68,33 @@ #define RVT_R_RSP_SEND 0x08 #define RVT_R_COMM_EST 0x10 +/* + * If a packet's QP[23:16] bits match this value, then it is + * a PSM packet and the hardware will expect a KDETH header + * following the BTH. + */ +#define RVT_KDETH_QP_PREFIX 0x80 +#define RVT_KDETH_QP_SUFFIX 0xffff +#define RVT_KDETH_QP_PREFIX_MASK 0x00ff0000 +#define RVT_KDETH_QP_PREFIX_SHIFT 16 +#define RVT_KDETH_QP_BASE (u32)(RVT_KDETH_QP_PREFIX << \ + RVT_KDETH_QP_PREFIX_SHIFT) +#define RVT_KDETH_QP_MAX (u32)(RVT_KDETH_QP_BASE + RVT_KDETH_QP_SUFFIX) + +/* + * If a packet's LNH == BTH and DEST QPN[23:16] in the BTH match this + * prefix value, then it is an AIP packet with a DETH containing the entropy + * value in byte 4 following the BTH. + */ +#define RVT_AIP_QP_PREFIX 0x81 +#define RVT_AIP_QP_SUFFIX 0xffff +#define RVT_AIP_QP_PREFIX_MASK 0x00ff0000 +#define RVT_AIP_QP_PREFIX_SHIFT 16 +#define RVT_AIP_QP_BASE (u32)(RVT_AIP_QP_PREFIX << \ + RVT_AIP_QP_PREFIX_SHIFT) +#define RVT_AIP_QPN_MAX BIT(RVT_AIP_QP_PREFIX_SHIFT) +#define RVT_AIP_QP_MAX (u32)(RVT_AIP_QP_BASE + RVT_AIP_QPN_MAX - 1) + /* * Bit definitions for s_flags. * -- cgit v1.2.3 From 7f90a5a069f8dff9c76505b9853f95667d117c15 Mon Sep 17 00:00:00 2001 From: Gary Leshner Date: Mon, 11 May 2020 12:06:07 -0400 Subject: IB/{rdmavt, hfi1}: Implement creation of accelerated UD QPs Adds capability to create a qpn to be recognized as an accelerated UD QP for ipoib. This is accomplished by reserving 0x81 in byte[0] of the qpn as the prefix for these qp types and reserving qpns between 0x810000 and 0x81ffff. The hfi1 capability mask already contained a flag for the VNIC netdev. This has been renamed and extended to include both VNIC and ipoib. The rvt code to allocate qps now recognizes this flag and sets 0x81 into byte[0] of the qpn. The code to allocate qpns is modified to reset the qpn numbering when it is detected that a value is located in byte[0] for a UD QP and it is a qpn being requested for net dev use. If it is a regular UD QP then it is allowable to have bits set in byte[0] of the qpn and provide the previously normal behavior. The code to free the qpn now checks for the AIP prefix value of 0x81 and removes it from the qpn before being freed so that the lower 16 bit number can be reused. This patch requires minor changes in the IB core and ipoib to facilitate the creation of accelerated UP QPs. Link: https://lore.kernel.org/r/20200511160607.173205.11757.stgit@awfm-01.aw.intel.com Reviewed-by: Dennis Dalessandro Reviewed-by: Mike Marciniszyn Signed-off-by: Gary Leshner Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/verbs.c | 2 +- drivers/infiniband/sw/rdmavt/qp.c | 24 +++++++++++++++++++----- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 3 +++ include/rdma/ib_verbs.h | 4 ++-- include/rdma/opa_vnic.h | 4 ++-- 5 files changed, 27 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index c1c6fa986cd1..c61b2916d420 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1342,7 +1342,7 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd) IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE | IB_DEVICE_MEM_MGT_EXTENSIONS | - IB_DEVICE_RDMA_NETDEV_OPA_VNIC; + IB_DEVICE_RDMA_NETDEV_OPA; rdi->dparms.props.page_size_cap = PAGE_SIZE; rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3; rdi->dparms.props.vendor_part_id = dd->pcidev->device; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 0e1b291d2cec..91ad6c571080 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 - 2019 Intel Corporation. + * Copyright(c) 2016 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -525,15 +525,18 @@ static inline unsigned mk_qpn(struct rvt_qpn_table *qpt, * @rdi: rvt device info structure * @qpt: queue pair number table pointer * @port_num: IB port number, 1 based, comes from core + * @exclude_prefix: prefix of special queue pair number being allocated * * Return: The queue pair number */ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, - enum ib_qp_type type, u8 port_num) + enum ib_qp_type type, u8 port_num, u8 exclude_prefix) { u32 i, offset, max_scan, qpn; struct rvt_qpn_map *map; u32 ret; + u32 max_qpn = exclude_prefix == RVT_AIP_QP_PREFIX ? + RVT_AIP_QPN_MAX : RVT_QPN_MAX; if (rdi->driver_f.alloc_qpn) return rdi->driver_f.alloc_qpn(rdi, qpt, type, port_num); @@ -553,7 +556,7 @@ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, } qpn = qpt->last + qpt->incr; - if (qpn >= RVT_QPN_MAX) + if (qpn >= max_qpn) qpn = qpt->incr | ((qpt->last & 1) ^ 1); /* offset carries bit 0 */ offset = qpn & RVT_BITS_PER_PAGE_MASK; @@ -987,6 +990,9 @@ static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn) { struct rvt_qpn_map *map; + if ((qpn & RVT_AIP_QP_PREFIX_MASK) == RVT_AIP_QP_BASE) + qpn &= RVT_AIP_QP_SUFFIX; + map = qpt->map + (qpn & RVT_QPN_MASK) / RVT_BITS_PER_PAGE; if (map->page) clear_bit(qpn & RVT_BITS_PER_PAGE_MASK, map->page); @@ -1074,13 +1080,15 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device); void *priv = NULL; size_t sqsize; + u8 exclude_prefix = 0; if (!rdi) return ERR_PTR(-EINVAL); if (init_attr->cap.max_send_sge > rdi->dparms.props.max_send_sge || init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr || - init_attr->create_flags) + (init_attr->create_flags && + init_attr->create_flags != IB_QP_CREATE_NETDEV_USE)) return ERR_PTR(-EINVAL); /* Check receive queue parameters if no SRQ is specified. */ @@ -1199,14 +1207,20 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, goto bail_driver_priv; } + if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE) + exclude_prefix = RVT_AIP_QP_PREFIX; + err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table, init_attr->qp_type, - init_attr->port_num); + init_attr->port_num, + exclude_prefix); if (err < 0) { ret = ERR_PTR(err); goto bail_rq_wq; } qp->ibqp.qp_num = err; + if (init_attr->create_flags & IB_QP_CREATE_NETDEV_USE) + qp->ibqp.qp_num |= RVT_AIP_QP_BASE; qp->port_num = init_attr->port_num; rvt_init_qp(rdi, qp, init_attr->qp_type); if (rdi->driver_f.qp_priv_init) { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index b69304d28f06..587252fd6f57 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -206,6 +206,9 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING) init_attr.create_flags |= IB_QP_CREATE_NETIF_QP; + if (priv->hca_caps & IB_DEVICE_RDMA_NETDEV_OPA) + init_attr.create_flags |= IB_QP_CREATE_NETDEV_USE; + priv->qp = ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { pr_warn("%s: failed to create QP\n", ca->name); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 029541a8faeb..6278e4e040fc 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -305,7 +305,7 @@ enum ib_device_cap_flags { IB_DEVICE_VIRTUAL_FUNCTION = (1ULL << 33), /* Deprecated. Please use IB_RAW_PACKET_CAP_SCATTER_FCS. */ IB_DEVICE_RAW_SCATTER_FCS = (1ULL << 34), - IB_DEVICE_RDMA_NETDEV_OPA_VNIC = (1ULL << 35), + IB_DEVICE_RDMA_NETDEV_OPA = (1ULL << 35), /* The device supports padding incoming writes to cacheline. */ IB_DEVICE_PCI_WRITE_END_PADDING = (1ULL << 36), IB_DEVICE_ALLOW_USER_UNREG = (1ULL << 37), @@ -1117,7 +1117,7 @@ enum ib_qp_create_flags { IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_INTEGRITY_EN = 1 << 6, - /* FREE = 1 << 7, */ + IB_QP_CREATE_NETDEV_USE = 1 << 7, IB_QP_CREATE_SCATTER_FCS = 1 << 8, IB_QP_CREATE_CVLAN_STRIPPING = 1 << 9, IB_QP_CREATE_SOURCE_QPN = 1 << 10, diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h index e90b149fe92a..6f244e759b4f 100644 --- a/include/rdma/opa_vnic.h +++ b/include/rdma/opa_vnic.h @@ -1,7 +1,7 @@ #ifndef _OPA_VNIC_H #define _OPA_VNIC_H /* - * Copyright(c) 2017 Intel Corporation. + * Copyright(c) 2017 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -132,7 +132,7 @@ struct opa_vnic_stats { static inline bool rdma_cap_opa_vnic(struct ib_device *device) { return !!(device->attrs.device_cap_flags & - IB_DEVICE_RDMA_NETDEV_OPA_VNIC); + IB_DEVICE_RDMA_NETDEV_OPA); } #endif /* _OPA_VNIC_H */ -- cgit v1.2.3 From 19d8b90a509f7fd9a3224cca6df160a413a4d521 Mon Sep 17 00:00:00 2001 From: Grzegorz Andrejczuk Date: Mon, 11 May 2020 12:06:12 -0400 Subject: IB/hfi1: RSM rules for AIP This is implementation of RSM rule for AIP packets. AIP rule will use rule RSM2 and will match standard Infiniband packet containg BTH (LNH==BTH) and having Dest QPN prefixed with value 0x81. Spread between receive contexts will be done using source QPN bits. VNIC and AIP will share receive contexts, so their rules will point to the same RMT entries and their shared code is moved to separate functions. If any of the rules is active RMT mapping will be skipped for latter. Changed function hfi1_vnic_is_rsm_full to be more general and moved it from main header to chip.c. Changed the order of RSM rules because AIP rule as more specific one is needed to be placed before more general QOS rule. Rules are occupying two last RSM registers. Link: https://lore.kernel.org/r/20200511160612.173205.73002.stgit@awfm-01.aw.intel.com Reviewed-by: Dennis Dalessandro Reviewed-by: Mike Marciniszyn Signed-off-by: Grzegorz Andrejczuk Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 171 ++++++++++++++++++++++++++++---------- drivers/infiniband/hw/hfi1/chip.h | 4 +- drivers/infiniband/hw/hfi1/hfi.h | 8 +- drivers/infiniband/hw/hfi1/init.c | 3 +- 4 files changed, 136 insertions(+), 50 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index c08bf813d6fa..be1fb29a1f85 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -124,13 +124,15 @@ struct flag_table { /* * RSM instance allocation - * 0 - Verbs - * 1 - User Fecn Handling - * 2 - Vnic + * 0 - User Fecn Handling + * 1 - Vnic + * 2 - AIP + * 3 - Verbs */ -#define RSM_INS_VERBS 0 -#define RSM_INS_FECN 1 -#define RSM_INS_VNIC 2 +#define RSM_INS_FECN 0 +#define RSM_INS_VNIC 1 +#define RSM_INS_AIP 2 +#define RSM_INS_VERBS 3 /* Bit offset into the GUID which carries HFI id information */ #define GUID_HFI_INDEX_SHIFT 39 @@ -171,6 +173,25 @@ struct flag_table { /* QPN[m+n:1] QW 1, OFFSET 1 */ #define QPN_SELECT_OFFSET ((1ull << QW_SHIFT) | (1ull)) +/* RSM fields for AIP */ +/* LRH.BTH above is reused for this rule */ + +/* BTH.DESTQP: QW 1, OFFSET 16 for match */ +#define BTH_DESTQP_QW 1ull +#define BTH_DESTQP_BIT_OFFSET 16ull +#define BTH_DESTQP_OFFSET(off) ((BTH_DESTQP_QW << QW_SHIFT) | (off)) +#define BTH_DESTQP_MATCH_OFFSET BTH_DESTQP_OFFSET(BTH_DESTQP_BIT_OFFSET) +#define BTH_DESTQP_MASK 0xFFull +#define BTH_DESTQP_VALUE 0x81ull + +/* DETH.SQPN: QW 1 Offset 56 for select */ +/* We use 8 most significant Soure QPN bits as entropy fpr AIP */ +#define DETH_AIP_SQPN_QW 3ull +#define DETH_AIP_SQPN_BIT_OFFSET 56ull +#define DETH_AIP_SQPN_OFFSET(off) ((DETH_AIP_SQPN_QW << QW_SHIFT) | (off)) +#define DETH_AIP_SQPN_SELECT_OFFSET \ + DETH_AIP_SQPN_OFFSET(DETH_AIP_SQPN_BIT_OFFSET) + /* RSM fields for Vnic */ /* L2_TYPE: QW 0, OFFSET 61 - for match */ #define L2_TYPE_QW 0ull @@ -14236,6 +14257,12 @@ static void complete_rsm_map_table(struct hfi1_devdata *dd, } } +/* Is a receive side mapping rule */ +static bool has_rsm_rule(struct hfi1_devdata *dd, u8 rule_index) +{ + return read_csr(dd, RCV_RSM_CFG + (8 * rule_index)) != 0; +} + /* * Add a receive side mapping rule. */ @@ -14472,39 +14499,49 @@ static void init_fecn_handling(struct hfi1_devdata *dd, rmt->used += total_cnt; } -/* Initialize RSM for VNIC */ -void hfi1_init_vnic_rsm(struct hfi1_devdata *dd) +static inline bool hfi1_is_rmt_full(int start, int spare) +{ + return (start + spare) > NUM_MAP_ENTRIES; +} + +static bool hfi1_netdev_update_rmt(struct hfi1_devdata *dd) { u8 i, j; u8 ctx_id = 0; u64 reg; u32 regoff; - struct rsm_rule_data rrd; + int rmt_start = dd->vnic.rmt_start; - if (hfi1_vnic_is_rsm_full(dd, NUM_VNIC_MAP_ENTRIES)) { - dd_dev_err(dd, "Vnic RSM disabled, rmt entries used = %d\n", - dd->vnic.rmt_start); - return; + /* We already have contexts mapped in RMT */ + if (has_rsm_rule(dd, RSM_INS_VNIC) || has_rsm_rule(dd, RSM_INS_AIP)) { + dd_dev_info(dd, "Contexts are already mapped in RMT\n"); + return true; + } + + if (hfi1_is_rmt_full(rmt_start, NUM_VNIC_MAP_ENTRIES)) { + dd_dev_err(dd, "Not enought RMT entries used = %d\n", + rmt_start); + return false; } - dev_dbg(&(dd)->pcidev->dev, "Vnic rsm start = %d, end %d\n", - dd->vnic.rmt_start, - dd->vnic.rmt_start + NUM_VNIC_MAP_ENTRIES); + dev_dbg(&(dd)->pcidev->dev, "RMT start = %d, end %d\n", + rmt_start, + rmt_start + NUM_VNIC_MAP_ENTRIES); /* Update RSM mapping table, 32 regs, 256 entries - 1 ctx per byte */ - regoff = RCV_RSM_MAP_TABLE + (dd->vnic.rmt_start / 8) * 8; + regoff = RCV_RSM_MAP_TABLE + (rmt_start / 8) * 8; reg = read_csr(dd, regoff); for (i = 0; i < NUM_VNIC_MAP_ENTRIES; i++) { - /* Update map register with vnic context */ - j = (dd->vnic.rmt_start + i) % 8; + /* Update map register with netdev context */ + j = (rmt_start + i) % 8; reg &= ~(0xffllu << (j * 8)); reg |= (u64)dd->vnic.ctxt[ctx_id++]->ctxt << (j * 8); - /* Wrap up vnic ctx index */ + /* Wrap up netdev ctx index */ ctx_id %= dd->vnic.num_ctxt; /* Write back map register */ if (j == 7 || ((i + 1) == NUM_VNIC_MAP_ENTRIES)) { dev_dbg(&(dd)->pcidev->dev, - "Vnic rsm map reg[%d] =0x%llx\n", + "RMT[%d] =0x%llx\n", regoff - RCV_RSM_MAP_TABLE, reg); write_csr(dd, regoff, reg); @@ -14514,35 +14551,83 @@ void hfi1_init_vnic_rsm(struct hfi1_devdata *dd) } } - /* Add rule for vnic */ - rrd.offset = dd->vnic.rmt_start; - rrd.pkt_type = 4; - /* Match 16B packets */ - rrd.field1_off = L2_TYPE_MATCH_OFFSET; - rrd.mask1 = L2_TYPE_MASK; - rrd.value1 = L2_16B_VALUE; - /* Match ETH L4 packets */ - rrd.field2_off = L4_TYPE_MATCH_OFFSET; - rrd.mask2 = L4_16B_TYPE_MASK; - rrd.value2 = L4_16B_ETH_VALUE; - /* Calc context from veswid and entropy */ - rrd.index1_off = L4_16B_HDR_VESWID_OFFSET; - rrd.index1_width = ilog2(NUM_VNIC_MAP_ENTRIES); - rrd.index2_off = L2_16B_ENTROPY_OFFSET; - rrd.index2_width = ilog2(NUM_VNIC_MAP_ENTRIES); - add_rsm_rule(dd, RSM_INS_VNIC, &rrd); - - /* Enable RSM if not already enabled */ + return true; +} + +static void hfi1_enable_rsm_rule(struct hfi1_devdata *dd, + int rule, struct rsm_rule_data *rrd) +{ + if (!hfi1_netdev_update_rmt(dd)) { + dd_dev_err(dd, "Failed to update RMT for RSM%d rule\n", rule); + return; + } + + add_rsm_rule(dd, rule, rrd); add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); } +void hfi1_init_aip_rsm(struct hfi1_devdata *dd) +{ + /* + * go through with the initialisation only if this rule actually doesn't + * exist yet + */ + if (atomic_fetch_inc(&dd->ipoib_rsm_usr_num) == 0) { + struct rsm_rule_data rrd = { + .offset = dd->vnic.rmt_start, + .pkt_type = IB_PACKET_TYPE, + .field1_off = LRH_BTH_MATCH_OFFSET, + .mask1 = LRH_BTH_MASK, + .value1 = LRH_BTH_VALUE, + .field2_off = BTH_DESTQP_MATCH_OFFSET, + .mask2 = BTH_DESTQP_MASK, + .value2 = BTH_DESTQP_VALUE, + .index1_off = DETH_AIP_SQPN_SELECT_OFFSET + + ilog2(NUM_VNIC_MAP_ENTRIES), + .index1_width = ilog2(NUM_VNIC_MAP_ENTRIES), + .index2_off = DETH_AIP_SQPN_SELECT_OFFSET, + .index2_width = ilog2(NUM_VNIC_MAP_ENTRIES) + }; + + hfi1_enable_rsm_rule(dd, RSM_INS_AIP, &rrd); + } +} + +/* Initialize RSM for VNIC */ +void hfi1_init_vnic_rsm(struct hfi1_devdata *dd) +{ + struct rsm_rule_data rrd = { + /* Add rule for vnic */ + .offset = dd->vnic.rmt_start, + .pkt_type = 4, + /* Match 16B packets */ + .field1_off = L2_TYPE_MATCH_OFFSET, + .mask1 = L2_TYPE_MASK, + .value1 = L2_16B_VALUE, + /* Match ETH L4 packets */ + .field2_off = L4_TYPE_MATCH_OFFSET, + .mask2 = L4_16B_TYPE_MASK, + .value2 = L4_16B_ETH_VALUE, + /* Calc context from veswid and entropy */ + .index1_off = L4_16B_HDR_VESWID_OFFSET, + .index1_width = ilog2(NUM_VNIC_MAP_ENTRIES), + .index2_off = L2_16B_ENTROPY_OFFSET, + .index2_width = ilog2(NUM_VNIC_MAP_ENTRIES) + }; + + hfi1_enable_rsm_rule(dd, RSM_INS_VNIC, &rrd); +} + void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd) { clear_rsm_rule(dd, RSM_INS_VNIC); +} - /* Disable RSM if used only by vnic */ - if (dd->vnic.rmt_start == 0) - clear_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); +void hfi1_deinit_aip_rsm(struct hfi1_devdata *dd) +{ + /* only actually clear the rule if it's the last user asking to do so */ + if (atomic_fetch_add_unless(&dd->ipoib_rsm_usr_num, -1, 0) == 1) + clear_rsm_rule(dd, RSM_INS_AIP); } static int init_rxe(struct hfi1_devdata *dd) diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 725509261016..b10e0bf6d8c0 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -1,7 +1,7 @@ #ifndef _CHIP_H #define _CHIP_H /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -1455,6 +1455,8 @@ void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr); void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr); void reset_interrupts(struct hfi1_devdata *dd); u8 hfi1_get_qp_map(struct hfi1_devdata *dd, u8 idx); +void hfi1_init_aip_rsm(struct hfi1_devdata *dd); +void hfi1_deinit_aip_rsm(struct hfi1_devdata *dd); /* * Interrupt source table. diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index ed13051d38da..c61e56a34cb8 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1417,12 +1417,10 @@ struct hfi1_devdata { struct hfi1_vnic_data vnic; /* Lock to protect IRQ SRC register access */ spinlock_t irq_src_lock; -}; -static inline bool hfi1_vnic_is_rsm_full(struct hfi1_devdata *dd, int spare) -{ - return (dd->vnic.rmt_start + spare) > NUM_MAP_ENTRIES; -} + /* Keeps track of IPoIB RSM rule users */ + atomic_t ipoib_rsm_usr_num; +}; /* 8051 firmware version helper */ #define dc8051_ver(a, b, c) ((a) << 16 | (b) << 8 | (c)) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 3759d9233a1c..8c6b96a660a6 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -1316,6 +1316,7 @@ static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, goto bail; } + atomic_set(&dd->ipoib_rsm_usr_num, 0); return dd; bail: -- cgit v1.2.3 From 6d72344cf6c47010cc2055a832e16c7fcdd16f82 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 11 May 2020 12:06:18 -0400 Subject: IB/ipoib: Increase ipoib Datagram mode MTU's upper limit Currently the ipoib UD mtu is restricted to 4K bytes. Remove this limitation so that the IPOIB module can potentially use an MTU (in UD mode) that is bounded by the MTU of the underlying device. A field is added to the ib_port_attr structure to indicate the maximum physical MTU the underlying device supports. Link: https://lore.kernel.org/r/20200511160618.173205.23053.stgit@awfm-01.aw.intel.com Reviewed-by: Dennis Dalessandro Reviewed-by: Mike Marciniszyn Signed-off-by: Sadanand Warrier Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 18 +----- drivers/infiniband/hw/hfi1/verbs.c | 2 + drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 11 ++-- include/rdma/ib_verbs.h | 77 ++++++++++++++++++++++++++ include/rdma/opa_port_info.h | 10 +--- 6 files changed, 88 insertions(+), 32 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index f8e733aa3bb8..0c2ae9f7b3e8 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2019 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -186,15 +186,6 @@ static void flush_iowait(struct rvt_qp *qp) write_sequnlock_irqrestore(lock, flags); } -static inline int opa_mtu_enum_to_int(int mtu) -{ - switch (mtu) { - case OPA_MTU_8192: return 8192; - case OPA_MTU_10240: return 10240; - default: return -1; - } -} - /** * This function is what we would push to the core layer if we wanted to be a * "first class citizen". Instead we hide this here and rely on Verbs ULPs @@ -202,15 +193,10 @@ static inline int opa_mtu_enum_to_int(int mtu) */ static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu) { - int val; - /* Constraining 10KB packets to 8KB packets */ if (mtu == (enum ib_mtu)OPA_MTU_10240) mtu = OPA_MTU_8192; - val = opa_mtu_enum_to_int((int)mtu); - if (val > 0) - return val; - return ib_mtu_enum_to_int(mtu); + return opa_mtu_enum_to_int((enum opa_mtu)mtu); } int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index c61b2916d420..19d5d0061b01 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1439,6 +1439,8 @@ static int query_port(struct rvt_dev_info *rdi, u8 port_num, 4096 : hfi1_max_mtu), IB_MTU_4096); props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : mtu_to_enum(ppd->ibmtu, IB_MTU_4096); + props->phys_mtu = HFI1_CAP_IS_KSET(AIP) ? hfi1_max_mtu : + ib_mtu_enum_to_int(props->max_mtu); return 0; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index d4c6a97ce4c0..22216f181b24 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1855,7 +1855,7 @@ static int ipoib_parent_init(struct net_device *ndev) priv->port); return result; } - priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); + priv->max_ib_mtu = rdma_mtu_from_attr(priv->ca, priv->port, &attr); result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); if (result) { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index b9e9562f5034..7166ee9b7a25 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -218,6 +218,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, struct rdma_ah_attr av; int ret; int set_qkey = 0; + int mtu; mcast->mcmember = *mcmember; @@ -240,13 +241,11 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, priv->broadcast->mcmember.flow_label = mcmember->flow_label; priv->broadcast->mcmember.hop_limit = mcmember->hop_limit; /* assume if the admin and the mcast are the same both can be changed */ + mtu = rdma_mtu_enum_to_int(priv->ca, priv->port, + priv->broadcast->mcmember.mtu); if (priv->mcast_mtu == priv->admin_mtu) - priv->admin_mtu = - priv->mcast_mtu = - IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); - else - priv->mcast_mtu = - IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu)); + priv->admin_mtu = IPOIB_UD_MTU(mtu); + priv->mcast_mtu = IPOIB_UD_MTU(mtu); priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6278e4e040fc..641f4751b062 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -462,6 +462,11 @@ enum ib_mtu { IB_MTU_4096 = 5 }; +enum opa_mtu { + OPA_MTU_8192 = 6, + OPA_MTU_10240 = 7 +}; + static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) { switch (mtu) { @@ -488,6 +493,28 @@ static inline enum ib_mtu ib_mtu_int_to_enum(int mtu) return IB_MTU_256; } +static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) +{ + switch (mtu) { + case OPA_MTU_8192: + return 8192; + case OPA_MTU_10240: + return 10240; + default: + return(ib_mtu_enum_to_int((enum ib_mtu)mtu)); + } +} + +static inline enum opa_mtu opa_mtu_int_to_enum(int mtu) +{ + if (mtu >= 10240) + return OPA_MTU_10240; + else if (mtu >= 8192) + return OPA_MTU_8192; + else + return ((enum opa_mtu)ib_mtu_int_to_enum(mtu)); +} + enum ib_port_state { IB_PORT_NOP = 0, IB_PORT_DOWN = 1, @@ -651,6 +678,7 @@ struct ib_port_attr { enum ib_port_state state; enum ib_mtu max_mtu; enum ib_mtu active_mtu; + u32 phys_mtu; int gid_tbl_len; unsigned int ip_gids:1; /* This is the value from PortInfo CapabilityMask, defined by IBA */ @@ -3364,6 +3392,55 @@ static inline unsigned int rdma_find_pg_bit(unsigned long addr, return __fls(pgsz); } +/** + * rdma_core_cap_opa_port - Return whether the RDMA Port is OPA or not. + * @device: Device + * @port_num: 1 based Port number + * + * Return true if port is an Intel OPA port , false if not + */ +static inline bool rdma_core_cap_opa_port(struct ib_device *device, + u32 port_num) +{ + return (device->port_data[port_num].immutable.core_cap_flags & + RDMA_CORE_PORT_INTEL_OPA) == RDMA_CORE_PORT_INTEL_OPA; +} + +/** + * rdma_mtu_enum_to_int - Return the mtu of the port as an integer value. + * @device: Device + * @port_num: Port number + * @mtu: enum value of MTU + * + * Return the MTU size supported by the port as an integer value. Will return + * -1 if enum value of mtu is not supported. + */ +static inline int rdma_mtu_enum_to_int(struct ib_device *device, u8 port, + int mtu) +{ + if (rdma_core_cap_opa_port(device, port)) + return opa_mtu_enum_to_int((enum opa_mtu)mtu); + else + return ib_mtu_enum_to_int((enum ib_mtu)mtu); +} + +/** + * rdma_mtu_from_attr - Return the mtu of the port from the port attribute. + * @device: Device + * @port_num: Port number + * @attr: port attribute + * + * Return the MTU size supported by the port as an integer value. + */ +static inline int rdma_mtu_from_attr(struct ib_device *device, u8 port, + struct ib_port_attr *attr) +{ + if (rdma_core_cap_opa_port(device, port)) + return attr->phys_mtu; + else + return ib_mtu_enum_to_int(attr->max_mtu); +} + int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port, int state); int ib_get_vf_config(struct ib_device *device, int vf, u8 port, diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h index bdbfe25d3854..0d9e6d74c385 100644 --- a/include/rdma/opa_port_info.h +++ b/include/rdma/opa_port_info.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2017 Intel Corporation. All rights reserved. + * Copyright (c) 2014-2020 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -139,14 +139,6 @@ #define OPA_CAP_MASK3_IsVLMarkerSupported (1 << 1) #define OPA_CAP_MASK3_IsVLrSupported (1 << 0) -/** - * new MTU values - */ -enum { - OPA_MTU_8192 = 6, - OPA_MTU_10240 = 7, -}; - enum { OPA_PORT_PHYS_CONF_DISCONNECTED = 0, OPA_PORT_PHYS_CONF_STANDARD = 1, -- cgit v1.2.3 From 89dcaa366bffb9fcef39b97d08cc26d0a115ee35 Mon Sep 17 00:00:00 2001 From: Grzegorz Andrejczuk Date: Mon, 11 May 2020 12:06:25 -0400 Subject: IB/hfi1: Rename num_vnic_contexts as num_netdev_contexts Rename num_vnic_contexts as num_ndetdev_contexts since VNIC and ipoib will share the same set of receive contexts. Link: https://lore.kernel.org/r/20200511160625.173205.53306.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Sadanand Warrier Signed-off-by: Grzegorz Andrejczuk Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 24 ++++++++++++------------ drivers/infiniband/hw/hfi1/hfi.h | 4 ++-- drivers/infiniband/hw/hfi1/msix.c | 4 ++-- drivers/infiniband/hw/hfi1/vnic_main.c | 8 ++++---- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index be1fb29a1f85..07eec3544304 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -13347,12 +13347,12 @@ static int set_up_interrupts(struct hfi1_devdata *dd) * in array of contexts * freectxts - number of free user contexts * num_send_contexts - number of PIO send contexts being used - * num_vnic_contexts - number of contexts reserved for VNIC + * num_netdev_contexts - number of contexts reserved for netdev */ static int set_up_context_variables(struct hfi1_devdata *dd) { unsigned long num_kernel_contexts; - u16 num_vnic_contexts = HFI1_NUM_VNIC_CTXT; + u16 num_netdev_contexts = HFI1_NUM_VNIC_CTXT; int total_contexts; int ret; unsigned ngroups; @@ -13391,11 +13391,11 @@ static int set_up_context_variables(struct hfi1_devdata *dd) } /* Accommodate VNIC contexts if possible */ - if ((num_kernel_contexts + num_vnic_contexts) > rcv_contexts) { + if ((num_kernel_contexts + num_netdev_contexts) > rcv_contexts) { dd_dev_err(dd, "No receive contexts available for VNIC\n"); - num_vnic_contexts = 0; + num_netdev_contexts = 0; } - total_contexts = num_kernel_contexts + num_vnic_contexts; + total_contexts = num_kernel_contexts + num_netdev_contexts; /* * User contexts: @@ -13422,15 +13422,15 @@ static int set_up_context_variables(struct hfi1_devdata *dd) * The RMT entries are currently allocated as shown below: * 1. QOS (0 to 128 entries); * 2. FECN (num_kernel_context - 1 + num_user_contexts + - * num_vnic_contexts); - * 3. VNIC (num_vnic_contexts). - * It should be noted that FECN oversubscribe num_vnic_contexts - * entries of RMT because both VNIC and PSM could allocate any receive + * num_netdev_contexts); + * 3. netdev (num_netdev_contexts). + * It should be noted that FECN oversubscribe num_netdev_contexts + * entries of RMT because both netdev and PSM could allocate any receive * context between dd->first_dyn_alloc_text and dd->num_rcv_contexts, * and PSM FECN must reserve an RMT entry for each possible PSM receive * context. */ - rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_vnic_contexts * 2); + rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_netdev_contexts * 2); if (HFI1_CAP_IS_KSET(TID_RDMA)) rmt_count += num_kernel_contexts - 1; if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) { @@ -13449,7 +13449,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd) dd->num_rcv_contexts = total_contexts; dd->n_krcv_queues = num_kernel_contexts; dd->first_dyn_alloc_ctxt = num_kernel_contexts; - dd->num_vnic_contexts = num_vnic_contexts; + dd->num_netdev_contexts = num_netdev_contexts; dd->num_user_contexts = n_usr_ctxts; dd->freectxts = n_usr_ctxts; dd_dev_info(dd, @@ -13457,7 +13457,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd) rcv_contexts, (int)dd->num_rcv_contexts, (int)dd->n_krcv_queues, - dd->num_vnic_contexts, + dd->num_netdev_contexts, dd->num_user_contexts); /* diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index c61e56a34cb8..5a9276c4c188 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1167,8 +1167,8 @@ struct hfi1_devdata { u64 z_send_schedule; u64 __percpu *send_schedule; - /* number of reserved contexts for VNIC usage */ - u16 num_vnic_contexts; + /* number of reserved contexts for netdev usage */ + u16 num_netdev_contexts; /* number of receive contexts in use by the driver */ u32 num_rcv_contexts; /* number of pio send contexts in use by the driver */ diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c index db82db497b2c..7574f2bc9718 100644 --- a/drivers/infiniband/hw/hfi1/msix.c +++ b/drivers/infiniband/hw/hfi1/msix.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* - * Copyright(c) 2018 Intel Corporation. + * Copyright(c) 2018 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -69,7 +69,7 @@ int msix_initialize(struct hfi1_devdata *dd) * one for each VNIC context * ...any new IRQs should be added here. */ - total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts; + total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_netdev_contexts; if (total >= CCE_NUM_MSIX_VECTORS) return -EINVAL; diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c index 6b14581b9965..db7624cacee1 100644 --- a/drivers/infiniband/hw/hfi1/vnic_main.c +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2017 - 2018 Intel Corporation. + * Copyright(c) 2017 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -804,7 +804,7 @@ struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device, struct rdma_netdev *rn; int i, size, rc; - if (!dd->num_vnic_contexts) + if (!dd->num_netdev_contexts) return ERR_PTR(-ENOMEM); if (!port_num || (port_num > dd->num_pports)) @@ -815,7 +815,7 @@ struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device, size = sizeof(struct opa_vnic_rdma_netdev) + sizeof(*vinfo); netdev = alloc_netdev_mqs(size, name, name_assign_type, setup, - dd->num_sdma, dd->num_vnic_contexts); + dd->num_sdma, dd->num_netdev_contexts); if (!netdev) return ERR_PTR(-ENOMEM); @@ -823,7 +823,7 @@ struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device, vinfo = opa_vnic_dev_priv(netdev); vinfo->dd = dd; vinfo->num_tx_q = dd->num_sdma; - vinfo->num_rx_q = dd->num_vnic_contexts; + vinfo->num_rx_q = dd->num_netdev_contexts; vinfo->netdev = netdev; rn->free_rdma_netdev = hfi1_vnic_free_rn; rn->set_id = hfi1_vnic_set_vesw_id; -- cgit v1.2.3 From 6991abcb993cf6c0711237b9d393d4f0a2008f1f Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 11 May 2020 12:06:31 -0400 Subject: IB/hfi1: Add functions to receive accelerated ipoib packets Ipoib netdev will share receive contexts with existing VNIC netdev. To achieve that, a dummy netdev is allocated with hfi1_devdata to own the receive contexts, and ipoib and VNIC netdevs will be put on top of it. Each receive context is associated with a single NAPI object. This patch adds the functions to receive incoming packets for accelerated ipoib. Link: https://lore.kernel.org/r/20200511160631.173205.54184.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Sadanand Warrier Signed-off-by: Grzegorz Andrejczuk Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/Makefile | 2 + drivers/infiniband/hw/hfi1/driver.c | 92 +++++++++++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/hfi.h | 5 +- drivers/infiniband/hw/hfi1/ipoib.h | 18 +++++++ drivers/infiniband/hw/hfi1/ipoib_rx.c | 71 ++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/netdev.h | 90 +++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/netdev_rx.c | 79 +++++++++++++++++++++++++++++ 7 files changed, 355 insertions(+), 2 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/ipoib_rx.c create mode 100644 drivers/infiniband/hw/hfi1/netdev.h create mode 100644 drivers/infiniband/hw/hfi1/netdev_rx.c diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 0b2571306267..2e89ec10efed 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -23,10 +23,12 @@ hfi1-y := \ intr.o \ iowait.o \ ipoib_main.o \ + ipoib_rx.o \ ipoib_tx.o \ mad.o \ mmu_rb.o \ msix.o \ + netdev_rx.o \ opfn.o \ pcie.o \ pio.o \ diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 049d15befe58..c5ed6ed30100 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2018 Intel Corporation. + * Copyright(c) 2015-2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -54,6 +54,7 @@ #include #include #include +#include #include "hfi.h" #include "trace.h" @@ -63,6 +64,9 @@ #include "vnic.h" #include "fault.h" +#include "ipoib.h" +#include "netdev.h" + #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -1550,6 +1554,81 @@ void handle_eflags(struct hfi1_packet *packet) show_eflags_errs(packet); } +static void hfi1_ipoib_ib_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp; + struct net_device *netdev; + struct hfi1_ctxtdata *rcd = packet->rcd; + struct napi_struct *napi = rcd->napi; + struct sk_buff *skb; + struct hfi1_netdev_rxq *rxq = container_of(napi, + struct hfi1_netdev_rxq, napi); + u32 extra_bytes; + u32 tlen, qpnum; + bool do_work, do_cnp; + struct hfi1_ipoib_dev_priv *priv; + + trace_hfi1_rcvhdr(packet); + + hfi1_setup_ib_header(packet); + + packet->ohdr = &((struct ib_header *)packet->hdr)->u.oth; + packet->grh = NULL; + + if (unlikely(rhf_err_flags(packet->rhf))) { + handle_eflags(packet); + return; + } + + qpnum = ib_bth_get_qpn(packet->ohdr); + netdev = hfi1_netdev_get_data(rcd->dd, qpnum); + if (!netdev) + goto drop_no_nd; + + trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + + /* handle congestion notifications */ + do_work = hfi1_may_ecn(packet); + if (unlikely(do_work)) { + do_cnp = (packet->opcode != IB_OPCODE_CNP); + (void)hfi1_process_ecn_slowpath(hfi1_ipoib_priv(netdev)->qp, + packet, do_cnp); + } + + /* + * We have split point after last byte of DETH + * lets strip padding and CRC and ICRC. + * tlen is whole packet len so we need to + * subtract header size as well. + */ + tlen = packet->tlen; + extra_bytes = ib_bth_get_pad(packet->ohdr) + (SIZE_OF_CRC << 2) + + packet->hlen; + if (unlikely(tlen < extra_bytes)) + goto drop; + + tlen -= extra_bytes; + + skb = hfi1_ipoib_prepare_skb(rxq, tlen, packet->ebuf); + if (unlikely(!skb)) + goto drop; + + priv = hfi1_ipoib_priv(netdev); + hfi1_ipoib_update_rx_netstats(priv, 1, skb->len); + + skb->dev = netdev; + skb->pkt_type = PACKET_HOST; + netif_receive_skb(skb); + + return; + +drop: + ++netdev->stats.rx_dropped; +drop_no_nd: + ibp = rcd_to_iport(packet->rcd); + ++ibp->rvp.n_pkt_drops; +} + /* * The following functions are called by the interrupt handler. They are type * specific handlers for each packet type. @@ -1757,3 +1836,14 @@ const rhf_rcv_function_ptr normal_rhf_rcv_functions[] = { [RHF_RCV_TYPE_INVALID6] = process_receive_invalid, [RHF_RCV_TYPE_INVALID7] = process_receive_invalid, }; + +const rhf_rcv_function_ptr netdev_rhf_rcv_functions[] = { + [RHF_RCV_TYPE_EXPECTED] = process_receive_invalid, + [RHF_RCV_TYPE_EAGER] = process_receive_invalid, + [RHF_RCV_TYPE_IB] = hfi1_ipoib_ib_rcv, + [RHF_RCV_TYPE_ERROR] = process_receive_error, + [RHF_RCV_TYPE_BYPASS] = hfi1_vnic_bypass_rcv, + [RHF_RCV_TYPE_INVALID5] = process_receive_invalid, + [RHF_RCV_TYPE_INVALID6] = process_receive_invalid, + [RHF_RCV_TYPE_INVALID7] = process_receive_invalid, +}; diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 5a9276c4c188..c7d0aad41f41 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -233,6 +233,8 @@ struct hfi1_ctxtdata { intr_handler fast_handler; /** slow handler */ intr_handler slow_handler; + /* napi pointer assiociated with netdev */ + struct napi_struct *napi; /* verbs rx_stats per rcd */ struct hfi1_opcode_stats_perctx *opstats; /* clear interrupt mask */ @@ -985,7 +987,7 @@ typedef void (*hfi1_make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps, struct rvt_swqe *wqe); extern const rhf_rcv_function_ptr normal_rhf_rcv_functions[]; - +extern const rhf_rcv_function_ptr netdev_rhf_rcv_functions[]; /* return values for the RHF receive functions */ #define RHF_RCV_CONTINUE 0 /* keep going */ @@ -1417,6 +1419,7 @@ struct hfi1_devdata { struct hfi1_vnic_data vnic; /* Lock to protect IRQ SRC register access */ spinlock_t irq_src_lock; + struct net_device *dummy_netdev; /* Keeps track of IPoIB RSM rule users */ atomic_t ipoib_rsm_usr_num; diff --git a/drivers/infiniband/hw/hfi1/ipoib.h b/drivers/infiniband/hw/hfi1/ipoib.h index c2e63ca57896..ca00f6c6a90d 100644 --- a/drivers/infiniband/hw/hfi1/ipoib.h +++ b/drivers/infiniband/hw/hfi1/ipoib.h @@ -22,6 +22,7 @@ #include "hfi.h" #include "iowait.h" +#include "netdev.h" #include @@ -29,6 +30,7 @@ #define HFI1_IPOIB_TXREQ_NAME_LEN 32 +#define HFI1_IPOIB_PSEUDO_LEN 20 #define HFI1_IPOIB_ENCAP_LEN 4 struct hfi1_ipoib_dev_priv; @@ -118,6 +120,19 @@ hfi1_ipoib_priv(const struct net_device *dev) return &((struct hfi1_ipoib_rdma_netdev *)netdev_priv(dev))->dev_priv; } +static inline void +hfi1_ipoib_update_rx_netstats(struct hfi1_ipoib_dev_priv *priv, + u64 packets, + u64 bytes) +{ + struct pcpu_sw_netstats *netstats = this_cpu_ptr(priv->netstats); + + u64_stats_update_begin(&netstats->syncp); + netstats->rx_packets += packets; + netstats->rx_bytes += bytes; + u64_stats_update_end(&netstats->syncp); +} + static inline void hfi1_ipoib_update_tx_netstats(struct hfi1_ipoib_dev_priv *priv, u64 packets, @@ -142,6 +157,9 @@ void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv); void hfi1_ipoib_napi_tx_enable(struct net_device *dev); void hfi1_ipoib_napi_tx_disable(struct net_device *dev); +struct sk_buff *hfi1_ipoib_prepare_skb(struct hfi1_netdev_rxq *rxq, + int size, void *data); + int hfi1_ipoib_rn_get_params(struct ib_device *device, u8 port_num, enum rdma_netdev_t type, diff --git a/drivers/infiniband/hw/hfi1/ipoib_rx.c b/drivers/infiniband/hw/hfi1/ipoib_rx.c new file mode 100644 index 000000000000..2485663032c7 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ipoib_rx.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +#include "netdev.h" +#include "ipoib.h" + +#define HFI1_IPOIB_SKB_PAD ((NET_SKB_PAD) + (NET_IP_ALIGN)) + +static void copy_ipoib_buf(struct sk_buff *skb, void *data, int size) +{ + void *dst_data; + + skb_checksum_none_assert(skb); + skb->protocol = *((__be16 *)data); + + dst_data = skb_put(skb, size); + memcpy(dst_data, data, size); + skb->mac_header = HFI1_IPOIB_PSEUDO_LEN; + skb_pull(skb, HFI1_IPOIB_ENCAP_LEN); +} + +static struct sk_buff *prepare_frag_skb(struct napi_struct *napi, int size) +{ + struct sk_buff *skb; + int skb_size = SKB_DATA_ALIGN(size + HFI1_IPOIB_SKB_PAD); + void *frag; + + skb_size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + skb_size = SKB_DATA_ALIGN(skb_size); + frag = napi_alloc_frag(skb_size); + + if (unlikely(!frag)) + return napi_alloc_skb(napi, size); + + skb = build_skb(frag, skb_size); + + if (unlikely(!skb)) { + skb_free_frag(frag); + return NULL; + } + + skb_reserve(skb, HFI1_IPOIB_SKB_PAD); + return skb; +} + +struct sk_buff *hfi1_ipoib_prepare_skb(struct hfi1_netdev_rxq *rxq, + int size, void *data) +{ + struct napi_struct *napi = &rxq->napi; + int skb_size = size + HFI1_IPOIB_ENCAP_LEN; + struct sk_buff *skb; + + /* + * For smaller(4k + skb overhead) allocations we will go using + * napi cache. Otherwise we will try to use napi frag cache. + */ + if (size <= SKB_WITH_OVERHEAD(PAGE_SIZE)) + skb = napi_alloc_skb(napi, skb_size); + else + skb = prepare_frag_skb(napi, skb_size); + + if (unlikely(!skb)) + return NULL; + + copy_ipoib_buf(skb, data, size); + + return skb; +} diff --git a/drivers/infiniband/hw/hfi1/netdev.h b/drivers/infiniband/hw/hfi1/netdev.h new file mode 100644 index 000000000000..8992dfe11e3e --- /dev/null +++ b/drivers/infiniband/hw/hfi1/netdev.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +#ifndef HFI1_NETDEV_H +#define HFI1_NETDEV_H + +#include "hfi.h" + +#include +#include + +/** + * struct hfi1_netdev_rxq - Receive Queue for HFI + * dummy netdev. Both IPoIB and VNIC netdevices will be working on + * top of this device. + * @napi: napi object + * @priv: ptr to netdev_priv + * @rcd: ptr to receive context data + */ +struct hfi1_netdev_rxq { + struct napi_struct napi; + struct hfi1_netdev_priv *priv; + struct hfi1_ctxtdata *rcd; +}; + +/* + * Number of netdev contexts used. Ensure it is less than or equal to + * max queues supported by VNIC (HFI1_VNIC_MAX_QUEUE). + */ +#define HFI1_MAX_NETDEV_CTXTS 8 + +/* Number of NETDEV RSM entries */ +#define NUM_NETDEV_MAP_ENTRIES HFI1_MAX_NETDEV_CTXTS + +/** + * struct hfi1_netdev_priv: data required to setup and run HFI netdev. + * @dd: hfi1_devdata + * @rxq: pointer to dummy netdev receive queues. + * @num_rx_q: number of receive queues + * @rmt_index: first free index in RMT Array + * @msix_start: first free MSI-X interrupt vector. + * @dev_tbl: netdev table for unique identifier VNIC and IPoIb VLANs. + * @enabled: atomic counter of netdevs enabling receive queues. + * When 0 NAPI will be disabled. + * @netdevs: atomic counter of netdevs using dummy netdev. + * When 0 receive queues will be freed. + */ +struct hfi1_netdev_priv { + struct hfi1_devdata *dd; + struct hfi1_netdev_rxq *rxq; + int num_rx_q; + int rmt_start; + struct xarray dev_tbl; + /* count of enabled napi polls */ + atomic_t enabled; + /* count of netdevs on top */ + atomic_t netdevs; +}; + +static inline +struct hfi1_netdev_priv *hfi1_netdev_priv(struct net_device *dev) +{ + return (struct hfi1_netdev_priv *)&dev[1]; +} + +static inline +int hfi1_netdev_ctxt_count(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + return priv->num_rx_q; +} + +static inline +struct hfi1_ctxtdata *hfi1_netdev_get_ctxt(struct hfi1_devdata *dd, int ctxt) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + return priv->rxq[ctxt].rcd; +} + +int hfi1_netdev_add_data(struct hfi1_devdata *dd, int id, void *data); +void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id); +void *hfi1_netdev_get_data(struct hfi1_devdata *dd, int id); +void *hfi1_netdev_get_first_data(struct hfi1_devdata *dd, int *start_id); + +#endif /* HFI1_NETDEV_H */ diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c new file mode 100644 index 000000000000..3e286cbaa8c6 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2020 Intel Corporation. + * + */ + +/* + * This file contains HFI1 support for netdev RX functionality + */ + +#include "sdma.h" +#include "verbs.h" +#include "netdev.h" +#include "hfi.h" + +#include +#include +#include + +/** + * hfi1_netdev_add_data - Registers data with unique identifier + * to be requested later this is needed for VNIC and IPoIB VLANs + * implementations. + * This call is protected by mutex idr_lock. + * + * @dd: hfi1 dev data + * @id: requested integer id up to INT_MAX + * @data: data to be associated with index + */ +int hfi1_netdev_add_data(struct hfi1_devdata *dd, int id, void *data) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + return xa_insert(&priv->dev_tbl, id, data, GFP_NOWAIT); +} + +/** + * hfi1_netdev_remove_data - Removes data with previously given id. + * Returns the reference to removed entry. + * + * @dd: hfi1 dev data + * @id: requested integer id up to INT_MAX + */ +void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + return xa_erase(&priv->dev_tbl, id); +} + +/** + * hfi1_netdev_get_data - Gets data with given id + * + * @dd: hfi1 dev data + * @id: requested integer id up to INT_MAX + */ +void *hfi1_netdev_get_data(struct hfi1_devdata *dd, int id) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + return xa_load(&priv->dev_tbl, id); +} + +/** + * hfi1_netdev_get_first_dat - Gets first entry with greater or equal id. + * + * @dd: hfi1 dev data + * @id: requested integer id up to INT_MAX + */ +void *hfi1_netdev_get_first_data(struct hfi1_devdata *dd, int *start_id) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + unsigned long index = *start_id; + void *ret; + + ret = xa_find(&priv->dev_tbl, &index, UINT_MAX, XA_PRESENT); + *start_id = (int)index; + return ret; +} -- cgit v1.2.3 From 0bae02d56bba6cc3836a9d8dfbbe53787af19a58 Mon Sep 17 00:00:00 2001 From: Grzegorz Andrejczuk Date: Mon, 11 May 2020 12:06:37 -0400 Subject: IB/hfi1: Add interrupt handler functions for accelerated ipoib This patch adds the interrupt handler function, the NAPI poll function, and its associated helper functions for receiving accelerated ipoib packets. While we are here, fix the formats of two error printouts. Link: https://lore.kernel.org/r/20200511160637.173205.64890.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Sadanand Warrier Signed-off-by: Grzegorz Andrejczuk Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/affinity.c | 12 +++- drivers/infiniband/hw/hfi1/affinity.h | 3 +- drivers/infiniband/hw/hfi1/chip.c | 44 +++++++++++++ drivers/infiniband/hw/hfi1/chip.h | 1 + drivers/infiniband/hw/hfi1/driver.c | 120 ++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/hfi.h | 4 +- drivers/infiniband/hw/hfi1/init.c | 1 + drivers/infiniband/hw/hfi1/msix.c | 20 +++++- drivers/infiniband/hw/hfi1/msix.h | 5 +- drivers/infiniband/hw/hfi1/netdev.h | 3 + 10 files changed, 206 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 1aeea5d65c01..2a91b8d95e12 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -64,6 +64,7 @@ struct hfi1_affinity_node_list node_affinity = { static const char * const irq_type_names[] = { "SDMA", "RCVCTXT", + "NETDEVCTXT", "GENERAL", "OTHER", }; @@ -915,6 +916,11 @@ static int get_irq_affinity(struct hfi1_devdata *dd, set = &entry->rcv_intr; scnprintf(extra, 64, "ctxt %u", rcd->ctxt); break; + case IRQ_NETDEVCTXT: + rcd = (struct hfi1_ctxtdata *)msix->arg; + set = &entry->def_intr; + scnprintf(extra, 64, "ctxt %u", rcd->ctxt); + break; default: dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type); return -EINVAL; @@ -987,6 +993,10 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd, if (rcd->ctxt != HFI1_CTRL_CTXT) set = &entry->rcv_intr; break; + case IRQ_NETDEVCTXT: + rcd = (struct hfi1_ctxtdata *)msix->arg; + set = &entry->def_intr; + break; default: mutex_unlock(&node_affinity.lock); return; diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h index 6a7e6ea4e426..f94ed5d7c7a3 100644 --- a/drivers/infiniband/hw/hfi1/affinity.h +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -52,6 +52,7 @@ enum irq_type { IRQ_SDMA, IRQ_RCVCTXT, + IRQ_NETDEVCTXT, IRQ_GENERAL, IRQ_OTHER }; diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 07eec3544304..2117612c61b2 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -66,6 +66,7 @@ #include "affinity.h" #include "debugfs.h" #include "fault.h" +#include "netdev.h" uint num_vls = HFI1_MAX_VLS_SUPPORTED; module_param(num_vls, uint, S_IRUGO); @@ -8480,6 +8481,49 @@ static void hfi1_rcd_eoi_intr(struct hfi1_ctxtdata *rcd) local_irq_restore(flags); } +/** + * hfi1_netdev_rx_napi - napi poll function to move eoi inline + * @napi - pointer to napi object + * @budget - netdev budget + */ +int hfi1_netdev_rx_napi(struct napi_struct *napi, int budget) +{ + struct hfi1_netdev_rxq *rxq = container_of(napi, + struct hfi1_netdev_rxq, napi); + struct hfi1_ctxtdata *rcd = rxq->rcd; + int work_done = 0; + + work_done = rcd->do_interrupt(rcd, budget); + + if (work_done < budget) { + napi_complete_done(napi, work_done); + hfi1_rcd_eoi_intr(rcd); + } + + return work_done; +} + +/* Receive packet napi handler for netdevs VNIC and AIP */ +irqreturn_t receive_context_interrupt_napi(int irq, void *data) +{ + struct hfi1_ctxtdata *rcd = data; + + receive_interrupt_common(rcd); + + if (likely(rcd->napi)) { + if (likely(napi_schedule_prep(rcd->napi))) + __napi_schedule_irqoff(rcd->napi); + else + __hfi1_rcd_eoi_intr(rcd); + } else { + WARN_ONCE(1, "Napi IRQ handler without napi set up ctxt=%d\n", + rcd->ctxt); + __hfi1_rcd_eoi_intr(rcd); + } + + return IRQ_HANDLED; +} + /* * Receive packet IRQ handler. This routine expects to be on its own IRQ. * This routine will try to handle packets immediately (latency), but if diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index b10e0bf6d8c0..2c6f2de74d4d 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -1447,6 +1447,7 @@ irqreturn_t general_interrupt(int irq, void *data); irqreturn_t sdma_interrupt(int irq, void *data); irqreturn_t receive_context_interrupt(int irq, void *data); irqreturn_t receive_context_thread(int irq, void *data); +irqreturn_t receive_context_interrupt_napi(int irq, void *data); int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set); void init_qsfp_int(struct hfi1_devdata *dd); diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index c5ed6ed30100..d89fc8fdff6a 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -752,6 +752,39 @@ static noinline int skip_rcv_packet(struct hfi1_packet *packet, int thread) return ret; } +static void process_rcv_packet_napi(struct hfi1_packet *packet) +{ + packet->etype = rhf_rcv_type(packet->rhf); + + /* total length */ + packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */ + /* retrieve eager buffer details */ + packet->etail = rhf_egr_index(packet->rhf); + packet->ebuf = get_egrbuf(packet->rcd, packet->rhf, + &packet->updegr); + /* + * Prefetch the contents of the eager buffer. It is + * OK to send a negative length to prefetch_range(). + * The +2 is the size of the RHF. + */ + prefetch_range(packet->ebuf, + packet->tlen - ((packet->rcd->rcvhdrqentsize - + (rhf_hdrq_offset(packet->rhf) + + 2)) * 4)); + + packet->rcd->rhf_rcv_function_map[packet->etype](packet); + packet->numpkt++; + + /* Set up for the next packet */ + packet->rhqoff += packet->rsize; + if (packet->rhqoff >= packet->maxcnt) + packet->rhqoff = 0; + + packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff + + packet->rcd->rhf_offset; + packet->rhf = rhf_to_cpu(packet->rhf_addr); +} + static inline int process_rcv_packet(struct hfi1_packet *packet, int thread) { int ret; @@ -830,6 +863,36 @@ static inline void finish_packet(struct hfi1_packet *packet) packet->etail, rcv_intr_dynamic, packet->numpkt); } +/* + * handle_receive_interrupt_napi_fp - receive a packet + * @rcd: the context + * @budget: polling budget + * + * Called from interrupt handler for receive interrupt. + * This is the fast path interrupt handler + * when executing napi soft irq environment. + */ +int handle_receive_interrupt_napi_fp(struct hfi1_ctxtdata *rcd, int budget) +{ + struct hfi1_packet packet; + + init_packet(rcd, &packet); + if (last_rcv_seq(rcd, rhf_rcv_seq(packet.rhf))) + goto bail; + + while (packet.numpkt < budget) { + process_rcv_packet_napi(&packet); + if (hfi1_seq_incr(rcd, rhf_rcv_seq(packet.rhf))) + break; + + process_rcv_update(0, &packet); + } + hfi1_set_rcd_head(rcd, packet.rhqoff); +bail: + finish_packet(&packet); + return packet.numpkt; +} + /* * Handle receive interrupts when using the no dma rtail option. */ @@ -1077,6 +1140,63 @@ bail: return last; } +/* + * handle_receive_interrupt_napi_sp - receive a packet + * @rcd: the context + * @budget: polling budget + * + * Called from interrupt handler for errors or receive interrupt. + * This is the slow path interrupt handler + * when executing napi soft irq environment. + */ +int handle_receive_interrupt_napi_sp(struct hfi1_ctxtdata *rcd, int budget) +{ + struct hfi1_devdata *dd = rcd->dd; + int last = RCV_PKT_OK; + bool needset = true; + struct hfi1_packet packet; + + init_packet(rcd, &packet); + if (last_rcv_seq(rcd, rhf_rcv_seq(packet.rhf))) + goto bail; + + while (last != RCV_PKT_DONE && packet.numpkt < budget) { + if (hfi1_need_drop(dd)) { + /* On to the next packet */ + packet.rhqoff += packet.rsize; + packet.rhf_addr = (__le32 *)rcd->rcvhdrq + + packet.rhqoff + + rcd->rhf_offset; + packet.rhf = rhf_to_cpu(packet.rhf_addr); + + } else { + if (set_armed_to_active(&packet)) + goto bail; + process_rcv_packet_napi(&packet); + } + + if (hfi1_seq_incr(rcd, rhf_rcv_seq(packet.rhf))) + last = RCV_PKT_DONE; + + if (needset) { + needset = false; + set_all_fastpath(dd, rcd); + } + + process_rcv_update(last, &packet); + } + + hfi1_set_rcd_head(rcd, packet.rhqoff); + +bail: + /* + * Always write head at end, and setup rcv interrupt, even + * if no packets were processed. + */ + finish_packet(&packet); + return packet.numpkt; +} + /* * We may discover in the interrupt that the hardware link state has * changed from ARMED to ACTIVE (due to the arrival of a non-SC15 packet), diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index c7d0aad41f41..986d8c3dc430 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -385,11 +385,11 @@ struct hfi1_packet { u32 rhqoff; u32 dlid; u32 slid; + int numpkt; u16 tlen; s16 etail; u16 pkey; u8 hlen; - u8 numpkt; u8 rsize; u8 updegr; u8 etype; @@ -1501,6 +1501,8 @@ struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt); int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread); int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread); int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread); +int handle_receive_interrupt_napi_fp(struct hfi1_ctxtdata *rcd, int budget); +int handle_receive_interrupt_napi_sp(struct hfi1_ctxtdata *rcd, int budget); void set_all_slowpath(struct hfi1_devdata *dd); extern const struct pci_device_id hfi1_pci_tbl[]; diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 8c6b96a660a6..64279d04370d 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -374,6 +374,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, rcd->numa_id = numa; rcd->rcv_array_groups = dd->rcv_entries.ngroups; rcd->rhf_rcv_function_map = normal_rhf_rcv_functions; + rcd->msix_intr = CCE_NUM_MSIX_VECTORS; mutex_init(&rcd->exp_mutex); spin_lock_init(&rcd->exp_lock); diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c index 7574f2bc9718..7559875e0322 100644 --- a/drivers/infiniband/hw/hfi1/msix.c +++ b/drivers/infiniband/hw/hfi1/msix.c @@ -49,6 +49,7 @@ #include "hfi.h" #include "affinity.h" #include "sdma.h" +#include "netdev.h" /** * msix_initialize() - Calculate, request and configure MSIx IRQs @@ -140,7 +141,7 @@ static int msix_request_irq(struct hfi1_devdata *dd, void *arg, ret = pci_request_irq(dd->pcidev, nr, handler, thread, arg, name); if (ret) { dd_dev_err(dd, - "%s: request for IRQ %d failed, MSIx %lu, err %d\n", + "%s: request for IRQ %d failed, MSIx %lx, err %d\n", name, irq, nr, ret); spin_lock(&dd->msix_info.msix_lock); __clear_bit(nr, dd->msix_info.in_use_msix); @@ -160,7 +161,7 @@ static int msix_request_irq(struct hfi1_devdata *dd, void *arg, /* This is a request, so a failure is not fatal */ ret = hfi1_get_irq_affinity(dd, me); if (ret) - dd_dev_err(dd, "unable to pin IRQ %d\n", ret); + dd_dev_err(dd, "%s: unable to pin IRQ %d\n", name, ret); return nr; } @@ -203,6 +204,21 @@ int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd) receive_context_thread, name); } +/** + * msix_request_rcd_irq() - Helper function for RCVAVAIL IRQs + * for netdev context + * @rcd: valid netdev contexti + */ +int msix_netdev_request_rcd_irq(struct hfi1_ctxtdata *rcd) +{ + char name[MAX_NAME_SIZE]; + + snprintf(name, sizeof(name), DRIVER_NAME "_%d nd kctxt%d", + rcd->dd->unit, rcd->ctxt); + return msix_request_rcd_irq_common(rcd, receive_context_interrupt_napi, + NULL, name); +} + /** * msix_request_smda_ira() - Helper for getting SDMA IRQ resources * @sde: valid sdma engine diff --git a/drivers/infiniband/hw/hfi1/msix.h b/drivers/infiniband/hw/hfi1/msix.h index 1a02ab7971c8..42fab9475499 100644 --- a/drivers/infiniband/hw/hfi1/msix.h +++ b/drivers/infiniband/hw/hfi1/msix.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ /* - * Copyright(c) 2018 Intel Corporation. + * Copyright(c) 2018 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -59,7 +59,8 @@ int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd); int msix_request_sdma_irq(struct sdma_engine *sde); void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr); -/* VNIC interface */ +/* Netdev interface */ void msix_vnic_synchronize_irq(struct hfi1_devdata *dd); +int msix_netdev_request_rcd_irq(struct hfi1_ctxtdata *rcd); #endif diff --git a/drivers/infiniband/hw/hfi1/netdev.h b/drivers/infiniband/hw/hfi1/netdev.h index 8992dfe11e3e..6740ec34224a 100644 --- a/drivers/infiniband/hw/hfi1/netdev.h +++ b/drivers/infiniband/hw/hfi1/netdev.h @@ -87,4 +87,7 @@ void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id); void *hfi1_netdev_get_data(struct hfi1_devdata *dd, int id); void *hfi1_netdev_get_first_data(struct hfi1_devdata *dd, int *start_id); +/* chip.c */ +int hfi1_netdev_rx_napi(struct napi_struct *napi, int budget); + #endif /* HFI1_NETDEV_H */ -- cgit v1.2.3 From 370caa5b5880cd988645735c2d5d1d597c258e39 Mon Sep 17 00:00:00 2001 From: Grzegorz Andrejczuk Date: Mon, 11 May 2020 12:06:43 -0400 Subject: IB/hfi1: Add rx functions for dummy netdev This patch adds the rx functions for the dummy netdev: - Functions to allocate/free the dummy netdev. - Functions to allocate/free receiving contexts for the netdev. - Functions to initialize/de-initialize the receive queue. - Functions to enable/disable the receive queue. Link: https://lore.kernel.org/r/20200511160643.173205.75087.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Sadanand Warrier Signed-off-by: Grzegorz Andrejczuk Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ipoib.h | 3 + drivers/infiniband/hw/hfi1/ipoib_main.c | 30 ++- drivers/infiniband/hw/hfi1/ipoib_rx.c | 16 ++ drivers/infiniband/hw/hfi1/netdev.h | 6 + drivers/infiniband/hw/hfi1/netdev_rx.c | 361 ++++++++++++++++++++++++++++++++ 5 files changed, 414 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ipoib.h b/drivers/infiniband/hw/hfi1/ipoib.h index ca00f6c6a90d..185c9b02c974 100644 --- a/drivers/infiniband/hw/hfi1/ipoib.h +++ b/drivers/infiniband/hw/hfi1/ipoib.h @@ -154,6 +154,9 @@ int hfi1_ipoib_send_dma(struct net_device *dev, int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv); void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv); +int hfi1_ipoib_rxq_init(struct net_device *dev); +void hfi1_ipoib_rxq_deinit(struct net_device *dev); + void hfi1_ipoib_napi_tx_enable(struct net_device *dev); void hfi1_ipoib_napi_tx_disable(struct net_device *dev); diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c index 304a5ac86f77..014351ebbefa 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_main.c +++ b/drivers/infiniband/hw/hfi1/ipoib_main.c @@ -19,16 +19,31 @@ static u32 qpn_from_mac(u8 *mac_arr) static int hfi1_ipoib_dev_init(struct net_device *dev) { struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + int ret; priv->netstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - return priv->netdev_ops->ndo_init(dev); + ret = priv->netdev_ops->ndo_init(dev); + if (ret) + return ret; + + ret = hfi1_netdev_add_data(priv->dd, + qpn_from_mac(priv->netdev->dev_addr), + dev); + if (ret < 0) { + priv->netdev_ops->ndo_uninit(dev); + return ret; + } + + return 0; } static void hfi1_ipoib_dev_uninit(struct net_device *dev) { struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); + hfi1_netdev_remove_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr)); + priv->netdev_ops->ndo_uninit(dev); } @@ -55,6 +70,7 @@ static int hfi1_ipoib_dev_open(struct net_device *dev) priv->qp = qp; rcu_read_unlock(); + hfi1_netdev_enable_queues(priv->dd); hfi1_ipoib_napi_tx_enable(dev); } @@ -69,6 +85,7 @@ static int hfi1_ipoib_dev_stop(struct net_device *dev) return 0; hfi1_ipoib_napi_tx_disable(dev); + hfi1_netdev_disable_queues(priv->dd); rvt_put_qp(priv->qp); priv->qp = NULL; @@ -195,6 +212,7 @@ static void hfi1_ipoib_netdev_dtor(struct net_device *dev) struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev); hfi1_ipoib_txreq_deinit(priv); + hfi1_ipoib_rxq_deinit(priv->netdev); free_percpu(priv->netstats); } @@ -252,6 +270,13 @@ static int hfi1_ipoib_setup_rn(struct ib_device *device, return rc; } + rc = hfi1_ipoib_rxq_init(netdev); + if (rc) { + dd_dev_err(dd, "IPoIB netdev RX init - failed(%d)\n", rc); + hfi1_ipoib_free_rdma_netdev(netdev); + return rc; + } + netdev->priv_destructor = hfi1_ipoib_netdev_dtor; netdev->needs_free_netdev = true; @@ -268,7 +293,7 @@ int hfi1_ipoib_rn_get_params(struct ib_device *device, if (type != RDMA_NETDEV_IPOIB) return -EOPNOTSUPP; - if (!HFI1_CAP_IS_KSET(AIP)) + if (!HFI1_CAP_IS_KSET(AIP) || !dd->num_netdev_contexts) return -EOPNOTSUPP; if (!port_num || port_num > dd->num_pports) @@ -276,6 +301,7 @@ int hfi1_ipoib_rn_get_params(struct ib_device *device, params->sizeof_priv = sizeof(struct hfi1_ipoib_rdma_netdev); params->txqs = dd->num_sdma; + params->rxqs = dd->num_netdev_contexts; params->param = NULL; params->initialize_rdma_netdev = hfi1_ipoib_setup_rn; diff --git a/drivers/infiniband/hw/hfi1/ipoib_rx.c b/drivers/infiniband/hw/hfi1/ipoib_rx.c index 2485663032c7..606ac69eeea5 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_rx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_rx.c @@ -69,3 +69,19 @@ struct sk_buff *hfi1_ipoib_prepare_skb(struct hfi1_netdev_rxq *rxq, return skb; } + +int hfi1_ipoib_rxq_init(struct net_device *netdev) +{ + struct hfi1_ipoib_dev_priv *ipoib_priv = hfi1_ipoib_priv(netdev); + struct hfi1_devdata *dd = ipoib_priv->dd; + + return hfi1_netdev_rx_init(dd); +} + +void hfi1_ipoib_rxq_deinit(struct net_device *netdev) +{ + struct hfi1_ipoib_dev_priv *ipoib_priv = hfi1_ipoib_priv(netdev); + struct hfi1_devdata *dd = ipoib_priv->dd; + + hfi1_netdev_rx_destroy(dd); +} diff --git a/drivers/infiniband/hw/hfi1/netdev.h b/drivers/infiniband/hw/hfi1/netdev.h index 6740ec34224a..edb936f013c1 100644 --- a/drivers/infiniband/hw/hfi1/netdev.h +++ b/drivers/infiniband/hw/hfi1/netdev.h @@ -82,6 +82,12 @@ struct hfi1_ctxtdata *hfi1_netdev_get_ctxt(struct hfi1_devdata *dd, int ctxt) return priv->rxq[ctxt].rcd; } +void hfi1_netdev_enable_queues(struct hfi1_devdata *dd); +void hfi1_netdev_disable_queues(struct hfi1_devdata *dd); +int hfi1_netdev_rx_init(struct hfi1_devdata *dd); +int hfi1_netdev_rx_destroy(struct hfi1_devdata *dd); +int hfi1_netdev_alloc(struct hfi1_devdata *dd); +void hfi1_netdev_free(struct hfi1_devdata *dd); int hfi1_netdev_add_data(struct hfi1_devdata *dd, int id, void *data); void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id); void *hfi1_netdev_get_data(struct hfi1_devdata *dd, int id); diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c index 3e286cbaa8c6..124e4e8695b0 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -17,6 +17,367 @@ #include #include +static int hfi1_netdev_setup_ctxt(struct hfi1_netdev_priv *priv, + struct hfi1_ctxtdata *uctxt) +{ + unsigned int rcvctrl_ops; + struct hfi1_devdata *dd = priv->dd; + int ret; + + uctxt->rhf_rcv_function_map = netdev_rhf_rcv_functions; + uctxt->do_interrupt = &handle_receive_interrupt_napi_sp; + + /* Now allocate the RcvHdr queue and eager buffers. */ + ret = hfi1_create_rcvhdrq(dd, uctxt); + if (ret) + goto done; + + ret = hfi1_setup_eagerbufs(uctxt); + if (ret) + goto done; + + clear_rcvhdrtail(uctxt); + + rcvctrl_ops = HFI1_RCVCTRL_CTXT_DIS; + rcvctrl_ops |= HFI1_RCVCTRL_INTRAVAIL_DIS; + + if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR)) + rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL)) + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; + + hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt); +done: + return ret; +} + +static int hfi1_netdev_allocate_ctxt(struct hfi1_devdata *dd, + struct hfi1_ctxtdata **ctxt) +{ + struct hfi1_ctxtdata *uctxt; + int ret; + + if (dd->flags & HFI1_FROZEN) + return -EIO; + + ret = hfi1_create_ctxtdata(dd->pport, dd->node, &uctxt); + if (ret < 0) { + dd_dev_err(dd, "Unable to create ctxtdata, failing open\n"); + return -ENOMEM; + } + + uctxt->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | + HFI1_CAP_KGET(NODROP_RHQ_FULL) | + HFI1_CAP_KGET(NODROP_EGR_FULL) | + HFI1_CAP_KGET(DMA_RTAIL); + /* Netdev contexts are always NO_RDMA_RTAIL */ + uctxt->fast_handler = handle_receive_interrupt_napi_fp; + uctxt->slow_handler = handle_receive_interrupt_napi_sp; + hfi1_set_seq_cnt(uctxt, 1); + uctxt->is_vnic = true; + + hfi1_stats.sps_ctxts++; + + dd_dev_info(dd, "created netdev context %d\n", uctxt->ctxt); + *ctxt = uctxt; + + return 0; +} + +static void hfi1_netdev_deallocate_ctxt(struct hfi1_devdata *dd, + struct hfi1_ctxtdata *uctxt) +{ + flush_wc(); + + /* + * Disable receive context and interrupt available, reset all + * RcvCtxtCtrl bits to default values. + */ + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_TIDFLOW_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS | + HFI1_RCVCTRL_NO_RHQ_DROP_DIS | + HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); + + if (uctxt->msix_intr != CCE_NUM_MSIX_VECTORS) + msix_free_irq(dd, uctxt->msix_intr); + + uctxt->msix_intr = CCE_NUM_MSIX_VECTORS; + uctxt->event_flags = 0; + + hfi1_clear_tids(uctxt); + hfi1_clear_ctxt_pkey(dd, uctxt); + + hfi1_stats.sps_ctxts--; + + hfi1_free_ctxt(uctxt); +} + +static int hfi1_netdev_allot_ctxt(struct hfi1_netdev_priv *priv, + struct hfi1_ctxtdata **ctxt) +{ + int rc; + struct hfi1_devdata *dd = priv->dd; + + rc = hfi1_netdev_allocate_ctxt(dd, ctxt); + if (rc) { + dd_dev_err(dd, "netdev ctxt alloc failed %d\n", rc); + return rc; + } + + rc = hfi1_netdev_setup_ctxt(priv, *ctxt); + if (rc) { + dd_dev_err(dd, "netdev ctxt setup failed %d\n", rc); + hfi1_netdev_deallocate_ctxt(dd, *ctxt); + *ctxt = NULL; + } + + return rc; +} + +static int hfi1_netdev_rxq_init(struct net_device *dev) +{ + int i; + int rc; + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dev); + struct hfi1_devdata *dd = priv->dd; + + priv->num_rx_q = dd->num_netdev_contexts; + priv->rxq = kcalloc_node(priv->num_rx_q, sizeof(struct hfi1_netdev_rxq), + GFP_KERNEL, dd->node); + + if (!priv->rxq) { + dd_dev_err(dd, "Unable to allocate netdev queue data\n"); + return (-ENOMEM); + } + + for (i = 0; i < priv->num_rx_q; i++) { + struct hfi1_netdev_rxq *rxq = &priv->rxq[i]; + + rc = hfi1_netdev_allot_ctxt(priv, &rxq->rcd); + if (rc) + goto bail_context_irq_failure; + + hfi1_rcd_get(rxq->rcd); + rxq->priv = priv; + rxq->rcd->napi = &rxq->napi; + dd_dev_info(dd, "Setting rcv queue %d napi to context %d\n", + i, rxq->rcd->ctxt); + /* + * Disable BUSY_POLL on this NAPI as this is not supported + * right now. + */ + set_bit(NAPI_STATE_NO_BUSY_POLL, &rxq->napi.state); + netif_napi_add(dev, &rxq->napi, hfi1_netdev_rx_napi, 64); + rc = msix_netdev_request_rcd_irq(rxq->rcd); + if (rc) + goto bail_context_irq_failure; + } + + return 0; + +bail_context_irq_failure: + dd_dev_err(dd, "Unable to allot receive context\n"); + for (; i >= 0; i--) { + struct hfi1_netdev_rxq *rxq = &priv->rxq[i]; + + if (rxq->rcd) { + hfi1_netdev_deallocate_ctxt(dd, rxq->rcd); + hfi1_rcd_put(rxq->rcd); + rxq->rcd = NULL; + } + } + kfree(priv->rxq); + priv->rxq = NULL; + + return rc; +} + +static void hfi1_netdev_rxq_deinit(struct net_device *dev) +{ + int i; + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dev); + struct hfi1_devdata *dd = priv->dd; + + for (i = 0; i < priv->num_rx_q; i++) { + struct hfi1_netdev_rxq *rxq = &priv->rxq[i]; + + netif_napi_del(&rxq->napi); + hfi1_netdev_deallocate_ctxt(dd, rxq->rcd); + hfi1_rcd_put(rxq->rcd); + rxq->rcd = NULL; + } + + kfree(priv->rxq); + priv->rxq = NULL; + priv->num_rx_q = 0; +} + +static void enable_queues(struct hfi1_netdev_priv *priv) +{ + int i; + + for (i = 0; i < priv->num_rx_q; i++) { + struct hfi1_netdev_rxq *rxq = &priv->rxq[i]; + + dd_dev_info(priv->dd, "enabling queue %d on context %d\n", i, + rxq->rcd->ctxt); + napi_enable(&rxq->napi); + hfi1_rcvctrl(priv->dd, + HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB, + rxq->rcd); + } +} + +static void disable_queues(struct hfi1_netdev_priv *priv) +{ + int i; + + msix_vnic_synchronize_irq(priv->dd); + + for (i = 0; i < priv->num_rx_q; i++) { + struct hfi1_netdev_rxq *rxq = &priv->rxq[i]; + + dd_dev_info(priv->dd, "disabling queue %d on context %d\n", i, + rxq->rcd->ctxt); + + /* wait for napi if it was scheduled */ + hfi1_rcvctrl(priv->dd, + HFI1_RCVCTRL_CTXT_DIS | HFI1_RCVCTRL_INTRAVAIL_DIS, + rxq->rcd); + napi_synchronize(&rxq->napi); + napi_disable(&rxq->napi); + } +} + +/** + * hfi1_netdev_rx_init - Incrememnts netdevs counter. When called first time, + * it allocates receive queue data and calls netif_napi_add + * for each queue. + * + * @dd: hfi1 dev data + */ +int hfi1_netdev_rx_init(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + int res; + + if (atomic_fetch_inc(&priv->netdevs)) + return 0; + + mutex_lock(&hfi1_mutex); + init_dummy_netdev(dd->dummy_netdev); + res = hfi1_netdev_rxq_init(dd->dummy_netdev); + mutex_unlock(&hfi1_mutex); + return res; +} + +/** + * hfi1_netdev_rx_destroy - Decrements netdevs counter, when it reaches 0 + * napi is deleted and receive queses memory is freed. + * + * @dd: hfi1 dev data + */ +int hfi1_netdev_rx_destroy(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + /* destroy the RX queues only if it is the last netdev going away */ + if (atomic_fetch_add_unless(&priv->netdevs, -1, 0) == 1) { + mutex_lock(&hfi1_mutex); + hfi1_netdev_rxq_deinit(dd->dummy_netdev); + mutex_unlock(&hfi1_mutex); + } + + return 0; +} + +/** + * hfi1_netdev_alloc - Allocates netdev and private data. It is required + * because RMT index and MSI-X interrupt can be set only + * during driver initialization. + * + * @dd: hfi1 dev data + */ +int hfi1_netdev_alloc(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_priv *priv; + const int netdev_size = sizeof(*dd->dummy_netdev) + + sizeof(struct hfi1_netdev_priv); + + dd_dev_info(dd, "allocating netdev size %d\n", netdev_size); + dd->dummy_netdev = kcalloc_node(1, netdev_size, GFP_KERNEL, dd->node); + + if (!dd->dummy_netdev) + return -ENOMEM; + + priv = hfi1_netdev_priv(dd->dummy_netdev); + priv->dd = dd; + xa_init(&priv->dev_tbl); + atomic_set(&priv->enabled, 0); + atomic_set(&priv->netdevs, 0); + + return 0; +} + +void hfi1_netdev_free(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_priv *priv; + + if (dd->dummy_netdev) { + priv = hfi1_netdev_priv(dd->dummy_netdev); + dd_dev_info(dd, "hfi1 netdev freed\n"); + kfree(dd->dummy_netdev); + dd->dummy_netdev = NULL; + } +} + +/** + * hfi1_netdev_enable_queues - This is napi enable function. + * It enables napi objects associated with queues. + * When at least one device has called it it increments atomic counter. + * Disable function decrements counter and when it is 0, + * calls napi_disable for every queue. + * + * @dd: hfi1 dev data + */ +void hfi1_netdev_enable_queues(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_priv *priv; + + if (!dd->dummy_netdev) + return; + + priv = hfi1_netdev_priv(dd->dummy_netdev); + if (atomic_fetch_inc(&priv->enabled)) + return; + + mutex_lock(&hfi1_mutex); + enable_queues(priv); + mutex_unlock(&hfi1_mutex); +} + +void hfi1_netdev_disable_queues(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_priv *priv; + + if (!dd->dummy_netdev) + return; + + priv = hfi1_netdev_priv(dd->dummy_netdev); + if (atomic_dec_if_positive(&priv->enabled)) + return; + + mutex_lock(&hfi1_mutex); + disable_queues(priv); + mutex_unlock(&hfi1_mutex); +} + /** * hfi1_netdev_add_data - Registers data with unique identifier * to be requested later this is needed for VNIC and IPoIB VLANs -- cgit v1.2.3 From 4730f4a6c6b2065589c0822af00aa45e639bbc36 Mon Sep 17 00:00:00 2001 From: Grzegorz Andrejczuk Date: Mon, 11 May 2020 12:06:49 -0400 Subject: IB/hfi1: Activate the dummy netdev As described in earlier patches, ipoib netdev will share receive contexts with existing VNIC netdev through a dummy netdev. The following changes are made to achieve that: - Set up netdev receive contexts after user contexts. A function is added to count the available netdev receive contexts. - Add functions to set/get receive map table free index. - Rename NUM_VNIC_MAP_ENTRIES as NUM_NETDEV_MAP_ENTRIES. - Let the dummy netdev own the receive contexts instead of VNIC. - Allocate the dummy netdev when the hfi1 device is added and free it when the device is removed. - Initialize AIP RSM rules when the IpoIb rxq is initialized and remove the rules when it is de-initialized. - Convert VNIC to use the dummy netdev. Link: https://lore.kernel.org/r/20200511160649.173205.4626.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Sadanand Warrier Signed-off-by: Grzegorz Andrejczuk Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 73 ++++---- drivers/infiniband/hw/hfi1/driver.c | 18 -- drivers/infiniband/hw/hfi1/hfi.h | 14 +- drivers/infiniband/hw/hfi1/init.c | 9 +- drivers/infiniband/hw/hfi1/ipoib_rx.c | 10 +- drivers/infiniband/hw/hfi1/msix.c | 12 +- drivers/infiniband/hw/hfi1/msix.h | 2 +- drivers/infiniband/hw/hfi1/netdev.h | 19 ++ drivers/infiniband/hw/hfi1/netdev_rx.c | 46 ++++- drivers/infiniband/hw/hfi1/vnic.h | 5 +- drivers/infiniband/hw/hfi1/vnic_main.c | 312 ++++++--------------------------- 11 files changed, 178 insertions(+), 342 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 2117612c61b2..7f35b9ea158b 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -13396,8 +13396,7 @@ static int set_up_interrupts(struct hfi1_devdata *dd) static int set_up_context_variables(struct hfi1_devdata *dd) { unsigned long num_kernel_contexts; - u16 num_netdev_contexts = HFI1_NUM_VNIC_CTXT; - int total_contexts; + u16 num_netdev_contexts; int ret; unsigned ngroups; int rmt_count; @@ -13434,13 +13433,6 @@ static int set_up_context_variables(struct hfi1_devdata *dd) num_kernel_contexts = send_contexts - num_vls - 1; } - /* Accommodate VNIC contexts if possible */ - if ((num_kernel_contexts + num_netdev_contexts) > rcv_contexts) { - dd_dev_err(dd, "No receive contexts available for VNIC\n"); - num_netdev_contexts = 0; - } - total_contexts = num_kernel_contexts + num_netdev_contexts; - /* * User contexts: * - default to 1 user context per real (non-HT) CPU core if @@ -13453,15 +13445,19 @@ static int set_up_context_variables(struct hfi1_devdata *dd) /* * Adjust the counts given a global max. */ - if (total_contexts + n_usr_ctxts > rcv_contexts) { + if (num_kernel_contexts + n_usr_ctxts > rcv_contexts) { dd_dev_err(dd, - "Reducing # user receive contexts to: %d, from %u\n", - rcv_contexts - total_contexts, + "Reducing # user receive contexts to: %u, from %u\n", + (u32)(rcv_contexts - num_kernel_contexts), n_usr_ctxts); /* recalculate */ - n_usr_ctxts = rcv_contexts - total_contexts; + n_usr_ctxts = rcv_contexts - num_kernel_contexts; } + num_netdev_contexts = + hfi1_num_netdev_contexts(dd, rcv_contexts - + (num_kernel_contexts + n_usr_ctxts), + &node_affinity.real_cpu_mask); /* * The RMT entries are currently allocated as shown below: * 1. QOS (0 to 128 entries); @@ -13487,17 +13483,16 @@ static int set_up_context_variables(struct hfi1_devdata *dd) n_usr_ctxts = user_rmt_reduced; } - total_contexts += n_usr_ctxts; - - /* the first N are kernel contexts, the rest are user/vnic contexts */ - dd->num_rcv_contexts = total_contexts; + /* the first N are kernel contexts, the rest are user/netdev contexts */ + dd->num_rcv_contexts = + num_kernel_contexts + n_usr_ctxts + num_netdev_contexts; dd->n_krcv_queues = num_kernel_contexts; dd->first_dyn_alloc_ctxt = num_kernel_contexts; dd->num_netdev_contexts = num_netdev_contexts; dd->num_user_contexts = n_usr_ctxts; dd->freectxts = n_usr_ctxts; dd_dev_info(dd, - "rcv contexts: chip %d, used %d (kernel %d, vnic %u, user %u)\n", + "rcv contexts: chip %d, used %d (kernel %d, netdev %u, user %u)\n", rcv_contexts, (int)dd->num_rcv_contexts, (int)dd->n_krcv_queues, @@ -14554,7 +14549,8 @@ static bool hfi1_netdev_update_rmt(struct hfi1_devdata *dd) u8 ctx_id = 0; u64 reg; u32 regoff; - int rmt_start = dd->vnic.rmt_start; + int rmt_start = hfi1_netdev_get_free_rmt_idx(dd); + int ctxt_count = hfi1_netdev_ctxt_count(dd); /* We already have contexts mapped in RMT */ if (has_rsm_rule(dd, RSM_INS_VNIC) || has_rsm_rule(dd, RSM_INS_AIP)) { @@ -14562,7 +14558,7 @@ static bool hfi1_netdev_update_rmt(struct hfi1_devdata *dd) return true; } - if (hfi1_is_rmt_full(rmt_start, NUM_VNIC_MAP_ENTRIES)) { + if (hfi1_is_rmt_full(rmt_start, NUM_NETDEV_MAP_ENTRIES)) { dd_dev_err(dd, "Not enought RMT entries used = %d\n", rmt_start); return false; @@ -14570,27 +14566,27 @@ static bool hfi1_netdev_update_rmt(struct hfi1_devdata *dd) dev_dbg(&(dd)->pcidev->dev, "RMT start = %d, end %d\n", rmt_start, - rmt_start + NUM_VNIC_MAP_ENTRIES); + rmt_start + NUM_NETDEV_MAP_ENTRIES); /* Update RSM mapping table, 32 regs, 256 entries - 1 ctx per byte */ regoff = RCV_RSM_MAP_TABLE + (rmt_start / 8) * 8; reg = read_csr(dd, regoff); - for (i = 0; i < NUM_VNIC_MAP_ENTRIES; i++) { + for (i = 0; i < NUM_NETDEV_MAP_ENTRIES; i++) { /* Update map register with netdev context */ j = (rmt_start + i) % 8; reg &= ~(0xffllu << (j * 8)); - reg |= (u64)dd->vnic.ctxt[ctx_id++]->ctxt << (j * 8); + reg |= (u64)hfi1_netdev_get_ctxt(dd, ctx_id++)->ctxt << (j * 8); /* Wrap up netdev ctx index */ - ctx_id %= dd->vnic.num_ctxt; + ctx_id %= ctxt_count; /* Write back map register */ - if (j == 7 || ((i + 1) == NUM_VNIC_MAP_ENTRIES)) { + if (j == 7 || ((i + 1) == NUM_NETDEV_MAP_ENTRIES)) { dev_dbg(&(dd)->pcidev->dev, "RMT[%d] =0x%llx\n", regoff - RCV_RSM_MAP_TABLE, reg); write_csr(dd, regoff, reg); regoff += 8; - if (i < (NUM_VNIC_MAP_ENTRIES - 1)) + if (i < (NUM_NETDEV_MAP_ENTRIES - 1)) reg = read_csr(dd, regoff); } } @@ -14617,8 +14613,9 @@ void hfi1_init_aip_rsm(struct hfi1_devdata *dd) * exist yet */ if (atomic_fetch_inc(&dd->ipoib_rsm_usr_num) == 0) { + int rmt_start = hfi1_netdev_get_free_rmt_idx(dd); struct rsm_rule_data rrd = { - .offset = dd->vnic.rmt_start, + .offset = rmt_start, .pkt_type = IB_PACKET_TYPE, .field1_off = LRH_BTH_MATCH_OFFSET, .mask1 = LRH_BTH_MASK, @@ -14627,10 +14624,10 @@ void hfi1_init_aip_rsm(struct hfi1_devdata *dd) .mask2 = BTH_DESTQP_MASK, .value2 = BTH_DESTQP_VALUE, .index1_off = DETH_AIP_SQPN_SELECT_OFFSET + - ilog2(NUM_VNIC_MAP_ENTRIES), - .index1_width = ilog2(NUM_VNIC_MAP_ENTRIES), + ilog2(NUM_NETDEV_MAP_ENTRIES), + .index1_width = ilog2(NUM_NETDEV_MAP_ENTRIES), .index2_off = DETH_AIP_SQPN_SELECT_OFFSET, - .index2_width = ilog2(NUM_VNIC_MAP_ENTRIES) + .index2_width = ilog2(NUM_NETDEV_MAP_ENTRIES) }; hfi1_enable_rsm_rule(dd, RSM_INS_AIP, &rrd); @@ -14640,9 +14637,10 @@ void hfi1_init_aip_rsm(struct hfi1_devdata *dd) /* Initialize RSM for VNIC */ void hfi1_init_vnic_rsm(struct hfi1_devdata *dd) { + int rmt_start = hfi1_netdev_get_free_rmt_idx(dd); struct rsm_rule_data rrd = { /* Add rule for vnic */ - .offset = dd->vnic.rmt_start, + .offset = rmt_start, .pkt_type = 4, /* Match 16B packets */ .field1_off = L2_TYPE_MATCH_OFFSET, @@ -14654,9 +14652,9 @@ void hfi1_init_vnic_rsm(struct hfi1_devdata *dd) .value2 = L4_16B_ETH_VALUE, /* Calc context from veswid and entropy */ .index1_off = L4_16B_HDR_VESWID_OFFSET, - .index1_width = ilog2(NUM_VNIC_MAP_ENTRIES), + .index1_width = ilog2(NUM_NETDEV_MAP_ENTRIES), .index2_off = L2_16B_ENTROPY_OFFSET, - .index2_width = ilog2(NUM_VNIC_MAP_ENTRIES) + .index2_width = ilog2(NUM_NETDEV_MAP_ENTRIES) }; hfi1_enable_rsm_rule(dd, RSM_INS_VNIC, &rrd); @@ -14690,8 +14688,8 @@ static int init_rxe(struct hfi1_devdata *dd) init_qos(dd, rmt); init_fecn_handling(dd, rmt); complete_rsm_map_table(dd, rmt); - /* record number of used rsm map entries for vnic */ - dd->vnic.rmt_start = rmt->used; + /* record number of used rsm map entries for netdev */ + hfi1_netdev_set_free_rmt_idx(dd, rmt->used); kfree(rmt); /* @@ -15245,6 +15243,10 @@ int hfi1_init_dd(struct hfi1_devdata *dd) (dd->revision >> CCE_REVISION_SW_SHIFT) & CCE_REVISION_SW_MASK); + /* alloc netdev data */ + if (hfi1_netdev_alloc(dd)) + goto bail_cleanup; + ret = set_up_context_variables(dd); if (ret) goto bail_cleanup; @@ -15345,6 +15347,7 @@ bail_clear_intr: hfi1_comp_vectors_clean_up(dd); msix_clean_up_interrupts(dd); bail_cleanup: + hfi1_netdev_free(dd); hfi1_pcie_ddcleanup(dd); bail_free: hfi1_free_devdata(dd); diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index d89fc8fdff6a..60ff6de8cf98 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -1771,28 +1771,10 @@ static void process_receive_ib(struct hfi1_packet *packet) hfi1_ib_rcv(packet); } -static inline bool hfi1_is_vnic_packet(struct hfi1_packet *packet) -{ - /* Packet received in VNIC context via RSM */ - if (packet->rcd->is_vnic) - return true; - - if ((hfi1_16B_get_l2(packet->ebuf) == OPA_16B_L2_TYPE) && - (hfi1_16B_get_l4(packet->ebuf) == OPA_16B_L4_ETHR)) - return true; - - return false; -} - static void process_receive_bypass(struct hfi1_packet *packet) { struct hfi1_devdata *dd = packet->rcd->dd; - if (hfi1_is_vnic_packet(packet)) { - hfi1_vnic_bypass_rcv(packet); - return; - } - if (hfi1_setup_bypass_packet(packet)) return; diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 986d8c3dc430..b4c6bff60a4e 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1047,23 +1047,10 @@ struct hfi1_asic_data { #define NUM_MAP_ENTRIES 256 #define NUM_MAP_REGS 32 -/* - * Number of VNIC contexts used. Ensure it is less than or equal to - * max queues supported by VNIC (HFI1_VNIC_MAX_QUEUE). - */ -#define HFI1_NUM_VNIC_CTXT 8 - -/* Number of VNIC RSM entries */ -#define NUM_VNIC_MAP_ENTRIES 8 - /* Virtual NIC information */ struct hfi1_vnic_data { - struct hfi1_ctxtdata *ctxt[HFI1_NUM_VNIC_CTXT]; struct kmem_cache *txreq_cache; - struct xarray vesws; u8 num_vports; - u8 rmt_start; - u8 num_ctxt; }; struct hfi1_vnic_vport_info; @@ -1419,6 +1406,7 @@ struct hfi1_devdata { struct hfi1_vnic_data vnic; /* Lock to protect IRQ SRC register access */ spinlock_t irq_src_lock; + int vnic_num_vports; struct net_device *dummy_netdev; /* Keeps track of IPoIB RSM rule users */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 64279d04370d..5eed4360695f 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -69,6 +69,7 @@ #include "affinity.h" #include "vnic.h" #include "exp_rcv.h" +#include "netdev.h" #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -1665,9 +1666,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* do the generic initialization */ initfail = hfi1_init(dd, 0); - /* setup vnic */ - hfi1_vnic_setup(dd); - ret = hfi1_register_ib_device(dd); /* @@ -1706,7 +1704,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) hfi1_device_remove(dd); if (!ret) hfi1_unregister_ib_device(dd); - hfi1_vnic_cleanup(dd); postinit_cleanup(dd); if (initfail) ret = initfail; @@ -1751,8 +1748,8 @@ static void remove_one(struct pci_dev *pdev) /* unregister from IB core */ hfi1_unregister_ib_device(dd); - /* cleanup vnic */ - hfi1_vnic_cleanup(dd); + /* free netdev data */ + hfi1_netdev_free(dd); /* * Disable the IB link, disable interrupts on the device, diff --git a/drivers/infiniband/hw/hfi1/ipoib_rx.c b/drivers/infiniband/hw/hfi1/ipoib_rx.c index 606ac69eeea5..3afa7545242c 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_rx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_rx.c @@ -74,8 +74,15 @@ int hfi1_ipoib_rxq_init(struct net_device *netdev) { struct hfi1_ipoib_dev_priv *ipoib_priv = hfi1_ipoib_priv(netdev); struct hfi1_devdata *dd = ipoib_priv->dd; + int ret; - return hfi1_netdev_rx_init(dd); + ret = hfi1_netdev_rx_init(dd); + if (ret) + return ret; + + hfi1_init_aip_rsm(dd); + + return ret; } void hfi1_ipoib_rxq_deinit(struct net_device *netdev) @@ -83,5 +90,6 @@ void hfi1_ipoib_rxq_deinit(struct net_device *netdev) struct hfi1_ipoib_dev_priv *ipoib_priv = hfi1_ipoib_priv(netdev); struct hfi1_devdata *dd = ipoib_priv->dd; + hfi1_deinit_aip_rsm(dd); hfi1_netdev_rx_destroy(dd); } diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c index 7559875e0322..d61ee853d215 100644 --- a/drivers/infiniband/hw/hfi1/msix.c +++ b/drivers/infiniband/hw/hfi1/msix.c @@ -172,7 +172,8 @@ static int msix_request_rcd_irq_common(struct hfi1_ctxtdata *rcd, const char *name) { int nr = msix_request_irq(rcd->dd, rcd, handler, thread, - IRQ_RCVCTXT, name); + rcd->is_vnic ? IRQ_NETDEVCTXT : IRQ_RCVCTXT, + name); if (nr < 0) return nr; @@ -371,15 +372,16 @@ void msix_clean_up_interrupts(struct hfi1_devdata *dd) } /** - * msix_vnic_syncrhonize_irq() - Vnic IRQ synchronize + * msix_netdev_syncrhonize_irq() - netdev IRQ synchronize * @dd: valid devdata */ -void msix_vnic_synchronize_irq(struct hfi1_devdata *dd) +void msix_netdev_synchronize_irq(struct hfi1_devdata *dd) { int i; + int ctxt_count = hfi1_netdev_ctxt_count(dd); - for (i = 0; i < dd->vnic.num_ctxt; i++) { - struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i]; + for (i = 0; i < ctxt_count; i++) { + struct hfi1_ctxtdata *rcd = hfi1_netdev_get_ctxt(dd, i); struct hfi1_msix_entry *me; me = &dd->msix_info.msix_entries[rcd->msix_intr]; diff --git a/drivers/infiniband/hw/hfi1/msix.h b/drivers/infiniband/hw/hfi1/msix.h index 42fab9475499..e63e944bf0fc 100644 --- a/drivers/infiniband/hw/hfi1/msix.h +++ b/drivers/infiniband/hw/hfi1/msix.h @@ -60,7 +60,7 @@ int msix_request_sdma_irq(struct sdma_engine *sde); void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr); /* Netdev interface */ -void msix_vnic_synchronize_irq(struct hfi1_devdata *dd); +void msix_netdev_synchronize_irq(struct hfi1_devdata *dd); int msix_netdev_request_rcd_irq(struct hfi1_ctxtdata *rcd); #endif diff --git a/drivers/infiniband/hw/hfi1/netdev.h b/drivers/infiniband/hw/hfi1/netdev.h index edb936f013c1..947543a3e0c4 100644 --- a/drivers/infiniband/hw/hfi1/netdev.h +++ b/drivers/infiniband/hw/hfi1/netdev.h @@ -82,6 +82,25 @@ struct hfi1_ctxtdata *hfi1_netdev_get_ctxt(struct hfi1_devdata *dd, int ctxt) return priv->rxq[ctxt].rcd; } +static inline +int hfi1_netdev_get_free_rmt_idx(struct hfi1_devdata *dd) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + return priv->rmt_start; +} + +static inline +void hfi1_netdev_set_free_rmt_idx(struct hfi1_devdata *dd, int rmt_idx) +{ + struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev); + + priv->rmt_start = rmt_idx; +} + +u32 hfi1_num_netdev_contexts(struct hfi1_devdata *dd, u32 available_contexts, + struct cpumask *cpu_mask); + void hfi1_netdev_enable_queues(struct hfi1_devdata *dd); void hfi1_netdev_disable_queues(struct hfi1_devdata *dd); int hfi1_netdev_rx_init(struct hfi1_devdata *dd); diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c index 124e4e8695b0..58af6a454761 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -140,6 +140,50 @@ static int hfi1_netdev_allot_ctxt(struct hfi1_netdev_priv *priv, return rc; } +/** + * hfi1_num_netdev_contexts - Count of netdev recv contexts to use. + * @dd: device on which to allocate netdev contexts + * @available_contexts: count of available receive contexts + * @cpu_mask: mask of possible cpus to include for contexts + * + * Return: count of physical cores on a node or the remaining available recv + * contexts for netdev recv context usage up to the maximum of + * HFI1_MAX_NETDEV_CTXTS. + * A value of 0 can be returned when acceleration is explicitly turned off, + * a memory allocation error occurs or when there are no available contexts. + * + */ +u32 hfi1_num_netdev_contexts(struct hfi1_devdata *dd, u32 available_contexts, + struct cpumask *cpu_mask) +{ + cpumask_var_t node_cpu_mask; + unsigned int available_cpus; + + if (!HFI1_CAP_IS_KSET(AIP)) + return 0; + + /* Always give user contexts priority over netdev contexts */ + if (available_contexts == 0) { + dd_dev_info(dd, "No receive contexts available for netdevs.\n"); + return 0; + } + + if (!zalloc_cpumask_var(&node_cpu_mask, GFP_KERNEL)) { + dd_dev_err(dd, "Unable to allocate cpu_mask for netdevs.\n"); + return 0; + } + + cpumask_and(node_cpu_mask, cpu_mask, + cpumask_of_node(pcibus_to_node(dd->pcidev->bus))); + + available_cpus = cpumask_weight(node_cpu_mask); + + free_cpumask_var(node_cpu_mask); + + return min3(available_cpus, available_contexts, + (u32)HFI1_MAX_NETDEV_CTXTS); +} + static int hfi1_netdev_rxq_init(struct net_device *dev) { int i; @@ -238,7 +282,7 @@ static void disable_queues(struct hfi1_netdev_priv *priv) { int i; - msix_vnic_synchronize_irq(priv->dd); + msix_netdev_synchronize_irq(priv->dd); for (i = 0; i < priv->num_rx_q; i++) { struct hfi1_netdev_rxq *rxq = &priv->rxq[i]; diff --git a/drivers/infiniband/hw/hfi1/vnic.h b/drivers/infiniband/hw/hfi1/vnic.h index 5ae781514e32..66150a13f374 100644 --- a/drivers/infiniband/hw/hfi1/vnic.h +++ b/drivers/infiniband/hw/hfi1/vnic.h @@ -1,7 +1,7 @@ #ifndef _HFI1_VNIC_H #define _HFI1_VNIC_H /* - * Copyright(c) 2017 Intel Corporation. + * Copyright(c) 2017 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -69,6 +69,7 @@ #define HFI1_VNIC_SC_SHIFT 4 #define HFI1_VNIC_MAX_QUEUE 16 +#define HFI1_NUM_VNIC_CTXT 8 /** * struct hfi1_vnic_sdma - VNIC per Tx ring SDMA information @@ -104,7 +105,6 @@ struct hfi1_vnic_rx_queue { struct hfi1_vnic_vport_info *vinfo; struct net_device *netdev; struct napi_struct napi; - struct sk_buff_head skbq; }; /** @@ -146,7 +146,6 @@ struct hfi1_vnic_vport_info { /* vnic hfi1 internal functions */ void hfi1_vnic_setup(struct hfi1_devdata *dd); -void hfi1_vnic_cleanup(struct hfi1_devdata *dd); int hfi1_vnic_txreq_init(struct hfi1_devdata *dd); void hfi1_vnic_txreq_deinit(struct hfi1_devdata *dd); diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c index db7624cacee1..b183c56b7b6a 100644 --- a/drivers/infiniband/hw/hfi1/vnic_main.c +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -53,6 +53,7 @@ #include #include "vnic.h" +#include "netdev.h" #define HFI_TX_TIMEOUT_MS 1000 @@ -62,114 +63,6 @@ static DEFINE_SPINLOCK(vport_cntr_lock); -static int setup_vnic_ctxt(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt) -{ - unsigned int rcvctrl_ops = 0; - int ret; - - uctxt->do_interrupt = &handle_receive_interrupt; - - /* Now allocate the RcvHdr queue and eager buffers. */ - ret = hfi1_create_rcvhdrq(dd, uctxt); - if (ret) - goto done; - - ret = hfi1_setup_eagerbufs(uctxt); - if (ret) - goto done; - - if (hfi1_rcvhdrtail_kvaddr(uctxt)) - clear_rcvhdrtail(uctxt); - - rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; - rcvctrl_ops |= HFI1_RCVCTRL_INTRAVAIL_ENB; - - if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR)) - rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; - if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL)) - rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; - if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) - rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; - if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL)) - rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; - - hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt); -done: - return ret; -} - -static int allocate_vnic_ctxt(struct hfi1_devdata *dd, - struct hfi1_ctxtdata **vnic_ctxt) -{ - struct hfi1_ctxtdata *uctxt; - int ret; - - if (dd->flags & HFI1_FROZEN) - return -EIO; - - ret = hfi1_create_ctxtdata(dd->pport, dd->node, &uctxt); - if (ret < 0) { - dd_dev_err(dd, "Unable to create ctxtdata, failing open\n"); - return -ENOMEM; - } - - uctxt->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | - HFI1_CAP_KGET(NODROP_RHQ_FULL) | - HFI1_CAP_KGET(NODROP_EGR_FULL) | - HFI1_CAP_KGET(DMA_RTAIL); - uctxt->seq_cnt = 1; - uctxt->is_vnic = true; - - msix_request_rcd_irq(uctxt); - - hfi1_stats.sps_ctxts++; - dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt); - *vnic_ctxt = uctxt; - - return 0; -} - -static void deallocate_vnic_ctxt(struct hfi1_devdata *dd, - struct hfi1_ctxtdata *uctxt) -{ - dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt); - flush_wc(); - - /* - * Disable receive context and interrupt available, reset all - * RcvCtxtCtrl bits to default values. - */ - hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | - HFI1_RCVCTRL_TIDFLOW_DIS | - HFI1_RCVCTRL_INTRAVAIL_DIS | - HFI1_RCVCTRL_ONE_PKT_EGR_DIS | - HFI1_RCVCTRL_NO_RHQ_DROP_DIS | - HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); - - /* msix_intr will always be > 0, only clean up if this is true */ - if (uctxt->msix_intr) - msix_free_irq(dd, uctxt->msix_intr); - - uctxt->event_flags = 0; - - hfi1_clear_tids(uctxt); - hfi1_clear_ctxt_pkey(dd, uctxt); - - hfi1_stats.sps_ctxts--; - - hfi1_free_ctxt(uctxt); -} - -void hfi1_vnic_setup(struct hfi1_devdata *dd) -{ - xa_init(&dd->vnic.vesws); -} - -void hfi1_vnic_cleanup(struct hfi1_devdata *dd) -{ - WARN_ON(!xa_empty(&dd->vnic.vesws)); -} - #define SUM_GRP_COUNTERS(stats, qstats, x_grp) do { \ u64 *src64, *dst64; \ for (src64 = &qstats->x_grp.unicast, \ @@ -179,6 +72,9 @@ void hfi1_vnic_cleanup(struct hfi1_devdata *dd) } \ } while (0) +#define VNIC_MASK (0xFF) +#define VNIC_ID(val) ((1ull << 24) | ((val) & VNIC_MASK)) + /* hfi1_vnic_update_stats - update statistics */ static void hfi1_vnic_update_stats(struct hfi1_vnic_vport_info *vinfo, struct opa_vnic_stats *stats) @@ -454,71 +350,25 @@ static inline int hfi1_vnic_decap_skb(struct hfi1_vnic_rx_queue *rxq, return rc; } -static inline struct sk_buff *hfi1_vnic_get_skb(struct hfi1_vnic_rx_queue *rxq) +static struct hfi1_vnic_vport_info *get_vnic_port(struct hfi1_devdata *dd, + int vesw_id) { - unsigned char *pad_info; - struct sk_buff *skb; - - skb = skb_dequeue(&rxq->skbq); - if (unlikely(!skb)) - return NULL; + int vnic_id = VNIC_ID(vesw_id); - /* remove tail padding and icrc */ - pad_info = skb->data + skb->len - 1; - skb_trim(skb, (skb->len - OPA_VNIC_ICRC_TAIL_LEN - - ((*pad_info) & 0x7))); - - return skb; + return hfi1_netdev_get_data(dd, vnic_id); } -/* hfi1_vnic_handle_rx - handle skb receive */ -static void hfi1_vnic_handle_rx(struct hfi1_vnic_rx_queue *rxq, - int *work_done, int work_to_do) +static struct hfi1_vnic_vport_info *get_first_vnic_port(struct hfi1_devdata *dd) { - struct hfi1_vnic_vport_info *vinfo = rxq->vinfo; - struct sk_buff *skb; - int rc; - - while (1) { - if (*work_done >= work_to_do) - break; - - skb = hfi1_vnic_get_skb(rxq); - if (unlikely(!skb)) - break; - - rc = hfi1_vnic_decap_skb(rxq, skb); - /* update rx counters */ - hfi1_vnic_update_rx_counters(vinfo, rxq->idx, skb, rc); - if (unlikely(rc)) { - dev_kfree_skb_any(skb); - continue; - } - - skb_checksum_none_assert(skb); - skb->protocol = eth_type_trans(skb, rxq->netdev); - - napi_gro_receive(&rxq->napi, skb); - (*work_done)++; - } -} - -/* hfi1_vnic_napi - napi receive polling callback function */ -static int hfi1_vnic_napi(struct napi_struct *napi, int budget) -{ - struct hfi1_vnic_rx_queue *rxq = container_of(napi, - struct hfi1_vnic_rx_queue, napi); - struct hfi1_vnic_vport_info *vinfo = rxq->vinfo; - int work_done = 0; + struct hfi1_vnic_vport_info *vinfo; + int next_id = VNIC_ID(0); - v_dbg("napi %d budget %d\n", rxq->idx, budget); - hfi1_vnic_handle_rx(rxq, &work_done, budget); + vinfo = hfi1_netdev_get_first_data(dd, &next_id); - v_dbg("napi %d work_done %d\n", rxq->idx, work_done); - if (work_done < budget) - napi_complete(napi); + if (next_id > VNIC_ID(VNIC_MASK)) + return NULL; - return work_done; + return vinfo; } void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet) @@ -527,13 +377,14 @@ void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet) struct hfi1_vnic_vport_info *vinfo = NULL; struct hfi1_vnic_rx_queue *rxq; struct sk_buff *skb; - int l4_type, vesw_id = -1; + int l4_type, vesw_id = -1, rc; u8 q_idx; + unsigned char *pad_info; l4_type = hfi1_16B_get_l4(packet->ebuf); if (likely(l4_type == OPA_16B_L4_ETHR)) { vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf); - vinfo = xa_load(&dd->vnic.vesws, vesw_id); + vinfo = get_vnic_port(dd, vesw_id); /* * In case of invalid vesw id, count the error on @@ -541,10 +392,8 @@ void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet) */ if (unlikely(!vinfo)) { struct hfi1_vnic_vport_info *vinfo_tmp; - unsigned long index = 0; - vinfo_tmp = xa_find(&dd->vnic.vesws, &index, ULONG_MAX, - XA_PRESENT); + vinfo_tmp = get_first_vnic_port(dd); if (vinfo_tmp) { spin_lock(&vport_cntr_lock); vinfo_tmp->stats[0].netstats.rx_nohandler++; @@ -563,12 +412,6 @@ void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet) rxq = &vinfo->rxq[q_idx]; if (unlikely(!netif_oper_up(vinfo->netdev))) { vinfo->stats[q_idx].rx_drop_state++; - skb_queue_purge(&rxq->skbq); - return; - } - - if (unlikely(skb_queue_len(&rxq->skbq) > HFI1_VNIC_RCV_Q_SIZE)) { - vinfo->stats[q_idx].netstats.rx_fifo_errors++; return; } @@ -580,34 +423,41 @@ void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet) memcpy(skb->data, packet->ebuf, packet->tlen); skb_put(skb, packet->tlen); - skb_queue_tail(&rxq->skbq, skb); - if (napi_schedule_prep(&rxq->napi)) { - v_dbg("napi %d scheduling\n", q_idx); - __napi_schedule(&rxq->napi); + pad_info = skb->data + skb->len - 1; + skb_trim(skb, (skb->len - OPA_VNIC_ICRC_TAIL_LEN - + ((*pad_info) & 0x7))); + + rc = hfi1_vnic_decap_skb(rxq, skb); + + /* update rx counters */ + hfi1_vnic_update_rx_counters(vinfo, rxq->idx, skb, rc); + if (unlikely(rc)) { + dev_kfree_skb_any(skb); + return; } + + skb_checksum_none_assert(skb); + skb->protocol = eth_type_trans(skb, rxq->netdev); + + napi_gro_receive(&rxq->napi, skb); } static int hfi1_vnic_up(struct hfi1_vnic_vport_info *vinfo) { struct hfi1_devdata *dd = vinfo->dd; struct net_device *netdev = vinfo->netdev; - int i, rc; + int rc; /* ensure virtual eth switch id is valid */ if (!vinfo->vesw_id) return -EINVAL; - rc = xa_insert(&dd->vnic.vesws, vinfo->vesw_id, vinfo, GFP_KERNEL); + rc = hfi1_netdev_add_data(dd, VNIC_ID(vinfo->vesw_id), vinfo); if (rc < 0) return rc; - for (i = 0; i < vinfo->num_rx_q; i++) { - struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i]; - - skb_queue_head_init(&rxq->skbq); - napi_enable(&rxq->napi); - } + hfi1_netdev_rx_init(dd); netif_carrier_on(netdev); netif_tx_start_all_queues(netdev); @@ -619,23 +469,13 @@ static int hfi1_vnic_up(struct hfi1_vnic_vport_info *vinfo) static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo) { struct hfi1_devdata *dd = vinfo->dd; - u8 i; clear_bit(HFI1_VNIC_UP, &vinfo->flags); netif_carrier_off(vinfo->netdev); netif_tx_disable(vinfo->netdev); - xa_erase(&dd->vnic.vesws, vinfo->vesw_id); + hfi1_netdev_remove_data(dd, VNIC_ID(vinfo->vesw_id)); - /* ensure irqs see the change */ - msix_vnic_synchronize_irq(dd); - - /* remove unread skbs */ - for (i = 0; i < vinfo->num_rx_q; i++) { - struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i]; - - napi_disable(&rxq->napi); - skb_queue_purge(&rxq->skbq); - } + hfi1_netdev_rx_destroy(dd); } static int hfi1_netdev_open(struct net_device *netdev) @@ -660,70 +500,30 @@ static int hfi1_netdev_close(struct net_device *netdev) return 0; } -static int hfi1_vnic_allot_ctxt(struct hfi1_devdata *dd, - struct hfi1_ctxtdata **vnic_ctxt) -{ - int rc; - - rc = allocate_vnic_ctxt(dd, vnic_ctxt); - if (rc) { - dd_dev_err(dd, "vnic ctxt alloc failed %d\n", rc); - return rc; - } - - rc = setup_vnic_ctxt(dd, *vnic_ctxt); - if (rc) { - dd_dev_err(dd, "vnic ctxt setup failed %d\n", rc); - deallocate_vnic_ctxt(dd, *vnic_ctxt); - *vnic_ctxt = NULL; - } - - return rc; -} - static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo) { struct hfi1_devdata *dd = vinfo->dd; - int i, rc = 0; + int rc = 0; mutex_lock(&hfi1_mutex); - if (!dd->vnic.num_vports) { + if (!dd->vnic_num_vports) { rc = hfi1_vnic_txreq_init(dd); if (rc) goto txreq_fail; } - for (i = dd->vnic.num_ctxt; i < vinfo->num_rx_q; i++) { - rc = hfi1_vnic_allot_ctxt(dd, &dd->vnic.ctxt[i]); - if (rc) - break; - hfi1_rcd_get(dd->vnic.ctxt[i]); - dd->vnic.ctxt[i]->vnic_q_idx = i; - } - - if (i < vinfo->num_rx_q) { - /* - * If required amount of contexts is not - * allocated successfully then remaining contexts - * are released. - */ - while (i-- > dd->vnic.num_ctxt) { - deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]); - hfi1_rcd_put(dd->vnic.ctxt[i]); - dd->vnic.ctxt[i] = NULL; - } + if (hfi1_netdev_rx_init(dd)) { + dd_dev_err(dd, "Unable to initialize netdev contexts\n"); goto alloc_fail; } - if (dd->vnic.num_ctxt != i) { - dd->vnic.num_ctxt = i; - hfi1_init_vnic_rsm(dd); - } + hfi1_init_vnic_rsm(dd); - dd->vnic.num_vports++; + dd->vnic_num_vports++; hfi1_vnic_sdma_init(vinfo); + alloc_fail: - if (!dd->vnic.num_vports) + if (!dd->vnic_num_vports) hfi1_vnic_txreq_deinit(dd); txreq_fail: mutex_unlock(&hfi1_mutex); @@ -733,20 +533,14 @@ txreq_fail: static void hfi1_vnic_deinit(struct hfi1_vnic_vport_info *vinfo) { struct hfi1_devdata *dd = vinfo->dd; - int i; mutex_lock(&hfi1_mutex); - if (--dd->vnic.num_vports == 0) { - for (i = 0; i < dd->vnic.num_ctxt; i++) { - deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]); - hfi1_rcd_put(dd->vnic.ctxt[i]); - dd->vnic.ctxt[i] = NULL; - } + if (--dd->vnic_num_vports == 0) { hfi1_deinit_vnic_rsm(dd); - dd->vnic.num_ctxt = 0; hfi1_vnic_txreq_deinit(dd); } mutex_unlock(&hfi1_mutex); + hfi1_netdev_rx_destroy(dd); } static void hfi1_vnic_set_vesw_id(struct net_device *netdev, int id) @@ -815,14 +609,15 @@ struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device, size = sizeof(struct opa_vnic_rdma_netdev) + sizeof(*vinfo); netdev = alloc_netdev_mqs(size, name, name_assign_type, setup, - dd->num_sdma, dd->num_netdev_contexts); + chip_sdma_engines(dd), + dd->num_netdev_contexts); if (!netdev) return ERR_PTR(-ENOMEM); rn = netdev_priv(netdev); vinfo = opa_vnic_dev_priv(netdev); vinfo->dd = dd; - vinfo->num_tx_q = dd->num_sdma; + vinfo->num_tx_q = chip_sdma_engines(dd); vinfo->num_rx_q = dd->num_netdev_contexts; vinfo->netdev = netdev; rn->free_rdma_netdev = hfi1_vnic_free_rn; @@ -841,7 +636,6 @@ struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device, rxq->idx = i; rxq->vinfo = vinfo; rxq->netdev = netdev; - netif_napi_add(netdev, &rxq->napi, hfi1_vnic_napi, 64); } rc = hfi1_vnic_init(vinfo); -- cgit v1.2.3 From b7e159eb008eb8b0bb83c09990b648bd2c4081df Mon Sep 17 00:00:00 2001 From: Gary Leshner Date: Mon, 11 May 2020 12:06:55 -0400 Subject: IB/{hfi1, ipoib, rdma}: Broadcast ping sent packets which exceeded mtu size When in connected mode ipoib sent broadcast pings which exceeded the mtu size for broadcast addresses. Add an mtu attribute to the rdma_netdev structure which ipoib sets to its mcast mtu size. The RDMA netdev uses this value to determine if the skb length is too long for the mtu specified and if it is, drops the packet and logs an error about the errant packet. Link: https://lore.kernel.org/r/20200511160655.173205.14546.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Gary Leshner Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 ++ drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 1 + drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 3 +++ 3 files changed, 6 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 22216f181b24..a6c4322b409b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1896,6 +1896,7 @@ static int ipoib_ndo_init(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); int rc; + struct rdma_netdev *rn = netdev_priv(ndev); if (priv->parent) { ipoib_child_init(ndev); @@ -1908,6 +1909,7 @@ static int ipoib_ndo_init(struct net_device *ndev) /* MTU will be reset when mcast join happens */ ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = ndev->mtu; + rn->mtu = priv->mcast_mtu; ndev->max_mtu = IPOIB_CM_MTU; ndev->neigh_priv_len = sizeof(struct ipoib_neigh); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 7166ee9b7a25..3d5f6b848c9e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -246,6 +246,7 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, if (priv->mcast_mtu == priv->admin_mtu) priv->admin_mtu = IPOIB_UD_MTU(mtu); priv->mcast_mtu = IPOIB_UD_MTU(mtu); + rn->mtu = priv->mcast_mtu; priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); spin_unlock_irq(&priv->lock); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 8ac8e18fbe0c..30865605e098 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -97,6 +97,7 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, { struct net_device *ndev = priv->dev; int result; + struct rdma_netdev *rn = netdev_priv(ndev); ASSERT_RTNL(); @@ -117,6 +118,8 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, goto out_early; } + rn->mtu = priv->mcast_mtu; + priv->parent = ppriv->dev; priv->pkey = pkey; priv->child_type = type; -- cgit v1.2.3 From 7638c0e965f48d773d8684d38e0967e4d0ee238c Mon Sep 17 00:00:00 2001 From: Grzegorz Andrejczuk Date: Mon, 11 May 2020 12:07:01 -0400 Subject: IB/hfi1: Add packet histogram trace event Add a simple trace event taking context number and building simple histogram to print packets distribution between contexts. Link: https://lore.kernel.org/r/20200511160700.173205.84270.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Grzegorz Andrejczuk Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/driver.c | 1 + drivers/infiniband/hw/hfi1/trace.c | 32 ++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/trace_ctxts.h | 11 ++++++++++- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 60ff6de8cf98..a40701a6e1b6 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -1706,6 +1706,7 @@ static void hfi1_ipoib_ib_rcv(struct hfi1_packet *packet) goto drop_no_nd; trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + trace_ctxt_rsm_hist(rcd->ctxt); /* handle congestion notifications */ do_work = hfi1_may_ecn(packet); diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index c8a9988d972d..b219ea90fd6f 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -520,6 +520,38 @@ u16 hfi1_trace_get_tid_idx(u32 ent) return EXP_TID_GET(ent, IDX); } +struct hfi1_ctxt_hist { + atomic_t count; + atomic_t data[255]; +}; + +struct hfi1_ctxt_hist hist = { + .count = ATOMIC_INIT(0) +}; + +const char *hfi1_trace_print_rsm_hist(struct trace_seq *p, unsigned int ctxt) +{ + int i, len = ARRAY_SIZE(hist.data); + const char *ret = trace_seq_buffer_ptr(p); + unsigned long packet_count = atomic_fetch_inc(&hist.count); + + trace_seq_printf(p, "packet[%lu]", packet_count); + for (i = 0; i < len; ++i) { + unsigned long val; + atomic_t *count = &hist.data[i]; + + if (ctxt == i) + val = atomic_fetch_inc(count); + else + val = atomic_read(count); + + if (val) + trace_seq_printf(p, "(%d:%lu)", i, val); + } + trace_seq_putc(p, 0); + return ret; +} + __hfi1_trace_fn(AFFINITY); __hfi1_trace_fn(PKT); __hfi1_trace_fn(PROC); diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h index b5fc5c6cd52f..d8c168dc3ea8 100644 --- a/drivers/infiniband/hw/hfi1/trace_ctxts.h +++ b/drivers/infiniband/hw/hfi1/trace_ctxts.h @@ -1,5 +1,5 @@ /* -* Copyright(c) 2015, 2016 Intel Corporation. +* Copyright(c) 2015 - 2020 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -138,6 +138,15 @@ TRACE_EVENT(hfi1_ctxt_info, ) ); +const char *hfi1_trace_print_rsm_hist(struct trace_seq *p, unsigned int ctxt); +TRACE_EVENT(ctxt_rsm_hist, + TP_PROTO(unsigned int ctxt), + TP_ARGS(ctxt), + TP_STRUCT__entry(__field(unsigned int, ctxt)), + TP_fast_assign(__entry->ctxt = ctxt;), + TP_printk("%s", hfi1_trace_print_rsm_hist(p, __entry->ctxt)) +); + #endif /* __HFI1_TRACE_CTXTS_H */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 8f149b684764662bca3e08f340202b7bd67736fc Mon Sep 17 00:00:00 2001 From: Gary Leshner Date: Mon, 11 May 2020 12:07:06 -0400 Subject: IB/ipoib: Add capability to switch between datagram and connected mode This is the prerequisite modification to the ipoib ulp to allow a rdma netdev to obtain the default ndo ops for init/uninit/open/close. This is accomplished by setting the netdev ops field within the callback function passed to the netdev allocation routine which in turn was passed into the rdma netdev allocation routine. This allows the rdma netdev to call back into the ulp to create the resources required for connected mode operation. Additionally as the ulp is not re-entrant, when switching modes, the number of real tx queues is set to 1 for the connected mode. For datagram mode the number of real tx queues is set to the actual number of tx queues specified at the netdev's allocation. For the internal ulp netdev the number of tx queues defaults to 1. It is up to the rdma netdev to specify the actual number it can support. When the driver does not support a rdma netdev for acceleration, (-ENOTSUPPORTED return code or the verbs function for allocation is NULL) the ipoib ulp functions are unaffected by using the internal netdev allocated by the ipoib ulp. Link: https://lore.kernel.org/r/20200511160706.173205.19086.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Gary Leshner Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index a6c4322b409b..d12e5c9c38af 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -526,6 +526,7 @@ int ipoib_set_mode(struct net_device *dev, const char *buf) "will cause multicast packet drops\n"); netdev_update_features(dev); dev_set_mtu(dev, ipoib_cm_max_mtu(dev)); + netif_set_real_num_tx_queues(dev, 1); rtnl_unlock(); priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; @@ -537,6 +538,7 @@ int ipoib_set_mode(struct net_device *dev, const char *buf) clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); netdev_update_features(dev); dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); + netif_set_real_num_tx_queues(dev, dev->num_tx_queues); rtnl_unlock(); ipoib_flush_paths(dev); return (!rtnl_trylock()) ? -EBUSY : 0; @@ -2071,9 +2073,17 @@ static const struct net_device_ops ipoib_netdev_ops_vf = { .ndo_do_ioctl = ipoib_ioctl, }; +static const struct net_device_ops ipoib_netdev_default_pf = { + .ndo_init = ipoib_dev_init_default, + .ndo_uninit = ipoib_dev_uninit_default, + .ndo_open = ipoib_ib_dev_open_default, + .ndo_stop = ipoib_ib_dev_stop_default, +}; + void ipoib_setup_common(struct net_device *dev) { dev->header_ops = &ipoib_header_ops; + dev->netdev_ops = &ipoib_netdev_default_pf; ipoib_set_ethtool_ops(dev); @@ -2123,13 +2133,6 @@ static void ipoib_build_priv(struct net_device *dev) INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); } -static const struct net_device_ops ipoib_netdev_default_pf = { - .ndo_init = ipoib_dev_init_default, - .ndo_uninit = ipoib_dev_uninit_default, - .ndo_open = ipoib_ib_dev_open_default, - .ndo_stop = ipoib_ib_dev_stop_default, -}; - static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u8 port, const char *name) { @@ -2167,7 +2170,6 @@ int ipoib_intf_init(struct ib_device *hca, u8 port, const char *name, if (rc != -EOPNOTSUPP) goto out; - dev->netdev_ops = &ipoib_netdev_default_pf; rn->send = ipoib_send; rn->attach_mcast = ipoib_mcast_attach; rn->detach_mcast = ipoib_mcast_detach; -- cgit v1.2.3 From 0ad45e5fdc522b26242882abfca1b4b3c840961d Mon Sep 17 00:00:00 2001 From: Piotr Stankiewicz Date: Mon, 11 May 2020 12:07:13 -0400 Subject: IB/hfi1: Enable the transmit side of the datagram ipoib netdev This patch hooks the transmit side of the datagram netdev with ipoib by setting the rdma_netdev_get_params function for the hfi1 ib_device_ops structue. It also enables the receiving side by adding the AIP capability into the default capabilities. Link: https://lore.kernel.org/r/20200511160712.173205.65700.stgit@awfm-01.aw.intel.com Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Piotr Stankiewicz Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/common.h | 1 + drivers/infiniband/hw/hfi1/verbs.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h index 606254513640..ff423e546b80 100644 --- a/drivers/infiniband/hw/hfi1/common.h +++ b/drivers/infiniband/hw/hfi1/common.h @@ -160,6 +160,7 @@ HFI1_CAP_PKEY_CHECK | \ HFI1_CAP_MULTI_PKT_EGR | \ HFI1_CAP_EXTENDED_PSN | \ + HFI1_CAP_AIP | \ ((HFI1_CAP_HDRSUPP | \ HFI1_CAP_MULTI_PKT_EGR | \ HFI1_CAP_STATIC_RATE_CTRL | \ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 19d5d0061b01..43ddced15951 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -66,6 +66,7 @@ #include "vnic.h" #include "fault.h" #include "affinity.h" +#include "ipoib.h" static unsigned int hfi1_lkey_table_size = 16; module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, @@ -1795,6 +1796,7 @@ static const struct ib_device_ops hfi1_dev_ops = { .modify_device = modify_device, /* keep process mad in the driver */ .process_mad = hfi1_process_mad, + .rdma_netdev_get_params = hfi1_ipoib_rn_get_params, }; /** -- cgit v1.2.3 From 0ac8903cbbe618d947b5815d6e0f7b044ee83aa3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 19 May 2020 10:27:05 +0300 Subject: RDMA/core: Allow the ioctl layer to abort a fully created uobject While creating a uobject every create reaches a point where the uobject is fully initialized. For ioctls that go on to copy_to_user this means they need to open code the destruction of a fully created uobject - ie the RDMA_REMOVE_DESTROY sort of flow. Open coding this creates bugs, eg the CQ does not properly flush the events list when it does its error unwind. Provide a uverbs_finalize_uobj_create() function which indicates that the uobject is fully initialized and that abort should call to destroy_hw to destroy the uobj->object and related. Methods can call this function if they go on to have error cases after setting uobj->object. Once done those error cases can simply do return, without an error unwind. Link: https://lore.kernel.org/r/20200519072711.257271-2-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/rdma_core.c | 25 ++++++++++++++++++++----- drivers/infiniband/core/rdma_core.h | 4 ++-- drivers/infiniband/core/uverbs_cmd.c | 2 +- drivers/infiniband/core/uverbs_ioctl.c | 22 ++++++++++++++++++++-- drivers/infiniband/core/uverbs_std_types_cq.c | 8 ++------ drivers/infiniband/core/uverbs_std_types_mr.c | 12 +++--------- drivers/infiniband/hw/mlx5/devx.c | 10 ++++------ drivers/infiniband/hw/mlx5/main.c | 24 ++++++------------------ drivers/infiniband/hw/mlx5/qos.c | 13 +++++-------- include/rdma/ib_verbs.h | 5 +++++ include/rdma/uverbs_ioctl.h | 3 +++ include/rdma/uverbs_std_types.h | 2 +- include/rdma/uverbs_types.h | 3 ++- 13 files changed, 74 insertions(+), 59 deletions(-) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index bf8e149d3191..de3858515275 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -130,6 +130,17 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj, lockdep_assert_held(&ufile->hw_destroy_rwsem); assert_uverbs_usecnt(uobj, UVERBS_LOOKUP_WRITE); + if (reason == RDMA_REMOVE_ABORT_HWOBJ) { + reason = RDMA_REMOVE_ABORT; + ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason, + attrs); + /* + * Drivers are not permitted to ignore RDMA_REMOVE_ABORT, see + * ib_is_destroy_retryable, cleanup_retryable == false here. + */ + WARN_ON(ret); + } + if (reason == RDMA_REMOVE_ABORT) { WARN_ON(!list_empty(&uobj->list)); WARN_ON(!uobj->context); @@ -647,11 +658,15 @@ void rdma_alloc_commit_uobject(struct ib_uobject *uobj, * object and anything else connected to uobj before calling this. */ void rdma_alloc_abort_uobject(struct ib_uobject *uobj, - struct uverbs_attr_bundle *attrs) + struct uverbs_attr_bundle *attrs, + bool hw_obj_valid) { struct ib_uverbs_file *ufile = uobj->ufile; - uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs); + uverbs_destroy_uobject(uobj, + hw_obj_valid ? RDMA_REMOVE_ABORT_HWOBJ : + RDMA_REMOVE_ABORT, + attrs); /* Matches the down_read in rdma_alloc_begin_uobject */ up_read(&ufile->hw_destroy_rwsem); @@ -921,8 +936,8 @@ uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, } void uverbs_finalize_object(struct ib_uobject *uobj, - enum uverbs_obj_access access, bool commit, - struct uverbs_attr_bundle *attrs) + enum uverbs_obj_access access, bool hw_obj_valid, + bool commit, struct uverbs_attr_bundle *attrs) { /* * refcounts should be handled at the object level and not at the @@ -945,7 +960,7 @@ void uverbs_finalize_object(struct ib_uobject *uobj, if (commit) rdma_alloc_commit_uobject(uobj, attrs); else - rdma_alloc_abort_uobject(uobj, attrs); + rdma_alloc_abort_uobject(uobj, attrs, hw_obj_valid); break; default: WARN_ON(true); diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 33978e0f1262..2b529233e159 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -64,8 +64,8 @@ uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, s64 id, struct uverbs_attr_bundle *attrs); void uverbs_finalize_object(struct ib_uobject *uobj, - enum uverbs_obj_access access, bool commit, - struct uverbs_attr_bundle *attrs); + enum uverbs_obj_access access, bool hw_obj_valid, + bool commit, struct uverbs_attr_bundle *attrs); int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index d5642bcf93ee..86c97221872d 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -311,7 +311,7 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) return 0; err_uobj: - rdma_alloc_abort_uobject(uobj, attrs); + rdma_alloc_abort_uobject(uobj, attrs, false); err_ucontext: kfree(attrs->context); attrs->context = NULL; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 538affbc517e..42c5696f03bd 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -58,6 +58,7 @@ struct bundle_priv { DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN); DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN); + DECLARE_BITMAP(uobj_hw_obj_valid, UVERBS_API_ATTR_BKEY_LEN); /* * Must be last. bundle ends in a flex array which overlaps @@ -230,7 +231,8 @@ static void uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, for (i = 0; i != attr->len; i++) uverbs_finalize_object(attr->uobjects[i], - spec->u2.objs_arr.access, commit, attrs); + spec->u2.objs_arr.access, false, commit, + attrs); } static int uverbs_process_attr(struct bundle_priv *pbundle, @@ -502,7 +504,9 @@ static void bundle_destroy(struct bundle_priv *pbundle, bool commit) uverbs_finalize_object( attr->obj_attr.uobject, - attr->obj_attr.attr_elm->spec.u.obj.access, commit, + attr->obj_attr.attr_elm->spec.u.obj.access, + test_bit(i, pbundle->uobj_hw_obj_valid), + commit, &pbundle->bundle); } @@ -590,6 +594,8 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, sizeof(pbundle->bundle.attr_present)); memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize)); + memset(pbundle->uobj_hw_obj_valid, 0, + sizeof(pbundle->uobj_hw_obj_valid)); ret = ib_uverbs_run_method(pbundle, hdr->num_attrs); bundle_destroy(pbundle, ret == 0); @@ -784,3 +790,15 @@ int uverbs_copy_to_struct_or_zero(const struct uverbs_attr_bundle *bundle, } return uverbs_copy_to(bundle, idx, from, size); } + +/* Once called an abort will call through to the type's destroy_hw() */ +void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *bundle, + u16 idx) +{ + struct bundle_priv *pbundle = + container_of(bundle, struct bundle_priv, bundle); + + __set_bit(uapi_bkey_attr(uapi_key_attr(idx)), + pbundle->uobj_hw_obj_valid); +} +EXPORT_SYMBOL(uverbs_finalize_uobj_create); diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index da4110a0eea2..73fbbeb2586c 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -129,16 +129,12 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( obj->uevent.uobject.object = cq; obj->uevent.uobject.user_handle = user_handle; rdma_restrack_uadd(&cq->res); + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE); ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, sizeof(cq->cqe)); - if (ret) - goto err_cq; + return ret; - return 0; -err_cq: - ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs)); - cq = NULL; err_free: kfree(cq); err_event_file: diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index c1286a52dc84..a2722ef8496e 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -136,21 +136,15 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)( uobj->object = mr; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE); + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_LKEY, &mr->lkey, sizeof(mr->lkey)); if (ret) - goto err_dereg; + return ret; ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_DM_MR_RESP_RKEY, &mr->rkey, sizeof(mr->rkey)); - if (ret) - goto err_dereg; - - return 0; - -err_dereg: - ib_dereg_mr_user(mr, uverbs_get_cleared_udata(attrs)); - return ret; } diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index c339dd5ee694..3047e7d60a9b 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -2218,14 +2218,12 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)( obj->mdev = dev->mdev; uobj->object = obj; devx_obj_build_destroy_cmd(cmd.in, cmd.out, obj->dinbox, &obj->dinlen, &obj_id); - err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, &obj_id, sizeof(obj_id)); - if (err) - goto err_umem_destroy; + uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE); - return 0; + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, &obj_id, + sizeof(obj_id)); + return err; -err_umem_destroy: - mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, cmd.out, sizeof(cmd.out)); err_umem_release: ib_umem_release(obj->umem); err_obj_free: diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 26f0b39c7f74..623d7898ae6d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6187,26 +6187,20 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_VAR_OBJ_ALLOC)( mmap_offset = mlx5_entry_to_mmap_offset(entry); length = entry->rdma_entry.npages * PAGE_SIZE; uobj->object = entry; + uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE); err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET, &mmap_offset, sizeof(mmap_offset)); if (err) - goto err; + return err; err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID, &entry->page_idx, sizeof(entry->page_idx)); if (err) - goto err; + return err; err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH, &length, sizeof(length)); - if (err) - goto err; - - return 0; - -err: - rdma_user_mmap_entry_remove(&entry->rdma_entry); return err; } @@ -6320,26 +6314,20 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)( mmap_offset = mlx5_entry_to_mmap_offset(entry); length = entry->rdma_entry.npages * PAGE_SIZE; uobj->object = entry; + uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE); err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET, &mmap_offset, sizeof(mmap_offset)); if (err) - goto err; + return err; err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID, &entry->page_idx, sizeof(entry->page_idx)); if (err) - goto err; + return err; err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH, &length, sizeof(length)); - if (err) - goto err; - - return 0; - -err: - rdma_user_mmap_entry_remove(&entry->rdma_entry); return err; } diff --git a/drivers/infiniband/hw/mlx5/qos.c b/drivers/infiniband/hw/mlx5/qos.c index cac878a70edb..dce92554142a 100644 --- a/drivers/infiniband/hw/mlx5/qos.c +++ b/drivers/infiniband/hw/mlx5/qos.c @@ -69,17 +69,14 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_PP_OBJ_ALLOC)( if (err) goto err; - err = uverbs_copy_to(attrs, MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX, - &pp_entry->index, sizeof(pp_entry->index)); - if (err) - goto clean; - pp_entry->mdev = dev->mdev; uobj->object = pp_entry; - return 0; + uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX, + &pp_entry->index, sizeof(pp_entry->index)); + return err; -clean: - mlx5_rl_remove_rate_raw(dev->mdev, pp_entry->index); err: kfree(pp_entry); return err; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 641f4751b062..c988e9205cf9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1491,6 +1491,11 @@ enum rdma_remove_reason { RDMA_REMOVE_DRIVER_REMOVE, /* uobj is being cleaned-up before being committed */ RDMA_REMOVE_ABORT, + /* + * uobj has been fully created, with the uobj->object set, but is being + * cleaned up before being comitted + */ + RDMA_REMOVE_ABORT_HWOBJ, }; struct ib_rdmacg_object { diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 9f3b1e004046..5bd2b037e914 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -737,6 +737,9 @@ uverbs_attr_get_len(const struct uverbs_attr_bundle *attrs_bundle, u16 idx) return attr->ptr_attr.len; } +void uverbs_finalize_uobj_create(const struct uverbs_attr_bundle *attrs_bundle, + u16 idx); + /* * uverbs_attr_ptr_get_array_size() - Get array size pointer by a ptr * attribute. diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 1b28ce1aba07..d6784be27e4b 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -107,7 +107,7 @@ static inline void uobj_put_write(struct ib_uobject *uobj) static inline void uobj_alloc_abort(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs) { - rdma_alloc_abort_uobject(uobj, attrs); + rdma_alloc_abort_uobject(uobj, attrs, false); } static inline struct ib_uobject * diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h index f1cbdae67250..c15b298aa62f 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -139,7 +139,8 @@ void rdma_lookup_put_uobject(struct ib_uobject *uobj, struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj, struct uverbs_attr_bundle *attrs); void rdma_alloc_abort_uobject(struct ib_uobject *uobj, - struct uverbs_attr_bundle *attrs); + struct uverbs_attr_bundle *attrs, + bool hw_obj_valid); void rdma_alloc_commit_uobject(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs); -- cgit v1.2.3 From 98a8890f73489416a1ea49a644565a244d3f729a Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 May 2020 10:27:06 +0300 Subject: IB/uverbs: Refactor related objects to use their own asynchronous event FD Refactor related objects to use their own asynchronous event FD. The ufile event FD will be the default in case an object won't have its own event FD. Link: https://lore.kernel.org/r/20200519072711.257271-3-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 3 ++- drivers/infiniband/core/uverbs_cmd.c | 25 ++++++++++++++++++++++++- drivers/infiniband/core/uverbs_main.c | 14 +++++++------- drivers/infiniband/core/uverbs_std_types_cq.c | 6 ++++++ 4 files changed, 39 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 3d189c7ee59e..34c155f88317 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -142,7 +142,7 @@ struct ib_uverbs_file { * ucontext_lock held */ struct ib_ucontext *ucontext; - struct ib_uverbs_async_event_file *async_file; + struct ib_uverbs_async_event_file *default_async_file; struct list_head list; /* @@ -180,6 +180,7 @@ struct ib_uverbs_mcast_entry { struct ib_uevent_object { struct ib_uobject uobject; + struct ib_uverbs_async_event_file *event_file; /* List member for ib_uverbs_async_event_file list */ struct list_head event_list; u32 events_reported; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 86c97221872d..4859ac0df17c 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1051,6 +1051,10 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, goto err_free; obj->uevent.uobject.object = cq; + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); + memset(&resp, 0, sizeof resp); resp.base.cq_handle = obj->uevent.uobject.id; resp.base.cqe = cq->cqe; @@ -1067,6 +1071,8 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, return obj; err_cb: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs)); cq = NULL; err_free: @@ -1460,6 +1466,9 @@ static int create_qp(struct uverbs_attr_bundle *attrs, } obj->uevent.uobject.object = qp; + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); memset(&resp, 0, sizeof resp); resp.base.qpn = qp->qp_num; @@ -1473,7 +1482,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs, ret = uverbs_response(attrs, &resp, sizeof(resp)); if (ret) - goto err_cb; + goto err_uevent; if (xrcd) { obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, @@ -1498,6 +1507,9 @@ static int create_qp(struct uverbs_attr_bundle *attrs, rdma_alloc_commit_uobject(&obj->uevent.uobject, attrs); return 0; +err_uevent: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); err_cb: ib_destroy_qp_user(qp, uverbs_get_cleared_udata(attrs)); @@ -2975,6 +2987,9 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) atomic_set(&wq->usecnt, 0); atomic_inc(&pd->usecnt); atomic_inc(&cq->usecnt); + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); memset(&resp, 0, sizeof(resp)); resp.wq_handle = obj->uevent.uobject.id; @@ -2993,6 +3008,8 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) return 0; err_copy: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); ib_destroy_wq(wq, uverbs_get_cleared_udata(attrs)); err_put_cq: rdma_lookup_put_uobject(&cq->uobject->uevent.uobject, @@ -3453,6 +3470,10 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, } obj->uevent.uobject.object = srq; + obj->uevent.uobject.user_handle = cmd->user_handle; + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); memset(&resp, 0, sizeof resp); resp.srq_handle = obj->uevent.uobject.id; @@ -3477,6 +3498,8 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, return 0; err_copy: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); ib_destroy_srq_user(srq, uverbs_get_cleared_udata(attrs)); err_put_pd: uobj_put_obj_read(pd); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 6948f8cd1885..47794c85e9af 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -146,8 +146,7 @@ void ib_uverbs_release_ucq(struct ib_uverbs_completion_event_file *ev_file, void ib_uverbs_release_uevent(struct ib_uevent_object *uobj) { - struct ib_uverbs_async_event_file *async_file = - READ_ONCE(uobj->uobject.ufile->async_file); + struct ib_uverbs_async_event_file *async_file = uobj->event_file; struct ib_uverbs_event *evt, *tmp; if (!async_file) @@ -159,6 +158,7 @@ void ib_uverbs_release_uevent(struct ib_uevent_object *uobj) kfree(evt); } spin_unlock_irq(&async_file->ev_queue.lock); + uverbs_uobject_put(&async_file->uobj); } void ib_uverbs_detach_umcast(struct ib_qp *qp, @@ -197,8 +197,8 @@ void ib_uverbs_release_file(struct kref *ref) if (atomic_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); - if (file->async_file) - uverbs_uobject_put(&file->async_file->uobj); + if (file->default_async_file) + uverbs_uobject_put(&file->default_async_file->uobj); put_device(&file->device->dev); if (file->disassociate_page) @@ -427,7 +427,7 @@ void ib_uverbs_async_handler(struct ib_uverbs_async_event_file *async_file, static void uverbs_uobj_event(struct ib_uevent_object *eobj, struct ib_event *event) { - ib_uverbs_async_handler(READ_ONCE(eobj->uobject.ufile->async_file), + ib_uverbs_async_handler(eobj->event_file, eobj->uobject.user_handle, event->event, &eobj->event_list, &eobj->events_reported); } @@ -484,10 +484,10 @@ void ib_uverbs_init_async_event_file( /* The first async_event_file becomes the default one for the file. */ mutex_lock(&uverbs_file->ucontext_lock); - if (!uverbs_file->async_file) { + if (!uverbs_file->default_async_file) { /* Pairs with the put in ib_uverbs_release_file */ uverbs_uobject_get(&async_file->uobj); - smp_store_release(&uverbs_file->async_file, async_file); + smp_store_release(&uverbs_file->default_async_file, async_file); } mutex_unlock(&uverbs_file->ucontext_lock); diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index 73fbbeb2586c..be534b0af4f8 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -100,6 +100,10 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( uverbs_uobject_get(ev_file_uobj); } + obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); + if (obj->uevent.event_file) + uverbs_uobject_get(&obj->uevent.event_file->uobj); + if (attr.comp_vector >= attrs->ufile->device->num_comp_vectors) { ret = -EINVAL; goto err_event_file; @@ -138,6 +142,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( err_free: kfree(cq); err_event_file: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); if (ev_file) uverbs_uobject_put(ev_file_uobj); return ret; -- cgit v1.2.3 From cda9ee494248b890973f5d31cf7851c0d21755b9 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 May 2020 10:27:07 +0300 Subject: IB/uverbs: Extend CQ to get its own asynchronous event FD Extend CQ to get its own asynchronous event FD. The event FD is an optional attribute, in case wasn't given the ufile event FD will be used. Link: https://lore.kernel.org/r/20200519072711.257271-4-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 18 ++++++++++++++++++ drivers/infiniband/core/uverbs_std_types_cq.c | 9 ++++++--- include/uapi/rdma/ib_user_ioctl_cmds.h | 1 + 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 34c155f88317..53a10479958b 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -297,6 +297,24 @@ static inline u32 make_port_cap_flags(const struct ib_port_attr *attr) return res; } +static inline struct ib_uverbs_async_event_file * +ib_uverbs_get_async_event(struct uverbs_attr_bundle *attrs, + u16 id) +{ + struct ib_uobject *async_ev_file_uobj; + struct ib_uverbs_async_event_file *async_ev_file; + + async_ev_file_uobj = uverbs_attr_get_uobject(attrs, id); + if (IS_ERR(async_ev_file_uobj)) + async_ev_file = READ_ONCE(attrs->ufile->default_async_file); + else + async_ev_file = container_of(async_ev_file_uobj, + struct ib_uverbs_async_event_file, + uobj); + if (async_ev_file) + uverbs_uobject_get(&async_ev_file->uobj); + return async_ev_file; +} void copy_port_attr_to_resp(struct ib_port_attr *attr, struct ib_uverbs_query_port_resp *resp, diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index be534b0af4f8..5dce2c7cc323 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -100,9 +100,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( uverbs_uobject_get(ev_file_uobj); } - obj->uevent.event_file = READ_ONCE(attrs->ufile->default_async_file); - if (obj->uevent.event_file) - uverbs_uobject_get(&obj->uevent.event_file->uobj); + obj->uevent.event_file = ib_uverbs_get_async_event( + attrs, UVERBS_ATTR_CREATE_CQ_EVENT_FD); if (attr.comp_vector >= attrs->ufile->device->num_comp_vectors) { ret = -EINVAL; @@ -173,6 +172,10 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, UVERBS_ATTR_TYPE(u32), UA_MANDATORY), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), UVERBS_ATTR_UHW()); static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)( diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index d4ddbe4e696c..286fdc1929e0 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -95,6 +95,7 @@ enum uverbs_attrs_create_cq_cmd_attr_ids { UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, UVERBS_ATTR_CREATE_CQ_FLAGS, UVERBS_ATTR_CREATE_CQ_RESP_CQE, + UVERBS_ATTR_CREATE_CQ_EVENT_FD, }; enum uverbs_attrs_destroy_cq_cmd_attr_ids { -- cgit v1.2.3 From 175ba58d62c84e1216cdf8b4f49f79e55e1ed04b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 May 2020 10:27:08 +0300 Subject: IB/uverbs: Move QP, SRQ, WQ type and flags to UAPI These constants are going to be used in the ioctl interface in coming patches so they are part of the UAPI, place them in the correct header for clarity. Link: https://lore.kernel.org/r/20200519072711.257271-5-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 43 ++++++++++++++++++--------------- include/uapi/rdma/ib_user_ioctl_verbs.h | 34 ++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 19 deletions(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c988e9205cf9..94533ae16697 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1040,9 +1040,9 @@ enum ib_cq_notify_flags { }; enum ib_srq_type { - IB_SRQT_BASIC, - IB_SRQT_XRC, - IB_SRQT_TM, + IB_SRQT_BASIC = IB_UVERBS_SRQT_BASIC, + IB_SRQT_XRC = IB_UVERBS_SRQT_XRC, + IB_SRQT_TM = IB_UVERBS_SRQT_TM, }; static inline bool ib_srq_has_cq(enum ib_srq_type srq_type) @@ -1111,16 +1111,16 @@ enum ib_qp_type { IB_QPT_SMI, IB_QPT_GSI, - IB_QPT_RC, - IB_QPT_UC, - IB_QPT_UD, + IB_QPT_RC = IB_UVERBS_QPT_RC, + IB_QPT_UC = IB_UVERBS_QPT_UC, + IB_QPT_UD = IB_UVERBS_QPT_UD, IB_QPT_RAW_IPV6, IB_QPT_RAW_ETHERTYPE, - IB_QPT_RAW_PACKET = 8, - IB_QPT_XRC_INI = 9, - IB_QPT_XRC_TGT, + IB_QPT_RAW_PACKET = IB_UVERBS_QPT_RAW_PACKET, + IB_QPT_XRC_INI = IB_UVERBS_QPT_XRC_INI, + IB_QPT_XRC_TGT = IB_UVERBS_QPT_XRC_TGT, IB_QPT_MAX, - IB_QPT_DRIVER = 0xFF, + IB_QPT_DRIVER = IB_UVERBS_QPT_DRIVER, /* Reserve a range for qp types internal to the low level driver. * These qp types will not be visible at the IB core layer, so the * IB_QPT_MAX usages should not be affected in the core layer @@ -1139,17 +1139,21 @@ enum ib_qp_type { enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, - IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = + IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, IB_QP_CREATE_MANAGED_SEND = 1 << 3, IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_INTEGRITY_EN = 1 << 6, IB_QP_CREATE_NETDEV_USE = 1 << 7, - IB_QP_CREATE_SCATTER_FCS = 1 << 8, - IB_QP_CREATE_CVLAN_STRIPPING = 1 << 9, + IB_QP_CREATE_SCATTER_FCS = + IB_UVERBS_QP_CREATE_SCATTER_FCS, + IB_QP_CREATE_CVLAN_STRIPPING = + IB_UVERBS_QP_CREATE_CVLAN_STRIPPING, IB_QP_CREATE_SOURCE_QPN = 1 << 10, - IB_QP_CREATE_PCI_WRITE_END_PADDING = 1 << 11, + IB_QP_CREATE_PCI_WRITE_END_PADDING = + IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, @@ -1654,7 +1658,7 @@ enum ib_raw_packet_caps { }; enum ib_wq_type { - IB_WQT_RQ + IB_WQT_RQ = IB_UVERBS_WQT_RQ, }; enum ib_wq_state { @@ -1677,10 +1681,11 @@ struct ib_wq { }; enum ib_wq_flags { - IB_WQ_FLAGS_CVLAN_STRIPPING = 1 << 0, - IB_WQ_FLAGS_SCATTER_FCS = 1 << 1, - IB_WQ_FLAGS_DELAY_DROP = 1 << 2, - IB_WQ_FLAGS_PCI_WRITE_END_PADDING = 1 << 3, + IB_WQ_FLAGS_CVLAN_STRIPPING = IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING, + IB_WQ_FLAGS_SCATTER_FCS = IB_UVERBS_WQ_FLAGS_SCATTER_FCS, + IB_WQ_FLAGS_DELAY_DROP = IB_UVERBS_WQ_FLAGS_DELAY_DROP, + IB_WQ_FLAGS_PCI_WRITE_END_PADDING = + IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING, }; struct ib_wq_init_attr { diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index a640bb814be0..b1662dfe86a6 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -64,6 +64,40 @@ enum ib_uverbs_access_flags { ~(IB_UVERBS_ACCESS_OPTIONAL_FIRST - 1) }; +enum ib_uverbs_srq_type { + IB_UVERBS_SRQT_BASIC, + IB_UVERBS_SRQT_XRC, + IB_UVERBS_SRQT_TM, +}; + +enum ib_uverbs_wq_type { + IB_UVERBS_WQT_RQ, +}; + +enum ib_uverbs_wq_flags { + IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING = 1 << 0, + IB_UVERBS_WQ_FLAGS_SCATTER_FCS = 1 << 1, + IB_UVERBS_WQ_FLAGS_DELAY_DROP = 1 << 2, + IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING = 1 << 3, +}; + +enum ib_uverbs_qp_type { + IB_UVERBS_QPT_RC = 2, + IB_UVERBS_QPT_UC, + IB_UVERBS_QPT_UD, + IB_UVERBS_QPT_RAW_PACKET = 8, + IB_UVERBS_QPT_XRC_INI, + IB_UVERBS_QPT_XRC_TGT, + IB_UVERBS_QPT_DRIVER = 0xFF, +}; + +enum ib_uverbs_qp_create_flags { + IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + IB_UVERBS_QP_CREATE_SCATTER_FCS = 1 << 8, + IB_UVERBS_QP_CREATE_CVLAN_STRIPPING = 1 << 9, + IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING = 1 << 11, +}; + enum ib_uverbs_query_port_cap_flags { IB_UVERBS_PCF_SM = 1 << 1, IB_UVERBS_PCF_NOTICE_SUP = 1 << 2, -- cgit v1.2.3 From c3eab946aba443f0b44a08f446735c74495610a9 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 May 2020 10:27:09 +0300 Subject: IB/uverbs: Introduce create/destroy SRQ commands over ioctl Introduce create/destroy SRQ commands over the ioctl interface to let it be extended to get an asynchronous event FD. Link: https://lore.kernel.org/r/20200519072711.257271-6-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 3 +- drivers/infiniband/core/rdma_core.h | 1 + drivers/infiniband/core/uverbs_std_types.c | 32 ---- drivers/infiniband/core/uverbs_std_types_srq.c | 234 +++++++++++++++++++++++++ drivers/infiniband/core/uverbs_uapi.c | 1 + include/uapi/rdma/ib_user_ioctl_cmds.h | 27 +++ 6 files changed, 265 insertions(+), 33 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_std_types_srq.c diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 870f0fcd54d5..d7b46a7c07fd 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -36,6 +36,7 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ uverbs_uapi.o uverbs_std_types_device.o \ - uverbs_std_types_async_fd.o + uverbs_std_types_async_fd.o \ + uverbs_std_types_srq.o ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 2b529233e159..d623f911b70b 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -159,6 +159,7 @@ extern const struct uapi_definition uverbs_def_obj_dm[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; +extern const struct uapi_definition uverbs_def_obj_srq[]; extern const struct uapi_definition uverbs_def_write_intf[]; static inline const struct uverbs_api_write_method * diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 3abfc63225cb..d9b6912eafa8 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -142,31 +142,6 @@ static int uverbs_free_wq(struct ib_uobject *uobject, return ret; } -static int uverbs_free_srq(struct ib_uobject *uobject, - enum rdma_remove_reason why, - struct uverbs_attr_bundle *attrs) -{ - struct ib_srq *srq = uobject->object; - struct ib_uevent_object *uevent = - container_of(uobject, struct ib_uevent_object, uobject); - enum ib_srq_type srq_type = srq->srq_type; - int ret; - - ret = ib_destroy_srq_user(srq, &attrs->driver_udata); - if (ib_is_destroy_retryable(ret, why, uobject)) - return ret; - - if (srq_type == IB_SRQT_XRC) { - struct ib_usrq_object *us = - container_of(uevent, struct ib_usrq_object, uevent); - - atomic_dec(&us->uxrcd->refcnt); - } - - ib_uverbs_release_uevent(uevent); - return ret; -} - static int uverbs_free_xrcd(struct ib_uobject *uobject, enum rdma_remove_reason why, struct uverbs_attr_bundle *attrs) @@ -267,11 +242,6 @@ DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW, UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw), &UVERBS_METHOD(UVERBS_METHOD_MW_DESTROY)); -DECLARE_UVERBS_NAMED_OBJECT( - UVERBS_OBJECT_SRQ, - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), - uverbs_free_srq)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_AH_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_AH_HANDLE, @@ -346,8 +316,6 @@ const struct uapi_definition uverbs_def_obj_intf[] = { UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MW, UAPI_DEF_OBJ_NEEDS_FN(dealloc_mw)), - UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_SRQ, - UAPI_DEF_OBJ_NEEDS_FN(destroy_srq)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_FLOW, UAPI_DEF_OBJ_NEEDS_FN(destroy_flow)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ, diff --git a/drivers/infiniband/core/uverbs_std_types_srq.c b/drivers/infiniband/core/uverbs_std_types_srq.c new file mode 100644 index 000000000000..c0ecbba26bf4 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_srq.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_srq(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_srq *srq = uobject->object; + struct ib_uevent_object *uevent = + container_of(uobject, struct ib_uevent_object, uobject); + enum ib_srq_type srq_type = srq->srq_type; + int ret; + + ret = ib_destroy_srq_user(srq, &attrs->driver_udata); + if (ib_is_destroy_retryable(ret, why, uobject)) + return ret; + + if (srq_type == IB_SRQT_XRC) { + struct ib_usrq_object *us = + container_of(uobject, struct ib_usrq_object, + uevent.uobject); + + atomic_dec(&us->uxrcd->refcnt); + } + + ib_uverbs_release_uevent(uevent); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_SRQ_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_usrq_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_SRQ_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_SRQ_PD_HANDLE); + struct ib_srq_init_attr attr = {}; + struct ib_uobject *xrcd_uobj; + struct ib_srq *srq; + u64 user_handle; + int ret; + + ret = uverbs_copy_from(&attr.attr.max_sge, attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_SGE); + if (!ret) + ret = uverbs_copy_from(&attr.attr.max_wr, attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_WR); + if (!ret) + ret = uverbs_copy_from(&attr.attr.srq_limit, attrs, + UVERBS_ATTR_CREATE_SRQ_LIMIT); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_SRQ_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&attr.srq_type, attrs, + UVERBS_ATTR_CREATE_SRQ_TYPE); + if (ret) + return ret; + + if (ib_srq_has_cq(attr.srq_type)) { + attr.ext.cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE); + if (IS_ERR(attr.ext.cq)) + return PTR_ERR(attr.ext.cq); + } + + switch (attr.srq_type) { + case IB_UVERBS_SRQT_XRC: + xrcd_uobj = uverbs_attr_get_uobject(attrs, + UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE); + if (IS_ERR(xrcd_uobj)) + return PTR_ERR(xrcd_uobj); + + attr.ext.xrc.xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!attr.ext.xrc.xrcd) + return -EINVAL; + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + break; + case IB_UVERBS_SRQT_TM: + ret = uverbs_copy_from(&attr.ext.tag_matching.max_num_tags, + attrs, + UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS); + if (ret) + return ret; + break; + case IB_UVERBS_SRQT_BASIC: + break; + default: + return -EINVAL; + } + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_SRQ_EVENT_FD); + INIT_LIST_HEAD(&obj->uevent.event_list); + attr.event_handler = ib_uverbs_srq_event_handler; + obj->uevent.uobject.user_handle = user_handle; + + srq = ib_create_srq_user(pd, &attr, obj, &attrs->driver_udata); + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); + goto err; + } + + obj->uevent.uobject.object = srq; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_SRQ_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + &attr.attr.max_wr, + sizeof(attr.attr.max_wr)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + &attr.attr.max_sge, + sizeof(attr.attr.max_sge)); + if (ret) + return ret; + + if (attr.srq_type == IB_SRQT_XRC) { + ret = uverbs_copy_to(attrs, + UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, + &srq->ext.xrc.srq_num, + sizeof(srq->ext.xrc.srq_num)); + if (ret) + return ret; + } + + return 0; +err: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + if (attr.srq_type == IB_SRQT_XRC) + atomic_dec(&obj->uxrcd->refcnt); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_SRQ_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_SRQ_TYPE, + enum ib_uverbs_srq_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_LIMIT, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE, + UVERBS_OBJECT_XRCD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_SRQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_SRQ_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_SRQ_HANDLE); + struct ib_usrq_object *obj = + container_of(uobj, struct ib_usrq_object, uevent.uobject); + struct ib_uverbs_destroy_srq_resp resp = { + .events_reported = obj->uevent.events_reported + }; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_SRQ_RESP, &resp, + sizeof(resp)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_SRQ_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_SRQ_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_srq_resp), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_SRQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), + uverbs_free_srq), + &UVERBS_METHOD(UVERBS_METHOD_SRQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_SRQ_DESTROY) +); + +const struct uapi_definition uverbs_def_obj_srq[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_SRQ, + UAPI_DEF_OBJ_NEEDS_FN(destroy_srq)), + {} +}; diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 3f121ac31e0a..3f5627954fe7 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -634,6 +634,7 @@ static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), + UAPI_DEF_CHAIN(uverbs_def_obj_srq), UAPI_DEF_CHAIN(uverbs_def_write_intf), {}, }; diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 286fdc1929e0..c07af46ff04c 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -121,6 +121,33 @@ enum uverbs_attrs_destroy_flow_action_esp { UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, }; +enum uverbs_attrs_create_srq_cmd_attr_ids { + UVERBS_ATTR_CREATE_SRQ_HANDLE, + UVERBS_ATTR_CREATE_SRQ_PD_HANDLE, + UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE, + UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE, + UVERBS_ATTR_CREATE_SRQ_USER_HANDLE, + UVERBS_ATTR_CREATE_SRQ_MAX_WR, + UVERBS_ATTR_CREATE_SRQ_MAX_SGE, + UVERBS_ATTR_CREATE_SRQ_LIMIT, + UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS, + UVERBS_ATTR_CREATE_SRQ_TYPE, + UVERBS_ATTR_CREATE_SRQ_EVENT_FD, + UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, +}; + +enum uverbs_attrs_destroy_srq_cmd_attr_ids { + UVERBS_ATTR_DESTROY_SRQ_HANDLE, + UVERBS_ATTR_DESTROY_SRQ_RESP, +}; + +enum uverbs_methods_srq { + UVERBS_METHOD_SRQ_CREATE, + UVERBS_METHOD_SRQ_DESTROY, +}; + enum uverbs_methods_cq { UVERBS_METHOD_CQ_CREATE, UVERBS_METHOD_CQ_DESTROY, -- cgit v1.2.3 From ef3bc084a8ed461e3d1f82481f47dacb96596f8f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 May 2020 10:27:10 +0300 Subject: IB/uverbs: Introduce create/destroy WQ commands over ioctl Introduce create/destroy WQ commands over the ioctl interface to let it be extended to get an asynchronous event FD. Link: https://lore.kernel.org/r/20200519072711.257271-7-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 3 +- drivers/infiniband/core/rdma_core.h | 1 + drivers/infiniband/core/uverbs_std_types.c | 23 --- drivers/infiniband/core/uverbs_std_types_wq.c | 194 ++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_uapi.c | 1 + include/uapi/rdma/ib_user_ioctl_cmds.h | 25 ++++ 6 files changed, 223 insertions(+), 24 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_std_types_wq.c diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index d7b46a7c07fd..96c0a4b5af18 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -37,6 +37,7 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ uverbs_uapi.o uverbs_std_types_device.o \ uverbs_std_types_async_fd.o \ - uverbs_std_types_srq.o + uverbs_std_types_srq.o \ + uverbs_std_types_wq.o ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index d623f911b70b..9e9f2fa04fb9 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -160,6 +160,7 @@ extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; extern const struct uapi_definition uverbs_def_obj_srq[]; +extern const struct uapi_definition uverbs_def_obj_wq[]; extern const struct uapi_definition uverbs_def_write_intf[]; static inline const struct uverbs_api_write_method * diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index d9b6912eafa8..c328d5194076 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -125,23 +125,6 @@ static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, return ret; } -static int uverbs_free_wq(struct ib_uobject *uobject, - enum rdma_remove_reason why, - struct uverbs_attr_bundle *attrs) -{ - struct ib_wq *wq = uobject->object; - struct ib_uwq_object *uwq = - container_of(uobject, struct ib_uwq_object, uevent.uobject); - int ret; - - ret = ib_destroy_wq(wq, &attrs->driver_udata); - if (ib_is_destroy_retryable(ret, why, uobject)) - return ret; - - ib_uverbs_release_uevent(&uwq->uevent); - return ret; -} - static int uverbs_free_xrcd(struct ib_uobject *uobject, enum rdma_remove_reason why, struct uverbs_attr_bundle *attrs) @@ -266,10 +249,6 @@ DECLARE_UVERBS_NAMED_OBJECT( uverbs_free_flow), &UVERBS_METHOD(UVERBS_METHOD_FLOW_DESTROY)); -DECLARE_UVERBS_NAMED_OBJECT( - UVERBS_OBJECT_WQ, - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_RWQ_IND_TBL_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_RWQ_IND_TBL_HANDLE, @@ -318,8 +297,6 @@ const struct uapi_definition uverbs_def_obj_intf[] = { UAPI_DEF_OBJ_NEEDS_FN(dealloc_mw)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_FLOW, UAPI_DEF_OBJ_NEEDS_FN(destroy_flow)), - UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ, - UAPI_DEF_OBJ_NEEDS_FN(destroy_wq)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED( UVERBS_OBJECT_RWQ_IND_TBL, UAPI_DEF_OBJ_NEEDS_FN(destroy_rwq_ind_table)), diff --git a/drivers/infiniband/core/uverbs_std_types_wq.c b/drivers/infiniband/core/uverbs_std_types_wq.c new file mode 100644 index 000000000000..cad842ede077 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_wq.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_free_wq(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_wq *wq = uobject->object; + struct ib_uwq_object *uwq = + container_of(uobject, struct ib_uwq_object, uevent.uobject); + int ret; + + ret = ib_destroy_wq(wq, &attrs->driver_udata); + if (ib_is_destroy_retryable(ret, why, uobject)) + return ret; + + ib_uverbs_release_uevent(&uwq->uevent); + return ret; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_WQ_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uwq_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_WQ_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_WQ_PD_HANDLE); + struct ib_cq *cq = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_CREATE_WQ_CQ_HANDLE); + struct ib_wq_init_attr wq_init_attr = {}; + struct ib_wq *wq; + u64 user_handle; + int ret; + + ret = uverbs_get_flags32(&wq_init_attr.create_flags, attrs, + UVERBS_ATTR_CREATE_WQ_FLAGS, + IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING | + IB_UVERBS_WQ_FLAGS_SCATTER_FCS | + IB_UVERBS_WQ_FLAGS_DELAY_DROP | + IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING); + if (!ret) + ret = uverbs_copy_from(&wq_init_attr.max_sge, attrs, + UVERBS_ATTR_CREATE_WQ_MAX_SGE); + if (!ret) + ret = uverbs_copy_from(&wq_init_attr.max_wr, attrs, + UVERBS_ATTR_CREATE_WQ_MAX_WR); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_WQ_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&wq_init_attr.wq_type, attrs, + UVERBS_ATTR_CREATE_WQ_TYPE); + if (ret) + return ret; + + if (wq_init_attr.wq_type != IB_WQT_RQ) + return -EINVAL; + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_WQ_EVENT_FD); + obj->uevent.uobject.user_handle = user_handle; + INIT_LIST_HEAD(&obj->uevent.event_list); + wq_init_attr.event_handler = ib_uverbs_wq_event_handler; + wq_init_attr.wq_context = attrs->ufile; + wq_init_attr.cq = cq; + + wq = pd->device->ops.create_wq(pd, &wq_init_attr, &attrs->driver_udata); + if (IS_ERR(wq)) { + ret = PTR_ERR(wq); + goto err; + } + + obj->uevent.uobject.object = wq; + wq->wq_type = wq_init_attr.wq_type; + wq->cq = cq; + wq->pd = pd; + wq->device = pd->device; + wq->wq_context = wq_init_attr.wq_context; + atomic_set(&wq->usecnt, 0); + atomic_inc(&pd->usecnt); + atomic_inc(&cq->usecnt); + wq->uobject = obj; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_WQ_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + &wq_init_attr.max_wr, + sizeof(wq_init_attr.max_wr)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + &wq_init_attr.max_sge, + sizeof(wq_init_attr.max_sge)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, + &wq->wq_num, + sizeof(wq->wq_num)); + return ret; + +err: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_WQ_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_HANDLE, + UVERBS_OBJECT_WQ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_WQ_TYPE, + enum ib_wq_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_WQ_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_WQ_FLAGS, + enum ib_uverbs_wq_flags, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_WQ_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_WQ_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_WQ_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_WQ_HANDLE); + struct ib_uwq_object *obj = + container_of(uobj, struct ib_uwq_object, uevent.uobject); + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_WQ_RESP, + &obj->uevent.events_reported, + sizeof(obj->uevent.events_reported)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_WQ_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_WQ_HANDLE, + UVERBS_OBJECT_WQ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_WQ_RESP, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_WQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq), + &UVERBS_METHOD(UVERBS_METHOD_WQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_WQ_DESTROY) +); + +const struct uapi_definition uverbs_def_obj_wq[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_WQ, + UAPI_DEF_OBJ_NEEDS_FN(destroy_wq)), + {} +}; diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 3f5627954fe7..0ec8cf86ecfa 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -635,6 +635,7 @@ static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), UAPI_DEF_CHAIN(uverbs_def_obj_srq), + UAPI_DEF_CHAIN(uverbs_def_obj_wq), UAPI_DEF_CHAIN(uverbs_def_write_intf), {}, }; diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index c07af46ff04c..381b17889d20 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -153,6 +153,31 @@ enum uverbs_methods_cq { UVERBS_METHOD_CQ_DESTROY, }; +enum uverbs_attrs_create_wq_cmd_attr_ids { + UVERBS_ATTR_CREATE_WQ_HANDLE, + UVERBS_ATTR_CREATE_WQ_PD_HANDLE, + UVERBS_ATTR_CREATE_WQ_CQ_HANDLE, + UVERBS_ATTR_CREATE_WQ_USER_HANDLE, + UVERBS_ATTR_CREATE_WQ_TYPE, + UVERBS_ATTR_CREATE_WQ_EVENT_FD, + UVERBS_ATTR_CREATE_WQ_MAX_WR, + UVERBS_ATTR_CREATE_WQ_MAX_SGE, + UVERBS_ATTR_CREATE_WQ_FLAGS, + UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, +}; + +enum uverbs_attrs_destroy_wq_cmd_attr_ids { + UVERBS_ATTR_DESTROY_WQ_HANDLE, + UVERBS_ATTR_DESTROY_WQ_RESP, +}; + +enum uverbs_methods_wq { + UVERBS_METHOD_WQ_CREATE, + UVERBS_METHOD_WQ_DESTROY, +}; + enum uverbs_methods_actions_flow_action_ops { UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, UVERBS_METHOD_FLOW_ACTION_DESTROY, -- cgit v1.2.3 From 6d1e7ba241e990b5c6ba7fdaa03d466f852f3c9e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 19 May 2020 10:27:11 +0300 Subject: IB/uverbs: Introduce create/destroy QP commands over ioctl Introduce create/destroy QP commands over the ioctl interface to let it be extended to get an asynchronous event FD. Link: https://lore.kernel.org/r/20200519072711.257271-8-leon@kernel.org Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 3 +- drivers/infiniband/core/rdma_core.h | 1 + drivers/infiniband/core/uverbs_std_types.c | 40 --- drivers/infiniband/core/uverbs_std_types_qp.c | 401 ++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_uapi.c | 1 + include/uapi/rdma/ib_user_ioctl_cmds.h | 28 ++ include/uapi/rdma/ib_user_ioctl_verbs.h | 9 + 7 files changed, 442 insertions(+), 41 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_std_types_qp.c diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 96c0a4b5af18..63c1591223ac 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -38,6 +38,7 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ uverbs_uapi.o uverbs_std_types_device.o \ uverbs_std_types_async_fd.o \ uverbs_std_types_srq.o \ - uverbs_std_types_wq.o + uverbs_std_types_wq.o \ + uverbs_std_types_qp.o ib_uverbs-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_uverbs-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 9e9f2fa04fb9..33706dad6c0f 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -159,6 +159,7 @@ extern const struct uapi_definition uverbs_def_obj_dm[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; +extern const struct uapi_definition uverbs_def_obj_qp[]; extern const struct uapi_definition uverbs_def_obj_srq[]; extern const struct uapi_definition uverbs_def_obj_wq[]; extern const struct uapi_definition uverbs_def_write_intf[]; diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index c328d5194076..08c39cfb1bd9 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -75,40 +75,6 @@ static int uverbs_free_mw(struct ib_uobject *uobject, return uverbs_dealloc_mw((struct ib_mw *)uobject->object); } -static int uverbs_free_qp(struct ib_uobject *uobject, - enum rdma_remove_reason why, - struct uverbs_attr_bundle *attrs) -{ - struct ib_qp *qp = uobject->object; - struct ib_uqp_object *uqp = - container_of(uobject, struct ib_uqp_object, uevent.uobject); - int ret; - - /* - * If this is a user triggered destroy then do not allow destruction - * until the user cleans up all the mcast bindings. Unlike in other - * places we forcibly clean up the mcast attachments for !DESTROY - * because the mcast attaches are not ubojects and will not be - * destroyed by anything else during cleanup processing. - */ - if (why == RDMA_REMOVE_DESTROY) { - if (!list_empty(&uqp->mcast_list)) - return -EBUSY; - } else if (qp == qp->real_qp) { - ib_uverbs_detach_umcast(qp, uqp); - } - - ret = ib_destroy_qp_user(qp, &attrs->driver_udata); - if (ib_is_destroy_retryable(ret, why, uobject)) - return ret; - - if (uqp->uxrcd) - atomic_dec(&uqp->uxrcd->refcnt); - - ib_uverbs_release_uevent(&uqp->uevent); - return ret; -} - static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, enum rdma_remove_reason why, struct uverbs_attr_bundle *attrs) @@ -210,10 +176,6 @@ DECLARE_UVERBS_NAMED_OBJECT( "[infinibandevent]", O_RDONLY)); -DECLARE_UVERBS_NAMED_OBJECT( - UVERBS_OBJECT_QP, - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp)); - DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_MW_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MW_HANDLE, @@ -289,8 +251,6 @@ const struct uapi_definition uverbs_def_obj_intf[] = { UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_COMP_CHANNEL, UAPI_DEF_OBJ_NEEDS_FN(dealloc_pd)), - UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_QP, - UAPI_DEF_OBJ_NEEDS_FN(destroy_qp)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_AH, UAPI_DEF_OBJ_NEEDS_FN(destroy_ah)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MW, diff --git a/drivers/infiniband/core/uverbs_std_types_qp.c b/drivers/infiniband/core/uverbs_std_types_qp.c new file mode 100644 index 000000000000..3bf8dcdfe7eb --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_qp.c @@ -0,0 +1,401 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include +#include "rdma_core.h" +#include "uverbs.h" +#include "core_priv.h" + +static int uverbs_free_qp(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_qp *qp = uobject->object; + struct ib_uqp_object *uqp = + container_of(uobject, struct ib_uqp_object, uevent.uobject); + int ret; + + /* + * If this is a user triggered destroy then do not allow destruction + * until the user cleans up all the mcast bindings. Unlike in other + * places we forcibly clean up the mcast attachments for !DESTROY + * because the mcast attaches are not ubojects and will not be + * destroyed by anything else during cleanup processing. + */ + if (why == RDMA_REMOVE_DESTROY) { + if (!list_empty(&uqp->mcast_list)) + return -EBUSY; + } else if (qp == qp->real_qp) { + ib_uverbs_detach_umcast(qp, uqp); + } + + ret = ib_destroy_qp_user(qp, &attrs->driver_udata); + if (ib_is_destroy_retryable(ret, why, uobject)) + return ret; + + if (uqp->uxrcd) + atomic_dec(&uqp->uxrcd->refcnt); + + ib_uverbs_release_uevent(&uqp->uevent); + return ret; +} + +static int check_creation_flags(enum ib_qp_type qp_type, + u32 create_flags) +{ + create_flags &= ~IB_UVERBS_QP_CREATE_SQ_SIG_ALL; + + if (!create_flags || qp_type == IB_QPT_DRIVER) + return 0; + + if (qp_type != IB_QPT_RAW_PACKET && qp_type != IB_QPT_UD) + return -EINVAL; + + if ((create_flags & IB_UVERBS_QP_CREATE_SCATTER_FCS || + create_flags & IB_UVERBS_QP_CREATE_CVLAN_STRIPPING) && + qp_type != IB_QPT_RAW_PACKET) + return -EINVAL; + + return 0; +} + +static void set_caps(struct ib_qp_init_attr *attr, + struct ib_uverbs_qp_cap *cap, bool req) +{ + if (req) { + attr->cap.max_send_wr = cap->max_send_wr; + attr->cap.max_recv_wr = cap->max_recv_wr; + attr->cap.max_send_sge = cap->max_send_sge; + attr->cap.max_recv_sge = cap->max_recv_sge; + attr->cap.max_inline_data = cap->max_inline_data; + } else { + cap->max_send_wr = attr->cap.max_send_wr; + cap->max_recv_wr = attr->cap.max_recv_wr; + cap->max_send_sge = attr->cap.max_send_sge; + cap->max_recv_sge = attr->cap.max_recv_sge; + cap->max_inline_data = attr->cap.max_inline_data; + } +} + +static int UVERBS_HANDLER(UVERBS_METHOD_QP_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uqp_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_QP_HANDLE), + typeof(*obj), uevent.uobject); + struct ib_qp_init_attr attr = {}; + struct ib_uverbs_qp_cap cap = {}; + struct ib_rwq_ind_table *rwq_ind_tbl = NULL; + struct ib_qp *qp; + struct ib_pd *pd = NULL; + struct ib_srq *srq = NULL; + struct ib_cq *recv_cq = NULL; + struct ib_cq *send_cq = NULL; + struct ib_xrcd *xrcd = NULL; + struct ib_uobject *xrcd_uobj = NULL; + struct ib_device *device; + u64 user_handle; + int ret; + + ret = uverbs_copy_from_or_zero(&cap, attrs, + UVERBS_ATTR_CREATE_QP_CAP); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, + UVERBS_ATTR_CREATE_QP_USER_HANDLE); + if (!ret) + ret = uverbs_get_const(&attr.qp_type, attrs, + UVERBS_ATTR_CREATE_QP_TYPE); + if (ret) + return ret; + + switch (attr.qp_type) { + case IB_QPT_XRC_TGT: + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_PD_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE)) + return -EINVAL; + + xrcd_uobj = uverbs_attr_get_uobject(attrs, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE); + if (IS_ERR(xrcd_uobj)) + return PTR_ERR(xrcd_uobj); + + xrcd = (struct ib_xrcd *)xrcd_uobj->object; + if (!xrcd) + return -EINVAL; + device = xrcd->device; + break; + case IB_UVERBS_QPT_RAW_PACKET: + if (!capable(CAP_NET_RAW)) + return -EPERM; + fallthrough; + case IB_UVERBS_QPT_RC: + case IB_UVERBS_QPT_UC: + case IB_UVERBS_QPT_UD: + case IB_UVERBS_QPT_XRC_INI: + case IB_UVERBS_QPT_DRIVER: + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE) || + (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE) && + attr.qp_type == IB_QPT_XRC_INI)) + return -EINVAL; + + pd = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_PD_HANDLE); + if (IS_ERR(pd)) + return PTR_ERR(pd); + + rwq_ind_tbl = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE); + if (!IS_ERR(rwq_ind_tbl)) { + if (cap.max_recv_wr || cap.max_recv_sge || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE) || + uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE)) + return -EINVAL; + + /* send_cq is optinal */ + if (cap.max_send_wr) { + send_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE); + if (IS_ERR(send_cq)) + return PTR_ERR(send_cq); + } + attr.rwq_ind_tbl = rwq_ind_tbl; + } else { + send_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE); + if (IS_ERR(send_cq)) + return PTR_ERR(send_cq); + + if (attr.qp_type != IB_QPT_XRC_INI) { + recv_cq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE); + if (IS_ERR(recv_cq)) + return PTR_ERR(recv_cq); + } + } + + device = pd->device; + break; + default: + return -EINVAL; + } + + ret = uverbs_get_flags32(&attr.create_flags, attrs, + UVERBS_ATTR_CREATE_QP_FLAGS, + IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_UVERBS_QP_CREATE_SCATTER_FCS | + IB_UVERBS_QP_CREATE_CVLAN_STRIPPING | + IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING | + IB_UVERBS_QP_CREATE_SQ_SIG_ALL); + if (ret) + return ret; + + ret = check_creation_flags(attr.qp_type, attr.create_flags); + if (ret) + return ret; + + if (uverbs_attr_is_valid(attrs, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN)) { + ret = uverbs_copy_from(&attr.source_qpn, attrs, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN); + if (ret) + return ret; + attr.create_flags |= IB_QP_CREATE_SOURCE_QPN; + } + + srq = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE); + if (!IS_ERR(srq)) { + if ((srq->srq_type == IB_SRQT_XRC && + attr.qp_type != IB_QPT_XRC_TGT) || + (srq->srq_type != IB_SRQT_XRC && + attr.qp_type == IB_QPT_XRC_TGT)) + return -EINVAL; + attr.srq = srq; + } + + obj->uevent.event_file = ib_uverbs_get_async_event(attrs, + UVERBS_ATTR_CREATE_QP_EVENT_FD); + INIT_LIST_HEAD(&obj->uevent.event_list); + INIT_LIST_HEAD(&obj->mcast_list); + obj->uevent.uobject.user_handle = user_handle; + attr.event_handler = ib_uverbs_qp_event_handler; + attr.send_cq = send_cq; + attr.recv_cq = recv_cq; + attr.xrcd = xrcd; + if (attr.create_flags & IB_UVERBS_QP_CREATE_SQ_SIG_ALL) { + /* This creation bit is uverbs one, need to mask before + * calling drivers. It was added to prevent an extra user attr + * only for that when using ioctl. + */ + attr.create_flags &= ~IB_UVERBS_QP_CREATE_SQ_SIG_ALL; + attr.sq_sig_type = IB_SIGNAL_ALL_WR; + } else { + attr.sq_sig_type = IB_SIGNAL_REQ_WR; + } + + set_caps(&attr, &cap, true); + mutex_init(&obj->mcast_lock); + + if (attr.qp_type == IB_QPT_XRC_TGT) + qp = ib_create_qp(pd, &attr); + else + qp = _ib_create_qp(device, pd, &attr, &attrs->driver_udata, + obj); + + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); + goto err_put; + } + + if (attr.qp_type != IB_QPT_XRC_TGT) { + atomic_inc(&pd->usecnt); + if (attr.send_cq) + atomic_inc(&attr.send_cq->usecnt); + if (attr.recv_cq) + atomic_inc(&attr.recv_cq->usecnt); + if (attr.srq) + atomic_inc(&attr.srq->usecnt); + if (attr.rwq_ind_tbl) + atomic_inc(&attr.rwq_ind_tbl->usecnt); + } else { + obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, + uobject); + atomic_inc(&obj->uxrcd->refcnt); + /* It is done in _ib_create_qp for other QP types */ + qp->uobject = obj; + } + + obj->uevent.uobject.object = qp; + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_CREATE_QP_HANDLE); + + if (attr.qp_type != IB_QPT_XRC_TGT) { + ret = ib_create_qp_security(qp, device); + if (ret) + return ret; + } + + set_caps(&attr, &cap, false); + ret = uverbs_copy_to_struct_or_zero(attrs, + UVERBS_ATTR_CREATE_QP_RESP_CAP, &cap, + sizeof(cap)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, + &qp->qp_num, + sizeof(qp->qp_num)); + + return ret; +err_put: + if (obj->uevent.event_file) + uverbs_uobject_put(&obj->uevent.event_file->uobj); + return ret; +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QP_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_HANDLE, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_XRCD_HANDLE, + UVERBS_OBJECT_XRCD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_SRQ_HANDLE, + UVERBS_OBJECT_SRQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_CAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_qp_cap, + max_inline_data), + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_CREATE_QP_TYPE, + enum ib_uverbs_qp_type, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_QP_FLAGS, + enum ib_uverbs_qp_create_flags, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_QP_SOURCE_QPN, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_QP_EVENT_FD, + UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_QP_RESP_CAP, + UVERBS_ATTR_STRUCT(struct ib_uverbs_qp_cap, + max_inline_data), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_QP_DESTROY)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_QP_HANDLE); + struct ib_uqp_object *obj = + container_of(uobj, struct ib_uqp_object, uevent.uobject); + struct ib_uverbs_destroy_qp_resp resp = { + .events_reported = obj->uevent.events_reported + }; + + return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_QP_RESP, &resp, + sizeof(resp)); +} + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_QP_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_QP_HANDLE, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_QP_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_qp_resp), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_QP, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp), + &UVERBS_METHOD(UVERBS_METHOD_QP_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_QP_DESTROY)); + +const struct uapi_definition uverbs_def_obj_qp[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_QP, + UAPI_DEF_OBJ_NEEDS_FN(destroy_qp)), + {} +}; diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 0ec8cf86ecfa..5addc8fae3f3 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -634,6 +634,7 @@ static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), + UAPI_DEF_CHAIN(uverbs_def_obj_qp), UAPI_DEF_CHAIN(uverbs_def_obj_srq), UAPI_DEF_CHAIN(uverbs_def_obj_wq), UAPI_DEF_CHAIN(uverbs_def_write_intf), diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 381b17889d20..4961d5e858eb 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -121,6 +121,34 @@ enum uverbs_attrs_destroy_flow_action_esp { UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, }; +enum uverbs_attrs_create_qp_cmd_attr_ids { + UVERBS_ATTR_CREATE_QP_HANDLE, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE, + UVERBS_ATTR_CREATE_QP_PD_HANDLE, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE, + UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE, + UVERBS_ATTR_CREATE_QP_USER_HANDLE, + UVERBS_ATTR_CREATE_QP_CAP, + UVERBS_ATTR_CREATE_QP_TYPE, + UVERBS_ATTR_CREATE_QP_FLAGS, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN, + UVERBS_ATTR_CREATE_QP_EVENT_FD, + UVERBS_ATTR_CREATE_QP_RESP_CAP, + UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, +}; + +enum uverbs_attrs_destroy_qp_cmd_attr_ids { + UVERBS_ATTR_DESTROY_QP_HANDLE, + UVERBS_ATTR_DESTROY_QP_RESP, +}; + +enum uverbs_methods_qp { + UVERBS_METHOD_QP_CREATE, + UVERBS_METHOD_QP_DESTROY, +}; + enum uverbs_attrs_create_srq_cmd_attr_ids { UVERBS_ATTR_CREATE_SRQ_HANDLE, UVERBS_ATTR_CREATE_SRQ_PD_HANDLE, diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index b1662dfe86a6..5debab45ebcb 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -96,6 +96,7 @@ enum ib_uverbs_qp_create_flags { IB_UVERBS_QP_CREATE_SCATTER_FCS = 1 << 8, IB_UVERBS_QP_CREATE_CVLAN_STRIPPING = 1 << 9, IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING = 1 << 11, + IB_UVERBS_QP_CREATE_SQ_SIG_ALL = 1 << 12, }; enum ib_uverbs_query_port_cap_flags { @@ -219,6 +220,14 @@ struct ib_uverbs_query_port_resp_ex { __u8 reserved[6]; }; +struct ib_uverbs_qp_cap { + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; +}; + enum rdma_driver_id { RDMA_DRIVER_UNKNOWN, RDMA_DRIVER_MLX5, -- cgit v1.2.3 From 47393fb57ba7b914c869f70010326c8b8940c3a0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 May 2020 15:03:47 +0300 Subject: block/rnbd: Fix an IS_ERR() vs NULL check in find_or_create_sess() The alloc_sess() function returns error pointers, it never returns NULL. Fixes: f7a7a5c228d4 ("block/rnbd: client: main functionality") Link: https://lore.kernel.org/r/20200519120347.GD42765@mwanda Signed-off-by: Dan Carpenter Reviewed-by: Jack Wang Signed-off-by: Jason Gunthorpe --- drivers/block/rnbd/rnbd-clt.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 450a571e6a1e..cc6a4e2587ae 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -923,13 +923,12 @@ rnbd_clt_session *find_or_create_sess(const char *sessname, bool *first) sess = __find_and_get_sess(sessname); if (!sess) { sess = alloc_sess(sessname); - if (sess) { - list_add(&sess->list, &sess_list); - *first = true; - } else { + if (IS_ERR(sess)) { mutex_unlock(&sess_lock); - return ERR_PTR(-ENOMEM); + return sess; } + list_add(&sess->list, &sess_list); + *first = true; } else *first = false; mutex_unlock(&sess_lock); -- cgit v1.2.3 From 63a3345c2d42a9b29e1ce2d3a4043689b3995cea Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 21 May 2020 10:26:50 +0300 Subject: IB/cma: Fix ports memory leak in cma_configfs The allocated ports structure in never freed. The free function should be called by release_cma_ports_group, but the group is never released since we don't remove its default group. Remove default groups when device group is deleted. Fixes: 045959db65c6 ("IB/cma: Add configfs for rdma_cm") Link: https://lore.kernel.org/r/20200521072650.567908-1-leon@kernel.org Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma_configfs.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index c672a4978bfd..3c1e2ca564fe 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -322,8 +322,21 @@ fail: return ERR_PTR(err); } +static void drop_cma_dev(struct config_group *cgroup, struct config_item *item) +{ + struct config_group *group = + container_of(item, struct config_group, cg_item); + struct cma_dev_group *cma_dev_group = + container_of(group, struct cma_dev_group, device_group); + + configfs_remove_default_groups(&cma_dev_group->ports_group); + configfs_remove_default_groups(&cma_dev_group->device_group); + config_item_put(item); +} + static struct configfs_group_operations cma_subsys_group_ops = { .make_group = make_cma_dev, + .drop_item = drop_cma_dev, }; static const struct config_item_type cma_subsys_type = { -- cgit v1.2.3 From cdb685cb9158fa67f6f4584ea39279ed7ae39253 Mon Sep 17 00:00:00 2001 From: Danil Kipnis Date: Thu, 21 May 2020 20:59:09 +0200 Subject: RDMA/rnbd: Fix compilation error when CONFIG_MODULES is disabled module_is_live function is only defined when CONFIG_MODULES is enabled. Use try_module_get instead to check whether the module is being removed. When module unload and manuall unmapping is happening in parallel, we can try removing the symlink twice: rnbd_client_exit vs. rnbd_clt_unmap_dev_store. This is probably not the best way to deal with this race in general, but for now this fixes the compilation issue when CONFIG_MODULES is disabled and has no functional impact. Regression tests passed. Fixes: 1eb54f8f5dd8 ("block/rnbd: client: sysfs interface functions") Link: https://lore.kernel.org/r/20200521185909.457245-1-danil.kipnis@cloud.ionos.com Reported-by: Randy Dunlap Suggested-by: Guoqing Jiang Signed-off-by: Danil Kipnis Acked-by: Randy Dunlap Signed-off-by: Jason Gunthorpe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index a4508fcc7ffe..4f4474eecadb 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -428,12 +428,15 @@ static struct attribute *rnbd_dev_attrs[] = { void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev) { /* - * The module_is_live() check is crucial and helps to avoid annoying - * sysfs warning raised in sysfs_remove_link(), when the whole sysfs - * path was just removed, see rnbd_close_sessions(). + * The module unload rnbd_client_exit path is racing with unmapping of + * the last single device from the sysfs manually + * i.e. rnbd_clt_unmap_dev_store() leading to a sysfs warning because + * of sysfs link already was removed already. */ - if (strlen(dev->blk_symlink_name) && module_is_live(THIS_MODULE)) + if (strlen(dev->blk_symlink_name) && try_module_get(THIS_MODULE)) { sysfs_remove_link(rnbd_devs_kobj, dev->blk_symlink_name); + module_put(THIS_MODULE); + } } static struct kobj_type rnbd_dev_ktype = { -- cgit v1.2.3 From e172037be757dc7ab6ee67932c6663a2ff8cfd27 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Fri, 22 May 2020 08:28:33 +0000 Subject: RDMA/rtrs: server: Use already dereferenced rtrs_sess structure The rtrs_sess structure has already been extracted above from the rtrs_srv_sess structure. Use that to avoid redundant dereferencing. Fixes: 9cb837480424 ("RDMA/rtrs: server: main functionality") Link: https://lore.kernel.org/r/20200522082833.1480551-1-haris.phnx@gmail.com Signed-off-by: Md Haris Iqbal Acked-by: Danil Kipnis Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 1fc6ece036ff..5ef8988ee75b 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -1822,13 +1822,13 @@ static int rtrs_rdma_connect(struct rdma_cm_id *cm_id, /* * Sanity checks */ - if (con_num != sess->s.con_num || cid >= sess->s.con_num) { + if (con_num != s->con_num || cid >= s->con_num) { rtrs_err(s, "Incorrect request: %d, %d\n", cid, con_num); mutex_unlock(&srv->paths_mutex); goto reject_w_econnreset; } - if (sess->s.con[cid]) { + if (s->con[cid]) { rtrs_err(s, "Connection already exists: %d\n", cid); mutex_unlock(&srv->paths_mutex); -- cgit v1.2.3 From a94dae867c5663f36c950b82832e146a6c2f0e42 Mon Sep 17 00:00:00 2001 From: Danil Kipnis Date: Fri, 22 May 2020 07:39:24 +0200 Subject: RDMA/rtrs: Get rid of the do_next_path while_next_path macros The macros do_each_path/while_each_path lead to a smatch warning: drivers/infiniband/ulp/rtrs/rtrs-clt.c:1196 rtrs_clt_failover_req() warn: inconsistent indenting drivers/infiniband/ulp/rtrs/rtrs-clt.c:2890 rtrs_clt_request() warn: inconsistent indenting Also checkpatch complains: ERROR: Macros with multiple statements should be enclosed in a do - while loop The macros are used only in two places: for a normal IO path and for the failover path triggered after errors. Get rid of the macros and just use a for loop iterating over the list of paths in both places. It is easier to read and also less lines of code. Fixes: 6a98d71daea1 ("RDMA/rtrs: client: main functionality") Link: https://lore.kernel.org/r/20200522053924.528980-1-danil.kipnis@cloud.ionos.com Reported-by: kbuild test robot Signed-off-by: Danil Kipnis Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/rtrs/rtrs-clt.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 0ab7e5e912c0..564388a85603 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -726,18 +726,6 @@ struct path_it { struct rtrs_clt_sess *(*next_path)(struct path_it *it); }; -#define do_each_path(path, clt, it) { \ - path_it_init(it, clt); \ - rcu_read_lock(); \ - for ((it)->i = 0; ((path) = ((it)->next_path)(it)) && \ - (it)->i < (it)->clt->paths_num; \ - (it)->i++) - -#define while_each_path(it) \ - path_it_deinit(it); \ - rcu_read_unlock(); \ - } - /** * list_next_or_null_rr_rcu - get next list element in round-robin fashion. * @head: the head for the list. @@ -1175,7 +1163,10 @@ static int rtrs_clt_failover_req(struct rtrs_clt *clt, int err = -ECONNABORTED; struct path_it it; - do_each_path(alive_sess, clt, &it) { + rcu_read_lock(); + for (path_it_init(&it, clt); + (alive_sess = it.next_path(&it)) && it.i < it.clt->paths_num; + it.i++) { if (unlikely(READ_ONCE(alive_sess->state) != RTRS_CLT_CONNECTED)) continue; @@ -1191,7 +1182,9 @@ static int rtrs_clt_failover_req(struct rtrs_clt *clt, /* Success path */ rtrs_clt_inc_failover_cnt(alive_sess->stats); break; - } while_each_path(&it); + } + path_it_deinit(&it); + rcu_read_unlock(); return err; } @@ -2862,7 +2855,9 @@ int rtrs_clt_request(int dir, struct rtrs_clt_req_ops *ops, dma_dir = DMA_TO_DEVICE; } - do_each_path(sess, clt, &it) { + rcu_read_lock(); + for (path_it_init(&it, clt); + (sess = it.next_path(&it)) && it.i < it.clt->paths_num; it.i++) { if (unlikely(READ_ONCE(sess->state) != RTRS_CLT_CONNECTED)) continue; @@ -2887,7 +2882,9 @@ int rtrs_clt_request(int dir, struct rtrs_clt_req_ops *ops, } /* Success path */ break; - } while_each_path(&it); + } + path_it_deinit(&it); + rcu_read_unlock(); return err; } -- cgit v1.2.3 From 25966e893143f1cf9b1294bc6e33e3a6b51ed2ad Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Wed, 20 May 2020 21:53:11 +0800 Subject: RDMA/hns: Let software PI/CI grow naturally The hardware can truncate PI/CI when posting or polling, the driver does not need to do truncation. Therefore keep the software's PI/CI consistent with it in the hardware. Link: https://lore.kernel.org/r/1589982799-28728-2-git-send-email-liweihang@huawei.com Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index d2c58d395962..6229b57e3c29 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -500,8 +500,7 @@ static inline void update_sq_db(struct hns_roce_dev *hr_dev, roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_CMD_M, V2_DB_BYTE_4_CMD_S, HNS_ROCE_V2_SQ_DB); roce_set_field(sq_db.parameter, V2_DB_PARAMETER_IDX_M, - V2_DB_PARAMETER_IDX_S, - qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1)); + V2_DB_PARAMETER_IDX_S, qp->sq.head); roce_set_field(sq_db.parameter, V2_DB_PARAMETER_SL_M, V2_DB_PARAMETER_SL_S, qp->sl); @@ -807,7 +806,8 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, srq_db.byte_4 = cpu_to_le32(HNS_ROCE_V2_SRQ_DB << V2_DB_BYTE_4_CMD_S | (srq->srqn & V2_DB_BYTE_4_TAG_M)); - srq_db.parameter = cpu_to_le32(srq->head); + srq_db.parameter = + cpu_to_le32(srq->head & V2_DB_PARAMETER_IDX_M); hns_roce_write64(hr_dev, (__le32 *)&srq_db, srq->db_reg_l); } @@ -2940,8 +2940,7 @@ static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, roce_set_field(doorbell[0], V2_CQ_DB_BYTE_4_CMD_M, V2_DB_BYTE_4_CMD_S, HNS_ROCE_V2_CQ_DB_NTR); roce_set_field(doorbell[1], V2_CQ_DB_PARAMETER_CONS_IDX_M, - V2_CQ_DB_PARAMETER_CONS_IDX_S, - hr_cq->cons_index & ((hr_cq->cq_depth << 1) - 1)); + V2_CQ_DB_PARAMETER_CONS_IDX_S, hr_cq->cons_index); roce_set_field(doorbell[1], V2_CQ_DB_PARAMETER_CMD_SN_M, V2_CQ_DB_PARAMETER_CMD_SN_S, hr_cq->arm_sn & 0x3); roce_set_bit(doorbell[1], V2_CQ_DB_PARAMETER_NOTIFY_S, -- cgit v1.2.3 From 05e6a5a63579d4c55cc996e5148bd6da9ed48860 Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Wed, 20 May 2020 21:53:12 +0800 Subject: RDMA/hns: Add CQ flag instead of independent enable flag It's easier to understand and maintain enable flags of cq using a single field in type of u32 than defining a field for every flags in the structure hns_roce_cq, and we can add new flags for features more conveniently in the future. Link: https://lore.kernel.org/r/1589982799-28728-3-git-send-email-liweihang@huawei.com Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cq.c | 10 +++++----- drivers/infiniband/hw/hns/hns_roce_device.h | 6 +++--- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 6dd8deaffec8..e87d616f7988 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -186,8 +186,8 @@ static int alloc_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, &hr_cq->db); if (err) return err; - hr_cq->db_en = 1; - resp->cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB; + hr_cq->flags |= HNS_ROCE_CQ_FLAG_RECORD_DB; + resp->cap_flags |= HNS_ROCE_CQ_FLAG_RECORD_DB; } } else { if (has_db) { @@ -196,7 +196,7 @@ static int alloc_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, return err; hr_cq->set_ci_db = hr_cq->db.db_record; *hr_cq->set_ci_db = 0; - hr_cq->db_en = 1; + hr_cq->flags |= HNS_ROCE_CQ_FLAG_RECORD_DB; } hr_cq->cq_db_l = hr_dev->reg_base + hr_dev->odb_offset + DB_REG_OFFSET * hr_dev->priv_uar.index; @@ -210,10 +210,10 @@ static void free_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, { struct hns_roce_ucontext *uctx; - if (!hr_cq->db_en) + if (!(hr_cq->flags & HNS_ROCE_CQ_FLAG_RECORD_DB)) return; - hr_cq->db_en = 0; + hr_cq->flags &= ~HNS_ROCE_CQ_FLAG_RECORD_DB; if (udata) { uctx = rdma_udata_to_drv_context(udata, struct hns_roce_ucontext, diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index bd6e295f4669..f614959f97fd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -137,8 +137,8 @@ enum { HNS_ROCE_QP_CAP_SQ_RECORD_DB = BIT(1), }; -enum { - HNS_ROCE_SUPPORT_CQ_RECORD_DB = 1 << 0, +enum hns_roce_cq_flags { + HNS_ROCE_CQ_FLAG_RECORD_DB = BIT(0), }; enum hns_roce_qp_state { @@ -458,7 +458,7 @@ struct hns_roce_cq { struct ib_cq ib_cq; struct hns_roce_mtr mtr; struct hns_roce_db db; - u8 db_en; + u32 flags; spinlock_t lock; u32 cq_depth; u32 cons_index; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 6229b57e3c29..f0021832883d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2898,9 +2898,9 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, roce_set_field(cq_context->byte_40_cqe_ba, V2_CQC_BYTE_40_CQE_BA_M, V2_CQC_BYTE_40_CQE_BA_S, (dma_handle >> (32 + 3))); - if (hr_cq->db_en) - roce_set_bit(cq_context->byte_44_db_record, - V2_CQC_BYTE_44_DB_RECORD_EN_S, 1); + roce_set_bit(cq_context->byte_44_db_record, + V2_CQC_BYTE_44_DB_RECORD_EN_S, + (hr_cq->flags & HNS_ROCE_CQ_FLAG_RECORD_DB) ? 1 : 0); roce_set_field(cq_context->byte_44_db_record, V2_CQC_BYTE_44_DB_RECORD_ADDR_M, -- cgit v1.2.3 From 0db6570947f43a39664ab8665f58101f112cedf3 Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Wed, 20 May 2020 21:53:13 +0800 Subject: RDMA/hns: Optimize post and poll process Add unlikely() and likely() to optimize main I/O process code. Link: https://lore.kernel.org/r/1589982799-28728-4-git-send-email-liweihang@huawei.com Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index f0021832883d..af0911e522f8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -187,15 +187,15 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, int i; if (wr->send_flags & IB_SEND_INLINE && valid_num_sge) { - if (le32_to_cpu(rc_sq_wqe->msg_len) > - hr_dev->caps.max_sq_inline) { + if (unlikely(le32_to_cpu(rc_sq_wqe->msg_len) > + hr_dev->caps.max_sq_inline)) { ibdev_err(ibdev, "inline len(1-%d)=%d, illegal", rc_sq_wqe->msg_len, hr_dev->caps.max_sq_inline); return -EINVAL; } - if (wr->opcode == IB_WR_RDMA_READ) { + if (unlikely(wr->opcode == IB_WR_RDMA_READ)) { ibdev_err(ibdev, "Not support inline data!\n"); return -EINVAL; } @@ -526,7 +526,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, spin_lock_irqsave(&qp->sq.lock, flags); ret = check_send_valid(hr_dev, qp); - if (ret) { + if (unlikely(ret)) { *bad_wr = wr; nreq = 0; goto out; @@ -562,7 +562,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, else if (ibqp->qp_type == IB_QPT_RC) ret = set_rc_wqe(qp, wr, wqe, &sge_idx, owner_bit); - if (ret) { + if (unlikely(ret)) { *bad_wr = wr; goto out; } @@ -612,15 +612,15 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, spin_lock_irqsave(&hr_qp->rq.lock, flags); ret = check_recv_valid(hr_dev, hr_qp); - if (ret) { + if (unlikely(ret)) { *bad_wr = wr; nreq = 0; goto out; } for (nreq = 0; wr; ++nreq, wr = wr->next) { - if (hns_roce_wq_overflow(&hr_qp->rq, nreq, - hr_qp->ibqp.recv_cq)) { + if (unlikely(hns_roce_wq_overflow(&hr_qp->rq, nreq, + hr_qp->ibqp.recv_cq))) { ret = -ENOMEM; *bad_wr = wr; goto out; @@ -766,7 +766,7 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, } wqe_idx = find_empty_entry(&srq->idx_que, srq->wqe_cnt); - if (wqe_idx < 0) { + if (unlikely(wqe_idx < 0)) { ret = -ENOMEM; *bad_wr = wr; break; @@ -2977,7 +2977,7 @@ static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe, wqe_buf += size; } - if (data_len) { + if (unlikely(data_len)) { wc->status = IB_WC_LOC_LEN_ERR; return -EAGAIN; } @@ -3069,7 +3069,8 @@ static void get_cqe_status(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp, break; } - if (wc->status == IB_WC_SUCCESS || wc->status == IB_WC_WR_FLUSH_ERR) + if (likely(wc->status == IB_WC_SUCCESS || + wc->status == IB_WC_WR_FLUSH_ERR)) return; ibdev_err(&hr_dev->ib_dev, "error cqe status 0x%x:\n", cqe_status); @@ -3164,7 +3165,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, } get_cqe_status(hr_dev, *cur_qp, cqe, wc); - if (wc->status != IB_WC_SUCCESS) + if (unlikely(wc->status != IB_WC_SUCCESS)) return 0; if (is_send) { @@ -3263,7 +3264,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_INV) && (roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_RQ_INLINE_S))) { ret = hns_roce_handle_recv_inl_wqe(cqe, cur_qp, wc); - if (ret) + if (unlikely(ret)) return -EAGAIN; } -- cgit v1.2.3 From b9c93e3aad13048c673999e65acbde0378600317 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Wed, 20 May 2020 21:53:14 +0800 Subject: RDMA/hns: Remove unused code about assert The codes related to assert are no longer used and need to be deleted. Link: https://lore.kernel.org/r/1589982799-28728-5-git-send-email-liweihang@huawei.com Signed-off-by: Yangyang Li Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_common.h | 4 ---- drivers/infiniband/hw/hns/hns_roce_main.c | 1 - 2 files changed, 5 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h index 8e95a1aa1b4f..f5669ff8cfeb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_common.h +++ b/drivers/infiniband/hw/hns/hns_roce_common.h @@ -33,10 +33,6 @@ #ifndef _HNS_ROCE_COMMON_H #define _HNS_ROCE_COMMON_H -#ifndef assert -#define assert(cond) -#endif - #define roce_write(dev, reg, val) writel((val), (dev)->reg_base + (reg)) #define roce_read(dev, reg) readl((dev)->reg_base + (reg)) #define roce_raw_write(value, addr) \ diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index fd3581efe9a8..50763cf4fa3d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -233,7 +233,6 @@ static int hns_roce_query_port(struct ib_device *ib_dev, u8 port_num, enum ib_mtu mtu; u8 port; - assert(port_num > 0); port = port_num - 1; /* props being zeroed by the caller, avoid zeroing it here */ -- cgit v1.2.3 From e9f2cd28250cd9e77db2c0dc8efe0412f6971a76 Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Wed, 20 May 2020 21:53:15 +0800 Subject: RDMA/hns: Rename QP buffer related function Rename the function related to QP buffer to make the code more readable. Link: https://lore.kernel.org/r/1589982799-28728-6-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index fb71755f6179..5d294209fb23 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -502,9 +502,9 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev, return 0; } -static int split_wqe_buf_region(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp, - struct hns_roce_buf_attr *buf_attr) +static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_roce_buf_attr *buf_attr) { int buf_size; int idx = 0; @@ -676,7 +676,7 @@ static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, hr_qp->rq_inl_buf.wqe_list = NULL; } - ret = split_wqe_buf_region(hr_dev, hr_qp, &buf_attr); + ret = set_wqe_buf_attr(hr_dev, hr_qp, &buf_attr); if (ret) { ibdev_err(ibdev, "failed to split WQE buf, ret = %d.\n", ret); goto err_inline; -- cgit v1.2.3 From 82d07a4e466fa2e3cc0ac5479beeb739abaa7438 Mon Sep 17 00:00:00 2001 From: Weihang Li Date: Wed, 20 May 2020 21:53:16 +0800 Subject: RDMA/hns: Change all page_shift to unsigned page_shift is used to calculate the page size, it's always non-negative, and should be in type of unsigned. Link: https://lore.kernel.org/r/1589982799-28728-7-git-send-email-liweihang@huawei.com Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_alloc.c | 2 +- drivers/infiniband/hw/hns/hns_roce_device.h | 25 +++++++++++++------------ drivers/infiniband/hw/hns/hns_roce_hem.c | 2 +- drivers/infiniband/hw/hns/hns_roce_hem.h | 2 +- drivers/infiniband/hw/hns/hns_roce_mr.c | 20 +++++++++++--------- 5 files changed, 27 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index 742aee846676..a522cb2d29ea 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -254,7 +254,7 @@ int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, int buf_cnt, int start, struct ib_umem *umem, - int page_shift) + unsigned int page_shift) { struct ib_block_iter biter; int total = 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index f614959f97fd..7190c8a5422e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -342,7 +342,7 @@ struct hns_roce_buf_attr { int hopnum; /* multi-hop addressing hop num */ } region[HNS_ROCE_MAX_BT_REGION]; int region_count; /* valid region count */ - int page_shift; /* buffer page shift */ + unsigned int page_shift; /* buffer page shift */ bool fixed_page; /* decide page shift is fixed-size or maximum size */ int user_access; /* umem access flag */ bool mtt_only; /* only alloc buffer-required MTT memory */ @@ -351,14 +351,14 @@ struct hns_roce_buf_attr { /* memory translate region */ struct hns_roce_mtr { struct hns_roce_hem_list hem_list; /* multi-hop addressing resource */ - struct ib_umem *umem; /* user space buffer */ - struct hns_roce_buf *kmem; /* kernel space buffer */ + struct ib_umem *umem; /* user space buffer */ + struct hns_roce_buf *kmem; /* kernel space buffer */ struct { - dma_addr_t root_ba; /* root BA table's address */ - bool is_direct; /* addressing without BA table */ - int ba_pg_shift; /* BA table page shift */ - int buf_pg_shift; /* buffer page shift */ - int buf_pg_count; /* buffer page count */ + dma_addr_t root_ba; /* root BA table's address */ + bool is_direct; /* addressing without BA table */ + unsigned int ba_pg_shift; /* BA table page shift */ + unsigned int buf_pg_shift; /* buffer page shift */ + int buf_pg_count; /* buffer page count */ } hem_cfg; /* config for hardware addressing */ }; @@ -423,7 +423,7 @@ struct hns_roce_buf { struct hns_roce_buf_list *page_list; u32 npages; u32 size; - int page_shift; + unsigned int page_shift; }; struct hns_roce_db_pgdir { @@ -1139,8 +1139,9 @@ void hns_roce_cmd_use_polling(struct hns_roce_dev *hr_dev); int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr); int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, - struct hns_roce_buf_attr *buf_attr, int page_shift, - struct ib_udata *udata, unsigned long user_addr); + struct hns_roce_buf_attr *buf_attr, + unsigned int page_shift, struct ib_udata *udata, + unsigned long user_addr); void hns_roce_mtr_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr); int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, @@ -1210,7 +1211,7 @@ int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, int buf_cnt, int start, struct hns_roce_buf *buf); int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs, int buf_cnt, int start, struct ib_umem *umem, - int page_shift); + unsigned int page_shift); int hns_roce_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *srq_init_attr, diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 37d101eec181..c8db6f8ae018 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -1400,7 +1400,7 @@ err_exit: int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, const struct hns_roce_buf_region *regions, - int region_cnt, int bt_pg_shift) + int region_cnt, unsigned int bt_pg_shift) { const struct hns_roce_buf_region *r; int ofs, end; diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h index 1fa0bdcb1989..b34c940077bb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.h +++ b/drivers/infiniband/hw/hns/hns_roce_hem.h @@ -133,7 +133,7 @@ int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions, int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list, const struct hns_roce_buf_region *regions, - int region_cnt, int bt_pg_shift); + int region_cnt, unsigned int bt_pg_shift); void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev, struct hns_roce_hem_list *hem_list); void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev, diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 3075e8450cda..17759de4108d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -706,7 +706,8 @@ static inline size_t mtr_bufs_size(struct hns_roce_buf_attr *attr) return size; } -static inline int mtr_umem_page_count(struct ib_umem *umem, int page_shift) +static inline int mtr_umem_page_count(struct ib_umem *umem, + unsigned int page_shift) { int count = ib_umem_page_count(umem); @@ -719,7 +720,7 @@ static inline int mtr_umem_page_count(struct ib_umem *umem, int page_shift) } static inline size_t mtr_kmem_direct_size(bool is_direct, size_t alloc_size, - int page_shift) + unsigned int page_shift) { if (is_direct) return ALIGN(alloc_size, 1 << page_shift); @@ -732,7 +733,7 @@ static inline size_t mtr_kmem_direct_size(bool is_direct, size_t alloc_size, * Returns 0 on success, or the error page num. */ static inline int mtr_check_direct_pages(dma_addr_t *pages, int page_count, - int page_shift) + unsigned int page_shift) { size_t page_size = 1 << page_shift; int i; @@ -765,8 +766,8 @@ static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, struct ib_udata *udata, unsigned long user_addr) { struct ib_device *ibdev = &hr_dev->ib_dev; - int max_pg_shift = buf_attr->page_shift; - int best_pg_shift = 0; + unsigned int max_pg_shift = buf_attr->page_shift; + unsigned int best_pg_shift = 0; int all_pg_count = 0; size_t direct_size; size_t total_size; @@ -836,7 +837,7 @@ err_alloc_mem: } static int mtr_get_pages(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, - dma_addr_t *pages, int count, int page_shift) + dma_addr_t *pages, int count, unsigned int page_shift) { struct ib_device *ibdev = &hr_dev->ib_dev; int npage; @@ -946,7 +947,7 @@ done: /* convert buffer size to page index and page count */ static int mtr_init_region(struct hns_roce_buf_attr *attr, int page_cnt, struct hns_roce_buf_region *regions, int region_cnt, - int page_shift) + unsigned int page_shift) { unsigned int page_size = 1 << page_shift; int max_region = attr->region_count; @@ -977,8 +978,9 @@ static int mtr_init_region(struct hns_roce_buf_attr *attr, int page_cnt, * @buf_alloced: mtr has private buffer, true means need to alloc */ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, - struct hns_roce_buf_attr *buf_attr, int page_shift, - struct ib_udata *udata, unsigned long user_addr) + struct hns_roce_buf_attr *buf_attr, + unsigned int page_shift, struct ib_udata *udata, + unsigned long user_addr) { struct hns_roce_buf_region regions[HNS_ROCE_MAX_BT_REGION] = {}; struct ib_device *ibdev = &hr_dev->ib_dev; -- cgit v1.2.3 From 13aa13dddd5f06d47c35f9de46343e740f7d8b90 Mon Sep 17 00:00:00 2001 From: Weihang Li Date: Wed, 20 May 2020 21:53:17 +0800 Subject: RDMA/hns: Change variables representing quantity to unsigned Number of sge/eqe is always non-negative, they should be defined in type of unsigned. Link: https://lore.kernel.org/r/1589982799-28728-8-git-send-email-liweihang@huawei.com Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 4 ++-- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 7190c8a5422e..f9413722b28e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -408,7 +408,7 @@ struct hns_roce_wq { }; struct hns_roce_sge { - int sge_cnt; /* SGE num */ + unsigned int sge_cnt; /* SGE num */ int offset; int sge_shift; /* SGE size */ }; @@ -734,7 +734,7 @@ struct hns_roce_eq { int arm_st; int hop_num; struct hns_roce_mtr mtr; - int eq_max_cnt; + u16 eq_max_cnt; int eq_period; int shift; int event_type; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index af0911e522f8..4d29a9ec6a84 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -130,7 +130,7 @@ static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, static void set_atomic_seg(const struct ib_send_wr *wr, void *wqe, struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, - int valid_num_sge) + unsigned int valid_num_sge) { struct hns_roce_wqe_atomic_seg *aseg; @@ -151,12 +151,12 @@ static void set_atomic_seg(const struct ib_send_wr *wr, void *wqe, } static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr, - unsigned int *sge_ind, int valid_num_sge) + unsigned int *sge_ind, unsigned int valid_num_sge) { struct hns_roce_v2_wqe_data_seg *dseg; + unsigned int cnt = valid_num_sge; struct ib_sge *sge = wr->sg_list; unsigned int idx = *sge_ind; - int cnt = valid_num_sge; if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { cnt -= HNS_ROCE_SGE_IN_WQE; @@ -177,7 +177,7 @@ static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr, static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, void *wqe, unsigned int *sge_ind, - int valid_num_sge) + unsigned int valid_num_sge) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_v2_wqe_data_seg *dseg = wqe; @@ -269,10 +269,11 @@ static int check_send_valid(struct hns_roce_dev *hr_dev, return 0; } -static inline int calc_wr_sge_num(const struct ib_send_wr *wr, u32 *sge_len) +static unsigned int calc_wr_sge_num(const struct ib_send_wr *wr, + unsigned int *sge_len) { - int valid_num = 0; - u32 len = 0; + unsigned int valid_num = 0; + unsigned int len = 0; int i; for (i = 0; i < wr->num_sge; i++) { @@ -403,7 +404,7 @@ static inline int set_rc_wqe(struct hns_roce_qp *qp, { struct hns_roce_v2_rc_send_wqe *rc_sq_wqe = wqe; unsigned int curr_idx = *sge_idx; - int valid_num_sge; + unsigned int valid_num_sge; u32 msg_len = 0; int ret = 0; -- cgit v1.2.3 From 494c3b312255a39d4450f37ec2e675f142a76c8c Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Wed, 20 May 2020 21:53:18 +0800 Subject: RDMA/hns: Refactor the QP context filling process related to WQE buffer configure Split the code related to WQE buffer configure from the QPC filling process into two functions: config_qp_sq_buf() and config_qp_rq_buf(), this will make the code more readable. Link: https://lore.kernel.org/r/1589982799-28728-9-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 264 ++++++++++++++++------------- 1 file changed, 149 insertions(+), 115 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 4d29a9ec6a84..b1674f742361 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3775,27 +3775,16 @@ static bool check_wqe_rq_mtt_count(struct hns_roce_dev *hr_dev, return true; } -static int modify_qp_init_to_rtr(struct ib_qp *ibqp, - const struct ib_qp_attr *attr, int attr_mask, - struct hns_roce_v2_qp_context *context, - struct hns_roce_v2_qp_context *qpc_mask) +static int config_qp_rq_buf(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask) { - const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); - struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); - struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); - struct ib_device *ibdev = &hr_dev->ib_dev; + struct ib_qp *ibqp = &hr_qp->ibqp; u64 mtts[MTT_MIN_COUNT] = { 0 }; - dma_addr_t dma_handle_3; - dma_addr_t dma_handle_2; u64 wqe_sge_ba; u32 page_size; - u8 port_num; - u64 *mtts_3; - u64 *mtts_2; int count; - u8 *dmac; - u8 *smac; - int port; /* Search qp buf's mtts */ page_size = 1 << hr_qp->mtr.hem_cfg.buf_pg_shift; @@ -3806,29 +3795,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, if (!check_wqe_rq_mtt_count(hr_dev, hr_qp, count, page_size)) return -EINVAL; - /* Search IRRL's mtts */ - mtts_2 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.irrl_table, - hr_qp->qpn, &dma_handle_2); - if (!mtts_2) { - ibdev_err(ibdev, "failed to find QP irrl_table\n"); - return -EINVAL; - } - - /* Search TRRL's mtts */ - mtts_3 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.trrl_table, - hr_qp->qpn, &dma_handle_3); - if (!mtts_3) { - ibdev_err(ibdev, "failed to find QP trrl_table\n"); - return -EINVAL; - } - - if (attr_mask & IB_QP_ALT_PATH) { - ibdev_err(ibdev, "INIT2RTR attr_mask (0x%x) error\n", - attr_mask); - return -EINVAL; - } - - dmac = (u8 *)attr->ah_attr.roce.dmac; context->wqe_sge_ba = cpu_to_le32(wqe_sge_ba >> 3); qpc_mask->wqe_sge_ba = 0; @@ -3907,23 +3873,154 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M, V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S, 0); + roce_set_field(context->byte_84_rq_ci_pi, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, hr_qp->rq.head); + roce_set_field(qpc_mask->byte_84_rq_ci_pi, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0); + + roce_set_field(qpc_mask->byte_84_rq_ci_pi, + V2_QPC_BYTE_84_RQ_CONSUMER_IDX_M, + V2_QPC_BYTE_84_RQ_CONSUMER_IDX_S, 0); + + return 0; +} + +static int config_qp_sq_buf(struct hns_roce_dev *hr_dev, + struct hns_roce_qp *hr_qp, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask) +{ + struct ib_device *ibdev = &hr_dev->ib_dev; + u64 sge_cur_blk = 0; + u64 sq_cur_blk = 0; + u32 page_size; + int count; + + /* search qp buf's mtts */ + count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, &sq_cur_blk, 1, NULL); + if (count < 1) { + ibdev_err(ibdev, "failed to find QP(0x%lx) SQ buf.\n", + hr_qp->qpn); + return -EINVAL; + } + if (hr_qp->sge.sge_cnt > 0) { + page_size = 1 << hr_qp->mtr.hem_cfg.buf_pg_shift; + count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, + hr_qp->sge.offset / page_size, + &sge_cur_blk, 1, NULL); + if (count < 1) { + ibdev_err(ibdev, "failed to find QP(0x%lx) SGE buf.\n", + hr_qp->qpn); + return -EINVAL; + } + } + + /* + * In v2 engine, software pass context and context mask to hardware + * when modifying qp. If software need modify some fields in context, + * we should set all bits of the relevant fields in context mask to + * 0 at the same time, else set them to 0x1. + */ + context->sq_cur_blk_addr = cpu_to_le32(to_hr_hw_page_addr(sq_cur_blk)); + roce_set_field(context->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, + V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, + upper_32_bits(to_hr_hw_page_addr(sq_cur_blk))); + qpc_mask->sq_cur_blk_addr = 0; + roce_set_field(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, + V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0); + + context->sq_cur_sge_blk_addr = + cpu_to_le32(to_hr_hw_page_addr(sge_cur_blk)); + roce_set_field(context->byte_184_irrl_idx, + V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, + V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, + upper_32_bits(to_hr_hw_page_addr(sge_cur_blk))); + qpc_mask->sq_cur_sge_blk_addr = 0; + roce_set_field(qpc_mask->byte_184_irrl_idx, + V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, + V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, 0); + + context->rx_sq_cur_blk_addr = + cpu_to_le32(to_hr_hw_page_addr(sq_cur_blk)); + roce_set_field(context->byte_232_irrl_sge, + V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M, + V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S, + upper_32_bits(to_hr_hw_page_addr(sq_cur_blk))); + qpc_mask->rx_sq_cur_blk_addr = 0; + roce_set_field(qpc_mask->byte_232_irrl_sge, + V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M, + V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S, 0); + + return 0; +} + +static int modify_qp_init_to_rtr(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, int attr_mask, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); + struct ib_device *ibdev = &hr_dev->ib_dev; + dma_addr_t trrl_ba; + dma_addr_t irrl_ba; + u8 port_num; + u64 *mtts; + u8 *dmac; + u8 *smac; + int port; + int ret; + + ret = config_qp_rq_buf(hr_dev, hr_qp, context, qpc_mask); + if (ret) { + ibdev_err(ibdev, "failed to config rq buf, ret = %d.\n", ret); + return ret; + } + + /* Search IRRL's mtts */ + mtts = hns_roce_table_find(hr_dev, &hr_dev->qp_table.irrl_table, + hr_qp->qpn, &irrl_ba); + if (!mtts) { + ibdev_err(ibdev, "failed to find qp irrl_table.\n"); + return -EINVAL; + } + + /* Search TRRL's mtts */ + mtts = hns_roce_table_find(hr_dev, &hr_dev->qp_table.trrl_table, + hr_qp->qpn, &trrl_ba); + if (!mtts) { + ibdev_err(ibdev, "failed to find qp trrl_table.\n"); + return -EINVAL; + } + + if (attr_mask & IB_QP_ALT_PATH) { + ibdev_err(ibdev, "INIT2RTR attr_mask (0x%x) error.\n", + attr_mask); + return -EINVAL; + } + roce_set_field(context->byte_132_trrl, V2_QPC_BYTE_132_TRRL_BA_M, - V2_QPC_BYTE_132_TRRL_BA_S, dma_handle_3 >> 4); + V2_QPC_BYTE_132_TRRL_BA_S, trrl_ba >> 4); roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_BA_M, V2_QPC_BYTE_132_TRRL_BA_S, 0); - context->trrl_ba = cpu_to_le32(dma_handle_3 >> (16 + 4)); + context->trrl_ba = cpu_to_le32(trrl_ba >> (16 + 4)); qpc_mask->trrl_ba = 0; roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_TRRL_BA_M, V2_QPC_BYTE_140_TRRL_BA_S, - (u32)(dma_handle_3 >> (32 + 16 + 4))); + (u32)(trrl_ba >> (32 + 16 + 4))); roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_TRRL_BA_M, V2_QPC_BYTE_140_TRRL_BA_S, 0); - context->irrl_ba = cpu_to_le32(dma_handle_2 >> 6); + context->irrl_ba = cpu_to_le32(irrl_ba >> 6); qpc_mask->irrl_ba = 0; roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_IRRL_BA_M, V2_QPC_BYTE_208_IRRL_BA_S, - dma_handle_2 >> (32 + 6)); + irrl_ba >> (32 + 6)); roce_set_field(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_IRRL_BA_M, V2_QPC_BYTE_208_IRRL_BA_S, 0); @@ -3960,6 +4057,8 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, grh->sgid_index)); roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SGID_IDX_M, V2_QPC_BYTE_20_SGID_IDX_S, 0); + + dmac = (u8 *)attr->ah_attr.roce.dmac; memcpy(&(context->dmac), dmac, sizeof(u32)); roce_set_field(context->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M, V2_QPC_BYTE_52_DMAC_S, *((u16 *)(&dmac[4]))); @@ -3984,16 +4083,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, V2_QPC_BYTE_24_MTU_S, 0); - roce_set_field(context->byte_84_rq_ci_pi, - V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M, - V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, hr_qp->rq.head); - roce_set_field(qpc_mask->byte_84_rq_ci_pi, - V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M, - V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0); - - roce_set_field(qpc_mask->byte_84_rq_ci_pi, - V2_QPC_BYTE_84_RQ_CONSUMER_IDX_M, - V2_QPC_BYTE_84_RQ_CONSUMER_IDX_S, 0); roce_set_bit(qpc_mask->byte_108_rx_reqepsn, V2_QPC_BYTE_108_RX_REQ_PSN_ERR_S, 0); roce_set_field(qpc_mask->byte_96_rx_reqmsn, V2_QPC_BYTE_96_RX_REQ_MSN_M, @@ -4029,30 +4118,7 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct ib_device *ibdev = &hr_dev->ib_dev; - u64 sge_cur_blk = 0; - u64 sq_cur_blk = 0; - u32 page_size; - int count; - - /* Search qp buf's mtts */ - count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, &sq_cur_blk, 1, NULL); - if (count < 1) { - ibdev_err(ibdev, "failed to find QP(0x%lx) SQ buf\n", - hr_qp->qpn); - return -EINVAL; - } - - if (hr_qp->sge.sge_cnt > 0) { - page_size = 1 << hr_qp->mtr.hem_cfg.buf_pg_shift; - count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, - hr_qp->sge.offset / page_size, - &sge_cur_blk, 1, NULL); - if (count < 1) { - ibdev_err(ibdev, "failed to find QP(0x%lx) SGE buf\n", - hr_qp->qpn); - return -EINVAL; - } - } + int ret; /* Not support alternate path and path migration */ if (attr_mask & (IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE)) { @@ -4060,43 +4126,11 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, return -EINVAL; } - /* - * In v2 engine, software pass context and context mask to hardware - * when modifying qp. If software need modify some fields in context, - * we should set all bits of the relevant fields in context mask to - * 0 at the same time, else set them to 0x1. - */ - context->sq_cur_blk_addr = cpu_to_le32(to_hr_hw_page_addr(sq_cur_blk)); - roce_set_field(context->byte_168_irrl_idx, - V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, - V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, - upper_32_bits(to_hr_hw_page_addr(sq_cur_blk))); - qpc_mask->sq_cur_blk_addr = 0; - roce_set_field(qpc_mask->byte_168_irrl_idx, - V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, - V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0); - - context->sq_cur_sge_blk_addr = - cpu_to_le32(to_hr_hw_page_addr(sge_cur_blk)); - roce_set_field(context->byte_184_irrl_idx, - V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, - V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, - upper_32_bits(to_hr_hw_page_addr(sge_cur_blk))); - qpc_mask->sq_cur_sge_blk_addr = 0; - roce_set_field(qpc_mask->byte_184_irrl_idx, - V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, - V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, 0); - - context->rx_sq_cur_blk_addr = - cpu_to_le32(to_hr_hw_page_addr(sq_cur_blk)); - roce_set_field(context->byte_232_irrl_sge, - V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M, - V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S, - upper_32_bits(to_hr_hw_page_addr(sq_cur_blk))); - qpc_mask->rx_sq_cur_blk_addr = 0; - roce_set_field(qpc_mask->byte_232_irrl_sge, - V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M, - V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S, 0); + ret = config_qp_sq_buf(hr_dev, hr_qp, context, qpc_mask); + if (ret) { + ibdev_err(ibdev, "failed to config sq buf, ret %d\n", ret); + return ret; + } /* * Set some fields in context to zero, Because the default values -- cgit v1.2.3 From 8e029d386bcef651bb89091c8a48375c2506aefe Mon Sep 17 00:00:00 2001 From: Xi Wang Date: Wed, 20 May 2020 21:53:19 +0800 Subject: RDMA/hns: Optimize the usage of MTR Currently, the MTR region is configed before hns_roce_mtr_map() is invoked, but in some scenarios, the region is configed at MTR creation, the caller need to store this config and call hns_roce_mtr_map() later. So optimize the usage by wrapping the MTR region config into MTR. Link: https://lore.kernel.org/r/1589982799-28728-10-git-send-email-liweihang@huawei.com Signed-off-by: Xi Wang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 3 +- drivers/infiniband/hw/hns/hns_roce_mr.c | 54 +++++++++++++++-------------- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index f9413722b28e..a77fa6730b2d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -359,6 +359,8 @@ struct hns_roce_mtr { unsigned int ba_pg_shift; /* BA table page shift */ unsigned int buf_pg_shift; /* buffer page shift */ int buf_pg_count; /* buffer page count */ + struct hns_roce_buf_region region[HNS_ROCE_MAX_BT_REGION]; + unsigned int region_count; } hem_cfg; /* config for hardware addressing */ }; @@ -1145,7 +1147,6 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, void hns_roce_mtr_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr); int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, - struct hns_roce_buf_region *regions, int region_cnt, dma_addr_t *pages, int page_cnt); int hns_roce_init_pd_table(struct hns_roce_dev *hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 17759de4108d..4c0bbb12770d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -483,7 +483,7 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device); struct ib_device *ibdev = &hr_dev->ib_dev; struct hns_roce_mr *mr = to_hr_mr(ibmr); - struct hns_roce_buf_region region = {}; + struct hns_roce_mtr *mtr = &mr->pbl_mtr; int ret = 0; mr->npages = 0; @@ -499,11 +499,11 @@ int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, goto err_page_list; } - region.offset = 0; - region.count = mr->npages; - region.hopnum = mr->pbl_hop_num; - ret = hns_roce_mtr_map(hr_dev, &mr->pbl_mtr, ®ion, 1, mr->page_list, - mr->npages); + mtr->hem_cfg.region[0].offset = 0; + mtr->hem_cfg.region[0].count = mr->npages; + mtr->hem_cfg.region[0].hopnum = mr->pbl_hop_num; + mtr->hem_cfg.region_count = 1; + ret = hns_roce_mtr_map(hr_dev, mtr, mr->page_list, mr->npages); if (ret) { ibdev_err(ibdev, "failed to map sg mtr, ret = %d.\n", ret); ret = 0; @@ -863,7 +863,6 @@ static int mtr_get_pages(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, } int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, - struct hns_roce_buf_region *regions, int region_cnt, dma_addr_t *pages, int page_cnt) { struct ib_device *ibdev = &hr_dev->ib_dev; @@ -871,8 +870,8 @@ int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, int err; int i; - for (i = 0; i < region_cnt; i++) { - r = ®ions[i]; + for (i = 0; i < mtr->hem_cfg.region_count; i++) { + r = &mtr->hem_cfg.region[i]; if (r->offset + r->count > page_cnt) { err = -EINVAL; ibdev_err(ibdev, @@ -945,15 +944,16 @@ done: } /* convert buffer size to page index and page count */ -static int mtr_init_region(struct hns_roce_buf_attr *attr, int page_cnt, - struct hns_roce_buf_region *regions, int region_cnt, - unsigned int page_shift) +static unsigned int mtr_init_region(struct hns_roce_buf_attr *attr, + int page_cnt, + struct hns_roce_buf_region *regions, + int region_cnt, unsigned int page_shift) { unsigned int page_size = 1 << page_shift; int max_region = attr->region_count; struct hns_roce_buf_region *r; + unsigned int i = 0; int page_idx = 0; - int i = 0; for (; i < region_cnt && i < max_region && page_idx < page_cnt; i++) { r = ®ions[i]; @@ -982,7 +982,6 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, unsigned int page_shift, struct ib_udata *udata, unsigned long user_addr) { - struct hns_roce_buf_region regions[HNS_ROCE_MAX_BT_REGION] = {}; struct ib_device *ibdev = &hr_dev->ib_dev; dma_addr_t *pages = NULL; int region_cnt = 0; @@ -1014,18 +1013,22 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, hns_roce_hem_list_init(&mtr->hem_list); mtr->hem_cfg.is_direct = !has_mtt; mtr->hem_cfg.ba_pg_shift = page_shift; + mtr->hem_cfg.region_count = 0; + region_cnt = mtr_init_region(buf_attr, all_pg_cnt, + mtr->hem_cfg.region, + ARRAY_SIZE(mtr->hem_cfg.region), + mtr->hem_cfg.buf_pg_shift); + if (region_cnt < 1) { + err = -ENOBUFS; + ibdev_err(ibdev, "failed to init mtr region %d\n", region_cnt); + goto err_alloc_bufs; + } + + mtr->hem_cfg.region_count = region_cnt; + if (has_mtt) { - region_cnt = mtr_init_region(buf_attr, all_pg_cnt, - regions, ARRAY_SIZE(regions), - mtr->hem_cfg.buf_pg_shift); - if (region_cnt < 1) { - err = -ENOBUFS; - ibdev_err(ibdev, "Failed to init mtr region %d\n", - region_cnt); - goto err_alloc_bufs; - } err = hns_roce_hem_list_request(hr_dev, &mtr->hem_list, - regions, region_cnt, + mtr->hem_cfg.region, region_cnt, page_shift); if (err) { ibdev_err(ibdev, "Failed to request mtr hem, err %d\n", @@ -1061,8 +1064,7 @@ int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr, mtr->hem_cfg.root_ba = pages[0]; } else { /* write buffer's dma address to BA table */ - err = hns_roce_mtr_map(hr_dev, mtr, regions, region_cnt, pages, - all_pg_cnt); + err = hns_roce_mtr_map(hr_dev, mtr, pages, all_pg_cnt); if (err) { ibdev_err(ibdev, "Failed to map mtr pages, err %d\n", err); -- cgit v1.2.3 From 14ba87304bf98a0a0c069708bb14e92a616420d1 Mon Sep 17 00:00:00 2001 From: Weihang Li Date: Fri, 22 May 2020 21:02:56 +0800 Subject: RDMA/hns: Remove redundant type cast for general pointers There is no need to do a type cast on genernal pointers, they could be assigned to any type of variables. In addition, optimize initialization of some variables and adjust order of them. Link: https://lore.kernel.org/r/1590152579-32364-2-git-send-email-liweihang@huawei.com Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 180 ++++++++++------------------- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 28 ++--- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 +- 3 files changed, 73 insertions(+), 137 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index b4b98e818328..8ff6b922b4d7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -503,16 +503,13 @@ static void hns_roce_set_odb(struct hns_roce_dev *hr_dev, u32 odb_alept, static void hns_roce_set_sdb_ext(struct hns_roce_dev *hr_dev, u32 ext_sdb_alept, u32 ext_sdb_alful) { + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_db_table *db = &priv->db_table; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_v1_priv *priv; - struct hns_roce_db_table *db; dma_addr_t sdb_dma_addr; __le32 tmp; u32 val; - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - db = &priv->db_table; - /* Configure extend SDB threshold */ roce_write(hr_dev, ROCEE_EXT_DB_SQ_WL_EMPTY_REG, ext_sdb_alept); roce_write(hr_dev, ROCEE_EXT_DB_SQ_WL_REG, ext_sdb_alful); @@ -545,16 +542,13 @@ static void hns_roce_set_sdb_ext(struct hns_roce_dev *hr_dev, u32 ext_sdb_alept, static void hns_roce_set_odb_ext(struct hns_roce_dev *hr_dev, u32 ext_odb_alept, u32 ext_odb_alful) { + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_db_table *db = &priv->db_table; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_v1_priv *priv; - struct hns_roce_db_table *db; dma_addr_t odb_dma_addr; __le32 tmp; u32 val; - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - db = &priv->db_table; - /* Configure extend ODB threshold */ roce_write(hr_dev, ROCEE_EXT_DB_OTHERS_WL_EMPTY_REG, ext_odb_alept); roce_write(hr_dev, ROCEE_EXT_DB_OTHERS_WL_REG, ext_odb_alful); @@ -583,16 +577,13 @@ static void hns_roce_set_odb_ext(struct hns_roce_dev *hr_dev, u32 ext_odb_alept, static int hns_roce_db_ext_init(struct hns_roce_dev *hr_dev, u32 sdb_ext_mod, u32 odb_ext_mod) { + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_db_table *db = &priv->db_table; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_v1_priv *priv; - struct hns_roce_db_table *db; dma_addr_t sdb_dma_addr; dma_addr_t odb_dma_addr; int ret = 0; - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - db = &priv->db_table; - db->ext_db = kmalloc(sizeof(*db->ext_db), GFP_KERNEL); if (!db->ext_db) return -ENOMEM; @@ -692,14 +683,14 @@ static struct hns_roce_qp *hns_roce_v1_create_lp_qp(struct hns_roce_dev *hr_dev, static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) { + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_free_mr *free_mr = &priv->free_mr; struct hns_roce_caps *caps = &hr_dev->caps; + struct ib_device *ibdev = &hr_dev->ib_dev; struct device *dev = &hr_dev->pdev->dev; struct ib_cq_init_attr cq_init_attr; - struct hns_roce_free_mr *free_mr; struct ib_qp_attr attr = { 0 }; - struct hns_roce_v1_priv *priv; struct hns_roce_qp *hr_qp; - struct ib_device *ibdev; struct ib_cq *cq; struct ib_pd *pd; union ib_gid dgid; @@ -712,14 +703,10 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) u8 port = 0; u8 sl; - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - free_mr = &priv->free_mr; - /* Reserved cq for loop qp */ cq_init_attr.cqe = HNS_ROCE_MIN_WQE_NUM * 2; cq_init_attr.comp_vector = 0; - ibdev = &hr_dev->ib_dev; cq = rdma_zalloc_drv_obj(ibdev, ib_cq); if (!cq) return -ENOMEM; @@ -868,16 +855,13 @@ alloc_cq_failed: static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev) { + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_free_mr *free_mr = &priv->free_mr; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_free_mr *free_mr; - struct hns_roce_v1_priv *priv; struct hns_roce_qp *hr_qp; int ret; int i; - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - free_mr = &priv->free_mr; - for (i = 0; i < HNS_ROCE_V1_RESV_QP; i++) { hr_qp = free_mr->mr_free_qp[i]; if (!hr_qp) @@ -897,18 +881,15 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev) static int hns_roce_db_init(struct hns_roce_dev *hr_dev) { + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_db_table *db = &priv->db_table; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_v1_priv *priv; - struct hns_roce_db_table *db; u32 sdb_ext_mod; u32 odb_ext_mod; u32 sdb_evt_mod; u32 odb_evt_mod; int ret = 0; - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - db = &priv->db_table; - memset(db, 0, sizeof(*db)); /* Default DB mode */ @@ -954,15 +935,12 @@ static void hns_roce_v1_recreate_lp_qp_work_fn(struct work_struct *work) static int hns_roce_v1_recreate_lp_qp(struct hns_roce_dev *hr_dev) { - struct device *dev = &hr_dev->pdev->dev; + long end = HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS; + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_free_mr *free_mr = &priv->free_mr; struct hns_roce_recreate_lp_qp_work *lp_qp_work; - struct hns_roce_free_mr *free_mr; - struct hns_roce_v1_priv *priv; + struct device *dev = &hr_dev->pdev->dev; struct completion comp; - long end = HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS; - - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - free_mr = &priv->free_mr; lp_qp_work = kzalloc(sizeof(struct hns_roce_recreate_lp_qp_work), GFP_KERNEL); @@ -1021,29 +999,21 @@ static int hns_roce_v1_send_lp_wqe(struct hns_roce_qp *hr_qp) static void hns_roce_v1_mr_free_work_fn(struct work_struct *work) { - struct hns_roce_mr_free_work *mr_work; - struct ib_wc wc[HNS_ROCE_V1_RESV_QP]; - struct hns_roce_free_mr *free_mr; - struct hns_roce_cq *mr_free_cq; - struct hns_roce_v1_priv *priv; - struct hns_roce_dev *hr_dev; - struct hns_roce_mr *hr_mr; - struct hns_roce_qp *hr_qp; - struct device *dev; unsigned long end = msecs_to_jiffies(HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS) + jiffies; - int i; - int ret; + struct hns_roce_mr_free_work *mr_work = + container_of(work, struct hns_roce_mr_free_work, work); + struct hns_roce_dev *hr_dev = to_hr_dev(mr_work->ib_dev); + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_free_mr *free_mr = &priv->free_mr; + struct hns_roce_cq *mr_free_cq = free_mr->mr_free_cq; + struct hns_roce_mr *hr_mr = mr_work->mr; + struct device *dev = &hr_dev->pdev->dev; + struct ib_wc wc[HNS_ROCE_V1_RESV_QP]; + struct hns_roce_qp *hr_qp; int ne = 0; - - mr_work = container_of(work, struct hns_roce_mr_free_work, work); - hr_mr = (struct hns_roce_mr *)mr_work->mr; - hr_dev = to_hr_dev(mr_work->ib_dev); - dev = &hr_dev->pdev->dev; - - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - free_mr = &priv->free_mr; - mr_free_cq = free_mr->mr_free_cq; + int ret; + int i; for (i = 0; i < HNS_ROCE_V1_RESV_QP; i++) { hr_qp = free_mr->mr_free_qp[i]; @@ -1092,18 +1062,15 @@ free_work: static int hns_roce_v1_dereg_mr(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr, struct ib_udata *udata) { + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_free_mr *free_mr = &priv->free_mr; + long end = HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS; struct device *dev = &hr_dev->pdev->dev; struct hns_roce_mr_free_work *mr_work; - struct hns_roce_free_mr *free_mr; - struct hns_roce_v1_priv *priv; - struct completion comp; - long end = HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS; unsigned long start = jiffies; + struct completion comp; int ret = 0; - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - free_mr = &priv->free_mr; - if (mr->enabled) { if (hns_roce_hw_destroy_mpt(hr_dev, NULL, key_to_hw_index(mr->key) & @@ -1155,12 +1122,9 @@ free_mr: static void hns_roce_db_free(struct hns_roce_dev *hr_dev) { + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_db_table *db = &priv->db_table; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_v1_priv *priv; - struct hns_roce_db_table *db; - - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - db = &priv->db_table; if (db->sdb_ext_mod) { dma_free_coherent(dev, HNS_ROCE_V1_EXT_SDB_SIZE, @@ -1181,17 +1145,14 @@ static void hns_roce_db_free(struct hns_roce_dev *hr_dev) static int hns_roce_raq_init(struct hns_roce_dev *hr_dev) { - int ret; - u32 val; - __le32 tmp; + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_raq_table *raq = raq = &priv->raq_table; + struct device *dev = &hr_dev->pdev->dev; int raq_shift = 0; dma_addr_t addr; - struct hns_roce_v1_priv *priv; - struct hns_roce_raq_table *raq; - struct device *dev = &hr_dev->pdev->dev; - - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - raq = &priv->raq_table; + __le32 tmp; + u32 val; + int ret; raq->e_raq_buf = kzalloc(sizeof(*(raq->e_raq_buf)), GFP_KERNEL); if (!raq->e_raq_buf) @@ -1271,12 +1232,9 @@ err_dma_alloc_raq: static void hns_roce_raq_free(struct hns_roce_dev *hr_dev) { + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_raq_table *raq = &priv->raq_table; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_v1_priv *priv; - struct hns_roce_raq_table *raq; - - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - raq = &priv->raq_table; dma_free_coherent(dev, HNS_ROCE_V1_RAQ_SIZE, raq->e_raq_buf->buf, raq->e_raq_buf->map); @@ -1310,12 +1268,10 @@ static void hns_roce_port_enable(struct hns_roce_dev *hr_dev, int enable_flag) static int hns_roce_bt_init(struct hns_roce_dev *hr_dev) { + struct hns_roce_v1_priv *priv = hr_dev->priv; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_v1_priv *priv; int ret; - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - priv->bt_table.qpc_buf.buf = dma_alloc_coherent(dev, HNS_ROCE_BT_RSV_BUF_SIZE, &priv->bt_table.qpc_buf.map, GFP_KERNEL); @@ -1353,10 +1309,8 @@ err_failed_alloc_mtpt_buf: static void hns_roce_bt_free(struct hns_roce_dev *hr_dev) { + struct hns_roce_v1_priv *priv = hr_dev->priv; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_v1_priv *priv; - - priv = (struct hns_roce_v1_priv *)hr_dev->priv; dma_free_coherent(dev, HNS_ROCE_BT_RSV_BUF_SIZE, priv->bt_table.cqc_buf.buf, priv->bt_table.cqc_buf.map); @@ -1370,12 +1324,9 @@ static void hns_roce_bt_free(struct hns_roce_dev *hr_dev) static int hns_roce_tptr_init(struct hns_roce_dev *hr_dev) { + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_buf_list *tptr_buf = &priv->tptr_table.tptr_buf; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_buf_list *tptr_buf; - struct hns_roce_v1_priv *priv; - - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - tptr_buf = &priv->tptr_table.tptr_buf; /* * This buffer will be used for CQ's tptr(tail pointer), also @@ -1396,12 +1347,9 @@ static int hns_roce_tptr_init(struct hns_roce_dev *hr_dev) static void hns_roce_tptr_free(struct hns_roce_dev *hr_dev) { + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_buf_list *tptr_buf = &priv->tptr_table.tptr_buf; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_buf_list *tptr_buf; - struct hns_roce_v1_priv *priv; - - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - tptr_buf = &priv->tptr_table.tptr_buf; dma_free_coherent(dev, HNS_ROCE_V1_TPTR_BUF_SIZE, tptr_buf->buf, tptr_buf->map); @@ -1409,14 +1357,11 @@ static void hns_roce_tptr_free(struct hns_roce_dev *hr_dev) static int hns_roce_free_mr_init(struct hns_roce_dev *hr_dev) { + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_free_mr *free_mr = &priv->free_mr; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_free_mr *free_mr; - struct hns_roce_v1_priv *priv; int ret = 0; - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - free_mr = &priv->free_mr; - free_mr->free_mr_wq = create_singlethread_workqueue("hns_roce_free_mr"); if (!free_mr->free_mr_wq) { dev_err(dev, "Create free mr workqueue failed!\n"); @@ -1435,11 +1380,8 @@ static int hns_roce_free_mr_init(struct hns_roce_dev *hr_dev) static void hns_roce_free_mr_free(struct hns_roce_dev *hr_dev) { - struct hns_roce_free_mr *free_mr; - struct hns_roce_v1_priv *priv; - - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - free_mr = &priv->free_mr; + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_free_mr *free_mr = &priv->free_mr; flush_workqueue(free_mr->free_mr_wq); destroy_workqueue(free_mr->free_mr_wq); @@ -2050,16 +1992,12 @@ static void hns_roce_v1_write_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts, dma_addr_t dma_handle) { - struct hns_roce_cq_context *cq_context = NULL; - struct hns_roce_buf_list *tptr_buf; - struct hns_roce_v1_priv *priv; + struct hns_roce_v1_priv *priv = hr_dev->priv; + struct hns_roce_buf_list *tptr_buf = &priv->tptr_table.tptr_buf; + struct hns_roce_cq_context *cq_context = mb_buf; dma_addr_t tptr_dma_addr; int offset; - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - tptr_buf = &priv->tptr_table.tptr_buf; - - cq_context = mb_buf; memset(cq_context, 0, sizeof(*cq_context)); /* Get the tptr for this CQ. */ @@ -2400,16 +2338,14 @@ static int hns_roce_v1_clear_hem(struct hns_roce_dev *hr_dev, struct hns_roce_hem_table *table, int obj, int step_idx) { + struct hns_roce_v1_priv *priv = hr_dev->priv; struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_v1_priv *priv; - unsigned long flags = 0; long end = HW_SYNC_TIMEOUT_MSECS; __le32 bt_cmd_val[2] = {0}; + unsigned long flags = 0; void __iomem *bt_cmd; u64 bt_ba = 0; - priv = (struct hns_roce_v1_priv *)hr_dev->priv; - switch (table->type) { case HEM_TYPE_QPC: bt_ba = priv->bt_table.qpc_buf.map >> 12; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index b1674f742361..97d087ae8988 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -845,7 +845,7 @@ static int hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev, unsigned long instance_stage, unsigned long reset_stage) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hnae3_handle *handle = priv->handle; const struct hnae3_ae_ops *ops = handle->ae_algo->ops; @@ -871,7 +871,7 @@ static int hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev, static int hns_roce_v2_cmd_sw_resetting(struct hns_roce_dev *hr_dev) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hnae3_handle *handle = priv->handle; const struct hnae3_ae_ops *ops = handle->ae_algo->ops; @@ -888,7 +888,7 @@ static int hns_roce_v2_cmd_sw_resetting(struct hns_roce_dev *hr_dev) static int hns_roce_v2_rst_process_cmd(struct hns_roce_dev *hr_dev) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hnae3_handle *handle = priv->handle; const struct hnae3_ae_ops *ops = handle->ae_algo->ops; unsigned long instance_stage; /* the current instance stage */ @@ -968,7 +968,7 @@ static void hns_roce_free_cmq_desc(struct hns_roce_dev *hr_dev, static int hns_roce_init_cmq_ring(struct hns_roce_dev *hr_dev, bool ring_type) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_cmq_ring *ring = (ring_type == TYPE_CSQ) ? &priv->cmq.csq : &priv->cmq.crq; @@ -981,7 +981,7 @@ static int hns_roce_init_cmq_ring(struct hns_roce_dev *hr_dev, bool ring_type) static void hns_roce_cmq_init_regs(struct hns_roce_dev *hr_dev, bool ring_type) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_cmq_ring *ring = (ring_type == TYPE_CSQ) ? &priv->cmq.csq : &priv->cmq.crq; dma_addr_t dma = ring->desc_dma_addr; @@ -1007,7 +1007,7 @@ static void hns_roce_cmq_init_regs(struct hns_roce_dev *hr_dev, bool ring_type) static int hns_roce_v2_cmq_init(struct hns_roce_dev *hr_dev) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_priv *priv = hr_dev->priv; int ret; /* Setup the queue entries for command queue */ @@ -1051,7 +1051,7 @@ err_crq: static void hns_roce_v2_cmq_exit(struct hns_roce_dev *hr_dev) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_priv *priv = hr_dev->priv; hns_roce_free_cmq_desc(hr_dev, &priv->cmq.csq); hns_roce_free_cmq_desc(hr_dev, &priv->cmq.crq); @@ -1073,15 +1073,15 @@ static void hns_roce_cmq_setup_basic_desc(struct hns_roce_cmq_desc *desc, static int hns_roce_cmq_csq_done(struct hns_roce_dev *hr_dev) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; u32 head = roce_read(hr_dev, ROCEE_TX_CMQ_HEAD_REG); + struct hns_roce_v2_priv *priv = hr_dev->priv; return head == priv->cmq.csq.next_to_use; } static int hns_roce_cmq_csq_clean(struct hns_roce_dev *hr_dev) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; struct hns_roce_cmq_desc *desc; u16 ntc = csq->next_to_clean; @@ -1106,7 +1106,7 @@ static int hns_roce_cmq_csq_clean(struct hns_roce_dev *hr_dev) static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev, struct hns_roce_cmq_desc *desc, int num) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq; struct hns_roce_cmq_desc *desc_to_use; bool complete = false; @@ -1234,7 +1234,7 @@ static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev) static bool hns_roce_func_clr_chk_rst(struct hns_roce_dev *hr_dev) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hnae3_handle *handle = priv->handle; const struct hnae3_ae_ops *ops = handle->ae_algo->ops; unsigned long reset_cnt; @@ -1254,7 +1254,7 @@ static bool hns_roce_func_clr_chk_rst(struct hns_roce_dev *hr_dev) static void hns_roce_func_clr_rst_prc(struct hns_roce_dev *hr_dev, int retval, int flag) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hnae3_handle *handle = priv->handle; const struct hnae3_ae_ops *ops = handle->ae_algo->ops; unsigned long instance_stage; @@ -6041,7 +6041,7 @@ error_failed_kzalloc: static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, bool reset) { - struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; + struct hns_roce_dev *hr_dev = handle->priv; if (!hr_dev) return; @@ -6121,7 +6121,7 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) handle->rinfo.reset_state = HNS_ROCE_STATE_RST_DOWN; clear_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state); - hr_dev = (struct hns_roce_dev *)handle->priv; + hr_dev = handle->priv; if (!hr_dev) return 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 532dcf6a05ff..e176b0aaa4ac 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1979,7 +1979,7 @@ int hns_roce_v2_query_cqc_info(struct hns_roce_dev *hr_dev, u32 cqn, static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2], void __iomem *dest) { - struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv; + struct hns_roce_v2_priv *priv = hr_dev->priv; struct hnae3_handle *handle = priv->handle; const struct hnae3_ae_ops *ops = handle->ae_algo->ops; -- cgit v1.2.3 From f226f6765f7fe435e033da67698565ca876c2b8d Mon Sep 17 00:00:00 2001 From: Wenpeng Liang Date: Fri, 22 May 2020 21:02:57 +0800 Subject: RDMA/hns: Remove redundant parameters from free_srq/qp_wrid() The redundant parameters "hr_dev" need to be removed from free_kernel_wrid() and free_srq_wrid(). Link: https://lore.kernel.org/r/1590152579-32364-3-git-send-email-liweihang@huawei.com Signed-off-by: Wenpeng Liang Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 7 +++---- drivers/infiniband/hw/hns/hns_roce_srq.c | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 5d294209fb23..a0a47bd66975 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -842,8 +842,7 @@ err_sq: return ret; } -static void free_kernel_wrid(struct hns_roce_dev *hr_dev, - struct hns_roce_qp *hr_qp) +static void free_kernel_wrid(struct hns_roce_qp *hr_qp) { kfree(hr_qp->rq.wrid); kfree(hr_qp->sq.wrid); @@ -996,7 +995,7 @@ err_buf: err_db: free_qp_db(hr_dev, hr_qp, udata); err_wrid: - free_kernel_wrid(hr_dev, hr_qp); + free_kernel_wrid(hr_qp); return ret; } @@ -1010,7 +1009,7 @@ void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp, free_qpc(hr_dev, hr_qp); free_qpn(hr_dev, hr_qp); free_qp_buf(hr_dev, hr_qp); - free_kernel_wrid(hr_dev, hr_qp); + free_kernel_wrid(hr_qp); free_qp_db(hr_dev, hr_qp, udata); kfree(hr_qp); diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c index 3018c981f1d1..f40a000e94ee 100644 --- a/drivers/infiniband/hw/hns/hns_roce_srq.c +++ b/drivers/infiniband/hw/hns/hns_roce_srq.c @@ -270,7 +270,7 @@ static int alloc_srq_wrid(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) return 0; } -static void free_srq_wrid(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq) +static void free_srq_wrid(struct hns_roce_srq *srq) { kfree(srq->wrid); srq->wrid = NULL; @@ -355,7 +355,7 @@ int hns_roce_create_srq(struct ib_srq *ib_srq, err_srqc_alloc: free_srqc(hr_dev, srq); err_wrid_alloc: - free_srq_wrid(hr_dev, srq); + free_srq_wrid(srq); err_idx_alloc: free_srq_idx(hr_dev, srq); err_buf_alloc: @@ -370,7 +370,7 @@ void hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) free_srqc(hr_dev, srq); free_srq_idx(hr_dev, srq); - free_srq_wrid(hr_dev, srq); + free_srq_wrid(srq); free_srq_buf(hr_dev, srq); } -- cgit v1.2.3 From e4aaf4bad4651358a923ad756f8b0c9d447ea21f Mon Sep 17 00:00:00 2001 From: Lang Cheng Date: Fri, 22 May 2020 21:02:58 +0800 Subject: RDMA/hns: Simplify process related to poll cq Set hns_roce_v2_cq_set_ci to inline type and remove unnecessary next_cqe_sw_v2(). Link: https://lore.kernel.org/r/1590152579-32364-4-git-send-email-liweihang@huawei.com Signed-off-by: Lang Cheng Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 97d087ae8988..24c661a32e01 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2776,14 +2776,9 @@ static void *get_sw_cqe_v2(struct hns_roce_cq *hr_cq, int n) !!(n & hr_cq->cq_depth)) ? cqe : NULL; } -static struct hns_roce_v2_cqe *next_cqe_sw_v2(struct hns_roce_cq *hr_cq) +static inline void hns_roce_v2_cq_set_ci(struct hns_roce_cq *hr_cq, u32 ci) { - return get_sw_cqe_v2(hr_cq, hr_cq->cons_index); -} - -static void hns_roce_v2_cq_set_ci(struct hns_roce_cq *hr_cq, u32 cons_index) -{ - *hr_cq->set_ci_db = cons_index & V2_CQ_DB_PARAMETER_CONS_IDX_M; + *hr_cq->set_ci_db = ci & V2_CQ_DB_PARAMETER_CONS_IDX_M; } static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn, @@ -3106,7 +3101,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, int ret; /* Find cqe according to consumer index */ - cqe = next_cqe_sw_v2(hr_cq); + cqe = get_sw_cqe_v2(hr_cq, hr_cq->cons_index); if (!cqe) return -EAGAIN; -- cgit v1.2.3 From e1b43f07c0d4c82fd5591ae24d045fee2b00edf3 Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Fri, 22 May 2020 21:02:59 +0800 Subject: RDMA/hns: Make the end of sge process more clear Instead of i with the sge number of wr will make the comparision more clear, that is, when the sge number in wr is small than the maximum supported sge number in the queue, then a stop sge needed to be filled at the end of sges in wr. Link: https://lore.kernel.org/r/1590152579-32364-5-git-send-email-liweihang@huawei.com Signed-off-by: Yixian Liu Signed-off-by: Weihang Li Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 24c661a32e01..6454ac4ad06f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -646,7 +646,7 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, dseg++; } - if (i < hr_qp->rq.max_gs) { + if (wr->num_sge < hr_qp->rq.max_gs) { dseg->lkey = cpu_to_le32(HNS_ROCE_INVALID_LKEY); dseg->addr = 0; dseg->len = cpu_to_le32(HNS_ROCE_INVALID_SGE_LENGTH); @@ -782,7 +782,7 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq, dseg[i].addr = cpu_to_le64(wr->sg_list[i].addr); } - if (i < srq->max_gs) { + if (wr->num_sge < srq->max_gs) { dseg[i].len = cpu_to_le32(HNS_ROCE_INVALID_SGE_LENGTH); dseg[i].lkey = cpu_to_le32(HNS_ROCE_INVALID_LKEY); dseg[i].addr = 0; -- cgit v1.2.3 From 49ea0c036ede81f126f1a9389d377999fdf5c5a1 Mon Sep 17 00:00:00 2001 From: Potnuri Bharat Teja Date: Mon, 25 May 2020 00:38:14 +0530 Subject: RDMA/iw_cxgb4: cleanup device debugfs entries on ULD remove Remove device specific debugfs entries immediately if LLD detaches a particular ULD device in case of fatal PCI errors. Link: https://lore.kernel.org/r/20200524190814.17599-1-bharat@chelsio.com Signed-off-by: Potnuri Bharat Teja Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/device.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 599340c1f0b8..541dbcf22d0e 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -953,6 +953,7 @@ void c4iw_dealloc(struct uld_ctx *ctx) static void c4iw_remove(struct uld_ctx *ctx) { pr_debug("c4iw_dev %p\n", ctx->dev); + debugfs_remove_recursive(ctx->dev->debugfs_root); c4iw_unregister_device(ctx->dev); c4iw_dealloc(ctx); } -- cgit v1.2.3 From ebd6e96b33a23fe84d1a64441a04d7bc91ccc519 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Mon, 25 May 2020 16:03:05 +0300 Subject: RDMA/ipoib: Remove can_sleep parameter from iboib_mcast_alloc can_sleep is always 0 when iboib_mcast_alloc() is called, so remove it and use GFP_ATOMIC instead of GFP_KERNEL. Link: https://lore.kernel.org/r/20200525130305.171509-1-kamalheib1@gmail.com Signed-off-by: Kamal Heib Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 3d5f6b848c9e..9bfa514473d5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -135,12 +135,11 @@ static void ipoib_mcast_free(struct ipoib_mcast *mcast) kfree(mcast); } -static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, - int can_sleep) +static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev) { struct ipoib_mcast *mcast; - mcast = kzalloc(sizeof(*mcast), can_sleep ? GFP_KERNEL : GFP_ATOMIC); + mcast = kzalloc(sizeof(*mcast), GFP_ATOMIC); if (!mcast) return NULL; @@ -599,7 +598,7 @@ void ipoib_mcast_join_task(struct work_struct *work) if (!priv->broadcast) { struct ipoib_mcast *broadcast; - broadcast = ipoib_mcast_alloc(dev, 0); + broadcast = ipoib_mcast_alloc(dev); if (!broadcast) { ipoib_warn(priv, "failed to allocate broadcast group\n"); /* @@ -782,7 +781,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n", mgid); - mcast = ipoib_mcast_alloc(dev, 0); + mcast = ipoib_mcast_alloc(dev); if (!mcast) { ipoib_warn(priv, "unable to allocate memory " "for multicast structure\n"); @@ -936,7 +935,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) ipoib_dbg_mcast(priv, "adding multicast entry for mgid %pI6\n", mgid.raw); - nmcast = ipoib_mcast_alloc(dev, 0); + nmcast = ipoib_mcast_alloc(dev); if (!nmcast) { ipoib_warn(priv, "unable to allocate memory for multicast structure\n"); continue; -- cgit v1.2.3 From bebcfe85f4338ba1434561a460169a5e0af78f98 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 27 May 2020 09:41:52 -0500 Subject: RDMA/core: Use sizeof_field() helper Make use of the sizeof_field() helper instead of an open-coded version. Link: https://lore.kernel.org/r/20200527144152.GA22605@embeddedor Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/sa_query.c | 14 +++++++------- drivers/infiniband/core/ud_header.c | 2 +- drivers/infiniband/core/uverbs_cmd.c | 2 +- drivers/infiniband/core/uverbs_ioctl.c | 2 +- include/rdma/uverbs_ioctl.h | 12 ++++++------ 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 5c878646ff62..a2ed09a3c714 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -190,7 +190,7 @@ static u32 tid; #define PATH_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct sa_path_rec, field), \ - .struct_size_bytes = sizeof((struct sa_path_rec *)0)->field, \ + .struct_size_bytes = sizeof_field(struct sa_path_rec, field), \ .field_name = "sa_path_rec:" #field static const struct ib_field path_rec_table[] = { @@ -292,7 +292,7 @@ static const struct ib_field path_rec_table[] = { .struct_offset_bytes = \ offsetof(struct sa_path_rec, field), \ .struct_size_bytes = \ - sizeof((struct sa_path_rec *)0)->field, \ + sizeof_field(struct sa_path_rec, field), \ .field_name = "sa_path_rec:" #field static const struct ib_field opa_path_rec_table[] = { @@ -420,7 +420,7 @@ static const struct ib_field opa_path_rec_table[] = { #define MCMEMBER_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \ - .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_sa_mcmember_rec, field), \ .field_name = "sa_mcmember_rec:" #field static const struct ib_field mcmember_rec_table[] = { @@ -504,7 +504,7 @@ static const struct ib_field mcmember_rec_table[] = { #define SERVICE_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_service_rec, field), \ - .struct_size_bytes = sizeof ((struct ib_sa_service_rec *) 0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_sa_service_rec, field), \ .field_name = "sa_service_rec:" #field static const struct ib_field service_rec_table[] = { @@ -552,7 +552,7 @@ static const struct ib_field service_rec_table[] = { #define CLASSPORTINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \ - .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_class_port_info, field), \ .field_name = "ib_class_port_info:" #field static const struct ib_field ib_classport_info_rec_table[] = { @@ -630,7 +630,7 @@ static const struct ib_field ib_classport_info_rec_table[] = { .struct_offset_bytes =\ offsetof(struct opa_class_port_info, field), \ .struct_size_bytes = \ - sizeof((struct opa_class_port_info *)0)->field, \ + sizeof_field(struct opa_class_port_info, field), \ .field_name = "opa_class_port_info:" #field static const struct ib_field opa_classport_info_rec_table[] = { @@ -710,7 +710,7 @@ static const struct ib_field opa_classport_info_rec_table[] = { #define GUIDINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ - .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_sa_guidinfo_rec, field), \ .field_name = "sa_guidinfo_rec:" #field static const struct ib_field guidinfo_rec_table[] = { diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 29a45d2f8898..d65d541b9a25 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -41,7 +41,7 @@ #define STRUCT_FIELD(header, field) \ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ - .struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \ + .struct_size_bytes = sizeof_field(struct ib_unpacked_ ## header, field), \ .field_name = #header ":" #field static const struct ib_field lrh_table[] = { diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4859ac0df17c..2067a939788b 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3741,7 +3741,7 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) #define UAPI_DEF_WRITE_IO(req, resp) \ .write.has_resp = 1 + \ BUILD_BUG_ON_ZERO(offsetof(req, response) != 0) + \ - BUILD_BUG_ON_ZERO(sizeof(((req *)0)->response) != \ + BUILD_BUG_ON_ZERO(sizeof_field(req, response) != \ sizeof(u64)), \ .write.req_size = sizeof(req), .write.resp_size = sizeof(resp) diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 42c5696f03bd..2d882c02387c 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -137,7 +137,7 @@ EXPORT_SYMBOL(_uverbs_alloc); static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, u16 len) { - if (uattr->len > sizeof(((struct ib_uverbs_attr *)0)->data)) + if (uattr->len > sizeof_field(struct ib_uverbs_attr, data)) return ib_is_buffer_cleared(u64_to_user_ptr(uattr->data) + len, uattr->len - len); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 5bd2b037e914..0418d7bddf3e 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -420,9 +420,9 @@ struct uapi_definition { .scope = UAPI_SCOPE_OBJECT, \ .needs_fn_offset = \ offsetof(struct ib_device_ops, ibdev_fn) + \ - BUILD_BUG_ON_ZERO( \ - sizeof(((struct ib_device_ops *)0)->ibdev_fn) != \ - sizeof(void *)), \ + BUILD_BUG_ON_ZERO(sizeof_field(struct ib_device_ops, \ + ibdev_fn) != \ + sizeof(void *)), \ } /* @@ -435,9 +435,9 @@ struct uapi_definition { .scope = UAPI_SCOPE_METHOD, \ .needs_fn_offset = \ offsetof(struct ib_device_ops, ibdev_fn) + \ - BUILD_BUG_ON_ZERO( \ - sizeof(((struct ib_device_ops *)0)->ibdev_fn) != \ - sizeof(void *)), \ + BUILD_BUG_ON_ZERO(sizeof_field(struct ib_device_ops, \ + ibdev_fn) != \ + sizeof(void *)), \ } /* Call a function to determine if the entire object is supported or not */ -- cgit v1.2.3 From d246a3061528be6d852156d25c02ea69d6db7e65 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Wed, 27 May 2020 16:57:03 +0300 Subject: IB/mlx5: Fix DEVX support for MLX5_CMD_OP_INIT2INIT_QP command The commit citied in the Fixes line wasn't complete and solved only part of the problems. Update the mlx5_ib to properly support MLX5_CMD_OP_INIT2INIT_QP command in the DEVX, that is required when modify the QP tx_port_affinity. Fixes: 819f7427bafd ("RDMA/mlx5: Add init2init as a modify command") Link: https://lore.kernel.org/r/20200527135703.482501-1-leon@kernel.org Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 3047e7d60a9b..9454a66c12cc 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -495,6 +495,10 @@ static u64 devx_get_obj_id(const void *in) obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, MLX5_GET(rst2init_qp_in, in, qpn)); break; + case MLX5_CMD_OP_INIT2INIT_QP: + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(init2init_qp_in, in, qpn)); + break; case MLX5_CMD_OP_INIT2RTR_QP: obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, MLX5_GET(init2rtr_qp_in, in, qpn)); -- cgit v1.2.3 From fef17f91da7d4af11dde1ff832b82d6a64f89562 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 13:32:59 +0300 Subject: RDMA/cm: Add Enhanced Connection Establishment (ECE) bits Extend REQ (request for communications), REP (reply to request for communication), rejected reason and SIDR_REP (service ID resolution response) structures with hardware vendor ID bits according to IBTA v1.4. Link: https://lore.kernel.org/r/20200526103304.196371-2-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ibta_vol1_c12.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/rdma/ibta_vol1_c12.h b/include/rdma/ibta_vol1_c12.h index 269904425d3f..960c86bec76c 100644 --- a/include/rdma/ibta_vol1_c12.h +++ b/include/rdma/ibta_vol1_c12.h @@ -38,6 +38,7 @@ /* Table 106 REQ Message Contents */ #define CM_REQ_LOCAL_COMM_ID CM_FIELD32_LOC(struct cm_req_msg, 0, 32) +#define CM_REQ_VENDOR_ID CM_FIELD32_LOC(struct cm_req_msg, 5, 24) #define CM_REQ_SERVICE_ID CM_FIELD64_LOC(struct cm_req_msg, 8) #define CM_REQ_LOCAL_CA_GUID CM_FIELD64_LOC(struct cm_req_msg, 16) #define CM_REQ_LOCAL_Q_KEY CM_FIELD32_LOC(struct cm_req_msg, 28, 32) @@ -119,8 +120,11 @@ CM_STRUCT(struct cm_rej_msg, 84 * 8 + 1184); #define CM_REP_REMOTE_COMM_ID CM_FIELD32_LOC(struct cm_rep_msg, 4, 32) #define CM_REP_LOCAL_Q_KEY CM_FIELD32_LOC(struct cm_rep_msg, 8, 32) #define CM_REP_LOCAL_QPN CM_FIELD32_LOC(struct cm_rep_msg, 12, 24) +#define CM_REP_VENDOR_ID_H CM_FIELD8_LOC(struct cm_rep_msg, 15, 8) #define CM_REP_LOCAL_EE_CONTEXT_NUMBER CM_FIELD32_LOC(struct cm_rep_msg, 16, 24) +#define CM_REP_VENDOR_ID_M CM_FIELD8_LOC(struct cm_rep_msg, 19, 8) #define CM_REP_STARTING_PSN CM_FIELD32_LOC(struct cm_rep_msg, 20, 24) +#define CM_REP_VENDOR_ID_L CM_FIELD8_LOC(struct cm_rep_msg, 23, 8) #define CM_REP_RESPONDER_RESOURCES CM_FIELD8_LOC(struct cm_rep_msg, 24, 8) #define CM_REP_INITIATOR_DEPTH CM_FIELD8_LOC(struct cm_rep_msg, 25, 8) #define CM_REP_TARGET_ACK_DELAY CM_FIELD8_LOC(struct cm_rep_msg, 26, 5) @@ -201,7 +205,9 @@ CM_STRUCT(struct cm_sidr_req_msg, 16 * 8 + 1728); #define CM_SIDR_REP_STATUS CM_FIELD8_LOC(struct cm_sidr_rep_msg, 4, 8) #define CM_SIDR_REP_ADDITIONAL_INFORMATION_LENGTH \ CM_FIELD8_LOC(struct cm_sidr_rep_msg, 5, 8) +#define CM_SIDR_REP_VENDOR_ID_H CM_FIELD16_LOC(struct cm_sidr_rep_msg, 6, 16) #define CM_SIDR_REP_QPN CM_FIELD32_LOC(struct cm_sidr_rep_msg, 8, 24) +#define CM_SIDR_REP_VENDOR_ID_L CM_FIELD8_LOC(struct cm_sidr_rep_msg, 11, 8) #define CM_SIDR_REP_SERVICEID CM_FIELD64_LOC(struct cm_sidr_rep_msg, 12) #define CM_SIDR_REP_Q_KEY CM_FIELD32_LOC(struct cm_sidr_rep_msg, 20, 32) #define CM_SIDR_REP_ADDITIONAL_INFORMATION \ -- cgit v1.2.3 From 34e2ab57a911f8b32b22580d11a02f0b79108245 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 13:33:00 +0300 Subject: RDMA/ucma: Extend ucma_connect to receive ECE parameters Active side of CMID initiates connection through librdmacm's rdma_connect() and kernel's ucma_connect(). Extend UCMA interface to handle those new parameters. Link: https://lore.kernel.org/r/20200526103304.196371-3-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 21 +++++++++++++++++++++ drivers/infiniband/core/cma_priv.h | 1 + drivers/infiniband/core/ucma.c | 14 +++++++++++--- include/rdma/rdma_cm.h | 3 +++ include/uapi/rdma/rdma_user_cm.h | 6 ++++++ 5 files changed, 42 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 432eec472164..e81b8a523a3e 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4036,6 +4036,27 @@ err: } EXPORT_SYMBOL(rdma_connect); +/** + * rdma_connect_ece - Initiate an active connection request with ECE data. + * @id: Connection identifier to connect. + * @conn_param: Connection information used for connected QPs. + * @ece: ECE parameters + * + * See rdma_connect() explanation. + */ +int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + id_priv->ece.vendor_id = ece->vendor_id; + id_priv->ece.attr_mod = ece->attr_mod; + + return rdma_connect(id, conn_param); +} +EXPORT_SYMBOL(rdma_connect_ece); + static int cma_accept_ib(struct rdma_id_private *id_priv, struct rdma_conn_param *conn_param) { diff --git a/drivers/infiniband/core/cma_priv.h b/drivers/infiniband/core/cma_priv.h index 5edcf44a9307..caece96ebcf5 100644 --- a/drivers/infiniband/core/cma_priv.h +++ b/drivers/infiniband/core/cma_priv.h @@ -95,6 +95,7 @@ struct rdma_id_private { * Internal to RDMA/core, don't use in the drivers */ struct rdma_restrack_entry res; + struct rdma_ucm_ece ece; }; #if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 06127c800a49..7cbb63690241 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1072,12 +1072,15 @@ static void ucma_copy_conn_param(struct rdma_cm_id *id, static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { - struct rdma_ucm_connect cmd; struct rdma_conn_param conn_param; + struct rdma_ucm_ece ece = {}; + struct rdma_ucm_connect cmd; struct ucma_context *ctx; + size_t in_size; int ret; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + in_size = min_t(size_t, in_len, sizeof(cmd)); + if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; if (!cmd.conn_param.valid) @@ -1088,8 +1091,13 @@ static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, return PTR_ERR(ctx); ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); + if (offsetofend(typeof(cmd), ece) <= in_size) { + ece.vendor_id = cmd.ece.vendor_id; + ece.attr_mod = cmd.ece.attr_mod; + } + mutex_lock(&ctx->mutex); - ret = rdma_connect(ctx->cm_id, &conn_param); + ret = rdma_connect_ece(ctx->cm_id, &conn_param, &ece); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index ea8e794785ed..4e2975eb3643 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -264,6 +264,9 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, */ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param); +int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + struct rdma_ucm_ece *ece); + /** * rdma_listen - This function is called by the passive side to * listen for incoming connection requests. diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 1bb6e75d254b..c1409dd7225f 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -210,10 +210,16 @@ struct rdma_ucm_ud_param { __u8 reserved[7]; }; +struct rdma_ucm_ece { + __u32 vendor_id; + __u32 attr_mod; +}; + struct rdma_ucm_connect { struct rdma_ucm_conn_param conn_param; __u32 id; __u32 reserved; + struct rdma_ucm_ece ece; }; struct rdma_ucm_listen { -- cgit v1.2.3 From 93531ee7b9d1313227d2b4f354989895e8d57b72 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 13:33:01 +0300 Subject: RDMA/ucma: Deliver ECE parameters through UCMA events Passive side of CMID connection receives ECE request through REQ message and needs to respond with relevant REP message which will be forwarded to active side. The UCMA events interface is responsible for such communication with the user space (librdmacm). Extend it to provide ECE wire data. Link: https://lore.kernel.org/r/20200526103304.196371-4-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 6 +++++- include/rdma/rdma_cm.h | 1 + include/uapi/rdma/rdma_user_cm.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 7cbb63690241..3e5268cfa164 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -360,6 +360,9 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); + uevent->resp.ece.vendor_id = event->ece.vendor_id; + uevent->resp.ece.attr_mod = event->ece.attr_mod; + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) { if (!ctx->backlog) { ret = -ENOMEM; @@ -404,7 +407,8 @@ static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, * Old 32 bit user space does not send the 4 byte padding in the * reserved field. We don't care, allow it to keep working. */ - if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved)) + if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved) - + sizeof(uevent->resp.ece)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 4e2975eb3643..418590c9a9e8 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -111,6 +111,7 @@ struct rdma_cm_event { struct rdma_conn_param conn; struct rdma_ud_param ud; } param; + struct rdma_ucm_ece ece; }; struct rdma_cm_id; diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index c1409dd7225f..19c5c3f74af9 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -297,6 +297,7 @@ struct rdma_ucm_event_resp { struct rdma_ucm_ud_param ud; } param; __u32 reserved; + struct rdma_ucm_ece ece; }; /* Option levels */ -- cgit v1.2.3 From a20652e175f2c5cea74c90503eeaeafabd08abed Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 13:33:02 +0300 Subject: RDMA/cm: Send and receive ECE parameter over the wire ECE parameters are exchanged through REQ->REP/SIDR_REP messages, this patch adds the data to provide to other side of CMID communication channel. Link: https://lore.kernel.org/r/20200526103304.196371-5-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 39 ++++++++++++++++++++++++++++++++++----- drivers/infiniband/core/cma.c | 8 ++++++++ include/rdma/ib_cm.h | 9 ++++++++- 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index f38ff46abe8f..085c146fe400 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -66,6 +66,8 @@ static const char * const ibcm_rej_reason_strs[] = { [IB_CM_REJ_INVALID_CLASS_VERSION] = "invalid class version", [IB_CM_REJ_INVALID_FLOW_LABEL] = "invalid flow label", [IB_CM_REJ_INVALID_ALT_FLOW_LABEL] = "invalid alt flow label", + [IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED] = + "vendor option is not supported", }; const char *__attribute_const__ ibcm_reject_msg(int reason) @@ -290,6 +292,8 @@ struct cm_id_private { struct list_head work_list; atomic_t work_count; + + struct rdma_ucm_ece ece; }; static void cm_work_handler(struct work_struct *work); @@ -1318,6 +1322,13 @@ static void cm_format_mad_hdr(struct ib_mad_hdr *hdr, hdr->tid = tid; } +static void cm_format_mad_ece_hdr(struct ib_mad_hdr *hdr, __be16 attr_id, + __be64 tid, u32 attr_mod) +{ + cm_format_mad_hdr(hdr, attr_id, tid); + hdr->attr_mod = cpu_to_be32(attr_mod); +} + static void cm_format_req(struct cm_req_msg *req_msg, struct cm_id_private *cm_id_priv, struct ib_cm_req_param *param) @@ -1330,8 +1341,8 @@ static void cm_format_req(struct cm_req_msg *req_msg, pri_ext = opa_is_extended_lid(pri_path->opa.dlid, pri_path->opa.slid); - cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, - cm_form_tid(cm_id_priv)); + cm_format_mad_ece_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, + cm_form_tid(cm_id_priv), param->ece.attr_mod); IBA_SET(CM_REQ_LOCAL_COMM_ID, req_msg, be32_to_cpu(cm_id_priv->id.local_id)); @@ -1454,6 +1465,7 @@ static void cm_format_req(struct cm_req_msg *req_msg, cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay, alt_path->packet_life_time)); } + IBA_SET(CM_REQ_VENDOR_ID, req_msg, param->ece.vendor_id); if (param->private_data && param->private_data_len) IBA_SET_MEM(CM_REQ_PRIVATE_DATA, req_msg, param->private_data, @@ -1810,6 +1822,9 @@ static void cm_format_req_event(struct cm_work *work, param->rnr_retry_count = IBA_GET(CM_REQ_RNR_RETRY_COUNT, req_msg); param->srq = IBA_GET(CM_REQ_SRQ, req_msg); param->ppath_sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr; + param->ece.vendor_id = IBA_GET(CM_REQ_VENDOR_ID, req_msg); + param->ece.attr_mod = be32_to_cpu(req_msg->hdr.attr_mod); + work->cm_event.private_data = IBA_GET_MEM_PTR(CM_REQ_PRIVATE_DATA, req_msg); } @@ -2202,7 +2217,8 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg, struct cm_id_private *cm_id_priv, struct ib_cm_rep_param *param) { - cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid); + cm_format_mad_ece_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid, + param->ece.attr_mod); IBA_SET(CM_REP_LOCAL_COMM_ID, rep_msg, be32_to_cpu(cm_id_priv->id.local_id)); IBA_SET(CM_REP_REMOTE_COMM_ID, rep_msg, @@ -2229,6 +2245,10 @@ static void cm_format_rep(struct cm_rep_msg *rep_msg, IBA_SET(CM_REP_LOCAL_EE_CONTEXT_NUMBER, rep_msg, param->qp_num); } + IBA_SET(CM_REP_VENDOR_ID_L, rep_msg, param->ece.vendor_id); + IBA_SET(CM_REP_VENDOR_ID_M, rep_msg, param->ece.vendor_id >> 8); + IBA_SET(CM_REP_VENDOR_ID_H, rep_msg, param->ece.vendor_id >> 16); + if (param->private_data && param->private_data_len) IBA_SET_MEM(CM_REP_PRIVATE_DATA, rep_msg, param->private_data, param->private_data_len); @@ -2376,6 +2396,11 @@ static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type) param->flow_control = IBA_GET(CM_REP_END_TO_END_FLOW_CONTROL, rep_msg); param->rnr_retry_count = IBA_GET(CM_REP_RNR_RETRY_COUNT, rep_msg); param->srq = IBA_GET(CM_REP_SRQ, rep_msg); + param->ece.vendor_id = IBA_GET(CM_REP_VENDOR_ID_H, rep_msg) << 16; + param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_M, rep_msg) << 8; + param->ece.vendor_id |= IBA_GET(CM_REP_VENDOR_ID_L, rep_msg); + param->ece.attr_mod = be32_to_cpu(rep_msg->hdr.attr_mod); + work->cm_event.private_data = IBA_GET_MEM_PTR(CM_REP_PRIVATE_DATA, rep_msg); } @@ -3597,8 +3622,8 @@ static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg, struct cm_id_private *cm_id_priv, struct ib_cm_sidr_rep_param *param) { - cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID, - cm_id_priv->tid); + cm_format_mad_ece_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID, + cm_id_priv->tid, param->ece.attr_mod); IBA_SET(CM_SIDR_REP_REQUESTID, sidr_rep_msg, be32_to_cpu(cm_id_priv->id.remote_id)); IBA_SET(CM_SIDR_REP_STATUS, sidr_rep_msg, param->status); @@ -3606,6 +3631,10 @@ static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg, IBA_SET(CM_SIDR_REP_SERVICEID, sidr_rep_msg, be64_to_cpu(cm_id_priv->id.service_id)); IBA_SET(CM_SIDR_REP_Q_KEY, sidr_rep_msg, param->qkey); + IBA_SET(CM_SIDR_REP_VENDOR_ID_L, sidr_rep_msg, + param->ece.vendor_id & 0xFF); + IBA_SET(CM_SIDR_REP_VENDOR_ID_H, sidr_rep_msg, + (param->ece.vendor_id >> 8) & 0xFF); if (param->info && param->info_length) IBA_SET_MEM(CM_SIDR_REP_ADDITIONAL_INFORMATION, sidr_rep_msg, diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index e81b8a523a3e..f554a371f4fa 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1911,6 +1911,9 @@ static void cma_set_rep_event_data(struct rdma_cm_event *event, event->param.conn.rnr_retry_count = rep_data->rnr_retry_count; event->param.conn.srq = rep_data->srq; event->param.conn.qp_num = rep_data->remote_qpn; + + event->ece.vendor_id = rep_data->ece.vendor_id; + event->ece.attr_mod = rep_data->ece.attr_mod; } static int cma_cm_event_handler(struct rdma_id_private *id_priv, @@ -2129,6 +2132,9 @@ static void cma_set_req_event_data(struct rdma_cm_event *event, event->param.conn.rnr_retry_count = req_data->rnr_retry_count; event->param.conn.srq = req_data->srq; event->param.conn.qp_num = req_data->remote_qpn; + + event->ece.vendor_id = req_data->ece.vendor_id; + event->ece.attr_mod = req_data->ece.attr_mod; } static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, @@ -3947,6 +3953,8 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT; req.max_cm_retries = CMA_MAX_CM_RETRIES; req.srq = id_priv->srq ? 1 : 0; + req.ece.vendor_id = id_priv->ece.vendor_id; + req.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_req(id_priv); ret = ib_send_cm_req(id_priv->cm_id.ib, &req); diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index 058cfbc2b37f..0f1ea5f2d01c 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -11,6 +11,7 @@ #include #include +#include /* ib_cm and ib_user_cm modules share /sys/class/infiniband_cm */ extern struct class cm_class; @@ -115,6 +116,7 @@ struct ib_cm_req_event_param { unsigned int retry_count:3; unsigned int rnr_retry_count:3; unsigned int srq:1; + struct rdma_ucm_ece ece; }; struct ib_cm_rep_event_param { @@ -129,6 +131,7 @@ struct ib_cm_rep_event_param { unsigned int flow_control:1; unsigned int rnr_retry_count:3; unsigned int srq:1; + struct rdma_ucm_ece ece; }; enum ib_cm_rej_reason { @@ -164,7 +167,8 @@ enum ib_cm_rej_reason { IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID = 30, IB_CM_REJ_INVALID_CLASS_VERSION = 31, IB_CM_REJ_INVALID_FLOW_LABEL = 32, - IB_CM_REJ_INVALID_ALT_FLOW_LABEL = 33 + IB_CM_REJ_INVALID_ALT_FLOW_LABEL = 33, + IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED = 35, }; struct ib_cm_rej_event_param { @@ -369,6 +373,7 @@ struct ib_cm_req_param { u8 rnr_retry_count; u8 max_cm_retries; u8 srq; + struct rdma_ucm_ece ece; }; /** @@ -392,6 +397,7 @@ struct ib_cm_rep_param { u8 flow_control; u8 rnr_retry_count; u8 srq; + struct rdma_ucm_ece ece; }; /** @@ -546,6 +552,7 @@ struct ib_cm_sidr_rep_param { u8 info_length; const void *private_data; u8 private_data_len; + struct rdma_ucm_ece ece; }; /** -- cgit v1.2.3 From 0cb15372a615a9835893f43e86ae45399eb63996 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 13:33:03 +0300 Subject: RDMA/cma: Connect ECE to rdma_accept The rdma_accept() is called by both passive and active sides of CMID connection to mark readiness to start data transfer. For passive side, this is called explicitly, for active side, it is called implicitly while receiving REP message. Provide ECE data to rdma_accept function needed for passive side to send that REP message. Link: https://lore.kernel.org/r/20200526103304.196371-6-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 19 +++++++++++++++++++ drivers/infiniband/core/ucma.c | 14 +++++++++++--- include/rdma/rdma_cm.h | 3 +++ include/uapi/rdma/rdma_user_cm.h | 1 + 4 files changed, 34 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index f554a371f4fa..d449afe5557b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4090,6 +4090,8 @@ static int cma_accept_ib(struct rdma_id_private *id_priv, rep.flow_control = conn_param->flow_control; rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count); rep.srq = id_priv->srq ? 1 : 0; + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; trace_cm_send_rep(id_priv); ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep); @@ -4137,7 +4139,11 @@ static int cma_send_sidr_rep(struct rdma_id_private *id_priv, return ret; rep.qp_num = id_priv->qp_num; rep.qkey = id_priv->qkey; + + rep.ece.vendor_id = id_priv->ece.vendor_id; + rep.ece.attr_mod = id_priv->ece.attr_mod; } + rep.private_data = private_data; rep.private_data_len = private_data_len; @@ -4195,6 +4201,19 @@ reject: } EXPORT_SYMBOL(__rdma_accept); +int __rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + const char *caller, struct rdma_ucm_ece *ece) +{ + struct rdma_id_private *id_priv = + container_of(id, struct rdma_id_private, id); + + id_priv->ece.vendor_id = ece->vendor_id; + id_priv->ece.attr_mod = ece->attr_mod; + + return __rdma_accept(id, conn_param, caller); +} +EXPORT_SYMBOL(__rdma_accept_ece); + int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) { struct rdma_id_private *id_priv; diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 3e5268cfa164..6b27b210b890 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1135,28 +1135,36 @@ static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, { struct rdma_ucm_accept cmd; struct rdma_conn_param conn_param; + struct rdma_ucm_ece ece = {}; struct ucma_context *ctx; + size_t in_size; int ret; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) + in_size = min_t(size_t, in_len, sizeof(cmd)); + if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); + if (offsetofend(typeof(cmd), ece) <= in_size) { + ece.vendor_id = cmd.ece.vendor_id; + ece.attr_mod = cmd.ece.attr_mod; + } + if (cmd.conn_param.valid) { ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); mutex_lock(&file->mut); mutex_lock(&ctx->mutex); - ret = __rdma_accept(ctx->cm_id, &conn_param, NULL); + ret = __rdma_accept_ece(ctx->cm_id, &conn_param, NULL, &ece); mutex_unlock(&ctx->mutex); if (!ret) ctx->uid = cmd.uid; mutex_unlock(&file->mut); } else { mutex_lock(&ctx->mutex); - ret = __rdma_accept(ctx->cm_id, NULL, NULL); + ret = __rdma_accept_ece(ctx->cm_id, NULL, NULL, &ece); mutex_unlock(&ctx->mutex); } ucma_put_ctx(ctx); diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 418590c9a9e8..7ac91677660f 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -280,6 +280,9 @@ int rdma_listen(struct rdma_cm_id *id, int backlog); int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, const char *caller); +int __rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, + const char *caller, struct rdma_ucm_ece *ece); + /** * rdma_accept - Called to accept a connection request or response. * @id: Connection identifier associated with the request. diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 19c5c3f74af9..6b883dde7064 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -232,6 +232,7 @@ struct rdma_ucm_accept { struct rdma_ucm_conn_param conn_param; __u32 id; __u32 reserved; + struct rdma_ucm_ece ece; }; struct rdma_ucm_reject { -- cgit v1.2.3 From 8094ba0ace7f6cd1e31ea8b151fba3594cadfa9a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 13:33:04 +0300 Subject: RDMA/cma: Provide ECE reject reason IBTA declares "vendor option not supported" reject reason in REJ messages if passive side doesn't want to accept proposed ECE options. Due to the fact that ECE is managed by userspace, there is a need to let users to provide such rejected reason. Link: https://lore.kernel.org/r/20200526103304.196371-7-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 9 ++++----- drivers/infiniband/core/ucma.c | 15 ++++++++++++++- drivers/infiniband/ulp/isert/ib_isert.c | 5 +++-- drivers/infiniband/ulp/rtrs/rtrs-srv.c | 3 ++- drivers/infiniband/ulp/srpt/ib_srpt.c | 3 ++- drivers/nvme/target/rdma.c | 4 +++- include/rdma/rdma_cm.h | 2 +- include/uapi/rdma/rdma_user_cm.h | 3 ++- net/rds/ib_cm.c | 4 +++- 9 files changed, 34 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index d449afe5557b..8026ee56546a 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4196,7 +4196,7 @@ int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, return 0; reject: cma_modify_qp_err(id_priv); - rdma_reject(id, NULL, 0); + rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } EXPORT_SYMBOL(__rdma_accept); @@ -4236,7 +4236,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event) EXPORT_SYMBOL(rdma_notify); int rdma_reject(struct rdma_cm_id *id, const void *private_data, - u8 private_data_len) + u8 private_data_len, u8 reason) { struct rdma_id_private *id_priv; int ret; @@ -4251,9 +4251,8 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, private_data, private_data_len); } else { trace_cm_send_rej(id_priv); - ret = ib_send_cm_rej(id_priv->cm_id.ib, - IB_CM_REJ_CONSUMER_DEFINED, NULL, - 0, private_data, private_data_len); + ret = ib_send_cm_rej(id_priv->cm_id.ib, reason, NULL, 0, + private_data, private_data_len); } } else if (rdma_cap_iw_cm(id->device, id->port_num)) { ret = iw_cm_reject(id_priv->cm_id.iw, diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 6b27b210b890..5b87eee8ccc8 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include "core_priv.h" @@ -1181,12 +1182,24 @@ static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; + if (!cmd.reason) + cmd.reason = IB_CM_REJ_CONSUMER_DEFINED; + + switch (cmd.reason) { + case IB_CM_REJ_CONSUMER_DEFINED: + case IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED: + break; + default: + return -EINVAL; + } + ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); - ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len); + ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len, + cmd.reason); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index a1a035270cab..b7df38ee8ae0 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -502,7 +503,7 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) if (!np->enabled) { spin_unlock_bh(&np->np_thread_lock); isert_dbg("iscsi_np is not enabled, reject connect request\n"); - return rdma_reject(cma_id, NULL, 0); + return rdma_reject(cma_id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); } spin_unlock_bh(&np->np_thread_lock); @@ -553,7 +554,7 @@ out_rsp_dma_map: isert_free_login_buf(isert_conn); out: kfree(isert_conn); - rdma_reject(cma_id, NULL, 0); + rdma_reject(cma_id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED); return ret; } diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 5ef8988ee75b..0d9241f5d9e6 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -15,6 +15,7 @@ #include "rtrs-srv.h" #include "rtrs-log.h" +#include MODULE_DESCRIPTION("RDMA Transport Server"); MODULE_LICENSE("GPL"); @@ -1576,7 +1577,7 @@ static int rtrs_rdma_do_reject(struct rdma_cm_id *cm_id, int errno) .errno = cpu_to_le16(errno), }; - err = rdma_reject(cm_id, &msg, sizeof(msg)); + err = rdma_reject(cm_id, &msg, sizeof(msg), IB_CM_REJ_CONSUMER_DEFINED); if (err) pr_err("rdma_reject(), err: %d\n", err); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index a294630f2100..cdc8c239d6c0 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -2497,7 +2497,8 @@ reject: SRP_BUF_FORMAT_INDIRECT); if (rdma_cm_id) - rdma_reject(rdma_cm_id, rej, sizeof(*rej)); + rdma_reject(rdma_cm_id, rej, sizeof(*rej), + IB_CM_REJ_CONSUMER_DEFINED); else ib_send_cm_rej(ib_cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, rej, sizeof(*rej)); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index fd47de0e4e4e..55aaf03a9580 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "nvmet.h" @@ -1138,7 +1139,8 @@ static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id, rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); rej.sts = cpu_to_le16(status); - return rdma_reject(cm_id, (void *)&rej, sizeof(rej)); + return rdma_reject(cm_id, (void *)&rej, sizeof(rej), + IB_CM_REJ_CONSUMER_DEFINED); } static struct nvmet_rdma_queue * diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 7ac91677660f..939d7abe026f 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -320,7 +320,7 @@ int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event); * rdma_reject - Called to reject a connection request or response. */ int rdma_reject(struct rdma_cm_id *id, const void *private_data, - u8 private_data_len); + u8 private_data_len, u8 reason); /** * rdma_disconnect - This function disconnects the associated QP and diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 6b883dde7064..ed5a514305c1 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -238,7 +238,8 @@ struct rdma_ucm_accept { struct rdma_ucm_reject { __u32 id; __u8 private_data_len; - __u8 reserved[3]; + __u8 reason; + __u8 reserved[2]; __u8 private_data[RDMA_MAX_PRIVATE_DATA]; }; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index c71f4328d138..0fec4171564e 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "rds_single_path.h" #include "rds.h" @@ -927,7 +928,8 @@ out: if (conn) mutex_unlock(&conn->c_cm_lock); if (err) - rdma_reject(cm_id, &err, sizeof(int)); + rdma_reject(cm_id, &err, sizeof(int), + IB_CM_REJ_CONSUMER_DEFINED); return destroy; } -- cgit v1.2.3 From 3e09a427ae7ac347e08dca5ffac64c902860d675 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 14:54:34 +0300 Subject: RDMA/mlx5: Get ECE options from FW during create QP Supported ECE options are returned from FW in the create_qp phase and zero means that field is not valid. Such default value allows us to reuse reserved field without worries about comp_mask. Update create QP API to return ECE options. Link: https://lore.kernel.org/r/20200526115440.205922-3-leon@kernel.org Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 16 +++++++++++----- drivers/infiniband/hw/mlx5/qp.h | 4 ++-- drivers/infiniband/hw/mlx5/qpc.c | 8 ++++---- include/uapi/rdma/mlx5-abi.h | 2 +- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 2e28752e8cd2..be7289c480f7 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1842,6 +1842,7 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr = params->attr; u32 uidx = params->uidx; struct mlx5_ib_resources *devr = &dev->devr; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_qp_base *base; @@ -1894,13 +1895,14 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, } base = &qp->trans_qp.base; - err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); + err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out); kvfree(in); if (err) return err; base->container_mibqp = qp; base->mqp.event = mlx5_ib_qp_event; + params->resp.ece_options = MLX5_GET(create_qp_out, out, ece); spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); list_add_tail(&qp->qps_list, &dev->qp_list); @@ -1916,6 +1918,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, { struct ib_qp_init_attr *init_attr = params->attr; struct mlx5_ib_create_qp *ucmd = params->ucmd; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; struct ib_udata *udata = params->udata; u32 uidx = params->uidx; struct mlx5_ib_resources *devr = &dev->devr; @@ -2065,7 +2068,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata, ¶ms->resp); } else - err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); + err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out); kvfree(in); if (err) @@ -2073,6 +2076,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, base->container_mibqp = qp; base->mqp.event = mlx5_ib_qp_event; + params->resp.ece_options = MLX5_GET(create_qp_out, out, ece); get_cqs(qp->type, init_attr->send_cq, init_attr->recv_cq, &send_cq, &recv_cq); @@ -2105,6 +2109,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *attr = params->attr; u32 uidx = params->uidx; struct mlx5_ib_resources *devr = &dev->devr; + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_cq *send_cq; @@ -2195,7 +2200,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1); - err = mlx5_core_create_qp(dev, &base->mqp, in, inlen); + err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out); kvfree(in); if (err) goto err_create; @@ -2779,12 +2784,13 @@ out: qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; mlx5_ib_dbg(dev, - "QP type %d, ib qpn 0x%X, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", + "QP type %d, ib qpn 0x%X, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x, ece 0x%x\n", qp->type, qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn, params->attr->recv_cq ? to_mcq(params->attr->recv_cq)->mcq.cqn : -1, params->attr->send_cq ? to_mcq(params->attr->send_cq)->mcq.cqn : - -1); + -1, + params->resp.ece_options); return 0; } diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h index ad9d76e3e18a..795c21f88962 100644 --- a/drivers/infiniband/hw/mlx5/qp.h +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -13,8 +13,8 @@ void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev); int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *qp, u32 *in, int inlen, u32 *out, int outlen); -int mlx5_core_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, - u32 *in, int inlen); +int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *in, int inlen, u32 *out); int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, void *qpc, struct mlx5_core_qp *qp); int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp); diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index ea62735042f0..69c80859a6ee 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -236,16 +236,16 @@ err_cmd: return err; } -int mlx5_core_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, - u32 *in, int inlen) +int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, + u32 *in, int inlen, u32 *out) { - u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; u32 din[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; int err; MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); - err = mlx5_cmd_exec(dev->mdev, in, inlen, out, sizeof(out)); + err = mlx5_cmd_exec(dev->mdev, in, inlen, out, + MLX5_ST_SZ_BYTES(create_qp_out)); if (err) return err; diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index df1cc3641bda..106fbb3bec6a 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -371,7 +371,7 @@ enum mlx5_ib_create_qp_resp_mask { struct mlx5_ib_create_qp_resp { __u32 bfreg_index; - __u32 reserved; + __u32 ece_options; __u32 comp_mask; __u32 tirn; __u32 tisn; -- cgit v1.2.3 From e383085c24255821e79d3c2aa6302d804b6a1c48 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 14:54:35 +0300 Subject: RDMA/mlx5: Set ECE options during QP create Allow users to ask creation of QPs with specific ECE options. Such early set even before RDMA-CM connection is established is useful if user knows exactly which option he needs. Link: https://lore.kernel.org/r/20200526115440.205922-4-leon@kernel.org Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 75 ++++++++++++++++++++++++++++++++++++----- include/uapi/rdma/mlx5-abi.h | 2 ++ 2 files changed, 68 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index be7289c480f7..eb70eb371b4b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1552,6 +1552,7 @@ struct mlx5_create_qp_params { struct ib_udata *udata; size_t inlen; size_t outlen; + size_t ucmd_size; void *ucmd; u8 is_rss_raw : 1; struct ib_qp_init_attr *attr; @@ -1839,6 +1840,7 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_create_qp_params *params) { + struct mlx5_ib_create_qp *ucmd = params->ucmd; struct ib_qp_init_attr *attr = params->attr; u32 uidx = params->uidx; struct mlx5_ib_resources *devr = &dev->devr; @@ -1860,6 +1862,8 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (!in) return -ENOMEM; + if (MLX5_CAP_GEN(mdev, ece_support)) + MLX5_SET(create_qp_in, in, ece, ucmd->ece_options); qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); MLX5_SET(qpc, qpc, st, MLX5_QP_ST_XRC); @@ -1974,6 +1978,8 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (is_sqp(init_attr->qp_type)) qp->port = init_attr->port_num; + if (MLX5_CAP_GEN(mdev, ece_support)) + MLX5_SET(create_qp_in, in, ece, ucmd->ece_options); qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); MLX5_SET(qpc, qpc, st, mlx5_st); @@ -2709,19 +2715,22 @@ static int process_udata_size(struct mlx5_ib_dev *dev, struct mlx5_create_qp_params *params) { size_t ucmd = sizeof(struct mlx5_ib_create_qp); - struct ib_qp_init_attr *attr = params->attr; struct ib_udata *udata = params->udata; size_t outlen = udata->outlen; size_t inlen = udata->inlen; params->outlen = min(outlen, sizeof(struct mlx5_ib_create_qp_resp)); - if (attr->qp_type == IB_QPT_DRIVER) { - params->inlen = (inlen < ucmd) ? 0 : ucmd; - goto out; - } - + params->ucmd_size = ucmd; if (!params->is_rss_raw) { - params->inlen = ucmd; + /* User has old rdma-core, which doesn't support ECE */ + size_t min_inlen = + offsetof(struct mlx5_ib_create_qp, ece_options); + + /* + * We will check in check_ucmd_data() that user + * cleared everything after inlen. + */ + params->inlen = (inlen < min_inlen) ? 0 : min(inlen, ucmd); goto out; } @@ -2733,13 +2742,14 @@ static int process_udata_size(struct mlx5_ib_dev *dev, return -EINVAL; ucmd = sizeof(struct mlx5_ib_create_qp_rss); + params->ucmd_size = ucmd; if (inlen > ucmd && !ib_is_udata_cleared(udata, ucmd, inlen - ucmd)) return -EINVAL; params->inlen = min(ucmd, inlen); out: if (!params->inlen) - mlx5_ib_dbg(dev, "udata is too small or not cleared\n"); + mlx5_ib_dbg(dev, "udata is too small\n"); return (params->inlen) ? 0 : -EINVAL; } @@ -2855,6 +2865,49 @@ static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) return 0; } +static int check_ucmd_data(struct mlx5_ib_dev *dev, + struct mlx5_create_qp_params *params) +{ + struct ib_qp_init_attr *attr = params->attr; + struct ib_udata *udata = params->udata; + size_t size, last; + int ret; + + if (params->is_rss_raw) + /* + * These QPs don't have "reserved" field in their + * create_qp input struct, so their data is always valid. + */ + last = sizeof(struct mlx5_ib_create_qp_rss); + else + /* IB_QPT_RAW_PACKET and IB_QPT_DRIVER don't have ECE data */ + switch (attr->qp_type) { + case IB_QPT_DRIVER: + case IB_QPT_RAW_PACKET: + last = offsetof(struct mlx5_ib_create_qp, ece_options); + break; + default: + last = offsetof(struct mlx5_ib_create_qp, reserved); + } + + if (udata->inlen <= last) + return 0; + + /* + * User provides different create_qp structures based on the + * flow and we need to know if he cleared memory after our + * struct create_qp ends. + */ + size = udata->inlen - last; + ret = ib_is_udata_cleared(params->udata, last, size); + if (!ret) + mlx5_ib_dbg( + dev, + "udata is not cleared, inlen = %lu, ucmd = %lu, last = %lu, size = %lu\n", + udata->inlen, params->ucmd_size, last, size); + return ret ? 0 : -EINVAL; +} + struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, struct ib_udata *udata) { @@ -2888,7 +2941,11 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr, if (err) return ERR_PTR(err); - params.ucmd = kzalloc(params.inlen, GFP_KERNEL); + err = check_ucmd_data(dev, ¶ms); + if (err) + return ERR_PTR(err); + + params.ucmd = kzalloc(params.ucmd_size, GFP_KERNEL); if (!params.ucmd) return ERR_PTR(-ENOMEM); diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 106fbb3bec6a..bc9d9e3cb369 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -322,6 +322,8 @@ struct mlx5_ib_create_qp { __aligned_u64 sq_buf_addr; __aligned_u64 access_key; }; + __u32 ece_options; + __u32 reserved; }; /* RX Hash function flags */ -- cgit v1.2.3 From 64bae2d455f6058572ac4d23a8ea9e47c9d10f03 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 14:54:36 +0300 Subject: RDMA/mlx5: Use direct modify QP implementation As a preparation to removal hand crafted mlx5_qp_context, convert counter code to use mlx5_cmd_exec_in() directly. Link: https://lore.kernel.org/r/20200526115440.205922-5-leon@kernel.org Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index eb70eb371b4b..8a3aee57a196 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3688,10 +3688,11 @@ static int __mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter) { struct mlx5_ib_dev *dev = to_mdev(qp->device); + u32 in[MLX5_ST_SZ_DW(rts2rts_qp_in)] = {}; struct mlx5_ib_qp *mqp = to_mqp(qp); - struct mlx5_qp_context context = {}; struct mlx5_ib_qp_base *base; u32 set_id; + u32 *qpc; if (counter) set_id = counter->id; @@ -3699,11 +3700,15 @@ static int __mlx5_ib_qp_set_counter(struct ib_qp *qp, set_id = mlx5_ib_get_counters_id(dev, mqp->port - 1); base = &mqp->trans_qp.base; - context.qp_counter_set_usr_page &= cpu_to_be32(0xffffff); - context.qp_counter_set_usr_page |= cpu_to_be32(set_id << 24); - return mlx5_core_qp_modify(dev, MLX5_CMD_OP_RTS2RTS_QP, - MLX5_QP_OPTPAR_COUNTER_SET_ID, &context, - &base->mqp); + MLX5_SET(rts2rts_qp_in, in, opcode, MLX5_CMD_OP_RTS2RTS_QP); + MLX5_SET(rts2rts_qp_in, in, qpn, base->mqp.qpn); + MLX5_SET(rts2rts_qp_in, in, uid, base->mqp.uid); + MLX5_SET(rts2rts_qp_in, in, opt_param_mask, + MLX5_QP_OPTPAR_COUNTER_SET_ID); + + qpc = MLX5_ADDR_OF(rts2rts_qp_in, in, qpc); + MLX5_SET(qpc, qpc, counter_set_id, set_id); + return mlx5_cmd_exec_in(dev->mdev, rts2rts_qp, in); } static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, -- cgit v1.2.3 From 70bd7fb8762528ac0e69a8ae0f485298dff57043 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 14:54:37 +0300 Subject: RDMA/mlx5: Remove manually crafted QP context the query call As a preparation to removal hand crafted mlx5_qp_context, convert query_qp_attr() to use proper MLX5_GET() macros. Link: https://lore.kernel.org/r/20200526115440.205922-6-leon@kernel.org Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 129 +++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 73 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8a3aee57a196..5099866533dd 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4330,50 +4330,35 @@ static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state) } } -static int to_ib_qp_access_flags(int mlx5_flags) -{ - int ib_flags = 0; - - if (mlx5_flags & MLX5_QP_BIT_RRE) - ib_flags |= IB_ACCESS_REMOTE_READ; - if (mlx5_flags & MLX5_QP_BIT_RWE) - ib_flags |= IB_ACCESS_REMOTE_WRITE; - if (mlx5_flags & MLX5_QP_BIT_RAE) - ib_flags |= IB_ACCESS_REMOTE_ATOMIC; - - return ib_flags; -} - static void to_rdma_ah_attr(struct mlx5_ib_dev *ibdev, - struct rdma_ah_attr *ah_attr, - struct mlx5_qp_path *path) + struct rdma_ah_attr *ah_attr, void *path) { + int port = MLX5_GET(ads, path, vhca_port_num); + int static_rate; memset(ah_attr, 0, sizeof(*ah_attr)); - if (!path->port || path->port > ibdev->num_ports) + if (!port || port > ibdev->num_ports) return; - ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, path->port); + ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, port); - rdma_ah_set_port_num(ah_attr, path->port); - rdma_ah_set_sl(ah_attr, path->dci_cfi_prio_sl & 0xf); + rdma_ah_set_port_num(ah_attr, port); + rdma_ah_set_sl(ah_attr, MLX5_GET(ads, path, sl)); - rdma_ah_set_dlid(ah_attr, be16_to_cpu(path->rlid)); - rdma_ah_set_path_bits(ah_attr, path->grh_mlid & 0x7f); - rdma_ah_set_static_rate(ah_attr, - path->static_rate ? path->static_rate - 5 : 0); + rdma_ah_set_dlid(ah_attr, MLX5_GET(ads, path, rlid)); + rdma_ah_set_path_bits(ah_attr, MLX5_GET(ads, path, mlid)); - if (path->grh_mlid & (1 << 7) || + static_rate = MLX5_GET(ads, path, stat_rate); + rdma_ah_set_static_rate(ah_attr, static_rate ? static_rate - 5 : 0); + if (MLX5_GET(ads, path, grh) || ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { - u32 tc_fl = be32_to_cpu(path->tclass_flowlabel); - - rdma_ah_set_grh(ah_attr, NULL, - tc_fl & 0xfffff, - path->mgid_index, - path->hop_limit, - (tc_fl >> 20) & 0xff); - rdma_ah_set_dgid_raw(ah_attr, path->rgid); + rdma_ah_set_grh(ah_attr, NULL, MLX5_GET(ads, path, flow_label), + MLX5_GET(ads, path, src_addr_index), + MLX5_GET(ads, path, hop_limit), + MLX5_GET(ads, path, tclass)); + memcpy(ah_attr, MLX5_ADDR_OF(ads, path, rgid_rip), + MLX5_FLD_SZ_BYTES(ads, rgid_rip)); } } @@ -4495,10 +4480,9 @@ static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct ib_qp_attr *qp_attr) { int outlen = MLX5_ST_SZ_BYTES(query_qp_out); - struct mlx5_qp_context *context; - int mlx5_state; + void *qpc, *pri_path, *alt_path; u32 *outb; - int err = 0; + int err; outb = kzalloc(outlen, GFP_KERNEL); if (!outb) @@ -4508,47 +4492,46 @@ static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (err) goto out; - /* FIXME: use MLX5_GET rather than mlx5_qp_context manual struct */ - context = (struct mlx5_qp_context *)MLX5_ADDR_OF(query_qp_out, outb, qpc); + qpc = MLX5_ADDR_OF(query_qp_out, outb, qpc); + + qp->state = to_ib_qp_state(MLX5_GET(qpc, qpc, state)); + if (MLX5_GET(qpc, qpc, state) == MLX5_QP_STATE_SQ_DRAINING) + qp_attr->sq_draining = 1; + + qp_attr->path_mtu = MLX5_GET(qpc, qpc, mtu); + qp_attr->path_mig_state = to_ib_mig_state(MLX5_GET(qpc, qpc, pm_state)); + qp_attr->qkey = MLX5_GET(qpc, qpc, q_key); + qp_attr->rq_psn = MLX5_GET(qpc, qpc, next_rcv_psn); + qp_attr->sq_psn = MLX5_GET(qpc, qpc, next_send_psn); + qp_attr->dest_qp_num = MLX5_GET(qpc, qpc, remote_qpn); - mlx5_state = be32_to_cpu(context->flags) >> 28; + if (MLX5_GET(qpc, qpc, rre)) + qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ; + if (MLX5_GET(qpc, qpc, rwe)) + qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_WRITE; + if (MLX5_GET(qpc, qpc, rae)) + qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_ATOMIC; - qp->state = to_ib_qp_state(mlx5_state); - qp_attr->path_mtu = context->mtu_msgmax >> 5; - qp_attr->path_mig_state = - to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); - qp_attr->qkey = be32_to_cpu(context->qkey); - qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff; - qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff; - qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff; - qp_attr->qp_access_flags = - to_ib_qp_access_flags(be32_to_cpu(context->params2)); + qp_attr->max_rd_atomic = 1 << MLX5_GET(qpc, qpc, log_sra_max); + qp_attr->max_dest_rd_atomic = 1 << MLX5_GET(qpc, qpc, log_rra_max); + qp_attr->min_rnr_timer = MLX5_GET(qpc, qpc, min_rnr_nak); + qp_attr->retry_cnt = MLX5_GET(qpc, qpc, retry_count); + qp_attr->rnr_retry = MLX5_GET(qpc, qpc, rnr_retry); + + pri_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path); + alt_path = MLX5_ADDR_OF(qpc, qpc, secondary_address_path); if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { - to_rdma_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); - to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); - qp_attr->alt_pkey_index = - be16_to_cpu(context->alt_path.pkey_index); - qp_attr->alt_port_num = - rdma_ah_get_port_num(&qp_attr->alt_ah_attr); - } - - qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index); - qp_attr->port_num = context->pri_path.port; - - /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ - qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING; - - qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7); - - qp_attr->max_dest_rd_atomic = - 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7); - qp_attr->min_rnr_timer = - (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f; - qp_attr->timeout = context->pri_path.ackto_lt >> 3; - qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; - qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; - qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + to_rdma_ah_attr(dev, &qp_attr->ah_attr, pri_path); + to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, alt_path); + qp_attr->alt_pkey_index = MLX5_GET(ads, alt_path, pkey_index); + qp_attr->alt_port_num = MLX5_GET(ads, alt_path, vhca_port_num); + } + + qp_attr->pkey_index = MLX5_GET(ads, pri_path, pkey_index); + qp_attr->port_num = MLX5_GET(ads, pri_path, vhca_port_num); + qp_attr->timeout = MLX5_GET(ads, pri_path, ack_timeout); + qp_attr->alt_timeout = MLX5_GET(ads, alt_path, ack_timeout); out: kfree(outb); -- cgit v1.2.3 From f18e26af6aba778b888044859d9c69bb9bbc7bc1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 14:54:38 +0300 Subject: RDMA/mlx5: Convert modify QP to use MLX5_SET macros Instead of hand crafted mlx5_qp_context and mlx5_qp_path use common MLX5_SET() macros. Link: https://lore.kernel.org/r/20200526115440.205922-7-leon@kernel.org Reviewed-by: Maor Gottlieb Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 203 +++++++++++++++++++--------------------- include/linux/mlx5/qp.h | 66 ------------- 2 files changed, 97 insertions(+), 172 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5099866533dd..a24176a8ec83 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3028,14 +3028,13 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata) return 0; } -static int to_mlx5_access_flags(struct mlx5_ib_qp *qp, - const struct ib_qp_attr *attr, - int attr_mask, __be32 *hw_access_flags_be) +static int set_qpc_atomic_flags(struct mlx5_ib_qp *qp, + const struct ib_qp_attr *attr, int attr_mask, + void *qpc) { - u8 dest_rd_atomic; - u32 access_flags, hw_access_flags = 0; - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + u8 dest_rd_atomic; + u32 access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = attr->max_dest_rd_atomic; @@ -3050,8 +3049,8 @@ static int to_mlx5_access_flags(struct mlx5_ib_qp *qp, if (!dest_rd_atomic) access_flags &= IB_ACCESS_REMOTE_WRITE; - if (access_flags & IB_ACCESS_REMOTE_READ) - hw_access_flags |= MLX5_QP_BIT_RRE; + MLX5_SET(qpc, qpc, rre, !!(access_flags & IB_ACCESS_REMOTE_READ)); + if (access_flags & IB_ACCESS_REMOTE_ATOMIC) { int atomic_mode; @@ -3059,15 +3058,11 @@ static int to_mlx5_access_flags(struct mlx5_ib_qp *qp, if (atomic_mode < 0) return -EOPNOTSUPP; - hw_access_flags |= MLX5_QP_BIT_RAE; - hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET; + MLX5_SET(qpc, qpc, rae, 1); + MLX5_SET(qpc, qpc, atomic_mode, atomic_mode); } - if (access_flags & IB_ACCESS_REMOTE_WRITE) - hw_access_flags |= MLX5_QP_BIT_RWE; - - *hw_access_flags_be = cpu_to_be32(hw_access_flags); - + MLX5_SET(qpc, qpc, rwe, !!(access_flags & IB_ACCESS_REMOTE_WRITE)); return 0; } @@ -3147,26 +3142,22 @@ static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, return err; } -static void mlx5_set_path_udp_sport(struct mlx5_qp_path *path, - const struct rdma_ah_attr *ah, +static void mlx5_set_path_udp_sport(void *path, const struct rdma_ah_attr *ah, u32 lqpn, u32 rqpn) { u32 fl = ah->grh.flow_label; - u16 sport; if (!fl) fl = rdma_calc_flow_label(lqpn, rqpn); - sport = rdma_flow_label_to_udp_sport(fl); - path->udp_sport = cpu_to_be16(sport); + MLX5_SET(ads, path, udp_sport, rdma_flow_label_to_udp_sport(fl)); } static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - const struct rdma_ah_attr *ah, - struct mlx5_qp_path *path, u8 port, int attr_mask, - u32 path_flags, const struct ib_qp_attr *attr, - bool alt) + const struct rdma_ah_attr *ah, void *path, u8 port, + int attr_mask, u32 path_flags, + const struct ib_qp_attr *attr, bool alt) { const struct ib_global_route *grh = rdma_ah_read_grh(ah); int err; @@ -3175,8 +3166,8 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, u8 sl = rdma_ah_get_sl(ah); if (attr_mask & IB_QP_PKEY_INDEX) - path->pkey_index = cpu_to_be16(alt ? attr->alt_pkey_index : - attr->pkey_index); + MLX5_SET(ads, path, pkey_index, + alt ? attr->alt_pkey_index : attr->pkey_index); if (ah_flags & IB_AH_GRH) { if (grh->sgid_index >= @@ -3192,7 +3183,8 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (!(ah_flags & IB_AH_GRH)) return -EINVAL; - memcpy(path->rmac, ah->roce.dmac, sizeof(ah->roce.dmac)); + ether_addr_copy(MLX5_ADDR_OF(ads, path, rmac_47_32), + ah->roce.dmac); if ((qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC || qp->ibqp.qp_type == IB_QPT_XRC_INI || @@ -3202,38 +3194,38 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, mlx5_set_path_udp_sport(path, ah, qp->ibqp.qp_num, attr->dest_qp_num); - path->dci_cfi_prio_sl = (sl & 0x7) << 4; + MLX5_SET(ads, path, eth_prio, sl & 0x7); gid_type = ah->grh.sgid_attr->gid_type; if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) - path->ecn_dscp = (grh->traffic_class >> 2) & 0x3f; + MLX5_SET(ads, path, dscp, grh->traffic_class >> 2); } else { - path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; - path->fl_free_ar |= - (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0; - path->rlid = cpu_to_be16(rdma_ah_get_dlid(ah)); - path->grh_mlid = rdma_ah_get_path_bits(ah) & 0x7f; - if (ah_flags & IB_AH_GRH) - path->grh_mlid |= 1 << 7; - path->dci_cfi_prio_sl = sl & 0xf; + MLX5_SET(ads, path, fl, !!(path_flags & MLX5_PATH_FLAG_FL)); + MLX5_SET(ads, path, free_ar, + !!(path_flags & MLX5_PATH_FLAG_FREE_AR)); + MLX5_SET(ads, path, rlid, rdma_ah_get_dlid(ah)); + MLX5_SET(ads, path, mlid, rdma_ah_get_path_bits(ah)); + MLX5_SET(ads, path, grh, !!(ah_flags & IB_AH_GRH)); + MLX5_SET(ads, path, sl, sl); } if (ah_flags & IB_AH_GRH) { - path->mgid_index = grh->sgid_index; - path->hop_limit = grh->hop_limit; - path->tclass_flowlabel = - cpu_to_be32((grh->traffic_class << 20) | - (grh->flow_label)); - memcpy(path->rgid, grh->dgid.raw, 16); + MLX5_SET(ads, path, src_addr_index, grh->sgid_index); + MLX5_SET(ads, path, hop_limit, grh->hop_limit); + MLX5_SET(ads, path, tclass, grh->traffic_class); + MLX5_SET(ads, path, flow_label, grh->flow_label); + memcpy(MLX5_ADDR_OF(ads, path, rgid_rip), grh->dgid.raw, + sizeof(grh->dgid.raw)); } err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah)); if (err < 0) return err; - path->static_rate = err; - path->port = port; + MLX5_SET(ads, path, stat_rate, err); + MLX5_SET(ads, path, vhca_port_num, port); if (attr_mask & IB_QP_TIMEOUT) - path->ackto_lt = (alt ? attr->alt_timeout : attr->timeout) << 3; + MLX5_SET(ads, path, ack_timeout, + alt ? attr->alt_timeout : attr->timeout); if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt) return modify_raw_packet_eth_prio(dev->mdev, @@ -3759,9 +3751,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_ib_cq *send_cq, *recv_cq; - struct mlx5_qp_context *context; struct mlx5_ib_pd *pd; enum mlx5_qp_state mlx5_cur, mlx5_new; + void *qpc, *pri_path, *alt_path; enum mlx5_qp_optpar optpar = 0; u32 set_id = 0; int mlx5_st; @@ -3773,25 +3765,25 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (mlx5_st < 0) return -EINVAL; - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) + qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL); + if (!qpc) return -ENOMEM; pd = to_mpd(qp->ibqp.pd); - context->flags = cpu_to_be32(mlx5_st << 16); + MLX5_SET(qpc, qpc, st, mlx5_st); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { - context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); } else { switch (attr->path_mig_state) { case IB_MIG_MIGRATED: - context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); break; case IB_MIG_REARM: - context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_REARM); break; case IB_MIG_ARMED: - context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_ARMED); break; } } @@ -3799,19 +3791,20 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, tx_affinity = get_tx_affinity(ibqp, attr, attr_mask, cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT, udata); - if (tx_affinity) { - context->flags |= cpu_to_be32(tx_affinity << 24); - if (new_state == IB_QPS_RTR && - MLX5_CAP_GEN(dev->mdev, init2_lag_tx_port_affinity)) - optpar |= MLX5_QP_OPTPAR_LAG_TX_AFF; - } + + MLX5_SET(qpc, qpc, lag_tx_port_affinity, tx_affinity); + if (tx_affinity && new_state == IB_QPS_RTR && + MLX5_CAP_GEN(dev->mdev, init2_lag_tx_port_affinity)) + optpar |= MLX5_QP_OPTPAR_LAG_TX_AFF; if (is_sqp(ibqp->qp_type)) { - context->mtu_msgmax = (IB_MTU_256 << 5) | 8; + MLX5_SET(qpc, qpc, mtu, IB_MTU_256); + MLX5_SET(qpc, qpc, log_msg_max, 8); } else if ((ibqp->qp_type == IB_QPT_UD && !(qp->flags & IB_QP_CREATE_SOURCE_QPN)) || ibqp->qp_type == MLX5_IB_QPT_REG_UMR) { - context->mtu_msgmax = (IB_MTU_4096 << 5) | 12; + MLX5_SET(qpc, qpc, mtu, IB_MTU_4096); + MLX5_SET(qpc, qpc, log_msg_max, 12); } else if (attr_mask & IB_QP_PATH_MTU) { if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) { @@ -3819,40 +3812,45 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, err = -EINVAL; goto out; } - context->mtu_msgmax = (attr->path_mtu << 5) | - (u8)MLX5_CAP_GEN(dev->mdev, log_max_msg); + MLX5_SET(qpc, qpc, mtu, attr->path_mtu); + MLX5_SET(qpc, qpc, log_msg_max, + MLX5_CAP_GEN(dev->mdev, log_max_msg)); } if (attr_mask & IB_QP_DEST_QPN) - context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); + MLX5_SET(qpc, qpc, remote_qpn, attr->dest_qp_num); + + pri_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path); + alt_path = MLX5_ADDR_OF(qpc, qpc, secondary_address_path); if (attr_mask & IB_QP_PKEY_INDEX) - context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index); + MLX5_SET(ads, pri_path, pkey_index, attr->pkey_index); /* todo implement counter_index functionality */ if (is_sqp(ibqp->qp_type)) - context->pri_path.port = qp->port; + MLX5_SET(ads, pri_path, vhca_port_num, qp->port); if (attr_mask & IB_QP_PORT) - context->pri_path.port = attr->port_num; + MLX5_SET(ads, pri_path, vhca_port_num, attr->port_num); if (attr_mask & IB_QP_AV) { - err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path, - attr_mask & IB_QP_PORT ? attr->port_num : qp->port, + err = mlx5_set_path(dev, qp, &attr->ah_attr, pri_path, + attr_mask & IB_QP_PORT ? attr->port_num : + qp->port, attr_mask, 0, attr, false); if (err) goto out; } if (attr_mask & IB_QP_TIMEOUT) - context->pri_path.ackto_lt |= attr->timeout << 3; + MLX5_SET(ads, pri_path, ack_timeout, attr->timeout); if (attr_mask & IB_QP_ALT_PATH) { - err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, - &context->alt_path, + err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, alt_path, attr->alt_port_num, - attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT, + attr_mask | IB_QP_PKEY_INDEX | + IB_QP_TIMEOUT, 0, attr, true); if (err) goto out; @@ -3861,53 +3859,47 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, &send_cq, &recv_cq); - context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); - context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0; - context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0; - context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28); + MLX5_SET(qpc, qpc, pd, pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn); + if (send_cq) + MLX5_SET(qpc, qpc, cqn_snd, send_cq->mcq.cqn); + if (recv_cq) + MLX5_SET(qpc, qpc, cqn_rcv, recv_cq->mcq.cqn); + + MLX5_SET(qpc, qpc, log_ack_req_freq, MLX5_IB_ACK_REQ_FREQ); if (attr_mask & IB_QP_RNR_RETRY) - context->params1 |= cpu_to_be32(attr->rnr_retry << 13); + MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry); if (attr_mask & IB_QP_RETRY_CNT) - context->params1 |= cpu_to_be32(attr->retry_cnt << 16); + MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt); - if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) { - if (attr->max_rd_atomic) - context->params1 |= - cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21); - } + if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic) + MLX5_SET(qpc, qpc, log_sra_max, ilog2(attr->max_rd_atomic)); if (attr_mask & IB_QP_SQ_PSN) - context->next_send_psn = cpu_to_be32(attr->sq_psn); + MLX5_SET(qpc, qpc, next_send_psn, attr->sq_psn); - if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) { - if (attr->max_dest_rd_atomic) - context->params2 |= - cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); - } + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic) + MLX5_SET(qpc, qpc, log_rra_max, + ilog2(attr->max_dest_rd_atomic)); if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { - __be32 access_flags; - - err = to_mlx5_access_flags(qp, attr, attr_mask, &access_flags); + err = set_qpc_atomic_flags(qp, attr, attr_mask, qpc); if (err) goto out; - - context->params2 |= access_flags; } if (attr_mask & IB_QP_MIN_RNR_TIMER) - context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); + MLX5_SET(qpc, qpc, min_rnr_nak, attr->min_rnr_timer); if (attr_mask & IB_QP_RQ_PSN) - context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn); + MLX5_SET(qpc, qpc, next_rcv_psn, attr->rq_psn); if (attr_mask & IB_QP_QKEY) - context->qkey = cpu_to_be32(attr->qkey); + MLX5_SET(qpc, qpc, q_key, attr->qkey); if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) - context->db_rec_addr = cpu_to_be64(qp->db.dma); + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num : @@ -3921,15 +3913,14 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, set_id = ibqp->counter->id; else set_id = mlx5_ib_get_counters_id(dev, port_num); - context->qp_counter_set_usr_page |= - cpu_to_be32(set_id << 24); + MLX5_SET(qpc, qpc, counter_set_id, set_id); } if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) - context->sq_crq_size |= cpu_to_be16(1 << 4); + MLX5_SET(qpc, qpc, rlky, 1); if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) - context->deth_sqpn = cpu_to_be32(1); + MLX5_SET(qpc, qpc, deth_sqpn, 1); mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); @@ -3987,7 +3978,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, err = modify_raw_packet_qp(dev, qp, &raw_qp_param, tx_affinity); } else { - err = mlx5_core_qp_modify(dev, op, optpar, context, &base->mqp); + err = mlx5_core_qp_modify(dev, op, optpar, qpc, &base->mqp); } if (err) @@ -4034,7 +4025,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, } out: - kfree(context); + kfree(qpc); return err; } diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index def601199a1a..b8992b861ae6 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -495,72 +495,6 @@ struct mlx5_core_dct { struct completion drained; }; -struct mlx5_qp_path { - u8 fl_free_ar; - u8 rsvd3; - __be16 pkey_index; - u8 rsvd0; - u8 grh_mlid; - __be16 rlid; - u8 ackto_lt; - u8 mgid_index; - u8 static_rate; - u8 hop_limit; - __be32 tclass_flowlabel; - union { - u8 rgid[16]; - u8 rip[16]; - }; - u8 f_dscp_ecn_prio; - u8 ecn_dscp; - __be16 udp_sport; - u8 dci_cfi_prio_sl; - u8 port; - u8 rmac[6]; -}; - -/* FIXME: use mlx5_ifc.h qpc */ -struct mlx5_qp_context { - __be32 flags; - __be32 flags_pd; - u8 mtu_msgmax; - u8 rq_size_stride; - __be16 sq_crq_size; - __be32 qp_counter_set_usr_page; - __be32 wire_qpn; - __be32 log_pg_sz_remote_qpn; - struct mlx5_qp_path pri_path; - struct mlx5_qp_path alt_path; - __be32 params1; - u8 reserved2[4]; - __be32 next_send_psn; - __be32 cqn_send; - __be32 deth_sqpn; - u8 reserved3[4]; - __be32 last_acked_psn; - __be32 ssn; - __be32 params2; - __be32 rnr_nextrecvpsn; - __be32 xrcd; - __be32 cqn_recv; - __be64 db_rec_addr; - __be32 qkey; - __be32 rq_type_srqn; - __be32 rmsn; - __be16 hw_sq_wqe_counter; - __be16 sw_sq_wqe_counter; - __be16 hw_rcyclic_byte_counter; - __be16 hw_rq_counter; - __be16 sw_rcyclic_byte_counter; - __be16 sw_rq_counter; - u8 rsvd0[5]; - u8 cgs; - u8 cs_req; - u8 cs_res; - __be64 dc_access_key; - u8 rsvd1[24]; -}; - int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); -- cgit v1.2.3 From 5f62a521ff20e0b47a8d33421334bd245d6714ff Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 14:54:39 +0300 Subject: RDMA/mlx5: Set ECE options during modify QP The most common way to set ECE option will be during modify QP command in INIT2RTR, RTR2RTS and RTS2RTS stages, so update mlx5 to support it. The new bit in the comp_mask is needed to mark that kernel supports ECE and can receive data instead of "reserved" field in the struct mlx5_ib_modify_qp. Link: https://lore.kernel.org/r/20200526115440.205922-8-leon@kernel.org Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 3 +++ drivers/infiniband/hw/mlx5/qp.c | 21 ++++++++++++--------- drivers/infiniband/hw/mlx5/qp.h | 2 +- drivers/infiniband/hw/mlx5/qpc.c | 11 +++++++---- include/uapi/rdma/mlx5-abi.h | 3 ++- 5 files changed, 25 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 623d7898ae6d..6557c8339161 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1959,6 +1959,9 @@ uar_done: resp.response_length += sizeof(resp.dump_fill_mkey); } + if (MLX5_CAP_GEN(dev->mdev, ece_support)) + resp.comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE; + err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) goto out_mdev; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a24176a8ec83..bfa0f7e43e3b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2353,7 +2353,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET && !(qp->flags & IB_QP_CREATE_SOURCE_QPN)) { err = mlx5_core_qp_modify(dev, MLX5_CMD_OP_2RST_QP, 0, - NULL, &base->mqp); + NULL, &base->mqp, NULL); } else { struct mlx5_modify_raw_qp_param raw_qp_param = { .operation = MLX5_CMD_OP_2RST_QP @@ -3978,7 +3978,10 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, err = modify_raw_packet_qp(dev, qp, &raw_qp_param, tx_affinity); } else { - err = mlx5_core_qp_modify(dev, op, optpar, qpc, &base->mqp); + u32 ece = MLX5_CAP_GEN(dev->mdev, ece_support) ? + ucmd->ece_options : 0; + err = mlx5_core_qp_modify(dev, op, optpar, qpc, &base->mqp, + &ece); } if (err) @@ -4131,7 +4134,6 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1); MLX5_SET(dctc, dctc, counter_set_id, set_id); - } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { struct mlx5_ib_modify_qp_resp resp = {}; u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0}; @@ -4182,7 +4184,6 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, struct mlx5_ib_modify_qp ucmd = {}; enum ib_qp_type qp_type; enum ib_qp_state cur_state, new_state; - size_t required_cmd_sz; int err = -EINVAL; int port; @@ -4190,9 +4191,7 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, return -ENOSYS; if (udata && udata->inlen) { - required_cmd_sz = offsetof(typeof(ucmd), reserved) + - sizeof(ucmd.reserved); - if (udata->inlen < required_cmd_sz) + if (udata->inlen < offsetofend(typeof(ucmd), ece_options)) return -EINVAL; if (udata->inlen > sizeof(ucmd) && @@ -4205,10 +4204,10 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, return -EFAULT; if (ucmd.comp_mask || - memchr_inv(&ucmd.reserved, 0, sizeof(ucmd.reserved)) || memchr_inv(&ucmd.burst_info.reserved, 0, sizeof(ucmd.burst_info.reserved))) return -EOPNOTSUPP; + } if (unlikely(ibqp->qp_type == IB_QPT_GSI)) @@ -4217,8 +4216,12 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? IB_QPT_GSI : qp->type; - if (qp_type == MLX5_IB_QPT_DCT) + if (qp_type == MLX5_IB_QPT_DCT) { + if (memchr_inv(&ucmd.ece_options, 0, sizeof(ucmd.ece_options))) + return -EOPNOTSUPP; + return mlx5_ib_modify_dct(ibqp, attr, attr_mask, udata); + } mutex_lock(&qp->mutex); diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h index 795c21f88962..82ea2b94dfa6 100644 --- a/drivers/infiniband/hw/mlx5/qp.h +++ b/drivers/infiniband/hw/mlx5/qp.h @@ -16,7 +16,7 @@ int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *qp, int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, u32 *in, int inlen, u32 *out); int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, - void *qpc, struct mlx5_core_qp *qp); + void *qpc, struct mlx5_core_qp *qp, u32 *ece); int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp); int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct); int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp, diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index 69c80859a6ee..d61bc1a88925 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -343,7 +343,7 @@ static void mbox_free(struct mbox_info *mbox) static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, u32 opt_param_mask, void *qpc, - struct mbox_info *mbox, u16 uid) + struct mbox_info *mbox, u16 uid, u32 ece) { mbox->out = NULL; mbox->in = NULL; @@ -391,18 +391,21 @@ static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, return -ENOMEM; MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn, opt_param_mask, qpc, uid); + MLX5_SET(init2rtr_qp_in, mbox->in, ece, ece); break; case MLX5_CMD_OP_RTR2RTS_QP: if (MBOX_ALLOC(mbox, rtr2rts_qp)) return -ENOMEM; MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn, opt_param_mask, qpc, uid); + MLX5_SET(rtr2rts_qp_in, mbox->in, ece, ece); break; case MLX5_CMD_OP_RTS2RTS_QP: if (MBOX_ALLOC(mbox, rts2rts_qp)) return -ENOMEM; MOD_QP_IN_SET_QPC(rts2rts_qp, mbox->in, opcode, qpn, opt_param_mask, qpc, uid); + MLX5_SET(rts2rts_qp_in, mbox->in, ece, ece); break; case MLX5_CMD_OP_SQERR2RTS_QP: if (MBOX_ALLOC(mbox, sqerr2rts_qp)) @@ -423,13 +426,13 @@ static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, } int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, - void *qpc, struct mlx5_core_qp *qp) + void *qpc, struct mlx5_core_qp *qp, u32 *ece) { struct mbox_info mbox; int err; - err = modify_qp_mbox_alloc(dev->mdev, opcode, qp->qpn, - opt_param_mask, qpc, &mbox, qp->uid); + err = modify_qp_mbox_alloc(dev->mdev, opcode, qp->qpn, opt_param_mask, + qpc, &mbox, qp->uid, (ece) ? *ece : 0); if (err) return err; diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index bc9d9e3cb369..24e29a678177 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -100,6 +100,7 @@ struct mlx5_ib_alloc_ucontext_req_v2 { enum mlx5_ib_alloc_ucontext_resp_mask { MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY = 1UL << 1, + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE = 1UL << 2, }; enum mlx5_user_cmds_supp_uhw { @@ -422,7 +423,7 @@ struct mlx5_ib_burst_info { struct mlx5_ib_modify_qp { __u32 comp_mask; struct mlx5_ib_burst_info burst_info; - __u32 reserved; + __u32 ece_options; }; struct mlx5_ib_modify_qp_resp { -- cgit v1.2.3 From 50aec2c3135efd985291adc2e4d1278d52b03de9 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 26 May 2020 14:54:40 +0300 Subject: RDMA/mlx5: Return ECE data after modify QP After users sets the ECE option, FW will return the agreed/supported bits through an output structures of modify QP stages for regular QPs or through create QP for the DCT. Link: https://lore.kernel.org/r/20200526115440.205922-9-leon@kernel.org Reviewed-by: Mark Zhang Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 25 +++++++++++++++++++++---- drivers/infiniband/hw/mlx5/qpc.c | 25 +++++++++++++++++++++++++ include/uapi/rdma/mlx5-abi.h | 2 ++ 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index bfa0f7e43e3b..1988a0375696 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3708,6 +3708,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, enum ib_qp_state cur_state, enum ib_qp_state new_state, const struct mlx5_ib_modify_qp *ucmd, + struct mlx5_ib_modify_qp_resp *resp, struct ib_udata *udata) { static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { @@ -3978,10 +3979,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, err = modify_raw_packet_qp(dev, qp, &raw_qp_param, tx_affinity); } else { - u32 ece = MLX5_CAP_GEN(dev->mdev, ece_support) ? - ucmd->ece_options : 0; + if (udata) { + /* For the kernel flows, the resp will stay zero */ + resp->ece_options = + MLX5_CAP_GEN(dev->mdev, ece_support) ? + ucmd->ece_options : 0; + resp->response_length = sizeof(*resp); + } err = mlx5_core_qp_modify(dev, op, optpar, qpc, &base->mqp, - &ece); + &resp->ece_options); } if (err) @@ -4180,6 +4186,7 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_modify_qp_resp resp = {}; struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_ib_modify_qp ucmd = {}; enum ib_qp_type qp_type; @@ -4292,7 +4299,17 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, - new_state, &ucmd, udata); + new_state, &ucmd, &resp, udata); + + /* resp.response_length is set in ECE supported flows only */ + if (!err && resp.response_length && + udata->outlen >= resp.response_length) + /* + * We don't check return value of the function below + * on purpose, because it is unclear how to unwind the + * error flow after QP was modified to the new state. + */ + ib_copy_to_udata(udata, &resp, resp.response_length); out: mutex_unlock(&qp->mutex); diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c index d61bc1a88925..c19d91d6dce8 100644 --- a/drivers/infiniband/hw/mlx5/qpc.c +++ b/drivers/infiniband/hw/mlx5/qpc.c @@ -341,6 +341,27 @@ static void mbox_free(struct mbox_info *mbox) kfree(mbox->out); } +static int get_ece_from_mbox(void *out, u16 opcode) +{ + int ece = 0; + + switch (opcode) { + case MLX5_CMD_OP_INIT2RTR_QP: + ece = MLX5_GET(init2rtr_qp_out, out, ece); + break; + case MLX5_CMD_OP_RTR2RTS_QP: + ece = MLX5_GET(rtr2rts_qp_out, out, ece); + break; + case MLX5_CMD_OP_RTS2RTS_QP: + ece = MLX5_GET(rts2rts_qp_out, out, ece); + break; + default: + break; + } + + return ece; +} + static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn, u32 opt_param_mask, void *qpc, struct mbox_info *mbox, u16 uid, u32 ece) @@ -438,6 +459,10 @@ int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask, err = mlx5_cmd_exec(dev->mdev, mbox.in, mbox.inlen, mbox.out, mbox.outlen); + + if (ece) + *ece = get_ece_from_mbox(mbox.out, opcode); + mbox_free(&mbox); return err; } diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 24e29a678177..27905a0268c9 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -429,6 +429,8 @@ struct mlx5_ib_modify_qp { struct mlx5_ib_modify_qp_resp { __u32 response_length; __u32 dctn; + __u32 ece_options; + __u32 reserved; }; struct mlx5_ib_create_wq_resp { -- cgit v1.2.3 From 87fee61c35133e41f28facf856e0481cbf1bb2bd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 25 May 2020 10:22:09 -0700 Subject: RDMA/srp: Make the channel count configurable per target Increase the flexibility of the SRP initiator driver by making the channel count configurable per target instead of only providing a kernel module parameter for configuring the channel count. Link: https://lore.kernel.org/r/20200525172212.14413-2-bvanassche@acm.org Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 00b4f88b113e..22fa8722a03a 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3424,6 +3424,7 @@ enum { SRP_OPT_IP_DEST = 1 << 16, SRP_OPT_TARGET_CAN_QUEUE= 1 << 17, SRP_OPT_MAX_IT_IU_SIZE = 1 << 18, + SRP_OPT_CH_COUNT = 1 << 19, }; static unsigned int srp_opt_mandatory[] = { @@ -3457,6 +3458,7 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_IP_SRC, "src=%s" }, { SRP_OPT_IP_DEST, "dest=%s" }, { SRP_OPT_MAX_IT_IU_SIZE, "max_it_iu_size=%d" }, + { SRP_OPT_CH_COUNT, "ch_count=%u", }, { SRP_OPT_ERR, NULL } }; @@ -3758,6 +3760,14 @@ static int srp_parse_options(struct net *net, const char *buf, target->max_it_iu_size = token; break; + case SRP_OPT_CH_COUNT: + if (match_int(args, &token) || token < 1) { + pr_warn("bad channel count %s\n", p); + goto out; + } + target->ch_count = token; + break; + default: pr_warn("unknown parameter or missing value '%s' in target creation request\n", p); @@ -3921,11 +3931,13 @@ static ssize_t srp_create_target(struct device *dev, goto out; ret = -ENOMEM; - target->ch_count = max_t(unsigned, num_online_nodes(), - min(ch_count ? : - min(4 * num_online_nodes(), - ibdev->num_comp_vectors), - num_online_cpus())); + if (target->ch_count == 0) + target->ch_count = + max_t(unsigned int, num_online_nodes(), + min(ch_count ?: + min(4 * num_online_nodes(), + ibdev->num_comp_vectors), + num_online_cpus())); target->ch = kcalloc(target->ch_count, sizeof(*target->ch), GFP_KERNEL); if (!target->ch) -- cgit v1.2.3 From d4ee7f3a4445ec1b0b88af216f4032c4d30abf5a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 25 May 2020 10:22:10 -0700 Subject: RDMA/srpt: Make debug output more detailed Since the session name by itself is not sufficient to uniquely identify a queue pair, include the queue pair number. Show the ASCII channel state name instead of the numeric value. This change makes the ib_srpt debug output more consistent. Link: https://lore.kernel.org/r/20200525172212.14413-3-bvanassche@acm.org Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srpt/ib_srpt.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index cdc8c239d6c0..b96e91e1b775 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -214,8 +214,9 @@ static const char *get_ch_state_name(enum rdma_ch_state s) */ static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) { - pr_debug("QP event %d on ch=%p sess_name=%s state=%d\n", - event->event, ch, ch->sess_name, ch->state); + pr_debug("QP event %d on ch=%p sess_name=%s-%d state=%s\n", + event->event, ch, ch->sess_name, ch->qp->qp_num, + get_ch_state_name(ch->state)); switch (event->event) { case IB_EVENT_COMM_EST: @@ -1985,8 +1986,8 @@ static void __srpt_close_all_ch(struct srpt_port *sport) list_for_each_entry(nexus, &sport->nexus_list, entry) { list_for_each_entry(ch, &nexus->ch_list, list) { if (srpt_disconnect_ch(ch) >= 0) - pr_info("Closing channel %s because target %s_%d has been disabled\n", - ch->sess_name, + pr_info("Closing channel %s-%d because target %s_%d has been disabled\n", + ch->sess_name, ch->qp->qp_num, dev_name(&sport->sdev->device->dev), sport->port); srpt_close_ch(ch); -- cgit v1.2.3 From 66ced2eb2ab95f6ebe8033e2e0a2e61f2615377c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 25 May 2020 10:22:11 -0700 Subject: RDMA/srpt: Reduce max_recv_sge to 1 Since srpt_post_recv() always sets num_sge to 1, reduce the max_recv_sge parameter that is used at queue pair allocation time to 1. Link: https://lore.kernel.org/r/20200525172212.14413-4-bvanassche@acm.org Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srpt/ib_srpt.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index b96e91e1b775..84f80f35ec8a 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1818,16 +1818,12 @@ retry: qp_init->cap.max_rdma_ctxs = sq_size / 2; qp_init->cap.max_send_sge = min(attrs->max_send_sge, SRPT_MAX_SG_PER_WQE); - qp_init->cap.max_recv_sge = min(attrs->max_recv_sge, - SRPT_MAX_SG_PER_WQE); + qp_init->cap.max_recv_sge = 1; qp_init->port_num = ch->sport->port; - if (sdev->use_srq) { + if (sdev->use_srq) qp_init->srq = sdev->srq; - } else { + else qp_init->cap.max_recv_wr = ch->rq_size; - qp_init->cap.max_recv_sge = min(attrs->max_recv_sge, - SRPT_MAX_SG_PER_WQE); - } if (ch->using_rdma_cm) { ret = rdma_create_qp(ch->rdma_cm.cm_id, sdev->pd, qp_init); -- cgit v1.2.3 From e0cca8b456e2319804c9187f237268b14c50d323 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 25 May 2020 10:22:12 -0700 Subject: RDMA/srpt: Increase max_send_sge The ib_srpt driver limits max_send_sge to 16. Since that is a workaround for an mlx4 bug that has been fixed, increase max_send_sge. See also commit f95ccffc715b ("IB/mlx4: Use 4K pages for kernel QP's WQE buffer"). Link: https://lore.kernel.org/r/20200525172212.14413-5-bvanassche@acm.org Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srpt/ib_srpt.c | 3 +-- drivers/infiniband/ulp/srpt/ib_srpt.h | 5 ----- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 84f80f35ec8a..ef7fcd3e8e15 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1816,8 +1816,7 @@ retry: */ qp_init->cap.max_send_wr = min(sq_size / 2, attrs->max_qp_wr); qp_init->cap.max_rdma_ctxs = sq_size / 2; - qp_init->cap.max_send_sge = min(attrs->max_send_sge, - SRPT_MAX_SG_PER_WQE); + qp_init->cap.max_send_sge = attrs->max_send_sge; qp_init->cap.max_recv_sge = 1; qp_init->port_num = ch->sport->port; if (sdev->use_srq) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 2e1a69840857..f31c349d07a1 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -105,11 +105,6 @@ enum { SRP_CMD_ACA = 0x4, SRPT_DEF_SG_TABLESIZE = 128, - /* - * An experimentally determined value that avoids that QP creation - * fails due to "swiotlb buffer is full" on systems using the swiotlb. - */ - SRPT_MAX_SG_PER_WQE = 16, MIN_SRPT_SQ_SIZE = 16, DEF_SRPT_SQ_SIZE = 4096, -- cgit v1.2.3 From 802dcc7fc5ec0932bea0f33db046cc744aecf233 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Wed, 27 May 2020 08:50:14 +0300 Subject: RDMA/mlx5: Support TX port affinity for VF drivers in LAG mode The mlx5 VF driver doesn't set QP tx port affinity because it doesn't know if the lag is active or not, since the "lag_active" works only for PF interfaces. In this case for VF interfaces only one lag is used which brings performance issue. Add a lag_tx_port_affinity CAP bit; When it is enabled and "num_lag_ports > 1", then driver always set QP tx affinity, regardless of lag state. Link: https://lore.kernel.org/r/20200527055014.355093-1-leon@kernel.org Signed-off-by: Mark Zhang Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 7 +++++++ drivers/infiniband/hw/mlx5/qp.c | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 6557c8339161..49a1aff72715 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1972,7 +1972,7 @@ uar_done: context->lib_caps = req.lib_caps; print_lib_caps(dev, context->lib_caps); - if (dev->lag_active) { + if (mlx5_ib_lag_should_assign_affinity(dev)) { u8 port = mlx5_core_native_port_num(dev->mdev) - 1; atomic_set(&context->tx_port_affinity, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 482b54eb9764..2a702fa9e943 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1550,4 +1550,11 @@ static inline bool mlx5_ib_can_use_umr(struct mlx5_ib_dev *dev, int mlx5_ib_enable_driver(struct ib_device *dev); int mlx5_ib_test_wc(struct mlx5_ib_dev *dev); + +static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev) +{ + return dev->lag_active || + (MLX5_CAP_GEN(dev->mdev, num_lag_ports) > 1 && + MLX5_CAP_GEN(dev->mdev, lag_tx_port_affinity)); +} #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1988a0375696..9364a7a76ac2 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3653,7 +3653,8 @@ static unsigned int get_tx_affinity(struct ib_qp *qp, struct mlx5_ib_qp_base *qp_base; unsigned int tx_affinity; - if (!(dev->lag_active && qp_supports_affinity(qp))) + if (!(mlx5_ib_lag_should_assign_affinity(dev) && + qp_supports_affinity(qp))) return 0; if (mqp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) -- cgit v1.2.3 From 48062b0a8ba0e0d7da2a96fe53028b7474e2dd26 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 28 May 2020 16:04:27 +0100 Subject: RDMA/hns: remove duplicate assignment to pointer raq The pointer raq is being assigned twice. Fix this by removing one of the redundant assignments. Fixes: 14ba87304bf9 ("RDMA/hns: Remove redundant type cast for general pointers") Link: https://lore.kernel.org/r/20200528150427.420624-1-colin.king@canonical.com Addressses-Coverity: ("Evaluation order violation") Signed-off-by: Colin Ian King Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 8ff6b922b4d7..d02207cd30df 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -1146,7 +1146,7 @@ static void hns_roce_db_free(struct hns_roce_dev *hr_dev) static int hns_roce_raq_init(struct hns_roce_dev *hr_dev) { struct hns_roce_v1_priv *priv = hr_dev->priv; - struct hns_roce_raq_table *raq = raq = &priv->raq_table; + struct hns_roce_raq_table *raq = &priv->raq_table; struct device *dev = &hr_dev->pdev->dev; int raq_shift = 0; dma_addr_t addr; -- cgit v1.2.3 From ffd7339a2fac98b9ff731e336c4411bf1ce57e22 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 27 May 2020 14:18:45 -0300 Subject: RDMA/core: Use offsetofend() instead of open coding No reason to open code this. Link: https://lore.kernel.org/r/0-v1-0bc346e08476+585-drop_offsetofend_jgg@mellanox.com Signed-off-by: Jason Gunthorpe --- include/rdma/uverbs_ioctl.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 0418d7bddf3e..86de10ea30af 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -491,8 +491,7 @@ struct uapi_definition { */ #define UVERBS_ATTR_STRUCT(_type, _last) \ .zero_trailing = 1, \ - UVERBS_ATTR_SIZE(((uintptr_t)(&((_type *)0)->_last + 1)), \ - sizeof(_type)) + UVERBS_ATTR_SIZE(offsetofend(_type, _last), sizeof(_type)) /* * Specifies at least min_len bytes must be passed in, but the amount can be * larger, up to the protocol maximum size. No check for zeroing is done. -- cgit v1.2.3 From bcafcdfdaee7665267e56990e8040fbab588550f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 28 May 2020 12:07:09 +0100 Subject: IB/hfi1: Fix spelling mistake "enought" -> "enough" There is a spelling mistake in an error message. Fix it. Link: https://lore.kernel.org/r/20200528110709.400935-1-colin.king@canonical.com Signed-off-by: Colin Ian King Acked-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 7f35b9ea158b..15f9c635f292 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -14559,7 +14559,7 @@ static bool hfi1_netdev_update_rmt(struct hfi1_devdata *dd) } if (hfi1_is_rmt_full(rmt_start, NUM_NETDEV_MAP_ENTRIES)) { - dd_dev_err(dd, "Not enought RMT entries used = %d\n", + dd_dev_err(dd, "Not enough RMT entries used = %d\n", rmt_start); return false; } -- cgit v1.2.3 From 0b8e125e213204508e1b3c4bdfe69713280b7abd Mon Sep 17 00:00:00 2001 From: Qiushi Wu Date: Wed, 27 May 2020 22:02:30 -0500 Subject: RDMA/core: Fix several reference count leaks. kobject_init_and_add() takes reference even when it fails. If this function returns an error, kobject_put() must be called to properly clean up the memory associated with the object. Previous commit b8eb718348b8 ("net-sysfs: Fix reference count leak in rx|netdev_queue_add_kobject") fixed a similar problem. Link: https://lore.kernel.org/r/20200528030231.9082-1-wu000273@umn.edu Signed-off-by: Qiushi Wu Reviewed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/sysfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 087682e6969e..defe9cd4c5ee 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1058,8 +1058,7 @@ static int add_port(struct ib_core_device *coredev, int port_num) coredev->ports_kobj, "%d", port_num); if (ret) { - kfree(p); - return ret; + goto err_put; } p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL); @@ -1072,8 +1071,7 @@ static int add_port(struct ib_core_device *coredev, int port_num) ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type, &p->kobj, "gid_attrs"); if (ret) { - kfree(p->gid_attr_group); - goto err_put; + goto err_put_gid_attrs; } if (device->ops.process_mad && is_full_dev) { @@ -1404,8 +1402,10 @@ int ib_port_register_module_stat(struct ib_device *device, u8 port_num, ret = kobject_init_and_add(kobj, ktype, &port->kobj, "%s", name); - if (ret) + if (ret) { + kobject_put(kobj); return ret; + } } return 0; -- cgit v1.2.3 From 3446cbd2d523fdaf37f3772082071d1154c419d9 Mon Sep 17 00:00:00 2001 From: Yamin Friedman Date: Wed, 27 May 2020 11:34:52 +0300 Subject: RDMA/core: Add protection for shared CQs used by ULPs A pre-step for adding shared CQs. Add the infrastructure to prevent shared CQ users from altering the CQ configurations. For now all cqs are marked as private (non-shared). The core driver should use the new force functions to perform resize/destroy/moderation changes that are not allowed for users of shared CQs. Link: https://lore.kernel.org/r/1590568495-101621-2-git-send-email-yaminf@mellanox.com Signed-off-by: Yamin Friedman Reviewed-by: Or Gerlitz Reviewed-by: Max Gurtovoy Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 9 +++++++++ include/rdma/ib_verbs.h | 5 ++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index e2c9430a3ff1..21815e125e98 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2005,6 +2005,9 @@ EXPORT_SYMBOL(__ib_create_cq); int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period) { + if (cq->shared) + return -EOPNOTSUPP; + return cq->device->ops.modify_cq ? cq->device->ops.modify_cq(cq, cq_count, cq_period) : -EOPNOTSUPP; @@ -2013,6 +2016,9 @@ EXPORT_SYMBOL(rdma_set_cq_moderation); int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) { + if (WARN_ON_ONCE(cq->shared)) + return -EOPNOTSUPP; + if (atomic_read(&cq->usecnt)) return -EBUSY; @@ -2025,6 +2031,9 @@ EXPORT_SYMBOL(ib_destroy_cq_user); int ib_resize_cq(struct ib_cq *cq, int cqe) { + if (cq->shared) + return -EOPNOTSUPP; + return cq->device->ops.resize_cq ? cq->device->ops.resize_cq(cq, cqe, NULL) : -EOPNOTSUPP; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 94533ae16697..cc515025cbdb 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1613,7 +1613,8 @@ struct ib_cq { /* updated only by trace points */ ktime_t timestamp; - bool interrupt; + u8 interrupt:1; + u8 shared:1; /* * Implementation details of the RDMA core, don't use in drivers: @@ -3909,6 +3910,8 @@ static inline struct ib_cq *ib_alloc_cq_any(struct ib_device *dev, * ib_free_cq_user - Free kernel/user CQ * @cq: The CQ to free * @udata: Valid user data or NULL for kernel objects + * + * NOTE: This function shouldn't be called on shared CQs. */ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata); -- cgit v1.2.3 From c7ff819aefea04944dfcec5f0731b97277df6a9c Mon Sep 17 00:00:00 2001 From: Yamin Friedman Date: Wed, 27 May 2020 11:34:53 +0300 Subject: RDMA/core: Introduce shared CQ pool API Allow a ULP to ask the core to provide a completion queue based on a least-used search on a per-device CQ pools. The device CQ pools grow in a lazy fashion when more CQs are requested. This feature reduces the amount of interrupts when using many QPs. Using shared CQs allows for more effcient completion handling. It also reduces the amount of overhead needed for CQ contexts. Test setup: Intel(R) Xeon(R) Platinum 8176M CPU @ 2.10GHz servers. Running NVMeoF 4KB read IOs over ConnectX-5EX across Spectrum switch. TX-depth = 32. The patch was applied in the nvme driver on both the target and initiator. Four controllers are accessed from each core. In the current test case we have exposed sixteen NVMe namespaces using four different subsystems (four namespaces per subsystem) from one NVM port. Each controller allocated X queues (RDMA QPs) and attached to Y CQs. Before this series we had X == Y, i.e for four controllers we've created total of 4X QPs and 4X CQs. In the shared case, we've created 4X QPs and only X CQs which means that we have four controllers that share a completion queue per core. Until fourteen cores there is no significant change in performance and the number of interrupts per second is less than a million in the current case. ================================================== |Cores|Current KIOPs |Shared KIOPs |improvement| |-----|---------------|--------------|-----------| |14 |2332 |2723 |16.7% | |-----|---------------|--------------|-----------| |20 |2086 |2712 |30% | |-----|---------------|--------------|-----------| |28 |1971 |2669 |35.4% | |================================================= |Cores|Current avg lat|Shared avg lat|improvement| |-----|---------------|--------------|-----------| |14 |767us |657us |14.3% | |-----|---------------|--------------|-----------| |20 |1225us |943us |23% | |-----|---------------|--------------|-----------| |28 |1816us |1341us |26.1% | ======================================================== |Cores|Current interrupts|Shared interrupts|improvement| |-----|------------------|-----------------|-----------| |14 |1.6M/sec |0.4M/sec |72% | |-----|------------------|-----------------|-----------| |20 |2.8M/sec |0.6M/sec |72.4% | |-----|------------------|-----------------|-----------| |28 |2.9M/sec |0.8M/sec |63.4% | ==================================================================== |Cores|Current 99.99th PCTL lat|Shared 99.99th PCTL lat|improvement| |-----|------------------------|-----------------------|-----------| |14 |67ms |6ms |90.9% | |-----|------------------------|-----------------------|-----------| |20 |5ms |6ms |-10% | |-----|------------------------|-----------------------|-----------| |28 |8.7ms |6ms |25.9% | |=================================================================== Performance improvement with sixteen disks (sixteen CQs per core) is comparable. Link: https://lore.kernel.org/r/1590568495-101621-3-git-send-email-yaminf@mellanox.com Signed-off-by: Yamin Friedman Reviewed-by: Or Gerlitz Reviewed-by: Max Gurtovoy Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/core_priv.h | 3 + drivers/infiniband/core/cq.c | 173 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/core/device.c | 2 + include/rdma/ib_verbs.h | 17 +++- 4 files changed, 194 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index cf42acca4a3a..a1e6a67b2c4a 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -414,4 +414,7 @@ void rdma_umap_priv_init(struct rdma_umap_priv *priv, struct vm_area_struct *vma, struct rdma_user_mmap_entry *entry); +void ib_cq_pool_init(struct ib_device *dev); +void ib_cq_pool_destroy(struct ib_device *dev); + #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index 4f25b2400694..655795bfa0ee 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -7,7 +7,11 @@ #include #include +#include "core_priv.h" + #include +/* Max size for shared CQ, may require tuning */ +#define IB_MAX_SHARED_CQ_SZ 4096U /* # of WCs to poll for with a single call to ib_poll_cq */ #define IB_POLL_BATCH 16 @@ -218,6 +222,7 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, cq->cq_context = private; cq->poll_ctx = poll_ctx; atomic_set(&cq->usecnt, 0); + cq->comp_vector = comp_vector; cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); if (!cq->wc) @@ -309,6 +314,8 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) { if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) return; + if (WARN_ON_ONCE(cq->cqe_used)) + return; switch (cq->poll_ctx) { case IB_POLL_DIRECT: @@ -334,3 +341,169 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) kfree(cq); } EXPORT_SYMBOL(ib_free_cq_user); + +void ib_cq_pool_init(struct ib_device *dev) +{ + unsigned int i; + + spin_lock_init(&dev->cq_pools_lock); + for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) + INIT_LIST_HEAD(&dev->cq_pools[i]); +} + +void ib_cq_pool_destroy(struct ib_device *dev) +{ + struct ib_cq *cq, *n; + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(dev->cq_pools); i++) { + list_for_each_entry_safe(cq, n, &dev->cq_pools[i], + pool_entry) { + WARN_ON(cq->cqe_used); + cq->shared = false; + ib_free_cq(cq); + } + } +} + +static int ib_alloc_cqs(struct ib_device *dev, unsigned int nr_cqes, + enum ib_poll_context poll_ctx) +{ + LIST_HEAD(tmp_list); + unsigned int nr_cqs, i; + struct ib_cq *cq; + int ret; + + if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { + WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); + return -EINVAL; + } + + /* + * Allocate at least as many CQEs as requested, and otherwise + * a reasonable batch size so that we can share CQs between + * multiple users instead of allocating a larger number of CQs. + */ + nr_cqes = min_t(unsigned int, dev->attrs.max_cqe, + max(nr_cqes, IB_MAX_SHARED_CQ_SZ)); + nr_cqs = min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); + for (i = 0; i < nr_cqs; i++) { + cq = ib_alloc_cq(dev, NULL, nr_cqes, i, poll_ctx); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto out_free_cqs; + } + cq->shared = true; + list_add_tail(&cq->pool_entry, &tmp_list); + } + + spin_lock_irq(&dev->cq_pools_lock); + list_splice(&tmp_list, &dev->cq_pools[poll_ctx]); + spin_unlock_irq(&dev->cq_pools_lock); + + return 0; + +out_free_cqs: + list_for_each_entry(cq, &tmp_list, pool_entry) { + cq->shared = false; + ib_free_cq(cq); + } + return ret; +} + +/** + * ib_cq_pool_get() - Find the least used completion queue that matches + * a given cpu hint (or least used for wild card affinity) and fits + * nr_cqe. + * @dev: rdma device + * @nr_cqe: number of needed cqe entries + * @comp_vector_hint: completion vector hint (-1) for the driver to assign + * a comp vector based on internal counter + * @poll_ctx: cq polling context + * + * Finds a cq that satisfies @comp_vector_hint and @nr_cqe requirements and + * claim entries in it for us. In case there is no available cq, allocate + * a new cq with the requirements and add it to the device pool. + * IB_POLL_DIRECT cannot be used for shared cqs so it is not a valid value + * for @poll_ctx. + */ +struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, + int comp_vector_hint, + enum ib_poll_context poll_ctx) +{ + static unsigned int default_comp_vector; + unsigned int vector, num_comp_vectors; + struct ib_cq *cq, *found = NULL; + int ret; + + if (poll_ctx > IB_POLL_LAST_POOL_TYPE) { + WARN_ON_ONCE(poll_ctx > IB_POLL_LAST_POOL_TYPE); + return ERR_PTR(-EINVAL); + } + + num_comp_vectors = + min_t(unsigned int, dev->num_comp_vectors, num_online_cpus()); + /* Project the affinty to the device completion vector range */ + if (comp_vector_hint < 0) { + comp_vector_hint = + (READ_ONCE(default_comp_vector) + 1) % num_comp_vectors; + WRITE_ONCE(default_comp_vector, comp_vector_hint); + } + vector = comp_vector_hint % num_comp_vectors; + + /* + * Find the least used CQ with correct affinity and + * enough free CQ entries + */ + while (!found) { + spin_lock_irq(&dev->cq_pools_lock); + list_for_each_entry(cq, &dev->cq_pools[poll_ctx], + pool_entry) { + /* + * Check to see if we have found a CQ with the + * correct completion vector + */ + if (vector != cq->comp_vector) + continue; + if (cq->cqe_used + nr_cqe > cq->cqe) + continue; + found = cq; + break; + } + + if (found) { + found->cqe_used += nr_cqe; + spin_unlock_irq(&dev->cq_pools_lock); + + return found; + } + spin_unlock_irq(&dev->cq_pools_lock); + + /* + * Didn't find a match or ran out of CQs in the device + * pool, allocate a new array of CQs. + */ + ret = ib_alloc_cqs(dev, nr_cqe, poll_ctx); + if (ret) + return ERR_PTR(ret); + } + + return found; +} +EXPORT_SYMBOL(ib_cq_pool_get); + +/** + * ib_cq_pool_put - Return a CQ taken from a shared pool. + * @cq: The CQ to return. + * @nr_cqe: The max number of cqes that the user had requested. + */ +void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe) +{ + if (WARN_ON_ONCE(nr_cqe > cq->cqe_used)) + return; + + spin_lock_irq(&cq->device->cq_pools_lock); + cq->cqe_used -= nr_cqe; + spin_unlock_irq(&cq->device->cq_pools_lock); +} +EXPORT_SYMBOL(ib_cq_pool_put); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index d9f565a779df..53f541f41ff3 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1393,6 +1393,7 @@ int ib_register_device(struct ib_device *device, const char *name) goto dev_cleanup; } + ib_cq_pool_init(device); ret = enable_device_and_get(device); dev_set_uevent_suppress(&device->dev, false); /* Mark for userspace that device is ready */ @@ -1447,6 +1448,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev) goto out; disable_device(ib_dev); + ib_cq_pool_destroy(ib_dev); /* Expedite removing unregistered pointers from the hash table */ free_netdevs(ib_dev); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index cc515025cbdb..19864da78649 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1588,10 +1588,12 @@ struct ib_ah { typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); enum ib_poll_context { - IB_POLL_DIRECT, /* caller context, no hw completions */ IB_POLL_SOFTIRQ, /* poll from softirq context */ IB_POLL_WORKQUEUE, /* poll from workqueue */ IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */ + IB_POLL_LAST_POOL_TYPE = IB_POLL_UNBOUND_WORKQUEUE, + + IB_POLL_DIRECT, /* caller context, no hw completions */ }; struct ib_cq { @@ -1601,9 +1603,11 @@ struct ib_cq { void (*event_handler)(struct ib_event *, void *); void *cq_context; int cqe; + unsigned int cqe_used; atomic_t usecnt; /* count number of work queues */ enum ib_poll_context poll_ctx; struct ib_wc *wc; + struct list_head pool_entry; union { struct irq_poll iop; struct work_struct work; @@ -1615,6 +1619,7 @@ struct ib_cq { ktime_t timestamp; u8 interrupt:1; u8 shared:1; + unsigned int comp_vector; /* * Implementation details of the RDMA core, don't use in drivers: @@ -2734,6 +2739,10 @@ struct ib_device { #endif u32 index; + + spinlock_t cq_pools_lock; + struct list_head cq_pools[IB_POLL_LAST_POOL_TYPE + 1]; + struct rdma_restrack_root *res; const struct uapi_definition *driver_def; @@ -4037,6 +4046,12 @@ static inline int ib_req_notify_cq(struct ib_cq *cq, return cq->device->ops.req_notify_cq(cq, flags); } +struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe, + int comp_vector_hint, + enum ib_poll_context poll_ctx); + +void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe); + /** * ib_req_ncomp_notif - Request completion notification when there are * at least the specified number of unreaped completions on the CQ. -- cgit v1.2.3 From 1fc431320a53f3e9b33b399667c8788fa00eb8b0 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Thu, 28 May 2020 16:45:43 -0300 Subject: RDMA/iser: Remove support for FMR memory registration FMR is not supported on most recent RDMA devices (that use fast memory registration mechanism). Also, FMR was recently removed from NFS/RDMA ULP. Link: https://lore.kernel.org/r/1-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Signed-off-by: Israel Rukshin Signed-off-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iscsi_iser.h | 79 +---------- drivers/infiniband/ulp/iser/iser_initiator.c | 19 ++- drivers/infiniband/ulp/iser/iser_memory.c | 188 ++------------------------- drivers/infiniband/ulp/iser/iser_verbs.c | 126 +++--------------- 4 files changed, 40 insertions(+), 372 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 029c00163442..1d77c7f42e38 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -65,7 +65,6 @@ #include #include -#include #include #define DRV_NAME "iser" @@ -312,33 +311,6 @@ struct iser_comp { int active_qps; }; -/** - * struct iser_reg_ops - Memory registration operations - * per-device registration schemes - * - * @alloc_reg_res: Allocate registration resources - * @free_reg_res: Free registration resources - * @reg_mem: Register memory buffers - * @unreg_mem: Un-register memory buffers - * @reg_desc_get: Get a registration descriptor for pool - * @reg_desc_put: Get a registration descriptor to pool - */ -struct iser_reg_ops { - int (*alloc_reg_res)(struct ib_conn *ib_conn, - unsigned cmds_max, - unsigned int size); - void (*free_reg_res)(struct ib_conn *ib_conn); - int (*reg_mem)(struct iscsi_iser_task *iser_task, - struct iser_data_buf *mem, - struct iser_reg_resources *rsc, - struct iser_mem_reg *reg); - void (*unreg_mem)(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir); - struct iser_fr_desc * (*reg_desc_get)(struct ib_conn *ib_conn); - void (*reg_desc_put)(struct ib_conn *ib_conn, - struct iser_fr_desc *desc); -}; - /** * struct iser_device - iSER device handle * @@ -351,8 +323,6 @@ struct iser_reg_ops { * @comps_used: Number of completion contexts used, Min between online * cpus and device max completion vectors * @comps: Dinamically allocated array of completion handlers - * @reg_ops: Registration ops - * @remote_inv_sup: Remote invalidate is supported on this device */ struct iser_device { struct ib_device *ib_device; @@ -362,26 +332,18 @@ struct iser_device { int refcount; int comps_used; struct iser_comp *comps; - const struct iser_reg_ops *reg_ops; - bool remote_inv_sup; }; /** * struct iser_reg_resources - Fast registration resources * * @mr: memory region - * @fmr_pool: pool of fmrs * @sig_mr: signature memory region - * @page_vec: fast reg page list used by fmr pool * @mr_valid: is mr valid indicator */ struct iser_reg_resources { - union { - struct ib_mr *mr; - struct ib_fmr_pool *fmr_pool; - }; + struct ib_mr *mr; struct ib_mr *sig_mr; - struct iser_page_vec *page_vec; u8 mr_valid:1; }; @@ -403,7 +365,7 @@ struct iser_fr_desc { * struct iser_fr_pool - connection fast registration pool * * @list: list of fastreg descriptors - * @lock: protects fmr/fastreg pool + * @lock: protects fastreg pool * @size: size of the pool */ struct iser_fr_pool { @@ -518,12 +480,6 @@ struct iscsi_iser_task { struct iser_data_buf prot[ISER_DIRS_NUM]; }; -struct iser_page_vec { - u64 *pages; - int npages; - struct ib_mr fake_mr; -}; - /** * struct iser_global - iSER global context * @@ -548,8 +504,6 @@ extern int iser_pi_guard; extern unsigned int iser_max_sectors; extern bool iser_always_reg; -int iser_assign_reg_ops(struct iser_device *device); - int iser_send_control(struct iscsi_conn *conn, struct iscsi_task *task); @@ -591,22 +545,17 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, struct iser_data_buf *mem, enum iser_data_dir cmd_dir); -int iser_reg_rdma_mem(struct iscsi_iser_task *task, - enum iser_data_dir dir, - bool all_imm); -void iser_unreg_rdma_mem(struct iscsi_iser_task *task, - enum iser_data_dir dir); +int iser_reg_mem_fastreg(struct iscsi_iser_task *task, + enum iser_data_dir dir, + bool all_imm); +void iser_unreg_mem_fastreg(struct iscsi_iser_task *task, + enum iser_data_dir dir); int iser_connect(struct iser_conn *iser_conn, struct sockaddr *src_addr, struct sockaddr *dst_addr, int non_blocking); -void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir); -void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir); - int iser_post_recvl(struct iser_conn *iser_conn); int iser_post_recvm(struct iser_conn *iser_conn, int count); int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, @@ -625,26 +574,12 @@ int iser_initialize_task_headers(struct iscsi_task *task, struct iser_tx_desc *tx_desc); int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, struct iscsi_session *session); -int iser_alloc_fmr_pool(struct ib_conn *ib_conn, - unsigned cmds_max, - unsigned int size); -void iser_free_fmr_pool(struct ib_conn *ib_conn); int iser_alloc_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max, unsigned int size); void iser_free_fastreg_pool(struct ib_conn *ib_conn); u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir, sector_t *sector); -struct iser_fr_desc * -iser_reg_desc_get_fr(struct ib_conn *ib_conn); -void -iser_reg_desc_put_fr(struct ib_conn *ib_conn, - struct iser_fr_desc *desc); -struct iser_fr_desc * -iser_reg_desc_get_fmr(struct ib_conn *ib_conn); -void -iser_reg_desc_put_fmr(struct ib_conn *ib_conn, - struct iser_fr_desc *desc); static inline struct iser_conn * to_iser_conn(struct ib_conn *ib_conn) diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 4a7045bb0831..27a6f75a9912 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -72,7 +72,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) return err; } - err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN, false); + err = iser_reg_mem_fastreg(iser_task, ISER_DIR_IN, false); if (err) { iser_err("Failed to set up Data-IN RDMA\n"); return err; @@ -126,8 +126,8 @@ iser_prepare_write_cmd(struct iscsi_task *task, return err; } - err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT, - buf_out->data_len == imm_sz); + err = iser_reg_mem_fastreg(iser_task, ISER_DIR_OUT, + buf_out->data_len == imm_sz); if (err != 0) { iser_err("Failed to register write cmd RDMA mem\n"); return err; @@ -250,8 +250,8 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2; - if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max, - iser_conn->pages_per_mr)) + if (iser_alloc_fastreg_pool(ib_conn, session->scsi_cmds_max, + iser_conn->pages_per_mr)) goto create_rdma_reg_res_failed; if (iser_alloc_login_buf(iser_conn)) @@ -293,7 +293,7 @@ rx_desc_dma_map_failed: rx_desc_alloc_fail: iser_free_login_buf(iser_conn); alloc_login_buf_fail: - device->reg_ops->free_reg_res(ib_conn); + iser_free_fastreg_pool(ib_conn); create_rdma_reg_res_failed: iser_err("failed allocating rx descriptors / data buffers\n"); return -ENOMEM; @@ -306,8 +306,7 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn) struct ib_conn *ib_conn = &iser_conn->ib_conn; struct iser_device *device = ib_conn->device; - if (device->reg_ops->free_reg_res) - device->reg_ops->free_reg_res(ib_conn); + iser_free_fastreg_pool(ib_conn); rx_desc = iser_conn->rx_descs; for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) @@ -768,7 +767,7 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) int prot_count = scsi_prot_sg_count(iser_task->sc); if (iser_task->dir[ISER_DIR_IN]) { - iser_unreg_rdma_mem(iser_task, ISER_DIR_IN); + iser_unreg_mem_fastreg(iser_task, ISER_DIR_IN); iser_dma_unmap_task_data(iser_task, &iser_task->data[ISER_DIR_IN], DMA_FROM_DEVICE); @@ -779,7 +778,7 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task) } if (iser_task->dir[ISER_DIR_OUT]) { - iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT); + iser_unreg_mem_fastreg(iser_task, ISER_DIR_OUT); iser_dma_unmap_task_data(iser_task, &iser_task->data[ISER_DIR_OUT], DMA_TO_DEVICE); diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 999ef7cdd05e..d4e057fac219 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -38,62 +38,13 @@ #include #include "iscsi_iser.h" -static -int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, - struct iser_data_buf *mem, - struct iser_reg_resources *rsc, - struct iser_mem_reg *mem_reg); -static -int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, - struct iser_data_buf *mem, - struct iser_reg_resources *rsc, - struct iser_mem_reg *mem_reg); - -static const struct iser_reg_ops fastreg_ops = { - .alloc_reg_res = iser_alloc_fastreg_pool, - .free_reg_res = iser_free_fastreg_pool, - .reg_mem = iser_fast_reg_mr, - .unreg_mem = iser_unreg_mem_fastreg, - .reg_desc_get = iser_reg_desc_get_fr, - .reg_desc_put = iser_reg_desc_put_fr, -}; - -static const struct iser_reg_ops fmr_ops = { - .alloc_reg_res = iser_alloc_fmr_pool, - .free_reg_res = iser_free_fmr_pool, - .reg_mem = iser_fast_reg_fmr, - .unreg_mem = iser_unreg_mem_fmr, - .reg_desc_get = iser_reg_desc_get_fmr, - .reg_desc_put = iser_reg_desc_put_fmr, -}; void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc) { iser_err_comp(wc, "memreg"); } -int iser_assign_reg_ops(struct iser_device *device) -{ - struct ib_device *ib_dev = device->ib_device; - - /* Assign function handles - based on FMR support */ - if (ib_dev->ops.alloc_fmr && ib_dev->ops.dealloc_fmr && - ib_dev->ops.map_phys_fmr && ib_dev->ops.unmap_fmr) { - iser_info("FMR supported, using FMR for registration\n"); - device->reg_ops = &fmr_ops; - } else if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { - iser_info("FastReg supported, using FastReg for registration\n"); - device->reg_ops = &fastreg_ops; - device->remote_inv_sup = iser_always_reg; - } else { - iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n"); - return -1; - } - - return 0; -} - -struct iser_fr_desc * +static struct iser_fr_desc * iser_reg_desc_get_fr(struct ib_conn *ib_conn) { struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; @@ -109,7 +60,7 @@ iser_reg_desc_get_fr(struct ib_conn *ib_conn) return desc; } -void +static void iser_reg_desc_put_fr(struct ib_conn *ib_conn, struct iser_fr_desc *desc) { @@ -121,44 +72,6 @@ iser_reg_desc_put_fr(struct ib_conn *ib_conn, spin_unlock_irqrestore(&fr_pool->lock, flags); } -struct iser_fr_desc * -iser_reg_desc_get_fmr(struct ib_conn *ib_conn) -{ - struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; - - return list_first_entry(&fr_pool->list, - struct iser_fr_desc, list); -} - -void -iser_reg_desc_put_fmr(struct ib_conn *ib_conn, - struct iser_fr_desc *desc) -{ -} - -static void iser_data_buf_dump(struct iser_data_buf *data, - struct ib_device *ibdev) -{ - struct scatterlist *sg; - int i; - - for_each_sg(data->sg, sg, data->dma_nents, i) - iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p " - "off:0x%x sz:0x%x dma_len:0x%x\n", - i, (unsigned long)sg_dma_address(sg), - sg_page(sg), sg->offset, sg->length, sg_dma_len(sg)); -} - -static void iser_dump_page_vec(struct iser_page_vec *page_vec) -{ - int i; - - iser_err("page vec npages %d data length %lld\n", - page_vec->npages, page_vec->fake_mr.length); - for (i = 0; i < page_vec->npages; i++) - iser_err("vec[%d]: %llx\n", i, page_vec->pages[i]); -} - int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, struct iser_data_buf *data, enum iser_data_dir iser_dir, @@ -213,84 +126,9 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, return 0; } -static int iser_set_page(struct ib_mr *mr, u64 addr) -{ - struct iser_page_vec *page_vec = - container_of(mr, struct iser_page_vec, fake_mr); - - page_vec->pages[page_vec->npages++] = addr; - - return 0; -} - -static -int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, - struct iser_data_buf *mem, - struct iser_reg_resources *rsc, - struct iser_mem_reg *reg) -{ - struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn; - struct iser_device *device = ib_conn->device; - struct iser_page_vec *page_vec = rsc->page_vec; - struct ib_fmr_pool *fmr_pool = rsc->fmr_pool; - struct ib_pool_fmr *fmr; - int ret, plen; - - page_vec->npages = 0; - page_vec->fake_mr.page_size = SZ_4K; - plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg, - mem->dma_nents, NULL, iser_set_page); - if (unlikely(plen < mem->dma_nents)) { - iser_err("page vec too short to hold this SG\n"); - iser_data_buf_dump(mem, device->ib_device); - iser_dump_page_vec(page_vec); - return -EINVAL; - } - - fmr = ib_fmr_pool_map_phys(fmr_pool, page_vec->pages, - page_vec->npages, page_vec->pages[0]); - if (IS_ERR(fmr)) { - ret = PTR_ERR(fmr); - iser_err("ib_fmr_pool_map_phys failed: %d\n", ret); - return ret; - } - - reg->sge.lkey = fmr->fmr->lkey; - reg->rkey = fmr->fmr->rkey; - reg->sge.addr = page_vec->fake_mr.iova; - reg->sge.length = page_vec->fake_mr.length; - reg->mem_h = fmr; - - iser_dbg("fmr reg: lkey=0x%x, rkey=0x%x, addr=0x%llx," - " length=0x%x\n", reg->sge.lkey, reg->rkey, - reg->sge.addr, reg->sge.length); - - return 0; -} - -/** - * Unregister (previosuly registered using FMR) memory. - * If memory is non-FMR does nothing. - */ -void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task, - enum iser_data_dir cmd_dir) -{ - struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; - - if (!reg->mem_h) - return; - - iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n", reg->mem_h); - - ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h); - - reg->mem_h = NULL; -} - void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir) { - struct iser_device *device = iser_task->iser_conn->ib_conn.device; struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir]; struct iser_fr_desc *desc; struct ib_mr_status mr_status; @@ -312,7 +150,7 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, ib_check_mr_status(desc->rsc.sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status); } - device->reg_ops->reg_desc_put(&iser_task->iser_conn->ib_conn, desc); + iser_reg_desc_put_fr(&iser_task->iser_conn->ib_conn, reg->mem_h); reg->mem_h = NULL; } @@ -509,15 +347,14 @@ iser_reg_data_sg(struct iscsi_iser_task *task, if (use_dma_key) return iser_reg_dma(device, mem, reg); - return device->reg_ops->reg_mem(task, mem, &desc->rsc, reg); + return iser_fast_reg_mr(task, mem, &desc->rsc, reg); } -int iser_reg_rdma_mem(struct iscsi_iser_task *task, - enum iser_data_dir dir, - bool all_imm) +int iser_reg_mem_fastreg(struct iscsi_iser_task *task, + enum iser_data_dir dir, + bool all_imm) { struct ib_conn *ib_conn = &task->iser_conn->ib_conn; - struct iser_device *device = ib_conn->device; struct iser_data_buf *mem = &task->data[dir]; struct iser_mem_reg *reg = &task->rdma_reg[dir]; struct iser_fr_desc *desc = NULL; @@ -528,7 +365,7 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task, scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL; if (!use_dma_key) { - desc = device->reg_ops->reg_desc_get(ib_conn); + desc = iser_reg_desc_get_fr(ib_conn); reg->mem_h = desc; } @@ -549,15 +386,8 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task, err_reg: if (desc) - device->reg_ops->reg_desc_put(ib_conn, desc); + iser_reg_desc_put_fr(ib_conn, desc); return err; } -void iser_unreg_rdma_mem(struct iscsi_iser_task *task, - enum iser_data_dir dir) -{ - struct iser_device *device = task->iser_conn->ib_conn.device; - - device->reg_ops->unreg_mem(task, dir); -} diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 127887c6c03f..c1f44c41f501 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -68,11 +68,12 @@ static void iser_event_handler(struct ib_event_handler *handler, static int iser_create_device_ib_res(struct iser_device *device) { struct ib_device *ib_dev = device->ib_device; - int ret, i, max_cqe; + int i, max_cqe; - ret = iser_assign_reg_ops(device); - if (ret) - return ret; + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { + iser_err("IB device does not support memory registrations\n"); + return -1; + } device->comps_used = min_t(int, num_online_cpus(), ib_dev->num_comp_vectors); @@ -147,96 +148,6 @@ static void iser_free_device_ib_res(struct iser_device *device) device->pd = NULL; } -/** - * iser_alloc_fmr_pool - Creates FMR pool and page_vector - * @ib_conn: connection RDMA resources - * @cmds_max: max number of SCSI commands for this connection - * @size: max number of pages per map request - * - * Return: 0 on success, or errno code on failure - */ -int iser_alloc_fmr_pool(struct ib_conn *ib_conn, - unsigned cmds_max, - unsigned int size) -{ - struct iser_device *device = ib_conn->device; - struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; - struct iser_page_vec *page_vec; - struct iser_fr_desc *desc; - struct ib_fmr_pool *fmr_pool; - struct ib_fmr_pool_param params; - int ret; - - INIT_LIST_HEAD(&fr_pool->list); - spin_lock_init(&fr_pool->lock); - - desc = kzalloc(sizeof(*desc), GFP_KERNEL); - if (!desc) - return -ENOMEM; - - page_vec = kmalloc(sizeof(*page_vec) + (sizeof(u64) * size), - GFP_KERNEL); - if (!page_vec) { - ret = -ENOMEM; - goto err_frpl; - } - - page_vec->pages = (u64 *)(page_vec + 1); - - params.page_shift = ilog2(SZ_4K); - params.max_pages_per_fmr = size; - /* make the pool size twice the max number of SCSI commands * - * the ML is expected to queue, watermark for unmap at 50% */ - params.pool_size = cmds_max * 2; - params.dirty_watermark = cmds_max; - params.cache = 0; - params.flush_function = NULL; - params.access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); - - fmr_pool = ib_create_fmr_pool(device->pd, ¶ms); - if (IS_ERR(fmr_pool)) { - ret = PTR_ERR(fmr_pool); - iser_err("FMR allocation failed, err %d\n", ret); - goto err_fmr; - } - - desc->rsc.page_vec = page_vec; - desc->rsc.fmr_pool = fmr_pool; - list_add(&desc->list, &fr_pool->list); - - return 0; - -err_fmr: - kfree(page_vec); -err_frpl: - kfree(desc); - - return ret; -} - -/** - * iser_free_fmr_pool - releases the FMR pool and page vec - * @ib_conn: connection RDMA resources - */ -void iser_free_fmr_pool(struct ib_conn *ib_conn) -{ - struct iser_fr_pool *fr_pool = &ib_conn->fr_pool; - struct iser_fr_desc *desc; - - desc = list_first_entry(&fr_pool->list, - struct iser_fr_desc, list); - list_del(&desc->list); - - iser_info("freeing conn %p fmr pool %p\n", - ib_conn, desc->rsc.fmr_pool); - - ib_destroy_fmr_pool(desc->rsc.fmr_pool); - kfree(desc->rsc.page_vec); - kfree(desc); -} - static struct iser_fr_desc * iser_create_fastreg_desc(struct iser_device *device, struct ib_pd *pd, @@ -667,13 +578,12 @@ iser_calc_scsi_params(struct iser_conn *iser_conn, u32 max_num_sg; /* - * FRs without SG_GAPS or FMRs can only map up to a (device) page per - * entry, but if the first entry is misaligned we'll end up using two - * entries (head and tail) for a single page worth data, so one - * additional entry is required. + * FRs without SG_GAPS can only map up to a (device) page per entry, + * but if the first entry is misaligned we'll end up using two entries + * (head and tail) for a single page worth data, so one additional + * entry is required. */ - if ((attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) && - (attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + if (attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG) reserved_mr_pages = 0; else reserved_mr_pages = 1; @@ -684,14 +594,8 @@ iser_calc_scsi_params(struct iser_conn *iser_conn, max_num_sg = attr->max_fast_reg_page_list_len; sg_tablesize = DIV_ROUND_UP(max_sectors * SECTOR_SIZE, SZ_4K); - if (attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) - sup_sg_tablesize = - min_t( - uint, ISCSI_ISER_MAX_SG_TABLESIZE, - max_num_sg - reserved_mr_pages); - else - sup_sg_tablesize = ISCSI_ISER_MAX_SG_TABLESIZE; - + sup_sg_tablesize = min_t(uint, ISCSI_ISER_MAX_SG_TABLESIZE, + max_num_sg - reserved_mr_pages); iser_conn->scsi_sg_tablesize = min(sg_tablesize, sup_sg_tablesize); iser_conn->pages_per_mr = iser_conn->scsi_sg_tablesize + reserved_mr_pages; @@ -755,7 +659,7 @@ static void iser_route_handler(struct rdma_cm_id *cma_id) struct iser_cm_hdr req_hdr; struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context; struct ib_conn *ib_conn = &iser_conn->ib_conn; - struct iser_device *device = ib_conn->device; + struct ib_device *ib_dev = ib_conn->device->ib_device; if (iser_conn->state != ISER_CONN_PENDING) /* bailout */ @@ -766,14 +670,14 @@ static void iser_route_handler(struct rdma_cm_id *cma_id) goto failure; memset(&conn_param, 0, sizeof conn_param); - conn_param.responder_resources = device->ib_device->attrs.max_qp_rd_atom; + conn_param.responder_resources = ib_dev->attrs.max_qp_rd_atom; conn_param.initiator_depth = 1; conn_param.retry_count = 7; conn_param.rnr_retry_count = 6; memset(&req_hdr, 0, sizeof(req_hdr)); req_hdr.flags = ISER_ZBVA_NOT_SUP; - if (!device->remote_inv_sup) + if (!iser_always_reg) req_hdr.flags |= ISER_SEND_W_INV_NOT_SUP; conn_param.private_data = (void *)&req_hdr; conn_param.private_data_len = sizeof(struct iser_cm_hdr); -- cgit v1.2.3 From f273ad4f8d90bb87d2259fe37caee82e9aa7906c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 28 May 2020 16:45:44 -0300 Subject: RDMA/srp: Remove support for FMR memory registration FMR is not supported on most recent RDMA devices (that use fast memory registration mechanism). Also, FMR was recently removed from NFS/RDMA ULP. Link: https://lore.kernel.org/r/2-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Signed-off-by: Max Gurtovoy Reviewed-by: Israel Rukshin Reviewed-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.c | 222 +++--------------------------------- drivers/infiniband/ulp/srp/ib_srp.h | 27 +---- 2 files changed, 22 insertions(+), 227 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 22fa8722a03a..d8fcd21ab472 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -71,7 +71,6 @@ static unsigned int srp_sg_tablesize; static unsigned int cmd_sg_entries; static unsigned int indirect_sg_entries; static bool allow_ext_sg; -static bool prefer_fr = true; static bool register_always = true; static bool never_register; static int topspin_workarounds = 1; @@ -95,10 +94,6 @@ module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); -module_param(prefer_fr, bool, 0444); -MODULE_PARM_DESC(prefer_fr, -"Whether to use fast registration if both FMR and fast registration are supported"); - module_param(register_always, bool, 0444); MODULE_PARM_DESC(register_always, "Use memory registration even for contiguous memory regions"); @@ -388,24 +383,6 @@ static int srp_new_cm_id(struct srp_rdma_ch *ch) srp_new_ib_cm_id(ch); } -static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) -{ - struct srp_device *dev = target->srp_host->srp_dev; - struct ib_fmr_pool_param fmr_param; - - memset(&fmr_param, 0, sizeof(fmr_param)); - fmr_param.pool_size = target->mr_pool_size; - fmr_param.dirty_watermark = fmr_param.pool_size / 4; - fmr_param.cache = 1; - fmr_param.max_pages_per_fmr = dev->max_pages_per_mr; - fmr_param.page_shift = ilog2(dev->mr_page_size); - fmr_param.access = (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ); - - return ib_create_fmr_pool(dev->pd, &fmr_param); -} - /** * srp_destroy_fr_pool() - free the resources owned by a pool * @pool: Fast registration pool to be destroyed. @@ -556,7 +533,6 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) struct ib_qp_init_attr *init_attr; struct ib_cq *recv_cq, *send_cq; struct ib_qp *qp; - struct ib_fmr_pool *fmr_pool = NULL; struct srp_fr_pool *fr_pool = NULL; const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2; int ret; @@ -619,14 +595,6 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) "FR pool allocation failed (%d)\n", ret); goto err_qp; } - } else if (dev->use_fmr) { - fmr_pool = srp_alloc_fmr_pool(target); - if (IS_ERR(fmr_pool)) { - ret = PTR_ERR(fmr_pool); - shost_printk(KERN_WARNING, target->scsi_host, PFX - "FMR pool allocation failed (%d)\n", ret); - goto err_qp; - } } if (ch->qp) @@ -644,10 +612,6 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) if (ch->fr_pool) srp_destroy_fr_pool(ch->fr_pool); ch->fr_pool = fr_pool; - } else if (dev->use_fmr) { - if (ch->fmr_pool) - ib_destroy_fmr_pool(ch->fmr_pool); - ch->fmr_pool = fmr_pool; } kfree(init_attr); @@ -702,9 +666,6 @@ static void srp_free_ch_ib(struct srp_target_port *target, if (dev->use_fast_reg) { if (ch->fr_pool) srp_destroy_fr_pool(ch->fr_pool); - } else if (dev->use_fmr) { - if (ch->fmr_pool) - ib_destroy_fmr_pool(ch->fmr_pool); } srp_destroy_qp(ch); @@ -1017,12 +978,8 @@ static void srp_free_req_data(struct srp_target_port *target, for (i = 0; i < target->req_ring_size; ++i) { req = &ch->req_ring[i]; - if (dev->use_fast_reg) { + if (dev->use_fast_reg) kfree(req->fr_list); - } else { - kfree(req->fmr_list); - kfree(req->map_page); - } if (req->indirect_dma_addr) { ib_dma_unmap_single(ibdev, req->indirect_dma_addr, target->indirect_size, @@ -1056,16 +1013,8 @@ static int srp_alloc_req_data(struct srp_rdma_ch *ch) GFP_KERNEL); if (!mr_list) goto out; - if (srp_dev->use_fast_reg) { + if (srp_dev->use_fast_reg) req->fr_list = mr_list; - } else { - req->fmr_list = mr_list; - req->map_page = kmalloc_array(srp_dev->max_pages_per_mr, - sizeof(void *), - GFP_KERNEL); - if (!req->map_page) - goto out; - } req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); if (!req->indirect_desc) goto out; @@ -1272,11 +1221,6 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, if (req->nmdesc) srp_fr_pool_put(ch->fr_pool, req->fr_list, req->nmdesc); - } else if (dev->use_fmr) { - struct ib_pool_fmr **pfmr; - - for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++) - ib_fmr_pool_unmap(*pfmr); } ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd), @@ -1472,50 +1416,6 @@ static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr, state->ndesc++; } -static int srp_map_finish_fmr(struct srp_map_state *state, - struct srp_rdma_ch *ch) -{ - struct srp_target_port *target = ch->target; - struct srp_device *dev = target->srp_host->srp_dev; - struct ib_pool_fmr *fmr; - u64 io_addr = 0; - - if (state->fmr.next >= state->fmr.end) { - shost_printk(KERN_ERR, ch->target->scsi_host, - PFX "Out of MRs (mr_per_cmd = %d)\n", - ch->target->mr_per_cmd); - return -ENOMEM; - } - - WARN_ON_ONCE(!dev->use_fmr); - - if (state->npages == 0) - return 0; - - if (state->npages == 1 && target->global_rkey) { - srp_map_desc(state, state->base_dma_addr, state->dma_len, - target->global_rkey); - goto reset_state; - } - - fmr = ib_fmr_pool_map_phys(ch->fmr_pool, state->pages, - state->npages, io_addr); - if (IS_ERR(fmr)) - return PTR_ERR(fmr); - - *state->fmr.next++ = fmr; - state->nmdesc++; - - srp_map_desc(state, state->base_dma_addr & ~dev->mr_page_mask, - state->dma_len, fmr->fmr->rkey); - -reset_state: - state->npages = 0; - state->dma_len = 0; - - return 0; -} - static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc) { srp_handle_qp_err(cq, wc, "FAST REG"); @@ -1606,74 +1506,6 @@ static int srp_map_finish_fr(struct srp_map_state *state, return n; } -static int srp_map_sg_entry(struct srp_map_state *state, - struct srp_rdma_ch *ch, - struct scatterlist *sg) -{ - struct srp_target_port *target = ch->target; - struct srp_device *dev = target->srp_host->srp_dev; - dma_addr_t dma_addr = sg_dma_address(sg); - unsigned int dma_len = sg_dma_len(sg); - unsigned int len = 0; - int ret; - - WARN_ON_ONCE(!dma_len); - - while (dma_len) { - unsigned offset = dma_addr & ~dev->mr_page_mask; - - if (state->npages == dev->max_pages_per_mr || - (state->npages > 0 && offset != 0)) { - ret = srp_map_finish_fmr(state, ch); - if (ret) - return ret; - } - - len = min_t(unsigned int, dma_len, dev->mr_page_size - offset); - - if (!state->npages) - state->base_dma_addr = dma_addr; - state->pages[state->npages++] = dma_addr & dev->mr_page_mask; - state->dma_len += len; - dma_addr += len; - dma_len -= len; - } - - /* - * If the end of the MR is not on a page boundary then we need to - * close it out and start a new one -- we can only merge at page - * boundaries. - */ - ret = 0; - if ((dma_addr & ~dev->mr_page_mask) != 0) - ret = srp_map_finish_fmr(state, ch); - return ret; -} - -static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch, - struct srp_request *req, struct scatterlist *scat, - int count) -{ - struct scatterlist *sg; - int i, ret; - - state->pages = req->map_page; - state->fmr.next = req->fmr_list; - state->fmr.end = req->fmr_list + ch->target->mr_per_cmd; - - for_each_sg(scat, sg, count, i) { - ret = srp_map_sg_entry(state, ch, sg); - if (ret) - return ret; - } - - ret = srp_map_finish_fmr(state, ch); - if (ret) - return ret; - - return 0; -} - static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch, struct srp_request *req, struct scatterlist *scat, int count) @@ -1733,7 +1565,6 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, struct srp_device *dev = target->srp_host->srp_dev; struct srp_map_state state; struct srp_direct_buf idb_desc; - u64 idb_pages[1]; struct scatterlist idb_sg[1]; int ret; @@ -1756,14 +1587,6 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, if (ret < 0) return ret; WARN_ON_ONCE(ret < 1); - } else if (dev->use_fmr) { - state.pages = idb_pages; - state.pages[0] = (req->indirect_dma_addr & - dev->mr_page_mask); - state.npages = 1; - ret = srp_map_finish_fmr(&state, ch); - if (ret < 0) - return ret; } else { return -EINVAL; } @@ -1787,9 +1610,6 @@ static void srp_check_mapping(struct srp_map_state *state, if (dev->use_fast_reg) for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++) mr_len += (*pfr)->mr->length; - else if (dev->use_fmr) - for (i = 0; i < state->nmdesc; i++) - mr_len += be32_to_cpu(req->indirect_desc[i].len); if (desc_len != scsi_bufflen(req->scmnd) || mr_len > scsi_bufflen(req->scmnd)) pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n", @@ -1904,8 +1724,6 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, state.desc = req->indirect_desc; if (dev->use_fast_reg) ret = srp_map_sg_fr(&state, ch, req, scat, count); - else if (dev->use_fmr) - ret = srp_map_sg_fmr(&state, ch, req, scat, count); else ret = srp_map_sg_dma(&state, ch, req, scat, count); req->nmdesc = state.nmdesc; @@ -3874,13 +3692,13 @@ static ssize_t srp_create_target(struct device *dev, goto out; } - if (!srp_dev->has_fmr && !srp_dev->has_fr && !target->allow_ext_sg && + if (!srp_dev->has_fr && !target->allow_ext_sg && target->cmd_sg_cnt < target->sg_tablesize) { pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n"); target->sg_tablesize = target->cmd_sg_cnt; } - if (srp_dev->use_fast_reg || srp_dev->use_fmr) { + if (srp_dev->use_fast_reg) { bool gaps_reg = (ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG); @@ -3888,12 +3706,12 @@ static ssize_t srp_create_target(struct device *dev, (ilog2(srp_dev->mr_page_size) - 9); if (!gaps_reg) { /* - * FR and FMR can only map one HCA page per entry. If - * the start address is not aligned on a HCA page - * boundary two entries will be used for the head and - * the tail although these two entries combined - * contain at most one HCA page of data. Hence the "+ - * 1" in the calculation below. + * FR can only map one HCA page per entry. If the start + * address is not aligned on a HCA page boundary two + * entries will be used for the head and the tail + * although these two entries combined contain at most + * one HCA page of data. Hence the "+ 1" in the + * calculation below. * * The indirect data buffer descriptor is contiguous * so the memory for that buffer will only be @@ -4174,23 +3992,15 @@ static int srp_add_one(struct ib_device *device) srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, max_pages_per_mr); - srp_dev->has_fmr = (device->ops.alloc_fmr && - device->ops.dealloc_fmr && - device->ops.map_phys_fmr && - device->ops.unmap_fmr); srp_dev->has_fr = (attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS); - if (!never_register && !srp_dev->has_fmr && !srp_dev->has_fr) { - dev_warn(&device->dev, "neither FMR nor FR is supported\n"); - } else if (!never_register && - attr->max_mr_size >= 2 * srp_dev->mr_page_size) { - srp_dev->use_fast_reg = (srp_dev->has_fr && - (!srp_dev->has_fmr || prefer_fr)); - srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr; - } + if (!never_register && !srp_dev->has_fr) + dev_warn(&device->dev, "FR is not supported\n"); + else if (!never_register && + attr->max_mr_size >= 2 * srp_dev->mr_page_size) + srp_dev->use_fast_reg = srp_dev->has_fr; - if (never_register || !register_always || - (!srp_dev->has_fmr && !srp_dev->has_fr)) + if (never_register || !register_always || !srp_dev->has_fr) flags |= IB_PD_UNSAFE_GLOBAL_RKEY; if (srp_dev->use_fast_reg) { diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 6fabcc2faf1f..6818cac0a3b7 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -44,7 +44,6 @@ #include #include #include -#include #include enum { @@ -95,8 +94,7 @@ enum srp_iu_type { /* * @mr_page_mask: HCA memory registration page mask. * @mr_page_size: HCA memory registration page size. - * @mr_max_size: Maximum size in bytes of a single FMR / FR registration - * request. + * @mr_max_size: Maximum size in bytes of a single FR registration request. */ struct srp_device { struct list_head dev_list; @@ -107,9 +105,7 @@ struct srp_device { int mr_page_size; int mr_max_size; int max_pages_per_mr; - bool has_fmr; bool has_fr; - bool use_fmr; bool use_fast_reg; }; @@ -127,11 +123,7 @@ struct srp_host { struct srp_request { struct scsi_cmnd *scmnd; struct srp_iu *cmd; - union { - struct ib_pool_fmr **fmr_list; - struct srp_fr_desc **fr_list; - }; - u64 *map_page; + struct srp_fr_desc **fr_list; struct srp_direct_buf *indirect_desc; dma_addr_t indirect_dma_addr; short nmdesc; @@ -155,10 +147,7 @@ struct srp_rdma_ch { struct ib_cq *send_cq; struct ib_cq *recv_cq; struct ib_qp *qp; - union { - struct ib_fmr_pool *fmr_pool; - struct srp_fr_pool *fr_pool; - }; + struct srp_fr_pool *fr_pool; uint32_t max_it_iu_len; uint32_t max_ti_iu_len; u8 max_imm_sge; @@ -319,19 +308,15 @@ struct srp_fr_pool { * @pages: Array with DMA addresses of pages being considered for * memory registration. * @base_dma_addr: DMA address of the first page that has not yet been mapped. - * @dma_len: Number of bytes that will be registered with the next - * FMR or FR memory registration call. + * @dma_len: Number of bytes that will be registered with the next FR + * memory registration call. * @total_len: Total number of bytes in the sg-list being mapped. * @npages: Number of page addresses in the pages[] array. - * @nmdesc: Number of FMR or FR memory descriptors used for mapping. + * @nmdesc: Number of FR memory descriptors used for mapping. * @ndesc: Number of SRP buffer descriptors that have been filled in. */ struct srp_map_state { union { - struct { - struct ib_pool_fmr **next; - struct ib_pool_fmr **end; - } fmr; struct { struct srp_fr_desc **next; struct srp_fr_desc **end; -- cgit v1.2.3 From 07549ee21ce5247143ffb069bf838025d86b908c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 28 May 2020 16:45:45 -0300 Subject: RDMA/rds: Remove FMR support for memory registration Use FRWR method for memory registration by default and remove the ancient and unsafe FMR method. Link: https://lore.kernel.org/r/3-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- net/rds/Makefile | 2 +- net/rds/ib.c | 20 ++-- net/rds/ib.h | 2 - net/rds/ib_cm.c | 4 +- net/rds/ib_fmr.c | 269 ------------------------------------------------------ net/rds/ib_frmr.c | 4 +- net/rds/ib_mr.h | 14 +-- net/rds/ib_rdma.c | 28 ++---- 8 files changed, 21 insertions(+), 322 deletions(-) delete mode 100644 net/rds/ib_fmr.c diff --git a/net/rds/Makefile b/net/rds/Makefile index e647f9de104a..8fdc118e2927 100644 --- a/net/rds/Makefile +++ b/net/rds/Makefile @@ -7,7 +7,7 @@ rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \ obj-$(CONFIG_RDS_RDMA) += rds_rdma.o rds_rdma-y := rdma_transport.o \ ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \ - ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o + ib_sysctl.o ib_rdma.o ib_frmr.o obj-$(CONFIG_RDS_TCP) += rds_tcp.o diff --git a/net/rds/ib.c b/net/rds/ib.c index 90212ed3edf1..6c43b3e4c736 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -130,13 +130,16 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) static int rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; - bool has_fr, has_fmr; int ret; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) return -EOPNOTSUPP; + /* Device must support FRWR */ + if (!(device->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) + return -EOPNOTSUPP; + rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, ibdev_to_node(device)); if (!rds_ibdev) @@ -152,11 +155,6 @@ static int rds_ib_add_one(struct ib_device *device) rds_ibdev->max_wrs = device->attrs.max_qp_wr; rds_ibdev->max_sge = min(device->attrs.max_send_sge, RDS_IB_MAX_SGE); - has_fr = (device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS); - has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr && - device->ops.map_phys_fmr && device->ops.unmap_fmr); - rds_ibdev->use_fastreg = (has_fr && !has_fmr); rds_ibdev->odp_capable = !!(device->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING) && @@ -165,7 +163,6 @@ static int rds_ib_add_one(struct ib_device *device) !!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps & IB_ODP_SUPPORT_READ); - rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; rds_ibdev->max_1m_mrs = device->attrs.max_mr ? min_t(unsigned int, (device->attrs.max_mr / 2), rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size; @@ -219,14 +216,11 @@ static int rds_ib_add_one(struct ib_device *device) goto put_dev; } - rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", + rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, - rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs, - rds_ibdev->max_8k_mrs); + rds_ibdev->max_1m_mrs, rds_ibdev->max_8k_mrs); - pr_info("RDS/IB: %s: %s supported and preferred\n", - device->name, - rds_ibdev->use_fastreg ? "FRMR" : "FMR"); + pr_info("RDS/IB: %s: added\n", device->name); down_write(&rds_ib_devices_lock); list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); diff --git a/net/rds/ib.h b/net/rds/ib.h index 0296f1f7acda..5ae069d39eab 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -247,13 +247,11 @@ struct rds_ib_device { struct ib_device *dev; struct ib_pd *pd; struct dma_pool *rid_hdrs_pool; /* RDS headers DMA pool */ - u8 use_fastreg:1; u8 odp_capable:1; unsigned int max_mrs; struct rds_ib_mr_pool *mr_1m_pool; struct rds_ib_mr_pool *mr_8k_pool; - unsigned int fmr_max_remaps; unsigned int max_8k_mrs; unsigned int max_1m_mrs; int max_sge; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 0fec4171564e..c3319ff3ee11 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -527,10 +527,10 @@ static int rds_ib_setup_qp(struct rds_connection *conn) return -EOPNOTSUPP; /* The fr_queue_space is currently set to 512, to add extra space on - * completion queue and send queue. This extra space is used for FRMR + * completion queue and send queue. This extra space is used for FRWR * registration and invalidation work requests */ - fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0); + fr_queue_space = RDS_IB_DEFAULT_FR_WR; /* add the conn now so that connection establishment has the dev */ rds_ib_add_conn(rds_ibdev, conn); diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c deleted file mode 100644 index 93c0437e6a5f..000000000000 --- a/net/rds/ib_fmr.c +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Copyright (c) 2016 Oracle. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "ib_mr.h" - -struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages) -{ - struct rds_ib_mr_pool *pool; - struct rds_ib_mr *ibmr = NULL; - struct rds_ib_fmr *fmr; - int err = 0; - - if (npages <= RDS_MR_8K_MSG_SIZE) - pool = rds_ibdev->mr_8k_pool; - else - pool = rds_ibdev->mr_1m_pool; - - if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) - queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); - - /* Switch pools if one of the pool is reaching upper limit */ - if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) { - if (pool->pool_type == RDS_IB_MR_8K_POOL) - pool = rds_ibdev->mr_1m_pool; - else - pool = rds_ibdev->mr_8k_pool; - } - - ibmr = rds_ib_try_reuse_ibmr(pool); - if (ibmr) - return ibmr; - - ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, - rdsibdev_to_node(rds_ibdev)); - if (!ibmr) { - err = -ENOMEM; - goto out_no_cigar; - } - - fmr = &ibmr->u.fmr; - fmr->fmr = ib_alloc_fmr(rds_ibdev->pd, - (IB_ACCESS_LOCAL_WRITE | - IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_ATOMIC), - &pool->fmr_attr); - if (IS_ERR(fmr->fmr)) { - err = PTR_ERR(fmr->fmr); - fmr->fmr = NULL; - pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err); - goto out_no_cigar; - } - - ibmr->pool = pool; - if (pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc); - - return ibmr; - -out_no_cigar: - kfree(ibmr); - atomic_dec(&pool->item_count); - - return ERR_PTR(err); -} - -static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, - struct rds_ib_mr *ibmr, struct scatterlist *sg, - unsigned int nents) -{ - struct ib_device *dev = rds_ibdev->dev; - struct rds_ib_fmr *fmr = &ibmr->u.fmr; - struct scatterlist *scat = sg; - u64 io_addr = 0; - u64 *dma_pages; - u32 len; - int page_cnt, sg_dma_len; - int i, j; - int ret; - - sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL); - if (unlikely(!sg_dma_len)) { - pr_warn("RDS/IB: %s failed!\n", __func__); - return -EBUSY; - } - - len = 0; - page_cnt = 0; - - for (i = 0; i < sg_dma_len; ++i) { - unsigned int dma_len = sg_dma_len(&scat[i]); - u64 dma_addr = sg_dma_address(&scat[i]); - - if (dma_addr & ~PAGE_MASK) { - if (i > 0) { - ib_dma_unmap_sg(dev, sg, nents, - DMA_BIDIRECTIONAL); - return -EINVAL; - } else { - ++page_cnt; - } - } - if ((dma_addr + dma_len) & ~PAGE_MASK) { - if (i < sg_dma_len - 1) { - ib_dma_unmap_sg(dev, sg, nents, - DMA_BIDIRECTIONAL); - return -EINVAL; - } else { - ++page_cnt; - } - } - - len += dma_len; - } - - page_cnt += len >> PAGE_SHIFT; - if (page_cnt > ibmr->pool->fmr_attr.max_pages) { - ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); - return -EINVAL; - } - - dma_pages = kmalloc_array_node(sizeof(u64), page_cnt, GFP_ATOMIC, - rdsibdev_to_node(rds_ibdev)); - if (!dma_pages) { - ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); - return -ENOMEM; - } - - page_cnt = 0; - for (i = 0; i < sg_dma_len; ++i) { - unsigned int dma_len = sg_dma_len(&scat[i]); - u64 dma_addr = sg_dma_address(&scat[i]); - - for (j = 0; j < dma_len; j += PAGE_SIZE) - dma_pages[page_cnt++] = - (dma_addr & PAGE_MASK) + j; - } - - ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr); - if (ret) { - ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); - goto out; - } - - /* Success - we successfully remapped the MR, so we can - * safely tear down the old mapping. - */ - rds_ib_teardown_mr(ibmr); - - ibmr->sg = scat; - ibmr->sg_len = nents; - ibmr->sg_dma_len = sg_dma_len; - ibmr->remap_count++; - - if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_used); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_used); - ret = 0; - -out: - kfree(dma_pages); - - return ret; -} - -struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev, - struct scatterlist *sg, - unsigned long nents, - u32 *key) -{ - struct rds_ib_mr *ibmr = NULL; - struct rds_ib_fmr *fmr; - int ret; - - ibmr = rds_ib_alloc_fmr(rds_ibdev, nents); - if (IS_ERR(ibmr)) - return ibmr; - - ibmr->device = rds_ibdev; - fmr = &ibmr->u.fmr; - ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); - if (ret == 0) - *key = fmr->fmr->rkey; - else - rds_ib_free_mr(ibmr, 0); - - return ibmr; -} - -void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed, - unsigned long *unpinned, unsigned int goal) -{ - struct rds_ib_mr *ibmr, *next; - struct rds_ib_fmr *fmr; - LIST_HEAD(fmr_list); - int ret = 0; - unsigned int freed = *nfreed; - - /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ - list_for_each_entry(ibmr, list, unmap_list) { - fmr = &ibmr->u.fmr; - list_add(&fmr->fmr->list, &fmr_list); - } - - ret = ib_unmap_fmr(&fmr_list); - if (ret) - pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret); - - /* Now we can destroy the DMA mapping and unpin any pages */ - list_for_each_entry_safe(ibmr, next, list, unmap_list) { - fmr = &ibmr->u.fmr; - *unpinned += ibmr->sg_len; - __rds_ib_teardown_mr(ibmr); - if (freed < goal || - ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) { - if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL) - rds_ib_stats_inc(s_ib_rdma_mr_8k_free); - else - rds_ib_stats_inc(s_ib_rdma_mr_1m_free); - list_del(&ibmr->unmap_list); - ib_dealloc_fmr(fmr->fmr); - kfree(ibmr); - freed++; - } - } - *nfreed = freed; -} - -void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr) -{ - struct rds_ib_mr_pool *pool = ibmr->pool; - - if (ibmr->remap_count >= pool->fmr_attr.max_maps) - llist_add(&ibmr->llnode, &pool->drop_list); - else - llist_add(&ibmr->llnode, &pool->free_list); -} diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 06ecf9d2d4bf..9b6ffff72f2d 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -76,7 +76,7 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev, frmr = &ibmr->u.frmr; frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG, - pool->fmr_attr.max_pages); + pool->max_pages); if (IS_ERR(frmr->mr)) { pr_warn("RDS/IB: %s failed to allocate MR", __func__); err = PTR_ERR(frmr->mr); @@ -240,7 +240,7 @@ static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev, } frmr->dma_npages += len >> PAGE_SHIFT; - if (frmr->dma_npages > ibmr->pool->fmr_attr.max_pages) { + if (frmr->dma_npages > ibmr->pool->max_pages) { ret = -EMSGSIZE; goto out_unmap; } diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 0c8252d7fe2b..ea5e9aee4959 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -43,10 +43,6 @@ #define RDS_MR_8K_SCALE (256 / (RDS_MR_8K_MSG_SIZE + 1)) #define RDS_MR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2)) -struct rds_ib_fmr { - struct ib_fmr *fmr; -}; - enum rds_ib_fr_state { FRMR_IS_FREE, /* mr invalidated & ready for use */ FRMR_IS_INUSE, /* mr is in use or used & can be invalidated */ @@ -84,7 +80,6 @@ struct rds_ib_mr { u8 odp:1; union { - struct rds_ib_fmr fmr; struct rds_ib_frmr frmr; struct ib_mr *mr; } u; @@ -109,8 +104,7 @@ struct rds_ib_mr_pool { unsigned long max_items; unsigned long max_items_soft; unsigned long max_free_pinned; - struct ib_fmr_attr fmr_attr; - bool use_fastreg; + unsigned int max_pages; }; extern struct workqueue_struct *rds_ib_mr_wq; @@ -136,15 +130,9 @@ u32 rds_ib_get_lkey(void *trans_private); void __rds_ib_teardown_mr(struct rds_ib_mr *); void rds_ib_teardown_mr(struct rds_ib_mr *); -struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int); struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *); int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **); -struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *, - unsigned long, u32 *); struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *); -void rds_ib_unreg_fmr(struct list_head *, unsigned int *, - unsigned long *, unsigned int); -void rds_ib_free_fmr_list(struct rds_ib_mr *); struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev, struct rds_ib_connection *ic, struct scatterlist *sg, diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index b34b24e237f8..8f070ee7e742 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -181,7 +181,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo->rdma_mr_max = pool_1m->max_items; - iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; + iinfo->rdma_mr_size = pool_1m->max_pages; } #if IS_ENABLED(CONFIG_IPV6) @@ -191,7 +191,7 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo6->rdma_mr_max = pool_1m->max_items; - iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages; + iinfo6->rdma_mr_size = pool_1m->max_pages; } #endif @@ -406,10 +406,7 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, if (list_empty(&unmap_list)) goto out; - if (pool->use_fastreg) - rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); - else - rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal); + rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); if (!list_empty(&unmap_list)) { unsigned long flags; @@ -503,10 +500,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) } /* Return it to the pool's free list */ - if (rds_ibdev->use_fastreg) - rds_ib_free_frmr_list(ibmr); - else - rds_ib_free_fmr_list(ibmr); + rds_ib_free_frmr_list(ibmr); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); @@ -622,10 +616,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, goto out; } - if (rds_ibdev->use_fastreg) - ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); - else - ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret); + ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); if (IS_ERR(ibmr)) { ret = PTR_ERR(ibmr); pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); @@ -669,19 +660,16 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, if (pool_type == RDS_IB_MR_1M_POOL) { /* +1 allows for unaligned MRs */ - pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1; + pool->max_pages = RDS_MR_1M_MSG_SIZE + 1; pool->max_items = rds_ibdev->max_1m_mrs; } else { /* pool_type == RDS_IB_MR_8K_POOL */ - pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1; + pool->max_pages = RDS_MR_8K_MSG_SIZE + 1; pool->max_items = rds_ibdev->max_8k_mrs; } - pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; - pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; - pool->fmr_attr.page_shift = PAGE_SHIFT; + pool->max_free_pinned = pool->max_items * pool->max_pages / 4; pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4; - pool->use_fastreg = rds_ibdev->use_fastreg; return pool; } -- cgit v1.2.3 From 4e373d5417ecbb4f438a8500f0379a2fc29c2643 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 28 May 2020 16:45:46 -0300 Subject: RDMA/core: Remove FMR pool API This ancient and unsafe method for memory registration is no longer used by any RDMA based ULP. Remove the FMR pool API from the core driver. Link: https://lore.kernel.org/r/4-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- Documentation/driver-api/infiniband.rst | 3 - drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/fmr_pool.c | 494 -------------------------------- include/rdma/ib_fmr_pool.h | 93 ------ 4 files changed, 1 insertion(+), 591 deletions(-) delete mode 100644 drivers/infiniband/core/fmr_pool.c delete mode 100644 include/rdma/ib_fmr_pool.h diff --git a/Documentation/driver-api/infiniband.rst b/Documentation/driver-api/infiniband.rst index 1a3116f32ff0..30e142ccbee9 100644 --- a/Documentation/driver-api/infiniband.rst +++ b/Documentation/driver-api/infiniband.rst @@ -37,9 +37,6 @@ InfiniBand core interfaces .. kernel-doc:: drivers/infiniband/core/ud_header.c :export: -.. kernel-doc:: drivers/infiniband/core/fmr_pool.c - :export: - .. kernel-doc:: drivers/infiniband/core/umem.c :export: diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 63c1591223ac..24cb71a16a28 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y) ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ - device.o fmr_pool.o cache.o netlink.o \ + device.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ nldev.o restrack.o counters.o ib_core_uverbs.o \ diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c deleted file mode 100644 index e08aec427027..000000000000 --- a/drivers/infiniband/core/fmr_pool.c +++ /dev/null @@ -1,494 +0,0 @@ -/* - * Copyright (c) 2004 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include -#include -#include -#include - -#include - -#include "core_priv.h" - -#define PFX "fmr_pool: " - -enum { - IB_FMR_MAX_REMAPS = 32, - - IB_FMR_HASH_BITS = 8, - IB_FMR_HASH_SIZE = 1 << IB_FMR_HASH_BITS, - IB_FMR_HASH_MASK = IB_FMR_HASH_SIZE - 1 -}; - -/* - * If an FMR is not in use, then the list member will point to either - * its pool's free_list (if the FMR can be mapped again; that is, - * remap_count < pool->max_remaps) or its pool's dirty_list (if the - * FMR needs to be unmapped before being remapped). In either of - * these cases it is a bug if the ref_count is not 0. In other words, - * if ref_count is > 0, then the list member must not be linked into - * either free_list or dirty_list. - * - * The cache_node member is used to link the FMR into a cache bucket - * (if caching is enabled). This is independent of the reference - * count of the FMR. When a valid FMR is released, its ref_count is - * decremented, and if ref_count reaches 0, the FMR is placed in - * either free_list or dirty_list as appropriate. However, it is not - * removed from the cache and may be "revived" if a call to - * ib_fmr_register_physical() occurs before the FMR is remapped. In - * this case we just increment the ref_count and remove the FMR from - * free_list/dirty_list. - * - * Before we remap an FMR from free_list, we remove it from the cache - * (to prevent another user from obtaining a stale FMR). When an FMR - * is released, we add it to the tail of the free list, so that our - * cache eviction policy is "least recently used." - * - * All manipulation of ref_count, list and cache_node is protected by - * pool_lock to maintain consistency. - */ - -struct ib_fmr_pool { - spinlock_t pool_lock; - - int pool_size; - int max_pages; - int max_remaps; - int dirty_watermark; - int dirty_len; - struct list_head free_list; - struct list_head dirty_list; - struct hlist_head *cache_bucket; - - void (*flush_function)(struct ib_fmr_pool *pool, - void * arg); - void *flush_arg; - - struct kthread_worker *worker; - struct kthread_work work; - - atomic_t req_ser; - atomic_t flush_ser; - - wait_queue_head_t force_wait; -}; - -static inline u32 ib_fmr_hash(u64 first_page) -{ - return jhash_2words((u32) first_page, (u32) (first_page >> 32), 0) & - (IB_FMR_HASH_SIZE - 1); -} - -/* Caller must hold pool_lock */ -static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool, - u64 *page_list, - int page_list_len, - u64 io_virtual_address) -{ - struct hlist_head *bucket; - struct ib_pool_fmr *fmr; - - if (!pool->cache_bucket) - return NULL; - - bucket = pool->cache_bucket + ib_fmr_hash(*page_list); - - hlist_for_each_entry(fmr, bucket, cache_node) - if (io_virtual_address == fmr->io_virtual_address && - page_list_len == fmr->page_list_len && - !memcmp(page_list, fmr->page_list, - page_list_len * sizeof *page_list)) - return fmr; - - return NULL; -} - -static void ib_fmr_batch_release(struct ib_fmr_pool *pool) -{ - int ret; - struct ib_pool_fmr *fmr; - LIST_HEAD(unmap_list); - LIST_HEAD(fmr_list); - - spin_lock_irq(&pool->pool_lock); - - list_for_each_entry(fmr, &pool->dirty_list, list) { - hlist_del_init(&fmr->cache_node); - fmr->remap_count = 0; - list_add_tail(&fmr->fmr->list, &fmr_list); - } - - list_splice_init(&pool->dirty_list, &unmap_list); - pool->dirty_len = 0; - - spin_unlock_irq(&pool->pool_lock); - - if (list_empty(&unmap_list)) { - return; - } - - ret = ib_unmap_fmr(&fmr_list); - if (ret) - pr_warn(PFX "ib_unmap_fmr returned %d\n", ret); - - spin_lock_irq(&pool->pool_lock); - list_splice(&unmap_list, &pool->free_list); - spin_unlock_irq(&pool->pool_lock); -} - -static void ib_fmr_cleanup_func(struct kthread_work *work) -{ - struct ib_fmr_pool *pool = container_of(work, struct ib_fmr_pool, work); - - ib_fmr_batch_release(pool); - atomic_inc(&pool->flush_ser); - wake_up_interruptible(&pool->force_wait); - - if (pool->flush_function) - pool->flush_function(pool, pool->flush_arg); - - if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) - kthread_queue_work(pool->worker, &pool->work); -} - -/** - * ib_create_fmr_pool - Create an FMR pool - * @pd:Protection domain for FMRs - * @params:FMR pool parameters - * - * Create a pool of FMRs. Return value is pointer to new pool or - * error code if creation failed. - */ -struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, - struct ib_fmr_pool_param *params) -{ - struct ib_device *device; - struct ib_fmr_pool *pool; - int i; - int ret; - int max_remaps; - - if (!params) - return ERR_PTR(-EINVAL); - - device = pd->device; - if (!device->ops.alloc_fmr || !device->ops.dealloc_fmr || - !device->ops.map_phys_fmr || !device->ops.unmap_fmr) { - dev_info(&device->dev, "Device does not support FMRs\n"); - return ERR_PTR(-ENOSYS); - } - - if (!device->attrs.max_map_per_fmr) - max_remaps = IB_FMR_MAX_REMAPS; - else - max_remaps = device->attrs.max_map_per_fmr; - - pool = kmalloc(sizeof *pool, GFP_KERNEL); - if (!pool) - return ERR_PTR(-ENOMEM); - - pool->cache_bucket = NULL; - pool->flush_function = params->flush_function; - pool->flush_arg = params->flush_arg; - - INIT_LIST_HEAD(&pool->free_list); - INIT_LIST_HEAD(&pool->dirty_list); - - if (params->cache) { - pool->cache_bucket = - kmalloc_array(IB_FMR_HASH_SIZE, - sizeof(*pool->cache_bucket), - GFP_KERNEL); - if (!pool->cache_bucket) { - ret = -ENOMEM; - goto out_free_pool; - } - - for (i = 0; i < IB_FMR_HASH_SIZE; ++i) - INIT_HLIST_HEAD(pool->cache_bucket + i); - } - - pool->pool_size = 0; - pool->max_pages = params->max_pages_per_fmr; - pool->max_remaps = max_remaps; - pool->dirty_watermark = params->dirty_watermark; - pool->dirty_len = 0; - spin_lock_init(&pool->pool_lock); - atomic_set(&pool->req_ser, 0); - atomic_set(&pool->flush_ser, 0); - init_waitqueue_head(&pool->force_wait); - - pool->worker = - kthread_create_worker(0, "ib_fmr(%s)", dev_name(&device->dev)); - if (IS_ERR(pool->worker)) { - pr_warn(PFX "couldn't start cleanup kthread worker\n"); - ret = PTR_ERR(pool->worker); - goto out_free_pool; - } - kthread_init_work(&pool->work, ib_fmr_cleanup_func); - - { - struct ib_pool_fmr *fmr; - struct ib_fmr_attr fmr_attr = { - .max_pages = params->max_pages_per_fmr, - .max_maps = pool->max_remaps, - .page_shift = params->page_shift - }; - int bytes_per_fmr = sizeof *fmr; - - if (pool->cache_bucket) - bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64); - - for (i = 0; i < params->pool_size; ++i) { - fmr = kmalloc(bytes_per_fmr, GFP_KERNEL); - if (!fmr) - goto out_fail; - - fmr->pool = pool; - fmr->remap_count = 0; - fmr->ref_count = 0; - INIT_HLIST_NODE(&fmr->cache_node); - - fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr); - if (IS_ERR(fmr->fmr)) { - pr_warn(PFX "fmr_create failed for FMR %d\n", - i); - kfree(fmr); - goto out_fail; - } - - list_add_tail(&fmr->list, &pool->free_list); - ++pool->pool_size; - } - } - - return pool; - - out_free_pool: - kfree(pool->cache_bucket); - kfree(pool); - - return ERR_PTR(ret); - - out_fail: - ib_destroy_fmr_pool(pool); - - return ERR_PTR(-ENOMEM); -} -EXPORT_SYMBOL(ib_create_fmr_pool); - -/** - * ib_destroy_fmr_pool - Free FMR pool - * @pool:FMR pool to free - * - * Destroy an FMR pool and free all associated resources. - */ -void ib_destroy_fmr_pool(struct ib_fmr_pool *pool) -{ - struct ib_pool_fmr *fmr; - struct ib_pool_fmr *tmp; - LIST_HEAD(fmr_list); - int i; - - kthread_destroy_worker(pool->worker); - ib_fmr_batch_release(pool); - - i = 0; - list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) { - if (fmr->remap_count) { - INIT_LIST_HEAD(&fmr_list); - list_add_tail(&fmr->fmr->list, &fmr_list); - ib_unmap_fmr(&fmr_list); - } - ib_dealloc_fmr(fmr->fmr); - list_del(&fmr->list); - kfree(fmr); - ++i; - } - - if (i < pool->pool_size) - pr_warn(PFX "pool still has %d regions registered\n", - pool->pool_size - i); - - kfree(pool->cache_bucket); - kfree(pool); -} -EXPORT_SYMBOL(ib_destroy_fmr_pool); - -/** - * ib_flush_fmr_pool - Invalidate all unmapped FMRs - * @pool:FMR pool to flush - * - * Ensure that all unmapped FMRs are fully invalidated. - */ -int ib_flush_fmr_pool(struct ib_fmr_pool *pool) -{ - int serial; - struct ib_pool_fmr *fmr, *next; - - /* - * The free_list holds FMRs that may have been used - * but have not been remapped enough times to be dirty. - * Put them on the dirty list now so that the cleanup - * thread will reap them too. - */ - spin_lock_irq(&pool->pool_lock); - list_for_each_entry_safe(fmr, next, &pool->free_list, list) { - if (fmr->remap_count > 0) - list_move(&fmr->list, &pool->dirty_list); - } - spin_unlock_irq(&pool->pool_lock); - - serial = atomic_inc_return(&pool->req_ser); - kthread_queue_work(pool->worker, &pool->work); - - if (wait_event_interruptible(pool->force_wait, - atomic_read(&pool->flush_ser) - serial >= 0)) - return -EINTR; - - return 0; -} -EXPORT_SYMBOL(ib_flush_fmr_pool); - -/** - * ib_fmr_pool_map_phys - Map an FMR from an FMR pool. - * @pool_handle: FMR pool to allocate FMR from - * @page_list: List of pages to map - * @list_len: Number of pages in @page_list - * @io_virtual_address: I/O virtual address for new FMR - */ -struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, - u64 *page_list, - int list_len, - u64 io_virtual_address) -{ - struct ib_fmr_pool *pool = pool_handle; - struct ib_pool_fmr *fmr; - unsigned long flags; - int result; - - if (list_len < 1 || list_len > pool->max_pages) - return ERR_PTR(-EINVAL); - - spin_lock_irqsave(&pool->pool_lock, flags); - fmr = ib_fmr_cache_lookup(pool, - page_list, - list_len, - io_virtual_address); - if (fmr) { - /* found in cache */ - ++fmr->ref_count; - if (fmr->ref_count == 1) { - list_del(&fmr->list); - } - - spin_unlock_irqrestore(&pool->pool_lock, flags); - - return fmr; - } - - if (list_empty(&pool->free_list)) { - spin_unlock_irqrestore(&pool->pool_lock, flags); - return ERR_PTR(-EAGAIN); - } - - fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list); - list_del(&fmr->list); - hlist_del_init(&fmr->cache_node); - spin_unlock_irqrestore(&pool->pool_lock, flags); - - result = ib_map_phys_fmr(fmr->fmr, page_list, list_len, - io_virtual_address); - - if (result) { - spin_lock_irqsave(&pool->pool_lock, flags); - list_add(&fmr->list, &pool->free_list); - spin_unlock_irqrestore(&pool->pool_lock, flags); - - pr_warn(PFX "fmr_map returns %d\n", result); - - return ERR_PTR(result); - } - - ++fmr->remap_count; - fmr->ref_count = 1; - - if (pool->cache_bucket) { - fmr->io_virtual_address = io_virtual_address; - fmr->page_list_len = list_len; - memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list)); - - spin_lock_irqsave(&pool->pool_lock, flags); - hlist_add_head(&fmr->cache_node, - pool->cache_bucket + ib_fmr_hash(fmr->page_list[0])); - spin_unlock_irqrestore(&pool->pool_lock, flags); - } - - return fmr; -} -EXPORT_SYMBOL(ib_fmr_pool_map_phys); - -/** - * ib_fmr_pool_unmap - Unmap FMR - * @fmr:FMR to unmap - * - * Unmap an FMR. The FMR mapping may remain valid until the FMR is - * reused (or until ib_flush_fmr_pool() is called). - */ -void ib_fmr_pool_unmap(struct ib_pool_fmr *fmr) -{ - struct ib_fmr_pool *pool; - unsigned long flags; - - pool = fmr->pool; - - spin_lock_irqsave(&pool->pool_lock, flags); - - --fmr->ref_count; - if (!fmr->ref_count) { - if (fmr->remap_count < pool->max_remaps) { - list_add_tail(&fmr->list, &pool->free_list); - } else { - list_add_tail(&fmr->list, &pool->dirty_list); - if (++pool->dirty_len >= pool->dirty_watermark) { - atomic_inc(&pool->req_ser); - kthread_queue_work(pool->worker, &pool->work); - } - } - } - - spin_unlock_irqrestore(&pool->pool_lock, flags); -} -EXPORT_SYMBOL(ib_fmr_pool_unmap); diff --git a/include/rdma/ib_fmr_pool.h b/include/rdma/ib_fmr_pool.h deleted file mode 100644 index 2fd9bfb6d648..000000000000 --- a/include/rdma/ib_fmr_pool.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Copyright (c) 2004 Topspin Corporation. All rights reserved. - * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#if !defined(IB_FMR_POOL_H) -#define IB_FMR_POOL_H - -#include - -struct ib_fmr_pool; - -/** - * struct ib_fmr_pool_param - Parameters for creating FMR pool - * @max_pages_per_fmr:Maximum number of pages per map request. - * @page_shift: Log2 of sizeof "pages" mapped by this fmr - * @access:Access flags for FMRs in pool. - * @pool_size:Number of FMRs to allocate for pool. - * @dirty_watermark:Flush is triggered when @dirty_watermark dirty - * FMRs are present. - * @flush_function:Callback called when unmapped FMRs are flushed and - * more FMRs are possibly available for mapping - * @flush_arg:Context passed to user's flush function. - * @cache:If set, FMRs may be reused after unmapping for identical map - * requests. - */ -struct ib_fmr_pool_param { - int max_pages_per_fmr; - int page_shift; - enum ib_access_flags access; - int pool_size; - int dirty_watermark; - void (*flush_function)(struct ib_fmr_pool *pool, - void *arg); - void *flush_arg; - unsigned cache:1; -}; - -struct ib_pool_fmr { - struct ib_fmr *fmr; - struct ib_fmr_pool *pool; - struct list_head list; - struct hlist_node cache_node; - int ref_count; - int remap_count; - u64 io_virtual_address; - int page_list_len; - u64 page_list[]; -}; - -struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, - struct ib_fmr_pool_param *params); - -void ib_destroy_fmr_pool(struct ib_fmr_pool *pool); - -int ib_flush_fmr_pool(struct ib_fmr_pool *pool); - -struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, - u64 *page_list, - int list_len, - u64 io_virtual_address); - -void ib_fmr_pool_unmap(struct ib_pool_fmr *fmr); - -#endif /* IB_FMR_POOL_H */ -- cgit v1.2.3 From d29d58e772ecde84df15f547dc06f0b7afc7ae5c Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Thu, 28 May 2020 16:45:47 -0300 Subject: RDMA/mlx5: Remove FMR leftovers Remove a few leftovers from FMR functionality which are no longer used. Link: https://lore.kernel.org/r/5-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Signed-off-by: Gal Pressman Signed-off-by: Max Gurtovoy Acked-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 2a702fa9e943..5dbe3eb0d9cb 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -675,12 +675,6 @@ struct umr_common { struct semaphore sem; }; -enum { - MLX5_FMR_INVALID, - MLX5_FMR_VALID, - MLX5_FMR_BUSY, -}; - struct mlx5_cache_ent { struct list_head head; /* sync access to the cahce entry @@ -1253,8 +1247,6 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props); int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props); -int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev); -void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev); void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, unsigned long max_page_shift, int *count, int *shift, -- cgit v1.2.3 From 7c08bc195665201c207fb3fd91a75a8f77c3b3b0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 28 May 2020 16:45:48 -0300 Subject: RDMA/bnxt_re: Remove FMR leftovers The bnxt_re_fmr struct is never referenced and the max_fmr items in bnxt_qplib_dev_attr are never read. Link: https://lore.kernel.org/r/6-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Reviewed-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3 --- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 6 ------ drivers/infiniband/hw/bnxt_re/qplib_sp.c | 3 --- drivers/infiniband/hw/bnxt_re/qplib_sp.h | 2 -- 4 files changed, 14 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 5a7c090204c5..8b6ad5cddfce 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -177,9 +177,6 @@ int bnxt_re_query_device(struct ib_device *ibdev, ib_attr->max_total_mcast_qp_attach = 0; ib_attr->max_ah = dev_attr->max_ah; - ib_attr->max_fmr = 0; - ib_attr->max_map_per_fmr = 0; - ib_attr->max_srq = dev_attr->max_srq; ib_attr->max_srq_wr = dev_attr->max_srq_wqes; ib_attr->max_srq_sge = dev_attr->max_srq_sges; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 204c0849ba28..e5fbbeba6d28 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -122,12 +122,6 @@ struct bnxt_re_frpl { u64 *page_list; }; -struct bnxt_re_fmr { - struct bnxt_re_dev *rdev; - struct ib_fmr ib_fmr; - struct bnxt_qplib_mrw qplib_fmr; -}; - struct bnxt_re_mw { struct bnxt_re_dev *rdev; struct ib_mw ib_mw; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 66954ff6a2f2..4cd475ea97a2 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -132,9 +132,6 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->max_raw_ethy_qp = le32_to_cpu(sb->max_raw_eth_qp); attr->max_ah = le32_to_cpu(sb->max_ah); - attr->max_fmr = le32_to_cpu(sb->max_fmr); - attr->max_map_per_fmr = sb->max_map_per_fmr; - attr->max_srq = le16_to_cpu(sb->max_srq); attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1; attr->max_srq_sges = sb->max_srq_sge; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 13d9432d5ce2..6404f0da1051 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -64,8 +64,6 @@ struct bnxt_qplib_dev_attr { u32 max_mw; u32 max_raw_ethy_qp; u32 max_ah; - u32 max_fmr; - u32 max_map_per_fmr; u32 max_srq; u32 max_srq_wqes; u32 max_srq_sges; -- cgit v1.2.3 From f0c73c70db99e30f790572b97531aa569ec1ba60 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 28 May 2020 16:45:49 -0300 Subject: RDMA/i40iw: Remove FMR leftovers The ibfmr member is never referenced, remove it. Link: https://lore.kernel.org/r/7-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Reviewed-by: Max Gurtovoy Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw.h | 9 --------- drivers/infiniband/hw/i40iw/i40iw_verbs.h | 1 - 2 files changed, 10 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h index 3c62c9327a9c..49d92638e0db 100644 --- a/drivers/infiniband/hw/i40iw/i40iw.h +++ b/drivers/infiniband/hw/i40iw/i40iw.h @@ -381,15 +381,6 @@ static inline struct i40iw_mr *to_iwmr(struct ib_mr *ibmr) return container_of(ibmr, struct i40iw_mr, ibmr); } -/** - * to_iwmr_from_ibfmr - get device memory region - * @ibfmr: ib fmr - **/ -static inline struct i40iw_mr *to_iwmr_from_ibfmr(struct ib_fmr *ibfmr) -{ - return container_of(ibfmr, struct i40iw_mr, ibfmr); -} - /** * to_iwmw - get device memory window * @ibmw: ib memory window diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.h b/drivers/infiniband/hw/i40iw/i40iw_verbs.h index 3a413752ccc3..331bc21cbcc7 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.h +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.h @@ -89,7 +89,6 @@ struct i40iw_mr { union { struct ib_mr ibmr; struct ib_mw ibmw; - struct ib_fmr ibfmr; }; struct ib_umem *region; u16 type; -- cgit v1.2.3 From 1f55b7ab907d373581e9abf3fc4b24ed19cf831f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 28 May 2020 16:45:50 -0300 Subject: RDMA/mlx4: Remove FMR support for memory registration HCA's that are driven by mlx4 driver support FRWR method to register memory. Remove the ancient and unsafe FMR method. Link: https://lore.kernel.org/r/8-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Reviewed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 11 -- drivers/infiniband/hw/mlx4/mlx4_ib.h | 16 --- drivers/infiniband/hw/mlx4/mr.c | 93 --------------- drivers/net/ethernet/mellanox/mlx4/main.c | 2 - drivers/net/ethernet/mellanox/mlx4/mr.c | 183 ------------------------------ include/linux/mlx4/device.h | 22 +--- 6 files changed, 2 insertions(+), 325 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 275722cec8c6..816d28854a8e 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -558,7 +558,6 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; - props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; props->hca_core_clock = dev->dev->caps.hca_core_clock * 1000UL; props->timestamp_mask = 0xFFFFFFFFFFFFULL; props->max_ah = INT_MAX; @@ -2600,13 +2599,6 @@ static const struct ib_device_ops mlx4_ib_dev_wq_ops = { .modify_wq = mlx4_ib_modify_wq, }; -static const struct ib_device_ops mlx4_ib_dev_fmr_ops = { - .alloc_fmr = mlx4_ib_fmr_alloc, - .dealloc_fmr = mlx4_ib_fmr_dealloc, - .map_phys_fmr = mlx4_ib_map_phys_fmr, - .unmap_fmr = mlx4_ib_unmap_fmr, -}; - static const struct ib_device_ops mlx4_ib_dev_mw_ops = { .alloc_mw = mlx4_ib_alloc_mw, .dealloc_mw = mlx4_ib_dealloc_mw, @@ -2724,9 +2716,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_wq_ops); } - if (!mlx4_is_slave(ibdev->dev)) - ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_fmr_ops); - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { ibdev->ib_dev.uverbs_cmd_mask |= diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 182a237b87f7..6f4ea1067095 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -146,11 +146,6 @@ struct mlx4_ib_mw { struct mlx4_mw mmw; }; -struct mlx4_ib_fmr { - struct ib_fmr ibfmr; - struct mlx4_fmr mfmr; -}; - #define MAX_REGS_PER_FLOW 2 struct mlx4_flow_reg_id { @@ -679,11 +674,6 @@ static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw) return container_of(ibmw, struct mlx4_ib_mw, ibmw); } -static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr) -{ - return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr); -} - static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow) { return container_of(ibflow, struct mlx4_ib_flow, ibflow); @@ -794,12 +784,6 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, int mlx4_ib_mad_init(struct mlx4_ib_dev *dev); void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev); -struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags, - struct ib_fmr_attr *fmr_attr); -int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages, - u64 iova); -int mlx4_ib_unmap_fmr(struct list_head *fmr_list); -int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr); int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view); int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index b0121c90c561..e2fb71b23c80 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -698,99 +698,6 @@ err_free: return ERR_PTR(err); } -struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc, - struct ib_fmr_attr *fmr_attr) -{ - struct mlx4_ib_dev *dev = to_mdev(pd->device); - struct mlx4_ib_fmr *fmr; - int err = -ENOMEM; - - fmr = kmalloc(sizeof *fmr, GFP_KERNEL); - if (!fmr) - return ERR_PTR(-ENOMEM); - - err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc), - fmr_attr->max_pages, fmr_attr->max_maps, - fmr_attr->page_shift, &fmr->mfmr); - if (err) - goto err_free; - - err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr); - if (err) - goto err_mr; - - fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key; - - return &fmr->ibfmr; - -err_mr: - (void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr); - -err_free: - kfree(fmr); - - return ERR_PTR(err); -} - -int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int npages, u64 iova) -{ - struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); - struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device); - - return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova, - &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); -} - -int mlx4_ib_unmap_fmr(struct list_head *fmr_list) -{ - struct ib_fmr *ibfmr; - int err; - struct mlx4_dev *mdev = NULL; - - list_for_each_entry(ibfmr, fmr_list, list) { - if (mdev && to_mdev(ibfmr->device)->dev != mdev) - return -EINVAL; - mdev = to_mdev(ibfmr->device)->dev; - } - - if (!mdev) - return 0; - - list_for_each_entry(ibfmr, fmr_list, list) { - struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); - - mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey); - } - - /* - * Make sure all MPT status updates are visible before issuing - * SYNC_TPT firmware command. - */ - wmb(); - - err = mlx4_SYNC_TPT(mdev); - if (err) - pr_warn("SYNC_TPT error %d when " - "unmapping FMRs\n", err); - - return 0; -} - -int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr) -{ - struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr); - struct mlx4_ib_dev *dev = to_mdev(ibfmr->device); - int err; - - err = mlx4_fmr_free(dev->dev, &ifmr->mfmr); - - if (!err) - kfree(ifmr); - - return err; -} - static int mlx4_set_page(struct ib_mr *ibmr, u64 addr) { struct mlx4_ib_mr *mr = to_mmr(ibmr); diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index c72c4e1ea383..3d9aa7da95e9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2345,8 +2345,6 @@ static int mlx4_init_hca(struct mlx4_dev *dev) goto out_free; } - dev->caps.max_fmr_maps = (1 << (32 - ilog2(dev->caps.num_mpts))) - 1; - if (enable_4k_uar || !dev->persist->num_vfs) { init_hca->log_uar_sz = ilog2(dev->caps.num_uars) + PAGE_SHIFT - DEFAULT_UAR_PAGE_SHIFT; diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index 1a11bc0e1612..d2986f1f2db0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -966,189 +966,6 @@ void mlx4_cleanup_mr_table(struct mlx4_dev *dev) mlx4_bitmap_cleanup(&mr_table->mpt_bitmap); } -static inline int mlx4_check_fmr(struct mlx4_fmr *fmr, u64 *page_list, - int npages, u64 iova) -{ - int i, page_mask; - - if (npages > fmr->max_pages) - return -EINVAL; - - page_mask = (1 << fmr->page_shift) - 1; - - /* We are getting page lists, so va must be page aligned. */ - if (iova & page_mask) - return -EINVAL; - - /* Trust the user not to pass misaligned data in page_list */ - if (0) - for (i = 0; i < npages; ++i) { - if (page_list[i] & ~page_mask) - return -EINVAL; - } - - if (fmr->maps >= fmr->max_maps) - return -EINVAL; - - return 0; -} - -int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, - int npages, u64 iova, u32 *lkey, u32 *rkey) -{ - u32 key; - int i, err; - - err = mlx4_check_fmr(fmr, page_list, npages, iova); - if (err) - return err; - - ++fmr->maps; - - key = key_to_hw_index(fmr->mr.key); - key += dev->caps.num_mpts; - *lkey = *rkey = fmr->mr.key = hw_index_to_key(key); - - *(u8 *) fmr->mpt = MLX4_MPT_STATUS_SW; - - /* Make sure MPT status is visible before writing MTT entries */ - wmb(); - - dma_sync_single_for_cpu(&dev->persist->pdev->dev, fmr->dma_handle, - npages * sizeof(u64), DMA_TO_DEVICE); - - for (i = 0; i < npages; ++i) - fmr->mtts[i] = cpu_to_be64(page_list[i] | MLX4_MTT_FLAG_PRESENT); - - dma_sync_single_for_device(&dev->persist->pdev->dev, fmr->dma_handle, - npages * sizeof(u64), DMA_TO_DEVICE); - - fmr->mpt->key = cpu_to_be32(key); - fmr->mpt->lkey = cpu_to_be32(key); - fmr->mpt->length = cpu_to_be64(npages * (1ull << fmr->page_shift)); - fmr->mpt->start = cpu_to_be64(iova); - - /* Make MTT entries are visible before setting MPT status */ - wmb(); - - *(u8 *) fmr->mpt = MLX4_MPT_STATUS_HW; - - /* Make sure MPT status is visible before consumer can use FMR */ - wmb(); - - return 0; -} -EXPORT_SYMBOL_GPL(mlx4_map_phys_fmr); - -int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, - int max_maps, u8 page_shift, struct mlx4_fmr *fmr) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - int err = -ENOMEM; - - if (max_maps > dev->caps.max_fmr_maps) - return -EINVAL; - - if (page_shift < (ffs(dev->caps.page_size_cap) - 1) || page_shift >= 32) - return -EINVAL; - - /* All MTTs must fit in the same page */ - if (max_pages * sizeof(*fmr->mtts) > PAGE_SIZE) - return -EINVAL; - - fmr->page_shift = page_shift; - fmr->max_pages = max_pages; - fmr->max_maps = max_maps; - fmr->maps = 0; - - err = mlx4_mr_alloc(dev, pd, 0, 0, access, max_pages, - page_shift, &fmr->mr); - if (err) - return err; - - fmr->mtts = mlx4_table_find(&priv->mr_table.mtt_table, - fmr->mr.mtt.offset, - &fmr->dma_handle); - - if (!fmr->mtts) { - err = -ENOMEM; - goto err_free; - } - - return 0; - -err_free: - (void) mlx4_mr_free(dev, &fmr->mr); - return err; -} -EXPORT_SYMBOL_GPL(mlx4_fmr_alloc); - -int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr) -{ - struct mlx4_priv *priv = mlx4_priv(dev); - int err; - - err = mlx4_mr_enable(dev, &fmr->mr); - if (err) - return err; - - fmr->mpt = mlx4_table_find(&priv->mr_table.dmpt_table, - key_to_hw_index(fmr->mr.key), NULL); - if (!fmr->mpt) - return -ENOMEM; - - return 0; -} -EXPORT_SYMBOL_GPL(mlx4_fmr_enable); - -void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, - u32 *lkey, u32 *rkey) -{ - if (!fmr->maps) - return; - - /* To unmap: it is sufficient to take back ownership from HW */ - *(u8 *)fmr->mpt = MLX4_MPT_STATUS_SW; - - /* Make sure MPT status is visible */ - wmb(); - - fmr->maps = 0; -} -EXPORT_SYMBOL_GPL(mlx4_fmr_unmap); - -int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr) -{ - int ret; - - if (fmr->maps) - return -EBUSY; - if (fmr->mr.enabled == MLX4_MPT_EN_HW) { - /* In case of FMR was enabled and unmapped - * make sure to give ownership of MPT back to HW - * so HW2SW_MPT command will success. - */ - *(u8 *)fmr->mpt = MLX4_MPT_STATUS_SW; - /* Make sure MPT status is visible before changing MPT fields */ - wmb(); - fmr->mpt->length = 0; - fmr->mpt->start = 0; - /* Make sure MPT data is visible after changing MPT status */ - wmb(); - *(u8 *)fmr->mpt = MLX4_MPT_STATUS_HW; - /* make sure MPT status is visible */ - wmb(); - } - - ret = mlx4_mr_free(dev, &fmr->mr); - if (ret) - return ret; - fmr->mr.enabled = MLX4_MPT_DISABLED; - - return 0; -} -EXPORT_SYMBOL_GPL(mlx4_fmr_free); - int mlx4_SYNC_TPT(struct mlx4_dev *dev) { return mlx4_cmd(dev, 0, 0, 0, MLX4_CMD_SYNC_TPT, diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 20372de0b587..06e066e04a4b 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -573,7 +573,6 @@ struct mlx4_caps { int reserved_eqs; int num_comp_vectors; int num_mpts; - int max_fmr_maps; int num_mtts; int fmr_reserved_mtts; int reserved_mtts; @@ -707,17 +706,6 @@ struct mlx4_mw { int enabled; }; -struct mlx4_fmr { - struct mlx4_mr mr; - struct mlx4_mpt_entry *mpt; - __be64 *mtts; - dma_addr_t dma_handle; - int max_pages; - int max_maps; - int maps; - u8 page_shift; -}; - struct mlx4_uar { unsigned long pfn; int index; @@ -1412,14 +1400,6 @@ int mlx4_find_cached_vlan(struct mlx4_dev *dev, u8 port, u16 vid, int *idx); int mlx4_register_vlan(struct mlx4_dev *dev, u8 port, u16 vlan, int *index); void mlx4_unregister_vlan(struct mlx4_dev *dev, u8 port, u16 vlan); -int mlx4_map_phys_fmr(struct mlx4_dev *dev, struct mlx4_fmr *fmr, u64 *page_list, - int npages, u64 iova, u32 *lkey, u32 *rkey); -int mlx4_fmr_alloc(struct mlx4_dev *dev, u32 pd, u32 access, int max_pages, - int max_maps, u8 page_shift, struct mlx4_fmr *fmr); -int mlx4_fmr_enable(struct mlx4_dev *dev, struct mlx4_fmr *fmr); -void mlx4_fmr_unmap(struct mlx4_dev *dev, struct mlx4_fmr *fmr, - u32 *lkey, u32 *rkey); -int mlx4_fmr_free(struct mlx4_dev *dev, struct mlx4_fmr *fmr); int mlx4_SYNC_TPT(struct mlx4_dev *dev); int mlx4_test_interrupt(struct mlx4_dev *dev, int vector); int mlx4_test_async(struct mlx4_dev *dev); @@ -1522,6 +1502,8 @@ int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, int enable); + +struct mlx4_mpt_entry; int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, struct mlx4_mpt_entry ***mpt_entry); int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, -- cgit v1.2.3 From d6747b3715561ddc14e805e7ad4dfab2c9f245bb Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 28 May 2020 16:45:51 -0300 Subject: RDMA/mthca: Remove FMR support for memory registration Remove the ancient and unsafe FMR method. Link: https://lore.kernel.org/r/9-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Reviewed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mthca/mthca_dev.h | 10 - drivers/infiniband/hw/mthca/mthca_mr.c | 262 +-------------------------- drivers/infiniband/hw/mthca/mthca_provider.c | 86 --------- drivers/infiniband/hw/mthca/mthca_provider.h | 23 --- 4 files changed, 1 insertion(+), 380 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index 599794c5a78f..7550e9d03dec 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -478,16 +478,6 @@ int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd, u32 access, struct mthca_mr *mr); void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr); -int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, - u32 access, struct mthca_fmr *fmr); -int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int list_len, u64 iova); -void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr); -int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int list_len, u64 iova); -void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr); -int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr); - int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt); void mthca_unmap_eq_icm(struct mthca_dev *dev); diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c index 4250b2c18c64..ce0e0867e488 100644 --- a/drivers/infiniband/hw/mthca/mthca_mr.c +++ b/drivers/infiniband/hw/mthca/mthca_mr.c @@ -541,7 +541,7 @@ int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd, return err; } -/* Free mr or fmr */ +/* Free mr */ static void mthca_free_region(struct mthca_dev *dev, u32 lkey) { mthca_table_put(dev, dev->mr_table.mpt_table, @@ -564,266 +564,6 @@ void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr) mthca_free_mtt(dev, mr->mtt); } -int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd, - u32 access, struct mthca_fmr *mr) -{ - struct mthca_mpt_entry *mpt_entry; - struct mthca_mailbox *mailbox; - u64 mtt_seg; - u32 key, idx; - int list_len = mr->attr.max_pages; - int err = -ENOMEM; - int i; - - if (mr->attr.page_shift < 12 || mr->attr.page_shift >= 32) - return -EINVAL; - - /* For Arbel, all MTTs must fit in the same page. */ - if (mthca_is_memfree(dev) && - mr->attr.max_pages * sizeof *mr->mem.arbel.mtts > PAGE_SIZE) - return -EINVAL; - - mr->maps = 0; - - key = mthca_alloc(&dev->mr_table.mpt_alloc); - if (key == -1) - return -ENOMEM; - key = adjust_key(dev, key); - - idx = key & (dev->limits.num_mpts - 1); - mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key); - - if (mthca_is_memfree(dev)) { - err = mthca_table_get(dev, dev->mr_table.mpt_table, key); - if (err) - goto err_out_mpt_free; - - mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL); - BUG_ON(!mr->mem.arbel.mpt); - } else - mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base + - sizeof *(mr->mem.tavor.mpt) * idx; - - mr->mtt = __mthca_alloc_mtt(dev, list_len, dev->mr_table.fmr_mtt_buddy); - if (IS_ERR(mr->mtt)) { - err = PTR_ERR(mr->mtt); - goto err_out_table; - } - - mtt_seg = mr->mtt->first_seg * dev->limits.mtt_seg_size; - - if (mthca_is_memfree(dev)) { - mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table, - mr->mtt->first_seg, - &mr->mem.arbel.dma_handle); - BUG_ON(!mr->mem.arbel.mtts); - } else - mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg; - - mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL); - if (IS_ERR(mailbox)) { - err = PTR_ERR(mailbox); - goto err_out_free_mtt; - } - - mpt_entry = mailbox->buf; - - mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS | - MTHCA_MPT_FLAG_MIO | - MTHCA_MPT_FLAG_REGION | - access); - - mpt_entry->page_size = cpu_to_be32(mr->attr.page_shift - 12); - mpt_entry->key = cpu_to_be32(key); - mpt_entry->pd = cpu_to_be32(pd); - memset(&mpt_entry->start, 0, - sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start)); - mpt_entry->mtt_seg = cpu_to_be64(dev->mr_table.mtt_base + mtt_seg); - - if (0) { - mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey); - for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) { - if (i % 4 == 0) - printk("[%02x] ", i * 4); - printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i])); - if ((i + 1) % 4 == 0) - printk("\n"); - } - } - - err = mthca_SW2HW_MPT(dev, mailbox, - key & (dev->limits.num_mpts - 1)); - if (err) { - mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err); - goto err_out_mailbox_free; - } - - mthca_free_mailbox(dev, mailbox); - return 0; - -err_out_mailbox_free: - mthca_free_mailbox(dev, mailbox); - -err_out_free_mtt: - mthca_free_mtt(dev, mr->mtt); - -err_out_table: - mthca_table_put(dev, dev->mr_table.mpt_table, key); - -err_out_mpt_free: - mthca_free(&dev->mr_table.mpt_alloc, key); - return err; -} - -int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr) -{ - if (fmr->maps) - return -EBUSY; - - mthca_free_region(dev, fmr->ibmr.lkey); - mthca_free_mtt(dev, fmr->mtt); - - return 0; -} - -static inline int mthca_check_fmr(struct mthca_fmr *fmr, u64 *page_list, - int list_len, u64 iova) -{ - int i, page_mask; - - if (list_len > fmr->attr.max_pages) - return -EINVAL; - - page_mask = (1 << fmr->attr.page_shift) - 1; - - /* We are getting page lists, so va must be page aligned. */ - if (iova & page_mask) - return -EINVAL; - - /* Trust the user not to pass misaligned data in page_list */ - if (0) - for (i = 0; i < list_len; ++i) { - if (page_list[i] & ~page_mask) - return -EINVAL; - } - - if (fmr->maps >= fmr->attr.max_maps) - return -EINVAL; - - return 0; -} - - -int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int list_len, u64 iova) -{ - struct mthca_fmr *fmr = to_mfmr(ibfmr); - struct mthca_dev *dev = to_mdev(ibfmr->device); - struct mthca_mpt_entry mpt_entry; - u32 key; - int i, err; - - err = mthca_check_fmr(fmr, page_list, list_len, iova); - if (err) - return err; - - ++fmr->maps; - - key = tavor_key_to_hw_index(fmr->ibmr.lkey); - key += dev->limits.num_mpts; - fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key); - - writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt); - - for (i = 0; i < list_len; ++i) { - __be64 mtt_entry = cpu_to_be64(page_list[i] | - MTHCA_MTT_FLAG_PRESENT); - mthca_write64_raw(mtt_entry, fmr->mem.tavor.mtts + i); - } - - mpt_entry.lkey = cpu_to_be32(key); - mpt_entry.length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift)); - mpt_entry.start = cpu_to_be64(iova); - - __raw_writel((__force u32) mpt_entry.lkey, &fmr->mem.tavor.mpt->key); - memcpy_toio(&fmr->mem.tavor.mpt->start, &mpt_entry.start, - offsetof(struct mthca_mpt_entry, window_count) - - offsetof(struct mthca_mpt_entry, start)); - - writeb(MTHCA_MPT_STATUS_HW, fmr->mem.tavor.mpt); - - return 0; -} - -int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int list_len, u64 iova) -{ - struct mthca_fmr *fmr = to_mfmr(ibfmr); - struct mthca_dev *dev = to_mdev(ibfmr->device); - u32 key; - int i, err; - - err = mthca_check_fmr(fmr, page_list, list_len, iova); - if (err) - return err; - - ++fmr->maps; - - key = arbel_key_to_hw_index(fmr->ibmr.lkey); - if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT) - key += SINAI_FMR_KEY_INC; - else - key += dev->limits.num_mpts; - fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key); - - *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW; - - wmb(); - - dma_sync_single_for_cpu(&dev->pdev->dev, fmr->mem.arbel.dma_handle, - list_len * sizeof(u64), DMA_TO_DEVICE); - - for (i = 0; i < list_len; ++i) - fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] | - MTHCA_MTT_FLAG_PRESENT); - - dma_sync_single_for_device(&dev->pdev->dev, fmr->mem.arbel.dma_handle, - list_len * sizeof(u64), DMA_TO_DEVICE); - - fmr->mem.arbel.mpt->key = cpu_to_be32(key); - fmr->mem.arbel.mpt->lkey = cpu_to_be32(key); - fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift)); - fmr->mem.arbel.mpt->start = cpu_to_be64(iova); - - wmb(); - - *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_HW; - - wmb(); - - return 0; -} - -void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) -{ - if (!fmr->maps) - return; - - fmr->maps = 0; - - writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt); -} - -void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr) -{ - if (!fmr->maps) - return; - - fmr->maps = 0; - - *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW; -} - int mthca_init_mr_table(struct mthca_dev *dev) { phys_addr_t addr; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index bc3e3d741ca3..de2124a8ee2b 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -958,69 +958,6 @@ static int mthca_dereg_mr(struct ib_mr *mr, struct ib_udata *udata) return 0; } -static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags, - struct ib_fmr_attr *fmr_attr) -{ - struct mthca_fmr *fmr; - int err; - - fmr = kmalloc(sizeof *fmr, GFP_KERNEL); - if (!fmr) - return ERR_PTR(-ENOMEM); - - memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr); - err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num, - convert_access(mr_access_flags), fmr); - - if (err) { - kfree(fmr); - return ERR_PTR(err); - } - - return &fmr->ibmr; -} - -static int mthca_dealloc_fmr(struct ib_fmr *fmr) -{ - struct mthca_fmr *mfmr = to_mfmr(fmr); - int err; - - err = mthca_free_fmr(to_mdev(fmr->device), mfmr); - if (err) - return err; - - kfree(mfmr); - return 0; -} - -static int mthca_unmap_fmr(struct list_head *fmr_list) -{ - struct ib_fmr *fmr; - int err; - struct mthca_dev *mdev = NULL; - - list_for_each_entry(fmr, fmr_list, list) { - if (mdev && to_mdev(fmr->device) != mdev) - return -EINVAL; - mdev = to_mdev(fmr->device); - } - - if (!mdev) - return 0; - - if (mthca_is_memfree(mdev)) { - list_for_each_entry(fmr, fmr_list, list) - mthca_arbel_fmr_unmap(mdev, to_mfmr(fmr)); - - wmb(); - } else - list_for_each_entry(fmr, fmr_list, list) - mthca_tavor_fmr_unmap(mdev, to_mfmr(fmr)); - - err = mthca_SYNC_TPT(mdev); - return err; -} - static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { @@ -1204,20 +1141,6 @@ static const struct ib_device_ops mthca_dev_tavor_srq_ops = { INIT_RDMA_OBJ_SIZE(ib_srq, mthca_srq, ibsrq), }; -static const struct ib_device_ops mthca_dev_arbel_fmr_ops = { - .alloc_fmr = mthca_alloc_fmr, - .dealloc_fmr = mthca_dealloc_fmr, - .map_phys_fmr = mthca_arbel_map_phys_fmr, - .unmap_fmr = mthca_unmap_fmr, -}; - -static const struct ib_device_ops mthca_dev_tavor_fmr_ops = { - .alloc_fmr = mthca_alloc_fmr, - .dealloc_fmr = mthca_dealloc_fmr, - .map_phys_fmr = mthca_tavor_map_phys_fmr, - .unmap_fmr = mthca_unmap_fmr, -}; - static const struct ib_device_ops mthca_dev_arbel_ops = { .post_recv = mthca_arbel_post_receive, .post_send = mthca_arbel_post_send, @@ -1276,15 +1199,6 @@ int mthca_register_device(struct mthca_dev *dev) &mthca_dev_tavor_srq_ops); } - if (dev->mthca_flags & MTHCA_FLAG_FMR) { - if (mthca_is_memfree(dev)) - ib_set_device_ops(&dev->ib_dev, - &mthca_dev_arbel_fmr_ops); - else - ib_set_device_ops(&dev->ib_dev, - &mthca_dev_tavor_fmr_ops); - } - ib_set_device_ops(&dev->ib_dev, &mthca_dev_ops); if (mthca_is_memfree(dev)) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h index 596acc45569b..84c64bff0d92 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.h +++ b/drivers/infiniband/hw/mthca/mthca_provider.h @@ -76,24 +76,6 @@ struct mthca_mr { struct mthca_mtt *mtt; }; -struct mthca_fmr { - struct ib_fmr ibmr; - struct ib_fmr_attr attr; - struct mthca_mtt *mtt; - int maps; - union { - struct { - struct mthca_mpt_entry __iomem *mpt; - u64 __iomem *mtts; - } tavor; - struct { - struct mthca_mpt_entry *mpt; - __be64 *mtts; - dma_addr_t dma_handle; - } arbel; - } mem; -}; - struct mthca_pd { struct ib_pd ibpd; u32 pd_num; @@ -301,11 +283,6 @@ static inline struct mthca_ucontext *to_mucontext(struct ib_ucontext *ibucontext return container_of(ibucontext, struct mthca_ucontext, ibucontext); } -static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr) -{ - return container_of(ibmr, struct mthca_fmr, ibmr); -} - static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr) { return container_of(ibmr, struct mthca_mr, ibmr); -- cgit v1.2.3 From 22c9cc2408b734d2e5b193d287572cd2c7011183 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 28 May 2020 16:45:52 -0300 Subject: RDMA/rdmavt: Remove FMR memory registration Use FRWR method to register memory by default and remove the ancient and unsafe FMR method. Link: https://lore.kernel.org/r/10-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Signed-off-by: Max Gurtovoy Tested-by: Dennis Dalessandro Acked-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/mr.c | 154 -------------------------------------- drivers/infiniband/sw/rdmavt/mr.h | 15 ---- drivers/infiniband/sw/rdmavt/vt.c | 4 - 3 files changed, 173 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 72f6534fbb52..ddb0c0d771c2 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -713,160 +713,6 @@ bail: } EXPORT_SYMBOL(rvt_invalidate_rkey); -/** - * rvt_alloc_fmr - allocate a fast memory region - * @pd: the protection domain for this memory region - * @mr_access_flags: access flags for this memory region - * @fmr_attr: fast memory region attributes - * - * Return: the memory region on success, otherwise returns an errno. - */ -struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags, - struct ib_fmr_attr *fmr_attr) -{ - struct rvt_fmr *fmr; - int m; - struct ib_fmr *ret; - int rval = -ENOMEM; - - /* Allocate struct plus pointers to first level page tables. */ - m = (fmr_attr->max_pages + RVT_SEGSZ - 1) / RVT_SEGSZ; - fmr = kzalloc(struct_size(fmr, mr.map, m), GFP_KERNEL); - if (!fmr) - goto bail; - - rval = rvt_init_mregion(&fmr->mr, pd, fmr_attr->max_pages, - PERCPU_REF_INIT_ATOMIC); - if (rval) - goto bail; - - /* - * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey & - * rkey. - */ - rval = rvt_alloc_lkey(&fmr->mr, 0); - if (rval) - goto bail_mregion; - fmr->ibfmr.rkey = fmr->mr.lkey; - fmr->ibfmr.lkey = fmr->mr.lkey; - /* - * Resources are allocated but no valid mapping (RKEY can't be - * used). - */ - fmr->mr.access_flags = mr_access_flags; - fmr->mr.max_segs = fmr_attr->max_pages; - fmr->mr.page_shift = fmr_attr->page_shift; - - ret = &fmr->ibfmr; -done: - return ret; - -bail_mregion: - rvt_deinit_mregion(&fmr->mr); -bail: - kfree(fmr); - ret = ERR_PTR(rval); - goto done; -} - -/** - * rvt_map_phys_fmr - set up a fast memory region - * @ibfmr: the fast memory region to set up - * @page_list: the list of pages to associate with the fast memory region - * @list_len: the number of pages to associate with the fast memory region - * @iova: the virtual address of the start of the fast memory region - * - * This may be called from interrupt context. - * - * Return: 0 on success - */ - -int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int list_len, u64 iova) -{ - struct rvt_fmr *fmr = to_ifmr(ibfmr); - struct rvt_lkey_table *rkt; - unsigned long flags; - int m, n; - unsigned long i; - u32 ps; - struct rvt_dev_info *rdi = ib_to_rvt(ibfmr->device); - - i = atomic_long_read(&fmr->mr.refcount.count); - if (i > 2) - return -EBUSY; - - if (list_len > fmr->mr.max_segs) - return -EINVAL; - - rkt = &rdi->lkey_table; - spin_lock_irqsave(&rkt->lock, flags); - fmr->mr.user_base = iova; - fmr->mr.iova = iova; - ps = 1 << fmr->mr.page_shift; - fmr->mr.length = list_len * ps; - m = 0; - n = 0; - for (i = 0; i < list_len; i++) { - fmr->mr.map[m]->segs[n].vaddr = (void *)page_list[i]; - fmr->mr.map[m]->segs[n].length = ps; - trace_rvt_mr_fmr_seg(&fmr->mr, m, n, (void *)page_list[i], ps); - if (++n == RVT_SEGSZ) { - m++; - n = 0; - } - } - spin_unlock_irqrestore(&rkt->lock, flags); - return 0; -} - -/** - * rvt_unmap_fmr - unmap fast memory regions - * @fmr_list: the list of fast memory regions to unmap - * - * Return: 0 on success. - */ -int rvt_unmap_fmr(struct list_head *fmr_list) -{ - struct rvt_fmr *fmr; - struct rvt_lkey_table *rkt; - unsigned long flags; - struct rvt_dev_info *rdi; - - list_for_each_entry(fmr, fmr_list, ibfmr.list) { - rdi = ib_to_rvt(fmr->ibfmr.device); - rkt = &rdi->lkey_table; - spin_lock_irqsave(&rkt->lock, flags); - fmr->mr.user_base = 0; - fmr->mr.iova = 0; - fmr->mr.length = 0; - spin_unlock_irqrestore(&rkt->lock, flags); - } - return 0; -} - -/** - * rvt_dealloc_fmr - deallocate a fast memory region - * @ibfmr: the fast memory region to deallocate - * - * Return: 0 on success. - */ -int rvt_dealloc_fmr(struct ib_fmr *ibfmr) -{ - struct rvt_fmr *fmr = to_ifmr(ibfmr); - int ret = 0; - - rvt_free_lkey(&fmr->mr); - rvt_put_mr(&fmr->mr); /* will set completion if last */ - ret = rvt_check_refs(&fmr->mr, __func__); - if (ret) - goto out; - rvt_deinit_mregion(&fmr->mr); - kfree(fmr); -out: - return ret; -} - /** * rvt_sge_adjacent - is isge compressible * @last_sge: last outgoing SGE written diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h index 2c8d0752e8e3..780fc63af98b 100644 --- a/drivers/infiniband/sw/rdmavt/mr.h +++ b/drivers/infiniband/sw/rdmavt/mr.h @@ -49,10 +49,6 @@ */ #include -struct rvt_fmr { - struct ib_fmr ibfmr; - struct rvt_mregion mr; /* must be last */ -}; struct rvt_mr { struct ib_mr ibmr; @@ -60,11 +56,6 @@ struct rvt_mr { struct rvt_mregion mr; /* must be last */ }; -static inline struct rvt_fmr *to_ifmr(struct ib_fmr *ibfmr) -{ - return container_of(ibfmr, struct rvt_fmr, ibfmr); -} - static inline struct rvt_mr *to_imr(struct ib_mr *ibmr) { return container_of(ibmr, struct rvt_mr, ibmr); @@ -83,11 +74,5 @@ struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg, struct ib_udata *udata); int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); -struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags, - struct ib_fmr_attr *fmr_attr); -int rvt_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, - int list_len, u64 iova); -int rvt_unmap_fmr(struct list_head *fmr_list); -int rvt_dealloc_fmr(struct ib_fmr *ibfmr); #endif /* DEF_RVTMR_H */ diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 72b031ab7092..f904bb34477a 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -378,7 +378,6 @@ enum { static const struct ib_device_ops rvt_dev_ops = { .uverbs_abi_ver = RVT_UVERBS_ABI_VERSION, - .alloc_fmr = rvt_alloc_fmr, .alloc_mr = rvt_alloc_mr, .alloc_pd = rvt_alloc_pd, .alloc_ucontext = rvt_alloc_ucontext, @@ -387,7 +386,6 @@ static const struct ib_device_ops rvt_dev_ops = { .create_cq = rvt_create_cq, .create_qp = rvt_create_qp, .create_srq = rvt_create_srq, - .dealloc_fmr = rvt_dealloc_fmr, .dealloc_pd = rvt_dealloc_pd, .dealloc_ucontext = rvt_dealloc_ucontext, .dereg_mr = rvt_dereg_mr, @@ -399,7 +397,6 @@ static const struct ib_device_ops rvt_dev_ops = { .get_dma_mr = rvt_get_dma_mr, .get_port_immutable = rvt_get_port_immutable, .map_mr_sg = rvt_map_mr_sg, - .map_phys_fmr = rvt_map_phys_fmr, .mmap = rvt_mmap, .modify_ah = rvt_modify_ah, .modify_device = rvt_modify_device, @@ -420,7 +417,6 @@ static const struct ib_device_ops rvt_dev_ops = { .reg_user_mr = rvt_reg_user_mr, .req_notify_cq = rvt_req_notify_cq, .resize_cq = rvt_resize_cq, - .unmap_fmr = rvt_unmap_fmr, INIT_RDMA_OBJ_SIZE(ib_ah, rvt_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, rvt_cq, ibcq), -- cgit v1.2.3 From 3a578152a9208bbcd196210be2f5396744cda302 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 28 May 2020 16:45:53 -0300 Subject: RDMA/core: Remove FMR device ops After removing FMR support from all the RDMA ULPs and providers, there is no need to keep FMR operation for IB devices. Link: https://lore.kernel.org/r/11-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- Documentation/infiniband/core_locking.rst | 2 -- drivers/infiniband/core/device.c | 4 --- drivers/infiniband/core/verbs.c | 48 ------------------------- include/rdma/ib_verbs.h | 59 ------------------------------- 4 files changed, 113 deletions(-) diff --git a/Documentation/infiniband/core_locking.rst b/Documentation/infiniband/core_locking.rst index 8f76a8a5a38f..efd5e7603014 100644 --- a/Documentation/infiniband/core_locking.rst +++ b/Documentation/infiniband/core_locking.rst @@ -22,7 +22,6 @@ Sleeping and interrupt context - post_recv - poll_cq - req_notify_cq - - map_phys_fmr which may not sleep and must be callable from any context. @@ -36,7 +35,6 @@ Sleeping and interrupt context - ib_post_send - ib_post_recv - ib_req_notify_cq - - ib_map_phys_fmr are therefore safe to call from any context. diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 53f541f41ff3..905a2beaf885 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2571,7 +2571,6 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); - SET_DEVICE_OP(dev_ops, alloc_fmr); SET_DEVICE_OP(dev_ops, alloc_hw_stats); SET_DEVICE_OP(dev_ops, alloc_mr); SET_DEVICE_OP(dev_ops, alloc_mr_integrity); @@ -2598,7 +2597,6 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); SET_DEVICE_OP(dev_ops, dealloc_driver); - SET_DEVICE_OP(dev_ops, dealloc_fmr); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); SET_DEVICE_OP(dev_ops, dealloc_ucontext); @@ -2642,7 +2640,6 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_mr_sg_pi); - SET_DEVICE_OP(dev_ops, map_phys_fmr); SET_DEVICE_OP(dev_ops, mmap); SET_DEVICE_OP(dev_ops, mmap_free); SET_DEVICE_OP(dev_ops, modify_ah); @@ -2676,7 +2673,6 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, resize_cq); SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); - SET_DEVICE_OP(dev_ops, unmap_fmr); SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_cq); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 21815e125e98..53d6505c0c7b 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2212,54 +2212,6 @@ out: } EXPORT_SYMBOL(ib_alloc_mr_integrity); -/* "Fast" memory regions */ - -struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, - int mr_access_flags, - struct ib_fmr_attr *fmr_attr) -{ - struct ib_fmr *fmr; - - if (!pd->device->ops.alloc_fmr) - return ERR_PTR(-EOPNOTSUPP); - - fmr = pd->device->ops.alloc_fmr(pd, mr_access_flags, fmr_attr); - if (!IS_ERR(fmr)) { - fmr->device = pd->device; - fmr->pd = pd; - atomic_inc(&pd->usecnt); - } - - return fmr; -} -EXPORT_SYMBOL(ib_alloc_fmr); - -int ib_unmap_fmr(struct list_head *fmr_list) -{ - struct ib_fmr *fmr; - - if (list_empty(fmr_list)) - return 0; - - fmr = list_entry(fmr_list->next, struct ib_fmr, list); - return fmr->device->ops.unmap_fmr(fmr_list); -} -EXPORT_SYMBOL(ib_unmap_fmr); - -int ib_dealloc_fmr(struct ib_fmr *fmr) -{ - struct ib_pd *pd; - int ret; - - pd = fmr->pd; - ret = fmr->device->ops.dealloc_fmr(fmr); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_dealloc_fmr); - /* Multicast groups */ static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 19864da78649..ff6a8053ec52 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1475,12 +1475,6 @@ enum ib_mr_rereg_flags { IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) }; -struct ib_fmr_attr { - int max_pages; - int max_maps; - u8 page_shift; -}; - struct ib_umem; enum rdma_remove_reason { @@ -1855,14 +1849,6 @@ struct ib_mw { enum ib_mw_type type; }; -struct ib_fmr { - struct ib_device *device; - struct ib_pd *pd; - struct list_head list; - u32 lkey; - u32 rkey; -}; - /* Supported steering options */ enum ib_flow_attr_type { /* steering according to rule specifications */ @@ -2505,12 +2491,6 @@ struct ib_device_ops { struct ib_mw *(*alloc_mw)(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata); int (*dealloc_mw)(struct ib_mw *mw); - struct ib_fmr *(*alloc_fmr)(struct ib_pd *pd, int mr_access_flags, - struct ib_fmr_attr *fmr_attr); - int (*map_phys_fmr)(struct ib_fmr *fmr, u64 *page_list, int list_len, - u64 iova); - int (*unmap_fmr)(struct list_head *fmr_list); - int (*dealloc_fmr)(struct ib_fmr *fmr); int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device, @@ -4319,45 +4299,6 @@ static inline u32 ib_inc_rkey(u32 rkey) return ((rkey + 1) & mask) | (rkey & ~mask); } -/** - * ib_alloc_fmr - Allocates a unmapped fast memory region. - * @pd: The protection domain associated with the unmapped region. - * @mr_access_flags: Specifies the memory access rights. - * @fmr_attr: Attributes of the unmapped region. - * - * A fast memory region must be mapped before it can be used as part of - * a work request. - */ -struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, - int mr_access_flags, - struct ib_fmr_attr *fmr_attr); - -/** - * ib_map_phys_fmr - Maps a list of physical pages to a fast memory region. - * @fmr: The fast memory region to associate with the pages. - * @page_list: An array of physical pages to map to the fast memory region. - * @list_len: The number of pages in page_list. - * @iova: The I/O virtual address to use with the mapped region. - */ -static inline int ib_map_phys_fmr(struct ib_fmr *fmr, - u64 *page_list, int list_len, - u64 iova) -{ - return fmr->device->ops.map_phys_fmr(fmr, page_list, list_len, iova); -} - -/** - * ib_unmap_fmr - Removes the mapping from a list of fast memory regions. - * @fmr_list: A linked list of fast memory regions to unmap. - */ -int ib_unmap_fmr(struct list_head *fmr_list); - -/** - * ib_dealloc_fmr - Deallocates a fast memory region. - * @fmr: The fast memory region to deallocate. - */ -int ib_dealloc_fmr(struct ib_fmr *fmr); - /** * ib_attach_mcast - Attaches the specified QP to a multicast group. * @qp: QP to attach to the multicast group. The QP must be type -- cgit v1.2.3 From 649392bf75a423287a9c4936b341677f12e8cf0b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 28 May 2020 16:45:54 -0300 Subject: RDMA: Remove 'max_fmr' Now that FMR support is gone, this attribute can be deleted from all places. Link: https://lore.kernel.org/r/12-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Reviewed-by: Max Gurtovoy Reviewed-by: Bernard Metzler Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 1 - drivers/infiniband/hw/ocrdma/ocrdma.h | 1 - drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 1 - drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 1 - drivers/infiniband/hw/qedr/main.c | 1 - drivers/infiniband/hw/qedr/qedr.h | 1 - drivers/infiniband/hw/qedr/verbs.c | 1 - drivers/infiniband/sw/rdmavt/mr.c | 1 - drivers/infiniband/sw/siw/siw.h | 2 -- drivers/infiniband/sw/siw/siw_main.c | 1 - drivers/infiniband/sw/siw/siw_verbs.c | 1 - drivers/net/ethernet/qlogic/qed/qed_rdma.c | 1 - drivers/net/ethernet/qlogic/qed/qed_rdma.h | 1 - include/linux/qed/qed_rdma_if.h | 1 - include/rdma/ib_verbs.h | 1 - net/rds/ib.c | 2 +- 16 files changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2067a939788b..56d207405dbd 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -356,7 +356,6 @@ static void copy_query_dev_fields(struct ib_ucontext *ucontext, resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; resp->max_ah = attr->max_ah; - resp->max_fmr = attr->max_fmr; resp->max_map_per_fmr = attr->max_map_per_fmr; resp->max_srq = attr->max_srq; resp->max_srq_wr = attr->max_srq_wr; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 7baedc74e39d..fcfe0e82197a 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -98,7 +98,6 @@ struct ocrdma_dev_attr { u64 max_mr_size; u32 max_num_mr_pbl; int max_mw; - int max_fmr; int max_map_per_fmr; int max_pages_per_frmr; u16 max_ord_per_qp; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index d82d3ec3649e..e07bf0b2209a 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1190,7 +1190,6 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, attr->max_mr = rsp->max_mr; attr->max_mr_size = ((u64)rsp->max_mr_size_hi << 32) | rsp->max_mr_size_lo; - attr->max_fmr = 0; attr->max_pages_per_frmr = rsp->max_pages_per_frmr; attr->max_num_mr_pbl = rsp->max_num_mr_pbl; attr->max_cqe = rsp->max_cq_cqes_per_cq & diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 10e343894595..890e3fd41d21 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -99,7 +99,6 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, attr->max_mw = dev->attr.max_mw; attr->max_pd = dev->attr.max_pd; attr->atomic_cap = 0; - attr->max_fmr = 0; attr->max_map_per_fmr = 0; attr->max_qp_rd_atom = min(dev->attr.max_ord_per_qp, dev->attr.max_ird_per_qp); diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index dcdc85a1ab25..ccaedfd53e49 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -632,7 +632,6 @@ static int qedr_set_device_attr(struct qedr_dev *dev) attr->max_mr_size = qed_attr->max_mr_size; attr->max_cqe = min_t(u64, qed_attr->max_cqe, QEDR_MAX_CQES); attr->max_mw = qed_attr->max_mw; - attr->max_fmr = qed_attr->max_fmr; attr->max_mr_mw_fmr_pbl = qed_attr->max_mr_mw_fmr_pbl; attr->max_mr_mw_fmr_size = qed_attr->max_mr_mw_fmr_size; attr->max_pd = qed_attr->max_pd; diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index 5488dbd59d3c..fdf90ecb2699 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -103,7 +103,6 @@ struct qedr_device_attr { u64 max_mr_size; u32 max_cqe; u32 max_mw; - u32 max_fmr; u32 max_mr_mw_fmr_pbl; u64 max_mr_mw_fmr_size; u32 max_pd; diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index d6b94a713573..ca88006eaa66 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -145,7 +145,6 @@ int qedr_query_device(struct ib_device *ibdev, attr->max_mw = qattr->max_mw; attr->max_pd = qattr->max_pd; attr->atomic_cap = dev->atomic_cap; - attr->max_fmr = qattr->max_fmr; attr->max_map_per_fmr = 16; attr->max_qp_init_rd_atom = 1 << (fls(qattr->max_qp_req_rd_atomic_resc) - 1); diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index ddb0c0d771c2..60864e5ca7cb 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -97,7 +97,6 @@ int rvt_driver_mr_init(struct rvt_dev_info *rdi) RCU_INIT_POINTER(rdi->lkey_table.table[i], NULL); rdi->dparms.props.max_mr = rdi->lkey_table.max; - rdi->dparms.props.max_fmr = rdi->lkey_table.max; return 0; } diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 5a58a1cc7a7e..e9753831ac3f 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -30,7 +30,6 @@ #define SIW_MAX_MR (SIW_MAX_QP * 10) #define SIW_MAX_PD SIW_MAX_QP #define SIW_MAX_MW 0 /* to be set if MW's are supported */ -#define SIW_MAX_FMR SIW_MAX_MR #define SIW_MAX_SRQ SIW_MAX_QP #define SIW_MAX_SRQ_WR (SIW_MAX_QP_WR * 10) #define SIW_MAX_CONTEXT SIW_MAX_PD @@ -59,7 +58,6 @@ struct siw_dev_cap { int max_mr; int max_pd; int max_mw; - int max_fmr; int max_srq; int max_srq_wr; int max_srq_sge; diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 5cd40fb9e20c..a0b8cc643c5c 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -413,7 +413,6 @@ static struct siw_device *siw_device_create(struct net_device *netdev) sdev->attrs.max_mr = SIW_MAX_MR; sdev->attrs.max_pd = SIW_MAX_PD; sdev->attrs.max_mw = SIW_MAX_MW; - sdev->attrs.max_fmr = SIW_MAX_FMR; sdev->attrs.max_srq = SIW_MAX_SRQ; sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; sdev->attrs.max_srq_sge = SIW_MAX_SGE; diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index aeb842bc7a1e..987e2ba05dbc 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -136,7 +136,6 @@ int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, attr->max_cq = sdev->attrs.max_cq; attr->max_cqe = sdev->attrs.max_cqe; attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL; - attr->max_fmr = sdev->attrs.max_fmr; attr->max_mr = sdev->attrs.max_mr; attr->max_mw = sdev->attrs.max_mw; attr->max_mr_size = ~0ull; diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.c b/drivers/net/ethernet/qlogic/qed/qed_rdma.c index 38b1f402f7ed..5dc18a4bdda4 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.c +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.c @@ -499,7 +499,6 @@ static void qed_rdma_init_devinfo(struct qed_hwfn *p_hwfn, dev->max_cqe = QED_RDMA_MAX_CQE_16_BIT; dev->max_mw = 0; - dev->max_fmr = QED_RDMA_MAX_FMR; dev->max_mr_mw_fmr_pbl = (PAGE_SIZE / 8) * (PAGE_SIZE / 8); dev->max_mr_mw_fmr_size = dev->max_mr_mw_fmr_pbl * PAGE_SIZE; dev->max_pkey = QED_RDMA_MAX_P_KEY; diff --git a/drivers/net/ethernet/qlogic/qed/qed_rdma.h b/drivers/net/ethernet/qlogic/qed/qed_rdma.h index 3689fe3e5935..dfaa2f552627 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_rdma.h +++ b/drivers/net/ethernet/qlogic/qed/qed_rdma.h @@ -45,7 +45,6 @@ #include "qed_iwarp.h" #include "qed_roce.h" -#define QED_RDMA_MAX_FMR (RDMA_MAX_TIDS) #define QED_RDMA_MAX_P_KEY (1) #define QED_RDMA_MAX_WQE (0x7FFF) #define QED_RDMA_MAX_SRQ_WQE_ELEM (0x7FFF) diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h index 74efca15fde7..c90276cda5c1 100644 --- a/include/linux/qed/qed_rdma_if.h +++ b/include/linux/qed/qed_rdma_if.h @@ -91,7 +91,6 @@ struct qed_rdma_device { u64 max_mr_size; u32 max_cqe; u32 max_mw; - u32 max_fmr; u32 max_mr_mw_fmr_pbl; u64 max_mr_mw_fmr_size; u32 max_pd; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ff6a8053ec52..c4708b3243f9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -430,7 +430,6 @@ struct ib_device_attr { int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; - int max_fmr; int max_map_per_fmr; int max_srq; int max_srq_wr; diff --git a/net/rds/ib.c b/net/rds/ib.c index 6c43b3e4c736..deecbdcdae84 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -217,7 +217,7 @@ static int rds_ib_add_one(struct ib_device *device) } rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, max_1m_mrs = %d, max_8k_mrs = %d\n", - device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, + device->attrs.max_mr, rds_ibdev->max_wrs, rds_ibdev->max_sge, rds_ibdev->max_1m_mrs, rds_ibdev->max_8k_mrs); pr_info("RDS/IB: %s: added\n", device->name); -- cgit v1.2.3 From 4d12c04caa88cd3115f25acd832a7cddb698981b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 28 May 2020 16:45:55 -0300 Subject: RDMA: Remove 'max_map_per_fmr' Now that FMR support is gone, this attribute can be deleted from all places. Link: https://lore.kernel.org/r/13-v3-f58e6669d5d3+2cf-fmr_removal_jgg@mellanox.com Reviewed-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 1 - drivers/infiniband/hw/hfi1/verbs.c | 1 - drivers/infiniband/hw/i40iw/i40iw_verbs.c | 1 - drivers/infiniband/hw/mlx5/main.c | 1 - drivers/infiniband/hw/mthca/mthca_provider.c | 10 ---------- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 1 - drivers/infiniband/hw/qedr/verbs.c | 1 - drivers/infiniband/hw/qib/qib_verbs.c | 1 - drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 1 - include/rdma/ib_verbs.h | 1 - 10 files changed, 19 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 56d207405dbd..b48b3f6e632d 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -356,7 +356,6 @@ static void copy_query_dev_fields(struct ib_ucontext *ucontext, resp->max_mcast_qp_attach = attr->max_mcast_qp_attach; resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach; resp->max_ah = attr->max_ah; - resp->max_map_per_fmr = attr->max_map_per_fmr; resp->max_srq = attr->max_srq; resp->max_srq_wr = attr->max_srq_wr; resp->max_srq_sge = attr->max_srq_sge; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 43ddced15951..30865635b449 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1361,7 +1361,6 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd) rdi->dparms.props.max_cq = hfi1_max_cqs; rdi->dparms.props.max_ah = hfi1_max_ahs; rdi->dparms.props.max_cqe = hfi1_max_cqes; - rdi->dparms.props.max_map_per_fmr = 32767; rdi->dparms.props.max_pd = hfi1_max_pds; rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC; rdi->dparms.props.max_qp_init_rd_atom = 255; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 1b6fb1380961..19af29a48c55 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -83,7 +83,6 @@ static int i40iw_query_device(struct ib_device *ibdev, props->max_qp_rd_atom = I40IW_MAX_IRD_SIZE; props->max_qp_init_rd_atom = props->max_qp_rd_atom; props->atomic_cap = IB_ATOMIC_NONE; - props->max_map_per_fmr = 1; props->max_fast_reg_page_list_len = I40IW_MAX_PAGES_PER_FMR; return 0; } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 49a1aff72715..343a8b8361e7 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -999,7 +999,6 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; - props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ props->max_ah = INT_MAX; props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index de2124a8ee2b..9fa2f9164a47 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -118,16 +118,6 @@ static int mthca_query_device(struct ib_device *ibdev, struct ib_device_attr *pr props->max_mcast_qp_attach = MTHCA_QP_PER_MGM; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; - /* - * If Sinai memory key optimization is being used, then only - * the 8-bit key portion will change. For other HCAs, the - * unused index bits will also be used for FMR remapping. - */ - if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT) - props->max_map_per_fmr = 255; - else - props->max_map_per_fmr = - (1 << (32 - ilog2(mdev->limits.num_mpts))) - 1; err = 0; out: diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 890e3fd41d21..d11c74390a12 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -99,7 +99,6 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, attr->max_mw = dev->attr.max_mw; attr->max_pd = dev->attr.max_pd; attr->atomic_cap = 0; - attr->max_map_per_fmr = 0; attr->max_qp_rd_atom = min(dev->attr.max_ord_per_qp, dev->attr.max_ird_per_qp); attr->max_qp_init_rd_atom = dev->attr.max_ord_per_qp; diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index ca88006eaa66..9b9e80266367 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -145,7 +145,6 @@ int qedr_query_device(struct ib_device *ibdev, attr->max_mw = qattr->max_mw; attr->max_pd = qattr->max_pd; attr->atomic_cap = dev->atomic_cap; - attr->max_map_per_fmr = 16; attr->max_qp_init_rd_atom = 1 << (fls(qattr->max_qp_req_rd_atomic_resc) - 1); attr->max_qp_rd_atom = diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 7508abb6a0fa..7acf9ba5358a 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1460,7 +1460,6 @@ static void qib_fill_device_attr(struct qib_devdata *dd) rdi->dparms.props.max_cq = ib_qib_max_cqs; rdi->dparms.props.max_cqe = ib_qib_max_cqes; rdi->dparms.props.max_ah = ib_qib_max_ahs; - rdi->dparms.props.max_map_per_fmr = 32767; rdi->dparms.props.max_qp_rd_atom = QIB_MAX_RDMA_ATOMIC; rdi->dparms.props.max_qp_init_rd_atom = 255; rdi->dparms.props.max_srq = ib_qib_max_srqs; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 71f82339446c..b8a77ce11590 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -322,7 +322,6 @@ int usnic_ib_query_device(struct ib_device *ibdev, props->max_mcast_grp = 0; props->max_mcast_qp_attach = 0; props->max_total_mcast_qp_attach = 0; - props->max_map_per_fmr = 0; /* Owned by Userspace * max_qp_wr, max_sge, max_sge_rd, max_cqe */ mutex_unlock(&us_ibdev->usdev_lock); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c4708b3243f9..033e7044f29c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -430,7 +430,6 @@ struct ib_device_attr { int max_mcast_qp_attach; int max_total_mcast_qp_attach; int max_ah; - int max_map_per_fmr; int max_srq; int max_srq_wr; int max_srq_sge; -- cgit v1.2.3 From 24c567ff75d342ed2392cd470c3acd29a09e03ba Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 30 May 2020 17:02:24 +0300 Subject: IB/hfi1: Fix hfi1_netdev_rx_init() error handling The hfi1_vnic_up() function doesn't check whether hfi1_netdev_rx_init() returns errors. In hfi1_vnic_init() we need to change the code to preserve the error code instead of returning success. Fixes: 2280740f01ae ("IB/hfi1: Virtual Network Interface Controller (VNIC) HW support") Fixes: 4730f4a6c6b2 ("IB/hfi1: Activate the dummy netdev") Link: https://lore.kernel.org/r/20200530140224.GA1330098@mwanda Signed-off-by: Dan Carpenter Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/vnic_main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c index b183c56b7b6a..a90824de0f57 100644 --- a/drivers/infiniband/hw/hfi1/vnic_main.c +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -457,13 +457,19 @@ static int hfi1_vnic_up(struct hfi1_vnic_vport_info *vinfo) if (rc < 0) return rc; - hfi1_netdev_rx_init(dd); + rc = hfi1_netdev_rx_init(dd); + if (rc) + goto err_remove; netif_carrier_on(netdev); netif_tx_start_all_queues(netdev); set_bit(HFI1_VNIC_UP, &vinfo->flags); return 0; + +err_remove: + hfi1_netdev_remove_data(dd, VNIC_ID(vinfo->vesw_id)); + return rc; } static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo) @@ -512,7 +518,8 @@ static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo) goto txreq_fail; } - if (hfi1_netdev_rx_init(dd)) { + rc = hfi1_netdev_rx_init(dd); + if (rc) { dd_dev_err(dd, "Unable to initialize netdev contexts\n"); goto alloc_fail; } -- cgit v1.2.3 From 278f74b39e641e1315e1b7f11b26aa1f989a40fc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 30 May 2020 13:52:58 -0400 Subject: RDMA/core: Move and rename trace_cm_id_create() The restrack ID for an rdma_cm_id is not assigned until it is associated with a device. Here's an example I captured while testing NFS/RDMA's support for DEVICE_REMOVAL. The new tracepoint name is "cm_id_attach". <...>-4261 [001] 366.581299: cm_event_handler: cm.id=0 src=0.0.0.0:45919 dst=192.168.2.55:20049 tos=0 ADDR_ERROR (1/-19) <...>-4261 [001] 366.581304: cm_event_done: cm.id=0 src=0.0.0.0:45919 dst=192.168.2.55:20049 tos=0 ADDR_ERROR consumer returns 0 <...>-1950 [000] 366.581309: cm_id_destroy: cm.id=0 src=0.0.0.0:45919 dst=192.168.2.55:20049 tos=0 <...>-7 [001] 369.589400: cm_event_handler: cm.id=0 src=0.0.0.0:49023 dst=192.168.2.55:20049 tos=0 ADDR_ERROR (1/-19) <...>-7 [001] 369.589404: cm_event_done: cm.id=0 src=0.0.0.0:49023 dst=192.168.2.55:20049 tos=0 ADDR_ERROR consumer returns 0 <...>-1950 [000] 369.589407: cm_id_destroy: cm.id=0 src=0.0.0.0:49023 dst=192.168.2.55:20049 tos=0 <...>-4261 [001] 372.597650: cm_id_attach: cm.id=0 src=192.168.2.51:47492 dst=192.168.2.55:20049 device=mlx4_0 <...>-4261 [001] 372.597652: cm_event_handler: cm.id=0 src=192.168.2.51:47492 dst=192.168.2.55:20049 tos=0 ADDR_RESOLVED (0/0) <...>-4261 [001] 372.597654: cm_event_done: cm.id=0 src=192.168.2.51:47492 dst=192.168.2.55:20049 tos=0 ADDR_RESOLVED consumer returns 0 <...>-4261 [001] 372.597738: cm_event_handler: cm.id=0 src=192.168.2.51:47492 dst=192.168.2.55:20049 tos=0 ROUTE_RESOLVED (2/0) <...>-4261 [001] 372.597740: cm_event_done: cm.id=0 src=192.168.2.51:47492 dst=192.168.2.55:20049 tos=0 ROUTE_RESOLVED consumer returns 0 <...>-4691 [007] 372.600101: cm_qp_create: cm.id=0 src=192.168.2.51:47492 dst=192.168.2.55:20049 tos=0 pd.id=2 qp_type=RC send_wr=4091 recv_wr=256 qp_num=530 rc=0 <...>-4691 [007] 372.600207: cm_send_req: cm.id=0 src=192.168.2.51:47492 dst=192.168.2.55:20049 tos=0 qp_num=530 <...>-185 [002] 372.601212: cm_send_mra: cm.id=0 src=192.168.2.51:47492 dst=192.168.2.55:20049 tos=0 <...>-185 [002] 372.601362: cm_send_rtu: cm.id=0 src=192.168.2.51:47492 dst=192.168.2.55:20049 tos=0 <...>-185 [002] 372.601372: cm_event_handler: cm.id=0 src=192.168.2.51:47492 dst=192.168.2.55:20049 tos=0 ESTABLISHED (9/0) <...>-185 [002] 372.601379: cm_event_done: cm.id=0 src=192.168.2.51:47492 dst=192.168.2.55:20049 tos=0 ESTABLISHED consumer returns 0 Fixes: ed999f820a6c ("RDMA/cma: Add trace points in RDMA Connection Manager") Link: https://lore.kernel.org/r/20200530174934.21362.56754.stgit@manet.1015granger.net Signed-off-by: Chuck Lever Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 2 +- drivers/infiniband/core/cma_trace.h | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 8026ee56546a..3d7cc9f0f3d4 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -484,6 +484,7 @@ static void _cma_attach_to_dev(struct rdma_id_private *id_priv, rdma_restrack_kadd(&id_priv->res); else rdma_restrack_uadd(&id_priv->res); + trace_cm_id_attach(id_priv, cma_dev->device); } static void cma_attach_to_dev(struct rdma_id_private *id_priv, @@ -888,7 +889,6 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, id_priv->id.route.addr.dev_addr.net = get_net(net); id_priv->seq_num &= 0x00ffffff; - trace_cm_id_create(id_priv); return &id_priv->id; } EXPORT_SYMBOL(__rdma_create_id); diff --git a/drivers/infiniband/core/cma_trace.h b/drivers/infiniband/core/cma_trace.h index 81e36bf13159..e6e20c36c538 100644 --- a/drivers/infiniband/core/cma_trace.h +++ b/drivers/infiniband/core/cma_trace.h @@ -103,23 +103,33 @@ DEFINE_CMA_FSM_EVENT(sent_drep); DEFINE_CMA_FSM_EVENT(sent_dreq); DEFINE_CMA_FSM_EVENT(id_destroy); -TRACE_EVENT(cm_id_create, +TRACE_EVENT(cm_id_attach, TP_PROTO( - const struct rdma_id_private *id_priv + const struct rdma_id_private *id_priv, + const struct ib_device *device ), - TP_ARGS(id_priv), + TP_ARGS(id_priv, device), TP_STRUCT__entry( __field(u32, cm_id) + __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6)) + __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6)) + __string(devname, device->name) ), TP_fast_assign( __entry->cm_id = id_priv->res.id; + memcpy(__entry->srcaddr, &id_priv->id.route.addr.src_addr, + sizeof(struct sockaddr_in6)); + memcpy(__entry->dstaddr, &id_priv->id.route.addr.dst_addr, + sizeof(struct sockaddr_in6)); + __assign_str(devname, device->name); ), - TP_printk("cm.id=%u", - __entry->cm_id + TP_printk("cm.id=%u src=%pISpc dst=%pISpc device=%s", + __entry->cm_id, __entry->srcaddr, __entry->dstaddr, + __get_str(devname) ) ); -- cgit v1.2.3 From 87d9e568496aeb519f1da61f54ac0dcb8d37561e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 29 May 2020 11:39:18 +0300 Subject: RDMA/hns: Uninitialized variable in modify_qp_init_to_rtr() The "dmac" variable is used before it is initialized. Fixes: 494c3b312255 ("RDMA/hns: Refactor the QP context filling process related to WQE buffer configure") Link: https://lore.kernel.org/r/20200529083918.GA1298465@mwanda Signed-off-by: Dan Carpenter Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 6454ac4ad06f..c597d7281629 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -4030,6 +4030,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, port = (attr_mask & IB_QP_PORT) ? (attr->port_num - 1) : hr_qp->port; smac = (u8 *)hr_dev->dev_addr[port]; + dmac = (u8 *)attr->ah_attr.roce.dmac; /* when dmac equals smac or loop_idc is 1, it should loopback */ if (ether_addr_equal_unaligned(dmac, smac) || hr_dev->loop_idc == 0x1) { @@ -4053,7 +4054,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SGID_IDX_M, V2_QPC_BYTE_20_SGID_IDX_S, 0); - dmac = (u8 *)attr->ah_attr.roce.dmac; memcpy(&(context->dmac), dmac, sizeof(u32)); roce_set_field(context->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M, V2_QPC_BYTE_52_DMAC_S, *((u16 *)(&dmac[4]))); -- cgit v1.2.3 From 193ba03141bb987c3af985f6479840030fec0534 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 2 Jun 2020 14:16:35 +0800 Subject: IB/hfi1: Use free_netdev() in hfi1_netdev_free() dummy_netdev shold be freed by free_netdev() instead of kfree(). Also remove unneeded variable 'priv' Fixes: 4730f4a6c6b2 ("IB/hfi1: Activate the dummy netdev") Link: https://lore.kernel.org/r/20200602061635.31224-1-yuehaibing@huawei.com Signed-off-by: YueHaibing Reported-by: kbuild test robot Reported-by: Dan Carpenter Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/netdev_rx.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c index 58af6a454761..63688e85e8da 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -371,12 +371,9 @@ int hfi1_netdev_alloc(struct hfi1_devdata *dd) void hfi1_netdev_free(struct hfi1_devdata *dd) { - struct hfi1_netdev_priv *priv; - if (dd->dummy_netdev) { - priv = hfi1_netdev_priv(dd->dummy_netdev); dd_dev_info(dd, "hfi1 netdev freed\n"); - kfree(dd->dummy_netdev); + free_netdev(dd->dummy_netdev); dd->dummy_netdev = NULL; } } -- cgit v1.2.3 From 6512f11d386c7cf83a48e71cfd7c7c1b0003c151 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Jun 2020 15:55:46 +0300 Subject: RDMA/mlx5: Return an error if copy_to_user fails In theoretical event, the ib_copy_to_udata() can fail, so return -EFAULT error to the user, so he will destroy the QP. Fixes: 50aec2c3135e ("RDMA/mlx5: Return ECE data after modify QP") Link: https://lore.kernel.org/r/20200602125548.172654-2-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 9364a7a76ac2..9f0b7f1908da 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4305,12 +4305,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, /* resp.response_length is set in ECE supported flows only */ if (!err && resp.response_length && udata->outlen >= resp.response_length) - /* - * We don't check return value of the function below - * on purpose, because it is unclear how to unwind the - * error flow after QP was modified to the new state. - */ - ib_copy_to_udata(udata, &resp, resp.response_length); + /* Return -EFAULT to the user and expect him to destroy QP. */ + err = ib_copy_to_udata(udata, &resp, resp.response_length); out: mutex_unlock(&qp->mutex); -- cgit v1.2.3 From 92cd667c0e8a67de024134be0a6f0bdb320606a8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Jun 2020 15:55:47 +0300 Subject: RDMA/mlx5: Don't rely on FW to set zeros in ECE response The FW returns zeros in case feature is not enabled, but it is better to have the capability check and ensure that returned result is cleared. Fixes: 3e09a427ae7a ("RDMA/mlx5: Get ECE options from FW during create QP") Link: https://lore.kernel.org/r/20200602125548.172654-3-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 9f0b7f1908da..18135f908971 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1906,7 +1906,8 @@ static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, base->container_mibqp = qp; base->mqp.event = mlx5_ib_qp_event; - params->resp.ece_options = MLX5_GET(create_qp_out, out, ece); + if (MLX5_CAP_GEN(mdev, ece_support)) + params->resp.ece_options = MLX5_GET(create_qp_out, out, ece); spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); list_add_tail(&qp->qps_list, &dev->qp_list); @@ -2082,7 +2083,8 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, base->container_mibqp = qp; base->mqp.event = mlx5_ib_qp_event; - params->resp.ece_options = MLX5_GET(create_qp_out, out, ece); + if (MLX5_CAP_GEN(mdev, ece_support)) + params->resp.ece_options = MLX5_GET(create_qp_out, out, ece); get_cqs(qp->type, init_attr->send_cq, init_attr->recv_cq, &send_cq, &recv_cq); -- cgit v1.2.3 From a645a89d9a780a8fbb6e283f84fc91ad538c2edc Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Jun 2020 15:55:48 +0300 Subject: RDMA/mlx5: Return ECE DC support The DC QPs are many-to-one QP types that means that first connection will establish ECE options that coming connections should follow. Due to this property, the ECE code was removed between first [1] and second [2] ECE submissions. This patch returns the dropped code, because ECE is a property of a connection and like any other connection users are needed to manage this data. Allow them to set ECE parameter for DC too and avoid need of having compatibility flag for the DC ECE. [1] https://lore.kernel.org/linux-rdma/20200523132243.817936-1-leon@kernel.org/ [2] https://lore.kernel.org/linux-rdma/20200525174401.71152-1-leon@kernel.org/ Link: https://lore.kernel.org/r/20200602125548.172654-4-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 45 ++++++++++++++++++++++++++++------------- include/linux/mlx5/mlx5_ifc.h | 5 +++-- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 18135f908971..81bf6b975e0e 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2404,7 +2404,8 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, destroy_qp(dev, qp, base, udata); } -static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, +static int create_dct(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct mlx5_create_qp_params *params) { struct ib_qp_init_attr *attr = params->attr; @@ -2423,6 +2424,8 @@ static int create_dct(struct ib_pd *pd, struct mlx5_ib_qp *qp, MLX5_SET(dctc, dctc, cqn, to_mcq(attr->recv_cq)->mcq.cqn); MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key); MLX5_SET(dctc, dctc, user_index, uidx); + if (MLX5_CAP_GEN(dev->mdev, ece_support)) + MLX5_SET(dctc, dctc, ece, ucmd->ece_options); if (qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) { int rcqe_sz = mlx5_ib_get_cqe_size(attr->recv_cq); @@ -2768,7 +2771,7 @@ static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, } if (qp->type == MLX5_IB_QPT_DCT) { - err = create_dct(pd, qp, params); + err = create_dct(dev, pd, qp, params); goto out; } @@ -2882,9 +2885,8 @@ static int check_ucmd_data(struct mlx5_ib_dev *dev, */ last = sizeof(struct mlx5_ib_create_qp_rss); else - /* IB_QPT_RAW_PACKET and IB_QPT_DRIVER don't have ECE data */ + /* IB_QPT_RAW_PACKET doesn't have ECE data */ switch (attr->qp_type) { - case IB_QPT_DRIVER: case IB_QPT_RAW_PACKET: last = offsetof(struct mlx5_ib_create_qp, ece_options); break; @@ -4095,7 +4097,8 @@ static bool modify_dci_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state new * Other transitions and attributes are illegal */ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_udata *udata) + int attr_mask, struct mlx5_ib_modify_qp *ucmd, + struct ib_udata *udata) { struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_ib_dev *dev = to_mdev(ibqp->device); @@ -4111,6 +4114,15 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_state = attr->qp_state; dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); + if (MLX5_CAP_GEN(dev->mdev, ece_support) && ucmd->ece_options) + /* + * DCT doesn't initialize QP till modify command is executed, + * so we need to overwrite previously set ECE field if user + * provided any value except zero, which means not set/not + * valid. + */ + MLX5_SET(dctc, dctc, ece, ucmd->ece_options); + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { u16 set_id; @@ -4145,14 +4157,21 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, MLX5_SET(dctc, dctc, counter_set_id, set_id); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { struct mlx5_ib_modify_qp_resp resp = {}; - u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0}; - u32 min_resp_len = offsetof(typeof(resp), dctn) + - sizeof(resp.dctn); + u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {}; + u32 min_resp_len = offsetofend(typeof(resp), dctn); if (udata->outlen < min_resp_len) return -EINVAL; resp.response_length = min_resp_len; + /* + * If we don't have enough space for the ECE options, + * simply indicate it with resp.response_length. + */ + resp.response_length = (udata->outlen < sizeof(resp)) ? + min_resp_len : + sizeof(resp); + required |= IB_QP_MIN_RNR_TIMER | IB_QP_AV | IB_QP_PATH_MTU; if (!is_valid_mask(attr_mask, required, 0)) return -EINVAL; @@ -4169,6 +4188,8 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (err) return err; resp.dctn = qp->dct.mdct.mqp.qpn; + if (MLX5_CAP_GEN(dev->mdev, ece_support)) + resp.ece_options = MLX5_GET(create_dct_out, out, ece); err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) { mlx5_core_destroy_dct(dev, &qp->dct.mdct); @@ -4226,12 +4247,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? IB_QPT_GSI : qp->type; - if (qp_type == MLX5_IB_QPT_DCT) { - if (memchr_inv(&ucmd.ece_options, 0, sizeof(ucmd.ece_options))) - return -EOPNOTSUPP; - - return mlx5_ib_modify_dct(ibqp, attr, attr_mask, udata); - } + if (qp_type == MLX5_IB_QPT_DCT) + return mlx5_ib_modify_dct(ibqp, attr, attr_mask, &ucmd, udata); mutex_lock(&qp->mutex); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4d2e36dd2c6b..116bd9bb347f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -3690,7 +3690,8 @@ struct mlx5_ifc_dctc_bits { u8 ecn[0x2]; u8 dscp[0x6]; - u8 reserved_at_1c0[0x40]; + u8 reserved_at_1c0[0x20]; + u8 ece[0x20]; }; enum { @@ -7940,7 +7941,7 @@ struct mlx5_ifc_create_dct_out_bits { u8 reserved_at_40[0x8]; u8 dctn[0x18]; - u8 reserved_at_60[0x20]; + u8 ece[0x20]; }; struct mlx5_ifc_create_dct_in_bits { -- cgit v1.2.3 From fba97dc7fc76b2c9a909fa0b3786d30a9899f5cf Mon Sep 17 00:00:00 2001 From: Ka-Cheong Poon Date: Wed, 3 Jun 2020 06:33:38 -0700 Subject: RDMA/cm: Spurious WARNING triggered in cm_destroy_id() If the cm_id state is IB_CM_REP_SENT when cm_destroy_id() is called, it calls cm_send_rej_locked(). In cm_send_rej_locked(), it calls cm_enter_timewait() and the state is changed to IB_CM_TIMEWAIT. Now back to cm_destroy_id(), it breaks from the switch statement, and the next call is WARN_ON(cm_id->state != IB_CM_IDLE). This triggers a spurious warning. Instead, the code should goto retest after returning from cm_send_rej_locked() to move the state to IDLE. Fixes: 67b3c8dceac6 ("RDMA/cm: Make sure the cm_id is in the IB_CM_IDLE state in destroy") Link: https://lore.kernel.org/r/1591191218-9446-1-git-send-email-ka-cheong.poon@oracle.com Signed-off-by: Ka-Cheong Poon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 085c146fe400..9ce787e37e22 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1111,7 +1111,9 @@ retest: case IB_CM_REP_SENT: case IB_CM_MRA_REP_RCVD: ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); - /* Fall through */ + cm_send_rej_locked(cm_id_priv, IB_CM_REJ_CONSUMER_DEFINED, NULL, + 0, NULL, 0); + goto retest; case IB_CM_MRA_REQ_SENT: case IB_CM_REP_RCVD: case IB_CM_MRA_REP_SENT: -- cgit v1.2.3