From 2a152512a155aaf27c3e67834ffafaed9525a7b5 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 3 Oct 2021 13:56:04 +0300 Subject: RDMA/efa: CQ notifications This patch adds support for CQ notifications through the standard verbs API. In order to achieve that, a new event queue (EQ) object is introduced, which is in charge of reporting completion events to the driver. On driver load, EQs are allocated and their affinity is set to a single CPU. When a user app creates a CQ with a completion channel, the completion vector number is converted to an EQ number, which is in charge of reporting the CQ events. In addition, the CQ creation admin command now returns an offset for the CQ doorbell, which is mapped to the userspace provider and is used to arm the CQ when requested by the user. The EQs use a single doorbell (located on the registers BAR), which encodes the EQ number and arm as part of the doorbell value. The driver polls an EQ on each new EQE and arms it when the poll is completed. Link: https://lore.kernel.org/r/20211003105605.29222-1-galpress@amazon.com Reviewed-by: Firas JahJah Reviewed-by: Yossi Leybovich Signed-off-by: Gal Pressman Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/efa-abi.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include/uapi/rdma') diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index f89fbb5b1e8d..08035ccf1fff 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ /* - * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef EFA_ABI_USER_H @@ -52,11 +52,20 @@ struct efa_ibv_alloc_pd_resp { __u8 reserved_30[2]; }; +enum { + EFA_CREATE_CQ_WITH_COMPLETION_CHANNEL = 1 << 0, +}; + struct efa_ibv_create_cq { __u32 comp_mask; __u32 cq_entry_size; __u16 num_sub_cqs; - __u8 reserved_50[6]; + __u8 flags; + __u8 reserved_58[5]; +}; + +enum { + EFA_CREATE_CQ_RESP_DB_OFF = 1 << 0, }; struct efa_ibv_create_cq_resp { @@ -65,7 +74,9 @@ struct efa_ibv_create_cq_resp { __aligned_u64 q_mmap_key; __aligned_u64 q_mmap_size; __u16 cq_idx; - __u8 reserved_d0[6]; + __u8 reserved_d0[2]; + __u32 db_off; + __aligned_u64 db_mmap_key; }; enum { @@ -106,6 +117,7 @@ struct efa_ibv_create_ah_resp { enum { EFA_QUERY_DEVICE_CAPS_RDMA_READ = 1 << 0, EFA_QUERY_DEVICE_CAPS_RNR_RETRY = 1 << 1, + EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS = 1 << 2, }; struct efa_ibv_ex_query_device_resp { -- cgit v1.2.3 From 7301d0a9834c7f1f0c91c1f0a46c7b191b1fd0da Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Fri, 8 Oct 2021 15:24:33 +0300 Subject: RDMA/nldev: Add support to get status of all counters This patch adds the ability to get the name, index and status of all counters for each link through RDMA netlink. User space can use this to get the current optional-counter mode. 
Examples: $ rdma statistic mode link rocep8s0f0/1 optional-counters cc_rx_ce_pkts $ rdma statistic mode supported link rocep8s0f0/1 supported optional-counters cc_rx_ce_pkts,cc_rx_cnp_pkts,cc_tx_cnp_pkts link rocep8s0f1/1 supported optional-counters cc_rx_ce_pkts,cc_rx_cnp_pkts,cc_tx_cnp_pkts Link: https://lore.kernel.org/r/20211008122439.166063-8-markzhang@nvidia.com Signed-off-by: Aharon Landau Signed-off-by: Neta Ostrovsky Signed-off-by: Leon Romanovsky Signed-off-by: Mark Zhang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 98 ++++++++++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 5 ++ 2 files changed, 103 insertions(+) (limited to 'include/uapi/rdma') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 67519730b1ac..210057fef7bd 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -154,6 +154,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 }, [RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -2264,6 +2266,99 @@ static int nldev_stat_get_dumpit(struct sk_buff *skb, return ret; } +static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX], *table, *entry; + struct rdma_hw_stats *stats; + struct ib_device *device; + struct sk_buff *msg; + u32 devid, port; + int ret, i; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + devid = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + 
device = ib_device_get_by_index(sock_net(skb->sk), devid); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + stats = ib_get_hw_stats_port(device, port); + if (!stats) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put( + msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_STAT_GET_STATUS), + 0, 0); + + ret = -EMSGSIZE; + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) + goto err_msg; + + table = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table) + goto err_msg; + + mutex_lock(&stats->lock); + for (i = 0; i < stats->num_counters; i++) { + entry = nla_nest_start(msg, + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); + if (!entry) + goto err_msg_table; + + if (nla_put_string(msg, + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, + stats->descs[i].name) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX, i)) + goto err_msg_entry; + + if ((stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) && + (nla_put_u8(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC, + !test_bit(i, stats->is_disabled)))) + goto err_msg_entry; + + nla_nest_end(msg, entry); + } + mutex_unlock(&stats->lock); + + nla_nest_end(msg, table); + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(sock_net(skb->sk), msg, NETLINK_CB(skb).portid); + +err_msg_entry: + nla_nest_cancel(msg, entry); +err_msg_table: + mutex_unlock(&stats->lock); + nla_nest_cancel(msg, table); +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -2353,6 +2448,9 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .dump = 
nldev_res_get_mr_raw_dumpit, .flags = RDMA_NL_ADMIN_PERM, }, + [RDMA_NLDEV_CMD_STAT_GET_STATUS] = { + .doit = nldev_stat_get_counter_status_doit, + }, }; void __init nldev_init(void) diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 75a1ae2311d8..e50c357367db 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -297,6 +297,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_RES_SRQ_GET, /* can dump */ + RDMA_NLDEV_CMD_STAT_GET_STATUS, + RDMA_NLDEV_NUM_OPS }; @@ -549,6 +551,9 @@ enum rdma_nldev_attr { RDMA_NLDEV_SYS_ATTR_COPY_ON_FORK, /* u8 */ + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_INDEX, /* u32 */ + RDMA_NLDEV_ATTR_STAT_HWCOUNTER_DYNAMIC, /* u8 */ + /* * Always the end */ -- cgit v1.2.3 From cfc0312d9c83a5bbb66fa73ba47dd1301d75b2e8 Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 7 Oct 2021 15:40:47 -0500 Subject: RDMA/rxe: Move AV from rxe_send_wqe to rxe_send_wr Move the struct rxe_av av from struct rxe_send_wqe to struct rxe_send_wr placing it in wr.ud at the same offset as it was previously. This has the effect of increasing the size of struct rxe_send_wr while keeping the size of struct rxe_send_wqe the same. This better reflects the use of this field which is only used for UD sends. This change has no effect on ABI compatibility so the modified rxe driver will operate with older versions of rdma-core. 
Link: https://lore.kernel.org/r/20211007204051.10086-2-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_av.c | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.c | 3 ++- include/uapi/rdma/rdma_user_rxe.h | 4 +++- 3 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/uapi/rdma') diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c index da2e867a1ed9..85580ea5eed0 100644 --- a/drivers/infiniband/sw/rxe/rxe_av.c +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -107,5 +107,5 @@ struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt) if (qp_type(pkt->qp) == IB_QPT_RC || qp_type(pkt->qp) == IB_QPT_UC) return &pkt->qp->pri_av; - return (pkt->wqe) ? &pkt->wqe->av : NULL; + return (pkt->wqe) ? &pkt->wqe->wr.wr.ud.av : NULL; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index c49ba0381964..4233fd9edfd1 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -581,7 +581,8 @@ static void init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, if (qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_SMI || qp_type(qp) == IB_QPT_GSI) - memcpy(&wqe->av, &to_rah(ud_wr(ibwr)->ah)->av, sizeof(wqe->av)); + memcpy(&wqe->wr.wr.ud.av, &to_rah(ud_wr(ibwr)->ah)->av, + sizeof(struct rxe_av)); if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) copy_inline_data_to_wqe(wqe, ibwr); diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index e283c2220aba..2f1ebbe96434 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -98,6 +98,9 @@ struct rxe_send_wr { __u32 remote_qpn; __u32 remote_qkey; __u16 pkey_index; + __u16 reserved; + __u32 pad[5]; + struct rxe_av av; } ud; struct { __aligned_u64 addr; @@ -148,7 +151,6 @@ struct rxe_dma_info { struct rxe_send_wqe { struct rxe_send_wr wr; - struct rxe_av av; __u32 status; __u32 state; __aligned_u64 
iova; -- cgit v1.2.3 From 73a54932100375ba94b31710f1e3f1234f23be0b Mon Sep 17 00:00:00 2001 From: Bob Pearson Date: Thu, 7 Oct 2021 15:40:49 -0500 Subject: RDMA/rxe: Create AH index and return to user space Make changes to rdma_user_rxe.h to allow indexing AH objects, passing the index in UD send WRs to the driver and returning the index to the rxe provider. Modify rxe_create_ah() to add an index to AH when created and if called from a new user provider return it to user space. If called from an old provider mark the AH as not having a useful index. Modify rxe_destroy_ah to drop the index before deleting the object. Link: https://lore.kernel.org/r/20211007204051.10086-4-rpearsonhpe@gmail.com Signed-off-by: Bob Pearson Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_verbs.c | 31 ++++++++++++++++++++++++++++++- drivers/infiniband/sw/rxe/rxe_verbs.h | 2 ++ include/uapi/rdma/rdma_user_rxe.h | 8 +++++++- 3 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include/uapi/rdma') diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 4233fd9edfd1..e20f2dd20639 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -158,9 +158,19 @@ static int rxe_create_ah(struct ib_ah *ibah, struct ib_udata *udata) { - int err; struct rxe_dev *rxe = to_rdev(ibah->device); struct rxe_ah *ah = to_rah(ibah); + struct rxe_create_ah_resp __user *uresp = NULL; + int err; + + if (udata) { + /* test if new user provider */ + if (udata->outlen >= sizeof(*uresp)) + uresp = udata->outbuf; + ah->is_user = true; + } else { + ah->is_user = false; + } err = rxe_av_chk_attr(rxe, init_attr->ah_attr); if (err) @@ -170,6 +180,24 @@ static int rxe_create_ah(struct ib_ah *ibah, if (err) return err; + /* create index > 0 */ + rxe_add_index(ah); + ah->ah_num = ah->pelem.index; + + if (uresp) { + /* only if new user provider */ + err = copy_to_user(&uresp->ah_num, &ah->ah_num, + sizeof(uresp->ah_num)); 
+ if (err) { + rxe_drop_index(ah); + rxe_drop_ref(ah); + return -EFAULT; + } + } else if (ah->is_user) { + /* only if old user provider */ + ah->ah_num = 0; + } + rxe_init_av(init_attr->ah_attr, &ah->av); return 0; } @@ -202,6 +230,7 @@ static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) { struct rxe_ah *ah = to_rah(ibah); + rxe_drop_index(ah); rxe_drop_ref(ah); return 0; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 098fde693dbd..c56fae23b6a9 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -48,6 +48,8 @@ struct rxe_ah { struct rxe_pool_entry pelem; struct rxe_pd *pd; struct rxe_av av; + bool is_user; + int ah_num; }; struct rxe_cqe { diff --git a/include/uapi/rdma/rdma_user_rxe.h b/include/uapi/rdma/rdma_user_rxe.h index 2f1ebbe96434..dc9f7a5e203a 100644 --- a/include/uapi/rdma/rdma_user_rxe.h +++ b/include/uapi/rdma/rdma_user_rxe.h @@ -99,7 +99,8 @@ struct rxe_send_wr { __u32 remote_qkey; __u16 pkey_index; __u16 reserved; - __u32 pad[5]; + __u32 ah_num; + __u32 pad[4]; struct rxe_av av; } ud; struct { @@ -170,6 +171,11 @@ struct rxe_recv_wqe { struct rxe_dma_info dma; }; +struct rxe_create_ah_resp { + __u32 ah_num; + __u32 reserved; +}; + struct rxe_create_cq_resp { struct mminfo mi; }; -- cgit v1.2.3