From e19c0d237873be2426dac45887edf293da13c339 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Apr 2018 07:52:02 +0300 Subject: RDMA/rdma_cm: Remove process_req and timer sorting Now that the work queue is used directly to launch and track the work there is no need for the second processing function to do 'all list entries'. Just schedule all entries onto the main work queue directly. We can also drop all of the useless list sorting now, as the workqueue sorts by expiration time automatically. This change requires switching lock to a spinlock as netdev notifiers are called in an atomic context, this is now easy since the lock does not need to be held across the lookup, that is already single threaded due to the work queue. Signed-off-by: Leon Romanovsky Reviewed-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 96 +++++++++++------------------------------- 1 file changed, 25 insertions(+), 71 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 88a7542d8c7b..8ef4b98e6a3a 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -68,11 +68,8 @@ struct addr_req { static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0); -static void process_req(struct work_struct *work); - -static DEFINE_MUTEX(lock); +static DEFINE_SPINLOCK(lock); static LIST_HEAD(req_list); -static DECLARE_DELAYED_WORK(work, process_req); static struct workqueue_struct *addr_wq; static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = { @@ -112,7 +109,7 @@ static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh) memcpy(&gid, nla_data(curr), nla_len(curr)); } - mutex_lock(&lock); + spin_lock_bh(&lock); list_for_each_entry(req, &req_list, list) { if (nlh->nlmsg_seq != req->seq) continue; @@ -122,7 +119,7 @@ static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh) found = 1; break; } - mutex_unlock(&lock); + spin_unlock_bh(&lock); if (!found) pr_info("Couldn't find request waiting for DGID: %pI6\n", @@ -302,7 +299,7 @@ int rdma_translate_ip(const struct sockaddr *addr, } EXPORT_SYMBOL(rdma_translate_ip); -static void set_timeout(struct delayed_work *delayed_work, unsigned long time) +static void set_timeout(struct addr_req *req, unsigned long time) { unsigned long delay; @@ -310,23 +307,15 @@ static void set_timeout(struct delayed_work *delayed_work, unsigned long time) if ((long)delay < 0) delay = 0; - mod_delayed_work(addr_wq, delayed_work, delay); + mod_delayed_work(addr_wq, &req->work, delay); } static void queue_req(struct addr_req *req) { - struct addr_req *temp_req; - - mutex_lock(&lock); - list_for_each_entry_reverse(temp_req, &req_list, list) { - if (time_after_eq(req->timeout, temp_req->timeout)) - break; - } - - list_add(&req->list, &temp_req->list); - - set_timeout(&req->work, req->timeout); - mutex_unlock(&lock); + spin_lock_bh(&lock); + list_add_tail(&req->list, &req_list); + set_timeout(req, req->timeout); + spin_unlock_bh(&lock); } static int ib_nl_fetch_ha(const struct dst_entry *dst, @@ -584,7 +573,6 @@ static void process_one_req(struct work_struct *_work) struct addr_req *req; struct sockaddr *src_in, *dst_in; - mutex_lock(&lock); req = container_of(_work, struct addr_req, work.work); if (req->status == -ENODATA) { @@ -596,13 +584,15 @@ static void process_one_req(struct work_struct *_work) req->status = -ETIMEDOUT; } else if (req->status == -ENODATA) { /* requeue the work for retrying again */ - set_timeout(&req->work, req->timeout); - 
mutex_unlock(&lock); + spin_lock_bh(&lock); + set_timeout(req, req->timeout); + spin_unlock_bh(&lock); return; } } + spin_lock_bh(&lock); list_del(&req->list); - mutex_unlock(&lock); + spin_unlock_bh(&lock); /* * Although the work will normally have been canceled by the @@ -619,47 +609,6 @@ static void process_one_req(struct work_struct *_work) kfree(req); } -static void process_req(struct work_struct *work) -{ - struct addr_req *req, *temp_req; - struct sockaddr *src_in, *dst_in; - struct list_head done_list; - - INIT_LIST_HEAD(&done_list); - - mutex_lock(&lock); - list_for_each_entry_safe(req, temp_req, &req_list, list) { - if (req->status == -ENODATA) { - src_in = (struct sockaddr *) &req->src_addr; - dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve(src_in, dst_in, req->addr, - true, req->seq); - if (req->status && time_after_eq(jiffies, req->timeout)) - req->status = -ETIMEDOUT; - else if (req->status == -ENODATA) { - set_timeout(&req->work, req->timeout); - continue; - } - } - list_move_tail(&req->list, &done_list); - } - - mutex_unlock(&lock); - - list_for_each_entry_safe(req, temp_req, &done_list, list) { - list_del(&req->list); - /* It is safe to cancel other work items from this work item - * because at a time there can be only one work item running - * with this single threaded work queue. - */ - cancel_delayed_work(&req->work); - req->callback(req->status, (struct sockaddr *) &req->src_addr, - req->addr, req->context); - put_client(req->client); - kfree(req); - } -} - int rdma_resolve_ip(struct rdma_addr_client *client, struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, @@ -743,17 +692,16 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; - mutex_lock(&lock); + spin_lock_bh(&lock); list_for_each_entry_safe(req, temp_req, &req_list, list) { if (req->addr == addr) { req->status = -ECANCELED; req->timeout = jiffies; - list_move(&req->list, &req_list); - set_timeout(&req->work, req->timeout); + set_timeout(req, req->timeout); break; } } - mutex_unlock(&lock); + spin_unlock_bh(&lock); } EXPORT_SYMBOL(rdma_addr_cancel); @@ -810,11 +758,17 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, static int netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { + struct addr_req *req; + if (event == NETEVENT_NEIGH_UPDATE) { struct neighbour *neigh = ctx; - if (neigh->nud_state & NUD_VALID) - set_timeout(&work, jiffies); + if (neigh->nud_state & NUD_VALID) { + spin_lock_bh(&lock); + list_for_each_entry(req, &req_list, list) + set_timeout(req, jiffies); + spin_unlock_bh(&lock); + } } return 0; } -- cgit v1.2.3 From 44e75052bc2ae4d39386c1d9e218861639905873 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Apr 2018 07:52:03 +0300 Subject: RDMA/rdma_cm: Make rdma_addr_cancel into a fence Currently rdma_addr_cancel does not prevent the callback from being used, this is surprising and hard to reason about. There does not appear to be a bug here as the only user of this API does refcount properly, fixing it only to increase clarity. 
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 58 +++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 18 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 8ef4b98e6a3a..9756cfbdef0e 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -585,28 +585,30 @@ static void process_one_req(struct work_struct *_work) } else if (req->status == -ENODATA) { /* requeue the work for retrying again */ spin_lock_bh(&lock); - set_timeout(req, req->timeout); + if (!list_empty(&req->list)) + set_timeout(req, req->timeout); spin_unlock_bh(&lock); return; } } - spin_lock_bh(&lock); - list_del(&req->list); - spin_unlock_bh(&lock); - - /* - * Although the work will normally have been canceled by the - * workqueue, it can still be requeued as long as it is on the - * req_list, so it could have been requeued before we grabbed &lock. - * We need to cancel it after it is removed from req_list to really be - * sure it is safe to free. - */ - cancel_delayed_work(&req->work); req->callback(req->status, (struct sockaddr *)&req->src_addr, req->addr, req->context); - put_client(req->client); - kfree(req); + req->callback = NULL; + + spin_lock_bh(&lock); + if (!list_empty(&req->list)) { + /* + * Although the work will normally have been canceled by the + * workqueue, it can still be requeued as long as it is on the + * req_list. + */ + cancel_delayed_work(&req->work); + list_del_init(&req->list); + put_client(req->client); + kfree(req); + } + spin_unlock_bh(&lock); } int rdma_resolve_ip(struct rdma_addr_client *client, @@ -691,17 +693,37 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; + struct addr_req *found = NULL; spin_lock_bh(&lock); list_for_each_entry_safe(req, temp_req, &req_list, list) { if (req->addr == addr) { - req->status = -ECANCELED; - req->timeout = jiffies; - set_timeout(req, req->timeout); + /* + * Removing from the list means we take ownership of + * the req + */ + list_del_init(&req->list); + found = req; break; } } spin_unlock_bh(&lock); + + if (!found) + return; + + /* + * sync canceling the work after removing it from the req_list + * guarentees no work is running and none will be started. + */ + cancel_delayed_work_sync(&found->work); + + if (found->callback) + found->callback(-ECANCELED, (struct sockaddr *)&found->src_addr, + found->addr, found->context); + + put_client(found->client); + kfree(found); } EXPORT_SYMBOL(rdma_addr_cancel); -- cgit v1.2.3 From ee6548d1d98df7df3b9c8103a42cf68b31c29417 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 3 Apr 2018 07:52:04 +0300 Subject: RDMA/rdma_cm: Delete rdma_addr_client The only thing it does is block module unload while work is posted from rdma_resolve_ip(). However, this is not the right place to do this. The users of rdma_resolve_ip() must ensure their own module does not unload until rdma_resolve_ip() calls the callback, or until rdma_addr_cancel() is called. Similarly callers to rdma_addr_find_l2_eth_by_grh() must ensure their module does not unload while they are calling code. The only two users are already safe, so there is no need for this. 
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 38 ++++---------------------------------- drivers/infiniband/core/cma.c | 6 +----- include/rdma/ib_addr.h | 20 +------------------- 3 files changed, 6 insertions(+), 58 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9756cfbdef0e..4f32c4062fb6 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -56,7 +56,6 @@ struct addr_req { struct sockaddr_storage src_addr; struct sockaddr_storage dst_addr; struct rdma_dev_addr *addr; - struct rdma_addr_client *client; void *context; void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context); @@ -220,28 +219,6 @@ int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr) } EXPORT_SYMBOL(rdma_addr_size_kss); -static struct rdma_addr_client self; - -void rdma_addr_register_client(struct rdma_addr_client *client) -{ - atomic_set(&client->refcount, 1); - init_completion(&client->comp); -} -EXPORT_SYMBOL(rdma_addr_register_client); - -static inline void put_client(struct rdma_addr_client *client) -{ - if (atomic_dec_and_test(&client->refcount)) - complete(&client->comp); -} - -void rdma_addr_unregister_client(struct rdma_addr_client *client) -{ - put_client(client); - wait_for_completion(&client->comp); -} -EXPORT_SYMBOL(rdma_addr_unregister_client); - void rdma_copy_addr(struct rdma_dev_addr *dev_addr, const struct net_device *dev, const unsigned char *dst_dev_addr) @@ -605,14 +582,12 @@ static void process_one_req(struct work_struct *_work) */ cancel_delayed_work(&req->work); list_del_init(&req->list); - put_client(req->client); kfree(req); } spin_unlock_bh(&lock); } -int rdma_resolve_ip(struct rdma_addr_client *client, - struct sockaddr *src_addr, struct sockaddr *dst_addr, +int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), @@ -644,8 +619,6 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->addr = addr; req->callback = callback; req->context = context; - req->client = client; - atomic_inc(&client->refcount); INIT_DELAYED_WORK(&req->work, process_one_req); req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); @@ -661,7 +634,6 @@ int rdma_resolve_ip(struct rdma_addr_client *client, break; default: ret = req->status; - atomic_dec(&client->refcount); goto err; } return ret; @@ -722,7 +694,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) found->callback(-ECANCELED, (struct sockaddr *)&found->src_addr, found->addr, found->context); - put_client(found->client); kfree(found); } EXPORT_SYMBOL(rdma_addr_cancel); @@ -761,8 +732,8 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, dev_addr.net = &init_net; init_completion(&ctx.comp); - ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, - &dev_addr, 1000, resolve_cb, &ctx); + ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr, + &dev_addr, 1000, resolve_cb, &ctx); if (ret) return ret; @@ -806,14 +777,13 @@ int addr_init(void) return -ENOMEM; register_netevent_notifier(&nb); - rdma_addr_register_client(&self); return 0; } void addr_cleanup(void) { - rdma_addr_unregister_client(&self); unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); + WARN_ON(!list_empty(&req_list)); } diff --git a/drivers/infiniband/core/cma.c 
b/drivers/infiniband/core/cma.c index 51a641002e10..48300838e354 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -156,7 +156,6 @@ static struct ib_client cma_client = { }; static struct ib_sa_client sa_client; -static struct rdma_addr_client addr_client; static LIST_HEAD(dev_list); static LIST_HEAD(listen_any_list); static DEFINE_MUTEX(lock); @@ -2910,7 +2909,7 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { - ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv), + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, &id->route.addr.dev_addr, timeout_ms, addr_handler, id_priv); } @@ -4547,7 +4546,6 @@ static int __init cma_init(void) goto err_wq; ib_sa_register_client(&sa_client); - rdma_addr_register_client(&addr_client); register_netdevice_notifier(&cma_nb); ret = ib_register_client(&cma_client); @@ -4561,7 +4559,6 @@ static int __init cma_init(void) err: unregister_netdevice_notifier(&cma_nb); - rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); err_wq: destroy_workqueue(cma_wq); @@ -4574,7 +4571,6 @@ static void __exit cma_cleanup(void) rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); - rdma_addr_unregister_client(&addr_client); ib_sa_unregister_client(&sa_client); unregister_pernet_subsys(&cma_pernet_operations); destroy_workqueue(cma_wq); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index a08cc7278980..c2c8b1fdeead 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -49,22 +49,6 @@ #include #include -struct rdma_addr_client { - atomic_t refcount; - struct completion comp; -}; - -/** - * rdma_addr_register_client - Register an address client. - */ -void rdma_addr_register_client(struct rdma_addr_client *client); - -/** - * rdma_addr_unregister_client - Deregister an address client. - * @client: Client object to deregister. - */ -void rdma_addr_unregister_client(struct rdma_addr_client *client); - /** * struct rdma_dev_addr - Contains resolved RDMA hardware addresses * @src_dev_addr: Source MAC address. @@ -99,7 +83,6 @@ int rdma_translate_ip(const struct sockaddr *addr, /** * rdma_resolve_ip - Resolve source and destination IP addresses to * RDMA hardware addresses. - * @client: Address client associated with request. * @src_addr: An optional source address to use in the resolution. If a * source address is not provided, a usable address will be returned via * the callback. @@ -112,8 +95,7 @@ int rdma_translate_ip(const struct sockaddr *addr, * or been canceled. A status of 0 indicates success. * @context: User-specified context associated with the call. */ -int rdma_resolve_ip(struct rdma_addr_client *client, - struct sockaddr *src_addr, struct sockaddr *dst_addr, +int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), -- cgit v1.2.3 From 0f02ba7ed16acdbc8c4f0b46a6fee81bb94f3407 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 3 Apr 2018 20:08:20 -0400 Subject: IB/rxe: make the variable static The variable rxe_net_notifier is only used in the file rxe_net.c. So remove it from rxe_net.h file and make it static in the file rxe_net.c. 
CC: Srinivas Eeda CC: Junxiao Bi Signed-off-by: Zhu Yanjun Reviewed-by: Bart Van Assche Reviewed-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 2 +- drivers/infiniband/sw/rxe/rxe_net.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 9da6e37fb70c..241762606a66 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -707,7 +707,7 @@ out: return NOTIFY_OK; } -struct notifier_block rxe_net_notifier = { +static struct notifier_block rxe_net_notifier = { .notifier_call = rxe_notify, }; diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h index 728d8c71b36a..15a0caf98629 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.h +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -43,7 +43,6 @@ struct rxe_recv_sockets { struct socket *sk6; }; -extern struct notifier_block rxe_net_notifier; void rxe_release_udp_tunnel(struct socket *sk); struct rxe_dev *rxe_net_add(struct net_device *ndev); -- cgit v1.2.3 From 76be04500be2d3e3c0ed7e13d3bc4453b985355b Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sat, 14 Apr 2018 07:44:44 -0400 Subject: IB/rxe: avoid export symbols The functions rxe_set_mtu, rxe_add and rxe_remove are only used in their own module. So it is not necessary to export them. CC: Srinivas Eeda CC: Junxiao Bi Signed-off-by: Zhu Yanjun Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index e493fdbd61c6..4d1d96805ca5 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -306,7 +306,6 @@ int rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) return 0; } -EXPORT_SYMBOL(rxe_set_mtu); /* called by ifc layer to create new rxe device. * The caller should allocate memory for rxe by calling ib_alloc_device. @@ -335,7 +334,6 @@ err1: rxe_dev_put(rxe); return err; } -EXPORT_SYMBOL(rxe_add); /* called by the ifc layer to remove a device */ void rxe_remove(struct rxe_dev *rxe) @@ -344,7 +342,6 @@ void rxe_remove(struct rxe_dev *rxe) rxe_dev_put(rxe); } -EXPORT_SYMBOL(rxe_remove); static int __init rxe_module_init(void) { -- cgit v1.2.3 From 39e487faaf706fa94bab4d0cf9f543a3430c746e Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 11 Apr 2018 15:32:25 +0800 Subject: infiniband: i40iw: Replace GFP_ATOMIC with GFP_KERNEL in i40iw_add_mqh_4 i40iw_add_mqh_4() is never called in atomic context, because it calls rtnl_lock(), which can sleep. Despite never getting called from atomic context, i40iw_add_mqh_4() calls kzalloc() with GFP_ATOMIC, which does not sleep for allocation. GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL, which can sleep and improve the possibility of successful allocation. This was found by a static analysis tool named DCNS that I wrote, and I also checked it manually.
Signed-off-by: Jia-Ju Bai Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 4cfa8f4647e2..8310d2488681 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -1788,7 +1788,7 @@ static enum i40iw_status_code i40iw_add_mqh_4( &ifa->ifa_address, rdma_vlan_dev_vlan_id(dev), dev->dev_addr); - child_listen_node = kzalloc(sizeof(*child_listen_node), GFP_ATOMIC); + child_listen_node = kzalloc(sizeof(*child_listen_node), GFP_KERNEL); cm_parent_listen_node->cm_core->stats_listen_nodes_created++; i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM, -- cgit v1.2.3 From f9af8730143a0fdc572f90b8a388795ee812cd74 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 11 Apr 2018 15:32:48 +0800 Subject: infiniband: i40iw: Replace GFP_ATOMIC with GFP_KERNEL in i40iw_make_listen_node i40iw_make_listen_node() is never called in atomic context. i40iw_make_listen_node() is only called by i40iw_create_listen, which is set as ".create_listen" in struct iw_cm_verbs. Despite never getting called from atomic context, i40iw_make_listen_node() calls kzalloc() with GFP_ATOMIC, which does not sleep for allocation. GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL, which can sleep and improve the possibility of successful allocation. This was found by a static analysis tool named DCNS that I wrote, and I also checked it manually. Signed-off-by: Jia-Ju Bai Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 8310d2488681..0243ec48e4b5 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -2872,7 +2872,7 @@ static struct i40iw_cm_listener *i40iw_make_listen_node( if (!listener) { /* create a CM listen node (1/2 node to compare incoming traffic to) */ - listener = kzalloc(sizeof(*listener), GFP_ATOMIC); + listener = kzalloc(sizeof(*listener), GFP_KERNEL); if (!listener) return NULL; cm_core->stats_listen_nodes_created++; -- cgit v1.2.3 From 4e56569cee1505846b3dcb15fbf400f6a7e9f015 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 11 Apr 2018 15:33:06 +0800 Subject: infiniband: i40iw: Replace GFP_ATOMIC with GFP_KERNEL in i40iw_l2param_change i40iw_l2param_change() is never called in atomic context. i40iw_l2param_change() is only set as ".l2_param_change" in struct i40e_client_ops, and this function pointer is not called in atomic context. Despite never getting called from atomic context, i40iw_l2param_change() calls kzalloc() with GFP_ATOMIC, which does not sleep for allocation. GFP_ATOMIC is not necessary and can be replaced with GFP_KERNEL, which can sleep and improve the possibility of successful allocation. This was found by a static analysis tool named DCNS that I wrote, and I also checked it manually.
Signed-off-by: Jia-Ju Bai Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index 9cd0d3ef9057..a220794dcdb0 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -1758,7 +1758,7 @@ static void i40iw_l2param_change(struct i40e_info *ldev, struct i40e_client *cli return; - work = kzalloc(sizeof(*work), GFP_ATOMIC); + work = kzalloc(sizeof(*work), GFP_KERNEL); if (!work) return; -- cgit v1.2.3 From 8f1a72c815cf121f8a842c60c837f0d7605cdad4 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Mon, 9 Apr 2018 08:19:18 -0400 Subject: IB/rxe: make rxe_release_udp_tunnel static The function rxe_release_udp_tunnel is only used in rxe_net.c. So it is necessary to make this function static. CC: Srinivas Eeda CC: Junxiao Bi Signed-off-by: Zhu Yanjun Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_net.c | 2 +- drivers/infiniband/sw/rxe/rxe_net.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 241762606a66..97e128579137 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -315,7 +315,7 @@ static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port, return sock; } -void rxe_release_udp_tunnel(struct socket *sk) +static void rxe_release_udp_tunnel(struct socket *sk) { if (sk) udp_tunnel_sock_release(sk); diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h index 15a0caf98629..106c586dbb26 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.h +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -43,8 +43,6 @@ struct rxe_recv_sockets { struct socket *sk6; }; -void rxe_release_udp_tunnel(struct socket *sk); - struct rxe_dev *rxe_net_add(struct net_device *ndev); int rxe_net_init(void); -- cgit v1.2.3 From 2e47350789ebbc002b06d4549f60b5f9cba326ea Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 10 Apr 2018 00:47:15 -0400 Subject: IB/rxe: optimize the function duplicate_request In the function duplicate_request, the skb reference count can be increased instead of calling the function skb_clone. This will make rxe performance better and save memory. CC: Srinivas Eeda CC: Junxiao Bi Signed-off-by: Zhu Yanjun Reviewed-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_resp.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index a65c9969f7fc..c4172edf1f07 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1124,24 +1124,13 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, /* Find the operation in our list of responder resources. */ res = find_resource(qp, pkt->psn); if (res) { - struct sk_buff *skb_copy; - - skb_copy = skb_clone(res->atomic.skb, GFP_ATOMIC); + skb_get(res->atomic.skb); - if (skb_copy) { - rxe_add_ref(qp); /* for the new SKB */ - } else { - pr_warn("Couldn't clone atomic resp\n"); - rc = RESPST_CLEANUP; - goto out; - } - + skb_get(res->atomic.skb); /* Resend the result. */ rc = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, - pkt, skb_copy); + pkt, res->atomic.skb); if (rc) { pr_err("Failed resending result.
This flow is not handled - skb ignored\n"); - rxe_drop_ref(qp); - kfree_skb(skb_copy); + kfree_skb(res->atomic.skb); rc = RESPST_CLEANUP; goto out; } -- cgit v1.2.3 From fe896ceb577252966ec3339d511424e2495e1072 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 10 Apr 2018 09:37:39 -0400 Subject: IB/rxe: replace refcount_inc with skb_get Follow the advice from Bart, the function refcount_inc is replaced with skb_get in commit 99dae690255e ("IB/rxe: optimize mcast recv process") and commit 86af61764151 ("IB/rxe: remove unnecessary skb_clone"). CC: Srinivas Eeda CC: Junxiao Bi Suggested-by: Bart Van Assche Signed-off-by: Zhu Yanjun Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_recv.c | 2 +- drivers/infiniband/sw/rxe/rxe_resp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index dd80c7d9074a..c4c9f3be33d9 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -311,7 +311,7 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb) * increase the users of the skb then post to the next qp */ if (mce->qp_list.next != &mcg->qp_list) - refcount_inc(&skb->users); + skb_get(skb); pkt->qp = qp; rxe_add_ref(qp); diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index c4172edf1f07..ed402f028471 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -990,7 +990,7 @@ static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt, memset((unsigned char *)SKB_TO_PKT(skb) + sizeof(ack_pkt), 0, sizeof(skb->cb) - sizeof(ack_pkt)); - refcount_inc(&skb->users); + skb_get(skb); res->type = RXE_ATOMIC_MASK; res->atomic.skb = skb; res->first_psn = ack_pkt.psn; -- cgit v1.2.3 From d819734126ce705784ca2cd847ad7623825f1a08 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Tue, 17 Apr 2018 19:53:58 +0530 Subject: infiniband: hw: hfi1: Change return type to vm_fault_t Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Reference id -> 1c8f422059ae ("mm: change return type to vm_fault_t") Signed-off-by: Souptick Joarder Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index da4aa1a95b11..1b778fd16a32 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -110,7 +110,7 @@ static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned long arg); static int ctxt_reset(struct hfi1_ctxtdata *uctxt); static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt, unsigned long arg); -static int vma_fault(struct vm_fault *vmf); +static vm_fault_t vma_fault(struct vm_fault *vmf); static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, unsigned long arg); @@ -591,7 +591,7 @@ done: * Local (non-chip) user memory is not mapped right away but as it is * accessed by the user-level code. 
*/ -static int vma_fault(struct vm_fault *vmf) +static vm_fault_t vma_fault(struct vm_fault *vmf) { struct page *page; -- cgit v1.2.3 From 7991d96dd137408385f425cdf8ff815738ea2b49 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Tue, 17 Apr 2018 20:04:28 +0530 Subject: infiniband: hw: qib: Change return type to vm_fault_t Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Reference id -> 1c8f422059ae ("mm: change return type to vm_fault_t") Signed-off-by: Souptick Joarder Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 6a8800b65047..bbb720bfd030 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -868,7 +868,7 @@ bail: /* * qib_file_vma_fault - handle a VMA page fault. */ -static int qib_file_vma_fault(struct vm_fault *vmf) +static vm_fault_t qib_file_vma_fault(struct vm_fault *vmf) { struct page *page; -- cgit v1.2.3 From 10c47d560603a8ba9d74889028cf6728a2d845ca Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Fri, 20 Apr 2018 17:05:03 +0300 Subject: IB/rxe: Change rxe_rcv to return void It always returns 0. Change return type to void. Signed-off-by: Yuval Shaia Reviewed-by: Zhu Yanjun Reviewed-by: Johannes Thumshirn Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe.h | 2 +- drivers/infiniband/sw/rxe/rxe_loc.h | 5 +++-- drivers/infiniband/sw/rxe/rxe_net.c | 9 ++++++--- drivers/infiniband/sw/rxe/rxe_recv.c | 5 ++--- 4 files changed, 12 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index 561ad307c6ec..1275fde6503a 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -98,7 +98,7 @@ int rxe_add(struct rxe_dev *rxe, unsigned int mtu); void rxe_remove(struct rxe_dev *rxe); void rxe_remove_all(void); -int rxe_rcv(struct sk_buff *skb); +void rxe_rcv(struct sk_buff *skb); static inline void rxe_dev_put(struct rxe_dev *rxe) { diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index b71023c1c58b..2f8ab8eebcb1 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -143,7 +143,7 @@ void rxe_mem_cleanup(struct rxe_pool_entry *arg); int advance_dma_data(struct rxe_dma_info *dma, unsigned int length); /* rxe_net.c */ -int rxe_loopback(struct sk_buff *skb); +void rxe_loopback(struct sk_buff *skb); int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb); struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, int paylen, struct rxe_pkt_info *pkt); @@ -268,7 +268,8 @@ static inline int rxe_xmit_packet(struct rxe_dev *rxe, struct rxe_qp *qp, if (pkt->mask & RXE_LOOPBACK_MASK) { memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt)); - err = rxe_loopback(skb); + rxe_loopback(skb); + err = 0; } else { err = rxe_send(pkt, skb); } diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 97e128579137..fca13a6281f0 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -276,9 +276,12 @@ static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb) pkt->mask = RXE_GRH_MASK; pkt->paylen = 
be16_to_cpu(udph->len) - sizeof(*udph); - return rxe_rcv(skb); + rxe_rcv(skb); + + return 0; drop: kfree_skb(skb); + return 0; } @@ -517,9 +520,9 @@ int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb) return 0; } -int rxe_loopback(struct sk_buff *skb) +void rxe_loopback(struct sk_buff *skb) { - return rxe_rcv(skb); + rxe_rcv(skb); } static inline int addr_same(struct rxe_dev *rxe, struct rxe_av *av) diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index c4c9f3be33d9..dfba44a40f0b 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -345,7 +345,7 @@ static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb) } /* rxe_rcv is called from the interface driver */ -int rxe_rcv(struct sk_buff *skb) +void rxe_rcv(struct sk_buff *skb) { int err; struct rxe_pkt_info *pkt = SKB_TO_PKT(skb); @@ -403,12 +403,11 @@ int rxe_rcv(struct sk_buff *skb) else rxe_rcv_pkt(rxe, pkt, skb); - return 0; + return; drop: if (pkt->qp) rxe_drop_ref(pkt->qp); kfree_skb(skb); - return 0; } -- cgit v1.2.3 From 0dff463a6a867072f2c779e2fed651b498901801 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 20 Apr 2018 10:30:54 -0400 Subject: IB/rxe: change rxe_set_mtu function type to void The function rxe_set_mtu always returns zero. So this function type is changed to void. CC: Srinivas Eeda CC: Junxiao Bi Signed-off-by: Zhu Yanjun Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe.c | 8 ++------ drivers/infiniband/sw/rxe/rxe.h | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 4d1d96805ca5..7121e1b1eb89 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -291,7 +291,7 @@ err1: return err; } -int rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) +void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) { struct rxe_port *port = &rxe->port; enum ib_mtu mtu; @@ -303,8 +303,6 @@ int rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) port->attr.active_mtu = mtu; port->mtu_cap = ib_mtu_enum_to_int(mtu); - - return 0; } /* called by ifc layer to create new rxe device. @@ -320,9 +318,7 @@ int rxe_add(struct rxe_dev *rxe, unsigned int mtu) if (err) goto err1; - err = rxe_set_mtu(rxe, mtu); - if (err) - goto err1; + rxe_set_mtu(rxe, mtu); err = rxe_register_device(rxe); if (err) diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index 1275fde6503a..d9ec2de68738 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -92,7 +92,7 @@ static inline u32 rxe_crc32(struct rxe_dev *rxe, return retval; } -int rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); +void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); int rxe_add(struct rxe_dev *rxe, unsigned int mtu); void rxe_remove(struct rxe_dev *rxe); -- cgit v1.2.3 From e12ee8ce51435c4d24f437f10e0fce773505c674 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Mon, 23 Apr 2018 03:57:58 -0400 Subject: IB/rxe: remove unused function variable In the functions rxe_mem_init_dma, rxe_mem_init_user, rxe_mem_init_fast and copy_data, the function variable rxe is not used. So this function variable rxe is removed. 
CC: Srinivas Eeda CC: Junxiao Bi Signed-off-by: Zhu Yanjun Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_comp.c | 6 ++---- drivers/infiniband/sw/rxe/rxe_loc.h | 8 ++++---- drivers/infiniband/sw/rxe/rxe_mr.c | 13 ++++++------- drivers/infiniband/sw/rxe/rxe_req.c | 2 +- drivers/infiniband/sw/rxe/rxe_resp.c | 3 +-- drivers/infiniband/sw/rxe/rxe_verbs.c | 6 +++--- 6 files changed, 17 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 6cdc40ed8a9f..a285978aa7fe 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -355,10 +355,9 @@ static inline enum comp_state do_read(struct rxe_qp *qp, struct rxe_pkt_info *pkt, struct rxe_send_wqe *wqe) { - struct rxe_dev *rxe = to_rdev(qp->ibqp.device); int ret; - ret = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, + ret = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE, &wqe->dma, payload_addr(pkt), payload_size(pkt), to_mem_obj, NULL); if (ret) @@ -374,12 +373,11 @@ static inline enum comp_state do_atomic(struct rxe_qp *qp, struct rxe_pkt_info *pkt, struct rxe_send_wqe *wqe) { - struct rxe_dev *rxe = to_rdev(qp->ibqp.device); int ret; u64 atomic_orig = atmack_orig(pkt); - ret = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, + ret = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE, &wqe->dma, &atomic_orig, sizeof(u64), to_mem_obj, NULL); if (ret) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 2f8ab8eebcb1..a51ece596c43 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -106,20 +106,20 @@ enum copy_direction { from_mem_obj, }; -int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd, +int rxe_mem_init_dma(struct rxe_pd *pd, int access, struct rxe_mem *mem); -int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start, +int rxe_mem_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova, int access, struct ib_udata *udata, struct rxe_mem *mr); -int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd, +int rxe_mem_init_fast(struct rxe_pd *pd, int max_pages, struct rxe_mem *mem); int rxe_mem_copy(struct rxe_mem *mem, u64 iova, void *addr, int length, enum copy_direction dir, u32 *crcp); -int copy_data(struct rxe_dev *rxe, struct rxe_pd *pd, int access, +int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma, void *addr, int length, enum copy_direction dir, u32 *crcp); diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 5c2684bf430f..dff605fdf60f 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -107,7 +107,7 @@ void rxe_mem_cleanup(struct rxe_pool_entry *arg) } } -static int rxe_mem_alloc(struct rxe_dev *rxe, struct rxe_mem *mem, int num_buf) +static int rxe_mem_alloc(struct rxe_mem *mem, int num_buf) { int i; int num_map; @@ -145,7 +145,7 @@ err1: return -ENOMEM; } -int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd, +int rxe_mem_init_dma(struct rxe_pd *pd, int access, struct rxe_mem *mem) { rxe_mem_init(access, mem); @@ -158,7 +158,7 @@ int rxe_mem_init_dma(struct rxe_dev *rxe, struct rxe_pd *pd, return 0; } -int rxe_mem_init_user(struct rxe_dev *rxe, struct rxe_pd *pd, u64 start, +int rxe_mem_init_user(struct rxe_pd *pd, u64 start, u64 length, u64 iova, int access, struct ib_udata *udata, struct rxe_mem *mem) { @@ -184,7 +184,7 @@ int rxe_mem_init_user(struct rxe_dev *rxe, 
struct rxe_pd *pd, u64 start, rxe_mem_init(access, mem); - err = rxe_mem_alloc(rxe, mem, num_buf); + err = rxe_mem_alloc(mem, num_buf); if (err) { pr_warn("err %d from rxe_mem_alloc\n", err); ib_umem_release(umem); @@ -236,7 +236,7 @@ err1: return err; } -int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd, +int rxe_mem_init_fast(struct rxe_pd *pd, int max_pages, struct rxe_mem *mem) { int err; @@ -246,7 +246,7 @@ int rxe_mem_init_fast(struct rxe_dev *rxe, struct rxe_pd *pd, /* In fastreg, we also set the rkey */ mem->ibmr.rkey = mem->ibmr.lkey; - err = rxe_mem_alloc(rxe, mem, max_pages); + err = rxe_mem_alloc(mem, max_pages); if (err) goto err1; @@ -434,7 +434,6 @@ err1: * under the control of a dma descriptor */ int copy_data( - struct rxe_dev *rxe, struct rxe_pd *pd, int access, struct rxe_dma_info *dma, diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 7bdaf71b8221..957826dde94f 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -490,7 +490,7 @@ static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe, wqe->dma.resid -= paylen; wqe->dma.sge_offset += paylen; } else { - err = copy_data(rxe, qp->pd, 0, &wqe->dma, + err = copy_data(qp->pd, 0, &wqe->dma, payload_addr(pkt), paylen, from_mem_obj, &crc); diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index ed402f028471..c45c1ff24497 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -511,9 +511,8 @@ static enum resp_states send_data_in(struct rxe_qp *qp, void *data_addr, int data_len) { int err; - struct rxe_dev *rxe = to_rdev(qp->ibqp.device); - err = copy_data(rxe, qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma, + err = copy_data(qp->pd, IB_ACCESS_LOCAL_WRITE, &qp->resp.wqe->dma, data_addr, data_len, to_mem_obj, NULL); if (unlikely(err)) return (err == -ENOSPC) ? RESPST_ERR_LENGTH diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 2cb52fd48cf1..c5206148243c 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1011,7 +1011,7 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) rxe_add_ref(pd); - err = rxe_mem_init_dma(rxe, pd, access, mr); + err = rxe_mem_init_dma(pd, access, mr); if (err) goto err2; @@ -1046,7 +1046,7 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, rxe_add_ref(pd); - err = rxe_mem_init_user(rxe, pd, start, length, iova, + err = rxe_mem_init_user(pd, start, length, iova, access, udata, mr); if (err) goto err3; @@ -1094,7 +1094,7 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, rxe_add_ref(pd); - err = rxe_mem_init_fast(rxe, pd, max_num_sg, mr); + err = rxe_mem_init_fast(pd, max_num_sg, mr); if (err) goto err2; -- cgit v1.2.3 From 2f6e51365727a1428d281821ec928904c723e47d Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 26 Apr 2018 10:56:34 +0300 Subject: IB/core: Use CONFIG_SECURITY_INFINIBAND to compile out security code Make security.c depends on CONFIG_SECURITY_INFINIBAND. 
Reviewed-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/Makefile | 3 ++- drivers/infiniband/core/security.c | 4 ---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index dda9e856e3fa..1cfedc469b23 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -12,8 +12,9 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ - security.o nldev.o restrack.o + nldev.o restrack.o +ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index b61dda6b04fc..9b0bea8303e0 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -30,8 +30,6 @@ * SOFTWARE. */ -#ifdef CONFIG_SECURITY_INFINIBAND - #include #include #include @@ -751,5 +749,3 @@ int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) pkey_index, map->agent.security); } - -#endif /* CONFIG_SECURITY_INFINIBAND */ -- cgit v1.2.3 From ecb238f6a7f369b5e0eece4e913c9d671208860c Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 28 Apr 2018 15:31:06 +0800 Subject: IB/cxgb4: use skb_put_zero()/__skb_put_zero Use the recently introduced helper to replace the pattern of skb_put_zero/__skb_put() && memset(). Signed-off-by: YueHaibing Reviewed-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/qp.c | 9 +++------ drivers/infiniband/sw/rxe/rxe_net.c | 4 +--- 2 files changed, 4 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index de77b6027d69..2dc94997ea11 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1297,8 +1297,7 @@ static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe, set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx); - wqe = __skb_put(skb, sizeof(*wqe)); - memset(wqe, 0, sizeof *wqe); + wqe = __skb_put_zero(skb, sizeof(*wqe)); wqe->op_compl = cpu_to_be32(FW_WR_OP_V(FW_RI_INIT_WR)); wqe->flowid_len16 = cpu_to_be32( FW_WR_FLOWID_V(qhp->ep->hwtid) | @@ -1421,8 +1420,7 @@ static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp, set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); - wqe = __skb_put(skb, sizeof(*wqe)); - memset(wqe, 0, sizeof *wqe); + wqe = __skb_put_zero(skb, sizeof(*wqe)); wqe->op_compl = cpu_to_be32( FW_WR_OP_V(FW_RI_INIT_WR) | FW_WR_COMPL_F); @@ -1487,8 +1485,7 @@ static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) } set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx); - wqe = __skb_put(skb, sizeof(*wqe)); - memset(wqe, 0, sizeof *wqe); + wqe = __skb_put_zero(skb, sizeof(*wqe)); wqe->op_compl = cpu_to_be32( FW_WR_OP_V(FW_RI_INIT_WR) | FW_WR_COMPL_F); diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index fca13a6281f0..95e52b3ec757 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -565,11 +565,9 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, pkt->rxe = rxe; pkt->port_num = port_num; - pkt->hdr = 
skb_put(skb, paylen); + pkt->hdr = skb_put_zero(skb, paylen); pkt->mask |= RXE_GRH_MASK; - memset(pkt->hdr, 0, paylen); - dev_put(ndev); return skb; } -- cgit v1.2.3 From ffab8c89ba59c4e01f9c277f1baaad12bd5a3c0c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 1 May 2018 09:25:49 +0100 Subject: RDMA/qedr: fix spelling mistake: "failes" -> "fails" Trivial fix to spelling mistake in DP_ERR error message Signed-off-by: Colin Ian King Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 7d3763b2e01c..35f3b6f8fd45 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -2579,7 +2579,7 @@ static int qedr_set_page(struct ib_mr *ibmr, u64 addr) u32 pbes_in_page; if (unlikely(mr->npages == mr->info.pbl_info.num_pbes)) { - DP_ERR(mr->dev, "qedr_set_page failes when %d\n", mr->npages); + DP_ERR(mr->dev, "qedr_set_page fails when %d\n", mr->npages); return -ENOMEM; } -- cgit v1.2.3 From 25a0ad85156a7b697d4340560fff0d25a3b19243 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 3 May 2018 08:40:49 -0700 Subject: RDMA/nldev: Add explicit pad attribute Add a specific RDMA_NLDEV_ATTR_PAD attribute to be used for 64b attribute padding. To preserve the ABI, make this attribute equal to RDMA_NLDEV_ATTR_UNSPEC, which has a value of 0, because that has been used up until now as the pad attribute. Change all the previous use of 0 as the pad with this new enum. Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/nldev.c | 26 +++++++++++++++----------- include/uapi/rdma/rdma_netlink.h | 3 +++ 2 files changed, 18 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index eb567765f45c..6b0c1eb71ea0 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -122,7 +122,8 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64)); if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, - device->attrs.device_cap_flags, 0)) + device->attrs.device_cap_flags, + RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; ib_get_device_fw_str(device, fw); @@ -131,10 +132,12 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID, - be64_to_cpu(device->node_guid), 0)) + be64_to_cpu(device->node_guid), + RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, - be64_to_cpu(device->attrs.sys_image_guid), 0)) + be64_to_cpu(device->attrs.sys_image_guid), + RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) return -EMSGSIZE; @@ -161,11 +164,11 @@ static int fill_port_info(struct sk_buff *msg, BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64)); if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, - (u64)attr.port_cap_flags, 0)) + (u64)attr.port_cap_flags, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (rdma_protocol_ib(device, port) && nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, - attr.subnet_prefix, 0)) + attr.subnet_prefix, RDMA_NLDEV_ATTR_PAD)) return -EMSGSIZE; if (rdma_protocol_ib(device, port)) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid)) @@ -209,8 +212,8 @@ static 
int fill_res_info_entry(struct sk_buff *msg, if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, name)) goto err; - if (nla_put_u64_64bit(msg, - RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr, 0)) + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr, + RDMA_NLDEV_ATTR_PAD)) goto err; nla_nest_end(msg, entry_attr); @@ -409,7 +412,7 @@ static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQE, cq->cqe)) goto err; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, - atomic_read(&cq->usecnt), 0)) + atomic_read(&cq->usecnt), RDMA_NLDEV_ATTR_PAD)) goto err; /* Poll context is only valid for kernel CQs */ @@ -445,11 +448,12 @@ static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb, if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) goto err; if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_IOVA, - mr->iova, 0)) + mr->iova, RDMA_NLDEV_ATTR_PAD)) goto err; } - if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, 0)) + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, + RDMA_NLDEV_ATTR_PAD)) goto err; if (fill_res_name_pid(msg, res)) @@ -484,7 +488,7 @@ static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb, goto err; } if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_USECNT, - atomic_read(&pd->usecnt), 0)) + atomic_read(&pd->usecnt), RDMA_NLDEV_ATTR_PAD)) goto err; if ((pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) && nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 0ce0943fc808..fbf99aa8a03c 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -253,6 +253,9 @@ enum rdma_nldev_attr { /* don't change the order or add anything between, this is ABI! */ RDMA_NLDEV_ATTR_UNSPEC, + /* Pad attribute for 64b alignment */ + RDMA_NLDEV_ATTR_PAD = RDMA_NLDEV_ATTR_UNSPEC, + /* Identifier for ib_device */ RDMA_NLDEV_ATTR_DEV_INDEX, /* u32 */ -- cgit v1.2.3 From da5c8507821573b8ed6e3f47e009f273493ffaf7 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 3 May 2018 08:41:30 -0700 Subject: RDMA/nldev: add driver-specific resource tracking Each driver can register a "fill entry" function with the restrack core. This function will be called when filling out a resource, allowing the driver to add driver-specific details. The details consist of a nltable of nested attributes, that are in the form of tuples. Both key and value attributes are mandatory. The key nlattr must be a string, and the value nlattr can be one of the driver attributes that are generic, but typed, allowing the attributes to be validated. Currently the driver nlattr types include string, s32, u32, s64, and u64. The print-type nlattr allows a driver to specify an alternative display format for user tools displaying the attribute. For example, a u32 attribute will default to "%u", but a print-type attribute can be included for it to be displayed in hex. This allows the user tool to print the number in the format desired by the driver driver. More attrs can be defined as they become needed by drivers. 
Signed-off-by: Steve Wise Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/nldev.c | 29 +++++++++++++++++++++++++++++ drivers/infiniband/core/restrack.c | 7 +++++++ include/rdma/restrack.h | 9 +++++++++ include/uapi/rdma/rdma_netlink.h | 23 +++++++++++++++++++++++ 4 files changed, 68 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 6b0c1eb71ea0..50efca482a6c 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -98,6 +98,15 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ }, + [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, + [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, + [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, }; static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) @@ -285,6 +294,7 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, struct rdma_restrack_entry *res, uint32_t port) { struct ib_qp *qp = container_of(res, struct ib_qp, res); + struct rdma_restrack_root *resroot = &qp->device->res; struct ib_qp_init_attr qp_init_attr; struct nlattr *entry_attr; struct ib_qp_attr qp_attr; @@ -334,6 +344,9 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct netlink_callback *cb, if (fill_res_name_pid(msg, res)) goto err; + if (resroot->fill_res_entry(msg, res)) + goto err; + nla_nest_end(msg, entry_attr); return 0; @@ -349,6 +362,7 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, { struct rdma_id_private *id_priv = container_of(res, struct rdma_id_private, res); + struct rdma_restrack_root *resroot = &id_priv->id.device->res; struct rdma_cm_id *cm_id = &id_priv->id; struct nlattr *entry_attr; @@ -390,6 +404,9 @@ static int fill_res_cm_id_entry(struct sk_buff *msg, if (fill_res_name_pid(msg, res)) goto err; + if (resroot->fill_res_entry(msg, res)) + goto err; + nla_nest_end(msg, entry_attr); return 0; @@ -403,6 +420,7 @@ static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb, struct rdma_restrack_entry *res, uint32_t port) { struct ib_cq *cq = container_of(res, struct ib_cq, res); + struct rdma_restrack_root *resroot = &cq->device->res; struct nlattr *entry_attr; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_CQ_ENTRY); @@ -423,6 +441,9 @@ static int fill_res_cq_entry(struct sk_buff *msg, struct netlink_callback *cb, if (fill_res_name_pid(msg, res)) goto err; + if (resroot->fill_res_entry(msg, res)) + goto err; + nla_nest_end(msg, entry_attr); return 0; @@ -436,6 +457,7 @@ static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb, struct rdma_restrack_entry *res, uint32_t port) { struct ib_mr *mr = container_of(res, struct ib_mr, res); + struct rdma_restrack_root *resroot = &mr->pd->device->res; struct nlattr *entry_attr; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_MR_ENTRY); @@ -459,6 +481,9 @@ static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb, if (fill_res_name_pid(msg, res)) goto err; + if 
(resroot->fill_res_entry(msg, res)) + goto err; + nla_nest_end(msg, entry_attr); return 0; @@ -472,6 +497,7 @@ static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb, struct rdma_restrack_entry *res, uint32_t port) { struct ib_pd *pd = container_of(res, struct ib_pd, res); + struct rdma_restrack_root *resroot = &pd->device->res; struct nlattr *entry_attr; entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_PD_ENTRY); @@ -498,6 +524,9 @@ static int fill_res_pd_entry(struct sk_buff *msg, struct netlink_callback *cb, if (fill_res_name_pid(msg, res)) goto err; + if (resroot->fill_res_entry(msg, res)) + goto err; + nla_nest_end(msg, entry_attr); return 0; diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index efddd13e3edb..172b517dc7b9 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -12,9 +12,16 @@ #include "cma_priv.h" +static int fill_res_noop(struct sk_buff *msg, + struct rdma_restrack_entry *entry) +{ + return 0; +} + void rdma_restrack_init(struct rdma_restrack_root *res) { init_rwsem(&res->rwsem); + res->fill_res_entry = fill_res_noop; } static const char *type2str(enum rdma_restrack_type type) diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index f3b3e3576f6a..14304e249685 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -44,6 +44,8 @@ enum rdma_restrack_type { }; #define RDMA_RESTRACK_HASH_BITS 8 +struct rdma_restrack_entry; + /** * struct rdma_restrack_root - main resource tracking management * entity, per-device @@ -57,6 +59,13 @@ struct rdma_restrack_root { * @hash: global database for all resources per-device */ DECLARE_HASHTABLE(hash, RDMA_RESTRACK_HASH_BITS); + /** + * @fill_res_entry: driver-specific fill function + * + * Allows rdma drivers to add their own restrack attributes. + */ + int (*fill_res_entry)(struct sk_buff *msg, + struct rdma_restrack_entry *entry); }; /** diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index fbf99aa8a03c..07ff6c72fc50 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -249,6 +249,15 @@ enum rdma_nldev_command { RDMA_NLDEV_NUM_OPS }; +enum { + RDMA_NLDEV_ATTR_ENTRY_STRLEN = 16, +}; + +enum rdma_nldev_print_type { + RDMA_NLDEV_PRINT_TYPE_UNSPEC, + RDMA_NLDEV_PRINT_TYPE_HEX, +}; + enum rdma_nldev_attr { /* don't change the order or add anything between, this is ABI! */ RDMA_NLDEV_ATTR_UNSPEC, @@ -390,6 +399,20 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_PD_ENTRY, /* nested table */ RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY, /* u32 */ RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY, /* u32 */ + /* + * driver-specific attributes. + */ + RDMA_NLDEV_ATTR_DRIVER, /* nested table */ + RDMA_NLDEV_ATTR_DRIVER_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_DRIVER_STRING, /* string */ + /* + * u8 values from enum rdma_nldev_print_type + */ + RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE, /* u8 */ + RDMA_NLDEV_ATTR_DRIVER_S32, /* s32 */ + RDMA_NLDEV_ATTR_DRIVER_U32, /* u32 */ + RDMA_NLDEV_ATTR_DRIVER_S64, /* s64 */ + RDMA_NLDEV_ATTR_DRIVER_U64, /* u64 */ /* * Provides logical name and index of netdevice which is -- cgit v1.2.3 From 73937e8a030b046c6b0fa73868bee25647a29be4 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 3 May 2018 08:41:42 -0700 Subject: RDMA/nldev: helper functions to add driver attributes These help rdma drivers to fill out the driver entries. 
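As a rough usage sketch only, a driver callback built on these helpers could look like the following; the callback name and field values are hypothetical, and the nesting of RDMA_NLDEV_ATTR_DRIVER mirrors what the cxgb4 patch further below does. Each helper returns -EMSGSIZE on failure, so the driver can simply propagate the error.

#include <rdma/restrack.h>

/* Hypothetical callback; a real driver pulls these values from its HW state. */
static int mydrv_fill_res_entry(struct sk_buff *msg,
                                struct rdma_restrack_entry *res)
{
        struct nlattr *table = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER);

        if (!table)
                return -EMSGSIZE;
        /* each helper emits one name/value tuple, optionally tagged as hex */
        if (rdma_nl_put_driver_u32(msg, "cidx", 7) ||
            rdma_nl_put_driver_u32_hex(msg, "flags", 0x3) ||
            rdma_nl_put_driver_u64_hex(msg, "wr_id", 0xdeadbeefULL)) {
                nla_nest_cancel(msg, table);
                return -EMSGSIZE;
        }
        nla_nest_end(msg, table);
        return 0;
}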
Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/core/nldev.c | 66 +++++++++++++++++++++++++++++++++++++++++ include/rdma/restrack.h | 11 +++++++ 2 files changed, 77 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 50efca482a6c..8674ca2d8f91 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -109,6 +109,72 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, }; +static int put_driver_name_print_type(struct sk_buff *msg, const char *name, + enum rdma_nldev_print_type print_type) +{ + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DRIVER_STRING, name)) + return -EMSGSIZE; + if (print_type != RDMA_NLDEV_PRINT_TYPE_UNSPEC && + nla_put_u8(msg, RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE, print_type)) + return -EMSGSIZE; + + return 0; +} + +static int _rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, + enum rdma_nldev_print_type print_type, + u32 value) +{ + if (put_driver_name_print_type(msg, name, print_type)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DRIVER_U32, value)) + return -EMSGSIZE; + + return 0; +} + +static int _rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, + enum rdma_nldev_print_type print_type, + u64 value) +{ + if (put_driver_name_print_type(msg, name, print_type)) + return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_DRIVER_U64, value, + RDMA_NLDEV_ATTR_PAD)) + return -EMSGSIZE; + + return 0; +} + +int rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, u32 value) +{ + return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC, + value); +} +EXPORT_SYMBOL(rdma_nl_put_driver_u32); + +int rdma_nl_put_driver_u32_hex(struct sk_buff *msg, const char *name, + u32 value) +{ + return _rdma_nl_put_driver_u32(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX, + value); +} +EXPORT_SYMBOL(rdma_nl_put_driver_u32_hex); + +int rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, u64 value) +{ + return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_UNSPEC, + value); +} +EXPORT_SYMBOL(rdma_nl_put_driver_u64); + +int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name, u64 value) +{ + return _rdma_nl_put_driver_u64(msg, name, RDMA_NLDEV_PRINT_TYPE_HEX, + value); +} +EXPORT_SYMBOL(rdma_nl_put_driver_u64_hex); + static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 14304e249685..637968589922 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -12,6 +12,7 @@ #include #include #include +#include /** * enum rdma_restrack_type - HW objects to track @@ -183,4 +184,14 @@ static inline void rdma_restrack_set_task(struct rdma_restrack_entry *res, res->task = task; } +/* + * Helper functions for rdma drivers when filling out + * nldev driver attributes. 
+ */ +int rdma_nl_put_driver_u32(struct sk_buff *msg, const char *name, u32 value); +int rdma_nl_put_driver_u32_hex(struct sk_buff *msg, const char *name, + u32 value); +int rdma_nl_put_driver_u64(struct sk_buff *msg, const char *name, u64 value); +int rdma_nl_put_driver_u64_hex(struct sk_buff *msg, const char *name, + u64 value); #endif /* _RDMA_RESTRACK_H_ */ -- cgit v1.2.3 From 056f9c7f39bf517d58f32797f1eb1465bb6f6ef2 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 3 May 2018 08:41:49 -0700 Subject: iw_cxgb4: dump detailed driver-specific QP information Provide a cxgb4-specific function to fill in qp state details. This allows dumping important c4iw_qp state useful for debugging. Included in the dump are the t4_sq, t4_rq structs, plus a dump of the t4_swsqe and t4swrqe descriptors for the first and last pending entries. Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/Makefile | 3 +- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 5 + drivers/infiniband/hw/cxgb4/provider.c | 8 ++ drivers/infiniband/hw/cxgb4/restrack.c | 248 +++++++++++++++++++++++++++++++++ 4 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/cxgb4/restrack.c (limited to 'drivers') diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile index fa40b685831b..9edd92023e18 100644 --- a/drivers/infiniband/hw/cxgb4/Makefile +++ b/drivers/infiniband/hw/cxgb4/Makefile @@ -3,4 +3,5 @@ ccflags-y += -Idrivers/net/ethernet/chelsio/libcxgb obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o -iw_cxgb4-y := device.o cm.o provider.o mem.o cq.o qp.o resource.o ev.o id_table.o +iw_cxgb4-y := device.o cm.o provider.o mem.o cq.o qp.o resource.o ev.o id_table.o \ + restrack.o diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index cc929002c05e..5eec8772468c 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -55,6 +55,7 @@ #include #include #include +#include #include "cxgb4.h" #include "cxgb4_uld.h" @@ -1078,4 +1079,8 @@ extern int use_dsgl; void c4iw_invalidate_mr(struct c4iw_dev *rhp, u32 rkey); struct c4iw_wr_wait *c4iw_alloc_wr_wait(gfp_t gfp); +typedef int c4iw_restrack_func(struct sk_buff *msg, + struct rdma_restrack_entry *res); +extern c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX]; + #endif diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 0b9cc73c3ded..1feade8bb4b3 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -551,6 +551,13 @@ static struct net_device *get_netdev(struct ib_device *dev, u8 port) return ndev; } +static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res) +{ + return (res->type < ARRAY_SIZE(c4iw_restrack_funcs) && + c4iw_restrack_funcs[res->type]) ? 
+ c4iw_restrack_funcs[res->type](msg, res) : 0; +} + void c4iw_register_device(struct work_struct *work) { int ret; @@ -645,6 +652,7 @@ void c4iw_register_device(struct work_struct *work) dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref; dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref; dev->ibdev.iwcm->get_qp = c4iw_get_qp; + dev->ibdev.res.fill_res_entry = fill_res_entry; memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name, sizeof(dev->ibdev.iwcm->ifname)); diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c new file mode 100644 index 000000000000..a677940b164a --- /dev/null +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 2018 Chelsio, Inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "iw_cxgb4.h" +#include +#include + +static int fill_sq(struct sk_buff *msg, struct t4_wq *wq) +{ + /* WQ+SQ */ + if (rdma_nl_put_driver_u32(msg, "sqid", wq->sq.qid)) + goto err; + if (rdma_nl_put_driver_u32(msg, "flushed", wq->flushed)) + goto err; + if (rdma_nl_put_driver_u32(msg, "memsize", wq->sq.memsize)) + goto err; + if (rdma_nl_put_driver_u32(msg, "cidx", wq->sq.cidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "pidx", wq->sq.pidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "wq_pidx", wq->sq.wq_pidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "flush_cidx", wq->sq.flush_cidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "in_use", wq->sq.in_use)) + goto err; + if (rdma_nl_put_driver_u32(msg, "size", wq->sq.size)) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "flags", wq->sq.flags)) + goto err; + return 0; +err: + return -EMSGSIZE; +} + +static int fill_rq(struct sk_buff *msg, struct t4_wq *wq) +{ + /* RQ */ + if (rdma_nl_put_driver_u32(msg, "rqid", wq->rq.qid)) + goto err; + if (rdma_nl_put_driver_u32(msg, "memsize", wq->rq.memsize)) + goto err; + if (rdma_nl_put_driver_u32(msg, "cidx", wq->rq.cidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "pidx", wq->rq.pidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "wq_pidx", wq->rq.wq_pidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "msn", wq->rq.msn)) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "rqt_hwaddr", wq->rq.rqt_hwaddr)) + goto err; + if (rdma_nl_put_driver_u32(msg, "rqt_size", wq->rq.rqt_size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "in_use", wq->rq.in_use)) + goto err; + if (rdma_nl_put_driver_u32(msg, "size", wq->rq.size)) + goto err; + return 0; +err: + return -EMSGSIZE; +} + +static int fill_swsqe(struct sk_buff *msg, struct t4_sq *sq, u16 idx, + struct t4_swsqe *sqe) +{ + if (rdma_nl_put_driver_u32(msg, "idx", idx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "opcode", sqe->opcode)) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "wr_id", sqe->wr_id)) + goto err; + if (rdma_nl_put_driver_u32(msg, "complete", sqe->complete)) + goto err; + if (sqe->complete && + rdma_nl_put_driver_u32(msg, "cqe_status", CQE_STATUS(&sqe->cqe))) + goto err; + if (rdma_nl_put_driver_u32(msg, "signaled", sqe->signaled)) + goto err; + if (rdma_nl_put_driver_u32(msg, "flushed", sqe->flushed)) + goto err; + return 0; +err: + return -EMSGSIZE; +} + +/* + * Dump the first and last pending sqes. + */ +static int fill_swsqes(struct sk_buff *msg, struct t4_sq *sq, + u16 first_idx, struct t4_swsqe *first_sqe, + u16 last_idx, struct t4_swsqe *last_sqe) +{ + if (!first_sqe) + goto out; + if (fill_swsqe(msg, sq, first_idx, first_sqe)) + goto err; + if (!last_sqe) + goto out; + if (fill_swsqe(msg, sq, last_idx, last_sqe)) + goto err; +out: + return 0; +err: + return -EMSGSIZE; +} + +static int fill_swrqe(struct sk_buff *msg, struct t4_rq *rq, u16 idx, + struct t4_swrqe *rqe) +{ + if (rdma_nl_put_driver_u32(msg, "idx", idx)) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "wr_id", rqe->wr_id)) + goto err; + return 0; +err: + return -EMSGSIZE; +} + +/* + * Dump the first and last pending rqes. 
+ */ +static int fill_swrqes(struct sk_buff *msg, struct t4_rq *rq, + u16 first_idx, struct t4_swrqe *first_rqe, + u16 last_idx, struct t4_swrqe *last_rqe) +{ + if (!first_rqe) + goto out; + if (fill_swrqe(msg, rq, first_idx, first_rqe)) + goto err; + if (!last_rqe) + goto out; + if (fill_swrqe(msg, rq, last_idx, last_rqe)) + goto err; +out: + return 0; +err: + return -EMSGSIZE; +} + +static int fill_res_qp_entry(struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + struct ib_qp *ibqp = container_of(res, struct ib_qp, res); + struct t4_swsqe *fsp = NULL, *lsp = NULL; + struct t4_swrqe *frp = NULL, *lrp = NULL; + struct c4iw_qp *qhp = to_c4iw_qp(ibqp); + struct t4_swsqe first_sqe, last_sqe; + struct t4_swrqe first_rqe, last_rqe; + u16 first_sq_idx, last_sq_idx; + u16 first_rq_idx, last_rq_idx; + struct nlattr *table_attr; + struct t4_wq wq; + + /* User qp state is not available, so don't dump user qps */ + if (qhp->ucontext) + return 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + goto err; + + /* Get a consistent snapshot */ + spin_lock_irq(&qhp->lock); + wq = qhp->wq; + + /* If there are any pending sqes, copy the first and last */ + if (wq.sq.cidx != wq.sq.pidx) { + first_sq_idx = wq.sq.cidx; + first_sqe = qhp->wq.sq.sw_sq[first_sq_idx]; + fsp = &first_sqe; + last_sq_idx = wq.sq.pidx; + if (last_sq_idx-- == 0) + last_sq_idx = wq.sq.size - 1; + if (last_sq_idx != first_sq_idx) { + last_sqe = qhp->wq.sq.sw_sq[last_sq_idx]; + lsp = &last_sqe; + } + } + + /* If there are any pending rqes, copy the first and last */ + if (wq.rq.cidx != wq.rq.pidx) { + first_rq_idx = wq.rq.cidx; + first_rqe = qhp->wq.rq.sw_rq[first_rq_idx]; + frp = &first_rqe; + last_rq_idx = wq.rq.pidx; + if (last_rq_idx-- == 0) + last_rq_idx = wq.rq.size - 1; + if (last_rq_idx != first_rq_idx) { + last_rqe = qhp->wq.rq.sw_rq[last_rq_idx]; + lrp = &last_rqe; + } + } + spin_unlock_irq(&qhp->lock); + + if (fill_sq(msg, &wq)) + goto err_cancel_table; + + if (fill_swsqes(msg, &wq.sq, first_sq_idx, fsp, last_sq_idx, lsp)) + goto err_cancel_table; + + if (fill_rq(msg, &wq)) + goto err_cancel_table; + + if (fill_swrqes(msg, &wq.rq, first_rq_idx, frp, last_rq_idx, lrp)) + goto err_cancel_table; + + nla_nest_end(msg, table_attr); + return 0; + +err_cancel_table: + nla_nest_cancel(msg, table_attr); +err: + return -EMSGSIZE; +} + +c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_QP] = fill_res_qp_entry, +}; -- cgit v1.2.3 From 25a1cd3fe551bfeffc12d6ef1aafb2f2ef5e54f6 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 2 May 2018 13:12:55 +0300 Subject: IB/core: Make gid_table_reserve_default() return void gid_table_reserve_default() always returns zero. Make it return void and simplify error checking. rdma_port is already calculated, use that while calling gid_table_reserve_default() instead of recalculating it. 
Reviewed-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/cache.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index e337b08de2ff..140fd351764d 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -757,8 +757,8 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, } } -static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, - struct ib_gid_table *table) +static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port, + struct ib_gid_table *table) { unsigned int i; unsigned long roce_gid_type_mask; @@ -768,8 +768,7 @@ static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); num_default_gids = hweight_long(roce_gid_type_mask); for (i = 0; i < num_default_gids && i < table->sz; i++) { - struct ib_gid_table_entry *entry = - &table->data_vec[i]; + struct ib_gid_table_entry *entry = &table->data_vec[i]; entry->props |= GID_TABLE_ENTRY_DEFAULT; current_gid = find_next_bit(&roce_gid_type_mask, @@ -777,8 +776,6 @@ static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, current_gid); entry->attr.gid_type = current_gid++; } - - return 0; } static int _gid_table_setup_one(struct ib_device *ib_dev) @@ -790,19 +787,14 @@ static int _gid_table_setup_one(struct ib_device *ib_dev) for (port = 0; port < ib_dev->phys_port_cnt; port++) { u8 rdma_port = port + rdma_start_port(ib_dev); - table = - alloc_gid_table( + table = alloc_gid_table( ib_dev->port_immutable[rdma_port].gid_tbl_len); if (!table) { err = -ENOMEM; goto rollback_table_setup; } - err = gid_table_reserve_default(ib_dev, - port + rdma_start_port(ib_dev), - table); - if (err) - goto rollback_table_setup; + gid_table_reserve_default(ib_dev, rdma_port, table); ib_dev->cache.ports[port].gid = table; } -- cgit v1.2.3 From be0e8f34b66c47fb9bf65858d0cd4b145d1e47b1 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 2 May 2018 13:12:56 +0300 Subject: IB/core: Reuse gid_table_release_one() in table allocation failure _gid_table_setup_one() only performs GID table cache memory allocation, marks entries as invalid (free) and marks the reserved entries. At this point GID table is empty and no entries are added. On dual port device if _gid_table_setup_one() fails to allocate the gid table for 2nd port, there is no need to perform cleanup_gid_table_port() to delete GID entries, as GID table is empty. Therefore make use of existing gid_table_release_one() routine which frees the GID table memory and avoid code duplication. 
Reviewed-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/cache.c | 42 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 140fd351764d..f0887f29da97 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -778,50 +778,40 @@ static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port, } } + +static void gid_table_release_one(struct ib_device *ib_dev) +{ + struct ib_gid_table *table; + u8 port; + + for (port = 0; port < ib_dev->phys_port_cnt; port++) { + table = ib_dev->cache.ports[port].gid; + release_gid_table(table); + ib_dev->cache.ports[port].gid = NULL; + } +} + static int _gid_table_setup_one(struct ib_device *ib_dev) { u8 port; struct ib_gid_table *table; - int err = 0; for (port = 0; port < ib_dev->phys_port_cnt; port++) { u8 rdma_port = port + rdma_start_port(ib_dev); table = alloc_gid_table( ib_dev->port_immutable[rdma_port].gid_tbl_len); - if (!table) { - err = -ENOMEM; + if (!table) goto rollback_table_setup; - } gid_table_reserve_default(ib_dev, rdma_port, table); ib_dev->cache.ports[port].gid = table; } - return 0; rollback_table_setup: - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - table = ib_dev->cache.ports[port].gid; - - cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev), - table); - release_gid_table(table); - } - - return err; -} - -static void gid_table_release_one(struct ib_device *ib_dev) -{ - struct ib_gid_table *table; - u8 port; - - for (port = 0; port < ib_dev->phys_port_cnt; port++) { - table = ib_dev->cache.ports[port].gid; - release_gid_table(table); - ib_dev->cache.ports[port].gid = NULL; - } + gid_table_release_one(ib_dev); + return -ENOMEM; } static void gid_table_cleanup_one(struct ib_device *ib_dev) -- cgit v1.2.3 From ed3dd9b017b85e00a459c35bd4d3fe2b83b0d092 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 2 May 2018 13:15:24 +0300 Subject: RDMA/hns: Drop local zgid in favor of core defined variable The zgid is already provided by IB/core, so there is no need in locally defined variable, let's drop it and reuse common one. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_main.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 9d48bc07a9e6..1b79a388e9d1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -99,7 +99,6 @@ static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context) { struct hns_roce_dev *hr_dev = to_hr_dev(attr->device); struct ib_gid_attr zattr = { }; - union ib_gid zgid = { {0} }; u8 port = attr->port_num - 1; unsigned long flags; int ret; -- cgit v1.2.3 From 064e526247070c79aa3063d93384db378649a640 Mon Sep 17 00:00:00 2001 From: Idan Burstein Date: Wed, 2 May 2018 13:16:39 +0300 Subject: IB/mlx5: posting klm/mtt list inline in the send queue for reg_wr As most kernel RDMA ULPs, (e.g. NVMe over Fabrics in its default "register_always=Y" mode) registers and invalidates user buffer upon each IO. Today the mlx5 driver is posting the registration work request using scatter/gather entry for the MTT/KLM list. 
The fetch of the MTT/KLM list becomes the bottleneck in number of IO operation could be done by NVMe over Fabrics host driver on a single adapter as shown below. This patch is adding the support for inline registration work request upon MTT/KLM list of size <=64B. The result for NVMe over Fabrics is increase of > x3.5 for small IOs as shown below, I expect other ULPs (e.g iSER, SRP, NFS over RDMA) performance to be enhanced as well. The following results were taken against a single NVMe-oF (RoCE link layer) subsystem with a single namespace backed by null_blk using fio benchmark (with rw=randread, numjobs=48, iodepth={16,64}, ioengine=libaio direct=1): ConnectX-5 (pci Width x16) --------------------------- Block Size s/g reg_wr inline reg_wr ++++++++++ +++++++++++++++ ++++++++++++++++ 512B 1302.8K/34.82% 4951.9K/99.02% 1KB 1284.3K/33.86% 4232.7K/98.09% 2KB 1238.6K/34.1% 2797.5K/80.04% 4KB 1169.3K/32.46% 1941.3K/61.35% 8KB 1013.4K/30.08% 1236.6K/39.47% 16KB 695.7K/20.19% 696.9K/20.59% 32KB 350.3K/9.64% 350.6K/10.3% 64KB 175.86K/5.27% 175.9K/5.28% ConnectX-4 (pci Width x8) --------------------------- Block Size s/g reg_wr inline reg_wr ++++++++++ +++++++++++++++ ++++++++++++++++ 512B 1285.8K/42.66% 4242.7K/98.18% 1KB 1254.1K/41.74% 3569.2K/96.00% 2KB 1185.9K/39.83% 2173.9K/75.58% 4KB 1069.4K/36.46% 1343.3K/47.47% 8KB 755.1K/27.77% 748.7K/29.14% Tested-by: Nitzan Carmi Signed-off-by: Idan Burstein Signed-off-by: Max Gurtovoy Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/qp.c | 43 ++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 7ed4b70f6447..7a9870a4823f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -54,6 +54,7 @@ enum { enum { MLX5_IB_SQ_STRIDE = 6, + MLX5_IB_SQ_UMR_INLINE_THRESHOLD = 64, }; static const u32 mlx5_ib_opcode[] = { @@ -298,7 +299,9 @@ static int sq_overhead(struct ib_qp_init_attr *attr) max(sizeof(struct mlx5_wqe_atomic_seg) + sizeof(struct mlx5_wqe_raddr_seg), sizeof(struct mlx5_wqe_umr_ctrl_seg) + - sizeof(struct mlx5_mkey_seg)); + sizeof(struct mlx5_mkey_seg) + + MLX5_IB_SQ_UMR_INLINE_THRESHOLD / + MLX5_IB_UMR_OCTOWORD); break; case IB_QPT_XRC_TGT: @@ -3633,13 +3636,15 @@ static __be64 sig_mkey_mask(void) } static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, - struct mlx5_ib_mr *mr) + struct mlx5_ib_mr *mr, bool umr_inline) { int size = mr->ndescs * mr->desc_size; memset(umr, 0, sizeof(*umr)); umr->flags = MLX5_UMR_CHECK_NOT_FREE; + if (umr_inline) + umr->flags |= MLX5_UMR_INLINE; umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); umr->mkey_mask = frwr_mkey_mask(); } @@ -3823,6 +3828,24 @@ static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey); } +static void set_reg_umr_inline_seg(void *seg, struct mlx5_ib_qp *qp, + struct mlx5_ib_mr *mr, int mr_list_size) +{ + void *qend = qp->sq.qend; + void *addr = mr->descs; + int copy; + + if (unlikely(seg + mr_list_size > qend)) { + copy = qend - seg; + memcpy(seg, addr, copy); + addr += copy; + mr_list_size -= copy; + seg = mlx5_get_send_wqe(qp, 0); + } + memcpy(seg, addr, mr_list_size); + seg += mr_list_size; +} + static __be32 send_ieth(struct ib_send_wr *wr) { switch (wr->opcode) { @@ -4217,6 +4240,8 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, { struct mlx5_ib_mr *mr = to_mmr(wr->mr); struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); 
+ int mr_list_size = mr->ndescs * mr->desc_size; + bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { mlx5_ib_warn(to_mdev(qp->ibqp.device), @@ -4224,7 +4249,7 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, return -EINVAL; } - set_reg_umr_seg(*seg, mr); + set_reg_umr_seg(*seg, mr, umr_inline); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; if (unlikely((*seg == qp->sq.qend))) @@ -4236,10 +4261,14 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); - set_reg_data_seg(*seg, mr, pd); - *seg += sizeof(struct mlx5_wqe_data_seg); - *size += (sizeof(struct mlx5_wqe_data_seg) / 16); - + if (umr_inline) { + set_reg_umr_inline_seg(*seg, qp, mr, mr_list_size); + *size += get_xlt_octo(mr_list_size); + } else { + set_reg_data_seg(*seg, mr, pd); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + } return 0; } -- cgit v1.2.3 From 254361c1890e67486cd957e9072e518b1c464e27 Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Wed, 2 May 2018 06:42:21 -0700 Subject: IB/hfi1: Prevent LNI hang when LCB can't obtain lanes When the LCB isn't able to get any lanes operational on the first transition into mission mode, the link transfer active never happens and the LNI stays in the polling state indefinitely. Reset LCB upon receiving an 8051 interrupt for LCB to try to obtain lanes with firmware version 1.25.0 or later. Also, update the LCB reset value in other parts of the code with a macro defined to make the code more maintainable and rename functions with the link_width label to link_mode to reflect the fact that those functions set and read link related data not just the link width. Reviewed-by: Michael J. 
Ruhl Reviewed-by: Mike Marciniszyn Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 51 ++++++++++++++++++++--------- drivers/infiniband/hw/hfi1/chip.h | 15 +++++++-- drivers/infiniband/hw/hfi1/chip_registers.h | 7 ++-- 3 files changed, 53 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index e6a60fa59f2b..cb9095d2cbc9 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1032,8 +1032,8 @@ static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z, u8 *vcu, u16 *vl15buf, u8 *crc_sizes); static void read_vc_remote_link_width(struct hfi1_devdata *dd, u8 *remote_tx_rate, u16 *link_widths); -static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits, - u8 *flag_bits, u16 *link_widths); +static void read_vc_local_link_mode(struct hfi1_devdata *dd, u8 *misc_bits, + u8 *flag_bits, u16 *link_widths); static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id, u8 *device_rev); static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx); @@ -6350,6 +6350,18 @@ static void handle_8051_request(struct hfi1_pportdata *ppd) dd_dev_info(dd, "8051 request: request 0x%x not supported\n", type); hreq_response(dd, HREQ_NOT_SUPPORTED, 0); + break; + case HREQ_LCB_RESET: + /* Put the LCB, RX FPE and TX FPE into reset */ + write_csr(dd, DCC_CFG_RESET, LCB_RX_FPE_TX_FPE_INTO_RESET); + /* Make sure the write completed */ + (void)read_csr(dd, DCC_CFG_RESET); + /* Hold the reset long enough to take effect */ + udelay(1); + /* Take the LCB, RX FPE and TX FPE out of reset */ + write_csr(dd, DCC_CFG_RESET, LCB_RX_FPE_TX_FPE_OUT_OF_RESET); + hreq_response(dd, HREQ_SUCCESS, 0); + break; case HREQ_CONFIG_DONE: hreq_response(dd, HREQ_SUCCESS, 0); @@ -6461,8 +6473,7 @@ static void lcb_shutdown(struct hfi1_devdata *dd, int abort) dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN); reg = read_csr(dd, DCC_CFG_RESET); write_csr(dd, DCC_CFG_RESET, reg | - (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) | - (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT)); + DCC_CFG_RESET_RESET_LCB | DCC_CFG_RESET_RESET_RX_FPE); (void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */ if (!abort) { udelay(1); /* must hold for the longer of 16cclks or 20ns */ @@ -6527,7 +6538,7 @@ static void _dc_start(struct hfi1_devdata *dd) __func__); /* Take away reset for LCB and RX FPE (set in lcb_shutdown). 
*/ - write_csr(dd, DCC_CFG_RESET, 0x10); + write_csr(dd, DCC_CFG_RESET, LCB_RX_FPE_TX_FPE_OUT_OF_RESET); /* lcb_shutdown() with abort=1 does not restore these */ write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en); dd->dc_shutdown = 0; @@ -7348,7 +7359,7 @@ static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width, u8 misc_bits, local_flags; u16 active_tx, active_rx; - read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths); + read_vc_local_link_mode(dd, &misc_bits, &local_flags, &widths); tx = widths >> 12; rx = (widths >> 8) & 0xf; @@ -8820,29 +8831,29 @@ static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu, GENERAL_CONFIG, frame); } -static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits, - u8 *flag_bits, u16 *link_widths) +static void read_vc_local_link_mode(struct hfi1_devdata *dd, u8 *misc_bits, + u8 *flag_bits, u16 *link_widths) { u32 frame; - read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG, + read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_MODE, GENERAL_CONFIG, &frame); *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK; *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK; *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK; } -static int write_vc_local_link_width(struct hfi1_devdata *dd, - u8 misc_bits, - u8 flag_bits, - u16 link_widths) +static int write_vc_local_link_mode(struct hfi1_devdata *dd, + u8 misc_bits, + u8 flag_bits, + u16 link_widths) { u32 frame; frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT | (u32)link_widths << LINK_WIDTH_SHIFT; - return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG, + return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_MODE, GENERAL_CONFIG, frame); } @@ -9312,8 +9323,16 @@ static int set_local_link_attributes(struct hfi1_pportdata *ppd) if (loopback == LOOPBACK_SERDES) misc_bits |= 1 << LOOPBACK_SERDES_CONFIG_BIT_MASK_SHIFT; - ret = write_vc_local_link_width(dd, misc_bits, 0, - opa_to_vc_link_widths( + /* + * An external device configuration request is used to reset the LCB + * to retry to obtain operational lanes when the first attempt is + * unsuccesful. 
+ */ + if (dd->dc8051_ver >= dc8051_ver(1, 25, 0)) + misc_bits |= 1 << EXT_CFG_LCB_RESET_SUPPORTED_SHIFT; + + ret = write_vc_local_link_mode(dd, misc_bits, 0, + opa_to_vc_link_widths( ppd->link_width_enabled)); if (ret != HCMD_SUCCESS) goto set_local_link_attributes_fail; diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index c0d70f255050..fdf389e46e19 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -196,6 +196,15 @@ #define LSTATE_ARMED 0x3 #define LSTATE_ACTIVE 0x4 +/* DCC_CFG_RESET reset states */ +#define LCB_RX_FPE_TX_FPE_INTO_RESET (DCC_CFG_RESET_RESET_LCB | \ + DCC_CFG_RESET_RESET_TX_FPE | \ + DCC_CFG_RESET_RESET_RX_FPE | \ + DCC_CFG_RESET_ENABLE_CCLK_BCC) + /* 0x17 */ + +#define LCB_RX_FPE_TX_FPE_OUT_OF_RESET DCC_CFG_RESET_ENABLE_CCLK_BCC /* 0x10 */ + /* DC8051_STS_CUR_STATE port values (physical link states) */ #define PLS_DISABLED 0x30 #define PLS_OFFLINE 0x90 @@ -283,6 +292,7 @@ #define HREQ_SET_TX_EQ_ABS 0x04 #define HREQ_SET_TX_EQ_REL 0x05 #define HREQ_ENABLE 0x06 +#define HREQ_LCB_RESET 0x07 #define HREQ_CONFIG_DONE 0xfe #define HREQ_INTERFACE_TEST 0xff @@ -383,7 +393,7 @@ #define TX_SETTINGS 0x06 #define VERIFY_CAP_LOCAL_PHY 0x07 #define VERIFY_CAP_LOCAL_FABRIC 0x08 -#define VERIFY_CAP_LOCAL_LINK_WIDTH 0x09 +#define VERIFY_CAP_LOCAL_LINK_MODE 0x09 #define LOCAL_DEVICE_ID 0x0a #define RESERVED_REGISTERS 0x0b #define LOCAL_LNI_INFO 0x0c @@ -584,8 +594,9 @@ enum { #define LOOPBACK_LCB 2 #define LOOPBACK_CABLE 3 /* external cable */ -/* set up serdes bit in MISC_CONFIG_BITS */ +/* set up bits in MISC_CONFIG_BITS */ #define LOOPBACK_SERDES_CONFIG_BIT_MASK_SHIFT 0 +#define EXT_CFG_LCB_RESET_SUPPORTED_SHIFT 3 /* read and write hardware registers */ u64 read_csr(const struct hfi1_devdata *dd, u32 offset); diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h index 793514f1d15f..da598b5fe8f6 100644 --- a/drivers/infiniband/hw/hfi1/chip_registers.h +++ b/drivers/infiniband/hw/hfi1/chip_registers.h @@ -97,8 +97,11 @@ #define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32 #define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull #define DCC_CFG_RESET (DCC_CSRS + 0x000000000000) -#define DCC_CFG_RESET_RESET_LCB_SHIFT 0 -#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2 +#define DCC_CFG_RESET_RESET_LCB BIT_ULL(0) +#define DCC_CFG_RESET_RESET_TX_FPE BIT_ULL(1) +#define DCC_CFG_RESET_RESET_RX_FPE BIT_ULL(2) +#define DCC_CFG_RESET_RESET_8051 BIT_ULL(3) +#define DCC_CFG_RESET_ENABLE_CCLK_BCC BIT_ULL(4) #define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028) #define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0 #define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40 -- cgit v1.2.3 From 48e0a6559dd8e6aa87841270868423b23076220e Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 2 May 2018 06:42:29 -0700 Subject: IB/hfi1: Return actual error value from program_rcvarray() A failure of program_rcvarray() is treated inconsistently by the calling function. In one case the error is returned, in a second case, the error is overwritten with EFAULT. In both cases the code path is doing the same thing, allocating memory for groups, so it should be consistent. Make the error path consistent and return the error generated by program_rcvarray(). Reviewed-by: Harish Chegondi Fixes: 7e7a436ecb6e ("staging/hfi1: Add TID entry program function body") Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 0d5330b7353d..6a4c5142515a 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -437,7 +437,6 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, hfi1_cdbg(TID, "Failed to program RcvArray entries %d", ret); - ret = -EFAULT; goto unlock; } else if (ret > 0) { if (grp->used == grp->size) -- cgit v1.2.3 From 959f2d172daa0133e4d5b7a64344f8b2c2d87fc1 Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Wed, 2 May 2018 06:42:36 -0700 Subject: IB/hfi1: Complete check for locally terminated smp For lid routed packets 'hop_cnt' is zero, therefore current test is incomplete. Fix it by using local mad check for both lid routed and direct routed MADs. Reviewed-by: Mike Mariciniszyn Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index e9962c65c68f..983b5794a660 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -1238,7 +1238,7 @@ static int port_states_transition_allowed(struct hfi1_pportdata *ppd, } static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, - u32 logical_state, u32 phys_state) + u32 logical_state, u32 phys_state, int local_mad) { struct hfi1_devdata *dd = ppd->dd; u32 link_state; @@ -1314,7 +1314,7 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, * Don't send a reply if the response would be sent * through the disabled port. 
*/ - if (link_state == HLS_DN_DISABLE && smp->hop_cnt) + if (link_state == HLS_DN_DISABLE && !local_mad) return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; break; case IB_PORT_ARMED: @@ -1350,7 +1350,7 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, */ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len, u32 max_len) + u32 *resp_len, u32 max_len, int local_mad) { struct opa_port_info *pi = (struct opa_port_info *)data; struct ib_event event; @@ -1634,7 +1634,7 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, */ if (!invalid) { - ret = set_port_states(ppd, smp, ls_new, ps_new); + ret = set_port_states(ppd, smp, ls_new, ps_new, local_mad); if (ret) return ret; } @@ -2085,7 +2085,7 @@ static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len, u32 max_len) + u32 *resp_len, u32 max_len, int local_mad) { u32 nports = OPA_AM_NPORT(am); u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); @@ -2122,7 +2122,7 @@ static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data, } if (!invalid) { - ret = set_port_states(ppd, smp, ls_new, ps_new); + ret = set_port_states(ppd, smp, ls_new, ps_new, local_mad); if (ret) return ret; } @@ -4190,7 +4190,7 @@ static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, u8 *data, struct ib_device *ibdev, u8 port, - u32 *resp_len, u32 max_len) + u32 *resp_len, u32 max_len, int local_mad) { int ret; struct hfi1_ibport *ibp = to_iport(ibdev, port); @@ -4198,7 +4198,7 @@ static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, switch (attr_id) { case IB_SMP_ATTR_PORT_INFO: ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port, - resp_len, max_len); + resp_len, max_len, local_mad); break; case IB_SMP_ATTR_PKEY_TABLE: ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port, @@ -4222,7 +4222,7 @@ static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, break; case OPA_ATTRIB_ID_PORT_STATE_INFO: ret = __subn_set_opa_psi(smp, am, data, ibdev, port, - resp_len, max_len); + resp_len, max_len, local_mad); break; case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE: ret = __subn_set_opa_bct(smp, am, data, ibdev, port, @@ -4314,7 +4314,7 @@ static int subn_get_opa_aggregate(struct opa_smp *smp, static int subn_set_opa_aggregate(struct opa_smp *smp, struct ib_device *ibdev, u8 port, - u32 *resp_len) + u32 *resp_len, int local_mad) { int i; u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff; @@ -4344,7 +4344,9 @@ static int subn_set_opa_aggregate(struct opa_smp *smp, } (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data, - ibdev, port, NULL, (u32)agg_data_len); + ibdev, port, NULL, (u32)agg_data_len, + local_mad); + if (smp->status & IB_SMP_INVALID_FIELD) break; if (smp->status & ~IB_SMP_DIRECTION) { @@ -4519,7 +4521,7 @@ static int hfi1_pkey_validation_pma(struct hfi1_ibport *ibp, static int process_subn_opa(struct ib_device *ibdev, int mad_flags, u8 port, const struct opa_mad *in_mad, struct opa_mad *out_mad, - u32 *resp_len) + u32 *resp_len, int local_mad) { struct opa_smp *smp = (struct opa_smp *)out_mad; struct hfi1_ibport *ibp = to_iport(ibdev, port); @@ -4588,11 +4590,11 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags, default: ret = subn_set_opa_sma(attr_id, smp, am, 
data, ibdev, port, resp_len, - data_size); + data_size, local_mad); break; case OPA_ATTRIB_ID_AGGREGATE: ret = subn_set_opa_aggregate(smp, ibdev, port, - resp_len); + resp_len, local_mad); break; } break; @@ -4832,6 +4834,7 @@ static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags, { int ret; int pkey_idx; + int local_mad = 0; u32 resp_len = 0; struct hfi1_ibport *ibp = to_iport(ibdev, port); @@ -4846,13 +4849,14 @@ static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags, switch (in_mad->mad_hdr.mgmt_class) { case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: case IB_MGMT_CLASS_SUBN_LID_ROUTED: - if (is_local_mad(ibp, in_mad, in_wc)) { + local_mad = is_local_mad(ibp, in_mad, in_wc); + if (local_mad) { ret = opa_local_smp_check(ibp, in_wc); if (ret) return IB_MAD_RESULT_FAILURE; } ret = process_subn_opa(ibdev, mad_flags, port, in_mad, - out_mad, &resp_len); + out_mad, &resp_len, local_mad); goto bail; case IB_MGMT_CLASS_PERF_MGMT: ret = hfi1_pkey_validation_pma(ibp, in_mad, in_wc); -- cgit v1.2.3 From 8c79d8223bb11b2f005695a32ddd3985de97727c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Wed, 2 May 2018 06:42:44 -0700 Subject: IB/hfi1: Fix fault injection init/exit issues There are config dependent code paths that expose panics in unload paths both in this file and in debugfs_remove_recursive() because CONFIG_FAULT_INJECTION and CONFIG_FAULT_INJECTION_DEBUG_FS can be set independently. Having CONFIG_FAULT_INJECTION set and CONFIG_FAULT_INJECTION_DEBUG_FS reset causes fault_create_debugfs_attr() to return an error. The debugfs.c routines tolerate failures, but the module unload panics dereferencing a NULL in the two exit routines. If that is fixed, the dir passed to debugfs_remove_recursive comes from a memory location that was freed and potentially reused causing a segfault or corrupting memory. 
Here is an example of the NULL deref panic: [66866.286829] BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 [66866.295602] IP: hfi1_dbg_ibdev_exit+0x2a/0x80 [hfi1] [66866.301138] PGD 858496067 P4D 858496067 PUD 8433a7067 PMD 0 [66866.307452] Oops: 0000 [#1] SMP [66866.310953] Modules linked in: hfi1(-) rdmavt rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm iw_cm ib_cm ib_core rpcsec_gss_krb5 nfsv4 dns_resolver nfsv3 nfs fscache sb_edac x86_pkg_temp_thermal intel_powerclamp vfat fat coretemp kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel pcbc aesni_intel iTCO_wdt iTCO_vendor_support crypto_simd mei_me glue_helper cryptd mxm_wmi ipmi_si pcspkr lpc_ich sg mei ioatdma ipmi_devintf i2c_i801 mfd_core shpchp ipmi_msghandler wmi acpi_power_meter acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables ext4 mbcache jbd2 sd_mod mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt igb fb_sys_fops ttm ahci ptp crc32c_intel libahci pps_core drm dca libata i2c_algo_bit i2c_core [last unloaded: opa_vnic] [66866.385551] CPU: 8 PID: 7470 Comm: rmmod Not tainted 4.14.0-mam-tid-rdma #2 [66866.393317] Hardware name: Intel Corporation S2600WT2/S2600WT2, BIOS SE5C610.86B.01.01.0018.C4.072020161249 07/20/2016 [66866.405252] task: ffff88084f28c380 task.stack: ffffc90008454000 [66866.411866] RIP: 0010:hfi1_dbg_ibdev_exit+0x2a/0x80 [hfi1] [66866.417984] RSP: 0018:ffffc90008457da0 EFLAGS: 00010202 [66866.423812] RAX: 0000000000000000 RBX: ffff880857de0000 RCX: 0000000180040001 [66866.431773] RDX: 0000000180040002 RSI: ffffea0021088200 RDI: 0000000040000000 [66866.439734] RBP: ffffc90008457da8 R08: ffff88084220e000 R09: 0000000180040001 [66866.447696] R10: 000000004220e001 R11: ffff88084220e000 R12: ffff88085a31c000 [66866.455657] R13: ffffffffa07c9820 R14: ffffffffa07c9890 R15: ffff881059d78100 [66866.463618] FS: 00007f6876047740(0000) GS:ffff88085f800000(0000) knlGS:0000000000000000 [66866.472644] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [66866.479053] CR2: 0000000000000088 CR3: 0000000856357006 CR4: 00000000001606e0 [66866.487013] Call Trace: [66866.489747] remove_one+0x1f/0x220 [hfi1] [66866.494221] pci_device_remove+0x39/0xc0 [66866.498596] device_release_driver_internal+0x141/0x210 [66866.504424] driver_detach+0x3f/0x80 [66866.508409] bus_remove_driver+0x55/0xd0 [66866.512784] driver_unregister+0x2c/0x50 [66866.517164] pci_unregister_driver+0x2a/0xa0 [66866.521934] hfi1_mod_cleanup+0x10/0xaa2 [hfi1] [66866.526988] SyS_delete_module+0x171/0x250 [66866.531558] do_syscall_64+0x67/0x1b0 [66866.535644] entry_SYSCALL64_slow_path+0x25/0x25 [66866.540792] RIP: 0033:0x7f6875525c27 [66866.544777] RSP: 002b:00007ffd48528e78 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 [66866.553224] RAX: ffffffffffffffda RBX: 0000000001cc01d0 RCX: 00007f6875525c27 [66866.561185] RDX: 00007f6875596000 RSI: 0000000000000800 RDI: 0000000001cc0238 [66866.569146] RBP: 0000000000000000 R08: 00007f68757e9060 R09: 00007f6875596000 [66866.577120] R10: 00007ffd48528c00 R11: 0000000000000206 R12: 00007ffd48529db4 [66866.585080] R13: 0000000000000000 R14: 0000000001cc01d0 R15: 0000000001cc0010 [66866.593040] Code: 90 0f 1f 44 00 00 48 83 3d a3 8b 03 00 00 55 48 89 e5 53 48 89 fb 74 4e 48 8d bf 18 0c 00 00 e8 9d f2 ff ff 48 8b 83 20 0c 00 00 <48> 8b b8 88 00 00 00 e8 2a 21 b3 e0 48 8b bb 20 0c 00 00 e8 0e [66866.614127] RIP: hfi1_dbg_ibdev_exit+0x2a/0x80 [hfi1] RSP: ffffc90008457da0 [66866.621885] CR2: 0000000000000088 [66866.625618] ---[ end trace c4817425783fb092 ]--- Fix by insuring that upon 
failure from fault_create_debugfs_attr() the parent pointer for the routines is always set to NULL and guards added in the exit routines to insure that debugfs_remove_recursive() is not called when when the parent pointer is NULL. Fixes: 0181ce31b260 ("IB/hfi1: Add receive fault injection feature") Cc: # 4.14.x Reviewed-by: Michael J. Ruhl Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/debugfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c index 852173bf05d0..5343960610fe 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.c +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -1227,7 +1227,8 @@ DEBUGFS_FILE_OPS(fault_stats); static void fault_exit_opcode_debugfs(struct hfi1_ibdev *ibd) { - debugfs_remove_recursive(ibd->fault_opcode->dir); + if (ibd->fault_opcode) + debugfs_remove_recursive(ibd->fault_opcode->dir); kfree(ibd->fault_opcode); ibd->fault_opcode = NULL; } @@ -1255,6 +1256,7 @@ static int fault_init_opcode_debugfs(struct hfi1_ibdev *ibd) &ibd->fault_opcode->attr); if (IS_ERR(ibd->fault_opcode->dir)) { kfree(ibd->fault_opcode); + ibd->fault_opcode = NULL; return -ENOENT; } @@ -1278,7 +1280,8 @@ fail: static void fault_exit_packet_debugfs(struct hfi1_ibdev *ibd) { - debugfs_remove_recursive(ibd->fault_packet->dir); + if (ibd->fault_packet) + debugfs_remove_recursive(ibd->fault_packet->dir); kfree(ibd->fault_packet); ibd->fault_packet = NULL; } @@ -1304,6 +1307,7 @@ static int fault_init_packet_debugfs(struct hfi1_ibdev *ibd) &ibd->fault_opcode->attr); if (IS_ERR(ibd->fault_packet->dir)) { kfree(ibd->fault_packet); + ibd->fault_packet = NULL; return -ENOENT; } -- cgit v1.2.3 From e4607073ffa5c72279370ba91113b76e70f62e16 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 2 May 2018 06:42:59 -0700 Subject: IB/hfi1: Return correct value for device state The driver_pstate() function is used to map internal driver state information to externally defined states. The VERIFY_CAP and GOING_UP states are config/training states, but the mapping routing returns the POLLING value. Update the return values for VERIFY_CAP and GOING_UP to return the correct value: TRAINING. Reviewed-by: Sebastian Sanchez Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index cb9095d2cbc9..4cd422ff92f8 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -10510,9 +10510,9 @@ u32 driver_pstate(struct hfi1_pportdata *ppd) case HLS_DN_OFFLINE: return OPA_PORTPHYSSTATE_OFFLINE; case HLS_VERIFY_CAP: - return IB_PORTPHYSSTATE_POLLING; + return IB_PORTPHYSSTATE_TRAINING; case HLS_GOING_UP: - return IB_PORTPHYSSTATE_POLLING; + return IB_PORTPHYSSTATE_TRAINING; case HLS_GOING_OFFLINE: return OPA_PORTPHYSSTATE_OFFLINE; case HLS_LINK_COOLDOWN: -- cgit v1.2.3 From a93a0a31111231bb1949f4a83b17238f0fa32d6a Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 2 May 2018 06:43:07 -0700 Subject: IB/hfi1: Reorder incorrect send context disable User send context integrity bits are cleared before the context is disabled. 
If the send context is still processing data, any packets that need those integrity bits will cause an error and halt the send context. During the disable handling, the driver waits for the context to drain. If the context is halted, the driver will eventually timeout because the context won't drain and then incorrectly bounce the link. Reorder the bit clearing and the context disable. Examine the software state and send context status as well as the egress status to determine if a send context is in the halted state. Promote the check macros to static functions for consistency with the new check and to follow kernel style. Remove an unused define that refers to the egress timeout. Cc: # 4.9.x Reviewed-by: Mitko Haralanov Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/file_ops.c | 2 +- drivers/infiniband/hw/hfi1/pio.c | 44 +++++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 1b778fd16a32..c9d23c37a371 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -689,8 +689,8 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) * checks to default and disable the send context. */ if (uctxt->sc) { - set_pio_integrity(uctxt->sc); sc_disable(uctxt->sc); + set_pio_integrity(uctxt->sc); } hfi1_free_ctxt_rcv_groups(uctxt); diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 40dac4d16eb8..9cac15d10c4f 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -50,8 +50,6 @@ #include "qp.h" #include "trace.h" -#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */ - #define SC(name) SEND_CTXT_##name /* * Send Context functions @@ -961,15 +959,40 @@ void sc_disable(struct send_context *sc) } /* return SendEgressCtxtStatus.PacketOccupancy */ -#define packet_occupancy(r) \ - (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\ - >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT) +static u64 packet_occupancy(u64 reg) +{ + return (reg & + SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK) + >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT; +} /* is egress halted on the context? */ -#define egress_halted(r) \ - ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK) +static bool egress_halted(u64 reg) +{ + return !!(reg & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK); +} -/* wait for packet egress, optionally pause for credit return */ +/* is the send context halted? */ +static bool is_sc_halted(struct hfi1_devdata *dd, u32 hw_context) +{ + return !!(read_kctxt_csr(dd, hw_context, SC(STATUS)) & + SC(STATUS_CTXT_HALTED_SMASK)); +} + +/** + * sc_wait_for_packet_egress + * @sc: valid send context + * @pause: wait for credit return + * + * Wait for packet egress, optionally pause for credit return + * + * Egress halt and Context halt are not necessarily the same thing, so + * check for both. + * + * NOTE: The context halt bit may not be set immediately. Because of this, + * it is necessary to check the SW SFC_HALTED bit (set in the IRQ) and the HW + * context bit to determine if the context is halted. 
+ */ static void sc_wait_for_packet_egress(struct send_context *sc, int pause) { struct hfi1_devdata *dd = sc->dd; @@ -981,8 +1004,9 @@ static void sc_wait_for_packet_egress(struct send_context *sc, int pause) reg_prev = reg; reg = read_csr(dd, sc->hw_context * 8 + SEND_EGRESS_CTXT_STATUS); - /* done if egress is stopped */ - if (egress_halted(reg)) + /* done if any halt bits, SW or HW are set */ + if (sc->flags & SCF_HALTED || + is_sc_halted(dd, sc->hw_context) || egress_halted(reg)) break; reg = packet_occupancy(reg); if (reg == 0) -- cgit v1.2.3 From 8d3e71136a080d007620472f50c7b3e63ba0f5cf Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Wed, 2 May 2018 06:43:15 -0700 Subject: IB/{hfi1, qib}: Add handling of kernel restart A warm restart will fail to unload the driver, leaving link state potentially flapping up to the point the BIOS resets the adapter. Correct the issue by hooking the shutdown pci method, which will bring port down. Cc: # 4.9.x Reviewed-by: Mike Marciniszyn Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/hfi.h | 1 + drivers/infiniband/hw/hfi1/init.c | 13 +++++++++++++ drivers/infiniband/hw/qib/qib.h | 1 + drivers/infiniband/hw/qib/qib_init.c | 13 +++++++++++++ 4 files changed, 28 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index cac2c62bc42d..9c97c180c35e 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1856,6 +1856,7 @@ struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd) #define HFI1_HAS_SDMA_TIMEOUT 0x8 #define HFI1_HAS_SEND_DMA 0x10 /* Supports Send DMA */ #define HFI1_FORCED_FREEZE 0x80 /* driver forced freeze mode */ +#define HFI1_SHUTDOWN 0x100 /* device is shutting down */ /* IB dword length mask in PBC (lower 11 bits); same for all chips */ #define HFI1_PBC_LENGTH_MASK ((1 << 11) - 1) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 6309edf811df..790542ce89a5 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1058,6 +1058,10 @@ static void shutdown_device(struct hfi1_devdata *dd) unsigned pidx; int i; + if (dd->flags & HFI1_SHUTDOWN) + return; + dd->flags |= HFI1_SHUTDOWN; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; @@ -1391,6 +1395,7 @@ void hfi1_disable_after_error(struct hfi1_devdata *dd) static void remove_one(struct pci_dev *); static int init_one(struct pci_dev *, const struct pci_device_id *); +static void shutdown_one(struct pci_dev *); #define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: " #define PFX DRIVER_NAME ": " @@ -1407,6 +1412,7 @@ static struct pci_driver hfi1_pci_driver = { .name = DRIVER_NAME, .probe = init_one, .remove = remove_one, + .shutdown = shutdown_one, .id_table = hfi1_pci_tbl, .err_handler = &hfi1_pci_err_handler, }; @@ -1816,6 +1822,13 @@ static void remove_one(struct pci_dev *pdev) postinit_cleanup(dd); } +static void shutdown_one(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + shutdown_device(dd); +} + /** * hfi1_create_rcvhdrq - create a receive header queue * @dd: the hfi1_ib device diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 46072455130c..43a68d7b51bb 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1228,6 +1228,7 @@ static inline struct qib_ibport *to_iport(struct ib_device *ibdev, u8 port) #define 
QIB_BADINTR 0x8000 /* severe interrupt problems */ #define QIB_DCA_ENABLED 0x10000 /* Direct Cache Access enabled */ #define QIB_HAS_QSFP 0x20000 /* device (card instance) has QSFP */ +#define QIB_SHUTDOWN 0x40000 /* device is shutting down */ /* * values for ppd->lflags (_ib_port_ related flags) diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 6c68f8a97018..015520289735 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -841,6 +841,10 @@ static void qib_shutdown_device(struct qib_devdata *dd) struct qib_pportdata *ppd; unsigned pidx; + if (dd->flags & QIB_SHUTDOWN) + return; + dd->flags |= QIB_SHUTDOWN; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; @@ -1182,6 +1186,7 @@ void qib_disable_after_error(struct qib_devdata *dd) static void qib_remove_one(struct pci_dev *); static int qib_init_one(struct pci_dev *, const struct pci_device_id *); +static void qib_shutdown_one(struct pci_dev *); #define DRIVER_LOAD_MSG "Intel " QIB_DRV_NAME " loaded: " #define PFX QIB_DRV_NAME ": " @@ -1199,6 +1204,7 @@ static struct pci_driver qib_driver = { .name = QIB_DRV_NAME, .probe = qib_init_one, .remove = qib_remove_one, + .shutdown = qib_shutdown_one, .id_table = qib_pci_tbl, .err_handler = &qib_pci_err_handler, }; @@ -1549,6 +1555,13 @@ static void qib_remove_one(struct pci_dev *pdev) qib_postinit_cleanup(dd); } +static void qib_shutdown_one(struct pci_dev *pdev) +{ + struct qib_devdata *dd = pci_get_drvdata(pdev); + + qib_shutdown_device(dd); +} + /** * qib_create_rcvhdrq - create a receive header queue * @dd: the qlogic_ib device -- cgit v1.2.3 From a74d5307caba42fe9bbc180feb03003f14f9f45c Mon Sep 17 00:00:00 2001 From: Mitko Haralanov Date: Wed, 2 May 2018 06:43:24 -0700 Subject: IB/hfi1: Rework fault injection machinery The packet fault injection code present in the HFI1 driver had some issues that not only fragmented the code but also created user confusion. Furthermore, it suffered from the following problems: 1. The fault_packet method only worked for received packets. This meant that the only fault injection mode available for sent packets was fault_opcode, which did not allow for random packet drops on all egressing packets. 2. The mask available for the fault_opcode mode did not really work due to the fact that the opcode values are not bits in a bitmask but rather sequential integer values. Creating an opcode/mask pair that would successfully capture a set of packets was nearly impossible. 3. The code was fragmented and used too many debugfs entries to operate and control. This was confusing to users. 4. It did not allow filtering fault injection on a per-direction basis - egress vs. ingress. In order to improve or fix the above issues, the following changes have been made: 1. The fault injection methods have been combined into a single fault injection facility. As such, the fault injection has been plugged into both the send and receive code paths. Regardless of the method used, the fault injection will operate on both egress and ingress packets. 2. The type of fault injection - by packet or by opcode - is now controlled by changing the boolean value of the file "opcode_mode". When the value is set to True, fault injection is done by opcode. Otherwise, by packet. 3. The masking ability has been removed in favor of a bitmap that holds opcodes of interest (one bit per opcode, a total of 256 bits). This works in tandem with the "opcode_mode" value.
When the value of "opcode_mode" is False, this bitmap is ignored. When the value is True, the bitmap lists all opcodes to be considered for fault injection. By default, the bitmap is empty. When the user wants to filter by opcode, the user sets the corresponding bit in the bitmap by echo'ing the bit position into the 'opcodes' file. This gets around the issue that the set of opcodes does not lend itself to effective masks and allows for extremely fine-grained filtering by opcode. 4. fault_packet and fault_opcode methods have been combined. Hence, there is only one debugfs directory controlling the entire operation of the fault injection machinery. This reduces the number of debugfs entries and provides a more unified user experience. 5. A new control file - "direction" - is provided to allow the user to control the direction of packets that are subject to fault injection. 6. A new control file - "skip_usec" - is added that allows the user to specify a "timeout" during which no fault injection will occur. In addition, the following bug fixes have been applied: 1. The fault injection code has been split into its own header and source files. This was done to better organize the code and support conditional compilation without littering the code with #ifdef's. 2. The method by which the TX PIO packets were being marked for drop conflicted with the way send contexts were being set up. As a result, the send context was repeatedly being reset. 3. The fault injection only makes sense when the user can control it through the debugfs entries. However, a kernel configuration can enable fault injection but keep fault injection debugfs entries disabled. Therefore, it makes sense that the HFI fault injection code depends on both. 4. Error suppression did not take into account the method by which PIO packets were being dropped. Therefore, even with error suppression turned on, errors would still be displayed to the screen. A large enough packet drop percentage would cause the kernel to crash because the driver would be stuck printing errors.
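To make the resulting interface concrete, a user-space sketch along the lines of the following could drive it. It assumes the new "fault" directory is reachable under the usual debugfs mount and a per-device directory (here /sys/kernel/debug/hfi1/hfi1_0); that path and the chosen opcode range are illustrative only, not part of this patch.

/*
 * Illustrative sketch: enable opcode-filtered fault injection through
 * the debugfs files described above. The directory is an assumption
 * (debugfs mounted at /sys/kernel/debug, first device named hfi1_0).
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(path);
		if (fd >= 0)
			close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	const char *dir = "/sys/kernel/debug/hfi1/hfi1_0/fault";
	char path[128];

	/* filter by opcode rather than faulting arbitrary packets */
	snprintf(path, sizeof(path), "%s/opcode_mode", dir);
	if (write_str(path, "Y"))
		return 1;

	/* mark RC SEND opcodes 0x00-0x05 in the opcode bitmap */
	snprintf(path, sizeof(path), "%s/opcodes", dir);
	if (write_str(path, "0x0-0x5"))
		return 1;

	/* turn the machinery on */
	snprintf(path, sizeof(path), "%s/enable", dir);
	return write_str(path, "Y") ? 1 : 0;
}

Writing "-0x0-0x5" to the same 'opcodes' file would clear those bits again, matching the '-' prefix handled by fault_opcodes_write() in the new fault.c further down.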
Reviewed-by: Dennis Dalessandro Reviewed-by: Don Hiatt Reviewed-by: Mike Marciniszyn Signed-off-by: Mitko Haralanov Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/Makefile | 10 +- drivers/infiniband/hw/hfi1/chip.c | 3 +- drivers/infiniband/hw/hfi1/debugfs.c | 296 +-------------------------- drivers/infiniband/hw/hfi1/debugfs.h | 93 +++++---- drivers/infiniband/hw/hfi1/driver.c | 20 +- drivers/infiniband/hw/hfi1/fault.c | 375 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/fault.h | 109 ++++++++++ drivers/infiniband/hw/hfi1/hfi.h | 10 +- drivers/infiniband/hw/hfi1/verbs.c | 13 +- drivers/infiniband/hw/hfi1/verbs.h | 6 +- 10 files changed, 577 insertions(+), 358 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/fault.c create mode 100644 drivers/infiniband/hw/hfi1/fault.h (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index ce4010bad982..f451ba912f47 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -14,7 +14,15 @@ hfi1-y := affinity.o chip.o device.o driver.o efivar.o \ qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \ uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \ verbs_txreq.o vnic_main.o vnic_sdma.o -hfi1-$(CONFIG_DEBUG_FS) += debugfs.o + +ifdef CONFIG_DEBUG_FS +hfi1-y += debugfs.o +ifdef CONFIG_FAULT_INJECTION +ifdef CONFIG_FAULT_INJECTION_DEBUG_FS +hfi1-y += fault.o +endif +endif +endif CFLAGS_trace.o = -I$(src) ifdef MVERSION diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 4cd422ff92f8..582cf7eb779f 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -65,6 +65,7 @@ #include "aspm.h" #include "affinity.h" #include "debugfs.h" +#include "fault.h" #define NUM_IB_PORTS 1 diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c index 5343960610fe..9f992ae36c89 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.c +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -60,15 +60,13 @@ #include "device.h" #include "qp.h" #include "sdma.h" +#include "fault.h" static struct dentry *hfi1_dbg_root; /* wrappers to enforce srcu in seq file */ -static ssize_t hfi1_seq_read( - struct file *file, - char __user *buf, - size_t size, - loff_t *ppos) +ssize_t hfi1_seq_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos) { struct dentry *d = file->f_path.dentry; ssize_t r; @@ -81,10 +79,7 @@ static ssize_t hfi1_seq_read( return r; } -static loff_t hfi1_seq_lseek( - struct file *file, - loff_t offset, - int whence) +loff_t hfi1_seq_lseek(struct file *file, loff_t offset, int whence) { struct dentry *d = file->f_path.dentry; loff_t r; @@ -100,48 +95,6 @@ static loff_t hfi1_seq_lseek( #define private2dd(file) (file_inode(file)->i_private) #define private2ppd(file) (file_inode(file)->i_private) -#define DEBUGFS_SEQ_FILE_OPS(name) \ -static const struct seq_operations _##name##_seq_ops = { \ - .start = _##name##_seq_start, \ - .next = _##name##_seq_next, \ - .stop = _##name##_seq_stop, \ - .show = _##name##_seq_show \ -} - -#define DEBUGFS_SEQ_FILE_OPEN(name) \ -static int _##name##_open(struct inode *inode, struct file *s) \ -{ \ - struct seq_file *seq; \ - int ret; \ - ret = seq_open(s, &_##name##_seq_ops); \ - if (ret) \ - return ret; \ - seq = s->private_data; \ - seq->private = inode->i_private; \ - return 0; \ -} - -#define DEBUGFS_FILE_OPS(name) \ -static const struct file_operations _##name##_file_ops = { \ - .owner = THIS_MODULE, \ - .open = _##name##_open, \ - .read = hfi1_seq_read, \ - .llseek = hfi1_seq_lseek, \ - .release = seq_release \ -} - -#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \ -do { \ - struct dentry *ent; \ - ent = debugfs_create_file(name, mode, parent, \ - data, ops); \ - if (!ent) \ - pr_warn("create of %s failed\n", name); \ -} while (0) - -#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \ - DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO) - static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos) { struct hfi1_opcode_stats_perctx *opstats; @@ -1160,236 +1113,6 @@ DEBUGFS_SEQ_FILE_OPS(sdma_cpu_list); DEBUGFS_SEQ_FILE_OPEN(sdma_cpu_list) DEBUGFS_FILE_OPS(sdma_cpu_list); -#ifdef CONFIG_FAULT_INJECTION -static void *_fault_stats_seq_start(struct seq_file *s, loff_t *pos) -{ - struct hfi1_opcode_stats_perctx *opstats; - - if (*pos >= ARRAY_SIZE(opstats->stats)) - return NULL; - return pos; -} - -static void *_fault_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) -{ - struct hfi1_opcode_stats_perctx *opstats; - - ++*pos; - if (*pos >= ARRAY_SIZE(opstats->stats)) - return NULL; - return pos; -} - -static void _fault_stats_seq_stop(struct seq_file *s, void *v) -{ -} - -static int _fault_stats_seq_show(struct seq_file *s, void *v) -{ - loff_t *spos = v; - loff_t i = *spos, j; - u64 n_packets = 0, n_bytes = 0; - struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; - struct hfi1_devdata *dd = dd_from_dev(ibd); - struct hfi1_ctxtdata *rcd; - - for (j = 0; j < dd->first_dyn_alloc_ctxt; j++) { - rcd = hfi1_rcd_get_by_index(dd, j); - if (rcd) { - n_packets += rcd->opstats->stats[i].n_packets; - n_bytes += rcd->opstats->stats[i].n_bytes; - 
} - hfi1_rcd_put(rcd); - } - for_each_possible_cpu(j) { - struct hfi1_opcode_stats_perctx *sp = - per_cpu_ptr(dd->tx_opstats, j); - - n_packets += sp->stats[i].n_packets; - n_bytes += sp->stats[i].n_bytes; - } - if (!n_packets && !n_bytes) - return SEQ_SKIP; - if (!ibd->fault_opcode->n_rxfaults[i] && - !ibd->fault_opcode->n_txfaults[i]) - return SEQ_SKIP; - seq_printf(s, "%02llx %llu/%llu (faults rx:%llu faults: tx:%llu)\n", i, - (unsigned long long)n_packets, - (unsigned long long)n_bytes, - (unsigned long long)ibd->fault_opcode->n_rxfaults[i], - (unsigned long long)ibd->fault_opcode->n_txfaults[i]); - return 0; -} - -DEBUGFS_SEQ_FILE_OPS(fault_stats); -DEBUGFS_SEQ_FILE_OPEN(fault_stats); -DEBUGFS_FILE_OPS(fault_stats); - -static void fault_exit_opcode_debugfs(struct hfi1_ibdev *ibd) -{ - if (ibd->fault_opcode) - debugfs_remove_recursive(ibd->fault_opcode->dir); - kfree(ibd->fault_opcode); - ibd->fault_opcode = NULL; -} - -static int fault_init_opcode_debugfs(struct hfi1_ibdev *ibd) -{ - struct dentry *parent = ibd->hfi1_ibdev_dbg; - - ibd->fault_opcode = kzalloc(sizeof(*ibd->fault_opcode), GFP_KERNEL); - if (!ibd->fault_opcode) - return -ENOMEM; - - ibd->fault_opcode->attr.interval = 1; - ibd->fault_opcode->attr.require_end = ULONG_MAX; - ibd->fault_opcode->attr.stacktrace_depth = 32; - ibd->fault_opcode->attr.dname = NULL; - ibd->fault_opcode->attr.verbose = 0; - ibd->fault_opcode->fault_by_opcode = false; - ibd->fault_opcode->opcode = 0; - ibd->fault_opcode->mask = 0xff; - - ibd->fault_opcode->dir = - fault_create_debugfs_attr("fault_opcode", - parent, - &ibd->fault_opcode->attr); - if (IS_ERR(ibd->fault_opcode->dir)) { - kfree(ibd->fault_opcode); - ibd->fault_opcode = NULL; - return -ENOENT; - } - - DEBUGFS_SEQ_FILE_CREATE(fault_stats, ibd->fault_opcode->dir, ibd); - if (!debugfs_create_bool("fault_by_opcode", 0600, - ibd->fault_opcode->dir, - &ibd->fault_opcode->fault_by_opcode)) - goto fail; - if (!debugfs_create_x8("opcode", 0600, ibd->fault_opcode->dir, - &ibd->fault_opcode->opcode)) - goto fail; - if (!debugfs_create_x8("mask", 0600, ibd->fault_opcode->dir, - &ibd->fault_opcode->mask)) - goto fail; - - return 0; -fail: - fault_exit_opcode_debugfs(ibd); - return -ENOMEM; -} - -static void fault_exit_packet_debugfs(struct hfi1_ibdev *ibd) -{ - if (ibd->fault_packet) - debugfs_remove_recursive(ibd->fault_packet->dir); - kfree(ibd->fault_packet); - ibd->fault_packet = NULL; -} - -static int fault_init_packet_debugfs(struct hfi1_ibdev *ibd) -{ - struct dentry *parent = ibd->hfi1_ibdev_dbg; - - ibd->fault_packet = kzalloc(sizeof(*ibd->fault_packet), GFP_KERNEL); - if (!ibd->fault_packet) - return -ENOMEM; - - ibd->fault_packet->attr.interval = 1; - ibd->fault_packet->attr.require_end = ULONG_MAX; - ibd->fault_packet->attr.stacktrace_depth = 32; - ibd->fault_packet->attr.dname = NULL; - ibd->fault_packet->attr.verbose = 0; - ibd->fault_packet->fault_by_packet = false; - - ibd->fault_packet->dir = - fault_create_debugfs_attr("fault_packet", - parent, - &ibd->fault_opcode->attr); - if (IS_ERR(ibd->fault_packet->dir)) { - kfree(ibd->fault_packet); - ibd->fault_packet = NULL; - return -ENOENT; - } - - if (!debugfs_create_bool("fault_by_packet", 0600, - ibd->fault_packet->dir, - &ibd->fault_packet->fault_by_packet)) - goto fail; - if (!debugfs_create_u64("fault_stats", 0400, - ibd->fault_packet->dir, - &ibd->fault_packet->n_faults)) - goto fail; - - return 0; -fail: - fault_exit_packet_debugfs(ibd); - return -ENOMEM; -} - -static void fault_exit_debugfs(struct hfi1_ibdev *ibd) -{ - 
fault_exit_opcode_debugfs(ibd); - fault_exit_packet_debugfs(ibd); -} - -static int fault_init_debugfs(struct hfi1_ibdev *ibd) -{ - int ret = 0; - - ret = fault_init_opcode_debugfs(ibd); - if (ret) - return ret; - - ret = fault_init_packet_debugfs(ibd); - if (ret) - fault_exit_opcode_debugfs(ibd); - - return ret; -} - -bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) -{ - return ibd->fault_suppress_err; -} - -bool hfi1_dbg_fault_opcode(struct rvt_qp *qp, u32 opcode, bool rx) -{ - bool ret = false; - struct hfi1_ibdev *ibd = to_idev(qp->ibqp.device); - - if (!ibd->fault_opcode || !ibd->fault_opcode->fault_by_opcode) - return false; - if (ibd->fault_opcode->opcode != (opcode & ibd->fault_opcode->mask)) - return false; - ret = should_fail(&ibd->fault_opcode->attr, 1); - if (ret) { - trace_hfi1_fault_opcode(qp, opcode); - if (rx) - ibd->fault_opcode->n_rxfaults[opcode]++; - else - ibd->fault_opcode->n_txfaults[opcode]++; - } - return ret; -} - -bool hfi1_dbg_fault_packet(struct hfi1_packet *packet) -{ - struct rvt_dev_info *rdi = &packet->rcd->ppd->dd->verbs_dev.rdi; - struct hfi1_ibdev *ibd = dev_from_rdi(rdi); - bool ret = false; - - if (!ibd->fault_packet || !ibd->fault_packet->fault_by_packet) - return false; - - ret = should_fail(&ibd->fault_packet->attr, 1); - if (ret) { - ++ibd->fault_packet->n_faults; - trace_hfi1_fault_packet(packet); - } - return ret; -} -#endif - void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) { char name[sizeof("port0counters") + 1]; @@ -1442,21 +1165,14 @@ void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) S_IRUGO : S_IRUGO | S_IWUSR); } -#ifdef CONFIG_FAULT_INJECTION - debugfs_create_bool("fault_suppress_err", 0600, - ibd->hfi1_ibdev_dbg, - &ibd->fault_suppress_err); - fault_init_debugfs(ibd); -#endif + hfi1_fault_init_debugfs(ibd); } void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd) { if (!hfi1_dbg_root) goto out; -#ifdef CONFIG_FAULT_INJECTION - fault_exit_debugfs(ibd); -#endif + hfi1_fault_exit_debugfs(ibd); debugfs_remove(ibd->hfi1_ibdev_link); debugfs_remove_recursive(ibd->hfi1_ibdev_dbg); out: diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h index 38c38a98156d..1c91461b108f 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.h +++ b/drivers/infiniband/hw/hfi1/debugfs.h @@ -1,7 +1,7 @@ #ifndef _HFI1_DEBUGFS_H #define _HFI1_DEBUGFS_H /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015, 2016, 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -48,51 +48,59 @@ */ struct hfi1_ibdev; -#ifdef CONFIG_DEBUG_FS -void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd); -void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd); -void hfi1_dbg_init(void); -void hfi1_dbg_exit(void); -#ifdef CONFIG_FAULT_INJECTION -#include -struct fault_opcode { - struct fault_attr attr; - struct dentry *dir; - bool fault_by_opcode; - u64 n_rxfaults[256]; - u64 n_txfaults[256]; - u8 opcode; - u8 mask; -}; - -struct fault_packet { - struct fault_attr attr; - struct dentry *dir; - bool fault_by_packet; - u64 n_faults; -}; - -bool hfi1_dbg_fault_opcode(struct rvt_qp *qp, u32 opcode, bool rx); -bool hfi1_dbg_fault_packet(struct hfi1_packet *packet); -bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd); -#else -static inline bool hfi1_dbg_fault_packet(struct hfi1_packet *packet) -{ - return false; +#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \ +do { \ + struct dentry *ent; \ + const char *__name = name; \ + ent = debugfs_create_file(__name, mode, parent, \ + data, ops); \ + if (!ent) \ + pr_warn("create of %s failed\n", __name); \ +} while (0) + +#define DEBUGFS_SEQ_FILE_OPS(name) \ +static const struct seq_operations _##name##_seq_ops = { \ + .start = _##name##_seq_start, \ + .next = _##name##_seq_next, \ + .stop = _##name##_seq_stop, \ + .show = _##name##_seq_show \ } -static inline bool hfi1_dbg_fault_opcode(struct rvt_qp *qp, - u32 opcode, bool rx) -{ - return false; +#define DEBUGFS_SEQ_FILE_OPEN(name) \ +static int _##name##_open(struct inode *inode, struct file *s) \ +{ \ + struct seq_file *seq; \ + int ret; \ + ret = seq_open(s, &_##name##_seq_ops); \ + if (ret) \ + return ret; \ + seq = s->private_data; \ + seq->private = inode->i_private; \ + return 0; \ } -static inline bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) -{ - return false; +#define DEBUGFS_FILE_OPS(name) \ +static const struct file_operations _##name##_file_ops = { \ + .owner = THIS_MODULE, \ + .open = _##name##_open, \ + .read = hfi1_seq_read, \ + .llseek = hfi1_seq_lseek, \ + .release = seq_release \ } -#endif + +#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \ + DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, 0444) + +ssize_t hfi1_seq_read(struct file *file, char __user *buf, size_t size, + loff_t *ppos); +loff_t hfi1_seq_lseek(struct file *file, loff_t offset, int whence); + +#ifdef CONFIG_DEBUG_FS +void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd); +void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd); +void hfi1_dbg_init(void); +void hfi1_dbg_exit(void); #else static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) @@ -111,13 +119,12 @@ static inline void hfi1_dbg_exit(void) { } -static inline bool hfi1_dbg_fault_packet(struct hfi1_packet *packet) +static inline bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet) { return false; } -static inline bool hfi1_dbg_fault_opcode(struct rvt_qp *qp, - u32 opcode, bool rx) +static inline bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, u32 opcode) { return false; } diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index bd837a048bf4..e5a57ebd8da4 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2017 Intel Corporation. + * Copyright(c) 2015-2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -61,6 +61,7 @@ #include "sdma.h" #include "debugfs.h" #include "vnic.h" +#include "fault.h" #undef pr_fmt #define pr_fmt(fmt) DRIVER_NAME ": " fmt @@ -1565,10 +1566,10 @@ void handle_eflags(struct hfi1_packet *packet) */ int process_receive_ib(struct hfi1_packet *packet) { - if (unlikely(hfi1_dbg_fault_packet(packet))) + if (hfi1_setup_9B_packet(packet)) return RHF_RCV_CONTINUE; - if (hfi1_setup_9B_packet(packet)) + if (unlikely(hfi1_dbg_should_fault_rx(packet))) return RHF_RCV_CONTINUE; trace_hfi1_rcvhdr(packet); @@ -1642,7 +1643,8 @@ int process_receive_error(struct hfi1_packet *packet) /* KHdrHCRCErr -- KDETH packet with a bad HCRC */ if (unlikely( hfi1_dbg_fault_suppress_err(&packet->rcd->dd->verbs_dev) && - rhf_rcv_type_err(packet->rhf) == 3)) + (rhf_rcv_type_err(packet->rhf) == RHF_RCV_TYPE_ERROR || + packet->rhf & RHF_DC_ERR))) return RHF_RCV_CONTINUE; hfi1_setup_ib_header(packet); @@ -1657,10 +1659,10 @@ int process_receive_error(struct hfi1_packet *packet) int kdeth_process_expected(struct hfi1_packet *packet) { - if (unlikely(hfi1_dbg_fault_packet(packet))) + hfi1_setup_9B_packet(packet); + if (unlikely(hfi1_dbg_should_fault_rx(packet))) return RHF_RCV_CONTINUE; - hfi1_setup_ib_header(packet); if (unlikely(rhf_err_flags(packet->rhf))) handle_eflags(packet); @@ -1671,11 +1673,11 @@ int kdeth_process_expected(struct hfi1_packet *packet) int kdeth_process_eager(struct hfi1_packet *packet) { - hfi1_setup_ib_header(packet); + hfi1_setup_9B_packet(packet); + if (unlikely(hfi1_dbg_should_fault_rx(packet))) + return RHF_RCV_CONTINUE; if (unlikely(rhf_err_flags(packet->rhf))) handle_eflags(packet); - if (unlikely(hfi1_dbg_fault_packet(packet))) - return RHF_RCV_CONTINUE; dd_dev_err(packet->rcd->dd, "Unhandled eager packet received. Dropping.\n"); diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c new file mode 100644 index 000000000000..e2290f32c8d9 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/fault.c @@ -0,0 +1,375 @@ +/* + * Copyright(c) 2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include +#include +#include +#include +#include + +#include "debugfs.h" +#include "fault.h" +#include "trace.h" + +#define HFI1_FAULT_DIR_TX BIT(0) +#define HFI1_FAULT_DIR_RX BIT(1) +#define HFI1_FAULT_DIR_TXRX (HFI1_FAULT_DIR_TX | HFI1_FAULT_DIR_RX) + +static void *_fault_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void *_fault_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + ++*pos; + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void _fault_stats_seq_stop(struct seq_file *s, void *v) +{ +} + +static int _fault_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos, j; + u64 n_packets = 0, n_bytes = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_ctxtdata *rcd; + + for (j = 0; j < dd->first_dyn_alloc_ctxt; j++) { + rcd = hfi1_rcd_get_by_index(dd, j); + if (rcd) { + n_packets += rcd->opstats->stats[i].n_packets; + n_bytes += rcd->opstats->stats[i].n_bytes; + } + hfi1_rcd_put(rcd); + } + for_each_possible_cpu(j) { + struct hfi1_opcode_stats_perctx *sp = + per_cpu_ptr(dd->tx_opstats, j); + + n_packets += sp->stats[i].n_packets; + n_bytes += sp->stats[i].n_bytes; + } + if (!n_packets && !n_bytes) + return SEQ_SKIP; + if (!ibd->fault->n_rxfaults[i] && !ibd->fault->n_txfaults[i]) + return SEQ_SKIP; + seq_printf(s, "%02llx %llu/%llu (faults rx:%llu faults: tx:%llu)\n", i, + (unsigned long long)n_packets, + (unsigned long long)n_bytes, + (unsigned long long)ibd->fault->n_rxfaults[i], + (unsigned long long)ibd->fault->n_txfaults[i]); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(fault_stats); +DEBUGFS_SEQ_FILE_OPEN(fault_stats); +DEBUGFS_FILE_OPS(fault_stats); + +static int fault_opcodes_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return nonseekable_open(inode, file); +} + +static ssize_t fault_opcodes_write(struct file *file, const char __user *buf, + size_t len, loff_t *pos) +{ + ssize_t ret = 0; + /* 1280 = 256 opcodes * 4 chars/opcode + 255 commas + NULL */ + size_t copy, datalen = 1280; + char *data, *token, *ptr, *end; + struct fault *fault = file->private_data; + + data = kcalloc(datalen, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + copy = min(len, datalen - 1); + if (copy_from_user(data, buf, copy)) + return -EFAULT; + + ret = debugfs_file_get(file->f_path.dentry); + if (unlikely(ret)) + return ret; + ptr = data; + token = ptr; + for (ptr = data; *ptr; ptr = end + 1, token = ptr) { + char 
*dash; + unsigned long range_start, range_end, i; + bool remove = false; + + end = strchr(ptr, ','); + if (end) + *end = '\0'; + if (token[0] == '-') { + remove = true; + token++; + } + dash = strchr(token, '-'); + if (dash) + *dash = '\0'; + if (kstrtoul(token, 0, &range_start)) + break; + if (dash) { + token = dash + 1; + if (kstrtoul(token, 0, &range_end)) + break; + } else { + range_end = range_start; + } + if (range_start == range_end && range_start == -1UL) { + bitmap_zero(fault->opcodes, sizeof(fault->opcodes) * + BITS_PER_BYTE); + break; + } + for (i = range_start; i <= range_end; i++) { + if (remove) + clear_bit(i, fault->opcodes); + else + set_bit(i, fault->opcodes); + } + if (!end) + break; + } + ret = len; + + debugfs_file_put(file->f_path.dentry); + kfree(data); + return ret; +} + +static ssize_t fault_opcodes_read(struct file *file, char __user *buf, + size_t len, loff_t *pos) +{ + ssize_t ret = 0; + char *data; + size_t datalen = 1280, size = 0; /* see fault_opcodes_write() */ + unsigned long bit = 0, zero = 0; + struct fault *fault = file->private_data; + size_t bitsize = sizeof(fault->opcodes) * BITS_PER_BYTE; + + data = kcalloc(datalen, sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + ret = debugfs_file_get(file->f_path.dentry); + if (unlikely(ret)) + return ret; + bit = find_first_bit(fault->opcodes, bitsize); + while (bit < bitsize) { + zero = find_next_zero_bit(fault->opcodes, bitsize, bit); + if (zero - 1 != bit) + size += snprintf(data + size, + datalen - size - 1, + "0x%lx-0x%lx,", bit, zero - 1); + else + size += snprintf(data + size, + datalen - size - 1, "0x%lx,", + bit); + bit = find_next_bit(fault->opcodes, bitsize, zero); + } + debugfs_file_put(file->f_path.dentry); + data[size - 1] = '\n'; + data[size] = '\0'; + ret = simple_read_from_buffer(buf, len, pos, data, size); + kfree(data); + return ret; +} + +static const struct file_operations __fault_opcodes_fops = { + .owner = THIS_MODULE, + .open = fault_opcodes_open, + .read = fault_opcodes_read, + .write = fault_opcodes_write, + .llseek = no_llseek +}; + +void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd) +{ + if (ibd->fault) + debugfs_remove_recursive(ibd->fault->dir); + kfree(ibd->fault); + ibd->fault = NULL; +} + +int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd) +{ + struct dentry *parent = ibd->hfi1_ibdev_dbg; + + ibd->fault = kzalloc(sizeof(*ibd->fault), GFP_KERNEL); + if (!ibd->fault) + return -ENOMEM; + + ibd->fault->attr.interval = 1; + ibd->fault->attr.require_end = ULONG_MAX; + ibd->fault->attr.stacktrace_depth = 32; + ibd->fault->attr.dname = NULL; + ibd->fault->attr.verbose = 0; + ibd->fault->enable = false; + ibd->fault->opcode = false; + ibd->fault->fault_skip = 0; + ibd->fault->skip = 0; + ibd->fault->direction = HFI1_FAULT_DIR_TXRX; + ibd->fault->suppress_err = false; + bitmap_zero(ibd->fault->opcodes, + sizeof(ibd->fault->opcodes) * BITS_PER_BYTE); + + ibd->fault->dir = + fault_create_debugfs_attr("fault", parent, + &ibd->fault->attr); + if (IS_ERR(ibd->fault->dir)) { + kfree(ibd->fault); + ibd->fault = NULL; + return -ENOENT; + } + + DEBUGFS_SEQ_FILE_CREATE(fault_stats, ibd->fault->dir, ibd); + if (!debugfs_create_bool("enable", 0600, ibd->fault->dir, + &ibd->fault->enable)) + goto fail; + if (!debugfs_create_bool("suppress_err", 0600, + ibd->fault->dir, + &ibd->fault->suppress_err)) + goto fail; + if (!debugfs_create_bool("opcode_mode", 0600, ibd->fault->dir, + &ibd->fault->opcode)) + goto fail; + if (!debugfs_create_file("opcodes", 0600, ibd->fault->dir, + ibd->fault, 
&__fault_opcodes_fops)) + goto fail; + if (!debugfs_create_u64("skip_pkts", 0600, + ibd->fault->dir, + &ibd->fault->fault_skip)) + goto fail; + if (!debugfs_create_u64("skip_usec", 0600, + ibd->fault->dir, + &ibd->fault->fault_skip_usec)) + goto fail; + if (!debugfs_create_u8("direction", 0600, ibd->fault->dir, + &ibd->fault->direction)) + goto fail; + + return 0; +fail: + hfi1_fault_exit_debugfs(ibd); + return -ENOMEM; +} + +bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) +{ + if (ibd->fault) + return ibd->fault->suppress_err; + return false; +} + +static bool __hfi1_should_fault(struct hfi1_ibdev *ibd, u32 opcode, + u8 direction) +{ + bool ret = false; + + if (!ibd->fault || !ibd->fault->enable) + return false; + if (!(ibd->fault->direction & direction)) + return false; + if (ibd->fault->opcode) { + if (bitmap_empty(ibd->fault->opcodes, + (sizeof(ibd->fault->opcodes) * + BITS_PER_BYTE))) + return false; + if (!(test_bit(opcode, ibd->fault->opcodes))) + return false; + } + if (ibd->fault->fault_skip_usec && + time_before(jiffies, ibd->fault->skip_usec)) + return false; + if (ibd->fault->fault_skip && ibd->fault->skip) { + ibd->fault->skip--; + return false; + } + ret = should_fail(&ibd->fault->attr, 1); + if (ret) { + ibd->fault->skip = ibd->fault->fault_skip; + ibd->fault->skip_usec = jiffies + + usecs_to_jiffies(ibd->fault->fault_skip_usec); + } + return ret; +} + +bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, u32 opcode) +{ + struct hfi1_ibdev *ibd = to_idev(qp->ibqp.device); + + if (__hfi1_should_fault(ibd, opcode, HFI1_FAULT_DIR_TX)) { + trace_hfi1_fault_opcode(qp, opcode); + ibd->fault->n_txfaults[opcode]++; + return true; + } + return false; +} + +bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet) +{ + struct hfi1_ibdev *ibd = &packet->rcd->dd->verbs_dev; + + if (__hfi1_should_fault(ibd, packet->opcode, HFI1_FAULT_DIR_RX)) { + trace_hfi1_fault_packet(packet); + ibd->fault->n_rxfaults[packet->opcode]++; + return true; + } + return false; +} diff --git a/drivers/infiniband/hw/hfi1/fault.h b/drivers/infiniband/hw/hfi1/fault.h new file mode 100644 index 000000000000..a83382700a7c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/fault.h @@ -0,0 +1,109 @@ +#ifndef _HFI1_FAULT_H +#define _HFI1_FAULT_H +/* + * Copyright(c) 2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. 
+ * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include +#include +#include +#include +#include + +#include "hfi.h" + +struct hfi1_ibdev; + +#if defined(CONFIG_FAULT_INJECTION) && defined(CONFIG_FAULT_INJECTION_DEBUG_FS) +struct fault { + struct fault_attr attr; + struct dentry *dir; + u64 n_rxfaults[(1U << BITS_PER_BYTE)]; + u64 n_txfaults[(1U << BITS_PER_BYTE)]; + u64 fault_skip; + u64 skip; + u64 fault_skip_usec; + unsigned long skip_usec; + unsigned long opcodes[(1U << BITS_PER_BYTE) / BITS_PER_LONG]; + bool enable; + bool suppress_err; + bool opcode; + u8 direction; +}; + +int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd); +bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, u32 opcode); +bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet); +bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd); +void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd); + +#else + +static inline int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd) +{ + return 0; +} + +static inline bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet) +{ + return false; +} + +static inline bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, + u32 opcode) +{ + return false; +} + +static inline bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) +{ + return false; +} + +static inline void hfi1_fault_exit_debugfs(struct hfi1_ibdev *ibd) +{ +} +#endif +#endif /* _HFI1_FAULT_H */ diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 9c97c180c35e..9cd758ce7764 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1,7 +1,7 @@ #ifndef _HFI1_KERNEL_H #define _HFI1_KERNEL_H /* - * Copyright(c) 2015-2017 Intel Corporation. + * Copyright(c) 2015-2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -2049,7 +2049,9 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK +#ifndef CONFIG_FAULT_INJECTION | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK +#endif | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK @@ -2062,7 +2064,11 @@ static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK; if (ctxt_type == SC_USER) - base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY; + base_sc_integrity |= +#ifndef CONFIG_FAULT_INJECTION + SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK | +#endif + HFI1_PKT_USER_SC_INTEGRITY; else base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index c8cf4d4984d3..9554e912af98 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -63,6 +63,7 @@ #include "verbs_txreq.h" #include "debugfs.h" #include "vnic.h" +#include "fault.h" static unsigned int hfi1_lkey_table_size = 16; module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, @@ -624,10 +625,6 @@ static inline void hfi1_handle_packet(struct hfi1_packet *packet, if (hfi1_do_pkey_check(packet)) goto unlock_drop; - if (unlikely(hfi1_dbg_fault_opcode(packet->qp, packet->opcode, - true))) - goto unlock_drop; - spin_lock_irqsave(&packet->qp->r_lock, flags); packet_handler = qp_ok(packet); if (likely(packet_handler)) @@ -934,8 +931,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, else pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); - if (unlikely(hfi1_dbg_fault_opcode(qp, ps->opcode, - false))) + if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode))) pbc = hfi1_fault_tx(qp, ps->opcode, pbc); pbc = create_pbc(ppd, pbc, @@ -1088,7 +1084,8 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, pbc |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; else pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); - if (unlikely(hfi1_dbg_fault_opcode(qp, ps->opcode, false))) + + if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode))) pbc = hfi1_fault_tx(qp, ps->opcode, pbc); pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen); } diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 2d787b8346ca..081ca52e6621 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -227,9 +227,7 @@ struct hfi1_ibdev { /* per HFI symlinks to above */ struct dentry *hfi1_ibdev_link; #ifdef CONFIG_FAULT_INJECTION - struct fault_opcode *fault_opcode; - struct fault_packet *fault_packet; - bool fault_suppress_err; + struct fault *fault; #endif #endif }; -- cgit v1.2.3 From c872a1f9e3aaf51b091ff19ef6cb1e1a298f3c90 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Wed, 2 May 2018 06:43:31 -0700 Subject: IB/Hfi1: Read CCE Revision register to verify the device is responsive When the Hfi1 device is unresponsive, reading the RcvArrayCnt register will return all 1's. This value is then used to remap the chip's RcvArray. The incorrect all-ones value used in remapping the RcvArray will trigger a warning, as shown by the trace below: [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x70/0xb0 [] warn_slowpath_fmt+0x5c/0x80 [] __ioremap_caller+0x279/0x320 [] ? _dev_info+0x6c/0x90 [] ? hfi1_pcie_ddinit+0x1d5/0x330 [hfi1] [] ioremap_wc+0x32/0x40 [] hfi1_pcie_ddinit+0x1d5/0x330 [hfi1] [] hfi1_init_dd+0x1d1/0x2440 [hfi1] [] ? pci_write_config_word+0x1c/0x20 Read the CCE revision register first to verify that the WFR device is responsive. If the read returns "all ones", bail out of init and fail the driver load. Reviewed-by: Mike Marciniszyn Reviewed-by: Michael J. Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 7 ------- drivers/infiniband/hw/hfi1/pcie.c | 8 ++++++++ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 582cf7eb779f..0fab6df0a345 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -15038,13 +15038,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (ret < 0) goto bail_cleanup; - /* verify that reads actually work, save revision for reset check */ - dd->revision = read_csr(dd, CCE_REVISION); - if (dd->revision == ~(u64)0) { - dd_dev_err(dd, "cannot read chip CSRs\n"); - ret = -EINVAL; - goto bail_cleanup; - } dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT) & CCE_REVISION_CHIP_REV_MAJOR_MASK; dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT) diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index c1c982908b4b..87bd6b60cb53 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -183,6 +183,14 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) return -ENOMEM; } dd_dev_info(dd, "UC base1: %p for %x\n", dd->kregbase1, RCV_ARRAY); + + /* verify that reads actually work, save revision for reset check */ + dd->revision = readq(dd->kregbase1 + CCE_REVISION); + if (dd->revision == ~(u64)0) { + dd_dev_err(dd, "Cannot read chip CSRs\n"); + goto nomem; + } + dd->chip_rcv_array_count = readq(dd->kregbase1 + RCV_ARRAY_CNT); dd_dev_info(dd, "RcvArray count: %u\n", dd->chip_rcv_array_count); dd->base2_start = RCV_ARRAY + dd->chip_rcv_array_count * 8; -- cgit v1.2.3 From af8aab71370a692eaf7e7969ba5b1a455ac20113 Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Wed, 2 May 2018 06:43:39 -0700 Subject: IB/hfi1: Optimize kthread pointer locking when queuing CQ entries All threads queuing CQ entries on different CQs are unnecessarily synchronized by a spin lock to check if the CQ kthread worker hasn't been destroyed before queuing a CQ entry.
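Schematically, the fix described in the rest of this message replaces that lock-protected liveness check with the usual RCU recipe for a pointer that is read on every completion but torn down only once. A sketch of that recipe, with illustrative names rather than the driver's own (the real cq.c hunks follow below):

#include <linux/kthread.h>
#include <linux/rcupdate.h>

/* illustrative stand-in for rdi->worker */
static struct kthread_worker __rcu *worker;

/* hot path: no device-global lock, only an RCU read-side section */
static void queue_completion(struct kthread_work *work)
{
	struct kthread_worker *w;

	rcu_read_lock();
	w = rcu_dereference(worker);
	if (likely(w))
		kthread_queue_work(w, work);
	rcu_read_unlock();
}

/* teardown: unpublish the pointer, wait out readers, then destroy */
static void retire_worker(void)
{
	/* assumes a single caller serializes teardown */
	struct kthread_worker *w = rcu_dereference_protected(worker, true);

	if (!w)
		return;
	RCU_INIT_POINTER(worker, NULL);
	synchronize_rcu();	/* in-flight queue_completion() calls finish */
	kthread_destroy_worker(w);
}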
The lock used in 6efaf10f163d ("IB/rdmavt: Avoid queuing work into a destroyed cq kthread worker") is a device global lock and will have poor performance at scale as completions are entered from a large number of CPUs. Convert to use RCU where the read side of RCU is rvt_cq_enter() to determine that the worker is alive prior to triggering the completion event. Apply write side RCU semantics in rvt_driver_cq_init() and rvt_cq_exit(). Fixes: 6efaf10f163d ("IB/rdmavt: Avoid queuing work into a destroyed cq kthread worker") Cc: # 4.14.x Reviewed-by: Mike Marciniszyn Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/cq.c | 31 +++++++++++++++++++------------ include/rdma/rdma_vt.h | 2 +- 2 files changed, 20 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index fb52b669bfce..340c17aba3b0 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -120,17 +120,20 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) if (cq->notify == IB_CQ_NEXT_COMP || (cq->notify == IB_CQ_SOLICITED && (solicited || entry->status != IB_WC_SUCCESS))) { + struct kthread_worker *worker; + /* * This will cause send_complete() to be called in * another thread. */ - spin_lock(&cq->rdi->n_cqs_lock); - if (likely(cq->rdi->worker)) { + rcu_read_lock(); + worker = rcu_dereference(cq->rdi->worker); + if (likely(worker)) { cq->notify = RVT_CQ_NONE; cq->triggered++; - kthread_queue_work(cq->rdi->worker, &cq->comptask); + kthread_queue_work(worker, &cq->comptask); } - spin_unlock(&cq->rdi->n_cqs_lock); + rcu_read_unlock(); } spin_unlock_irqrestore(&cq->lock, flags); @@ -512,7 +515,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi) int cpu; struct kthread_worker *worker; - if (rdi->worker) + if (rcu_access_pointer(rdi->worker)) return 0; spin_lock_init(&rdi->n_cqs_lock); @@ -524,7 +527,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi) return PTR_ERR(worker); set_user_nice(worker->task, MIN_NICE); - rdi->worker = worker; + RCU_INIT_POINTER(rdi->worker, worker); return 0; } @@ -536,15 +539,19 @@ void rvt_cq_exit(struct rvt_dev_info *rdi) { struct kthread_worker *worker; - /* block future queuing from send_complete() */ - spin_lock_irq(&rdi->n_cqs_lock); - worker = rdi->worker; + if (!rcu_access_pointer(rdi->worker)) + return; + + spin_lock(&rdi->n_cqs_lock); + worker = rcu_dereference_protected(rdi->worker, + lockdep_is_held(&rdi->n_cqs_lock)); if (!worker) { - spin_unlock_irq(&rdi->n_cqs_lock); + spin_unlock(&rdi->n_cqs_lock); return; } - rdi->worker = NULL; - spin_unlock_irq(&rdi->n_cqs_lock); + RCU_INIT_POINTER(rdi->worker, NULL); + spin_unlock(&rdi->n_cqs_lock); + synchronize_rcu(); kthread_destroy_worker(worker); } diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 3f4c187e435d..eec495e68823 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -402,7 +402,7 @@ struct rvt_dev_info { spinlock_t pending_lock; /* protect pending mmap list */ /* CQ */ - struct kthread_worker *worker; /* per device cq worker */ + struct kthread_worker __rcu *worker; /* per device cq worker */ u32 n_cqs_allocated; /* number of CQs allocated for device */ spinlock_t n_cqs_lock; /* protect count of in use cqs */ -- cgit v1.2.3 From cf38ea100edfcc0ec0a5797966d69ec4e10fe4f1 Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Wed, 2 May 2018 06:43:47 -0700 Subject: IB/hfi1: Create common functions 
for affinity CPU mask operations CPU masks are used to keep track of affinity assignments for IRQs and processes. Operations performed on these affinity CPU masks are duplicated throughout the code. Create common functions for affinity CPU mask operations to remove duplicate code. Reviewed-by: Michael J. Ruhl Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/affinity.c | 83 +++++++++++++++++++++++++---------- 1 file changed, 60 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index b5fab55cc275..eca9e6354017 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -77,6 +77,58 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set) set->gen = 0; } +/* Increment generation of CPU set if needed */ +static void _cpu_mask_set_gen_inc(struct cpu_mask_set *set) +{ + if (cpumask_equal(&set->mask, &set->used)) { + /* + * We've used up all the CPUs, bump up the generation + * and reset the 'used' map + */ + set->gen++; + cpumask_clear(&set->used); + } +} + +static void _cpu_mask_set_gen_dec(struct cpu_mask_set *set) +{ + if (cpumask_empty(&set->used) && set->gen) { + set->gen--; + cpumask_copy(&set->used, &set->mask); + } +} + +/* Get the first CPU from the list of unused CPUs in a CPU set data structure */ +static int cpu_mask_set_get_first(struct cpu_mask_set *set, cpumask_var_t diff) +{ + int cpu; + + if (!diff || !set) + return -EINVAL; + + _cpu_mask_set_gen_inc(set); + + /* Find out CPUs left in CPU mask */ + cpumask_andnot(diff, &set->mask, &set->used); + + cpu = cpumask_first(diff); + if (cpu >= nr_cpu_ids) /* empty */ + cpu = -EINVAL; + else + cpumask_set_cpu(cpu, &set->used); + + return cpu; +} + +static void cpu_mask_set_put(struct cpu_mask_set *set, int cpu) +{ + if (!set) + return; + + cpumask_clear_cpu(cpu, &set->used); + _cpu_mask_set_gen_dec(set); +} + /* Initialize non-HT cpu cores mask */ void init_real_cpu_mask(void) { @@ -456,17 +508,12 @@ static int get_irq_affinity(struct hfi1_devdata *dd, if (!zalloc_cpumask_var(&diff, GFP_KERNEL)) return -ENOMEM; - if (cpumask_equal(&set->mask, &set->used)) { - /* - * We've used up all the CPUs, bump up the generation - * and reset the 'used' map - */ - set->gen++; - cpumask_clear(&set->used); + cpu = cpu_mask_set_get_first(set, diff); + if (cpu < 0) { + free_cpumask_var(diff); + dd_dev_err(dd, "Failure to obtain CPU for IRQ\n"); + return cpu; } - cpumask_andnot(diff, &set->mask, &set->used); - cpu = cpumask_first(diff); - cpumask_set_cpu(cpu, &set->used); free_cpumask_var(diff); } @@ -526,10 +573,7 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd, if (set) { cpumask_andnot(&set->used, &set->used, &msix->mask); - if (cpumask_empty(&set->used) && set->gen) { - set->gen--; - cpumask_copy(&set->used, &set->mask); - } + _cpu_mask_set_gen_dec(set); } irq_set_affinity_hint(msix->irq, NULL); @@ -640,10 +684,7 @@ int hfi1_get_proc_affinity(int node) * If we've used all available HW threads, clear the mask and start * overloading. 
*/ - if (cpumask_equal(&set->mask, &set->used)) { - set->gen++; - cpumask_clear(&set->used); - } + _cpu_mask_set_gen_inc(set); /* * If NUMA node has CPUs used by interrupt handlers, include them in the @@ -767,11 +808,7 @@ void hfi1_put_proc_affinity(int cpu) return; mutex_lock(&affinity->lock); - cpumask_clear_cpu(cpu, &set->used); + cpu_mask_set_put(set, cpu); hfi1_cdbg(PROC, "Returning CPU %d for future process assignment", cpu); - if (cpumask_empty(&set->used) && set->gen) { - set->gen--; - cpumask_copy(&set->used, &set->mask); - } mutex_unlock(&affinity->lock); } -- cgit v1.2.3 From 5d18ee67d4c1735f5c1f757e89228ec68e4f4ef3 Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Wed, 2 May 2018 06:43:55 -0700 Subject: IB/{hfi1, rdmavt, qib}: Implement CQ completion vector support Currently the driver doesn't support completion vectors. These are used to indicate which sets of CQs should be grouped together into the same vector. A vector is a CQ processing thread that runs on a specific CPU. If an application has several CQs bound to different completion vectors, and each completion vector runs on a different CPU, then the completion queue workload is balanced. This helps scale as more nodes are used. Implement CQ completion vector support using a global workqueue where a CQ entry is queued to the CPU corresponding to the CQ's completion vector. Since the workqueue is global, it's guaranteed to always be there when queueing CQ entries; therefore, the RCU locking for cq->rdi->worker in the hot path is superfluous. Each completion vector is assigned to a different CPU. The number of completion vectors available is computed by taking the number of online, physical CPUs from the local NUMA node and subtracting the CPUs used for kernel receive queues and the general interrupt. Special use cases: * If there are no CPUs left for completion vectors, the same CPU for the general interrupt is used; therefore, there would only be one completion vector available. * For multi-HFI systems, the number of completion vectors available for each device is the total number of completion vectors in the local NUMA node divided by the number of devices in the same NUMA node. If there's a division remainder, the first device to get initialized gets an extra completion vector. Upon CQ creation, an invalid completion vector could be specified. Handle it as follows: * If the completion vector is less than 0, set it to 0. * Set the completion vector to the passed completion vector taken modulo the number of device completion vectors available.
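The sanitization spelled out in the last two bullets amounts to a clamp-and-modulo step; a minimal sketch, with an illustrative helper name rather than whatever the driver actually uses:

/*
 * Minimal sketch of the completion-vector sanitization described above;
 * the helper name is illustrative. Negative requests fall back to
 * vector 0, anything else wraps modulo the vectors the device exposes.
 */
static int sanitize_comp_vect(int comp_vect, int num_dev_comp_vects)
{
	if (comp_vect < 0)
		comp_vect = 0;
	return comp_vect % num_dev_comp_vects;
}

With three device completion vectors, for example, a request for vector 7 maps to vector 1, and a request for -2 maps to vector 0.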
Reviewed-by: Mike Marciniszyn Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/affinity.c | 414 +++++++++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/affinity.h | 10 +- drivers/infiniband/hw/hfi1/chip.c | 5 + drivers/infiniband/hw/hfi1/hfi.h | 3 + drivers/infiniband/hw/hfi1/init.c | 15 +- drivers/infiniband/hw/hfi1/trace.c | 3 +- drivers/infiniband/hw/hfi1/trace_dbg.h | 3 +- drivers/infiniband/hw/hfi1/verbs.c | 7 +- drivers/infiniband/hw/qib/qib_verbs.c | 6 +- drivers/infiniband/sw/rdmavt/cq.c | 81 +++---- drivers/infiniband/sw/rdmavt/cq.h | 6 +- drivers/infiniband/sw/rdmavt/trace_cq.h | 35 ++- drivers/infiniband/sw/rdmavt/vt.c | 35 +-- include/rdma/rdma_vt.h | 7 +- include/rdma/rdmavt_cq.h | 5 +- 15 files changed, 534 insertions(+), 101 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index eca9e6354017..fbe7198a715a 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -208,7 +208,13 @@ int node_affinity_init(void) return 0; } -void node_affinity_destroy(void) +static void node_affinity_destroy(struct hfi1_affinity_node *entry) +{ + free_percpu(entry->comp_vect_affinity); + kfree(entry); +} + +void node_affinity_destroy_all(void) { struct list_head *pos, *q; struct hfi1_affinity_node *entry; @@ -218,7 +224,7 @@ void node_affinity_destroy(void) entry = list_entry(pos, struct hfi1_affinity_node, list); list_del(pos); - kfree(entry); + node_affinity_destroy(entry); } mutex_unlock(&node_affinity.lock); kfree(hfi1_per_node_cntr); @@ -232,6 +238,7 @@ static struct hfi1_affinity_node *node_affinity_allocate(int node) if (!entry) return NULL; entry->node = node; + entry->comp_vect_affinity = alloc_percpu(u16); INIT_LIST_HEAD(&entry->list); return entry; @@ -261,6 +268,341 @@ static struct hfi1_affinity_node *node_affinity_lookup(int node) return NULL; } +static int per_cpu_affinity_get(cpumask_var_t possible_cpumask, + u16 __percpu *comp_vect_affinity) +{ + int curr_cpu; + u16 cntr; + u16 prev_cntr; + int ret_cpu; + + if (!possible_cpumask) { + ret_cpu = -EINVAL; + goto fail; + } + + if (!comp_vect_affinity) { + ret_cpu = -EINVAL; + goto fail; + } + + ret_cpu = cpumask_first(possible_cpumask); + if (ret_cpu >= nr_cpu_ids) { + ret_cpu = -EINVAL; + goto fail; + } + + prev_cntr = *per_cpu_ptr(comp_vect_affinity, ret_cpu); + for_each_cpu(curr_cpu, possible_cpumask) { + cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu); + + if (cntr < prev_cntr) { + ret_cpu = curr_cpu; + prev_cntr = cntr; + } + } + + *per_cpu_ptr(comp_vect_affinity, ret_cpu) += 1; + +fail: + return ret_cpu; +} + +static int per_cpu_affinity_put_max(cpumask_var_t possible_cpumask, + u16 __percpu *comp_vect_affinity) +{ + int curr_cpu; + int max_cpu; + u16 cntr; + u16 prev_cntr; + + if (!possible_cpumask) + return -EINVAL; + + if (!comp_vect_affinity) + return -EINVAL; + + max_cpu = cpumask_first(possible_cpumask); + if (max_cpu >= nr_cpu_ids) + return -EINVAL; + + prev_cntr = *per_cpu_ptr(comp_vect_affinity, max_cpu); + for_each_cpu(curr_cpu, possible_cpumask) { + cntr = *per_cpu_ptr(comp_vect_affinity, curr_cpu); + + if (cntr > prev_cntr) { + max_cpu = curr_cpu; + prev_cntr = cntr; + } 
+ } + + *per_cpu_ptr(comp_vect_affinity, max_cpu) -= 1; + + return max_cpu; +} + +/* + * Non-interrupt CPUs are used first, then interrupt CPUs. + * Two already allocated cpu masks must be passed. + */ +static int _dev_comp_vect_cpu_get(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry, + cpumask_var_t non_intr_cpus, + cpumask_var_t available_cpus) + __must_hold(&node_affinity.lock) +{ + int cpu; + struct cpu_mask_set *set = dd->comp_vect; + + lockdep_assert_held(&node_affinity.lock); + if (!non_intr_cpus) { + cpu = -1; + goto fail; + } + + if (!available_cpus) { + cpu = -1; + goto fail; + } + + /* Available CPUs for pinning completion vectors */ + _cpu_mask_set_gen_inc(set); + cpumask_andnot(available_cpus, &set->mask, &set->used); + + /* Available CPUs without SDMA engine interrupts */ + cpumask_andnot(non_intr_cpus, available_cpus, + &entry->def_intr.used); + + /* If there are non-interrupt CPUs available, use them first */ + if (!cpumask_empty(non_intr_cpus)) + cpu = cpumask_first(non_intr_cpus); + else /* Otherwise, use interrupt CPUs */ + cpu = cpumask_first(available_cpus); + + if (cpu >= nr_cpu_ids) { /* empty */ + cpu = -1; + goto fail; + } + cpumask_set_cpu(cpu, &set->used); + +fail: + return cpu; +} + +static void _dev_comp_vect_cpu_put(struct hfi1_devdata *dd, int cpu) +{ + struct cpu_mask_set *set = dd->comp_vect; + + if (cpu < 0) + return; + + cpu_mask_set_put(set, cpu); +} + +/* _dev_comp_vect_mappings_destroy() is reentrant */ +static void _dev_comp_vect_mappings_destroy(struct hfi1_devdata *dd) +{ + int i, cpu; + + if (!dd->comp_vect_mappings) + return; + + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + cpu = dd->comp_vect_mappings[i]; + _dev_comp_vect_cpu_put(dd, cpu); + dd->comp_vect_mappings[i] = -1; + hfi1_cdbg(AFFINITY, + "[%s] Release CPU %d from completion vector %d", + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), cpu, i); + } + + kfree(dd->comp_vect_mappings); + dd->comp_vect_mappings = NULL; +} + +/* + * This function creates the table for looking up CPUs for completion vectors. + * num_comp_vectors needs to have been initilized before calling this function. 
+ */ +static int _dev_comp_vect_mappings_create(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry) + __must_hold(&node_affinity.lock) +{ + int i, cpu, ret; + cpumask_var_t non_intr_cpus; + cpumask_var_t available_cpus; + + lockdep_assert_held(&node_affinity.lock); + + if (!zalloc_cpumask_var(&non_intr_cpus, GFP_KERNEL)) + return -ENOMEM; + + if (!zalloc_cpumask_var(&available_cpus, GFP_KERNEL)) { + free_cpumask_var(non_intr_cpus); + return -ENOMEM; + } + + dd->comp_vect_mappings = kcalloc(dd->comp_vect_possible_cpus, + sizeof(*dd->comp_vect_mappings), + GFP_KERNEL); + if (!dd->comp_vect_mappings) { + ret = -ENOMEM; + goto fail; + } + for (i = 0; i < dd->comp_vect_possible_cpus; i++) + dd->comp_vect_mappings[i] = -1; + + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + cpu = _dev_comp_vect_cpu_get(dd, entry, non_intr_cpus, + available_cpus); + if (cpu < 0) { + ret = -EINVAL; + goto fail; + } + + dd->comp_vect_mappings[i] = cpu; + hfi1_cdbg(AFFINITY, + "[%s] Completion Vector %d -> CPU %d", + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), i, cpu); + } + + return 0; + +fail: + free_cpumask_var(available_cpus); + free_cpumask_var(non_intr_cpus); + _dev_comp_vect_mappings_destroy(dd); + + return ret; +} + +int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd) +{ + int ret; + struct hfi1_affinity_node *entry; + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + if (!entry) { + ret = -EINVAL; + goto unlock; + } + ret = _dev_comp_vect_mappings_create(dd, entry); +unlock: + mutex_unlock(&node_affinity.lock); + + return ret; +} + +void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd) +{ + _dev_comp_vect_mappings_destroy(dd); +} + +int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect) +{ + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + + if (!dd->comp_vect_mappings) + return -EINVAL; + if (comp_vect >= dd->comp_vect_possible_cpus) + return -EINVAL; + + return dd->comp_vect_mappings[comp_vect]; +} + +/* + * It assumes dd->comp_vect_possible_cpus is available. + */ +static int _dev_comp_vect_cpu_mask_init(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry, + bool first_dev_init) + __must_hold(&node_affinity.lock) +{ + int i, j, curr_cpu; + int possible_cpus_comp_vect = 0; + struct cpumask *dev_comp_vect_mask = &dd->comp_vect->mask; + + lockdep_assert_held(&node_affinity.lock); + /* + * If there's only one CPU available for completion vectors, then + * there will only be one completion vector available. Othewise, + * the number of completion vector available will be the number of + * available CPUs divide it by the number of devices in the + * local NUMA node. + */ + if (cpumask_weight(&entry->comp_vect_mask) == 1) { + possible_cpus_comp_vect = 1; + dd_dev_warn(dd, + "Number of kernel receive queues is too large for completion vector affinity to be effective\n"); + } else { + possible_cpus_comp_vect += + cpumask_weight(&entry->comp_vect_mask) / + hfi1_per_node_cntr[dd->node]; + + /* + * If the completion vector CPUs available doesn't divide + * evenly among devices, then the first device device to be + * initialized gets an extra CPU. 
+ */ + if (first_dev_init && + cpumask_weight(&entry->comp_vect_mask) % + hfi1_per_node_cntr[dd->node] != 0) + possible_cpus_comp_vect++; + } + + dd->comp_vect_possible_cpus = possible_cpus_comp_vect; + + /* Reserving CPUs for device completion vector */ + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + curr_cpu = per_cpu_affinity_get(&entry->comp_vect_mask, + entry->comp_vect_affinity); + if (curr_cpu < 0) + goto fail; + + cpumask_set_cpu(curr_cpu, dev_comp_vect_mask); + } + + hfi1_cdbg(AFFINITY, + "[%s] Completion vector affinity CPU set(s) %*pbl", + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), + cpumask_pr_args(dev_comp_vect_mask)); + + return 0; + +fail: + for (j = 0; j < i; j++) + per_cpu_affinity_put_max(&entry->comp_vect_mask, + entry->comp_vect_affinity); + + return curr_cpu; +} + +/* + * It assumes dd->comp_vect_possible_cpus is available. + */ +static void _dev_comp_vect_cpu_mask_clean_up(struct hfi1_devdata *dd, + struct hfi1_affinity_node *entry) + __must_hold(&node_affinity.lock) +{ + int i, cpu; + + lockdep_assert_held(&node_affinity.lock); + if (!dd->comp_vect_possible_cpus) + return; + + for (i = 0; i < dd->comp_vect_possible_cpus; i++) { + cpu = per_cpu_affinity_put_max(&dd->comp_vect->mask, + entry->comp_vect_affinity); + /* Clearing CPU in device completion vector cpu mask */ + if (cpu >= 0) + cpumask_clear_cpu(cpu, &dd->comp_vect->mask); + } + + dd->comp_vect_possible_cpus = 0; +} + /* * Interrupt affinity. * @@ -277,7 +619,8 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) int node = pcibus_to_node(dd->pcidev->bus); struct hfi1_affinity_node *entry; const struct cpumask *local_mask; - int curr_cpu, possible, i; + int curr_cpu, possible, i, ret; + bool new_entry = false; if (node < 0) node = numa_node_id(); @@ -299,11 +642,14 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) if (!entry) { dd_dev_err(dd, "Unable to allocate global affinity node\n"); - mutex_unlock(&node_affinity.lock); - return -ENOMEM; + ret = -ENOMEM; + goto fail; } + new_entry = true; + init_cpu_mask_set(&entry->def_intr); init_cpu_mask_set(&entry->rcv_intr); + cpumask_clear(&entry->comp_vect_mask); cpumask_clear(&entry->general_intr_mask); /* Use the "real" cpu mask of this node as the default */ cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask, @@ -356,10 +702,64 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) &entry->general_intr_mask); } - node_affinity_add_tail(entry); + /* Determine completion vector CPUs for the entire node */ + cpumask_and(&entry->comp_vect_mask, + &node_affinity.real_cpu_mask, local_mask); + cpumask_andnot(&entry->comp_vect_mask, + &entry->comp_vect_mask, + &entry->rcv_intr.mask); + cpumask_andnot(&entry->comp_vect_mask, + &entry->comp_vect_mask, + &entry->general_intr_mask); + + /* + * If there ends up being 0 CPU cores leftover for completion + * vectors, use the same CPU core as the general/control + * context. 
+ */ + if (cpumask_weight(&entry->comp_vect_mask) == 0) + cpumask_copy(&entry->comp_vect_mask, + &entry->general_intr_mask); } + + ret = _dev_comp_vect_cpu_mask_init(dd, entry, new_entry); + if (ret < 0) + goto fail; + + if (new_entry) + node_affinity_add_tail(entry); + mutex_unlock(&node_affinity.lock); + return 0; + +fail: + if (new_entry) + node_affinity_destroy(entry); + mutex_unlock(&node_affinity.lock); + return ret; +} + +void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd) +{ + struct hfi1_affinity_node *entry; + + if (dd->node < 0) + return; + + mutex_lock(&node_affinity.lock); + entry = node_affinity_lookup(dd->node); + if (!entry) + goto unlock; + + /* + * Free device completion vector CPUs to be used by future + * completion vectors + */ + _dev_comp_vect_cpu_mask_clean_up(dd, entry); +unlock: + mutex_unlock(&node_affinity.lock); + dd->node = -1; } /* diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h index 2a1e374169c0..6a7e6ea4e426 100644 --- a/drivers/infiniband/hw/hfi1/affinity.h +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -98,9 +98,11 @@ void hfi1_put_proc_affinity(int cpu); struct hfi1_affinity_node { int node; + u16 __percpu *comp_vect_affinity; struct cpu_mask_set def_intr; struct cpu_mask_set rcv_intr; struct cpumask general_intr_mask; + struct cpumask comp_vect_mask; struct list_head list; }; @@ -116,7 +118,11 @@ struct hfi1_affinity_node_list { }; int node_affinity_init(void); -void node_affinity_destroy(void); +void node_affinity_destroy_all(void); extern struct hfi1_affinity_node_list node_affinity; +void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd); +int hfi1_comp_vect_mappings_lookup(struct rvt_dev_info *rdi, int comp_vect); +int hfi1_comp_vectors_set_up(struct hfi1_devdata *dd); +void hfi1_comp_vectors_clean_up(struct hfi1_devdata *dd); #endif /* _HFI1_AFFINITY_H */ diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 0fab6df0a345..46e9e4ffcba4 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -15233,6 +15233,10 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (ret) goto bail_cleanup; + ret = hfi1_comp_vectors_set_up(dd); + if (ret) + goto bail_clear_intr; + /* set up LCB access - must be after set_up_interrupts() */ init_lcb_access(dd); @@ -15275,6 +15279,7 @@ bail_free_rcverr: bail_free_cntrs: free_cntrs(dd); bail_clear_intr: + hfi1_comp_vectors_clean_up(dd); hfi1_clean_up_interrupts(dd); bail_cleanup: hfi1_pcie_ddcleanup(dd); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 9cd758ce7764..dd84238c1aac 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1263,6 +1263,9 @@ struct hfi1_devdata { /* Save the enabled LCB error bits */ u64 lcb_err_en; + struct cpu_mask_set *comp_vect; + int *comp_vect_mappings; + u32 comp_vect_possible_cpus; /* * Capability to have different send engines simply by changing a diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 790542ce89a5..5d1adfc450d3 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2017 Intel Corporation. 
+ * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -1244,6 +1244,8 @@ static void hfi1_clean_devdata(struct hfi1_devdata *dd) dd->rcv_limit = NULL; dd->send_schedule = NULL; dd->tx_opstats = NULL; + kfree(dd->comp_vect); + dd->comp_vect = NULL; sdma_clean(dd, dd->num_sdma); rvt_dealloc_device(&dd->verbs_dev.rdi); } @@ -1300,6 +1302,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) dd->unit = ret; list_add(&dd->list, &hfi1_dev_list); } + dd->node = -1; spin_unlock_irqrestore(&hfi1_devs_lock, flags); idr_preload_end(); @@ -1352,6 +1355,12 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) goto bail; } + dd->comp_vect = kzalloc(sizeof(*dd->comp_vect), GFP_KERNEL); + if (!dd->comp_vect) { + ret = -ENOMEM; + goto bail; + } + kobject_init(&dd->kobj, &hfi1_devdata_type); return dd; @@ -1521,7 +1530,7 @@ module_init(hfi1_mod_init); static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); - node_affinity_destroy(); + node_affinity_destroy_all(); hfi1_wss_exit(); hfi1_dbg_exit(); @@ -1605,6 +1614,8 @@ static void cleanup_device_data(struct hfi1_devdata *dd) static void postinit_cleanup(struct hfi1_devdata *dd) { hfi1_start_cleanup(dd); + hfi1_comp_vectors_clean_up(dd); + hfi1_dev_affinity_clean_up(dd); hfi1_pcie_ddcleanup(dd); hfi1_pcie_cleanup(dd->pcidev); diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index 89bd9851065b..332b9b7c554a 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -374,6 +374,7 @@ const char *print_u32_array( return ret; } +__hfi1_trace_fn(AFFINITY); __hfi1_trace_fn(PKT); __hfi1_trace_fn(PROC); __hfi1_trace_fn(SDMA); diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h index 0e7d929530c5..e62171fb7379 100644 --- a/drivers/infiniband/hw/hfi1/trace_dbg.h +++ b/drivers/infiniband/hw/hfi1/trace_dbg.h @@ -1,5 +1,5 @@ /* -* Copyright(c) 2015, 2016 Intel Corporation. +* Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -113,6 +113,7 @@ void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \ * hfi1_cdbg(LVL, fmt, ...); as well as take care of all * the debugfs stuff. 
*/ +__hfi1_trace_def(AFFINITY); __hfi1_trace_def(PKT); __hfi1_trace_def(PROC); __hfi1_trace_def(SDMA); diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 9554e912af98..fc2e44cde161 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -64,6 +64,7 @@ #include "debugfs.h" #include "vnic.h" #include "fault.h" +#include "affinity.h" static unsigned int hfi1_lkey_table_size = 16; module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, @@ -1934,11 +1935,11 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; + dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup = + hfi1_comp_vect_mappings_lookup; /* completeion queue */ - snprintf(dd->verbs_dev.rdi.dparms.cq_name, - sizeof(dd->verbs_dev.rdi.dparms.cq_name), - "hfi1_cq%d", dd->unit); + dd->verbs_dev.rdi.ibdev.num_comp_vectors = dd->comp_vect_possible_cpus; dd->verbs_dev.rdi.dparms.node = dd->node; /* misc settings */ diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 3977abbc83ad..14b4057a2b8f 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved. + * Copyright (c) 2012 - 2018 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -1631,10 +1631,6 @@ int qib_register_ib_device(struct qib_devdata *dd) dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB; dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE; - snprintf(dd->verbs_dev.rdi.dparms.cq_name, - sizeof(dd->verbs_dev.rdi.dparms.cq_name), - "qib_cq%d", dd->unit); - qib_fill_device_attr(dd); ppd = dd->pport; diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 340c17aba3b0..4f1544ad4aff 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -47,11 +47,12 @@ #include #include -#include #include "cq.h" #include "vt.h" #include "trace.h" +static struct workqueue_struct *comp_vector_wq; + /** * rvt_cq_enter - add a new entry to the completion queue * @cq: completion queue @@ -120,27 +121,21 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) if (cq->notify == IB_CQ_NEXT_COMP || (cq->notify == IB_CQ_SOLICITED && (solicited || entry->status != IB_WC_SUCCESS))) { - struct kthread_worker *worker; - /* * This will cause send_complete() to be called in * another thread. 
*/ - rcu_read_lock(); - worker = rcu_dereference(cq->rdi->worker); - if (likely(worker)) { - cq->notify = RVT_CQ_NONE; - cq->triggered++; - kthread_queue_work(worker, &cq->comptask); - } - rcu_read_unlock(); + cq->notify = RVT_CQ_NONE; + cq->triggered++; + queue_work_on(cq->comp_vector_cpu, comp_vector_wq, + &cq->comptask); } spin_unlock_irqrestore(&cq->lock, flags); } EXPORT_SYMBOL(rvt_cq_enter); -static void send_complete(struct kthread_work *work) +static void send_complete(struct work_struct *work) { struct rvt_cq *cq = container_of(work, struct rvt_cq, comptask); @@ -192,6 +187,7 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, struct ib_cq *ret; u32 sz; unsigned int entries = attr->cqe; + int comp_vector = attr->comp_vector; if (attr->flags) return ERR_PTR(-EINVAL); @@ -199,6 +195,11 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, if (entries < 1 || entries > rdi->dparms.props.max_cqe) return ERR_PTR(-EINVAL); + if (comp_vector < 0) + comp_vector = 0; + + comp_vector = comp_vector % rdi->ibdev.num_comp_vectors; + /* Allocate the completion queue structure. */ cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node); if (!cq) @@ -267,14 +268,22 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, * an error. */ cq->rdi = rdi; + if (rdi->driver_f.comp_vect_cpu_lookup) + cq->comp_vector_cpu = + rdi->driver_f.comp_vect_cpu_lookup(rdi, comp_vector); + else + cq->comp_vector_cpu = + cpumask_first(cpumask_of_node(rdi->dparms.node)); + cq->ibcq.cqe = entries; cq->notify = RVT_CQ_NONE; spin_lock_init(&cq->lock); - kthread_init_work(&cq->comptask, send_complete); + INIT_WORK(&cq->comptask, send_complete); cq->queue = wc; ret = &cq->ibcq; + trace_rvt_create_cq(cq, attr); goto done; bail_ip: @@ -300,7 +309,7 @@ int rvt_destroy_cq(struct ib_cq *ibcq) struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); struct rvt_dev_info *rdi = cq->rdi; - kthread_flush_work(&cq->comptask); + flush_work(&cq->comptask); spin_lock_irq(&rdi->n_cqs_lock); rdi->n_cqs_allocated--; spin_unlock_irq(&rdi->n_cqs_lock); @@ -510,24 +519,13 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) * * Return: 0 on success */ -int rvt_driver_cq_init(struct rvt_dev_info *rdi) +int rvt_driver_cq_init(void) { - int cpu; - struct kthread_worker *worker; - - if (rcu_access_pointer(rdi->worker)) - return 0; - - spin_lock_init(&rdi->n_cqs_lock); - - cpu = cpumask_first(cpumask_of_node(rdi->dparms.node)); - worker = kthread_create_worker_on_cpu(cpu, 0, - "%s", rdi->dparms.cq_name); - if (IS_ERR(worker)) - return PTR_ERR(worker); + comp_vector_wq = alloc_workqueue("%s", WQ_HIGHPRI | WQ_CPU_INTENSIVE, + 0, "rdmavt_cq"); + if (!comp_vector_wq) + return -ENOMEM; - set_user_nice(worker->task, MIN_NICE); - RCU_INIT_POINTER(rdi->worker, worker); return 0; } @@ -535,23 +533,8 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi) * rvt_cq_exit - tear down cq reources * @rdi: rvt dev structure */ -void rvt_cq_exit(struct rvt_dev_info *rdi) +void rvt_cq_exit(void) { - struct kthread_worker *worker; - - if (!rcu_access_pointer(rdi->worker)) - return; - - spin_lock(&rdi->n_cqs_lock); - worker = rcu_dereference_protected(rdi->worker, - lockdep_is_held(&rdi->n_cqs_lock)); - if (!worker) { - spin_unlock(&rdi->n_cqs_lock); - return; - } - RCU_INIT_POINTER(rdi->worker, NULL); - spin_unlock(&rdi->n_cqs_lock); - synchronize_rcu(); - - kthread_destroy_worker(worker); + destroy_workqueue(comp_vector_wq); + comp_vector_wq = NULL; } diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h index 
6182c29eff66..72184b1c176b 100644 --- a/drivers/infiniband/sw/rdmavt/cq.h +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -2,7 +2,7 @@ #define DEF_RVTCQ_H /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -59,6 +59,6 @@ int rvt_destroy_cq(struct ib_cq *ibcq); int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); -int rvt_driver_cq_init(struct rvt_dev_info *rdi); -void rvt_cq_exit(struct rvt_dev_info *rdi); +int rvt_driver_cq_init(void); +void rvt_cq_exit(void); #endif /* DEF_RVTCQ_H */ diff --git a/drivers/infiniband/sw/rdmavt/trace_cq.h b/drivers/infiniband/sw/rdmavt/trace_cq.h index a315850aa9bb..df8e1adbef9d 100644 --- a/drivers/infiniband/sw/rdmavt/trace_cq.h +++ b/drivers/infiniband/sw/rdmavt/trace_cq.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -71,6 +71,39 @@ __print_symbolic(opcode, \ wc_opcode_name(RECV), \ wc_opcode_name(RECV_RDMA_WITH_IMM)) +#define CQ_ATTR_PRINT \ +"[%s] user cq %s cqe %u comp_vector %d comp_vector_cpu %d flags %x" + +DECLARE_EVENT_CLASS(rvt_cq_template, + TP_PROTO(struct rvt_cq *cq, + const struct ib_cq_init_attr *attr), + TP_ARGS(cq, attr), + TP_STRUCT__entry(RDI_DEV_ENTRY(cq->rdi) + __field(struct rvt_mmap_info *, ip) + __field(unsigned int, cqe) + __field(int, comp_vector) + __field(int, comp_vector_cpu) + __field(u32, flags) + ), + TP_fast_assign(RDI_DEV_ASSIGN(cq->rdi) + __entry->ip = cq->ip; + __entry->cqe = attr->cqe; + __entry->comp_vector = attr->comp_vector; + __entry->comp_vector_cpu = + cq->comp_vector_cpu; + __entry->flags = attr->flags; + ), + TP_printk(CQ_ATTR_PRINT, __get_str(dev), + __entry->ip ? "true" : "false", __entry->cqe, + __entry->comp_vector, __entry->comp_vector_cpu, + __entry->flags + ) +); + +DEFINE_EVENT(rvt_cq_template, rvt_create_cq, + TP_PROTO(struct rvt_cq *cq, const struct ib_cq_init_attr *attr), + TP_ARGS(cq, attr)); + #define CQ_PRN \ "[%s] idx %u wr_id %llx status %u opcode %u,%s length %u qpn %x" diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 434199d0bc96..17e4abc067af 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -49,6 +49,7 @@ #include #include #include "vt.h" +#include "cq.h" #include "trace.h" #define RVT_UVERBS_ABI_VERSION 2 @@ -58,21 +59,18 @@ MODULE_DESCRIPTION("RDMA Verbs Transport Library"); static int rvt_init(void) { - /* - * rdmavt does not need to do anything special when it starts up. All it - * needs to do is sit and wait until a driver attempts registration. - */ - return 0; + int ret = rvt_driver_cq_init(); + + if (ret) + pr_err("Error in driver CQ init.\n"); + + return ret; } module_init(rvt_init); static void rvt_cleanup(void) { - /* - * Nothing to do at exit time either. 
The module won't be able to be - * removed until all drivers are gone which means all the dev structs - * are gone so there is really nothing to do. - */ + rvt_cq_exit(); } module_exit(rvt_cleanup); @@ -777,11 +775,7 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) } /* Completion queues */ - ret = rvt_driver_cq_init(rdi); - if (ret) { - pr_err("Error in driver CQ init.\n"); - goto bail_mr; - } + spin_lock_init(&rdi->n_cqs_lock); /* DMA Operations */ rdi->ibdev.dev.dma_ops = rdi->ibdev.dev.dma_ops ? : &dma_virt_ops; @@ -829,14 +823,15 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); rdi->ibdev.node_type = RDMA_NODE_IB_CA; - rdi->ibdev.num_comp_vectors = 1; + if (!rdi->ibdev.num_comp_vectors) + rdi->ibdev.num_comp_vectors = 1; rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ ret = ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback); if (ret) { rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); - goto bail_cq; + goto bail_mr; } rvt_create_mad_agents(rdi); @@ -844,9 +839,6 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rvt_pr_info(rdi, "Registration with rdmavt done.\n"); return ret; -bail_cq: - rvt_cq_exit(rdi); - bail_mr: rvt_mr_exit(rdi); @@ -870,7 +862,6 @@ void rvt_unregister_device(struct rvt_dev_info *rdi) rvt_free_mad_agents(rdi); ib_unregister_device(&rdi->ibdev); - rvt_cq_exit(rdi); rvt_mr_exit(rdi); rvt_qp_exit(rdi); } diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index eec495e68823..e79229a0cf01 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -2,7 +2,7 @@ #define DEF_RDMA_VT_H /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -167,7 +167,6 @@ struct rvt_driver_params { int qpn_res_end; int nports; int npkeys; - char cq_name[RVT_CQN_MAX]; int node; int psn_mask; int psn_shift; @@ -347,6 +346,9 @@ struct rvt_driver_provided { /* Notify driver to restart rc */ void (*notify_restart_rc)(struct rvt_qp *qp, u32 psn, int wait); + + /* Get and return CPU to pin CQ processing thread */ + int (*comp_vect_cpu_lookup)(struct rvt_dev_info *rdi, int comp_vect); }; struct rvt_dev_info { @@ -402,7 +404,6 @@ struct rvt_dev_info { spinlock_t pending_lock; /* protect pending mmap list */ /* CQ */ - struct kthread_worker __rcu *worker; /* per device cq worker */ u32 n_cqs_allocated; /* number of CQs allocated for device */ spinlock_t n_cqs_lock; /* protect count of in use cqs */ diff --git a/include/rdma/rdmavt_cq.h b/include/rdma/rdmavt_cq.h index 51fd00b243d0..75dc65c0bfb8 100644 --- a/include/rdma/rdmavt_cq.h +++ b/include/rdma/rdmavt_cq.h @@ -8,7 +8,7 @@ * * GPL LICENSE SUMMARY * - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2018 Intel Corporation. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -80,10 +80,11 @@ struct rvt_cq_wc { */ struct rvt_cq { struct ib_cq ibcq; - struct kthread_work comptask; + struct work_struct comptask; spinlock_t lock; /* protect changes in this struct */ u8 notify; u8 triggered; + int comp_vector_cpu; struct rvt_dev_info *rdi; struct rvt_cq_wc *queue; struct rvt_mmap_info *ip; -- cgit v1.2.3 From 832369fa6410c93547264ad449ebbf16567bbccd Mon Sep 17 00:00:00 2001 From: Brian Welty Date: Wed, 2 May 2018 06:44:03 -0700 Subject: IB/{hfi1, qib, rdmavt}: Move logic to allocate receive WQE into rdmavt Moving receive-side WQE allocation logic into rdmavt will allow further code reuse between qib and hfi1 drivers. Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Brian Welty Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 8 +- drivers/infiniband/hw/hfi1/ruc.c | 154 +--------------------------------- drivers/infiniband/hw/hfi1/uc.c | 4 +- drivers/infiniband/hw/hfi1/ud.c | 4 +- drivers/infiniband/hw/hfi1/verbs.h | 2 - drivers/infiniband/hw/qib/qib_rc.c | 8 +- drivers/infiniband/hw/qib/qib_ruc.c | 154 +--------------------------------- drivers/infiniband/hw/qib/qib_uc.c | 4 +- drivers/infiniband/hw/qib/qib_ud.c | 4 +- drivers/infiniband/hw/qib/qib_verbs.h | 2 - drivers/infiniband/sw/rdmavt/qp.c | 149 ++++++++++++++++++++++++++++++++ include/rdma/rdmavt_qp.h | 1 + 12 files changed, 170 insertions(+), 324 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index da58046a02ea..79ee2b9e28c6 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -2123,7 +2123,7 @@ void hfi1_rc_rcv(struct hfi1_packet *packet) /* OK, process the packet. */ switch (opcode) { case OP(SEND_FIRST): - ret = hfi1_rvt_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto nack_op_err; if (!ret) @@ -2149,7 +2149,7 @@ send_middle: case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): /* consume RWQE */ - ret = hfi1_rvt_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto nack_op_err; if (!ret) @@ -2159,7 +2159,7 @@ send_middle: case OP(SEND_ONLY): case OP(SEND_ONLY_WITH_IMMEDIATE): case OP(SEND_ONLY_WITH_INVALIDATE): - ret = hfi1_rvt_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto nack_op_err; if (!ret) @@ -2271,7 +2271,7 @@ send_last: goto send_middle; else if (opcode == OP(RDMA_WRITE_ONLY)) goto no_immediate_data; - ret = hfi1_rvt_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto nack_op_err; if (!ret) { diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index c0071ca4147a..ef4c566e206f 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -53,156 +53,6 @@ #include "verbs_txreq.h" #include "trace.h" -/* - * Validate a RWQE and fill in the SGE state. - * Return 1 if OK. - */ -static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) -{ - int i, j, ret; - struct ib_wc wc; - struct rvt_lkey_table *rkt; - struct rvt_pd *pd; - struct rvt_sge_state *ss; - - rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table; - pd = ibpd_to_rvtpd(qp->ibqp.srq ? 
qp->ibqp.srq->pd : qp->ibqp.pd); - ss = &qp->r_sge; - ss->sg_list = qp->r_sg_list; - qp->r_len = 0; - for (i = j = 0; i < wqe->num_sge; i++) { - if (wqe->sg_list[i].length == 0) - continue; - /* Check LKEY */ - ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - NULL, &wqe->sg_list[i], - IB_ACCESS_LOCAL_WRITE); - if (unlikely(ret <= 0)) - goto bad_lkey; - qp->r_len += wqe->sg_list[i].length; - j++; - } - ss->num_sge = j; - ss->total_len = qp->r_len; - ret = 1; - goto bail; - -bad_lkey: - while (j) { - struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; - - rvt_put_mr(sge->mr); - } - ss->num_sge = 0; - memset(&wc, 0, sizeof(wc)); - wc.wr_id = wqe->wr_id; - wc.status = IB_WC_LOC_PROT_ERR; - wc.opcode = IB_WC_RECV; - wc.qp = &qp->ibqp; - /* Signal solicited completion event. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); - ret = 0; -bail: - return ret; -} - -/** - * hfi1_rvt_get_rwqe - copy the next RWQE into the QP's RWQE - * @qp: the QP - * @wr_id_only: update qp->r_wr_id only, not qp->r_sge - * - * Return -1 if there is a local error, 0 if no RWQE is available, - * otherwise return 1. - * - * Can be called from interrupt level. - */ -int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only) -{ - unsigned long flags; - struct rvt_rq *rq; - struct rvt_rwq *wq; - struct rvt_srq *srq; - struct rvt_rwqe *wqe; - void (*handler)(struct ib_event *, void *); - u32 tail; - int ret; - - if (qp->ibqp.srq) { - srq = ibsrq_to_rvtsrq(qp->ibqp.srq); - handler = srq->ibsrq.event_handler; - rq = &srq->rq; - } else { - srq = NULL; - handler = NULL; - rq = &qp->r_rq; - } - - spin_lock_irqsave(&rq->lock, flags); - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { - ret = 0; - goto unlock; - } - - wq = rq->wq; - tail = wq->tail; - /* Validate tail before using it since it is user writable. */ - if (tail >= rq->size) - tail = 0; - if (unlikely(tail == wq->head)) { - ret = 0; - goto unlock; - } - /* Make sure entry is read after head index is read. */ - smp_rmb(); - wqe = rvt_get_rwqe_ptr(rq, tail); - /* - * Even though we update the tail index in memory, the verbs - * consumer is not supposed to post more entries until a - * completion is generated. - */ - if (++tail >= rq->size) - tail = 0; - wq->tail = tail; - if (!wr_id_only && !init_sge(qp, wqe)) { - ret = -1; - goto unlock; - } - qp->r_wr_id = wqe->wr_id; - - ret = 1; - set_bit(RVT_R_WRID_VALID, &qp->r_aflags); - if (handler) { - u32 n; - - /* - * Validate head pointer value and compute - * the number of remaining WQEs. 
- */ - n = wq->head; - if (n >= rq->size) - n = 0; - if (n < tail) - n += rq->size - tail; - else - n -= tail; - if (n < srq->limit) { - struct ib_event ev; - - srq->limit = 0; - spin_unlock_irqrestore(&rq->lock, flags); - ev.device = qp->ibqp.device; - ev.element.srq = qp->ibqp.srq; - ev.event = IB_EVENT_SRQ_LIMIT_REACHED; - handler(&ev, srq->ibsrq.srq_context); - goto bail; - } - } -unlock: - spin_unlock_irqrestore(&rq->lock, flags); -bail: - return ret; -} - static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) { return (gid->global.interface_id == id && @@ -423,7 +273,7 @@ again: /* FALLTHROUGH */ case IB_WR_SEND: send: - ret = hfi1_rvt_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto op_err; if (!ret) @@ -435,7 +285,7 @@ send: goto inv_err; wc.wc_flags = IB_WC_WITH_IMM; wc.ex.imm_data = wqe->wr.ex.imm_data; - ret = hfi1_rvt_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto op_err; if (!ret) diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 9d7a3110c14c..b7b671017e59 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -397,7 +397,7 @@ send_first: if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { qp->r_sge = qp->s_rdma_read_sge; } else { - ret = hfi1_rvt_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto op_err; if (!ret) @@ -542,7 +542,7 @@ rdma_last_imm: if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { rvt_put_ss(&qp->s_rdma_read_sge); } else { - ret = hfi1_rvt_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto op_err; if (!ret) diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 69c17a5ef038..6ad203f6da88 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -163,7 +163,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) } else { int ret; - ret = hfi1_rvt_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) { rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); goto bail_unlock; @@ -974,7 +974,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) } else { int ret; - ret = hfi1_rvt_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) { rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); return; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 081ca52e6621..a16fe5d3f7c4 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -328,8 +328,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet); int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey); -int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only); - void hfi1_migrate_qp(struct rvt_qp *qp); int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index c9955d48c50f..f35fdeb14347 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -1828,7 +1828,7 @@ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr, /* OK, process the packet. 
*/ switch (opcode) { case OP(SEND_FIRST): - ret = qib_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto nack_op_err; if (!ret) @@ -1849,7 +1849,7 @@ send_middle: case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): /* consume RWQE */ - ret = qib_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto nack_op_err; if (!ret) @@ -1858,7 +1858,7 @@ send_middle: case OP(SEND_ONLY): case OP(SEND_ONLY_WITH_IMMEDIATE): - ret = qib_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto nack_op_err; if (!ret) @@ -1949,7 +1949,7 @@ send_last: goto send_middle; else if (opcode == OP(RDMA_WRITE_ONLY)) goto no_immediate_data; - ret = qib_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto nack_op_err; if (!ret) { diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index 4662cc7bde92..f8a7de795beb 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -37,156 +37,6 @@ #include "qib.h" #include "qib_mad.h" -/* - * Validate a RWQE and fill in the SGE state. - * Return 1 if OK. - */ -static int qib_init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) -{ - int i, j, ret; - struct ib_wc wc; - struct rvt_lkey_table *rkt; - struct rvt_pd *pd; - struct rvt_sge_state *ss; - - rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table; - pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); - ss = &qp->r_sge; - ss->sg_list = qp->r_sg_list; - qp->r_len = 0; - for (i = j = 0; i < wqe->num_sge; i++) { - if (wqe->sg_list[i].length == 0) - continue; - /* Check LKEY */ - ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - NULL, &wqe->sg_list[i], - IB_ACCESS_LOCAL_WRITE); - if (unlikely(ret <= 0)) - goto bad_lkey; - qp->r_len += wqe->sg_list[i].length; - j++; - } - ss->num_sge = j; - ss->total_len = qp->r_len; - ret = 1; - goto bail; - -bad_lkey: - while (j) { - struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; - - rvt_put_mr(sge->mr); - } - ss->num_sge = 0; - memset(&wc, 0, sizeof(wc)); - wc.wr_id = wqe->wr_id; - wc.status = IB_WC_LOC_PROT_ERR; - wc.opcode = IB_WC_RECV; - wc.qp = &qp->ibqp; - /* Signal solicited completion event. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); - ret = 0; -bail: - return ret; -} - -/** - * qib_get_rwqe - copy the next RWQE into the QP's RWQE - * @qp: the QP - * @wr_id_only: update qp->r_wr_id only, not qp->r_sge - * - * Return -1 if there is a local error, 0 if no RWQE is available, - * otherwise return 1. - * - * Can be called from interrupt level. - */ -int qib_get_rwqe(struct rvt_qp *qp, int wr_id_only) -{ - unsigned long flags; - struct rvt_rq *rq; - struct rvt_rwq *wq; - struct rvt_srq *srq; - struct rvt_rwqe *wqe; - void (*handler)(struct ib_event *, void *); - u32 tail; - int ret; - - if (qp->ibqp.srq) { - srq = ibsrq_to_rvtsrq(qp->ibqp.srq); - handler = srq->ibsrq.event_handler; - rq = &srq->rq; - } else { - srq = NULL; - handler = NULL; - rq = &qp->r_rq; - } - - spin_lock_irqsave(&rq->lock, flags); - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { - ret = 0; - goto unlock; - } - - wq = rq->wq; - tail = wq->tail; - /* Validate tail before using it since it is user writable. */ - if (tail >= rq->size) - tail = 0; - if (unlikely(tail == wq->head)) { - ret = 0; - goto unlock; - } - /* Make sure entry is read after head index is read. 
*/ - smp_rmb(); - wqe = rvt_get_rwqe_ptr(rq, tail); - /* - * Even though we update the tail index in memory, the verbs - * consumer is not supposed to post more entries until a - * completion is generated. - */ - if (++tail >= rq->size) - tail = 0; - wq->tail = tail; - if (!wr_id_only && !qib_init_sge(qp, wqe)) { - ret = -1; - goto unlock; - } - qp->r_wr_id = wqe->wr_id; - - ret = 1; - set_bit(RVT_R_WRID_VALID, &qp->r_aflags); - if (handler) { - u32 n; - - /* - * Validate head pointer value and compute - * the number of remaining WQEs. - */ - n = wq->head; - if (n >= rq->size) - n = 0; - if (n < tail) - n += rq->size - tail; - else - n -= tail; - if (n < srq->limit) { - struct ib_event ev; - - srq->limit = 0; - spin_unlock_irqrestore(&rq->lock, flags); - ev.device = qp->ibqp.device; - ev.element.srq = qp->ibqp.srq; - ev.event = IB_EVENT_SRQ_LIMIT_REACHED; - handler(&ev, srq->ibsrq.srq_context); - goto bail; - } - } -unlock: - spin_unlock_irqrestore(&rq->lock, flags); -bail: - return ret; -} - /* * Switch to alternate path. * The QP s_lock should be held and interrupts disabled. @@ -419,7 +269,7 @@ again: wc.ex.imm_data = wqe->wr.ex.imm_data; /* FALLTHROUGH */ case IB_WR_SEND: - ret = qib_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto op_err; if (!ret) @@ -431,7 +281,7 @@ again: goto inv_err; wc.wc_flags = IB_WC_WITH_IMM; wc.ex.imm_data = wqe->wr.ex.imm_data; - ret = qib_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto op_err; if (!ret) diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 840eec6ebc33..3e54bc11e0ae 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -335,7 +335,7 @@ send_first: if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) qp->r_sge = qp->s_rdma_read_sge; else { - ret = qib_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) goto op_err; if (!ret) @@ -471,7 +471,7 @@ rdma_last_imm: if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) rvt_put_ss(&qp->s_rdma_read_sge); else { - ret = qib_get_rwqe(qp, 1); + ret = rvt_get_rwqe(qp, true); if (ret < 0) goto op_err; if (!ret) diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index 3e4ff77260c2..f8d029a2390f 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -139,7 +139,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) else { int ret; - ret = qib_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) { rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); goto bail_unlock; @@ -534,7 +534,7 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, else { int ret; - ret = qib_get_rwqe(qp, 0); + ret = rvt_get_rwqe(qp, false); if (ret < 0) { rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR); return; diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index f887737ac142..f9a46768a19a 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -321,8 +321,6 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, void mr_rcu_callback(struct rcu_head *list); -int qib_get_rwqe(struct rvt_qp *qp, int wr_id_only); - void qib_migrate_qp(struct rvt_qp *qp); int qib_ruc_check_hdr(struct qib_ibport *ibp, struct ib_header *hdr, diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index c82e6bb3d77c..6e9a351f45fb 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ 
-1987,6 +1987,155 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, return 0; } +/* + * Validate a RWQE and fill in the SGE state. + * Return 1 if OK. + */ +static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) +{ + int i, j, ret; + struct ib_wc wc; + struct rvt_lkey_table *rkt; + struct rvt_pd *pd; + struct rvt_sge_state *ss; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + rkt = &rdi->lkey_table; + pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); + ss = &qp->r_sge; + ss->sg_list = qp->r_sg_list; + qp->r_len = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, + NULL, &wqe->sg_list[i], + IB_ACCESS_LOCAL_WRITE); + if (unlikely(ret <= 0)) + goto bad_lkey; + qp->r_len += wqe->sg_list[i].length; + j++; + } + ss->num_sge = j; + ss->total_len = qp->r_len; + return 1; + +bad_lkey: + while (j) { + struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; + + rvt_put_mr(sge->mr); + } + ss->num_sge = 0; + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + /* Signal solicited completion event. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); + return 0; +} + +/** + * rvt_get_rwqe - copy the next RWQE into the QP's RWQE + * @qp: the QP + * @wr_id_only: update qp->r_wr_id only, not qp->r_sge + * + * Return -1 if there is a local error, 0 if no RWQE is available, + * otherwise return 1. + * + * Can be called from interrupt level. + */ +int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) +{ + unsigned long flags; + struct rvt_rq *rq; + struct rvt_rwq *wq; + struct rvt_srq *srq; + struct rvt_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; + int ret; + + if (qp->ibqp.srq) { + srq = ibsrq_to_rvtsrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + spin_lock_irqsave(&rq->lock, flags); + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + ret = 0; + goto unlock; + } + /* Make sure entry is read after head index is read. */ + smp_rmb(); + wqe = rvt_get_rwqe_ptr(rq, tail); + /* + * Even though we update the tail index in memory, the verbs + * consumer is not supposed to post more entries until a + * completion is generated. + */ + if (++tail >= rq->size) + tail = 0; + wq->tail = tail; + if (!wr_id_only && !init_sge(qp, wqe)) { + ret = -1; + goto unlock; + } + qp->r_wr_id = wqe->wr_id; + + ret = 1; + set_bit(RVT_R_WRID_VALID, &qp->r_aflags); + if (handler) { + u32 n; + + /* + * Validate head pointer value and compute + * the number of remaining WQEs. 
+ */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } + } +unlock: + spin_unlock_irqrestore(&rq->lock, flags); +bail: + return ret; +} +EXPORT_SYMBOL(rvt_get_rwqe); + /** * qp_comm_est - handle trap with QP established * @qp: the QP diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 89ab88c342b6..1145a4c154b2 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -663,6 +663,7 @@ static inline unsigned long rvt_timeout_to_jiffies(u8 timeout) extern const int ib_rvt_state_ops[]; struct rvt_dev_info; +int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only); void rvt_comm_est(struct rvt_qp *qp); int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err); void rvt_rc_error(struct rvt_qp *qp, enum ib_wc_status err); -- cgit v1.2.3 From e02637e97d68a54b1527bc654bf8377eda310226 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 9 May 2018 22:48:41 +0100 Subject: IB: Fix RDMA_RXE and INFINIBAND_RDMAVT dependencies for DMA_VIRT_OPS DMA_VIRT_OPS requires that dma_addr_t is at least as wide as a pointer, which is expressed as a dependency on !64BIT || ARCH_DMA_ADDR_T_64BIT. For parisc64 this is not true, and if these IB modules are enabled, kconfig warns: WARNING: unmet direct dependencies detected for DMA_VIRT_OPS Depends on [n]: HAS_DMA [=y] && (!64BIT [=y] || ARCH_DMA_ADDR_T_64BIT) Selected by [m]: - INFINIBAND_RDMAVT [=m] && INFINIBAND [=m] && 64BIT [=y] && PCI [=y] - RDMA_RXE [=m] && INET [=y] && PCI [=y] && INFINIBAND [=m] Add dependencies to fix this. Signed-off-by: Ben Hutchings Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/Kconfig | 2 +- drivers/infiniband/sw/rxe/Kconfig | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig index 2b5513da7e83..98e798007f75 100644 --- a/drivers/infiniband/sw/rdmavt/Kconfig +++ b/drivers/infiniband/sw/rdmavt/Kconfig @@ -1,6 +1,6 @@ config INFINIBAND_RDMAVT tristate "RDMA verbs transport library" - depends on 64BIT + depends on 64BIT && ARCH_DMA_ADDR_T_64BIT depends on PCI select DMA_VIRT_OPS ---help--- diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig index bad4a576d7cf..67ae960ab523 100644 --- a/drivers/infiniband/sw/rxe/Kconfig +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -1,6 +1,7 @@ config RDMA_RXE tristate "Software RDMA over Ethernet (RoCE) driver" depends on INET && PCI && INFINIBAND + depends on !64BIT || ARCH_DMA_ADDR_T_64BIT select NET_UDP_TUNNEL select CRYPTO_CRC32 select DMA_VIRT_OPS -- cgit v1.2.3 From 009669cfad50beb10b3f63434863de231c6b408e Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Tue, 15 May 2018 14:19:19 -0400 Subject: RDMA/hfi1: Fix build error with debugfs disabled A recent patch set to rework the usage of debugfs and to add fault injection capabilities via debugfs files to the hfi1 driver introduced a build error that only shows up when debugfs is fully disabled. The patchset mistakenly defines some empty stub functions in two different headers when debugfs is disabled. Remove the set that shouldn't have been there to resolve the issue. 
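For illustration, the failure mode is the usual redefinition error that appears
when identical static inline stubs end up in two headers pulled into the same
translation unit. A minimal sketch (hypothetical header and function names, not
the actual hfi1 files):

  /* stub_a.h, debugfs-disabled stub */
  static inline bool dbg_should_fault(void) { return false; }

  /* stub_b.h, the same stub mistakenly duplicated */
  static inline bool dbg_should_fault(void) { return false; }

  /* any .c file that includes both headers fails to build with
   * "redefinition of 'dbg_should_fault'" */

Keeping a single copy of each stub avoids the duplicate definitions.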
Fixes: a74d5307caba ("IB/hfi1: Rework fault injection machinery") Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/debugfs.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h index 1c91461b108f..d5d824459fcc 100644 --- a/drivers/infiniband/hw/hfi1/debugfs.h +++ b/drivers/infiniband/hw/hfi1/debugfs.h @@ -118,21 +118,6 @@ static inline void hfi1_dbg_init(void) static inline void hfi1_dbg_exit(void) { } - -static inline bool hfi1_dbg_should_fault_rx(struct hfi1_packet *packet) -{ - return false; -} - -static inline bool hfi1_dbg_should_fault_tx(struct rvt_qp *qp, u32 opcode) -{ - return false; -} - -static inline bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd) -{ - return false; -} #endif #endif /* _HFI1_DEBUGFS_H */ -- cgit v1.2.3 From 1ea62e816407987fc27a1bb2d011ea6d81338933 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Mon, 14 May 2018 11:12:26 -0700 Subject: iw_cxgb4: fix uninitialized variable warnings Fixes: 056f9c7f39bf ("iw_cxgb4: dump detailed driver-specific QP information") Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/restrack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c index a677940b164a..8d1106befc5c 100644 --- a/drivers/infiniband/hw/cxgb4/restrack.c +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -174,10 +174,10 @@ static int fill_res_qp_entry(struct sk_buff *msg, struct t4_swsqe *fsp = NULL, *lsp = NULL; struct t4_swrqe *frp = NULL, *lrp = NULL; struct c4iw_qp *qhp = to_c4iw_qp(ibqp); + u16 first_sq_idx = 0, last_sq_idx = 0; + u16 first_rq_idx = 0, last_rq_idx = 0; struct t4_swsqe first_sqe, last_sqe; struct t4_swrqe first_rqe, last_rqe; - u16 first_sq_idx, last_sq_idx; - u16 first_rq_idx, last_rq_idx; struct nlattr *table_attr; struct t4_wq wq; -- cgit v1.2.3 From e6125a254d7d2af806752c9b776d1ff4b565ce2e Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 15 May 2018 11:19:30 -0700 Subject: RDMA/NLDEV: remove mr iova attribute Remove mr iova attribute because we don't want to pass up kernel pointers. Fixes: fccec5b89ac6 ("RDMA/nldev: provide detailed MR information") Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 8674ca2d8f91..340c7bea45ab 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -535,9 +535,6 @@ static int fill_res_mr_entry(struct sk_buff *msg, struct netlink_callback *cb, goto err; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LKEY, mr->lkey)) goto err; - if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_IOVA, - mr->iova, RDMA_NLDEV_ATTR_PAD)) - goto err; } if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length, -- cgit v1.2.3 From 2d478b28596f4fa6efd10a696f05e354be05de45 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Tue, 15 May 2018 11:19:21 -0700 Subject: iw_cxgb4: remove wr_id attributes Remove sq/rq wr_id attributes because typically they are pointers and we don't want to pass up kernel pointers. 
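For background (an illustrative note, not part of the change itself): values
added with nla_put_*() are copied verbatim to user space through the nldev
netlink interface, and for kernel MRs mr->iova can effectively be a kernel
virtual address, so reporting it would leak address-space layout. Fields that
carry no such information, such as the MR length, continue to be reported:

  /* sketch of a remaining, safe-to-export attribute */
  if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_RES_MRLEN, mr->length,
                        RDMA_NLDEV_ATTR_PAD))
          goto err;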
Fixes: 056f9c7f39bf ("iw_cxgb4: dump detailed driver-specific QP information") Signed-off-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/restrack.c | 55 ---------------------------------- 1 file changed, 55 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c index 8d1106befc5c..b9724d0b32e0 100644 --- a/drivers/infiniband/hw/cxgb4/restrack.c +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -97,8 +97,6 @@ static int fill_swsqe(struct sk_buff *msg, struct t4_sq *sq, u16 idx, goto err; if (rdma_nl_put_driver_u32(msg, "opcode", sqe->opcode)) goto err; - if (rdma_nl_put_driver_u64_hex(msg, "wr_id", sqe->wr_id)) - goto err; if (rdma_nl_put_driver_u32(msg, "complete", sqe->complete)) goto err; if (sqe->complete && @@ -134,50 +132,14 @@ err: return -EMSGSIZE; } -static int fill_swrqe(struct sk_buff *msg, struct t4_rq *rq, u16 idx, - struct t4_swrqe *rqe) -{ - if (rdma_nl_put_driver_u32(msg, "idx", idx)) - goto err; - if (rdma_nl_put_driver_u64_hex(msg, "wr_id", rqe->wr_id)) - goto err; - return 0; -err: - return -EMSGSIZE; -} - -/* - * Dump the first and last pending rqes. - */ -static int fill_swrqes(struct sk_buff *msg, struct t4_rq *rq, - u16 first_idx, struct t4_swrqe *first_rqe, - u16 last_idx, struct t4_swrqe *last_rqe) -{ - if (!first_rqe) - goto out; - if (fill_swrqe(msg, rq, first_idx, first_rqe)) - goto err; - if (!last_rqe) - goto out; - if (fill_swrqe(msg, rq, last_idx, last_rqe)) - goto err; -out: - return 0; -err: - return -EMSGSIZE; -} - static int fill_res_qp_entry(struct sk_buff *msg, struct rdma_restrack_entry *res) { struct ib_qp *ibqp = container_of(res, struct ib_qp, res); struct t4_swsqe *fsp = NULL, *lsp = NULL; - struct t4_swrqe *frp = NULL, *lrp = NULL; struct c4iw_qp *qhp = to_c4iw_qp(ibqp); u16 first_sq_idx = 0, last_sq_idx = 0; - u16 first_rq_idx = 0, last_rq_idx = 0; struct t4_swsqe first_sqe, last_sqe; - struct t4_swrqe first_rqe, last_rqe; struct nlattr *table_attr; struct t4_wq wq; @@ -206,20 +168,6 @@ static int fill_res_qp_entry(struct sk_buff *msg, lsp = &last_sqe; } } - - /* If there are any pending rqes, copy the first and last */ - if (wq.rq.cidx != wq.rq.pidx) { - first_rq_idx = wq.rq.cidx; - first_rqe = qhp->wq.rq.sw_rq[first_rq_idx]; - frp = &first_rqe; - last_rq_idx = wq.rq.pidx; - if (last_rq_idx-- == 0) - last_rq_idx = wq.rq.size - 1; - if (last_rq_idx != first_rq_idx) { - last_rqe = qhp->wq.rq.sw_rq[last_rq_idx]; - lrp = &last_rqe; - } - } spin_unlock_irq(&qhp->lock); if (fill_sq(msg, &wq)) @@ -231,9 +179,6 @@ static int fill_res_qp_entry(struct sk_buff *msg, if (fill_rq(msg, &wq)) goto err_cancel_table; - if (fill_swrqes(msg, &wq.rq, first_rq_idx, frp, last_rq_idx, lrp)) - goto err_cancel_table; - nla_nest_end(msg, table_attr); return 0; -- cgit v1.2.3 From aec05afe641b9c10024c7e4838c0d6cd734f3565 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Thu, 10 May 2018 09:32:43 +0300 Subject: IB/core: Remove redundant return "return" statement at the end of void function is redundant, removing it. 
Signed-off-by: Yuval Shaia Reviewed-by: Zhu Yanjun Reviewed-by: Qing Huang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 9a4e899d94b3..b3cdc099d32a 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -64,8 +64,6 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d } sg_free_table(&umem->sg_head); - return; - } /** -- cgit v1.2.3 From f43c00c04bbf01be0822ef9f0281cc69b56c4e40 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Sat, 12 May 2018 07:50:30 -0500 Subject: i40iw: Extend port reuse support for listeners If two listeners are created with different IP's but same port, the second rdma_listen fails due to a duplicate port entry being added from the CQP add APBVT OP. commit f16dc0aa5ea2 ("i40iw: Add support for port reuse on active side connections") does not account for listener side port reuse. Check for duplicate port before invoking the CQP command to add APBVT entry and delete the entry only if the port is not in use. Additionally, consolidate all port-reuse logic into i40iw_manage_apbvt. Fixes: f16dc0aa5ea2 ("i40iw: Add support for port reuse on active side connections") Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 63 ++++++++++++++-------------------- drivers/infiniband/hw/i40iw/i40iw_cm.h | 4 ++- drivers/infiniband/hw/i40iw/i40iw_hw.c | 34 ++++++++++++++++-- 3 files changed, 59 insertions(+), 42 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 0243ec48e4b5..a24daac719c3 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -1519,18 +1519,13 @@ static void i40iw_add_hte_node(struct i40iw_cm_core *cm_core, /** * i40iw_find_port - find port that matches reference port - * @port: port number + * @hte: ptr to accelerated or non-accelerated list * @accelerated_list: flag for accelerated vs non-accelerated list */ -static bool i40iw_find_port(struct i40iw_cm_core *cm_core, u16 port, - bool accelerated_list) +static bool i40iw_find_port(struct list_head *hte, u16 port) { - struct list_head *hte; struct i40iw_cm_node *cm_node; - hte = accelerated_list ? 
- &cm_core->accelerated_list : &cm_core->non_accelerated_list; - list_for_each_entry(cm_node, hte, list) { if (cm_node->loc_port == port) return true; @@ -1540,35 +1535,32 @@ static bool i40iw_find_port(struct i40iw_cm_core *cm_core, u16 port, /** * i40iw_port_in_use - determine if port is in use + * @cm_core: cm's core * @port: port number - * @active_side: flag for listener side vs active side */ -static bool i40iw_port_in_use(struct i40iw_cm_core *cm_core, u16 port, bool active_side) +bool i40iw_port_in_use(struct i40iw_cm_core *cm_core, u16 port) { struct i40iw_cm_listener *listen_node; unsigned long flags; - bool ret = false; - if (active_side) { - spin_lock_irqsave(&cm_core->ht_lock, flags); - ret = i40iw_find_port(cm_core, port, true); - if (!ret) - ret = i40iw_find_port(cm_core, port, false); - if (!ret) - clear_bit(port, cm_core->active_side_ports); + spin_lock_irqsave(&cm_core->ht_lock, flags); + if (i40iw_find_port(&cm_core->accelerated_list, port) || + i40iw_find_port(&cm_core->non_accelerated_list, port)) { spin_unlock_irqrestore(&cm_core->ht_lock, flags); - } else { - spin_lock_irqsave(&cm_core->listen_list_lock, flags); - list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { - if (listen_node->loc_port == port) { - ret = true; - break; - } + return true; + } + spin_unlock_irqrestore(&cm_core->ht_lock, flags); + + spin_lock_irqsave(&cm_core->listen_list_lock, flags); + list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { + if (listen_node->loc_port == port) { + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); + return true; } - spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); } + spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - return ret; + return false; } /** @@ -1917,7 +1909,7 @@ static int i40iw_dec_refcnt_listen(struct i40iw_cm_core *cm_core, spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); if (listener->iwdev) { - if (apbvt_del && !i40iw_port_in_use(cm_core, listener->loc_port, false)) + if (apbvt_del) i40iw_manage_apbvt(listener->iwdev, listener->loc_port, I40IW_MANAGE_APBVT_DEL); @@ -2298,7 +2290,7 @@ static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node) if (cm_node->listener) { i40iw_dec_refcnt_listen(cm_core, cm_node->listener, 0, true); } else { - if (!i40iw_port_in_use(cm_core, cm_node->loc_port, true) && cm_node->apbvt_set) { + if (cm_node->apbvt_set) { i40iw_manage_apbvt(cm_node->iwdev, cm_node->loc_port, I40IW_MANAGE_APBVT_DEL); @@ -3244,6 +3236,7 @@ void i40iw_setup_cm_core(struct i40iw_device *iwdev) spin_lock_init(&cm_core->ht_lock); spin_lock_init(&cm_core->listen_list_lock); + spin_lock_init(&cm_core->apbvt_lock); cm_core->event_wq = alloc_ordered_workqueue("iwewq", WQ_MEM_RECLAIM); @@ -3811,7 +3804,6 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct sockaddr_in6 *laddr6; struct sockaddr_in6 *raddr6; int ret = 0; - unsigned long flags; ibqp = i40iw_get_qp(cm_id->device, conn_param->qpn); if (!ibqp) @@ -3882,15 +3874,10 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_node->qhash_set = true; } - spin_lock_irqsave(&iwdev->cm_core.ht_lock, flags); - if (!test_and_set_bit(cm_info.loc_port, iwdev->cm_core.active_side_ports)) { - spin_unlock_irqrestore(&iwdev->cm_core.ht_lock, flags); - if (i40iw_manage_apbvt(iwdev, cm_info.loc_port, I40IW_MANAGE_APBVT_ADD)) { - ret = -EINVAL; - goto err; - } - } else { - spin_unlock_irqrestore(&iwdev->cm_core.ht_lock, flags); + if (i40iw_manage_apbvt(iwdev, cm_info.loc_port, + 
I40IW_MANAGE_APBVT_ADD)) { + ret = -EINVAL; + goto err; } cm_node->apbvt_set = true; diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index 78ba36ae2bbe..66dc1ba03389 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -413,8 +413,9 @@ struct i40iw_cm_core { spinlock_t ht_lock; /* manage hash table */ spinlock_t listen_list_lock; /* listen list */ + spinlock_t apbvt_lock; /*manage apbvt entries*/ - unsigned long active_side_ports[BITS_TO_LONGS(MAX_PORTS)]; + unsigned long ports_in_use[BITS_TO_LONGS(MAX_PORTS)]; u64 stats_nodes_created; u64 stats_nodes_destroyed; @@ -457,4 +458,5 @@ void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev, void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr, struct i40iw_cm_info *nfo, bool disconnect_all); +bool i40iw_port_in_use(struct i40iw_cm_core *cm_core, u16 port); #endif /* I40IW_CM_H */ diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c index 6139836fb533..414a36ce16af 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_hw.c +++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c @@ -443,13 +443,37 @@ void i40iw_process_aeq(struct i40iw_device *iwdev) int i40iw_manage_apbvt(struct i40iw_device *iwdev, u16 accel_local_port, bool add_port) { struct i40iw_apbvt_info *info; - enum i40iw_status_code status; struct i40iw_cqp_request *cqp_request; struct cqp_commands_info *cqp_info; + unsigned long flags; + struct i40iw_cm_core *cm_core = &iwdev->cm_core; + enum i40iw_status_code status = 0; + bool in_use; + + /* apbvt_lock is held across CQP delete APBVT OP (non-waiting) to + * protect against race where add APBVT CQP can race ahead of the delete + * APBVT for same port. + */ + spin_lock_irqsave(&cm_core->apbvt_lock, flags); + + if (!add_port) { + in_use = i40iw_port_in_use(cm_core, accel_local_port); + if (in_use) + goto exit; + clear_bit(accel_local_port, cm_core->ports_in_use); + } else { + in_use = test_and_set_bit(accel_local_port, + cm_core->ports_in_use); + spin_unlock_irqrestore(&cm_core->apbvt_lock, flags); + if (in_use) + return 0; + } cqp_request = i40iw_get_cqp_request(&iwdev->cqp, add_port); - if (!cqp_request) - return -ENOMEM; + if (!cqp_request) { + status = -ENOMEM; + goto exit; + } cqp_info = &cqp_request->info; info = &cqp_info->in.u.manage_apbvt_entry.info; @@ -465,6 +489,10 @@ int i40iw_manage_apbvt(struct i40iw_device *iwdev, u16 accel_local_port, bool ad status = i40iw_handle_cqp_op(iwdev, cqp_request); if (status) i40iw_pr_err("CQP-OP Manage APBVT entry fail"); +exit: + if (!add_port) + spin_unlock_irqrestore(&cm_core->apbvt_lock, flags); + return status; } -- cgit v1.2.3 From a5c57d327272bdf3a8b19686eaca2ec683449e67 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 14 May 2018 11:11:07 +0300 Subject: IB/cm: Avoid AV ah_attr overwriting during LAP message handling AH attribute of the cm_id can be overwritten if LAP message is received on CM request which is in progress. This bug got introduced to avoid sleeping when spin lock is held as part of commit in Fixes tag. Therefore validate the cm_id state first and continue to perform AV ah_attr initialization. Given that Aleternative path related messages are not supported for RoCE, init_av_from_response/path is such messages are ok to be called from blocking context. 
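In outline, cm_lap_handler() shifts from initialize-then-check to check-then-initialize. A minimal sketch of the idea (the condition is a placeholder for the existing state checks, not new logic):

        /* Run the existing cm_id/LAP state validation first; only rebuild the
         * shared AV once the LAP is known to be acceptable, so a stray LAP can
         * no longer clobber the ah_attr of a connect request in progress.
         */
        if (!lap_valid_in_current_state)        /* placeholder for the existing checks */
                goto unlock;
        ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
                                      work->mad_recv_wc->recv_buf.grh,
                                      &cm_id_priv->av);
        if (ret)
                goto unlock;
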
Fixes: 33f93e1ebcf5 ("IB/cm: Fix sleeping while spin lock is held") Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index a92e1a5c202b..34b4defc3d1f 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3189,12 +3189,6 @@ static int cm_lap_handler(struct cm_work *work) if (!cm_id_priv) return -EINVAL; - ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); - if (ret) - goto deref; - param = &work->cm_event.param.lap_rcvd; memset(&work->path[0], 0, sizeof(work->path[1])); cm_path_set_rec_type(work->port->cm_dev->ib_device, @@ -3239,10 +3233,16 @@ static int cm_lap_handler(struct cm_work *work) goto unlock; } - cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; - cm_id_priv->tid = lap_msg->hdr.tid; + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto unlock; + cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, cm_id_priv); + cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; + cm_id_priv->tid = lap_msg->hdr.tid; ret = atomic_inc_and_test(&cm_id_priv->work_count); if (!ret) list_add_tail(&work->list, &cm_id_priv->work_list); -- cgit v1.2.3 From 0e225dcb7681c0a8e52fb9dc68bd8ab973de4ca2 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 14 May 2018 11:11:08 +0300 Subject: IB/cm: Store and restore ah_attr during LAP msg processing During CM LAP processing, ah_attr is reinitialized on receiving LAP request. First likely during CM request processing. ah_attr might get zero out if LAP processing fails. Therefore, attempt to create new ah_attr for the LAP message. If the initialization fails, continue with older ah_attr. If the initialization passes, consider the new ah_attr by overwriting the older one. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 34b4defc3d1f..8b640cb88f3a 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -452,6 +452,32 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv, cm_id_priv->private_data_len = private_data_len; } +static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc, + struct ib_grh *grh, struct cm_av *av) +{ + struct rdma_ah_attr new_ah_attr; + int ret; + + av->port = port; + av->pkey_index = wc->pkey_index; + + /* + * av->ah_attr might be initialized based on past wc during incoming + * connect request or while sending out connect request. So initialize + * a new ah_attr on stack. If initialization fails, old ah_attr is + * used for sending any responses. If initialization is successful, + * than new ah_attr is used by overwriting old one. 
+ */ + ret = ib_init_ah_attr_from_wc(port->cm_dev->ib_device, + port->port_num, wc, + grh, &new_ah_attr); + if (ret) + return ret; + + memcpy(&av->ah_attr, &new_ah_attr, sizeof(new_ah_attr)); + return 0; +} + static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, struct ib_grh *grh, struct cm_av *av) { @@ -3233,9 +3259,9 @@ static int cm_lap_handler(struct cm_work *work) goto unlock; } - ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); + ret = cm_init_av_for_lap(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); if (ret) goto unlock; -- cgit v1.2.3 From e822ff213fe60dc539a7e2137a08b04004f8ec25 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 14 May 2018 11:11:09 +0300 Subject: IB/cm: Store and restore ah_attr during CM message processing During CM request processing flow, ah_attr is initialized twice. First based on wc. Secondly based on primary path record. ah_attr initialization from path record can fail, which leads to ah_attr zeroed out. Therefore, always initialize ah_attr on stack during reinitialization phase. If ah_attr init is successful, use the new ah_attry by overwriting the old one. If the ah_attr init fails, continue to use the last ah_attr. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 8b640cb88f3a..7df4c7173607 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -535,6 +535,7 @@ static struct cm_port *get_cm_port_from_path(struct sa_path_rec *path) static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, struct cm_id_private *cm_id_priv) { + struct rdma_ah_attr new_ah_attr; struct cm_device *cm_dev; struct cm_port *port; int ret; @@ -550,15 +551,26 @@ static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, return ret; av->port = port; + + /* + * av->ah_attr might be initialized based on wc or during + * request processing time. So initialize a new ah_attr on stack. + * If initialization fails, old ah_attr is used for sending any + * responses. If initialization is successful, than new ah_attr + * is used by overwriting the old one. + */ ret = ib_init_ah_attr_from_path(cm_dev->ib_device, port->port_num, path, - &av->ah_attr); + &new_ah_attr); if (ret) return ret; av->timeout = path->packet_life_time + 1; ret = add_cm_id_to_port_list(cm_id_priv, av, port); - return ret; + if (ret) + return ret; + memcpy(&av->ah_attr, &new_ah_attr, sizeof(new_ah_attr)); + return 0; } static int cm_alloc_id(struct cm_id_private *cm_id_priv) -- cgit v1.2.3 From d90e5e5038ef8e62c82710e3970317ef4eb367cf Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sun, 13 May 2018 14:33:30 +0300 Subject: IB/uverbs: Introduce a GRE steering match filter Adding a new GRE steering match filter that can match against key and protocol fields. 
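As a rough illustration of how the new spec is meant to be consumed (a hedged sketch based on the structures added below, not code from this patch; the surrounding flow attribute and the other specs are omitted), a caller matching GRE traffic by protocol and key would fill the val/mask pair like so:

        struct ib_flow_spec_gre gre = {
                .type = IB_FLOW_SPEC_GRE,
                .size = sizeof(gre),
                .val = {
                        .protocol = cpu_to_be16(ETH_P_TEB),     /* example: Ethernet payload over GRE */
                        .key      = cpu_to_be32(0x1234),        /* example GRE key */
                },
                .mask = {
                        .protocol = cpu_to_be16(0xffff),        /* match the protocol exactly */
                        .key      = cpu_to_be32(0xffffffff),    /* match the key exactly */
                },
        };

Fields left out of the mask (c_ks_res0_ver here) are simply not matched on.
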
Reviewed-by: Mark Bloch Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 11 +++++++++++ include/rdma/ib_verbs.h | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 21a887c9523b..82c8c8b3e10b 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2948,6 +2948,17 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, memcpy(&ib_spec->esp.val, kern_spec_val, actual_filter_sz); memcpy(&ib_spec->esp.mask, kern_spec_mask, actual_filter_sz); break; + case IB_FLOW_SPEC_GRE: + ib_filter_sz = offsetof(struct ib_flow_gre_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->gre.size = sizeof(struct ib_flow_spec_gre); + memcpy(&ib_spec->gre.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->gre.mask, kern_spec_mask, actual_filter_sz); + break; default: return -EINVAL; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9fc8a825aa28..e3ec61fe6060 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1852,6 +1852,7 @@ enum ib_flow_spec_type { IB_FLOW_SPEC_TCP = 0x40, IB_FLOW_SPEC_UDP = 0x41, IB_FLOW_SPEC_VXLAN_TUNNEL = 0x50, + IB_FLOW_SPEC_GRE = 0x51, IB_FLOW_SPEC_INNER = 0x100, /* Actions */ IB_FLOW_SPEC_ACTION_TAG = 0x1000, @@ -1994,6 +1995,21 @@ struct ib_flow_spec_esp { struct ib_flow_esp_filter mask; }; +struct ib_flow_gre_filter { + __be16 c_ks_res0_ver; + __be16 protocol; + __be32 key; + /* Must be last */ + u8 real_sz[0]; +}; + +struct ib_flow_spec_gre { + u32 type; + u16 size; + struct ib_flow_gre_filter val; + struct ib_flow_gre_filter mask; +}; + struct ib_flow_spec_action_tag { enum ib_flow_spec_type type; u16 size; @@ -2023,6 +2039,7 @@ union ib_flow_spec { struct ib_flow_spec_ipv6 ipv6; struct ib_flow_spec_tunnel tunnel; struct ib_flow_spec_esp esp; + struct ib_flow_spec_gre gre; struct ib_flow_spec_action_tag flow_tag; struct ib_flow_spec_action_drop drop; struct ib_flow_spec_action_handle action; -- cgit v1.2.3 From b04f0f036ac17b879e4d8c83f6503a19322ddde4 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sun, 13 May 2018 14:33:32 +0300 Subject: IB/uverbs: Introduce a MPLS steering match filter Add a new MPLS steering match filter that can match against a single MPLS tag field. Since the MPLS header can reside in different locations in the packet's protocol stack as well as be encapsulated with a tunnel protocol, it is required to know the exact location of the header in the protocol stack. Therefore, when including the MPLS protocol spec in the specs list, it is mandatory to provide the list in an ordered manner, so that it represents the actual header order in a matching packet. Drivers that process the spec list and apply the matching rule should treat the position of the MPLS spec in the spec list as the actual location of the MPLS label in the packet's protocol stack. 
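To make the ordering requirement concrete, a hedged sketch (not part of the patch) of a single MPLS spec and the spec sequences that give it its meaning:

        struct ib_flow_spec_mpls mpls = {
                .type = IB_FLOW_SPEC_MPLS,
                .size = sizeof(mpls),
                .val.tag  = cpu_to_be32(100 << 12),     /* example: label 100 (label = top 20 bits) */
                .mask.tag = cpu_to_be32(0xfffff << 12), /* match the label bits only */
        };

        /* Placement of the MPLS spec in the ordered spec list decides what it matches:
         *   ETH, IPV4, UDP, MPLS  -> MPLS over UDP
         *   ETH, IPV4, GRE, MPLS  -> MPLS over GRE
         *   ETH, MPLS             -> non-encapsulated MPLS
         */
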
Reviewed-by: Mark Bloch Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 12 ++++++++++++ include/rdma/ib_verbs.h | 15 +++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 82c8c8b3e10b..e74262ee104c 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2959,6 +2959,17 @@ int ib_uverbs_kern_spec_to_ib_spec_filter(enum ib_flow_spec_type type, memcpy(&ib_spec->gre.val, kern_spec_val, actual_filter_sz); memcpy(&ib_spec->gre.mask, kern_spec_mask, actual_filter_sz); break; + case IB_FLOW_SPEC_MPLS: + ib_filter_sz = offsetof(struct ib_flow_mpls_filter, real_sz); + actual_filter_sz = spec_filter_size(kern_spec_mask, + kern_filter_sz, + ib_filter_sz); + if (actual_filter_sz <= 0) + return -EINVAL; + ib_spec->mpls.size = sizeof(struct ib_flow_spec_mpls); + memcpy(&ib_spec->mpls.val, kern_spec_val, actual_filter_sz); + memcpy(&ib_spec->mpls.mask, kern_spec_mask, actual_filter_sz); + break; default: return -EINVAL; } @@ -3518,6 +3529,7 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, uflow_res); if (err) goto err_free; + flow_attr->size += ((union ib_flow_spec *) ib_spec)->size; cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e3ec61fe6060..e849bd0fc618 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1853,6 +1853,7 @@ enum ib_flow_spec_type { IB_FLOW_SPEC_UDP = 0x41, IB_FLOW_SPEC_VXLAN_TUNNEL = 0x50, IB_FLOW_SPEC_GRE = 0x51, + IB_FLOW_SPEC_MPLS = 0x60, IB_FLOW_SPEC_INNER = 0x100, /* Actions */ IB_FLOW_SPEC_ACTION_TAG = 0x1000, @@ -2010,6 +2011,19 @@ struct ib_flow_spec_gre { struct ib_flow_gre_filter mask; }; +struct ib_flow_mpls_filter { + __be32 tag; + /* Must be last */ + u8 real_sz[0]; +}; + +struct ib_flow_spec_mpls { + u32 type; + u16 size; + struct ib_flow_mpls_filter val; + struct ib_flow_mpls_filter mask; +}; + struct ib_flow_spec_action_tag { enum ib_flow_spec_type type; u16 size; @@ -2040,6 +2054,7 @@ union ib_flow_spec { struct ib_flow_spec_tunnel tunnel; struct ib_flow_spec_esp esp; struct ib_flow_spec_gre gre; + struct ib_flow_spec_mpls mpls; struct ib_flow_spec_action_tag flow_tag; struct ib_flow_spec_action_drop drop; struct ib_flow_spec_action_handle action; -- cgit v1.2.3 From da2f22ae7707b6ed254983aa3b23e013e07cd532 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sun, 13 May 2018 14:33:33 +0300 Subject: IB/mlx5: Add support for GRE flow specification This patch introduces support for the GRE flow spec and allowing the creation of rules based on the protocol and key fields that are part of GRE protocol header. 
Reviewed-by: Mark Bloch Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index a42c6b1cdb5a..81f696b21356 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2448,6 +2448,7 @@ static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) #define LAST_TUNNEL_FIELD tunnel_id #define LAST_FLOW_TAG_FIELD tag_id #define LAST_DROP_FIELD size +#define LAST_DROP_FIELD size /* Field is the last supported field */ #define FIELDS_NOT_SUPPORTED(filter, field)\ @@ -2689,6 +2690,29 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, ntohs(ib_spec->tcp_udp.val.dst_port)); break; + case IB_FLOW_SPEC_GRE: + if (ib_spec->gre.mask.c_ks_res0_ver) + return -EOPNOTSUPP; + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, + 0xff); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol, + IPPROTO_GRE); + + MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol, + 0xffff); + MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol, + ntohs(ib_spec->gre.val.protocol)); + + memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c, + gre_key_h), + &ib_spec->gre.mask.key, + sizeof(ib_spec->gre.mask.key)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v, + gre_key_h), + &ib_spec->gre.val.key, + sizeof(ib_spec->gre.val.key)); + break; case IB_FLOW_SPEC_VXLAN_TUNNEL: if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask, LAST_TUNNEL_FIELD)) -- cgit v1.2.3 From 71c6e8638ce3baf9ec16d64d263aab74beac912d Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sun, 13 May 2018 14:33:34 +0300 Subject: IB/mlx5: Add support for MPLS flow specification This patch introduces support for the MPLS flow spec and allows the creation of rules that are matching on the MPLS label. Applying the rule matching depends on the flow specs order and the location of the MPLS in the spec list as there are different configurations to be made in the device in the cases of MPLSoGRE and MPLSoUDP vs. non-encapsulated MPLS. 
Reviewed-by: Mark Bloch Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 102 +++++++++++++++++++++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 14 ++- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 2 +- include/linux/mlx5/device.h | 7 ++ include/linux/mlx5/mlx5_ifc.h | 45 ++++++++-- 5 files changed, 159 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 81f696b21356..8792248034cb 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2386,7 +2386,8 @@ static int mlx5_ib_dealloc_pd(struct ib_pd *pd) enum { MATCH_CRITERIA_ENABLE_OUTER_BIT, MATCH_CRITERIA_ENABLE_MISC_BIT, - MATCH_CRITERIA_ENABLE_INNER_BIT + MATCH_CRITERIA_ENABLE_INNER_BIT, + MATCH_CRITERIA_ENABLE_MISC2_BIT }; #define HEADER_IS_ZERO(match_criteria, headers) \ @@ -2406,6 +2407,9 @@ static u8 get_match_criteria_enable(u32 *match_criteria) match_criteria_enable |= (!HEADER_IS_ZERO(match_criteria, inner_headers)) << MATCH_CRITERIA_ENABLE_INNER_BIT; + match_criteria_enable |= + (!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) << + MATCH_CRITERIA_ENABLE_MISC2_BIT; return match_criteria_enable; } @@ -2440,6 +2444,27 @@ static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2); } +static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask) +{ + if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL)) + return -EOPNOTSUPP; + + if (MLX5_GET(fte_match_mpls, set_mask, mpls_exp) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP)) + return -EOPNOTSUPP; + + if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS)) + return -EOPNOTSUPP; + + if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) && + !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL)) + return -EOPNOTSUPP; + + return 0; +} + #define LAST_ETH_FIELD vlan_tag #define LAST_IB_FIELD sl #define LAST_IPV4_FIELD tos @@ -2480,12 +2505,16 @@ static int parse_flow_flow_action(const union ib_flow_spec *ib_spec, static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, u32 *match_v, const union ib_flow_spec *ib_spec, const struct ib_flow_attr *flow_attr, - struct mlx5_flow_act *action) + struct mlx5_flow_act *action, u32 prev_type) { void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v, misc_parameters); + void *misc_params2_c = MLX5_ADDR_OF(fte_match_param, match_c, + misc_parameters_2); + void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v, + misc_parameters_2); void *headers_c; void *headers_v; int match_ipv; @@ -2713,6 +2742,70 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, &ib_spec->gre.val.key, sizeof(ib_spec->gre.val.key)); break; + case IB_FLOW_SPEC_MPLS: + switch (prev_type) { + case IB_FLOW_SPEC_UDP: + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_first_mpls_over_udp), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + outer_first_mpls_over_udp), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + outer_first_mpls_over_udp), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + 
break; + case IB_FLOW_SPEC_GRE: + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_first_mpls_over_gre), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + outer_first_mpls_over_gre), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + outer_first_mpls_over_gre), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + break; + default: + if (ib_spec->type & IB_FLOW_SPEC_INNER) { + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.inner_first_mpls), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + inner_first_mpls), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + inner_first_mpls), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + } else { + if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev, + ft_field_support.outer_first_mpls), + &ib_spec->mpls.mask.tag)) + return -EOPNOTSUPP; + + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v, + outer_first_mpls), + &ib_spec->mpls.val.tag, + sizeof(ib_spec->mpls.val.tag)); + memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c, + outer_first_mpls), + &ib_spec->mpls.mask.tag, + sizeof(ib_spec->mpls.mask.tag)); + } + } + break; case IB_FLOW_SPEC_VXLAN_TUNNEL: if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask, LAST_TUNNEL_FIELD)) @@ -3044,6 +3137,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, struct mlx5_flow_destination *rule_dst = dst; const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); unsigned int spec_index; + u32 prev_type = 0; int err = 0; int dest_num = 1; bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; @@ -3063,10 +3157,12 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { err = parse_flow_attr(dev->mdev, spec->match_criteria, spec->match_value, - ib_flow, flow_attr, &flow_act); + ib_flow, flow_attr, &flow_act, + prev_type); if (err < 0) goto free; + prev_type = ((union ib_flow_spec *)ib_flow)->type; ib_flow += ((union ib_flow_spec *)ib_flow)->size; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index de51e7c39bc8..556202b9256a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -324,7 +324,8 @@ static bool check_valid_mask(u8 match_criteria_enable, const u32 *match_criteria if (match_criteria_enable & ~( (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS) | (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS) | - (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS))) + (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS) | + (1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2))) return false; if (!(match_criteria_enable & @@ -360,6 +361,17 @@ static bool check_valid_mask(u8 match_criteria_enable, const u32 *match_criteria return false; } + if (!(match_criteria_enable & + 1 << MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2)) { + char *fg_type_mask = MLX5_ADDR_OF(fte_match_param, + match_criteria, misc_parameters_2); + + if (fg_type_mask[0] || + memcmp(fg_type_mask, 
fg_type_mask + 1, + MLX5_ST_SZ_BYTES(fte_match_set_misc2) - 1)) + return false; + } + return check_last_reserved(match_criteria); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index e26d3e9d5f9f..b6da322a8016 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -159,7 +159,7 @@ struct mlx5_ft_underlay_qp { u32 qpn; }; -#define MLX5_FTE_MATCH_PARAM_RESERVED reserved_at_600 +#define MLX5_FTE_MATCH_PARAM_RESERVED reserved_at_800 /* Calculate the fte_match_param length and without the reserved length. * Make sure the reserved field is the last. */ diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 2bc27f8c5b87..fd1a9341edfa 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -994,6 +994,13 @@ enum mlx5_wol_mode { MLX5_WOL_PHY_ACTIVITY = 1 << 7, }; +enum mlx5_mpls_supported_fields { + MLX5_FIELD_SUPPORT_MPLS_LABEL = 1 << 0, + MLX5_FIELD_SUPPORT_MPLS_EXP = 1 << 1, + MLX5_FIELD_SUPPORT_MPLS_S_BOS = 1 << 2, + MLX5_FIELD_SUPPORT_MPLS_TTL = 1 << 3 +}; + /* MLX5 DEV CAPs */ /* TODO: EAT.ME */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 1aad455538f4..3fee2f74d09d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -298,9 +298,15 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 inner_tcp_dport[0x1]; u8 inner_tcp_flags[0x1]; u8 reserved_at_37[0x9]; - u8 reserved_at_40[0x17]; + + u8 reserved_at_40[0x5]; + u8 outer_first_mpls_over_udp[0x4]; + u8 outer_first_mpls_over_gre[0x4]; + u8 inner_first_mpls[0x4]; + u8 outer_first_mpls[0x4]; + u8 reserved_at_55[0x2]; u8 outer_esp_spi[0x1]; - u8 reserved_at_58[0x2]; + u8 reserved_at_58[0x2]; u8 bth_dst_qp[0x1]; u8 reserved_at_5b[0x25]; @@ -450,6 +456,29 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 reserved_at_1a0[0x60]; }; +struct mlx5_ifc_fte_match_mpls_bits { + u8 mpls_label[0x14]; + u8 mpls_exp[0x3]; + u8 mpls_s_bos[0x1]; + u8 mpls_ttl[0x8]; +}; + +struct mlx5_ifc_fte_match_set_misc2_bits { + struct mlx5_ifc_fte_match_mpls_bits outer_first_mpls; + + struct mlx5_ifc_fte_match_mpls_bits inner_first_mpls; + + struct mlx5_ifc_fte_match_mpls_bits outer_first_mpls_over_gre; + + struct mlx5_ifc_fte_match_mpls_bits outer_first_mpls_over_udp; + + u8 reserved_at_80[0x100]; + + u8 metadata_reg_a[0x20]; + + u8 reserved_at_1a0[0x60]; +}; + struct mlx5_ifc_cmd_pas_bits { u8 pa_h[0x20]; @@ -1170,7 +1199,9 @@ struct mlx5_ifc_fte_match_param_bits { struct mlx5_ifc_fte_match_set_lyr_2_4_bits inner_headers; - u8 reserved_at_600[0xa00]; + struct mlx5_ifc_fte_match_set_misc2_bits misc_parameters_2; + + u8 reserved_at_800[0x800]; }; enum { @@ -4579,6 +4610,7 @@ enum { MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_OUTER_HEADERS = 0x0, MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, MLX5_QUERY_FLOW_GROUP_OUT_MATCH_CRITERIA_ENABLE_INNER_HEADERS = 0x2, + MLX5_QUERY_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2 = 0X3, }; struct mlx5_ifc_query_flow_group_out_bits { @@ -6969,9 +7001,10 @@ struct mlx5_ifc_create_flow_group_out_bits { }; enum { - MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS = 0x0, - MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, - MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS = 0x2, + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_OUTER_HEADERS = 0x0, + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS = 0x1, 
+ MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_INNER_HEADERS = 0x2, + MLX5_CREATE_FLOW_GROUP_IN_MATCH_CRITERIA_ENABLE_MISC_PARAMETERS_2 = 0x3, }; struct mlx5_ifc_create_flow_group_in_bits { -- cgit v1.2.3 From e818e255a58d64e86c8c93e3aa52498b1a3d1760 Mon Sep 17 00:00:00 2001 From: Ariel Levkovich Date: Sun, 13 May 2018 14:33:35 +0300 Subject: IB/mlx5: Expose MPLS related tunneling offloads This patch reports the device's capbilities to offload encapsulated MPLS tunnel protocols to user-space: - Capability to offload MPLS over GRE. - Capability to offload MPLS over UDP. Reviewed-by: Mark Bloch Signed-off-by: Ariel Levkovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 8 ++++++++ include/linux/mlx5/device.h | 5 +++++ include/linux/mlx5/mlx5_ifc.h | 4 +++- include/uapi/rdma/mlx5-abi.h | 4 +++- 4 files changed, 19 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8792248034cb..ab8cd5c034a2 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1084,6 +1084,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) resp.tunnel_offloads_caps |= MLX5_IB_TUNNELED_OFFLOADS_GRE; + if (MLX5_CAP_GEN(mdev, flex_parser_protocols) & + MLX5_FLEX_PROTO_CW_MPLS_GRE) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE; + if (MLX5_CAP_GEN(mdev, flex_parser_protocols) & + MLX5_FLEX_PROTO_CW_MPLS_UDP) + resp.tunnel_offloads_caps |= + MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP; } if (uhw->outlen) { diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index fd1a9341edfa..5004ddc702e3 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -1001,6 +1001,11 @@ enum mlx5_mpls_supported_fields { MLX5_FIELD_SUPPORT_MPLS_TTL = 1 << 3 }; +enum mlx5_flex_parser_protos { + MLX5_FLEX_PROTO_CW_MPLS_GRE = 1 << 4, + MLX5_FLEX_PROTO_CW_MPLS_UDP = 1 << 5, +}; + /* MLX5 DEV CAPs */ /* TODO: EAT.ME */ diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3fee2f74d09d..68f756ea550d 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1138,7 +1138,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_500[0x20]; u8 num_of_uars_per_page[0x20]; - u8 reserved_at_540[0x40]; + + u8 flex_parser_protocols[0x20]; + u8 reserved_at_560[0x20]; u8 reserved_at_580[0x3d]; u8 cqe_128_always[0x1]; diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index fdaf00e20649..508ea8c82da7 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -233,7 +233,9 @@ enum mlx5_ib_query_dev_resp_flags { enum mlx5_ib_tunnel_offloads { MLX5_IB_TUNNELED_OFFLOADS_VXLAN = 1 << 0, MLX5_IB_TUNNELED_OFFLOADS_GRE = 1 << 1, - MLX5_IB_TUNNELED_OFFLOADS_GENEVE = 1 << 2 + MLX5_IB_TUNNELED_OFFLOADS_GENEVE = 1 << 2, + MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE = 1 << 3, + MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP = 1 << 4, }; struct mlx5_ib_query_device_resp { -- cgit v1.2.3 From 112f5c8199dd02dbee2aa9f21e087e777fb7f15e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 16 May 2018 21:47:32 +0200 Subject: IB/ipoib: replace local_irq_disable() with proper locking In ipoib_mcast_restart_task() the netif_addr_lock() is invoked prior local_irq_save(). netif_addr_lock() should not be invoked in interrupt disabled section, only in BH disabled sections. 
The priv->lock is always acquired with disabled interrupts. The only place where netif_addr_lock() and priv->lock nest ist ipoib_mcast_restart_task(). Drop the local_irq_save() and acquire priv->lock with spin_lock_irq() inside the netif_addr locked section. It's safe to do so because the caller is either a worker function or __ipoib_ib_dev_flush() which are both calling with interrupts enabled (and since BH is enabled here, too so netif_addr_lock_bh() needs to be used). Cc: Doug Ledford Cc: Jason Gunthorpe Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 9b3f47ae2016..6709328d90f8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -886,7 +886,6 @@ void ipoib_mcast_restart_task(struct work_struct *work) struct netdev_hw_addr *ha; struct ipoib_mcast *mcast, *tmcast; LIST_HEAD(remove_list); - unsigned long flags; struct ib_sa_mcmember_rec rec; if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) @@ -898,9 +897,8 @@ void ipoib_mcast_restart_task(struct work_struct *work) ipoib_dbg_mcast(priv, "restarting multicast task\n"); - local_irq_save(flags); - netif_addr_lock(dev); - spin_lock(&priv->lock); + netif_addr_lock_bh(dev); + spin_lock_irq(&priv->lock); /* * Unfortunately, the networking core only gives us a list of all of @@ -978,9 +976,8 @@ void ipoib_mcast_restart_task(struct work_struct *work) } } - spin_unlock(&priv->lock); - netif_addr_unlock(dev); - local_irq_restore(flags); + spin_unlock_irq(&priv->lock); + netif_addr_unlock_bh(dev); ipoib_mcast_remove_list(&remove_list); @@ -988,9 +985,9 @@ void ipoib_mcast_restart_task(struct work_struct *work) * Double check that we are still up */ if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { - spin_lock_irqsave(&priv->lock, flags); + spin_lock_irq(&priv->lock); __ipoib_mcast_schedule_join_thread(priv, NULL, 0); - spin_unlock_irqrestore(&priv->lock, flags); + spin_unlock_irq(&priv->lock); } } -- cgit v1.2.3 From 5e6e78dbd3b93414ca53af5d51c090878d1c9e5d Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Fri, 11 May 2018 16:31:23 +0800 Subject: RDMA/hns: Add 64KB page size support for hip08 This patch adds the support of 64KB page size for hip08 in kernel. 
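The conversion hinges on the device expressing these page-size fields as a shift relative to 4KB, as the new PG_SHIFT_OFFSET macro below suggests; a short worked example of the offset applied throughout the patch:

        /* PG_SHIFT_OFFSET = PAGE_SHIFT - 12, the extra shift over a 4KB page    */
        /*  4KB kernel pages: PAGE_SHIFT = 12 -> offset 0, fields written as-is  */
        /* 16KB kernel pages: PAGE_SHIFT = 14 -> offset 2                        */
        /* 64KB kernel pages: PAGE_SHIFT = 16 -> offset 4, e.g. qpc_ba_pg_sz + 4
         *                    is programmed so the device sees the real size     */
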
Signed-off-by: Yixian Liu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 3 +++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 36 ++++++++++++++++------------- drivers/infiniband/hw/hns/hns_roce_mr.c | 6 ----- 3 files changed, 23 insertions(+), 22 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index fb305b7f99a8..53c2f1b8d068 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -100,6 +100,9 @@ #define SERV_TYPE_UC 2 #define SERV_TYPE_UD 3 +/* Configure to HW for PAGE_SIZE larger than 4KB */ +#define PG_SHIFT_OFFSET (PAGE_SHIFT - 12) + #define PAGES_SHIFT_8 8 #define PAGES_SHIFT_16 16 #define PAGES_SHIFT_24 24 diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 25916e8522ed..e0ab672e1c0a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1024,40 +1024,40 @@ static int hns_roce_v2_set_bt(struct hns_roce_dev *hr_dev) roce_set_field(req->vf_qpc_cfg, CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_M, CFG_BT_ATTR_DATA_0_VF_QPC_BA_PGSZ_S, - hr_dev->caps.qpc_ba_pg_sz); + hr_dev->caps.qpc_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_qpc_cfg, CFG_BT_ATTR_DATA_0_VF_QPC_BUF_PGSZ_M, CFG_BT_ATTR_DATA_0_VF_QPC_BUF_PGSZ_S, - hr_dev->caps.qpc_buf_pg_sz); + hr_dev->caps.qpc_buf_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_qpc_cfg, CFG_BT_ATTR_DATA_0_VF_QPC_HOPNUM_M, CFG_BT_ATTR_DATA_0_VF_QPC_HOPNUM_S, qpc_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : qpc_hop_num); roce_set_field(req->vf_srqc_cfg, CFG_BT_ATTR_DATA_1_VF_SRQC_BA_PGSZ_M, CFG_BT_ATTR_DATA_1_VF_SRQC_BA_PGSZ_S, - hr_dev->caps.srqc_ba_pg_sz); + hr_dev->caps.srqc_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_srqc_cfg, CFG_BT_ATTR_DATA_1_VF_SRQC_BUF_PGSZ_M, CFG_BT_ATTR_DATA_1_VF_SRQC_BUF_PGSZ_S, - hr_dev->caps.srqc_buf_pg_sz); + hr_dev->caps.srqc_buf_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_srqc_cfg, CFG_BT_ATTR_DATA_1_VF_SRQC_HOPNUM_M, CFG_BT_ATTR_DATA_1_VF_SRQC_HOPNUM_S, srqc_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : srqc_hop_num); roce_set_field(req->vf_cqc_cfg, CFG_BT_ATTR_DATA_2_VF_CQC_BA_PGSZ_M, CFG_BT_ATTR_DATA_2_VF_CQC_BA_PGSZ_S, - hr_dev->caps.cqc_ba_pg_sz); + hr_dev->caps.cqc_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_cqc_cfg, CFG_BT_ATTR_DATA_2_VF_CQC_BUF_PGSZ_M, CFG_BT_ATTR_DATA_2_VF_CQC_BUF_PGSZ_S, - hr_dev->caps.cqc_buf_pg_sz); + hr_dev->caps.cqc_buf_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_cqc_cfg, CFG_BT_ATTR_DATA_2_VF_CQC_HOPNUM_M, CFG_BT_ATTR_DATA_2_VF_CQC_HOPNUM_S, cqc_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : cqc_hop_num); roce_set_field(req->vf_mpt_cfg, CFG_BT_ATTR_DATA_3_VF_MPT_BA_PGSZ_M, CFG_BT_ATTR_DATA_3_VF_MPT_BA_PGSZ_S, - hr_dev->caps.mpt_ba_pg_sz); + hr_dev->caps.mpt_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_mpt_cfg, CFG_BT_ATTR_DATA_3_VF_MPT_BUF_PGSZ_M, CFG_BT_ATTR_DATA_3_VF_MPT_BUF_PGSZ_S, - hr_dev->caps.mpt_buf_pg_sz); + hr_dev->caps.mpt_buf_pg_sz + PG_SHIFT_OFFSET); roce_set_field(req->vf_mpt_cfg, CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_M, CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_S, mpt_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 : mpt_hop_num); @@ -1351,7 +1351,8 @@ static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, HNS_ROCE_HOP_NUM_0 ? 
0 : mr->pbl_hop_num); roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_BA_PG_SZ_M, - V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, mr->pbl_ba_pg_sz); + V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, + mr->pbl_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, V2_MPT_BYTE_4_PD_S, mr->pd); mpt_entry->byte_4_pd_hop_st = cpu_to_le32(mpt_entry->byte_4_pd_hop_st); @@ -1425,7 +1426,8 @@ found: roce_set_field(mpt_entry->byte_64_buf_pa1, V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M, - V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, mr->pbl_buf_pg_sz); + V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, + mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET); mpt_entry->byte_64_buf_pa1 = cpu_to_le32(mpt_entry->byte_64_buf_pa1); return 0; @@ -1606,11 +1608,11 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, roce_set_field(cq_context->byte_24_pgsz_addr, V2_CQC_BYTE_24_CQE_BA_PG_SZ_M, V2_CQC_BYTE_24_CQE_BA_PG_SZ_S, - hr_dev->caps.cqe_ba_pg_sz); + hr_dev->caps.cqe_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(cq_context->byte_24_pgsz_addr, V2_CQC_BYTE_24_CQE_BUF_PG_SZ_M, V2_CQC_BYTE_24_CQE_BUF_PG_SZ_S, - hr_dev->caps.cqe_buf_pg_sz); + hr_dev->caps.cqe_buf_pg_sz + PG_SHIFT_OFFSET); cq_context->cqe_ba = (u32)(dma_handle >> 3); @@ -2707,7 +2709,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, - hr_dev->caps.mtt_ba_pg_sz); + hr_dev->caps.mtt_ba_pg_sz + PG_SHIFT_OFFSET); roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, 0); @@ -2715,7 +2717,7 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, - hr_dev->caps.mtt_buf_pg_sz); + hr_dev->caps.mtt_buf_pg_sz + PG_SHIFT_OFFSET); roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M, V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, 0); @@ -4149,12 +4151,14 @@ static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, /* set eqe_ba_pg_sz */ roce_set_field(eqc->byte_8, HNS_ROCE_EQC_BA_PG_SZ_M, - HNS_ROCE_EQC_BA_PG_SZ_S, eq->eqe_ba_pg_sz); + HNS_ROCE_EQC_BA_PG_SZ_S, + eq->eqe_ba_pg_sz + PG_SHIFT_OFFSET); /* set eqe_buf_pg_sz */ roce_set_field(eqc->byte_8, HNS_ROCE_EQC_BUF_PG_SZ_M, - HNS_ROCE_EQC_BUF_PG_SZ_S, eq->eqe_buf_pg_sz); + HNS_ROCE_EQC_BUF_PG_SZ_S, + eq->eqe_buf_pg_sz + PG_SHIFT_OFFSET); /* set eq_producer_idx */ roce_set_field(eqc->byte_8, diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index f7256d88d38f..d1fe0e7957e3 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -1007,12 +1007,6 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } n = ib_umem_page_count(mr->umem); - if (mr->umem->page_shift != HNS_ROCE_HEM_PAGE_SHIFT) { - dev_err(dev, "Just support 4K page size but is 0x%lx now!\n", - BIT(mr->umem->page_shift)); - ret = -EINVAL; - goto err_umem; - } if (!hr_dev->caps.pbl_hop_num) { if (n > HNS_ROCE_MAX_MTPT_PBL_NUM) { -- cgit v1.2.3 From da2f3b286a396928e9ddcb8c9a987597e751aa84 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Thu, 17 May 2018 04:55:22 -0400 Subject: IB/rxe: avoid calling WARN_ON_ONCE twice In the exit branch, WARN_ON_ONCE is called to show stack. So it is not necessary to call WARN_ON_ONCE before going to exit. 
Signed-off-by: Zhu Yanjun Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_comp.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index a285978aa7fe..98d470d1f3fc 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -659,7 +659,6 @@ int rxe_completer(void *arg) qp->qp_timeout_jiffies) mod_timer(&qp->retrans_timer, jiffies + qp->qp_timeout_jiffies); - WARN_ON_ONCE(skb); goto exit; case COMPST_ERROR_RETRY: @@ -673,7 +672,6 @@ int rxe_completer(void *arg) /* there is nothing to retry in this case */ if (!wqe || (wqe->state == wqe_state_posted)) { - WARN_ON_ONCE(skb); goto exit; } @@ -702,7 +700,6 @@ int rxe_completer(void *arg) skb = NULL; } - WARN_ON_ONCE(skb); goto exit; } else { @@ -746,7 +743,6 @@ int rxe_completer(void *arg) skb = NULL; } - WARN_ON_ONCE(skb); goto exit; } } -- cgit v1.2.3 From fa9391dbad4b868512ed22a7e41765f881a8a935 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Fri, 18 May 2018 11:36:09 -0400 Subject: RDMA/ipoib: Update paths on CLIENT_REREG/SM_CHANGE events We do a light flush on CLIENT_REREG and SM_CHANGE events. This goes through and marks paths invalid. But we weren't always checking for this validity when we needed to, and so we could keep using a path marked invalid. What's more, once we establish a path with a valid ah, we put a pointer to the ah in the neigh struct directly, so even if we mark the path as invalid, as long as the neigh has a direct pointer to the ah, it keeps using the old, outdated ah. To fix this we do several things. 1) Put the valid flag in the ah instead of the path struct, so when we put the ah pointer directly in the neigh struct, we can easily check the validity of the ah on send events. 2) Check the neigh->ah and neigh->ah->valid elements in the needed places, and if we have an ah, but it's invalid, then invoke a refresh of the ah. 3) Fix the various places that check for path, but didn't check for path->valid (now path->ah && path->ah->valid). 
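In outline, the unicast send path after this change becomes (a simplified sketch of the hunks below; the queueing fallback and the refcounting around the AH are omitted):

        if (neigh->ah && neigh->ah->valid) {
                /* cached address handle is still good: send directly */
                neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
                                                IPOIB_QPN(phdr->hwaddr));
        } else if (neigh->ah) {
                /* the AH survived but was invalidated by a CLIENT_REREG or
                 * SM_CHANGE light flush: kick off a path record refresh
                 */
                neigh_refresh_path(neigh, phdr->hwaddr, dev);
        }
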
Reported-by: Evgenii Smirnov Fixes: ee1e2c82c245 ("IPoIB: Refresh paths instead of flushing them on SM change events") Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib.h | 2 +- drivers/infiniband/ulp/ipoib/ipoib_main.c | 33 +++++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 308e0ce49289..a50b062ed13e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -415,6 +415,7 @@ struct ipoib_ah { struct list_head list; struct kref ref; unsigned last_send; + int valid; }; struct ipoib_path { @@ -431,7 +432,6 @@ struct ipoib_path { struct rb_node rb_node; struct list_head list; - int valid; }; struct ipoib_neigh { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index cf291f90b58f..788bb9573f1f 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -697,7 +697,8 @@ void ipoib_mark_paths_invalid(struct net_device *dev) ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n", be32_to_cpu(sa_path_get_dlid(&path->pathrec)), path->pathrec.dgid.raw); - path->valid = 0; + if (path->ah) + path->ah->valid = 0; } spin_unlock_irq(&priv->lock); @@ -833,7 +834,7 @@ static void path_rec_completion(int status, while ((skb = __skb_dequeue(&neigh->queue))) __skb_queue_tail(&skqueue, skb); } - path->valid = 1; + path->ah->valid = 1; } path->query = NULL; @@ -926,6 +927,24 @@ static int path_rec_start(struct net_device *dev, return 0; } +static void neigh_refresh_path(struct ipoib_neigh *neigh, u8 *daddr, + struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_path *path; + unsigned long flags; + + spin_lock_irqsave(&priv->lock, flags); + + path = __path_find(dev, daddr + 4); + if (!path) + goto out; + if (!path->query) + path_rec_start(dev, path); +out: + spin_unlock_irqrestore(&priv->lock, flags); +} + static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr, struct net_device *dev) { @@ -963,7 +982,7 @@ static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr, list_add_tail(&neigh->list, &path->neigh_list); - if (path->ah) { + if (path->ah && path->ah->valid) { kref_get(&path->ah->ref); neigh->ah = path->ah; @@ -1034,7 +1053,7 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, goto drop_and_unlock; path = __path_find(dev, phdr->hwaddr + 4); - if (!path || !path->valid) { + if (!path || !path->ah || !path->ah->valid) { int new_path = 0; if (!path) { @@ -1069,7 +1088,7 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, return; } - if (path->ah) { + if (path->ah && path->ah->valid) { ipoib_dbg(priv, "Send unicast ARP to %08x\n", be32_to_cpu(sa_path_get_dlid(&path->pathrec))); @@ -1161,10 +1180,12 @@ send_using_neigh: ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); goto unref; } - } else if (neigh->ah) { + } else if (neigh->ah && neigh->ah->valid) { neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah, IPOIB_QPN(phdr->hwaddr)); goto unref; + } else if (neigh->ah) { + neigh_refresh_path(neigh, phdr->hwaddr, dev); } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { -- cgit v1.2.3 From b06f2efd3bbe522ee0e118c3f29497c857d97f8b Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 10 May 2018 07:31:28 -0700 Subject: iw_cxgb4: always set iw_cm_id.provider_data In active side connections, 
the provider_data field is not getting set. This will be used in a subsequent patch to dump state, so always set it. Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/cm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 4cf17c650c36..0912fa026327 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3210,6 +3210,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->com.cm_id = cm_id; ref_cm_id(&ep->com); + cm_id->provider_data = ep; ep->com.dev = dev; ep->com.qp = get_qhp(dev, conn_param->qpn); if (!ep->com.qp) { -- cgit v1.2.3 From fbdb0a9181cb4c489a857f6bf71648276c85969c Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 10 May 2018 07:31:36 -0700 Subject: RDMA/CMA: add rdma_iw_cm_id() and rdma_res_to_id() helpers Add a helper function for iwarp drivers to be able to map an rdma_cm_id to an iw_cm_id. This is useful for dumping driver specific NLDEV/RESTRACK connection state. Add a helper to return the rdma_cm_id pointer from the rdma_restack pointer. This is needed for rdma drivers to map a res entry back to the public rdma_cm_id struct. Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 28 ++++++++++++++++++++++++++++ include/rdma/rdma_cm.h | 3 +++ 2 files changed, 31 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index a403e679c6c1..441555a35525 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -146,6 +146,34 @@ const void *rdma_consumer_reject_data(struct rdma_cm_id *id, } EXPORT_SYMBOL(rdma_consumer_reject_data); +/** + * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id. + * @id: Communication Identifier + */ +struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id) +{ + struct rdma_id_private *id_priv; + + id_priv = container_of(id, struct rdma_id_private, id); + if (id->device->node_type == RDMA_NODE_RNIC) + return id_priv->cm_id.iw; + return NULL; +} +EXPORT_SYMBOL(rdma_iw_cm_id); + +/** + * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack. + * @res: rdma resource tracking entry pointer + */ +struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res) +{ + struct rdma_id_private *id_priv = + container_of(res, struct rdma_id_private, res); + + return &id_priv->id; +} +EXPORT_SYMBOL(rdma_res_to_id); + static void cma_add_one(struct ib_device *device); static void cma_remove_one(struct ib_device *device, void *client_data); diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 690934733ba7..c5c1435c129a 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -420,4 +420,7 @@ const void *rdma_consumer_reject_data(struct rdma_cm_id *id, void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, union ib_gid *dgid); +struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *cm_id); +struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res); + #endif /* RDMA_CM_H */ -- cgit v1.2.3 From 15517080c5418831012fdfb89c761fa1d055f440 Mon Sep 17 00:00:00 2001 From: Evgenii Smirnov Date: Tue, 22 May 2018 16:00:47 +0200 Subject: RDMA/ipoib: drop skb on path record lookup failure In unicast_arp_send function there is an inconsistency in error handling of path_rec_start call. If path_rec_start is called because of an absent ah field, skb will be dropped. 
But if it is called on a creation of a new path, or if the path is invalid, skb will be added to the tail of path queue. In case of a new path it will be dropped on path_free, but in case of invalid path it can stay in the queue forever. This patch unifies the behavior, dropping skb in all cases of path_rec_start failure. Signed-off-by: Evgenii Smirnov Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 62 +++++++++++-------------------- 1 file changed, 21 insertions(+), 41 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 788bb9573f1f..2ce40a7ff604 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1054,62 +1054,42 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, path = __path_find(dev, phdr->hwaddr + 4); if (!path || !path->ah || !path->ah->valid) { - int new_path = 0; - if (!path) { path = path_rec_create(dev, phdr->hwaddr + 4); - new_path = 1; + if (!path) + goto drop_and_unlock; + __path_add(dev, path); + } else { + /* + * make sure there are no changes in the existing + * path record + */ + init_path_rec(priv, path, phdr->hwaddr + 4); + } + if (!path->query && path_rec_start(dev, path)) { + goto drop_and_unlock; } - if (path) { - if (!new_path) - /* make sure there is no changes in the existing path record */ - init_path_rec(priv, path, phdr->hwaddr + 4); - - if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - push_pseudo_header(skb, phdr->hwaddr); - __skb_queue_tail(&path->queue, skb); - } else { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); - } - if (!path->query && path_rec_start(dev, path)) { - spin_unlock_irqrestore(&priv->lock, flags); - if (new_path) - path_free(dev, path); - return; - } else - __path_add(dev, path); + if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { + push_pseudo_header(skb, phdr->hwaddr); + __skb_queue_tail(&path->queue, skb); + goto unlock; } else { goto drop_and_unlock; } - - spin_unlock_irqrestore(&priv->lock, flags); - return; - } - - if (path->ah && path->ah->valid) { - ipoib_dbg(priv, "Send unicast ARP to %08x\n", - be32_to_cpu(sa_path_get_dlid(&path->pathrec))); - - spin_unlock_irqrestore(&priv->lock, flags); - path->ah->last_send = rn->send(dev, skb, path->ah->ah, - IPOIB_QPN(phdr->hwaddr)); - return; - } else if ((path->query || !path_rec_start(dev, path)) && - skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - push_pseudo_header(skb, phdr->hwaddr); - __skb_queue_tail(&path->queue, skb); - } else { - goto drop_and_unlock; } spin_unlock_irqrestore(&priv->lock, flags); + ipoib_dbg(priv, "Send unicast ARP to %08x\n", + be32_to_cpu(sa_path_get_dlid(&path->pathrec))); + path->ah->last_send = rn->send(dev, skb, path->ah->ah, + IPOIB_QPN(phdr->hwaddr)); return; drop_and_unlock: ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); +unlock: spin_unlock_irqrestore(&priv->lock, flags); } -- cgit v1.2.3 From 116aeb8873712ea559d26b0d9d88147af5c88db5 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 10 May 2018 07:31:43 -0700 Subject: iw_cxgb4: provide detailed provider-specific CM_ID information Add a table of important fields from the c4iw_ep* structures to the cm_id resource tracking table. This is helpful in debugging. 
Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/restrack.c | 84 ++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c index b9724d0b32e0..463ef5813a59 100644 --- a/drivers/infiniband/hw/cxgb4/restrack.c +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -30,6 +30,8 @@ * SOFTWARE. */ +#include + #include "iw_cxgb4.h" #include #include @@ -188,6 +190,88 @@ err: return -EMSGSIZE; } +union union_ep { + struct c4iw_listen_ep lep; + struct c4iw_ep ep; +}; + +static int fill_res_ep_entry(struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + struct rdma_cm_id *cm_id = rdma_res_to_id(res); + struct nlattr *table_attr; + struct c4iw_ep_common *epcp; + struct c4iw_listen_ep *listen_ep = NULL; + struct c4iw_ep *ep = NULL; + struct iw_cm_id *iw_cm_id; + union union_ep *uep; + + iw_cm_id = rdma_iw_cm_id(cm_id); + if (!iw_cm_id) + return 0; + epcp = (struct c4iw_ep_common *)iw_cm_id->provider_data; + if (!epcp) + return 0; + uep = kcalloc(1, sizeof(*uep), GFP_KERNEL); + if (!uep) + return 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + goto err_free_uep; + + /* Get a consistent snapshot */ + mutex_lock(&epcp->mutex); + if (epcp->state == LISTEN) { + uep->lep = *(struct c4iw_listen_ep *)epcp; + mutex_unlock(&epcp->mutex); + listen_ep = &uep->lep; + epcp = &listen_ep->com; + } else { + uep->ep = *(struct c4iw_ep *)epcp; + mutex_unlock(&epcp->mutex); + ep = &uep->ep; + epcp = &ep->com; + } + + if (rdma_nl_put_driver_u32(msg, "state", epcp->state)) + goto err_cancel_table; + if (rdma_nl_put_driver_u64_hex(msg, "flags", epcp->flags)) + goto err_cancel_table; + if (rdma_nl_put_driver_u64_hex(msg, "history", epcp->history)) + goto err_cancel_table; + + if (epcp->state == LISTEN) { + if (rdma_nl_put_driver_u32(msg, "stid", listen_ep->stid)) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "backlog", listen_ep->backlog)) + goto err_cancel_table; + } else { + if (rdma_nl_put_driver_u32(msg, "hwtid", ep->hwtid)) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "ord", ep->ord)) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "ird", ep->ird)) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "emss", ep->emss)) + goto err_cancel_table; + + if (!ep->parent_ep && rdma_nl_put_driver_u32(msg, "atid", + ep->atid)) + goto err_cancel_table; + } + nla_nest_end(msg, table_attr); + kfree(uep); + return 0; + +err_cancel_table: + nla_nest_cancel(msg, table_attr); +err_free_uep: + kfree(uep); + return -EMSGSIZE; +} + c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = fill_res_qp_entry, + [RDMA_RESTRACK_CM_ID] = fill_res_ep_entry, }; -- cgit v1.2.3 From 54e7688e54bed5db5c7859cbdbf393e8b2b7ef0b Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 10 May 2018 07:31:51 -0700 Subject: iw_cxgb4: provide detailed driver-specific CQ information Add a table of important fields from the c4iw_cq* structures to the cq resource tracking table. This is helpful in debugging. 
Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/restrack.c | 163 +++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c index 463ef5813a59..013524a093cf 100644 --- a/drivers/infiniband/hw/cxgb4/restrack.c +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -271,7 +271,170 @@ err_free_uep: return -EMSGSIZE; } +static int fill_cq(struct sk_buff *msg, struct t4_cq *cq) +{ + if (rdma_nl_put_driver_u32(msg, "cqid", cq->cqid)) + goto err; + if (rdma_nl_put_driver_u32(msg, "memsize", cq->memsize)) + goto err; + if (rdma_nl_put_driver_u32(msg, "size", cq->size)) + goto err; + if (rdma_nl_put_driver_u32(msg, "cidx", cq->cidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "cidx_inc", cq->cidx_inc)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sw_cidx", cq->sw_cidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sw_pidx", cq->sw_pidx)) + goto err; + if (rdma_nl_put_driver_u32(msg, "sw_in_use", cq->sw_in_use)) + goto err; + if (rdma_nl_put_driver_u32(msg, "vector", cq->vector)) + goto err; + if (rdma_nl_put_driver_u32(msg, "gen", cq->gen)) + goto err; + if (rdma_nl_put_driver_u32(msg, "error", cq->error)) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "bits_type_ts", + be64_to_cpu(cq->bits_type_ts))) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "flags", cq->flags)) + goto err; + + return 0; + +err: + return -EMSGSIZE; +} + +static int fill_cqe(struct sk_buff *msg, struct t4_cqe *cqe, u16 idx, + const char *qstr) +{ + if (rdma_nl_put_driver_u32(msg, qstr, idx)) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "header", + be32_to_cpu(cqe->header))) + goto err; + if (rdma_nl_put_driver_u32(msg, "len", be32_to_cpu(cqe->len))) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "wrid_hi", + be32_to_cpu(cqe->u.gen.wrid_hi))) + goto err; + if (rdma_nl_put_driver_u32_hex(msg, "wrid_low", + be32_to_cpu(cqe->u.gen.wrid_low))) + goto err; + if (rdma_nl_put_driver_u64_hex(msg, "bits_type_ts", + be64_to_cpu(cqe->bits_type_ts))) + goto err; + + return 0; + +err: + return -EMSGSIZE; +} + +static int fill_hwcqes(struct sk_buff *msg, struct t4_cq *cq, + struct t4_cqe *cqes) +{ + u16 idx; + + idx = (cq->cidx > 0) ? cq->cidx - 1 : cq->size - 1; + if (fill_cqe(msg, cqes, idx, "hwcq_idx")) + goto err; + idx = cq->cidx; + if (fill_cqe(msg, cqes + 1, idx, "hwcq_idx")) + goto err; + + return 0; +err: + return -EMSGSIZE; +} + +static int fill_swcqes(struct sk_buff *msg, struct t4_cq *cq, + struct t4_cqe *cqes) +{ + u16 idx; + + if (!cq->sw_in_use) + return 0; + + idx = cq->sw_cidx; + if (fill_cqe(msg, cqes, idx, "swcq_idx")) + goto err; + if (cq->sw_in_use == 1) + goto out; + idx = (cq->sw_pidx > 0) ? 
cq->sw_pidx - 1 : cq->size - 1; + if (fill_cqe(msg, cqes + 1, idx, "swcq_idx")) + goto err; +out: + return 0; +err: + return -EMSGSIZE; +} + +static int fill_res_cq_entry(struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + struct ib_cq *ibcq = container_of(res, struct ib_cq, res); + struct c4iw_cq *chp = to_c4iw_cq(ibcq); + struct nlattr *table_attr; + struct t4_cqe hwcqes[2]; + struct t4_cqe swcqes[2]; + struct t4_cq cq; + u16 idx; + + /* User cq state is not available, so don't dump user cqs */ + if (ibcq->uobject) + return 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + goto err; + + /* Get a consistent snapshot */ + spin_lock_irq(&chp->lock); + + /* t4_cq struct */ + cq = chp->cq; + + /* get 2 hw cqes: cidx-1, and cidx */ + idx = (cq.cidx > 0) ? cq.cidx - 1 : cq.size - 1; + hwcqes[0] = chp->cq.queue[idx]; + + idx = cq.cidx; + hwcqes[1] = chp->cq.queue[idx]; + + /* get first and last sw cqes */ + if (cq.sw_in_use) { + swcqes[0] = chp->cq.sw_queue[cq.sw_cidx]; + if (cq.sw_in_use > 1) { + idx = (cq.sw_pidx > 0) ? cq.sw_pidx - 1 : cq.size - 1; + swcqes[1] = chp->cq.sw_queue[idx]; + } + } + + spin_unlock_irq(&chp->lock); + + if (fill_cq(msg, &cq)) + goto err_cancel_table; + + if (fill_swcqes(msg, &cq, swcqes)) + goto err_cancel_table; + + if (fill_hwcqes(msg, &cq, hwcqes)) + goto err_cancel_table; + + nla_nest_end(msg, table_attr); + return 0; + +err_cancel_table: + nla_nest_cancel(msg, table_attr); +err: + return -EMSGSIZE; +} + c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = fill_res_qp_entry, [RDMA_RESTRACK_CM_ID] = fill_res_ep_entry, + [RDMA_RESTRACK_CQ] = fill_res_cq_entry, }; -- cgit v1.2.3 From 013f64a88059fc65f01f1b967f1cf9c666a231a2 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 10 May 2018 07:32:01 -0700 Subject: iw_cxgb4: provide detailed driver-specific MR information Add a table of important fields from the fw_ri_tpte structure to the mr resource tracking table. This is helpful in debugging. 
Signed-off-by: Steve Wise Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb4/restrack.c | 61 ++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c index 013524a093cf..9a7520ee41e0 100644 --- a/drivers/infiniband/hw/cxgb4/restrack.c +++ b/drivers/infiniband/hw/cxgb4/restrack.c @@ -433,8 +433,69 @@ err: return -EMSGSIZE; } +static int fill_res_mr_entry(struct sk_buff *msg, + struct rdma_restrack_entry *res) +{ + struct ib_mr *ibmr = container_of(res, struct ib_mr, res); + struct c4iw_mr *mhp = to_c4iw_mr(ibmr); + struct c4iw_dev *dev = mhp->rhp; + u32 stag = mhp->attr.stag; + struct nlattr *table_attr; + struct fw_ri_tpte tpte; + int ret; + + if (!stag) + return 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER); + if (!table_attr) + goto err; + + ret = cxgb4_read_tpte(dev->rdev.lldi.ports[0], stag, (__be32 *)&tpte); + if (ret) { + dev_err(&dev->rdev.lldi.pdev->dev, + "%s cxgb4_read_tpte err %d\n", __func__, ret); + return 0; + } + + if (rdma_nl_put_driver_u32_hex(msg, "idx", stag >> 8)) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "valid", + FW_RI_TPTE_VALID_G(ntohl(tpte.valid_to_pdid)))) + goto err_cancel_table; + if (rdma_nl_put_driver_u32_hex(msg, "key", stag & 0xff)) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "state", + FW_RI_TPTE_STAGSTATE_G(ntohl(tpte.valid_to_pdid)))) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "pdid", + FW_RI_TPTE_PDID_G(ntohl(tpte.valid_to_pdid)))) + goto err_cancel_table; + if (rdma_nl_put_driver_u32_hex(msg, "perm", + FW_RI_TPTE_PERM_G(ntohl(tpte.locread_to_qpid)))) + goto err_cancel_table; + if (rdma_nl_put_driver_u32(msg, "ps", + FW_RI_TPTE_PS_G(ntohl(tpte.locread_to_qpid)))) + goto err_cancel_table; + if (rdma_nl_put_driver_u64(msg, "len", + ((u64)ntohl(tpte.len_hi) << 32) | ntohl(tpte.len_lo))) + goto err_cancel_table; + if (rdma_nl_put_driver_u32_hex(msg, "pbl_addr", + FW_RI_TPTE_PBLADDR_G(ntohl(tpte.nosnoop_pbladdr)))) + goto err_cancel_table; + + nla_nest_end(msg, table_attr); + return 0; + +err_cancel_table: + nla_nest_cancel(msg, table_attr); +err: + return -EMSGSIZE; +} + c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX] = { [RDMA_RESTRACK_QP] = fill_res_qp_entry, [RDMA_RESTRACK_CM_ID] = fill_res_ep_entry, [RDMA_RESTRACK_CQ] = fill_res_cq_entry, + [RDMA_RESTRACK_MR] = fill_res_mr_entry, }; -- cgit v1.2.3 From 4171a693a5159e47f72eea3331bebf538dea9b83 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Tue, 15 May 2018 18:28:07 -0700 Subject: IB/hfi1: Define 16B Management Packets Add 16B Management Packet definition. This optimized packet format replaces the ib_other_headers and BTH with a source and destination QP number. To support these packets we introduce struct opa_16b_mgmt into the struct hfi1_16b_header. This packet format is only used for MAD packets using the IB_OPCODE_UD_SEND_ONLY opcode on QP0/1. The original 16B implementation failed to use 16B management packets so now we add their definition. 
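As a side note on the layout described above, the fragment below is a minimal stand-alone sketch (user-space C with invented helper names, not code from the patch; the in-kernel accessors arrive in the follow-on patch) of how a 24-bit QP number is packed into and recovered from a big-endian 32-bit word whose upper 8 bits stay reserved.

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>          /* htonl()/ntohl() */

#define OPA_16B_MGMT_QPN_MASK 0xFFFFFF  /* QPN lives in the low 24 bits */

/* Pack a QP number for the wire; the top 8 bits stay zero (reserved). */
static uint32_t pack_mgmt_qpn(uint32_t qpn)
{
        return htonl(qpn & OPA_16B_MGMT_QPN_MASK);
}

/* Recover the QP number from the big-endian on-wire word. */
static uint32_t unpack_mgmt_qpn(uint32_t be_word)
{
        return ntohl(be_word) & OPA_16B_MGMT_QPN_MASK;
}

int main(void)
{
        uint32_t wire = pack_mgmt_qpn(1);       /* e.g. QP1, the GSI QP */
        printf("wire: 0x%08x decoded: %u\n", wire, unpack_mgmt_qpn(wire));
        return 0;
}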
Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/hfi.h | 1 + drivers/infiniband/hw/hfi1/verbs.h | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index dd84238c1aac..531ac89c8213 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -392,6 +392,7 @@ struct hfi1_packet { */ #define OPA_16B_L4_9B 0x00 #define OPA_16B_L2_TYPE 0x02 +#define OPA_16B_L4_FM 0x08 #define OPA_16B_L4_IB_LOCAL 0x09 #define OPA_16B_L4_IB_GLOBAL 0x0A #define OPA_16B_L4_ETHR OPA_VNIC_L4_ETHR diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index a16fe5d3f7c4..a4d06502f06d 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -110,6 +110,12 @@ enum { #define LRH_9B_BYTES (FIELD_SIZEOF(struct ib_header, lrh)) #define LRH_9B_DWORDS (LRH_9B_BYTES / sizeof(u32)) +/* 24Bits for qpn, upper 8Bits reserved */ +struct opa_16b_mgmt { + __be32 dest_qpn; + __be32 src_qpn; +}; + struct hfi1_16b_header { u32 lrh[4]; union { @@ -118,6 +124,7 @@ struct hfi1_16b_header { struct ib_other_headers oth; } l; struct ib_other_headers oth; + struct opa_16b_mgmt mgmt; } u; } __packed; -- cgit v1.2.3 From 81cd3891f021b88319f7243715c30945aaabe9ea Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Tue, 15 May 2018 18:28:15 -0700 Subject: IB/hfi1: Add support for 16B Management Packets 16B Management Packets (L4=0x08) replace the BTH and DETH of normal MAD packet packets with a header containing the the source and destination queue pair numbers; fields that were originally retrieved from the BTH/DETH are now populated from this header as well as from the 16B LRH (e.g. pkey). 16B Management Packets are used as an optimized management format on 16B fabrics. These management packets have an opcode of IB_OPCODE_UD_SEND_ONLY, a fixed 3Byte pad, and a header length of 24Bytes. The decision as to when we send a management packet is based upon either the source or destination queue pair number being 0 or 1. 
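To restate that selection rule in code, a minimal sketch follows (illustrative only; the predicate name is invented, and in the actual patch the test is written inline in hfi1_make_ud_req_16B()):

/* Illustrative sketch: the optimized FM (management) header is chosen
 * whenever either end of the UD send is QP0 or QP1, i.e. an SMP/GSI QP. */
static bool use_16b_mgmt_format(u32 dest_qp, u32 src_qp)
{
        return dest_qp == 0 || src_qp == 0 || dest_qp == 1 || src_qp == 1;
}

When the predicate is true the header shrinks to the 16B LRH plus the 8-byte L4_FM block (6 dwords) instead of LRH+BTH+DETH (9 dwords), which is the hdr_dwords adjustment made in the ud.c hunk below.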
Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/driver.c | 33 ++++++++++++++------- drivers/infiniband/hw/hfi1/hfi.h | 28 ++++++++++++++++++ drivers/infiniband/hw/hfi1/ud.c | 57 +++++++++++++++++++++++++++---------- drivers/infiniband/hw/hfi1/verbs.c | 25 +++++++++++----- 4 files changed, 110 insertions(+), 33 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index e5a57ebd8da4..94dca95db04f 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -1483,38 +1483,51 @@ static int hfi1_setup_bypass_packet(struct hfi1_packet *packet) struct hfi1_pportdata *ppd = rcd->ppd; struct hfi1_ibport *ibp = &ppd->ibport_data; u8 l4; - u8 grh_len; packet->hdr = (struct hfi1_16b_header *) hfi1_get_16B_header(packet->rcd->dd, packet->rhf_addr); - packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; - l4 = hfi1_16B_get_l4(packet->hdr); if (l4 == OPA_16B_L4_IB_LOCAL) { - grh_len = 0; packet->ohdr = packet->ebuf; packet->grh = NULL; + packet->opcode = ib_bth_get_opcode(packet->ohdr); + packet->pad = hfi1_16B_bth_get_pad(packet->ohdr); + /* hdr_len_by_opcode already has an IB LRH factored in */ + packet->hlen = hdr_len_by_opcode[packet->opcode] + + (LRH_16B_BYTES - LRH_9B_BYTES); + packet->migrated = opa_bth_is_migration(packet->ohdr); } else if (l4 == OPA_16B_L4_IB_GLOBAL) { u32 vtf; + u8 grh_len = sizeof(struct ib_grh); - grh_len = sizeof(struct ib_grh); packet->ohdr = packet->ebuf + grh_len; packet->grh = packet->ebuf; + packet->opcode = ib_bth_get_opcode(packet->ohdr); + packet->pad = hfi1_16B_bth_get_pad(packet->ohdr); + /* hdr_len_by_opcode already has an IB LRH factored in */ + packet->hlen = hdr_len_by_opcode[packet->opcode] + + (LRH_16B_BYTES - LRH_9B_BYTES) + grh_len; + packet->migrated = opa_bth_is_migration(packet->ohdr); + if (packet->grh->next_hdr != IB_GRH_NEXT_HDR) goto drop; vtf = be32_to_cpu(packet->grh->version_tclass_flow); if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) goto drop; + } else if (l4 == OPA_16B_L4_FM) { + packet->mgmt = packet->ebuf; + packet->ohdr = NULL; + packet->grh = NULL; + packet->opcode = IB_OPCODE_UD_SEND_ONLY; + packet->pad = OPA_16B_L4_FM_PAD; + packet->hlen = OPA_16B_L4_FM_HLEN; + packet->migrated = false; } else { goto drop; } /* Query commonly used fields from packet header */ - packet->opcode = ib_bth_get_opcode(packet->ohdr); - /* hdr_len_by_opcode already has an IB LRH factored in */ - packet->hlen = hdr_len_by_opcode[packet->opcode] + - (LRH_16B_BYTES - LRH_9B_BYTES) + grh_len; packet->payload = packet->ebuf + packet->hlen - LRH_16B_BYTES; packet->slid = hfi1_16B_get_slid(packet->hdr); packet->dlid = hfi1_16B_get_dlid(packet->hdr); @@ -1524,10 +1537,8 @@ static int hfi1_setup_bypass_packet(struct hfi1_packet *packet) 16B); packet->sc = hfi1_16B_get_sc(packet->hdr); packet->sl = ibp->sc_to_sl[packet->sc]; - packet->pad = hfi1_16B_bth_get_pad(packet->ohdr); packet->extra_byte = SIZE_OF_LT; packet->pkey = hfi1_16B_get_pkey(packet->hdr); - packet->migrated = opa_bth_is_migration(packet->ohdr); if (hfi1_bypass_ingress_pkt_check(packet)) goto drop; diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 531ac89c8213..f49cd80df557 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -333,6 +333,7 @@ struct hfi1_packet { struct rvt_qp *qp; struct ib_other_headers *ohdr; struct ib_grh *grh; + 
struct opa_16b_mgmt *mgmt; u64 rhf; u32 maxcnt; u32 rhqoff; @@ -397,6 +398,12 @@ struct hfi1_packet { #define OPA_16B_L4_IB_GLOBAL 0x0A #define OPA_16B_L4_ETHR OPA_VNIC_L4_ETHR +/* + * OPA 16B Management + */ +#define OPA_16B_L4_FM_PAD 3 /* fixed 3B pad */ +#define OPA_16B_L4_FM_HLEN 24 /* 16B(16) + L4_FM(8) */ + static inline u8 hfi1_16B_get_l4(struct hfi1_16b_header *hdr) { return (u8)(hdr->lrh[2] & OPA_16B_L4_MASK); @@ -473,6 +480,27 @@ static inline u8 hfi1_16B_bth_get_pad(struct ib_other_headers *ohdr) OPA_16B_BTH_PAD_MASK); } +/* + * 16B Management + */ +#define OPA_16B_MGMT_QPN_MASK 0xFFFFFF +static inline u32 hfi1_16B_get_dest_qpn(struct opa_16b_mgmt *mgmt) +{ + return be32_to_cpu(mgmt->dest_qpn) & OPA_16B_MGMT_QPN_MASK; +} + +static inline u32 hfi1_16B_get_src_qpn(struct opa_16b_mgmt *mgmt) +{ + return be32_to_cpu(mgmt->src_qpn) & OPA_16B_MGMT_QPN_MASK; +} + +static inline void hfi1_16B_set_qpn(struct opa_16b_mgmt *mgmt, + u32 dest_qp, u32 src_qp) +{ + mgmt->dest_qpn = cpu_to_be32(dest_qp & OPA_16B_MGMT_QPN_MASK); + mgmt->src_qpn = cpu_to_be32(src_qp & OPA_16B_MGMT_QPN_MASK); +} + struct rvt_sge_state; /* diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 6ad203f6da88..1ab332f1866e 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -399,16 +399,30 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, struct hfi1_pportdata *ppd; struct hfi1_ibport *ibp; u32 dlid, slid, nwords, extra_bytes; + u32 dest_qp = wqe->ud_wr.remote_qpn; + u32 src_qp = qp->ibqp.qp_num; u16 len, pkey; u8 l4, sc5; + bool is_mgmt = false; ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; - /* header size in dwords 16B LRH+BTH+DETH = (16+12+8)/4. */ - ps->s_txreq->hdr_dwords = 9; - if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) - ps->s_txreq->hdr_dwords++; + + /* + * Build 16B Management Packet if either the destination + * or source queue pair number is 0 or 1. + */ + if (dest_qp == 0 || src_qp == 0 || dest_qp == 1 || src_qp == 1) { + /* header size in dwords 16B LRH+L4_FM = (16+8)/4. */ + ps->s_txreq->hdr_dwords = 6; + is_mgmt = true; + } else { + /* header size in dwords 16B LRH+BTH+DETH = (16+12+8)/4. */ + ps->s_txreq->hdr_dwords = 9; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + ps->s_txreq->hdr_dwords++; + } /* SW provides space for CRC and LT for bypass packets. 
*/ extra_bytes = hfi1_get_16b_padding((ps->s_txreq->hdr_dwords << 2), @@ -453,7 +467,14 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & ((1 << ppd->lmc) - 1)); - hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, true); + if (is_mgmt) { + l4 = OPA_16B_L4_FM; + pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); + hfi1_16B_set_qpn(&ps->s_txreq->phdr.hdr.opah.u.mgmt, + dest_qp, src_qp); + } else { + hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, true); + } /* Convert dwords to flits */ len = (ps->s_txreq->hdr_dwords + nwords) >> 1; @@ -845,10 +866,8 @@ static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5, */ void hfi1_ud_rcv(struct hfi1_packet *packet) { - struct ib_other_headers *ohdr = packet->ohdr; u32 hdrsize = packet->hlen; struct ib_wc wc; - u32 qkey; u32 src_qp; u16 pkey; int mgmt_pkey_idx = -1; @@ -864,27 +883,35 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) u32 dlid = packet->dlid; u32 slid = packet->slid; u8 extra_bytes; + u8 l4 = 0; bool dlid_is_permissive; bool slid_is_permissive; + bool solicited = false; extra_bytes = packet->pad + packet->extra_byte + (SIZE_OF_CRC << 2); - qkey = ib_get_qkey(ohdr); - src_qp = ib_get_sqpn(ohdr); if (packet->etype == RHF_RCV_TYPE_BYPASS) { u32 permissive_lid = opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B); + l4 = hfi1_16B_get_l4(packet->hdr); pkey = hfi1_16B_get_pkey(packet->hdr); dlid_is_permissive = (dlid == permissive_lid); slid_is_permissive = (slid == permissive_lid); } else { - pkey = ib_bth_get_pkey(ohdr); + pkey = ib_bth_get_pkey(packet->ohdr); dlid_is_permissive = (dlid == be16_to_cpu(IB_LID_PERMISSIVE)); slid_is_permissive = (slid == be16_to_cpu(IB_LID_PERMISSIVE)); } sl_from_sc = ibp->sc_to_sl[sc5]; + if (likely(l4 != OPA_16B_L4_FM)) { + src_qp = ib_get_sqpn(packet->ohdr); + solicited = ib_bth_is_solicited(packet->ohdr); + } else { + src_qp = hfi1_16B_get_src_qpn(packet->mgmt); + } + process_ecn(qp, packet, (opcode != IB_OPCODE_CNP)); /* * Get the number of bytes the message was padded by @@ -922,8 +949,9 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) if (mgmt_pkey_idx < 0) goto drop; } - if (unlikely(qkey != qp->qkey)) /* Silent drop */ - return; + if (unlikely(l4 != OPA_16B_L4_FM && + ib_get_qkey(packet->ohdr) != qp->qkey)) + return; /* Silent drop */ /* Drop invalid MAD packets (see 13.5.3.1). */ if (unlikely(qp->ibqp.qp_num == 1 && @@ -950,7 +978,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) if (qp->ibqp.qp_num > 1 && opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { - wc.ex.imm_data = ohdr->u.ud.imm_data; + wc.ex.imm_data = packet->ohdr->u.ud.imm_data; wc.wc_flags = IB_WC_WITH_IMM; tlen -= sizeof(u32); } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { @@ -1047,8 +1075,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - ib_bth_is_solicited(ohdr)); + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, solicited); return; drop: diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index fc2e44cde161..08991874c0e2 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -617,7 +617,12 @@ static inline void hfi1_handle_packet(struct hfi1_packet *packet, wake_up(&mcast->wait); } else { /* Get the destination QP number. 
*/ - qp_num = ib_bth_get_qpn(packet->ohdr); + if (packet->etype == RHF_RCV_TYPE_BYPASS && + hfi1_16B_get_l4(packet->hdr) == OPA_16B_L4_FM) + qp_num = hfi1_16B_get_dest_qpn(packet->mgmt); + else + qp_num = ib_bth_get_qpn(packet->ohdr); + rcu_read_lock(); packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); if (!packet->qp) @@ -1308,21 +1313,23 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); struct hfi1_qp_priv *priv = qp->priv; - struct ib_other_headers *ohdr; + struct ib_other_headers *ohdr = NULL; send_routine sr; int ret; u16 pkey; u32 slid; + u8 l4 = 0; /* locate the pkey within the headers */ if (ps->s_txreq->phdr.hdr.hdr_type) { struct hfi1_16b_header *hdr = &ps->s_txreq->phdr.hdr.opah; - u8 l4 = hfi1_16B_get_l4(hdr); - if (l4 == OPA_16B_L4_IB_GLOBAL) - ohdr = &hdr->u.l.oth; - else + l4 = hfi1_16B_get_l4(hdr); + if (l4 == OPA_16B_L4_IB_LOCAL) ohdr = &hdr->u.oth; + else if (l4 == OPA_16B_L4_IB_GLOBAL) + ohdr = &hdr->u.l.oth; + slid = hfi1_16B_get_slid(hdr); pkey = hfi1_16B_get_pkey(hdr); } else { @@ -1337,7 +1344,11 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) pkey = ib_bth_get_pkey(ohdr); } - ps->opcode = ib_bth_get_opcode(ohdr); + if (likely(l4 != OPA_16B_L4_FM)) + ps->opcode = ib_bth_get_opcode(ohdr); + else + ps->opcode = IB_OPCODE_UD_SEND_ONLY; + sr = get_send_routine(qp, ps); ret = egress_pkey_check(dd->pport, slid, pkey, priv->s_sc, qp->s_pkey_index); -- cgit v1.2.3 From 43a68c35c7b1135ec05b8c84e7509a50925b00b6 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Tue, 15 May 2018 18:28:22 -0700 Subject: IB/hfi1: Add 16B Management Packet trace support Add trace support for 16B Management Packets. Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/trace.c | 40 ++++++-- drivers/infiniband/hw/hfi1/trace_ibhdrs.h | 160 +++++++++++++++++++----------- 2 files changed, 130 insertions(+), 70 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c index 332b9b7c554a..7c8aed0ffc07 100644 --- a/drivers/infiniband/hw/hfi1/trace.c +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -63,13 +63,20 @@ static u8 __get_ib_hdr_len(struct ib_header *hdr) static u8 __get_16b_hdr_len(struct hfi1_16b_header *hdr) { - struct ib_other_headers *ohdr; + struct ib_other_headers *ohdr = NULL; u8 opcode; + u8 l4 = hfi1_16B_get_l4(hdr); + + if (l4 == OPA_16B_L4_FM) { + opcode = IB_OPCODE_UD_SEND_ONLY; + return (8 + 8); /* No BTH */ + } - if (hfi1_16B_get_l4(hdr) == OPA_16B_L4_IB_LOCAL) + if (l4 == OPA_16B_L4_IB_LOCAL) ohdr = &hdr->u.oth; else ohdr = &hdr->u.l.oth; + opcode = ib_bth_get_opcode(ohdr); return hdr_len_by_opcode[opcode] == 0 ? 
0 : hdr_len_by_opcode[opcode] - (12 + 8 + 8); @@ -234,17 +241,24 @@ const char *hfi1_trace_fmt_lrh(struct trace_seq *p, bool bypass, #define BTH_16B_PRN \ "op:0x%.2x,%s se:%d m:%d pad:%d tver:%d " \ "qpn:0x%.6x a:%d psn:0x%.8x" -const char *hfi1_trace_fmt_bth(struct trace_seq *p, bool bypass, - u8 ack, bool becn, bool fecn, u8 mig, - u8 se, u8 pad, u8 opcode, const char *opname, - u8 tver, u16 pkey, u32 psn, u32 qpn) +#define L4_FM_16B_PRN \ + "op:0x%.2x,%s dest_qpn:0x%.6x src_qpn:0x%.6x" +const char *hfi1_trace_fmt_rest(struct trace_seq *p, bool bypass, u8 l4, + u8 ack, bool becn, bool fecn, u8 mig, + u8 se, u8 pad, u8 opcode, const char *opname, + u8 tver, u16 pkey, u32 psn, u32 qpn, + u32 dest_qpn, u32 src_qpn) { const char *ret = trace_seq_buffer_ptr(p); if (bypass) - trace_seq_printf(p, BTH_16B_PRN, - opcode, opname, - se, mig, pad, tver, qpn, ack, psn); + if (l4 == OPA_16B_L4_FM) + trace_seq_printf(p, L4_FM_16B_PRN, + opcode, opname, dest_qpn, src_qpn); + else + trace_seq_printf(p, BTH_16B_PRN, + opcode, opname, + se, mig, pad, tver, qpn, ack, psn); else trace_seq_printf(p, BTH_9B_PRN, @@ -258,12 +272,17 @@ const char *hfi1_trace_fmt_bth(struct trace_seq *p, bool bypass, const char *parse_everbs_hdrs( struct trace_seq *p, - u8 opcode, + u8 opcode, u8 l4, u32 dest_qpn, u32 src_qpn, void *ehdrs) { union ib_ehdrs *eh = ehdrs; const char *ret = trace_seq_buffer_ptr(p); + if (l4 == OPA_16B_L4_FM) { + trace_seq_printf(p, "mgmt pkt"); + goto out; + } + switch (opcode) { /* imm */ case OP(RC, SEND_LAST_WITH_IMMEDIATE): @@ -334,6 +353,7 @@ const char *parse_everbs_hdrs( be32_to_cpu(eh->ieth)); break; } +out: trace_seq_putc(p, 0); return ret; } diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h index 2847626d3819..1dc2c28fc96e 100644 --- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h +++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h @@ -96,7 +96,9 @@ __print_symbolic(opcode, \ ib_opcode_name(CNP)) u8 ibhdr_exhdr_len(struct ib_header *hdr); -const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs); +const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, + u8 l4, u32 dest_qpn, u32 src_qpn, + void *ehdrs); u8 hfi1_trace_opa_hdr_len(struct hfi1_opa_header *opah); u8 hfi1_trace_packet_hdr_len(struct hfi1_packet *packet); const char *hfi1_trace_get_packet_l4_str(u8 l4); @@ -123,14 +125,16 @@ const char *hfi1_trace_fmt_lrh(struct trace_seq *p, bool bypass, u8 rc, u8 sc, u8 sl, u16 entropy, u16 len, u16 pkey, u32 dlid, u32 slid); -const char *hfi1_trace_fmt_bth(struct trace_seq *p, bool bypass, - u8 ack, bool becn, bool fecn, u8 mig, - u8 se, u8 pad, u8 opcode, const char *opname, - u8 tver, u16 pkey, u32 psn, u32 qpn); +const char *hfi1_trace_fmt_rest(struct trace_seq *p, bool bypass, u8 l4, + u8 ack, bool becn, bool fecn, u8 mig, + u8 se, u8 pad, u8 opcode, const char *opname, + u8 tver, u16 pkey, u32 psn, u32 qpn, + u32 dest_qpn, u32 src_qpn); const char *hfi1_trace_get_packet_l2_str(u8 l2); -#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs) +#define __parse_ib_ehdrs(op, l4, dest_qpn, src_qpn, ehdrs) \ + parse_everbs_hdrs(p, op, l4, dest_qpn, src_qpn, ehdrs) #define lrh_name(lrh) { HFI1_##lrh, #lrh } #define show_lnh(lrh) \ @@ -169,6 +173,8 @@ DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, __field(u32, psn) __field(u32, qpn) __field(u32, slid) + __field(u32, dest_qpn) + __field(u32, src_qpn) /* extended headers */ __dynamic_array(u8, ehdrs, hfi1_trace_packet_hdr_len(packet)) @@ -178,6 +184,8 @@ 
DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, __entry->etype = packet->etype; __entry->l2 = hfi1_16B_get_l2(packet->hdr); + __entry->dest_qpn = 0; + __entry->src_qpn = 0; if (__entry->etype == RHF_RCV_TYPE_BYPASS) { hfi1_trace_parse_16b_hdr(packet->hdr, &__entry->age, @@ -192,16 +200,23 @@ DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, &__entry->dlid, &__entry->slid); - hfi1_trace_parse_16b_bth(packet->ohdr, - &__entry->ack, - &__entry->mig, - &__entry->opcode, - &__entry->pad, - &__entry->se, - &__entry->tver, - &__entry->psn, - &__entry->qpn); + if (__entry->l4 == OPA_16B_L4_FM) { + __entry->opcode = IB_OPCODE_UD_SEND_ONLY; + __entry->dest_qpn = hfi1_16B_get_dest_qpn(packet->mgmt); + __entry->src_qpn = hfi1_16B_get_src_qpn(packet->mgmt); + } else { + hfi1_trace_parse_16b_bth(packet->ohdr, + &__entry->ack, + &__entry->mig, + &__entry->opcode, + &__entry->pad, + &__entry->se, + &__entry->tver, + &__entry->psn, + &__entry->qpn); + } } else { + __entry->l4 = OPA_16B_L4_9B; hfi1_trace_parse_9b_hdr(packet->hdr, sc5, &__entry->lnh, &__entry->lver, @@ -223,8 +238,9 @@ DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, &__entry->pkey, &__entry->psn, &__entry->qpn); - } - /* extended headers */ + } + /* extended headers */ + if (__entry->l4 != OPA_16B_L4_FM) memcpy(__get_dynamic_array(ehdrs), &packet->ohdr->u, __get_dynamic_array_len(ehdrs)); @@ -253,25 +269,31 @@ DECLARE_EVENT_CLASS(hfi1_input_ibhdr_template, __entry->pkey, __entry->dlid, __entry->slid), - hfi1_trace_fmt_bth(p, - __entry->etype == + hfi1_trace_fmt_rest(p, + __entry->etype == RHF_RCV_TYPE_BYPASS, - __entry->ack, - __entry->becn, - __entry->fecn, - __entry->mig, - __entry->se, - __entry->pad, - __entry->opcode, - show_ib_opcode(__entry->opcode), - __entry->tver, - __entry->pkey, - __entry->psn, - __entry->qpn), + __entry->l4, + __entry->ack, + __entry->becn, + __entry->fecn, + __entry->mig, + __entry->se, + __entry->pad, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->tver, + __entry->pkey, + __entry->psn, + __entry->qpn, + __entry->dest_qpn, + __entry->src_qpn), /* extended headers */ __get_dynamic_array_len(ehdrs), __parse_ib_ehdrs( __entry->opcode, + __entry->l4, + __entry->dest_qpn, + __entry->src_qpn, (void *)__get_dynamic_array(ehdrs)) ) ); @@ -310,6 +332,8 @@ DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, __field(u32, psn) __field(u32, qpn) __field(u32, slid) + __field(u32, dest_qpn) + __field(u32, src_qpn) /* extended headers */ __dynamic_array(u8, ehdrs, hfi1_trace_opa_hdr_len(opah)) @@ -320,6 +344,8 @@ DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, DD_DEV_ASSIGN(dd); __entry->hdr_type = opah->hdr_type; + __entry->dest_qpn = 0; + __entry->src_qpn = 0; if (__entry->hdr_type) { hfi1_trace_parse_16b_hdr(&opah->opah, &__entry->age, @@ -334,19 +360,26 @@ DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, &__entry->dlid, &__entry->slid); - if (__entry->l4 == OPA_16B_L4_IB_LOCAL) - ohdr = &opah->opah.u.oth; - else - ohdr = &opah->opah.u.l.oth; - hfi1_trace_parse_16b_bth(ohdr, - &__entry->ack, - &__entry->mig, - &__entry->opcode, - &__entry->pad, - &__entry->se, - &__entry->tver, - &__entry->psn, - &__entry->qpn); + if (__entry->l4 == OPA_16B_L4_FM) { + ohdr = NULL; + __entry->opcode = IB_OPCODE_UD_SEND_ONLY; + __entry->dest_qpn = hfi1_16B_get_dest_qpn(&opah->opah.u.mgmt); + __entry->src_qpn = hfi1_16B_get_src_qpn(&opah->opah.u.mgmt); + } else { + if (__entry->l4 == OPA_16B_L4_IB_LOCAL) + ohdr = &opah->opah.u.oth; + else + ohdr = &opah->opah.u.l.oth; + hfi1_trace_parse_16b_bth(ohdr, + &__entry->ack, + &__entry->mig, + 
&__entry->opcode, + &__entry->pad, + &__entry->se, + &__entry->tver, + &__entry->psn, + &__entry->qpn); + } } else { __entry->l4 = OPA_16B_L4_9B; hfi1_trace_parse_9b_hdr(&opah->ibh, sc5, @@ -376,8 +409,9 @@ DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, } /* extended headers */ - memcpy(__get_dynamic_array(ehdrs), - &ohdr->u, __get_dynamic_array_len(ehdrs)); + if (__entry->l4 != OPA_16B_L4_FM) + memcpy(__get_dynamic_array(ehdrs), + &ohdr->u, __get_dynamic_array_len(ehdrs)); ), TP_printk("[%s] (%s) %s %s hlen:%d %s", __get_str(dev), @@ -399,24 +433,30 @@ DECLARE_EVENT_CLASS(hfi1_output_ibhdr_template, __entry->pkey, __entry->dlid, __entry->slid), - hfi1_trace_fmt_bth(p, - !!__entry->hdr_type, - __entry->ack, - __entry->becn, - __entry->fecn, - __entry->mig, - __entry->se, - __entry->pad, - __entry->opcode, - show_ib_opcode(__entry->opcode), - __entry->tver, - __entry->pkey, - __entry->psn, - __entry->qpn), + hfi1_trace_fmt_rest(p, + !!__entry->hdr_type, + __entry->l4, + __entry->ack, + __entry->becn, + __entry->fecn, + __entry->mig, + __entry->se, + __entry->pad, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->tver, + __entry->pkey, + __entry->psn, + __entry->qpn, + __entry->dest_qpn, + __entry->src_qpn), /* extended headers */ __get_dynamic_array_len(ehdrs), __parse_ib_ehdrs( __entry->opcode, + __entry->l4, + __entry->dest_qpn, + __entry->src_qpn, (void *)__get_dynamic_array(ehdrs)) ) ); -- cgit v1.2.3 From c8314811f9b2068eb53728d7a06b1ea195579e79 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 15 May 2018 18:31:09 -0700 Subject: IB/hfi1: Cleanup of exp_rcv The knowledge of the internal workings of the expect receive is too distributed. Fix by: - right size several rcd fields associated with expect receive - making an init entrance to init all the lists - consolidate all the allocations into an array anchored in the rcd Reviewed-by: Michael J. 
Ruhl Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/exp_rcv.c | 39 ++++++++++++++++++++++-------------- drivers/infiniband/hw/hfi1/exp_rcv.h | 24 +++++++++++++++++++++- drivers/infiniband/hw/hfi1/hfi.h | 14 +++++++------ drivers/infiniband/hw/hfi1/init.c | 4 +--- 4 files changed, 56 insertions(+), 25 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.c b/drivers/infiniband/hw/hfi1/exp_rcv.c index 0af91675acc6..1be49a0d9c11 100644 --- a/drivers/infiniband/hw/hfi1/exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/exp_rcv.c @@ -52,12 +52,23 @@ * exp_tid_group_init - initialize exp_tid_set * @set - the set */ -void hfi1_exp_tid_group_init(struct exp_tid_set *set) +static void hfi1_exp_tid_set_init(struct exp_tid_set *set) { INIT_LIST_HEAD(&set->list); set->count = 0; } +/** + * hfi1_exp_tid_group_init - initialize rcd expected receive + * @rcd - the rcd + */ +void hfi1_exp_tid_group_init(struct hfi1_ctxtdata *rcd) +{ + hfi1_exp_tid_set_init(&rcd->tid_group_list); + hfi1_exp_tid_set_init(&rcd->tid_used_list); + hfi1_exp_tid_set_init(&rcd->tid_full_list); +} + /** * alloc_ctxt_rcv_groups - initialize expected receive groups * @rcd - the context to add the groupings to @@ -68,13 +79,17 @@ int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) u32 tidbase; struct tid_group *grp; int i; + u32 ngroups; + ngroups = rcd->expected_count / dd->rcv_entries.group_size; + rcd->groups = + kcalloc_node(ngroups, sizeof(*rcd->groups), + GFP_KERNEL, rcd->numa_id); + if (!rcd->groups) + return -ENOMEM; tidbase = rcd->expected_base; - for (i = 0; i < rcd->expected_count / - dd->rcv_entries.group_size; i++) { - grp = kzalloc(sizeof(*grp), GFP_KERNEL); - if (!grp) - goto bail; + for (i = 0; i < ngroups; i++) { + grp = &rcd->groups[i]; grp->size = dd->rcv_entries.group_size; grp->base = tidbase; tid_group_add_tail(grp, &rcd->tid_group_list); @@ -82,9 +97,6 @@ int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) } return 0; -bail: - hfi1_free_ctxt_rcv_groups(rcd); - return -ENOMEM; } /** @@ -100,15 +112,12 @@ bail: */ void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd) { - struct tid_group *grp, *gptr; - WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_full_list)); WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_used_list)); - list_for_each_entry_safe(grp, gptr, &rcd->tid_group_list.list, list) { - tid_group_remove(grp, &rcd->tid_group_list); - kfree(grp); - } + kfree(rcd->groups); + rcd->groups = NULL; + hfi1_exp_tid_group_init(rcd); hfi1_clear_tids(rcd); } diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.h b/drivers/infiniband/hw/hfi1/exp_rcv.h index 08719047628a..f25362015095 100644 --- a/drivers/infiniband/hw/hfi1/exp_rcv.h +++ b/drivers/infiniband/hw/hfi1/exp_rcv.h @@ -183,8 +183,30 @@ static inline u32 rcventry2tidinfo(u32 rcventry) EXP_TID_SET(CTRL, 1 << (rcventry - pair)); } +/** + * hfi1_tid_group_to_idx - convert an index to a group + * @rcd - the receive context + * @grp - the group pointer + */ +static inline u16 +hfi1_tid_group_to_idx(struct hfi1_ctxtdata *rcd, struct tid_group *grp) +{ + return grp - &rcd->groups[0]; +} + +/** + * hfi1_idx_to_tid_group - convert a group to an index + * @rcd - the receive context + * @idx - the index + */ +static inline struct tid_group * +hfi1_idx_to_tid_group(struct hfi1_ctxtdata *rcd, u16 idx) +{ + return &rcd->groups[idx]; +} + int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd); void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd); -void 
hfi1_exp_tid_group_init(struct exp_tid_set *set); +void hfi1_exp_tid_group_init(struct hfi1_ctxtdata *rcd); #endif /* _HFI1_EXP_RCV_H */ diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index f49cd80df557..5eb3bf0849c7 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -231,13 +231,15 @@ struct hfi1_ctxtdata { /* job key */ u16 jkey; /* number of RcvArray groups for this context. */ - u32 rcv_array_groups; + u16 rcv_array_groups; /* index of first eager TID entry. */ - u32 eager_base; + u16 eager_base; /* number of expected TID entries */ - u32 expected_count; + u16 expected_count; /* index of first expected TID entry. */ - u32 expected_base; + u16 expected_base; + /* array of tid_groups */ + struct tid_group *groups; struct exp_tid_set tid_group_list; struct exp_tid_set tid_used_list; @@ -282,7 +284,7 @@ struct hfi1_ctxtdata { /* interrupt handling */ u64 imask; /* clear interrupt mask */ int ireg; /* clear interrupt register */ - unsigned numa_id; /* numa node of this context */ + int numa_id; /* numa node of this context */ /* verbs rx_stats per rcd */ struct hfi1_opcode_stats_perctx *opstats; @@ -909,9 +911,9 @@ typedef void (*hfi1_make_req)(struct rvt_qp *qp, #define RHF_RCV_REPROCESS 2 /* stop. retain this packet */ struct rcv_array_data { - u8 group_size; u16 ngroups; u16 nctxt_extra; + u8 group_size; }; struct per_vl_data { diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 5d1adfc450d3..3feecf926322 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -361,9 +361,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, } INIT_LIST_HEAD(&rcd->qp_wait_list); - hfi1_exp_tid_group_init(&rcd->tid_group_list); - hfi1_exp_tid_group_init(&rcd->tid_used_list); - hfi1_exp_tid_group_init(&rcd->tid_full_list); + hfi1_exp_tid_group_init(rcd); rcd->ppd = ppd; rcd->dd = dd; __set_bit(0, rcd->in_use_ctxts); -- cgit v1.2.3 From 5938d94cf0380bca9e6fbb910c010f4483906c11 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Tue, 15 May 2018 18:31:17 -0700 Subject: IB/hfi1: Set port number for errorinfo MAD response For errorinfo MAD requests, the response has a 0 port number left over from a memset. Instead we should always set the port number in the response. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 983b5794a660..0307405491e0 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -3424,6 +3424,7 @@ static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; return reply((struct ib_mad_hdr *)pmp); } + rsp->port_number = port; /* PortRcvErrorInfo */ rsp->port_rcv_ei.status_and_code = -- cgit v1.2.3 From 3ce459cd684b7f18ca79838e62310ffdc930920b Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 15 May 2018 18:31:24 -0700 Subject: IB/{rdmavt,hfi1}: Change hrtimer add to use pinned version Given we are dealing with nano-second level timers, when the timer pops, ensure it happens on the CPU which caused the timer to be set in the first place. This avoids excessive jitter from the desired expiration time by avoiding the cost of switching our context to another CPU that is cache cold for this given timer. 
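For context, the two arming modes differ only in where the expiry callback may run; a generic fragment (not taken from the patch, variable names are placeholders) contrasting them:

/* May be migrated off the arming CPU, e.g. while that CPU is idle: */
hrtimer_start(&timer, ns_to_ktime(nsec), HRTIMER_MODE_REL);

/* Always expires on the CPU that armed it, so the data the handler
 * touches is still cache-hot there: */
hrtimer_start(&timer, ns_to_ktime(nsec), HRTIMER_MODE_REL_PINNED);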
Reviewed-by: Kaike Wan Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 2 +- drivers/infiniband/sw/rdmavt/qp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 79ee2b9e28c6..1a1a47ac53c6 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -2012,7 +2012,7 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn, unsigned long nsec = 1024 * ccti_timer; hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec), - HRTIMER_MODE_REL); + HRTIMER_MODE_REL_PINNED); } spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 6e9a351f45fb..40046135c509 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2225,7 +2225,7 @@ void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth) to = rvt_aeth_to_usec(aeth); trace_rvt_rnrnak_add(qp, to); hrtimer_start(&qp->s_rnr_timer, - ns_to_ktime(1000 * to), HRTIMER_MODE_REL); + ns_to_ktime(1000 * to), HRTIMER_MODE_REL_PINNED); } EXPORT_SYMBOL(rvt_add_rnr_timer); -- cgit v1.2.3 From 763b69654bfb88ea3230d015e7d755ee8339f8ee Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Tue, 15 May 2018 18:31:39 -0700 Subject: IB/isert: Fix for lib/dma_debug check_sync warning The following error message occurs on a target host in a debug build during session login: [ 3524.411874] WARNING: CPU: 5 PID: 12063 at lib/dma-debug.c:1207 check_sync+0x4ec/0x5b0 [ 3524.421057] infiniband hfi1_0: DMA-API: device driver tries to sync DMA memory it has not allocated [device address=0x0000000000000000] [size=76 bytes] ......snip ..... [ 3524.535846] CPU: 5 PID: 12063 Comm: iscsi_np Kdump: loaded Not tainted 3.10.0-862.el7.x86_64.debug #1 [ 3524.546764] Hardware name: Dell Inc. PowerEdge R430/03XKDV, BIOS 1.2.6 06/08/2015 [ 3524.555740] Call Trace: [ 3524.559102] [] dump_stack+0x19/0x1b [ 3524.565477] [] __warn+0xd8/0x100 [ 3524.571557] [] warn_slowpath_fmt+0x5f/0x80 [ 3524.578610] [] check_sync+0x4ec/0x5b0 [ 3524.585177] [] ? set_cpus_allowed_ptr+0x5f/0x1c0 [ 3524.592812] [] debug_dma_sync_single_for_cpu+0x80/0x90 [ 3524.601029] [] ? x2apic_send_IPI_mask+0x13/0x20 [ 3524.608574] [] ? native_smp_send_reschedule+0x5b/0x80 [ 3524.616699] [] ? resched_curr+0xf6/0x140 [ 3524.623567] [] isert_create_send_desc.isra.26+0xe0/0x110 [ib_isert] [ 3524.633060] [] isert_put_login_tx+0x55/0x8b0 [ib_isert] [ 3524.641383] [] ? try_to_wake_up+0x1a4/0x430 [ 3524.648561] [] iscsi_target_do_tx_login_io+0xdd/0x230 [iscsi_target_mod] [ 3524.658557] [] iscsi_target_do_login+0x1a7/0x600 [iscsi_target_mod] [ 3524.668084] [] ? kstrdup+0x49/0x60 [ 3524.674420] [] iscsi_target_start_negotiation+0x56/0xc0 [iscsi_target_mod] [ 3524.684656] [] __iscsi_target_login_thread+0x90e/0x1070 [iscsi_target_mod] [ 3524.694901] [] ? __iscsi_target_login_thread+0x1070/0x1070 [iscsi_target_mod] [ 3524.705446] [] ? __iscsi_target_login_thread+0x1070/0x1070 [iscsi_target_mod] [ 3524.715976] [] iscsi_target_login_thread+0x28/0x60 [iscsi_target_mod] [ 3524.725739] [] kthread+0xef/0x100 [ 3524.732007] [] ? insert_kthread_work+0x80/0x80 [ 3524.739540] [] ret_from_fork_nospec_begin+0x21/0x21 [ 3524.747558] [] ? 
insert_kthread_work+0x80/0x80 [ 3524.755088] ---[ end trace 23f8bf9238bd1ed8 ]--- [ 3595.510822] iSCSI/iqn.1994-05.com.redhat:537fa56299: Unsupported SCSI Opcode 0xa3, sending CHECK_CONDITION. The code calls dma_sync on login_tx_desc->dma_addr prior to initializing it with dma-mapped address. login_tx_desc is a part of iser_conn structure and is used only once during login negotiation, so the issue is fixed by eliminating dma_sync call for this buffer using a special case routine. Cc: Reviewed-by: Mike Marciniszyn Reviewed-by: Don Dutile Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/isert/ib_isert.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index fff40b097947..6a55b870b863 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -886,15 +886,9 @@ isert_login_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_des } static void -isert_create_send_desc(struct isert_conn *isert_conn, - struct isert_cmd *isert_cmd, - struct iser_tx_desc *tx_desc) +__isert_create_send_desc(struct isert_device *device, + struct iser_tx_desc *tx_desc) { - struct isert_device *device = isert_conn->device; - struct ib_device *ib_dev = device->ib_device; - - ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr, - ISER_HEADERS_LEN, DMA_TO_DEVICE); memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl)); tx_desc->iser_header.flags = ISCSI_CTRL; @@ -907,6 +901,20 @@ isert_create_send_desc(struct isert_conn *isert_conn, } } +static void +isert_create_send_desc(struct isert_conn *isert_conn, + struct isert_cmd *isert_cmd, + struct iser_tx_desc *tx_desc) +{ + struct isert_device *device = isert_conn->device; + struct ib_device *ib_dev = device->ib_device; + + ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + + __isert_create_send_desc(device, tx_desc); +} + static int isert_init_tx_hdrs(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc) @@ -994,7 +1002,7 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, struct iser_tx_desc *tx_desc = &isert_conn->login_tx_desc; int ret; - isert_create_send_desc(isert_conn, NULL, tx_desc); + __isert_create_send_desc(device, tx_desc); memcpy(&tx_desc->iscsi_header, &login->rsp[0], sizeof(struct iscsi_hdr)); -- cgit v1.2.3 From 0252f73334f9ef68868e4684200bea3565a4fcee Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 18 May 2018 17:07:01 -0700 Subject: IB/qib: Fix DMA api warning with debug kernel The following error occurs in a debug build when running MPI PSM: [ 307.415911] WARNING: CPU: 4 PID: 23867 at lib/dma-debug.c:1158 check_unmap+0x4ee/0xa20 [ 307.455661] ib_qib 0000:05:00.0: DMA-API: device driver failed to check map error[device address=0x00000000df82b000] [size=4096 bytes] [mapped as page] [ 307.517494] Modules linked in: [ 307.531584] ib_isert iscsi_target_mod ib_srpt target_core_mod rpcrdma sunrpc ib_srp scsi_transport_srp scsi_tgt ib_iser libiscsi ib_ipoib scsi_transport_iscsi rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ib_qib intel_powerclamp coretemp rdmavt intel_rapl iosf_mbi kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel ipmi_ssif ib_core aesni_intel sg ipmi_si lrw gf128mul dca glue_helper ipmi_devintf iTCO_wdt gpio_ich hpwdt iTCO_vendor_support ablk_helper hpilo acpi_power_meter cryptd 
ipmi_msghandler ie31200_edac shpchp pcc_cpufreq lpc_ich pcspkr ip_tables xfs libcrc32c sd_mod crc_t10dif crct10dif_generic mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm ahci crct10dif_pclmul crct10dif_common drm crc32c_intel libahci tg3 libata serio_raw ptp i2c_core [ 307.846113] pps_core dm_mirror dm_region_hash dm_log dm_mod [ 307.866505] CPU: 4 PID: 23867 Comm: mpitests-IMB-MP Kdump: loaded Not tainted 3.10.0-862.el7.x86_64.debug #1 [ 307.911178] Hardware name: HP ProLiant DL320e Gen8, BIOS J05 11/09/2013 [ 307.944206] Call Trace: [ 307.956973] [] dump_stack+0x19/0x1b [ 307.982201] [] __warn+0xd8/0x100 [ 308.005999] [] warn_slowpath_fmt+0x5f/0x80 [ 308.034260] [] check_unmap+0x4ee/0xa20 [ 308.060801] [] ? page_add_file_rmap+0x2a/0x1d0 [ 308.090689] [] debug_dma_unmap_page+0x9d/0xb0 [ 308.120155] [] ? might_fault+0xa0/0xb0 [ 308.146656] [] qib_tid_free.isra.14+0x215/0x2a0 [ib_qib] [ 308.180739] [] qib_write+0x894/0x1280 [ib_qib] [ 308.210733] [] ? __inode_security_revalidate+0x70/0x80 [ 308.244837] [] ? security_file_permission+0x27/0xb0 [ 308.266025] qib_ib0.8006: multicast join failed for ff12:401b:8006:0000:0000:0000:ffff:ffff, status -22 [ 308.323421] [] vfs_write+0xc3/0x1f0 [ 308.347077] [] ? fget_light+0xfc/0x510 [ 308.372533] [] SyS_write+0x8a/0x100 [ 308.396456] [] system_call_fastpath+0x1c/0x21 The code calls a qib_map_page() which has never correctly tested for a mapping error. Fix by testing for pci_dma_mapping_error() in all cases and properly handling the failure in the caller. Additionally, streamline qib_map_page() arguments to satisfy just the single caller. Cc: Reviewed-by: Alex Estrin Tested-by: Don Dutile Reviewed-by: Don Dutile Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qib/qib.h | 3 +-- drivers/infiniband/hw/qib/qib_file_ops.c | 10 +++++++--- drivers/infiniband/hw/qib/qib_user_pages.c | 20 ++++++++++++-------- 3 files changed, 20 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 43a68d7b51bb..3461df002f81 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1424,8 +1424,7 @@ u64 qib_sps_ints(void); /* * dma_addr wrappers - all 0's invalid for hw */ -dma_addr_t qib_map_page(struct pci_dev *, struct page *, unsigned long, - size_t, int); +int qib_map_page(struct pci_dev *d, struct page *p, dma_addr_t *daddr); struct pci_dev *qib_get_pci_dev(struct rvt_dev_info *rdi); /* diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index bbb720bfd030..98e1ce14fa2a 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -364,6 +364,8 @@ static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp, goto done; } for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) { + dma_addr_t daddr; + for (; ntids--; tid++) { if (tid == tidcnt) tid = 0; @@ -380,12 +382,14 @@ static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp, ret = -ENOMEM; break; } + ret = qib_map_page(dd->pcidev, pagep[i], &daddr); + if (ret) + break; + tidlist[i] = tid + tidoff; /* we "know" system pages and TID pages are same size */ dd->pageshadow[ctxttid + tid] = pagep[i]; - dd->physshadow[ctxttid + tid] = - qib_map_page(dd->pcidev, pagep[i], 0, PAGE_SIZE, - PCI_DMA_FROMDEVICE); + dd->physshadow[ctxttid + tid] = daddr; /* * don't need atomic or it's overhead */ diff --git 
a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index ce83ba9a12ef..16543d5e80c3 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -99,23 +99,27 @@ bail: * * I'm sure we won't be so lucky with other iommu's, so FIXME. */ -dma_addr_t qib_map_page(struct pci_dev *hwdev, struct page *page, - unsigned long offset, size_t size, int direction) +int qib_map_page(struct pci_dev *hwdev, struct page *page, dma_addr_t *daddr) { dma_addr_t phys; - phys = pci_map_page(hwdev, page, offset, size, direction); + phys = pci_map_page(hwdev, page, 0, PAGE_SIZE, PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(hwdev, phys)) + return -ENOMEM; - if (phys == 0) { - pci_unmap_page(hwdev, phys, size, direction); - phys = pci_map_page(hwdev, page, offset, size, direction); + if (!phys) { + pci_unmap_page(hwdev, phys, PAGE_SIZE, PCI_DMA_FROMDEVICE); + phys = pci_map_page(hwdev, page, 0, PAGE_SIZE, + PCI_DMA_FROMDEVICE); + if (pci_dma_mapping_error(hwdev, phys)) + return -ENOMEM; /* * FIXME: If we get 0 again, we should keep this page, * map another, then free the 0 page. */ } - - return phys; + *daddr = phys; + return 0; } /** -- cgit v1.2.3 From cc3391cb5356edad235555e5930723cb4c0ac9af Mon Sep 17 00:00:00 2001 From: oulijun Date: Tue, 22 May 2018 20:47:16 +0800 Subject: RDMA/hns: Rename the idx field of db The lower 15 bit of paramter of db structure means different meanings when db type is sq, rq and srq. Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++-- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index e0ab672e1c0a..a25c3daaff20 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -480,8 +480,8 @@ out: V2_DB_BYTE_4_TAG_S, qp->doorbell_qpn); roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_CMD_M, V2_DB_BYTE_4_CMD_S, HNS_ROCE_V2_SQ_DB); - roce_set_field(sq_db.parameter, V2_DB_PARAMETER_CONS_IDX_M, - V2_DB_PARAMETER_CONS_IDX_S, + roce_set_field(sq_db.parameter, V2_DB_PARAMETER_IDX_M, + V2_DB_PARAMETER_IDX_S, qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1)); roce_set_field(sq_db.parameter, V2_DB_PARAMETER_SL_M, V2_DB_PARAMETER_SL_S, qp->sl); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 182b6726f783..983c0be2afd0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -897,8 +897,8 @@ struct hns_roce_v2_mpt_entry { #define V2_DB_BYTE_4_CMD_S 24 #define V2_DB_BYTE_4_CMD_M GENMASK(27, 24) -#define V2_DB_PARAMETER_CONS_IDX_S 0 -#define V2_DB_PARAMETER_CONS_IDX_M GENMASK(15, 0) +#define V2_DB_PARAMETER_IDX_S 0 +#define V2_DB_PARAMETER_IDX_M GENMASK(15, 0) #define V2_DB_PARAMETER_SL_S 16 #define V2_DB_PARAMETER_SL_M GENMASK(18, 16) -- cgit v1.2.3 From 8f06228733582bcaba5533a5dcae7e017bd18317 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 22 May 2018 08:31:03 +0300 Subject: RDMA/mlx5: Remove debug prints of VMA pointers Remove various prints of VMA pointers. 
Reported-by: Michal Kalderon Signed-off-by: Leon Romanovsky Reviewed-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index ab8cd5c034a2..f3e7d7cd3d87 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2059,10 +2059,6 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, if (err) return err; - mlx5_ib_dbg(dev, "mapped clock info at 0x%lx, PA 0x%llx\n", - vma->vm_start, - (unsigned long long)pfn << PAGE_SHIFT); - return mlx5_ib_set_vma_data(vma, context); } @@ -2157,15 +2153,14 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, err = io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, vma->vm_page_prot); if (err) { - mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n", - err, vma->vm_start, &pfn, mmap_cmd2str(cmd)); + mlx5_ib_err(dev, + "io_remap_pfn_range failed with error=%d, mmap_cmd=%s\n", + err, mmap_cmd2str(cmd)); err = -EAGAIN; goto err; } pa = pfn << PAGE_SHIFT; - mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd), - vma->vm_start, &pa); err = mlx5_ib_set_vma_data(vma, context); if (err) @@ -2251,10 +2246,6 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (io_remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; - - mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n", - vma->vm_start, - (unsigned long long)pfn << PAGE_SHIFT); break; case MLX5_IB_MMAP_CLOCK_INFO: return mlx5_ib_mmap_clock_info_page(dev, vma, context); -- cgit v1.2.3 From 9906224f60d4438df4b5f300270dbbe56a102dfc Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 22 May 2018 08:34:09 +0300 Subject: IB/core: Remove duplicate declaration of gid_cache_wq Remove duplicate declaration of gid_cache_wq. Fixes: d41861942 ("IB/core: Add generic function to extract IB speed from netdev") Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/roce_gid_mgmt.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index c0e4fd55e2cc..a4fbdc5d28fa 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -44,8 +44,6 @@ static struct workqueue_struct *gid_cache_wq; -static struct workqueue_struct *gid_cache_wq; - enum gid_op_type { GID_DEL = 0, GID_ADD -- cgit v1.2.3 From 7a8690ed6f5346f6738971892205e91d39b6b901 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 23 May 2018 08:22:11 +0300 Subject: RDMA/ucm: Mark UCM interface as BROKEN In commit 357d23c811a7 ("Remove the obsolete libibcm library") in rdma-core [1], we removed obsolete library which used the /dev/infiniband/ucmX interface. Following multiple syzkaller reports about non-sanitized user input in the UCMA module, the short audit reveals the same issues in UCM module too. It is better to disable this interface in the kernel, before syzkaller team invests time and energy to harden this unused interface. 
[1] https://github.com/linux-rdma/rdma-core/pull/279 Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/Kconfig | 11 +++++++++++ drivers/infiniband/core/Makefile | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 2a972ed6851b..b03af54367c0 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -35,6 +35,17 @@ config INFINIBAND_USER_ACCESS libibverbs, libibcm and a hardware driver library from rdma-core . +config INFINIBAND_USER_ACCESS_UCM + bool "Userspace CM (UCM, DEPRECATED)" + depends on BROKEN + depends on INFINIBAND_USER_ACCESS + help + The UCM module has known security flaws, which no one is + interested to fix. The user-space part of this code was + dropped from the upstream a long time ago. + + This option is DEPRECATED and planned to be removed. + config INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI bool "Allow experimental legacy verbs in new ioctl uAPI (EXPERIMENTAL)" depends on INFINIBAND_USER_ACCESS diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 1cfedc469b23..8d42373a2d8a 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -5,8 +5,8 @@ user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o obj-$(CONFIG_INFINIBAND) += ib_core.o ib_cm.o iw_cm.o \ $(infiniband-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o -obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ - $(user_access-y) +obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y) +obj-$(CONFIG_INFINIBAND_USER_ACCESS_UCM) += ib_ucm.o $(user_access-y) ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ -- cgit v1.2.3 From 7b74a83cf54a3747e22c57e25712bd70eef8acee Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Mon, 21 May 2018 11:41:01 +0300 Subject: IB/mlx5: Fetch soft WQE's on fatal error state On fatal error the driver simulates CQE's for ULPs that rely on completion of all their posted work-request. For the GSI traffic, the mlx5 has its own mechanism that sends the completions via software CQE's directly to the relevant CQ. This should be kept in fatal error too, so the driver should simulate such CQE's with the specified error state in order to complete GSI QP work requests. 
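As a rough sketch of the consumer side (illustrative only, not part of this patch; the function below is hypothetical), a ULP that drains its GSI CQ after the fatal event now observes the simulated completions with flush status instead of waiting on them forever:

static void drain_gsi_cq_after_fatal(struct ib_cq *cq)
{
	struct ib_wc wc;

	/* Poll until the CQ is empty; software-generated WQEs now complete too. */
	while (ib_poll_cq(cq, 1, &wc) > 0) {
		if (wc.status == IB_WC_WR_FLUSH_ERR)
			pr_debug("GSI wr_id %llu flushed on fatal error\n",
				 wc.wr_id);
		/* Release whatever resources were tied to wc.wr_id here. */
	}
}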
Without the fix the next deadlock might appears: schedule_timeout+0x274/0x350 wait_for_common+0xec/0x240 mcast_remove_one+0xd0/0x120 [ib_core] ib_unregister_device+0x12c/0x230 [ib_core] mlx5_ib_remove+0xc4/0x270 [mlx5_ib] mlx5_detach_device+0x184/0x1a0 [mlx5_core] mlx5_unload_one+0x308/0x340 [mlx5_core] mlx5_pci_err_detected+0x74/0xe0 [mlx5_core] Cc: # 4.7 Fixes: 89ea94a7b6c4 ("IB/mlx5: Reset flow support for IB kernel ULPs") Signed-off-by: Erez Shitrit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cq.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 77d257ec899b..9f6bc34cd4db 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -637,7 +637,7 @@ repoll: } static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, - struct ib_wc *wc) + struct ib_wc *wc, bool is_fatal_err) { struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device); struct mlx5_ib_wc *soft_wc, *next; @@ -650,6 +650,10 @@ static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries, mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n", cq->mcq.cqn); + if (unlikely(is_fatal_err)) { + soft_wc->wc.status = IB_WC_WR_FLUSH_ERR; + soft_wc->wc.vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR; + } wc[npolled++] = soft_wc->wc; list_del(&soft_wc->list); kfree(soft_wc); @@ -670,12 +674,17 @@ int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) spin_lock_irqsave(&cq->lock, flags); if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { - mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled); + /* make sure no soft wqe's are waiting */ + if (unlikely(!list_empty(&cq->wc_list))) + soft_polled = poll_soft_wc(cq, num_entries, wc, true); + + mlx5_ib_poll_sw_comp(cq, num_entries - soft_polled, + wc + soft_polled, &npolled); goto out; } if (unlikely(!list_empty(&cq->wc_list))) - soft_polled = poll_soft_wc(cq, num_entries, wc); + soft_polled = poll_soft_wc(cq, num_entries, wc, false); for (npolled = 0; npolled < num_entries - soft_polled; npolled++) { if (mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled)) -- cgit v1.2.3 From 25e62655c79395c596601a35805c3c7376d097b6 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 22 May 2018 20:33:45 +0300 Subject: IB/core: Reduce the places that use zgid Instead of open coding memcmp() to check whether a given GID is zero or not, use a helper function to do so, and replace instances of memcpy(z,&zgid) with memset. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 19 ++++++++++++++----- drivers/infiniband/hw/mlx4/main.c | 5 +++-- drivers/infiniband/hw/mlx4/qp.c | 2 +- include/rdma/ib_cache.h | 1 + 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 5f1a8333a45a..82699f70e9b6 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -125,6 +125,16 @@ const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) } EXPORT_SYMBOL(ib_cache_gid_type_str); +/** rdma_is_zero_gid - Check if given GID is zero or not. + * @gid: GID to check + * Returns true if given GID is zero, returns false otherwise. 
+ */ +bool rdma_is_zero_gid(const union ib_gid *gid) +{ + return !memcmp(gid, &zgid, sizeof(*gid)); +} +EXPORT_SYMBOL(rdma_is_zero_gid); + int ib_cache_gid_parse_type_str(const char *buf) { unsigned int i; @@ -231,7 +241,7 @@ static int add_modify_gid(struct ib_gid_table *table, * So ignore such behavior for IB link layer and don't * fail the call, but don't add such entry to GID cache. */ - if (!memcmp(gid, &zgid, sizeof(*gid))) + if (rdma_is_zero_gid(gid)) return 0; } @@ -264,7 +274,7 @@ static void del_gid(struct ib_device *ib_dev, u8 port, if (rdma_protocol_roce(ib_dev, port)) del_roce_gid(ib_dev, port, table, ix); - memcpy(&table->data_vec[ix].gid, &zgid, sizeof(zgid)); + memset(&table->data_vec[ix].gid, 0, sizeof(table->data_vec[ix].gid)); memset(&table->data_vec[ix].attr, 0, sizeof(table->data_vec[ix].attr)); table->data_vec[ix].context = NULL; } @@ -363,7 +373,7 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port, * IB spec version 1.3 section 4.1.1 point (6) and * section 12.7.10 and section 12.7.20 */ - if (!memcmp(gid, &zgid, sizeof(*gid))) + if (rdma_is_zero_gid(gid)) return -EINVAL; table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; @@ -724,8 +734,7 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, mutex_lock(&table->lock); for (i = 0; i < table->sz; ++i) { - if (memcmp(&table->data_vec[i].gid, &zgid, - sizeof(table->data_vec[i].gid))) { + if (!rdma_is_zero_gid(&table->data_vec[i].gid)) { del_gid(ib_dev, port, table, i); deleted = true; } diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 5b70744f414a..bf12394c13c1 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -276,7 +276,7 @@ static int mlx4_ib_add_gid(const union ib_gid *gid, found = i; break; } - if (free < 0 && !memcmp(&port_gid_table->gids[i].gid, &zgid, sizeof(*gid))) + if (free < 0 && rdma_is_zero_gid(&port_gid_table->gids[i].gid)) free = i; /* HW has space */ } @@ -345,7 +345,8 @@ static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context) if (!ctx->refcount) { unsigned int real_index = ctx->real_index; - memcpy(&port_gid_table->gids[real_index].gid, &zgid, sizeof(zgid)); + memset(&port_gid_table->gids[real_index].gid, 0, + sizeof(port_gid_table->gids[real_index].gid)); kfree(port_gid_table->gids[real_index].ctx); port_gid_table->gids[real_index].ctx = NULL; hw_update = 1; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 199648adac74..cd2c08c45334 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -3078,7 +3078,7 @@ static int fill_gid_by_hw_index(struct mlx4_ib_dev *ibdev, u8 port_num, memcpy(gid, &port_gid_table->gids[index].gid, sizeof(*gid)); *gid_type = port_gid_table->gids[index].gid_type; spin_unlock_irqrestore(&iboe->lock, flags); - if (!memcmp(gid, &zgid, sizeof(*gid))) + if (rdma_is_zero_gid(gid)) return -ENOENT; return 0; diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h index eb49cc8d1f95..a5f249828115 100644 --- a/include/rdma/ib_cache.h +++ b/include/rdma/ib_cache.h @@ -149,4 +149,5 @@ int ib_get_cached_port_state(struct ib_device *device, u8 port_num, enum ib_port_state *port_active); +bool rdma_is_zero_gid(const union ib_gid *gid); #endif /* _IB_CACHE_H */ -- cgit v1.2.3 From 724631a9c6e98e39480c70047cc3fe2d1f25e1fc Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 22 May 2018 20:33:46 +0300 Subject: IB/core: Introduce and use rdma_gid_table() There are several places a 
gid table is accessed. Have a helper tiny function rdma_gid_table() to avoid code duplication at such places. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 82699f70e9b6..0d11538c2a25 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -159,6 +159,11 @@ int ib_cache_gid_parse_type_str(const char *buf) } EXPORT_SYMBOL(ib_cache_gid_parse_type_str); +static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port) +{ + return device->cache.ports[port - rdma_start_port(device)].gid; +} + static void del_roce_gid(struct ib_device *device, u8 port_num, struct ib_gid_table *table, int ix) { @@ -376,7 +381,7 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port, if (rdma_is_zero_gid(gid)) return -EINVAL; - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); @@ -443,7 +448,7 @@ _ib_cache_gid_del(struct ib_device *ib_dev, u8 port, int ret = 0; int ix; - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); @@ -482,7 +487,7 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, int ix; bool deleted = false; - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + table = rdma_gid_table(ib_dev, port); mutex_lock(&table->lock); @@ -506,7 +511,7 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, { struct ib_gid_table *table; - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + table = rdma_gid_table(ib_dev, port); if (index < 0 || index >= table->sz) return -EINVAL; @@ -599,7 +604,7 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev, if (!rdma_is_port_valid(ib_dev, port)) return -ENOENT; - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + table = rdma_gid_table(ib_dev, port); if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; @@ -657,7 +662,7 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, !rdma_protocol_roce(ib_dev, port)) return -EPROTONOSUPPORT; - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + table = rdma_gid_table(ib_dev, port); read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { @@ -756,7 +761,7 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, unsigned int gid_type; unsigned long mask; - table = ib_dev->cache.ports[port - rdma_start_port(ib_dev)].gid; + table = rdma_gid_table(ib_dev, port); mask = GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_DEFAULT | @@ -877,7 +882,7 @@ int ib_get_cached_gid(struct ib_device *device, if (!rdma_is_port_valid(device, port_num)) return -EINVAL; - table = device->cache.ports[port_num - rdma_start_port(device)].gid; + table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr); read_unlock_irqrestore(&table->rwlock, flags); @@ -1095,7 +1100,7 @@ static int config_non_roce_gid_cache(struct ib_device *device, gid_attr.device = device; gid_attr.port_num = port; - table = device->cache.ports[port - rdma_start_port(device)].gid; + table = rdma_gid_table(device, port); mutex_lock(&table->lock); for (i = 0; i < gid_tbl_len; ++i) { 
@@ -1128,7 +1133,7 @@ static void ib_cache_update(struct ib_device *device, if (!rdma_is_port_valid(device, port)) return; - table = device->cache.ports[port - rdma_start_port(device)].gid; + table = rdma_gid_table(device, port); tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) -- cgit v1.2.3 From 5b6eb54f586ba0a6385f1523bce4c96cbdb79afd Mon Sep 17 00:00:00 2001 From: "Wei Hu(Xavier)" Date: Wed, 23 May 2018 18:16:27 +0800 Subject: RDMA/hns: Modify uar allocation algorithm to avoid bitmap exhaust This patch modified uar allocation algorithm in hns_roce_uar_alloc function to avoid bitmap exhaust. Signed-off-by: Wei Hu (Xavier) Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_pd.c | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 53c2f1b8d068..412297d4b86c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -214,6 +214,7 @@ enum { struct hns_roce_uar { u64 pfn; unsigned long index; + unsigned long logic_idx; }; struct hns_roce_ucontext { diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c index 4b41e041799c..b9f2c871ff9a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_pd.c +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -107,13 +107,15 @@ int hns_roce_uar_alloc(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar) int ret = 0; /* Using bitmap to manager UAR index */ - ret = hns_roce_bitmap_alloc(&hr_dev->uar_table.bitmap, &uar->index); + ret = hns_roce_bitmap_alloc(&hr_dev->uar_table.bitmap, &uar->logic_idx); if (ret == -1) return -ENOMEM; - if (uar->index > 0) - uar->index = (uar->index - 1) % + if (uar->logic_idx > 0 && hr_dev->caps.phy_num_uars > 1) + uar->index = (uar->logic_idx - 1) % (hr_dev->caps.phy_num_uars - 1) + 1; + else + uar->index = 0; if (!dev_is_pci(hr_dev->dev)) { res = platform_get_resource(hr_dev->pdev, IORESOURCE_MEM, 0); @@ -132,7 +134,7 @@ int hns_roce_uar_alloc(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar) void hns_roce_uar_free(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar) { - hns_roce_bitmap_free(&hr_dev->uar_table.bitmap, uar->index, + hns_roce_bitmap_free(&hr_dev->uar_table.bitmap, uar->logic_idx, BITMAP_NO_RR); } -- cgit v1.2.3 From d59fcacc4b089c9920ff4a148e33a3f3f7275ef6 Mon Sep 17 00:00:00 2001 From: "Wei Hu(Xavier)" Date: Wed, 23 May 2018 18:16:28 +0800 Subject: RDMA/hns: Increase checking CMQ status timeout value This patch increases checking CMQ status timeout value and uses the same value with NIC driver to avoid deficiency of time. 
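For context, a hedged sketch of how a CMQ TX timeout constant of this kind is typically consumed (simplified, not the driver's exact loop; csq_done() is a hypothetical helper). With one microsecond of delay per iteration, the constant is effectively a budget in microseconds, so raising it from 200 to 30000 gives the firmware roughly 30 ms instead of 200 us to answer:

static int wait_for_cmq_tx_done(struct hns_roce_dev *hr_dev)
{
	int timeout = 0;

	/* Spin until firmware consumes the descriptor or the budget runs out. */
	while (!csq_done(hr_dev)) {
		if (++timeout >= HNS_ROCE_CMQ_TX_TIMEOUT)
			return -EBUSY;
		udelay(1);
	}

	return 0;
}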
Signed-off-by: Wei Hu (Xavier) Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 983c0be2afd0..2caeb4cdad5c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -76,7 +76,7 @@ #define HNS_ROCE_V2_PAGE_SIZE_SUPPORTED 0xFFFFF000 #define HNS_ROCE_V2_MAX_INNER_MTPT_NUM 2 #define HNS_ROCE_INVALID_LKEY 0x100 -#define HNS_ROCE_CMQ_TX_TIMEOUT 200 +#define HNS_ROCE_CMQ_TX_TIMEOUT 30000 #define HNS_ROCE_CONTEXT_HOP_NUM 1 #define HNS_ROCE_MTT_HOP_NUM 1 -- cgit v1.2.3 From bb42f87e2924e3b118aed3b3790ce792ae7bb86c Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Mon, 28 May 2018 05:03:41 -0400 Subject: IB/rxe: avoid unnecessary export The function rxe_remove_all is only used in this modules. There is no other modules that call this function. So it is not necessary to export it. Signed-off-by: Zhu Yanjun Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 95e52b3ec757..59ec6d918ed4 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -623,7 +623,6 @@ void rxe_remove_all(void) } spin_unlock_bh(&dev_list_lock); } -EXPORT_SYMBOL(rxe_remove_all); static void rxe_port_event(struct rxe_dev *rxe, enum ib_event_type event) -- cgit v1.2.3 From 08bb558ac11ab944e0539e78619d7b4c356278bd Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 23 May 2018 15:30:30 +0300 Subject: IB/core: Make testing MR flags for writability a static inline function Make the MR writability flags check, which is performed in umem.c, a static inline function in file ib_verbs.h This allows the function to be used by low-level infiniband drivers. Cc: Signed-off-by: Jason Gunthorpe Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem.c | 11 +---------- include/rdma/ib_verbs.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 2b6c9b516070..d76455edd292 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -119,16 +119,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; - /* - * We ask for writable memory if any of the following - * access flags are set. "Local write" and "remote write" - * obviously require write access. "Remote atomic" can do - * things like fetch and add, which will modify memory, and - * "MW bind" can change permissions by binding a window. 
- */ - umem->writable = !!(access & - (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND)); + umem->writable = ib_access_writable(access); if (access & IB_ACCESS_ON_DEMAND) { ret = ib_umem_odp_get(context, umem, access); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9fc8a825aa28..20fa5c591e81 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3734,6 +3734,20 @@ static inline int ib_check_mr_access(int flags) return 0; } +static inline bool ib_access_writable(int access_flags) +{ + /* + * We have writable memory backing the MR if any of the following + * access flags are set. "Local write" and "remote write" obviously + * require write access. "Remote atomic" can do things like fetch and + * add, which will modify memory, and "MW bind" can change permissions + * by binding a window. + */ + return access_flags & + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND); +} + /** * ib_check_mr_status: lightweight check of MR status. * This routine may provide status checks on a selected -- cgit v1.2.3 From d8f9cc328c8888369880e2527e9186d745f2bbf6 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Wed, 23 May 2018 15:30:31 +0300 Subject: IB/mlx4: Mark user MR as writable if actual virtual memory is writable To allow rereg_user_mr to modify the MR from read-only to writable without using get_user_pages again, we needed to define the initial MR as writable. However, this was originally done unconditionally, without taking into account the writability of the underlying virtual memory. As a result, any attempt to register a read-only MR over read-only virtual memory failed. To fix this, do not add the writable flag bit when the user virtual memory is not writable (e.g. const memory). However, when the underlying memory is NOT writable (and we therefore do not define the initial MR as writable), the IB core adds a "force writable" flag to its user-pages request. If this succeeds, the reg_user_mr caller gets a writable copy of the original pages. If the user-space caller then does a rereg_user_mr operation to enable writability, this will succeed. This should not be allowed, since the original virtual memory was not writable. Cc: Fixes: 9376932d0c26 ("IB/mlx4_ib: Add support for user MR re-registration") Signed-off-by: Jason Gunthorpe Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx4/mr.c | 50 ++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 61d8b06375bb..ed1f253faf97 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -367,6 +367,40 @@ end: return block_shift; } +static struct ib_umem *mlx4_get_umem_mr(struct ib_ucontext *context, u64 start, + u64 length, u64 virt_addr, + int access_flags) +{ + /* + * Force registering the memory as writable if the underlying pages + * are writable. This is so rereg can change the access permissions + * from readable to writable without having to run through ib_umem_get + * again + */ + if (!ib_access_writable(access_flags)) { + struct vm_area_struct *vma; + + down_read(¤t->mm->mmap_sem); + /* + * FIXME: Ideally this would iterate over all the vmas that + * cover the memory, but for now it requires a single vma to + * entirely cover the MR to support RO mappings. 
+ */ + vma = find_vma(current->mm, start); + if (vma && vma->vm_end >= start + length && + vma->vm_start <= start) { + if (vma->vm_flags & VM_WRITE) + access_flags |= IB_ACCESS_LOCAL_WRITE; + } else { + access_flags |= IB_ACCESS_LOCAL_WRITE; + } + + up_read(¤t->mm->mmap_sem); + } + + return ib_umem_get(context, start, length, access_flags, 0); +} + struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata) @@ -381,10 +415,8 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); - /* Force registering the memory as writable. */ - /* Used for memory re-registeration. HCA protects the access */ - mr->umem = ib_umem_get(pd->uobject->context, start, length, - access_flags | IB_ACCESS_LOCAL_WRITE, 0); + mr->umem = mlx4_get_umem_mr(pd->uobject->context, start, length, + virt_addr, access_flags); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; @@ -454,6 +486,9 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, } if (flags & IB_MR_REREG_ACCESS) { + if (ib_access_writable(mr_access_flags) && !mmr->umem->writable) + return -EPERM; + err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry, convert_access(mr_access_flags)); @@ -467,10 +502,9 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); ib_umem_release(mmr->umem); - mmr->umem = ib_umem_get(mr->uobject->context, start, length, - mr_access_flags | - IB_ACCESS_LOCAL_WRITE, - 0); + mmr->umem = + mlx4_get_umem_mr(mr->uobject->context, start, length, + virt_addr, mr_access_flags); if (IS_ERR(mmr->umem)) { err = PTR_ERR(mmr->umem); /* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */ -- cgit v1.2.3 From 572f46bf947c3eeca8d16518e0fb70f9250b4416 Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Sun, 27 May 2018 13:42:33 +0300 Subject: IB/mlx5: Refactor CQE compression response Refactor CQE compression response to be fully set only when it`s really supported. There is no change from user perspective because anyway resp.cqe_comp_caps.max_num was set to zero. Reviewed-by: Yishai Hadas Signed-off-by: Yonatan Cohen W Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index daa919e5a442..029c310a0dd1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -983,13 +983,17 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { - resp.cqe_comp_caps.max_num = - MLX5_CAP_GEN(dev->mdev, cqe_compression) ? 
- MLX5_CAP_GEN(dev->mdev, cqe_compression_max_num) : 0; - resp.cqe_comp_caps.supported_format = - MLX5_IB_CQE_RES_FORMAT_HASH | - MLX5_IB_CQE_RES_FORMAT_CSUM; resp.response_length += sizeof(resp.cqe_comp_caps); + + if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) { + resp.cqe_comp_caps.max_num = + MLX5_CAP_GEN(dev->mdev, + cqe_compression_max_num); + + resp.cqe_comp_caps.supported_format = + MLX5_IB_CQE_RES_FORMAT_HASH | + MLX5_IB_CQE_RES_FORMAT_CSUM; + } } if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) && -- cgit v1.2.3 From 6f1006a43869ff82745eea3b88204d0a3bcc0158 Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Sun, 27 May 2018 13:42:34 +0300 Subject: IB/mlx5: Introduce a new mini-CQE format The new mini-CQE format includes the stride index, byte count and packet checksum. Stride index is needed for striding WQ feature. This patch exposes this capability and enables its setting via mlx5 UHW data as part of query device and cq creation. Reviewed-by: Yishai Hadas Reviewed-by: Guy Levi Signed-off-by: Yonatan Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cq.c | 42 +++++++++++++++++++++++++++++---------- drivers/infiniband/hw/mlx5/main.c | 4 ++++ include/uapi/rdma/mlx5-abi.h | 2 +- 3 files changed, 37 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 6d52ea03574e..68775e12d721 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -742,6 +742,28 @@ static int alloc_cq_frag_buf(struct mlx5_ib_dev *dev, return 0; } +enum { + MLX5_CQE_RES_FORMAT_HASH = 0, + MLX5_CQE_RES_FORMAT_CSUM = 1, + MLX5_CQE_RES_FORMAT_CSUM_STRIDX = 3, +}; + +static int mini_cqe_res_format_to_hw(struct mlx5_ib_dev *dev, u8 format) +{ + switch (format) { + case MLX5_IB_CQE_RES_FORMAT_HASH: + return MLX5_CQE_RES_FORMAT_HASH; + case MLX5_IB_CQE_RES_FORMAT_CSUM: + return MLX5_CQE_RES_FORMAT_CSUM; + case MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX: + if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index)) + return MLX5_CQE_RES_FORMAT_CSUM_STRIDX; + return -EOPNOTSUPP; + default: + return -EINVAL; + } +} + static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, struct ib_ucontext *context, struct mlx5_ib_cq *cq, int entries, u32 **cqb, @@ -807,6 +829,8 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, *index = to_mucontext(context)->bfregi.sys_pages[0]; if (ucmd.cqe_comp_en == 1) { + int mini_cqe_format; + if (!((*cqe_size == 128 && MLX5_CAP_GEN(dev->mdev, cqe_compression_128)) || (*cqe_size == 64 && @@ -817,20 +841,18 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, goto err_cqb; } - if (unlikely(!ucmd.cqe_comp_res_format || - !(ucmd.cqe_comp_res_format < - MLX5_IB_CQE_RES_RESERVED) || - (ucmd.cqe_comp_res_format & - (ucmd.cqe_comp_res_format - 1)))) { - err = -EOPNOTSUPP; - mlx5_ib_warn(dev, "CQE compression res format %d is not supported!\n", - ucmd.cqe_comp_res_format); + mini_cqe_format = + mini_cqe_res_format_to_hw(dev, + ucmd.cqe_comp_res_format); + if (mini_cqe_format < 0) { + err = mini_cqe_format; + mlx5_ib_dbg(dev, "CQE compression res format %d error: %d\n", + ucmd.cqe_comp_res_format, err); goto err_cqb; } MLX5_SET(cqc, cqc, cqe_comp_en, 1); - MLX5_SET(cqc, cqc, mini_cqe_res_format, - ilog2(ucmd.cqe_comp_res_format)); + MLX5_SET(cqc, cqc, mini_cqe_res_format, mini_cqe_format); } if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD) { diff --git 
a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 029c310a0dd1..e0894b203d59 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -993,6 +993,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.cqe_comp_caps.supported_format = MLX5_IB_CQE_RES_FORMAT_HASH | MLX5_IB_CQE_RES_FORMAT_CSUM; + + if (MLX5_CAP_GEN(dev->mdev, mini_cqe_resp_stride_index)) + resp.cqe_comp_caps.supported_format |= + MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX; } } diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index cb4a02c4a1ce..9783648d5511 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -163,7 +163,7 @@ struct mlx5_ib_rss_caps { enum mlx5_ib_cqe_comp_res_format { MLX5_IB_CQE_RES_FORMAT_HASH = 1 << 0, MLX5_IB_CQE_RES_FORMAT_CSUM = 1 << 1, - MLX5_IB_CQE_RES_RESERVED = 1 << 2, + MLX5_IB_CQE_RES_FORMAT_CSUM_STRIDX = 1 << 2, }; struct mlx5_ib_cqe_comp_caps { -- cgit v1.2.3 From 5ef8c0c180a6318542dce7e0701dd8e341c1265b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 28 May 2018 20:32:40 -0600 Subject: RDMA/core: Remove indirection through ib_cache_setup() This once might have made sense when cache.c was in a different module from device.c, but today it just obfuscation. Get rid of the wrappers and call roge_gid_mgmt_init()/cleanup() directly. Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky --- drivers/infiniband/core/cache.c | 10 ---------- drivers/infiniband/core/core_priv.h | 3 --- drivers/infiniband/core/device.c | 4 ++-- 3 files changed, 2 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 0d11538c2a25..2bdfc4b4a15c 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1295,13 +1295,3 @@ void ib_cache_cleanup_one(struct ib_device *device) flush_workqueue(ib_wq); gid_table_cleanup_one(device); } - -void __init ib_cache_setup(void) -{ - roce_gid_mgmt_init(); -} - -void __exit ib_cache_cleanup(void) -{ - roce_gid_mgmt_cleanup(); -} diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 54163a6e4067..fae417a391fb 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -88,9 +88,6 @@ int ib_device_register_sysfs(struct ib_device *device, u8, struct kobject *)); void ib_device_unregister_sysfs(struct ib_device *device); -void ib_cache_setup(void); -void ib_cache_cleanup(void); - typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, struct net_device *idev, void *cookie); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index ea9fbcfb21bd..84f51386e1e3 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1225,7 +1225,7 @@ static int __init ib_core_init(void) nldev_init(); rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); - ib_cache_setup(); + roce_gid_mgmt_init(); return 0; @@ -1248,7 +1248,7 @@ err: static void __exit ib_core_cleanup(void) { - ib_cache_cleanup(); + roce_gid_mgmt_cleanup(); nldev_exit(); rdma_nl_unregister(RDMA_NL_LS); unregister_lsm_notifier(&ibdev_lsm_nb); -- cgit v1.2.3 From cb7a94c9c808d291d813f90bdb53e2005324a332 Mon Sep 17 00:00:00 2001 From: "Wei Hu(Xavier)" Date: Mon, 28 May 2018 19:39:24 +0800 Subject: RDMA/hns: Add reset process for RoCE in hip08 This patch added reset process for RoCE in hip08. 
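A minimal sketch of the consumer side (illustrative only, not part of this patch; names are placeholders): a kernel ULP can register an event handler so that the IB_EVENT_DEVICE_FATAL dispatched on the DOWN notification lets it quiesce before the RoCE device is torn down and re-created:

static struct ib_event_handler fatal_handler;

static void ulp_fatal_handler(struct ib_event_handler *handler,
			      struct ib_event *event)
{
	if (event->event == IB_EVENT_DEVICE_FATAL)
		pr_info("%s: fatal event, quiescing RoCE users\n",
			event->device->name);
}

static void ulp_watch_device(struct ib_device *ibdev)
{
	INIT_IB_EVENT_HANDLER(&fatal_handler, ibdev, ulp_fatal_handler);
	ib_register_event_handler(&fatal_handler);
}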
Signed-off-by: Wei Hu (Xavier) Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_cmd.c | 3 ++ drivers/infiniband/hw/hns/hns_roce_device.h | 2 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 76 +++++++++++++++++++++++++++++ drivers/infiniband/hw/hns/hns_roce_main.c | 7 +++ 4 files changed, 88 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c index 9ebe839d8b24..a0ba19d4a10e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -176,6 +176,9 @@ int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param, unsigned long in_modifier, u8 op_modifier, u16 op, unsigned long timeout) { + if (hr_dev->is_reset) + return 0; + if (hr_dev->cmd.use_events) return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param, in_modifier, op_modifier, op, diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 412297d4b86c..da8512b40252 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -774,6 +774,8 @@ struct hns_roce_dev { const char *irq_names[HNS_ROCE_MAX_IRQ_NUM]; spinlock_t sm_lock; spinlock_t bt_cmd_lock; + bool active; + bool is_reset; struct hns_roce_ib_iboe iboe; struct list_head pgdir_list; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index dd716ed60661..166f0469b5f5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -775,6 +775,9 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev, int ret = 0; int ntc; + if (hr_dev->is_reset) + return 0; + spin_lock_bh(&csq->lock); if (num > hns_roce_cmq_space(csq)) { @@ -4804,14 +4807,87 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, { struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; + if (!hr_dev) + return; + hns_roce_exit(hr_dev); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); } +static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle) +{ + struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; + struct ib_event event; + + if (!hr_dev) { + dev_err(&handle->pdev->dev, + "Input parameter handle->priv is NULL!\n"); + return -EINVAL; + } + + hr_dev->active = false; + hr_dev->is_reset = true; + + event.event = IB_EVENT_DEVICE_FATAL; + event.device = &hr_dev->ib_dev; + event.element.port_num = 1; + ib_dispatch_event(&event); + + return 0; +} + +static int hns_roce_hw_v2_reset_notify_init(struct hnae3_handle *handle) +{ + int ret; + + ret = hns_roce_hw_v2_init_instance(handle); + if (ret) { + /* when reset notify type is HNAE3_INIT_CLIENT In reset notify + * callback function, RoCE Engine reinitialize. If RoCE reinit + * failed, we should inform NIC driver. 
+ */ + handle->priv = NULL; + dev_err(&handle->pdev->dev, + "In reset process RoCE reinit failed %d.\n", ret); + } + + return ret; +} + +static int hns_roce_hw_v2_reset_notify_uninit(struct hnae3_handle *handle) +{ + msleep(100); + hns_roce_hw_v2_uninit_instance(handle, false); + return 0; +} + +static int hns_roce_hw_v2_reset_notify(struct hnae3_handle *handle, + enum hnae3_reset_notify_type type) +{ + int ret = 0; + + switch (type) { + case HNAE3_DOWN_CLIENT: + ret = hns_roce_hw_v2_reset_notify_down(handle); + break; + case HNAE3_INIT_CLIENT: + ret = hns_roce_hw_v2_reset_notify_init(handle); + break; + case HNAE3_UNINIT_CLIENT: + ret = hns_roce_hw_v2_reset_notify_uninit(handle); + break; + default: + break; + } + + return ret; +} + static const struct hnae3_client_ops hns_roce_hw_v2_ops = { .init_instance = hns_roce_hw_v2_init_instance, .uninit_instance = hns_roce_hw_v2_uninit_instance, + .reset_notify = hns_roce_hw_v2_reset_notify, }; static struct hnae3_client hns_roce_hw_v2_client = { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index c614f9182b1a..fbb0c0a857b8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -332,6 +332,9 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev, struct hns_roce_ib_alloc_ucontext_resp resp = {}; struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev); + if (!hr_dev->active) + return ERR_PTR(-EAGAIN); + resp.qp_tab_size = hr_dev->caps.num_qps; context = kmalloc(sizeof(*context), GFP_KERNEL); @@ -425,6 +428,7 @@ static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) { struct hns_roce_ib_iboe *iboe = &hr_dev->iboe; + hr_dev->active = false; unregister_netdevice_notifier(&iboe->nb); ib_unregister_device(&hr_dev->ib_dev); } @@ -536,6 +540,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) goto error_failed_setup_mtu_mac; } + hr_dev->active = true; return 0; error_failed_setup_mtu_mac: @@ -728,6 +733,7 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) return ret; } } + hr_dev->is_reset = false; if (hr_dev->hw->cmq_init) { ret = hr_dev->hw->cmq_init(hr_dev); @@ -827,6 +833,7 @@ EXPORT_SYMBOL_GPL(hns_roce_init); void hns_roce_exit(struct hns_roce_dev *hr_dev) { hns_roce_unregister_device(hr_dev); + if (hr_dev->hw->hw_exit) hr_dev->hw->hw_exit(hr_dev); hns_roce_cleanup_bitmap(hr_dev); -- cgit v1.2.3 From 0b25c9cc53b5c0f87fab5e3cab0ff64e8d4ccc0b Mon Sep 17 00:00:00 2001 From: "Wei Hu(Xavier)" Date: Mon, 28 May 2018 19:39:25 +0800 Subject: RDMA/hns: Fix the illegal memory operation when cross page This patch fixed the potential illegal operation when using the extend sge buffer cross page in post send operation. The bug will cause the calltrace as below. 
[ 3302.922107] Unable to handle kernel paging request at virtual address ffff00003b3a0004 [ 3302.930009] Mem abort info: [ 3302.932790] Exception class = DABT (current EL), IL = 32 bits [ 3302.938695] SET = 0, FnV = 0 [ 3302.941735] EA = 0, S1PTW = 0 [ 3302.944863] Data abort info: [ 3302.947729] ISV = 0, ISS = 0x00000047 [ 3302.951551] CM = 0, WnR = 1 [ 3302.954506] swapper pgtable: 4k pages, 48-bit VAs, pgd = ffff000009ea5000 [ 3302.961279] [ffff00003b3a0004] *pgd=00000023dfffe003, *pud=00000023dfffd003, *pmd=00000022dc84c003, *pte=0000000000000000 [ 3302.972224] Internal error: Oops: 96000047 [#1] SMP [ 3302.999509] CPU: 9 PID: 19628 Comm: roce_test_main Tainted: G OE 4.14.10 #1 [ 3303.007498] task: ffff80234df78000 task.stack: ffff00000f640000 [ 3303.013412] PC is at hns_roce_v2_post_send+0x690/0xe20 [hns_roce_pci] [ 3303.019843] LR is at hns_roce_v2_post_send+0x658/0xe20 [hns_roce_pci] [ 3303.026269] pc : [] lr : [] pstate: 804001c9 [ 3303.033649] sp : ffff00000f643870 [ 3303.036951] x29: ffff00000f643870 x28: ffff80232bfa9c00 [ 3303.042250] x27: ffff80234d909380 x26: ffff00003b37f0c0 [ 3303.047549] x25: 0000000000000000 x24: 0000000000000003 [ 3303.052848] x23: 0000000000000000 x22: 0000000000000000 [ 3303.058148] x21: 0000000000000101 x20: 0000000000000001 [ 3303.063447] x19: ffff80236163f800 x18: 0000000000000000 [ 3303.068746] x17: 0000ffff86b76fc8 x16: ffff000008301600 [ 3303.074045] x15: 000020a51c000000 x14: 3128726464615f65 [ 3303.079344] x13: 746f6d6572202c29 x12: 303035312879656b [ 3303.084643] x11: 723a6f666e692072 x10: 573a6f666e693a5d [ 3303.089943] x9 : 0000000000000004 x8 : ffff8023ce38b000 [ 3303.095242] x7 : ffff8023ce38b320 x6 : 0000000000000418 [ 3303.100541] x5 : ffff80232bfa9cc8 x4 : 0000000000000030 [ 3303.105839] x3 : 0000000000000100 x2 : 0000000000000200 [ 3303.111138] x1 : 0000000000000320 x0 : ffff00003b3a0000 [ 3303.116438] Process roce_test_main (pid: 19628, stack limit = 0xffff00000f640000) [ 3303.123906] Call trace: [ 3303.126339] Exception stack(0xffff00000f643730 to 0xffff00000f643870) [ 3303.215790] [] hns_roce_v2_post_send+0x690/0xe20 [hns_roce_pci] [ 3303.223293] [] rt_ktest_post_send+0x5d0/0x8b8 [rdma_test] [ 3303.230261] [] exec_send_cmd+0x664/0x1350 [rdma_test] [ 3303.236881] [] rt_ktest_dispatch_cmd_3+0x1510/0x3790 [rdma_test] [ 3303.244455] [] rt_ktest_dispatch_cmd_2+0xa4/0x118 [rdma_test] [ 3303.251770] [] rt_ktest_dispatch_cmd+0x124/0xaa8 [rdma_test] [ 3303.258997] [] rt_ktest_dev_write+0x2cc/0x568 [rdma_test] [ 3303.265947] [] __vfs_write+0x60/0x18c [ 3303.271158] [] vfs_write+0xa8/0x198 [ 3303.276196] [] SyS_write+0x6c/0xd4 [ 3303.281147] Exception stack(0xffff00000f643ec0 to 0xffff00000f644000) [ 3303.287573] 3ec0: 0000000000000003 0000fffffc85faa8 0000000000004e60 0000000000000000 [ 3303.295388] 3ee0: 0000000021fb2000 000000000000ffff eff0e3efe4e58080 0000fffffcc724fe [ 3303.303204] 3f00: 0000000000000040 1999999999999999 0101010101010101 0000000000000038 [ 3303.311019] 3f20: 0000000000000005 ffffffffffffffff 0d73757461747320 ffffffffffffffff [ 3303.318835] 3f40: 0000000000000000 0000000000459b00 0000fffffc85e360 000000000043d788 [ 3303.326650] 3f60: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 3303.334465] 3f80: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 3303.342281] 3fa0: 0000000000000000 0000fffffc85e570 0000000000438804 0000fffffc85e570 [ 3303.350096] 3fc0: 0000ffff8553f618 0000000080000000 0000000000000003 0000000000000040 [ 3303.357911] 3fe0: 0000000000000000 0000000000000000 
0000000000000000 0000000000000000 [ 3303.365729] [] __sys_trace_return+0x0/0x4 [ 3303.371288] Code: b94008e9 34000129 b9400ce2 110006b5 (b9000402) [ 3303.377377] ---[ end trace fd5ab98b3325cf9a ]--- Reported-by: Jie Chen Reported-by: Xiping Zhang (Francis) Fixes: b1c158350968("RDMA/hns: Get rid of virt_to_page and vmap calls after dma_alloc_coherent") Signed-off-by: Wei Hu (Xavier) Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 72 +++++++++++++++++++++--------- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 1 + 2 files changed, 53 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 166f0469b5f5..0e8dad68910a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -52,6 +53,53 @@ static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, dseg->len = cpu_to_le32(sg->length); } +static void set_extend_sge(struct hns_roce_qp *qp, struct ib_send_wr *wr, + unsigned int *sge_ind) +{ + struct hns_roce_v2_wqe_data_seg *dseg; + struct ib_sge *sg; + int num_in_wqe = 0; + int extend_sge_num; + int fi_sge_num; + int se_sge_num; + int shift; + int i; + + if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) + num_in_wqe = HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE; + extend_sge_num = wr->num_sge - num_in_wqe; + sg = wr->sg_list + num_in_wqe; + shift = qp->hr_buf.page_shift; + + /* + * Check whether wr->num_sge sges are in the same page. If not, we + * should calculate how many sges in the first page and the second + * page. + */ + dseg = get_send_extend_sge(qp, (*sge_ind) & (qp->sge.sge_cnt - 1)); + fi_sge_num = (round_up((uintptr_t)dseg, 1 << shift) - + (uintptr_t)dseg) / + sizeof(struct hns_roce_v2_wqe_data_seg); + if (extend_sge_num > fi_sge_num) { + se_sge_num = extend_sge_num - fi_sge_num; + for (i = 0; i < fi_sge_num; i++) { + set_data_seg_v2(dseg++, sg + i); + (*sge_ind)++; + } + dseg = get_send_extend_sge(qp, + (*sge_ind) & (qp->sge.sge_cnt - 1)); + for (i = 0; i < se_sge_num; i++) { + set_data_seg_v2(dseg++, sg + fi_sge_num + i); + (*sge_ind)++; + } + } else { + for (i = 0; i < extend_sge_num; i++) { + set_data_seg_v2(dseg++, sg + i); + (*sge_ind)++; + } + } +} + static int set_rwqe_data_seg(struct ib_qp *ibqp, struct ib_send_wr *wr, struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, void *wqe, unsigned int *sge_ind, @@ -85,7 +133,7 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, struct ib_send_wr *wr, roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_INLINE_S, 1); } else { - if (wr->num_sge <= 2) { + if (wr->num_sge <= HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) { for (i = 0; i < wr->num_sge; i++) { if (likely(wr->sg_list[i].length)) { set_data_seg_v2(dseg, wr->sg_list + i); @@ -98,24 +146,14 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, struct ib_send_wr *wr, V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, (*sge_ind) & (qp->sge.sge_cnt - 1)); - for (i = 0; i < 2; i++) { + for (i = 0; i < HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE; i++) { if (likely(wr->sg_list[i].length)) { set_data_seg_v2(dseg, wr->sg_list + i); dseg++; } } - dseg = get_send_extend_sge(qp, - (*sge_ind) & (qp->sge.sge_cnt - 1)); - - for (i = 0; i < wr->num_sge - 2; i++) { - if (likely(wr->sg_list[i + 2].length)) { - set_data_seg_v2(dseg, - wr->sg_list + 2 + i); - dseg++; - (*sge_ind)++; - } - } + set_extend_sge(qp, wr, sge_ind); } roce_set_field(rc_sq_wqe->byte_16, @@ 
-319,13 +357,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0], GID_LEN_V2); - dseg = get_send_extend_sge(qp, - sge_ind & (qp->sge.sge_cnt - 1)); - for (i = 0; i < wr->num_sge; i++) { - set_data_seg_v2(dseg + i, wr->sg_list + i); - sge_ind++; - } - + set_extend_sge(qp, wr, &sge_ind); ind++; } else if (ibqp->qp_type == IB_QPT_RC) { rc_sq_wqe = wqe; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 2caeb4cdad5c..d47675f365c7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -77,6 +77,7 @@ #define HNS_ROCE_V2_MAX_INNER_MTPT_NUM 2 #define HNS_ROCE_INVALID_LKEY 0x100 #define HNS_ROCE_CMQ_TX_TIMEOUT 30000 +#define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE 2 #define HNS_ROCE_CONTEXT_HOP_NUM 1 #define HNS_ROCE_MTT_HOP_NUM 1 -- cgit v1.2.3 From a0976f418daf6f93c3c572767f0cf1e770df4717 Mon Sep 17 00:00:00 2001 From: "Wei Hu(Xavier)" Date: Mon, 28 May 2018 19:39:26 +0800 Subject: RDMA/uverbs: Hoist the common process of disassociate_ucontext into ib core This patch hoisted the common process of disassociate_ucontext callback function into ib core code, and these code are common to ervery ib_device driver. Signed-off-by: Wei Hu (Xavier) Acked-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 42 ++++++++++++++++++++++++++++++++++- drivers/infiniband/hw/mlx4/main.c | 34 ---------------------------- drivers/infiniband/hw/mlx5/main.c | 34 ---------------------------- 3 files changed, 41 insertions(+), 69 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 4445d8ee9314..3ae2339dd27a 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #include #include #include @@ -1090,6 +1092,44 @@ err: return; } +static void ib_uverbs_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + struct ib_device *ib_dev = ibcontext->device; + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); + if (!owning_process) + return; + + owning_mm = get_task_mm(owning_process); + if (!owning_mm) { + pr_info("no mm, disassociate ucontext is pending task termination\n"); + while (1) { + put_task_struct(owning_process); + usleep_range(1000, 2000); + owning_process = get_pid_task(ibcontext->tgid, + PIDTYPE_PID); + if (!owning_process || + owning_process->state == TASK_DEAD) { + pr_info("disassociate ucontext done, task was terminated\n"); + /* in case task was dead need to release the + * task struct. + */ + if (owning_process) + put_task_struct(owning_process); + return; + } + } + } + + down_write(&owning_mm->mmap_sem); + ib_dev->disassociate_ucontext(ibcontext); + up_write(&owning_mm->mmap_sem); + mmput(owning_mm); + put_task_struct(owning_process); +} + static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, struct ib_device *ib_dev) { @@ -1130,7 +1170,7 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, * (e.g mmput). 
*/ ib_uverbs_event_handler(&file->event_handler, &event); - ib_dev->disassociate_ucontext(ucontext); + ib_uverbs_disassociate_ucontext(ucontext); mutex_lock(&file->cleanup_mutex); ib_uverbs_cleanup_ucontext(file, ucontext, true); mutex_unlock(&file->cleanup_mutex); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index bf12394c13c1..59aed458d3be 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1189,40 +1189,10 @@ static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) int ret = 0; struct vm_area_struct *vma; struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - - owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); - if (!owning_process) - return; - - owning_mm = get_task_mm(owning_process); - if (!owning_mm) { - pr_info("no mm, disassociate ucontext is pending task termination\n"); - while (1) { - /* make sure that task is dead before returning, it may - * prevent a rare case of module down in parallel to a - * call to mlx4_ib_vma_close. - */ - put_task_struct(owning_process); - usleep_range(1000, 2000); - owning_process = get_pid_task(ibcontext->tgid, - PIDTYPE_PID); - if (!owning_process || - owning_process->state == TASK_DEAD) { - pr_info("disassociate ucontext done, task was terminated\n"); - /* in case task was dead need to release the task struct */ - if (owning_process) - put_task_struct(owning_process); - return; - } - } - } /* need to protect from a race on closing the vma as part of * mlx4_ib_vma_close(). */ - down_write(&owning_mm->mmap_sem); for (i = 0; i < HW_BAR_COUNT; i++) { vma = context->hw_bar_info[i].vma; if (!vma) @@ -1241,10 +1211,6 @@ static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) /* context going to be destroyed, should not access ops any more */ context->hw_bar_info[i].vma->vm_ops = NULL; } - - up_write(&owning_mm->mmap_sem); - mmput(owning_mm); - put_task_struct(owning_process); } static void mlx4_ib_set_vma_data(struct vm_area_struct *vma, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 92879d2d3026..a182d19c557e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1973,38 +1973,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) struct vm_area_struct *vma; struct mlx5_ib_vma_private_data *vma_private, *n; struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); - if (!owning_process) - return; - - owning_mm = get_task_mm(owning_process); - if (!owning_mm) { - pr_info("no mm, disassociate ucontext is pending task termination\n"); - while (1) { - put_task_struct(owning_process); - usleep_range(1000, 2000); - owning_process = get_pid_task(ibcontext->tgid, - PIDTYPE_PID); - if (!owning_process || - owning_process->state == TASK_DEAD) { - pr_info("disassociate ucontext done, task was terminated\n"); - /* in case task was dead need to release the - * task struct. - */ - if (owning_process) - put_task_struct(owning_process); - return; - } - } - } - - /* need to protect from a race on closing the vma as part of - * mlx5_ib_vma_close. 
- */ - down_write(&owning_mm->mmap_sem); mutex_lock(&context->vma_private_list_mutex); list_for_each_entry_safe(vma_private, n, &context->vma_private_list, list) { @@ -2021,9 +1990,6 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) kfree(vma_private); } mutex_unlock(&context->vma_private_list_mutex); - up_write(&owning_mm->mmap_sem); - mmput(owning_mm); - put_task_struct(owning_process); } static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) -- cgit v1.2.3 From fedc3abe7bd2dcc4c80bcf3cff8708a3908d8219 Mon Sep 17 00:00:00 2001 From: "Wei Hu(Xavier)" Date: Mon, 28 May 2018 19:39:27 +0800 Subject: RDMA/hns: Implement the disassociate_ucontext API This patch implemented the IB core disassociate_ucontext API. Signed-off-by: Wei Hu (Xavier) Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_device.h | 8 ++++ drivers/infiniband/hw/hns/hns_roce_main.c | 70 ++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index da8512b40252..31221d506d9a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -217,11 +217,19 @@ struct hns_roce_uar { unsigned long logic_idx; }; +struct hns_roce_vma_data { + struct list_head list; + struct vm_area_struct *vma; + struct mutex *vma_list_mutex; +}; + struct hns_roce_ucontext { struct ib_ucontext ibucontext; struct hns_roce_uar uar; struct list_head page_list; struct mutex page_mutex; + struct list_head vma_list; + struct mutex vma_list_mutex; }; struct hns_roce_pd { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index fbb0c0a857b8..08c795e11cdd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -345,6 +345,8 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev, if (ret) goto error_fail_uar_alloc; + INIT_LIST_HEAD(&context->vma_list); + mutex_init(&context->vma_list_mutex); if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) { INIT_LIST_HEAD(&context->page_list); mutex_init(&context->page_mutex); @@ -375,6 +377,50 @@ static int hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) return 0; } +static void hns_roce_vma_open(struct vm_area_struct *vma) +{ + vma->vm_ops = NULL; +} + +static void hns_roce_vma_close(struct vm_area_struct *vma) +{ + struct hns_roce_vma_data *vma_data; + + vma_data = (struct hns_roce_vma_data *)vma->vm_private_data; + vma_data->vma = NULL; + mutex_lock(vma_data->vma_list_mutex); + list_del(&vma_data->list); + mutex_unlock(vma_data->vma_list_mutex); + kfree(vma_data); +} + +static const struct vm_operations_struct hns_roce_vm_ops = { + .open = hns_roce_vma_open, + .close = hns_roce_vma_close, +}; + +static int hns_roce_set_vma_data(struct vm_area_struct *vma, + struct hns_roce_ucontext *context) +{ + struct list_head *vma_head = &context->vma_list; + struct hns_roce_vma_data *vma_data; + + vma_data = kzalloc(sizeof(*vma_data), GFP_KERNEL); + if (!vma_data) + return -ENOMEM; + + vma_data->vma = vma; + vma_data->vma_list_mutex = &context->vma_list_mutex; + vma->vm_private_data = vma_data; + vma->vm_ops = &hns_roce_vm_ops; + + mutex_lock(&context->vma_list_mutex); + list_add(&vma_data->list, vma_head); + mutex_unlock(&context->vma_list_mutex); + + return 0; +} + static int hns_roce_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { @@ -400,7 
+446,7 @@ static int hns_roce_mmap(struct ib_ucontext *context, } else return -EINVAL; - return 0; + return hns_roce_set_vma_data(vma, to_hr_ucontext(context)); } static int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num, @@ -424,6 +470,27 @@ static int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num, return 0; } +static void hns_roce_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); + struct hns_roce_vma_data *vma_data, *n; + struct vm_area_struct *vma; + int ret; + + mutex_lock(&context->vma_list_mutex); + list_for_each_entry_safe(vma_data, n, &context->vma_list, list) { + vma = vma_data->vma; + ret = zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); + WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__); + + vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + vma->vm_ops = NULL; + list_del(&vma_data->list); + kfree(vma_data); + } + mutex_unlock(&context->vma_list_mutex); +} + static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) { struct hns_roce_ib_iboe *iboe = &hr_dev->iboe; @@ -519,6 +586,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) /* OTHERS */ ib_dev->get_port_immutable = hns_roce_port_immutable; + ib_dev->disassociate_ucontext = hns_roce_disassociate_ucontext; ib_dev->driver_id = RDMA_DRIVER_HNS; ret = ib_register_device(ib_dev, NULL); -- cgit v1.2.3 From 434dda422cb2d52af4c75857c3f2084fe8f3a2ba Mon Sep 17 00:00:00 2001 From: Sergey Gorenko Date: Mon, 21 May 2018 18:55:53 +0300 Subject: IB/iser: Do not reduce max_sectors The iSER driver reduces max_sectors. For example, if you load the ib_iser module with max_sectors=1024, you will see that /sys/class/block//queue/max_hw_sectors_kb is 508. It is an incorrect value. The expected value is (max_sectors * sector_size) / 1024 = 512. Reducing of max_sectors can cause performance degradation due to unnecessary splitting of IO requests. The number of pages per MR has been fixed here, so there is no longer any need to reduce max_sectors. Fixes: 9c674815d346 ("IB/iser: Fix max_sectors calculation") Signed-off-by: Sergey Gorenko Reviewed-by: Israel Rukshin Reviewed-by: Max Gurtovoy Cc: Christoph Hellwig Cc: Sagi Grimberg Acked-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/iser/iscsi_iser.c | 12 +++++------- drivers/infiniband/ulp/iser/iscsi_iser.h | 2 ++ drivers/infiniband/ulp/iser/iser_initiator.c | 2 +- drivers/infiniband/ulp/iser/iser_verbs.c | 21 ++++++++++++++++++--- 4 files changed, 26 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 0336643c2ed6..9a6434c31db2 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -665,19 +665,17 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, goto free_host; } - /* - * FRs or FMRs can only map up to a (device) page per entry, but if the - * first entry is misaligned we'll end up using using two entries - * (head and tail) for a single page worth data, so we have to drop - * one segment from the calculation. 
- */ - max_fr_sectors = ((shost->sg_tablesize - 1) * PAGE_SIZE) >> 9; + max_fr_sectors = (shost->sg_tablesize * PAGE_SIZE) >> 9; shost->max_sectors = min(iser_max_sectors, max_fr_sectors); iser_dbg("iser_conn %p, sg_tablesize %u, max_sectors %u\n", iser_conn, shost->sg_tablesize, shost->max_sectors); + if (shost->max_sectors < iser_max_sectors) + iser_warn("max_sectors was reduced from %u to %u\n", + iser_max_sectors, shost->max_sectors); + if (cmds_max > max_cmds) { iser_info("cmds_max changed from %u to %u\n", cmds_max, max_cmds); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index c1ae4aeae2f9..c30ceb08db21 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -498,6 +498,7 @@ struct ib_conn { * @rx_descs: rx buffers array (cyclic buffer) * @num_rx_descs: number of rx descriptors * @scsi_sg_tablesize: scsi host sg_tablesize + * @pages_per_mr: maximum pages available for registration */ struct iser_conn { struct ib_conn ib_conn; @@ -520,6 +521,7 @@ struct iser_conn { struct iser_rx_desc *rx_descs; u32 num_rx_descs; unsigned short scsi_sg_tablesize; + unsigned short pages_per_mr; bool snd_w_inv; }; diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index df49c4eb67f7..ca858d6bd37a 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -251,7 +251,7 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2; if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max, - iser_conn->scsi_sg_tablesize)) + iser_conn->pages_per_mr)) goto create_rdma_reg_res_failed; if (iser_alloc_login_buf(iser_conn)) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 56b7240a3fc3..616d978cbf2b 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -703,19 +703,34 @@ iser_calc_scsi_params(struct iser_conn *iser_conn, unsigned int max_sectors) { struct iser_device *device = iser_conn->ib_conn.device; + struct ib_device_attr *attr = &device->ib_device->attrs; unsigned short sg_tablesize, sup_sg_tablesize; + unsigned short reserved_mr_pages; + + /* + * FRs without SG_GAPS or FMRs can only map up to a (device) page per + * entry, but if the first entry is misaligned we'll end up using two + * entries (head and tail) for a single page worth data, so one + * additional entry is required. 
+ */ + if ((attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) && + (attr->device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + reserved_mr_pages = 0; + else + reserved_mr_pages = 1; sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K); - if (device->ib_device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS) + if (attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) sup_sg_tablesize = min_t( uint, ISCSI_ISER_MAX_SG_TABLESIZE, - device->ib_device->attrs.max_fast_reg_page_list_len); + attr->max_fast_reg_page_list_len - reserved_mr_pages); else sup_sg_tablesize = ISCSI_ISER_MAX_SG_TABLESIZE; iser_conn->scsi_sg_tablesize = min(sg_tablesize, sup_sg_tablesize); + iser_conn->pages_per_mr = + iser_conn->scsi_sg_tablesize + reserved_mr_pages; } /** -- cgit v1.2.3 From 367d2f0787e8363f30cbac4d5270a772b69828c1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 30 May 2018 10:40:29 +0100 Subject: RDMA/qedr: fix spelling mistake: "adrresses" -> "addresses" Trivial fix to spelling mistake in DP_ERR error message Signed-off-by: Colin Ian King Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 988aace89430..614a954d0757 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -414,7 +414,7 @@ int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) if ((vma->vm_start & (PAGE_SIZE - 1)) || (len & (PAGE_SIZE - 1))) { DP_ERR(dev, - "failed mmap, adrresses must be page aligned: start=0x%pK, end=0x%pK\n", + "failed mmap, addresses must be page aligned: start=0x%pK, end=0x%pK\n", (void *)vma->vm_start, (void *)vma->vm_end); return -EINVAL; } -- cgit v1.2.3 From f77f3036264b2e0d9abd2938b8999dc9d33819ed Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 14:56:13 +0300 Subject: RDMA/mlx4: Catch FW<->SW misalignment without machine crash Any steering QP is supposed be above steering_qp_base, see function mlx4_ib_steer_qp_alloc() for it, however in case of misalignment between SW and FW, this qp_base can be wrong. Use WARN() to catch such situation without killing the machine. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 59aed458d3be..1fea1497263b 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -3017,7 +3017,10 @@ void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED) return; - BUG_ON(qpn < dev->steer_qpn_base); + if (WARN(qpn < dev->steer_qpn_base, "qpn = %u, steer_qpn_base = %u\n", + qpn, dev->steer_qpn_base)) + /* not supposed to be here */ + return; bitmap_release_region(dev->ib_uc_qpns_bitmap, qpn - dev->steer_qpn_base, -- cgit v1.2.3 From 6b1ca7ece15e94251d1d0d919f813943e4a58059 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 14:56:14 +0300 Subject: RDMA/mlx4: Discard unknown SQP work requests There is no need to crash the machine if unknown work request was received in SQP MAD. 
Cc: # 3.6 Fixes: 37bfc7c1e83f ("IB/mlx4: SR-IOV multiplex and demultiplex MADs") Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/mad.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 0793a21d76f4..d604b3d5aa3e 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -1934,7 +1934,6 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work) "buf:%lld\n", wc.wr_id); break; default: - BUG_ON(1); break; } } else { -- cgit v1.2.3 From cd13a399e66c1b9b039064e8aa2f959eb90d6947 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 14:56:15 +0300 Subject: RDMA/cxgb3: Don't crash kernel just because IDR is full cxgb3 driver properly handles errors returned by IDR, so there is no need to have special case (kernel crash) just because IDR is full. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/cxgb3/iwch.h | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h index 837862287a29..c69bc4f52049 100644 --- a/drivers/infiniband/hw/cxgb3/iwch.h +++ b/drivers/infiniband/hw/cxgb3/iwch.h @@ -162,7 +162,6 @@ static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr, spin_unlock_irq(&rhp->lock); idr_preload_end(); - BUG_ON(ret == -ENOSPC); return ret < 0 ? ret : 0; } -- cgit v1.2.3 From 2f5059a7af144426086555b62dd268d8060fe6d3 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 14:56:16 +0300 Subject: RDMA/cm: Abort loop in case of CM dequeue In case CM work list is empty, the work pointer will be NULL, so instead of kernel crash it is better to abort processing of works. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 7df4c7173607..724f123c037f 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1707,7 +1707,9 @@ static void cm_process_work(struct cm_id_private *cm_id_priv, spin_lock_irq(&cm_id_priv->lock); work = cm_dequeue_work(cm_id_priv); spin_unlock_irq(&cm_id_priv->lock); - BUG_ON(!work); + if (!work) + return; + ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event); cm_free_work(work); -- cgit v1.2.3 From 671a6cc2ba05aa97ee5afe3479f1bdba2436970d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 14:56:17 +0300 Subject: RDMA/cma: Ignore unknown event There is no need to bring down the whole machine, just because unknown event was received. It is better to ignore it silently. 
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 441555a35525..6813ee717a38 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2130,7 +2130,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) event.param.conn.responder_resources = iw_event->ord; break; default: - BUG_ON(1); + goto out; } event.status = iw_event->status; -- cgit v1.2.3 From dee92c4bf53b2701d382ce9e32c57d194a393bfe Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 14:56:18 +0300 Subject: RDMA/mad: Delete inaccessible BUG_ON There is no need to check existence of mad_queue, because we already did pointer dereference before call to dequeue_mad(). Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/mad.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index b28452a55a08..35295e74de04 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -651,7 +651,6 @@ static void dequeue_mad(struct ib_mad_list_head *mad_list) struct ib_mad_queue *mad_queue; unsigned long flags; - BUG_ON(!mad_list->mad_queue); mad_queue = mad_list->mad_queue; spin_lock_irqsave(&mad_queue->lock, flags); list_del(&mad_list->list); -- cgit v1.2.3 From 2468b82d69e3a53d024f28d79ba0fdb8bf43dfbf Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 14:56:19 +0300 Subject: RDMA/mad: Convert BUG_ONs to error flows Let's perform checks in-place instead of BUG_ONs. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/mad.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 35295e74de04..f742ae7a768b 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -1556,7 +1556,8 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, mad_reg_req->oui, 3)) { method = &(*vendor_table)->vendor_class[ vclass]->method_table[i]; - BUG_ON(!*method); + if (!*method) + goto error3; goto check_in_use; } } @@ -1566,10 +1567,12 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, vclass]->oui[i])) { method = &(*vendor_table)->vendor_class[ vclass]->method_table[i]; - BUG_ON(*method); /* Allocate method table for this OUI */ - if ((ret = allocate_method_table(method))) - goto error3; + if (!*method) { + ret = allocate_method_table(method); + if (ret) + goto error3; + } memcpy((*vendor_table)->vendor_class[vclass]->oui[i], mad_reg_req->oui, 3); goto check_in_use; -- cgit v1.2.3 From 2cb4079188a1421520372f5e57ddaceab39435aa Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 15:14:05 +0300 Subject: RDMA/mlx5: Don't check return value of zap_vma_ptes() There is no need to check return value of zap_vma_ptes() because there is nothing to do with this knowledge. 
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index a182d19c557e..0541581c5d84 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1969,7 +1969,6 @@ static int mlx5_ib_set_vma_data(struct vm_area_struct *vma, static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) { - int ret; struct vm_area_struct *vma; struct mlx5_ib_vma_private_data *vma_private, *n; struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); @@ -1978,9 +1977,7 @@ static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) list_for_each_entry_safe(vma_private, n, &context->vma_private_list, list) { vma = vma_private->vma; - ret = zap_vma_ptes(vma, vma->vm_start, - PAGE_SIZE); - WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__); + zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); /* context going to be destroyed, should * not access ops any more. */ -- cgit v1.2.3 From 7fc8ff267d8a94964626b847a0440b6feef9dd68 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 29 May 2018 15:14:06 +0300 Subject: RDMA/mlx4: Don't crash machine if zap_vma_ptes() fails The failure reported by zap_vma_ptes() means that wrong VMA pages were supplied, however it is impossible for this type of address. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1fea1497263b..722c825e3e71 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1186,7 +1186,6 @@ static const struct vm_operations_struct mlx4_ib_vm_ops = { static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) { int i; - int ret = 0; struct vm_area_struct *vma; struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); @@ -1198,13 +1197,8 @@ static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) if (!vma) continue; - ret = zap_vma_ptes(context->hw_bar_info[i].vma, - context->hw_bar_info[i].vma->vm_start, - PAGE_SIZE); - if (ret) { - pr_err("Error: zap_vma_ptes failed for index=%d, ret=%d\n", i, ret); - BUG_ON(1); - } + zap_vma_ptes(context->hw_bar_info[i].vma, + context->hw_bar_info[i].vma->vm_start, PAGE_SIZE); context->hw_bar_info[i].vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); -- cgit v1.2.3 From 008fba465d7c010dc14c9d7fd57a7a743d50bf8e Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Fri, 1 Jun 2018 11:19:19 -0400 Subject: RDMA/hns_roce: Don't check return value of zap_vma_ptes() There is no need to check return value of zap_vma_ptes() because there is nothing to do with this knowledge. 
Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 08c795e11cdd..21b901cfa2d6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -475,13 +475,11 @@ static void hns_roce_disassociate_ucontext(struct ib_ucontext *ibcontext) struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); struct hns_roce_vma_data *vma_data, *n; struct vm_area_struct *vma; - int ret; mutex_lock(&context->vma_list_mutex); list_for_each_entry_safe(vma_data, n, &context->vma_list, list) { vma = vma_data->vma; - ret = zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); - WARN_ONCE(ret, "%s: zap_vma_ptes failed", __func__); + zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); vma->vm_ops = NULL; -- cgit v1.2.3 From 930821e39d0a5f91ed58fea1692afe04f0fe0e1f Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Thu, 31 May 2018 16:43:29 +0300 Subject: net/mlx5: Use flow counter pointer as input to the query function This allows to un-expose the details of struct mlx5_fc and keep it internal to the core driver. Signed-off-by: Or Gerlitz Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 15 ++++++--------- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 4 ++-- 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 9a24314b817a..bb9665b7e8e7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -2104,21 +2104,18 @@ static int mlx5_eswitch_query_vport_drop_stats(struct mlx5_core_dev *dev, struct mlx5_vport *vport = &esw->vports[vport_idx]; u64 rx_discard_vport_down, tx_discard_vport_down; u64 bytes = 0; - u16 idx = 0; int err = 0; if (!vport->enabled || esw->mode != SRIOV_LEGACY) return 0; - if (vport->egress.drop_counter) { - idx = vport->egress.drop_counter->id; - mlx5_fc_query(dev, idx, &stats->rx_dropped, &bytes); - } + if (vport->egress.drop_counter) + mlx5_fc_query(dev, vport->egress.drop_counter, + &stats->rx_dropped, &bytes); - if (vport->ingress.drop_counter) { - idx = vport->ingress.drop_counter->id; - mlx5_fc_query(dev, idx, &stats->tx_dropped, &bytes); - } + if (vport->ingress.drop_counter) + mlx5_fc_query(dev, vport->ingress.drop_counter, + &stats->tx_dropped, &bytes); if (!MLX5_CAP_GEN(dev, receive_discard_vport_down) && !MLX5_CAP_GEN(dev, transmit_discard_vport_down)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index e26d3e9d5f9f..276b13d7ea43 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -233,7 +233,7 @@ void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev, unsigned long delay); void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev, unsigned long interval); -int mlx5_fc_query(struct mlx5_core_dev *dev, u16 id, +int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter, u64 *packets, u64 *bytes); int mlx5_init_fs(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index b7ab929d5f8e..f5cb3fa5d8cf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -312,10 +312,10 @@ void mlx5_cleanup_fc_stats(struct mlx5_core_dev *dev) } } -int mlx5_fc_query(struct mlx5_core_dev *dev, u16 id, +int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter, u64 *packets, u64 *bytes) { - return mlx5_cmd_fc_query(dev, id, packets, bytes); + return mlx5_cmd_fc_query(dev, counter->id, packets, bytes); } void mlx5_fc_query_cached(struct mlx5_fc *counter, -- cgit v1.2.3 From 5f9bf63ae80c4d0e5e986b6c1280bf8174978545 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 31 May 2018 16:43:30 +0300 Subject: net/mlx5: Export flow counter related API Exports counters API to be used in both IB and EN. Reviewed-by: Yishai Hadas Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h | 2 -- drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c | 3 +++ include/linux/mlx5/fs.h | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 276b13d7ea43..4157dd76165e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -233,8 +233,6 @@ void mlx5_fc_queue_stats_work(struct mlx5_core_dev *dev, unsigned long delay); void mlx5_fc_update_sampling_interval(struct mlx5_core_dev *dev, unsigned long interval); -int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter, - u64 *packets, u64 *bytes); int mlx5_init_fs(struct mlx5_core_dev *dev); void mlx5_cleanup_fs(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c index f5cb3fa5d8cf..58af6be13dfa 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_counters.c @@ -243,6 +243,7 @@ err_out: return ERR_PTR(err); } +EXPORT_SYMBOL(mlx5_fc_create); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter) { @@ -260,6 +261,7 @@ void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter) mlx5_cmd_fc_free(dev, counter->id); kfree(counter); } +EXPORT_SYMBOL(mlx5_fc_destroy); int mlx5_init_fc_stats(struct mlx5_core_dev *dev) { @@ -317,6 +319,7 @@ int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter, { return mlx5_cmd_fc_query(dev, counter->id, packets, bytes); } +EXPORT_SYMBOL(mlx5_fc_query); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse) diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 9f4d32e41c06..3b4c3298061c 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -186,6 +186,9 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); +int mlx5_fc_query(struct mlx5_core_dev *dev, struct mlx5_fc *counter, + u64 *packets, u64 *bytes); + int mlx5_fs_add_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn); int mlx5_fs_remove_rx_underlay_qpn(struct mlx5_core_dev *dev, u32 underlay_qpn); -- cgit v1.2.3 From 3efa38125b81fb4eb7dbb43295cb82a203dc0398 Mon Sep 17 
00:00:00 2001 From: Matan Barak Date: Thu, 31 May 2018 16:43:28 +0300 Subject: IB/uverbs: Add an ib_uobject getter to ioctl() infrastructure Previously, the user had to dig inside the attribute to get the uobject. Add a helper function that correctly extract it (and do the required checks) for him/her. Signed-off-by: Matan Barak Reviewed-by: Michael J. Ruhl Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_std_types_cq.c | 23 +++++++++++----------- .../infiniband/core/uverbs_std_types_flow_action.c | 4 ++-- include/rdma/uverbs_ioctl.h | 11 +++++++++++ 3 files changed, 25 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index b0dbae9dd0d7..3d293d01afea 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -65,7 +65,6 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev, struct ib_cq_init_attr attr = {}; struct ib_cq *cq; struct ib_uverbs_completion_event_file *ev_file = NULL; - const struct uverbs_attr *ev_file_attr; struct ib_uobject *ev_file_uobj; if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_CREATE_CQ)) @@ -87,10 +86,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev, UVERBS_ATTR_CREATE_CQ_FLAGS))) return -EFAULT; - ev_file_attr = uverbs_attr_get(attrs, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL); - if (!IS_ERR(ev_file_attr)) { - ev_file_uobj = ev_file_attr->obj_attr.uobject; - + ev_file_uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL); + if (!IS_ERR(ev_file_uobj)) { ev_file = container_of(ev_file_uobj, struct ib_uverbs_completion_event_file, uobj_file.uobj); @@ -102,8 +99,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev, goto err_event_file; } - obj = container_of(uverbs_attr_get(attrs, - UVERBS_ATTR_CREATE_CQ_HANDLE)->obj_attr.uobject, + obj = container_of(uverbs_attr_get_uobject(attrs, + UVERBS_ATTR_CREATE_CQ_HANDLE), typeof(*obj), uobject); obj->uverbs_file = ucontext->ufile; obj->comp_events_reported = 0; @@ -170,13 +167,17 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(struct ib_device *ib_dev, struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { - struct ib_uverbs_destroy_cq_resp resp; struct ib_uobject *uobj = - uverbs_attr_get(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE)->obj_attr.uobject; - struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object, - uobject); + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE); + struct ib_uverbs_destroy_cq_resp resp; + struct ib_ucq_object *obj; int ret; + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + obj = container_of(uobj, struct ib_ucq_object, uobject); + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_DESTROY_CQ)) return -EOPNOTSUPP; diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c index cbcec3da12f6..00e6b34bc16b 100644 --- a/drivers/infiniband/core/uverbs_std_types_flow_action.c +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -320,7 +320,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(struct ib_device return ret; /* No need to check as this attribute is marked as MANDATORY */ - uobj = uverbs_attr_get(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE)->obj_attr.uobject; + uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE); action = 
ib_dev->create_flow_action_esp(ib_dev, &esp_attr.hdr, attrs); if (IS_ERR(action)) return PTR_ERR(action); @@ -350,7 +350,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)(struct ib_device if (ret) return ret; - uobj = uverbs_attr_get(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE)->obj_attr.uobject; + uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE); action = uobj->object; if (action->type != IB_FLOW_ACTION_ESP) diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 4a4201d997a7..7ac6271a5ee0 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -420,6 +420,17 @@ static inline void *uverbs_attr_get_obj(const struct uverbs_attr_bundle *attrs_b return uobj->object; } +static inline struct ib_uobject *uverbs_attr_get_uobject(const struct uverbs_attr_bundle *attrs_bundle, + u16 idx) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + + if (IS_ERR(attr)) + return ERR_CAST(attr); + + return attr->obj_attr.uobject; +} + static inline int uverbs_copy_to(const struct uverbs_attr_bundle *attrs_bundle, size_t idx, const void *from, size_t size) { -- cgit v1.2.3 From d9a5a6441e9dde080e9d69e736c623f7369472ed Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 31 May 2018 16:43:32 +0300 Subject: IB/uverbs: Add create/destroy counters support User space application which uses counters functionality, is expected to allocate/release the counters resources by calling create/destroy verbs and in turn get a unique handle that can be used to attach the counters to its counted type. Reviewed-by: Yishai Hadas Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_std_types.c | 3 +- .../infiniband/core/uverbs_std_types_counters.c | 100 +++++++++++++++++++++ include/uapi/rdma/ib_user_ioctl_cmds.h | 14 +++ 5 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_std_types_counters.c (limited to 'drivers') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index dda9e856e3fa..589d3b744b1e 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -36,4 +36,4 @@ ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ uverbs_ioctl_merge.o uverbs_std_types_cq.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ - uverbs_std_types_mr.o + uverbs_std_types_mr.o uverbs_std_types_counters.o diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index cfb51618ab7a..5b2461fa634d 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -287,6 +287,7 @@ extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL); extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_XRCD); extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION); extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_DM); +extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS); #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 569f48bd821e..b570acbd94af 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ 
-302,7 +302,8 @@ static DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, &UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL), &UVERBS_OBJECT(UVERBS_OBJECT_XRCD), &UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION), - &UVERBS_OBJECT(UVERBS_OBJECT_DM)); + &UVERBS_OBJECT(UVERBS_OBJECT_DM), + &UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS)); const struct uverbs_object_tree_def *uverbs_default_get_objects(void) { diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c new file mode 100644 index 000000000000..4b6a985aad97 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_counters.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "uverbs.h" +#include + +static int uverbs_free_counters(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct ib_counters *counters = uobject->object; + + if (why == RDMA_REMOVE_DESTROY && + atomic_read(&counters->usecnt)) + return -EBUSY; + + return counters->device->destroy_counters(counters); +} + +static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_counters *counters; + struct ib_uobject *uobj; + int ret; + + /* + * This check should be removed once the infrastructure + * have the ability to remove methods from parse tree once + * such condition is met. 
+ */ + if (!ib_dev->create_counters) + return -EOPNOTSUPP; + + uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE); + counters = ib_dev->create_counters(ib_dev, attrs); + if (IS_ERR(counters)) { + ret = PTR_ERR(counters); + goto err_create_counters; + } + + counters->device = ib_dev; + counters->uobject = uobj; + uobj->object = counters; + atomic_set(&counters->usecnt, 0); + + return 0; + +err_create_counters: + return ret; +} + +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_COUNTERS_CREATE, + &UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_COUNTERS_HANDLE, + UVERBS_OBJECT_COUNTERS, + UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_COUNTERS_DESTROY, + uverbs_destroy_def_handler, + &UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_COUNTERS_HANDLE, + UVERBS_OBJECT_COUNTERS, + UVERBS_ACCESS_DESTROY, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COUNTERS, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_counters), + &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_DESTROY)); + diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 83e3890eef20..c28ce62d2e40 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -55,6 +55,7 @@ enum uverbs_default_objects { UVERBS_OBJECT_WQ, UVERBS_OBJECT_FLOW_ACTION, UVERBS_OBJECT_DM, + UVERBS_OBJECT_COUNTERS, }; enum { @@ -131,4 +132,17 @@ enum uverbs_methods_mr { UVERBS_METHOD_DM_MR_REG, }; +enum uverbs_attrs_create_counters_cmd_attr_ids { + UVERBS_ATTR_CREATE_COUNTERS_HANDLE, +}; + +enum uverbs_attrs_destroy_counters_cmd_attr_ids { + UVERBS_ATTR_DESTROY_COUNTERS_HANDLE, +}; + +enum uverbs_methods_actions_counters_ops { + UVERBS_METHOD_COUNTERS_CREATE, + UVERBS_METHOD_COUNTERS_DESTROY, +}; + #endif -- cgit v1.2.3 From ebb6796bd397f3fb9b2b46398b2fbb585e1b8bb6 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 31 May 2018 16:43:34 +0300 Subject: IB/uverbs: Add read counters support This patch exposes the read counters verb to user space applications. By that verb the user can read the hardware counters which are associated with the counters object. The application needs to provide a sufficient memory to hold the statistics. 
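For illustration, a minimal user-space sketch of how such a counters object is created and read; it assumes rdma-core's ibv_counters wrappers (ibv_create_counters(), ibv_read_counters(), ibv_destroy_counters()), which are not part of this patch:

/* Sketch only: assumes the rdma-core verbs counters API, not this patch. */
#include <stdio.h>
#include <stdint.h>
#include <infiniband/verbs.h>

static int read_flow_counters(struct ibv_context *ctx)
{
	struct ibv_counters_init_attr init_attr = {};
	struct ibv_counters *counters;
	uint64_t values[2];	/* must be large enough for all bound counters */
	int ret;

	counters = ibv_create_counters(ctx, &init_attr);
	if (!counters)
		return -1;

	/* ... bind the counters to a flow before reading them ... */

	ret = ibv_read_counters(counters, values, 2, 0);
	if (!ret)
		printf("counter[0]=%llu counter[1]=%llu\n",
		       (unsigned long long)values[0],
		       (unsigned long long)values[1]);

	ibv_destroy_counters(counters);
	return ret;
}

As the handler above enforces, a read is only valid once the counters object is in use (bound to a flow), and the caller's buffer size determines how many 64-bit values are copied back.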
Reviewed-by: Yishai Hadas Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- .../infiniband/core/uverbs_std_types_counters.c | 59 +++++++++++++++++++++- include/uapi/rdma/ib_user_ioctl_cmds.h | 7 +++ 2 files changed, 65 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c index 4b6a985aad97..03b182a684a6 100644 --- a/drivers/infiniband/core/uverbs_std_types_counters.c +++ b/drivers/infiniband/core/uverbs_std_types_counters.c @@ -80,6 +80,49 @@ err_create_counters: return ret; } +static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_counters_read_attr read_attr = {}; + const struct uverbs_attr *uattr; + struct ib_counters *counters = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_READ_COUNTERS_HANDLE); + int ret; + + if (!ib_dev->read_counters) + return -EOPNOTSUPP; + + if (!atomic_read(&counters->usecnt)) + return -EINVAL; + + ret = uverbs_copy_from(&read_attr.flags, attrs, + UVERBS_ATTR_READ_COUNTERS_FLAGS); + if (ret) + return ret; + + uattr = uverbs_attr_get(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF); + read_attr.ncounters = uattr->ptr_attr.len / sizeof(u64); + read_attr.counters_buff = kcalloc(read_attr.ncounters, + sizeof(u64), GFP_KERNEL); + if (!read_attr.counters_buff) + return -ENOMEM; + + ret = ib_dev->read_counters(counters, + &read_attr, + attrs); + if (ret) + goto err_read; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF, + read_attr.counters_buff, + read_attr.ncounters * sizeof(u64)); + +err_read: + kfree(read_attr.counters_buff); + return ret; +} + static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_COUNTERS_CREATE, &UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_COUNTERS_HANDLE, UVERBS_OBJECT_COUNTERS, @@ -93,8 +136,22 @@ static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_COUNTERS_DESTROY, UVERBS_ACCESS_DESTROY, UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); +#define MAX_COUNTERS_BUFF_SIZE USHRT_MAX +static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_COUNTERS_READ, + &UVERBS_ATTR_IDR(UVERBS_ATTR_READ_COUNTERS_HANDLE, + UVERBS_OBJECT_COUNTERS, + UVERBS_ACCESS_READ, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_READ_COUNTERS_BUFF, + UVERBS_ATTR_SIZE(0, MAX_COUNTERS_BUFF_SIZE), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_READ_COUNTERS_FLAGS, + UVERBS_ATTR_TYPE(__u32), + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COUNTERS, &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_counters), &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_CREATE), - &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_DESTROY)); + &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_DESTROY), + &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_READ)); diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index c28ce62d2e40..888ac5975a6c 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -140,9 +140,16 @@ enum uverbs_attrs_destroy_counters_cmd_attr_ids { UVERBS_ATTR_DESTROY_COUNTERS_HANDLE, }; +enum uverbs_attrs_read_counters_cmd_attr_ids { + UVERBS_ATTR_READ_COUNTERS_HANDLE, + UVERBS_ATTR_READ_COUNTERS_BUFF, + UVERBS_ATTR_READ_COUNTERS_FLAGS, +}; + enum uverbs_methods_actions_counters_ops { UVERBS_METHOD_COUNTERS_CREATE, UVERBS_METHOD_COUNTERS_DESTROY, + UVERBS_METHOD_COUNTERS_READ, }; #endif -- cgit v1.2.3 From 
59082a327d0145c69b419a0f5bed96b13c5e9ed4 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 31 May 2018 16:43:35 +0300 Subject: IB/core: Support passing uhw for create_flow This is required when user-space drivers need to pass extra information regarding how to handle this flow steering specification. Reviewed-by: Yishai Hadas Signed-off-by: Matan Barak Signed-off-by: Boris Pismenny Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 7 ++++++- drivers/infiniband/core/verbs.c | 2 +- drivers/infiniband/hw/mlx4/main.c | 6 +++++- drivers/infiniband/hw/mlx5/main.c | 7 ++++++- include/rdma/ib_verbs.h | 3 ++- 5 files changed, 20 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 13cb5e4deb86..c735c13e53b0 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3513,11 +3513,16 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, err = -EINVAL; goto err_free; } - flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); + + flow_id = qp->device->create_flow(qp, flow_attr, + IB_FLOW_DOMAIN_USER, uhw); + if (IS_ERR(flow_id)) { err = PTR_ERR(flow_id); goto err_free; } + atomic_inc(&qp->usecnt); + flow_id->qp = qp; flow_id->uobject = uobj; uobj->object = flow_id; uflow = container_of(uobj, typeof(*uflow), uobject); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 7eff3aeffe01..9a3e886c12fd 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1982,7 +1982,7 @@ struct ib_flow *ib_create_flow(struct ib_qp *qp, if (!qp->device->create_flow) return ERR_PTR(-EOPNOTSUPP); - flow_id = qp->device->create_flow(qp, flow_attr, domain); + flow_id = qp->device->create_flow(qp, flow_attr, domain, NULL); if (!IS_ERR(flow_id)) { atomic_inc(&qp->usecnt); flow_id->qp = qp; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 5b70744f414a..5b88bdd1ecef 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1847,7 +1847,7 @@ static int mlx4_ib_add_dont_trap_rule(struct mlx4_dev *dev, static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, - int domain) + int domain, struct ib_udata *udata) { int err = 0, i = 0, j = 0; struct mlx4_ib_flow *mflow; @@ -1865,6 +1865,10 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, (flow_attr->type != IB_FLOW_ATTR_NORMAL)) return ERR_PTR(-EOPNOTSUPP); + if (udata && + udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + memset(type, 0, sizeof(type)); mflow = kzalloc(sizeof(*mflow), GFP_KERNEL); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index daa919e5a442..e94df85ddf08 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3245,7 +3245,8 @@ err: static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, - int domain) + int domain, + struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(qp->device); struct mlx5_ib_qp *mqp = to_mqp(qp); @@ -3257,6 +3258,10 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, int err; int underlay_qpn; + if (udata && + udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) return ERR_PTR(-ENOMEM); diff --git 
a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3769a1cc99b0..ea97b91dd88c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2427,7 +2427,8 @@ struct ib_device { struct ib_flow * (*create_flow)(struct ib_qp *qp, struct ib_flow_attr *flow_attr, - int domain); + int domain, + struct ib_udata *udata); int (*destroy_flow)(struct ib_flow *flow_id); int (*check_mr_status)(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status); -- cgit v1.2.3 From b6ba4a9aa59fe99c6e9ca6ec941cd5f9823b0cae Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 31 May 2018 16:43:37 +0300 Subject: IB/uverbs: Add support for flow counters The struct ib_uverbs_flow_spec_action_count associates a counters object with the flow. Post this association the flow counters can be read via the counters object. Reviewed-by: Yishai Hadas Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_cmd.c | 81 +++++++++++++++++++++++++++++++----- include/uapi/rdma/ib_user_verbs.h | 13 ++++++ 3 files changed, 84 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 5b2461fa634d..c0d40fc3a53a 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -263,6 +263,7 @@ struct ib_uverbs_flow_spec { struct ib_uverbs_flow_spec_action_tag flow_tag; struct ib_uverbs_flow_spec_action_drop drop; struct ib_uverbs_flow_spec_action_handle action; + struct ib_uverbs_flow_spec_action_count flow_count; }; }; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index c735c13e53b0..8fbeece7f831 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2742,43 +2742,82 @@ out_put: struct ib_uflow_resources { size_t max; size_t num; - struct ib_flow_action *collection[0]; + size_t collection_num; + size_t counters_num; + struct ib_counters **counters; + struct ib_flow_action **collection; }; static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) { struct ib_uflow_resources *resources; - resources = - kmalloc(sizeof(*resources) + - num_specs * sizeof(*resources->collection), GFP_KERNEL); + resources = kzalloc(sizeof(*resources), GFP_KERNEL); if (!resources) - return NULL; + goto err_res; + + resources->counters = + kcalloc(num_specs, sizeof(*resources->counters), GFP_KERNEL); + + if (!resources->counters) + goto err_cnt; + + resources->collection = + kcalloc(num_specs, sizeof(*resources->collection), GFP_KERNEL); + + if (!resources->collection) + goto err_collection; - resources->num = 0; resources->max = num_specs; return resources; + +err_collection: + kfree(resources->counters); +err_cnt: + kfree(resources); +err_res: + return NULL; } void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) { unsigned int i; - for (i = 0; i < uflow_res->num; i++) + for (i = 0; i < uflow_res->collection_num; i++) atomic_dec(&uflow_res->collection[i]->usecnt); + for (i = 0; i < uflow_res->counters_num; i++) + atomic_dec(&uflow_res->counters[i]->usecnt); + + kfree(uflow_res->collection); + kfree(uflow_res->counters); kfree(uflow_res); } static void flow_resources_add(struct ib_uflow_resources *uflow_res, - struct ib_flow_action *action) + enum ib_flow_spec_type type, + void *ibobj) { WARN_ON(uflow_res->num >= uflow_res->max); - atomic_inc(&action->usecnt); - uflow_res->collection[uflow_res->num++] = action; + 
switch (type) { + case IB_FLOW_SPEC_ACTION_HANDLE: + atomic_inc(&((struct ib_flow_action *)ibobj)->usecnt); + uflow_res->collection[uflow_res->collection_num++] = + (struct ib_flow_action *)ibobj; + break; + case IB_FLOW_SPEC_ACTION_COUNT: + atomic_inc(&((struct ib_counters *)ibobj)->usecnt); + uflow_res->counters[uflow_res->counters_num++] = + (struct ib_counters *)ibobj; + break; + default: + WARN_ON(1); + } + + uflow_res->num++; } static int kern_spec_to_ib_spec_action(struct ib_ucontext *ucontext, @@ -2815,9 +2854,29 @@ static int kern_spec_to_ib_spec_action(struct ib_ucontext *ucontext, return -EINVAL; ib_spec->action.size = sizeof(struct ib_flow_spec_action_handle); - flow_resources_add(uflow_res, ib_spec->action.act); + flow_resources_add(uflow_res, + IB_FLOW_SPEC_ACTION_HANDLE, + ib_spec->action.act); uobj_put_obj_read(ib_spec->action.act); break; + case IB_FLOW_SPEC_ACTION_COUNT: + if (kern_spec->flow_count.size != + sizeof(struct ib_uverbs_flow_spec_action_count)) + return -EINVAL; + ib_spec->flow_count.counters = + uobj_get_obj_read(counters, + UVERBS_OBJECT_COUNTERS, + kern_spec->flow_count.handle, + ucontext); + if (!ib_spec->flow_count.counters) + return -EINVAL; + ib_spec->flow_count.size = + sizeof(struct ib_flow_spec_action_count); + flow_resources_add(uflow_res, + IB_FLOW_SPEC_ACTION_COUNT, + ib_spec->flow_count.counters); + uobj_put_obj_read(ib_spec->flow_count.counters); + break; default: return -EINVAL; } diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 9be07394fdbe..f83e8ef6f056 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -998,6 +998,19 @@ struct ib_uverbs_flow_spec_action_handle { __u32 reserved1; }; +struct ib_uverbs_flow_spec_action_count { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + __u32 handle; + __u32 reserved1; +}; + struct ib_uverbs_flow_tunnel_filter { __be32 tunnel_id; }; -- cgit v1.2.3 From b29e2a1309e38cd1afa598a54f3ccb4e4d2ee01c Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 31 May 2018 16:43:38 +0300 Subject: IB/mlx5: Add counters create and destroy support This patch implements the device counters create and destroy APIs and introducing some internal management structures. Downstream patches in this series will add the functionality to support flow counters binding and reading. 
Reviewed-by: Yishai Hadas Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 23 +++++++++++++++++++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 10 ++++++++++ 2 files changed, 33 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e94df85ddf08..81471013b776 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5006,6 +5006,27 @@ static void depopulate_specs_root(struct mlx5_ib_dev *dev) uverbs_free_spec_tree(dev->ib_dev.specs_root); } +static int mlx5_ib_destroy_counters(struct ib_counters *counters) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + + kfree(mcounters); + + return 0; +} + +static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_mcounters *mcounters; + + mcounters = kzalloc(sizeof(*mcounters), GFP_KERNEL); + if (!mcounters) + return ERR_PTR(-ENOMEM); + + return &mcounters->ibcntrs; +} + void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { mlx5_ib_cleanup_multiport_master(dev); @@ -5249,6 +5270,8 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) dev->ib_dev.destroy_flow_action = mlx5_ib_destroy_flow_action; dev->ib_dev.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp; dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; + dev->ib_dev.create_counters = mlx5_ib_create_counters; + dev->ib_dev.destroy_counters = mlx5_ib_destroy_counters; err = init_node_data(dev); if (err) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 49a1aa0ff429..fd27ec1aed08 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -813,6 +813,16 @@ struct mlx5_memic { DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES); }; +struct mlx5_ib_mcounters { + struct ib_counters ibcntrs; +}; + +static inline struct mlx5_ib_mcounters * +to_mcounters(struct ib_counters *ibcntrs) +{ + return container_of(ibcntrs, struct mlx5_ib_mcounters, ibcntrs); +} + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; -- cgit v1.2.3 From 3b3233fbf02ee4c5de4d635ca6c4f2566d9716df Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 31 May 2018 16:43:39 +0300 Subject: IB/mlx5: Add flow counters binding support Associates a counters with a flow when IB_FLOW_SPEC_ACTION_COUNT is part of the flow specifications. The counters user space placements of location and description (index, description) pairs are passed as private data of the counters flow specification. 
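To make the (index, description) placement scheme concrete, here is an illustrative sketch of how a read buffer is filled from such pairs; the structure and helper below are hypothetical stand-ins that only mirror the field names used in this patch, not the actual mlx5 ABI definitions:

/* Illustrative only: hypothetical mirror of the (index, description)
 * pairs described above, not the real mlx5 uapi definitions. */
#include <stdint.h>

enum { COUNTER_DESC_PACKETS = 0, COUNTER_DESC_BYTES = 1 };

struct flow_counters_desc {
	uint32_t description;	/* which hardware value: packets or bytes */
	uint32_t index;		/* placement slot in the user read buffer */
};

/*
 * Scatter a flow's hardware packet/byte totals into the buffer returned
 * by the read verb, according to the pairs bound at flow creation time.
 */
static void fill_read_buffer(const struct flow_counters_desc *desc,
			     uint32_t ndesc, uint64_t hw_packets,
			     uint64_t hw_bytes, uint64_t *out)
{
	uint32_t i;

	for (i = 0; i < ndesc; i++)
		out[desc[i].index] = desc[i].description == COUNTER_DESC_PACKETS ?
				     hw_packets : hw_bytes;
}

In other words, each pair tells the driver which hardware counter value to expose and at which offset of the user buffer, which is why the kernel side below tracks cntrs_max_index and validates every description/index entry copied from user space.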
Reviewed-by: Yishai Hadas Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 222 +++++++++++++++++++++++++++++++++-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 15 +++ include/linux/mlx5/fs.h | 1 + include/uapi/rdma/mlx5-abi.h | 24 ++++ 4 files changed, 249 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 81471013b776..c52841bad4e7 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2449,6 +2449,7 @@ static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) #define LAST_TUNNEL_FIELD tunnel_id #define LAST_FLOW_TAG_FIELD tag_id #define LAST_DROP_FIELD size +#define LAST_COUNTERS_FIELD counters /* Field is the last supported field */ #define FIELDS_NOT_SUPPORTED(filter, field)\ @@ -2721,6 +2722,18 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, if (ret) return ret; break; + case IB_FLOW_SPEC_ACTION_COUNT: + if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count, + LAST_COUNTERS_FIELD)) + return -EOPNOTSUPP; + + /* for now support only one counters spec per flow */ + if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) + return -EINVAL; + + action->counters = ib_spec->flow_count.counters; + action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + break; default: return -EINVAL; } @@ -2868,6 +2881,17 @@ static void put_flow_table(struct mlx5_ib_dev *dev, } } +static void counters_clear_description(struct ib_counters *counters) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + + mutex_lock(&mcounters->mcntrs_mutex); + kfree(mcounters->counters_data); + mcounters->counters_data = NULL; + mcounters->cntrs_max_index = 0; + mutex_unlock(&mcounters->mcntrs_mutex); +} + static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) { struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device); @@ -2887,8 +2911,11 @@ static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) mlx5_del_flow_rules(handler->rule); put_flow_table(dev, handler->prio, true); - mutex_unlock(&dev->flow_db->lock); + if (handler->ibcounters && + atomic_read(&handler->ibcounters->usecnt) == 1) + counters_clear_description(handler->ibcounters); + mutex_unlock(&dev->flow_db->lock); kfree(handler); return 0; @@ -3008,21 +3035,127 @@ static void set_underlay_qp(struct mlx5_ib_dev *dev, } } +static int counters_set_description(struct ib_counters *counters, + enum mlx5_ib_counters_type counters_type, + struct mlx5_ib_flow_counters_desc *desc_data, + u32 ncounters) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + u32 cntrs_max_index = 0; + int i; + + if (counters_type != MLX5_IB_COUNTERS_FLOW) + return -EINVAL; + + /* init the fields for the object */ + mcounters->type = counters_type; + mcounters->ncounters = ncounters; + /* each counter entry have both description and index pair */ + for (i = 0; i < ncounters; i++) { + if (desc_data[i].description > IB_COUNTER_BYTES) + return -EINVAL; + + if (cntrs_max_index <= desc_data[i].index) + cntrs_max_index = desc_data[i].index + 1; + } + + mutex_lock(&mcounters->mcntrs_mutex); + mcounters->counters_data = desc_data; + mcounters->cntrs_max_index = cntrs_max_index; + mutex_unlock(&mcounters->mcntrs_mutex); + + return 0; +} + +#define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2)) +static int flow_counters_set_data(struct ib_counters *ibcounters, + struct mlx5_ib_create_flow *ucmd) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters); 
+ struct mlx5_ib_flow_counters_data *cntrs_data = NULL; + struct mlx5_ib_flow_counters_desc *desc_data = NULL; + bool hw_hndl = false; + int ret = 0; + + if (ucmd && ucmd->ncounters_data != 0) { + cntrs_data = ucmd->data; + if (cntrs_data->ncounters > MAX_COUNTERS_NUM) + return -EINVAL; + + desc_data = kcalloc(cntrs_data->ncounters, + sizeof(*desc_data), + GFP_KERNEL); + if (!desc_data) + return -ENOMEM; + + if (copy_from_user(desc_data, + u64_to_user_ptr(cntrs_data->counters_data), + sizeof(*desc_data) * cntrs_data->ncounters)) { + ret = -EFAULT; + goto free; + } + } + + if (!mcounters->hw_cntrs_hndl) { + mcounters->hw_cntrs_hndl = mlx5_fc_create( + to_mdev(ibcounters->device)->mdev, false); + if (!mcounters->hw_cntrs_hndl) { + ret = -ENOMEM; + goto free; + } + hw_hndl = true; + } + + if (desc_data) { + /* counters already bound to at least one flow */ + if (mcounters->cntrs_max_index) { + ret = -EINVAL; + goto free_hndl; + } + + ret = counters_set_description(ibcounters, + MLX5_IB_COUNTERS_FLOW, + desc_data, + cntrs_data->ncounters); + if (ret) + goto free_hndl; + + } else if (!mcounters->cntrs_max_index) { + /* counters not bound yet, must have udata passed */ + ret = -EINVAL; + goto free_hndl; + } + + return 0; + +free_hndl: + if (hw_hndl) { + mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev, + mcounters->hw_cntrs_hndl); + mcounters->hw_cntrs_hndl = NULL; + } +free: + kfree(desc_data); + return ret; +} + static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, const struct ib_flow_attr *flow_attr, struct mlx5_flow_destination *dst, - u32 underlay_qpn) + u32 underlay_qpn, + struct mlx5_ib_create_flow *ucmd) { struct mlx5_flow_table *ft = ft_prio->flow_table; struct mlx5_ib_flow_handler *handler; struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; struct mlx5_flow_spec *spec; - struct mlx5_flow_destination *rule_dst = dst; + struct mlx5_flow_destination dest_arr[2] = {}; + struct mlx5_flow_destination *rule_dst = dest_arr; const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); unsigned int spec_index; int err = 0; - int dest_num = 1; + int dest_num = 0; bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; if (!is_valid_attr(dev->mdev, flow_attr)) @@ -3036,6 +3169,10 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, } INIT_LIST_HEAD(&handler->list); + if (dst) { + memcpy(&dest_arr[0], dst, sizeof(*dst)); + dest_num++; + } for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { err = parse_flow_attr(dev->mdev, spec->match_criteria, @@ -3070,15 +3207,30 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, goto free; } + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + err = flow_counters_set_data(flow_act.counters, ucmd); + if (err) + goto free; + + handler->ibcounters = flow_act.counters; + dest_arr[dest_num].type = + MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest_arr[dest_num].counter = + to_mcounters(flow_act.counters)->hw_cntrs_hndl; + dest_num++; + } + if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) { - rule_dst = NULL; - dest_num = 0; + if (!(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT)) { + rule_dst = NULL; + dest_num = 0; + } } else { if (is_egress) flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; else flow_act.action |= - dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : + dest_num ? 
MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; } @@ -3104,8 +3256,12 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, ft_prio->flow_table = ft; free: - if (err) + if (err && handler) { + if (handler->ibcounters && + atomic_read(&handler->ibcounters->usecnt) == 1) + counters_clear_description(handler->ibcounters); kfree(handler); + } kvfree(spec); return err ? ERR_PTR(err) : handler; } @@ -3115,7 +3271,7 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, const struct ib_flow_attr *flow_attr, struct mlx5_flow_destination *dst) { - return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0); + return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL); } static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, @@ -3255,12 +3411,43 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, struct mlx5_ib_flow_prio *ft_prio_tx = NULL; struct mlx5_ib_flow_prio *ft_prio; bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS; + struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr; + size_t min_ucmd_sz, required_ucmd_sz; int err; int underlay_qpn; - if (udata && - udata->inlen && !ib_is_udata_cleared(udata, 0, udata->inlen)) - return ERR_PTR(-EOPNOTSUPP); + if (udata && udata->inlen) { + min_ucmd_sz = offsetof(typeof(ucmd_hdr), reserved) + + sizeof(ucmd_hdr.reserved); + if (udata->inlen < min_ucmd_sz) + return ERR_PTR(-EOPNOTSUPP); + + err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz); + if (err) + return ERR_PTR(err); + + /* currently supports only one counters data */ + if (ucmd_hdr.ncounters_data > 1) + return ERR_PTR(-EINVAL); + + required_ucmd_sz = min_ucmd_sz + + sizeof(struct mlx5_ib_flow_counters_data) * + ucmd_hdr.ncounters_data; + if (udata->inlen > required_ucmd_sz && + !ib_is_udata_cleared(udata, required_ucmd_sz, + udata->inlen - required_ucmd_sz)) + return ERR_PTR(-EOPNOTSUPP); + + ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL); + if (!ucmd) + return ERR_PTR(-ENOMEM); + + err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz); + if (err) { + kfree(ucmd); + return ERR_PTR(err); + } + } if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) return ERR_PTR(-ENOMEM); @@ -3315,7 +3502,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ? 
mqp->underlay_qpn : 0; handler = _create_flow_rule(dev, ft_prio, flow_attr, - dst, underlay_qpn); + dst, underlay_qpn, ucmd); } } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { @@ -3336,6 +3523,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, mutex_unlock(&dev->flow_db->lock); kfree(dst); + kfree(ucmd); return &handler->ibflow; @@ -3346,6 +3534,7 @@ destroy_ft: unlock: mutex_unlock(&dev->flow_db->lock); kfree(dst); + kfree(ucmd); kfree(handler); return ERR_PTR(err); } @@ -5010,6 +5199,11 @@ static int mlx5_ib_destroy_counters(struct ib_counters *counters) { struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + counters_clear_description(counters); + if (mcounters->hw_cntrs_hndl) + mlx5_fc_destroy(to_mdev(counters->device)->mdev, + mcounters->hw_cntrs_hndl); + kfree(mcounters); return 0; @@ -5024,6 +5218,8 @@ static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device, if (!mcounters) return ERR_PTR(-ENOMEM); + mutex_init(&mcounters->mcntrs_mutex); + return &mcounters->ibcntrs; } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index fd27ec1aed08..155bca627222 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -175,6 +175,7 @@ struct mlx5_ib_flow_handler { struct ib_flow ibflow; struct mlx5_ib_flow_prio *prio; struct mlx5_flow_handle *rule; + struct ib_counters *ibcounters; }; struct mlx5_ib_flow_db { @@ -813,8 +814,22 @@ struct mlx5_memic { DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES); }; +enum mlx5_ib_counters_type { + MLX5_IB_COUNTERS_FLOW, +}; + struct mlx5_ib_mcounters { struct ib_counters ibcntrs; + enum mlx5_ib_counters_type type; + void *hw_cntrs_hndl; + /* max index set as part of create_flow */ + u32 cntrs_max_index; + /* number of counters data entries ( pair) */ + u32 ncounters; + /* counters data array for descriptions and indexes */ + struct mlx5_ib_flow_counters_desc *counters_data; + /* protects access to mcounters internal data */ + struct mutex mcntrs_mutex; }; static inline struct mlx5_ib_mcounters * diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 3b4c3298061c..757b4a30281e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -160,6 +160,7 @@ struct mlx5_flow_act { u32 modify_id; uintptr_t esp_id; struct mlx5_fs_vlan vlan; + struct ib_counters *counters; }; #define MLX5_DECLARE_FLOW_ACT(name) \ diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index cb4a02c4a1ce..ab71e939eb78 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -36,6 +36,7 @@ #include #include /* For ETH_ALEN. 
*/ +#include enum { MLX5_QP_FLAG_SIGNATURE = 1 << 0, @@ -441,4 +442,27 @@ enum { enum { MLX5_IB_CLOCK_INFO_V1 = 0, }; + +struct mlx5_ib_flow_counters_desc { + __u32 description; + __u32 index; +}; + +struct mlx5_ib_flow_counters_data { + RDMA_UAPI_PTR(struct mlx5_ib_flow_counters_desc *, counters_data); + __u32 ncounters; + __u32 reserved; +}; + +struct mlx5_ib_create_flow { + __u32 ncounters_data; + __u32 reserved; + /* + * Following are counters data based on ncounters_data, each + * entry in the data[] should match a corresponding counter object + * that was pointed by a counters spec upon the flow creation + */ + struct mlx5_ib_flow_counters_data data[]; +}; + #endif /* MLX5_ABI_USER_H */ -- cgit v1.2.3 From 5e95af5f7b60796ccd890a39c0ed9c5df3537952 Mon Sep 17 00:00:00 2001 From: Raed Salem Date: Thu, 31 May 2018 16:43:40 +0300 Subject: IB/mlx5: Add flow counters read support Implements the flow counters read wrapper. Reviewed-by: Yishai Hadas Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 15 +++++++++++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 13 ++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c52841bad4e7..59e9d10e54b7 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3035,6 +3035,19 @@ static void set_underlay_qp(struct mlx5_ib_dev *dev, } } +static int read_flow_counters(struct ib_device *ibdev, + struct mlx5_read_counters_attr *read_attr) +{ + struct mlx5_fc *fc = read_attr->hw_cntrs_hndl; + struct mlx5_ib_dev *dev = to_mdev(ibdev); + + return mlx5_fc_query(dev->mdev, fc, + &read_attr->out[IB_COUNTER_PACKETS], + &read_attr->out[IB_COUNTER_BYTES]); +} + +/* flow counters currently expose two counters packets and bytes */ +#define FLOW_COUNTERS_NUM 2 static int counters_set_description(struct ib_counters *counters, enum mlx5_ib_counters_type counters_type, struct mlx5_ib_flow_counters_desc *desc_data, @@ -3049,6 +3062,8 @@ static int counters_set_description(struct ib_counters *counters, /* init the fields for the object */ mcounters->type = counters_type; + mcounters->read_counters = read_flow_counters; + mcounters->counters_num = FLOW_COUNTERS_NUM; mcounters->ncounters = ncounters; /* each counter entry have both description and index pair */ for (i = 0; i < ncounters; i++) { diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 155bca627222..d89c8fe626f6 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -814,6 +814,12 @@ struct mlx5_memic { DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES); }; +struct mlx5_read_counters_attr { + struct mlx5_fc *hw_cntrs_hndl; + u64 *out; + u32 flags; +}; + enum mlx5_ib_counters_type { MLX5_IB_COUNTERS_FLOW, }; @@ -821,7 +827,12 @@ enum mlx5_ib_counters_type { struct mlx5_ib_mcounters { struct ib_counters ibcntrs; enum mlx5_ib_counters_type type; - void *hw_cntrs_hndl; + /* number of counters supported for this counters type */ + u32 counters_num; + struct mlx5_fc *hw_cntrs_hndl; + /* read function for this counters type */ + int (*read_counters)(struct ib_device *ibdev, + struct mlx5_read_counters_attr *read_attr); /* max index set as part of create_flow */ u32 cntrs_max_index; /* number of counters data entries ( pair) */ -- cgit v1.2.3 From 1a1e03dc15cfa94b7e878a32a979705df614d9c4 Mon Sep 17 00:00:00 2001 
From: Raed Salem Date: Thu, 31 May 2018 16:43:41 +0300 Subject: IB/mlx5: Add counters read support This patch implements the uverbs counters read API; it uses the read counters function specific to the given counters type to accomplish its task. Reviewed-by: Yishai Hadas Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 43 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 59e9d10e54b7..7a563478d0b2 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5210,6 +5210,48 @@ static void depopulate_specs_root(struct mlx5_ib_dev *dev) uverbs_free_spec_tree(dev->ib_dev.specs_root); } +static int mlx5_ib_read_counters(struct ib_counters *counters, + struct ib_counters_read_attr *read_attr, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); + struct mlx5_read_counters_attr mread_attr = {}; + struct mlx5_ib_flow_counters_desc *desc; + int ret, i; + + mutex_lock(&mcounters->mcntrs_mutex); + if (mcounters->cntrs_max_index > read_attr->ncounters) { + ret = -EINVAL; + goto err_bound; + } + + mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64), + GFP_KERNEL); + if (!mread_attr.out) { + ret = -ENOMEM; + goto err_bound; + } + + mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl; + mread_attr.flags = read_attr->flags; + ret = mcounters->read_counters(counters->device, &mread_attr); + if (ret) + goto err_read; + + /* do the pass over the counters data array to assign according to the + * descriptions and indexing pairs + */ + desc = mcounters->counters_data; + for (i = 0; i < mcounters->ncounters; i++) + read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description]; + +err_read: + kfree(mread_attr.out); +err_bound: + mutex_unlock(&mcounters->mcntrs_mutex); + return ret; +} + static int mlx5_ib_destroy_counters(struct ib_counters *counters) { struct mlx5_ib_mcounters *mcounters = to_mcounters(counters); @@ -5483,6 +5525,7 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) dev->ib_dev.driver_id = RDMA_DRIVER_MLX5; dev->ib_dev.create_counters = mlx5_ib_create_counters; dev->ib_dev.destroy_counters = mlx5_ib_destroy_counters; + dev->ib_dev.read_counters = mlx5_ib_read_counters; err = init_node_data(dev); if (err) -- cgit v1.2.3 From 0e12af84cdd3056460f928adc164f9e87f4b303b Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 31 May 2018 11:05:23 +0300 Subject: IB/isert: fix T10-pi check mask setting A copy/paste bug (probably) caused an app_tag check mask to be set in a case where a ref_tag check was needed. Fixes: 38a2d0d429f1 ("IB/isert: convert to the generic RDMA READ/WRITE API") Fixes: 9e961ae73c2c ("IB/isert: Support T10-PI protected transactions") Cc: stable@vger.kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Martin K.
Petersen Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/isert/ib_isert.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 6a55b870b863..3130698fee70 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2116,7 +2116,7 @@ isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) sig_attrs->check_mask = (se_cmd->prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | - (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | + (se_cmd->prot_checks & TARGET_DIF_CHECK_APPTAG ? 0x30 : 0) | (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0); return 0; } -- cgit v1.2.3 From c6c2c03a665af99785341eb0481f4170942890d2 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 31 May 2018 11:05:25 +0300 Subject: IB/iser: use T10-PI check mask definitions from core layer No reason to re-define protection information check in ib_iser driver. Use check masks from RDMA core driver. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iscsi_iser.h | 4 ---- drivers/infiniband/ulp/iser/iser_memory.c | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index c30ceb08db21..120b40829560 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -383,10 +383,6 @@ struct iser_device { bool remote_inv_sup; }; -#define ISER_CHECK_GUARD 0xc0 -#define ISER_CHECK_REFTAG 0x0f -#define ISER_CHECK_APPTAG 0x30 - /** * struct iser_reg_resources - Fast registration recources * diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 322209d5ff58..ca844a926e6a 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -362,9 +362,9 @@ iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) { *mask = 0; if (sc->prot_flags & SCSI_PROT_REF_CHECK) - *mask |= ISER_CHECK_REFTAG; + *mask |= IB_SIG_CHECK_REFTAG; if (sc->prot_flags & SCSI_PROT_GUARD_CHECK) - *mask |= ISER_CHECK_GUARD; + *mask |= IB_SIG_CHECK_GUARD; } static inline void -- cgit v1.2.3 From 26232872b1e957eb4ad10b07ae9e1cd2999848d2 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Thu, 31 May 2018 11:05:26 +0300 Subject: IB/isert: use T10-PI check mask definitions from core layer No reason to use hard-coded protection information checks in ib_isert driver. Use check masks from RDMA core driver. Also, while we are here, reduce the number of instructions needed to set the check mask (there is no need to do a bitwise OR with 0 since we zero the mask at the beginning of the function). Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Martin K.
Petersen Signed-off-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/isert/ib_isert.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 3130698fee70..f2f9318e1f49 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2114,10 +2114,13 @@ isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) return -EINVAL; } - sig_attrs->check_mask = - (se_cmd->prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | - (se_cmd->prot_checks & TARGET_DIF_CHECK_APPTAG ? 0x30 : 0) | - (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0); + if (se_cmd->prot_checks & TARGET_DIF_CHECK_GUARD) + sig_attrs->check_mask |= IB_SIG_CHECK_GUARD; + if (se_cmd->prot_checks & TARGET_DIF_CHECK_APPTAG) + sig_attrs->check_mask |= IB_SIG_CHECK_APPTAG; + if (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG) + sig_attrs->check_mask |= IB_SIG_CHECK_REFTAG; + return 0; } -- cgit v1.2.3 From e4b1672ac0a54c7740cbc4ff39dfdc56182236cb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 30 May 2018 23:58:18 +0200 Subject: iw_cxgb4: add INFINIBAND_ADDR_TRANS dependency The newly added fill_res_ep_entry function fails to link if CONFIG_INFINIBAND_ADDR_TRANS is not set: drivers/infiniband/hw/cxgb4/restrack.o: In function `fill_res_ep_entry': restrack.c:(.text+0x3cc): undefined reference to `rdma_res_to_id' restrack.c:(.text+0x3d0): undefined reference to `rdma_iw_cm_id' This adds a Kconfig dependency for the driver. Fixes: 116aeb887371 ("iw_cxgb4: provide detailed provider-specific CM_ID information") Signed-off-by: Arnd Bergmann Acked-by: Greg Thelen Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig index 0a671a61fc92..e0522a5d5a06 100644 --- a/drivers/infiniband/hw/cxgb4/Kconfig +++ b/drivers/infiniband/hw/cxgb4/Kconfig @@ -1,6 +1,7 @@ config INFINIBAND_CXGB4 tristate "Chelsio T4/T5 RDMA Driver" depends on CHELSIO_T4 && INET + depends on INFINIBAND_ADDR_TRANS select CHELSIO_LIB select GENERIC_ALLOCATOR ---help--- -- cgit v1.2.3 From cb2595c1393b4a5211534e6f0a0fbad369e21ad8 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 1 Jun 2018 11:31:44 -0700 Subject: infiniband: fix a possible use-after-free bug ucma_process_join() will free the newly allocated "mc" struct, if there is any error after that, especially the copy_to_user(). But in parallel, ucma_leave_multicast() could find this "mc" through idr_find() before ucma_process_join() frees it, since it is already published. So "mc" could be used in ucma_leave_multicast() after it has been allocated and freed in ucma_process_join(), since we don't refcnt it. Fix this by separating "publish" from ID allocation, so that we can get an ID first and publish it later after copy_to_user().
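The intended ordering can be sketched as follows. This is an illustrative sketch only, based on the description above and on the idr_alloc()/idr_replace() calls visible in the diff below; it is not the verbatim ucma code, and names such as resp, response and err_free_mc are placeholders:

	mutex_lock(&mut);
	mc->id = idr_alloc(&multicast_idr, NULL, 0, 0, GFP_KERNEL); /* reserve the ID; lookups still see NULL */
	mutex_unlock(&mut);
	if (mc->id < 0)
		goto err_free_mc;

	/* ... perform the join and fill in the response ... */

	if (copy_to_user(response, &resp, sizeof(resp)))
		goto err_free_mc; /* "mc" was never published, so freeing it cannot race */

	mutex_lock(&mut);
	idr_replace(&multicast_idr, mc, mc->id); /* publish only after the last failure point */
	mutex_unlock(&mut);

With this ordering, ucma_leave_multicast() can only observe a fully set up "mc" that will no longer be freed on an error path.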
Fixes: c8f6a362bf3e ("RDMA/cma: Add multicast communication support") Reported-by: Noam Rathaus Signed-off-by: Cong Wang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index eab43b17e9cf..ec8fb289621f 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -235,7 +235,7 @@ static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx) return NULL; mutex_lock(&mut); - mc->id = idr_alloc(&multicast_idr, mc, 0, 0, GFP_KERNEL); + mc->id = idr_alloc(&multicast_idr, NULL, 0, 0, GFP_KERNEL); mutex_unlock(&mut); if (mc->id < 0) goto error; @@ -1421,6 +1421,10 @@ static ssize_t ucma_process_join(struct ucma_file *file, goto err3; } + mutex_lock(&mut); + idr_replace(&multicast_idr, mc, mc->id); + mutex_unlock(&mut); + mutex_unlock(&file->mut); ucma_put_ctx(ctx); return 0; -- cgit v1.2.3 From 8c61b24585c44e1de337e45858129abce9c3a008 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sun, 3 Jun 2018 17:32:22 +0800 Subject: IB/hns: Use zeroing memory allocator instead of allocator/memset Use dma_zalloc_coherent for allocating zeroed memory and remove unnecessary memset function. Signed-off-by: YueHaibing Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_alloc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c index a40ec939ece5..46f65f9f59d0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c @@ -197,7 +197,8 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, buf->npages = 1 << order; buf->page_shift = page_shift; /* MTT PA must be recorded in 4k alignment, t is 4k aligned */ - buf->direct.buf = dma_alloc_coherent(dev, size, &t, GFP_KERNEL); + buf->direct.buf = dma_zalloc_coherent(dev, + size, &t, GFP_KERNEL); if (!buf->direct.buf) return -ENOMEM; @@ -207,8 +208,6 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, --buf->page_shift; buf->npages *= 2; } - - memset(buf->direct.buf, 0, size); } else { buf->nbufs = (size + page_size - 1) / page_size; buf->npages = buf->nbufs; @@ -220,7 +219,7 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, return -ENOMEM; for (i = 0; i < buf->nbufs; ++i) { - buf->page_list[i].buf = dma_alloc_coherent(dev, + buf->page_list[i].buf = dma_zalloc_coherent(dev, page_size, &t, GFP_KERNEL); @@ -228,7 +227,6 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, goto err_free; buf->page_list[i].map = t; - memset(buf->page_list[i].buf, 0, page_size); } } -- cgit v1.2.3 From 1bc0299d976e000ececc6acd76e33b4582646cb7 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Thu, 31 May 2018 11:30:09 -0700 Subject: IB/hfi1: Fix user context tail allocation for DMA_RTAIL The following code fails to allocate a buffer for the tail address that the hardware DMAs into when the user context DMA_RTAIL is set. if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent( &dd->pcidev->dev, PAGE_SIZE, &dma_hdrqtail, gfp_flags); if (!rcd->rcvhdrtail_kvaddr) goto bail_free; rcd->rcvhdrqtailaddr_dma = dma_hdrqtail; } So the rcvhdrtail_kvaddr would then be NULL. The mmap logic fails to check for a NULL rcvhdrtail_kvaddr. 
The fix is to test for both user and kernel DMA_RTAIL options during the allocation as well as testing for a NULL rcvhdrtail_kvaddr during the mmap processing. Additionally, all downstream testing of the capmask for DMA_RTAIL has been eliminated in favor of testing rcvhdrtail_kvaddr. Cc: # 4.9.x Reviewed-by: Michael J. Ruhl Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 8 ++++---- drivers/infiniband/hw/hfi1/file_ops.c | 2 +- drivers/infiniband/hw/hfi1/init.c | 9 ++++----- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 68580cb2ae1e..f75080d63142 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -6841,7 +6841,7 @@ static void rxe_kernel_unfreeze(struct hfi1_devdata *dd) } rcvmask = HFI1_RCVCTRL_CTXT_ENB; /* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */ - rcvmask |= HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ? + rcvmask |= rcd->rcvhdrtail_kvaddr ? HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; hfi1_rcvctrl(dd, rcvmask, rcd); hfi1_rcd_put(rcd); @@ -8367,7 +8367,7 @@ static inline int check_packet_present(struct hfi1_ctxtdata *rcd) u32 tail; int present; - if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) + if (!rcd->rcvhdrtail_kvaddr) present = (rcd->seq_cnt == rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd)))); else /* is RDMA rtail */ @@ -11843,7 +11843,7 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, /* reset the tail and hdr addresses, and sequence count */ write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR, rcd->rcvhdrq_dma); - if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) + if (rcd->rcvhdrtail_kvaddr) write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, rcd->rcvhdrqtailaddr_dma); rcd->seq_cnt = 1; @@ -11923,7 +11923,7 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK; if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK; - if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_dma) + if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && rcd->rcvhdrtail_kvaddr) rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; if (op & HFI1_RCVCTRL_TAILUPD_DIS) { /* See comment on RcvCtxtCtrl.TailUpd above */ diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index c9d23c37a371..0fc4aa9455c3 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -505,7 +505,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) ret = -EINVAL; goto done; } - if (flags & VM_WRITE) { + if ((flags & VM_WRITE) || !uctxt->rcvhdrtail_kvaddr) { ret = -EPERM; goto done; } diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 3feecf926322..4a478ee0a79b 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1853,7 +1853,6 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) u64 reg; if (!rcd->rcvhdrq) { - dma_addr_t dma_hdrqtail; gfp_t gfp_flags; /* @@ -1878,13 +1877,13 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) goto bail; } - if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) || + HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) { rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent( - &dd->pcidev->dev, PAGE_SIZE, &dma_hdrqtail, - gfp_flags); + &dd->pcidev->dev, PAGE_SIZE,
+ &rcd->rcvhdrqtailaddr_dma, gfp_flags); if (!rcd->rcvhdrtail_kvaddr) goto bail_free; - rcd->rcvhdrqtailaddr_dma = dma_hdrqtail; } rcd->rcvhdrq_size = amt; -- cgit v1.2.3 From f9458bc2c1303bcbd02645de0d59e4b0210c669a Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Thu, 31 May 2018 11:30:17 -0700 Subject: IB/hfi1: Ensure VL index is within bounds Improve the safety of the code and ensure the array cannot be indexed out of bounds when picking the CPU for a given SDMA engine. Reviewed-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Kaike Wan Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/sdma.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 1f203309cf24..298e0e3fc0c9 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -923,9 +923,10 @@ ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf, cpumask_var_t mask, new_mask; unsigned long cpu; int ret, vl, sz; + struct sdma_rht_node *rht_node; vl = sdma_engine_get_vl(sde); - if (unlikely(vl < 0)) + if (unlikely(vl < 0 || vl >= ARRAY_SIZE(rht_node->map))) return -EINVAL; ret = zalloc_cpumask_var(&mask, GFP_KERNEL); @@ -953,19 +954,12 @@ ssize_t sdma_set_cpu_to_sde_map(struct sdma_engine *sde, const char *buf, mutex_lock(&process_to_sde_mutex); for_each_cpu(cpu, mask) { - struct sdma_rht_node *rht_node; - /* Check if we have this already mapped */ if (cpumask_test_cpu(cpu, &sde->cpu_mask)) { cpumask_set_cpu(cpu, new_mask); continue; } - if (vl >= ARRAY_SIZE(rht_node->map)) { - ret = -EINVAL; - goto out; - } - rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu, sdma_rht_params); if (!rht_node) { -- cgit v1.2.3 From 5465f11083629e99cb34767790316ea076f7502f Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 4 Jun 2018 11:43:12 -0700 Subject: IB/hfi1: Remove unused variable The variable extended_psn was not used any more. Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/user_sdma.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index a3d192424344..d2bc77f75253 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -1,7 +1,7 @@ #ifndef _HFI1_USER_SDMA_H #define _HFI1_USER_SDMA_H /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -122,8 +122,6 @@ static inline int ahg_header_set(u32 *arr, int idx, size_t array_size, (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \ ##__VA_ARGS__) -extern uint extended_psn; - struct hfi1_user_sdma_pkt_q { u16 ctxt; u16 subctxt; -- cgit v1.2.3 From dc2b2a917c3427223188ac476afc915831b1244c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 4 Jun 2018 11:43:21 -0700 Subject: IB/hfi1: Add bypass register defines and replace blind constants These registers were not added in the 16B work. 
Add them and replace blind constants with the correct defines. Fixes: 72c07e2b671e ("IB/hfi1: Add support to receive 16B bypass packets") Reviewed-by: Don Hiatt Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 4 +++- drivers/infiniband/hw/hfi1/chip_registers.h | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index f75080d63142..6deb101cdd43 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -14640,7 +14640,9 @@ static void init_rxe(struct hfi1_devdata *dd) /* Have 16 bytes (4DW) of bypass header available in header queue */ val = read_csr(dd, RCV_BYPASS); - val |= (4ull << 16); + val &= ~RCV_BYPASS_HDR_SIZE_SMASK; + val |= ((4ull & RCV_BYPASS_HDR_SIZE_MASK) << + RCV_BYPASS_HDR_SIZE_SHIFT); write_csr(dd, RCV_BYPASS, val); } diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h index da598b5fe8f6..ee6dca5e2a2f 100644 --- a/drivers/infiniband/hw/hfi1/chip_registers.h +++ b/drivers/infiniband/hw/hfi1/chip_registers.h @@ -638,6 +638,12 @@ #define RCV_BTH_QP_KDETH_QP_MASK 0xFFull #define RCV_BTH_QP_KDETH_QP_SHIFT 16 #define RCV_BYPASS (RXE + 0x000000000038) +#define RCV_BYPASS_HDR_SIZE_SHIFT 16 +#define RCV_BYPASS_HDR_SIZE_MASK 0x1Full +#define RCV_BYPASS_HDR_SIZE_SMASK 0x1F0000ull +#define RCV_BYPASS_BYPASS_CONTEXT_SHIFT 0 +#define RCV_BYPASS_BYPASS_CONTEXT_MASK 0xFFull +#define RCV_BYPASS_BYPASS_CONTEXT_SMASK 0xFFull #define RCV_CONTEXTS (RXE + 0x000000000010) #define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400) #define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500) -- cgit v1.2.3 From ed71e86a8d66ec018d16047cdd507f95c89e257b Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 4 Jun 2018 11:43:54 -0700 Subject: IB/hfi1: Rename exp_lock to exp_mutex The mutex exp_lock in struct hfi1_ctxtdata is used to protect all Expected TID data of a user context. This patch renames it to exp_mutex to better reflect its identity and prepare for upcoming patches. 
Reviewed-by: Ashutosh Dixit Reviewed-by: Mike Marciniszyn Signed-off-by: Harish Chegondi Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/hfi.h | 4 ++-- drivers/infiniband/hw/hfi1/init.c | 2 +- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 5eb3bf0849c7..4ab8b5bfbed1 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -245,8 +245,8 @@ struct hfi1_ctxtdata { struct exp_tid_set tid_used_list; struct exp_tid_set tid_full_list; - /* lock protecting all Expected TID data */ - struct mutex exp_lock; + /* lock protecting all Expected TID data of user contexts */ + struct mutex exp_mutex; /* per-context configuration flags */ unsigned long flags; /* per-context event flags for fileops/intr communication */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 4a478ee0a79b..561ad66d0ab3 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -368,7 +368,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, rcd->numa_id = numa; rcd->rcv_array_groups = dd->rcv_entries.ngroups; - mutex_init(&rcd->exp_lock); + mutex_init(&rcd->exp_mutex); hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt); diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c index 6a4c5142515a..dbe7d14a5c76 100644 --- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2017 Intel Corporation. + * Copyright(c) 2015-2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -375,7 +375,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, * From this point on, we are going to be using shared (between master * and subcontexts) context resources. We need to take the lock. */ - mutex_lock(&uctxt->exp_lock); + mutex_lock(&uctxt->exp_mutex); /* * The first step is to program the RcvArray entries which are complete * groups. @@ -461,7 +461,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd, } } unlock: - mutex_unlock(&uctxt->exp_lock); + mutex_unlock(&uctxt->exp_mutex); nomem: hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, mapped_pages, ret); @@ -517,7 +517,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, if (IS_ERR(tidinfo)) return PTR_ERR(tidinfo); - mutex_lock(&uctxt->exp_lock); + mutex_lock(&uctxt->exp_mutex); for (tididx = 0; tididx < tinfo->tidcnt; tididx++) { ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL); if (ret) { @@ -530,7 +530,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, fd->tid_used -= tididx; spin_unlock(&fd->tid_lock); tinfo->tidcnt = tididx; - mutex_unlock(&uctxt->exp_lock); + mutex_unlock(&uctxt->exp_mutex); kfree(tidinfo); return ret; -- cgit v1.2.3 From d9a6ce68a0117fd13785d8316bf24f61bb6e2e72 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 4 Jun 2018 11:44:11 -0700 Subject: IB/hfi1: Fix comment on default hdr entry size The comment for the default header queue entry size is incorrect. Correct the comment and fix the resulting S_IRUGO warning that shows up in the widened patch context. Reviewed-by: Michael J. 
Ruhl Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 561ad66d0ab3..f110842b91f5 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -113,8 +113,8 @@ module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO); MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)"); static uint hfi1_hdrq_entsize = 32; -module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO); -MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B"); +module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, 0444); +MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B, 32 - 128B (default)"); unsigned int user_credit_return_threshold = 33; /* default is 33% */ module_param(user_credit_return_threshold, uint, S_IRUGO); -- cgit v1.2.3 From 33edc3b2db8735cafa9d72f8510dc6bb394ddec8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 5 Jun 2018 07:48:08 +0300 Subject: RDMA/restrack: Change SPDX tag to properly reflect license Resource tracking is supposed to be dual licensed: GPL-2.0 and OpenIB, but the SPDX tag was not compliant with it. Update the tag to properly reflect the license. Fixes: 02d8883f520e ("RDMA/restrack: Add general infrastructure to track RDMA resources") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 2 +- include/rdma/restrack.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 172b517dc7b9..3b7fa0ccaa08 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. */ diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 637968589922..9654d33edd98 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. */ -- cgit v1.2.3 From c1191a19fecad92b73c25770a7f47174280ca564 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 5 Jun 2018 07:53:53 +0300 Subject: RDMA/mlx5: Update SPDX tags to show proper license Mellanox code is supposed to be OpenIB compliant code, so let's update SPDX tags to show it. Fixes: fc385b7ac480 ("IB/mlx5: Add basic regiser/unregister representors code") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/ib_rep.c | 2 +- drivers/infiniband/hw/mlx5/ib_rep.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 0e04fdddf670..35a0e04c38f2 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2018 Mellanox Technologies. All rights reserved.
*/ diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h index 046fd942fd46..2ba73636a2fb 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.h +++ b/drivers/infiniband/hw/mlx5/ib_rep.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ /* * Copyright (c) 2018 Mellanox Technologies. All rights reserved. */ -- cgit v1.2.3