Diffstat (limited to 'drivers/infiniband/core')
32 files changed, 1679 insertions, 712 deletions
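A recurring theme in the hunks below (cm.c, cma.c, mad.c) is the conversion of IDR-based ID management to the XArray allocating API. As a reading aid, here is a minimal, self-contained sketch of the cyclic-allocation pattern those hunks follow; the names (my_table, my_next, struct my_obj, my_obj_*) are hypothetical and are not part of the patch itself.

#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC(my_table);	/* ID-allocating xarray, IDs start at 0 */
static u32 my_next;			/* cursor for cyclic ID allocation */

struct my_obj {
	u32 id;
};

static int my_obj_insert(struct my_obj *obj)
{
	int ret;

	/* xa_alloc_cyclic() returns 0 on success, 1 if the ID space
	 * wrapped, or a negative errno; locking is handled internally.
	 */
	ret = xa_alloc_cyclic(&my_table, &obj->id, obj, xa_limit_32b,
			      &my_next, GFP_KERNEL);
	return ret < 0 ? ret : 0;
}

static struct my_obj *my_obj_lookup(u32 id)
{
	return xa_load(&my_table, id);
}

static void my_obj_remove(struct my_obj *obj)
{
	xa_erase(&my_table, obj->id);
}

Because the XArray takes its own lock and can sleep with GFP_KERNEL, the former idr_preload()/spin_lock_irqsave() pairs disappear in the converted call sites below (e.g. cm_alloc_id() and ib_register_mad_agent()); the _irq variants (xa_alloc_cyclic_irq(), xa_erase_irq()) are used where lookups may happen from IRQ context, as in cm.c.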
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 744b6ec0acb0..ba01b90c04e7 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -45,6 +45,7 @@ #include <net/ipv6_stubs.h> #include <net/ip6_route.h> #include <rdma/ib_addr.h> +#include <rdma/ib_cache.h> #include <rdma/ib_sa.h> #include <rdma/ib.h> #include <rdma/rdma_netlink.h> diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 43c67e5f43c6..18e476b3ced0 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -78,11 +78,22 @@ enum gid_table_entry_state { GID_TABLE_ENTRY_PENDING_DEL = 3, }; +struct roce_gid_ndev_storage { + struct rcu_head rcu_head; + struct net_device *ndev; +}; + struct ib_gid_table_entry { struct kref kref; struct work_struct del_work; struct ib_gid_attr attr; void *context; + /* Store the ndev pointer to release reference later on in + * call_rcu context because by that time gid_table_entry + * and attr might be already freed. So keep a copy of it. + * ndev_storage is freed by rcu callback. + */ + struct roce_gid_ndev_storage *ndev_storage; enum gid_table_entry_state state; }; @@ -206,6 +217,20 @@ static void schedule_free_gid(struct kref *kref) queue_work(ib_wq, &entry->del_work); } +static void put_gid_ndev(struct rcu_head *head) +{ + struct roce_gid_ndev_storage *storage = + container_of(head, struct roce_gid_ndev_storage, rcu_head); + + WARN_ON(!storage->ndev); + /* At this point its safe to release netdev reference, + * as all callers working on gid_attr->ndev are done + * using this netdev. + */ + dev_put(storage->ndev); + kfree(storage); +} + static void free_gid_entry_locked(struct ib_gid_table_entry *entry) { struct ib_device *device = entry->attr.device; @@ -228,8 +253,8 @@ static void free_gid_entry_locked(struct ib_gid_table_entry *entry) /* Now this index is ready to be allocated */ write_unlock_irq(&table->rwlock); - if (entry->attr.ndev) - dev_put(entry->attr.ndev); + if (entry->ndev_storage) + call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev); kfree(entry); } @@ -266,14 +291,25 @@ static struct ib_gid_table_entry * alloc_gid_entry(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry; + struct net_device *ndev; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return NULL; + + ndev = rcu_dereference_protected(attr->ndev, 1); + if (ndev) { + entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage), + GFP_KERNEL); + if (!entry->ndev_storage) { + kfree(entry); + return NULL; + } + dev_hold(ndev); + entry->ndev_storage->ndev = ndev; + } kref_init(&entry->kref); memcpy(&entry->attr, attr, sizeof(*attr)); - if (entry->attr.ndev) - dev_hold(entry->attr.ndev); INIT_WORK(&entry->del_work, free_gid_work); entry->state = GID_TABLE_ENTRY_INVALID; return entry; @@ -343,6 +379,7 @@ static int add_roce_gid(struct ib_gid_table_entry *entry) static void del_gid(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table, int ix) { + struct roce_gid_ndev_storage *ndev_storage; struct ib_gid_table_entry *entry; lockdep_assert_held(&table->lock); @@ -360,6 +397,13 @@ static void del_gid(struct ib_device *ib_dev, u8 port, table->data_vec[ix] = NULL; write_unlock_irq(&table->rwlock); + ndev_storage = entry->ndev_storage; + if (ndev_storage) { + entry->ndev_storage = NULL; + rcu_assign_pointer(entry->attr.ndev, NULL); + call_rcu(&ndev_storage->rcu_head, put_gid_ndev); + } + if (rdma_cap_roce_gid_table(ib_dev, port)) ib_dev->ops.del_gid(&entry->attr, &entry->context); @@ 
-543,30 +587,11 @@ out_unlock: int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct ib_gid_attr *attr) { - struct net_device *idev; - unsigned long mask; - int ret; - - idev = ib_device_get_netdev(ib_dev, port); - if (idev && attr->ndev != idev) { - union ib_gid default_gid; - - /* Adding default GIDs is not permitted */ - make_default_gid(idev, &default_gid); - if (!memcmp(gid, &default_gid, sizeof(*gid))) { - dev_put(idev); - return -EPERM; - } - } - if (idev) - dev_put(idev); - - mask = GID_ATTR_FIND_MASK_GID | - GID_ATTR_FIND_MASK_GID_TYPE | - GID_ATTR_FIND_MASK_NETDEV; + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV; - ret = __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); - return ret; + return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false); } static int @@ -1263,11 +1288,72 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) read_lock_irqsave(&table->rwlock, flags); valid = is_gid_entry_valid(table->data_vec[attr->index]); - if (valid && attr->ndev && (READ_ONCE(attr->ndev->flags) & IFF_UP)) - ndev = attr->ndev; + if (valid) { + ndev = rcu_dereference(attr->ndev); + if (!ndev || + (ndev && ((READ_ONCE(ndev->flags) & IFF_UP) == 0))) + ndev = ERR_PTR(-ENODEV); + } read_unlock_irqrestore(&table->rwlock, flags); return ndev; } +EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu); + +static int get_lower_dev_vlan(struct net_device *lower_dev, void *data) +{ + u16 *vlan_id = data; + + if (is_vlan_dev(lower_dev)) + *vlan_id = vlan_dev_vlan_id(lower_dev); + + /* We are interested only in first level vlan device, so + * always return 1 to stop iterating over next level devices. + */ + return 1; +} + +/** + * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address + * of a GID entry. + * + * @attr: GID attribute pointer whose L2 fields to be read + * @vlan_id: Pointer to vlan id to fill up if the GID entry has + * vlan id. It is optional. + * @smac: Pointer to smac to fill up for a GID entry. It is optional. + * + * rdma_read_gid_l2_fields() returns 0 on success and returns vlan id + * (if gid entry has vlan) and source MAC, or returns error. + */ +int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr, + u16 *vlan_id, u8 *smac) +{ + struct net_device *ndev; + + rcu_read_lock(); + ndev = rcu_dereference(attr->ndev); + if (!ndev) { + rcu_read_unlock(); + return -ENODEV; + } + if (smac) + ether_addr_copy(smac, ndev->dev_addr); + if (vlan_id) { + *vlan_id = 0xffff; + if (is_vlan_dev(ndev)) { + *vlan_id = vlan_dev_vlan_id(ndev); + } else { + /* If the netdev is upper device and if it's lower + * device is vlan device, consider vlan id of the + * the lower vlan device for this gid entry. 
+ */ + netdev_walk_all_lower_dev_rcu(attr->ndev, + get_lower_dev_vlan, vlan_id); + } + } + rcu_read_unlock(); + return 0; +} +EXPORT_SYMBOL(rdma_read_gid_l2_fields); static int config_non_roce_gid_cache(struct ib_device *device, u8 port, int gid_tbl_len) @@ -1392,7 +1478,6 @@ static void ib_cache_event(struct ib_event_handler *handler, event->event == IB_EVENT_PORT_ACTIVE || event->event == IB_EVENT_LID_CHANGE || event->event == IB_EVENT_PKEY_CHANGE || - event->event == IB_EVENT_SM_CHANGE || event->event == IB_EVENT_CLIENT_REREGISTER || event->event == IB_EVENT_GID_CHANGE) { work = kmalloc(sizeof *work, GFP_ATOMIC); diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index b9416a6fca36..da10e6ccb43c 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -52,6 +52,7 @@ #include <rdma/ib_cache.h> #include <rdma/ib_cm.h> #include "cm_msgs.h" +#include "core_priv.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("InfiniBand CM"); @@ -124,7 +125,8 @@ static struct ib_cm { struct rb_root remote_qp_table; struct rb_root remote_id_table; struct rb_root remote_sidr_table; - struct idr local_id_table; + struct xarray local_id_table; + u32 local_id_next; __be32 random_id_operand; struct list_head timewait_list; struct workqueue_struct *wq; @@ -219,7 +221,6 @@ struct cm_port { struct cm_device { struct list_head list; struct ib_device *ib_device; - struct device *device; u8 ack_delay; int going_down; struct cm_port *port[0]; @@ -598,35 +599,31 @@ static int cm_init_av_by_path(struct sa_path_rec *path, static int cm_alloc_id(struct cm_id_private *cm_id_priv) { - unsigned long flags; - int id; - - idr_preload(GFP_KERNEL); - spin_lock_irqsave(&cm.lock, flags); + int err; + u32 id; - id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); - - spin_unlock_irqrestore(&cm.lock, flags); - idr_preload_end(); + err = xa_alloc_cyclic_irq(&cm.local_id_table, &id, cm_id_priv, + xa_limit_32b, &cm.local_id_next, GFP_KERNEL); cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand; - return id < 0 ? id : 0; + return err; +} + +static u32 cm_local_id(__be32 local_id) +{ + return (__force u32) (local_id ^ cm.random_id_operand); } static void cm_free_id(__be32 local_id) { - spin_lock_irq(&cm.lock); - idr_remove(&cm.local_id_table, - (__force int) (local_id ^ cm.random_id_operand)); - spin_unlock_irq(&cm.lock); + xa_erase_irq(&cm.local_id_table, cm_local_id(local_id)); } static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id) { struct cm_id_private *cm_id_priv; - cm_id_priv = idr_find(&cm.local_id_table, - (__force int) (local_id ^ cm.random_id_operand)); + cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id)); if (cm_id_priv) { if (cm_id_priv->id.remote_id == remote_id) atomic_inc(&cm_id_priv->refcount); @@ -1988,11 +1985,12 @@ static int cm_req_handler(struct cm_work *work) grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr); gid_attr = grh->sgid_attr; - if (gid_attr && gid_attr->ndev) { + if (gid_attr && + rdma_protocol_roce(work->port->cm_dev->ib_device, + work->port->port_num)) { work->path[0].rec_type = sa_conv_gid_to_pathrec_type(gid_attr->gid_type); } else { - /* If no GID attribute or ndev is null, it is not RoCE. 
*/ cm_path_set_rec_type(work->port->cm_dev->ib_device, work->port->port_num, &work->path[0], @@ -2824,9 +2822,8 @@ static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg) spin_unlock_irq(&cm.lock); return NULL; } - cm_id_priv = idr_find(&cm.local_id_table, (__force int) - (timewait_info->work.local_id ^ - cm.random_id_operand)); + cm_id_priv = xa_load(&cm.local_id_table, + cm_local_id(timewait_info->work.local_id)); if (cm_id_priv) { if (cm_id_priv->id.remote_id == remote_id) atomic_inc(&cm_id_priv->refcount); @@ -4276,18 +4273,6 @@ static struct kobj_type cm_counter_obj_type = { .default_attrs = cm_counter_default_attrs }; -static void cm_release_port_obj(struct kobject *obj) -{ - struct cm_port *cm_port; - - cm_port = container_of(obj, struct cm_port, port_obj); - kfree(cm_port); -} - -static struct kobj_type cm_port_obj_type = { - .release = cm_release_port_obj -}; - static char *cm_devnode(struct device *dev, umode_t *mode) { if (mode) @@ -4306,19 +4291,12 @@ static int cm_create_port_fs(struct cm_port *port) { int i, ret; - ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type, - &port->cm_dev->device->kobj, - "%d", port->port_num); - if (ret) { - kfree(port); - return ret; - } - for (i = 0; i < CM_COUNTER_GROUPS; i++) { - ret = kobject_init_and_add(&port->counter_group[i].obj, - &cm_counter_obj_type, - &port->port_obj, - "%s", counter_group_names[i]); + ret = ib_port_register_module_stat(port->cm_dev->ib_device, + port->port_num, + &port->counter_group[i].obj, + &cm_counter_obj_type, + counter_group_names[i]); if (ret) goto error; } @@ -4327,8 +4305,7 @@ static int cm_create_port_fs(struct cm_port *port) error: while (i--) - kobject_put(&port->counter_group[i].obj); - kobject_put(&port->port_obj); + ib_port_unregister_module_stat(&port->counter_group[i].obj); return ret; } @@ -4338,9 +4315,8 @@ static void cm_remove_port_fs(struct cm_port *port) int i; for (i = 0; i < CM_COUNTER_GROUPS; i++) - kobject_put(&port->counter_group[i].obj); + ib_port_unregister_module_stat(&port->counter_group[i].obj); - kobject_put(&port->port_obj); } static void cm_add_one(struct ib_device *ib_device) @@ -4367,13 +4343,6 @@ static void cm_add_one(struct ib_device *ib_device) cm_dev->ib_device = ib_device; cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; - cm_dev->device = device_create(&cm_class, &ib_device->dev, - MKDEV(0, 0), NULL, - "%s", dev_name(&ib_device->dev)); - if (IS_ERR(cm_dev->device)) { - kfree(cm_dev); - return; - } set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask); for (i = 1; i <= ib_device->phys_port_cnt; i++) { @@ -4440,7 +4409,6 @@ error1: cm_remove_port_fs(port); } free: - device_unregister(cm_dev->device); kfree(cm_dev); } @@ -4494,7 +4462,6 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data) cm_remove_port_fs(port); } - device_unregister(cm_dev->device); kfree(cm_dev); } @@ -4502,7 +4469,6 @@ static int __init ib_cm_init(void) { int ret; - memset(&cm, 0, sizeof cm); INIT_LIST_HEAD(&cm.device_list); rwlock_init(&cm.device_lock); spin_lock_init(&cm.lock); @@ -4512,7 +4478,7 @@ static int __init ib_cm_init(void) cm.remote_id_table = RB_ROOT; cm.remote_qp_table = RB_ROOT; cm.remote_sidr_table = RB_ROOT; - idr_init(&cm.local_id_table); + xa_init_flags(&cm.local_id_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand); INIT_LIST_HEAD(&cm.timewait_list); @@ -4538,7 +4504,6 @@ error3: error2: class_unregister(&cm_class); error1: - 
idr_destroy(&cm.local_id_table); return ret; } @@ -4560,9 +4525,8 @@ static void __exit ib_cm_cleanup(void) } class_unregister(&cm_class); - idr_destroy(&cm.local_id_table); + WARN_ON(!xa_empty(&cm.local_id_table)); } module_init(ib_cm_init); module_exit(ib_cm_cleanup); - diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h index 476d4309576d..3d16d614aff6 100644 --- a/drivers/infiniband/core/cm_msgs.h +++ b/drivers/infiniband/core/cm_msgs.h @@ -98,7 +98,7 @@ struct cm_req_msg { u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)]; -} __attribute__ ((packed)); +} __packed; static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg) { @@ -423,7 +423,7 @@ enum cm_msg_response { u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed; static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg) { @@ -461,7 +461,7 @@ struct cm_rej_msg { u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed; static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg) { @@ -506,7 +506,7 @@ struct cm_rep_msg { u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed; static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg) { @@ -614,7 +614,7 @@ struct cm_rtu_msg { u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed; struct cm_dreq_msg { struct ib_mad_hdr hdr; @@ -626,7 +626,7 @@ struct cm_dreq_msg { u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed; static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg) { @@ -647,7 +647,7 @@ struct cm_drep_msg { u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed; struct cm_lap_msg { struct ib_mad_hdr hdr; @@ -675,7 +675,7 @@ struct cm_lap_msg { u8 offset63; u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed; static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg) { @@ -784,7 +784,7 @@ struct cm_apr_msg { u8 info[IB_CM_APR_INFO_LENGTH]; u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed; struct cm_sidr_req_msg { struct ib_mad_hdr hdr; @@ -795,7 +795,7 @@ struct cm_sidr_req_msg { __be64 service_id; u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)]; -} __attribute__ ((packed)); +} __packed; struct cm_sidr_rep_msg { struct ib_mad_hdr hdr; @@ -811,7 +811,7 @@ struct cm_sidr_rep_msg { u8 info[IB_CM_SIDR_REP_INFO_LENGTH]; u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE]; -} __attribute__ ((packed)); +} __packed; static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg) { diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 68c997be2429..19f1730a4f24 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -39,7 +39,7 @@ #include <linux/mutex.h> #include <linux/random.h> #include <linux/igmp.h> -#include <linux/idr.h> +#include <linux/xarray.h> #include <linux/inetdevice.h> #include <linux/slab.h> #include <linux/module.h> @@ -191,10 +191,10 @@ static struct workqueue_struct *cma_wq; static unsigned int cma_pernet_id; struct cma_pernet { - struct idr tcp_ps; - struct idr udp_ps; - struct idr ipoib_ps; - struct idr ib_ps; + struct xarray tcp_ps; + struct xarray udp_ps; + struct xarray ipoib_ps; + struct xarray ib_ps; }; static struct cma_pernet *cma_pernet(struct net *net) @@ -202,7 +202,8 @@ static struct cma_pernet 
*cma_pernet(struct net *net) return net_generic(net, cma_pernet_id); } -static struct idr *cma_pernet_idr(struct net *net, enum rdma_ucm_port_space ps) +static +struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps) { struct cma_pernet *pernet = cma_pernet(net); @@ -247,25 +248,25 @@ struct class_port_info_context { static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps, struct rdma_bind_list *bind_list, int snum) { - struct idr *idr = cma_pernet_idr(net, ps); + struct xarray *xa = cma_pernet_xa(net, ps); - return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL); + return xa_insert(xa, snum, bind_list, GFP_KERNEL); } static struct rdma_bind_list *cma_ps_find(struct net *net, enum rdma_ucm_port_space ps, int snum) { - struct idr *idr = cma_pernet_idr(net, ps); + struct xarray *xa = cma_pernet_xa(net, ps); - return idr_find(idr, snum); + return xa_load(xa, snum); } static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps, int snum) { - struct idr *idr = cma_pernet_idr(net, ps); + struct xarray *xa = cma_pernet_xa(net, ps); - idr_remove(idr, snum); + xa_erase(xa, snum); } enum { @@ -615,6 +616,9 @@ cma_validate_port(struct ib_device *device, u8 port, int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; + if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net)) + return ERR_PTR(-ENODEV); + if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) return ERR_PTR(-ENODEV); @@ -1173,18 +1177,31 @@ static inline bool cma_any_addr(const struct sockaddr *addr) return cma_zero_addr(addr) || cma_loopback_addr(addr); } -static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) +static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst) { if (src->sa_family != dst->sa_family) return -1; switch (src->sa_family) { case AF_INET: - return ((struct sockaddr_in *) src)->sin_addr.s_addr != - ((struct sockaddr_in *) dst)->sin_addr.s_addr; - case AF_INET6: - return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr, - &((struct sockaddr_in6 *) dst)->sin6_addr); + return ((struct sockaddr_in *)src)->sin_addr.s_addr != + ((struct sockaddr_in *)dst)->sin_addr.s_addr; + case AF_INET6: { + struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src; + struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst; + bool link_local; + + if (ipv6_addr_cmp(&src_addr6->sin6_addr, + &dst_addr6->sin6_addr)) + return 1; + link_local = ipv6_addr_type(&dst_addr6->sin6_addr) & + IPV6_ADDR_LINKLOCAL; + /* Link local must match their scope_ids */ + return link_local ? 
(src_addr6->sin6_scope_id != + dst_addr6->sin6_scope_id) : + 0; + } + default: return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr, &((struct sockaddr_ib *) dst)->sib_addr); @@ -1469,6 +1486,7 @@ static struct net_device * roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) { const struct ib_gid_attr *sgid_attr = NULL; + struct net_device *ndev; if (ib_event->event == IB_CM_REQ_RECEIVED) sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; @@ -1477,8 +1495,15 @@ roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) if (!sgid_attr) return NULL; - dev_hold(sgid_attr->ndev); - return sgid_attr->ndev; + + rcu_read_lock(); + ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr); + if (IS_ERR(ndev)) + ndev = NULL; + else + dev_hold(ndev); + rcu_read_unlock(); + return ndev; } static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, @@ -3247,7 +3272,7 @@ static int cma_alloc_port(enum rdma_ucm_port_space ps, goto err; bind_list->ps = ps; - bind_list->port = (unsigned short)ret; + bind_list->port = snum; cma_bind_port(bind_list, id_priv); return 0; err: @@ -4655,10 +4680,10 @@ static int cma_init_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); - idr_init(&pernet->tcp_ps); - idr_init(&pernet->udp_ps); - idr_init(&pernet->ipoib_ps); - idr_init(&pernet->ib_ps); + xa_init(&pernet->tcp_ps); + xa_init(&pernet->udp_ps); + xa_init(&pernet->ipoib_ps); + xa_init(&pernet->ib_ps); return 0; } @@ -4667,10 +4692,10 @@ static void cma_exit_net(struct net *net) { struct cma_pernet *pernet = cma_pernet(net); - idr_destroy(&pernet->tcp_ps); - idr_destroy(&pernet->udp_ps); - idr_destroy(&pernet->ipoib_ps); - idr_destroy(&pernet->ib_ps); + WARN_ON(!xa_empty(&pernet->tcp_ps)); + WARN_ON(!xa_empty(&pernet->udp_ps)); + WARN_ON(!xa_empty(&pernet->ipoib_ps)); + WARN_ON(!xa_empty(&pernet->ib_ps)); } static struct pernet_operations cma_pernet_operations = { diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 08c690249594..ff40a450b5d2 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -55,6 +55,7 @@ struct pkey_index_qp_list { }; extern const struct attribute_group ib_dev_attr_group; +extern bool ib_devices_shared_netns; int ib_device_register_sysfs(struct ib_device *device); void ib_device_unregister_sysfs(struct ib_device *device); @@ -279,7 +280,8 @@ static inline void ib_mad_agent_security_change(void) } #endif -struct ib_device *ib_device_get_by_index(u32 ifindex); +struct ib_device *ib_device_get_by_index(const struct net *net, u32 index); + /* RDMA device netlink */ void nldev_init(void); void nldev_exit(void); @@ -302,6 +304,7 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev, qp->device = dev; qp->pd = pd; qp->uobject = uobj; + qp->real_qp = qp; /* * We don't track XRC QPs for now, because they don't have PD * and more importantly they are created internaly by driver, @@ -336,4 +339,17 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec, const struct ib_gid_attr *attr); struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); + +void ib_free_port_attrs(struct ib_core_device *coredev); +int ib_setup_port_attrs(struct ib_core_device *coredev); + +int rdma_compatdev_set(u8 enable); + +int ib_port_register_module_stat(struct ib_device *device, u8 port_num, + struct kobject *kobj, struct kobj_type *ktype, + const char *name); +void ib_port_unregister_module_stat(struct kobject *kobj); + +int ib_device_set_netns_put(struct 
sk_buff *skb, + struct ib_device *dev, u32 ns_fd); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index d61e5e1427c2..a4c81992267c 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -128,15 +128,17 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) * @comp_vector: HCA completion vectors for this CQ * @poll_ctx: context to poll the CQ from. * @caller: module owner name. + * @udata: Valid user data or NULL for kernel object * * This is the proper interface to allocate a CQ for in-kernel users. A * CQ allocated with this interface will automatically be polled from the * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id * to use this CQ abstraction. */ -struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, - int nr_cqe, int comp_vector, - enum ib_poll_context poll_ctx, const char *caller) +struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, + enum ib_poll_context poll_ctx, + const char *caller, struct ib_udata *udata) { struct ib_cq_init_attr cq_attr = { .cqe = nr_cqe, @@ -145,7 +147,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, struct ib_cq *cq; int ret = -ENOMEM; - cq = dev->ops.create_cq(dev, &cq_attr, NULL, NULL); + cq = dev->ops.create_cq(dev, &cq_attr, NULL); if (IS_ERR(cq)) return cq; @@ -193,16 +195,17 @@ out_free_wc: kfree(cq->wc); rdma_restrack_del(&cq->res); out_destroy_cq: - cq->device->ops.destroy_cq(cq); + cq->device->ops.destroy_cq(cq, udata); return ERR_PTR(ret); } -EXPORT_SYMBOL(__ib_alloc_cq); +EXPORT_SYMBOL(__ib_alloc_cq_user); /** * ib_free_cq - free a completion queue * @cq: completion queue to free. + * @udata: User data or NULL for kernel object */ -void ib_free_cq(struct ib_cq *cq) +void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) { int ret; @@ -225,7 +228,7 @@ void ib_free_cq(struct ib_cq *cq) kfree(cq->wc); rdma_restrack_del(&cq->res); - ret = cq->device->ops.destroy_cq(cq); + ret = cq->device->ops.destroy_cq(cq, udata); WARN_ON_ONCE(ret); } -EXPORT_SYMBOL(ib_free_cq); +EXPORT_SYMBOL(ib_free_cq_user); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 7421ec4883fb..78dc07c6ac4b 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -38,6 +38,8 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/netdevice.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/hashtable.h> @@ -101,6 +103,54 @@ static DECLARE_RWSEM(clients_rwsem); * be registered. */ #define CLIENT_DATA_REGISTERED XA_MARK_1 + +/** + * struct rdma_dev_net - rdma net namespace metadata for a net + * @net: Pointer to owner net namespace + * @id: xarray id to identify the net namespace. + */ +struct rdma_dev_net { + possible_net_t net; + u32 id; +}; + +static unsigned int rdma_dev_net_id; + +/* + * A list of net namespaces is maintained in an xarray. This is necessary + * because we can't get the locking right using the existing net ns list. We + * would require a init_net callback after the list is updated. + */ +static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC); +/* + * rwsem to protect accessing the rdma_nets xarray entries. 
+ */ +static DECLARE_RWSEM(rdma_nets_rwsem); + +bool ib_devices_shared_netns = true; +module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444); +MODULE_PARM_DESC(netns_mode, + "Share device among net namespaces; default=1 (shared)"); +/** + * rdma_dev_access_netns() - Return whether a rdma device can be accessed + * from a specified net namespace or not. + * @device: Pointer to rdma device which needs to be checked + * @net: Pointer to net namesapce for which access to be checked + * + * rdma_dev_access_netns() - Return whether a rdma device can be accessed + * from a specified net namespace or not. When + * rdma device is in shared mode, it ignores the + * net namespace. When rdma device is exclusive + * to a net namespace, rdma device net namespace is + * checked against the specified one. + */ +bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) +{ + return (ib_devices_shared_netns || + net_eq(read_pnet(&dev->coredev.rdma_net), net)); +} +EXPORT_SYMBOL(rdma_dev_access_netns); + /* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. So we need our own iterator to see all values stored in @@ -147,10 +197,73 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, static void ib_policy_change_task(struct work_struct *work); static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); +static void __ibdev_printk(const char *level, const struct ib_device *ibdev, + struct va_format *vaf) +{ + if (ibdev && ibdev->dev.parent) + dev_printk_emit(level[1] - '0', + ibdev->dev.parent, + "%s %s %s: %pV", + dev_driver_string(ibdev->dev.parent), + dev_name(ibdev->dev.parent), + dev_name(&ibdev->dev), + vaf); + else if (ibdev) + printk("%s%s: %pV", + level, dev_name(&ibdev->dev), vaf); + else + printk("%s(NULL ib_device): %pV", level, vaf); +} + +void ibdev_printk(const char *level, const struct ib_device *ibdev, + const char *format, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + __ibdev_printk(level, ibdev, &vaf); + + va_end(args); +} +EXPORT_SYMBOL(ibdev_printk); + +#define define_ibdev_printk_level(func, level) \ +void func(const struct ib_device *ibdev, const char *fmt, ...) \ +{ \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + __ibdev_printk(level, ibdev, &vaf); \ + \ + va_end(args); \ +} \ +EXPORT_SYMBOL(func); + +define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); +define_ibdev_printk_level(ibdev_alert, KERN_ALERT); +define_ibdev_printk_level(ibdev_crit, KERN_CRIT); +define_ibdev_printk_level(ibdev_err, KERN_ERR); +define_ibdev_printk_level(ibdev_warn, KERN_WARNING); +define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); +define_ibdev_printk_level(ibdev_info, KERN_INFO); + static struct notifier_block ibdev_lsm_nb = { .notifier_call = ib_security_change, }; +static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, + struct net *net); + /* Pointer to the RCU head at the start of the ib_port_data array */ struct ib_port_data_rcu { struct rcu_head rcu_head; @@ -200,16 +313,22 @@ static int ib_device_check_mandatory(struct ib_device *device) * Caller must perform ib_device_put() to return the device reference count * when ib_device_get_by_index() returns valid device pointer. 
*/ -struct ib_device *ib_device_get_by_index(u32 index) +struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) { struct ib_device *device; down_read(&devices_rwsem); device = xa_load(&devices, index); if (device) { + if (!rdma_dev_access_netns(device, net)) { + device = NULL; + goto out; + } + if (!ib_device_try_get(device)) device = NULL; } +out: up_read(&devices_rwsem); return device; } @@ -268,6 +387,26 @@ struct ib_device *ib_device_get_by_name(const char *name, } EXPORT_SYMBOL(ib_device_get_by_name); +static int rename_compat_devs(struct ib_device *device) +{ + struct ib_core_device *cdev; + unsigned long index; + int ret = 0; + + mutex_lock(&device->compat_devs_mutex); + xa_for_each (&device->compat_devs, index, cdev) { + ret = device_rename(&cdev->dev, dev_name(&device->dev)); + if (ret) { + dev_warn(&cdev->dev, + "Fail to rename compatdev to new name %s\n", + dev_name(&device->dev)); + break; + } + } + mutex_unlock(&device->compat_devs_mutex); + return ret; +} + int ib_device_rename(struct ib_device *ibdev, const char *name) { int ret; @@ -287,6 +426,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) if (ret) goto out; strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); + ret = rename_compat_devs(ibdev); out: up_write(&devices_rwsem); return ret; @@ -336,6 +476,7 @@ static void ib_device_release(struct device *device) WARN_ON(refcount_read(&dev->refcount)); ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); + xa_destroy(&dev->compat_devs); xa_destroy(&dev->client_data); if (dev->port_data) kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, @@ -357,12 +498,42 @@ static int ib_device_uevent(struct device *device, return 0; } +static const void *net_namespace(struct device *d) +{ + struct ib_core_device *coredev = + container_of(d, struct ib_core_device, dev); + + return read_pnet(&coredev->rdma_net); +} + static struct class ib_class = { .name = "infiniband", .dev_release = ib_device_release, .dev_uevent = ib_device_uevent, + .ns_type = &net_ns_type_operations, + .namespace = net_namespace, }; +static void rdma_init_coredev(struct ib_core_device *coredev, + struct ib_device *dev, struct net *net) +{ + /* This BUILD_BUG_ON is intended to catch layout change + * of union of ib_core_device and device. + * dev must be the first element as ib_core and providers + * driver uses it. Adding anything in ib_core_device before + * device will break this assumption. 
+ */ + BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != + offsetof(struct ib_device, dev)); + + coredev->dev.class = &ib_class; + coredev->dev.groups = dev->groups; + device_initialize(&coredev->dev); + coredev->owner = dev; + INIT_LIST_HEAD(&coredev->port_list); + write_pnet(&coredev->rdma_net, net); +} + /** * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate @@ -389,10 +560,8 @@ struct ib_device *_ib_alloc_device(size_t size) return NULL; } - device->dev.class = &ib_class; device->groups[0] = &ib_dev_attr_group; - device->dev.groups = device->groups; - device_initialize(&device->dev); + rdma_init_coredev(&device->coredev, device, &init_net); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); @@ -403,7 +572,8 @@ struct ib_device *_ib_alloc_device(size_t size) */ xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); init_rwsem(&device->client_data_rwsem); - INIT_LIST_HEAD(&device->port_list); + xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); + mutex_init(&device->compat_devs_mutex); init_completion(&device->unreg_completion); INIT_WORK(&device->unregistration_work, ib_unregister_work); @@ -436,6 +606,7 @@ void ib_dealloc_device(struct ib_device *device) /* Expedite releasing netdev references */ free_netdevs(device); + WARN_ON(!xa_empty(&device->compat_devs)); WARN_ON(!xa_empty(&device->client_data)); WARN_ON(refcount_read(&device->refcount)); rdma_restrack_clean(device); @@ -644,6 +815,283 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, return NOTIFY_OK; } +static void compatdev_release(struct device *dev) +{ + struct ib_core_device *cdev = + container_of(dev, struct ib_core_device, dev); + + kfree(cdev); +} + +static int add_one_compat_dev(struct ib_device *device, + struct rdma_dev_net *rnet) +{ + struct ib_core_device *cdev; + int ret; + + lockdep_assert_held(&rdma_nets_rwsem); + if (!ib_devices_shared_netns) + return 0; + + /* + * Create and add compat device in all namespaces other than where it + * is currently bound to. + */ + if (net_eq(read_pnet(&rnet->net), + read_pnet(&device->coredev.rdma_net))) + return 0; + + /* + * The first of init_net() or ib_register_device() to take the + * compat_devs_mutex wins and gets to add the device. Others will wait + * for completion here. 
+ */ + mutex_lock(&device->compat_devs_mutex); + cdev = xa_load(&device->compat_devs, rnet->id); + if (cdev) { + ret = 0; + goto done; + } + ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); + if (ret) + goto done; + + cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); + if (!cdev) { + ret = -ENOMEM; + goto cdev_err; + } + + cdev->dev.parent = device->dev.parent; + rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); + cdev->dev.release = compatdev_release; + dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); + + ret = device_add(&cdev->dev); + if (ret) + goto add_err; + ret = ib_setup_port_attrs(cdev); + if (ret) + goto port_err; + + ret = xa_err(xa_store(&device->compat_devs, rnet->id, + cdev, GFP_KERNEL)); + if (ret) + goto insert_err; + + mutex_unlock(&device->compat_devs_mutex); + return 0; + +insert_err: + ib_free_port_attrs(cdev); +port_err: + device_del(&cdev->dev); +add_err: + put_device(&cdev->dev); +cdev_err: + xa_release(&device->compat_devs, rnet->id); +done: + mutex_unlock(&device->compat_devs_mutex); + return ret; +} + +static void remove_one_compat_dev(struct ib_device *device, u32 id) +{ + struct ib_core_device *cdev; + + mutex_lock(&device->compat_devs_mutex); + cdev = xa_erase(&device->compat_devs, id); + mutex_unlock(&device->compat_devs_mutex); + if (cdev) { + ib_free_port_attrs(cdev); + device_del(&cdev->dev); + put_device(&cdev->dev); + } +} + +static void remove_compat_devs(struct ib_device *device) +{ + struct ib_core_device *cdev; + unsigned long index; + + xa_for_each (&device->compat_devs, index, cdev) + remove_one_compat_dev(device, index); +} + +static int add_compat_devs(struct ib_device *device) +{ + struct rdma_dev_net *rnet; + unsigned long index; + int ret = 0; + + lockdep_assert_held(&devices_rwsem); + + down_read(&rdma_nets_rwsem); + xa_for_each (&rdma_nets, index, rnet) { + ret = add_one_compat_dev(device, rnet); + if (ret) + break; + } + up_read(&rdma_nets_rwsem); + return ret; +} + +static void remove_all_compat_devs(void) +{ + struct ib_compat_device *cdev; + struct ib_device *dev; + unsigned long index; + + down_read(&devices_rwsem); + xa_for_each (&devices, index, dev) { + unsigned long c_index = 0; + + /* Hold nets_rwsem so that any other thread modifying this + * system param can sync with this thread. + */ + down_read(&rdma_nets_rwsem); + xa_for_each (&dev->compat_devs, c_index, cdev) + remove_one_compat_dev(dev, c_index); + up_read(&rdma_nets_rwsem); + } + up_read(&devices_rwsem); +} + +static int add_all_compat_devs(void) +{ + struct rdma_dev_net *rnet; + struct ib_device *dev; + unsigned long index; + int ret = 0; + + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + unsigned long net_index = 0; + + /* Hold nets_rwsem so that any other thread modifying this + * system param can sync with this thread. + */ + down_read(&rdma_nets_rwsem); + xa_for_each (&rdma_nets, net_index, rnet) { + ret = add_one_compat_dev(dev, rnet); + if (ret) + break; + } + up_read(&rdma_nets_rwsem); + } + up_read(&devices_rwsem); + if (ret) + remove_all_compat_devs(); + return ret; +} + +int rdma_compatdev_set(u8 enable) +{ + struct rdma_dev_net *rnet; + unsigned long index; + int ret = 0; + + down_write(&rdma_nets_rwsem); + if (ib_devices_shared_netns == enable) { + up_write(&rdma_nets_rwsem); + return 0; + } + + /* enable/disable of compat devices is not supported + * when more than default init_net exists. 
+ */ + xa_for_each (&rdma_nets, index, rnet) { + ret++; + break; + } + if (!ret) + ib_devices_shared_netns = enable; + up_write(&rdma_nets_rwsem); + if (ret) + return -EBUSY; + + if (enable) + ret = add_all_compat_devs(); + else + remove_all_compat_devs(); + return ret; +} + +static void rdma_dev_exit_net(struct net *net) +{ + struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); + struct ib_device *dev; + unsigned long index; + int ret; + + down_write(&rdma_nets_rwsem); + /* + * Prevent the ID from being re-used and hide the id from xa_for_each. + */ + ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); + WARN_ON(ret); + up_write(&rdma_nets_rwsem); + + down_read(&devices_rwsem); + xa_for_each (&devices, index, dev) { + get_device(&dev->dev); + /* + * Release the devices_rwsem so that pontentially blocking + * device_del, doesn't hold the devices_rwsem for too long. + */ + up_read(&devices_rwsem); + + remove_one_compat_dev(dev, rnet->id); + + /* + * If the real device is in the NS then move it back to init. + */ + rdma_dev_change_netns(dev, net, &init_net); + + put_device(&dev->dev); + down_read(&devices_rwsem); + } + up_read(&devices_rwsem); + + xa_erase(&rdma_nets, rnet->id); +} + +static __net_init int rdma_dev_init_net(struct net *net) +{ + struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); + unsigned long index; + struct ib_device *dev; + int ret; + + /* No need to create any compat devices in default init_net. */ + if (net_eq(net, &init_net)) + return 0; + + write_pnet(&rnet->net, net); + + ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); + if (ret) + return ret; + + down_read(&devices_rwsem); + xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + /* Hold nets_rwsem so that netlink command cannot change + * system configuration for device sharing mode. + */ + down_read(&rdma_nets_rwsem); + ret = add_one_compat_dev(dev, rnet); + up_read(&rdma_nets_rwsem); + if (ret) + break; + } + up_read(&devices_rwsem); + + if (ret) + rdma_dev_exit_net(net); + + return ret; +} + /* * Assign the unique string device name and the unique device index. This is * undone by ib_dealloc_device. @@ -711,6 +1159,9 @@ static void setup_dma_device(struct ib_device *device) WARN_ON_ONCE(!parent); device->dma_device = parent; } + /* Setup default max segment size for all IB devices */ + dma_set_max_seg_size(device->dma_device, SZ_2G); + } /* @@ -765,8 +1216,12 @@ static void disable_device(struct ib_device *device) ib_device_put(device); wait_for_completion(&device->unreg_completion); - /* Expedite removing unregistered pointers from the hash table */ - free_netdevs(device); + /* + * compat devices must be removed after device refcount drops to zero. + * Otherwise init_net() may add more compatdevs after removing compat + * devices and before device is disabled. + */ + remove_compat_devs(device); } /* @@ -807,7 +1262,8 @@ static int enable_device_and_get(struct ib_device *device) break; } up_read(&clients_rwsem); - + if (!ret) + ret = add_compat_devs(device); out: up_read(&devices_rwsem); return ret; @@ -847,6 +1303,11 @@ int ib_register_device(struct ib_device *device, const char *name) ib_device_register_rdmacg(device); + /* + * Ensure that ADD uevent is not fired because it + * is too early amd device is not initialized yet. 
+ */ + dev_set_uevent_suppress(&device->dev, true); ret = device_add(&device->dev); if (ret) goto cg_cleanup; @@ -859,6 +1320,9 @@ int ib_register_device(struct ib_device *device, const char *name) } ret = enable_device_and_get(device); + dev_set_uevent_suppress(&device->dev, false); + /* Mark for userspace that device is ready */ + kobject_uevent(&device->dev.kobj, KOBJ_ADD); if (ret) { void (*dealloc_fn)(struct ib_device *); @@ -887,6 +1351,7 @@ int ib_register_device(struct ib_device *device, const char *name) dev_cleanup: device_del(&device->dev); cg_cleanup: + dev_set_uevent_suppress(&device->dev, false); ib_device_unregister_rdmacg(device); ib_cache_cleanup_one(device); return ret; @@ -908,6 +1373,10 @@ static void __ib_unregister_device(struct ib_device *ib_dev) goto out; disable_device(ib_dev); + + /* Expedite removing unregistered pointers from the hash table */ + free_netdevs(ib_dev); + ib_device_unregister_sysfs(ib_dev); device_del(&ib_dev->dev); ib_device_unregister_rdmacg(ib_dev); @@ -1038,6 +1507,126 @@ void ib_unregister_device_queued(struct ib_device *ib_dev) } EXPORT_SYMBOL(ib_unregister_device_queued); +/* + * The caller must pass in a device that has the kref held and the refcount + * released. If the device is in cur_net and still registered then it is moved + * into net. + */ +static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, + struct net *net) +{ + int ret2 = -EINVAL; + int ret; + + mutex_lock(&device->unregistration_lock); + + /* + * If a device not under ib_device_get() or if the unregistration_lock + * is not held, the namespace can be changed, or it can be unregistered. + * Check again under the lock. + */ + if (refcount_read(&device->refcount) == 0 || + !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { + ret = -ENODEV; + goto out; + } + + kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); + disable_device(device); + + /* + * At this point no one can be using the device, so it is safe to + * change the namespace. + */ + write_pnet(&device->coredev.rdma_net, net); + + down_read(&devices_rwsem); + /* + * Currently rdma devices are system wide unique. So the device name + * is guaranteed free in the new namespace. Publish the new namespace + * at the sysfs level. + */ + ret = device_rename(&device->dev, dev_name(&device->dev)); + up_read(&devices_rwsem); + if (ret) { + dev_warn(&device->dev, + "%s: Couldn't rename device after namespace change\n", + __func__); + /* Try and put things back and re-enable the device */ + write_pnet(&device->coredev.rdma_net, cur_net); + } + + ret2 = enable_device_and_get(device); + if (ret2) { + /* + * This shouldn't really happen, but if it does, let the user + * retry at later point. So don't disable the device. + */ + dev_warn(&device->dev, + "%s: Couldn't re-enable device after namespace change\n", + __func__); + } + kobject_uevent(&device->dev.kobj, KOBJ_ADD); + + ib_device_put(device); +out: + mutex_unlock(&device->unregistration_lock); + if (ret) + return ret; + return ret2; +} + +int ib_device_set_netns_put(struct sk_buff *skb, + struct ib_device *dev, u32 ns_fd) +{ + struct net *net; + int ret; + + net = get_net_ns_by_fd(ns_fd); + if (IS_ERR(net)) { + ret = PTR_ERR(net); + goto net_err; + } + + if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + goto ns_err; + } + + /* + * Currently supported only for those providers which support + * disassociation and don't do port specific sysfs init. 
Once a + * port_cleanup infrastructure is implemented, this limitation will be + * removed. + */ + if (!dev->ops.disassociate_ucontext || dev->ops.init_port || + ib_devices_shared_netns) { + ret = -EOPNOTSUPP; + goto ns_err; + } + + get_device(&dev->dev); + ib_device_put(dev); + ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); + put_device(&dev->dev); + + put_net(net); + return ret; + +ns_err: + put_net(net); +net_err: + ib_device_put(dev); + return ret; +} + +static struct pernet_operations rdma_dev_net_ops = { + .init = rdma_dev_init_net, + .exit = rdma_dev_exit_net, + .id = &rdma_dev_net_id, + .size = sizeof(struct rdma_dev_net), +}; + static int assign_client_id(struct ib_client *client) { int ret; @@ -1515,6 +2104,9 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, down_read(&devices_rwsem); xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { + if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) + continue; + ret = nldev_cb(dev, skb, cb, idx); if (ret) break; @@ -1787,6 +2379,14 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_stats); SET_DEVICE_OP(dev_ops, init_port); + SET_DEVICE_OP(dev_ops, iw_accept); + SET_DEVICE_OP(dev_ops, iw_add_ref); + SET_DEVICE_OP(dev_ops, iw_connect); + SET_DEVICE_OP(dev_ops, iw_create_listen); + SET_DEVICE_OP(dev_ops, iw_destroy_listen); + SET_DEVICE_OP(dev_ops, iw_get_qp); + SET_DEVICE_OP(dev_ops, iw_reject); + SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); SET_DEVICE_OP(dev_ops, map_phys_fmr); SET_DEVICE_OP(dev_ops, mmap); @@ -1823,7 +2423,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_DEVICE_OP(dev_ops, unmap_fmr); + SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_pd); + SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); } EXPORT_SYMBOL(ib_set_device_ops); @@ -1903,12 +2505,20 @@ static int __init ib_core_init(void) goto err_sa; } + ret = register_pernet_device(&rdma_dev_net_ops); + if (ret) { + pr_warn("Couldn't init compat dev. ret %d\n", ret); + goto err_compat; + } + nldev_init(); rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); roce_gid_mgmt_init(); return 0; +err_compat: + unregister_lsm_notifier(&ibdev_lsm_nb); err_sa: ib_sa_cleanup(); err_mad: @@ -1933,6 +2543,7 @@ static void __exit ib_core_cleanup(void) roce_gid_mgmt_cleanup(); nldev_exit(); rdma_nl_unregister(RDMA_NL_LS); + unregister_pernet_device(&rdma_dev_net_ops); unregister_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); @@ -1950,5 +2561,8 @@ static void __exit ib_core_cleanup(void) MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); -subsys_initcall(ib_core_init); +/* ib core relies on netdev stack to first register net_ns_type_operations + * ns kobject type before ib_core initialization. 
+ */ +fs_initcall(ib_core_init); module_exit(ib_core_cleanup); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 732637c913d9..72141c5b7c95 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -394,7 +394,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); /* destroy the listening endpoint */ - cm_id->device->iwcm->destroy_listen(cm_id); + cm_id->device->ops.iw_destroy_listen(cm_id); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_ESTABLISHED: @@ -417,7 +417,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) */ cm_id_priv->state = IW_CM_STATE_DESTROYING; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - cm_id->device->iwcm->reject(cm_id, NULL, 0); + cm_id->device->ops.iw_reject(cm_id, NULL, 0); spin_lock_irqsave(&cm_id_priv->lock, flags); break; case IW_CM_STATE_CONN_SENT: @@ -427,7 +427,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id) break; } if (cm_id_priv->qp) { - cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); + cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp); cm_id_priv->qp = NULL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -504,7 +504,7 @@ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr, static int iw_cm_map(struct iw_cm_id *cm_id, bool active) { const char *devname = dev_name(&cm_id->device->dev); - const char *ifname = cm_id->device->iwcm->ifname; + const char *ifname = cm_id->device->iw_ifname; struct iwpm_dev_data pm_reg_msg = {}; struct iwpm_sa_data pm_msg; int status; @@ -526,7 +526,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->mapped = true; pm_msg.loc_addr = cm_id->local_addr; pm_msg.rem_addr = cm_id->remote_addr; - pm_msg.flags = (cm_id->device->iwcm->driver_flags & IW_F_NO_PORT_MAP) ? + pm_msg.flags = (cm_id->device->iw_driver_flags & IW_F_NO_PORT_MAP) ? 
IWPM_FLAGS_NO_PORT_MAP : 0; if (active) status = iwpm_add_and_query_mapping(&pm_msg, @@ -577,7 +577,8 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = iw_cm_map(cm_id, false); if (!ret) - ret = cm_id->device->iwcm->create_listen(cm_id, backlog); + ret = cm_id->device->ops.iw_create_listen(cm_id, + backlog); if (ret) cm_id_priv->state = IW_CM_STATE_IDLE; spin_lock_irqsave(&cm_id_priv->lock, flags); @@ -617,7 +618,7 @@ int iw_cm_reject(struct iw_cm_id *cm_id, cm_id_priv->state = IW_CM_STATE_IDLE; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = cm_id->device->iwcm->reject(cm_id, private_data, + ret = cm_id->device->ops.iw_reject(cm_id, private_data, private_data_len); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); @@ -653,25 +654,25 @@ int iw_cm_accept(struct iw_cm_id *cm_id, return -EINVAL; } /* Get the ib_qp given the QPN */ - qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); + qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn); if (!qp) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags); wake_up_all(&cm_id_priv->connect_wait); return -EINVAL; } - cm_id->device->iwcm->add_ref(qp); + cm_id->device->ops.iw_add_ref(qp); cm_id_priv->qp = qp; spin_unlock_irqrestore(&cm_id_priv->lock, flags); - ret = cm_id->device->iwcm->accept(cm_id, iw_param); + ret = cm_id->device->ops.iw_accept(cm_id, iw_param); if (ret) { /* An error on accept precludes provider events */ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV); cm_id_priv->state = IW_CM_STATE_IDLE; spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->qp) { - cm_id->device->iwcm->rem_ref(qp); + cm_id->device->ops.iw_rem_ref(qp); cm_id_priv->qp = NULL; } spin_unlock_irqrestore(&cm_id_priv->lock, flags); @@ -712,25 +713,25 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param) } /* Get the ib_qp given the QPN */ - qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn); + qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn); if (!qp) { ret = -EINVAL; goto err; } - cm_id->device->iwcm->add_ref(qp); + cm_id->device->ops.iw_add_ref(qp); cm_id_priv->qp = qp; cm_id_priv->state = IW_CM_STATE_CONN_SENT; spin_unlock_irqrestore(&cm_id_priv->lock, flags); ret = iw_cm_map(cm_id, true); if (!ret) - ret = cm_id->device->iwcm->connect(cm_id, iw_param); + ret = cm_id->device->ops.iw_connect(cm_id, iw_param); if (!ret) return 0; /* success */ spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->qp) { - cm_id->device->iwcm->rem_ref(qp); + cm_id->device->ops.iw_rem_ref(qp); cm_id_priv->qp = NULL; } cm_id_priv->state = IW_CM_STATE_IDLE; @@ -895,7 +896,7 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv, cm_id_priv->state = IW_CM_STATE_ESTABLISHED; } else { /* REJECTED or RESET */ - cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); + cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp); cm_id_priv->qp = NULL; cm_id_priv->state = IW_CM_STATE_IDLE; } @@ -946,7 +947,7 @@ static int cm_close_handler(struct iwcm_id_private *cm_id_priv, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id_priv->qp) { - cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp); + cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp); cm_id_priv->qp = NULL; } switch (cm_id_priv->state) { diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index e742a6a2c138..cc99479b2c09 100644 --- a/drivers/infiniband/core/mad.c +++ 
b/drivers/infiniband/core/mad.c @@ -3,7 +3,7 @@ * Copyright (c) 2005 Intel Corporation. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2009 HNR Consulting. All rights reserved. - * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014,2018 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -38,10 +38,10 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/dma-mapping.h> -#include <linux/idr.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/security.h> +#include <linux/xarray.h> #include <rdma/ib_cache.h> #include "mad_priv.h" @@ -51,6 +51,32 @@ #include "opa_smi.h" #include "agent.h" +#define CREATE_TRACE_POINTS +#include <trace/events/ib_mad.h> + +#ifdef CONFIG_TRACEPOINTS +static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr, + struct ib_mad_qp_info *qp_info, + struct trace_event_raw_ib_mad_send_template *entry) +{ + u16 pkey; + struct ib_device *dev = qp_info->port_priv->device; + u8 pnum = qp_info->port_priv->port_num; + struct ib_ud_wr *wr = &mad_send_wr->send_wr; + struct rdma_ah_attr attr = {}; + + rdma_query_ah(wr->ah, &attr); + + /* These are common */ + entry->sl = attr.sl; + ib_query_pkey(dev, pnum, wr->pkey_index, &pkey); + entry->pkey = pkey; + entry->rqpn = wr->remote_qpn; + entry->rqkey = wr->remote_qkey; + entry->dlid = rdma_ah_get_dlid(&attr); +} +#endif + static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; @@ -59,12 +85,9 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); -/* - * The mlx4 driver uses the top byte to distinguish which virtual function - * generated the MAD, so we must avoid using it. - */ -#define AGENT_ID_LIMIT (1 << 24) -static DEFINE_IDR(ib_mad_clients); +/* Client ID 0 is used for snoop-only clients */ +static DEFINE_XARRAY_ALLOC1(ib_mad_clients); +static u32 ib_mad_client_next; static struct list_head ib_mad_port_list; /* Port list lock */ @@ -389,18 +412,17 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error4; } - idr_preload(GFP_KERNEL); - idr_lock(&ib_mad_clients); - ret2 = idr_alloc_cyclic(&ib_mad_clients, mad_agent_priv, 0, - AGENT_ID_LIMIT, GFP_ATOMIC); - idr_unlock(&ib_mad_clients); - idr_preload_end(); - + /* + * The mlx4 driver uses the top byte to distinguish which virtual + * function generated the MAD, so we must avoid using it. 
+ */ + ret2 = xa_alloc_cyclic(&ib_mad_clients, &mad_agent_priv->agent.hi_tid, + mad_agent_priv, XA_LIMIT(0, (1 << 24) - 1), + &ib_mad_client_next, GFP_KERNEL); if (ret2 < 0) { ret = ERR_PTR(ret2); goto error5; } - mad_agent_priv->agent.hi_tid = ret2; /* * Make sure MAD registration (if supplied) @@ -445,12 +467,11 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, } spin_unlock_irq(&port_priv->reg_lock); + trace_ib_mad_create_agent(mad_agent_priv); return &mad_agent_priv->agent; error6: spin_unlock_irq(&port_priv->reg_lock); - idr_lock(&ib_mad_clients); - idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid); - idr_unlock(&ib_mad_clients); + xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid); error5: ib_mad_agent_security_cleanup(&mad_agent_priv->agent); error4: @@ -602,6 +623,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) struct ib_mad_port_private *port_priv; /* Note that we could still be handling received MADs */ + trace_ib_mad_unregister_agent(mad_agent_priv); /* * Canceling all sends results in dropping received response @@ -614,9 +636,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) spin_lock_irq(&port_priv->reg_lock); remove_mad_reg_req(mad_agent_priv); spin_unlock_irq(&port_priv->reg_lock); - idr_lock(&ib_mad_clients); - idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid); - idr_unlock(&ib_mad_clients); + xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid); flush_workqueue(port_priv->wq); ib_cancel_rmpp_recvs(mad_agent_priv); @@ -821,6 +841,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, if (opa && smp->class_version == OPA_SM_CLASS_VERSION) { u32 opa_drslid; + trace_ib_mad_handle_out_opa_smi(opa_smp); + if ((opa_get_smp_direction(opa_smp) ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) == OPA_LID_PERMISSIVE && @@ -846,6 +868,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD) goto out; } else { + trace_ib_mad_handle_out_ib_smi(smp); + if ((ib_get_smp_direction(smp) ? 
smp->dr_dlid : smp->dr_slid) == IB_LID_PERMISSIVE && smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) == @@ -1223,6 +1247,7 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) spin_lock_irqsave(&qp_info->send_queue.lock, flags); if (qp_info->send_queue.count < qp_info->send_queue.max_active) { + trace_ib_mad_ib_send_mad(mad_send_wr, qp_info); ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr, NULL); list = &qp_info->send_queue.list; @@ -1756,7 +1781,7 @@ find_mad_agent(struct ib_mad_port_private *port_priv, */ hi_tid = be64_to_cpu(mad_hdr->tid) >> 32; rcu_read_lock(); - mad_agent = idr_find(&ib_mad_clients, hi_tid); + mad_agent = xa_load(&ib_mad_clients, hi_tid); if (mad_agent && !atomic_inc_not_zero(&mad_agent->refcount)) mad_agent = NULL; rcu_read_unlock(); @@ -2077,6 +2102,8 @@ static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv enum smi_forward_action retsmi; struct ib_smp *smp = (struct ib_smp *)recv->mad; + trace_ib_mad_handle_ib_smi(smp); + if (smi_handle_dr_smp_recv(smp, rdma_cap_ib_switch(port_priv->device), port_num, @@ -2162,6 +2189,8 @@ handle_opa_smi(struct ib_mad_port_private *port_priv, enum smi_forward_action retsmi; struct opa_smp *smp = (struct opa_smp *)recv->mad; + trace_ib_mad_handle_opa_smi(smp); + if (opa_smi_handle_dr_smp_recv(smp, rdma_cap_ib_switch(port_priv->device), port_num, @@ -2286,6 +2315,9 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa)) goto out; + trace_ib_mad_recv_done_handler(qp_info, wc, + (struct ib_mad_hdr *)recv->mad); + mad_size = recv->mad_size; response = alloc_mad_private(mad_size, GFP_KERNEL); if (!response) @@ -2332,6 +2364,7 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad); if (mad_agent) { + trace_ib_mad_recv_done_agent(mad_agent); ib_mad_complete_recv(mad_agent, &recv->header.recv_wc); /* * recv is freed up in error cases in ib_mad_complete_recv @@ -2496,6 +2529,9 @@ static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc) send_queue = mad_list->mad_queue; qp_info = send_queue->qp_info; + trace_ib_mad_send_done_agent(mad_send_wr->mad_agent_priv); + trace_ib_mad_send_done_handler(mad_send_wr, wc); + retry: ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device, mad_send_wr->header_mapping, @@ -2527,6 +2563,7 @@ retry: ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); if (queued_send_wr) { + trace_ib_mad_send_done_resend(queued_send_wr, qp_info); ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr, NULL); if (ret) { @@ -2574,6 +2611,7 @@ static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, if (mad_send_wr->retry) { /* Repost send */ mad_send_wr->retry = 0; + trace_ib_mad_error_handler(mad_send_wr, qp_info); ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, NULL); if (!ret) @@ -3356,9 +3394,6 @@ int ib_mad_init(void) INIT_LIST_HEAD(&ib_mad_port_list); - /* Client ID 0 is used for snoop-only clients */ - idr_alloc(&ib_mad_clients, NULL, 0, 0, GFP_KERNEL); - if (ib_register_client(&mad_client)) { pr_err("Couldn't register ib_mad client\n"); return -EINVAL; diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 216509036aa8..956b3a7dfed7 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -73,14 +73,14 @@ struct ib_mad_private_header { struct ib_mad_recv_wc recv_wc; struct ib_wc 
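/*
 * Editorial sketch (not part of the patch): the lockless lookup idiom
 * find_mad_agent() now uses -- xa_load() under rcu_read_lock(), then
 * taking a reference only if the object is still live.  "my_agent"
 * and "my_table" are illustrative names.
 */
#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>

struct my_agent {
	atomic_t refcount;
	/* ... */
};

static struct my_agent *my_agent_find(struct xarray *my_table, u32 id)
{
	struct my_agent *agent;

	rcu_read_lock();
	agent = xa_load(my_table, id);
	if (agent && !atomic_inc_not_zero(&agent->refcount))
		agent = NULL;	/* found, but already being torn down */
	rcu_read_unlock();

	return agent;
}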
wc; u64 mapping; -} __attribute__ ((packed)); +} __packed; struct ib_mad_private { struct ib_mad_private_header header; size_t mad_size; struct ib_grh grh; u8 mad[0]; -} __attribute__ ((packed)); +} __packed; struct ib_rmpp_segment { struct list_head list; diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index d50ff70bb24b..cd338ddc4a39 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -804,7 +804,6 @@ static void mcast_event_handler(struct ib_event_handler *handler, switch (event->event) { case IB_EVENT_PORT_ERR: case IB_EVENT_LID_CHANGE: - case IB_EVENT_SM_CHANGE: case IB_EVENT_CLIENT_REREGISTER: mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR); break; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 85324012bf07..98eadd3089ce 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -116,6 +116,10 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -198,6 +202,8 @@ static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) { char fw[IB_FW_VERSION_NAME_MAX]; + int ret = 0; + u8 port; if (fill_nldev_handle(msg, device)) return -EMSGSIZE; @@ -226,7 +232,25 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) return -EMSGSIZE; - return 0; + + /* + * Link type is determined on first port and mlx4 device + * which can potentially have two different link type for the same + * IB device is considered as better to be avoided in the future, + */ + port = rdma_start_port(device); + if (rdma_cap_opa_mad(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "opa"); + else if (rdma_protocol_ib(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "ib"); + else if (rdma_protocol_iwarp(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "iw"); + else if (rdma_protocol_roce(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "roce"); + else if (rdma_protocol_usnic(device, port)) + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, + "usnic"); + return ret; } static int fill_port_info(struct sk_buff *msg, @@ -615,7 +639,7 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(index); + device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; @@ -659,7 +683,7 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(index); + device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; @@ -669,9 +693,20 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], IB_DEVICE_NAME_MAX); err = 
ib_device_rename(device, name); + goto done; } + if (tb[RDMA_NLDEV_NET_NS_FD]) { + u32 ns_fd; + + ns_fd = nla_get_u32(tb[RDMA_NLDEV_NET_NS_FD]); + err = ib_device_set_netns_put(skb, device, ns_fd); + goto put_done; + } + +done: ib_device_put(device); +put_done: return err; } @@ -707,7 +742,7 @@ static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { /* * There is no need to take lock, because - * we are relying on ib_core's lists_rwsem + * we are relying on ib_core's locking. */ return ib_enum_all_devs(_nldev_get_dumpit, skb, cb); } @@ -730,7 +765,7 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(index); + device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; @@ -784,7 +819,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, return -EINVAL; ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(ifindex); + device = ib_device_get_by_index(sock_net(skb->sk), ifindex); if (!device) return -EINVAL; @@ -839,7 +874,7 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(index); + device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; @@ -887,7 +922,6 @@ static int _nldev_res_get_dumpit(struct ib_device *device, nlmsg_cancel(skb, nlh); goto out; } - nlmsg_end(skb, nlh); idx++; @@ -988,7 +1022,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(index); + device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; @@ -1085,7 +1119,7 @@ static int res_get_common_dumpit(struct sk_buff *skb, return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(index); + device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; @@ -1300,7 +1334,7 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return -EINVAL; index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); - device = ib_device_get_by_index(index); + device = ib_device_get_by_index(sock_net(skb->sk), index); if (!device) return -EINVAL; @@ -1313,6 +1347,55 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } +static int nldev_get_sys_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct nlmsghdr *nlh; + int err; + + err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + if (err) + return err; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_SYS_GET), + 0, 0); + + err = nla_put_u8(skb, RDMA_NLDEV_SYS_ATTR_NETNS_MODE, + (u8)ib_devices_shared_netns); + if (err) { + nlmsg_cancel(skb, nlh); + return err; + } + + nlmsg_end(skb, nlh); + return skb->len; +} + +static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + u8 enable; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (err || !tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]) + return -EINVAL; + + enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]); + 
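/*
 * Editorial sketch (not part of the patch): the device lookup that the
 * nldev handlers above now perform -- resolve the ib_device in the
 * network namespace the netlink request arrived from.  The wrapper
 * name is illustrative; ib_device_get_by_index() itself lives in
 * device.c and this assumes nldev.c's existing includes.
 */
static struct ib_device *my_dev_from_request(struct sk_buff *skb,
					     struct nlattr *tb[])
{
	u32 index;

	if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX])
		return NULL;

	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
	/* sock_net() yields the caller's namespace */
	return ib_device_get_by_index(sock_net(skb->sk), index);
}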
/* Only 0 and 1 are supported */ + if (enable > 1) + return -EINVAL; + + err = rdma_compatdev_set(enable); + return err; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -1358,6 +1441,13 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_res_get_pd_doit, .dump = nldev_res_get_pd_dumpit, }, + [RDMA_NLDEV_CMD_SYS_GET] = { + .dump = nldev_get_sys_get_dumpit, + }, + [RDMA_NLDEV_CMD_SYS_SET] = { + .doit = nldev_set_sys_set_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, }; void __init nldev_init(void) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 778375ff664e..ccf4d069c25c 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -125,9 +125,10 @@ static void assert_uverbs_usecnt(struct ib_uobject *uobj, * and consumes the kref on the uobj. */ static int uverbs_destroy_uobject(struct ib_uobject *uobj, - enum rdma_remove_reason reason) + enum rdma_remove_reason reason, + struct uverbs_attr_bundle *attrs) { - struct ib_uverbs_file *ufile = uobj->ufile; + struct ib_uverbs_file *ufile = attrs->ufile; unsigned long flags; int ret; @@ -135,7 +136,8 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj, assert_uverbs_usecnt(uobj, UVERBS_LOOKUP_WRITE); if (uobj->object) { - ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason); + ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason, + attrs); if (ret) { if (ib_is_destroy_retryable(ret, reason, uobj)) return ret; @@ -196,9 +198,9 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj, * version requires the caller to have already obtained an * LOOKUP_DESTROY uobject kref. */ -int uobj_destroy(struct ib_uobject *uobj) +int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs) { - struct ib_uverbs_file *ufile = uobj->ufile; + struct ib_uverbs_file *ufile = attrs->ufile; int ret; down_read(&ufile->hw_destroy_rwsem); @@ -207,7 +209,7 @@ int uobj_destroy(struct ib_uobject *uobj) if (ret) goto out_unlock; - ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY); + ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY, attrs); if (ret) { atomic_set(&uobj->usecnt, 0); goto out_unlock; @@ -224,18 +226,17 @@ out_unlock: * uverbs_put_destroy. */ struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj, - u32 id, - const struct uverbs_attr_bundle *attrs) + u32 id, struct uverbs_attr_bundle *attrs) { struct ib_uobject *uobj; int ret; uobj = rdma_lookup_get_uobject(obj, attrs->ufile, id, - UVERBS_LOOKUP_DESTROY); + UVERBS_LOOKUP_DESTROY, attrs); if (IS_ERR(uobj)) return uobj; - ret = uobj_destroy(uobj); + ret = uobj_destroy(uobj, attrs); if (ret) { rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY); return ERR_PTR(ret); @@ -249,7 +250,7 @@ struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj, * (negative errno on failure). For use by callers that do not need the uobj. */ int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id, - const struct uverbs_attr_bundle *attrs) + struct uverbs_attr_bundle *attrs) { struct ib_uobject *uobj; @@ -296,25 +297,13 @@ static struct ib_uobject *alloc_uobj(struct ib_uverbs_file *ufile, static int idr_add_uobj(struct ib_uobject *uobj) { - int ret; - - idr_preload(GFP_KERNEL); - spin_lock(&uobj->ufile->idr_lock); - - /* - * We start with allocating an idr pointing to NULL. This represents an - * object which isn't initialized yet. 
We'll replace it later on with - * the real object once we commit. - */ - ret = idr_alloc(&uobj->ufile->idr, NULL, 0, - min_t(unsigned long, U32_MAX - 1, INT_MAX), GFP_NOWAIT); - if (ret >= 0) - uobj->id = ret; - - spin_unlock(&uobj->ufile->idr_lock); - idr_preload_end(); - - return ret < 0 ? ret : 0; + /* + * We start with allocating an idr pointing to NULL. This represents an + * object which isn't initialized yet. We'll replace it later on with + * the real object once we commit. + */ + return xa_alloc(&uobj->ufile->idr, &uobj->id, NULL, xa_limit_32b, + GFP_KERNEL); } /* Returns the ib_uobject or an error. The caller should check for IS_ERR. */ @@ -324,29 +313,20 @@ lookup_get_idr_uobject(const struct uverbs_api_object *obj, enum rdma_lookup_mode mode) { struct ib_uobject *uobj; - unsigned long idrno = id; if (id < 0 || id > ULONG_MAX) return ERR_PTR(-EINVAL); rcu_read_lock(); - /* object won't be released as we're protected in rcu */ - uobj = idr_find(&ufile->idr, idrno); - if (!uobj) { - uobj = ERR_PTR(-ENOENT); - goto free; - } - /* * The idr_find is guaranteed to return a pointer to something that * isn't freed yet, or NULL, as the free after idr_remove goes through * kfree_rcu(). However the object may still have been released and * kfree() could be called at any time. */ - if (!kref_get_unless_zero(&uobj->ref)) + uobj = xa_load(&ufile->idr, id); + if (!uobj || !kref_get_unless_zero(&uobj->ref)) uobj = ERR_PTR(-ENOENT); - -free: rcu_read_unlock(); return uobj; } @@ -393,12 +373,13 @@ lookup_get_fd_uobject(const struct uverbs_api_object *obj, struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj, struct ib_uverbs_file *ufile, s64 id, - enum rdma_lookup_mode mode) + enum rdma_lookup_mode mode, + struct uverbs_attr_bundle *attrs) { struct ib_uobject *uobj; int ret; - if (IS_ERR(obj) && PTR_ERR(obj) == -ENOMSG) { + if (obj == ERR_PTR(-ENOMSG)) { /* must be UVERBS_IDR_ANY_OBJECT, see uapi_get_object() */ uobj = lookup_get_idr_uobject(NULL, ufile, id, mode); if (IS_ERR(uobj)) @@ -431,6 +412,8 @@ struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj, ret = uverbs_try_lock_object(uobj, mode); if (ret) goto free; + if (attrs) + attrs->context = uobj->context; return uobj; free: @@ -438,38 +421,6 @@ free: uverbs_uobject_put(uobj); return ERR_PTR(ret); } -struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type, - u32 object_id, - struct uverbs_attr_bundle *attrs) -{ - struct ib_uobject *uobj; - - uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile, - object_id, UVERBS_LOOKUP_READ); - if (IS_ERR(uobj)) - return uobj; - - attrs->context = uobj->context; - - return uobj; -} - -struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type, - u32 object_id, - struct uverbs_attr_bundle *attrs) -{ - struct ib_uobject *uobj; - - uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile, - object_id, UVERBS_LOOKUP_WRITE); - - if (IS_ERR(uobj)) - return uobj; - - attrs->context = uobj->context; - - return uobj; -} static struct ib_uobject * alloc_begin_idr_uobject(const struct uverbs_api_object *obj, @@ -489,14 +440,12 @@ alloc_begin_idr_uobject(const struct uverbs_api_object *obj, ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device, RDMACG_RESOURCE_HCA_OBJECT); if (ret) - goto idr_remove; + goto remove; return uobj; -idr_remove: - spin_lock(&ufile->idr_lock); - idr_remove(&ufile->idr, uobj->id); - spin_unlock(&ufile->idr_lock); +remove: + xa_erase(&ufile->idr, uobj->id); uobj_put: 
uverbs_uobject_put(uobj); return ERR_PTR(ret); @@ -526,7 +475,8 @@ alloc_begin_fd_uobject(const struct uverbs_api_object *obj, } struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj, - struct ib_uverbs_file *ufile) + struct ib_uverbs_file *ufile, + struct uverbs_attr_bundle *attrs) { struct ib_uobject *ret; @@ -546,6 +496,8 @@ struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj, up_read(&ufile->hw_destroy_rwsem); return ret; } + if (attrs) + attrs->context = ret->context; return ret; } @@ -554,18 +506,17 @@ static void alloc_abort_idr_uobject(struct ib_uobject *uobj) ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, RDMACG_RESOURCE_HCA_OBJECT); - spin_lock(&uobj->ufile->idr_lock); - idr_remove(&uobj->ufile->idr, uobj->id); - spin_unlock(&uobj->ufile->idr_lock); + xa_erase(&uobj->ufile->idr, uobj->id); } static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { const struct uverbs_obj_idr_type *idr_type = container_of(uobj->uapi_object->type_attrs, struct uverbs_obj_idr_type, type); - int ret = idr_type->destroy_object(uobj, why); + int ret = idr_type->destroy_object(uobj, why, attrs); /* * We can only fail gracefully if the user requested to destroy the @@ -586,9 +537,7 @@ static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj, static void remove_handle_idr_uobject(struct ib_uobject *uobj) { - spin_lock(&uobj->ufile->idr_lock); - idr_remove(&uobj->ufile->idr, uobj->id); - spin_unlock(&uobj->ufile->idr_lock); + xa_erase(&uobj->ufile->idr, uobj->id); /* Matches the kref in alloc_commit_idr_uobject */ uverbs_uobject_put(uobj); } @@ -599,7 +548,8 @@ static void alloc_abort_fd_uobject(struct ib_uobject *uobj) } static int __must_check destroy_hw_fd_uobject(struct ib_uobject *uobj, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { const struct uverbs_obj_fd_type *fd_type = container_of( uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type); @@ -618,17 +568,17 @@ static void remove_handle_fd_uobject(struct ib_uobject *uobj) static int alloc_commit_idr_uobject(struct ib_uobject *uobj) { struct ib_uverbs_file *ufile = uobj->ufile; + void *old; - spin_lock(&ufile->idr_lock); /* * We already allocated this IDR with a NULL object, so * this shouldn't fail. * - * NOTE: Once we set the IDR we loose ownership of our kref on uobj. + * NOTE: Storing the uobj transfers our kref on uobj to the XArray. * It will be put by remove_commit_idr_uobject() */ - WARN_ON(idr_replace(&ufile->idr, uobj, uobj->id)); - spin_unlock(&ufile->idr_lock); + old = xa_store(&ufile->idr, uobj->id, uobj, GFP_KERNEL); + WARN_ON(old != NULL); return 0; } @@ -675,15 +625,16 @@ static int alloc_commit_fd_uobject(struct ib_uobject *uobj) * caller can no longer assume uobj is valid. If this function fails it * destroys the uboject, including the attached HW object. 
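/*
 * Editorial sketch (not part of the patch): the reserve-then-publish
 * scheme used above.  xa_alloc() stores a NULL entry so concurrent
 * lookups see "no object yet"; the real pointer is published with
 * xa_store() only at commit time.  Names are illustrative.
 */
#include <linux/xarray.h>

static int my_obj_reserve(struct xarray *xa, u32 *id)
{
	/* The slot exists afterwards, but holds NULL until publish */
	return xa_alloc(xa, id, NULL, xa_limit_32b, GFP_KERNEL);
}

static void my_obj_publish(struct xarray *xa, u32 id, void *obj)
{
	void *old;

	/* Replaces the NULL placeholder; the slot is already allocated */
	old = xa_store(xa, id, obj, GFP_KERNEL);
	WARN_ON(old != NULL);
}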
*/ -int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj) +int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj, + struct uverbs_attr_bundle *attrs) { - struct ib_uverbs_file *ufile = uobj->ufile; + struct ib_uverbs_file *ufile = attrs->ufile; int ret; /* alloc_commit consumes the uobj kref */ ret = uobj->uapi_object->type_class->alloc_commit(uobj); if (ret) { - uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT); + uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs); up_read(&ufile->hw_destroy_rwsem); return ret; } @@ -707,12 +658,13 @@ int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj) * This consumes the kref for uobj. It is up to the caller to unwind the HW * object and anything else connected to uobj before calling this. */ -void rdma_alloc_abort_uobject(struct ib_uobject *uobj) +void rdma_alloc_abort_uobject(struct ib_uobject *uobj, + struct uverbs_attr_bundle *attrs) { struct ib_uverbs_file *ufile = uobj->ufile; uobj->object = NULL; - uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT); + uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs); /* Matches the down_read in rdma_alloc_begin_uobject */ up_read(&ufile->hw_destroy_rwsem); @@ -760,29 +712,28 @@ void rdma_lookup_put_uobject(struct ib_uobject *uobj, void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile) { - spin_lock_init(&ufile->idr_lock); - idr_init(&ufile->idr); + xa_init_flags(&ufile->idr, XA_FLAGS_ALLOC); } void release_ufile_idr_uobject(struct ib_uverbs_file *ufile) { struct ib_uobject *entry; - int id; + unsigned long id; /* * At this point uverbs_cleanup_ufile() is guaranteed to have run, and - * there are no HW objects left, however the IDR is still populated + * there are no HW objects left, however the xarray is still populated * with anything that has not been cleaned up by userspace. Since the * kref on ufile is 0, nothing is allowed to call lookup_get. * * This is an optimized equivalent to remove_handle_idr_uobject */ - idr_for_each_entry(&ufile->idr, entry, id) { + xa_for_each(&ufile->idr, id, entry) { WARN_ON(entry->object); uverbs_uobject_put(entry); } - idr_destroy(&ufile->idr); + xa_destroy(&ufile->idr); } const struct uverbs_obj_type_class uverbs_idr_class = { @@ -814,6 +765,10 @@ void uverbs_close_fd(struct file *f) { struct ib_uobject *uobj = f->private_data; struct ib_uverbs_file *ufile = uobj->ufile; + struct uverbs_attr_bundle attrs = { + .context = uobj->context, + .ufile = ufile, + }; if (down_read_trylock(&ufile->hw_destroy_rwsem)) { /* @@ -823,7 +778,7 @@ void uverbs_close_fd(struct file *f) * write lock here, or we have a kernel bug. */ WARN_ON(uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE)); - uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE); + uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE, &attrs); up_read(&ufile->hw_destroy_rwsem); } @@ -872,6 +827,7 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile, { struct ib_uobject *obj, *next_obj; int ret = -EINVAL; + struct uverbs_attr_bundle attrs = { .ufile = ufile }; /* * This shouldn't run while executing other commands on this @@ -883,12 +839,13 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile, * other threads (which might still use the FDs) chance to run. */ list_for_each_entry_safe(obj, next_obj, &ufile->uobjects, list) { + attrs.context = obj->context; /* * if we hit this WARN_ON, that means we are * racing with a lookup_get. 
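/*
 * Editorial sketch (not part of the patch): the teardown walk that
 * release_ufile_idr_uobject() now does -- visit every entry still in
 * the XArray, drop the reference the table held on it, then destroy
 * the array.  The "reference drop" is simplified to kfree() here; the
 * real code calls uverbs_uobject_put().
 */
#include <linux/xarray.h>
#include <linux/slab.h>

static void my_table_release(struct xarray *xa)
{
	unsigned long id;
	void *entry;

	xa_for_each(xa, id, entry)
		kfree(entry);		/* stand-in for the kref put */

	xa_destroy(xa);
}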
*/ WARN_ON(uverbs_try_lock_object(obj, UVERBS_LOOKUP_WRITE)); - if (!uverbs_destroy_uobject(obj, reason)) + if (!uverbs_destroy_uobject(obj, reason, &attrs)) ret = 0; else atomic_set(&obj->usecnt, 0); @@ -967,26 +924,25 @@ const struct uverbs_obj_type_class uverbs_fd_class = { EXPORT_SYMBOL(uverbs_fd_class); struct ib_uobject * -uverbs_get_uobject_from_file(u16 object_id, - struct ib_uverbs_file *ufile, - enum uverbs_obj_access access, s64 id) +uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, + s64 id, struct uverbs_attr_bundle *attrs) { const struct uverbs_api_object *obj = - uapi_get_object(ufile->device->uapi, object_id); + uapi_get_object(attrs->ufile->device->uapi, object_id); switch (access) { case UVERBS_ACCESS_READ: - return rdma_lookup_get_uobject(obj, ufile, id, - UVERBS_LOOKUP_READ); + return rdma_lookup_get_uobject(obj, attrs->ufile, id, + UVERBS_LOOKUP_READ, attrs); case UVERBS_ACCESS_DESTROY: /* Actual destruction is done inside uverbs_handle_method */ - return rdma_lookup_get_uobject(obj, ufile, id, - UVERBS_LOOKUP_DESTROY); + return rdma_lookup_get_uobject(obj, attrs->ufile, id, + UVERBS_LOOKUP_DESTROY, attrs); case UVERBS_ACCESS_WRITE: - return rdma_lookup_get_uobject(obj, ufile, id, - UVERBS_LOOKUP_WRITE); + return rdma_lookup_get_uobject(obj, attrs->ufile, id, + UVERBS_LOOKUP_WRITE, attrs); case UVERBS_ACCESS_NEW: - return rdma_alloc_begin_uobject(obj, ufile); + return rdma_alloc_begin_uobject(obj, attrs->ufile, attrs); default: WARN_ON(true); return ERR_PTR(-EOPNOTSUPP); @@ -994,8 +950,8 @@ uverbs_get_uobject_from_file(u16 object_id, } int uverbs_finalize_object(struct ib_uobject *uobj, - enum uverbs_obj_access access, - bool commit) + enum uverbs_obj_access access, bool commit, + struct uverbs_attr_bundle *attrs) { int ret = 0; @@ -1018,9 +974,9 @@ int uverbs_finalize_object(struct ib_uobject *uobj, break; case UVERBS_ACCESS_NEW: if (commit) - ret = rdma_alloc_commit_uobject(uobj); + ret = rdma_alloc_commit_uobject(uobj, attrs); else - rdma_alloc_abort_uobject(uobj); + rdma_alloc_abort_uobject(uobj, attrs); break; default: WARN_ON(true); diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 69f8db66925e..5445323629b5 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -48,7 +48,7 @@ struct ib_uverbs_device; void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, enum rdma_remove_reason reason); -int uobj_destroy(struct ib_uobject *uobj); +int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs); /* * uverbs_uobject_get is called in order to increase the reference count on @@ -83,9 +83,8 @@ void uverbs_close_fd(struct file *f); * uverbs_finalize_objects are called. */ struct ib_uobject * -uverbs_get_uobject_from_file(u16 object_id, - struct ib_uverbs_file *ufile, - enum uverbs_obj_access access, s64 id); +uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access, + s64 id, struct uverbs_attr_bundle *attrs); /* * Note that certain finalize stages could return a status: @@ -103,8 +102,8 @@ uverbs_get_uobject_from_file(u16 object_id, * object. 
*/ int uverbs_finalize_object(struct ib_uobject *uobj, - enum uverbs_obj_access access, - bool commit); + enum uverbs_obj_access access, bool commit, + struct uverbs_attr_bundle *attrs); int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index bb534959abf0..7d8071c7e564 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -40,7 +40,7 @@ #include <linux/slab.h> #include <linux/dma-mapping.h> #include <linux/kref.h> -#include <linux/idr.h> +#include <linux/xarray.h> #include <linux/workqueue.h> #include <uapi/linux/if_ether.h> #include <rdma/ib_pack.h> @@ -183,8 +183,7 @@ static struct ib_client sa_client = { .remove = ib_sa_remove_one }; -static DEFINE_SPINLOCK(idr_lock); -static DEFINE_IDR(query_idr); +static DEFINE_XARRAY_FLAGS(queries, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); static DEFINE_SPINLOCK(tid_lock); static u32 tid; @@ -1180,14 +1179,14 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query) struct ib_mad_agent *agent; struct ib_mad_send_buf *mad_buf; - spin_lock_irqsave(&idr_lock, flags); - if (idr_find(&query_idr, id) != query) { - spin_unlock_irqrestore(&idr_lock, flags); + xa_lock_irqsave(&queries, flags); + if (xa_load(&queries, id) != query) { + xa_unlock_irqrestore(&queries, flags); return; } agent = query->port->agent; mad_buf = query->mad_buf; - spin_unlock_irqrestore(&idr_lock, flags); + xa_unlock_irqrestore(&queries, flags); /* * If the query is still on the netlink request list, schedule @@ -1363,21 +1362,14 @@ static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent) static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, gfp_t gfp_mask) { - bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; int ret, id; - if (preload) - idr_preload(gfp_mask); - spin_lock_irqsave(&idr_lock, flags); - - id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT); - - spin_unlock_irqrestore(&idr_lock, flags); - if (preload) - idr_preload_end(); - if (id < 0) - return id; + xa_lock_irqsave(&queries, flags); + ret = __xa_alloc(&queries, &id, query, xa_limit_32b, gfp_mask); + xa_unlock_irqrestore(&queries, flags); + if (ret < 0) + return ret; query->mad_buf->timeout_ms = timeout_ms; query->mad_buf->context[0] = query; @@ -1394,9 +1386,9 @@ static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, ret = ib_post_send_mad(query->mad_buf, NULL); if (ret) { - spin_lock_irqsave(&idr_lock, flags); - idr_remove(&query_idr, id); - spin_unlock_irqrestore(&idr_lock, flags); + xa_lock_irqsave(&queries, flags); + __xa_erase(&queries, id); + xa_unlock_irqrestore(&queries, flags); } /* @@ -2188,9 +2180,9 @@ static void send_handler(struct ib_mad_agent *agent, break; } - spin_lock_irqsave(&idr_lock, flags); - idr_remove(&query_idr, query->id); - spin_unlock_irqrestore(&idr_lock, flags); + xa_lock_irqsave(&queries, flags); + __xa_erase(&queries, query->id); + xa_unlock_irqrestore(&queries, flags); free_mad(query); if (query->client) @@ -2475,5 +2467,5 @@ void ib_sa_cleanup(void) destroy_workqueue(ib_nl_wq); mcast_cleanup(); ib_unregister_client(&sa_client); - idr_destroy(&query_idr); + WARN_ON(!xa_empty(&queries)); } diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 9b6a065bdfa5..c78d0c9646ae 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -349,10 +349,15 @@ static struct attribute *port_default_attrs[] = { static 
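/*
 * Editorial sketch (not part of the patch): sa_query.c above drops its
 * private idr_lock and reuses the XArray's own spinlock instead.
 * XA_FLAGS_LOCK_IRQ makes that lock IRQ-safe, and the __xa_*()
 * variants are the ones legal inside xa_lock_irqsave() sections.
 */
#include <linux/xarray.h>

static DEFINE_XARRAY_FLAGS(my_queries, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);

static int my_query_track(void *query, u32 *out_id)
{
	unsigned long flags;
	int ret;

	xa_lock_irqsave(&my_queries, flags);
	ret = __xa_alloc(&my_queries, out_id, query, xa_limit_32b,
			 GFP_ATOMIC);
	xa_unlock_irqrestore(&my_queries, flags);
	return ret;
}

static void my_query_done(u32 id)
{
	unsigned long flags;

	xa_lock_irqsave(&my_queries, flags);
	__xa_erase(&my_queries, id);
	xa_unlock_irqrestore(&my_queries, flags);
}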
size_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf) { - if (!gid_attr->ndev) - return -EINVAL; - - return sprintf(buf, "%s\n", gid_attr->ndev->name); + struct net_device *ndev; + size_t ret = -EINVAL; + + rcu_read_lock(); + ndev = rcu_dereference(gid_attr->ndev); + if (ndev) + ret = sprintf(buf, "%s\n", ndev->name); + rcu_read_unlock(); + return ret; } static size_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf) @@ -1015,8 +1020,10 @@ err_free_stats: return; } -static int add_port(struct ib_device *device, int port_num) +static int add_port(struct ib_core_device *coredev, int port_num) { + struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); + bool is_full_dev = &device->coredev == coredev; struct ib_port *p; struct ib_port_attr attr; int i; @@ -1034,7 +1041,7 @@ static int add_port(struct ib_device *device, int port_num) p->port_num = port_num; ret = kobject_init_and_add(&p->kobj, &port_type, - device->ports_kobj, + coredev->ports_kobj, "%d", port_num); if (ret) { kfree(p); @@ -1055,7 +1062,7 @@ static int add_port(struct ib_device *device, int port_num) goto err_put; } - if (device->ops.process_mad) { + if (device->ops.process_mad && is_full_dev) { p->pma_table = get_counter_table(device, port_num); ret = sysfs_create_group(&p->kobj, p->pma_table); if (ret) @@ -1111,7 +1118,7 @@ static int add_port(struct ib_device *device, int port_num) if (ret) goto err_free_pkey; - if (device->ops.init_port) { + if (device->ops.init_port && is_full_dev) { ret = device->ops.init_port(device, port_num, &p->kobj); if (ret) goto err_remove_pkey; @@ -1122,10 +1129,10 @@ static int add_port(struct ib_device *device, int port_num) * port, so holder should be device. Therefore skip per port conunter * initialization. */ - if (device->ops.alloc_hw_stats && port_num) + if (device->ops.alloc_hw_stats && port_num && is_full_dev) setup_hw_stats(device, p, port_num); - list_add_tail(&p->kobj.entry, &device->port_list); + list_add_tail(&p->kobj.entry, &coredev->port_list); kobject_uevent(&p->kobj, KOBJ_ADD); return 0; @@ -1194,6 +1201,7 @@ static ssize_t node_type_show(struct device *device, case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type); case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type); + case RDMA_NODE_UNSPECIFIED: return sprintf(buf, "%d: unspecified\n", dev->node_type); case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); default: return sprintf(buf, "%d: <unknown>\n", dev->node_type); @@ -1279,11 +1287,11 @@ const struct attribute_group ib_dev_attr_group = { .attrs = ib_dev_attrs, }; -static void ib_free_port_attrs(struct ib_device *device) +void ib_free_port_attrs(struct ib_core_device *coredev) { struct kobject *p, *t; - list_for_each_entry_safe(p, t, &device->port_list, entry) { + list_for_each_entry_safe(p, t, &coredev->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); list_del(&p->entry); @@ -1303,20 +1311,22 @@ static void ib_free_port_attrs(struct ib_device *device) kobject_put(p); } - kobject_put(device->ports_kobj); + kobject_put(coredev->ports_kobj); } -static int ib_setup_port_attrs(struct ib_device *device) +int ib_setup_port_attrs(struct ib_core_device *coredev) { + struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); unsigned int port; int ret; - device->ports_kobj = 
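/*
 * Editorial sketch (not part of the patch): the read side print_ndev()
 * above switches to -- only touch the RCU-protected gid_attr->ndev
 * inside an RCU read-side critical section and copy out what is
 * needed before leaving it.  The helper name is illustrative.
 */
static int my_gid_ndev_name(const struct ib_gid_attr *gid_attr,
			    char *name, size_t len)
{
	struct net_device *ndev;
	int ret = -ENODEV;

	rcu_read_lock();
	ndev = rcu_dereference(gid_attr->ndev);
	if (ndev) {
		strscpy(name, ndev->name, len);
		ret = 0;
	}
	rcu_read_unlock();

	return ret;
}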
kobject_create_and_add("ports", &device->dev.kobj); - if (!device->ports_kobj) + coredev->ports_kobj = kobject_create_and_add("ports", + &coredev->dev.kobj); + if (!coredev->ports_kobj) return -ENOMEM; rdma_for_each_port (device, port) { - ret = add_port(device, port); + ret = add_port(coredev, port); if (ret) goto err_put; } @@ -1324,7 +1334,7 @@ static int ib_setup_port_attrs(struct ib_device *device) return 0; err_put: - ib_free_port_attrs(device); + ib_free_port_attrs(coredev); return ret; } @@ -1332,7 +1342,7 @@ int ib_device_register_sysfs(struct ib_device *device) { int ret; - ret = ib_setup_port_attrs(device); + ret = ib_setup_port_attrs(&device->coredev); if (ret) return ret; @@ -1348,5 +1358,48 @@ void ib_device_unregister_sysfs(struct ib_device *device) free_hsag(&device->dev.kobj, device->hw_stats_ag); kfree(device->hw_stats); - ib_free_port_attrs(device); + ib_free_port_attrs(&device->coredev); +} + +/** + * ib_port_register_module_stat - add module counters under relevant port + * of IB device. + * + * @device: IB device to add counters + * @port_num: valid port number + * @kobj: pointer to the kobject to initialize + * @ktype: pointer to the ktype for this kobject. + * @name: the name of the kobject + */ +int ib_port_register_module_stat(struct ib_device *device, u8 port_num, + struct kobject *kobj, struct kobj_type *ktype, + const char *name) +{ + struct kobject *p, *t; + int ret; + + list_for_each_entry_safe(p, t, &device->coredev.port_list, entry) { + struct ib_port *port = container_of(p, struct ib_port, kobj); + + if (port->port_num != port_num) + continue; + + ret = kobject_init_and_add(kobj, ktype, &port->kobj, "%s", + name); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL(ib_port_register_module_stat); + +/** + * ib_port_unregister_module_stat - release module counters + * @kobj: pointer to the kobject to release + */ +void ib_port_unregister_module_stat(struct kobject *kobj) +{ + kobject_put(kobj); } +EXPORT_SYMBOL(ib_port_unregister_module_stat); diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 65c3230f5663..8e7da2d41fd8 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -42,7 +42,7 @@ #include <linux/file.h> #include <linux/mount.h> #include <linux/cdev.h> -#include <linux/idr.h> +#include <linux/xarray.h> #include <linux/mutex.h> #include <linux/slab.h> @@ -125,23 +125,22 @@ static struct ib_client ucm_client = { .remove = ib_ucm_remove_one }; -static DEFINE_MUTEX(ctx_id_mutex); -static DEFINE_IDR(ctx_id_table); +static DEFINE_XARRAY_ALLOC(ctx_id_table); static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES); static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id) { struct ib_ucm_context *ctx; - mutex_lock(&ctx_id_mutex); - ctx = idr_find(&ctx_id_table, id); + xa_lock(&ctx_id_table); + ctx = xa_load(&ctx_id_table, id); if (!ctx) ctx = ERR_PTR(-ENOENT); else if (ctx->file != file) ctx = ERR_PTR(-EINVAL); else atomic_inc(&ctx->ref); - mutex_unlock(&ctx_id_mutex); + xa_unlock(&ctx_id_table); return ctx; } @@ -194,10 +193,7 @@ static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file) ctx->file = file; INIT_LIST_HEAD(&ctx->events); - mutex_lock(&ctx_id_mutex); - ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL); - mutex_unlock(&ctx_id_mutex); - if (ctx->id < 0) + if (xa_alloc(&ctx_id_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL)) goto error; list_add_tail(&ctx->file_list, &file->ctxs); @@ -514,9 +510,7 @@ static ssize_t ib_ucm_create_id(struct 
ib_ucm_file *file, err2: ib_destroy_cm_id(ctx->cm_id); err1: - mutex_lock(&ctx_id_mutex); - idr_remove(&ctx_id_table, ctx->id); - mutex_unlock(&ctx_id_mutex); + xa_erase(&ctx_id_table, ctx->id); kfree(ctx); return result; } @@ -536,15 +530,15 @@ static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - mutex_lock(&ctx_id_mutex); - ctx = idr_find(&ctx_id_table, cmd.id); + xa_lock(&ctx_id_table); + ctx = xa_load(&ctx_id_table, cmd.id); if (!ctx) ctx = ERR_PTR(-ENOENT); else if (ctx->file != file) ctx = ERR_PTR(-EINVAL); else - idr_remove(&ctx_id_table, ctx->id); - mutex_unlock(&ctx_id_mutex); + __xa_erase(&ctx_id_table, ctx->id); + xa_unlock(&ctx_id_table); if (IS_ERR(ctx)) return PTR_ERR(ctx); @@ -1189,10 +1183,7 @@ static int ib_ucm_close(struct inode *inode, struct file *filp) struct ib_ucm_context, file_list); mutex_unlock(&file->file_mutex); - mutex_lock(&ctx_id_mutex); - idr_remove(&ctx_id_table, ctx->id); - mutex_unlock(&ctx_id_mutex); - + xa_erase(&ctx_id_table, ctx->id); ib_destroy_cm_id(ctx->cm_id); ib_ucm_cleanup_events(ctx); kfree(ctx); @@ -1352,7 +1343,7 @@ static void __exit ib_ucm_cleanup(void) class_remove_file(&cm_class, &class_attr_abi_version.attr); unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); - idr_destroy(&ctx_id_table); + WARN_ON(!xa_empty(&ctx_id_table)); } module_init(ib_ucm_init); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index fe5551562dbc..0a23048db523 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -37,27 +37,23 @@ #include <linux/sched/signal.h> #include <linux/sched/mm.h> #include <linux/export.h> -#include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/pagemap.h> #include <rdma/ib_umem_odp.h> #include "uverbs.h" - static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty) { - struct scatterlist *sg; + struct sg_page_iter sg_iter; struct page *page; - int i; if (umem->nmap > 0) - ib_dma_unmap_sg(dev, umem->sg_head.sgl, - umem->npages, + ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents, DMA_BIDIRECTIONAL); - for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) { - - page = sg_page(sg); + for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { + page = sg_page_iter_page(&sg_iter); if (!PageDirty(page) && umem->writable && dirty) set_page_dirty_lock(page); put_page(page); @@ -66,6 +62,124 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d sg_free_table(&umem->sg_head); } +/* ib_umem_add_sg_table - Add N contiguous pages to scatter table + * + * sg: current scatterlist entry + * page_list: array of npage struct page pointers + * npages: number of pages in page_list + * max_seg_sz: maximum segment size in bytes + * nents: [out] number of entries in the scatterlist + * + * Return new end of scatterlist + */ +static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg, + struct page **page_list, + unsigned long npages, + unsigned int max_seg_sz, + int *nents) +{ + unsigned long first_pfn; + unsigned long i = 0; + bool update_cur_sg = false; + bool first = !sg_page(sg); + + /* Check if new page_list is contiguous with end of previous page_list. + * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0. 
+ */ + if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) == + page_to_pfn(page_list[0]))) + update_cur_sg = true; + + while (i != npages) { + unsigned long len; + struct page *first_page = page_list[i]; + + first_pfn = page_to_pfn(first_page); + + /* Compute the number of contiguous pages we have starting + * at i + */ + for (len = 0; i != npages && + first_pfn + len == page_to_pfn(page_list[i]) && + len < (max_seg_sz >> PAGE_SHIFT); + len++) + i++; + + /* Squash N contiguous pages from page_list into current sge */ + if (update_cur_sg) { + if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) { + sg_set_page(sg, sg_page(sg), + sg->length + (len << PAGE_SHIFT), + 0); + update_cur_sg = false; + continue; + } + update_cur_sg = false; + } + + /* Squash N contiguous pages into next sge or first sge */ + if (!first) + sg = sg_next(sg); + + (*nents)++; + sg_set_page(sg, first_page, len << PAGE_SHIFT, 0); + first = false; + } + + return sg; +} + +/** + * ib_umem_find_best_pgsz - Find best HW page size to use for this MR + * + * @umem: umem struct + * @pgsz_bitmap: bitmap of HW supported page sizes + * @virt: IOVA + * + * This helper is intended for HW that support multiple page + * sizes but can do only a single page size in an MR. + * + * Returns 0 if the umem requires page sizes not supported by + * the driver to be mapped. Drivers always supporting PAGE_SIZE + * or smaller will never see a 0 result. + */ +unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, + unsigned long pgsz_bitmap, + unsigned long virt) +{ + struct scatterlist *sg; + unsigned int best_pg_bit; + unsigned long va, pgoff; + dma_addr_t mask; + int i; + + /* At minimum, drivers must support PAGE_SIZE or smaller */ + if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0)))) + return 0; + + va = virt; + /* max page size not to exceed MR length */ + mask = roundup_pow_of_two(umem->length); + /* offset into first SGL */ + pgoff = umem->address & ~PAGE_MASK; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { + /* Walk SGL and reduce max page size if VA/PA bits differ + * for any address. + */ + mask |= (sg_dma_address(sg) + pgoff) ^ va; + if (i && i != (umem->nmap - 1)) + /* restrict by length as well for interior SGEs */ + mask |= sg_dma_len(sg); + va += sg_dma_len(sg) - pgoff; + pgoff = 0; + } + best_pg_bit = rdma_find_pg_bit(mask, pgsz_bitmap); + + return BIT_ULL(best_pg_bit); +} +EXPORT_SYMBOL(ib_umem_find_best_pgsz); + /** * ib_umem_get - Pin and DMA map userspace memory. 
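/*
 * Editorial sketch (not part of the patch): how a driver is expected
 * to consume ib_umem_find_best_pgsz() above when registering an MR.
 * The supported-size bitmap, struct my_mr and the HW programming step
 * are illustrative.
 */
#include <linux/sizes.h>
#include <linux/log2.h>

struct my_mr {
	unsigned int page_shift;
	/* ... */
};

static int my_reg_mr_pages(struct my_mr *mr, struct ib_umem *umem, u64 iova)
{
	/* e.g. hardware that can use 4K, 64K or 2M pages for this MR */
	unsigned long pgsz_bitmap = SZ_4K | SZ_64K | SZ_2M;
	unsigned long pgsz;

	pgsz = ib_umem_find_best_pgsz(umem, pgsz_bitmap, iova);
	if (!pgsz)
		return -EINVAL;	/* umem not expressible with these sizes */

	mr->page_shift = order_base_2(pgsz);
	/* ... walk the SGL in pgsz-sized blocks and program the HW ... */
	return 0;
}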
* @@ -84,16 +198,14 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, struct ib_ucontext *context; struct ib_umem *umem; struct page **page_list; - struct vm_area_struct **vma_list; unsigned long lock_limit; unsigned long new_pinned; unsigned long cur_base; struct mm_struct *mm; unsigned long npages; int ret; - int i; unsigned long dma_attrs = 0; - struct scatterlist *sg, *sg_list_start; + struct scatterlist *sg; unsigned int gup_flags = FOLL_WRITE; if (!udata) @@ -138,29 +250,23 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, mmgrab(mm); if (access & IB_ACCESS_ON_DEMAND) { + if (WARN_ON_ONCE(!context->invalidate_range)) { + ret = -EINVAL; + goto umem_kfree; + } + ret = ib_umem_odp_get(to_ib_umem_odp(umem), access); if (ret) goto umem_kfree; return umem; } - /* We assume the memory is from hugetlb until proved otherwise */ - umem->hugetlb = 1; - page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { ret = -ENOMEM; goto umem_kfree; } - /* - * if we can't alloc the vma_list, it's not so bad; - * just assume the memory is not hugetlb memory - */ - vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL); - if (!vma_list) - umem->hugetlb = 0; - npages = ib_umem_num_pages(umem); if (npages == 0 || npages > UINT_MAX) { ret = -EINVAL; @@ -185,41 +291,34 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, if (!umem->writable) gup_flags |= FOLL_FORCE; - sg_list_start = umem->sg_head.sgl; + sg = umem->sg_head.sgl; while (npages) { down_read(&mm->mmap_sem); ret = get_user_pages_longterm(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), - gup_flags, page_list, vma_list); + gup_flags, page_list, NULL); if (ret < 0) { up_read(&mm->mmap_sem); goto umem_release; } - umem->npages += ret; cur_base += ret * PAGE_SIZE; npages -= ret; - /* Continue to hold the mmap_sem as vma_list access - * needs to be protected. 
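/*
 * Editorial note (not part of the patch), a worked example of the
 * ib_umem_add_sg_table() coalescing used in the pinning loop above:
 * suppose get_user_pages_longterm() returned pages with PFNs
 * 100, 101, 102, 200 and max_seg_sz is large.  PFNs 100-102 are
 * physically contiguous, so they are squashed into one scatterlist
 * entry of length 3 * PAGE_SIZE; PFN 200 starts a second entry of
 * length PAGE_SIZE.  If a later call's first page is contiguous with
 * the end of the previous entry (and the segment-size cap allows it),
 * that entry is extended rather than a new one started, which is why
 * sg_nents can end up far smaller than the pinned page count.
 */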
- */ - for_each_sg(sg_list_start, sg, ret, i) { - if (vma_list && !is_vm_hugetlb_page(vma_list[i])) - umem->hugetlb = 0; + sg = ib_umem_add_sg_table(sg, page_list, ret, + dma_get_max_seg_size(context->device->dma_device), + &umem->sg_nents); - sg_set_page(sg, page_list[i], PAGE_SIZE, 0); - } up_read(&mm->mmap_sem); - - /* preparing for next loop */ - sg_list_start = sg; } + sg_mark_end(sg); + umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, - umem->npages, + umem->sg_nents, DMA_BIDIRECTIONAL, dma_attrs); @@ -236,8 +335,6 @@ umem_release: vma: atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm); out: - if (vma_list) - free_page((unsigned long) vma_list); free_page((unsigned long) page_list); umem_kfree: if (ret) { @@ -315,7 +412,7 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, return -EINVAL; } - ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length, + ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length, offset + ib_umem_offset(umem)); if (ret < 0) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index e6ec79ad9cc8..c7226cf52acc 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -241,7 +241,7 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, per_mm->mm = mm; per_mm->umem_tree = RB_ROOT_CACHED; init_rwsem(&per_mm->umem_rwsem); - per_mm->active = ctx->invalidate_range; + per_mm->active = true; rcu_read_lock(); per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); @@ -417,9 +417,6 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) h = hstate_vma(vma); umem->page_shift = huge_page_shift(h); up_read(&mm->mmap_sem); - umem->hugetlb = 1; - } else { - umem->hugetlb = 0; } mutex_init(&umem_odp->umem_mutex); @@ -503,7 +500,6 @@ static int ib_umem_odp_map_dma_single_page( struct ib_umem *umem = &umem_odp->umem; struct ib_device *dev = umem->context->device; dma_addr_t dma_addr; - int stored_page = 0; int remove_existing_mapping = 0; int ret = 0; @@ -527,8 +523,7 @@ static int ib_umem_odp_map_dma_single_page( } umem_odp->dma_list[page_index] = dma_addr | access_mask; umem_odp->page_list[page_index] = page; - umem->npages++; - stored_page = 1; + umem_odp->npages++; } else if (umem_odp->page_list[page_index] == page) { umem_odp->dma_list[page_index] |= access_mask; } else { @@ -540,11 +535,9 @@ static int ib_umem_odp_map_dma_single_page( } out: - /* On Demand Paging - avoid pinning the page */ - if (umem->context->invalidate_range || !stored_page) - put_page(page); + put_page(page); - if (remove_existing_mapping && umem->context->invalidate_range) { + if (remove_existing_mapping) { ib_umem_notifier_start_account(umem_odp); umem->context->invalidate_range( umem_odp, @@ -754,12 +747,9 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, */ set_page_dirty(head_page); } - /* on demand pinning support */ - if (!umem->context->invalidate_range) - put_page(page); umem_odp->page_list[idx] = NULL; umem_odp->dma_list[idx] = 0; - umem->npages--; + umem_odp->npages--; } } mutex_unlock(&umem_odp->umem_mutex); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index b58b07c03cfb..671f07ba1fad 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -129,6 +129,9 @@ struct ib_umad_packet { struct ib_user_mad mad; }; +#define CREATE_TRACE_POINTS +#include <trace/events/ib_umad.h> + static const dev_t base_umad_dev = 
MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) + IB_UMAD_NUM_FIXED_MINOR; @@ -334,6 +337,9 @@ static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf, return -EFAULT; } } + + trace_ib_umad_read_recv(file, &packet->mad.hdr, &recv_buf->mad->mad_hdr); + return hdr_size(file) + packet->length; } @@ -353,6 +359,9 @@ static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf, if (copy_to_user(buf, packet->mad.data, packet->length)) return -EFAULT; + trace_ib_umad_read_send(file, &packet->mad.hdr, + (struct ib_mad_hdr *)&packet->mad.data); + return size; } @@ -508,6 +517,9 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, mutex_lock(&file->mutex); + trace_ib_umad_write(file, &packet->mad.hdr, + (struct ib_mad_hdr *)&packet->mad.data); + agent = __get_agent(file, packet->mad.hdr.id); if (!agent) { ret = -EINVAL; @@ -968,6 +980,11 @@ static int ib_umad_open(struct inode *inode, struct file *filp) goto out; } + if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) { + ret = -EPERM; + goto out; + } + file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) { ret = -ENOMEM; @@ -1061,6 +1078,11 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp) } } + if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) { + ret = -EPERM; + goto err_up_sem; + } + ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props); if (ret) goto err_up_sem; diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 32cc8fe7902f..1e5aeb39f774 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -162,9 +162,7 @@ struct ib_uverbs_file { struct list_head umaps; struct page *disassociate_page; - struct idr idr; - /* spinlock protects write access to idr */ - spinlock_t idr_lock; + struct xarray idr; }; struct ib_uverbs_event { @@ -241,7 +239,8 @@ void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd, - enum rdma_remove_reason why); + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs); int uverbs_dealloc_mw(struct ib_mw *mw); void ib_uverbs_detach_umcast(struct ib_qp *qp, diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 062a86c04123..5a3a1780ceea 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -162,7 +162,7 @@ static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter, const void __user *res = iter->cur; if (iter->cur + len > iter->end) - return ERR_PTR(-ENOSPC); + return (void __force __user *)ERR_PTR(-ENOSPC); iter->cur += len; return res; } @@ -175,7 +175,7 @@ static int uverbs_request_finish(struct uverbs_req_iter *iter) } static struct ib_uverbs_completion_event_file * -_ib_uverbs_lookup_comp_file(s32 fd, const struct uverbs_attr_bundle *attrs) +_ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs) { struct ib_uobject *uobj = ufd_get_read(UVERBS_OBJECT_COMP_CHANNEL, fd, attrs); @@ -230,6 +230,8 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) goto err_alloc; } + attrs->context = ucontext; + ucontext->res.type = RDMA_RESTRACK_CTX; ucontext->device = ib_dev; ucontext->cg_obj = cg_obj; @@ -423,7 +425,7 @@ static int ib_uverbs_alloc_pd(struct 
uverbs_attr_bundle *attrs) atomic_set(&pd->usecnt, 0); pd->res.type = RDMA_RESTRACK_PD; - ret = ib_dev->ops.alloc_pd(pd, uobj->context, &attrs->driver_udata); + ret = ib_dev->ops.alloc_pd(pd, &attrs->driver_udata); if (ret) goto err_alloc; @@ -436,15 +438,15 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs) if (ret) goto err_copy; - return uobj_alloc_commit(uobj); + return uobj_alloc_commit(uobj, attrs); err_copy: - ib_dealloc_pd(pd); + ib_dealloc_pd_user(pd, &attrs->driver_udata); pd = NULL; err_alloc: kfree(pd); err: - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); return ret; } @@ -594,8 +596,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) } if (!xrcd) { - xrcd = ib_dev->ops.alloc_xrcd(ib_dev, obj->uobject.context, - &attrs->driver_udata); + xrcd = ib_dev->ops.alloc_xrcd(ib_dev, &attrs->driver_udata); if (IS_ERR(xrcd)) { ret = PTR_ERR(xrcd); goto err; @@ -633,7 +634,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs) mutex_unlock(&ibudev->xrcd_tree_mutex); - return uobj_alloc_commit(&obj->uobject); + return uobj_alloc_commit(&obj->uobject, attrs); err_copy: if (inode) { @@ -643,10 +644,10 @@ err_copy: } err_dealloc_xrcd: - ib_dealloc_xrcd(xrcd); + ib_dealloc_xrcd(xrcd, &attrs->driver_udata); err: - uobj_alloc_abort(&obj->uobject); + uobj_alloc_abort(&obj->uobject, attrs); err_tree_mutex_unlock: if (f.file) @@ -669,19 +670,19 @@ static int ib_uverbs_close_xrcd(struct uverbs_attr_bundle *attrs) return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, attrs); } -int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, - struct ib_xrcd *xrcd, - enum rdma_remove_reason why) +int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct inode *inode; int ret; - struct ib_uverbs_device *dev = uobject->context->ufile->device; + struct ib_uverbs_device *dev = attrs->ufile->device; inode = xrcd->inode; if (inode && !atomic_dec_and_test(&xrcd->usecnt)) return 0; - ret = ib_dealloc_xrcd(xrcd); + ret = ib_dealloc_xrcd(xrcd, &attrs->driver_udata); if (ib_is_destroy_retryable(ret, why, uobject)) { atomic_inc(&xrcd->usecnt); @@ -763,16 +764,16 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) uobj_put_obj_read(pd); - return uobj_alloc_commit(uobj); + return uobj_alloc_commit(uobj, attrs); err_copy: - ib_dereg_mr(mr); + ib_dereg_mr_user(mr, &attrs->driver_udata); err_put: uobj_put_obj_read(pd); err_free: - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); return ret; } @@ -917,14 +918,14 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) goto err_copy; uobj_put_obj_read(pd); - return uobj_alloc_commit(uobj); + return uobj_alloc_commit(uobj, attrs); err_copy: uverbs_dealloc_mw(mw); err_put: uobj_put_obj_read(pd); err_free: - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); return ret; } @@ -965,11 +966,11 @@ static int ib_uverbs_create_comp_channel(struct uverbs_attr_bundle *attrs) ret = uverbs_response(attrs, &resp, sizeof(resp)); if (ret) { - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); return ret; } - return uobj_alloc_commit(uobj); + return uobj_alloc_commit(uobj, attrs); } static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, @@ -1009,8 +1010,7 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, attr.comp_vector = cmd->comp_vector; attr.flags = cmd->flags; - cq = ib_dev->ops.create_cq(ib_dev, &attr, obj->uobject.context, - 
&attrs->driver_udata); + cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata); if (IS_ERR(cq)) { ret = PTR_ERR(cq); goto err_file; @@ -1036,7 +1036,7 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, if (ret) goto err_cb; - ret = uobj_alloc_commit(&obj->uobject); + ret = uobj_alloc_commit(&obj->uobject, attrs); if (ret) return ERR_PTR(ret); return obj; @@ -1049,7 +1049,7 @@ err_file: ib_uverbs_release_ucq(attrs->ufile, ev_file, obj); err: - uobj_alloc_abort(&obj->uobject); + uobj_alloc_abort(&obj->uobject, attrs); return ERR_PTR(ret); } @@ -1418,7 +1418,6 @@ static int create_qp(struct uverbs_attr_bundle *attrs, if (ret) goto err_cb; - qp->real_qp = qp; qp->pd = pd; qp->send_cq = attr.send_cq; qp->recv_cq = attr.recv_cq; @@ -1477,7 +1476,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs, if (ind_tbl) uobj_put_obj_read(ind_tbl); - return uobj_alloc_commit(&obj->uevent.uobject); + return uobj_alloc_commit(&obj->uevent.uobject, attrs); err_cb: ib_destroy_qp(qp); @@ -1495,7 +1494,7 @@ err_put: if (ind_tbl) uobj_put_obj_read(ind_tbl); - uobj_alloc_abort(&obj->uevent.uobject); + uobj_alloc_abort(&obj->uevent.uobject, attrs); return ret; } @@ -1609,14 +1608,14 @@ static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs) qp->uobject = &obj->uevent.uobject; uobj_put_read(xrcd_uobj); - return uobj_alloc_commit(&obj->uevent.uobject); + return uobj_alloc_commit(&obj->uevent.uobject, attrs); err_destroy: ib_destroy_qp(qp); err_xrcd: uobj_put_read(xrcd_uobj); err_put: - uobj_alloc_abort(&obj->uevent.uobject); + uobj_alloc_abort(&obj->uevent.uobject, attrs); return ret; } @@ -2451,7 +2450,7 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs) goto err_copy; uobj_put_obj_read(pd); - return uobj_alloc_commit(uobj); + return uobj_alloc_commit(uobj, attrs); err_copy: rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE); @@ -2460,7 +2459,7 @@ err_put: uobj_put_obj_read(pd); err: - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); return ret; } @@ -2962,16 +2961,16 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) uobj_put_obj_read(pd); uobj_put_obj_read(cq); - return uobj_alloc_commit(&obj->uevent.uobject); + return uobj_alloc_commit(&obj->uevent.uobject, attrs); err_copy: - ib_destroy_wq(wq); + ib_destroy_wq(wq, &attrs->driver_udata); err_put_cq: uobj_put_obj_read(cq); err_put_pd: uobj_put_obj_read(pd); err_uobj: - uobj_alloc_abort(&obj->uevent.uobject); + uobj_alloc_abort(&obj->uevent.uobject, attrs); return err; } @@ -3136,12 +3135,12 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) for (j = 0; j < num_read_wqs; j++) uobj_put_obj_read(wqs[j]); - return uobj_alloc_commit(uobj); + return uobj_alloc_commit(uobj, attrs); err_copy: ib_destroy_rwq_ind_table(rwq_ind_tbl); err_uobj: - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); put_wqs: for (j = 0; j < num_read_wqs; j++) uobj_put_obj_read(wqs[j]); @@ -3314,7 +3313,7 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) kfree(flow_attr); if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); - return uobj_alloc_commit(uobj); + return uobj_alloc_commit(uobj, attrs); err_copy: if (!qp->device->ops.destroy_flow(flow_id)) atomic_dec(&qp->usecnt); @@ -3325,7 +3324,7 @@ err_free_flow_attr: err_put: uobj_put_obj_read(qp); err_uobj: - uobj_alloc_abort(uobj); + uobj_alloc_abort(uobj, attrs); err_free_attr: if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); @@ -3411,9 +3410,9 @@ static int __uverbs_create_xsrq(struct 
uverbs_attr_bundle *attrs, obj->uevent.events_reported = 0; INIT_LIST_HEAD(&obj->uevent.event_list); - srq = pd->device->ops.create_srq(pd, &attr, udata); - if (IS_ERR(srq)) { - ret = PTR_ERR(srq); + srq = rdma_zalloc_drv_obj(ib_dev, ib_srq); + if (!srq) { + ret = -ENOMEM; goto err_put; } @@ -3424,6 +3423,10 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, srq->event_handler = attr.event_handler; srq->srq_context = attr.srq_context; + ret = pd->device->ops.create_srq(srq, &attr, udata); + if (ret) + goto err_free; + if (ib_srq_has_cq(cmd->srq_type)) { srq->ext.cq = attr.ext.cq; atomic_inc(&attr.ext.cq->usecnt); @@ -3458,11 +3461,13 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, uobj_put_obj_read(attr.ext.cq); uobj_put_obj_read(pd); - return uobj_alloc_commit(&obj->uevent.uobject); + return uobj_alloc_commit(&obj->uevent.uobject, attrs); err_copy: - ib_destroy_srq(srq); + ib_destroy_srq_user(srq, &attrs->driver_udata); +err_free: + kfree(srq); err_put: uobj_put_obj_read(pd); @@ -3477,7 +3482,7 @@ err_put_xrcd: } err: - uobj_alloc_abort(&obj->uevent.uobject); + uobj_alloc_abort(&obj->uevent.uobject, attrs); return ret; } diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index e1379949e663..829b0c6944d8 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -207,13 +207,12 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle, for (i = 0; i != array_len; i++) { attr->uobjects[i] = uverbs_get_uobject_from_file( - spec->u2.objs_arr.obj_type, pbundle->bundle.ufile, - spec->u2.objs_arr.access, idr_vals[i]); + spec->u2.objs_arr.obj_type, spec->u2.objs_arr.access, + idr_vals[i], &pbundle->bundle); if (IS_ERR(attr->uobjects[i])) { ret = PTR_ERR(attr->uobjects[i]); break; } - pbundle->bundle.context = attr->uobjects[i]->context; } attr->len = i; @@ -223,7 +222,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle, static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, struct uverbs_objs_arr_attr *attr, - bool commit) + bool commit, struct uverbs_attr_bundle *attrs) { const struct uverbs_attr_spec *spec = &attr_uapi->spec; int current_ret; @@ -231,8 +230,9 @@ static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, size_t i; for (i = 0; i != attr->len; i++) { - current_ret = uverbs_finalize_object( - attr->uobjects[i], spec->u2.objs_arr.access, commit); + current_ret = uverbs_finalize_object(attr->uobjects[i], + spec->u2.objs_arr.access, + commit, attrs); if (!ret) ret = current_ret; } @@ -325,13 +325,10 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, * IDR implementation today rejects negative IDs */ o_attr->uobject = uverbs_get_uobject_from_file( - spec->u.obj.obj_type, - pbundle->bundle.ufile, - spec->u.obj.access, - uattr->data_s64); + spec->u.obj.obj_type, spec->u.obj.access, + uattr->data_s64, &pbundle->bundle); if (IS_ERR(o_attr->uobject)) return PTR_ERR(o_attr->uobject); - pbundle->bundle.context = o_attr->uobject->context; __set_bit(attr_bkey, pbundle->uobj_finalize); if (spec->u.obj.access == UVERBS_ACCESS_NEW) { @@ -456,12 +453,14 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle, uverbs_fill_udata(&pbundle->bundle, &pbundle->bundle.driver_udata, UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT); + else + pbundle->bundle.driver_udata = (struct ib_udata){}; if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) { struct uverbs_obj_attr *destroy_attr = 
&pbundle->bundle.attrs[destroy_bkey].obj_attr; - ret = uobj_destroy(destroy_attr->uobject); + ret = uobj_destroy(destroy_attr->uobject, &pbundle->bundle); if (ret) return ret; __clear_bit(destroy_bkey, pbundle->uobj_finalize); @@ -512,7 +511,8 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) current_ret = uverbs_finalize_object( attr->obj_attr.uobject, - attr->obj_attr.attr_elm->spec.u.obj.access, commit); + attr->obj_attr.attr_elm->spec.u.obj.access, commit, + &pbundle->bundle); if (!ret) ret = current_ret; } @@ -535,7 +535,8 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { current_ret = uverbs_free_idrs_array( - attr_uapi, &attr->objs_arr_attr, commit); + attr_uapi, &attr->objs_arr_attr, commit, + &pbundle->bundle); if (!ret) ret = current_ret; } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 8b43dd96d3b2..84a5e9a6d483 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -723,7 +723,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, * then the command request structure starts * with a '__aligned u64 response' member. */ - ret = get_user(response, (const u64 *)buf); + ret = get_user(response, (const u64 __user *)buf); if (ret) goto out_unlock; @@ -926,43 +926,32 @@ static const struct vm_operations_struct rdma_umap_ops = { .fault = rdma_umap_fault, }; -static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, - struct vm_area_struct *vma, - unsigned long size) +/* + * Map IO memory into a process. This is to be called by drivers as part of + * their mmap() functions if they wish to send something like PCI-E BAR memory + * to userspace. + */ +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot) { struct ib_uverbs_file *ufile = ucontext->ufile; struct rdma_umap_priv *priv; if (!(vma->vm_flags & VM_SHARED)) - return ERR_PTR(-EINVAL); + return -EINVAL; if (vma->vm_end - vma->vm_start != size) - return ERR_PTR(-EINVAL); + return -EINVAL; /* Driver is using this wrong, must be called by ib_uverbs_mmap */ if (WARN_ON(!vma->vm_file || vma->vm_file->private_data != ufile)) - return ERR_PTR(-EINVAL); + return -EINVAL; lockdep_assert_held(&ufile->device->disassociate_srcu); priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) - return ERR_PTR(-ENOMEM); - return priv; -} - -/* - * Map IO memory into a process. This is to be called by drivers as part of - * their mmap() functions if they wish to send something like PCI-E BAR memory - * to userspace. - */ -int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, - unsigned long pfn, unsigned long size, pgprot_t prot) -{ - struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); - - if (IS_ERR(priv)) - return PTR_ERR(priv); + return -ENOMEM; vma->vm_page_prot = prot; if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) { @@ -975,35 +964,6 @@ int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, } EXPORT_SYMBOL(rdma_user_mmap_io); -/* - * The page case is here for a slightly different reason, the driver expects - * to be able to free the page it is sharing to user space when it destroys - * its ucontext, which means we need to zap the user space references. 
- * - * We could handle this differently by providing an API to allocate a shared - * page and then only freeing the shared page when the last ufile is - * destroyed. - */ -int rdma_user_mmap_page(struct ib_ucontext *ucontext, - struct vm_area_struct *vma, struct page *page, - unsigned long size) -{ - struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); - - if (IS_ERR(priv)) - return PTR_ERR(priv); - - if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size, - vma->vm_page_prot)) { - kfree(priv); - return -EAGAIN; - } - - rdma_umap_priv_init(priv, vma); - return 0; -} -EXPORT_SYMBOL(rdma_user_mmap_page); - void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) { struct rdma_umap_priv *priv, *next_priv; @@ -1094,6 +1054,11 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) goto err; } + if (!rdma_dev_access_netns(ib_dev, current->nsproxy->net_ns)) { + ret = -EPERM; + goto err; + } + /* In case IB device supports disassociate ucontext, there is no hard * dependency between uverbs device and its low level device. */ diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index f224cb727224..35b2e2c640cc 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -40,14 +40,17 @@ #include "uverbs.h" static int uverbs_free_ah(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { - return rdma_destroy_ah((struct ib_ah *)uobject->object, - RDMA_DESTROY_AH_SLEEPABLE); + return rdma_destroy_ah_user((struct ib_ah *)uobject->object, + RDMA_DESTROY_AH_SLEEPABLE, + &attrs->driver_udata); } static int uverbs_free_flow(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_flow *flow = (struct ib_flow *)uobject->object; struct ib_uflow_object *uflow = @@ -66,13 +69,15 @@ static int uverbs_free_flow(struct ib_uobject *uobject, } static int uverbs_free_mw(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { return uverbs_dealloc_mw((struct ib_mw *)uobject->object); } static int uverbs_free_qp(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_qp *qp = uobject->object; struct ib_uqp_object *uqp = @@ -93,19 +98,20 @@ static int uverbs_free_qp(struct ib_uobject *uobject, ib_uverbs_detach_umcast(qp, uqp); } - ret = ib_destroy_qp(qp); + ret = ib_destroy_qp_user(qp, &attrs->driver_udata); if (ib_is_destroy_retryable(ret, why, uobject)) return ret; if (uqp->uxrcd) atomic_dec(&uqp->uxrcd->refcnt); - ib_uverbs_release_uevent(uobject->context->ufile, &uqp->uevent); + ib_uverbs_release_uevent(attrs->ufile, &uqp->uevent); return ret; } static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object; struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl; @@ -120,23 +126,25 @@ static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, } static int uverbs_free_wq(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_wq *wq = uobject->object; struct ib_uwq_object *uwq = container_of(uobject, struct ib_uwq_object, uevent.uobject); int 
ret; - ret = ib_destroy_wq(wq); + ret = ib_destroy_wq(wq, &attrs->driver_udata); if (ib_is_destroy_retryable(ret, why, uobject)) return ret; - ib_uverbs_release_uevent(uobject->context->ufile, &uwq->uevent); + ib_uverbs_release_uevent(attrs->ufile, &uwq->uevent); return ret; } static int uverbs_free_srq(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_srq *srq = uobject->object; struct ib_uevent_object *uevent = @@ -144,7 +152,7 @@ static int uverbs_free_srq(struct ib_uobject *uobject, enum ib_srq_type srq_type = srq->srq_type; int ret; - ret = ib_destroy_srq(srq); + ret = ib_destroy_srq_user(srq, &attrs->driver_udata); if (ib_is_destroy_retryable(ret, why, uobject)) return ret; @@ -155,12 +163,13 @@ static int uverbs_free_srq(struct ib_uobject *uobject, atomic_dec(&us->uxrcd->refcnt); } - ib_uverbs_release_uevent(uobject->context->ufile, uevent); + ib_uverbs_release_uevent(attrs->ufile, uevent); return ret; } static int uverbs_free_xrcd(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_xrcd *xrcd = uobject->object; struct ib_uxrcd_object *uxrcd = @@ -171,15 +180,16 @@ static int uverbs_free_xrcd(struct ib_uobject *uobject, if (ret) return ret; - mutex_lock(&uobject->context->ufile->device->xrcd_tree_mutex); - ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why); - mutex_unlock(&uobject->context->ufile->device->xrcd_tree_mutex); + mutex_lock(&attrs->ufile->device->xrcd_tree_mutex); + ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why, attrs); + mutex_unlock(&attrs->ufile->device->xrcd_tree_mutex); return ret; } static int uverbs_free_pd(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_pd *pd = uobject->object; int ret; @@ -188,7 +198,7 @@ static int uverbs_free_pd(struct ib_uobject *uobject, if (ret) return ret; - ib_dealloc_pd(pd); + ib_dealloc_pd_user(pd, &attrs->driver_udata); return 0; } diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c index 309c5e80988d..9f013304e677 100644 --- a/drivers/infiniband/core/uverbs_std_types_counters.c +++ b/drivers/infiniband/core/uverbs_std_types_counters.c @@ -31,11 +31,13 @@ * SOFTWARE. 
*/ +#include "rdma_core.h" #include "uverbs.h" #include <rdma/uverbs_std_types.h> static int uverbs_free_counters(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_counters *counters = uobject->object; int ret; @@ -52,7 +54,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)( { struct ib_uobject *uobj = uverbs_attr_get_uobject( attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE); - struct ib_device *ib_dev = uobj->context->device; + struct ib_device *ib_dev = attrs->context->device; struct ib_counters *counters; int ret; diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index a59ea89e3f2b..db5c46a1bb2d 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -35,7 +35,8 @@ #include "uverbs.h" static int uverbs_free_cq(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_cq *cq = uobject->object; struct ib_uverbs_event_queue *ev_queue = cq->cq_context; @@ -43,12 +44,12 @@ static int uverbs_free_cq(struct ib_uobject *uobject, container_of(uobject, struct ib_ucq_object, uobject); int ret; - ret = ib_destroy_cq(cq); + ret = ib_destroy_cq_user(cq, &attrs->driver_udata); if (ib_is_destroy_retryable(ret, why, uobject)) return ret; ib_uverbs_release_ucq( - uobject->context->ufile, + attrs->ufile, ev_queue ? container_of(ev_queue, struct ib_uverbs_completion_event_file, ev_queue) : @@ -63,7 +64,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( struct ib_ucq_object *obj = container_of( uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE), typeof(*obj), uobject); - struct ib_device *ib_dev = obj->uobject.context->device; + struct ib_device *ib_dev = attrs->context->device; int ret; u64 user_handle; struct ib_cq_init_attr attr = {}; @@ -110,8 +111,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( INIT_LIST_HEAD(&obj->comp_list); INIT_LIST_HEAD(&obj->async_list); - cq = ib_dev->ops.create_cq(ib_dev, &attr, obj->uobject.context, - &attrs->driver_udata); + cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata); if (IS_ERR(cq)) { ret = PTR_ERR(cq); goto err_event_file; diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c index 2ef70637bee1..d5a1de33c2c9 100644 --- a/drivers/infiniband/core/uverbs_std_types_dm.c +++ b/drivers/infiniband/core/uverbs_std_types_dm.c @@ -30,11 +30,13 @@ * SOFTWARE. 
*/ +#include "rdma_core.h" #include "uverbs.h" #include <rdma/uverbs_std_types.h> static int uverbs_free_dm(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_dm *dm = uobject->object; int ret; @@ -43,7 +45,7 @@ static int uverbs_free_dm(struct ib_uobject *uobject, if (ret) return ret; - return dm->device->ops.dealloc_dm(dm); + return dm->device->ops.dealloc_dm(dm, attrs); } static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)( @@ -53,7 +55,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)( struct ib_uobject *uobj = uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE) ->obj_attr.uobject; - struct ib_device *ib_dev = uobj->context->device; + struct ib_device *ib_dev = attrs->context->device; struct ib_dm *dm; int ret; @@ -70,7 +72,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)( if (ret) return ret; - dm = ib_dev->ops.alloc_dm(ib_dev, uobj->context, &attr, attrs); + dm = ib_dev->ops.alloc_dm(ib_dev, attrs->context, &attr, attrs); if (IS_ERR(dm)) return PTR_ERR(dm); diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c index 4962b87fa600..459cf165b231 100644 --- a/drivers/infiniband/core/uverbs_std_types_flow_action.c +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -30,11 +30,13 @@ * SOFTWARE. */ +#include "rdma_core.h" #include "uverbs.h" #include <rdma/uverbs_std_types.h> static int uverbs_free_flow_action(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct ib_flow_action *action = uobject->object; int ret; @@ -308,7 +310,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)( { struct ib_uobject *uobj = uverbs_attr_get_uobject( attrs, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE); - struct ib_device *ib_dev = uobj->context->device; + struct ib_device *ib_dev = attrs->context->device; int ret; struct ib_flow_action *action; struct ib_flow_action_esp_attr esp_attr = {}; diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 4d4be0c2b752..610d3b9f7654 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -30,13 +30,16 @@ * SOFTWARE. 
*/ +#include "rdma_core.h" #include "uverbs.h" #include <rdma/uverbs_std_types.h> static int uverbs_free_mr(struct ib_uobject *uobject, - enum rdma_remove_reason why) + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { - return ib_dereg_mr((struct ib_mr *)uobject->object); + return ib_dereg_mr_user((struct ib_mr *)uobject->object, + &attrs->driver_udata); } static int UVERBS_HANDLER(UVERBS_METHOD_ADVISE_MR)( @@ -145,7 +148,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)( return 0; err_dereg: - ib_dereg_mr(mr); + ib_dereg_mr_user(mr, &attrs->driver_udata); return ret; } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 5a5e83f5f0fc..e666a1f7608d 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -218,6 +218,8 @@ rdma_node_get_transport(enum rdma_node_type node_type) return RDMA_TRANSPORT_USNIC_UDP; if (node_type == RDMA_NODE_RNIC) return RDMA_TRANSPORT_IWARP; + if (node_type == RDMA_NODE_UNSPECIFIED) + return RDMA_TRANSPORT_UNSPECIFIED; return RDMA_TRANSPORT_IB; } @@ -269,7 +271,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, pd->res.type = RDMA_RESTRACK_PD; rdma_restrack_set_task(&pd->res, caller); - ret = device->ops.alloc_pd(pd, NULL, NULL); + ret = device->ops.alloc_pd(pd, NULL); if (ret) { kfree(pd); return ERR_PTR(ret); @@ -316,17 +318,18 @@ EXPORT_SYMBOL(__ib_alloc_pd); /** * ib_dealloc_pd - Deallocates a protection domain. * @pd: The protection domain to deallocate. + * @udata: Valid user data or NULL for kernel object * * It is an error to call this function while any resources in the pd still * exist. The caller is responsible to synchronously destroy them and * guarantee no new allocations will happen. */ -void ib_dealloc_pd(struct ib_pd *pd) +void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata) { int ret; if (pd->__internal_mr) { - ret = pd->device->ops.dereg_mr(pd->__internal_mr); + ret = pd->device->ops.dereg_mr(pd->__internal_mr, NULL); WARN_ON(ret); pd->__internal_mr = NULL; } @@ -336,10 +339,10 @@ void ib_dealloc_pd(struct ib_pd *pd) WARN_ON(atomic_read(&pd->usecnt)); rdma_restrack_del(&pd->res); - pd->device->ops.dealloc_pd(pd); + pd->device->ops.dealloc_pd(pd, udata); kfree(pd); } -EXPORT_SYMBOL(ib_dealloc_pd); +EXPORT_SYMBOL(ib_dealloc_pd_user); /* Address handles */ @@ -495,25 +498,33 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, u32 flags, struct ib_udata *udata) { + struct ib_device *device = pd->device; struct ib_ah *ah; + int ret; might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE); - if (!pd->device->ops.create_ah) + if (!device->ops.create_ah) return ERR_PTR(-EOPNOTSUPP); - ah = pd->device->ops.create_ah(pd, ah_attr, flags, udata); + ah = rdma_zalloc_drv_obj_gfp( + device, ib_ah, + (flags & RDMA_CREATE_AH_SLEEPABLE) ? 
GFP_KERNEL : GFP_ATOMIC); + if (!ah) + return ERR_PTR(-ENOMEM); - if (!IS_ERR(ah)) { - ah->device = pd->device; - ah->pd = pd; - ah->uobject = NULL; - ah->type = ah_attr->type; - ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); + ah->device = device; + ah->pd = pd; + ah->type = ah_attr->type; + ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); - atomic_inc(&pd->usecnt); + ret = device->ops.create_ah(ah, ah_attr, flags, udata); + if (ret) { + kfree(ah); + return ERR_PTR(ret); } + atomic_inc(&pd->usecnt); return ah; } @@ -930,25 +941,24 @@ int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) } EXPORT_SYMBOL(rdma_query_ah); -int rdma_destroy_ah(struct ib_ah *ah, u32 flags) +int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata) { const struct ib_gid_attr *sgid_attr = ah->sgid_attr; struct ib_pd *pd; - int ret; might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE); pd = ah->pd; - ret = ah->device->ops.destroy_ah(ah, flags); - if (!ret) { - atomic_dec(&pd->usecnt); - if (sgid_attr) - rdma_put_gid_attr(sgid_attr); - } - return ret; + ah->device->ops.destroy_ah(ah, flags); + atomic_dec(&pd->usecnt); + if (sgid_attr) + rdma_put_gid_attr(sgid_attr); + + kfree(ah); + return 0; } -EXPORT_SYMBOL(rdma_destroy_ah); +EXPORT_SYMBOL(rdma_destroy_ah_user); /* Shared receive queues */ @@ -956,29 +966,40 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr) { struct ib_srq *srq; + int ret; if (!pd->device->ops.create_srq) return ERR_PTR(-EOPNOTSUPP); - srq = pd->device->ops.create_srq(pd, srq_init_attr, NULL); - - if (!IS_ERR(srq)) { - srq->device = pd->device; - srq->pd = pd; - srq->uobject = NULL; - srq->event_handler = srq_init_attr->event_handler; - srq->srq_context = srq_init_attr->srq_context; - srq->srq_type = srq_init_attr->srq_type; - if (ib_srq_has_cq(srq->srq_type)) { - srq->ext.cq = srq_init_attr->ext.cq; - atomic_inc(&srq->ext.cq->usecnt); - } - if (srq->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; - atomic_inc(&srq->ext.xrc.xrcd->usecnt); - } - atomic_inc(&pd->usecnt); - atomic_set(&srq->usecnt, 0); + srq = rdma_zalloc_drv_obj(pd->device, ib_srq); + if (!srq) + return ERR_PTR(-ENOMEM); + + srq->device = pd->device; + srq->pd = pd; + srq->event_handler = srq_init_attr->event_handler; + srq->srq_context = srq_init_attr->srq_context; + srq->srq_type = srq_init_attr->srq_type; + + if (ib_srq_has_cq(srq->srq_type)) { + srq->ext.cq = srq_init_attr->ext.cq; + atomic_inc(&srq->ext.cq->usecnt); + } + if (srq->srq_type == IB_SRQT_XRC) { + srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; + atomic_inc(&srq->ext.xrc.xrcd->usecnt); + } + atomic_inc(&pd->usecnt); + + ret = pd->device->ops.create_srq(srq, srq_init_attr, NULL); + if (ret) { + atomic_dec(&srq->pd->usecnt); + if (srq->srq_type == IB_SRQT_XRC) + atomic_dec(&srq->ext.xrc.xrcd->usecnt); + if (ib_srq_has_cq(srq->srq_type)) + atomic_dec(&srq->ext.cq->usecnt); + kfree(srq); + return ERR_PTR(ret); } return srq; @@ -1003,36 +1024,23 @@ int ib_query_srq(struct ib_srq *srq, } EXPORT_SYMBOL(ib_query_srq); -int ib_destroy_srq(struct ib_srq *srq) +int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata) { - struct ib_pd *pd; - enum ib_srq_type srq_type; - struct ib_xrcd *uninitialized_var(xrcd); - struct ib_cq *uninitialized_var(cq); - int ret; - if (atomic_read(&srq->usecnt)) return -EBUSY; - pd = srq->pd; - srq_type = srq->srq_type; - if (ib_srq_has_cq(srq_type)) - cq = srq->ext.cq; - if (srq_type == IB_SRQT_XRC) - xrcd = 
srq->ext.xrc.xrcd; + srq->device->ops.destroy_srq(srq, udata); - ret = srq->device->ops.destroy_srq(srq); - if (!ret) { - atomic_dec(&pd->usecnt); - if (srq_type == IB_SRQT_XRC) - atomic_dec(&xrcd->usecnt); - if (ib_srq_has_cq(srq_type)) - atomic_dec(&cq->usecnt); - } + atomic_dec(&srq->pd->usecnt); + if (srq->srq_type == IB_SRQT_XRC) + atomic_dec(&srq->ext.xrc.xrcd->usecnt); + if (ib_srq_has_cq(srq->srq_type)) + atomic_dec(&srq->ext.cq->usecnt); + kfree(srq); - return ret; + return 0; } -EXPORT_SYMBOL(ib_destroy_srq); +EXPORT_SYMBOL(ib_destroy_srq_user); /* Queue pairs */ @@ -1111,8 +1119,9 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, } EXPORT_SYMBOL(ib_open_qp); -static struct ib_qp *create_xrc_qp(struct ib_qp *qp, - struct ib_qp_init_attr *qp_init_attr) +static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata) { struct ib_qp *real_qp = qp; @@ -1134,8 +1143,9 @@ static struct ib_qp *create_xrc_qp(struct ib_qp *qp, return qp; } -struct ib_qp *ib_create_qp(struct ib_pd *pd, - struct ib_qp_init_attr *qp_init_attr) +struct ib_qp *ib_create_qp_user(struct ib_pd *pd, + struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata) { struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device; struct ib_qp *qp; @@ -1164,7 +1174,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, if (ret) goto err; - qp->real_qp = qp; qp->qp_type = qp_init_attr->qp_type; qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl; @@ -1176,7 +1185,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, qp->port = 0; if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { - struct ib_qp *xrc_qp = create_xrc_qp(qp, qp_init_attr); + struct ib_qp *xrc_qp = + create_xrc_qp_user(qp, qp_init_attr, udata); if (IS_ERR(xrc_qp)) { ret = PTR_ERR(xrc_qp); @@ -1230,7 +1240,7 @@ err: return ERR_PTR(ret); } -EXPORT_SYMBOL(ib_create_qp); +EXPORT_SYMBOL(ib_create_qp_user); static const struct { int valid; @@ -1837,7 +1847,7 @@ static int __ib_destroy_shared_qp(struct ib_qp *qp) return 0; } -int ib_destroy_qp(struct ib_qp *qp) +int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata) { const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr; const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr; @@ -1869,7 +1879,7 @@ int ib_destroy_qp(struct ib_qp *qp) rdma_rw_cleanup_mrs(qp); rdma_restrack_del(&qp->res); - ret = qp->device->ops.destroy_qp(qp); + ret = qp->device->ops.destroy_qp(qp, udata); if (!ret) { if (alt_path_sgid_attr) rdma_put_gid_attr(alt_path_sgid_attr); @@ -1894,7 +1904,7 @@ int ib_destroy_qp(struct ib_qp *qp) return ret; } -EXPORT_SYMBOL(ib_destroy_qp); +EXPORT_SYMBOL(ib_destroy_qp_user); /* Completion queues */ @@ -1907,7 +1917,7 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, { struct ib_cq *cq; - cq = device->ops.create_cq(device, cq_attr, NULL, NULL); + cq = device->ops.create_cq(device, cq_attr, NULL); if (!IS_ERR(cq)) { cq->device = device; @@ -1933,15 +1943,15 @@ int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period) } EXPORT_SYMBOL(rdma_set_cq_moderation); -int ib_destroy_cq(struct ib_cq *cq) +int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) { if (atomic_read(&cq->usecnt)) return -EBUSY; rdma_restrack_del(&cq->res); - return cq->device->ops.destroy_cq(cq); + return cq->device->ops.destroy_cq(cq, udata); } -EXPORT_SYMBOL(ib_destroy_cq); +EXPORT_SYMBOL(ib_destroy_cq_user); int ib_resize_cq(struct ib_cq *cq, int cqe) { @@ -1952,14 +1962,14 @@ EXPORT_SYMBOL(ib_resize_cq); /* Memory regions */ 
-int ib_dereg_mr(struct ib_mr *mr) +int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) { struct ib_pd *pd = mr->pd; struct ib_dm *dm = mr->dm; int ret; rdma_restrack_del(&mr->res); - ret = mr->device->ops.dereg_mr(mr); + ret = mr->device->ops.dereg_mr(mr, udata); if (!ret) { atomic_dec(&pd->usecnt); if (dm) @@ -1968,13 +1978,14 @@ int ib_dereg_mr(struct ib_mr *mr) return ret; } -EXPORT_SYMBOL(ib_dereg_mr); +EXPORT_SYMBOL(ib_dereg_mr_user); /** * ib_alloc_mr() - Allocates a memory region * @pd: protection domain associated with the region * @mr_type: memory region type * @max_num_sg: maximum sg entries available for registration. + * @udata: user data or null for kernel objects * * Notes: * Memory registeration page/sg lists must not exceed max_num_sg. @@ -1982,16 +1993,15 @@ EXPORT_SYMBOL(ib_dereg_mr); * max_num_sg * used_page_size. * */ -struct ib_mr *ib_alloc_mr(struct ib_pd *pd, - enum ib_mr_type mr_type, - u32 max_num_sg) +struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg, struct ib_udata *udata) { struct ib_mr *mr; if (!pd->device->ops.alloc_mr) return ERR_PTR(-EOPNOTSUPP); - mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg); + mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg, udata); if (!IS_ERR(mr)) { mr->device = pd->device; mr->pd = pd; @@ -2005,7 +2015,7 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, return mr; } -EXPORT_SYMBOL(ib_alloc_mr); +EXPORT_SYMBOL(ib_alloc_mr_user); /* "Fast" memory regions */ @@ -2138,7 +2148,7 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller) if (!device->ops.alloc_xrcd) return ERR_PTR(-EOPNOTSUPP); - xrcd = device->ops.alloc_xrcd(device, NULL, NULL); + xrcd = device->ops.alloc_xrcd(device, NULL); if (!IS_ERR(xrcd)) { xrcd->device = device; xrcd->inode = NULL; @@ -2151,7 +2161,7 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller) } EXPORT_SYMBOL(__ib_alloc_xrcd); -int ib_dealloc_xrcd(struct ib_xrcd *xrcd) +int ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata) { struct ib_qp *qp; int ret; @@ -2166,7 +2176,7 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd) return ret; } - return xrcd->device->ops.dealloc_xrcd(xrcd); + return xrcd->device->ops.dealloc_xrcd(xrcd, udata); } EXPORT_SYMBOL(ib_dealloc_xrcd); @@ -2210,10 +2220,11 @@ struct ib_wq *ib_create_wq(struct ib_pd *pd, EXPORT_SYMBOL(ib_create_wq); /** - * ib_destroy_wq - Destroys the specified WQ. + * ib_destroy_wq - Destroys the specified user WQ. * @wq: The WQ to destroy. 
+ * @udata: Valid user data
  */
-int ib_destroy_wq(struct ib_wq *wq)
+int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
 {
 	int err;
 	struct ib_cq *cq = wq->cq;
@@ -2222,7 +2233,7 @@
 	if (atomic_read(&wq->usecnt))
 		return -EBUSY;
 
-	err = wq->device->ops.destroy_wq(wq);
+	err = wq->device->ops.destroy_wq(wq, udata);
 	if (!err) {
 		atomic_dec(&pd->usecnt);
 		atomic_dec(&cq->usecnt);
@@ -2701,3 +2712,37 @@ int rdma_init_netdev(struct ib_device *device, u8 port_num,
 					     netdev, params.param);
 }
 EXPORT_SYMBOL(rdma_init_netdev);
+
+void __rdma_block_iter_start(struct ib_block_iter *biter,
+			     struct scatterlist *sglist, unsigned int nents,
+			     unsigned long pgsz)
+{
+	memset(biter, 0, sizeof(struct ib_block_iter));
+	biter->__sg = sglist;
+	biter->__sg_nents = nents;
+
+	/* Driver provides best block size to use */
+	biter->__pg_bit = __fls(pgsz);
+}
+EXPORT_SYMBOL(__rdma_block_iter_start);
+
+bool __rdma_block_iter_next(struct ib_block_iter *biter)
+{
+	unsigned int block_offset;
+
+	if (!biter->__sg_nents || !biter->__sg)
+		return false;
+
+	biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
+	block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
+	biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset;
+
+	if (biter->__sg_advance >= sg_dma_len(biter->__sg)) {
+		biter->__sg_advance = 0;
+		biter->__sg = sg_next(biter->__sg);
+		biter->__sg_nents--;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(__rdma_block_iter_next);
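
Note on the allocation convention used by this series (illustrative, not part of the patch): the core now allocates the ib_pd/ib_ah/ib_srq container with rdma_zalloc_drv_obj(), and the driver's .alloc_pd()/.create_ah()/.create_srq() callback only initializes it and returns an int, while the matching destroy callback receives the udata pointer. The exact ops signatures live in include/rdma/ib_verbs.h, which is outside this diff. A minimal sketch of a driver-side PD pair under the new convention, assuming hypothetical driver types hypo_pd and hypo_alloc_pd_resp:

#include <rdma/ib_verbs.h>

/* Hypothetical driver-private PD; the embedded ib_pd is what the core allocates. */
struct hypo_pd {
	struct ib_pd ibpd;
	u32 pdn;
};

/* Hypothetical response layout copied to userspace; not defined by this patch. */
struct hypo_alloc_pd_resp {
	__u32 pdn;
};

static int hypo_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
{
	struct hypo_pd *pd = container_of(ibpd, struct hypo_pd, ibpd);

	pd->pdn = 1;	/* a real driver would obtain this from device firmware */

	if (udata) {
		struct hypo_alloc_pd_resp resp = { .pdn = pd->pdn };

		/* On error the core, not the driver, frees the ib_pd. */
		if (ib_copy_to_udata(udata, &resp, sizeof(resp)))
			return -EFAULT;
	}
	return 0;
}

static void hypo_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
{
	/* Return pdn to the device; the core kfree()s the ib_pd afterwards. */
}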
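
rdma_user_mmap_io() is now the single helper left for exposing device memory to userspace (rdma_user_mmap_page() is removed above). A minimal sketch of how a driver mmap() entry point might call it, assuming a hypothetical hypo_ucontext wrapper carrying a doorbell BAR address; the names and the single-page offset scheme are illustrative only:

#include <rdma/ib_verbs.h>

/* Hypothetical ucontext wrapper; db_bar_addr is an assumed driver field. */
struct hypo_ucontext {
	struct ib_ucontext ibucontext;
	phys_addr_t db_bar_addr;
};

static int hypo_mmap(struct ib_ucontext *ibucontext, struct vm_area_struct *vma)
{
	struct hypo_ucontext *uctx =
		container_of(ibucontext, struct hypo_ucontext, ibucontext);

	/* Only one page at offset 0: the device's doorbell page. */
	if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
		return -EINVAL;

	return rdma_user_mmap_io(ibucontext, vma,
				 uctx->db_bar_addr >> PAGE_SHIFT, PAGE_SIZE,
				 pgprot_noncached(vma->vm_page_prot));
}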
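
The new __rdma_block_iter_start()/__rdma_block_iter_next() pair lets a driver walk a DMA-mapped scatterlist in fixed-size, aligned blocks; the declarations are expected in include/rdma/ib_verbs.h, outside this directory. A sketch of a consumer that feeds each aligned block address to a device page-table writer; hypo_set_page() is hypothetical, and the direct masking of the in-block offset is only for illustration (a header accessor may exist for this):

#include <rdma/ib_verbs.h>
#include <linux/scatterlist.h>

/* Hypothetical sink for one device page-table entry. */
static void hypo_set_page(u64 dma_addr)
{
}

/* Walk 'nents' DMA-mapped entries of 'sgl' in blocks of 'pgsz' bytes. */
static void hypo_write_pbl(struct scatterlist *sgl, unsigned int nents,
			   unsigned long pgsz)
{
	struct ib_block_iter biter;

	__rdma_block_iter_start(&biter, sgl, nents, pgsz);
	while (__rdma_block_iter_next(&biter)) {
		/* Mask the in-block offset to get the aligned block address. */
		hypo_set_page(biter.__dma_addr &
			      ~(BIT_ULL(biter.__pg_bit) - 1));
	}
}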