Diffstat (limited to 'drivers/infiniband/core')
24 files changed, 2150 insertions, 533 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index d43a8994ac5c..f818538a7f4e 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ $(user_access-y) -ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ +ib_core-y := packer.o ud_header.o verbs.o cq.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o @@ -24,6 +24,8 @@ iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o rdma_cm-y := cma.o +rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o + rdma_ucm-y := ucma.o ib_addr-y := addr.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 34b1adad07aa..337353d86cfa 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -121,7 +121,8 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, } EXPORT_SYMBOL(rdma_copy_addr); -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr, u16 *vlan_id) { struct net_device *dev; @@ -139,7 +140,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, switch (addr->sa_family) { case AF_INET: dev = ip_dev_find(dev_addr->net, - ((struct sockaddr_in *) addr)->sin_addr.s_addr); + ((const struct sockaddr_in *)addr)->sin_addr.s_addr); if (!dev) return ret; @@ -154,7 +155,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, rcu_read_lock(); for_each_netdev_rcu(dev_addr->net, dev) { if (ipv6_chk_addr(dev_addr->net, - &((struct sockaddr_in6 *) addr)->sin6_addr, + &((const struct sockaddr_in6 *)addr)->sin6_addr, dev, 1)) { ret = rdma_copy_addr(dev_addr, dev, NULL); if (vlan_id) @@ -198,7 +199,8 @@ static void queue_req(struct addr_req *req) mutex_unlock(&lock); } -static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, void *daddr) +static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, + const void *daddr) { struct neighbour *n; int ret; @@ -222,8 +224,9 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, v } static int addr4_resolve(struct sockaddr_in *src_in, - struct sockaddr_in *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr, + struct rtable **prt) { __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; @@ -243,33 +246,29 @@ static int addr4_resolve(struct sockaddr_in *src_in, src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; - if (rt->dst.dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - goto put; - } + /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't + * routable) and we could set the network type accordingly. 
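The hunk above starts filling in addr->network from the routing decision: a gatewayed IPv4 route implies RoCE v2, because RoCE v1 frames are not routable. As a rough illustration (not part of the patch), a consumer of rdma_dev_addr could map that hint back to a GID type the same way cma_resolve_iboe_route() does later in this series; RDMA_NETWORK_* and IB_GID_TYPE_* are the constants the patch itself uses, example_gid_type_from_hint() is made up for this sketch.

/* Sketch only: turn the network hint filled in by addr4_resolve()/
 * addr6_resolve() into a GID type, mirroring ib_network_to_gid_type(). */
static enum ib_gid_type example_gid_type_from_hint(const struct rdma_dev_addr *addr)
{
	if (addr->network == RDMA_NETWORK_IPV4 ||
	    addr->network == RDMA_NETWORK_IPV6)
		return IB_GID_TYPE_ROCE_UDP_ENCAP;	/* RoCE v2 */

	return IB_GID_TYPE_IB;				/* IB / RoCE v1 */
}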
+ */ + if (rt->rt_uses_gateway) + addr->network = RDMA_NETWORK_IPV4; - /* If the device does ARP internally, return 'done' */ - if (rt->dst.dev->flags & IFF_NOARP) { - ret = rdma_copy_addr(addr, rt->dst.dev, NULL); - goto put; - } + addr->hoplimit = ip4_dst_hoplimit(&rt->dst); - ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr); -put: - ip_rt_put(rt); + *prt = rt; + return 0; out: return ret; } #if IS_ENABLED(CONFIG_IPV6) static int addr6_resolve(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct dst_entry **pdst) { struct flowi6 fl6; struct dst_entry *dst; + struct rt6_info *rt; int ret; memset(&fl6, 0, sizeof fl6); @@ -281,6 +280,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, if ((ret = dst->error)) goto put; + rt = (struct rt6_info *)dst; if (ipv6_addr_any(&fl6.saddr)) { ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev, &fl6.daddr, 0, &fl6.saddr); @@ -291,43 +291,111 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, src_in->sin6_addr = fl6.saddr; } - if (dst->dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - goto put; - } + /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't + * routable) and we could set the network type accordingly. + */ + if (rt->rt6i_flags & RTF_GATEWAY) + addr->network = RDMA_NETWORK_IPV6; - /* If the device does ARP internally, return 'done' */ - if (dst->dev->flags & IFF_NOARP) { - ret = rdma_copy_addr(addr, dst->dev, NULL); - goto put; - } + addr->hoplimit = ip6_dst_hoplimit(dst); - ret = dst_fetch_ha(dst, addr, &fl6.daddr); + *pdst = dst; + return 0; put: dst_release(dst); return ret; } #else static int addr6_resolve(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct dst_entry **pdst) { return -EADDRNOTAVAIL; } #endif +static int addr_resolve_neigh(struct dst_entry *dst, + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr) +{ + if (dst->dev->flags & IFF_LOOPBACK) { + int ret; + + ret = rdma_translate_ip(dst_in, addr, NULL); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, + MAX_ADDR_LEN); + + return ret; + } + + /* If the device doesn't do ARP internally */ + if (!(dst->dev->flags & IFF_NOARP)) { + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + + return dst_fetch_ha(dst, addr, + dst_in->sa_family == AF_INET ? 
+ (const void *)&dst_in4->sin_addr.s_addr : + (const void *)&dst_in6->sin6_addr); + } + + return rdma_copy_addr(addr, dst->dev, NULL); +} + static int addr_resolve(struct sockaddr *src_in, - struct sockaddr *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr, + bool resolve_neigh) { + struct net_device *ndev; + struct dst_entry *dst; + int ret; + if (src_in->sa_family == AF_INET) { - return addr4_resolve((struct sockaddr_in *) src_in, - (struct sockaddr_in *) dst_in, addr); - } else - return addr6_resolve((struct sockaddr_in6 *) src_in, - (struct sockaddr_in6 *) dst_in, addr); + struct rtable *rt = NULL; + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + + ret = addr4_resolve((struct sockaddr_in *)src_in, + dst_in4, addr, &rt); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(&rt->dst, dst_in, addr); + + ndev = rt->dst.dev; + dev_hold(ndev); + + ip_rt_put(rt); + } else { + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + + ret = addr6_resolve((struct sockaddr_in6 *)src_in, + dst_in6, addr, + &dst); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(dst, dst_in, addr); + + ndev = dst->dev; + dev_hold(ndev); + + dst_release(dst); + } + + addr->bound_dev_if = ndev->ifindex; + addr->net = dev_net(ndev); + dev_put(ndev); + + return ret; } static void process_req(struct work_struct *work) @@ -343,7 +411,8 @@ static void process_req(struct work_struct *work) if (req->status == -ENODATA) { src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve(src_in, dst_in, req->addr); + req->status = addr_resolve(src_in, dst_in, req->addr, + true); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; else if (req->status == -ENODATA) @@ -403,7 +472,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->client = client; atomic_inc(&client->refcount); - req->status = addr_resolve(src_in, dst_in, addr); + req->status = addr_resolve(src_in, dst_in, addr, true); switch (req->status) { case 0: req->timeout = jiffies; @@ -425,6 +494,26 @@ err: } EXPORT_SYMBOL(rdma_resolve_ip); +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr) +{ + struct sockaddr_storage ssrc_addr = {}; + struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; + + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) + return -EINVAL; + + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + return addr_resolve(src_in, dst_addr, addr, false); +} +EXPORT_SYMBOL(rdma_resolve_ip_route); + void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; @@ -456,8 +545,10 @@ static void resolve_cb(int status, struct sockaddr *src_addr, complete(&((struct resolve_cb_context *)context)->comp); } -int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, u16 *vlan_id, int if_index) +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *dmac, u16 *vlan_id, int *if_index, + int *hoplimit) { int ret = 0; struct rdma_dev_addr dev_addr; @@ -475,7 +566,8 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); - dev_addr.bound_dev_if = if_index; + if (if_index) + 
dev_addr.bound_dev_if = *if_index; dev_addr.net = &init_net; ctx.addr = &dev_addr; @@ -491,12 +583,16 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); if (!dev) return -ENODEV; + if (if_index) + *if_index = dev_addr.bound_dev_if; if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); + if (hoplimit) + *hoplimit = dev_addr.hoplimit; dev_put(dev); return ret; } -EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh); +EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh); int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) { diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 89bebeada38b..53343ffbff7a 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -64,6 +64,7 @@ enum gid_attr_find_mask { GID_ATTR_FIND_MASK_GID = 1UL << 0, GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, + GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, }; enum gid_table_entry_props { @@ -81,10 +82,6 @@ enum gid_table_write_action { }; struct ib_gid_table_entry { - /* This lock protects an entry from being - * read and written simultaneously. - */ - rwlock_t lock; unsigned long props; union ib_gid gid; struct ib_gid_attr attr; @@ -109,28 +106,86 @@ struct ib_gid_table { * are locked by this lock. **/ struct mutex lock; + /* This lock protects the table entries from being + * read and written simultaneously. + */ + rwlock_t rwlock; struct ib_gid_table_entry *data_vec; }; +static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) +{ + if (rdma_cap_roce_gid_table(ib_dev, port)) { + struct ib_event event; + + event.device = ib_dev; + event.element.port_num = port; + event.event = IB_EVENT_GID_CHANGE; + + ib_dispatch_event(&event); + } +} + +static const char * const gid_type_str[] = { + [IB_GID_TYPE_IB] = "IB/RoCE v1", + [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", +}; + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) +{ + if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) + return gid_type_str[gid_type]; + + return "Invalid GID type"; +} +EXPORT_SYMBOL(ib_cache_gid_type_str); + +int ib_cache_gid_parse_type_str(const char *buf) +{ + unsigned int i; + size_t len; + int err = -EINVAL; + + len = strlen(buf); + if (len == 0) + return -EINVAL; + + if (buf[len - 1] == '\n') + len--; + + for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) + if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && + len == strlen(gid_type_str[i])) { + err = i; + break; + } + + return err; +} +EXPORT_SYMBOL(ib_cache_gid_parse_type_str); + +/* This function expects that rwlock will be write locked in all + * scenarios and that lock will be locked in sleep-able (RoCE) + * scenarios. + */ static int write_gid(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table, int ix, const union ib_gid *gid, const struct ib_gid_attr *attr, enum gid_table_write_action action, bool default_gid) + __releases(&table->rwlock) __acquires(&table->rwlock) { int ret = 0; struct net_device *old_net_dev; - unsigned long flags; /* in rdma_cap_roce_gid_table, this funciton should be protected by a * sleep-able lock. 
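Besides the const-correctness cleanups, addr.c gains two entry points in this series: rdma_resolve_ip_route(), which performs only the routing lookup without neighbour (ARP/ND) resolution, and rdma_addr_find_l2_eth_by_grh(), which replaces rdma_addr_find_dmac_by_grh() and can also return the bound interface index and hop limit. A minimal sketch of the route-only lookup, assuming an IPv4 destination (example_route_lookup is illustrative, not from the patch):

/* Sketch only: resolve L3 routing info for a destination without doing
 * neighbour discovery; assumes <rdma/ib_addr.h>. */
static int example_route_lookup(__be32 dst_ip)
{
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_addr = { .s_addr = dst_ip },
	};
	struct rdma_dev_addr dev_addr = {};
	int ret;

	/* NULL source: addr_resolve() derives the source from the route */
	ret = rdma_resolve_ip_route(NULL, (struct sockaddr *)&dst, &dev_addr);
	if (ret)
		return ret;

	pr_debug("route via ifindex %d, hoplimit %d\n",
		 dev_addr.bound_dev_if, dev_addr.hoplimit);
	return 0;
}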
*/ - write_lock_irqsave(&table->data_vec[ix].lock, flags); if (rdma_cap_roce_gid_table(ib_dev, port)) { table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; - write_unlock_irqrestore(&table->data_vec[ix].lock, flags); + write_unlock_irq(&table->rwlock); /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by * RoCE providers and thus only updates the cache. */ @@ -140,7 +195,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port, else if (action == GID_TABLE_WRITE_ACTION_DEL) ret = ib_dev->del_gid(ib_dev, port, ix, &table->data_vec[ix].context); - write_lock_irqsave(&table->data_vec[ix].lock, flags); + write_lock_irq(&table->rwlock); } old_net_dev = table->data_vec[ix].attr.ndev; @@ -162,17 +217,6 @@ static int write_gid(struct ib_device *ib_dev, u8 port, table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID; - write_unlock_irqrestore(&table->data_vec[ix].lock, flags); - - if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) { - struct ib_event event; - - event.device = ib_dev; - event.element.port_num = port; - event.event = IB_EVENT_GID_CHANGE; - - ib_dispatch_event(&event); - } return ret; } @@ -201,41 +245,58 @@ static int del_gid(struct ib_device *ib_dev, u8 port, GID_TABLE_WRITE_ACTION_DEL, default_gid); } +/* rwlock should be read locked */ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, const struct ib_gid_attr *val, bool default_gid, - unsigned long mask) + unsigned long mask, int *pempty) { - int i; + int i = 0; + int found = -1; + int empty = pempty ? -1 : 0; - for (i = 0; i < table->sz; i++) { - unsigned long flags; - struct ib_gid_attr *attr = &table->data_vec[i].attr; + while (i < table->sz && (found < 0 || empty < 0)) { + struct ib_gid_table_entry *data = &table->data_vec[i]; + struct ib_gid_attr *attr = &data->attr; + int curr_index = i; - read_lock_irqsave(&table->data_vec[i].lock, flags); + i++; - if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) - goto next; + if (data->props & GID_TABLE_ENTRY_INVALID) + continue; + + if (empty < 0) + if (!memcmp(&data->gid, &zgid, sizeof(*gid)) && + !memcmp(attr, &zattr, sizeof(*attr)) && + !data->props) + empty = curr_index; + + if (found >= 0) + continue; + + if (mask & GID_ATTR_FIND_MASK_GID_TYPE && + attr->gid_type != val->gid_type) + continue; if (mask & GID_ATTR_FIND_MASK_GID && - memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) - goto next; + memcmp(gid, &data->gid, sizeof(*gid))) + continue; if (mask & GID_ATTR_FIND_MASK_NETDEV && attr->ndev != val->ndev) - goto next; + continue; if (mask & GID_ATTR_FIND_MASK_DEFAULT && - !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) != + !!(data->props & GID_TABLE_ENTRY_DEFAULT) != default_gid) - goto next; + continue; - read_unlock_irqrestore(&table->data_vec[i].lock, flags); - return i; -next: - read_unlock_irqrestore(&table->data_vec[i].lock, flags); + found = curr_index; } - return -1; + if (pempty) + *pempty = empty; + + return found; } static void make_default_gid(struct net_device *dev, union ib_gid *gid) @@ -252,6 +313,7 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, int ix; int ret = 0; struct net_device *idev; + int empty; table = ports_table[port - rdma_start_port(ib_dev)]; @@ -275,22 +337,25 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, } mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | - GID_ATTR_FIND_MASK_NETDEV); + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV, &empty); if (ix >= 0) goto out_unlock; - ix = 
find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID | - GID_ATTR_FIND_MASK_DEFAULT); - if (ix < 0) { + if (empty < 0) { ret = -ENOSPC; goto out_unlock; } - add_gid(ib_dev, port, table, ix, gid, attr, false); + ret = add_gid(ib_dev, port, table, empty, gid, attr, false); + if (!ret) + dispatch_gid_change_event(ib_dev, port); out_unlock: + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); return ret; } @@ -305,17 +370,22 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, table = ports_table[port - rdma_start_port(ib_dev)]; mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_NETDEV | - GID_ATTR_FIND_MASK_DEFAULT); + GID_ATTR_FIND_MASK_DEFAULT, + NULL); if (ix < 0) goto out_unlock; - del_gid(ib_dev, port, table, ix, false); + if (!del_gid(ib_dev, port, table, ix, false)) + dispatch_gid_change_event(ib_dev, port); out_unlock: + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); return 0; } @@ -326,16 +396,24 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; int ix; + bool deleted = false; table = ports_table[port - rdma_start_port(ib_dev)]; mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); for (ix = 0; ix < table->sz; ix++) if (table->data_vec[ix].attr.ndev == ndev) - del_gid(ib_dev, port, table, ix, false); + if (!del_gid(ib_dev, port, table, ix, false)) + deleted = true; + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); + return 0; } @@ -344,18 +422,14 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; - unsigned long flags; table = ports_table[port - rdma_start_port(ib_dev)]; if (index < 0 || index >= table->sz) return -EINVAL; - read_lock_irqsave(&table->data_vec[index].lock, flags); - if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) { - read_unlock_irqrestore(&table->data_vec[index].lock, flags); + if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) return -EAGAIN; - } memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); if (attr) { @@ -364,7 +438,6 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, dev_hold(attr->ndev); } - read_unlock_irqrestore(&table->data_vec[index].lock, flags); return 0; } @@ -378,17 +451,21 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev, struct ib_gid_table *table; u8 p; int local_index; + unsigned long flags; for (p = 0; p < ib_dev->phys_port_cnt; p++) { table = ports_table[p]; - local_index = find_gid(table, gid, val, false, mask); + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, val, false, mask, NULL); if (local_index >= 0) { if (index) *index = local_index; if (port) *port = p + rdma_start_port(ib_dev); + read_unlock_irqrestore(&table->rwlock, flags); return 0; } + read_unlock_irqrestore(&table->rwlock, flags); } return -ENOENT; @@ -396,11 +473,13 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev, static int ib_cache_gid_find(struct ib_device *ib_dev, const union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port, u16 *index) { - unsigned long mask = GID_ATTR_FIND_MASK_GID; - struct ib_gid_attr gid_attr_val = {.ndev = ndev}; + unsigned long mask = 
GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; @@ -411,14 +490,17 @@ static int ib_cache_gid_find(struct ib_device *ib_dev, int ib_find_cached_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, + enum ib_gid_type gid_type, u8 port, struct net_device *ndev, u16 *index) { int local_index; struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; - unsigned long mask = GID_ATTR_FIND_MASK_GID; - struct ib_gid_attr val = {.ndev = ndev}; + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; + unsigned long flags; if (port < rdma_start_port(ib_dev) || port > rdma_end_port(ib_dev)) @@ -429,13 +511,16 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev, if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; - local_index = find_gid(table, gid, &val, false, mask); + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, &val, false, mask, NULL); if (local_index >= 0) { if (index) *index = local_index; + read_unlock_irqrestore(&table->rwlock, flags); return 0; } + read_unlock_irqrestore(&table->rwlock, flags); return -ENOENT; } EXPORT_SYMBOL(ib_find_cached_gid_by_port); @@ -472,6 +557,7 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; unsigned int i; + unsigned long flags; bool found = false; if (!ports_table) @@ -484,11 +570,10 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, table = ports_table[port - rdma_start_port(ib_dev)]; + read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { struct ib_gid_attr attr; - unsigned long flags; - read_lock_irqsave(&table->data_vec[i].lock, flags); if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) goto next; @@ -501,11 +586,10 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, found = true; next: - read_unlock_irqrestore(&table->data_vec[i].lock, flags); - if (found) break; } + read_unlock_irqrestore(&table->rwlock, flags); if (!found) return -ENOENT; @@ -517,9 +601,9 @@ next: static struct ib_gid_table *alloc_gid_table(int sz) { - unsigned int i; struct ib_gid_table *table = kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); + if (!table) return NULL; @@ -530,9 +614,7 @@ static struct ib_gid_table *alloc_gid_table(int sz) mutex_init(&table->lock); table->sz = sz; - - for (i = 0; i < sz; i++) - rwlock_init(&table->data_vec[i].lock); + rwlock_init(&table->rwlock); return table; @@ -553,30 +635,37 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table) { int i; + bool deleted = false; if (!table) return; + write_lock_irq(&table->rwlock); for (i = 0; i < table->sz; ++i) { if (memcmp(&table->data_vec[i].gid, &zgid, sizeof(table->data_vec[i].gid))) - del_gid(ib_dev, port, table, i, - table->data_vec[i].props & - GID_ATTR_FIND_MASK_DEFAULT); + if (!del_gid(ib_dev, port, table, i, + table->data_vec[i].props & + GID_ATTR_FIND_MASK_DEFAULT)) + deleted = true; } + write_unlock_irq(&table->rwlock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); } void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, struct net_device *ndev, + unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode) { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; union 
ib_gid gid; struct ib_gid_attr gid_attr; + struct ib_gid_attr zattr_type = zattr; struct ib_gid_table *table; - int ix; - union ib_gid current_gid; - struct ib_gid_attr current_gid_attr = {}; + unsigned int gid_type; table = ports_table[port - rdma_start_port(ib_dev)]; @@ -584,46 +673,82 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; - mutex_lock(&table->lock); - ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT); - - /* Coudn't find default GID location */ - WARN_ON(ix < 0); - - if (!__ib_cache_gid_get(ib_dev, port, ix, - &current_gid, &current_gid_attr) && - mode == IB_CACHE_GID_DEFAULT_MODE_SET && - !memcmp(&gid, &current_gid, sizeof(gid)) && - !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr))) - goto unlock; - - if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) || - memcmp(&current_gid_attr, &zattr, - sizeof(current_gid_attr))) && - del_gid(ib_dev, port, table, ix, true)) { - pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", - ix, gid.raw); - goto unlock; - } + for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { + int ix; + union ib_gid current_gid; + struct ib_gid_attr current_gid_attr = {}; + + if (1UL << gid_type & ~gid_type_mask) + continue; + + gid_attr.gid_type = gid_type; + + mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); + ix = find_gid(table, NULL, &gid_attr, true, + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_DEFAULT, + NULL); + + /* Coudn't find default GID location */ + WARN_ON(ix < 0); + + zattr_type.gid_type = gid_type; + + if (!__ib_cache_gid_get(ib_dev, port, ix, + &current_gid, &current_gid_attr) && + mode == IB_CACHE_GID_DEFAULT_MODE_SET && + !memcmp(&gid, &current_gid, sizeof(gid)) && + !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr))) + goto release; + + if (memcmp(&current_gid, &zgid, sizeof(current_gid)) || + memcmp(&current_gid_attr, &zattr_type, + sizeof(current_gid_attr))) { + if (del_gid(ib_dev, port, table, ix, true)) { + pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", + ix, gid.raw); + goto release; + } else { + dispatch_gid_change_event(ib_dev, port); + } + } - if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) - if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) - pr_warn("ib_cache_gid: unable to add default gid %pI6\n", - gid.raw); + if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { + if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) + pr_warn("ib_cache_gid: unable to add default gid %pI6\n", + gid.raw); + else + dispatch_gid_change_event(ib_dev, port); + } -unlock: - if (current_gid_attr.ndev) - dev_put(current_gid_attr.ndev); - mutex_unlock(&table->lock); +release: + if (current_gid_attr.ndev) + dev_put(current_gid_attr.ndev); + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + } } static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table) { - if (rdma_protocol_roce(ib_dev, port)) { - struct ib_gid_table_entry *entry = &table->data_vec[0]; + unsigned int i; + unsigned long roce_gid_type_mask; + unsigned int num_default_gids; + unsigned int current_gid = 0; + + roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + num_default_gids = hweight_long(roce_gid_type_mask); + for (i = 0; i < num_default_gids && i < table->sz; i++) { + struct ib_gid_table_entry *entry = + &table->data_vec[i]; entry->props |= GID_TABLE_ENTRY_DEFAULT; + current_gid = find_next_bit(&roce_gid_type_mask, + BITS_PER_LONG, + current_gid); + entry->attr.gid_type = current_gid++; } return 0; @@
-728,20 +853,30 @@ int ib_get_cached_gid(struct ib_device *device, union ib_gid *gid, struct ib_gid_attr *gid_attr) { + int res; + unsigned long flags; + struct ib_gid_table **ports_table = device->cache.gid_cache; + struct ib_gid_table *table = ports_table[port_num - rdma_start_port(device)]; + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; - return __ib_cache_gid_get(device, port_num, index, gid, gid_attr); + read_lock_irqsave(&table->rwlock, flags); + res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr); + read_unlock_irqrestore(&table->rwlock, flags); + + return res; } EXPORT_SYMBOL(ib_get_cached_gid); int ib_find_cached_gid(struct ib_device *device, const union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port_num, u16 *index) { - return ib_cache_gid_find(device, gid, ndev, port_num, index); + return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index); } EXPORT_SYMBOL(ib_find_cached_gid); @@ -956,10 +1091,12 @@ static void ib_cache_update(struct ib_device *device, device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache; if (!use_roce_gid_table) { + write_lock(&table->rwlock); for (i = 0; i < gid_cache->table_len; i++) { modify_gid(device, port, table, i, gid_cache->table + i, &zattr, false); } + write_unlock(&table->rwlock); } device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc; diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 0a26dd6d9b19..1d92e091e22e 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -364,7 +364,7 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, - ndev, &p, NULL)) { + path->gid_type, ndev, &p, NULL)) { port = cm_dev->port[p-1]; break; } @@ -782,11 +782,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); /* Check if the device started its remove_one */ - spin_lock_irq(&cm.lock); + spin_lock_irqsave(&cm.lock, flags); if (!cm_dev->going_down) queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, msecs_to_jiffies(wait_time)); - spin_unlock_irq(&cm.lock); + spin_unlock_irqrestore(&cm.lock, flags); cm_id_priv->timewait_info = NULL; } @@ -1600,6 +1600,8 @@ static int cm_req_handler(struct cm_work *work) struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_req_msg *req_msg; + union ib_gid gid; + struct ib_gid_attr gid_attr; int ret; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; @@ -1639,11 +1641,31 @@ static int cm_req_handler(struct cm_work *work) cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + work->path[0].hop_limit = cm_id_priv->av.ah_attr.grh.hop_limit; + ret = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, + cm_id_priv->av.ah_attr.grh.sgid_index, + &gid, &gid_attr); + if (!ret) { + if (gid_attr.ndev) { + work->path[0].ifindex = gid_attr.ndev->ifindex; + work->path[0].net = dev_net(gid_attr.ndev); + dev_put(gid_attr.ndev); + } + work->path[0].gid_type = gid_attr.gid_type; + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + } if (ret) { - ib_get_cached_gid(work->port->cm_dev->ib_device, - 
work->port->port_num, 0, &work->path[0].sgid, - NULL); + int err = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, + &work->path[0].sgid, + &gid_attr); + if (!err && gid_attr.ndev) { + work->path[0].ifindex = gid_attr.ndev->ifindex; + work->path[0].net = dev_net(gid_attr.ndev); + dev_put(gid_attr.ndev); + } + work->path[0].gid_type = gid_attr.gid_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); @@ -3482,6 +3504,7 @@ int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event) EXPORT_SYMBOL(ib_cm_notify); static void cm_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct cm_port *port = mad_agent->context; @@ -3731,16 +3754,6 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, } EXPORT_SYMBOL(ib_cm_init_qp_attr); -static void cm_get_ack_delay(struct cm_device *cm_dev) -{ - struct ib_device_attr attr; - - if (ib_query_device(cm_dev->ib_device, &attr)) - cm_dev->ack_delay = 0; /* acks will rely on packet life time */ - else - cm_dev->ack_delay = attr.local_ca_ack_delay; -} - static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, char *buf) { @@ -3852,7 +3865,7 @@ static void cm_add_one(struct ib_device *ib_device) return; cm_dev->ib_device = ib_device; - cm_get_ack_delay(cm_dev); + cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 2d762a2ecd81..9729639df407 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -38,6 +38,7 @@ #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> +#include <linux/igmp.h> #include <linux/idr.h> #include <linux/inetdevice.h> #include <linux/slab.h> @@ -60,6 +61,8 @@ #include <rdma/ib_sa.h> #include <rdma/iw_cm.h> +#include "core_priv.h" + MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); @@ -150,6 +153,7 @@ struct cma_device { struct completion comp; atomic_t refcount; struct list_head id_list; + enum ib_gid_type *default_gid_type; }; struct rdma_bind_list { @@ -185,6 +189,67 @@ enum { CMA_OPTION_AFONLY, }; +void cma_ref_dev(struct cma_device *cma_dev) +{ + atomic_inc(&cma_dev->refcount); +} + +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie) +{ + struct cma_device *cma_dev; + struct cma_device *found_cma_dev = NULL; + + mutex_lock(&lock); + + list_for_each_entry(cma_dev, &dev_list, list) + if (filter(cma_dev->device, cookie)) { + found_cma_dev = cma_dev; + break; + } + + if (found_cma_dev) + cma_ref_dev(found_cma_dev); + mutex_unlock(&lock); + return found_cma_dev; +} + +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port) +{ + if (port < rdma_start_port(cma_dev->device) || + port > rdma_end_port(cma_dev->device)) + return -EINVAL; + + return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; +} + +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type) +{ + unsigned long supported_gids; + + if (port < rdma_start_port(cma_dev->device) || + port > rdma_end_port(cma_dev->device)) + return -EINVAL; + + supported_gids = roce_gid_type_mask_support(cma_dev->device, port); + + if (!(supported_gids & 1 << default_gid_type)) + return -EINVAL; + + 
cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = + default_gid_type; + + return 0; +} + +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) +{ + return cma_dev->device; +} + /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. @@ -228,6 +293,7 @@ struct rdma_id_private { u8 tos; u8 reuseaddr; u8 afonly; + enum ib_gid_type gid_type; }; struct cma_multicast { @@ -239,6 +305,7 @@ struct cma_multicast { void *context; struct sockaddr_storage addr; struct kref mcref; + bool igmp_joined; }; struct cma_work { @@ -335,18 +402,48 @@ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } -static void cma_attach_to_dev(struct rdma_id_private *id_priv, - struct cma_device *cma_dev) +static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) { - atomic_inc(&cma_dev->refcount); + struct in_device *in_dev = NULL; + + if (ndev) { + rtnl_lock(); + in_dev = __in_dev_get_rtnl(ndev); + if (in_dev) { + if (join) + ip_mc_inc_group(in_dev, + *(__be32 *)(mgid->raw + 12)); + else + ip_mc_dec_group(in_dev, + *(__be32 *)(mgid->raw + 12)); + } + rtnl_unlock(); + } + return (in_dev) ? 0 : -ENODEV; +} + +static void _cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + cma_ref_dev(cma_dev); id_priv->cma_dev = cma_dev; + id_priv->gid_type = 0; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } -static inline void cma_deref_dev(struct cma_device *cma_dev) +static void cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + _cma_attach_to_dev(id_priv, cma_dev); + id_priv->gid_type = + cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(cma_dev->device)]; +} + +void cma_deref_dev(struct cma_device *cma_dev) { if (atomic_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); @@ -441,6 +538,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a } static inline int cma_validate_port(struct ib_device *device, u8 port, + enum ib_gid_type gid_type, union ib_gid *gid, int dev_type, int bound_if_index) { @@ -453,10 +551,25 @@ static inline int cma_validate_port(struct ib_device *device, u8 port, if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) return ret; - if (dev_type == ARPHRD_ETHER) + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(&init_net, bound_if_index); + if (ndev && ndev->flags & IFF_LOOPBACK) { + pr_info("detected loopback device\n"); + dev_put(ndev); - ret = ib_find_cached_gid_by_port(device, gid, port, ndev, NULL); + if (!device->get_netdev) + return -EOPNOTSUPP; + + ndev = device->get_netdev(device, port); + if (!ndev) + return -ENODEV; + } + } else { + gid_type = IB_GID_TYPE_IB; + } + + ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, + ndev, NULL); if (ndev) dev_put(ndev); @@ -490,7 +603,10 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; - ret = cma_validate_port(cma_dev->device, port, gidp, + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? 
+ IB_GID_TYPE_IB : + listen_id_priv->gid_type, gidp, dev_addr->dev_type, dev_addr->bound_dev_if); if (!ret) { @@ -509,8 +625,11 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; - ret = cma_validate_port(cma_dev->device, port, gidp, - dev_addr->dev_type, + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? + IB_GID_TYPE_IB : + cma_dev->default_gid_type[port - 1], + gidp, dev_addr->dev_type, dev_addr->bound_dev_if); if (!ret) { id_priv->id.port_num = port; @@ -1437,8 +1556,24 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) id_priv->id.port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); - } else + } else { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, + dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, + &mc->multicast.ib->rec.mgid, + false); + dev_put(ndev); + } + } kref_put(&mc->mcref, release_mc); + } } } @@ -1896,7 +2031,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; int ret; - struct ib_device_attr attr; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; @@ -1938,13 +2072,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); - ret = ib_query_device(conn_id->id.device, &attr); - if (ret) { - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; - } - memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; @@ -2051,7 +2178,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); - cma_attach_to_dev(dev_id_priv, cma_dev); + _cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; @@ -2321,8 +2448,23 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) if (addr->dev_addr.bound_dev_if) { ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); + if (!ndev) + return -ENODEV; + + if (ndev->flags & IFF_LOOPBACK) { + dev_put(ndev); + if (!id_priv->id.device->get_netdev) + return -EOPNOTSUPP; + + ndev = id_priv->id.device->get_netdev(id_priv->id.device, + id_priv->id.port_num); + if (!ndev) + return -ENODEV; + } + route->path_rec->net = &init_net; - route->path_rec->ifindex = addr->dev_addr.bound_dev_if; + route->path_rec->ifindex = ndev->ifindex; + route->path_rec->gid_type = id_priv->gid_type; } if (!ndev) { ret = -ENODEV; @@ -2336,7 +2478,14 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); - route->path_rec->hop_limit = 1; + /* Use the hint from IP Stack to select GID Type */ + if (route->path_rec->gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + route->path_rec->gid_type = ib_network_to_gid_type(addr->dev_addr.network); + if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) + /* TODO: get the 
hoplimit from the inet/inet6 device */ + route->path_rec->hop_limit = addr->dev_addr.hoplimit; + else + route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; @@ -3534,12 +3683,23 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) event.status = status; event.param.ud.private_data = mc->context; if (!status) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = + dev_get_by_index(&init_net, dev_addr->bound_dev_if); + enum ib_gid_type gid_type = + id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + event.event = RDMA_CM_EVENT_MULTICAST_JOIN; ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, + ndev, gid_type, &event.param.ud.ah_attr); event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); + if (ndev) + dev_put(ndev); } else event.event = RDMA_CM_EVENT_MULTICAST_ERROR; @@ -3672,9 +3832,10 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, { struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - int err; + int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; + enum ib_gid_type gid_type; if (cma_zero_addr((struct sockaddr *)&mc->addr)) return -EINVAL; @@ -3704,9 +3865,25 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); + + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + if (addr->sa_family == AF_INET) { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, + true); + if (!err) { + mc->igmp_joined = true; + mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + } + } else { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + err = -ENOTSUPP; + } dev_put(ndev); - if (!mc->multicast.ib->rec.mtu) { - err = -EINVAL; + if (err || !mc->multicast.ib->rec.mtu) { + if (!err) + err = -EINVAL; goto out2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, @@ -3745,7 +3922,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; - + mc->igmp_joined = false; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); @@ -3790,9 +3967,25 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) if (rdma_cap_ib_mcast(id->device, id->port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); - } else if (rdma_protocol_roce(id->device, id->port_num)) + } else if (rdma_protocol_roce(id->device, id->port_num)) { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id->route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, + dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, + &mc->multicast.ib->rec.mgid, + false); + dev_put(ndev); + } + mc->igmp_joined = false; + } kref_put(&mc->mcref, release_mc); - + } return; } } @@ -3861,12 +4054,27 @@ static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; struct rdma_id_private *id_priv; + unsigned int i; + 
unsigned long supported_gids = 0; cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); if (!cma_dev) return; cma_dev->device = device; + cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, + sizeof(*cma_dev->default_gid_type), + GFP_KERNEL); + if (!cma_dev->default_gid_type) { + kfree(cma_dev); + return; + } + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + supported_gids = roce_gid_type_mask_support(device, i); + WARN_ON(!supported_gids); + cma_dev->default_gid_type[i - rdma_start_port(device)] = + find_first_bit(&supported_gids, BITS_PER_LONG); + } init_completion(&cma_dev->comp); atomic_set(&cma_dev->refcount, 1); @@ -3946,6 +4154,7 @@ static void cma_remove_one(struct ib_device *device, void *client_data) mutex_unlock(&lock); cma_process_remove(cma_dev); + kfree(cma_dev->default_gid_type); kfree(cma_dev); } @@ -4079,6 +4288,7 @@ static int __init cma_init(void) if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table)) printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n"); + cma_configfs_init(); return 0; @@ -4093,6 +4303,7 @@ err_wq: static void __exit cma_cleanup(void) { + cma_configfs_exit(); ibnl_remove_client(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c new file mode 100644 index 000000000000..18b112aa577e --- /dev/null +++ b/drivers/infiniband/core/cma_configfs.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
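cma.c now tracks a per-port default GID type (RoCE v1 vs. RoCE v2), initialised in cma_add_one() from roce_gid_type_mask_support() and manipulated through the new cma_get_default_gid_type()/cma_set_default_gid_type() helpers; the cma_configfs.c file that begins above exposes them to user space as a default_roce_mode attribute under the rdma_cm configfs subsystem. A hedged sketch of how the helpers compose (example_force_rocev2 is illustrative only, not part of the patch):

/* Sketch only: cma_dev must be a reference obtained from
 * cma_enum_devices_by_ibdev() and released with cma_deref_dev(). */
static int example_force_rocev2(struct cma_device *cma_dev, unsigned int port)
{
	int cur = cma_get_default_gid_type(cma_dev, port);

	if (cur < 0)
		return cur;	/* invalid port number */

	/* Fails with -EINVAL unless the port advertises RoCE v2 support
	 * in roce_gid_type_mask_support(). */
	return cma_set_default_gid_type(cma_dev, port,
					IB_GID_TYPE_ROCE_UDP_ENCAP);
}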
+ */ + +#include <linux/module.h> +#include <linux/configfs.h> +#include <rdma/ib_verbs.h> +#include "core_priv.h" + +struct cma_device; + +struct cma_dev_group; + +struct cma_dev_port_group { + unsigned int port_num; + struct cma_dev_group *cma_dev_group; + struct config_group group; +}; + +struct cma_dev_group { + char name[IB_DEVICE_NAME_MAX]; + struct config_group device_group; + struct config_group ports_group; + struct config_group *default_dev_group[2]; + struct config_group **default_ports_group; + struct cma_dev_port_group *ports; +}; + +static struct cma_dev_port_group *to_dev_port_group(struct config_item *item) +{ + struct config_group *group; + + if (!item) + return NULL; + + group = container_of(item, struct config_group, cg_item); + return container_of(group, struct cma_dev_port_group, group); +} + +static bool filter_by_name(struct ib_device *ib_dev, void *cookie) +{ + return !strcmp(ib_dev->name, cookie); +} + +static int cma_configfs_params_get(struct config_item *item, + struct cma_device **pcma_dev, + struct cma_dev_port_group **pgroup) +{ + struct cma_dev_port_group *group = to_dev_port_group(item); + struct cma_device *cma_dev; + + if (!group) + return -ENODEV; + + cma_dev = cma_enum_devices_by_ibdev(filter_by_name, + group->cma_dev_group->name); + if (!cma_dev) + return -ENODEV; + + *pcma_dev = cma_dev; + *pgroup = group; + + return 0; +} + +static void cma_configfs_params_put(struct cma_device *cma_dev) +{ + cma_deref_dev(cma_dev); +} + +static ssize_t default_roce_mode_show(struct config_item *item, + char *buf) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + int gid_type; + ssize_t ret; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + gid_type = cma_get_default_gid_type(cma_dev, group->port_num); + cma_configfs_params_put(cma_dev); + + if (gid_type < 0) + return gid_type; + + return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_type)); +} + +static ssize_t default_roce_mode_store(struct config_item *item, + const char *buf, size_t count) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + int gid_type = ib_cache_gid_parse_type_str(buf); + ssize_t ret; + + if (gid_type < 0) + return -EINVAL; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type); + + cma_configfs_params_put(cma_dev); + + return !ret ? 
strnlen(buf, count) : ret; +} + +CONFIGFS_ATTR(, default_roce_mode); + +static struct configfs_attribute *cma_configfs_attributes[] = { + &attr_default_roce_mode, + NULL, +}; + +static struct config_item_type cma_port_group_type = { + .ct_attrs = cma_configfs_attributes, + .ct_owner = THIS_MODULE +}; + +static int make_cma_ports(struct cma_dev_group *cma_dev_group, + struct cma_device *cma_dev) +{ + struct ib_device *ibdev; + unsigned int i; + unsigned int ports_num; + struct cma_dev_port_group *ports; + struct config_group **ports_group; + int err; + + ibdev = cma_get_ib_dev(cma_dev); + + if (!ibdev) + return -ENODEV; + + ports_num = ibdev->phys_port_cnt; + ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports), + GFP_KERNEL); + ports_group = kcalloc(ports_num + 1, sizeof(*ports_group), GFP_KERNEL); + + if (!ports || !ports_group) { + err = -ENOMEM; + goto free; + } + + for (i = 0; i < ports_num; i++) { + char port_str[10]; + + ports[i].port_num = i + 1; + snprintf(port_str, sizeof(port_str), "%u", i + 1); + ports[i].cma_dev_group = cma_dev_group; + config_group_init_type_name(&ports[i].group, + port_str, + &cma_port_group_type); + ports_group[i] = &ports[i].group; + } + ports_group[i] = NULL; + cma_dev_group->default_ports_group = ports_group; + cma_dev_group->ports = ports; + + return 0; +free: + kfree(ports); + kfree(ports_group); + cma_dev_group->ports = NULL; + cma_dev_group->default_ports_group = NULL; + return err; +} + +static void release_cma_dev(struct config_item *item) +{ + struct config_group *group = container_of(item, struct config_group, + cg_item); + struct cma_dev_group *cma_dev_group = container_of(group, + struct cma_dev_group, + device_group); + + kfree(cma_dev_group); +}; + +static void release_cma_ports_group(struct config_item *item) +{ + struct config_group *group = container_of(item, struct config_group, + cg_item); + struct cma_dev_group *cma_dev_group = container_of(group, + struct cma_dev_group, + ports_group); + + kfree(cma_dev_group->ports); + kfree(cma_dev_group->default_ports_group); + cma_dev_group->ports = NULL; + cma_dev_group->default_ports_group = NULL; +}; + +static struct configfs_item_operations cma_ports_item_ops = { + .release = release_cma_ports_group +}; + +static struct config_item_type cma_ports_group_type = { + .ct_item_ops = &cma_ports_item_ops, + .ct_owner = THIS_MODULE +}; + +static struct configfs_item_operations cma_device_item_ops = { + .release = release_cma_dev +}; + +static struct config_item_type cma_device_group_type = { + .ct_item_ops = &cma_device_item_ops, + .ct_owner = THIS_MODULE +}; + +static struct config_group *make_cma_dev(struct config_group *group, + const char *name) +{ + int err = -ENODEV; + struct cma_device *cma_dev = cma_enum_devices_by_ibdev(filter_by_name, + (void *)name); + struct cma_dev_group *cma_dev_group = NULL; + + if (!cma_dev) + goto fail; + + cma_dev_group = kzalloc(sizeof(*cma_dev_group), GFP_KERNEL); + + if (!cma_dev_group) { + err = -ENOMEM; + goto fail; + } + + strncpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); + + err = make_cma_ports(cma_dev_group, cma_dev); + if (err) + goto fail; + + cma_dev_group->ports_group.default_groups = + cma_dev_group->default_ports_group; + config_group_init_type_name(&cma_dev_group->ports_group, "ports", + &cma_ports_group_type); + + cma_dev_group->device_group.default_groups + = cma_dev_group->default_dev_group; + cma_dev_group->default_dev_group[0] = &cma_dev_group->ports_group; + cma_dev_group->default_dev_group[1] = NULL; + + 
config_group_init_type_name(&cma_dev_group->device_group, name, + &cma_device_group_type); + + cma_deref_dev(cma_dev); + return &cma_dev_group->device_group; + +fail: + if (cma_dev) + cma_deref_dev(cma_dev); + kfree(cma_dev_group); + return ERR_PTR(err); +} + +static struct configfs_group_operations cma_subsys_group_ops = { + .make_group = make_cma_dev, +}; + +static struct config_item_type cma_subsys_type = { + .ct_group_ops = &cma_subsys_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem cma_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "rdma_cm", + .ci_type = &cma_subsys_type, + }, + }, +}; + +int __init cma_configfs_init(void) +{ + config_group_init(&cma_subsys.su_group); + mutex_init(&cma_subsys.su_mutex); + return configfs_register_subsystem(&cma_subsys); +} + +void __exit cma_configfs_exit(void) +{ + configfs_unregister_subsystem(&cma_subsys); +} diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 5cf6eb716f00..eab32215756b 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -38,6 +38,32 @@ #include <rdma/ib_verbs.h> +#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) +int cma_configfs_init(void); +void cma_configfs_exit(void); +#else +static inline int cma_configfs_init(void) +{ + return 0; +} + +static inline void cma_configfs_exit(void) +{ +} +#endif +struct cma_device; +void cma_ref_dev(struct cma_device *cma_dev); +void cma_deref_dev(struct cma_device *cma_dev); +typedef bool (*cma_device_filter)(struct ib_device *, void *); +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie); +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port); +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type); +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev); + int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); @@ -70,8 +96,13 @@ enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_DELETE }; +int ib_cache_gid_parse_type_str(const char *buf); + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); + void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, struct net_device *ndev, + unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode); int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, @@ -87,9 +118,23 @@ int roce_gid_mgmt_init(void); void roce_gid_mgmt_cleanup(void); int roce_rescan_device(struct ib_device *ib_dev); +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); int ib_cache_setup_one(struct ib_device *device); void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); +static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, + struct net_device *upper) +{ + struct net_device *_upper = NULL; + struct list_head *iter; + + netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) + if (_upper == upper) + break; + + return _upper == upper; +} + #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c new file mode 100644 index 000000000000..a754fc727de5 --- /dev/null +++ b/drivers/infiniband/core/cq.c @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2015 HGST, a Western Digital Company. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/module.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <rdma/ib_verbs.h> + +/* # of WCs to poll for with a single call to ib_poll_cq */ +#define IB_POLL_BATCH 16 + +/* # of WCs to iterate over before yielding */ +#define IB_POLL_BUDGET_IRQ 256 +#define IB_POLL_BUDGET_WORKQUEUE 65536 + +#define IB_POLL_FLAGS \ + (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) + +static int __ib_process_cq(struct ib_cq *cq, int budget) +{ + int i, n, completed = 0; + + while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) { + for (i = 0; i < n; i++) { + struct ib_wc *wc = &cq->wc[i]; + + if (wc->wr_cqe) + wc->wr_cqe->done(cq, wc); + else + WARN_ON_ONCE(wc->status == IB_WC_SUCCESS); + } + + completed += n; + + if (n != IB_POLL_BATCH || + (budget != -1 && completed >= budget)) + break; + } + + return completed; +} + +/** + * ib_process_direct_cq - process a CQ in caller context + * @cq: CQ to process + * @budget: number of CQEs to poll for + * + * This function is used to process all outstanding CQ entries on a + * %IB_POLL_DIRECT CQ. It does not offload CQ processing to a different + * context and does not ask for completion interrupts from the HCA. + * + * Note: for compatibility reasons -1 can be passed in %budget for unlimited + * polling. Do not use this feature in new code, it will be removed soon. + */ +int ib_process_cq_direct(struct ib_cq *cq, int budget) +{ + WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT); + + return __ib_process_cq(cq, budget); +} +EXPORT_SYMBOL(ib_process_cq_direct); + +static void ib_cq_completion_direct(struct ib_cq *cq, void *private) +{ + WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq); +} + +static int ib_poll_handler(struct irq_poll *iop, int budget) +{ + struct ib_cq *cq = container_of(iop, struct ib_cq, iop); + int completed; + + completed = __ib_process_cq(cq, budget); + if (completed < budget) { + irq_poll_complete(&cq->iop); + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) + irq_poll_sched(&cq->iop); + } + + return completed; +} + +static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) +{ + irq_poll_sched(&cq->iop); +} + +static void ib_cq_poll_work(struct work_struct *work) +{ + struct ib_cq *cq = container_of(work, struct ib_cq, work); + int completed; + + completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE); + if (completed >= IB_POLL_BUDGET_WORKQUEUE || + ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) + queue_work(ib_comp_wq, &cq->work); +} + +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) +{ + queue_work(ib_comp_wq, &cq->work); +} + +/** + * ib_alloc_cq - allocate a completion queue + * @dev: device to allocate the CQ for + * @private: driver private data, accessible from cq->cq_context + * @nr_cqe: number of CQEs to allocate + * @comp_vector: HCA completion vectors for this CQ + * @poll_ctx: context to poll the CQ from. + * + * This is the proper interface to allocate a CQ for in-kernel users. A + * CQ allocated with this interface will automatically be polled from the + * specified context. 
The ULP needs must use wr->wr_cqe instead of wr->wr_id + * to use this CQ abstraction. + */ +struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) +{ + struct ib_cq_init_attr cq_attr = { + .cqe = nr_cqe, + .comp_vector = comp_vector, + }; + struct ib_cq *cq; + int ret = -ENOMEM; + + cq = dev->create_cq(dev, &cq_attr, NULL, NULL); + if (IS_ERR(cq)) + return cq; + + cq->device = dev; + cq->uobject = NULL; + cq->event_handler = NULL; + cq->cq_context = private; + cq->poll_ctx = poll_ctx; + atomic_set(&cq->usecnt, 0); + + cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); + if (!cq->wc) + goto out_destroy_cq; + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + cq->comp_handler = ib_cq_completion_direct; + break; + case IB_POLL_SOFTIRQ: + cq->comp_handler = ib_cq_completion_softirq; + + irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; + case IB_POLL_WORKQUEUE: + cq->comp_handler = ib_cq_completion_workqueue; + INIT_WORK(&cq->work, ib_cq_poll_work); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; + default: + ret = -EINVAL; + goto out_free_wc; + } + + return cq; + +out_free_wc: + kfree(cq->wc); +out_destroy_cq: + cq->device->destroy_cq(cq); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_alloc_cq); + +/** + * ib_free_cq - free a completion queue + * @cq: completion queue to free. + */ +void ib_free_cq(struct ib_cq *cq) +{ + int ret; + + if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) + return; + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + break; + case IB_POLL_SOFTIRQ: + irq_poll_disable(&cq->iop); + break; + case IB_POLL_WORKQUEUE: + flush_work(&cq->work); + break; + default: + WARN_ON_ONCE(1); + } + + kfree(cq->wc); + ret = cq->device->destroy_cq(cq); + WARN_ON_ONCE(ret); +} +EXPORT_SYMBOL(ib_free_cq); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 179e8134d57f..00da80e02154 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -58,6 +58,7 @@ struct ib_client_data { bool going_down; }; +struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); @@ -325,6 +326,7 @@ int ib_register_device(struct ib_device *device, { int ret; struct ib_client *client; + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; mutex_lock(&device_mutex); @@ -352,6 +354,13 @@ int ib_register_device(struct ib_device *device, goto out; } + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->query_device(device, &device->attrs, &uhw); + if (ret) { + printk(KERN_WARNING "Couldn't query the device attributes\n"); + goto out; + } + ret = ib_device_register_sysfs(device, port_callback); if (ret) { printk(KERN_WARNING "Couldn't register device %s with driver model\n", @@ -628,25 +637,6 @@ void ib_dispatch_event(struct ib_event *event) EXPORT_SYMBOL(ib_dispatch_event); /** - * ib_query_device - Query IB device attributes - * @device:Device to query - * @device_attr:Device attributes - * - * ib_query_device() returns the attributes of a device through the - * @device_attr pointer. 
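A usage sketch for the allocation API above, reusing the hypothetical example_request/example_send_done pieces from the earlier sketch: the CQ is created in IB_POLL_WORKQUEUE mode, send WRs carry a wr_cqe instead of a wr_id, and teardown goes through ib_free_cq(). This is illustrative only; a real ULP would create its QP with this CQ and size nr_cqe to its queue depth.

#include <rdma/ib_verbs.h>

/* Init path: one CQ, polled from workqueue context. */
static struct ib_cq *example_create_cq(struct ib_device *dev, void *ulp_ctx)
{
	return ib_alloc_cq(dev, ulp_ctx, 128, 0, IB_POLL_WORKQUEUE);
}

/* Fast path: the WR references an ib_cqe rather than carrying a wr_id. */
static int example_post_send(struct ib_qp *qp, struct example_request *req,
			     struct ib_sge *sge)
{
	struct ib_send_wr wr = {}, *bad_wr;

	req->cqe.done = example_send_done;
	wr.wr_cqe = &req->cqe;
	wr.sg_list = sge;
	wr.num_sge = 1;
	wr.opcode = IB_WR_SEND;
	wr.send_flags = IB_SEND_SIGNALED;

	return ib_post_send(qp, &wr, &bad_wr);
}

/* Teardown: ib_free_cq() quiesces the polling context before destroy_cq. */
static void example_destroy_cq(struct ib_cq *cq)
{
	ib_free_cq(cq);
}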
- */ -int ib_query_device(struct ib_device *device, - struct ib_device_attr *device_attr) -{ - struct ib_udata uhw = {.outlen = 0, .inlen = 0}; - - memset(device_attr, 0, sizeof(*device_attr)); - - return device->query_device(device, device_attr, &uhw); -} -EXPORT_SYMBOL(ib_query_device); - -/** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query @@ -825,26 +815,31 @@ EXPORT_SYMBOL(ib_modify_port); * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: Type of GID. * @ndev: The ndev related to the GID to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - struct net_device *ndev, u8 *port_num, u16 *index) + enum ib_gid_type gid_type, struct net_device *ndev, + u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { if (rdma_cap_roce_gid_table(device, port)) { - if (!ib_find_cached_gid_by_port(device, gid, port, + if (!ib_find_cached_gid_by_port(device, gid, gid_type, port, ndev, index)) { *port_num = port; return 0; } } + if (gid_type != IB_GID_TYPE_IB) + continue; + for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { ret = ib_query_gid(device, port, i, &tmp_gid, NULL); if (ret) @@ -954,10 +949,18 @@ static int __init ib_core_init(void) if (!ib_wq) return -ENOMEM; + ib_comp_wq = alloc_workqueue("ib-comp-wq", + WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, + WQ_UNBOUND_MAX_ACTIVE); + if (!ib_comp_wq) { + ret = -ENOMEM; + goto err; + } + ret = class_register(&ib_class); if (ret) { printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); - goto err; + goto err_comp; } ret = ibnl_init(); @@ -972,7 +975,8 @@ static int __init ib_core_init(void) err_sysfs: class_unregister(&ib_class); - +err_comp: + destroy_workqueue(ib_comp_wq); err: destroy_workqueue(ib_wq); return ret; @@ -983,6 +987,7 @@ static void __exit ib_core_cleanup(void) ib_cache_cleanup(); ibnl_cleanup(); class_unregister(&ib_class); + destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
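Since ib_register_device() now caches the attributes in device->attrs, the ib_query_device() call (and the temporary struct ib_device_attr it needed) disappears from consumers; they read the cached copy directly. A small before/after sketch with an invented capability check:

/* Old pattern, removed by this series:
 *	struct ib_device_attr attr;
 *
 *	ret = ib_query_device(dev, &attr);
 *	if (!ret && (attr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
 *		...
 */
static bool example_supports_fast_reg(struct ib_device *dev)
{
	return !!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS);
}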
*/ destroy_workqueue(ib_wq); } diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 9f5ad7cc33c8..6ac3683c144b 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -212,7 +212,6 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, { struct ib_device *device; struct ib_fmr_pool *pool; - struct ib_device_attr *attr; int i; int ret; int max_remaps; @@ -228,25 +227,10 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, return ERR_PTR(-ENOSYS); } - attr = kmalloc(sizeof *attr, GFP_KERNEL); - if (!attr) { - printk(KERN_WARNING PFX "couldn't allocate device attr struct\n"); - return ERR_PTR(-ENOMEM); - } - - ret = ib_query_device(device, attr); - if (ret) { - printk(KERN_WARNING PFX "couldn't query device: %d\n", ret); - kfree(attr); - return ERR_PTR(ret); - } - - if (!attr->max_map_per_fmr) + if (!device->attrs.max_map_per_fmr) max_remaps = IB_FMR_MAX_REMAPS; else - max_remaps = attr->max_map_per_fmr; - - kfree(attr); + max_remaps = device->attrs.max_map_per_fmr; pool = kmalloc(sizeof *pool, GFP_KERNEL); if (!pool) { diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 2281de122038..9fa5bf33f5a3 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -84,6 +84,9 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, u8 mgmt_class); static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv); +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc); +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc); /* * Returns a ib_mad_port_private structure or NULL for a device/port @@ -681,7 +684,7 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info, atomic_inc(&mad_snoop_priv->refcount); spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, + mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL, mad_recv_wc); deref_snoop_agent(mad_snoop_priv); spin_lock_irqsave(&qp_info->snoop_lock, flags); @@ -689,12 +692,11 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info, spin_unlock_irqrestore(&qp_info->snoop_lock, flags); } -static void build_smp_wc(struct ib_qp *qp, - u64 wr_id, u16 slid, u16 pkey_index, u8 port_num, - struct ib_wc *wc) +static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid, + u16 pkey_index, u8 port_num, struct ib_wc *wc) { memset(wc, 0, sizeof *wc); - wc->wr_id = wr_id; + wc->wr_cqe = cqe; wc->status = IB_WC_SUCCESS; wc->opcode = IB_WC_RECV; wc->pkey_index = pkey_index; @@ -832,7 +834,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, } build_smp_wc(mad_agent_priv->agent.qp, - send_wr->wr.wr_id, drslid, + send_wr->wr.wr_cqe, drslid, send_wr->pkey_index, send_wr->port_num, &mad_wc); @@ -1039,7 +1041,9 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey; - mad_send_wr->send_wr.wr.wr_id = (unsigned long) mad_send_wr; + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list; mad_send_wr->send_wr.wr.num_sge = 2; mad_send_wr->send_wr.wr.opcode = IB_WR_SEND; @@ -1151,8 +1155,9 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) /* Set WR ID to find mad_send_wr upon completion */ qp_info = 
mad_send_wr->mad_agent_priv->qp_info; - mad_send_wr->send_wr.wr.wr_id = (unsigned long)&mad_send_wr->mad_list; mad_send_wr->mad_list.mad_queue = &qp_info->send_queue; + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; mad_agent = mad_send_wr->send_buf.mad_agent; sge = mad_send_wr->sg_list; @@ -1982,9 +1987,9 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, /* user rmpp is in effect * and this is an active RMPP MAD */ - mad_recv_wc->wc->wr_id = 0; - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, - mad_recv_wc); + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, NULL, + mad_recv_wc); atomic_dec(&mad_agent_priv->refcount); } else { /* not user rmpp, revert to normal behavior and @@ -1998,9 +2003,10 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, spin_unlock_irqrestore(&mad_agent_priv->lock, flags); /* Defined behavior is to complete response before request */ - mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf; - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, - mad_recv_wc); + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, + &mad_send_wr->send_buf, + mad_recv_wc); atomic_dec(&mad_agent_priv->refcount); mad_send_wc.status = IB_WC_SUCCESS; @@ -2009,7 +2015,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); } } else { - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL, mad_recv_wc); deref_mad_agent(mad_agent_priv); } @@ -2172,13 +2178,14 @@ handle_smi(struct ib_mad_port_private *port_priv, return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response); } -static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_qp_info *qp_info; struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv, *response = NULL; - struct ib_mad_list_head *mad_list; struct ib_mad_agent_private *mad_agent; int port_num; int ret = IB_MAD_RESULT_SUCCESS; @@ -2186,7 +2193,17 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, u16 resp_mad_pkey_index = 0; bool opa; - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + /* + * Receive errors indicate that the QP has entered the error + * state - error handling/shutdown code will cleanup + */ + return; + } + qp_info = mad_list->mad_queue->qp_info; dequeue_mad(mad_list); @@ -2227,7 +2244,7 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, response = alloc_mad_private(mad_size, GFP_KERNEL); if (!response) { dev_err(&port_priv->device->dev, - "ib_mad_recv_done_handler no memory for response buffer\n"); + "%s: no memory for response buffer\n", __func__); goto out; } @@ -2413,11 +2430,12 @@ done: spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } -static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_mad_port_private *port_priv = cq->cq_context; + struct 
ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr; - struct ib_mad_list_head *mad_list; struct ib_mad_qp_info *qp_info; struct ib_mad_queue *send_queue; struct ib_send_wr *bad_send_wr; @@ -2425,7 +2443,14 @@ static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, unsigned long flags; int ret; - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + if (!ib_mad_send_error(port_priv, wc)) + return; + } + mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); send_queue = mad_list->mad_queue; @@ -2490,24 +2515,15 @@ static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info) spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); } -static void mad_error_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc) { - struct ib_mad_list_head *mad_list; - struct ib_mad_qp_info *qp_info; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); + struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info; struct ib_mad_send_wr_private *mad_send_wr; int ret; - /* Determine if failure was a send or receive */ - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; - qp_info = mad_list->mad_queue->qp_info; - if (mad_list->mad_queue == &qp_info->recv_queue) - /* - * Receive errors indicate that the QP has entered the error - * state - error handling/shutdown code will cleanup - */ - return; - /* * Send errors will transition the QP to SQE - move * QP to RTS and repost flushed work requests @@ -2522,10 +2538,9 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv, mad_send_wr->retry = 0; ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, &bad_send_wr); - if (ret) - ib_mad_send_done_handler(port_priv, wc); - } else - ib_mad_send_done_handler(port_priv, wc); + if (!ret) + return false; + } } else { struct ib_qp_attr *attr; @@ -2539,42 +2554,14 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv, kfree(attr); if (ret) dev_err(&port_priv->device->dev, - "mad_error_handler - ib_modify_qp to RTS : %d\n", - ret); + "%s - ib_modify_qp to RTS: %d\n", + __func__, ret); else mark_sends_for_retry(qp_info); } - ib_mad_send_done_handler(port_priv, wc); } -} -/* - * IB MAD completion callback - */ -static void ib_mad_completion_handler(struct work_struct *work) -{ - struct ib_mad_port_private *port_priv; - struct ib_wc wc; - - port_priv = container_of(work, struct ib_mad_port_private, work); - ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); - - while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) { - if (wc.status == IB_WC_SUCCESS) { - switch (wc.opcode) { - case IB_WC_SEND: - ib_mad_send_done_handler(port_priv, &wc); - break; - case IB_WC_RECV: - ib_mad_recv_done_handler(port_priv, &wc); - break; - default: - BUG_ON(1); - break; - } - } else - mad_error_handler(port_priv, &wc); - } + return true; } static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) @@ -2716,7 +2703,7 @@ static void local_completions(struct work_struct *work) * before request */ build_smp_wc(recv_mad_agent->agent.qp, - (unsigned long) local->mad_send_wr, + local->mad_send_wr->send_wr.wr.wr_cqe, be16_to_cpu(IB_LID_PERMISSIVE), local->mad_send_wr->send_wr.pkey_index, 
recv_mad_agent->agent.port_num, &wc); @@ -2744,6 +2731,7 @@ static void local_completions(struct work_struct *work) IB_MAD_SNOOP_RECVS); recv_mad_agent->agent.recv_handler( &recv_mad_agent->agent, + &local->mad_send_wr->send_buf, &local->mad_priv->header.recv_wc); spin_lock_irqsave(&recv_mad_agent->lock, flags); atomic_dec(&recv_mad_agent->refcount); @@ -2855,17 +2843,6 @@ static void timeout_sends(struct work_struct *work) spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } -static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg) -{ - struct ib_mad_port_private *port_priv = cq->cq_context; - unsigned long flags; - - spin_lock_irqsave(&ib_mad_port_list_lock, flags); - if (!list_empty(&port_priv->port_list)) - queue_work(port_priv->wq, &port_priv->work); - spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); -} - /* * Allocate receive MADs and post receive WRs for them */ @@ -2913,8 +2890,9 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, break; } mad_priv->header.mapping = sg_list.addr; - recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; mad_priv->header.mad_list.mad_queue = recv_queue; + mad_priv->header.mad_list.cqe.done = ib_mad_recv_done; + recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe; /* Post receive WR */ spin_lock_irqsave(&recv_queue->lock, flags); @@ -3151,7 +3129,6 @@ static int ib_mad_port_open(struct ib_device *device, unsigned long flags; char name[sizeof "ib_mad123"]; int has_smi; - struct ib_cq_init_attr cq_attr = {}; if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) return -EFAULT; @@ -3179,10 +3156,8 @@ static int ib_mad_port_open(struct ib_device *device, if (has_smi) cq_size *= 2; - cq_attr.cqe = cq_size; - port_priv->cq = ib_create_cq(port_priv->device, - ib_mad_thread_completion_handler, - NULL, port_priv, &cq_attr); + port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, + IB_POLL_WORKQUEUE); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); @@ -3211,7 +3186,6 @@ static int ib_mad_port_open(struct ib_device *device, ret = -ENOMEM; goto error8; } - INIT_WORK(&port_priv->work, ib_mad_completion_handler); spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_mad_port_list); @@ -3238,7 +3212,7 @@ error7: error6: ib_dealloc_pd(port_priv->pd); error4: - ib_destroy_cq(port_priv->cq); + ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); error3: @@ -3271,7 +3245,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) destroy_mad_qp(&port_priv->qp_info[1]); destroy_mad_qp(&port_priv->qp_info[0]); ib_dealloc_pd(port_priv->pd); - ib_destroy_cq(port_priv->cq); + ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); /* XXX: Handle deallocation of MAD registration tables */ diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 990698a6ab4b..28669f6419e1 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -64,6 +64,7 @@ struct ib_mad_list_head { struct list_head list; + struct ib_cqe cqe; struct ib_mad_queue *mad_queue; }; @@ -204,7 +205,6 @@ struct ib_mad_port_private { struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; struct list_head agent_list; struct workqueue_struct *wq; - struct work_struct work; struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; }; diff --git 
a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index bb6685fb08c6..250937cb9a1a 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -723,14 +723,27 @@ EXPORT_SYMBOL(ib_sa_get_mcmember_rec); int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, struct ib_ah_attr *ah_attr) { int ret; u16 gid_index; u8 p; - ret = ib_find_cached_gid(device, &rec->port_gid, - NULL, &p, &gid_index); + if (rdma_protocol_roce(device, port_num)) { + ret = ib_find_cached_gid_by_port(device, &rec->port_gid, + gid_type, port_num, + ndev, + &gid_index); + } else if (rdma_protocol_ib(device, port_num)) { + ret = ib_find_cached_gid(device, &rec->port_gid, + IB_GID_TYPE_IB, NULL, &p, + &gid_index); + } else { + ret = -EINVAL; + } + if (ret) return ret; diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 178f98482e13..06556c34606d 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -67,17 +67,53 @@ struct netdev_event_work { struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; +static const struct { + bool (*is_supported)(const struct ib_device *device, u8 port_num); + enum ib_gid_type gid_type; +} PORT_CAP_TO_GID_TYPE[] = { + {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, + {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, +}; + +#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) + +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) +{ + int i; + unsigned int ret_flags = 0; + + if (!rdma_protocol_roce(ib_dev, port)) + return 1UL << IB_GID_TYPE_IB; + + for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) + if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) + ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; + + return ret_flags; +} +EXPORT_SYMBOL(roce_gid_type_mask_support); + static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { - switch (gid_op) { - case GID_ADD: - ib_cache_gid_add(ib_dev, port, gid, gid_attr); - break; - case GID_DEL: - ib_cache_gid_del(ib_dev, port, gid, gid_attr); - break; + int i; + unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + for (i = 0; i < IB_GID_TYPE_SIZE; i++) { + if ((1UL << i) & gid_type_mask) { + gid_attr->gid_type = i; + switch (gid_op) { + case GID_ADD: + ib_cache_gid_add(ib_dev, port, + gid, gid_attr); + break; + case GID_DEL: + ib_cache_gid_del(ib_dev, port, + gid, gid_attr); + break; + } + } } } @@ -103,18 +139,6 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de return BONDING_SLAVE_STATE_NA; } -static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper) -{ - struct net_device *_upper = NULL; - struct list_head *iter; - - netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) - if (_upper == upper) - break; - - return _upper == upper; -} - #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, @@ -132,7 +156,7 @@ static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, if (!real_dev) real_dev = event_ndev; - res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) && + res = ((rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) 
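roce_gid_type_mask_support() reports the GID types a port supports as a bitmask indexed by enum ib_gid_type, which is how update_gid() above fans a single netdev event out to every supported type. A sketch of the same iteration from a hypothetical caller:

static void example_log_gid_types(struct ib_device *ib_dev, u8 port)
{
	unsigned long mask = roce_gid_type_mask_support(ib_dev, port);
	unsigned int i;

	for (i = 0; i < IB_GID_TYPE_SIZE; i++) {
		if (!(mask & (1UL << i)))
			continue;
		pr_info("%s port %u supports GID type %u\n",
			ib_dev->name, port, i);
	}
}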
|| real_dev == rdma_ndev); @@ -178,7 +202,7 @@ static int upper_device_filter(struct ib_device *ib_dev, u8 port, return 1; rcu_read_lock(); - res = is_upper_dev_rcu(rdma_ndev, event_ndev); + res = rdma_is_upper_dev_rcu(rdma_ndev, event_ndev); rcu_read_unlock(); return res; @@ -203,10 +227,12 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev, u8 port, struct net_device *event_ndev, struct net_device *rdma_ndev) { + unsigned long gid_type_mask; + rcu_read_lock(); if (!rdma_ndev || ((rdma_ndev != event_ndev && - !is_upper_dev_rcu(rdma_ndev, event_ndev)) || + !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, netdev_master_upper_dev_get_rcu(rdma_ndev)) == BONDING_SLAVE_STATE_INACTIVE)) { @@ -215,7 +241,9 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev, } rcu_read_unlock(); - ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } @@ -234,12 +262,17 @@ static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, rcu_read_lock(); - if (is_upper_dev_rcu(rdma_ndev, event_ndev) && + if (rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) && is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE) { + unsigned long gid_type_mask; + rcu_read_unlock(); + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, + gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } else { rcu_read_unlock(); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index a95a32ba596e..f334090bb612 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -49,7 +49,9 @@ #include <net/netlink.h> #include <uapi/rdma/ib_user_sa.h> #include <rdma/ib_marshall.h> +#include <rdma/ib_addr.h> #include "sa.h" +#include "core_priv.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand subnet administration query support"); @@ -715,7 +717,9 @@ static int ib_nl_handle_set_timeout(struct sk_buff *skb, struct nlattr *tb[LS_NLA_TYPE_MAX]; int ret; - if (!netlink_capable(skb, CAP_NET_ADMIN)) + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk) || + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), @@ -789,7 +793,9 @@ static int ib_nl_handle_resolve_resp(struct sk_buff *skb, int found = 0; int ret; - if (!netlink_capable(skb, CAP_NET_ADMIN)) + if ((nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk) || + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); @@ -996,7 +1002,8 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, { int ret; u16 gid_index; - int force_grh; + int use_roce; + struct net_device *ndev = NULL; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->dlid); @@ -1006,16 +1013,71 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; - force_grh = rdma_cap_eth_ah(device, port_num); + use_roce = rdma_cap_eth_ah(device, port_num); + + if (use_roce) { + struct net_device *idev; + struct net_device *resolved_dev; + struct rdma_dev_addr dev_addr = {.bound_dev_if = rec->ifindex, + .net = rec->net ? 
rec->net : + &init_net}; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + if (!device->get_netdev) + return -EOPNOTSUPP; + + rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); + rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); + + /* validate the route */ + ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, + &dgid_addr._sockaddr, &dev_addr); + if (ret) + return ret; - if (rec->hop_limit > 1 || force_grh) { - struct net_device *ndev = ib_get_ndev_from_path(rec); + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + rec->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return -EINVAL; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + resolved_dev = dev_get_by_index(dev_addr.net, + dev_addr.bound_dev_if); + if (resolved_dev->flags & IFF_LOOPBACK) { + dev_put(resolved_dev); + resolved_dev = idev; + dev_hold(resolved_dev); + } + ndev = ib_get_ndev_from_path(rec); + rcu_read_lock(); + if ((ndev && ndev != resolved_dev) || + (resolved_dev != idev && + !rdma_is_upper_dev_rcu(idev, resolved_dev))) + ret = -EHOSTUNREACH; + rcu_read_unlock(); + dev_put(idev); + dev_put(resolved_dev); + if (ret) { + if (ndev) + dev_put(ndev); + return ret; + } + } + if (rec->hop_limit > 1 || use_roce) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; - ret = ib_find_cached_gid(device, &rec->sgid, ndev, &port_num, - &gid_index); + ret = ib_find_cached_gid_by_port(device, &rec->sgid, + rec->gid_type, port_num, ndev, + &gid_index); if (ret) { if (ndev) dev_put(ndev); @@ -1029,9 +1091,10 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, if (ndev) dev_put(ndev); } - if (force_grh) { + + if (use_roce) memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); - } + return 0; } EXPORT_SYMBOL(ib_init_ah_from_path); @@ -1157,6 +1220,7 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, mad->data, &rec); rec.net = NULL; rec.ifindex = 0; + rec.gid_type = IB_GID_TYPE_IB; memset(rec.dmac, 0, ETH_ALEN); query->callback(status, &rec, query->context); } else @@ -1609,14 +1673,15 @@ static void send_handler(struct ib_mad_agent *agent, } static void recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_sa_query *query; - struct ib_mad_send_buf *mad_buf; - mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id; - query = mad_buf->context[0]; + if (!send_buf) + return; + query = send_buf->context[0]; if (query->callback) { if (mad_recv_wc->wc->status == IB_WC_SUCCESS) query->callback(query, diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index b1f37d4095fa..3de93517efe4 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -37,15 +37,27 @@ #include <linux/slab.h> #include <linux/stat.h> #include <linux/string.h> +#include <linux/netdevice.h> #include <rdma/ib_mad.h> +#include <rdma/ib_pma.h> +struct ib_port; + +struct gid_attr_group { + struct ib_port *port; + struct kobject kobj; + struct attribute_group ndev; + struct attribute_group type; +}; struct ib_port { struct kobject kobj; struct ib_device *ibdev; + struct gid_attr_group *gid_attr_group; struct attribute_group gid_group; struct attribute_group pkey_group; u8 port_num; + struct attribute_group *pma_table; }; struct port_attribute { @@ -65,6 +77,7 @@ struct port_table_attribute { struct port_attribute attr; char name[8]; int index; + __be16 attr_id; }; static 
ssize_t port_attr_show(struct kobject *kobj, @@ -84,6 +97,24 @@ static const struct sysfs_ops port_sysfs_ops = { .show = port_attr_show }; +static ssize_t gid_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct ib_port *p = container_of(kobj, struct gid_attr_group, + kobj)->port; + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static const struct sysfs_ops gid_attr_sysfs_ops = { + .show = gid_attr_show +}; + static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, char *buf) { @@ -281,6 +312,46 @@ static struct attribute *port_default_attrs[] = { NULL }; +static size_t print_ndev(struct ib_gid_attr *gid_attr, char *buf) +{ + if (!gid_attr->ndev) + return -EINVAL; + + return sprintf(buf, "%s\n", gid_attr->ndev->name); +} + +static size_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf) +{ + return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type)); +} + +static ssize_t _show_port_gid_attr(struct ib_port *p, + struct port_attribute *attr, + char *buf, + size_t (*print)(struct ib_gid_attr *gid_attr, + char *buf)) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + union ib_gid gid; + struct ib_gid_attr gid_attr = {}; + ssize_t ret; + va_list args; + + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, + &gid_attr); + if (ret) + goto err; + + ret = print(&gid_attr, buf); + +err: + if (gid_attr.ndev) + dev_put(gid_attr.ndev); + va_end(args); + return ret; +} + static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, char *buf) { @@ -296,6 +367,19 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, return sprintf(buf, "%pI6\n", gid.raw); } +static ssize_t show_port_gid_attr_ndev(struct ib_port *p, + struct port_attribute *attr, char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_ndev); +} + +static ssize_t show_port_gid_attr_gid_type(struct ib_port *p, + struct port_attribute *attr, + char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_gid_type); +} + static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, char *buf) { @@ -314,24 +398,32 @@ static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, #define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ struct port_table_attribute port_pma_attr_##_name = { \ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ - .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \ + .attr_id = IB_PMA_PORT_COUNTERS , \ } -static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, - char *buf) +#define PORT_PMA_ATTR_EXT(_name, _width, _offset) \ +struct port_table_attribute port_pma_attr_ext_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16), \ + .attr_id = IB_PMA_PORT_COUNTERS_EXT , \ +} + +/* + * Get a Perfmgmt MAD block of data. + * Returns error code or the number of bytes retrieved. 
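The new gid_attrs kobject exposes one read-only file per GID table index under ndevs/ and types/, next to the existing gids/ group. A userspace sketch that dumps the attributes of GID index 0; the device and port in the path are examples only, and the type string is whatever ib_cache_gid_type_str() returns for that entry:

#include <stdio.h>

static void print_gid_attr(const char *group)
{
	char path[128], line[64];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/class/infiniband/mlx4_0/ports/1/gid_attrs/%s/0", group);
	f = fopen(path, "r");
	if (!f)
		return;
	if (fgets(line, sizeof(line), f))
		printf("%s: %s", group, line);
	fclose(f);
}

int main(void)
{
	print_gid_attr("ndevs");	/* netdev backing GID index 0 */
	print_gid_attr("types");	/* GID type of index 0 */
	return 0;
}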
+ */ +static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr, + void *data, int offset, size_t size) { - struct port_table_attribute *tab_attr = - container_of(attr, struct port_table_attribute, attr); - int offset = tab_attr->index & 0xffff; - int width = (tab_attr->index >> 16) & 0xff; - struct ib_mad *in_mad = NULL; - struct ib_mad *out_mad = NULL; + struct ib_mad *in_mad; + struct ib_mad *out_mad; size_t mad_size = sizeof(*out_mad); u16 out_mad_pkey_index = 0; ssize_t ret; - if (!p->ibdev->process_mad) - return sprintf(buf, "N/A (no PMA)\n"); + if (!dev->process_mad) + return -ENOSYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); @@ -344,12 +436,13 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; in_mad->mad_hdr.class_version = 1; in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; - in_mad->mad_hdr.attr_id = cpu_to_be16(0x12); /* PortCounters */ + in_mad->mad_hdr.attr_id = attr; - in_mad->data[41] = p->port_num; /* PortSelect field */ + if (attr != IB_PMA_CLASS_PORT_INFO) + in_mad->data[41] = port_num; /* PortSelect field */ - if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY, - p->port_num, NULL, NULL, + if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY, + port_num, NULL, NULL, (const struct ib_mad_hdr *)in_mad, mad_size, (struct ib_mad_hdr *)out_mad, &mad_size, &out_mad_pkey_index) & @@ -358,31 +451,54 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, ret = -EINVAL; goto out; } + memcpy(data, out_mad->data + offset, size); + ret = size; +out: + kfree(in_mad); + kfree(out_mad); + return ret; +} + +static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int offset = tab_attr->index & 0xffff; + int width = (tab_attr->index >> 16) & 0xff; + ssize_t ret; + u8 data[8]; + + ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, + 40 + offset / 8, sizeof(data)); + if (ret < 0) + return sprintf(buf, "N/A (no PMA)\n"); switch (width) { case 4: - ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >> + ret = sprintf(buf, "%u\n", (*data >> (4 - (offset % 8))) & 0xf); break; case 8: - ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]); + ret = sprintf(buf, "%u\n", *data); break; case 16: ret = sprintf(buf, "%u\n", - be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8))); + be16_to_cpup((__be16 *)data)); break; case 32: ret = sprintf(buf, "%u\n", - be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); + be32_to_cpup((__be32 *)data)); + break; + case 64: + ret = sprintf(buf, "%llu\n", + be64_to_cpup((__be64 *)data)); break; + default: ret = 0; } -out: - kfree(in_mad); - kfree(out_mad); - return ret; } @@ -403,6 +519,18 @@ static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +/* + * Counters added by extended set + */ +static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64); +static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128); +static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192); +static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256); +static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320); +static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384); +static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448); +static 
PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512); + static struct attribute *pma_attrs[] = { &port_pma_attr_symbol_error.attr.attr, &port_pma_attr_link_error_recovery.attr.attr, @@ -423,11 +551,65 @@ static struct attribute *pma_attrs[] = { NULL }; +static struct attribute *pma_attrs_ext[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_xmit_packets.attr.attr, + &port_pma_attr_ext_multicast_rcv_packets.attr.attr, + &port_pma_attr_ext_multicast_xmit_packets.attr.attr, + NULL +}; + +static struct attribute *pma_attrs_noietf[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + NULL +}; + static struct attribute_group pma_group = { .name = "counters", .attrs = pma_attrs }; +static struct attribute_group pma_group_ext = { + .name = "counters", + .attrs = pma_attrs_ext +}; + +static struct attribute_group pma_group_noietf = { + .name = "counters", + .attrs = pma_attrs_noietf +}; + static void ib_port_release(struct kobject *kobj) { struct ib_port *p = container_of(kobj, struct ib_port, kobj); @@ -451,12 +633,41 @@ static void ib_port_release(struct kobject *kobj) kfree(p); } +static void ib_port_gid_attr_release(struct kobject *kobj) +{ + struct gid_attr_group *g = container_of(kobj, struct gid_attr_group, + kobj); + struct attribute *a; + int i; + + if (g->ndev.attrs) { + for (i = 0; (a = g->ndev.attrs[i]); ++i) + kfree(a); + + kfree(g->ndev.attrs); + } + + if (g->type.attrs) { + for (i = 0; (a = g->type.attrs[i]); ++i) + kfree(a); + + kfree(g->type.attrs); + } + + kfree(g); +} + static struct kobj_type port_type = { .release = ib_port_release, .sysfs_ops = &port_sysfs_ops, .default_attrs = port_default_attrs }; +static struct kobj_type gid_attr_type = { + .sysfs_ops = &gid_attr_sysfs_ops, + .release = ib_port_gid_attr_release +}; + static struct attribute ** alloc_group_attrs(ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf), @@ -500,6 +711,31 @@ err: return NULL; } +/* + * Figure out which counter table to use 
depending on + * the device capabilities. + */ +static struct attribute_group *get_counter_table(struct ib_device *dev, + int port_num) +{ + struct ib_class_port_info cpi; + + if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO, + &cpi, 40, sizeof(cpi)) >= 0) { + + if (cpi.capability_mask && IB_PMA_CLASS_CAP_EXT_WIDTH) + /* We have extended counters */ + return &pma_group_ext; + + if (cpi.capability_mask && IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF) + /* But not the IETF ones */ + return &pma_group_noietf; + } + + /* Fall back to normal counters */ + return &pma_group; +} + static int add_port(struct ib_device *device, int port_num, int (*port_callback)(struct ib_device *, u8, struct kobject *)) @@ -528,9 +764,24 @@ static int add_port(struct ib_device *device, int port_num, return ret; } - ret = sysfs_create_group(&p->kobj, &pma_group); - if (ret) + p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL); + if (!p->gid_attr_group) { + ret = -ENOMEM; goto err_put; + } + + p->gid_attr_group->port = p; + ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type, + &p->kobj, "gid_attrs"); + if (ret) { + kfree(p->gid_attr_group); + goto err_put; + } + + p->pma_table = get_counter_table(device, port_num); + ret = sysfs_create_group(&p->kobj, p->pma_table); + if (ret) + goto err_put_gid_attrs; p->gid_group.name = "gids"; p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); @@ -543,12 +794,38 @@ static int add_port(struct ib_device *device, int port_num, if (ret) goto err_free_gid; + p->gid_attr_group->ndev.name = "ndevs"; + p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev, + attr.gid_tbl_len); + if (!p->gid_attr_group->ndev.attrs) { + ret = -ENOMEM; + goto err_remove_gid; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + if (ret) + goto err_free_gid_ndev; + + p->gid_attr_group->type.name = "types"; + p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type, + attr.gid_tbl_len); + if (!p->gid_attr_group->type.attrs) { + ret = -ENOMEM; + goto err_remove_gid_ndev; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + if (ret) + goto err_free_gid_type; + p->pkey_group.name = "pkeys"; p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, attr.pkey_tbl_len); if (!p->pkey_group.attrs) { ret = -ENOMEM; - goto err_remove_gid; + goto err_remove_gid_type; } ret = sysfs_create_group(&p->kobj, &p->pkey_group); @@ -576,6 +853,28 @@ err_free_pkey: kfree(p->pkey_group.attrs); p->pkey_group.attrs = NULL; +err_remove_gid_type: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + +err_free_gid_type: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->type.attrs[i]); + + kfree(p->gid_attr_group->type.attrs); + p->gid_attr_group->type.attrs = NULL; + +err_remove_gid_ndev: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + +err_free_gid_ndev: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->ndev.attrs[i]); + + kfree(p->gid_attr_group->ndev.attrs); + p->gid_attr_group->ndev.attrs = NULL; + err_remove_gid: sysfs_remove_group(&p->kobj, &p->gid_group); @@ -587,7 +886,10 @@ err_free_gid: p->gid_group.attrs = NULL; err_remove_pma: - sysfs_remove_group(&p->kobj, &pma_group); + sysfs_remove_group(&p->kobj, p->pma_table); + +err_put_gid_attrs: + kobject_put(&p->gid_attr_group->kobj); err_put: kobject_put(&p->kobj); @@ -614,18 +916,12 @@ static ssize_t show_sys_image_guid(struct 
device *device, struct device_attribute *dev_attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); - struct ib_device_attr attr; - ssize_t ret; - - ret = ib_query_device(dev, &attr); - if (ret) - return ret; return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3])); + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); } static ssize_t show_node_guid(struct device *device, @@ -800,9 +1096,14 @@ static void free_port_list_attributes(struct ib_device *device) list_for_each_entry_safe(p, t, &device->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); list_del(&p->entry); - sysfs_remove_group(p, &pma_group); + sysfs_remove_group(p, port->pma_table); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->ndev); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->type); + kobject_put(&port->gid_attr_group->kobj); kobject_put(p); } diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 72feee620ebf..19837d270278 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -35,6 +35,7 @@ #include <linux/string.h> #include <linux/export.h> #include <linux/if_ether.h> +#include <linux/ip.h> #include <rdma/ib_pack.h> @@ -116,6 +117,72 @@ static const struct ib_field vlan_table[] = { .size_bits = 16 } }; +static const struct ib_field ip4_table[] = { + { STRUCT_FIELD(ip4, ver), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, hdr_len), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, tos), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, tot_len), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, id), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, frag_off), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, ttl), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, protocol), + .offset_words = 2, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, check), + .offset_words = 2, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, saddr), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(ip4, daddr), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 32 } +}; + +static const struct ib_field udp_table[] = { + { STRUCT_FIELD(udp, sport), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, dport), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(udp, length), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, csum), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -213,26 +280,57 @@ static const struct ib_field deth_table[] = { .size_bits = 24 } }; +__sum16 ib_ud_ip4_csum(struct 
ib_ud_header *header) +{ + struct iphdr iph; + + iph.ihl = 5; + iph.version = 4; + iph.tos = header->ip4.tos; + iph.tot_len = header->ip4.tot_len; + iph.id = header->ip4.id; + iph.frag_off = header->ip4.frag_off; + iph.ttl = header->ip4.ttl; + iph.protocol = header->ip4.protocol; + iph.check = 0; + iph.saddr = header->ip4.saddr; + iph.daddr = header->ip4.daddr; + + return ip_fast_csum((u8 *)&iph, iph.ihl); +} +EXPORT_SYMBOL(ib_ud_ip4_csum); + /** * ib_ud_header_init - Initialize UD header structure * @payload_bytes:Length of packet payload * @lrh_present: specify if LRH is present * @eth_present: specify if Eth header is present * @vlan_present: packet is tagged vlan - * @grh_present:GRH flag (if non-zero, GRH will be included) + * @grh_present: GRH flag (if non-zero, GRH will be included) + * @ip_version: if non-zero, IP header, V4 or V6, will be included + * @udp_present :if non-zero, UDP header will be included * @immediate_present: specify if immediate data is present * @header:Structure to initialize */ -void ib_ud_header_init(int payload_bytes, - int lrh_present, - int eth_present, - int vlan_present, - int grh_present, - int immediate_present, - struct ib_ud_header *header) +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header) { + grh_present = grh_present && !ip_version; memset(header, 0, sizeof *header); + /* + * UDP header without IP header doesn't make sense + */ + if (udp_present && ip_version != 4 && ip_version != 6) + return -EINVAL; + if (lrh_present) { u16 packet_length; @@ -252,7 +350,7 @@ void ib_ud_header_init(int payload_bytes, if (vlan_present) header->eth.type = cpu_to_be16(ETH_P_8021Q); - if (grh_present) { + if (ip_version == 6 || grh_present) { header->grh.ip_version = 6; header->grh.payload_length = cpu_to_be16((IB_BTH_BYTES + @@ -260,8 +358,30 @@ void ib_ud_header_init(int payload_bytes, payload_bytes + 4 + /* ICRC */ 3) & ~3); /* round up */ - header->grh.next_header = 0x1b; + header->grh.next_header = udp_present ? IPPROTO_UDP : 0x1b; + } + + if (ip_version == 4) { + int udp_bytes = udp_present ? 
IB_UDP_BYTES : 0; + + header->ip4.ver = 4; /* version 4 */ + header->ip4.hdr_len = 5; /* 5 words */ + header->ip4.tot_len = + cpu_to_be16(IB_IP4_BYTES + + udp_bytes + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ + header->ip4.protocol = IPPROTO_UDP; } + if (udp_present && ip_version) + header->udp.length = + cpu_to_be16(IB_UDP_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ if (immediate_present) header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; @@ -273,8 +393,11 @@ void ib_ud_header_init(int payload_bytes, header->lrh_present = lrh_present; header->eth_present = eth_present; header->vlan_present = vlan_present; - header->grh_present = grh_present; + header->grh_present = grh_present || (ip_version == 6); + header->ipv4_present = ip_version == 4; + header->udp_present = udp_present; header->immediate_present = immediate_present; + return 0; } EXPORT_SYMBOL(ib_ud_header_init); @@ -311,6 +434,16 @@ int ib_ud_header_pack(struct ib_ud_header *header, &header->grh, buf + len); len += IB_GRH_BYTES; } + if (header->ipv4_present) { + ib_pack(ip4_table, ARRAY_SIZE(ip4_table), + &header->ip4, buf + len); + len += IB_IP4_BYTES; + } + if (header->udp_present) { + ib_pack(udp_table, ARRAY_SIZE(udp_table), + &header->udp, buf + len); + len += IB_UDP_BYTES; + } ib_pack(bth_table, ARRAY_SIZE(bth_table), &header->bth, buf + len); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 40becdb3196e..e69bf266049d 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -232,7 +232,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, ib_ucontext_notifier_end_account(context); } -static struct mmu_notifier_ops ib_umem_notifiers = { +static const struct mmu_notifier_ops ib_umem_notifiers = { .release = ib_umem_notifier_release, .invalidate_page = ib_umem_notifier_invalidate_page, .invalidate_range_start = ib_umem_notifier_invalidate_range_start, diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 57f281f8d686..415a3185cde7 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -210,6 +210,7 @@ static void send_handler(struct ib_mad_agent *agent, } static void recv_handler(struct ib_mad_agent *agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_umad_file *file = agent->context; diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 94bbd8c155fc..612ccfd39bf9 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -204,6 +204,8 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +int uverbs_dealloc_mw(struct ib_mw *mw); + struct ib_uverbs_flow_spec { union { union { diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 1c02deab068f..6ffc9c4e93af 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -291,9 +291,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, struct ib_uverbs_get_context cmd; struct ib_uverbs_get_context_resp resp; struct ib_udata udata; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - struct ib_device_attr dev_attr; -#endif struct ib_ucontext *ucontext; struct file *filp; int ret; @@ -342,10 +339,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file 
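ib_ud_header_init() now takes ip_version and udp_present and returns an error, so providers that build raw UD headers (the RoCE v2 case) have to pass the extra arguments and check the result. A hedged sketch of preparing an Ethernet + IPv4 + UDP header; the function name and the surrounding driver context are hypothetical:

#include <rdma/ib_pack.h>

static int example_build_rocev2_v4_header(int payload_bytes,
					  struct ib_ud_header *hdr)
{
	int ret;

	ret = ib_ud_header_init(payload_bytes,
				0,	/* lrh_present */
				1,	/* eth_present */
				0,	/* vlan_present */
				0,	/* grh_present */
				4,	/* ip_version */
				1,	/* udp_present */
				0,	/* immediate_present */
				hdr);
	if (ret)
		return ret;

	/* The caller fills hdr->ip4.saddr/daddr, hdr->udp ports and the BTH
	 * fields, then computes the IPv4 checksum with the new helper.
	 */
	hdr->ip4.check = ib_ud_ip4_csum(hdr);
	return 0;
}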
*file, ucontext->odp_mrs_count = 0; INIT_LIST_HEAD(&ucontext->no_private_counters); - ret = ib_query_device(ib_dev, &dev_attr); - if (ret) - goto err_free; - if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; #endif @@ -447,8 +441,6 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, { struct ib_uverbs_query_device cmd; struct ib_uverbs_query_device_resp resp; - struct ib_device_attr attr; - int ret; if (out_len < sizeof resp) return -ENOSPC; @@ -456,12 +448,8 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = ib_query_device(ib_dev, &attr); - if (ret) - return ret; - memset(&resp, 0, sizeof resp); - copy_query_dev_fields(file, ib_dev, &resp, &attr); + copy_query_dev_fields(file, ib_dev, &resp, &ib_dev->attrs); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -986,11 +974,8 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, } if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { - struct ib_device_attr attr; - - ret = ib_query_device(pd->device, &attr); - if (ret || !(attr.device_cap_flags & - IB_DEVICE_ON_DEMAND_PAGING)) { + if (!(pd->device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { pr_debug("ODP support not available\n"); ret = -EINVAL; goto err_put; @@ -1008,7 +993,6 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, mr->pd = pd; mr->uobject = uobj; atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); uobj->object = mr; ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); @@ -1106,11 +1090,6 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, } } - if (atomic_read(&mr->usecnt)) { - ret = -EBUSY; - goto put_uobj_pd; - } - old_pd = mr->pd; ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start, cmd.length, cmd.hca_va, @@ -1258,7 +1237,7 @@ err_copy: idr_remove_uobj(&ib_uverbs_mw_idr, uobj); err_unalloc: - ib_dealloc_mw(mw); + uverbs_dealloc_mw(mw); err_put: put_pd_read(pd); @@ -1287,7 +1266,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, mw = uobj->object; - ret = ib_dealloc_mw(mw); + ret = uverbs_dealloc_mw(mw); if (!ret) uobj->live = 0; @@ -1845,7 +1824,10 @@ static int create_qp(struct ib_uverbs_file *file, sizeof(cmd->create_flags)) attr.create_flags = cmd->create_flags; - if (attr.create_flags & ~IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV)) { ret = -EINVAL; goto err_put; } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index e3ef28861be6..39680aed99dd 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -133,6 +133,17 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); +int uverbs_dealloc_mw(struct ib_mw *mw) +{ + struct ib_pd *pd = mw->pd; + int ret; + + ret = mw->device->dealloc_mw(mw); + if (!ret) + atomic_dec(&pd->usecnt); + return ret; +} + static void ib_uverbs_release_dev(struct kobject *kobj) { struct ib_uverbs_device *dev = @@ -224,7 +235,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, struct ib_mw *mw = uobj->object; idr_remove_uobj(&ib_uverbs_mw_idr, uobj); - ib_dealloc_mw(mw); + 
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index e3ef28861be6..39680aed99dd 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -133,6 +133,17 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
 static void ib_uverbs_add_one(struct ib_device *device);
 static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
 
+int uverbs_dealloc_mw(struct ib_mw *mw)
+{
+	struct ib_pd *pd = mw->pd;
+	int ret;
+
+	ret = mw->device->dealloc_mw(mw);
+	if (!ret)
+		atomic_dec(&pd->usecnt);
+	return ret;
+}
+
 static void ib_uverbs_release_dev(struct kobject *kobj)
 {
 	struct ib_uverbs_device *dev =
@@ -224,7 +235,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
 		struct ib_mw *mw = uobj->object;
 
 		idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
-		ib_dealloc_mw(mw);
+		uverbs_dealloc_mw(mw);
 		kfree(uobj);
 	}
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
index 7d2f14c9bbef..af020f80d50f 100644
--- a/drivers/infiniband/core/uverbs_marshall.c
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -144,5 +144,6 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
 	memset(dst->dmac, 0, sizeof(dst->dmac));
 	dst->net = NULL;
 	dst->ifindex = 0;
+	dst->gid_type = IB_GID_TYPE_IB;
 }
 EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 545906dec26d..5af6d024e053 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -229,12 +229,6 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
 struct ib_pd *ib_alloc_pd(struct ib_device *device)
 {
 	struct ib_pd *pd;
-	struct ib_device_attr devattr;
-	int rc;
-
-	rc = ib_query_device(device, &devattr);
-	if (rc)
-		return ERR_PTR(rc);
 
 	pd = device->alloc_pd(device, NULL, NULL);
 	if (IS_ERR(pd))
@@ -245,7 +239,7 @@ struct ib_pd *ib_alloc_pd(struct ib_device *device)
 	pd->local_mr = NULL;
 	atomic_set(&pd->usecnt, 0);
 
-	if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+	if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
 		pd->local_dma_lkey = device->local_dma_lkey;
 	else {
 		struct ib_mr *mr;
@@ -311,8 +305,61 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
 }
 EXPORT_SYMBOL(ib_create_ah);
 
+static int ib_get_header_version(const union rdma_network_hdr *hdr)
+{
+	const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh;
+	struct iphdr ip4h_checked;
+	const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh;
+
+	/* If it's IPv6, the version must be 6, otherwise, the first
+	 * 20 bytes (before the IPv4 header) are garbled.
+	 */
+	if (ip6h->version != 6)
+		return (ip4h->version == 4) ? 4 : 0;
+	/* version may be 6 or 4 because the first 20 bytes could be garbled */
+
+	/* RoCE v2 requires no options, thus header length
+	 * must be 5 words
+	 */
+	if (ip4h->ihl != 5)
+		return 6;
+
+	/* Verify checksum.
+	 * We can't write on scattered buffers so we need to copy to
+	 * temp buffer.
+	 */
+	memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked));
+	ip4h_checked.check = 0;
+	ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5);
+	/* if IPv4 header checksum is OK, believe it */
+	if (ip4h->check == ip4h_checked.check)
+		return 4;
+	return 6;
+}
+
+static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device,
+						     u8 port_num,
+						     const struct ib_grh *grh)
+{
+	int grh_version;
+
+	if (rdma_protocol_ib(device, port_num))
+		return RDMA_NETWORK_IB;
+
+	grh_version = ib_get_header_version((union rdma_network_hdr *)grh);
+
+	if (grh_version == 4)
+		return RDMA_NETWORK_IPV4;
+
+	if (grh->next_hdr == IPPROTO_UDP)
+		return RDMA_NETWORK_IPV6;
+
+	return RDMA_NETWORK_ROCE_V1;
+}
+
 struct find_gid_index_context {
 	u16 vlan_id;
+	enum ib_gid_type gid_type;
 };
 
 static bool find_gid_index(const union ib_gid *gid,
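ib_get_header_version() above tells a RoCE v2 IPv4 GRH from an IPv6 one partly by recomputing the IPv4 header checksum and believing it when it matches. A userspace sketch of that check, with a plain RFC 1071 one's-complement sum standing in for ip_fast_csum(); the sample header bytes are made up for the demonstration:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* One's-complement sum over a small buffer (RFC 1071). */
static uint16_t ip_checksum(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)p[i] << 8 | p[i + 1];
	if (len & 1)
		sum += (uint32_t)p[len - 1] << 8;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* Minimal IPv4 header: version 4, ihl 5, total length 40, TTL 64,
	 * protocol UDP (17), 192.0.2.1 -> 192.0.2.2, checksum field zeroed.
	 */
	uint8_t hdr[20] = {
		0x45, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x00,
		0x40, 0x11, 0x00, 0x00, 0xc0, 0x00, 0x02, 0x01,
		0xc0, 0x00, 0x02, 0x02,
	};
	uint16_t csum = ip_checksum(hdr, sizeof(hdr));

	/* Store the checksum and verify the header now sums to zero, i.e.
	 * the "believe the checksum" test would classify it as IPv4.
	 */
	hdr[10] = csum >> 8;
	hdr[11] = csum & 0xff;
	printf("stored csum 0x%04x, re-check %s\n", csum,
	       ip_checksum(hdr, sizeof(hdr)) == 0 ? "ok" : "bad");
	return 0;
}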
@@ -322,6 +369,9 @@ static bool find_gid_index(const union ib_gid *gid,
 	struct find_gid_index_context *ctx =
 		(struct find_gid_index_context *)context;
 
+	if (ctx->gid_type != gid_attr->gid_type)
+		return false;
+
 	if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) ||
 	    (is_vlan_dev(gid_attr->ndev) &&
 	     vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id))
@@ -332,14 +382,49 @@ static bool find_gid_index(const union ib_gid *gid,
 static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num,
 				   u16 vlan_id, const union ib_gid *sgid,
+				   enum ib_gid_type gid_type,
 				   u16 *gid_index)
 {
-	struct find_gid_index_context context = {.vlan_id = vlan_id};
+	struct find_gid_index_context context = {.vlan_id = vlan_id,
+						 .gid_type = gid_type};
 
 	return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index,
 				     &context, gid_index);
 }
 
+static int get_gids_from_rdma_hdr(union rdma_network_hdr *hdr,
+				  enum rdma_network_type net_type,
+				  union ib_gid *sgid, union ib_gid *dgid)
+{
+	struct sockaddr_in src_in;
+	struct sockaddr_in dst_in;
+	__be32 src_saddr, dst_saddr;
+
+	if (!sgid || !dgid)
+		return -EINVAL;
+
+	if (net_type == RDMA_NETWORK_IPV4) {
+		memcpy(&src_in.sin_addr.s_addr,
+		       &hdr->roce4grh.saddr, 4);
+		memcpy(&dst_in.sin_addr.s_addr,
+		       &hdr->roce4grh.daddr, 4);
+		src_saddr = src_in.sin_addr.s_addr;
+		dst_saddr = dst_in.sin_addr.s_addr;
+		ipv6_addr_set_v4mapped(src_saddr,
+				       (struct in6_addr *)sgid);
+		ipv6_addr_set_v4mapped(dst_saddr,
+				       (struct in6_addr *)dgid);
+		return 0;
+	} else if (net_type == RDMA_NETWORK_IPV6 ||
+		   net_type == RDMA_NETWORK_IB) {
+		*dgid = hdr->ibgrh.dgid;
+		*sgid = hdr->ibgrh.sgid;
+		return 0;
+	} else {
+		return -EINVAL;
+	}
+}
+
 int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
 		       const struct ib_wc *wc, const struct ib_grh *grh,
 		       struct ib_ah_attr *ah_attr)
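get_gids_from_rdma_hdr() above turns RoCE v2 IPv4 source and destination addresses into GIDs by embedding them as IPv4-mapped IPv6 addresses. A small userspace sketch of that mapping, with a local helper taking the place of ipv6_addr_set_v4mapped():

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* Build the ::ffff:a.b.c.d form that a RoCE v2 IPv4 GID uses. */
static void set_v4mapped(uint32_t addr_be, uint8_t gid[16])
{
	memset(gid, 0, 10);
	gid[10] = 0xff;
	gid[11] = 0xff;
	memcpy(&gid[12], &addr_be, 4);
}

int main(void)
{
	struct in_addr v4;
	uint8_t gid[16];
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.1", &v4);
	set_v4mapped(v4.s_addr, gid);
	inet_ntop(AF_INET6, gid, buf, sizeof(buf));
	printf("GID for 192.0.2.1 is %s\n", buf);	/* ::ffff:192.0.2.1 */
	return 0;
}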
@@ -347,33 +432,72 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
 	u32 flow_class;
 	u16 gid_index;
 	int ret;
+	enum rdma_network_type net_type = RDMA_NETWORK_IB;
+	enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+	int hoplimit = 0xff;
+	union ib_gid dgid;
+	union ib_gid sgid;
 
 	memset(ah_attr, 0, sizeof *ah_attr);
 	if (rdma_cap_eth_ah(device, port_num)) {
+		if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE)
+			net_type = wc->network_hdr_type;
+		else
+			net_type = ib_get_net_type_by_grh(device, port_num, grh);
+		gid_type = ib_network_to_gid_type(net_type);
+	}
+	ret = get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type,
+				     &sgid, &dgid);
+	if (ret)
+		return ret;
+
+	if (rdma_protocol_roce(device, port_num)) {
+		int if_index = 0;
 		u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ?
 				wc->vlan_id : 0xffff;
+		struct net_device *idev;
+		struct net_device *resolved_dev;
 
 		if (!(wc->wc_flags & IB_WC_GRH))
 			return -EPROTOTYPE;
 
-		if (!(wc->wc_flags & IB_WC_WITH_SMAC) ||
-		    !(wc->wc_flags & IB_WC_WITH_VLAN)) {
-			ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
-							 ah_attr->dmac,
-							 wc->wc_flags & IB_WC_WITH_VLAN ?
-							 NULL : &vlan_id,
-							 0);
-			if (ret)
-				return ret;
+		if (!device->get_netdev)
+			return -EOPNOTSUPP;
+
+		idev = device->get_netdev(device, port_num);
+		if (!idev)
+			return -ENODEV;
+
+		ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid,
+						   ah_attr->dmac,
+						   wc->wc_flags & IB_WC_WITH_VLAN ?
+						   NULL : &vlan_id,
+						   &if_index, &hoplimit);
+		if (ret) {
+			dev_put(idev);
+			return ret;
 		}
 
-		ret = get_sgid_index_from_eth(device, port_num, vlan_id,
-					      &grh->dgid, &gid_index);
+		resolved_dev = dev_get_by_index(&init_net, if_index);
+		if (resolved_dev->flags & IFF_LOOPBACK) {
+			dev_put(resolved_dev);
+			resolved_dev = idev;
+			dev_hold(resolved_dev);
+		}
+		rcu_read_lock();
+		if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev,
+								   resolved_dev))
+			ret = -EHOSTUNREACH;
+		rcu_read_unlock();
+		dev_put(idev);
+		dev_put(resolved_dev);
 		if (ret)
 			return ret;
 
-		if (wc->wc_flags & IB_WC_WITH_SMAC)
-			memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
+		ret = get_sgid_index_from_eth(device, port_num, vlan_id,
+					      &dgid, gid_type, &gid_index);
+		if (ret)
+			return ret;
 	}
 
 	ah_attr->dlid = wc->slid;
@@ -383,10 +507,11 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
 
 	if (wc->wc_flags & IB_WC_GRH) {
 		ah_attr->ah_flags = IB_AH_GRH;
-		ah_attr->grh.dgid = grh->sgid;
+		ah_attr->grh.dgid = sgid;
 
 		if (!rdma_cap_eth_ah(device, port_num)) {
-			ret = ib_find_cached_gid_by_port(device, &grh->dgid,
+			ret = ib_find_cached_gid_by_port(device, &dgid,
+							 IB_GID_TYPE_IB,
 							 port_num, NULL,
 							 &gid_index);
 			if (ret)
@@ -396,7 +521,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num,
 		ah_attr->grh.sgid_index = (u8) gid_index;
 		flow_class = be32_to_cpu(grh->version_tclass_flow);
 		ah_attr->grh.flow_label = flow_class & 0xFFFFF;
-		ah_attr->grh.hop_limit = 0xFF;
+		ah_attr->grh.hop_limit = hoplimit;
 		ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
 	}
 	return 0;
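The ib_init_ah_from_wc() hunks above still derive the flow label and traffic class from the GRH's first word, while the hop limit now comes from route resolution instead of a fixed 0xFF. The bit layout of that word in isolation (host byte order assumed, i.e. after be32_to_cpu(); the sample value is arbitrary):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* First 32-bit word of a GRH/IPv6 header:
	 * version(4 bits) | traffic class(8 bits) | flow label(20 bits).
	 */
	uint32_t version_tclass_flow = (6u << 28) | (0x1cu << 20) | 0x12345u;

	uint32_t flow_label    = version_tclass_flow & 0xFFFFF;
	uint32_t traffic_class = (version_tclass_flow >> 20) & 0xFF;
	uint32_t version       = version_tclass_flow >> 28;

	printf("version %u, tclass 0x%02x, flow label 0x%05x\n",
	       version, traffic_class, flow_label);
	return 0;
}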
@@ -1014,6 +1139,7 @@ int ib_resolve_eth_dmac(struct ib_qp *qp,
 		union ib_gid		sgid;
 		struct ib_gid_attr	sgid_attr;
 		int			ifindex;
+		int			hop_limit;
 
 		ret = ib_query_gid(qp->device,
 				   qp_attr->ah_attr.port_num,
@@ -1028,12 +1154,14 @@ int ib_resolve_eth_dmac(struct ib_qp *qp,
 
 		ifindex = sgid_attr.ndev->ifindex;
 
-		ret = rdma_addr_find_dmac_by_grh(&sgid,
-						 &qp_attr->ah_attr.grh.dgid,
-						 qp_attr->ah_attr.dmac,
-						 NULL, ifindex);
+		ret = rdma_addr_find_l2_eth_by_grh(&sgid,
+						   &qp_attr->ah_attr.grh.dgid,
+						   qp_attr->ah_attr.dmac,
+						   NULL, &ifindex, &hop_limit);
 
 		dev_put(sgid_attr.ndev);
+
+		qp_attr->ah_attr.grh.hop_limit = hop_limit;
 	}
 }
 out:
@@ -1215,29 +1343,17 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
 		mr->pd      = pd;
 		mr->uobject = NULL;
 		atomic_inc(&pd->usecnt);
-		atomic_set(&mr->usecnt, 0);
 	}
 
 	return mr;
 }
 EXPORT_SYMBOL(ib_get_dma_mr);
 
-int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
-{
-	return mr->device->query_mr ?
-		mr->device->query_mr(mr, mr_attr) : -ENOSYS;
-}
-EXPORT_SYMBOL(ib_query_mr);
-
 int ib_dereg_mr(struct ib_mr *mr)
 {
-	struct ib_pd *pd;
+	struct ib_pd *pd = mr->pd;
 	int ret;
 
-	if (atomic_read(&mr->usecnt))
-		return -EBUSY;
-
-	pd = mr->pd;
 	ret = mr->device->dereg_mr(mr);
 	if (!ret)
 		atomic_dec(&pd->usecnt);
@@ -1273,49 +1389,12 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
 		mr->pd      = pd;
 		mr->uobject = NULL;
 		atomic_inc(&pd->usecnt);
-		atomic_set(&mr->usecnt, 0);
 	}
 
 	return mr;
 }
 EXPORT_SYMBOL(ib_alloc_mr);
 
-/* Memory windows */
-
-struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
-{
-	struct ib_mw *mw;
-
-	if (!pd->device->alloc_mw)
-		return ERR_PTR(-ENOSYS);
-
-	mw = pd->device->alloc_mw(pd, type);
-	if (!IS_ERR(mw)) {
-		mw->device  = pd->device;
-		mw->pd      = pd;
-		mw->uobject = NULL;
-		mw->type    = type;
-		atomic_inc(&pd->usecnt);
-	}
-
-	return mw;
-}
-EXPORT_SYMBOL(ib_alloc_mw);
-
-int ib_dealloc_mw(struct ib_mw *mw)
-{
-	struct ib_pd *pd;
-	int ret;
-
-	pd = mw->pd;
-	ret = mw->device->dealloc_mw(mw);
-	if (!ret)
-		atomic_dec(&pd->usecnt);
-
-	return ret;
-}
-EXPORT_SYMBOL(ib_dealloc_mw);
-
 /* "Fast" memory regions */
 
 struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
@@ -1530,7 +1609,7 @@ int ib_sg_to_pages(struct ib_mr *mr,
 		  int (*set_page)(struct ib_mr *, u64))
 {
 	struct scatterlist *sg;
-	u64 last_end_dma_addr = 0, last_page_addr = 0;
+	u64 last_end_dma_addr = 0;
 	unsigned int last_page_off = 0;
 	u64 page_mask = ~((u64)mr->page_size - 1);
 	int i, ret;
@@ -1572,7 +1651,6 @@ next_page:
 		mr->length += dma_len;
 
 		last_end_dma_addr = end_dma_addr;
-		last_page_addr = end_dma_addr & page_mask;
 		last_page_off = end_dma_addr & ~page_mask;
 	}
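The ib_sg_to_pages() hunks above drop the unused last_page_addr but keep the page_mask arithmetic. How a DMA address splits under that mask into a page-aligned part and an in-page offset, assuming a 4 KiB MR page size for illustration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t page_size = 4096;	/* assumed mr->page_size */
	uint64_t page_mask = ~(page_size - 1);
	uint64_t dma_addr = 0x12345678ULL;

	uint64_t page_addr = dma_addr & page_mask;	/* 0x12345000 */
	uint64_t page_off  = dma_addr & ~page_mask;	/* 0x678 */

	printf("addr 0x%llx -> page 0x%llx + off 0x%llx\n",
	       (unsigned long long)dma_addr,
	       (unsigned long long)page_addr,
	       (unsigned long long)page_off);
	return 0;
}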