author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-26 07:38:19 -0700
---|---|---
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-26 07:38:19 -0700
commit | da19a102ce87bf3e0a7fe277a659d1fc35330d6d (patch)
tree | a6c1d40ef544e812b31f4b5f497c20d449d45ec3 /drivers/infiniband/core
parent | e5f6d9afa3415104e402cd69288bb03f7165eeba (diff)
parent | a60109dc9a954ef9eddba6577e2d2e9e7952e487 (diff)
download | linux-da19a102ce87bf3e0a7fe277a659d1fc35330d6d.tar.bz2
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"This has been a smaller cycle with many of the commits being smallish
code fixes and improvements across the drivers.
- Driver updates for bnxt_re, cxgb4, hfi1, hns, mlx5, nes, qedr, and
rxe
- Memory window support in hns
- mlx5 user API 'flow mutate/steering' allows accessing the full
packet mangling and matching machinery from user space
- Support inter-working with verbs API calls in the 'devx' mlx5 user
API, and provide options to use devx with less privilege
- Modernize the use of sysfs and the device interface to use attribute
groups and cdev properly for uverbs, and clean up some of the core
code's device list management
- More progress on net namespaces for RDMA devices
- Consolidate driver BAR mmapping support into core code helpers and
rework how RDMA holds pointers to mm_struct for get_user_pages
cases
- First pass to use 'dev_name' instead of ib_device->name (see the
sketch just after this message)
- Device renaming for RDMA devices"
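To make the 'dev_name' item concrete, here is a minimal sketch of the conversion pattern (the helper my_handle_err() and its context are hypothetical; the dev_warn()/dev_name() calls mirror the hunks in the diff below):

```c
#include <rdma/ib_verbs.h>

/* Hypothetical helper illustrating the dev_name() conversion: messages
 * go through the struct device embedded in ib_device instead of reading
 * the ->name field directly.
 */
static void my_handle_err(struct ib_device *ibdev, int err)
{
	/* old style: pr_warn("Device %s failed: %d\n", ibdev->name, err); */
	dev_warn(&ibdev->dev, "operation failed: %d\n", err);

	/* where a plain string is still needed, dev_name() replaces ->name */
	pr_debug("%s: err=%d\n", dev_name(&ibdev->dev), err);
}
```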
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (242 commits)
IB/mlx5: Add support for extended atomic operations
RDMA/core: Fix comment for hw stats init for port == 0
RDMA/core: Refactor ib_register_device() function
RDMA/core: Fix unwinding flow in case of error to register device
ib_srp: Remove WARN_ON in srp_terminate_io()
IB/mlx5: Allow scatter to CQE without global signaled WRs
IB/mlx5: Verify that driver supports user flags
IB/mlx5: Support scatter to CQE for DC transport type
RDMA/drivers: Use core provided API for registering device attributes
RDMA/core: Allow existing drivers to set one sysfs group per device
IB/rxe: Remove unnecessary enum values
RDMA/umad: Use kernel API to allocate umad indexes
RDMA/uverbs: Use kernel API to allocate uverbs indexes
RDMA/core: Increase total number of RDMA ports across all devices
IB/mlx4: Add port and TID to MAD debug print
IB/mlx4: Enable debug print of SMPs
RDMA/core: Rename ports_parent to ports_kobj
RDMA/core: Do not expose unsupported counters
IB/mlx4: Refer to the device kobject instead of ports_parent
RDMA/nldev: Allow IB device rename through RDMA netlink
...
Diffstat (limited to 'drivers/infiniband/core')
31 files changed, 1678 insertions, 1093 deletions
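One user-visible piece of the ib_register_device() refactor in device.c below is that the function now takes the device name, so printf-style templates such as "foo%d" are expanded inside the core rather than by the driver. A minimal caller sketch under that assumption (the mydrv name and its registration helper are illustrative, not from this patch):

```c
/* Sketch of a driver registering with the reworked API from the
 * device.c hunks below; "mydrv%d" is an illustrative template that
 * the core expands via alloc_name()/dev_set_name().
 */
static int mydrv_register(struct ib_device *ibdev)
{
	int ret;

	ret = ib_register_device(ibdev, "mydrv%d", NULL /* port_callback */);
	if (ret)
		return ret; /* core unwinds its partial setup on error */

	return 0;
}
```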
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 46b855a42884..0dce94e3c495 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -45,6 +45,7 @@ #include <net/addrconf.h> #include <net/ip6_route.h> #include <rdma/ib_addr.h> +#include <rdma/ib_sa.h> #include <rdma/ib.h> #include <rdma/rdma_netlink.h> #include <net/netlink.h> @@ -61,6 +62,7 @@ struct addr_req { struct rdma_dev_addr *addr, void *context); unsigned long timeout; struct delayed_work work; + bool resolve_by_gid_attr; /* Consider gid attr in resolve phase */ int status; u32 seq; }; @@ -219,60 +221,75 @@ int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr) } EXPORT_SYMBOL(rdma_addr_size_kss); -void rdma_copy_addr(struct rdma_dev_addr *dev_addr, - const struct net_device *dev, - const unsigned char *dst_dev_addr) +/** + * rdma_copy_src_l2_addr - Copy netdevice source addresses + * @dev_addr: Destination address pointer where to copy the addresses + * @dev: Netdevice whose source addresses to copy + * + * rdma_copy_src_l2_addr() copies source addresses from the specified netdevice. + * This includes unicast address, broadcast address, device type and + * interface index. + */ +void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev) { dev_addr->dev_type = dev->type; memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); - if (dst_dev_addr) - memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); dev_addr->bound_dev_if = dev->ifindex; } -EXPORT_SYMBOL(rdma_copy_addr); +EXPORT_SYMBOL(rdma_copy_src_l2_addr); -int rdma_translate_ip(const struct sockaddr *addr, - struct rdma_dev_addr *dev_addr) +static struct net_device * +rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in) { - struct net_device *dev; + struct net_device *dev = NULL; + int ret = -EADDRNOTAVAIL; - if (dev_addr->bound_dev_if) { - dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); - if (!dev) - return -ENODEV; - rdma_copy_addr(dev_addr, dev, NULL); - dev_put(dev); - return 0; - } - - switch (addr->sa_family) { + switch (src_in->sa_family) { case AF_INET: - dev = ip_dev_find(dev_addr->net, - ((const struct sockaddr_in *)addr)->sin_addr.s_addr); - - if (!dev) - return -EADDRNOTAVAIL; - - rdma_copy_addr(dev_addr, dev, NULL); - dev_put(dev); + dev = __ip_dev_find(net, + ((const struct sockaddr_in *)src_in)->sin_addr.s_addr, + false); + if (dev) + ret = 0; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - rcu_read_lock(); - for_each_netdev_rcu(dev_addr->net, dev) { - if (ipv6_chk_addr(dev_addr->net, - &((const struct sockaddr_in6 *)addr)->sin6_addr, + for_each_netdev_rcu(net, dev) { + if (ipv6_chk_addr(net, + &((const struct sockaddr_in6 *)src_in)->sin6_addr, dev, 1)) { - rdma_copy_addr(dev_addr, dev, NULL); + ret = 0; break; } } - rcu_read_unlock(); break; #endif } - return 0; + return ret ? 
ERR_PTR(ret) : dev; +} + +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr) +{ + struct net_device *dev; + + if (dev_addr->bound_dev_if) { + dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + if (!dev) + return -ENODEV; + rdma_copy_src_l2_addr(dev_addr, dev); + dev_put(dev); + return 0; + } + + rcu_read_lock(); + dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr); + if (!IS_ERR(dev)) + rdma_copy_src_l2_addr(dev_addr, dev); + rcu_read_unlock(); + return PTR_ERR_OR_ZERO(dev); } EXPORT_SYMBOL(rdma_translate_ip); @@ -295,15 +312,12 @@ static void queue_req(struct addr_req *req) spin_unlock_bh(&lock); } -static int ib_nl_fetch_ha(const struct dst_entry *dst, - struct rdma_dev_addr *dev_addr, +static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { - if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) + if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; - /* We fill in what we can, the response will fill the rest */ - rdma_copy_addr(dev_addr, dst->dev, NULL); return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); } @@ -322,7 +336,7 @@ static int dst_fetch_ha(const struct dst_entry *dst, neigh_event_send(n, NULL); ret = -ENODATA; } else { - rdma_copy_addr(dev_addr, dst->dev, n->ha); + memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN); } neigh_release(n); @@ -356,18 +370,22 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, (const void *)&dst_in6->sin6_addr; sa_family_t family = dst_in->sa_family; - /* Gateway + ARPHRD_INFINIBAND -> IB router */ - if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND) - return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family); + /* If we have a gateway in IB mode then it must be an IB network */ + if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB) + return ib_nl_fetch_ha(dev_addr, daddr, seq, family); else return dst_fetch_ha(dst, dev_addr, daddr); } -static int addr4_resolve(struct sockaddr_in *src_in, - const struct sockaddr_in *dst_in, +static int addr4_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct rtable **prt) { + struct sockaddr_in *src_in = (struct sockaddr_in *)src_sock; + const struct sockaddr_in *dst_in = + (const struct sockaddr_in *)dst_sock; + __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; struct rtable *rt; @@ -383,16 +401,8 @@ static int addr4_resolve(struct sockaddr_in *src_in, if (ret) return ret; - src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; - /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're - * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network - * type accordingly. 
- */ - if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND) - addr->network = RDMA_NETWORK_IPV4; - addr->hoplimit = ip4_dst_hoplimit(&rt->dst); *prt = rt; @@ -400,14 +410,16 @@ static int addr4_resolve(struct sockaddr_in *src_in, } #if IS_ENABLED(CONFIG_IPV6) -static int addr6_resolve(struct sockaddr_in6 *src_in, - const struct sockaddr_in6 *dst_in, +static int addr6_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct dst_entry **pdst) { + struct sockaddr_in6 *src_in = (struct sockaddr_in6 *)src_sock; + const struct sockaddr_in6 *dst_in = + (const struct sockaddr_in6 *)dst_sock; struct flowi6 fl6; struct dst_entry *dst; - struct rt6_info *rt; int ret; memset(&fl6, 0, sizeof fl6); @@ -419,19 +431,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, if (ret < 0) return ret; - rt = (struct rt6_info *)dst; - if (ipv6_addr_any(&src_in->sin6_addr)) { - src_in->sin6_family = AF_INET6; + if (ipv6_addr_any(&src_in->sin6_addr)) src_in->sin6_addr = fl6.saddr; - } - - /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're - * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network - * type accordingly. - */ - if (rt->rt6i_flags & RTF_GATEWAY && - ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND) - addr->network = RDMA_NETWORK_IPV6; addr->hoplimit = ip6_dst_hoplimit(dst); @@ -439,8 +440,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, return 0; } #else -static int addr6_resolve(struct sockaddr_in6 *src_in, - const struct sockaddr_in6 *dst_in, +static int addr6_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct dst_entry **pdst) { @@ -451,36 +452,110 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, static int addr_resolve_neigh(const struct dst_entry *dst, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, + unsigned int ndev_flags, u32 seq) { - if (dst->dev->flags & IFF_LOOPBACK) { - int ret; + int ret = 0; - ret = rdma_translate_ip(dst_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, - MAX_ADDR_LEN); + if (ndev_flags & IFF_LOOPBACK) { + memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); + } else { + if (!(ndev_flags & IFF_NOARP)) { + /* If the device doesn't do ARP internally */ + ret = fetch_ha(dst, addr, dst_in, seq); + } + } + return ret; +} - return ret; +static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct sockaddr *dst_in, + const struct dst_entry *dst, + const struct net_device *ndev) +{ + int ret = 0; + + if (dst->dev->flags & IFF_LOOPBACK) + ret = rdma_translate_ip(dst_in, dev_addr); + else + rdma_copy_src_l2_addr(dev_addr, dst->dev); + + /* + * If there's a gateway and type of device not ARPHRD_INFINIBAND, + * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the + * network type accordingly. + */ + if (has_gateway(dst, dst_in->sa_family) && + ndev->type != ARPHRD_INFINIBAND) + dev_addr->network = dst_in->sa_family == AF_INET ? + RDMA_NETWORK_IPV4 : + RDMA_NETWORK_IPV6; + else + dev_addr->network = RDMA_NETWORK_IB; + + return ret; +} + +static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, + unsigned int *ndev_flags, + const struct sockaddr *dst_in, + const struct dst_entry *dst) +{ + struct net_device *ndev = READ_ONCE(dst->dev); + + *ndev_flags = ndev->flags; + /* A physical device must be the RDMA device to use */ + if (ndev->flags & IFF_LOOPBACK) { + /* + * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or + * loopback IP address. 
So if route is resolved to loopback + * interface, translate that to a real ndev based on non + * loopback IP address. + */ + ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in); + if (IS_ERR(ndev)) + return -ENODEV; } - /* If the device doesn't do ARP internally */ - if (!(dst->dev->flags & IFF_NOARP)) - return fetch_ha(dst, addr, dst_in, seq); + return copy_src_l2_addr(dev_addr, dst_in, dst, ndev); +} + +static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr) +{ + struct net_device *ndev; - rdma_copy_addr(addr, dst->dev, NULL); + ndev = rdma_read_gid_attr_ndev_rcu(addr->sgid_attr); + if (IS_ERR(ndev)) + return PTR_ERR(ndev); + /* + * Since we are holding the rcu, reading net and ifindex + * are safe without any additional reference; because + * change_net_namespace() in net/core/dev.c does rcu sync + * after it changes the state to IFF_DOWN and before + * updating netdev fields {net, ifindex}. + */ + addr->net = dev_net(ndev); + addr->bound_dev_if = ndev->ifindex; return 0; } +static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr) +{ + addr->net = &init_net; + addr->bound_dev_if = 0; +} + static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, bool resolve_neigh, + bool resolve_by_gid_attr, u32 seq) { - struct net_device *ndev; - struct dst_entry *dst; + struct dst_entry *dst = NULL; + unsigned int ndev_flags = 0; + struct rtable *rt = NULL; int ret; if (!addr->net) { @@ -488,58 +563,55 @@ static int addr_resolve(struct sockaddr *src_in, return -EINVAL; } - if (src_in->sa_family == AF_INET) { - struct rtable *rt = NULL; - const struct sockaddr_in *dst_in4 = - (const struct sockaddr_in *)dst_in; - - ret = addr4_resolve((struct sockaddr_in *)src_in, - dst_in4, addr, &rt); - if (ret) - return ret; - - if (resolve_neigh) - ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq); - - if (addr->bound_dev_if) { - ndev = dev_get_by_index(addr->net, addr->bound_dev_if); - } else { - ndev = rt->dst.dev; - dev_hold(ndev); + rcu_read_lock(); + if (resolve_by_gid_attr) { + if (!addr->sgid_attr) { + rcu_read_unlock(); + pr_warn_ratelimited("%s: missing gid_attr\n", __func__); + return -EINVAL; } - - ip_rt_put(rt); - } else { - const struct sockaddr_in6 *dst_in6 = - (const struct sockaddr_in6 *)dst_in; - - ret = addr6_resolve((struct sockaddr_in6 *)src_in, - dst_in6, addr, - &dst); - if (ret) + /* + * If the request is for a specific gid attribute of the + * rdma_dev_addr, derive net from the netdevice of the + * GID attribute. + */ + ret = set_addr_netns_by_gid_rcu(addr); + if (ret) { + rcu_read_unlock(); return ret; - - if (resolve_neigh) - ret = addr_resolve_neigh(dst, dst_in, addr, seq); - - if (addr->bound_dev_if) { - ndev = dev_get_by_index(addr->net, addr->bound_dev_if); - } else { - ndev = dst->dev; - dev_hold(ndev); } - - dst_release(dst); } - - if (ndev) { - if (ndev->flags & IFF_LOOPBACK) - ret = rdma_translate_ip(dst_in, addr); - else - addr->bound_dev_if = ndev->ifindex; - dev_put(ndev); + if (src_in->sa_family == AF_INET) { + ret = addr4_resolve(src_in, dst_in, addr, &rt); + dst = &rt->dst; + } else { + ret = addr6_resolve(src_in, dst_in, addr, &dst); } + if (ret) { + rcu_read_unlock(); + goto done; + } + ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst); + rcu_read_unlock(); + + /* + * Resolve neighbor destination address if requested and + * only if src addr translation didn't fail. 
+ */ + if (!ret && resolve_neigh) + ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq); + if (src_in->sa_family == AF_INET) + ip_rt_put(rt); + else + dst_release(dst); +done: + /* + * Clear the addr net to go back to its original state, only if it was + * derived from GID attribute in this context. + */ + if (resolve_by_gid_attr) + rdma_addr_set_net_defaults(addr); return ret; } @@ -554,7 +626,8 @@ static void process_one_req(struct work_struct *_work) src_in = (struct sockaddr *)&req->src_addr; dst_in = (struct sockaddr *)&req->dst_addr; req->status = addr_resolve(src_in, dst_in, req->addr, - true, req->seq); + true, req->resolve_by_gid_attr, + req->seq); if (req->status && time_after_eq(jiffies, req->timeout)) { req->status = -ETIMEDOUT; } else if (req->status == -ENODATA) { @@ -586,10 +659,10 @@ static void process_one_req(struct work_struct *_work) } int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr, int timeout_ms, + struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), - void *context) + bool resolve_by_gid_attr, void *context) { struct sockaddr *src_in, *dst_in; struct addr_req *req; @@ -617,10 +690,12 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, req->addr = addr; req->callback = callback; req->context = context; + req->resolve_by_gid_attr = resolve_by_gid_attr; INIT_DELAYED_WORK(&req->work, process_one_req); req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); - req->status = addr_resolve(src_in, dst_in, addr, true, req->seq); + req->status = addr_resolve(src_in, dst_in, addr, true, + req->resolve_by_gid_attr, req->seq); switch (req->status) { case 0: req->timeout = jiffies; @@ -641,25 +716,53 @@ err: } EXPORT_SYMBOL(rdma_resolve_ip); -int rdma_resolve_ip_route(struct sockaddr *src_addr, - const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr) +int roce_resolve_route_from_path(struct sa_path_rec *rec, + const struct ib_gid_attr *attr) { - struct sockaddr_storage ssrc_addr = {}; - struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid, dgid; + struct rdma_dev_addr dev_addr = {}; + int ret; - if (src_addr) { - if (src_addr->sa_family != dst_addr->sa_family) - return -EINVAL; + if (rec->roce.route_resolved) + return 0; - memcpy(src_in, src_addr, rdma_addr_size(src_addr)); - } else { - src_in->sa_family = dst_addr->sa_family; - } + rdma_gid2ip(&sgid._sockaddr, &rec->sgid); + rdma_gid2ip(&dgid._sockaddr, &rec->dgid); + + if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family) + return -EINVAL; + + if (!attr || !attr->ndev) + return -EINVAL; + + dev_addr.net = &init_net; + dev_addr.sgid_attr = attr; + + ret = addr_resolve(&sgid._sockaddr, &dgid._sockaddr, + &dev_addr, false, true, 0); + if (ret) + return ret; + + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) + return -EINVAL; - return addr_resolve(src_in, dst_addr, addr, false, 0); + rec->roce.route_resolved = true; + return 0; } +/** + * rdma_addr_cancel - Cancel resolve ip request + * @addr: Pointer to address structure given previously + * during rdma_resolve_ip(). + * rdma_addr_cancel() is synchronous function which cancels any pending + * request if there is any. 
+ */ void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; @@ -687,11 +790,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) * guarentees no work is running and none will be started. */ cancel_delayed_work_sync(&found->work); - - if (found->callback) - found->callback(-ECANCELED, (struct sockaddr *)&found->src_addr, - found->addr, found->context); - kfree(found); } EXPORT_SYMBOL(rdma_addr_cancel); @@ -710,7 +808,7 @@ static void resolve_cb(int status, struct sockaddr *src_addr, int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, const struct net_device *ndev, + u8 *dmac, const struct ib_gid_attr *sgid_attr, int *hoplimit) { struct rdma_dev_addr dev_addr; @@ -726,12 +824,12 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); - dev_addr.bound_dev_if = ndev->ifindex; dev_addr.net = &init_net; + dev_addr.sgid_attr = sgid_attr; init_completion(&ctx.comp); ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr, - &dev_addr, 1000, resolve_cb, &ctx); + &dev_addr, 1000, resolve_cb, true, &ctx); if (ret) return ret; diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 3208ad6ad540..5b2fce4a7091 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -212,9 +212,8 @@ static void free_gid_entry_locked(struct ib_gid_table_entry *entry) u8 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - device->name, port_num, entry->attr.index, - entry->attr.gid.raw); + dev_dbg(&device->dev, "%s port=%d index=%d gid %pI6\n", __func__, + port_num, entry->attr.index, entry->attr.gid.raw); if (rdma_cap_roce_gid_table(device, port_num) && entry->state != GID_TABLE_ENTRY_INVALID) @@ -289,9 +288,9 @@ static void store_gid_entry(struct ib_gid_table *table, { entry->state = GID_TABLE_ENTRY_VALID; - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - entry->attr.device->name, entry->attr.port_num, - entry->attr.index, entry->attr.gid.raw); + dev_dbg(&entry->attr.device->dev, "%s port=%d index=%d gid %pI6\n", + __func__, entry->attr.port_num, entry->attr.index, + entry->attr.gid.raw); lockdep_assert_held(&table->lock); write_lock_irq(&table->rwlock); @@ -320,17 +319,16 @@ static int add_roce_gid(struct ib_gid_table_entry *entry) int ret; if (!attr->ndev) { - pr_err("%s NULL netdev device=%s port=%d index=%d\n", - __func__, attr->device->name, attr->port_num, - attr->index); + dev_err(&attr->device->dev, "%s NULL netdev port=%d index=%d\n", + __func__, attr->port_num, attr->index); return -EINVAL; } if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { ret = attr->device->add_gid(attr, &entry->context); if (ret) { - pr_err("%s GID add failed device=%s port=%d index=%d\n", - __func__, attr->device->name, attr->port_num, - attr->index); + dev_err(&attr->device->dev, + "%s GID add failed port=%d index=%d\n", + __func__, attr->port_num, attr->index); return ret; } } @@ -353,9 +351,8 @@ static void del_gid(struct ib_device *ib_dev, u8 port, lockdep_assert_held(&table->lock); - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - ib_dev->name, port, ix, - table->data_vec[ix]->attr.gid.raw); + dev_dbg(&ib_dev->dev, "%s port=%d index=%d gid %pI6\n", __func__, port, + ix, table->data_vec[ix]->attr.gid.raw); write_lock_irq(&table->rwlock); entry = 
table->data_vec[ix]; @@ -782,9 +779,9 @@ static void release_gid_table(struct ib_device *device, u8 port, if (is_gid_entry_free(table->data_vec[i])) continue; if (kref_read(&table->data_vec[i]->kref) > 1) { - pr_err("GID entry ref leak for %s (index %d) ref=%d\n", - device->name, i, - kref_read(&table->data_vec[i]->kref)); + dev_err(&device->dev, + "GID entry ref leak for index %d ref=%d\n", i, + kref_read(&table->data_vec[i]->kref)); leak = true; } } @@ -1252,6 +1249,39 @@ void rdma_hold_gid_attr(const struct ib_gid_attr *attr) } EXPORT_SYMBOL(rdma_hold_gid_attr); +/** + * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice + * which must be in UP state. + * + * @attr:Pointer to the GID attribute + * + * Returns pointer to netdevice if the netdevice was attached to GID and + * netdevice is in UP state. Caller must hold RCU lock as this API + * reads the netdev flags which can change while netdevice migrates to + * different net namespace. Returns ERR_PTR with error code otherwise. + * + */ +struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry = + container_of(attr, struct ib_gid_table_entry, attr); + struct ib_device *device = entry->attr.device; + struct net_device *ndev = ERR_PTR(-ENODEV); + u8 port_num = entry->attr.port_num; + struct ib_gid_table *table; + unsigned long flags; + bool valid; + + table = rdma_gid_table(device, port_num); + + read_lock_irqsave(&table->rwlock, flags); + valid = is_gid_entry_valid(table->data_vec[attr->index]); + if (valid && attr->ndev && (READ_ONCE(attr->ndev->flags) & IFF_UP)) + ndev = attr->ndev; + read_unlock_irqrestore(&table->rwlock, flags); + return ndev; +} + static int config_non_roce_gid_cache(struct ib_device *device, u8 port, int gid_tbl_len) { @@ -1270,8 +1300,9 @@ static int config_non_roce_gid_cache(struct ib_device *device, continue; ret = device->query_gid(device, port, i, &gid_attr.gid); if (ret) { - pr_warn("query_gid failed (%d) for %s (index %d)\n", - ret, device->name, i); + dev_warn(&device->dev, + "query_gid failed (%d) for index %d\n", ret, + i); goto err; } gid_attr.index = i; @@ -1300,8 +1331,7 @@ static void ib_cache_update(struct ib_device *device, ret = ib_query_port(device, port, tprops); if (ret) { - pr_warn("ib_query_port failed (%d) for %s\n", - ret, device->name); + dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret); goto err; } @@ -1323,8 +1353,9 @@ static void ib_cache_update(struct ib_device *device, for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); if (ret) { - pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n", - ret, device->name, i); + dev_warn(&device->dev, + "ib_query_pkey failed (%d) for index %d\n", + ret, i); goto err; } } diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 6e39c27dca8e..edb2cb758be7 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3292,8 +3292,11 @@ static int cm_lap_handler(struct cm_work *work) if (ret) goto unlock; - cm_init_av_by_path(param->alternate_path, NULL, &cm_id_priv->alt_av, - cm_id_priv); + ret = cm_init_av_by_path(param->alternate_path, NULL, + &cm_id_priv->alt_av, cm_id_priv); + if (ret) + goto unlock; + cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; ret = atomic_inc_and_test(&cm_id_priv->work_count); @@ -4367,7 +4370,7 @@ static void cm_add_one(struct ib_device *ib_device) cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, 
&ib_device->dev, MKDEV(0, 0), NULL, - "%s", ib_device->name); + "%s", dev_name(&ib_device->dev)); if (IS_ERR(cm_dev->device)) { kfree(cm_dev); return; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index a36c94930c31..15d5bb7bf6bb 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -639,13 +639,21 @@ static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } -static int cma_acquire_dev(struct rdma_id_private *id_priv, - const struct rdma_id_private *listen_id_priv) +/** + * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute + * based on source ip address. + * @id_priv: cm_id which should be bound to cma device + * + * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute + * based on source IP address. It returns 0 on success or error code otherwise. + * It is applicable to active and passive side cm_id. + */ +static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; - struct cma_device *cma_dev; union ib_gid gid, iboe_gid, *gidp; + struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; u8 port; @@ -654,41 +662,125 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; - mutex_lock(&lock); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + - rdma_addr_gid_offset(dev_addr), sizeof gid); - - if (listen_id_priv) { - cma_dev = listen_id_priv->cma_dev; - port = listen_id_priv->id.port_num; - gidp = rdma_protocol_roce(cma_dev->device, port) ? - &iboe_gid : &gid; - gid_type = listen_id_priv->gid_type; - sgid_attr = cma_validate_port(cma_dev->device, port, - gid_type, gidp, id_priv); - if (!IS_ERR(sgid_attr)) { - id_priv->id.port_num = port; - cma_bind_sgid_attr(id_priv, sgid_attr); - ret = 0; - goto out; + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + mutex_lock(&lock); + list_for_each_entry(cma_dev, &dev_list, list) { + for (port = rdma_start_port(cma_dev->device); + port <= rdma_end_port(cma_dev->device); port++) { + gidp = rdma_protocol_roce(cma_dev->device, port) ? + &iboe_gid : &gid; + gid_type = cma_dev->default_gid_type[port - 1]; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, gidp, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + cma_attach_to_dev(id_priv, cma_dev); + ret = 0; + goto out; + } } } +out: + mutex_unlock(&lock); + return ret; +} + +/** + * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute + * @id_priv: cm id to bind to cma device + * @listen_id_priv: listener cm id to match against + * @req: Pointer to req structure containaining incoming + * request information + * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when + * rdma device matches for listen_id and incoming request. It also verifies + * that a GID table entry is present for the source address. + * Returns 0 on success, or returns error code otherwise. 
+ */ +static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, + const struct rdma_id_private *listen_id_priv, + struct cma_req_info *req) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + enum ib_gid_type gid_type; + union ib_gid gid; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + if (rdma_protocol_roce(req->device, req->port)) + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &gid); + else + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; + sgid_attr = cma_validate_port(req->device, req->port, + gid_type, &gid, id_priv); + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); + + id_priv->id.port_num = req->port; + cma_bind_sgid_attr(id_priv, sgid_attr); + /* Need to acquire lock to protect against reader + * of cma_dev->id_list such as cma_netdev_callback() and + * cma_process_remove(). + */ + mutex_lock(&lock); + cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); + mutex_unlock(&lock); + return 0; +} + +static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, + const struct rdma_id_private *listen_id_priv) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + struct cma_device *cma_dev; + enum ib_gid_type gid_type; + int ret = -ENODEV; + union ib_gid gid; + u8 port; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + mutex_lock(&lock); + + cma_dev = listen_id_priv->cma_dev; + port = listen_id_priv->id.port_num; + gid_type = listen_id_priv->gid_type; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, &gid, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + ret = 0; + goto out; + } list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { - if (listen_id_priv && - listen_id_priv->cma_dev == cma_dev && + if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; - gidp = rdma_protocol_roce(cma_dev->device, port) ? 
- &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, - gid_type, gidp, id_priv); + gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); @@ -785,10 +877,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, if (!id_priv) return ERR_PTR(-ENOMEM); - if (caller) - id_priv->res.kern_name = caller; - else - rdma_restrack_set_task(&id_priv->res, current); + rdma_restrack_set_task(&id_priv->res, caller); id_priv->res.type = RDMA_RESTRACK_CM_ID; id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; @@ -1462,18 +1551,35 @@ static bool cma_protocol_roce(const struct rdma_cm_id *id) return rdma_protocol_roce(device, port_num); } +static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) +{ + const struct sockaddr *daddr = + (const struct sockaddr *)&req->listen_addr_storage; + const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; + + /* Returns true if the req is for IPv6 link local */ + return (daddr->sa_family == AF_INET6 && + (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); +} + static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct net_device *net_dev, - u8 port_num) + const struct cma_req_info *req) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) /* This request is an AF_IB request */ - return (!id->port_num || id->port_num == port_num) && + return (!id->port_num || id->port_num == req->port) && (addr->src_addr.ss_family == AF_IB); /* + * If the request is not for IPv6 link local, allow matching + * request to any netdevice of the one or multiport rdma device. + */ + if (!cma_is_req_ipv6_ll(req)) + return true; + /* * Net namespaces must match, and if the listner is listening * on a specific netdevice than netdevice must match as well. */ @@ -1500,13 +1606,14 @@ static struct rdma_id_private *cma_find_listener( hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && - cma_match_net_dev(&id_priv->id, net_dev, req->port)) + cma_match_net_dev(&id_priv->id, net_dev, req)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_list) { if (id_priv_dev->id.device == cm_id->device && - cma_match_net_dev(&id_priv_dev->id, net_dev, req->port)) + cma_match_net_dev(&id_priv_dev->id, + net_dev, req)) return id_priv_dev; } } @@ -1518,18 +1625,18 @@ static struct rdma_id_private *cma_find_listener( static struct rdma_id_private * cma_ib_id_from_event(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, + struct cma_req_info *req, struct net_device **net_dev) { - struct cma_req_info req; struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; - err = cma_save_req_info(ib_event, &req); + err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err); - *net_dev = cma_get_net_dev(ib_event, &req); + *net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ @@ -1567,17 +1674,17 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, } if (!validate_net_dev(*net_dev, - (struct sockaddr *)&req.listen_addr_storage, - (struct sockaddr *)&req.src_addr_storage)) { + (struct sockaddr *)&req->listen_addr_storage, + (struct sockaddr *)&req->src_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } } bind_list = cma_ps_find(*net_dev ? 
dev_net(*net_dev) : &init_net, - rdma_ps_from_service_id(req.service_id), - cma_port_from_service_id(req.service_id)); - id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev); + rdma_ps_from_service_id(req->service_id), + cma_port_from_service_id(req->service_id)); + id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); if (IS_ERR(id_priv) && *net_dev) { @@ -1710,8 +1817,8 @@ void rdma_destroy_id(struct rdma_cm_id *id) mutex_lock(&id_priv->handler_mutex); mutex_unlock(&id_priv->handler_mutex); + rdma_restrack_del(&id_priv->res); if (id_priv->cma_dev) { - rdma_restrack_del(&id_priv->res); if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); @@ -1902,7 +2009,7 @@ cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { - rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL); + rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { @@ -1952,7 +2059,7 @@ cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, goto err; if (net_dev) { - rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL); + rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), @@ -1999,11 +2106,12 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, { struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event = {}; + struct cma_req_info req = {}; struct net_device *net_dev; u8 offset; int ret; - listen_id = cma_ib_id_from_event(cm_id, ib_event, &net_dev); + listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); @@ -2036,7 +2144,7 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); - ret = cma_acquire_dev(conn_id, listen_id); + ret = cma_ib_acquire_dev(conn_id, listen_id, &req); if (ret) goto err2; @@ -2232,7 +2340,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, goto out; } - ret = cma_acquire_dev(conn_id, listen_id); + ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); @@ -2354,8 +2462,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, ret = rdma_listen(id, id_priv->backlog); if (ret) - pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n", - ret, cma_dev->device->name); + dev_warn(&cma_dev->device->dev, + "RDMA CMA: cma_listen_on_dev, error %d\n", ret); } static void cma_listen_on_all(struct rdma_id_private *id_priv) @@ -2402,8 +2510,8 @@ static void cma_query_handler(int status, struct sa_path_rec *path_rec, queue_work(cma_wq, &work->work); } -static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, - struct cma_work *work) +static int cma_query_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; @@ -2521,7 +2629,8 @@ static void cma_init_resolve_addr_work(struct cma_work *work, work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; } -static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) +static int cma_resolve_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms) { struct rdma_route *route = 
&id_priv->id.route; struct cma_work *work; @@ -2643,7 +2752,7 @@ err: } EXPORT_SYMBOL(rdma_set_ib_path); -static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) +static int cma_resolve_iw_route(struct rdma_id_private *id_priv) { struct cma_work *work; @@ -2744,7 +2853,7 @@ err1: return ret; } -int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; @@ -2759,7 +2868,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) else if (rdma_protocol_roce(id->device, id->port_num)) ret = cma_resolve_iboe_route(id_priv); else if (rdma_protocol_iwarp(id->device, id->port_num)) - ret = cma_resolve_iw_route(id_priv, timeout_ms); + ret = cma_resolve_iw_route(id_priv); else ret = -ENOSYS; @@ -2862,7 +2971,7 @@ static void addr_handler(int status, struct sockaddr *src_addr, memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { - status = cma_acquire_dev(id_priv, NULL); + status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); @@ -2882,13 +2991,11 @@ static void addr_handler(int status, struct sockaddr *src_addr, if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); return; } out: mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) @@ -2966,7 +3073,7 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, int timeout_ms) + const struct sockaddr *dst_addr, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; @@ -2985,16 +3092,16 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, return -EINVAL; memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); - atomic_inc(&id_priv->refcount); if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); } else { if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { - ret = rdma_resolve_ip(cma_src_addr(id_priv), - dst_addr, &id->route.addr.dev_addr, - timeout_ms, addr_handler, id_priv); + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, + &id->route.addr.dev_addr, + timeout_ms, addr_handler, + false, id_priv); } } if (ret) @@ -3003,7 +3110,6 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); - cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); @@ -3414,7 +3520,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (ret) goto err1; - ret = cma_acquire_dev(id_priv, NULL); + ret = cma_acquire_dev_by_src_ip(id_priv); if (ret) goto err1; } @@ -3439,10 +3545,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) return 0; err2: - if (id_priv->cma_dev) { - rdma_restrack_del(&id_priv->res); + rdma_restrack_del(&id_priv->res); + if (id_priv->cma_dev) cma_release_dev(id_priv); - } err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; @@ -3839,10 +3944,7 @@ int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, id_priv = container_of(id, struct 
rdma_id_private, id); - if (caller) - id_priv->res.kern_name = caller; - else - rdma_restrack_set_task(&id_priv->res, current); + rdma_restrack_set_task(&id_priv->res, caller); if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; @@ -4087,9 +4189,10 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, (!ib_sa_sendonly_fullmem_support(&sa_client, id_priv->id.device, id_priv->id.port_num))) { - pr_warn("RDMA CM: %s port %u Unable to multicast join\n" - "RDMA CM: SM doesn't support Send Only Full Member option\n", - id_priv->id.device->name, id_priv->id.port_num); + dev_warn( + &id_priv->id.device->dev, + "RDMA CM: port %u Unable to multicast join: SM doesn't support Send Only Full Member option\n", + id_priv->id.port_num); return -EOPNOTSUPP; } diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index eee38b40be99..8c2dfb3e294e 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -65,7 +65,7 @@ static struct cma_dev_port_group *to_dev_port_group(struct config_item *item) static bool filter_by_name(struct ib_device *ib_dev, void *cookie) { - return !strcmp(ib_dev->name, cookie); + return !strcmp(dev_name(&ib_dev->dev), cookie); } static int cma_configfs_params_get(struct config_item *item, diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 77c7005c396c..bb9007a0cca7 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -44,7 +44,7 @@ #include "mad_priv.h" /* Total number of ports combined across all struct ib_devices's */ -#define RDMA_MAX_PORTS 1024 +#define RDMA_MAX_PORTS 8192 struct pkey_index_qp_list { struct list_head pkey_index_list; @@ -87,6 +87,7 @@ int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); void ib_device_unregister_sysfs(struct ib_device *device); +int ib_device_rename(struct ib_device *ibdev, const char *name); typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, struct net_device *idev, void *cookie); @@ -338,7 +339,14 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, const struct net_device *ndev, + u8 *dmac, const struct ib_gid_attr *sgid_attr, int *hoplimit); +void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev); +struct sa_path_rec; +int roce_resolve_route_from_path(struct sa_path_rec *rec, + const struct ib_gid_attr *attr); + +struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index af5ad6a56ae4..b1e5365ddafa 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -112,12 +112,12 @@ static void ib_cq_poll_work(struct work_struct *work) IB_POLL_BATCH); if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) - queue_work(ib_comp_wq, &cq->work); + queue_work(cq->comp_wq, &cq->work); } static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) { - queue_work(ib_comp_wq, &cq->work); + queue_work(cq->comp_wq, &cq->work); } /** @@ -161,7 +161,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, goto out_destroy_cq; cq->res.type = RDMA_RESTRACK_CQ; - cq->res.kern_name = caller; + rdma_restrack_set_task(&cq->res, caller); rdma_restrack_add(&cq->res); switch 
(cq->poll_ctx) { @@ -175,9 +175,12 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); break; case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: cq->comp_handler = ib_cq_completion_workqueue; INIT_WORK(&cq->work, ib_cq_poll_work); ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ? + ib_comp_wq : ib_comp_unbound_wq; break; default: ret = -EINVAL; @@ -213,6 +216,7 @@ void ib_free_cq(struct ib_cq *cq) irq_poll_disable(&cq->iop); break; case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: cancel_work_sync(&cq->work); break; default: diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index db3b6271f09d..87eb4f2cdd7d 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -61,6 +61,7 @@ struct ib_client_data { }; struct workqueue_struct *ib_comp_wq; +struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); @@ -122,8 +123,9 @@ static int ib_device_check_mandatory(struct ib_device *device) for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) device + mandatory_table[i].offset)) { - pr_warn("Device %s is missing mandatory function %s\n", - device->name, mandatory_table[i].name); + dev_warn(&device->dev, + "Device is missing mandatory function %s\n", + mandatory_table[i].name); return -EINVAL; } } @@ -163,16 +165,40 @@ static struct ib_device *__ib_device_get_by_name(const char *name) struct ib_device *device; list_for_each_entry(device, &device_list, core_list) - if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) + if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; } -static int alloc_name(char *name) +int ib_device_rename(struct ib_device *ibdev, const char *name) +{ + struct ib_device *device; + int ret = 0; + + if (!strcmp(name, dev_name(&ibdev->dev))) + return ret; + + mutex_lock(&device_mutex); + list_for_each_entry(device, &device_list, core_list) { + if (!strcmp(name, dev_name(&device->dev))) { + ret = -EEXIST; + goto out; + } + } + + ret = device_rename(&ibdev->dev, name); + if (ret) + goto out; + strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); +out: + mutex_unlock(&device_mutex); + return ret; +} + +static int alloc_name(struct ib_device *ibdev, const char *name) { unsigned long *inuse; - char buf[IB_DEVICE_NAME_MAX]; struct ib_device *device; int i; @@ -181,24 +207,21 @@ static int alloc_name(char *name) return -ENOMEM; list_for_each_entry(device, &device_list, core_list) { - if (!sscanf(device->name, name, &i)) + char buf[IB_DEVICE_NAME_MAX]; + + if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; snprintf(buf, sizeof buf, name, i); - if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) + if (!strcmp(buf, dev_name(&device->dev))) set_bit(i, inuse); } i = find_first_zero_bit(inuse, PAGE_SIZE * 8); free_page((unsigned long) inuse); - snprintf(buf, sizeof buf, name, i); - if (__ib_device_get_by_name(buf)) - return -ENFILE; - - strlcpy(name, buf, IB_DEVICE_NAME_MAX); - return 0; + return dev_set_name(&ibdev->dev, name, i); } static void ib_device_release(struct device *device) @@ -221,9 +244,7 @@ static void ib_device_release(struct device *device) static int ib_device_uevent(struct device *device, struct kobj_uevent_env *env) { - struct ib_device *dev = container_of(device, struct ib_device, dev); - - if (add_uevent_var(env, "NAME=%s", dev->name)) + if (add_uevent_var(env, 
"NAME=%s", dev_name(device))) return -ENOMEM; /* @@ -269,7 +290,7 @@ struct ib_device *ib_alloc_device(size_t size) INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); - spin_lock_init(&device->client_data_lock); + rwlock_init(&device->client_data_lock); INIT_LIST_HEAD(&device->client_data_list); INIT_LIST_HEAD(&device->port_list); @@ -285,6 +306,7 @@ EXPORT_SYMBOL(ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { + WARN_ON(!list_empty(&device->client_data_list)); WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && device->reg_state != IB_DEV_UNINITIALIZED); rdma_restrack_clean(&device->res); @@ -295,9 +317,8 @@ EXPORT_SYMBOL(ib_dealloc_device); static int add_client_context(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; - unsigned long flags; - context = kmalloc(sizeof *context, GFP_KERNEL); + context = kmalloc(sizeof(*context), GFP_KERNEL); if (!context) return -ENOMEM; @@ -306,9 +327,9 @@ static int add_client_context(struct ib_device *device, struct ib_client *client context->going_down = false; down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + write_lock_irq(&device->client_data_lock); list_add(&context->list, &device->client_data_list); - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); return 0; @@ -444,22 +465,8 @@ static u32 __dev_new_index(void) } } -/** - * ib_register_device - Register an IB device with IB core - * @device:Device to register - * - * Low-level drivers use ib_register_device() to register their - * devices with the IB core. All registered clients will receive a - * callback for each device that is added. @device must be allocated - * with ib_alloc_device(). 
- */ -int ib_register_device(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +static void setup_dma_device(struct ib_device *device) { - int ret; - struct ib_client *client; - struct ib_udata uhw = {.outlen = 0, .inlen = 0}; struct device *parent = device->dev.parent; WARN_ON_ONCE(device->dma_device); @@ -491,56 +498,113 @@ int ib_register_device(struct ib_device *device, WARN_ON_ONCE(!parent); device->dma_device = parent; } +} - mutex_lock(&device_mutex); +static void cleanup_device(struct ib_device *device) +{ + ib_cache_cleanup_one(device); + ib_cache_release_one(device); + kfree(device->port_pkey_list); + kfree(device->port_immutable); +} - if (strchr(device->name, '%')) { - ret = alloc_name(device->name); - if (ret) - goto out; - } +static int setup_device(struct ib_device *device) +{ + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; + int ret; - if (ib_device_check_mandatory(device)) { - ret = -EINVAL; - goto out; - } + ret = ib_device_check_mandatory(device); + if (ret) + return ret; ret = read_port_immutable(device); if (ret) { - pr_warn("Couldn't create per port immutable data %s\n", - device->name); - goto out; + dev_warn(&device->dev, + "Couldn't create per port immutable data\n"); + return ret; } - ret = setup_port_pkey_list(device); + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->query_device(device, &device->attrs, &uhw); if (ret) { - pr_warn("Couldn't create per port_pkey_list\n"); - goto out; + dev_warn(&device->dev, + "Couldn't query the device attributes\n"); + goto port_cleanup; } - ret = ib_cache_setup_one(device); + ret = setup_port_pkey_list(device); if (ret) { - pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n"); + dev_warn(&device->dev, "Couldn't create per port_pkey_list\n"); goto port_cleanup; } - ret = ib_device_register_rdmacg(device); + ret = ib_cache_setup_one(device); if (ret) { - pr_warn("Couldn't register device with rdma cgroup\n"); - goto cache_cleanup; + dev_warn(&device->dev, + "Couldn't set up InfiniBand P_Key/GID cache\n"); + goto pkey_cleanup; + } + return 0; + +pkey_cleanup: + kfree(device->port_pkey_list); +port_cleanup: + kfree(device->port_immutable); + return ret; +} + +/** + * ib_register_device - Register an IB device with IB core + * @device:Device to register + * + * Low-level drivers use ib_register_device() to register their + * devices with the IB core. All registered clients will receive a + * callback for each device that is added. @device must be allocated + * with ib_alloc_device(). 
+ */ +int ib_register_device(struct ib_device *device, const char *name, + int (*port_callback)(struct ib_device *, u8, + struct kobject *)) +{ + int ret; + struct ib_client *client; + + setup_dma_device(device); + + mutex_lock(&device_mutex); + + if (strchr(name, '%')) { + ret = alloc_name(device, name); + if (ret) + goto out; + } else { + ret = dev_set_name(&device->dev, name); + if (ret) + goto out; + } + if (__ib_device_get_by_name(dev_name(&device->dev))) { + ret = -ENFILE; + goto out; } + strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); - memset(&device->attrs, 0, sizeof(device->attrs)); - ret = device->query_device(device, &device->attrs, &uhw); + ret = setup_device(device); + if (ret) + goto out; + + device->index = __dev_new_index(); + + ret = ib_device_register_rdmacg(device); if (ret) { - pr_warn("Couldn't query the device attributes\n"); - goto cg_cleanup; + dev_warn(&device->dev, + "Couldn't register device with rdma cgroup\n"); + goto dev_cleanup; } ret = ib_device_register_sysfs(device, port_callback); if (ret) { - pr_warn("Couldn't register device %s with driver model\n", - device->name); + dev_warn(&device->dev, + "Couldn't register device with driver model\n"); goto cg_cleanup; } @@ -550,7 +614,6 @@ int ib_register_device(struct ib_device *device, if (!add_client_context(device, client) && client->add) client->add(device); - device->index = __dev_new_index(); down_write(&lists_rwsem); list_add_tail(&device->core_list, &device_list); up_write(&lists_rwsem); @@ -559,11 +622,8 @@ int ib_register_device(struct ib_device *device, cg_cleanup: ib_device_unregister_rdmacg(device); -cache_cleanup: - ib_cache_cleanup_one(device); - ib_cache_release_one(device); -port_cleanup: - kfree(device->port_immutable); +dev_cleanup: + cleanup_device(device); out: mutex_unlock(&device_mutex); return ret; @@ -585,21 +645,20 @@ void ib_unregister_device(struct ib_device *device) down_write(&lists_rwsem); list_del(&device->core_list); - spin_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + write_lock_irq(&device->client_data_lock); + list_for_each_entry(context, &device->client_data_list, list) context->going_down = true; - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irq(&device->client_data_lock); downgrade_write(&lists_rwsem); - list_for_each_entry_safe(context, tmp, &device->client_data_list, - list) { + list_for_each_entry(context, &device->client_data_list, list) { if (context->client->remove) context->client->remove(device, context->data); } up_read(&lists_rwsem); - ib_device_unregister_rdmacg(device); ib_device_unregister_sysfs(device); + ib_device_unregister_rdmacg(device); mutex_unlock(&device_mutex); @@ -609,10 +668,13 @@ void ib_unregister_device(struct ib_device *device) kfree(device->port_pkey_list); down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + write_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry_safe(context, tmp, &device->client_data_list, + list) { + list_del(&context->list); kfree(context); - spin_unlock_irqrestore(&device->client_data_lock, flags); + } + write_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); device->reg_state = IB_DEV_UNREGISTERED; @@ -662,9 +724,8 @@ EXPORT_SYMBOL(ib_register_client); */ void ib_unregister_client(struct ib_client *client) { - struct ib_client_data *context, *tmp; + 
struct ib_client_data *context; struct ib_device *device; - unsigned long flags; mutex_lock(&device_mutex); @@ -676,14 +737,14 @@ void ib_unregister_client(struct ib_client *client) struct ib_client_data *found_context = NULL; down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + write_lock_irq(&device->client_data_lock); + list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->going_down = true; found_context = context; break; } - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); if (client->remove) @@ -691,17 +752,18 @@ void ib_unregister_client(struct ib_client *client) found_context->data : NULL); if (!found_context) { - pr_warn("No client context found for %s/%s\n", - device->name, client->name); + dev_warn(&device->dev, + "No client context found for %s\n", + client->name); continue; } down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + write_lock_irq(&device->client_data_lock); list_del(&found_context->list); - kfree(found_context); - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); + kfree(found_context); } mutex_unlock(&device_mutex); @@ -722,13 +784,13 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client) void *ret = NULL; unsigned long flags; - spin_lock_irqsave(&device->client_data_lock, flags); + read_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { ret = context->data; break; } - spin_unlock_irqrestore(&device->client_data_lock, flags); + read_unlock_irqrestore(&device->client_data_lock, flags); return ret; } @@ -749,18 +811,18 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client, struct ib_client_data *context; unsigned long flags; - spin_lock_irqsave(&device->client_data_lock, flags); + write_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->data = data; goto out; } - pr_warn("No client context found for %s/%s\n", - device->name, client->name); + dev_warn(&device->dev, "No client context found for %s\n", + client->name); out: - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irqrestore(&device->client_data_lock, flags); } EXPORT_SYMBOL(ib_set_client_data); @@ -1166,10 +1228,19 @@ static int __init ib_core_init(void) goto err; } + ib_comp_unbound_wq = + alloc_workqueue("ib-comp-unb-wq", + WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | + WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); + if (!ib_comp_unbound_wq) { + ret = -ENOMEM; + goto err_comp; + } + ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); - goto err_comp; + goto err_comp_unbound; } ret = rdma_nl_init(); @@ -1218,6 +1289,8 @@ err_ibnl: rdma_nl_exit(); err_sysfs: class_unregister(&ib_class); +err_comp_unbound: + destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); err: @@ -1236,6 +1309,7 @@ static void __exit ib_core_cleanup(void) addr_cleanup(); rdma_nl_exit(); class_unregister(&ib_class); + destroy_workqueue(ib_comp_unbound_wq); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
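/*
 * A minimal sketch of the init/teardown pattern ib_core_init() uses above for
 * ib_comp_unbound_wq: allocate an unbound, high-priority, memory-reclaim-safe
 * workqueue once at module init, and destroy it in reverse order on the error
 * and exit paths. The demo_* names are hypothetical, not part of the patch.
 */
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_unbound_wq;

static int __init demo_init(void)
{
	demo_unbound_wq = alloc_workqueue("demo-unb-wq",
					  WQ_UNBOUND | WQ_HIGHPRI |
					  WQ_MEM_RECLAIM | WQ_SYSFS,
					  WQ_UNBOUND_MAX_ACTIVE);
	if (!demo_unbound_wq)
		return -ENOMEM;
	return 0;
}

static void __exit demo_exit(void)
{
	/* destroy_workqueue() flushes remaining work before freeing */
	destroy_workqueue(demo_unbound_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");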
*/ destroy_workqueue(ib_wq); diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index a077500f7f32..83ba0068e8bb 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -213,7 +213,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, device = pd->device; if (!device->alloc_fmr || !device->dealloc_fmr || !device->map_phys_fmr || !device->unmap_fmr) { - pr_info(PFX "Device %s does not support FMRs\n", device->name); + dev_info(&device->dev, "Device does not support FMRs\n"); return ERR_PTR(-ENOSYS); } @@ -257,7 +257,8 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, atomic_set(&pool->flush_ser, 0); init_waitqueue_head(&pool->force_wait); - pool->worker = kthread_create_worker(0, "ib_fmr(%s)", device->name); + pool->worker = + kthread_create_worker(0, "ib_fmr(%s)", dev_name(&device->dev)); if (IS_ERR(pool->worker)) { pr_warn(PFX "couldn't start cleanup kthread worker\n"); ret = PTR_ERR(pool->worker); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 5d676cff41f4..ba668d49c751 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -509,7 +509,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->m_local_addr = cm_id->local_addr; cm_id->m_remote_addr = cm_id->remote_addr; - memcpy(pm_reg_msg.dev_name, cm_id->device->name, + memcpy(pm_reg_msg.dev_name, dev_name(&cm_id->device->dev), sizeof(pm_reg_msg.dev_name)); memcpy(pm_reg_msg.if_name, cm_id->device->iwcm->ifname, sizeof(pm_reg_msg.if_name)); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index ef459f2f2eeb..d7025cd5be28 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -220,33 +220,37 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, int ret2, qpn; u8 mgmt_class, vclass; + if ((qp_type == IB_QPT_SMI && !rdma_cap_ib_smi(device, port_num)) || + (qp_type == IB_QPT_GSI && !rdma_cap_ib_cm(device, port_num))) + return ERR_PTR(-EPROTONOSUPPORT); + /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) { - dev_notice(&device->dev, - "ib_register_mad_agent: invalid QP Type %d\n", - qp_type); + dev_dbg_ratelimited(&device->dev, "%s: invalid QP Type %d\n", + __func__, qp_type); goto error1; } if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) { - dev_notice(&device->dev, - "ib_register_mad_agent: invalid RMPP Version %u\n", - rmpp_version); + dev_dbg_ratelimited(&device->dev, + "%s: invalid RMPP Version %u\n", + __func__, rmpp_version); goto error1; } /* Validate MAD registration request if supplied */ if (mad_reg_req) { if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) { - dev_notice(&device->dev, - "ib_register_mad_agent: invalid Class Version %u\n", - mad_reg_req->mgmt_class_version); + dev_dbg_ratelimited(&device->dev, + "%s: invalid Class Version %u\n", + __func__, + mad_reg_req->mgmt_class_version); goto error1; } if (!recv_handler) { - dev_notice(&device->dev, - "ib_register_mad_agent: no recv_handler\n"); + dev_dbg_ratelimited(&device->dev, + "%s: no recv_handler\n", __func__); goto error1; } if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { @@ -256,9 +260,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, */ if (mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid Mgmt Class 
0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } else if (mad_reg_req->mgmt_class == 0) { @@ -266,8 +270,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, * Class 0 is reserved in IBA and is used for * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE */ - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid Mgmt Class 0\n"); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid Mgmt Class 0\n", + __func__); goto error1; } else if (is_vendor_class(mad_reg_req->mgmt_class)) { /* @@ -275,18 +280,19 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, * ensure supplied OUI is not zero */ if (!is_vendor_oui(mad_reg_req->oui)) { - dev_notice(&device->dev, - "ib_register_mad_agent: No OUI specified for class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: No OUI specified for class 0x%x\n", + __func__, + mad_reg_req->mgmt_class); goto error1; } } /* Make sure class supplied is consistent with RMPP */ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { if (rmpp_version) { - dev_notice(&device->dev, - "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: RMPP version for non-RMPP class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } @@ -297,9 +303,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, IB_MGMT_CLASS_SUBN_LID_ROUTED) && (mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid SM QP type: class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } else { @@ -307,9 +313,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, IB_MGMT_CLASS_SUBN_LID_ROUTED) || (mad_reg_req->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid GS QP type: class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } @@ -324,18 +330,18 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Validate device and port */ port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid port %d\n", - port_num); + dev_dbg_ratelimited(&device->dev, "%s: Invalid port %d\n", + __func__, port_num); ret = ERR_PTR(-ENODEV); goto error1; } - /* Verify the QP requested is supported. For example, Ethernet devices - * will not have QP0 */ + /* Verify the QP requested is supported. For example, Ethernet devices + * will not have QP0. 
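/*
 * The hunks above demote dev_notice() to dev_dbg_ratelimited() for parameter
 * validation: userspace can trigger these paths at will, so unratelimited
 * prints would let a misbehaving process flood the kernel log. A condensed
 * sketch of the pattern (demo_validate() is hypothetical):
 */
#include <linux/device.h>
#include <linux/errno.h>

static int demo_validate(struct device *dev, int qp_type)
{
	if (qp_type < 0) {
		dev_dbg_ratelimited(dev, "%s: invalid QP type %d\n",
				    __func__, qp_type);
		return -EINVAL;
	}
	return 0;
}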
+ */ if (!port_priv->qp_info[qpn].qp) { - dev_notice(&device->dev, - "ib_register_mad_agent: QP %d not supported\n", qpn); + dev_dbg_ratelimited(&device->dev, "%s: QP %d not supported\n", + __func__, qpn); ret = ERR_PTR(-EPROTONOSUPPORT); goto error1; } @@ -2408,7 +2414,7 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) } void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, - int timeout_ms) + unsigned long timeout_ms) { mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); wait_for_response(mad_send_wr); @@ -3183,7 +3189,7 @@ static int ib_mad_port_open(struct ib_device *device, cq_size *= 2; port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, - IB_POLL_WORKQUEUE); + IB_POLL_UNBOUND_WORKQUEUE); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index d84ae1671898..216509036aa8 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -221,6 +221,6 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr); void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, - int timeout_ms); + unsigned long timeout_ms); #endif /* __IB_MAD_PRIV_H__ */ diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 3ccaae18ad75..724f5a62e82f 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -47,9 +47,9 @@ static struct { const struct rdma_nl_cbs *cb_table; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; -int rdma_nl_chk_listeners(unsigned int group) +bool rdma_nl_chk_listeners(unsigned int group) { - return (netlink_has_listeners(nls, group)) ? 
0 : -1; + return netlink_has_listeners(nls, group); } EXPORT_SYMBOL(rdma_nl_chk_listeners); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 0385ab438320..573399e3ccc1 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -179,7 +179,8 @@ static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; - if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, + dev_name(&device->dev))) return -EMSGSIZE; return 0; @@ -645,6 +646,36 @@ err: return err; } +static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { + char name[IB_DEVICE_NAME_MAX] = {}; + + nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + IB_DEVICE_NAME_MAX); + err = ib_device_rename(device, name); + } + + put_device(&device->dev); + return err; +} + static int _nldev_get_dumpit(struct ib_device *device, struct sk_buff *skb, struct netlink_callback *cb, @@ -1077,6 +1108,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_get_doit, .dump = nldev_get_dumpit, }, + [RDMA_NLDEV_CMD_SET] = { + .doit = nldev_set_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NLDEV_CMD_PORT_GET] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index c4118bcd5103..752a55c6bdce 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -794,44 +794,6 @@ void uverbs_close_fd(struct file *f) uverbs_uobject_put(uobj); } -static void ufile_disassociate_ucontext(struct ib_ucontext *ibcontext) -{ - struct ib_device *ib_dev = ibcontext->device; - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - - owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); - if (!owning_process) - return; - - owning_mm = get_task_mm(owning_process); - if (!owning_mm) { - pr_info("no mm, disassociate ucontext is pending task termination\n"); - while (1) { - put_task_struct(owning_process); - usleep_range(1000, 2000); - owning_process = get_pid_task(ibcontext->tgid, - PIDTYPE_PID); - if (!owning_process || - owning_process->state == TASK_DEAD) { - pr_info("disassociate ucontext done, task was terminated\n"); - /* in case task was dead need to release the - * task struct. 
- */ - if (owning_process) - put_task_struct(owning_process); - return; - } - } - } - - down_write(&owning_mm->mmap_sem); - ib_dev->disassociate_ucontext(ibcontext); - up_write(&owning_mm->mmap_sem); - mmput(owning_mm); - put_task_struct(owning_process); -} - /* * Drop the ucontext off the ufile and completely disconnect it from the * ib_device @@ -840,20 +802,28 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, enum rdma_remove_reason reason) { struct ib_ucontext *ucontext = ufile->ucontext; + struct ib_device *ib_dev = ucontext->device; int ret; - if (reason == RDMA_REMOVE_DRIVER_REMOVE) - ufile_disassociate_ucontext(ucontext); + /* + * If we are closing the FD then the user mmap VMAs must have + * already been destroyed as they hold on to the filep, otherwise + * they need to be zap'd. + */ + if (reason == RDMA_REMOVE_DRIVER_REMOVE) { + uverbs_user_mmap_disassociate(ufile); + if (ib_dev->disassociate_ucontext) + ib_dev->disassociate_ucontext(ucontext); + } - put_pid(ucontext->tgid); - ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device, + ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); /* * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove * the error return. */ - ret = ucontext->device->dealloc_ucontext(ucontext); + ret = ib_dev->dealloc_ucontext(ucontext); WARN_ON(ret); ufile->ucontext = NULL; diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index f962f2a593ba..4886d2bba7c7 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -160,5 +160,6 @@ void uverbs_disassociate_api(struct uverbs_api *uapi); void uverbs_destroy_api(struct uverbs_api *uapi); void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, unsigned int num_attrs); +void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile); #endif /* RDMA_CORE_H */ diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 3b7fa0ccaa08..06d8657ce583 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -50,8 +50,7 @@ void rdma_restrack_clean(struct rdma_restrack_root *res) dev = container_of(res, struct ib_device, res); pr_err("restrack: %s", CUT_HERE); - pr_err("restrack: BUG: RESTRACK detected leak of resources on %s\n", - dev->name); + dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); hash_for_each(res->hash, bkt, e, node) { if (rdma_is_kernel_res(e)) { owner = e->kern_name; @@ -156,6 +155,21 @@ static bool res_is_user(struct rdma_restrack_entry *res) } } +void rdma_restrack_set_task(struct rdma_restrack_entry *res, + const char *caller) +{ + if (caller) { + res->kern_name = caller; + return; + } + + if (res->task) + put_task_struct(res->task); + get_task_struct(current); + res->task = current; +} +EXPORT_SYMBOL(rdma_restrack_set_task); + void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); @@ -168,7 +182,7 @@ void rdma_restrack_add(struct rdma_restrack_entry *res) if (res_is_user(res)) { if (!res->task) - rdma_restrack_set_task(res, current); + rdma_restrack_set_task(res, NULL); res->kern_name = NULL; } else { set_kern_name(res); @@ -209,7 +223,7 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) struct ib_device *dev; if (!res->valid) - return; + goto out; dev = res_to_dev(res); if (!dev) @@ -222,8 +236,12 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) down_write(&dev->res.rwsem); hash_del(&res->node); res->valid = false; 
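/*
 * rdma_restrack_set_task() above encodes a simple ownership rule: before
 * caching a new task_struct pointer, drop the reference already held, so
 * repeated calls on the same entry never leak a task. A condensed sketch of
 * that rule with a hypothetical demo_entry type:
 */
#include <linux/sched.h>
#include <linux/sched/task.h>

struct demo_entry {
	struct task_struct *task;
	const char *kern_name;
};

static void demo_set_task(struct demo_entry *e, const char *caller)
{
	if (caller) {			/* kernel consumer: record name only */
		e->kern_name = caller;
		return;
	}
	if (e->task)			/* drop the stale reference first */
		put_task_struct(e->task);
	get_task_struct(current);
	e->task = current;
}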
- if (res->task) - put_task_struct(res->task); up_write(&dev->res.rwsem); + +out: + if (res->task) { + put_task_struct(res->task); + res->task = NULL; + } } EXPORT_SYMBOL(rdma_restrack_del); diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h index b1d4bbf4ce5c..cbaaaa92fff3 100644 --- a/drivers/infiniband/core/sa.h +++ b/drivers/infiniband/core/sa.h @@ -49,16 +49,14 @@ static inline void ib_sa_client_put(struct ib_sa_client *client) } int ib_sa_mcmember_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - u8 method, + struct ib_device *device, u8 port_num, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), - void *context, - struct ib_sa_query **sa_query); + void *context, struct ib_sa_query **sa_query); int mcast_init(void); void mcast_cleanup(void); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 7b794a14d6e8..be5ba5e15496 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -761,7 +761,7 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, /* Construct the family header first */ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); - memcpy(header->device_name, query->port->agent->device->name, + memcpy(header->device_name, dev_name(&query->port->agent->device->dev), LS_DEVICE_NAME_MAX); header->port_num = query->port->port_num; @@ -835,7 +835,6 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) struct sk_buff *skb = NULL; struct nlmsghdr *nlh; void *data; - int ret = 0; struct ib_sa_mad *mad; int len; @@ -862,13 +861,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); - if (!ret) - ret = len; - else - ret = 0; - - return ret; + return rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); } static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) @@ -891,14 +884,12 @@ static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) spin_unlock_irqrestore(&ib_nl_request_lock, flags); ret = ib_nl_send_msg(query, gfp_mask); - if (ret <= 0) { + if (ret) { ret = -EIO; /* Remove the request */ spin_lock_irqsave(&ib_nl_request_lock, flags); list_del(&query->list); spin_unlock_irqrestore(&ib_nl_request_lock, flags); - } else { - ret = 0; } return ret; @@ -1227,46 +1218,6 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) return src_path_mask; } -static int roce_resolve_route_from_path(struct sa_path_rec *rec, - const struct ib_gid_attr *attr) -{ - struct rdma_dev_addr dev_addr = {}; - union { - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; - } sgid_addr, dgid_addr; - int ret; - - if (rec->roce.route_resolved) - return 0; - if (!attr || !attr->ndev) - return -EINVAL; - - dev_addr.bound_dev_if = attr->ndev->ifindex; - /* TODO: Use net from the ib_gid_attr once it is added to it, - * until than, limit itself to init_net. 
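/*
 * The ib_nl_send_msg() hunk above replaces a "length on success, 0 on error"
 * return with the conventional 0/-errno contract, which is why the caller in
 * ib_nl_make_request() now tests plain "if (ret)". A minimal sketch of the
 * convention, using hypothetical demo_* helpers:
 */
#include <linux/errno.h>

static int demo_send(void)
{
	return 0;		/* 0 on success, negative errno on failure */
}

static int demo_make_request(void)
{
	int ret = demo_send();

	if (ret)		/* no "<= 0" special casing any more */
		return -EIO;
	return 0;
}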
- */ - dev_addr.net = &init_net; - - rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); - rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); - - /* validate the route */ - ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, - &dgid_addr._sockaddr, &dev_addr); - if (ret) - return ret; - - if ((dev_addr.network == RDMA_NETWORK_IPV4 || - dev_addr.network == RDMA_NETWORK_IPV6) && - rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) - return -EINVAL; - - rec->roce.route_resolved = true; - return 0; -} - static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr, @@ -1409,7 +1360,8 @@ static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent) spin_unlock_irqrestore(&tid_lock, flags); } -static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) +static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, + gfp_t gfp_mask) { bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; @@ -1433,7 +1385,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && (!(query->flags & IB_SA_QUERY_OPA))) { - if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { if (!ib_nl_make_request(query, gfp_mask)) return id; } @@ -1599,7 +1551,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct sa_path_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, void *context), @@ -1753,7 +1705,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, void *context), @@ -1850,7 +1802,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), @@ -1941,7 +1893,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), @@ -2108,7 +2060,7 @@ static void ib_sa_classport_info_rec_release(struct ib_sa_query *sa_query) } static int ib_sa_classport_info_rec_query(struct ib_sa_port *port, - int timeout_ms, + unsigned long timeout_ms, void (*callback)(void *context), void *context, struct ib_sa_query **sa_query) diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 9b0bea8303e0..1143c0448666 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -685,9 +685,8 @@ static int ib_mad_agent_security_change(struct notifier_block *nb, if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; - ag->smp_allowed = !security_ib_endport_manage_subnet(ag->security, - ag->device->name, - ag->port_num); + ag->smp_allowed = !security_ib_endport_manage_subnet( + ag->security, dev_name(&ag->device->dev), ag->port_num); return NOTIFY_OK; } @@ -708,7 +707,7 @@ 
int ib_mad_agent_security_setup(struct ib_mad_agent *agent, return 0; ret = security_ib_endport_manage_subnet(agent->security, - agent->device->name, + dev_name(&agent->device->dev), agent->port_num); if (ret) return ret; diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7fd14ead7b37..6fcce2c206c6 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -512,7 +512,7 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, 40 + offset / 8, sizeof(data)); if (ret < 0) - return sprintf(buf, "N/A (no PMA)\n"); + return ret; switch (width) { case 4: @@ -1036,7 +1036,7 @@ static int add_port(struct ib_device *device, int port_num, p->port_num = port_num; ret = kobject_init_and_add(&p->kobj, &port_type, - device->ports_parent, + device->ports_kobj, "%d", port_num); if (ret) { kfree(p); @@ -1057,10 +1057,12 @@ static int add_port(struct ib_device *device, int port_num, goto err_put; } - p->pma_table = get_counter_table(device, port_num); - ret = sysfs_create_group(&p->kobj, p->pma_table); - if (ret) - goto err_put_gid_attrs; + if (device->process_mad) { + p->pma_table = get_counter_table(device, port_num); + ret = sysfs_create_group(&p->kobj, p->pma_table); + if (ret) + goto err_put_gid_attrs; + } p->gid_group.name = "gids"; p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); @@ -1118,9 +1120,9 @@ static int add_port(struct ib_device *device, int port_num, } /* - * If port == 0, it means we have only one port and the parent - * device, not this port device, should be the holder of the - * hw_counters + * If port == 0, it means hw_counters are per device and not per + * port, so the holder should be the device. Therefore skip per-port counter + * initialization. 
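/*
 * add_port() above now creates the PMA counter group only when the device
 * implements process_mad, and every later error/teardown path mirrors that
 * check. A condensed sketch of the conditional sysfs group pattern, with
 * hypothetical demo_* names:
 */
#include <linux/kobject.h>
#include <linux/sysfs.h>

static int demo_add_pma_group(struct kobject *kobj,
			      const struct attribute_group *pma,
			      bool have_process_mad)
{
	if (!have_process_mad)
		return 0;	/* no PMA support: expose no counters */
	return sysfs_create_group(kobj, pma);
}

static void demo_remove_pma_group(struct kobject *kobj,
				  const struct attribute_group *pma,
				  bool have_process_mad)
{
	if (have_process_mad)	/* keep teardown symmetric with setup */
		sysfs_remove_group(kobj, pma);
}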
*/ if (device->alloc_hw_stats && port_num) setup_hw_stats(device, p, port_num); @@ -1173,7 +1175,8 @@ err_free_gid: p->gid_group.attrs = NULL; err_remove_pma: - sysfs_remove_group(&p->kobj, p->pma_table); + if (p->pma_table) + sysfs_remove_group(&p->kobj, p->pma_table); err_put_gid_attrs: kobject_put(&p->gid_attr_group->kobj); @@ -1183,7 +1186,7 @@ err_put: return ret; } -static ssize_t show_node_type(struct device *device, +static ssize_t node_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1198,8 +1201,9 @@ static ssize_t show_node_type(struct device *device, default: return sprintf(buf, "%d: <unknown>\n", dev->node_type); } } +static DEVICE_ATTR_RO(node_type); -static ssize_t show_sys_image_guid(struct device *device, +static ssize_t sys_image_guid_show(struct device *device, struct device_attribute *dev_attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1210,8 +1214,9 @@ static ssize_t show_sys_image_guid(struct device *device, be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); } +static DEVICE_ATTR_RO(sys_image_guid); -static ssize_t show_node_guid(struct device *device, +static ssize_t node_guid_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1222,8 +1227,9 @@ static ssize_t show_node_guid(struct device *device, be16_to_cpu(((__be16 *) &dev->node_guid)[2]), be16_to_cpu(((__be16 *) &dev->node_guid)[3])); } +static DEVICE_ATTR_RO(node_guid); -static ssize_t show_node_desc(struct device *device, +static ssize_t node_desc_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1231,9 +1237,9 @@ static ssize_t show_node_desc(struct device *device, return sprintf(buf, "%.64s\n", dev->node_desc); } -static ssize_t set_node_desc(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t node_desc_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) { struct ib_device *dev = container_of(device, struct ib_device, dev); struct ib_device_modify desc = {}; @@ -1249,8 +1255,9 @@ static ssize_t set_node_desc(struct device *device, return count; } +static DEVICE_ATTR_RW(node_desc); -static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, +static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1259,19 +1266,19 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); return strlen(buf); } +static DEVICE_ATTR_RO(fw_ver); + +static struct attribute *ib_dev_attrs[] = { + &dev_attr_node_type.attr, + &dev_attr_node_guid.attr, + &dev_attr_sys_image_guid.attr, + &dev_attr_fw_ver.attr, + &dev_attr_node_desc.attr, + NULL, +}; -static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); -static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); -static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); -static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); - -static struct device_attribute *ib_class_attributes[] = { - 
&dev_attr_node_type, - &dev_attr_sys_image_guid, - &dev_attr_node_guid, - &dev_attr_node_desc, - &dev_attr_fw_ver, +static const struct attribute_group dev_attr_group = { + .attrs = ib_dev_attrs, }; static void free_port_list_attributes(struct ib_device *device) @@ -1285,7 +1292,9 @@ static void free_port_list_attributes(struct ib_device *device) kfree(port->hw_stats); free_hsag(&port->kobj, port->hw_stats_ag); } - sysfs_remove_group(p, port->pma_table); + + if (port->pma_table) + sysfs_remove_group(p, port->pma_table); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); sysfs_remove_group(&port->gid_attr_group->kobj, @@ -1296,7 +1305,7 @@ static void free_port_list_attributes(struct ib_device *device) kobject_put(p); } - kobject_put(device->ports_parent); + kobject_put(device->ports_kobj); } int ib_device_register_sysfs(struct ib_device *device, @@ -1307,23 +1316,15 @@ int ib_device_register_sysfs(struct ib_device *device, int ret; int i; - ret = dev_set_name(class_dev, "%s", device->name); - if (ret) - return ret; + device->groups[0] = &dev_attr_group; + class_dev->groups = device->groups; ret = device_add(class_dev); if (ret) goto err; - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - ret = device_create_file(class_dev, ib_class_attributes[i]); - if (ret) - goto err_unregister; - } - - device->ports_parent = kobject_create_and_add("ports", - &class_dev->kobj); - if (!device->ports_parent) { + device->ports_kobj = kobject_create_and_add("ports", &class_dev->kobj); + if (!device->ports_kobj) { ret = -ENOMEM; goto err_put; } @@ -1347,20 +1348,15 @@ int ib_device_register_sysfs(struct ib_device *device, err_put: free_port_list_attributes(device); - -err_unregister: device_del(class_dev); - err: return ret; } void ib_device_unregister_sysfs(struct ib_device *device) { - int i; - - /* Hold kobject until ib_dealloc_device() */ - kobject_get(&device->dev.kobj); + /* Hold device until ib_dealloc_device() */ + get_device(&device->dev); free_port_list_attributes(device); @@ -1369,8 +1365,5 @@ void ib_device_unregister_sysfs(struct ib_device *device) free_hsag(&device->dev.kobj, device->hw_stats_ag); } - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) - device_remove_file(&device->dev, ib_class_attributes[i]); - device_unregister(&device->dev); } diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index a41792dbae1f..c6144df47ea4 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -85,7 +85,9 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct page **page_list; struct vm_area_struct **vma_list; unsigned long lock_limit; + unsigned long new_pinned; unsigned long cur_base; + struct mm_struct *mm; unsigned long npages; int ret; int i; @@ -107,25 +109,32 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (!can_do_mlock()) return ERR_PTR(-EPERM); - umem = kzalloc(sizeof *umem, GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); + if (access & IB_ACCESS_ON_DEMAND) { + umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + umem->is_odp = 1; + } else { + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + } umem->context = context; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = ib_access_writable(access); + umem->owning_mm = mm = current->mm; + mmgrab(mm); if (access & IB_ACCESS_ON_DEMAND) { - ret = 
ib_umem_odp_get(context, umem, access); + ret = ib_umem_odp_get(to_ib_umem_odp(umem), access); if (ret) goto umem_kfree; return umem; } - umem->odp_data = NULL; - /* We assume the memory is from hugetlb until proved otherwise */ umem->hugetlb = 1; @@ -144,25 +153,25 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem->hugetlb = 0; npages = ib_umem_num_pages(umem); + if (npages == 0 || npages > UINT_MAX) { + ret = -EINVAL; + goto out; + } lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - down_write(¤t->mm->mmap_sem); - current->mm->pinned_vm += npages; - if ((current->mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) { - up_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); + if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) || + (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) { + up_write(&mm->mmap_sem); ret = -ENOMEM; - goto vma; + goto out; } - up_write(¤t->mm->mmap_sem); + mm->pinned_vm = new_pinned; + up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; - if (npages == 0 || npages > UINT_MAX) { - ret = -EINVAL; - goto vma; - } - ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); if (ret) goto vma; @@ -172,14 +181,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, sg_list_start = umem->sg_head.sgl; - down_read(¤t->mm->mmap_sem); while (npages) { + down_read(&mm->mmap_sem); ret = get_user_pages_longterm(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), gup_flags, page_list, vma_list); if (ret < 0) { - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); goto umem_release; } @@ -187,17 +196,20 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, cur_base += ret * PAGE_SIZE; npages -= ret; + /* Continue to hold the mmap_sem as vma_list access + * needs to be protected. + */ for_each_sg(sg_list_start, sg, ret, i) { if (vma_list && !is_vm_hugetlb_page(vma_list[i])) umem->hugetlb = 0; sg_set_page(sg, page_list[i], PAGE_SIZE, 0); } + up_read(&mm->mmap_sem); /* preparing for next loop */ sg_list_start = sg; } - up_read(¤t->mm->mmap_sem); umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, @@ -216,29 +228,40 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem_release: __ib_umem_release(context->device, umem, 0); vma: - down_write(¤t->mm->mmap_sem); - current->mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); + mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&mm->mmap_sem); out: if (vma_list) free_page((unsigned long) vma_list); free_page((unsigned long) page_list); umem_kfree: - if (ret) + if (ret) { + mmdrop(umem->owning_mm); kfree(umem); + } return ret ? 
ERR_PTR(ret) : umem; } EXPORT_SYMBOL(ib_umem_get); -static void ib_umem_account(struct work_struct *work) +static void __ib_umem_release_tail(struct ib_umem *umem) +{ + mmdrop(umem->owning_mm); + if (umem->is_odp) + kfree(to_ib_umem_odp(umem)); + else + kfree(umem); +} + +static void ib_umem_release_defer(struct work_struct *work) { struct ib_umem *umem = container_of(work, struct ib_umem, work); - down_write(&umem->mm->mmap_sem); - umem->mm->pinned_vm -= umem->diff; - up_write(&umem->mm->mmap_sem); - mmput(umem->mm); - kfree(umem); + down_write(&umem->owning_mm->mmap_sem); + umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&umem->owning_mm->mmap_sem); + + __ib_umem_release_tail(umem); } /** @@ -248,52 +271,36 @@ static void ib_umem_account(struct work_struct *work) void ib_umem_release(struct ib_umem *umem) { struct ib_ucontext *context = umem->context; - struct mm_struct *mm; - struct task_struct *task; - unsigned long diff; - if (umem->odp_data) { - ib_umem_odp_release(umem); + if (umem->is_odp) { + ib_umem_odp_release(to_ib_umem_odp(umem)); + __ib_umem_release_tail(umem); return; } __ib_umem_release(umem->context->device, umem, 1); - task = get_pid_task(umem->context->tgid, PIDTYPE_PID); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); - if (!mm) - goto out; - - diff = ib_umem_num_pages(umem); - /* * We may be called with the mm's mmap_sem already held. This * can happen when a userspace munmap() is the call that drops * the last reference to our file and calls our release * method. If there are memory regions to destroy, we'll end * up here and not be able to take the mmap_sem. In that case - * we defer the vm_locked accounting to the system workqueue. + * we defer the vm_locked accounting to a workqueue. 
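/*
 * ib_umem_release() above keeps the old trylock-or-defer trick but pins the
 * mm with mmgrab()/mmdrop() instead of walking tgid->task->mm: if mmap_sem
 * cannot be taken on the close path, the pinned-pages accounting moves to a
 * workqueue rather than deadlocking. Minimal sketch, hypothetical demo_*
 * names, kernel 4.19-era mmap_sem/pinned_vm fields assumed:
 */
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_region {
	struct mm_struct *owning_mm;	/* held via mmgrab() */
	unsigned long npages;
	struct work_struct work;
};

static void demo_release_defer(struct work_struct *work)
{
	struct demo_region *r = container_of(work, struct demo_region, work);

	down_write(&r->owning_mm->mmap_sem);
	r->owning_mm->pinned_vm -= r->npages;
	up_write(&r->owning_mm->mmap_sem);
	mmdrop(r->owning_mm);
	kfree(r);
}

static void demo_release(struct demo_region *r, bool closing)
{
	if (closing && !down_write_trylock(&r->owning_mm->mmap_sem)) {
		INIT_WORK(&r->work, demo_release_defer);
		queue_work(system_wq, &r->work);	/* account later */
		return;
	}
	if (!closing)
		down_write(&r->owning_mm->mmap_sem);
	r->owning_mm->pinned_vm -= r->npages;
	up_write(&r->owning_mm->mmap_sem);
	mmdrop(r->owning_mm);
	kfree(r);
}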
*/ if (context->closing) { - if (!down_write_trylock(&mm->mmap_sem)) { - INIT_WORK(&umem->work, ib_umem_account); - umem->mm = mm; - umem->diff = diff; - + if (!down_write_trylock(&umem->owning_mm->mmap_sem)) { + INIT_WORK(&umem->work, ib_umem_release_defer); queue_work(ib_wq, &umem->work); return; } - } else - down_write(&mm->mmap_sem); + } else { + down_write(&umem->owning_mm->mmap_sem); + } + umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&umem->owning_mm->mmap_sem); - mm->pinned_vm -= diff; - up_write(&mm->mmap_sem); - mmput(mm); -out: - kfree(umem); + __ib_umem_release_tail(umem); } EXPORT_SYMBOL(ib_umem_release); @@ -303,7 +310,7 @@ int ib_umem_page_count(struct ib_umem *umem) int n; struct scatterlist *sg; - if (umem->odp_data) + if (umem->is_odp) return ib_umem_num_pages(umem); n = 0; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 6ec748eccff7..2b4c5e7dd5a1 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -58,7 +58,7 @@ static u64 node_start(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_start(umem_odp->umem); + return ib_umem_start(&umem_odp->umem); } /* Note that the representation of the intervals in the interval tree @@ -71,140 +71,86 @@ static u64 node_last(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_end(umem_odp->umem) - 1; + return ib_umem_end(&umem_odp->umem) - 1; } INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, node_start, node_last, static, rbt_ib_umem) -static void ib_umem_notifier_start_account(struct ib_umem *item) +static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { - mutex_lock(&item->odp_data->umem_mutex); - - /* Only update private counters for this umem if it has them. - * Otherwise skip it. All page faults will be delayed for this umem. */ - if (item->odp_data->mn_counters_active) { - int notifiers_count = item->odp_data->notifiers_count++; - - if (notifiers_count == 0) - /* Initialize the completion object for waiting on - * notifiers. Since notifier_count is zero, no one - * should be waiting right now. */ - reinit_completion(&item->odp_data->notifier_completion); - } - mutex_unlock(&item->odp_data->umem_mutex); -} - -static void ib_umem_notifier_end_account(struct ib_umem *item) -{ - mutex_lock(&item->odp_data->umem_mutex); - - /* Only update private counters for this umem if it has them. - * Otherwise skip it. All page faults will be delayed for this umem. */ - if (item->odp_data->mn_counters_active) { + mutex_lock(&umem_odp->umem_mutex); + if (umem_odp->notifiers_count++ == 0) /* - * This sequence increase will notify the QP page fault that - * the page that is going to be mapped in the spte could have - * been freed. + * Initialize the completion object for waiting on + * notifiers. Since notifier_count is zero, no one should be + * waiting right now. */ - ++item->odp_data->notifiers_seq; - if (--item->odp_data->notifiers_count == 0) - complete_all(&item->odp_data->notifier_completion); - } - mutex_unlock(&item->odp_data->umem_mutex); + reinit_completion(&umem_odp->notifier_completion); + mutex_unlock(&umem_odp->umem_mutex); } -/* Account for a new mmu notifier in an ib_ucontext. 
*/ -static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) +static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) { - atomic_inc(&context->notifier_count); + mutex_lock(&umem_odp->umem_mutex); + /* + * This sequence increase will notify the QP page fault that the page + * that is going to be mapped in the spte could have been freed. + */ + ++umem_odp->notifiers_seq; + if (--umem_odp->notifiers_count == 0) + complete_all(&umem_odp->notifier_completion); + mutex_unlock(&umem_odp->umem_mutex); } -/* Account for a terminating mmu notifier in an ib_ucontext. - * - * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since - * the function takes the semaphore itself. */ -static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) +static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, + u64 start, u64 end, void *cookie) { - int zero_notifiers = atomic_dec_and_test(&context->notifier_count); - - if (zero_notifiers && - !list_empty(&context->no_private_counters)) { - /* No currently running mmu notifiers. Now is the chance to - * add private accounting to all previously added umems. */ - struct ib_umem_odp *odp_data, *next; - - /* Prevent concurrent mmu notifiers from working on the - * no_private_counters list. */ - down_write(&context->umem_rwsem); - - /* Read the notifier_count again, with the umem_rwsem - * semaphore taken for write. */ - if (!atomic_read(&context->notifier_count)) { - list_for_each_entry_safe(odp_data, next, - &context->no_private_counters, - no_private_counters) { - mutex_lock(&odp_data->umem_mutex); - odp_data->mn_counters_active = true; - list_del(&odp_data->no_private_counters); - complete_all(&odp_data->notifier_completion); - mutex_unlock(&odp_data->umem_mutex); - } - } - - up_write(&context->umem_rwsem); - } -} + struct ib_umem *umem = &umem_odp->umem; -static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, - u64 end, void *cookie) { /* * Increase the number of notifiers running, to * prevent any further fault handling on this MR. */ - ib_umem_notifier_start_account(item); - item->odp_data->dying = 1; + ib_umem_notifier_start_account(umem_odp); + umem_odp->dying = 1; /* Make sure that the fact the umem is dying is out before we release * all pending page faults. 
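/*
 * The dying flag above is published with smp_wmb() before complete_all() so
 * that a fault handler which sees the completion also observes dying != 0.
 * The pairing needs a read barrier on the consumer side; a condensed sketch
 * with hypothetical demo_* variables:
 */
#include <linux/compiler.h>
#include <asm/barrier.h>

static int demo_payload;
static int demo_dying;

static void demo_publish(void)
{
	demo_payload = 1;	/* everything written before the flag */
	smp_wmb();
	WRITE_ONCE(demo_dying, 1);
}

static int demo_consume(void)
{
	if (!READ_ONCE(demo_dying))
		return -EAGAIN;
	smp_rmb();		/* pairs with smp_wmb() in demo_publish() */
	return demo_payload;	/* guaranteed to observe demo_payload == 1 */
}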
*/ smp_wmb(); - complete_all(&item->odp_data->notifier_completion); - item->context->invalidate_range(item, ib_umem_start(item), - ib_umem_end(item)); + complete_all(&umem_odp->notifier_completion); + umem->context->invalidate_range(umem_odp, ib_umem_start(umem), + ib_umem_end(umem)); return 0; } static void ib_umem_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); - - if (!context->invalidate_range) - return; - - ib_ucontext_notifier_start_account(context); - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, - ULLONG_MAX, - ib_umem_notifier_release_trampoline, - true, - NULL); - up_read(&context->umem_rwsem); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); + + down_read(&per_mm->umem_rwsem); + if (per_mm->active) + rbt_ib_umem_for_each_in_range( + &per_mm->umem_tree, 0, ULLONG_MAX, + ib_umem_notifier_release_trampoline, true, NULL); + up_read(&per_mm->umem_rwsem); } -static int invalidate_page_trampoline(struct ib_umem *item, u64 start, +static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->context->invalidate_range(item, start, start + PAGE_SIZE); + item->umem.context->invalidate_range(item, start, start + PAGE_SIZE); ib_umem_notifier_end_account(item); return 0; } -static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, - u64 end, void *cookie) +static int invalidate_range_start_trampoline(struct ib_umem_odp *item, + u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->context->invalidate_range(item, start, end); + item->umem.context->invalidate_range(item, start, end); return 0; } @@ -214,28 +160,30 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, unsigned long end, bool blockable) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); - int ret; - - if (!context->invalidate_range) - return 0; + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); if (blockable) - down_read(&context->umem_rwsem); - else if (!down_read_trylock(&context->umem_rwsem)) + down_read(&per_mm->umem_rwsem); + else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - ib_ucontext_notifier_start_account(context); - ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start, - end, - invalidate_range_start_trampoline, - blockable, NULL); - up_read(&context->umem_rwsem); + if (!per_mm->active) { + up_read(&per_mm->umem_rwsem); + /* + * At this point active is permanently set and visible to this + * CPU without a lock, that fact is relied on to skip the unlock + * in range_end. 
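/*
 * ib_umem_notifier_invalidate_range_start() above follows the blockable
 * notifier contract: in non-blocking contexts it may only trylock and must
 * report -EAGAIN, while the read side of umem_rwsem stays held until
 * range_end(). Minimal sketch of the contract, hypothetical demo_ name:
 */
#include <linux/errno.h>
#include <linux/rwsem.h>

static int demo_range_start(struct rw_semaphore *sem, bool blockable)
{
	if (blockable)
		down_read(sem);
	else if (!down_read_trylock(sem))
		return -EAGAIN;	/* caller retries from a blockable context */

	/* ... mark invalidation in progress ... */
	return 0;	/* 'sem' intentionally left held for range_end() */
}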
+ */ + return 0; + } - return ret; + return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, + invalidate_range_start_trampoline, + blockable, NULL); } -static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, +static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_end_account(item); @@ -247,22 +195,16 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, unsigned long start, unsigned long end) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); - if (!context->invalidate_range) + if (unlikely(!per_mm->active)) return; - /* - * TODO: we currently bail out if there is any sleepable work to be done - * in ib_umem_notifier_invalidate_range_start so we shouldn't really block - * here. But this is ugly and fragile. - */ - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_end_trampoline, true, NULL); - up_read(&context->umem_rwsem); - ib_ucontext_notifier_end_account(context); + up_read(&per_mm->umem_rwsem); } static const struct mmu_notifier_ops ib_umem_notifiers = { @@ -271,31 +213,158 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { .invalidate_range_end = ib_umem_notifier_invalidate_range_end, }; -struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size) +static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) { - struct ib_umem *umem; + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_umem *umem = &umem_odp->umem; + + down_write(&per_mm->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_insert(&umem_odp->interval_tree, + &per_mm->umem_tree); + up_write(&per_mm->umem_rwsem); +} + +static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_umem *umem = &umem_odp->umem; + + down_write(&per_mm->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_remove(&umem_odp->interval_tree, + &per_mm->umem_tree); + complete_all(&umem_odp->notifier_completion); + + up_write(&per_mm->umem_rwsem); +} + +static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, + struct mm_struct *mm) +{ + struct ib_ucontext_per_mm *per_mm; + int ret; + + per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); + if (!per_mm) + return ERR_PTR(-ENOMEM); + + per_mm->context = ctx; + per_mm->mm = mm; + per_mm->umem_tree = RB_ROOT_CACHED; + init_rwsem(&per_mm->umem_rwsem); + per_mm->active = ctx->invalidate_range; + + rcu_read_lock(); + per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + rcu_read_unlock(); + + WARN_ON(mm != current->mm); + + per_mm->mn.ops = &ib_umem_notifiers; + ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); + if (ret) { + dev_err(&ctx->device->dev, + "Failed to register mmu_notifier %d\n", ret); + goto out_pid; + } + + list_add(&per_mm->ucontext_list, &ctx->per_mm_list); + return per_mm; + +out_pid: + put_pid(per_mm->tgid); + kfree(per_mm); + return ERR_PTR(ret); +} + +static int get_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext *ctx = umem_odp->umem.context; + struct ib_ucontext_per_mm *per_mm; + + /* + * Generally speaking we expect only one or two per_mm in this list, + * so no reason to optimize this 
search today. + */ + mutex_lock(&ctx->per_mm_list_lock); + list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { + if (per_mm->mm == umem_odp->umem.owning_mm) + goto found; + } + + per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm); + if (IS_ERR(per_mm)) { + mutex_unlock(&ctx->per_mm_list_lock); + return PTR_ERR(per_mm); + } + +found: + umem_odp->per_mm = per_mm; + per_mm->odp_mrs_count++; + mutex_unlock(&ctx->per_mm_list_lock); + + return 0; +} + +static void free_per_mm(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); +} + +void put_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_ucontext *ctx = umem_odp->umem.context; + bool need_free; + + mutex_lock(&ctx->per_mm_list_lock); + umem_odp->per_mm = NULL; + per_mm->odp_mrs_count--; + need_free = per_mm->odp_mrs_count == 0; + if (need_free) + list_del(&per_mm->ucontext_list); + mutex_unlock(&ctx->per_mm_list_lock); + + if (!need_free) + return; + + /* + * NOTE! mmu_notifier_unregister() can happen between a start/end + * callback, resulting in an unpaired start/end, and thus an unbalanced + * lock. This doesn't really matter to us since we are about to kfree + * the memory that holds the lock, however LOCKDEP doesn't like this. + */ + down_write(&per_mm->umem_rwsem); + per_mm->active = false; + up_write(&per_mm->umem_rwsem); + + WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); + mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); + put_pid(per_mm->tgid); + mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); +} + +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, + unsigned long addr, size_t size) +{ + struct ib_ucontext *ctx = per_mm->context; struct ib_umem_odp *odp_data; + struct ib_umem *umem; int pages = size >> PAGE_SHIFT; int ret; - umem = kzalloc(sizeof(*umem), GFP_KERNEL); - if (!umem) + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); + if (!odp_data) return ERR_PTR(-ENOMEM); - - umem->context = context; + umem = &odp_data->umem; + umem->context = ctx; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = 1; - - odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); - if (!odp_data) { - ret = -ENOMEM; - goto out_umem; - } - odp_data->umem = umem; + umem->is_odp = 1; + odp_data->per_mm = per_mm; mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -314,39 +383,34 @@ struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, goto out_page_list; } - down_write(&context->umem_rwsem); - context->odp_mrs_count++; - rbt_ib_umem_insert(&odp_data->interval_tree, &context->umem_tree); - if (likely(!atomic_read(&context->notifier_count))) - odp_data->mn_counters_active = true; - else - list_add(&odp_data->no_private_counters, - &context->no_private_counters); - up_write(&context->umem_rwsem); - - umem->odp_data = odp_data; + /* + * Caller must ensure that the umem_odp that the per_mm came from + * cannot be freed during the call to ib_alloc_odp_umem. 
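/*
 * get_per_mm() above is a lookup-or-create cache keyed by mm_struct: one
 * ib_ucontext_per_mm is shared by all ODP umems of the same mm, reference
 * counted under a mutex, and finally freed through SRCU once the notifier
 * is unregistered. A condensed sketch of the lookup-or-create half, with
 * hypothetical demo_* names:
 */
#include <linux/list.h>
#include <linux/mm_types.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct demo_per_mm {
	struct list_head node;
	struct mm_struct *mm;
	unsigned int refcount;
};

static LIST_HEAD(demo_per_mm_list);
static DEFINE_MUTEX(demo_per_mm_lock);

static struct demo_per_mm *demo_get_per_mm(struct mm_struct *mm)
{
	struct demo_per_mm *p;

	mutex_lock(&demo_per_mm_lock);
	list_for_each_entry(p, &demo_per_mm_list, node)
		if (p->mm == mm)
			goto found;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p) {
		mutex_unlock(&demo_per_mm_lock);
		return NULL;
	}
	p->mm = mm;
	list_add(&p->node, &demo_per_mm_list);
found:
	p->refcount++;
	mutex_unlock(&demo_per_mm_lock);
	return p;
}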
+ */ + mutex_lock(&ctx->per_mm_list_lock); + per_mm->odp_mrs_count++; + mutex_unlock(&ctx->per_mm_list_lock); + add_umem_to_per_mm(odp_data); - return umem; + return odp_data; out_page_list: vfree(odp_data->page_list); out_odp_data: kfree(odp_data); -out_umem: - kfree(umem); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_alloc_odp_umem); -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, - int access) +int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { + struct ib_umem *umem = &umem_odp->umem; + /* + * NOTE: This must be called in a process context where umem->owning_mm + * == current->mm + */ + struct mm_struct *mm = umem->owning_mm; int ret_val; - struct pid *our_pid; - struct mm_struct *mm = get_task_mm(current); - - if (!mm) - return -EINVAL; if (access & IB_ACCESS_HUGETLB) { struct vm_area_struct *vma; @@ -366,111 +430,43 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, umem->hugetlb = 0; } - /* Prevent creating ODP MRs in child processes */ - rcu_read_lock(); - our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); - rcu_read_unlock(); - put_pid(our_pid); - if (context->tgid != our_pid) { - ret_val = -EINVAL; - goto out_mm; - } - - umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); - if (!umem->odp_data) { - ret_val = -ENOMEM; - goto out_mm; - } - umem->odp_data->umem = umem; - - mutex_init(&umem->odp_data->umem_mutex); + mutex_init(&umem_odp->umem_mutex); - init_completion(&umem->odp_data->notifier_completion); + init_completion(&umem_odp->notifier_completion); if (ib_umem_num_pages(umem)) { - umem->odp_data->page_list = - vzalloc(array_size(sizeof(*umem->odp_data->page_list), + umem_odp->page_list = + vzalloc(array_size(sizeof(*umem_odp->page_list), ib_umem_num_pages(umem))); - if (!umem->odp_data->page_list) { - ret_val = -ENOMEM; - goto out_odp_data; - } + if (!umem_odp->page_list) + return -ENOMEM; - umem->odp_data->dma_list = - vzalloc(array_size(sizeof(*umem->odp_data->dma_list), + umem_odp->dma_list = + vzalloc(array_size(sizeof(*umem_odp->dma_list), ib_umem_num_pages(umem))); - if (!umem->odp_data->dma_list) { + if (!umem_odp->dma_list) { ret_val = -ENOMEM; goto out_page_list; } } - /* - * When using MMU notifiers, we will get a - * notification before the "current" task (and MM) is - * destroyed. We use the umem_rwsem semaphore to synchronize. - */ - down_write(&context->umem_rwsem); - context->odp_mrs_count++; - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_insert(&umem->odp_data->interval_tree, - &context->umem_tree); - if (likely(!atomic_read(&context->notifier_count)) || - context->odp_mrs_count == 1) - umem->odp_data->mn_counters_active = true; - else - list_add(&umem->odp_data->no_private_counters, - &context->no_private_counters); - downgrade_write(&context->umem_rwsem); - - if (context->odp_mrs_count == 1) { - /* - * Note that at this point, no MMU notifier is running - * for this context! - */ - atomic_set(&context->notifier_count, 0); - INIT_HLIST_NODE(&context->mn.hlist); - context->mn.ops = &ib_umem_notifiers; - /* - * Lock-dep detects a false positive for mmap_sem vs. - * umem_rwsem, due to not grasping downgrade_write correctly. 
- */ - lockdep_off(); - ret_val = mmu_notifier_register(&context->mn, mm); - lockdep_on(); - if (ret_val) { - pr_err("Failed to register mmu_notifier %d\n", ret_val); - ret_val = -EBUSY; - goto out_mutex; - } - } - - up_read(&context->umem_rwsem); + ret_val = get_per_mm(umem_odp); + if (ret_val) + goto out_dma_list; + add_umem_to_per_mm(umem_odp); - /* - * Note that doing an mmput can cause a notifier for the relevant mm. - * If the notifier is called while we hold the umem_rwsem, this will - * cause a deadlock. Therefore, we release the reference only after we - * released the semaphore. - */ - mmput(mm); return 0; -out_mutex: - up_read(&context->umem_rwsem); - vfree(umem->odp_data->dma_list); +out_dma_list: + vfree(umem_odp->dma_list); out_page_list: - vfree(umem->odp_data->page_list); -out_odp_data: - kfree(umem->odp_data); -out_mm: - mmput(mm); + vfree(umem_odp->page_list); return ret_val; } -void ib_umem_odp_release(struct ib_umem *umem) +void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { - struct ib_ucontext *context = umem->context; + struct ib_umem *umem = &umem_odp->umem; /* * Ensure that no more pages are mapped in the umem. @@ -478,61 +474,13 @@ void ib_umem_odp_release(struct ib_umem *umem) * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. */ - ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); - down_write(&context->umem_rwsem); - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_remove(&umem->odp_data->interval_tree, - &context->umem_tree); - context->odp_mrs_count--; - if (!umem->odp_data->mn_counters_active) { - list_del(&umem->odp_data->no_private_counters); - complete_all(&umem->odp_data->notifier_completion); - } - - /* - * Downgrade the lock to a read lock. This ensures that the notifiers - * (who lock the mutex for reading) will be able to finish, and we - * will be able to enventually obtain the mmu notifiers SRCU. Note - * that since we are doing it atomically, no other user could register - * and unregister while we do the check. - */ - downgrade_write(&context->umem_rwsem); - if (!context->odp_mrs_count) { - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - - owning_process = get_pid_task(context->tgid, - PIDTYPE_PID); - if (owning_process == NULL) - /* - * The process is already dead, notifier were removed - * already. - */ - goto out; - - owning_mm = get_task_mm(owning_process); - if (owning_mm == NULL) - /* - * The process' mm is already dead, notifier were - * removed already. - */ - goto out_put_task; - mmu_notifier_unregister(&context->mn, owning_mm); - - mmput(owning_mm); - -out_put_task: - put_task_struct(owning_process); - } -out: - up_read(&context->umem_rwsem); - - vfree(umem->odp_data->dma_list); - vfree(umem->odp_data->page_list); - kfree(umem->odp_data); - kfree(umem); + remove_umem_from_per_mm(umem_odp); + put_per_mm(umem_odp); + vfree(umem_odp->dma_list); + vfree(umem_odp->page_list); } /* @@ -544,7 +492,7 @@ out: * @access_mask: access permissions needed for this page. * @current_seq: sequence number for synchronization with invalidations. * the sequence number is taken from - * umem->odp_data->notifiers_seq. + * umem_odp->notifiers_seq. * * The function returns -EFAULT if the DMA mapping operation fails. It returns * -EAGAIN if a concurrent invalidation prevents us from updating the page. @@ -554,12 +502,13 @@ out: * umem. 
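/*
 * The fault path below is guarded by a sequence-count style retry: the
 * caller samples notifiers_seq before pinning pages, and
 * ib_umem_mmu_notifier_retry() refuses the update when an invalidation
 * started or completed in between. Condensed sketch, hypothetical demo_*
 * names, to be called with the per-umem mutex held:
 */
struct demo_odp_counters {
	unsigned int notifiers_seq;
	unsigned int notifiers_count;
};

static int demo_notifier_retry(struct demo_odp_counters *c,
			       unsigned long current_seq)
{
	if (c->notifiers_count)
		return 1;	/* invalidation in flight: retry the fault */
	if (c->notifiers_seq != current_seq)
		return 1;	/* an invalidation completed: mapping stale */
	return 0;
}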
*/ static int ib_umem_odp_map_dma_single_page( - struct ib_umem *umem, + struct ib_umem_odp *umem_odp, int page_index, struct page *page, u64 access_mask, unsigned long current_seq) { + struct ib_umem *umem = &umem_odp->umem; struct ib_device *dev = umem->context->device; dma_addr_t dma_addr; int stored_page = 0; @@ -571,11 +520,11 @@ static int ib_umem_odp_map_dma_single_page( * handle case of a racing notifier. This check also allows us to bail * early if we have a notifier running in parallel with us. */ - if (ib_umem_mmu_notifier_retry(umem, current_seq)) { + if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) { ret = -EAGAIN; goto out; } - if (!(umem->odp_data->dma_list[page_index])) { + if (!(umem_odp->dma_list[page_index])) { dma_addr = ib_dma_map_page(dev, page, 0, BIT(umem->page_shift), @@ -584,15 +533,15 @@ static int ib_umem_odp_map_dma_single_page( ret = -EFAULT; goto out; } - umem->odp_data->dma_list[page_index] = dma_addr | access_mask; - umem->odp_data->page_list[page_index] = page; + umem_odp->dma_list[page_index] = dma_addr | access_mask; + umem_odp->page_list[page_index] = page; umem->npages++; stored_page = 1; - } else if (umem->odp_data->page_list[page_index] == page) { - umem->odp_data->dma_list[page_index] |= access_mask; + } else if (umem_odp->page_list[page_index] == page) { + umem_odp->dma_list[page_index] |= access_mask; } else { pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", - umem->odp_data->page_list[page_index], page); + umem_odp->page_list[page_index], page); /* Better remove the mapping now, to prevent any further * damage. */ remove_existing_mapping = 1; @@ -605,7 +554,7 @@ out: if (remove_existing_mapping && umem->context->invalidate_range) { invalidate_page_trampoline( - umem, + umem_odp, ib_umem_start(umem) + (page_index >> umem->page_shift), ib_umem_start(umem) + ((page_index + 1) >> umem->page_shift), @@ -621,7 +570,7 @@ out: * * Pins the range of pages passed in the argument, and maps them to * DMA addresses. The DMA addresses of the mapped pages are updated in - * umem->odp_data->dma_list. + * umem_odp->dma_list. * * Returns the number of pages mapped on success, negative error code * for failure. @@ -629,7 +578,7 @@ out: * the function from completing its task. * An -ENOENT error code indicates that the userspace process is being terminated * and mm was already destroyed. - * @umem: the umem to map and pin + * @umem_odp: the umem to map and pin * @user_virt: the address from which we need to map. * @bcnt: the minimal number of bytes to pin and map. The mapping might be * bigger due to alignment, and may also be smaller in case of an error @@ -639,13 +588,15 @@ out: * range. * @current_seq: the MMU notifiers sequence value for synchronization with * invalidations.
the sequence number is read from - * umem->odp_data->notifiers_seq before calling this function + * umem_odp->notifiers_seq before calling this function */ -int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, - u64 access_mask, unsigned long current_seq) +int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, + u64 bcnt, u64 access_mask, + unsigned long current_seq) { + struct ib_umem *umem = &umem_odp->umem; struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; + struct mm_struct *owning_mm = umem_odp->umem.owning_mm; struct page **local_page_list = NULL; u64 page_mask, off; int j, k, ret = 0, start_idx, npages = 0, page_shift; @@ -669,15 +620,14 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, user_virt = user_virt & page_mask; bcnt += off; /* Charge for the first page offset as well. */ - owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); - if (owning_process == NULL) { + /* + * owning_process is allowed to be NULL; this means the mm somehow + * exists beyond the lifetime of the originating process. Presumably + * mmget_not_zero will fail in this case. + */ + owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID); + if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) { ret = -EINVAL; - goto out_no_task; - } - - owning_mm = get_task_mm(owning_process); - if (owning_mm == NULL) { - ret = -ENOENT; goto out_put_task; } @@ -709,7 +659,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, break; bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); - mutex_lock(&umem->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { if (user_virt & ~page_mask) { p += PAGE_SIZE; @@ -722,7 +672,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, } ret = ib_umem_odp_map_dma_single_page( - umem, k, local_page_list[j], + umem_odp, k, local_page_list[j], access_mask, current_seq); if (ret < 0) break; p = page_to_phys(local_page_list[j]); k++; } - mutex_unlock(&umem->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); if (ret < 0) { /* Release left over pages when handling errors. */ @@ -749,16 +699,17 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, mmput(owning_mm); out_put_task: - put_task_struct(owning_process); -out_no_task: + if (owning_process) + put_task_struct(owning_process); free_page((unsigned long)local_page_list); return ret; } EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); -void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { + struct ib_umem *umem = &umem_odp->umem; int idx; u64 addr; struct ib_device *dev = umem->context->device; @@ -770,12 +721,12 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, * faults from completion. We might be racing with other * invalidations, so we must make sure we free each page only * once.
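The typical consumer of ib_umem_odp_map_dma_pages() is a driver page-fault path that snapshots the notifier sequence, maps, and retries on -EAGAIN exactly as the kerneldoc above describes. A hedged sketch of that caller side, loosely modeled on the mlx5 fault handler; handle_odp_fault() and its error handling are illustrative, not kernel code:

#include <rdma/ib_umem_odp.h>

/* Illustrative driver fault path built on the ODP helpers above. */
static int handle_odp_fault(struct ib_umem_odp *umem_odp, u64 io_virt,
			    size_t bcnt, u64 access_mask)
{
	unsigned long current_seq;
	int npages;

	do {
		/* Snapshot the notifier sequence before mapping ... */
		current_seq = READ_ONCE(umem_odp->notifiers_seq);
		/* ... and order that read against the mapping work */
		smp_rmb();

		npages = ib_umem_odp_map_dma_pages(umem_odp, io_virt, bcnt,
						   access_mask, current_seq);
		/*
		 * -EAGAIN: an invalidation ran concurrently, so the pages
		 * just mapped may already be stale; restart with a fresh
		 * sequence number.
		 */
	} while (npages == -EAGAIN);

	return npages < 0 ? npages : 0;
}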
*/ - mutex_lock(&umem->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) { idx = (addr - ib_umem_start(umem)) >> umem->page_shift; - if (umem->odp_data->page_list[idx]) { - struct page *page = umem->odp_data->page_list[idx]; - dma_addr_t dma = umem->odp_data->dma_list[idx]; + if (umem_odp->page_list[idx]) { + struct page *page = umem_odp->page_list[idx]; + dma_addr_t dma = umem_odp->dma_list[idx]; dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; WARN_ON(!dma_addr); @@ -798,12 +749,12 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, /* on demand pinning support */ if (!umem->context->invalidate_range) put_page(page); - umem->odp_data->page_list[idx] = NULL; - umem->odp_data->dma_list[idx] = 0; + umem_odp->page_list[idx] = NULL; + umem_odp->dma_list[idx] = 0; umem->npages--; } } - mutex_unlock(&umem->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); } EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); @@ -830,7 +781,7 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, return -EAGAIN; next = rbt_ib_umem_iter_next(node, start, last - 1); umem = container_of(node, struct ib_umem_odp, interval_tree); - ret_val = cb(umem->umem, start, last, cookie) || ret_val; + ret_val = cb(umem, start, last, cookie) || ret_val; } return ret_val; diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index c34a6852d691..f55f48f6b272 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -138,7 +138,7 @@ static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) + static dev_t dynamic_umad_dev; static dev_t dynamic_issm_dev; -static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); +static DEFINE_IDA(umad_ida); static void ib_umad_add_one(struct ib_device *device); static void ib_umad_remove_one(struct ib_device *device, void *client_data); @@ -1132,7 +1132,7 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, if (!port) return -ENODEV; - return sprintf(buf, "%s\n", port->ib_dev->name); + return sprintf(buf, "%s\n", dev_name(&port->ib_dev->dev)); } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); @@ -1159,11 +1159,10 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, dev_t base_umad; dev_t base_issm; - devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); - if (devnum >= IB_UMAD_MAX_PORTS) + devnum = ida_alloc_max(&umad_ida, IB_UMAD_MAX_PORTS - 1, GFP_KERNEL); + if (devnum < 0) return -1; port->dev_num = devnum; - set_bit(devnum, dev_map); if (devnum >= IB_UMAD_NUM_FIXED_MINOR) { base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; @@ -1227,7 +1226,7 @@ err_dev: err_cdev: cdev_del(&port->cdev); - clear_bit(devnum, dev_map); + ida_free(&umad_ida, devnum); return -1; } @@ -1261,7 +1260,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port) } mutex_unlock(&port->file_mutex); - clear_bit(port->dev_num, dev_map); + ida_free(&umad_ida, port->dev_num); } static void ib_umad_add_one(struct ib_device *device) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 5df8e548cc14..c97935a0c7c6 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -100,13 +100,14 @@ struct ib_uverbs_device { atomic_t refcount; int num_comp_vectors; struct completion comp; - struct device *dev; + struct device dev; + /* First group for device attributes, 
NULL terminated array */ + const struct attribute_group *groups[2]; struct ib_device __rcu *ib_dev; int devnum; struct cdev cdev; struct rb_root xrcd_tree; struct mutex xrcd_tree_mutex; - struct kobject kobj; struct srcu_struct disassociate_srcu; struct mutex lists_mutex; /* protect lists */ struct list_head uverbs_file_list; @@ -146,7 +147,6 @@ struct ib_uverbs_file { struct ib_event_handler event_handler; struct ib_uverbs_async_event_file *async_file; struct list_head list; - int is_closed; /* * To access the uobjects list hw_destroy_rwsem must be held for write @@ -158,6 +158,9 @@ struct ib_uverbs_file { spinlock_t uobjects_lock; struct list_head uobjects; + struct mutex umap_lock; + struct list_head umaps; + u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; @@ -218,12 +221,6 @@ struct ib_ucq_object { u32 async_events_reported; }; -struct ib_uflow_resources; -struct ib_uflow_object { - struct ib_uobject uobject; - struct ib_uflow_resources *resources; -}; - extern const struct file_operations uverbs_event_fops; void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue); struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file, diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index e012ca80f9d1..a93853770e3c 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -117,18 +117,12 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, /* ufile is required when some objects are released */ ucontext->ufile = file; - rcu_read_lock(); - ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); - rcu_read_unlock(); - ucontext->closing = 0; + ucontext->closing = false; ucontext->cleanup_retryable = false; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - ucontext->umem_tree = RB_ROOT_CACHED; - init_rwsem(&ucontext->umem_rwsem); - ucontext->odp_mrs_count = 0; - INIT_LIST_HEAD(&ucontext->no_private_counters); - + mutex_init(&ucontext->per_mm_list_lock); + INIT_LIST_HEAD(&ucontext->per_mm_list); if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; @@ -172,7 +166,6 @@ err_fd: put_unused_fd(resp.async_fd); err_free: - put_pid(ucontext->tgid); ib_dev->dealloc_ucontext(ucontext); err_alloc: @@ -2769,16 +2762,7 @@ out_put: return ret ? 
ret : in_len; } -struct ib_uflow_resources { - size_t max; - size_t num; - size_t collection_num; - size_t counters_num; - struct ib_counters **counters; - struct ib_flow_action **collection; -}; - -static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) +struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) { struct ib_uflow_resources *resources; @@ -2808,6 +2792,7 @@ err: return NULL; } +EXPORT_SYMBOL(flow_resources_alloc); void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) { @@ -2826,10 +2811,11 @@ void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) kfree(uflow_res->counters); kfree(uflow_res); } +EXPORT_SYMBOL(ib_uverbs_flow_resources_free); -static void flow_resources_add(struct ib_uflow_resources *uflow_res, - enum ib_flow_spec_type type, - void *ibobj) +void flow_resources_add(struct ib_uflow_resources *uflow_res, + enum ib_flow_spec_type type, + void *ibobj) { WARN_ON(uflow_res->num >= uflow_res->max); @@ -2850,6 +2836,7 @@ static void flow_resources_add(struct ib_uflow_resources *uflow_res, uflow_res->num++; } +EXPORT_SYMBOL(flow_resources_add); static int kern_spec_to_ib_spec_action(struct ib_uverbs_file *ufile, struct ib_uverbs_flow_spec *kern_spec, @@ -3484,7 +3471,6 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; struct ib_uobject *uobj; - struct ib_uflow_object *uflow; struct ib_flow *flow_id; struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; @@ -3623,13 +3609,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, err = PTR_ERR(flow_id); goto err_free; } - atomic_inc(&qp->usecnt); - flow_id->qp = qp; - flow_id->device = qp->device; - flow_id->uobject = uobj; - uobj->object = flow_id; - uflow = container_of(uobj, typeof(*uflow), uobject); - uflow->resources = uflow_res; + + ib_set_flow(uobj, flow_id, qp, qp->device, uflow_res); memset(&resp, 0, sizeof(resp)); resp.flow_handle = uobj->id; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 1a6b229e3db3..b0e493e8d860 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -57,6 +57,7 @@ struct bundle_priv { struct ib_uverbs_attr *uattrs; DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN); + DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN); /* * Must be last. bundle ends in a flex array which overlaps @@ -143,6 +144,86 @@ static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, 0, uattr->len - len); } +static int uverbs_process_idrs_array(struct bundle_priv *pbundle, + const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + struct ib_uverbs_attr *uattr, + u32 attr_bkey) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + size_t array_len; + u32 *idr_vals; + int ret = 0; + size_t i; + + if (uattr->attr_data.reserved) + return -EINVAL; + + if (uattr->len % sizeof(u32)) + return -EINVAL; + + array_len = uattr->len / sizeof(u32); + if (array_len < spec->u2.objs_arr.min_len || + array_len > spec->u2.objs_arr.max_len) + return -EINVAL; + + attr->uobjects = + uverbs_alloc(&pbundle->bundle, + array_size(array_len, sizeof(*attr->uobjects))); + if (IS_ERR(attr->uobjects)) + return PTR_ERR(attr->uobjects); + + /* + * Since idr is 4B and *uobjects is >= 4B, we can use attr->uobjects + * to store idrs array and avoid additional memory allocation. 
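The allocation-sharing trick that uverbs_process_idrs_array() relies on here can also be seen in isolation. A standalone sketch under the same invariant, sizeof(void *) >= sizeof(u32); ids_to_objects() and lookup() are hypothetical helpers, not uverbs code:

#include <linux/slab.h>
#include <linux/string.h>

/* Hypothetical illustration: one allocation holds both the object-pointer
 * array and, overlapping its tail, the u32 id scratch array.
 */
static void **ids_to_objects(const u32 *user_ids, size_t n,
			     void *(*lookup)(u32 id))
{
	void **objs = kmalloc_array(n, sizeof(*objs), GFP_KERNEL);
	u32 *ids;
	size_t i;

	if (!objs)
		return NULL;

	/* The ids occupy the last n * sizeof(u32) bytes of the allocation */
	ids = (u32 *)(objs + n) - n;
	memcpy(ids, user_ids, n * sizeof(*ids));

	/*
	 * Because sizeof(void *) >= sizeof(u32), storing objs[i] can only
	 * clobber ids[j] for j <= i, i.e. ids that were already consumed;
	 * ids[i] is always read before objs[i] is written.
	 */
	for (i = 0; i < n; i++)
		objs[i] = lookup(ids[i]);

	return objs;
}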
The + * idrs array is offset to the end of the uobjects array so we will be + * able to read idr and replace with a pointer. + */ + idr_vals = (u32 *)(attr->uobjects + array_len) - array_len; + + if (uattr->len > sizeof(uattr->data)) { + ret = copy_from_user(idr_vals, u64_to_user_ptr(uattr->data), + uattr->len); + if (ret) + return -EFAULT; + } else { + memcpy(idr_vals, &uattr->data, uattr->len); + } + + for (i = 0; i != array_len; i++) { + attr->uobjects[i] = uverbs_get_uobject_from_file( + spec->u2.objs_arr.obj_type, pbundle->bundle.ufile, + spec->u2.objs_arr.access, idr_vals[i]); + if (IS_ERR(attr->uobjects[i])) { + ret = PTR_ERR(attr->uobjects[i]); + break; + } + } + + attr->len = i; + __set_bit(attr_bkey, pbundle->spec_finalize); + return ret; +} + +static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + bool commit) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + int current_ret; + int ret = 0; + size_t i; + + for (i = 0; i != attr->len; i++) { + current_ret = uverbs_finalize_object( + attr->uobjects[i], spec->u2.objs_arr.access, commit); + if (!ret) + ret = current_ret; + } + + return ret; +} + static int uverbs_process_attr(struct bundle_priv *pbundle, const struct uverbs_api_attr *attr_uapi, struct ib_uverbs_attr *uattr, u32 attr_bkey) @@ -246,6 +327,11 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, } break; + + case UVERBS_ATTR_TYPE_IDRS_ARRAY: + return uverbs_process_idrs_array(pbundle, attr_uapi, + &e->objs_arr_attr, uattr, + attr_bkey); default: return -EOPNOTSUPP; } @@ -300,8 +386,7 @@ static int uverbs_set_attr(struct bundle_priv *pbundle, return -EPROTONOSUPPORT; return 0; } - attr = srcu_dereference( - *slot, &pbundle->bundle.ufile->device->disassociate_srcu); + attr = rcu_dereference_protected(*slot, true); /* Reject duplicate attributes from user-space */ if (test_bit(attr_bkey, pbundle->bundle.attr_present)) @@ -384,6 +469,7 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) unsigned int i; int ret = 0; + /* fast path for simple uobjects */ i = -1; while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len, i + 1)) < key_bitmap_len) { @@ -397,6 +483,30 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) ret = current_ret; } + i = -1; + while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len, + i + 1)) < key_bitmap_len) { + struct uverbs_attr *attr = &pbundle->bundle.attrs[i]; + const struct uverbs_api_attr *attr_uapi; + void __rcu **slot; + int current_ret; + + slot = uapi_get_attr_for_method( + pbundle, + pbundle->method_key | uapi_bkey_to_key_attr(i)); + if (WARN_ON(!slot)) + continue; + + attr_uapi = rcu_dereference_protected(*slot, true); + + if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { + current_ret = uverbs_free_idrs_array( + attr_uapi, &attr->objs_arr_attr, commit); + if (!ret) + ret = current_ret; + } + } + for (memblock = pbundle->allocated_mem; memblock;) { struct bundle_alloc_head *tmp = memblock; @@ -429,7 +539,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, uapi_key_ioctl_method(hdr->method_id)); if (unlikely(!slot)) return -EPROTONOSUPPORT; - method_elm = srcu_dereference(*slot, &ufile->device->disassociate_srcu); + method_elm = rcu_dereference_protected(*slot, true); if (!method_elm->use_stack) { pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL); @@ -461,6 +571,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, memset(pbundle->bundle.attr_present, 0, 
sizeof(pbundle->bundle.attr_present)); memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); + memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize)); ret = ib_uverbs_run_method(pbundle, hdr->num_attrs); destroy_ret = bundle_destroy(pbundle, ret == 0); @@ -611,3 +722,26 @@ int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx, return 0; } EXPORT_SYMBOL(uverbs_copy_to); + +int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val) +{ + const struct uverbs_attr *attr; + + attr = uverbs_attr_get(attrs_bundle, idx); + if (IS_ERR(attr)) { + if ((PTR_ERR(attr) != -ENOENT) || !def_val) + return PTR_ERR(attr); + + *to = *def_val; + } else { + *to = attr->ptr_attr.data; + } + + if (*to < lower_bound || (*to > 0 && (u64)*to > upper_bound)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(_uverbs_get_const); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 50152c1b1004..6d373f5515b7 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -45,6 +45,7 @@ #include <linux/cdev.h> #include <linux/anon_inodes.h> #include <linux/slab.h> +#include <linux/sched/mm.h> #include <linux/uaccess.h> @@ -72,7 +73,7 @@ enum { static dev_t dynamic_uverbs_dev; static struct class *uverbs_class; -static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); +static DEFINE_IDA(uverbs_ida); static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, const char __user *buf, int in_len, @@ -169,20 +170,16 @@ int uverbs_dealloc_mw(struct ib_mw *mw) return ret; } -static void ib_uverbs_release_dev(struct kobject *kobj) +static void ib_uverbs_release_dev(struct device *device) { struct ib_uverbs_device *dev = - container_of(kobj, struct ib_uverbs_device, kobj); + container_of(device, struct ib_uverbs_device, dev); uverbs_destroy_api(dev->uapi); cleanup_srcu_struct(&dev->disassociate_srcu); kfree(dev); } -static struct kobj_type ib_uverbs_dev_ktype = { - .release = ib_uverbs_release_dev, -}; - static void ib_uverbs_release_async_event_file(struct kref *ref) { struct ib_uverbs_async_event_file *file = @@ -265,7 +262,7 @@ void ib_uverbs_release_file(struct kref *ref) if (atomic_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); - kobject_put(&file->device->kobj); + put_device(&file->device->dev); kfree(file); } @@ -817,6 +814,226 @@ out: } /* + * Each time we map IO memory into user space this keeps track of the mapping. + * When the device is hot-unplugged we 'zap' the mmaps in user space to point + * to the zero page and allow the hot unplug to proceed. + * + * This is necessary for cases like PCI physical hot unplug as the actual BAR + * memory may vanish after this and access to it from userspace could MCE. + * + * RDMA drivers supporting disassociation must have their user space designed + * to cope in some way with their IO pages going to the zero page. 
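From a driver's point of view the tracking is invisible; it simply calls the helper from its mmap method. A hedged sketch of such a handler built on the rdma_user_mmap_io() helper added further down in this hunk; example_mmap(), example_ucontext and the db_bar_pfn field are invented for illustration:

#include <rdma/ib_verbs.h>

/* Hypothetical driver ucontext carrying the doorbell BAR location */
struct example_ucontext {
	struct ib_ucontext ibucontext;
	unsigned long db_bar_pfn;
};

static inline struct example_ucontext *to_example_uctx(struct ib_ucontext *uctx)
{
	return container_of(uctx, struct example_ucontext, ibucontext);
}

static int example_mmap(struct ib_ucontext *ucontext,
			struct vm_area_struct *vma)
{
	struct example_ucontext *uctx = to_example_uctx(ucontext);
	unsigned long pfn = uctx->db_bar_pfn + vma->vm_pgoff;

	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
		return -EINVAL;

	/*
	 * The core records this VMA in ufile->umaps, so on hot-unplug
	 * uverbs_user_mmap_disassociate() can zap the PTEs and userspace
	 * sees zero pages instead of a vanished BAR.
	 */
	return rdma_user_mmap_io(ucontext, vma, pfn, PAGE_SIZE,
				 pgprot_noncached(vma->vm_page_prot));
}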
+ */ +struct rdma_umap_priv { + struct vm_area_struct *vma; + struct list_head list; +}; + +static const struct vm_operations_struct rdma_umap_ops; + +static void rdma_umap_priv_init(struct rdma_umap_priv *priv, + struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + + priv->vma = vma; + vma->vm_private_data = priv; + vma->vm_ops = &rdma_umap_ops; + + mutex_lock(&ufile->umap_lock); + list_add(&priv->list, &ufile->umaps); + mutex_unlock(&ufile->umap_lock); +} + +/* + * The VMA has been dup'd, initialize the vm_private_data with a new tracking + * struct + */ +static void rdma_umap_open(struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + struct rdma_umap_priv *opriv = vma->vm_private_data; + struct rdma_umap_priv *priv; + + if (!opriv) + return; + + /* We are racing with disassociation */ + if (!down_read_trylock(&ufile->hw_destroy_rwsem)) + goto out_zap; + /* + * Disassociation already completed, the VMA should already be zapped. + */ + if (!ufile->ucontext) + goto out_unlock; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + goto out_unlock; + rdma_umap_priv_init(priv, vma); + + up_read(&ufile->hw_destroy_rwsem); + return; + +out_unlock: + up_read(&ufile->hw_destroy_rwsem); +out_zap: + /* + * We can't allow the VMA to be created with the actual IO pages, that + * would break our API contract, and it can't be stopped at this + * point, so zap it. + */ + vma->vm_private_data = NULL; + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); +} + +static void rdma_umap_close(struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + struct rdma_umap_priv *priv = vma->vm_private_data; + + if (!priv) + return; + + /* + * The vma holds a reference on the struct file that created it, which + * in turn means that the ib_uverbs_file is guaranteed to exist at + * this point. + */ + mutex_lock(&ufile->umap_lock); + list_del(&priv->list); + mutex_unlock(&ufile->umap_lock); + kfree(priv); +} + +static const struct vm_operations_struct rdma_umap_ops = { + .open = rdma_umap_open, + .close = rdma_umap_close, +}; + +static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, + unsigned long size) +{ + struct ib_uverbs_file *ufile = ucontext->ufile; + struct rdma_umap_priv *priv; + + if (vma->vm_end - vma->vm_start != size) + return ERR_PTR(-EINVAL); + + /* Driver is using this wrong, must be called by ib_uverbs_mmap */ + if (WARN_ON(!vma->vm_file || + vma->vm_file->private_data != ufile)) + return ERR_PTR(-EINVAL); + lockdep_assert_held(&ufile->device->disassociate_srcu); + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return ERR_PTR(-ENOMEM); + return priv; +} + +/* + * Map IO memory into a process. This is to be called by drivers as part of + * their mmap() functions if they wish to send something like PCI-E BAR memory + * to userspace. 
+ */ +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); + + if (IS_ERR(priv)) + return PTR_ERR(priv); + + vma->vm_page_prot = prot; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) { + kfree(priv); + return -EAGAIN; + } + + rdma_umap_priv_init(priv, vma); + return 0; +} +EXPORT_SYMBOL(rdma_user_mmap_io); + +/* + * The page case is here for a slightly different reason: the driver expects + * to be able to free the page it is sharing to user space when it destroys + * its ucontext, which means we need to zap the user space references. + * + * We could handle this differently by providing an API to allocate a shared + * page and then only freeing the shared page when the last ufile is + * destroyed. + */ +int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size) +{ + struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); + + if (IS_ERR(priv)) + return PTR_ERR(priv); + + if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size, + vma->vm_page_prot)) { + kfree(priv); + return -EAGAIN; + } + + rdma_umap_priv_init(priv, vma); + return 0; +} +EXPORT_SYMBOL(rdma_user_mmap_page); + +void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) +{ + struct rdma_umap_priv *priv, *next_priv; + + lockdep_assert_held(&ufile->hw_destroy_rwsem); + + while (1) { + struct mm_struct *mm = NULL; + + /* Get an arbitrary mm pointer that hasn't been cleaned yet */ + mutex_lock(&ufile->umap_lock); + if (!list_empty(&ufile->umaps)) { + mm = list_first_entry(&ufile->umaps, + struct rdma_umap_priv, list) + ->vma->vm_mm; + mmget(mm); + } + mutex_unlock(&ufile->umap_lock); + if (!mm) + return; + + /* + * The umap_lock is nested under mmap_sem since it is used within + * the vma_ops callbacks, so we have to clean the list one mm + * at a time to get the lock ordering right. Typically there + * will only be one mm, so no big deal.
+ */ + down_write(&mm->mmap_sem); + mutex_lock(&ufile->umap_lock); + list_for_each_entry_safe (priv, next_priv, &ufile->umaps, + list) { + struct vm_area_struct *vma = priv->vma; + + if (vma->vm_mm != mm) + continue; + list_del_init(&priv->list); + + zap_vma_ptes(vma, vma->vm_start, + vma->vm_end - vma->vm_start); + vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + } + mutex_unlock(&ufile->umap_lock); + up_write(&mm->mmap_sem); + mmput(mm); + } +} + +/* * ib_uverbs_open() does not need the BKL: * * - the ib_uverbs_device structures are properly reference counted and @@ -839,6 +1056,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) if (!atomic_inc_not_zero(&dev->refcount)) return -ENXIO; + get_device(&dev->dev); srcu_key = srcu_read_lock(&dev->disassociate_srcu); mutex_lock(&dev->lists_mutex); ib_dev = srcu_dereference(dev->ib_dev, @@ -876,9 +1094,10 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) spin_lock_init(&file->uobjects_lock); INIT_LIST_HEAD(&file->uobjects); init_rwsem(&file->hw_destroy_rwsem); + mutex_init(&file->umap_lock); + INIT_LIST_HEAD(&file->umaps); filp->private_data = file; - kobject_get(&dev->kobj); list_add_tail(&file->list, &dev->uverbs_file_list); mutex_unlock(&dev->lists_mutex); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); @@ -899,6 +1118,7 @@ err: if (atomic_dec_and_test(&dev->refcount)) ib_uverbs_comp_dev(dev); + put_device(&dev->dev); return ret; } @@ -909,10 +1129,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE); mutex_lock(&file->device->lists_mutex); - if (!file->is_closed) { - list_del(&file->list); - file->is_closed = 1; - } + list_del_init(&file->list); mutex_unlock(&file->device->lists_mutex); if (file->async_file) @@ -951,37 +1168,34 @@ static struct ib_client uverbs_client = { .remove = ib_uverbs_remove_one }; -static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, +static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, char *buf) { + struct ib_uverbs_device *dev = + container_of(device, struct ib_uverbs_device, dev); int ret = -ENODEV; int srcu_key; - struct ib_uverbs_device *dev = dev_get_drvdata(device); struct ib_device *ib_dev; - if (!dev) - return -ENODEV; - srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) - ret = sprintf(buf, "%s\n", ib_dev->name); + ret = sprintf(buf, "%s\n", dev_name(&ib_dev->dev)); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; } -static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); +static DEVICE_ATTR_RO(ibdev); -static ssize_t show_dev_abi_version(struct device *device, - struct device_attribute *attr, char *buf) +static ssize_t abi_version_show(struct device *device, + struct device_attribute *attr, char *buf) { - struct ib_uverbs_device *dev = dev_get_drvdata(device); + struct ib_uverbs_device *dev = + container_of(device, struct ib_uverbs_device, dev); int ret = -ENODEV; int srcu_key; struct ib_device *ib_dev; - if (!dev) - return -ENODEV; srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) @@ -990,7 +1204,17 @@ static ssize_t show_dev_abi_version(struct device *device, return ret; } -static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); +static DEVICE_ATTR_RO(abi_version); + +static struct attribute *ib_dev_attrs[] = { + &dev_attr_abi_version.attr, + 
&dev_attr_ibdev.attr, + NULL, +}; + +static const struct attribute_group dev_attr_group = { + .attrs = ib_dev_attrs, +}; static CLASS_ATTR_STRING(abi_version, S_IRUGO, __stringify(IB_USER_VERBS_ABI_VERSION)); @@ -1028,65 +1252,56 @@ static void ib_uverbs_add_one(struct ib_device *device) return; } + device_initialize(&uverbs_dev->dev); + uverbs_dev->dev.class = uverbs_class; + uverbs_dev->dev.parent = device->dev.parent; + uverbs_dev->dev.release = ib_uverbs_release_dev; + uverbs_dev->groups[0] = &dev_attr_group; + uverbs_dev->dev.groups = uverbs_dev->groups; atomic_set(&uverbs_dev->refcount, 1); init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); - kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype); mutex_init(&uverbs_dev->lists_mutex); INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); + rcu_assign_pointer(uverbs_dev->ib_dev, device); + uverbs_dev->num_comp_vectors = device->num_comp_vectors; - devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); - if (devnum >= IB_UVERBS_MAX_DEVICES) + devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1, + GFP_KERNEL); + if (devnum < 0) goto err; uverbs_dev->devnum = devnum; - set_bit(devnum, dev_map); if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; else base = IB_UVERBS_BASE_DEV + devnum; - rcu_assign_pointer(uverbs_dev->ib_dev, device); - uverbs_dev->num_comp_vectors = device->num_comp_vectors; - if (ib_uverbs_create_uapi(device, uverbs_dev)) goto err_uapi; - cdev_init(&uverbs_dev->cdev, NULL); + uverbs_dev->dev.devt = base; + dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum); + + cdev_init(&uverbs_dev->cdev, + device->mmap ? &uverbs_mmap_fops : &uverbs_fops); uverbs_dev->cdev.owner = THIS_MODULE; - uverbs_dev->cdev.ops = device->mmap ? 
&uverbs_mmap_fops : &uverbs_fops; - cdev_set_parent(&uverbs_dev->cdev, &uverbs_dev->kobj); - kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); - if (cdev_add(&uverbs_dev->cdev, base, 1)) - goto err_cdev; - - uverbs_dev->dev = device_create(uverbs_class, device->dev.parent, - uverbs_dev->cdev.dev, uverbs_dev, - "uverbs%d", uverbs_dev->devnum); - if (IS_ERR(uverbs_dev->dev)) - goto err_cdev; - - if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev)) - goto err_class; - if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) - goto err_class; - ib_set_client_data(device, &uverbs_client, uverbs_dev); + ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev); + if (ret) + goto err_uapi; + ib_set_client_data(device, &uverbs_client, uverbs_dev); return; -err_class: - device_destroy(uverbs_class, uverbs_dev->cdev.dev); -err_cdev: - cdev_del(&uverbs_dev->cdev); err_uapi: - clear_bit(devnum, dev_map); + ida_free(&uverbs_ida, devnum); err: if (atomic_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); - kobject_put(&uverbs_dev->kobj); + put_device(&uverbs_dev->dev); return; } @@ -1107,8 +1322,7 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, while (!list_empty(&uverbs_dev->uverbs_file_list)) { file = list_first_entry(&uverbs_dev->uverbs_file_list, struct ib_uverbs_file, list); - file->is_closed = 1; - list_del(&file->list); + list_del_init(&file->list); kref_get(&file->ref); /* We must release the mutex before going ahead and calling @@ -1156,10 +1370,8 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) if (!uverbs_dev) return; - dev_set_drvdata(uverbs_dev->dev, NULL); - device_destroy(uverbs_class, uverbs_dev->cdev.dev); - cdev_del(&uverbs_dev->cdev); - clear_bit(uverbs_dev->devnum, dev_map); + cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); + ida_free(&uverbs_ida, uverbs_dev->devnum); if (device->disassociate_ucontext) { /* We disassociate HW resources and immediately return. 
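The add_one path above is the now-standard embedded struct device lifecycle: initialize, attach attribute groups before registration, then register cdev and device atomically. Stripped of the uverbs specifics, the pattern looks roughly like this; my_dev, my_class, my_groups and my_fops are placeholders, not kernel symbols:

#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/slab.h>

static struct class *my_class;		/* created via class_create() elsewhere */
static const struct file_operations my_fops;
static const struct attribute_group *my_groups[2];

struct my_dev {
	struct device dev;	/* embedded, refcounted */
	struct cdev cdev;
};

static void my_dev_release(struct device *device)
{
	/* Runs only once the last reference is dropped */
	kfree(container_of(device, struct my_dev, dev));
}

static int my_dev_add(dev_t devt)
{
	struct my_dev *d = kzalloc(sizeof(*d), GFP_KERNEL);
	int ret;

	if (!d)
		return -ENOMEM;

	device_initialize(&d->dev);		/* refcount starts at 1 */
	d->dev.class = my_class;
	d->dev.release = my_dev_release;
	d->dev.devt = devt;
	d->dev.groups = my_groups;		/* sysfs attrs exist before uevent */
	dev_set_name(&d->dev, "mydev%d", MINOR(devt));

	cdev_init(&d->cdev, &my_fops);
	d->cdev.owner = THIS_MODULE;

	/* Atomically adds the cdev and registers the device in sysfs */
	ret = cdev_device_add(&d->cdev, &d->dev);
	if (ret)
		put_device(&d->dev);		/* frees via my_dev_release() */
	return ret;
	/* teardown mirrors this: cdev_device_del(), then put_device() */
}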
@@ -1182,7 +1394,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) if (wait_clients) wait_for_completion(&uverbs_dev->comp); - kobject_put(&uverbs_dev->kobj); + put_device(&uverbs_dev->dev); } static char *uverbs_devnode(struct device *dev, umode_t *mode) diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c index d8cfafe23bd9..cb9486ad5c67 100644 --- a/drivers/infiniband/core/uverbs_std_types_flow_action.c +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -326,11 +326,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)( if (IS_ERR(action)) return PTR_ERR(action); - atomic_set(&action->usecnt, 0); - action->device = ib_dev; - action->type = IB_FLOW_ACTION_ESP; - action->uobject = uobj; - uobj->object = action; + uverbs_flow_action_fill_action(action, uobj, ib_dev, + IB_FLOW_ACTION_ESP); return 0; } diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index be854628a7c6..86f3fc5e04b4 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -73,6 +73,18 @@ static int uapi_merge_method(struct uverbs_api *uapi, if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN) method_elm->driver_method |= is_driver; + /* + * Like other uobject based things we only support a single + * uobject being NEW'd or DESTROY'd + */ + if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { + u8 access = attr->attr.u2.objs_arr.access; + + if (WARN_ON(access == UVERBS_ACCESS_NEW || + access == UVERBS_ACCESS_DESTROY)) + return -EINVAL; + } + attr_slot = uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id), sizeof(*attr_slot)); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 8ec7418e99f0..178899e3ce73 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -264,7 +264,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, } pd->res.type = RDMA_RESTRACK_PD; - pd->res.kern_name = caller; + rdma_restrack_set_task(&pd->res, caller); rdma_restrack_add(&pd->res); if (mr_access_flags) { @@ -710,7 +710,7 @@ static int ib_resolve_unicast_gid_dmac(struct ib_device *device, ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid, ah_attr->roce.dmac, - sgid_attr->ndev, &hop_limit); + sgid_attr, &hop_limit); grh->hop_limit = hop_limit; return ret; @@ -1509,8 +1509,7 @@ static const struct { }; bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll) + enum ib_qp_type type, enum ib_qp_attr_mask mask) { enum ib_qp_attr_mask req_param, opt_param; @@ -1629,14 +1628,16 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, if (rdma_ib_or_roce(qp->device, port)) { if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { - pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n", - __func__, qp->device->name); + dev_warn(&qp->device->dev, + "%s rq_psn overflow, masking to 24 bits\n", + __func__); attr->rq_psn &= 0xffffff; } if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) { - pr_warn("%s: %s sq_psn overflow, masking to 24 bits\n", - __func__, qp->device->name); + dev_warn(&qp->device->dev, + "%s sq_psn overflow, masking to 24 bits\n", + __func__); attr->sq_psn &= 0xffffff; } } @@ -1888,7 +1889,7 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0);
cq->res.type = RDMA_RESTRACK_CQ; - cq->res.kern_name = caller; + rdma_restrack_set_task(&cq->res, caller); rdma_restrack_add(&cq->res); }
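A recurring cleanup in this pull is the minor-number allocator: both user_mad.c and uverbs_main.c drop their DECLARE_BITMAP/find_first_zero_bit bookkeeping for an IDA. A minimal sketch of the replacement pattern; MY_MAX_DEVS stands in for the IB_UMAD_MAX_PORTS / IB_UVERBS_MAX_DEVICES limits:

#include <linux/idr.h>

#define MY_MAX_DEVS 32	/* stand-in for the real per-subsystem limit */

static DEFINE_IDA(my_ida);

static int my_alloc_devnum(void)
{
	/*
	 * Returns the smallest free id in [0, MY_MAX_DEVS - 1] or a
	 * negative errno; replaces find_first_zero_bit() + set_bit().
	 */
	return ida_alloc_max(&my_ida, MY_MAX_DEVS - 1, GFP_KERNEL);
}

static void my_free_devnum(int devnum)
{
	/* Replaces clear_bit(); also warns on double-free */
	ida_free(&my_ida, devnum);
}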