summaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/core
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r--drivers/infiniband/core/Makefile3
-rw-r--r--drivers/infiniband/core/cache.c773
-rw-r--r--drivers/infiniband/core/cm.c215
-rw-r--r--drivers/infiniband/core/cma.c657
-rw-r--r--drivers/infiniband/core/core_priv.h54
-rw-r--r--drivers/infiniband/core/device.c335
-rw-r--r--drivers/infiniband/core/mad.c28
-rw-r--r--drivers/infiniband/core/mad_priv.h1
-rw-r--r--drivers/infiniband/core/multicast.c7
-rw-r--r--drivers/infiniband/core/netlink.c55
-rw-r--r--drivers/infiniband/core/roce_gid_mgmt.c728
-rw-r--r--drivers/infiniband/core/sa_query.c515
-rw-r--r--drivers/infiniband/core/sysfs.c51
-rw-r--r--drivers/infiniband/core/ucm.c9
-rw-r--r--drivers/infiniband/core/ucma.c146
-rw-r--r--drivers/infiniband/core/user_mad.c6
-rw-r--r--drivers/infiniband/core/uverbs.h16
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c147
-rw-r--r--drivers/infiniband/core/uverbs_main.c448
-rw-r--r--drivers/infiniband/core/verbs.c131
20 files changed, 3618 insertions, 707 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
index acf736764445..d43a8994ac5c 100644
--- a/drivers/infiniband/core/Makefile
+++ b/drivers/infiniband/core/Makefile
@@ -9,7 +9,8 @@ obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
$(user_access-y)
ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
- device.o fmr_pool.o cache.o netlink.o
+ device.o fmr_pool.o cache.o netlink.o \
+ roce_gid_mgmt.o
ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 871da832d016..8f66c67ff0df 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -37,6 +37,8 @@
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
#include <rdma/ib_cache.h>
@@ -47,76 +49,621 @@ struct ib_pkey_cache {
u16 table[0];
};
-struct ib_gid_cache {
- int table_len;
- union ib_gid table[0];
-};
-
struct ib_update_work {
struct work_struct work;
struct ib_device *device;
u8 port_num;
};
-int ib_get_cached_gid(struct ib_device *device,
- u8 port_num,
- int index,
- union ib_gid *gid)
+union ib_gid zgid;
+EXPORT_SYMBOL(zgid);
+
+static const struct ib_gid_attr zattr;
+
+enum gid_attr_find_mask {
+ GID_ATTR_FIND_MASK_GID = 1UL << 0,
+ GID_ATTR_FIND_MASK_NETDEV = 1UL << 1,
+ GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2,
+};
+
+enum gid_table_entry_props {
+ GID_TABLE_ENTRY_INVALID = 1UL << 0,
+ GID_TABLE_ENTRY_DEFAULT = 1UL << 1,
+};
+
+enum gid_table_write_action {
+ GID_TABLE_WRITE_ACTION_ADD,
+ GID_TABLE_WRITE_ACTION_DEL,
+ /* MODIFY only updates the GID table. Currently only used by
+ * ib_cache_update.
+ */
+ GID_TABLE_WRITE_ACTION_MODIFY
+};
+
+struct ib_gid_table_entry {
+ /* This lock protects an entry from being
+ * read and written simultaneously.
+ */
+ rwlock_t lock;
+ unsigned long props;
+ union ib_gid gid;
+ struct ib_gid_attr attr;
+ void *context;
+};
+
+struct ib_gid_table {
+ int sz;
+ /* In RoCE, adding a GID to the table requires:
+ * (a) Find if this GID is already exists.
+ * (b) Find a free space.
+ * (c) Write the new GID
+ *
+ * Delete requires different set of operations:
+ * (a) Find the GID
+ * (b) Delete it.
+ *
+ * Add/delete should be carried out atomically.
+ * This is done by locking this mutex from multiple
+ * writers. We don't need this lock for IB, as the MAD
+ * layer replaces all entries. All data_vec entries
+ * are locked by this lock.
+ **/
+ struct mutex lock;
+ struct ib_gid_table_entry *data_vec;
+};
+
+static int write_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ enum gid_table_write_action action,
+ bool default_gid)
{
- struct ib_gid_cache *cache;
+ int ret = 0;
+ struct net_device *old_net_dev;
unsigned long flags;
+
+ /* in rdma_cap_roce_gid_table, this funciton should be protected by a
+ * sleep-able lock.
+ */
+ write_lock_irqsave(&table->data_vec[ix].lock, flags);
+
+ if (rdma_cap_roce_gid_table(ib_dev, port)) {
+ table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID;
+ write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+ /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by
+ * RoCE providers and thus only updates the cache.
+ */
+ if (action == GID_TABLE_WRITE_ACTION_ADD)
+ ret = ib_dev->add_gid(ib_dev, port, ix, gid, attr,
+ &table->data_vec[ix].context);
+ else if (action == GID_TABLE_WRITE_ACTION_DEL)
+ ret = ib_dev->del_gid(ib_dev, port, ix,
+ &table->data_vec[ix].context);
+ write_lock_irqsave(&table->data_vec[ix].lock, flags);
+ }
+
+ old_net_dev = table->data_vec[ix].attr.ndev;
+ if (old_net_dev && old_net_dev != attr->ndev)
+ dev_put(old_net_dev);
+ /* if modify_gid failed, just delete the old gid */
+ if (ret || action == GID_TABLE_WRITE_ACTION_DEL) {
+ gid = &zgid;
+ attr = &zattr;
+ table->data_vec[ix].context = NULL;
+ }
+ if (default_gid)
+ table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT;
+ memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid));
+ memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr));
+ if (table->data_vec[ix].attr.ndev &&
+ table->data_vec[ix].attr.ndev != old_net_dev)
+ dev_hold(table->data_vec[ix].attr.ndev);
+
+ table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID;
+
+ write_unlock_irqrestore(&table->data_vec[ix].lock, flags);
+
+ if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) {
+ struct ib_event event;
+
+ event.device = ib_dev;
+ event.element.port_num = port;
+ event.event = IB_EVENT_GID_CHANGE;
+
+ ib_dispatch_event(&event);
+ }
+ return ret;
+}
+
+static int add_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ bool default_gid) {
+ return write_gid(ib_dev, port, table, ix, gid, attr,
+ GID_TABLE_WRITE_ACTION_ADD, default_gid);
+}
+
+static int modify_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *attr,
+ bool default_gid) {
+ return write_gid(ib_dev, port, table, ix, gid, attr,
+ GID_TABLE_WRITE_ACTION_MODIFY, default_gid);
+}
+
+static int del_gid(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table, int ix,
+ bool default_gid) {
+ return write_gid(ib_dev, port, table, ix, &zgid, &zattr,
+ GID_TABLE_WRITE_ACTION_DEL, default_gid);
+}
+
+static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
+ const struct ib_gid_attr *val, bool default_gid,
+ unsigned long mask)
+{
+ int i;
+
+ for (i = 0; i < table->sz; i++) {
+ unsigned long flags;
+ struct ib_gid_attr *attr = &table->data_vec[i].attr;
+
+ read_lock_irqsave(&table->data_vec[i].lock, flags);
+
+ if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID)
+ goto next;
+
+ if (mask & GID_ATTR_FIND_MASK_GID &&
+ memcmp(gid, &table->data_vec[i].gid, sizeof(*gid)))
+ goto next;
+
+ if (mask & GID_ATTR_FIND_MASK_NETDEV &&
+ attr->ndev != val->ndev)
+ goto next;
+
+ if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
+ !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) !=
+ default_gid)
+ goto next;
+
+ read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+ return i;
+next:
+ read_unlock_irqrestore(&table->data_vec[i].lock, flags);
+ }
+
+ return -1;
+}
+
+static void make_default_gid(struct net_device *dev, union ib_gid *gid)
+{
+ gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+ addrconf_ifid_eui48(&gid->raw[8], dev);
+}
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ int ix;
int ret = 0;
+ struct net_device *idev;
- if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ if (!memcmp(gid, &zgid, sizeof(*gid)))
return -EINVAL;
- read_lock_irqsave(&device->cache.lock, flags);
+ if (ib_dev->get_netdev) {
+ idev = ib_dev->get_netdev(ib_dev, port);
+ if (idev && attr->ndev != idev) {
+ union ib_gid default_gid;
- cache = device->cache.gid_cache[port_num - rdma_start_port(device)];
+ /* Adding default GIDs in not permitted */
+ make_default_gid(idev, &default_gid);
+ if (!memcmp(gid, &default_gid, sizeof(*gid))) {
+ dev_put(idev);
+ return -EPERM;
+ }
+ }
+ if (idev)
+ dev_put(idev);
+ }
- if (index < 0 || index >= cache->table_len)
- ret = -EINVAL;
- else
- *gid = cache->table[index];
+ mutex_lock(&table->lock);
- read_unlock_irqrestore(&device->cache.lock, flags);
+ ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_NETDEV);
+ if (ix >= 0)
+ goto out_unlock;
+ ix = find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_DEFAULT);
+ if (ix < 0) {
+ ret = -ENOSPC;
+ goto out_unlock;
+ }
+
+ add_gid(ib_dev, port, table, ix, gid, attr, false);
+
+out_unlock:
+ mutex_unlock(&table->lock);
return ret;
}
-EXPORT_SYMBOL(ib_get_cached_gid);
-int ib_find_cached_gid(struct ib_device *device,
- const union ib_gid *gid,
- u8 *port_num,
- u16 *index)
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr)
{
- struct ib_gid_cache *cache;
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ int ix;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ mutex_lock(&table->lock);
+
+ ix = find_gid(table, gid, attr, false,
+ GID_ATTR_FIND_MASK_GID |
+ GID_ATTR_FIND_MASK_NETDEV |
+ GID_ATTR_FIND_MASK_DEFAULT);
+ if (ix < 0)
+ goto out_unlock;
+
+ del_gid(ib_dev, port, table, ix, false);
+
+out_unlock:
+ mutex_unlock(&table->lock);
+ return 0;
+}
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ int ix;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ mutex_lock(&table->lock);
+
+ for (ix = 0; ix < table->sz; ix++)
+ if (table->data_vec[ix].attr.ndev == ndev)
+ del_gid(ib_dev, port, table, ix, false);
+
+ mutex_unlock(&table->lock);
+ return 0;
+}
+
+static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index,
+ union ib_gid *gid, struct ib_gid_attr *attr)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
unsigned long flags;
- int p, i;
- int ret = -ENOENT;
- *port_num = -1;
- if (index)
- *index = -1;
+ table = ports_table[port - rdma_start_port(ib_dev)];
- read_lock_irqsave(&device->cache.lock, flags);
+ if (index < 0 || index >= table->sz)
+ return -EINVAL;
- for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
- cache = device->cache.gid_cache[p];
- for (i = 0; i < cache->table_len; ++i) {
- if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
- *port_num = p + rdma_start_port(device);
- if (index)
- *index = i;
- ret = 0;
- goto found;
- }
+ read_lock_irqsave(&table->data_vec[index].lock, flags);
+ if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) {
+ read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+ return -EAGAIN;
+ }
+
+ memcpy(gid, &table->data_vec[index].gid, sizeof(*gid));
+ if (attr) {
+ memcpy(attr, &table->data_vec[index].attr, sizeof(*attr));
+ if (attr->ndev)
+ dev_hold(attr->ndev);
+ }
+
+ read_unlock_irqrestore(&table->data_vec[index].lock, flags);
+ return 0;
+}
+
+static int _ib_cache_gid_table_find(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ const struct ib_gid_attr *val,
+ unsigned long mask,
+ u8 *port, u16 *index)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ u8 p;
+ int local_index;
+
+ for (p = 0; p < ib_dev->phys_port_cnt; p++) {
+ table = ports_table[p];
+ local_index = find_gid(table, gid, val, false, mask);
+ if (local_index >= 0) {
+ if (index)
+ *index = local_index;
+ if (port)
+ *port = p + rdma_start_port(ib_dev);
+ return 0;
}
}
-found:
- read_unlock_irqrestore(&device->cache.lock, flags);
- return ret;
+ return -ENOENT;
+}
+
+static int ib_cache_gid_find(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ struct net_device *ndev, u8 *port,
+ u16 *index)
+{
+ unsigned long mask = GID_ATTR_FIND_MASK_GID;
+ struct ib_gid_attr gid_attr_val = {.ndev = ndev};
+
+ if (ndev)
+ mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+ return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val,
+ mask, port, index);
+}
+
+int ib_cache_gid_find_by_port(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ u8 port, struct net_device *ndev,
+ u16 *index)
+{
+ int local_index;
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ struct ib_gid_table *table;
+ unsigned long mask = GID_ATTR_FIND_MASK_GID;
+ struct ib_gid_attr val = {.ndev = ndev};
+
+ if (port < rdma_start_port(ib_dev) ||
+ port > rdma_end_port(ib_dev))
+ return -ENOENT;
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ if (ndev)
+ mask |= GID_ATTR_FIND_MASK_NETDEV;
+
+ local_index = find_gid(table, gid, &val, false, mask);
+ if (local_index >= 0) {
+ if (index)
+ *index = local_index;
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static struct ib_gid_table *alloc_gid_table(int sz)
+{
+ unsigned int i;
+ struct ib_gid_table *table =
+ kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
+ if (!table->data_vec)
+ goto err_free_table;
+
+ mutex_init(&table->lock);
+
+ table->sz = sz;
+
+ for (i = 0; i < sz; i++)
+ rwlock_init(&table->data_vec[i].lock);
+
+ return table;
+
+err_free_table:
+ kfree(table);
+ return NULL;
+}
+
+static void release_gid_table(struct ib_gid_table *table)
+{
+ if (table) {
+ kfree(table->data_vec);
+ kfree(table);
+ }
+}
+
+static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table)
+{
+ int i;
+
+ if (!table)
+ return;
+
+ for (i = 0; i < table->sz; ++i) {
+ if (memcmp(&table->data_vec[i].gid, &zgid,
+ sizeof(table->data_vec[i].gid)))
+ del_gid(ib_dev, port, table, i,
+ table->data_vec[i].props &
+ GID_ATTR_FIND_MASK_DEFAULT);
+ }
+}
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev,
+ enum ib_cache_gid_default_mode mode)
+{
+ struct ib_gid_table **ports_table = ib_dev->cache.gid_cache;
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+ struct ib_gid_table *table;
+ int ix;
+ union ib_gid current_gid;
+ struct ib_gid_attr current_gid_attr = {};
+
+ table = ports_table[port - rdma_start_port(ib_dev)];
+
+ make_default_gid(ndev, &gid);
+ memset(&gid_attr, 0, sizeof(gid_attr));
+ gid_attr.ndev = ndev;
+
+ ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT);
+
+ /* Coudn't find default GID location */
+ WARN_ON(ix < 0);
+
+ mutex_lock(&table->lock);
+ if (!__ib_cache_gid_get(ib_dev, port, ix,
+ &current_gid, &current_gid_attr) &&
+ mode == IB_CACHE_GID_DEFAULT_MODE_SET &&
+ !memcmp(&gid, &current_gid, sizeof(gid)) &&
+ !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr)))
+ goto unlock;
+
+ if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) ||
+ memcmp(&current_gid_attr, &zattr,
+ sizeof(current_gid_attr))) &&
+ del_gid(ib_dev, port, table, ix, true)) {
+ pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n",
+ ix, gid.raw);
+ goto unlock;
+ }
+
+ if (mode == IB_CACHE_GID_DEFAULT_MODE_SET)
+ if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true))
+ pr_warn("ib_cache_gid: unable to add default gid %pI6\n",
+ gid.raw);
+
+unlock:
+ if (current_gid_attr.ndev)
+ dev_put(current_gid_attr.ndev);
+ mutex_unlock(&table->lock);
+}
+
+static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
+ struct ib_gid_table *table)
+{
+ if (rdma_protocol_roce(ib_dev, port)) {
+ struct ib_gid_table_entry *entry = &table->data_vec[0];
+
+ entry->props |= GID_TABLE_ENTRY_DEFAULT;
+ }
+
+ return 0;
+}
+
+static int _gid_table_setup_one(struct ib_device *ib_dev)
+{
+ u8 port;
+ struct ib_gid_table **table;
+ int err = 0;
+
+ table = kcalloc(ib_dev->phys_port_cnt, sizeof(*table), GFP_KERNEL);
+
+ if (!table) {
+ pr_warn("failed to allocate ib gid cache for %s\n",
+ ib_dev->name);
+ return -ENOMEM;
+ }
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+ u8 rdma_port = port + rdma_start_port(ib_dev);
+
+ table[port] =
+ alloc_gid_table(
+ ib_dev->port_immutable[rdma_port].gid_tbl_len);
+ if (!table[port]) {
+ err = -ENOMEM;
+ goto rollback_table_setup;
+ }
+
+ err = gid_table_reserve_default(ib_dev,
+ port + rdma_start_port(ib_dev),
+ table[port]);
+ if (err)
+ goto rollback_table_setup;
+ }
+
+ ib_dev->cache.gid_cache = table;
+ return 0;
+
+rollback_table_setup:
+ for (port = 0; port < ib_dev->phys_port_cnt; port++) {
+ cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+ table[port]);
+ release_gid_table(table[port]);
+ }
+
+ kfree(table);
+ return err;
+}
+
+static void gid_table_release_one(struct ib_device *ib_dev)
+{
+ struct ib_gid_table **table = ib_dev->cache.gid_cache;
+ u8 port;
+
+ if (!table)
+ return;
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++)
+ release_gid_table(table[port]);
+
+ kfree(table);
+ ib_dev->cache.gid_cache = NULL;
+}
+
+static void gid_table_cleanup_one(struct ib_device *ib_dev)
+{
+ struct ib_gid_table **table = ib_dev->cache.gid_cache;
+ u8 port;
+
+ if (!table)
+ return;
+
+ for (port = 0; port < ib_dev->phys_port_cnt; port++)
+ cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
+ table[port]);
+}
+
+static int gid_table_setup_one(struct ib_device *ib_dev)
+{
+ int err;
+
+ err = _gid_table_setup_one(ib_dev);
+
+ if (err)
+ return err;
+
+ err = roce_rescan_device(ib_dev);
+
+ if (err) {
+ gid_table_cleanup_one(ib_dev);
+ gid_table_release_one(ib_dev);
+ }
+
+ return err;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+ u8 port_num,
+ int index,
+ union ib_gid *gid)
+{
+ if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device))
+ return -EINVAL;
+
+ return __ib_cache_gid_get(device, port_num, index, gid, NULL);
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+ const union ib_gid *gid,
+ u8 *port_num,
+ u16 *index)
+{
+ return ib_cache_gid_find(device, gid, NULL, port_num, index);
}
EXPORT_SYMBOL(ib_find_cached_gid);
@@ -243,9 +790,21 @@ static void ib_cache_update(struct ib_device *device,
{
struct ib_port_attr *tprops = NULL;
struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache;
- struct ib_gid_cache *gid_cache = NULL, *old_gid_cache;
+ struct ib_gid_cache {
+ int table_len;
+ union ib_gid table[0];
+ } *gid_cache = NULL;
int i;
int ret;
+ struct ib_gid_table *table;
+ struct ib_gid_table **ports_table = device->cache.gid_cache;
+ bool use_roce_gid_table =
+ rdma_cap_roce_gid_table(device, port);
+
+ if (port < rdma_start_port(device) || port > rdma_end_port(device))
+ return;
+
+ table = ports_table[port - rdma_start_port(device)];
tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
if (!tprops)
@@ -265,12 +824,14 @@ static void ib_cache_update(struct ib_device *device,
pkey_cache->table_len = tprops->pkey_tbl_len;
- gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
- sizeof *gid_cache->table, GFP_KERNEL);
- if (!gid_cache)
- goto err;
+ if (!use_roce_gid_table) {
+ gid_cache = kmalloc(sizeof(*gid_cache) + tprops->gid_tbl_len *
+ sizeof(*gid_cache->table), GFP_KERNEL);
+ if (!gid_cache)
+ goto err;
- gid_cache->table_len = tprops->gid_tbl_len;
+ gid_cache->table_len = tprops->gid_tbl_len;
+ }
for (i = 0; i < pkey_cache->table_len; ++i) {
ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
@@ -281,29 +842,36 @@ static void ib_cache_update(struct ib_device *device,
}
}
- for (i = 0; i < gid_cache->table_len; ++i) {
- ret = ib_query_gid(device, port, i, gid_cache->table + i);
- if (ret) {
- printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
- ret, device->name, i);
- goto err;
+ if (!use_roce_gid_table) {
+ for (i = 0; i < gid_cache->table_len; ++i) {
+ ret = ib_query_gid(device, port, i,
+ gid_cache->table + i);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
}
}
write_lock_irq(&device->cache.lock);
old_pkey_cache = device->cache.pkey_cache[port - rdma_start_port(device)];
- old_gid_cache = device->cache.gid_cache [port - rdma_start_port(device)];
device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache;
- device->cache.gid_cache [port - rdma_start_port(device)] = gid_cache;
+ if (!use_roce_gid_table) {
+ for (i = 0; i < gid_cache->table_len; i++) {
+ modify_gid(device, port, table, i, gid_cache->table + i,
+ &zattr, false);
+ }
+ }
device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc;
write_unlock_irq(&device->cache.lock);
+ kfree(gid_cache);
kfree(old_pkey_cache);
- kfree(old_gid_cache);
kfree(tprops);
return;
@@ -344,85 +912,88 @@ static void ib_cache_event(struct ib_event_handler *handler,
}
}
-static void ib_cache_setup_one(struct ib_device *device)
+int ib_cache_setup_one(struct ib_device *device)
{
int p;
+ int err;
rwlock_init(&device->cache.lock);
device->cache.pkey_cache =
- kmalloc(sizeof *device->cache.pkey_cache *
- (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
- device->cache.gid_cache =
- kmalloc(sizeof *device->cache.gid_cache *
+ kzalloc(sizeof *device->cache.pkey_cache *
(rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL);
-
device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
(rdma_end_port(device) -
rdma_start_port(device) + 1),
GFP_KERNEL);
-
- if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+ if (!device->cache.pkey_cache ||
!device->cache.lmc_cache) {
printk(KERN_WARNING "Couldn't allocate cache "
"for %s\n", device->name);
- goto err;
+ return -ENOMEM;
}
- for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
- device->cache.pkey_cache[p] = NULL;
- device->cache.gid_cache [p] = NULL;
+ err = gid_table_setup_one(device);
+ if (err)
+ /* Allocated memory will be cleaned in the release function */
+ return err;
+
+ for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p)
ib_cache_update(device, p + rdma_start_port(device));
- }
INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
device, ib_cache_event);
- if (ib_register_event_handler(&device->cache.event_handler))
- goto err_cache;
-
- return;
+ err = ib_register_event_handler(&device->cache.event_handler);
+ if (err)
+ goto err;
-err_cache:
- for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
- kfree(device->cache.pkey_cache[p]);
- kfree(device->cache.gid_cache[p]);
- }
+ return 0;
err:
- kfree(device->cache.pkey_cache);
- kfree(device->cache.gid_cache);
- kfree(device->cache.lmc_cache);
+ gid_table_cleanup_one(device);
+ return err;
}
-static void ib_cache_cleanup_one(struct ib_device *device)
+void ib_cache_release_one(struct ib_device *device)
{
int p;
- ib_unregister_event_handler(&device->cache.event_handler);
- flush_workqueue(ib_wq);
-
- for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) {
- kfree(device->cache.pkey_cache[p]);
- kfree(device->cache.gid_cache[p]);
- }
-
+ /*
+ * The release function frees all the cache elements.
+ * This function should be called as part of freeing
+ * all the device's resources when the cache could no
+ * longer be accessed.
+ */
+ if (device->cache.pkey_cache)
+ for (p = 0;
+ p <= rdma_end_port(device) - rdma_start_port(device); ++p)
+ kfree(device->cache.pkey_cache[p]);
+
+ gid_table_release_one(device);
kfree(device->cache.pkey_cache);
- kfree(device->cache.gid_cache);
kfree(device->cache.lmc_cache);
}
-static struct ib_client cache_client = {
- .name = "cache",
- .add = ib_cache_setup_one,
- .remove = ib_cache_cleanup_one
-};
+void ib_cache_cleanup_one(struct ib_device *device)
+{
+ /* The cleanup function unregisters the event handler,
+ * waits for all in-progress workqueue elements and cleans
+ * up the GID cache. This function should be called after
+ * the device was removed from the devices list and all
+ * clients were removed, so the cache exists but is
+ * non-functional and shouldn't be updated anymore.
+ */
+ ib_unregister_event_handler(&device->cache.event_handler);
+ flush_workqueue(ib_wq);
+ gid_table_cleanup_one(device);
+}
-int __init ib_cache_setup(void)
+void __init ib_cache_setup(void)
{
- return ib_register_client(&cache_client);
+ roce_gid_mgmt_init();
}
void __exit ib_cache_cleanup(void)
{
- ib_unregister_client(&cache_client);
+ roce_gid_mgmt_cleanup();
}
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 3a972ebf3c0d..ea4db9c1d44f 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -58,7 +58,7 @@ MODULE_DESCRIPTION("InfiniBand CM");
MODULE_LICENSE("Dual BSD/GPL");
static void cm_add_one(struct ib_device *device);
-static void cm_remove_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device, void *client_data);
static struct ib_client cm_client = {
.name = "cm",
@@ -213,13 +213,15 @@ struct cm_id_private {
spinlock_t lock; /* Do not acquire inside cm.lock */
struct completion comp;
atomic_t refcount;
+ /* Number of clients sharing this ib_cm_id. Only valid for listeners.
+ * Protected by the cm.lock spinlock. */
+ int listen_sharecount;
struct ib_mad_send_buf *msg;
struct cm_timewait_info *timewait_info;
/* todo: use alternate port on send failure */
struct cm_av av;
struct cm_av alt_av;
- struct ib_cm_compare_data *compare_data;
void *private_data;
__be64 tid;
@@ -440,40 +442,6 @@ static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
return cm_id_priv;
}
-static void cm_mask_copy(u32 *dst, const u32 *src, const u32 *mask)
-{
- int i;
-
- for (i = 0; i < IB_CM_COMPARE_SIZE; i++)
- dst[i] = src[i] & mask[i];
-}
-
-static int cm_compare_data(struct ib_cm_compare_data *src_data,
- struct ib_cm_compare_data *dst_data)
-{
- u32 src[IB_CM_COMPARE_SIZE];
- u32 dst[IB_CM_COMPARE_SIZE];
-
- if (!src_data || !dst_data)
- return 0;
-
- cm_mask_copy(src, src_data->data, dst_data->mask);
- cm_mask_copy(dst, dst_data->data, src_data->mask);
- return memcmp(src, dst, sizeof(src));
-}
-
-static int cm_compare_private_data(u32 *private_data,
- struct ib_cm_compare_data *dst_data)
-{
- u32 src[IB_CM_COMPARE_SIZE];
-
- if (!dst_data)
- return 0;
-
- cm_mask_copy(src, private_data, dst_data->mask);
- return memcmp(src, dst_data->data, sizeof(src));
-}
-
/*
* Trivial helpers to strip endian annotation and compare; the
* endianness doesn't actually matter since we just need a stable
@@ -506,18 +474,14 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
struct cm_id_private *cur_cm_id_priv;
__be64 service_id = cm_id_priv->id.service_id;
__be64 service_mask = cm_id_priv->id.service_mask;
- int data_cmp;
while (*link) {
parent = *link;
cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
service_node);
- data_cmp = cm_compare_data(cm_id_priv->compare_data,
- cur_cm_id_priv->compare_data);
if ((cur_cm_id_priv->id.service_mask & service_id) ==
(service_mask & cur_cm_id_priv->id.service_id) &&
- (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
- !data_cmp)
+ (cm_id_priv->id.device == cur_cm_id_priv->id.device))
return cur_cm_id_priv;
if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
@@ -528,8 +492,6 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
link = &(*link)->rb_left;
else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
link = &(*link)->rb_right;
- else if (data_cmp < 0)
- link = &(*link)->rb_left;
else
link = &(*link)->rb_right;
}
@@ -539,20 +501,16 @@ static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
}
static struct cm_id_private * cm_find_listen(struct ib_device *device,
- __be64 service_id,
- u32 *private_data)
+ __be64 service_id)
{
struct rb_node *node = cm.listen_service_table.rb_node;
struct cm_id_private *cm_id_priv;
- int data_cmp;
while (node) {
cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
- data_cmp = cm_compare_private_data(private_data,
- cm_id_priv->compare_data);
if ((cm_id_priv->id.service_mask & service_id) ==
cm_id_priv->id.service_id &&
- (cm_id_priv->id.device == device) && !data_cmp)
+ (cm_id_priv->id.device == device))
return cm_id_priv;
if (device < cm_id_priv->id.device)
@@ -563,8 +521,6 @@ static struct cm_id_private * cm_find_listen(struct ib_device *device,
node = node->rb_left;
else if (be64_gt(service_id, cm_id_priv->id.service_id))
node = node->rb_right;
- else if (data_cmp < 0)
- node = node->rb_left;
else
node = node->rb_right;
}
@@ -859,9 +815,15 @@ retest:
spin_lock_irq(&cm_id_priv->lock);
switch (cm_id->state) {
case IB_CM_LISTEN:
- cm_id->state = IB_CM_IDLE;
spin_unlock_irq(&cm_id_priv->lock);
+
spin_lock_irq(&cm.lock);
+ if (--cm_id_priv->listen_sharecount > 0) {
+ /* The id is still shared. */
+ cm_deref_id(cm_id_priv);
+ spin_unlock_irq(&cm.lock);
+ return;
+ }
rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
spin_unlock_irq(&cm.lock);
break;
@@ -930,7 +892,6 @@ retest:
wait_for_completion(&cm_id_priv->comp);
while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
cm_free_work(work);
- kfree(cm_id_priv->compare_data);
kfree(cm_id_priv->private_data);
kfree(cm_id_priv);
}
@@ -941,11 +902,23 @@ void ib_destroy_cm_id(struct ib_cm_id *cm_id)
}
EXPORT_SYMBOL(ib_destroy_cm_id);
-int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
- struct ib_cm_compare_data *compare_data)
+/**
+ * __ib_cm_listen - Initiates listening on the specified service ID for
+ * connection and service ID resolution requests.
+ * @cm_id: Connection identifier associated with the listen request.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ * @service_mask: Mask applied to service ID used to listen across a
+ * range of service IDs. If set to 0, the service ID is matched
+ * exactly. This parameter is ignored if %service_id is set to
+ * IB_CM_ASSIGN_SERVICE_ID.
+ */
+static int __ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id,
+ __be64 service_mask)
{
struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
- unsigned long flags;
int ret = 0;
service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
@@ -958,20 +931,9 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
if (cm_id->state != IB_CM_IDLE)
return -EINVAL;
- if (compare_data) {
- cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
- GFP_KERNEL);
- if (!cm_id_priv->compare_data)
- return -ENOMEM;
- cm_mask_copy(cm_id_priv->compare_data->data,
- compare_data->data, compare_data->mask);
- memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
- sizeof(compare_data->mask));
- }
-
cm_id->state = IB_CM_LISTEN;
+ ++cm_id_priv->listen_sharecount;
- spin_lock_irqsave(&cm.lock, flags);
if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
cm_id->service_mask = ~cpu_to_be64(0);
@@ -980,18 +942,95 @@ int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
cm_id->service_mask = service_mask;
}
cur_cm_id_priv = cm_insert_listen(cm_id_priv);
- spin_unlock_irqrestore(&cm.lock, flags);
if (cur_cm_id_priv) {
cm_id->state = IB_CM_IDLE;
- kfree(cm_id_priv->compare_data);
- cm_id_priv->compare_data = NULL;
+ --cm_id_priv->listen_sharecount;
ret = -EBUSY;
}
return ret;
}
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm.lock, flags);
+ ret = __ib_cm_listen(cm_id, service_id, service_mask);
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ return ret;
+}
EXPORT_SYMBOL(ib_cm_listen);
+/**
+ * Create a new listening ib_cm_id and listen on the given service ID.
+ *
+ * If there's an existing ID listening on that same device and service ID,
+ * return it.
+ *
+ * @device: Device associated with the cm_id. All related communication will
+ * be associated with the specified device.
+ * @cm_handler: Callback invoked to notify the user of CM events.
+ * @service_id: Service identifier matched against incoming connection
+ * and service ID resolution requests. The service ID should be specified
+ * network-byte order. If set to IB_CM_ASSIGN_SERVICE_ID, the CM will
+ * assign a service ID to the caller.
+ *
+ * Callers should call ib_destroy_cm_id when done with the listener ID.
+ */
+struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ __be64 service_id)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_cm_id *cm_id;
+ unsigned long flags;
+ int err = 0;
+
+ /* Create an ID in advance, since the creation may sleep */
+ cm_id = ib_create_cm_id(device, cm_handler, NULL);
+ if (IS_ERR(cm_id))
+ return cm_id;
+
+ spin_lock_irqsave(&cm.lock, flags);
+
+ if (service_id == IB_CM_ASSIGN_SERVICE_ID)
+ goto new_id;
+
+ /* Find an existing ID */
+ cm_id_priv = cm_find_listen(device, service_id);
+ if (cm_id_priv) {
+ if (cm_id->cm_handler != cm_handler || cm_id->context) {
+ /* Sharing an ib_cm_id with different handlers is not
+ * supported */
+ spin_unlock_irqrestore(&cm.lock, flags);
+ return ERR_PTR(-EINVAL);
+ }
+ atomic_inc(&cm_id_priv->refcount);
+ ++cm_id_priv->listen_sharecount;
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ ib_destroy_cm_id(cm_id);
+ cm_id = &cm_id_priv->id;
+ return cm_id;
+ }
+
+new_id:
+ /* Use newly created ID */
+ err = __ib_cm_listen(cm_id, service_id, 0);
+
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ if (err) {
+ ib_destroy_cm_id(cm_id);
+ return ERR_PTR(err);
+ }
+ return cm_id;
+}
+EXPORT_SYMBOL(ib_cm_insert_listen);
+
static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
enum cm_msg_sequence msg_seq)
{
@@ -1268,6 +1307,7 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
primary_path->packet_life_time =
cm_req_get_primary_local_ack_timeout(req_msg);
primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+ primary_path->service_id = req_msg->service_id;
if (req_msg->alt_local_lid) {
memset(alt_path, 0, sizeof *alt_path);
@@ -1289,7 +1329,26 @@ static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
alt_path->packet_life_time =
cm_req_get_alt_local_ack_timeout(req_msg);
alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+ alt_path->service_id = req_msg->service_id;
+ }
+}
+
+static u16 cm_get_bth_pkey(struct cm_work *work)
+{
+ struct ib_device *ib_dev = work->port->cm_dev->ib_device;
+ u8 port_num = work->port->port_num;
+ u16 pkey_index = work->mad_recv_wc->wc->pkey_index;
+ u16 pkey;
+ int ret;
+
+ ret = ib_get_cached_pkey(ib_dev, port_num, pkey_index, &pkey);
+ if (ret) {
+ dev_warn_ratelimited(&ib_dev->dev, "ib_cm: Couldn't retrieve pkey for incoming request (port %d, pkey index %d). %d\n",
+ port_num, pkey_index, ret);
+ return 0;
}
+
+ return pkey;
}
static void cm_format_req_event(struct cm_work *work,
@@ -1302,6 +1361,7 @@ static void cm_format_req_event(struct cm_work *work,
req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
param = &work->cm_event.param.req_rcvd;
param->listen_id = listen_id;
+ param->bth_pkey = cm_get_bth_pkey(work);
param->port = cm_id_priv->av.port->port_num;
param->primary_path = &work->path[0];
if (req_msg->alt_local_lid)
@@ -1484,8 +1544,7 @@ static struct cm_id_private * cm_match_req(struct cm_work *work,
/* Find matching listen request. */
listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
- req_msg->service_id,
- req_msg->private_data);
+ req_msg->service_id);
if (!listen_cm_id_priv) {
cm_cleanup_timewait(cm_id_priv->timewait_info);
spin_unlock_irq(&cm.lock);
@@ -2992,6 +3051,8 @@ static void cm_format_sidr_req_event(struct cm_work *work,
param = &work->cm_event.param.sidr_req_rcvd;
param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
param->listen_id = listen_id;
+ param->service_id = sidr_req_msg->service_id;
+ param->bth_pkey = cm_get_bth_pkey(work);
param->port = work->port->port_num;
work->cm_event.private_data = &sidr_req_msg->private_data;
}
@@ -3031,8 +3092,7 @@ static int cm_sidr_req_handler(struct cm_work *work)
}
cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
cur_cm_id_priv = cm_find_listen(cm_id->device,
- sidr_req_msg->service_id,
- sidr_req_msg->private_data);
+ sidr_req_msg->service_id);
if (!cur_cm_id_priv) {
spin_unlock_irq(&cm.lock);
cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
@@ -3886,9 +3946,9 @@ free:
kfree(cm_dev);
}
-static void cm_remove_one(struct ib_device *ib_device)
+static void cm_remove_one(struct ib_device *ib_device, void *client_data)
{
- struct cm_device *cm_dev;
+ struct cm_device *cm_dev = client_data;
struct cm_port *port;
struct ib_port_modify port_modify = {
.clr_port_cap_mask = IB_PORT_CM_SUP
@@ -3896,7 +3956,6 @@ static void cm_remove_one(struct ib_device *ib_device)
unsigned long flags;
int i;
- cm_dev = ib_get_client_data(ib_device, &cm_client);
if (!cm_dev)
return;
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 143ded2bbe7c..b1ab13f3e182 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -46,6 +46,8 @@
#include <net/tcp.h>
#include <net/ipv6.h>
+#include <net/ip_fib.h>
+#include <net/ip6_route.h>
#include <rdma/rdma_cm.h>
#include <rdma/rdma_cm_ib.h>
@@ -94,7 +96,7 @@ const char *rdma_event_msg(enum rdma_cm_event_type event)
EXPORT_SYMBOL(rdma_event_msg);
static void cma_add_one(struct ib_device *device);
-static void cma_remove_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device, void *client_data);
static struct ib_client cma_client = {
.name = "cma",
@@ -113,6 +115,22 @@ static DEFINE_IDR(udp_ps);
static DEFINE_IDR(ipoib_ps);
static DEFINE_IDR(ib_ps);
+static struct idr *cma_idr(enum rdma_port_space ps)
+{
+ switch (ps) {
+ case RDMA_PS_TCP:
+ return &tcp_ps;
+ case RDMA_PS_UDP:
+ return &udp_ps;
+ case RDMA_PS_IPOIB:
+ return &ipoib_ps;
+ case RDMA_PS_IB:
+ return &ib_ps;
+ default:
+ return NULL;
+ }
+}
+
struct cma_device {
struct list_head list;
struct ib_device *device;
@@ -122,11 +140,33 @@ struct cma_device {
};
struct rdma_bind_list {
- struct idr *ps;
+ enum rdma_port_space ps;
struct hlist_head owners;
unsigned short port;
};
+static int cma_ps_alloc(enum rdma_port_space ps,
+ struct rdma_bind_list *bind_list, int snum)
+{
+ struct idr *idr = cma_idr(ps);
+
+ return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
+}
+
+static struct rdma_bind_list *cma_ps_find(enum rdma_port_space ps, int snum)
+{
+ struct idr *idr = cma_idr(ps);
+
+ return idr_find(idr, snum);
+}
+
+static void cma_ps_remove(enum rdma_port_space ps, int snum)
+{
+ struct idr *idr = cma_idr(ps);
+
+ idr_remove(idr, snum);
+}
+
enum {
CMA_OPTION_AFONLY,
};
@@ -225,6 +265,15 @@ struct cma_hdr {
#define CMA_VERSION 0x00
+struct cma_req_info {
+ struct ib_device *device;
+ int port;
+ union ib_gid local_gid;
+ __be64 service_id;
+ u16 pkey;
+ bool has_gid:1;
+};
+
static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
{
unsigned long flags;
@@ -262,7 +311,7 @@ static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
return old;
}
-static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
{
return hdr->ip_version >> 4;
}
@@ -870,107 +919,397 @@ static inline int cma_any_port(struct sockaddr *addr)
return !cma_port(addr);
}
-static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+static void cma_save_ib_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct rdma_cm_id *listen_id,
struct ib_sa_path_rec *path)
{
struct sockaddr_ib *listen_ib, *ib;
listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
- ib = (struct sockaddr_ib *) &id->route.addr.src_addr;
- ib->sib_family = listen_ib->sib_family;
- if (path) {
- ib->sib_pkey = path->pkey;
- ib->sib_flowinfo = path->flow_label;
- memcpy(&ib->sib_addr, &path->sgid, 16);
- } else {
- ib->sib_pkey = listen_ib->sib_pkey;
- ib->sib_flowinfo = listen_ib->sib_flowinfo;
- ib->sib_addr = listen_ib->sib_addr;
- }
- ib->sib_sid = listen_ib->sib_sid;
- ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
- ib->sib_scope_id = listen_ib->sib_scope_id;
-
- if (path) {
- ib = (struct sockaddr_ib *) &id->route.addr.dst_addr;
- ib->sib_family = listen_ib->sib_family;
- ib->sib_pkey = path->pkey;
- ib->sib_flowinfo = path->flow_label;
- memcpy(&ib->sib_addr, &path->dgid, 16);
+ if (src_addr) {
+ ib = (struct sockaddr_ib *)src_addr;
+ ib->sib_family = AF_IB;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->sgid, 16);
+ ib->sib_sid = path->service_id;
+ ib->sib_scope_id = 0;
+ } else {
+ ib->sib_pkey = listen_ib->sib_pkey;
+ ib->sib_flowinfo = listen_ib->sib_flowinfo;
+ ib->sib_addr = listen_ib->sib_addr;
+ ib->sib_sid = listen_ib->sib_sid;
+ ib->sib_scope_id = listen_ib->sib_scope_id;
+ }
+ ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+ }
+ if (dst_addr) {
+ ib = (struct sockaddr_ib *)dst_addr;
+ ib->sib_family = AF_IB;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->dgid, 16);
+ }
}
}
-static __be16 ss_get_port(const struct sockaddr_storage *ss)
-{
- if (ss->ss_family == AF_INET)
- return ((struct sockaddr_in *)ss)->sin_port;
- else if (ss->ss_family == AF_INET6)
- return ((struct sockaddr_in6 *)ss)->sin6_port;
- BUG();
-}
-
-static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
- struct cma_hdr *hdr)
+static void cma_save_ip4_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
{
struct sockaddr_in *ip4;
- ip4 = (struct sockaddr_in *) &id->route.addr.src_addr;
- ip4->sin_family = AF_INET;
- ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
- ip4->sin_port = ss_get_port(&listen_id->route.addr.src_addr);
+ if (src_addr) {
+ ip4 = (struct sockaddr_in *)src_addr;
+ ip4->sin_family = AF_INET;
+ ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
+ ip4->sin_port = local_port;
+ }
- ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr;
- ip4->sin_family = AF_INET;
- ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
- ip4->sin_port = hdr->port;
+ if (dst_addr) {
+ ip4 = (struct sockaddr_in *)dst_addr;
+ ip4->sin_family = AF_INET;
+ ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
+ ip4->sin_port = hdr->port;
+ }
}
-static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
- struct cma_hdr *hdr)
+static void cma_save_ip6_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct cma_hdr *hdr,
+ __be16 local_port)
{
struct sockaddr_in6 *ip6;
- ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr;
- ip6->sin6_family = AF_INET6;
- ip6->sin6_addr = hdr->dst_addr.ip6;
- ip6->sin6_port = ss_get_port(&listen_id->route.addr.src_addr);
+ if (src_addr) {
+ ip6 = (struct sockaddr_in6 *)src_addr;
+ ip6->sin6_family = AF_INET6;
+ ip6->sin6_addr = hdr->dst_addr.ip6;
+ ip6->sin6_port = local_port;
+ }
- ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr;
- ip6->sin6_family = AF_INET6;
- ip6->sin6_addr = hdr->src_addr.ip6;
- ip6->sin6_port = hdr->port;
+ if (dst_addr) {
+ ip6 = (struct sockaddr_in6 *)dst_addr;
+ ip6->sin6_family = AF_INET6;
+ ip6->sin6_addr = hdr->src_addr.ip6;
+ ip6->sin6_port = hdr->port;
+ }
}
-static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
- struct ib_cm_event *ib_event)
+static u16 cma_port_from_service_id(__be64 service_id)
{
- struct cma_hdr *hdr;
+ return (u16)be64_to_cpu(service_id);
+}
- if (listen_id->route.addr.src_addr.ss_family == AF_IB) {
- if (ib_event->event == IB_CM_REQ_RECEIVED)
- cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path);
- else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
- cma_save_ib_info(id, listen_id, NULL);
- return 0;
- }
+static int cma_save_ip_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct ib_cm_event *ib_event,
+ __be64 service_id)
+{
+ struct cma_hdr *hdr;
+ __be16 port;
hdr = ib_event->private_data;
if (hdr->cma_version != CMA_VERSION)
return -EINVAL;
+ port = htons(cma_port_from_service_id(service_id));
+
switch (cma_get_ip_ver(hdr)) {
case 4:
- cma_save_ip4_info(id, listen_id, hdr);
+ cma_save_ip4_info(src_addr, dst_addr, hdr, port);
break;
case 6:
- cma_save_ip6_info(id, listen_id, hdr);
+ cma_save_ip6_info(src_addr, dst_addr, hdr, port);
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ return 0;
+}
+
+static int cma_save_net_info(struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ struct rdma_cm_id *listen_id,
+ struct ib_cm_event *ib_event,
+ sa_family_t sa_family, __be64 service_id)
+{
+ if (sa_family == AF_IB) {
+ if (ib_event->event == IB_CM_REQ_RECEIVED)
+ cma_save_ib_info(src_addr, dst_addr, listen_id,
+ ib_event->param.req_rcvd.primary_path);
+ else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+ cma_save_ib_info(src_addr, dst_addr, listen_id, NULL);
+ return 0;
+ }
+
+ return cma_save_ip_info(src_addr, dst_addr, ib_event, service_id);
+}
+
+static int cma_save_req_info(const struct ib_cm_event *ib_event,
+ struct cma_req_info *req)
+{
+ const struct ib_cm_req_event_param *req_param =
+ &ib_event->param.req_rcvd;
+ const struct ib_cm_sidr_req_event_param *sidr_param =
+ &ib_event->param.sidr_req_rcvd;
+
+ switch (ib_event->event) {
+ case IB_CM_REQ_RECEIVED:
+ req->device = req_param->listen_id->device;
+ req->port = req_param->port;
+ memcpy(&req->local_gid, &req_param->primary_path->sgid,
+ sizeof(req->local_gid));
+ req->has_gid = true;
+ req->service_id = req_param->primary_path->service_id;
+ req->pkey = req_param->bth_pkey;
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ req->device = sidr_param->listen_id->device;
+ req->port = sidr_param->port;
+ req->has_gid = false;
+ req->service_id = sidr_param->service_id;
+ req->pkey = sidr_param->bth_pkey;
break;
default:
return -EINVAL;
}
+
return 0;
}
+static bool validate_ipv4_net_dev(struct net_device *net_dev,
+ const struct sockaddr_in *dst_addr,
+ const struct sockaddr_in *src_addr)
+{
+ __be32 daddr = dst_addr->sin_addr.s_addr,
+ saddr = src_addr->sin_addr.s_addr;
+ struct fib_result res;
+ struct flowi4 fl4;
+ int err;
+ bool ret;
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+ ipv4_is_lbcast(daddr) || ipv4_is_zeronet(saddr) ||
+ ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr) ||
+ ipv4_is_loopback(saddr))
+ return false;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.flowi4_iif = net_dev->ifindex;
+ fl4.daddr = daddr;
+ fl4.saddr = saddr;
+
+ rcu_read_lock();
+ err = fib_lookup(dev_net(net_dev), &fl4, &res, 0);
+ if (err)
+ return false;
+
+ ret = FIB_RES_DEV(res) == net_dev;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static bool validate_ipv6_net_dev(struct net_device *net_dev,
+ const struct sockaddr_in6 *dst_addr,
+ const struct sockaddr_in6 *src_addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ const int strict = ipv6_addr_type(&dst_addr->sin6_addr) &
+ IPV6_ADDR_LINKLOCAL;
+ struct rt6_info *rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr,
+ &src_addr->sin6_addr, net_dev->ifindex,
+ strict);
+ bool ret;
+
+ if (!rt)
+ return false;
+
+ ret = rt->rt6i_idev->dev == net_dev;
+ ip6_rt_put(rt);
+
+ return ret;
+#else
+ return false;
+#endif
+}
+
+static bool validate_net_dev(struct net_device *net_dev,
+ const struct sockaddr *daddr,
+ const struct sockaddr *saddr)
+{
+ const struct sockaddr_in *daddr4 = (const struct sockaddr_in *)daddr;
+ const struct sockaddr_in *saddr4 = (const struct sockaddr_in *)saddr;
+ const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;
+ const struct sockaddr_in6 *saddr6 = (const struct sockaddr_in6 *)saddr;
+
+ switch (daddr->sa_family) {
+ case AF_INET:
+ return saddr->sa_family == AF_INET &&
+ validate_ipv4_net_dev(net_dev, daddr4, saddr4);
+
+ case AF_INET6:
+ return saddr->sa_family == AF_INET6 &&
+ validate_ipv6_net_dev(net_dev, daddr6, saddr6);
+
+ default:
+ return false;
+ }
+}
+
+static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event,
+ const struct cma_req_info *req)
+{
+ struct sockaddr_storage listen_addr_storage, src_addr_storage;
+ struct sockaddr *listen_addr = (struct sockaddr *)&listen_addr_storage,
+ *src_addr = (struct sockaddr *)&src_addr_storage;
+ struct net_device *net_dev;
+ const union ib_gid *gid = req->has_gid ? &req->local_gid : NULL;
+ int err;
+
+ err = cma_save_ip_info(listen_addr, src_addr, ib_event,
+ req->service_id);
+ if (err)
+ return ERR_PTR(err);
+
+ net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey,
+ gid, listen_addr);
+ if (!net_dev)
+ return ERR_PTR(-ENODEV);
+
+ if (!validate_net_dev(net_dev, listen_addr, src_addr)) {
+ dev_put(net_dev);
+ return ERR_PTR(-EHOSTUNREACH);
+ }
+
+ return net_dev;
+}
+
+static enum rdma_port_space rdma_ps_from_service_id(__be64 service_id)
+{
+ return (be64_to_cpu(service_id) >> 16) & 0xffff;
+}
+
+static bool cma_match_private_data(struct rdma_id_private *id_priv,
+ const struct cma_hdr *hdr)
+{
+ struct sockaddr *addr = cma_src_addr(id_priv);
+ __be32 ip4_addr;
+ struct in6_addr ip6_addr;
+
+ if (cma_any_addr(addr) && !id_priv->afonly)
+ return true;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
+ if (cma_get_ip_ver(hdr) != 4)
+ return false;
+ if (!cma_any_addr(addr) &&
+ hdr->dst_addr.ip4.addr != ip4_addr)
+ return false;
+ break;
+ case AF_INET6:
+ ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr;
+ if (cma_get_ip_ver(hdr) != 6)
+ return false;
+ if (!cma_any_addr(addr) &&
+ memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr)))
+ return false;
+ break;
+ case AF_IB:
+ return true;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static bool cma_match_net_dev(const struct rdma_id_private *id_priv,
+ const struct net_device *net_dev)
+{
+ const struct rdma_addr *addr = &id_priv->id.route.addr;
+
+ if (!net_dev)
+ /* This request is an AF_IB request */
+ return addr->src_addr.ss_family == AF_IB;
+
+ return !addr->dev_addr.bound_dev_if ||
+ (net_eq(dev_net(net_dev), &init_net) &&
+ addr->dev_addr.bound_dev_if == net_dev->ifindex);
+}
+
+static struct rdma_id_private *cma_find_listener(
+ const struct rdma_bind_list *bind_list,
+ const struct ib_cm_id *cm_id,
+ const struct ib_cm_event *ib_event,
+ const struct cma_req_info *req,
+ const struct net_device *net_dev)
+{
+ struct rdma_id_private *id_priv, *id_priv_dev;
+
+ if (!bind_list)
+ return ERR_PTR(-EINVAL);
+
+ hlist_for_each_entry(id_priv, &bind_list->owners, node) {
+ if (cma_match_private_data(id_priv, ib_event->private_data)) {
+ if (id_priv->id.device == cm_id->device &&
+ cma_match_net_dev(id_priv, net_dev))
+ return id_priv;
+ list_for_each_entry(id_priv_dev,
+ &id_priv->listen_list,
+ listen_list) {
+ if (id_priv_dev->id.device == cm_id->device &&
+ cma_match_net_dev(id_priv_dev, net_dev))
+ return id_priv_dev;
+ }
+ }
+ }
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id,
+ struct ib_cm_event *ib_event,
+ struct net_device **net_dev)
+{
+ struct cma_req_info req;
+ struct rdma_bind_list *bind_list;
+ struct rdma_id_private *id_priv;
+ int err;
+
+ err = cma_save_req_info(ib_event, &req);
+ if (err)
+ return ERR_PTR(err);
+
+ *net_dev = cma_get_net_dev(ib_event, &req);
+ if (IS_ERR(*net_dev)) {
+ if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
+ /* Assuming the protocol is AF_IB */
+ *net_dev = NULL;
+ } else {
+ return ERR_CAST(*net_dev);
+ }
+ }
+
+ bind_list = cma_ps_find(rdma_ps_from_service_id(req.service_id),
+ cma_port_from_service_id(req.service_id));
+ id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev);
+ if (IS_ERR(id_priv)) {
+ dev_put(*net_dev);
+ *net_dev = NULL;
+ }
+
+ return id_priv;
+}
+
static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
{
return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
@@ -1038,7 +1377,7 @@ static void cma_release_port(struct rdma_id_private *id_priv)
mutex_lock(&lock);
hlist_del(&id_priv->node);
if (hlist_empty(&bind_list->owners)) {
- idr_remove(bind_list->ps, bind_list->port);
+ cma_ps_remove(bind_list->ps, bind_list->port);
kfree(bind_list);
}
mutex_unlock(&lock);
@@ -1216,11 +1555,15 @@ out:
}
static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
- struct ib_cm_event *ib_event)
+ struct ib_cm_event *ib_event,
+ struct net_device *net_dev)
{
struct rdma_id_private *id_priv;
struct rdma_cm_id *id;
struct rdma_route *rt;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
+ const __be64 service_id =
+ ib_event->param.req_rcvd.primary_path->service_id;
int ret;
id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1229,7 +1572,9 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
return NULL;
id_priv = container_of(id, struct rdma_id_private, id);
- if (cma_save_net_info(id, listen_id, ib_event))
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ listen_id, ib_event, ss_family, service_id))
goto err;
rt = &id->route;
@@ -1243,14 +1588,16 @@ static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
if (rt->num_paths == 2)
rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
- if (cma_any_addr(cma_src_addr(id_priv))) {
- rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
- rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
- ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
- } else {
- ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+ if (net_dev) {
+ ret = rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL);
if (ret)
goto err;
+ } else {
+ /* An AF_IB connection */
+ WARN_ON_ONCE(ss_family != AF_IB);
+
+ cma_translate_ib((struct sockaddr_ib *)cma_src_addr(id_priv),
+ &rt->addr.dev_addr);
}
rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
@@ -1263,10 +1610,12 @@ err:
}
static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
- struct ib_cm_event *ib_event)
+ struct ib_cm_event *ib_event,
+ struct net_device *net_dev)
{
struct rdma_id_private *id_priv;
struct rdma_cm_id *id;
+ const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
int ret;
id = rdma_create_id(listen_id->event_handler, listen_id->context,
@@ -1275,13 +1624,24 @@ static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
return NULL;
id_priv = container_of(id, struct rdma_id_private, id);
- if (cma_save_net_info(id, listen_id, ib_event))
+ if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
+ (struct sockaddr *)&id->route.addr.dst_addr,
+ listen_id, ib_event, ss_family,
+ ib_event->param.sidr_req_rcvd.service_id))
goto err;
- if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
- ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr);
+ if (net_dev) {
+ ret = rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL);
if (ret)
goto err;
+ } else {
+ /* An AF_IB connection */
+ WARN_ON_ONCE(ss_family != AF_IB);
+
+ if (!cma_any_addr(cma_src_addr(id_priv)))
+ cma_translate_ib((struct sockaddr_ib *)
+ cma_src_addr(id_priv),
+ &id->route.addr.dev_addr);
}
id_priv->state = RDMA_CM_CONNECT;
@@ -1319,25 +1679,33 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
{
struct rdma_id_private *listen_id, *conn_id;
struct rdma_cm_event event;
+ struct net_device *net_dev;
int offset, ret;
- listen_id = cm_id->context;
- if (!cma_check_req_qp_type(&listen_id->id, ib_event))
- return -EINVAL;
+ listen_id = cma_id_from_event(cm_id, ib_event, &net_dev);
+ if (IS_ERR(listen_id))
+ return PTR_ERR(listen_id);
- if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
- return -ECONNABORTED;
+ if (!cma_check_req_qp_type(&listen_id->id, ib_event)) {
+ ret = -EINVAL;
+ goto net_dev_put;
+ }
+
+ if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) {
+ ret = -ECONNABORTED;
+ goto net_dev_put;
+ }
memset(&event, 0, sizeof event);
offset = cma_user_data_offset(listen_id);
event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
- conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+ conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev);
event.param.ud.private_data = ib_event->private_data + offset;
event.param.ud.private_data_len =
IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
} else {
- conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+ conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev);
cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
ib_event->private_data, offset);
}
@@ -1375,6 +1743,8 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
mutex_unlock(&conn_id->handler_mutex);
mutex_unlock(&listen_id->handler_mutex);
cma_deref_id(conn_id);
+ if (net_dev)
+ dev_put(net_dev);
return 0;
err3:
@@ -1388,6 +1758,11 @@ err1:
mutex_unlock(&listen_id->handler_mutex);
if (conn_id)
rdma_destroy_id(&conn_id->id);
+
+net_dev_put:
+ if (net_dev)
+ dev_put(net_dev);
+
return ret;
}
@@ -1400,42 +1775,6 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
}
EXPORT_SYMBOL(rdma_get_service_id);
-static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
- struct ib_cm_compare_data *compare)
-{
- struct cma_hdr *cma_data, *cma_mask;
- __be32 ip4_addr;
- struct in6_addr ip6_addr;
-
- memset(compare, 0, sizeof *compare);
- cma_data = (void *) compare->data;
- cma_mask = (void *) compare->mask;
-
- switch (addr->sa_family) {
- case AF_INET:
- ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
- cma_set_ip_ver(cma_data, 4);
- cma_set_ip_ver(cma_mask, 0xF);
- if (!cma_any_addr(addr)) {
- cma_data->dst_addr.ip4.addr = ip4_addr;
- cma_mask->dst_addr.ip4.addr = htonl(~0);
- }
- break;
- case AF_INET6:
- ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
- cma_set_ip_ver(cma_data, 6);
- cma_set_ip_ver(cma_mask, 0xF);
- if (!cma_any_addr(addr)) {
- cma_data->dst_addr.ip6 = ip6_addr;
- memset(&cma_mask->dst_addr.ip6, 0xFF,
- sizeof cma_mask->dst_addr.ip6);
- }
- break;
- default:
- break;
- }
-}
-
static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
{
struct rdma_id_private *id_priv = iw_id->context;
@@ -1589,33 +1928,18 @@ out:
static int cma_ib_listen(struct rdma_id_private *id_priv)
{
- struct ib_cm_compare_data compare_data;
struct sockaddr *addr;
struct ib_cm_id *id;
__be64 svc_id;
- int ret;
- id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv);
+ addr = cma_src_addr(id_priv);
+ svc_id = rdma_get_service_id(&id_priv->id, addr);
+ id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id);
if (IS_ERR(id))
return PTR_ERR(id);
-
id_priv->cm_id.ib = id;
- addr = cma_src_addr(id_priv);
- svc_id = rdma_get_service_id(&id_priv->id, addr);
- if (cma_any_addr(addr) && !id_priv->afonly)
- ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
- else {
- cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
- ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
- }
-
- if (ret) {
- ib_destroy_cm_id(id_priv->cm_id.ib);
- id_priv->cm_id.ib = NULL;
- }
-
- return ret;
+ return 0;
}
static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
@@ -2203,8 +2527,11 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
src_addr = (struct sockaddr *) &id->route.addr.src_addr;
src_addr->sa_family = dst_addr->sa_family;
if (dst_addr->sa_family == AF_INET6) {
- ((struct sockaddr_in6 *) src_addr)->sin6_scope_id =
- ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
+ struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *) src_addr;
+ struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *) dst_addr;
+ src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
+ if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ id->route.addr.dev_addr.bound_dev_if = dst_addr6->sin6_scope_id;
} else if (dst_addr->sa_family == AF_IB) {
((struct sockaddr_ib *) src_addr)->sib_pkey =
((struct sockaddr_ib *) dst_addr)->sib_pkey;
@@ -2325,8 +2652,8 @@ static void cma_bind_port(struct rdma_bind_list *bind_list,
hlist_add_head(&id_priv->node, &bind_list->owners);
}
-static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
- unsigned short snum)
+static int cma_alloc_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv, unsigned short snum)
{
struct rdma_bind_list *bind_list;
int ret;
@@ -2335,7 +2662,7 @@ static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
if (!bind_list)
return -ENOMEM;
- ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL);
+ ret = cma_ps_alloc(ps, bind_list, snum);
if (ret < 0)
goto err;
@@ -2348,7 +2675,8 @@ err:
return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
}
-static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_alloc_any_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv)
{
static unsigned int last_used_port;
int low, high, remaining;
@@ -2359,7 +2687,7 @@ static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
rover = prandom_u32() % remaining + low;
retry:
if (last_used_port != rover &&
- !idr_find(ps, (unsigned short) rover)) {
+ !cma_ps_find(ps, (unsigned short)rover)) {
int ret = cma_alloc_port(ps, id_priv, rover);
/*
* Remember previously used port number in order to avoid
@@ -2414,7 +2742,8 @@ static int cma_check_port(struct rdma_bind_list *bind_list,
return 0;
}
-static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+static int cma_use_port(enum rdma_port_space ps,
+ struct rdma_id_private *id_priv)
{
struct rdma_bind_list *bind_list;
unsigned short snum;
@@ -2424,7 +2753,7 @@ static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
return -EACCES;
- bind_list = idr_find(ps, snum);
+ bind_list = cma_ps_find(ps, snum);
if (!bind_list) {
ret = cma_alloc_port(ps, id_priv, snum);
} else {
@@ -2447,25 +2776,24 @@ static int cma_bind_listen(struct rdma_id_private *id_priv)
return ret;
}
-static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_inet_ps(
+ struct rdma_id_private *id_priv)
{
switch (id_priv->id.ps) {
case RDMA_PS_TCP:
- return &tcp_ps;
case RDMA_PS_UDP:
- return &udp_ps;
case RDMA_PS_IPOIB:
- return &ipoib_ps;
case RDMA_PS_IB:
- return &ib_ps;
+ return id_priv->id.ps;
default:
- return NULL;
+
+ return 0;
}
}
-static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
+static enum rdma_port_space cma_select_ib_ps(struct rdma_id_private *id_priv)
{
- struct idr *ps = NULL;
+ enum rdma_port_space ps = 0;
struct sockaddr_ib *sib;
u64 sid_ps, mask, sid;
@@ -2475,15 +2803,15 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
sid_ps = RDMA_IB_IP_PS_IB;
- ps = &ib_ps;
+ ps = RDMA_PS_IB;
} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
(sid == (RDMA_IB_IP_PS_TCP & mask))) {
sid_ps = RDMA_IB_IP_PS_TCP;
- ps = &tcp_ps;
+ ps = RDMA_PS_TCP;
} else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
(sid == (RDMA_IB_IP_PS_UDP & mask))) {
sid_ps = RDMA_IB_IP_PS_UDP;
- ps = &udp_ps;
+ ps = RDMA_PS_UDP;
}
if (ps) {
@@ -2496,7 +2824,7 @@ static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
static int cma_get_port(struct rdma_id_private *id_priv)
{
- struct idr *ps;
+ enum rdma_port_space ps;
int ret;
if (cma_family(id_priv) != AF_IB)
@@ -3551,11 +3879,10 @@ static void cma_process_remove(struct cma_device *cma_dev)
wait_for_completion(&cma_dev->comp);
}
-static void cma_remove_one(struct ib_device *device)
+static void cma_remove_one(struct ib_device *device, void *client_data)
{
- struct cma_device *cma_dev;
+ struct cma_device *cma_dev = client_data;
- cma_dev = ib_get_client_data(device, &cma_client);
if (!cma_dev)
return;
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 87d1936f5c1c..70bb36ebb03b 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -43,12 +43,58 @@ int ib_device_register_sysfs(struct ib_device *device,
u8, struct kobject *));
void ib_device_unregister_sysfs(struct ib_device *device);
-int ib_sysfs_setup(void);
-void ib_sysfs_cleanup(void);
-
-int ib_cache_setup(void);
+void ib_cache_setup(void);
void ib_cache_cleanup(void);
int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+
+typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port,
+ struct net_device *idev, void *cookie);
+
+typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port,
+ struct net_device *idev, void *cookie);
+
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+ roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie);
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie);
+
+int ib_cache_gid_find_by_port(struct ib_device *ib_dev,
+ const union ib_gid *gid,
+ u8 port, struct net_device *ndev,
+ u16 *index);
+
+enum ib_cache_gid_default_mode {
+ IB_CACHE_GID_DEFAULT_MODE_SET,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE
+};
+
+void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev,
+ enum ib_cache_gid_default_mode mode);
+
+int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
+ union ib_gid *gid, struct ib_gid_attr *attr);
+
+int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev);
+
+int roce_gid_mgmt_init(void);
+void roce_gid_mgmt_cleanup(void);
+
+int roce_rescan_device(struct ib_device *ib_dev);
+
+int ib_cache_setup_one(struct ib_device *device);
+void ib_cache_cleanup_one(struct ib_device *device);
+void ib_cache_release_one(struct ib_device *device);
+
#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 9567756ca4f9..17639117afc6 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -38,7 +38,10 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/mutex.h>
+#include <linux/netdevice.h>
#include <rdma/rdma_netlink.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
#include "core_priv.h"
@@ -50,22 +53,34 @@ struct ib_client_data {
struct list_head list;
struct ib_client *client;
void * data;
+ /* The device or client is going down. Do not call client or device
+ * callbacks other than remove(). */
+ bool going_down;
};
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);
+/* The device_list and client_list contain devices and clients after their
+ * registration has completed, and the devices and clients are removed
+ * during unregistration. */
static LIST_HEAD(device_list);
static LIST_HEAD(client_list);
/*
- * device_mutex protects access to both device_list and client_list.
- * There's no real point to using multiple locks or something fancier
- * like an rwsem: we always access both lists, and we're always
- * modifying one list or the other list. In any case this is not a
- * hot path so there's no point in trying to optimize.
+ * device_mutex and lists_rwsem protect access to both device_list and
+ * client_list. device_mutex protects writer access by device and client
+ * registration / de-registration. lists_rwsem protects reader access to
+ * these lists. Iterators of these lists must lock it for read, while updates
+ * to the lists must be done with a write lock. A special case is when the
+ * device_mutex is locked. In this case locking the lists for read access is
+ * not necessary as the device_mutex implies it.
+ *
+ * lists_rwsem also protects access to the client data list.
*/
static DEFINE_MUTEX(device_mutex);
+static DECLARE_RWSEM(lists_rwsem);
+
static int ib_device_check_mandatory(struct ib_device *device)
{
@@ -152,6 +167,36 @@ static int alloc_name(char *name)
return 0;
}
+static void ib_device_release(struct device *device)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ ib_cache_release_one(dev);
+ kfree(dev->port_immutable);
+ kfree(dev);
+}
+
+static int ib_device_uevent(struct device *device,
+ struct kobj_uevent_env *env)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ if (add_uevent_var(env, "NAME=%s", dev->name))
+ return -ENOMEM;
+
+ /*
+ * It would be nice to pass the node GUID with the event...
+ */
+
+ return 0;
+}
+
+static struct class ib_class = {
+ .name = "infiniband",
+ .dev_release = ib_device_release,
+ .dev_uevent = ib_device_uevent,
+};
+
/**
* ib_alloc_device - allocate an IB device struct
* @size:size of structure to allocate
@@ -164,9 +209,27 @@ static int alloc_name(char *name)
*/
struct ib_device *ib_alloc_device(size_t size)
{
- BUG_ON(size < sizeof (struct ib_device));
+ struct ib_device *device;
+
+ if (WARN_ON(size < sizeof(struct ib_device)))
+ return NULL;
+
+ device = kzalloc(size, GFP_KERNEL);
+ if (!device)
+ return NULL;
+
+ device->dev.class = &ib_class;
+ device_initialize(&device->dev);
+
+ dev_set_drvdata(&device->dev, device);
+
+ INIT_LIST_HEAD(&device->event_handler_list);
+ spin_lock_init(&device->event_handler_lock);
+ spin_lock_init(&device->client_data_lock);
+ INIT_LIST_HEAD(&device->client_data_list);
+ INIT_LIST_HEAD(&device->port_list);
- return kzalloc(size, GFP_KERNEL);
+ return device;
}
EXPORT_SYMBOL(ib_alloc_device);
@@ -178,13 +241,8 @@ EXPORT_SYMBOL(ib_alloc_device);
*/
void ib_dealloc_device(struct ib_device *device)
{
- if (device->reg_state == IB_DEV_UNINITIALIZED) {
- kfree(device);
- return;
- }
-
- BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
-
+ WARN_ON(device->reg_state != IB_DEV_UNREGISTERED &&
+ device->reg_state != IB_DEV_UNINITIALIZED);
kobject_put(&device->dev.kobj);
}
EXPORT_SYMBOL(ib_dealloc_device);
@@ -203,10 +261,13 @@ static int add_client_context(struct ib_device *device, struct ib_client *client
context->client = client;
context->data = NULL;
+ context->going_down = false;
+ down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_add(&context->list, &device->client_data_list);
spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
return 0;
}
@@ -219,7 +280,7 @@ static int verify_immutable(const struct ib_device *dev, u8 port)
static int read_port_immutable(struct ib_device *device)
{
- int ret = -ENOMEM;
+ int ret;
u8 start_port = rdma_start_port(device);
u8 end_port = rdma_end_port(device);
u8 port;
@@ -235,26 +296,18 @@ static int read_port_immutable(struct ib_device *device)
* (end_port + 1),
GFP_KERNEL);
if (!device->port_immutable)
- goto err;
+ return -ENOMEM;
for (port = start_port; port <= end_port; ++port) {
ret = device->get_port_immutable(device, port,
&device->port_immutable[port]);
if (ret)
- goto err;
+ return ret;
- if (verify_immutable(device, port)) {
- ret = -EINVAL;
- goto err;
- }
+ if (verify_immutable(device, port))
+ return -EINVAL;
}
-
- ret = 0;
- goto out;
-err:
- kfree(device->port_immutable);
-out:
- return ret;
+ return 0;
}
/**
@@ -271,6 +324,7 @@ int ib_register_device(struct ib_device *device,
u8, struct kobject *))
{
int ret;
+ struct ib_client *client;
mutex_lock(&device_mutex);
@@ -285,11 +339,6 @@ int ib_register_device(struct ib_device *device,
goto out;
}
- INIT_LIST_HEAD(&device->event_handler_list);
- INIT_LIST_HEAD(&device->client_data_list);
- spin_lock_init(&device->event_handler_lock);
- spin_lock_init(&device->client_data_lock);
-
ret = read_port_immutable(device);
if (ret) {
printk(KERN_WARNING "Couldn't create per port immutable data %s\n",
@@ -297,27 +346,30 @@ int ib_register_device(struct ib_device *device,
goto out;
}
+ ret = ib_cache_setup_one(device);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+ goto out;
+ }
+
ret = ib_device_register_sysfs(device, port_callback);
if (ret) {
printk(KERN_WARNING "Couldn't register device %s with driver model\n",
device->name);
- kfree(device->port_immutable);
+ ib_cache_cleanup_one(device);
goto out;
}
- list_add_tail(&device->core_list, &device_list);
-
device->reg_state = IB_DEV_REGISTERED;
- {
- struct ib_client *client;
-
- list_for_each_entry(client, &client_list, list)
- if (client->add && !add_client_context(device, client))
- client->add(device);
- }
+ list_for_each_entry(client, &client_list, list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
- out:
+ down_write(&lists_rwsem);
+ list_add_tail(&device->core_list, &device_list);
+ up_write(&lists_rwsem);
+out:
mutex_unlock(&device_mutex);
return ret;
}
@@ -331,26 +383,37 @@ EXPORT_SYMBOL(ib_register_device);
*/
void ib_unregister_device(struct ib_device *device)
{
- struct ib_client *client;
struct ib_client_data *context, *tmp;
unsigned long flags;
mutex_lock(&device_mutex);
- list_for_each_entry_reverse(client, &client_list, list)
- if (client->remove)
- client->remove(device);
-
+ down_write(&lists_rwsem);
list_del(&device->core_list);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ context->going_down = true;
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ downgrade_write(&lists_rwsem);
+
+ list_for_each_entry_safe(context, tmp, &device->client_data_list,
+ list) {
+ if (context->client->remove)
+ context->client->remove(device, context->data);
+ }
+ up_read(&lists_rwsem);
mutex_unlock(&device_mutex);
ib_device_unregister_sysfs(device);
+ ib_cache_cleanup_one(device);
+ down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
kfree(context);
spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
device->reg_state = IB_DEV_UNREGISTERED;
}
@@ -375,11 +438,14 @@ int ib_register_client(struct ib_client *client)
mutex_lock(&device_mutex);
- list_add_tail(&client->list, &client_list);
list_for_each_entry(device, &device_list, core_list)
if (client->add && !add_client_context(device, client))
client->add(device);
+ down_write(&lists_rwsem);
+ list_add_tail(&client->list, &client_list);
+ up_write(&lists_rwsem);
+
mutex_unlock(&device_mutex);
return 0;
@@ -402,19 +468,41 @@ void ib_unregister_client(struct ib_client *client)
mutex_lock(&device_mutex);
+ down_write(&lists_rwsem);
+ list_del(&client->list);
+ up_write(&lists_rwsem);
+
list_for_each_entry(device, &device_list, core_list) {
- if (client->remove)
- client->remove(device);
+ struct ib_client_data *found_context = NULL;
+ down_write(&lists_rwsem);
spin_lock_irqsave(&device->client_data_lock, flags);
list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
if (context->client == client) {
- list_del(&context->list);
- kfree(context);
+ context->going_down = true;
+ found_context = context;
+ break;
}
spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
+
+ if (client->remove)
+ client->remove(device, found_context ?
+ found_context->data : NULL);
+
+ if (!found_context) {
+ pr_warn("No client context found for %s/%s\n",
+ device->name, client->name);
+ continue;
+ }
+
+ down_write(&lists_rwsem);
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_del(&found_context->list);
+ kfree(found_context);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ up_write(&lists_rwsem);
}
- list_del(&client->list);
mutex_unlock(&device_mutex);
}
@@ -590,11 +678,80 @@ EXPORT_SYMBOL(ib_query_port);
int ib_query_gid(struct ib_device *device,
u8 port_num, int index, union ib_gid *gid)
{
+ if (rdma_cap_roce_gid_table(device, port_num))
+ return ib_get_cached_gid(device, port_num, index, gid);
+
return device->query_gid(device, port_num, index, gid);
}
EXPORT_SYMBOL(ib_query_gid);
/**
+ * ib_enum_roce_netdev - enumerate all RoCE ports
+ * @ib_dev : IB device we want to query
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE ports
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all of the physical RoCE ports of ib_dev
+ * which are related to netdevice and calls callback() on each
+ * device for which filter() function returns non zero.
+ */
+void ib_enum_roce_netdev(struct ib_device *ib_dev,
+ roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ u8 port;
+
+ for (port = rdma_start_port(ib_dev); port <= rdma_end_port(ib_dev);
+ port++)
+ if (rdma_protocol_roce(ib_dev, port)) {
+ struct net_device *idev = NULL;
+
+ if (ib_dev->get_netdev)
+ idev = ib_dev->get_netdev(ib_dev, port);
+
+ if (idev &&
+ idev->reg_state >= NETREG_UNREGISTERED) {
+ dev_put(idev);
+ idev = NULL;
+ }
+
+ if (filter(ib_dev, port, idev, filter_cookie))
+ cb(ib_dev, port, idev, cookie);
+
+ if (idev)
+ dev_put(idev);
+ }
+}
+
+/**
+ * ib_enum_all_roce_netdevs - enumerate all RoCE devices
+ * @filter: Should we call the callback?
+ * @filter_cookie: Cookie passed to filter
+ * @cb: Callback to call for each found RoCE ports
+ * @cookie: Cookie passed back to the callback
+ *
+ * Enumerates all RoCE devices' physical ports which are related
+ * to netdevices and calls callback() on each device for which
+ * filter() function returns non zero.
+ */
+void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
+ void *filter_cookie,
+ roce_netdev_callback cb,
+ void *cookie)
+{
+ struct ib_device *dev;
+
+ down_read(&lists_rwsem);
+ list_for_each_entry(dev, &device_list, core_list)
+ ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
+ up_read(&lists_rwsem);
+}
+
+/**
* ib_query_pkey - Get P_Key table entry
* @device:Device to query
* @port_num:Port number to query
@@ -673,6 +830,14 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid,
int ret, port, i;
for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) {
+ if (rdma_cap_roce_gid_table(device, port)) {
+ if (!ib_cache_gid_find_by_port(device, gid, port,
+ NULL, index)) {
+ *port_num = port;
+ return 0;
+ }
+ }
+
for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) {
ret = ib_query_gid(device, port, i, &tmp_gid);
if (ret)
@@ -729,6 +894,51 @@ int ib_find_pkey(struct ib_device *device,
}
EXPORT_SYMBOL(ib_find_pkey);
+/**
+ * ib_get_net_dev_by_params() - Return the appropriate net_dev
+ * for a received CM request
+ * @dev: An RDMA device on which the request has been received.
+ * @port: Port number on the RDMA device.
+ * @pkey: The Pkey the request came on.
+ * @gid: A GID that the net_dev uses to communicate.
+ * @addr: Contains the IP address that the request specified as its
+ * destination.
+ */
+struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
+ u8 port,
+ u16 pkey,
+ const union ib_gid *gid,
+ const struct sockaddr *addr)
+{
+ struct net_device *net_dev = NULL;
+ struct ib_client_data *context;
+
+ if (!rdma_protocol_ib(dev, port))
+ return NULL;
+
+ down_read(&lists_rwsem);
+
+ list_for_each_entry(context, &dev->client_data_list, list) {
+ struct ib_client *client = context->client;
+
+ if (context->going_down)
+ continue;
+
+ if (client->get_net_dev_by_params) {
+ net_dev = client->get_net_dev_by_params(dev, port, pkey,
+ gid, addr,
+ context->data);
+ if (net_dev)
+ break;
+ }
+ }
+
+ up_read(&lists_rwsem);
+
+ return net_dev;
+}
+EXPORT_SYMBOL(ib_get_net_dev_by_params);
+
static int __init ib_core_init(void)
{
int ret;
@@ -737,7 +947,7 @@ static int __init ib_core_init(void)
if (!ib_wq)
return -ENOMEM;
- ret = ib_sysfs_setup();
+ ret = class_register(&ib_class);
if (ret) {
printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
goto err;
@@ -749,19 +959,12 @@ static int __init ib_core_init(void)
goto err_sysfs;
}
- ret = ib_cache_setup();
- if (ret) {
- printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
- goto err_nl;
- }
+ ib_cache_setup();
return 0;
-err_nl:
- ibnl_cleanup();
-
err_sysfs:
- ib_sysfs_cleanup();
+ class_unregister(&ib_class);
err:
destroy_workqueue(ib_wq);
@@ -772,7 +975,7 @@ static void __exit ib_core_cleanup(void)
{
ib_cache_cleanup();
ibnl_cleanup();
- ib_sysfs_cleanup();
+ class_unregister(&ib_class);
/* Make sure that any pending umem accounting work is done. */
destroy_workqueue(ib_wq);
}
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 786fc51bf04b..4b5c72311deb 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -338,13 +338,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
goto error1;
}
- mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
- IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(mad_agent_priv->agent.mr)) {
- ret = ERR_PTR(-ENOMEM);
- goto error2;
- }
-
if (mad_reg_req) {
reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
if (!reg_req) {
@@ -429,8 +422,6 @@ error4:
spin_unlock_irqrestore(&port_priv->reg_lock, flags);
kfree(reg_req);
error3:
- ib_dereg_mr(mad_agent_priv->agent.mr);
-error2:
kfree(mad_agent_priv);
error1:
return ret;
@@ -590,7 +581,6 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
wait_for_completion(&mad_agent_priv->comp);
kfree(mad_agent_priv->reg_req);
- ib_dereg_mr(mad_agent_priv->agent.mr);
kfree(mad_agent_priv);
}
@@ -1038,7 +1028,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
mad_send_wr->mad_agent_priv = mad_agent_priv;
mad_send_wr->sg_list[0].length = hdr_len;
- mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+ mad_send_wr->sg_list[0].lkey = mad_agent->qp->pd->local_dma_lkey;
/* OPA MADs don't have to be the full 2048 bytes */
if (opa && base_version == OPA_MGMT_BASE_VERSION &&
@@ -1047,7 +1037,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
else
mad_send_wr->sg_list[1].length = mad_size - hdr_len;
- mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
+ mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey;
mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
@@ -2885,7 +2875,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
/* Initialize common scatter list fields */
- sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+ sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
/* Initialize common receive WR fields */
recv_wr.next = NULL;
@@ -3201,13 +3191,6 @@ static int ib_mad_port_open(struct ib_device *device,
goto error4;
}
- port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(port_priv->mr)) {
- dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n");
- ret = PTR_ERR(port_priv->mr);
- goto error5;
- }
-
if (has_smi) {
ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
if (ret)
@@ -3248,8 +3231,6 @@ error8:
error7:
destroy_mad_qp(&port_priv->qp_info[0]);
error6:
- ib_dereg_mr(port_priv->mr);
-error5:
ib_dealloc_pd(port_priv->pd);
error4:
ib_destroy_cq(port_priv->cq);
@@ -3284,7 +3265,6 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
destroy_workqueue(port_priv->wq);
destroy_mad_qp(&port_priv->qp_info[1]);
destroy_mad_qp(&port_priv->qp_info[0]);
- ib_dereg_mr(port_priv->mr);
ib_dealloc_pd(port_priv->pd);
ib_destroy_cq(port_priv->cq);
cleanup_recv_queue(&port_priv->qp_info[1]);
@@ -3335,7 +3315,7 @@ error:
}
}
-static void ib_mad_remove_device(struct ib_device *device)
+static void ib_mad_remove_device(struct ib_device *device, void *client_data)
{
int i;
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 5be89f98928f..4a4f7aad0978 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -199,7 +199,6 @@ struct ib_mad_port_private {
int port_num;
struct ib_cq *cq;
struct ib_pd *pd;
- struct ib_mr *mr;
spinlock_t reg_lock;
struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
index 2cb865c7ce7a..d38d8b2b2979 100644
--- a/drivers/infiniband/core/multicast.c
+++ b/drivers/infiniband/core/multicast.c
@@ -43,7 +43,7 @@
#include "sa.h"
static void mcast_add_one(struct ib_device *device);
-static void mcast_remove_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device, void *client_data);
static struct ib_client mcast_client = {
.name = "ib_multicast",
@@ -840,13 +840,12 @@ static void mcast_add_one(struct ib_device *device)
ib_register_event_handler(&dev->event_handler);
}
-static void mcast_remove_one(struct ib_device *device)
+static void mcast_remove_one(struct ib_device *device, void *client_data)
{
- struct mcast_device *dev;
+ struct mcast_device *dev = client_data;
struct mcast_port *port;
int i;
- dev = ib_get_client_data(device, &mcast_client);
if (!dev)
return;
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
index 23dd5a5c7597..d47df9356779 100644
--- a/drivers/infiniband/core/netlink.c
+++ b/drivers/infiniband/core/netlink.c
@@ -49,6 +49,14 @@ static DEFINE_MUTEX(ibnl_mutex);
static struct sock *nls;
static LIST_HEAD(client_list);
+int ibnl_chk_listeners(unsigned int group)
+{
+ if (netlink_has_listeners(nls, group) == 0)
+ return -1;
+ return 0;
+}
+EXPORT_SYMBOL(ibnl_chk_listeners);
+
int ibnl_add_client(int index, int nops,
const struct ibnl_client_cbs cb_table[])
{
@@ -151,6 +159,23 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
!client->cb_table[op].dump)
return -EINVAL;
+ /*
+ * For response or local service set_timeout request,
+ * there is no need to use netlink_dump_start.
+ */
+ if (!(nlh->nlmsg_flags & NLM_F_REQUEST) ||
+ (index == RDMA_NL_LS &&
+ op == RDMA_NL_LS_OP_SET_TIMEOUT)) {
+ struct netlink_callback cb = {
+ .skb = skb,
+ .nlh = nlh,
+ .dump = client->cb_table[op].dump,
+ .module = client->cb_table[op].module,
+ };
+
+ return cb.dump(skb, &cb);
+ }
+
{
struct netlink_dump_control c = {
.dump = client->cb_table[op].dump,
@@ -165,9 +190,39 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
}
+static void ibnl_rcv_reply_skb(struct sk_buff *skb)
+{
+ struct nlmsghdr *nlh;
+ int msglen;
+
+ /*
+ * Process responses until there is no more message or the first
+ * request. Generally speaking, it is not recommended to mix responses
+ * with requests.
+ */
+ while (skb->len >= nlmsg_total_size(0)) {
+ nlh = nlmsg_hdr(skb);
+
+ if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
+ return;
+
+ /* Handle response only */
+ if (nlh->nlmsg_flags & NLM_F_REQUEST)
+ return;
+
+ ibnl_rcv_msg(skb, nlh);
+
+ msglen = NLMSG_ALIGN(nlh->nlmsg_len);
+ if (msglen > skb->len)
+ msglen = skb->len;
+ skb_pull(skb, msglen);
+ }
+}
+
static void ibnl_rcv(struct sk_buff *skb)
{
mutex_lock(&ibnl_mutex);
+ ibnl_rcv_reply_skb(skb);
netlink_rcv_skb(skb, &ibnl_rcv_msg);
mutex_unlock(&ibnl_mutex);
}
diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c
new file mode 100644
index 000000000000..6b24cba1e474
--- /dev/null
+++ b/drivers/infiniband/core/roce_gid_mgmt.c
@@ -0,0 +1,728 @@
+/*
+ * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/in.h>
+#include <linux/in6.h>
+
+/* For in6_dev_get/in6_dev_put */
+#include <net/addrconf.h>
+#include <net/bonding.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+enum gid_op_type {
+ GID_DEL = 0,
+ GID_ADD
+};
+
+struct update_gid_event_work {
+ struct work_struct work;
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+ enum gid_op_type gid_op;
+};
+
+#define ROCE_NETDEV_CALLBACK_SZ 3
+struct netdev_event_work_cmd {
+ roce_netdev_callback cb;
+ roce_netdev_filter filter;
+ struct net_device *ndev;
+ struct net_device *filter_ndev;
+};
+
+struct netdev_event_work {
+ struct work_struct work;
+ struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ];
+};
+
+static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
+ u8 port, union ib_gid *gid,
+ struct ib_gid_attr *gid_attr)
+{
+ switch (gid_op) {
+ case GID_ADD:
+ ib_cache_gid_add(ib_dev, port, gid, gid_attr);
+ break;
+ case GID_DEL:
+ ib_cache_gid_del(ib_dev, port, gid, gid_attr);
+ break;
+ }
+}
+
+enum bonding_slave_state {
+ BONDING_SLAVE_STATE_ACTIVE = 1UL << 0,
+ BONDING_SLAVE_STATE_INACTIVE = 1UL << 1,
+ /* No primary slave or the device isn't a slave in bonding */
+ BONDING_SLAVE_STATE_NA = 1UL << 2,
+};
+
+static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev,
+ struct net_device *upper)
+{
+ if (upper && netif_is_bond_master(upper)) {
+ struct net_device *pdev =
+ bond_option_active_slave_get_rcu(netdev_priv(upper));
+
+ if (pdev)
+ return dev == pdev ? BONDING_SLAVE_STATE_ACTIVE :
+ BONDING_SLAVE_STATE_INACTIVE;
+ }
+
+ return BONDING_SLAVE_STATE_NA;
+}
+
+static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper)
+{
+ struct net_device *_upper = NULL;
+ struct list_head *iter;
+
+ netdev_for_each_all_upper_dev_rcu(dev, _upper, iter)
+ if (_upper == upper)
+ break;
+
+ return _upper == upper;
+}
+
+#define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \
+ BONDING_SLAVE_STATE_NA)
+static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+ struct net_device *real_dev;
+ int res;
+
+ if (!rdma_ndev)
+ return 0;
+
+ rcu_read_lock();
+ real_dev = rdma_vlan_dev_real_dev(event_ndev);
+ if (!real_dev)
+ real_dev = event_ndev;
+
+ res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+ (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) &
+ REQUIRED_BOND_STATES)) ||
+ real_dev == rdma_ndev);
+
+ rcu_read_unlock();
+ return res;
+}
+
+static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *master_dev;
+ int res;
+
+ if (!rdma_ndev)
+ return 0;
+
+ rcu_read_lock();
+ master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+ res = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master_dev) ==
+ BONDING_SLAVE_STATE_INACTIVE;
+ rcu_read_unlock();
+
+ return res;
+}
+
+static int pass_all_filter(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ return 1;
+}
+
+static int upper_device_filter(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+ int res;
+
+ if (!rdma_ndev)
+ return 0;
+
+ if (rdma_ndev == event_ndev)
+ return 1;
+
+ rcu_read_lock();
+ res = is_upper_dev_rcu(rdma_ndev, event_ndev);
+ rcu_read_unlock();
+
+ return res;
+}
+
+static void update_gid_ip(enum gid_op_type gid_op,
+ struct ib_device *ib_dev,
+ u8 port, struct net_device *ndev,
+ struct sockaddr *addr)
+{
+ union ib_gid gid;
+ struct ib_gid_attr gid_attr;
+
+ rdma_ip2gid(addr, &gid);
+ memset(&gid_attr, 0, sizeof(gid_attr));
+ gid_attr.ndev = ndev;
+
+ update_gid(gid_op, ib_dev, port, &gid, &gid_attr);
+}
+
+static void enum_netdev_default_gids(struct ib_device *ib_dev,
+ u8 port, struct net_device *event_ndev,
+ struct net_device *rdma_ndev)
+{
+ rcu_read_lock();
+ if (!rdma_ndev ||
+ ((rdma_ndev != event_ndev &&
+ !is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
+ is_eth_active_slave_of_bonding_rcu(rdma_ndev,
+ netdev_master_upper_dev_get_rcu(rdma_ndev)) ==
+ BONDING_SLAVE_STATE_INACTIVE)) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+ IB_CACHE_GID_DEFAULT_MODE_SET);
+}
+
+static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
+ u8 port,
+ struct net_device *event_ndev,
+ struct net_device *rdma_ndev)
+{
+ struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev);
+
+ if (!rdma_ndev)
+ return;
+
+ if (!real_dev)
+ real_dev = event_ndev;
+
+ rcu_read_lock();
+
+ if (is_upper_dev_rcu(rdma_ndev, event_ndev) &&
+ is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) ==
+ BONDING_SLAVE_STATE_INACTIVE) {
+ rcu_read_unlock();
+
+ ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
+ IB_CACHE_GID_DEFAULT_MODE_DELETE);
+ } else {
+ rcu_read_unlock();
+ }
+}
+
+static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
+ u8 port, struct net_device *ndev)
+{
+ struct in_device *in_dev;
+
+ if (ndev->reg_state >= NETREG_UNREGISTERING)
+ return;
+
+ in_dev = in_dev_get(ndev);
+ if (!in_dev)
+ return;
+
+ for_ifa(in_dev) {
+ struct sockaddr_in ip;
+
+ ip.sin_family = AF_INET;
+ ip.sin_addr.s_addr = ifa->ifa_address;
+ update_gid_ip(GID_ADD, ib_dev, port, ndev,
+ (struct sockaddr *)&ip);
+ }
+ endfor_ifa(in_dev);
+
+ in_dev_put(in_dev);
+}
+
+static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
+ u8 port, struct net_device *ndev)
+{
+ struct inet6_ifaddr *ifp;
+ struct inet6_dev *in6_dev;
+ struct sin6_list {
+ struct list_head list;
+ struct sockaddr_in6 sin6;
+ };
+ struct sin6_list *sin6_iter;
+ struct sin6_list *sin6_temp;
+ struct ib_gid_attr gid_attr = {.ndev = ndev};
+ LIST_HEAD(sin6_list);
+
+ if (ndev->reg_state >= NETREG_UNREGISTERING)
+ return;
+
+ in6_dev = in6_dev_get(ndev);
+ if (!in6_dev)
+ return;
+
+ read_lock_bh(&in6_dev->lock);
+ list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+ struct sin6_list *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+
+ if (!entry) {
+ pr_warn("roce_gid_mgmt: couldn't allocate entry for IPv6 update\n");
+ continue;
+ }
+
+ entry->sin6.sin6_family = AF_INET6;
+ entry->sin6.sin6_addr = ifp->addr;
+ list_add_tail(&entry->list, &sin6_list);
+ }
+ read_unlock_bh(&in6_dev->lock);
+
+ in6_dev_put(in6_dev);
+
+ list_for_each_entry_safe(sin6_iter, sin6_temp, &sin6_list, list) {
+ union ib_gid gid;
+
+ rdma_ip2gid((struct sockaddr *)&sin6_iter->sin6, &gid);
+ update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr);
+ list_del(&sin6_iter->list);
+ kfree(sin6_iter);
+ }
+}
+
+static void _add_netdev_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *ndev)
+{
+ enum_netdev_ipv4_ips(ib_dev, port, ndev);
+ if (IS_ENABLED(CONFIG_IPV6))
+ enum_netdev_ipv6_ips(ib_dev, port, ndev);
+}
+
+static void add_netdev_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+
+ enum_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
+ _add_netdev_ips(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+
+ ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
+ u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct net *net;
+ struct net_device *ndev;
+
+ /* Lock the rtnl to make sure the netdevs does not move under
+ * our feet
+ */
+ rtnl_lock();
+ for_each_net(net)
+ for_each_netdev(net, ndev)
+ if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev))
+ add_netdev_ips(ib_dev, port, rdma_ndev, ndev);
+ rtnl_unlock();
+}
+
+/* This function will rescan all of the network devices in the system
+ * and add their gids, as needed, to the relevant RoCE devices. */
+int roce_rescan_device(struct ib_device *ib_dev)
+{
+ ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL,
+ enum_all_gids_of_dev_cb, NULL);
+
+ return 0;
+}
+
+static void callback_for_addr_gid_device_scan(struct ib_device *device,
+ u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct update_gid_event_work *parsed = cookie;
+
+ return update_gid(parsed->gid_op, device,
+ port, &parsed->gid,
+ &parsed->gid_attr);
+}
+
+static void handle_netdev_upper(struct ib_device *ib_dev, u8 port,
+ void *cookie,
+ void (*handle_netdev)(struct ib_device *ib_dev,
+ u8 port,
+ struct net_device *ndev))
+{
+ struct net_device *ndev = (struct net_device *)cookie;
+ struct upper_list {
+ struct list_head list;
+ struct net_device *upper;
+ };
+ struct net_device *upper;
+ struct list_head *iter;
+ struct upper_list *upper_iter;
+ struct upper_list *upper_temp;
+ LIST_HEAD(upper_list);
+
+ rcu_read_lock();
+ netdev_for_each_all_upper_dev_rcu(ndev, upper, iter) {
+ struct upper_list *entry = kmalloc(sizeof(*entry),
+ GFP_ATOMIC);
+
+ if (!entry) {
+ pr_info("roce_gid_mgmt: couldn't allocate entry to delete ndev\n");
+ continue;
+ }
+
+ list_add_tail(&entry->list, &upper_list);
+ dev_hold(upper);
+ entry->upper = upper;
+ }
+ rcu_read_unlock();
+
+ handle_netdev(ib_dev, port, ndev);
+ list_for_each_entry_safe(upper_iter, upper_temp, &upper_list,
+ list) {
+ handle_netdev(ib_dev, port, upper_iter->upper);
+ dev_put(upper_iter->upper);
+ list_del(&upper_iter->list);
+ kfree(upper_iter);
+ }
+}
+
+static void _roce_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
+ struct net_device *event_ndev)
+{
+ ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
+}
+
+static void del_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ handle_netdev_upper(ib_dev, port, cookie, _roce_del_all_netdev_gids);
+}
+
+static void add_netdev_upper_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ handle_netdev_upper(ib_dev, port, cookie, _add_netdev_ips);
+}
+
+static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev,
+ void *cookie)
+{
+ struct net_device *master_ndev;
+
+ rcu_read_lock();
+ master_ndev = netdev_master_upper_dev_get_rcu(rdma_ndev);
+ if (master_ndev)
+ dev_hold(master_ndev);
+ rcu_read_unlock();
+
+ if (master_ndev) {
+ bond_delete_netdev_default_gids(ib_dev, port, master_ndev,
+ rdma_ndev);
+ dev_put(master_ndev);
+ }
+}
+
+static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port,
+ struct net_device *rdma_ndev, void *cookie)
+{
+ struct net_device *event_ndev = (struct net_device *)cookie;
+
+ bond_delete_netdev_default_gids(ib_dev, port, event_ndev, rdma_ndev);
+}
+
+/* The following functions operate on all IB devices. netdevice_event and
+ * addr_event execute ib_enum_all_roce_netdevs through a work.
+ * ib_enum_all_roce_netdevs iterates through all IB devices.
+ */
+
+static void netdevice_event_work_handler(struct work_struct *_work)
+{
+ struct netdev_event_work *work =
+ container_of(_work, struct netdev_event_work, work);
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
+ ib_enum_all_roce_netdevs(work->cmds[i].filter,
+ work->cmds[i].filter_ndev,
+ work->cmds[i].cb,
+ work->cmds[i].ndev);
+ dev_put(work->cmds[i].ndev);
+ dev_put(work->cmds[i].filter_ndev);
+ }
+
+ kfree(work);
+}
+
+static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
+ struct net_device *ndev)
+{
+ unsigned int i;
+ struct netdev_event_work *ndev_work =
+ kmalloc(sizeof(*ndev_work), GFP_KERNEL);
+
+ if (!ndev_work) {
+ pr_warn("roce_gid_mgmt: can't allocate work for netdevice_event\n");
+ return NOTIFY_DONE;
+ }
+
+ memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
+ for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) {
+ if (!ndev_work->cmds[i].ndev)
+ ndev_work->cmds[i].ndev = ndev;
+ if (!ndev_work->cmds[i].filter_ndev)
+ ndev_work->cmds[i].filter_ndev = ndev;
+ dev_hold(ndev_work->cmds[i].ndev);
+ dev_hold(ndev_work->cmds[i].filter_ndev);
+ }
+ INIT_WORK(&ndev_work->work, netdevice_event_work_handler);
+
+ queue_work(ib_wq, &ndev_work->work);
+
+ return NOTIFY_DONE;
+}
+
+static const struct netdev_event_work_cmd add_cmd = {
+ .cb = add_netdev_ips, .filter = is_eth_port_of_netdev};
+static const struct netdev_event_work_cmd add_cmd_upper_ips = {
+ .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev};
+
+static void netdevice_event_changeupper(struct netdev_notifier_changeupper_info *changeupper_info,
+ struct netdev_event_work_cmd *cmds)
+{
+ static const struct netdev_event_work_cmd upper_ips_del_cmd = {
+ .cb = del_netdev_upper_ips, .filter = upper_device_filter};
+ static const struct netdev_event_work_cmd bonding_default_del_cmd = {
+ .cb = del_netdev_default_ips, .filter = is_eth_port_inactive_slave};
+
+ if (changeupper_info->linking == false) {
+ cmds[0] = upper_ips_del_cmd;
+ cmds[0].ndev = changeupper_info->upper_dev;
+ cmds[1] = add_cmd;
+ } else {
+ cmds[0] = bonding_default_del_cmd;
+ cmds[0].ndev = changeupper_info->upper_dev;
+ cmds[1] = add_cmd_upper_ips;
+ cmds[1].ndev = changeupper_info->upper_dev;
+ cmds[1].filter_ndev = changeupper_info->upper_dev;
+ }
+}
+
+static int netdevice_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ static const struct netdev_event_work_cmd del_cmd = {
+ .cb = del_netdev_ips, .filter = pass_all_filter};
+ static const struct netdev_event_work_cmd bonding_default_del_cmd_join = {
+ .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave};
+ static const struct netdev_event_work_cmd default_del_cmd = {
+ .cb = del_netdev_default_ips, .filter = pass_all_filter};
+ static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = {
+ .cb = del_netdev_upper_ips, .filter = upper_device_filter};
+ struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };
+
+ if (ndev->type != ARPHRD_ETHER)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ case NETDEV_UP:
+ cmds[0] = bonding_default_del_cmd_join;
+ cmds[1] = add_cmd;
+ break;
+
+ case NETDEV_UNREGISTER:
+ if (ndev->reg_state < NETREG_UNREGISTERED)
+ cmds[0] = del_cmd;
+ else
+ return NOTIFY_DONE;
+ break;
+
+ case NETDEV_CHANGEADDR:
+ cmds[0] = default_del_cmd;
+ cmds[1] = add_cmd;
+ break;
+
+ case NETDEV_CHANGEUPPER:
+ netdevice_event_changeupper(
+ container_of(ptr, struct netdev_notifier_changeupper_info, info),
+ cmds);
+ break;
+
+ case NETDEV_BONDING_FAILOVER:
+ cmds[0] = bonding_event_ips_del_cmd;
+ cmds[1] = bonding_default_del_cmd_join;
+ cmds[2] = add_cmd_upper_ips;
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return netdevice_queue_work(cmds, ndev);
+}
+
+static void update_gid_event_work_handler(struct work_struct *_work)
+{
+ struct update_gid_event_work *work =
+ container_of(_work, struct update_gid_event_work, work);
+
+ ib_enum_all_roce_netdevs(is_eth_port_of_netdev, work->gid_attr.ndev,
+ callback_for_addr_gid_device_scan, work);
+
+ dev_put(work->gid_attr.ndev);
+ kfree(work);
+}
+
+static int addr_event(struct notifier_block *this, unsigned long event,
+ struct sockaddr *sa, struct net_device *ndev)
+{
+ struct update_gid_event_work *work;
+ enum gid_op_type gid_op;
+
+ if (ndev->type != ARPHRD_ETHER)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_UP:
+ gid_op = GID_ADD;
+ break;
+
+ case NETDEV_DOWN:
+ gid_op = GID_DEL;
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work) {
+ pr_warn("roce_gid_mgmt: Couldn't allocate work for addr_event\n");
+ return NOTIFY_DONE;
+ }
+
+ INIT_WORK(&work->work, update_gid_event_work_handler);
+
+ rdma_ip2gid(sa, &work->gid);
+ work->gid_op = gid_op;
+
+ memset(&work->gid_attr, 0, sizeof(work->gid_attr));
+ dev_hold(ndev);
+ work->gid_attr.ndev = ndev;
+
+ queue_work(ib_wq, &work->work);
+
+ return NOTIFY_DONE;
+}
+
+static int inetaddr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct sockaddr_in in;
+ struct net_device *ndev;
+ struct in_ifaddr *ifa = ptr;
+
+ in.sin_family = AF_INET;
+ in.sin_addr.s_addr = ifa->ifa_address;
+ ndev = ifa->ifa_dev->dev;
+
+ return addr_event(this, event, (struct sockaddr *)&in, ndev);
+}
+
+static int inet6addr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct sockaddr_in6 in6;
+ struct net_device *ndev;
+ struct inet6_ifaddr *ifa6 = ptr;
+
+ in6.sin6_family = AF_INET6;
+ in6.sin6_addr = ifa6->addr;
+ ndev = ifa6->idev->dev;
+
+ return addr_event(this, event, (struct sockaddr *)&in6, ndev);
+}
+
+static struct notifier_block nb_netdevice = {
+ .notifier_call = netdevice_event
+};
+
+static struct notifier_block nb_inetaddr = {
+ .notifier_call = inetaddr_event
+};
+
+static struct notifier_block nb_inet6addr = {
+ .notifier_call = inet6addr_event
+};
+
+int __init roce_gid_mgmt_init(void)
+{
+ register_inetaddr_notifier(&nb_inetaddr);
+ if (IS_ENABLED(CONFIG_IPV6))
+ register_inet6addr_notifier(&nb_inet6addr);
+ /* We relay on the netdevice notifier to enumerate all
+ * existing devices in the system. Register to this notifier
+ * last to make sure we will not miss any IP add/del
+ * callbacks.
+ */
+ register_netdevice_notifier(&nb_netdevice);
+
+ return 0;
+}
+
+void __exit roce_gid_mgmt_cleanup(void)
+{
+ if (IS_ENABLED(CONFIG_IPV6))
+ unregister_inet6addr_notifier(&nb_inet6addr);
+ unregister_inetaddr_notifier(&nb_inetaddr);
+ unregister_netdevice_notifier(&nb_netdevice);
+ /* Ensure all gid deletion tasks complete before we go down,
+ * to avoid any reference to free'd memory. By the time
+ * ib-core is removed, all physical devices have been removed,
+ * so no issue with remaining hardware contexts.
+ */
+}
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index ca919f429666..8c014b33d8e0 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -45,12 +45,21 @@
#include <uapi/linux/if_ether.h>
#include <rdma/ib_pack.h>
#include <rdma/ib_cache.h>
+#include <rdma/rdma_netlink.h>
+#include <net/netlink.h>
+#include <uapi/rdma/ib_user_sa.h>
+#include <rdma/ib_marshall.h>
#include "sa.h"
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("InfiniBand subnet administration query support");
MODULE_LICENSE("Dual BSD/GPL");
+#define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100
+#define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000
+#define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000
+static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT;
+
struct ib_sa_sm_ah {
struct ib_ah *ah;
struct kref ref;
@@ -80,8 +89,16 @@ struct ib_sa_query {
struct ib_mad_send_buf *mad_buf;
struct ib_sa_sm_ah *sm_ah;
int id;
+ u32 flags;
+ struct list_head list; /* Local svc request list */
+ u32 seq; /* Local svc request sequence number */
+ unsigned long timeout; /* Local svc timeout */
+ u8 path_use; /* How will the pathrecord be used */
};
+#define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001
+#define IB_SA_CANCEL 0x00000002
+
struct ib_sa_service_query {
void (*callback)(int, struct ib_sa_service_rec *, void *);
void *context;
@@ -106,8 +123,28 @@ struct ib_sa_mcmember_query {
struct ib_sa_query sa_query;
};
+static LIST_HEAD(ib_nl_request_list);
+static DEFINE_SPINLOCK(ib_nl_request_lock);
+static atomic_t ib_nl_sa_request_seq;
+static struct workqueue_struct *ib_nl_wq;
+static struct delayed_work ib_nl_timed_work;
+static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = {
+ [LS_NLA_TYPE_PATH_RECORD] = {.type = NLA_BINARY,
+ .len = sizeof(struct ib_path_rec_data)},
+ [LS_NLA_TYPE_TIMEOUT] = {.type = NLA_U32},
+ [LS_NLA_TYPE_SERVICE_ID] = {.type = NLA_U64},
+ [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid)},
+ [LS_NLA_TYPE_SGID] = {.type = NLA_BINARY,
+ .len = sizeof(struct rdma_nla_ls_gid)},
+ [LS_NLA_TYPE_TCLASS] = {.type = NLA_U8},
+ [LS_NLA_TYPE_PKEY] = {.type = NLA_U16},
+ [LS_NLA_TYPE_QOS_CLASS] = {.type = NLA_U16},
+};
+
+
static void ib_sa_add_one(struct ib_device *device);
-static void ib_sa_remove_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device, void *client_data);
static struct ib_client sa_client = {
.name = "sa",
@@ -381,6 +418,427 @@ static const struct ib_field guidinfo_rec_table[] = {
.size_bits = 512 },
};
+static inline void ib_sa_disable_local_svc(struct ib_sa_query *query)
+{
+ query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE;
+}
+
+static inline int ib_sa_query_cancelled(struct ib_sa_query *query)
+{
+ return (query->flags & IB_SA_CANCEL);
+}
+
+static void ib_nl_set_path_rec_attrs(struct sk_buff *skb,
+ struct ib_sa_query *query)
+{
+ struct ib_sa_path_rec *sa_rec = query->mad_buf->context[1];
+ struct ib_sa_mad *mad = query->mad_buf->mad;
+ ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask;
+ u16 val16;
+ u64 val64;
+ struct rdma_ls_resolve_header *header;
+
+ query->mad_buf->context[1] = NULL;
+
+ /* Construct the family header first */
+ header = (struct rdma_ls_resolve_header *)
+ skb_put(skb, NLMSG_ALIGN(sizeof(*header)));
+ memcpy(header->device_name, query->port->agent->device->name,
+ LS_DEVICE_NAME_MAX);
+ header->port_num = query->port->port_num;
+
+ if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) &&
+ sa_rec->reversible != 0)
+ query->path_use = LS_RESOLVE_PATH_USE_GMP;
+ else
+ query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL;
+ header->path_use = query->path_use;
+
+ /* Now build the attributes */
+ if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) {
+ val64 = be64_to_cpu(sa_rec->service_id);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID,
+ sizeof(val64), &val64);
+ }
+ if (comp_mask & IB_SA_PATH_REC_DGID)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID,
+ sizeof(sa_rec->dgid), &sa_rec->dgid);
+ if (comp_mask & IB_SA_PATH_REC_SGID)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID,
+ sizeof(sa_rec->sgid), &sa_rec->sgid);
+ if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS,
+ sizeof(sa_rec->traffic_class), &sa_rec->traffic_class);
+
+ if (comp_mask & IB_SA_PATH_REC_PKEY) {
+ val16 = be16_to_cpu(sa_rec->pkey);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY,
+ sizeof(val16), &val16);
+ }
+ if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) {
+ val16 = be16_to_cpu(sa_rec->qos_class);
+ nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS,
+ sizeof(val16), &val16);
+ }
+}
+
+static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask)
+{
+ int len = 0;
+
+ if (comp_mask & IB_SA_PATH_REC_SERVICE_ID)
+ len += nla_total_size(sizeof(u64));
+ if (comp_mask & IB_SA_PATH_REC_DGID)
+ len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+ if (comp_mask & IB_SA_PATH_REC_SGID)
+ len += nla_total_size(sizeof(struct rdma_nla_ls_gid));
+ if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS)
+ len += nla_total_size(sizeof(u8));
+ if (comp_mask & IB_SA_PATH_REC_PKEY)
+ len += nla_total_size(sizeof(u16));
+ if (comp_mask & IB_SA_PATH_REC_QOS_CLASS)
+ len += nla_total_size(sizeof(u16));
+
+ /*
+ * Make sure that at least some of the required comp_mask bits are
+ * set.
+ */
+ if (WARN_ON(len == 0))
+ return len;
+
+ /* Add the family header */
+ len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header));
+
+ return len;
+}
+
+static int ib_nl_send_msg(struct ib_sa_query *query)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ void *data;
+ int ret = 0;
+ struct ib_sa_mad *mad;
+ int len;
+
+ mad = query->mad_buf->mad;
+ len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask);
+ if (len <= 0)
+ return -EMSGSIZE;
+
+ skb = nlmsg_new(len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ /* Put nlmsg header only for now */
+ data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS,
+ RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST);
+ if (!data) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+
+ /* Add attributes */
+ ib_nl_set_path_rec_attrs(skb, query);
+
+ /* Repair the nlmsg header length */
+ nlmsg_end(skb, nlh);
+
+ ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL);
+ if (!ret)
+ ret = len;
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static int ib_nl_make_request(struct ib_sa_query *query)
+{
+ unsigned long flags;
+ unsigned long delay;
+ int ret;
+
+ INIT_LIST_HEAD(&query->list);
+ query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq);
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ ret = ib_nl_send_msg(query);
+ if (ret <= 0) {
+ ret = -EIO;
+ goto request_out;
+ } else {
+ ret = 0;
+ }
+
+ delay = msecs_to_jiffies(sa_local_svc_timeout_ms);
+ query->timeout = delay + jiffies;
+ list_add_tail(&query->list, &ib_nl_request_list);
+ /* Start the timeout if this is the only request */
+ if (ib_nl_request_list.next == &query->list)
+ queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+
+request_out:
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+ return ret;
+}
+
+static int ib_nl_cancel_request(struct ib_sa_query *query)
+{
+ unsigned long flags;
+ struct ib_sa_query *wait_query;
+ int found = 0;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ list_for_each_entry(wait_query, &ib_nl_request_list, list) {
+ /* Let the timeout to take care of the callback */
+ if (query == wait_query) {
+ query->flags |= IB_SA_CANCEL;
+ query->timeout = jiffies;
+ list_move(&query->list, &ib_nl_request_list);
+ found = 1;
+ mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+
+ return found;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc);
+
+static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query,
+ const struct nlmsghdr *nlh)
+{
+ struct ib_mad_send_wc mad_send_wc;
+ struct ib_sa_mad *mad = NULL;
+ const struct nlattr *head, *curr;
+ struct ib_path_rec_data *rec;
+ int len, rem;
+ u32 mask = 0;
+ int status = -EIO;
+
+ if (query->callback) {
+ head = (const struct nlattr *) nlmsg_data(nlh);
+ len = nlmsg_len(nlh);
+ switch (query->path_use) {
+ case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL:
+ mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND;
+ break;
+
+ case LS_RESOLVE_PATH_USE_ALL:
+ case LS_RESOLVE_PATH_USE_GMP:
+ default:
+ mask = IB_PATH_PRIMARY | IB_PATH_GMP |
+ IB_PATH_BIDIRECTIONAL;
+ break;
+ }
+ nla_for_each_attr(curr, head, len, rem) {
+ if (curr->nla_type == LS_NLA_TYPE_PATH_RECORD) {
+ rec = nla_data(curr);
+ /*
+ * Get the first one. In the future, we may
+ * need to get up to 6 pathrecords.
+ */
+ if ((rec->flags & mask) == mask) {
+ mad = query->mad_buf->mad;
+ mad->mad_hdr.method |=
+ IB_MGMT_METHOD_RESP;
+ memcpy(mad->data, rec->path_rec,
+ sizeof(rec->path_rec));
+ status = 0;
+ break;
+ }
+ }
+ }
+ query->callback(query, status, mad);
+ }
+
+ mad_send_wc.send_buf = query->mad_buf;
+ mad_send_wc.status = IB_WC_SUCCESS;
+ send_handler(query->mad_buf->mad_agent, &mad_send_wc);
+}
+
+static void ib_nl_request_timeout(struct work_struct *work)
+{
+ unsigned long flags;
+ struct ib_sa_query *query;
+ unsigned long delay;
+ struct ib_mad_send_wc mad_send_wc;
+ int ret;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ while (!list_empty(&ib_nl_request_list)) {
+ query = list_entry(ib_nl_request_list.next,
+ struct ib_sa_query, list);
+
+ if (time_after(query->timeout, jiffies)) {
+ delay = query->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay);
+ break;
+ }
+
+ list_del(&query->list);
+ ib_sa_disable_local_svc(query);
+ /* Hold the lock to protect against query cancellation */
+ if (ib_sa_query_cancelled(query))
+ ret = -1;
+ else
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ if (ret) {
+ mad_send_wc.send_buf = query->mad_buf;
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ send_handler(query->port->agent, &mad_send_wc);
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ }
+ }
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+}
+
+static int ib_nl_handle_set_timeout(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+ int timeout, delta, abs_delta;
+ const struct nlattr *attr;
+ unsigned long flags;
+ struct ib_sa_query *query;
+ long delay = 0;
+ struct nlattr *tb[LS_NLA_TYPE_MAX];
+ int ret;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_policy);
+ attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT];
+ if (ret || !attr)
+ goto settimeout_out;
+
+ timeout = *(int *) nla_data(attr);
+ if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN)
+ timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN;
+ if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX)
+ timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX;
+
+ delta = timeout - sa_local_svc_timeout_ms;
+ if (delta < 0)
+ abs_delta = -delta;
+ else
+ abs_delta = delta;
+
+ if (delta != 0) {
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ sa_local_svc_timeout_ms = timeout;
+ list_for_each_entry(query, &ib_nl_request_list, list) {
+ if (delta < 0 && abs_delta > query->timeout)
+ query->timeout = 0;
+ else
+ query->timeout += delta;
+
+ /* Get the new delay from the first entry */
+ if (!delay) {
+ delay = query->timeout - jiffies;
+ if (delay <= 0)
+ delay = 1;
+ }
+ }
+ if (delay)
+ mod_delayed_work(ib_nl_wq, &ib_nl_timed_work,
+ (unsigned long)delay);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ }
+
+settimeout_out:
+ return skb->len;
+}
+
+static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh)
+{
+ struct nlattr *tb[LS_NLA_TYPE_MAX];
+ int ret;
+
+ if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR)
+ return 0;
+
+ ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh),
+ nlmsg_len(nlh), ib_nl_policy);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+
+static int ib_nl_handle_resolve_resp(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh;
+ unsigned long flags;
+ struct ib_sa_query *query;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_send_wc mad_send_wc;
+ int found = 0;
+ int ret;
+
+ if (!netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
+
+ spin_lock_irqsave(&ib_nl_request_lock, flags);
+ list_for_each_entry(query, &ib_nl_request_list, list) {
+ /*
+ * If the query is cancelled, let the timeout routine
+ * take care of it.
+ */
+ if (nlh->nlmsg_seq == query->seq) {
+ found = !ib_sa_query_cancelled(query);
+ if (found)
+ list_del(&query->list);
+ break;
+ }
+ }
+
+ if (!found) {
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ goto resp_out;
+ }
+
+ send_buf = query->mad_buf;
+
+ if (!ib_nl_is_good_resolve_resp(nlh)) {
+ /* if the result is a failure, send out the packet via IB */
+ ib_sa_disable_local_svc(query);
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ if (ret) {
+ mad_send_wc.send_buf = send_buf;
+ mad_send_wc.status = IB_WC_GENERAL_ERR;
+ send_handler(query->port->agent, &mad_send_wc);
+ }
+ } else {
+ spin_unlock_irqrestore(&ib_nl_request_lock, flags);
+ ib_nl_process_good_resolve_rsp(query, nlh);
+ }
+
+resp_out:
+ return skb->len;
+}
+
+static struct ibnl_client_cbs ib_sa_cb_table[] = {
+ [RDMA_NL_LS_OP_RESOLVE] = {
+ .dump = ib_nl_handle_resolve_resp,
+ .module = THIS_MODULE },
+ [RDMA_NL_LS_OP_SET_TIMEOUT] = {
+ .dump = ib_nl_handle_set_timeout,
+ .module = THIS_MODULE },
+};
+
static void free_sm_ah(struct kref *kref)
{
struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
@@ -502,7 +960,13 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
mad_buf = query->mad_buf;
spin_unlock_irqrestore(&idr_lock, flags);
- ib_cancel_mad(agent, mad_buf);
+ /*
+ * If the query is still on the netlink request list, schedule
+ * it to be cancelled by the timeout routine. Otherwise, it has been
+ * sent to the MAD layer and has to be cancelled from there.
+ */
+ if (!ib_nl_cancel_request(query))
+ ib_cancel_mad(agent, mad_buf);
}
EXPORT_SYMBOL(ib_sa_cancel_query);
@@ -639,6 +1103,14 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
query->mad_buf->context[0] = query;
query->id = id;
+ if (query->flags & IB_SA_ENABLE_LOCAL_SERVICE) {
+ if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) {
+ if (!ib_nl_make_request(query))
+ return id;
+ }
+ ib_sa_disable_local_svc(query);
+ }
+
ret = ib_post_send_mad(query->mad_buf, NULL);
if (ret) {
spin_lock_irqsave(&idr_lock, flags);
@@ -740,7 +1212,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -767,6 +1239,9 @@ int ib_sa_path_rec_get(struct ib_sa_client *client,
*sa_query = &query->sa_query;
+ query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE;
+ query->sa_query.mad_buf->context[1] = rec;
+
ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
if (ret < 0)
goto err2;
@@ -862,7 +1337,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client,
method != IB_SA_METHOD_DELETE)
return -EINVAL;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -954,7 +1429,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -1051,7 +1526,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
port = &sa_dev->port[port_num - sa_dev->start_port];
agent = port->agent;
- query = kmalloc(sizeof *query, gfp_mask);
+ query = kzalloc(sizeof(*query), gfp_mask);
if (!query)
return -ENOMEM;
@@ -1221,9 +1696,9 @@ free:
return;
}
-static void ib_sa_remove_one(struct ib_device *device)
+static void ib_sa_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_device *sa_dev = client_data;
int i;
if (!sa_dev)
@@ -1251,6 +1726,8 @@ static int __init ib_sa_init(void)
get_random_bytes(&tid, sizeof tid);
+ atomic_set(&ib_nl_sa_request_seq, 0);
+
ret = ib_register_client(&sa_client);
if (ret) {
printk(KERN_ERR "Couldn't register ib_sa client\n");
@@ -1263,7 +1740,25 @@ static int __init ib_sa_init(void)
goto err2;
}
+ ib_nl_wq = create_singlethread_workqueue("ib_nl_sa_wq");
+ if (!ib_nl_wq) {
+ ret = -ENOMEM;
+ goto err3;
+ }
+
+ if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS,
+ ib_sa_cb_table)) {
+ pr_err("Failed to add netlink callback\n");
+ ret = -EINVAL;
+ goto err4;
+ }
+ INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout);
+
return 0;
+err4:
+ destroy_workqueue(ib_nl_wq);
+err3:
+ mcast_cleanup();
err2:
ib_unregister_client(&sa_client);
err1:
@@ -1272,6 +1767,10 @@ err1:
static void __exit ib_sa_cleanup(void)
{
+ ibnl_remove_client(RDMA_NL_LS);
+ cancel_delayed_work(&ib_nl_timed_work);
+ flush_workqueue(ib_nl_wq);
+ destroy_workqueue(ib_nl_wq);
mcast_cleanup();
ib_unregister_client(&sa_client);
idr_destroy(&query_idr);
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 0b84a9cdfe5b..34cdd74b0a17 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -457,29 +457,6 @@ static struct kobj_type port_type = {
.default_attrs = port_default_attrs
};
-static void ib_device_release(struct device *device)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- kfree(dev->port_immutable);
- kfree(dev);
-}
-
-static int ib_device_uevent(struct device *device,
- struct kobj_uevent_env *env)
-{
- struct ib_device *dev = container_of(device, struct ib_device, dev);
-
- if (add_uevent_var(env, "NAME=%s", dev->name))
- return -ENOMEM;
-
- /*
- * It would be nice to pass the node GUID with the event...
- */
-
- return 0;
-}
-
static struct attribute **
alloc_group_attrs(ssize_t (*show)(struct ib_port *,
struct port_attribute *, char *buf),
@@ -702,12 +679,6 @@ static struct device_attribute *ib_class_attributes[] = {
&dev_attr_node_desc
};
-static struct class ib_class = {
- .name = "infiniband",
- .dev_release = ib_device_release,
- .dev_uevent = ib_device_uevent,
-};
-
/* Show a given an attribute in the statistics group */
static ssize_t show_protocol_stat(const struct device *device,
struct device_attribute *attr, char *buf,
@@ -846,14 +817,12 @@ int ib_device_register_sysfs(struct ib_device *device,
int ret;
int i;
- class_dev->class = &ib_class;
- class_dev->parent = device->dma_device;
- dev_set_name(class_dev, "%s", device->name);
- dev_set_drvdata(class_dev, device);
-
- INIT_LIST_HEAD(&device->port_list);
+ device->dev.parent = device->dma_device;
+ ret = dev_set_name(class_dev, "%s", device->name);
+ if (ret)
+ return ret;
- ret = device_register(class_dev);
+ ret = device_add(class_dev);
if (ret)
goto err;
@@ -916,13 +885,3 @@ void ib_device_unregister_sysfs(struct ib_device *device)
device_unregister(&device->dev);
}
-
-int ib_sysfs_setup(void)
-{
- return class_register(&ib_class);
-}
-
-void ib_sysfs_cleanup(void)
-{
- class_unregister(&ib_class);
-}
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
index 009481073644..6b4e8a008bc0 100644
--- a/drivers/infiniband/core/ucm.c
+++ b/drivers/infiniband/core/ucm.c
@@ -109,7 +109,7 @@ enum {
#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
static void ib_ucm_add_one(struct ib_device *device);
-static void ib_ucm_remove_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data);
static struct ib_client ucm_client = {
.name = "ucm",
@@ -658,8 +658,7 @@ static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
if (result)
goto out;
- result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask,
- NULL);
+ result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask);
out:
ib_ucm_ctx_put(ctx);
return result;
@@ -1310,9 +1309,9 @@ err:
return;
}
-static void ib_ucm_remove_one(struct ib_device *device)
+static void ib_ucm_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client);
+ struct ib_ucm_device *ucm_dev = client_data;
if (!ucm_dev)
return;
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 29b21213ea75..a53fc9b01c69 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -74,6 +74,7 @@ struct ucma_file {
struct list_head ctx_list;
struct list_head event_list;
wait_queue_head_t poll_wait;
+ struct workqueue_struct *close_wq;
};
struct ucma_context {
@@ -89,6 +90,13 @@ struct ucma_context {
struct list_head list;
struct list_head mc_list;
+ /* mark that device is in process of destroying the internal HW
+ * resources, protected by the global mut
+ */
+ int closing;
+ /* sync between removal event and id destroy, protected by file mut */
+ int destroying;
+ struct work_struct close_work;
};
struct ucma_multicast {
@@ -107,6 +115,7 @@ struct ucma_event {
struct list_head list;
struct rdma_cm_id *cm_id;
struct rdma_ucm_event_resp resp;
+ struct work_struct close_work;
};
static DEFINE_MUTEX(mut);
@@ -132,8 +141,12 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
mutex_lock(&mut);
ctx = _ucma_find_context(id, file);
- if (!IS_ERR(ctx))
- atomic_inc(&ctx->ref);
+ if (!IS_ERR(ctx)) {
+ if (ctx->closing)
+ ctx = ERR_PTR(-EIO);
+ else
+ atomic_inc(&ctx->ref);
+ }
mutex_unlock(&mut);
return ctx;
}
@@ -144,6 +157,28 @@ static void ucma_put_ctx(struct ucma_context *ctx)
complete(&ctx->comp);
}
+static void ucma_close_event_id(struct work_struct *work)
+{
+ struct ucma_event *uevent_close = container_of(work, struct ucma_event, close_work);
+
+ rdma_destroy_id(uevent_close->cm_id);
+ kfree(uevent_close);
+}
+
+static void ucma_close_id(struct work_struct *work)
+{
+ struct ucma_context *ctx = container_of(work, struct ucma_context, close_work);
+
+ /* once all inflight tasks are finished, we close all underlying
+ * resources. The context is still alive till its explicit destryoing
+ * by its creator.
+ */
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ /* No new events will be generated after destroying the id. */
+ rdma_destroy_id(ctx->cm_id);
+}
+
static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
{
struct ucma_context *ctx;
@@ -152,6 +187,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
if (!ctx)
return NULL;
+ INIT_WORK(&ctx->close_work, ucma_close_id);
atomic_set(&ctx->ref, 1);
init_completion(&ctx->comp);
INIT_LIST_HEAD(&ctx->mc_list);
@@ -242,6 +278,44 @@ static void ucma_set_event_context(struct ucma_context *ctx,
}
}
+/* Called with file->mut locked for the relevant context. */
+static void ucma_removal_event_handler(struct rdma_cm_id *cm_id)
+{
+ struct ucma_context *ctx = cm_id->context;
+ struct ucma_event *con_req_eve;
+ int event_found = 0;
+
+ if (ctx->destroying)
+ return;
+
+ /* only if context is pointing to cm_id that it owns it and can be
+ * queued to be closed, otherwise that cm_id is an inflight one that
+ * is part of that context event list pending to be detached and
+ * reattached to its new context as part of ucma_get_event,
+ * handled separately below.
+ */
+ if (ctx->cm_id == cm_id) {
+ mutex_lock(&mut);
+ ctx->closing = 1;
+ mutex_unlock(&mut);
+ queue_work(ctx->file->close_wq, &ctx->close_work);
+ return;
+ }
+
+ list_for_each_entry(con_req_eve, &ctx->file->event_list, list) {
+ if (con_req_eve->cm_id == cm_id &&
+ con_req_eve->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ list_del(&con_req_eve->list);
+ INIT_WORK(&con_req_eve->close_work, ucma_close_event_id);
+ queue_work(ctx->file->close_wq, &con_req_eve->close_work);
+ event_found = 1;
+ break;
+ }
+ }
+ if (!event_found)
+ printk(KERN_ERR "ucma_removal_event_handler: warning: connect request event wasn't found\n");
+}
+
static int ucma_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event)
{
@@ -276,14 +350,21 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id,
* We ignore events for new connections until userspace has set
* their context. This can only happen if an error occurs on a
* new connection before the user accepts it. This is okay,
- * since the accept will just fail later.
+ * since the accept will just fail later. However, we do need
+ * to release the underlying HW resources in case of a device
+ * removal event.
*/
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ucma_removal_event_handler(cm_id);
+
kfree(uevent);
goto out;
}
list_add_tail(&uevent->list, &ctx->file->event_list);
wake_up_interruptible(&ctx->file->poll_wait);
+ if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+ ucma_removal_event_handler(cm_id);
out:
mutex_unlock(&ctx->file->mut);
return ret;
@@ -442,9 +523,15 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
}
/*
- * We cannot hold file->mut when calling rdma_destroy_id() or we can
- * deadlock. We also acquire file->mut in ucma_event_handler(), and
- * rdma_destroy_id() will wait until all callbacks have completed.
+ * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At
+ * this point, no new events will be reported from the hardware. However, we
+ * still need to cleanup the UCMA context for this ID. Specifically, there
+ * might be events that have not yet been consumed by the user space software.
+ * These might include pending connect requests which we have not completed
+ * processing. We cannot call rdma_destroy_id while holding the lock of the
+ * context (file->mut), as it might cause a deadlock. We therefore extract all
+ * relevant events from the context pending events list while holding the
+ * mutex. After that we release them as needed.
*/
static int ucma_free_ctx(struct ucma_context *ctx)
{
@@ -452,8 +539,6 @@ static int ucma_free_ctx(struct ucma_context *ctx)
struct ucma_event *uevent, *tmp;
LIST_HEAD(list);
- /* No new events will be generated after destroying the id. */
- rdma_destroy_id(ctx->cm_id);
ucma_cleanup_multicast(ctx);
@@ -501,10 +586,24 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
if (IS_ERR(ctx))
return PTR_ERR(ctx);
- ucma_put_ctx(ctx);
- wait_for_completion(&ctx->comp);
- resp.events_reported = ucma_free_ctx(ctx);
+ mutex_lock(&ctx->file->mut);
+ ctx->destroying = 1;
+ mutex_unlock(&ctx->file->mut);
+ flush_workqueue(ctx->file->close_wq);
+ /* At this point it's guaranteed that there is no inflight
+ * closing task */
+ mutex_lock(&mut);
+ if (!ctx->closing) {
+ mutex_unlock(&mut);
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ rdma_destroy_id(ctx->cm_id);
+ } else {
+ mutex_unlock(&mut);
+ }
+
+ resp.events_reported = ucma_free_ctx(ctx);
if (copy_to_user((void __user *)(unsigned long)cmd.response,
&resp, sizeof(resp)))
ret = -EFAULT;
@@ -1321,10 +1420,10 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file,
mc = ERR_PTR(-ENOENT);
else if (mc->ctx->file != file)
mc = ERR_PTR(-EINVAL);
- else {
+ else if (!atomic_inc_not_zero(&mc->ctx->ref))
+ mc = ERR_PTR(-ENXIO);
+ else
idr_remove(&multicast_idr, mc->id);
- atomic_inc(&mc->ctx->ref);
- }
mutex_unlock(&mut);
if (IS_ERR(mc)) {
@@ -1529,6 +1628,7 @@ static int ucma_open(struct inode *inode, struct file *filp)
INIT_LIST_HEAD(&file->ctx_list);
init_waitqueue_head(&file->poll_wait);
mutex_init(&file->mut);
+ file->close_wq = create_singlethread_workqueue("ucma_close_id");
filp->private_data = file;
file->filp = filp;
@@ -1543,16 +1643,34 @@ static int ucma_close(struct inode *inode, struct file *filp)
mutex_lock(&file->mut);
list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+ ctx->destroying = 1;
mutex_unlock(&file->mut);
mutex_lock(&mut);
idr_remove(&ctx_idr, ctx->id);
mutex_unlock(&mut);
+ flush_workqueue(file->close_wq);
+ /* At that step once ctx was marked as destroying and workqueue
+ * was flushed we are safe from any inflights handlers that
+ * might put other closing task.
+ */
+ mutex_lock(&mut);
+ if (!ctx->closing) {
+ mutex_unlock(&mut);
+ /* rdma_destroy_id ensures that no event handlers are
+ * inflight for that id before releasing it.
+ */
+ rdma_destroy_id(ctx->cm_id);
+ } else {
+ mutex_unlock(&mut);
+ }
+
ucma_free_ctx(ctx);
mutex_lock(&file->mut);
}
mutex_unlock(&file->mut);
+ destroy_workqueue(file->close_wq);
kfree(file);
return 0;
}
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 35567fffaa4e..57f281f8d686 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -133,7 +133,7 @@ static DEFINE_SPINLOCK(port_lock);
static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
static void ib_umad_add_one(struct ib_device *device);
-static void ib_umad_remove_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device, void *client_data);
static void ib_umad_release_dev(struct kobject *kobj)
{
@@ -1322,9 +1322,9 @@ free:
kobject_put(&umad_dev->kobj);
}
-static void ib_umad_remove_one(struct ib_device *device)
+static void ib_umad_remove_one(struct ib_device *device, void *client_data)
{
- struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+ struct ib_umad_device *umad_dev = client_data;
int i;
if (!umad_dev)
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index ba365b6d1e8d..3863d33c243d 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -85,15 +85,20 @@
*/
struct ib_uverbs_device {
- struct kref ref;
+ atomic_t refcount;
int num_comp_vectors;
struct completion comp;
struct device *dev;
- struct ib_device *ib_dev;
+ struct ib_device __rcu *ib_dev;
int devnum;
struct cdev cdev;
struct rb_root xrcd_tree;
struct mutex xrcd_tree_mutex;
+ struct kobject kobj;
+ struct srcu_struct disassociate_srcu;
+ struct mutex lists_mutex; /* protect lists */
+ struct list_head uverbs_file_list;
+ struct list_head uverbs_events_file_list;
};
struct ib_uverbs_event_file {
@@ -105,6 +110,7 @@ struct ib_uverbs_event_file {
wait_queue_head_t poll_wait;
struct fasync_struct *async_queue;
struct list_head event_list;
+ struct list_head list;
};
struct ib_uverbs_file {
@@ -114,6 +120,8 @@ struct ib_uverbs_file {
struct ib_ucontext *ucontext;
struct ib_event_handler event_handler;
struct ib_uverbs_event_file *async_file;
+ struct list_head list;
+ int is_closed;
};
struct ib_uverbs_event {
@@ -177,7 +185,9 @@ extern struct idr ib_uverbs_rule_idr;
void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ struct ib_device *ib_dev,
int is_async);
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *uverbs_file);
struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
@@ -212,6 +222,7 @@ struct ib_uverbs_flow_spec {
#define IB_UVERBS_DECLARE_CMD(name) \
ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
+ struct ib_device *ib_dev, \
const char __user *buf, int in_len, \
int out_len)
@@ -253,6 +264,7 @@ IB_UVERBS_DECLARE_CMD(close_xrcd);
#define IB_UVERBS_DECLARE_EX_CMD(name) \
int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \
+ struct ib_device *ib_dev, \
struct ib_udata *ucore, \
struct ib_udata *uhw)
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index bbb02ffe87df..be4cb9f04be3 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -282,13 +282,13 @@ static void put_xrcd_read(struct ib_uobject *uobj)
}
ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
struct ib_uverbs_get_context cmd;
struct ib_uverbs_get_context_resp resp;
struct ib_udata udata;
- struct ib_device *ibdev = file->device->ib_dev;
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
struct ib_device_attr dev_attr;
#endif
@@ -313,13 +313,13 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
(unsigned long) cmd.response + sizeof resp,
in_len - sizeof cmd, out_len - sizeof resp);
- ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+ ucontext = ib_dev->alloc_ucontext(ib_dev, &udata);
if (IS_ERR(ucontext)) {
ret = PTR_ERR(ucontext);
goto err;
}
- ucontext->device = ibdev;
+ ucontext->device = ib_dev;
INIT_LIST_HEAD(&ucontext->pd_list);
INIT_LIST_HEAD(&ucontext->mr_list);
INIT_LIST_HEAD(&ucontext->mw_list);
@@ -340,7 +340,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
ucontext->odp_mrs_count = 0;
INIT_LIST_HEAD(&ucontext->no_private_counters);
- ret = ib_query_device(ibdev, &dev_attr);
+ ret = ib_query_device(ib_dev, &dev_attr);
if (ret)
goto err_free;
if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
@@ -355,7 +355,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
goto err_free;
resp.async_fd = ret;
- filp = ib_uverbs_alloc_event_file(file, 1);
+ filp = ib_uverbs_alloc_event_file(file, ib_dev, 1);
if (IS_ERR(filp)) {
ret = PTR_ERR(filp);
goto err_fd;
@@ -367,16 +367,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
goto err_file;
}
- file->async_file = filp->private_data;
-
- INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
- ib_uverbs_event_handler);
- ret = ib_register_event_handler(&file->event_handler);
- if (ret)
- goto err_file;
-
- kref_get(&file->async_file->ref);
- kref_get(&file->ref);
file->ucontext = ucontext;
fd_install(resp.async_fd, filp);
@@ -386,6 +376,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
return in_len;
err_file:
+ ib_uverbs_free_async_event_file(file);
fput(filp);
err_fd:
@@ -393,7 +384,7 @@ err_fd:
err_free:
put_pid(ucontext->tgid);
- ibdev->dealloc_ucontext(ucontext);
+ ib_dev->dealloc_ucontext(ucontext);
err:
mutex_unlock(&file->mutex);
@@ -401,11 +392,12 @@ err:
}
static void copy_query_dev_fields(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_uverbs_query_device_resp *resp,
struct ib_device_attr *attr)
{
resp->fw_ver = attr->fw_ver;
- resp->node_guid = file->device->ib_dev->node_guid;
+ resp->node_guid = ib_dev->node_guid;
resp->sys_image_guid = attr->sys_image_guid;
resp->max_mr_size = attr->max_mr_size;
resp->page_size_cap = attr->page_size_cap;
@@ -443,10 +435,11 @@ static void copy_query_dev_fields(struct ib_uverbs_file *file,
resp->max_srq_sge = attr->max_srq_sge;
resp->max_pkeys = attr->max_pkeys;
resp->local_ca_ack_delay = attr->local_ca_ack_delay;
- resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt;
+ resp->phys_port_cnt = ib_dev->phys_port_cnt;
}
ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -461,12 +454,12 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- ret = ib_query_device(file->device->ib_dev, &attr);
+ ret = ib_query_device(ib_dev, &attr);
if (ret)
return ret;
memset(&resp, 0, sizeof resp);
- copy_query_dev_fields(file, &resp, &attr);
+ copy_query_dev_fields(file, ib_dev, &resp, &attr);
if (copy_to_user((void __user *) (unsigned long) cmd.response,
&resp, sizeof resp))
@@ -476,6 +469,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -490,7 +484,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
if (copy_from_user(&cmd, buf, sizeof cmd))
return -EFAULT;
- ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+ ret = ib_query_port(ib_dev, cmd.port_num, &attr);
if (ret)
return ret;
@@ -515,7 +509,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
resp.active_width = attr.active_width;
resp.active_speed = attr.active_speed;
resp.phys_state = attr.phys_state;
- resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev,
+ resp.link_layer = rdma_port_get_link_layer(ib_dev,
cmd.port_num);
if (copy_to_user((void __user *) (unsigned long) cmd.response,
@@ -526,6 +520,7 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -553,15 +548,15 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
down_write(&uobj->mutex);
- pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
- file->ucontext, &udata);
+ pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata);
if (IS_ERR(pd)) {
ret = PTR_ERR(pd);
goto err;
}
- pd->device = file->device->ib_dev;
+ pd->device = ib_dev;
pd->uobject = uobj;
+ pd->local_mr = NULL;
atomic_set(&pd->usecnt, 0);
uobj->object = pd;
@@ -600,11 +595,13 @@ err:
}
ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
struct ib_uverbs_dealloc_pd cmd;
struct ib_uobject *uobj;
+ struct ib_pd *pd;
int ret;
if (copy_from_user(&cmd, buf, sizeof cmd))
@@ -613,15 +610,20 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
if (!uobj)
return -EINVAL;
+ pd = uobj->object;
- ret = ib_dealloc_pd(uobj->object);
- if (!ret)
- uobj->live = 0;
-
- put_uobj_write(uobj);
+ if (atomic_read(&pd->usecnt)) {
+ ret = -EBUSY;
+ goto err_put;
+ }
+ ret = pd->device->dealloc_pd(uobj->object);
+ WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
if (ret)
- return ret;
+ goto err_put;
+
+ uobj->live = 0;
+ put_uobj_write(uobj);
idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
@@ -632,6 +634,10 @@ ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
put_uobj(uobj);
return in_len;
+
+err_put:
+ put_uobj_write(uobj);
+ return ret;
}
struct xrcd_table_entry {
@@ -720,6 +726,7 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev,
}
ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -778,15 +785,14 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
down_write(&obj->uobject.mutex);
if (!xrcd) {
- xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
- file->ucontext, &udata);
+ xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata);
if (IS_ERR(xrcd)) {
ret = PTR_ERR(xrcd);
goto err;
}
xrcd->inode = inode;
- xrcd->device = file->device->ib_dev;
+ xrcd->device = ib_dev;
atomic_set(&xrcd->usecnt, 0);
mutex_init(&xrcd->tgt_qp_mutex);
INIT_LIST_HEAD(&xrcd->tgt_qp_list);
@@ -857,6 +863,7 @@ err_tree_mutex_unlock:
}
ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -934,6 +941,7 @@ void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
}
ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1043,6 +1051,7 @@ err_free:
}
ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1136,6 +1145,7 @@ put_uobjs:
}
ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1174,8 +1184,9 @@ ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_alloc_mw cmd;
struct ib_uverbs_alloc_mw_resp resp;
@@ -1256,8 +1267,9 @@ err_free:
}
ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
- const char __user *buf, int in_len,
- int out_len)
+ struct ib_device *ib_dev,
+ const char __user *buf, int in_len,
+ int out_len)
{
struct ib_uverbs_dealloc_mw cmd;
struct ib_mw *mw;
@@ -1294,6 +1306,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1313,7 +1326,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
return ret;
resp.fd = ret;
- filp = ib_uverbs_alloc_event_file(file, 0);
+ filp = ib_uverbs_alloc_event_file(file, ib_dev, 0);
if (IS_ERR(filp)) {
put_unused_fd(resp.fd);
return PTR_ERR(filp);
@@ -1331,6 +1344,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
}
static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw,
struct ib_uverbs_ex_create_cq *cmd,
@@ -1379,14 +1393,14 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file,
if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags))
attr.flags = cmd->flags;
- cq = file->device->ib_dev->create_cq(file->device->ib_dev, &attr,
+ cq = ib_dev->create_cq(ib_dev, &attr,
file->ucontext, uhw);
if (IS_ERR(cq)) {
ret = PTR_ERR(cq);
goto err_file;
}
- cq->device = file->device->ib_dev;
+ cq->device = ib_dev;
cq->uobject = &obj->uobject;
cq->comp_handler = ib_uverbs_comp_handler;
cq->event_handler = ib_uverbs_cq_event_handler;
@@ -1447,6 +1461,7 @@ static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1475,7 +1490,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
cmd_ex.comp_vector = cmd.comp_vector;
cmd_ex.comp_channel = cmd.comp_channel;
- obj = create_cq(file, &ucore, &uhw, &cmd_ex,
+ obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex,
offsetof(typeof(cmd_ex), comp_channel) +
sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb,
NULL);
@@ -1498,6 +1513,7 @@ static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file,
}
int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
@@ -1523,7 +1539,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
sizeof(resp.response_length)))
return -ENOSPC;
- obj = create_cq(file, ucore, uhw, &cmd,
+ obj = create_cq(file, ib_dev, ucore, uhw, &cmd,
min(ucore->inlen, sizeof(cmd)),
ib_uverbs_ex_create_cq_cb, NULL);
@@ -1534,6 +1550,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1597,6 +1614,7 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
}
ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1648,6 +1666,7 @@ out_put:
}
ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1670,6 +1689,7 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1722,6 +1742,7 @@ ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -1917,6 +1938,7 @@ err_put:
}
ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len, int out_len)
{
struct ib_uverbs_open_qp cmd;
@@ -2011,6 +2033,7 @@ err_put:
}
ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2125,6 +2148,7 @@ static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
}
ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2221,6 +2245,7 @@ out:
}
ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2279,6 +2304,7 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2346,6 +2372,12 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
next->send_flags = user_wr->send_flags;
if (is_ud) {
+ if (next->opcode != IB_WR_SEND &&
+ next->opcode != IB_WR_SEND_WITH_IMM) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
file->ucontext);
if (!next->wr.ud.ah) {
@@ -2385,9 +2417,11 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
user_wr->wr.atomic.compare_add;
next->wr.atomic.swap = user_wr->wr.atomic.swap;
next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
+ case IB_WR_SEND:
break;
default:
- break;
+ ret = -EINVAL;
+ goto out_put;
}
}
@@ -2523,6 +2557,7 @@ err:
}
ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2572,6 +2607,7 @@ out:
}
ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2621,6 +2657,7 @@ out:
}
ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2713,6 +2750,7 @@ err:
}
ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len, int out_len)
{
struct ib_uverbs_destroy_ah cmd;
@@ -2749,6 +2787,7 @@ ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2796,6 +2835,7 @@ out_put:
}
ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -2876,6 +2916,7 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
}
int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
@@ -3036,6 +3077,7 @@ err_free_attr:
}
int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
@@ -3078,6 +3120,7 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
}
static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_uverbs_create_xsrq *cmd,
struct ib_udata *udata)
{
@@ -3211,6 +3254,7 @@ err:
}
ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -3238,7 +3282,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
(unsigned long) cmd.response + sizeof resp,
in_len - sizeof cmd, out_len - sizeof resp);
- ret = __uverbs_create_xsrq(file, &xcmd, &udata);
+ ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata);
if (ret)
return ret;
@@ -3246,6 +3290,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len, int out_len)
{
struct ib_uverbs_create_xsrq cmd;
@@ -3263,7 +3308,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
(unsigned long) cmd.response + sizeof resp,
in_len - sizeof cmd, out_len - sizeof resp);
- ret = __uverbs_create_xsrq(file, &cmd, &udata);
+ ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata);
if (ret)
return ret;
@@ -3271,6 +3316,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -3301,6 +3347,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf,
int in_len, int out_len)
{
@@ -3341,6 +3388,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
}
ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len)
{
@@ -3398,16 +3446,15 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
}
int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw)
{
struct ib_uverbs_ex_query_device_resp resp;
struct ib_uverbs_ex_query_device cmd;
struct ib_device_attr attr;
- struct ib_device *device;
int err;
- device = file->device->ib_dev;
if (ucore->inlen < sizeof(cmd))
return -EINVAL;
@@ -3428,11 +3475,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
memset(&attr, 0, sizeof(attr));
- err = device->query_device(device, &attr, uhw);
+ err = ib_dev->query_device(ib_dev, &attr, uhw);
if (err)
return err;
- copy_query_dev_fields(file, &resp.base, &attr);
+ copy_query_dev_fields(file, ib_dev, &resp.base, &attr);
resp.comp_mask = 0;
if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index f6eef2da7097..c29a660c72fe 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -79,6 +79,7 @@ static DEFINE_SPINLOCK(map_lock);
static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
const char __user *buf, int in_len,
int out_len) = {
[IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context,
@@ -119,6 +120,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
};
static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_device *ib_dev,
struct ib_udata *ucore,
struct ib_udata *uhw) = {
[IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
@@ -128,16 +130,21 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
};
static void ib_uverbs_add_one(struct ib_device *device);
-static void ib_uverbs_remove_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data);
-static void ib_uverbs_release_dev(struct kref *ref)
+static void ib_uverbs_release_dev(struct kobject *kobj)
{
struct ib_uverbs_device *dev =
- container_of(ref, struct ib_uverbs_device, ref);
+ container_of(kobj, struct ib_uverbs_device, kobj);
- complete(&dev->comp);
+ cleanup_srcu_struct(&dev->disassociate_srcu);
+ kfree(dev);
}
+static struct kobj_type ib_uverbs_dev_ktype = {
+ .release = ib_uverbs_release_dev,
+};
+
static void ib_uverbs_release_event_file(struct kref *ref)
{
struct ib_uverbs_event_file *file =
@@ -201,9 +208,6 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
{
struct ib_uobject *uobj, *tmp;
- if (!context)
- return 0;
-
context->closing = 1;
list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
@@ -303,13 +307,27 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
return context->device->dealloc_ucontext(context);
}
+static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev)
+{
+ complete(&dev->comp);
+}
+
static void ib_uverbs_release_file(struct kref *ref)
{
struct ib_uverbs_file *file =
container_of(ref, struct ib_uverbs_file, ref);
+ struct ib_device *ib_dev;
+ int srcu_key;
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (ib_dev && !ib_dev->disassociate_ucontext)
+ module_put(ib_dev->owner);
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
- module_put(file->device->ib_dev->owner);
- kref_put(&file->device->ref, ib_uverbs_release_dev);
+ if (atomic_dec_and_test(&file->device->refcount))
+ ib_uverbs_comp_dev(file->device);
kfree(file);
}
@@ -331,9 +349,19 @@ static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
return -EAGAIN;
if (wait_event_interruptible(file->poll_wait,
- !list_empty(&file->event_list)))
+ (!list_empty(&file->event_list) ||
+ /* The barriers built into wait_event_interruptible()
+ * and wake_up() guarentee this will see the null set
+ * without using RCU
+ */
+ !file->uverbs_file->device->ib_dev)))
return -ERESTARTSYS;
+ /* If device was disassociated and no event exists set an error */
+ if (list_empty(&file->event_list) &&
+ !file->uverbs_file->device->ib_dev)
+ return -EIO;
+
spin_lock_irq(&file->lock);
}
@@ -396,8 +424,11 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
{
struct ib_uverbs_event_file *file = filp->private_data;
struct ib_uverbs_event *entry, *tmp;
+ int closed_already = 0;
+ mutex_lock(&file->uverbs_file->device->lists_mutex);
spin_lock_irq(&file->lock);
+ closed_already = file->is_closed;
file->is_closed = 1;
list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
if (entry->counter)
@@ -405,11 +436,15 @@ static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
kfree(entry);
}
spin_unlock_irq(&file->lock);
-
- if (file->is_async) {
- ib_unregister_event_handler(&file->uverbs_file->event_handler);
- kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
+ if (!closed_already) {
+ list_del(&file->list);
+ if (file->is_async)
+ ib_unregister_event_handler(&file->uverbs_file->
+ event_handler);
}
+ mutex_unlock(&file->uverbs_file->device->lists_mutex);
+
+ kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
kref_put(&file->ref, ib_uverbs_release_event_file);
return 0;
@@ -541,13 +576,21 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler,
NULL, NULL);
}
+void ib_uverbs_free_async_event_file(struct ib_uverbs_file *file)
+{
+ kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+ file->async_file = NULL;
+}
+
struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ struct ib_device *ib_dev,
int is_async)
{
struct ib_uverbs_event_file *ev_file;
struct file *filp;
+ int ret;
- ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+ ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL);
if (!ev_file)
return ERR_PTR(-ENOMEM);
@@ -556,16 +599,47 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
INIT_LIST_HEAD(&ev_file->event_list);
init_waitqueue_head(&ev_file->poll_wait);
ev_file->uverbs_file = uverbs_file;
+ kref_get(&ev_file->uverbs_file->ref);
ev_file->async_queue = NULL;
- ev_file->is_async = is_async;
ev_file->is_closed = 0;
filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
ev_file, O_RDONLY);
if (IS_ERR(filp))
- kfree(ev_file);
+ goto err_put_refs;
+
+ mutex_lock(&uverbs_file->device->lists_mutex);
+ list_add_tail(&ev_file->list,
+ &uverbs_file->device->uverbs_events_file_list);
+ mutex_unlock(&uverbs_file->device->lists_mutex);
+
+ if (is_async) {
+ WARN_ON(uverbs_file->async_file);
+ uverbs_file->async_file = ev_file;
+ kref_get(&uverbs_file->async_file->ref);
+ INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler,
+ ib_dev,
+ ib_uverbs_event_handler);
+ ret = ib_register_event_handler(&uverbs_file->event_handler);
+ if (ret)
+ goto err_put_file;
+
+ /* At that point async file stuff was fully set */
+ ev_file->is_async = 1;
+ }
return filp;
+
+err_put_file:
+ fput(filp);
+ kref_put(&uverbs_file->async_file->ref, ib_uverbs_release_event_file);
+ uverbs_file->async_file = NULL;
+ return ERR_PTR(ret);
+
+err_put_refs:
+ kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file);
+ kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+ return filp;
}
/*
@@ -601,8 +675,11 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
size_t count, loff_t *pos)
{
struct ib_uverbs_file *file = filp->private_data;
+ struct ib_device *ib_dev;
struct ib_uverbs_cmd_hdr hdr;
__u32 flags;
+ int srcu_key;
+ ssize_t ret;
if (count < sizeof hdr)
return -EINVAL;
@@ -610,6 +687,14 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
if (copy_from_user(&hdr, buf, sizeof hdr))
return -EFAULT;
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
+ goto out;
+ }
+
flags = (hdr.command &
IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
@@ -617,26 +702,36 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
__u32 command;
if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
- IB_USER_VERBS_CMD_COMMAND_MASK))
- return -EINVAL;
+ IB_USER_VERBS_CMD_COMMAND_MASK)) {
+ ret = -EINVAL;
+ goto out;
+ }
command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
- !uverbs_cmd_table[command])
- return -EINVAL;
+ !uverbs_cmd_table[command]) {
+ ret = -EINVAL;
+ goto out;
+ }
if (!file->ucontext &&
- command != IB_USER_VERBS_CMD_GET_CONTEXT)
- return -EINVAL;
+ command != IB_USER_VERBS_CMD_GET_CONTEXT) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command)))
- return -ENOSYS;
+ if (!(ib_dev->uverbs_cmd_mask & (1ull << command))) {
+ ret = -ENOSYS;
+ goto out;
+ }
- if (hdr.in_words * 4 != count)
- return -EINVAL;
+ if (hdr.in_words * 4 != count) {
+ ret = -EINVAL;
+ goto out;
+ }
- return uverbs_cmd_table[command](file,
+ ret = uverbs_cmd_table[command](file, ib_dev,
buf + sizeof(hdr),
hdr.in_words * 4,
hdr.out_words * 4);
@@ -647,51 +742,72 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
struct ib_uverbs_ex_cmd_hdr ex_hdr;
struct ib_udata ucore;
struct ib_udata uhw;
- int err;
size_t written_count = count;
if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
- IB_USER_VERBS_CMD_COMMAND_MASK))
- return -EINVAL;
+ IB_USER_VERBS_CMD_COMMAND_MASK)) {
+ ret = -EINVAL;
+ goto out;
+ }
command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
- !uverbs_ex_cmd_table[command])
- return -ENOSYS;
+ !uverbs_ex_cmd_table[command]) {
+ ret = -ENOSYS;
+ goto out;
+ }
- if (!file->ucontext)
- return -EINVAL;
+ if (!file->ucontext) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
- return -ENOSYS;
+ if (!(ib_dev->uverbs_ex_cmd_mask & (1ull << command))) {
+ ret = -ENOSYS;
+ goto out;
+ }
- if (count < (sizeof(hdr) + sizeof(ex_hdr)))
- return -EINVAL;
+ if (count < (sizeof(hdr) + sizeof(ex_hdr))) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
- return -EFAULT;
+ if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) {
+ ret = -EFAULT;
+ goto out;
+ }
count -= sizeof(hdr) + sizeof(ex_hdr);
buf += sizeof(hdr) + sizeof(ex_hdr);
- if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
- return -EINVAL;
+ if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) {
+ ret = -EINVAL;
+ goto out;
+ }
- if (ex_hdr.cmd_hdr_reserved)
- return -EINVAL;
+ if (ex_hdr.cmd_hdr_reserved) {
+ ret = -EINVAL;
+ goto out;
+ }
if (ex_hdr.response) {
- if (!hdr.out_words && !ex_hdr.provider_out_words)
- return -EINVAL;
+ if (!hdr.out_words && !ex_hdr.provider_out_words) {
+ ret = -EINVAL;
+ goto out;
+ }
if (!access_ok(VERIFY_WRITE,
(void __user *) (unsigned long) ex_hdr.response,
- (hdr.out_words + ex_hdr.provider_out_words) * 8))
- return -EFAULT;
+ (hdr.out_words + ex_hdr.provider_out_words) * 8)) {
+ ret = -EFAULT;
+ goto out;
+ }
} else {
- if (hdr.out_words || ex_hdr.provider_out_words)
- return -EINVAL;
+ if (hdr.out_words || ex_hdr.provider_out_words) {
+ ret = -EINVAL;
+ goto out;
+ }
}
INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response,
@@ -703,27 +819,43 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
ex_hdr.provider_in_words * 8,
ex_hdr.provider_out_words * 8);
- err = uverbs_ex_cmd_table[command](file,
+ ret = uverbs_ex_cmd_table[command](file,
+ ib_dev,
&ucore,
&uhw);
-
- if (err)
- return err;
-
- return written_count;
+ if (!ret)
+ ret = written_count;
+ } else {
+ ret = -ENOSYS;
}
- return -ENOSYS;
+out:
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return ret;
}
static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct ib_uverbs_file *file = filp->private_data;
+ struct ib_device *ib_dev;
+ int ret = 0;
+ int srcu_key;
+
+ srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
+ ib_dev = srcu_dereference(file->device->ib_dev,
+ &file->device->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
+ goto out;
+ }
if (!file->ucontext)
- return -ENODEV;
+ ret = -ENODEV;
else
- return file->device->ib_dev->mmap(file->ucontext, vma);
+ ret = ib_dev->mmap(file->ucontext, vma);
+out:
+ srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
+ return ret;
}
/*
@@ -740,23 +872,43 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
{
struct ib_uverbs_device *dev;
struct ib_uverbs_file *file;
+ struct ib_device *ib_dev;
int ret;
+ int module_dependent;
+ int srcu_key;
dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
- if (dev)
- kref_get(&dev->ref);
- else
+ if (!atomic_inc_not_zero(&dev->refcount))
return -ENXIO;
- if (!try_module_get(dev->ib_dev->owner)) {
- ret = -ENODEV;
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ mutex_lock(&dev->lists_mutex);
+ ib_dev = srcu_dereference(dev->ib_dev,
+ &dev->disassociate_srcu);
+ if (!ib_dev) {
+ ret = -EIO;
goto err;
}
- file = kmalloc(sizeof *file, GFP_KERNEL);
+ /* In case IB device supports disassociate ucontext, there is no hard
+ * dependency between uverbs device and its low level device.
+ */
+ module_dependent = !(ib_dev->disassociate_ucontext);
+
+ if (module_dependent) {
+ if (!try_module_get(ib_dev->owner)) {
+ ret = -ENODEV;
+ goto err;
+ }
+ }
+
+ file = kzalloc(sizeof(*file), GFP_KERNEL);
if (!file) {
ret = -ENOMEM;
- goto err_module;
+ if (module_dependent)
+ goto err_module;
+
+ goto err;
}
file->device = dev;
@@ -766,27 +918,47 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
mutex_init(&file->mutex);
filp->private_data = file;
+ kobject_get(&dev->kobj);
+ list_add_tail(&file->list, &dev->uverbs_file_list);
+ mutex_unlock(&dev->lists_mutex);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
return nonseekable_open(inode, filp);
err_module:
- module_put(dev->ib_dev->owner);
+ module_put(ib_dev->owner);
err:
- kref_put(&dev->ref, ib_uverbs_release_dev);
+ mutex_unlock(&dev->lists_mutex);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+ if (atomic_dec_and_test(&dev->refcount))
+ ib_uverbs_comp_dev(dev);
+
return ret;
}
static int ib_uverbs_close(struct inode *inode, struct file *filp)
{
struct ib_uverbs_file *file = filp->private_data;
-
- ib_uverbs_cleanup_ucontext(file, file->ucontext);
+ struct ib_uverbs_device *dev = file->device;
+ struct ib_ucontext *ucontext = NULL;
+
+ mutex_lock(&file->device->lists_mutex);
+ ucontext = file->ucontext;
+ file->ucontext = NULL;
+ if (!file->is_closed) {
+ list_del(&file->list);
+ file->is_closed = 1;
+ }
+ mutex_unlock(&file->device->lists_mutex);
+ if (ucontext)
+ ib_uverbs_cleanup_ucontext(file, ucontext);
if (file->async_file)
kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
kref_put(&file->ref, ib_uverbs_release_file);
+ kobject_put(&dev->kobj);
return 0;
}
@@ -817,12 +989,21 @@ static struct ib_client uverbs_client = {
static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
char *buf)
{
+ int ret = -ENODEV;
+ int srcu_key;
struct ib_uverbs_device *dev = dev_get_drvdata(device);
+ struct ib_device *ib_dev;
if (!dev)
return -ENODEV;
- return sprintf(buf, "%s\n", dev->ib_dev->name);
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+ if (ib_dev)
+ ret = sprintf(buf, "%s\n", ib_dev->name);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
+
+ return ret;
}
static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
@@ -830,11 +1011,19 @@ static ssize_t show_dev_abi_version(struct device *device,
struct device_attribute *attr, char *buf)
{
struct ib_uverbs_device *dev = dev_get_drvdata(device);
+ int ret = -ENODEV;
+ int srcu_key;
+ struct ib_device *ib_dev;
if (!dev)
return -ENODEV;
+ srcu_key = srcu_read_lock(&dev->disassociate_srcu);
+ ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu);
+ if (ib_dev)
+ ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver);
+ srcu_read_unlock(&dev->disassociate_srcu, srcu_key);
- return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver);
+ return ret;
}
static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
@@ -874,6 +1063,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
int devnum;
dev_t base;
struct ib_uverbs_device *uverbs_dev;
+ int ret;
if (!device->alloc_ucontext)
return;
@@ -882,10 +1072,20 @@ static void ib_uverbs_add_one(struct ib_device *device)
if (!uverbs_dev)
return;
- kref_init(&uverbs_dev->ref);
+ ret = init_srcu_struct(&uverbs_dev->disassociate_srcu);
+ if (ret) {
+ kfree(uverbs_dev);
+ return;
+ }
+
+ atomic_set(&uverbs_dev->refcount, 1);
init_completion(&uverbs_dev->comp);
uverbs_dev->xrcd_tree = RB_ROOT;
mutex_init(&uverbs_dev->xrcd_tree_mutex);
+ kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype);
+ mutex_init(&uverbs_dev->lists_mutex);
+ INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
+ INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
spin_lock(&map_lock);
devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
@@ -906,12 +1106,13 @@ static void ib_uverbs_add_one(struct ib_device *device)
}
spin_unlock(&map_lock);
- uverbs_dev->ib_dev = device;
+ rcu_assign_pointer(uverbs_dev->ib_dev, device);
uverbs_dev->num_comp_vectors = device->num_comp_vectors;
cdev_init(&uverbs_dev->cdev, NULL);
uverbs_dev->cdev.owner = THIS_MODULE;
uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+ uverbs_dev->cdev.kobj.parent = &uverbs_dev->kobj;
kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
if (cdev_add(&uverbs_dev->cdev, base, 1))
goto err_cdev;
@@ -942,15 +1143,79 @@ err_cdev:
clear_bit(devnum, overflow_map);
err:
- kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+ if (atomic_dec_and_test(&uverbs_dev->refcount))
+ ib_uverbs_comp_dev(uverbs_dev);
wait_for_completion(&uverbs_dev->comp);
- kfree(uverbs_dev);
+ kobject_put(&uverbs_dev->kobj);
return;
}
-static void ib_uverbs_remove_one(struct ib_device *device)
+static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev,
+ struct ib_device *ib_dev)
{
- struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+ struct ib_uverbs_file *file;
+ struct ib_uverbs_event_file *event_file;
+ struct ib_event event;
+
+ /* Pending running commands to terminate */
+ synchronize_srcu(&uverbs_dev->disassociate_srcu);
+ event.event = IB_EVENT_DEVICE_FATAL;
+ event.element.port_num = 0;
+ event.device = ib_dev;
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ while (!list_empty(&uverbs_dev->uverbs_file_list)) {
+ struct ib_ucontext *ucontext;
+
+ file = list_first_entry(&uverbs_dev->uverbs_file_list,
+ struct ib_uverbs_file, list);
+ file->is_closed = 1;
+ ucontext = file->ucontext;
+ list_del(&file->list);
+ file->ucontext = NULL;
+ kref_get(&file->ref);
+ mutex_unlock(&uverbs_dev->lists_mutex);
+ /* We must release the mutex before going ahead and calling
+ * disassociate_ucontext. disassociate_ucontext might end up
+ * indirectly calling uverbs_close, for example due to freeing
+ * the resources (e.g mmput).
+ */
+ ib_uverbs_event_handler(&file->event_handler, &event);
+ if (ucontext) {
+ ib_dev->disassociate_ucontext(ucontext);
+ ib_uverbs_cleanup_ucontext(file, ucontext);
+ }
+
+ mutex_lock(&uverbs_dev->lists_mutex);
+ kref_put(&file->ref, ib_uverbs_release_file);
+ }
+
+ while (!list_empty(&uverbs_dev->uverbs_events_file_list)) {
+ event_file = list_first_entry(&uverbs_dev->
+ uverbs_events_file_list,
+ struct ib_uverbs_event_file,
+ list);
+ spin_lock_irq(&event_file->lock);
+ event_file->is_closed = 1;
+ spin_unlock_irq(&event_file->lock);
+
+ list_del(&event_file->list);
+ if (event_file->is_async) {
+ ib_unregister_event_handler(&event_file->uverbs_file->
+ event_handler);
+ event_file->uverbs_file->event_handler.device = NULL;
+ }
+
+ wake_up_interruptible(&event_file->poll_wait);
+ kill_fasync(&event_file->async_queue, SIGIO, POLL_IN);
+ }
+ mutex_unlock(&uverbs_dev->lists_mutex);
+}
+
+static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
+{
+ struct ib_uverbs_device *uverbs_dev = client_data;
+ int wait_clients = 1;
if (!uverbs_dev)
return;
@@ -964,9 +1229,28 @@ static void ib_uverbs_remove_one(struct ib_device *device)
else
clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
- kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
- wait_for_completion(&uverbs_dev->comp);
- kfree(uverbs_dev);
+ if (device->disassociate_ucontext) {
+ /* We disassociate HW resources and immediately return.
+ * Userspace will see a EIO errno for all future access.
+ * Upon returning, ib_device may be freed internally and is not
+ * valid any more.
+ * uverbs_device is still available until all clients close
+ * their files, then the uverbs device ref count will be zero
+ * and its resources will be freed.
+ * Note: At this point no more files can be opened since the
+ * cdev was deleted, however active clients can still issue
+ * commands and close their open files.
+ */
+ rcu_assign_pointer(uverbs_dev->ib_dev, NULL);
+ ib_uverbs_free_hw_resources(uverbs_dev, device);
+ wait_clients = 0;
+ }
+
+ if (atomic_dec_and_test(&uverbs_dev->refcount))
+ ib_uverbs_comp_dev(uverbs_dev);
+ if (wait_clients)
+ wait_for_completion(&uverbs_dev->comp);
+ kobject_put(&uverbs_dev->kobj);
}
static char *uverbs_devnode(struct device *dev, umode_t *mode)
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 30eb2457000c..e1f2c9887f3f 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -213,28 +213,79 @@ EXPORT_SYMBOL(rdma_port_get_link_layer);
/* Protection domains */
+/**
+ * ib_alloc_pd - Allocates an unused protection domain.
+ * @device: The device on which to allocate the protection domain.
+ *
+ * A protection domain object provides an association between QPs, shared
+ * receive queues, address handles, memory regions, and memory windows.
+ *
+ * Every PD has a local_dma_lkey which can be used as the lkey value for local
+ * memory operations.
+ */
struct ib_pd *ib_alloc_pd(struct ib_device *device)
{
struct ib_pd *pd;
+ struct ib_device_attr devattr;
+ int rc;
+
+ rc = ib_query_device(device, &devattr);
+ if (rc)
+ return ERR_PTR(rc);
pd = device->alloc_pd(device, NULL, NULL);
+ if (IS_ERR(pd))
+ return pd;
+
+ pd->device = device;
+ pd->uobject = NULL;
+ pd->local_mr = NULL;
+ atomic_set(&pd->usecnt, 0);
+
+ if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+ pd->local_dma_lkey = device->local_dma_lkey;
+ else {
+ struct ib_mr *mr;
+
+ mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(mr)) {
+ ib_dealloc_pd(pd);
+ return (struct ib_pd *)mr;
+ }
- if (!IS_ERR(pd)) {
- pd->device = device;
- pd->uobject = NULL;
- atomic_set(&pd->usecnt, 0);
+ pd->local_mr = mr;
+ pd->local_dma_lkey = pd->local_mr->lkey;
}
-
return pd;
}
EXPORT_SYMBOL(ib_alloc_pd);
-int ib_dealloc_pd(struct ib_pd *pd)
+/**
+ * ib_dealloc_pd - Deallocates a protection domain.
+ * @pd: The protection domain to deallocate.
+ *
+ * It is an error to call this function while any resources in the pd still
+ * exist. The caller is responsible to synchronously destroy them and
+ * guarantee no new allocations will happen.
+ */
+void ib_dealloc_pd(struct ib_pd *pd)
{
- if (atomic_read(&pd->usecnt))
- return -EBUSY;
+ int ret;
+
+ if (pd->local_mr) {
+ ret = ib_dereg_mr(pd->local_mr);
+ WARN_ON(ret);
+ pd->local_mr = NULL;
+ }
- return pd->device->dealloc_pd(pd);
+ /* uverbs manipulates usecnt with proper locking, while the kabi
+ requires the caller to guarantee we can't race here. */
+ WARN_ON(atomic_read(&pd->usecnt));
+
+ /* Making delalloc_pd a void return is a WIP, no driver should return
+ an error here. */
+ ret = pd->device->dealloc_pd(pd);
+ WARN_ONCE(ret, "Infiniband HW driver failed dealloc_pd");
}
EXPORT_SYMBOL(ib_dealloc_pd);
@@ -1168,54 +1219,28 @@ int ib_dereg_mr(struct ib_mr *mr)
}
EXPORT_SYMBOL(ib_dereg_mr);
-struct ib_mr *ib_create_mr(struct ib_pd *pd,
- struct ib_mr_init_attr *mr_init_attr)
-{
- struct ib_mr *mr;
-
- if (!pd->device->create_mr)
- return ERR_PTR(-ENOSYS);
-
- mr = pd->device->create_mr(pd, mr_init_attr);
-
- if (!IS_ERR(mr)) {
- mr->device = pd->device;
- mr->pd = pd;
- mr->uobject = NULL;
- atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
- }
-
- return mr;
-}
-EXPORT_SYMBOL(ib_create_mr);
-
-int ib_destroy_mr(struct ib_mr *mr)
-{
- struct ib_pd *pd;
- int ret;
-
- if (atomic_read(&mr->usecnt))
- return -EBUSY;
-
- pd = mr->pd;
- ret = mr->device->destroy_mr(mr);
- if (!ret)
- atomic_dec(&pd->usecnt);
-
- return ret;
-}
-EXPORT_SYMBOL(ib_destroy_mr);
-
-struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+/**
+ * ib_alloc_mr() - Allocates a memory region
+ * @pd: protection domain associated with the region
+ * @mr_type: memory region type
+ * @max_num_sg: maximum sg entries available for registration.
+ *
+ * Notes:
+ * Memory registeration page/sg lists must not exceed max_num_sg.
+ * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed
+ * max_num_sg * used_page_size.
+ *
+ */
+struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
+ enum ib_mr_type mr_type,
+ u32 max_num_sg)
{
struct ib_mr *mr;
- if (!pd->device->alloc_fast_reg_mr)
+ if (!pd->device->alloc_mr)
return ERR_PTR(-ENOSYS);
- mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
-
+ mr = pd->device->alloc_mr(pd, mr_type, max_num_sg);
if (!IS_ERR(mr)) {
mr->device = pd->device;
mr->pd = pd;
@@ -1226,7 +1251,7 @@ struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
return mr;
}
-EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+EXPORT_SYMBOL(ib_alloc_mr);
struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
int max_page_list_len)