author      Linus Torvalds <torvalds@linux-foundation.org>    2018-08-17 12:44:48 -0700
committer   Linus Torvalds <torvalds@linux-foundation.org>    2018-08-17 12:44:48 -0700
commit      9bd553929f68921be0f2014dd06561e0c8249a0d (patch)
tree        720e556374e3500af9a0210178fabfc6bd0f754c /drivers/infiniband
parent      022ff62c3d8c3758d15ccc6b58615fd8f257ba85 (diff)
parent      0a3173a5f09bc58a3638ecfd0a80bdbae55e123c (diff)
download    linux-9bd553929f68921be0f2014dd06561e0c8249a0d.tar.bz2
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"This has been a large cycle for RDMA, with several major patch series
reworking parts of the core code.
- Rework the so-called 'gid cache' and internal APIs to use a kref'd
pointer to a struct instead of copying, push this upwards into the
callers and add more stuff to the struct. The new design avoids
some ugly races the old one suffered with. This is part of the
namespace enablement work as the new struct is learning to be
namespace aware.
- Various uapi cleanups, moving more stuff to include/uapi and fixing
some long standing bugs that have recently been discovered.
- Driver updates for mlx5, mlx4, i40iw, rxe, cxgb4, hfi1, usnic,
pvrdma, and hns
- Provide max_send_sge and max_recv_sge attributes to better support
HW where these values are asymmetric.
- mlx5 user API 'devx' allows sending commands directly to the device
FW, instead of trying to cram every wild and niche feature into the
common API. Sort of like what GPU does.
- Major write() and ioctl() API rework to cleanly support PCI device
hot unplug and advance the ioctl conversion work
- Sparse and compile warning cleanups
- Add 'const' to the ib_poll_cq() signature, and permit a NULL
'bad_wr', which is the common use case
- Various patches to avoid high order allocations across the stack
- SRQ support for cxgb4, hns and qedr
- Changes to IPoIB to better follow the netdev model for working with
struct net_device lifetime"
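
As a reading aid for the GID cache rework summarized in the first bullet, here is a minimal consumer-side sketch (not part of the series) of the reference-counted GID attribute API whose signatures appear in the drivers/infiniband/core/cache.c hunks below. The helper name my_ulp_show_gid() is invented for illustration; rdma_get_gid_attr() and rdma_put_gid_attr() are the functions added in this cycle.

/*
 * Hypothetical caller, for illustration only: look up a GID table entry
 * under the new kref'd model and release the reference when done.
 */
#include <linux/err.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_cache.h>

static int my_ulp_show_gid(struct ib_device *device, u8 port_num, int index)
{
	const struct ib_gid_attr *attr;

	/* Takes a reference on the table entry instead of copying it. */
	attr = rdma_get_gid_attr(device, port_num, index);
	if (IS_ERR(attr))
		return PTR_ERR(attr);

	pr_info("port %u index %u gid %pI6 type %d\n",
		attr->port_num, attr->index, attr->gid.raw, attr->gid_type);

	/* Every successful rdma_get_gid_attr() must be paired with a put. */
	rdma_put_gid_attr(attr);
	return 0;
}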
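Likewise, for the asymmetric max_send_sge/max_recv_sge bullet, a hedged sketch of how a ULP could size its QP capabilities per direction once the device attribute is split; the function name my_ulp_create_qp() and the literal WR/SGE counts are made up, and the attrs field names assume the additions from this cycle.

/*
 * Hypothetical QP creation, for illustration only: clamp send and receive
 * SGE counts independently against the per-direction device limits.
 */
#include <linux/kernel.h>
#include <rdma/ib_verbs.h>

static struct ib_qp *my_ulp_create_qp(struct ib_pd *pd, struct ib_cq *cq)
{
	struct ib_device *dev = pd->device;
	struct ib_qp_init_attr init_attr = {
		.qp_type     = IB_QPT_RC,
		.send_cq     = cq,
		.recv_cq     = cq,
		.cap = {
			.max_send_wr  = 128,
			.max_recv_wr  = 128,
			/* HW may support fewer SGEs on the receive side. */
			.max_send_sge = min_t(u32, 4, dev->attrs.max_send_sge),
			.max_recv_sge = min_t(u32, 4, dev->attrs.max_recv_sge),
		},
	};

	return ib_create_qp(pd, &init_attr);
}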
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (312 commits)
Revert "net/smc: Replace ib_query_gid with rdma_get_gid_attr"
RDMA/hns: Fix usage of bitmap allocation functions return values
IB/core: Change filter function return type from int to bool
IB/core: Update GID entries for netdevice whose mac address changes
IB/core: Add default GIDs of the bond master netdev
IB/core: Consider adding default GIDs of bond device
IB/core: Delete lower netdevice default GID entries in bonding scenario
IB/core: Avoid confusing del_netdev_default_ips
IB/core: Add comment for change upper netevent handling
qedr: Add user space support for SRQ
qedr: Add support for kernel mode SRQ's
qedr: Add wrapping generic structure for qpidr and adjust idr routines.
IB/mlx5: Fix leaking stack memory to userspace
Update the e-mail address of Bart Van Assche
IB/ucm: Fix compiling ucm.c
IB/uverbs: Do not check for device disassociation during ioctl
IB/uverbs: Remove struct uverbs_root_spec and all supporting code
IB/uverbs: Use uverbs_api to unmarshal ioctl commands
IB/uverbs: Use uverbs_alloc for allocations
IB/uverbs: Add a simple allocator to uverbs_attr_bundle
...
Diffstat (limited to 'drivers/infiniband')
175 files changed, 10923 insertions, 6256 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index b03af54367c0..d160d2d1f3a3 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -37,7 +37,7 @@ config INFINIBAND_USER_ACCESS config INFINIBAND_USER_ACCESS_UCM bool "Userspace CM (UCM, DEPRECATED)" - depends on BROKEN + depends on BROKEN || COMPILE_TEST depends on INFINIBAND_USER_ACCESS help The UCM module has known security flaws, which no one is diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 61667705d746..867cee5e27b2 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -35,6 +35,7 @@ ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ - uverbs_ioctl_merge.o uverbs_std_types_cq.o \ + uverbs_std_types_cq.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ - uverbs_std_types_mr.o uverbs_std_types_counters.o + uverbs_std_types_mr.o uverbs_std_types_counters.o \ + uverbs_uapi.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 4f32c4062fb6..46b855a42884 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -188,7 +188,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, return -ENODATA; } -int rdma_addr_size(struct sockaddr *addr) +int rdma_addr_size(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: @@ -315,19 +315,17 @@ static int dst_fetch_ha(const struct dst_entry *dst, int ret = 0; n = dst_neigh_lookup(dst, daddr); + if (!n) + return -ENODATA; - rcu_read_lock(); - if (!n || !(n->nud_state & NUD_VALID)) { - if (n) - neigh_event_send(n, NULL); + if (!(n->nud_state & NUD_VALID)) { + neigh_event_send(n, NULL); ret = -ENODATA; } else { rdma_copy_addr(dev_addr, dst->dev, n->ha); } - rcu_read_unlock(); - if (n) - neigh_release(n); + neigh_release(n); return ret; } @@ -587,7 +585,7 @@ static void process_one_req(struct work_struct *_work) spin_unlock_bh(&lock); } -int rdma_resolve_ip(struct sockaddr *src_addr, struct sockaddr *dst_addr, +int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 81d66f56e38f..0bee1f4b914e 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -66,20 +66,28 @@ enum gid_attr_find_mask { GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, }; -enum gid_table_entry_props { - GID_TABLE_ENTRY_INVALID = 1UL << 0, - GID_TABLE_ENTRY_DEFAULT = 1UL << 1, +enum gid_table_entry_state { + GID_TABLE_ENTRY_INVALID = 1, + GID_TABLE_ENTRY_VALID = 2, + /* + * Indicates that entry is pending to be removed, there may + * be active users of this GID entry. + * When last user of the GID entry releases reference to it, + * GID entry is detached from the table. + */ + GID_TABLE_ENTRY_PENDING_DEL = 3, }; struct ib_gid_table_entry { - unsigned long props; - union ib_gid gid; - struct ib_gid_attr attr; - void *context; + struct kref kref; + struct work_struct del_work; + struct ib_gid_attr attr; + void *context; + enum gid_table_entry_state state; }; struct ib_gid_table { - int sz; + int sz; /* In RoCE, adding a GID to the table requires: * (a) Find if this GID is already exists. * (b) Find a free space. 
@@ -91,13 +99,16 @@ struct ib_gid_table { * **/ /* Any writer to data_vec must hold this lock and the write side of - * rwlock. readers must hold only rwlock. All writers must be in a + * rwlock. Readers must hold only rwlock. All writers must be in a * sleepable context. */ - struct mutex lock; - /* rwlock protects data_vec[ix]->props. */ - rwlock_t rwlock; - struct ib_gid_table_entry *data_vec; + struct mutex lock; + /* rwlock protects data_vec[ix]->state and entry pointer. + */ + rwlock_t rwlock; + struct ib_gid_table_entry **data_vec; + /* bit field, each bit indicates the index of default GID */ + u32 default_gid_indices; }; static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) @@ -135,6 +146,19 @@ bool rdma_is_zero_gid(const union ib_gid *gid) } EXPORT_SYMBOL(rdma_is_zero_gid); +/** is_gid_index_default - Check if a given index belongs to + * reserved default GIDs or not. + * @table: GID table pointer + * @index: Index to check in GID table + * Returns true if index is one of the reserved default GID index otherwise + * returns false. + */ +static bool is_gid_index_default(const struct ib_gid_table *table, + unsigned int index) +{ + return index < 32 && (BIT(index) & table->default_gid_indices); +} + int ib_cache_gid_parse_type_str(const char *buf) { unsigned int i; @@ -164,26 +188,136 @@ static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port) return device->cache.ports[port - rdma_start_port(device)].gid; } -static void del_roce_gid(struct ib_device *device, u8 port_num, - struct ib_gid_table *table, int ix) +static bool is_gid_entry_free(const struct ib_gid_table_entry *entry) +{ + return !entry; +} + +static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry) +{ + return entry && entry->state == GID_TABLE_ENTRY_VALID; +} + +static void schedule_free_gid(struct kref *kref) { + struct ib_gid_table_entry *entry = + container_of(kref, struct ib_gid_table_entry, kref); + + queue_work(ib_wq, &entry->del_work); +} + +static void free_gid_entry_locked(struct ib_gid_table_entry *entry) +{ + struct ib_device *device = entry->attr.device; + u8 port_num = entry->attr.port_num; + struct ib_gid_table *table = rdma_gid_table(device, port_num); + pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - device->name, port_num, ix, - table->data_vec[ix].gid.raw); + device->name, port_num, entry->attr.index, + entry->attr.gid.raw); + + if (rdma_cap_roce_gid_table(device, port_num) && + entry->state != GID_TABLE_ENTRY_INVALID) + device->del_gid(&entry->attr, &entry->context); + + write_lock_irq(&table->rwlock); - if (rdma_cap_roce_gid_table(device, port_num)) - device->del_gid(&table->data_vec[ix].attr, - &table->data_vec[ix].context); - dev_put(table->data_vec[ix].attr.ndev); + /* + * The only way to avoid overwriting NULL in table is + * by comparing if it is same entry in table or not! + * If new entry in table is added by the time we free here, + * don't overwrite the table entry. 
+ */ + if (entry == table->data_vec[entry->attr.index]) + table->data_vec[entry->attr.index] = NULL; + /* Now this index is ready to be allocated */ + write_unlock_irq(&table->rwlock); + + if (entry->attr.ndev) + dev_put(entry->attr.ndev); + kfree(entry); } -static int add_roce_gid(struct ib_gid_table *table, - const union ib_gid *gid, - const struct ib_gid_attr *attr) +static void free_gid_entry(struct kref *kref) +{ + struct ib_gid_table_entry *entry = + container_of(kref, struct ib_gid_table_entry, kref); + + free_gid_entry_locked(entry); +} + +/** + * free_gid_work - Release reference to the GID entry + * @work: Work structure to refer to GID entry which needs to be + * deleted. + * + * free_gid_work() frees the entry from the HCA's hardware table + * if provider supports it. It releases reference to netdevice. + */ +static void free_gid_work(struct work_struct *work) +{ + struct ib_gid_table_entry *entry = + container_of(work, struct ib_gid_table_entry, del_work); + struct ib_device *device = entry->attr.device; + u8 port_num = entry->attr.port_num; + struct ib_gid_table *table = rdma_gid_table(device, port_num); + + mutex_lock(&table->lock); + free_gid_entry_locked(entry); + mutex_unlock(&table->lock); +} + +static struct ib_gid_table_entry * +alloc_gid_entry(const struct ib_gid_attr *attr) { struct ib_gid_table_entry *entry; - int ix = attr->index; - int ret = 0; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + kref_init(&entry->kref); + memcpy(&entry->attr, attr, sizeof(*attr)); + if (entry->attr.ndev) + dev_hold(entry->attr.ndev); + INIT_WORK(&entry->del_work, free_gid_work); + entry->state = GID_TABLE_ENTRY_INVALID; + return entry; +} + +static void store_gid_entry(struct ib_gid_table *table, + struct ib_gid_table_entry *entry) +{ + entry->state = GID_TABLE_ENTRY_VALID; + + pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, + entry->attr.device->name, entry->attr.port_num, + entry->attr.index, entry->attr.gid.raw); + + lockdep_assert_held(&table->lock); + write_lock_irq(&table->rwlock); + table->data_vec[entry->attr.index] = entry; + write_unlock_irq(&table->rwlock); +} + +static void get_gid_entry(struct ib_gid_table_entry *entry) +{ + kref_get(&entry->kref); +} + +static void put_gid_entry(struct ib_gid_table_entry *entry) +{ + kref_put(&entry->kref, schedule_free_gid); +} + +static void put_gid_entry_locked(struct ib_gid_table_entry *entry) +{ + kref_put(&entry->kref, free_gid_entry); +} + +static int add_roce_gid(struct ib_gid_table_entry *entry) +{ + const struct ib_gid_attr *attr = &entry->attr; + int ret; if (!attr->ndev) { pr_err("%s NULL netdev device=%s port=%d index=%d\n", @@ -191,38 +325,22 @@ static int add_roce_gid(struct ib_gid_table *table, attr->index); return -EINVAL; } - - entry = &table->data_vec[ix]; - if ((entry->props & GID_TABLE_ENTRY_INVALID) == 0) { - WARN(1, "GID table corruption device=%s port=%d index=%d\n", - attr->device->name, attr->port_num, - attr->index); - return -EINVAL; - } - if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { - ret = attr->device->add_gid(gid, attr, &entry->context); + ret = attr->device->add_gid(attr, &entry->context); if (ret) { pr_err("%s GID add failed device=%s port=%d index=%d\n", __func__, attr->device->name, attr->port_num, attr->index); - goto add_err; + return ret; } } - dev_hold(attr->ndev); - -add_err: - if (!ret) - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - attr->device->name, attr->port_num, ix, gid->raw); - return ret; + return 0; } /** 
* add_modify_gid - Add or modify GID table entry * * @table: GID table in which GID to be added or modified - * @gid: GID content * @attr: Attributes of the GID * * Returns 0 on success or appropriate error code. It accepts zero @@ -230,34 +348,42 @@ add_err: * GID. However such zero GIDs are not added to the cache. */ static int add_modify_gid(struct ib_gid_table *table, - const union ib_gid *gid, const struct ib_gid_attr *attr) { - int ret; + struct ib_gid_table_entry *entry; + int ret = 0; + + /* + * Invalidate any old entry in the table to make it safe to write to + * this index. + */ + if (is_gid_entry_valid(table->data_vec[attr->index])) + put_gid_entry(table->data_vec[attr->index]); + + /* + * Some HCA's report multiple GID entries with only one valid GID, and + * leave other unused entries as the zero GID. Convert zero GIDs to + * empty table entries instead of storing them. + */ + if (rdma_is_zero_gid(&attr->gid)) + return 0; + + entry = alloc_gid_entry(attr); + if (!entry) + return -ENOMEM; if (rdma_protocol_roce(attr->device, attr->port_num)) { - ret = add_roce_gid(table, gid, attr); + ret = add_roce_gid(entry); if (ret) - return ret; - } else { - /* - * Some HCA's report multiple GID entries with only one - * valid GID, but remaining as zero GID. - * So ignore such behavior for IB link layer and don't - * fail the call, but don't add such entry to GID cache. - */ - if (rdma_is_zero_gid(gid)) - return 0; + goto done; } - lockdep_assert_held(&table->lock); - memcpy(&table->data_vec[attr->index].gid, gid, sizeof(*gid)); - memcpy(&table->data_vec[attr->index].attr, attr, sizeof(*attr)); - - write_lock_irq(&table->rwlock); - table->data_vec[attr->index].props &= ~GID_TABLE_ENTRY_INVALID; - write_unlock_irq(&table->rwlock); + store_gid_entry(table, entry); return 0; + +done: + put_gid_entry(entry); + return ret; } /** @@ -272,16 +398,25 @@ static int add_modify_gid(struct ib_gid_table *table, static void del_gid(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table, int ix) { + struct ib_gid_table_entry *entry; + lockdep_assert_held(&table->lock); + + pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, + ib_dev->name, port, ix, + table->data_vec[ix]->attr.gid.raw); + write_lock_irq(&table->rwlock); - table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; + entry = table->data_vec[ix]; + entry->state = GID_TABLE_ENTRY_PENDING_DEL; + /* + * For non RoCE protocol, GID entry slot is ready to use. + */ + if (!rdma_protocol_roce(ib_dev, port)) + table->data_vec[ix] = NULL; write_unlock_irq(&table->rwlock); - if (rdma_protocol_roce(ib_dev, port)) - del_roce_gid(ib_dev, port, table, ix); - memset(&table->data_vec[ix].gid, 0, sizeof(table->data_vec[ix].gid)); - memset(&table->data_vec[ix].attr, 0, sizeof(table->data_vec[ix].attr)); - table->data_vec[ix].context = NULL; + put_gid_entry_locked(entry); } /* rwlock should be read locked, or lock should be held */ @@ -294,8 +429,8 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, int empty = pempty ? -1 : 0; while (i < table->sz && (found < 0 || empty < 0)) { - struct ib_gid_table_entry *data = &table->data_vec[i]; - struct ib_gid_attr *attr = &data->attr; + struct ib_gid_table_entry *data = table->data_vec[i]; + struct ib_gid_attr *attr; int curr_index = i; i++; @@ -306,9 +441,9 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, * so lookup free slot only if requested. 
*/ if (pempty && empty < 0) { - if (data->props & GID_TABLE_ENTRY_INVALID && - (default_gid == - !!(data->props & GID_TABLE_ENTRY_DEFAULT))) { + if (is_gid_entry_free(data) && + default_gid == + is_gid_index_default(table, curr_index)) { /* * Found an invalid (free) entry; allocate it. * If default GID is requested, then our @@ -323,22 +458,23 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, /* * Additionally find_gid() is used to find valid entry during - * lookup operation, where validity needs to be checked. So - * find the empty entry first to continue to search for a free - * slot and ignore its INVALID flag. + * lookup operation; so ignore the entries which are marked as + * pending for removal and the entries which are marked as + * invalid. */ - if (data->props & GID_TABLE_ENTRY_INVALID) + if (!is_gid_entry_valid(data)) continue; if (found >= 0) continue; + attr = &data->attr; if (mask & GID_ATTR_FIND_MASK_GID_TYPE && attr->gid_type != val->gid_type) continue; if (mask & GID_ATTR_FIND_MASK_GID && - memcmp(gid, &data->gid, sizeof(*gid))) + memcmp(gid, &data->attr.gid, sizeof(*gid))) continue; if (mask & GID_ATTR_FIND_MASK_NETDEV && @@ -346,8 +482,7 @@ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, continue; if (mask & GID_ATTR_FIND_MASK_DEFAULT && - !!(data->props & GID_TABLE_ENTRY_DEFAULT) != - default_gid) + is_gid_index_default(table, curr_index) != default_gid) continue; found = curr_index; @@ -396,7 +531,8 @@ static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port, attr->device = ib_dev; attr->index = empty; attr->port_num = port; - ret = add_modify_gid(table, gid, attr); + attr->gid = *gid; + ret = add_modify_gid(table, attr); if (!ret) dispatch_gid_change_event(ib_dev, port); @@ -492,7 +628,8 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, mutex_lock(&table->lock); for (ix = 0; ix < table->sz; ix++) { - if (table->data_vec[ix].attr.ndev == ndev) { + if (is_gid_entry_valid(table->data_vec[ix]) && + table->data_vec[ix]->attr.ndev == ndev) { del_gid(ib_dev, port, table, ix); deleted = true; } @@ -506,103 +643,37 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, return 0; } -static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, - union ib_gid *gid, struct ib_gid_attr *attr) -{ - struct ib_gid_table *table; - - table = rdma_gid_table(ib_dev, port); - - if (index < 0 || index >= table->sz) - return -EINVAL; - - if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) - return -EINVAL; - - memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); - if (attr) { - memcpy(attr, &table->data_vec[index].attr, sizeof(*attr)); - if (attr->ndev) - dev_hold(attr->ndev); - } - - return 0; -} - -static int _ib_cache_gid_table_find(struct ib_device *ib_dev, - const union ib_gid *gid, - const struct ib_gid_attr *val, - unsigned long mask, - u8 *port, u16 *index) -{ - struct ib_gid_table *table; - u8 p; - int local_index; - unsigned long flags; - - for (p = 0; p < ib_dev->phys_port_cnt; p++) { - table = ib_dev->cache.ports[p].gid; - read_lock_irqsave(&table->rwlock, flags); - local_index = find_gid(table, gid, val, false, mask, NULL); - if (local_index >= 0) { - if (index) - *index = local_index; - if (port) - *port = p + rdma_start_port(ib_dev); - read_unlock_irqrestore(&table->rwlock, flags); - return 0; - } - read_unlock_irqrestore(&table->rwlock, flags); - } - - return -ENOENT; -} - -static int ib_cache_gid_find(struct ib_device *ib_dev, - const union ib_gid 
*gid, - enum ib_gid_type gid_type, - struct net_device *ndev, u8 *port, - u16 *index) -{ - unsigned long mask = GID_ATTR_FIND_MASK_GID | - GID_ATTR_FIND_MASK_GID_TYPE; - struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; - - if (ndev) - mask |= GID_ATTR_FIND_MASK_NETDEV; - - return _ib_cache_gid_table_find(ib_dev, gid, &gid_attr_val, - mask, port, index); -} - /** - * ib_find_cached_gid_by_port - Returns the GID table index where a specified - * GID value occurs. It searches for the specified GID value in the local - * software cache. + * rdma_find_gid_by_port - Returns the GID entry attributes when it finds + * a valid GID entry for given search parameters. It searches for the specified + * GID value in the local software cache. * @device: The device to query. * @gid: The GID value to search for. * @gid_type: The GID type to search for. * @port_num: The port number of the device where the GID value should be * searched. - * @ndev: In RoCE, the net device of the device. Null means ignore. - * @index: The index into the cached GID table where the GID was found. This - * parameter may be NULL. + * @ndev: In RoCE, the net device of the device. NULL means ignore. + * + * Returns sgid attributes if the GID is found with valid reference or + * returns ERR_PTR for the error. + * The caller must invoke rdma_put_gid_attr() to release the reference. */ -int ib_find_cached_gid_by_port(struct ib_device *ib_dev, - const union ib_gid *gid, - enum ib_gid_type gid_type, - u8 port, struct net_device *ndev, - u16 *index) +const struct ib_gid_attr * +rdma_find_gid_by_port(struct ib_device *ib_dev, + const union ib_gid *gid, + enum ib_gid_type gid_type, + u8 port, struct net_device *ndev) { int local_index; struct ib_gid_table *table; unsigned long mask = GID_ATTR_FIND_MASK_GID | GID_ATTR_FIND_MASK_GID_TYPE; struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; + const struct ib_gid_attr *attr; unsigned long flags; if (!rdma_is_port_valid(ib_dev, port)) - return -ENOENT; + return ERR_PTR(-ENOENT); table = rdma_gid_table(ib_dev, port); @@ -612,89 +683,73 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev, read_lock_irqsave(&table->rwlock, flags); local_index = find_gid(table, gid, &val, false, mask, NULL); if (local_index >= 0) { - if (index) - *index = local_index; + get_gid_entry(table->data_vec[local_index]); + attr = &table->data_vec[local_index]->attr; read_unlock_irqrestore(&table->rwlock, flags); - return 0; + return attr; } read_unlock_irqrestore(&table->rwlock, flags); - return -ENOENT; + return ERR_PTR(-ENOENT); } -EXPORT_SYMBOL(ib_find_cached_gid_by_port); +EXPORT_SYMBOL(rdma_find_gid_by_port); /** - * ib_cache_gid_find_by_filter - Returns the GID table index where a specified - * GID value occurs + * rdma_find_gid_by_filter - Returns the GID table attribute where a + * specified GID value occurs * @device: The device to query. * @gid: The GID value to search for. - * @port_num: The port number of the device where the GID value could be + * @port: The port number of the device where the GID value could be * searched. * @filter: The filter function is executed on any matching GID in the table. * If the filter function returns true, the corresponding index is returned, * otherwise, we continue searching the GID table. It's guaranteed that * while filter is executed, ndev field is valid and the structure won't * change. filter is executed in an atomic context. filter must not be NULL. - * @index: The index into the cached GID table where the GID was found. 
This - * parameter may be NULL. * - * ib_cache_gid_find_by_filter() searches for the specified GID value + * rdma_find_gid_by_filter() searches for the specified GID value * of which the filter function returns true in the port's GID table. - * This function is only supported on RoCE ports. * */ -static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, - const union ib_gid *gid, - u8 port, - bool (*filter)(const union ib_gid *, - const struct ib_gid_attr *, - void *), - void *context, - u16 *index) +const struct ib_gid_attr *rdma_find_gid_by_filter( + struct ib_device *ib_dev, const union ib_gid *gid, u8 port, + bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *, + void *), + void *context) { + const struct ib_gid_attr *res = ERR_PTR(-ENOENT); struct ib_gid_table *table; - unsigned int i; unsigned long flags; - bool found = false; - + unsigned int i; - if (!rdma_is_port_valid(ib_dev, port) || - !rdma_protocol_roce(ib_dev, port)) - return -EPROTONOSUPPORT; + if (!rdma_is_port_valid(ib_dev, port)) + return ERR_PTR(-EINVAL); table = rdma_gid_table(ib_dev, port); read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { - struct ib_gid_attr attr; + struct ib_gid_table_entry *entry = table->data_vec[i]; - if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) + if (!is_gid_entry_valid(entry)) continue; - if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) + if (memcmp(gid, &entry->attr.gid, sizeof(*gid))) continue; - memcpy(&attr, &table->data_vec[i].attr, sizeof(attr)); - - if (filter(gid, &attr, context)) { - found = true; - if (index) - *index = i; + if (filter(gid, &entry->attr, context)) { + get_gid_entry(entry); + res = &entry->attr; break; } } read_unlock_irqrestore(&table->rwlock, flags); - - if (!found) - return -ENOENT; - return 0; + return res; } static struct ib_gid_table *alloc_gid_table(int sz) { - struct ib_gid_table *table = - kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); - int i; + struct ib_gid_table *table = kzalloc(sizeof(*table), GFP_KERNEL); if (!table) return NULL; @@ -707,12 +762,6 @@ static struct ib_gid_table *alloc_gid_table(int sz) table->sz = sz; rwlock_init(&table->rwlock); - - /* Mark all entries as invalid so that allocator can allocate - * one of the invalid (free) entry. 
- */ - for (i = 0; i < sz; i++) - table->data_vec[i].props |= GID_TABLE_ENTRY_INVALID; return table; err_free_table: @@ -720,12 +769,30 @@ err_free_table: return NULL; } -static void release_gid_table(struct ib_gid_table *table) +static void release_gid_table(struct ib_device *device, u8 port, + struct ib_gid_table *table) { - if (table) { - kfree(table->data_vec); - kfree(table); + bool leak = false; + int i; + + if (!table) + return; + + for (i = 0; i < table->sz; i++) { + if (is_gid_entry_free(table->data_vec[i])) + continue; + if (kref_read(&table->data_vec[i]->kref) > 1) { + pr_err("GID entry ref leak for %s (index %d) ref=%d\n", + device->name, i, + kref_read(&table->data_vec[i]->kref)); + leak = true; + } } + if (leak) + return; + + kfree(table->data_vec); + kfree(table); } static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, @@ -739,7 +806,7 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, mutex_lock(&table->lock); for (i = 0; i < table->sz; ++i) { - if (!rdma_is_zero_gid(&table->data_vec[i].gid)) { + if (is_gid_entry_valid(table->data_vec[i])) { del_gid(ib_dev, port, table, i); deleted = true; } @@ -757,12 +824,9 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, { union ib_gid gid = { }; struct ib_gid_attr gid_attr; - struct ib_gid_table *table; unsigned int gid_type; unsigned long mask; - table = rdma_gid_table(ib_dev, port); - mask = GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_DEFAULT | GID_ATTR_FIND_MASK_NETDEV; @@ -792,19 +856,12 @@ static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port, unsigned int i; unsigned long roce_gid_type_mask; unsigned int num_default_gids; - unsigned int current_gid = 0; roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); num_default_gids = hweight_long(roce_gid_type_mask); - for (i = 0; i < num_default_gids && i < table->sz; i++) { - struct ib_gid_table_entry *entry = &table->data_vec[i]; - - entry->props |= GID_TABLE_ENTRY_DEFAULT; - current_gid = find_next_bit(&roce_gid_type_mask, - BITS_PER_LONG, - current_gid); - entry->attr.gid_type = current_gid++; - } + /* Reserve starting indices for default GIDs */ + for (i = 0; i < num_default_gids && i < table->sz; i++) + table->default_gid_indices |= BIT(i); } @@ -815,7 +872,7 @@ static void gid_table_release_one(struct ib_device *ib_dev) for (port = 0; port < ib_dev->phys_port_cnt; port++) { table = ib_dev->cache.ports[port].gid; - release_gid_table(table); + release_gid_table(ib_dev, port, table); ib_dev->cache.ports[port].gid = NULL; } } @@ -869,69 +926,94 @@ static int gid_table_setup_one(struct ib_device *ib_dev) return err; } -int ib_get_cached_gid(struct ib_device *device, - u8 port_num, - int index, - union ib_gid *gid, - struct ib_gid_attr *gid_attr) +/** + * rdma_query_gid - Read the GID content from the GID software cache + * @device: Device to query the GID + * @port_num: Port number of the device + * @index: Index of the GID table entry to read + * @gid: Pointer to GID where to store the entry's GID + * + * rdma_query_gid() only reads the GID entry content for requested device, + * port and index. It reads for IB, RoCE and iWarp link layers. It doesn't + * hold any reference to the GID table entry in the HCA or software cache. + * + * Returns 0 on success or appropriate error code. 
+ * + */ +int rdma_query_gid(struct ib_device *device, u8 port_num, + int index, union ib_gid *gid) { - int res; - unsigned long flags; struct ib_gid_table *table; + unsigned long flags; + int res = -EINVAL; if (!rdma_is_port_valid(device, port_num)) return -EINVAL; table = rdma_gid_table(device, port_num); read_lock_irqsave(&table->rwlock, flags); - res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr); - read_unlock_irqrestore(&table->rwlock, flags); + if (index < 0 || index >= table->sz || + !is_gid_entry_valid(table->data_vec[index])) + goto done; + + memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid)); + res = 0; + +done: + read_unlock_irqrestore(&table->rwlock, flags); return res; } -EXPORT_SYMBOL(ib_get_cached_gid); +EXPORT_SYMBOL(rdma_query_gid); /** - * ib_find_cached_gid - Returns the port number and GID table index where - * a specified GID value occurs. + * rdma_find_gid - Returns SGID attributes if the matching GID is found. * @device: The device to query. * @gid: The GID value to search for. * @gid_type: The GID type to search for. * @ndev: In RoCE, the net device of the device. NULL means ignore. - * @port_num: The port number of the device where the GID value was found. - * @index: The index into the cached GID table where the GID was found. This - * parameter may be NULL. * - * ib_find_cached_gid() searches for the specified GID value in - * the local software cache. + * rdma_find_gid() searches for the specified GID value in the software cache. + * + * Returns GID attributes if a valid GID is found or returns ERR_PTR for the + * error. The caller must invoke rdma_put_gid_attr() to release the reference. + * */ -int ib_find_cached_gid(struct ib_device *device, - const union ib_gid *gid, - enum ib_gid_type gid_type, - struct net_device *ndev, - u8 *port_num, - u16 *index) -{ - return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index); -} -EXPORT_SYMBOL(ib_find_cached_gid); - -int ib_find_gid_by_filter(struct ib_device *device, - const union ib_gid *gid, - u8 port_num, - bool (*filter)(const union ib_gid *gid, - const struct ib_gid_attr *, - void *), - void *context, u16 *index) +const struct ib_gid_attr *rdma_find_gid(struct ib_device *device, + const union ib_gid *gid, + enum ib_gid_type gid_type, + struct net_device *ndev) { - /* Only RoCE GID table supports filter function */ - if (!rdma_protocol_roce(device, port_num) && filter) - return -EPROTONOSUPPORT; + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; + u8 p; + + if (ndev) + mask |= GID_ATTR_FIND_MASK_NETDEV; + + for (p = 0; p < device->phys_port_cnt; p++) { + struct ib_gid_table *table; + unsigned long flags; + int index; + + table = device->cache.ports[p].gid; + read_lock_irqsave(&table->rwlock, flags); + index = find_gid(table, gid, &gid_attr_val, false, mask, NULL); + if (index >= 0) { + const struct ib_gid_attr *attr; + + get_gid_entry(table->data_vec[index]); + attr = &table->data_vec[index]->attr; + read_unlock_irqrestore(&table->rwlock, flags); + return attr; + } + read_unlock_irqrestore(&table->rwlock, flags); + } - return ib_cache_gid_find_by_filter(device, gid, - port_num, filter, - context, index); + return ERR_PTR(-ENOENT); } +EXPORT_SYMBOL(rdma_find_gid); int ib_get_cached_pkey(struct ib_device *device, u8 port_num, @@ -1089,12 +1171,92 @@ int ib_get_cached_port_state(struct ib_device *device, } EXPORT_SYMBOL(ib_get_cached_port_state); +/** + * rdma_get_gid_attr - 
Returns GID attributes for a port of a device + * at a requested gid_index, if a valid GID entry exists. + * @device: The device to query. + * @port_num: The port number on the device where the GID value + * is to be queried. + * @index: Index of the GID table entry whose attributes are to + * be queried. + * + * rdma_get_gid_attr() acquires reference count of gid attributes from the + * cached GID table. Caller must invoke rdma_put_gid_attr() to release + * reference to gid attribute regardless of link layer. + * + * Returns pointer to valid gid attribute or ERR_PTR for the appropriate error + * code. + */ +const struct ib_gid_attr * +rdma_get_gid_attr(struct ib_device *device, u8 port_num, int index) +{ + const struct ib_gid_attr *attr = ERR_PTR(-EINVAL); + struct ib_gid_table *table; + unsigned long flags; + + if (!rdma_is_port_valid(device, port_num)) + return ERR_PTR(-EINVAL); + + table = rdma_gid_table(device, port_num); + if (index < 0 || index >= table->sz) + return ERR_PTR(-EINVAL); + + read_lock_irqsave(&table->rwlock, flags); + if (!is_gid_entry_valid(table->data_vec[index])) + goto done; + + get_gid_entry(table->data_vec[index]); + attr = &table->data_vec[index]->attr; +done: + read_unlock_irqrestore(&table->rwlock, flags); + return attr; +} +EXPORT_SYMBOL(rdma_get_gid_attr); + +/** + * rdma_put_gid_attr - Release reference to the GID attribute + * @attr: Pointer to the GID attribute whose reference + * needs to be released. + * + * rdma_put_gid_attr() must be used to release reference whose + * reference is acquired using rdma_get_gid_attr() or any APIs + * which returns a pointer to the ib_gid_attr regardless of link layer + * of IB or RoCE. + * + */ +void rdma_put_gid_attr(const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry = + container_of(attr, struct ib_gid_table_entry, attr); + + put_gid_entry(entry); +} +EXPORT_SYMBOL(rdma_put_gid_attr); + +/** + * rdma_hold_gid_attr - Get reference to existing GID attribute + * + * @attr: Pointer to the GID attribute whose reference + * needs to be taken. + * + * Increase the reference count to a GID attribute to keep it from being + * freed. Callers are required to already be holding a reference to attribute. 
+ * + */ +void rdma_hold_gid_attr(const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry = + container_of(attr, struct ib_gid_table_entry, attr); + + get_gid_entry(entry); +} +EXPORT_SYMBOL(rdma_hold_gid_attr); + static int config_non_roce_gid_cache(struct ib_device *device, u8 port, int gid_tbl_len) { struct ib_gid_attr gid_attr = {}; struct ib_gid_table *table; - union ib_gid gid; int ret = 0; int i; @@ -1106,14 +1268,14 @@ static int config_non_roce_gid_cache(struct ib_device *device, for (i = 0; i < gid_tbl_len; ++i) { if (!device->query_gid) continue; - ret = device->query_gid(device, port, i, &gid); + ret = device->query_gid(device, port, i, &gid_attr.gid); if (ret) { pr_warn("query_gid failed (%d) for %s (index %d)\n", ret, device->name, i); goto err; } gid_attr.index = i; - add_modify_gid(table, &gid, &gid_attr); + add_modify_gid(table, &gid_attr); } err: mutex_unlock(&table->lock); @@ -1128,13 +1290,10 @@ static void ib_cache_update(struct ib_device *device, struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache; int i; int ret; - struct ib_gid_table *table; if (!rdma_is_port_valid(device, port)) return; - table = rdma_gid_table(device, port); - tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) return; @@ -1296,4 +1455,9 @@ void ib_cache_cleanup_one(struct ib_device *device) ib_unregister_event_handler(&device->cache.event_handler); flush_workqueue(ib_wq); gid_table_cleanup_one(device); + + /* + * Flush the wq second time for any pending GID delete work. + */ + flush_workqueue(ib_wq); } diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 27a7b0a2e27a..6e39c27dca8e 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -474,7 +474,7 @@ static int cm_init_av_for_lap(struct cm_port *port, struct ib_wc *wc, if (ret) return ret; - memcpy(&av->ah_attr, &new_ah_attr, sizeof(new_ah_attr)); + rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); return 0; } @@ -508,31 +508,50 @@ static int add_cm_id_to_port_list(struct cm_id_private *cm_id_priv, return ret; } -static struct cm_port *get_cm_port_from_path(struct sa_path_rec *path) +static struct cm_port * +get_cm_port_from_path(struct sa_path_rec *path, const struct ib_gid_attr *attr) { struct cm_device *cm_dev; struct cm_port *port = NULL; unsigned long flags; - u8 p; - struct net_device *ndev = ib_get_ndev_from_path(path); - - read_lock_irqsave(&cm.device_lock, flags); - list_for_each_entry(cm_dev, &cm.device_list, list) { - if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, - sa_conv_pathrec_to_gid_type(path), - ndev, &p, NULL)) { - port = cm_dev->port[p - 1]; - break; + + if (attr) { + read_lock_irqsave(&cm.device_lock, flags); + list_for_each_entry(cm_dev, &cm.device_list, list) { + if (cm_dev->ib_device == attr->device) { + port = cm_dev->port[attr->port_num - 1]; + break; + } + } + read_unlock_irqrestore(&cm.device_lock, flags); + } else { + /* SGID attribute can be NULL in following + * conditions. 
+ * (a) Alternative path + * (b) IB link layer without GRH + * (c) LAP send messages + */ + read_lock_irqsave(&cm.device_lock, flags); + list_for_each_entry(cm_dev, &cm.device_list, list) { + attr = rdma_find_gid(cm_dev->ib_device, + &path->sgid, + sa_conv_pathrec_to_gid_type(path), + NULL); + if (!IS_ERR(attr)) { + port = cm_dev->port[attr->port_num - 1]; + break; + } } + read_unlock_irqrestore(&cm.device_lock, flags); + if (port) + rdma_put_gid_attr(attr); } - read_unlock_irqrestore(&cm.device_lock, flags); - - if (ndev) - dev_put(ndev); return port; } -static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, +static int cm_init_av_by_path(struct sa_path_rec *path, + const struct ib_gid_attr *sgid_attr, + struct cm_av *av, struct cm_id_private *cm_id_priv) { struct rdma_ah_attr new_ah_attr; @@ -540,7 +559,7 @@ static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, struct cm_port *port; int ret; - port = get_cm_port_from_path(path); + port = get_cm_port_from_path(path, sgid_attr); if (!port) return -EINVAL; cm_dev = port->cm_dev; @@ -554,22 +573,26 @@ static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, /* * av->ah_attr might be initialized based on wc or during - * request processing time. So initialize a new ah_attr on stack. + * request processing time which might have reference to sgid_attr. + * So initialize a new ah_attr on stack. * If initialization fails, old ah_attr is used for sending any * responses. If initialization is successful, than new ah_attr - * is used by overwriting the old one. + * is used by overwriting the old one. So that right ah_attr + * can be used to return an error response. */ ret = ib_init_ah_attr_from_path(cm_dev->ib_device, port->port_num, path, - &new_ah_attr); + &new_ah_attr, sgid_attr); if (ret) return ret; av->timeout = path->packet_life_time + 1; ret = add_cm_id_to_port_list(cm_id_priv, av, port); - if (ret) + if (ret) { + rdma_destroy_ah_attr(&new_ah_attr); return ret; - memcpy(&av->ah_attr, &new_ah_attr, sizeof(new_ah_attr)); + } + rdma_move_ah_attr(&av->ah_attr, &new_ah_attr); return 0; } @@ -1091,6 +1114,9 @@ retest: wait_for_completion(&cm_id_priv->comp); while ((work = cm_dequeue_work(cm_id_priv)) != NULL) cm_free_work(work); + + rdma_destroy_ah_attr(&cm_id_priv->av.ah_attr); + rdma_destroy_ah_attr(&cm_id_priv->alt_av.ah_attr); kfree(cm_id_priv->private_data); kfree(cm_id_priv); } @@ -1230,14 +1256,12 @@ new_id: } EXPORT_SYMBOL(ib_cm_insert_listen); -static __be64 cm_form_tid(struct cm_id_private *cm_id_priv, - enum cm_msg_sequence msg_seq) +static __be64 cm_form_tid(struct cm_id_private *cm_id_priv) { u64 hi_tid, low_tid; hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32; - low_tid = (u64) ((__force u32)cm_id_priv->id.local_id | - (msg_seq << 30)); + low_tid = (u64)cm_id_priv->id.local_id; return cpu_to_be64(hi_tid | low_tid); } @@ -1265,7 +1289,7 @@ static void cm_format_req(struct cm_req_msg *req_msg, pri_path->opa.slid); cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, - cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); + cm_form_tid(cm_id_priv)); req_msg->local_comm_id = cm_id_priv->id.local_id; req_msg->service_id = param->service_id; @@ -1413,12 +1437,13 @@ int ib_send_cm_req(struct ib_cm_id *cm_id, goto out; } - ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av, + ret = cm_init_av_by_path(param->primary_path, + param->ppath_sgid_attr, &cm_id_priv->av, cm_id_priv); if (ret) goto error1; if (param->alternate_path) { - ret = 
cm_init_av_by_path(param->alternate_path, + ret = cm_init_av_by_path(param->alternate_path, NULL, &cm_id_priv->alt_av, cm_id_priv); if (ret) goto error1; @@ -1646,7 +1671,7 @@ static void cm_opa_to_ib_sgid(struct cm_work *work, (ib_is_opa_gid(&path->sgid))) { union ib_gid sgid; - if (ib_get_cached_gid(dev, port_num, 0, &sgid, NULL)) { + if (rdma_query_gid(dev, port_num, 0, &sgid)) { dev_warn(&dev->dev, "Error updating sgid in CM request\n"); return; @@ -1691,6 +1716,7 @@ static void cm_format_req_event(struct cm_work *work, param->retry_count = cm_req_get_retry_count(req_msg); param->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg); param->srq = cm_req_get_srq(req_msg); + param->ppath_sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr; work->cm_event.private_data = &req_msg->private_data; } @@ -1914,9 +1940,8 @@ static int cm_req_handler(struct cm_work *work) struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_req_msg *req_msg; - union ib_gid gid; - struct ib_gid_attr gid_attr; const struct ib_global_route *grh; + const struct ib_gid_attr *gid_attr; int ret; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; @@ -1961,24 +1986,13 @@ static int cm_req_handler(struct cm_work *work) if (cm_req_has_alt_path(req_msg)) memset(&work->path[1], 0, sizeof(work->path[1])); grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr); - ret = ib_get_cached_gid(work->port->cm_dev->ib_device, - work->port->port_num, - grh->sgid_index, - &gid, &gid_attr); - if (ret) { - ib_send_cm_rej(cm_id, IB_CM_REJ_UNSUPPORTED, NULL, 0, NULL, 0); - goto rejected; - } + gid_attr = grh->sgid_attr; - if (gid_attr.ndev) { + if (gid_attr && gid_attr->ndev) { work->path[0].rec_type = - sa_conv_gid_to_pathrec_type(gid_attr.gid_type); - sa_path_set_ifindex(&work->path[0], - gid_attr.ndev->ifindex); - sa_path_set_ndev(&work->path[0], - dev_net(gid_attr.ndev)); - dev_put(gid_attr.ndev); + sa_conv_gid_to_pathrec_type(gid_attr->gid_type); } else { + /* If no GID attribute or ndev is null, it is not RoCE. 
*/ cm_path_set_rec_type(work->port->cm_dev->ib_device, work->port->port_num, &work->path[0], @@ -1992,15 +2006,14 @@ static int cm_req_handler(struct cm_work *work) sa_path_set_dmac(&work->path[0], cm_id_priv->av.ah_attr.roce.dmac); work->path[0].hop_limit = grh->hop_limit; - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, + ret = cm_init_av_by_path(&work->path[0], gid_attr, &cm_id_priv->av, cm_id_priv); if (ret) { int err; - err = ib_get_cached_gid(work->port->cm_dev->ib_device, - work->port->port_num, 0, - &work->path[0].sgid, - NULL); + err = rdma_query_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, + &work->path[0].sgid); if (err) ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, NULL, 0, NULL, 0); @@ -2012,8 +2025,8 @@ static int cm_req_handler(struct cm_work *work) goto rejected; } if (cm_req_has_alt_path(req_msg)) { - ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av, - cm_id_priv); + ret = cm_init_av_by_path(&work->path[1], NULL, + &cm_id_priv->alt_av, cm_id_priv); if (ret) { ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, @@ -2451,7 +2464,7 @@ static void cm_format_dreq(struct cm_dreq_msg *dreq_msg, u8 private_data_len) { cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID, - cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_DREQ)); + cm_form_tid(cm_id_priv)); dreq_msg->local_comm_id = cm_id_priv->id.local_id; dreq_msg->remote_comm_id = cm_id_priv->id.remote_id; cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn); @@ -3082,7 +3095,7 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, alt_ext = opa_is_extended_lid(alternate_path->opa.dlid, alternate_path->opa.slid); cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID, - cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP)); + cm_form_tid(cm_id_priv)); lap_msg->local_comm_id = cm_id_priv->id.local_id; lap_msg->remote_comm_id = cm_id_priv->id.remote_id; cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn); @@ -3136,7 +3149,7 @@ int ib_send_cm_lap(struct ib_cm_id *cm_id, goto out; } - ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av, + ret = cm_init_av_by_path(alternate_path, NULL, &cm_id_priv->alt_av, cm_id_priv); if (ret) goto out; @@ -3279,7 +3292,7 @@ static int cm_lap_handler(struct cm_work *work) if (ret) goto unlock; - cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, + cm_init_av_by_path(param->alternate_path, NULL, &cm_id_priv->alt_av, cm_id_priv); cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; @@ -3458,7 +3471,7 @@ static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg, struct ib_cm_sidr_req_param *param) { cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID, - cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR)); + cm_form_tid(cm_id_priv)); sidr_req_msg->request_id = cm_id_priv->id.local_id; sidr_req_msg->pkey = param->path->pkey; sidr_req_msg->service_id = param->service_id; @@ -3481,7 +3494,9 @@ int ib_send_cm_sidr_req(struct ib_cm_id *cm_id, return -EINVAL; cm_id_priv = container_of(cm_id, struct cm_id_private, id); - ret = cm_init_av_by_path(param->path, &cm_id_priv->av, cm_id_priv); + ret = cm_init_av_by_path(param->path, param->sgid_attr, + &cm_id_priv->av, + cm_id_priv); if (ret) goto out; @@ -3518,6 +3533,7 @@ out: EXPORT_SYMBOL(ib_send_cm_sidr_req); static void cm_format_sidr_req_event(struct cm_work *work, + const struct cm_id_private *rx_cm_id, struct ib_cm_id *listen_id) { struct cm_sidr_req_msg *sidr_req_msg; @@ -3531,6 +3547,7 @@ static void cm_format_sidr_req_event(struct cm_work *work, 
param->service_id = sidr_req_msg->service_id; param->bth_pkey = cm_get_bth_pkey(work); param->port = work->port->port_num; + param->sgid_attr = rx_cm_id->av.ah_attr.grh.sgid_attr; work->cm_event.private_data = &sidr_req_msg->private_data; } @@ -3588,7 +3605,7 @@ static int cm_sidr_req_handler(struct cm_work *work) cm_id_priv->id.service_id = sidr_req_msg->service_id; cm_id_priv->id.service_mask = ~cpu_to_be64(0); - cm_format_sidr_req_event(work, &cur_cm_id_priv->id); + cm_format_sidr_req_event(work, cm_id_priv, &cur_cm_id_priv->id); cm_process_work(cm_id_priv, work); cm_deref_id(cur_cm_id_priv); return 0; @@ -3665,7 +3682,8 @@ error: spin_unlock_irqrestore(&cm_id_priv->lock, flags); } EXPORT_SYMBOL(ib_send_cm_sidr_rep); -static void cm_format_sidr_rep_event(struct cm_work *work) +static void cm_format_sidr_rep_event(struct cm_work *work, + const struct cm_id_private *cm_id_priv) { struct cm_sidr_rep_msg *sidr_rep_msg; struct ib_cm_sidr_rep_event_param *param; @@ -3678,6 +3696,7 @@ static void cm_format_sidr_rep_event(struct cm_work *work) param->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(sidr_rep_msg)); param->info = &sidr_rep_msg->info; param->info_len = sidr_rep_msg->info_length; + param->sgid_attr = cm_id_priv->av.ah_attr.grh.sgid_attr; work->cm_event.private_data = &sidr_rep_msg->private_data; } @@ -3701,7 +3720,7 @@ static int cm_sidr_rep_handler(struct cm_work *work) ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg); spin_unlock_irq(&cm_id_priv->lock); - cm_format_sidr_rep_event(work); + cm_format_sidr_rep_event(work, cm_id_priv); cm_process_work(cm_id_priv, work); return 0; out: diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h index 8b76f0ef965e..476d4309576d 100644 --- a/drivers/infiniband/core/cm_msgs.h +++ b/drivers/infiniband/core/cm_msgs.h @@ -44,13 +44,6 @@ #define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */ -enum cm_msg_sequence { - CM_MSG_SEQUENCE_REQ, - CM_MSG_SEQUENCE_LAP, - CM_MSG_SEQUENCE_DREQ, - CM_MSG_SEQUENCE_SIDR -}; - struct cm_req_msg { struct ib_mad_hdr hdr; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index bff10ab141b0..f72677291b69 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -366,7 +366,6 @@ struct cma_multicast { void *context; struct sockaddr_storage addr; struct kref mcref; - bool igmp_joined; u8 join_state; }; @@ -412,11 +411,11 @@ struct cma_req_info { struct sockaddr_storage listen_addr_storage; struct sockaddr_storage src_addr_storage; struct ib_device *device; - int port; union ib_gid local_gid; __be64 service_id; + int port; + bool has_gid; u16 pkey; - bool has_gid:1; }; static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp) @@ -491,12 +490,10 @@ static void _cma_attach_to_dev(struct rdma_id_private *id_priv, { cma_ref_dev(cma_dev); id_priv->cma_dev = cma_dev; - id_priv->gid_type = 0; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); - id_priv->res.type = RDMA_RESTRACK_CM_ID; rdma_restrack_add(&id_priv->res); } @@ -603,46 +600,53 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a return ret; } -static inline int cma_validate_port(struct ib_device *device, u8 port, - enum ib_gid_type gid_type, - union ib_gid *gid, - struct rdma_id_private *id_priv) +static const struct ib_gid_attr * +cma_validate_port(struct ib_device *device, u8 port, + enum 
ib_gid_type gid_type, + union ib_gid *gid, + struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; int bound_if_index = dev_addr->bound_dev_if; + const struct ib_gid_attr *sgid_attr; int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; - int ret = -ENODEV; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) - return ret; + return ERR_PTR(-ENODEV); if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) - return ret; + return ERR_PTR(-ENODEV); if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(dev_addr->net, bound_if_index); if (!ndev) - return ret; + return ERR_PTR(-ENODEV); } else { gid_type = IB_GID_TYPE_IB; } - ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, - ndev, NULL); - + sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev); if (ndev) dev_put(ndev); + return sgid_attr; +} - return ret; +static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, + const struct ib_gid_attr *sgid_attr) +{ + WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr); + id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } static int cma_acquire_dev(struct rdma_id_private *id_priv, - struct rdma_id_private *listen_id_priv) + const struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; struct cma_device *cma_dev; union ib_gid gid, iboe_gid, *gidp; + enum ib_gid_type gid_type; int ret = -ENODEV; u8 port; @@ -662,14 +666,13 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, port = listen_id_priv->id.port_num; gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; - - ret = cma_validate_port(cma_dev->device, port, - rdma_protocol_ib(cma_dev->device, port) ? - IB_GID_TYPE_IB : - listen_id_priv->gid_type, gidp, - id_priv); - if (!ret) { + gid_type = listen_id_priv->gid_type; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, gidp, id_priv); + if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + ret = 0; goto out; } } @@ -683,14 +686,13 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; - - ret = cma_validate_port(cma_dev->device, port, - rdma_protocol_ib(cma_dev->device, port) ? 
- IB_GID_TYPE_IB : - cma_dev->default_gid_type[port - 1], - gidp, id_priv); - if (!ret) { + gid_type = cma_dev->default_gid_type[port - 1]; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, gidp, id_priv); + if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + ret = 0; goto out; } } @@ -732,8 +734,8 @@ static int cma_resolve_ib_dev(struct rdma_id_private *id_priv) if (ib_get_cached_port_state(cur_dev->device, p, &port_state)) continue; - for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, - &gid, NULL); + for (i = 0; !rdma_query_gid(cur_dev->device, + p, i, &gid); i++) { if (!memcmp(&gid, dgid, sizeof(gid))) { cma_dev = cur_dev; @@ -785,12 +787,14 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, id_priv->res.kern_name = caller; else rdma_restrack_set_task(&id_priv->res, current); + id_priv->res.type = RDMA_RESTRACK_CM_ID; id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; id_priv->id.event_handler = event_handler; id_priv->id.ps = ps; id_priv->id.qp_type = qp_type; id_priv->tos_set = false; + id_priv->gid_type = IB_GID_TYPE_IB; spin_lock_init(&id_priv->lock); mutex_init(&id_priv->qp_mutex); init_completion(&id_priv->comp); @@ -1036,35 +1040,38 @@ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr, } EXPORT_SYMBOL(rdma_init_qp_attr); -static inline int cma_zero_addr(struct sockaddr *addr) +static inline bool cma_zero_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: - return ipv6_addr_any(&((struct sockaddr_in6 *) addr)->sin6_addr); + return ipv6_addr_any(&((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: - return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr); + return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr); default: - return 0; + return false; } } -static inline int cma_loopback_addr(struct sockaddr *addr) +static inline bool cma_loopback_addr(const struct sockaddr *addr) { switch (addr->sa_family) { case AF_INET: - return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr); + return ipv4_is_loopback( + ((struct sockaddr_in *)addr)->sin_addr.s_addr); case AF_INET6: - return ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr); + return ipv6_addr_loopback( + &((struct sockaddr_in6 *)addr)->sin6_addr); case AF_IB: - return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr); + return ib_addr_loopback( + &((struct sockaddr_ib *)addr)->sib_addr); default: - return 0; + return false; } } -static inline int cma_any_addr(struct sockaddr *addr) +static inline bool cma_any_addr(const struct sockaddr *addr) { return cma_zero_addr(addr) || cma_loopback_addr(addr); } @@ -1087,7 +1094,7 @@ static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst) } } -static __be16 cma_port(struct sockaddr *addr) +static __be16 cma_port(const struct sockaddr *addr) { struct sockaddr_ib *sib; @@ -1105,15 +1112,15 @@ static __be16 cma_port(struct sockaddr *addr) } } -static inline int cma_any_port(struct sockaddr *addr) +static inline int cma_any_port(const struct sockaddr *addr) { return !cma_port(addr); } static void cma_save_ib_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, - struct rdma_cm_id *listen_id, - struct sa_path_rec *path) + const struct rdma_cm_id *listen_id, + const struct sa_path_rec *path) { struct sockaddr_ib *listen_ib, *ib; @@ -1198,7 +1205,7 @@ static u16 cma_port_from_service_id(__be64 service_id) static int 
cma_save_ip_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, - struct ib_cm_event *ib_event, + const struct ib_cm_event *ib_event, __be64 service_id) { struct cma_hdr *hdr; @@ -1228,8 +1235,8 @@ static int cma_save_ip_info(struct sockaddr *src_addr, static int cma_save_net_info(struct sockaddr *src_addr, struct sockaddr *dst_addr, - struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event, + const struct rdma_cm_id *listen_id, + const struct ib_cm_event *ib_event, sa_family_t sa_family, __be64 service_id) { if (sa_family == AF_IB) { @@ -1361,7 +1368,23 @@ static bool validate_net_dev(struct net_device *net_dev, } } -static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event, +static struct net_device * +roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event) +{ + const struct ib_gid_attr *sgid_attr = NULL; + + if (ib_event->event == IB_CM_REQ_RECEIVED) + sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr; + else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) + sgid_attr = ib_event->param.sidr_req_rcvd.sgid_attr; + + if (!sgid_attr) + return NULL; + dev_hold(sgid_attr->ndev); + return sgid_attr->ndev; +} + +static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event, struct cma_req_info *req) { struct sockaddr *listen_addr = @@ -1376,8 +1399,12 @@ static struct net_device *cma_get_net_dev(struct ib_cm_event *ib_event, if (err) return ERR_PTR(err); - net_dev = ib_get_net_dev_by_params(req->device, req->port, req->pkey, - gid, listen_addr); + if (rdma_protocol_roce(req->device, req->port)) + net_dev = roce_get_net_dev_by_cm_event(ib_event); + else + net_dev = ib_get_net_dev_by_params(req->device, req->port, + req->pkey, + gid, listen_addr); if (!net_dev) return ERR_PTR(-ENODEV); @@ -1440,14 +1467,20 @@ static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct rdma_addr *addr = &id->route.addr; if (!net_dev) - /* This request is an AF_IB request or a RoCE request */ + /* This request is an AF_IB request */ return (!id->port_num || id->port_num == port_num) && - (addr->src_addr.ss_family == AF_IB || - rdma_protocol_roce(id->device, port_num)); + (addr->src_addr.ss_family == AF_IB); - return !addr->dev_addr.bound_dev_if || - (net_eq(dev_net(net_dev), addr->dev_addr.net) && - addr->dev_addr.bound_dev_if == net_dev->ifindex); + /* + * Net namespaces must match, and if the listner is listening + * on a specific netdevice than netdevice must match as well. 
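As an illustration of the listener-matching rule described in the comment above and implemented just below: the compact !!bound_dev_if == (bound_dev_if == ifindex) test is equivalent to "not bound to a netdevice, or bound to exactly this one", provided a real netdevice always has a nonzero ifindex. A standalone sketch (struct names and test values are illustrative, not from the patch):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the fields used by cma_match_net_dev(). */
struct listener { unsigned int bound_dev_if; };   /* 0 = not bound to a netdev */
struct netdev   { unsigned int ifindex; };        /* real devices have ifindex >= 1 */

/* Spelled-out form; the namespace check (net_eq()) is elided here. */
static bool matches_spelled_out(const struct listener *l, const struct netdev *d)
{
        return l->bound_dev_if == 0 || l->bound_dev_if == d->ifindex;
}

/* The form used in the patch: !!bound == (bound == ifindex). */
static bool matches_patch_form(const struct listener *l, const struct netdev *d)
{
        return !!l->bound_dev_if == (l->bound_dev_if == d->ifindex);
}

int main(void)
{
        struct netdev eth = { .ifindex = 3 };
        struct listener any = { .bound_dev_if = 0 };
        struct listener same = { .bound_dev_if = 3 };
        struct listener other = { .bound_dev_if = 7 };

        /* Both forms agree for every case, since eth.ifindex is nonzero. */
        printf("any:   %d %d\n", matches_spelled_out(&any, &eth), matches_patch_form(&any, &eth));
        printf("same:  %d %d\n", matches_spelled_out(&same, &eth), matches_patch_form(&same, &eth));
        printf("other: %d %d\n", matches_spelled_out(&other, &eth), matches_patch_form(&other, &eth));
        return 0;
}
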
+ */ + if (net_eq(dev_net(net_dev), addr->dev_addr.net) && + (!!addr->dev_addr.bound_dev_if == + (addr->dev_addr.bound_dev_if == net_dev->ifindex))) + return true; + else + return false; } static struct rdma_id_private *cma_find_listener( @@ -1480,9 +1513,10 @@ static struct rdma_id_private *cma_find_listener( return ERR_PTR(-EINVAL); } -static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id, - struct ib_cm_event *ib_event, - struct net_device **net_dev) +static struct rdma_id_private * +cma_ib_id_from_event(struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event, + struct net_device **net_dev) { struct cma_req_info req; struct rdma_bind_list *bind_list; @@ -1498,10 +1532,6 @@ static struct rdma_id_private *cma_id_from_event(struct ib_cm_id *cm_id, if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ *net_dev = NULL; - } else if (rdma_protocol_roce(req.device, req.port)) { - /* TODO find the net dev matching the request parameters - * through the RoCE GID table */ - *net_dev = NULL; } else { return ERR_CAST(*net_dev); } @@ -1629,6 +1659,21 @@ static void cma_release_port(struct rdma_id_private *id_priv) mutex_unlock(&lock); } +static void cma_leave_roce_mc_group(struct rdma_id_private *id_priv, + struct cma_multicast *mc) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, false); + dev_put(ndev); + } + kref_put(&mc->mcref, release_mc); +} + static void cma_leave_mc_groups(struct rdma_id_private *id_priv) { struct cma_multicast *mc; @@ -1642,22 +1687,7 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) ib_sa_free_multicast(mc->multicast.ib); kfree(mc); } else { - if (mc->igmp_joined) { - struct rdma_dev_addr *dev_addr = - &id_priv->id.route.addr.dev_addr; - struct net_device *ndev = NULL; - - if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(&init_net, - dev_addr->bound_dev_if); - if (ndev) { - cma_igmp_send(ndev, - &mc->multicast.ib->rec.mgid, - false); - dev_put(ndev); - } - } - kref_put(&mc->mcref, release_mc); + cma_leave_roce_mc_group(id_priv, mc); } } } @@ -1699,6 +1729,10 @@ void rdma_destroy_id(struct rdma_cm_id *id) cma_deref_id(id_priv->id.context); kfree(id_priv->id.route.path_rec); + + if (id_priv->id.route.addr.dev_addr.sgid_attr) + rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr); + put_net(id_priv->id.route.addr.dev_addr.net); kfree(id_priv); } @@ -1730,7 +1764,7 @@ reject: } static void cma_set_rep_event_data(struct rdma_cm_event *event, - struct ib_cm_rep_event_param *rep_data, + const struct ib_cm_rep_event_param *rep_data, void *private_data) { event->param.conn.private_data = private_data; @@ -1743,10 +1777,11 @@ static void cma_set_rep_event_data(struct rdma_cm_event *event, event->param.conn.qp_num = rep_data->remote_qpn; } -static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) +static int cma_ib_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; - struct rdma_cm_event event; + struct rdma_cm_event event = {}; int ret = 0; mutex_lock(&id_priv->handler_mutex); @@ -1756,7 +1791,6 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) id_priv->state != RDMA_CM_DISCONNECT)) goto out; - memset(&event, 0, sizeof event); switch (ib_event->event) { case 
IB_CM_REQ_ERROR: case IB_CM_REP_ERROR: @@ -1825,9 +1859,10 @@ out: return ret; } -static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event, - struct net_device *net_dev) +static struct rdma_id_private * +cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, + const struct ib_cm_event *ib_event, + struct net_device *net_dev) { struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; @@ -1888,11 +1923,12 @@ err: return NULL; } -static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id, - struct ib_cm_event *ib_event, - struct net_device *net_dev) +static struct rdma_id_private * +cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, + const struct ib_cm_event *ib_event, + struct net_device *net_dev) { - struct rdma_id_private *listen_id_priv; + const struct rdma_id_private *listen_id_priv; struct rdma_id_private *id_priv; struct rdma_cm_id *id; const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family; @@ -1932,7 +1968,7 @@ err: } static void cma_set_req_event_data(struct rdma_cm_event *event, - struct ib_cm_req_event_param *req_data, + const struct ib_cm_req_event_param *req_data, void *private_data, int offset) { event->param.conn.private_data = private_data + offset; @@ -1946,7 +1982,8 @@ static void cma_set_req_event_data(struct rdma_cm_event *event, event->param.conn.qp_num = req_data->remote_qpn; } -static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event) +static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id, + const struct ib_cm_event *ib_event) { return (((ib_event->event == IB_CM_REQ_RECEIVED) && (ib_event->param.req_rcvd.qp_type == id->qp_type)) || @@ -1955,19 +1992,20 @@ static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_e (!id->qp_type)); } -static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) +static int cma_ib_req_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *ib_event) { struct rdma_id_private *listen_id, *conn_id = NULL; - struct rdma_cm_event event; + struct rdma_cm_event event = {}; struct net_device *net_dev; u8 offset; int ret; - listen_id = cma_id_from_event(cm_id, ib_event, &net_dev); + listen_id = cma_ib_id_from_event(cm_id, ib_event, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); - if (!cma_check_req_qp_type(&listen_id->id, ib_event)) { + if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) { ret = -EINVAL; goto net_dev_put; } @@ -1978,16 +2016,15 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) goto err1; } - memset(&event, 0, sizeof event); offset = cma_user_data_offset(listen_id); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) { - conn_id = cma_new_udp_id(&listen_id->id, ib_event, net_dev); + conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev); event.param.ud.private_data = ib_event->private_data + offset; event.param.ud.private_data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset; } else { - conn_id = cma_new_conn_id(&listen_id->id, ib_event, net_dev); + conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev); cma_set_req_event_data(&event, &ib_event->param.req_rcvd, ib_event->private_data, offset); } @@ -2087,7 +2124,7 @@ EXPORT_SYMBOL(rdma_read_gids); static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; - struct rdma_cm_event event; + struct rdma_cm_event event = 
{}; int ret = 0; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; @@ -2096,7 +2133,6 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) if (id_priv->state != RDMA_CM_CONNECT) goto out; - memset(&event, 0, sizeof event); switch (iw_event->event) { case IW_CM_EVENT_CLOSE: event.event = RDMA_CM_EVENT_DISCONNECTED; @@ -2156,11 +2192,17 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, { struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; - struct rdma_cm_event event; + struct rdma_cm_event event = {}; int ret = -ECONNABORTED; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; + event.event = RDMA_CM_EVENT_CONNECT_REQUEST; + event.param.conn.private_data = iw_event->private_data; + event.param.conn.private_data_len = iw_event->private_data_len; + event.param.conn.initiator_depth = iw_event->ird; + event.param.conn.responder_resources = iw_event->ord; + listen_id = cm_id->context; mutex_lock(&listen_id->handler_mutex); @@ -2202,13 +2244,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); - memset(&event, 0, sizeof event); - event.event = RDMA_CM_EVENT_CONNECT_REQUEST; - event.param.conn.private_data = iw_event->private_data; - event.param.conn.private_data_len = iw_event->private_data_len; - event.param.conn.initiator_depth = iw_event->ird; - event.param.conn.responder_resources = iw_event->ord; - /* * Protect against the user destroying conn_id from another thread * until we're done accessing it. @@ -2241,7 +2276,8 @@ static int cma_ib_listen(struct rdma_id_private *id_priv) addr = cma_src_addr(id_priv); svc_id = rdma_get_service_id(&id_priv->id, addr); - id = ib_cm_insert_listen(id_priv->id.device, cma_req_handler, svc_id); + id = ib_cm_insert_listen(id_priv->id.device, + cma_ib_req_handler, svc_id); if (IS_ERR(id)) return PTR_ERR(id); id_priv->cm_id.ib = id; @@ -2561,8 +2597,6 @@ cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); route->path_rec->roce.route_resolved = true; - sa_path_set_ndev(route->path_rec, addr->dev_addr.net); - sa_path_set_ifindex(route->path_rec, ndev->ifindex); sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); return ndev; } @@ -2791,7 +2825,7 @@ static int cma_bind_loopback(struct rdma_id_private *id_priv) p = 1; port_found: - ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid, NULL); + ret = rdma_query_gid(cma_dev->device, p, 0, &gid); if (ret) goto out; @@ -2817,9 +2851,8 @@ static void addr_handler(int status, struct sockaddr *src_addr, struct rdma_dev_addr *dev_addr, void *context) { struct rdma_id_private *id_priv = context; - struct rdma_cm_event event; + struct rdma_cm_event event = {}; - memset(&event, 0, sizeof event); mutex_lock(&id_priv->handler_mutex); if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_RESOLVED)) @@ -2910,7 +2943,7 @@ err: } static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - struct sockaddr *dst_addr) + const struct sockaddr *dst_addr) { if (!src_addr || !src_addr->sa_family) { src_addr = (struct sockaddr *) &id->route.addr.src_addr; @@ -2931,31 +2964,25 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, } int 
rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - struct sockaddr *dst_addr, int timeout_ms) + const struct sockaddr *dst_addr, int timeout_ms) { struct rdma_id_private *id_priv; int ret; id_priv = container_of(id, struct rdma_id_private, id); - memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); if (id_priv->state == RDMA_CM_IDLE) { ret = cma_bind_addr(id, src_addr, dst_addr); - if (ret) { - memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); + if (ret) return ret; - } } - if (cma_family(id_priv) != dst_addr->sa_family) { - memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); + if (cma_family(id_priv) != dst_addr->sa_family) return -EINVAL; - } - if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) { - memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr)); + if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) return -EINVAL; - } + memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); atomic_inc(&id_priv->refcount); if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); @@ -3451,18 +3478,18 @@ static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv) } static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, - struct ib_cm_event *ib_event) + const struct ib_cm_event *ib_event) { struct rdma_id_private *id_priv = cm_id->context; - struct rdma_cm_event event; - struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; + struct rdma_cm_event event = {}; + const struct ib_cm_sidr_rep_event_param *rep = + &ib_event->param.sidr_rep_rcvd; int ret = 0; mutex_lock(&id_priv->handler_mutex); if (id_priv->state != RDMA_CM_CONNECT) goto out; - memset(&event, 0, sizeof event); switch (ib_event->event) { case IB_CM_SIDR_REQ_ERROR: event.event = RDMA_CM_EVENT_UNREACHABLE; @@ -3488,7 +3515,8 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, ib_init_ah_attr_from_path(id_priv->id.device, id_priv->id.port_num, id_priv->id.route.path_rec, - &event.param.ud.ah_attr); + &event.param.ud.ah_attr, + rep->sgid_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; @@ -3501,6 +3529,8 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, } ret = id_priv->id.event_handler(&id_priv->id, &event); + + rdma_destroy_ah_attr(&event.param.ud.ah_attr); if (ret) { /* Destroy the CM ID by returning a non-zero value. 
*/ id_priv->cm_id.ib = NULL; @@ -3557,6 +3587,7 @@ static int cma_resolve_ib_udp(struct rdma_id_private *id_priv, id_priv->cm_id.ib = id; req.path = id_priv->id.route.path_rec; + req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8); req.max_cm_retries = CMA_MAX_CM_RETRIES; @@ -3618,6 +3649,8 @@ static int cma_connect_ib(struct rdma_id_private *id_priv, if (route->num_paths == 2) req.alternate_path = &route->path_rec[1]; + req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr; + /* Alternate path SGID attribute currently unsupported */ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv)); req.qp_num = id_priv->qp_num; req.qp_type = id_priv->id.qp_type; @@ -3928,7 +3961,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) { struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; - struct rdma_cm_event event; + struct rdma_cm_event event = {}; int ret = 0; id_priv = mc->id_priv; @@ -3952,7 +3985,6 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) } mutex_unlock(&id_priv->qp_mutex); - memset(&event, 0, sizeof event); event.status = status; event.param.ud.private_data = mc->context; if (!status) { @@ -3981,6 +4013,8 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) event.event = RDMA_CM_EVENT_MULTICAST_ERROR; ret = id_priv->id.event_handler(&id_priv->id, &event); + + rdma_destroy_ah_attr(&event.param.ud.ah_attr); if (ret) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); @@ -4010,7 +4044,7 @@ static void cma_set_mgid(struct rdma_id_private *id_priv, memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else if (addr->sa_family == AF_IB) { memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid); - } else if ((addr->sa_family == AF_INET6)) { + } else if (addr->sa_family == AF_INET6) { ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map); if (id_priv->id.ps == RDMA_PS_UDP) mc_map[7] = 0x01; /* Use RDMA CM signature */ @@ -4168,8 +4202,6 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, if (!send_only) { err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, true); - if (!err) - mc->igmp_joined = true; } } } else { @@ -4221,26 +4253,29 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; - mc->igmp_joined = false; mc->join_state = join_state; - spin_lock(&id_priv->lock); - list_add(&mc->list, &id_priv->mc_list); - spin_unlock(&id_priv->lock); if (rdma_protocol_roce(id->device, id->port_num)) { kref_init(&mc->mcref); ret = cma_iboe_join_multicast(id_priv, mc); - } else if (rdma_cap_ib_mcast(id->device, id->port_num)) + if (ret) + goto out_err; + } else if (rdma_cap_ib_mcast(id->device, id->port_num)) { ret = cma_join_ib_multicast(id_priv, mc); - else + if (ret) + goto out_err; + } else { ret = -ENOSYS; - - if (ret) { - spin_lock_irq(&id_priv->lock); - list_del(&mc->list); - spin_unlock_irq(&id_priv->lock); - kfree(mc); + goto out_err; } + + spin_lock(&id_priv->lock); + list_add(&mc->list, &id_priv->mc_list); + spin_unlock(&id_priv->lock); + + return 0; +out_err: + kfree(mc); return ret; } EXPORT_SYMBOL(rdma_join_multicast); @@ -4268,23 +4303,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) 
ib_sa_free_multicast(mc->multicast.ib); kfree(mc); } else if (rdma_protocol_roce(id->device, id->port_num)) { - if (mc->igmp_joined) { - struct rdma_dev_addr *dev_addr = - &id->route.addr.dev_addr; - struct net_device *ndev = NULL; - - if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(dev_addr->net, - dev_addr->bound_dev_if); - if (ndev) { - cma_igmp_send(ndev, - &mc->multicast.ib->rec.mgid, - false); - dev_put(ndev); - } - mc->igmp_joined = false; - } - kref_put(&mc->mcref, release_mc); + cma_leave_roce_mc_group(id_priv, mc); } return; } @@ -4410,7 +4429,7 @@ free_cma_dev: static int cma_remove_id_dev(struct rdma_id_private *id_priv) { - struct rdma_cm_event event; + struct rdma_cm_event event = {}; enum rdma_cm_state state; int ret = 0; @@ -4426,7 +4445,6 @@ static int cma_remove_id_dev(struct rdma_id_private *id_priv) if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL)) goto out; - memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_DEVICE_REMOVAL; ret = id_priv->id.event_handler(&id_priv->id, &event); out: diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index fae417a391fb..77c7005c396c 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -91,8 +91,8 @@ void ib_device_unregister_sysfs(struct ib_device *device); typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, struct net_device *idev, void *cookie); -typedef int (*roce_netdev_filter)(struct ib_device *device, u8 port, - struct net_device *idev, void *cookie); +typedef bool (*roce_netdev_filter)(struct ib_device *device, u8 port, + struct net_device *idev, void *cookie); void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 6fa4c59dc7a7..db3b6271f09d 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -105,8 +105,6 @@ static int ib_device_check_mandatory(struct ib_device *device) IB_MANDATORY_FUNC(query_pkey), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), - IB_MANDATORY_FUNC(create_ah), - IB_MANDATORY_FUNC(destroy_ah), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), @@ -862,25 +860,6 @@ int ib_query_port(struct ib_device *device, EXPORT_SYMBOL(ib_query_port); /** - * ib_query_gid - Get GID table entry - * @device:Device to query - * @port_num:Port number to query - * @index:GID table index to query - * @gid:Returned GID - * @attr: Returned GID attributes related to this GID index (only in RoCE). - * NULL means ignore. - * - * ib_query_gid() fetches the specified GID table entry from the cache. - */ -int ib_query_gid(struct ib_device *device, - u8 port_num, int index, union ib_gid *gid, - struct ib_gid_attr *attr) -{ - return ib_get_cached_gid(device, port_num, index, gid, attr); -} -EXPORT_SYMBOL(ib_query_gid); - -/** * ib_enum_roce_netdev - enumerate all RoCE ports * @ib_dev : IB device we want to query * @filter: Should we call the callback? 
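With ib_query_gid() removed above, in-kernel callers read GID table entries through rdma_query_gid() directly; the ib_find_gid() hunk that follows shows one such conversion. A short sketch of that calling pattern (not compilable on its own; device, port and wanted are assumed to come from the surrounding code):

/*
 * Illustrative fragment: walk a port's GID table the way the converted
 * callers in this series do, stopping at the first matching entry.
 */
union ib_gid gid;
int i, ret;

for (i = 0; i < device->port_immutable[port].gid_tbl_len; i++) {
        ret = rdma_query_gid(device, port, i, &gid);
        if (ret)
                return ret;              /* entry not readable/valid */
        if (!memcmp(&gid, &wanted, sizeof(gid)))
                return i;                /* found the matching table index */
}
return -ENOENT;
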
@@ -1057,7 +1036,7 @@ int ib_find_gid(struct ib_device *device, union ib_gid *gid, continue; for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { - ret = ib_query_gid(device, port, i, &tmp_gid, NULL); + ret = rdma_query_gid(device, port, i, &tmp_gid); if (ret) return ret; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index f742ae7a768b..ef459f2f2eeb 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -38,6 +38,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/dma-mapping.h> +#include <linux/idr.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/security.h> @@ -58,8 +59,13 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests module_param_named(recv_queue_size, mad_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests"); +/* + * The mlx4 driver uses the top byte to distinguish which virtual function + * generated the MAD, so we must avoid using it. + */ +#define AGENT_ID_LIMIT (1 << 24) +static DEFINE_IDR(ib_mad_clients); static struct list_head ib_mad_port_list; -static atomic_t ib_mad_client_id = ATOMIC_INIT(0); /* Port list lock */ static DEFINE_SPINLOCK(ib_mad_port_list_lock); @@ -190,6 +196,8 @@ EXPORT_SYMBOL(ib_response_mad); /* * ib_register_mad_agent - Register to send/receive MADs + * + * Context: Process context. */ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, u8 port_num, @@ -210,7 +218,6 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, struct ib_mad_mgmt_vendor_class *vendor_class; struct ib_mad_mgmt_method_table *method; int ret2, qpn; - unsigned long flags; u8 mgmt_class, vclass; /* Validate parameters */ @@ -376,13 +383,24 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error4; } - spin_lock_irqsave(&port_priv->reg_lock, flags); - mad_agent_priv->agent.hi_tid = atomic_inc_return(&ib_mad_client_id); + idr_preload(GFP_KERNEL); + idr_lock(&ib_mad_clients); + ret2 = idr_alloc_cyclic(&ib_mad_clients, mad_agent_priv, 0, + AGENT_ID_LIMIT, GFP_ATOMIC); + idr_unlock(&ib_mad_clients); + idr_preload_end(); + + if (ret2 < 0) { + ret = ERR_PTR(ret2); + goto error5; + } + mad_agent_priv->agent.hi_tid = ret2; /* * Make sure MAD registration (if supplied) * is non overlapping with any existing ones */ + spin_lock_irq(&port_priv->reg_lock); if (mad_reg_req) { mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class); if (!is_vendor_class(mgmt_class)) { @@ -393,7 +411,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, if (method) { if (method_in_use(&method, mad_reg_req)) - goto error5; + goto error6; } } ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv, @@ -409,24 +427,25 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, if (is_vendor_method_in_use( vendor_class, mad_reg_req)) - goto error5; + goto error6; } } ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv); } if (ret2) { ret = ERR_PTR(ret2); - goto error5; + goto error6; } } - - /* Add mad agent into port's agent list */ - list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list); - spin_unlock_irqrestore(&port_priv->reg_lock, flags); + spin_unlock_irq(&port_priv->reg_lock); return &mad_agent_priv->agent; +error6: + spin_unlock_irq(&port_priv->reg_lock); + idr_lock(&ib_mad_clients); + idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid); + idr_unlock(&ib_mad_clients); 
error5: - spin_unlock_irqrestore(&port_priv->reg_lock, flags); ib_mad_agent_security_cleanup(&mad_agent_priv->agent); error4: kfree(reg_req); @@ -575,7 +594,6 @@ static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) { struct ib_mad_port_private *port_priv; - unsigned long flags; /* Note that we could still be handling received MADs */ @@ -587,10 +605,12 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) port_priv = mad_agent_priv->qp_info->port_priv; cancel_delayed_work(&mad_agent_priv->timed_work); - spin_lock_irqsave(&port_priv->reg_lock, flags); + spin_lock_irq(&port_priv->reg_lock); remove_mad_reg_req(mad_agent_priv); - list_del(&mad_agent_priv->agent_list); - spin_unlock_irqrestore(&port_priv->reg_lock, flags); + spin_unlock_irq(&port_priv->reg_lock); + idr_lock(&ib_mad_clients); + idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid); + idr_unlock(&ib_mad_clients); flush_workqueue(port_priv->wq); ib_cancel_rmpp_recvs(mad_agent_priv); @@ -601,7 +621,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv) ib_mad_agent_security_cleanup(&mad_agent_priv->agent); kfree(mad_agent_priv->reg_req); - kfree(mad_agent_priv); + kfree_rcu(mad_agent_priv, rcu); } static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) @@ -625,6 +645,8 @@ static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv) /* * ib_unregister_mad_agent - Unregisters a client from using MAD services + * + * Context: Process context. */ void ib_unregister_mad_agent(struct ib_mad_agent *mad_agent) { @@ -1159,7 +1181,6 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) { struct ib_mad_qp_info *qp_info; struct list_head *list; - struct ib_send_wr *bad_send_wr; struct ib_mad_agent *mad_agent; struct ib_sge *sge; unsigned long flags; @@ -1197,7 +1218,7 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) spin_lock_irqsave(&qp_info->send_queue.lock, flags); if (qp_info->send_queue.count < qp_info->send_queue.max_active) { ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr, - &bad_send_wr); + NULL); list = &qp_info->send_queue.list; } else { ret = 0; @@ -1720,22 +1741,19 @@ find_mad_agent(struct ib_mad_port_private *port_priv, struct ib_mad_agent_private *mad_agent = NULL; unsigned long flags; - spin_lock_irqsave(&port_priv->reg_lock, flags); if (ib_response_mad(mad_hdr)) { u32 hi_tid; - struct ib_mad_agent_private *entry; /* * Routing is based on high 32 bits of transaction ID * of MAD. 
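This routing works because ib_register_mad_agent() above now hands out agent ids with idr_alloc_cyclic() below AGENT_ID_LIMIT (1 << 24): the id fits in bits 32..55 of the 64-bit transaction ID, the top byte stays free for mlx4's virtual-function tag, and the receive path recovers the id with a 32-bit shift, as the code just below does after be64_to_cpu(). A standalone sketch of that layout (helper names and the low 32-bit counter are illustrative):

#include <stdint.h>
#include <stdio.h>

#define AGENT_ID_LIMIT (1u << 24)   /* same bound the patch imposes on idr_alloc_cyclic() */

/* Build a host-order TID: agent id in bits 32..55, caller counter in bits 0..31. */
static uint64_t make_tid(uint32_t agent_id, uint32_t user_counter)
{
        return ((uint64_t)agent_id << 32) | user_counter;
}

/* Receive-side routing key: the high 32 bits, as in hi_tid = be64_to_cpu(tid) >> 32. */
static uint32_t tid_to_agent_id(uint64_t tid)
{
        return (uint32_t)(tid >> 32);
}

int main(void)
{
        uint32_t agent_id = AGENT_ID_LIMIT - 1;        /* largest id the IDR will hand out */
        uint64_t tid = make_tid(agent_id, 0x12345678);

        printf("tid      = 0x%016llx\n", (unsigned long long)tid);
        printf("agent id = 0x%06x\n", tid_to_agent_id(tid));
        printf("top byte = 0x%02x (left free for mlx4's VF tag)\n",
               (unsigned int)(tid >> 56));
        return 0;
}
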
*/ hi_tid = be64_to_cpu(mad_hdr->tid) >> 32; - list_for_each_entry(entry, &port_priv->agent_list, agent_list) { - if (entry->agent.hi_tid == hi_tid) { - mad_agent = entry; - break; - } - } + rcu_read_lock(); + mad_agent = idr_find(&ib_mad_clients, hi_tid); + if (mad_agent && !atomic_inc_not_zero(&mad_agent->refcount)) + mad_agent = NULL; + rcu_read_unlock(); } else { struct ib_mad_mgmt_class_table *class; struct ib_mad_mgmt_method_table *method; @@ -1744,6 +1762,7 @@ find_mad_agent(struct ib_mad_port_private *port_priv, const struct ib_vendor_mad *vendor_mad; int index; + spin_lock_irqsave(&port_priv->reg_lock, flags); /* * Routing is based on version, class, and method * For "newer" vendor MADs, also based on OUI @@ -1783,20 +1802,19 @@ find_mad_agent(struct ib_mad_port_private *port_priv, ~IB_MGMT_METHOD_RESP]; } } + if (mad_agent) + atomic_inc(&mad_agent->refcount); +out: + spin_unlock_irqrestore(&port_priv->reg_lock, flags); } - if (mad_agent) { - if (mad_agent->agent.recv_handler) - atomic_inc(&mad_agent->refcount); - else { - dev_notice(&port_priv->device->dev, - "No receive handler for client %p on port %d\n", - &mad_agent->agent, port_priv->port_num); - mad_agent = NULL; - } + if (mad_agent && !mad_agent->agent.recv_handler) { + dev_notice(&port_priv->device->dev, + "No receive handler for client %p on port %d\n", + &mad_agent->agent, port_priv->port_num); + deref_mad_agent(mad_agent); + mad_agent = NULL; } -out: - spin_unlock_irqrestore(&port_priv->reg_lock, flags); return mad_agent; } @@ -1896,8 +1914,8 @@ static inline int rcv_has_same_gid(const struct ib_mad_agent_private *mad_agent_ const struct ib_global_route *grh = rdma_ah_read_grh(&attr); - if (ib_get_cached_gid(device, port_num, - grh->sgid_index, &sgid, NULL)) + if (rdma_query_gid(device, port_num, + grh->sgid_index, &sgid)) return 0; return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw, 16); @@ -2457,7 +2475,6 @@ static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr; struct ib_mad_qp_info *qp_info; struct ib_mad_queue *send_queue; - struct ib_send_wr *bad_send_wr; struct ib_mad_send_wc mad_send_wc; unsigned long flags; int ret; @@ -2507,7 +2524,7 @@ retry: if (queued_send_wr) { ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr, - &bad_send_wr); + NULL); if (ret) { dev_err(&port_priv->device->dev, "ib_post_send failed: %d\n", ret); @@ -2552,11 +2569,9 @@ static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, if (wc->status == IB_WC_WR_FLUSH_ERR) { if (mad_send_wr->retry) { /* Repost send */ - struct ib_send_wr *bad_send_wr; - mad_send_wr->retry = 0; ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, - &bad_send_wr); + NULL); if (!ret) return false; } @@ -2872,7 +2887,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, int post, ret; struct ib_mad_private *mad_priv; struct ib_sge sg_list; - struct ib_recv_wr recv_wr, *bad_recv_wr; + struct ib_recv_wr recv_wr; struct ib_mad_queue *recv_queue = &qp_info->recv_queue; /* Initialize common scatter list fields */ @@ -2916,7 +2931,7 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, post = (++recv_queue->count < recv_queue->max_active); list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list); spin_unlock_irqrestore(&recv_queue->lock, flags); - ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr); + ret = ib_post_recv(qp_info->qp, &recv_wr, NULL); if (ret) { spin_lock_irqsave(&recv_queue->lock, flags); 
list_del(&mad_priv->header.mad_list.list); @@ -3159,7 +3174,6 @@ static int ib_mad_port_open(struct ib_device *device, port_priv->device = device; port_priv->port_num = port_num; spin_lock_init(&port_priv->reg_lock); - INIT_LIST_HEAD(&port_priv->agent_list); init_mad_qp(port_priv, &port_priv->qp_info[0]); init_mad_qp(port_priv, &port_priv->qp_info[1]); @@ -3338,6 +3352,9 @@ int ib_mad_init(void) INIT_LIST_HEAD(&ib_mad_port_list); + /* Client ID 0 is used for snoop-only clients */ + idr_alloc(&ib_mad_clients, NULL, 0, 0, GFP_KERNEL); + if (ib_register_client(&mad_client)) { pr_err("Couldn't register ib_mad client\n"); return -EINVAL; diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 28669f6419e1..d84ae1671898 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -89,7 +89,6 @@ struct ib_rmpp_segment { }; struct ib_mad_agent_private { - struct list_head agent_list; struct ib_mad_agent agent; struct ib_mad_reg_req *reg_req; struct ib_mad_qp_info *qp_info; @@ -105,7 +104,10 @@ struct ib_mad_agent_private { struct list_head rmpp_list; atomic_t refcount; - struct completion comp; + union { + struct completion comp; + struct rcu_head rcu; + }; }; struct ib_mad_snoop_private { @@ -203,7 +205,6 @@ struct ib_mad_port_private { spinlock_t reg_lock; struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; - struct list_head agent_list; struct workqueue_struct *wq; struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; }; diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 6c48f4193dda..d50ff70bb24b 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -716,14 +716,28 @@ int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, } EXPORT_SYMBOL(ib_sa_get_mcmember_rec); +/** + * ib_init_ah_from_mcmember - Initialize AH attribute from multicast + * member record and gid of the device. + * @device: RDMA device + * @port_num: Port of the rdma device to consider + * @ndev: Optional netdevice, applicable only for RoCE + * @gid_type: GID type to consider + * @ah_attr: AH attribute to fillup on successful completion + * + * ib_init_ah_from_mcmember() initializes AH attribute based on multicast + * member record and other device properties. On success the caller is + * responsible to call rdma_destroy_ah_attr on the ah_attr. Returns 0 on + * success or appropriate error code. + * + */ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, struct net_device *ndev, enum ib_gid_type gid_type, struct rdma_ah_attr *ah_attr) { - int ret; - u16 gid_index; + const struct ib_gid_attr *sgid_attr; /* GID table is not based on the netdevice for IB link layer, * so ignore ndev during search. 
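The kernel-doc above adds an ownership rule: the helper now takes a GID table reference and moves it into the AH attribute (via rdma_move_grh_sgid_attr() in the next hunk), so a successful call must be paired with rdma_destroy_ah_attr(). An illustrative caller-side fragment (the surrounding variables and the "use it" step are placeholders, not from the patch):

/*
 * Illustrative fragment only: pair a successful ib_init_ah_from_mcmember()
 * with rdma_destroy_ah_attr() to drop the sgid_attr reference it stored.
 */
struct rdma_ah_attr ah_attr;
int ret;

ret = ib_init_ah_from_mcmember(device, port_num, &rec, ndev,
                               gid_type, &ah_attr);
if (ret)
        return ret;

/* ... use ah_attr, e.g. to create an AH or fill a UD event ... */

rdma_destroy_ah_attr(&ah_attr);   /* releases the GID table reference */
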
@@ -733,26 +747,22 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, else if (!rdma_protocol_roce(device, port_num)) return -EINVAL; - ret = ib_find_cached_gid_by_port(device, &rec->port_gid, - gid_type, port_num, - ndev, - &gid_index); - if (ret) - return ret; + sgid_attr = rdma_find_gid_by_port(device, &rec->port_gid, + gid_type, port_num, ndev); + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); - memset(ah_attr, 0, sizeof *ah_attr); + memset(ah_attr, 0, sizeof(*ah_attr)); ah_attr->type = rdma_ah_find_type(device, port_num); rdma_ah_set_dlid(ah_attr, be16_to_cpu(rec->mlid)); rdma_ah_set_sl(ah_attr, rec->sl); rdma_ah_set_port_num(ah_attr, port_num); rdma_ah_set_static_rate(ah_attr, rec->rate); - - rdma_ah_set_grh(ah_attr, &rec->mgid, - be32_to_cpu(rec->flow_label), - (u8)gid_index, - rec->hop_limit, - rec->traffic_class); + rdma_move_grh_sgid_attr(ah_attr, &rec->mgid, + be32_to_cpu(rec->flow_label), + rec->hop_limit, rec->traffic_class, + sgid_attr); return 0; } EXPORT_SYMBOL(ib_init_ah_from_mcmember); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 340c7bea45ab..0385ab438320 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -237,15 +237,15 @@ static int fill_port_info(struct sk_buff *msg, if (ret) return ret; - BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64)); - if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, - (u64)attr.port_cap_flags, RDMA_NLDEV_ATTR_PAD)) - return -EMSGSIZE; - if (rdma_protocol_ib(device, port) && - nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, - attr.subnet_prefix, RDMA_NLDEV_ATTR_PAD)) - return -EMSGSIZE; if (rdma_protocol_ib(device, port)) { + BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + (u64)attr.port_cap_flags, + RDMA_NLDEV_ATTR_PAD)) + return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, + attr.subnet_prefix, RDMA_NLDEV_ATTR_PAD)) + return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 475910ffbcb6..6eb64c6f0802 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -32,6 +32,7 @@ #include <linux/file.h> #include <linux/anon_inodes.h> +#include <linux/sched/mm.h> #include <rdma/ib_verbs.h> #include <rdma/uverbs_types.h> #include <linux/rcupdate.h> @@ -41,51 +42,6 @@ #include "core_priv.h" #include "rdma_core.h" -int uverbs_ns_idx(u16 *id, unsigned int ns_count) -{ - int ret = (*id & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT; - - if (ret >= ns_count) - return -EINVAL; - - *id &= ~UVERBS_ID_NS_MASK; - return ret; -} - -const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, - uint16_t object) -{ - const struct uverbs_root_spec *object_hash = ibdev->specs_root; - const struct uverbs_object_spec_hash *objects; - int ret = uverbs_ns_idx(&object, object_hash->num_buckets); - - if (ret < 0) - return NULL; - - objects = object_hash->object_buckets[ret]; - - if (object >= objects->num_objects) - return NULL; - - return objects->objects[object]; -} - -const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, - uint16_t method) -{ - const struct uverbs_method_spec_hash *methods; - int ret = uverbs_ns_idx(&method, object->num_buckets); - - if (ret < 0) - return NULL; - - methods = 
object->method_buckets[ret]; - if (method >= methods->num_methods) - return NULL; - - return methods->methods[method]; -} - void uverbs_uobject_get(struct ib_uobject *uobject) { kref_get(&uobject->ref); @@ -96,7 +52,7 @@ static void uverbs_uobject_free(struct kref *ref) struct ib_uobject *uobj = container_of(ref, struct ib_uobject, ref); - if (uobj->type->type_class->needs_kfree_rcu) + if (uobj->uapi_object->type_class->needs_kfree_rcu) kfree_rcu(uobj, rcu); else kfree(uobj); @@ -107,7 +63,8 @@ void uverbs_uobject_put(struct ib_uobject *uobject) kref_put(&uobject->ref, uverbs_uobject_free); } -static int uverbs_try_lock_object(struct ib_uobject *uobj, bool exclusive) +static int uverbs_try_lock_object(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) { /* * When a shared access is required, we use a positive counter. Each @@ -120,27 +77,211 @@ static int uverbs_try_lock_object(struct ib_uobject *uobj, bool exclusive) * concurrently, setting the counter to zero is enough for releasing * this lock. */ - if (!exclusive) + switch (mode) { + case UVERBS_LOOKUP_READ: return atomic_fetch_add_unless(&uobj->usecnt, 1, -1) == -1 ? -EBUSY : 0; + case UVERBS_LOOKUP_WRITE: + /* lock is exclusive */ + return atomic_cmpxchg(&uobj->usecnt, 0, -1) == 0 ? 0 : -EBUSY; + case UVERBS_LOOKUP_DESTROY: + return 0; + } + return 0; +} + +static void assert_uverbs_usecnt(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) +{ +#ifdef CONFIG_LOCKDEP + switch (mode) { + case UVERBS_LOOKUP_READ: + WARN_ON(atomic_read(&uobj->usecnt) <= 0); + break; + case UVERBS_LOOKUP_WRITE: + WARN_ON(atomic_read(&uobj->usecnt) != -1); + break; + case UVERBS_LOOKUP_DESTROY: + break; + } +#endif +} + +/* + * This must be called with the hw_destroy_rwsem locked for read or write, + * also the uobject itself must be locked for write. + * + * Upon return the HW object is guaranteed to be destroyed. + * + * For RDMA_REMOVE_ABORT, the hw_destroy_rwsem is not required to be held, + * however the type's allocat_commit function cannot have been called and the + * uobject cannot be on the uobjects_lists + * + * For RDMA_REMOVE_DESTROY the caller shold be holding a kref (eg via + * rdma_lookup_get_uobject) and the object is left in a state where the caller + * needs to call rdma_lookup_put_uobject. + * + * For all other destroy modes this function internally unlocks the uobject + * and consumes the kref on the uobj. + */ +static int uverbs_destroy_uobject(struct ib_uobject *uobj, + enum rdma_remove_reason reason) +{ + struct ib_uverbs_file *ufile = uobj->ufile; + unsigned long flags; + int ret; + + lockdep_assert_held(&ufile->hw_destroy_rwsem); + assert_uverbs_usecnt(uobj, UVERBS_LOOKUP_WRITE); + + if (uobj->object) { + ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason); + if (ret) { + if (ib_is_destroy_retryable(ret, reason, uobj)) + return ret; + + /* Nothing to be done, dangle the memory and move on */ + WARN(true, + "ib_uverbs: failed to remove uobject id %d, driver err=%d", + uobj->id, ret); + } + + uobj->object = NULL; + } - /* lock is either WRITE or DESTROY - should be exclusive */ - return atomic_cmpxchg(&uobj->usecnt, 0, -1) == 0 ? 0 : -EBUSY; + if (reason == RDMA_REMOVE_ABORT) { + WARN_ON(!list_empty(&uobj->list)); + WARN_ON(!uobj->context); + uobj->uapi_object->type_class->alloc_abort(uobj); + } + + uobj->context = NULL; + + /* + * For DESTROY the usecnt is held write locked, the caller is expected + * to put it unlock and put the object when done with it. Only DESTROY + * can remove the IDR handle. 
+ */ + if (reason != RDMA_REMOVE_DESTROY) + atomic_set(&uobj->usecnt, 0); + else + uobj->uapi_object->type_class->remove_handle(uobj); + + if (!list_empty(&uobj->list)) { + spin_lock_irqsave(&ufile->uobjects_lock, flags); + list_del_init(&uobj->list); + spin_unlock_irqrestore(&ufile->uobjects_lock, flags); + + /* + * Pairs with the get in rdma_alloc_commit_uobject(), could + * destroy uobj. + */ + uverbs_uobject_put(uobj); + } + + /* + * When aborting the stack kref remains owned by the core code, and is + * not transferred into the type. Pairs with the get in alloc_uobj + */ + if (reason == RDMA_REMOVE_ABORT) + uverbs_uobject_put(uobj); + + return 0; } -static struct ib_uobject *alloc_uobj(struct ib_ucontext *context, - const struct uverbs_obj_type *type) +/* + * This calls uverbs_destroy_uobject() using the RDMA_REMOVE_DESTROY + * sequence. It should only be used from command callbacks. On success the + * caller must pair this with rdma_lookup_put_uobject(LOOKUP_WRITE). This + * version requires the caller to have already obtained an + * LOOKUP_DESTROY uobject kref. + */ +int uobj_destroy(struct ib_uobject *uobj) { - struct ib_uobject *uobj = kzalloc(type->obj_size, GFP_KERNEL); + struct ib_uverbs_file *ufile = uobj->ufile; + int ret; + + down_read(&ufile->hw_destroy_rwsem); + + ret = uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE); + if (ret) + goto out_unlock; + + ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY); + if (ret) { + atomic_set(&uobj->usecnt, 0); + goto out_unlock; + } +out_unlock: + up_read(&ufile->hw_destroy_rwsem); + return ret; +} + +/* + * uobj_get_destroy destroys the HW object and returns a handle to the uobj + * with a NULL object pointer. The caller must pair this with + * uverbs_put_destroy. + */ +struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj, + u32 id, struct ib_uverbs_file *ufile) +{ + struct ib_uobject *uobj; + int ret; + + uobj = rdma_lookup_get_uobject(obj, ufile, id, UVERBS_LOOKUP_DESTROY); + if (IS_ERR(uobj)) + return uobj; + + ret = uobj_destroy(uobj); + if (ret) { + rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY); + return ERR_PTR(ret); + } + + return uobj; +} + +/* + * Does both uobj_get_destroy() and uobj_put_destroy(). Returns success_res + * on success (negative errno on failure). For use by callers that do not need + * the uobj. + */ +int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id, + struct ib_uverbs_file *ufile, int success_res) +{ + struct ib_uobject *uobj; + + uobj = __uobj_get_destroy(obj, id, ufile); + if (IS_ERR(uobj)) + return PTR_ERR(uobj); + + rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_WRITE); + return success_res; +} + +/* alloc_uobj must be undone by uverbs_destroy_uobject() */ +static struct ib_uobject *alloc_uobj(struct ib_uverbs_file *ufile, + const struct uverbs_api_object *obj) +{ + struct ib_uobject *uobj; + struct ib_ucontext *ucontext; + + ucontext = ib_uverbs_get_ucontext(ufile); + if (IS_ERR(ucontext)) + return ERR_CAST(ucontext); + + uobj = kzalloc(obj->type_attrs->obj_size, GFP_KERNEL); if (!uobj) return ERR_PTR(-ENOMEM); /* * user_handle should be filled by the handler, * The object is added to the list in the commit stage. */ - uobj->context = context; - uobj->type = type; + uobj->ufile = ufile; + uobj->context = ucontext; + INIT_LIST_HEAD(&uobj->list); + uobj->uapi_object = obj; /* * Allocated objects start out as write locked to deny any other * syscalls from accessing them until they are committed. 
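The usecnt protocol used by uverbs_try_lock_object() above is easy to model outside the kernel: a positive count means shared (read) users, -1 means exclusively locked, and a freshly allocated object starts at -1 until commit sets it back to 0. A standalone C11 sketch of the same rules (names are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdio.h>
#include <errno.h>

/* -1 = exclusively (write) locked, 0 = idle, >0 = number of shared readers. */
static int try_lock_shared(atomic_int *usecnt)
{
        int cur = atomic_load(usecnt);

        /* Mirrors atomic_fetch_add_unless(usecnt, 1, -1): never join a write lock. */
        while (cur != -1) {
                if (atomic_compare_exchange_weak(usecnt, &cur, cur + 1))
                        return 0;
        }
        return -EBUSY;
}

static int try_lock_excl(atomic_int *usecnt)
{
        int expected = 0;

        /* Mirrors atomic_cmpxchg(usecnt, 0, -1): only an idle object can be write locked. */
        return atomic_compare_exchange_strong(usecnt, &expected, -1) ? 0 : -EBUSY;
}

static void unlock_shared(atomic_int *usecnt) { atomic_fetch_sub(usecnt, 1); }
static void unlock_excl(atomic_int *usecnt)   { atomic_store(usecnt, 0); }

int main(void)
{
        atomic_int usecnt = -1;          /* new objects start write locked, as in alloc_uobj() */

        printf("reader vs. uncommitted: %d\n", try_lock_shared(&usecnt));  /* busy */
        unlock_excl(&usecnt);            /* "commit": atomic_set(&uobj->usecnt, 0) */
        printf("reader:                 %d\n", try_lock_shared(&usecnt));  /* ok */
        printf("writer vs. reader:      %d\n", try_lock_excl(&usecnt));    /* busy */
        unlock_shared(&usecnt);
        printf("writer:                 %d\n", try_lock_excl(&usecnt));    /* ok */
        return 0;
}
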
See @@ -157,45 +298,39 @@ static int idr_add_uobj(struct ib_uobject *uobj) int ret; idr_preload(GFP_KERNEL); - spin_lock(&uobj->context->ufile->idr_lock); + spin_lock(&uobj->ufile->idr_lock); /* * We start with allocating an idr pointing to NULL. This represents an * object which isn't initialized yet. We'll replace it later on with * the real object once we commit. */ - ret = idr_alloc(&uobj->context->ufile->idr, NULL, 0, + ret = idr_alloc(&uobj->ufile->idr, NULL, 0, min_t(unsigned long, U32_MAX - 1, INT_MAX), GFP_NOWAIT); if (ret >= 0) uobj->id = ret; - spin_unlock(&uobj->context->ufile->idr_lock); + spin_unlock(&uobj->ufile->idr_lock); idr_preload_end(); return ret < 0 ? ret : 0; } -/* - * It only removes it from the uobjects list, uverbs_uobject_put() is still - * required. - */ -static void uverbs_idr_remove_uobj(struct ib_uobject *uobj) -{ - spin_lock(&uobj->context->ufile->idr_lock); - idr_remove(&uobj->context->ufile->idr, uobj->id); - spin_unlock(&uobj->context->ufile->idr_lock); -} - /* Returns the ib_uobject or an error. The caller should check for IS_ERR. */ -static struct ib_uobject *lookup_get_idr_uobject(const struct uverbs_obj_type *type, - struct ib_ucontext *ucontext, - int id, bool exclusive) +static struct ib_uobject * +lookup_get_idr_uobject(const struct uverbs_api_object *obj, + struct ib_uverbs_file *ufile, s64 id, + enum rdma_lookup_mode mode) { struct ib_uobject *uobj; + unsigned long idrno = id; + + if (id < 0 || id > ULONG_MAX) + return ERR_PTR(-EINVAL); rcu_read_lock(); /* object won't be released as we're protected in rcu */ - uobj = idr_find(&ucontext->ufile->idr, id); + uobj = idr_find(&ufile->idr, idrno); if (!uobj) { uobj = ERR_PTR(-ENOENT); goto free; @@ -215,19 +350,28 @@ free: return uobj; } -static struct ib_uobject *lookup_get_fd_uobject(const struct uverbs_obj_type *type, - struct ib_ucontext *ucontext, - int id, bool exclusive) +static struct ib_uobject * +lookup_get_fd_uobject(const struct uverbs_api_object *obj, + struct ib_uverbs_file *ufile, s64 id, + enum rdma_lookup_mode mode) { + const struct uverbs_obj_fd_type *fd_type; struct file *f; struct ib_uobject *uobject; - const struct uverbs_obj_fd_type *fd_type = - container_of(type, struct uverbs_obj_fd_type, type); + int fdno = id; - if (exclusive) + if (fdno != id) + return ERR_PTR(-EINVAL); + + if (mode != UVERBS_LOOKUP_READ) return ERR_PTR(-EOPNOTSUPP); - f = fget(id); + if (!obj->type_attrs) + return ERR_PTR(-EIO); + fd_type = + container_of(obj->type_attrs, struct uverbs_obj_fd_type, type); + + f = fget(fdno); if (!f) return ERR_PTR(-EBADF); @@ -246,43 +390,55 @@ static struct ib_uobject *lookup_get_fd_uobject(const struct uverbs_obj_type *ty return uobject; } -struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_obj_type *type, - struct ib_ucontext *ucontext, - int id, bool exclusive) +struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj, + struct ib_uverbs_file *ufile, s64 id, + enum rdma_lookup_mode mode) { struct ib_uobject *uobj; int ret; - uobj = type->type_class->lookup_get(type, ucontext, id, exclusive); + if (!obj) + return ERR_PTR(-EINVAL); + + uobj = obj->type_class->lookup_get(obj, ufile, id, mode); if (IS_ERR(uobj)) return uobj; - if (uobj->type != type) { + if (uobj->uapi_object != obj) { ret = -EINVAL; goto free; } - ret = uverbs_try_lock_object(uobj, exclusive); - if (ret) { - WARN(ucontext->cleanup_reason, - "ib_uverbs: Trying to lookup_get while cleanup context\n"); + /* + * If we have been disassociated block every command except for + * 
DESTROY based commands. + */ + if (mode != UVERBS_LOOKUP_DESTROY && + !srcu_dereference(ufile->device->ib_dev, + &ufile->device->disassociate_srcu)) { + ret = -EIO; goto free; } + ret = uverbs_try_lock_object(uobj, mode); + if (ret) + goto free; + return uobj; free: - uobj->type->type_class->lookup_put(uobj, exclusive); + obj->type_class->lookup_put(uobj, mode); uverbs_uobject_put(uobj); return ERR_PTR(ret); } -static struct ib_uobject *alloc_begin_idr_uobject(const struct uverbs_obj_type *type, - struct ib_ucontext *ucontext) +static struct ib_uobject * +alloc_begin_idr_uobject(const struct uverbs_api_object *obj, + struct ib_uverbs_file *ufile) { int ret; struct ib_uobject *uobj; - uobj = alloc_uobj(ucontext, type); + uobj = alloc_uobj(ufile, obj); if (IS_ERR(uobj)) return uobj; @@ -290,7 +446,7 @@ static struct ib_uobject *alloc_begin_idr_uobject(const struct uverbs_obj_type * if (ret) goto uobj_put; - ret = ib_rdmacg_try_charge(&uobj->cg_obj, ucontext->device, + ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device, RDMACG_RESOURCE_HCA_OBJECT); if (ret) goto idr_remove; @@ -298,304 +454,305 @@ static struct ib_uobject *alloc_begin_idr_uobject(const struct uverbs_obj_type * return uobj; idr_remove: - uverbs_idr_remove_uobj(uobj); + spin_lock(&ufile->idr_lock); + idr_remove(&ufile->idr, uobj->id); + spin_unlock(&ufile->idr_lock); uobj_put: uverbs_uobject_put(uobj); return ERR_PTR(ret); } -static struct ib_uobject *alloc_begin_fd_uobject(const struct uverbs_obj_type *type, - struct ib_ucontext *ucontext) +static struct ib_uobject * +alloc_begin_fd_uobject(const struct uverbs_api_object *obj, + struct ib_uverbs_file *ufile) { - const struct uverbs_obj_fd_type *fd_type = - container_of(type, struct uverbs_obj_fd_type, type); int new_fd; struct ib_uobject *uobj; - struct ib_uobject_file *uobj_file; - struct file *filp; new_fd = get_unused_fd_flags(O_CLOEXEC); if (new_fd < 0) return ERR_PTR(new_fd); - uobj = alloc_uobj(ucontext, type); + uobj = alloc_uobj(ufile, obj); if (IS_ERR(uobj)) { put_unused_fd(new_fd); return uobj; } - uobj_file = container_of(uobj, struct ib_uobject_file, uobj); - filp = anon_inode_getfile(fd_type->name, - fd_type->fops, - uobj_file, - fd_type->flags); - if (IS_ERR(filp)) { - put_unused_fd(new_fd); - uverbs_uobject_put(uobj); - return (void *)filp; - } - - uobj_file->uobj.id = new_fd; - uobj_file->uobj.object = filp; - uobj_file->ufile = ucontext->ufile; - INIT_LIST_HEAD(&uobj->list); - kref_get(&uobj_file->ufile->ref); + uobj->id = new_fd; + uobj->ufile = ufile; return uobj; } -struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_obj_type *type, - struct ib_ucontext *ucontext) +struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj, + struct ib_uverbs_file *ufile) { - return type->type_class->alloc_begin(type, ucontext); -} + struct ib_uobject *ret; -static int __must_check remove_commit_idr_uobject(struct ib_uobject *uobj, - enum rdma_remove_reason why) -{ - const struct uverbs_obj_idr_type *idr_type = - container_of(uobj->type, struct uverbs_obj_idr_type, - type); - int ret = idr_type->destroy_object(uobj, why); + if (!obj) + return ERR_PTR(-EINVAL); /* - * We can only fail gracefully if the user requested to destroy the - * object. In the rest of the cases, just remove whatever you can. 
+ * The hw_destroy_rwsem is held across the entire object creation and + * released during rdma_alloc_commit_uobject or + * rdma_alloc_abort_uobject */ - if (why == RDMA_REMOVE_DESTROY && ret) - return ret; - - ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, - RDMACG_RESOURCE_HCA_OBJECT); - uverbs_idr_remove_uobj(uobj); + if (!down_read_trylock(&ufile->hw_destroy_rwsem)) + return ERR_PTR(-EIO); + ret = obj->type_class->alloc_begin(obj, ufile); + if (IS_ERR(ret)) { + up_read(&ufile->hw_destroy_rwsem); + return ret; + } return ret; } -static void alloc_abort_fd_uobject(struct ib_uobject *uobj) +static void alloc_abort_idr_uobject(struct ib_uobject *uobj) { - struct ib_uobject_file *uobj_file = - container_of(uobj, struct ib_uobject_file, uobj); - struct file *filp = uobj->object; - int id = uobj_file->uobj.id; + ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, + RDMACG_RESOURCE_HCA_OBJECT); - /* Unsuccessful NEW */ - fput(filp); - put_unused_fd(id); + spin_lock(&uobj->ufile->idr_lock); + idr_remove(&uobj->ufile->idr, uobj->id); + spin_unlock(&uobj->ufile->idr_lock); } -static int __must_check remove_commit_fd_uobject(struct ib_uobject *uobj, - enum rdma_remove_reason why) +static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj, + enum rdma_remove_reason why) { - const struct uverbs_obj_fd_type *fd_type = - container_of(uobj->type, struct uverbs_obj_fd_type, type); - struct ib_uobject_file *uobj_file = - container_of(uobj, struct ib_uobject_file, uobj); - int ret = fd_type->context_closed(uobj_file, why); + const struct uverbs_obj_idr_type *idr_type = + container_of(uobj->uapi_object->type_attrs, + struct uverbs_obj_idr_type, type); + int ret = idr_type->destroy_object(uobj, why); - if (why == RDMA_REMOVE_DESTROY && ret) + /* + * We can only fail gracefully if the user requested to destroy the + * object or when a retry may be called upon an error. + * In the rest of the cases, just remove whatever you can. 
+ */ + if (ib_is_destroy_retryable(ret, why, uobj)) return ret; - if (why == RDMA_REMOVE_DURING_CLEANUP) { - alloc_abort_fd_uobject(uobj); - return ret; - } + if (why == RDMA_REMOVE_ABORT) + return 0; - uobj_file->uobj.context = NULL; - return ret; + ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, + RDMACG_RESOURCE_HCA_OBJECT); + + return 0; } -static void assert_uverbs_usecnt(struct ib_uobject *uobj, bool exclusive) +static void remove_handle_idr_uobject(struct ib_uobject *uobj) { -#ifdef CONFIG_LOCKDEP - if (exclusive) - WARN_ON(atomic_read(&uobj->usecnt) != -1); - else - WARN_ON(atomic_read(&uobj->usecnt) <= 0); -#endif + spin_lock(&uobj->ufile->idr_lock); + idr_remove(&uobj->ufile->idr, uobj->id); + spin_unlock(&uobj->ufile->idr_lock); + /* Matches the kref in alloc_commit_idr_uobject */ + uverbs_uobject_put(uobj); } -static int __must_check _rdma_remove_commit_uobject(struct ib_uobject *uobj, - enum rdma_remove_reason why) +static void alloc_abort_fd_uobject(struct ib_uobject *uobj) { - int ret; - struct ib_ucontext *ucontext = uobj->context; - - ret = uobj->type->type_class->remove_commit(uobj, why); - if (ret && why == RDMA_REMOVE_DESTROY) { - /* We couldn't remove the object, so just unlock the uobject */ - atomic_set(&uobj->usecnt, 0); - uobj->type->type_class->lookup_put(uobj, true); - } else { - mutex_lock(&ucontext->uobjects_lock); - list_del(&uobj->list); - mutex_unlock(&ucontext->uobjects_lock); - /* put the ref we took when we created the object */ - uverbs_uobject_put(uobj); - } - - return ret; + put_unused_fd(uobj->id); } -/* This is called only for user requested DESTROY reasons */ -int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj) +static int __must_check destroy_hw_fd_uobject(struct ib_uobject *uobj, + enum rdma_remove_reason why) { - int ret; - struct ib_ucontext *ucontext = uobj->context; - - /* put the ref count we took at lookup_get */ - uverbs_uobject_put(uobj); - /* Cleanup is running. Calling this should have been impossible */ - if (!down_read_trylock(&ucontext->cleanup_rwsem)) { - WARN(true, "ib_uverbs: Cleanup is running while removing an uobject\n"); - return 0; - } - assert_uverbs_usecnt(uobj, true); - ret = _rdma_remove_commit_uobject(uobj, RDMA_REMOVE_DESTROY); + const struct uverbs_obj_fd_type *fd_type = container_of( + uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type); + int ret = fd_type->context_closed(uobj, why); - up_read(&ucontext->cleanup_rwsem); - return ret; -} + if (ib_is_destroy_retryable(ret, why, uobj)) + return ret; -static int null_obj_type_class_remove_commit(struct ib_uobject *uobj, - enum rdma_remove_reason why) -{ return 0; } -static const struct uverbs_obj_type null_obj_type = { - .type_class = &((const struct uverbs_obj_type_class){ - .remove_commit = null_obj_type_class_remove_commit, - /* be cautious */ - .needs_kfree_rcu = true}), -}; - -int rdma_explicit_destroy(struct ib_uobject *uobject) +static void remove_handle_fd_uobject(struct ib_uobject *uobj) { - int ret; - struct ib_ucontext *ucontext = uobject->context; - - /* Cleanup is running. 
Calling this should have been impossible */ - if (!down_read_trylock(&ucontext->cleanup_rwsem)) { - WARN(true, "ib_uverbs: Cleanup is running while removing an uobject\n"); - return 0; - } - assert_uverbs_usecnt(uobject, true); - ret = uobject->type->type_class->remove_commit(uobject, - RDMA_REMOVE_DESTROY); - if (ret) - goto out; - - uobject->type = &null_obj_type; - -out: - up_read(&ucontext->cleanup_rwsem); - return ret; } -static void alloc_commit_idr_uobject(struct ib_uobject *uobj) +static int alloc_commit_idr_uobject(struct ib_uobject *uobj) { - spin_lock(&uobj->context->ufile->idr_lock); + struct ib_uverbs_file *ufile = uobj->ufile; + + spin_lock(&ufile->idr_lock); /* * We already allocated this IDR with a NULL object, so * this shouldn't fail. + * + * NOTE: Once we set the IDR we loose ownership of our kref on uobj. + * It will be put by remove_commit_idr_uobject() */ - WARN_ON(idr_replace(&uobj->context->ufile->idr, - uobj, uobj->id)); - spin_unlock(&uobj->context->ufile->idr_lock); + WARN_ON(idr_replace(&ufile->idr, uobj, uobj->id)); + spin_unlock(&ufile->idr_lock); + + return 0; } -static void alloc_commit_fd_uobject(struct ib_uobject *uobj) +static int alloc_commit_fd_uobject(struct ib_uobject *uobj) { - struct ib_uobject_file *uobj_file = - container_of(uobj, struct ib_uobject_file, uobj); + const struct uverbs_obj_fd_type *fd_type = container_of( + uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type); + int fd = uobj->id; + struct file *filp; + + /* + * The kref for uobj is moved into filp->private data and put in + * uverbs_close_fd(). Once alloc_commit() succeeds uverbs_close_fd() + * must be guaranteed to be called from the provided fops release + * callback. + */ + filp = anon_inode_getfile(fd_type->name, + fd_type->fops, + uobj, + fd_type->flags); + if (IS_ERR(filp)) + return PTR_ERR(filp); + + uobj->object = filp; + + /* Matching put will be done in uverbs_close_fd() */ + kref_get(&uobj->ufile->ref); - fd_install(uobj_file->uobj.id, uobj->object); /* This shouldn't be used anymore. Use the file object instead */ - uobj_file->uobj.id = 0; - /* Get another reference as we export this to the fops */ - uverbs_uobject_get(&uobj_file->uobj); + uobj->id = 0; + + /* + * NOTE: Once we install the file we loose ownership of our kref on + * uobj. It will be put by uverbs_close_fd() + */ + fd_install(fd, filp); + + return 0; } -int rdma_alloc_commit_uobject(struct ib_uobject *uobj) +/* + * In all cases rdma_alloc_commit_uobject() consumes the kref to uobj and the + * caller can no longer assume uobj is valid. If this function fails it + * destroys the uboject, including the attached HW object. + */ +int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj) { - /* Cleanup is running. Calling this should have been impossible */ - if (!down_read_trylock(&uobj->context->cleanup_rwsem)) { - int ret; + struct ib_uverbs_file *ufile = uobj->ufile; + int ret; - WARN(true, "ib_uverbs: Cleanup is running while allocating an uobject\n"); - ret = uobj->type->type_class->remove_commit(uobj, - RDMA_REMOVE_DURING_CLEANUP); - if (ret) - pr_warn("ib_uverbs: cleanup of idr object %d failed\n", - uobj->id); + /* alloc_commit consumes the uobj kref */ + ret = uobj->uapi_object->type_class->alloc_commit(uobj); + if (ret) { + uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT); + up_read(&ufile->hw_destroy_rwsem); return ret; } + /* kref is held so long as the uobj is on the uobj list. 
*/ + uverbs_uobject_get(uobj); + spin_lock_irq(&ufile->uobjects_lock); + list_add(&uobj->list, &ufile->uobjects); + spin_unlock_irq(&ufile->uobjects_lock); + /* matches atomic_set(-1) in alloc_uobj */ - assert_uverbs_usecnt(uobj, true); atomic_set(&uobj->usecnt, 0); - mutex_lock(&uobj->context->uobjects_lock); - list_add(&uobj->list, &uobj->context->uobjects); - mutex_unlock(&uobj->context->uobjects_lock); - - uobj->type->type_class->alloc_commit(uobj); - up_read(&uobj->context->cleanup_rwsem); + /* Matches the down_read in rdma_alloc_begin_uobject */ + up_read(&ufile->hw_destroy_rwsem); return 0; } -static void alloc_abort_idr_uobject(struct ib_uobject *uobj) -{ - uverbs_idr_remove_uobj(uobj); - ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device, - RDMACG_RESOURCE_HCA_OBJECT); - uverbs_uobject_put(uobj); -} - +/* + * This consumes the kref for uobj. It is up to the caller to unwind the HW + * object and anything else connected to uobj before calling this. + */ void rdma_alloc_abort_uobject(struct ib_uobject *uobj) { - uobj->type->type_class->alloc_abort(uobj); + struct ib_uverbs_file *ufile = uobj->ufile; + + uobj->object = NULL; + uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT); + + /* Matches the down_read in rdma_alloc_begin_uobject */ + up_read(&ufile->hw_destroy_rwsem); } -static void lookup_put_idr_uobject(struct ib_uobject *uobj, bool exclusive) +static void lookup_put_idr_uobject(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) { } -static void lookup_put_fd_uobject(struct ib_uobject *uobj, bool exclusive) +static void lookup_put_fd_uobject(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) { struct file *filp = uobj->object; - WARN_ON(exclusive); + WARN_ON(mode != UVERBS_LOOKUP_READ); /* This indirectly calls uverbs_close_fd and free the object */ fput(filp); } -void rdma_lookup_put_uobject(struct ib_uobject *uobj, bool exclusive) +void rdma_lookup_put_uobject(struct ib_uobject *uobj, + enum rdma_lookup_mode mode) { - assert_uverbs_usecnt(uobj, exclusive); - uobj->type->type_class->lookup_put(uobj, exclusive); + assert_uverbs_usecnt(uobj, mode); + uobj->uapi_object->type_class->lookup_put(uobj, mode); /* * In order to unlock an object, either decrease its usecnt for * read access or zero it in case of exclusive access. See * uverbs_try_lock_object for locking schema information. */ - if (!exclusive) + switch (mode) { + case UVERBS_LOOKUP_READ: atomic_dec(&uobj->usecnt); - else + break; + case UVERBS_LOOKUP_WRITE: atomic_set(&uobj->usecnt, 0); + break; + case UVERBS_LOOKUP_DESTROY: + break; + } + /* Pairs with the kref obtained by type->lookup_get */ uverbs_uobject_put(uobj); } +void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile) +{ + spin_lock_init(&ufile->idr_lock); + idr_init(&ufile->idr); +} + +void release_ufile_idr_uobject(struct ib_uverbs_file *ufile) +{ + struct ib_uobject *entry; + int id; + + /* + * At this point uverbs_cleanup_ufile() is guaranteed to have run, and + * there are no HW objects left, however the IDR is still populated + * with anything that has not been cleaned up by userspace. Since the + * kref on ufile is 0, nothing is allowed to call lookup_get. 
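Taken together, the begin/commit/abort trio gives creation handlers the shape sketched below; this is an orientation sketch only (example_create() and hw_create() are hypothetical), while the rdma_* calls and their kref/rwsem behaviour follow the hunks above:

static int example_create(const struct uverbs_api_object *obj,
			  struct ib_uverbs_file *ufile)
{
	struct ib_uobject *uobj;
	void *hw;

	/* read-locks ufile->hw_destroy_rwsem for the whole creation */
	uobj = rdma_alloc_begin_uobject(obj, ufile);
	if (IS_ERR(uobj))
		return PTR_ERR(uobj);

	hw = hw_create();			/* hypothetical HW allocation */
	if (IS_ERR(hw)) {
		/* nothing HW-side left to unwind; abort drops the kref and rwsem */
		rdma_alloc_abort_uobject(uobj);
		return PTR_ERR(hw);
	}
	uobj->object = hw;

	/*
	 * Consumes the kref and releases hw_destroy_rwsem; on failure the
	 * uobject and its HW object are already destroyed, so uobj must not
	 * be touched afterwards either way.
	 */
	return rdma_alloc_commit_uobject(uobj);
}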
+ * + * This is an optimized equivalent to remove_handle_idr_uobject + */ + idr_for_each_entry(&ufile->idr, entry, id) { + WARN_ON(entry->object); + uverbs_uobject_put(entry); + } + + idr_destroy(&ufile->idr); +} + const struct uverbs_obj_type_class uverbs_idr_class = { .alloc_begin = alloc_begin_idr_uobject, .lookup_get = lookup_get_idr_uobject, .alloc_commit = alloc_commit_idr_uobject, .alloc_abort = alloc_abort_idr_uobject, .lookup_put = lookup_put_idr_uobject, - .remove_commit = remove_commit_idr_uobject, + .destroy_hw = destroy_hw_idr_uobject, + .remove_handle = remove_handle_idr_uobject, /* * When we destroy an object, we first just lock it for WRITE and * actually DESTROY it in the finalize stage. So, the problematic @@ -611,103 +768,180 @@ const struct uverbs_obj_type_class uverbs_idr_class = { */ .needs_kfree_rcu = true, }; +EXPORT_SYMBOL(uverbs_idr_class); -static void _uverbs_close_fd(struct ib_uobject_file *uobj_file) +void uverbs_close_fd(struct file *f) { - struct ib_ucontext *ucontext; - struct ib_uverbs_file *ufile = uobj_file->ufile; - int ret; + struct ib_uobject *uobj = f->private_data; + struct ib_uverbs_file *ufile = uobj->ufile; - mutex_lock(&uobj_file->ufile->cleanup_mutex); + if (down_read_trylock(&ufile->hw_destroy_rwsem)) { + /* + * lookup_get_fd_uobject holds the kref on the struct file any + * time a FD uobj is locked, which prevents this release + * method from being invoked. Meaning we can always get the + * write lock here, or we have a kernel bug. + */ + WARN_ON(uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE)); + uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE); + up_read(&ufile->hw_destroy_rwsem); + } - /* uobject was either already cleaned up or is cleaned up right now anyway */ - if (!uobj_file->uobj.context || - !down_read_trylock(&uobj_file->uobj.context->cleanup_rwsem)) - goto unlock; + /* Matches the get in alloc_begin_fd_uobject */ + kref_put(&ufile->ref, ib_uverbs_release_file); - ucontext = uobj_file->uobj.context; - ret = _rdma_remove_commit_uobject(&uobj_file->uobj, RDMA_REMOVE_CLOSE); - up_read(&ucontext->cleanup_rwsem); - if (ret) - pr_warn("uverbs: unable to clean up uobject file in uverbs_close_fd.\n"); -unlock: - mutex_unlock(&ufile->cleanup_mutex); + /* Pairs with filp->private_data in alloc_begin_fd_uobject */ + uverbs_uobject_put(uobj); } -void uverbs_close_fd(struct file *f) -{ - struct ib_uobject_file *uobj_file = f->private_data; - struct kref *uverbs_file_ref = &uobj_file->ufile->ref; +static void ufile_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ + struct ib_device *ib_dev = ibcontext->device; + struct task_struct *owning_process = NULL; + struct mm_struct *owning_mm = NULL; + + owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); + if (!owning_process) + return; + + owning_mm = get_task_mm(owning_process); + if (!owning_mm) { + pr_info("no mm, disassociate ucontext is pending task termination\n"); + while (1) { + put_task_struct(owning_process); + usleep_range(1000, 2000); + owning_process = get_pid_task(ibcontext->tgid, + PIDTYPE_PID); + if (!owning_process || + owning_process->state == TASK_DEAD) { + pr_info("disassociate ucontext done, task was terminated\n"); + /* in case task was dead need to release the + * task struct. 
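On the lookup side, the new enum rdma_lookup_mode must stay symmetric between get and put, as the converted lookup_put helpers above imply; a minimal read-access sketch (example_query() is hypothetical, the rdma_lookup_* calls are the ones this patch reworks):

static int example_query(const struct uverbs_api_object *obj,
			 struct ib_uverbs_file *ufile, s64 id)
{
	struct ib_uobject *uobj;

	uobj = rdma_lookup_get_uobject(obj, ufile, id, UVERBS_LOOKUP_READ);
	if (IS_ERR(uobj))
		return PTR_ERR(uobj);

	/* ... use uobj->object under the shared (read) lock ... */

	/* put back with the same mode the object was taken with */
	rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ);
	return 0;
}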
+ */ + if (owning_process) + put_task_struct(owning_process); + return; + } + } + } - _uverbs_close_fd(uobj_file); - uverbs_uobject_put(&uobj_file->uobj); - kref_put(uverbs_file_ref, ib_uverbs_release_file); + down_write(&owning_mm->mmap_sem); + ib_dev->disassociate_ucontext(ibcontext); + up_write(&owning_mm->mmap_sem); + mmput(owning_mm); + put_task_struct(owning_process); } -void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed) +/* + * Drop the ucontext off the ufile and completely disconnect it from the + * ib_device + */ +static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, + enum rdma_remove_reason reason) { - enum rdma_remove_reason reason = device_removed ? - RDMA_REMOVE_DRIVER_REMOVE : RDMA_REMOVE_CLOSE; - unsigned int cur_order = 0; + struct ib_ucontext *ucontext = ufile->ucontext; + int ret; + + if (reason == RDMA_REMOVE_DRIVER_REMOVE) + ufile_disassociate_ucontext(ucontext); + + put_pid(ucontext->tgid); + ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device, + RDMACG_RESOURCE_HCA_HANDLE); - ucontext->cleanup_reason = reason; /* - * Waits for all remove_commit and alloc_commit to finish. Logically, We - * want to hold this forever as the context is going to be destroyed, - * but we'll release it since it causes a "held lock freed" BUG message. + * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove + * the error return. */ - down_write(&ucontext->cleanup_rwsem); + ret = ucontext->device->dealloc_ucontext(ucontext); + WARN_ON(ret); - while (!list_empty(&ucontext->uobjects)) { - struct ib_uobject *obj, *next_obj; - unsigned int next_order = UINT_MAX; + ufile->ucontext = NULL; +} + +static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile, + enum rdma_remove_reason reason) +{ + struct ib_uobject *obj, *next_obj; + int ret = -EINVAL; + /* + * This shouldn't run while executing other commands on this + * context. Thus, the only thing we should take care of is + * releasing a FD while traversing this list. The FD could be + * closed and released from the _release fop of this FD. + * In order to mitigate this, we add a lock. + * We take and release the lock per traversal in order to let + * other threads (which might still use the FDs) chance to run. + */ + list_for_each_entry_safe(obj, next_obj, &ufile->uobjects, list) { /* - * This shouldn't run while executing other commands on this - * context. Thus, the only thing we should take care of is - * releasing a FD while traversing this list. The FD could be - * closed and released from the _release fop of this FD. - * In order to mitigate this, we add a lock. - * We take and release the lock per order traversal in order - * to let other threads (which might still use the FDs) chance - * to run. + * if we hit this WARN_ON, that means we are + * racing with a lookup_get. */ - mutex_lock(&ucontext->uobjects_lock); - list_for_each_entry_safe(obj, next_obj, &ucontext->uobjects, - list) { - if (obj->type->destroy_order == cur_order) { - int ret; - - /* - * if we hit this WARN_ON, that means we are - * racing with a lookup_get. 
- */ - WARN_ON(uverbs_try_lock_object(obj, true)); - ret = obj->type->type_class->remove_commit(obj, - reason); - list_del(&obj->list); - if (ret) - pr_warn("ib_uverbs: failed to remove uobject id %d order %u\n", - obj->id, cur_order); - /* put the ref we took when we created the object */ - uverbs_uobject_put(obj); - } else { - next_order = min(next_order, - obj->type->destroy_order); - } - } - mutex_unlock(&ucontext->uobjects_lock); - cur_order = next_order; + WARN_ON(uverbs_try_lock_object(obj, UVERBS_LOOKUP_WRITE)); + if (!uverbs_destroy_uobject(obj, reason)) + ret = 0; } - up_write(&ucontext->cleanup_rwsem); + return ret; } -void uverbs_initialize_ucontext(struct ib_ucontext *ucontext) +/* + * Destroy the uncontext and every uobject associated with it. If called with + * reason != RDMA_REMOVE_CLOSE this will not return until the destruction has + * been completed and ufile->ucontext is NULL. + * + * This is internally locked and can be called in parallel from multiple + * contexts. + */ +void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, + enum rdma_remove_reason reason) { - ucontext->cleanup_reason = 0; - mutex_init(&ucontext->uobjects_lock); - INIT_LIST_HEAD(&ucontext->uobjects); - init_rwsem(&ucontext->cleanup_rwsem); + if (reason == RDMA_REMOVE_CLOSE) { + /* + * During destruction we might trigger something that + * synchronously calls release on any file descriptor. For + * this reason all paths that come from file_operations + * release must use try_lock. They can progress knowing that + * there is an ongoing uverbs_destroy_ufile_hw that will clean + * up the driver resources. + */ + if (!mutex_trylock(&ufile->ucontext_lock)) + return; + + } else { + mutex_lock(&ufile->ucontext_lock); + } + + down_write(&ufile->hw_destroy_rwsem); + + /* + * If a ucontext was never created then we can't have any uobjects to + * cleanup, nothing to do. 
+ */ + if (!ufile->ucontext) + goto done; + + ufile->ucontext->closing = true; + ufile->ucontext->cleanup_retryable = true; + while (!list_empty(&ufile->uobjects)) + if (__uverbs_cleanup_ufile(ufile, reason)) { + /* + * No entry was cleaned-up successfully during this + * iteration + */ + break; + } + + ufile->ucontext->cleanup_retryable = false; + if (!list_empty(&ufile->uobjects)) + __uverbs_cleanup_ufile(ufile, reason); + + ufile_destroy_ucontext(ufile, reason); + +done: + up_write(&ufile->hw_destroy_rwsem); + mutex_unlock(&ufile->ucontext_lock); } const struct uverbs_obj_type_class uverbs_fd_class = { @@ -716,23 +950,33 @@ const struct uverbs_obj_type_class uverbs_fd_class = { .alloc_commit = alloc_commit_fd_uobject, .alloc_abort = alloc_abort_fd_uobject, .lookup_put = lookup_put_fd_uobject, - .remove_commit = remove_commit_fd_uobject, + .destroy_hw = destroy_hw_fd_uobject, + .remove_handle = remove_handle_fd_uobject, .needs_kfree_rcu = false, }; +EXPORT_SYMBOL(uverbs_fd_class); -struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, - struct ib_ucontext *ucontext, - enum uverbs_obj_access access, - int id) +struct ib_uobject * +uverbs_get_uobject_from_file(u16 object_id, + struct ib_uverbs_file *ufile, + enum uverbs_obj_access access, s64 id) { + const struct uverbs_api_object *obj = + uapi_get_object(ufile->device->uapi, object_id); + switch (access) { case UVERBS_ACCESS_READ: - return rdma_lookup_get_uobject(type_attrs, ucontext, id, false); + return rdma_lookup_get_uobject(obj, ufile, id, + UVERBS_LOOKUP_READ); case UVERBS_ACCESS_DESTROY: + /* Actual destruction is done inside uverbs_handle_method */ + return rdma_lookup_get_uobject(obj, ufile, id, + UVERBS_LOOKUP_DESTROY); case UVERBS_ACCESS_WRITE: - return rdma_lookup_get_uobject(type_attrs, ucontext, id, true); + return rdma_lookup_get_uobject(obj, ufile, id, + UVERBS_LOOKUP_WRITE); case UVERBS_ACCESS_NEW: - return rdma_alloc_begin_uobject(type_attrs, ucontext); + return rdma_alloc_begin_uobject(obj, ufile); default: WARN_ON(true); return ERR_PTR(-EOPNOTSUPP); @@ -753,16 +997,14 @@ int uverbs_finalize_object(struct ib_uobject *uobj, switch (access) { case UVERBS_ACCESS_READ: - rdma_lookup_put_uobject(uobj, false); + rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_READ); break; case UVERBS_ACCESS_WRITE: - rdma_lookup_put_uobject(uobj, true); + rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_WRITE); break; case UVERBS_ACCESS_DESTROY: - if (commit) - ret = rdma_remove_commit_uobject(uobj); - else - rdma_lookup_put_uobject(uobj, true); + if (uobj) + rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY); break; case UVERBS_ACCESS_NEW: if (commit) @@ -777,43 +1019,3 @@ int uverbs_finalize_object(struct ib_uobject *uobj, return ret; } - -int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, - struct uverbs_attr_spec_hash * const *spec_hash, - size_t num, - bool commit) -{ - unsigned int i; - int ret = 0; - - for (i = 0; i < num; i++) { - struct uverbs_attr_bundle_hash *curr_bundle = - &attrs_bundle->hash[i]; - const struct uverbs_attr_spec_hash *curr_spec_bucket = - spec_hash[i]; - unsigned int j; - - for (j = 0; j < curr_bundle->num_attrs; j++) { - struct uverbs_attr *attr; - const struct uverbs_attr_spec *spec; - - if (!uverbs_attr_is_valid_in_hash(curr_bundle, j)) - continue; - - attr = &curr_bundle->attrs[j]; - spec = &curr_spec_bucket->attrs[j]; - - if (spec->type == UVERBS_ATTR_TYPE_IDR || - spec->type == UVERBS_ATTR_TYPE_FD) { - int current_ret; - - current_ret = 
uverbs_finalize_object(attr->obj_attr.uobject, - spec->obj.access, - commit); - if (!ret) - ret = current_ret; - } - } - } - return ret; -} diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 1efcf93238dd..f962f2a593ba 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -43,20 +43,12 @@ #include <rdma/ib_verbs.h> #include <linux/mutex.h> -int uverbs_ns_idx(u16 *id, unsigned int ns_count); -const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, - uint16_t object); -const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, - uint16_t method); -/* - * These functions initialize the context and cleanups its uobjects. - * The context has a list of objects which is protected by a mutex - * on the context. initialize_ucontext should be called when we create - * a context. - * cleanup_ucontext removes all uobjects from the context and puts them. - */ -void uverbs_cleanup_ucontext(struct ib_ucontext *ucontext, bool device_removed); -void uverbs_initialize_ucontext(struct ib_ucontext *ucontext); +struct ib_uverbs_device; + +void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile, + enum rdma_remove_reason reason); + +int uobj_destroy(struct ib_uobject *uobj); /* * uverbs_uobject_get is called in order to increase the reference count on @@ -82,7 +74,7 @@ void uverbs_uobject_put(struct ib_uobject *uobject); void uverbs_close_fd(struct file *f); /* - * Get an ib_uobject that corresponds to the given id from ucontext, assuming + * Get an ib_uobject that corresponds to the given id from ufile, assuming * the object is from the given type. Lock it to the required access when * applicable. * This function could create (access == NEW), destroy (access == DESTROY) @@ -90,13 +82,11 @@ void uverbs_close_fd(struct file *f); * The action will be finalized only when uverbs_finalize_object or * uverbs_finalize_objects are called. */ -struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, - struct ib_ucontext *ucontext, - enum uverbs_obj_access access, - int id); -int uverbs_finalize_object(struct ib_uobject *uobj, - enum uverbs_obj_access access, - bool commit); +struct ib_uobject * +uverbs_get_uobject_from_file(u16 object_id, + struct ib_uverbs_file *ufile, + enum uverbs_obj_access access, s64 id); + /* * Note that certain finalize stages could return a status: * (a) alloc_commit could return a failure if the object is committed at the @@ -112,9 +102,63 @@ int uverbs_finalize_object(struct ib_uobject *uobj, * function. For example, this could happen when we couldn't destroy an * object. */ -int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, - struct uverbs_attr_spec_hash * const *spec_hash, - size_t num, - bool commit); +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit); + +void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile); +void release_ufile_idr_uobject(struct ib_uverbs_file *ufile); + +/* + * This is the runtime description of the uverbs API, used by the syscall + * machinery to validate and dispatch calls. + */ + +/* + * Depending on ID the slot pointer in the radix tree points at one of these + * structs. 
+ */ +struct uverbs_api_object { + const struct uverbs_obj_type *type_attrs; + const struct uverbs_obj_type_class *type_class; +}; + +struct uverbs_api_ioctl_method { + int (__rcu *handler)(struct ib_uverbs_file *ufile, + struct uverbs_attr_bundle *ctx); + DECLARE_BITMAP(attr_mandatory, UVERBS_API_ATTR_BKEY_LEN); + u16 bundle_size; + u8 use_stack:1; + u8 driver_method:1; + u8 key_bitmap_len; + u8 destroy_bkey; +}; + +struct uverbs_api_attr { + struct uverbs_attr_spec spec; +}; + +struct uverbs_api_object; +struct uverbs_api { + /* radix tree contains struct uverbs_api_* pointers */ + struct radix_tree_root radix; + enum rdma_driver_id driver_id; +}; + +static inline const struct uverbs_api_object * +uapi_get_object(struct uverbs_api *uapi, u16 object_id) +{ + return radix_tree_lookup(&uapi->radix, uapi_key_obj(object_id)); +} + +char *uapi_key_format(char *S, unsigned int key); +struct uverbs_api *uverbs_alloc_api( + const struct uverbs_object_tree_def *const *driver_specs, + enum rdma_driver_id driver_id); +void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev); +void uverbs_disassociate_api(struct uverbs_api *uapi); +void uverbs_destroy_api(struct uverbs_api *uapi); +void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, + unsigned int num_attrs); #endif /* RDMA_CORE_H */ diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index a4fbdc5d28fa..ee366199b169 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -143,14 +143,15 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) -static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, - struct net_device *rdma_ndev, void *cookie) +static bool +is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) { struct net_device *real_dev; - int res; + bool res; if (!rdma_ndev) - return 0; + return false; rcu_read_lock(); real_dev = rdma_vlan_dev_real_dev(cookie); @@ -166,14 +167,15 @@ static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, return res; } -static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port, - struct net_device *rdma_ndev, void *cookie) +static bool +is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) { struct net_device *master_dev; - int res; + bool res; if (!rdma_ndev) - return 0; + return false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu(rdma_ndev); @@ -184,22 +186,59 @@ static int is_eth_port_inactive_slave(struct ib_device *ib_dev, u8 port, return res; } -static int pass_all_filter(struct ib_device *ib_dev, u8 port, - struct net_device *rdma_ndev, void *cookie) +/** is_ndev_for_default_gid_filter - Check if a given netdevice + * can be considered for default GIDs or not. + * @ib_dev: IB device to check + * @port: Port to consider for adding default GID + * @rdma_ndev: rdma netdevice pointer + * @cookie_ndev: Netdevice to consider to form a default GID + * + * is_ndev_for_default_gid_filter() returns true if a given netdevice can be + * considered for deriving default RoCE GID, returns false otherwise. 
+ */ +static bool +is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *cookie_ndev = cookie; + bool res; + + if (!rdma_ndev) + return false; + + rcu_read_lock(); + + /* + * When rdma netdevice is used in bonding, bonding master netdevice + * should be considered for default GIDs. Therefore, ignore slave rdma + * netdevices when bonding is considered. + * Additionally when event(cookie) netdevice is bond master device, + * make sure that it the upper netdevice of rdma netdevice. + */ + res = ((cookie_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev)) || + (netif_is_bond_master(cookie_ndev) && + rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev))); + + rcu_read_unlock(); + return res; +} + +static bool pass_all_filter(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) { - return 1; + return true; } -static int upper_device_filter(struct ib_device *ib_dev, u8 port, - struct net_device *rdma_ndev, void *cookie) +static bool upper_device_filter(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) { - int res; + bool res; if (!rdma_ndev) - return 0; + return false; if (rdma_ndev == cookie) - return 1; + return true; rcu_read_lock(); res = rdma_is_upper_dev_rcu(rdma_ndev, cookie); @@ -208,6 +247,34 @@ static int upper_device_filter(struct ib_device *ib_dev, u8 port, return res; } +/** + * is_upper_ndev_bond_master_filter - Check if a given netdevice + * is bond master device of netdevice of the the RDMA device of port. + * @ib_dev: IB device to check + * @port: Port to consider for adding default GID + * @rdma_ndev: Pointer to rdma netdevice + * @cookie: Netdevice to consider to form a default GID + * + * is_upper_ndev_bond_master_filter() returns true if a cookie_netdev + * is bond master device and rdma_ndev is its lower netdevice. It might + * not have been established as slave device yet. 
+ */ +static bool +is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, + void *cookie) +{ + struct net_device *cookie_ndev = cookie; + bool match = false; + + rcu_read_lock(); + if (netif_is_bond_master(cookie_ndev) && + rdma_is_upper_dev_rcu(rdma_ndev, cookie_ndev)) + match = true; + rcu_read_unlock(); + return match; +} + static void update_gid_ip(enum gid_op_type gid_op, struct ib_device *ib_dev, u8 port, struct net_device *ndev, @@ -223,34 +290,10 @@ static void update_gid_ip(enum gid_op_type gid_op, update_gid(gid_op, ib_dev, port, &gid, &gid_attr); } -static void enum_netdev_default_gids(struct ib_device *ib_dev, - u8 port, struct net_device *event_ndev, - struct net_device *rdma_ndev) -{ - unsigned long gid_type_mask; - - rcu_read_lock(); - if (!rdma_ndev || - ((rdma_ndev != event_ndev && - !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || - is_eth_active_slave_of_bonding_rcu(rdma_ndev, - netdev_master_upper_dev_get_rcu(rdma_ndev)) == - BONDING_SLAVE_STATE_INACTIVE)) { - rcu_read_unlock(); - return; - } - rcu_read_unlock(); - - gid_type_mask = roce_gid_type_mask_support(ib_dev, port); - - ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, - IB_CACHE_GID_DEFAULT_MODE_SET); -} - static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, u8 port, - struct net_device *event_ndev, - struct net_device *rdma_ndev) + struct net_device *rdma_ndev, + struct net_device *event_ndev) { struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev); unsigned long gid_type_mask; @@ -381,7 +424,6 @@ static void _add_netdev_ips(struct ib_device *ib_dev, u8 port, static void add_netdev_ips(struct ib_device *ib_dev, u8 port, struct net_device *rdma_ndev, void *cookie) { - enum_netdev_default_gids(ib_dev, port, cookie, rdma_ndev); _add_netdev_ips(ib_dev, port, cookie); } @@ -391,6 +433,38 @@ static void del_netdev_ips(struct ib_device *ib_dev, u8 port, ib_cache_gid_del_all_netdev_gids(ib_dev, port, cookie); } +/** + * del_default_gids - Delete default GIDs of the event/cookie netdevice + * @ib_dev: RDMA device pointer + * @port: Port of the RDMA device whose GID table to consider + * @rdma_ndev: Unused rdma netdevice + * @cookie: Pointer to event netdevice + * + * del_default_gids() deletes the default GIDs of the event/cookie netdevice. 
+ */ +static void del_default_gids(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *cookie_ndev = cookie; + unsigned long gid_type_mask; + + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + ib_cache_gid_set_default_gid(ib_dev, port, cookie_ndev, gid_type_mask, + IB_CACHE_GID_DEFAULT_MODE_DELETE); +} + +static void add_default_gids(struct ib_device *ib_dev, u8 port, + struct net_device *rdma_ndev, void *cookie) +{ + struct net_device *event_ndev = cookie; + unsigned long gid_type_mask; + + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, gid_type_mask, + IB_CACHE_GID_DEFAULT_MODE_SET); +} + static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, u8 port, struct net_device *rdma_ndev, @@ -405,9 +479,20 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, rtnl_lock(); down_read(&net_rwsem); for_each_net(net) - for_each_netdev(net, ndev) - if (is_eth_port_of_netdev(ib_dev, port, rdma_ndev, ndev)) - add_netdev_ips(ib_dev, port, rdma_ndev, ndev); + for_each_netdev(net, ndev) { + /* + * Filter and add default GIDs of the primary netdevice + * when not in bonding mode, or add default GIDs + * of bond master device, when in bonding mode. + */ + if (is_ndev_for_default_gid_filter(ib_dev, port, + rdma_ndev, ndev)) + add_default_gids(ib_dev, port, rdma_ndev, ndev); + + if (is_eth_port_of_netdev_filter(ib_dev, port, + rdma_ndev, ndev)) + _add_netdev_ips(ib_dev, port, ndev); + } up_read(&net_rwsem); rtnl_unlock(); } @@ -513,18 +598,12 @@ static void del_netdev_default_ips_join(struct ib_device *ib_dev, u8 port, rcu_read_unlock(); if (master_ndev) { - bond_delete_netdev_default_gids(ib_dev, port, master_ndev, - rdma_ndev); + bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, + master_ndev); dev_put(master_ndev); } } -static void del_netdev_default_ips(struct ib_device *ib_dev, u8 port, - struct net_device *rdma_ndev, void *cookie) -{ - bond_delete_netdev_default_gids(ib_dev, port, cookie, rdma_ndev); -} - /* The following functions operate on all IB devices. netdevice_event and * addr_event execute ib_enum_all_roce_netdevs through a work. * ib_enum_all_roce_netdevs iterates through all IB devices. 
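All of these helpers now share one bool-returning filter signature (the older int-returning variants are converted throughout this file). A purely illustrative filter under that convention, not part of the patch, could look like the sketch below; such a filter is then paired with an action callback through a struct netdev_event_work_cmd, as the command definitions in the following hunk show:

/* Hypothetical filter: match only the RDMA netdevice itself. */
static bool is_rdma_ndev_itself_filter(struct ib_device *ib_dev, u8 port,
				       struct net_device *rdma_ndev,
				       void *cookie)
{
	if (!rdma_ndev)
		return false;

	return rdma_ndev == cookie;
}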
@@ -575,40 +654,94 @@ static int netdevice_queue_work(struct netdev_event_work_cmd *cmds, } static const struct netdev_event_work_cmd add_cmd = { - .cb = add_netdev_ips, .filter = is_eth_port_of_netdev}; + .cb = add_netdev_ips, + .filter = is_eth_port_of_netdev_filter +}; + static const struct netdev_event_work_cmd add_cmd_upper_ips = { - .cb = add_netdev_upper_ips, .filter = is_eth_port_of_netdev}; + .cb = add_netdev_upper_ips, + .filter = is_eth_port_of_netdev_filter +}; -static void netdevice_event_changeupper(struct netdev_notifier_changeupper_info *changeupper_info, - struct netdev_event_work_cmd *cmds) +static void +ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info, + struct netdev_event_work_cmd *cmds) { - static const struct netdev_event_work_cmd upper_ips_del_cmd = { - .cb = del_netdev_upper_ips, .filter = upper_device_filter}; - static const struct netdev_event_work_cmd bonding_default_del_cmd = { - .cb = del_netdev_default_ips, .filter = is_eth_port_inactive_slave}; - - if (changeupper_info->linking == false) { - cmds[0] = upper_ips_del_cmd; - cmds[0].ndev = changeupper_info->upper_dev; - cmds[1] = add_cmd; - } else { - cmds[0] = bonding_default_del_cmd; - cmds[0].ndev = changeupper_info->upper_dev; - cmds[1] = add_cmd_upper_ips; - cmds[1].ndev = changeupper_info->upper_dev; - cmds[1].filter_ndev = changeupper_info->upper_dev; - } + static const struct netdev_event_work_cmd + upper_ips_del_cmd = { + .cb = del_netdev_upper_ips, + .filter = upper_device_filter + }; + + cmds[0] = upper_ips_del_cmd; + cmds[0].ndev = changeupper_info->upper_dev; + cmds[1] = add_cmd; } +static const struct netdev_event_work_cmd bonding_default_add_cmd = { + .cb = add_default_gids, + .filter = is_upper_ndev_bond_master_filter +}; + +static void +ndev_event_link(struct net_device *event_ndev, + struct netdev_notifier_changeupper_info *changeupper_info, + struct netdev_event_work_cmd *cmds) +{ + static const struct netdev_event_work_cmd + bonding_default_del_cmd = { + .cb = del_default_gids, + .filter = is_upper_ndev_bond_master_filter + }; + /* + * When a lower netdev is linked to its upper bonding + * netdev, delete lower slave netdev's default GIDs. 
+ */ + cmds[0] = bonding_default_del_cmd; + cmds[0].ndev = event_ndev; + cmds[0].filter_ndev = changeupper_info->upper_dev; + + /* Now add bonding upper device default GIDs */ + cmds[1] = bonding_default_add_cmd; + cmds[1].ndev = changeupper_info->upper_dev; + cmds[1].filter_ndev = changeupper_info->upper_dev; + + /* Now add bonding upper device IP based GIDs */ + cmds[2] = add_cmd_upper_ips; + cmds[2].ndev = changeupper_info->upper_dev; + cmds[2].filter_ndev = changeupper_info->upper_dev; +} + +static void netdevice_event_changeupper(struct net_device *event_ndev, + struct netdev_notifier_changeupper_info *changeupper_info, + struct netdev_event_work_cmd *cmds) +{ + if (changeupper_info->linking) + ndev_event_link(event_ndev, changeupper_info, cmds); + else + ndev_event_unlink(changeupper_info, cmds); +} + +static const struct netdev_event_work_cmd add_default_gid_cmd = { + .cb = add_default_gids, + .filter = is_ndev_for_default_gid_filter, +}; + static int netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { static const struct netdev_event_work_cmd del_cmd = { .cb = del_netdev_ips, .filter = pass_all_filter}; - static const struct netdev_event_work_cmd bonding_default_del_cmd_join = { - .cb = del_netdev_default_ips_join, .filter = is_eth_port_inactive_slave}; - static const struct netdev_event_work_cmd default_del_cmd = { - .cb = del_netdev_default_ips, .filter = pass_all_filter}; + static const struct netdev_event_work_cmd + bonding_default_del_cmd_join = { + .cb = del_netdev_default_ips_join, + .filter = is_eth_port_inactive_slave_filter + }; + static const struct netdev_event_work_cmd + netdev_del_cmd = { + .cb = del_netdev_ips, + .filter = is_eth_port_of_netdev_filter + }; static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = { .cb = del_netdev_upper_ips, .filter = upper_device_filter}; struct net_device *ndev = netdev_notifier_info_to_dev(ptr); @@ -621,7 +754,8 @@ static int netdevice_event(struct notifier_block *this, unsigned long event, case NETDEV_REGISTER: case NETDEV_UP: cmds[0] = bonding_default_del_cmd_join; - cmds[1] = add_cmd; + cmds[1] = add_default_gid_cmd; + cmds[2] = add_cmd; break; case NETDEV_UNREGISTER: @@ -632,19 +766,22 @@ static int netdevice_event(struct notifier_block *this, unsigned long event, break; case NETDEV_CHANGEADDR: - cmds[0] = default_del_cmd; - cmds[1] = add_cmd; + cmds[0] = netdev_del_cmd; + cmds[1] = add_default_gid_cmd; + cmds[2] = add_cmd; break; case NETDEV_CHANGEUPPER: - netdevice_event_changeupper( + netdevice_event_changeupper(ndev, container_of(ptr, struct netdev_notifier_changeupper_info, info), cmds); break; case NETDEV_BONDING_FAILOVER: cmds[0] = bonding_event_ips_del_cmd; - cmds[1] = bonding_default_del_cmd_join; + /* Add default GIDs of the bond device */ + cmds[1] = bonding_default_add_cmd; + /* Add IP based GIDs of the bond device */ cmds[2] = add_cmd_upper_ips; break; @@ -660,7 +797,8 @@ static void update_gid_event_work_handler(struct work_struct *_work) struct update_gid_event_work *work = container_of(_work, struct update_gid_event_work, work); - ib_enum_all_roce_netdevs(is_eth_port_of_netdev, work->gid_attr.ndev, + ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter, + work->gid_attr.ndev, callback_for_addr_gid_device_scan, work); dev_put(work->gid_attr.ndev); diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index c8963e91f92a..683e6d11a564 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -87,7 +87,7 @@ static int 
rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, } ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE); - if (ret < nents) { + if (ret < 0 || ret < nents) { ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr); return -EINVAL; } @@ -325,7 +325,7 @@ out_unmap_sg: EXPORT_SYMBOL(rdma_rw_ctx_init); /** - * rdma_rw_ctx_signature init - initialize a RW context with signature offload + * rdma_rw_ctx_signature_init - initialize a RW context with signature offload * @ctx: context to initialize * @qp: queue pair to operate on * @port_num: port num to which the connection is bound @@ -564,10 +564,10 @@ EXPORT_SYMBOL(rdma_rw_ctx_wrs); int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) { - struct ib_send_wr *first_wr, *bad_wr; + struct ib_send_wr *first_wr; first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr); - return ib_post_send(qp, first_wr, &bad_wr); + return ib_post_send(qp, first_wr, NULL); } EXPORT_SYMBOL(rdma_rw_ctx_post); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index a61ec7e33613..7b794a14d6e8 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1227,20 +1227,10 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) return src_path_mask; } -static int -roce_resolve_route_from_path(struct ib_device *device, u8 port_num, - struct sa_path_rec *rec) +static int roce_resolve_route_from_path(struct sa_path_rec *rec, + const struct ib_gid_attr *attr) { - struct net_device *resolved_dev; - struct net_device *ndev; - struct net_device *idev; - struct rdma_dev_addr dev_addr = { - .bound_dev_if = ((sa_path_get_ifindex(rec) >= 0) ? - sa_path_get_ifindex(rec) : 0), - .net = sa_path_get_ndev(rec) ? - sa_path_get_ndev(rec) : - &init_net - }; + struct rdma_dev_addr dev_addr = {}; union { struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; @@ -1250,9 +1240,14 @@ roce_resolve_route_from_path(struct ib_device *device, u8 port_num, if (rec->roce.route_resolved) return 0; + if (!attr || !attr->ndev) + return -EINVAL; - if (!device->get_netdev) - return -EOPNOTSUPP; + dev_addr.bound_dev_if = attr->ndev->ifindex; + /* TODO: Use net from the ib_gid_attr once it is added to it, + * until than, limit itself to init_net. 
+ */ + dev_addr.net = &init_net; rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); @@ -1268,60 +1263,52 @@ roce_resolve_route_from_path(struct ib_device *device, u8 port_num, rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) return -EINVAL; - idev = device->get_netdev(device, port_num); - if (!idev) - return -ENODEV; - - resolved_dev = dev_get_by_index(dev_addr.net, - dev_addr.bound_dev_if); - if (!resolved_dev) { - ret = -ENODEV; - goto done; - } - ndev = ib_get_ndev_from_path(rec); - rcu_read_lock(); - if ((ndev && ndev != resolved_dev) || - (resolved_dev != idev && - !rdma_is_upper_dev_rcu(idev, resolved_dev))) - ret = -EHOSTUNREACH; - rcu_read_unlock(); - dev_put(resolved_dev); - if (ndev) - dev_put(ndev); -done: - dev_put(idev); - if (!ret) - rec->roce.route_resolved = true; - return ret; + rec->roce.route_resolved = true; + return 0; } static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num, struct sa_path_rec *rec, - struct rdma_ah_attr *ah_attr) + struct rdma_ah_attr *ah_attr, + const struct ib_gid_attr *gid_attr) { enum ib_gid_type type = sa_conv_pathrec_to_gid_type(rec); - struct net_device *ndev; - u16 gid_index; - int ret; - ndev = ib_get_ndev_from_path(rec); - ret = ib_find_cached_gid_by_port(device, &rec->sgid, type, - port_num, ndev, &gid_index); - if (ndev) - dev_put(ndev); - if (ret) - return ret; + if (!gid_attr) { + gid_attr = rdma_find_gid_by_port(device, &rec->sgid, type, + port_num, NULL); + if (IS_ERR(gid_attr)) + return PTR_ERR(gid_attr); + } else + rdma_hold_gid_attr(gid_attr); - rdma_ah_set_grh(ah_attr, &rec->dgid, - be32_to_cpu(rec->flow_label), - gid_index, rec->hop_limit, - rec->traffic_class); + rdma_move_grh_sgid_attr(ah_attr, &rec->dgid, + be32_to_cpu(rec->flow_label), + rec->hop_limit, rec->traffic_class, + gid_attr); return 0; } +/** + * ib_init_ah_attr_from_path - Initialize address handle attributes based on + * an SA path record. + * @device: Device associated ah attributes initialization. + * @port_num: Port on the specified device. + * @rec: path record entry to use for ah attributes initialization. + * @ah_attr: address handle attributes to initialization from path record. + * @sgid_attr: SGID attribute to consider during initialization. + * + * When ib_init_ah_attr_from_path() returns success, + * (a) for IB link layer it optionally contains a reference to SGID attribute + * when GRH is present for IB link layer. + * (b) for RoCE link layer it contains a reference to SGID attribute. + * User must invoke rdma_destroy_ah_attr() to release reference to SGID + * attributes which are initialized using ib_init_ah_attr_from_path(). 
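A hedged sketch of the caller-side pattern this kernel-doc describes; example_make_ah() is hypothetical, the API calls are the ones reworked by this series:

static int example_make_ah(struct ib_device *device, u8 port_num,
			   struct sa_path_rec *rec, struct ib_pd *pd,
			   struct ib_ah **ah)
{
	struct rdma_ah_attr ah_attr;
	int ret;

	/* NULL sgid_attr: the helper resolves a matching GID entry itself */
	ret = ib_init_ah_attr_from_path(device, port_num, rec, &ah_attr, NULL);
	if (ret)
		return ret;

	*ah = rdma_create_ah(pd, &ah_attr);

	/* release the SGID attribute reference taken by the init call */
	rdma_destroy_ah_attr(&ah_attr);

	return PTR_ERR_OR_ZERO(*ah);
}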
+ */ int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, struct sa_path_rec *rec, - struct rdma_ah_attr *ah_attr) + struct rdma_ah_attr *ah_attr, + const struct ib_gid_attr *gid_attr) { int ret = 0; @@ -1332,7 +1319,7 @@ int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, rdma_ah_set_static_rate(ah_attr, rec->rate); if (sa_path_is_roce(rec)) { - ret = roce_resolve_route_from_path(device, port_num, rec); + ret = roce_resolve_route_from_path(rec, gid_attr); if (ret) return ret; @@ -1349,7 +1336,8 @@ int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, } if (rec->hop_limit > 0 || sa_path_is_roce(rec)) - ret = init_ah_attr_grh_fields(device, port_num, rec, ah_attr); + ret = init_ah_attr_grh_fields(device, port_num, + rec, ah_attr, gid_attr); return ret; } EXPORT_SYMBOL(ib_init_ah_attr_from_path); @@ -1557,8 +1545,6 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, ARRAY_SIZE(path_rec_table), mad->data, &rec); rec.rec_type = SA_PATH_REC_TYPE_IB; - sa_path_set_ndev(&rec, NULL); - sa_path_set_ifindex(&rec, 0); sa_path_set_dmac_zero(&rec); if (query->conv_pr) { @@ -2290,6 +2276,7 @@ static void update_sm_ah(struct work_struct *work) struct ib_sa_sm_ah *new_ah; struct ib_port_attr port_attr; struct rdma_ah_attr ah_attr; + bool grh_required; if (ib_query_port(port->agent->device, port->port_num, &port_attr)) { pr_warn("Couldn't query port\n"); @@ -2314,16 +2301,27 @@ static void update_sm_ah(struct work_struct *work) rdma_ah_set_dlid(&ah_attr, port_attr.sm_lid); rdma_ah_set_sl(&ah_attr, port_attr.sm_sl); rdma_ah_set_port_num(&ah_attr, port->port_num); - if (port_attr.grh_required) { - if (ah_attr.type == RDMA_AH_ATTR_TYPE_OPA) { - rdma_ah_set_make_grd(&ah_attr, true); - } else { - rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); - rdma_ah_set_subnet_prefix(&ah_attr, - cpu_to_be64(port_attr.subnet_prefix)); - rdma_ah_set_interface_id(&ah_attr, - cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); - } + + grh_required = rdma_is_grh_required(port->agent->device, + port->port_num); + + /* + * The OPA sm_lid of 0xFFFF needs special handling so that it can be + * differentiated from a permissive LID of 0xFFFF. 
We set the + * grh_required flag here so the SA can program the DGID in the + * address handle appropriately + */ + if (ah_attr.type == RDMA_AH_ATTR_TYPE_OPA && + (grh_required || + port_attr.sm_lid == be16_to_cpu(IB_LID_PERMISSIVE))) + rdma_ah_set_make_grd(&ah_attr, true); + + if (ah_attr.type == RDMA_AH_ATTR_TYPE_IB && grh_required) { + rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); + rdma_ah_set_subnet_prefix(&ah_attr, + cpu_to_be64(port_attr.subnet_prefix)); + rdma_ah_set_interface_id(&ah_attr, + cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); } new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 31c7efaf8e7a..7fd14ead7b37 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -42,6 +42,7 @@ #include <rdma/ib_mad.h> #include <rdma/ib_pma.h> +#include <rdma/ib_cache.h> struct ib_port; @@ -346,7 +347,7 @@ static struct attribute *port_default_attrs[] = { NULL }; -static size_t print_ndev(struct ib_gid_attr *gid_attr, char *buf) +static size_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf) { if (!gid_attr->ndev) return -EINVAL; @@ -354,33 +355,26 @@ static size_t print_ndev(struct ib_gid_attr *gid_attr, char *buf) return sprintf(buf, "%s\n", gid_attr->ndev->name); } -static size_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf) +static size_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf) { return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type)); } -static ssize_t _show_port_gid_attr(struct ib_port *p, - struct port_attribute *attr, - char *buf, - size_t (*print)(struct ib_gid_attr *gid_attr, - char *buf)) +static ssize_t _show_port_gid_attr( + struct ib_port *p, struct port_attribute *attr, char *buf, + size_t (*print)(const struct ib_gid_attr *gid_attr, char *buf)) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); - union ib_gid gid; - struct ib_gid_attr gid_attr = {}; + const struct ib_gid_attr *gid_attr; ssize_t ret; - ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, - &gid_attr); - if (ret) - goto err; + gid_attr = rdma_get_gid_attr(p->ibdev, p->port_num, tab_attr->index); + if (IS_ERR(gid_attr)) + return PTR_ERR(gid_attr); - ret = print(&gid_attr, buf); - -err: - if (gid_attr.ndev) - dev_put(gid_attr.ndev); + ret = print(gid_attr, buf); + rdma_put_gid_attr(gid_attr); return ret; } @@ -389,26 +383,28 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); - union ib_gid *pgid; - union ib_gid gid; + const struct ib_gid_attr *gid_attr; ssize_t ret; - ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, NULL); + gid_attr = rdma_get_gid_attr(p->ibdev, p->port_num, tab_attr->index); + if (IS_ERR(gid_attr)) { + const union ib_gid zgid = {}; + + /* If reading GID fails, it is likely due to GID entry being + * empty (invalid) or reserved GID in the table. User space + * expects to read GID table entries as long as it given index + * is within GID table size. Administrative/debugging tool + * fails to query rest of the GID entries if it hits error + * while querying a GID of the given index. To avoid user + * space throwing such error on fail to read gid, return zero + * GID as before. This maintains backward compatibility. 
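The sysfs conversion above shows the reference-counted GID access that replaces ib_query_gid() across the series; in caller terms the pattern is the following (sketch only, example_show_gid() is a hypothetical name):

static ssize_t example_show_gid(struct ib_device *ibdev, u8 port_num,
				unsigned int index, char *buf)
{
	const struct ib_gid_attr *gid_attr;
	ssize_t ret;

	gid_attr = rdma_get_gid_attr(ibdev, port_num, index);	/* takes a reference */
	if (IS_ERR(gid_attr))
		return PTR_ERR(gid_attr);

	/* gid, gid_type and ndev stay valid while the reference is held */
	ret = sprintf(buf, "%pI6\n", gid_attr->gid.raw);

	rdma_put_gid_attr(gid_attr);				/* drops the reference */
	return ret;
}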
+ */ + return sprintf(buf, "%pI6\n", zgid.raw); + } - /* If reading GID fails, it is likely due to GID entry being empty - * (invalid) or reserved GID in the table. - * User space expects to read GID table entries as long as it given - * index is within GID table size. - * Administrative/debugging tool fails to query rest of the GID entries - * if it hits error while querying a GID of the given index. - * To avoid user space throwing such error on fail to read gid, return - * zero GID as before. This maintains backward compatibility. - */ - if (ret) - pgid = &zgid; - else - pgid = &gid; - return sprintf(buf, "%pI6\n", pgid->raw); + ret = sprintf(buf, "%pI6\n", gid_attr->gid.raw); + rdma_put_gid_attr(gid_attr); + return ret; } static ssize_t show_port_gid_attr_ndev(struct ib_port *p, diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 9eef96dacbd7..faa9e6116b2f 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -207,7 +207,7 @@ error: } static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq, - struct ib_cm_req_event_param *kreq) + const struct ib_cm_req_event_param *kreq) { ureq->remote_ca_guid = kreq->remote_ca_guid; ureq->remote_qkey = kreq->remote_qkey; @@ -231,7 +231,7 @@ static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq, } static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep, - struct ib_cm_rep_event_param *krep) + const struct ib_cm_rep_event_param *krep) { urep->remote_ca_guid = krep->remote_ca_guid; urep->remote_qkey = krep->remote_qkey; @@ -247,14 +247,14 @@ static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep, } static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep, - struct ib_cm_sidr_rep_event_param *krep) + const struct ib_cm_sidr_rep_event_param *krep) { urep->status = krep->status; urep->qkey = krep->qkey; urep->qpn = krep->qpn; }; -static int ib_ucm_event_process(struct ib_cm_event *evt, +static int ib_ucm_event_process(const struct ib_cm_event *evt, struct ib_ucm_event *uvt) { void *info = NULL; @@ -351,7 +351,7 @@ err1: } static int ib_ucm_event_handler(struct ib_cm_id *cm_id, - struct ib_cm_event *event) + const struct ib_cm_event *event) { struct ib_ucm_event *uevent; struct ib_ucm_context *ctx; @@ -1000,14 +1000,11 @@ static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file, const char __user *inbuf, int in_len, int out_len) { - struct ib_cm_sidr_req_param param; + struct ib_cm_sidr_req_param param = {}; struct ib_ucm_context *ctx; struct ib_ucm_sidr_req cmd; int result; - param.private_data = NULL; - param.path = NULL; - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 54ab6335c48d..a41792dbae1f 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -84,7 +84,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct ib_umem *umem; struct page **page_list; struct vm_area_struct **vma_list; - unsigned long locked; unsigned long lock_limit; unsigned long cur_base; unsigned long npages; @@ -92,7 +91,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, int i; unsigned long dma_attrs = 0; struct scatterlist *sg, *sg_list_start; - int need_release = 0; unsigned int gup_flags = FOLL_WRITE; if (dmasync) @@ -121,10 +119,8 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (access & IB_ACCESS_ON_DEMAND) { ret = 
ib_umem_odp_get(context, umem, access); - if (ret) { - kfree(umem); - return ERR_PTR(ret); - } + if (ret) + goto umem_kfree; return umem; } @@ -135,8 +131,8 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { - kfree(umem); - return ERR_PTR(-ENOMEM); + ret = -ENOMEM; + goto umem_kfree; } /* @@ -149,41 +145,43 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, npages = ib_umem_num_pages(umem); - down_write(¤t->mm->mmap_sem); - - locked = npages + current->mm->pinned_vm; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + down_write(¤t->mm->mmap_sem); + current->mm->pinned_vm += npages; + if ((current->mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) { + up_write(¤t->mm->mmap_sem); ret = -ENOMEM; - goto out; + goto vma; } + up_write(¤t->mm->mmap_sem); cur_base = addr & PAGE_MASK; if (npages == 0 || npages > UINT_MAX) { ret = -EINVAL; - goto out; + goto vma; } ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); if (ret) - goto out; + goto vma; if (!umem->writable) gup_flags |= FOLL_FORCE; - need_release = 1; sg_list_start = umem->sg_head.sgl; + down_read(¤t->mm->mmap_sem); while (npages) { ret = get_user_pages_longterm(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), gup_flags, page_list, vma_list); - - if (ret < 0) - goto out; + if (ret < 0) { + up_read(¤t->mm->mmap_sem); + goto umem_release; + } umem->npages += ret; cur_base += ret * PAGE_SIZE; @@ -199,6 +197,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, /* preparing for next loop */ sg_list_start = sg; } + up_read(¤t->mm->mmap_sem); umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, @@ -206,27 +205,28 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, DMA_BIDIRECTIONAL, dma_attrs); - if (umem->nmap <= 0) { + if (!umem->nmap) { ret = -ENOMEM; - goto out; + goto umem_release; } ret = 0; + goto out; -out: - if (ret < 0) { - if (need_release) - __ib_umem_release(context->device, umem, 0); - kfree(umem); - } else - current->mm->pinned_vm = locked; - +umem_release: + __ib_umem_release(context->device, umem, 0); +vma: + down_write(¤t->mm->mmap_sem); + current->mm->pinned_vm -= ib_umem_num_pages(umem); up_write(¤t->mm->mmap_sem); +out: if (vma_list) free_page((unsigned long) vma_list); free_page((unsigned long) page_list); - - return ret < 0 ? ERR_PTR(ret) : umem; +umem_kfree: + if (ret) + kfree(umem); + return ret ? 
ERR_PTR(ret) : umem; } EXPORT_SYMBOL(ib_umem_get); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index bb98c9e4a7fd..c34a6852d691 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -268,6 +268,7 @@ static void recv_handler(struct ib_mad_agent *agent, packet->mad.hdr.traffic_class = grh->traffic_class; memcpy(packet->mad.hdr.gid, &grh->dgid, 16); packet->mad.hdr.flow_label = cpu_to_be32(grh->flow_label); + rdma_destroy_ah_attr(&ah_attr); } if (queue_packet(file, agent, packet)) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index c0d40fc3a53a..5df8e548cc14 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -111,7 +111,7 @@ struct ib_uverbs_device { struct mutex lists_mutex; /* protect lists */ struct list_head uverbs_file_list; struct list_head uverbs_events_file_list; - struct uverbs_root_spec *specs_root; + struct uverbs_api *uapi; }; struct ib_uverbs_event_queue { @@ -130,21 +130,37 @@ struct ib_uverbs_async_event_file { }; struct ib_uverbs_completion_event_file { - struct ib_uobject_file uobj_file; + struct ib_uobject uobj; struct ib_uverbs_event_queue ev_queue; }; struct ib_uverbs_file { struct kref ref; - struct mutex mutex; - struct mutex cleanup_mutex; /* protect cleanup */ struct ib_uverbs_device *device; + struct mutex ucontext_lock; + /* + * ucontext must be accessed via ib_uverbs_get_ucontext() or with + * ucontext_lock held + */ struct ib_ucontext *ucontext; struct ib_event_handler event_handler; struct ib_uverbs_async_event_file *async_file; struct list_head list; int is_closed; + /* + * To access the uobjects list hw_destroy_rwsem must be held for write + * OR hw_destroy_rwsem held for read AND uobjects_lock held. + * hw_destroy_rwsem should be called across any destruction of the HW + * object of an associated uobject. 
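The locking rule stated in this comment boils down to the access pattern below (sketch only, not part of the patch); the spin_lock_irq usage matches the list manipulation in rdma_alloc_commit_uobject earlier in the series:

static void example_walk_uobjects(struct ib_uverbs_file *ufile)
{
	struct ib_uobject *uobj;

	/* read side: shared hw_destroy_rwsem plus uobjects_lock;
	 * writers instead hold hw_destroy_rwsem for write */
	down_read(&ufile->hw_destroy_rwsem);
	spin_lock_irq(&ufile->uobjects_lock);
	list_for_each_entry(uobj, &ufile->uobjects, list) {
		/* ... inspect uobj; concurrent HW destruction is excluded ... */
	}
	spin_unlock_irq(&ufile->uobjects_lock);
	up_read(&ufile->hw_destroy_rwsem);
}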
+ */ + struct rw_semaphore hw_destroy_rwsem; + spinlock_t uobjects_lock; + struct list_head uobjects; + + u64 uverbs_cmd_mask; + u64 uverbs_ex_cmd_mask; + struct idr idr; /* spinlock protects write access to idr */ spinlock_t idr_lock; @@ -196,7 +212,6 @@ struct ib_uwq_object { struct ib_ucq_object { struct ib_uobject uobject; - struct ib_uverbs_file *uverbs_file; struct list_head comp_list; struct list_head async_list; u32 comp_events_reported; @@ -230,7 +245,7 @@ void ib_uverbs_wq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr); void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); -int ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd, +int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd, enum rdma_remove_reason why); int uverbs_dealloc_mw(struct ib_mw *mw); @@ -238,12 +253,7 @@ void ib_uverbs_detach_umcast(struct ib_qp *qp, struct ib_uqp_object *uobj); void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata); -extern const struct uverbs_attr_def uverbs_uhw_compat_in; -extern const struct uverbs_attr_def uverbs_uhw_compat_out; long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); -int uverbs_destroy_def_handler(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs); struct ib_uverbs_flow_spec { union { @@ -292,7 +302,6 @@ extern const struct uverbs_object_def UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS); #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ - struct ib_device *ib_dev, \ const char __user *buf, int in_len, \ int out_len) @@ -334,7 +343,6 @@ IB_UVERBS_DECLARE_CMD(close_xrcd); #define IB_UVERBS_DECLARE_EX_CMD(name) \ int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ - struct ib_device *ib_dev, \ struct ib_udata *ucore, \ struct ib_udata *uhw) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 583d3a10b940..a21d5214afc3 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -48,11 +48,10 @@ #include "core_priv.h" static struct ib_uverbs_completion_event_file * -ib_uverbs_lookup_comp_file(int fd, struct ib_ucontext *context) +_ib_uverbs_lookup_comp_file(s32 fd, struct ib_uverbs_file *ufile) { - struct ib_uobject *uobj = uobj_get_read(UVERBS_OBJECT_COMP_CHANNEL, - fd, context); - struct ib_uobject_file *uobj_file; + struct ib_uobject *uobj = ufd_get_read(UVERBS_OBJECT_COMP_CHANNEL, + fd, ufile); if (IS_ERR(uobj)) return (void *)uobj; @@ -60,13 +59,13 @@ ib_uverbs_lookup_comp_file(int fd, struct ib_ucontext *context) uverbs_uobject_get(uobj); uobj_put_read(uobj); - uobj_file = container_of(uobj, struct ib_uobject_file, uobj); - return container_of(uobj_file, struct ib_uverbs_completion_event_file, - uobj_file); + return container_of(uobj, struct ib_uverbs_completion_event_file, + uobj); } +#define ib_uverbs_lookup_comp_file(_fd, _ufile) \ + _ib_uverbs_lookup_comp_file((_fd)*typecheck(s32, _fd), _ufile) ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -76,6 +75,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, struct ib_ucontext *ucontext; struct file *filp; struct ib_rdmacg_object cg_obj; + struct ib_device *ib_dev; int ret; if (out_len < sizeof resp) @@ -84,7 +84,13 @@ ssize_t ib_uverbs_get_context(struct 
ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - mutex_lock(&file->mutex); + mutex_lock(&file->ucontext_lock); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + ret = -EIO; + goto err; + } if (file->ucontext) { ret = -EINVAL; @@ -110,12 +116,12 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->cg_obj = cg_obj; /* ufile is required when some objects are released */ ucontext->ufile = file; - uverbs_initialize_ucontext(ucontext); rcu_read_lock(); ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); rcu_read_unlock(); ucontext->closing = 0; + ucontext->cleanup_retryable = false; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING ucontext->umem_tree = RB_ROOT_CACHED; @@ -146,11 +152,15 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, goto err_file; } - file->ucontext = ucontext; - fd_install(resp.async_fd, filp); - mutex_unlock(&file->mutex); + /* + * Make sure that ib_uverbs_get_ucontext() sees the pointer update + * only after all writes to setup the ucontext have completed + */ + smp_store_release(&file->ucontext, ucontext); + + mutex_unlock(&file->ucontext_lock); return in_len; @@ -169,15 +179,16 @@ err_alloc: ib_rdmacg_uncharge(&cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); err: - mutex_unlock(&file->mutex); + mutex_unlock(&file->ucontext_lock); return ret; } -static void copy_query_dev_fields(struct ib_uverbs_file *file, - struct ib_device *ib_dev, +static void copy_query_dev_fields(struct ib_ucontext *ucontext, struct ib_uverbs_query_device_resp *resp, struct ib_device_attr *attr) { + struct ib_device *ib_dev = ucontext->device; + resp->fw_ver = attr->fw_ver; resp->node_guid = ib_dev->node_guid; resp->sys_image_guid = attr->sys_image_guid; @@ -189,7 +200,7 @@ static void copy_query_dev_fields(struct ib_uverbs_file *file, resp->max_qp = attr->max_qp; resp->max_qp_wr = attr->max_qp_wr; resp->device_cap_flags = lower_32_bits(attr->device_cap_flags); - resp->max_sge = attr->max_sge; + resp->max_sge = min(attr->max_send_sge, attr->max_recv_sge); resp->max_sge_rd = attr->max_sge_rd; resp->max_cq = attr->max_cq; resp->max_cqe = attr->max_cqe; @@ -221,12 +232,16 @@ static void copy_query_dev_fields(struct ib_uverbs_file *file, } ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_query_device cmd; struct ib_uverbs_query_device_resp resp; + struct ib_ucontext *ucontext; + + ucontext = ib_uverbs_get_ucontext(file); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); if (out_len < sizeof resp) return -ENOSPC; @@ -235,7 +250,7 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, return -EFAULT; memset(&resp, 0, sizeof resp); - copy_query_dev_fields(file, ib_dev, &resp, &ib_dev->attrs); + copy_query_dev_fields(ucontext, &resp, &ucontext->device->attrs); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) return -EFAULT; @@ -243,8 +258,28 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, return in_len; } +/* + * ib_uverbs_query_port_resp.port_cap_flags started out as just a copy of the + * PortInfo CapabilityMask, but was extended with unique bits. + */ +static u32 make_port_cap_flags(const struct ib_port_attr *attr) +{ + u32 res; + + /* All IBA CapabilityMask bits are passed through here, except bit 26, + * which is overridden with IP_BASED_GIDS. This is due to a historical + * mistake in the implementation of IP_BASED_GIDS. 
Otherwise all other + * bits match the IBA definition across all kernel versions. + */ + res = attr->port_cap_flags & ~(u32)IB_UVERBS_PCF_IP_BASED_GIDS; + + if (attr->ip_gids) + res |= IB_UVERBS_PCF_IP_BASED_GIDS; + + return res; +} + ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -252,6 +287,13 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, struct ib_uverbs_query_port_resp resp; struct ib_port_attr attr; int ret; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; + + ucontext = ib_uverbs_get_ucontext(file); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; if (out_len < sizeof resp) return -ENOSPC; @@ -269,12 +311,15 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.max_mtu = attr.max_mtu; resp.active_mtu = attr.active_mtu; resp.gid_tbl_len = attr.gid_tbl_len; - resp.port_cap_flags = attr.port_cap_flags; + resp.port_cap_flags = make_port_cap_flags(&attr); resp.max_msg_sz = attr.max_msg_sz; resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; + if (rdma_is_grh_required(ib_dev, cmd.port_num)) + resp.flags |= IB_UVERBS_QPF_GRH_REQUIRED; + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) { resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); resp.sm_lid = OPA_TO_IB_UCAST_LID(attr.sm_lid); @@ -300,7 +345,6 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, } ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -310,6 +354,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, struct ib_uobject *uobj; struct ib_pd *pd; int ret; + struct ib_device *ib_dev; if (out_len < sizeof resp) return -ENOSPC; @@ -322,11 +367,11 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof(resp)); - uobj = uobj_alloc(UVERBS_OBJECT_PD, file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_PD, file, &ib_dev); if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = ib_dev->alloc_pd(ib_dev, file->ucontext, &udata); + pd = ib_dev->alloc_pd(ib_dev, uobj->context, &udata); if (IS_ERR(pd)) { ret = PTR_ERR(pd); goto err; @@ -348,9 +393,7 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, goto err_copy; } - uobj_alloc_commit(uobj); - - return in_len; + return uobj_alloc_commit(uobj, in_len); err_copy: ib_dealloc_pd(pd); @@ -361,25 +404,16 @@ err: } ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_dealloc_pd cmd; - struct ib_uobject *uobj; - int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_get_write(UVERBS_OBJECT_PD, cmd.pd_handle, - file->ucontext); - if (IS_ERR(uobj)) - return PTR_ERR(uobj); - - ret = uobj_remove_commit(uobj); - - return ret ?: in_len; + return uobj_perform_destroy(UVERBS_OBJECT_PD, cmd.pd_handle, file, + in_len); } struct xrcd_table_entry { @@ -468,7 +502,6 @@ static void xrcd_table_delete(struct ib_uverbs_device *dev, } ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -481,6 +514,7 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, struct inode *inode = NULL; int ret = 0; int new_xrcd = 0; + struct ib_device *ib_dev; if (out_len < sizeof resp) return -ENOSPC; @@ -517,15 +551,15 @@ ssize_t 
ib_uverbs_open_xrcd(struct ib_uverbs_file *file, } } - obj = (struct ib_uxrcd_object *)uobj_alloc(UVERBS_OBJECT_XRCD, - file->ucontext); + obj = (struct ib_uxrcd_object *)uobj_alloc(UVERBS_OBJECT_XRCD, file, + &ib_dev); if (IS_ERR(obj)) { ret = PTR_ERR(obj); goto err_tree_mutex_unlock; } if (!xrcd) { - xrcd = ib_dev->alloc_xrcd(ib_dev, file->ucontext, &udata); + xrcd = ib_dev->alloc_xrcd(ib_dev, obj->uobject.context, &udata); if (IS_ERR(xrcd)) { ret = PTR_ERR(xrcd); goto err; @@ -564,9 +598,7 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file, mutex_unlock(&file->device->xrcd_tree_mutex); - uobj_alloc_commit(&obj->uobject); - - return in_len; + return uobj_alloc_commit(&obj->uobject, in_len); err_copy: if (inode) { @@ -591,32 +623,25 @@ err_tree_mutex_unlock: } ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_close_xrcd cmd; - struct ib_uobject *uobj; - int ret = 0; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_get_write(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, - file->ucontext); - if (IS_ERR(uobj)) - return PTR_ERR(uobj); - - ret = uobj_remove_commit(uobj); - return ret ?: in_len; + return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, file, + in_len); } -int ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, +int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd, enum rdma_remove_reason why) { struct inode *inode; int ret; + struct ib_uverbs_device *dev = uobject->context->ufile->device; inode = xrcd->inode; if (inode && !atomic_dec_and_test(&xrcd->usecnt)) @@ -624,16 +649,18 @@ int ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, ret = ib_dealloc_xrcd(xrcd); - if (why == RDMA_REMOVE_DESTROY && ret) + if (ib_is_destroy_retryable(ret, why, uobject)) { atomic_inc(&xrcd->usecnt); - else if (inode) + return ret; + } + + if (inode) xrcd_table_delete(dev, inode); return ret; } ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -644,6 +671,7 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, struct ib_pd *pd; struct ib_mr *mr; int ret; + struct ib_device *ib_dev; if (out_len < sizeof resp) return -ENOSPC; @@ -663,11 +691,11 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if (ret) return ret; - uobj = uobj_alloc(UVERBS_OBJECT_MR, file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_MR, file, &ib_dev); if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file); if (!pd) { ret = -EINVAL; goto err_free; @@ -711,9 +739,7 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, uobj_put_obj_read(pd); - uobj_alloc_commit(uobj); - - return in_len; + return uobj_alloc_commit(uobj, in_len); err_copy: ib_dereg_mr(mr); @@ -727,7 +753,6 @@ err_free: } ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -759,8 +784,7 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) return -EINVAL; - uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, - file->ucontext); + uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, file); if (IS_ERR(uobj)) return PTR_ERR(uobj); @@ -778,7 +802,8 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, } if (cmd.flags & IB_MR_REREG_PD) 
{ - pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, + file); if (!pd) { ret = -EINVAL; goto put_uobjs; @@ -819,29 +844,19 @@ put_uobjs: } ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_dereg_mr cmd; - struct ib_uobject *uobj; - int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_get_write(UVERBS_OBJECT_MR, cmd.mr_handle, - file->ucontext); - if (IS_ERR(uobj)) - return PTR_ERR(uobj); - - ret = uobj_remove_commit(uobj); - - return ret ?: in_len; + return uobj_perform_destroy(UVERBS_OBJECT_MR, cmd.mr_handle, file, + in_len); } ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -852,6 +867,7 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, struct ib_mw *mw; struct ib_udata udata; int ret; + struct ib_device *ib_dev; if (out_len < sizeof(resp)) return -ENOSPC; @@ -859,11 +875,11 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; - uobj = uobj_alloc(UVERBS_OBJECT_MW, file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_MW, file, &ib_dev); if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file); if (!pd) { ret = -EINVAL; goto err_free; @@ -897,9 +913,7 @@ ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file, } uobj_put_obj_read(pd); - uobj_alloc_commit(uobj); - - return in_len; + return uobj_alloc_commit(uobj, in_len); err_copy: uverbs_dealloc_mw(mw); @@ -911,28 +925,19 @@ err_free: } ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_dealloc_mw cmd; - struct ib_uobject *uobj; - int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof(cmd))) return -EFAULT; - uobj = uobj_get_write(UVERBS_OBJECT_MW, cmd.mw_handle, - file->ucontext); - if (IS_ERR(uobj)) - return PTR_ERR(uobj); - - ret = uobj_remove_commit(uobj); - return ret ?: in_len; + return uobj_perform_destroy(UVERBS_OBJECT_MW, cmd.mw_handle, file, + in_len); } ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -940,6 +945,7 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, struct ib_uverbs_create_comp_channel_resp resp; struct ib_uobject *uobj; struct ib_uverbs_completion_event_file *ev_file; + struct ib_device *ib_dev; if (out_len < sizeof resp) return -ENOSPC; @@ -947,14 +953,14 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_alloc(UVERBS_OBJECT_COMP_CHANNEL, file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_COMP_CHANNEL, file, &ib_dev); if (IS_ERR(uobj)) return PTR_ERR(uobj); resp.fd = uobj->id; ev_file = container_of(uobj, struct ib_uverbs_completion_event_file, - uobj_file.uobj); + uobj); ib_uverbs_init_event_queue(&ev_file->ev_queue); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) { @@ -962,12 +968,10 @@ ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file, return -EFAULT; } - uobj_alloc_commit(uobj); - return in_len; + return uobj_alloc_commit(uobj, in_len); } static 
struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw, struct ib_uverbs_ex_create_cq *cmd, @@ -985,21 +989,23 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, int ret; struct ib_uverbs_ex_create_cq_resp resp; struct ib_cq_init_attr attr = {}; - - if (!ib_dev->create_cq) - return ERR_PTR(-EOPNOTSUPP); + struct ib_device *ib_dev; if (cmd->comp_vector >= file->device->num_comp_vectors) return ERR_PTR(-EINVAL); - obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, - file->ucontext); + obj = (struct ib_ucq_object *)uobj_alloc(UVERBS_OBJECT_CQ, file, + &ib_dev); if (IS_ERR(obj)) return obj; + if (!ib_dev->create_cq) { + ret = -EOPNOTSUPP; + goto err; + } + if (cmd->comp_channel >= 0) { - ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, - file->ucontext); + ev_file = ib_uverbs_lookup_comp_file(cmd->comp_channel, file); if (IS_ERR(ev_file)) { ret = PTR_ERR(ev_file); goto err; @@ -1007,7 +1013,6 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, } obj->uobject.user_handle = cmd->user_handle; - obj->uverbs_file = file; obj->comp_events_reported = 0; obj->async_events_reported = 0; INIT_LIST_HEAD(&obj->comp_list); @@ -1019,7 +1024,7 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, if (cmd_sz > offsetof(typeof(*cmd), flags) + sizeof(cmd->flags)) attr.flags = cmd->flags; - cq = ib_dev->create_cq(ib_dev, &attr, file->ucontext, uhw); + cq = ib_dev->create_cq(ib_dev, &attr, obj->uobject.context, uhw); if (IS_ERR(cq)) { ret = PTR_ERR(cq); goto err_file; @@ -1047,7 +1052,9 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, if (ret) goto err_cb; - uobj_alloc_commit(&obj->uobject); + ret = uobj_alloc_commit(&obj->uobject, 0); + if (ret) + return ERR_PTR(ret); return obj; err_cb: @@ -1075,7 +1082,6 @@ static int ib_uverbs_create_cq_cb(struct ib_uverbs_file *file, } ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1106,7 +1112,7 @@ ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file, cmd_ex.comp_vector = cmd.comp_vector; cmd_ex.comp_channel = cmd.comp_channel; - obj = create_cq(file, ib_dev, &ucore, &uhw, &cmd_ex, + obj = create_cq(file, &ucore, &uhw, &cmd_ex, offsetof(typeof(cmd_ex), comp_channel) + sizeof(cmd.comp_channel), ib_uverbs_create_cq_cb, NULL); @@ -1129,7 +1135,6 @@ static int ib_uverbs_ex_create_cq_cb(struct ib_uverbs_file *file, } int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { @@ -1155,7 +1160,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file, sizeof(resp.response_length))) return -ENOSPC; - obj = create_cq(file, ib_dev, ucore, uhw, &cmd, + obj = create_cq(file, ucore, uhw, &cmd, min(ucore->inlen, sizeof(cmd)), ib_uverbs_ex_create_cq_cb, NULL); @@ -1163,7 +1168,6 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file, } ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1181,7 +1185,7 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof(resp)); - cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file); if (!cq) return -EINVAL; @@ -1231,7 +1235,6 @@ static int 
copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, } ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1246,7 +1249,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file); if (!cq) return -EINVAL; @@ -1262,7 +1265,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, if (!ret) break; - ret = copy_wc_to_user(ib_dev, data_ptr, &wc); + ret = copy_wc_to_user(cq->device, data_ptr, &wc); if (ret) goto out_put; @@ -1283,7 +1286,6 @@ out_put: } ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1293,7 +1295,7 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file); if (!cq) return -EINVAL; @@ -1306,45 +1308,28 @@ ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file, } ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_cq cmd; struct ib_uverbs_destroy_cq_resp resp; struct ib_uobject *uobj; - struct ib_cq *cq; struct ib_ucq_object *obj; - int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_get_write(UVERBS_OBJECT_CQ, cmd.cq_handle, - file->ucontext); + uobj = uobj_get_destroy(UVERBS_OBJECT_CQ, cmd.cq_handle, file); if (IS_ERR(uobj)) return PTR_ERR(uobj); - /* - * Make sure we don't free the memory in remove_commit as we still - * needs the uobject memory to create the response. 
- */ - uverbs_uobject_get(uobj); - cq = uobj->object; - obj = container_of(cq->uobject, struct ib_ucq_object, uobject); - + obj = container_of(uobj, struct ib_ucq_object, uobject); memset(&resp, 0, sizeof(resp)); - - ret = uobj_remove_commit(uobj); - if (ret) { - uverbs_uobject_put(uobj); - return ret; - } - resp.comp_events_reported = obj->comp_events_reported; resp.async_events_reported = obj->async_events_reported; - uverbs_uobject_put(uobj); + uobj_put_destroy(uobj); + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) return -EFAULT; @@ -1375,12 +1360,13 @@ static int create_qp(struct ib_uverbs_file *file, int ret; struct ib_rwq_ind_table *ind_tbl = NULL; bool has_sq = true; + struct ib_device *ib_dev; if (cmd->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) return -EPERM; - obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, - file->ucontext); + obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, file, + &ib_dev); if (IS_ERR(obj)) return PTR_ERR(obj); obj->uxrcd = NULL; @@ -1390,9 +1376,9 @@ static int create_qp(struct ib_uverbs_file *file, if (cmd_sz >= offsetof(typeof(*cmd), rwq_ind_tbl_handle) + sizeof(cmd->rwq_ind_tbl_handle) && (cmd->comp_mask & IB_UVERBS_CREATE_QP_MASK_IND_TABLE)) { - ind_tbl = uobj_get_obj_read(rwq_ind_table, UVERBS_OBJECT_RWQ_IND_TBL, - cmd->rwq_ind_tbl_handle, - file->ucontext); + ind_tbl = uobj_get_obj_read(rwq_ind_table, + UVERBS_OBJECT_RWQ_IND_TBL, + cmd->rwq_ind_tbl_handle, file); if (!ind_tbl) { ret = -EINVAL; goto err_put; @@ -1418,7 +1404,7 @@ static int create_qp(struct ib_uverbs_file *file, if (cmd->qp_type == IB_QPT_XRC_TGT) { xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->pd_handle, - file->ucontext); + file); if (IS_ERR(xrcd_uobj)) { ret = -EINVAL; @@ -1437,8 +1423,8 @@ static int create_qp(struct ib_uverbs_file *file, cmd->max_recv_sge = 0; } else { if (cmd->is_srq) { - srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd->srq_handle, - file->ucontext); + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, + cmd->srq_handle, file); if (!srq || srq->srq_type == IB_SRQT_XRC) { ret = -EINVAL; goto err_put; @@ -1447,8 +1433,9 @@ static int create_qp(struct ib_uverbs_file *file, if (!ind_tbl) { if (cmd->recv_cq_handle != cmd->send_cq_handle) { - rcq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->recv_cq_handle, - file->ucontext); + rcq = uobj_get_obj_read( + cq, UVERBS_OBJECT_CQ, + cmd->recv_cq_handle, file); if (!rcq) { ret = -EINVAL; goto err_put; @@ -1458,11 +1445,12 @@ static int create_qp(struct ib_uverbs_file *file, } if (has_sq) - scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->send_cq_handle, - file->ucontext); + scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, + cmd->send_cq_handle, file); if (!ind_tbl) rcq = rcq ?: scq; - pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, + file); if (!pd || (!scq && has_sq)) { ret = -EINVAL; goto err_put; @@ -1602,9 +1590,7 @@ static int create_qp(struct ib_uverbs_file *file, if (ind_tbl) uobj_put_obj_read(ind_tbl); - uobj_alloc_commit(&obj->uevent.uobject); - - return 0; + return uobj_alloc_commit(&obj->uevent.uobject, 0); err_cb: ib_destroy_qp(qp); @@ -1637,7 +1623,6 @@ static int ib_uverbs_create_qp_cb(struct ib_uverbs_file *file, } ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1698,7 +1683,6 @@ static int ib_uverbs_ex_create_qp_cb(struct ib_uverbs_file *file, } int 
ib_uverbs_ex_create_qp(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { @@ -1735,7 +1719,6 @@ int ib_uverbs_ex_create_qp(struct ib_uverbs_file *file, } ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_open_qp cmd; @@ -1747,6 +1730,7 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, struct ib_qp *qp; struct ib_qp_open_attr attr; int ret; + struct ib_device *ib_dev; if (out_len < sizeof resp) return -ENOSPC; @@ -1759,13 +1743,12 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof(resp)); - obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, - file->ucontext); + obj = (struct ib_uqp_object *)uobj_alloc(UVERBS_OBJECT_QP, file, + &ib_dev); if (IS_ERR(obj)) return PTR_ERR(obj); - xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd.pd_handle, - file->ucontext); + xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd.pd_handle, file); if (IS_ERR(xrcd_uobj)) { ret = -EINVAL; goto err_put; @@ -1809,10 +1792,7 @@ ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file, qp->uobject = &obj->uevent.uobject; uobj_put_read(xrcd_uobj); - - uobj_alloc_commit(&obj->uevent.uobject); - - return in_len; + return uobj_alloc_commit(&obj->uevent.uobject, in_len); err_destroy: ib_destroy_qp(qp); @@ -1846,7 +1826,6 @@ static void copy_ah_attr_to_uverbs(struct ib_uverbs_qp_dest *uverb_attr, } ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -1867,7 +1846,7 @@ ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file, goto out; } - qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file); if (!qp) { ret = -EINVAL; goto out; @@ -1968,11 +1947,11 @@ static int modify_qp(struct ib_uverbs_file *file, struct ib_qp *qp; int ret; - attr = kmalloc(sizeof *attr, GFP_KERNEL); + attr = kzalloc(sizeof(*attr), GFP_KERNEL); if (!attr) return -ENOMEM; - qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle, file); if (!qp) { ret = -EINVAL; goto out; @@ -2098,7 +2077,6 @@ out: } ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2125,7 +2103,6 @@ ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file, } int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { @@ -2161,7 +2138,6 @@ int ib_uverbs_ex_modify_qp(struct ib_uverbs_file *file, } ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2169,33 +2145,19 @@ ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file, struct ib_uverbs_destroy_qp_resp resp; struct ib_uobject *uobj; struct ib_uqp_object *obj; - int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - memset(&resp, 0, sizeof resp); - - uobj = uobj_get_write(UVERBS_OBJECT_QP, cmd.qp_handle, - file->ucontext); + uobj = uobj_get_destroy(UVERBS_OBJECT_QP, cmd.qp_handle, file); if (IS_ERR(uobj)) return PTR_ERR(uobj); obj = container_of(uobj, struct ib_uqp_object, uevent.uobject); - /* - * Make sure we don't free the memory in remove_commit as we still 
- * needs the uobject memory to create the response. - */ - uverbs_uobject_get(uobj); - - ret = uobj_remove_commit(uobj); - if (ret) { - uverbs_uobject_put(uobj); - return ret; - } - + memset(&resp, 0, sizeof(resp)); resp.events_reported = obj->uevent.events_reported; - uverbs_uobject_put(uobj); + + uobj_put_destroy(uobj); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) return -EFAULT; @@ -2214,14 +2176,14 @@ static void *alloc_wr(size_t wr_size, __u32 num_sge) } ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_post_send cmd; struct ib_uverbs_post_send_resp resp; struct ib_uverbs_send_wr *user_wr; - struct ib_send_wr *wr = NULL, *last, *next, *bad_wr; + struct ib_send_wr *wr = NULL, *last, *next; + const struct ib_send_wr *bad_wr; struct ib_qp *qp; int i, sg_ind; int is_ud; @@ -2242,7 +2204,7 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, if (!user_wr) return -ENOMEM; - qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file); if (!qp) goto out; @@ -2278,8 +2240,8 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, goto out_put; } - ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH, user_wr->wr.ud.ah, - file->ucontext); + ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH, + user_wr->wr.ud.ah, file); if (!ud->ah) { kfree(ud); ret = -EINVAL; @@ -2494,13 +2456,13 @@ err: } ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_post_recv cmd; struct ib_uverbs_post_recv_resp resp; - struct ib_recv_wr *wr, *next, *bad_wr; + struct ib_recv_wr *wr, *next; + const struct ib_recv_wr *bad_wr; struct ib_qp *qp; ssize_t ret = -EINVAL; @@ -2513,7 +2475,7 @@ ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file, if (IS_ERR(wr)) return PTR_ERR(wr); - qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file); if (!qp) goto out; @@ -2543,13 +2505,13 @@ out: } ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_post_srq_recv cmd; struct ib_uverbs_post_srq_recv_resp resp; - struct ib_recv_wr *wr, *next, *bad_wr; + struct ib_recv_wr *wr, *next; + const struct ib_recv_wr *bad_wr; struct ib_srq *srq; ssize_t ret = -EINVAL; @@ -2562,12 +2524,13 @@ ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file, if (IS_ERR(wr)) return PTR_ERR(wr); - srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file->ucontext); + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file); if (!srq) goto out; resp.bad_wr = 0; - ret = srq->device->post_srq_recv(srq, wr, &bad_wr); + ret = srq->device->post_srq_recv ? 
+ srq->device->post_srq_recv(srq, wr, &bad_wr) : -EOPNOTSUPP; uobj_put_obj_read(srq); @@ -2592,7 +2555,6 @@ out: } ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2601,9 +2563,10 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, struct ib_uobject *uobj; struct ib_pd *pd; struct ib_ah *ah; - struct rdma_ah_attr attr; + struct rdma_ah_attr attr = {}; int ret; struct ib_udata udata; + struct ib_device *ib_dev; if (out_len < sizeof resp) return -ENOSPC; @@ -2611,19 +2574,21 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - if (!rdma_is_port_valid(ib_dev, cmd.attr.port_num)) - return -EINVAL; - ib_uverbs_init_udata(&udata, buf + sizeof(cmd), u64_to_user_ptr(cmd.response) + sizeof(resp), in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof(resp)); - uobj = uobj_alloc(UVERBS_OBJECT_AH, file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_AH, file, &ib_dev); if (IS_ERR(uobj)) return PTR_ERR(uobj); - pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); + if (!rdma_is_port_valid(ib_dev, cmd.attr.port_num)) { + ret = -EINVAL; + goto err; + } + + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file); if (!pd) { ret = -EINVAL; goto err; @@ -2665,9 +2630,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, } uobj_put_obj_read(pd); - uobj_alloc_commit(uobj); - - return in_len; + return uobj_alloc_commit(uobj, in_len); err_copy: rdma_destroy_ah(ah); @@ -2681,27 +2644,18 @@ err: } ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_destroy_ah cmd; - struct ib_uobject *uobj; - int ret; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_get_write(UVERBS_OBJECT_AH, cmd.ah_handle, - file->ucontext); - if (IS_ERR(uobj)) - return PTR_ERR(uobj); - - ret = uobj_remove_commit(uobj); - return ret ?: in_len; + return uobj_perform_destroy(UVERBS_OBJECT_AH, cmd.ah_handle, file, + in_len); } ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2714,7 +2668,7 @@ ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file); if (!qp) return -EINVAL; @@ -2751,7 +2705,6 @@ out_put: } ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -2765,7 +2718,7 @@ ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file); if (!qp) return -EINVAL; @@ -2810,29 +2763,27 @@ static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) resources = kzalloc(sizeof(*resources), GFP_KERNEL); if (!resources) - goto err_res; + return NULL; + + if (!num_specs) + goto out; resources->counters = kcalloc(num_specs, sizeof(*resources->counters), GFP_KERNEL); - - if (!resources->counters) - goto err_cnt; - resources->collection = kcalloc(num_specs, sizeof(*resources->collection), GFP_KERNEL); - if 
(!resources->collection) - goto err_collection; + if (!resources->counters || !resources->collection) + goto err; +out: resources->max = num_specs; - return resources; -err_collection: +err: kfree(resources->counters); -err_cnt: kfree(resources); -err_res: + return NULL; } @@ -2840,6 +2791,9 @@ void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) { unsigned int i; + if (!uflow_res) + return; + for (i = 0; i < uflow_res->collection_num; i++) atomic_dec(&uflow_res->collection[i]->usecnt); @@ -2875,7 +2829,7 @@ static void flow_resources_add(struct ib_uflow_resources *uflow_res, uflow_res->num++; } -static int kern_spec_to_ib_spec_action(struct ib_ucontext *ucontext, +static int kern_spec_to_ib_spec_action(struct ib_uverbs_file *ufile, struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec, struct ib_uflow_resources *uflow_res) @@ -2904,7 +2858,7 @@ static int kern_spec_to_ib_spec_action(struct ib_ucontext *ucontext, ib_spec->action.act = uobj_get_obj_read(flow_action, UVERBS_OBJECT_FLOW_ACTION, kern_spec->action.handle, - ucontext); + ufile); if (!ib_spec->action.act) return -EINVAL; ib_spec->action.size = @@ -2922,7 +2876,7 @@ static int kern_spec_to_ib_spec_action(struct ib_ucontext *ucontext, uobj_get_obj_read(counters, UVERBS_OBJECT_COUNTERS, kern_spec->flow_count.handle, - ucontext); + ufile); if (!ib_spec->flow_count.counters) return -EINVAL; ib_spec->flow_count.size = @@ -3091,9 +3045,6 @@ static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec, void *kern_spec_mask; void *kern_spec_val; - if (kern_spec->reserved) - return -EINVAL; - kern_filter_sz = kern_spec_filter_sz(&kern_spec->hdr); kern_spec_val = (void *)kern_spec + @@ -3106,7 +3057,7 @@ static int kern_spec_to_ib_spec_filter(struct ib_uverbs_flow_spec *kern_spec, kern_filter_sz, ib_spec); } -static int kern_spec_to_ib_spec(struct ib_ucontext *ucontext, +static int kern_spec_to_ib_spec(struct ib_uverbs_file *ufile, struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec, struct ib_uflow_resources *uflow_res) @@ -3115,14 +3066,13 @@ static int kern_spec_to_ib_spec(struct ib_ucontext *ucontext, return -EINVAL; if (kern_spec->type >= IB_FLOW_SPEC_ACTION_TAG) - return kern_spec_to_ib_spec_action(ucontext, kern_spec, ib_spec, + return kern_spec_to_ib_spec_action(ufile, kern_spec, ib_spec, uflow_res); else return kern_spec_to_ib_spec_filter(kern_spec, ib_spec); } int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { @@ -3136,6 +3086,7 @@ int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, struct ib_wq_init_attr wq_init_attr = {}; size_t required_cmd_sz; size_t required_resp_len; + struct ib_device *ib_dev; required_cmd_sz = offsetof(typeof(cmd), max_sge) + sizeof(cmd.max_sge); required_resp_len = offsetof(typeof(resp), wqn) + sizeof(resp.wqn); @@ -3158,18 +3109,18 @@ int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, if (cmd.comp_mask) return -EOPNOTSUPP; - obj = (struct ib_uwq_object *)uobj_alloc(UVERBS_OBJECT_WQ, - file->ucontext); + obj = (struct ib_uwq_object *)uobj_alloc(UVERBS_OBJECT_WQ, file, + &ib_dev); if (IS_ERR(obj)) return PTR_ERR(obj); - pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, file); if (!pd) { err = -EINVAL; goto err_uobj; } - cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file); if 
(!cq) { err = -EINVAL; goto err_put_pd; @@ -3223,8 +3174,7 @@ int ib_uverbs_ex_create_wq(struct ib_uverbs_file *file, uobj_put_obj_read(pd); uobj_put_obj_read(cq); - uobj_alloc_commit(&obj->uevent.uobject); - return 0; + return uobj_alloc_commit(&obj->uevent.uobject, 0); err_copy: ib_destroy_wq(wq); @@ -3239,7 +3189,6 @@ err_uobj: } int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { @@ -3273,29 +3222,19 @@ int ib_uverbs_ex_destroy_wq(struct ib_uverbs_file *file, return -EOPNOTSUPP; resp.response_length = required_resp_len; - uobj = uobj_get_write(UVERBS_OBJECT_WQ, cmd.wq_handle, - file->ucontext); + uobj = uobj_get_destroy(UVERBS_OBJECT_WQ, cmd.wq_handle, file); if (IS_ERR(uobj)) return PTR_ERR(uobj); obj = container_of(uobj, struct ib_uwq_object, uevent.uobject); - /* - * Make sure we don't free the memory in remove_commit as we still - * needs the uobject memory to create the response. - */ - uverbs_uobject_get(uobj); - - ret = uobj_remove_commit(uobj); resp.events_reported = obj->uevent.events_reported; - uverbs_uobject_put(uobj); - if (ret) - return ret; + + uobj_put_destroy(uobj); return ib_copy_to_udata(ucore, &resp, resp.response_length); } int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { @@ -3324,7 +3263,7 @@ int ib_uverbs_ex_modify_wq(struct ib_uverbs_file *file, if (cmd.attr_mask > (IB_WQ_STATE | IB_WQ_CUR_STATE | IB_WQ_FLAGS)) return -EINVAL; - wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, file->ucontext); + wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, file); if (!wq) return -EINVAL; @@ -3345,7 +3284,6 @@ out: } int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { @@ -3363,6 +3301,7 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, u32 expected_in_size; size_t required_cmd_sz_header; size_t required_resp_len; + struct ib_device *ib_dev; required_cmd_sz_header = offsetof(typeof(cmd), log_ind_tbl_size) + sizeof(cmd.log_ind_tbl_size); required_resp_len = offsetof(typeof(resp), ind_tbl_num) + sizeof(resp.ind_tbl_num); @@ -3418,8 +3357,8 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, for (num_read_wqs = 0; num_read_wqs < num_wq_handles; num_read_wqs++) { - wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, wqs_handles[num_read_wqs], - file->ucontext); + wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, + wqs_handles[num_read_wqs], file); if (!wq) { err = -EINVAL; goto put_wqs; @@ -3428,7 +3367,7 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, wqs[num_read_wqs] = wq; } - uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_RWQ_IND_TBL, file, &ib_dev); if (IS_ERR(uobj)) { err = PTR_ERR(uobj); goto put_wqs; @@ -3472,8 +3411,7 @@ int ib_uverbs_ex_create_rwq_ind_table(struct ib_uverbs_file *file, for (j = 0; j < num_read_wqs; j++) uobj_put_obj_read(wqs[j]); - uobj_alloc_commit(uobj); - return 0; + return uobj_alloc_commit(uobj, 0); err_copy: ib_destroy_rwq_ind_table(rwq_ind_tbl); @@ -3489,12 +3427,10 @@ err_free: } int ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_ex_destroy_rwq_ind_table cmd = {}; - struct ib_uobject *uobj; int ret; size_t required_cmd_sz; @@ -3515,16 +3451,11 @@ int 
ib_uverbs_ex_destroy_rwq_ind_table(struct ib_uverbs_file *file, if (cmd.comp_mask) return -EOPNOTSUPP; - uobj = uobj_get_write(UVERBS_OBJECT_RWQ_IND_TBL, cmd.ind_tbl_handle, - file->ucontext); - if (IS_ERR(uobj)) - return PTR_ERR(uobj); - - return uobj_remove_commit(uobj); + return uobj_perform_destroy(UVERBS_OBJECT_RWQ_IND_TBL, + cmd.ind_tbl_handle, file, 0); } int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { @@ -3541,6 +3472,7 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, int err = 0; void *ib_spec; int i; + struct ib_device *ib_dev; if (ucore->inlen < sizeof(cmd)) return -EINVAL; @@ -3596,13 +3528,13 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, kern_flow_attr = &cmd.flow_attr; } - uobj = uobj_alloc(UVERBS_OBJECT_FLOW, file->ucontext); + uobj = uobj_alloc(UVERBS_OBJECT_FLOW, file, &ib_dev); if (IS_ERR(uobj)) { err = PTR_ERR(uobj); goto err_free_attr; } - qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file->ucontext); + qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, file); if (!qp) { err = -EINVAL; goto err_uobj; @@ -3613,6 +3545,11 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, goto err_put; } + if (!qp->device->create_flow) { + err = -EOPNOTSUPP; + goto err_put; + } + flow_attr = kzalloc(struct_size(flow_attr, flows, cmd.flow_attr.num_of_specs), GFP_KERNEL); if (!flow_attr) { @@ -3639,7 +3576,7 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, cmd.flow_attr.size >= kern_spec->size; i++) { err = kern_spec_to_ib_spec( - file->ucontext, (struct ib_uverbs_flow_spec *)kern_spec, + file, (struct ib_uverbs_flow_spec *)kern_spec, ib_spec, uflow_res); if (err) goto err_free; @@ -3666,6 +3603,7 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, } atomic_inc(&qp->usecnt); flow_id->qp = qp; + flow_id->device = qp->device; flow_id->uobject = uobj; uobj->object = flow_id; uflow = container_of(uobj, typeof(*uflow), uobject); @@ -3680,13 +3618,13 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, goto err_copy; uobj_put_obj_read(qp); - uobj_alloc_commit(uobj); kfree(flow_attr); if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); - return 0; + return uobj_alloc_commit(uobj, 0); err_copy: - ib_destroy_flow(flow_id); + if (!qp->device->destroy_flow(flow_id)) + atomic_dec(&qp->usecnt); err_free: ib_uverbs_flow_resources_free(uflow_res); err_free_flow_attr: @@ -3702,12 +3640,10 @@ err_free_attr: } int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_destroy_flow cmd; - struct ib_uobject *uobj; int ret; if (ucore->inlen < sizeof(cmd)) @@ -3720,17 +3656,11 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, if (cmd.comp_mask) return -EINVAL; - uobj = uobj_get_write(UVERBS_OBJECT_FLOW, cmd.flow_handle, - file->ucontext); - if (IS_ERR(uobj)) - return PTR_ERR(uobj); - - ret = uobj_remove_commit(uobj); - return ret; + return uobj_perform_destroy(UVERBS_OBJECT_FLOW, cmd.flow_handle, file, + 0); } static int __uverbs_create_xsrq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_uverbs_create_xsrq *cmd, struct ib_udata *udata) { @@ -3741,9 +3671,10 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, struct ib_uobject *uninitialized_var(xrcd_uobj); struct ib_srq_init_attr attr; int ret; + struct ib_device *ib_dev; - obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, - file->ucontext); 
+ obj = (struct ib_usrq_object *)uobj_alloc(UVERBS_OBJECT_SRQ, file, + &ib_dev); if (IS_ERR(obj)) return PTR_ERR(obj); @@ -3752,7 +3683,7 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, if (cmd->srq_type == IB_SRQT_XRC) { xrcd_uobj = uobj_get_read(UVERBS_OBJECT_XRCD, cmd->xrcd_handle, - file->ucontext); + file); if (IS_ERR(xrcd_uobj)) { ret = -EINVAL; goto err; @@ -3769,15 +3700,15 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, } if (ib_srq_has_cq(cmd->srq_type)) { - attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->cq_handle, - file->ucontext); + attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, + cmd->cq_handle, file); if (!attr.ext.cq) { ret = -EINVAL; goto err_put_xrcd; } } - pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, file->ucontext); + pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, file); if (!pd) { ret = -EINVAL; goto err_put_cq; @@ -3842,9 +3773,7 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, uobj_put_obj_read(attr.ext.cq); uobj_put_obj_read(pd); - uobj_alloc_commit(&obj->uevent.uobject); - - return 0; + return uobj_alloc_commit(&obj->uevent.uobject, 0); err_copy: ib_destroy_srq(srq); @@ -3868,7 +3797,6 @@ err: } ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -3898,7 +3826,7 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof(resp)); - ret = __uverbs_create_xsrq(file, ib_dev, &xcmd, &udata); + ret = __uverbs_create_xsrq(file, &xcmd, &udata); if (ret) return ret; @@ -3906,7 +3834,6 @@ ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file, } ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { struct ib_uverbs_create_xsrq cmd; @@ -3925,7 +3852,7 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, in_len - sizeof(cmd) - sizeof(struct ib_uverbs_cmd_hdr), out_len - sizeof(resp)); - ret = __uverbs_create_xsrq(file, ib_dev, &cmd, &udata); + ret = __uverbs_create_xsrq(file, &cmd, &udata); if (ret) return ret; @@ -3933,7 +3860,6 @@ ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file, } ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -3949,7 +3875,7 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, ib_uverbs_init_udata(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd, out_len); - srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file->ucontext); + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file); if (!srq) return -EINVAL; @@ -3964,7 +3890,6 @@ ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file, } ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -3980,7 +3905,7 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file->ucontext); + srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, file); if (!srq) return -EINVAL; @@ -4004,7 +3929,6 @@ ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file, } ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) { @@ -4012,32 +3936,20 
@@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, struct ib_uverbs_destroy_srq_resp resp; struct ib_uobject *uobj; struct ib_uevent_object *obj; - int ret = -EINVAL; if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - uobj = uobj_get_write(UVERBS_OBJECT_SRQ, cmd.srq_handle, - file->ucontext); + uobj = uobj_get_destroy(UVERBS_OBJECT_SRQ, cmd.srq_handle, file); if (IS_ERR(uobj)) return PTR_ERR(uobj); obj = container_of(uobj, struct ib_uevent_object, uobject); - /* - * Make sure we don't free the memory in remove_commit as we still - * needs the uobject memory to create the response. - */ - uverbs_uobject_get(uobj); - memset(&resp, 0, sizeof(resp)); - - ret = uobj_remove_commit(uobj); - if (ret) { - uverbs_uobject_put(uobj); - return ret; - } resp.events_reported = obj->events_reported; - uverbs_uobject_put(uobj); + + uobj_put_destroy(uobj); + if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) return -EFAULT; @@ -4045,15 +3957,21 @@ ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file, } int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { struct ib_uverbs_ex_query_device_resp resp = { {0} }; struct ib_uverbs_ex_query_device cmd; struct ib_device_attr attr = {0}; + struct ib_ucontext *ucontext; + struct ib_device *ib_dev; int err; + ucontext = ib_uverbs_get_ucontext(file); + if (IS_ERR(ucontext)) + return PTR_ERR(ucontext); + ib_dev = ucontext->device; + if (!ib_dev->query_device) return -EOPNOTSUPP; @@ -4079,7 +3997,7 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, if (err) return err; - copy_query_dev_fields(file, ib_dev, &resp.base, &attr); + copy_query_dev_fields(ucontext, &resp.base, &attr); if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps)) goto end; @@ -4166,7 +4084,6 @@ end: } int ib_uverbs_ex_modify_cq(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) { @@ -4196,7 +4113,7 @@ int ib_uverbs_ex_modify_cq(struct ib_uverbs_file *file, if (cmd.attr_mask > IB_CQ_MODERATE) return -EOPNOTSUPP; - cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file->ucontext); + cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, file); if (!cq) return -EINVAL; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 8d32c4ae368c..1a6b229e3db3 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -35,6 +35,103 @@ #include "rdma_core.h" #include "uverbs.h" +struct bundle_alloc_head { + struct bundle_alloc_head *next; + u8 data[]; +}; + +struct bundle_priv { + /* Must be first */ + struct bundle_alloc_head alloc_head; + struct bundle_alloc_head *allocated_mem; + size_t internal_avail; + size_t internal_used; + + struct radix_tree_root *radix; + const struct uverbs_api_ioctl_method *method_elm; + void __rcu **radix_slots; + unsigned long radix_slots_len; + u32 method_key; + + struct ib_uverbs_attr __user *user_attrs; + struct ib_uverbs_attr *uattrs; + + DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN); + + /* + * Must be last. bundle ends in a flex array which overlaps + * internal_buffer. + */ + struct uverbs_attr_bundle bundle; + u64 internal_buffer[32]; +}; + +/* + * Each method has an absolute minimum amount of memory it needs to allocate, + * precompute that amount and determine if the onstack memory can be used or + * if allocation is need. 
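+ *
+ * Illustrative numbers (editor's note, not part of this patch): for a
+ * hypothetical method with key_bitmap_len == 8 and num_attrs == 4 the
+ * minimum works out to
+ *
+ *	offsetof(struct bundle_priv, internal_buffer)
+ *		+ 8 * sizeof(struct uverbs_attr)
+ *		+ 4 * sizeof(struct ib_uverbs_attr)
+ *
+ * bytes; when that fits inside sizeof(struct bundle_priv) the on-stack
+ * bundle is used, otherwise bundle_size bytes are allocated when the
+ * ioctl runs.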
+ */ +void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, + unsigned int num_attrs) +{ + struct bundle_priv *pbundle; + size_t bundle_size = + offsetof(struct bundle_priv, internal_buffer) + + sizeof(*pbundle->bundle.attrs) * method_elm->key_bitmap_len + + sizeof(*pbundle->uattrs) * num_attrs; + + method_elm->use_stack = bundle_size <= sizeof(*pbundle); + method_elm->bundle_size = + ALIGN(bundle_size + 256, sizeof(*pbundle->internal_buffer)); + + /* Do not want order-2 allocations for this. */ + WARN_ON_ONCE(method_elm->bundle_size > PAGE_SIZE); +} + +/** + * uverbs_alloc() - Quickly allocate memory for use with a bundle + * @bundle: The bundle + * @size: Number of bytes to allocate + * @flags: Allocator flags + * + * The bundle allocator is intended for allocations that are connected with + * processing the system call related to the bundle. The allocated memory is + * always freed once the system call completes, and cannot be freed any other + * way. + * + * This tries to use a small pool of pre-allocated memory for performance. + */ +__malloc void *_uverbs_alloc(struct uverbs_attr_bundle *bundle, size_t size, + gfp_t flags) +{ + struct bundle_priv *pbundle = + container_of(bundle, struct bundle_priv, bundle); + size_t new_used; + void *res; + + if (check_add_overflow(size, pbundle->internal_used, &new_used)) + return ERR_PTR(-EOVERFLOW); + + if (new_used > pbundle->internal_avail) { + struct bundle_alloc_head *buf; + + buf = kvmalloc(struct_size(buf, data, size), flags); + if (!buf) + return ERR_PTR(-ENOMEM); + buf->next = pbundle->allocated_mem; + pbundle->allocated_mem = buf; + return buf->data; + } + + res = (void *)pbundle->internal_buffer + pbundle->internal_used; + pbundle->internal_used = + ALIGN(new_used, sizeof(*pbundle->internal_buffer)); + if (flags & __GFP_ZERO) + memset(res, 0, size); + return res; +} +EXPORT_SYMBOL(_uverbs_alloc); + static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, u16 len) { @@ -46,45 +143,24 @@ static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, 0, uattr->len - len); } -static int uverbs_process_attr(struct ib_device *ibdev, - struct ib_ucontext *ucontext, - const struct ib_uverbs_attr *uattr, - u16 attr_id, - const struct uverbs_attr_spec_hash *attr_spec_bucket, - struct uverbs_attr_bundle_hash *attr_bundle_h, - struct ib_uverbs_attr __user *uattr_ptr) +static int uverbs_process_attr(struct bundle_priv *pbundle, + const struct uverbs_api_attr *attr_uapi, + struct ib_uverbs_attr *uattr, u32 attr_bkey) { - const struct uverbs_attr_spec *spec; - const struct uverbs_attr_spec *val_spec; - struct uverbs_attr *e; - const struct uverbs_object_spec *object; + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + struct uverbs_attr *e = &pbundle->bundle.attrs[attr_bkey]; + const struct uverbs_attr_spec *val_spec = spec; struct uverbs_obj_attr *o_attr; - struct uverbs_attr *elements = attr_bundle_h->attrs; - - if (attr_id >= attr_spec_bucket->num_attrs) { - if (uattr->flags & UVERBS_ATTR_F_MANDATORY) - return -EINVAL; - else - return 0; - } - - if (test_bit(attr_id, attr_bundle_h->valid_bitmap)) - return -EINVAL; - - spec = &attr_spec_bucket->attrs[attr_id]; - val_spec = spec; - e = &elements[attr_id]; - e->uattr = uattr_ptr; switch (spec->type) { case UVERBS_ATTR_TYPE_ENUM_IN: - if (uattr->attr_data.enum_data.elem_id >= spec->enum_def.num_elems) + if (uattr->attr_data.enum_data.elem_id >= spec->u.enum_def.num_elems) return -EOPNOTSUPP; if (uattr->attr_data.enum_data.reserved) return -EINVAL; - 
val_spec = &spec->enum_def.ids[uattr->attr_data.enum_data.elem_id]; + val_spec = &spec->u2.enum_def.ids[uattr->attr_data.enum_data.elem_id]; /* Currently we only support PTR_IN based enums */ if (val_spec->type != UVERBS_ATTR_TYPE_PTR_IN) @@ -98,64 +174,75 @@ static int uverbs_process_attr(struct ib_device *ibdev, * longer struct will fail here if used with an old kernel and * non-zero content, making ABI compat/discovery simpler. */ - if (uattr->len > val_spec->ptr.len && - val_spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO && - !uverbs_is_attr_cleared(uattr, val_spec->ptr.len)) + if (uattr->len > val_spec->u.ptr.len && + val_spec->zero_trailing && + !uverbs_is_attr_cleared(uattr, val_spec->u.ptr.len)) return -EOPNOTSUPP; /* fall through */ case UVERBS_ATTR_TYPE_PTR_OUT: - if (uattr->len < val_spec->ptr.min_len || - (!(val_spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO) && - uattr->len > val_spec->ptr.len)) + if (uattr->len < val_spec->u.ptr.min_len || + (!val_spec->zero_trailing && + uattr->len > val_spec->u.ptr.len)) return -EINVAL; if (spec->type != UVERBS_ATTR_TYPE_ENUM_IN && uattr->attr_data.reserved) return -EINVAL; - e->ptr_attr.data = uattr->data; + e->ptr_attr.uattr_idx = uattr - pbundle->uattrs; e->ptr_attr.len = uattr->len; - e->ptr_attr.flags = uattr->flags; + + if (val_spec->alloc_and_copy && !uverbs_attr_ptr_is_inline(e)) { + void *p; + + p = uverbs_alloc(&pbundle->bundle, uattr->len); + if (IS_ERR(p)) + return PTR_ERR(p); + + e->ptr_attr.ptr = p; + + if (copy_from_user(p, u64_to_user_ptr(uattr->data), + uattr->len)) + return -EFAULT; + } else { + e->ptr_attr.data = uattr->data; + } break; case UVERBS_ATTR_TYPE_IDR: - if (uattr->data >> 32) - return -EINVAL; - /* fall through */ case UVERBS_ATTR_TYPE_FD: if (uattr->attr_data.reserved) return -EINVAL; - if (uattr->len != 0 || !ucontext || uattr->data > INT_MAX) + if (uattr->len != 0) return -EINVAL; o_attr = &e->obj_attr; - object = uverbs_get_object(ibdev, spec->obj.obj_type); - if (!object) - return -EINVAL; - o_attr->type = object->type_attrs; - - o_attr->id = (int)uattr->data; - o_attr->uobject = uverbs_get_uobject_from_context( - o_attr->type, - ucontext, - spec->obj.access, - o_attr->id); + o_attr->attr_elm = attr_uapi; + /* + * The type of uattr->data is u64 for UVERBS_ATTR_TYPE_IDR and + * s64 for UVERBS_ATTR_TYPE_FD. 
We can cast the u64 to s64 + * here without caring about truncation as we know that the + * IDR implementation today rejects negative IDs + */ + o_attr->uobject = uverbs_get_uobject_from_file( + spec->u.obj.obj_type, + pbundle->bundle.ufile, + spec->u.obj.access, + uattr->data_s64); if (IS_ERR(o_attr->uobject)) return PTR_ERR(o_attr->uobject); + __set_bit(attr_bkey, pbundle->uobj_finalize); - if (spec->obj.access == UVERBS_ACCESS_NEW) { - u64 id = o_attr->uobject->id; + if (spec->u.obj.access == UVERBS_ACCESS_NEW) { + unsigned int uattr_idx = uattr - pbundle->uattrs; + s64 id = o_attr->uobject->id; /* Copy the allocated id to the user-space */ - if (put_user(id, &e->uattr->data)) { - uverbs_finalize_object(o_attr->uobject, - UVERBS_ACCESS_NEW, - false); + if (put_user(id, &pbundle->user_attrs[uattr_idx].data)) return -EFAULT; - } } break; @@ -163,220 +250,225 @@ static int uverbs_process_attr(struct ib_device *ibdev, return -EOPNOTSUPP; } - set_bit(attr_id, attr_bundle_h->valid_bitmap); return 0; } -static int uverbs_uattrs_process(struct ib_device *ibdev, - struct ib_ucontext *ucontext, - const struct ib_uverbs_attr *uattrs, - size_t num_uattrs, - const struct uverbs_method_spec *method, - struct uverbs_attr_bundle *attr_bundle, - struct ib_uverbs_attr __user *uattr_ptr) +/* + * We search the radix tree with the method prefix and now we want to fast + * search the suffix bits to get a particular attribute pointer. It is not + * totally clear to me if this breaks the radix tree encasulation or not, but + * it uses the iter data to determine if the method iter points at the same + * chunk that will store the attribute, if so it just derefs it directly. By + * construction in most kernel configs the method and attrs will all fit in a + * single radix chunk, so in most cases this will have no search. Other cases + * this falls back to a full search. + */ +static void __rcu **uapi_get_attr_for_method(struct bundle_priv *pbundle, + u32 attr_key) { - size_t i; - int ret = 0; - int num_given_buckets = 0; - - for (i = 0; i < num_uattrs; i++) { - const struct ib_uverbs_attr *uattr = &uattrs[i]; - u16 attr_id = uattr->attr_id; - struct uverbs_attr_spec_hash *attr_spec_bucket; - - ret = uverbs_ns_idx(&attr_id, method->num_buckets); - if (ret < 0) { - if (uattr->flags & UVERBS_ATTR_F_MANDATORY) { - uverbs_finalize_objects(attr_bundle, - method->attr_buckets, - num_given_buckets, - false); - return ret; - } - continue; - } + void __rcu **slot; - /* - * ret is the found ns, so increase num_given_buckets if - * necessary. 
- */ - if (ret >= num_given_buckets) - num_given_buckets = ret + 1; - - attr_spec_bucket = method->attr_buckets[ret]; - ret = uverbs_process_attr(ibdev, ucontext, uattr, attr_id, - attr_spec_bucket, &attr_bundle->hash[ret], - uattr_ptr++); - if (ret) { - uverbs_finalize_objects(attr_bundle, - method->attr_buckets, - num_given_buckets, - false); - return ret; - } + if (likely(attr_key < pbundle->radix_slots_len)) { + void *entry; + + slot = pbundle->radix_slots + attr_key; + entry = rcu_dereference_raw(*slot); + if (likely(!radix_tree_is_internal_node(entry) && entry)) + return slot; } - return num_given_buckets; + return radix_tree_lookup_slot(pbundle->radix, + pbundle->method_key | attr_key); } -static int uverbs_validate_kernel_mandatory(const struct uverbs_method_spec *method_spec, - struct uverbs_attr_bundle *attr_bundle) +static int uverbs_set_attr(struct bundle_priv *pbundle, + struct ib_uverbs_attr *uattr) { - unsigned int i; - - for (i = 0; i < attr_bundle->num_buckets; i++) { - struct uverbs_attr_spec_hash *attr_spec_bucket = - method_spec->attr_buckets[i]; + u32 attr_key = uapi_key_attr(uattr->attr_id); + u32 attr_bkey = uapi_bkey_attr(attr_key); + const struct uverbs_api_attr *attr; + void __rcu **slot; + int ret; - if (!bitmap_subset(attr_spec_bucket->mandatory_attrs_bitmask, - attr_bundle->hash[i].valid_bitmap, - attr_spec_bucket->num_attrs)) - return -EINVAL; + slot = uapi_get_attr_for_method(pbundle, attr_key); + if (!slot) { + /* + * Kernel does not support the attribute but user-space says it + * is mandatory + */ + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) + return -EPROTONOSUPPORT; + return 0; } + attr = srcu_dereference( + *slot, &pbundle->bundle.ufile->device->disassociate_srcu); - for (; i < method_spec->num_buckets; i++) { - struct uverbs_attr_spec_hash *attr_spec_bucket = - method_spec->attr_buckets[i]; + /* Reject duplicate attributes from user-space */ + if (test_bit(attr_bkey, pbundle->bundle.attr_present)) + return -EINVAL; - if (!bitmap_empty(attr_spec_bucket->mandatory_attrs_bitmask, - attr_spec_bucket->num_attrs)) - return -EINVAL; - } + ret = uverbs_process_attr(pbundle, attr, uattr, attr_bkey); + if (ret) + return ret; + + __set_bit(attr_bkey, pbundle->bundle.attr_present); return 0; } -static int uverbs_handle_method(struct ib_uverbs_attr __user *uattr_ptr, - const struct ib_uverbs_attr *uattrs, - size_t num_uattrs, - struct ib_device *ibdev, - struct ib_uverbs_file *ufile, - const struct uverbs_method_spec *method_spec, - struct uverbs_attr_bundle *attr_bundle) +static int ib_uverbs_run_method(struct bundle_priv *pbundle, + unsigned int num_attrs) { + int (*handler)(struct ib_uverbs_file *ufile, + struct uverbs_attr_bundle *ctx); + size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs); + unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey; + unsigned int i; int ret; - int finalize_ret; - int num_given_buckets; - num_given_buckets = uverbs_uattrs_process(ibdev, ufile->ucontext, uattrs, - num_uattrs, method_spec, - attr_bundle, uattr_ptr); - if (num_given_buckets <= 0) + /* See uverbs_disassociate_api() */ + handler = srcu_dereference( + pbundle->method_elm->handler, + &pbundle->bundle.ufile->device->disassociate_srcu); + if (!handler) + return -EIO; + + pbundle->uattrs = uverbs_alloc(&pbundle->bundle, uattrs_size); + if (IS_ERR(pbundle->uattrs)) + return PTR_ERR(pbundle->uattrs); + if (copy_from_user(pbundle->uattrs, pbundle->user_attrs, uattrs_size)) + return -EFAULT; + + for (i = 0; i != num_attrs; i++) { + ret = 
uverbs_set_attr(pbundle, &pbundle->uattrs[i]); + if (unlikely(ret)) + return ret; + } + + /* User space did not provide all the mandatory attributes */ + if (unlikely(!bitmap_subset(pbundle->method_elm->attr_mandatory, + pbundle->bundle.attr_present, + pbundle->method_elm->key_bitmap_len))) return -EINVAL; - attr_bundle->num_buckets = num_given_buckets; - ret = uverbs_validate_kernel_mandatory(method_spec, attr_bundle); - if (ret) - goto cleanup; + if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) { + struct uverbs_obj_attr *destroy_attr = + &pbundle->bundle.attrs[destroy_bkey].obj_attr; - ret = method_spec->handler(ibdev, ufile, attr_bundle); -cleanup: - finalize_ret = uverbs_finalize_objects(attr_bundle, - method_spec->attr_buckets, - attr_bundle->num_buckets, - !ret); + ret = uobj_destroy(destroy_attr->uobject); + if (ret) + return ret; + __clear_bit(destroy_bkey, pbundle->uobj_finalize); - return ret ? ret : finalize_ret; -} + ret = handler(pbundle->bundle.ufile, &pbundle->bundle); + uobj_put_destroy(destroy_attr->uobject); + } else { + ret = handler(pbundle->bundle.ufile, &pbundle->bundle); + } -#define UVERBS_OPTIMIZE_USING_STACK_SZ 256 -static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct ib_uverbs_ioctl_hdr *hdr, - void __user *buf) -{ - const struct uverbs_object_spec *object_spec; - const struct uverbs_method_spec *method_spec; - long err = 0; - unsigned int i; - struct { - struct ib_uverbs_attr *uattrs; - struct uverbs_attr_bundle *uverbs_attr_bundle; - } *ctx = NULL; - struct uverbs_attr *curr_attr; - unsigned long *curr_bitmap; - size_t ctx_size; - uintptr_t data[UVERBS_OPTIMIZE_USING_STACK_SZ / sizeof(uintptr_t)]; - - if (hdr->driver_id != ib_dev->driver_id) + /* + * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can + * not invoke the method because the request is not supported. No + * other cases should return this code. 
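Aside on the mandatory-attribute check above: bitmap_subset() requires that every bit the method declares mandatory is also set in attr_present, the bitmap of attributes user space actually supplied; a missing mandatory attribute yields -EINVAL. A small stand-alone illustration of that test follows, using made-up attribute keys and a single 64-bit word where the kernel uses a bitmap of key_bitmap_len bits.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ATTR_HANDLE (1ull << 0) /* hypothetical attribute keys */
#define ATTR_FLAGS  (1ull << 1)
#define ATTR_UHW_IN (1ull << 2)

/* bitmap_subset(mandatory, present): every mandatory bit must be present. */
static bool mandatory_attrs_present(uint64_t mandatory, uint64_t present)
{
	return (mandatory & ~present) == 0;
}

int main(void)
{
	uint64_t mandatory = ATTR_HANDLE | ATTR_FLAGS;

	/* All mandatory attributes supplied -> 1 */
	printf("%d\n", mandatory_attrs_present(mandatory,
					       ATTR_HANDLE | ATTR_FLAGS | ATTR_UHW_IN));
	/* ATTR_FLAGS missing -> 0, i.e. the handler is never invoked */
	printf("%d\n", mandatory_attrs_present(mandatory, ATTR_HANDLE));
	return 0;
}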
+ */ + if (WARN_ON_ONCE(ret == -EPROTONOSUPPORT)) return -EINVAL; - object_spec = uverbs_get_object(ib_dev, hdr->object_id); - if (!object_spec) - return -EPROTONOSUPPORT; + return ret; +} - method_spec = uverbs_get_method(object_spec, hdr->method_id); - if (!method_spec) - return -EPROTONOSUPPORT; +static int bundle_destroy(struct bundle_priv *pbundle, bool commit) +{ + unsigned int key_bitmap_len = pbundle->method_elm->key_bitmap_len; + struct bundle_alloc_head *memblock; + unsigned int i; + int ret = 0; - if ((method_spec->flags & UVERBS_ACTION_FLAG_CREATE_ROOT) ^ !file->ucontext) - return -EINVAL; + i = -1; + while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len, + i + 1)) < key_bitmap_len) { + struct uverbs_attr *attr = &pbundle->bundle.attrs[i]; + int current_ret; + + current_ret = uverbs_finalize_object( + attr->obj_attr.uobject, + attr->obj_attr.attr_elm->spec.u.obj.access, commit); + if (!ret) + ret = current_ret; + } - ctx_size = sizeof(*ctx) + - sizeof(struct uverbs_attr_bundle) + - sizeof(struct uverbs_attr_bundle_hash) * method_spec->num_buckets + - sizeof(*ctx->uattrs) * hdr->num_attrs + - sizeof(*ctx->uverbs_attr_bundle->hash[0].attrs) * - method_spec->num_child_attrs + - sizeof(*ctx->uverbs_attr_bundle->hash[0].valid_bitmap) * - (method_spec->num_child_attrs / BITS_PER_LONG + - method_spec->num_buckets); - - if (ctx_size <= UVERBS_OPTIMIZE_USING_STACK_SZ) - ctx = (void *)data; - if (!ctx) - ctx = kmalloc(ctx_size, GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - ctx->uverbs_attr_bundle = (void *)ctx + sizeof(*ctx); - ctx->uattrs = (void *)(ctx->uverbs_attr_bundle + 1) + - (sizeof(ctx->uverbs_attr_bundle->hash[0]) * - method_spec->num_buckets); - curr_attr = (void *)(ctx->uattrs + hdr->num_attrs); - curr_bitmap = (void *)(curr_attr + method_spec->num_child_attrs); + for (memblock = pbundle->allocated_mem; memblock;) { + struct bundle_alloc_head *tmp = memblock; - /* - * We just fill the pointers and num_attrs here. The data itself will be - * filled at a later stage (uverbs_process_attr) - */ - for (i = 0; i < method_spec->num_buckets; i++) { - unsigned int curr_num_attrs = method_spec->attr_buckets[i]->num_attrs; - - ctx->uverbs_attr_bundle->hash[i].attrs = curr_attr; - curr_attr += curr_num_attrs; - ctx->uverbs_attr_bundle->hash[i].num_attrs = curr_num_attrs; - ctx->uverbs_attr_bundle->hash[i].valid_bitmap = curr_bitmap; - bitmap_zero(curr_bitmap, curr_num_attrs); - curr_bitmap += BITS_TO_LONGS(curr_num_attrs); + memblock = memblock->next; + kvfree(tmp); } - err = copy_from_user(ctx->uattrs, buf, - sizeof(*ctx->uattrs) * hdr->num_attrs); - if (err) { - err = -EFAULT; - goto out; - } + return ret; +} - err = uverbs_handle_method(buf, ctx->uattrs, hdr->num_attrs, ib_dev, - file, method_spec, ctx->uverbs_attr_bundle); +static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, + struct ib_uverbs_ioctl_hdr *hdr, + struct ib_uverbs_attr __user *user_attrs) +{ + const struct uverbs_api_ioctl_method *method_elm; + struct uverbs_api *uapi = ufile->device->uapi; + struct radix_tree_iter attrs_iter; + struct bundle_priv *pbundle; + struct bundle_priv onstack; + void __rcu **slot; + int destroy_ret; + int ret; - /* - * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can - * not invoke the method because the request is not supported. No - * other cases should return this code. 
- */ - if (unlikely(err == -EPROTONOSUPPORT)) { - WARN_ON_ONCE(err == -EPROTONOSUPPORT); - err = -EINVAL; + if (unlikely(hdr->driver_id != uapi->driver_id)) + return -EINVAL; + + slot = radix_tree_iter_lookup( + &uapi->radix, &attrs_iter, + uapi_key_obj(hdr->object_id) | + uapi_key_ioctl_method(hdr->method_id)); + if (unlikely(!slot)) + return -EPROTONOSUPPORT; + method_elm = srcu_dereference(*slot, &ufile->device->disassociate_srcu); + + if (!method_elm->use_stack) { + pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL); + if (!pbundle) + return -ENOMEM; + pbundle->internal_avail = + method_elm->bundle_size - + offsetof(struct bundle_priv, internal_buffer); + pbundle->alloc_head.next = NULL; + pbundle->allocated_mem = &pbundle->alloc_head; + } else { + pbundle = &onstack; + pbundle->internal_avail = sizeof(pbundle->internal_buffer); + pbundle->allocated_mem = NULL; } -out: - if (ctx != (void *)data) - kfree(ctx); - return err; -} -#define IB_UVERBS_MAX_CMD_SZ 4096 + /* Space for the pbundle->bundle.attrs flex array */ + pbundle->method_elm = method_elm; + pbundle->method_key = attrs_iter.index; + pbundle->bundle.ufile = ufile; + pbundle->radix = &uapi->radix; + pbundle->radix_slots = slot; + pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter); + pbundle->user_attrs = user_attrs; + + pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len * + sizeof(*pbundle->bundle.attrs), + sizeof(*pbundle->internal_buffer)); + memset(pbundle->bundle.attr_present, 0, + sizeof(pbundle->bundle.attr_present)); + memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); + + ret = ib_uverbs_run_method(pbundle, hdr->num_attrs); + destroy_ret = bundle_destroy(pbundle, ret == 0); + if (unlikely(destroy_ret && !ret)) + return destroy_ret; + + return ret; +} long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -384,39 +476,138 @@ long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct ib_uverbs_ioctl_hdr __user *user_hdr = (struct ib_uverbs_ioctl_hdr __user *)arg; struct ib_uverbs_ioctl_hdr hdr; - struct ib_device *ib_dev; int srcu_key; - long err; + int err; + + if (unlikely(cmd != RDMA_VERBS_IOCTL)) + return -ENOIOCTLCMD; + + err = copy_from_user(&hdr, user_hdr, sizeof(hdr)); + if (err) + return -EFAULT; + + if (hdr.length > PAGE_SIZE || + hdr.length != struct_size(&hdr, attrs, hdr.num_attrs)) + return -EINVAL; + + if (hdr.reserved1 || hdr.reserved2) + return -EPROTONOSUPPORT; srcu_key = srcu_read_lock(&file->device->disassociate_srcu); - ib_dev = srcu_dereference(file->device->ib_dev, - &file->device->disassociate_srcu); - if (!ib_dev) { - err = -EIO; - goto out; + err = ib_uverbs_cmd_verbs(file, &hdr, user_hdr->attrs); + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + return err; +} + +int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, u64 allowed_bits) +{ + const struct uverbs_attr *attr; + u64 flags; + + attr = uverbs_attr_get(attrs_bundle, idx); + /* Missing attribute means 0 flags */ + if (IS_ERR(attr)) { + *to = 0; + return 0; } - if (cmd == RDMA_VERBS_IOCTL) { - err = copy_from_user(&hdr, user_hdr, sizeof(hdr)); + /* + * New userspace code should use 8 bytes to pass flags, but we + * transparently support old userspaces that were using 4 bytes as + * well. 
+ */ + if (attr->ptr_attr.len == 8) + flags = attr->ptr_attr.data; + else if (attr->ptr_attr.len == 4) + flags = *(u32 *)&attr->ptr_attr.data; + else + return -EINVAL; - if (err || hdr.length > IB_UVERBS_MAX_CMD_SZ || - hdr.length != sizeof(hdr) + hdr.num_attrs * sizeof(struct ib_uverbs_attr)) { - err = -EINVAL; - goto out; - } + if (flags & ~allowed_bits) + return -EINVAL; - if (hdr.reserved1 || hdr.reserved2) { - err = -EPROTONOSUPPORT; - goto out; - } + *to = flags; + return 0; +} +EXPORT_SYMBOL(uverbs_get_flags64); - err = ib_uverbs_cmd_verbs(ib_dev, file, &hdr, - (__user void *)arg + sizeof(hdr)); +int uverbs_get_flags32(u32 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, u64 allowed_bits) +{ + u64 flags; + int ret; + + ret = uverbs_get_flags64(&flags, attrs_bundle, idx, allowed_bits); + if (ret) + return ret; + + if (flags > U32_MAX) + return -EINVAL; + *to = flags; + + return 0; +} +EXPORT_SYMBOL(uverbs_get_flags32); + +/* + * This is for ease of conversion. The purpose is to convert all drivers to + * use uverbs_attr_bundle instead of ib_udata. Assume attr == 0 is input and + * attr == 1 is output. + */ +void create_udata(struct uverbs_attr_bundle *bundle, struct ib_udata *udata) +{ + struct bundle_priv *pbundle = + container_of(bundle, struct bundle_priv, bundle); + const struct uverbs_attr *uhw_in = + uverbs_attr_get(bundle, UVERBS_ATTR_UHW_IN); + const struct uverbs_attr *uhw_out = + uverbs_attr_get(bundle, UVERBS_ATTR_UHW_OUT); + + if (!IS_ERR(uhw_in)) { + udata->inlen = uhw_in->ptr_attr.len; + if (uverbs_attr_ptr_is_inline(uhw_in)) + udata->inbuf = + &pbundle->user_attrs[uhw_in->ptr_attr.uattr_idx] + .data; + else + udata->inbuf = u64_to_user_ptr(uhw_in->ptr_attr.data); } else { - err = -ENOIOCTLCMD; + udata->inbuf = NULL; + udata->inlen = 0; } -out: - srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); - return err; + if (!IS_ERR(uhw_out)) { + udata->outbuf = u64_to_user_ptr(uhw_out->ptr_attr.data); + udata->outlen = uhw_out->ptr_attr.len; + } else { + udata->outbuf = NULL; + udata->outlen = 0; + } +} + +int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx, + const void *from, size_t size) +{ + struct bundle_priv *pbundle = + container_of(bundle, struct bundle_priv, bundle); + const struct uverbs_attr *attr = uverbs_attr_get(bundle, idx); + u16 flags; + size_t min_size; + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + min_size = min_t(size_t, attr->ptr_attr.len, size); + if (copy_to_user(u64_to_user_ptr(attr->ptr_attr.data), from, min_size)) + return -EFAULT; + + flags = pbundle->uattrs[attr->ptr_attr.uattr_idx].flags | + UVERBS_ATTR_F_VALID_OUTPUT; + if (put_user(flags, + &pbundle->user_attrs[attr->ptr_attr.uattr_idx].flags)) + return -EFAULT; + + return 0; } +EXPORT_SYMBOL(uverbs_copy_to); diff --git a/drivers/infiniband/core/uverbs_ioctl_merge.c b/drivers/infiniband/core/uverbs_ioctl_merge.c deleted file mode 100644 index 6ceb672c4d46..000000000000 --- a/drivers/infiniband/core/uverbs_ioctl_merge.c +++ /dev/null @@ -1,664 +0,0 @@ -/* - * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. 
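Aside on uverbs_get_flags64()/uverbs_get_flags32() above: they accept a flags attribute passed as either 4 or 8 bytes and reject any bit outside allowed_bits. The user-space sketch below mirrors only that width and validation logic; the error value and flag bits are made up for the example and the helper name is not kernel API.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* User space may pass flags as a u32 or a u64; anything outside
 * allowed_bits is rejected with -EINVAL (-22 here). */
static int get_flags64(uint64_t *to, const void *data, size_t len,
		       uint64_t allowed_bits)
{
	uint64_t flags;

	if (len == 8) {
		memcpy(&flags, data, 8);
	} else if (len == 4) {
		uint32_t f32;

		memcpy(&f32, data, 4);
		flags = f32;
	} else {
		return -22;
	}

	if (flags & ~allowed_bits)
		return -22;

	*to = flags;
	return 0;
}

int main(void)
{
	const uint64_t ALLOWED = 0x3; /* hypothetical: two valid flag bits */
	uint32_t legacy = 0x1;        /* old 4-byte user space             */
	uint64_t modern = 0x4;        /* 8-byte user space, invalid bit    */
	uint64_t out;

	printf("%d\n", get_flags64(&out, &legacy, sizeof(legacy), ALLOWED)); /* 0   */
	printf("%d\n", get_flags64(&out, &modern, sizeof(modern), ALLOWED)); /* -22 */
	return 0;
}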
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include <rdma/uverbs_ioctl.h> -#include <rdma/rdma_user_ioctl.h> -#include <linux/bitops.h> -#include "uverbs.h" - -#define UVERBS_NUM_NS (UVERBS_ID_NS_MASK >> UVERBS_ID_NS_SHIFT) -#define GET_NS_ID(idx) (((idx) & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT) -#define GET_ID(idx) ((idx) & ~UVERBS_ID_NS_MASK) - -#define _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset, \ - buckets_offset) \ - for (tmpj = 0, \ - elem = (*(const void ***)((hashes)[tmpi] + \ - (buckets_offset)))[0]; \ - tmpj < *(size_t *)((hashes)[tmpi] + (num_buckets_offset)); \ - tmpj++) \ - if ((elem = ((*(const void ***)(hashes[tmpi] + \ - (buckets_offset)))[tmpj]))) - -/* - * Iterate all elements of a few @hashes. The number of given hashes is - * indicated by @num_hashes. The offset of the number of buckets in the hash is - * represented by @num_buckets_offset, while the offset of the buckets array in - * the hash structure is represented by @buckets_offset. tmpi and tmpj are two - * short (or int) based indices that are given by the user. tmpi iterates over - * the different hashes. @elem points the current element in the hashes[tmpi] - * bucket we are looping on. To be honest, @hashes representation isn't exactly - * a hash, but more a collection of elements. These elements' ids are treated - * in a hash like manner, where the first upper bits are the bucket number. - * These elements are later mapped into a perfect-hash. 
- */ -#define for_each_element(elem, tmpi, tmpj, hashes, num_hashes, \ - num_buckets_offset, buckets_offset) \ - for (tmpi = 0; tmpi < (num_hashes); tmpi++) \ - _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset,\ - buckets_offset) - -#define get_elements_iterators_entry_above(iters, num_elements, elements, \ - num_objects_fld, objects_fld, bucket,\ - min_id) \ - get_elements_above_id((const void **)iters, num_elements, \ - (const void **)(elements), \ - offsetof(typeof(**elements), \ - num_objects_fld), \ - offsetof(typeof(**elements), objects_fld),\ - offsetof(typeof(***(*elements)->objects_fld), id),\ - bucket, min_id) - -#define get_objects_above_id(iters, num_trees, trees, bucket, min_id) \ - get_elements_iterators_entry_above(iters, num_trees, trees, \ - num_objects, objects, bucket, min_id) - -#define get_methods_above_id(method_iters, num_iters, iters, bucket, min_id)\ - get_elements_iterators_entry_above(method_iters, num_iters, iters, \ - num_methods, methods, bucket, min_id) - -#define get_attrs_above_id(attrs_iters, num_iters, iters, bucket, min_id)\ - get_elements_iterators_entry_above(attrs_iters, num_iters, iters, \ - num_attrs, attrs, bucket, min_id) - -/* - * get_elements_above_id get a few hashes represented by @elements and - * @num_elements. The hashes fields are described by @num_offset, @data_offset - * and @id_offset in the same way as required by for_each_element. The function - * returns an array of @iters, represents an array of elements in the hashes - * buckets, which their ids are the smallest ids in all hashes but are all - * larger than the id given by min_id. Elements are only added to the iters - * array if their id belongs to the bucket @bucket. The number of elements in - * the returned array is returned by the function. @min_id is also updated to - * reflect the new min_id of all elements in iters. - */ -static size_t get_elements_above_id(const void **iters, - unsigned int num_elements, - const void **elements, - size_t num_offset, - size_t data_offset, - size_t id_offset, - u16 bucket, - short *min_id) -{ - size_t num_iters = 0; - short min = SHRT_MAX; - const void *elem; - int i, j, last_stored = -1; - unsigned int equal_min = 0; - - for_each_element(elem, i, j, elements, num_elements, num_offset, - data_offset) { - u16 id = *(u16 *)(elem + id_offset); - - if (GET_NS_ID(id) != bucket) - continue; - - if (GET_ID(id) < *min_id || - (min != SHRT_MAX && GET_ID(id) > min)) - continue; - - /* - * We first iterate all hashes represented by @elements. When - * we do, we try to find an element @elem in the bucket @bucket - * which its id is min. Since we can't ensure the user sorted - * the elements in increasing order, we override this hash's - * minimal id element we found, if a new element with a smaller - * id was just found. - */ - iters[last_stored == i ? num_iters - 1 : num_iters++] = elem; - last_stored = i; - if (min == GET_ID(id)) - equal_min++; - else - equal_min = 1; - min = GET_ID(id); - } - - /* - * We only insert to our iters array an element, if its id is smaller - * than all previous ids. Therefore, the final iters array is sorted so - * that smaller ids are in the end of the array. - * Therefore, we need to clean the beginning of the array to make sure - * all ids of final elements are equal to min. 
- */ - memmove(iters, iters + num_iters - equal_min, sizeof(*iters) * equal_min); - - *min_id = min; - return equal_min; -} - -#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ - objects_fld, bucket) \ - find_max_element_id(num_elements, (const void **)(elements), \ - offsetof(typeof(**elements), num_objects_fld), \ - offsetof(typeof(**elements), objects_fld), \ - offsetof(typeof(***(*elements)->objects_fld), id),\ - bucket) - -static short find_max_element_ns_id(unsigned int num_elements, - const void **elements, - size_t num_offset, - size_t data_offset, - size_t id_offset) -{ - short max_ns = SHRT_MIN; - const void *elem; - int i, j; - - for_each_element(elem, i, j, elements, num_elements, num_offset, - data_offset) { - u16 id = *(u16 *)(elem + id_offset); - - if (GET_NS_ID(id) > max_ns) - max_ns = GET_NS_ID(id); - } - - return max_ns; -} - -static short find_max_element_id(unsigned int num_elements, - const void **elements, - size_t num_offset, - size_t data_offset, - size_t id_offset, - u16 bucket) -{ - short max_id = SHRT_MIN; - const void *elem; - int i, j; - - for_each_element(elem, i, j, elements, num_elements, num_offset, - data_offset) { - u16 id = *(u16 *)(elem + id_offset); - - if (GET_NS_ID(id) == bucket && - GET_ID(id) > max_id) - max_id = GET_ID(id); - } - return max_id; -} - -#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ - objects_fld, bucket) \ - find_max_element_id(num_elements, (const void **)(elements), \ - offsetof(typeof(**elements), num_objects_fld), \ - offsetof(typeof(**elements), objects_fld), \ - offsetof(typeof(***(*elements)->objects_fld), id),\ - bucket) - -#define find_max_element_ns_entry_id(num_elements, elements, \ - num_objects_fld, objects_fld) \ - find_max_element_ns_id(num_elements, (const void **)(elements), \ - offsetof(typeof(**elements), num_objects_fld),\ - offsetof(typeof(**elements), objects_fld), \ - offsetof(typeof(***(*elements)->objects_fld), id)) - -/* - * find_max_xxxx_ns_id gets a few elements. Each element is described by an id - * which its upper bits represents a namespace. It finds the max namespace. This - * could be used in order to know how many buckets do we need to allocate. If no - * elements exist, SHRT_MIN is returned. Namespace represents here different - * buckets. The common example is "common bucket" and "driver bucket". - * - * find_max_xxxx_id gets a few elements and a bucket. Each element is described - * by an id which its upper bits represent a namespace. It returns the max id - * which is contained in the same namespace defined in @bucket. This could be - * used in order to know how many elements do we need to allocate in the bucket. - * If no elements exist, SHRT_MIN is returned. 
- */ - -#define find_max_object_id(num_trees, trees, bucket) \ - find_max_element_entry_id(num_trees, trees, num_objects,\ - objects, bucket) -#define find_max_object_ns_id(num_trees, trees) \ - find_max_element_ns_entry_id(num_trees, trees, \ - num_objects, objects) - -#define find_max_method_id(num_iters, iters, bucket) \ - find_max_element_entry_id(num_iters, iters, num_methods,\ - methods, bucket) -#define find_max_method_ns_id(num_iters, iters) \ - find_max_element_ns_entry_id(num_iters, iters, \ - num_methods, methods) - -#define find_max_attr_id(num_iters, iters, bucket) \ - find_max_element_entry_id(num_iters, iters, num_attrs, \ - attrs, bucket) -#define find_max_attr_ns_id(num_iters, iters) \ - find_max_element_ns_entry_id(num_iters, iters, \ - num_attrs, attrs) - -static void free_method(struct uverbs_method_spec *method) -{ - unsigned int i; - - if (!method) - return; - - for (i = 0; i < method->num_buckets; i++) - kfree(method->attr_buckets[i]); - - kfree(method); -} - -#define IS_ATTR_OBJECT(attr) ((attr)->type == UVERBS_ATTR_TYPE_IDR || \ - (attr)->type == UVERBS_ATTR_TYPE_FD) - -/* - * This function gets array of size @num_method_defs which contains pointers to - * method definitions @method_defs. The function allocates an - * uverbs_method_spec structure and initializes its number of buckets and the - * elements in buckets to the correct attributes. While doing that, it - * validates that there aren't conflicts between attributes of different - * method_defs. - */ -static struct uverbs_method_spec *build_method_with_attrs(const struct uverbs_method_def **method_defs, - size_t num_method_defs) -{ - int bucket_idx; - int max_attr_buckets = 0; - size_t num_attr_buckets = 0; - int res = 0; - struct uverbs_method_spec *method = NULL; - const struct uverbs_attr_def **attr_defs; - unsigned int num_of_singularities = 0; - - max_attr_buckets = find_max_attr_ns_id(num_method_defs, method_defs); - if (max_attr_buckets >= 0) - num_attr_buckets = max_attr_buckets + 1; - - method = kzalloc(struct_size(method, attr_buckets, num_attr_buckets), - GFP_KERNEL); - if (!method) - return ERR_PTR(-ENOMEM); - - method->num_buckets = num_attr_buckets; - attr_defs = kcalloc(num_method_defs, sizeof(*attr_defs), GFP_KERNEL); - if (!attr_defs) { - res = -ENOMEM; - goto free_method; - } - for (bucket_idx = 0; bucket_idx < method->num_buckets; bucket_idx++) { - short min_id = SHRT_MIN; - int attr_max_bucket = 0; - struct uverbs_attr_spec_hash *hash = NULL; - - attr_max_bucket = find_max_attr_id(num_method_defs, method_defs, - bucket_idx); - if (attr_max_bucket < 0) - continue; - - hash = kzalloc(sizeof(*hash) + - ALIGN(sizeof(*hash->attrs) * (attr_max_bucket + 1), - sizeof(long)) + - BITS_TO_LONGS(attr_max_bucket + 1) * sizeof(long), - GFP_KERNEL); - if (!hash) { - res = -ENOMEM; - goto free; - } - hash->num_attrs = attr_max_bucket + 1; - method->num_child_attrs += hash->num_attrs; - hash->mandatory_attrs_bitmask = (void *)(hash + 1) + - ALIGN(sizeof(*hash->attrs) * - (attr_max_bucket + 1), - sizeof(long)); - - method->attr_buckets[bucket_idx] = hash; - - do { - size_t num_attr_defs; - struct uverbs_attr_spec *attr; - bool attr_obj_with_special_access; - - num_attr_defs = - get_attrs_above_id(attr_defs, - num_method_defs, - method_defs, - bucket_idx, - &min_id); - /* Last attr in bucket */ - if (!num_attr_defs) - break; - - if (num_attr_defs > 1) { - /* - * We don't allow two attribute definitions for - * the same attribute. This is usually a - * programmer error. 
If required, it's better to - * just add a new attribute to capture the new - * semantics. - */ - res = -EEXIST; - goto free; - } - - attr = &hash->attrs[min_id]; - memcpy(attr, &attr_defs[0]->attr, sizeof(*attr)); - - attr_obj_with_special_access = IS_ATTR_OBJECT(attr) && - (attr->obj.access == UVERBS_ACCESS_NEW || - attr->obj.access == UVERBS_ACCESS_DESTROY); - num_of_singularities += !!attr_obj_with_special_access; - if (WARN(num_of_singularities > 1, - "ib_uverbs: Method contains more than one object attr (%d) with new/destroy access\n", - min_id) || - WARN(attr_obj_with_special_access && - !(attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY), - "ib_uverbs: Tried to merge attr (%d) but it's an object with new/destroy access but isn't mandatory\n", - min_id) || - WARN(IS_ATTR_OBJECT(attr) && - attr->flags & UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO, - "ib_uverbs: Tried to merge attr (%d) but it's an object with min_sz flag\n", - min_id)) { - res = -EINVAL; - goto free; - } - - if (attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY) - set_bit(min_id, hash->mandatory_attrs_bitmask); - min_id++; - - } while (1); - } - kfree(attr_defs); - return method; - -free: - kfree(attr_defs); -free_method: - free_method(method); - return ERR_PTR(res); -} - -static void free_object(struct uverbs_object_spec *object) -{ - unsigned int i, j; - - if (!object) - return; - - for (i = 0; i < object->num_buckets; i++) { - struct uverbs_method_spec_hash *method_buckets = - object->method_buckets[i]; - - if (!method_buckets) - continue; - - for (j = 0; j < method_buckets->num_methods; j++) - free_method(method_buckets->methods[j]); - - kfree(method_buckets); - } - - kfree(object); -} - -/* - * This function gets array of size @num_object_defs which contains pointers to - * object definitions @object_defs. The function allocated an - * uverbs_object_spec structure and initialize its number of buckets and the - * elements in buckets to the correct methods. While doing that, it - * sorts out the correct relationship between conflicts in the same method. 
- */ -static struct uverbs_object_spec *build_object_with_methods(const struct uverbs_object_def **object_defs, - size_t num_object_defs) -{ - u16 bucket_idx; - int max_method_buckets = 0; - u16 num_method_buckets = 0; - int res = 0; - struct uverbs_object_spec *object = NULL; - const struct uverbs_method_def **method_defs; - - max_method_buckets = find_max_method_ns_id(num_object_defs, object_defs); - if (max_method_buckets >= 0) - num_method_buckets = max_method_buckets + 1; - - object = kzalloc(struct_size(object, method_buckets, - num_method_buckets), - GFP_KERNEL); - if (!object) - return ERR_PTR(-ENOMEM); - - object->num_buckets = num_method_buckets; - method_defs = kcalloc(num_object_defs, sizeof(*method_defs), GFP_KERNEL); - if (!method_defs) { - res = -ENOMEM; - goto free_object; - } - - for (bucket_idx = 0; bucket_idx < object->num_buckets; bucket_idx++) { - short min_id = SHRT_MIN; - int methods_max_bucket = 0; - struct uverbs_method_spec_hash *hash = NULL; - - methods_max_bucket = find_max_method_id(num_object_defs, object_defs, - bucket_idx); - if (methods_max_bucket < 0) - continue; - - hash = kzalloc(struct_size(hash, methods, - methods_max_bucket + 1), - GFP_KERNEL); - if (!hash) { - res = -ENOMEM; - goto free; - } - - hash->num_methods = methods_max_bucket + 1; - object->method_buckets[bucket_idx] = hash; - - do { - size_t num_method_defs; - struct uverbs_method_spec *method; - int i; - - num_method_defs = - get_methods_above_id(method_defs, - num_object_defs, - object_defs, - bucket_idx, - &min_id); - /* Last method in bucket */ - if (!num_method_defs) - break; - - method = build_method_with_attrs(method_defs, - num_method_defs); - if (IS_ERR(method)) { - res = PTR_ERR(method); - goto free; - } - - /* - * The last tree which is given as an argument to the - * merge overrides previous method handler. - * Therefore, we iterate backwards and search for the - * first handler which != NULL. This also defines the - * set of flags used for this handler. - */ - for (i = num_method_defs - 1; - i >= 0 && !method_defs[i]->handler; i--) - ; - hash->methods[min_id++] = method; - /* NULL handler isn't allowed */ - if (WARN(i < 0, - "ib_uverbs: tried to merge function id %d, but all handlers are NULL\n", - min_id)) { - res = -EINVAL; - goto free; - } - method->handler = method_defs[i]->handler; - method->flags = method_defs[i]->flags; - - } while (1); - } - kfree(method_defs); - return object; - -free: - kfree(method_defs); -free_object: - free_object(object); - return ERR_PTR(res); -} - -void uverbs_free_spec_tree(struct uverbs_root_spec *root) -{ - unsigned int i, j; - - if (!root) - return; - - for (i = 0; i < root->num_buckets; i++) { - struct uverbs_object_spec_hash *object_hash = - root->object_buckets[i]; - - if (!object_hash) - continue; - - for (j = 0; j < object_hash->num_objects; j++) - free_object(object_hash->objects[j]); - - kfree(object_hash); - } - - kfree(root); -} -EXPORT_SYMBOL(uverbs_free_spec_tree); - -struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, - const struct uverbs_object_tree_def **trees) -{ - u16 bucket_idx; - short max_object_buckets = 0; - size_t num_objects_buckets = 0; - struct uverbs_root_spec *root_spec = NULL; - const struct uverbs_object_def **object_defs; - int i; - int res = 0; - - max_object_buckets = find_max_object_ns_id(num_trees, trees); - /* - * Devices which don't want to support ib_uverbs, should just allocate - * an empty parsing tree. 
Every user-space command won't hit any valid - * entry in the parsing tree and thus will fail. - */ - if (max_object_buckets >= 0) - num_objects_buckets = max_object_buckets + 1; - - root_spec = kzalloc(struct_size(root_spec, object_buckets, - num_objects_buckets), - GFP_KERNEL); - if (!root_spec) - return ERR_PTR(-ENOMEM); - root_spec->num_buckets = num_objects_buckets; - - object_defs = kcalloc(num_trees, sizeof(*object_defs), - GFP_KERNEL); - if (!object_defs) { - res = -ENOMEM; - goto free_root; - } - - for (bucket_idx = 0; bucket_idx < root_spec->num_buckets; bucket_idx++) { - short min_id = SHRT_MIN; - short objects_max_bucket; - struct uverbs_object_spec_hash *hash = NULL; - - objects_max_bucket = find_max_object_id(num_trees, trees, - bucket_idx); - if (objects_max_bucket < 0) - continue; - - hash = kzalloc(struct_size(hash, objects, - objects_max_bucket + 1), - GFP_KERNEL); - if (!hash) { - res = -ENOMEM; - goto free; - } - hash->num_objects = objects_max_bucket + 1; - root_spec->object_buckets[bucket_idx] = hash; - - do { - size_t num_object_defs; - struct uverbs_object_spec *object; - - num_object_defs = get_objects_above_id(object_defs, - num_trees, - trees, - bucket_idx, - &min_id); - /* Last object in bucket */ - if (!num_object_defs) - break; - - object = build_object_with_methods(object_defs, - num_object_defs); - if (IS_ERR(object)) { - res = PTR_ERR(object); - goto free; - } - - /* - * The last tree which is given as an argument to the - * merge overrides previous object's type_attrs. - * Therefore, we iterate backwards and search for the - * first type_attrs which != NULL. - */ - for (i = num_object_defs - 1; - i >= 0 && !object_defs[i]->type_attrs; i--) - ; - /* - * NULL is a valid type_attrs. It means an object we - * can't instantiate (like DEVICE). - */ - object->type_attrs = i < 0 ? NULL : - object_defs[i]->type_attrs; - - hash->objects[min_id++] = object; - } while (1); - } - - kfree(object_defs); - return root_spec; - -free: - kfree(object_defs); -free_root: - uverbs_free_spec_tree(root_spec); - return ERR_PTR(res); -} -EXPORT_SYMBOL(uverbs_alloc_spec_tree); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 2094d136513d..823beca448e1 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -41,8 +41,6 @@ #include <linux/fs.h> #include <linux/poll.h> #include <linux/sched.h> -#include <linux/sched/mm.h> -#include <linux/sched/task.h> #include <linux/file.h> #include <linux/cdev.h> #include <linux/anon_inodes.h> @@ -77,7 +75,6 @@ static struct class *uverbs_class; static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, - struct ib_device *ib_dev, const char __user *buf, int in_len, int out_len) = { [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context, @@ -118,7 +115,6 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, }; static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, - struct ib_device *ib_dev, struct ib_udata *ucore, struct ib_udata *uhw) = { [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, @@ -138,6 +134,30 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); +/* + * Must be called with the ufile->device->disassociate_srcu held, and the lock + * must be held until use of the ucontext is finished. 
+ */ +struct ib_ucontext *ib_uverbs_get_ucontext(struct ib_uverbs_file *ufile) +{ + /* + * We do not hold the hw_destroy_rwsem lock for this flow, instead + * srcu is used. It does not matter if someone races this with + * get_context, we get NULL or valid ucontext. + */ + struct ib_ucontext *ucontext = smp_load_acquire(&ufile->ucontext); + + if (!srcu_dereference(ufile->device->ib_dev, + &ufile->device->disassociate_srcu)) + return ERR_PTR(-EIO); + + if (!ucontext) + return ERR_PTR(-EINVAL); + + return ucontext; +} +EXPORT_SYMBOL(ib_uverbs_get_ucontext); + int uverbs_dealloc_mw(struct ib_mw *mw) { struct ib_pd *pd = mw->pd; @@ -154,6 +174,7 @@ static void ib_uverbs_release_dev(struct kobject *kobj) struct ib_uverbs_device *dev = container_of(kobj, struct ib_uverbs_device, kobj); + uverbs_destroy_api(dev->uapi); cleanup_srcu_struct(&dev->disassociate_srcu); kfree(dev); } @@ -184,7 +205,7 @@ void ib_uverbs_release_ucq(struct ib_uverbs_file *file, } spin_unlock_irq(&ev_file->ev_queue.lock); - uverbs_uobject_put(&ev_file->uobj_file.uobj); + uverbs_uobject_put(&ev_file->uobj); } spin_lock_irq(&file->async_file->ev_queue.lock); @@ -220,20 +241,6 @@ void ib_uverbs_detach_umcast(struct ib_qp *qp, } } -static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, - struct ib_ucontext *context, - bool device_removed) -{ - context->closing = 1; - uverbs_cleanup_ucontext(context, device_removed); - put_pid(context->tgid); - - ib_rdmacg_uncharge(&context->cg_obj, context->device, - RDMACG_RESOURCE_HCA_HANDLE); - - return context->device->dealloc_ucontext(context); -} - static void ib_uverbs_comp_dev(struct ib_uverbs_device *dev) { complete(&dev->comp); @@ -246,6 +253,8 @@ void ib_uverbs_release_file(struct kref *ref) struct ib_device *ib_dev; int srcu_key; + release_ufile_idr_uobject(file); + srcu_key = srcu_read_lock(&file->device->disassociate_srcu); ib_dev = srcu_dereference(file->device->ib_dev, &file->device->disassociate_srcu); @@ -338,7 +347,7 @@ static ssize_t ib_uverbs_comp_event_read(struct file *filp, char __user *buf, filp->private_data; return ib_uverbs_event_read(&comp_ev_file->ev_queue, - comp_ev_file->uobj_file.ufile, filp, + comp_ev_file->uobj.ufile, filp, buf, count, pos, sizeof(struct ib_uverbs_comp_event_desc)); } @@ -420,7 +429,9 @@ static int ib_uverbs_async_event_close(struct inode *inode, struct file *filp) static int ib_uverbs_comp_event_close(struct inode *inode, struct file *filp) { - struct ib_uverbs_completion_event_file *file = filp->private_data; + struct ib_uobject *uobj = filp->private_data; + struct ib_uverbs_completion_event_file *file = container_of( + uobj, struct ib_uverbs_completion_event_file, uobj); struct ib_uverbs_event *entry, *tmp; spin_lock_irq(&file->ev_queue.lock); @@ -528,7 +539,7 @@ void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr) struct ib_ucq_object *uobj = container_of(event->element.cq->uobject, struct ib_ucq_object, uobject); - ib_uverbs_async_handler(uobj->uverbs_file, uobj->uobject.user_handle, + ib_uverbs_async_handler(uobj->uobject.ufile, uobj->uobject.user_handle, event->event, &uobj->async_list, &uobj->async_events_reported); } @@ -637,13 +648,13 @@ err_put_refs: return filp; } -static bool verify_command_mask(struct ib_device *ib_dev, - u32 command, bool extended) +static bool verify_command_mask(struct ib_uverbs_file *ufile, u32 command, + bool extended) { if (!extended) - return ib_dev->uverbs_cmd_mask & BIT_ULL(command); + return ufile->uverbs_cmd_mask & BIT_ULL(command); - return ib_dev->uverbs_ex_cmd_mask 
& BIT_ULL(command); + return ufile->uverbs_ex_cmd_mask & BIT_ULL(command); } static bool verify_command_idx(u32 command, bool extended) @@ -713,7 +724,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, { struct ib_uverbs_file *file = filp->private_data; struct ib_uverbs_ex_cmd_hdr ex_hdr; - struct ib_device *ib_dev; struct ib_uverbs_cmd_hdr hdr; bool extended; int srcu_key; @@ -748,24 +758,8 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, return ret; srcu_key = srcu_read_lock(&file->device->disassociate_srcu); - ib_dev = srcu_dereference(file->device->ib_dev, - &file->device->disassociate_srcu); - if (!ib_dev) { - ret = -EIO; - goto out; - } - - /* - * Must be after the ib_dev check, as once the RCU clears ib_dev == - * NULL means ucontext == NULL - */ - if (!file->ucontext && - (command != IB_USER_VERBS_CMD_GET_CONTEXT || extended)) { - ret = -EINVAL; - goto out; - } - if (!verify_command_mask(ib_dev, command, extended)) { + if (!verify_command_mask(file, command, extended)) { ret = -EOPNOTSUPP; goto out; } @@ -773,7 +767,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, buf += sizeof(hdr); if (!extended) { - ret = uverbs_cmd_table[command](file, ib_dev, buf, + ret = uverbs_cmd_table[command](file, buf, hdr.in_words * 4, hdr.out_words * 4); } else { @@ -792,7 +786,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, ex_hdr.provider_in_words * 8, ex_hdr.provider_out_words * 8); - ret = uverbs_ex_cmd_table[command](file, ib_dev, &ucore, &uhw); + ret = uverbs_ex_cmd_table[command](file, &ucore, &uhw); ret = (ret) ? : count; } @@ -804,22 +798,18 @@ out: static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) { struct ib_uverbs_file *file = filp->private_data; - struct ib_device *ib_dev; + struct ib_ucontext *ucontext; int ret = 0; int srcu_key; srcu_key = srcu_read_lock(&file->device->disassociate_srcu); - ib_dev = srcu_dereference(file->device->ib_dev, - &file->device->disassociate_srcu); - if (!ib_dev) { - ret = -EIO; + ucontext = ib_uverbs_get_ucontext(file); + if (IS_ERR(ucontext)) { + ret = PTR_ERR(ucontext); goto out; } - if (!file->ucontext) - ret = -ENODEV; - else - ret = ib_dev->mmap(file->ucontext, vma); + ret = ucontext->device->mmap(ucontext, vma); out: srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); return ret; @@ -879,13 +869,12 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) } file->device = dev; - spin_lock_init(&file->idr_lock); - idr_init(&file->idr); - file->ucontext = NULL; - file->async_file = NULL; kref_init(&file->ref); - mutex_init(&file->mutex); - mutex_init(&file->cleanup_mutex); + mutex_init(&file->ucontext_lock); + + spin_lock_init(&file->uobjects_lock); + INIT_LIST_HEAD(&file->uobjects); + init_rwsem(&file->hw_destroy_rwsem); filp->private_data = file; kobject_get(&dev->kobj); @@ -893,6 +882,11 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) mutex_unlock(&dev->lists_mutex); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); + file->uverbs_cmd_mask = ib_dev->uverbs_cmd_mask; + file->uverbs_ex_cmd_mask = ib_dev->uverbs_ex_cmd_mask; + + setup_ufile_idr_uobject(file); + return nonseekable_open(inode, filp); err_module: @@ -911,13 +905,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) { struct ib_uverbs_file *file = filp->private_data; - mutex_lock(&file->cleanup_mutex); - if (file->ucontext) { - ib_uverbs_cleanup_ucontext(file, file->ucontext, false); - 
file->ucontext = NULL; - } - mutex_unlock(&file->cleanup_mutex); - idr_destroy(&file->idr); + uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE); mutex_lock(&file->device->lists_mutex); if (!file->is_closed) { @@ -1006,6 +994,19 @@ static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); static CLASS_ATTR_STRING(abi_version, S_IRUGO, __stringify(IB_USER_VERBS_ABI_VERSION)); +static int ib_uverbs_create_uapi(struct ib_device *device, + struct ib_uverbs_device *uverbs_dev) +{ + struct uverbs_api *uapi; + + uapi = uverbs_alloc_api(device->driver_specs, device->driver_id); + if (IS_ERR(uapi)) + return PTR_ERR(uapi); + + uverbs_dev->uapi = uapi; + return 0; +} + static void ib_uverbs_add_one(struct ib_device *device) { int devnum; @@ -1048,6 +1049,9 @@ static void ib_uverbs_add_one(struct ib_device *device) rcu_assign_pointer(uverbs_dev->ib_dev, device); uverbs_dev->num_comp_vectors = device->num_comp_vectors; + if (ib_uverbs_create_uapi(device, uverbs_dev)) + goto err; + cdev_init(&uverbs_dev->cdev, NULL); uverbs_dev->cdev.owner = THIS_MODULE; uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops; @@ -1067,18 +1071,6 @@ static void ib_uverbs_add_one(struct ib_device *device) if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) goto err_class; - if (!device->specs_root) { - const struct uverbs_object_tree_def *default_root[] = { - uverbs_default_get_objects()}; - - uverbs_dev->specs_root = uverbs_alloc_spec_tree(1, - default_root); - if (IS_ERR(uverbs_dev->specs_root)) - goto err_class; - - device->specs_root = uverbs_dev->specs_root; - } - ib_set_client_data(device, &uverbs_client, uverbs_dev); return; @@ -1098,44 +1090,6 @@ err: return; } -static void ib_uverbs_disassociate_ucontext(struct ib_ucontext *ibcontext) -{ - struct ib_device *ib_dev = ibcontext->device; - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - - owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); - if (!owning_process) - return; - - owning_mm = get_task_mm(owning_process); - if (!owning_mm) { - pr_info("no mm, disassociate ucontext is pending task termination\n"); - while (1) { - put_task_struct(owning_process); - usleep_range(1000, 2000); - owning_process = get_pid_task(ibcontext->tgid, - PIDTYPE_PID); - if (!owning_process || - owning_process->state == TASK_DEAD) { - pr_info("disassociate ucontext done, task was terminated\n"); - /* in case task was dead need to release the - * task struct. 
- */ - if (owning_process) - put_task_struct(owning_process); - return; - } - } - } - - down_write(&owning_mm->mmap_sem); - ib_dev->disassociate_ucontext(ibcontext); - up_write(&owning_mm->mmap_sem); - mmput(owning_mm); - put_task_struct(owning_process); -} - static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, struct ib_device *ib_dev) { @@ -1144,46 +1098,31 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, struct ib_event event; /* Pending running commands to terminate */ - synchronize_srcu(&uverbs_dev->disassociate_srcu); + uverbs_disassociate_api_pre(uverbs_dev); event.event = IB_EVENT_DEVICE_FATAL; event.element.port_num = 0; event.device = ib_dev; mutex_lock(&uverbs_dev->lists_mutex); while (!list_empty(&uverbs_dev->uverbs_file_list)) { - struct ib_ucontext *ucontext; file = list_first_entry(&uverbs_dev->uverbs_file_list, struct ib_uverbs_file, list); file->is_closed = 1; list_del(&file->list); kref_get(&file->ref); - mutex_unlock(&uverbs_dev->lists_mutex); - - - mutex_lock(&file->cleanup_mutex); - ucontext = file->ucontext; - file->ucontext = NULL; - mutex_unlock(&file->cleanup_mutex); - /* At this point ib_uverbs_close cannot be running - * ib_uverbs_cleanup_ucontext + /* We must release the mutex before going ahead and calling + * uverbs_cleanup_ufile, as it might end up indirectly calling + * uverbs_close, for example due to freeing the resources (e.g + * mmput). */ - if (ucontext) { - /* We must release the mutex before going ahead and - * calling disassociate_ucontext. disassociate_ucontext - * might end up indirectly calling uverbs_close, - * for example due to freeing the resources - * (e.g mmput). - */ - ib_uverbs_event_handler(&file->event_handler, &event); - ib_uverbs_disassociate_ucontext(ucontext); - mutex_lock(&file->cleanup_mutex); - ib_uverbs_cleanup_ucontext(file, ucontext, true); - mutex_unlock(&file->cleanup_mutex); - } + mutex_unlock(&uverbs_dev->lists_mutex); - mutex_lock(&uverbs_dev->lists_mutex); + ib_uverbs_event_handler(&file->event_handler, &event); + uverbs_destroy_ufile_hw(file, RDMA_REMOVE_DRIVER_REMOVE); kref_put(&file->ref, ib_uverbs_release_file); + + mutex_lock(&uverbs_dev->lists_mutex); } while (!list_empty(&uverbs_dev->uverbs_events_file_list)) { @@ -1205,6 +1144,8 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, kill_fasync(&event_file->ev_queue.async_queue, SIGIO, POLL_IN); } mutex_unlock(&uverbs_dev->lists_mutex); + + uverbs_disassociate_api(uverbs_dev->uapi); } static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) @@ -1232,7 +1173,6 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) * cdev was deleted, however active clients can still issue * commands and close their open files. 
*/ - rcu_assign_pointer(uverbs_dev->ib_dev, NULL); ib_uverbs_free_hw_resources(uverbs_dev, device); wait_clients = 0; } @@ -1241,10 +1181,6 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) ib_uverbs_comp_dev(uverbs_dev); if (wait_clients) wait_for_completion(&uverbs_dev->comp); - if (uverbs_dev->specs_root) { - uverbs_free_spec_tree(uverbs_dev->specs_root); - device->specs_root = NULL; - } kobject_put(&uverbs_dev->kobj); } diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index bb372b4713a4..b8d715c68ca4 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -211,7 +211,5 @@ void ib_copy_path_rec_from_user(struct sa_path_rec *dst, /* TODO: No need to set this */ sa_path_set_dmac_zero(dst); - sa_path_set_ndev(dst, NULL); - sa_path_set_ifindex(dst, 0); } EXPORT_SYMBOL(ib_copy_path_rec_from_user); diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index b570acbd94af..203cc96ac6f5 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -48,14 +48,18 @@ static int uverbs_free_ah(struct ib_uobject *uobject, static int uverbs_free_flow(struct ib_uobject *uobject, enum rdma_remove_reason why) { - int ret; struct ib_flow *flow = (struct ib_flow *)uobject->object; struct ib_uflow_object *uflow = container_of(uobject, struct ib_uflow_object, uobject); + struct ib_qp *qp = flow->qp; + int ret; - ret = ib_destroy_flow(flow); - if (!ret) + ret = flow->device->destroy_flow(flow); + if (!ret) { + if (qp) + atomic_dec(&qp->usecnt); ib_uverbs_flow_resources_free(uflow->resources); + } return ret; } @@ -74,6 +78,13 @@ static int uverbs_free_qp(struct ib_uobject *uobject, container_of(uobject, struct ib_uqp_object, uevent.uobject); int ret; + /* + * If this is a user triggered destroy then do not allow destruction + * until the user cleans up all the mcast bindings. Unlike in other + * places we forcibly clean up the mcast attachments for !DESTROY + * because the mcast attaches are not ubojects and will not be + * destroyed by anything else during cleanup processing. 
+ */ if (why == RDMA_REMOVE_DESTROY) { if (!list_empty(&uqp->mcast_list)) return -EBUSY; @@ -82,7 +93,7 @@ static int uverbs_free_qp(struct ib_uobject *uobject, } ret = ib_destroy_qp(qp); - if (ret && why == RDMA_REMOVE_DESTROY) + if (ib_is_destroy_retryable(ret, why, uobject)) return ret; if (uqp->uxrcd) @@ -100,8 +111,10 @@ static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject, int ret; ret = ib_destroy_rwq_ind_table(rwq_ind_tbl); - if (!ret || why != RDMA_REMOVE_DESTROY) - kfree(ind_tbl); + if (ib_is_destroy_retryable(ret, why, uobject)) + return ret; + + kfree(ind_tbl); return ret; } @@ -114,8 +127,10 @@ static int uverbs_free_wq(struct ib_uobject *uobject, int ret; ret = ib_destroy_wq(wq); - if (!ret || why != RDMA_REMOVE_DESTROY) - ib_uverbs_release_uevent(uobject->context->ufile, &uwq->uevent); + if (ib_is_destroy_retryable(ret, why, uobject)) + return ret; + + ib_uverbs_release_uevent(uobject->context->ufile, &uwq->uevent); return ret; } @@ -129,8 +144,7 @@ static int uverbs_free_srq(struct ib_uobject *uobject, int ret; ret = ib_destroy_srq(srq); - - if (ret && why == RDMA_REMOVE_DESTROY) + if (ib_is_destroy_retryable(ret, why, uobject)) return ret; if (srq_type == IB_SRQT_XRC) { @@ -152,12 +166,12 @@ static int uverbs_free_xrcd(struct ib_uobject *uobject, container_of(uobject, struct ib_uxrcd_object, uobject); int ret; + ret = ib_destroy_usecnt(&uxrcd->refcnt, why, uobject); + if (ret) + return ret; + mutex_lock(&uobject->context->ufile->device->xrcd_tree_mutex); - if (why == RDMA_REMOVE_DESTROY && atomic_read(&uxrcd->refcnt)) - ret = -EBUSY; - else - ret = ib_uverbs_dealloc_xrcd(uobject->context->ufile->device, - xrcd, why); + ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why); mutex_unlock(&uobject->context->ufile->device->xrcd_tree_mutex); return ret; @@ -167,20 +181,22 @@ static int uverbs_free_pd(struct ib_uobject *uobject, enum rdma_remove_reason why) { struct ib_pd *pd = uobject->object; + int ret; - if (why == RDMA_REMOVE_DESTROY && atomic_read(&pd->usecnt)) - return -EBUSY; + ret = ib_destroy_usecnt(&pd->usecnt, why, uobject); + if (ret) + return ret; ib_dealloc_pd((struct ib_pd *)uobject->object); return 0; } -static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_file, +static int uverbs_hot_unplug_completion_event_file(struct ib_uobject *uobj, enum rdma_remove_reason why) { struct ib_uverbs_completion_event_file *comp_event_file = - container_of(uobj_file, struct ib_uverbs_completion_event_file, - uobj_file); + container_of(uobj, struct ib_uverbs_completion_event_file, + uobj); struct ib_uverbs_event_queue *event_queue = &comp_event_file->ev_queue; spin_lock_irq(&event_queue->lock); @@ -194,119 +210,77 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_ return 0; }; -int uverbs_destroy_def_handler(struct ib_device *ib_dev, - struct ib_uverbs_file *file, +int uverbs_destroy_def_handler(struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { return 0; } +EXPORT_SYMBOL(uverbs_destroy_def_handler); -/* - * This spec is used in order to pass information to the hardware driver in a - * legacy way. Every verb that could get driver specific data should get this - * spec. 
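Aside on the uverbs_free_*() conversions above: the callbacks now go through ib_is_destroy_retryable()/ib_destroy_usecnt(), so a failed hardware destroy is only reported back when the teardown was explicitly requested by user space, while forced cleanup (file close, driver removal) presses on and releases the user-space bookkeeping regardless. The real helpers also consult uobject/context state; the sketch below only models the user-destroy vs. forced-cleanup split visible in these hunks, with illustrative names.

#include <stdbool.h>
#include <stdio.h>

/* Removal reasons, loosely modelled on enum rdma_remove_reason. */
enum remove_reason { REMOVE_DESTROY, REMOVE_CLOSE, REMOVE_DRIVER_REMOVE };

/* Only a user-requested destroy may bail out with the error so the caller
 * can retry; forced cleanup keeps going. */
static bool destroy_retryable(int ret, enum remove_reason why)
{
	return ret && why == REMOVE_DESTROY;
}

static int free_example_object(int hw_destroy_ret, enum remove_reason why)
{
	if (destroy_retryable(hw_destroy_ret, why))
		return hw_destroy_ret; /* e.g. -EBUSY surfaces to user space */

	/* Forced teardown: release the tracking structures no matter what. */
	printf("bookkeeping freed (why=%d)\n", (int)why);
	return hw_destroy_ret;
}

int main(void)
{
	free_example_object(0, REMOVE_DESTROY);         /* normal destroy        */
	free_example_object(-16, REMOVE_DESTROY);       /* -EBUSY, user may retry */
	free_example_object(-16, REMOVE_DRIVER_REMOVE); /* cleanup proceeds anyway */
	return 0;
}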
- */ -const struct uverbs_attr_def uverbs_uhw_compat_in = - UVERBS_ATTR_PTR_IN_SZ(UVERBS_ATTR_UHW_IN, UVERBS_ATTR_SIZE(0, USHRT_MAX), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)); -const struct uverbs_attr_def uverbs_uhw_compat_out = - UVERBS_ATTR_PTR_OUT_SZ(UVERBS_ATTR_UHW_OUT, UVERBS_ATTR_SIZE(0, USHRT_MAX), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)); - -void create_udata(struct uverbs_attr_bundle *ctx, struct ib_udata *udata) -{ - /* - * This is for ease of conversion. The purpose is to convert all drivers - * to use uverbs_attr_bundle instead of ib_udata. - * Assume attr == 0 is input and attr == 1 is output. - */ - const struct uverbs_attr *uhw_in = - uverbs_attr_get(ctx, UVERBS_ATTR_UHW_IN); - const struct uverbs_attr *uhw_out = - uverbs_attr_get(ctx, UVERBS_ATTR_UHW_OUT); - - if (!IS_ERR(uhw_in)) { - udata->inlen = uhw_in->ptr_attr.len; - if (uverbs_attr_ptr_is_inline(uhw_in)) - udata->inbuf = &uhw_in->uattr->data; - else - udata->inbuf = u64_to_user_ptr(uhw_in->ptr_attr.data); - } else { - udata->inbuf = NULL; - udata->inlen = 0; - } - - if (!IS_ERR(uhw_out)) { - udata->outbuf = u64_to_user_ptr(uhw_out->ptr_attr.data); - udata->outlen = uhw_out->ptr_attr.len; - } else { - udata->outbuf = NULL; - udata->outlen = 0; - } -} - -DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COMP_CHANNEL, - &UVERBS_TYPE_ALLOC_FD(0, - sizeof(struct ib_uverbs_completion_event_file), - uverbs_hot_unplug_completion_event_file, - &uverbs_event_fops, - "[infinibandevent]", O_RDONLY)); +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file), + uverbs_hot_unplug_completion_event_file, + &uverbs_event_fops, + "[infinibandevent]", + O_RDONLY)); -DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_QP, - &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0, - uverbs_free_qp)); +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_QP, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), uverbs_free_qp)); DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MW, - &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_mw)); + UVERBS_TYPE_ALLOC_IDR(uverbs_free_mw)); -DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_SRQ, - &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0, - uverbs_free_srq)); +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_SRQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), + uverbs_free_srq)); DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_AH, - &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_ah)); + UVERBS_TYPE_ALLOC_IDR(uverbs_free_ah)); -DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_FLOW, - &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uflow_object), - 0, uverbs_free_flow)); +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_FLOW, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uflow_object), + uverbs_free_flow)); -DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_WQ, - &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0, - uverbs_free_wq)); +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_WQ, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), uverbs_free_wq)); DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL, - &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_rwq_ind_tbl)); + UVERBS_TYPE_ALLOC_IDR(uverbs_free_rwq_ind_tbl)); -DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_XRCD, - &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0, - uverbs_free_xrcd)); +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_XRCD, + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), + uverbs_free_xrcd)); DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_PD, - /* 2 is used in order to free the PD after MRs */ - 
&UVERBS_TYPE_ALLOC_IDR(2, uverbs_free_pd)); - -DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DEVICE, NULL); - -static DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, - &UVERBS_OBJECT(UVERBS_OBJECT_DEVICE), - &UVERBS_OBJECT(UVERBS_OBJECT_PD), - &UVERBS_OBJECT(UVERBS_OBJECT_MR), - &UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL), - &UVERBS_OBJECT(UVERBS_OBJECT_CQ), - &UVERBS_OBJECT(UVERBS_OBJECT_QP), - &UVERBS_OBJECT(UVERBS_OBJECT_AH), - &UVERBS_OBJECT(UVERBS_OBJECT_MW), - &UVERBS_OBJECT(UVERBS_OBJECT_SRQ), - &UVERBS_OBJECT(UVERBS_OBJECT_FLOW), - &UVERBS_OBJECT(UVERBS_OBJECT_WQ), - &UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL), - &UVERBS_OBJECT(UVERBS_OBJECT_XRCD), - &UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION), - &UVERBS_OBJECT(UVERBS_OBJECT_DM), - &UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS)); + UVERBS_TYPE_ALLOC_IDR(uverbs_free_pd)); + +DECLARE_UVERBS_GLOBAL_METHODS(UVERBS_OBJECT_DEVICE); + +DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, + &UVERBS_OBJECT(UVERBS_OBJECT_DEVICE), + &UVERBS_OBJECT(UVERBS_OBJECT_PD), + &UVERBS_OBJECT(UVERBS_OBJECT_MR), + &UVERBS_OBJECT(UVERBS_OBJECT_COMP_CHANNEL), + &UVERBS_OBJECT(UVERBS_OBJECT_CQ), + &UVERBS_OBJECT(UVERBS_OBJECT_QP), + &UVERBS_OBJECT(UVERBS_OBJECT_AH), + &UVERBS_OBJECT(UVERBS_OBJECT_MW), + &UVERBS_OBJECT(UVERBS_OBJECT_SRQ), + &UVERBS_OBJECT(UVERBS_OBJECT_FLOW), + &UVERBS_OBJECT(UVERBS_OBJECT_WQ), + &UVERBS_OBJECT(UVERBS_OBJECT_RWQ_IND_TBL), + &UVERBS_OBJECT(UVERBS_OBJECT_XRCD), + &UVERBS_OBJECT(UVERBS_OBJECT_FLOW_ACTION), + &UVERBS_OBJECT(UVERBS_OBJECT_DM), + &UVERBS_OBJECT(UVERBS_OBJECT_COUNTERS)); const struct uverbs_object_tree_def *uverbs_default_get_objects(void) { return &uverbs_default_objects; } -EXPORT_SYMBOL_GPL(uverbs_default_get_objects); diff --git a/drivers/infiniband/core/uverbs_std_types_counters.c b/drivers/infiniband/core/uverbs_std_types_counters.c index 03b182a684a6..a0ffdcf9a51c 100644 --- a/drivers/infiniband/core/uverbs_std_types_counters.c +++ b/drivers/infiniband/core/uverbs_std_types_counters.c @@ -38,20 +38,22 @@ static int uverbs_free_counters(struct ib_uobject *uobject, enum rdma_remove_reason why) { struct ib_counters *counters = uobject->object; + int ret; - if (why == RDMA_REMOVE_DESTROY && - atomic_read(&counters->usecnt)) - return -EBUSY; + ret = ib_destroy_usecnt(&counters->usecnt, why, uobject); + if (ret) + return ret; return counters->device->destroy_counters(counters); } -static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs) +static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE); + struct ib_device *ib_dev = uobj->context->device; struct ib_counters *counters; - struct ib_uobject *uobj; int ret; /* @@ -62,7 +64,6 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(struct ib_device *ib_de if (!ib_dev->create_counters) return -EOPNOTSUPP; - uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE); counters = ib_dev->create_counters(ib_dev, attrs); if (IS_ERR(counters)) { ret = PTR_ERR(counters); @@ -80,9 +81,8 @@ err_create_counters: return ret; } -static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs) +static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { struct 
ib_counters_read_attr read_attr = {}; const struct uverbs_attr *uattr; @@ -90,68 +90,62 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_READ)(struct ib_device *ib_dev, uverbs_attr_get_obj(attrs, UVERBS_ATTR_READ_COUNTERS_HANDLE); int ret; - if (!ib_dev->read_counters) + if (!counters->device->read_counters) return -EOPNOTSUPP; if (!atomic_read(&counters->usecnt)) return -EINVAL; - ret = uverbs_copy_from(&read_attr.flags, attrs, - UVERBS_ATTR_READ_COUNTERS_FLAGS); + ret = uverbs_get_flags32(&read_attr.flags, attrs, + UVERBS_ATTR_READ_COUNTERS_FLAGS, + IB_UVERBS_READ_COUNTERS_PREFER_CACHED); if (ret) return ret; uattr = uverbs_attr_get(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF); read_attr.ncounters = uattr->ptr_attr.len / sizeof(u64); - read_attr.counters_buff = kcalloc(read_attr.ncounters, - sizeof(u64), GFP_KERNEL); - if (!read_attr.counters_buff) - return -ENOMEM; - - ret = ib_dev->read_counters(counters, - &read_attr, - attrs); - if (ret) - goto err_read; + read_attr.counters_buff = uverbs_zalloc( + attrs, array_size(read_attr.ncounters, sizeof(u64))); + if (IS_ERR(read_attr.counters_buff)) + return PTR_ERR(read_attr.counters_buff); - ret = uverbs_copy_to(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF, - read_attr.counters_buff, - read_attr.ncounters * sizeof(u64)); + ret = counters->device->read_counters(counters, &read_attr, attrs); + if (ret) + return ret; -err_read: - kfree(read_attr.counters_buff); - return ret; + return uverbs_copy_to(attrs, UVERBS_ATTR_READ_COUNTERS_BUFF, + read_attr.counters_buff, + read_attr.ncounters * sizeof(u64)); } -static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_COUNTERS_CREATE, - &UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_COUNTERS_HANDLE, - UVERBS_OBJECT_COUNTERS, - UVERBS_ACCESS_NEW, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); - -static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_COUNTERS_DESTROY, - uverbs_destroy_def_handler, - &UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_COUNTERS_HANDLE, - UVERBS_OBJECT_COUNTERS, - UVERBS_ACCESS_DESTROY, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); - -#define MAX_COUNTERS_BUFF_SIZE USHRT_MAX -static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_COUNTERS_READ, - &UVERBS_ATTR_IDR(UVERBS_ATTR_READ_COUNTERS_HANDLE, - UVERBS_OBJECT_COUNTERS, - UVERBS_ACCESS_READ, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_READ_COUNTERS_BUFF, - UVERBS_ATTR_SIZE(0, MAX_COUNTERS_BUFF_SIZE), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_READ_COUNTERS_FLAGS, - UVERBS_ATTR_TYPE(__u32), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_COUNTERS_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_COUNTERS_HANDLE, + UVERBS_OBJECT_COUNTERS, + UVERBS_ACCESS_NEW, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_COUNTERS_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_COUNTERS_HANDLE, + UVERBS_OBJECT_COUNTERS, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_COUNTERS_READ, + UVERBS_ATTR_IDR(UVERBS_ATTR_READ_COUNTERS_HANDLE, + UVERBS_OBJECT_COUNTERS, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_READ_COUNTERS_BUFF, + UVERBS_ATTR_MIN_SIZE(0), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_READ_COUNTERS_FLAGS, + enum ib_uverbs_read_counters_flags)); DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_COUNTERS, - &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_counters), + UVERBS_TYPE_ALLOC_IDR(uverbs_free_counters), &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_CREATE), 
&UVERBS_METHOD(UVERBS_METHOD_COUNTERS_DESTROY), &UVERBS_METHOD(UVERBS_METHOD_COUNTERS_READ)); - diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index 3d293d01afea..5b5f2052cd52 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -44,21 +44,26 @@ static int uverbs_free_cq(struct ib_uobject *uobject, int ret; ret = ib_destroy_cq(cq); - if (!ret || why != RDMA_REMOVE_DESTROY) - ib_uverbs_release_ucq(uobject->context->ufile, ev_queue ? - container_of(ev_queue, - struct ib_uverbs_completion_event_file, - ev_queue) : NULL, - ucq); + if (ib_is_destroy_retryable(ret, why, uobject)) + return ret; + + ib_uverbs_release_ucq( + uobject->context->ufile, + ev_queue ? container_of(ev_queue, + struct ib_uverbs_completion_event_file, + ev_queue) : + NULL, + ucq); return ret; } -static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs) +static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { - struct ib_ucontext *ucontext = file->ucontext; - struct ib_ucq_object *obj; + struct ib_ucq_object *obj = container_of( + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE), + typeof(*obj), uobject); + struct ib_device *ib_dev = obj->uobject.context->device; struct ib_udata uhw; int ret; u64 user_handle; @@ -67,7 +72,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev, struct ib_uverbs_completion_event_file *ev_file = NULL; struct ib_uobject *ev_file_uobj; - if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_CREATE_CQ)) + if (!ib_dev->create_cq || !ib_dev->destroy_cq) return -EOPNOTSUPP; ret = uverbs_copy_from(&attr.comp_vector, attrs, @@ -81,28 +86,26 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev, if (ret) return ret; - /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ - if (IS_UVERBS_COPY_ERR(uverbs_copy_from(&attr.flags, attrs, - UVERBS_ATTR_CREATE_CQ_FLAGS))) - return -EFAULT; + ret = uverbs_get_flags32(&attr.flags, attrs, + UVERBS_ATTR_CREATE_CQ_FLAGS, + IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION | + IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN); + if (ret) + return ret; ev_file_uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL); if (!IS_ERR(ev_file_uobj)) { ev_file = container_of(ev_file_uobj, struct ib_uverbs_completion_event_file, - uobj_file.uobj); + uobj); uverbs_uobject_get(ev_file_uobj); } - if (attr.comp_vector >= ucontext->ufile->device->num_comp_vectors) { + if (attr.comp_vector >= file->device->num_comp_vectors) { ret = -EINVAL; goto err_event_file; } - obj = container_of(uverbs_attr_get_uobject(attrs, - UVERBS_ATTR_CREATE_CQ_HANDLE), - typeof(*obj), uobject); - obj->uverbs_file = ucontext->ufile; obj->comp_events_reported = 0; obj->async_events_reported = 0; INIT_LIST_HEAD(&obj->comp_list); @@ -111,7 +114,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(struct ib_device *ib_dev, /* Temporary, only until drivers get the new uverbs_attr_bundle */ create_udata(attrs, &uhw); - cq = ib_dev->create_cq(ib_dev, &attr, ucontext, &uhw); + cq = ib_dev->create_cq(ib_dev, &attr, obj->uobject.context, &uhw); if (IS_ERR(cq)) { ret = PTR_ERR(cq); goto err_event_file; @@ -143,69 +146,64 @@ err_event_file: return ret; }; -static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_CREATE, - &UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE, 
UVERBS_OBJECT_CQ, - UVERBS_ACCESS_NEW, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE, +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_CQ_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_ACCESS_READ, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_CREATE_CQ_FLAGS, + enum ib_uverbs_ex_create_cq_flags), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, UVERBS_ATTR_TYPE(u32), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE, - UVERBS_ATTR_TYPE(u64), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_FD(UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, - UVERBS_OBJECT_COMP_CHANNEL, - UVERBS_ACCESS_READ), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, UVERBS_ATTR_TYPE(u32), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_FLAGS, UVERBS_ATTR_TYPE(u32)), - &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_CREATE_CQ_RESP_CQE, UVERBS_ATTR_TYPE(u32), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &uverbs_uhw_compat_in, &uverbs_uhw_compat_out); - -static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs) + UA_MANDATORY), + UVERBS_ATTR_UHW()); + +static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_DESTROY_CQ_HANDLE); - struct ib_uverbs_destroy_cq_resp resp; - struct ib_ucq_object *obj; - int ret; - - if (IS_ERR(uobj)) - return PTR_ERR(uobj); - - obj = container_of(uobj, struct ib_ucq_object, uobject); - - if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_DESTROY_CQ)) - return -EOPNOTSUPP; - - ret = rdma_explicit_destroy(uobj); - if (ret) - return ret; - - resp.comp_events_reported = obj->comp_events_reported; - resp.async_events_reported = obj->async_events_reported; + struct ib_ucq_object *obj = + container_of(uobj, struct ib_ucq_object, uobject); + struct ib_uverbs_destroy_cq_resp resp = { + .comp_events_reported = obj->comp_events_reported, + .async_events_reported = obj->async_events_reported + }; return uverbs_copy_to(attrs, UVERBS_ATTR_DESTROY_CQ_RESP, &resp, sizeof(resp)); } -static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_CQ_DESTROY, - &UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_CQ_HANDLE, UVERBS_OBJECT_CQ, - UVERBS_ACCESS_DESTROY, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_CQ_RESP, - UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_cq_resp), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); - -DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_CQ, - &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, - uverbs_free_cq), +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_CQ_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_CQ_HANDLE, + UVERBS_OBJECT_CQ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_DESTROY_CQ_RESP, + UVERBS_ATTR_TYPE(struct ib_uverbs_destroy_cq_resp), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_CQ, + 
UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), uverbs_free_cq), + #if IS_ENABLED(CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI) - &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE), - &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY) + &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY) #endif - ); - +); diff --git a/drivers/infiniband/core/uverbs_std_types_dm.c b/drivers/infiniband/core/uverbs_std_types_dm.c index 8b681575b615..edc3ff7733d4 100644 --- a/drivers/infiniband/core/uverbs_std_types_dm.c +++ b/drivers/infiniband/core/uverbs_std_types_dm.c @@ -37,20 +37,24 @@ static int uverbs_free_dm(struct ib_uobject *uobject, enum rdma_remove_reason why) { struct ib_dm *dm = uobject->object; + int ret; - if (why == RDMA_REMOVE_DESTROY && atomic_read(&dm->usecnt)) - return -EBUSY; + ret = ib_destroy_usecnt(&dm->usecnt, why, uobject); + if (ret) + return ret; return dm->device->dealloc_dm(dm); } -static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs) +static int +UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) { - struct ib_ucontext *ucontext = file->ucontext; struct ib_dm_alloc_attr attr = {}; - struct ib_uobject *uobj; + struct ib_uobject *uobj = + uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE) + ->obj_attr.uobject; + struct ib_device *ib_dev = uobj->context->device; struct ib_dm *dm; int ret; @@ -67,9 +71,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(struct ib_device *ib_dev, if (ret) return ret; - uobj = uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE)->obj_attr.uobject; - - dm = ib_dev->alloc_dm(ib_dev, ucontext, &attr, attrs); + dm = ib_dev->alloc_dm(ib_dev, uobj->context, &attr, attrs); if (IS_ERR(dm)) return PTR_ERR(dm); @@ -83,26 +85,27 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(struct ib_device *ib_dev, return 0; } -static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_DM_ALLOC, - &UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DM_HANDLE, UVERBS_OBJECT_DM, - UVERBS_ACCESS_NEW, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_LENGTH, - UVERBS_ATTR_TYPE(u64), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_ALIGNMENT, - UVERBS_ATTR_TYPE(u32), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); - -static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_DM_FREE, - uverbs_destroy_def_handler, - &UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DM_HANDLE, - UVERBS_OBJECT_DM, - UVERBS_ACCESS_DESTROY, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_DM_ALLOC, + UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DM_HANDLE, + UVERBS_OBJECT_DM, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DM_ALIGNMENT, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_DM_FREE, + UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DM_HANDLE, + UVERBS_OBJECT_DM, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DM, - /* 1 is used in order to free the DM after MRs */ - &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_dm), + UVERBS_TYPE_ALLOC_IDR(uverbs_free_dm), &UVERBS_METHOD(UVERBS_METHOD_DM_ALLOC), &UVERBS_METHOD(UVERBS_METHOD_DM_FREE)); diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c index 
a7be51cf2e42..d8cfafe23bd9 100644 --- a/drivers/infiniband/core/uverbs_std_types_flow_action.c +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -37,10 +37,11 @@ static int uverbs_free_flow_action(struct ib_uobject *uobject, enum rdma_remove_reason why) { struct ib_flow_action *action = uobject->object; + int ret; - if (why == RDMA_REMOVE_DESTROY && - atomic_read(&action->usecnt)) - return -EBUSY; + ret = ib_destroy_usecnt(&action->usecnt, why, uobject); + if (ret) + return ret; return action->device->destroy_flow_action(action); } @@ -303,12 +304,13 @@ static int parse_flow_action_esp(struct ib_device *ib_dev, return 0; } -static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs) +static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE); + struct ib_device *ib_dev = uobj->context->device; int ret; - struct ib_uobject *uobj; struct ib_flow_action *action; struct ib_flow_action_esp_attr esp_attr = {}; @@ -320,7 +322,6 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(struct ib_device return ret; /* No need to check as this attribute is marked as MANDATORY */ - uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE); action = ib_dev->create_flow_action_esp(ib_dev, &esp_attr.hdr, attrs); if (IS_ERR(action)) return PTR_ERR(action); @@ -334,102 +335,109 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(struct ib_device return 0; } -static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs) +static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE); + struct ib_flow_action *action = uobj->object; int ret; - struct ib_uobject *uobj; - struct ib_flow_action *action; struct ib_flow_action_esp_attr esp_attr = {}; - if (!ib_dev->modify_flow_action_esp) + if (!action->device->modify_flow_action_esp) return -EOPNOTSUPP; - ret = parse_flow_action_esp(ib_dev, file, attrs, &esp_attr, true); + ret = parse_flow_action_esp(action->device, file, attrs, &esp_attr, + true); if (ret) return ret; - uobj = uverbs_attr_get_uobject(attrs, UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE); - action = uobj->object; - if (action->type != IB_FLOW_ACTION_ESP) return -EINVAL; - return ib_dev->modify_flow_action_esp(action, - &esp_attr.hdr, - attrs); + return action->device->modify_flow_action_esp(action, &esp_attr.hdr, + attrs); } static const struct uverbs_attr_spec uverbs_flow_action_esp_keymat[] = { [IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM] = { - { .ptr = { - .type = UVERBS_ATTR_TYPE_PTR_IN, - UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_keymat_aes_gcm), - .flags = UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO, - } }, + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_STRUCT( + struct ib_uverbs_flow_action_esp_keymat_aes_gcm, + aes_key), }, }; static const struct uverbs_attr_spec uverbs_flow_action_esp_replay[] = { [IB_UVERBS_FLOW_ACTION_ESP_REPLAY_NONE] = { - { .ptr = { - .type = UVERBS_ATTR_TYPE_PTR_IN, - /* No need to specify any data */ - .len = 0, - } } + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), }, 
[IB_UVERBS_FLOW_ACTION_ESP_REPLAY_BMP] = { - { .ptr = { - .type = UVERBS_ATTR_TYPE_PTR_IN, - UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_replay_bmp, size), - .flags = UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO, - } } + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_replay_bmp, + size), }, }; -static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, - &UVERBS_ATTR_IDR(UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE, UVERBS_OBJECT_FLOW_ACTION, - UVERBS_ACCESS_NEW, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, - UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, hard_limit_pkts), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY | - UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, UVERBS_ATTR_TYPE(__u32)), - &UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, - uverbs_flow_action_esp_keymat, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, - uverbs_flow_action_esp_replay), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, - UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_encap, type))); - -static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY, - &UVERBS_ATTR_IDR(UVERBS_ATTR_FLOW_ACTION_ESP_HANDLE, UVERBS_OBJECT_FLOW_ACTION, - UVERBS_ACCESS_WRITE, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, - UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, hard_limit_pkts), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ_OR_ZERO)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, UVERBS_ATTR_TYPE(__u32)), - &UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, - uverbs_flow_action_esp_keymat), - &UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, - uverbs_flow_action_esp_replay), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, - UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp_encap, type))); - -static DECLARE_UVERBS_NAMED_METHOD_WITH_HANDLER(UVERBS_METHOD_FLOW_ACTION_DESTROY, - uverbs_destroy_def_handler, - &UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, - UVERBS_OBJECT_FLOW_ACTION, - UVERBS_ACCESS_DESTROY, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); - -DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_FLOW_ACTION, - &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow_action), - &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE), - &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY), - &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)); - +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, + hard_limit_pkts), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, + UVERBS_ATTR_TYPE(__u32), + UA_OPTIONAL), + UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, + uverbs_flow_action_esp_keymat, + UA_MANDATORY), + UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, + uverbs_flow_action_esp_replay, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN( + UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, + UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_encap), + UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY, + UVERBS_ATTR_IDR(UVERBS_ATTR_MODIFY_FLOW_ACTION_ESP_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_WRITE, + UA_MANDATORY), + 
UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ATTRS, + UVERBS_ATTR_STRUCT(struct ib_uverbs_flow_action_esp, + hard_limit_pkts), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_FLOW_ACTION_ESP_ESN, + UVERBS_ATTR_TYPE(__u32), + UA_OPTIONAL), + UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_KEYMAT, + uverbs_flow_action_esp_keymat, + UA_OPTIONAL), + UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_FLOW_ACTION_ESP_REPLAY, + uverbs_flow_action_esp_replay, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN( + UVERBS_ATTR_FLOW_ACTION_ESP_ENCAP, + UVERBS_ATTR_TYPE(struct ib_uverbs_flow_action_esp_encap), + UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_FLOW_ACTION_DESTROY, + UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_flow_action), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_DESTROY), + &UVERBS_METHOD(UVERBS_METHOD_FLOW_ACTION_ESP_MODIFY)); diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 68f7cadf088f..cf02e774303e 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -39,14 +39,18 @@ static int uverbs_free_mr(struct ib_uobject *uobject, return ib_dereg_mr((struct ib_mr *)uobject->object); } -static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(struct ib_device *ib_dev, - struct ib_uverbs_file *file, - struct uverbs_attr_bundle *attrs) +static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { struct ib_dm_mr_attr attr = {}; - struct ib_uobject *uobj; - struct ib_dm *dm; - struct ib_pd *pd; + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE); + struct ib_dm *dm = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_DM_HANDLE); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_PD_HANDLE); + struct ib_device *ib_dev = pd->device; + struct ib_mr *mr; int ret; @@ -62,8 +66,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(struct ib_device *ib_dev, if (ret) return ret; - ret = uverbs_copy_from(&attr.access_flags, attrs, - UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS); + ret = uverbs_get_flags32(&attr.access_flags, attrs, + UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS, + IB_ACCESS_SUPPORTED); if (ret) return ret; @@ -74,12 +79,6 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(struct ib_device *ib_dev, if (ret) return ret; - pd = uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_PD_HANDLE); - - dm = uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_DM_MR_DM_HANDLE); - - uobj = uverbs_attr_get(attrs, UVERBS_ATTR_REG_DM_MR_HANDLE)->obj_attr.uobject; - if (attr.offset > dm->length || attr.length > dm->length || attr.length > dm->length - attr.offset) return -EINVAL; @@ -115,33 +114,36 @@ err_dereg: return ret; } -static DECLARE_UVERBS_NAMED_METHOD(UVERBS_METHOD_DM_MR_REG, - &UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE, UVERBS_OBJECT_MR, - UVERBS_ACCESS_NEW, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_OFFSET, - UVERBS_ATTR_TYPE(u64), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_LENGTH, - UVERBS_ATTR_TYPE(u64), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_PD_HANDLE, UVERBS_OBJECT_PD, - UVERBS_ACCESS_READ, - 
UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS, +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_DM_MR_REG, + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_DM_MR_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_DM_MR_ACCESS_FLAGS, + enum ib_access_flags), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_DM_HANDLE, + UVERBS_OBJECT_DM, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_RKEY, UVERBS_ATTR_TYPE(u32), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_IDR(UVERBS_ATTR_REG_DM_MR_DM_HANDLE, UVERBS_OBJECT_DM, - UVERBS_ACCESS_READ, - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_LKEY, - UVERBS_ATTR_TYPE(u32), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_DM_MR_RESP_RKEY, - UVERBS_ATTR_TYPE(u32), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); - -DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_MR, - /* 1 is used in order to free the MR after all the MWs */ - &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_mr), - &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG)); + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + UVERBS_OBJECT_MR, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_mr), + &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG)); diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c new file mode 100644 index 000000000000..73ea6f0db88f --- /dev/null +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -0,0 +1,346 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. 
+ */ +#include <rdma/uverbs_ioctl.h> +#include <rdma/rdma_user_ioctl.h> +#include <linux/bitops.h> +#include "rdma_core.h" +#include "uverbs.h" + +static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size) +{ + void *elm; + int rc; + + if (key == UVERBS_API_KEY_ERR) + return ERR_PTR(-EOVERFLOW); + + elm = kzalloc(alloc_size, GFP_KERNEL); + rc = radix_tree_insert(&uapi->radix, key, elm); + if (rc) { + kfree(elm); + return ERR_PTR(rc); + } + + return elm; +} + +static int uapi_merge_method(struct uverbs_api *uapi, + struct uverbs_api_object *obj_elm, u32 obj_key, + const struct uverbs_method_def *method, + bool is_driver) +{ + u32 method_key = obj_key | uapi_key_ioctl_method(method->id); + struct uverbs_api_ioctl_method *method_elm; + unsigned int i; + + if (!method->attrs) + return 0; + + method_elm = uapi_add_elm(uapi, method_key, sizeof(*method_elm)); + if (IS_ERR(method_elm)) { + if (method_elm != ERR_PTR(-EEXIST)) + return PTR_ERR(method_elm); + + /* + * This occurs when a driver uses ADD_UVERBS_ATTRIBUTES_SIMPLE + */ + if (WARN_ON(method->handler)) + return -EINVAL; + method_elm = radix_tree_lookup(&uapi->radix, method_key); + if (WARN_ON(!method_elm)) + return -EINVAL; + } else { + WARN_ON(!method->handler); + rcu_assign_pointer(method_elm->handler, method->handler); + if (method->handler != uverbs_destroy_def_handler) + method_elm->driver_method = is_driver; + } + + for (i = 0; i != method->num_attrs; i++) { + const struct uverbs_attr_def *attr = (*method->attrs)[i]; + struct uverbs_api_attr *attr_slot; + + if (!attr) + continue; + + /* + * ENUM_IN contains the 'ids' pointer to the driver's .rodata, + * so if it is specified by a driver then it always makes this + * into a driver method. + */ + if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN) + method_elm->driver_method |= is_driver; + + attr_slot = + uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id), + sizeof(*attr_slot)); + /* Attributes are not allowed to be modified by drivers */ + if (IS_ERR(attr_slot)) + return PTR_ERR(attr_slot); + + attr_slot->spec = attr->attr; + } + + return 0; +} + +static int uapi_merge_tree(struct uverbs_api *uapi, + const struct uverbs_object_tree_def *tree, + bool is_driver) +{ + unsigned int i, j; + int rc; + + if (!tree->objects) + return 0; + + for (i = 0; i != tree->num_objects; i++) { + const struct uverbs_object_def *obj = (*tree->objects)[i]; + struct uverbs_api_object *obj_elm; + u32 obj_key; + + if (!obj) + continue; + + obj_key = uapi_key_obj(obj->id); + obj_elm = uapi_add_elm(uapi, obj_key, sizeof(*obj_elm)); + if (IS_ERR(obj_elm)) { + if (obj_elm != ERR_PTR(-EEXIST)) + return PTR_ERR(obj_elm); + + /* This occurs when a driver uses ADD_UVERBS_METHODS */ + if (WARN_ON(obj->type_attrs)) + return -EINVAL; + obj_elm = radix_tree_lookup(&uapi->radix, obj_key); + if (WARN_ON(!obj_elm)) + return -EINVAL; + } else { + obj_elm->type_attrs = obj->type_attrs; + if (obj->type_attrs) { + obj_elm->type_class = + obj->type_attrs->type_class; + /* + * Today drivers are only permitted to use + * idr_class types. They cannot use FD types + * because we currently have no way to revoke + * the fops pointer after device + * disassociation. 
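uapi_merge_tree() consumes object trees built from the DECLARE_UVERBS_* macros that the rest of this series converts to (see uverbs_std_types*.c above). As a rough sketch of what a driver-provided tree placed on the driver_specs list might look like — the FOO_* identifiers, the empty handler body and uverbs_free_foo() are hypothetical, shown only to illustrate the shape the merge code expects:

static int UVERBS_HANDLER(FOO_METHOD_CREATE)(
	struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs)
{
	/* allocate the driver object and attach it to the NEW uobject */
	return 0;
}

DECLARE_UVERBS_NAMED_METHOD(
	FOO_METHOD_CREATE,
	UVERBS_ATTR_IDR(FOO_ATTR_CREATE_HANDLE,
			UVERBS_OBJECT_FOO,
			UVERBS_ACCESS_NEW,
			UA_MANDATORY),
	UVERBS_ATTR_PTR_IN(FOO_ATTR_CREATE_SIZE,
			   UVERBS_ATTR_TYPE(u64),
			   UA_MANDATORY));

/* Drivers may only use the idr allocation class, per the check above. */
DECLARE_UVERBS_NAMED_OBJECT(
	UVERBS_OBJECT_FOO,
	UVERBS_TYPE_ALLOC_IDR(uverbs_free_foo),
	&UVERBS_METHOD(FOO_METHOD_CREATE));

DECLARE_UVERBS_OBJECT_TREE(foo_driver_objects,
			   &UVERBS_OBJECT(UVERBS_OBJECT_FOO));

uverbs_alloc_api() below first merges uverbs_default_get_objects() with is_driver=false and then each entry of driver_specs with is_driver=true, which is what lets uverbs_disassociate_api_pre() later revoke only the driver-owned method handlers.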
+ */ + if (WARN_ON(is_driver && + obj->type_attrs->type_class != + &uverbs_idr_class)) + return -EINVAL; + } + } + + if (!obj->methods) + continue; + + for (j = 0; j != obj->num_methods; j++) { + const struct uverbs_method_def *method = + (*obj->methods)[j]; + if (!method) + continue; + + rc = uapi_merge_method(uapi, obj_elm, obj_key, method, + is_driver); + if (rc) + return rc; + } + } + + return 0; +} + +static int +uapi_finalize_ioctl_method(struct uverbs_api *uapi, + struct uverbs_api_ioctl_method *method_elm, + u32 method_key) +{ + struct radix_tree_iter iter; + unsigned int num_attrs = 0; + unsigned int max_bkey = 0; + bool single_uobj = false; + void __rcu **slot; + + method_elm->destroy_bkey = UVERBS_API_ATTR_BKEY_LEN; + radix_tree_for_each_slot (slot, &uapi->radix, &iter, + uapi_key_attrs_start(method_key)) { + struct uverbs_api_attr *elm = + rcu_dereference_protected(*slot, true); + u32 attr_key = iter.index & UVERBS_API_ATTR_KEY_MASK; + u32 attr_bkey = uapi_bkey_attr(attr_key); + u8 type = elm->spec.type; + + if (uapi_key_attr_to_method(iter.index) != + uapi_key_attr_to_method(method_key)) + break; + + if (elm->spec.mandatory) + __set_bit(attr_bkey, method_elm->attr_mandatory); + + if (type == UVERBS_ATTR_TYPE_IDR || + type == UVERBS_ATTR_TYPE_FD) { + u8 access = elm->spec.u.obj.access; + + /* + * Verbs specs may only have one NEW/DESTROY, we don't + * have the infrastructure to abort multiple NEW's or + * cope with multiple DESTROY failure. + */ + if (access == UVERBS_ACCESS_NEW || + access == UVERBS_ACCESS_DESTROY) { + if (WARN_ON(single_uobj)) + return -EINVAL; + + single_uobj = true; + if (WARN_ON(!elm->spec.mandatory)) + return -EINVAL; + } + + if (access == UVERBS_ACCESS_DESTROY) + method_elm->destroy_bkey = attr_bkey; + } + + max_bkey = max(max_bkey, attr_bkey); + num_attrs++; + } + + method_elm->key_bitmap_len = max_bkey + 1; + WARN_ON(method_elm->key_bitmap_len > UVERBS_API_ATTR_BKEY_LEN); + + uapi_compute_bundle_size(method_elm, num_attrs); + return 0; +} + +static int uapi_finalize(struct uverbs_api *uapi) +{ + struct radix_tree_iter iter; + void __rcu **slot; + int rc; + + radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { + struct uverbs_api_ioctl_method *method_elm = + rcu_dereference_protected(*slot, true); + + if (uapi_key_is_ioctl_method(iter.index)) { + rc = uapi_finalize_ioctl_method(uapi, method_elm, + iter.index); + if (rc) + return rc; + } + } + + return 0; +} + +void uverbs_destroy_api(struct uverbs_api *uapi) +{ + struct radix_tree_iter iter; + void __rcu **slot; + + if (!uapi) + return; + + radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { + kfree(rcu_dereference_protected(*slot, true)); + radix_tree_iter_delete(&uapi->radix, &iter, slot); + } +} + +struct uverbs_api *uverbs_alloc_api( + const struct uverbs_object_tree_def *const *driver_specs, + enum rdma_driver_id driver_id) +{ + struct uverbs_api *uapi; + int rc; + + uapi = kzalloc(sizeof(*uapi), GFP_KERNEL); + if (!uapi) + return ERR_PTR(-ENOMEM); + + INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL); + uapi->driver_id = driver_id; + + rc = uapi_merge_tree(uapi, uverbs_default_get_objects(), false); + if (rc) + goto err; + + for (; driver_specs && *driver_specs; driver_specs++) { + rc = uapi_merge_tree(uapi, *driver_specs, true); + if (rc) + goto err; + } + + rc = uapi_finalize(uapi); + if (rc) + goto err; + + return uapi; +err: + if (rc != -ENOMEM) + pr_err("Setup of uverbs_api failed, kernel parsing tree description is not valid (%d)??\n", + rc); + + uverbs_destroy_api(uapi); + return 
ERR_PTR(rc); +} + +/* + * The pre version is done before destroying the HW objects, it only blocks + * off method access. All methods that require the ib_dev or the module data + * must test one of these assignments prior to continuing. + */ +void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev) +{ + struct uverbs_api *uapi = uverbs_dev->uapi; + struct radix_tree_iter iter; + void __rcu **slot; + + rcu_assign_pointer(uverbs_dev->ib_dev, NULL); + + radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { + if (uapi_key_is_ioctl_method(iter.index)) { + struct uverbs_api_ioctl_method *method_elm = + rcu_dereference_protected(*slot, true); + + if (method_elm->driver_method) + rcu_assign_pointer(method_elm->handler, NULL); + } + } + + synchronize_srcu(&uverbs_dev->disassociate_srcu); +} + +/* + * Called when a driver disassociates from the ib_uverbs_device. The + * assumption is that the driver module will unload after. Replace everything + * related to the driver with NULL as a safety measure. + */ +void uverbs_disassociate_api(struct uverbs_api *uapi) +{ + struct radix_tree_iter iter; + void __rcu **slot; + + radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { + if (uapi_key_is_object(iter.index)) { + struct uverbs_api_object *object_elm = + rcu_dereference_protected(*slot, true); + + /* + * Some type_attrs are in the driver module. We don't + * bother to keep track of which since there should be + * no use of this after disassociate. + */ + object_elm->type_attrs = NULL; + } else if (uapi_key_is_attr(iter.index)) { + struct uverbs_api_attr *elm = + rcu_dereference_protected(*slot, true); + + if (elm->spec.type == UVERBS_ATTR_TYPE_ENUM_IN) + elm->spec.u2.enum_def.ids = NULL; + } + } +} diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 9d6beb948535..6ee03d6089eb 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -326,12 +326,162 @@ EXPORT_SYMBOL(ib_dealloc_pd); /* Address handles */ +/** + * rdma_copy_ah_attr - Copy rdma ah attribute from source to destination. + * @dest: Pointer to destination ah_attr. Contents of the destination + * pointer is assumed to be invalid and attribute are overwritten. + * @src: Pointer to source ah_attr. + */ +void rdma_copy_ah_attr(struct rdma_ah_attr *dest, + const struct rdma_ah_attr *src) +{ + *dest = *src; + if (dest->grh.sgid_attr) + rdma_hold_gid_attr(dest->grh.sgid_attr); +} +EXPORT_SYMBOL(rdma_copy_ah_attr); + +/** + * rdma_replace_ah_attr - Replace valid ah_attr with new new one. + * @old: Pointer to existing ah_attr which needs to be replaced. + * old is assumed to be valid or zero'd + * @new: Pointer to the new ah_attr. + * + * rdma_replace_ah_attr() first releases any reference in the old ah_attr if + * old the ah_attr is valid; after that it copies the new attribute and holds + * the reference to the replaced ah_attr. + */ +void rdma_replace_ah_attr(struct rdma_ah_attr *old, + const struct rdma_ah_attr *new) +{ + rdma_destroy_ah_attr(old); + *old = *new; + if (old->grh.sgid_attr) + rdma_hold_gid_attr(old->grh.sgid_attr); +} +EXPORT_SYMBOL(rdma_replace_ah_attr); + +/** + * rdma_move_ah_attr - Move ah_attr pointed by source to destination. + * @dest: Pointer to destination ah_attr to copy to. + * dest is assumed to be valid or zero'd + * @src: Pointer to the new ah_attr. + * + * rdma_move_ah_attr() first releases any reference in the destination ah_attr + * if it is valid. 
This also transfers ownership of internal references from + * src to dest, making src invalid in the process. No new reference of the src + * ah_attr is taken. + */ +void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src) +{ + rdma_destroy_ah_attr(dest); + *dest = *src; + src->grh.sgid_attr = NULL; +} +EXPORT_SYMBOL(rdma_move_ah_attr); + +/* + * Validate that the rdma_ah_attr is valid for the device before passing it + * off to the driver. + */ +static int rdma_check_ah_attr(struct ib_device *device, + struct rdma_ah_attr *ah_attr) +{ + if (!rdma_is_port_valid(device, ah_attr->port_num)) + return -EINVAL; + + if ((rdma_is_grh_required(device, ah_attr->port_num) || + ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) && + !(ah_attr->ah_flags & IB_AH_GRH)) + return -EINVAL; + + if (ah_attr->grh.sgid_attr) { + /* + * Make sure the passed sgid_attr is consistent with the + * parameters + */ + if (ah_attr->grh.sgid_attr->index != ah_attr->grh.sgid_index || + ah_attr->grh.sgid_attr->port_num != ah_attr->port_num) + return -EINVAL; + } + return 0; +} + +/* + * If the ah requires a GRH then ensure that sgid_attr pointer is filled in. + * On success the caller is responsible to call rdma_unfill_sgid_attr(). + */ +static int rdma_fill_sgid_attr(struct ib_device *device, + struct rdma_ah_attr *ah_attr, + const struct ib_gid_attr **old_sgid_attr) +{ + const struct ib_gid_attr *sgid_attr; + struct ib_global_route *grh; + int ret; + + *old_sgid_attr = ah_attr->grh.sgid_attr; + + ret = rdma_check_ah_attr(device, ah_attr); + if (ret) + return ret; + + if (!(ah_attr->ah_flags & IB_AH_GRH)) + return 0; + + grh = rdma_ah_retrieve_grh(ah_attr); + if (grh->sgid_attr) + return 0; + + sgid_attr = + rdma_get_gid_attr(device, ah_attr->port_num, grh->sgid_index); + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); + + /* Move ownerhip of the kref into the ah_attr */ + grh->sgid_attr = sgid_attr; + return 0; +} + +static void rdma_unfill_sgid_attr(struct rdma_ah_attr *ah_attr, + const struct ib_gid_attr *old_sgid_attr) +{ + /* + * Fill didn't change anything, the caller retains ownership of + * whatever it passed + */ + if (ah_attr->grh.sgid_attr == old_sgid_attr) + return; + + /* + * Otherwise, we need to undo what rdma_fill_sgid_attr so the caller + * doesn't see any change in the rdma_ah_attr. If we get here + * old_sgid_attr is NULL. + */ + rdma_destroy_ah_attr(ah_attr); +} + +static const struct ib_gid_attr * +rdma_update_sgid_attr(struct rdma_ah_attr *ah_attr, + const struct ib_gid_attr *old_attr) +{ + if (old_attr) + rdma_put_gid_attr(old_attr); + if (ah_attr->ah_flags & IB_AH_GRH) { + rdma_hold_gid_attr(ah_attr->grh.sgid_attr); + return ah_attr->grh.sgid_attr; + } + return NULL; +} + static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata) { struct ib_ah *ah; + if (!pd->device->create_ah) + return ERR_PTR(-EOPNOTSUPP); + ah = pd->device->create_ah(pd, ah_attr, udata); if (!IS_ERR(ah)) { @@ -339,15 +489,38 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, ah->pd = pd; ah->uobject = NULL; ah->type = ah_attr->type; + ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); + atomic_inc(&pd->usecnt); } return ah; } +/** + * rdma_create_ah - Creates an address handle for the + * given address vector. + * @pd: The protection domain associated with the address handle. + * @ah_attr: The attributes of the address vector. + * + * It returns 0 on success and returns appropriate error code on error. 
+ * The address handle is used to reference a local or global destination + * in all UD QP post sends. + */ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr) { - return _rdma_create_ah(pd, ah_attr, NULL); + const struct ib_gid_attr *old_sgid_attr; + struct ib_ah *ah; + int ret; + + ret = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr); + if (ret) + return ERR_PTR(ret); + + ah = _rdma_create_ah(pd, ah_attr, NULL); + + rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); + return ah; } EXPORT_SYMBOL(rdma_create_ah); @@ -368,15 +541,27 @@ struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata) { + const struct ib_gid_attr *old_sgid_attr; + struct ib_ah *ah; int err; + err = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr); + if (err) + return ERR_PTR(err); + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { err = ib_resolve_eth_dmac(pd->device, ah_attr); - if (err) - return ERR_PTR(err); + if (err) { + ah = ERR_PTR(err); + goto out; + } } - return _rdma_create_ah(pd, ah_attr, udata); + ah = _rdma_create_ah(pd, ah_attr, udata); + +out: + rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); + return ah; } EXPORT_SYMBOL(rdma_create_user_ah); @@ -455,16 +640,16 @@ static bool find_gid_index(const union ib_gid *gid, return true; } -static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num, - u16 vlan_id, const union ib_gid *sgid, - enum ib_gid_type gid_type, - u16 *gid_index) +static const struct ib_gid_attr * +get_sgid_attr_from_eth(struct ib_device *device, u8 port_num, + u16 vlan_id, const union ib_gid *sgid, + enum ib_gid_type gid_type) { struct find_gid_index_context context = {.vlan_id = vlan_id, .gid_type = gid_type}; - return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index, - &context, gid_index); + return rdma_find_gid_by_filter(device, sgid, port_num, find_gid_index, + &context); } int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, @@ -508,39 +693,24 @@ EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); static int ib_resolve_unicast_gid_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr) { - struct ib_gid_attr sgid_attr; - struct ib_global_route *grh; + struct ib_global_route *grh = rdma_ah_retrieve_grh(ah_attr); + const struct ib_gid_attr *sgid_attr = grh->sgid_attr; int hop_limit = 0xff; - union ib_gid sgid; - int ret; - - grh = rdma_ah_retrieve_grh(ah_attr); - - ret = ib_query_gid(device, - rdma_ah_get_port_num(ah_attr), - grh->sgid_index, - &sgid, &sgid_attr); - if (ret || !sgid_attr.ndev) { - if (!ret) - ret = -ENXIO; - return ret; - } + int ret = 0; /* If destination is link local and source GID is RoCEv1, * IP stack is not used. */ if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) && - sgid_attr.gid_type == IB_GID_TYPE_ROCE) { + sgid_attr->gid_type == IB_GID_TYPE_ROCE) { rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, ah_attr->roce.dmac); - goto done; + return ret; } - ret = rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid, + ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid, ah_attr->roce.dmac, - sgid_attr.ndev, &hop_limit); -done: - dev_put(sgid_attr.ndev); + sgid_attr->ndev, &hop_limit); grh->hop_limit = hop_limit; return ret; @@ -555,16 +725,18 @@ done: * as sgid and, sgid is used as dgid because sgid contains destinations * GID whom to respond to. * + * On success the caller is responsible to call rdma_destroy_ah_attr on the + * attr. 
*/ int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct rdma_ah_attr *ah_attr) { u32 flow_class; - u16 gid_index; int ret; enum rdma_network_type net_type = RDMA_NETWORK_IB; enum ib_gid_type gid_type = IB_GID_TYPE_IB; + const struct ib_gid_attr *sgid_attr; int hoplimit = 0xff; union ib_gid dgid; union ib_gid sgid; @@ -595,72 +767,141 @@ int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num, if (!(wc->wc_flags & IB_WC_GRH)) return -EPROTOTYPE; - ret = get_sgid_index_from_eth(device, port_num, - vlan_id, &dgid, - gid_type, &gid_index); - if (ret) - return ret; + sgid_attr = get_sgid_attr_from_eth(device, port_num, + vlan_id, &dgid, + gid_type); + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); flow_class = be32_to_cpu(grh->version_tclass_flow); - rdma_ah_set_grh(ah_attr, &sgid, - flow_class & 0xFFFFF, - (u8)gid_index, hoplimit, - (flow_class >> 20) & 0xFF); - return ib_resolve_unicast_gid_dmac(device, ah_attr); + rdma_move_grh_sgid_attr(ah_attr, + &sgid, + flow_class & 0xFFFFF, + hoplimit, + (flow_class >> 20) & 0xFF, + sgid_attr); + + ret = ib_resolve_unicast_gid_dmac(device, ah_attr); + if (ret) + rdma_destroy_ah_attr(ah_attr); + + return ret; } else { rdma_ah_set_dlid(ah_attr, wc->slid); rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits); - if (wc->wc_flags & IB_WC_GRH) { - if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { - ret = ib_find_cached_gid_by_port(device, &dgid, - IB_GID_TYPE_IB, - port_num, NULL, - &gid_index); - if (ret) - return ret; - } else { - gid_index = 0; - } + if ((wc->wc_flags & IB_WC_GRH) == 0) + return 0; + + if (dgid.global.interface_id != + cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { + sgid_attr = rdma_find_gid_by_port( + device, &dgid, IB_GID_TYPE_IB, port_num, NULL); + } else + sgid_attr = rdma_get_gid_attr(device, port_num, 0); - flow_class = be32_to_cpu(grh->version_tclass_flow); - rdma_ah_set_grh(ah_attr, &sgid, + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); + flow_class = be32_to_cpu(grh->version_tclass_flow); + rdma_move_grh_sgid_attr(ah_attr, + &sgid, flow_class & 0xFFFFF, - (u8)gid_index, hoplimit, - (flow_class >> 20) & 0xFF); - } + hoplimit, + (flow_class >> 20) & 0xFF, + sgid_attr); + return 0; } } EXPORT_SYMBOL(ib_init_ah_attr_from_wc); +/** + * rdma_move_grh_sgid_attr - Sets the sgid attribute of GRH, taking ownership + * of the reference + * + * @attr: Pointer to AH attribute structure + * @dgid: Destination GID + * @flow_label: Flow label + * @hop_limit: Hop limit + * @traffic_class: traffic class + * @sgid_attr: Pointer to SGID attribute + * + * This takes ownership of the sgid_attr reference. The caller must ensure + * rdma_destroy_ah_attr() is called before destroying the rdma_ah_attr after + * calling this function. + */ +void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid, + u32 flow_label, u8 hop_limit, u8 traffic_class, + const struct ib_gid_attr *sgid_attr) +{ + rdma_ah_set_grh(attr, dgid, flow_label, sgid_attr->index, hop_limit, + traffic_class); + attr->grh.sgid_attr = sgid_attr; +} +EXPORT_SYMBOL(rdma_move_grh_sgid_attr); + +/** + * rdma_destroy_ah_attr - Release reference to SGID attribute of + * ah attribute. + * @ah_attr: Pointer to ah attribute + * + * Release reference to the SGID attribute of the ah attribute if it is + * non NULL. It is safe to call this multiple times, and safe to call it on + * a zero initialized ah_attr. 
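Taken together with rdma_move_grh_sgid_attr() above, the ownership rule for callers is the one ib_create_ah_from_wc() below follows: whoever initializes an rdma_ah_attr that ends up holding a GID table reference must release it with rdma_destroy_ah_attr(), whether or not an AH was actually created. A condensed sketch of that pattern — the wrapper name is illustrative, the calls are the ones added in this file:

static struct ib_ah *foo_create_reply_ah(struct ib_pd *pd, u8 port_num,
					 const struct ib_wc *wc,
					 const struct ib_grh *grh)
{
	struct rdma_ah_attr ah_attr;
	struct ib_ah *ah;
	int ret;

	/* On success ah_attr holds a reference on the resolved sgid_attr. */
	ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr);
	if (ret)
		return ERR_PTR(ret);

	/* rdma_create_ah() takes its own reference for the new AH ... */
	ah = rdma_create_ah(pd, &ah_attr);

	/* ... so the caller drops the one held in ah_attr either way;
	 * rdma_destroy_ah() later releases the AH's own reference.
	 */
	rdma_destroy_ah_attr(&ah_attr);
	return ah;
}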
+ */ +void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr) +{ + if (ah_attr->grh.sgid_attr) { + rdma_put_gid_attr(ah_attr->grh.sgid_attr); + ah_attr->grh.sgid_attr = NULL; + } +} +EXPORT_SYMBOL(rdma_destroy_ah_attr); + struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u8 port_num) { struct rdma_ah_attr ah_attr; + struct ib_ah *ah; int ret; ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr); if (ret) return ERR_PTR(ret); - return rdma_create_ah(pd, &ah_attr); + ah = rdma_create_ah(pd, &ah_attr); + + rdma_destroy_ah_attr(&ah_attr); + return ah; } EXPORT_SYMBOL(ib_create_ah_from_wc); int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) { + const struct ib_gid_attr *old_sgid_attr; + int ret; + if (ah->type != ah_attr->type) return -EINVAL; - return ah->device->modify_ah ? + ret = rdma_fill_sgid_attr(ah->device, ah_attr, &old_sgid_attr); + if (ret) + return ret; + + ret = ah->device->modify_ah ? ah->device->modify_ah(ah, ah_attr) : -EOPNOTSUPP; + + ah->sgid_attr = rdma_update_sgid_attr(ah_attr, ah->sgid_attr); + rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); + return ret; } EXPORT_SYMBOL(rdma_modify_ah); int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) { + ah_attr->grh.sgid_attr = NULL; + return ah->device->query_ah ? ah->device->query_ah(ah, ah_attr) : -EOPNOTSUPP; @@ -669,13 +910,17 @@ EXPORT_SYMBOL(rdma_query_ah); int rdma_destroy_ah(struct ib_ah *ah) { + const struct ib_gid_attr *sgid_attr = ah->sgid_attr; struct ib_pd *pd; int ret; pd = ah->pd; ret = ah->device->destroy_ah(ah); - if (!ret) + if (!ret) { atomic_dec(&pd->usecnt); + if (sgid_attr) + rdma_put_gid_attr(sgid_attr); + } return ret; } @@ -1290,16 +1535,19 @@ bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, } EXPORT_SYMBOL(ib_modify_qp_is_ok); +/** + * ib_resolve_eth_dmac - Resolve destination mac address + * @device: Device to consider + * @ah_attr: address handle attribute which describes the + * source and destination parameters + * ib_resolve_eth_dmac() resolves destination mac address and L3 hop limit It + * returns 0 on success or appropriate error code. It initializes the + * necessary ah_attr fields when call is successful. + */ static int ib_resolve_eth_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr) { - int ret = 0; - struct ib_global_route *grh; - - if (!rdma_is_port_valid(device, rdma_ah_get_port_num(ah_attr))) - return -EINVAL; - - grh = rdma_ah_retrieve_grh(ah_attr); + int ret = 0; if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) { @@ -1317,6 +1565,14 @@ static int ib_resolve_eth_dmac(struct ib_device *device, return ret; } +static bool is_qp_type_connected(const struct ib_qp *qp) +{ + return (qp->qp_type == IB_QPT_UC || + qp->qp_type == IB_QPT_RC || + qp->qp_type == IB_QPT_XRC_INI || + qp->qp_type == IB_QPT_XRC_TGT); +} + /** * IB core internal function to perform QP attributes modification. */ @@ -1324,8 +1580,53 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { u8 port = attr_mask & IB_QP_PORT ? 
attr->port_num : qp->port; + const struct ib_gid_attr *old_sgid_attr_av; + const struct ib_gid_attr *old_sgid_attr_alt_av; int ret; + if (attr_mask & IB_QP_AV) { + ret = rdma_fill_sgid_attr(qp->device, &attr->ah_attr, + &old_sgid_attr_av); + if (ret) + return ret; + } + if (attr_mask & IB_QP_ALT_PATH) { + /* + * FIXME: This does not track the migration state, so if the + * user loads a new alternate path after the HW has migrated + * from primary->alternate we will keep the wrong + * references. This is OK for IB because the reference + * counting does not serve any functional purpose. + */ + ret = rdma_fill_sgid_attr(qp->device, &attr->alt_ah_attr, + &old_sgid_attr_alt_av); + if (ret) + goto out_av; + + /* + * Today the core code can only handle alternate paths and APM + * for IB. Ban them in roce mode. + */ + if (!(rdma_protocol_ib(qp->device, + attr->alt_ah_attr.port_num) && + rdma_protocol_ib(qp->device, port))) { + ret = EINVAL; + goto out; + } + } + + /* + * If the user provided the qp_attr then we have to resolve it. Kernel + * users have to provide already resolved rdma_ah_attr's + */ + if (udata && (attr_mask & IB_QP_AV) && + attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && + is_qp_type_connected(qp)) { + ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); + if (ret) + goto out; + } + if (rdma_ib_or_roce(qp->device, port)) { if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n", @@ -1341,20 +1642,27 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, } ret = ib_security_modify_qp(qp, attr, attr_mask, udata); - if (!ret && (attr_mask & IB_QP_PORT)) - qp->port = attr->port_num; + if (ret) + goto out; + if (attr_mask & IB_QP_PORT) + qp->port = attr->port_num; + if (attr_mask & IB_QP_AV) + qp->av_sgid_attr = + rdma_update_sgid_attr(&attr->ah_attr, qp->av_sgid_attr); + if (attr_mask & IB_QP_ALT_PATH) + qp->alt_path_sgid_attr = rdma_update_sgid_attr( + &attr->alt_ah_attr, qp->alt_path_sgid_attr); + +out: + if (attr_mask & IB_QP_ALT_PATH) + rdma_unfill_sgid_attr(&attr->alt_ah_attr, old_sgid_attr_alt_av); +out_av: + if (attr_mask & IB_QP_AV) + rdma_unfill_sgid_attr(&attr->ah_attr, old_sgid_attr_av); return ret; } -static bool is_qp_type_connected(const struct ib_qp *qp) -{ - return (qp->qp_type == IB_QPT_UC || - qp->qp_type == IB_QPT_RC || - qp->qp_type == IB_QPT_XRC_INI || - qp->qp_type == IB_QPT_XRC_TGT); -} - /** * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. * @ib_qp: The QP to modify. @@ -1369,17 +1677,7 @@ static bool is_qp_type_connected(const struct ib_qp *qp) int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { - struct ib_qp *qp = ib_qp->real_qp; - int ret; - - if (attr_mask & IB_QP_AV && - attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && - is_qp_type_connected(qp)) { - ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); - if (ret) - return ret; - } - return _ib_modify_qp(qp, attr, attr_mask, udata); + return _ib_modify_qp(ib_qp->real_qp, attr, attr_mask, udata); } EXPORT_SYMBOL(ib_modify_qp_with_udata); @@ -1451,6 +1749,9 @@ int ib_query_qp(struct ib_qp *qp, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { + qp_attr->ah_attr.grh.sgid_attr = NULL; + qp_attr->alt_ah_attr.grh.sgid_attr = NULL; + return qp->device->query_qp ? 
qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : -EOPNOTSUPP; @@ -1509,6 +1810,8 @@ static int __ib_destroy_shared_qp(struct ib_qp *qp) int ib_destroy_qp(struct ib_qp *qp) { + const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr; + const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr; struct ib_pd *pd; struct ib_cq *scq, *rcq; struct ib_srq *srq; @@ -1539,6 +1842,10 @@ int ib_destroy_qp(struct ib_qp *qp) rdma_restrack_del(&qp->res); ret = qp->device->destroy_qp(qp); if (!ret) { + if (alt_path_sgid_attr) + rdma_put_gid_attr(alt_path_sgid_attr); + if (av_sgid_attr) + rdma_put_gid_attr(av_sgid_attr); if (pd) atomic_dec(&pd->usecnt); if (scq) @@ -1977,35 +2284,6 @@ int ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table) } EXPORT_SYMBOL(ib_destroy_rwq_ind_table); -struct ib_flow *ib_create_flow(struct ib_qp *qp, - struct ib_flow_attr *flow_attr, - int domain) -{ - struct ib_flow *flow_id; - if (!qp->device->create_flow) - return ERR_PTR(-EOPNOTSUPP); - - flow_id = qp->device->create_flow(qp, flow_attr, domain, NULL); - if (!IS_ERR(flow_id)) { - atomic_inc(&qp->usecnt); - flow_id->qp = qp; - } - return flow_id; -} -EXPORT_SYMBOL(ib_create_flow); - -int ib_destroy_flow(struct ib_flow *flow_id) -{ - int err; - struct ib_qp *qp = flow_id->qp; - - err = qp->device->destroy_flow(flow_id); - if (!err) - atomic_dec(&qp->usecnt); - return err; -} -EXPORT_SYMBOL(ib_destroy_flow); - int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status) { @@ -2200,7 +2478,6 @@ static void __ib_drain_sq(struct ib_qp *qp) struct ib_cq *cq = qp->send_cq; struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; struct ib_drain_cqe sdrain; - struct ib_send_wr *bad_swr; struct ib_rdma_wr swr = { .wr = { .next = NULL, @@ -2219,7 +2496,7 @@ static void __ib_drain_sq(struct ib_qp *qp) sdrain.cqe.done = ib_drain_qp_done; init_completion(&sdrain.done); - ret = ib_post_send(qp, &swr.wr, &bad_swr); + ret = ib_post_send(qp, &swr.wr, NULL); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); return; @@ -2240,7 +2517,7 @@ static void __ib_drain_rq(struct ib_qp *qp) struct ib_cq *cq = qp->recv_cq; struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; struct ib_drain_cqe rdrain; - struct ib_recv_wr rwr = {}, *bad_rwr; + struct ib_recv_wr rwr = {}; int ret; ret = ib_modify_qp(qp, &attr, IB_QP_STATE); @@ -2253,7 +2530,7 @@ static void __ib_drain_rq(struct ib_qp *qp) rdrain.cqe.done = ib_drain_qp_done; init_completion(&rdrain.done); - ret = ib_post_recv(qp, &rwr, &bad_rwr); + ret = ib_post_recv(qp, &rwr, NULL); if (ret) { WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); return; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index a76e206704d4..bbfb86eb2d24 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -166,7 +166,8 @@ int bnxt_re_query_device(struct ib_device *ibdev, | IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_WINDOW_TYPE_2B | IB_DEVICE_MEM_MGT_EXTENSIONS; - ib_attr->max_sge = dev_attr->max_qp_sges; + ib_attr->max_send_sge = dev_attr->max_qp_sges; + ib_attr->max_recv_sge = dev_attr->max_qp_sges; ib_attr->max_sge_rd = dev_attr->max_qp_sges; ib_attr->max_cq = dev_attr->max_cq; ib_attr->max_cqe = dev_attr->max_cq_wqes; @@ -243,8 +244,8 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, port_attr->gid_tbl_len = dev_attr->max_sgid; port_attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | IB_PORT_DEVICE_MGMT_SUP | - 
IB_PORT_VENDOR_CLASS_SUP | - IB_PORT_IP_BASED_GIDS; + IB_PORT_VENDOR_CLASS_SUP; + port_attr->ip_gids = true; port_attr->max_msg_sz = (u32)BNXT_RE_MAX_MR_SIZE_LOW; port_attr->bad_pkey_cntr = 0; @@ -364,8 +365,7 @@ int bnxt_re_del_gid(const struct ib_gid_attr *attr, void **context) return rc; } -int bnxt_re_add_gid(const union ib_gid *gid, - const struct ib_gid_attr *attr, void **context) +int bnxt_re_add_gid(const struct ib_gid_attr *attr, void **context) { int rc; u32 tbl_idx = 0; @@ -377,7 +377,7 @@ int bnxt_re_add_gid(const union ib_gid *gid, if ((attr->ndev) && is_vlan_dev(attr->ndev)) vlan_id = vlan_dev_vlan_id(attr->ndev); - rc = bnxt_qplib_add_sgid(sgid_tbl, (struct bnxt_qplib_gid *)gid, + rc = bnxt_qplib_add_sgid(sgid_tbl, (struct bnxt_qplib_gid *)&attr->gid, rdev->qplib_res.netdev->dev_addr, vlan_id, true, &tbl_idx); if (rc == -EALREADY) { @@ -673,8 +673,6 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd, int rc; u8 nw_type; - struct ib_gid_attr sgid_attr; - if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) { dev_err(rdev_to_dev(rdev), "Failed to alloc AH: GRH not set"); return ERR_PTR(-EINVAL); @@ -705,20 +703,11 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd, grh->dgid.raw) && !rdma_link_local_addr((struct in6_addr *) grh->dgid.raw)) { - union ib_gid sgid; + const struct ib_gid_attr *sgid_attr; - rc = ib_get_cached_gid(&rdev->ibdev, 1, - grh->sgid_index, &sgid, - &sgid_attr); - if (rc) { - dev_err(rdev_to_dev(rdev), - "Failed to query gid at index %d", - grh->sgid_index); - goto fail; - } - dev_put(sgid_attr.ndev); + sgid_attr = grh->sgid_attr; /* Get network header type for this GID */ - nw_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); + nw_type = rdma_gid_attr_network_type(sgid_attr); switch (nw_type) { case RDMA_NETWORK_IPV4: ah->qplib_ah.nw_type = CMDQ_CREATE_AH_TYPE_V2IPV4; @@ -1408,7 +1397,7 @@ struct ib_srq *bnxt_re_create_srq(struct ib_pd *ib_pd, } if (srq_init_attr->srq_type != IB_SRQT_BASIC) { - rc = -ENOTSUPP; + rc = -EOPNOTSUPP; goto exit; } @@ -1530,8 +1519,8 @@ int bnxt_re_query_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr) return 0; } -int bnxt_re_post_srq_recv(struct ib_srq *ib_srq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int bnxt_re_post_srq_recv(struct ib_srq *ib_srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, ib_srq); @@ -1599,9 +1588,6 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; enum ib_qp_state curr_qp_state, new_qp_state; int rc, entries; - int status; - union ib_gid sgid; - struct ib_gid_attr sgid_attr; unsigned int flags; u8 nw_type; @@ -1668,6 +1654,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, if (qp_attr_mask & IB_QP_AV) { const struct ib_global_route *grh = rdma_ah_read_grh(&qp_attr->ah_attr); + const struct ib_gid_attr *sgid_attr; qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_DGID | CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL | @@ -1691,29 +1678,23 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, ether_addr_copy(qp->qplib_qp.ah.dmac, qp_attr->ah_attr.roce.dmac); - status = ib_get_cached_gid(&rdev->ibdev, 1, - grh->sgid_index, - &sgid, &sgid_attr); - if (!status) { - memcpy(qp->qplib_qp.smac, sgid_attr.ndev->dev_addr, - ETH_ALEN); - dev_put(sgid_attr.ndev); - nw_type = ib_gid_to_network_type(sgid_attr.gid_type, - &sgid); - switch (nw_type) { - case RDMA_NETWORK_IPV4: - 
qp->qplib_qp.nw_type = - CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV2_IPV4; - break; - case RDMA_NETWORK_IPV6: - qp->qplib_qp.nw_type = - CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV2_IPV6; - break; - default: - qp->qplib_qp.nw_type = - CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV1; - break; - } + sgid_attr = qp_attr->ah_attr.grh.sgid_attr; + memcpy(qp->qplib_qp.smac, sgid_attr->ndev->dev_addr, + ETH_ALEN); + nw_type = rdma_gid_attr_network_type(sgid_attr); + switch (nw_type) { + case RDMA_NETWORK_IPV4: + qp->qplib_qp.nw_type = + CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV2_IPV4; + break; + case RDMA_NETWORK_IPV6: + qp->qplib_qp.nw_type = + CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV2_IPV6; + break; + default: + qp->qplib_qp.nw_type = + CMDQ_MODIFY_QP_NETWORK_TYPE_ROCEV1; + break; } } @@ -1895,19 +1876,17 @@ out: /* Routine for sending QP1 packets for RoCE V1 an V2 */ static int bnxt_re_build_qp1_send_v2(struct bnxt_re_qp *qp, - struct ib_send_wr *wr, + const struct ib_send_wr *wr, struct bnxt_qplib_swqe *wqe, int payload_size) { - struct ib_device *ibdev = &qp->rdev->ibdev; struct bnxt_re_ah *ah = container_of(ud_wr(wr)->ah, struct bnxt_re_ah, ib_ah); struct bnxt_qplib_ah *qplib_ah = &ah->qplib_ah; + const struct ib_gid_attr *sgid_attr = ah->ib_ah.sgid_attr; struct bnxt_qplib_sge sge; - union ib_gid sgid; u8 nw_type; u16 ether_type; - struct ib_gid_attr sgid_attr; union ib_gid dgid; bool is_eth = false; bool is_vlan = false; @@ -1920,22 +1899,10 @@ static int bnxt_re_build_qp1_send_v2(struct bnxt_re_qp *qp, memset(&qp->qp1_hdr, 0, sizeof(qp->qp1_hdr)); - rc = ib_get_cached_gid(ibdev, 1, - qplib_ah->host_sgid_index, &sgid, - &sgid_attr); - if (rc) { - dev_err(rdev_to_dev(qp->rdev), - "Failed to query gid at index %d", - qplib_ah->host_sgid_index); - return rc; - } - if (sgid_attr.ndev) { - if (is_vlan_dev(sgid_attr.ndev)) - vlan_id = vlan_dev_vlan_id(sgid_attr.ndev); - dev_put(sgid_attr.ndev); - } + if (is_vlan_dev(sgid_attr->ndev)) + vlan_id = vlan_dev_vlan_id(sgid_attr->ndev); /* Get network header type for this GID */ - nw_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); + nw_type = rdma_gid_attr_network_type(sgid_attr); switch (nw_type) { case RDMA_NETWORK_IPV4: nw_type = BNXT_RE_ROCEV2_IPV4_PACKET; @@ -1948,9 +1915,9 @@ static int bnxt_re_build_qp1_send_v2(struct bnxt_re_qp *qp, break; } memcpy(&dgid.raw, &qplib_ah->dgid, 16); - is_udp = sgid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP; + is_udp = sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP; if (is_udp) { - if (ipv6_addr_v4mapped((struct in6_addr *)&sgid)) { + if (ipv6_addr_v4mapped((struct in6_addr *)&sgid_attr->gid)) { ip_version = 4; ether_type = ETH_P_IP; } else { @@ -1983,9 +1950,10 @@ static int bnxt_re_build_qp1_send_v2(struct bnxt_re_qp *qp, } if (is_grh || (ip_version == 6)) { - memcpy(qp->qp1_hdr.grh.source_gid.raw, sgid.raw, sizeof(sgid)); + memcpy(qp->qp1_hdr.grh.source_gid.raw, sgid_attr->gid.raw, + sizeof(sgid_attr->gid)); memcpy(qp->qp1_hdr.grh.destination_gid.raw, qplib_ah->dgid.data, - sizeof(sgid)); + sizeof(sgid_attr->gid)); qp->qp1_hdr.grh.hop_limit = qplib_ah->hop_limit; } @@ -1995,7 +1963,7 @@ static int bnxt_re_build_qp1_send_v2(struct bnxt_re_qp *qp, qp->qp1_hdr.ip4.frag_off = htons(IP_DF); qp->qp1_hdr.ip4.ttl = qplib_ah->hop_limit; - memcpy(&qp->qp1_hdr.ip4.saddr, sgid.raw + 12, 4); + memcpy(&qp->qp1_hdr.ip4.saddr, sgid_attr->gid.raw + 12, 4); memcpy(&qp->qp1_hdr.ip4.daddr, qplib_ah->dgid.data + 12, 4); qp->qp1_hdr.ip4.check = ib_ud_ip4_csum(&qp->qp1_hdr); } @@ -2080,7 +2048,7 @@ static int bnxt_re_build_qp1_send_v2(struct bnxt_re_qp *qp, * and 
the MAD datagram out to the provided SGE. */ static int bnxt_re_build_qp1_shadow_qp_recv(struct bnxt_re_qp *qp, - struct ib_recv_wr *wr, + const struct ib_recv_wr *wr, struct bnxt_qplib_swqe *wqe, int payload_size) { @@ -2125,7 +2093,7 @@ static int is_ud_qp(struct bnxt_re_qp *qp) } static int bnxt_re_build_send_wqe(struct bnxt_re_qp *qp, - struct ib_send_wr *wr, + const struct ib_send_wr *wr, struct bnxt_qplib_swqe *wqe) { struct bnxt_re_ah *ah = NULL; @@ -2163,7 +2131,7 @@ static int bnxt_re_build_send_wqe(struct bnxt_re_qp *qp, return 0; } -static int bnxt_re_build_rdma_wqe(struct ib_send_wr *wr, +static int bnxt_re_build_rdma_wqe(const struct ib_send_wr *wr, struct bnxt_qplib_swqe *wqe) { switch (wr->opcode) { @@ -2195,7 +2163,7 @@ static int bnxt_re_build_rdma_wqe(struct ib_send_wr *wr, return 0; } -static int bnxt_re_build_atomic_wqe(struct ib_send_wr *wr, +static int bnxt_re_build_atomic_wqe(const struct ib_send_wr *wr, struct bnxt_qplib_swqe *wqe) { switch (wr->opcode) { @@ -2222,7 +2190,7 @@ static int bnxt_re_build_atomic_wqe(struct ib_send_wr *wr, return 0; } -static int bnxt_re_build_inv_wqe(struct ib_send_wr *wr, +static int bnxt_re_build_inv_wqe(const struct ib_send_wr *wr, struct bnxt_qplib_swqe *wqe) { wqe->type = BNXT_QPLIB_SWQE_TYPE_LOCAL_INV; @@ -2241,7 +2209,7 @@ static int bnxt_re_build_inv_wqe(struct ib_send_wr *wr, return 0; } -static int bnxt_re_build_reg_wqe(struct ib_reg_wr *wr, +static int bnxt_re_build_reg_wqe(const struct ib_reg_wr *wr, struct bnxt_qplib_swqe *wqe) { struct bnxt_re_mr *mr = container_of(wr->mr, struct bnxt_re_mr, ib_mr); @@ -2283,7 +2251,7 @@ static int bnxt_re_build_reg_wqe(struct ib_reg_wr *wr, } static int bnxt_re_copy_inline_data(struct bnxt_re_dev *rdev, - struct ib_send_wr *wr, + const struct ib_send_wr *wr, struct bnxt_qplib_swqe *wqe) { /* Copy the inline data to the data field */ @@ -2313,7 +2281,7 @@ static int bnxt_re_copy_inline_data(struct bnxt_re_dev *rdev, } static int bnxt_re_copy_wr_payload(struct bnxt_re_dev *rdev, - struct ib_send_wr *wr, + const struct ib_send_wr *wr, struct bnxt_qplib_swqe *wqe) { int payload_sz = 0; @@ -2345,7 +2313,7 @@ static void bnxt_ud_qp_hw_stall_workaround(struct bnxt_re_qp *qp) static int bnxt_re_post_send_shadow_qp(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp, - struct ib_send_wr *wr) + const struct ib_send_wr *wr) { struct bnxt_qplib_swqe wqe; int rc = 0, payload_sz = 0; @@ -2393,8 +2361,8 @@ bad: return rc; } -int bnxt_re_post_send(struct ib_qp *ib_qp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int bnxt_re_post_send(struct ib_qp *ib_qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); struct bnxt_qplib_swqe wqe; @@ -2441,7 +2409,7 @@ int bnxt_re_post_send(struct ib_qp *ib_qp, struct ib_send_wr *wr, default: break; } - /* Fall thru to build the wqe */ + /* fall through */ case IB_WR_SEND_WITH_INV: rc = bnxt_re_build_send_wqe(qp, wr, &wqe); break; @@ -2493,7 +2461,7 @@ bad: static int bnxt_re_post_recv_shadow_qp(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp, - struct ib_recv_wr *wr) + const struct ib_recv_wr *wr) { struct bnxt_qplib_swqe wqe; int rc = 0; @@ -2526,8 +2494,8 @@ static int bnxt_re_post_recv_shadow_qp(struct bnxt_re_dev *rdev, return rc; } -int bnxt_re_post_recv(struct ib_qp *ib_qp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct bnxt_re_qp *qp = 
container_of(ib_qp, struct bnxt_re_qp, ib_qp); struct bnxt_qplib_swqe wqe; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 5c6414cad4af..aa33e7b82c84 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -158,8 +158,7 @@ void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str); int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index, u16 *pkey); int bnxt_re_del_gid(const struct ib_gid_attr *attr, void **context); -int bnxt_re_add_gid(const union ib_gid *gid, - const struct ib_gid_attr *attr, void **context); +int bnxt_re_add_gid(const struct ib_gid_attr *attr, void **context); int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num, int index, union ib_gid *gid); enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev, @@ -182,8 +181,8 @@ int bnxt_re_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, struct ib_udata *udata); int bnxt_re_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int bnxt_re_destroy_srq(struct ib_srq *srq); -int bnxt_re_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *recv_wr, - struct ib_recv_wr **bad_recv_wr); +int bnxt_re_post_srq_recv(struct ib_srq *srq, const struct ib_recv_wr *recv_wr, + const struct ib_recv_wr **bad_recv_wr); struct ib_qp *bnxt_re_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata); @@ -192,10 +191,10 @@ int bnxt_re_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int bnxt_re_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int bnxt_re_destroy_qp(struct ib_qp *qp); -int bnxt_re_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr, - struct ib_send_wr **bad_send_wr); -int bnxt_re_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr, - struct ib_recv_wr **bad_recv_wr); +int bnxt_re_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr, + const struct ib_send_wr **bad_send_wr); +int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, + const struct ib_recv_wr **bad_recv_wr); struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 50d8f1fc98d5..e426b990c1dd 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -2354,7 +2354,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, srq = qp->srq; if (!srq) return -EINVAL; - if (wr_id_idx > srq->hwq.max_elements) { + if (wr_id_idx >= srq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process RC "); dev_err(&cq->hwq.pdev->dev, @@ -2369,7 +2369,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, *pcqe = cqe; } else { rq = &qp->rq; - if (wr_id_idx > rq->hwq.max_elements) { + if (wr_id_idx >= rq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process RC "); dev_err(&cq->hwq.pdev->dev, @@ -2437,7 +2437,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, if (!srq) return -EINVAL; - if (wr_id_idx > srq->hwq.max_elements) { + if (wr_id_idx >= srq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process UD "); dev_err(&cq->hwq.pdev->dev, @@ -2452,7 +2452,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, *pcqe = cqe; } else { rq = &qp->rq; - if (wr_id_idx > rq->hwq.max_elements) { 
+ if (wr_id_idx >= rq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process UD "); dev_err(&cq->hwq.pdev->dev, @@ -2546,7 +2546,7 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, "QPLIB: FP: SRQ used but not defined??"); return -EINVAL; } - if (wr_id_idx > srq->hwq.max_elements) { + if (wr_id_idx >= srq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process Raw/QP1 "); dev_err(&cq->hwq.pdev->dev, @@ -2561,7 +2561,7 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, *pcqe = cqe; } else { rq = &qp->rq; - if (wr_id_idx > rq->hwq.max_elements) { + if (wr_id_idx >= rq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process Raw/QP1 RQ wr_id "); dev_err(&cq->hwq.pdev->dev, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 2f3f32eaa1d5..4097f3fa25c5 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -197,7 +197,7 @@ int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res, struct bnxt_qplib_sgid_tbl *sgid_tbl, int index, struct bnxt_qplib_gid *gid) { - if (index > sgid_tbl->max) { + if (index >= sgid_tbl->max) { dev_err(&res->pdev->dev, "QPLIB: Index %d exceeded SGID table max (%d)", index, sgid_tbl->max); @@ -402,7 +402,7 @@ int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res, *pkey = 0xFFFF; return 0; } - if (index > pkey_tbl->max) { + if (index >= pkey_tbl->max) { dev_err(&res->pdev->dev, "QPLIB: Index %d exceeded PKEY table max (%d)", index, pkey_tbl->max); diff --git a/drivers/infiniband/hw/cxgb3/iwch_cq.c b/drivers/infiniband/hw/cxgb3/iwch_cq.c index 0a8542c20804..a098c0140580 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cq.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cq.c @@ -32,38 +32,16 @@ #include "iwch_provider.h" #include "iwch.h" -/* - * Get one cq entry from cxio and map it to openib. - * - * Returns: - * 0 EMPTY; - * 1 cqe returned - * -EAGAIN caller must try again - * any other -errno fatal error - */ -static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp, - struct ib_wc *wc) +static int __iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp, + struct iwch_qp *qhp, struct ib_wc *wc) { - struct iwch_qp *qhp = NULL; - struct t3_cqe cqe, *rd_cqe; - struct t3_wq *wq; + struct t3_wq *wq = qhp ? &qhp->wq : NULL; + struct t3_cqe cqe; u32 credit = 0; u8 cqe_flushed; u64 cookie; int ret = 1; - rd_cqe = cxio_next_cqe(&chp->cq); - - if (!rd_cqe) - return 0; - - qhp = get_qhp(rhp, CQE_QPID(*rd_cqe)); - if (!qhp) - wq = NULL; - else { - spin_lock(&qhp->lock); - wq = &(qhp->wq); - } ret = cxio_poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit); if (t3a_device(chp->rhp) && credit) { @@ -79,7 +57,7 @@ static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp, ret = 1; wc->wr_id = cookie; - wc->qp = &qhp->ibqp; + wc->qp = qhp ? &qhp->ibqp : NULL; wc->vendor_err = CQE_STATUS(cqe); wc->wc_flags = 0; @@ -182,8 +160,38 @@ static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp, } } out: - if (wq) + return ret; +} + +/* + * Get one cq entry from cxio and map it to openib. 
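Several of the bnxt_re hunks above tighten index validation from '>' to '>='. A one-line illustration of why (hypothetical helper, not driver code): for a zero-indexed table with max slots, the valid indices are 0 through max - 1, so index == max must already be rejected.

/* Illustrative only: the reject test has to be ">=", i.e. accept "<". */
static bool index_in_range(u32 index, u32 max)
{
        return index < max;
}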
+ * + * Returns: + * 0 EMPTY; + * 1 cqe returned + * -EAGAIN caller must try again + * any other -errno fatal error + */ +static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp, + struct ib_wc *wc) +{ + struct iwch_qp *qhp; + struct t3_cqe *rd_cqe; + int ret; + + rd_cqe = cxio_next_cqe(&chp->cq); + + if (!rd_cqe) + return 0; + + qhp = get_qhp(rhp, CQE_QPID(*rd_cqe)); + if (qhp) { + spin_lock(&qhp->lock); + ret = __iwch_poll_cq_one(rhp, chp, qhp, wc); spin_unlock(&qhp->lock); + } else { + ret = __iwch_poll_cq_one(rhp, chp, NULL, wc); + } return ret; } diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index be097c6723c0..1b9ff21aa1d5 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -61,42 +61,6 @@ #include <rdma/cxgb3-abi.h> #include "common.h" -static struct ib_ah *iwch_ah_create(struct ib_pd *pd, - struct rdma_ah_attr *ah_attr, - struct ib_udata *udata) -{ - return ERR_PTR(-ENOSYS); -} - -static int iwch_ah_destroy(struct ib_ah *ah) -{ - return -ENOSYS; -} - -static int iwch_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) -{ - return -ENOSYS; -} - -static int iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) -{ - return -ENOSYS; -} - -static int iwch_process_mad(struct ib_device *ibdev, - int mad_flags, - u8 port_num, - const struct ib_wc *in_wc, - const struct ib_grh *in_grh, - const struct ib_mad_hdr *in_mad, - size_t in_mad_size, - struct ib_mad_hdr *out_mad, - size_t *out_mad_size, - u16 *out_mad_pkey_index) -{ - return -ENOSYS; -} - static int iwch_dealloc_ucontext(struct ib_ucontext *context) { struct iwch_dev *rhp = to_iwch_dev(context->device); @@ -1103,7 +1067,8 @@ static int iwch_query_device(struct ib_device *ibdev, struct ib_device_attr *pro props->max_mr_size = dev->attr.max_mr_size; props->max_qp = dev->attr.max_qps; props->max_qp_wr = dev->attr.max_wrs; - props->max_sge = dev->attr.max_sge_per_wr; + props->max_send_sge = dev->attr.max_sge_per_wr; + props->max_recv_sge = dev->attr.max_sge_per_wr; props->max_sge_rd = 1; props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp; props->max_qp_init_rd_atom = dev->attr.max_rdma_reads_per_qp; @@ -1398,8 +1363,6 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.mmap = iwch_mmap; dev->ibdev.alloc_pd = iwch_allocate_pd; dev->ibdev.dealloc_pd = iwch_deallocate_pd; - dev->ibdev.create_ah = iwch_ah_create; - dev->ibdev.destroy_ah = iwch_ah_destroy; dev->ibdev.create_qp = iwch_create_qp; dev->ibdev.modify_qp = iwch_ib_modify_qp; dev->ibdev.destroy_qp = iwch_destroy_qp; @@ -1414,9 +1377,6 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.dealloc_mw = iwch_dealloc_mw; dev->ibdev.alloc_mr = iwch_alloc_mr; dev->ibdev.map_mr_sg = iwch_map_mr_sg; - dev->ibdev.attach_mcast = iwch_multicast_attach; - dev->ibdev.detach_mcast = iwch_multicast_detach; - dev->ibdev.process_mad = iwch_process_mad; dev->ibdev.req_notify_cq = iwch_arm_cq; dev->ibdev.post_send = iwch_post_send; dev->ibdev.post_recv = iwch_post_receive; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h index 2e38ddefea8a..8adbe9658935 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.h +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -326,10 +326,10 @@ enum iwch_qp_query_flags { }; u16 iwch_rqes_posted(struct iwch_qp *qhp); -int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int 
iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int iwch_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int iwch_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); int iwch_post_zb_read(struct iwch_ep *ep); diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index 3871e1fd8395..c649faad63f9 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -39,8 +39,8 @@ #define NO_SUPPORT -1 -static int build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr, - u8 * flit_cnt) +static int build_rdma_send(union t3_wr *wqe, const struct ib_send_wr *wr, + u8 *flit_cnt) { int i; u32 plen; @@ -84,8 +84,8 @@ static int build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr, return 0; } -static int build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, - u8 *flit_cnt) +static int build_rdma_write(union t3_wr *wqe, const struct ib_send_wr *wr, + u8 *flit_cnt) { int i; u32 plen; @@ -125,8 +125,8 @@ static int build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr, return 0; } -static int build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr, - u8 *flit_cnt) +static int build_rdma_read(union t3_wr *wqe, const struct ib_send_wr *wr, + u8 *flit_cnt) { if (wr->num_sge > 1) return -EINVAL; @@ -146,8 +146,8 @@ static int build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr, return 0; } -static int build_memreg(union t3_wr *wqe, struct ib_reg_wr *wr, - u8 *flit_cnt, int *wr_cnt, struct t3_wq *wq) +static int build_memreg(union t3_wr *wqe, const struct ib_reg_wr *wr, + u8 *flit_cnt, int *wr_cnt, struct t3_wq *wq) { struct iwch_mr *mhp = to_iwch_mr(wr->mr); int i; @@ -189,8 +189,8 @@ static int build_memreg(union t3_wr *wqe, struct ib_reg_wr *wr, return 0; } -static int build_inv_stag(union t3_wr *wqe, struct ib_send_wr *wr, - u8 *flit_cnt) +static int build_inv_stag(union t3_wr *wqe, const struct ib_send_wr *wr, + u8 *flit_cnt) { wqe->local_inv.stag = cpu_to_be32(wr->ex.invalidate_rkey); wqe->local_inv.reserved = 0; @@ -246,7 +246,7 @@ static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list, } static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe, - struct ib_recv_wr *wr) + const struct ib_recv_wr *wr) { int i, err = 0; u32 pbl_addr[T3_MAX_SGE]; @@ -286,7 +286,7 @@ static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe, } static int build_zero_stag_recv(struct iwch_qp *qhp, union t3_wr *wqe, - struct ib_recv_wr *wr) + const struct ib_recv_wr *wr) { int i; u32 pbl_addr; @@ -348,8 +348,8 @@ static int build_zero_stag_recv(struct iwch_qp *qhp, union t3_wr *wqe, return 0; } -int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int iwch_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { int err = 0; u8 uninitialized_var(t3_wr_flit_cnt); @@ -463,8 +463,8 @@ out: return err; } -int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int iwch_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { int err = 0; struct iwch_qp *qhp; diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 0912fa026327..0f83cbec33f3 
100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -587,24 +587,29 @@ static int send_flowc(struct c4iw_ep *ep) { struct fw_flowc_wr *flowc; struct sk_buff *skb = skb_dequeue(&ep->com.ep_skb_list); - int i; u16 vlan = ep->l2t->vlan; int nparams; + int flowclen, flowclen16; if (WARN_ON(!skb)) return -ENOMEM; if (vlan == CPL_L2T_VLAN_NONE) - nparams = 8; - else nparams = 9; + else + nparams = 10; - flowc = __skb_put(skb, FLOWC_LEN); + flowclen = offsetof(struct fw_flowc_wr, mnemval[nparams]); + flowclen16 = DIV_ROUND_UP(flowclen, 16); + flowclen = flowclen16 * 16; + + flowc = __skb_put(skb, flowclen); + memset(flowc, 0, flowclen); flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) | FW_FLOWC_WR_NPARAMS_V(nparams)); - flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(FLOWC_LEN, - 16)) | FW_WR_FLOWID_V(ep->hwtid)); + flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(flowclen16) | + FW_WR_FLOWID_V(ep->hwtid)); flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN; flowc->mnemval[0].val = cpu_to_be32(FW_PFVF_CMD_PFN_V @@ -623,21 +628,13 @@ static int send_flowc(struct c4iw_ep *ep) flowc->mnemval[6].val = cpu_to_be32(ep->snd_win); flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS; flowc->mnemval[7].val = cpu_to_be32(ep->emss); - if (nparams == 9) { + flowc->mnemval[8].mnemonic = FW_FLOWC_MNEM_RCV_SCALE; + flowc->mnemval[8].val = cpu_to_be32(ep->snd_wscale); + if (nparams == 10) { u16 pri; - pri = (vlan & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; - flowc->mnemval[8].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; - flowc->mnemval[8].val = cpu_to_be32(pri); - } else { - /* Pad WR to 16 byte boundary */ - flowc->mnemval[8].mnemonic = 0; - flowc->mnemval[8].val = 0; - } - for (i = 0; i < 9; i++) { - flowc->mnemval[i].r4[0] = 0; - flowc->mnemval[i].r4[1] = 0; - flowc->mnemval[i].r4[2] = 0; + flowc->mnemval[9].mnemonic = FW_FLOWC_MNEM_SCHEDCLASS; + flowc->mnemval[9].val = cpu_to_be32(pri); } set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); @@ -1176,6 +1173,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) { struct c4iw_ep *ep; struct cpl_act_establish *req = cplhdr(skb); + unsigned short tcp_opt = ntohs(req->tcp_opt); unsigned int tid = GET_TID(req); unsigned int atid = TID_TID_G(ntohl(req->tos_atid)); struct tid_info *t = dev->rdev.lldi.tids; @@ -1196,8 +1194,9 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) ep->snd_seq = be32_to_cpu(req->snd_isn); ep->rcv_seq = be32_to_cpu(req->rcv_isn); + ep->snd_wscale = TCPOPT_SND_WSCALE_G(tcp_opt); - set_emss(ep, ntohs(req->tcp_opt)); + set_emss(ep, tcp_opt); /* dealloc the atid */ remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid); @@ -1853,10 +1852,33 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } +static void complete_cached_srq_buffers(struct c4iw_ep *ep, + __be32 srqidx_status) +{ + enum chip_type adapter_type; + u32 srqidx; + + adapter_type = ep->com.dev->rdev.lldi.adapter_type; + srqidx = ABORT_RSS_SRQIDX_G(be32_to_cpu(srqidx_status)); + + /* + * If this TCB had a srq buffer cached, then we must complete + * it. For user mode, that means saving the srqidx in the + * user/kernel status page for this qp. For kernel mode, just + * synthesize the CQE now. 
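The send_flowc() rework earlier in this hunk sizes the work request from the mnemonic count instead of a fixed FLOWC_LEN. A small worked sketch of that arithmetic, assuming the usual fw_flowc_wr layout of an 8-byte header followed by 8-byte mnemval entries (layout is an assumption here, taken for illustration):

#include <linux/kernel.h>

/* nparams = 9  -> 8 + 9 * 8  = 80 bytes -> already a multiple of 16 (80)
 * nparams = 10 -> 8 + 10 * 8 = 88 bytes -> rounds up to 96 (6 * 16)      */
static int example_flowc_len(int nparams)
{
        int flowclen = 8 + nparams * 8;          /* header + mnemval array */
        int flowclen16 = DIV_ROUND_UP(flowclen, 16);

        return flowclen16 * 16;                  /* WR length in 16B units */
}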
+ */ + if (CHELSIO_CHIP_VERSION(adapter_type) > CHELSIO_T5 && srqidx) { + if (ep->com.qp->ibqp.uobject) + t4_set_wq_in_error(&ep->com.qp->wq, srqidx); + else + c4iw_flush_srqidx(ep->com.qp, srqidx); + } +} + static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) { struct c4iw_ep *ep; - struct cpl_abort_rpl_rss *rpl = cplhdr(skb); + struct cpl_abort_rpl_rss6 *rpl = cplhdr(skb); int release = 0; unsigned int tid = GET_TID(rpl); @@ -1865,6 +1887,9 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) pr_warn("Abort rpl to freed endpoint\n"); return 0; } + + complete_cached_srq_buffers(ep, rpl->srqidx_status); + pr_debug("ep %p tid %u\n", ep, ep->hwtid); mutex_lock(&ep->com.mutex); switch (ep->com.state) { @@ -2603,16 +2628,17 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) struct cpl_pass_establish *req = cplhdr(skb); unsigned int tid = GET_TID(req); int ret; + u16 tcp_opt = ntohs(req->tcp_opt); ep = get_ep_from_tid(dev, tid); pr_debug("ep %p tid %u\n", ep, ep->hwtid); ep->snd_seq = be32_to_cpu(req->snd_isn); ep->rcv_seq = be32_to_cpu(req->rcv_isn); + ep->snd_wscale = TCPOPT_SND_WSCALE_G(tcp_opt); - pr_debug("ep %p hwtid %u tcp_opt 0x%02x\n", ep, tid, - ntohs(req->tcp_opt)); + pr_debug("ep %p hwtid %u tcp_opt 0x%02x\n", ep, tid, tcp_opt); - set_emss(ep, ntohs(req->tcp_opt)); + set_emss(ep, tcp_opt); dst_confirm(ep->dst); mutex_lock(&ep->com.mutex); @@ -2719,28 +2745,35 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) { - struct cpl_abort_req_rss *req = cplhdr(skb); + struct cpl_abort_req_rss6 *req = cplhdr(skb); struct c4iw_ep *ep; struct sk_buff *rpl_skb; struct c4iw_qp_attributes attrs; int ret; int release = 0; unsigned int tid = GET_TID(req); + u8 status; + u32 len = roundup(sizeof(struct cpl_abort_rpl), 16); ep = get_ep_from_tid(dev, tid); if (!ep) return 0; - if (cxgb_is_neg_adv(req->status)) { + status = ABORT_RSS_STATUS_G(be32_to_cpu(req->srqidx_status)); + + if (cxgb_is_neg_adv(status)) { pr_debug("Negative advice on abort- tid %u status %d (%s)\n", - ep->hwtid, req->status, neg_adv_str(req->status)); + ep->hwtid, status, neg_adv_str(status)); ep->stats.abort_neg_adv++; mutex_lock(&dev->rdev.stats.lock); dev->rdev.stats.neg_adv++; mutex_unlock(&dev->rdev.stats.lock); goto deref_ep; } + + complete_cached_srq_buffers(ep, req->srqidx_status); + pr_debug("ep %p tid %u state %u\n", ep, ep->hwtid, ep->com.state); set_bit(PEER_ABORT, &ep->com.history); @@ -3444,9 +3477,6 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) } insert_handle(dev, &dev->stid_idr, ep, ep->stid); - memcpy(&ep->com.local_addr, &cm_id->m_local_addr, - sizeof(ep->com.local_addr)); - state_set(&ep->com, LISTEN); if (ep->com.local_addr.ss_family == AF_INET) err = create_server4(dev, ep); diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 2be2e1ac1b5f..6d3042794094 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -77,6 +77,10 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, int user = (uctx != &rdev->uctx); int ret; struct sk_buff *skb; + struct c4iw_ucontext *ucontext = NULL; + + if (user) + ucontext = container_of(uctx, struct c4iw_ucontext, uctx); cq->cqid = c4iw_get_cqid(rdev, uctx); if (!cq->cqid) { @@ -100,6 +104,16 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, dma_unmap_addr_set(cq, mapping, cq->dma_addr); memset(cq->queue, 0, cq->memsize); + if (user && 
ucontext->is_32b_cqe) { + cq->qp_errp = &((struct t4_status_page *) + ((u8 *)cq->queue + (cq->size - 1) * + (sizeof(*cq->queue) / 2)))->qp_err; + } else { + cq->qp_errp = &((struct t4_status_page *) + ((u8 *)cq->queue + (cq->size - 1) * + sizeof(*cq->queue)))->qp_err; + } + /* build fw_ri_res_wr */ wr_len = sizeof *res_wr + sizeof *res; @@ -132,7 +146,9 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, FW_RI_RES_WR_IQPCIECH_V(2) | FW_RI_RES_WR_IQINTCNTTHRESH_V(0) | FW_RI_RES_WR_IQO_F | - FW_RI_RES_WR_IQESIZE_V(1)); + ((user && ucontext->is_32b_cqe) ? + FW_RI_RES_WR_IQESIZE_V(1) : + FW_RI_RES_WR_IQESIZE_V(2))); res->u.cq.iqsize = cpu_to_be16(cq->size); res->u.cq.iqaddr = cpu_to_be64(cq->dma_addr); @@ -166,7 +182,7 @@ err1: return ret; } -static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq) +static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq, u32 srqidx) { struct t4_cqe cqe; @@ -179,6 +195,8 @@ static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq) CQE_SWCQE_V(1) | CQE_QPID_V(wq->sq.qid)); cqe.bits_type_ts = cpu_to_be64(CQE_GENBIT_V((u64)cq->gen)); + if (srqidx) + cqe.u.srcqe.abs_rqe_idx = cpu_to_be32(srqidx); cq->sw_queue[cq->sw_pidx] = cqe; t4_swcq_produce(cq); } @@ -191,7 +209,7 @@ int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count) pr_debug("wq %p cq %p rq.in_use %u skip count %u\n", wq, cq, wq->rq.in_use, count); while (in_use--) { - insert_recv_cqe(wq, cq); + insert_recv_cqe(wq, cq, 0); flushed++; } return flushed; @@ -442,6 +460,72 @@ void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count) pr_debug("cq %p count %d\n", cq, *count); } +static void post_pending_srq_wrs(struct t4_srq *srq) +{ + struct t4_srq_pending_wr *pwr; + u16 idx = 0; + + while (srq->pending_in_use) { + pwr = &srq->pending_wrs[srq->pending_cidx]; + srq->sw_rq[srq->pidx].wr_id = pwr->wr_id; + srq->sw_rq[srq->pidx].valid = 1; + + pr_debug("%s posting pending cidx %u pidx %u wq_pidx %u in_use %u rq_size %u wr_id %llx\n", + __func__, + srq->cidx, srq->pidx, srq->wq_pidx, + srq->in_use, srq->size, + (unsigned long long)pwr->wr_id); + + c4iw_copy_wr_to_srq(srq, &pwr->wqe, pwr->len16); + t4_srq_consume_pending_wr(srq); + t4_srq_produce(srq, pwr->len16); + idx += DIV_ROUND_UP(pwr->len16 * 16, T4_EQ_ENTRY_SIZE); + } + + if (idx) { + t4_ring_srq_db(srq, idx, pwr->len16, &pwr->wqe); + srq->queue[srq->size].status.host_wq_pidx = + srq->wq_pidx; + } +} + +static u64 reap_srq_cqe(struct t4_cqe *hw_cqe, struct t4_srq *srq) +{ + int rel_idx = CQE_ABS_RQE_IDX(hw_cqe) - srq->rqt_abs_idx; + u64 wr_id; + + srq->sw_rq[rel_idx].valid = 0; + wr_id = srq->sw_rq[rel_idx].wr_id; + + if (rel_idx == srq->cidx) { + pr_debug("%s in order cqe rel_idx %u cidx %u pidx %u wq_pidx %u in_use %u rq_size %u wr_id %llx\n", + __func__, rel_idx, srq->cidx, srq->pidx, + srq->wq_pidx, srq->in_use, srq->size, + (unsigned long long)srq->sw_rq[rel_idx].wr_id); + t4_srq_consume(srq); + while (srq->ooo_count && !srq->sw_rq[srq->cidx].valid) { + pr_debug("%s eat ooo cidx %u pidx %u wq_pidx %u in_use %u rq_size %u ooo_count %u wr_id %llx\n", + __func__, srq->cidx, srq->pidx, + srq->wq_pidx, srq->in_use, + srq->size, srq->ooo_count, + (unsigned long long) + srq->sw_rq[srq->cidx].wr_id); + t4_srq_consume_ooo(srq); + } + if (srq->ooo_count == 0 && srq->pending_in_use) + post_pending_srq_wrs(srq); + } else { + pr_debug("%s ooo cqe rel_idx %u cidx %u pidx %u wq_pidx %u in_use %u rq_size %u ooo_count %u wr_id %llx\n", + __func__, rel_idx, srq->cidx, + srq->pidx, srq->wq_pidx, + srq->in_use, 
srq->size, + srq->ooo_count, + (unsigned long long)srq->sw_rq[rel_idx].wr_id); + t4_srq_produce_ooo(srq); + } + return wr_id; +} + /* * poll_cq * @@ -459,7 +543,8 @@ void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count) * -EOVERFLOW CQ overflow detected. */ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, - u8 *cqe_flushed, u64 *cookie, u32 *credit) + u8 *cqe_flushed, u64 *cookie, u32 *credit, + struct t4_srq *srq) { int ret = 0; struct t4_cqe *hw_cqe, read_cqe; @@ -524,7 +609,7 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, */ if (CQE_TYPE(hw_cqe) == 1) { if (CQE_STATUS(hw_cqe)) - t4_set_wq_in_error(wq); + t4_set_wq_in_error(wq, 0); ret = -EAGAIN; goto skip_cqe; } @@ -535,7 +620,7 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, */ if (CQE_WRID_STAG(hw_cqe) == 1) { if (CQE_STATUS(hw_cqe)) - t4_set_wq_in_error(wq); + t4_set_wq_in_error(wq, 0); ret = -EAGAIN; goto skip_cqe; } @@ -560,7 +645,7 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, if (CQE_STATUS(hw_cqe) || t4_wq_in_error(wq)) { *cqe_flushed = (CQE_STATUS(hw_cqe) == T4_ERR_SWFLUSH); - t4_set_wq_in_error(wq); + t4_set_wq_in_error(wq, 0); } /* @@ -574,15 +659,9 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe, * then we complete this with T4_ERR_MSN and mark the wq in * error. */ - - if (t4_rq_empty(wq)) { - t4_set_wq_in_error(wq); - ret = -EAGAIN; - goto skip_cqe; - } if (unlikely(!CQE_STATUS(hw_cqe) && CQE_WRID_MSN(hw_cqe) != wq->rq.msn)) { - t4_set_wq_in_error(wq); + t4_set_wq_in_error(wq, 0); hw_cqe->header |= cpu_to_be32(CQE_STATUS_V(T4_ERR_MSN)); } goto proc_cqe; @@ -641,11 +720,16 @@ proc_cqe: c4iw_log_wr_stats(wq, hw_cqe); t4_sq_consume(wq); } else { - pr_debug("completing rq idx %u\n", wq->rq.cidx); - *cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id; - if (c4iw_wr_log) - c4iw_log_wr_stats(wq, hw_cqe); - t4_rq_consume(wq); + if (!srq) { + pr_debug("completing rq idx %u\n", wq->rq.cidx); + *cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id; + if (c4iw_wr_log) + c4iw_log_wr_stats(wq, hw_cqe); + t4_rq_consume(wq); + } else { + *cookie = reap_srq_cqe(hw_cqe, srq); + } + wq->rq.msn++; goto skip_cqe; } @@ -668,46 +752,33 @@ skip_cqe: return ret; } -/* - * Get one cq entry from c4iw and map it to openib. - * - * Returns: - * 0 cqe returned - * -ENODATA EMPTY; - * -EAGAIN caller must try again - * any other -errno fatal error - */ -static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) +static int __c4iw_poll_cq_one(struct c4iw_cq *chp, struct c4iw_qp *qhp, + struct ib_wc *wc, struct c4iw_srq *srq) { - struct c4iw_qp *qhp = NULL; - struct t4_cqe uninitialized_var(cqe), *rd_cqe; - struct t4_wq *wq; + struct t4_cqe uninitialized_var(cqe); + struct t4_wq *wq = qhp ? &qhp->wq : NULL; u32 credit = 0; u8 cqe_flushed; u64 cookie = 0; int ret; - ret = t4_next_cqe(&chp->cq, &rd_cqe); - - if (ret) - return ret; - - qhp = get_qhp(chp->rhp, CQE_QPID(rd_cqe)); - if (!qhp) - wq = NULL; - else { - spin_lock(&qhp->lock); - wq = &(qhp->wq); - } - ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit); + ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit, + srq ? &srq->wq : NULL); if (ret) goto out; wc->wr_id = cookie; - wc->qp = &qhp->ibqp; + wc->qp = qhp ? &qhp->ibqp : NULL; wc->vendor_err = CQE_STATUS(&cqe); wc->wc_flags = 0; + /* + * Simulate a SRQ_LIMIT_REACHED HW notification if required. 
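The check below emulates the SRQ low-watermark event for hardware without native limit support. For context, a hedged sketch of the verbs-level contract a consumer relies on (hypothetical helper names, not from this patch): the ULP arms the limit with ib_modify_srq() and expects IB_EVENT_SRQ_LIMIT_REACHED on the SRQ's event handler once fewer posted receives than the limit remain.

#include <rdma/ib_verbs.h>

static int example_arm_srq_limit(struct ib_srq *srq, u32 limit)
{
        struct ib_srq_attr attr = { .srq_limit = limit };

        /* Arms the low watermark; the event fires once, then must be re-armed. */
        return ib_modify_srq(srq, &attr, IB_SRQ_LIMIT);
}

/* Handler registered via ib_srq_init_attr.event_handler at SRQ creation. */
static void example_srq_event(struct ib_event *event, void *ctx)
{
        if (event->event == IB_EVENT_SRQ_LIMIT_REACHED)
                pr_info("SRQ low on receive buffers; repost now\n");
}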
+ */ + if (srq && !(srq->flags & T4_SRQ_LIMIT_SUPPORT) && srq->armed && + srq->wq.in_use < srq->srq_limit) + c4iw_dispatch_srq_limit_reached_event(srq); + pr_debug("qpid 0x%x type %d opcode %d status 0x%x len %u wrid hi 0x%x lo 0x%x cookie 0x%llx\n", CQE_QPID(&cqe), CQE_TYPE(&cqe), CQE_OPCODE(&cqe), @@ -720,15 +791,32 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) wc->byte_len = CQE_LEN(&cqe); else wc->byte_len = 0; - wc->opcode = IB_WC_RECV; - if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV || - CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) { + + switch (CQE_OPCODE(&cqe)) { + case FW_RI_SEND: + wc->opcode = IB_WC_RECV; + break; + case FW_RI_SEND_WITH_INV: + case FW_RI_SEND_WITH_SE_INV: + wc->opcode = IB_WC_RECV; wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe); wc->wc_flags |= IB_WC_WITH_INVALIDATE; c4iw_invalidate_mr(qhp->rhp, wc->ex.invalidate_rkey); + break; + case FW_RI_WRITE_IMMEDIATE: + wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; + wc->ex.imm_data = CQE_IMM_DATA(&cqe); + wc->wc_flags |= IB_WC_WITH_IMM; + break; + default: + pr_err("Unexpected opcode %d in the CQE received for QPID=0x%0x\n", + CQE_OPCODE(&cqe), CQE_QPID(&cqe)); + ret = -EINVAL; + goto out; } } else { switch (CQE_OPCODE(&cqe)) { + case FW_RI_WRITE_IMMEDIATE: case FW_RI_RDMA_WRITE: wc->opcode = IB_WC_RDMA_WRITE; break; @@ -819,8 +907,43 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) } } out: - if (wq) + return ret; +} + +/* + * Get one cq entry from c4iw and map it to openib. + * + * Returns: + * 0 cqe returned + * -ENODATA EMPTY; + * -EAGAIN caller must try again + * any other -errno fatal error + */ +static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) +{ + struct c4iw_srq *srq = NULL; + struct c4iw_qp *qhp = NULL; + struct t4_cqe *rd_cqe; + int ret; + + ret = t4_next_cqe(&chp->cq, &rd_cqe); + + if (ret) + return ret; + + qhp = get_qhp(chp->rhp, CQE_QPID(rd_cqe)); + if (qhp) { + spin_lock(&qhp->lock); + srq = qhp->srq; + if (srq) + spin_lock(&srq->lock); + ret = __c4iw_poll_cq_one(chp, qhp, wc, srq); spin_unlock(&qhp->lock); + if (srq) + spin_unlock(&srq->lock); + } else { + ret = __c4iw_poll_cq_one(chp, NULL, wc, NULL); + } return ret; } @@ -876,6 +999,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int vector = attr->comp_vector; struct c4iw_dev *rhp; struct c4iw_cq *chp; + struct c4iw_create_cq ucmd; struct c4iw_create_cq_resp uresp; struct c4iw_ucontext *ucontext = NULL; int ret, wr_len; @@ -891,9 +1015,16 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, if (vector >= rhp->rdev.lldi.nciq) return ERR_PTR(-EINVAL); + if (ib_context) { + ucontext = to_c4iw_ucontext(ib_context); + if (udata->inlen < sizeof(ucmd)) + ucontext->is_32b_cqe = 1; + } + chp = kzalloc(sizeof(*chp), GFP_KERNEL); if (!chp) return ERR_PTR(-ENOMEM); + chp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL); if (!chp->wr_waitp) { ret = -ENOMEM; @@ -908,9 +1039,6 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, goto err_free_wr_wait; } - if (ib_context) - ucontext = to_c4iw_ucontext(ib_context); - /* account for the status page. */ entries++; @@ -934,13 +1062,15 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, if (hwentries < 64) hwentries = 64; - memsize = hwentries * sizeof *chp->cq.queue; + memsize = hwentries * ((ucontext && ucontext->is_32b_cqe) ? + (sizeof(*chp->cq.queue) / 2) : sizeof(*chp->cq.queue)); /* * memsize must be a multiple of the page size if its a user cq. 
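The memsize rule stated here, as a small worked example (illustrative only, assuming 64-byte native CQEs, 32-byte compact CQEs, and 4 KiB pages):

#include <linux/kernel.h>
#include <linux/mm.h>

static size_t example_cq_memsize(unsigned int hwentries, bool is_32b_cqe,
                                 bool user_cq)
{
        size_t cqe_sz = is_32b_cqe ? 32 : 64;
        size_t memsize = hwentries * cqe_sz;

        /* e.g. 1040 entries * 32 B = 33280 B -> 36864 B (9 pages) for user CQs */
        if (user_cq)
                memsize = roundup(memsize, PAGE_SIZE);
        return memsize;
}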
*/ if (ucontext) memsize = roundup(memsize, PAGE_SIZE); + chp->cq.size = hwentries; chp->cq.memsize = memsize; chp->cq.vector = vector; @@ -971,6 +1101,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, if (!mm2) goto err_free_mm; + memset(&uresp, 0, sizeof(uresp)); uresp.qid_mask = rhp->rdev.cqmask; uresp.cqid = chp->cq.cqid; uresp.size = chp->cq.size; @@ -980,9 +1111,16 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, ucontext->key += PAGE_SIZE; uresp.gts_key = ucontext->key; ucontext->key += PAGE_SIZE; + /* communicate to the userspace that + * kernel driver supports 64B CQE + */ + uresp.flags |= C4IW_64B_CQE; + spin_unlock(&ucontext->mmap_lock); ret = ib_copy_to_udata(udata, &uresp, - sizeof(uresp) - sizeof(uresp.reserved)); + ucontext->is_32b_cqe ? + sizeof(uresp) - sizeof(uresp.flags) : + sizeof(uresp)); if (ret) goto err_free_mm2; @@ -1019,11 +1157,6 @@ err_free_chp: return ERR_PTR(ret); } -int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata) -{ - return -ENOSYS; -} - int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct c4iw_cq *chp; @@ -1039,3 +1172,19 @@ int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) spin_unlock_irqrestore(&chp->lock, flag); return ret; } + +void c4iw_flush_srqidx(struct c4iw_qp *qhp, u32 srqidx) +{ + struct c4iw_cq *rchp = to_c4iw_cq(qhp->ibqp.recv_cq); + unsigned long flag; + + /* locking heirarchy: cq lock first, then qp lock. */ + spin_lock_irqsave(&rchp->lock, flag); + spin_lock(&qhp->lock); + + /* create a SRQ RECV CQE for srqidx */ + insert_recv_cqe(&qhp->wq, &rchp->cq, srqidx); + + spin_unlock(&qhp->lock); + spin_unlock_irqrestore(&rchp->lock, flag); +} diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index a3c3418afd73..c13c0ba30f63 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -275,10 +275,11 @@ static int dump_qp(int id, void *p, void *data) set_ep_sin_addrs(ep, &lsin, &rsin, &m_lsin, &m_rsin); cc = snprintf(qpd->buf + qpd->pos, space, - "rc qp sq id %u rq id %u state %u " + "rc qp sq id %u %s id %u state %u " "onchip %u ep tid %u state %u " "%pI4:%u/%u->%pI4:%u/%u\n", - qp->wq.sq.qid, qp->wq.rq.qid, + qp->wq.sq.qid, qp->srq ? "srq" : "rq", + qp->srq ? 
qp->srq->idx : qp->wq.rq.qid, (int)qp->attr.state, qp->wq.sq.flags & T4_SQ_ONCHIP, ep->hwtid, (int)ep->com.state, @@ -480,6 +481,9 @@ static int stats_show(struct seq_file *seq, void *v) seq_printf(seq, " QID: %10llu %10llu %10llu %10llu\n", dev->rdev.stats.qid.total, dev->rdev.stats.qid.cur, dev->rdev.stats.qid.max, dev->rdev.stats.qid.fail); + seq_printf(seq, " SRQS: %10llu %10llu %10llu %10llu\n", + dev->rdev.stats.srqt.total, dev->rdev.stats.srqt.cur, + dev->rdev.stats.srqt.max, dev->rdev.stats.srqt.fail); seq_printf(seq, " TPTMEM: %10llu %10llu %10llu %10llu\n", dev->rdev.stats.stag.total, dev->rdev.stats.stag.cur, dev->rdev.stats.stag.max, dev->rdev.stats.stag.fail); @@ -530,6 +534,8 @@ static ssize_t stats_clear(struct file *file, const char __user *buf, dev->rdev.stats.pbl.fail = 0; dev->rdev.stats.rqt.max = 0; dev->rdev.stats.rqt.fail = 0; + dev->rdev.stats.rqt.max = 0; + dev->rdev.stats.rqt.fail = 0; dev->rdev.stats.ocqp.max = 0; dev->rdev.stats.ocqp.fail = 0; dev->rdev.stats.db_full = 0; @@ -802,7 +808,7 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->qpmask = rdev->lldi.udb_density - 1; rdev->cqmask = rdev->lldi.ucq_density - 1; - pr_debug("dev %s stag start 0x%0x size 0x%0x num stags %d pbl start 0x%0x size 0x%0x rq start 0x%0x size 0x%0x qp qid start %u size %u cq qid start %u size %u\n", + pr_debug("dev %s stag start 0x%0x size 0x%0x num stags %d pbl start 0x%0x size 0x%0x rq start 0x%0x size 0x%0x qp qid start %u size %u cq qid start %u size %u srq size %u\n", pci_name(rdev->lldi.pdev), rdev->lldi.vr->stag.start, rdev->lldi.vr->stag.size, c4iw_num_stags(rdev), rdev->lldi.vr->pbl.start, @@ -811,7 +817,8 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->lldi.vr->qp.start, rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.start, - rdev->lldi.vr->cq.size); + rdev->lldi.vr->cq.size, + rdev->lldi.vr->srq.size); pr_debug("udb %pR db_reg %p gts_reg %p qpmask 0x%x cqmask 0x%x\n", &rdev->lldi.pdev->resource[2], rdev->lldi.db_reg, rdev->lldi.gts_reg, @@ -824,10 +831,12 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->stats.stag.total = rdev->lldi.vr->stag.size; rdev->stats.pbl.total = rdev->lldi.vr->pbl.size; rdev->stats.rqt.total = rdev->lldi.vr->rq.size; + rdev->stats.srqt.total = rdev->lldi.vr->srq.size; rdev->stats.ocqp.total = rdev->lldi.vr->ocq.size; rdev->stats.qid.total = rdev->lldi.vr->qp.size; - err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD); + err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), + T4_MAX_NUM_PD, rdev->lldi.vr->srq.size); if (err) { pr_err("error %d initializing resources\n", err); return err; @@ -857,6 +866,7 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->status_page->qp_size = rdev->lldi.vr->qp.size; rdev->status_page->cq_start = rdev->lldi.vr->cq.start; rdev->status_page->cq_size = rdev->lldi.vr->cq.size; + rdev->status_page->write_cmpl_supported = rdev->lldi.write_cmpl_support; if (c4iw_wr_log) { rdev->wr_log = kcalloc(1 << c4iw_wr_log_size_order, diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c index 3e9d8b277ab9..8741d23168f3 100644 --- a/drivers/infiniband/hw/cxgb4/ev.c +++ b/drivers/infiniband/hw/cxgb4/ev.c @@ -70,9 +70,10 @@ static void dump_err_cqe(struct c4iw_dev *dev, struct t4_cqe *err_cqe) CQE_STATUS(err_cqe), CQE_TYPE(err_cqe), ntohl(err_cqe->len), CQE_WRID_HI(err_cqe), CQE_WRID_LOW(err_cqe)); - pr_debug("%016llx %016llx %016llx %016llx\n", + pr_debug("%016llx %016llx %016llx %016llx - %016llx %016llx %016llx %016llx\n", be64_to_cpu(p[0]), be64_to_cpu(p[1]), 
be64_to_cpu(p[2]), - be64_to_cpu(p[3])); + be64_to_cpu(p[3]), be64_to_cpu(p[4]), be64_to_cpu(p[5]), + be64_to_cpu(p[6]), be64_to_cpu(p[7])); /* * Ingress WRITE and READ_RESP errors provide diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 870649ff049c..f0fceadd0d12 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -97,6 +97,7 @@ struct c4iw_resource { struct c4iw_id_table tpt_table; struct c4iw_id_table qid_table; struct c4iw_id_table pdid_table; + struct c4iw_id_table srq_table; }; struct c4iw_qid_list { @@ -130,6 +131,8 @@ struct c4iw_stats { struct c4iw_stat stag; struct c4iw_stat pbl; struct c4iw_stat rqt; + struct c4iw_stat srqt; + struct c4iw_stat srq; struct c4iw_stat ocqp; u64 db_full; u64 db_empty; @@ -549,6 +552,7 @@ struct c4iw_qp { struct kref kref; wait_queue_head_t wait; int sq_sig_all; + struct c4iw_srq *srq; struct work_struct free_work; struct c4iw_ucontext *ucontext; struct c4iw_wr_wait *wr_waitp; @@ -559,6 +563,26 @@ static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp) return container_of(ibqp, struct c4iw_qp, ibqp); } +struct c4iw_srq { + struct ib_srq ibsrq; + struct list_head db_fc_entry; + struct c4iw_dev *rhp; + struct t4_srq wq; + struct sk_buff *destroy_skb; + u32 srq_limit; + u32 pdid; + int idx; + u32 flags; + spinlock_t lock; /* protects srq */ + struct c4iw_wr_wait *wr_waitp; + bool armed; +}; + +static inline struct c4iw_srq *to_c4iw_srq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct c4iw_srq, ibsrq); +} + struct c4iw_ucontext { struct ib_ucontext ibucontext; struct c4iw_dev_ucontext uctx; @@ -566,6 +590,7 @@ struct c4iw_ucontext { spinlock_t mmap_lock; struct list_head mmaps; struct kref kref; + bool is_32b_cqe; }; static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c) @@ -885,7 +910,10 @@ enum conn_pre_alloc_buffers { CN_MAX_CON_BUF }; -#define FLOWC_LEN 80 +enum { + FLOWC_LEN = offsetof(struct fw_flowc_wr, mnemval[FW_FLOWC_MNEM_MAX]) +}; + union cpl_wr_size { struct cpl_abort_req abrt_req; struct cpl_abort_rpl abrt_rpl; @@ -952,6 +980,7 @@ struct c4iw_ep { unsigned int retry_count; int snd_win; int rcv_win; + u32 snd_wscale; struct c4iw_ep_stats stats; }; @@ -988,7 +1017,8 @@ void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qpid, struct c4iw_dev_ucontext *uctx); u32 c4iw_get_resource(struct c4iw_id_table *id_table); void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry); -int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid); +int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, + u32 nr_pdid, u32 nr_srqt); int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev); int c4iw_pblpool_create(struct c4iw_rdev *rdev); int c4iw_rqtpool_create(struct c4iw_rdev *rdev); @@ -1007,10 +1037,10 @@ void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev, void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); -int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int c4iw_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int c4iw_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int 
c4iw_create_listen(struct iw_cm_id *cm_id, int backlog); int c4iw_destroy_listen(struct iw_cm_id *cm_id); @@ -1037,8 +1067,14 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *ib_context, struct ib_udata *udata); -int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata); int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +int c4iw_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata); +int c4iw_destroy_srq(struct ib_srq *ib_srq); +struct ib_srq *c4iw_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *attrs, + struct ib_udata *udata); int c4iw_destroy_qp(struct ib_qp *ib_qp); struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, @@ -1075,12 +1111,19 @@ extern c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS]; void __iomem *c4iw_bar2_addrs(struct c4iw_rdev *rdev, unsigned int qid, enum cxgb4_bar2_qtype qtype, unsigned int *pbar2_qid, u64 *pbar2_pa); +int c4iw_alloc_srq_idx(struct c4iw_rdev *rdev); +void c4iw_free_srq_idx(struct c4iw_rdev *rdev, int idx); extern void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe); extern int c4iw_wr_log; extern int db_fc_threshold; extern int db_coalescing_threshold; extern int use_dsgl; void c4iw_invalidate_mr(struct c4iw_dev *rhp, u32 rkey); +void c4iw_dispatch_srq_limit_reached_event(struct c4iw_srq *srq); +void c4iw_copy_wr_to_srq(struct t4_srq *srq, union t4_recv_wr *wqe, u8 len16); +void c4iw_flush_srqidx(struct c4iw_qp *qhp, u32 srqidx); +int c4iw_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); struct c4iw_wr_wait *c4iw_alloc_wr_wait(gfp_t gfp); typedef int c4iw_restrack_func(struct sk_buff *msg, diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 1feade8bb4b3..4eda6872e617 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -58,41 +58,6 @@ static int fastreg_support = 1; module_param(fastreg_support, int, 0644); MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)"); -static struct ib_ah *c4iw_ah_create(struct ib_pd *pd, - struct rdma_ah_attr *ah_attr, - struct ib_udata *udata) - -{ - return ERR_PTR(-ENOSYS); -} - -static int c4iw_ah_destroy(struct ib_ah *ah) -{ - return -ENOSYS; -} - -static int c4iw_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) -{ - return -ENOSYS; -} - -static int c4iw_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) -{ - return -ENOSYS; -} - -static int c4iw_process_mad(struct ib_device *ibdev, int mad_flags, - u8 port_num, const struct ib_wc *in_wc, - const struct ib_grh *in_grh, - const struct ib_mad_hdr *in_mad, - size_t in_mad_size, - struct ib_mad_hdr *out_mad, - size_t *out_mad_size, - u16 *out_mad_pkey_index) -{ - return -ENOSYS; -} - void _c4iw_free_ucontext(struct kref *kref) { struct c4iw_ucontext *ucontext; @@ -342,8 +307,12 @@ static int c4iw_query_device(struct ib_device *ibdev, struct ib_device_attr *pro props->vendor_part_id = (u32)dev->rdev.lldi.pdev->device; props->max_mr_size = T4_MAX_MR_SIZE; props->max_qp = dev->rdev.lldi.vr->qp.size / 2; + props->max_srq = dev->rdev.lldi.vr->srq.size; props->max_qp_wr = dev->rdev.hw_queue.t4_max_qp_depth; - props->max_sge = T4_MAX_RECV_SGE; + props->max_srq_wr = dev->rdev.hw_queue.t4_max_qp_depth; + props->max_send_sge = min(T4_MAX_SEND_SGE, T4_MAX_WRITE_SGE); + props->max_recv_sge 
= T4_MAX_RECV_SGE; + props->max_srq_sge = T4_MAX_RECV_SGE; props->max_sge_rd = 1; props->max_res_rd_atom = dev->rdev.lldi.max_ird_adapter; props->max_qp_rd_atom = min(dev->rdev.lldi.max_ordird_qp, @@ -592,7 +561,10 @@ void c4iw_register_device(struct work_struct *work) (1ull << IB_USER_VERBS_CMD_POLL_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_POST_SEND) | - (1ull << IB_USER_VERBS_CMD_POST_RECV); + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); dev->ibdev.node_type = RDMA_NODE_RNIC; BUILD_BUG_ON(sizeof(C4IW_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX); memcpy(dev->ibdev.node_desc, C4IW_NODE_DESC, sizeof(C4IW_NODE_DESC)); @@ -608,15 +580,15 @@ void c4iw_register_device(struct work_struct *work) dev->ibdev.mmap = c4iw_mmap; dev->ibdev.alloc_pd = c4iw_allocate_pd; dev->ibdev.dealloc_pd = c4iw_deallocate_pd; - dev->ibdev.create_ah = c4iw_ah_create; - dev->ibdev.destroy_ah = c4iw_ah_destroy; dev->ibdev.create_qp = c4iw_create_qp; dev->ibdev.modify_qp = c4iw_ib_modify_qp; dev->ibdev.query_qp = c4iw_ib_query_qp; dev->ibdev.destroy_qp = c4iw_destroy_qp; + dev->ibdev.create_srq = c4iw_create_srq; + dev->ibdev.modify_srq = c4iw_modify_srq; + dev->ibdev.destroy_srq = c4iw_destroy_srq; dev->ibdev.create_cq = c4iw_create_cq; dev->ibdev.destroy_cq = c4iw_destroy_cq; - dev->ibdev.resize_cq = c4iw_resize_cq; dev->ibdev.poll_cq = c4iw_poll_cq; dev->ibdev.get_dma_mr = c4iw_get_dma_mr; dev->ibdev.reg_user_mr = c4iw_reg_user_mr; @@ -625,12 +597,10 @@ void c4iw_register_device(struct work_struct *work) dev->ibdev.dealloc_mw = c4iw_dealloc_mw; dev->ibdev.alloc_mr = c4iw_alloc_mr; dev->ibdev.map_mr_sg = c4iw_map_mr_sg; - dev->ibdev.attach_mcast = c4iw_multicast_attach; - dev->ibdev.detach_mcast = c4iw_multicast_detach; - dev->ibdev.process_mad = c4iw_process_mad; dev->ibdev.req_notify_cq = c4iw_arm_cq; dev->ibdev.post_send = c4iw_post_send; dev->ibdev.post_recv = c4iw_post_receive; + dev->ibdev.post_srq_recv = c4iw_post_srq_recv; dev->ibdev.alloc_hw_stats = c4iw_alloc_stats; dev->ibdev.get_hw_stats = c4iw_get_mib; dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index aef53305f1c3..b3203afa3b1d 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -147,21 +147,24 @@ static int alloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq, int user) } static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, - struct c4iw_dev_ucontext *uctx) + struct c4iw_dev_ucontext *uctx, int has_rq) { /* * uP clears EQ contexts when the connection exits rdma mode, * so no need to post a RESET WR for these EQs. 
*/ - dma_free_coherent(&(rdev->lldi.pdev->dev), - wq->rq.memsize, wq->rq.queue, - dma_unmap_addr(&wq->rq, mapping)); dealloc_sq(rdev, &wq->sq); - c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); - kfree(wq->rq.sw_rq); kfree(wq->sq.sw_sq); - c4iw_put_qpid(rdev, wq->rq.qid, uctx); c4iw_put_qpid(rdev, wq->sq.qid, uctx); + + if (has_rq) { + dma_free_coherent(&rdev->lldi.pdev->dev, + wq->rq.memsize, wq->rq.queue, + dma_unmap_addr(&wq->rq, mapping)); + c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); + kfree(wq->rq.sw_rq); + c4iw_put_qpid(rdev, wq->rq.qid, uctx); + } return 0; } @@ -195,7 +198,8 @@ void __iomem *c4iw_bar2_addrs(struct c4iw_rdev *rdev, unsigned int qid, static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, struct t4_cq *rcq, struct t4_cq *scq, struct c4iw_dev_ucontext *uctx, - struct c4iw_wr_wait *wr_waitp) + struct c4iw_wr_wait *wr_waitp, + int need_rq) { int user = (uctx != &rdev->uctx); struct fw_ri_res_wr *res_wr; @@ -209,10 +213,12 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, if (!wq->sq.qid) return -ENOMEM; - wq->rq.qid = c4iw_get_qpid(rdev, uctx); - if (!wq->rq.qid) { - ret = -ENOMEM; - goto free_sq_qid; + if (need_rq) { + wq->rq.qid = c4iw_get_qpid(rdev, uctx); + if (!wq->rq.qid) { + ret = -ENOMEM; + goto free_sq_qid; + } } if (!user) { @@ -220,25 +226,31 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, GFP_KERNEL); if (!wq->sq.sw_sq) { ret = -ENOMEM; - goto free_rq_qid; + goto free_rq_qid;//FIXME } - wq->rq.sw_rq = kcalloc(wq->rq.size, sizeof(*wq->rq.sw_rq), - GFP_KERNEL); - if (!wq->rq.sw_rq) { - ret = -ENOMEM; - goto free_sw_sq; + if (need_rq) { + wq->rq.sw_rq = kcalloc(wq->rq.size, + sizeof(*wq->rq.sw_rq), + GFP_KERNEL); + if (!wq->rq.sw_rq) { + ret = -ENOMEM; + goto free_sw_sq; + } } } - /* - * RQT must be a power of 2 and at least 16 deep. - */ - wq->rq.rqt_size = roundup_pow_of_two(max_t(u16, wq->rq.size, 16)); - wq->rq.rqt_hwaddr = c4iw_rqtpool_alloc(rdev, wq->rq.rqt_size); - if (!wq->rq.rqt_hwaddr) { - ret = -ENOMEM; - goto free_sw_rq; + if (need_rq) { + /* + * RQT must be a power of 2 and at least 16 deep. 
+ */ + wq->rq.rqt_size = + roundup_pow_of_two(max_t(u16, wq->rq.size, 16)); + wq->rq.rqt_hwaddr = c4iw_rqtpool_alloc(rdev, wq->rq.rqt_size); + if (!wq->rq.rqt_hwaddr) { + ret = -ENOMEM; + goto free_sw_rq; + } } ret = alloc_sq(rdev, &wq->sq, user); @@ -247,34 +259,39 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, memset(wq->sq.queue, 0, wq->sq.memsize); dma_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr); - wq->rq.queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), - wq->rq.memsize, &(wq->rq.dma_addr), - GFP_KERNEL); - if (!wq->rq.queue) { - ret = -ENOMEM; - goto free_sq; + if (need_rq) { + wq->rq.queue = dma_alloc_coherent(&rdev->lldi.pdev->dev, + wq->rq.memsize, + &wq->rq.dma_addr, + GFP_KERNEL); + if (!wq->rq.queue) { + ret = -ENOMEM; + goto free_sq; + } + pr_debug("sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx\n", + wq->sq.queue, + (unsigned long long)virt_to_phys(wq->sq.queue), + wq->rq.queue, + (unsigned long long)virt_to_phys(wq->rq.queue)); + memset(wq->rq.queue, 0, wq->rq.memsize); + dma_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr); } - pr_debug("sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx\n", - wq->sq.queue, - (unsigned long long)virt_to_phys(wq->sq.queue), - wq->rq.queue, - (unsigned long long)virt_to_phys(wq->rq.queue)); - memset(wq->rq.queue, 0, wq->rq.memsize); - dma_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr); wq->db = rdev->lldi.db_reg; wq->sq.bar2_va = c4iw_bar2_addrs(rdev, wq->sq.qid, T4_BAR2_QTYPE_EGRESS, &wq->sq.bar2_qid, user ? &wq->sq.bar2_pa : NULL); - wq->rq.bar2_va = c4iw_bar2_addrs(rdev, wq->rq.qid, T4_BAR2_QTYPE_EGRESS, - &wq->rq.bar2_qid, - user ? &wq->rq.bar2_pa : NULL); + if (need_rq) + wq->rq.bar2_va = c4iw_bar2_addrs(rdev, wq->rq.qid, + T4_BAR2_QTYPE_EGRESS, + &wq->rq.bar2_qid, + user ? &wq->rq.bar2_pa : NULL); /* * User mode must have bar2 access. */ - if (user && (!wq->sq.bar2_pa || !wq->rq.bar2_pa)) { + if (user && (!wq->sq.bar2_pa || (need_rq && !wq->rq.bar2_pa))) { pr_warn("%s: sqid %u or rqid %u not in BAR2 range\n", pci_name(rdev->lldi.pdev), wq->sq.qid, wq->rq.qid); goto free_dma; @@ -285,7 +302,8 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, /* build fw_ri_res_wr */ wr_len = sizeof *res_wr + 2 * sizeof *res; - + if (need_rq) + wr_len += sizeof(*res); skb = alloc_skb(wr_len, GFP_KERNEL); if (!skb) { ret = -ENOMEM; @@ -296,7 +314,7 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, res_wr = __skb_put_zero(skb, wr_len); res_wr->op_nres = cpu_to_be32( FW_WR_OP_V(FW_RI_RES_WR) | - FW_RI_RES_WR_NRES_V(2) | + FW_RI_RES_WR_NRES_V(need_rq ? 2 : 1) | FW_WR_COMPL_F); res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); res_wr->cookie = (uintptr_t)wr_waitp; @@ -327,30 +345,36 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, FW_RI_RES_WR_EQSIZE_V(eqsize)); res->u.sqrq.eqid = cpu_to_be32(wq->sq.qid); res->u.sqrq.eqaddr = cpu_to_be64(wq->sq.dma_addr); - res++; - res->u.sqrq.restype = FW_RI_RES_TYPE_RQ; - res->u.sqrq.op = FW_RI_RES_OP_WRITE; - /* - * eqsize is the number of 64B entries plus the status page size. 
- */ - eqsize = wq->rq.size * T4_RQ_NUM_SLOTS + - rdev->hw_queue.t4_eq_status_entries; - res->u.sqrq.fetchszm_to_iqid = cpu_to_be32( - FW_RI_RES_WR_HOSTFCMODE_V(0) | /* no host cidx updates */ - FW_RI_RES_WR_CPRIO_V(0) | /* don't keep in chip cache */ - FW_RI_RES_WR_PCIECHN_V(0) | /* set by uP at ri_init time */ - FW_RI_RES_WR_IQID_V(rcq->cqid)); - res->u.sqrq.dcaen_to_eqsize = cpu_to_be32( - FW_RI_RES_WR_DCAEN_V(0) | - FW_RI_RES_WR_DCACPU_V(0) | - FW_RI_RES_WR_FBMIN_V(2) | - FW_RI_RES_WR_FBMAX_V(3) | - FW_RI_RES_WR_CIDXFTHRESHO_V(0) | - FW_RI_RES_WR_CIDXFTHRESH_V(0) | - FW_RI_RES_WR_EQSIZE_V(eqsize)); - res->u.sqrq.eqid = cpu_to_be32(wq->rq.qid); - res->u.sqrq.eqaddr = cpu_to_be64(wq->rq.dma_addr); + if (need_rq) { + res++; + res->u.sqrq.restype = FW_RI_RES_TYPE_RQ; + res->u.sqrq.op = FW_RI_RES_OP_WRITE; + + /* + * eqsize is the number of 64B entries plus the status page size + */ + eqsize = wq->rq.size * T4_RQ_NUM_SLOTS + + rdev->hw_queue.t4_eq_status_entries; + res->u.sqrq.fetchszm_to_iqid = + /* no host cidx updates */ + cpu_to_be32(FW_RI_RES_WR_HOSTFCMODE_V(0) | + /* don't keep in chip cache */ + FW_RI_RES_WR_CPRIO_V(0) | + /* set by uP at ri_init time */ + FW_RI_RES_WR_PCIECHN_V(0) | + FW_RI_RES_WR_IQID_V(rcq->cqid)); + res->u.sqrq.dcaen_to_eqsize = + cpu_to_be32(FW_RI_RES_WR_DCAEN_V(0) | + FW_RI_RES_WR_DCACPU_V(0) | + FW_RI_RES_WR_FBMIN_V(2) | + FW_RI_RES_WR_FBMAX_V(3) | + FW_RI_RES_WR_CIDXFTHRESHO_V(0) | + FW_RI_RES_WR_CIDXFTHRESH_V(0) | + FW_RI_RES_WR_EQSIZE_V(eqsize)); + res->u.sqrq.eqid = cpu_to_be32(wq->rq.qid); + res->u.sqrq.eqaddr = cpu_to_be64(wq->rq.dma_addr); + } c4iw_init_wr_wait(wr_waitp); ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, wq->sq.qid, __func__); @@ -363,26 +387,30 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, return 0; free_dma: - dma_free_coherent(&(rdev->lldi.pdev->dev), - wq->rq.memsize, wq->rq.queue, - dma_unmap_addr(&wq->rq, mapping)); + if (need_rq) + dma_free_coherent(&rdev->lldi.pdev->dev, + wq->rq.memsize, wq->rq.queue, + dma_unmap_addr(&wq->rq, mapping)); free_sq: dealloc_sq(rdev, &wq->sq); free_hwaddr: - c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); + if (need_rq) + c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size); free_sw_rq: - kfree(wq->rq.sw_rq); + if (need_rq) + kfree(wq->rq.sw_rq); free_sw_sq: kfree(wq->sq.sw_sq); free_rq_qid: - c4iw_put_qpid(rdev, wq->rq.qid, uctx); + if (need_rq) + c4iw_put_qpid(rdev, wq->rq.qid, uctx); free_sq_qid: c4iw_put_qpid(rdev, wq->sq.qid, uctx); return ret; } static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp, - struct ib_send_wr *wr, int max, u32 *plenp) + const struct ib_send_wr *wr, int max, u32 *plenp) { u8 *dstp, *srcp; u32 plen = 0; @@ -427,7 +455,12 @@ static int build_isgl(__be64 *queue_start, __be64 *queue_end, { int i; u32 plen = 0; - __be64 *flitp = (__be64 *)isglp->sge; + __be64 *flitp; + + if ((__be64 *)isglp == queue_end) + isglp = (struct fw_ri_isgl *)queue_start; + + flitp = (__be64 *)isglp->sge; for (i = 0; i < num_sge; i++) { if ((plen + sg_list[i].length) < plen) @@ -452,7 +485,7 @@ static int build_isgl(__be64 *queue_start, __be64 *queue_end, } static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, - struct ib_send_wr *wr, u8 *len16) + const struct ib_send_wr *wr, u8 *len16) { u32 plen; int size; @@ -519,7 +552,7 @@ static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe, } static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, - struct ib_send_wr *wr, u8 *len16) + const struct ib_send_wr *wr, u8 *len16) { u32 
plen; int size; @@ -527,7 +560,15 @@ static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, if (wr->num_sge > T4_MAX_SEND_SGE) return -EINVAL; - wqe->write.r2 = 0; + + /* + * iWARP protocol supports 64 bit immediate data but rdma api + * limits it to 32bit. + */ + if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wqe->write.iw_imm_data.ib_imm_data.imm_data32 = wr->ex.imm_data; + else + wqe->write.iw_imm_data.ib_imm_data.imm_data32 = 0; wqe->write.stag_sink = cpu_to_be32(rdma_wr(wr)->rkey); wqe->write.to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr); if (wr->num_sge) { @@ -561,7 +602,58 @@ static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe, return 0; } -static int build_rdma_read(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) +static void build_immd_cmpl(struct t4_sq *sq, struct fw_ri_immd_cmpl *immdp, + struct ib_send_wr *wr) +{ + memcpy((u8 *)immdp->data, (u8 *)(uintptr_t)wr->sg_list->addr, 16); + memset(immdp->r1, 0, 6); + immdp->op = FW_RI_DATA_IMMD; + immdp->immdlen = 16; +} + +static void build_rdma_write_cmpl(struct t4_sq *sq, + struct fw_ri_rdma_write_cmpl_wr *wcwr, + const struct ib_send_wr *wr, u8 *len16) +{ + u32 plen; + int size; + + /* + * This code assumes the struct fields preceding the write isgl + * fit in one 64B WR slot. This is because the WQE is built + * directly in the dma queue, and wrapping is only handled + * by the code buildling sgls. IE the "fixed part" of the wr + * structs must all fit in 64B. The WQE build code should probably be + * redesigned to avoid this restriction, but for now just add + * the BUILD_BUG_ON() to catch if this WQE struct gets too big. + */ + BUILD_BUG_ON(offsetof(struct fw_ri_rdma_write_cmpl_wr, u) > 64); + + wcwr->stag_sink = cpu_to_be32(rdma_wr(wr)->rkey); + wcwr->to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr); + wcwr->stag_inv = cpu_to_be32(wr->next->ex.invalidate_rkey); + wcwr->r2 = 0; + wcwr->r3 = 0; + + /* SEND_INV SGL */ + if (wr->next->send_flags & IB_SEND_INLINE) + build_immd_cmpl(sq, &wcwr->u_cmpl.immd_src, wr->next); + else + build_isgl((__be64 *)sq->queue, (__be64 *)&sq->queue[sq->size], + &wcwr->u_cmpl.isgl_src, wr->next->sg_list, 1, NULL); + + /* WRITE SGL */ + build_isgl((__be64 *)sq->queue, (__be64 *)&sq->queue[sq->size], + wcwr->u.isgl_src, wr->sg_list, wr->num_sge, &plen); + + size = sizeof(*wcwr) + sizeof(struct fw_ri_isgl) + + wr->num_sge * sizeof(struct fw_ri_sge); + wcwr->plen = cpu_to_be32(plen); + *len16 = DIV_ROUND_UP(size, 16); +} + +static int build_rdma_read(union t4_wr *wqe, const struct ib_send_wr *wr, + u8 *len16) { if (wr->num_sge > 1) return -EINVAL; @@ -590,8 +682,74 @@ static int build_rdma_read(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) return 0; } +static void post_write_cmpl(struct c4iw_qp *qhp, const struct ib_send_wr *wr) +{ + bool send_signaled = (wr->next->send_flags & IB_SEND_SIGNALED) || + qhp->sq_sig_all; + bool write_signaled = (wr->send_flags & IB_SEND_SIGNALED) || + qhp->sq_sig_all; + struct t4_swsqe *swsqe; + union t4_wr *wqe; + u16 write_wrid; + u8 len16; + u16 idx; + + /* + * The sw_sq entries still look like a WRITE and a SEND and consume + * 2 slots. The FW WR, however, will be a single uber-WR. 
+ */ + wqe = (union t4_wr *)((u8 *)qhp->wq.sq.queue + + qhp->wq.sq.wq_pidx * T4_EQ_ENTRY_SIZE); + build_rdma_write_cmpl(&qhp->wq.sq, &wqe->write_cmpl, wr, &len16); + + /* WRITE swsqe */ + swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; + swsqe->opcode = FW_RI_RDMA_WRITE; + swsqe->idx = qhp->wq.sq.pidx; + swsqe->complete = 0; + swsqe->signaled = write_signaled; + swsqe->flushed = 0; + swsqe->wr_id = wr->wr_id; + if (c4iw_wr_log) { + swsqe->sge_ts = + cxgb4_read_sge_timestamp(qhp->rhp->rdev.lldi.ports[0]); + swsqe->host_time = ktime_get(); + } + + write_wrid = qhp->wq.sq.pidx; + + /* just bump the sw_sq */ + qhp->wq.sq.in_use++; + if (++qhp->wq.sq.pidx == qhp->wq.sq.size) + qhp->wq.sq.pidx = 0; + + /* SEND_WITH_INV swsqe */ + swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx]; + swsqe->opcode = FW_RI_SEND_WITH_INV; + swsqe->idx = qhp->wq.sq.pidx; + swsqe->complete = 0; + swsqe->signaled = send_signaled; + swsqe->flushed = 0; + swsqe->wr_id = wr->next->wr_id; + if (c4iw_wr_log) { + swsqe->sge_ts = + cxgb4_read_sge_timestamp(qhp->rhp->rdev.lldi.ports[0]); + swsqe->host_time = ktime_get(); + } + + wqe->write_cmpl.flags_send = send_signaled ? FW_RI_COMPLETION_FLAG : 0; + wqe->write_cmpl.wrid_send = qhp->wq.sq.pidx; + + init_wr_hdr(wqe, write_wrid, FW_RI_RDMA_WRITE_CMPL_WR, + write_signaled ? FW_RI_COMPLETION_FLAG : 0, len16); + t4_sq_produce(&qhp->wq, len16); + idx = DIV_ROUND_UP(len16 * 16, T4_EQ_ENTRY_SIZE); + + t4_ring_sq_db(&qhp->wq, idx, wqe); +} + static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, - struct ib_recv_wr *wr, u8 *len16) + const struct ib_recv_wr *wr, u8 *len16) { int ret; @@ -605,8 +763,22 @@ static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe, return 0; } +static int build_srq_recv(union t4_recv_wr *wqe, const struct ib_recv_wr *wr, + u8 *len16) +{ + int ret; + + ret = build_isgl((__be64 *)wqe, (__be64 *)(wqe + 1), + &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL); + if (ret) + return ret; + *len16 = DIV_ROUND_UP(sizeof(wqe->recv) + + wr->num_sge * sizeof(struct fw_ri_sge), 16); + return 0; +} + static void build_tpte_memreg(struct fw_ri_fr_nsmr_tpte_wr *fr, - struct ib_reg_wr *wr, struct c4iw_mr *mhp, + const struct ib_reg_wr *wr, struct c4iw_mr *mhp, u8 *len16) { __be64 *p = (__be64 *)fr->pbl; @@ -638,8 +810,8 @@ static void build_tpte_memreg(struct fw_ri_fr_nsmr_tpte_wr *fr, } static int build_memreg(struct t4_sq *sq, union t4_wr *wqe, - struct ib_reg_wr *wr, struct c4iw_mr *mhp, u8 *len16, - bool dsgl_supported) + const struct ib_reg_wr *wr, struct c4iw_mr *mhp, + u8 *len16, bool dsgl_supported) { struct fw_ri_immd *imdp; __be64 *p; @@ -701,7 +873,8 @@ static int build_memreg(struct t4_sq *sq, union t4_wr *wqe, return 0; } -static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16) +static int build_inv_stag(union t4_wr *wqe, const struct ib_send_wr *wr, + u8 *len16) { wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey); wqe->inv.r2 = 0; @@ -721,7 +894,7 @@ static void free_qp_work(struct work_struct *work) pr_debug("qhp %p ucontext %p\n", qhp, ucontext); destroy_qp(&rhp->rdev, &qhp->wq, - ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + ucontext ? 
&ucontext->uctx : &rhp->rdev.uctx, !qhp->srq); if (ucontext) c4iw_put_ucontext(ucontext); @@ -804,6 +977,9 @@ static int ib_to_fw_opcode(int ib_opcode) case IB_WR_RDMA_WRITE: opcode = FW_RI_RDMA_WRITE; break; + case IB_WR_RDMA_WRITE_WITH_IMM: + opcode = FW_RI_WRITE_IMMEDIATE; + break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: opcode = FW_RI_READ_REQ; @@ -820,7 +996,8 @@ static int ib_to_fw_opcode(int ib_opcode) return opcode; } -static int complete_sq_drain_wr(struct c4iw_qp *qhp, struct ib_send_wr *wr) +static int complete_sq_drain_wr(struct c4iw_qp *qhp, + const struct ib_send_wr *wr) { struct t4_cqe cqe = {}; struct c4iw_cq *schp; @@ -858,8 +1035,9 @@ static int complete_sq_drain_wr(struct c4iw_qp *qhp, struct ib_send_wr *wr) return 0; } -static int complete_sq_drain_wrs(struct c4iw_qp *qhp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +static int complete_sq_drain_wrs(struct c4iw_qp *qhp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { int ret = 0; @@ -874,7 +1052,8 @@ static int complete_sq_drain_wrs(struct c4iw_qp *qhp, struct ib_send_wr *wr, return ret; } -static void complete_rq_drain_wr(struct c4iw_qp *qhp, struct ib_recv_wr *wr) +static void complete_rq_drain_wr(struct c4iw_qp *qhp, + const struct ib_recv_wr *wr) { struct t4_cqe cqe = {}; struct c4iw_cq *rchp; @@ -906,7 +1085,8 @@ static void complete_rq_drain_wr(struct c4iw_qp *qhp, struct ib_recv_wr *wr) } } -static void complete_rq_drain_wrs(struct c4iw_qp *qhp, struct ib_recv_wr *wr) +static void complete_rq_drain_wrs(struct c4iw_qp *qhp, + const struct ib_recv_wr *wr) { while (wr) { complete_rq_drain_wr(qhp, wr); @@ -914,14 +1094,15 @@ static void complete_rq_drain_wrs(struct c4iw_qp *qhp, struct ib_recv_wr *wr) } } -int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int c4iw_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { int err = 0; u8 len16 = 0; enum fw_wr_opcodes fw_opcode = 0; enum fw_ri_wr_flags fw_flags; struct c4iw_qp *qhp; + struct c4iw_dev *rhp; union t4_wr *wqe = NULL; u32 num_wrs; struct t4_swsqe *swsqe; @@ -929,6 +1110,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, u16 idx = 0; qhp = to_c4iw_qp(ibqp); + rhp = qhp->rhp; spin_lock_irqsave(&qhp->lock, flag); /* @@ -946,6 +1128,30 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, *bad_wr = wr; return -ENOMEM; } + + /* + * Fastpath for NVMe-oF target WRITE + SEND_WITH_INV wr chain which is + * the response for small NVMEe-oF READ requests. If the chain is + * exactly a WRITE->SEND_WITH_INV and the sgl depths and lengths + * meet the requirements of the fw_ri_write_cmpl_wr work request, + * then build and post the write_cmpl WR. If any of the tests + * below are not true, then we continue on with the tradtional WRITE + * and SEND WRs. 
+ */ + if (qhp->rhp->rdev.lldi.write_cmpl_support && + CHELSIO_CHIP_VERSION(qhp->rhp->rdev.lldi.adapter_type) >= + CHELSIO_T5 && + wr && wr->next && !wr->next->next && + wr->opcode == IB_WR_RDMA_WRITE && + wr->sg_list[0].length && wr->num_sge <= T4_WRITE_CMPL_MAX_SGL && + wr->next->opcode == IB_WR_SEND_WITH_INV && + wr->next->sg_list[0].length == T4_WRITE_CMPL_MAX_CQE && + wr->next->num_sge == 1 && num_wrs >= 2) { + post_write_cmpl(qhp, wr); + spin_unlock_irqrestore(&qhp->lock, flag); + return 0; + } + while (wr) { if (num_wrs == 0) { err = -ENOMEM; @@ -973,6 +1179,13 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, swsqe->opcode = FW_RI_SEND_WITH_INV; err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16); break; + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!rhp->rdev.lldi.write_w_imm_support)) { + err = -EINVAL; + break; + } + fw_flags |= FW_RI_RDMA_WRITE_WITH_IMMEDIATE; + /*FALLTHROUGH*/ case IB_WR_RDMA_WRITE: fw_opcode = FW_RI_RDMA_WRITE_WR; swsqe->opcode = FW_RI_RDMA_WRITE; @@ -983,8 +1196,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, fw_opcode = FW_RI_RDMA_READ_WR; swsqe->opcode = FW_RI_READ_REQ; if (wr->opcode == IB_WR_RDMA_READ_WITH_INV) { - c4iw_invalidate_mr(qhp->rhp, - wr->sg_list[0].lkey); + c4iw_invalidate_mr(rhp, wr->sg_list[0].lkey); fw_flags = FW_RI_RDMA_READ_INVALIDATE; } else { fw_flags = 0; @@ -1000,7 +1212,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct c4iw_mr *mhp = to_c4iw_mr(reg_wr(wr)->mr); swsqe->opcode = FW_RI_FAST_REGISTER; - if (qhp->rhp->rdev.lldi.fr_nsmr_tpte_wr_support && + if (rhp->rdev.lldi.fr_nsmr_tpte_wr_support && !mhp->attr.state && mhp->mpl_len <= 2) { fw_opcode = FW_RI_FR_NSMR_TPTE_WR; build_tpte_memreg(&wqe->fr_tpte, reg_wr(wr), @@ -1009,7 +1221,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, fw_opcode = FW_RI_FR_NSMR_WR; err = build_memreg(&qhp->wq.sq, wqe, reg_wr(wr), mhp, &len16, - qhp->rhp->rdev.lldi.ulptx_memwrite_dsgl); + rhp->rdev.lldi.ulptx_memwrite_dsgl); if (err) break; } @@ -1022,7 +1234,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, fw_opcode = FW_RI_INV_LSTAG_WR; swsqe->opcode = FW_RI_LOCAL_INV; err = build_inv_stag(wqe, wr, &len16); - c4iw_invalidate_mr(qhp->rhp, wr->ex.invalidate_rkey); + c4iw_invalidate_mr(rhp, wr->ex.invalidate_rkey); break; default: pr_warn("%s post of type=%d TBD!\n", __func__, @@ -1041,7 +1253,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, swsqe->wr_id = wr->wr_id; if (c4iw_wr_log) { swsqe->sge_ts = cxgb4_read_sge_timestamp( - qhp->rhp->rdev.lldi.ports[0]); + rhp->rdev.lldi.ports[0]); swsqe->host_time = ktime_get(); } @@ -1055,7 +1267,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, t4_sq_produce(&qhp->wq, len16); idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE); } - if (!qhp->rhp->rdev.status_page->db_off) { + if (!rhp->rdev.status_page->db_off) { t4_ring_sq_db(&qhp->wq, idx, wqe); spin_unlock_irqrestore(&qhp->lock, flag); } else { @@ -1065,8 +1277,8 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, return err; } -int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int c4iw_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { int err = 0; struct c4iw_qp *qhp; @@ -1145,6 +1357,89 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, return err; } +static void defer_srq_wr(struct t4_srq *srq, union t4_recv_wr *wqe, + u64 wr_id, u8 len16) +{ + struct 
t4_srq_pending_wr *pwr = &srq->pending_wrs[srq->pending_pidx]; + + pr_debug("%s cidx %u pidx %u wq_pidx %u in_use %u ooo_count %u wr_id 0x%llx pending_cidx %u pending_pidx %u pending_in_use %u\n", + __func__, srq->cidx, srq->pidx, srq->wq_pidx, + srq->in_use, srq->ooo_count, + (unsigned long long)wr_id, srq->pending_cidx, + srq->pending_pidx, srq->pending_in_use); + pwr->wr_id = wr_id; + pwr->len16 = len16; + memcpy(&pwr->wqe, wqe, len16 * 16); + t4_srq_produce_pending_wr(srq); +} + +int c4iw_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + union t4_recv_wr *wqe, lwqe; + struct c4iw_srq *srq; + unsigned long flag; + u8 len16 = 0; + u16 idx = 0; + int err = 0; + u32 num_wrs; + + srq = to_c4iw_srq(ibsrq); + spin_lock_irqsave(&srq->lock, flag); + num_wrs = t4_srq_avail(&srq->wq); + if (num_wrs == 0) { + spin_unlock_irqrestore(&srq->lock, flag); + return -ENOMEM; + } + while (wr) { + if (wr->num_sge > T4_MAX_RECV_SGE) { + err = -EINVAL; + *bad_wr = wr; + break; + } + wqe = &lwqe; + if (num_wrs) + err = build_srq_recv(wqe, wr, &len16); + else + err = -ENOMEM; + if (err) { + *bad_wr = wr; + break; + } + + wqe->recv.opcode = FW_RI_RECV_WR; + wqe->recv.r1 = 0; + wqe->recv.wrid = srq->wq.pidx; + wqe->recv.r2[0] = 0; + wqe->recv.r2[1] = 0; + wqe->recv.r2[2] = 0; + wqe->recv.len16 = len16; + + if (srq->wq.ooo_count || + srq->wq.pending_in_use || + srq->wq.sw_rq[srq->wq.pidx].valid) { + defer_srq_wr(&srq->wq, wqe, wr->wr_id, len16); + } else { + srq->wq.sw_rq[srq->wq.pidx].wr_id = wr->wr_id; + srq->wq.sw_rq[srq->wq.pidx].valid = 1; + c4iw_copy_wr_to_srq(&srq->wq, wqe, len16); + pr_debug("%s cidx %u pidx %u wq_pidx %u in_use %u wr_id 0x%llx\n", + __func__, srq->wq.cidx, + srq->wq.pidx, srq->wq.wq_pidx, + srq->wq.in_use, + (unsigned long long)wr->wr_id); + t4_srq_produce(&srq->wq, len16); + idx += DIV_ROUND_UP(len16 * 16, T4_EQ_ENTRY_SIZE); + } + wr = wr->next; + num_wrs--; + } + if (idx) + t4_ring_srq_db(&srq->wq, idx, len16, wqe); + spin_unlock_irqrestore(&srq->lock, flag); + return err; +} + static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type, u8 *ecode) { @@ -1321,7 +1616,7 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, struct c4iw_cq *schp) { int count; - int rq_flushed, sq_flushed; + int rq_flushed = 0, sq_flushed; unsigned long flag; pr_debug("qhp %p rchp %p schp %p\n", qhp, rchp, schp); @@ -1340,11 +1635,13 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, return; } qhp->wq.flushed = 1; - t4_set_wq_in_error(&qhp->wq); + t4_set_wq_in_error(&qhp->wq, 0); c4iw_flush_hw_cq(rchp, qhp); - c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count); - rq_flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); + if (!qhp->srq) { + c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count); + rq_flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); + } if (schp != rchp) c4iw_flush_hw_cq(schp, qhp); @@ -1388,7 +1685,7 @@ static void flush_qp(struct c4iw_qp *qhp) schp = to_c4iw_cq(qhp->ibqp.send_cq); if (qhp->ibqp.uobject) { - t4_set_wq_in_error(&qhp->wq); + t4_set_wq_in_error(&qhp->wq, 0); t4_set_cq_in_error(&rchp->cq); spin_lock_irqsave(&rchp->comp_handler_lock, flag); (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); @@ -1517,16 +1814,21 @@ static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp) wqe->u.init.pdid = cpu_to_be32(qhp->attr.pd); wqe->u.init.qpid = cpu_to_be32(qhp->wq.sq.qid); wqe->u.init.sq_eqid = cpu_to_be32(qhp->wq.sq.qid); - wqe->u.init.rq_eqid = cpu_to_be32(qhp->wq.rq.qid); + 
if (qhp->srq) { + wqe->u.init.rq_eqid = cpu_to_be32(FW_RI_INIT_RQEQID_SRQ | + qhp->srq->idx); + } else { + wqe->u.init.rq_eqid = cpu_to_be32(qhp->wq.rq.qid); + wqe->u.init.hwrqsize = cpu_to_be32(qhp->wq.rq.rqt_size); + wqe->u.init.hwrqaddr = cpu_to_be32(qhp->wq.rq.rqt_hwaddr - + rhp->rdev.lldi.vr->rq.start); + } wqe->u.init.scqid = cpu_to_be32(qhp->attr.scq); wqe->u.init.rcqid = cpu_to_be32(qhp->attr.rcq); wqe->u.init.ord_max = cpu_to_be32(qhp->attr.max_ord); wqe->u.init.ird_max = cpu_to_be32(qhp->attr.max_ird); wqe->u.init.iss = cpu_to_be32(qhp->ep->snd_seq); wqe->u.init.irs = cpu_to_be32(qhp->ep->rcv_seq); - wqe->u.init.hwrqsize = cpu_to_be32(qhp->wq.rq.rqt_size); - wqe->u.init.hwrqaddr = cpu_to_be32(qhp->wq.rq.rqt_hwaddr - - rhp->rdev.lldi.vr->rq.start); if (qhp->attr.mpa_attr.initiator) build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init); @@ -1643,7 +1945,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, case C4IW_QP_STATE_RTS: switch (attrs->next_state) { case C4IW_QP_STATE_CLOSING: - t4_set_wq_in_error(&qhp->wq); + t4_set_wq_in_error(&qhp->wq, 0); set_state(qhp, C4IW_QP_STATE_CLOSING); ep = qhp->ep; if (!internal) { @@ -1656,7 +1958,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, goto err; break; case C4IW_QP_STATE_TERMINATE: - t4_set_wq_in_error(&qhp->wq); + t4_set_wq_in_error(&qhp->wq, 0); set_state(qhp, C4IW_QP_STATE_TERMINATE); qhp->attr.layer_etype = attrs->layer_etype; qhp->attr.ecode = attrs->ecode; @@ -1673,7 +1975,7 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp, } break; case C4IW_QP_STATE_ERROR: - t4_set_wq_in_error(&qhp->wq); + t4_set_wq_in_error(&qhp->wq, 0); set_state(qhp, C4IW_QP_STATE_ERROR); if (!internal) { abort = 1; @@ -1819,7 +2121,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, struct c4iw_cq *schp; struct c4iw_cq *rchp; struct c4iw_create_qp_resp uresp; - unsigned int sqsize, rqsize; + unsigned int sqsize, rqsize = 0; struct c4iw_ucontext *ucontext; int ret; struct c4iw_mm_entry *sq_key_mm, *rq_key_mm = NULL, *sq_db_key_mm; @@ -1840,11 +2142,13 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, if (attrs->cap.max_inline_data > T4_MAX_SEND_INLINE) return ERR_PTR(-EINVAL); - if (attrs->cap.max_recv_wr > rhp->rdev.hw_queue.t4_max_rq_size) - return ERR_PTR(-E2BIG); - rqsize = attrs->cap.max_recv_wr + 1; - if (rqsize < 8) - rqsize = 8; + if (!attrs->srq) { + if (attrs->cap.max_recv_wr > rhp->rdev.hw_queue.t4_max_rq_size) + return ERR_PTR(-E2BIG); + rqsize = attrs->cap.max_recv_wr + 1; + if (rqsize < 8) + rqsize = 8; + } if (attrs->cap.max_send_wr > rhp->rdev.hw_queue.t4_max_sq_size) return ERR_PTR(-E2BIG); @@ -1869,19 +2173,23 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, (sqsize + rhp->rdev.hw_queue.t4_eq_status_entries) * sizeof(*qhp->wq.sq.queue) + 16 * sizeof(__be64); qhp->wq.sq.flush_cidx = -1; - qhp->wq.rq.size = rqsize; - qhp->wq.rq.memsize = - (rqsize + rhp->rdev.hw_queue.t4_eq_status_entries) * - sizeof(*qhp->wq.rq.queue); + if (!attrs->srq) { + qhp->wq.rq.size = rqsize; + qhp->wq.rq.memsize = + (rqsize + rhp->rdev.hw_queue.t4_eq_status_entries) * + sizeof(*qhp->wq.rq.queue); + } if (ucontext) { qhp->wq.sq.memsize = roundup(qhp->wq.sq.memsize, PAGE_SIZE); - qhp->wq.rq.memsize = roundup(qhp->wq.rq.memsize, PAGE_SIZE); + if (!attrs->srq) + qhp->wq.rq.memsize = + roundup(qhp->wq.rq.memsize, PAGE_SIZE); } ret = create_qp(&rhp->rdev, &qhp->wq, &schp->cq, &rchp->cq, ucontext ? 
&ucontext->uctx : &rhp->rdev.uctx, - qhp->wr_waitp); + qhp->wr_waitp, !attrs->srq); if (ret) goto err_free_wr_wait; @@ -1894,10 +2202,12 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, qhp->attr.scq = ((struct c4iw_cq *) attrs->send_cq)->cq.cqid; qhp->attr.rcq = ((struct c4iw_cq *) attrs->recv_cq)->cq.cqid; qhp->attr.sq_num_entries = attrs->cap.max_send_wr; - qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; qhp->attr.sq_max_sges = attrs->cap.max_send_sge; qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge; - qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; + if (!attrs->srq) { + qhp->attr.rq_num_entries = attrs->cap.max_recv_wr; + qhp->attr.rq_max_sges = attrs->cap.max_recv_sge; + } qhp->attr.state = C4IW_QP_STATE_IDLE; qhp->attr.next_state = C4IW_QP_STATE_IDLE; qhp->attr.enable_rdma_read = 1; @@ -1922,21 +2232,27 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, ret = -ENOMEM; goto err_remove_handle; } - rq_key_mm = kmalloc(sizeof(*rq_key_mm), GFP_KERNEL); - if (!rq_key_mm) { - ret = -ENOMEM; - goto err_free_sq_key; + if (!attrs->srq) { + rq_key_mm = kmalloc(sizeof(*rq_key_mm), GFP_KERNEL); + if (!rq_key_mm) { + ret = -ENOMEM; + goto err_free_sq_key; + } } sq_db_key_mm = kmalloc(sizeof(*sq_db_key_mm), GFP_KERNEL); if (!sq_db_key_mm) { ret = -ENOMEM; goto err_free_rq_key; } - rq_db_key_mm = kmalloc(sizeof(*rq_db_key_mm), GFP_KERNEL); - if (!rq_db_key_mm) { - ret = -ENOMEM; - goto err_free_sq_db_key; + if (!attrs->srq) { + rq_db_key_mm = + kmalloc(sizeof(*rq_db_key_mm), GFP_KERNEL); + if (!rq_db_key_mm) { + ret = -ENOMEM; + goto err_free_sq_db_key; + } } + memset(&uresp, 0, sizeof(uresp)); if (t4_sq_onchip(&qhp->wq.sq)) { ma_sync_key_mm = kmalloc(sizeof(*ma_sync_key_mm), GFP_KERNEL); @@ -1945,30 +2261,35 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, goto err_free_rq_db_key; } uresp.flags = C4IW_QPF_ONCHIP; - } else - uresp.flags = 0; + } + if (rhp->rdev.lldi.write_w_imm_support) + uresp.flags |= C4IW_QPF_WRITE_W_IMM; uresp.qid_mask = rhp->rdev.qpmask; uresp.sqid = qhp->wq.sq.qid; uresp.sq_size = qhp->wq.sq.size; uresp.sq_memsize = qhp->wq.sq.memsize; - uresp.rqid = qhp->wq.rq.qid; - uresp.rq_size = qhp->wq.rq.size; - uresp.rq_memsize = qhp->wq.rq.memsize; + if (!attrs->srq) { + uresp.rqid = qhp->wq.rq.qid; + uresp.rq_size = qhp->wq.rq.size; + uresp.rq_memsize = qhp->wq.rq.memsize; + } spin_lock(&ucontext->mmap_lock); if (ma_sync_key_mm) { uresp.ma_sync_key = ucontext->key; ucontext->key += PAGE_SIZE; - } else { - uresp.ma_sync_key = 0; } uresp.sq_key = ucontext->key; ucontext->key += PAGE_SIZE; - uresp.rq_key = ucontext->key; - ucontext->key += PAGE_SIZE; + if (!attrs->srq) { + uresp.rq_key = ucontext->key; + ucontext->key += PAGE_SIZE; + } uresp.sq_db_gts_key = ucontext->key; ucontext->key += PAGE_SIZE; - uresp.rq_db_gts_key = ucontext->key; - ucontext->key += PAGE_SIZE; + if (!attrs->srq) { + uresp.rq_db_gts_key = ucontext->key; + ucontext->key += PAGE_SIZE; + } spin_unlock(&ucontext->mmap_lock); ret = ib_copy_to_udata(udata, &uresp, sizeof uresp); if (ret) @@ -1977,18 +2298,23 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, sq_key_mm->addr = qhp->wq.sq.phys_addr; sq_key_mm->len = PAGE_ALIGN(qhp->wq.sq.memsize); insert_mmap(ucontext, sq_key_mm); - rq_key_mm->key = uresp.rq_key; - rq_key_mm->addr = virt_to_phys(qhp->wq.rq.queue); - rq_key_mm->len = PAGE_ALIGN(qhp->wq.rq.memsize); - insert_mmap(ucontext, rq_key_mm); + if (!attrs->srq) { + 
rq_key_mm->key = uresp.rq_key; + rq_key_mm->addr = virt_to_phys(qhp->wq.rq.queue); + rq_key_mm->len = PAGE_ALIGN(qhp->wq.rq.memsize); + insert_mmap(ucontext, rq_key_mm); + } sq_db_key_mm->key = uresp.sq_db_gts_key; sq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.sq.bar2_pa; sq_db_key_mm->len = PAGE_SIZE; insert_mmap(ucontext, sq_db_key_mm); - rq_db_key_mm->key = uresp.rq_db_gts_key; - rq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.rq.bar2_pa; - rq_db_key_mm->len = PAGE_SIZE; - insert_mmap(ucontext, rq_db_key_mm); + if (!attrs->srq) { + rq_db_key_mm->key = uresp.rq_db_gts_key; + rq_db_key_mm->addr = + (u64)(unsigned long)qhp->wq.rq.bar2_pa; + rq_db_key_mm->len = PAGE_SIZE; + insert_mmap(ucontext, rq_db_key_mm); + } if (ma_sync_key_mm) { ma_sync_key_mm->key = uresp.ma_sync_key; ma_sync_key_mm->addr = @@ -2001,7 +2327,19 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, c4iw_get_ucontext(ucontext); qhp->ucontext = ucontext; } + if (!attrs->srq) { + qhp->wq.qp_errp = + &qhp->wq.rq.queue[qhp->wq.rq.size].status.qp_err; + } else { + qhp->wq.qp_errp = + &qhp->wq.sq.queue[qhp->wq.sq.size].status.qp_err; + qhp->wq.srqidxp = + &qhp->wq.sq.queue[qhp->wq.sq.size].status.srqidx; + } + qhp->ibqp.qp_num = qhp->wq.sq.qid; + if (attrs->srq) + qhp->srq = to_c4iw_srq(attrs->srq); INIT_LIST_HEAD(&qhp->db_fc_entry); pr_debug("sq id %u size %u memsize %zu num_entries %u rq id %u size %u memsize %zu num_entries %u\n", qhp->wq.sq.qid, qhp->wq.sq.size, qhp->wq.sq.memsize, @@ -2011,18 +2349,20 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs, err_free_ma_sync_key: kfree(ma_sync_key_mm); err_free_rq_db_key: - kfree(rq_db_key_mm); + if (!attrs->srq) + kfree(rq_db_key_mm); err_free_sq_db_key: kfree(sq_db_key_mm); err_free_rq_key: - kfree(rq_key_mm); + if (!attrs->srq) + kfree(rq_key_mm); err_free_sq_key: kfree(sq_key_mm); err_remove_handle: remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid); err_destroy_qp: destroy_qp(&rhp->rdev, &qhp->wq, - ucontext ? &ucontext->uctx : &rhp->rdev.uctx); + ucontext ? &ucontext->uctx : &rhp->rdev.uctx, !attrs->srq); err_free_wr_wait: c4iw_put_wr_wait(qhp->wr_waitp); err_free_qhp: @@ -2088,6 +2428,45 @@ struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn) return (struct ib_qp *)get_qhp(to_c4iw_dev(dev), qpn); } +void c4iw_dispatch_srq_limit_reached_event(struct c4iw_srq *srq) +{ + struct ib_event event = {}; + + event.device = &srq->rhp->ibdev; + event.element.srq = &srq->ibsrq; + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + ib_dispatch_event(&event); +} + +int c4iw_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata) +{ + struct c4iw_srq *srq = to_c4iw_srq(ib_srq); + int ret = 0; + + /* + * XXX 0 mask == a SW interrupt for srq_limit reached... + */ + if (udata && !srq_attr_mask) { + c4iw_dispatch_srq_limit_reached_event(srq); + goto out; + } + + /* no support for this yet */ + if (srq_attr_mask & IB_SRQ_MAX_WR) { + ret = -EINVAL; + goto out; + } + + if (!udata && (srq_attr_mask & IB_SRQ_LIMIT)) { + srq->armed = true; + srq->srq_limit = attr->srq_limit; + } +out: + return ret; +} + int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_qp_init_attr *init_attr) { @@ -2104,3 +2483,359 @@ int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, init_attr->sq_sig_type = qhp->sq_sig_all ? 
IB_SIGNAL_ALL_WR : 0; return 0; } + +static void free_srq_queue(struct c4iw_srq *srq, struct c4iw_dev_ucontext *uctx, + struct c4iw_wr_wait *wr_waitp) +{ + struct c4iw_rdev *rdev = &srq->rhp->rdev; + struct sk_buff *skb = srq->destroy_skb; + struct t4_srq *wq = &srq->wq; + struct fw_ri_res_wr *res_wr; + struct fw_ri_res *res; + int wr_len; + + wr_len = sizeof(*res_wr) + sizeof(*res); + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len); + memset(res_wr, 0, wr_len); + res_wr->op_nres = cpu_to_be32(FW_WR_OP_V(FW_RI_RES_WR) | + FW_RI_RES_WR_NRES_V(1) | + FW_WR_COMPL_F); + res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); + res_wr->cookie = (uintptr_t)wr_waitp; + res = res_wr->res; + res->u.srq.restype = FW_RI_RES_TYPE_SRQ; + res->u.srq.op = FW_RI_RES_OP_RESET; + res->u.srq.srqid = cpu_to_be32(srq->idx); + res->u.srq.eqid = cpu_to_be32(wq->qid); + + c4iw_init_wr_wait(wr_waitp); + c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, 0, __func__); + + dma_free_coherent(&rdev->lldi.pdev->dev, + wq->memsize, wq->queue, + pci_unmap_addr(wq, mapping)); + c4iw_rqtpool_free(rdev, wq->rqt_hwaddr, wq->rqt_size); + kfree(wq->sw_rq); + c4iw_put_qpid(rdev, wq->qid, uctx); +} + +static int alloc_srq_queue(struct c4iw_srq *srq, struct c4iw_dev_ucontext *uctx, + struct c4iw_wr_wait *wr_waitp) +{ + struct c4iw_rdev *rdev = &srq->rhp->rdev; + int user = (uctx != &rdev->uctx); + struct t4_srq *wq = &srq->wq; + struct fw_ri_res_wr *res_wr; + struct fw_ri_res *res; + struct sk_buff *skb; + int wr_len; + int eqsize; + int ret = -ENOMEM; + + wq->qid = c4iw_get_qpid(rdev, uctx); + if (!wq->qid) + goto err; + + if (!user) { + wq->sw_rq = kcalloc(wq->size, sizeof(*wq->sw_rq), + GFP_KERNEL); + if (!wq->sw_rq) + goto err_put_qpid; + wq->pending_wrs = kcalloc(srq->wq.size, + sizeof(*srq->wq.pending_wrs), + GFP_KERNEL); + if (!wq->pending_wrs) + goto err_free_sw_rq; + } + + wq->rqt_size = wq->size; + wq->rqt_hwaddr = c4iw_rqtpool_alloc(rdev, wq->rqt_size); + if (!wq->rqt_hwaddr) + goto err_free_pending_wrs; + wq->rqt_abs_idx = (wq->rqt_hwaddr - rdev->lldi.vr->rq.start) >> + T4_RQT_ENTRY_SHIFT; + + wq->queue = dma_alloc_coherent(&rdev->lldi.pdev->dev, + wq->memsize, &wq->dma_addr, + GFP_KERNEL); + if (!wq->queue) + goto err_free_rqtpool; + + memset(wq->queue, 0, wq->memsize); + pci_unmap_addr_set(wq, mapping, wq->dma_addr); + + wq->bar2_va = c4iw_bar2_addrs(rdev, wq->qid, T4_BAR2_QTYPE_EGRESS, + &wq->bar2_qid, + user ? &wq->bar2_pa : NULL); + + /* + * User mode must have bar2 access. + */ + + if (user && !wq->bar2_va) { + pr_warn(MOD "%s: srqid %u not in BAR2 range.\n", + pci_name(rdev->lldi.pdev), wq->qid); + ret = -EINVAL; + goto err_free_queue; + } + + /* build fw_ri_res_wr */ + wr_len = sizeof(*res_wr) + sizeof(*res); + + skb = alloc_skb(wr_len, GFP_KERNEL | __GFP_NOFAIL); + if (!skb) + goto err_free_queue; + set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0); + + res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len); + memset(res_wr, 0, wr_len); + res_wr->op_nres = cpu_to_be32(FW_WR_OP_V(FW_RI_RES_WR) | + FW_RI_RES_WR_NRES_V(1) | + FW_WR_COMPL_F); + res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16)); + res_wr->cookie = (uintptr_t)wr_waitp; + res = res_wr->res; + res->u.srq.restype = FW_RI_RES_TYPE_SRQ; + res->u.srq.op = FW_RI_RES_OP_WRITE; + + /* + * eqsize is the number of 64B entries plus the status page size. 
+ */ + eqsize = wq->size * T4_RQ_NUM_SLOTS + + rdev->hw_queue.t4_eq_status_entries; + res->u.srq.eqid = cpu_to_be32(wq->qid); + res->u.srq.fetchszm_to_iqid = + /* no host cidx updates */ + cpu_to_be32(FW_RI_RES_WR_HOSTFCMODE_V(0) | + FW_RI_RES_WR_CPRIO_V(0) | /* don't keep in chip cache */ + FW_RI_RES_WR_PCIECHN_V(0) | /* set by uP at ri_init time */ + FW_RI_RES_WR_FETCHRO_V(0)); /* relaxed_ordering */ + res->u.srq.dcaen_to_eqsize = + cpu_to_be32(FW_RI_RES_WR_DCAEN_V(0) | + FW_RI_RES_WR_DCACPU_V(0) | + FW_RI_RES_WR_FBMIN_V(2) | + FW_RI_RES_WR_FBMAX_V(3) | + FW_RI_RES_WR_CIDXFTHRESHO_V(0) | + FW_RI_RES_WR_CIDXFTHRESH_V(0) | + FW_RI_RES_WR_EQSIZE_V(eqsize)); + res->u.srq.eqaddr = cpu_to_be64(wq->dma_addr); + res->u.srq.srqid = cpu_to_be32(srq->idx); + res->u.srq.pdid = cpu_to_be32(srq->pdid); + res->u.srq.hwsrqsize = cpu_to_be32(wq->rqt_size); + res->u.srq.hwsrqaddr = cpu_to_be32(wq->rqt_hwaddr - + rdev->lldi.vr->rq.start); + + c4iw_init_wr_wait(wr_waitp); + + ret = c4iw_ref_send_wait(rdev, skb, wr_waitp, 0, wq->qid, __func__); + if (ret) + goto err_free_queue; + + pr_debug("%s srq %u eqid %u pdid %u queue va %p pa 0x%llx\n" + " bar2_addr %p rqt addr 0x%x size %d\n", + __func__, srq->idx, wq->qid, srq->pdid, wq->queue, + (u64)virt_to_phys(wq->queue), wq->bar2_va, + wq->rqt_hwaddr, wq->rqt_size); + + return 0; +err_free_queue: + dma_free_coherent(&rdev->lldi.pdev->dev, + wq->memsize, wq->queue, + pci_unmap_addr(wq, mapping)); +err_free_rqtpool: + c4iw_rqtpool_free(rdev, wq->rqt_hwaddr, wq->rqt_size); +err_free_pending_wrs: + if (!user) + kfree(wq->pending_wrs); +err_free_sw_rq: + if (!user) + kfree(wq->sw_rq); +err_put_qpid: + c4iw_put_qpid(rdev, wq->qid, uctx); +err: + return ret; +} + +void c4iw_copy_wr_to_srq(struct t4_srq *srq, union t4_recv_wr *wqe, u8 len16) +{ + u64 *src, *dst; + + src = (u64 *)wqe; + dst = (u64 *)((u8 *)srq->queue + srq->wq_pidx * T4_EQ_ENTRY_SIZE); + while (len16) { + *dst++ = *src++; + if (dst >= (u64 *)&srq->queue[srq->size]) + dst = (u64 *)srq->queue; + *dst++ = *src++; + if (dst >= (u64 *)&srq->queue[srq->size]) + dst = (u64 *)srq->queue; + len16--; + } +} + +struct ib_srq *c4iw_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *attrs, + struct ib_udata *udata) +{ + struct c4iw_dev *rhp; + struct c4iw_srq *srq; + struct c4iw_pd *php; + struct c4iw_create_srq_resp uresp; + struct c4iw_ucontext *ucontext; + struct c4iw_mm_entry *srq_key_mm, *srq_db_key_mm; + int rqsize; + int ret; + int wr_len; + + pr_debug("%s ib_pd %p\n", __func__, pd); + + php = to_c4iw_pd(pd); + rhp = php->rhp; + + if (!rhp->rdev.lldi.vr->srq.size) + return ERR_PTR(-EINVAL); + if (attrs->attr.max_wr > rhp->rdev.hw_queue.t4_max_rq_size) + return ERR_PTR(-E2BIG); + if (attrs->attr.max_sge > T4_MAX_RECV_SGE) + return ERR_PTR(-E2BIG); + + /* + * SRQ RQT and RQ must be a power of 2 and at least 16 deep. + */ + rqsize = attrs->attr.max_wr + 1; + rqsize = roundup_pow_of_two(max_t(u16, rqsize, 16)); + + ucontext = pd->uobject ? 
to_c4iw_ucontext(pd->uobject->context) : NULL; + + srq = kzalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + srq->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL); + if (!srq->wr_waitp) { + ret = -ENOMEM; + goto err_free_srq; + } + + srq->idx = c4iw_alloc_srq_idx(&rhp->rdev); + if (srq->idx < 0) { + ret = -ENOMEM; + goto err_free_wr_wait; + } + + wr_len = sizeof(struct fw_ri_res_wr) + sizeof(struct fw_ri_res); + srq->destroy_skb = alloc_skb(wr_len, GFP_KERNEL); + if (!srq->destroy_skb) { + ret = -ENOMEM; + goto err_free_srq_idx; + } + + srq->rhp = rhp; + srq->pdid = php->pdid; + + srq->wq.size = rqsize; + srq->wq.memsize = + (rqsize + rhp->rdev.hw_queue.t4_eq_status_entries) * + sizeof(*srq->wq.queue); + if (ucontext) + srq->wq.memsize = roundup(srq->wq.memsize, PAGE_SIZE); + + ret = alloc_srq_queue(srq, ucontext ? &ucontext->uctx : + &rhp->rdev.uctx, srq->wr_waitp); + if (ret) + goto err_free_skb; + attrs->attr.max_wr = rqsize - 1; + + if (CHELSIO_CHIP_VERSION(rhp->rdev.lldi.adapter_type) > CHELSIO_T6) + srq->flags = T4_SRQ_LIMIT_SUPPORT; + + ret = insert_handle(rhp, &rhp->qpidr, srq, srq->wq.qid); + if (ret) + goto err_free_queue; + + if (udata) { + srq_key_mm = kmalloc(sizeof(*srq_key_mm), GFP_KERNEL); + if (!srq_key_mm) { + ret = -ENOMEM; + goto err_remove_handle; + } + srq_db_key_mm = kmalloc(sizeof(*srq_db_key_mm), GFP_KERNEL); + if (!srq_db_key_mm) { + ret = -ENOMEM; + goto err_free_srq_key_mm; + } + memset(&uresp, 0, sizeof(uresp)); + uresp.flags = srq->flags; + uresp.qid_mask = rhp->rdev.qpmask; + uresp.srqid = srq->wq.qid; + uresp.srq_size = srq->wq.size; + uresp.srq_memsize = srq->wq.memsize; + uresp.rqt_abs_idx = srq->wq.rqt_abs_idx; + spin_lock(&ucontext->mmap_lock); + uresp.srq_key = ucontext->key; + ucontext->key += PAGE_SIZE; + uresp.srq_db_gts_key = ucontext->key; + ucontext->key += PAGE_SIZE; + spin_unlock(&ucontext->mmap_lock); + ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (ret) + goto err_free_srq_db_key_mm; + srq_key_mm->key = uresp.srq_key; + srq_key_mm->addr = virt_to_phys(srq->wq.queue); + srq_key_mm->len = PAGE_ALIGN(srq->wq.memsize); + insert_mmap(ucontext, srq_key_mm); + srq_db_key_mm->key = uresp.srq_db_gts_key; + srq_db_key_mm->addr = (u64)(unsigned long)srq->wq.bar2_pa; + srq_db_key_mm->len = PAGE_SIZE; + insert_mmap(ucontext, srq_db_key_mm); + } + + pr_debug("%s srq qid %u idx %u size %u memsize %lu num_entries %u\n", + __func__, srq->wq.qid, srq->idx, srq->wq.size, + (unsigned long)srq->wq.memsize, attrs->attr.max_wr); + + spin_lock_init(&srq->lock); + return &srq->ibsrq; +err_free_srq_db_key_mm: + kfree(srq_db_key_mm); +err_free_srq_key_mm: + kfree(srq_key_mm); +err_remove_handle: + remove_handle(rhp, &rhp->qpidr, srq->wq.qid); +err_free_queue: + free_srq_queue(srq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx, + srq->wr_waitp); +err_free_skb: + if (srq->destroy_skb) + kfree_skb(srq->destroy_skb); +err_free_srq_idx: + c4iw_free_srq_idx(&rhp->rdev, srq->idx); +err_free_wr_wait: + c4iw_put_wr_wait(srq->wr_waitp); +err_free_srq: + kfree(srq); + return ERR_PTR(ret); +} + +int c4iw_destroy_srq(struct ib_srq *ibsrq) +{ + struct c4iw_dev *rhp; + struct c4iw_srq *srq; + struct c4iw_ucontext *ucontext; + + srq = to_c4iw_srq(ibsrq); + rhp = srq->rhp; + + pr_debug("%s id %d\n", __func__, srq->wq.qid); + + remove_handle(rhp, &rhp->qpidr, srq->wq.qid); + ucontext = ibsrq->uobject ? + to_c4iw_ucontext(ibsrq->uobject->context) : NULL; + free_srq_queue(srq, ucontext ? 
&ucontext->uctx : &rhp->rdev.uctx, + srq->wr_waitp); + c4iw_free_srq_idx(&rhp->rdev, srq->idx); + c4iw_put_wr_wait(srq->wr_waitp); + kfree(srq); + return 0; +} diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c index 0ef25ae05e6f..57ed26b3cc21 100644 --- a/drivers/infiniband/hw/cxgb4/resource.c +++ b/drivers/infiniband/hw/cxgb4/resource.c @@ -53,7 +53,8 @@ static int c4iw_init_qid_table(struct c4iw_rdev *rdev) } /* nr_* must be power of 2 */ -int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid) +int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, + u32 nr_pdid, u32 nr_srqt) { int err = 0; err = c4iw_id_table_alloc(&rdev->resource.tpt_table, 0, nr_tpt, 1, @@ -67,7 +68,17 @@ int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid) nr_pdid, 1, 0); if (err) goto pdid_err; + if (!nr_srqt) + err = c4iw_id_table_alloc(&rdev->resource.srq_table, 0, + 1, 1, 0); + else + err = c4iw_id_table_alloc(&rdev->resource.srq_table, 0, + nr_srqt, 0, 0); + if (err) + goto srq_err; return 0; + srq_err: + c4iw_id_table_free(&rdev->resource.pdid_table); pdid_err: c4iw_id_table_free(&rdev->resource.qid_table); qid_err: @@ -371,13 +382,21 @@ void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size) int c4iw_rqtpool_create(struct c4iw_rdev *rdev) { unsigned rqt_start, rqt_chunk, rqt_top; + int skip = 0; rdev->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1); if (!rdev->rqt_pool) return -ENOMEM; - rqt_start = rdev->lldi.vr->rq.start; - rqt_chunk = rdev->lldi.vr->rq.size; + /* + * If SRQs are supported, then never use the first RQE from + * the RQT region. This is because HW uses RQT index 0 as NULL. + */ + if (rdev->lldi.vr->srq.size) + skip = T4_RQT_ENTRY_SIZE; + + rqt_start = rdev->lldi.vr->rq.start + skip; + rqt_chunk = rdev->lldi.vr->rq.size - skip; rqt_top = rqt_start + rqt_chunk; while (rqt_start < rqt_top) { @@ -405,6 +424,32 @@ void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev) kref_put(&rdev->rqt_kref, destroy_rqtpool); } +int c4iw_alloc_srq_idx(struct c4iw_rdev *rdev) +{ + int idx; + + idx = c4iw_id_alloc(&rdev->resource.srq_table); + mutex_lock(&rdev->stats.lock); + if (idx == -1) { + rdev->stats.srqt.fail++; + mutex_unlock(&rdev->stats.lock); + return -ENOMEM; + } + rdev->stats.srqt.cur++; + if (rdev->stats.srqt.cur > rdev->stats.srqt.max) + rdev->stats.srqt.max = rdev->stats.srqt.cur; + mutex_unlock(&rdev->stats.lock); + return idx; +} + +void c4iw_free_srq_idx(struct c4iw_rdev *rdev, int idx) +{ + c4iw_id_free(&rdev->resource.srq_table, idx); + mutex_lock(&rdev->stats.lock); + rdev->stats.srqt.cur--; + mutex_unlock(&rdev->stats.lock); +} + /* * On-Chip QP Memory. 
*/ diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index 8369c7c8de83..e42021fd6fd6 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -52,12 +52,16 @@ struct t4_status_page { __be16 pidx; u8 qp_err; /* flit 1 - sw owns */ u8 db_off; - u8 pad; + u8 pad[2]; u16 host_wq_pidx; u16 host_cidx; u16 host_pidx; + u16 pad2; + u32 srqidx; }; +#define T4_RQT_ENTRY_SHIFT 6 +#define T4_RQT_ENTRY_SIZE BIT(T4_RQT_ENTRY_SHIFT) #define T4_EQ_ENTRY_SIZE 64 #define T4_SQ_NUM_SLOTS 5 @@ -87,6 +91,9 @@ static inline int t4_max_fr_depth(int use_dsgl) #define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS) #define T4_MAX_RECV_SGE 4 +#define T4_WRITE_CMPL_MAX_SGL 4 +#define T4_WRITE_CMPL_MAX_CQE 16 + union t4_wr { struct fw_ri_res_wr res; struct fw_ri_wr ri; @@ -97,6 +104,7 @@ union t4_wr { struct fw_ri_fr_nsmr_wr fr; struct fw_ri_fr_nsmr_tpte_wr fr_tpte; struct fw_ri_inv_lstag_wr inv; + struct fw_ri_rdma_write_cmpl_wr write_cmpl; struct t4_status_page status; __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS]; }; @@ -179,9 +187,32 @@ struct t4_cqe { __be32 wrid_hi; __be32 wrid_low; } gen; + struct { + __be32 stag; + __be32 msn; + __be32 reserved; + __be32 abs_rqe_idx; + } srcqe; + struct { + __be32 mo; + __be32 msn; + /* + * Use union for immediate data to be consistent with + * stack's 32 bit data and iWARP spec's 64 bit data. + */ + union { + struct { + __be32 imm_data32; + u32 reserved; + } ib_imm_data; + __be64 imm_data64; + } iw_imm_data; + } imm_data_rcqe; + u64 drain_cookie; + __be64 flits[3]; } u; - __be64 reserved; + __be64 reserved[3]; __be64 bits_type_ts; }; @@ -237,6 +268,9 @@ struct t4_cqe { /* used for RQ completion processing */ #define CQE_WRID_STAG(x) (be32_to_cpu((x)->u.rcqe.stag)) #define CQE_WRID_MSN(x) (be32_to_cpu((x)->u.rcqe.msn)) +#define CQE_ABS_RQE_IDX(x) (be32_to_cpu((x)->u.srcqe.abs_rqe_idx)) +#define CQE_IMM_DATA(x)( \ + (x)->u.imm_data_rcqe.iw_imm_data.ib_imm_data.imm_data32) /* used for SQ completion processing */ #define CQE_WRID_SQ_IDX(x) ((x)->u.scqe.cidx) @@ -320,6 +354,7 @@ struct t4_swrqe { u64 wr_id; ktime_t host_time; u64 sge_ts; + int valid; }; struct t4_rq { @@ -349,8 +384,98 @@ struct t4_wq { void __iomem *db; struct c4iw_rdev *rdev; int flushed; + u8 *qp_errp; + u32 *srqidxp; +}; + +struct t4_srq_pending_wr { + u64 wr_id; + union t4_recv_wr wqe; + u8 len16; +}; + +struct t4_srq { + union t4_recv_wr *queue; + dma_addr_t dma_addr; + DECLARE_PCI_UNMAP_ADDR(mapping); + struct t4_swrqe *sw_rq; + void __iomem *bar2_va; + u64 bar2_pa; + size_t memsize; + u32 bar2_qid; + u32 qid; + u32 msn; + u32 rqt_hwaddr; + u32 rqt_abs_idx; + u16 rqt_size; + u16 size; + u16 cidx; + u16 pidx; + u16 wq_pidx; + u16 wq_pidx_inc; + u16 in_use; + struct t4_srq_pending_wr *pending_wrs; + u16 pending_cidx; + u16 pending_pidx; + u16 pending_in_use; + u16 ooo_count; }; +static inline u32 t4_srq_avail(struct t4_srq *srq) +{ + return srq->size - 1 - srq->in_use; +} + +static inline void t4_srq_produce(struct t4_srq *srq, u8 len16) +{ + srq->in_use++; + if (++srq->pidx == srq->size) + srq->pidx = 0; + srq->wq_pidx += DIV_ROUND_UP(len16 * 16, T4_EQ_ENTRY_SIZE); + if (srq->wq_pidx >= srq->size * T4_RQ_NUM_SLOTS) + srq->wq_pidx %= srq->size * T4_RQ_NUM_SLOTS; + srq->queue[srq->size].status.host_pidx = srq->pidx; +} + +static inline void t4_srq_produce_pending_wr(struct t4_srq *srq) +{ + srq->pending_in_use++; + srq->in_use++; + if (++srq->pending_pidx == srq->size) + srq->pending_pidx = 0; +} + +static inline void 
t4_srq_consume_pending_wr(struct t4_srq *srq) +{ + srq->pending_in_use--; + srq->in_use--; + if (++srq->pending_cidx == srq->size) + srq->pending_cidx = 0; +} + +static inline void t4_srq_produce_ooo(struct t4_srq *srq) +{ + srq->in_use--; + srq->ooo_count++; +} + +static inline void t4_srq_consume_ooo(struct t4_srq *srq) +{ + srq->cidx++; + if (srq->cidx == srq->size) + srq->cidx = 0; + srq->queue[srq->size].status.host_cidx = srq->cidx; + srq->ooo_count--; +} + +static inline void t4_srq_consume(struct t4_srq *srq) +{ + srq->in_use--; + if (++srq->cidx == srq->size) + srq->cidx = 0; + srq->queue[srq->size].status.host_cidx = srq->cidx; +} + static inline int t4_rqes_posted(struct t4_wq *wq) { return wq->rq.in_use; @@ -384,7 +509,6 @@ static inline void t4_rq_produce(struct t4_wq *wq, u8 len16) static inline void t4_rq_consume(struct t4_wq *wq) { wq->rq.in_use--; - wq->rq.msn++; if (++wq->rq.cidx == wq->rq.size) wq->rq.cidx = 0; } @@ -464,6 +588,25 @@ static inline void pio_copy(u64 __iomem *dst, u64 *src) } } +static inline void t4_ring_srq_db(struct t4_srq *srq, u16 inc, u8 len16, + union t4_recv_wr *wqe) +{ + /* Flush host queue memory writes. */ + wmb(); + if (inc == 1 && srq->bar2_qid == 0 && wqe) { + pr_debug("%s : WC srq->pidx = %d; len16=%d\n", + __func__, srq->pidx, len16); + pio_copy(srq->bar2_va + SGE_UDB_WCDOORBELL, (u64 *)wqe); + } else { + pr_debug("%s: DB srq->pidx = %d; len16=%d\n", + __func__, srq->pidx, len16); + writel(PIDX_T5_V(inc) | QID_V(srq->bar2_qid), + srq->bar2_va + SGE_UDB_KDOORBELL); + } + /* Flush user doorbell area writes. */ + wmb(); +} + static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, union t4_wr *wqe) { @@ -515,12 +658,14 @@ static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc, static inline int t4_wq_in_error(struct t4_wq *wq) { - return wq->rq.queue[wq->rq.size].status.qp_err; + return *wq->qp_errp; } -static inline void t4_set_wq_in_error(struct t4_wq *wq) +static inline void t4_set_wq_in_error(struct t4_wq *wq, u32 srqidx) { - wq->rq.queue[wq->rq.size].status.qp_err = 1; + if (srqidx) + *wq->srqidxp = srqidx; + *wq->qp_errp = 1; } static inline void t4_disable_wq_db(struct t4_wq *wq) @@ -565,6 +710,7 @@ struct t4_cq { u16 cidx_inc; u8 gen; u8 error; + u8 *qp_errp; unsigned long flags; }; @@ -698,18 +844,18 @@ static inline int t4_next_cqe(struct t4_cq *cq, struct t4_cqe **cqe) static inline int t4_cq_in_error(struct t4_cq *cq) { - return ((struct t4_status_page *)&cq->queue[cq->size])->qp_err; + return *cq->qp_errp; } static inline void t4_set_cq_in_error(struct t4_cq *cq) { - ((struct t4_status_page *)&cq->queue[cq->size])->qp_err = 1; + *cq->qp_errp = 1; } #endif struct t4_dev_status_page { u8 db_off; - u8 pad1; + u8 write_cmpl_supported; u16 pad2; u32 pad3; u64 qp_start; diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h index 58c531db4f4a..cbdb300a4794 100644 --- a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h +++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h @@ -50,7 +50,8 @@ enum fw_ri_wr_opcode { FW_RI_BYPASS = 0xd, FW_RI_RECEIVE = 0xe, - FW_RI_SGE_EC_CR_RETURN = 0xf + FW_RI_SGE_EC_CR_RETURN = 0xf, + FW_RI_WRITE_IMMEDIATE = FW_RI_RDMA_INIT }; enum fw_ri_wr_flags { @@ -59,7 +60,8 @@ enum fw_ri_wr_flags { FW_RI_SOLICITED_EVENT_FLAG = 0x04, FW_RI_READ_FENCE_FLAG = 0x08, FW_RI_LOCAL_FENCE_FLAG = 0x10, - FW_RI_RDMA_READ_INVALIDATE = 0x20 + FW_RI_RDMA_READ_INVALIDATE = 0x20, + FW_RI_RDMA_WRITE_WITH_IMMEDIATE = 0x40 }; enum fw_ri_mpa_attrs { @@ -263,6 +265,7 @@ enum fw_ri_res_type { 
FW_RI_RES_TYPE_SQ, FW_RI_RES_TYPE_RQ, FW_RI_RES_TYPE_CQ, + FW_RI_RES_TYPE_SRQ, }; enum fw_ri_res_op { @@ -296,6 +299,20 @@ struct fw_ri_res { __be32 r6_lo; __be64 r7; } cq; + struct fw_ri_res_srq { + __u8 restype; + __u8 op; + __be16 r3; + __be32 eqid; + __be32 r4[2]; + __be32 fetchszm_to_iqid; + __be32 dcaen_to_eqsize; + __be64 eqaddr; + __be32 srqid; + __be32 pdid; + __be32 hwsrqsize; + __be32 hwsrqaddr; + } srq; } u; }; @@ -531,7 +548,17 @@ struct fw_ri_rdma_write_wr { __u16 wrid; __u8 r1[3]; __u8 len16; - __be64 r2; + /* + * Use union for immediate data to be consistent with stack's 32 bit + * data and iWARP spec's 64 bit data. + */ + union { + struct { + __be32 imm_data32; + u32 reserved; + } ib_imm_data; + __be64 imm_data64; + } iw_imm_data; __be32 plen; __be32 stag_sink; __be64 to_sink; @@ -568,6 +595,37 @@ struct fw_ri_send_wr { #define FW_RI_SEND_WR_SENDOP_G(x) \ (((x) >> FW_RI_SEND_WR_SENDOP_S) & FW_RI_SEND_WR_SENDOP_M) +struct fw_ri_rdma_write_cmpl_wr { + __u8 opcode; + __u8 flags; + __u16 wrid; + __u8 r1[3]; + __u8 len16; + __u8 r2; + __u8 flags_send; + __u16 wrid_send; + __be32 stag_inv; + __be32 plen; + __be32 stag_sink; + __be64 to_sink; + union fw_ri_cmpl { + struct fw_ri_immd_cmpl { + __u8 op; + __u8 r1[6]; + __u8 immdlen; + __u8 data[16]; + } immd_src; + struct fw_ri_isgl isgl_src; + } u_cmpl; + __be64 r3; +#ifndef C99_NOT_SUPPORTED + union fw_ri_write { + struct fw_ri_immd immd_src[0]; + struct fw_ri_isgl isgl_src[0]; + } u; +#endif +}; + struct fw_ri_rdma_read_wr { __u8 opcode; __u8 flags; @@ -707,6 +765,10 @@ enum fw_ri_init_p2ptype { FW_RI_INIT_P2PTYPE_DISABLED = 0xf, }; +enum fw_ri_init_rqeqid_srq { + FW_RI_INIT_RQEQID_SRQ = 1 << 31, +}; + struct fw_ri_wr { __be32 op_compl; __be32 flowid_len16; diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 6deb101cdd43..2c19bf772451 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -8143,8 +8143,15 @@ static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source) } } -/* +/** + * is_rcv_avail_int() - User receive context available IRQ handler + * @dd: valid dd + * @source: logical IRQ source (offset from IS_RCVAVAIL_START) + * * RX block receive available interrupt. Source is < 160. + * + * This is the general interrupt handler for user (PSM) receive contexts, + * and can only be used for non-threaded IRQs. */ static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) { @@ -8154,12 +8161,7 @@ static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) if (likely(source < dd->num_rcv_contexts)) { rcd = hfi1_rcd_get_by_index(dd, source); if (rcd) { - /* Check for non-user contexts, including vnic */ - if (source < dd->first_dyn_alloc_ctxt || rcd->is_vnic) - rcd->do_interrupt(rcd, 0); - else - handle_user_interrupt(rcd); - + handle_user_interrupt(rcd); hfi1_rcd_put(rcd); return; /* OK */ } @@ -8173,8 +8175,14 @@ static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) err_detail, source); } -/* +/** + * is_rcv_urgent_int() - User receive context urgent IRQ handler + * @dd: valid dd + * @source: logical IRQ source (ofse from IS_RCVURGENT_START) + * * RX block receive urgent interrupt. Source is < 160. + * + * NOTE: kernel receive contexts specifically do NOT enable this IRQ. 
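/*
 * Editor's sketch (not part of the patch), referring back to the t4.h
 * SRQ helpers above: a minimal post-receive path checks for space,
 * builds the WR at the slot addressed by wq_pidx, publishes the new
 * pidx, then rings the doorbell.  Locking and the pending/out-of-order
 * WR paths are omitted; the slot arithmetic is an assumption based on
 * the existing RQ convention.
 */
static int example_post_one_srq_wr(struct t4_srq *srq, u8 len16)
{
	union t4_recv_wr *wqe;

	if (!t4_srq_avail(srq))
		return -ENOMEM;				/* ring full */

	/* WRs are laid out in 64-byte EQ slots, addressed via wq_pidx */
	wqe = (union t4_recv_wr *)((u8 *)srq->queue +
				   srq->wq_pidx * T4_EQ_ENTRY_SIZE);
	/* ... build the FW_RI_RECV_WR and SGL into *wqe here ... */

	t4_srq_produce(srq, len16);	/* advance pidx/wq_pidx, update host_pidx */

	/* PIDX increment is in EQ-slot units; a single small WR can take
	 * the write-combining doorbell copy inside t4_ring_srq_db(). */
	t4_ring_srq_db(srq, DIV_ROUND_UP(len16 * 16, T4_EQ_ENTRY_SIZE),
		       len16, wqe);
	return 0;
}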
*/ static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source) { @@ -8184,11 +8192,7 @@ static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source) if (likely(source < dd->num_rcv_contexts)) { rcd = hfi1_rcd_get_by_index(dd, source); if (rcd) { - /* only pay attention to user urgent interrupts */ - if (source >= dd->first_dyn_alloc_ctxt && - !rcd->is_vnic) - handle_user_interrupt(rcd); - + handle_user_interrupt(rcd); hfi1_rcd_put(rcd); return; /* OK */ } @@ -8260,9 +8264,14 @@ static void is_interrupt(struct hfi1_devdata *dd, unsigned int source) dd_dev_err(dd, "invalid interrupt source %u\n", source); } -/* - * General interrupt handler. This is able to correctly handle - * all interrupts in case INTx is used. +/** + * gerneral_interrupt() - General interrupt handler + * @irq: MSIx IRQ vector + * @data: hfi1 devdata + * + * This is able to correctly handle all non-threaded interrupts. Receive + * context DATA IRQs are threaded and are not supported by this handler. + * */ static irqreturn_t general_interrupt(int irq, void *data) { @@ -10130,7 +10139,7 @@ static void set_lidlmc(struct hfi1_pportdata *ppd) (((lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) << SEND_CTXT_CHECK_SLID_VALUE_SHIFT); - for (i = 0; i < dd->chip_send_contexts; i++) { + for (i = 0; i < chip_send_contexts(dd); i++) { hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x", i, (u32)sreg); write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg); @@ -11857,7 +11866,7 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, * sequence numbers could land exactly on the same spot. * E.g. a rcd restart before the receive header wrapped. */ - memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size); + memset(rcd->rcvhdrq, 0, rcvhdrq_size(rcd)); /* starting timeout */ rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr; @@ -11952,9 +11961,8 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS) rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; - rcd->rcvctrl = rcvctrl; hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl); - write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl); + write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcvctrl); /* work around sticky RcvCtxtStatus.BlockedRHQFull */ if (did_enable && @@ -12042,7 +12050,7 @@ u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp) } else if (entry->flags & CNTR_SDMA) { hfi1_cdbg(CNTR, "\t Per SDMA Engine\n"); - for (j = 0; j < dd->chip_sdma_engines; + for (j = 0; j < chip_sdma_engines(dd); j++) { val = entry->rw_cntr(entry, dd, j, @@ -12418,6 +12426,7 @@ static int init_cntrs(struct hfi1_devdata *dd) struct hfi1_pportdata *ppd; const char *bit_type_32 = ",32"; const int bit_type_32_sz = strlen(bit_type_32); + u32 sdma_engines = chip_sdma_engines(dd); /* set up the stats timer; the add_timer is done at the end */ timer_setup(&dd->synth_stats_timer, update_synth_timer, 0); @@ -12450,7 +12459,7 @@ static int init_cntrs(struct hfi1_devdata *dd) } } else if (dev_cntrs[i].flags & CNTR_SDMA) { dev_cntrs[i].offset = dd->ndevcntrs; - for (j = 0; j < dd->chip_sdma_engines; j++) { + for (j = 0; j < sdma_engines; j++) { snprintf(name, C_MAX_NAME, "%s%d", dev_cntrs[i].name, j); sz += strlen(name); @@ -12507,7 +12516,7 @@ static int init_cntrs(struct hfi1_devdata *dd) *p++ = '\n'; } } else if (dev_cntrs[i].flags & CNTR_SDMA) { - for (j = 0; j < dd->chip_sdma_engines; j++) { + for (j = 0; j < sdma_engines; j++) { snprintf(name, C_MAX_NAME, "%s%d", 
dev_cntrs[i].name, j); memcpy(p, name, strlen(name)); @@ -13020,9 +13029,9 @@ static void clear_all_interrupts(struct hfi1_devdata *dd) write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0); write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0); write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0); - for (i = 0; i < dd->chip_send_contexts; i++) + for (i = 0; i < chip_send_contexts(dd); i++) write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0); - for (i = 0; i < dd->chip_sdma_engines; i++) + for (i = 0; i < chip_sdma_engines(dd); i++) write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0); write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0); @@ -13030,48 +13039,30 @@ static void clear_all_interrupts(struct hfi1_devdata *dd) write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0); } -/* Move to pcie.c? */ -static void disable_intx(struct pci_dev *pdev) -{ - pci_intx(pdev, 0); -} - /** * hfi1_clean_up_interrupts() - Free all IRQ resources * @dd: valid device data data structure * - * Free the MSI or INTx IRQs and assoicated PCI resources, - * if they have been allocated. + * Free the MSIx and assoicated PCI resources, if they have been allocated. */ void hfi1_clean_up_interrupts(struct hfi1_devdata *dd) { int i; + struct hfi1_msix_entry *me = dd->msix_entries; /* remove irqs - must happen before disabling/turning off */ - if (dd->num_msix_entries) { - /* MSI-X */ - struct hfi1_msix_entry *me = dd->msix_entries; - - for (i = 0; i < dd->num_msix_entries; i++, me++) { - if (!me->arg) /* => no irq, no affinity */ - continue; - hfi1_put_irq_affinity(dd, me); - pci_free_irq(dd->pcidev, i, me->arg); - } - - /* clean structures */ - kfree(dd->msix_entries); - dd->msix_entries = NULL; - dd->num_msix_entries = 0; - } else { - /* INTx */ - if (dd->requested_intx_irq) { - pci_free_irq(dd->pcidev, 0, dd); - dd->requested_intx_irq = 0; - } - disable_intx(dd->pcidev); + for (i = 0; i < dd->num_msix_entries; i++, me++) { + if (!me->arg) /* => no irq, no affinity */ + continue; + hfi1_put_irq_affinity(dd, me); + pci_free_irq(dd->pcidev, i, me->arg); } + /* clean structures */ + kfree(dd->msix_entries); + dd->msix_entries = NULL; + dd->num_msix_entries = 0; + pci_free_irq_vectors(dd->pcidev); } @@ -13121,20 +13112,6 @@ static void remap_sdma_interrupts(struct hfi1_devdata *dd, msix_intr); } -static int request_intx_irq(struct hfi1_devdata *dd) -{ - int ret; - - ret = pci_request_irq(dd->pcidev, 0, general_interrupt, NULL, dd, - DRIVER_NAME "_%d", dd->unit); - if (ret) - dd_dev_err(dd, "unable to request INTx interrupt, err %d\n", - ret); - else - dd->requested_intx_irq = 1; - return ret; -} - static int request_msix_irqs(struct hfi1_devdata *dd) { int first_general, last_general; @@ -13253,11 +13230,6 @@ void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd) { int i; - if (!dd->num_msix_entries) { - synchronize_irq(pci_irq_vector(dd->pcidev, 0)); - return; - } - for (i = 0; i < dd->vnic.num_ctxt; i++) { struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i]; struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; @@ -13346,7 +13318,6 @@ static int set_up_interrupts(struct hfi1_devdata *dd) { u32 total; int ret, request; - int single_interrupt = 0; /* we expect to have all the interrupts */ /* * Interrupt count: @@ -13363,17 +13334,6 @@ static int set_up_interrupts(struct hfi1_devdata *dd) if (request < 0) { ret = request; goto fail; - } else if (request == 0) { - /* using INTx */ - /* dd->num_msix_entries already zero */ - single_interrupt = 1; - dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n"); - } else if (request < total) { - /* using MSI-X, with reduced 
interrupts */ - dd_dev_err(dd, "reduced interrupt found, wanted %u, got %u\n", - total, request); - ret = -EINVAL; - goto fail; } else { dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries), GFP_KERNEL); @@ -13394,10 +13354,7 @@ static int set_up_interrupts(struct hfi1_devdata *dd) /* reset general handler mask, chip MSI-X mappings */ reset_interrupts(dd); - if (single_interrupt) - ret = request_intx_irq(dd); - else - ret = request_msix_irqs(dd); + ret = request_msix_irqs(dd); if (ret) goto fail; @@ -13429,6 +13386,8 @@ static int set_up_context_variables(struct hfi1_devdata *dd) int qos_rmt_count; int user_rmt_reduced; u32 n_usr_ctxts; + u32 send_contexts = chip_send_contexts(dd); + u32 rcv_contexts = chip_rcv_contexts(dd); /* * Kernel receive contexts: @@ -13450,16 +13409,16 @@ static int set_up_context_variables(struct hfi1_devdata *dd) * Every kernel receive context needs an ACK send context. * one send context is allocated for each VL{0-7} and VL15 */ - if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) { + if (num_kernel_contexts > (send_contexts - num_vls - 1)) { dd_dev_err(dd, "Reducing # kernel rcv contexts to: %d, from %lu\n", - (int)(dd->chip_send_contexts - num_vls - 1), + send_contexts - num_vls - 1, num_kernel_contexts); - num_kernel_contexts = dd->chip_send_contexts - num_vls - 1; + num_kernel_contexts = send_contexts - num_vls - 1; } /* Accommodate VNIC contexts if possible */ - if ((num_kernel_contexts + num_vnic_contexts) > dd->chip_rcv_contexts) { + if ((num_kernel_contexts + num_vnic_contexts) > rcv_contexts) { dd_dev_err(dd, "No receive contexts available for VNIC\n"); num_vnic_contexts = 0; } @@ -13477,13 +13436,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd) /* * Adjust the counts given a global max. */ - if (total_contexts + n_usr_ctxts > dd->chip_rcv_contexts) { + if (total_contexts + n_usr_ctxts > rcv_contexts) { dd_dev_err(dd, "Reducing # user receive contexts to: %d, from %u\n", - (int)(dd->chip_rcv_contexts - total_contexts), + rcv_contexts - total_contexts, n_usr_ctxts); /* recalculate */ - n_usr_ctxts = dd->chip_rcv_contexts - total_contexts; + n_usr_ctxts = rcv_contexts - total_contexts; } /* each user context requires an entry in the RMT */ @@ -13509,7 +13468,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd) dd->freectxts = n_usr_ctxts; dd_dev_info(dd, "rcv contexts: chip %d, used %d (kernel %d, vnic %u, user %u)\n", - (int)dd->chip_rcv_contexts, + rcv_contexts, (int)dd->num_rcv_contexts, (int)dd->n_krcv_queues, dd->num_vnic_contexts, @@ -13527,7 +13486,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd) * contexts. 
*/ dd->rcv_entries.group_size = RCV_INCREMENT; - ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size; + ngroups = chip_rcv_array_count(dd) / dd->rcv_entries.group_size; dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts; dd->rcv_entries.nctxt_extra = ngroups - (dd->num_rcv_contexts * dd->rcv_entries.ngroups); @@ -13552,7 +13511,7 @@ static int set_up_context_variables(struct hfi1_devdata *dd) dd_dev_info( dd, "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n", - dd->chip_send_contexts, + send_contexts, dd->num_send_contexts, dd->sc_sizes[SC_KERNEL].count, dd->sc_sizes[SC_ACK].count, @@ -13610,7 +13569,7 @@ static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd) write_csr(dd, CCE_INT_MAP + (8 * i), 0); /* SendCtxtCreditReturnAddr */ - for (i = 0; i < dd->chip_send_contexts; i++) + for (i = 0; i < chip_send_contexts(dd); i++) write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0); /* PIO Send buffers */ @@ -13623,7 +13582,7 @@ static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd) /* RcvHdrAddr */ /* RcvHdrTailAddr */ /* RcvTidFlowTable */ - for (i = 0; i < dd->chip_rcv_contexts; i++) { + for (i = 0; i < chip_rcv_contexts(dd); i++) { write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0); write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0); for (j = 0; j < RXE_NUM_TID_FLOWS; j++) @@ -13631,7 +13590,7 @@ static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd) } /* RcvArray */ - for (i = 0; i < dd->chip_rcv_array_count; i++) + for (i = 0; i < chip_rcv_array_count(dd); i++) hfi1_put_tid(dd, i, PT_INVALID_FLUSH, 0, 0); /* RcvQPMapTable */ @@ -13789,7 +13748,7 @@ static void reset_txe_csrs(struct hfi1_devdata *dd) write_csr(dd, SEND_LOW_PRIORITY_LIST + (8 * i), 0); for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++) write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8 * i), 0); - for (i = 0; i < dd->chip_send_contexts / NUM_CONTEXTS_PER_SET; i++) + for (i = 0; i < chip_send_contexts(dd) / NUM_CONTEXTS_PER_SET; i++) write_csr(dd, SEND_CONTEXT_SET_CTRL + (8 * i), 0); for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++) write_csr(dd, SEND_COUNTER_ARRAY32 + (8 * i), 0); @@ -13817,7 +13776,7 @@ static void reset_txe_csrs(struct hfi1_devdata *dd) /* * TXE Per-Context CSRs */ - for (i = 0; i < dd->chip_send_contexts; i++) { + for (i = 0; i < chip_send_contexts(dd); i++) { write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0); write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0); write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0); @@ -13835,7 +13794,7 @@ static void reset_txe_csrs(struct hfi1_devdata *dd) /* * TXE Per-SDMA CSRs */ - for (i = 0; i < dd->chip_sdma_engines; i++) { + for (i = 0; i < chip_sdma_engines(dd); i++) { write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0); /* SEND_DMA_STATUS read-only */ write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0); @@ -13968,7 +13927,7 @@ static void reset_rxe_csrs(struct hfi1_devdata *dd) /* * RXE Kernel and User Per-Context CSRs */ - for (i = 0; i < dd->chip_rcv_contexts; i++) { + for (i = 0; i < chip_rcv_contexts(dd); i++) { /* kernel */ write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0); /* RCV_CTXT_STATUS read-only */ @@ -14084,13 +14043,13 @@ static int init_chip(struct hfi1_devdata *dd) /* disable send contexts and SDMA engines */ write_csr(dd, SEND_CTRL, 0); - for (i = 0; i < dd->chip_send_contexts; i++) + for (i = 0; i < chip_send_contexts(dd); i++) write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0); - for (i = 0; i < dd->chip_sdma_engines; i++) + for (i = 0; i < chip_sdma_engines(dd); i++) write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0); /* 
disable port (turn off RXE inbound traffic) and contexts */ write_csr(dd, RCV_CTRL, 0); - for (i = 0; i < dd->chip_rcv_contexts; i++) + for (i = 0; i < chip_rcv_contexts(dd); i++) write_csr(dd, RCV_CTXT_CTRL, 0); /* mask all interrupt sources */ for (i = 0; i < CCE_NUM_INT_CSRS; i++) @@ -14709,9 +14668,9 @@ static void init_txe(struct hfi1_devdata *dd) write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull); /* enable all per-context and per-SDMA engine errors */ - for (i = 0; i < dd->chip_send_contexts; i++) + for (i = 0; i < chip_send_contexts(dd); i++) write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull); - for (i = 0; i < dd->chip_sdma_engines; i++) + for (i = 0; i < chip_sdma_engines(dd); i++) write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull); /* set the local CU to AU mapping */ @@ -14979,11 +14938,13 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, "Functional simulator" }; struct pci_dev *parent = pdev->bus->self; + u32 sdma_engines; dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * sizeof(struct hfi1_pportdata)); if (IS_ERR(dd)) goto bail; + sdma_engines = chip_sdma_engines(dd); ppd = dd->pport; for (i = 0; i < dd->num_pports; i++, ppd++) { int vl; @@ -15081,11 +15042,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* give a reasonable active value, will be set on link up */ dd->pport->link_speed_active = OPA_LINK_SPEED_25G; - dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS); - dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS); - dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES); - dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE); - dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE); /* fix up link widths for emulation _p */ ppd = dd->pport; if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) { @@ -15096,11 +15052,11 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, OPA_LINK_WIDTH_1X; } /* insure num_vls isn't larger than number of sdma engines */ - if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) { + if (HFI1_CAP_IS_KSET(SDMA) && num_vls > sdma_engines) { dd_dev_err(dd, "num_vls %u too large, using %u VLs\n", - num_vls, dd->chip_sdma_engines); - num_vls = dd->chip_sdma_engines; - ppd->vls_supported = dd->chip_sdma_engines; + num_vls, sdma_engines); + num_vls = sdma_engines; + ppd->vls_supported = sdma_engines; ppd->vls_operational = ppd->vls_supported; } @@ -15216,13 +15172,6 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, */ aspm_init(dd); - dd->rcvhdrsize = DEFAULT_RCVHDRSIZE; - /* - * rcd[0] is guaranteed to be valid by this point. Also, all - * context are using the same value, as per the module parameter. 
- */ - dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32); - ret = init_pervl_scs(dd); if (ret) goto bail_cleanup; diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index fdf389e46e19..36b04d6300e5 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -656,6 +656,36 @@ static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt, write_csr(dd, offset0 + (0x1000 * ctxt), value); } +static inline u32 chip_rcv_contexts(struct hfi1_devdata *dd) +{ + return read_csr(dd, RCV_CONTEXTS); +} + +static inline u32 chip_send_contexts(struct hfi1_devdata *dd) +{ + return read_csr(dd, SEND_CONTEXTS); +} + +static inline u32 chip_sdma_engines(struct hfi1_devdata *dd) +{ + return read_csr(dd, SEND_DMA_ENGINES); +} + +static inline u32 chip_pio_mem_size(struct hfi1_devdata *dd) +{ + return read_csr(dd, SEND_PIO_MEM_SIZE); +} + +static inline u32 chip_sdma_mem_size(struct hfi1_devdata *dd) +{ + return read_csr(dd, SEND_DMA_MEM_SIZE); +} + +static inline u32 chip_rcv_array_count(struct hfi1_devdata *dd) +{ + return read_csr(dd, RCV_ARRAY_CNT); +} + u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl, u32 dw_len); diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 94dca95db04f..a41f85558312 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -208,25 +208,25 @@ static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf, (offset * RCV_BUF_BLOCK_SIZE)); } -static inline void *hfi1_get_header(struct hfi1_devdata *dd, +static inline void *hfi1_get_header(struct hfi1_ctxtdata *rcd, __le32 *rhf_addr) { u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr)); - return (void *)(rhf_addr - dd->rhf_offset + offset); + return (void *)(rhf_addr - rcd->rhf_offset + offset); } -static inline struct ib_header *hfi1_get_msgheader(struct hfi1_devdata *dd, +static inline struct ib_header *hfi1_get_msgheader(struct hfi1_ctxtdata *rcd, __le32 *rhf_addr) { - return (struct ib_header *)hfi1_get_header(dd, rhf_addr); + return (struct ib_header *)hfi1_get_header(rcd, rhf_addr); } static inline struct hfi1_16b_header - *hfi1_get_16B_header(struct hfi1_devdata *dd, + *hfi1_get_16B_header(struct hfi1_ctxtdata *rcd, __le32 *rhf_addr) { - return (struct hfi1_16b_header *)hfi1_get_header(dd, rhf_addr); + return (struct hfi1_16b_header *)hfi1_get_header(rcd, rhf_addr); } /* @@ -591,13 +591,12 @@ static void __prescan_rxq(struct hfi1_packet *packet) init_ps_mdata(&mdata, packet); while (1) { - struct hfi1_devdata *dd = rcd->dd; struct hfi1_ibport *ibp = rcd_to_iport(rcd); __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head + - dd->rhf_offset; + packet->rcd->rhf_offset; struct rvt_qp *qp; struct ib_header *hdr; - struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + struct rvt_dev_info *rdi = &rcd->dd->verbs_dev.rdi; u64 rhf = rhf_to_cpu(rhf_addr); u32 etype = rhf_rcv_type(rhf), qpn, bth1; int is_ecn = 0; @@ -612,7 +611,7 @@ static void __prescan_rxq(struct hfi1_packet *packet) if (etype != RHF_RCV_TYPE_IB) goto next; - packet->hdr = hfi1_get_msgheader(dd, rhf_addr); + packet->hdr = hfi1_get_msgheader(packet->rcd, rhf_addr); hdr = packet->hdr; lnh = ib_get_lnh(hdr); @@ -718,7 +717,7 @@ static noinline int skip_rcv_packet(struct hfi1_packet *packet, int thread) ret = check_max_packet(packet, thread); packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff + - packet->rcd->dd->rhf_offset; + packet->rcd->rhf_offset; 
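/*
 * Editor's note -- illustrative, not part of the patch.  rhf_offset has
 * moved from devdata to the receive context; it is the dword offset of
 * the 8-byte RHF within one header-queue entry and is set per context
 * in the init.c hunk later in this patch as
 *
 *	rcd->rhf_offset = rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32);
 *
 * Assuming the default entry size of 32 dwords, rhf_offset = 32 - 2 =
 * 30, i.e. the RHF occupies the last two dwords of each entry.
 * hfi1_get_header() above then recovers the packet header by stepping
 * back rhf_offset dwords to the start of the entry and forward by the
 * header offset encoded in the RHF itself, all without touching dd.
 */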
packet->rhf = rhf_to_cpu(packet->rhf_addr); return ret; @@ -757,7 +756,7 @@ static inline int process_rcv_packet(struct hfi1_packet *packet, int thread) * crashing down. There is no need to eat another * comparison in this performance critical code. */ - packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet); + packet->rcd->rhf_rcv_function_map[packet->etype](packet); packet->numpkt++; /* Set up for the next packet */ @@ -768,7 +767,7 @@ static inline int process_rcv_packet(struct hfi1_packet *packet, int thread) ret = check_max_packet(packet, thread); packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff + - packet->rcd->dd->rhf_offset; + packet->rcd->rhf_offset; packet->rhf = rhf_to_cpu(packet->rhf_addr); return ret; @@ -949,12 +948,12 @@ static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd, u8 sc = SC15_PACKET; if (etype == RHF_RCV_TYPE_IB) { - struct ib_header *hdr = hfi1_get_msgheader(packet->rcd->dd, + struct ib_header *hdr = hfi1_get_msgheader(packet->rcd, packet->rhf_addr); sc = hfi1_9B_get_sc5(hdr, packet->rhf); } else if (etype == RHF_RCV_TYPE_BYPASS) { struct hfi1_16b_header *hdr = hfi1_get_16B_header( - packet->rcd->dd, + packet->rcd, packet->rhf_addr); sc = hfi1_16B_get_sc(hdr); } @@ -1034,7 +1033,7 @@ int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread) packet.rhqoff += packet.rsize; packet.rhf_addr = (__le32 *)rcd->rcvhdrq + packet.rhqoff + - dd->rhf_offset; + rcd->rhf_offset; packet.rhf = rhf_to_cpu(packet.rhf_addr); } else if (skip_pkt) { @@ -1384,7 +1383,7 @@ bail: static inline void hfi1_setup_ib_header(struct hfi1_packet *packet) { packet->hdr = (struct hfi1_ib_message_header *) - hfi1_get_msgheader(packet->rcd->dd, + hfi1_get_msgheader(packet->rcd, packet->rhf_addr); packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; } @@ -1485,7 +1484,7 @@ static int hfi1_setup_bypass_packet(struct hfi1_packet *packet) u8 l4; packet->hdr = (struct hfi1_16b_header *) - hfi1_get_16B_header(packet->rcd->dd, + hfi1_get_16B_header(packet->rcd, packet->rhf_addr); l4 = hfi1_16B_get_l4(packet->hdr); if (l4 == OPA_16B_L4_IB_LOCAL) { @@ -1575,7 +1574,7 @@ void handle_eflags(struct hfi1_packet *packet) * The following functions are called by the interrupt handler. They are type * specific handlers for each packet type. 
*/ -int process_receive_ib(struct hfi1_packet *packet) +static int process_receive_ib(struct hfi1_packet *packet) { if (hfi1_setup_9B_packet(packet)) return RHF_RCV_CONTINUE; @@ -1607,7 +1606,7 @@ static inline bool hfi1_is_vnic_packet(struct hfi1_packet *packet) return false; } -int process_receive_bypass(struct hfi1_packet *packet) +static int process_receive_bypass(struct hfi1_packet *packet) { struct hfi1_devdata *dd = packet->rcd->dd; @@ -1649,7 +1648,7 @@ int process_receive_bypass(struct hfi1_packet *packet) return RHF_RCV_CONTINUE; } -int process_receive_error(struct hfi1_packet *packet) +static int process_receive_error(struct hfi1_packet *packet) { /* KHdrHCRCErr -- KDETH packet with a bad HCRC */ if (unlikely( @@ -1668,7 +1667,7 @@ int process_receive_error(struct hfi1_packet *packet) return RHF_RCV_CONTINUE; } -int kdeth_process_expected(struct hfi1_packet *packet) +static int kdeth_process_expected(struct hfi1_packet *packet) { hfi1_setup_9B_packet(packet); if (unlikely(hfi1_dbg_should_fault_rx(packet))) @@ -1682,7 +1681,7 @@ int kdeth_process_expected(struct hfi1_packet *packet) return RHF_RCV_CONTINUE; } -int kdeth_process_eager(struct hfi1_packet *packet) +static int kdeth_process_eager(struct hfi1_packet *packet) { hfi1_setup_9B_packet(packet); if (unlikely(hfi1_dbg_should_fault_rx(packet))) @@ -1695,7 +1694,7 @@ int kdeth_process_eager(struct hfi1_packet *packet) return RHF_RCV_CONTINUE; } -int process_receive_invalid(struct hfi1_packet *packet) +static int process_receive_invalid(struct hfi1_packet *packet) { dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n", rhf_rcv_type(packet->rhf)); @@ -1719,9 +1718,8 @@ void seqfile_dump_rcd(struct seq_file *s, struct hfi1_ctxtdata *rcd) init_ps_mdata(&mdata, &packet); while (1) { - struct hfi1_devdata *dd = rcd->dd; __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head + - dd->rhf_offset; + rcd->rhf_offset; struct ib_header *hdr; u64 rhf = rhf_to_cpu(rhf_addr); u32 etype = rhf_rcv_type(rhf), qpn; @@ -1738,7 +1736,7 @@ void seqfile_dump_rcd(struct seq_file *s, struct hfi1_ctxtdata *rcd) if (etype > RHF_RCV_TYPE_IB) goto next; - packet.hdr = hfi1_get_msgheader(dd, rhf_addr); + packet.hdr = hfi1_get_msgheader(rcd, rhf_addr); hdr = packet.hdr; lnh = be16_to_cpu(hdr->lrh[0]) & 3; @@ -1760,3 +1758,14 @@ next: update_ps_mdata(&mdata, rcd); } } + +const rhf_rcv_function_ptr normal_rhf_rcv_functions[] = { + [RHF_RCV_TYPE_EXPECTED] = kdeth_process_expected, + [RHF_RCV_TYPE_EAGER] = kdeth_process_eager, + [RHF_RCV_TYPE_IB] = process_receive_ib, + [RHF_RCV_TYPE_ERROR] = process_receive_error, + [RHF_RCV_TYPE_BYPASS] = process_receive_bypass, + [RHF_RCV_TYPE_INVALID5] = process_receive_invalid, + [RHF_RCV_TYPE_INVALID6] = process_receive_invalid, + [RHF_RCV_TYPE_INVALID7] = process_receive_invalid, +}; diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 0fc4aa9455c3..1fc75647e47b 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -411,7 +411,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) mapio = 1; break; case RCV_HDRQ: - memlen = uctxt->rcvhdrq_size; + memlen = rcvhdrq_size(uctxt); memvirt = uctxt->rcvhdrq; break; case RCV_EGRBUF: { @@ -521,7 +521,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) break; case SUBCTXT_RCV_HDRQ: memaddr = (u64)uctxt->subctxt_rcvhdr_base; - memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt; + memlen = rcvhdrq_size(uctxt) * uctxt->subctxt_cnt; flags |= 
VM_IO | VM_DONTEXPAND; vmf = 1; break; @@ -985,7 +985,11 @@ static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd, * sub contexts. * This has to be done here so the rest of the sub-contexts find the * proper base context. + * NOTE: _set_bit() can be used here because the context creation is + * protected by the mutex (rather than the spin_lock), and will be the + * very first instance of this context. */ + __set_bit(0, uctxt->in_use_ctxts); if (uinfo->subctxt_cnt) init_subctxts(uctxt, uinfo); uctxt->userversion = uinfo->userversion; @@ -1040,7 +1044,7 @@ static int setup_subctxt(struct hfi1_ctxtdata *uctxt) return -ENOMEM; /* We can take the size of the RcvHdr Queue from the master */ - uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size * + uctxt->subctxt_rcvhdr_base = vmalloc_user(rcvhdrq_size(uctxt) * num_subctxts); if (!uctxt->subctxt_rcvhdr_base) { ret = -ENOMEM; diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 4ab8b5bfbed1..d9470317983f 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -169,12 +169,6 @@ extern const struct pci_error_handlers hfi1_pci_err_handler; struct hfi1_opcode_stats_perctx; struct ctxt_eager_bufs { - ssize_t size; /* total size of eager buffers */ - u32 count; /* size of buffers array */ - u32 numbufs; /* number of buffers allocated */ - u32 alloced; /* number of rcvarray entries used */ - u32 rcvtid_size; /* size of each eager rcv tid */ - u32 threshold; /* head update threshold */ struct eager_buffer { void *addr; dma_addr_t dma; @@ -184,6 +178,12 @@ struct ctxt_eager_bufs { void *addr; dma_addr_t dma; } *rcvtids; + u32 size; /* total size of eager buffers */ + u32 rcvtid_size; /* size of each eager rcv tid */ + u16 count; /* size of buffers array */ + u16 numbufs; /* number of buffers allocated */ + u16 alloced; /* number of rcvarray entries used */ + u16 threshold; /* head update threshold */ }; struct exp_tid_set { @@ -191,43 +191,84 @@ struct exp_tid_set { u32 count; }; +typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); struct hfi1_ctxtdata { - /* shadow the ctxt's RcvCtrl register */ - u64 rcvctrl; /* rcvhdrq base, needs mmap before useful */ void *rcvhdrq; /* kernel virtual address where hdrqtail is updated */ volatile __le64 *rcvhdrtail_kvaddr; - /* when waiting for rcv or pioavail */ - wait_queue_head_t wait; - /* rcvhdrq size (for freeing) */ - size_t rcvhdrq_size; + /* so functions that need physical port can get it easily */ + struct hfi1_pportdata *ppd; + /* so file ops can get at unit */ + struct hfi1_devdata *dd; + /* this receive context's assigned PIO ACK send context */ + struct send_context *sc; + /* per context recv functions */ + const rhf_rcv_function_ptr *rhf_rcv_function_map; + /* + * The interrupt handler for a particular receive context can vary + * throughout it's lifetime. This is not a lock protected data member so + * it must be updated atomically and the prev and new value must always + * be valid. Worst case is we process an extra interrupt and up to 64 + * packets with the wrong interrupt handler. 
+ */ + int (*do_interrupt)(struct hfi1_ctxtdata *rcd, int threaded); + /* verbs rx_stats per rcd */ + struct hfi1_opcode_stats_perctx *opstats; + /* clear interrupt mask */ + u64 imask; + /* ctxt rcvhdrq head offset */ + u32 head; /* number of rcvhdrq entries */ u16 rcvhdrq_cnt; + u8 ireg; /* clear interrupt register */ + /* receive packet sequence counter */ + u8 seq_cnt; /* size of each of the rcvhdrq entries */ - u16 rcvhdrqentsize; + u8 rcvhdrqentsize; + /* offset of RHF within receive header entry */ + u8 rhf_offset; + /* dynamic receive available interrupt timeout */ + u8 rcvavail_timeout; + /* Indicates that this is vnic context */ + bool is_vnic; + /* vnic queue index this context is mapped to */ + u8 vnic_q_idx; + /* Is ASPM interrupt supported for this context */ + bool aspm_intr_supported; + /* ASPM state (enabled/disabled) for this context */ + bool aspm_enabled; + /* Is ASPM processing enabled for this context (in intr context) */ + bool aspm_intr_enable; + struct ctxt_eager_bufs egrbufs; + /* QPs waiting for context processing */ + struct list_head qp_wait_list; + /* tid allocation lists */ + struct exp_tid_set tid_group_list; + struct exp_tid_set tid_used_list; + struct exp_tid_set tid_full_list; + + /* Timer for re-enabling ASPM if interrupt activity quiets down */ + struct timer_list aspm_timer; + /* per-context configuration flags */ + unsigned long flags; + /* array of tid_groups */ + struct tid_group *groups; /* mmap of hdrq, must fit in 44 bits */ dma_addr_t rcvhdrq_dma; dma_addr_t rcvhdrqtailaddr_dma; - struct ctxt_eager_bufs egrbufs; - /* this receive context's assigned PIO ACK send context */ - struct send_context *sc; - - /* dynamic receive available interrupt timeout */ - u32 rcvavail_timeout; + /* Last interrupt timestamp */ + ktime_t aspm_ts_last_intr; + /* Last timestamp at which we scheduled a timer for this context */ + ktime_t aspm_ts_timer_sched; + /* Lock to serialize between intr, timer intr and user threads */ + spinlock_t aspm_lock; /* Reference count the base context usage */ struct kref kref; - - /* Device context index */ - u16 ctxt; - /* - * non-zero if ctxt can be shared, and defines the maximum number of - * sub-contexts for this device context. - */ - u16 subctxt_cnt; - /* non-zero if ctxt is being shared. */ - u16 subctxt_id; - u8 uuid[16]; + /* numa node of this context */ + int numa_id; + /* associated msix interrupt. */ + s16 msix_intr; /* job key */ u16 jkey; /* number of RcvArray groups for this context. */ @@ -238,87 +279,59 @@ struct hfi1_ctxtdata { u16 expected_count; /* index of first expected TID entry. 
*/ u16 expected_base; - /* array of tid_groups */ - struct tid_group *groups; - - struct exp_tid_set tid_group_list; - struct exp_tid_set tid_used_list; - struct exp_tid_set tid_full_list; + /* Device context index */ + u8 ctxt; - /* lock protecting all Expected TID data of user contexts */ + /* PSM Specific fields */ + /* lock protecting all Expected TID data */ struct mutex exp_mutex; - /* per-context configuration flags */ - unsigned long flags; - /* per-context event flags for fileops/intr communication */ - unsigned long event_flags; - /* total number of polled urgent packets */ - u32 urgent; - /* saved total number of polled urgent packets for poll edge trigger */ - u32 urgent_poll; + /* when waiting for rcv or pioavail */ + wait_queue_head_t wait; + /* uuid from PSM */ + u8 uuid[16]; /* same size as task_struct .comm[], command that opened context */ char comm[TASK_COMM_LEN]; - /* so file ops can get at unit */ - struct hfi1_devdata *dd; - /* so functions that need physical port can get it easily */ - struct hfi1_pportdata *ppd; - /* associated msix interrupt */ - u32 msix_intr; + /* Bitmask of in use context(s) */ + DECLARE_BITMAP(in_use_ctxts, HFI1_MAX_SHARED_CTXTS); + /* per-context event flags for fileops/intr communication */ + unsigned long event_flags; /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ void *subctxt_uregbase; /* An array of pages for the eager receive buffers * N */ void *subctxt_rcvegrbuf; /* An array of pages for the eager header queue entries * N */ void *subctxt_rcvhdr_base; - /* Bitmask of in use context(s) */ - DECLARE_BITMAP(in_use_ctxts, HFI1_MAX_SHARED_CTXTS); - /* The version of the library which opened this ctxt */ - u32 userversion; + /* total number of polled urgent packets */ + u32 urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 urgent_poll; /* Type of packets or conditions we want to poll for */ u16 poll_type; - /* receive packet sequence counter */ - u8 seq_cnt; - /* ctxt rcvhdrq head offset */ - u32 head; - /* QPs waiting for context processing */ - struct list_head qp_wait_list; - /* interrupt handling */ - u64 imask; /* clear interrupt mask */ - int ireg; /* clear interrupt register */ - int numa_id; /* numa node of this context */ - /* verbs rx_stats per rcd */ - struct hfi1_opcode_stats_perctx *opstats; - - /* Is ASPM interrupt supported for this context */ - bool aspm_intr_supported; - /* ASPM state (enabled/disabled) for this context */ - bool aspm_enabled; - /* Timer for re-enabling ASPM if interrupt activity quietens down */ - struct timer_list aspm_timer; - /* Lock to serialize between intr, timer intr and user threads */ - spinlock_t aspm_lock; - /* Is ASPM processing enabled for this context (in intr context) */ - bool aspm_intr_enable; - /* Last interrupt timestamp */ - ktime_t aspm_ts_last_intr; - /* Last timestamp at which we scheduled a timer for this context */ - ktime_t aspm_ts_timer_sched; - + /* non-zero if ctxt is being shared. */ + u16 subctxt_id; + /* The version of the library which opened this ctxt */ + u32 userversion; /* - * The interrupt handler for a particular receive context can vary - * throughout it's lifetime. This is not a lock protected data member so - * it must be updated atomically and the prev and new value must always - * be valid. Worst case is we process an extra interrupt and up to 64 - * packets with the wrong interrupt handler. + * non-zero if ctxt can be shared, and defines the maximum number of + * sub-contexts for this device context. 
*/ - int (*do_interrupt)(struct hfi1_ctxtdata *rcd, int threaded); - - /* Indicates that this is vnic context */ - bool is_vnic; + u8 subctxt_cnt; - /* vnic queue index this context is mapped to */ - u8 vnic_q_idx; }; +/** + * rcvhdrq_size - return total size in bytes for header queue + * @rcd: the receive context + * + * rcvhdrqentsize is in DWs, so we have to convert to bytes + * + */ +static inline u32 rcvhdrq_size(struct hfi1_ctxtdata *rcd) +{ + return PAGE_ALIGN(rcd->rcvhdrq_cnt * + rcd->rcvhdrqentsize * sizeof(u32)); +} + /* * Represents a single packet at a high level. Put commonly computed things in * here so we do not have to keep doing them over and over. The rule of thumb is @@ -897,12 +910,11 @@ struct hfi1_pportdata { u64 vl_xmit_flit_cnt[C_VL_COUNT + 1]; }; -typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); - typedef void (*opcode_handler)(struct hfi1_packet *packet); typedef void (*hfi1_make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps, struct rvt_swqe *wqe); +extern const rhf_rcv_function_ptr normal_rhf_rcv_functions[]; /* return values for the RHF receive functions */ @@ -1046,8 +1058,6 @@ struct hfi1_devdata { dma_addr_t sdma_pad_phys; /* for deallocation */ size_t sdma_heads_size; - /* number from the chip */ - u32 chip_sdma_engines; /* num used */ u32 num_sdma; /* array of engines sized by num_sdma */ @@ -1102,8 +1112,6 @@ struct hfi1_devdata { /* base receive interrupt timeout, in CSR units */ u32 rcv_intr_timeout_csr; - u32 freezelen; /* max length of freezemsg */ - u64 __iomem *egrtidbase; spinlock_t sendctrl_lock; /* protect changes to SendCtrl */ spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */ spinlock_t uctxt_lock; /* protect rcd changes */ @@ -1130,25 +1138,6 @@ struct hfi1_devdata { /* Base GUID for device (network order) */ u64 base_guid; - /* these are the "32 bit" regs */ - - /* value we put in kr_rcvhdrsize */ - u32 rcvhdrsize; - /* number of receive contexts the chip supports */ - u32 chip_rcv_contexts; - /* number of receive array entries */ - u32 chip_rcv_array_count; - /* number of PIO send contexts the chip supports */ - u32 chip_send_contexts; - /* number of bytes in the PIO memory buffer */ - u32 chip_pio_mem_size; - /* number of bytes in the SDMA memory buffer */ - u32 chip_sdma_mem_size; - - /* size of each rcvegrbuffer */ - u32 rcvegrbufsize; - /* log2 of above */ - u16 rcvegrbufsize_shift; /* both sides of the PCIe link are gen3 capable */ u8 link_gen3_capable; u8 dc_shutdown; @@ -1221,9 +1210,6 @@ struct hfi1_devdata { u32 num_msix_entries; u32 first_dyn_msix_idx; - /* INTx information */ - u32 requested_intx_irq; /* did we request one? */ - /* general interrupt: mask of handled interrupts */ u64 gi_mask[CCE_NUM_INT_CSRS]; @@ -1289,8 +1275,6 @@ struct hfi1_devdata { u64 sw_cce_err_status_aggregate; /* Software counter that aggregates all bypass packet rcv errors */ u64 sw_rcv_bypass_packet_errors; - /* receive interrupt function */ - rhf_rcv_function_ptr normal_rhf_rcv_functions[8]; /* Save the enabled LCB error bits */ u64 lcb_err_en; @@ -1329,10 +1313,7 @@ struct hfi1_devdata { /* seqlock for sc2vl */ seqlock_t sc2vl_lock ____cacheline_aligned_in_smp; u64 sc2vl[4]; - /* receive interrupt functions */ - rhf_rcv_function_ptr *rhf_rcv_function_map; u64 __percpu *rcv_limit; - u16 rhf_offset; /* offset of RHF within receive header entry */ /* adding a new field here would make it part of this cacheline */ /* OUI comes from the HW. Used everywhere as 3 separate bytes. 
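/*
 * Editor's note -- worked example, not part of the patch; the counts
 * below are assumed module-parameter defaults.  rcvhdrq_size() replaces
 * the old open-coded math, e.g. with rcvhdrq_cnt = 2048 entries and
 * rcvhdrqentsize = 32 dwords:
 *
 *	2048 * 32 * sizeof(u32) = 262144 bytes (already page aligned)
 *
 * so the DMA allocation in hfi1_create_rcvhdrq(), the mmap length in
 * hfi1_file_mmap() and the sub-context copy in setup_subctxt() all
 * derive from the same helper instead of a cached rcvhdrq_size field.
 */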
*/ @@ -1471,7 +1452,7 @@ void hfi1_make_ud_req_16B(struct rvt_qp *qp, /* calculate the current RHF address */ static inline __le32 *get_rhf_addr(struct hfi1_ctxtdata *rcd) { - return (__le32 *)rcd->rcvhdrq + rcd->head + rcd->dd->rhf_offset; + return (__le32 *)rcd->rcvhdrq + rcd->head + rcd->rhf_offset; } int hfi1_reset_device(int); @@ -2021,12 +2002,6 @@ static inline void flush_wc(void) } void handle_eflags(struct hfi1_packet *packet); -int process_receive_ib(struct hfi1_packet *packet); -int process_receive_bypass(struct hfi1_packet *packet); -int process_receive_error(struct hfi1_packet *packet); -int kdeth_process_expected(struct hfi1_packet *packet); -int kdeth_process_eager(struct hfi1_packet *packet); -int process_receive_invalid(struct hfi1_packet *packet); void seqfile_dump_rcd(struct seq_file *s, struct hfi1_ctxtdata *rcd); /* global module parameter variables */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index f110842b91f5..758d273c32cf 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -364,9 +364,9 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, hfi1_exp_tid_group_init(rcd); rcd->ppd = ppd; rcd->dd = dd; - __set_bit(0, rcd->in_use_ctxts); rcd->numa_id = numa; rcd->rcv_array_groups = dd->rcv_entries.ngroups; + rcd->rhf_rcv_function_map = normal_rhf_rcv_functions; mutex_init(&rcd->exp_mutex); @@ -404,6 +404,8 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa, rcd->rcvhdrq_cnt = rcvhdrcnt; rcd->rcvhdrqentsize = hfi1_hdrq_entsize; + rcd->rhf_offset = + rcd->rcvhdrqentsize - sizeof(u64) / sizeof(u32); /* * Simple Eager buffer allocation: we have already pre-allocated * the number of RcvArray entry groups. Each ctxtdata structure @@ -853,24 +855,6 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) struct hfi1_ctxtdata *rcd; struct hfi1_pportdata *ppd; - /* Set up recv low level handlers */ - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] = - kdeth_process_expected; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] = - kdeth_process_eager; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] = - process_receive_error; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] = - process_receive_bypass; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] = - process_receive_invalid; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] = - process_receive_invalid; - dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] = - process_receive_invalid; - dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions; - /* Set up send low level handlers */ dd->process_pio_send = hfi1_verbs_send_pio; dd->process_dma_send = hfi1_verbs_send_dma; @@ -936,7 +920,7 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) } /* Allocate enough memory for user event notification. 
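/*
 * Editor's sketch (not part of the patch): with the receive handlers
 * made static and gathered in the const normal_rhf_rcv_functions[]
 * table, and hfi1_create_ctxtdata() pointing rcd->rhf_rcv_function_map
 * at it as above, packet dispatch becomes a per-context table lookup;
 * a context type could in principle substitute its own table without
 * touching devdata.  Conceptually:
 */
static inline void example_dispatch_one(struct hfi1_packet *packet)
{
	const rhf_rcv_function_ptr *map = packet->rcd->rhf_rcv_function_map;

	/* the RHF receive-type field indexes the 8-entry table directly */
	map[rhf_rcv_type(packet->rhf)](packet);
}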
*/ - len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS * + len = PAGE_ALIGN(chip_rcv_contexts(dd) * HFI1_MAX_SHARED_CTXTS * sizeof(*dd->events)); dd->events = vmalloc_user(len); if (!dd->events) @@ -948,9 +932,6 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) dd->status = vmalloc_user(PAGE_SIZE); if (!dd->status) dd_dev_err(dd, "Failed to allocate dev status page\n"); - else - dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) - - sizeof(dd->status->freezemsg)); for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; if (dd->status) @@ -1144,7 +1125,7 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) return; if (rcd->rcvhdrq) { - dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size, + dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd), rcd->rcvhdrq, rcd->rcvhdrq_dma); rcd->rcvhdrq = NULL; if (rcd->rcvhdrtail_kvaddr) { @@ -1855,12 +1836,7 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) if (!rcd->rcvhdrq) { gfp_t gfp_flags; - /* - * rcvhdrqentsize is in DWs, so we have to convert to bytes - * (* sizeof(u32)). - */ - amt = PAGE_ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize * - sizeof(u32)); + amt = rcvhdrq_size(rcd); if (rcd->ctxt < dd->first_dyn_alloc_ctxt || rcd->is_vnic) gfp_flags = GFP_KERNEL; @@ -1885,8 +1861,6 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) if (!rcd->rcvhdrtail_kvaddr) goto bail_free; } - - rcd->rcvhdrq_size = amt; } /* * These values are per-context: @@ -1902,7 +1876,7 @@ int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK) << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT; write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg); - reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK) + reg = ((u64)DEFAULT_RCVHDRSIZE & RCV_HDR_SIZE_HDR_SIZE_MASK) << RCV_HDR_SIZE_HDR_SIZE_SHIFT; write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg); @@ -1938,9 +1912,9 @@ bail: int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) { struct hfi1_devdata *dd = rcd->dd; - u32 max_entries, egrtop, alloced_bytes = 0, idx = 0; + u32 max_entries, egrtop, alloced_bytes = 0; gfp_t gfp_flags; - u16 order; + u16 order, idx = 0; int ret = 0; u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu); diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index baf7c324f7b8..eec83757d55f 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -157,6 +157,7 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) unsigned long len; resource_size_t addr; int ret = 0; + u32 rcv_array_count; addr = pci_resource_start(pdev, 0); len = pci_resource_len(pdev, 0); @@ -186,9 +187,9 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) goto nomem; } - dd->chip_rcv_array_count = readq(dd->kregbase1 + RCV_ARRAY_CNT); - dd_dev_info(dd, "RcvArray count: %u\n", dd->chip_rcv_array_count); - dd->base2_start = RCV_ARRAY + dd->chip_rcv_array_count * 8; + rcv_array_count = readq(dd->kregbase1 + RCV_ARRAY_CNT); + dd_dev_info(dd, "RcvArray count: %u\n", rcv_array_count); + dd->base2_start = RCV_ARRAY + rcv_array_count * 8; dd->kregbase2 = ioremap_nocache( addr + dd->base2_start, @@ -214,13 +215,13 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) * to write an entire cacheline worth of entries in one shot. 
*/ dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY, - dd->chip_rcv_array_count * 8); + rcv_array_count * 8); if (!dd->rcvarray_wc) { dd_dev_err(dd, "WC mapping of receive array failed\n"); goto nomem; } dd_dev_info(dd, "WC RcvArray: %p for %x\n", - dd->rcvarray_wc, dd->chip_rcv_array_count * 8); + dd->rcvarray_wc, rcv_array_count * 8); dd->flags |= HFI1_PRESENT; /* chip.c CSR routines now work */ return 0; @@ -346,15 +347,13 @@ int pcie_speeds(struct hfi1_devdata *dd) /* * Returns: * - actual number of interrupts allocated or - * - 0 if fell back to INTx. * - error */ int request_msix(struct hfi1_devdata *dd, u32 msireq) { int nvec; - nvec = pci_alloc_irq_vectors(dd->pcidev, 1, msireq, - PCI_IRQ_MSIX | PCI_IRQ_LEGACY); + nvec = pci_alloc_irq_vectors(dd->pcidev, msireq, msireq, PCI_IRQ_MSIX); if (nvec < 0) { dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", nvec); return nvec; @@ -362,10 +361,6 @@ int request_msix(struct hfi1_devdata *dd, u32 msireq) tune_pcie_caps(dd); - /* check for legacy IRQ */ - if (nvec == 1 && !dd->pcidev->msix_enabled) - return 0; - return nvec; } diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 9cac15d10c4f..c2c1cba5b23b 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2017 Intel Corporation. + * Copyright(c) 2015-2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -226,7 +226,7 @@ static const char *sc_type_name(int index) int init_sc_pools_and_sizes(struct hfi1_devdata *dd) { struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } }; - int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1; + int total_blocks = (chip_pio_mem_size(dd) / PIO_BLOCK_SIZE) - 1; int total_contexts = 0; int fixed_blocks; int pool_blocks; @@ -343,8 +343,8 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd) sc_type_name(i), count); return -EINVAL; } - if (total_contexts + count > dd->chip_send_contexts) - count = dd->chip_send_contexts - total_contexts; + if (total_contexts + count > chip_send_contexts(dd)) + count = chip_send_contexts(dd) - total_contexts; total_contexts += count; @@ -507,7 +507,7 @@ static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index, if (sci->type == type && sci->allocated == 0) { sci->allocated = 1; /* use a 1:1 mapping, but make them non-equal */ - context = dd->chip_send_contexts - index - 1; + context = chip_send_contexts(dd) - index - 1; dd->hw_to_sw[context] = index; *sw_index = index; *hw_context = context; @@ -1618,11 +1618,11 @@ static void sc_piobufavail(struct send_context *sc) /* Wake up the most starved one first */ if (n) hfi1_qp_wakeup(qps[max_idx], - RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN); + RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); for (i = 0; i < n; i++) if (i != max_idx) hfi1_qp_wakeup(qps[i], - RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN); + RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN); } /* translate a send credit update to a bit code of reasons */ diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 1697d96151bd..9b1e84a6b1cc 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
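/*
 * Editor's sketch (not part of the patch): request_msix() now passes
 * min == max with PCI_IRQ_MSIX only, so pci_alloc_irq_vectors() either
 * delivers exactly the requested count or fails -- the INTx and
 * reduced-vector fallbacks removed above can no longer trigger.
 */
static int example_alloc_exact_msix(struct pci_dev *pdev, unsigned int want)
{
	int nvec = pci_alloc_irq_vectors(pdev, want, want, PCI_IRQ_MSIX);

	return nvec < 0 ? nvec : 0;	/* nvec == want on success */
}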
@@ -273,7 +273,7 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_PATH_MIG_STATE && attr->path_mig_state == IB_MIG_MIGRATED && qp->s_mig_state == IB_MIG_ARMED) { - qp->s_flags |= RVT_S_AHG_CLEAR; + qp->s_flags |= HFI1_S_AHG_CLEAR; priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); @@ -717,7 +717,7 @@ void hfi1_migrate_qp(struct rvt_qp *qp) qp->remote_ah_attr = qp->alt_ah_attr; qp->port_num = rdma_ah_get_port_num(&qp->alt_ah_attr); qp->s_pkey_index = qp->s_alt_pkey_index; - qp->s_flags |= RVT_S_AHG_CLEAR; + qp->s_flags |= HFI1_S_AHG_CLEAR; priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); qp_set_16b(qp); diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index b2d4cba8d15b..078cff7560b6 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -1,7 +1,7 @@ #ifndef _QP_H #define _QP_H /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -70,6 +70,26 @@ static inline int hfi1_send_ok(struct rvt_qp *qp) } /* + * Driver specific s_flags starting at bit 31 down to HFI1_S_MIN_BIT_MASK + * + * HFI1_S_AHG_VALID - ahg header valid on chip + * HFI1_S_AHG_CLEAR - have send engine clear ahg state + * HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain + * HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1 + */ +#define HFI1_S_AHG_VALID 0x80000000 +#define HFI1_S_AHG_CLEAR 0x40000000 +#define HFI1_S_WAIT_PIO_DRAIN 0x20000000 +#define HFI1_S_MIN_BIT_MASK 0x01000000 + +/* + * overload wait defines + */ + +#define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN) +#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) + +/* * free_ahg - clear ahg from QP */ static inline void clear_ahg(struct rvt_qp *qp) @@ -77,7 +97,7 @@ static inline void clear_ahg(struct rvt_qp *qp) struct hfi1_qp_priv *priv = qp->priv; priv->s_ahg->ahgcount = 0; - qp->s_flags &= ~(RVT_S_AHG_VALID | RVT_S_AHG_CLEAR); + qp->s_flags &= ~(HFI1_S_AHG_VALID | HFI1_S_AHG_CLEAR); if (priv->s_sde && qp->s_ahgidx >= 0) sdma_ahg_free(priv->s_sde, qp->s_ahgidx); qp->s_ahgidx = -1; diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index f15c93102081..9bd63abb2dfe 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -241,7 +241,7 @@ bail: smp_wmb(); qp->s_flags &= ~(RVT_S_RESP_PENDING | RVT_S_ACK_PENDING - | RVT_S_AHG_VALID); + | HFI1_S_AHG_VALID); return 0; } @@ -1024,7 +1024,7 @@ done: if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) && (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) qp->s_flags |= RVT_S_WAIT_PSN; - qp->s_flags &= ~RVT_S_AHG_VALID; + qp->s_flags &= ~HFI1_S_AHG_VALID; } /* diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index ef4c566e206f..5f56f3c1b4c4 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. 
+ * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -194,7 +194,7 @@ static void ruc_loopback(struct rvt_qp *sqp) spin_lock_irqsave(&sqp->s_lock, flags); /* Return if we are already busy processing a work request. */ - if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || + if ((sqp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT)) || !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) goto unlock; @@ -533,9 +533,9 @@ static inline void build_ahg(struct rvt_qp *qp, u32 npsn) { struct hfi1_qp_priv *priv = qp->priv; - if (unlikely(qp->s_flags & RVT_S_AHG_CLEAR)) + if (unlikely(qp->s_flags & HFI1_S_AHG_CLEAR)) clear_ahg(qp); - if (!(qp->s_flags & RVT_S_AHG_VALID)) { + if (!(qp->s_flags & HFI1_S_AHG_VALID)) { /* first middle that needs copy */ if (qp->s_ahgidx < 0) qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde); @@ -544,7 +544,7 @@ static inline void build_ahg(struct rvt_qp *qp, u32 npsn) priv->s_ahg->tx_flags |= SDMA_TXREQ_F_AHG_COPY; /* save to protect a change in another thread */ priv->s_ahg->ahgidx = qp->s_ahgidx; - qp->s_flags |= RVT_S_AHG_VALID; + qp->s_flags |= HFI1_S_AHG_VALID; } } else { /* subsequent middle after valid */ @@ -650,7 +650,7 @@ static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp, if (middle) build_ahg(qp, bth2); else - qp->s_flags &= ~RVT_S_AHG_VALID; + qp->s_flags &= ~HFI1_S_AHG_VALID; bth0 |= pkey; bth0 |= extra_bytes << 20; @@ -727,7 +727,7 @@ static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp, if (middle) build_ahg(qp, bth2); else - qp->s_flags &= ~RVT_S_AHG_VALID; + qp->s_flags &= ~HFI1_S_AHG_VALID; bth0 |= pkey; bth0 |= extra_bytes << 20; diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 7fb350b87b49..88e326d6cc49 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -1351,7 +1351,7 @@ int sdma_init(struct hfi1_devdata *dd, u8 port) struct hfi1_pportdata *ppd = dd->pport + port; u32 per_sdma_credits; uint idle_cnt = sdma_idle_cnt; - size_t num_engines = dd->chip_sdma_engines; + size_t num_engines = chip_sdma_engines(dd); int ret = -ENOMEM; if (!HFI1_CAP_IS_KSET(SDMA)) { @@ -1360,18 +1360,18 @@ int sdma_init(struct hfi1_devdata *dd, u8 port) } if (mod_num_sdma && /* can't exceed chip support */ - mod_num_sdma <= dd->chip_sdma_engines && + mod_num_sdma <= chip_sdma_engines(dd) && /* count must be >= vls */ mod_num_sdma >= num_vls) num_engines = mod_num_sdma; dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma); - dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines); + dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", chip_sdma_engines(dd)); dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n", - dd->chip_sdma_mem_size); + chip_sdma_mem_size(dd)); per_sdma_credits = - dd->chip_sdma_mem_size / (num_engines * SDMA_BLOCK_SIZE); + chip_sdma_mem_size(dd) / (num_engines * SDMA_BLOCK_SIZE); /* set up freeze waitqueue */ init_waitqueue_head(&dd->sdma_unfreeze_wq); diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 08991874c0e2..13374c727b14 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1007,7 +1007,7 @@ static int pio_wait(struct rvt_qp *qp, int was_empty; dev->n_piowait += !!(flag & RVT_S_WAIT_PIO); - dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN); + dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN); qp->s_flags |= flag; was_empty = 
list_empty(&sc->piowait); iowait_queue(ps->pkts_sent, &priv->s_iowait, @@ -1376,7 +1376,7 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) return pio_wait(qp, ps->s_txreq->psc, ps, - RVT_S_WAIT_PIO_DRAIN); + HFI1_S_WAIT_PIO_DRAIN); return sr(qp, ps, 0); } @@ -1410,7 +1410,8 @@ static void hfi1_fill_device_attr(struct hfi1_devdata *dd) rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX; rdi->dparms.props.max_qp = hfi1_max_qps; rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; - rdi->dparms.props.max_sge = hfi1_max_sges; + rdi->dparms.props.max_send_sge = hfi1_max_sges; + rdi->dparms.props.max_recv_sge = hfi1_max_sges; rdi->dparms.props.max_sge_rd = hfi1_max_sges; rdi->dparms.props.max_cq = hfi1_max_cqs; rdi->dparms.props.max_ah = hfi1_max_ahs; @@ -1497,15 +1498,6 @@ static int query_port(struct rvt_dev_info *rdi, u8 port_num, props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : mtu_to_enum(ppd->ibmtu, IB_MTU_4096); - /* - * sm_lid of 0xFFFF needs special handling so that it can - * be differentiated from a permissve LID of 0xFFFF. - * We set the grh_required flag here so the SA can program - * the DGID in the address handle appropriately - */ - if (props->sm_lid == be16_to_cpu(IB_LID_PERMISSIVE)) - props->grh_required = true; - return 0; } @@ -1892,7 +1884,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) ibdev->process_mad = hfi1_process_mad; ibdev->get_dev_fw_str = hfi1_get_dev_fw_str; - strncpy(ibdev->node_desc, init_utsname()->nodename, + strlcpy(ibdev->node_desc, init_utsname()->nodename, sizeof(ibdev->node_desc)); /* diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c index 616fc9b6fad8..c643d80c5a53 100644 --- a/drivers/infiniband/hw/hfi1/vnic_main.c +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2017 Intel Corporation. + * Copyright(c) 2017 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
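Two small verbs.c details above are easy to miss: max_sge is split into max_send_sge and max_recv_sge so asymmetric hardware limits can be reported, and node_desc is now filled with strlcpy(), which, unlike strncpy(), always NUL-terminates when the source string is longer than the buffer. A stand-alone sketch of the latter, with an invented helper name:

#include <linux/string.h>
#include <linux/utsname.h>

/* Sketch: copy the node name into a fixed-size IB node_desc-style buffer. */
static void fill_node_desc(char *desc, size_t len)
{
	/* strlcpy() truncates and terminates; strncpy() may leave no NUL */
	strlcpy(desc, init_utsname()->nodename, len);
}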
@@ -120,8 +120,7 @@ static int allocate_vnic_ctxt(struct hfi1_devdata *dd, uctxt->seq_cnt = 1; uctxt->is_vnic = true; - if (dd->num_msix_entries) - hfi1_set_vnic_msix_info(uctxt); + hfi1_set_vnic_msix_info(uctxt); hfi1_stats.sps_ctxts++; dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt); @@ -136,8 +135,7 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd, dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt); flush_wc(); - if (dd->num_msix_entries) - hfi1_reset_vnic_msix_info(uctxt); + hfi1_reset_vnic_msix_info(uctxt); /* * Disable receive context and interrupt available, reset all @@ -818,14 +816,14 @@ struct net_device *hfi1_vnic_alloc_rn(struct ib_device *device, size = sizeof(struct opa_vnic_rdma_netdev) + sizeof(*vinfo); netdev = alloc_netdev_mqs(size, name, name_assign_type, setup, - dd->chip_sdma_engines, dd->num_vnic_contexts); + chip_sdma_engines(dd), dd->num_vnic_contexts); if (!netdev) return ERR_PTR(-ENOMEM); rn = netdev_priv(netdev); vinfo = opa_vnic_dev_priv(netdev); vinfo->dd = dd; - vinfo->num_tx_q = dd->chip_sdma_engines; + vinfo->num_tx_q = chip_sdma_engines(dd); vinfo->num_rx_q = dd->num_vnic_contexts; vinfo->netdev = netdev; rn->free_rdma_netdev = hfi1_vnic_free_rn; diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index d74928621559..0d96c5bb38cd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -44,13 +44,11 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(ibpd->device); + const struct ib_gid_attr *gid_attr; struct device *dev = hr_dev->dev; - struct ib_gid_attr gid_attr; struct hns_roce_ah *ah; u16 vlan_tag = 0xffff; const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); - union ib_gid sgid; - int ret; ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) @@ -59,18 +57,9 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, /* Get mac address */ memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN); - /* Get source gid */ - ret = ib_get_cached_gid(ibpd->device, rdma_ah_get_port_num(ah_attr), - grh->sgid_index, &sgid, &gid_attr); - if (ret) { - dev_err(dev, "get sgid failed! 
ret = %d\n", ret); - kfree(ah); - return ERR_PTR(ret); - } - - if (is_vlan_dev(gid_attr.ndev)) - vlan_tag = vlan_dev_vlan_id(gid_attr.ndev); - dev_put(gid_attr.ndev); + gid_attr = ah_attr->grh.sgid_attr; + if (is_vlan_dev(gid_attr->ndev)) + vlan_tag = vlan_dev_vlan_id(gid_attr->ndev); if (vlan_tag < 0x1000) vlan_tag |= (rdma_ah_get_sl(ah_attr) & @@ -108,7 +97,7 @@ int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) rdma_ah_set_static_rate(ah_attr, ah->av.stat_rate); rdma_ah_set_grh(ah_attr, NULL, (le32_to_cpu(ah->av.sl_tclass_flowlabel) & - HNS_ROCE_FLOW_LABLE_MASK), ah->av.gid_index, + HNS_ROCE_FLOW_LABEL_MASK), ah->av.gid_index, ah->av.hop_limit, (le32_to_cpu(ah->av.sl_tclass_flowlabel) >> HNS_ROCE_TCLASS_SHIFT)); diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h index 319cb74aebaf..93d4b4ec002d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_common.h +++ b/drivers/infiniband/hw/hns/hns_roce_common.h @@ -382,15 +382,6 @@ #define ROCEE_VF_EQ_DB_CFG0_REG 0x238 #define ROCEE_VF_EQ_DB_CFG1_REG 0x23C -#define ROCEE_VF_SMAC_CFG0_REG 0x12000 -#define ROCEE_VF_SMAC_CFG1_REG 0x12004 - -#define ROCEE_VF_SGID_CFG0_REG 0x10000 -#define ROCEE_VF_SGID_CFG1_REG 0x10004 -#define ROCEE_VF_SGID_CFG2_REG 0x10008 -#define ROCEE_VF_SGID_CFG3_REG 0x1000c -#define ROCEE_VF_SGID_CFG4_REG 0x10010 - #define ROCEE_VF_ABN_INT_CFG_REG 0x13000 #define ROCEE_VF_ABN_INT_ST_REG 0x13004 #define ROCEE_VF_ABN_INT_EN_REG 0x13008 diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c index ebee2782a573..e2f93c1ce86a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_db.c +++ b/drivers/infiniband/hw/hns/hns_roce_db.c @@ -41,6 +41,8 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context, unsigned long virt, found: db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK); + page->umem->sg_head.sgl->offset = virt & ~PAGE_MASK; + db->virt_addr = sg_virt(page->umem->sg_head.sgl); db->u.user_page = page; refcount_inc(&page->refcount); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 31221d506d9a..9a24fd0ee3e7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -76,7 +76,7 @@ /* 4G/4K = 1M */ #define HNS_ROCE_SL_SHIFT 28 #define HNS_ROCE_TCLASS_SHIFT 20 -#define HNS_ROCE_FLOW_LABLE_MASK 0xfffff +#define HNS_ROCE_FLOW_LABEL_MASK 0xfffff #define HNS_ROCE_MAX_PORTS 6 #define HNS_ROCE_MAX_GID_NUM 16 @@ -110,6 +110,7 @@ enum { HNS_ROCE_SUPPORT_RQ_RECORD_DB = 1 << 0, + HNS_ROCE_SUPPORT_SQ_RECORD_DB = 1 << 1, }; enum { @@ -190,7 +191,8 @@ enum { HNS_ROCE_CAP_FLAG_REREG_MR = BIT(0), HNS_ROCE_CAP_FLAG_ROCE_V1_V2 = BIT(1), HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(2), - HNS_ROCE_CAP_FLAG_RECORD_DB = BIT(3) + HNS_ROCE_CAP_FLAG_RECORD_DB = BIT(3), + HNS_ROCE_CAP_FLAG_SQ_RECORD_DB = BIT(4), }; enum hns_roce_mtt_type { @@ -385,6 +387,7 @@ struct hns_roce_db { struct hns_roce_user_db_page *user_page; } u; dma_addr_t dma; + void *virt_addr; int index; int order; }; @@ -524,7 +527,9 @@ struct hns_roce_qp { struct hns_roce_buf hr_buf; struct hns_roce_wq rq; struct hns_roce_db rdb; + struct hns_roce_db sdb; u8 rdb_en; + u8 sdb_en; u32 doorbell_qpn; __le32 sq_signal_bits; u32 sq_next_wqe; @@ -579,22 +584,22 @@ struct hns_roce_ceqe { }; struct hns_roce_aeqe { - u32 asyn; + __le32 asyn; union { struct { - u32 qp; + __le32 qp; u32 rsv0; u32 rsv1; } qp_event; struct { - u32 cq; + __le32 cq; u32 rsv0; u32 rsv1; } 
cq_event; struct { - u32 ceqe; + __le32 ceqe; u32 rsv0; u32 rsv1; } ce_event; @@ -641,6 +646,8 @@ struct hns_roce_eq { int shift; dma_addr_t cur_eqe_ba; dma_addr_t nxt_eqe_ba; + int event_type; + int sub_type; }; struct hns_roce_eq_table { @@ -720,10 +727,21 @@ struct hns_roce_caps { u32 eqe_ba_pg_sz; u32 eqe_buf_pg_sz; u32 eqe_hop_num; + u32 sl_num; + u32 tsq_buf_pg_sz; + u32 tpq_buf_pg_sz; u32 chunk_sz; /* chunk size in non multihop mode*/ u64 flags; }; +struct hns_roce_work { + struct hns_roce_dev *hr_dev; + struct work_struct work; + u32 qpn; + int event_type; + int sub_type; +}; + struct hns_roce_hw { int (*reset)(struct hns_roce_dev *hr_dev, bool enable); int (*cmq_init)(struct hns_roce_dev *hr_dev); @@ -736,7 +754,7 @@ struct hns_roce_hw { u16 token, int event); int (*chk_mbox)(struct hns_roce_dev *hr_dev, unsigned long timeout); int (*set_gid)(struct hns_roce_dev *hr_dev, u8 port, int gid_index, - union ib_gid *gid, const struct ib_gid_attr *attr); + const union ib_gid *gid, const struct ib_gid_attr *attr); int (*set_mac)(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr); void (*set_mtu)(struct hns_roce_dev *hr_dev, u8 phy_port, enum ib_mtu mtu); @@ -760,10 +778,10 @@ struct hns_roce_hw { int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state); int (*destroy_qp)(struct ib_qp *ibqp); - int (*post_send)(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); - int (*post_recv)(struct ib_qp *qp, struct ib_recv_wr *recv_wr, - struct ib_recv_wr **bad_recv_wr); + int (*post_send)(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); + int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr, + const struct ib_recv_wr **bad_recv_wr); int (*req_notify_cq)(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr); @@ -816,6 +834,7 @@ struct hns_roce_dev { u32 tptr_size; /*only for hw v1*/ const struct hns_roce_hw *hw; void *priv; + struct workqueue_struct *irq_workq; }; static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev) @@ -864,7 +883,7 @@ static inline struct hns_roce_sqp *hr_to_hr_sqp(struct hns_roce_qp *hr_qp) return container_of(hr_qp, struct hns_roce_sqp, hr_qp); } -static inline void hns_roce_write64_k(__be32 val[2], void __iomem *dest) +static inline void hns_roce_write64_k(__le32 val[2], void __iomem *dest) { __raw_writeq(*(u64 *) val, dest); } @@ -982,7 +1001,7 @@ void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp); void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn, int cnt); -__be32 send_ieth(struct ib_send_wr *wr); +__be32 send_ieth(const struct ib_send_wr *wr); int to_hr_qp_type(int qp_type); struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev, diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 63b5b3edabcb..f6faefed96e8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -170,7 +170,7 @@ int hns_roce_calc_hem_mhop(struct hns_roce_dev *hr_dev, case 3: mhop->l2_idx = table_idx & (chunk_ba_num - 1); mhop->l1_idx = table_idx / chunk_ba_num & (chunk_ba_num - 1); - mhop->l0_idx = table_idx / chunk_ba_num / chunk_ba_num; + mhop->l0_idx = (table_idx / chunk_ba_num) / chunk_ba_num; break; case 2: mhop->l1_idx = table_idx 
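The new struct hns_roce_work and the irq_workq pointer added to struct hns_roce_dev suggest the driver defers asynchronous-event reporting from the interrupt handler to a workqueue. A generic, heavily simplified sketch of that pattern; every name below is made up and not the driver's own:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct demo_event_work {
	struct work_struct work;
	u32 qpn;
	int event_type;
};

static void demo_event_handler(struct work_struct *work)
{
	struct demo_event_work *ew =
		container_of(work, struct demo_event_work, work);

	/* report ew->event_type for ew->qpn outside hard-IRQ context */
	kfree(ew);
}

/* Called from the IRQ path: allocate, fill, and queue one work item. */
static bool demo_queue_event(struct workqueue_struct *wq, u32 qpn, int type)
{
	struct demo_event_work *ew = kzalloc(sizeof(*ew), GFP_ATOMIC);

	if (!ew)
		return false;
	ew->qpn = qpn;
	ew->event_type = type;
	INIT_WORK(&ew->work, demo_event_handler);
	return queue_work(wq, &ew->work);
}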
& (chunk_ba_num - 1); @@ -342,7 +342,7 @@ static int hns_roce_set_hem(struct hns_roce_dev *hr_dev, } else { break; } - msleep(HW_SYNC_SLEEP_TIME_INTERVAL); + mdelay(HW_SYNC_SLEEP_TIME_INTERVAL); } bt_cmd_l = (u32)bt_ba; @@ -494,6 +494,9 @@ static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev, step_idx = 1; } else if (hop_num == HNS_ROCE_HOP_NUM_0) { step_idx = 0; + } else { + ret = -EINVAL; + goto err_dma_alloc_l1; } /* set HEM base address to hardware */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 8444234ed092..081aa91fc162 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -58,8 +58,9 @@ static void set_raddr_seg(struct hns_roce_wqe_raddr_seg *rseg, u64 remote_addr, rseg->len = 0; } -static int hns_roce_v1_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +static int hns_roce_v1_post_send(struct ib_qp *ibqp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_ah *ah = to_hr_ah(ud_wr(wr)->ah); @@ -173,12 +174,14 @@ static int hns_roce_v1_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, roce_set_field(ud_sq_wqe->u32_36, UD_SEND_WQE_U32_36_FLOW_LABEL_M, - UD_SEND_WQE_U32_36_FLOW_LABEL_S, 0); + UD_SEND_WQE_U32_36_FLOW_LABEL_S, + ah->av.sl_tclass_flowlabel & + HNS_ROCE_FLOW_LABEL_MASK); roce_set_field(ud_sq_wqe->u32_36, - UD_SEND_WQE_U32_36_PRIORITY_M, - UD_SEND_WQE_U32_36_PRIORITY_S, - ah->av.sl_tclass_flowlabel >> - HNS_ROCE_SL_SHIFT); + UD_SEND_WQE_U32_36_PRIORITY_M, + UD_SEND_WQE_U32_36_PRIORITY_S, + le32_to_cpu(ah->av.sl_tclass_flowlabel) >> + HNS_ROCE_SL_SHIFT); roce_set_field(ud_sq_wqe->u32_36, UD_SEND_WQE_U32_36_SGID_INDEX_M, UD_SEND_WQE_U32_36_SGID_INDEX_S, @@ -191,7 +194,9 @@ static int hns_roce_v1_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, ah->av.hop_limit); roce_set_field(ud_sq_wqe->u32_40, UD_SEND_WQE_U32_40_TRAFFIC_CLASS_M, - UD_SEND_WQE_U32_40_TRAFFIC_CLASS_S, 0); + UD_SEND_WQE_U32_40_TRAFFIC_CLASS_S, + ah->av.sl_tclass_flowlabel >> + HNS_ROCE_TCLASS_SHIFT); memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0], GID_LEN); @@ -333,7 +338,7 @@ out: doorbell[0] = le32_to_cpu(sq_db.u32_4); doorbell[1] = le32_to_cpu(sq_db.u32_8); - hns_roce_write64_k(doorbell, qp->sq.db_reg_l); + hns_roce_write64_k((__le32 *)doorbell, qp->sq.db_reg_l); qp->sq_next_wqe = ind; } @@ -342,14 +347,15 @@ out: return ret; } -static int hns_roce_v1_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +static int hns_roce_v1_post_recv(struct ib_qp *ibqp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { int ret = 0; int nreq = 0; int ind = 0; int i = 0; - u32 reg_val = 0; + u32 reg_val; unsigned long flags = 0; struct hns_roce_rq_wqe_ctrl *ctrl = NULL; struct hns_roce_wqe_data_seg *scat = NULL; @@ -402,14 +408,18 @@ out: wmb(); if (ibqp->qp_type == IB_QPT_GSI) { + __le32 tmp; + /* SW update GSI rq header */ reg_val = roce_read(to_hr_dev(ibqp->device), ROCEE_QP1C_CFG3_0_REG + QP1C_CFGN_OFFSET * hr_qp->phy_port); - roce_set_field(reg_val, + tmp = cpu_to_le32(reg_val); + roce_set_field(tmp, ROCEE_QP1C_CFG3_0_ROCEE_QP1C_RQ_HEAD_M, ROCEE_QP1C_CFG3_0_ROCEE_QP1C_RQ_HEAD_S, hr_qp->rq.head); + reg_val = le32_to_cpu(tmp); roce_write(to_hr_dev(ibqp->device), ROCEE_QP1C_CFG3_0_REG + QP1C_CFGN_OFFSET * hr_qp->phy_port, reg_val); @@ -430,7 +440,8 @@ out: doorbell[0] = le32_to_cpu(rq_db.u32_4); doorbell[1] = 
le32_to_cpu(rq_db.u32_8); - hns_roce_write64_k(doorbell, hr_qp->rq.db_reg_l); + hns_roce_write64_k((__le32 *)doorbell, + hr_qp->rq.db_reg_l); } } spin_unlock_irqrestore(&hr_qp->rq.lock, flags); @@ -441,51 +452,63 @@ out: static void hns_roce_set_db_event_mode(struct hns_roce_dev *hr_dev, int sdb_mode, int odb_mode) { + __le32 tmp; u32 val; val = roce_read(hr_dev, ROCEE_GLB_CFG_REG); - roce_set_bit(val, ROCEE_GLB_CFG_ROCEE_DB_SQ_MODE_S, sdb_mode); - roce_set_bit(val, ROCEE_GLB_CFG_ROCEE_DB_OTH_MODE_S, odb_mode); + tmp = cpu_to_le32(val); + roce_set_bit(tmp, ROCEE_GLB_CFG_ROCEE_DB_SQ_MODE_S, sdb_mode); + roce_set_bit(tmp, ROCEE_GLB_CFG_ROCEE_DB_OTH_MODE_S, odb_mode); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_GLB_CFG_REG, val); } static void hns_roce_set_db_ext_mode(struct hns_roce_dev *hr_dev, u32 sdb_mode, u32 odb_mode) { + __le32 tmp; u32 val; /* Configure SDB/ODB extend mode */ val = roce_read(hr_dev, ROCEE_GLB_CFG_REG); - roce_set_bit(val, ROCEE_GLB_CFG_SQ_EXT_DB_MODE_S, sdb_mode); - roce_set_bit(val, ROCEE_GLB_CFG_OTH_EXT_DB_MODE_S, odb_mode); + tmp = cpu_to_le32(val); + roce_set_bit(tmp, ROCEE_GLB_CFG_SQ_EXT_DB_MODE_S, sdb_mode); + roce_set_bit(tmp, ROCEE_GLB_CFG_OTH_EXT_DB_MODE_S, odb_mode); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_GLB_CFG_REG, val); } static void hns_roce_set_sdb(struct hns_roce_dev *hr_dev, u32 sdb_alept, u32 sdb_alful) { + __le32 tmp; u32 val; /* Configure SDB */ val = roce_read(hr_dev, ROCEE_DB_SQ_WL_REG); - roce_set_field(val, ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_M, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_M, ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_S, sdb_alful); - roce_set_field(val, ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_EMPTY_M, + roce_set_field(tmp, ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_EMPTY_M, ROCEE_DB_SQ_WL_ROCEE_DB_SQ_WL_EMPTY_S, sdb_alept); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_DB_SQ_WL_REG, val); } static void hns_roce_set_odb(struct hns_roce_dev *hr_dev, u32 odb_alept, u32 odb_alful) { + __le32 tmp; u32 val; /* Configure ODB */ val = roce_read(hr_dev, ROCEE_DB_OTHERS_WL_REG); - roce_set_field(val, ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_M, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_M, ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_S, odb_alful); - roce_set_field(val, ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_EMPTY_M, + roce_set_field(tmp, ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_EMPTY_M, ROCEE_DB_OTHERS_WL_ROCEE_DB_OTH_WL_EMPTY_S, odb_alept); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_DB_OTHERS_WL_REG, val); } @@ -496,6 +519,7 @@ static void hns_roce_set_sdb_ext(struct hns_roce_dev *hr_dev, u32 ext_sdb_alept, struct hns_roce_v1_priv *priv; struct hns_roce_db_table *db; dma_addr_t sdb_dma_addr; + __le32 tmp; u32 val; priv = (struct hns_roce_v1_priv *)hr_dev->priv; @@ -511,7 +535,8 @@ static void hns_roce_set_sdb_ext(struct hns_roce_dev *hr_dev, u32 ext_sdb_alept, /* Configure extend SDB depth */ val = roce_read(hr_dev, ROCEE_EXT_DB_SQ_H_REG); - roce_set_field(val, ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_SHIFT_M, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_SHIFT_M, ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_SHIFT_S, db->ext_db->esdb_dep); /* @@ -519,8 +544,9 @@ static void hns_roce_set_sdb_ext(struct hns_roce_dev *hr_dev, u32 ext_sdb_alept, * using 4K page, and shift more 32 because of * caculating the high 32 bit value evaluated to hardware. 
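Most of the hns_roce_hw_v1.c churn in these hunks follows one sparse-driven read-modify-write idiom: roce_read() and roce_write() deal in CPU-order u32 values, while roce_set_field() and roce_set_bit() operate on __le32, so a __le32 temporary bridges the two. A compact sketch of the idiom in terms of the driver's own helpers; the mask/shift parameters are placeholders:

/* Sketch: the __le32 round-trip used by the register updates above. */
static void set_reg_field(struct hns_roce_dev *hr_dev, u32 reg,
			  u32 mask, u32 shift, u32 field_val)
{
	__le32 tmp;
	u32 val;

	val = roce_read(hr_dev, reg);		/* CPU order in */
	tmp = cpu_to_le32(val);			/* helpers want __le32 */
	roce_set_field(tmp, mask, shift, field_val);
	val = le32_to_cpu(tmp);
	roce_write(hr_dev, reg, val);		/* CPU order back out */
}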
*/ - roce_set_field(val, ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_BA_H_M, + roce_set_field(tmp, ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_BA_H_M, ROCEE_EXT_DB_SQ_H_EXT_DB_SQ_BA_H_S, sdb_dma_addr >> 44); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_EXT_DB_SQ_H_REG, val); dev_dbg(dev, "ext SDB depth: 0x%x\n", db->ext_db->esdb_dep); @@ -535,6 +561,7 @@ static void hns_roce_set_odb_ext(struct hns_roce_dev *hr_dev, u32 ext_odb_alept, struct hns_roce_v1_priv *priv; struct hns_roce_db_table *db; dma_addr_t odb_dma_addr; + __le32 tmp; u32 val; priv = (struct hns_roce_v1_priv *)hr_dev->priv; @@ -550,12 +577,14 @@ static void hns_roce_set_odb_ext(struct hns_roce_dev *hr_dev, u32 ext_odb_alept, /* Configure extend ODB depth */ val = roce_read(hr_dev, ROCEE_EXT_DB_OTH_H_REG); - roce_set_field(val, ROCEE_EXT_DB_OTH_H_EXT_DB_OTH_SHIFT_M, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_EXT_DB_OTH_H_EXT_DB_OTH_SHIFT_M, ROCEE_EXT_DB_OTH_H_EXT_DB_OTH_SHIFT_S, db->ext_db->eodb_dep); - roce_set_field(val, ROCEE_EXT_DB_SQ_H_EXT_DB_OTH_BA_H_M, + roce_set_field(tmp, ROCEE_EXT_DB_SQ_H_EXT_DB_OTH_BA_H_M, ROCEE_EXT_DB_SQ_H_EXT_DB_OTH_BA_H_S, db->ext_db->eodb_dep); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_EXT_DB_OTH_H_REG, val); dev_dbg(dev, "ext ODB depth: 0x%x\n", db->ext_db->eodb_dep); @@ -762,6 +791,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) free_mr->mr_free_qp[i] = hns_roce_v1_create_lp_qp(hr_dev, pd); if (!free_mr->mr_free_qp[i]) { dev_err(dev, "Create loop qp failed!\n"); + ret = -ENOMEM; goto create_lp_qp_failed; } hr_qp = free_mr->mr_free_qp[i]; @@ -831,7 +861,7 @@ alloc_pd_failed: if (hns_roce_ib_destroy_cq(cq)) dev_err(dev, "Destroy cq for create_lp_qp failed!\n"); - return -EINVAL; + return ret; } static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev) @@ -969,7 +999,8 @@ static int hns_roce_v1_send_lp_wqe(struct hns_roce_qp *hr_qp) { struct hns_roce_dev *hr_dev = to_hr_dev(hr_qp->ibqp.device); struct device *dev = &hr_dev->pdev->dev; - struct ib_send_wr send_wr, *bad_wr; + struct ib_send_wr send_wr; + const struct ib_send_wr *bad_wr; int ret; memset(&send_wr, 0, sizeof(send_wr)); @@ -1161,9 +1192,10 @@ static void hns_roce_db_free(struct hns_roce_dev *hr_dev) static int hns_roce_raq_init(struct hns_roce_dev *hr_dev) { int ret; + u32 val; + __le32 tmp; int raq_shift = 0; dma_addr_t addr; - u32 val; struct hns_roce_v1_priv *priv; struct hns_roce_raq_table *raq; struct device *dev = &hr_dev->pdev->dev; @@ -1189,46 +1221,54 @@ static int hns_roce_raq_init(struct hns_roce_dev *hr_dev) /* Configure raq_shift */ raq_shift = ilog2(HNS_ROCE_V1_RAQ_SIZE / HNS_ROCE_V1_RAQ_ENTRY); val = roce_read(hr_dev, ROCEE_EXT_RAQ_H_REG); - roce_set_field(val, ROCEE_EXT_RAQ_H_EXT_RAQ_SHIFT_M, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_EXT_RAQ_H_EXT_RAQ_SHIFT_M, ROCEE_EXT_RAQ_H_EXT_RAQ_SHIFT_S, raq_shift); /* * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of * using 4K page, and shift more 32 because of * caculating the high 32 bit value evaluated to hardware. 
*/ - roce_set_field(val, ROCEE_EXT_RAQ_H_EXT_RAQ_BA_H_M, + roce_set_field(tmp, ROCEE_EXT_RAQ_H_EXT_RAQ_BA_H_M, ROCEE_EXT_RAQ_H_EXT_RAQ_BA_H_S, raq->e_raq_buf->map >> 44); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_EXT_RAQ_H_REG, val); dev_dbg(dev, "Configure raq_shift 0x%x.\n", val); /* Configure raq threshold */ val = roce_read(hr_dev, ROCEE_RAQ_WL_REG); - roce_set_field(val, ROCEE_RAQ_WL_ROCEE_RAQ_WL_M, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_RAQ_WL_ROCEE_RAQ_WL_M, ROCEE_RAQ_WL_ROCEE_RAQ_WL_S, HNS_ROCE_V1_EXT_RAQ_WF); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_RAQ_WL_REG, val); dev_dbg(dev, "Configure raq_wl 0x%x.\n", val); /* Enable extend raq */ val = roce_read(hr_dev, ROCEE_WRMS_POL_TIME_INTERVAL_REG); - roce_set_field(val, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_POL_TIME_INTERVAL_M, ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_POL_TIME_INTERVAL_S, POL_TIME_INTERVAL_VAL); - roce_set_bit(val, ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_EXT_RAQ_MODE, 1); - roce_set_field(val, + roce_set_bit(tmp, ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_EXT_RAQ_MODE, 1); + roce_set_field(tmp, ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_RAQ_TIMEOUT_CHK_CFG_M, ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_RAQ_TIMEOUT_CHK_CFG_S, 2); - roce_set_bit(val, + roce_set_bit(tmp, ROCEE_WRMS_POL_TIME_INTERVAL_WRMS_RAQ_TIMEOUT_CHK_EN_S, 1); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_WRMS_POL_TIME_INTERVAL_REG, val); dev_dbg(dev, "Configure WrmsPolTimeInterval 0x%x.\n", val); /* Enable raq drop */ val = roce_read(hr_dev, ROCEE_GLB_CFG_REG); - roce_set_bit(val, ROCEE_GLB_CFG_TRP_RAQ_DROP_EN_S, 1); + tmp = cpu_to_le32(val); + roce_set_bit(tmp, ROCEE_GLB_CFG_TRP_RAQ_DROP_EN_S, 1); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_GLB_CFG_REG, val); dev_dbg(dev, "Configure GlbCfg = 0x%x.\n", val); @@ -1255,20 +1295,25 @@ static void hns_roce_raq_free(struct hns_roce_dev *hr_dev) static void hns_roce_port_enable(struct hns_roce_dev *hr_dev, int enable_flag) { + __le32 tmp; u32 val; if (enable_flag) { val = roce_read(hr_dev, ROCEE_GLB_CFG_REG); /* Open all ports */ - roce_set_field(val, ROCEE_GLB_CFG_ROCEE_PORT_ST_M, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_GLB_CFG_ROCEE_PORT_ST_M, ROCEE_GLB_CFG_ROCEE_PORT_ST_S, ALL_PORT_VAL_OPEN); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_GLB_CFG_REG, val); } else { val = roce_read(hr_dev, ROCEE_GLB_CFG_REG); /* Close all ports */ - roce_set_field(val, ROCEE_GLB_CFG_ROCEE_PORT_ST_M, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_GLB_CFG_ROCEE_PORT_ST_M, ROCEE_GLB_CFG_ROCEE_PORT_ST_S, 0x0); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_GLB_CFG_REG, val); } } @@ -1498,13 +1543,11 @@ static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev) int i = 0; struct hns_roce_caps *caps = &hr_dev->caps; - hr_dev->vendor_id = le32_to_cpu(roce_read(hr_dev, ROCEE_VENDOR_ID_REG)); - hr_dev->vendor_part_id = le32_to_cpu(roce_read(hr_dev, - ROCEE_VENDOR_PART_ID_REG)); - hr_dev->sys_image_guid = le32_to_cpu(roce_read(hr_dev, - ROCEE_SYS_IMAGE_GUID_L_REG)) | - ((u64)le32_to_cpu(roce_read(hr_dev, - ROCEE_SYS_IMAGE_GUID_H_REG)) << 32); + hr_dev->vendor_id = roce_read(hr_dev, ROCEE_VENDOR_ID_REG); + hr_dev->vendor_part_id = roce_read(hr_dev, ROCEE_VENDOR_PART_ID_REG); + hr_dev->sys_image_guid = roce_read(hr_dev, ROCEE_SYS_IMAGE_GUID_L_REG) | + ((u64)roce_read(hr_dev, + ROCEE_SYS_IMAGE_GUID_H_REG) << 32); hr_dev->hw_rev = HNS_ROCE_HW_VER1; caps->num_qps = HNS_ROCE_V1_MAX_QP_NUM; @@ -1557,8 +1600,7 @@ static int hns_roce_v1_profile(struct 
hns_roce_dev *hr_dev) caps->ceqe_depth = HNS_ROCE_V1_COMP_EQE_NUM; caps->aeqe_depth = HNS_ROCE_V1_ASYNC_EQE_NUM; - caps->local_ca_ack_delay = le32_to_cpu(roce_read(hr_dev, - ROCEE_ACK_DELAY_REG)); + caps->local_ca_ack_delay = roce_read(hr_dev, ROCEE_ACK_DELAY_REG); caps->max_mtu = IB_MTU_2048; return 0; @@ -1568,21 +1610,25 @@ static int hns_roce_v1_init(struct hns_roce_dev *hr_dev) { int ret; u32 val; + __le32 tmp; struct device *dev = &hr_dev->pdev->dev; /* DMAE user config */ val = roce_read(hr_dev, ROCEE_DMAE_USER_CFG1_REG); - roce_set_field(val, ROCEE_DMAE_USER_CFG1_ROCEE_CACHE_TB_CFG_M, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_DMAE_USER_CFG1_ROCEE_CACHE_TB_CFG_M, ROCEE_DMAE_USER_CFG1_ROCEE_CACHE_TB_CFG_S, 0xf); - roce_set_field(val, ROCEE_DMAE_USER_CFG1_ROCEE_STREAM_ID_TB_CFG_M, + roce_set_field(tmp, ROCEE_DMAE_USER_CFG1_ROCEE_STREAM_ID_TB_CFG_M, ROCEE_DMAE_USER_CFG1_ROCEE_STREAM_ID_TB_CFG_S, 1 << PAGES_SHIFT_16); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_DMAE_USER_CFG1_REG, val); val = roce_read(hr_dev, ROCEE_DMAE_USER_CFG2_REG); - roce_set_field(val, ROCEE_DMAE_USER_CFG2_ROCEE_CACHE_PKT_CFG_M, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_DMAE_USER_CFG2_ROCEE_CACHE_PKT_CFG_M, ROCEE_DMAE_USER_CFG2_ROCEE_CACHE_PKT_CFG_S, 0xf); - roce_set_field(val, ROCEE_DMAE_USER_CFG2_ROCEE_STREAM_ID_PKT_CFG_M, + roce_set_field(tmp, ROCEE_DMAE_USER_CFG2_ROCEE_STREAM_ID_PKT_CFG_M, ROCEE_DMAE_USER_CFG2_ROCEE_STREAM_ID_PKT_CFG_S, 1 << PAGES_SHIFT_16); @@ -1668,6 +1714,7 @@ static int hns_roce_v1_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u32 __iomem *hcr = (u32 __iomem *)(hr_dev->reg_base + ROCEE_MB1_REG); unsigned long end; u32 val = 0; + __le32 tmp; end = msecs_to_jiffies(GO_BIT_TIMEOUT_MSECS) + jiffies; while (hns_roce_v1_cmd_pending(hr_dev)) { @@ -1679,15 +1726,17 @@ static int hns_roce_v1_post_mbox(struct hns_roce_dev *hr_dev, u64 in_param, cond_resched(); } - roce_set_field(val, ROCEE_MB6_ROCEE_MB_CMD_M, ROCEE_MB6_ROCEE_MB_CMD_S, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_MB6_ROCEE_MB_CMD_M, ROCEE_MB6_ROCEE_MB_CMD_S, op); - roce_set_field(val, ROCEE_MB6_ROCEE_MB_CMD_MDF_M, + roce_set_field(tmp, ROCEE_MB6_ROCEE_MB_CMD_MDF_M, ROCEE_MB6_ROCEE_MB_CMD_MDF_S, op_modifier); - roce_set_bit(val, ROCEE_MB6_ROCEE_MB_EVENT_S, event); - roce_set_bit(val, ROCEE_MB6_ROCEE_MB_HW_RUN_S, 1); - roce_set_field(val, ROCEE_MB6_ROCEE_MB_TOKEN_M, + roce_set_bit(tmp, ROCEE_MB6_ROCEE_MB_EVENT_S, event); + roce_set_bit(tmp, ROCEE_MB6_ROCEE_MB_HW_RUN_S, 1); + roce_set_field(tmp, ROCEE_MB6_ROCEE_MB_TOKEN_M, ROCEE_MB6_ROCEE_MB_TOKEN_S, token); + val = le32_to_cpu(tmp); writeq(in_param, hcr + 0); writeq(out_param, hcr + 2); writel(in_modifier, hcr + 4); @@ -1717,7 +1766,7 @@ static int hns_roce_v1_chk_mbox(struct hns_roce_dev *hr_dev, return -ETIMEDOUT; } - status = le32_to_cpu((__force __be32) + status = le32_to_cpu((__force __le32) __raw_readl(hcr + HCR_STATUS_OFFSET)); if ((status & STATUS_MASK) != 0x1) { dev_err(hr_dev->dev, "mailbox status 0x%x!\n", status); @@ -1728,7 +1777,7 @@ static int hns_roce_v1_chk_mbox(struct hns_roce_dev *hr_dev, } static int hns_roce_v1_set_gid(struct hns_roce_dev *hr_dev, u8 port, - int gid_index, union ib_gid *gid, + int gid_index, const union ib_gid *gid, const struct ib_gid_attr *attr) { u32 *p = NULL; @@ -1760,6 +1809,7 @@ static int hns_roce_v1_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, { u32 reg_smac_l; u16 reg_smac_h; + __le32 tmp; u16 *p_h; u32 *p; u32 val; @@ -1784,10 +1834,12 @@ static int hns_roce_v1_set_mac(struct 
hns_roce_dev *hr_dev, u8 phy_port, val = roce_read(hr_dev, ROCEE_SMAC_H_0_REG + phy_port * PHY_PORT_OFFSET); + tmp = cpu_to_le32(val); p_h = (u16 *)(&addr[4]); reg_smac_h = *p_h; - roce_set_field(val, ROCEE_SMAC_H_ROCEE_SMAC_H_M, + roce_set_field(tmp, ROCEE_SMAC_H_ROCEE_SMAC_H_M, ROCEE_SMAC_H_ROCEE_SMAC_H_S, reg_smac_h); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_SMAC_H_0_REG + phy_port * PHY_PORT_OFFSET, val); @@ -1797,12 +1849,15 @@ static int hns_roce_v1_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, static void hns_roce_v1_set_mtu(struct hns_roce_dev *hr_dev, u8 phy_port, enum ib_mtu mtu) { + __le32 tmp; u32 val; val = roce_read(hr_dev, ROCEE_SMAC_H_0_REG + phy_port * PHY_PORT_OFFSET); - roce_set_field(val, ROCEE_SMAC_H_ROCEE_PORT_MTU_M, + tmp = cpu_to_le32(val); + roce_set_field(tmp, ROCEE_SMAC_H_ROCEE_PORT_MTU_M, ROCEE_SMAC_H_ROCEE_PORT_MTU_S, mtu); + val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_SMAC_H_0_REG + phy_port * PHY_PORT_OFFSET, val); } @@ -1848,9 +1903,9 @@ static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, roce_set_field(mpt_entry->mpt_byte_12, MPT_BYTE_12_MW_BIND_COUNTER_M, MPT_BYTE_12_MW_BIND_COUNTER_S, 0); - mpt_entry->virt_addr_l = (u32)mr->iova; - mpt_entry->virt_addr_h = (u32)(mr->iova >> 32); - mpt_entry->length = (u32)mr->size; + mpt_entry->virt_addr_l = cpu_to_le32((u32)mr->iova); + mpt_entry->virt_addr_h = cpu_to_le32((u32)(mr->iova >> 32)); + mpt_entry->length = cpu_to_le32((u32)mr->size); roce_set_field(mpt_entry->mpt_byte_28, MPT_BYTE_28_PD_M, MPT_BYTE_28_PD_S, mr->pd); @@ -1885,64 +1940,59 @@ static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, roce_set_field(mpt_entry->mpt_byte_36, MPT_BYTE_36_PA0_H_M, MPT_BYTE_36_PA0_H_S, - cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_32))); + (u32)(pages[i] >> PAGES_SHIFT_32)); break; case 1: roce_set_field(mpt_entry->mpt_byte_36, MPT_BYTE_36_PA1_L_M, - MPT_BYTE_36_PA1_L_S, - cpu_to_le32((u32)(pages[i]))); + MPT_BYTE_36_PA1_L_S, (u32)(pages[i])); roce_set_field(mpt_entry->mpt_byte_40, MPT_BYTE_40_PA1_H_M, MPT_BYTE_40_PA1_H_S, - cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_24))); + (u32)(pages[i] >> PAGES_SHIFT_24)); break; case 2: roce_set_field(mpt_entry->mpt_byte_40, MPT_BYTE_40_PA2_L_M, - MPT_BYTE_40_PA2_L_S, - cpu_to_le32((u32)(pages[i]))); + MPT_BYTE_40_PA2_L_S, (u32)(pages[i])); roce_set_field(mpt_entry->mpt_byte_44, MPT_BYTE_44_PA2_H_M, MPT_BYTE_44_PA2_H_S, - cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_16))); + (u32)(pages[i] >> PAGES_SHIFT_16)); break; case 3: roce_set_field(mpt_entry->mpt_byte_44, MPT_BYTE_44_PA3_L_M, - MPT_BYTE_44_PA3_L_S, - cpu_to_le32((u32)(pages[i]))); + MPT_BYTE_44_PA3_L_S, (u32)(pages[i])); roce_set_field(mpt_entry->mpt_byte_48, MPT_BYTE_48_PA3_H_M, MPT_BYTE_48_PA3_H_S, - cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_8))); + (u32)(pages[i] >> PAGES_SHIFT_8)); break; case 4: mpt_entry->pa4_l = cpu_to_le32((u32)(pages[i])); roce_set_field(mpt_entry->mpt_byte_56, MPT_BYTE_56_PA4_H_M, MPT_BYTE_56_PA4_H_S, - cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_32))); + (u32)(pages[i] >> PAGES_SHIFT_32)); break; case 5: roce_set_field(mpt_entry->mpt_byte_56, MPT_BYTE_56_PA5_L_M, - MPT_BYTE_56_PA5_L_S, - cpu_to_le32((u32)(pages[i]))); + MPT_BYTE_56_PA5_L_S, (u32)(pages[i])); roce_set_field(mpt_entry->mpt_byte_60, MPT_BYTE_60_PA5_H_M, MPT_BYTE_60_PA5_H_S, - cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_24))); + (u32)(pages[i] >> PAGES_SHIFT_24)); break; case 6: roce_set_field(mpt_entry->mpt_byte_60, MPT_BYTE_60_PA6_L_M, - MPT_BYTE_60_PA6_L_S, - 
cpu_to_le32((u32)(pages[i]))); + MPT_BYTE_60_PA6_L_S, (u32)(pages[i])); roce_set_field(mpt_entry->mpt_byte_64, MPT_BYTE_64_PA6_H_M, MPT_BYTE_64_PA6_H_S, - cpu_to_le32((u32)(pages[i] >> PAGES_SHIFT_16))); + (u32)(pages[i] >> PAGES_SHIFT_16)); break; default: break; @@ -1951,7 +2001,7 @@ static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, free_page((unsigned long) pages); - mpt_entry->pbl_addr_l = (u32)(mr->pbl_dma_addr); + mpt_entry->pbl_addr_l = cpu_to_le32((u32)(mr->pbl_dma_addr)); roce_set_field(mpt_entry->mpt_byte_12, MPT_BYTE_12_PBL_ADDR_H_M, MPT_BYTE_12_PBL_ADDR_H_S, @@ -1982,9 +2032,9 @@ static struct hns_roce_cqe *next_cqe_sw(struct hns_roce_cq *hr_cq) static void hns_roce_v1_cq_set_ci(struct hns_roce_cq *hr_cq, u32 cons_index) { - u32 doorbell[2]; + __le32 doorbell[2]; - doorbell[0] = cons_index & ((hr_cq->cq_depth << 1) - 1); + doorbell[0] = cpu_to_le32(cons_index & ((hr_cq->cq_depth << 1) - 1)); doorbell[1] = 0; roce_set_bit(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_HW_SYNS_S, 1); roce_set_field(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_M, @@ -2081,10 +2131,8 @@ static void hns_roce_v1_write_cqc(struct hns_roce_dev *hr_dev, CQ_CONTEXT_CQC_BYTE_4_CQC_STATE_S, CQ_STATE_VALID); roce_set_field(cq_context->cqc_byte_4, CQ_CONTEXT_CQC_BYTE_4_CQN_M, CQ_CONTEXT_CQC_BYTE_4_CQN_S, hr_cq->cqn); - cq_context->cqc_byte_4 = cpu_to_le32(cq_context->cqc_byte_4); - cq_context->cq_bt_l = (u32)dma_handle; - cq_context->cq_bt_l = cpu_to_le32(cq_context->cq_bt_l); + cq_context->cq_bt_l = cpu_to_le32((u32)dma_handle); roce_set_field(cq_context->cqc_byte_12, CQ_CONTEXT_CQC_BYTE_12_CQ_BT_H_M, @@ -2096,15 +2144,12 @@ static void hns_roce_v1_write_cqc(struct hns_roce_dev *hr_dev, ilog2((unsigned int)nent)); roce_set_field(cq_context->cqc_byte_12, CQ_CONTEXT_CQC_BYTE_12_CEQN_M, CQ_CONTEXT_CQC_BYTE_12_CEQN_S, vector); - cq_context->cqc_byte_12 = cpu_to_le32(cq_context->cqc_byte_12); - cq_context->cur_cqe_ba0_l = (u32)(mtts[0]); - cq_context->cur_cqe_ba0_l = cpu_to_le32(cq_context->cur_cqe_ba0_l); + cq_context->cur_cqe_ba0_l = cpu_to_le32((u32)(mtts[0])); roce_set_field(cq_context->cqc_byte_20, CQ_CONTEXT_CQC_BYTE_20_CUR_CQE_BA0_H_M, - CQ_CONTEXT_CQC_BYTE_20_CUR_CQE_BA0_H_S, - cpu_to_le32((mtts[0]) >> 32)); + CQ_CONTEXT_CQC_BYTE_20_CUR_CQE_BA0_H_S, (mtts[0]) >> 32); /* Dedicated hardware, directly set 0 */ roce_set_field(cq_context->cqc_byte_20, CQ_CONTEXT_CQC_BYTE_20_CQ_CUR_INDEX_M, @@ -2118,9 +2163,8 @@ static void hns_roce_v1_write_cqc(struct hns_roce_dev *hr_dev, CQ_CONTEXT_CQC_BYTE_20_CQE_TPTR_ADDR_H_M, CQ_CONTEXT_CQC_BYTE_20_CQE_TPTR_ADDR_H_S, tptr_dma_addr >> 44); - cq_context->cqc_byte_20 = cpu_to_le32(cq_context->cqc_byte_20); - cq_context->cqe_tptr_addr_l = (u32)(tptr_dma_addr >> 12); + cq_context->cqe_tptr_addr_l = cpu_to_le32((u32)(tptr_dma_addr >> 12)); roce_set_field(cq_context->cqc_byte_32, CQ_CONTEXT_CQC_BYTE_32_CUR_CQE_BA1_H_M, @@ -2138,7 +2182,6 @@ static void hns_roce_v1_write_cqc(struct hns_roce_dev *hr_dev, roce_set_field(cq_context->cqc_byte_32, CQ_CONTEXT_CQC_BYTE_32_CQ_CONS_IDX_M, CQ_CONTEXT_CQC_BYTE_32_CQ_CONS_IDX_S, 0); - cq_context->cqc_byte_32 = cpu_to_le32(cq_context->cqc_byte_32); } static int hns_roce_v1_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) @@ -2151,7 +2194,7 @@ static int hns_roce_v1_req_notify_cq(struct ib_cq *ibcq, { struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); u32 notification_flag; - u32 doorbell[2]; + __le32 doorbell[2]; notification_flag = (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? 
CQ_DB_REQ_NOT : CQ_DB_REQ_NOT_SOL; @@ -2159,7 +2202,8 @@ static int hns_roce_v1_req_notify_cq(struct ib_cq *ibcq, * flags = 0; Notification Flag = 1, next * flags = 1; Notification Flag = 0, solocited */ - doorbell[0] = hr_cq->cons_index & ((hr_cq->cq_depth << 1) - 1); + doorbell[0] = + cpu_to_le32(hr_cq->cons_index & ((hr_cq->cq_depth << 1) - 1)); roce_set_bit(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_HW_SYNS_S, 1); roce_set_field(doorbell[1], ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_M, ROCEE_DB_OTHERS_H_ROCEE_DB_OTH_CMD_S, 3); @@ -2416,7 +2460,7 @@ static int hns_roce_v1_clear_hem(struct hns_roce_dev *hr_dev, struct device *dev = &hr_dev->pdev->dev; struct hns_roce_v1_priv *priv; unsigned long end = 0, flags = 0; - uint32_t bt_cmd_val[2] = {0}; + __le32 bt_cmd_val[2] = {0}; void __iomem *bt_cmd; u64 bt_ba = 0; @@ -2468,7 +2512,7 @@ static int hns_roce_v1_clear_hem(struct hns_roce_dev *hr_dev, msleep(HW_SYNC_SLEEP_TIME_INTERVAL); } - bt_cmd_val[0] = (uint32_t)bt_ba; + bt_cmd_val[0] = (__le32)bt_ba; roce_set_field(bt_cmd_val[1], ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_M, ROCEE_BT_CMD_H_ROCEE_BT_CMD_BA_H_S, bt_ba >> 32); hns_roce_write64_k(bt_cmd_val, hr_dev->reg_base + ROCEE_BT_CMD_L_REG); @@ -2569,10 +2613,11 @@ static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, struct hns_roce_sqp_context *context; struct device *dev = &hr_dev->pdev->dev; dma_addr_t dma_handle = 0; + u32 __iomem *addr; int rq_pa_start; + __le32 tmp; u32 reg_val; u64 *mtts; - u32 __iomem *addr; context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) @@ -2598,7 +2643,7 @@ static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, roce_set_field(context->qp1c_bytes_4, QP1C_BYTES_4_PD_M, QP1C_BYTES_4_PD_S, to_hr_pd(ibqp->pd)->pdn); - context->sq_rq_bt_l = (u32)(dma_handle); + context->sq_rq_bt_l = cpu_to_le32((u32)(dma_handle)); roce_set_field(context->qp1c_bytes_12, QP1C_BYTES_12_SQ_RQ_BT_H_M, QP1C_BYTES_12_SQ_RQ_BT_H_S, @@ -2610,7 +2655,7 @@ static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, QP1C_BYTES_16_PORT_NUM_S, hr_qp->phy_port); roce_set_bit(context->qp1c_bytes_16, QP1C_BYTES_16_SIGNALING_TYPE_S, - hr_qp->sq_signal_bits); + le32_to_cpu(hr_qp->sq_signal_bits)); roce_set_bit(context->qp1c_bytes_16, QP1C_BYTES_16_RQ_BA_FLG_S, 1); roce_set_bit(context->qp1c_bytes_16, QP1C_BYTES_16_SQ_BA_FLG_S, @@ -2624,7 +2669,8 @@ static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, QP1C_BYTES_20_PKEY_IDX_S, attr->pkey_index); rq_pa_start = (u32)hr_qp->rq.offset / PAGE_SIZE; - context->cur_rq_wqe_ba_l = (u32)(mtts[rq_pa_start]); + context->cur_rq_wqe_ba_l = + cpu_to_le32((u32)(mtts[rq_pa_start])); roce_set_field(context->qp1c_bytes_28, QP1C_BYTES_28_CUR_RQ_WQE_BA_H_M, @@ -2643,7 +2689,7 @@ static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, QP1C_BYTES_32_TX_CQ_NUM_S, to_hr_cq(ibqp->send_cq)->cqn); - context->cur_sq_wqe_ba_l = (u32)mtts[0]; + context->cur_sq_wqe_ba_l = cpu_to_le32((u32)mtts[0]); roce_set_field(context->qp1c_bytes_40, QP1C_BYTES_40_CUR_SQ_WQE_BA_H_M, @@ -2658,23 +2704,25 @@ static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, ROCEE_QP1C_CFG0_0_REG + hr_qp->phy_port * sizeof(*context)); - writel(context->qp1c_bytes_4, addr); - writel(context->sq_rq_bt_l, addr + 1); - writel(context->qp1c_bytes_12, addr + 2); - writel(context->qp1c_bytes_16, addr + 3); - writel(context->qp1c_bytes_20, addr + 4); - writel(context->cur_rq_wqe_ba_l, addr + 5); - 
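The doorbell helpers above now build both 32-bit words as __le32 before handing them to hns_roce_write64_k(), which (as shown earlier in this diff) issues them as a single 64-bit store. Approximately, assuming a 64-bit platform where __raw_writeq() is available:

#include <linux/io.h>

/* Sketch: compose a little-endian doorbell pair and ring it in one store. */
static void ring_db(void __iomem *db_reg, u32 low, u32 high)
{
	__le32 db[2];

	db[0] = cpu_to_le32(low);
	db[1] = cpu_to_le32(high);
	__raw_writeq(*(u64 *)db, db_reg);	/* same trick as hns_roce_write64_k() */
}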
writel(context->qp1c_bytes_28, addr + 6); - writel(context->qp1c_bytes_32, addr + 7); - writel(context->cur_sq_wqe_ba_l, addr + 8); - writel(context->qp1c_bytes_40, addr + 9); + writel(le32_to_cpu(context->qp1c_bytes_4), addr); + writel(le32_to_cpu(context->sq_rq_bt_l), addr + 1); + writel(le32_to_cpu(context->qp1c_bytes_12), addr + 2); + writel(le32_to_cpu(context->qp1c_bytes_16), addr + 3); + writel(le32_to_cpu(context->qp1c_bytes_20), addr + 4); + writel(le32_to_cpu(context->cur_rq_wqe_ba_l), addr + 5); + writel(le32_to_cpu(context->qp1c_bytes_28), addr + 6); + writel(le32_to_cpu(context->qp1c_bytes_32), addr + 7); + writel(le32_to_cpu(context->cur_sq_wqe_ba_l), addr + 8); + writel(le32_to_cpu(context->qp1c_bytes_40), addr + 9); } /* Modify QP1C status */ reg_val = roce_read(hr_dev, ROCEE_QP1C_CFG0_0_REG + hr_qp->phy_port * sizeof(*context)); - roce_set_field(reg_val, ROCEE_QP1C_CFG0_0_ROCEE_QP1C_QP_ST_M, + tmp = cpu_to_le32(reg_val); + roce_set_field(tmp, ROCEE_QP1C_CFG0_0_ROCEE_QP1C_QP_ST_M, ROCEE_QP1C_CFG0_0_ROCEE_QP1C_QP_ST_S, new_state); + reg_val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_QP1C_CFG0_0_REG + hr_qp->phy_port * sizeof(*context), reg_val); @@ -2712,7 +2760,7 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); dma_addr_t dma_handle_2 = 0; dma_addr_t dma_handle = 0; - uint32_t doorbell[2] = {0}; + __le32 doorbell[2] = {0}; int rq_pa_start = 0; u64 *mtts_2 = NULL; int ret = -EINVAL; @@ -2887,7 +2935,7 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, dmac = (u8 *)attr->ah_attr.roce.dmac; - context->sq_rq_bt_l = (u32)(dma_handle); + context->sq_rq_bt_l = cpu_to_le32((u32)(dma_handle)); roce_set_field(context->qpc_bytes_24, QP_CONTEXT_QPC_BYTES_24_SQ_RQ_BT_H_M, QP_CONTEXT_QPC_BYTES_24_SQ_RQ_BT_H_S, @@ -2899,7 +2947,7 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, QP_CONTEXT_QPC_BYTES_24_MINIMUM_RNR_NAK_TIMER_M, QP_CONTEXT_QPC_BYTES_24_MINIMUM_RNR_NAK_TIMER_S, attr->min_rnr_timer); - context->irrl_ba_l = (u32)(dma_handle_2); + context->irrl_ba_l = cpu_to_le32((u32)(dma_handle_2)); roce_set_field(context->qpc_bytes_32, QP_CONTEXT_QPC_BYTES_32_IRRL_BA_H_M, QP_CONTEXT_QPC_BYTES_32_IRRL_BA_H_S, @@ -2913,7 +2961,7 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, 1); roce_set_bit(context->qpc_bytes_32, QP_CONTEXT_QPC_BYTE_32_SIGNALING_TYPE_S, - hr_qp->sq_signal_bits); + le32_to_cpu(hr_qp->sq_signal_bits)); port = (attr_mask & IB_QP_PORT) ? 
(attr->port_num - 1) : hr_qp->port; @@ -2991,7 +3039,8 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, QP_CONTEXT_QPC_BYTES_68_RQ_CUR_INDEX_S, 0); rq_pa_start = (u32)hr_qp->rq.offset / PAGE_SIZE; - context->cur_rq_wqe_ba_l = (u32)(mtts[rq_pa_start]); + context->cur_rq_wqe_ba_l = + cpu_to_le32((u32)(mtts[rq_pa_start])); roce_set_field(context->qpc_bytes_76, QP_CONTEXT_QPC_BYTES_76_CUR_RQ_WQE_BA_H_M, @@ -3071,7 +3120,7 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, goto out; } - context->rx_cur_sq_wqe_ba_l = (u32)(mtts[0]); + context->rx_cur_sq_wqe_ba_l = cpu_to_le32((u32)(mtts[0])); roce_set_field(context->qpc_bytes_120, QP_CONTEXT_QPC_BYTES_120_RX_CUR_SQ_WQE_BA_H_M, @@ -3219,7 +3268,7 @@ static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, QP_CONTEXT_QPC_BYTES_180_SQ_HEAD_M, QP_CONTEXT_QPC_BYTES_180_SQ_HEAD_S, 0); - context->tx_cur_sq_wqe_ba_l = (u32)(mtts[0]); + context->tx_cur_sq_wqe_ba_l = cpu_to_le32((u32)(mtts[0])); roce_set_field(context->qpc_bytes_188, QP_CONTEXT_QPC_BYTES_188_TX_CUR_SQ_WQE_BA_H_M, @@ -3386,16 +3435,16 @@ static int hns_roce_v1_q_sqp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, addr = ROCEE_QP1C_CFG0_0_REG + hr_qp->port * sizeof(struct hns_roce_sqp_context); - context.qp1c_bytes_4 = roce_read(hr_dev, addr); - context.sq_rq_bt_l = roce_read(hr_dev, addr + 1); - context.qp1c_bytes_12 = roce_read(hr_dev, addr + 2); - context.qp1c_bytes_16 = roce_read(hr_dev, addr + 3); - context.qp1c_bytes_20 = roce_read(hr_dev, addr + 4); - context.cur_rq_wqe_ba_l = roce_read(hr_dev, addr + 5); - context.qp1c_bytes_28 = roce_read(hr_dev, addr + 6); - context.qp1c_bytes_32 = roce_read(hr_dev, addr + 7); - context.cur_sq_wqe_ba_l = roce_read(hr_dev, addr + 8); - context.qp1c_bytes_40 = roce_read(hr_dev, addr + 9); + context.qp1c_bytes_4 = cpu_to_le32(roce_read(hr_dev, addr)); + context.sq_rq_bt_l = cpu_to_le32(roce_read(hr_dev, addr + 1)); + context.qp1c_bytes_12 = cpu_to_le32(roce_read(hr_dev, addr + 2)); + context.qp1c_bytes_16 = cpu_to_le32(roce_read(hr_dev, addr + 3)); + context.qp1c_bytes_20 = cpu_to_le32(roce_read(hr_dev, addr + 4)); + context.cur_rq_wqe_ba_l = cpu_to_le32(roce_read(hr_dev, addr + 5)); + context.qp1c_bytes_28 = cpu_to_le32(roce_read(hr_dev, addr + 6)); + context.qp1c_bytes_32 = cpu_to_le32(roce_read(hr_dev, addr + 7)); + context.cur_sq_wqe_ba_l = cpu_to_le32(roce_read(hr_dev, addr + 8)); + context.qp1c_bytes_40 = cpu_to_le32(roce_read(hr_dev, addr + 9)); hr_qp->state = roce_get_field(context.qp1c_bytes_4, QP1C_BYTES_4_QP_STATE_M, @@ -3557,7 +3606,7 @@ static int hns_roce_v1_q_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_attr->retry_cnt = roce_get_field(context->qpc_bytes_148, QP_CONTEXT_QPC_BYTES_148_RETRY_COUNT_M, QP_CONTEXT_QPC_BYTES_148_RETRY_COUNT_S); - qp_attr->rnr_retry = context->rnr_retry; + qp_attr->rnr_retry = (u8)context->rnr_retry; done: qp_attr->cur_qp_state = qp_attr->qp_state; @@ -3595,42 +3644,47 @@ static void hns_roce_check_sdb_status(struct hns_roce_dev *hr_dev, u32 *old_send, u32 *old_retry, u32 *tsp_st, u32 *success_flags) { + __le32 *old_send_tmp, *old_retry_tmp; u32 sdb_retry_cnt; u32 sdb_send_ptr; u32 cur_cnt, old_cnt; + __le32 tmp, tmp1; u32 send_ptr; sdb_send_ptr = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG); sdb_retry_cnt = roce_read(hr_dev, ROCEE_SDB_RETRY_CNT_REG); - cur_cnt = roce_get_field(sdb_send_ptr, - ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, + tmp = cpu_to_le32(sdb_send_ptr); + tmp1 = cpu_to_le32(sdb_retry_cnt); + cur_cnt = 
roce_get_field(tmp, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) + - roce_get_field(sdb_retry_cnt, - ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M, + roce_get_field(tmp1, ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M, ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S); + + old_send_tmp = (__le32 *)old_send; + old_retry_tmp = (__le32 *)old_retry; if (!roce_get_bit(*tsp_st, ROCEE_CNT_CLR_CE_CNT_CLR_CE_S)) { - old_cnt = roce_get_field(*old_send, + old_cnt = roce_get_field(*old_send_tmp, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) + - roce_get_field(*old_retry, + roce_get_field(*old_retry_tmp, ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M, ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S); if (cur_cnt - old_cnt > SDB_ST_CMP_VAL) *success_flags = 1; } else { - old_cnt = roce_get_field(*old_send, + old_cnt = roce_get_field(*old_send_tmp, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S); if (cur_cnt - old_cnt > SDB_ST_CMP_VAL) { *success_flags = 1; } else { - send_ptr = roce_get_field(*old_send, + send_ptr = roce_get_field(*old_send_tmp, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) + - roce_get_field(sdb_retry_cnt, + roce_get_field(tmp1, ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M, ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S); - roce_set_field(*old_send, + roce_set_field(*old_send_tmp, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S, send_ptr); @@ -3646,11 +3700,14 @@ static int check_qp_db_process_status(struct hns_roce_dev *hr_dev, { struct device *dev = &hr_dev->pdev->dev; u32 sdb_send_ptr, old_send; + __le32 sdb_issue_ptr_tmp; + __le32 sdb_send_ptr_tmp; u32 success_flags = 0; unsigned long end; u32 old_retry; u32 inv_cnt; u32 tsp_st; + __le32 tmp; if (*wait_stage > HNS_ROCE_V1_DB_STAGE2 || *wait_stage < HNS_ROCE_V1_DB_STAGE1) { @@ -3679,10 +3736,12 @@ static int check_qp_db_process_status(struct hns_roce_dev *hr_dev, ROCEE_SDB_SEND_PTR_REG); } - if (roce_get_field(sdb_issue_ptr, + sdb_send_ptr_tmp = cpu_to_le32(sdb_send_ptr); + sdb_issue_ptr_tmp = cpu_to_le32(sdb_issue_ptr); + if (roce_get_field(sdb_issue_ptr_tmp, ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_M, ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_S) == - roce_get_field(sdb_send_ptr, + roce_get_field(sdb_send_ptr_tmp, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S)) { old_send = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG); @@ -3690,7 +3749,8 @@ static int check_qp_db_process_status(struct hns_roce_dev *hr_dev, do { tsp_st = roce_read(hr_dev, ROCEE_TSP_BP_ST_REG); - if (roce_get_bit(tsp_st, + tmp = cpu_to_le32(tsp_st); + if (roce_get_bit(tmp, ROCEE_TSP_BP_ST_QH_FIFO_ENTRY_S) == 1) { *wait_stage = HNS_ROCE_V1_DB_WAIT_OK; return 0; @@ -3699,8 +3759,9 @@ static int check_qp_db_process_status(struct hns_roce_dev *hr_dev, if (!time_before(jiffies, end)) { dev_dbg(dev, "QP(0x%lx) db process stage1 timeout when send ptr equals issue ptr.\n" "issue 0x%x send 0x%x.\n", - hr_qp->qpn, sdb_issue_ptr, - sdb_send_ptr); + hr_qp->qpn, + le32_to_cpu(sdb_issue_ptr_tmp), + le32_to_cpu(sdb_send_ptr_tmp)); return 0; } @@ -4102,9 +4163,9 @@ static void hns_roce_v1_cq_err_handle(struct hns_roce_dev *hr_dev, struct device *dev = &hr_dev->pdev->dev; u32 cqn; - cqn = le32_to_cpu(roce_get_field(aeqe->event.cq_event.cq, + cqn = roce_get_field(aeqe->event.cq_event.cq, HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M, - HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S)); + HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S); switch (event_type) { case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: @@ -4340,6 +4401,7 @@ static irqreturn_t hns_roce_v1_msix_interrupt_abn(int 
irq, void *dev_id) u32 aeshift_val; u32 ceshift_val; u32 cemask_val; + __le32 tmp; int i; /* @@ -4348,30 +4410,34 @@ static irqreturn_t hns_roce_v1_msix_interrupt_abn(int irq, void *dev_id) * interrupt, mask irq, clear irq, cancel mask operation */ aeshift_val = roce_read(hr_dev, ROCEE_CAEP_AEQC_AEQE_SHIFT_REG); + tmp = cpu_to_le32(aeshift_val); /* AEQE overflow */ - if (roce_get_bit(aeshift_val, + if (roce_get_bit(tmp, ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQ_ALM_OVF_INT_ST_S) == 1) { dev_warn(dev, "AEQ overflow!\n"); /* Set mask */ caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); - roce_set_bit(caepaemask_val, - ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, + tmp = cpu_to_le32(caepaemask_val); + roce_set_bit(tmp, ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, HNS_ROCE_INT_MASK_ENABLE); + caepaemask_val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val); /* Clear int state(INT_WC : write 1 clear) */ caepaest_val = roce_read(hr_dev, ROCEE_CAEP_AE_ST_REG); - roce_set_bit(caepaest_val, - ROCEE_CAEP_AE_ST_CAEP_AEQ_ALM_OVF_S, 1); + tmp = cpu_to_le32(caepaest_val); + roce_set_bit(tmp, ROCEE_CAEP_AE_ST_CAEP_AEQ_ALM_OVF_S, 1); + caepaest_val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_CAEP_AE_ST_REG, caepaest_val); /* Clear mask */ caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); - roce_set_bit(caepaemask_val, - ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, + tmp = cpu_to_le32(caepaemask_val); + roce_set_bit(tmp, ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, HNS_ROCE_INT_MASK_DISABLE); + caepaemask_val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val); } @@ -4379,8 +4445,9 @@ static irqreturn_t hns_roce_v1_msix_interrupt_abn(int irq, void *dev_id) for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) { ceshift_val = roce_read(hr_dev, ROCEE_CAEP_CEQC_SHIFT_0_REG + i * CEQ_REG_OFFSET); + tmp = cpu_to_le32(ceshift_val); - if (roce_get_bit(ceshift_val, + if (roce_get_bit(tmp, ROCEE_CAEP_CEQC_SHIFT_CAEP_CEQ_ALM_OVF_INT_ST_S) == 1) { dev_warn(dev, "CEQ[%d] almost overflow!\n", i); int_work++; @@ -4389,9 +4456,11 @@ static irqreturn_t hns_roce_v1_msix_interrupt_abn(int irq, void *dev_id) cemask_val = roce_read(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + i * CEQ_REG_OFFSET); - roce_set_bit(cemask_val, + tmp = cpu_to_le32(cemask_val); + roce_set_bit(tmp, ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S, HNS_ROCE_INT_MASK_ENABLE); + cemask_val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + i * CEQ_REG_OFFSET, cemask_val); @@ -4399,9 +4468,11 @@ static irqreturn_t hns_roce_v1_msix_interrupt_abn(int irq, void *dev_id) cealmovf_val = roce_read(hr_dev, ROCEE_CAEP_CEQ_ALM_OVF_0_REG + i * CEQ_REG_OFFSET); - roce_set_bit(cealmovf_val, + tmp = cpu_to_le32(cealmovf_val); + roce_set_bit(tmp, ROCEE_CAEP_CEQ_ALM_OVF_CAEP_CEQ_ALM_OVF_S, 1); + cealmovf_val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_CAEP_CEQ_ALM_OVF_0_REG + i * CEQ_REG_OFFSET, cealmovf_val); @@ -4409,9 +4480,11 @@ static irqreturn_t hns_roce_v1_msix_interrupt_abn(int irq, void *dev_id) cemask_val = roce_read(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + i * CEQ_REG_OFFSET); - roce_set_bit(cemask_val, + tmp = cpu_to_le32(cemask_val); + roce_set_bit(tmp, ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S, HNS_ROCE_INT_MASK_DISABLE); + cemask_val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + i * CEQ_REG_OFFSET, cemask_val); } @@ -4435,13 +4508,16 @@ static void hns_roce_v1_int_mask_enable(struct hns_roce_dev *hr_dev) { u32 aemask_val; int masken = 0; + __le32 tmp; 
int i; /* AEQ INT */ aemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); - roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, + tmp = cpu_to_le32(aemask_val); + roce_set_bit(tmp, ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, masken); - roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AE_IRQ_MASK_S, masken); + roce_set_bit(tmp, ROCEE_CAEP_AE_MASK_CAEP_AE_IRQ_MASK_S, masken); + aemask_val = le32_to_cpu(tmp); roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, aemask_val); /* CEQ INT */ @@ -4473,20 +4549,24 @@ static void hns_roce_v1_enable_eq(struct hns_roce_dev *hr_dev, int eq_num, int enable_flag) { void __iomem *eqc = hr_dev->eq_table.eqc_base[eq_num]; + __le32 tmp; u32 val; val = readl(eqc); + tmp = cpu_to_le32(val); if (enable_flag) - roce_set_field(val, + roce_set_field(tmp, ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, HNS_ROCE_EQ_STAT_VALID); else - roce_set_field(val, + roce_set_field(tmp, ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, HNS_ROCE_EQ_STAT_INVALID); + + val = le32_to_cpu(tmp); writel(val, eqc); } @@ -4499,6 +4579,9 @@ static int hns_roce_v1_create_eq(struct hns_roce_dev *hr_dev, u32 eqconsindx_val = 0; u32 eqcuridx_val = 0; u32 eqshift_val = 0; + __le32 tmp2 = 0; + __le32 tmp1 = 0; + __le32 tmp = 0; int num_bas; int ret; int i; @@ -4530,14 +4613,13 @@ static int hns_roce_v1_create_eq(struct hns_roce_dev *hr_dev, memset(eq->buf_list[i].buf, 0, HNS_ROCE_BA_SIZE); } eq->cons_index = 0; - roce_set_field(eqshift_val, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, + roce_set_field(tmp, ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, HNS_ROCE_EQ_STAT_INVALID); - roce_set_field(eqshift_val, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_M, + roce_set_field(tmp, ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_M, ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_S, eq->log_entries); + eqshift_val = le32_to_cpu(tmp); writel(eqshift_val, eqc); /* Configure eq extended address 12~44bit */ @@ -4549,18 +4631,18 @@ static int hns_roce_v1_create_eq(struct hns_roce_dev *hr_dev, * using 4K page, and shift more 32 because of * caculating the high 32 bit value evaluated to hardware. 
*/ - roce_set_field(eqcuridx_val, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_M, + roce_set_field(tmp1, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_M, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S, eq->buf_list[0].map >> 44); - roce_set_field(eqcuridx_val, - ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_M, + roce_set_field(tmp1, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_M, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_S, 0); + eqcuridx_val = le32_to_cpu(tmp1); writel(eqcuridx_val, eqc + 8); /* Configure eq consumer index */ - roce_set_field(eqconsindx_val, - ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_M, + roce_set_field(tmp2, ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_M, ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_S, 0); + eqconsindx_val = le32_to_cpu(tmp2); writel(eqconsindx_val, eqc + 0xc); return 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h index e9a2717ea7cd..66440147d9eb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h @@ -260,7 +260,7 @@ struct hns_roce_cqe { __le32 cqe_byte_4; union { __le32 r_key; - __be32 immediate_data; + __le32 immediate_data; }; __le32 byte_cnt; __le32 cqe_byte_16; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index a6e11be0ea0f..0218c0f8c2a7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -36,6 +36,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <net/addrconf.h> +#include <rdma/ib_addr.h> #include <rdma/ib_umem.h> #include "hnae3.h" @@ -53,7 +54,7 @@ static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, dseg->len = cpu_to_le32(sg->length); } -static void set_extend_sge(struct hns_roce_qp *qp, struct ib_send_wr *wr, +static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr, unsigned int *sge_ind) { struct hns_roce_v2_wqe_data_seg *dseg; @@ -100,10 +101,10 @@ static void set_extend_sge(struct hns_roce_qp *qp, struct ib_send_wr *wr, } } -static int set_rwqe_data_seg(struct ib_qp *ibqp, struct ib_send_wr *wr, +static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, void *wqe, unsigned int *sge_ind, - struct ib_send_wr **bad_wr) + const struct ib_send_wr **bad_wr) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_v2_wqe_data_seg *dseg = wqe; @@ -164,23 +165,30 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, struct ib_send_wr *wr, return 0; } -static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, + const struct ib_qp_attr *attr, + int attr_mask, enum ib_qp_state cur_state, + enum ib_qp_state new_state); + +static int hns_roce_v2_post_send(struct ib_qp *ibqp, + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_ah *ah = to_hr_ah(ud_wr(wr)->ah); struct hns_roce_v2_ud_send_wqe *ud_sq_wqe; struct hns_roce_v2_rc_send_wqe *rc_sq_wqe; struct hns_roce_qp *qp = to_hr_qp(ibqp); - struct hns_roce_v2_wqe_data_seg *dseg; struct device *dev = hr_dev->dev; struct hns_roce_v2_db sq_db; + struct ib_qp_attr attr; unsigned int sge_ind = 0; unsigned int owner_bit; unsigned long flags; unsigned int ind; void *wqe = NULL; bool loopback; + int attr_mask; u32 tmp_len; int ret = 0; u8 *smac; @@ -273,7 +281,8 @@ static int hns_roce_v2_post_send(struct ib_qp 
*ibqp, struct ib_send_wr *wr, switch (wr->opcode) { case IB_WR_SEND_WITH_IMM: case IB_WR_RDMA_WRITE_WITH_IMM: - ud_sq_wqe->immtdata = wr->ex.imm_data; + ud_sq_wqe->immtdata = + cpu_to_le32(be32_to_cpu(wr->ex.imm_data)); break; default: ud_sq_wqe->immtdata = 0; @@ -330,14 +339,13 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, roce_set_field(ud_sq_wqe->byte_36, V2_UD_SEND_WQE_BYTE_36_TCLASS_M, V2_UD_SEND_WQE_BYTE_36_TCLASS_S, - 0); - roce_set_field(ud_sq_wqe->byte_36, - V2_UD_SEND_WQE_BYTE_36_TCLASS_M, - V2_UD_SEND_WQE_BYTE_36_TCLASS_S, - 0); + ah->av.sl_tclass_flowlabel >> + HNS_ROCE_TCLASS_SHIFT); roce_set_field(ud_sq_wqe->byte_40, V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M, - V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S, 0); + V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S, + ah->av.sl_tclass_flowlabel & + HNS_ROCE_FLOW_LABEL_MASK); roce_set_field(ud_sq_wqe->byte_40, V2_UD_SEND_WQE_BYTE_40_SL_M, V2_UD_SEND_WQE_BYTE_40_SL_S, @@ -371,7 +379,8 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, switch (wr->opcode) { case IB_WR_SEND_WITH_IMM: case IB_WR_RDMA_WRITE_WITH_IMM: - rc_sq_wqe->immtdata = wr->ex.imm_data; + rc_sq_wqe->immtdata = + cpu_to_le32(be32_to_cpu(wr->ex.imm_data)); break; case IB_WR_SEND_WITH_INV: rc_sq_wqe->inv_key = @@ -485,7 +494,6 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } wqe += sizeof(struct hns_roce_v2_rc_send_wqe); - dseg = wqe; ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe, &sge_ind, bad_wr); @@ -523,6 +531,19 @@ out: qp->sq_next_wqe = ind; qp->next_sge = sge_ind; + + if (qp->state == IB_QPS_ERR) { + attr_mask = IB_QP_STATE; + attr.qp_state = IB_QPS_ERR; + + ret = hns_roce_v2_modify_qp(&qp->ibqp, &attr, attr_mask, + qp->state, IB_QPS_ERR); + if (ret) { + spin_unlock_irqrestore(&qp->sq.lock, flags); + *bad_wr = wr; + return ret; + } + } } spin_unlock_irqrestore(&qp->sq.lock, flags); @@ -530,16 +551,19 @@ out: return ret; } -static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +static int hns_roce_v2_post_recv(struct ib_qp *ibqp, + const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct hns_roce_v2_wqe_data_seg *dseg; struct hns_roce_rinl_sge *sge_list; struct device *dev = hr_dev->dev; + struct ib_qp_attr attr; unsigned long flags; void *wqe = NULL; + int attr_mask; int ret = 0; int nreq; int ind; @@ -608,6 +632,20 @@ out: wmb(); *hr_qp->rdb.db_record = hr_qp->rq.head & 0xffff; + + if (hr_qp->state == IB_QPS_ERR) { + attr_mask = IB_QP_STATE; + attr.qp_state = IB_QPS_ERR; + + ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, &attr, + attr_mask, hr_qp->state, + IB_QPS_ERR); + if (ret) { + spin_unlock_irqrestore(&hr_qp->rq.lock, flags); + *bad_wr = wr; + return ret; + } + } } spin_unlock_irqrestore(&hr_qp->rq.lock, flags); @@ -702,8 +740,8 @@ static int hns_roce_v2_cmq_init(struct hns_roce_dev *hr_dev) int ret; /* Setup the queue entries for command queue */ - priv->cmq.csq.desc_num = 1024; - priv->cmq.crq.desc_num = 1024; + priv->cmq.csq.desc_num = CMD_CSQ_DESC_NUM; + priv->cmq.crq.desc_num = CMD_CRQ_DESC_NUM; /* Setup the lock for command queue */ spin_lock_init(&priv->cmq.csq.lock); @@ -925,7 +963,8 @@ static int hns_roce_config_global_param(struct hns_roce_dev *hr_dev) static int hns_roce_query_pf_resource(struct hns_roce_dev *hr_dev) { struct hns_roce_cmq_desc desc[2]; - struct hns_roce_pf_res *res; + struct hns_roce_pf_res_a *req_a; + 
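Immediate data is handed to the driver as a big-endian __be32 (it travels on the wire unmodified), while the hns descriptors store it as __le32, hence the cpu_to_le32(be32_to_cpu(...)) on the post path above and the reverse cpu_to_be32(le32_to_cpu(...)) when a completion is reported. A small userspace sketch of the two conversions:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t imm_host = 0x12345678;

	/* What the verbs consumer hands in: a big-endian immediate. */
	uint32_t imm_be = htobe32(imm_host);

	/* Post path: __be32 -> CPU order -> __le32 for the hardware WQE. */
	uint32_t wqe_immtdata = htole32(be32toh(imm_be));

	/* Completion path: __le32 CQE field -> CPU order -> __be32 for ib_wc. */
	uint32_t wc_imm = htobe32(le32toh(wqe_immtdata));

	printf("round trip ok: %s\n", wc_imm == imm_be ? "yes" : "no");
	return 0;
}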
struct hns_roce_pf_res_b *req_b; int ret; int i; @@ -943,21 +982,26 @@ static int hns_roce_query_pf_resource(struct hns_roce_dev *hr_dev) if (ret) return ret; - res = (struct hns_roce_pf_res *)desc[0].data; + req_a = (struct hns_roce_pf_res_a *)desc[0].data; + req_b = (struct hns_roce_pf_res_b *)desc[1].data; - hr_dev->caps.qpc_bt_num = roce_get_field(res->qpc_bt_idx_num, + hr_dev->caps.qpc_bt_num = roce_get_field(req_a->qpc_bt_idx_num, PF_RES_DATA_1_PF_QPC_BT_NUM_M, PF_RES_DATA_1_PF_QPC_BT_NUM_S); - hr_dev->caps.srqc_bt_num = roce_get_field(res->srqc_bt_idx_num, + hr_dev->caps.srqc_bt_num = roce_get_field(req_a->srqc_bt_idx_num, PF_RES_DATA_2_PF_SRQC_BT_NUM_M, PF_RES_DATA_2_PF_SRQC_BT_NUM_S); - hr_dev->caps.cqc_bt_num = roce_get_field(res->cqc_bt_idx_num, + hr_dev->caps.cqc_bt_num = roce_get_field(req_a->cqc_bt_idx_num, PF_RES_DATA_3_PF_CQC_BT_NUM_M, PF_RES_DATA_3_PF_CQC_BT_NUM_S); - hr_dev->caps.mpt_bt_num = roce_get_field(res->mpt_bt_idx_num, + hr_dev->caps.mpt_bt_num = roce_get_field(req_a->mpt_bt_idx_num, PF_RES_DATA_4_PF_MPT_BT_NUM_M, PF_RES_DATA_4_PF_MPT_BT_NUM_S); + hr_dev->caps.sl_num = roce_get_field(req_b->qid_idx_sl_num, + PF_RES_DATA_3_PF_SL_NUM_M, + PF_RES_DATA_3_PF_SL_NUM_S); + return 0; } @@ -1203,12 +1247,14 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->eqe_ba_pg_sz = 0; caps->eqe_buf_pg_sz = 0; caps->eqe_hop_num = HNS_ROCE_EQE_HOP_NUM; + caps->tsq_buf_pg_sz = 0; caps->chunk_sz = HNS_ROCE_V2_TABLE_CHUNK_SIZE; caps->flags = HNS_ROCE_CAP_FLAG_REREG_MR | HNS_ROCE_CAP_FLAG_ROCE_V1_V2 | HNS_ROCE_CAP_FLAG_RQ_INLINE | - HNS_ROCE_CAP_FLAG_RECORD_DB; + HNS_ROCE_CAP_FLAG_RECORD_DB | + HNS_ROCE_CAP_FLAG_SQ_RECORD_DB; caps->pkey_table_len[0] = 1; caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM; caps->ceqe_depth = HNS_ROCE_V2_COMP_EQE_NUM; @@ -1224,6 +1270,228 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) return ret; } +static int hns_roce_config_link_table(struct hns_roce_dev *hr_dev, + enum hns_roce_link_table_type type) +{ + struct hns_roce_cmq_desc desc[2]; + struct hns_roce_cfg_llm_a *req_a = + (struct hns_roce_cfg_llm_a *)desc[0].data; + struct hns_roce_cfg_llm_b *req_b = + (struct hns_roce_cfg_llm_b *)desc[1].data; + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_link_table *link_tbl; + struct hns_roce_link_table_entry *entry; + enum hns_roce_opcode_type opcode; + u32 page_num; + int i; + + switch (type) { + case TSQ_LINK_TABLE: + link_tbl = &priv->tsq; + opcode = HNS_ROCE_OPC_CFG_EXT_LLM; + break; + case TPQ_LINK_TABLE: + link_tbl = &priv->tpq; + opcode = HNS_ROCE_OPC_CFG_TMOUT_LLM; + break; + default: + return -EINVAL; + } + + page_num = link_tbl->npages; + entry = link_tbl->table.buf; + memset(req_a, 0, sizeof(*req_a)); + memset(req_b, 0, sizeof(*req_b)); + + for (i = 0; i < 2; i++) { + hns_roce_cmq_setup_basic_desc(&desc[i], opcode, false); + + if (i == 0) + desc[i].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT); + else + desc[i].flag &= ~cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT); + + if (i == 0) { + req_a->base_addr_l = link_tbl->table.map & 0xffffffff; + req_a->base_addr_h = (link_tbl->table.map >> 32) & + 0xffffffff; + roce_set_field(req_a->depth_pgsz_init_en, + CFG_LLM_QUE_DEPTH_M, + CFG_LLM_QUE_DEPTH_S, + link_tbl->npages); + roce_set_field(req_a->depth_pgsz_init_en, + CFG_LLM_QUE_PGSZ_M, + CFG_LLM_QUE_PGSZ_S, + link_tbl->pg_sz); + req_a->head_ba_l = entry[0].blk_ba0; + req_a->head_ba_h_nxtptr = entry[0].blk_ba1_nxt_ptr; + roce_set_field(req_a->head_ptr, + CFG_LLM_HEAD_PTR_M, + CFG_LLM_HEAD_PTR_S, 0); + } else { + 
req_b->tail_ba_l = entry[page_num - 1].blk_ba0; + roce_set_field(req_b->tail_ba_h, + CFG_LLM_TAIL_BA_H_M, + CFG_LLM_TAIL_BA_H_S, + entry[page_num - 1].blk_ba1_nxt_ptr & + HNS_ROCE_LINK_TABLE_BA1_M); + roce_set_field(req_b->tail_ptr, + CFG_LLM_TAIL_PTR_M, + CFG_LLM_TAIL_PTR_S, + (entry[page_num - 2].blk_ba1_nxt_ptr & + HNS_ROCE_LINK_TABLE_NXT_PTR_M) >> + HNS_ROCE_LINK_TABLE_NXT_PTR_S); + } + } + roce_set_field(req_a->depth_pgsz_init_en, + CFG_LLM_INIT_EN_M, CFG_LLM_INIT_EN_S, 1); + + return hns_roce_cmq_send(hr_dev, desc, 2); +} + +static int hns_roce_init_link_table(struct hns_roce_dev *hr_dev, + enum hns_roce_link_table_type type) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + struct hns_roce_link_table *link_tbl; + struct hns_roce_link_table_entry *entry; + struct device *dev = hr_dev->dev; + u32 buf_chk_sz; + dma_addr_t t; + int func_num = 1; + int pg_num_a; + int pg_num_b; + int pg_num; + int size; + int i; + + switch (type) { + case TSQ_LINK_TABLE: + link_tbl = &priv->tsq; + buf_chk_sz = 1 << (hr_dev->caps.tsq_buf_pg_sz + PAGE_SHIFT); + pg_num_a = hr_dev->caps.num_qps * 8 / buf_chk_sz; + pg_num_b = hr_dev->caps.sl_num * 4 + 2; + break; + case TPQ_LINK_TABLE: + link_tbl = &priv->tpq; + buf_chk_sz = 1 << (hr_dev->caps.tpq_buf_pg_sz + PAGE_SHIFT); + pg_num_a = hr_dev->caps.num_cqs * 4 / buf_chk_sz; + pg_num_b = 2 * 4 * func_num + 2; + break; + default: + return -EINVAL; + } + + pg_num = max(pg_num_a, pg_num_b); + size = pg_num * sizeof(struct hns_roce_link_table_entry); + + link_tbl->table.buf = dma_alloc_coherent(dev, size, + &link_tbl->table.map, + GFP_KERNEL); + if (!link_tbl->table.buf) + goto out; + + link_tbl->pg_list = kcalloc(pg_num, sizeof(*link_tbl->pg_list), + GFP_KERNEL); + if (!link_tbl->pg_list) + goto err_kcalloc_failed; + + entry = link_tbl->table.buf; + for (i = 0; i < pg_num; ++i) { + link_tbl->pg_list[i].buf = dma_alloc_coherent(dev, buf_chk_sz, + &t, GFP_KERNEL); + if (!link_tbl->pg_list[i].buf) + goto err_alloc_buf_failed; + + link_tbl->pg_list[i].map = t; + memset(link_tbl->pg_list[i].buf, 0, buf_chk_sz); + + entry[i].blk_ba0 = (t >> 12) & 0xffffffff; + roce_set_field(entry[i].blk_ba1_nxt_ptr, + HNS_ROCE_LINK_TABLE_BA1_M, + HNS_ROCE_LINK_TABLE_BA1_S, + t >> 44); + + if (i < (pg_num - 1)) + roce_set_field(entry[i].blk_ba1_nxt_ptr, + HNS_ROCE_LINK_TABLE_NXT_PTR_M, + HNS_ROCE_LINK_TABLE_NXT_PTR_S, + i + 1); + } + link_tbl->npages = pg_num; + link_tbl->pg_sz = buf_chk_sz; + + return hns_roce_config_link_table(hr_dev, type); + +err_alloc_buf_failed: + for (i -= 1; i >= 0; i--) + dma_free_coherent(dev, buf_chk_sz, + link_tbl->pg_list[i].buf, + link_tbl->pg_list[i].map); + kfree(link_tbl->pg_list); + +err_kcalloc_failed: + dma_free_coherent(dev, size, link_tbl->table.buf, + link_tbl->table.map); + +out: + return -ENOMEM; +} + +static void hns_roce_free_link_table(struct hns_roce_dev *hr_dev, + struct hns_roce_link_table *link_tbl) +{ + struct device *dev = hr_dev->dev; + int size; + int i; + + size = link_tbl->npages * sizeof(struct hns_roce_link_table_entry); + + for (i = 0; i < link_tbl->npages; ++i) + if (link_tbl->pg_list[i].buf) + dma_free_coherent(dev, link_tbl->pg_sz, + link_tbl->pg_list[i].buf, + link_tbl->pg_list[i].map); + kfree(link_tbl->pg_list); + + dma_free_coherent(dev, size, link_tbl->table.buf, + link_tbl->table.map); +} + +static int hns_roce_v2_init(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + int ret; + + /* TSQ includes SQ doorbell and ack doorbell */ + ret = hns_roce_init_link_table(hr_dev, TSQ_LINK_TABLE); + 
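Each link-table entry above records one DMA page of the TSQ/TPQ buffer: bits 43:12 of the page address go into blk_ba0, bits 63:44 into the low 20 bits of blk_ba1_nxt_ptr, and the index of the following entry into its top 12 bits (per the HNS_ROCE_LINK_TABLE_* defines later in the header). A sketch of that packing, assuming that field layout; struct link_entry and fill_entry() are illustrative names, not the driver's:

#include <stdint.h>
#include <stdio.h>

struct link_entry {
	uint32_t blk_ba0;		/* DMA address bits 43:12 */
	uint32_t blk_ba1_nxt_ptr;	/* bits 19:0  = DMA address bits 63:44
					 * bits 31:20 = index of the next entry */
};

static void fill_entry(struct link_entry *e, uint64_t dma, uint32_t next_idx,
		       int has_next)
{
	e->blk_ba0 = (uint32_t)(dma >> 12);
	e->blk_ba1_nxt_ptr = (uint32_t)((dma >> 44) & 0xfffff);
	if (has_next)
		e->blk_ba1_nxt_ptr |= (next_idx & 0xfff) << 20;
}

int main(void)
{
	struct link_entry e;

	fill_entry(&e, 0x0000123456789000ULL, 1, 1);
	printf("ba0=0x%08x ba1_nxt=0x%08x\n", e.blk_ba0, e.blk_ba1_nxt_ptr);
	return 0;
}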
if (ret) { + dev_err(hr_dev->dev, "TSQ init failed, ret = %d.\n", ret); + return ret; + } + + ret = hns_roce_init_link_table(hr_dev, TPQ_LINK_TABLE); + if (ret) { + dev_err(hr_dev->dev, "TPQ init failed, ret = %d.\n", ret); + goto err_tpq_init_failed; + } + + return 0; + +err_tpq_init_failed: + hns_roce_free_link_table(hr_dev, &priv->tsq); + + return ret; +} + +static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_v2_priv *priv = hr_dev->priv; + + hns_roce_free_link_table(hr_dev, &priv->tpq); + hns_roce_free_link_table(hr_dev, &priv->tsq); +} + static int hns_roce_v2_cmd_pending(struct hns_roce_dev *hr_dev) { u32 status = readl(hr_dev->reg_base + ROCEE_VF_MB_STATUS_REG); @@ -1307,13 +1575,45 @@ static int hns_roce_v2_chk_mbox(struct hns_roce_dev *hr_dev, return 0; } +static int hns_roce_config_sgid_table(struct hns_roce_dev *hr_dev, + int gid_index, const union ib_gid *gid, + enum hns_roce_sgid_type sgid_type) +{ + struct hns_roce_cmq_desc desc; + struct hns_roce_cfg_sgid_tb *sgid_tb = + (struct hns_roce_cfg_sgid_tb *)desc.data; + u32 *p; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_SGID_TB, false); + + roce_set_field(sgid_tb->table_idx_rsv, + CFG_SGID_TB_TABLE_IDX_M, + CFG_SGID_TB_TABLE_IDX_S, gid_index); + roce_set_field(sgid_tb->vf_sgid_type_rsv, + CFG_SGID_TB_VF_SGID_TYPE_M, + CFG_SGID_TB_VF_SGID_TYPE_S, sgid_type); + + p = (u32 *)&gid->raw[0]; + sgid_tb->vf_sgid_l = cpu_to_le32(*p); + + p = (u32 *)&gid->raw[4]; + sgid_tb->vf_sgid_ml = cpu_to_le32(*p); + + p = (u32 *)&gid->raw[8]; + sgid_tb->vf_sgid_mh = cpu_to_le32(*p); + + p = (u32 *)&gid->raw[0xc]; + sgid_tb->vf_sgid_h = cpu_to_le32(*p); + + return hns_roce_cmq_send(hr_dev, &desc, 1); +} + static int hns_roce_v2_set_gid(struct hns_roce_dev *hr_dev, u8 port, - int gid_index, union ib_gid *gid, + int gid_index, const union ib_gid *gid, const struct ib_gid_attr *attr) { enum hns_roce_sgid_type sgid_type = GID_TYPE_FLAG_ROCE_V1; - u32 *p; - u32 val; + int ret; if (!gid || !attr) return -EINVAL; @@ -1328,49 +1628,37 @@ static int hns_roce_v2_set_gid(struct hns_roce_dev *hr_dev, u8 port, sgid_type = GID_TYPE_FLAG_ROCE_V2_IPV6; } - p = (u32 *)&gid->raw[0]; - roce_raw_write(*p, hr_dev->reg_base + ROCEE_VF_SGID_CFG0_REG + - 0x20 * gid_index); - - p = (u32 *)&gid->raw[4]; - roce_raw_write(*p, hr_dev->reg_base + ROCEE_VF_SGID_CFG1_REG + - 0x20 * gid_index); - - p = (u32 *)&gid->raw[8]; - roce_raw_write(*p, hr_dev->reg_base + ROCEE_VF_SGID_CFG2_REG + - 0x20 * gid_index); - - p = (u32 *)&gid->raw[0xc]; - roce_raw_write(*p, hr_dev->reg_base + ROCEE_VF_SGID_CFG3_REG + - 0x20 * gid_index); - - val = roce_read(hr_dev, ROCEE_VF_SGID_CFG4_REG + 0x20 * gid_index); - roce_set_field(val, ROCEE_VF_SGID_CFG4_SGID_TYPE_M, - ROCEE_VF_SGID_CFG4_SGID_TYPE_S, sgid_type); - - roce_write(hr_dev, ROCEE_VF_SGID_CFG4_REG + 0x20 * gid_index, val); + ret = hns_roce_config_sgid_table(hr_dev, gid_index, gid, sgid_type); + if (ret) + dev_err(hr_dev->dev, "Configure sgid table failed(%d)!\n", ret); - return 0; + return ret; } static int hns_roce_v2_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr) { + struct hns_roce_cmq_desc desc; + struct hns_roce_cfg_smac_tb *smac_tb = + (struct hns_roce_cfg_smac_tb *)desc.data; u16 reg_smac_h; u32 reg_smac_l; - u32 val; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_SMAC_TB, false); reg_smac_l = *(u32 *)(&addr[0]); - roce_raw_write(reg_smac_l, hr_dev->reg_base + ROCEE_VF_SMAC_CFG0_REG + - 0x08 * phy_port); - val = roce_read(hr_dev, ROCEE_VF_SMAC_CFG1_REG + 0x08 * phy_port); + 
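The SGID table is now programmed through a firmware command instead of per-register writes: the 128-bit GID is split into four 32-bit little-endian words (vf_sgid_l/ml/mh/h). A userspace sketch of that packing; struct sgid_words and pack_gid() are illustrative, and memcpy() is used instead of the driver's pointer cast to keep the sketch strictly portable:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct sgid_words {
	uint32_t l, ml, mh, h;	/* four __le32 command words */
};

static void pack_gid(struct sgid_words *w, const uint8_t raw[16])
{
	uint32_t tmp;

	memcpy(&tmp, &raw[0], 4);  w->l  = htole32(tmp);
	memcpy(&tmp, &raw[4], 4);  w->ml = htole32(tmp);
	memcpy(&tmp, &raw[8], 4);  w->mh = htole32(tmp);
	memcpy(&tmp, &raw[12], 4); w->h  = htole32(tmp);
}

int main(void)
{
	/* e.g. an IPv4-mapped RoCE GID, ::ffff:192.0.2.1 */
	uint8_t gid[16] = { 0, 0, 0, 0, 0, 0, 0, 0,
			    0, 0, 0xff, 0xff, 192, 0, 2, 1 };
	struct sgid_words w;

	pack_gid(&w, gid);
	printf("l=%08x ml=%08x mh=%08x h=%08x\n", w.l, w.ml, w.mh, w.h);
	return 0;
}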
reg_smac_h = *(u16 *)(&addr[4]); - reg_smac_h = *(u16 *)(&addr[4]); - roce_set_field(val, ROCEE_VF_SMAC_CFG1_VF_SMAC_H_M, - ROCEE_VF_SMAC_CFG1_VF_SMAC_H_S, reg_smac_h); - roce_write(hr_dev, ROCEE_VF_SMAC_CFG1_REG + 0x08 * phy_port, val); + memset(smac_tb, 0, sizeof(*smac_tb)); + roce_set_field(smac_tb->tb_idx_rsv, + CFG_SMAC_TB_IDX_M, + CFG_SMAC_TB_IDX_S, phy_port); + roce_set_field(smac_tb->vf_smac_h_rsv, + CFG_SMAC_TB_VF_SMAC_H_M, + CFG_SMAC_TB_VF_SMAC_H_S, reg_smac_h); + smac_tb->vf_smac_l = reg_smac_l; - return 0; + return hns_roce_cmq_send(hr_dev, &desc, 1); } static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, @@ -1758,6 +2046,8 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, struct hns_roce_v2_cqe *cqe; struct hns_roce_qp *hr_qp; struct hns_roce_wq *wq; + struct ib_qp_attr attr; + int attr_mask; int is_send; u16 wqe_ctr; u32 opcode; @@ -1844,8 +2134,17 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, break; } - /* CQE status error, directly return */ - if (wc->status != IB_WC_SUCCESS) + /* flush cqe if wc status is error, excluding flush error */ + if ((wc->status != IB_WC_SUCCESS) && + (wc->status != IB_WC_WR_FLUSH_ERR)) { + attr_mask = IB_QP_STATE; + attr.qp_state = IB_QPS_ERR; + return hns_roce_v2_modify_qp(&(*cur_qp)->ibqp, + &attr, attr_mask, + (*cur_qp)->state, IB_QPS_ERR); + } + + if (wc->status == IB_WC_WR_FLUSH_ERR) return 0; if (is_send) { @@ -1931,7 +2230,8 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, case HNS_ROCE_V2_OPCODE_RDMA_WRITE_IMM: wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = cqe->immtdata; + wc->ex.imm_data = + cpu_to_be32(le32_to_cpu(cqe->immtdata)); break; case HNS_ROCE_V2_OPCODE_SEND: wc->opcode = IB_WC_RECV; @@ -1940,7 +2240,8 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, case HNS_ROCE_V2_OPCODE_SEND_WITH_IMM: wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = cqe->immtdata; + wc->ex.imm_data = + cpu_to_be32(le32_to_cpu(cqe->immtdata)); break; case HNS_ROCE_V2_OPCODE_SEND_WITH_INV: wc->opcode = IB_WC_RECV; @@ -2273,10 +2574,10 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0); /* No VLAN need to set 0xFFF */ - roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_IDX_M, - V2_QPC_BYTE_24_VLAN_IDX_S, 0xfff); - roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_IDX_M, - V2_QPC_BYTE_24_VLAN_IDX_S, 0); + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M, + V2_QPC_BYTE_24_VLAN_ID_S, 0xfff); + roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M, + V2_QPC_BYTE_24_VLAN_ID_S, 0); /* * Set some fields in context to zero, Because the default values @@ -2886,21 +3187,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M, V2_QPC_BYTE_56_LP_PKTN_INI_S, 0); - roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M, - V2_QPC_BYTE_24_HOP_LIMIT_S, grh->hop_limit); - roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M, - V2_QPC_BYTE_24_HOP_LIMIT_S, 0); - - roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_FL_M, - V2_QPC_BYTE_28_FL_S, grh->flow_label); - roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_FL_M, - V2_QPC_BYTE_28_FL_S, 0); - - roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M, - V2_QPC_BYTE_24_TC_S, grh->traffic_class); - roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M, - 
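The poll path above now distinguishes three cases instead of returning early on any error: a genuine error completion moves the QP to the error state (so the remaining WQEs get flushed), a completion that is already a flush error is consumed silently, and a successful completion is reported as before. A compact sketch of just that decision; the enum values are placeholders, not the IB_WC_* codes:

#include <stdio.h>

enum wc_status { WC_SUCCESS, WC_WR_FLUSH_ERR, WC_OTHER_ERR };
enum action    { REPORT_CQE, DROP_CQE, MOVE_QP_TO_ERR };

/* Mirrors the decision in hns_roce_v2_poll_one(): a real error triggers the
 * QP state change, flush errors are dropped, success is reported normally. */
static enum action classify(enum wc_status status)
{
	if (status != WC_SUCCESS && status != WC_WR_FLUSH_ERR)
		return MOVE_QP_TO_ERR;
	if (status == WC_WR_FLUSH_ERR)
		return DROP_CQE;
	return REPORT_CQE;
}

int main(void)
{
	printf("%d %d %d\n", classify(WC_SUCCESS), classify(WC_WR_FLUSH_ERR),
	       classify(WC_OTHER_ERR));
	return 0;
}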
V2_QPC_BYTE_24_TC_S, 0); - if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_UD) roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, V2_QPC_BYTE_24_MTU_S, IB_MTU_4096); @@ -2911,9 +3197,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, V2_QPC_BYTE_24_MTU_S, 0); - memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw)); - memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw)); - roce_set_field(context->byte_84_rq_ci_pi, V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M, V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, hr_qp->rq.head); @@ -2952,12 +3235,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, V2_QPC_BYTE_168_LP_SGEN_INI_M, V2_QPC_BYTE_168_LP_SGEN_INI_S, 0); - roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, - V2_QPC_BYTE_28_SL_S, rdma_ah_get_sl(&attr->ah_attr)); - roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, - V2_QPC_BYTE_28_SL_S, 0); - hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); - return 0; } @@ -3135,13 +3412,6 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, V2_QPC_BYTE_28_AT_S, 0); } - roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, - V2_QPC_BYTE_28_SL_S, - rdma_ah_get_sl(&attr->ah_attr)); - roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, - V2_QPC_BYTE_28_SL_S, 0); - hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); - roce_set_field(context->byte_172_sq_psn, V2_QPC_BYTE_172_SQ_CUR_PSN_M, V2_QPC_BYTE_172_SQ_CUR_PSN_S, attr->sq_psn); roce_set_field(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_SQ_CUR_PSN_M, @@ -3224,9 +3494,114 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, ; } else { dev_err(dev, "Illegal state for QP!\n"); + ret = -EINVAL; goto out; } + /* When QP state is err, SQ and RQ WQE should be flushed */ + if (new_state == IB_QPS_ERR) { + roce_set_field(context->byte_160_sq_ci_pi, + V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M, + V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, + hr_qp->sq.head); + roce_set_field(qpc_mask->byte_160_sq_ci_pi, + V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M, + V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0); + roce_set_field(context->byte_84_rq_ci_pi, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, + hr_qp->rq.head); + roce_set_field(qpc_mask->byte_84_rq_ci_pi, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M, + V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0); + } + + if (attr_mask & IB_QP_AV) { + const struct ib_global_route *grh = + rdma_ah_read_grh(&attr->ah_attr); + const struct ib_gid_attr *gid_attr = NULL; + u8 src_mac[ETH_ALEN]; + int is_roce_protocol; + u16 vlan = 0xffff; + u8 ib_port; + u8 hr_port; + + ib_port = (attr_mask & IB_QP_PORT) ? attr->port_num : + hr_qp->port + 1; + hr_port = ib_port - 1; + is_roce_protocol = rdma_cap_eth_ah(&hr_dev->ib_dev, ib_port) && + rdma_ah_get_ah_flags(&attr->ah_attr) & IB_AH_GRH; + + if (is_roce_protocol) { + gid_attr = attr->ah_attr.grh.sgid_attr; + vlan = rdma_vlan_dev_vlan_id(gid_attr->ndev); + memcpy(src_mac, gid_attr->ndev->dev_addr, ETH_ALEN); + } + + roce_set_field(context->byte_24_mtu_tc, + V2_QPC_BYTE_24_VLAN_ID_M, + V2_QPC_BYTE_24_VLAN_ID_S, vlan); + roce_set_field(qpc_mask->byte_24_mtu_tc, + V2_QPC_BYTE_24_VLAN_ID_M, + V2_QPC_BYTE_24_VLAN_ID_S, 0); + + if (grh->sgid_index >= hr_dev->caps.gid_table_len[hr_port]) { + dev_err(hr_dev->dev, + "sgid_index(%u) too large. 
max is %d\n", + grh->sgid_index, + hr_dev->caps.gid_table_len[hr_port]); + ret = -EINVAL; + goto out; + } + + if (attr->ah_attr.type != RDMA_AH_ATTR_TYPE_ROCE) { + dev_err(hr_dev->dev, "ah attr is not RDMA roce type\n"); + ret = -EINVAL; + goto out; + } + + roce_set_field(context->byte_52_udpspn_dmac, + V2_QPC_BYTE_52_UDPSPN_M, V2_QPC_BYTE_52_UDPSPN_S, + (gid_attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) ? + 0 : 0x12b7); + + roce_set_field(qpc_mask->byte_52_udpspn_dmac, + V2_QPC_BYTE_52_UDPSPN_M, + V2_QPC_BYTE_52_UDPSPN_S, 0); + + roce_set_field(context->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_SGID_IDX_M, + V2_QPC_BYTE_20_SGID_IDX_S, grh->sgid_index); + + roce_set_field(qpc_mask->byte_20_smac_sgid_idx, + V2_QPC_BYTE_20_SGID_IDX_M, + V2_QPC_BYTE_20_SGID_IDX_S, 0); + + roce_set_field(context->byte_24_mtu_tc, + V2_QPC_BYTE_24_HOP_LIMIT_M, + V2_QPC_BYTE_24_HOP_LIMIT_S, grh->hop_limit); + roce_set_field(qpc_mask->byte_24_mtu_tc, + V2_QPC_BYTE_24_HOP_LIMIT_M, + V2_QPC_BYTE_24_HOP_LIMIT_S, 0); + + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M, + V2_QPC_BYTE_24_TC_S, grh->traffic_class); + roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M, + V2_QPC_BYTE_24_TC_S, 0); + roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_FL_M, + V2_QPC_BYTE_28_FL_S, grh->flow_label); + roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_FL_M, + V2_QPC_BYTE_28_FL_S, 0); + memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw)); + memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw)); + roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, + V2_QPC_BYTE_28_SL_S, + rdma_ah_get_sl(&attr->ah_attr)); + roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, + V2_QPC_BYTE_28_SL_S, 0); + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); + } + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask); @@ -3497,6 +3872,11 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt); if (is_user) { + if (hr_qp->sq.wqe_cnt && (hr_qp->sdb_en == 1)) + hns_roce_db_unmap_user( + to_hr_ucontext(hr_qp->ibqp.uobject->context), + &hr_qp->sdb); + if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1)) hns_roce_db_unmap_user( to_hr_ucontext(hr_qp->ibqp.uobject->context), @@ -3579,6 +3959,74 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) return ret; } +static void hns_roce_set_qps_to_err(struct hns_roce_dev *hr_dev, u32 qpn) +{ + struct hns_roce_qp *hr_qp; + struct ib_qp_attr attr; + int attr_mask; + int ret; + + hr_qp = __hns_roce_qp_lookup(hr_dev, qpn); + if (!hr_qp) { + dev_warn(hr_dev->dev, "no hr_qp can be found!\n"); + return; + } + + if (hr_qp->ibqp.uobject) { + if (hr_qp->sdb_en == 1) { + hr_qp->sq.head = *(int *)(hr_qp->sdb.virt_addr); + hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr); + } else { + dev_warn(hr_dev->dev, "flush cqe is unsupported in userspace!\n"); + return; + } + } + + attr_mask = IB_QP_STATE; + attr.qp_state = IB_QPS_ERR; + ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, &attr, attr_mask, + hr_qp->state, IB_QPS_ERR); + if (ret) + dev_err(hr_dev->dev, "failed to modify qp %d to err state.\n", + qpn); +} + +static void hns_roce_irq_work_handle(struct work_struct *work) +{ + struct hns_roce_work *irq_work = + container_of(work, struct hns_roce_work, work); + u32 qpn = irq_work->qpn; + + switch (irq_work->event_type) { + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + case 
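The magic 0x12b7 written into the UDPSPN field is 4791, the IANA-assigned RoCEv2 UDP port; it is only programmed when the selected GID is a RoCE v2 (UDP-encapsulated) one, otherwise the field stays 0. A tiny sketch of that selection, with invented names:

#include <stdio.h>

enum gid_type { GID_ROCE_V1, GID_ROCE_V2_UDP_ENCAP };

/* RoCE v1 GIDs get 0; RoCE v2 GIDs get the IANA RoCEv2 port, 4791 == 0x12b7,
 * as the QP context's UDP source port. */
static unsigned int udp_sport_for(enum gid_type t)
{
	return (t == GID_ROCE_V2_UDP_ENCAP) ? 0x12b7 : 0;
}

int main(void)
{
	printf("v1: %u, v2: %u (0x%x)\n", udp_sport_for(GID_ROCE_V1),
	       udp_sport_for(GID_ROCE_V2_UDP_ENCAP),
	       udp_sport_for(GID_ROCE_V2_UDP_ENCAP));
	return 0;
}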
HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_set_qps_to_err(irq_work->hr_dev, qpn); + break; + default: + break; + } + + kfree(irq_work); +} + +static void hns_roce_v2_init_irq_work(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq, u32 qpn) +{ + struct hns_roce_work *irq_work; + + irq_work = kzalloc(sizeof(struct hns_roce_work), GFP_ATOMIC); + if (!irq_work) + return; + + INIT_WORK(&(irq_work->work), hns_roce_irq_work_handle); + irq_work->hr_dev = hr_dev; + irq_work->qpn = qpn; + irq_work->event_type = eq->event_type; + irq_work->sub_type = eq->sub_type; + queue_work(hr_dev->irq_workq, &(irq_work->work)); +} + static void set_eq_cons_index_v2(struct hns_roce_eq *eq) { u32 doorbell[2]; @@ -3681,14 +4129,9 @@ static void hns_roce_v2_local_wq_access_err_handle(struct hns_roce_dev *hr_dev, static void hns_roce_v2_qp_err_handle(struct hns_roce_dev *hr_dev, struct hns_roce_aeqe *aeqe, - int event_type) + int event_type, u32 qpn) { struct device *dev = hr_dev->dev; - u32 qpn; - - qpn = roce_get_field(aeqe->event.qp_event.qp, - HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M, - HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S); switch (event_type) { case HNS_ROCE_EVENT_TYPE_COMM_EST: @@ -3715,14 +4158,9 @@ static void hns_roce_v2_qp_err_handle(struct hns_roce_dev *hr_dev, static void hns_roce_v2_cq_err_handle(struct hns_roce_dev *hr_dev, struct hns_roce_aeqe *aeqe, - int event_type) + int event_type, u32 cqn) { struct device *dev = hr_dev->dev; - u32 cqn; - - cqn = roce_get_field(aeqe->event.cq_event.cq, - HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M, - HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S); switch (event_type) { case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: @@ -3787,6 +4225,9 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, struct hns_roce_aeqe *aeqe; int aeqe_found = 0; int event_type; + int sub_type; + u32 qpn; + u32 cqn; while ((aeqe = next_aeqe_sw_v2(eq))) { @@ -3798,6 +4239,15 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, event_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_EVENT_TYPE_M, HNS_ROCE_V2_AEQE_EVENT_TYPE_S); + sub_type = roce_get_field(aeqe->asyn, + HNS_ROCE_V2_AEQE_SUB_TYPE_M, + HNS_ROCE_V2_AEQE_SUB_TYPE_S); + qpn = roce_get_field(aeqe->event.qp_event.qp, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S); + cqn = roce_get_field(aeqe->event.cq_event.cq, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S); switch (event_type) { case HNS_ROCE_EVENT_TYPE_PATH_MIG: @@ -3811,7 +4261,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: - hns_roce_v2_qp_err_handle(hr_dev, aeqe, event_type); + hns_roce_v2_qp_err_handle(hr_dev, aeqe, event_type, + qpn); break; case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: @@ -3820,7 +4271,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, break; case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - hns_roce_v2_cq_err_handle(hr_dev, aeqe, event_type); + hns_roce_v2_cq_err_handle(hr_dev, aeqe, event_type, + cqn); break; case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: dev_warn(dev, "DB overflow.\n"); @@ -3843,6 +4295,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, break; }; + eq->event_type = event_type; + eq->sub_type = sub_type; ++eq->cons_index; aeqe_found = 1; @@ -3850,6 +4304,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, dev_warn(dev, 
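Moving a QP to the error state requires a mailbox/command exchange that can sleep, so the asynchronous-event interrupt path above allocates a work item with GFP_ATOMIC and defers the actual handling to the new single-threaded workqueue. A stripped-down kernel-style sketch of that deferral pattern, with invented demo_* names rather than the driver's structures:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/workqueue.h>

struct demo_event_work {
	struct work_struct work;
	u32 qpn;			/* payload copied out of the EQE */
};

static void demo_event_work_fn(struct work_struct *work)
{
	struct demo_event_work *w =
		container_of(work, struct demo_event_work, work);

	/* sleepable context: safe to issue mailbox/CMQ commands here */
	pr_info("handling async event for QPN %u\n", w->qpn);
	kfree(w);
}

/* Called from the (atomic) EQ interrupt handler. */
static void demo_queue_event(struct workqueue_struct *wq, u32 qpn)
{
	struct demo_event_work *w = kzalloc(sizeof(*w), GFP_ATOMIC);

	if (!w)
		return;

	INIT_WORK(&w->work, demo_event_work_fn);
	w->qpn = qpn;
	queue_work(wq, &w->work);
}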
"cons_index overflow, set back to 0.\n"); eq->cons_index = 0; } + hns_roce_v2_init_irq_work(hr_dev, eq, qpn); } set_eq_cons_index_v2(eq); @@ -4052,15 +4507,12 @@ static void hns_roce_mhop_free_eq(struct hns_roce_dev *hr_dev, u32 bt_chk_sz; u32 mhop_num; int eqe_alloc; - int ba_num; int i = 0; int j = 0; mhop_num = hr_dev->caps.eqe_hop_num; buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT); bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT); - ba_num = (PAGE_ALIGN(eq->entries * eq->eqe_size) + buf_chk_sz - 1) / - buf_chk_sz; /* hop_num = 0 */ if (mhop_num == HNS_ROCE_HOP_NUM_0) { @@ -4669,6 +5121,13 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) } } + hr_dev->irq_workq = + create_singlethread_workqueue("hns_roce_irq_workqueue"); + if (!hr_dev->irq_workq) { + dev_err(dev, "Create irq workqueue failed!\n"); + goto err_request_irq_fail; + } + return 0; err_request_irq_fail: @@ -4719,12 +5178,17 @@ static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev) kfree(hr_dev->irq_names[i]); kfree(eq_table->eq); + + flush_workqueue(hr_dev->irq_workq); + destroy_workqueue(hr_dev->irq_workq); } static const struct hns_roce_hw hns_roce_hw_v2 = { .cmq_init = hns_roce_v2_cmq_init, .cmq_exit = hns_roce_v2_cmq_exit, .hw_profile = hns_roce_v2_profile, + .hw_init = hns_roce_v2_init, + .hw_exit = hns_roce_v2_exit, .post_mbox = hns_roce_v2_post_mbox, .chk_mbox = hns_roce_v2_chk_mbox, .set_gid = hns_roce_v2_set_gid, @@ -4749,6 +5213,8 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = { {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_25GE_RDMA_MACSEC), 0}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA), 0}, + {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_50GE_RDMA_MACSEC), 0}, {PCI_VDEVICE(HUAWEI, HNAE3_DEV_ID_100G_RDMA_MACSEC), 0}, /* required last entry */ {0, } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index d47675f365c7..14aa308befef 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -112,6 +112,9 @@ (step_idx == 1 && hop_num == 1) || \ (step_idx == 2 && hop_num == 2)) +#define CMD_CSQ_DESC_NUM 1024 +#define CMD_CRQ_DESC_NUM 1024 + enum { NO_ARMED = 0x0, REG_NXT_CEQE = 0x2, @@ -203,6 +206,10 @@ enum hns_roce_opcode_type { HNS_ROCE_OPC_ALLOC_PF_RES = 0x8004, HNS_ROCE_OPC_QUERY_PF_RES = 0x8400, HNS_ROCE_OPC_ALLOC_VF_RES = 0x8401, + HNS_ROCE_OPC_CFG_EXT_LLM = 0x8403, + HNS_ROCE_OPC_CFG_TMOUT_LLM = 0x8404, + HNS_ROCE_OPC_CFG_SGID_TB = 0x8500, + HNS_ROCE_OPC_CFG_SMAC_TB = 0x8501, HNS_ROCE_OPC_CFG_BT_ATTR = 0x8506, }; @@ -447,8 +454,8 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_24_TC_S 8 #define V2_QPC_BYTE_24_TC_M GENMASK(15, 8) -#define V2_QPC_BYTE_24_VLAN_IDX_S 16 -#define V2_QPC_BYTE_24_VLAN_IDX_M GENMASK(27, 16) +#define V2_QPC_BYTE_24_VLAN_ID_S 16 +#define V2_QPC_BYTE_24_VLAN_ID_M GENMASK(27, 16) #define V2_QPC_BYTE_24_MTU_S 28 #define V2_QPC_BYTE_24_MTU_M GENMASK(31, 28) @@ -768,7 +775,7 @@ struct hns_roce_v2_cqe { __le32 byte_4; union { __le32 rkey; - __be32 immtdata; + __le32 immtdata; }; __le32 byte_12; __le32 byte_16; @@ -926,7 +933,7 @@ struct hns_roce_v2_cq_db { struct hns_roce_v2_ud_send_wqe { __le32 byte_4; __le32 msg_len; - __be32 immtdata; + __le32 immtdata; __le32 byte_16; __le32 byte_20; __le32 byte_24; @@ -1012,7 +1019,7 @@ struct hns_roce_v2_rc_send_wqe { __le32 msg_len; union { __le32 inv_key; - __be32 immtdata; + __le32 immtdata; }; 
__le32 byte_16; __le32 byte_20; @@ -1061,6 +1068,40 @@ struct hns_roce_query_version { __le32 rsv[5]; }; +struct hns_roce_cfg_llm_a { + __le32 base_addr_l; + __le32 base_addr_h; + __le32 depth_pgsz_init_en; + __le32 head_ba_l; + __le32 head_ba_h_nxtptr; + __le32 head_ptr; +}; + +#define CFG_LLM_QUE_DEPTH_S 0 +#define CFG_LLM_QUE_DEPTH_M GENMASK(12, 0) + +#define CFG_LLM_QUE_PGSZ_S 16 +#define CFG_LLM_QUE_PGSZ_M GENMASK(19, 16) + +#define CFG_LLM_INIT_EN_S 20 +#define CFG_LLM_INIT_EN_M GENMASK(20, 20) + +#define CFG_LLM_HEAD_PTR_S 0 +#define CFG_LLM_HEAD_PTR_M GENMASK(11, 0) + +struct hns_roce_cfg_llm_b { + __le32 tail_ba_l; + __le32 tail_ba_h; + __le32 tail_ptr; + __le32 rsv[3]; +}; + +#define CFG_LLM_TAIL_BA_H_S 0 +#define CFG_LLM_TAIL_BA_H_M GENMASK(19, 0) + +#define CFG_LLM_TAIL_PTR_S 0 +#define CFG_LLM_TAIL_PTR_M GENMASK(11, 0) + struct hns_roce_cfg_global_param { __le32 time_cfg_udp_port; __le32 rsv[5]; @@ -1072,7 +1113,7 @@ struct hns_roce_cfg_global_param { #define CFG_GLOBAL_PARAM_DATA_0_ROCEE_UDP_PORT_S 16 #define CFG_GLOBAL_PARAM_DATA_0_ROCEE_UDP_PORT_M GENMASK(31, 16) -struct hns_roce_pf_res { +struct hns_roce_pf_res_a { __le32 rsv; __le32 qpc_bt_idx_num; __le32 srqc_bt_idx_num; @@ -1111,6 +1152,32 @@ struct hns_roce_pf_res { #define PF_RES_DATA_5_PF_EQC_BT_NUM_S 16 #define PF_RES_DATA_5_PF_EQC_BT_NUM_M GENMASK(25, 16) +struct hns_roce_pf_res_b { + __le32 rsv0; + __le32 smac_idx_num; + __le32 sgid_idx_num; + __le32 qid_idx_sl_num; + __le32 rsv[2]; +}; + +#define PF_RES_DATA_1_PF_SMAC_IDX_S 0 +#define PF_RES_DATA_1_PF_SMAC_IDX_M GENMASK(7, 0) + +#define PF_RES_DATA_1_PF_SMAC_NUM_S 8 +#define PF_RES_DATA_1_PF_SMAC_NUM_M GENMASK(16, 8) + +#define PF_RES_DATA_2_PF_SGID_IDX_S 0 +#define PF_RES_DATA_2_PF_SGID_IDX_M GENMASK(7, 0) + +#define PF_RES_DATA_2_PF_SGID_NUM_S 8 +#define PF_RES_DATA_2_PF_SGID_NUM_M GENMASK(16, 8) + +#define PF_RES_DATA_3_PF_QID_IDX_S 0 +#define PF_RES_DATA_3_PF_QID_IDX_M GENMASK(9, 0) + +#define PF_RES_DATA_3_PF_SL_NUM_S 16 +#define PF_RES_DATA_3_PF_SL_NUM_M GENMASK(26, 16) + struct hns_roce_vf_res_a { __le32 vf_id; __le32 vf_qpc_bt_idx_num; @@ -1179,13 +1246,6 @@ struct hns_roce_vf_res_b { #define VF_RES_B_DATA_3_VF_SL_NUM_S 16 #define VF_RES_B_DATA_3_VF_SL_NUM_M GENMASK(19, 16) -/* Reg field definition */ -#define ROCEE_VF_SMAC_CFG1_VF_SMAC_H_S 0 -#define ROCEE_VF_SMAC_CFG1_VF_SMAC_H_M GENMASK(15, 0) - -#define ROCEE_VF_SGID_CFG4_SGID_TYPE_S 0 -#define ROCEE_VF_SGID_CFG4_SGID_TYPE_M GENMASK(1, 0) - struct hns_roce_cfg_bt_attr { __le32 vf_qpc_cfg; __le32 vf_srqc_cfg; @@ -1230,6 +1290,32 @@ struct hns_roce_cfg_bt_attr { #define CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_S 8 #define CFG_BT_ATTR_DATA_3_VF_MPT_HOPNUM_M GENMASK(9, 8) +struct hns_roce_cfg_sgid_tb { + __le32 table_idx_rsv; + __le32 vf_sgid_l; + __le32 vf_sgid_ml; + __le32 vf_sgid_mh; + __le32 vf_sgid_h; + __le32 vf_sgid_type_rsv; +}; +#define CFG_SGID_TB_TABLE_IDX_S 0 +#define CFG_SGID_TB_TABLE_IDX_M GENMASK(7, 0) + +#define CFG_SGID_TB_VF_SGID_TYPE_S 0 +#define CFG_SGID_TB_VF_SGID_TYPE_M GENMASK(1, 0) + +struct hns_roce_cfg_smac_tb { + __le32 tb_idx_rsv; + __le32 vf_smac_l; + __le32 vf_smac_h_rsv; + __le32 rsv[3]; +}; +#define CFG_SMAC_TB_IDX_S 0 +#define CFG_SMAC_TB_IDX_M GENMASK(7, 0) + +#define CFG_SMAC_TB_VF_SMAC_H_S 0 +#define CFG_SMAC_TB_VF_SMAC_H_M GENMASK(15, 0) + struct hns_roce_cmq_desc { __le16 opcode; __le16 flag; @@ -1276,8 +1362,32 @@ struct hns_roce_v2_cmq { u16 last_status; }; +enum hns_roce_link_table_type { + TSQ_LINK_TABLE, + TPQ_LINK_TABLE, +}; + +struct hns_roce_link_table { + struct 
hns_roce_buf_list table; + struct hns_roce_buf_list *pg_list; + u32 npages; + u32 pg_sz; +}; + +struct hns_roce_link_table_entry { + u32 blk_ba0; + u32 blk_ba1_nxt_ptr; +}; +#define HNS_ROCE_LINK_TABLE_BA1_S 0 +#define HNS_ROCE_LINK_TABLE_BA1_M GENMASK(19, 0) + +#define HNS_ROCE_LINK_TABLE_NXT_PTR_S 20 +#define HNS_ROCE_LINK_TABLE_NXT_PTR_M GENMASK(31, 20) + struct hns_roce_v2_priv { struct hns_roce_v2_cmq cmq; + struct hns_roce_link_table tsq; + struct hns_roce_link_table tpq; }; struct hns_roce_eq_context { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 21b901cfa2d6..c5cae9a38c04 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -74,8 +74,7 @@ static int hns_roce_set_mac(struct hns_roce_dev *hr_dev, u8 port, u8 *addr) return hr_dev->hw->set_mac(hr_dev, phy_port, addr); } -static int hns_roce_add_gid(const union ib_gid *gid, - const struct ib_gid_attr *attr, void **context) +static int hns_roce_add_gid(const struct ib_gid_attr *attr, void **context) { struct hns_roce_dev *hr_dev = to_hr_dev(attr->device); u8 port = attr->port_num - 1; @@ -87,8 +86,7 @@ static int hns_roce_add_gid(const union ib_gid *gid, spin_lock_irqsave(&hr_dev->iboe.lock, flags); - ret = hr_dev->hw->set_gid(hr_dev, port, attr->index, - (union ib_gid *)gid, attr); + ret = hr_dev->hw->set_gid(hr_dev, port, attr->index, &attr->gid, attr); spin_unlock_irqrestore(&hr_dev->iboe.lock, flags); @@ -208,7 +206,8 @@ static int hns_roce_query_device(struct ib_device *ib_dev, props->max_qp_wr = hr_dev->caps.max_wqes; props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_RC_RNR_NAK_GEN; - props->max_sge = max(hr_dev->caps.max_sq_sg, hr_dev->caps.max_rq_sg); + props->max_send_sge = hr_dev->caps.max_sq_sg; + props->max_recv_sge = hr_dev->caps.max_rq_sg; props->max_sge_rd = 1; props->max_cq = hr_dev->caps.num_cqs; props->max_cqe = hr_dev->caps.max_cqes; @@ -535,6 +534,9 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) (1ULL << IB_USER_VERBS_CMD_QUERY_QP) | (1ULL << IB_USER_VERBS_CMD_DESTROY_QP); + ib_dev->uverbs_ex_cmd_mask |= + (1ULL << IB_USER_VERBS_EX_CMD_MODIFY_CQ); + /* HCA||device||port */ ib_dev->modify_device = hns_roce_modify_device; ib_dev->query_device = hns_roce_query_device; @@ -887,8 +889,7 @@ error_failed_cmd_init: error_failed_cmq_init: if (hr_dev->hw->reset) { - ret = hr_dev->hw->reset(hr_dev, false); - if (ret) + if (hr_dev->hw->reset(hr_dev, false)) dev_err(dev, "Dereset RoCE engine failed!\n"); } diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c index b9f2c871ff9a..e11c149da04d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_pd.c +++ b/drivers/infiniband/hw/hns/hns_roce_pd.c @@ -37,7 +37,7 @@ static int hns_roce_pd_alloc(struct hns_roce_dev *hr_dev, unsigned long *pdn) { - return hns_roce_bitmap_alloc(&hr_dev->pd_bitmap, pdn); + return hns_roce_bitmap_alloc(&hr_dev->pd_bitmap, pdn) ? 
-ENOMEM : 0; } static void hns_roce_pd_free(struct hns_roce_dev *hr_dev, unsigned long pdn) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index baaf906f7c2e..efb7e961ca65 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -115,7 +115,10 @@ static int hns_roce_reserve_range_qp(struct hns_roce_dev *hr_dev, int cnt, { struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; - return hns_roce_bitmap_alloc_range(&qp_table->bitmap, cnt, align, base); + return hns_roce_bitmap_alloc_range(&qp_table->bitmap, cnt, align, + base) ? + -ENOMEM : + 0; } enum hns_roce_qp_state to_hns_roce_state(enum ib_qp_state state) @@ -489,6 +492,14 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev, return 0; } +static int hns_roce_qp_has_sq(struct ib_qp_init_attr *attr) +{ + if (attr->qp_type == IB_QPT_XRC_TGT) + return 0; + + return 1; +} + static int hns_roce_qp_has_rq(struct ib_qp_init_attr *attr) { if (attr->qp_type == IB_QPT_XRC_INI || @@ -613,6 +624,23 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, goto err_mtt; } + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) && + (udata->inlen >= sizeof(ucmd)) && + (udata->outlen >= sizeof(resp)) && + hns_roce_qp_has_sq(init_attr)) { + ret = hns_roce_db_map_user( + to_hr_ucontext(ib_pd->uobject->context), + ucmd.sdb_addr, &hr_qp->sdb); + if (ret) { + dev_err(dev, "sq record doorbell map failed!\n"); + goto err_mtt; + } + + /* indicate kernel supports sq record db */ + resp.cap_flags |= HNS_ROCE_SUPPORT_SQ_RECORD_DB; + hr_qp->sdb_en = 1; + } + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) && (udata->outlen >= sizeof(resp)) && hns_roce_qp_has_rq(init_attr)) { @@ -621,7 +649,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, ucmd.db_addr, &hr_qp->rdb); if (ret) { dev_err(dev, "rq record doorbell map failed!\n"); - goto err_mtt; + goto err_sq_dbmap; } } } else { @@ -734,7 +762,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, if (ib_pd->uobject && (udata->outlen >= sizeof(resp)) && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)) { - /* indicate kernel supports record db */ + /* indicate kernel supports rq record db */ resp.cap_flags |= HNS_ROCE_SUPPORT_RQ_RECORD_DB; ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (ret) @@ -770,6 +798,16 @@ err_wrid: kfree(hr_qp->rq.wrid); } +err_sq_dbmap: + if (ib_pd->uobject) + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) && + (udata->inlen >= sizeof(ucmd)) && + (udata->outlen >= sizeof(resp)) && + hns_roce_qp_has_sq(init_attr)) + hns_roce_db_unmap_user( + to_hr_ucontext(ib_pd->uobject->context), + &hr_qp->sdb); + err_mtt: hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt); @@ -903,6 +941,17 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_state = attr_mask & IB_QP_STATE ? 
attr->qp_state : cur_state; + if (ibqp->uobject && + (attr_mask & IB_QP_STATE) && new_state == IB_QPS_ERR) { + if (hr_qp->sdb_en == 1) { + hr_qp->sq.head = *(int *)(hr_qp->sdb.virt_addr); + hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr); + } else { + dev_warn(dev, "flush cqe is not supported in userspace!\n"); + goto out; + } + } + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, IB_LINK_LAYER_ETHERNET)) { dev_err(dev, "ib_modify_qp_is_ok failed\n"); diff --git a/drivers/infiniband/hw/i40iw/Kconfig b/drivers/infiniband/hw/i40iw/Kconfig index 2962979c06e9..d867ef1ac72a 100644 --- a/drivers/infiniband/hw/i40iw/Kconfig +++ b/drivers/infiniband/hw/i40iw/Kconfig @@ -1,6 +1,7 @@ config INFINIBAND_I40IW tristate "Intel(R) Ethernet X722 iWARP Driver" depends on INET && I40E + depends on IPV6 || !IPV6 depends on PCI select GENERIC_ALLOCATOR ---help--- diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 7b2655128b9f..423818a7d333 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -57,6 +57,7 @@ #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/ip_fib.h> +#include <net/secure_seq.h> #include <net/tcp.h> #include <asm/checksum.h> @@ -2164,7 +2165,6 @@ static struct i40iw_cm_node *i40iw_make_cm_node( struct i40iw_cm_listener *listener) { struct i40iw_cm_node *cm_node; - struct timespec ts; int oldarpindex; int arpindex; struct net_device *netdev = iwdev->netdev; @@ -2214,10 +2214,26 @@ static struct i40iw_cm_node *i40iw_make_cm_node( cm_node->tcp_cntxt.rcv_wscale = I40IW_CM_DEFAULT_RCV_WND_SCALE; cm_node->tcp_cntxt.rcv_wnd = I40IW_CM_DEFAULT_RCV_WND_SCALED >> I40IW_CM_DEFAULT_RCV_WND_SCALE; - ts = current_kernel_time(); - cm_node->tcp_cntxt.loc_seq_num = ts.tv_nsec; - cm_node->tcp_cntxt.mss = (cm_node->ipv4) ? 
(iwdev->vsi.mtu - I40IW_MTU_TO_MSS_IPV4) : - (iwdev->vsi.mtu - I40IW_MTU_TO_MSS_IPV6); + if (cm_node->ipv4) { + cm_node->tcp_cntxt.loc_seq_num = secure_tcp_seq(htonl(cm_node->loc_addr[0]), + htonl(cm_node->rem_addr[0]), + htons(cm_node->loc_port), + htons(cm_node->rem_port)); + cm_node->tcp_cntxt.mss = iwdev->vsi.mtu - I40IW_MTU_TO_MSS_IPV4; + } else if (IS_ENABLED(CONFIG_IPV6)) { + __be32 loc[4] = { + htonl(cm_node->loc_addr[0]), htonl(cm_node->loc_addr[1]), + htonl(cm_node->loc_addr[2]), htonl(cm_node->loc_addr[3]) + }; + __be32 rem[4] = { + htonl(cm_node->rem_addr[0]), htonl(cm_node->rem_addr[1]), + htonl(cm_node->rem_addr[2]), htonl(cm_node->rem_addr[3]) + }; + cm_node->tcp_cntxt.loc_seq_num = secure_tcpv6_seq(loc, rem, + htons(cm_node->loc_port), + htons(cm_node->rem_port)); + cm_node->tcp_cntxt.mss = iwdev->vsi.mtu - I40IW_MTU_TO_MSS_IPV6; + } cm_node->iwdev = iwdev; cm_node->dev = &iwdev->sc_dev; diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c index 2836c5420d60..55a1fbf0e670 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_hw.c +++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c @@ -435,45 +435,24 @@ void i40iw_process_aeq(struct i40iw_device *iwdev) } /** - * i40iw_manage_apbvt - add or delete tcp port + * i40iw_cqp_manage_abvpt_cmd - send cqp command manage abpvt * @iwdev: iwarp device * @accel_local_port: port for apbvt * @add_port: add or delete port */ -int i40iw_manage_apbvt(struct i40iw_device *iwdev, u16 accel_local_port, bool add_port) +static enum i40iw_status_code +i40iw_cqp_manage_abvpt_cmd(struct i40iw_device *iwdev, + u16 accel_local_port, + bool add_port) { struct i40iw_apbvt_info *info; struct i40iw_cqp_request *cqp_request; struct cqp_commands_info *cqp_info; - unsigned long flags; - struct i40iw_cm_core *cm_core = &iwdev->cm_core; - enum i40iw_status_code status = 0; - bool in_use; - - /* apbvt_lock is held across CQP delete APBVT OP (non-waiting) to - * protect against race where add APBVT CQP can race ahead of the delete - * APBVT for same port. 
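The initial TCP sequence number is now derived from secure_tcp_seq()/secure_tcpv6_seq() instead of the nanosecond field of the current time, which was predictable. The subtle point in the hunk is byte order: the connection manager keeps addresses and ports in host order, so they are converted with htonl()/htons() before hashing. A userspace sketch of that calling convention; demo_secure_seq() is a non-cryptographic placeholder for the kernel's keyed hash and only illustrates the argument conversion:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* Placeholder for the kernel's keyed hash (net/secure_seq.h); the real
 * function mixes in a boot-time secret so the result is unpredictable. */
static uint32_t demo_secure_seq(uint32_t saddr_be, uint32_t daddr_be,
				uint16_t sport_be, uint16_t dport_be)
{
	/* NOT cryptographic - illustrates the calling convention only */
	return saddr_be ^ daddr_be ^ (((uint32_t)sport_be << 16) | dport_be);
}

int main(void)
{
	/* Host-order values, converted before hashing as the i40iw change does. */
	uint32_t loc = 0xc0000201;	/* 192.0.2.1 */
	uint32_t rem = 0xc0000202;	/* 192.0.2.2 */
	uint16_t lport = 4321, rport = 4791;

	uint32_t isn = demo_secure_seq(htonl(loc), htonl(rem),
				       htons(lport), htons(rport));
	printf("initial sequence number: 0x%08x\n", isn);
	return 0;
}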
- */ - spin_lock_irqsave(&cm_core->apbvt_lock, flags); - - if (!add_port) { - in_use = i40iw_port_in_use(cm_core, accel_local_port); - if (in_use) - goto exit; - clear_bit(accel_local_port, cm_core->ports_in_use); - } else { - in_use = test_and_set_bit(accel_local_port, - cm_core->ports_in_use); - spin_unlock_irqrestore(&cm_core->apbvt_lock, flags); - if (in_use) - return 0; - } + enum i40iw_status_code status; cqp_request = i40iw_get_cqp_request(&iwdev->cqp, add_port); - if (!cqp_request) { - status = -ENOMEM; - goto exit; - } + if (!cqp_request) + return I40IW_ERR_NO_MEMORY; cqp_info = &cqp_request->info; info = &cqp_info->in.u.manage_apbvt_entry.info; @@ -489,14 +468,54 @@ int i40iw_manage_apbvt(struct i40iw_device *iwdev, u16 accel_local_port, bool ad status = i40iw_handle_cqp_op(iwdev, cqp_request); if (status) i40iw_pr_err("CQP-OP Manage APBVT entry fail"); -exit: - if (!add_port) - spin_unlock_irqrestore(&cm_core->apbvt_lock, flags); return status; } /** + * i40iw_manage_apbvt - add or delete tcp port + * @iwdev: iwarp device + * @accel_local_port: port for apbvt + * @add_port: add or delete port + */ +enum i40iw_status_code i40iw_manage_apbvt(struct i40iw_device *iwdev, + u16 accel_local_port, + bool add_port) +{ + struct i40iw_cm_core *cm_core = &iwdev->cm_core; + enum i40iw_status_code status; + unsigned long flags; + bool in_use; + + /* apbvt_lock is held across CQP delete APBVT OP (non-waiting) to + * protect against race where add APBVT CQP can race ahead of the delete + * APBVT for same port. + */ + if (add_port) { + spin_lock_irqsave(&cm_core->apbvt_lock, flags); + in_use = __test_and_set_bit(accel_local_port, + cm_core->ports_in_use); + spin_unlock_irqrestore(&cm_core->apbvt_lock, flags); + if (in_use) + return 0; + return i40iw_cqp_manage_abvpt_cmd(iwdev, accel_local_port, + true); + } else { + spin_lock_irqsave(&cm_core->apbvt_lock, flags); + in_use = i40iw_port_in_use(cm_core, accel_local_port); + if (in_use) { + spin_unlock_irqrestore(&cm_core->apbvt_lock, flags); + return 0; + } + __clear_bit(accel_local_port, cm_core->ports_in_use); + status = i40iw_cqp_manage_abvpt_cmd(iwdev, accel_local_port, + false); + spin_unlock_irqrestore(&cm_core->apbvt_lock, flags); + return status; + } +} + +/** * i40iw_manage_arp_cache - manage hw arp cache * @iwdev: iwarp device * @mac_addr: mac address ptr diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 68679ad4c6da..e2e6c74a7452 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -71,7 +71,8 @@ static int i40iw_query_device(struct ib_device *ibdev, props->max_mr_size = I40IW_MAX_OUTBOUND_MESSAGE_SIZE; props->max_qp = iwdev->max_qp - iwdev->used_qps; props->max_qp_wr = I40IW_MAX_QP_WRS; - props->max_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + props->max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + props->max_recv_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; props->max_cq = iwdev->max_cq - iwdev->used_cqs; props->max_cqe = iwdev->max_cqe; props->max_mr = iwdev->max_mr - iwdev->used_mrs; @@ -1409,6 +1410,7 @@ static void i40iw_set_hugetlb_values(u64 addr, struct i40iw_mr *iwmr) struct vm_area_struct *vma; struct hstate *h; + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); if (vma && is_vm_hugetlb_page(vma)) { h = hstate_vma(vma); @@ -1417,6 +1419,7 @@ static void i40iw_set_hugetlb_values(u64 addr, struct i40iw_mr *iwmr) iwmr->page_msk = huge_page_mask(h); } } + up_read(¤t->mm->mmap_sem); } /** @@ -2198,8 +2201,8 @@ static void 
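find_vma() walks the process VMA tree and must be called with the mm's mmap semaphore held for read; the hugetlb sizing helper above was doing the lookup unlocked, and the fix brackets it with down_read()/up_read(). A minimal kernel-style sketch of the corrected pattern (kernels of this vintage call the lock mmap_sem; it was renamed later), using an invented demo_* function name:

#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/sched.h>

static unsigned long demo_vma_page_size(unsigned long addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long size = PAGE_SIZE;

	down_read(&mm->mmap_sem);	/* find_vma() requires this held */
	vma = find_vma(mm, addr);
	if (vma && is_vm_hugetlb_page(vma))
		size = huge_page_size(hstate_vma(vma));
	up_read(&mm->mmap_sem);

	return size;
}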
i40iw_copy_sg_list(struct i40iw_sge *sg_list, struct ib_sge *sgl, in * @bad_wr: return of bad wr if err */ static int i40iw_post_send(struct ib_qp *ibqp, - struct ib_send_wr *ib_wr, - struct ib_send_wr **bad_wr) + const struct ib_send_wr *ib_wr, + const struct ib_send_wr **bad_wr) { struct i40iw_qp *iwqp; struct i40iw_qp_uk *ukqp; @@ -2374,9 +2377,8 @@ out: * @ib_wr: work request for receive * @bad_wr: bad wr caused an error */ -static int i40iw_post_recv(struct ib_qp *ibqp, - struct ib_recv_wr *ib_wr, - struct ib_recv_wr **bad_wr) +static int i40iw_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *ib_wr, + const struct ib_recv_wr **bad_wr) { struct i40iw_qp *iwqp; struct i40iw_qp_uk *ukqp; @@ -2701,21 +2703,6 @@ static int i40iw_query_gid(struct ib_device *ibdev, } /** - * i40iw_modify_port Modify port properties - * @ibdev: device pointer from stack - * @port: port number - * @port_modify_mask: mask for port modifications - * @props: port properties - */ -static int i40iw_modify_port(struct ib_device *ibdev, - u8 port, - int port_modify_mask, - struct ib_port_modify *props) -{ - return -ENOSYS; -} - -/** * i40iw_query_pkey - Query partition key * @ibdev: device pointer from stack * @port: port number @@ -2732,28 +2719,6 @@ static int i40iw_query_pkey(struct ib_device *ibdev, } /** - * i40iw_create_ah - create address handle - * @ibpd: ptr of pd - * @ah_attr: address handle attributes - */ -static struct ib_ah *i40iw_create_ah(struct ib_pd *ibpd, - struct rdma_ah_attr *attr, - struct ib_udata *udata) - -{ - return ERR_PTR(-ENOSYS); -} - -/** - * i40iw_destroy_ah - Destroy address handle - * @ah: pointer to address handle - */ -static int i40iw_destroy_ah(struct ib_ah *ah) -{ - return -ENOSYS; -} - -/** * i40iw_get_vector_affinity - report IRQ affinity mask * @ibdev: IB device * @comp_vector: completion vector index @@ -2820,7 +2785,6 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev iwibdev->ibdev.num_comp_vectors = iwdev->ceqs_count; iwibdev->ibdev.dev.parent = &pcidev->dev; iwibdev->ibdev.query_port = i40iw_query_port; - iwibdev->ibdev.modify_port = i40iw_modify_port; iwibdev->ibdev.query_pkey = i40iw_query_pkey; iwibdev->ibdev.query_gid = i40iw_query_gid; iwibdev->ibdev.alloc_ucontext = i40iw_alloc_ucontext; @@ -2840,8 +2804,6 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev iwibdev->ibdev.alloc_hw_stats = i40iw_alloc_hw_stats; iwibdev->ibdev.get_hw_stats = i40iw_get_hw_stats; iwibdev->ibdev.query_device = i40iw_query_device; - iwibdev->ibdev.create_ah = i40iw_create_ah; - iwibdev->ibdev.destroy_ah = i40iw_destroy_ah; iwibdev->ibdev.drain_sq = i40iw_drain_sq; iwibdev->ibdev.drain_rq = i40iw_drain_rq; iwibdev->ibdev.alloc_mr = i40iw_alloc_mr; diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index 9345d5b546d1..e9e3a6f390db 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -82,12 +82,11 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct mlx4_ib_ah *ah) { struct mlx4_ib_dev *ibdev = to_mdev(pd->device); + const struct ib_gid_attr *gid_attr; struct mlx4_dev *dev = ibdev->dev; int is_mcast = 0; struct in6_addr in6; u16 vlan_tag = 0xffff; - union ib_gid sgid; - struct ib_gid_attr gid_attr; const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); int ret; @@ -96,25 +95,30 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, is_mcast = 1; memcpy(ah->av.eth.mac, ah_attr->roce.dmac, ETH_ALEN); - ret = ib_get_cached_gid(pd->device, 
rdma_ah_get_port_num(ah_attr), - grh->sgid_index, &sgid, &gid_attr); - if (ret) - return ERR_PTR(ret); eth_zero_addr(ah->av.eth.s_mac); - if (is_vlan_dev(gid_attr.ndev)) - vlan_tag = vlan_dev_vlan_id(gid_attr.ndev); - memcpy(ah->av.eth.s_mac, gid_attr.ndev->dev_addr, ETH_ALEN); - dev_put(gid_attr.ndev); + + /* + * If sgid_attr is NULL we are being called by mlx4_ib_create_ah_slave + * and we are directly creating an AV for a slave's gid_index. + */ + gid_attr = ah_attr->grh.sgid_attr; + if (gid_attr) { + if (is_vlan_dev(gid_attr->ndev)) + vlan_tag = vlan_dev_vlan_id(gid_attr->ndev); + memcpy(ah->av.eth.s_mac, gid_attr->ndev->dev_addr, ETH_ALEN); + ret = mlx4_ib_gid_index_to_real_index(ibdev, gid_attr); + if (ret < 0) + return ERR_PTR(ret); + ah->av.eth.gid_index = ret; + } else { + /* mlx4_ib_create_ah_slave fills in the s_mac and the vlan */ + ah->av.eth.gid_index = ah_attr->grh.sgid_index; + } + if (vlan_tag < 0x1000) vlan_tag |= (rdma_ah_get_sl(ah_attr) & 7) << 13; ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (rdma_ah_get_port_num(ah_attr) << 24)); - ret = mlx4_ib_gid_index_to_real_index(ibdev, - rdma_ah_get_port_num(ah_attr), - grh->sgid_index); - if (ret < 0) - return ERR_PTR(ret); - ah->av.eth.gid_index = ret; ah->av.eth.vlan = cpu_to_be16(vlan_tag); ah->av.eth.hop_limit = grh->hop_limit; if (rdma_ah_get_static_rate(ah_attr)) { @@ -173,6 +177,40 @@ struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, return create_ib_ah(pd, ah_attr, ah); /* never fails */ } +/* AH's created via this call must be free'd by mlx4_ib_destroy_ah. */ +struct ib_ah *mlx4_ib_create_ah_slave(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + int slave_sgid_index, u8 *s_mac, + u16 vlan_tag) +{ + struct rdma_ah_attr slave_attr = *ah_attr; + struct mlx4_ib_ah *mah; + struct ib_ah *ah; + + slave_attr.grh.sgid_attr = NULL; + slave_attr.grh.sgid_index = slave_sgid_index; + ah = mlx4_ib_create_ah(pd, &slave_attr, NULL); + if (IS_ERR(ah)) + return ah; + + ah->device = pd->device; + ah->pd = pd; + ah->type = ah_attr->type; + mah = to_mah(ah); + + /* get rid of force-loopback bit */ + mah->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF); + + if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) + memcpy(mah->av.eth.s_mac, s_mac, 6); + + if (vlan_tag < 0x1000) + vlan_tag |= (rdma_ah_get_sl(ah_attr) & 7) << 13; + mah->av.eth.vlan = cpu_to_be16(vlan_tag); + + return ah; +} + int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) { struct mlx4_ib_ah *ah = to_mah(ibah); diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 90a3e2642c2e..e5466d786bb1 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -506,7 +506,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, { struct ib_sge list; struct ib_ud_wr wr; - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; struct mlx4_ib_demux_pv_ctx *tun_ctx; struct mlx4_ib_demux_pv_qp *tun_qp; struct mlx4_rcv_tunnel_mad *tun_mad; @@ -1310,7 +1310,8 @@ static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx, int index) { struct ib_sge sg_list; - struct ib_recv_wr recv_wr, *bad_recv_wr; + struct ib_recv_wr recv_wr; + const struct ib_recv_wr *bad_recv_wr; int size; size = (tun_qp->qp->qp_type == IB_QPT_UD) ? 
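When a valid VLAN is present, the low 12 bits of the tag carry the VLAN ID, so the AH code folds the 3-bit IB service level into the 802.1Q priority bits (15:13) before storing the tag big-endian; the 0xffff sentinel meaning "no VLAN" is left untouched. A runnable sketch of that packing; build_tci() is an invented name:

#include <endian.h>
#include <stdint.h>
#include <stdio.h>

/* Fold a 3-bit service level into the PCP bits (15:13) of an 802.1Q TCI.
 * 0xffff is the driver's "no VLAN" sentinel and must stay as-is. */
static uint16_t build_tci(uint16_t vlan_id, uint8_t sl)
{
	if (vlan_id < 0x1000)			/* valid VID: bits 11:0 */
		vlan_id |= (uint16_t)((sl & 7) << 13);
	return vlan_id;
}

int main(void)
{
	uint16_t tci = build_tci(100, 5);	/* VID 100, SL 5 */
	uint16_t wire = htobe16(tci);		/* stored big-endian, as in the AV */

	printf("tci=0x%04x wire(be)=0x%04x no-vlan=0x%04x\n",
	       tci, wire, build_tci(0xffff, 5));
	return 0;
}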
@@ -1361,19 +1362,16 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, { struct ib_sge list; struct ib_ud_wr wr; - struct ib_send_wr *bad_wr; + const struct ib_send_wr *bad_wr; struct mlx4_ib_demux_pv_ctx *sqp_ctx; struct mlx4_ib_demux_pv_qp *sqp; struct mlx4_mad_snd_buf *sqp_mad; struct ib_ah *ah; struct ib_qp *send_qp = NULL; - struct ib_global_route *grh; unsigned wire_tx_ix = 0; int ret = 0; u16 wire_pkey_ix; int src_qpnum; - u8 sgid_index; - sqp_ctx = dev->sriov.sqps[port-1]; @@ -1394,16 +1392,11 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, send_qp = sqp->qp; /* create ah */ - grh = rdma_ah_retrieve_grh(attr); - sgid_index = grh->sgid_index; - grh->sgid_index = 0; - ah = rdma_create_ah(sqp_ctx->pd, attr); + ah = mlx4_ib_create_ah_slave(sqp_ctx->pd, attr, + rdma_ah_retrieve_grh(attr)->sgid_index, + s_mac, vlan_id); if (IS_ERR(ah)) return -ENOMEM; - grh->sgid_index = sgid_index; - to_mah(ah)->av.ib.gid_index = sgid_index; - /* get rid of force-loopback bit */ - to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF); spin_lock(&sqp->tx_lock); if (sqp->tx_ix_head - sqp->tx_ix_tail >= (MLX4_NUM_TUNNEL_BUFS - 1)) @@ -1445,12 +1438,6 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, wr.wr.num_sge = 1; wr.wr.opcode = IB_WR_SEND; wr.wr.send_flags = IB_SEND_SIGNALED; - if (s_mac) - memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6); - if (vlan_id < 0x1000) - vlan_id |= (rdma_ah_get_sl(attr) & 7) << 13; - to_mah(ah)->av.eth.vlan = cpu_to_be16(vlan_id); - ret = ib_post_send(send_qp, &wr.wr, &bad_wr); if (!ret) @@ -1461,7 +1448,7 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, spin_unlock(&sqp->tx_lock); sqp->tx_ring[wire_tx_ix].ah = NULL; out: - rdma_destroy_ah(ah); + mlx4_ib_destroy_ah(ah); return ret; } diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 4ec519afc45b..ca0f1ee26091 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -246,9 +246,7 @@ static int mlx4_ib_update_gids(struct gid_entry *gids, return mlx4_ib_update_gids_v1(gids, ibdev, port_num); } -static int mlx4_ib_add_gid(const union ib_gid *gid, - const struct ib_gid_attr *attr, - void **context) +static int mlx4_ib_add_gid(const struct ib_gid_attr *attr, void **context) { struct mlx4_ib_dev *ibdev = to_mdev(attr->device); struct mlx4_ib_iboe *iboe = &ibdev->iboe; @@ -271,8 +269,9 @@ static int mlx4_ib_add_gid(const union ib_gid *gid, port_gid_table = &iboe->gids[attr->port_num - 1]; spin_lock_bh(&iboe->lock); for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) { - if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid)) && - (port_gid_table->gids[i].gid_type == attr->gid_type)) { + if (!memcmp(&port_gid_table->gids[i].gid, + &attr->gid, sizeof(attr->gid)) && + port_gid_table->gids[i].gid_type == attr->gid_type) { found = i; break; } @@ -289,7 +288,8 @@ static int mlx4_ib_add_gid(const union ib_gid *gid, ret = -ENOMEM; } else { *context = port_gid_table->gids[free].ctx; - memcpy(&port_gid_table->gids[free].gid, gid, sizeof(*gid)); + memcpy(&port_gid_table->gids[free].gid, + &attr->gid, sizeof(attr->gid)); port_gid_table->gids[free].gid_type = attr->gid_type; port_gid_table->gids[free].ctx->real_index = free; port_gid_table->gids[free].ctx->refcount = 1; @@ -380,17 +380,15 @@ static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context) } int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, - u8 port_num, int index) + const struct ib_gid_attr *attr) { 
struct mlx4_ib_iboe *iboe = &ibdev->iboe; struct gid_cache_context *ctx = NULL; - union ib_gid gid; struct mlx4_port_gid_table *port_gid_table; int real_index = -EINVAL; int i; - int ret; unsigned long flags; - struct ib_gid_attr attr; + u8 port_num = attr->port_num; if (port_num > MLX4_MAX_PORTS) return -EINVAL; @@ -399,21 +397,15 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, port_num = 1; if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num)) - return index; - - ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, &attr); - if (ret) - return ret; - - if (attr.ndev) - dev_put(attr.ndev); + return attr->index; spin_lock_irqsave(&iboe->lock, flags); port_gid_table = &iboe->gids[port_num - 1]; for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) - if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid)) && - attr.gid_type == port_gid_table->gids[i].gid_type) { + if (!memcmp(&port_gid_table->gids[i].gid, + &attr->gid, sizeof(attr->gid)) && + attr->gid_type == port_gid_table->gids[i].gid_type) { ctx = port_gid_table->gids[i].ctx; break; } @@ -525,8 +517,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; - props->max_sge = min(dev->dev->caps.max_sq_sg, - dev->dev->caps.max_rq_sg); + props->max_send_sge = dev->dev->caps.max_sq_sg; + props->max_recv_sge = dev->dev->caps.max_rq_sg; props->max_sge_rd = MLX4_MAX_SGE_RD; props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; @@ -770,7 +762,8 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, IB_WIDTH_4X : IB_WIDTH_1X; props->active_speed = (((u8 *)mailbox->buf)[5] == 0x20 /*56Gb*/) ? 
IB_SPEED_FDR : IB_SPEED_QDR; - props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS; + props->port_cap_flags = IB_PORT_CM_SUP; + props->ip_gids = true; props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; @@ -2709,6 +2702,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; ibdev->ib_dev.query_qp = mlx4_ib_query_qp; ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; + ibdev->ib_dev.drain_sq = mlx4_ib_drain_sq; + ibdev->ib_dev.drain_rq = mlx4_ib_drain_rq; ibdev->ib_dev.post_send = mlx4_ib_post_send; ibdev->ib_dev.post_recv = mlx4_ib_post_recv; ibdev->ib_dev.create_cq = mlx4_ib_create_cq; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 7b1429917aba..e10dccc7958f 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -322,7 +322,6 @@ struct mlx4_ib_qp { u32 doorbell_qpn; __be32 sq_signal_bits; unsigned sq_next_wqe; - int sq_max_wqes_per_wr; int sq_spare_wqes; struct mlx4_ib_wq sq; @@ -760,6 +759,10 @@ void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq); struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata); +struct ib_ah *mlx4_ib_create_ah_slave(struct ib_pd *pd, + struct rdma_ah_attr *ah_attr, + int slave_sgid_index, u8 *s_mac, + u16 vlan_tag); int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); int mlx4_ib_destroy_ah(struct ib_ah *ah); @@ -771,21 +774,23 @@ int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int mlx4_ib_destroy_srq(struct ib_srq *srq); void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index); -int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); int mlx4_ib_destroy_qp(struct ib_qp *qp); +void mlx4_ib_drain_sq(struct ib_qp *qp); +void mlx4_ib_drain_rq(struct ib_qp *qp); int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); -int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, @@ -900,7 +905,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, - u8 port_num, int index); + const struct ib_gid_attr *attr); void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev, int port); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 3b8045fd23ed..6dd3cd2c2f80 100644 --- 
a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -204,91 +204,26 @@ static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) /* * Stamp a SQ WQE so that it is invalid if prefetched by marking the - * first four bytes of every 64 byte chunk with - * 0x7FFFFFF | (invalid_ownership_value << 31). - * - * When the max work request size is less than or equal to the WQE - * basic block size, as an optimization, we can stamp all WQEs with - * 0xffffffff, and skip the very first chunk of each WQE. + * first four bytes of every 64 byte chunk with 0xffffffff, except for + * the very first chunk of the WQE. */ -static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) +static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n) { __be32 *wqe; int i; int s; - int ind; void *buf; - __be32 stamp; struct mlx4_wqe_ctrl_seg *ctrl; - if (qp->sq_max_wqes_per_wr > 1) { - s = roundup(size, 1U << qp->sq.wqe_shift); - for (i = 0; i < s; i += 64) { - ind = (i >> qp->sq.wqe_shift) + n; - stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) : - cpu_to_be32(0xffffffff); - buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1)); - wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1)); - *wqe = stamp; - } - } else { - ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); - s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4; - for (i = 64; i < s; i += 64) { - wqe = buf + i; - *wqe = cpu_to_be32(0xffffffff); - } + buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); + ctrl = (struct mlx4_wqe_ctrl_seg *)buf; + s = (ctrl->qpn_vlan.fence_size & 0x3f) << 4; + for (i = 64; i < s; i += 64) { + wqe = buf + i; + *wqe = cpu_to_be32(0xffffffff); } } -static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) -{ - struct mlx4_wqe_ctrl_seg *ctrl; - struct mlx4_wqe_inline_seg *inl; - void *wqe; - int s; - - ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1)); - s = sizeof(struct mlx4_wqe_ctrl_seg); - - if (qp->ibqp.qp_type == IB_QPT_UD) { - struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl; - struct mlx4_av *av = (struct mlx4_av *)dgram->av; - memset(dgram, 0, sizeof *dgram); - av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn); - s += sizeof(struct mlx4_wqe_datagram_seg); - } - - /* Pad the remainder of the WQE with an inline data segment. */ - if (size > s) { - inl = wqe + s; - inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl)); - } - ctrl->srcrb_flags = 0; - ctrl->qpn_vlan.fence_size = size / 16; - /* - * Make sure descriptor is fully written before setting ownership bit - * (because HW can start executing as soon as we do). - */ - wmb(); - - ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) | - (n & qp->sq.wqe_cnt ? 
cpu_to_be32(1 << 31) : 0); - - stamp_send_wqe(qp, n + qp->sq_spare_wqes, size); -} - -/* Post NOP WQE to prevent wrap-around in the middle of WR */ -static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) -{ - unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1)); - if (unlikely(s < qp->sq_max_wqes_per_wr)) { - post_nop_wqe(qp, ind, s << qp->sq.wqe_shift); - ind += s; - } - return ind; -} - static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) { struct ib_event event; @@ -433,8 +368,7 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, } static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, - enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp, - bool shrink_wqe) + enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp) { int s; @@ -461,70 +395,20 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, if (s > dev->dev->caps.max_sq_desc_sz) return -EINVAL; + qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); + /* - * Hermon supports shrinking WQEs, such that a single work - * request can include multiple units of 1 << wqe_shift. This - * way, work requests can differ in size, and do not have to - * be a power of 2 in size, saving memory and speeding up send - * WR posting. Unfortunately, if we do this then the - * wqe_index field in CQEs can't be used to look up the WR ID - * anymore, so we do this only if selective signaling is off. - * - * Further, on 32-bit platforms, we can't use vmap() to make - * the QP buffer virtually contiguous. Thus we have to use - * constant-sized WRs to make sure a WR is always fully within - * a single page-sized chunk. - * - * Finally, we use NOP work requests to pad the end of the - * work queue, to avoid wrap-around in the middle of WR. We - * set NEC bit to avoid getting completions with error for - * these NOP WRs, but since NEC is only supported starting - * with firmware 2.2.232, we use constant-sized WRs for older - * firmware. - * - * And, since MLX QPs only support SEND, we use constant-sized - * WRs in this case. - * - * We look for the smallest value of wqe_shift such that the - * resulting number of wqes does not exceed device - * capabilities. - * - * We set WQE size to at least 64 bytes, this way stamping - * invalidates each WQE. + * We need to leave 2 KB + 1 WR of headroom in the SQ to + * allow HW to prefetch. */ - if (shrink_wqe && dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && - qp->sq_signal_bits && BITS_PER_LONG == 64 && - type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI && - !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | - MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) - qp->sq.wqe_shift = ilog2(64); - else - qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s)); - - for (;;) { - qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift); - - /* - * We need to leave 2 KB + 1 WR of headroom in the SQ to - * allow HW to prefetch. 
- */ - qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr; - qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr * - qp->sq_max_wqes_per_wr + - qp->sq_spare_wqes); - - if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes) - break; - - if (qp->sq_max_wqes_per_wr <= 1) - return -EINVAL; - - ++qp->sq.wqe_shift; - } - - qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz, - (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) - - send_wqe_overhead(type, qp->flags)) / + qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + 1; + qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr + + qp->sq_spare_wqes); + + qp->sq.max_gs = + (min(dev->dev->caps.max_sq_desc_sz, + (1 << qp->sq.wqe_shift)) - + send_wqe_overhead(type, qp->flags)) / sizeof (struct mlx4_wqe_data_seg); qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + @@ -538,7 +422,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, } cap->max_send_wr = qp->sq.max_post = - (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr; + qp->sq.wqe_cnt - qp->sq_spare_wqes; cap->max_send_sge = min(qp->sq.max_gs, min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)); @@ -977,7 +861,6 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, { int qpn; int err; - struct ib_qp_cap backup_cap; struct mlx4_ib_sqp *sqp = NULL; struct mlx4_ib_qp *qp; enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; @@ -1178,9 +1061,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err; } - memcpy(&backup_cap, &init_attr->cap, sizeof(backup_cap)); - err = set_kernel_sq_size(dev, &init_attr->cap, - qp_type, qp, true); + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); if (err) goto err; @@ -1192,20 +1073,10 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, *qp->db.db = 0; } - if (mlx4_buf_alloc(dev->dev, qp->buf_size, qp->buf_size, + if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf)) { - memcpy(&init_attr->cap, &backup_cap, - sizeof(backup_cap)); - err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, - qp, false); - if (err) - goto err_db; - - if (mlx4_buf_alloc(dev->dev, qp->buf_size, - PAGE_SIZE * 2, &qp->buf)) { - err = -ENOMEM; - goto err_db; - } + err = -ENOMEM; + goto err_db; } err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, @@ -1859,8 +1730,7 @@ static int _mlx4_set_path(struct mlx4_ib_dev *dev, if (rdma_ah_get_ah_flags(ah) & IB_AH_GRH) { const struct ib_global_route *grh = rdma_ah_read_grh(ah); int real_sgid_index = - mlx4_ib_gid_index_to_real_index(dev, port, - grh->sgid_index); + mlx4_ib_gid_index_to_real_index(dev, grh->sgid_attr); if (real_sgid_index < 0) return real_sgid_index; @@ -2176,6 +2046,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, { struct ib_uobject *ibuobject; struct ib_srq *ibsrq; + const struct ib_gid_attr *gid_attr = NULL; struct ib_rwq_ind_table *rwq_ind_tbl; enum ib_qp_type qp_type; struct mlx4_ib_dev *dev; @@ -2356,29 +2227,17 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, if (attr_mask & IB_QP_AV) { u8 port_num = mlx4_is_bonded(dev->dev) ? 1 : attr_mask & IB_QP_PORT ? 
attr->port_num : qp->port; - union ib_gid gid; - struct ib_gid_attr gid_attr = {.gid_type = IB_GID_TYPE_IB}; u16 vlan = 0xffff; u8 smac[ETH_ALEN]; - int status = 0; int is_eth = rdma_cap_eth_ah(&dev->ib_dev, port_num) && rdma_ah_get_ah_flags(&attr->ah_attr) & IB_AH_GRH; if (is_eth) { - int index = - rdma_ah_read_grh(&attr->ah_attr)->sgid_index; - - status = ib_get_cached_gid(&dev->ib_dev, port_num, - index, &gid, &gid_attr); - if (!status) { - vlan = rdma_vlan_dev_vlan_id(gid_attr.ndev); - memcpy(smac, gid_attr.ndev->dev_addr, ETH_ALEN); - dev_put(gid_attr.ndev); - } + gid_attr = attr->ah_attr.grh.sgid_attr; + vlan = rdma_vlan_dev_vlan_id(gid_attr->ndev); + memcpy(smac, gid_attr->ndev->dev_addr, ETH_ALEN); } - if (status) - goto out; if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path, port_num, vlan, smac)) @@ -2389,7 +2248,7 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, if (is_eth && (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) { - u8 qpc_roce_mode = gid_type_to_qpc(gid_attr.gid_type); + u8 qpc_roce_mode = gid_type_to_qpc(gid_attr->gid_type); if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_UNDEFINED) { err = -EINVAL; @@ -2594,11 +2453,9 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, for (i = 0; i < qp->sq.wqe_cnt; ++i) { ctrl = get_send_wqe(qp, i); ctrl->owner_opcode = cpu_to_be32(1 << 31); - if (qp->sq_max_wqes_per_wr == 1) - ctrl->qpn_vlan.fence_size = - 1 << (qp->sq.wqe_shift - 4); - - stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift); + ctrl->qpn_vlan.fence_size = + 1 << (qp->sq.wqe_shift - 4); + stamp_send_wqe(qp, i); } } @@ -2937,7 +2794,7 @@ static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) } static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, - struct ib_ud_wr *wr, + const struct ib_ud_wr *wr, void *wqe, unsigned *mlx_seg_len) { struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device); @@ -3085,7 +2942,7 @@ static int fill_gid_by_hw_index(struct mlx4_ib_dev *ibdev, u8 port_num, } #define MLX4_ROCEV2_QP1_SPORT 0xC000 -static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, +static int build_mlx_header(struct mlx4_ib_sqp *sqp, const struct ib_ud_wr *wr, void *wqe, unsigned *mlx_seg_len) { struct ib_device *ib_dev = sqp->qp.ibqp.device; @@ -3181,10 +3038,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1]. 
guid_cache[ah->av.ib.gid_index]; } else { - ib_get_cached_gid(ib_dev, - be32_to_cpu(ah->av.ib.port_pd) >> 24, - ah->av.ib.gid_index, - &sqp->ud_header.grh.source_gid, NULL); + sqp->ud_header.grh.source_gid = + ah->ibah.sgid_attr->gid; } } memcpy(sqp->ud_header.grh.destination_gid.raw, @@ -3369,7 +3224,7 @@ static __be32 convert_access(int acc) } static void set_reg_seg(struct mlx4_wqe_fmr_seg *fseg, - struct ib_reg_wr *wr) + const struct ib_reg_wr *wr) { struct mlx4_ib_mr *mr = to_mmr(wr->mr); @@ -3399,7 +3254,7 @@ static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg, } static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, - struct ib_atomic_wr *wr) + const struct ib_atomic_wr *wr) { if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { aseg->swap_add = cpu_to_be64(wr->swap); @@ -3415,7 +3270,7 @@ static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, } static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, - struct ib_atomic_wr *wr) + const struct ib_atomic_wr *wr) { aseg->swap_add = cpu_to_be64(wr->swap); aseg->swap_add_mask = cpu_to_be64(wr->swap_mask); @@ -3424,7 +3279,7 @@ static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg, } static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, - struct ib_ud_wr *wr) + const struct ib_ud_wr *wr) { memcpy(dseg->av, &to_mah(wr->ah)->av, sizeof (struct mlx4_av)); dseg->dqpn = cpu_to_be32(wr->remote_qpn); @@ -3435,7 +3290,7 @@ static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg, static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, struct mlx4_wqe_datagram_seg *dseg, - struct ib_ud_wr *wr, + const struct ib_ud_wr *wr, enum mlx4_ib_qp_type qpt) { union mlx4_ext_av *av = &to_mah(wr->ah)->av; @@ -3457,7 +3312,8 @@ static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev, dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY); } -static void build_tunnel_header(struct ib_ud_wr *wr, void *wqe, unsigned *mlx_seg_len) +static void build_tunnel_header(const struct ib_ud_wr *wr, void *wqe, + unsigned *mlx_seg_len) { struct mlx4_wqe_inline_seg *inl = wqe; struct mlx4_ib_tunnel_header hdr; @@ -3540,9 +3396,9 @@ static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) dseg->addr = cpu_to_be64(sg->addr); } -static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_ud_wr *wr, - struct mlx4_ib_qp *qp, unsigned *lso_seg_len, - __be32 *lso_hdr_sz, __be32 *blh) +static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, + const struct ib_ud_wr *wr, struct mlx4_ib_qp *qp, + unsigned *lso_seg_len, __be32 *lso_hdr_sz, __be32 *blh) { unsigned halign = ALIGN(sizeof *wqe + wr->hlen, 16); @@ -3560,7 +3416,7 @@ static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_ud_wr *wr, return 0; } -static __be32 send_ieth(struct ib_send_wr *wr) +static __be32 send_ieth(const struct ib_send_wr *wr) { switch (wr->opcode) { case IB_WR_SEND_WITH_IMM: @@ -3582,8 +3438,8 @@ static void add_zero_len_inline(void *wqe) inl->byte_count = cpu_to_be32(1 << 31); } -int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +static int _mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr, bool drain) { struct mlx4_ib_qp *qp = to_mqp(ibqp); void *wqe; @@ -3593,7 +3449,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, int nreq; int err = 0; unsigned ind; - int uninitialized_var(stamp); int uninitialized_var(size); unsigned uninitialized_var(seglen); __be32 dummy; @@ -3623,7 
+3478,8 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } spin_lock_irqsave(&qp->sq.lock, flags); - if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR && + !drain) { err = -EIO; *bad_wr = wr; nreq = 0; @@ -3865,22 +3721,14 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] | (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh; - stamp = ind + qp->sq_spare_wqes; - ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift); - /* * We can improve latency by not stamping the last * send queue WQE until after ringing the doorbell, so * only stamp here if there are still more WQEs to post. - * - * Same optimization applies to padding with NOP wqe - * in case of WQE shrinking (used to prevent wrap-around - * in the middle of WR). */ - if (wr->next) { - stamp_send_wqe(qp, stamp, size * 16); - ind = pad_wraparound(qp, ind); - } + if (wr->next) + stamp_send_wqe(qp, ind + qp->sq_spare_wqes); + ind++; } out: @@ -3902,9 +3750,8 @@ out: */ mmiowb(); - stamp_send_wqe(qp, stamp, size * 16); + stamp_send_wqe(qp, ind + qp->sq_spare_wqes - 1); - ind = pad_wraparound(qp, ind); qp->sq_next_wqe = ind; } @@ -3913,8 +3760,14 @@ out: return err; } -int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mlx4_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + return _mlx4_ib_post_send(ibqp, wr, bad_wr, false); +} + +static int _mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr, bool drain) { struct mlx4_ib_qp *qp = to_mqp(ibqp); struct mlx4_wqe_data_seg *scat; @@ -3929,7 +3782,8 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, max_gs = qp->rq.max_gs; spin_lock_irqsave(&qp->rq.lock, flags); - if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR && + !drain) { err = -EIO; *bad_wr = wr; nreq = 0; @@ -4000,6 +3854,12 @@ out: return err; } +int mlx4_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + return _mlx4_ib_post_recv(ibqp, wr, bad_wr, false); +} + static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state) { switch (mlx4_state) { @@ -4047,9 +3907,9 @@ static void to_rdma_ah_attr(struct mlx4_ib_dev *ibdev, u8 port_num = path->sched_queue & 0x40 ? 
2 : 1; memset(ah_attr, 0, sizeof(*ah_attr)); - ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, port_num); if (port_num == 0 || port_num > dev->caps.num_ports) return; + ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, port_num); if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) rdma_ah_set_sl(ah_attr, ((path->sched_queue >> 3) & 0x7) | @@ -4465,3 +4325,132 @@ int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) kfree(ib_rwq_ind_tbl); return 0; } + +struct mlx4_ib_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void mlx4_ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx4_ib_drain_cqe *cqe = container_of(wc->wr_cqe, + struct mlx4_ib_drain_cqe, + cqe); + + complete(&cqe->done); +} + +/* This function returns only once the drained WR was completed */ +static void handle_drain_completion(struct ib_cq *cq, + struct mlx4_ib_drain_cqe *sdrain, + struct mlx4_ib_dev *dev) +{ + struct mlx4_dev *mdev = dev->dev; + + if (cq->poll_ctx == IB_POLL_DIRECT) { + while (wait_for_completion_timeout(&sdrain->done, HZ / 10) <= 0) + ib_process_cq_direct(cq, -1); + return; + } + + if (mdev->persist->state == MLX4_DEVICE_STATE_INTERNAL_ERROR) { + struct mlx4_ib_cq *mcq = to_mcq(cq); + bool triggered = false; + unsigned long flags; + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + /* Make sure that the CQ handler won't run if wasn't run yet */ + if (!mcq->mcq.reset_notify_added) + mcq->mcq.reset_notify_added = 1; + else + triggered = true; + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + if (triggered) { + /* Wait for any scheduled/running task to be ended */ + switch (cq->poll_ctx) { + case IB_POLL_SOFTIRQ: + irq_poll_disable(&cq->iop); + irq_poll_enable(&cq->iop); + break; + case IB_POLL_WORKQUEUE: + cancel_work_sync(&cq->work); + break; + default: + WARN_ON_ONCE(1); + } + } + + /* Run the CQ handler - this makes sure that the drain WR will + * be processed if wasn't processed yet. 
+ */ + mcq->mcq.comp(&mcq->mcq); + } + + wait_for_completion(&sdrain->done); +} + +void mlx4_ib_drain_sq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->send_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct mlx4_ib_drain_cqe sdrain; + const struct ib_send_wr *bad_swr; + struct ib_rdma_wr swr = { + .wr = { + .next = NULL, + { .wr_cqe = &sdrain.cqe, }, + .opcode = IB_WR_RDMA_WRITE, + }, + }; + int ret; + struct mlx4_ib_dev *dev = to_mdev(qp->device); + struct mlx4_dev *mdev = dev->dev; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret && mdev->persist->state != MLX4_DEVICE_STATE_INTERNAL_ERROR) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + sdrain.cqe.done = mlx4_ib_drain_qp_done; + init_completion(&sdrain.done); + + ret = _mlx4_ib_post_send(qp, &swr.wr, &bad_swr, true); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + handle_drain_completion(cq, &sdrain, dev); +} + +void mlx4_ib_drain_rq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->recv_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct mlx4_ib_drain_cqe rdrain; + struct ib_recv_wr rwr = {}; + const struct ib_recv_wr *bad_rwr; + int ret; + struct mlx4_ib_dev *dev = to_mdev(qp->device); + struct mlx4_dev *mdev = dev->dev; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret && mdev->persist->state != MLX4_DEVICE_STATE_INTERNAL_ERROR) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = mlx4_ib_drain_qp_done; + init_completion(&rdrain.done); + + ret = _mlx4_ib_post_recv(qp, &rwr, &bad_rwr, true); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + handle_drain_completion(cq, &rdrain, dev); +} diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index ebee56cbc0e2..3731b31c3653 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -307,8 +307,8 @@ void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index) spin_unlock(&srq->lock); } -int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mlx4_ib_srq *srq = to_msrq(ibsrq); struct mlx4_wqe_srq_next_seg *next; diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index d42b922bede8..b8e4b15e2674 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -3,3 +3,5 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o cong.o mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o +mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o +mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += flow.o diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index e6bde32a83f3..ffd03bf1a71e 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -37,7 +37,6 @@ static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, struct rdma_ah_attr *ah_attr) { enum ib_gid_type gid_type; - int err; if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); @@ -53,18 +52,12 @@ static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, ah->av.stat_rate_sl = 
(rdma_ah_get_static_rate(ah_attr) << 4); if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { - err = mlx5_get_roce_gid_type(dev, ah_attr->port_num, - ah_attr->grh.sgid_index, - &gid_type); - if (err) - return ERR_PTR(err); + gid_type = ah_attr->grh.sgid_attr->gid_type; memcpy(ah->av.rmac, ah_attr->roce.dmac, sizeof(ah_attr->roce.dmac)); ah->av.udp_sport = - mlx5_get_roce_udp_sport(dev, - rdma_ah_get_port_num(ah_attr), - rdma_ah_read_grh(ah_attr)->sgid_index); + mlx5_get_roce_udp_sport(dev, ah_attr->grh.sgid_attr); ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0x7) << 1; if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) #define MLX5_ECN_ENABLED BIT(1) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index ccc0b5d06a7d..c84fef9a8a08 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -185,3 +185,15 @@ int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length) return err; } + +int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out) +{ + u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {}; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + + MLX5_SET(ppcnt_reg, in, local_port, 1); + + MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP); + return mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPCNT, + 0, 0); +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 98ea4648c655..88cbb1c41703 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -41,6 +41,7 @@ int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey); int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, void *out, int out_size); +int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out); int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev, void *in, int in_size); int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c index 985fa2637390..7e4e358a4fd8 100644 --- a/drivers/infiniband/hw/mlx5/cong.c +++ b/drivers/infiniband/hw/mlx5/cong.c @@ -359,9 +359,6 @@ static ssize_t get_param(struct file *filp, char __user *buf, size_t count, int ret; char lbuf[11]; - if (*pos) - return 0; - ret = mlx5_ib_get_cc_params(param->dev, param->port_num, offset, &var); if (ret) return ret; @@ -370,11 +367,7 @@ static ssize_t get_param(struct file *filp, char __user *buf, size_t count, if (ret < 0) return ret; - if (copy_to_user(buf, lbuf, ret)) - return -EFAULT; - - *pos += ret; - return ret; + return simple_read_from_buffer(buf, count, pos, lbuf, ret); } static const struct file_operations dbg_cc_fops = { diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index ad39d64b8108..088205d7f1a1 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -1184,7 +1184,7 @@ int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) int err; if (!MLX5_CAP_GEN(dev->mdev, cq_moderation)) - return -ENOSYS; + return -EOPNOTSUPP; if (cq_period > MLX5_MAX_CQ_PERIOD) return -EINVAL; diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c new file mode 100644 index 000000000000..ac116d63e466 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -0,0 +1,1119 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_verbs.h> +#include <rdma/uverbs_types.h> +#include <rdma/uverbs_ioctl.h> +#include <rdma/mlx5_user_ioctl_cmds.h> +#include <rdma/ib_umem.h> +#include <linux/mlx5/driver.h> +#include <linux/mlx5/fs.h> +#include "mlx5_ib.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include <rdma/uverbs_named_ioctl.h> + +#define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) +struct devx_obj { + struct mlx5_core_dev *mdev; + u32 obj_id; + u32 dinlen; /* destroy inbox length */ + u32 dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW]; +}; + +struct devx_umem { + struct mlx5_core_dev *mdev; + struct ib_umem *umem; + u32 page_offset; + int page_shift; + int ncont; + u32 dinlen; + u32 dinbox[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)]; +}; + +struct devx_umem_reg_cmd { + void *in; + u32 inlen; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; +}; + +static struct mlx5_ib_ucontext *devx_ufile2uctx(struct ib_uverbs_file *file) +{ + return to_mucontext(ib_uverbs_get_ucontext(file)); +} + +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) +{ + u32 in[MLX5_ST_SZ_DW(create_uctx_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0}; + u64 general_obj_types; + void *hdr; + int err; + + hdr = MLX5_ADDR_OF(create_uctx_in, in, hdr); + + general_obj_types = MLX5_CAP_GEN_64(dev->mdev, general_obj_types); + if (!(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_UCTX) || + !(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_UMEM)) + return -EINVAL; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_UCTX); + + err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + context->devx_uid = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + return 0; +} + +void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context) +{ + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {0}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0}; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_UCTX); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, context->devx_uid); + + mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); +} + +bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type) +{ + struct devx_obj *devx_obj = obj; + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode); + + switch (opcode) { + case MLX5_CMD_OP_DESTROY_TIR: + *dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; + *dest_id = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, + obj_id); + return true; + + case MLX5_CMD_OP_DESTROY_FLOW_TABLE: + *dest_type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + *dest_id = MLX5_GET(destroy_flow_table_in, devx_obj->dinbox, + table_id); + return true; + default: + return false; + } +} + +static int devx_is_valid_obj_id(struct devx_obj *obj, const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + u32 obj_id; + + switch (opcode) { + case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT: + case MLX5_CMD_OP_QUERY_GENERAL_OBJECT: + obj_id = MLX5_GET(general_obj_in_cmd_hdr, in, obj_id); + break; + case MLX5_CMD_OP_QUERY_MKEY: + obj_id = MLX5_GET(query_mkey_in, in, mkey_index); + break; + case MLX5_CMD_OP_QUERY_CQ: + obj_id = MLX5_GET(query_cq_in, in, cqn); + break; + case MLX5_CMD_OP_MODIFY_CQ: + obj_id = 
MLX5_GET(modify_cq_in, in, cqn); + break; + case MLX5_CMD_OP_QUERY_SQ: + obj_id = MLX5_GET(query_sq_in, in, sqn); + break; + case MLX5_CMD_OP_MODIFY_SQ: + obj_id = MLX5_GET(modify_sq_in, in, sqn); + break; + case MLX5_CMD_OP_QUERY_RQ: + obj_id = MLX5_GET(query_rq_in, in, rqn); + break; + case MLX5_CMD_OP_MODIFY_RQ: + obj_id = MLX5_GET(modify_rq_in, in, rqn); + break; + case MLX5_CMD_OP_QUERY_RMP: + obj_id = MLX5_GET(query_rmp_in, in, rmpn); + break; + case MLX5_CMD_OP_MODIFY_RMP: + obj_id = MLX5_GET(modify_rmp_in, in, rmpn); + break; + case MLX5_CMD_OP_QUERY_RQT: + obj_id = MLX5_GET(query_rqt_in, in, rqtn); + break; + case MLX5_CMD_OP_MODIFY_RQT: + obj_id = MLX5_GET(modify_rqt_in, in, rqtn); + break; + case MLX5_CMD_OP_QUERY_TIR: + obj_id = MLX5_GET(query_tir_in, in, tirn); + break; + case MLX5_CMD_OP_MODIFY_TIR: + obj_id = MLX5_GET(modify_tir_in, in, tirn); + break; + case MLX5_CMD_OP_QUERY_TIS: + obj_id = MLX5_GET(query_tis_in, in, tisn); + break; + case MLX5_CMD_OP_MODIFY_TIS: + obj_id = MLX5_GET(modify_tis_in, in, tisn); + break; + case MLX5_CMD_OP_QUERY_FLOW_TABLE: + obj_id = MLX5_GET(query_flow_table_in, in, table_id); + break; + case MLX5_CMD_OP_MODIFY_FLOW_TABLE: + obj_id = MLX5_GET(modify_flow_table_in, in, table_id); + break; + case MLX5_CMD_OP_QUERY_FLOW_GROUP: + obj_id = MLX5_GET(query_flow_group_in, in, group_id); + break; + case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY: + obj_id = MLX5_GET(query_fte_in, in, flow_index); + break; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + obj_id = MLX5_GET(set_fte_in, in, flow_index); + break; + case MLX5_CMD_OP_QUERY_Q_COUNTER: + obj_id = MLX5_GET(query_q_counter_in, in, counter_set_id); + break; + case MLX5_CMD_OP_QUERY_FLOW_COUNTER: + obj_id = MLX5_GET(query_flow_counter_in, in, flow_counter_id); + break; + case MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT: + obj_id = MLX5_GET(general_obj_in_cmd_hdr, in, obj_id); + break; + case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT: + obj_id = MLX5_GET(query_scheduling_element_in, in, + scheduling_element_id); + break; + case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT: + obj_id = MLX5_GET(modify_scheduling_element_in, in, + scheduling_element_id); + break; + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + obj_id = MLX5_GET(add_vxlan_udp_dport_in, in, vxlan_udp_port); + break; + case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY: + obj_id = MLX5_GET(query_l2_table_entry_in, in, table_index); + break; + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + obj_id = MLX5_GET(set_l2_table_entry_in, in, table_index); + break; + case MLX5_CMD_OP_QUERY_QP: + obj_id = MLX5_GET(query_qp_in, in, qpn); + break; + case MLX5_CMD_OP_RST2INIT_QP: + obj_id = MLX5_GET(rst2init_qp_in, in, qpn); + break; + case MLX5_CMD_OP_INIT2RTR_QP: + obj_id = MLX5_GET(init2rtr_qp_in, in, qpn); + break; + case MLX5_CMD_OP_RTR2RTS_QP: + obj_id = MLX5_GET(rtr2rts_qp_in, in, qpn); + break; + case MLX5_CMD_OP_RTS2RTS_QP: + obj_id = MLX5_GET(rts2rts_qp_in, in, qpn); + break; + case MLX5_CMD_OP_SQERR2RTS_QP: + obj_id = MLX5_GET(sqerr2rts_qp_in, in, qpn); + break; + case MLX5_CMD_OP_2ERR_QP: + obj_id = MLX5_GET(qp_2err_in, in, qpn); + break; + case MLX5_CMD_OP_2RST_QP: + obj_id = MLX5_GET(qp_2rst_in, in, qpn); + break; + case MLX5_CMD_OP_QUERY_DCT: + obj_id = MLX5_GET(query_dct_in, in, dctn); + break; + case MLX5_CMD_OP_QUERY_XRQ: + obj_id = MLX5_GET(query_xrq_in, in, xrqn); + break; + case MLX5_CMD_OP_QUERY_XRC_SRQ: + obj_id = MLX5_GET(query_xrc_srq_in, in, xrc_srqn); + break; + case MLX5_CMD_OP_ARM_XRC_SRQ: + obj_id = MLX5_GET(arm_xrc_srq_in, in, xrc_srqn); + break; + case MLX5_CMD_OP_QUERY_SRQ: 
+ obj_id = MLX5_GET(query_srq_in, in, srqn); + break; + case MLX5_CMD_OP_ARM_RQ: + obj_id = MLX5_GET(arm_rq_in, in, srq_number); + break; + case MLX5_CMD_OP_DRAIN_DCT: + case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: + obj_id = MLX5_GET(drain_dct_in, in, dctn); + break; + case MLX5_CMD_OP_ARM_XRQ: + obj_id = MLX5_GET(arm_xrq_in, in, xrqn); + break; + default: + return false; + } + + if (obj_id == obj->obj_id) + return true; + + return false; +} + +static bool devx_is_obj_create_cmd(const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + case MLX5_CMD_OP_CREATE_MKEY: + case MLX5_CMD_OP_CREATE_CQ: + case MLX5_CMD_OP_ALLOC_PD: + case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN: + case MLX5_CMD_OP_CREATE_RMP: + case MLX5_CMD_OP_CREATE_SQ: + case MLX5_CMD_OP_CREATE_RQ: + case MLX5_CMD_OP_CREATE_RQT: + case MLX5_CMD_OP_CREATE_TIR: + case MLX5_CMD_OP_CREATE_TIS: + case MLX5_CMD_OP_ALLOC_Q_COUNTER: + case MLX5_CMD_OP_CREATE_FLOW_TABLE: + case MLX5_CMD_OP_CREATE_FLOW_GROUP: + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + case MLX5_CMD_OP_ALLOC_ENCAP_HEADER: + case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT: + case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + case MLX5_CMD_OP_CREATE_QP: + case MLX5_CMD_OP_CREATE_SRQ: + case MLX5_CMD_OP_CREATE_XRC_SRQ: + case MLX5_CMD_OP_CREATE_DCT: + case MLX5_CMD_OP_CREATE_XRQ: + case MLX5_CMD_OP_ATTACH_TO_MCG: + case MLX5_CMD_OP_ALLOC_XRCD: + return true; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + { + u16 op_mod = MLX5_GET(set_fte_in, in, op_mod); + if (op_mod == 0) + return true; + return false; + } + default: + return false; + } +} + +static bool devx_is_obj_modify_cmd(const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT: + case MLX5_CMD_OP_MODIFY_CQ: + case MLX5_CMD_OP_MODIFY_RMP: + case MLX5_CMD_OP_MODIFY_SQ: + case MLX5_CMD_OP_MODIFY_RQ: + case MLX5_CMD_OP_MODIFY_RQT: + case MLX5_CMD_OP_MODIFY_TIR: + case MLX5_CMD_OP_MODIFY_TIS: + case MLX5_CMD_OP_MODIFY_FLOW_TABLE: + case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + case MLX5_CMD_OP_RST2INIT_QP: + case MLX5_CMD_OP_INIT2RTR_QP: + case MLX5_CMD_OP_RTR2RTS_QP: + case MLX5_CMD_OP_RTS2RTS_QP: + case MLX5_CMD_OP_SQERR2RTS_QP: + case MLX5_CMD_OP_2ERR_QP: + case MLX5_CMD_OP_2RST_QP: + case MLX5_CMD_OP_ARM_XRC_SRQ: + case MLX5_CMD_OP_ARM_RQ: + case MLX5_CMD_OP_DRAIN_DCT: + case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: + case MLX5_CMD_OP_ARM_XRQ: + return true; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + { + u16 op_mod = MLX5_GET(set_fte_in, in, op_mod); + + if (op_mod == 1) + return true; + return false; + } + default: + return false; + } +} + +static bool devx_is_obj_query_cmd(const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_QUERY_GENERAL_OBJECT: + case MLX5_CMD_OP_QUERY_MKEY: + case MLX5_CMD_OP_QUERY_CQ: + case MLX5_CMD_OP_QUERY_RMP: + case MLX5_CMD_OP_QUERY_SQ: + case MLX5_CMD_OP_QUERY_RQ: + case MLX5_CMD_OP_QUERY_RQT: + case MLX5_CMD_OP_QUERY_TIR: + case MLX5_CMD_OP_QUERY_TIS: + case MLX5_CMD_OP_QUERY_Q_COUNTER: + case MLX5_CMD_OP_QUERY_FLOW_TABLE: + case MLX5_CMD_OP_QUERY_FLOW_GROUP: + case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY: + case MLX5_CMD_OP_QUERY_FLOW_COUNTER: + case MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT: + case 
MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT: + case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY: + case MLX5_CMD_OP_QUERY_QP: + case MLX5_CMD_OP_QUERY_SRQ: + case MLX5_CMD_OP_QUERY_XRC_SRQ: + case MLX5_CMD_OP_QUERY_DCT: + case MLX5_CMD_OP_QUERY_XRQ: + return true; + default: + return false; + } +} + +static bool devx_is_general_cmd(void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_QUERY_HCA_CAP: + case MLX5_CMD_OP_QUERY_VPORT_STATE: + case MLX5_CMD_OP_QUERY_ADAPTER: + case MLX5_CMD_OP_QUERY_ISSI: + case MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT: + case MLX5_CMD_OP_QUERY_ROCE_ADDRESS: + case MLX5_CMD_OP_QUERY_VNIC_ENV: + case MLX5_CMD_OP_QUERY_VPORT_COUNTER: + case MLX5_CMD_OP_GET_DROPPED_PACKET_LOG: + case MLX5_CMD_OP_NOP: + case MLX5_CMD_OP_QUERY_CONG_STATUS: + case MLX5_CMD_OP_QUERY_CONG_PARAMS: + case MLX5_CMD_OP_QUERY_CONG_STATISTICS: + return true; + default: + return false; + } +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + int user_vector; + int dev_eqn; + unsigned int irqn; + int err; + + if (uverbs_copy_from(&user_vector, attrs, + MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC)) + return -EFAULT; + + c = devx_ufile2uctx(file); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + + err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn, &irqn); + if (err < 0) + return err; + + if (uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, + &dev_eqn, sizeof(dev_eqn))) + return -EFAULT; + + return 0; +} + +/* + *Security note: + * The hardware protection mechanism works like this: Each device object that + * is subject to UAR doorbells (QP/SQ/CQ) gets a UAR ID (called uar_page in + * the device specification manual) upon its creation. Then upon doorbell, + * hardware fetches the object context for which the doorbell was rang, and + * validates that the UAR through which the DB was rang matches the UAR ID + * of the object. + * If no match the doorbell is silently ignored by the hardware. Of course, + * the user cannot ring a doorbell on a UAR that was not mapped to it. + * Now in devx, as the devx kernel does not manipulate the QP/SQ/CQ command + * mailboxes (except tagging them with UID), we expose to the user its UAR + * ID, so it can embed it in these objects in the expected specification + * format. So the only thing the user can do is hurt itself by creating a + * QP/SQ/CQ with a UAR ID other than his, and then in this case other users + * may ring a doorbell on its objects. + * The consequence of that will be that another user can schedule a QP/SQ + * of the buggy user for execution (just insert it to the hardware schedule + * queue or arm its CQ for event generation), no further harm is expected. 
+ */ +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_UAR)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + u32 user_idx; + s32 dev_idx; + + c = devx_ufile2uctx(file); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + + if (uverbs_copy_from(&user_idx, attrs, + MLX5_IB_ATTR_DEVX_QUERY_UAR_USER_IDX)) + return -EFAULT; + + dev_idx = bfregn_to_uar_index(dev, &c->bfregi, user_idx, true); + if (dev_idx < 0) + return dev_idx; + + if (uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_QUERY_UAR_DEV_IDX, + &dev_idx, sizeof(dev_idx))) + return -EFAULT; + + return 0; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + void *cmd_in = uverbs_attr_get_alloced_ptr( + attrs, MLX5_IB_ATTR_DEVX_OTHER_CMD_IN); + int cmd_out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT); + void *cmd_out; + int err; + + c = devx_ufile2uctx(file); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + + if (!c->devx_uid) + return -EPERM; + + /* Only white list of some general HCA commands are allowed for this method. */ + if (!devx_is_general_cmd(cmd_in)) + return -EINVAL; + + cmd_out = uverbs_zalloc(attrs, cmd_out_len); + if (IS_ERR(cmd_out)) + return PTR_ERR(cmd_out); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + err = mlx5_cmd_exec(dev->mdev, cmd_in, + uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OTHER_CMD_IN), + cmd_out, cmd_out_len); + if (err) + return err; + + return uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT, cmd_out, + cmd_out_len); +} + +static void devx_obj_build_destroy_cmd(void *in, void *out, void *din, + u32 *dinlen, + u32 *obj_id) +{ + u16 obj_type = MLX5_GET(general_obj_in_cmd_hdr, in, obj_type); + u16 uid = MLX5_GET(general_obj_in_cmd_hdr, in, uid); + + *obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + *dinlen = MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr); + + MLX5_SET(general_obj_in_cmd_hdr, din, obj_id, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, uid, uid); + + switch (MLX5_GET(general_obj_in_cmd_hdr, in, opcode)) { + case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, din, obj_type, obj_type); + break; + + case MLX5_CMD_OP_CREATE_MKEY: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_MKEY); + break; + case MLX5_CMD_OP_CREATE_CQ: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_CQ); + break; + case MLX5_CMD_OP_ALLOC_PD: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DEALLOC_PD); + break; + case MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); + break; + case MLX5_CMD_OP_CREATE_RMP: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_RMP); + break; + case MLX5_CMD_OP_CREATE_SQ: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_SQ); + break; + case MLX5_CMD_OP_CREATE_RQ: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_RQ); + break; + case MLX5_CMD_OP_CREATE_RQT: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_RQT); + break; + case MLX5_CMD_OP_CREATE_TIR: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_TIR); + break; + case MLX5_CMD_OP_CREATE_TIS: + 
MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_TIS); + break; + case MLX5_CMD_OP_ALLOC_Q_COUNTER: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DEALLOC_Q_COUNTER); + break; + case MLX5_CMD_OP_CREATE_FLOW_TABLE: + *dinlen = MLX5_ST_SZ_BYTES(destroy_flow_table_in); + *obj_id = MLX5_GET(create_flow_table_out, out, table_id); + MLX5_SET(destroy_flow_table_in, din, other_vport, + MLX5_GET(create_flow_table_in, in, other_vport)); + MLX5_SET(destroy_flow_table_in, din, vport_number, + MLX5_GET(create_flow_table_in, in, vport_number)); + MLX5_SET(destroy_flow_table_in, din, table_type, + MLX5_GET(create_flow_table_in, in, table_type)); + MLX5_SET(destroy_flow_table_in, din, table_id, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DESTROY_FLOW_TABLE); + break; + case MLX5_CMD_OP_CREATE_FLOW_GROUP: + *dinlen = MLX5_ST_SZ_BYTES(destroy_flow_group_in); + *obj_id = MLX5_GET(create_flow_group_out, out, group_id); + MLX5_SET(destroy_flow_group_in, din, other_vport, + MLX5_GET(create_flow_group_in, in, other_vport)); + MLX5_SET(destroy_flow_group_in, din, vport_number, + MLX5_GET(create_flow_group_in, in, vport_number)); + MLX5_SET(destroy_flow_group_in, din, table_type, + MLX5_GET(create_flow_group_in, in, table_type)); + MLX5_SET(destroy_flow_group_in, din, table_id, + MLX5_GET(create_flow_group_in, in, table_id)); + MLX5_SET(destroy_flow_group_in, din, group_id, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DESTROY_FLOW_GROUP); + break; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + *dinlen = MLX5_ST_SZ_BYTES(delete_fte_in); + *obj_id = MLX5_GET(set_fte_in, in, flow_index); + MLX5_SET(delete_fte_in, din, other_vport, + MLX5_GET(set_fte_in, in, other_vport)); + MLX5_SET(delete_fte_in, din, vport_number, + MLX5_GET(set_fte_in, in, vport_number)); + MLX5_SET(delete_fte_in, din, table_type, + MLX5_GET(set_fte_in, in, table_type)); + MLX5_SET(delete_fte_in, din, table_id, + MLX5_GET(set_fte_in, in, table_id)); + MLX5_SET(delete_fte_in, din, flow_index, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DELETE_FLOW_TABLE_ENTRY); + break; + case MLX5_CMD_OP_ALLOC_FLOW_COUNTER: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DEALLOC_FLOW_COUNTER); + break; + case MLX5_CMD_OP_ALLOC_ENCAP_HEADER: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DEALLOC_ENCAP_HEADER); + break; + case MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT); + break; + case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT: + *dinlen = MLX5_ST_SZ_BYTES(destroy_scheduling_element_in); + *obj_id = MLX5_GET(create_scheduling_element_out, out, + scheduling_element_id); + MLX5_SET(destroy_scheduling_element_in, din, + scheduling_hierarchy, + MLX5_GET(create_scheduling_element_in, in, + scheduling_hierarchy)); + MLX5_SET(destroy_scheduling_element_in, din, + scheduling_element_id, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT); + break; + case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: + *dinlen = MLX5_ST_SZ_BYTES(delete_vxlan_udp_dport_in); + *obj_id = MLX5_GET(add_vxlan_udp_dport_in, in, vxlan_udp_port); + MLX5_SET(delete_vxlan_udp_dport_in, din, vxlan_udp_port, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DELETE_VXLAN_UDP_DPORT); + break; + case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: + *dinlen = MLX5_ST_SZ_BYTES(delete_l2_table_entry_in); + *obj_id = 
MLX5_GET(set_l2_table_entry_in, in, table_index); + MLX5_SET(delete_l2_table_entry_in, din, table_index, *obj_id); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DELETE_L2_TABLE_ENTRY); + break; + case MLX5_CMD_OP_CREATE_QP: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_QP); + break; + case MLX5_CMD_OP_CREATE_SRQ: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_SRQ); + break; + case MLX5_CMD_OP_CREATE_XRC_SRQ: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, + MLX5_CMD_OP_DESTROY_XRC_SRQ); + break; + case MLX5_CMD_OP_CREATE_DCT: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_DCT); + break; + case MLX5_CMD_OP_CREATE_XRQ: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DESTROY_XRQ); + break; + case MLX5_CMD_OP_ATTACH_TO_MCG: + *dinlen = MLX5_ST_SZ_BYTES(detach_from_mcg_in); + MLX5_SET(detach_from_mcg_in, din, qpn, + MLX5_GET(attach_to_mcg_in, in, qpn)); + memcpy(MLX5_ADDR_OF(detach_from_mcg_in, din, multicast_gid), + MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid), + MLX5_FLD_SZ_BYTES(attach_to_mcg_in, multicast_gid)); + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DETACH_FROM_MCG); + break; + case MLX5_CMD_OP_ALLOC_XRCD: + MLX5_SET(general_obj_in_cmd_hdr, din, opcode, MLX5_CMD_OP_DEALLOC_XRCD); + break; + default: + /* The entry must match to one of the devx_is_obj_create_cmd */ + WARN_ON(true); + break; + } +} + +static int devx_obj_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + struct devx_obj *obj = uobject->object; + int ret; + + ret = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); + if (ib_is_destroy_retryable(ret, why, uobject)) + return ret; + + kfree(obj); + return ret; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN); + int cmd_out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT); + void *cmd_out; + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE); + struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + struct devx_obj *obj; + int err; + + if (!c->devx_uid) + return -EPERM; + + if (!devx_is_obj_create_cmd(cmd_in)) + return -EINVAL; + + cmd_out = uverbs_zalloc(attrs, cmd_out_len); + if (IS_ERR(cmd_out)) + return PTR_ERR(cmd_out); + + obj = kzalloc(sizeof(struct devx_obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + err = mlx5_cmd_exec(dev->mdev, cmd_in, + uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN), + cmd_out, cmd_out_len); + if (err) + goto obj_free; + + uobj->object = obj; + obj->mdev = dev->mdev; + devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen, &obj->obj_id); + WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32)); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len); + if (err) + goto obj_free; + + return 0; + +obj_free: + kfree(obj); + return err; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN); + int cmd_out_len = uverbs_attr_get_len(attrs, + 
MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT); + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE); + struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct devx_obj *obj = uobj->object; + void *cmd_out; + int err; + + if (!c->devx_uid) + return -EPERM; + + if (!devx_is_obj_modify_cmd(cmd_in)) + return -EINVAL; + + if (!devx_is_valid_obj_id(obj, cmd_in)) + return -EINVAL; + + cmd_out = uverbs_zalloc(attrs, cmd_out_len); + if (IS_ERR(cmd_out)) + return PTR_ERR(cmd_out); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + err = mlx5_cmd_exec(obj->mdev, cmd_in, + uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN), + cmd_out, cmd_out_len); + if (err) + return err; + + return uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, + cmd_out, cmd_out_len); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + void *cmd_in = uverbs_attr_get_alloced_ptr(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN); + int cmd_out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT); + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE); + struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct devx_obj *obj = uobj->object; + void *cmd_out; + int err; + + if (!c->devx_uid) + return -EPERM; + + if (!devx_is_obj_query_cmd(cmd_in)) + return -EINVAL; + + if (!devx_is_valid_obj_id(obj, cmd_in)) + return -EINVAL; + + cmd_out = uverbs_zalloc(attrs, cmd_out_len); + if (IS_ERR(cmd_out)) + return PTR_ERR(cmd_out); + + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + err = mlx5_cmd_exec(obj->mdev, cmd_in, + uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN), + cmd_out, cmd_out_len); + if (err) + return err; + + return uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, + cmd_out, cmd_out_len); +} + +static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext, + struct uverbs_attr_bundle *attrs, + struct devx_umem *obj) +{ + u64 addr; + size_t size; + u32 access; + int npages; + int err; + u32 page_mask; + + if (uverbs_copy_from(&addr, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR) || + uverbs_copy_from(&size, attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_LEN)) + return -EFAULT; + + err = uverbs_get_flags32(&access, attrs, + MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, + IB_ACCESS_SUPPORTED); + if (err) + return err; + + err = ib_check_mr_access(access); + if (err) + return err; + + obj->umem = ib_umem_get(ucontext, addr, size, access, 0); + if (IS_ERR(obj->umem)) + return PTR_ERR(obj->umem); + + mlx5_ib_cont_pages(obj->umem, obj->umem->address, + MLX5_MKEY_PAGE_SHIFT_MASK, &npages, + &obj->page_shift, &obj->ncont, NULL); + + if (!npages) { + ib_umem_release(obj->umem); + return -EINVAL; + } + + page_mask = (1 << obj->page_shift) - 1; + obj->page_offset = obj->umem->address & page_mask; + + return 0; +} + +static int devx_umem_reg_cmd_alloc(struct uverbs_attr_bundle *attrs, + struct devx_umem *obj, + struct devx_umem_reg_cmd *cmd) +{ + cmd->inlen = MLX5_ST_SZ_BYTES(create_umem_in) + + (MLX5_ST_SZ_BYTES(mtt) * obj->ncont); + cmd->in = uverbs_zalloc(attrs, cmd->inlen); + return PTR_ERR_OR_ZERO(cmd->in); +} + +static void devx_umem_reg_cmd_build(struct mlx5_ib_dev *dev, + struct devx_umem *obj, + struct devx_umem_reg_cmd *cmd) +{ + void *umem; + __be64 *mtt; + + umem = MLX5_ADDR_OF(create_umem_in, cmd->in, umem); + mtt = (__be64 *)MLX5_ADDR_OF(umem, umem, mtt); + + 
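An aside on the pattern used throughout the DEVX object code above: devx_obj_build_destroy_cmd() derives a canned destroy command from the create command and the firmware reply at create time, and devx_obj_cleanup() (and the UMEM path below) later only replays that box. The following is a minimal user-space sketch of the idiom, not the driver code; the struct layouts, the opcode pairing, and the obj_id value are invented stand-ins for the real MLX5_SET()/MLX5_GET() firmware layouts.

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Invented, simplified "command boxes"; the driver packs these with
	 * MLX5_SET()/MLX5_GET() into firmware-defined layouts. */
	struct create_in  { uint16_t opcode; };
	struct create_out { uint32_t obj_id; };
	struct destroy_in { uint16_t opcode; uint32_t obj_id; };

	struct devx_obj_model {
		struct destroy_in dinbox; /* canned destroy command, built at create time */
	};

	/* Models devx_obj_build_destroy_cmd(): derive the destroy box from the
	 * create command plus the firmware reply, so cleanup never has to parse
	 * or allocate anything. */
	static void build_destroy_cmd(const struct create_in *in,
				      const struct create_out *out,
				      struct destroy_in *din)
	{
		din->opcode = in->opcode + 1; /* assumed create/destroy opcode pairing */
		din->obj_id = out->obj_id;
	}

	static struct devx_obj_model *obj_create(const struct create_in *in)
	{
		struct create_out out = { .obj_id = 42 }; /* stand-in firmware reply */
		struct devx_obj_model *obj = calloc(1, sizeof(*obj));

		if (!obj)
			return NULL;
		build_destroy_cmd(in, &out, &obj->dinbox);
		return obj;
	}

	static void obj_cleanup(struct devx_obj_model *obj)
	{
		/* Only replays the canned command; compare devx_obj_cleanup(). */
		printf("destroy: opcode=0x%x obj_id=%u\n",
		       obj->dinbox.opcode, obj->dinbox.obj_id);
		free(obj);
	}

	int main(void)
	{
		struct create_in in = { .opcode = 0xa00 };
		struct devx_obj_model *obj = obj_create(&in);

		if (obj)
			obj_cleanup(obj);
		return 0;
	}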
MLX5_SET(general_obj_in_cmd_hdr, cmd->in, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd->in, obj_type, MLX5_OBJ_TYPE_UMEM); + MLX5_SET64(umem, umem, num_of_mtt, obj->ncont); + MLX5_SET(umem, umem, log_page_size, obj->page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(umem, umem, page_offset, obj->page_offset); + mlx5_ib_populate_pas(dev, obj->umem, obj->page_shift, mtt, + (obj->umem->writable ? MLX5_IB_MTT_WRITE : 0) | + MLX5_IB_MTT_READ); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + struct devx_umem_reg_cmd cmd; + struct devx_umem *obj; + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE); + u32 obj_id; + struct mlx5_ib_ucontext *c = to_mucontext(uobj->context); + struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); + int err; + + if (!c->devx_uid) + return -EPERM; + + obj = kzalloc(sizeof(struct devx_umem), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + err = devx_umem_get(dev, &c->ibucontext, attrs, obj); + if (err) + goto err_obj_free; + + err = devx_umem_reg_cmd_alloc(attrs, obj, &cmd); + if (err) + goto err_umem_release; + + devx_umem_reg_cmd_build(dev, obj, &cmd); + + MLX5_SET(general_obj_in_cmd_hdr, cmd.in, uid, c->devx_uid); + err = mlx5_cmd_exec(dev->mdev, cmd.in, cmd.inlen, cmd.out, + sizeof(cmd.out)); + if (err) + goto err_umem_release; + + obj->mdev = dev->mdev; + uobj->object = obj; + devx_obj_build_destroy_cmd(cmd.in, cmd.out, obj->dinbox, &obj->dinlen, &obj_id); + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, &obj_id, sizeof(obj_id)); + if (err) + goto err_umem_destroy; + + return 0; + +err_umem_destroy: + mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, cmd.out, sizeof(cmd.out)); +err_umem_release: + ib_umem_release(obj->umem); +err_obj_free: + kfree(obj); + return err; +} + +static int devx_umem_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct devx_umem *obj = uobject->object; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + int err; + + err = mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, out, sizeof(out)); + if (ib_is_destroy_retryable(err, why, uobject)) + return err; + + ib_umem_release(obj->umem); + kfree(obj); + return 0; +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_UMEM_REG, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE, + MLX5_IB_OBJECT_DEVX_UMEM, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_LEN, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, + enum ib_access_flags), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_DEVX_UMEM_DEREG, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_UMEM_DEREG_HANDLE, + MLX5_IB_OBJECT_DEVX_UMEM, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_QUERY_EQN, + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_QUERY_EQN_USER_VEC, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_QUERY_EQN_DEV_EQN, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_QUERY_UAR, + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_DEVX_QUERY_UAR_USER_IDX, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + 
UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_DEVX_QUERY_UAR_DEV_IDX, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OTHER, + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OTHER_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OBJ_CREATE, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_CREATE_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_DEVX_OBJ_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_DESTROY_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OBJ_MODIFY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_MODIFY_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_WRITE, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_OUT, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_DEVX_OBJ_QUERY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_DEVX_OBJ_QUERY_HANDLE, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_in_cmd_hdr)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_OUT, + UVERBS_ATTR_MIN_SIZE(MLX5_ST_SZ_BYTES(general_obj_out_cmd_hdr)), + UA_MANDATORY)); + +DECLARE_UVERBS_GLOBAL_METHODS(MLX5_IB_OBJECT_DEVX, + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OTHER), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_UAR), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_QUERY_EQN)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_TYPE_ALLOC_IDR(devx_obj_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_CREATE), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_DESTROY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_MODIFY), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_OBJ_QUERY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM, + UVERBS_TYPE_ALLOC_IDR(devx_umem_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_REG), + &UVERBS_METHOD(MLX5_IB_METHOD_DEVX_UMEM_DEREG)); + +DECLARE_UVERBS_OBJECT_TREE(devx_objects, + &UVERBS_OBJECT(MLX5_IB_OBJECT_DEVX), + &UVERBS_OBJECT(MLX5_IB_OBJECT_DEVX_OBJ), + &UVERBS_OBJECT(MLX5_IB_OBJECT_DEVX_UMEM)); + +const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void) +{ + return &devx_objects; +} diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c new file mode 100644 index 000000000000..1a29f47f836e --- /dev/null +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2018, Mellanox Technologies inc. All rights reserved. 
+ */ + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_verbs.h> +#include <rdma/uverbs_types.h> +#include <rdma/uverbs_ioctl.h> +#include <rdma/mlx5_user_ioctl_cmds.h> +#include <rdma/ib_umem.h> +#include <linux/mlx5/driver.h> +#include <linux/mlx5/fs.h> +#include "mlx5_ib.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include <rdma/uverbs_named_ioctl.h> + +static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { + [MLX5_IB_FLOW_TYPE_NORMAL] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + .u.ptr = { + .len = sizeof(u16), /* data is priority */ + .min_len = sizeof(u16), + } + }, + [MLX5_IB_FLOW_TYPE_SNIFFER] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, + [MLX5_IB_FLOW_TYPE_ALL_DEFAULT] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, + [MLX5_IB_FLOW_TYPE_MC_DEFAULT] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, +}; + +static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_flow_handler *flow_handler; + struct mlx5_ib_flow_matcher *fs_matcher; + void *devx_obj; + int dest_id, dest_type; + void *cmd_in; + int inlen; + bool dest_devx, dest_qp; + struct ib_qp *qp = NULL; + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); + struct mlx5_ib_dev *dev = to_mdev(uobj->context->device); + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + dest_devx = + uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX); + dest_qp = uverbs_attr_is_valid(attrs, + MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); + + if ((dest_devx && dest_qp) || (!dest_devx && !dest_qp)) + return -EINVAL; + + if (dest_devx) { + devx_obj = uverbs_attr_get_obj( + attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX); + if (IS_ERR(devx_obj)) + return PTR_ERR(devx_obj); + + /* Verify that the given DEVX object is a flow + * steering destination. 
+ */ + if (!mlx5_ib_devx_is_flow_dest(devx_obj, &dest_id, &dest_type)) + return -EINVAL; + } else { + struct mlx5_ib_qp *mqp; + + qp = uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); + if (IS_ERR(qp)) + return PTR_ERR(qp); + + if (qp->qp_type != IB_QPT_RAW_PACKET) + return -EINVAL; + + mqp = to_mqp(qp); + if (mqp->flags & MLX5_IB_QP_RSS) + dest_id = mqp->rss_qp.tirn; + else + dest_id = mqp->raw_packet_qp.rq.tirn; + dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; + } + + if (dev->rep) + return -ENOTSUPP; + + cmd_in = uverbs_attr_get_alloced_ptr( + attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); + inlen = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); + fs_matcher = uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_MATCHER); + flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, cmd_in, inlen, + dest_id, dest_type); + if (IS_ERR(flow_handler)) + return PTR_ERR(flow_handler); + + ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev); + + return 0; +} + +static int flow_matcher_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why) +{ + struct mlx5_ib_flow_matcher *obj = uobject->object; + int ret; + + ret = ib_destroy_usecnt(&obj->usecnt, why, uobject); + if (ret) + return ret; + + kfree(obj); + return 0; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( + struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE); + struct mlx5_ib_dev *dev = to_mdev(uobj->context->device); + struct mlx5_ib_flow_matcher *obj; + int err; + + obj = kzalloc(sizeof(struct mlx5_ib_flow_matcher), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + obj->mask_len = uverbs_attr_get_len( + attrs, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK); + err = uverbs_copy_from(&obj->matcher_mask, + attrs, + MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK); + if (err) + goto end; + + obj->flow_type = uverbs_attr_get_enum_id( + attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE); + + if (obj->flow_type == MLX5_IB_FLOW_TYPE_NORMAL) { + err = uverbs_copy_from(&obj->priority, + attrs, + MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE); + if (err) + goto end; + } + + err = uverbs_copy_from(&obj->match_criteria_enable, + attrs, + MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA); + if (err) + goto end; + + uobj->object = obj; + obj->mdev = dev->mdev; + atomic_set(&obj->usecnt, 0); + return 0; + +end: + kfree(obj); + return err; +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_CREATE_FLOW, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE, + UVERBS_OBJECT_FLOW, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE, + UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_MATCHER, + MLX5_IB_OBJECT_FLOW_MATCHER, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_QP, + UVERBS_OBJECT_QP, + UVERBS_ACCESS_READ), + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, + MLX5_IB_OBJECT_DEVX_OBJ, + UVERBS_ACCESS_READ)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_DESTROY_FLOW, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE, + UVERBS_OBJECT_FLOW, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +ADD_UVERBS_METHODS(mlx5_ib_fs, + UVERBS_OBJECT_FLOW, + &UVERBS_METHOD(MLX5_IB_METHOD_CREATE_FLOW), + &UVERBS_METHOD(MLX5_IB_METHOD_DESTROY_FLOW)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_FLOW_MATCHER_CREATE, + 
UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE, + MLX5_IB_OBJECT_FLOW_MATCHER, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK, + UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)), + UA_MANDATORY), + UVERBS_ATTR_ENUM_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE, + mlx5_ib_flow_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA, + UVERBS_ATTR_TYPE(u8), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_DESTROY_HANDLE, + MLX5_IB_OBJECT_FLOW_MATCHER, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER, + UVERBS_TYPE_ALLOC_IDR(flow_matcher_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE), + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY)); + +DECLARE_UVERBS_OBJECT_TREE(flow_objects, + &UVERBS_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER)); + +int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root) +{ + int i = 0; + + root[i++] = &flow_objects; + root[i++] = &mlx5_ib_fs; + + return i; +} diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c index 79e6309460dc..4950df3f71b6 100644 --- a/drivers/infiniband/hw/mlx5/gsi.c +++ b/drivers/infiniband/hw/mlx5/gsi.c @@ -477,8 +477,8 @@ static struct ib_qp *get_tx_qp(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr) return gsi->tx_qps[qp_index]; } -int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int mlx5_ib_gsi_post_send(struct ib_qp *qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); struct ib_qp *tx_qp; @@ -522,8 +522,8 @@ err: return ret; } -int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b3ba9a222550..c414f3809e5c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -419,8 +419,8 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, translate_eth_proto_oper(eth_prot_oper, &props->active_speed, &props->active_width); - props->port_cap_flags |= IB_PORT_CM_SUP; - props->port_cap_flags |= IB_PORT_IP_BASED_GIDS; + props->port_cap_flags |= IB_PORT_CM_SUP; + props->ip_gids = true; props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, roce_address_table_size); @@ -510,12 +510,11 @@ static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num, vlan_id, port_num); } -static int mlx5_ib_add_gid(const union ib_gid *gid, - const struct ib_gid_attr *attr, +static int mlx5_ib_add_gid(const struct ib_gid_attr *attr, __always_unused void **context) { return set_roce_addr(to_mdev(attr->device), attr->port_num, - attr->index, gid, attr); + attr->index, &attr->gid, attr); } static int mlx5_ib_del_gid(const struct ib_gid_attr *attr, @@ -525,41 +524,15 @@ static int mlx5_ib_del_gid(const struct ib_gid_attr *attr, attr->index, NULL, NULL); } -__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, - int index) +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, + const struct ib_gid_attr *attr) { - struct ib_gid_attr attr; - union ib_gid gid; - - if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr)) - return 0; - - 
dev_put(attr.ndev); - - if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) return 0; return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); } -int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, - int index, enum ib_gid_type *gid_type) -{ - struct ib_gid_attr attr; - union ib_gid gid; - int ret; - - ret = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr); - if (ret) - return ret; - - dev_put(attr.ndev); - - *gid_type = attr.gid_type; - - return 0; -} - static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) { if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) @@ -915,7 +888,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, max_sq_sg = (max_sq_desc - sizeof(struct mlx5_wqe_ctrl_seg) - sizeof(struct mlx5_wqe_raddr_seg)) / sizeof(struct mlx5_wqe_data_seg); - props->max_sge = min(max_rq_sg, max_sq_sg); + props->max_send_sge = max_sq_sg; + props->max_recv_sge = max_rq_sg; props->max_sge_rd = MLX5_MAX_SGE_RD; props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; @@ -1246,7 +1220,6 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, props->qkey_viol_cntr = rep->qkey_violation_counter; props->subnet_timeout = rep->subnet_timeout; props->init_type_reply = rep->init_type_reply; - props->grh_required = rep->grh_required; err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port); if (err) @@ -1585,31 +1558,26 @@ error: return err; } -static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) +static void deallocate_uars(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context) { struct mlx5_bfreg_info *bfregi; - int err; int i; bfregi = &context->bfregi; - for (i = 0; i < bfregi->num_sys_pages; i++) { + for (i = 0; i < bfregi->num_sys_pages; i++) if (i < bfregi->num_static_sys_pages || - bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX) { - err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); - if (err) { - mlx5_ib_warn(dev, "failed to free uar %d, err=%d\n", i, err); - return err; - } - } - } - - return 0; + bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX) + mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); } static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) { int err; + if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + return 0; + err = mlx5_core_alloc_transport_domain(dev->mdev, tdn); if (err) return err; @@ -1631,6 +1599,9 @@ static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn) { + if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + return; + mlx5_core_dealloc_transport_domain(dev->mdev, tdn); if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || @@ -1660,6 +1631,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, int err; size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2, max_cqe_version); + u32 dump_fill_mkey; bool lib_uar_4k; if (!dev->ib_active) @@ -1676,8 +1648,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (err) return ERR_PTR(err); - if (req.flags) - return ERR_PTR(-EINVAL); + if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX) + return ERR_PTR(-EOPNOTSUPP); if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) return ERR_PTR(-EOPNOTSUPP); @@ -1755,10 +1727,26 @@ static struct ib_ucontext 
*mlx5_ib_alloc_ucontext(struct ib_device *ibdev, context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; #endif - if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) { - err = mlx5_ib_alloc_transport_domain(dev, &context->tdn); + err = mlx5_ib_alloc_transport_domain(dev, &context->tdn); + if (err) + goto out_uars; + + if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { + /* Block DEVX on Infiniband as of SELinux */ + if (mlx5_ib_port_link_layer(ibdev, 1) != IB_LINK_LAYER_ETHERNET) { + err = -EPERM; + goto out_td; + } + + err = mlx5_ib_devx_create(dev, context); if (err) - goto out_uars; + goto out_td; + } + + if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { + err = mlx5_cmd_dump_fill_mkey(dev->mdev, &dump_fill_mkey); + if (err) + goto out_mdev; } INIT_LIST_HEAD(&context->vma_private_list); @@ -1819,9 +1807,18 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.response_length += sizeof(resp.num_dyn_bfregs); } + if (field_avail(typeof(resp), dump_fill_mkey, udata->outlen)) { + if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { + resp.dump_fill_mkey = dump_fill_mkey; + resp.comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY; + } + resp.response_length += sizeof(resp.dump_fill_mkey); + } + err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) - goto out_td; + goto out_mdev; bfregi->ver = ver; bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs; @@ -1831,9 +1828,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, return &context->ibucontext; +out_mdev: + if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) + mlx5_ib_devx_destroy(dev, context); out_td: - if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) - mlx5_ib_dealloc_transport_domain(dev, context->tdn); + mlx5_ib_dealloc_transport_domain(dev, context->tdn); out_uars: deallocate_uars(dev, context); @@ -1856,9 +1855,11 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_bfreg_info *bfregi; + if (context->devx_uid) + mlx5_ib_devx_destroy(dev, context); + bfregi = &context->bfregi; - if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) - mlx5_ib_dealloc_transport_domain(dev, context->tdn); + mlx5_ib_dealloc_transport_domain(dev, context->tdn); deallocate_uars(dev, context); kfree(bfregi->sys_pages); @@ -2040,7 +2041,7 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, struct mlx5_bfreg_info *bfregi = &context->bfregi; int err; unsigned long idx; - phys_addr_t pfn, pa; + phys_addr_t pfn; pgprot_t prot; u32 bfreg_dyn_idx = 0; u32 uar_index; @@ -2131,8 +2132,6 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, goto err; } - pa = pfn << PAGE_SHIFT; - err = mlx5_ib_set_vma_data(vma, context); if (err) goto err; @@ -2699,7 +2698,7 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, IPPROTO_GRE); MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol, - 0xffff); + ntohs(ib_spec->gre.mask.protocol)); MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol, ntohs(ib_spec->gre.val.protocol)); @@ -2979,11 +2978,11 @@ static void counters_clear_description(struct ib_counters *counters) static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) { - struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device); struct mlx5_ib_flow_handler *handler = container_of(flow_id, struct mlx5_ib_flow_handler, ibflow); struct mlx5_ib_flow_handler *iter, *tmp; + struct mlx5_ib_dev *dev = handler->dev; mutex_lock(&dev->flow_db->lock); @@ 
-3001,6 +3000,8 @@ static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) counters_clear_description(handler->ibcounters); mutex_unlock(&dev->flow_db->lock); + if (handler->flow_matcher) + atomic_dec(&handler->flow_matcher->usecnt); kfree(handler); return 0; @@ -3021,6 +3022,26 @@ enum flow_table_type { #define MLX5_FS_MAX_TYPES 6 #define MLX5_FS_MAX_ENTRIES BIT(16) + +static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, + struct mlx5_ib_flow_prio *prio, + int priority, + int num_entries, int num_groups) +{ + struct mlx5_flow_table *ft; + + ft = mlx5_create_auto_grouped_flow_table(ns, priority, + num_entries, + num_groups, + 0, 0); + if (IS_ERR(ft)) + return ERR_CAST(ft); + + prio->flow_table = ft; + prio->refcount = 0; + return prio; +} + static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, struct ib_flow_attr *flow_attr, enum flow_table_type ft_type) @@ -3033,7 +3054,6 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, int num_entries; int num_groups; int priority; - int err = 0; max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size)); @@ -3083,21 +3103,10 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, return ERR_PTR(-ENOMEM); ft = prio->flow_table; - if (!ft) { - ft = mlx5_create_auto_grouped_flow_table(ns, priority, - num_entries, - num_groups, - 0, 0); - - if (!IS_ERR(ft)) { - prio->refcount = 0; - prio->flow_table = ft; - } else { - err = PTR_ERR(ft); - } - } + if (!ft) + return _get_prio(ns, prio, priority, num_entries, num_groups); - return err ? ERR_PTR(err) : prio; + return prio; } static void set_underlay_qp(struct mlx5_ib_dev *dev, @@ -3356,6 +3365,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, ft_prio->refcount++; handler->prio = ft_prio; + handler->dev = dev; ft_prio->flow_table = ft; free: @@ -3648,6 +3658,189 @@ free_ucmd: return ERR_PTR(err); } +static struct mlx5_ib_flow_prio *_get_flow_table(struct mlx5_ib_dev *dev, + int priority, bool mcast) +{ + int max_table_size; + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_ib_flow_prio *prio; + + max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + log_max_ft_size)); + if (max_table_size < MLX5_FS_MAX_ENTRIES) + return ERR_PTR(-ENOMEM); + + if (mcast) + priority = MLX5_IB_FLOW_MCAST_PRIO; + else + priority = ib_prio_to_core_prio(priority, false); + + ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS); + if (!ns) + return ERR_PTR(-ENOTSUPP); + + prio = &dev->flow_db->prios[priority]; + + if (prio->flow_table) + return prio; + + return _get_prio(ns, prio, priority, MLX5_FS_MAX_ENTRIES, + MLX5_FS_MAX_TYPES); +} + +static struct mlx5_ib_flow_handler * +_create_raw_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct mlx5_flow_destination *dst, + struct mlx5_ib_flow_matcher *fs_matcher, + void *cmd_in, int inlen) +{ + struct mlx5_ib_flow_handler *handler; + struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; + struct mlx5_flow_spec *spec; + struct mlx5_flow_table *ft = ft_prio->flow_table; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (!handler || !spec) { + err = -ENOMEM; + goto free; + } + + INIT_LIST_HEAD(&handler->list); + + memcpy(spec->match_value, cmd_in, inlen); + memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params, + fs_matcher->mask_len); + spec->match_criteria_enable = fs_matcher->match_criteria_enable; + + 
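The raw flow-steering path splits matching in two: the matcher object created by MLX5_IB_METHOD_FLOW_MATCHER_CREATE owns the mask and criteria, each CREATE_FLOW call supplies only the match values and a destination, and _create_raw_flow_rule() here combines the two into one mlx5_flow_spec. Every rule also takes a reference on its matcher so flow_matcher_cleanup() refuses to free it while rules exist. Below is a small user-space sketch of that mask/value split and the use count; the types, MATCH_BYTES, and return codes are invented for illustration.

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define MATCH_BYTES 16 /* invented; the real match params are much larger */

	struct matcher {                       /* models mlx5_ib_flow_matcher */
		unsigned char mask[MATCH_BYTES];
		atomic_int usecnt;
	};

	struct rule {                          /* models mlx5_ib_flow_handler */
		unsigned char match_value[MATCH_BYTES];
		struct matcher *matcher;
	};

	static struct rule *rule_create(struct matcher *m, const unsigned char *value)
	{
		struct rule *r = calloc(1, sizeof(*r));

		if (!r)
			return NULL;
		/* Matcher contributes the mask, the caller contributes the values;
		 * compare the memcpy() pair in _create_raw_flow_rule(). */
		memcpy(r->match_value, value, MATCH_BYTES);
		r->matcher = m;
		atomic_fetch_add(&m->usecnt, 1);
		return r;
	}

	static int matcher_destroy(struct matcher *m)
	{
		/* Models ib_destroy_usecnt(): refuse while rules still reference us. */
		if (atomic_load(&m->usecnt))
			return -1; /* the kernel returns -EBUSY here */
		free(m);
		return 0;
	}

	static void rule_destroy(struct rule *r)
	{
		atomic_fetch_sub(&r->matcher->usecnt, 1);
		free(r);
	}

	int main(void)
	{
		struct matcher *m = calloc(1, sizeof(*m));
		unsigned char value[MATCH_BYTES] = { 0x01 };
		struct rule *r;

		if (!m)
			return 1;
		atomic_init(&m->usecnt, 0);
		r = rule_create(m, value);
		if (!r) {
			free(m);
			return 1;
		}
		printf("destroy with live rule: %d\n", matcher_destroy(m)); /* -1 */
		rule_destroy(r);
		printf("destroy after rule gone: %d\n", matcher_destroy(m)); /* 0 */
		return 0;
	}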
flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + handler->rule = mlx5_add_flow_rules(ft, spec, + &flow_act, dst, 1); + + if (IS_ERR(handler->rule)) { + err = PTR_ERR(handler->rule); + goto free; + } + + ft_prio->refcount++; + handler->prio = ft_prio; + handler->dev = dev; + ft_prio->flow_table = ft; + +free: + if (err) + kfree(handler); + kvfree(spec); + return err ? ERR_PTR(err) : handler; +} + +static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher, + void *match_v) +{ + void *match_c; + void *match_v_set_lyr_2_4, *match_c_set_lyr_2_4; + void *dmac, *dmac_mask; + void *ipv4, *ipv4_mask; + + if (!(fs_matcher->match_criteria_enable & + (1 << MATCH_CRITERIA_ENABLE_OUTER_BIT))) + return false; + + match_c = fs_matcher->matcher_mask.match_params; + match_v_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + match_c_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + + dmac = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4, + dmac_47_16); + dmac_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4, + dmac_47_16); + + if (is_multicast_ether_addr(dmac) && + is_multicast_ether_addr(dmac_mask)) + return true; + + ipv4 = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + + ipv4_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4); + + if (ipv4_is_multicast(*(__be32 *)(ipv4)) && + ipv4_is_multicast(*(__be32 *)(ipv4_mask))) + return true; + + return false; +} + +struct mlx5_ib_flow_handler * +mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_matcher *fs_matcher, + void *cmd_in, int inlen, int dest_id, + int dest_type) +{ + struct mlx5_flow_destination *dst; + struct mlx5_ib_flow_prio *ft_prio; + int priority = fs_matcher->priority; + struct mlx5_ib_flow_handler *handler; + bool mcast; + int err; + + if (fs_matcher->flow_type != MLX5_IB_FLOW_TYPE_NORMAL) + return ERR_PTR(-EOPNOTSUPP); + + if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO) + return ERR_PTR(-ENOMEM); + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + mcast = raw_fs_is_multicast(fs_matcher, cmd_in); + mutex_lock(&dev->flow_db->lock); + + ft_prio = _get_flow_table(dev, priority, mcast); + if (IS_ERR(ft_prio)) { + err = PTR_ERR(ft_prio); + goto unlock; + } + + if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) { + dst->type = dest_type; + dst->tir_num = dest_id; + } else { + dst->type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM; + dst->ft_num = dest_id; + } + + handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, cmd_in, + inlen); + + if (IS_ERR(handler)) { + err = PTR_ERR(handler); + goto destroy_ft; + } + + mutex_unlock(&dev->flow_db->lock); + atomic_inc(&fs_matcher->usecnt); + handler->flow_matcher = fs_matcher; + + kfree(dst); + + return handler; + +destroy_ft: + put_flow_table(dev, ft_prio, false); +unlock: + mutex_unlock(&dev->flow_db->lock); + kfree(dst); + + return ERR_PTR(err); +} + static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags) { u32 flags = 0; @@ -3672,12 +3865,11 @@ mlx5_ib_create_flow_action_esp(struct ib_device *device, u64 flags; int err = 0; - if (IS_UVERBS_COPY_ERR(uverbs_copy_from(&action_flags, attrs, - MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS))) - return ERR_PTR(-EFAULT); - - if (action_flags >= (MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1)) - return ERR_PTR(-EOPNOTSUPP); + err = uverbs_get_flags64( + &action_flags, attrs, 
MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, + ((MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1) - 1)); + if (err) + return ERR_PTR(err); flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags); @@ -4466,7 +4658,8 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr) cancel_work_sync(&devr->ports[port].pkey_change_work); } -static u32 get_core_cap_flags(struct ib_device *ibdev) +static u32 get_core_cap_flags(struct ib_device *ibdev, + struct mlx5_hca_vport_context *rep) { struct mlx5_ib_dev *dev = to_mdev(ibdev); enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); @@ -4475,11 +4668,14 @@ static u32 get_core_cap_flags(struct ib_device *ibdev) bool raw_support = !mlx5_core_mp_enabled(dev->mdev); u32 ret = 0; + if (rep->grh_required) + ret |= RDMA_CORE_CAP_IB_GRH_REQUIRED; + if (ll == IB_LINK_LAYER_INFINIBAND) - return RDMA_CORE_PORT_IBA_IB; + return ret | RDMA_CORE_PORT_IBA_IB; if (raw_support) - ret = RDMA_CORE_PORT_RAW_PACKET; + ret |= RDMA_CORE_PORT_RAW_PACKET; if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) return ret; @@ -4502,17 +4698,23 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_attr attr; struct mlx5_ib_dev *dev = to_mdev(ibdev); enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num); + struct mlx5_hca_vport_context rep = {0}; int err; - immutable->core_cap_flags = get_core_cap_flags(ibdev); - err = ib_query_port(ibdev, port_num, &attr); if (err) return err; + if (ll == IB_LINK_LAYER_INFINIBAND) { + err = mlx5_query_hca_vport_context(dev->mdev, 0, port_num, 0, + &rep); + if (err) + return err; + } + immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; - immutable->core_cap_flags = get_core_cap_flags(ibdev); + immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep); if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce)) immutable->max_mad_size = IB_MGMT_MAD_SIZE; @@ -4610,7 +4812,7 @@ static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num) } } -static int mlx5_enable_eth(struct mlx5_ib_dev *dev, u8 port_num) +static int mlx5_enable_eth(struct mlx5_ib_dev *dev) { int err; @@ -4689,12 +4891,21 @@ static const struct mlx5_ib_counter extended_err_cnts[] = { INIT_Q_COUNTER(req_cqe_flush_error), }; +#define INIT_EXT_PPCNT_COUNTER(_name) \ + { .name = #_name, .offset = \ + MLX5_BYTE_OFF(ppcnt_reg, \ + counter_set.eth_extended_cntrs_grp_data_layout._name##_high)} + +static const struct mlx5_ib_counter ext_ppcnt_cnts[] = { + INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated), +}; + static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { int i; for (i = 0; i < dev->num_ports; i++) { - if (dev->port[i].cnts.set_id) + if (dev->port[i].cnts.set_id_valid) mlx5_core_dealloc_q_counter(dev->mdev, dev->port[i].cnts.set_id); kfree(dev->port[i].cnts.names); @@ -4724,7 +4935,10 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, cnts->num_cong_counters = ARRAY_SIZE(cong_cnts); num_counters += ARRAY_SIZE(cong_cnts); } - + if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { + cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts); + num_counters += ARRAY_SIZE(ext_ppcnt_cnts); + } cnts->names = kcalloc(num_counters, sizeof(cnts->names), GFP_KERNEL); if (!cnts->names) return -ENOMEM; @@ -4781,6 +4995,13 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, offsets[j] = cong_cnts[i].offset; } } + + if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { + for (i = 0; i < 
ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) { + names[j] = ext_ppcnt_cnts[i].name; + offsets[j] = ext_ppcnt_cnts[i].offset; + } + } } static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) @@ -4826,7 +5047,8 @@ static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, return rdma_alloc_hw_stats_struct(port->cnts.names, port->cnts.num_q_counters + - port->cnts.num_cong_counters, + port->cnts.num_cong_counters + + port->cnts.num_ext_ppcnt_counters, RDMA_HW_STATS_DEFAULT_LIFESPAN); } @@ -4859,6 +5081,34 @@ free: return ret; } +static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev, + struct mlx5_ib_port *port, + struct rdma_hw_stats *stats) +{ + int offset = port->cnts.num_q_counters + port->cnts.num_cong_counters; + int sz = MLX5_ST_SZ_BYTES(ppcnt_reg); + int ret, i; + void *out; + + out = kvzalloc(sz, GFP_KERNEL); + if (!out) + return -ENOMEM; + + ret = mlx5_cmd_query_ext_ppcnt_counters(dev->mdev, out); + if (ret) + goto free; + + for (i = 0; i < port->cnts.num_ext_ppcnt_counters; i++) { + stats->value[i + offset] = + be64_to_cpup((__be64 *)(out + + port->cnts.offsets[i + offset])); + } + +free: + kvfree(out); + return ret; +} + static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u8 port_num, int index) @@ -4872,13 +5122,21 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, if (!stats) return -EINVAL; - num_counters = port->cnts.num_q_counters + port->cnts.num_cong_counters; + num_counters = port->cnts.num_q_counters + + port->cnts.num_cong_counters + + port->cnts.num_ext_ppcnt_counters; /* q_counters are per IB device, query the master mdev */ ret = mlx5_ib_query_q_counters(dev->mdev, port, stats); if (ret) return ret; + if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) { + ret = mlx5_ib_query_ext_ppcnt_counters(dev, port, stats); + if (ret) + return ret; + } + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); @@ -4905,11 +5163,6 @@ done: return num_counters; } -static void mlx5_ib_free_rdma_netdev(struct net_device *netdev) -{ - return mlx5_rdma_netdev_free(netdev); -} - static struct net_device* mlx5_ib_alloc_rdma_netdev(struct ib_device *hca, u8 port_num, @@ -4919,17 +5172,12 @@ mlx5_ib_alloc_rdma_netdev(struct ib_device *hca, void (*setup)(struct net_device *)) { struct net_device *netdev; - struct rdma_netdev *rn; if (type != RDMA_NETDEV_IPOIB) return ERR_PTR(-EOPNOTSUPP); netdev = mlx5_rdma_netdev_alloc(to_mdev(hca)->mdev, hca, name, setup); - if (likely(!IS_ERR_OR_NULL(netdev))) { - rn = netdev_priv(netdev); - rn->free_rdma_netdev = mlx5_ib_free_rdma_netdev; - } return netdev; } @@ -5127,8 +5375,8 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, spin_lock(&ibdev->port[port_num].mp.mpi_lock); if (ibdev->port[port_num].mp.mpi) { - mlx5_ib_warn(ibdev, "port %d already affiliated.\n", - port_num + 1); + mlx5_ib_dbg(ibdev, "port %d already affiliated.\n", + port_num + 1); spin_unlock(&ibdev->port[port_num].mp.mpi_lock); return false; } @@ -5263,45 +5511,47 @@ static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) mlx5_nic_vport_disable_roce(dev->mdev); } -ADD_UVERBS_ATTRIBUTES_SIMPLE(mlx5_ib_dm, UVERBS_OBJECT_DM, - UVERBS_METHOD_DM_ALLOC, - &UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, - UVERBS_ATTR_TYPE(u64), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), - &UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, - UVERBS_ATTR_TYPE(u16), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); - 
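The hardware-counter changes above append an optional third group (the extended PPCNT counters, currently only rx_icrc_encapsulated) to the per-port stats table: the names and offsets arrays are filled group by group, and mlx5_ib_get_hw_stats() queries each supported group into the same value array at its running offset. A hedged sketch of that "concatenate optional counter groups into one flat table" layout follows; the group contents and sizes here are made up and the offsets side is only hinted at in comments.

	#include <stddef.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

	/* Invented counter groups; the driver's are q_cnts[], cong_cnts[] and
	 * ext_ppcnt_cnts[], each entry carrying a hardware offset as well. */
	static const char *const q_names[]     = { "rx_write_requests", "out_of_sequence" };
	static const char *const cong_names[]  = { "np_ecn_marked_roce_packets" };
	static const char *const ppcnt_names[] = { "rx_icrc_encapsulated" };

	struct stats_table {
		const char **names;
		size_t num;
	};

	/* Build one flat table from whichever groups the device supports,
	 * mirroring __mlx5_ib_alloc_counters()/mlx5_ib_fill_counters(). */
	static int table_build(struct stats_table *t, int has_cong, int has_ppcnt)
	{
		size_t n = ARRAY_SIZE(q_names) +
			   (has_cong ? ARRAY_SIZE(cong_names) : 0) +
			   (has_ppcnt ? ARRAY_SIZE(ppcnt_names) : 0);
		size_t i, j = 0;

		t->names = calloc(n, sizeof(*t->names));
		if (!t->names)
			return -1;
		for (i = 0; i < ARRAY_SIZE(q_names); i++)
			t->names[j++] = q_names[i];
		if (has_cong)
			for (i = 0; i < ARRAY_SIZE(cong_names); i++)
				t->names[j++] = cong_names[i];
		if (has_ppcnt)
			for (i = 0; i < ARRAY_SIZE(ppcnt_names); i++)
				t->names[j++] = ppcnt_names[i];
		t->num = j;
		return 0;
	}

	int main(void)
	{
		struct stats_table t;
		size_t i;

		if (table_build(&t, 1, 1))
			return 1;
		/* Each query would fill values[] starting at its group's offset. */
		for (i = 0; i < t.num; i++)
			printf("%zu: %s\n", i, t.names[i]);
		free(t.names);
		return 0;
	}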
-ADD_UVERBS_ATTRIBUTES_SIMPLE(mlx5_ib_flow_action, UVERBS_OBJECT_FLOW_ACTION, - UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, - &UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, - UVERBS_ATTR_TYPE(u64), - UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_dm, + UVERBS_OBJECT_DM, + UVERBS_METHOD_DM_ALLOC, + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY)); + +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_flow_action, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS, + enum mlx5_ib_uapi_flow_action_flags)); -#define NUM_TREES 2 static int populate_specs_root(struct mlx5_ib_dev *dev) { - const struct uverbs_object_tree_def *default_root[NUM_TREES + 1] = { - uverbs_default_get_objects()}; - size_t num_trees = 1; + const struct uverbs_object_tree_def **trees = dev->driver_trees; + size_t num_trees = 0; - if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE && - !WARN_ON(num_trees >= ARRAY_SIZE(default_root))) - default_root[num_trees++] = &mlx5_ib_flow_action; + if (mlx5_accel_ipsec_device_caps(dev->mdev) & + MLX5_ACCEL_IPSEC_CAP_DEVICE) + trees[num_trees++] = &mlx5_ib_flow_action; - if (MLX5_CAP_DEV_MEM(dev->mdev, memic) && - !WARN_ON(num_trees >= ARRAY_SIZE(default_root))) - default_root[num_trees++] = &mlx5_ib_dm; + if (MLX5_CAP_DEV_MEM(dev->mdev, memic)) + trees[num_trees++] = &mlx5_ib_dm; - dev->ib_dev.specs_root = - uverbs_alloc_spec_tree(num_trees, default_root); + if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) & + MLX5_GENERAL_OBJ_TYPES_CAP_UCTX) + trees[num_trees++] = mlx5_ib_get_devx_tree(); - return PTR_ERR_OR_ZERO(dev->ib_dev.specs_root); -} + num_trees += mlx5_ib_get_flow_trees(trees + num_trees); -static void depopulate_specs_root(struct mlx5_ib_dev *dev) -{ - uverbs_free_spec_tree(dev->ib_dev.specs_root); + WARN_ON(num_trees >= ARRAY_SIZE(dev->driver_trees)); + trees[num_trees] = NULL; + dev->ib_dev.driver_specs = trees; + + return 0; } static int mlx5_ib_read_counters(struct ib_counters *counters, @@ -5552,6 +5802,8 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) dev->ib_dev.modify_qp = mlx5_ib_modify_qp; dev->ib_dev.query_qp = mlx5_ib_query_qp; dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp; + dev->ib_dev.drain_sq = mlx5_ib_drain_sq; + dev->ib_dev.drain_rq = mlx5_ib_drain_rq; dev->ib_dev.post_send = mlx5_ib_post_send; dev->ib_dev.post_recv = mlx5_ib_post_recv; dev->ib_dev.create_cq = mlx5_ib_create_cq; @@ -5649,9 +5901,9 @@ int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev) return 0; } -static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev, - u8 port_num) +static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev) { + u8 port_num; int i; for (i = 0; i < dev->num_ports; i++) { @@ -5674,6 +5926,8 @@ static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev, (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); + port_num = mlx5_core_native_port_num(dev->mdev) - 1; + return mlx5_add_netdev_notifier(dev, port_num); } @@ -5690,14 +5944,12 @@ int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev) enum rdma_link_layer ll; int port_type_cap; int err = 0; - u8 port_num; - port_num = mlx5_core_native_port_num(dev->mdev) - 1; port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = 
mlx5_port_type_cap_to_rdma_ll(port_type_cap); if (ll == IB_LINK_LAYER_ETHERNET) - err = mlx5_ib_stage_common_roce_init(dev, port_num); + err = mlx5_ib_stage_common_roce_init(dev); return err; } @@ -5712,19 +5964,17 @@ static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev) struct mlx5_core_dev *mdev = dev->mdev; enum rdma_link_layer ll; int port_type_cap; - u8 port_num; int err; - port_num = mlx5_core_native_port_num(dev->mdev) - 1; port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); if (ll == IB_LINK_LAYER_ETHERNET) { - err = mlx5_ib_stage_common_roce_init(dev, port_num); + err = mlx5_ib_stage_common_roce_init(dev); if (err) return err; - err = mlx5_enable_eth(dev, port_num); + err = mlx5_enable_eth(dev); if (err) goto cleanup; } @@ -5741,9 +5991,7 @@ static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev) struct mlx5_core_dev *mdev = dev->mdev; enum rdma_link_layer ll; int port_type_cap; - u8 port_num; - port_num = mlx5_core_native_port_num(dev->mdev) - 1; port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); @@ -5842,11 +6090,6 @@ int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) return ib_register_device(&dev->ib_dev, NULL); } -static void mlx5_ib_stage_depopulate_specs(struct mlx5_ib_dev *dev) -{ - depopulate_specs_root(dev); -} - void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) { destroy_umrc_res(dev); @@ -5915,8 +6158,6 @@ void __mlx5_ib_remove(struct mlx5_ib_dev *dev, ib_dealloc_device((struct ib_device *)dev); } -static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev, u8 port_num); - void *__mlx5_ib_add(struct mlx5_ib_dev *dev, const struct mlx5_ib_profile *profile) { @@ -5983,7 +6224,7 @@ static const struct mlx5_ib_profile pf_profile = { mlx5_ib_stage_pre_ib_reg_umr_cleanup), STAGE_CREATE(MLX5_IB_STAGE_SPECS, mlx5_ib_stage_populate_specs, - mlx5_ib_stage_depopulate_specs), + NULL), STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), @@ -6031,7 +6272,7 @@ static const struct mlx5_ib_profile nic_rep_profile = { mlx5_ib_stage_pre_ib_reg_umr_cleanup), STAGE_CREATE(MLX5_IB_STAGE_SPECS, mlx5_ib_stage_populate_specs, - mlx5_ib_stage_depopulate_specs), + NULL), STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), @@ -6046,7 +6287,7 @@ static const struct mlx5_ib_profile nic_rep_profile = { mlx5_ib_stage_rep_reg_cleanup), }; -static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev, u8 port_num) +static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev) { struct mlx5_ib_multiport_info *mpi; struct mlx5_ib_dev *dev; @@ -6080,8 +6321,6 @@ static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev, u8 port_num) if (!bound) { list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list); dev_dbg(&mdev->pdev->dev, "no suitable IB device found to bind to, added to unaffiliated list.\n"); - } else { - mlx5_ib_dbg(dev, "bound port %u\n", port_num + 1); } mutex_unlock(&mlx5_ib_multiport_mutex); @@ -6099,11 +6338,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); - if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET) { - u8 port_num = mlx5_core_native_port_num(mdev) - 1; - - return mlx5_ib_add_slave_port(mdev, port_num); - } + if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET) + return mlx5_ib_add_slave_port(mdev); dev = (struct mlx5_ib_dev 
*)ib_alloc_device(sizeof(*dev)); if (!dev) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index d89c8fe626f6..320d4dfe8c2f 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -46,6 +46,7 @@ #include <rdma/ib_user_verbs.h> #include <rdma/mlx5-abi.h> #include <rdma/uverbs_ioctl.h> +#include <rdma/mlx5_user_ioctl_cmds.h> #define mlx5_ib_dbg(dev, format, arg...) \ pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ @@ -78,12 +79,6 @@ enum { MLX5_REQ_SCAT_DATA64_CQE = 0x22, }; -enum mlx5_ib_latency_class { - MLX5_IB_LATENCY_CLASS_LOW, - MLX5_IB_LATENCY_CLASS_MEDIUM, - MLX5_IB_LATENCY_CLASS_HIGH, -}; - enum mlx5_ib_mad_ifc_flags { MLX5_MAD_IFC_IGNORE_MKEY = 1, MLX5_MAD_IFC_IGNORE_BKEY = 2, @@ -143,6 +138,7 @@ struct mlx5_ib_ucontext { u64 lib_caps; DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES); + u16 devx_uid; }; static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) @@ -176,6 +172,18 @@ struct mlx5_ib_flow_handler { struct mlx5_ib_flow_prio *prio; struct mlx5_flow_handle *rule; struct ib_counters *ibcounters; + struct mlx5_ib_dev *dev; + struct mlx5_ib_flow_matcher *flow_matcher; +}; + +struct mlx5_ib_flow_matcher { + struct mlx5_ib_match_params matcher_mask; + int mask_len; + enum mlx5_ib_flow_type flow_type; + u16 priority; + struct mlx5_core_dev *mdev; + atomic_t usecnt; + u8 match_criteria_enable; }; struct mlx5_ib_flow_db { @@ -461,7 +469,7 @@ struct mlx5_umr_wr { u32 mkey; }; -static inline struct mlx5_umr_wr *umr_wr(struct ib_send_wr *wr) +static inline const struct mlx5_umr_wr *umr_wr(const struct ib_send_wr *wr) { return container_of(wr, struct mlx5_umr_wr, wr); } @@ -665,6 +673,7 @@ struct mlx5_ib_counters { size_t *offsets; u32 num_q_counters; u32 num_cong_counters; + u32 num_ext_ppcnt_counters; u16 set_id; bool set_id_valid; }; @@ -851,6 +860,7 @@ to_mcounters(struct ib_counters *ibcntrs) struct mlx5_ib_dev { struct ib_device ib_dev; + const struct uverbs_object_tree_def *driver_trees[6]; struct mlx5_core_dev *mdev; struct mlx5_roce roce[MLX5_MAX_PORTS]; int num_ports; @@ -1004,8 +1014,8 @@ int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); int mlx5_ib_destroy_srq(struct ib_srq *srq); -int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); @@ -1014,10 +1024,12 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int mlx5_ib_destroy_qp(struct ib_qp *qp); -int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +void mlx5_ib_drain_sq(struct ib_qp *qp); +void mlx5_ib_drain_rq(struct ib_qp *qp); +int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); 
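The prototype changes in this header follow the tree-wide series that makes work-request posting const-correct: the provider never modifies the caller's WR chain, and a failure is reported by pointing *bad_wr at the work request that could not be posted, so the caller can resume from there. The following user-space sketch shows only that calling convention with an invented provider and WR type; it is not the mlx5 implementation.

	#include <errno.h>
	#include <stddef.h>
	#include <stdio.h>

	/* Invented stand-in for struct ib_send_wr: a singly linked WR chain. */
	struct send_wr {
		int id;
		const struct send_wr *next;
	};

	/* Models the const-correct post_send() shape used above: the chain is
	 * read-only, and on error *bad_wr names the first WR that was not
	 * posted so the caller can resubmit from that point. */
	static int post_send(const struct send_wr *wr, const struct send_wr **bad_wr)
	{
		for (; wr; wr = wr->next) {
			if (wr->id < 0) {          /* pretend this WR is malformed */
				*bad_wr = wr;
				return -EINVAL;
			}
			printf("posted wr %d\n", wr->id);
		}
		return 0;
	}

	int main(void)
	{
		struct send_wr w2 = { .id = -1, .next = NULL };
		struct send_wr w1 = { .id = 1,  .next = &w2 };
		const struct send_wr *bad = NULL;
		int ret = post_send(&w1, &bad);

		if (ret)
			printf("failed at wr %d (ret=%d)\n", bad->id, ret);
		return 0;
	}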
int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, void *buffer, u32 length, @@ -1183,10 +1195,8 @@ int mlx5_ib_get_vf_stats(struct ib_device *device, int vf, int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, int type); -__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, - int index); -int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, - int index, enum ib_gid_type *gid_type); +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, + const struct ib_gid_attr *attr); void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num); int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num); @@ -1200,10 +1210,10 @@ int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); -int mlx5_ib_gsi_post_send(struct ib_qp *qp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int mlx5_ib_gsi_post_recv(struct ib_qp *qp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mlx5_ib_gsi_post_send(struct ib_qp *qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mlx5_ib_gsi_post_recv(struct ib_qp *qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc); @@ -1217,6 +1227,36 @@ struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *dev, void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev, u8 port_num); +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context); +void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context); +const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void); +struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add( + struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, + void *cmd_in, int inlen, int dest_id, int dest_type); +bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type); +int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root); +#else +static inline int +mlx5_ib_devx_create(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context) { return -EOPNOTSUPP; }; +static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, + struct mlx5_ib_ucontext *context) {} +static inline const struct uverbs_object_tree_def * +mlx5_ib_get_devx_tree(void) { return NULL; } +static inline bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, + int *dest_type) +{ + return false; +} +static inline int +mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root) +{ + return 0; +} +#endif static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; @@ -1318,4 +1358,7 @@ static inline int get_num_static_uars(struct mlx5_ib_dev *dev, unsigned long mlx5_ib_get_xlt_emergency_page(void); void mlx5_ib_put_xlt_emergency_page(void); +int bfregn_to_uar_index(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, u32 bfregn, + bool dyn_bfreg); #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 90a9c461cedc..9fb1d9cb9401 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -271,16 +271,16 @@ static ssize_t size_write(struct file *filp, const char __user *buf, { struct mlx5_cache_ent *ent = filp->private_data; struct 
mlx5_ib_dev *dev = ent->dev; - char lbuf[20]; + char lbuf[20] = {0}; u32 var; int err; int c; - if (copy_from_user(lbuf, buf, sizeof(lbuf))) + count = min(count, sizeof(lbuf) - 1); + if (copy_from_user(lbuf, buf, count)) return -EFAULT; c = order2idx(dev, ent->order); - lbuf[sizeof(lbuf) - 1] = 0; if (sscanf(lbuf, "%u", &var) != 1) return -EINVAL; @@ -310,19 +310,11 @@ static ssize_t size_read(struct file *filp, char __user *buf, size_t count, char lbuf[20]; int err; - if (*pos) - return 0; - err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size); if (err < 0) return err; - if (copy_to_user(buf, lbuf, err)) - return -EFAULT; - - *pos += err; - - return err; + return simple_read_from_buffer(buf, count, pos, lbuf, err); } static const struct file_operations size_fops = { @@ -337,16 +329,16 @@ static ssize_t limit_write(struct file *filp, const char __user *buf, { struct mlx5_cache_ent *ent = filp->private_data; struct mlx5_ib_dev *dev = ent->dev; - char lbuf[20]; + char lbuf[20] = {0}; u32 var; int err; int c; - if (copy_from_user(lbuf, buf, sizeof(lbuf))) + count = min(count, sizeof(lbuf) - 1); + if (copy_from_user(lbuf, buf, count)) return -EFAULT; c = order2idx(dev, ent->order); - lbuf[sizeof(lbuf) - 1] = 0; if (sscanf(lbuf, "%u", &var) != 1) return -EINVAL; @@ -372,19 +364,11 @@ static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, char lbuf[20]; int err; - if (*pos) - return 0; - err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); if (err < 0) return err; - if (copy_to_user(buf, lbuf, err)) - return -EFAULT; - - *pos += err; - - return err; + return simple_read_from_buffer(buf, count, pos, lbuf, err); } static const struct file_operations limit_fops = { @@ -914,7 +898,7 @@ static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev, struct mlx5_umr_wr *umrwr) { struct umr_common *umrc = &dev->umrc; - struct ib_send_wr *bad; + const struct ib_send_wr *bad; int err; struct mlx5_ib_umr_context umr_context; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a4f1f638509f..6cba2a02d11b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -563,32 +563,21 @@ static int alloc_med_class_bfreg(struct mlx5_ib_dev *dev, } static int alloc_bfreg(struct mlx5_ib_dev *dev, - struct mlx5_bfreg_info *bfregi, - enum mlx5_ib_latency_class lat) + struct mlx5_bfreg_info *bfregi) { - int bfregn = -EINVAL; + int bfregn = -ENOMEM; mutex_lock(&bfregi->lock); - switch (lat) { - case MLX5_IB_LATENCY_CLASS_LOW: + if (bfregi->ver >= 2) { + bfregn = alloc_high_class_bfreg(dev, bfregi); + if (bfregn < 0) + bfregn = alloc_med_class_bfreg(dev, bfregi); + } + + if (bfregn < 0) { BUILD_BUG_ON(NUM_NON_BLUE_FLAME_BFREGS != 1); bfregn = 0; bfregi->count[bfregn]++; - break; - - case MLX5_IB_LATENCY_CLASS_MEDIUM: - if (bfregi->ver < 2) - bfregn = -ENOMEM; - else - bfregn = alloc_med_class_bfreg(dev, bfregi); - break; - - case MLX5_IB_LATENCY_CLASS_HIGH: - if (bfregi->ver < 2) - bfregn = -ENOMEM; - else - bfregn = alloc_high_class_bfreg(dev, bfregi); - break; } mutex_unlock(&bfregi->lock); @@ -641,13 +630,13 @@ static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq); -static int bfregn_to_uar_index(struct mlx5_ib_dev *dev, - struct mlx5_bfreg_info *bfregi, int bfregn, - bool dyn_bfreg) +int bfregn_to_uar_index(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, u32 bfregn, + bool dyn_bfreg) { - int bfregs_per_sys_page; - int index_of_sys_page; - 
int offset; + unsigned int bfregs_per_sys_page; + u32 index_of_sys_page; + u32 offset; bfregs_per_sys_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * MLX5_NON_FP_BFREGS_PER_UAR; @@ -655,6 +644,10 @@ static int bfregn_to_uar_index(struct mlx5_ib_dev *dev, if (dyn_bfreg) { index_of_sys_page += bfregi->num_static_sys_pages; + + if (index_of_sys_page >= bfregi->num_sys_pages) + return -EINVAL; + if (bfregn > bfregi->num_dyn_bfregs || bfregi->sys_pages[index_of_sys_page] == MLX5_IB_INVALID_UAR_INDEX) { mlx5_ib_dbg(dev, "Invalid dynamic uar index\n"); @@ -819,21 +812,9 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, bfregn = MLX5_CROSS_CHANNEL_BFREG; } else { - bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_HIGH); - if (bfregn < 0) { - mlx5_ib_dbg(dev, "failed to allocate low latency BFREG\n"); - mlx5_ib_dbg(dev, "reverting to medium latency\n"); - bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_MEDIUM); - if (bfregn < 0) { - mlx5_ib_dbg(dev, "failed to allocate medium latency BFREG\n"); - mlx5_ib_dbg(dev, "reverting to high latency\n"); - bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_LOW); - if (bfregn < 0) { - mlx5_ib_warn(dev, "bfreg allocation failed\n"); - return bfregn; - } - } - } + bfregn = alloc_bfreg(dev, &context->bfregi); + if (bfregn < 0) + return bfregn; } mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index); @@ -1626,7 +1607,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_resources *devr = &dev->devr; int inlen = MLX5_ST_SZ_BYTES(create_qp_in); struct mlx5_core_dev *mdev = dev->mdev; - struct mlx5_ib_create_qp_resp resp; + struct mlx5_ib_create_qp_resp resp = {}; struct mlx5_ib_cq *send_cq; struct mlx5_ib_cq *recv_cq; unsigned long flags; @@ -2555,18 +2536,16 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (ah->type == RDMA_AH_ATTR_TYPE_ROCE) { if (!(ah_flags & IB_AH_GRH)) return -EINVAL; - err = mlx5_get_roce_gid_type(dev, port, grh->sgid_index, - &gid_type); - if (err) - return err; + memcpy(path->rmac, ah->roce.dmac, sizeof(ah->roce.dmac)); if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC || qp->ibqp.qp_type == IB_QPT_XRC_INI || qp->ibqp.qp_type == IB_QPT_XRC_TGT) - path->udp_sport = mlx5_get_roce_udp_sport(dev, port, - grh->sgid_index); + path->udp_sport = + mlx5_get_roce_udp_sport(dev, ah->grh.sgid_attr); path->dci_cfi_prio_sl = (sl & 0x7) << 4; + gid_type = ah->grh.sgid_attr->gid_type; if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) path->ecn_dscp = (grh->traffic_class >> 2) & 0x3f; } else { @@ -3529,7 +3508,7 @@ static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg, } static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg, - struct ib_send_wr *wr, void *qend, + const struct ib_send_wr *wr, void *qend, struct mlx5_ib_qp *qp, int *size) { void *seg = eseg; @@ -3582,7 +3561,7 @@ static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg, } static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg, - struct ib_send_wr *wr) + const struct ib_send_wr *wr) { memcpy(&dseg->av, &to_mah(ud_wr(wr)->ah)->av, sizeof(struct mlx5_av)); dseg->av.dqp_dct = cpu_to_be32(ud_wr(wr)->remote_qpn | MLX5_EXTENDED_UD_AV); @@ -3730,9 +3709,9 @@ static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask) static int set_reg_umr_segment(struct mlx5_ib_dev *dev, struct mlx5_wqe_umr_ctrl_seg *umr, - struct ib_send_wr *wr, int atomic) + const struct ib_send_wr *wr, int atomic) { - struct 
mlx5_umr_wr *umrwr = umr_wr(wr); + const struct mlx5_umr_wr *umrwr = umr_wr(wr); memset(umr, 0, sizeof(*umr)); @@ -3803,9 +3782,10 @@ static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg) seg->status = MLX5_MKEY_STATUS_FREE; } -static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr) +static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, + const struct ib_send_wr *wr) { - struct mlx5_umr_wr *umrwr = umr_wr(wr); + const struct mlx5_umr_wr *umrwr = umr_wr(wr); memset(seg, 0, sizeof(*seg)); if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) @@ -3854,7 +3834,7 @@ static void set_reg_umr_inline_seg(void *seg, struct mlx5_ib_qp *qp, seg += mr_list_size; } -static __be32 send_ieth(struct ib_send_wr *wr) +static __be32 send_ieth(const struct ib_send_wr *wr) { switch (wr->opcode) { case IB_WR_SEND_WITH_IMM: @@ -3886,7 +3866,7 @@ static u8 wq_sig(void *wqe) return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4); } -static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr, +static int set_data_inl_seg(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr, void *wqe, int *sz) { struct mlx5_wqe_inline_seg *seg; @@ -4032,7 +4012,7 @@ static int mlx5_set_bsf(struct ib_mr *sig_mr, return 0; } -static int set_sig_data_segment(struct ib_sig_handover_wr *wr, +static int set_sig_data_segment(const struct ib_sig_handover_wr *wr, struct mlx5_ib_qp *qp, void **seg, int *size) { struct ib_sig_attrs *sig_attrs = wr->sig_attrs; @@ -4134,7 +4114,7 @@ static int set_sig_data_segment(struct ib_sig_handover_wr *wr, } static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, - struct ib_sig_handover_wr *wr, u32 size, + const struct ib_sig_handover_wr *wr, u32 size, u32 length, u32 pdn) { struct ib_mr *sig_mr = wr->sig_mr; @@ -4165,10 +4145,10 @@ static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, } -static int set_sig_umr_wr(struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp, - void **seg, int *size) +static int set_sig_umr_wr(const struct ib_send_wr *send_wr, + struct mlx5_ib_qp *qp, void **seg, int *size) { - struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); + const struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr); u32 pdn = get_pd(qp)->pdn; u32 xlt_size; @@ -4243,7 +4223,7 @@ static int set_psv_wr(struct ib_sig_domain *domain, } static int set_reg_wr(struct mlx5_ib_qp *qp, - struct ib_reg_wr *wr, + const struct ib_reg_wr *wr, void **seg, int *size) { struct mlx5_ib_mr *mr = to_mmr(wr->mr); @@ -4314,10 +4294,10 @@ static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16) } } -static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, +static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg, struct mlx5_wqe_ctrl_seg **ctrl, - struct ib_send_wr *wr, unsigned *idx, - int *size, int nreq) + const struct ib_send_wr *wr, unsigned *idx, + int *size, int nreq, bool send_signaled, bool solicited) { if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) return -ENOMEM; @@ -4328,10 +4308,8 @@ static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, *(uint32_t *)(*seg + 8) = 0; (*ctrl)->imm = send_ieth(wr); (*ctrl)->fm_ce_se = qp->sq_signal_bits | - (wr->send_flags & IB_SEND_SIGNALED ? - MLX5_WQE_CTRL_CQ_UPDATE : 0) | - (wr->send_flags & IB_SEND_SOLICITED ? - MLX5_WQE_CTRL_SOLICITED : 0); + (send_signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0) | + (solicited ? 
MLX5_WQE_CTRL_SOLICITED : 0); *seg += sizeof(**ctrl); *size = sizeof(**ctrl) / 16; @@ -4339,6 +4317,16 @@ static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, return 0; } +static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, + struct mlx5_wqe_ctrl_seg **ctrl, + const struct ib_send_wr *wr, unsigned *idx, + int *size, int nreq) +{ + return __begin_wqe(qp, seg, ctrl, wr, idx, size, nreq, + wr->send_flags & IB_SEND_SIGNALED, + wr->send_flags & IB_SEND_SOLICITED); +} + static void finish_wqe(struct mlx5_ib_qp *qp, struct mlx5_wqe_ctrl_seg *ctrl, u8 size, unsigned idx, u64 wr_id, @@ -4360,9 +4348,8 @@ static void finish_wqe(struct mlx5_ib_qp *qp, qp->sq.w_list[idx].next = qp->sq.cur_post; } - -int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr, bool drain) { struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */ struct mlx5_ib_dev *dev = to_mdev(ibqp->device); @@ -4393,7 +4380,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, spin_lock_irqsave(&qp->sq.lock, flags); - if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && !drain) { err = -EIO; *bad_wr = wr; nreq = 0; @@ -4498,10 +4485,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, * SET_PSV WQEs are not signaled and solicited * on error */ - wr->send_flags &= ~IB_SEND_SIGNALED; - wr->send_flags |= IB_SEND_SOLICITED; - err = begin_wqe(qp, &seg, &ctrl, wr, - &idx, &size, nreq); + err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, nreq, false, true); if (err) { mlx5_ib_warn(dev, "\n"); err = -ENOMEM; @@ -4520,8 +4505,8 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq, fence, MLX5_OPCODE_SET_PSV); - err = begin_wqe(qp, &seg, &ctrl, wr, - &idx, &size, nreq); + err = __begin_wqe(qp, &seg, &ctrl, wr, &idx, + &size, nreq, false, true); if (err) { mlx5_ib_warn(dev, "\n"); err = -ENOMEM; @@ -4690,13 +4675,19 @@ out: return err; } +int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + return _mlx5_ib_post_send(ibqp, wr, bad_wr, false); +} + static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size) { sig->signature = calc_sig(sig, size); } -int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +static int _mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr, bool drain) { struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_wqe_data_seg *scat; @@ -4714,7 +4705,7 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, spin_lock_irqsave(&qp->rq.lock, flags); - if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && !drain) { err = -EIO; *bad_wr = wr; nreq = 0; @@ -4776,6 +4767,12 @@ out: return err; } +int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + return _mlx5_ib_post_recv(ibqp, wr, bad_wr, false); +} + static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state) { switch (mlx5_state) { @@ -5365,7 +5362,9 @@ static int set_user_rq_size(struct mlx5_ib_dev *dev, rwq->wqe_count = ucmd->rq_wqe_count; rwq->wqe_shift = ucmd->rq_wqe_shift; - rwq->buf_size = (rwq->wqe_count << rwq->wqe_shift); + if (check_shl_overflow(rwq->wqe_count, 
rwq->wqe_shift, &rwq->buf_size)) + return -EINVAL; + rwq->log_rq_stride = rwq->wqe_shift; rwq->log_rq_size = ilog2(rwq->wqe_count); return 0; @@ -5697,3 +5696,132 @@ out: kvfree(in); return err; } + +struct mlx5_ib_drain_cqe { + struct ib_cqe cqe; + struct completion done; +}; + +static void mlx5_ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct mlx5_ib_drain_cqe *cqe = container_of(wc->wr_cqe, + struct mlx5_ib_drain_cqe, + cqe); + + complete(&cqe->done); +} + +/* This function returns only once the drained WR was completed */ +static void handle_drain_completion(struct ib_cq *cq, + struct mlx5_ib_drain_cqe *sdrain, + struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + + if (cq->poll_ctx == IB_POLL_DIRECT) { + while (wait_for_completion_timeout(&sdrain->done, HZ / 10) <= 0) + ib_process_cq_direct(cq, -1); + return; + } + + if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + struct mlx5_ib_cq *mcq = to_mcq(cq); + bool triggered = false; + unsigned long flags; + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + /* Make sure that the CQ handler won't run if wasn't run yet */ + if (!mcq->mcq.reset_notify_added) + mcq->mcq.reset_notify_added = 1; + else + triggered = true; + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); + + if (triggered) { + /* Wait for any scheduled/running task to be ended */ + switch (cq->poll_ctx) { + case IB_POLL_SOFTIRQ: + irq_poll_disable(&cq->iop); + irq_poll_enable(&cq->iop); + break; + case IB_POLL_WORKQUEUE: + cancel_work_sync(&cq->work); + break; + default: + WARN_ON_ONCE(1); + } + } + + /* Run the CQ handler - this makes sure that the drain WR will + * be processed if wasn't processed yet. + */ + mcq->mcq.comp(&mcq->mcq); + } + + wait_for_completion(&sdrain->done); +} + +void mlx5_ib_drain_sq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->send_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct mlx5_ib_drain_cqe sdrain; + const struct ib_send_wr *bad_swr; + struct ib_rdma_wr swr = { + .wr = { + .next = NULL, + { .wr_cqe = &sdrain.cqe, }, + .opcode = IB_WR_RDMA_WRITE, + }, + }; + int ret; + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_core_dev *mdev = dev->mdev; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret && mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + sdrain.cqe.done = mlx5_ib_drain_qp_done; + init_completion(&sdrain.done); + + ret = _mlx5_ib_post_send(qp, &swr.wr, &bad_swr, true); + if (ret) { + WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); + return; + } + + handle_drain_completion(cq, &sdrain, dev); +} + +void mlx5_ib_drain_rq(struct ib_qp *qp) +{ + struct ib_cq *cq = qp->recv_cq; + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; + struct mlx5_ib_drain_cqe rdrain; + struct ib_recv_wr rwr = {}; + const struct ib_recv_wr *bad_rwr; + int ret; + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_core_dev *mdev = dev->mdev; + + ret = ib_modify_qp(qp, &attr, IB_QP_STATE); + if (ret && mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = mlx5_ib_drain_qp_done; + init_completion(&rdrain.done); + + ret = _mlx5_ib_post_recv(qp, &rwr, &bad_rwr, true); + if (ret) { + WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); + return; + } + + handle_drain_completion(cq, &rdrain, dev); +} diff --git a/drivers/infiniband/hw/mlx5/srq.c 
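The set_user_rq_size() hunk above also replaces the raw shift with check_shl_overflow(), so an oversized user-supplied wqe_count/wqe_shift pair now fails with -EINVAL instead of silently wrapping. Further down, mlx5 gains its own mlx5_ib_drain_sq()/mlx5_ib_drain_rq() because draining depends on post_send()/post_recv() still accepting work requests while the device sits in internal error, which is what the new drain flag on _mlx5_ib_post_send()/_mlx5_ib_post_recv() permits. A sketch of the underlying drain idiom, with hypothetical drain_sq_sketch()/drain_done() names and the internal-error and IB_POLL_DIRECT handling of the real code omitted:

#include <linux/completion.h>
#include <rdma/ib_verbs.h>

struct drain_cqe {
        struct ib_cqe cqe;
        struct completion done;
};

static void drain_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct drain_cqe *d = container_of(wc->wr_cqe, struct drain_cqe, cqe);

        complete(&d->done);
}

static void drain_sq_sketch(struct ib_qp *qp)
{
        struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
        struct drain_cqe sdrain;
        const struct ib_send_wr *bad_wr;
        struct ib_rdma_wr swr = {
                .wr = {
                        .wr_cqe = &sdrain.cqe,
                        .opcode = IB_WR_RDMA_WRITE,
                },
        };

        /* Move the QP to error so the marker WR completes (flushed). */
        if (ib_modify_qp(qp, &attr, IB_QP_STATE))
                return;

        sdrain.cqe.done = drain_done;
        init_completion(&sdrain.done);

        /* Post one marker WR and wait for its completion callback. */
        if (ib_post_send(qp, &swr.wr, &bad_wr))
                return;

        wait_for_completion(&sdrain.done);
}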
b/drivers/infiniband/hw/mlx5/srq.c index f5de5adc9b1a..d359fecf7a5b 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -446,8 +446,8 @@ void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index) spin_unlock(&srq->lock); } -int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mlx5_ib_srq *srq = to_msrq(ibsrq); struct mlx5_wqe_srq_next_seg *next; diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c index e7f6223e9c60..0823c0bc7e73 100644 --- a/drivers/infiniband/hw/mthca/mthca_av.c +++ b/drivers/infiniband/hw/mthca/mthca_av.c @@ -281,10 +281,7 @@ int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah, header->grh.flow_label = ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff); header->grh.hop_limit = ah->av->hop_limit; - ib_get_cached_gid(&dev->ib_dev, - be32_to_cpu(ah->av->port_pd) >> 24, - ah->av->gid_index % dev->limits.gid_table_len, - &header->grh.source_gid, NULL); + header->grh.source_gid = ah->ibah.sgid_attr->gid; memcpy(header->grh.destination_gid.raw, ah->av->dgid, 16); } diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h index 5508afbf1c67..220a3e4717a3 100644 --- a/drivers/infiniband/hw/mthca/mthca_dev.h +++ b/drivers/infiniband/hw/mthca/mthca_dev.h @@ -519,10 +519,10 @@ int mthca_max_srq_sge(struct mthca_dev *dev); void mthca_srq_event(struct mthca_dev *dev, u32 srqn, enum ib_event_type event_type); void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr); -int mthca_tavor_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); -int mthca_arbel_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mthca_tavor_post_srq_recv(struct ib_srq *srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int mthca_arbel_post_srq_recv(struct ib_srq *srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); void mthca_qp_event(struct mthca_dev *dev, u32 qpn, enum ib_event_type event_type); @@ -530,14 +530,14 @@ int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_m struct ib_qp_init_attr *qp_init_attr); int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); -int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); -int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int mthca_tavor_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mthca_tavor_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int mthca_arbel_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int mthca_arbel_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send, int index, int *dbd, __be32 *new_wqe); int mthca_alloc_qp(struct mthca_dev *dev, diff --git 
a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 541f237965c7..0d3473b4596e 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -96,8 +96,9 @@ static int mthca_query_device(struct ib_device *ibdev, struct ib_device_attr *pr props->page_size_cap = mdev->limits.page_size_cap; props->max_qp = mdev->limits.num_qps - mdev->limits.reserved_qps; props->max_qp_wr = mdev->limits.max_wqes; - props->max_sge = mdev->limits.max_sg; - props->max_sge_rd = props->max_sge; + props->max_send_sge = mdev->limits.max_sg; + props->max_recv_sge = mdev->limits.max_sg; + props->max_sge_rd = mdev->limits.max_sg; props->max_cq = mdev->limits.num_cqs - mdev->limits.reserved_cqs; props->max_cqe = mdev->limits.max_cqes; props->max_mr = mdev->limits.num_mpts - mdev->limits.reserved_mrws; @@ -448,7 +449,7 @@ static struct ib_srq *mthca_create_srq(struct ib_pd *pd, int err; if (init_attr->srq_type != IB_SRQT_BASIC) - return ERR_PTR(-ENOSYS); + return ERR_PTR(-EOPNOTSUPP); srq = kmalloc(sizeof *srq, GFP_KERNEL); if (!srq) diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index af1c49d70b89..3d37f2373d63 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1488,7 +1488,7 @@ void mthca_free_qp(struct mthca_dev *dev, /* Create UD header for an MLX send and build a data segment for it */ static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp, - int ind, struct ib_ud_wr *wr, + int ind, const struct ib_ud_wr *wr, struct mthca_mlx_seg *mlx, struct mthca_data_seg *data) { @@ -1581,7 +1581,7 @@ static __always_inline void set_raddr_seg(struct mthca_raddr_seg *rseg, } static __always_inline void set_atomic_seg(struct mthca_atomic_seg *aseg, - struct ib_atomic_wr *wr) + const struct ib_atomic_wr *wr) { if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { aseg->swap_add = cpu_to_be64(wr->swap); @@ -1594,7 +1594,7 @@ static __always_inline void set_atomic_seg(struct mthca_atomic_seg *aseg, } static void set_tavor_ud_seg(struct mthca_tavor_ud_seg *useg, - struct ib_ud_wr *wr) + const struct ib_ud_wr *wr) { useg->lkey = cpu_to_be32(to_mah(wr->ah)->key); useg->av_addr = cpu_to_be64(to_mah(wr->ah)->avdma); @@ -1604,15 +1604,15 @@ static void set_tavor_ud_seg(struct mthca_tavor_ud_seg *useg, } static void set_arbel_ud_seg(struct mthca_arbel_ud_seg *useg, - struct ib_ud_wr *wr) + const struct ib_ud_wr *wr) { memcpy(useg->av, to_mah(wr->ah)->av, MTHCA_AV_SIZE); useg->dqpn = cpu_to_be32(wr->remote_qpn); useg->qkey = cpu_to_be32(wr->remote_qkey); } -int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int mthca_tavor_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); @@ -1814,8 +1814,8 @@ out: return err; } -int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mthca_tavor_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); @@ -1925,8 +1925,8 @@ out: return err; } -int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int mthca_arbel_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { 
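The mthca changes that follow track the two tree-wide themes of this series: max_sge is split into max_send_sge/max_recv_sge so asymmetric limits can be reported, and the post verbs take const WR lists, which is also why mlx5's begin_wqe() was reworked earlier to stop editing wr->send_flags in place. A driver stub under the new const prototypes, with purely illustrative names, might look like this:

#include <rdma/ib_verbs.h>

#define EXAMPLE_MAX_SEND_SGE 4  /* illustrative limit */

static int example_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
                             const struct ib_send_wr **bad_wr)
{
        for (; wr; wr = wr->next) {
                if (wr->num_sge > EXAMPLE_MAX_SEND_SGE) {
                        /* Report the failing WR without modifying it. */
                        *bad_wr = wr;
                        return -EINVAL;
                }
                /* build the hardware WQE and ring the doorbell here */
        }

        return 0;
}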
struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); @@ -2165,8 +2165,8 @@ out: return err; } -int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mthca_arbel_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mthca_dev *dev = to_mdev(ibqp->device); struct mthca_qp *qp = to_mqp(ibqp); diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c index f79732bc73b4..9a3fc6fb0d7e 100644 --- a/drivers/infiniband/hw/mthca/mthca_srq.c +++ b/drivers/infiniband/hw/mthca/mthca_srq.c @@ -472,8 +472,8 @@ void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr) spin_unlock(&srq->lock); } -int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mthca_dev *dev = to_mdev(ibsrq->device); struct mthca_srq *srq = to_msrq(ibsrq); @@ -572,8 +572,8 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, return err; } -int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct mthca_dev *dev = to_mdev(ibsrq->device); struct mthca_srq *srq = to_msrq(ibsrq); diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index 00c27291dc26..bedaa02749fb 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -159,7 +159,7 @@ do { \ #define NES_EVENT_TIMEOUT 1200000 #else -#define nes_debug(level, fmt, args...) +#define nes_debug(level, fmt, args...) 
no_printk(fmt, ##args) #define assert(expr) do {} while (0) #define NES_EVENT_TIMEOUT 100000 diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 6cdfbf8c5674..2b67ace5b614 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -58,6 +58,7 @@ #include <net/neighbour.h> #include <net/route.h> #include <net/ip_fib.h> +#include <net/secure_seq.h> #include <net/tcp.h> #include <linux/fcntl.h> @@ -1445,7 +1446,6 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, struct nes_cm_listener *listener) { struct nes_cm_node *cm_node; - struct timespec ts; int oldarpindex = 0; int arpindex = 0; struct nes_device *nesdev; @@ -1496,8 +1496,10 @@ static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core, cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE; cm_node->tcp_cntxt.rcv_wnd = NES_CM_DEFAULT_RCV_WND_SCALED >> NES_CM_DEFAULT_RCV_WND_SCALE; - ts = current_kernel_time(); - cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec); + cm_node->tcp_cntxt.loc_seq_num = secure_tcp_seq(htonl(cm_node->loc_addr), + htonl(cm_node->rem_addr), + htons(cm_node->loc_port), + htons(cm_node->rem_port)); cm_node->tcp_cntxt.mss = nesvnic->max_frame_size - sizeof(struct iphdr) - sizeof(struct tcphdr) - ETH_HLEN - VLAN_HLEN; cm_node->tcp_cntxt.rcv_nxt = 0; diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index 18a7de1c3923..bd0675d8f298 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -70,8 +70,7 @@ static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number); static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode); static void nes_terminate_start_timer(struct nes_qp *nesqp); -#ifdef CONFIG_INFINIBAND_NES_DEBUG -static unsigned char *nes_iwarp_state_str[] = { +static const char *const nes_iwarp_state_str[] = { "Non-Existent", "Idle", "RTS", @@ -82,7 +81,7 @@ static unsigned char *nes_iwarp_state_str[] = { "RSVD2", }; -static unsigned char *nes_tcp_state_str[] = { +static const char *const nes_tcp_state_str[] = { "Non-Existent", "Closed", "Listen", @@ -100,7 +99,6 @@ static unsigned char *nes_tcp_state_str[] = { "RSVD3", "RSVD4", }; -#endif static inline void print_ip(struct nes_cm_node *cm_node) { diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 32f26556c808..6940c7215961 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -436,7 +436,8 @@ static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *prop props->max_mr_size = 0x80000000; props->max_qp = nesibdev->max_qp; props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2; - props->max_sge = nesdev->nesadapter->max_sge; + props->max_send_sge = nesdev->nesadapter->max_sge; + props->max_recv_sge = nesdev->nesadapter->max_sge; props->max_cq = nesibdev->max_cq; props->max_cqe = nesdev->nesadapter->max_cqe; props->max_mr = nesibdev->max_mr; @@ -754,26 +755,6 @@ static int nes_dealloc_pd(struct ib_pd *ibpd) /** - * nes_create_ah - */ -static struct ib_ah *nes_create_ah(struct ib_pd *pd, - struct rdma_ah_attr *ah_attr, - struct ib_udata *udata) -{ - return ERR_PTR(-ENOSYS); -} - - -/** - * nes_destroy_ah - */ -static int nes_destroy_ah(struct ib_ah *ah) -{ - return -ENOSYS; -} - - -/** * nes_get_encoded_size */ static inline u8 nes_get_encoded_size(int *size) @@ -3004,42 +2985,9 @@ int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr 
*attr, return err; } - -/** - * nes_muticast_attach - */ -static int nes_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) -{ - nes_debug(NES_DBG_INIT, "\n"); - return -ENOSYS; -} - - -/** - * nes_multicast_detach - */ -static int nes_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) -{ - nes_debug(NES_DBG_INIT, "\n"); - return -ENOSYS; -} - - -/** - * nes_process_mad - */ -static int nes_process_mad(struct ib_device *ibdev, int mad_flags, - u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, - const struct ib_mad_hdr *in, size_t in_mad_size, - struct ib_mad_hdr *out, size_t *out_mad_size, - u16 *out_mad_pkey_index) -{ - nes_debug(NES_DBG_INIT, "\n"); - return -ENOSYS; -} - static inline void -fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, struct ib_send_wr *ib_wr, u32 uselkey) +fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, const struct ib_send_wr *ib_wr, + u32 uselkey) { int sge_index; int total_payload_length = 0; @@ -3065,8 +3013,8 @@ fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, struct ib_send_wr *ib_wr, u32 uselke /** * nes_post_send */ -static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr, - struct ib_send_wr **bad_wr) +static int nes_post_send(struct ib_qp *ibqp, const struct ib_send_wr *ib_wr, + const struct ib_send_wr **bad_wr) { u64 u64temp; unsigned long flags = 0; @@ -3327,8 +3275,8 @@ out: /** * nes_post_recv */ -static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr, - struct ib_recv_wr **bad_wr) +static int nes_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *ib_wr, + const struct ib_recv_wr **bad_wr) { u64 u64temp; unsigned long flags = 0; @@ -3735,8 +3683,6 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) nesibdev->ibdev.mmap = nes_mmap; nesibdev->ibdev.alloc_pd = nes_alloc_pd; nesibdev->ibdev.dealloc_pd = nes_dealloc_pd; - nesibdev->ibdev.create_ah = nes_create_ah; - nesibdev->ibdev.destroy_ah = nes_destroy_ah; nesibdev->ibdev.create_qp = nes_create_qp; nesibdev->ibdev.modify_qp = nes_modify_qp; nesibdev->ibdev.query_qp = nes_query_qp; @@ -3753,10 +3699,6 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) nesibdev->ibdev.alloc_mr = nes_alloc_mr; nesibdev->ibdev.map_mr_sg = nes_map_mr_sg; - nesibdev->ibdev.attach_mcast = nes_multicast_attach; - nesibdev->ibdev.detach_mcast = nes_multicast_detach; - nesibdev->ibdev.process_mad = nes_process_mad; - nesibdev->ibdev.req_notify_cq = nes_req_notify_cq; nesibdev->ibdev.post_send = nes_post_send; nesibdev->ibdev.post_recv = nes_post_recv; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index 3897b64532e1..58188fe5aed2 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -71,7 +71,7 @@ static u16 ocrdma_hdr_type_to_proto_num(int devid, u8 hdr_type) } static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, - struct rdma_ah_attr *attr, union ib_gid *sgid, + struct rdma_ah_attr *attr, const union ib_gid *sgid, int pdid, bool *isvlan, u16 vlan_tag) { int status; @@ -164,17 +164,14 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr, struct ocrdma_ah *ah; bool isvlan = false; u16 vlan_tag = 0xffff; - struct ib_gid_attr sgid_attr; + const struct ib_gid_attr *sgid_attr; struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); - const struct ib_global_route *grh; - union ib_gid sgid; if ((attr->type != RDMA_AH_ATTR_TYPE_ROCE) || 
!(rdma_ah_get_ah_flags(attr) & IB_AH_GRH)) return ERR_PTR(-EINVAL); - grh = rdma_ah_read_grh(attr); if (atomic_cmpxchg(&dev->update_sl, 1, 0)) ocrdma_init_service_level(dev); @@ -186,20 +183,15 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr, if (status) goto av_err; - status = ib_get_cached_gid(&dev->ibdev, 1, grh->sgid_index, &sgid, - &sgid_attr); - if (status) { - pr_err("%s(): Failed to query sgid, status = %d\n", - __func__, status); - goto av_conf_err; - } - if (is_vlan_dev(sgid_attr.ndev)) - vlan_tag = vlan_dev_vlan_id(sgid_attr.ndev); - dev_put(sgid_attr.ndev); + sgid_attr = attr->grh.sgid_attr; + if (is_vlan_dev(sgid_attr->ndev)) + vlan_tag = vlan_dev_vlan_id(sgid_attr->ndev); + /* Get network header type for this GID */ - ah->hdr_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); + ah->hdr_type = rdma_gid_attr_network_type(sgid_attr); - status = set_av_attr(dev, ah, attr, &sgid, pd->id, &isvlan, vlan_tag); + status = set_av_attr(dev, ah, attr, &sgid_attr->gid, pd->id, + &isvlan, vlan_tag); if (status) goto av_conf_err; @@ -262,12 +254,6 @@ int ocrdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) return 0; } -int ocrdma_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) -{ - /* modify_ah is unsupported */ - return -ENOSYS; -} - int ocrdma_process_mad(struct ib_device *ibdev, int process_mad_flags, u8 port_num, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h index 1a65c47945aa..c0c32c9b80ae 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h @@ -55,7 +55,6 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata); int ocrdma_destroy_ah(struct ib_ah *ah); int ocrdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); -int ocrdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int ocrdma_process_mad(struct ib_device *, int process_mad_flags, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 6c136e5017fe..e578281471af 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1365,8 +1365,9 @@ static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev) dev->hba_port_num = (hba_attribs->ptpnum_maxdoms_hbast_cv & OCRDMA_HBA_ATTRB_PTNUM_MASK) >> OCRDMA_HBA_ATTRB_PTNUM_SHIFT; - strncpy(dev->model_number, - hba_attribs->controller_model_number, 31); + strlcpy(dev->model_number, + hba_attribs->controller_model_number, + sizeof(dev->model_number)); } dma_free_coherent(&dev->nic_info.pdev->dev, dma.size, dma.va, dma.pa); free_mqe: @@ -2494,8 +2495,7 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, { int status; struct rdma_ah_attr *ah_attr = &attrs->ah_attr; - union ib_gid sgid; - struct ib_gid_attr sgid_attr; + const struct ib_gid_attr *sgid_attr; u32 vlan_id = 0xFFFF; u8 mac_addr[6], hdr_type; union { @@ -2525,25 +2525,23 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, memcpy(&cmd->params.dgid[0], &grh->dgid.raw[0], sizeof(cmd->params.dgid)); - status = ib_get_cached_gid(&dev->ibdev, 1, grh->sgid_index, - &sgid, &sgid_attr); - if (!status) { - vlan_id = rdma_vlan_dev_vlan_id(sgid_attr.ndev); - memcpy(mac_addr, sgid_attr.ndev->dev_addr, ETH_ALEN); - dev_put(sgid_attr.ndev); - } + sgid_attr = ah_attr->grh.sgid_attr; + vlan_id = rdma_vlan_dev_vlan_id(sgid_attr->ndev); + memcpy(mac_addr, sgid_attr->ndev->dev_addr, ETH_ALEN); qp->sgid_idx = grh->sgid_index; - 
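ocrdma is converted here to the reworked GID cache: instead of copying the GID out with ib_get_cached_gid() and dropping the netdev reference by hand, the address-handle attribute now carries a reference-counted const struct ib_gid_attr * that pins the GID entry and its ndev, and helpers such as rdma_gid_attr_network_type() read the network type straight from it. A small sketch of the new usage, with a hypothetical example_ah_vlan() helper:

#include <linux/if_vlan.h>
#include <rdma/ib_verbs.h>

static u16 example_ah_vlan(const struct rdma_ah_attr *ah_attr)
{
        const struct ib_gid_attr *sgid_attr = ah_attr->grh.sgid_attr;

        /* No ib_get_cached_gid()/dev_put() pair: the attribute holds a
         * reference to the GID entry (and its ndev) for the AH lifetime. */
        if (is_vlan_dev(sgid_attr->ndev))
                return vlan_dev_vlan_id(sgid_attr->ndev);

        return 0xffff;  /* no VLAN */
}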
memcpy(&cmd->params.sgid[0], &sgid.raw[0], sizeof(cmd->params.sgid)); + memcpy(&cmd->params.sgid[0], &sgid_attr->gid.raw[0], + sizeof(cmd->params.sgid)); status = ocrdma_resolve_dmac(dev, ah_attr, &mac_addr[0]); if (status) return status; + cmd->params.dmac_b0_to_b3 = mac_addr[0] | (mac_addr[1] << 8) | (mac_addr[2] << 16) | (mac_addr[3] << 24); - hdr_type = ib_gid_to_network_type(sgid_attr.gid_type, &sgid); + hdr_type = rdma_gid_attr_network_type(sgid_attr); if (hdr_type == RDMA_NETWORK_IPV4) { - rdma_gid2ip(&sgid_addr._sockaddr, &sgid); + rdma_gid2ip(&sgid_addr._sockaddr, &sgid_attr->gid); rdma_gid2ip(&dgid_addr._sockaddr, &grh->dgid); memcpy(&cmd->params.dgid[0], &dgid_addr._sockaddr_in.sin_addr.s_addr, 4); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 5962c0ed9847..7832ee3e0c84 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -176,7 +176,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.create_ah = ocrdma_create_ah; dev->ibdev.destroy_ah = ocrdma_destroy_ah; dev->ibdev.query_ah = ocrdma_query_ah; - dev->ibdev.modify_ah = ocrdma_modify_ah; dev->ibdev.poll_cq = ocrdma_poll_cq; dev->ibdev.post_send = ocrdma_post_send; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 82e20fc32890..c158ca9fde6d 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -89,7 +89,8 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_MGT_EXTENSIONS; - attr->max_sge = min(dev->attr.max_send_sge, dev->attr.max_recv_sge); + attr->max_send_sge = dev->attr.max_send_sge; + attr->max_recv_sge = dev->attr.max_recv_sge; attr->max_sge_rd = dev->attr.max_rdma_sge; attr->max_cq = dev->attr.max_cq; attr->max_cqe = dev->attr.max_cqe; @@ -196,11 +197,10 @@ int ocrdma_query_port(struct ib_device *ibdev, props->sm_lid = 0; props->sm_sl = 0; props->state = port_state; - props->port_cap_flags = - IB_PORT_CM_SUP | - IB_PORT_REINIT_SUP | - IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | - IB_PORT_IP_BASED_GIDS; + props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP | + IB_PORT_DEVICE_MGMT_SUP | + IB_PORT_VENDOR_CLASS_SUP; + props->ip_gids = true; props->gid_tbl_len = OCRDMA_MAX_SGID; props->pkey_tbl_len = 1; props->bad_pkey_cntr = 0; @@ -1774,13 +1774,13 @@ int ocrdma_destroy_qp(struct ib_qp *ibqp) * protect against proessing in-flight CQEs for this QP. */ spin_lock_irqsave(&qp->sq_cq->cq_lock, flags); - if (qp->rq_cq && (qp->rq_cq != qp->sq_cq)) + if (qp->rq_cq && (qp->rq_cq != qp->sq_cq)) { spin_lock(&qp->rq_cq->cq_lock); - - ocrdma_del_qpn_map(dev, qp); - - if (qp->rq_cq && (qp->rq_cq != qp->sq_cq)) + ocrdma_del_qpn_map(dev, qp); spin_unlock(&qp->rq_cq->cq_lock); + } else { + ocrdma_del_qpn_map(dev, qp); + } spin_unlock_irqrestore(&qp->sq_cq->cq_lock, flags); if (!pd->uctx) { @@ -1953,7 +1953,7 @@ int ocrdma_destroy_srq(struct ib_srq *ibsrq) /* unprivileged verbs and their support functions. 
*/ static void ocrdma_build_ud_hdr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, - struct ib_send_wr *wr) + const struct ib_send_wr *wr) { struct ocrdma_ewqe_ud_hdr *ud_hdr = (struct ocrdma_ewqe_ud_hdr *)(hdr + 1); @@ -2000,7 +2000,7 @@ static inline uint32_t ocrdma_sglist_len(struct ib_sge *sg_list, int num_sge) static int ocrdma_build_inline_sges(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, struct ocrdma_sge *sge, - struct ib_send_wr *wr, u32 wqe_size) + const struct ib_send_wr *wr, u32 wqe_size) { int i; char *dpp_addr; @@ -2038,7 +2038,7 @@ static int ocrdma_build_inline_sges(struct ocrdma_qp *qp, } static int ocrdma_build_send(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, - struct ib_send_wr *wr) + const struct ib_send_wr *wr) { int status; struct ocrdma_sge *sge; @@ -2057,7 +2057,7 @@ static int ocrdma_build_send(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, } static int ocrdma_build_write(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, - struct ib_send_wr *wr) + const struct ib_send_wr *wr) { int status; struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1); @@ -2075,7 +2075,7 @@ static int ocrdma_build_write(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, } static void ocrdma_build_read(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, - struct ib_send_wr *wr) + const struct ib_send_wr *wr) { struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1); struct ocrdma_sge *sge = ext_rw + 1; @@ -2105,7 +2105,7 @@ static int get_encoded_page_size(int pg_sz) static int ocrdma_build_reg(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, - struct ib_reg_wr *wr) + const struct ib_reg_wr *wr) { u64 fbo; struct ocrdma_ewqe_fr *fast_reg = (struct ocrdma_ewqe_fr *)(hdr + 1); @@ -2166,8 +2166,8 @@ static void ocrdma_ring_sq_db(struct ocrdma_qp *qp) iowrite32(val, qp->sq_db); } -int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int ocrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { int status = 0; struct ocrdma_qp *qp = get_ocrdma_qp(ibqp); @@ -2278,8 +2278,8 @@ static void ocrdma_ring_rq_db(struct ocrdma_qp *qp) iowrite32(val, qp->rq_db); } -static void ocrdma_build_rqe(struct ocrdma_hdr_wqe *rqe, struct ib_recv_wr *wr, - u16 tag) +static void ocrdma_build_rqe(struct ocrdma_hdr_wqe *rqe, + const struct ib_recv_wr *wr, u16 tag) { u32 wqe_size = 0; struct ocrdma_sge *sge; @@ -2299,8 +2299,8 @@ static void ocrdma_build_rqe(struct ocrdma_hdr_wqe *rqe, struct ib_recv_wr *wr, ocrdma_cpu_to_le32(rqe, wqe_size); } -int ocrdma_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int ocrdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { int status = 0; unsigned long flags; @@ -2369,8 +2369,8 @@ static void ocrdma_ring_srq_db(struct ocrdma_srq *srq) iowrite32(val, srq->db + OCRDMA_DB_GEN2_SRQ_OFFSET); } -int ocrdma_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int ocrdma_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { int status = 0; unsigned long flags; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index 9a9971708646..b69cfdce7970 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -43,10 +43,10 @@ #ifndef __OCRDMA_VERBS_H__ #define __OCRDMA_VERBS_H__ -int ocrdma_post_send(struct ib_qp *, struct 
ib_send_wr *, - struct ib_send_wr **bad_wr); -int ocrdma_post_recv(struct ib_qp *, struct ib_recv_wr *, - struct ib_recv_wr **bad_wr); +int ocrdma_post_send(struct ib_qp *, const struct ib_send_wr *, + const struct ib_send_wr **bad_wr); +int ocrdma_post_recv(struct ib_qp *, const struct ib_recv_wr *, + const struct ib_recv_wr **bad_wr); int ocrdma_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc); int ocrdma_arm_cq(struct ib_cq *, enum ib_cq_notify_flags flags); @@ -100,8 +100,8 @@ int ocrdma_modify_srq(struct ib_srq *, struct ib_srq_attr *, enum ib_srq_attr_mask, struct ib_udata *); int ocrdma_query_srq(struct ib_srq *, struct ib_srq_attr *); int ocrdma_destroy_srq(struct ib_srq *); -int ocrdma_post_srq_recv(struct ib_srq *, struct ib_recv_wr *, - struct ib_recv_wr **bad_recv_wr); +int ocrdma_post_srq_recv(struct ib_srq *, const struct ib_recv_wr *, + const struct ib_recv_wr **bad_recv_wr); int ocrdma_dereg_mr(struct ib_mr *); struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc); diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index ad22b32bbd9c..a0af6d424aed 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -191,6 +191,11 @@ static int qedr_register_device(struct qedr_dev *dev) QEDR_UVERBS(MODIFY_QP) | QEDR_UVERBS(QUERY_QP) | QEDR_UVERBS(DESTROY_QP) | + QEDR_UVERBS(CREATE_SRQ) | + QEDR_UVERBS(DESTROY_SRQ) | + QEDR_UVERBS(QUERY_SRQ) | + QEDR_UVERBS(MODIFY_SRQ) | + QEDR_UVERBS(POST_SRQ_RECV) | QEDR_UVERBS(REG_MR) | QEDR_UVERBS(DEREG_MR) | QEDR_UVERBS(POLL_CQ) | @@ -229,6 +234,11 @@ static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.query_qp = qedr_query_qp; dev->ibdev.destroy_qp = qedr_destroy_qp; + dev->ibdev.create_srq = qedr_create_srq; + dev->ibdev.destroy_srq = qedr_destroy_srq; + dev->ibdev.modify_srq = qedr_modify_srq; + dev->ibdev.query_srq = qedr_query_srq; + dev->ibdev.post_srq_recv = qedr_post_srq_recv; dev->ibdev.query_pkey = qedr_query_pkey; dev->ibdev.create_ah = qedr_create_ah; @@ -325,8 +335,8 @@ static int qedr_alloc_resources(struct qedr_dev *dev) spin_lock_init(&dev->sgid_lock); if (IS_IWARP(dev)) { - spin_lock_init(&dev->idr_lock); - idr_init(&dev->qpidr); + spin_lock_init(&dev->qpidr.idr_lock); + idr_init(&dev->qpidr.idr); dev->iwarp_wq = create_singlethread_workqueue("qedr_iwarpq"); } @@ -653,42 +663,70 @@ static void qedr_affiliated_event(void *context, u8 e_code, void *fw_handle) #define EVENT_TYPE_NOT_DEFINED 0 #define EVENT_TYPE_CQ 1 #define EVENT_TYPE_QP 2 +#define EVENT_TYPE_SRQ 3 struct qedr_dev *dev = (struct qedr_dev *)context; struct regpair *async_handle = (struct regpair *)fw_handle; u64 roce_handle64 = ((u64) async_handle->hi << 32) + async_handle->lo; u8 event_type = EVENT_TYPE_NOT_DEFINED; struct ib_event event; + struct ib_srq *ibsrq; + struct qedr_srq *srq; + unsigned long flags; struct ib_cq *ibcq; struct ib_qp *ibqp; struct qedr_cq *cq; struct qedr_qp *qp; + u16 srq_id; - switch (e_code) { - case ROCE_ASYNC_EVENT_CQ_OVERFLOW_ERR: - event.event = IB_EVENT_CQ_ERR; - event_type = EVENT_TYPE_CQ; - break; - case ROCE_ASYNC_EVENT_SQ_DRAINED: - event.event = IB_EVENT_SQ_DRAINED; - event_type = EVENT_TYPE_QP; - break; - case ROCE_ASYNC_EVENT_QP_CATASTROPHIC_ERR: - event.event = IB_EVENT_QP_FATAL; - event_type = EVENT_TYPE_QP; - break; - case ROCE_ASYNC_EVENT_LOCAL_INVALID_REQUEST_ERR: - event.event = IB_EVENT_QP_REQ_ERR; - event_type = EVENT_TYPE_QP; - break; - case ROCE_ASYNC_EVENT_LOCAL_ACCESS_ERR: - event.event = IB_EVENT_QP_ACCESS_ERR; - event_type = 
EVENT_TYPE_QP; - break; - default: + if (IS_ROCE(dev)) { + switch (e_code) { + case ROCE_ASYNC_EVENT_CQ_OVERFLOW_ERR: + event.event = IB_EVENT_CQ_ERR; + event_type = EVENT_TYPE_CQ; + break; + case ROCE_ASYNC_EVENT_SQ_DRAINED: + event.event = IB_EVENT_SQ_DRAINED; + event_type = EVENT_TYPE_QP; + break; + case ROCE_ASYNC_EVENT_QP_CATASTROPHIC_ERR: + event.event = IB_EVENT_QP_FATAL; + event_type = EVENT_TYPE_QP; + break; + case ROCE_ASYNC_EVENT_LOCAL_INVALID_REQUEST_ERR: + event.event = IB_EVENT_QP_REQ_ERR; + event_type = EVENT_TYPE_QP; + break; + case ROCE_ASYNC_EVENT_LOCAL_ACCESS_ERR: + event.event = IB_EVENT_QP_ACCESS_ERR; + event_type = EVENT_TYPE_QP; + break; + case ROCE_ASYNC_EVENT_SRQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + event_type = EVENT_TYPE_SRQ; + break; + case ROCE_ASYNC_EVENT_SRQ_EMPTY: + event.event = IB_EVENT_SRQ_ERR; + event_type = EVENT_TYPE_SRQ; + break; + default: + DP_ERR(dev, "unsupported event %d on handle=%llx\n", + e_code, roce_handle64); + } + } else { + switch (e_code) { + case QED_IWARP_EVENT_SRQ_LIMIT: + event.event = IB_EVENT_SRQ_LIMIT_REACHED; + event_type = EVENT_TYPE_SRQ; + break; + case QED_IWARP_EVENT_SRQ_EMPTY: + event.event = IB_EVENT_SRQ_ERR; + event_type = EVENT_TYPE_SRQ; + break; + default: DP_ERR(dev, "unsupported event %d on handle=%llx\n", e_code, roce_handle64); + } } - switch (event_type) { case EVENT_TYPE_CQ: cq = (struct qedr_cq *)(uintptr_t)roce_handle64; @@ -722,6 +760,25 @@ static void qedr_affiliated_event(void *context, u8 e_code, void *fw_handle) } DP_ERR(dev, "QP event %d on handle %p\n", e_code, qp); break; + case EVENT_TYPE_SRQ: + srq_id = (u16)roce_handle64; + spin_lock_irqsave(&dev->srqidr.idr_lock, flags); + srq = idr_find(&dev->srqidr.idr, srq_id); + if (srq) { + ibsrq = &srq->ibsrq; + if (ibsrq->event_handler) { + event.device = ibsrq->device; + event.element.srq = ibsrq; + ibsrq->event_handler(&event, + ibsrq->srq_context); + } + } else { + DP_NOTICE(dev, + "SRQ event with NULL pointer ibsrq. 
Handle=%llx\n", + roce_handle64); + } + spin_unlock_irqrestore(&dev->srqidr.idr_lock, flags); + DP_NOTICE(dev, "SRQ event %d on handle %p\n", e_code, srq); default: break; } diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index 86d4511e0d75..a2d708dceb8d 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -58,6 +58,7 @@ #define QEDR_MSG_RQ " RQ" #define QEDR_MSG_SQ " SQ" #define QEDR_MSG_QP " QP" +#define QEDR_MSG_SRQ " SRQ" #define QEDR_MSG_GSI " GSI" #define QEDR_MSG_IWARP " IW" @@ -122,6 +123,11 @@ struct qedr_device_attr { #define QEDR_ENET_STATE_BIT (0) +struct qedr_idr { + spinlock_t idr_lock; /* Protect idr data-structure */ + struct idr idr; +}; + struct qedr_dev { struct ib_device ibdev; struct qed_dev *cdev; @@ -165,8 +171,8 @@ struct qedr_dev { struct qedr_cq *gsi_rqcq; struct qedr_qp *gsi_qp; enum qed_rdma_type rdma_type; - spinlock_t idr_lock; /* Protect qpidr data-structure */ - struct idr qpidr; + struct qedr_idr qpidr; + struct qedr_idr srqidr; struct workqueue_struct *iwarp_wq; u16 iwarp_max_mtu; @@ -337,6 +343,34 @@ struct qedr_qp_hwq_info { qed_chain_get_capacity(p_info->pbl) \ } while (0) +struct qedr_srq_hwq_info { + u32 max_sges; + u32 max_wr; + struct qed_chain pbl; + u64 p_phys_addr_tbl; + u32 wqe_prod; + u32 sge_prod; + u32 wr_prod_cnt; + u32 wr_cons_cnt; + u32 num_elems; + + u32 *virt_prod_pair_addr; + dma_addr_t phy_prod_pair_addr; +}; + +struct qedr_srq { + struct ib_srq ibsrq; + struct qedr_dev *dev; + + struct qedr_userq usrq; + struct qedr_srq_hwq_info hw_srq; + struct ib_umem *prod_umem; + u16 srq_id; + u32 srq_limit; + /* lock to protect srq recv post */ + spinlock_t lock; +}; + enum qedr_qp_err_bitmap { QEDR_QP_ERR_SQ_FULL = 1, QEDR_QP_ERR_RQ_FULL = 2, @@ -538,4 +572,9 @@ static inline struct qedr_mr *get_qedr_mr(struct ib_mr *ibmr) { return container_of(ibmr, struct qedr_mr, ibmr); } + +static inline struct qedr_srq *get_qedr_srq(struct ib_srq *ibsrq) +{ + return container_of(ibsrq, struct qedr_srq, ibsrq); +} #endif diff --git a/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h b/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h index 7e1f7021396a..228dd7d49622 100644 --- a/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h +++ b/drivers/infiniband/hw/qedr/qedr_hsi_rdma.h @@ -161,12 +161,23 @@ struct rdma_rq_sge { #define RDMA_RQ_SGE_L_KEY_HI_SHIFT 29 }; +struct rdma_srq_wqe_header { + struct regpair wr_id; + u8 num_sges /* number of SGEs in WQE */; + u8 reserved2[7]; +}; + struct rdma_srq_sge { struct regpair addr; __le32 length; __le32 l_key; }; +union rdma_srq_elm { + struct rdma_srq_wqe_header header; + struct rdma_srq_sge sge; +}; + /* Rdma doorbell data for flags update */ struct rdma_pwm_flags_data { __le16 icid; /* internal CID */ diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c index 26dc374787f7..505fa3648762 100644 --- a/drivers/infiniband/hw/qedr/qedr_iw_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c @@ -491,7 +491,7 @@ int qedr_iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) int rc = 0; int i; - qp = idr_find(&dev->qpidr, conn_param->qpn); + qp = idr_find(&dev->qpidr.idr, conn_param->qpn); laddr = (struct sockaddr_in *)&cm_id->m_local_addr; raddr = (struct sockaddr_in *)&cm_id->m_remote_addr; @@ -679,7 +679,7 @@ int qedr_iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) DP_DEBUG(dev, QEDR_MSG_IWARP, "Accept on qpid=%d\n", conn_param->qpn); - qp = idr_find(&dev->qpidr, conn_param->qpn); + qp = 
idr_find(&dev->qpidr.idr, conn_param->qpn); if (!qp) { DP_ERR(dev, "Invalid QP number %d\n", conn_param->qpn); return -EINVAL; @@ -737,9 +737,9 @@ void qedr_iw_qp_rem_ref(struct ib_qp *ibqp) struct qedr_qp *qp = get_qedr_qp(ibqp); if (atomic_dec_and_test(&qp->refcnt)) { - spin_lock_irq(&qp->dev->idr_lock); - idr_remove(&qp->dev->qpidr, qp->qp_id); - spin_unlock_irq(&qp->dev->idr_lock); + spin_lock_irq(&qp->dev->qpidr.idr_lock); + idr_remove(&qp->dev->qpidr.idr, qp->qp_id); + spin_unlock_irq(&qp->dev->qpidr.idr_lock); kfree(qp); } } @@ -748,5 +748,5 @@ struct ib_qp *qedr_iw_get_qp(struct ib_device *ibdev, int qpn) { struct qedr_dev *dev = get_qedr_dev(ibdev); - return idr_find(&dev->qpidr, qpn); + return idr_find(&dev->qpidr.idr, qpn); } diff --git a/drivers/infiniband/hw/qedr/qedr_roce_cm.c b/drivers/infiniband/hw/qedr/qedr_roce_cm.c index 0f14e687bb91..85578887421b 100644 --- a/drivers/infiniband/hw/qedr/qedr_roce_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_roce_cm.c @@ -380,18 +380,17 @@ int qedr_destroy_gsi_qp(struct qedr_dev *dev) #define QEDR_GSI_QPN (1) static inline int qedr_gsi_build_header(struct qedr_dev *dev, struct qedr_qp *qp, - struct ib_send_wr *swr, + const struct ib_send_wr *swr, struct ib_ud_header *udh, int *roce_mode) { bool has_vlan = false, has_grh_ipv6 = true; struct rdma_ah_attr *ah_attr = &get_qedr_ah(ud_wr(swr)->ah)->attr; const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); - union ib_gid sgid; + const struct ib_gid_attr *sgid_attr = grh->sgid_attr; int send_size = 0; u16 vlan_id = 0; u16 ether_type; - struct ib_gid_attr sgid_attr; int rc; int ip_ver = 0; @@ -402,28 +401,16 @@ static inline int qedr_gsi_build_header(struct qedr_dev *dev, for (i = 0; i < swr->num_sge; ++i) send_size += swr->sg_list[i].length; - rc = ib_get_cached_gid(qp->ibqp.device, rdma_ah_get_port_num(ah_attr), - grh->sgid_index, &sgid, &sgid_attr); - if (rc) { - DP_ERR(dev, - "gsi post send: failed to get cached GID (port=%d, ix=%d)\n", - rdma_ah_get_port_num(ah_attr), - grh->sgid_index); - return rc; - } - - vlan_id = rdma_vlan_dev_vlan_id(sgid_attr.ndev); + vlan_id = rdma_vlan_dev_vlan_id(sgid_attr->ndev); if (vlan_id < VLAN_CFI_MASK) has_vlan = true; - dev_put(sgid_attr.ndev); - - has_udp = (sgid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP); + has_udp = (sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP); if (!has_udp) { /* RoCE v1 */ ether_type = ETH_P_IBOE; *roce_mode = ROCE_V1; - } else if (ipv6_addr_v4mapped((struct in6_addr *)&sgid)) { + } else if (ipv6_addr_v4mapped((struct in6_addr *)&sgid_attr->gid)) { /* RoCE v2 IPv4 */ ip_ver = 4; ether_type = ETH_P_IP; @@ -471,7 +458,7 @@ static inline int qedr_gsi_build_header(struct qedr_dev *dev, udh->grh.flow_label = grh->flow_label; udh->grh.hop_limit = grh->hop_limit; udh->grh.destination_gid = grh->dgid; - memcpy(&udh->grh.source_gid.raw, &sgid.raw, + memcpy(&udh->grh.source_gid.raw, sgid_attr->gid.raw, sizeof(udh->grh.source_gid.raw)); } else { /* IPv4 header */ @@ -482,7 +469,7 @@ static inline int qedr_gsi_build_header(struct qedr_dev *dev, udh->ip4.frag_off = htons(IP_DF); udh->ip4.ttl = grh->hop_limit; - ipv4_addr = qedr_get_ipv4_from_gid(sgid.raw); + ipv4_addr = qedr_get_ipv4_from_gid(sgid_attr->gid.raw); udh->ip4.saddr = ipv4_addr; ipv4_addr = qedr_get_ipv4_from_gid(grh->dgid.raw); udh->ip4.daddr = ipv4_addr; @@ -501,7 +488,7 @@ static inline int qedr_gsi_build_header(struct qedr_dev *dev, static inline int qedr_gsi_build_packet(struct qedr_dev *dev, struct qedr_qp *qp, - struct ib_send_wr *swr, + const struct ib_send_wr 
*swr, struct qed_roce_ll2_packet **p_packet) { u8 ud_header_buffer[QEDR_MAX_UD_HEADER_SIZE]; @@ -550,8 +537,8 @@ static inline int qedr_gsi_build_packet(struct qedr_dev *dev, return 0; } -int qedr_gsi_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int qedr_gsi_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct qed_roce_ll2_packet *pkt = NULL; struct qedr_qp *qp = get_qedr_qp(ibqp); @@ -620,8 +607,8 @@ err: return rc; } -int qedr_gsi_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int qedr_gsi_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct qedr_dev *dev = get_qedr_dev(ibqp->device); struct qedr_qp *qp = get_qedr_qp(ibqp); diff --git a/drivers/infiniband/hw/qedr/qedr_roce_cm.h b/drivers/infiniband/hw/qedr/qedr_roce_cm.h index a55916323ea9..d46dcd3f6424 100644 --- a/drivers/infiniband/hw/qedr/qedr_roce_cm.h +++ b/drivers/infiniband/hw/qedr/qedr_roce_cm.h @@ -46,10 +46,10 @@ static inline u32 qedr_get_ipv4_from_gid(const u8 *gid) /* RDMA CM */ int qedr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); -int qedr_gsi_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); -int qedr_gsi_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); +int qedr_gsi_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int qedr_gsi_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); struct ib_qp *qedr_create_gsi_qp(struct qedr_dev *dev, struct ib_qp_init_attr *attrs, struct qedr_qp *qp); diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index f07b8df96f43..8cc3df24e04e 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -51,6 +51,10 @@ #include <rdma/qedr-abi.h> #include "qedr_roce_cm.h" +#define QEDR_SRQ_WQE_ELEM_SIZE sizeof(union rdma_srq_elm) +#define RDMA_MAX_SGE_PER_SRQ (4) +#define RDMA_MAX_SRQ_WQE_SIZE (RDMA_MAX_SGE_PER_SRQ + 1) + #define DB_ADDR_SHIFT(addr) ((addr) << DB_PWM_ADDR_OFFSET_SHIFT) static inline int qedr_ib_copy_to_udata(struct ib_udata *udata, void *src, @@ -84,6 +88,19 @@ int qedr_iw_query_gid(struct ib_device *ibdev, u8 port, return 0; } +int qedr_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr) +{ + struct qedr_dev *dev = get_qedr_dev(ibsrq->device); + struct qedr_device_attr *qattr = &dev->attr; + struct qedr_srq *srq = get_qedr_srq(ibsrq); + + srq_attr->srq_limit = srq->srq_limit; + srq_attr->max_wr = qattr->max_srq_wr; + srq_attr->max_sge = qattr->max_sge; + + return 0; +} + int qedr_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, struct ib_udata *udata) { @@ -112,7 +129,8 @@ int qedr_query_device(struct ib_device *ibdev, IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_MGT_EXTENSIONS; - attr->max_sge = qattr->max_sge; + attr->max_send_sge = qattr->max_sge; + attr->max_recv_sge = qattr->max_sge; attr->max_sge_rd = qattr->max_sge; attr->max_cq = qattr->max_cq; attr->max_cqe = qattr->max_cqe; @@ -224,7 +242,7 @@ int qedr_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *attr) attr->lmc = 0; attr->sm_lid = 0; attr->sm_sl = 0; - attr->port_cap_flags = IB_PORT_IP_BASED_GIDS; + attr->ip_gids = true; if (rdma_protocol_iwarp(&dev->ibdev, 1)) { attr->gid_tbl_len = 1; attr->pkey_tbl_len = 1; @@ -1075,27 +1093,19 
@@ static inline int get_gid_info_from_table(struct ib_qp *ibqp, struct qed_rdma_modify_qp_in_params *qp_params) { + const struct ib_gid_attr *gid_attr; enum rdma_network_type nw_type; - struct ib_gid_attr gid_attr; const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr); - union ib_gid gid; u32 ipv4_addr; - int rc = 0; int i; - rc = ib_get_cached_gid(ibqp->device, - rdma_ah_get_port_num(&attr->ah_attr), - grh->sgid_index, &gid, &gid_attr); - if (rc) - return rc; - - qp_params->vlan_id = rdma_vlan_dev_vlan_id(gid_attr.ndev); + gid_attr = grh->sgid_attr; + qp_params->vlan_id = rdma_vlan_dev_vlan_id(gid_attr->ndev); - dev_put(gid_attr.ndev); - nw_type = ib_gid_to_network_type(gid_attr.gid_type, &gid); + nw_type = rdma_gid_attr_network_type(gid_attr); switch (nw_type) { case RDMA_NETWORK_IPV6: - memcpy(&qp_params->sgid.bytes[0], &gid.raw[0], + memcpy(&qp_params->sgid.bytes[0], &gid_attr->gid.raw[0], sizeof(qp_params->sgid)); memcpy(&qp_params->dgid.bytes[0], &grh->dgid, @@ -1105,7 +1115,7 @@ static inline int get_gid_info_from_table(struct ib_qp *ibqp, QED_ROCE_MODIFY_QP_VALID_ROCE_MODE, 1); break; case RDMA_NETWORK_IB: - memcpy(&qp_params->sgid.bytes[0], &gid.raw[0], + memcpy(&qp_params->sgid.bytes[0], &gid_attr->gid.raw[0], sizeof(qp_params->sgid)); memcpy(&qp_params->dgid.bytes[0], &grh->dgid, @@ -1115,7 +1125,7 @@ static inline int get_gid_info_from_table(struct ib_qp *ibqp, case RDMA_NETWORK_IPV4: memset(&qp_params->sgid, 0, sizeof(qp_params->sgid)); memset(&qp_params->dgid, 0, sizeof(qp_params->dgid)); - ipv4_addr = qedr_get_ipv4_from_gid(gid.raw); + ipv4_addr = qedr_get_ipv4_from_gid(gid_attr->gid.raw); qp_params->sgid.ipv4_addr = ipv4_addr; ipv4_addr = qedr_get_ipv4_from_gid(grh->dgid.raw); @@ -1189,6 +1199,21 @@ static int qedr_check_qp_attrs(struct ib_pd *ibpd, struct qedr_dev *dev, return 0; } +static int qedr_copy_srq_uresp(struct qedr_dev *dev, + struct qedr_srq *srq, struct ib_udata *udata) +{ + struct qedr_create_srq_uresp uresp = {}; + int rc; + + uresp.srq_id = srq->srq_id; + + rc = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rc) + DP_ERR(dev, "create srq: problem copying data to user space\n"); + + return rc; +} + static void qedr_copy_rq_uresp(struct qedr_dev *dev, struct qedr_create_qp_uresp *uresp, struct qedr_qp *qp) @@ -1255,13 +1280,18 @@ static void qedr_set_common_qp_params(struct qedr_dev *dev, qp->state = QED_ROCE_QP_STATE_RESET; qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? 
true : false; qp->sq_cq = get_qedr_cq(attrs->send_cq); - qp->rq_cq = get_qedr_cq(attrs->recv_cq); qp->dev = dev; - qp->rq.max_sges = attrs->cap.max_recv_sge; - DP_DEBUG(dev, QEDR_MSG_QP, - "RQ params:\trq_max_sges = %d, rq_cq_id = %d\n", - qp->rq.max_sges, qp->rq_cq->icid); + if (attrs->srq) { + qp->srq = get_qedr_srq(attrs->srq); + } else { + qp->rq_cq = get_qedr_cq(attrs->recv_cq); + qp->rq.max_sges = attrs->cap.max_recv_sge; + DP_DEBUG(dev, QEDR_MSG_QP, + "RQ params:\trq_max_sges = %d, rq_cq_id = %d\n", + qp->rq.max_sges, qp->rq_cq->icid); + } + DP_DEBUG(dev, QEDR_MSG_QP, "QP params:\tpd = %d, qp_type = %d, max_inline_data = %d, state = %d, signaled = %d, use_srq=%d\n", pd->pd_id, qp->qp_type, qp->max_inline_data, @@ -1276,9 +1306,303 @@ static void qedr_set_roce_db_info(struct qedr_dev *dev, struct qedr_qp *qp) qp->sq.db = dev->db_addr + DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD); qp->sq.db_data.data.icid = qp->icid + 1; - qp->rq.db = dev->db_addr + - DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD); - qp->rq.db_data.data.icid = qp->icid; + if (!qp->srq) { + qp->rq.db = dev->db_addr + + DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD); + qp->rq.db_data.data.icid = qp->icid; + } +} + +static int qedr_check_srq_params(struct ib_pd *ibpd, struct qedr_dev *dev, + struct ib_srq_init_attr *attrs, + struct ib_udata *udata) +{ + struct qedr_device_attr *qattr = &dev->attr; + + if (attrs->attr.max_wr > qattr->max_srq_wr) { + DP_ERR(dev, + "create srq: unsupported srq_wr=0x%x requested (max_srq_wr=0x%x)\n", + attrs->attr.max_wr, qattr->max_srq_wr); + return -EINVAL; + } + + if (attrs->attr.max_sge > qattr->max_sge) { + DP_ERR(dev, + "create srq: unsupported sge=0x%x requested (max_srq_sge=0x%x)\n", + attrs->attr.max_sge, qattr->max_sge); + return -EINVAL; + } + + return 0; +} + +static void qedr_free_srq_user_params(struct qedr_srq *srq) +{ + qedr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl); + ib_umem_release(srq->usrq.umem); + ib_umem_release(srq->prod_umem); +} + +static void qedr_free_srq_kernel_params(struct qedr_srq *srq) +{ + struct qedr_srq_hwq_info *hw_srq = &srq->hw_srq; + struct qedr_dev *dev = srq->dev; + + dev->ops->common->chain_free(dev->cdev, &hw_srq->pbl); + + dma_free_coherent(&dev->pdev->dev, sizeof(struct rdma_srq_producers), + hw_srq->virt_prod_pair_addr, + hw_srq->phy_prod_pair_addr); +} + +static int qedr_init_srq_user_params(struct ib_ucontext *ib_ctx, + struct qedr_srq *srq, + struct qedr_create_srq_ureq *ureq, + int access, int dmasync) +{ + struct scatterlist *sg; + int rc; + + rc = qedr_init_user_queue(ib_ctx, srq->dev, &srq->usrq, ureq->srq_addr, + ureq->srq_len, access, dmasync, 1); + if (rc) + return rc; + + srq->prod_umem = ib_umem_get(ib_ctx, ureq->prod_pair_addr, + sizeof(struct rdma_srq_producers), + access, dmasync); + if (IS_ERR(srq->prod_umem)) { + qedr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl); + ib_umem_release(srq->usrq.umem); + DP_ERR(srq->dev, + "create srq: failed ib_umem_get for producer, got %ld\n", + PTR_ERR(srq->prod_umem)); + return PTR_ERR(srq->prod_umem); + } + + sg = srq->prod_umem->sg_head.sgl; + srq->hw_srq.phy_prod_pair_addr = sg_dma_address(sg); + + return 0; +} + +static int qedr_alloc_srq_kernel_params(struct qedr_srq *srq, + struct qedr_dev *dev, + struct ib_srq_init_attr *init_attr) +{ + struct qedr_srq_hwq_info *hw_srq = &srq->hw_srq; + dma_addr_t phy_prod_pair_addr; + u32 num_elems; + void *va; + int rc; + + va = dma_alloc_coherent(&dev->pdev->dev, + sizeof(struct rdma_srq_producers), + &phy_prod_pair_addr, 
GFP_KERNEL); + if (!va) { + DP_ERR(dev, + "create srq: failed to allocate dma memory for producer\n"); + return -ENOMEM; + } + + hw_srq->phy_prod_pair_addr = phy_prod_pair_addr; + hw_srq->virt_prod_pair_addr = va; + + num_elems = init_attr->attr.max_wr * RDMA_MAX_SRQ_WQE_SIZE; + rc = dev->ops->common->chain_alloc(dev->cdev, + QED_CHAIN_USE_TO_CONSUME_PRODUCE, + QED_CHAIN_MODE_PBL, + QED_CHAIN_CNT_TYPE_U32, + num_elems, + QEDR_SRQ_WQE_ELEM_SIZE, + &hw_srq->pbl, NULL); + if (rc) + goto err0; + + hw_srq->num_elems = num_elems; + + return 0; + +err0: + dma_free_coherent(&dev->pdev->dev, sizeof(struct rdma_srq_producers), + va, phy_prod_pair_addr); + return rc; +} + +static int qedr_idr_add(struct qedr_dev *dev, struct qedr_idr *qidr, + void *ptr, u32 id); +static void qedr_idr_remove(struct qedr_dev *dev, + struct qedr_idr *qidr, u32 id); + +struct ib_srq *qedr_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct qed_rdma_destroy_srq_in_params destroy_in_params; + struct qed_rdma_create_srq_in_params in_params = {}; + struct qedr_dev *dev = get_qedr_dev(ibpd->device); + struct qed_rdma_create_srq_out_params out_params; + struct qedr_pd *pd = get_qedr_pd(ibpd); + struct qedr_create_srq_ureq ureq = {}; + u64 pbl_base_addr, phy_prod_pair_addr; + struct ib_ucontext *ib_ctx = NULL; + struct qedr_srq_hwq_info *hw_srq; + struct qedr_ucontext *ctx = NULL; + u32 page_cnt, page_size; + struct qedr_srq *srq; + int rc = 0; + + DP_DEBUG(dev, QEDR_MSG_QP, + "create SRQ called from %s (pd %p)\n", + (udata) ? "User lib" : "kernel", pd); + + rc = qedr_check_srq_params(ibpd, dev, init_attr, udata); + if (rc) + return ERR_PTR(-EINVAL); + + srq = kzalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) + return ERR_PTR(-ENOMEM); + + srq->dev = dev; + hw_srq = &srq->hw_srq; + spin_lock_init(&srq->lock); + + hw_srq->max_wr = init_attr->attr.max_wr; + hw_srq->max_sges = init_attr->attr.max_sge; + + if (udata && ibpd->uobject && ibpd->uobject->context) { + ib_ctx = ibpd->uobject->context; + ctx = get_qedr_ucontext(ib_ctx); + + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) { + DP_ERR(dev, + "create srq: problem copying data from user space\n"); + goto err0; + } + + rc = qedr_init_srq_user_params(ib_ctx, srq, &ureq, 0, 0); + if (rc) + goto err0; + + page_cnt = srq->usrq.pbl_info.num_pbes; + pbl_base_addr = srq->usrq.pbl_tbl->pa; + phy_prod_pair_addr = hw_srq->phy_prod_pair_addr; + page_size = BIT(srq->usrq.umem->page_shift); + } else { + struct qed_chain *pbl; + + rc = qedr_alloc_srq_kernel_params(srq, dev, init_attr); + if (rc) + goto err0; + + pbl = &hw_srq->pbl; + page_cnt = qed_chain_get_page_cnt(pbl); + pbl_base_addr = qed_chain_get_pbl_phys(pbl); + phy_prod_pair_addr = hw_srq->phy_prod_pair_addr; + page_size = QED_CHAIN_PAGE_SIZE; + } + + in_params.pd_id = pd->pd_id; + in_params.pbl_base_addr = pbl_base_addr; + in_params.prod_pair_addr = phy_prod_pair_addr; + in_params.num_pages = page_cnt; + in_params.page_size = page_size; + + rc = dev->ops->rdma_create_srq(dev->rdma_ctx, &in_params, &out_params); + if (rc) + goto err1; + + srq->srq_id = out_params.srq_id; + + if (udata) { + rc = qedr_copy_srq_uresp(dev, srq, udata); + if (rc) + goto err2; + } + + rc = qedr_idr_add(dev, &dev->srqidr, srq, srq->srq_id); + if (rc) + goto err2; + + DP_DEBUG(dev, QEDR_MSG_SRQ, + "create srq: created srq with srq_id=0x%0x\n", srq->srq_id); + return &srq->ibsrq; + +err2: + destroy_in_params.srq_id = srq->srq_id; + + dev->ops->rdma_destroy_srq(dev->rdma_ctx, &destroy_in_params); +err1: 
+ if (udata) + qedr_free_srq_user_params(srq); + else + qedr_free_srq_kernel_params(srq); +err0: + kfree(srq); + + return ERR_PTR(-EFAULT); +} + +int qedr_destroy_srq(struct ib_srq *ibsrq) +{ + struct qed_rdma_destroy_srq_in_params in_params = {}; + struct qedr_dev *dev = get_qedr_dev(ibsrq->device); + struct qedr_srq *srq = get_qedr_srq(ibsrq); + + qedr_idr_remove(dev, &dev->srqidr, srq->srq_id); + in_params.srq_id = srq->srq_id; + dev->ops->rdma_destroy_srq(dev->rdma_ctx, &in_params); + + if (ibsrq->pd->uobject) + qedr_free_srq_user_params(srq); + else + qedr_free_srq_kernel_params(srq); + + DP_DEBUG(dev, QEDR_MSG_SRQ, + "destroy srq: destroyed srq with srq_id=0x%0x\n", + srq->srq_id); + kfree(srq); + + return 0; +} + +int qedr_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct qed_rdma_modify_srq_in_params in_params = {}; + struct qedr_dev *dev = get_qedr_dev(ibsrq->device); + struct qedr_srq *srq = get_qedr_srq(ibsrq); + int rc; + + if (attr_mask & IB_SRQ_MAX_WR) { + DP_ERR(dev, + "modify srq: invalid attribute mask=0x%x specified for %p\n", + attr_mask, srq); + return -EINVAL; + } + + if (attr_mask & IB_SRQ_LIMIT) { + if (attr->srq_limit >= srq->hw_srq.max_wr) { + DP_ERR(dev, + "modify srq: invalid srq_limit=0x%x (max_srq_limit=0x%x)\n", + attr->srq_limit, srq->hw_srq.max_wr); + return -EINVAL; + } + + in_params.srq_id = srq->srq_id; + in_params.wqe_limit = attr->srq_limit; + rc = dev->ops->rdma_modify_srq(dev->rdma_ctx, &in_params); + if (rc) + return rc; + } + + srq->srq_limit = attr->srq_limit; + + DP_DEBUG(dev, QEDR_MSG_SRQ, + "modify srq: modified srq with srq_id=0x%0x\n", srq->srq_id); + + return 0; } static inline void @@ -1299,9 +1623,17 @@ qedr_init_common_qp_in_params(struct qedr_dev *dev, params->dpi = pd->uctx ? pd->uctx->dpi : dev->dpi; params->sq_cq_id = get_qedr_cq(attrs->send_cq)->icid; params->stats_queue = 0; - params->rq_cq_id = get_qedr_cq(attrs->recv_cq)->icid; params->srq_id = 0; params->use_srq = false; + + if (!qp->srq) { + params->rq_cq_id = get_qedr_cq(attrs->recv_cq)->icid; + + } else { + params->rq_cq_id = get_qedr_cq(attrs->recv_cq)->icid; + params->srq_id = qp->srq->srq_id; + params->use_srq = true; + } } static inline void qedr_qp_user_print(struct qedr_dev *dev, struct qedr_qp *qp) @@ -1318,32 +1650,27 @@ static inline void qedr_qp_user_print(struct qedr_dev *dev, struct qedr_qp *qp) qp->usq.buf_len, qp->urq.buf_addr, qp->urq.buf_len); } -static int qedr_idr_add(struct qedr_dev *dev, void *ptr, u32 id) +static int qedr_idr_add(struct qedr_dev *dev, struct qedr_idr *qidr, + void *ptr, u32 id) { int rc; - if (!rdma_protocol_iwarp(&dev->ibdev, 1)) - return 0; - idr_preload(GFP_KERNEL); - spin_lock_irq(&dev->idr_lock); + spin_lock_irq(&qidr->idr_lock); - rc = idr_alloc(&dev->qpidr, ptr, id, id + 1, GFP_ATOMIC); + rc = idr_alloc(&qidr->idr, ptr, id, id + 1, GFP_ATOMIC); - spin_unlock_irq(&dev->idr_lock); + spin_unlock_irq(&qidr->idr_lock); idr_preload_end(); return rc < 0 ? 
rc : 0; } -static void qedr_idr_remove(struct qedr_dev *dev, u32 id) +static void qedr_idr_remove(struct qedr_dev *dev, struct qedr_idr *qidr, u32 id) { - if (!rdma_protocol_iwarp(&dev->ibdev, 1)) - return; - - spin_lock_irq(&dev->idr_lock); - idr_remove(&dev->qpidr, id); - spin_unlock_irq(&dev->idr_lock); + spin_lock_irq(&qidr->idr_lock); + idr_remove(&qidr->idr, id); + spin_unlock_irq(&qidr->idr_lock); } static inline void @@ -1356,9 +1683,10 @@ qedr_iwarp_populate_user_qp(struct qedr_dev *dev, qedr_populate_pbls(dev, qp->usq.umem, qp->usq.pbl_tbl, &qp->usq.pbl_info, FW_PAGE_SHIFT); - - qp->urq.pbl_tbl->va = out_params->rq_pbl_virt; - qp->urq.pbl_tbl->pa = out_params->rq_pbl_phys; + if (!qp->srq) { + qp->urq.pbl_tbl->va = out_params->rq_pbl_virt; + qp->urq.pbl_tbl->pa = out_params->rq_pbl_phys; + } qedr_populate_pbls(dev, qp->urq.umem, qp->urq.pbl_tbl, &qp->urq.pbl_info, FW_PAGE_SHIFT); @@ -1404,11 +1732,13 @@ static int qedr_create_user_qp(struct qedr_dev *dev, if (rc) return rc; - /* RQ - read access only (0), dma sync not required (0) */ - rc = qedr_init_user_queue(ib_ctx, dev, &qp->urq, ureq.rq_addr, - ureq.rq_len, 0, 0, alloc_and_init); - if (rc) - return rc; + if (!qp->srq) { + /* RQ - read access only (0), dma sync not required (0) */ + rc = qedr_init_user_queue(ib_ctx, dev, &qp->urq, ureq.rq_addr, + ureq.rq_len, 0, 0, alloc_and_init); + if (rc) + return rc; + } memset(&in_params, 0, sizeof(in_params)); qedr_init_common_qp_in_params(dev, pd, qp, attrs, false, &in_params); @@ -1416,8 +1746,10 @@ static int qedr_create_user_qp(struct qedr_dev *dev, in_params.qp_handle_hi = ureq.qp_handle_hi; in_params.sq_num_pages = qp->usq.pbl_info.num_pbes; in_params.sq_pbl_ptr = qp->usq.pbl_tbl->pa; - in_params.rq_num_pages = qp->urq.pbl_info.num_pbes; - in_params.rq_pbl_ptr = qp->urq.pbl_tbl->pa; + if (!qp->srq) { + in_params.rq_num_pages = qp->urq.pbl_info.num_pbes; + in_params.rq_pbl_ptr = qp->urq.pbl_tbl->pa; + } qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx, &in_params, &out_params); @@ -1679,16 +2011,13 @@ struct ib_qp *qedr_create_qp(struct ib_pd *ibpd, if (rc) return ERR_PTR(rc); - if (attrs->srq) - return ERR_PTR(-EINVAL); - DP_DEBUG(dev, QEDR_MSG_QP, "create qp: called from %s, event_handler=%p, eepd=%p sq_cq=%p, sq_icid=%d, rq_cq=%p, rq_icid=%d\n", udata ? "user library" : "kernel", attrs->event_handler, pd, get_qedr_cq(attrs->send_cq), get_qedr_cq(attrs->send_cq)->icid, get_qedr_cq(attrs->recv_cq), - get_qedr_cq(attrs->recv_cq)->icid); + attrs->recv_cq ? 
get_qedr_cq(attrs->recv_cq)->icid : 0); qp = kzalloc(sizeof(*qp), GFP_KERNEL); if (!qp) { @@ -1715,9 +2044,11 @@ struct ib_qp *qedr_create_qp(struct ib_pd *ibpd, qp->ibqp.qp_num = qp->qp_id; - rc = qedr_idr_add(dev, qp, qp->qp_id); - if (rc) - goto err; + if (rdma_protocol_iwarp(&dev->ibdev, 1)) { + rc = qedr_idr_add(dev, &dev->qpidr, qp, qp->qp_id); + if (rc) + goto err; + } return &qp->ibqp; @@ -2289,8 +2620,9 @@ int qedr_destroy_qp(struct ib_qp *ibqp) qedr_free_qp_resources(dev, qp); - if (atomic_dec_and_test(&qp->refcnt)) { - qedr_idr_remove(dev, qp->qp_id); + if (atomic_dec_and_test(&qp->refcnt) && + rdma_protocol_iwarp(&dev->ibdev, 1)) { + qedr_idr_remove(dev, &dev->qpidr, qp->qp_id); kfree(qp); } return rc; @@ -2305,7 +2637,7 @@ struct ib_ah *qedr_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr, if (!ah) return ERR_PTR(-ENOMEM); - ah->attr = *attr; + rdma_copy_ah_attr(&ah->attr, attr); return &ah->ibah; } @@ -2314,6 +2646,7 @@ int qedr_destroy_ah(struct ib_ah *ibah) { struct qedr_ah *ah = get_qedr_ah(ibah); + rdma_destroy_ah_attr(&ah->attr); kfree(ah); return 0; } @@ -2705,9 +3038,9 @@ static void swap_wqe_data64(u64 *p) static u32 qedr_prepare_sq_inline_data(struct qedr_dev *dev, struct qedr_qp *qp, u8 *wqe_size, - struct ib_send_wr *wr, - struct ib_send_wr **bad_wr, u8 *bits, - u8 bit) + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr, + u8 *bits, u8 bit) { u32 data_size = sge_data_len(wr->sg_list, wr->num_sge); char *seg_prt, *wqe; @@ -2790,7 +3123,7 @@ static u32 qedr_prepare_sq_inline_data(struct qedr_dev *dev, } while (0) static u32 qedr_prepare_sq_sges(struct qedr_qp *qp, u8 *wqe_size, - struct ib_send_wr *wr) + const struct ib_send_wr *wr) { u32 data_size = 0; int i; @@ -2814,8 +3147,8 @@ static u32 qedr_prepare_sq_rdma_data(struct qedr_dev *dev, struct qedr_qp *qp, struct rdma_sq_rdma_wqe_1st *rwqe, struct rdma_sq_rdma_wqe_2nd *rwqe2, - struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { rwqe2->r_key = cpu_to_le32(rdma_wr(wr)->rkey); DMA_REGPAIR_LE(rwqe2->remote_va, rdma_wr(wr)->remote_addr); @@ -2837,8 +3170,8 @@ static u32 qedr_prepare_sq_send_data(struct qedr_dev *dev, struct qedr_qp *qp, struct rdma_sq_send_wqe_1st *swqe, struct rdma_sq_send_wqe_2st *swqe2, - struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) + const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { memset(swqe2, 0, sizeof(*swqe2)); if (wr->send_flags & IB_SEND_INLINE) { @@ -2854,7 +3187,7 @@ static u32 qedr_prepare_sq_send_data(struct qedr_dev *dev, static int qedr_prepare_reg(struct qedr_qp *qp, struct rdma_sq_fmr_wqe_1st *fwqe1, - struct ib_reg_wr *wr) + const struct ib_reg_wr *wr) { struct qedr_mr *mr = get_qedr_mr(wr->mr); struct rdma_sq_fmr_wqe_2nd *fwqe2; @@ -2916,7 +3249,8 @@ static enum ib_wc_opcode qedr_ib_to_wc_opcode(enum ib_wr_opcode opcode) } } -static inline bool qedr_can_post_send(struct qedr_qp *qp, struct ib_send_wr *wr) +static inline bool qedr_can_post_send(struct qedr_qp *qp, + const struct ib_send_wr *wr) { int wq_is_full, err_wr, pbl_is_full; struct qedr_dev *dev = qp->dev; @@ -2953,8 +3287,8 @@ static inline bool qedr_can_post_send(struct qedr_qp *qp, struct ib_send_wr *wr) return true; } -static int __qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +static int __qedr_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct qedr_dev *dev = get_qedr_dev(ibqp->device); struct qedr_qp *qp = 
get_qedr_qp(ibqp); @@ -3168,8 +3502,8 @@ static int __qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, return rc; } -int qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int qedr_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct qedr_dev *dev = get_qedr_dev(ibqp->device); struct qedr_qp *qp = get_qedr_qp(ibqp); @@ -3234,8 +3568,104 @@ int qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, return rc; } -int qedr_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +static u32 qedr_srq_elem_left(struct qedr_srq_hwq_info *hw_srq) +{ + u32 used; + + /* Calculate number of elements used based on producer + * count and consumer count and subtract it from max + * work request supported so that we get elements left. + */ + used = hw_srq->wr_prod_cnt - hw_srq->wr_cons_cnt; + + return hw_srq->max_wr - used; +} + +int qedr_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct qedr_srq *srq = get_qedr_srq(ibsrq); + struct qedr_srq_hwq_info *hw_srq; + struct qedr_dev *dev = srq->dev; + struct qed_chain *pbl; + unsigned long flags; + int status = 0; + u32 num_sge; + u32 offset; + + spin_lock_irqsave(&srq->lock, flags); + + hw_srq = &srq->hw_srq; + pbl = &srq->hw_srq.pbl; + while (wr) { + struct rdma_srq_wqe_header *hdr; + int i; + + if (!qedr_srq_elem_left(hw_srq) || + wr->num_sge > srq->hw_srq.max_sges) { + DP_ERR(dev, "Can't post WR (%d,%d) || (%d > %d)\n", + hw_srq->wr_prod_cnt, hw_srq->wr_cons_cnt, + wr->num_sge, srq->hw_srq.max_sges); + status = -ENOMEM; + *bad_wr = wr; + break; + } + + hdr = qed_chain_produce(pbl); + num_sge = wr->num_sge; + /* Set number of sge and work request id in header */ + SRQ_HDR_SET(hdr, wr->wr_id, num_sge); + + srq->hw_srq.wr_prod_cnt++; + hw_srq->wqe_prod++; + hw_srq->sge_prod++; + + DP_DEBUG(dev, QEDR_MSG_SRQ, + "SRQ WR: SGEs: %d with wr_id[%d] = %llx\n", + wr->num_sge, hw_srq->wqe_prod, wr->wr_id); + + for (i = 0; i < wr->num_sge; i++) { + struct rdma_srq_sge *srq_sge = qed_chain_produce(pbl); + + /* Set SGE length, lkey and address */ + SRQ_SGE_SET(srq_sge, wr->sg_list[i].addr, + wr->sg_list[i].length, wr->sg_list[i].lkey); + + DP_DEBUG(dev, QEDR_MSG_SRQ, + "[%d]: len %d key %x addr %x:%x\n", + i, srq_sge->length, srq_sge->l_key, + srq_sge->addr.hi, srq_sge->addr.lo); + hw_srq->sge_prod++; + } + + /* Flush WQE and SGE information before + * updating producer. + */ + wmb(); + + /* SRQ producer is 8 bytes. Need to update SGE producer index + * in first 4 bytes and need to update WQE producer in + * next 4 bytes. + */ + *srq->hw_srq.virt_prod_pair_addr = hw_srq->sge_prod; + offset = offsetof(struct rdma_srq_producers, wqe_prod); + *((u8 *)srq->hw_srq.virt_prod_pair_addr + offset) = + hw_srq->wqe_prod; + + /* Flush producer after updating it. 
*/ + wmb(); + wr = wr->next; + } + + DP_DEBUG(dev, QEDR_MSG_SRQ, "POST: Elements in S-RQ: %d\n", + qed_chain_get_elem_left(pbl)); + spin_unlock_irqrestore(&srq->lock, flags); + + return status; +} + +int qedr_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct qedr_qp *qp = get_qedr_qp(ibqp); struct qedr_dev *dev = qp->dev; @@ -3625,6 +4055,31 @@ static void __process_resp_one(struct qedr_dev *dev, struct qedr_qp *qp, wc->wr_id = wr_id; } +static int process_resp_one_srq(struct qedr_dev *dev, struct qedr_qp *qp, + struct qedr_cq *cq, struct ib_wc *wc, + struct rdma_cqe_responder *resp) +{ + struct qedr_srq *srq = qp->srq; + u64 wr_id; + + wr_id = HILO_GEN(le32_to_cpu(resp->srq_wr_id.hi), + le32_to_cpu(resp->srq_wr_id.lo), u64); + + if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) { + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = 0; + wc->wr_id = wr_id; + wc->byte_len = 0; + wc->src_qp = qp->id; + wc->qp = &qp->ibqp; + wc->wr_id = wr_id; + } else { + __process_resp_one(dev, qp, cq, wc, resp, wr_id); + } + srq->hw_srq.wr_cons_cnt++; + + return 1; +} static int process_resp_one(struct qedr_dev *dev, struct qedr_qp *qp, struct qedr_cq *cq, struct ib_wc *wc, struct rdma_cqe_responder *resp) @@ -3674,6 +4129,19 @@ static void try_consume_resp_cqe(struct qedr_cq *cq, struct qedr_qp *qp, } } +static int qedr_poll_cq_resp_srq(struct qedr_dev *dev, struct qedr_qp *qp, + struct qedr_cq *cq, int num_entries, + struct ib_wc *wc, + struct rdma_cqe_responder *resp) +{ + int cnt; + + cnt = process_resp_one_srq(dev, qp, cq, wc, resp); + consume_cqe(cq); + + return cnt; +} + static int qedr_poll_cq_resp(struct qedr_dev *dev, struct qedr_qp *qp, struct qedr_cq *cq, int num_entries, struct ib_wc *wc, struct rdma_cqe_responder *resp, @@ -3751,6 +4219,11 @@ int qedr_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) cnt = qedr_poll_cq_resp(dev, qp, cq, num_entries, wc, &cqe->resp, &update); break; + case RDMA_CQE_TYPE_RESPONDER_SRQ: + cnt = qedr_poll_cq_resp_srq(dev, qp, cq, num_entries, + wc, &cqe->resp); + update = 1; + break; case RDMA_CQE_TYPE_INVALID: default: DP_ERR(dev, "Error: invalid CQE type = %d\n", diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 2c57e4c592a6..0b7d0124b16c 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -66,6 +66,15 @@ int qedr_query_qp(struct ib_qp *, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *); int qedr_destroy_qp(struct ib_qp *ibqp); +struct ib_srq *qedr_create_srq(struct ib_pd *ibpd, + struct ib_srq_init_attr *attr, + struct ib_udata *udata); +int qedr_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); +int qedr_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr); +int qedr_destroy_srq(struct ib_srq *ibsrq); +int qedr_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_recv_wr); struct ib_ah *qedr_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr, struct ib_udata *udata); int qedr_destroy_ah(struct ib_ah *ibah); @@ -82,10 +91,10 @@ int qedr_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, struct ib_mr *qedr_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); int qedr_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc); -int qedr_post_send(struct ib_qp *, struct ib_send_wr *, - struct ib_send_wr **bad_wr); -int qedr_post_recv(struct 
ib_qp *, struct ib_recv_wr *, - struct ib_recv_wr **bad_wr); +int qedr_post_send(struct ib_qp *, const struct ib_send_wr *, + const struct ib_send_wr **bad_wr); +int qedr_post_recv(struct ib_qp *, const struct ib_recv_wr *, + const struct ib_recv_wr **bad_wr); int qedr_process_mad(struct ib_device *ibdev, int process_mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 14b4057a2b8f..41babbc0db58 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1489,7 +1489,8 @@ static void qib_fill_device_attr(struct qib_devdata *dd) rdi->dparms.props.max_mr_size = ~0ULL; rdi->dparms.props.max_qp = ib_qib_max_qps; rdi->dparms.props.max_qp_wr = ib_qib_max_qp_wrs; - rdi->dparms.props.max_sge = ib_qib_max_sges; + rdi->dparms.props.max_send_sge = ib_qib_max_sges; + rdi->dparms.props.max_recv_sge = ib_qib_max_sges; rdi->dparms.props.max_sge_rd = ib_qib_max_sges; rdi->dparms.props.max_cq = ib_qib_max_cqs; rdi->dparms.props.max_cqe = ib_qib_max_cqes; diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index f9a46768a19a..666613eef88f 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -78,9 +78,6 @@ struct qib_verbs_txreq; #define QIB_VENDOR_IPG cpu_to_be16(0xFFA0) -/* XXX Should be defined in ib_verbs.h enum ib_port_cap_flags */ -#define IB_PORT_OTHER_LOCAL_CHANGES_SUP (1 << 26) - #define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) /* Values for set/get portinfo VLCap OperationalVLs */ @@ -314,7 +311,7 @@ void qib_rc_rnr_retry(unsigned long arg); void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr); -int qib_post_ud_send(struct rvt_qp *qp, struct ib_send_wr *wr); +int qib_post_ud_send(struct rvt_qp *qp, const struct ib_send_wr *wr); void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, int has_grh, void *data, u32 tlen, struct rvt_qp *qp); diff --git a/drivers/infiniband/hw/usnic/Kconfig b/drivers/infiniband/hw/usnic/Kconfig index 29ab11c34f3f..d1dae2af4ca9 100644 --- a/drivers/infiniband/hw/usnic/Kconfig +++ b/drivers/infiniband/hw/usnic/Kconfig @@ -1,10 +1,10 @@ config INFINIBAND_USNIC tristate "Verbs support for Cisco VIC" depends on NETDEVICES && ETHERNET && INET && PCI && INTEL_IOMMU + depends on INFINIBAND_USER_ACCESS select ENIC select NET_VENDOR_CISCO select PCI_IOV - select INFINIBAND_USER_ACCESS ---help--- This is a low-level driver for Cisco's Virtual Interface Cards (VICs), including the VIC 1240 and 1280 cards. 
diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.c b/drivers/infiniband/hw/usnic/usnic_fwd.c index 995a26b65156..7875883621f4 100644 --- a/drivers/infiniband/hw/usnic/usnic_fwd.c +++ b/drivers/infiniband/hw/usnic/usnic_fwd.c @@ -92,8 +92,8 @@ struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev) ufdev->pdev = pdev; ufdev->netdev = pci_get_drvdata(pdev); spin_lock_init(&ufdev->lock); - strncpy(ufdev->name, netdev_name(ufdev->netdev), - sizeof(ufdev->name) - 1); + BUILD_BUG_ON(sizeof(ufdev->name) != sizeof(ufdev->netdev->name)); + strcpy(ufdev->name, ufdev->netdev->name); return ufdev; } diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.h b/drivers/infiniband/hw/usnic/usnic_fwd.h index 0b2cc4e79707..f0b71d593da5 100644 --- a/drivers/infiniband/hw/usnic/usnic_fwd.h +++ b/drivers/infiniband/hw/usnic/usnic_fwd.h @@ -57,7 +57,7 @@ struct usnic_fwd_dev { char mac[ETH_ALEN]; unsigned int mtu; __be32 inaddr; - char name[IFNAMSIZ+1]; + char name[IFNAMSIZ]; }; struct usnic_fwd_flow { diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index a688a5669168..9973ac893635 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -666,7 +666,7 @@ int usnic_ib_dereg_mr(struct ib_mr *ibmr) usnic_dbg("va 0x%lx length 0x%zx\n", mr->umem->va, mr->umem->length); - usnic_uiom_reg_release(mr->umem, ibmr->pd->uobject->context->closing); + usnic_uiom_reg_release(mr->umem, ibmr->uobject->context); kfree(mr); return 0; } @@ -771,15 +771,15 @@ int usnic_ib_destroy_ah(struct ib_ah *ah) return -EINVAL; } -int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int usnic_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { usnic_dbg("\n"); return -EINVAL; } -int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int usnic_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { usnic_dbg("\n"); return -EINVAL; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 1fda94425116..2a2c9beb715f 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -80,10 +80,10 @@ struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd, struct ib_udata *udata); int usnic_ib_destroy_ah(struct ib_ah *ah); -int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int usnic_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int usnic_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int usnic_ib_req_notify_cq(struct ib_cq *cq, diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 4381c0a9a873..9dd39daa602b 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -41,6 +41,7 @@ #include <linux/workqueue.h> #include <linux/list.h> #include <linux/pci.h> +#include <rdma/ib_verbs.h> #include "usnic_log.h" #include "usnic_uiom.h" @@ -88,7 +89,7 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) for_each_sg(chunk->page_list, sg, 
chunk->nents, i) { page = sg_page(sg); pa = sg_phys(sg); - if (dirty) + if (!PageDirty(page) && dirty) set_page_dirty_lock(page); put_page(page); usnic_dbg("pa: %pa\n", &pa); @@ -114,6 +115,16 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, dma_addr_t pa; unsigned int gup_flags; + /* + * If the combination of the addr and size requested for this memory + * region causes an integer overflow, return error. + */ + if (((addr + size) < addr) || PAGE_ALIGN(addr + size) < (addr + size)) + return -EINVAL; + + if (!size) + return -EINVAL; + if (!can_do_mlock()) return -EPERM; @@ -127,7 +138,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, down_write(¤t->mm->mmap_sem); - locked = npages + current->mm->locked_vm; + locked = npages + current->mm->pinned_vm; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { @@ -143,7 +154,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = 0; while (npages) { - ret = get_user_pages(cur_base, + ret = get_user_pages_longterm(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof(struct page *)), gup_flags, page_list, NULL); @@ -186,7 +197,7 @@ out: if (ret < 0) usnic_uiom_put_pages(chunk_list, 0); else - current->mm->locked_vm = locked; + current->mm->pinned_vm = locked; up_write(¤t->mm->mmap_sem); free_page((unsigned long) page_list); @@ -420,18 +431,22 @@ out_free_uiomr: return ERR_PTR(err); } -void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing) +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, + struct ib_ucontext *ucontext) { + struct task_struct *task; struct mm_struct *mm; unsigned long diff; __usnic_uiom_reg_release(uiomr->pd, uiomr, 1); - mm = get_task_mm(current); - if (!mm) { - kfree(uiomr); - return; - } + task = get_pid_task(ucontext->tgid, PIDTYPE_PID); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; @@ -443,7 +458,7 @@ void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing) * up here and not be able to take the mmap_sem. In that case * we defer the vm_locked accounting to the system workqueue. 
*/ - if (closing) { + if (ucontext->closing) { if (!down_write_trylock(&mm->mmap_sem)) { INIT_WORK(&uiomr->work, usnic_uiom_reg_account); uiomr->mm = mm; @@ -455,9 +470,10 @@ void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing) } else down_write(&mm->mmap_sem); - current->mm->locked_vm -= diff; + mm->pinned_vm -= diff; up_write(&mm->mmap_sem); mmput(mm); +out: kfree(uiomr); } diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h index 431efe4143f4..8c096acff123 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.h +++ b/drivers/infiniband/hw/usnic/usnic_uiom.h @@ -39,6 +39,8 @@ #include "usnic_uiom_interval_tree.h" +struct ib_ucontext; + #define USNIC_UIOM_READ (1) #define USNIC_UIOM_WRITE (2) @@ -89,7 +91,8 @@ void usnic_uiom_free_dev_list(struct device **devs); struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, unsigned long addr, size_t size, int access, int dmasync); -void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing); +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, + struct ib_ucontext *ucontext); int usnic_uiom_init(char *drv_name); void usnic_uiom_fini(void); #endif /* USNIC_UIOM_H_ */ diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h index 44cb1cfba417..42b8685c997e 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h @@ -378,11 +378,6 @@ static inline enum ib_port_speed pvrdma_port_speed_to_ib( return (enum ib_port_speed)speed; } -static inline int pvrdma_qp_attr_mask_to_ib(int attr_mask) -{ - return attr_mask; -} - static inline int ib_qp_attr_mask_to_pvrdma(int attr_mask) { return attr_mask & PVRDMA_MASK(PVRDMA_QP_ATTR_MASK_MAX); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index f95b97646c25..0f004c737620 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -278,19 +278,6 @@ int pvrdma_destroy_cq(struct ib_cq *cq) return ret; } -/** - * pvrdma_modify_cq - modify the CQ moderation parameters - * @ibcq: the CQ to modify - * @cq_count: number of CQEs that will trigger an event - * @cq_period: max period of time in usec before triggering an event - * - * @return: -EOPNOTSUPP as CQ resize is not supported. - */ -int pvrdma_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) -{ - return -EOPNOTSUPP; -} - static inline struct pvrdma_cqe *get_cqe(struct pvrdma_cq *cq, int i) { return (struct pvrdma_cqe *)pvrdma_page_dir_get_ptr( @@ -428,16 +415,3 @@ int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) /* Ensure we do not return errors from poll_cq */ return npolled; } - -/** - * pvrdma_resize_cq - resize CQ - * @ibcq: the completion queue - * @entries: CQ entries - * @udata: user data - * - * @return: -EOPNOTSUPP as CQ resize is not supported. 
- */ -int pvrdma_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) -{ - return -EOPNOTSUPP; -} diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 0be33a81bbe6..a5719899f49a 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -62,9 +62,7 @@ static DEFINE_MUTEX(pvrdma_device_list_lock); static LIST_HEAD(pvrdma_device_list); static struct workqueue_struct *event_wq; -static int pvrdma_add_gid(const union ib_gid *gid, - const struct ib_gid_attr *attr, - void **context); +static int pvrdma_add_gid(const struct ib_gid_attr *attr, void **context); static int pvrdma_del_gid(const struct ib_gid_attr *attr, void **context); static ssize_t show_hca(struct device *device, struct device_attribute *attr, @@ -216,8 +214,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) dev->ib_dev.post_send = pvrdma_post_send; dev->ib_dev.post_recv = pvrdma_post_recv; dev->ib_dev.create_cq = pvrdma_create_cq; - dev->ib_dev.modify_cq = pvrdma_modify_cq; - dev->ib_dev.resize_cq = pvrdma_resize_cq; dev->ib_dev.destroy_cq = pvrdma_destroy_cq; dev->ib_dev.poll_cq = pvrdma_poll_cq; dev->ib_dev.req_notify_cq = pvrdma_req_notify_cq; @@ -261,7 +257,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) dev->ib_dev.modify_srq = pvrdma_modify_srq; dev->ib_dev.query_srq = pvrdma_query_srq; dev->ib_dev.destroy_srq = pvrdma_destroy_srq; - dev->ib_dev.post_srq_recv = pvrdma_post_srq_recv; dev->srq_tbl = kcalloc(dev->dsr->caps.max_srq, sizeof(struct pvrdma_srq *), @@ -650,13 +645,11 @@ static int pvrdma_add_gid_at_index(struct pvrdma_dev *dev, return 0; } -static int pvrdma_add_gid(const union ib_gid *gid, - const struct ib_gid_attr *attr, - void **context) +static int pvrdma_add_gid(const struct ib_gid_attr *attr, void **context) { struct pvrdma_dev *dev = to_vdev(attr->device); - return pvrdma_add_gid_at_index(dev, gid, + return pvrdma_add_gid_at_index(dev, &attr->gid, ib_gid_type_to_pvrdma(attr->gid_type), attr->index); } @@ -699,8 +692,12 @@ static int pvrdma_del_gid(const struct ib_gid_attr *attr, void **context) } static void pvrdma_netdevice_event_handle(struct pvrdma_dev *dev, + struct net_device *ndev, unsigned long event) { + struct pci_dev *pdev_net; + unsigned int slot; + switch (event) { case NETDEV_REBOOT: case NETDEV_DOWN: @@ -718,6 +715,24 @@ static void pvrdma_netdevice_event_handle(struct pvrdma_dev *dev, else pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ACTIVE); break; + case NETDEV_UNREGISTER: + dev_put(dev->netdev); + dev->netdev = NULL; + break; + case NETDEV_REGISTER: + /* vmxnet3 will have same bus, slot. 
But func will be 0 */ + slot = PCI_SLOT(dev->pdev->devfn); + pdev_net = pci_get_slot(dev->pdev->bus, + PCI_DEVFN(slot, 0)); + if ((dev->netdev == NULL) && + (pci_get_drvdata(pdev_net) == ndev)) { + /* this is our netdev */ + dev->netdev = ndev; + dev_hold(ndev); + } + pci_dev_put(pdev_net); + break; + default: dev_dbg(&dev->pdev->dev, "ignore netdevice event %ld on %s\n", event, dev->ib_dev.name); @@ -734,8 +749,11 @@ static void pvrdma_netdevice_event_work(struct work_struct *work) mutex_lock(&pvrdma_device_list_lock); list_for_each_entry(dev, &pvrdma_device_list, device_link) { - if (dev->netdev == netdev_work->event_netdev) { - pvrdma_netdevice_event_handle(dev, netdev_work->event); + if ((netdev_work->event == NETDEV_REGISTER) || + (dev->netdev == netdev_work->event_netdev)) { + pvrdma_netdevice_event_handle(dev, + netdev_work->event_netdev, + netdev_work->event); break; } } @@ -968,6 +986,7 @@ static int pvrdma_pci_probe(struct pci_dev *pdev, ret = -ENODEV; goto err_free_cq_ring; } + dev_hold(dev->netdev); dev_info(&pdev->dev, "paired device to %s\n", dev->netdev->name); @@ -1040,6 +1059,10 @@ err_free_intrs: pvrdma_free_irq(dev); pci_free_irq_vectors(pdev); err_free_cq_ring: + if (dev->netdev) { + dev_put(dev->netdev); + dev->netdev = NULL; + } pvrdma_page_dir_cleanup(dev, &dev->cq_pdir); err_free_async_ring: pvrdma_page_dir_cleanup(dev, &dev->async_pdir); @@ -1079,6 +1102,11 @@ static void pvrdma_pci_remove(struct pci_dev *pdev) flush_workqueue(event_wq); + if (dev->netdev) { + dev_put(dev->netdev); + dev->netdev = NULL; + } + /* Unregister ib device */ ib_unregister_device(&dev->ib_dev); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index eb5b1065ec08..60083c0363a5 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -599,7 +599,8 @@ static inline void *get_rq_wqe(struct pvrdma_qp *qp, unsigned int n) qp->rq.offset + n * qp->rq.wqe_size); } -static int set_reg_seg(struct pvrdma_sq_wqe_hdr *wqe_hdr, struct ib_reg_wr *wr) +static int set_reg_seg(struct pvrdma_sq_wqe_hdr *wqe_hdr, + const struct ib_reg_wr *wr) { struct pvrdma_user_mr *mr = to_vmr(wr->mr); @@ -623,8 +624,8 @@ static int set_reg_seg(struct pvrdma_sq_wqe_hdr *wqe_hdr, struct ib_reg_wr *wr) * * @return: 0 on success, otherwise errno returned. */ -int pvrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int pvrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct pvrdma_qp *qp = to_vqp(ibqp); struct pvrdma_dev *dev = to_vdev(ibqp->device); @@ -827,8 +828,8 @@ out: * * @return: 0 on success, otherwise errno returned. */ -int pvrdma_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int pvrdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct pvrdma_dev *dev = to_vdev(ibqp->device); unsigned long flags; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c index af235967a9c2..dc0ce877c7a3 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c @@ -52,13 +52,6 @@ #include "pvrdma.h" -int pvrdma_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) -{ - /* No support for kernel clients. 
*/ - return -EOPNOTSUPP; -} - /** * pvrdma_query_srq - query shared receive queue * @ibsrq: the shared receive queue to query diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c index a51463cd2f37..b65d10b0a875 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c @@ -82,7 +82,8 @@ int pvrdma_query_device(struct ib_device *ibdev, props->max_qp = dev->dsr->caps.max_qp; props->max_qp_wr = dev->dsr->caps.max_qp_wr; props->device_cap_flags = dev->dsr->caps.device_cap_flags; - props->max_sge = dev->dsr->caps.max_sge; + props->max_send_sge = dev->dsr->caps.max_sge; + props->max_recv_sge = dev->dsr->caps.max_sge; props->max_sge_rd = PVRDMA_GET_CAP(dev, dev->dsr->caps.max_sge, dev->dsr->caps.max_sge_rd); props->max_srq = dev->dsr->caps.max_srq; @@ -154,7 +155,8 @@ int pvrdma_query_port(struct ib_device *ibdev, u8 port, props->gid_tbl_len = resp->attrs.gid_tbl_len; props->port_cap_flags = pvrdma_port_cap_flags_to_ib(resp->attrs.port_cap_flags); - props->port_cap_flags |= IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS; + props->port_cap_flags |= IB_PORT_CM_SUP; + props->ip_gids = true; props->max_msg_sz = resp->attrs.max_msg_sz; props->bad_pkey_cntr = resp->attrs.bad_pkey_cntr; props->qkey_viol_cntr = resp->attrs.qkey_viol_cntr; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index b7b25728a7e5..b2e3ab50cb08 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -412,15 +412,10 @@ struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); -int pvrdma_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); -int pvrdma_resize_cq(struct ib_cq *ibcq, int entries, - struct ib_udata *udata); struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, struct ib_udata *udata); -int pvrdma_resize_cq(struct ib_cq *ibcq, int entries, - struct ib_udata *udata); int pvrdma_destroy_cq(struct ib_cq *cq); int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags); @@ -435,8 +430,6 @@ int pvrdma_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask attr_mask, struct ib_udata *udata); int pvrdma_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); int pvrdma_destroy_srq(struct ib_srq *srq); -int pvrdma_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, @@ -446,9 +439,9 @@ int pvrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int pvrdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); int pvrdma_destroy_qp(struct ib_qp *qp); -int pvrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int pvrdma_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int pvrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int pvrdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); #endif /* __PVRDMA_VERBS_H__ */ diff --git 
a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c index ba3639a0d77c..89ec0f64abfc 100644 --- a/drivers/infiniband/sw/rdmavt/ah.c +++ b/drivers/infiniband/sw/rdmavt/ah.c @@ -120,7 +120,8 @@ struct ib_ah *rvt_create_ah(struct ib_pd *pd, dev->n_ahs_allocated++; spin_unlock_irqrestore(&dev->n_ahs_lock, flags); - ah->attr = *ah_attr; + rdma_copy_ah_attr(&ah->attr, ah_attr); + atomic_set(&ah->refcount, 0); if (dev->driver_f.notify_new_ah) @@ -148,6 +149,7 @@ int rvt_destroy_ah(struct ib_ah *ibah) dev->n_ahs_allocated--; spin_unlock_irqrestore(&dev->n_ahs_lock, flags); + rdma_destroy_ah_attr(&ah->attr); kfree(ah); return 0; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 41183bd665ca..5ce403c6cddb 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -780,14 +780,15 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, if (!rdi) return ERR_PTR(-EINVAL); - if (init_attr->cap.max_send_sge > rdi->dparms.props.max_sge || + if (init_attr->cap.max_send_sge > rdi->dparms.props.max_send_sge || init_attr->cap.max_send_wr > rdi->dparms.props.max_qp_wr || init_attr->create_flags) return ERR_PTR(-EINVAL); /* Check receive queue parameters if no SRQ is specified. */ if (!init_attr->srq) { - if (init_attr->cap.max_recv_sge > rdi->dparms.props.max_sge || + if (init_attr->cap.max_recv_sge > + rdi->dparms.props.max_recv_sge || init_attr->cap.max_recv_wr > rdi->dparms.props.max_qp_wr) return ERR_PTR(-EINVAL); @@ -1336,13 +1337,13 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp->qp_access_flags = attr->qp_access_flags; if (attr_mask & IB_QP_AV) { - qp->remote_ah_attr = attr->ah_attr; + rdma_replace_ah_attr(&qp->remote_ah_attr, &attr->ah_attr); qp->s_srate = rdma_ah_get_static_rate(&attr->ah_attr); qp->srate_mbps = ib_rate_to_mbps(qp->s_srate); } if (attr_mask & IB_QP_ALT_PATH) { - qp->alt_ah_attr = attr->alt_ah_attr; + rdma_replace_ah_attr(&qp->alt_ah_attr, &attr->alt_ah_attr); qp->s_alt_pkey_index = attr->alt_pkey_index; } @@ -1459,6 +1460,8 @@ int rvt_destroy_qp(struct ib_qp *ibqp) vfree(qp->s_wq); rdi->driver_f.qp_priv_free(rdi, qp); kfree(qp->s_ack_queue); + rdma_destroy_ah_attr(&qp->remote_ah_attr); + rdma_destroy_ah_attr(&qp->alt_ah_attr); kfree(qp); return 0; } @@ -1535,8 +1538,8 @@ int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, * * Return: 0 on success otherwise errno */ -int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); struct rvt_rwq *wq = qp->r_rq.wq; @@ -1617,7 +1620,7 @@ int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, static inline int rvt_qp_valid_operation( struct rvt_qp *qp, const struct rvt_operation_params *post_parms, - struct ib_send_wr *wr) + const struct ib_send_wr *wr) { int len; @@ -1714,7 +1717,7 @@ static inline int rvt_qp_is_avail( * @wr: the work request to send */ static int rvt_post_one_wr(struct rvt_qp *qp, - struct ib_send_wr *wr, + const struct ib_send_wr *wr, int *call_send) { struct rvt_swqe *wqe; @@ -1888,8 +1891,8 @@ bail_inval_free: * * Return: 0 on success else errno */ -int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); struct rvt_dev_info *rdi = 
ib_to_rvt(ibqp->device); @@ -1945,8 +1948,8 @@ bail: * * Return: 0 on success else errno */ -int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); struct rvt_rwq *wq; diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h index 8409f80d5f25..264811fdc530 100644 --- a/drivers/infiniband/sw/rdmavt/qp.h +++ b/drivers/infiniband/sw/rdmavt/qp.h @@ -60,10 +60,10 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int rvt_destroy_qp(struct ib_qp *ibqp); int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_qp_init_attr *init_attr); -int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); -int rvt_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr); -int rvt_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr); +int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); #endif /* DEF_RVTQP_H */ diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c index 3707952b4364..78e06fc456c5 100644 --- a/drivers/infiniband/sw/rdmavt/srq.c +++ b/drivers/infiniband/sw/rdmavt/srq.c @@ -82,7 +82,7 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, struct ib_srq *ret; if (srq_init_attr->srq_type != IB_SRQT_BASIC) - return ERR_PTR(-ENOSYS); + return ERR_PTR(-EOPNOTSUPP); if (srq_init_attr->attr.max_sge == 0 || srq_init_attr->attr.max_sge > dev->dparms.props.max_srq_sge || diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 7121e1b1eb89..10999fa69281 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -91,7 +91,8 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.max_qp = RXE_MAX_QP; rxe->attr.max_qp_wr = RXE_MAX_QP_WR; rxe->attr.device_cap_flags = RXE_DEVICE_CAP_FLAGS; - rxe->attr.max_sge = RXE_MAX_SGE; + rxe->attr.max_send_sge = RXE_MAX_SGE; + rxe->attr.max_recv_sge = RXE_MAX_SGE; rxe->attr.max_sge_rd = RXE_MAX_SGE_RD; rxe->attr.max_cq = RXE_MAX_CQ; rxe->attr.max_cqe = (1 << RXE_MAX_LOG_CQE) - 1; diff --git a/drivers/infiniband/sw/rxe/rxe_av.c b/drivers/infiniband/sw/rxe/rxe_av.c index 7f1ae364088a..26fe8d7dbc55 100644 --- a/drivers/infiniband/sw/rxe/rxe_av.c +++ b/drivers/infiniband/sw/rxe/rxe_av.c @@ -55,29 +55,41 @@ int rxe_av_chk_attr(struct rxe_dev *rxe, struct rdma_ah_attr *attr) void rxe_av_from_attr(u8 port_num, struct rxe_av *av, struct rdma_ah_attr *attr) { + const struct ib_global_route *grh = rdma_ah_read_grh(attr); + memset(av, 0, sizeof(*av)); - memcpy(&av->grh, rdma_ah_read_grh(attr), - sizeof(*rdma_ah_read_grh(attr))); + memcpy(av->grh.dgid.raw, grh->dgid.raw, sizeof(grh->dgid.raw)); + av->grh.flow_label = grh->flow_label; + av->grh.sgid_index = grh->sgid_index; + av->grh.hop_limit = grh->hop_limit; + av->grh.traffic_class = grh->traffic_class; av->port_num = port_num; } void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr) { + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + attr->type = RDMA_AH_ATTR_TYPE_ROCE; - 
memcpy(rdma_ah_retrieve_grh(attr), &av->grh, sizeof(av->grh)); + + memcpy(grh->dgid.raw, av->grh.dgid.raw, sizeof(av->grh.dgid.raw)); + grh->flow_label = av->grh.flow_label; + grh->sgid_index = av->grh.sgid_index; + grh->hop_limit = av->grh.hop_limit; + grh->traffic_class = av->grh.traffic_class; + rdma_ah_set_ah_flags(attr, IB_AH_GRH); rdma_ah_set_port_num(attr, av->port_num); } -void rxe_av_fill_ip_info(struct rxe_av *av, - struct rdma_ah_attr *attr, - struct ib_gid_attr *sgid_attr, - union ib_gid *sgid) +void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr) { - rdma_gid2ip((struct sockaddr *)&av->sgid_addr, sgid); + const struct ib_gid_attr *sgid_attr = attr->grh.sgid_attr; + + rdma_gid2ip((struct sockaddr *)&av->sgid_addr, &sgid_attr->gid); rdma_gid2ip((struct sockaddr *)&av->dgid_addr, &rdma_ah_read_grh(attr)->dgid); - av->network_type = ib_gid_to_network_type(sgid_attr->gid_type, sgid); + av->network_type = rdma_gid_attr_network_type(sgid_attr); } struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 98d470d1f3fc..83311dd07019 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -276,6 +276,7 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: if (wqe->wr.opcode != IB_WR_RDMA_READ && wqe->wr.opcode != IB_WR_RDMA_READ_WITH_INV) { + wqe->status = IB_WC_FATAL_ERR; return COMPST_ERROR; } reset_retry_counters(qp); diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index a51ece596c43..87d14f7ef21b 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -43,10 +43,7 @@ void rxe_av_from_attr(u8 port_num, struct rxe_av *av, void rxe_av_to_attr(struct rxe_av *av, struct rdma_ah_attr *attr); -void rxe_av_fill_ip_info(struct rxe_av *av, - struct rdma_ah_attr *attr, - struct ib_gid_attr *sgid_attr, - union ib_gid *sgid); +void rxe_av_fill_ip_info(struct rxe_av *av, struct rdma_ah_attr *attr); struct rxe_av *rxe_get_av(struct rxe_pkt_info *pkt); diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 59ec6d918ed4..8094cbaa54a9 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -182,39 +182,19 @@ static struct dst_entry *rxe_find_route6(struct net_device *ndev, #endif -/* - * Derive the net_device from the av. - * For physical devices, this will just return rxe->ndev. - * But for VLAN devices, it will return the vlan dev. - * Caller should dev_put() the returned net_device. 
- */ -static struct net_device *rxe_netdev_from_av(struct rxe_dev *rxe, - int port_num, - struct rxe_av *av) -{ - union ib_gid gid; - struct ib_gid_attr attr; - struct net_device *ndev = rxe->ndev; - - if (ib_get_cached_gid(&rxe->ib_dev, port_num, av->grh.sgid_index, - &gid, &attr) == 0 && - attr.ndev && attr.ndev != ndev) - ndev = attr.ndev; - else - /* Only to ensure that caller may call dev_put() */ - dev_hold(ndev); - - return ndev; -} - static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_av *av) { + const struct ib_gid_attr *attr; struct dst_entry *dst = NULL; struct net_device *ndev; - ndev = rxe_netdev_from_av(rxe, qp->attr.port_num, av); + attr = rdma_get_gid_attr(&rxe->ib_dev, qp->attr.port_num, + av->grh.sgid_index); + if (IS_ERR(attr)) + return NULL; + ndev = attr->ndev; if (qp_type(qp) == IB_QPT_RC) dst = sk_dst_get(qp->sk->sk); @@ -243,9 +223,13 @@ static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, rt6_get_cookie((struct rt6_info *)dst); #endif } - } - dev_put(ndev); + if (dst && (qp_type(qp) == IB_QPT_RC)) { + dst_hold(dst); + sk_dst_set(qp->sk->sk, dst); + } + } + rdma_put_gid_attr(attr); return dst; } @@ -418,11 +402,7 @@ static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, av->grh.traffic_class, av->grh.hop_limit, df, xnet); - if (qp_type(qp) == IB_QPT_RC) - sk_dst_set(qp->sk->sk, dst); - else - dst_release(dst); - + dst_release(dst); return 0; } @@ -450,11 +430,7 @@ static int prepare6(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, av->grh.traffic_class, av->grh.hop_limit); - if (qp_type(qp) == IB_QPT_RC) - sk_dst_set(qp->sk->sk, dst); - else - dst_release(dst); - + dst_release(dst); return 0; } @@ -536,9 +512,13 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, unsigned int hdr_len; struct sk_buff *skb; struct net_device *ndev; + const struct ib_gid_attr *attr; const int port_num = 1; - ndev = rxe_netdev_from_av(rxe, port_num, av); + attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index); + if (IS_ERR(attr)) + return NULL; + ndev = attr->ndev; if (av->network_type == RDMA_NETWORK_IPV4) hdr_len = ETH_HLEN + sizeof(struct udphdr) + @@ -550,10 +530,8 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev), GFP_ATOMIC); - if (unlikely(!skb)) { - dev_put(ndev); - return NULL; - } + if (unlikely(!skb)) + goto out; skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(rxe->ndev)); @@ -568,7 +546,8 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, pkt->hdr = skb_put_zero(skb, paylen); pkt->mask |= RXE_GRH_MASK; - dev_put(ndev); +out: + rdma_put_gid_attr(attr); return skb; } diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 1b596fbbe251..4555510d86c4 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -83,7 +83,7 @@ enum rxe_device_param { RXE_MAX_SGE_RD = 32, RXE_MAX_CQ = 16384, RXE_MAX_LOG_CQE = 15, - RXE_MAX_MR = 2 * 1024, + RXE_MAX_MR = 256 * 1024, RXE_MAX_PD = 0x7ffc, RXE_MAX_QP_RD_ATOM = 128, RXE_MAX_EE_RD_ATOM = 0, diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index b9f7aa1114b2..c58452daffc7 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -49,9 +49,9 @@ static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, goto err1; } - 
if (cap->max_send_sge > rxe->attr.max_sge) { + if (cap->max_send_sge > rxe->attr.max_send_sge) { pr_warn("invalid send sge = %d > %d\n", - cap->max_send_sge, rxe->attr.max_sge); + cap->max_send_sge, rxe->attr.max_send_sge); goto err1; } @@ -62,9 +62,9 @@ static int rxe_qp_chk_cap(struct rxe_dev *rxe, struct ib_qp_cap *cap, goto err1; } - if (cap->max_recv_sge > rxe->attr.max_sge) { + if (cap->max_recv_sge > rxe->attr.max_recv_sge) { pr_warn("invalid recv sge = %d > %d\n", - cap->max_recv_sge, rxe->attr.max_sge); + cap->max_recv_sge, rxe->attr.max_recv_sge); goto err1; } } @@ -580,9 +580,6 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, struct ib_udata *udata) { int err; - struct rxe_dev *rxe = to_rdev(qp->ibqp.device); - union ib_gid sgid; - struct ib_gid_attr sgid_attr; if (mask & IB_QP_MAX_QP_RD_ATOMIC) { int max_rd_atomic = __roundup_pow_of_two(attr->max_rd_atomic); @@ -623,30 +620,14 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask, qp->attr.qkey = attr->qkey; if (mask & IB_QP_AV) { - ib_get_cached_gid(&rxe->ib_dev, 1, - rdma_ah_read_grh(&attr->ah_attr)->sgid_index, - &sgid, &sgid_attr); rxe_av_from_attr(attr->port_num, &qp->pri_av, &attr->ah_attr); - rxe_av_fill_ip_info(&qp->pri_av, &attr->ah_attr, - &sgid_attr, &sgid); - if (sgid_attr.ndev) - dev_put(sgid_attr.ndev); + rxe_av_fill_ip_info(&qp->pri_av, &attr->ah_attr); } if (mask & IB_QP_ALT_PATH) { - u8 sgid_index = - rdma_ah_read_grh(&attr->alt_ah_attr)->sgid_index; - - ib_get_cached_gid(&rxe->ib_dev, 1, sgid_index, - &sgid, &sgid_attr); - rxe_av_from_attr(attr->alt_port_num, &qp->alt_av, &attr->alt_ah_attr); - rxe_av_fill_ip_info(&qp->alt_av, &attr->alt_ah_attr, - &sgid_attr, &sgid); - if (sgid_attr.ndev) - dev_put(sgid_attr.ndev); - + rxe_av_fill_ip_info(&qp->alt_av, &attr->alt_ah_attr); qp->attr.alt_port_num = attr->alt_port_num; qp->attr.alt_pkey_index = attr->alt_pkey_index; qp->attr.alt_timeout = attr->alt_timeout; diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index dfba44a40f0b..d30dbac24583 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -225,9 +225,14 @@ static int hdr_check(struct rxe_pkt_info *pkt) goto err1; } + if (unlikely(qpn == 0)) { + pr_warn_once("QP 0 not supported"); + goto err1; + } + if (qpn != IB_MULTICAST_QPN) { - index = (qpn == 0) ? port->qp_smi_index : - ((qpn == 1) ? port->qp_gsi_index : qpn); + index = (qpn == 1) ? 
port->qp_gsi_index : qpn; + qp = rxe_pool_get_index(&rxe->qp_pool, index); if (unlikely(!qp)) { pr_warn_ratelimited("no qp matches qpn 0x%x\n", qpn); @@ -256,8 +261,7 @@ static int hdr_check(struct rxe_pkt_info *pkt) return 0; err2: - if (qp) - rxe_drop_ref(qp); + rxe_drop_ref(qp); err1: return -EINVAL; } @@ -328,6 +332,7 @@ err1: static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb) { + const struct ib_gid_attr *gid_attr; union ib_gid dgid; union ib_gid *pdgid; @@ -339,9 +344,14 @@ static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb) pdgid = (union ib_gid *)&ipv6_hdr(skb)->daddr; } - return ib_find_cached_gid_by_port(&rxe->ib_dev, pdgid, - IB_GID_TYPE_ROCE_UDP_ENCAP, - 1, skb->dev, NULL); + gid_attr = rdma_find_gid_by_port(&rxe->ib_dev, pdgid, + IB_GID_TYPE_ROCE_UDP_ENCAP, + 1, skb->dev); + if (IS_ERR(gid_attr)) + return PTR_ERR(gid_attr); + + rdma_put_gid_attr(gid_attr); + return 0; } /* rxe_rcv is called from the interface driver */ diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 5b57de30dee4..aa5833318372 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -884,6 +884,11 @@ static enum resp_states do_complete(struct rxe_qp *qp, else wc->network_hdr_type = RDMA_NETWORK_IPV6; + if (is_vlan_dev(skb->dev)) { + wc->wc_flags |= IB_WC_WITH_VLAN; + wc->vlan_id = vlan_dev_vlan_id(skb->dev); + } + if (pkt->mask & RXE_IMMDT_MASK) { wc->wc_flags |= IB_WC_WITH_IMM; wc->ex.imm_data = immdt_imm(pkt); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 9deafc3aa6af..f5b1e0ad6142 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -222,25 +222,11 @@ static int rxe_dealloc_pd(struct ib_pd *ibpd) return 0; } -static int rxe_init_av(struct rxe_dev *rxe, struct rdma_ah_attr *attr, - struct rxe_av *av) +static void rxe_init_av(struct rxe_dev *rxe, struct rdma_ah_attr *attr, + struct rxe_av *av) { - int err; - union ib_gid sgid; - struct ib_gid_attr sgid_attr; - - err = ib_get_cached_gid(&rxe->ib_dev, rdma_ah_get_port_num(attr), - rdma_ah_read_grh(attr)->sgid_index, &sgid, - &sgid_attr); - if (err) { - pr_err("Failed to query sgid. 
err = %d\n", err); - return err; - } - rxe_av_from_attr(rdma_ah_get_port_num(attr), av, attr); - rxe_av_fill_ip_info(av, attr, &sgid_attr, &sgid); - dev_put(sgid_attr.ndev); - return 0; + rxe_av_fill_ip_info(av, attr); } static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd, @@ -255,28 +241,17 @@ static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd, err = rxe_av_chk_attr(rxe, attr); if (err) - goto err1; + return ERR_PTR(err); ah = rxe_alloc(&rxe->ah_pool); - if (!ah) { - err = -ENOMEM; - goto err1; - } + if (!ah) + return ERR_PTR(-ENOMEM); rxe_add_ref(pd); ah->pd = pd; - err = rxe_init_av(rxe, attr, &ah->av); - if (err) - goto err2; - + rxe_init_av(rxe, attr, &ah->av); return &ah->ibah; - -err2: - rxe_drop_ref(pd); - rxe_drop_ref(ah); -err1: - return ERR_PTR(err); } static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) @@ -289,10 +264,7 @@ static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) if (err) return err; - err = rxe_init_av(rxe, attr, &ah->av); - if (err) - return err; - + rxe_init_av(rxe, attr, &ah->av); return 0; } @@ -315,7 +287,7 @@ static int rxe_destroy_ah(struct ib_ah *ibah) return 0; } -static int post_one_recv(struct rxe_rq *rq, struct ib_recv_wr *ibwr) +static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) { int err; int i; @@ -466,8 +438,8 @@ static int rxe_destroy_srq(struct ib_srq *ibsrq) return 0; } -static int rxe_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { int err = 0; unsigned long flags; @@ -582,7 +554,7 @@ static int rxe_destroy_qp(struct ib_qp *ibqp) return 0; } -static int validate_send_wr(struct rxe_qp *qp, struct ib_send_wr *ibwr, +static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int mask, unsigned int length) { int num_sge = ibwr->num_sge; @@ -610,7 +582,7 @@ err1: } static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, - struct ib_send_wr *ibwr) + const struct ib_send_wr *ibwr) { wr->wr_id = ibwr->wr_id; wr->num_sge = ibwr->num_sge; @@ -665,7 +637,7 @@ static void init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, } } -static int init_send_wqe(struct rxe_qp *qp, struct ib_send_wr *ibwr, +static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int mask, unsigned int length, struct rxe_send_wqe *wqe) { @@ -713,7 +685,7 @@ static int init_send_wqe(struct rxe_qp *qp, struct ib_send_wr *ibwr, return 0; } -static int post_one_send(struct rxe_qp *qp, struct ib_send_wr *ibwr, +static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int mask, u32 length) { int err; @@ -754,8 +726,8 @@ err1: return err; } -static int rxe_post_send_kernel(struct rxe_qp *qp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { int err = 0; unsigned int mask; @@ -797,8 +769,8 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, struct ib_send_wr *wr, return err; } -static int rxe_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, - struct ib_send_wr **bad_wr) +static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) { struct rxe_qp *qp = to_rqp(ibqp); @@ -820,8 +792,8 @@ static int rxe_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, return rxe_post_send_kernel(qp, wr, bad_wr); } 
-static int rxe_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, - struct ib_recv_wr **bad_wr) +static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) { int err = 0; struct rxe_qp *qp = to_rqp(ibqp); diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index a50b062ed13e..1abe3c62f106 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -91,11 +91,9 @@ enum { IPOIB_STOP_REAPER = 7, IPOIB_FLAG_ADMIN_CM = 9, IPOIB_FLAG_UMCAST = 10, - IPOIB_STOP_NEIGH_GC = 11, IPOIB_NEIGH_TBL_FLUSH = 12, IPOIB_FLAG_DEV_ADDR_SET = 13, IPOIB_FLAG_DEV_ADDR_CTRL = 14, - IPOIB_FLAG_GOING_DOWN = 15, IPOIB_MAX_BACKOFF_SECONDS = 16, @@ -252,11 +250,11 @@ struct ipoib_cm_tx { struct ipoib_neigh *neigh; struct ipoib_path *path; struct ipoib_tx_buf *tx_ring; - unsigned tx_head; - unsigned tx_tail; + unsigned int tx_head; + unsigned int tx_tail; unsigned long flags; u32 mtu; - unsigned max_send_sge; + unsigned int max_send_sge; }; struct ipoib_cm_rx_buf { @@ -325,15 +323,22 @@ struct ipoib_dev_priv { spinlock_t lock; struct net_device *dev; + void (*next_priv_destructor)(struct net_device *dev); struct napi_struct send_napi; struct napi_struct recv_napi; unsigned long flags; + /* + * This protects access to the child_intfs list. + * To READ from child_intfs the RTNL or vlan_rwsem read side must be + * held. To WRITE RTNL and the vlan_rwsem write side must be held (in + * that order) This lock exists because we have a few contexts where + * we need the child_intfs, but do not want to grab the RTNL. + */ struct rw_semaphore vlan_rwsem; struct mutex mcast_mutex; - struct mutex sysfs_mutex; struct rb_root path_tree; struct list_head path_list; @@ -373,8 +378,8 @@ struct ipoib_dev_priv { struct ipoib_rx_buf *rx_ring; struct ipoib_tx_buf *tx_ring; - unsigned tx_head; - unsigned tx_tail; + unsigned int tx_head; + unsigned int tx_tail; struct ib_sge tx_sge[MAX_SKB_FRAGS + 1]; struct ib_ud_wr tx_wr; struct ib_wc send_wc[MAX_SEND_CQE]; @@ -404,7 +409,7 @@ struct ipoib_dev_priv { #endif u64 hca_caps; struct ipoib_ethtool_st ethtool; - unsigned max_send_sge; + unsigned int max_send_sge; bool sm_fullmember_sendonly_support; const struct net_device_ops *rn_ops; }; @@ -414,7 +419,7 @@ struct ipoib_ah { struct ib_ah *ah; struct list_head list; struct kref ref; - unsigned last_send; + unsigned int last_send; int valid; }; @@ -483,6 +488,7 @@ static inline void ipoib_put_ah(struct ipoib_ah *ah) kref_put(&ah->ref, ipoib_free_ah); } int ipoib_open(struct net_device *dev); +void ipoib_intf_free(struct net_device *dev); int ipoib_add_pkey_attr(struct net_device *dev); int ipoib_add_umcast_attr(struct net_device *dev); @@ -510,9 +516,6 @@ void ipoib_ib_dev_down(struct net_device *dev); int ipoib_ib_dev_stop_default(struct net_device *dev); void ipoib_pkey_dev_check_presence(struct net_device *dev); -int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); -void ipoib_dev_cleanup(struct net_device *dev); - void ipoib_mcast_join_task(struct work_struct *work); void ipoib_mcast_carrier_on_task(struct work_struct *work); void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); @@ -600,7 +603,6 @@ void ipoib_pkey_open(struct ipoib_dev_priv *priv); void ipoib_drain_cq(struct net_device *dev); void ipoib_set_ethtool_ops(struct net_device *dev); -void ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca); #define IPOIB_FLAGS_RC 0x80 #define IPOIB_FLAGS_UC 
0x40 @@ -729,7 +731,7 @@ void ipoib_cm_dev_stop(struct net_device *dev) static inline int ipoib_cm_dev_init(struct net_device *dev) { - return -ENOSYS; + return -EOPNOTSUPP; } static inline diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 6535d9beb24d..ea01b8dd2be6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -78,7 +78,7 @@ static struct ib_send_wr ipoib_cm_rx_drain_wr = { }; static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, - struct ib_cm_event *event); + const struct ib_cm_event *event); static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, u64 mapping[IPOIB_CM_RX_SG]) @@ -94,7 +94,6 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags, static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - struct ib_recv_wr *bad_wr; int i, ret; priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; @@ -102,7 +101,7 @@ static int ipoib_cm_post_receive_srq(struct net_device *dev, int id) for (i = 0; i < priv->cm.num_frags; ++i) priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i]; - ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr); + ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, NULL); if (unlikely(ret)) { ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret); ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1, @@ -120,7 +119,6 @@ static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, struct ib_sge *sge, int id) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - struct ib_recv_wr *bad_wr; int i, ret; wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV; @@ -128,7 +126,7 @@ static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, for (i = 0; i < IPOIB_CM_RX_SG; ++i) sge[i].addr = rx->rx_ring[id].mapping[i]; - ret = ib_post_recv(rx->qp, wr, &bad_wr); + ret = ib_post_recv(rx->qp, wr, NULL); if (unlikely(ret)) { ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret); ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1, @@ -212,7 +210,6 @@ static void ipoib_cm_free_rx_ring(struct net_device *dev, static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) { - struct ib_send_wr *bad_wr; struct ipoib_cm_rx *p; /* We only reserved 1 extra slot in CQ for drain WRs, so @@ -227,7 +224,7 @@ static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) */ p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID; - if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) + if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, NULL)) ipoib_warn(priv, "failed to post drain wr\n"); list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list); @@ -275,7 +272,7 @@ static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev, static int ipoib_cm_modify_rx_qp(struct net_device *dev, struct ib_cm_id *cm_id, struct ib_qp *qp, - unsigned psn) + unsigned int psn) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ib_qp_attr qp_attr; @@ -363,7 +360,7 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i if (!rx->rx_ring) return -ENOMEM; - t = kmalloc(sizeof *t, GFP_KERNEL); + t = kmalloc(sizeof(*t), GFP_KERNEL); if (!t) { ret = -ENOMEM; goto err_free_1; @@ -421,8 +418,9 @@ err_free_1: } static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, - struct ib_qp *qp, struct ib_cm_req_event_param *req, - unsigned psn) + struct ib_qp *qp, + const 
struct ib_cm_req_event_param *req, + unsigned int psn) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_data data = {}; @@ -432,7 +430,7 @@ static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE); rep.private_data = &data; - rep.private_data_len = sizeof data; + rep.private_data_len = sizeof(data); rep.flow_control = 0; rep.rnr_retry_count = req->rnr_retry_count; rep.srq = ipoib_cm_has_srq(dev); @@ -441,16 +439,17 @@ static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id, return ib_send_cm_rep(cm_id, &rep); } -static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event) { struct net_device *dev = cm_id->context; struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_rx *p; - unsigned psn; + unsigned int psn; int ret; ipoib_dbg(priv, "REQ arrived\n"); - p = kzalloc(sizeof *p, GFP_KERNEL); + p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return -ENOMEM; p->dev = dev; @@ -503,7 +502,7 @@ err_qp: } static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, - struct ib_cm_event *event) + const struct ib_cm_event *event) { struct ipoib_cm_rx *p; struct ipoib_dev_priv *priv; @@ -547,7 +546,7 @@ static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space, 0, PAGE_SIZE); --skb_shinfo(skb)->nr_frags; } else { - size = min(length, (unsigned) PAGE_SIZE); + size = min_t(unsigned int, length, PAGE_SIZE); skb_frag_size_set(frag, size); skb->data_len += size; @@ -641,8 +640,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) } } - frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, - (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; + frags = PAGE_ALIGN(wc->byte_len - + min_t(u32, wc->byte_len, IPOIB_CM_HEAD_SIZE)) / + PAGE_SIZE; newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping, GFP_ATOMIC); @@ -657,7 +657,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) } ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping); - memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping); + memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof(*mapping)); ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n", wc->byte_len, wc->slid); @@ -698,13 +698,11 @@ static inline int post_send(struct ipoib_dev_priv *priv, unsigned int wr_id, struct ipoib_tx_buf *tx_req) { - struct ib_send_wr *bad_wr; - ipoib_build_sge(priv, tx_req); priv->tx_wr.wr.wr_id = wr_id | IPOIB_OP_CM; - return ib_post_send(tx->qp, &priv->tx_wr.wr, &bad_wr); + return ib_post_send(tx->qp, &priv->tx_wr.wr, NULL); } void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx) @@ -712,7 +710,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_tx_buf *tx_req; int rc; - unsigned usable_sge = tx->max_send_sge - !!skb_headlen(skb); + unsigned int usable_sge = tx->max_send_sge - !!skb_headlen(skb); if (unlikely(skb->len > tx->mtu)) { ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n", @@ -982,7 +980,8 @@ void ipoib_cm_dev_stop(struct net_device *dev) cancel_delayed_work(&priv->cm.stale_task); } -static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event) { struct ipoib_cm_tx *p = cm_id->context; struct 
ipoib_dev_priv *priv = ipoib_priv(p->dev); @@ -1068,8 +1067,8 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_ struct ib_qp *tx_qp; if (dev->features & NETIF_F_SG) - attr.cap.max_send_sge = - min_t(u32, priv->ca->attrs.max_sge, MAX_SKB_FRAGS + 1); + attr.cap.max_send_sge = min_t(u32, priv->ca->attrs.max_send_sge, + MAX_SKB_FRAGS + 1); tx_qp = ib_create_qp(priv->pd, &attr); tx->max_send_sge = attr.cap.max_send_sge; @@ -1094,7 +1093,7 @@ static int ipoib_cm_send_req(struct net_device *dev, req.qp_num = qp->qp_num; req.qp_type = qp->qp_type; req.private_data = &data; - req.private_data_len = sizeof data; + req.private_data_len = sizeof(data); req.flow_control = 0; req.starting_psn = 0; /* FIXME */ @@ -1152,7 +1151,7 @@ static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn, ret = -ENOMEM; goto err_tx; } - memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring); + memset(p->tx_ring, 0, ipoib_sendq_size * sizeof(*p->tx_ring)); p->qp = ipoib_cm_create_tx_qp(p->dev, p); memalloc_noio_restore(noio_flag); @@ -1248,7 +1247,7 @@ timeout: } static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id, - struct ib_cm_event *event) + const struct ib_cm_event *event) { struct ipoib_cm_tx *tx = cm_id->context; struct ipoib_dev_priv *priv = ipoib_priv(tx->dev); @@ -1305,7 +1304,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_cm_tx *tx; - tx = kzalloc(sizeof *tx, GFP_ATOMIC); + tx = kzalloc(sizeof(*tx), GFP_ATOMIC); if (!tx) return NULL; @@ -1370,7 +1369,7 @@ static void ipoib_cm_tx_start(struct work_struct *work) neigh->daddr + QPN_AND_OPTIONS_OFFSET); goto free_neigh; } - memcpy(&pathrec, &p->path->pathrec, sizeof pathrec); + memcpy(&pathrec, &p->path->pathrec, sizeof(pathrec)); spin_unlock_irqrestore(&priv->lock, flags); netif_tx_unlock_bh(dev); @@ -1428,7 +1427,7 @@ static void ipoib_cm_skb_reap(struct work_struct *work) struct net_device *dev = priv->dev; struct sk_buff *skb; unsigned long flags; - unsigned mtu = priv->mcast_mtu; + unsigned int mtu = priv->mcast_mtu; netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); @@ -1518,19 +1517,16 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, { struct net_device *dev = to_net_dev(d); int ret; - struct ipoib_dev_priv *priv = ipoib_priv(dev); - - if (test_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags)) - return -EPERM; - - if (!mutex_trylock(&priv->sysfs_mutex)) - return restart_syscall(); if (!rtnl_trylock()) { - mutex_unlock(&priv->sysfs_mutex); return restart_syscall(); } + if (dev->reg_state != NETREG_REGISTERED) { + rtnl_unlock(); + return -EPERM; + } + ret = ipoib_set_mode(dev, buf); /* The assumption is that the function ipoib_set_mode returned @@ -1539,7 +1535,6 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, */ if (ret != -EBUSY) rtnl_unlock(); - mutex_unlock(&priv->sysfs_mutex); return (!ret || ret == -EBUSY) ? 
count : ret; } @@ -1564,7 +1559,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); if (IS_ERR(priv->cm.srq)) { - if (PTR_ERR(priv->cm.srq) != -ENOSYS) + if (PTR_ERR(priv->cm.srq) != -EOPNOTSUPP) pr_warn("%s: failed to allocate SRQ, error %ld\n", priv->ca->name, PTR_ERR(priv->cm.srq)); priv->cm.srq = NULL; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 2706bf26cbac..83429925dfc6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -102,7 +102,7 @@ static int ipoib_set_coalesce(struct net_device *dev, ret = rdma_set_cq_moderation(priv->recv_cq, coal->rx_max_coalesced_frames, coal->rx_coalesce_usecs); - if (ret && ret != -ENOSYS) { + if (ret && ret != -EOPNOTSUPP) { ipoib_warn(priv, "failed modifying CQ (%d)\n", ret); return ret; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c index ea302b054601..178488028734 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -262,15 +262,15 @@ static const struct file_operations ipoib_path_fops = { void ipoib_create_debug_files(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - char name[IFNAMSIZ + sizeof "_path"]; + char name[IFNAMSIZ + sizeof("_path")]; - snprintf(name, sizeof name, "%s_mcg", dev->name); + snprintf(name, sizeof(name), "%s_mcg", dev->name); priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, ipoib_root, dev, &ipoib_mcg_fops); if (!priv->mcg_dentry) ipoib_warn(priv, "failed to create mcg debug file\n"); - snprintf(name, sizeof name, "%s_path", dev->name); + snprintf(name, sizeof(name), "%s_path", dev->name); priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, ipoib_root, dev, &ipoib_path_fops); if (!priv->path_dentry) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index f47f9ace1f48..9006a13af1de 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -40,6 +40,7 @@ #include <linux/ip.h> #include <linux/tcp.h> +#include <rdma/ib_cache.h> #include "ipoib.h" @@ -57,7 +58,7 @@ struct ipoib_ah *ipoib_create_ah(struct net_device *dev, struct ipoib_ah *ah; struct ib_ah *vah; - ah = kmalloc(sizeof *ah, GFP_KERNEL); + ah = kmalloc(sizeof(*ah), GFP_KERNEL); if (!ah) return ERR_PTR(-ENOMEM); @@ -100,7 +101,6 @@ static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv, static int ipoib_ib_post_receive(struct net_device *dev, int id) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - struct ib_recv_wr *bad_wr; int ret; priv->rx_wr.wr_id = id | IPOIB_OP_RECV; @@ -108,7 +108,7 @@ static int ipoib_ib_post_receive(struct net_device *dev, int id) priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1]; - ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr); + ret = ib_post_recv(priv->qp, &priv->rx_wr, NULL); if (unlikely(ret)) { ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret); ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping); @@ -202,7 +202,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) } memcpy(mapping, priv->rx_ring[wr_id].mapping, - IPOIB_UD_RX_SG * sizeof *mapping); + IPOIB_UD_RX_SG * sizeof(*mapping)); /* * If we can't allocate a new RX buffer, dump @@ -541,7 +541,6 @@ static inline int post_send(struct ipoib_dev_priv *priv, struct ipoib_tx_buf *tx_req, void 
*head, int hlen) { - struct ib_send_wr *bad_wr; struct sk_buff *skb = tx_req->skb; ipoib_build_sge(priv, tx_req); @@ -558,7 +557,7 @@ static inline int post_send(struct ipoib_dev_priv *priv, } else priv->tx_wr.wr.opcode = IB_WR_SEND; - return ib_post_send(priv->qp, &priv->tx_wr.wr, &bad_wr); + return ib_post_send(priv->qp, &priv->tx_wr.wr, NULL); } int ipoib_send(struct net_device *dev, struct sk_buff *skb, @@ -568,7 +567,7 @@ int ipoib_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_tx_buf *tx_req; int hlen, rc; void *phead; - unsigned usable_sge = priv->max_send_sge - !!skb_headlen(skb); + unsigned int usable_sge = priv->max_send_sge - !!skb_headlen(skb); if (skb_is_gso(skb)) { hlen = skb_transport_offset(skb) + tcp_hdrlen(skb); @@ -1069,7 +1068,7 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv) bool ret = false; netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4); - if (ib_query_gid(priv->ca, priv->port, 0, &gid0, NULL)) + if (rdma_query_gid(priv->ca, priv->port, 0, &gid0)) return false; netif_addr_lock_bh(priv->dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 26cde95bc0f3..e3d28f9ad9c0 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -215,11 +215,6 @@ static int ipoib_stop(struct net_device *dev) return 0; } -static void ipoib_uninit(struct net_device *dev) -{ - ipoib_dev_cleanup(dev); -} - static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) { struct ipoib_dev_priv *priv = ipoib_priv(dev); @@ -634,7 +629,7 @@ struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) { struct ipoib_path_iter *iter; - iter = kmalloc(sizeof *iter, GFP_KERNEL); + iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return NULL; @@ -770,8 +765,10 @@ static void path_rec_completion(int status, struct rdma_ah_attr av; if (!ib_init_ah_attr_from_path(priv->ca, priv->port, - pathrec, &av)) + pathrec, &av, NULL)) { ah = ipoib_create_ah(dev, priv->pd, &av); + rdma_destroy_ah_attr(&av); + } } spin_lock_irqsave(&priv->lock, flags); @@ -883,7 +880,7 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) if (!priv->broadcast) return NULL; - path = kzalloc(sizeof *path, GFP_ATOMIC); + path = kzalloc(sizeof(*path), GFP_ATOMIC); if (!path) return NULL; @@ -1199,11 +1196,13 @@ static void ipoib_timeout(struct net_device *dev) static int ipoib_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, - const void *daddr, const void *saddr, unsigned len) + const void *daddr, + const void *saddr, + unsigned int len) { struct ipoib_header *header; - header = skb_push(skb, sizeof *header); + header = skb_push(skb, sizeof(*header)); header->proto = htons(type); header->reserved = 0; @@ -1306,9 +1305,6 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) int i; LIST_HEAD(remove_list); - if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) - return; - spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, @@ -1320,9 +1316,6 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) /* neigh is obsolete if it was idle for two GC periods */ dt = 2 * arp_tbl.gc_interval; neigh_obsolete = jiffies - dt; - /* handle possible race condition */ - if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) - goto out_unlock; for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; @@ -1360,9 +1353,8 @@ static void ipoib_reap_neigh(struct work_struct 
*work) __ipoib_reap_neigh(priv); - if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) - queue_delayed_work(priv->wq, &priv->neigh_reap_task, - arp_tbl.gc_interval); + queue_delayed_work(priv->wq, &priv->neigh_reap_task, + arp_tbl.gc_interval); } @@ -1371,7 +1363,7 @@ static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, { struct ipoib_neigh *neigh; - neigh = kzalloc(sizeof *neigh, GFP_ATOMIC); + neigh = kzalloc(sizeof(*neigh), GFP_ATOMIC); if (!neigh) return NULL; @@ -1524,9 +1516,8 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); if (!htbl) return -ENOMEM; - set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); size = roundup_pow_of_two(arp_tbl.gc_thresh3); - buckets = kcalloc(size, sizeof(*buckets), GFP_KERNEL); + buckets = kvcalloc(size, sizeof(*buckets), GFP_KERNEL); if (!buckets) { kfree(htbl); return -ENOMEM; @@ -1539,7 +1530,6 @@ static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) atomic_set(&ntbl->entries, 0); /* start garbage collection */ - clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); @@ -1554,7 +1544,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct ipoib_neigh __rcu **buckets = htbl->buckets; struct ipoib_neigh_table *ntbl = htbl->ntbl; - kfree(buckets); + kvfree(buckets); kfree(htbl); complete(&ntbl->deleted); } @@ -1649,15 +1639,11 @@ out_unlock: static void ipoib_neigh_hash_uninit(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - int stopped; ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); init_completion(&priv->ntbl.deleted); - /* Stop GC if called at init fail need to cancel work */ - stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); - if (!stopped) - cancel_delayed_work(&priv->neigh_reap_task); + cancel_delayed_work_sync(&priv->neigh_reap_task); ipoib_flush_neighs(priv); @@ -1755,13 +1741,11 @@ static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, return priv->rn_ops->ndo_do_ioctl(dev, ifr, cmd); } -int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) +static int ipoib_dev_init(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret = -ENOMEM; - priv->ca = ca; - priv->port = port; priv->qp = NULL; /* @@ -1777,7 +1761,7 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) /* create pd, which used both for control and datapath*/ priv->pd = ib_alloc_pd(priv->ca, 0); if (IS_ERR(priv->pd)) { - pr_warn("%s: failed to allocate PD\n", ca->name); + pr_warn("%s: failed to allocate PD\n", priv->ca->name); goto clean_wq; } @@ -1787,7 +1771,8 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) goto out_free_pd; } - if (ipoib_neigh_hash_init(priv) < 0) { + ret = ipoib_neigh_hash_init(priv); + if (ret) { pr_warn("%s failed to init neigh hash\n", dev->name); goto out_dev_uninit; } @@ -1796,12 +1781,15 @@ int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port) if (ipoib_ib_dev_open(dev)) { pr_warn("%s failed to open device\n", dev->name); ret = -ENODEV; - goto out_dev_uninit; + goto out_hash_uninit; } } return 0; +out_hash_uninit: + ipoib_neigh_hash_uninit(dev); + out_dev_uninit: ipoib_ib_dev_cleanup(dev); @@ -1821,21 +1809,151 @@ out: return ret; } -void ipoib_dev_cleanup(struct net_device *dev) +/* + * This must be called before doing an unregister_netdev on a parent device to + * shutdown the IB event handler. 
+ */ +static void ipoib_parent_unregister_pre(struct net_device *ndev) { - struct ipoib_dev_priv *priv = ipoib_priv(dev), *cpriv, *tcpriv; - LIST_HEAD(head); + struct ipoib_dev_priv *priv = ipoib_priv(ndev); - ASSERT_RTNL(); + /* + * ipoib_set_mac checks netif_running before pushing work, clearing + * running ensures the it will not add more work. + */ + rtnl_lock(); + dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); + rtnl_unlock(); - /* Delete any child interfaces first */ - list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { - /* Stop GC on child */ - set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags); - cancel_delayed_work(&cpriv->neigh_reap_task); - unregister_netdevice_queue(cpriv->dev, &head); + /* ipoib_event() cannot be running once this returns */ + ib_unregister_event_handler(&priv->event_handler); + + /* + * Work on the queue grabs the rtnl lock, so this cannot be done while + * also holding it. + */ + flush_workqueue(ipoib_workqueue); +} + +static void ipoib_set_dev_features(struct ipoib_dev_priv *priv) +{ + priv->hca_caps = priv->ca->attrs.device_cap_flags; + + if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { + priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; + + if (priv->hca_caps & IB_DEVICE_UD_TSO) + priv->dev->hw_features |= NETIF_F_TSO; + + priv->dev->features |= priv->dev->hw_features; + } +} + +static int ipoib_parent_init(struct net_device *ndev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + struct ib_port_attr attr; + int result; + + result = ib_query_port(priv->ca, priv->port, &attr); + if (result) { + pr_warn("%s: ib_query_port %d failed\n", priv->ca->name, + priv->port); + return result; + } + priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); + + result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); + if (result) { + pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n", + priv->ca->name, priv->port, result); + return result; } - unregister_netdevice_many(&head); + + result = rdma_query_gid(priv->ca, priv->port, 0, &priv->local_gid); + if (result) { + pr_warn("%s: rdma_query_gid port %d failed (ret = %d)\n", + priv->ca->name, priv->port, result); + return result; + } + memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, + sizeof(union ib_gid)); + + SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent); + priv->dev->dev_id = priv->port - 1; + + return 0; +} + +static void ipoib_child_init(struct net_device *ndev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); + + dev_hold(priv->parent); + + down_write(&ppriv->vlan_rwsem); + list_add_tail(&priv->list, &ppriv->child_intfs); + up_write(&ppriv->vlan_rwsem); + + priv->max_ib_mtu = ppriv->max_ib_mtu; + set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); + memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); + memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid)); +} + +static int ipoib_ndo_init(struct net_device *ndev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(ndev); + int rc; + + if (priv->parent) { + ipoib_child_init(ndev); + } else { + rc = ipoib_parent_init(ndev); + if (rc) + return rc; + } + + /* MTU will be reset when mcast join happens */ + ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); + priv->mcast_mtu = priv->admin_mtu = ndev->mtu; + ndev->max_mtu = IPOIB_CM_MTU; + + ndev->neigh_priv_len = sizeof(struct ipoib_neigh); + + /* + * Set the full membership bit, so that we join the right + * broadcast group, etc. 
+ */ + priv->pkey |= 0x8000; + + ndev->broadcast[8] = priv->pkey >> 8; + ndev->broadcast[9] = priv->pkey & 0xff; + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + + ipoib_set_dev_features(priv); + + rc = ipoib_dev_init(ndev); + if (rc) { + pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n", + priv->ca->name, priv->dev->name, priv->port, rc); + } + + return 0; +} + +static void ipoib_ndo_uninit(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + + ASSERT_RTNL(); + + /* + * ipoib_remove_one guarantees the children are removed before the + * parent, and that is the only place where a parent can be removed. + */ + WARN_ON(!list_empty(&priv->child_intfs)); ipoib_neigh_hash_uninit(dev); @@ -1847,6 +1965,16 @@ void ipoib_dev_cleanup(struct net_device *dev) destroy_workqueue(priv->wq); priv->wq = NULL; } + + if (priv->parent) { + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); + + down_write(&ppriv->vlan_rwsem); + list_del(&priv->list); + up_write(&ppriv->vlan_rwsem); + + dev_put(priv->parent); + } } static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state) @@ -1894,7 +2022,8 @@ static const struct header_ops ipoib_header_ops = { }; static const struct net_device_ops ipoib_netdev_ops_pf = { - .ndo_uninit = ipoib_uninit, + .ndo_init = ipoib_ndo_init, + .ndo_uninit = ipoib_ndo_uninit, .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, .ndo_change_mtu = ipoib_change_mtu, @@ -1913,7 +2042,8 @@ static const struct net_device_ops ipoib_netdev_ops_pf = { }; static const struct net_device_ops ipoib_netdev_ops_vf = { - .ndo_uninit = ipoib_uninit, + .ndo_init = ipoib_ndo_init, + .ndo_uninit = ipoib_ndo_uninit, .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, .ndo_change_mtu = ipoib_change_mtu, @@ -1945,6 +2075,13 @@ void ipoib_setup_common(struct net_device *dev) netif_keep_dst(dev); memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); + + /* + * unregister_netdev always frees the netdev, we use this mode + * consistently to unify all the various unregister paths, including + * those connected to rtnl_link_ops which require it. + */ + dev->needs_free_netdev = true; } static void ipoib_build_priv(struct net_device *dev) @@ -1955,7 +2092,6 @@ static void ipoib_build_priv(struct net_device *dev) spin_lock_init(&priv->lock); init_rwsem(&priv->vlan_rwsem); mutex_init(&priv->mcast_mutex); - mutex_init(&priv->sysfs_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); @@ -1999,9 +2135,7 @@ static struct net_device rn->send = ipoib_send; rn->attach_mcast = ipoib_mcast_attach; rn->detach_mcast = ipoib_mcast_detach; - rn->free_rdma_netdev = free_netdev; rn->hca = hca; - dev->netdev_ops = &ipoib_netdev_default_pf; return dev; @@ -2039,6 +2173,9 @@ struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port, if (!priv) return NULL; + priv->ca = hca; + priv->port = port; + dev = ipoib_get_netdev(hca, port, name); if (!dev) goto free_priv; @@ -2053,6 +2190,15 @@ struct ipoib_dev_priv *ipoib_intf_alloc(struct ib_device *hca, u8 port, rn = netdev_priv(dev); rn->clnt_priv = priv; + + /* + * Only the child register_netdev flows can handle priv_destructor + * being set, so we force it to NULL here and handle manually until it + * is safe to turn on. 
+ */ + priv->next_priv_destructor = dev->priv_destructor; + dev->priv_destructor = NULL; + ipoib_build_priv(dev); return priv; @@ -2061,6 +2207,27 @@ free_priv: return NULL; } +void ipoib_intf_free(struct net_device *dev) +{ + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct rdma_netdev *rn = netdev_priv(dev); + + dev->priv_destructor = priv->next_priv_destructor; + if (dev->priv_destructor) + dev->priv_destructor(dev); + + /* + * There are some error flows around register_netdev failing that may + * attempt to call priv_destructor twice, prevent that from happening. + */ + dev->priv_destructor = NULL; + + /* unregister/destroy is very complicated. Make bugs more obvious. */ + rn->clnt_priv = NULL; + + kfree(priv); +} + static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, char *buf) { @@ -2186,12 +2353,6 @@ static ssize_t create_child(struct device *dev, if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) return -EINVAL; - /* - * Set the full membership bit, so that we join the right - * broadcast group, etc. - */ - pkey |= 0x8000; - ret = ipoib_vlan_add(to_net_dev(dev), pkey); return ret ? ret : count; @@ -2223,87 +2384,19 @@ int ipoib_add_pkey_attr(struct net_device *dev) return device_create_file(&dev->dev, &dev_attr_pkey); } -void ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) -{ - priv->hca_caps = hca->attrs.device_cap_flags; - - if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { - priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; - - if (priv->hca_caps & IB_DEVICE_UD_TSO) - priv->dev->hw_features |= NETIF_F_TSO; - - priv->dev->features |= priv->dev->hw_features; - } -} - static struct net_device *ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { struct ipoib_dev_priv *priv; - struct ib_port_attr attr; - struct rdma_netdev *rn; - int result = -ENOMEM; + struct net_device *ndev; + int result; priv = ipoib_intf_alloc(hca, port, format); if (!priv) { pr_warn("%s, %d: ipoib_intf_alloc failed\n", hca->name, port); - goto alloc_mem_failed; - } - - SET_NETDEV_DEV(priv->dev, hca->dev.parent); - priv->dev->dev_id = port - 1; - - result = ib_query_port(hca, port, &attr); - if (result) { - pr_warn("%s: ib_query_port %d failed\n", hca->name, port); - goto device_init_failed; - } - - priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); - - /* MTU will be reset when mcast join happens */ - priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); - priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; - priv->dev->max_mtu = IPOIB_CM_MTU; - - priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh); - - result = ib_query_pkey(hca, port, 0, &priv->pkey); - if (result) { - pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n", - hca->name, port, result); - goto device_init_failed; - } - - ipoib_set_dev_features(priv, hca); - - /* - * Set the full membership bit, so that we join the right - * broadcast group, etc. 
- */ - priv->pkey |= 0x8000; - - priv->dev->broadcast[8] = priv->pkey >> 8; - priv->dev->broadcast[9] = priv->pkey & 0xff; - - result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL); - if (result) { - pr_warn("%s: ib_query_gid port %d failed (ret = %d)\n", - hca->name, port, result); - goto device_init_failed; - } - - memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, - sizeof(union ib_gid)); - set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); - - result = ipoib_dev_init(priv->dev, hca, port); - if (result) { - pr_warn("%s: failed to initialize port %d (ret = %d)\n", - hca->name, port, result); - goto device_init_failed; + return ERR_PTR(-ENOMEM); } + ndev = priv->dev; INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); @@ -2312,46 +2405,43 @@ static struct net_device *ipoib_add_port(const char *format, /* call event handler to ensure pkey in sync */ queue_work(ipoib_workqueue, &priv->flush_heavy); - result = register_netdev(priv->dev); + result = register_netdev(ndev); if (result) { pr_warn("%s: couldn't register ipoib port %d; error %d\n", hca->name, port, result); - goto register_failed; + + ipoib_parent_unregister_pre(ndev); + ipoib_intf_free(ndev); + free_netdev(ndev); + + return ERR_PTR(result); } - result = -ENOMEM; - if (ipoib_cm_add_mode_attr(priv->dev)) + /* + * We cannot set priv_destructor before register_netdev because we + * need priv to be always valid during the error flow to execute + * ipoib_parent_unregister_pre(). Instead handle it manually and only + * enter priv_destructor mode once we are completely registered. + */ + ndev->priv_destructor = ipoib_intf_free; + + if (ipoib_cm_add_mode_attr(ndev)) goto sysfs_failed; - if (ipoib_add_pkey_attr(priv->dev)) + if (ipoib_add_pkey_attr(ndev)) goto sysfs_failed; - if (ipoib_add_umcast_attr(priv->dev)) + if (ipoib_add_umcast_attr(ndev)) goto sysfs_failed; - if (device_create_file(&priv->dev->dev, &dev_attr_create_child)) + if (device_create_file(&ndev->dev, &dev_attr_create_child)) goto sysfs_failed; - if (device_create_file(&priv->dev->dev, &dev_attr_delete_child)) + if (device_create_file(&ndev->dev, &dev_attr_delete_child)) goto sysfs_failed; - return priv->dev; + return ndev; sysfs_failed: - unregister_netdev(priv->dev); - -register_failed: - ib_unregister_event_handler(&priv->event_handler); - flush_workqueue(ipoib_workqueue); - /* Stop GC if started before flush */ - set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); - cancel_delayed_work(&priv->neigh_reap_task); - flush_workqueue(priv->wq); - ipoib_dev_cleanup(priv->dev); - -device_init_failed: - rn = netdev_priv(priv->dev); - rn->free_rdma_netdev(priv->dev); - kfree(priv); - -alloc_mem_failed: - return ERR_PTR(result); + ipoib_parent_unregister_pre(ndev); + unregister_netdev(ndev); + return ERR_PTR(-ENOMEM); } static void ipoib_add_one(struct ib_device *device) @@ -2362,7 +2452,7 @@ static void ipoib_add_one(struct ib_device *device) int p; int count = 0; - dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL); + dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL); if (!dev_list) return; @@ -2396,39 +2486,18 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) return; list_for_each_entry_safe(priv, tmp, dev_list, list) { - struct rdma_netdev *parent_rn = netdev_priv(priv->dev); - - ib_unregister_event_handler(&priv->event_handler); - flush_workqueue(ipoib_workqueue); - - /* mark interface in the middle of destruction */ - set_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags); + LIST_HEAD(head); + ipoib_parent_unregister_pre(priv->dev); rtnl_lock(); 
- dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); - rtnl_unlock(); - - /* Stop GC */ - set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); - cancel_delayed_work(&priv->neigh_reap_task); - flush_workqueue(priv->wq); - - /* Wrap rtnl_lock/unlock with mutex to protect sysfs calls */ - mutex_lock(&priv->sysfs_mutex); - unregister_netdev(priv->dev); - mutex_unlock(&priv->sysfs_mutex); - - parent_rn->free_rdma_netdev(priv->dev); - list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) { - struct rdma_netdev *child_rn; + list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, + list) + unregister_netdevice_queue(cpriv->dev, &head); + unregister_netdevice_queue(priv->dev, &head); + unregister_netdevice_many(&head); - child_rn = netdev_priv(cpriv->dev); - child_rn->free_rdma_netdev(cpriv->dev); - kfree(cpriv); - } - - kfree(priv); + rtnl_unlock(); } kfree(dev_list); @@ -2476,8 +2545,7 @@ static int __init ipoib_init_module(void) * its private workqueue, and we only queue up flush events * on our global flush workqueue. This avoids the deadlocks. */ - ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", - WQ_MEM_RECLAIM); + ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", 0); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 6709328d90f8..b9e9562f5034 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -140,7 +140,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, { struct ipoib_mcast *mcast; - mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC); + mcast = kzalloc(sizeof(*mcast), can_sleep ? GFP_KERNEL : GFP_ATOMIC); if (!mcast) return NULL; @@ -822,6 +822,7 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) if (neigh && list_empty(&neigh->list)) { kref_get(&mcast->ah->ref); neigh->ah = mcast->ah; + neigh->ah->valid = 1; list_add_tail(&neigh->list, &mcast->neigh_list); } } @@ -917,7 +918,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast)) continue; - memcpy(mgid.raw, ha->addr + 4, sizeof mgid); + memcpy(mgid.raw, ha->addr + 4, sizeof(mgid)); mcast = __ipoib_mcast_find(dev, &mgid); if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { @@ -997,7 +998,7 @@ struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev) { struct ipoib_mcast_iter *iter; - iter = kmalloc(sizeof *iter, GFP_KERNEL); + iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return NULL; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index 3e44087935ae..d4d553a51fa9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -122,15 +122,6 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, } else child_pkey = nla_get_u16(data[IFLA_IPOIB_PKEY]); - if (child_pkey == 0 || child_pkey == 0x8000) - return -EINVAL; - - /* - * Set the full membership bit, so that we join the right - * broadcast group, etc. 
- */ - child_pkey |= 0x8000; - err = __ipoib_vlan_add(ppriv, ipoib_priv(dev), child_pkey, IPOIB_RTNL_CHILD); @@ -139,19 +130,6 @@ static int ipoib_new_child_link(struct net *src_net, struct net_device *dev, return err; } -static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head *head) -{ - struct ipoib_dev_priv *priv, *ppriv; - - priv = ipoib_priv(dev); - ppriv = ipoib_priv(priv->parent); - - down_write(&ppriv->vlan_rwsem); - unregister_netdevice_queue(dev, head); - list_del(&priv->list); - up_write(&ppriv->vlan_rwsem); -} - static size_t ipoib_get_size(const struct net_device *dev) { return nla_total_size(2) + /* IFLA_IPOIB_PKEY */ @@ -167,7 +145,6 @@ static struct rtnl_link_ops ipoib_link_ops __read_mostly = { .setup = ipoib_setup_common, .newlink = ipoib_new_child_link, .changelink = ipoib_changelink, - .dellink = ipoib_unregister_child_dev, .get_size = ipoib_get_size, .fill_info = ipoib_fill_info, }; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 984a88096f39..9f36ca786df8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -52,7 +52,7 @@ int ipoib_mcast_attach(struct net_device *dev, struct ib_device *hca, if (set_qkey) { ret = -ENOMEM; - qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); + qp_attr = kmalloc(sizeof(*qp_attr), GFP_KERNEL); if (!qp_attr) goto out; @@ -147,7 +147,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) .cap = { .max_send_wr = ipoib_sendq_size, .max_recv_wr = ipoib_recvq_size, - .max_send_sge = min_t(u32, priv->ca->attrs.max_sge, + .max_send_sge = min_t(u32, priv->ca->attrs.max_send_sge, MAX_SKB_FRAGS + 1), .max_recv_sge = IPOIB_UD_RX_SG }, @@ -168,8 +168,8 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) else size += ipoib_recvq_size * ipoib_max_conn_qp; } else - if (ret != -ENOSYS) - return -ENODEV; + if (ret != -EOPNOTSUPP) + return ret; req_vec = (priv->port - 1) * 2; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 55a9b71ed05a..341753fbda54 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -50,68 +50,112 @@ static ssize_t show_parent(struct device *d, struct device_attribute *attr, } static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL); +static bool is_child_unique(struct ipoib_dev_priv *ppriv, + struct ipoib_dev_priv *priv) +{ + struct ipoib_dev_priv *tpriv; + + ASSERT_RTNL(); + + /* + * Since the legacy sysfs interface uses pkey for deletion it cannot + * support more than one interface with the same pkey, it creates + * ambiguity. The RTNL interface deletes using the netdev so it does + * not have a problem to support duplicated pkeys. + */ + if (priv->child_type != IPOIB_LEGACY_CHILD) + return true; + + /* + * First ensure this isn't a duplicate. We check the parent device and + * then all of the legacy child interfaces to make sure the Pkey + * doesn't match. + */ + if (ppriv->pkey == priv->pkey) + return false; + + list_for_each_entry(tpriv, &ppriv->child_intfs, list) { + if (tpriv->pkey == priv->pkey && + tpriv->child_type == IPOIB_LEGACY_CHILD) + return false; + } + + return true; +} + +/* + * NOTE: If this function fails then the priv->dev will remain valid, however + * priv can have been freed and must not be touched by caller in the error + * case. 
+ * + * If (ndev->reg_state == NETREG_UNINITIALIZED) then it is up to the caller to + * free the net_device (just as rtnl_newlink does) otherwise the net_device + * will be freed when the rtnl is unlocked. + */ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, u16 pkey, int type) { + struct net_device *ndev = priv->dev; int result; - priv->max_ib_mtu = ppriv->max_ib_mtu; - /* MTU will be reset when mcast join happens */ - priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); - priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; - priv->parent = ppriv->dev; - set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); + ASSERT_RTNL(); + + /* + * Racing with unregister of the parent must be prevented by the + * caller. + */ + WARN_ON(ppriv->dev->reg_state != NETREG_REGISTERED); - ipoib_set_dev_features(priv, ppriv->ca); + if (pkey == 0 || pkey == 0x8000) { + result = -EINVAL; + goto out_early; + } + priv->parent = ppriv->dev; priv->pkey = pkey; + priv->child_type = type; - memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); - memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid)); - set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); - priv->dev->broadcast[8] = pkey >> 8; - priv->dev->broadcast[9] = pkey & 0xff; - - result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port); - if (result < 0) { - ipoib_warn(ppriv, "failed to initialize subinterface: " - "device %s, port %d", - ppriv->ca->name, ppriv->port); - goto err; + if (!is_child_unique(ppriv, priv)) { + result = -ENOTUNIQ; + goto out_early; } - result = register_netdevice(priv->dev); + /* We do not need to touch priv if register_netdevice fails */ + ndev->priv_destructor = ipoib_intf_free; + + result = register_netdevice(ndev); if (result) { ipoib_warn(priv, "failed to initialize; error %i", result); - goto register_failed; + + /* + * register_netdevice sometimes calls priv_destructor, + * sometimes not. Make sure it was done. 
+ */ + goto out_early; } /* RTNL childs don't need proprietary sysfs entries */ if (type == IPOIB_LEGACY_CHILD) { - if (ipoib_cm_add_mode_attr(priv->dev)) + if (ipoib_cm_add_mode_attr(ndev)) goto sysfs_failed; - if (ipoib_add_pkey_attr(priv->dev)) + if (ipoib_add_pkey_attr(ndev)) goto sysfs_failed; - if (ipoib_add_umcast_attr(priv->dev)) + if (ipoib_add_umcast_attr(ndev)) goto sysfs_failed; - if (device_create_file(&priv->dev->dev, &dev_attr_parent)) + if (device_create_file(&ndev->dev, &dev_attr_parent)) goto sysfs_failed; } - priv->child_type = type; - list_add_tail(&priv->list, &ppriv->child_intfs); - return 0; sysfs_failed: - result = -ENOMEM; unregister_netdevice(priv->dev); + return -ENOMEM; -register_failed: - ipoib_dev_cleanup(priv->dev); - -err: +out_early: + if (ndev->priv_destructor) + ndev->priv_destructor(ndev); return result; } @@ -119,129 +163,124 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) { struct ipoib_dev_priv *ppriv, *priv; char intf_name[IFNAMSIZ]; - struct ipoib_dev_priv *tpriv; + struct net_device *ndev; int result; if (!capable(CAP_NET_ADMIN)) return -EPERM; - ppriv = ipoib_priv(pdev); - - if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags)) - return -EPERM; - - snprintf(intf_name, sizeof intf_name, "%s.%04x", - ppriv->dev->name, pkey); - - if (!mutex_trylock(&ppriv->sysfs_mutex)) + if (!rtnl_trylock()) return restart_syscall(); - if (!rtnl_trylock()) { - mutex_unlock(&ppriv->sysfs_mutex); - return restart_syscall(); - } - - if (!down_write_trylock(&ppriv->vlan_rwsem)) { + if (pdev->reg_state != NETREG_REGISTERED) { rtnl_unlock(); - mutex_unlock(&ppriv->sysfs_mutex); - return restart_syscall(); + return -EPERM; } + ppriv = ipoib_priv(pdev); + + snprintf(intf_name, sizeof(intf_name), "%s.%04x", + ppriv->dev->name, pkey); + priv = ipoib_intf_alloc(ppriv->ca, ppriv->port, intf_name); if (!priv) { result = -ENOMEM; goto out; } - - /* - * First ensure this isn't a duplicate. We check the parent device and - * then all of the legacy child interfaces to make sure the Pkey - * doesn't match. - */ - if (ppriv->pkey == pkey) { - result = -ENOTUNIQ; - goto out; - } - - list_for_each_entry(tpriv, &ppriv->child_intfs, list) { - if (tpriv->pkey == pkey && - tpriv->child_type == IPOIB_LEGACY_CHILD) { - result = -ENOTUNIQ; - goto out; - } - } + ndev = priv->dev; result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); + if (result && ndev->reg_state == NETREG_UNINITIALIZED) + free_netdev(ndev); + out: - up_write(&ppriv->vlan_rwsem); rtnl_unlock(); - mutex_unlock(&ppriv->sysfs_mutex); - if (result && priv) { - struct rdma_netdev *rn; + return result; +} + +struct ipoib_vlan_delete_work { + struct work_struct work; + struct net_device *dev; +}; + +/* + * sysfs callbacks of a netdevice cannot obtain the rtnl lock as + * unregister_netdev ultimately deletes the sysfs files while holding the rtnl + * lock. This deadlocks the system. + * + * A callback can use rtnl_trylock to avoid the deadlock but it cannot call + * unregister_netdev as that internally takes and releases the rtnl_lock. So + * instead we find the netdev to unregister and then do the actual unregister + * from the global work queue where we can obtain the rtnl_lock safely. 
+ */ +static void ipoib_vlan_delete_task(struct work_struct *work) +{ + struct ipoib_vlan_delete_work *pwork = + container_of(work, struct ipoib_vlan_delete_work, work); + struct net_device *dev = pwork->dev; + + rtnl_lock(); + + /* Unregistering tasks can race with another task or parent removal */ + if (dev->reg_state == NETREG_REGISTERED) { + struct ipoib_dev_priv *priv = ipoib_priv(dev); + struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); - rn = netdev_priv(priv->dev); - rn->free_rdma_netdev(priv->dev); - kfree(priv); + ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name); + unregister_netdevice(dev); } - return result; + rtnl_unlock(); + + kfree(pwork); } int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) { struct ipoib_dev_priv *ppriv, *priv, *tpriv; - struct net_device *dev = NULL; + int rc; if (!capable(CAP_NET_ADMIN)) return -EPERM; - ppriv = ipoib_priv(pdev); - - if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags)) - return -EPERM; - - if (!mutex_trylock(&ppriv->sysfs_mutex)) + if (!rtnl_trylock()) return restart_syscall(); - if (!rtnl_trylock()) { - mutex_unlock(&ppriv->sysfs_mutex); - return restart_syscall(); - } - - if (!down_write_trylock(&ppriv->vlan_rwsem)) { + if (pdev->reg_state != NETREG_REGISTERED) { rtnl_unlock(); - mutex_unlock(&ppriv->sysfs_mutex); - return restart_syscall(); + return -EPERM; } + ppriv = ipoib_priv(pdev); + + rc = -ENODEV; list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { - list_del(&priv->list); - dev = priv->dev; + struct ipoib_vlan_delete_work *work; + + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) { + rc = -ENOMEM; + goto out; + } + + down_write(&ppriv->vlan_rwsem); + list_del_init(&priv->list); + up_write(&ppriv->vlan_rwsem); + work->dev = priv->dev; + INIT_WORK(&work->work, ipoib_vlan_delete_task); + queue_work(ipoib_workqueue, &work->work); + + rc = 0; break; } } - up_write(&ppriv->vlan_rwsem); - - if (dev) { - ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name); - unregister_netdevice(dev); - } +out: rtnl_unlock(); - mutex_unlock(&ppriv->sysfs_mutex); - - if (dev) { - struct rdma_netdev *rn; - - rn = netdev_priv(dev); - rn->free_rdma_netdev(priv->dev); - kfree(priv); - return 0; - } - return -ENODEV; + return rc; } diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 9a6434c31db2..3fecd87c9f2b 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -610,12 +610,10 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, uint32_t initial_cmdsn) { struct iscsi_cls_session *cls_session; - struct iscsi_session *session; struct Scsi_Host *shost; struct iser_conn *iser_conn = NULL; struct ib_conn *ib_conn; u32 max_fr_sectors; - u16 max_cmds; shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0); if (!shost) @@ -633,8 +631,8 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, */ if (ep) { iser_conn = ep->dd_data; - max_cmds = iser_conn->max_cmds; shost->sg_tablesize = iser_conn->scsi_sg_tablesize; + shost->can_queue = min_t(u16, cmds_max, iser_conn->max_cmds); mutex_lock(&iser_conn->state_mutex); if (iser_conn->state != ISER_CONN_UP) { @@ -660,7 +658,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, } mutex_unlock(&iser_conn->state_mutex); } else { - max_cmds = ISER_DEF_XMIT_CMDS_MAX; + shost->can_queue = min_t(u16, cmds_max, ISER_DEF_XMIT_CMDS_MAX); if (iscsi_host_add(shost, NULL)) goto free_host; } @@ -676,21 +674,13 @@ 
iscsi_iser_session_create(struct iscsi_endpoint *ep, iser_warn("max_sectors was reduced from %u to %u\n", iser_max_sectors, shost->max_sectors); - if (cmds_max > max_cmds) { - iser_info("cmds_max changed from %u to %u\n", - cmds_max, max_cmds); - cmds_max = max_cmds; - } - cls_session = iscsi_session_setup(&iscsi_iser_transport, shost, - cmds_max, 0, + shost->can_queue, 0, sizeof(struct iscsi_iser_task), initial_cmdsn, 0); if (!cls_session) goto remove_host; - session = cls_session->dd_data; - shost->can_queue = session->scsi_cmds_max; return cls_session; remove_host: diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 130bf163f066..009be8889d71 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -405,7 +405,8 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); - wr = sig_handover_wr(iser_tx_next_wr(tx_desc)); + wr = container_of(iser_tx_next_wr(tx_desc), struct ib_sig_handover_wr, + wr); wr->wr.opcode = IB_WR_REG_SIG_MR; wr->wr.wr_cqe = cqe; wr->wr.sg_list = &data_reg->sge; @@ -457,7 +458,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, return n < 0 ? n : -EINVAL; } - wr = reg_wr(iser_tx_next_wr(tx_desc)); + wr = container_of(iser_tx_next_wr(tx_desc), struct ib_reg_wr, wr); wr->wr.opcode = IB_WR_REG_MR; wr->wr.wr_cqe = cqe; wr->wr.send_flags = 0; diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 616d978cbf2b..b686a4aaffe8 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -1022,7 +1022,7 @@ int iser_post_recvl(struct iser_conn *iser_conn) { struct ib_conn *ib_conn = &iser_conn->ib_conn; struct iser_login_desc *desc = &iser_conn->login_desc; - struct ib_recv_wr wr, *wr_failed; + struct ib_recv_wr wr; int ib_ret; desc->sge.addr = desc->rsp_dma; @@ -1036,7 +1036,7 @@ int iser_post_recvl(struct iser_conn *iser_conn) wr.next = NULL; ib_conn->post_recv_buf_count++; - ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed); + ib_ret = ib_post_recv(ib_conn->qp, &wr, NULL); if (ib_ret) { iser_err("ib_post_recv failed ret=%d\n", ib_ret); ib_conn->post_recv_buf_count--; @@ -1050,7 +1050,7 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count) struct ib_conn *ib_conn = &iser_conn->ib_conn; unsigned int my_rx_head = iser_conn->rx_desc_head; struct iser_rx_desc *rx_desc; - struct ib_recv_wr *wr, *wr_failed; + struct ib_recv_wr *wr; int i, ib_ret; for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) { @@ -1067,7 +1067,7 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count) wr->next = NULL; /* mark end of work requests list */ ib_conn->post_recv_buf_count += count; - ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &wr_failed); + ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, NULL); if (ib_ret) { iser_err("ib_post_recv failed ret=%d\n", ib_ret); ib_conn->post_recv_buf_count -= count; @@ -1086,7 +1086,7 @@ int iser_post_recvm(struct iser_conn *iser_conn, int count) int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, bool signal) { - struct ib_send_wr *bad_wr, *wr = iser_tx_next_wr(tx_desc); + struct ib_send_wr *wr = iser_tx_next_wr(tx_desc); int ib_ret; ib_dma_sync_single_for_device(ib_conn->device->ib_device, @@ -1100,10 +1100,10 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, wr->opcode = IB_WR_SEND; wr->send_flags = signal ? 
IB_SEND_SIGNALED : 0; - ib_ret = ib_post_send(ib_conn->qp, &tx_desc->wrs[0].send, &bad_wr); + ib_ret = ib_post_send(ib_conn->qp, &tx_desc->wrs[0].send, NULL); if (ib_ret) iser_err("ib_post_send failed, ret:%d opcode:%d\n", - ib_ret, bad_wr->opcode); + ib_ret, wr->opcode); return ib_ret; } diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index cccbcf0eb035..f39670c5c25c 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -136,7 +136,7 @@ isert_create_qp(struct isert_conn *isert_conn, attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS + 1; attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1; attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX; - attr.cap.max_send_sge = device->ib_device->attrs.max_sge; + attr.cap.max_send_sge = device->ib_device->attrs.max_send_sge; attr.cap.max_recv_sge = 1; attr.sq_sig_type = IB_SIGNAL_REQ_WR; attr.qp_type = IB_QPT_RC; @@ -299,7 +299,8 @@ isert_create_device_ib_res(struct isert_device *device) struct ib_device *ib_dev = device->ib_device; int ret; - isert_dbg("devattr->max_sge: %d\n", ib_dev->attrs.max_sge); + isert_dbg("devattr->max_send_sge: %d devattr->max_recv_sge %d\n", + ib_dev->attrs.max_send_sge, ib_dev->attrs.max_recv_sge); isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd); ret = isert_alloc_comps(device); @@ -809,7 +810,7 @@ isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) static int isert_post_recvm(struct isert_conn *isert_conn, u32 count) { - struct ib_recv_wr *rx_wr, *rx_wr_failed; + struct ib_recv_wr *rx_wr; int i, ret; struct iser_rx_desc *rx_desc; @@ -825,8 +826,7 @@ isert_post_recvm(struct isert_conn *isert_conn, u32 count) rx_wr--; rx_wr->next = NULL; /* mark end of work requests list */ - ret = ib_post_recv(isert_conn->qp, isert_conn->rx_wr, - &rx_wr_failed); + ret = ib_post_recv(isert_conn->qp, isert_conn->rx_wr, NULL); if (ret) isert_err("ib_post_recv() failed with ret: %d\n", ret); @@ -836,7 +836,7 @@ isert_post_recvm(struct isert_conn *isert_conn, u32 count) static int isert_post_recv(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc) { - struct ib_recv_wr *rx_wr_failed, rx_wr; + struct ib_recv_wr rx_wr; int ret; if (!rx_desc->in_use) { @@ -853,7 +853,7 @@ isert_post_recv(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc) rx_wr.num_sge = 1; rx_wr.next = NULL; - ret = ib_post_recv(isert_conn->qp, &rx_wr, &rx_wr_failed); + ret = ib_post_recv(isert_conn->qp, &rx_wr, NULL); if (ret) isert_err("ib_post_recv() failed with ret: %d\n", ret); @@ -864,7 +864,7 @@ static int isert_login_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc) { struct ib_device *ib_dev = isert_conn->cm_id->device; - struct ib_send_wr send_wr, *send_wr_failed; + struct ib_send_wr send_wr; int ret; ib_dma_sync_single_for_device(ib_dev, tx_desc->dma_addr, @@ -879,7 +879,7 @@ isert_login_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_des send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - ret = ib_post_send(isert_conn->qp, &send_wr, &send_wr_failed); + ret = ib_post_send(isert_conn->qp, &send_wr, NULL); if (ret) isert_err("ib_post_send() failed, ret: %d\n", ret); @@ -967,7 +967,7 @@ isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, static int isert_login_post_recv(struct isert_conn *isert_conn) { - struct ib_recv_wr rx_wr, *rx_wr_fail; + struct ib_recv_wr rx_wr; struct ib_sge sge; int ret; @@ -986,7 +986,7 @@ isert_login_post_recv(struct 
isert_conn *isert_conn) rx_wr.sg_list = &sge; rx_wr.num_sge = 1; - ret = ib_post_recv(isert_conn->qp, &rx_wr, &rx_wr_fail); + ret = ib_post_recv(isert_conn->qp, &rx_wr, NULL); if (ret) isert_err("ib_post_recv() failed: %d\n", ret); @@ -1829,7 +1829,6 @@ isert_send_done(struct ib_cq *cq, struct ib_wc *wc) static int isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd) { - struct ib_send_wr *wr_failed; int ret; ret = isert_post_recv(isert_conn, isert_cmd->rx_desc); @@ -1838,8 +1837,7 @@ isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd) return ret; } - ret = ib_post_send(isert_conn->qp, &isert_cmd->tx_desc.send_wr, - &wr_failed); + ret = ib_post_send(isert_conn->qp, &isert_cmd->tx_desc.send_wr, NULL); if (ret) { isert_err("ib_post_send failed with %d\n", ret); return ret; diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 9786b24b956f..444d16520506 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -57,13 +57,10 @@ #define DRV_NAME "ib_srp" #define PFX DRV_NAME ": " -#define DRV_VERSION "2.0" -#define DRV_RELDATE "July 26, 2015" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator"); MODULE_LICENSE("Dual BSD/GPL"); -MODULE_INFO(release_date, DRV_RELDATE); #if !defined(CONFIG_DYNAMIC_DEBUG) #define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt) @@ -145,7 +142,8 @@ static void srp_remove_one(struct ib_device *device, void *client_data); static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc); static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc, const char *opname); -static int srp_ib_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); +static int srp_ib_cm_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event); static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); @@ -1211,7 +1209,6 @@ static void srp_inv_rkey_err_done(struct ib_cq *cq, struct ib_wc *wc) static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch, u32 rkey) { - struct ib_send_wr *bad_wr; struct ib_send_wr wr = { .opcode = IB_WR_LOCAL_INV, .next = NULL, @@ -1222,7 +1219,7 @@ static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch, wr.wr_cqe = &req->reg_cqe; req->reg_cqe.done = srp_inv_rkey_err_done; - return ib_post_send(ch->qp, &wr, &bad_wr); + return ib_post_send(ch->qp, &wr, NULL); } static void srp_unmap_data(struct scsi_cmnd *scmnd, @@ -1503,7 +1500,6 @@ static int srp_map_finish_fr(struct srp_map_state *state, { struct srp_target_port *target = ch->target; struct srp_device *dev = target->srp_host->srp_dev; - struct ib_send_wr *bad_wr; struct ib_reg_wr wr; struct srp_fr_desc *desc; u32 rkey; @@ -1567,7 +1563,7 @@ static int srp_map_finish_fr(struct srp_map_state *state, srp_map_desc(state, desc->mr->iova, desc->mr->length, desc->mr->rkey); - err = ib_post_send(ch->qp, &wr.wr, &bad_wr); + err = ib_post_send(ch->qp, &wr.wr, NULL); if (unlikely(err)) { WARN_ON_ONCE(err == -ENOMEM); return err; @@ -2018,7 +2014,7 @@ static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len) { struct srp_target_port *target = ch->target; struct ib_sge list; - struct ib_send_wr wr, *bad_wr; + struct ib_send_wr wr; list.addr = iu->dma; list.length = len; @@ -2033,13 +2029,13 @@ static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len) wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; - return ib_post_send(ch->qp, &wr, &bad_wr); + 
return ib_post_send(ch->qp, &wr, NULL); } static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu) { struct srp_target_port *target = ch->target; - struct ib_recv_wr wr, *bad_wr; + struct ib_recv_wr wr; struct ib_sge list; list.addr = iu->dma; @@ -2053,7 +2049,7 @@ static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu) wr.sg_list = &list; wr.num_sge = 1; - return ib_post_recv(ch->qp, &wr, &bad_wr); + return ib_post_recv(ch->qp, &wr, NULL); } static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp) @@ -2558,7 +2554,7 @@ error: } static void srp_ib_cm_rej_handler(struct ib_cm_id *cm_id, - struct ib_cm_event *event, + const struct ib_cm_event *event, struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; @@ -2643,7 +2639,8 @@ static void srp_ib_cm_rej_handler(struct ib_cm_id *cm_id, } } -static int srp_ib_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +static int srp_ib_cm_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event) { struct srp_rdma_ch *ch = cm_id->context; struct srp_target_port *target = ch->target; @@ -3843,7 +3840,7 @@ static ssize_t srp_create_target(struct device *dev, INIT_WORK(&target->tl_err_work, srp_tl_err_work); INIT_WORK(&target->remove_work, srp_remove_work); spin_lock_init(&target->lock); - ret = ib_query_gid(ibdev, host->port, 0, &target->sgid, NULL); + ret = rdma_query_gid(ibdev, host->port, 0, &target->sgid); if (ret) goto out; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 1ae638b58b63..f37cbad022a2 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -575,8 +575,7 @@ static int srpt_refresh_port(struct srpt_port *sport) sport->sm_lid = port_attr.sm_lid; sport->lid = port_attr.lid; - ret = ib_query_gid(sport->sdev->device, sport->port, 0, &sport->gid, - NULL); + ret = rdma_query_gid(sport->sdev->device, sport->port, 0, &sport->gid); if (ret) goto err_query_port; @@ -720,7 +719,7 @@ static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct srpt_device *sdev, WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx) && ioctx_size != sizeof(struct srpt_send_ioctx)); - ring = kmalloc_array(ring_size, sizeof(ring[0]), GFP_KERNEL); + ring = kvmalloc_array(ring_size, sizeof(ring[0]), GFP_KERNEL); if (!ring) goto out; for (i = 0; i < ring_size; ++i) { @@ -734,7 +733,7 @@ static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct srpt_device *sdev, err: while (--i >= 0) srpt_free_ioctx(sdev, ring[i], dma_size, dir); - kfree(ring); + kvfree(ring); ring = NULL; out: return ring; @@ -759,7 +758,7 @@ static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring, for (i = 0; i < ring_size; ++i) srpt_free_ioctx(sdev, ioctx_ring[i], dma_size, dir); - kfree(ioctx_ring); + kvfree(ioctx_ring); } /** @@ -817,7 +816,7 @@ static int srpt_post_recv(struct srpt_device *sdev, struct srpt_rdma_ch *ch, struct srpt_recv_ioctx *ioctx) { struct ib_sge list; - struct ib_recv_wr wr, *bad_wr; + struct ib_recv_wr wr; BUG_ON(!sdev); list.addr = ioctx->ioctx.dma; @@ -831,9 +830,9 @@ static int srpt_post_recv(struct srpt_device *sdev, struct srpt_rdma_ch *ch, wr.num_sge = 1; if (sdev->use_srq) - return ib_post_srq_recv(sdev->srq, &wr, &bad_wr); + return ib_post_srq_recv(sdev->srq, &wr, NULL); else - return ib_post_recv(ch->qp, &wr, &bad_wr); + return ib_post_recv(ch->qp, &wr, NULL); } /** @@ -847,7 +846,6 @@ static int srpt_post_recv(struct srpt_device *sdev, struct srpt_rdma_ch *ch, */ static int srpt_zerolength_write(struct 
srpt_rdma_ch *ch) { - struct ib_send_wr *bad_wr; struct ib_rdma_wr wr = { .wr = { .next = NULL, @@ -860,7 +858,7 @@ static int srpt_zerolength_write(struct srpt_rdma_ch *ch) pr_debug("%s-%d: queued zerolength write\n", ch->sess_name, ch->qp->qp_num); - return ib_post_send(ch->qp, &wr.wr, &bad_wr); + return ib_post_send(ch->qp, &wr.wr, NULL); } static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) @@ -1754,13 +1752,15 @@ retry: */ qp_init->cap.max_send_wr = min(sq_size / 2, attrs->max_qp_wr); qp_init->cap.max_rdma_ctxs = sq_size / 2; - qp_init->cap.max_send_sge = min(attrs->max_sge, SRPT_MAX_SG_PER_WQE); + qp_init->cap.max_send_sge = min(attrs->max_send_sge, + SRPT_MAX_SG_PER_WQE); qp_init->port_num = ch->sport->port; if (sdev->use_srq) { qp_init->srq = sdev->srq; } else { qp_init->cap.max_recv_wr = ch->rq_size; - qp_init->cap.max_recv_sge = qp_init->cap.max_send_sge; + qp_init->cap.max_recv_sge = min(attrs->max_recv_sge, + SRPT_MAX_SG_PER_WQE); } if (ch->using_rdma_cm) { @@ -1833,8 +1833,7 @@ static bool srpt_close_ch(struct srpt_rdma_ch *ch) int ret; if (!srpt_set_ch_state(ch, CH_DRAINING)) { - pr_debug("%s-%d: already closed\n", ch->sess_name, - ch->qp->qp_num); + pr_debug("%s: already closed\n", ch->sess_name); return false; } @@ -1940,8 +1939,8 @@ static void __srpt_close_all_ch(struct srpt_port *sport) list_for_each_entry(nexus, &sport->nexus_list, entry) { list_for_each_entry(ch, &nexus->ch_list, list) { if (srpt_disconnect_ch(ch) >= 0) - pr_info("Closing channel %s-%d because target %s_%d has been disabled\n", - ch->sess_name, ch->qp->qp_num, + pr_info("Closing channel %s because target %s_%d has been disabled\n", + ch->sess_name, sport->sdev->device->name, sport->port); srpt_close_ch(ch); } @@ -2086,7 +2085,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, struct rdma_conn_param rdma_cm; struct ib_cm_rep_param ib_cm; } *rep_param = NULL; - struct srpt_rdma_ch *ch; + struct srpt_rdma_ch *ch = NULL; char i_port_id[36]; u32 it_iu_len; int i, ret; @@ -2233,13 +2232,15 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, TARGET_PROT_NORMAL, i_port_id + 2, ch, NULL); if (IS_ERR_OR_NULL(ch->sess)) { + WARN_ON_ONCE(ch->sess == NULL); ret = PTR_ERR(ch->sess); + ch->sess = NULL; pr_info("Rejected login for initiator %s: ret = %d.\n", ch->sess_name, ret); rej->reason = cpu_to_be32(ret == -ENOMEM ? SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES : SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED); - goto reject; + goto destroy_ib; } mutex_lock(&sport->mutex); @@ -2278,7 +2279,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_err("rejected SRP_LOGIN_REQ because enabling RTR failed (error code = %d)\n", ret); - goto destroy_ib; + goto reject; } pr_debug("Establish connection sess=%p name=%s ch=%p\n", ch->sess, @@ -2357,8 +2358,11 @@ free_ring: srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, ch->sport->sdev, ch->rq_size, ch->max_rsp_size, DMA_TO_DEVICE); + free_ch: - if (ib_cm_id) + if (rdma_cm_id) + rdma_cm_id->context = NULL; + else ib_cm_id->context = NULL; kfree(ch); ch = NULL; @@ -2378,6 +2382,15 @@ reject: ib_send_cm_rej(ib_cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, rej, sizeof(*rej)); + if (ch && ch->sess) { + srpt_close_ch(ch); + /* + * Tell the caller not to free cm_id since + * srpt_release_channel_work() will do that. 
+ */ + ret = 0; + } + out: kfree(rep_param); kfree(rsp); @@ -2387,7 +2400,7 @@ out: } static int srpt_ib_cm_req_recv(struct ib_cm_id *cm_id, - struct ib_cm_req_event_param *param, + const struct ib_cm_req_event_param *param, void *private_data) { char sguid[40]; @@ -2499,7 +2512,8 @@ static void srpt_cm_rtu_recv(struct srpt_rdma_ch *ch) * a non-zero value in any other case will trigger a race with the * ib_destroy_cm_id() call in srpt_release_channel(). */ -static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +static int srpt_cm_handler(struct ib_cm_id *cm_id, + const struct ib_cm_event *event) { struct srpt_rdma_ch *ch = cm_id->context; int ret; @@ -2609,7 +2623,7 @@ static int srpt_write_pending(struct se_cmd *se_cmd) struct srpt_send_ioctx *ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); struct srpt_rdma_ch *ch = ioctx->ch; - struct ib_send_wr *first_wr = NULL, *bad_wr; + struct ib_send_wr *first_wr = NULL; struct ib_cqe *cqe = &ioctx->rdma_cqe; enum srpt_command_state new_state; int ret, i; @@ -2633,7 +2647,7 @@ static int srpt_write_pending(struct se_cmd *se_cmd) cqe = NULL; } - ret = ib_post_send(ch->qp, first_wr, &bad_wr); + ret = ib_post_send(ch->qp, first_wr, NULL); if (ret) { pr_err("%s: ib_post_send() returned %d for %d (avail: %d)\n", __func__, ret, ioctx->n_rdma, @@ -2671,7 +2685,7 @@ static void srpt_queue_response(struct se_cmd *cmd) container_of(cmd, struct srpt_send_ioctx, cmd); struct srpt_rdma_ch *ch = ioctx->ch; struct srpt_device *sdev = ch->sport->sdev; - struct ib_send_wr send_wr, *first_wr = &send_wr, *bad_wr; + struct ib_send_wr send_wr, *first_wr = &send_wr; struct ib_sge sge; enum srpt_command_state state; int resp_len, ret, i; @@ -2744,7 +2758,7 @@ static void srpt_queue_response(struct se_cmd *cmd) send_wr.opcode = IB_WR_SEND; send_wr.send_flags = IB_SEND_SIGNALED; - ret = ib_post_send(ch->qp, first_wr, &bad_wr); + ret = ib_post_send(ch->qp, first_wr, NULL); if (ret < 0) { pr_err("%s: sending cmd response failed for tag %llu (%d)\n", __func__, ioctx->cmd.tag, ret); @@ -2968,7 +2982,8 @@ static void srpt_add_one(struct ib_device *device) pr_debug("device = %p\n", device); - sdev = kzalloc(sizeof(*sdev), GFP_KERNEL); + sdev = kzalloc(struct_size(sdev, port, device->phys_port_cnt), + GFP_KERNEL); if (!sdev) goto err; @@ -3022,8 +3037,6 @@ static void srpt_add_one(struct ib_device *device) srpt_event_handler); ib_register_event_handler(&sdev->event_handler); - WARN_ON(sdev->device->phys_port_cnt > ARRAY_SIZE(sdev->port)); - for (i = 1; i <= sdev->device->phys_port_cnt; i++) { sport = &sdev->port[i - 1]; INIT_LIST_HEAD(&sport->nexus_list); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 2361483476a0..444dfd7281b5 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -396,9 +396,9 @@ struct srpt_port { * @sdev_mutex: Serializes use_srq changes. * @use_srq: Whether or not to use SRQ. * @ioctx_ring: Per-HCA SRQ. - * @port: Information about the ports owned by this HCA. * @event_handler: Per-HCA asynchronous IB event handler. * @list: Node in srpt_dev_list. + * @port: Information about the ports owned by this HCA. */ struct srpt_device { struct ib_device *device; @@ -410,9 +410,9 @@ struct srpt_device { struct mutex sdev_mutex; bool use_srq; struct srpt_recv_ioctx **ioctx_ring; - struct srpt_port port[2]; struct ib_event_handler event_handler; struct list_head list; + struct srpt_port port[]; }; #endif /* IB_SRPT_H */ |
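A recurring change across the iser, isert, srp and srpt hunks above is the reworked posting API: ib_post_send(), ib_post_recv() and ib_post_srq_recv() now accept NULL for the bad_wr out-parameter, so callers that never inspected the failing WR can drop their local bad_wr variables. The fragment below is a minimal illustrative sketch of the new calling convention; the helper name and its arguments are hypothetical and not taken from the patch.

#include <rdma/ib_verbs.h>

/* Hypothetical helper: post one signalled SEND using the reworked API. */
static int example_post_send(struct ib_qp *qp, struct ib_cqe *cqe,
			     struct ib_sge *sge)
{
	struct ib_send_wr wr = {
		.next       = NULL,
		.wr_cqe     = cqe,
		.sg_list    = sge,
		.num_sge    = 1,
		.opcode     = IB_WR_SEND,
		.send_flags = IB_SEND_SIGNALED,
	};

	/*
	 * Passing NULL instead of &bad_wr is now permitted; callers that
	 * still want to know which WR failed can pass a pointer as before.
	 */
	return ib_post_send(qp, &wr, NULL);
}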
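The srpt_add_one() and ib_srpt.h hunks at the end replace the fixed port[2] array in struct srpt_device with a flexible array member sized at allocation time via struct_size(). The sketch below shows that allocation pattern with hypothetical type and field names, assuming <linux/overflow.h> and <linux/slab.h>; it is an illustration of the idiom, not code from the patch.

#include <linux/overflow.h>
#include <linux/slab.h>

struct example_port {
	int id;
};

struct example_dev {
	unsigned int nports;
	struct example_port port[];	/* flexible array member must be last */
};

static struct example_dev *example_dev_alloc(unsigned int nports)
{
	struct example_dev *dev;

	/*
	 * struct_size() computes sizeof(*dev) + nports * sizeof(dev->port[0])
	 * with overflow checking, so the trailing array is sized correctly
	 * for any port count instead of being capped at a fixed length.
	 */
	dev = kzalloc(struct_size(dev, port, nports), GFP_KERNEL);
	if (!dev)
		return NULL;

	dev->nports = nports;
	return dev;
}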