diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-15 20:38:15 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-15 20:38:15 -0700 |
commit | 2a3c389a0fde49b241430df806a34276568cfb29 (patch) | |
tree | 9cf35829317e8cc2aaffc4341fb824dad63fce02 /drivers/infiniband/core | |
parent | 8de262531f5fbb7458463224a7587429800c24bf (diff) | |
parent | 0b043644c0ca601cb19943a81aa1f1455dbe9461 (diff) | |
download | linux-2a3c389a0fde49b241430df806a34276568cfb29.tar.bz2 |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"A smaller cycle this time. Notably we see another new driver, 'Soft
iWarp', and the deletion of an ancient unused driver for nes.
- Revise and simplify the signature offload RDMA MR APIs
- More progress on hoisting object allocation boilerplate code out
of the drivers
- Driver bug fixes and revisions for hns, hfi1, efa, cxgb4, qib,
i40iw
- Tree wide cleanups: struct_size, put_user_page, xarray, rst doc
conversion
- Removal of obsolete ib_ucm chardev and nes driver
- netlink based discovery of chardevs and autoloading of the modules
providing them
- Move more of the rdmavt/hfi1 uapi to include/uapi/rdma
- New driver 'siw' for software based iWarp running on top of netdev,
much like rxe's software RoCE.
- mlx5 feature to report events in their raw devx format to userspace
- Expose per-object counters through rdma tool
- Adaptive interrupt moderation for RDMA (DIM), sharing the DIM core
from netdev"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (194 commits)
RMDA/siw: Require a 64 bit arch
RDMA/siw: Mark expected switch fall-throughs
RDMA/core: Fix -Wunused-const-variable warnings
rdma/siw: Remove set but not used variable 's'
rdma/siw: Add missing dependencies on LIBCRC32C and DMA_VIRT_OPS
RDMA/siw: Add missing rtnl_lock around access to ifa
rdma/siw: Use proper enumerated type in map_cqe_status
RDMA/siw: Remove unnecessary kthread create/destroy printouts
IB/rdmavt: Fix variable shadowing issue in rvt_create_cq
RDMA/core: Fix race when resolving IP address
RDMA/core: Make rdma_counter.h compile stand alone
IB/core: Work on the caller socket net namespace in nldev_newlink()
RDMA/rxe: Fill in wc byte_len with IB_WC_RECV_RDMA_WITH_IMM
RDMA/mlx5: Set RDMA DIM to be enabled by default
RDMA/nldev: Added configuration of RDMA dynamic interrupt moderation to netlink
RDMA/core: Provide RDMA DIM support for ULPs
linux/dim: Implement RDMA adaptive moderation (DIM)
IB/mlx5: Report correctly tag matching rendezvous capability
docs: infiniband: add it to the driver-api bookset
IB/mlx5: Implement VHCA tunnel mechanism in DEVX
...
Diffstat (limited to 'drivers/infiniband/core')
23 files changed, 2117 insertions, 1749 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 313f2349b518..09881bd5f12d 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -6,13 +6,12 @@ obj-$(CONFIG_INFINIBAND) += ib_core.o ib_cm.o iw_cm.o \ $(infiniband-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o $(user_access-y) -obj-$(CONFIG_INFINIBAND_USER_ACCESS_UCM) += ib_ucm.o $(user_access-y) ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ - nldev.o restrack.o + nldev.o restrack.o counters.o ib_core-$(CONFIG_SECURITY_INFINIBAND) += security.o ib_core-$(CONFIG_CGROUP_RDMA) += cgroup.o @@ -29,8 +28,6 @@ rdma_ucm-y := ucma.o ib_umad-y := user_mad.o -ib_ucm-y := ucm.o - ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ uverbs_std_types_cq.o \ diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 2f7d14159841..9b76a8fcdd24 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -337,7 +337,7 @@ static int dst_fetch_ha(const struct dst_entry *dst, neigh_event_send(n, NULL); ret = -ENODATA; } else { - memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN); + neigh_ha_snapshot(dev_addr->dst_dev_addr, n, dst->dev); } neigh_release(n); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index ff40a450b5d2..888d89ce81df 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -60,6 +60,7 @@ extern bool ib_devices_shared_netns; int ib_device_register_sysfs(struct ib_device *device); void ib_device_unregister_sysfs(struct ib_device *device); int ib_device_rename(struct ib_device *ibdev, const char *name); +int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim); typedef 
void (*roce_netdev_callback)(struct ib_device *device, u8 port, struct net_device *idev, void *cookie); @@ -88,6 +89,15 @@ typedef int (*nldev_callback)(struct ib_device *device, int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb); +struct ib_client_nl_info { + struct sk_buff *nl_msg; + struct device *cdev; + unsigned int port; + u64 abi; +}; +int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, + struct ib_client_nl_info *res); + enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_SET, IB_CACHE_GID_DEFAULT_MODE_DELETE diff --git a/drivers/infiniband/core/counters.c b/drivers/infiniband/core/counters.c new file mode 100644 index 000000000000..01faef7bc061 --- /dev/null +++ b/drivers/infiniband/core/counters.c @@ -0,0 +1,634 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019 Mellanox Technologies. All rights reserved. + */ +#include <rdma/ib_verbs.h> +#include <rdma/rdma_counter.h> + +#include "core_priv.h" +#include "restrack.h" + +#define ALL_AUTO_MODE_MASKS (RDMA_COUNTER_MASK_QP_TYPE) + +static int __counter_set_mode(struct rdma_counter_mode *curr, + enum rdma_nl_counter_mode new_mode, + enum rdma_nl_counter_mask new_mask) +{ + if ((new_mode == RDMA_COUNTER_MODE_AUTO) && + ((new_mask & (~ALL_AUTO_MODE_MASKS)) || + (curr->mode != RDMA_COUNTER_MODE_NONE))) + return -EINVAL; + + curr->mode = new_mode; + curr->mask = new_mask; + return 0; +} + +/** + * rdma_counter_set_auto_mode() - Turn on/off per-port auto mode + * + * When @on is true, the @mask must be set; When @on is false, it goes + * into manual mode if there's any counter, so that the user is able to + * manually access them. 
+ */ +int rdma_counter_set_auto_mode(struct ib_device *dev, u8 port, + bool on, enum rdma_nl_counter_mask mask) +{ + struct rdma_port_counter *port_counter; + int ret; + + port_counter = &dev->port_data[port].port_counter; + mutex_lock(&port_counter->lock); + if (on) { + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_AUTO, mask); + } else { + if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) { + ret = -EINVAL; + goto out; + } + + if (port_counter->num_counters) + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_MANUAL, 0); + else + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_NONE, 0); + } + +out: + mutex_unlock(&port_counter->lock); + return ret; +} + +static struct rdma_counter *rdma_counter_alloc(struct ib_device *dev, u8 port, + enum rdma_nl_counter_mode mode) +{ + struct rdma_port_counter *port_counter; + struct rdma_counter *counter; + int ret; + + if (!dev->ops.counter_dealloc || !dev->ops.counter_alloc_stats) + return NULL; + + counter = kzalloc(sizeof(*counter), GFP_KERNEL); + if (!counter) + return NULL; + + counter->device = dev; + counter->port = port; + counter->res.type = RDMA_RESTRACK_COUNTER; + counter->stats = dev->ops.counter_alloc_stats(counter); + if (!counter->stats) + goto err_stats; + + port_counter = &dev->port_data[port].port_counter; + mutex_lock(&port_counter->lock); + if (mode == RDMA_COUNTER_MODE_MANUAL) { + ret = __counter_set_mode(&port_counter->mode, + RDMA_COUNTER_MODE_MANUAL, 0); + if (ret) + goto err_mode; + } + + port_counter->num_counters++; + mutex_unlock(&port_counter->lock); + + counter->mode.mode = mode; + kref_init(&counter->kref); + mutex_init(&counter->lock); + + return counter; + +err_mode: + mutex_unlock(&port_counter->lock); + kfree(counter->stats); +err_stats: + kfree(counter); + return NULL; +} + +static void rdma_counter_free(struct rdma_counter *counter) +{ + struct rdma_port_counter *port_counter; + + port_counter = 
&counter->device->port_data[counter->port].port_counter; + mutex_lock(&port_counter->lock); + port_counter->num_counters--; + if (!port_counter->num_counters && + (port_counter->mode.mode == RDMA_COUNTER_MODE_MANUAL)) + __counter_set_mode(&port_counter->mode, RDMA_COUNTER_MODE_NONE, + 0); + + mutex_unlock(&port_counter->lock); + + rdma_restrack_del(&counter->res); + kfree(counter->stats); + kfree(counter); +} + +static void auto_mode_init_counter(struct rdma_counter *counter, + const struct ib_qp *qp, + enum rdma_nl_counter_mask new_mask) +{ + struct auto_mode_param *param = &counter->mode.param; + + counter->mode.mode = RDMA_COUNTER_MODE_AUTO; + counter->mode.mask = new_mask; + + if (new_mask & RDMA_COUNTER_MASK_QP_TYPE) + param->qp_type = qp->qp_type; +} + +static bool auto_mode_match(struct ib_qp *qp, struct rdma_counter *counter, + enum rdma_nl_counter_mask auto_mask) +{ + struct auto_mode_param *param = &counter->mode.param; + bool match = true; + + if (rdma_is_kernel_res(&counter->res) != rdma_is_kernel_res(&qp->res)) + return false; + + /* Ensure that counter belong to right PID */ + if (!rdma_is_kernel_res(&counter->res) && + !rdma_is_kernel_res(&qp->res) && + (task_pid_vnr(counter->res.task) != current->pid)) + return false; + + if (auto_mask & RDMA_COUNTER_MASK_QP_TYPE) + match &= (param->qp_type == qp->qp_type); + + return match; +} + +static int __rdma_counter_bind_qp(struct rdma_counter *counter, + struct ib_qp *qp) +{ + int ret; + + if (qp->counter) + return -EINVAL; + + if (!qp->device->ops.counter_bind_qp) + return -EOPNOTSUPP; + + mutex_lock(&counter->lock); + ret = qp->device->ops.counter_bind_qp(counter, qp); + mutex_unlock(&counter->lock); + + return ret; +} + +static int __rdma_counter_unbind_qp(struct ib_qp *qp) +{ + struct rdma_counter *counter = qp->counter; + int ret; + + if (!qp->device->ops.counter_unbind_qp) + return -EOPNOTSUPP; + + mutex_lock(&counter->lock); + ret = qp->device->ops.counter_unbind_qp(qp); + 
mutex_unlock(&counter->lock); + + return ret; +} + +static void counter_history_stat_update(const struct rdma_counter *counter) +{ + struct ib_device *dev = counter->device; + struct rdma_port_counter *port_counter; + int i; + + port_counter = &dev->port_data[counter->port].port_counter; + if (!port_counter->hstats) + return; + + for (i = 0; i < counter->stats->num_counters; i++) + port_counter->hstats->value[i] += counter->stats->value[i]; +} + +/** + * rdma_get_counter_auto_mode - Find the counter that @qp should be bound + * with in auto mode + * + * Return: The counter (with ref-count increased) if found + */ +static struct rdma_counter *rdma_get_counter_auto_mode(struct ib_qp *qp, + u8 port) +{ + struct rdma_port_counter *port_counter; + struct rdma_counter *counter = NULL; + struct ib_device *dev = qp->device; + struct rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + unsigned long id = 0; + + port_counter = &dev->port_data[port].port_counter; + rt = &dev->res[RDMA_RESTRACK_COUNTER]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_is_visible_in_pid_ns(res)) + continue; + + counter = container_of(res, struct rdma_counter, res); + if ((counter->device != qp->device) || (counter->port != port)) + goto next; + + if (auto_mode_match(qp, counter, port_counter->mode.mask)) + break; +next: + counter = NULL; + } + + if (counter && !kref_get_unless_zero(&counter->kref)) + counter = NULL; + + xa_unlock(&rt->xa); + return counter; +} + +static void rdma_counter_res_add(struct rdma_counter *counter, + struct ib_qp *qp) +{ + if (rdma_is_kernel_res(&qp->res)) { + rdma_restrack_set_task(&counter->res, qp->res.kern_name); + rdma_restrack_kadd(&counter->res); + } else { + rdma_restrack_attach_task(&counter->res, qp->res.task); + rdma_restrack_uadd(&counter->res); + } +} + +static void counter_release(struct kref *kref) +{ + struct rdma_counter *counter; + + counter = container_of(kref, struct rdma_counter, kref); + 
counter_history_stat_update(counter); + counter->device->ops.counter_dealloc(counter); + rdma_counter_free(counter); +} + +/** + * rdma_counter_bind_qp_auto - Check and bind the QP to a counter base on + * the auto-mode rule + */ +int rdma_counter_bind_qp_auto(struct ib_qp *qp, u8 port) +{ + struct rdma_port_counter *port_counter; + struct ib_device *dev = qp->device; + struct rdma_counter *counter; + int ret; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + port_counter = &dev->port_data[port].port_counter; + if (port_counter->mode.mode != RDMA_COUNTER_MODE_AUTO) + return 0; + + counter = rdma_get_counter_auto_mode(qp, port); + if (counter) { + ret = __rdma_counter_bind_qp(counter, qp); + if (ret) { + kref_put(&counter->kref, counter_release); + return ret; + } + } else { + counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_AUTO); + if (!counter) + return -ENOMEM; + + auto_mode_init_counter(counter, qp, port_counter->mode.mask); + + ret = __rdma_counter_bind_qp(counter, qp); + if (ret) { + rdma_counter_free(counter); + return ret; + } + + rdma_counter_res_add(counter, qp); + } + + return 0; +} + +/** + * rdma_counter_unbind_qp - Unbind a qp from a counter + * @force: + * true - Decrease the counter ref-count anyway (e.g., qp destroy) + */ +int rdma_counter_unbind_qp(struct ib_qp *qp, bool force) +{ + struct rdma_counter *counter = qp->counter; + int ret; + + if (!counter) + return -EINVAL; + + ret = __rdma_counter_unbind_qp(qp); + if (ret && !force) + return ret; + + kref_put(&counter->kref, counter_release); + return 0; +} + +int rdma_counter_query_stats(struct rdma_counter *counter) +{ + struct ib_device *dev = counter->device; + int ret; + + if (!dev->ops.counter_update_stats) + return -EINVAL; + + mutex_lock(&counter->lock); + ret = dev->ops.counter_update_stats(counter); + mutex_unlock(&counter->lock); + + return ret; +} + +static u64 get_running_counters_hwstat_sum(struct ib_device *dev, + u8 port, u32 index) +{ + struct 
rdma_restrack_entry *res; + struct rdma_restrack_root *rt; + struct rdma_counter *counter; + unsigned long id = 0; + u64 sum = 0; + + rt = &dev->res[RDMA_RESTRACK_COUNTER]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_restrack_get(res)) + continue; + + xa_unlock(&rt->xa); + + counter = container_of(res, struct rdma_counter, res); + if ((counter->device != dev) || (counter->port != port) || + rdma_counter_query_stats(counter)) + goto next; + + sum += counter->stats->value[index]; + +next: + xa_lock(&rt->xa); + rdma_restrack_put(res); + } + + xa_unlock(&rt->xa); + return sum; +} + +/** + * rdma_counter_get_hwstat_value() - Get the sum value of all counters on a + * specific port, including the running ones and history data + */ +u64 rdma_counter_get_hwstat_value(struct ib_device *dev, u8 port, u32 index) +{ + struct rdma_port_counter *port_counter; + u64 sum; + + port_counter = &dev->port_data[port].port_counter; + sum = get_running_counters_hwstat_sum(dev, port, index); + sum += port_counter->hstats->value[index]; + + return sum; +} + +static struct ib_qp *rdma_counter_get_qp(struct ib_device *dev, u32 qp_num) +{ + struct rdma_restrack_entry *res = NULL; + struct ib_qp *qp = NULL; + + res = rdma_restrack_get_byid(dev, RDMA_RESTRACK_QP, qp_num); + if (IS_ERR(res)) + return NULL; + + if (!rdma_is_visible_in_pid_ns(res)) + goto err; + + qp = container_of(res, struct ib_qp, res); + if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) + goto err; + + return qp; + +err: + rdma_restrack_put(&qp->res); + return NULL; +} + +static int rdma_counter_bind_qp_manual(struct rdma_counter *counter, + struct ib_qp *qp) +{ + if ((counter->device != qp->device) || (counter->port != qp->port)) + return -EINVAL; + + return __rdma_counter_bind_qp(counter, qp); +} + +static struct rdma_counter *rdma_get_counter_by_id(struct ib_device *dev, + u32 counter_id) +{ + struct rdma_restrack_entry *res; + struct rdma_counter *counter; + + res = 
rdma_restrack_get_byid(dev, RDMA_RESTRACK_COUNTER, counter_id); + if (IS_ERR(res)) + return NULL; + + if (!rdma_is_visible_in_pid_ns(res)) { + rdma_restrack_put(res); + return NULL; + } + + counter = container_of(res, struct rdma_counter, res); + kref_get(&counter->kref); + rdma_restrack_put(res); + + return counter; +} + +/** + * rdma_counter_bind_qpn() - Bind QP @qp_num to counter @counter_id + */ +int rdma_counter_bind_qpn(struct ib_device *dev, u8 port, + u32 qp_num, u32 counter_id) +{ + struct rdma_counter *counter; + struct ib_qp *qp; + int ret; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + counter = rdma_get_counter_by_id(dev, counter_id); + if (!counter) { + ret = -ENOENT; + goto err; + } + + if (counter->res.task != qp->res.task) { + ret = -EINVAL; + goto err_task; + } + + ret = rdma_counter_bind_qp_manual(counter, qp); + if (ret) + goto err_task; + + rdma_restrack_put(&qp->res); + return 0; + +err_task: + kref_put(&counter->kref, counter_release); +err: + rdma_restrack_put(&qp->res); + return ret; +} + +/** + * rdma_counter_bind_qpn_alloc() - Alloc a counter and bind QP @qp_num to it + * The id of new counter is returned in @counter_id + */ +int rdma_counter_bind_qpn_alloc(struct ib_device *dev, u8 port, + u32 qp_num, u32 *counter_id) +{ + struct rdma_counter *counter; + struct ib_qp *qp; + int ret; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { + ret = -EINVAL; + goto err; + } + + counter = rdma_counter_alloc(dev, port, RDMA_COUNTER_MODE_MANUAL); + if (!counter) { + ret = -ENOMEM; + goto err; + } + + ret = rdma_counter_bind_qp_manual(counter, qp); + if (ret) + goto err_bind; + + if (counter_id) + *counter_id = counter->id; + + rdma_counter_res_add(counter, qp); + + rdma_restrack_put(&qp->res); + return ret; + +err_bind: + rdma_counter_free(counter); +err: + 
rdma_restrack_put(&qp->res); + return ret; +} + +/** + * rdma_counter_unbind_qpn() - Unbind QP @qp_num from a counter + */ +int rdma_counter_unbind_qpn(struct ib_device *dev, u8 port, + u32 qp_num, u32 counter_id) +{ + struct rdma_port_counter *port_counter; + struct ib_qp *qp; + int ret; + + if (!rdma_is_port_valid(dev, port)) + return -EINVAL; + + qp = rdma_counter_get_qp(dev, qp_num); + if (!qp) + return -ENOENT; + + if (rdma_is_port_valid(dev, qp->port) && (qp->port != port)) { + ret = -EINVAL; + goto out; + } + + port_counter = &dev->port_data[port].port_counter; + if (!qp->counter || qp->counter->id != counter_id || + port_counter->mode.mode != RDMA_COUNTER_MODE_MANUAL) { + ret = -EINVAL; + goto out; + } + + ret = rdma_counter_unbind_qp(qp, false); + +out: + rdma_restrack_put(&qp->res); + return ret; +} + +int rdma_counter_get_mode(struct ib_device *dev, u8 port, + enum rdma_nl_counter_mode *mode, + enum rdma_nl_counter_mask *mask) +{ + struct rdma_port_counter *port_counter; + + port_counter = &dev->port_data[port].port_counter; + *mode = port_counter->mode.mode; + *mask = port_counter->mode.mask; + + return 0; +} + +void rdma_counter_init(struct ib_device *dev) +{ + struct rdma_port_counter *port_counter; + u32 port; + + if (!dev->ops.alloc_hw_stats || !dev->port_data) + return; + + rdma_for_each_port(dev, port) { + port_counter = &dev->port_data[port].port_counter; + port_counter->mode.mode = RDMA_COUNTER_MODE_NONE; + mutex_init(&port_counter->lock); + + port_counter->hstats = dev->ops.alloc_hw_stats(dev, port); + if (!port_counter->hstats) + goto fail; + } + + return; + +fail: + rdma_for_each_port(dev, port) { + port_counter = &dev->port_data[port].port_counter; + kfree(port_counter->hstats); + port_counter->hstats = NULL; + } + + return; +} + +void rdma_counter_release(struct ib_device *dev) +{ + struct rdma_port_counter *port_counter; + u32 port; + + if (!dev->ops.alloc_hw_stats) + return; + + rdma_for_each_port(dev, port) { + port_counter = 
&dev->port_data[port].port_counter; + kfree(port_counter->hstats); + } +} diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index a24c900fbdf6..7c599878ccf7 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -18,6 +18,53 @@ #define IB_POLL_FLAGS \ (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) +static const struct dim_cq_moder +rdma_dim_prof[RDMA_DIM_PARAMS_NUM_PROFILES] = { + {1, 0, 1, 0}, + {1, 0, 4, 0}, + {2, 0, 4, 0}, + {2, 0, 8, 0}, + {4, 0, 8, 0}, + {16, 0, 8, 0}, + {16, 0, 16, 0}, + {32, 0, 16, 0}, + {32, 0, 32, 0}, +}; + +static void ib_cq_rdma_dim_work(struct work_struct *w) +{ + struct dim *dim = container_of(w, struct dim, work); + struct ib_cq *cq = dim->priv; + + u16 usec = rdma_dim_prof[dim->profile_ix].usec; + u16 comps = rdma_dim_prof[dim->profile_ix].comps; + + dim->state = DIM_START_MEASURE; + + cq->device->ops.modify_cq(cq, comps, usec); +} + +static void rdma_dim_init(struct ib_cq *cq) +{ + struct dim *dim; + + if (!cq->device->ops.modify_cq || !cq->device->use_cq_dim || + cq->poll_ctx == IB_POLL_DIRECT) + return; + + dim = kzalloc(sizeof(struct dim), GFP_KERNEL); + if (!dim) + return; + + dim->state = DIM_START_MEASURE; + dim->tune_state = DIM_GOING_RIGHT; + dim->profile_ix = RDMA_DIM_START_PROFILE; + dim->priv = cq; + cq->dim = dim; + + INIT_WORK(&dim->work, ib_cq_rdma_dim_work); +} + static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *wcs, int batch) { @@ -78,6 +125,7 @@ static void ib_cq_completion_direct(struct ib_cq *cq, void *private) static int ib_poll_handler(struct irq_poll *iop, int budget) { struct ib_cq *cq = container_of(iop, struct ib_cq, iop); + struct dim *dim = cq->dim; int completed; completed = __ib_process_cq(cq, budget, cq->wc, IB_POLL_BATCH); @@ -87,6 +135,9 @@ static int ib_poll_handler(struct irq_poll *iop, int budget) irq_poll_sched(&cq->iop); } + if (dim) + rdma_dim(dim, completed); + return completed; } @@ -105,6 +156,8 @@ static void 
ib_cq_poll_work(struct work_struct *work) if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) queue_work(cq->comp_wq, &cq->work); + else if (cq->dim) + rdma_dim(cq->dim, completed); } static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) @@ -113,7 +166,7 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) } /** - * __ib_alloc_cq - allocate a completion queue + * __ib_alloc_cq_user - allocate a completion queue * @dev: device to allocate the CQ for * @private: driver private data, accessible from cq->cq_context * @nr_cqe: number of CQEs to allocate @@ -139,25 +192,30 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, struct ib_cq *cq; int ret = -ENOMEM; - cq = dev->ops.create_cq(dev, &cq_attr, NULL); - if (IS_ERR(cq)) - return cq; + cq = rdma_zalloc_drv_obj(dev, ib_cq); + if (!cq) + return ERR_PTR(ret); cq->device = dev; - cq->uobject = NULL; - cq->event_handler = NULL; cq->cq_context = private; cq->poll_ctx = poll_ctx; atomic_set(&cq->usecnt, 0); cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); if (!cq->wc) - goto out_destroy_cq; + goto out_free_cq; cq->res.type = RDMA_RESTRACK_CQ; rdma_restrack_set_task(&cq->res, caller); + + ret = dev->ops.create_cq(cq, &cq_attr, NULL); + if (ret) + goto out_free_wc; + rdma_restrack_kadd(&cq->res); + rdma_dim_init(cq); + switch (cq->poll_ctx) { case IB_POLL_DIRECT: cq->comp_handler = ib_cq_completion_direct; @@ -178,29 +236,29 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private, break; default: ret = -EINVAL; - goto out_free_wc; + goto out_destroy_cq; } return cq; -out_free_wc: - kfree(cq->wc); - rdma_restrack_del(&cq->res); out_destroy_cq: + rdma_restrack_del(&cq->res); cq->device->ops.destroy_cq(cq, udata); +out_free_wc: + kfree(cq->wc); +out_free_cq: + kfree(cq); return ERR_PTR(ret); } EXPORT_SYMBOL(__ib_alloc_cq_user); /** - * ib_free_cq - free a completion queue + * ib_free_cq_user - free 
a completion queue * @cq: completion queue to free. * @udata: User data or NULL for kernel object */ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) { - int ret; - if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) return; @@ -218,9 +276,12 @@ void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata) WARN_ON_ONCE(1); } - kfree(cq->wc); rdma_restrack_del(&cq->res); - ret = cq->device->ops.destroy_cq(cq, udata); - WARN_ON_ONCE(ret); + cq->device->ops.destroy_cq(cq, udata); + if (cq->dim) + cancel_work_sync(&cq->dim->work); + kfree(cq->dim); + kfree(cq->wc); + kfree(cq); } EXPORT_SYMBOL(ib_free_cq_user); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 3352a107b4a3..9773145dee09 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -46,6 +46,7 @@ #include <rdma/rdma_netlink.h> #include <rdma/ib_addr.h> #include <rdma/ib_cache.h> +#include <rdma/rdma_counter.h> #include "core_priv.h" #include "restrack.h" @@ -270,7 +271,7 @@ struct ib_port_data_rcu { struct ib_port_data pdata[]; }; -static int ib_device_check_mandatory(struct ib_device *device) +static void ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } static const struct { @@ -305,8 +306,6 @@ static int ib_device_check_mandatory(struct ib_device *device) break; } } - - return 0; } /* @@ -375,7 +374,7 @@ struct ib_device *ib_device_get_by_name(const char *name, down_read(&devices_rwsem); device = __ib_device_get_by_name(name); if (device && driver_id != RDMA_DRIVER_UNKNOWN && - device->driver_id != driver_id) + device->ops.driver_id != driver_id) device = NULL; if (device) { @@ -449,6 +448,15 @@ int ib_device_rename(struct ib_device *ibdev, const char *name) return 0; } +int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) +{ + if (use_dim > 1) + return -EINVAL; + ibdev->use_cq_dim = use_dim; + + return 0; +} + static int alloc_name(struct ib_device 
*ibdev, const char *name) { struct ib_device *device; @@ -494,10 +502,12 @@ static void ib_device_release(struct device *device) if (dev->port_data) { ib_cache_release_one(dev); ib_security_release_port_pkey_list(dev); + rdma_counter_release(dev); kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, pdata[0]), rcu_head); } + xa_destroy(&dev->compat_devs); xa_destroy(&dev->client_data); kfree_rcu(dev, rcu_head); @@ -1193,10 +1203,7 @@ static int setup_device(struct ib_device *device) int ret; setup_dma_device(device); - - ret = ib_device_check_mandatory(device); - if (ret) - return ret; + ib_device_check_mandatory(device); ret = setup_port_data(device); if (ret) { @@ -1321,6 +1328,8 @@ int ib_register_device(struct ib_device *device, const char *name) ib_device_register_rdmacg(device); + rdma_counter_init(device); + /* * Ensure that ADD uevent is not fired because it * is too early amd device is not initialized yet. @@ -1479,7 +1488,7 @@ void ib_unregister_driver(enum rdma_driver_id driver_id) down_read(&devices_rwsem); xa_for_each (&devices, index, ib_dev) { - if (ib_dev->driver_id != driver_id) + if (ib_dev->ops.driver_id != driver_id) continue; get_device(&ib_dev->dev); @@ -1749,6 +1758,104 @@ void ib_unregister_client(struct ib_client *client) } EXPORT_SYMBOL(ib_unregister_client); +static int __ib_get_global_client_nl_info(const char *client_name, + struct ib_client_nl_info *res) +{ + struct ib_client *client; + unsigned long index; + int ret = -ENOENT; + + down_read(&clients_rwsem); + xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { + if (strcmp(client->name, client_name) != 0) + continue; + if (!client->get_global_nl_info) { + ret = -EOPNOTSUPP; + break; + } + ret = client->get_global_nl_info(res); + if (WARN_ON(ret == -ENOENT)) + ret = -EINVAL; + if (!ret && res->cdev) + get_device(res->cdev); + break; + } + up_read(&clients_rwsem); + return ret; +} + +static int __ib_get_client_nl_info(struct ib_device *ibdev, + const char 
*client_name, + struct ib_client_nl_info *res) +{ + unsigned long index; + void *client_data; + int ret = -ENOENT; + + down_read(&ibdev->client_data_rwsem); + xan_for_each_marked (&ibdev->client_data, index, client_data, + CLIENT_DATA_REGISTERED) { + struct ib_client *client = xa_load(&clients, index); + + if (!client || strcmp(client->name, client_name) != 0) + continue; + if (!client->get_nl_info) { + ret = -EOPNOTSUPP; + break; + } + ret = client->get_nl_info(ibdev, client_data, res); + if (WARN_ON(ret == -ENOENT)) + ret = -EINVAL; + + /* + * The cdev is guaranteed valid as long as we are inside the + * client_data_rwsem as remove_one can't be called. Keep it + * valid for the caller. + */ + if (!ret && res->cdev) + get_device(res->cdev); + break; + } + up_read(&ibdev->client_data_rwsem); + + return ret; +} + +/** + * ib_get_client_nl_info - Fetch the nl_info from a client + * @device - IB device + * @client_name - Name of the client + * @res - Result of the query + */ +int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, + struct ib_client_nl_info *res) +{ + int ret; + + if (ibdev) + ret = __ib_get_client_nl_info(ibdev, client_name, res); + else + ret = __ib_get_global_client_nl_info(client_name, res); +#ifdef CONFIG_MODULES + if (ret == -ENOENT) { + request_module("rdma-client-%s", client_name); + if (ibdev) + ret = __ib_get_client_nl_info(ibdev, client_name, res); + else + ret = __ib_get_global_client_nl_info(client_name, res); + } +#endif + if (ret) { + if (ret == -ENOENT) + return -EOPNOTSUPP; + return ret; + } + + if (WARN_ON(!res->cdev)) + return -EINVAL; + return 0; +} + /** * ib_set_client_data - Set IB client context * @device:Device to set context for @@ -2039,7 +2146,7 @@ struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, (uintptr_t)ndev) { if (rcu_access_pointer(cur->netdev) == ndev && (driver_id == RDMA_DRIVER_UNKNOWN || - cur->ib_dev->driver_id == driver_id) && + cur->ib_dev->ops.driver_id == driver_id) && 
ib_device_try_get(cur->ib_dev)) { res = cur->ib_dev; break; @@ -2344,12 +2451,28 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) + if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { + WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && + dev_ops->driver_id != ops->driver_id); + dev_ops->driver_id = ops->driver_id; + } + if (ops->owner) { + WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); + dev_ops->owner = ops->owner; + } + if (ops->uverbs_abi_ver) + dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; + + dev_ops->uverbs_no_driver_id_binding |= + ops->uverbs_no_driver_id_binding; + SET_DEVICE_OP(dev_ops, add_gid); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); SET_DEVICE_OP(dev_ops, alloc_fmr); SET_DEVICE_OP(dev_ops, alloc_hw_stats); SET_DEVICE_OP(dev_ops, alloc_mr); + SET_DEVICE_OP(dev_ops, alloc_mr_integrity); SET_DEVICE_OP(dev_ops, alloc_mw); SET_DEVICE_OP(dev_ops, alloc_pd); SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); @@ -2357,6 +2480,11 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, alloc_xrcd); SET_DEVICE_OP(dev_ops, attach_mcast); SET_DEVICE_OP(dev_ops, check_mr_status); + SET_DEVICE_OP(dev_ops, counter_alloc_stats); + SET_DEVICE_OP(dev_ops, counter_bind_qp); + SET_DEVICE_OP(dev_ops, counter_dealloc); + SET_DEVICE_OP(dev_ops, counter_unbind_qp); + SET_DEVICE_OP(dev_ops, counter_update_stats); SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); @@ -2409,6 +2537,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, iw_reject); SET_DEVICE_OP(dev_ops, iw_rem_ref); SET_DEVICE_OP(dev_ops, map_mr_sg); + SET_DEVICE_OP(dev_ops, map_mr_sg_pi); SET_DEVICE_OP(dev_ops, map_phys_fmr); SET_DEVICE_OP(dev_ops, mmap); SET_DEVICE_OP(dev_ops, modify_ah); @@ -2445,6 +2574,7 @@ void 
ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, unmap_fmr); SET_OBJ_SIZE(dev_ops, ib_ah); + SET_OBJ_SIZE(dev_ops, ib_cq); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_srq); SET_OBJ_SIZE(dev_ops, ib_ucontext); diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c index d117f21ce9fd..c0e2df128b34 100644 --- a/drivers/infiniband/core/mr_pool.c +++ b/drivers/infiniband/core/mr_pool.c @@ -34,14 +34,18 @@ void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr) EXPORT_SYMBOL(ib_mr_pool_put); int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr, - enum ib_mr_type type, u32 max_num_sg) + enum ib_mr_type type, u32 max_num_sg, u32 max_num_meta_sg) { struct ib_mr *mr; unsigned long flags; int ret, i; for (i = 0; i < nr; i++) { - mr = ib_alloc_mr(qp->pd, type, max_num_sg); + if (type == IB_MR_TYPE_INTEGRITY) + mr = ib_alloc_mr_integrity(qp->pd, max_num_sg, + max_num_meta_sg); + else + mr = ib_alloc_mr(qp->pd, type, max_num_sg); if (IS_ERR(mr)) { ret = PTR_ERR(mr); goto out; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 69188cbbd99b..783e465e7c41 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -42,84 +42,105 @@ #include "cma_priv.h" #include "restrack.h" +/* + * Sort array elements by the netlink attribute name + */ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { - [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, - .len = IB_DEVICE_NAME_MAX - 1}, - [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, - .len = IB_FW_VERSION_NAME_MAX - 1}, - [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_LID] = { .type = 
NLA_U32 }, - [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING, - .len = 16 }, - [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, - .len = TASK_COMM_LEN }, + [RDMA_NLDEV_ATTR_CHARDEV] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_ABI] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_CHARDEV_NAME] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_CHARDEV_TYPE] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE }, + [RDMA_NLDEV_ATTR_DEV_DIM] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, + .len = IB_DEVICE_NAME_MAX }, + [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type 
= NLA_U8 }, + [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, + [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, + [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ }, + [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, + .len = IFNAMSIZ }, + [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_CM_ID] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_RES_CM_ID_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = { - .len = sizeof(struct __kernel_sockaddr_storage) }, - [RDMA_NLDEV_ATTR_RES_DST_ADDR] = { - .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_CQ] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_CQE] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, - [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_CQ_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, + 
[RDMA_NLDEV_ATTR_RES_DST_ADDR] = { + .len = sizeof(struct __kernel_sockaddr_storage) }, [RDMA_NLDEV_ATTR_RES_IOVA] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_RES_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_MR] = { .type = NLA_NESTED }, [RDMA_NLDEV_ATTR_RES_MRLEN] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_MR_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_RES_PD] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_RES_LOCAL_DMA_LKEY] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_NDEV_INDEX] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_NDEV_NAME] = { .type = NLA_NUL_STRING, - .len = IFNAMSIZ }, - [RDMA_NLDEV_ATTR_DRIVER] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_DRIVER_ENTRY] = { .type = NLA_NESTED }, - [RDMA_NLDEV_ATTR_DRIVER_STRING] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, - [RDMA_NLDEV_ATTR_DRIVER_PRINT_TYPE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_DRIVER_S32] = { .type = NLA_S32 }, - [RDMA_NLDEV_ATTR_DRIVER_U32] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_DRIVER_S64] = { .type = NLA_S64 }, - [RDMA_NLDEV_ATTR_DRIVER_U64] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_RES_PDN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_CQN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_MRN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_CM_IDN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_RES_CTXN] = { .type = NLA_U32 }, - [RDMA_NLDEV_ATTR_LINK_TYPE] = { .type = NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, - [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 }, - [RDMA_NLDEV_ATTR_DEV_PROTOCOL] = { .type 
= NLA_NUL_STRING, - .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN }, + [RDMA_NLDEV_ATTR_RES_PD_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_POLL_CTX] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_PS] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_RKEY] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SRC_ADDR] = { + .len = sizeof(struct __kernel_sockaddr_storage) }, + [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR]= { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME]= { .type = NLA_NUL_STRING, + .len = RDMA_NLDEV_ATTR_EMPTY_STRING }, + [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_UNSAFE_GLOBAL_RKEY]= { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_USECNT] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_MODE] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_RES] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_COUNTER] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_COUNTER_ID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTERS] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME] = { .type = NLA_NUL_STRING }, + [RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, 
+ [RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID] = { .type = NLA_U32 }, [RDMA_NLDEV_NET_NS_FD] = { .type = NLA_U32 }, + [RDMA_NLDEV_SYS_ATTR_NETNS_MODE] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -232,6 +253,8 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) return -EMSGSIZE; if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, device->use_cq_dim)) + return -EMSGSIZE; /* * Link type is determined on first port and mlx4 device @@ -532,6 +555,9 @@ static int fill_res_cq_entry(struct sk_buff *msg, bool has_cap_net_admin, nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_POLL_CTX, cq->poll_ctx)) goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_DIM, (cq->dim != NULL))) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_CQN, res->id)) goto err; if (!rdma_is_kernel_res(res) && @@ -623,6 +649,152 @@ static int fill_res_pd_entry(struct sk_buff *msg, bool has_cap_net_admin, err: return -EMSGSIZE; } +static int fill_stat_counter_mode(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_counter_mode *m = &counter->mode; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, m->mode)) + return -EMSGSIZE; + + if (m->mode == RDMA_COUNTER_MODE_AUTO) + if ((m->mask & RDMA_COUNTER_MASK_QP_TYPE) && + nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, m->param.qp_type)) + return -EMSGSIZE; + + return 0; +} + +static int fill_stat_counter_qp_entry(struct sk_buff *msg, u32 qpn) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_stat_counter_qps(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_restrack_entry *res; + struct 
rdma_restrack_root *rt; + struct nlattr *table_attr; + struct ib_qp *qp = NULL; + unsigned long id = 0; + int ret = 0; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP); + + rt = &counter->device->res[RDMA_RESTRACK_QP]; + xa_lock(&rt->xa); + xa_for_each(&rt->xa, id, res) { + if (!rdma_is_visible_in_pid_ns(res)) + continue; + + qp = container_of(res, struct ib_qp, res); + if (qp->qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW)) + continue; + + if (!qp->counter || (qp->counter->id != counter->id)) + continue; + + ret = fill_stat_counter_qp_entry(msg, qp->qp_num); + if (ret) + goto err; + } + + xa_unlock(&rt->xa); + nla_nest_end(msg, table_attr); + return 0; + +err: + xa_unlock(&rt->xa); + nla_nest_cancel(msg, table_attr); + return ret; +} + +static int fill_stat_hwcounter_entry(struct sk_buff *msg, + const char *name, u64 value) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_string(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_NAME, + name)) + goto err; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTER_ENTRY_VALUE, + value, RDMA_NLDEV_ATTR_PAD)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_stat_counter_hwcounters(struct sk_buff *msg, + struct rdma_counter *counter) +{ + struct rdma_hw_stats *st = counter->stats; + struct nlattr *table_attr; + int i; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table_attr) + return -EMSGSIZE; + + for (i = 0; i < st->num_counters; i++) + if (fill_stat_hwcounter_entry(msg, st->names[i], st->value[i])) + goto err; + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return -EMSGSIZE; +} + +static int fill_res_counter_entry(struct sk_buff *msg, bool has_cap_net_admin, + struct rdma_restrack_entry *res, + uint32_t port) +{ + struct 
rdma_counter *counter = + container_of(res, struct rdma_counter, res); + + if (port && port != counter->port) + return 0; + + /* Dump it even query failed */ + rdma_counter_query_stats(counter); + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, counter->port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, counter->id) || + fill_res_name_pid(msg, &counter->res) || + fill_stat_counter_mode(msg, counter) || + fill_stat_counter_qps(msg, counter) || + fill_stat_counter_hwcounters(msg, counter)) + return -EMSGSIZE; + + return 0; +} + static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -704,6 +876,14 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, goto put_done; } + if (tb[RDMA_NLDEV_ATTR_DEV_DIM]) { + u8 use_dim; + + use_dim = nla_get_u8(tb[RDMA_NLDEV_ATTR_DEV_DIM]); + err = ib_device_set_dim(device, use_dim); + goto done; + } + done: ib_device_put(device); put_done: @@ -990,19 +1170,15 @@ static const struct nldev_fill_res_entry fill_entries[RDMA_RESTRACK_MAX] = { .entry = RDMA_NLDEV_ATTR_RES_PD_ENTRY, .id = RDMA_NLDEV_ATTR_RES_PDN, }, + [RDMA_RESTRACK_COUNTER] = { + .fill_res_func = fill_res_counter_entry, + .nldev_cmd = RDMA_NLDEV_CMD_STAT_GET, + .nldev_attr = RDMA_NLDEV_ATTR_STAT_COUNTER, + .entry = RDMA_NLDEV_ATTR_STAT_COUNTER_ENTRY, + .id = RDMA_NLDEV_ATTR_STAT_COUNTER_ID, + }, }; -static bool is_visible_in_pid_ns(struct rdma_restrack_entry *res) -{ - /* - * 1. Kern resources should be visible in init name space only - * 2. 
Present only resources visible in the current namespace - */ - if (rdma_is_kernel_res(res)) - return task_active_pid_ns(current) == &init_pid_ns; - return task_active_pid_ns(current) == task_active_pid_ns(res->task); -} - static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, enum rdma_restrack_type res_type) @@ -1047,7 +1223,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, goto err; } - if (!is_visible_in_pid_ns(res)) { + if (!rdma_is_visible_in_pid_ns(res)) { ret = -ENOENT; goto err_get; } @@ -1159,7 +1335,7 @@ static int res_get_common_dumpit(struct sk_buff *skb, * objects. */ xa_for_each(&rt->xa, id, res) { - if (!is_visible_in_pid_ns(res)) + if (!rdma_is_visible_in_pid_ns(res)) continue; if (idx < start || !rdma_restrack_get(res)) @@ -1237,6 +1413,7 @@ RES_GET_FUNCS(cm_id, RDMA_RESTRACK_CM_ID); RES_GET_FUNCS(cq, RDMA_RESTRACK_CQ); RES_GET_FUNCS(pd, RDMA_RESTRACK_PD); RES_GET_FUNCS(mr, RDMA_RESTRACK_MR); +RES_GET_FUNCS(counter, RDMA_RESTRACK_COUNTER); static LIST_HEAD(link_ops); static DECLARE_RWSEM(link_ops_rwsem); @@ -1299,7 +1476,7 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], sizeof(ndev_name)); - ndev = dev_get_by_name(&init_net, ndev_name); + ndev = dev_get_by_name(sock_net(skb->sk), ndev_name); if (!ndev) return -ENODEV; @@ -1347,6 +1524,90 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } +static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + char client_name[RDMA_NLDEV_ATTR_CHARDEV_TYPE_SIZE]; + struct ib_client_nl_info data = {}; + struct ib_device *ibdev = NULL; + struct sk_buff *msg; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) + return -EINVAL; + + 
nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], + sizeof(client_name)); + + if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + ibdev = ib_device_get_by_index(sock_net(skb->sk), index); + if (!ibdev) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + data.port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(ibdev, data.port)) { + err = -EINVAL; + goto out_put; + } + } else { + data.port = -1; + } + } else if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + err = -ENOMEM; + goto out_put; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_GET_CHARDEV), + 0, 0); + + data.nl_msg = msg; + err = ib_get_client_nl_info(ibdev, client_name, &data); + if (err) + goto out_nlmsg; + + err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV, + huge_encode_dev(data.cdev->devt), + RDMA_NLDEV_ATTR_PAD); + if (err) + goto out_data; + err = nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CHARDEV_ABI, data.abi, + RDMA_NLDEV_ATTR_PAD); + if (err) + goto out_data; + if (nla_put_string(msg, RDMA_NLDEV_ATTR_CHARDEV_NAME, + dev_name(data.cdev))) { + err = -EMSGSIZE; + goto out_data; + } + + nlmsg_end(msg, nlh); + put_device(data.cdev); + if (ibdev) + ib_device_put(ibdev); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +out_data: + put_device(data.cdev); +out_nlmsg: + nlmsg_free(msg); +out_put: + if (ibdev) + ib_device_put(ibdev); + return err; +} + static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1399,11 +1660,375 @@ static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } +static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + u32 index, port, mode, mask = 0, qpn, cntn = 0; + struct nlattr 
*tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + /* Currently only counter for QP is supported */ + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || + !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || !tb[RDMA_NLDEV_ATTR_STAT_MODE]) + return -EINVAL; + + if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_SET), + 0, 0); + + mode = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_MODE]); + if (mode == RDMA_COUNTER_MODE_AUTO) { + if (tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]) + mask = nla_get_u32( + tb[RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK]); + + ret = rdma_counter_set_auto_mode(device, port, + mask ? 
true : false, mask); + if (ret) + goto err_msg; + } else { + qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); + if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) { + cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); + ret = rdma_counter_bind_qpn(device, port, qpn, cntn); + } else { + ret = rdma_counter_bind_qpn_alloc(device, port, + qpn, &cntn); + } + if (ret) + goto err_msg; + + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { + ret = -EMSGSIZE; + goto err_fill; + } + } + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_fill: + rdma_counter_unbind_qpn(device, port, qpn, cntn); +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static int nldev_stat_del_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index, port, qpn, cntn; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX] || + !tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID] || + !tb[RDMA_NLDEV_ATTR_RES_LQPN]) + return -EINVAL; + + if (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES]) != RDMA_NLDEV_ATTR_RES_QP) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + 
RDMA_NLDEV_CMD_STAT_SET), + 0, 0); + + cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]); + qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]); + ret = rdma_counter_unbind_qpn(device, port, qpn, cntn); + if (ret) + goto err_unbind; + + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_COUNTER_ID, cntn) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qpn)) { + ret = -EMSGSIZE; + goto err_fill; + } + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_fill: + rdma_counter_bind_qpn(device, port, qpn, cntn); +err_unbind: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static int stat_get_doit_default_counter(struct sk_buff *skb, + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct nlattr *tb[]) +{ + struct rdma_hw_stats *stats; + struct nlattr *table_attr; + struct ib_device *device; + int ret, num_cnts, i; + struct sk_buff *msg; + u32 index, port; + u64 v; + + if (!tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + if (!device->ops.alloc_hw_stats || !device->ops.get_hw_stats) { + ret = -EINVAL; + goto err; + } + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_GET), + 0, 0); + + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) { + ret = -EMSGSIZE; + goto err_msg; + } + + stats = device->port_data ? 
device->port_data[port].hw_stats : NULL; + if (stats == NULL) { + ret = -EINVAL; + goto err_msg; + } + mutex_lock(&stats->lock); + + num_cnts = device->ops.get_hw_stats(device, stats, port, 0); + if (num_cnts < 0) { + ret = -EINVAL; + goto err_stats; + } + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_STAT_HWCOUNTERS); + if (!table_attr) { + ret = -EMSGSIZE; + goto err_stats; + } + for (i = 0; i < num_cnts; i++) { + v = stats->value[i] + + rdma_counter_get_hwstat_value(device, port, i); + if (fill_stat_hwcounter_entry(msg, stats->names[i], v)) { + ret = -EMSGSIZE; + goto err_table; + } + } + nla_nest_end(msg, table_attr); + + mutex_unlock(&stats->lock); + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_table: + nla_nest_cancel(msg, table_attr); +err_stats: + mutex_unlock(&stats->lock); +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static int stat_get_doit_qp(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, struct nlattr *tb[]) + +{ + static enum rdma_nl_counter_mode mode; + static enum rdma_nl_counter_mask mask; + struct ib_device *device; + struct sk_buff *msg; + u32 index, port; + int ret; + + if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) + return nldev_res_get_counter_doit(skb, nlh, extack); + + if (!tb[RDMA_NLDEV_ATTR_STAT_MODE] || + !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(sock_net(skb->sk), index); + if (!device) + return -EINVAL; + + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) { + ret = -ENOMEM; + goto err; + } + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, + RDMA_NLDEV_CMD_STAT_GET), + 0, 0); + + ret = 
rdma_counter_get_mode(device, port, &mode, &mask); + if (ret) + goto err_msg; + + if (fill_nldev_handle(msg, device) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port) || + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_MODE, mode)) + goto err_msg; + + if ((mode == RDMA_COUNTER_MODE_AUTO) && + nla_put_u32(msg, RDMA_NLDEV_ATTR_STAT_AUTO_MODE_MASK, mask)) + goto err_msg; + + nlmsg_end(msg, nlh); + ib_device_put(device); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_msg: + nlmsg_free(msg); +err: + ib_device_put(device); + return ret; +} + +static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret) + return -EINVAL; + + if (!tb[RDMA_NLDEV_ATTR_STAT_RES]) + return stat_get_doit_default_counter(skb, nlh, extack, tb); + + switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { + case RDMA_NLDEV_ATTR_RES_QP: + ret = stat_get_doit_qp(skb, nlh, extack, tb); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static int nldev_stat_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + int ret; + + ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) + return -EINVAL; + + switch (nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_RES])) { + case RDMA_NLDEV_ATTR_RES_QP: + ret = nldev_res_get_counter_dumpit(skb, cb); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, .dump = nldev_get_dumpit, }, + [RDMA_NLDEV_CMD_GET_CHARDEV] = { + .doit = nldev_get_chardev, + }, [RDMA_NLDEV_CMD_SET] = { .doit = nldev_set_doit, .flags = RDMA_NL_ADMIN_PERM, @@ -1449,6 +2074,17 @@ static const struct rdma_nl_cbs 
nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { }, [RDMA_NLDEV_CMD_SYS_SET] = { .doit = nldev_set_sys_set_doit, + }, + [RDMA_NLDEV_CMD_STAT_SET] = { + .doit = nldev_stat_set_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, + [RDMA_NLDEV_CMD_STAT_GET] = { + .doit = nldev_stat_get_doit, + .dump = nldev_stat_get_dumpit, + }, + [RDMA_NLDEV_CMD_STAT_DEL] = { + .doit = nldev_stat_del_doit, .flags = RDMA_NL_ADMIN_PERM, }, }; diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 3b5ff2f7b5f8..bddff426ee0f 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -6,6 +6,7 @@ #include <rdma/rdma_cm.h> #include <rdma/ib_verbs.h> #include <rdma/restrack.h> +#include <rdma/rdma_counter.h> #include <linux/mutex.h> #include <linux/sched/task.h> #include <linux/pid_namespace.h> @@ -45,6 +46,7 @@ static const char *type2str(enum rdma_restrack_type type) [RDMA_RESTRACK_CM_ID] = "CM_ID", [RDMA_RESTRACK_MR] = "MR", [RDMA_RESTRACK_CTX] = "CTX", + [RDMA_RESTRACK_COUNTER] = "COUNTER", }; return names[type]; @@ -169,6 +171,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) return container_of(res, struct ib_mr, res)->device; case RDMA_RESTRACK_CTX: return container_of(res, struct ib_ucontext, res)->device; + case RDMA_RESTRACK_COUNTER: + return container_of(res, struct rdma_counter, res)->device; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return NULL; @@ -190,6 +194,20 @@ void rdma_restrack_set_task(struct rdma_restrack_entry *res, } EXPORT_SYMBOL(rdma_restrack_set_task); +/** + * rdma_restrack_attach_task() - attach the task onto this resource + * @res: resource entry + * @task: the task to attach, the current task will be used if it is NULL. 
+ */ +void rdma_restrack_attach_task(struct rdma_restrack_entry *res, + struct task_struct *task) +{ + if (res->task) + put_task_struct(res->task); + get_task_struct(task); + res->task = task; +} + static void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); @@ -203,15 +221,22 @@ static void rdma_restrack_add(struct rdma_restrack_entry *res) kref_init(&res->kref); init_completion(&res->comp); - if (res->type != RDMA_RESTRACK_QP) - ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b, - &rt->next_id, GFP_KERNEL); - else { + if (res->type == RDMA_RESTRACK_QP) { /* Special case to ensure that LQPN points to right QP */ struct ib_qp *qp = container_of(res, struct ib_qp, res); ret = xa_insert(&rt->xa, qp->qp_num, res, GFP_KERNEL); res->id = ret ? 0 : qp->qp_num; + } else if (res->type == RDMA_RESTRACK_COUNTER) { + /* Special case to ensure that cntn points to right counter */ + struct rdma_counter *counter; + + counter = container_of(res, struct rdma_counter, res); + ret = xa_insert(&rt->xa, counter->id, res, GFP_KERNEL); + res->id = ret ? 0 : counter->id; + } else { + ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b, + &rt->next_id, GFP_KERNEL); } if (!ret) @@ -237,7 +262,8 @@ EXPORT_SYMBOL(rdma_restrack_kadd); */ void rdma_restrack_uadd(struct rdma_restrack_entry *res) { - if (res->type != RDMA_RESTRACK_CM_ID) + if ((res->type != RDMA_RESTRACK_CM_ID) && + (res->type != RDMA_RESTRACK_COUNTER)) res->task = NULL; if (!res->task) @@ -323,3 +349,16 @@ out: } } EXPORT_SYMBOL(rdma_restrack_del); + +bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res) +{ + /* + * 1. Kern resources should be visible in init + * namespace only + * 2. 
Present only resources visible in the current + * namespace + */ + if (rdma_is_kernel_res(res)) + return task_active_pid_ns(current) == &init_pid_ns; + return task_active_pid_ns(current) == task_active_pid_ns(res->task); +} diff --git a/drivers/infiniband/core/restrack.h b/drivers/infiniband/core/restrack.h index 09a1fbdf578e..7bd177cc0a61 100644 --- a/drivers/infiniband/core/restrack.h +++ b/drivers/infiniband/core/restrack.h @@ -25,4 +25,7 @@ struct rdma_restrack_root { int rdma_restrack_init(struct ib_device *dev); void rdma_restrack_clean(struct ib_device *dev); +void rdma_restrack_attach_task(struct rdma_restrack_entry *res, + struct task_struct *task); +bool rdma_is_visible_in_pid_ns(struct rdma_restrack_entry *res); #endif /* _RDMA_CORE_RESTRACK_H_ */ diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 32ca8429eaae..dce06108c8c3 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -51,10 +51,34 @@ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num, return false; } -static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) +static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev, + bool pi_support) { + u32 max_pages; + + if (pi_support) + max_pages = dev->attrs.max_pi_fast_reg_page_list_len; + else + max_pages = dev->attrs.max_fast_reg_page_list_len; + /* arbitrary limit to avoid allocating gigantic resources */ - return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256); + return min_t(u32, max_pages, 256); +} + +static inline int rdma_rw_inv_key(struct rdma_rw_reg_ctx *reg) +{ + int count = 0; + + if (reg->mr->need_inval) { + reg->inv_wr.opcode = IB_WR_LOCAL_INV; + reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey; + reg->inv_wr.next = &reg->reg_wr.wr; + count++; + } else { + reg->inv_wr.next = NULL; + } + + return count; } /* Caller must have zero-initialized *reg.
*/ @@ -62,7 +86,8 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, u32 sg_cnt, u32 offset) { - u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); u32 nents = min(sg_cnt, pages_per_mr); int count = 0, ret; @@ -70,14 +95,7 @@ static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, if (!reg->mr) return -EAGAIN; - if (reg->mr->need_inval) { - reg->inv_wr.opcode = IB_WR_LOCAL_INV; - reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey; - reg->inv_wr.next = &reg->reg_wr.wr; - count++; - } else { - reg->inv_wr.next = NULL; - } + count += rdma_rw_inv_key(reg); ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE); if (ret < 0 || ret < nents) { @@ -102,7 +120,8 @@ static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct rdma_rw_reg_ctx *prev = NULL; - u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); int i, j, ret = 0, count = 0; ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr; @@ -343,13 +362,14 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct ib_device *dev = qp->pd->device; - u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device, + qp->integrity_en); struct ib_rdma_wr *rdma_wr; - struct ib_send_wr *prev_wr = NULL; int count = 0, ret; if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) { - pr_err("SG count too large\n"); + pr_err("SG count too large: sg_cnt=%d, prot_sg_cnt=%d, pages_per_mr=%d\n", + sg_cnt, prot_sg_cnt, pages_per_mr); return -EINVAL; } @@ -358,75 +378,58 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, return -ENOMEM;
sg_cnt = ret; - ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir); - if (!ret) { - ret = -ENOMEM; - goto out_unmap_sg; + if (prot_sg_cnt) { + ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir); + if (!ret) { + ret = -ENOMEM; + goto out_unmap_sg; + } + prot_sg_cnt = ret; } - prot_sg_cnt = ret; ctx->type = RDMA_RW_SIG_MR; ctx->nr_ops = 1; - ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL); - if (!ctx->sig) { + ctx->reg = kcalloc(1, sizeof(*ctx->reg), GFP_KERNEL); + if (!ctx->reg) { ret = -ENOMEM; goto out_unmap_prot_sg; } - ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0); - if (ret < 0) - goto out_free_ctx; - count += ret; - prev_wr = &ctx->sig->data.reg_wr.wr; - - ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot, - prot_sg, prot_sg_cnt, 0); - if (ret < 0) - goto out_destroy_data_mr; - count += ret; - - if (ctx->sig->prot.inv_wr.next) - prev_wr->next = &ctx->sig->prot.inv_wr; - else - prev_wr->next = &ctx->sig->prot.reg_wr.wr; - prev_wr = &ctx->sig->prot.reg_wr.wr; - - ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs); - if (!ctx->sig->sig_mr) { + ctx->reg->mr = ib_mr_pool_get(qp, &qp->sig_mrs); + if (!ctx->reg->mr) { ret = -EAGAIN; - goto out_destroy_prot_mr; + goto out_free_ctx; } - if (ctx->sig->sig_mr->need_inval) { - memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr)); + count += rdma_rw_inv_key(ctx->reg); - ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV; - ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey; + memcpy(ctx->reg->mr->sig_attrs, sig_attrs, sizeof(struct ib_sig_attrs)); - prev_wr->next = &ctx->sig->sig_inv_wr; - prev_wr = &ctx->sig->sig_inv_wr; + ret = ib_map_mr_sg_pi(ctx->reg->mr, sg, sg_cnt, NULL, prot_sg, + prot_sg_cnt, NULL, SZ_4K); + if (unlikely(ret)) { + pr_err("failed to map PI sg (%d)\n", sg_cnt + prot_sg_cnt); + goto out_destroy_sig_mr; } - ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR; - ctx->sig->sig_wr.wr.wr_cqe = NULL; - ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge; - 
ctx->sig->sig_wr.wr.num_sge = 1; - ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE; - ctx->sig->sig_wr.sig_attrs = sig_attrs; - ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr; - if (prot_sg_cnt) - ctx->sig->sig_wr.prot = &ctx->sig->prot.sge; - prev_wr->next = &ctx->sig->sig_wr.wr; - prev_wr = &ctx->sig->sig_wr.wr; + ctx->reg->reg_wr.wr.opcode = IB_WR_REG_MR_INTEGRITY; + ctx->reg->reg_wr.wr.wr_cqe = NULL; + ctx->reg->reg_wr.wr.num_sge = 0; + ctx->reg->reg_wr.wr.send_flags = 0; + ctx->reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE; + if (rdma_protocol_iwarp(qp->device, port_num)) + ctx->reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE; + ctx->reg->reg_wr.mr = ctx->reg->mr; + ctx->reg->reg_wr.key = ctx->reg->mr->lkey; count++; - ctx->sig->sig_sge.addr = 0; - ctx->sig->sig_sge.length = ctx->sig->data.sge.length; - if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE) - ctx->sig->sig_sge.length += ctx->sig->prot.sge.length; + ctx->reg->sge.addr = ctx->reg->mr->iova; + ctx->reg->sge.length = ctx->reg->mr->length; + if (sig_attrs->wire.sig_type == IB_SIG_TYPE_NONE) + ctx->reg->sge.length -= ctx->reg->mr->sig_attrs->meta_length; - rdma_wr = &ctx->sig->data.wr; - rdma_wr->wr.sg_list = &ctx->sig->sig_sge; + rdma_wr = &ctx->reg->wr; + rdma_wr->wr.sg_list = &ctx->reg->sge; rdma_wr->wr.num_sge = 1; rdma_wr->remote_addr = remote_addr; rdma_wr->rkey = rkey; @@ -434,21 +437,18 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; else rdma_wr->wr.opcode = IB_WR_RDMA_READ; - prev_wr->next = &rdma_wr->wr; - prev_wr = &rdma_wr->wr; + ctx->reg->reg_wr.wr.next = &rdma_wr->wr; count++; return count; -out_destroy_prot_mr: - if (prot_sg_cnt) - ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); -out_destroy_data_mr: - ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); +out_destroy_sig_mr: + ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr); out_free_ctx: - kfree(ctx->sig); + kfree(ctx->reg); out_unmap_prot_sg: - ib_dma_unmap_sg(dev, 
prot_sg, prot_sg_cnt, dir); + if (prot_sg_cnt) + ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir); out_unmap_sg: ib_dma_unmap_sg(dev, sg, sg_cnt, dir); return ret; @@ -491,22 +491,8 @@ struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, switch (ctx->type) { case RDMA_RW_SIG_MR: - rdma_rw_update_lkey(&ctx->sig->data, true); - if (ctx->sig->prot.mr) - rdma_rw_update_lkey(&ctx->sig->prot, true); - - ctx->sig->sig_mr->need_inval = true; - ib_update_fast_reg_key(ctx->sig->sig_mr, - ib_inc_rkey(ctx->sig->sig_mr->lkey)); - ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey; - - if (ctx->sig->data.inv_wr.next) - first_wr = &ctx->sig->data.inv_wr; - else - first_wr = &ctx->sig->data.reg_wr.wr; - last_wr = &ctx->sig->data.wr.wr; - break; case RDMA_RW_MR: + /* fallthrough */ for (i = 0; i < ctx->nr_ops; i++) { rdma_rw_update_lkey(&ctx->reg[i], ctx->reg[i].wr.wr.opcode != @@ -605,7 +591,7 @@ EXPORT_SYMBOL(rdma_rw_ctx_destroy); /** * rdma_rw_ctx_destroy_signature - release all resources allocated by - * rdma_rw_ctx_init_signature + * rdma_rw_ctx_signature_init * @ctx: context to release * @qp: queue pair to operate on * @port_num: port num to which the connection is bound @@ -623,16 +609,12 @@ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR)) return; - ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); - ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); + ib_mr_pool_put(qp, &qp->sig_mrs, ctx->reg->mr); + kfree(ctx->reg); - if (ctx->sig->prot.mr) { - ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); + if (prot_sg_cnt) ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir); - } - - ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr); - kfree(ctx->sig); } EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); @@ -653,7 +635,7 @@ unsigned int rdma_rw_mr_factor(struct ib_device *device, u8 port_num, unsigned int mr_pages; if 
(rdma_rw_can_use_mr(device, port_num)) - mr_pages = rdma_rw_fr_page_list_len(device); + mr_pages = rdma_rw_fr_page_list_len(device, false); else mr_pages = device->attrs.max_sge_rd; return DIV_ROUND_UP(maxpages, mr_pages); @@ -679,9 +661,8 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) * we'll need two additional MRs for the registrations and the * invalidation. */ - if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) - factor += 6; /* (inv + reg) * (data + prot + sig) */ - else if (rdma_rw_can_use_mr(dev, attr->port_num)) + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN || + rdma_rw_can_use_mr(dev, attr->port_num)) factor += 2; /* inv + reg */ attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs; @@ -697,20 +678,22 @@ void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) { struct ib_device *dev = qp->pd->device; - u32 nr_mrs = 0, nr_sig_mrs = 0; + u32 nr_mrs = 0, nr_sig_mrs = 0, max_num_sg = 0; int ret = 0; - if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) { + if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) { nr_sig_mrs = attr->cap.max_rdma_ctxs; - nr_mrs = attr->cap.max_rdma_ctxs * 2; + nr_mrs = attr->cap.max_rdma_ctxs; + max_num_sg = rdma_rw_fr_page_list_len(dev, true); } else if (rdma_rw_can_use_mr(dev, attr->port_num)) { nr_mrs = attr->cap.max_rdma_ctxs; + max_num_sg = rdma_rw_fr_page_list_len(dev, false); } if (nr_mrs) { ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs, IB_MR_TYPE_MEM_REG, - rdma_rw_fr_page_list_len(dev)); + max_num_sg, 0); if (ret) { pr_err("%s: failed to allocated %d MRs\n", __func__, nr_mrs); @@ -720,10 +703,10 @@ int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) if (nr_sig_mrs) { ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs, - IB_MR_TYPE_SIGNATURE, 2); + IB_MR_TYPE_INTEGRITY, max_num_sg, max_num_sg); if (ret) { pr_err("%s: failed to allocated %d SIG MRs\n", - __func__, nr_mrs); + 
__func__, nr_sig_mrs); goto out_free_rdma_mrs; } } diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index c78d0c9646ae..b477295a96c2 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -43,6 +43,7 @@ #include <rdma/ib_mad.h> #include <rdma/ib_pma.h> #include <rdma/ib_cache.h> +#include <rdma/rdma_counter.h> struct ib_port; @@ -800,9 +801,12 @@ static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, return 0; } -static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf) +static ssize_t print_hw_stat(struct ib_device *dev, int port_num, + struct rdma_hw_stats *stats, int index, char *buf) { - return sprintf(buf, "%llu\n", stats->value[index]); + u64 v = rdma_counter_get_hwstat_value(dev, port_num, index); + + return sprintf(buf, "%llu\n", stats->value[index] + v); } static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, @@ -828,7 +832,7 @@ static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index); if (ret) goto unlock; - ret = print_hw_stat(stats, hsa->index, buf); + ret = print_hw_stat(dev, hsa->port_num, stats, hsa->index, buf); unlock: mutex_unlock(&stats->lock); @@ -999,6 +1003,8 @@ static void setup_hw_stats(struct ib_device *device, struct ib_port *port, goto err; port->hw_stats_ag = hsag; port->hw_stats = stats; + if (device->port_data) + device->port_data[port_num].hw_stats = stats; } else { struct kobject *kobj = &device->dev.kobj; ret = sysfs_create_group(kobj, hsag); @@ -1289,6 +1295,8 @@ const struct attribute_group ib_dev_attr_group = { void ib_free_port_attrs(struct ib_core_device *coredev) { + struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); + bool is_full_dev = &device->coredev == coredev; struct kobject *p, *t; list_for_each_entry_safe(p, t, &coredev->port_list, entry) { @@ -1298,6 +1306,8 @@ void ib_free_port_attrs(struct 
ib_core_device *coredev) if (port->hw_stats_ag) free_hsag(&port->kobj, port->hw_stats_ag); kfree(port->hw_stats); + if (device->port_data && is_full_dev) + device->port_data[port->port_num].hw_stats = NULL; if (port->pma_table) sysfs_remove_group(p, port->pma_table); diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c deleted file mode 100644 index 8e7da2d41fd8..000000000000 --- a/drivers/infiniband/core/ucm.c +++ /dev/null @@ -1,1350 +0,0 @@ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005 Intel Corporation. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include <linux/completion.h> -#include <linux/init.h> -#include <linux/fs.h> -#include <linux/module.h> -#include <linux/device.h> -#include <linux/err.h> -#include <linux/poll.h> -#include <linux/sched.h> -#include <linux/file.h> -#include <linux/mount.h> -#include <linux/cdev.h> -#include <linux/xarray.h> -#include <linux/mutex.h> -#include <linux/slab.h> - -#include <linux/nospec.h> - -#include <linux/uaccess.h> - -#include <rdma/ib.h> -#include <rdma/ib_cm.h> -#include <rdma/ib_user_cm.h> -#include <rdma/ib_marshall.h> - -#include "core_priv.h" - -MODULE_AUTHOR("Libor Michalek"); -MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access"); -MODULE_LICENSE("Dual BSD/GPL"); - -struct ib_ucm_device { - int devnum; - struct cdev cdev; - struct device dev; - struct ib_device *ib_dev; -}; - -struct ib_ucm_file { - struct mutex file_mutex; - struct file *filp; - struct ib_ucm_device *device; - - struct list_head ctxs; - struct list_head events; - wait_queue_head_t poll_wait; -}; - -struct ib_ucm_context { - int id; - struct completion comp; - atomic_t ref; - int events_reported; - - struct ib_ucm_file *file; - struct ib_cm_id *cm_id; - __u64 uid; - - struct list_head events; /* list of pending events. 
*/ - struct list_head file_list; /* member in file ctx list */ -}; - -struct ib_ucm_event { - struct ib_ucm_context *ctx; - struct list_head file_list; /* member in file event list */ - struct list_head ctx_list; /* member in ctx event list */ - - struct ib_cm_id *cm_id; - struct ib_ucm_event_resp resp; - void *data; - void *info; - int data_len; - int info_len; -}; - -enum { - IB_UCM_MAJOR = 231, - IB_UCM_BASE_MINOR = 224, - IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS, - IB_UCM_NUM_FIXED_MINOR = 32, - IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR, -}; - -#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) -static dev_t dynamic_ucm_dev; - -static void ib_ucm_add_one(struct ib_device *device); -static void ib_ucm_remove_one(struct ib_device *device, void *client_data); - -static struct ib_client ucm_client = { - .name = "ucm", - .add = ib_ucm_add_one, - .remove = ib_ucm_remove_one -}; - -static DEFINE_XARRAY_ALLOC(ctx_id_table); -static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES); - -static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id) -{ - struct ib_ucm_context *ctx; - - xa_lock(&ctx_id_table); - ctx = xa_load(&ctx_id_table, id); - if (!ctx) - ctx = ERR_PTR(-ENOENT); - else if (ctx->file != file) - ctx = ERR_PTR(-EINVAL); - else - atomic_inc(&ctx->ref); - xa_unlock(&ctx_id_table); - - return ctx; -} - -static void ib_ucm_ctx_put(struct ib_ucm_context *ctx) -{ - if (atomic_dec_and_test(&ctx->ref)) - complete(&ctx->comp); -} - -static inline int ib_ucm_new_cm_id(int event) -{ - return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED; -} - -static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx) -{ - struct ib_ucm_event *uevent; - - mutex_lock(&ctx->file->file_mutex); - list_del(&ctx->file_list); - while (!list_empty(&ctx->events)) { - - uevent = list_entry(ctx->events.next, - struct ib_ucm_event, ctx_list); - list_del(&uevent->file_list); - list_del(&uevent->ctx_list); - 
mutex_unlock(&ctx->file->file_mutex); - - /* clear incoming connections. */ - if (ib_ucm_new_cm_id(uevent->resp.event)) - ib_destroy_cm_id(uevent->cm_id); - - kfree(uevent); - mutex_lock(&ctx->file->file_mutex); - } - mutex_unlock(&ctx->file->file_mutex); -} - -static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file) -{ - struct ib_ucm_context *ctx; - - ctx = kzalloc(sizeof *ctx, GFP_KERNEL); - if (!ctx) - return NULL; - - atomic_set(&ctx->ref, 1); - init_completion(&ctx->comp); - ctx->file = file; - INIT_LIST_HEAD(&ctx->events); - - if (xa_alloc(&ctx_id_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL)) - goto error; - - list_add_tail(&ctx->file_list, &file->ctxs); - return ctx; - -error: - kfree(ctx); - return NULL; -} - -static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq, - const struct ib_cm_req_event_param *kreq) -{ - ureq->remote_ca_guid = kreq->remote_ca_guid; - ureq->remote_qkey = kreq->remote_qkey; - ureq->remote_qpn = kreq->remote_qpn; - ureq->qp_type = kreq->qp_type; - ureq->starting_psn = kreq->starting_psn; - ureq->responder_resources = kreq->responder_resources; - ureq->initiator_depth = kreq->initiator_depth; - ureq->local_cm_response_timeout = kreq->local_cm_response_timeout; - ureq->flow_control = kreq->flow_control; - ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout; - ureq->retry_count = kreq->retry_count; - ureq->rnr_retry_count = kreq->rnr_retry_count; - ureq->srq = kreq->srq; - ureq->port = kreq->port; - - ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path); - if (kreq->alternate_path) - ib_copy_path_rec_to_user(&ureq->alternate_path, - kreq->alternate_path); -} - -static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep, - const struct ib_cm_rep_event_param *krep) -{ - urep->remote_ca_guid = krep->remote_ca_guid; - urep->remote_qkey = krep->remote_qkey; - urep->remote_qpn = krep->remote_qpn; - urep->starting_psn = krep->starting_psn; - urep->responder_resources = 
krep->responder_resources; - urep->initiator_depth = krep->initiator_depth; - urep->target_ack_delay = krep->target_ack_delay; - urep->failover_accepted = krep->failover_accepted; - urep->flow_control = krep->flow_control; - urep->rnr_retry_count = krep->rnr_retry_count; - urep->srq = krep->srq; -} - -static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep, - const struct ib_cm_sidr_rep_event_param *krep) -{ - urep->status = krep->status; - urep->qkey = krep->qkey; - urep->qpn = krep->qpn; -}; - -static int ib_ucm_event_process(const struct ib_cm_event *evt, - struct ib_ucm_event *uvt) -{ - void *info = NULL; - - switch (evt->event) { - case IB_CM_REQ_RECEIVED: - ib_ucm_event_req_get(&uvt->resp.u.req_resp, - &evt->param.req_rcvd); - uvt->data_len = IB_CM_REQ_PRIVATE_DATA_SIZE; - uvt->resp.present = IB_UCM_PRES_PRIMARY; - uvt->resp.present |= (evt->param.req_rcvd.alternate_path ? - IB_UCM_PRES_ALTERNATE : 0); - break; - case IB_CM_REP_RECEIVED: - ib_ucm_event_rep_get(&uvt->resp.u.rep_resp, - &evt->param.rep_rcvd); - uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE; - break; - case IB_CM_RTU_RECEIVED: - uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE; - uvt->resp.u.send_status = evt->param.send_status; - break; - case IB_CM_DREQ_RECEIVED: - uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE; - uvt->resp.u.send_status = evt->param.send_status; - break; - case IB_CM_DREP_RECEIVED: - uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE; - uvt->resp.u.send_status = evt->param.send_status; - break; - case IB_CM_MRA_RECEIVED: - uvt->resp.u.mra_resp.timeout = - evt->param.mra_rcvd.service_timeout; - uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE; - break; - case IB_CM_REJ_RECEIVED: - uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason; - uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE; - uvt->info_len = evt->param.rej_rcvd.ari_length; - info = evt->param.rej_rcvd.ari; - break; - case IB_CM_LAP_RECEIVED: - ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path, - 
evt->param.lap_rcvd.alternate_path); - uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE; - uvt->resp.present = IB_UCM_PRES_ALTERNATE; - break; - case IB_CM_APR_RECEIVED: - uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status; - uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE; - uvt->info_len = evt->param.apr_rcvd.info_len; - info = evt->param.apr_rcvd.apr_info; - break; - case IB_CM_SIDR_REQ_RECEIVED: - uvt->resp.u.sidr_req_resp.pkey = - evt->param.sidr_req_rcvd.pkey; - uvt->resp.u.sidr_req_resp.port = - evt->param.sidr_req_rcvd.port; - uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE; - break; - case IB_CM_SIDR_REP_RECEIVED: - ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp, - &evt->param.sidr_rep_rcvd); - uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE; - uvt->info_len = evt->param.sidr_rep_rcvd.info_len; - info = evt->param.sidr_rep_rcvd.info; - break; - default: - uvt->resp.u.send_status = evt->param.send_status; - break; - } - - if (uvt->data_len) { - uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL); - if (!uvt->data) - goto err1; - - uvt->resp.present |= IB_UCM_PRES_DATA; - } - - if (uvt->info_len) { - uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL); - if (!uvt->info) - goto err2; - - uvt->resp.present |= IB_UCM_PRES_INFO; - } - return 0; - -err2: - kfree(uvt->data); -err1: - return -ENOMEM; -} - -static int ib_ucm_event_handler(struct ib_cm_id *cm_id, - const struct ib_cm_event *event) -{ - struct ib_ucm_event *uevent; - struct ib_ucm_context *ctx; - int result = 0; - - ctx = cm_id->context; - - uevent = kzalloc(sizeof *uevent, GFP_KERNEL); - if (!uevent) - goto err1; - - uevent->ctx = ctx; - uevent->cm_id = cm_id; - uevent->resp.uid = ctx->uid; - uevent->resp.id = ctx->id; - uevent->resp.event = event->event; - - result = ib_ucm_event_process(event, uevent); - if (result) - goto err2; - - mutex_lock(&ctx->file->file_mutex); - list_add_tail(&uevent->file_list, &ctx->file->events); - list_add_tail(&uevent->ctx_list, &ctx->events); - 
wake_up_interruptible(&ctx->file->poll_wait); - mutex_unlock(&ctx->file->file_mutex); - return 0; - -err2: - kfree(uevent); -err1: - /* Destroy new cm_id's */ - return ib_ucm_new_cm_id(event->event); -} - -static ssize_t ib_ucm_event(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_context *ctx; - struct ib_ucm_event_get cmd; - struct ib_ucm_event *uevent; - int result = 0; - - if (out_len < sizeof(struct ib_ucm_event_resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - mutex_lock(&file->file_mutex); - while (list_empty(&file->events)) { - mutex_unlock(&file->file_mutex); - - if (file->filp->f_flags & O_NONBLOCK) - return -EAGAIN; - - if (wait_event_interruptible(file->poll_wait, - !list_empty(&file->events))) - return -ERESTARTSYS; - - mutex_lock(&file->file_mutex); - } - - uevent = list_entry(file->events.next, struct ib_ucm_event, file_list); - - if (ib_ucm_new_cm_id(uevent->resp.event)) { - ctx = ib_ucm_ctx_alloc(file); - if (!ctx) { - result = -ENOMEM; - goto done; - } - - ctx->cm_id = uevent->cm_id; - ctx->cm_id->context = ctx; - uevent->resp.id = ctx->id; - } - - if (copy_to_user(u64_to_user_ptr(cmd.response), - &uevent->resp, sizeof(uevent->resp))) { - result = -EFAULT; - goto done; - } - - if (uevent->data) { - if (cmd.data_len < uevent->data_len) { - result = -ENOMEM; - goto done; - } - if (copy_to_user(u64_to_user_ptr(cmd.data), - uevent->data, uevent->data_len)) { - result = -EFAULT; - goto done; - } - } - - if (uevent->info) { - if (cmd.info_len < uevent->info_len) { - result = -ENOMEM; - goto done; - } - if (copy_to_user(u64_to_user_ptr(cmd.info), - uevent->info, uevent->info_len)) { - result = -EFAULT; - goto done; - } - } - - list_del(&uevent->file_list); - list_del(&uevent->ctx_list); - uevent->ctx->events_reported++; - - kfree(uevent->data); - kfree(uevent->info); - kfree(uevent); -done: - mutex_unlock(&file->file_mutex); - return result; -} - -static 
ssize_t ib_ucm_create_id(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_create_id cmd; - struct ib_ucm_create_id_resp resp; - struct ib_ucm_context *ctx; - int result; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - mutex_lock(&file->file_mutex); - ctx = ib_ucm_ctx_alloc(file); - mutex_unlock(&file->file_mutex); - if (!ctx) - return -ENOMEM; - - ctx->uid = cmd.uid; - ctx->cm_id = ib_create_cm_id(file->device->ib_dev, - ib_ucm_event_handler, ctx); - if (IS_ERR(ctx->cm_id)) { - result = PTR_ERR(ctx->cm_id); - goto err1; - } - - resp.id = ctx->id; - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) { - result = -EFAULT; - goto err2; - } - return 0; - -err2: - ib_destroy_cm_id(ctx->cm_id); -err1: - xa_erase(&ctx_id_table, ctx->id); - kfree(ctx); - return result; -} - -static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_destroy_id cmd; - struct ib_ucm_destroy_id_resp resp; - struct ib_ucm_context *ctx; - int result = 0; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - xa_lock(&ctx_id_table); - ctx = xa_load(&ctx_id_table, cmd.id); - if (!ctx) - ctx = ERR_PTR(-ENOENT); - else if (ctx->file != file) - ctx = ERR_PTR(-EINVAL); - else - __xa_erase(&ctx_id_table, ctx->id); - xa_unlock(&ctx_id_table); - - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ib_ucm_ctx_put(ctx); - wait_for_completion(&ctx->comp); - - /* No new events will be generated after destroying the cm_id. */ - ib_destroy_cm_id(ctx->cm_id); - /* Cleanup events not yet reported to the user. 
*/ - ib_ucm_cleanup_events(ctx); - - resp.events_reported = ctx->events_reported; - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) - result = -EFAULT; - - kfree(ctx); - return result; -} - -static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_attr_id_resp resp; - struct ib_ucm_attr_id cmd; - struct ib_ucm_context *ctx; - int result = 0; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - resp.service_id = ctx->cm_id->service_id; - resp.service_mask = ctx->cm_id->service_mask; - resp.local_id = ctx->cm_id->local_id; - resp.remote_id = ctx->cm_id->remote_id; - - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) - result = -EFAULT; - - ib_ucm_ctx_put(ctx); - return result; -} - -static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_uverbs_qp_attr resp; - struct ib_ucm_init_qp_attr cmd; - struct ib_ucm_context *ctx; - struct ib_qp_attr qp_attr; - int result = 0; - - if (out_len < sizeof(resp)) - return -ENOSPC; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - resp.qp_attr_mask = 0; - memset(&qp_attr, 0, sizeof qp_attr); - qp_attr.qp_state = cmd.qp_state; - result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); - if (result) - goto out; - - ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); - - if (copy_to_user(u64_to_user_ptr(cmd.response), - &resp, sizeof(resp))) - result = -EFAULT; - -out: - ib_ucm_ctx_put(ctx); - return result; -} - -static int ucm_validate_listen(__be64 service_id, __be64 service_mask) -{ - service_id &= service_mask; - - if (((service_id & IB_CMA_SERVICE_ID_MASK) == 
IB_CMA_SERVICE_ID) || - ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID)) - return -EINVAL; - - return 0; -} - -static ssize_t ib_ucm_listen(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_listen cmd; - struct ib_ucm_context *ctx; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - result = ucm_validate_listen(cmd.service_id, cmd.service_mask); - if (result) - goto out; - - result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask); -out: - ib_ucm_ctx_put(ctx); - return result; -} - -static ssize_t ib_ucm_notify(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_notify cmd; - struct ib_ucm_context *ctx; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event); - ib_ucm_ctx_put(ctx); - return result; -} - -static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len) -{ - void *data; - - *dest = NULL; - - if (!len) - return 0; - - data = memdup_user(u64_to_user_ptr(src), len); - if (IS_ERR(data)) - return PTR_ERR(data); - - *dest = data; - return 0; -} - -static int ib_ucm_path_get(struct sa_path_rec **path, u64 src) -{ - struct ib_user_path_rec upath; - struct sa_path_rec *sa_path; - - *path = NULL; - - if (!src) - return 0; - - sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL); - if (!sa_path) - return -ENOMEM; - - if (copy_from_user(&upath, u64_to_user_ptr(src), - sizeof(upath))) { - - kfree(sa_path); - return -EFAULT; - } - - ib_copy_path_rec_from_user(sa_path, &upath); - *path = sa_path; - return 0; -} - -static ssize_t ib_ucm_send_req(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_cm_req_param param; - 
struct ib_ucm_context *ctx; - struct ib_ucm_req cmd; - int result; - - param.private_data = NULL; - param.primary_path = NULL; - param.alternate_path = NULL; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); - if (result) - goto done; - - result = ib_ucm_path_get(¶m.primary_path, cmd.primary_path); - if (result) - goto done; - - result = ib_ucm_path_get(¶m.alternate_path, cmd.alternate_path); - if (result) - goto done; - - param.private_data_len = cmd.len; - param.service_id = cmd.sid; - param.qp_num = cmd.qpn; - param.qp_type = cmd.qp_type; - param.starting_psn = cmd.psn; - param.peer_to_peer = cmd.peer_to_peer; - param.responder_resources = cmd.responder_resources; - param.initiator_depth = cmd.initiator_depth; - param.remote_cm_response_timeout = cmd.remote_cm_response_timeout; - param.flow_control = cmd.flow_control; - param.local_cm_response_timeout = cmd.local_cm_response_timeout; - param.retry_count = cmd.retry_count; - param.rnr_retry_count = cmd.rnr_retry_count; - param.max_cm_retries = cmd.max_cm_retries; - param.srq = cmd.srq; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_req(ctx->cm_id, ¶m); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(param.private_data); - kfree(param.primary_path); - kfree(param.alternate_path); - return result; -} - -static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_cm_rep_param param; - struct ib_ucm_context *ctx; - struct ib_ucm_rep cmd; - int result; - - param.private_data = NULL; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); - if (result) - return result; - - param.qp_num = cmd.qpn; - param.starting_psn = cmd.psn; - param.private_data_len = cmd.len; - param.responder_resources = cmd.responder_resources; - 
param.initiator_depth = cmd.initiator_depth; - param.failover_accepted = cmd.failover_accepted; - param.flow_control = cmd.flow_control; - param.rnr_retry_count = cmd.rnr_retry_count; - param.srq = cmd.srq; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - ctx->uid = cmd.uid; - result = ib_send_cm_rep(ctx->cm_id, ¶m); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - - kfree(param.private_data); - return result; -} - -static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file, - const char __user *inbuf, int in_len, - int (*func)(struct ib_cm_id *cm_id, - const void *private_data, - u8 private_data_len)) -{ - struct ib_ucm_private_data cmd; - struct ib_ucm_context *ctx; - const void *private_data = NULL; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len); - if (result) - return result; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = func(ctx->cm_id, private_data, cmd.len); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - - kfree(private_data); - return result; -} - -static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu); -} - -static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq); -} - -static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep); -} - -static ssize_t ib_ucm_send_info(struct ib_ucm_file *file, - const char __user *inbuf, int in_len, - int (*func)(struct ib_cm_id *cm_id, - int status, - const void *info, - u8 info_len, - const void *data, - u8 data_len)) -{ - struct ib_ucm_context *ctx; - struct ib_ucm_info 
cmd; - const void *data = NULL; - const void *info = NULL; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len); - if (result) - goto done; - - result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len); - if (result) - goto done; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = func(ctx->cm_id, cmd.status, info, cmd.info_len, - data, cmd.data_len); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(data); - kfree(info); - return result; -} - -static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej); -} - -static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr); -} - -static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_context *ctx; - struct ib_ucm_mra cmd; - const void *data = NULL; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(&data, cmd.data, cmd.len); - if (result) - return result; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - - kfree(data); - return result; -} - -static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_ucm_context *ctx; - struct sa_path_rec *path = NULL; - struct ib_ucm_lap cmd; - const void *data = NULL; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(&data, cmd.data, cmd.len); - if (result) - goto done; - - result = ib_ucm_path_get(&path, 
cmd.path); - if (result) - goto done; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(data); - kfree(path); - return result; -} - -static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_cm_sidr_req_param param = {}; - struct ib_ucm_context *ctx; - struct ib_ucm_sidr_req cmd; - int result; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(¶m.private_data, cmd.data, cmd.len); - if (result) - goto done; - - result = ib_ucm_path_get(¶m.path, cmd.path); - if (result) - goto done; - - param.private_data_len = cmd.len; - param.service_id = cmd.sid; - param.timeout_ms = cmd.timeout; - param.max_cm_retries = cmd.max_cm_retries; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_sidr_req(ctx->cm_id, ¶m); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - -done: - kfree(param.private_data); - kfree(param.path); - return result; -} - -static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) -{ - struct ib_cm_sidr_rep_param param; - struct ib_ucm_sidr_rep cmd; - struct ib_ucm_context *ctx; - int result; - - param.info = NULL; - - if (copy_from_user(&cmd, inbuf, sizeof(cmd))) - return -EFAULT; - - result = ib_ucm_alloc_data(¶m.private_data, - cmd.data, cmd.data_len); - if (result) - goto done; - - result = ib_ucm_alloc_data(¶m.info, cmd.info, cmd.info_len); - if (result) - goto done; - - param.qp_num = cmd.qpn; - param.qkey = cmd.qkey; - param.status = cmd.status; - param.info_length = cmd.info_len; - param.private_data_len = cmd.data_len; - - ctx = ib_ucm_ctx_get(file, cmd.id); - if (!IS_ERR(ctx)) { - result = ib_send_cm_sidr_rep(ctx->cm_id, ¶m); - ib_ucm_ctx_put(ctx); - } else - result = PTR_ERR(ctx); - 
-done: - kfree(param.private_data); - kfree(param.info); - return result; -} - -static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file, - const char __user *inbuf, - int in_len, int out_len) = { - [IB_USER_CM_CMD_CREATE_ID] = ib_ucm_create_id, - [IB_USER_CM_CMD_DESTROY_ID] = ib_ucm_destroy_id, - [IB_USER_CM_CMD_ATTR_ID] = ib_ucm_attr_id, - [IB_USER_CM_CMD_LISTEN] = ib_ucm_listen, - [IB_USER_CM_CMD_NOTIFY] = ib_ucm_notify, - [IB_USER_CM_CMD_SEND_REQ] = ib_ucm_send_req, - [IB_USER_CM_CMD_SEND_REP] = ib_ucm_send_rep, - [IB_USER_CM_CMD_SEND_RTU] = ib_ucm_send_rtu, - [IB_USER_CM_CMD_SEND_DREQ] = ib_ucm_send_dreq, - [IB_USER_CM_CMD_SEND_DREP] = ib_ucm_send_drep, - [IB_USER_CM_CMD_SEND_REJ] = ib_ucm_send_rej, - [IB_USER_CM_CMD_SEND_MRA] = ib_ucm_send_mra, - [IB_USER_CM_CMD_SEND_LAP] = ib_ucm_send_lap, - [IB_USER_CM_CMD_SEND_APR] = ib_ucm_send_apr, - [IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req, - [IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep, - [IB_USER_CM_CMD_EVENT] = ib_ucm_event, - [IB_USER_CM_CMD_INIT_QP_ATTR] = ib_ucm_init_qp_attr, -}; - -static ssize_t ib_ucm_write(struct file *filp, const char __user *buf, - size_t len, loff_t *pos) -{ - struct ib_ucm_file *file = filp->private_data; - struct ib_ucm_cmd_hdr hdr; - ssize_t result; - - if (!ib_safe_file_access(filp)) { - pr_err_once("ucm_write: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", - task_tgid_vnr(current), current->comm); - return -EACCES; - } - - if (len < sizeof(hdr)) - return -EINVAL; - - if (copy_from_user(&hdr, buf, sizeof(hdr))) - return -EFAULT; - - if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table)) - return -EINVAL; - hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucm_cmd_table)); - - if (hdr.in + sizeof(hdr) > len) - return -EINVAL; - - result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr), - hdr.in, hdr.out); - if (!result) - result = len; - - return result; -} - -static __poll_t ib_ucm_poll(struct file *filp, - struct 
poll_table_struct *wait) -{ - struct ib_ucm_file *file = filp->private_data; - __poll_t mask = 0; - - poll_wait(filp, &file->poll_wait, wait); - - if (!list_empty(&file->events)) - mask = EPOLLIN | EPOLLRDNORM; - - return mask; -} - -/* - * ib_ucm_open() does not need the BKL: - * - * - no global state is referred to; - * - there is no ioctl method to race against; - * - no further module initialization is required for open to work - * after the device is registered. - */ -static int ib_ucm_open(struct inode *inode, struct file *filp) -{ - struct ib_ucm_file *file; - - file = kmalloc(sizeof(*file), GFP_KERNEL); - if (!file) - return -ENOMEM; - - INIT_LIST_HEAD(&file->events); - INIT_LIST_HEAD(&file->ctxs); - init_waitqueue_head(&file->poll_wait); - - mutex_init(&file->file_mutex); - - filp->private_data = file; - file->filp = filp; - file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev); - - return stream_open(inode, filp); -} - -static int ib_ucm_close(struct inode *inode, struct file *filp) -{ - struct ib_ucm_file *file = filp->private_data; - struct ib_ucm_context *ctx; - - mutex_lock(&file->file_mutex); - while (!list_empty(&file->ctxs)) { - ctx = list_entry(file->ctxs.next, - struct ib_ucm_context, file_list); - mutex_unlock(&file->file_mutex); - - xa_erase(&ctx_id_table, ctx->id); - ib_destroy_cm_id(ctx->cm_id); - ib_ucm_cleanup_events(ctx); - kfree(ctx); - - mutex_lock(&file->file_mutex); - } - mutex_unlock(&file->file_mutex); - kfree(file); - return 0; -} - -static void ib_ucm_release_dev(struct device *dev) -{ - struct ib_ucm_device *ucm_dev; - - ucm_dev = container_of(dev, struct ib_ucm_device, dev); - kfree(ucm_dev); -} - -static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev) -{ - clear_bit(ucm_dev->devnum, dev_map); -} - -static const struct file_operations ucm_fops = { - .owner = THIS_MODULE, - .open = ib_ucm_open, - .release = ib_ucm_close, - .write = ib_ucm_write, - .poll = ib_ucm_poll, - .llseek = no_llseek, -}; - -static 
ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct ib_ucm_device *ucm_dev; - - ucm_dev = container_of(dev, struct ib_ucm_device, dev); - return sprintf(buf, "%s\n", ucm_dev->ib_dev->name); -} -static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); - -static void ib_ucm_add_one(struct ib_device *device) -{ - int devnum; - dev_t base; - struct ib_ucm_device *ucm_dev; - - if (!device->ops.alloc_ucontext || !rdma_cap_ib_cm(device, 1)) - return; - - ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL); - if (!ucm_dev) - return; - - device_initialize(&ucm_dev->dev); - ucm_dev->ib_dev = device; - ucm_dev->dev.release = ib_ucm_release_dev; - - devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); - if (devnum >= IB_UCM_MAX_DEVICES) - goto err; - ucm_dev->devnum = devnum; - set_bit(devnum, dev_map); - if (devnum >= IB_UCM_NUM_FIXED_MINOR) - base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR; - else - base = IB_UCM_BASE_DEV + devnum; - - cdev_init(&ucm_dev->cdev, &ucm_fops); - ucm_dev->cdev.owner = THIS_MODULE; - kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum); - - ucm_dev->dev.class = &cm_class; - ucm_dev->dev.parent = device->dev.parent; - ucm_dev->dev.devt = base; - - dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum); - if (cdev_device_add(&ucm_dev->cdev, &ucm_dev->dev)) - goto err_devnum; - - if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev)) - goto err_dev; - - ib_set_client_data(device, &ucm_client, ucm_dev); - return; - -err_dev: - cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev); -err_devnum: - ib_ucm_free_dev(ucm_dev); -err: - put_device(&ucm_dev->dev); - return; -} - -static void ib_ucm_remove_one(struct ib_device *device, void *client_data) -{ - struct ib_ucm_device *ucm_dev = client_data; - - if (!ucm_dev) - return; - - cdev_device_del(&ucm_dev->cdev, &ucm_dev->dev); - ib_ucm_free_dev(ucm_dev); - put_device(&ucm_dev->dev); -} - -static CLASS_ATTR_STRING(abi_version, S_IRUGO, - 
__stringify(IB_USER_CM_ABI_VERSION)); - -static int __init ib_ucm_init(void) -{ - int ret; - - ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR, - "infiniband_cm"); - if (ret) { - pr_err("ucm: couldn't register device number\n"); - goto error1; - } - - ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR, - "infiniband_cm"); - if (ret) { - pr_err("ucm: couldn't register dynamic device number\n"); - goto err_alloc; - } - - ret = class_create_file(&cm_class, &class_attr_abi_version.attr); - if (ret) { - pr_err("ucm: couldn't create abi_version attribute\n"); - goto error2; - } - - ret = ib_register_client(&ucm_client); - if (ret) { - pr_err("ucm: couldn't register client\n"); - goto error3; - } - return 0; - -error3: - class_remove_file(&cm_class, &class_attr_abi_version.attr); -error2: - unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); -err_alloc: - unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); -error1: - return ret; -} - -static void __exit ib_ucm_cleanup(void) -{ - ib_unregister_client(&ucm_client); - class_remove_file(&cm_class, &class_attr_abi_version.attr); - unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); - unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); - WARN_ON(!xa_empty(&ctx_id_table)); -} - -module_init(ib_ucm_init); -module_exit(ib_ucm_cleanup); diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 140a338a135f..0274e9b704be 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -52,6 +52,8 @@ #include <rdma/rdma_cm_ib.h> #include <rdma/ib_addr.h> #include <rdma/ib.h> +#include <rdma/rdma_netlink.h> +#include "core_priv.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); @@ -81,7 +83,7 @@ struct ucma_file { }; struct ucma_context { - int id; + u32 id; struct completion comp; atomic_t ref; int events_reported; @@ -94,7 +96,7 @@ struct 
ucma_context { struct list_head list; struct list_head mc_list; /* mark that device is in process of destroying the internal HW - * resources, protected by the global mut + * resources, protected by the ctx_table lock */ int closing; /* sync between removal event and id destroy, protected by file mut */ @@ -104,7 +106,7 @@ struct ucma_context { struct ucma_multicast { struct ucma_context *ctx; - int id; + u32 id; int events_reported; u64 uid; @@ -122,9 +124,8 @@ struct ucma_event { struct work_struct close_work; }; -static DEFINE_MUTEX(mut); -static DEFINE_IDR(ctx_idr); -static DEFINE_IDR(multicast_idr); +static DEFINE_XARRAY_ALLOC(ctx_table); +static DEFINE_XARRAY_ALLOC(multicast_table); static const struct file_operations ucma_fops; @@ -133,7 +134,7 @@ static inline struct ucma_context *_ucma_find_context(int id, { struct ucma_context *ctx; - ctx = idr_find(&ctx_idr, id); + ctx = xa_load(&ctx_table, id); if (!ctx) ctx = ERR_PTR(-ENOENT); else if (ctx->file != file || !ctx->cm_id) @@ -145,7 +146,7 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) { struct ucma_context *ctx; - mutex_lock(&mut); + xa_lock(&ctx_table); ctx = _ucma_find_context(id, file); if (!IS_ERR(ctx)) { if (ctx->closing) @@ -153,7 +154,7 @@ static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) else atomic_inc(&ctx->ref); } - mutex_unlock(&mut); + xa_unlock(&ctx_table); return ctx; } @@ -216,10 +217,7 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) INIT_LIST_HEAD(&ctx->mc_list); ctx->file = file; - mutex_lock(&mut); - ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL); - mutex_unlock(&mut); - if (ctx->id < 0) + if (xa_alloc(&ctx_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL)) goto error; list_add_tail(&ctx->list, &file->ctx_list); @@ -238,13 +236,10 @@ static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx) if (!mc) return NULL; - mutex_lock(&mut); - mc->id = idr_alloc(&multicast_idr, NULL, 0, 0, 
GFP_KERNEL); - mutex_unlock(&mut); - if (mc->id < 0) + mc->ctx = ctx; + if (xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL)) goto error; - mc->ctx = ctx; list_add_tail(&mc->list, &ctx->mc_list); return mc; @@ -319,9 +314,9 @@ static void ucma_removal_event_handler(struct rdma_cm_id *cm_id) * handled separately below. */ if (ctx->cm_id == cm_id) { - mutex_lock(&mut); + xa_lock(&ctx_table); ctx->closing = 1; - mutex_unlock(&mut); + xa_unlock(&ctx_table); queue_work(ctx->file->close_wq, &ctx->close_work); return; } @@ -523,9 +518,7 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, err2: rdma_destroy_id(cm_id); err1: - mutex_lock(&mut); - idr_remove(&ctx_idr, ctx->id); - mutex_unlock(&mut); + xa_erase(&ctx_table, ctx->id); mutex_lock(&file->mut); list_del(&ctx->list); mutex_unlock(&file->mut); @@ -537,13 +530,13 @@ static void ucma_cleanup_multicast(struct ucma_context *ctx) { struct ucma_multicast *mc, *tmp; - mutex_lock(&mut); + mutex_lock(&ctx->file->mut); list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { list_del(&mc->list); - idr_remove(&multicast_idr, mc->id); + xa_erase(&multicast_table, mc->id); kfree(mc); } - mutex_unlock(&mut); + mutex_unlock(&ctx->file->mut); } static void ucma_cleanup_mc_events(struct ucma_multicast *mc) @@ -614,11 +607,11 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - mutex_lock(&mut); + xa_lock(&ctx_table); ctx = _ucma_find_context(cmd.id, file); if (!IS_ERR(ctx)) - idr_remove(&ctx_idr, ctx->id); - mutex_unlock(&mut); + __xa_erase(&ctx_table, ctx->id); + xa_unlock(&ctx_table); if (IS_ERR(ctx)) return PTR_ERR(ctx); @@ -630,14 +623,14 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, flush_workqueue(ctx->file->close_wq); /* At this point it's guaranteed that there is no inflight * closing task */ - mutex_lock(&mut); + xa_lock(&ctx_table); if 
(!ctx->closing) { - mutex_unlock(&mut); + xa_unlock(&ctx_table); ucma_put_ctx(ctx); wait_for_completion(&ctx->comp); rdma_destroy_id(ctx->cm_id); } else { - mutex_unlock(&mut); + xa_unlock(&ctx_table); } resp.events_reported = ucma_free_ctx(ctx); @@ -951,8 +944,7 @@ static ssize_t ucma_query_path(struct ucma_context *ctx, } } - if (copy_to_user(response, resp, - sizeof(*resp) + (i * sizeof(struct ib_path_rec_data)))) + if (copy_to_user(response, resp, struct_size(resp, path_data, i))) ret = -EFAULT; kfree(resp); @@ -1432,9 +1424,7 @@ static ssize_t ucma_process_join(struct ucma_file *file, goto err3; } - mutex_lock(&mut); - idr_replace(&multicast_idr, mc, mc->id); - mutex_unlock(&mut); + xa_store(&multicast_table, mc->id, mc, 0); mutex_unlock(&file->mut); ucma_put_ctx(ctx); @@ -1444,9 +1434,7 @@ err3: rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); ucma_cleanup_mc_events(mc); err2: - mutex_lock(&mut); - idr_remove(&multicast_idr, mc->id); - mutex_unlock(&mut); + xa_erase(&multicast_table, mc->id); list_del(&mc->list); kfree(mc); err1: @@ -1508,8 +1496,8 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; - mutex_lock(&mut); - mc = idr_find(&multicast_idr, cmd.id); + xa_lock(&multicast_table); + mc = xa_load(&multicast_table, cmd.id); if (!mc) mc = ERR_PTR(-ENOENT); else if (mc->ctx->file != file) @@ -1517,8 +1505,8 @@ static ssize_t ucma_leave_multicast(struct ucma_file *file, else if (!atomic_inc_not_zero(&mc->ctx->ref)) mc = ERR_PTR(-ENXIO); else - idr_remove(&multicast_idr, mc->id); - mutex_unlock(&mut); + __xa_erase(&multicast_table, mc->id); + xa_unlock(&multicast_table); if (IS_ERR(mc)) { ret = PTR_ERR(mc); @@ -1615,14 +1603,14 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file, * events being added before existing events. 
*/ ucma_lock_files(cur_file, new_file); - mutex_lock(&mut); + xa_lock(&ctx_table); list_move_tail(&ctx->list, &new_file->ctx_list); ucma_move_events(ctx, new_file); ctx->file = new_file; resp.events_reported = ctx->events_reported; - mutex_unlock(&mut); + xa_unlock(&ctx_table); ucma_unlock_files(cur_file, new_file); response: @@ -1757,18 +1745,15 @@ static int ucma_close(struct inode *inode, struct file *filp) ctx->destroying = 1; mutex_unlock(&file->mut); - mutex_lock(&mut); - idr_remove(&ctx_idr, ctx->id); - mutex_unlock(&mut); - + xa_erase(&ctx_table, ctx->id); flush_workqueue(file->close_wq); /* At that step once ctx was marked as destroying and workqueue * was flushed we are safe from any inflights handlers that * might put other closing task. */ - mutex_lock(&mut); + xa_lock(&ctx_table); if (!ctx->closing) { - mutex_unlock(&mut); + xa_unlock(&ctx_table); ucma_put_ctx(ctx); wait_for_completion(&ctx->comp); /* rdma_destroy_id ensures that no event handlers are @@ -1776,7 +1761,7 @@ static int ucma_close(struct inode *inode, struct file *filp) */ rdma_destroy_id(ctx->cm_id); } else { - mutex_unlock(&mut); + xa_unlock(&ctx_table); } ucma_free_ctx(ctx); @@ -1805,6 +1790,19 @@ static struct miscdevice ucma_misc = { .fops = &ucma_fops, }; +static int ucma_get_global_nl_info(struct ib_client_nl_info *res) +{ + res->abi = RDMA_USER_CM_ABI_VERSION; + res->cdev = ucma_misc.this_device; + return 0; +} + +static struct ib_client rdma_cma_client = { + .name = "rdma_cm", + .get_global_nl_info = ucma_get_global_nl_info, +}; +MODULE_ALIAS_RDMA_CLIENT("rdma_cm"); + static ssize_t show_abi_version(struct device *dev, struct device_attribute *attr, char *buf) @@ -1833,7 +1831,14 @@ static int __init ucma_init(void) ret = -ENOMEM; goto err2; } + + ret = ib_register_client(&rdma_cma_client); + if (ret) + goto err3; + return 0; +err3: + unregister_net_sysctl_table(ucma_ctl_table_hdr); err2: device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); err1: @@ -1843,11 +1848,10 
@@ err1: static void __exit ucma_cleanup(void) { + ib_unregister_client(&rdma_cma_client); unregister_net_sysctl_table(ucma_ctl_table_hdr); device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); - idr_destroy(&ctx_idr); - idr_destroy(&multicast_idr); } module_init(ucma_init); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index e7ea819fcb11..08da840ed7ee 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -54,9 +54,10 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { page = sg_page_iter_page(&sg_iter); - if (!PageDirty(page) && umem->writable && dirty) - set_page_dirty_lock(page); - put_page(page); + if (umem->writable && dirty) + put_user_pages_dirty_lock(&page, 1); + else + put_user_page(page); } sg_free_table(&umem->sg_head); @@ -244,7 +245,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, umem->context = context; umem->length = size; umem->address = addr; - umem->page_shift = PAGE_SHIFT; umem->writable = ib_access_writable(access); umem->owning_mm = mm = current->mm; mmgrab(mm); @@ -361,6 +361,9 @@ static void __ib_umem_release_tail(struct ib_umem *umem) */ void ib_umem_release(struct ib_umem *umem) { + if (!umem) + return; + if (umem->is_odp) { ib_umem_odp_release(to_ib_umem_odp(umem)); __ib_umem_release_tail(umem); @@ -385,7 +388,7 @@ int ib_umem_page_count(struct ib_umem *umem) n = 0; for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) - n += sg_dma_len(sg) >> umem->page_shift; + n += sg_dma_len(sg) >> PAGE_SHIFT; return n; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index f962b5bbfa40..2a75c6f8d827 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -59,7 +59,7 @@ static u64 node_start(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = 
container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_start(&umem_odp->umem); + return ib_umem_start(umem_odp); } /* Note that the representation of the intervals in the interval tree @@ -72,7 +72,7 @@ static u64 node_last(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_end(&umem_odp->umem) - 1; + return ib_umem_end(umem_odp) - 1; } INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, @@ -107,8 +107,6 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { - struct ib_umem *umem = &umem_odp->umem; - /* * Increase the number of notifiers running, to * prevent any further fault handling on this MR. @@ -119,8 +117,8 @@ static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, * all pending page faults. */ smp_wmb(); complete_all(&umem_odp->notifier_completion); - umem->context->invalidate_range(umem_odp, ib_umem_start(umem), - ib_umem_end(umem)); + umem_odp->umem.context->invalidate_range( + umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); return 0; } @@ -151,6 +149,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, { struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); + int rc; if (mmu_notifier_range_blockable(range)) down_read(&per_mm->umem_rwsem); @@ -167,11 +166,14 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, return 0; } - return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, - range->end, - invalidate_range_start_trampoline, - mmu_notifier_range_blockable(range), - NULL); + rc = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, + range->end, + invalidate_range_start_trampoline, + mmu_notifier_range_blockable(range), + NULL); + if (rc) + up_read(&per_mm->umem_rwsem); + return 
rc; } static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, @@ -205,10 +207,9 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) { struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - struct ib_umem *umem = &umem_odp->umem; down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) rbt_ib_umem_insert(&umem_odp->interval_tree, &per_mm->umem_tree); up_write(&per_mm->umem_rwsem); @@ -217,10 +218,9 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) { struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - struct ib_umem *umem = &umem_odp->umem; down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) rbt_ib_umem_remove(&umem_odp->interval_tree, &per_mm->umem_tree); complete_all(&umem_odp->notifier_completion); @@ -351,7 +351,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, umem->context = ctx; umem->length = size; umem->address = addr; - umem->page_shift = PAGE_SHIFT; + odp_data->page_shift = PAGE_SHIFT; umem->writable = root->umem.writable; umem->is_odp = 1; odp_data->per_mm = per_mm; @@ -405,18 +405,19 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) struct mm_struct *mm = umem->owning_mm; int ret_val; + umem_odp->page_shift = PAGE_SHIFT; if (access & IB_ACCESS_HUGETLB) { struct vm_area_struct *vma; struct hstate *h; down_read(&mm->mmap_sem); - vma = find_vma(mm, ib_umem_start(umem)); + vma = find_vma(mm, ib_umem_start(umem_odp)); if (!vma || !is_vm_hugetlb_page(vma)) { up_read(&mm->mmap_sem); return -EINVAL; } h = hstate_vma(vma); - umem->page_shift = huge_page_shift(h); + umem_odp->page_shift = huge_page_shift(h); up_read(&mm->mmap_sem); } @@ -424,16 +425,16 @@ int 
ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) init_completion(&umem_odp->notifier_completion); - if (ib_umem_num_pages(umem)) { + if (ib_umem_odp_num_pages(umem_odp)) { umem_odp->page_list = vzalloc(array_size(sizeof(*umem_odp->page_list), - ib_umem_num_pages(umem))); + ib_umem_odp_num_pages(umem_odp))); if (!umem_odp->page_list) return -ENOMEM; umem_odp->dma_list = vzalloc(array_size(sizeof(*umem_odp->dma_list), - ib_umem_num_pages(umem))); + ib_umem_odp_num_pages(umem_odp))); if (!umem_odp->dma_list) { ret_val = -ENOMEM; goto out_page_list; @@ -456,16 +457,14 @@ out_page_list: void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { - struct ib_umem *umem = &umem_odp->umem; - /* * Ensure that no more pages are mapped in the umem. * * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. */ - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), - ib_umem_end(umem)); + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); remove_umem_from_per_mm(umem_odp); put_per_mm(umem_odp); @@ -487,7 +486,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) * The function returns -EFAULT if the DMA mapping operation fails. It returns * -EAGAIN if a concurrent invalidation prevents us from updating the page. * - * The page is released via put_page even if the operation failed. For + * The page is released via put_user_page even if the operation failed. For * on-demand pinning, the page is released whenever it isn't stored in the * umem. 
*/ @@ -498,8 +497,8 @@ static int ib_umem_odp_map_dma_single_page( u64 access_mask, unsigned long current_seq) { - struct ib_umem *umem = &umem_odp->umem; - struct ib_device *dev = umem->context->device; + struct ib_ucontext *context = umem_odp->umem.context; + struct ib_device *dev = context->device; dma_addr_t dma_addr; int remove_existing_mapping = 0; int ret = 0; @@ -514,10 +513,9 @@ static int ib_umem_odp_map_dma_single_page( goto out; } if (!(umem_odp->dma_list[page_index])) { - dma_addr = ib_dma_map_page(dev, - page, - 0, BIT(umem->page_shift), - DMA_BIDIRECTIONAL); + dma_addr = + ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift), + DMA_BIDIRECTIONAL); if (ib_dma_mapping_error(dev, dma_addr)) { ret = -EFAULT; goto out; @@ -536,15 +534,16 @@ static int ib_umem_odp_map_dma_single_page( } out: - put_page(page); + put_user_page(page); if (remove_existing_mapping) { ib_umem_notifier_start_account(umem_odp); - umem->context->invalidate_range( + context->invalidate_range( umem_odp, - ib_umem_start(umem) + (page_index << umem->page_shift), - ib_umem_start(umem) + - ((page_index + 1) << umem->page_shift)); + ib_umem_start(umem_odp) + + (page_index << umem_odp->page_shift), + ib_umem_start(umem_odp) + + ((page_index + 1) << umem_odp->page_shift)); ib_umem_notifier_end_account(umem_odp); ret = -EAGAIN; } @@ -581,27 +580,26 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, u64 bcnt, u64 access_mask, unsigned long current_seq) { - struct ib_umem *umem = &umem_odp->umem; struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = umem_odp->umem.owning_mm; struct page **local_page_list = NULL; u64 page_mask, off; - int j, k, ret = 0, start_idx, npages = 0, page_shift; - unsigned int flags = 0; + int j, k, ret = 0, start_idx, npages = 0; + unsigned int flags = 0, page_shift; phys_addr_t p = 0; if (access_mask == 0) return -EINVAL; - if (user_virt < ib_umem_start(umem) || - user_virt + bcnt > ib_umem_end(umem)) + if (user_virt < 
ib_umem_start(umem_odp) || + user_virt + bcnt > ib_umem_end(umem_odp)) return -EFAULT; local_page_list = (struct page **)__get_free_page(GFP_KERNEL); if (!local_page_list) return -ENOMEM; - page_shift = umem->page_shift; + page_shift = umem_odp->page_shift; page_mask = ~(BIT(page_shift) - 1); off = user_virt & (~page_mask); user_virt = user_virt & page_mask; @@ -621,7 +619,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, if (access_mask & ODP_WRITE_ALLOWED_BIT) flags |= FOLL_WRITE; - start_idx = (user_virt - ib_umem_start(umem)) >> page_shift; + start_idx = (user_virt - ib_umem_start(umem_odp)) >> page_shift; k = start_idx; while (bcnt > 0) { @@ -659,7 +657,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, ret = -EFAULT; break; } - put_page(local_page_list[j]); + put_user_page(local_page_list[j]); continue; } @@ -686,8 +684,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, * ib_umem_odp_map_dma_single_page(). */ if (npages - (j + 1) > 0) - release_pages(&local_page_list[j+1], - npages - (j + 1)); + put_user_pages(&local_page_list[j+1], + npages - (j + 1)); break; } } @@ -711,21 +709,20 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - struct ib_umem *umem = &umem_odp->umem; int idx; u64 addr; - struct ib_device *dev = umem->context->device; + struct ib_device *dev = umem_odp->umem.context->device; - virt = max_t(u64, virt, ib_umem_start(umem)); - bound = min_t(u64, bound, ib_umem_end(umem)); + virt = max_t(u64, virt, ib_umem_start(umem_odp)); + bound = min_t(u64, bound, ib_umem_end(umem_odp)); /* Note that during the run of this function, the * notifiers_count of the MR is > 0, preventing any racing * faults from completion. We might be racing with other * invalidations, so we must make sure we free each page only * once. 
*/ mutex_lock(&umem_odp->umem_mutex); - for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) { - idx = (addr - ib_umem_start(umem)) >> umem->page_shift; + for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; if (umem_odp->page_list[idx]) { struct page *page = umem_odp->page_list[idx]; dma_addr_t dma = umem_odp->dma_list[idx]; @@ -733,7 +730,8 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, WARN_ON(!dma_addr); - ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE, + ib_dma_unmap_page(dev, dma_addr, + BIT(umem_odp->page_shift), DMA_BIDIRECTIONAL); if (dma & ODP_WRITE_ALLOWED_BIT) { struct page *head_page = compound_head(page); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 671f07ba1fad..9f8a48016b41 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -54,6 +54,7 @@ #include <rdma/ib_mad.h> #include <rdma/ib_user_mad.h> +#include <rdma/rdma_netlink.h> #include "core_priv.h" @@ -744,7 +745,7 @@ found: "process %s did not enable P_Key index support.\n", current->comm); dev_warn(&file->port->dev, - " Documentation/infiniband/user_mad.txt has info on the new ABI.\n"); + " Documentation/infiniband/user_mad.rst has info on the new ABI.\n"); } } @@ -1124,11 +1125,48 @@ static const struct file_operations umad_sm_fops = { .llseek = no_llseek, }; +static int ib_umad_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_umad_device *umad_dev = client_data; + + if (!rdma_is_port_valid(ibdev, res->port)) + return -EINVAL; + + res->abi = IB_USER_MAD_ABI_VERSION; + res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].dev; + + return 0; +} + static struct ib_client umad_client = { .name = "umad", .add = ib_umad_add_one, - .remove = ib_umad_remove_one + .remove = ib_umad_remove_one, + .get_nl_info = ib_umad_get_nl_info, }; 
+MODULE_ALIAS_RDMA_CLIENT("umad"); + +static int ib_issm_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_umad_device *umad_dev = + ib_get_client_data(ibdev, &umad_client); + + if (!rdma_is_port_valid(ibdev, res->port)) + return -EINVAL; + + res->abi = IB_USER_MAD_ABI_VERSION; + res->cdev = &umad_dev->ports[res->port - rdma_start_port(ibdev)].sm_dev; + + return 0; +} + +static struct ib_client issm_client = { + .name = "issm", + .get_nl_info = ib_issm_get_nl_info, +}; +MODULE_ALIAS_RDMA_CLIENT("issm"); static ssize_t ibdev_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -1387,13 +1425,17 @@ static int __init ib_umad_init(void) } ret = ib_register_client(&umad_client); - if (ret) { - pr_err("couldn't register ib_umad client\n"); + if (ret) goto out_class; - } + + ret = ib_register_client(&issm_client); + if (ret) + goto out_client; return 0; +out_client: + ib_unregister_client(&umad_client); out_class: class_unregister(&umad_class); @@ -1411,6 +1453,7 @@ out: static void __exit ib_umad_cleanup(void) { + ib_unregister_client(&issm_client); ib_unregister_client(&umad_client); class_unregister(&umad_class); unregister_chrdev_region(base_umad_dev, diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 63fe14c7c68f..7ddd0e5bc6b3 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -756,7 +756,9 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) mr->device = pd->device; mr->pd = pd; + mr->type = IB_MR_TYPE_USER; mr->dm = NULL; + mr->sig_attrs = NULL; mr->uobject = uobj; atomic_inc(&pd->usecnt); mr->res.type = RDMA_RESTRACK_MR; @@ -1021,12 +1023,11 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, attr.comp_vector = cmd->comp_vector; attr.flags = cmd->flags; - cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata); - if (IS_ERR(cq)) { - ret = PTR_ERR(cq); + cq = 
rdma_zalloc_drv_obj(ib_dev, ib_cq); + if (!cq) { + ret = -ENOMEM; goto err_file; } - cq->device = ib_dev; cq->uobject = &obj->uobject; cq->comp_handler = ib_uverbs_comp_handler; @@ -1034,6 +1035,10 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, cq->cq_context = ev_file ? &ev_file->ev_queue : NULL; atomic_set(&cq->usecnt, 0); + ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); + if (ret) + goto err_free; + obj->uobject.object = cq; memset(&resp, 0, sizeof resp); resp.base.cq_handle = obj->uobject.id; @@ -1054,7 +1059,9 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs, err_cb: ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs)); - + cq = NULL; +err_free: + kfree(cq); err_file: if (ev_file) ib_uverbs_release_ucq(attrs->ufile, ev_file, obj); @@ -2541,7 +2548,7 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs) struct ib_uqp_object *obj; struct ib_qp *qp; struct ib_uverbs_mcast_entry *mcast; - int ret = -EINVAL; + int ret; bool found = false; ret = uverbs_request(attrs, &cmd, sizeof(cmd)); @@ -3715,9 +3722,6 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) * trailing driver_data flex array. In this case the size of the base struct * cannot be changed. 
*/ -#define offsetof_after(_struct, _member) \ - (offsetof(_struct, _member) + sizeof(((_struct *)NULL)->_member)) - #define UAPI_DEF_WRITE_IO(req, resp) \ .write.has_resp = 1 + \ BUILD_BUG_ON_ZERO(offsetof(req, response) != 0) + \ @@ -3748,11 +3752,11 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) */ #define UAPI_DEF_WRITE_IO_EX(req, req_last_member, resp, resp_last_member) \ .write.has_resp = 1, \ - .write.req_size = offsetof_after(req, req_last_member), \ - .write.resp_size = offsetof_after(resp, resp_last_member) + .write.req_size = offsetofend(req, req_last_member), \ + .write.resp_size = offsetofend(resp, resp_last_member) #define UAPI_DEF_WRITE_I_EX(req, req_last_member) \ - .write.req_size = offsetof_after(req, req_last_member) + .write.req_size = offsetofend(req, req_last_member) const struct uapi_definition uverbs_def_write_intf[] = { DECLARE_UVERBS_OBJECT( diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 84a5e9a6d483..11c13c1381cf 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -51,6 +51,7 @@ #include <rdma/ib.h> #include <rdma/uverbs_std_types.h> +#include <rdma/rdma_netlink.h> #include "uverbs.h" #include "core_priv.h" @@ -198,7 +199,7 @@ void ib_uverbs_release_file(struct kref *ref) ib_dev = srcu_dereference(file->device->ib_dev, &file->device->disassociate_srcu); if (ib_dev && !ib_dev->ops.disassociate_ucontext) - module_put(ib_dev->owner); + module_put(ib_dev->ops.owner); srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); if (atomic_dec_and_test(&file->device->refcount)) @@ -1065,7 +1066,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) module_dependent = !(ib_dev->ops.disassociate_ucontext); if (module_dependent) { - if (!try_module_get(ib_dev->owner)) { + if (!try_module_get(ib_dev->ops.owner)) { ret = -ENODEV; goto err; } @@ -1100,7 +1101,7 @@ static int ib_uverbs_open(struct inode *inode, 
struct file *filp) return stream_open(inode, filp); err_module: - module_put(ib_dev->owner); + module_put(ib_dev->ops.owner); err: mutex_unlock(&dev->lists_mutex); @@ -1148,12 +1149,41 @@ static const struct file_operations uverbs_mmap_fops = { .compat_ioctl = ib_uverbs_ioctl, }; +static int ib_uverbs_get_nl_info(struct ib_device *ibdev, void *client_data, + struct ib_client_nl_info *res) +{ + struct ib_uverbs_device *uverbs_dev = client_data; + int ret; + + if (res->port != -1) + return -EINVAL; + + res->abi = ibdev->ops.uverbs_abi_ver; + res->cdev = &uverbs_dev->dev; + + /* + * To support DRIVER_ID binding in userspace some of the driver need + * upgrading to expose their PCI dependent revision information + * through get_context instead of relying on modalias matching. When + * the drivers are fixed they can drop this flag. + */ + if (!ibdev->ops.uverbs_no_driver_id_binding) { + ret = nla_put_u32(res->nl_msg, RDMA_NLDEV_ATTR_UVERBS_DRIVER_ID, + ibdev->ops.driver_id); + if (ret) + return ret; + } + return 0; +} + static struct ib_client uverbs_client = { .name = "uverbs", .no_kverbs_req = true, .add = ib_uverbs_add_one, - .remove = ib_uverbs_remove_one + .remove = ib_uverbs_remove_one, + .get_nl_info = ib_uverbs_get_nl_info, }; +MODULE_ALIAS_RDMA_CLIENT("uverbs"); static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, char *buf) @@ -1186,7 +1216,7 @@ static ssize_t abi_version_show(struct device *device, srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) - ret = sprintf(buf, "%d\n", ib_dev->uverbs_abi_ver); + ret = sprintf(buf, "%u\n", ib_dev->ops.uverbs_abi_ver); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index 07ea4e3c4566..e39fe6a8aac4 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ 
b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -111,9 +111,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( INIT_LIST_HEAD(&obj->comp_list); INIT_LIST_HEAD(&obj->async_list); - cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata); - if (IS_ERR(cq)) { - ret = PTR_ERR(cq); + cq = rdma_zalloc_drv_obj(ib_dev, ib_cq); + if (!cq) { + ret = -ENOMEM; goto err_event_file; } @@ -122,10 +122,15 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( cq->comp_handler = ib_uverbs_comp_handler; cq->event_handler = ib_uverbs_cq_event_handler; cq->cq_context = ev_file ? &ev_file->ev_queue : NULL; - obj->uobject.object = cq; - obj->uobject.user_handle = user_handle; atomic_set(&cq->usecnt, 0); cq->res.type = RDMA_RESTRACK_CQ; + + ret = ib_dev->ops.create_cq(cq, &attr, &attrs->driver_udata); + if (ret) + goto err_free; + + obj->uobject.object = cq; + obj->uobject.user_handle = user_handle; rdma_restrack_uadd(&cq->res); ret = uverbs_copy_to(attrs, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &cq->cqe, @@ -136,7 +141,9 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( return 0; err_cq: ib_destroy_cq_user(cq, uverbs_get_cleared_udata(attrs)); - + cq = NULL; +err_free: + kfree(cq); err_event_file: if (ev_file) uverbs_uobject_put(ev_file_uobj); diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 997f7a3a558a..c1286a52dc84 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -128,6 +128,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)( mr->device = pd->device; mr->pd = pd; + mr->type = IB_MR_TYPE_DM; mr->dm = dm; mr->uobject = uobj; atomic_inc(&pd->usecnt); diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 7a987acf0c0b..00c547887132 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -22,6 +22,8 @@ static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, 
size_t alloc_size) return ERR_PTR(-EOVERFLOW); elm = kzalloc(alloc_size, GFP_KERNEL); + if (!elm) + return ERR_PTR(-ENOMEM); rc = radix_tree_insert(&uapi->radix, key, elm); if (rc) { kfree(elm); @@ -645,7 +647,7 @@ struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev) return ERR_PTR(-ENOMEM); INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL); - uapi->driver_id = ibdev->driver_id; + uapi->driver_id = ibdev->ops.driver_id; rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false); if (rc) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index e666a1f7608d..92349bf37589 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -209,7 +209,7 @@ __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) EXPORT_SYMBOL(ib_rate_to_mbps); __attribute_const__ enum rdma_transport_type -rdma_node_get_transport(enum rdma_node_type node_type) +rdma_node_get_transport(unsigned int node_type) { if (node_type == RDMA_NODE_USNIC) @@ -299,6 +299,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, mr->device = pd->device; mr->pd = pd; + mr->type = IB_MR_TYPE_DMA; mr->uobject = NULL; mr->need_inval = false; @@ -316,7 +317,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, EXPORT_SYMBOL(__ib_alloc_pd); /** - * ib_dealloc_pd - Deallocates a protection domain. + * ib_dealloc_pd_user - Deallocates a protection domain. * @pd: The protection domain to deallocate. * @udata: Valid user data or NULL for kernel object * @@ -1157,6 +1158,10 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd, qp_init_attr->cap.max_recv_sge)) return ERR_PTR(-EINVAL); + if ((qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) && + !(device->attrs.device_cap_flags & IB_DEVICE_INTEGRITY_HANDOVER)) + return ERR_PTR(-EINVAL); + /* * If the callers is using the RDMA API calculate the resources * needed for the RDMA READ/WRITE operations. 
@@ -1232,6 +1237,8 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd, qp->max_write_sge = qp_init_attr->cap.max_send_sge; qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, device->attrs.max_sge_rd); + if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) + qp->integrity_en = true; return qp; @@ -1683,6 +1690,14 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, } } + /* + * Bind this qp to a counter automatically based on the rdma counter + * rules. This only set in RST2INIT with port specified + */ + if (!qp->counter && (attr_mask & IB_QP_PORT) && + ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT)) + rdma_counter_bind_qp_auto(qp, attr->port_num); + ret = ib_security_modify_qp(qp, attr, attr_mask, udata); if (ret) goto out; @@ -1878,6 +1893,7 @@ int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata) if (!qp->uobject) rdma_rw_cleanup_mrs(qp); + rdma_counter_unbind_qp(qp, true); rdma_restrack_del(&qp->res); ret = qp->device->ops.destroy_qp(qp, udata); if (!ret) { @@ -1916,21 +1932,28 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, const char *caller) { struct ib_cq *cq; + int ret; + + cq = rdma_zalloc_drv_obj(device, ib_cq); + if (!cq) + return ERR_PTR(-ENOMEM); - cq = device->ops.create_cq(device, cq_attr, NULL); - - if (!IS_ERR(cq)) { - cq->device = device; - cq->uobject = NULL; - cq->comp_handler = comp_handler; - cq->event_handler = event_handler; - cq->cq_context = cq_context; - atomic_set(&cq->usecnt, 0); - cq->res.type = RDMA_RESTRACK_CQ; - rdma_restrack_set_task(&cq->res, caller); - rdma_restrack_kadd(&cq->res); + cq->device = device; + cq->uobject = NULL; + cq->comp_handler = comp_handler; + cq->event_handler = event_handler; + cq->cq_context = cq_context; + atomic_set(&cq->usecnt, 0); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_set_task(&cq->res, caller); + + ret = device->ops.create_cq(cq, cq_attr, NULL); + if (ret) { + kfree(cq); + return ERR_PTR(ret); } + 
rdma_restrack_kadd(&cq->res); return cq; } EXPORT_SYMBOL(__ib_create_cq); @@ -1949,7 +1972,9 @@ int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) return -EBUSY; rdma_restrack_del(&cq->res); - return cq->device->ops.destroy_cq(cq, udata); + cq->device->ops.destroy_cq(cq, udata); + kfree(cq); + return 0; } EXPORT_SYMBOL(ib_destroy_cq_user); @@ -1966,6 +1991,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) { struct ib_pd *pd = mr->pd; struct ib_dm *dm = mr->dm; + struct ib_sig_attrs *sig_attrs = mr->sig_attrs; int ret; rdma_restrack_del(&mr->res); @@ -1974,6 +2000,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) atomic_dec(&pd->usecnt); if (dm) atomic_dec(&dm->usecnt); + kfree(sig_attrs); } return ret; @@ -1981,7 +2008,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) EXPORT_SYMBOL(ib_dereg_mr_user); /** - * ib_alloc_mr() - Allocates a memory region + * ib_alloc_mr_user() - Allocates a memory region * @pd: protection domain associated with the region * @mr_type: memory region type * @max_num_sg: maximum sg entries available for registration. 
@@ -2001,6 +2028,9 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type, if (!pd->device->ops.alloc_mr) return ERR_PTR(-EOPNOTSUPP); + if (WARN_ON_ONCE(mr_type == IB_MR_TYPE_INTEGRITY)) + return ERR_PTR(-EINVAL); + mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg, udata); if (!IS_ERR(mr)) { mr->device = pd->device; @@ -2011,12 +2041,66 @@ struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type, mr->need_inval = false; mr->res.type = RDMA_RESTRACK_MR; rdma_restrack_kadd(&mr->res); + mr->type = mr_type; + mr->sig_attrs = NULL; } return mr; } EXPORT_SYMBOL(ib_alloc_mr_user); +/** + * ib_alloc_mr_integrity() - Allocates an integrity memory region + * @pd: protection domain associated with the region + * @max_num_data_sg: maximum data sg entries available for registration + * @max_num_meta_sg: maximum metadata sg entries available for + * registration + * + * Notes: + * Memory registration page/sg lists must not exceed max_num_sg, + * also the integrity page/sg lists must not exceed max_num_meta_sg. 
+ * + */ +struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, + u32 max_num_data_sg, + u32 max_num_meta_sg) +{ + struct ib_mr *mr; + struct ib_sig_attrs *sig_attrs; + + if (!pd->device->ops.alloc_mr_integrity || + !pd->device->ops.map_mr_sg_pi) + return ERR_PTR(-EOPNOTSUPP); + + if (!max_num_meta_sg) + return ERR_PTR(-EINVAL); + + sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL); + if (!sig_attrs) + return ERR_PTR(-ENOMEM); + + mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg, + max_num_meta_sg); + if (IS_ERR(mr)) { + kfree(sig_attrs); + return mr; + } + + mr->device = pd->device; + mr->pd = pd; + mr->dm = NULL; + mr->uobject = NULL; + atomic_inc(&pd->usecnt); + mr->need_inval = false; + mr->res.type = RDMA_RESTRACK_MR; + rdma_restrack_kadd(&mr->res); + mr->type = IB_MR_TYPE_INTEGRITY; + mr->sig_attrs = sig_attrs; + + return mr; +} +EXPORT_SYMBOL(ib_alloc_mr_integrity); + /* "Fast" memory regions */ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, @@ -2226,19 +2310,17 @@ EXPORT_SYMBOL(ib_create_wq); */ int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata) { - int err; struct ib_cq *cq = wq->cq; struct ib_pd *pd = wq->pd; if (atomic_read(&wq->usecnt)) return -EBUSY; - err = wq->device->ops.destroy_wq(wq, udata); - if (!err) { - atomic_dec(&pd->usecnt); - atomic_dec(&cq->usecnt); - } - return err; + wq->device->ops.destroy_wq(wq, udata); + atomic_dec(&pd->usecnt); + atomic_dec(&cq->usecnt); + + return 0; } EXPORT_SYMBOL(ib_destroy_wq); @@ -2376,6 +2458,43 @@ int ib_set_vf_guid(struct ib_device *device, int vf, u8 port, u64 guid, EXPORT_SYMBOL(ib_set_vf_guid); /** + * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection + * information) and set an appropriate memory region for registration. 
+ * @mr: memory region + * @data_sg: dma mapped scatterlist for data + * @data_sg_nents: number of entries in data_sg + * @data_sg_offset: offset in bytes into data_sg + * @meta_sg: dma mapped scatterlist for metadata + * @meta_sg_nents: number of entries in meta_sg + * @meta_sg_offset: offset in bytes into meta_sg + * @page_size: page vector desired page size + * + * Constraints: + * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY. + * + * Return: 0 on success. + * + * After this completes successfully, the memory region + * is ready for registration. + */ +int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg, + int data_sg_nents, unsigned int *data_sg_offset, + struct scatterlist *meta_sg, int meta_sg_nents, + unsigned int *meta_sg_offset, unsigned int page_size) +{ + if (unlikely(!mr->device->ops.map_mr_sg_pi || + WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY))) + return -EOPNOTSUPP; + + mr->page_size = page_size; + + return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents, + data_sg_offset, meta_sg, + meta_sg_nents, meta_sg_offset); +} +EXPORT_SYMBOL(ib_map_mr_sg_pi); + +/** * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list * and set it the memory region. * @mr: memory region |