From 0953fffec9ba022f63bfe01e86427530d8320d5c Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 28 Aug 2018 14:18:50 +0300 Subject: RDMA/uverbs: Add UVERBS_ATTR_CONST_IN to the specs language This makes it clear and safe to access constants passed in from user space. We define a consistent ABI of u64 for all constants, and verify that the data passed in can be represented by the type the user supplies. The expectation is that this will always be used with an enum declaring the constant values, and that the user will use the enum type as input to the accessor. To retrieve the attribute value we introduce two helper calls: a standard one, which may fail if the attribute is not valid, and one where the caller can provide a default value to be used when the attribute is not valid (useful when the attribute is optional). Signed-off-by: Jason Gunthorpe Signed-off-by: Ariel Levkovich Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_ioctl.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 1a6b229e3db3..4bafd4671de2 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -611,3 +611,26 @@ int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx, return 0; } EXPORT_SYMBOL(uverbs_copy_to); + +int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val) +{ + const struct uverbs_attr *attr; + + attr = uverbs_attr_get(attrs_bundle, idx); + if (IS_ERR(attr)) { + if ((PTR_ERR(attr) != -ENOENT) || !def_val) + return PTR_ERR(attr); + + *to = *def_val; + } else { + *to = attr->ptr_attr.data; + } + + if (*to < lower_bound || (*to > 0 && (u64)*to > upper_bound)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(_uverbs_get_const); -- cgit v1.2.3 From 841eefc5cb57030ad05a0c4bc285f93ffa668ad9 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 28 Aug 2018 14:18:52 +0300 Subject: RDMA/uverbs: Add generic function to fill in flow action object Refactor the initialization of a flow action object to a common function.
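As a sketch of how a handler might consume such a constant (the enum, attribute id, and handler name below are hypothetical; only _uverbs_get_const() and its bounds/default semantics come from the hunk above):

    /* Hypothetical constant set; user space passes one value as a u64. */
    enum example_op_mode {
            EXAMPLE_MODE_FIRST = 1,
            EXAMPLE_MODE_LAST = 3,
    };

    static int example_method(const struct uverbs_attr_bundle *attrs)
    {
            s64 def = EXAMPLE_MODE_FIRST;   /* used when the attribute is omitted */
            s64 mode;
            int ret;

            /* Rejects any value outside [EXAMPLE_MODE_FIRST, EXAMPLE_MODE_LAST]. */
            ret = _uverbs_get_const(&mode, attrs, EXAMPLE_ATTR_MODE,
                                    EXAMPLE_MODE_FIRST, EXAMPLE_MODE_LAST, &def);
            if (ret)
                    return ret;

            return 0;
    }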
Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_std_types_flow_action.c | 7 ++----- drivers/infiniband/hw/mlx5/flow.c | 8 +++----- include/rdma/uverbs_std_types.h | 12 ++++++++++++ 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c index d8cfafe23bd9..cb9486ad5c67 100644 --- a/drivers/infiniband/core/uverbs_std_types_flow_action.c +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -326,11 +326,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)( if (IS_ERR(action)) return PTR_ERR(action); - atomic_set(&action->usecnt, 0); - action->device = ib_dev; - action->type = IB_FLOW_ACTION_ESP; - action->uobject = uobj; - uobj->object = action; + uverbs_flow_action_fill_action(action, uobj, ib_dev, + IB_FLOW_ACTION_ESP); return 0; } diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 02103a4b372c..0c89d5431c7e 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -279,11 +280,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)( if (IS_ERR(action)) return PTR_ERR(action); - atomic_set(&action->usecnt, 0); - action->device = uobj->context->device; - action->type = IB_FLOW_ACTION_UNSPECIFIED; - action->uobject = uobj; - uobj->object = action; + uverbs_flow_action_fill_action(action, uobj, uobj->context->device, + IB_FLOW_ACTION_UNSPECIFIED); return 0; } diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 3b00231cc084..526d918fcd5a 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -140,5 +140,17 @@ __uobj_alloc(const struct uverbs_api_object *obj, struct ib_uverbs_file *ufile, #define uobj_alloc(_type, _ufile, _ib_dev) \ __uobj_alloc(uobj_get_type(_ufile, _type), _ufile, _ib_dev) +static inline void uverbs_flow_action_fill_action(struct ib_flow_action *action, + struct ib_uobject *uobj, + struct ib_device *ib_dev, + enum ib_flow_action_type type) +{ + atomic_set(&action->usecnt, 0); + action->device = ib_dev; + action->type = type; + action->uobject = uobj; + uobj->object = action; +} + #endif -- cgit v1.2.3 From f794809a7259dfaa3d47d90ef5a86007cf48b1ce Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 27 Aug 2018 08:35:55 +0300 Subject: IB/core: Add an unbound WQ type to the new CQ API The upstream kernel commit cited below modified the workqueue in the new CQ API to be bound to a specific CPU (instead of being unbound). This caused ALL users of the new CQ API to use the same bound WQ. Specifically, MAD handling was severely delayed when the CPU bound to the WQ was busy handling (higher priority) interrupts. This caused a delay in the MAD "heartbeat" response handling, which resulted in ports being incorrectly classified as "down". To fix this, add a new "unbound" WQ type to the new CQ API, so that users have the option to choose either a bound WQ or an unbound WQ. For MADs, choose the new "unbound" WQ. 
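For a ULP, opting out of the CPU-bound workqueue is a one-argument change at CQ allocation time; a minimal sketch (device, priv and cq_size are placeholders; ib_alloc_cq() and the new poll context appear in the mad.c hunk below):

    struct ib_cq *cq;

    /* Completions are processed on ib_comp_unbound_wq, so a CPU stuck in
     * interrupt handling no longer delays this CQ's work.
     */
    cq = ib_alloc_cq(device, priv, cq_size, 0, IB_POLL_UNBOUND_WORKQUEUE);
    if (IS_ERR(cq))
            return PTR_ERR(cq);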
Fixes: b7363e67b23e ("IB/device: Convert ib-comp-wq to be CPU-bound") Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cq.c | 8 ++++++-- drivers/infiniband/core/device.c | 15 ++++++++++++++- drivers/infiniband/core/mad.c | 2 +- include/rdma/ib_verbs.h | 9 ++++++--- 4 files changed, 27 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index af5ad6a56ae4..9271f7290005 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -112,12 +112,12 @@ static void ib_cq_poll_work(struct work_struct *work) IB_POLL_BATCH); if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) - queue_work(ib_comp_wq, &cq->work); + queue_work(cq->comp_wq, &cq->work); } static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) { - queue_work(ib_comp_wq, &cq->work); + queue_work(cq->comp_wq, &cq->work); } /** @@ -175,9 +175,12 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); break; case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: cq->comp_handler = ib_cq_completion_workqueue; INIT_WORK(&cq->work, ib_cq_poll_work); ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ? + ib_comp_wq : ib_comp_unbound_wq; break; default: ret = -EINVAL; @@ -213,6 +216,7 @@ void ib_free_cq(struct ib_cq *cq) irq_poll_disable(&cq->iop); break; case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: cancel_work_sync(&cq->work); break; default: diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index db3b6271f09d..6d8ac51a39cc 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -61,6 +61,7 @@ struct ib_client_data { }; struct workqueue_struct *ib_comp_wq; +struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); @@ -1166,10 +1167,19 @@ static int __init ib_core_init(void) goto err; } + ib_comp_unbound_wq = + alloc_workqueue("ib-comp-unb-wq", + WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | + WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); + if (!ib_comp_unbound_wq) { + ret = -ENOMEM; + goto err_comp; + } + ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); - goto err_comp; + goto err_comp_unbound; } ret = rdma_nl_init(); @@ -1218,6 +1228,8 @@ err_ibnl: rdma_nl_exit(); err_sysfs: class_unregister(&ib_class); +err_comp_unbound: + destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); err: @@ -1236,6 +1248,7 @@ static void __exit ib_core_cleanup(void) addr_cleanup(); rdma_nl_exit(); class_unregister(&ib_class); + destroy_workqueue(ib_comp_unbound_wq); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
*/ destroy_workqueue(ib_wq); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index ef459f2f2eeb..b8977c3db5f3 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -3183,7 +3183,7 @@ static int ib_mad_port_open(struct ib_device *device, cq_size *= 2; port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, - IB_POLL_WORKQUEUE); + IB_POLL_UNBOUND_WORKQUEUE); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e950c2a68f06..df8d234a2b56 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -71,6 +71,7 @@ extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; +extern struct workqueue_struct *ib_comp_unbound_wq; union ib_gid { u8 raw[16]; @@ -1570,9 +1571,10 @@ struct ib_ah { typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); enum ib_poll_context { - IB_POLL_DIRECT, /* caller context, no hw completions */ - IB_POLL_SOFTIRQ, /* poll from softirq context */ - IB_POLL_WORKQUEUE, /* poll from workqueue */ + IB_POLL_DIRECT, /* caller context, no hw completions */ + IB_POLL_SOFTIRQ, /* poll from softirq context */ + IB_POLL_WORKQUEUE, /* poll from workqueue */ + IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */ }; struct ib_cq { @@ -1589,6 +1591,7 @@ struct ib_cq { struct irq_poll iop; struct work_struct work; }; + struct workqueue_struct *comp_wq; /* * Implementation details of the RDMA core, don't use in drivers: */ -- cgit v1.2.3 From 6ceb6331b3291694fb6ceba625219f51447c3fa2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 3 Sep 2018 20:18:03 +0300 Subject: RDMA/uverbs: Declare closing variable as boolean The "closing" variable is used as a boolean and set to "true" in one place; update the declaration of that variable and its other assignment to the proper type. Fixes: e951747a087a ("IB/uverbs: Rework the locking for cleaning up the ucontext") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 2 +- include/rdma/ib_verbs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index a21d5214afc3..4b72851ade24 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -120,7 +120,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, rcu_read_lock(); ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); rcu_read_unlock(); - ucontext->closing = 0; + ucontext->closing = false; ucontext->cleanup_retryable = false; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index df8d234a2b56..a4c3a09a91bc 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1486,7 +1486,7 @@ struct ib_ucontext { * it is set when we are closing the file descriptor and indicates * that mm_sem may be locked.
*/ - int closing; + bool closing; bool cleanup_retryable; -- cgit v1.2.3 From 627212c9d49ba2759b699450f5d8f45f73e062fa Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 3 Sep 2018 20:20:25 +0300 Subject: RDMA/core: Replace open-coded variant of get_device Reuse existing get_device() API to do it symmetric to already used put_device() in commit 924b8900a49d ("RDMA/core: Replace open-coded variant of put_device") Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7fd14ead7b37..62351b3fcafc 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1359,8 +1359,8 @@ void ib_device_unregister_sysfs(struct ib_device *device) { int i; - /* Hold kobject until ib_dealloc_device() */ - kobject_get(&device->dev.kobj); + /* Hold device until ib_dealloc_device() */ + get_device(&device->dev); free_port_list_attributes(device); -- cgit v1.2.3 From adee9f3f3bbb317c5469f84deba01eef4b86515b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 09:47:58 +0300 Subject: RDMA/core: Depend on device_add() to add device attributes Instead of adding/removing device attribute files, depend on device_add() which considers adding these device files based on NULL terminated attributes group array. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/sysfs.c | 61 ++++++++++++++++++----------------------- include/rdma/ib_verbs.h | 3 ++ 2 files changed, 30 insertions(+), 34 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7fd14ead7b37..185075af3ad6 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1183,7 +1183,7 @@ err_put: return ret; } -static ssize_t show_node_type(struct device *device, +static ssize_t node_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1198,8 +1198,9 @@ static ssize_t show_node_type(struct device *device, default: return sprintf(buf, "%d: \n", dev->node_type); } } +static DEVICE_ATTR_RO(node_type); -static ssize_t show_sys_image_guid(struct device *device, +static ssize_t sys_image_guid_show(struct device *device, struct device_attribute *dev_attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1210,8 +1211,9 @@ static ssize_t show_sys_image_guid(struct device *device, be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); } +static DEVICE_ATTR_RO(sys_image_guid); -static ssize_t show_node_guid(struct device *device, +static ssize_t node_guid_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1222,8 +1224,9 @@ static ssize_t show_node_guid(struct device *device, be16_to_cpu(((__be16 *) &dev->node_guid)[2]), be16_to_cpu(((__be16 *) &dev->node_guid)[3])); } +static DEVICE_ATTR_RO(node_guid); -static ssize_t show_node_desc(struct device *device, +static ssize_t node_desc_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct 
ib_device, dev); @@ -1231,9 +1234,9 @@ static ssize_t show_node_desc(struct device *device, return sprintf(buf, "%.64s\n", dev->node_desc); } -static ssize_t set_node_desc(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t node_desc_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) { struct ib_device *dev = container_of(device, struct ib_device, dev); struct ib_device_modify desc = {}; @@ -1249,8 +1252,9 @@ static ssize_t set_node_desc(struct device *device, return count; } +static DEVICE_ATTR_RW(node_desc); -static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, +static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1259,19 +1263,19 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); return strlen(buf); } +static DEVICE_ATTR_RO(fw_ver); + +static struct attribute *ib_dev_attrs[] = { + &dev_attr_node_type.attr, + &dev_attr_node_guid.attr, + &dev_attr_sys_image_guid.attr, + &dev_attr_fw_ver.attr, + &dev_attr_node_desc.attr, + NULL, +}; -static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); -static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); -static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); -static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); - -static struct device_attribute *ib_class_attributes[] = { - &dev_attr_node_type, - &dev_attr_sys_image_guid, - &dev_attr_node_guid, - &dev_attr_node_desc, - &dev_attr_fw_ver, +static const struct attribute_group dev_attr_group = { + .attrs = ib_dev_attrs, }; static void free_port_list_attributes(struct ib_device *device) @@ -1311,16 +1315,13 @@ int ib_device_register_sysfs(struct ib_device *device, if (ret) return ret; + device->groups[0] = &dev_attr_group; + class_dev->groups = device->groups; + ret = device_add(class_dev); if (ret) goto err; - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - ret = device_create_file(class_dev, ib_class_attributes[i]); - if (ret) - goto err_unregister; - } - device->ports_parent = kobject_create_and_add("ports", &class_dev->kobj); if (!device->ports_parent) { @@ -1347,18 +1348,13 @@ int ib_device_register_sysfs(struct ib_device *device, err_put: free_port_list_attributes(device); - -err_unregister: device_del(class_dev); - err: return ret; } void ib_device_unregister_sysfs(struct ib_device *device) { - int i; - /* Hold kobject until ib_dealloc_device() */ kobject_get(&device->dev.kobj); @@ -1369,8 +1365,5 @@ void ib_device_unregister_sysfs(struct ib_device *device) free_hsag(&device->dev.kobj, device->hw_stats_ag); } - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) - device_remove_file(&device->dev, ib_class_attributes[i]); - device_unregister(&device->dev); } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e950c2a68f06..cd0f935f0bc1 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2536,6 +2536,9 @@ struct ib_device { struct module *owner; struct device dev; + /* First group for device attributes, NULL terminated array */ + const struct attribute_group *groups[2]; + struct kobject *ports_parent; struct list_head port_list; -- cgit v1.2.3 From c5c4d92e70f37369b5bdca5e85f9fc55dc2c8a3b Mon Sep 17 00:00:00 2001 From: Parav 
Pandit Date: Wed, 5 Sep 2018 09:47:59 +0300 Subject: RDMA/uverbs: Use cdev_device_add() instead of cdev_add() Instead of doing two step process to add char device and create underlying device, use cdev_device_add() which does both. Currently a kobject per uverbs_device is created to keep reference to its holding ib_uverbs_device in addition to its underlying device 'dev'. Instead just use uverbs_device->dev to keep a reference to. With this change there is single reference tracker for ib_uverbs_device structure. This allows for subsequent patch to registers group attribute as well using single API cdev_device_add(). Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 3 +- drivers/infiniband/core/uverbs_main.c | 66 +++++++++++++++-------------------- 2 files changed, 30 insertions(+), 39 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 5df8e548cc14..0288aec432a4 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -100,13 +100,12 @@ struct ib_uverbs_device { atomic_t refcount; int num_comp_vectors; struct completion comp; - struct device *dev; + struct device dev; struct ib_device __rcu *ib_dev; int devnum; struct cdev cdev; struct rb_root xrcd_tree; struct mutex xrcd_tree_mutex; - struct kobject kobj; struct srcu_struct disassociate_srcu; struct mutex lists_mutex; /* protect lists */ struct list_head uverbs_file_list; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 6d974e2363df..1d2650f0f24c 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -169,20 +169,16 @@ int uverbs_dealloc_mw(struct ib_mw *mw) return ret; } -static void ib_uverbs_release_dev(struct kobject *kobj) +static void ib_uverbs_release_dev(struct device *device) { struct ib_uverbs_device *dev = - container_of(kobj, struct ib_uverbs_device, kobj); + container_of(device, struct ib_uverbs_device, dev); uverbs_destroy_api(dev->uapi); cleanup_srcu_struct(&dev->disassociate_srcu); kfree(dev); } -static struct kobj_type ib_uverbs_dev_ktype = { - .release = ib_uverbs_release_dev, -}; - static void ib_uverbs_release_async_event_file(struct kref *ref) { struct ib_uverbs_async_event_file *file = @@ -265,7 +261,7 @@ void ib_uverbs_release_file(struct kref *ref) if (atomic_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); - kobject_put(&file->device->kobj); + put_device(&file->device->dev); kfree(file); } @@ -838,6 +834,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) if (!atomic_inc_not_zero(&dev->refcount)) return -ENXIO; + get_device(&dev->dev); srcu_key = srcu_read_lock(&dev->disassociate_srcu); mutex_lock(&dev->lists_mutex); ib_dev = srcu_dereference(dev->ib_dev, @@ -877,7 +874,6 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) init_rwsem(&file->hw_destroy_rwsem); filp->private_data = file; - kobject_get(&dev->kobj); list_add_tail(&file->list, &dev->uverbs_file_list); mutex_unlock(&dev->lists_mutex); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); @@ -898,6 +894,7 @@ err: if (atomic_dec_and_test(&dev->refcount)) ib_uverbs_comp_dev(dev); + put_device(&dev->dev); return ret; } @@ -953,14 +950,12 @@ static struct ib_client uverbs_client = { static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, char *buf) { + struct 
ib_uverbs_device *dev = + container_of(device, struct ib_uverbs_device, dev); int ret = -ENODEV; int srcu_key; - struct ib_uverbs_device *dev = dev_get_drvdata(device); struct ib_device *ib_dev; - if (!dev) - return -ENODEV; - srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) @@ -974,13 +969,12 @@ static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); static ssize_t show_dev_abi_version(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_uverbs_device *dev = dev_get_drvdata(device); + struct ib_uverbs_device *dev = + container_of(device, struct ib_uverbs_device, dev); int ret = -ENODEV; int srcu_key; struct ib_device *ib_dev; - if (!dev) - return -ENODEV; srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) @@ -1031,7 +1025,6 @@ static void ib_uverbs_add_one(struct ib_device *device) init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); - kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype); mutex_init(&uverbs_dev->lists_mutex); INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); @@ -1052,40 +1045,41 @@ static void ib_uverbs_add_one(struct ib_device *device) if (ib_uverbs_create_uapi(device, uverbs_dev)) goto err_uapi; - cdev_init(&uverbs_dev->cdev, NULL); + device_initialize(&uverbs_dev->dev); + uverbs_dev->dev.class = uverbs_class; + uverbs_dev->dev.parent = device->dev.parent; + uverbs_dev->dev.devt = base; + uverbs_dev->dev.release = ib_uverbs_release_dev; + dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum); + + cdev_init(&uverbs_dev->cdev, + device->mmap ? &uverbs_mmap_fops : &uverbs_fops); uverbs_dev->cdev.owner = THIS_MODULE; - uverbs_dev->cdev.ops = device->mmap ? 
&uverbs_mmap_fops : &uverbs_fops; - cdev_set_parent(&uverbs_dev->cdev, &uverbs_dev->kobj); - kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); - if (cdev_add(&uverbs_dev->cdev, base, 1)) - goto err_cdev; - uverbs_dev->dev = device_create(uverbs_class, device->dev.parent, - uverbs_dev->cdev.dev, uverbs_dev, - "uverbs%d", uverbs_dev->devnum); - if (IS_ERR(uverbs_dev->dev)) + ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev); + if (ret) goto err_cdev; - if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev)) - goto err_class; - if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) - goto err_class; + if (device_create_file(&uverbs_dev->dev, &dev_attr_ibdev)) + goto err_file; + if (device_create_file(&uverbs_dev->dev, &dev_attr_abi_version)) + goto err_file; ib_set_client_data(device, &uverbs_client, uverbs_dev); return; -err_class: - device_destroy(uverbs_class, uverbs_dev->cdev.dev); +err_file: + cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); err_cdev: cdev_del(&uverbs_dev->cdev); + put_device(&uverbs_dev->dev); err_uapi: clear_bit(devnum, dev_map); err: if (atomic_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); - kobject_put(&uverbs_dev->kobj); return; } @@ -1155,9 +1149,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) if (!uverbs_dev) return; - dev_set_drvdata(uverbs_dev->dev, NULL); - device_destroy(uverbs_class, uverbs_dev->cdev.dev); - cdev_del(&uverbs_dev->cdev); + cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); clear_bit(uverbs_dev->devnum, dev_map); if (device->disassociate_ucontext) { @@ -1181,7 +1173,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) if (wait_clients) wait_for_completion(&uverbs_dev->comp); - kobject_put(&uverbs_dev->kobj); + put_device(&uverbs_dev->dev); } static char *uverbs_devnode(struct device *dev, umode_t *mode) -- cgit v1.2.3 From b53b1c08a23eb1091982daacb2122f90a7094a77 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 09:48:00 +0300 Subject: RDMA/uverbs: Use device.groups to initialize device attributes Instead of explicitly adding device attribute files and handling such error conditions, depend on device core layer to create device attributes files based group pointer NULL terminated array. 
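The driver-core pattern this relies on, sketched with hypothetical attribute names (the struct fields match the uverbs hunks below): declare attributes, collect them in a NULL-terminated array, and point dev.groups at the group before the device is added.

    static ssize_t foo_show(struct device *device,
                            struct device_attribute *attr, char *buf)
    {
            return sprintf(buf, "foo\n");
    }
    static DEVICE_ATTR_RO(foo);

    static struct attribute *example_attrs[] = {
            &dev_attr_foo.attr,
            NULL,   /* array must be NULL terminated */
    };

    static const struct attribute_group example_attr_group = {
            .attrs = example_attrs,
    };

    static void example_set_groups(struct ib_uverbs_device *uverbs_dev)
    {
            /* must run before cdev_device_add()/device_add() */
            uverbs_dev->groups[0] = &example_attr_group;
            uverbs_dev->dev.groups = uverbs_dev->groups;
    }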
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 2 ++ drivers/infiniband/core/uverbs_main.c | 30 +++++++++++++++++------------- 2 files changed, 19 insertions(+), 13 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 0288aec432a4..7199c275ab79 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -101,6 +101,8 @@ struct ib_uverbs_device { int num_comp_vectors; struct completion comp; struct device dev; + /* First group for device attributes, NULL terminated array */ + const struct attribute_group *groups[2]; struct ib_device __rcu *ib_dev; int devnum; struct cdev cdev; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 1d2650f0f24c..16e5f714ca53 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -947,7 +947,7 @@ static struct ib_client uverbs_client = { .remove = ib_uverbs_remove_one }; -static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, +static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_uverbs_device *dev = @@ -964,10 +964,10 @@ static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, return ret; } -static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); +static DEVICE_ATTR_RO(ibdev); -static ssize_t show_dev_abi_version(struct device *device, - struct device_attribute *attr, char *buf) +static ssize_t abi_version_show(struct device *device, + struct device_attribute *attr, char *buf) { struct ib_uverbs_device *dev = container_of(device, struct ib_uverbs_device, dev); @@ -983,7 +983,17 @@ static ssize_t show_dev_abi_version(struct device *device, return ret; } -static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); +static DEVICE_ATTR_RO(abi_version); + +static struct attribute *ib_dev_attrs[] = { + &dev_attr_abi_version.attr, + &dev_attr_ibdev.attr, + NULL, +}; + +static const struct attribute_group dev_attr_group = { + .attrs = ib_dev_attrs, +}; static CLASS_ATTR_STRING(abi_version, S_IRUGO, __stringify(IB_USER_VERBS_ABI_VERSION)); @@ -1050,6 +1060,8 @@ static void ib_uverbs_add_one(struct ib_device *device) uverbs_dev->dev.parent = device->dev.parent; uverbs_dev->dev.devt = base; uverbs_dev->dev.release = ib_uverbs_release_dev; + uverbs_dev->groups[0] = &dev_attr_group; + uverbs_dev->dev.groups = uverbs_dev->groups; dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum); cdev_init(&uverbs_dev->cdev, @@ -1060,17 +1072,9 @@ static void ib_uverbs_add_one(struct ib_device *device) if (ret) goto err_cdev; - if (device_create_file(&uverbs_dev->dev, &dev_attr_ibdev)) - goto err_file; - if (device_create_file(&uverbs_dev->dev, &dev_attr_abi_version)) - goto err_file; - ib_set_client_data(device, &uverbs_client, uverbs_dev); - return; -err_file: - cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); err_cdev: cdev_del(&uverbs_dev->cdev); put_device(&uverbs_dev->dev); -- cgit v1.2.3 From 798bba01b44b0ddf8cd6e542635b37cc9a9b739c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 14:45:28 +0300 Subject: RDMA/core: Fail early if unsupported QP is provided When requested QP type is not supported for a {device, port}, return the error right away before validating all parameters during mad agent registration time. 
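The caller-visible effect, sketched under the assumption that ib_register_mad_agent() keeps its usual signature (handlers and context are placeholders): an SMI registration on a port with no QP0, e.g. a RoCE port, now fails immediately.

    struct ib_mad_agent *agent;

    agent = ib_register_mad_agent(device, port_num, IB_QPT_SMI,
                                  NULL, 0, send_handler, recv_handler,
                                  context, 0);
    if (IS_ERR(agent))
            /* -EPROTONOSUPPORT, returned before any further validation */
            return PTR_ERR(agent);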
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/mad.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index b8977c3db5f3..43343c4e033e 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -220,6 +220,10 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, int ret2, qpn; u8 mgmt_class, vclass; + if ((qp_type == IB_QPT_SMI && !rdma_cap_ib_smi(device, port_num)) || + (qp_type == IB_QPT_GSI && !rdma_cap_ib_cm(device, port_num))) + return ERR_PTR(-EPROTONOSUPPORT); + /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) { -- cgit v1.2.3 From f9d08f1e1939ad4d92e38bd3dee6842512f5bee6 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 14:45:31 +0300 Subject: RDMA/core: Rate limit MAD error messages While registering a mad agent, a user space can trigger various errors and flood the logs. Therefore, decrease verbosity and rate limit such error messages. While we are at it, use __func__ to print function name. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/mad.c | 72 ++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 35 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 43343c4e033e..c355379e7534 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -227,30 +227,30 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) { - dev_notice(&device->dev, - "ib_register_mad_agent: invalid QP Type %d\n", - qp_type); + dev_dbg_ratelimited(&device->dev, "%s: invalid QP Type %d\n", + __func__, qp_type); goto error1; } if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) { - dev_notice(&device->dev, - "ib_register_mad_agent: invalid RMPP Version %u\n", - rmpp_version); + dev_dbg_ratelimited(&device->dev, + "%s: invalid RMPP Version %u\n", + __func__, rmpp_version); goto error1; } /* Validate MAD registration request if supplied */ if (mad_reg_req) { if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) { - dev_notice(&device->dev, - "ib_register_mad_agent: invalid Class Version %u\n", - mad_reg_req->mgmt_class_version); + dev_dbg_ratelimited(&device->dev, + "%s: invalid Class Version %u\n", + __func__, + mad_reg_req->mgmt_class_version); goto error1; } if (!recv_handler) { - dev_notice(&device->dev, - "ib_register_mad_agent: no recv_handler\n"); + dev_dbg_ratelimited(&device->dev, + "%s: no recv_handler\n", __func__); goto error1; } if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { @@ -260,9 +260,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, */ if (mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid Mgmt Class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } else if (mad_reg_req->mgmt_class == 0) { @@ -270,8 +270,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, * Class 0 is reserved in IBA and is used for * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE */ - 
dev_notice(&device->dev, - "ib_register_mad_agent: Invalid Mgmt Class 0\n"); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid Mgmt Class 0\n", + __func__); goto error1; } else if (is_vendor_class(mad_reg_req->mgmt_class)) { /* @@ -279,18 +280,19 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, * ensure supplied OUI is not zero */ if (!is_vendor_oui(mad_reg_req->oui)) { - dev_notice(&device->dev, - "ib_register_mad_agent: No OUI specified for class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: No OUI specified for class 0x%x\n", + __func__, + mad_reg_req->mgmt_class); goto error1; } } /* Make sure class supplied is consistent with RMPP */ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { if (rmpp_version) { - dev_notice(&device->dev, - "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: RMPP version for non-RMPP class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } @@ -301,9 +303,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, IB_MGMT_CLASS_SUBN_LID_ROUTED) && (mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid SM QP type: class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } else { @@ -311,9 +313,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, IB_MGMT_CLASS_SUBN_LID_ROUTED) || (mad_reg_req->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid GS QP type: class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } @@ -328,18 +330,18 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Validate device and port */ port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid port %d\n", - port_num); + dev_dbg_ratelimited(&device->dev, "%s: Invalid port %d\n", + __func__, port_num); ret = ERR_PTR(-ENODEV); goto error1; } - /* Verify the QP requested is supported. For example, Ethernet devices - * will not have QP0 */ + /* Verify the QP requested is supported. For example, Ethernet devices + * will not have QP0. + */ if (!port_priv->qp_info[qpn].qp) { - dev_notice(&device->dev, - "ib_register_mad_agent: QP %d not supported\n", qpn); + dev_dbg_ratelimited(&device->dev, "%s: QP %d not supported\n", + __func__, qpn); ret = ERR_PTR(-EPROTONOSUPPORT); goto error1; } -- cgit v1.2.3 From 722c7b2bfeadbae8d9aaa08552c456e09d17a7f7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 14:45:32 +0300 Subject: RDMA/{cma, core}: Avoid callback on rdma_addr_cancel() Currently rdma_addr_cancel() is an async operation, which notifies that cancel is done by executing the callback function given during rdma_resolve_ip(). If the resolve_ip request has already completed, then the callback is not executed. Instead, rdma_resolve_addr() and rdma_addr_cancel() are now simplified in the following ways. 1. rdma_addr_cancel() is now a synchronous method. If a request was pending, after it is cancelled, no callback is notified. 2. rdma_resolve_addr() and the respective addr_handler() callback don't need to hold a reference to the cm_id.
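In practice a caller may now rely on cancellation being complete upon return; a sketch (the dev_addr member path mirrors how cma.c invokes the function):

    /* After rdma_addr_cancel() returns, addr_handler() is guaranteed not to
     * run for this request, so no extra cm_id reference is needed to keep
     * the id alive across a pending resolution.
     */
    rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);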
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 12 +++++++----- drivers/infiniband/core/cma.c | 4 ---- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 46b855a42884..94ff38731be8 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -660,6 +660,13 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, return addr_resolve(src_in, dst_addr, addr, false, 0); } +/** + * rdma_addr_cancel - Cancel resolve ip request + * @addr: Pointer to address structure given previously + * during rdma_resolve_ip(). + * rdma_addr_cancel() is synchronous function which cancels any pending + * request if there is any. + */ void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; @@ -687,11 +694,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) * guarentees no work is running and none will be started. */ cancel_delayed_work_sync(&found->work); - - if (found->callback) - found->callback(-ECANCELED, (struct sockaddr *)&found->src_addr, - found->addr, found->context); - kfree(found); } EXPORT_SYMBOL(rdma_addr_cancel); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index f72677291b69..4ba77f4e7098 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2880,13 +2880,11 @@ static void addr_handler(int status, struct sockaddr *src_addr, if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); return; } out: mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) @@ -2983,7 +2981,6 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, return -EINVAL; memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); - atomic_inc(&id_priv->refcount); if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); } else { @@ -3001,7 +2998,6 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); - cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); -- cgit v1.2.3 From 93688ddbe1da1ead030b210dadc5a8cfbff95849 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 15:08:41 +0300 Subject: RDMA/core: No need to protect kfree with spin lock and semaphore While unregistering a client, only context removal should be protected with lock. There is no need to protect a freeing of such context which is already removed from the list. 
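The resulting pattern is the standard unlink-then-free idiom; a minimal sketch using the names from the diff below:

    spin_lock_irqsave(&device->client_data_lock, flags);
    list_del(&found_context->list);         /* removal needs the lock */
    spin_unlock_irqrestore(&device->client_data_lock, flags);
    kfree(found_context);                   /* unreachable now; no lock needed */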
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 6d8ac51a39cc..9bc5ba2f488e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -700,9 +700,9 @@ void ib_unregister_client(struct ib_client *client) down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_del(&found_context->list); - kfree(found_context); spin_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); + kfree(found_context); } mutex_unlock(&device_mutex); -- cgit v1.2.3 From f7b65d9bf2db0e4b319c0676900c6c25398a449f Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 15:08:42 +0300 Subject: RDMA/core: Use simplified list_for_each While traversing client_data_list in following conditions, linked list is only read, no elements of the list are removed. Therefore, use list_for_each_entry(), instead of list_for_each_safe(). Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 9bc5ba2f488e..559fbe6a97c2 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -587,13 +587,12 @@ void ib_unregister_device(struct ib_device *device) down_write(&lists_rwsem); list_del(&device->core_list); spin_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + list_for_each_entry(context, &device->client_data_list, list) context->going_down = true; spin_unlock_irqrestore(&device->client_data_lock, flags); downgrade_write(&lists_rwsem); - list_for_each_entry_safe(context, tmp, &device->client_data_list, - list) { + list_for_each_entry(context, &device->client_data_list, list) { if (context->client->remove) context->client->remove(device, context->data); } @@ -663,7 +662,7 @@ EXPORT_SYMBOL(ib_register_client); */ void ib_unregister_client(struct ib_client *client) { - struct ib_client_data *context, *tmp; + struct ib_client_data *context; struct ib_device *device; unsigned long flags; @@ -678,7 +677,7 @@ void ib_unregister_client(struct ib_client *client) down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->going_down = true; found_context = context; -- cgit v1.2.3 From 4512acd0d34cea1bc0d9c69c1a60174016e121d7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 15:08:43 +0300 Subject: RDMA/core: Remove context entries from list while unregistering device While unregistering a device, remove the context elements from the list to not have any stale entries. With that any errors/bugs can be checked when device is freed. 
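With every context unlinked before it is freed, the device free path can assert the invariant; roughly what the hunks below add:

    /* in ib_dealloc_device(): any context still on the list is a bug */
    WARN_ON(!list_empty(&device->client_data_list));

    /* in ib_unregister_device(): the _safe variant is required because
     * entries are unlinked while iterating
     */
    list_for_each_entry_safe(context, tmp, &device->client_data_list, list) {
            list_del(&context->list);
            kfree(context);
    }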
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 559fbe6a97c2..81758477a882 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -286,6 +286,7 @@ EXPORT_SYMBOL(ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { + WARN_ON(!list_empty(&device->client_data_list)); WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && device->reg_state != IB_DEV_UNINITIALIZED); rdma_restrack_clean(&device->res); @@ -610,8 +611,11 @@ void ib_unregister_device(struct ib_device *device) down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + list_for_each_entry_safe(context, tmp, &device->client_data_list, + list) { + list_del(&context->list); kfree(context); + } spin_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); -- cgit v1.2.3 From 2d65f49ff961da5e974a48e250edd24b0c6f54d6 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 15:08:44 +0300 Subject: RDMA/core: Use simpler spin lock irq API from blocking context add_client_context(), ib_unregister_device() and ib_unregister_client() are designed to call from blocking context. There is no need to save and restore last interrupt state when called from such blocking context. Even though this is not a performance path, using the right spin lock API is desired for code clarity. To avoid checkpatch warning while removing flags, sizeof() is used. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 81758477a882..a51d16ab1329 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -297,9 +297,8 @@ EXPORT_SYMBOL(ib_dealloc_device); static int add_client_context(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; - unsigned long flags; - context = kmalloc(sizeof *context, GFP_KERNEL); + context = kmalloc(sizeof(*context), GFP_KERNEL); if (!context) return -ENOMEM; @@ -308,9 +307,9 @@ static int add_client_context(struct ib_device *device, struct ib_client *client context->going_down = false; down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + spin_lock_irq(&device->client_data_lock); list_add(&context->list, &device->client_data_list); - spin_unlock_irqrestore(&device->client_data_lock, flags); + spin_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); return 0; @@ -587,10 +586,10 @@ void ib_unregister_device(struct ib_device *device) down_write(&lists_rwsem); list_del(&device->core_list); - spin_lock_irqsave(&device->client_data_lock, flags); + spin_lock_irq(&device->client_data_lock); list_for_each_entry(context, &device->client_data_list, list) context->going_down = true; - spin_unlock_irqrestore(&device->client_data_lock, flags); + spin_unlock_irq(&device->client_data_lock); downgrade_write(&lists_rwsem); list_for_each_entry(context, &device->client_data_list, list) { @@ -668,7 +667,6 @@ 
void ib_unregister_client(struct ib_client *client) { struct ib_client_data *context; struct ib_device *device; - unsigned long flags; mutex_lock(&device_mutex); @@ -680,14 +678,14 @@ void ib_unregister_client(struct ib_client *client) struct ib_client_data *found_context = NULL; down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + spin_lock_irq(&device->client_data_lock); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->going_down = true; found_context = context; break; } - spin_unlock_irqrestore(&device->client_data_lock, flags); + spin_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); if (client->remove) @@ -701,9 +699,9 @@ void ib_unregister_client(struct ib_client *client) } down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + spin_lock_irq(&device->client_data_lock); list_del(&found_context->list); - spin_unlock_irqrestore(&device->client_data_lock, flags); + spin_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); kfree(found_context); } -- cgit v1.2.3 From e1f540c3ed0e9634d0f8c4600f3c85df8aff4ae2 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 15:08:45 +0300 Subject: RDMA/core: Define client_data_lock as rwlock instead of spinlock Even though device registration/unregistration and client registration/unregistration is not a performance path, define the client_data_lock as rwlock for code clarity. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 30 +++++++++++++++--------------- include/rdma/ib_verbs.h | 5 +++-- 2 files changed, 18 insertions(+), 17 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a51d16ab1329..a0939140ed3a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -270,7 +270,7 @@ struct ib_device *ib_alloc_device(size_t size) INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); - spin_lock_init(&device->client_data_lock); + rwlock_init(&device->client_data_lock); INIT_LIST_HEAD(&device->client_data_list); INIT_LIST_HEAD(&device->port_list); @@ -307,9 +307,9 @@ static int add_client_context(struct ib_device *device, struct ib_client *client context->going_down = false; down_write(&lists_rwsem); - spin_lock_irq(&device->client_data_lock); + write_lock_irq(&device->client_data_lock); list_add(&context->list, &device->client_data_list); - spin_unlock_irq(&device->client_data_lock); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); return 0; @@ -586,10 +586,10 @@ void ib_unregister_device(struct ib_device *device) down_write(&lists_rwsem); list_del(&device->core_list); - spin_lock_irq(&device->client_data_lock); + write_lock_irq(&device->client_data_lock); list_for_each_entry(context, &device->client_data_list, list) context->going_down = true; - spin_unlock_irq(&device->client_data_lock); + write_unlock_irq(&device->client_data_lock); downgrade_write(&lists_rwsem); list_for_each_entry(context, &device->client_data_list, list) { @@ -609,13 +609,13 @@ void ib_unregister_device(struct ib_device *device) kfree(device->port_pkey_list); down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + write_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, 
list) { list_del(&context->list); kfree(context); } - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); device->reg_state = IB_DEV_UNREGISTERED; @@ -678,14 +678,14 @@ void ib_unregister_client(struct ib_client *client) struct ib_client_data *found_context = NULL; down_write(&lists_rwsem); - spin_lock_irq(&device->client_data_lock); + write_lock_irq(&device->client_data_lock); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->going_down = true; found_context = context; break; } - spin_unlock_irq(&device->client_data_lock); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); if (client->remove) @@ -699,9 +699,9 @@ void ib_unregister_client(struct ib_client *client) } down_write(&lists_rwsem); - spin_lock_irq(&device->client_data_lock); + write_lock_irq(&device->client_data_lock); list_del(&found_context->list); - spin_unlock_irq(&device->client_data_lock); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); kfree(found_context); } @@ -724,13 +724,13 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client) void *ret = NULL; unsigned long flags; - spin_lock_irqsave(&device->client_data_lock, flags); + read_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { ret = context->data; break; } - spin_unlock_irqrestore(&device->client_data_lock, flags); + read_unlock_irqrestore(&device->client_data_lock, flags); return ret; } @@ -751,7 +751,7 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client, struct ib_client_data *context; unsigned long flags; - spin_lock_irqsave(&device->client_data_lock, flags); + write_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->data = data; @@ -762,7 +762,7 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client, device->name, client->name); out: - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irqrestore(&device->client_data_lock, flags); } EXPORT_SYMBOL(ib_set_client_data); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index ddc7c317e136..995f176d4782 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2256,10 +2256,11 @@ struct ib_device { struct list_head event_handler_list; spinlock_t event_handler_lock; - spinlock_t client_data_lock; + rwlock_t client_data_lock; struct list_head core_list; /* Access to the client_data_list is protected by the client_data_lock - * spinlock and the lists_rwsem read-write semaphore */ + * rwlock and the lists_rwsem read-write semaphore + */ struct list_head client_data_list; struct ib_cache cache; -- cgit v1.2.3 From 50704e039ab1d7e6c035d8c27a0b314929bfbe10 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 3 Sep 2018 20:17:31 +0300 Subject: RDMA/umem: Restore lockdep check while downgrading lock Lockdep engine handles correctly downgrade of locks and it simply incorrect to disable lockdep checks prior to calling mmu_notifier. Remove lockdep_off and ensure locks correctness. 
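For reference, downgrade_write() atomically converts a held write lock into a read lock, and lockdep models that transition correctly; a generic sketch, not taken from the patch:

    down_write(&sem);
    /* exclusive section: mutate the shared structure */
    downgrade_write(&sem);
    /* shared section: new readers may enter, writers remain excluded */
    up_read(&sem);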
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem_odp.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 6ec748eccff7..29e34e6a6420 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -431,13 +431,7 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, atomic_set(&context->notifier_count, 0); INIT_HLIST_NODE(&context->mn.hlist); context->mn.ops = &ib_umem_notifiers; - /* - * Lock-dep detects a false positive for mmap_sem vs. - * umem_rwsem, due to not grasping downgrade_write correctly. - */ - lockdep_off(); ret_val = mmu_notifier_register(&context->mn, mm); - lockdep_on(); if (ret_val) { pr_err("Failed to register mmu_notifier %d\n", ret_val); ret_val = -EBUSY; -- cgit v1.2.3 From c715a39541bb399eb03d728a996b224d90ce1336 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 6 Sep 2018 10:55:31 +0300 Subject: RDMA/core: Follow correct unregister order between sysfs and cgroup During register_device() init sequence is, (a) register with rdma cgroup followed by (b) register with sysfs Therefore, unregister_device() sequence should follow the reverse order. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a0939140ed3a..e1155067954b 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -598,8 +598,8 @@ void ib_unregister_device(struct ib_device *device) } up_read(&lists_rwsem); - ib_device_unregister_rdmacg(device); ib_device_unregister_sysfs(device); + ib_device_unregister_rdmacg(device); mutex_unlock(&device_mutex); -- cgit v1.2.3 From 273993509f05623934dda14a56237738149b2906 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 6 Sep 2018 10:58:57 +0300 Subject: RDMA/core: Assign device ifindex before publishing the device Even though device->ifindex is assigned before adding the device in the list which is read by netlink flow, it is better to assign rdma device index before publishing the device in the system to users and clients. 
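The rule being enforced, sketched with the names from the diff below: complete all initialization first, publish to readers last.

    device->index = __dev_new_index();              /* initialize first ... */

    down_write(&lists_rwsem);
    list_add_tail(&device->core_list, &device_list); /* ... publish last */
    up_write(&lists_rwsem);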
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index e1155067954b..5a680a88aa87 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -525,6 +525,8 @@ int ib_register_device(struct ib_device *device, goto port_cleanup; } + device->index = __dev_new_index(); + ret = ib_device_register_rdmacg(device); if (ret) { pr_warn("Couldn't register device with rdma cgroup\n"); @@ -551,7 +553,6 @@ int ib_register_device(struct ib_device *device, if (!add_client_context(device, client) && client->add) client->add(device); - device->index = __dev_new_index(); down_write(&lists_rwsem); list_add_tail(&device->core_list, &device_list); up_write(&lists_rwsem); -- cgit v1.2.3 From 70cd20aed00f719f3536154df02596106e431e45 Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Thu, 6 Sep 2018 17:27:01 +0300 Subject: IB/uverbs: Add IDRs array attribute type to ioctl() interface Methods sometimes need to get a flexible set of IDRs and not a strict set as can be achieved today by the conventional IDR attribute. Add a new IDRS_ARRAY attribute to the generic uverbs ioctl layer. IDRS_ARRAY points to array of idrs of the same object type and same access rights, only write and read are supported. Signed-off-by: Guy Levi Signed-off-by: Jason Gunthorpe `` Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 114 +++++++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_uapi.c | 12 ++++ include/rdma/uverbs_ioctl.h | 71 ++++++++++++++++++- include/uapi/rdma/rdma_user_ioctl_cmds.h | 7 +- 4 files changed, 201 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 4bafd4671de2..0e95a5888274 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -57,6 +57,7 @@ struct bundle_priv { struct ib_uverbs_attr *uattrs; DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN); + DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN); /* * Must be last. bundle ends in a flex array which overlaps @@ -143,6 +144,86 @@ static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, 0, uattr->len - len); } +static int uverbs_process_idrs_array(struct bundle_priv *pbundle, + const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + struct ib_uverbs_attr *uattr, + u32 attr_bkey) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + size_t array_len; + u32 *idr_vals; + int ret = 0; + size_t i; + + if (uattr->attr_data.reserved) + return -EINVAL; + + if (uattr->len % sizeof(u32)) + return -EINVAL; + + array_len = uattr->len / sizeof(u32); + if (array_len < spec->u2.objs_arr.min_len || + array_len > spec->u2.objs_arr.max_len) + return -EINVAL; + + attr->uobjects = + uverbs_alloc(&pbundle->bundle, + array_size(array_len, sizeof(*attr->uobjects))); + if (IS_ERR(attr->uobjects)) + return PTR_ERR(attr->uobjects); + + /* + * Since idr is 4B and *uobjects is >= 4B, we can use attr->uobjects + * to store idrs array and avoid additional memory allocation. The + * idrs array is offset to the end of the uobjects array so we will be + * able to read idr and replace with a pointer. 
+ */ + idr_vals = (u32 *)(attr->uobjects + array_len) - array_len; + + if (uattr->len > sizeof(uattr->data)) { + ret = copy_from_user(idr_vals, u64_to_user_ptr(uattr->data), + uattr->len); + if (ret) + return -EFAULT; + } else { + memcpy(idr_vals, &uattr->data, uattr->len); + } + + for (i = 0; i != array_len; i++) { + attr->uobjects[i] = uverbs_get_uobject_from_file( + spec->u2.objs_arr.obj_type, pbundle->bundle.ufile, + spec->u2.objs_arr.access, idr_vals[i]); + if (IS_ERR(attr->uobjects[i])) { + ret = PTR_ERR(attr->uobjects[i]); + break; + } + } + + attr->len = i; + __set_bit(attr_bkey, pbundle->spec_finalize); + return ret; +} + +static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + bool commit) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + int current_ret; + int ret = 0; + size_t i; + + for (i = 0; i != attr->len; i++) { + current_ret = uverbs_finalize_object( + attr->uobjects[i], spec->u2.objs_arr.access, commit); + if (!ret) + ret = current_ret; + } + + return ret; +} + static int uverbs_process_attr(struct bundle_priv *pbundle, const struct uverbs_api_attr *attr_uapi, struct ib_uverbs_attr *uattr, u32 attr_bkey) @@ -246,6 +327,11 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, } break; + + case UVERBS_ATTR_TYPE_IDRS_ARRAY: + return uverbs_process_idrs_array(pbundle, attr_uapi, + &e->objs_arr_attr, uattr, + attr_bkey); default: return -EOPNOTSUPP; } @@ -384,6 +470,7 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) unsigned int i; int ret = 0; + /* fast path for simple uobjects */ i = -1; while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len, i + 1)) < key_bitmap_len) { @@ -397,6 +484,32 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) ret = current_ret; } + i = -1; + while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len, + i + 1)) < key_bitmap_len) { + struct uverbs_attr *attr = &pbundle->bundle.attrs[i]; + const struct uverbs_api_attr *attr_uapi; + void __rcu **slot; + int current_ret; + + slot = uapi_get_attr_for_method( + pbundle, + pbundle->method_key | uapi_bkey_to_key_attr(i)); + if (WARN_ON(!slot)) + continue; + + attr_uapi = srcu_dereference( + *slot, + &pbundle->bundle.ufile->device->disassociate_srcu); + + if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { + current_ret = uverbs_free_idrs_array( + attr_uapi, &attr->objs_arr_attr, commit); + if (!ret) + ret = current_ret; + } + } + for (memblock = pbundle->allocated_mem; memblock;) { struct bundle_alloc_head *tmp = memblock; @@ -461,6 +574,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, memset(pbundle->bundle.attr_present, 0, sizeof(pbundle->bundle.attr_present)); memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); + memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize)); ret = ib_uverbs_run_method(pbundle, hdr->num_attrs); destroy_ret = bundle_destroy(pbundle, ret == 0); diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 73ea6f0db88f..cdf5ced2c84f 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -73,6 +73,18 @@ static int uapi_merge_method(struct uverbs_api *uapi, if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN) method_elm->driver_method |= is_driver; + /* + * Like other uobject based things we only support a single + * uobject being NEW'd or DESTROY'd + */ + if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { + u8 access = 
attr->attr.u2.objs_arr.access; + + if (WARN_ON(access == UVERBS_ACCESS_NEW || + access == UVERBS_ACCESS_DESTROY)) + return -EINVAL; + } + attr_slot = uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id), sizeof(*attr_slot)); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index fc2e52234a2a..84d3d15f1f38 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -52,6 +52,7 @@ enum uverbs_attr_type { UVERBS_ATTR_TYPE_IDR, UVERBS_ATTR_TYPE_FD, UVERBS_ATTR_TYPE_ENUM_IN, + UVERBS_ATTR_TYPE_IDRS_ARRAY, }; enum uverbs_obj_access { @@ -101,7 +102,7 @@ struct uverbs_attr_spec { } enum_def; } u; - /* This weird split of the enum lets us remove some padding */ + /* This weird split lets us remove some padding */ union { struct { /* @@ -111,6 +112,17 @@ struct uverbs_attr_spec { */ const struct uverbs_attr_spec *ids; } enum_def; + + struct { + /* + * higher bits mean the namespace and lower bits mean + * the type id within the namespace. + */ + u16 obj_type; + u16 min_len; + u16 max_len; + u8 access; + } objs_arr; } u2; }; @@ -251,6 +263,11 @@ static inline __attribute_const__ u32 uapi_bkey_attr(u32 attr_key) return attr_key - 1; } +static inline __attribute_const__ u32 uapi_bkey_to_key_attr(u32 attr_bkey) +{ + return attr_bkey + 1; +} + /* * ======================================= * Verbs definitions @@ -323,6 +340,27 @@ struct uverbs_object_tree_def { #define UA_MANDATORY .mandatory = 1 #define UA_OPTIONAL .mandatory = 0 +/* + * min_len must be bigger than 0 and _max_len must be smaller than 4095. Only + * READ\WRITE accesses are supported. + */ +#define UVERBS_ATTR_IDRS_ARR(_attr_id, _idr_type, _access, _min_len, _max_len, \ + ...) \ + (&(const struct uverbs_attr_def){ \ + .id = (_attr_id) + \ + BUILD_BUG_ON_ZERO((_min_len) == 0 || \ + (_max_len) > \ + PAGE_SIZE / sizeof(void *) || \ + (_min_len) > (_max_len) || \ + (_access) == UVERBS_ACCESS_NEW || \ + (_access) == UVERBS_ACCESS_DESTROY), \ + .attr = { .type = UVERBS_ATTR_TYPE_IDRS_ARRAY, \ + .u2.objs_arr.obj_type = _idr_type, \ + .u2.objs_arr.access = _access, \ + .u2.objs_arr.min_len = _min_len, \ + .u2.objs_arr.max_len = _max_len, \ + __VA_ARGS__ } }) + #define UVERBS_ATTR_IDR(_attr_id, _idr_type, _access, ...) \ (&(const struct uverbs_attr_def){ \ .id = _attr_id, \ @@ -440,10 +478,16 @@ struct uverbs_obj_attr { const struct uverbs_api_attr *attr_elm; }; +struct uverbs_objs_arr_attr { + struct ib_uobject **uobjects; + u16 len; +}; + struct uverbs_attr { union { struct uverbs_ptr_attr ptr_attr; struct uverbs_obj_attr obj_attr; + struct uverbs_objs_arr_attr objs_arr_attr; }; }; @@ -516,6 +560,31 @@ uverbs_attr_get_len(const struct uverbs_attr_bundle *attrs_bundle, u16 idx) return attr->ptr_attr.len; } +/** + * uverbs_attr_get_uobjs_arr() - Provides array's properties for attribute for + * UVERBS_ATTR_TYPE_IDRS_ARRAY. + * @arr: Returned pointer to array of pointers for uobjects or NULL if + * the attribute isn't provided. + * + * Return: The array length or 0 if no attribute was provided. 
+ */ +static inline int uverbs_attr_get_uobjs_arr( + const struct uverbs_attr_bundle *attrs_bundle, u16 attr_idx, + struct ib_uobject ***arr) +{ + const struct uverbs_attr *attr = + uverbs_attr_get(attrs_bundle, attr_idx); + + if (IS_ERR(attr)) { + *arr = NULL; + return 0; + } + + *arr = attr->objs_arr_attr.uobjects; + + return attr->objs_arr_attr.len; +} + static inline bool uverbs_attr_ptr_is_inline(const struct uverbs_attr *attr) { return attr->ptr_attr.len <= sizeof(attr->ptr_attr.data); diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 24800c6c1f32..06c34d99be85 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -53,7 +53,7 @@ enum { struct ib_uverbs_attr { __u16 attr_id; /* command specific type attribute */ - __u16 len; /* only for pointers */ + __u16 len; /* only for pointers and IDRs array */ __u16 flags; /* combination of UVERBS_ATTR_F_XXXX */ union { struct { @@ -63,7 +63,10 @@ struct ib_uverbs_attr { __u16 reserved; } attr_data; union { - /* Used by PTR_IN/OUT, ENUM_IN and IDR */ + /* + * ptr to command, inline data, idr/fd or + * ptr to __u32 array of IDRs + */ __aligned_u64 data; /* Used by FD_IN and FD_OUT */ __s64 data_s64; -- cgit v1.2.3 From 86e1d464a8ccd627b6ea3e9a98a0389b0d27fd1f Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 6 Sep 2018 17:27:02 +0300 Subject: RDMA/uverbs: Move flow resources initialization Use ib_set_flow() when initializing flow related resources. Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 6 ------ drivers/infiniband/core/uverbs_cmd.c | 19 ++----------------- drivers/infiniband/hw/mlx5/flow.c | 2 +- include/rdma/ib_verbs.h | 14 -------------- include/rdma/uverbs_std_types.h | 33 +++++++++++++++++++++++++++++++++ 5 files changed, 36 insertions(+), 38 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 7199c275ab79..717ab35b0af9 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -219,12 +219,6 @@ struct ib_ucq_object { u32 async_events_reported; }; -struct ib_uflow_resources; -struct ib_uflow_object { - struct ib_uobject uobject; - struct ib_uflow_resources *resources; -}; - extern const struct file_operations uverbs_event_fops; void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue); struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file, diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4b72851ade24..c054d65dec1b 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2747,15 +2747,6 @@ out_put: return ret ? 
ret : in_len; } -struct ib_uflow_resources { - size_t max; - size_t num; - size_t collection_num; - size_t counters_num; - struct ib_counters **counters; - struct ib_flow_action **collection; -}; - static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) { struct ib_uflow_resources *resources; @@ -3462,7 +3453,6 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; struct ib_uobject *uobj; - struct ib_uflow_object *uflow; struct ib_flow *flow_id; struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; @@ -3601,13 +3591,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, err = PTR_ERR(flow_id); goto err_free; } - atomic_inc(&qp->usecnt); - flow_id->qp = qp; - flow_id->device = qp->device; - flow_id->uobject = uobj; - uobj->object = flow_id; - uflow = container_of(uobj, typeof(*uflow), uobject); - uflow->resources = uflow_res; + + ib_set_flow(uobj, flow_id, qp, qp->device, uflow_res); memset(&resp, 0, sizeof(resp)); resp.flow_handle = uobj->id; diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 5750a650884e..12abbc02af99 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -128,7 +128,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( if (IS_ERR(flow_handler)) return PTR_ERR(flow_handler); - ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev); + ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, NULL); return 0; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index f687faadf33b..6076c9b72ab9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4162,20 +4162,6 @@ ib_get_vector_affinity(struct ib_device *device, int comp_vector) } -static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, - struct ib_qp *qp, struct ib_device *device) -{ - uobj->object = ibflow; - ibflow->uobject = uobj; - - if (qp) { - atomic_inc(&qp->usecnt); - ibflow->qp = qp; - } - - ibflow->device = device; -} - /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. 
diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 526d918fcd5a..dfd6d35f1783 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -152,5 +152,38 @@ static inline void uverbs_flow_action_fill_action(struct ib_flow_action *action, uobj->object = action; } +struct ib_uflow_resources { + size_t max; + size_t num; + size_t collection_num; + size_t counters_num; + struct ib_counters **counters; + struct ib_flow_action **collection; +}; + +struct ib_uflow_object { + struct ib_uobject uobject; + struct ib_uflow_resources *resources; +}; + +static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, + struct ib_qp *qp, struct ib_device *device, + struct ib_uflow_resources *uflow_res) +{ + struct ib_uflow_object *uflow; + + uobj->object = ibflow; + ibflow->uobject = uobj; + + if (qp) { + atomic_inc(&qp->usecnt); + ibflow->qp = qp; + } + + ibflow->device = device; + uflow = container_of(uobj, typeof(*uflow), uobject); + uflow->resources = uflow_res; +} + #endif -- cgit v1.2.3 From fa76d24ee0aa24fff3fa9ba71fc2179fb88fef6a Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 6 Sep 2018 17:27:06 +0300 Subject: RDMA/mlx5: Add flow actions support to raw create flow Support attaching flow actions to a flow rule via raw create flow. For now only the NIC RX path is supported. This change requires exporting the flow resources management functions so we can maintain proper bookkeeping of flow actions. Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 11 +++++---- drivers/infiniband/hw/mlx5/flow.c | 40 ++++++++++++++++++++++++++++---- include/rdma/uverbs_std_types.h | 6 +++++ include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + 4 files changed, 50 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index c054d65dec1b..9c87c98a0f19 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2747,7 +2747,7 @@ out_put: return ret ?
ret : in_len; } -static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) +struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) { struct ib_uflow_resources *resources; @@ -2777,6 +2777,7 @@ err: return NULL; } +EXPORT_SYMBOL(flow_resources_alloc); void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) { @@ -2795,10 +2796,11 @@ void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) kfree(uflow_res->counters); kfree(uflow_res); } +EXPORT_SYMBOL(ib_uverbs_flow_resources_free); -static void flow_resources_add(struct ib_uflow_resources *uflow_res, - enum ib_flow_spec_type type, - void *ibobj) +void flow_resources_add(struct ib_uflow_resources *uflow_res, + enum ib_flow_spec_type type, + void *ibobj) { WARN_ON(uflow_res->num >= uflow_res->max); @@ -2819,6 +2821,7 @@ static void flow_resources_add(struct ib_uflow_resources *uflow_res, uflow_res->num++; } +EXPORT_SYMBOL(flow_resources_add); static int kern_spec_to_ib_spec_action(struct ib_uverbs_file *ufile, struct ib_uverbs_flow_spec *kern_spec, diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 0e913491d139..ce9276a2aaa5 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -58,12 +58,15 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { }, }; +#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2 static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; struct mlx5_ib_flow_handler *flow_handler; struct mlx5_ib_flow_matcher *fs_matcher; + struct ib_uobject **arr_flow_actions; + struct ib_uflow_resources *uflow_res; void *devx_obj; int dest_id, dest_type; void *cmd_in; @@ -73,6 +76,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); struct mlx5_ib_dev *dev = to_mdev(uobj->context->device); + int len, ret, i; if (!capable(CAP_NET_RAW)) return -EPERM; @@ -124,15 +128,38 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); fs_matcher = uverbs_attr_get_obj(attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCHER); + + uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS); + if (!uflow_res) + return -ENOMEM; + + len = uverbs_attr_get_uobjs_arr(attrs, + MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions); + for (i = 0; i < len; i++) { + struct mlx5_ib_flow_action *maction = + to_mflow_act(arr_flow_actions[i]->object); + + ret = parse_flow_flow_action(maction, false, &flow_act); + if (ret) + goto err_out; + flow_resources_add(uflow_res, IB_FLOW_SPEC_ACTION_HANDLE, + arr_flow_actions[i]->object); + } + flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act, cmd_in, inlen, dest_id, dest_type); - if (IS_ERR(flow_handler)) - return PTR_ERR(flow_handler); + if (IS_ERR(flow_handler)) { + ret = PTR_ERR(flow_handler); + goto err_out; + } - ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, NULL); + ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, uflow_res); return 0; +err_out: + ib_uverbs_flow_resources_free(uflow_res); + return ret; } static int flow_matcher_cleanup(struct ib_uobject *uobject, @@ -459,7 +486,12 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ACCESS_READ), UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, MLX5_IB_OBJECT_DEVX_OBJ, - UVERBS_ACCESS_READ)); + UVERBS_ACCESS_READ), + 
UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_READ, 1, + MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS, + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_DESTROY_FLOW, diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index dfd6d35f1783..3db2802fbc68 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -166,6 +166,12 @@ struct ib_uflow_object { struct ib_uflow_resources *resources; }; +struct ib_uflow_resources *flow_resources_alloc(size_t num_specs); +void flow_resources_add(struct ib_uflow_resources *uflow_res, + enum ib_flow_spec_type type, + void *ibobj); +void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res); + static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, struct ib_qp *qp, struct ib_device *device, struct ib_uflow_resources *uflow_res) diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 75c7093fd95b..91c3d42ebd0f 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -155,6 +155,7 @@ enum mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_DEST_QP, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, MLX5_IB_ATTR_CREATE_FLOW_MATCHER, + MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, }; enum mlx5_ib_destoy_flow_attrs { -- cgit v1.2.3 From caf1e3ae9fa648d6dd38468736868d6867cab273 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:16 +0300 Subject: RDMA/core: Introduce and use rdma_find_ndev_for_src_ip_rcu This fixes two issues: 1. When the address family is other than IPv4 or IPv6, rdma_translate_ip() returns success, which is incorrect. 2. When the address family is AF_INET6 and the source address is not found, it returns success, which is also incorrect. Therefore, introduce and use the rdma_find_ndev_for_src_ip_rcu() helper function, which returns a correct success or error status and is also useful for future code refactoring in addr_resolve(). Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 61 ++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 26 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 94ff38731be8..50ab50f1908b 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -232,6 +232,36 @@ void rdma_copy_addr(struct rdma_dev_addr *dev_addr, } EXPORT_SYMBOL(rdma_copy_addr); +static struct net_device * +rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in) +{ + struct net_device *dev = NULL; + int ret = -EADDRNOTAVAIL; + + switch (src_in->sa_family) { + case AF_INET: + dev = __ip_dev_find(net, + ((const struct sockaddr_in *)src_in)->sin_addr.s_addr, + false); + if (dev) + ret = 0; + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + for_each_netdev_rcu(net, dev) { + if (ipv6_chk_addr(net, + &((const struct sockaddr_in6 *)src_in)->sin6_addr, + dev, 1)) { + ret = 0; + break; + } + } + break; +#endif + } + return ret ?
ERR_PTR(ret) : dev; +} + int rdma_translate_ip(const struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { @@ -246,33 +276,12 @@ int rdma_translate_ip(const struct sockaddr *addr, return 0; } - switch (addr->sa_family) { - case AF_INET: - dev = ip_dev_find(dev_addr->net, - ((const struct sockaddr_in *)addr)->sin_addr.s_addr); - - if (!dev) - return -EADDRNOTAVAIL; - + rcu_read_lock(); + dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr); + if (!IS_ERR(dev)) rdma_copy_addr(dev_addr, dev, NULL); - dev_put(dev); - break; -#if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - rcu_read_lock(); - for_each_netdev_rcu(dev_addr->net, dev) { - if (ipv6_chk_addr(dev_addr->net, - &((const struct sockaddr_in6 *)addr)->sin6_addr, - dev, 1)) { - rdma_copy_addr(dev_addr, dev, NULL); - break; - } - } - rcu_read_unlock(); - break; -#endif - } - return 0; + rcu_read_unlock(); + return PTR_ERR_OR_ZERO(dev); } EXPORT_SYMBOL(rdma_translate_ip); -- cgit v1.2.3 From f89b7dfa33537bba9ee082a17a55242fc727e9f4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:17 +0300 Subject: RDMA/core: Avoid unnecessary sa_family overwrite addr4_resolve() and addr6_resolve() are called based on the value of sa_family. Both functions overwrite the value after typecasting; this is not necessary. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 50ab50f1908b..858ceffbeeaa 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -392,7 +392,6 @@ static int addr4_resolve(struct sockaddr_in *src_in, if (ret) return ret; - src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're @@ -429,10 +428,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, return ret; rt = (struct rt6_info *)dst; - if (ipv6_addr_any(&src_in->sin6_addr)) { - src_in->sin6_family = AF_INET6; + if (ipv6_addr_any(&src_in->sin6_addr)) src_in->sin6_addr = fl6.saddr; - } /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network -- cgit v1.2.3 From 89c5691cdd95ab39f43bd102ec3f0ff39716ae85 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:18 +0300 Subject: RDMA/core: Let protocol specific function typecast sockaddr structure The current code typecasts the destination address using an extra variable but uses the source address as-is. Even though the compiler optimizes such code well, let each protocol-specific function typecast both the src and dest addresses so the code stays symmetric.
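As an illustration only, the resulting pattern looks roughly like the sketch below; the example_* names are hypothetical and not kernel APIs:

    /* Each per-family helper takes plain sockaddr pointers and does its
     * own casts for both src and dst; the dispatcher stays cast-free.
     */
    static int example_addr4(struct sockaddr *src_sock,
                             const struct sockaddr *dst_sock)
    {
            struct sockaddr_in *src_in = (struct sockaddr_in *)src_sock;
            const struct sockaddr_in *dst_in =
                    (const struct sockaddr_in *)dst_sock;

            /* e.g. default a zero source to the destination address */
            if (!src_in->sin_addr.s_addr)
                    src_in->sin_addr.s_addr = dst_in->sin_addr.s_addr;
            return 0;
    }

    static int example_resolve(struct sockaddr *src,
                               const struct sockaddr *dst)
    {
            if (src->sa_family == AF_INET)
                    return example_addr4(src, dst);
            return -EAFNOSUPPORT; /* v6 helper omitted from this sketch */
    }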
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 858ceffbeeaa..9649e5e55e9e 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -372,11 +372,15 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, return dst_fetch_ha(dst, dev_addr, daddr); } -static int addr4_resolve(struct sockaddr_in *src_in, - const struct sockaddr_in *dst_in, +static int addr4_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct rtable **prt) { + struct sockaddr_in *src_in = (struct sockaddr_in *)src_sock; + const struct sockaddr_in *dst_in = + (const struct sockaddr_in *)dst_sock; + __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; struct rtable *rt; @@ -408,11 +412,14 @@ static int addr4_resolve(struct sockaddr_in *src_in, } #if IS_ENABLED(CONFIG_IPV6) -static int addr6_resolve(struct sockaddr_in6 *src_in, - const struct sockaddr_in6 *dst_in, +static int addr6_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct dst_entry **pdst) { + struct sockaddr_in6 *src_in = (struct sockaddr_in6 *)src_sock; + const struct sockaddr_in6 *dst_in = + (const struct sockaddr_in6 *)dst_sock; struct flowi6 fl6; struct dst_entry *dst; struct rt6_info *rt; @@ -445,8 +452,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, return 0; } #else -static int addr6_resolve(struct sockaddr_in6 *src_in, - const struct sockaddr_in6 *dst_in, +static int addr6_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct dst_entry **pdst) { @@ -496,11 +503,8 @@ static int addr_resolve(struct sockaddr *src_in, if (src_in->sa_family == AF_INET) { struct rtable *rt = NULL; - const struct sockaddr_in *dst_in4 = - (const struct sockaddr_in *)dst_in; - ret = addr4_resolve((struct sockaddr_in *)src_in, - dst_in4, addr, &rt); + ret = addr4_resolve(src_in, dst_in, addr, &rt); if (ret) return ret; @@ -516,12 +520,7 @@ static int addr_resolve(struct sockaddr *src_in, ip_rt_put(rt); } else { - const struct sockaddr_in6 *dst_in6 = - (const struct sockaddr_in6 *)dst_in; - - ret = addr6_resolve((struct sockaddr_in6 *)src_in, - dst_in6, addr, - &dst); + ret = addr6_resolve(src_in, dst_in, addr, &dst); if (ret) return ret; -- cgit v1.2.3 From a362ea1d9e1acf674094614518f4245d17cfc01e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:19 +0300 Subject: RDMA/core: Introduce and use rdma_set_src_addr() between IPv4 and IPv6 rdma_translate_ip() is called while resolving the address for loopback addresses. The current flow is convoluted because resolving the neighbor is optional. This patch simplifies the code in the following ways. (a) Use common code between IPv4 and IPv6 for address translation, loopback checks and acquiring the netdevice. (b) During neigh resolve in addr_resolve_neigh(), only copy the destination address. (c) Always resolve the source address before the destination address, because it doesn't depend on whether resolving the neigh was requested. This helps reduce three calls of rdma_copy_addr() and rdma_translate_ip() to one and makes it easier to follow the code flow.
Now that ib_nl_fetch_ha() doesn't depend on dst, drop dst argument from ib_nl_fetch_ha(). Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 65 +++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 42 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9649e5e55e9e..40f1c1563477 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -304,15 +304,12 @@ static void queue_req(struct addr_req *req) spin_unlock_bh(&lock); } -static int ib_nl_fetch_ha(const struct dst_entry *dst, - struct rdma_dev_addr *dev_addr, +static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; - /* We fill in what we can, the response will fill the rest */ - rdma_copy_addr(dev_addr, dst->dev, NULL); return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); } @@ -331,7 +328,7 @@ static int dst_fetch_ha(const struct dst_entry *dst, neigh_event_send(n, NULL); ret = -ENODATA; } else { - rdma_copy_addr(dev_addr, dst->dev, n->ha); + memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN); } neigh_release(n); @@ -367,7 +364,7 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, /* Gateway + ARPHRD_INFINIBAND -> IB router */ if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND) - return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family); + return ib_nl_fetch_ha(dev_addr, daddr, seq, family); else return dst_fetch_ha(dst, dev_addr, daddr); } @@ -467,32 +464,37 @@ static int addr_resolve_neigh(const struct dst_entry *dst, u32 seq) { if (dst->dev->flags & IFF_LOOPBACK) { - int ret; - - ret = rdma_translate_ip(dst_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, - MAX_ADDR_LEN); - - return ret; + memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); + return 0; } /* If the device doesn't do ARP internally */ if (!(dst->dev->flags & IFF_NOARP)) return fetch_ha(dst, addr, dst_in, seq); - rdma_copy_addr(addr, dst->dev, NULL); - return 0; } +static int rdma_set_src_addr(const struct dst_entry *dst, + const struct sockaddr *dst_in, + struct rdma_dev_addr *dev_addr) +{ + int ret = 0; + + if (dst->dev->flags & IFF_LOOPBACK) + ret = rdma_translate_ip(dst_in, dev_addr); + else + rdma_copy_addr(dev_addr, dst->dev, NULL); + return ret; +} + static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, bool resolve_neigh, u32 seq) { - struct net_device *ndev; + struct rtable *rt = NULL; struct dst_entry *dst; int ret; @@ -502,49 +504,28 @@ static int addr_resolve(struct sockaddr *src_in, } if (src_in->sa_family == AF_INET) { - struct rtable *rt = NULL; ret = addr4_resolve(src_in, dst_in, addr, &rt); if (ret) return ret; - if (resolve_neigh) + ret = rdma_set_src_addr(&rt->dst, dst_in, addr); + if (!ret && resolve_neigh) ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq); - if (addr->bound_dev_if) { - ndev = dev_get_by_index(addr->net, addr->bound_dev_if); - } else { - ndev = rt->dst.dev; - dev_hold(ndev); - } - ip_rt_put(rt); } else { ret = addr6_resolve(src_in, dst_in, addr, &dst); if (ret) return ret; - if (resolve_neigh) + ret = rdma_set_src_addr(dst, dst_in, addr); + if (!ret && resolve_neigh) ret = addr_resolve_neigh(dst, dst_in, addr, seq); - if (addr->bound_dev_if) { - ndev = 
dev_get_by_index(addr->net, addr->bound_dev_if); - } else { - ndev = dst->dev; - dev_hold(ndev); - } - dst_release(dst); } - if (ndev) { - if (ndev->flags & IFF_LOOPBACK) - ret = rdma_translate_ip(dst_in, addr); - else - addr->bound_dev_if = ndev->ifindex; - dev_put(ndev); - } - return ret; } -- cgit v1.2.3 From 77addc524473ee9a85d2ef5747a32173c85768d4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:20 +0300 Subject: RDMA/core: Rename rdma_copy_addr to rdma_copy_src_l2_addr Now that rdma_copy_addr() only copies the source addresses and all callers are interested in copying only source addresses, simplify it to drop the destination address argument. Given that it only copies source layer2 addresses, rename it to rdma_copy_src_l2_addr for better code readability. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 24 +++++++++++++++--------- drivers/infiniband/core/cma.c | 4 ++-- drivers/infiniband/core/core_priv.h | 2 ++ include/rdma/ib_addr.h | 4 ---- 4 files changed, 19 insertions(+), 15 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 40f1c1563477..c9d14d6996b2 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -219,18 +219,24 @@ int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr) } EXPORT_SYMBOL(rdma_addr_size_kss); -void rdma_copy_addr(struct rdma_dev_addr *dev_addr, - const struct net_device *dev, - const unsigned char *dst_dev_addr) +/** + * rdma_copy_src_l2_addr - Copy netdevice source addresses + * @dev_addr: Destination address pointer where to copy the addresses + * @dev: Netdevice whose source addresses to copy + * + * rdma_copy_src_l2_addr() copies source addresses from the specified netdevice. + * This includes unicast address, broadcast address, device type and + * interface index. 
+ */ +void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev) { dev_addr->dev_type = dev->type; memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); - if (dst_dev_addr) - memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); dev_addr->bound_dev_if = dev->ifindex; } -EXPORT_SYMBOL(rdma_copy_addr); +EXPORT_SYMBOL(rdma_copy_src_l2_addr); static struct net_device * rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in) @@ -271,7 +277,7 @@ int rdma_translate_ip(const struct sockaddr *addr, dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!dev) return -ENODEV; - rdma_copy_addr(dev_addr, dev, NULL); + rdma_copy_src_l2_addr(dev_addr, dev); dev_put(dev); return 0; } @@ -279,7 +285,7 @@ int rdma_translate_ip(const struct sockaddr *addr, rcu_read_lock(); dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr); if (!IS_ERR(dev)) - rdma_copy_addr(dev_addr, dev, NULL); + rdma_copy_src_l2_addr(dev_addr, dev); rcu_read_unlock(); return PTR_ERR_OR_ZERO(dev); } @@ -484,7 +490,7 @@ static int rdma_set_src_addr(const struct dst_entry *dst, if (dst->dev->flags & IFF_LOOPBACK) ret = rdma_translate_ip(dst_in, dev_addr); else - rdma_copy_addr(dev_addr, dst->dev, NULL); + rdma_copy_src_l2_addr(dev_addr, dst->dev); return ret; } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 4ba77f4e7098..ace2a4c757f6 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1900,7 +1900,7 @@ cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { - rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL); + rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { @@ -1950,7 +1950,7 @@ cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, goto err; if (net_dev) { - rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL); + rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 77c7005c396c..c3d93350413c 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -340,5 +340,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, u8 *dmac, const struct net_device *ndev, int *hoplimit); +void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev); #endif /* _CORE_PRIV_H */ diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 77c7908b7d73..676514a930ab 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -105,10 +105,6 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, void rdma_addr_cancel(struct rdma_dev_addr *addr); -void rdma_copy_addr(struct rdma_dev_addr *dev_addr, - const struct net_device *dev, - const unsigned char *dst_dev_addr); - int rdma_addr_size(const struct sockaddr *addr); int rdma_addr_size_in6(struct sockaddr_in6 *addr); int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr); -- cgit v1.2.3 From 783793b5543d3b886f0704803198feeb058cccab Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:21 +0300 Subject: RDMA/core: Use common code flow for IPv4/6 for addr resolve Use common code flow for 
resolving neighbour and for finding source addresses. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index c9d14d6996b2..cbc64de2d791 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -500,8 +500,8 @@ static int addr_resolve(struct sockaddr *src_in, bool resolve_neigh, u32 seq) { + struct dst_entry *dst = NULL; struct rtable *rt = NULL; - struct dst_entry *dst; int ret; if (!addr->net) { @@ -510,28 +510,26 @@ static int addr_resolve(struct sockaddr *src_in, } if (src_in->sa_family == AF_INET) { - ret = addr4_resolve(src_in, dst_in, addr, &rt); - if (ret) - return ret; - - ret = rdma_set_src_addr(&rt->dst, dst_in, addr); - if (!ret && resolve_neigh) - ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq); - - ip_rt_put(rt); + dst = &rt->dst; } else { ret = addr6_resolve(src_in, dst_in, addr, &dst); - if (ret) - return ret; + } + if (ret) + return ret; - ret = rdma_set_src_addr(dst, dst_in, addr); - if (!ret && resolve_neigh) - ret = addr_resolve_neigh(dst, dst_in, addr, seq); + ret = rdma_set_src_addr(dst, dst_in, addr); + /* + * Resolve neighbor destination address if requested and + * only if src addr translation didn't fail. + */ + if (!ret && resolve_neigh) + ret = addr_resolve_neigh(dst, dst_in, addr, seq); + if (src_in->sa_family == AF_INET) + ip_rt_put(rt); + else dst_release(dst); - } - return ret; } -- cgit v1.2.3 From 307edde8efb75cd39326f0f603c9693a5b2af019 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:22 +0300 Subject: RDMA/core: Refer to network type instead of device type Set and refer to rdma_dev_addr network type instead of dst->ndev to reduce dependency on accessing dst netdevice. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index cbc64de2d791..97d0b36b5120 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -368,8 +368,8 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, (const void *)&dst_in6->sin6_addr; sa_family_t family = dst_in->sa_family; - /* Gateway + ARPHRD_INFINIBAND -> IB router */ - if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND) + /* If we have a gateway in IB mode then it must be an IB network */ + if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB) return ib_nl_fetch_ha(dev_addr, daddr, seq, family); else return dst_fetch_ha(dst, dev_addr, daddr); @@ -401,13 +401,6 @@ static int addr4_resolve(struct sockaddr *src_sock, src_in->sin_addr.s_addr = fl4.saddr; - /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're - * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network - * type accordingly. 
- */ - if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND) - addr->network = RDMA_NETWORK_IPV4; - addr->hoplimit = ip4_dst_hoplimit(&rt->dst); *prt = rt; @@ -425,7 +418,6 @@ static int addr6_resolve(struct sockaddr *src_sock, (const struct sockaddr_in6 *)dst_sock; struct flowi6 fl6; struct dst_entry *dst; - struct rt6_info *rt; int ret; memset(&fl6, 0, sizeof fl6); @@ -437,18 +429,9 @@ static int addr6_resolve(struct sockaddr *src_sock, if (ret < 0) return ret; - rt = (struct rt6_info *)dst; if (ipv6_addr_any(&src_in->sin6_addr)) src_in->sin6_addr = fl6.saddr; - /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're - * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network - * type accordingly. - */ - if (rt->rt6i_flags & RTF_GATEWAY && - ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND) - addr->network = RDMA_NETWORK_IPV6; - addr->hoplimit = ip6_dst_hoplimit(dst); *pdst = dst; @@ -491,6 +474,20 @@ static int rdma_set_src_addr(const struct dst_entry *dst, ret = rdma_translate_ip(dst_in, dev_addr); else rdma_copy_src_l2_addr(dev_addr, dst->dev); + + /* + * If there's a gateway and type of device not ARPHRD_INFINIBAND, + * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the + * network type accordingly. + */ + if (has_gateway(dst, dst_in->sa_family) && + dst->dev->type != ARPHRD_INFINIBAND) + dev_addr->network = dst_in->sa_family == AF_INET ? + RDMA_NETWORK_IPV4 : + RDMA_NETWORK_IPV6; + else + dev_addr->network = RDMA_NETWORK_IB; + return ret; } -- cgit v1.2.3 From c31d4b2ddf07ba74388cb8799517a7010e3e0c89 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:23 +0300 Subject: RDMA/core: Protect against changing dst->dev during destination resolve During address resolution, both during the route lookup and while performing src address translation in the loopback case, hold the rcu lock. This synchronizes with net/core/dev.c, i.e. change_net_namespace() ->dev_close_many() ->rt6_uncached_list_flush_dev(), which changes dst->dev to the loopback device of the given net namespace when a netdevice moves to a different net namespace or is unregistered. Therefore, hold the rcu lock and sync with the synchronize_net() of change_net_namespace() to ensure that the netdevice cannot get freed while dst->dev is being used.
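A minimal sketch of the reader-side pattern follows; it is not the exact upstream code and assumes a dst_entry obtained from a route lookup:

    static unsigned int example_read_dev_flags(const struct dst_entry *dst)
    {
            struct net_device *ndev;
            unsigned int flags;

            rcu_read_lock();
            /* dst->dev may be retargeted to the netns loopback device by
             * a concurrent namespace change; change_net_namespace() does
             * an rcu sync before the old device can be freed, so ndev
             * stays valid until rcu_read_unlock().
             */
            ndev = READ_ONCE(dst->dev);
            flags = ndev->flags;
            rcu_read_unlock();

            return flags;
    }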
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 61 +++++++++++++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 15 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 97d0b36b5120..316a53f59ee8 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -450,23 +450,26 @@ static int addr6_resolve(struct sockaddr *src_sock, static int addr_resolve_neigh(const struct dst_entry *dst, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, + unsigned int ndev_flags, u32 seq) { - if (dst->dev->flags & IFF_LOOPBACK) { + int ret = 0; + + if (ndev_flags & IFF_LOOPBACK) { memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - return 0; + } else { + if (!(ndev_flags & IFF_NOARP)) { + /* If the device doesn't do ARP internally */ + ret = fetch_ha(dst, addr, dst_in, seq); + } } - - /* If the device doesn't do ARP internally */ - if (!(dst->dev->flags & IFF_NOARP)) - return fetch_ha(dst, addr, dst_in, seq); - - return 0; + return ret; } -static int rdma_set_src_addr(const struct dst_entry *dst, +static void copy_src_l2_addr(struct rdma_dev_addr *dev_addr, const struct sockaddr *dst_in, - struct rdma_dev_addr *dev_addr) + const struct dst_entry *dst, + const struct net_device *ndev) { int ret = 0; @@ -481,14 +484,37 @@ static int rdma_set_src_addr(const struct dst_entry *dst, * network type accordingly. */ if (has_gateway(dst, dst_in->sa_family) && - dst->dev->type != ARPHRD_INFINIBAND) + ndev->type != ARPHRD_INFINIBAND) dev_addr->network = dst_in->sa_family == AF_INET ? RDMA_NETWORK_IPV4 : RDMA_NETWORK_IPV6; else dev_addr->network = RDMA_NETWORK_IB; +} - return ret; +static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, + unsigned int *ndev_flags, + const struct sockaddr *dst_in, + const struct dst_entry *dst) +{ + struct net_device *ndev = READ_ONCE(dst->dev); + + *ndev_flags = ndev->flags; + /* A physical device must be the RDMA device to use */ + if (ndev->flags & IFF_LOOPBACK) { + /* + * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or + * loopback IP address. So if route is resolved to loopback + * interface, translate that to a real ndev based on non + * loopback IP address. + */ + ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in); + if (!ndev) + return -ENODEV; + } + + copy_src_l2_addr(dev_addr, dst_in, dst, ndev); + return 0; } static int addr_resolve(struct sockaddr *src_in, @@ -498,6 +524,7 @@ static int addr_resolve(struct sockaddr *src_in, u32 seq) { struct dst_entry *dst = NULL; + unsigned int ndev_flags = 0; struct rtable *rt = NULL; int ret; @@ -506,22 +533,26 @@ static int addr_resolve(struct sockaddr *src_in, return -EINVAL; } + rcu_read_lock(); if (src_in->sa_family == AF_INET) { ret = addr4_resolve(src_in, dst_in, addr, &rt); dst = &rt->dst; } else { ret = addr6_resolve(src_in, dst_in, addr, &dst); } - if (ret) + if (ret) { + rcu_read_unlock(); return ret; + } + ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst); + rcu_read_unlock(); - ret = rdma_set_src_addr(dst, dst_in, addr); /* * Resolve neighbor destination address if requested and * only if src addr translation didn't fail. 
*/ if (!ret && resolve_neigh) - ret = addr_resolve_neigh(dst, dst_in, addr, seq); + ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq); if (src_in->sa_family == AF_INET) ip_rt_put(rt); -- cgit v1.2.3 From 6aaecd38568557266ff7a5c3765c58322586e4ce Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:24 +0300 Subject: RDMA/core: Simplify roce_resolve_route_from_path() Currently the RoCE route resolve functionality is split between two functions: roce_resolve_route_from_path() and its helper function rdma_resolve_ip_route(). Due to this, multiple sockaddr src structures are created in both functions, with rdma_dev_addr acting as the interface between the two for checks. Since RoCE is the only user of rdma_resolve_ip_route(), combine the functionality of both functions into roce_resolve_route_from_path() and further reduce the scope of rdma_dev_addr to core/addr.c. This also allows extending addr_resolve() in a subsequent patch to consider the netdev properties of the GID in a safer way under the rcu lock. Additionally, the src and dst addresses were always provided, so skip the src addr NULL pointer check, as they are present on the stack now. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 51 +++++++++++++++++++++++++++---------- drivers/infiniband/core/core_priv.h | 3 +++ drivers/infiniband/core/sa_query.c | 40 ----------------------------- 3 files changed, 41 insertions(+), 53 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 316a53f59ee8..c4c620334957 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -659,23 +660,47 @@ err: } EXPORT_SYMBOL(rdma_resolve_ip); -int rdma_resolve_ip_route(struct sockaddr *src_addr, - const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr) +int roce_resolve_route_from_path(struct sa_path_rec *rec, + const struct ib_gid_attr *attr) { - struct sockaddr_storage ssrc_addr = {}; - struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid, dgid; + struct rdma_dev_addr dev_addr = {}; + int ret; - if (src_addr) { - if (src_addr->sa_family != dst_addr->sa_family) - return -EINVAL; + if (rec->roce.route_resolved) + return 0; - memcpy(src_in, src_addr, rdma_addr_size(src_addr)); - } else { - src_in->sa_family = dst_addr->sa_family; - } + rdma_gid2ip(&sgid._sockaddr, &rec->sgid); + rdma_gid2ip(&dgid._sockaddr, &rec->dgid); + + if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family) + return -EINVAL; + + if (!attr || !attr->ndev) + return -EINVAL; + + dev_addr.bound_dev_if = attr->ndev->ifindex; + /* TODO: Use net from the ib_gid_attr once it is added to it, + * until than, limit itself to init_net.
+ */ + dev_addr.net = &init_net; - return addr_resolve(src_in, dst_addr, addr, false, 0); + ret = addr_resolve(&sgid._sockaddr, &dgid._sockaddr, + &dev_addr, false, 0); + if (ret) + return ret; + + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) + return -EINVAL; + + rec->roce.route_resolved = true; + return 0; } /** diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index c3d93350413c..4dfcf41d83c0 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -343,4 +343,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, const struct net_device *dev); +struct sa_path_rec; +int roce_resolve_route_from_path(struct sa_path_rec *rec, + const struct ib_gid_attr *attr); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 7b794a14d6e8..d3d6275b3b7e 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1227,46 +1227,6 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) return src_path_mask; } -static int roce_resolve_route_from_path(struct sa_path_rec *rec, - const struct ib_gid_attr *attr) -{ - struct rdma_dev_addr dev_addr = {}; - union { - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; - } sgid_addr, dgid_addr; - int ret; - - if (rec->roce.route_resolved) - return 0; - if (!attr || !attr->ndev) - return -EINVAL; - - dev_addr.bound_dev_if = attr->ndev->ifindex; - /* TODO: Use net from the ib_gid_attr once it is added to it, - * until than, limit itself to init_net. - */ - dev_addr.net = &init_net; - - rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); - rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); - - /* validate the route */ - ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, - &dgid_addr._sockaddr, &dev_addr); - if (ret) - return ret; - - if ((dev_addr.network == RDMA_NETWORK_IPV4 || - dev_addr.network == RDMA_NETWORK_IPV6) && - rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) - return -EINVAL; - - rec->roce.route_resolved = true; - return 0; -} - static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr, -- cgit v1.2.3 From d6b1764a8c5ac0ad3a66c6d11d24c4fe067fe933 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:25 +0300 Subject: RDMA/core: Introduce rdma_read_gid_attr_ndev_rcu() to check GID attribute Introduce an API rdma_read_gid_attr_ndev_rcu() to return the GID attribute's netdevice when it is in UP state, for accessing the netdevice's fields such as the net namespace and ifindex. This is useful for users who intend to access netdevice fields under the rcu lock.
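A sketch of the intended calling convention (the caller name is hypothetical; the API itself is added below):

    /* Derive netns and ifindex from the GID attribute's netdevice,
     * entirely under the rcu read lock.
     */
    static int example_fill_addr_from_gid(const struct ib_gid_attr *attr,
                                          struct net **net, int *ifindex)
    {
            struct net_device *ndev;

            rcu_read_lock();
            ndev = rdma_read_gid_attr_ndev_rcu(attr);
            if (IS_ERR(ndev)) {
                    rcu_read_unlock();
                    return PTR_ERR(ndev);
            }
            *net = dev_net(ndev);
            *ifindex = ndev->ifindex;
            rcu_read_unlock();
            return 0;
    }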
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 33 +++++++++++++++++++++++++++++++++ drivers/infiniband/core/core_priv.h | 2 ++ 2 files changed, 35 insertions(+) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 0bee1f4b914e..8957d31d60ca 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1252,6 +1252,39 @@ void rdma_hold_gid_attr(const struct ib_gid_attr *attr) } EXPORT_SYMBOL(rdma_hold_gid_attr); +/** + * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice + * which must be in UP state. + * + * @attr:Pointer to the GID attribute + * + * Returns pointer to netdevice if the netdevice was attached to GID and + * netdevice is in UP state. Caller must hold RCU lock as this API + * reads the netdev flags which can change while netdevice migrates to + * different net namespace. Returns ERR_PTR with error code otherwise. + * + */ +struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry = + container_of(attr, struct ib_gid_table_entry, attr); + struct ib_device *device = entry->attr.device; + struct net_device *ndev = ERR_PTR(-ENODEV); + u8 port_num = entry->attr.port_num; + struct ib_gid_table *table; + unsigned long flags; + bool valid; + + table = rdma_gid_table(device, port_num); + + read_lock_irqsave(&table->rwlock, flags); + valid = is_gid_entry_valid(table->data_vec[attr->index]); + if (valid && attr->ndev && (READ_ONCE(attr->ndev->flags) & IFF_UP)) + ndev = attr->ndev; + read_unlock_irqrestore(&table->rwlock, flags); + return ndev; +} + static int config_non_roce_gid_cache(struct ib_device *device, u8 port, int gid_tbl_len) { diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 4dfcf41d83c0..33f50e1929e7 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -346,4 +346,6 @@ void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, const struct net_device *dev); struct sa_path_rec; int roce_resolve_route_from_path(struct sa_path_rec *rec, const struct ib_gid_attr *attr); + +struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); #endif /* _CORE_PRIV_H */ -- cgit v1.2.3 From 0e9d2c19bff1d351005afb2f990a913e395ba6d4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:26 +0300 Subject: RDMA/core: Consider net ns of gid attribute for RoCE When resolving the destination address or route and the net namespace is unavailable, refer to the net namespace of the netdevice of the SGID attribute. This is typically the case for requests arriving from the network on RoCE ports.
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 75 +++++++++++++++++++++++++++++++------ drivers/infiniband/core/cma.c | 7 ++-- drivers/infiniband/core/core_priv.h | 2 +- drivers/infiniband/core/verbs.c | 2 +- include/rdma/ib_addr.h | 3 ++ 5 files changed, 73 insertions(+), 16 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index c4c620334957..7a0356c78f60 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -62,6 +62,7 @@ struct addr_req { struct rdma_dev_addr *addr, void *context); unsigned long timeout; struct delayed_work work; + bool resolve_by_gid_attr; /* Consider gid attr in resolve phase */ int status; u32 seq; }; @@ -518,10 +519,37 @@ static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, return 0; } +static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr) +{ + struct net_device *ndev; + + ndev = rdma_read_gid_attr_ndev_rcu(addr->sgid_attr); + if (IS_ERR(ndev)) + return PTR_ERR(ndev); + + /* + * Since we are holding the rcu, reading net and ifindex + * are safe without any additional reference; because + * change_net_namespace() in net/core/dev.c does rcu sync + * after it changes the state to IFF_DOWN and before + * updating netdev fields {net, ifindex}. + */ + addr->net = dev_net(ndev); + addr->bound_dev_if = ndev->ifindex; + return 0; +} + +static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr) +{ + addr->net = &init_net; + addr->bound_dev_if = 0; +} + static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, bool resolve_neigh, + bool resolve_by_gid_attr, u32 seq) { struct dst_entry *dst = NULL; @@ -535,6 +563,23 @@ static int addr_resolve(struct sockaddr *src_in, } rcu_read_lock(); + if (resolve_by_gid_attr) { + if (!addr->sgid_attr) { + rcu_read_unlock(); + pr_warn_ratelimited("%s: missing gid_attr\n", __func__); + return -EINVAL; + } + /* + * If the request is for a specific gid attribute of the + * rdma_dev_addr, derive net from the netdevice of the + * GID attribute. + */ + ret = set_addr_netns_by_gid_rcu(addr); + if (ret) { + rcu_read_unlock(); + return ret; + } + } if (src_in->sa_family == AF_INET) { ret = addr4_resolve(src_in, dst_in, addr, &rt); dst = &rt->dst; @@ -543,7 +588,7 @@ static int addr_resolve(struct sockaddr *src_in, } if (ret) { rcu_read_unlock(); - return ret; + goto done; } ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst); rcu_read_unlock(); @@ -559,6 +604,13 @@ static int addr_resolve(struct sockaddr *src_in, ip_rt_put(rt); else dst_release(dst); +done: + /* + * Clear the addr net to go back to its original state, only if it was + * derived from GID attribute in this context. 
+ */ + if (resolve_by_gid_attr) + rdma_addr_set_net_defaults(addr); return ret; } @@ -573,7 +625,8 @@ static void process_one_req(struct work_struct *_work) src_in = (struct sockaddr *)&req->src_addr; dst_in = (struct sockaddr *)&req->dst_addr; req->status = addr_resolve(src_in, dst_in, req->addr, - true, req->seq); + true, req->resolve_by_gid_attr, + req->seq); if (req->status && time_after_eq(jiffies, req->timeout)) { req->status = -ETIMEDOUT; } else if (req->status == -ENODATA) { @@ -608,6 +661,7 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), + bool resolve_by_gid_attr, void *context) { struct sockaddr *src_in, *dst_in; @@ -636,10 +690,12 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, req->addr = addr; req->callback = callback; req->context = context; + req->resolve_by_gid_attr = resolve_by_gid_attr; INIT_DELAYED_WORK(&req->work, process_one_req); req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); - req->status = addr_resolve(src_in, dst_in, addr, true, req->seq); + req->status = addr_resolve(src_in, dst_in, addr, true, + req->resolve_by_gid_attr, req->seq); switch (req->status) { case 0: req->timeout = jiffies; @@ -683,14 +739,11 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec, if (!attr || !attr->ndev) return -EINVAL; - dev_addr.bound_dev_if = attr->ndev->ifindex; - /* TODO: Use net from the ib_gid_attr once it is added to it, - * until than, limit itself to init_net. - */ dev_addr.net = &init_net; + dev_addr.sgid_attr = attr; ret = addr_resolve(&sgid._sockaddr, &dgid._sockaddr, - &dev_addr, false, 0); + &dev_addr, false, true, 0); if (ret) return ret; @@ -755,7 +808,7 @@ static void resolve_cb(int status, struct sockaddr *src_addr, int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, const struct net_device *ndev, + u8 *dmac, const struct ib_gid_attr *sgid_attr, int *hoplimit) { struct rdma_dev_addr dev_addr; @@ -771,12 +824,12 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); - dev_addr.bound_dev_if = ndev->ifindex; dev_addr.net = &init_net; + dev_addr.sgid_attr = sgid_attr; init_completion(&ctx.comp); ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr, - &dev_addr, 1000, resolve_cb, &ctx); + &dev_addr, 1000, resolve_cb, true, &ctx); if (ret) return ret; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index ace2a4c757f6..a57c8b823302 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2987,9 +2987,10 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { - ret = rdma_resolve_ip(cma_src_addr(id_priv), - dst_addr, &id->route.addr.dev_addr, - timeout_ms, addr_handler, id_priv); + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, + &id->route.addr.dev_addr, + timeout_ms, addr_handler, + false, id_priv); } } if (ret) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 33f50e1929e7..d7399d5b1cb6 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -338,7 +338,7 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, int rdma_addr_find_l2_eth_by_grh(const union ib_gid 
*sgid, const union ib_gid *dgid, - u8 *dmac, const struct net_device *ndev, + u8 *dmac, const struct ib_gid_attr *sgid_attr, int *hoplimit); void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, const struct net_device *dev); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 6ee03d6089eb..c36be384fe34 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -710,7 +710,7 @@ static int ib_resolve_unicast_gid_dmac(struct ib_device *device, ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid, ah_attr->roce.dmac, - sgid_attr->ndev, &hop_limit); + sgid_attr, &hop_limit); grh->hop_limit = hop_limit; return ret; diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 676514a930ab..2e33b1529015 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -95,12 +95,15 @@ int rdma_translate_ip(const struct sockaddr *addr, * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. + * @resolve_by_gid_attr: Resolve the ip based on the GID attribute from + * rdma_dev_addr. * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), + bool resolve_by_gid_attr, void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); -- cgit v1.2.3 From 6ebce44746036c923d0c255672735399192467c3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:37:38 +0300 Subject: RDMA/uverbs: Remove is_closed from ib_uverbs_file This flag does nothing but indicate whether the uverbs_file is in the device's list; use list_del_init() instead.
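Why list_del_init() makes the flag redundant: it re-initializes the removed entry to point at itself, so a second removal is harmless and list membership can be tested with list_empty(). A minimal userspace sketch of the idiom (the list helpers are re-implemented here for illustration; this is not the kernel code):

#include <assert.h>

struct list_head { struct list_head *prev, *next; };

static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }

static void list_add(struct list_head *n, struct list_head *h)
{
	n->next = h->next;
	n->prev = h;
	h->next->prev = n;
	h->next = n;
}

static void list_del_init(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	INIT_LIST_HEAD(n);		/* entry now points at itself */
}

static int list_empty(const struct list_head *h) { return h->next == h; }

int main(void)
{
	struct list_head file_list, file;

	INIT_LIST_HEAD(&file_list);
	list_add(&file, &file_list);

	list_del_init(&file);		/* close() path removes it */
	list_del_init(&file);		/* device-removal path: now a no-op */
	assert(list_empty(&file));	/* replaces the old is_closed test */
	return 0;
}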
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs.h | 1 - drivers/infiniband/core/uverbs_main.c | 8 ++------ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 717ab35b0af9..24369eb66c67 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -147,7 +147,6 @@ struct ib_uverbs_file { struct ib_event_handler event_handler; struct ib_uverbs_async_event_file *async_file; struct list_head list; - int is_closed; /* * To access the uobjects list hw_destroy_rwsem must be held for write diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 16e5f714ca53..176271db9ed7 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -905,10 +905,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE); mutex_lock(&file->device->lists_mutex); - if (!file->is_closed) { - list_del(&file->list); - file->is_closed = 1; - } + list_del_init(&file->list); mutex_unlock(&file->device->lists_mutex); if (file->async_file) @@ -1104,8 +1101,7 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, while (!list_empty(&uverbs_dev->uverbs_file_list)) { file = list_first_entry(&uverbs_dev->uverbs_file_list, struct ib_uverbs_file, list); - file->is_closed = 1; - list_del(&file->list); + list_del_init(&file->list); kref_get(&file->ref); /* We must release the mutex before going ahead and calling -- cgit v1.2.3 From 0965cc953a235196b8d6ef0cba45ecb5c355194f Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 19 Sep 2018 20:28:38 +0800 Subject: RDMA/core: Properly return the error code of rdma_set_src_addr_rcu rdma_set_src_addr_rcu() should check whether copy_src_l2_addr() fails, rather than always returning 0. Likewise, copy_src_l2_addr() should return 'ret' as its return value when rdma_translate_ip() fails.
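The bug pattern in miniature: a helper computes an error code but its caller discards it. A standalone sketch with hypothetical names (fill_l2_addr() and set_src_addr() stand in for the driver functions; they are not the real API):

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in for copy_src_l2_addr(): it can fail, so it must
 * return 'ret' rather than being void. */
static int fill_l2_addr(int translate_fails)
{
	int ret = 0;

	if (translate_fails)
		ret = -ENODEV;	/* e.g. the translation step failed */

	/* ... other fields are still filled in unconditionally ... */

	return ret;		/* the fix: hand the error back to the caller */
}

/* Hypothetical stand-in for rdma_set_src_addr_rcu(). */
static int set_src_addr(int translate_fails)
{
	/* Before the fix this read 'fill_l2_addr(...); return 0;',
	 * silently masking the failure. */
	return fill_l2_addr(translate_fails);
}

int main(void)
{
	printf("success path: %d, failure path: %d\n",
	       set_src_addr(0), set_src_addr(1));
	return 0;
}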
Fixes: c31d4b2ddf07 ("RDMA/core: Protect against changing dst->dev during destination resolve") Signed-off-by: YueHaibing Reviewed-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 7a0356c78f60..c2ca9e4b5160 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -468,10 +468,10 @@ static int addr_resolve_neigh(const struct dst_entry *dst, return ret; } -static void copy_src_l2_addr(struct rdma_dev_addr *dev_addr, - const struct sockaddr *dst_in, - const struct dst_entry *dst, - const struct net_device *ndev) +static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct sockaddr *dst_in, + const struct dst_entry *dst, + const struct net_device *ndev) { int ret = 0; @@ -492,6 +492,8 @@ static void copy_src_l2_addr(struct rdma_dev_addr *dev_addr, RDMA_NETWORK_IPV6; else dev_addr->network = RDMA_NETWORK_IB; + + return ret; } static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, @@ -515,8 +517,7 @@ static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, return -ENODEV; } - copy_src_l2_addr(dev_addr, dst_in, dst, ndev); - return 0; + return copy_src_l2_addr(dev_addr, dst_in, dst, ndev); } static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr) -- cgit v1.2.3 From 0099103926b68e6675a1be4644848f5b1c1b6f97 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 17 Sep 2018 15:44:46 -0600 Subject: RDMA/uverbs: Fix error unwind in ib_uverbs_add_one The error path has several mistakes:
- cdev_del should not be called if cdev_device_add fails
- We must call put_device on all the goto exit paths, as that is what frees the uapi, SRCU and the struct itself

While we are here, consolidate all the uverbs_dev init that cannot fail at the top.
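The corrected unwind shape, as a standalone sketch: perform all infallible initialization up front, take a single reference, and let one reference drop free the object on every failure path. The obj/obj_put() names are hypothetical stand-ins for the driver-model objects:

#include <errno.h>
#include <stdlib.h>

struct obj { int refcount; };

/* Mirrors the put_device() contract: the last put frees the object. */
static void obj_put(struct obj *o)
{
	if (--o->refcount == 0)
		free(o);
}

static int add_one(int fail_uapi, int fail_cdev_add)
{
	struct obj *o = calloc(1, sizeof(*o));

	if (!o)
		return -ENOMEM;

	/* All initialization that cannot fail happens up front, including
	 * taking the single reference that the error path will drop. */
	o->refcount = 1;

	if (fail_uapi)
		goto err_put;		/* nothing else to undo yet */

	if (fail_cdev_add)
		goto err_put;		/* never added, so nothing to delete */

	/* Success: the reference is kept until a matching remove path
	 * calls obj_put(). */
	return 0;

err_put:
	obj_put(o);			/* one drop frees on every error path */
	return -EIO;
}

int main(void)
{
	(void)add_one(0, 1);		/* exercise the late-failure path */
	(void)add_one(1, 0);		/* exercise the early-failure path */
	return 0;
}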
Fixes: c5c4d92e70f3 ("RDMA/uverbs: Use cdev_device_add() instead of cdev_add()") Signed-off-by: Jason Gunthorpe Reviewed-by: Parav Pandit --- drivers/infiniband/core/uverbs_main.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 176271db9ed7..db6de9157668 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1028,6 +1028,12 @@ static void ib_uverbs_add_one(struct ib_device *device) return; } + device_initialize(&uverbs_dev->dev); + uverbs_dev->dev.class = uverbs_class; + uverbs_dev->dev.parent = device->dev.parent; + uverbs_dev->dev.release = ib_uverbs_release_dev; + uverbs_dev->groups[0] = &dev_attr_group; + uverbs_dev->dev.groups = uverbs_dev->groups; atomic_set(&uverbs_dev->refcount, 1); init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; @@ -1035,6 +1041,8 @@ static void ib_uverbs_add_one(struct ib_device *device) mutex_init(&uverbs_dev->lists_mutex); INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); + rcu_assign_pointer(uverbs_dev->ib_dev, device); + uverbs_dev->num_comp_vectors = device->num_comp_vectors; devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); if (devnum >= IB_UVERBS_MAX_DEVICES) @@ -1046,19 +1054,10 @@ static void ib_uverbs_add_one(struct ib_device *device) else base = IB_UVERBS_BASE_DEV + devnum; - rcu_assign_pointer(uverbs_dev->ib_dev, device); - uverbs_dev->num_comp_vectors = device->num_comp_vectors; - if (ib_uverbs_create_uapi(device, uverbs_dev)) goto err_uapi; - device_initialize(&uverbs_dev->dev); - uverbs_dev->dev.class = uverbs_class; - uverbs_dev->dev.parent = device->dev.parent; uverbs_dev->dev.devt = base; - uverbs_dev->dev.release = ib_uverbs_release_dev; - uverbs_dev->groups[0] = &dev_attr_group; - uverbs_dev->dev.groups = uverbs_dev->groups; dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum); cdev_init(&uverbs_dev->cdev, @@ -1067,20 +1066,18 @@ static void ib_uverbs_add_one(struct ib_device *device) ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev); if (ret) - goto err_cdev; + goto err_uapi; ib_set_client_data(device, &uverbs_client, uverbs_dev); return; -err_cdev: - cdev_del(&uverbs_dev->cdev); - put_device(&uverbs_dev->dev); err_uapi: clear_bit(devnum, dev_map); err: if (atomic_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); + put_device(&uverbs_dev->dev); return; } -- cgit v1.2.3 From 5f9794dc94f59ad1eb821724a8ae1f8e803ea188 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:43:08 +0300 Subject: RDMA/ucontext: Add a core API for mmaping driver IO memory To support disassociation and PCI hot unplug, we have to track all the VMAs that refer to the device IO memory. When disassociation occurs the VMAs have to be revised to point to the zero page, not the IO memory, to allow the physical HW to be unplugged. The three drivers supporting this implemented three different versions of this algorithm, all leaving something to be desired. This new common implementation has a few differences from the driver versions: - Track all VMAs, including splitting/truncating/etc. Tie the lifetime of the private data allocation to the lifetime of the vma. This avoids any tricks with setting vm_ops which Linus didn't like. 
(see link)
- Support multiple mms, and support properly tracking mmaps triggered by processes other than the one first opening the uverbs fd. This makes fork behavior of disassociation-enabled drivers the same as fork support in normal drivers.
- Don't use crazy get_task stuff.
- Simplify the approach to racing between vm_ops close and disassociation, fixing the related bugs most of the driver implementations had.

Since we are in core code the tracking list can be placed in struct ib_uverbs_ufile, which has a lifetime strictly longer than any VMAs created by mmap on the uverbs FD. Link: https://www.spinics.net/lists/stable/msg248747.html Link: https://lkml.kernel.org/r/CA+55aFxJTV_g46AQPoPXen-UPiqR1HGMZictt7VpC-SMFbm3Cw@mail.gmail.com Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 4 +- drivers/infiniband/core/rdma_core.h | 1 + drivers/infiniband/core/uverbs.h | 3 + drivers/infiniband/core/uverbs_main.c | 223 ++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 22 ++++ 5 files changed, 252 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index c4118bcd5103..06d31fe56677 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -842,8 +842,10 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, struct ib_ucontext *ucontext = ufile->ucontext; int ret; - if (reason == RDMA_REMOVE_DRIVER_REMOVE) + if (reason == RDMA_REMOVE_DRIVER_REMOVE) { + uverbs_user_mmap_disassociate(ufile); ufile_disassociate_ucontext(ucontext); + } put_pid(ucontext->tgid); ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device, diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index f962f2a593ba..4886d2bba7c7 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -160,5 +160,6 @@ void uverbs_disassociate_api(struct uverbs_api *uapi); void uverbs_destroy_api(struct uverbs_api *uapi); void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, unsigned int num_attrs); +void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile); #endif /* RDMA_CORE_H */ diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 24369eb66c67..c97935a0c7c6 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -158,6 +158,9 @@ struct ib_uverbs_file { spinlock_t uobjects_lock; struct list_head uobjects; + struct mutex umap_lock; + struct list_head umaps; + u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index db6de9157668..8d56773aac56 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -45,6 +45,7 @@ #include #include #include +#include #include @@ -811,6 +812,226 @@ out: return ret; } +/* + * Each time we map IO memory into user space this keeps track of the mapping. + * When the device is hot-unplugged we 'zap' the mmaps in user space to point + * to the zero page and allow the hot unplug to proceed. + * + * This is necessary for cases like PCI physical hot unplug as the actual BAR + * memory may vanish after this and access to it from userspace could MCE. + * + * RDMA drivers supporting disassociation must have their user space designed + * to cope in some way with their IO pages going to the zero page.
+ */ +struct rdma_umap_priv { + struct vm_area_struct *vma; + struct list_head list; +}; + +static const struct vm_operations_struct rdma_umap_ops; + +static void rdma_umap_priv_init(struct rdma_umap_priv *priv, + struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + + priv->vma = vma; + vma->vm_private_data = priv; + vma->vm_ops = &rdma_umap_ops; + + mutex_lock(&ufile->umap_lock); + list_add(&priv->list, &ufile->umaps); + mutex_unlock(&ufile->umap_lock); +} + +/* + * The VMA has been dup'd, initialize the vm_private_data with a new tracking + * struct + */ +static void rdma_umap_open(struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + struct rdma_umap_priv *opriv = vma->vm_private_data; + struct rdma_umap_priv *priv; + + if (!opriv) + return; + + /* We are racing with disassociation */ + if (!down_read_trylock(&ufile->hw_destroy_rwsem)) + goto out_zap; + /* + * Disassociation already completed, the VMA should already be zapped. + */ + if (!ufile->ucontext) + goto out_unlock; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + goto out_unlock; + rdma_umap_priv_init(priv, vma); + + up_read(&ufile->hw_destroy_rwsem); + return; + +out_unlock: + up_read(&ufile->hw_destroy_rwsem); +out_zap: + /* + * We can't allow the VMA to be created with the actual IO pages, that + * would break our API contract, and it can't be stopped at this + * point, so zap it. + */ + vma->vm_private_data = NULL; + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); +} + +static void rdma_umap_close(struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + struct rdma_umap_priv *priv = vma->vm_private_data; + + if (!priv) + return; + + /* + * The vma holds a reference on the struct file that created it, which + * in turn means that the ib_uverbs_file is guaranteed to exist at + * this point. + */ + mutex_lock(&ufile->umap_lock); + list_del(&priv->list); + mutex_unlock(&ufile->umap_lock); + kfree(priv); +} + +static const struct vm_operations_struct rdma_umap_ops = { + .open = rdma_umap_open, + .close = rdma_umap_close, +}; + +static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, + unsigned long size) +{ + struct ib_uverbs_file *ufile = ucontext->ufile; + struct rdma_umap_priv *priv; + + if (vma->vm_end - vma->vm_start != size) + return ERR_PTR(-EINVAL); + + /* Driver is using this wrong, must be called by ib_uverbs_mmap */ + if (WARN_ON(!vma->vm_file || + vma->vm_file->private_data != ufile)) + return ERR_PTR(-EINVAL); + lockdep_assert_held(&ufile->device->disassociate_srcu); + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return ERR_PTR(-ENOMEM); + return priv; +} + +/* + * Map IO memory into a process. This is to be called by drivers as part of + * their mmap() functions if they wish to send something like PCI-E BAR memory + * to userspace. 
+ */ +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); + + if (IS_ERR(priv)) + return PTR_ERR(priv); + + vma->vm_page_prot = prot; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) { + kfree(priv); + return -EAGAIN; + } + + rdma_umap_priv_init(priv, vma); + return 0; +} +EXPORT_SYMBOL(rdma_user_mmap_io); + +/* + * The page case is here for a slightly different reason, the driver expects + * to be able to free the page it is sharing to user space when it destroys + * its ucontext, which means we need to zap the user space references. + * + * We could handle this differently by providing an API to allocate a shared + * page and then only freeing the shared page when the last ufile is + * destroyed. + */ +int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size) +{ + struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); + + if (IS_ERR(priv)) + return PTR_ERR(priv); + + if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size, + vma->vm_page_prot)) { + kfree(priv); + return -EAGAIN; + } + + rdma_umap_priv_init(priv, vma); + return 0; +} +EXPORT_SYMBOL(rdma_user_mmap_page); + +void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) +{ + struct rdma_umap_priv *priv, *next_priv; + + lockdep_assert_held(&ufile->hw_destroy_rwsem); + + while (1) { + struct mm_struct *mm = NULL; + + /* Get an arbitrary mm pointer that hasn't been cleaned yet */ + mutex_lock(&ufile->umap_lock); + if (!list_empty(&ufile->umaps)) { + mm = list_first_entry(&ufile->umaps, + struct rdma_umap_priv, list) + ->vma->vm_mm; + mmget(mm); + } + mutex_unlock(&ufile->umap_lock); + if (!mm) + return; + + /* + * The umap_lock is nested under mmap_sem since it used within + * the vma_ops callbacks, so we have to clean the list one mm + * at a time to get the lock ordering right. Typically there + * will only be one mm, so no big deal. 
+ */ + down_write(&mm->mmap_sem); + mutex_lock(&ufile->umap_lock); + list_for_each_entry_safe (priv, next_priv, &ufile->umaps, + list) { + struct vm_area_struct *vma = priv->vma; + + if (vma->vm_mm != mm) + continue; + list_del_init(&priv->list); + + zap_vma_ptes(vma, vma->vm_start, + vma->vm_end - vma->vm_start); + vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + } + mutex_unlock(&ufile->umap_lock); + up_write(&mm->mmap_sem); + mmput(mm); + } +} + /* * ib_uverbs_open() does not need the BKL: * @@ -872,6 +1093,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) spin_lock_init(&file->uobjects_lock); INIT_LIST_HEAD(&file->uobjects); init_rwsem(&file->hw_destroy_rwsem); + mutex_init(&file->umap_lock); + INIT_LIST_HEAD(&file->umaps); filp->private_data = file; list_add_tail(&file->list, &dev->uverbs_file_list); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e463d3007a35..a66238d8a2a3 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2646,6 +2646,28 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client); void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot); +int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size); +#else +static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, + pgprot_t prot) +{ + return -EINVAL; +} +static inline int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size) +{ + return -EINVAL; +} +#endif + static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; -- cgit v1.2.3 From ce92db1ca84de2ebc5be7a81a68f2e220799fcf5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:43:12 +0300 Subject: RDMA/ucontext: Get rid of the old disassociate flow The disassociate_ucontext function in every driver is now empty, so we don't need this ugly and wrong code that was messing with tgids. rdma_user_mmap_io does this same work in a better way. 
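What remains after the removal, reduced to a standalone sketch: the core code zaps the user mappings itself and the per-driver hook becomes an optional notification. The types below are illustrative stand-ins, not the ib_ structures:

#include <stdio.h>

struct dev_sketch {
	/* Drivers that still want a callout may set this; most no longer do. */
	void (*disassociate)(void *ucontext);
};

static void core_zap_user_mmaps(void *ucontext)
{
	(void)ucontext;
	printf("core zaps the user VMAs itself\n");
}

static void destroy_ucontext(struct dev_sketch *dev, void *ucontext,
			     int driver_remove)
{
	if (driver_remove) {
		core_zap_user_mmaps(ucontext);	/* common code does the work */
		if (dev->disassociate)		/* the hook is now optional */
			dev->disassociate(ucontext);
	}
	/* ... uncharge and deallocate the ucontext ... */
}

int main(void)
{
	struct dev_sketch dev = { .disassociate = NULL };

	destroy_ucontext(&dev, NULL, 1);	/* fine with no driver hook */
	return 0;
}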
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 51 ++++++++----------------------- 1 file changed, 10 insertions(+), 41 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 06d31fe56677..6a3acf4bf78a 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -794,44 +794,6 @@ void uverbs_close_fd(struct file *f) uverbs_uobject_put(uobj); } -static void ufile_disassociate_ucontext(struct ib_ucontext *ibcontext) -{ - struct ib_device *ib_dev = ibcontext->device; - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - - owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); - if (!owning_process) - return; - - owning_mm = get_task_mm(owning_process); - if (!owning_mm) { - pr_info("no mm, disassociate ucontext is pending task termination\n"); - while (1) { - put_task_struct(owning_process); - usleep_range(1000, 2000); - owning_process = get_pid_task(ibcontext->tgid, - PIDTYPE_PID); - if (!owning_process || - owning_process->state == TASK_DEAD) { - pr_info("disassociate ucontext done, task was terminated\n"); - /* in case task was dead need to release the - * task struct. - */ - if (owning_process) - put_task_struct(owning_process); - return; - } - } - } - - down_write(&owning_mm->mmap_sem); - ib_dev->disassociate_ucontext(ibcontext); - up_write(&owning_mm->mmap_sem); - mmput(owning_mm); - put_task_struct(owning_process); -} - /* * Drop the ucontext off the ufile and completely disconnect it from the * ib_device @@ -840,22 +802,29 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, enum rdma_remove_reason reason) { struct ib_ucontext *ucontext = ufile->ucontext; + struct ib_device *ib_dev = ucontext->device; int ret; + /* + * If we are closing the FD then the user mmap VMAs must have + * already been destroyed as they hold on to the filep, otherwise + * they need to be zap'd. + */ if (reason == RDMA_REMOVE_DRIVER_REMOVE) { uverbs_user_mmap_disassociate(ufile); - ufile_disassociate_ucontext(ucontext); + if (ib_dev->disassociate_ucontext) + ib_dev->disassociate_ucontext(ucontext); } put_pid(ucontext->tgid); - ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device, + ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); /* * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove * the error return. */ - ret = ucontext->device->dealloc_ucontext(ucontext); + ret = ib_dev->dealloc_ucontext(ucontext); WARN_ON(ret); ufile->ucontext = NULL; -- cgit v1.2.3 From d4b4dd1b9706e48c370f88d3adfe713e43423cc9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:44:45 +0300 Subject: RDMA/umem: Do not use current->tgid to track the mm_struct This is just wrong; the process that calls into the reg_mr is the process associated with the umem, and that does not have to be the same process that created the context. When this code was first written mmgrab() didn't exist; these days, however, we can just directly hold the mm_struct pointer in the umem and have no ambiguity when it comes to releasing the umem as to which mm it was associated with.
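The ownership pattern the patch moves to, sketched standalone: take a counted reference on the creating mm once, at creation time, instead of re-deriving it from a tgid at release. mmgrab_sketch()/mmdrop_sketch() model the mmgrab()/mmdrop() contract; all names here are hypothetical:

#include <stdlib.h>

struct mm_sketch { int grab_count; };

static void mmgrab_sketch(struct mm_sketch *mm) { mm->grab_count++; }

static void mmdrop_sketch(struct mm_sketch *mm)
{
	if (--mm->grab_count == 0)
		free(mm);
}

struct umem_sketch {
	struct mm_sketch *owning_mm;	/* captured once at creation time */
};

static struct umem_sketch *umem_get(struct mm_sketch *current_mm)
{
	struct umem_sketch *u = calloc(1, sizeof(*u));

	if (!u)
		return NULL;
	u->owning_mm = current_mm;
	mmgrab_sketch(current_mm);	/* no tgid lookup needed later */
	return u;
}

static void umem_release(struct umem_sketch *u)
{
	/* Pinned-page accounting would go against u->owning_mm here,
	 * with no ambiguity about which mm that is. */
	mmdrop_sketch(u->owning_mm);
	free(u);
}

int main(void)
{
	struct mm_sketch *mm = calloc(1, sizeof(*mm));
	struct umem_sketch *u;

	if (!mm)
		return 1;
	mm->grab_count = 1;	/* the owning process's reference */
	u = umem_get(mm);
	if (!u)
		return 1;

	mmdrop_sketch(mm);	/* the process may exit first... */
	umem_release(u);	/* ...the umem still knows and frees its mm */
	return 0;
}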
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 77 ++++++++++++++++++++---------------------- include/rdma/ib_umem.h | 3 +- 2 files changed, 37 insertions(+), 43 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index a41792dbae1f..c32a3e27a896 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -86,6 +86,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct vm_area_struct **vma_list; unsigned long lock_limit; unsigned long cur_base; + struct mm_struct *mm; unsigned long npages; int ret; int i; @@ -124,6 +125,8 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, return umem; } + umem->owning_mm = mm = current->mm; + mmgrab(mm); umem->odp_data = NULL; /* We assume the memory is from hugetlb until proved otherwise */ @@ -132,7 +135,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { ret = -ENOMEM; - goto umem_kfree; + goto umem_kfree_drop; } /* @@ -147,14 +150,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - down_write(¤t->mm->mmap_sem); - current->mm->pinned_vm += npages; - if ((current->mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) { - up_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); + mm->pinned_vm += npages; + if ((mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) { + up_write(&mm->mmap_sem); ret = -ENOMEM; goto vma; } - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; @@ -172,14 +175,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, sg_list_start = umem->sg_head.sgl; - down_read(¤t->mm->mmap_sem); + down_read(&mm->mmap_sem); while (npages) { ret = get_user_pages_longterm(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), gup_flags, page_list, vma_list); if (ret < 0) { - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); goto umem_release; } @@ -197,7 +200,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, /* preparing for next loop */ sg_list_start = sg; } - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, @@ -223,6 +226,9 @@ out: if (vma_list) free_page((unsigned long) vma_list); free_page((unsigned long) page_list); +umem_kfree_drop: + if (ret) + mmdrop(umem->owning_mm); umem_kfree: if (ret) kfree(umem); @@ -230,15 +236,21 @@ umem_kfree: } EXPORT_SYMBOL(ib_umem_get); -static void ib_umem_account(struct work_struct *work) +static void __ib_umem_release_tail(struct ib_umem *umem) +{ + mmdrop(umem->owning_mm); + kfree(umem); +} + +static void ib_umem_release_defer(struct work_struct *work) { struct ib_umem *umem = container_of(work, struct ib_umem, work); - down_write(&umem->mm->mmap_sem); - umem->mm->pinned_vm -= umem->diff; - up_write(&umem->mm->mmap_sem); - mmput(umem->mm); - kfree(umem); + down_write(&umem->owning_mm->mmap_sem); + umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&umem->owning_mm->mmap_sem); + + __ib_umem_release_tail(umem); } /** @@ -248,9 +260,6 @@ static void ib_umem_account(struct work_struct *work) void ib_umem_release(struct ib_umem *umem) { struct ib_ucontext *context = umem->context; - 
struct mm_struct *mm; - struct task_struct *task; - unsigned long diff; if (umem->odp_data) { ib_umem_odp_release(umem); @@ -259,41 +268,27 @@ void ib_umem_release(struct ib_umem *umem) __ib_umem_release(umem->context->device, umem, 1); - task = get_pid_task(umem->context->tgid, PIDTYPE_PID); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); - if (!mm) - goto out; - - diff = ib_umem_num_pages(umem); - /* * We may be called with the mm's mmap_sem already held. This * can happen when a userspace munmap() is the call that drops * the last reference to our file and calls our release * method. If there are memory regions to destroy, we'll end * up here and not be able to take the mmap_sem. In that case - * we defer the vm_locked accounting to the system workqueue. + * we defer the vm_locked accounting a workqueue. */ if (context->closing) { - if (!down_write_trylock(&mm->mmap_sem)) { - INIT_WORK(&umem->work, ib_umem_account); - umem->mm = mm; - umem->diff = diff; - + if (!down_write_trylock(&umem->owning_mm->mmap_sem)) { + INIT_WORK(&umem->work, ib_umem_release_defer); queue_work(ib_wq, &umem->work); return; } - } else - down_write(&mm->mmap_sem); + } else { + down_write(&umem->owning_mm->mmap_sem); + } + umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&umem->owning_mm->mmap_sem); - mm->pinned_vm -= diff; - up_write(&mm->mmap_sem); - mmput(mm); -out: - kfree(umem); + __ib_umem_release_tail(umem); } EXPORT_SYMBOL(ib_umem_release); diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index a1fd63871d17..e1c00b2ead19 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -42,14 +42,13 @@ struct ib_umem_odp; struct ib_umem { struct ib_ucontext *context; + struct mm_struct *owning_mm; size_t length; unsigned long address; int page_shift; int writable; int hugetlb; struct work_struct work; - struct mm_struct *mm; - unsigned long diff; struct ib_umem_odp *odp_data; struct sg_table sg_head; int nmap; -- cgit v1.2.3 From b5231b019d76521dd8c59a54c174770ec92c767c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:04 +0300 Subject: RDMA/umem: Use ib_umem_odp in all function signatures connected to ODP All of these functions already require the ODP version of the umem struct, make this very clear by having the signature require it. This paves the way to using the container_of() pattern to link umem_odp and umem together. 
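Where this is headed, as a standalone sketch: once the ODP struct embeds the base umem (the follow-up patch below does exactly that), container_of() converts between the two views with no second allocation and no NULL check. Minimal illustrative types, not the kernel definitions:

#include <assert.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct umem_sketch { unsigned long address; };

/* The ODP state embeds the base struct rather than pointing at a
 * separately allocated one. */
struct umem_odp_sketch {
	struct umem_sketch umem;
	unsigned long notifiers_seq;
};

static struct umem_odp_sketch *to_umem_odp(struct umem_sketch *u)
{
	return container_of(u, struct umem_odp_sketch, umem);
}

int main(void)
{
	struct umem_odp_sketch odp = { .umem.address = 0x1000 };

	/* One allocation, two views of it. */
	assert(to_umem_odp(&odp.umem) == &odp);
	assert(&to_umem_odp(&odp.umem)->umem == &odp.umem);
	return 0;
}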
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 2 +- drivers/infiniband/core/umem_odp.c | 139 ++++++++++++++++++----------------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/mr.c | 3 +- drivers/infiniband/hw/mlx5/odp.c | 54 +++++++------- include/rdma/ib_umem_odp.h | 39 +++++----- include/rdma/ib_verbs.h | 4 +- 7 files changed, 129 insertions(+), 114 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index c32a3e27a896..971d92ddea8f 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -262,7 +262,7 @@ void ib_umem_release(struct ib_umem *umem) struct ib_ucontext *context = umem->context; if (umem->odp_data) { - ib_umem_odp_release(umem); + ib_umem_odp_release(to_ib_umem_odp(umem)); return; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 29e34e6a6420..8405e9afd7dc 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -77,41 +77,41 @@ static u64 node_last(struct umem_odp_node *n) INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, node_start, node_last, static, rbt_ib_umem) -static void ib_umem_notifier_start_account(struct ib_umem *item) +static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { - mutex_lock(&item->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); /* Only update private counters for this umem if it has them. * Otherwise skip it. All page faults will be delayed for this umem. */ - if (item->odp_data->mn_counters_active) { - int notifiers_count = item->odp_data->notifiers_count++; + if (umem_odp->mn_counters_active) { + int notifiers_count = umem_odp->notifiers_count++; if (notifiers_count == 0) /* Initialize the completion object for waiting on * notifiers. Since notifier_count is zero, no one * should be waiting right now. */ - reinit_completion(&item->odp_data->notifier_completion); + reinit_completion(&umem_odp->notifier_completion); } - mutex_unlock(&item->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); } -static void ib_umem_notifier_end_account(struct ib_umem *item) +static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) { - mutex_lock(&item->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); /* Only update private counters for this umem if it has them. * Otherwise skip it. All page faults will be delayed for this umem. */ - if (item->odp_data->mn_counters_active) { + if (umem_odp->mn_counters_active) { /* * This sequence increase will notify the QP page fault that * the page that is going to be mapped in the spte could have * been freed. */ - ++item->odp_data->notifiers_seq; - if (--item->odp_data->notifiers_count == 0) - complete_all(&item->odp_data->notifier_completion); + ++umem_odp->notifiers_seq; + if (--umem_odp->notifiers_count == 0) + complete_all(&umem_odp->notifier_completion); } - mutex_unlock(&item->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); } /* Account for a new mmu notifier in an ib_ucontext. 
*/ @@ -156,20 +156,23 @@ static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) } } -static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, - u64 end, void *cookie) { +static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, + u64 start, u64 end, void *cookie) +{ + struct ib_umem *umem = umem_odp->umem; + /* * Increase the number of notifiers running, to * prevent any further fault handling on this MR. */ - ib_umem_notifier_start_account(item); - item->odp_data->dying = 1; + ib_umem_notifier_start_account(umem_odp); + umem_odp->dying = 1; /* Make sure that the fact the umem is dying is out before we release * all pending page faults. */ smp_wmb(); - complete_all(&item->odp_data->notifier_completion); - item->context->invalidate_range(item, ib_umem_start(item), - ib_umem_end(item)); + complete_all(&umem_odp->notifier_completion); + umem->context->invalidate_range(umem_odp, ib_umem_start(umem), + ib_umem_end(umem)); return 0; } @@ -191,20 +194,20 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, up_read(&context->umem_rwsem); } -static int invalidate_page_trampoline(struct ib_umem *item, u64 start, +static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->context->invalidate_range(item, start, start + PAGE_SIZE); + item->umem->context->invalidate_range(item, start, start + PAGE_SIZE); ib_umem_notifier_end_account(item); return 0; } -static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, - u64 end, void *cookie) +static int invalidate_range_start_trampoline(struct ib_umem_odp *item, + u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->context->invalidate_range(item, start, end); + item->umem->context->invalidate_range(item, start, end); return 0; } @@ -235,7 +238,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, return ret; } -static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, +static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_end_account(item); @@ -271,9 +274,8 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { .invalidate_range_end = ib_umem_notifier_invalidate_range_end, }; -struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size) +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, + unsigned long addr, size_t size) { struct ib_umem *umem; struct ib_umem_odp *odp_data; @@ -326,7 +328,7 @@ struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, umem->odp_data = odp_data; - return umem; + return odp_data; out_page_list: vfree(odp_data->page_list); @@ -462,8 +464,9 @@ out_mm: return ret_val; } -void ib_umem_odp_release(struct ib_umem *umem) +void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { + struct ib_umem *umem = umem_odp->umem; struct ib_ucontext *context = umem->context; /* @@ -472,17 +475,17 @@ void ib_umem_odp_release(struct ib_umem *umem) * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. 
*/ - ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); down_write(&context->umem_rwsem); if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_remove(&umem->odp_data->interval_tree, + rbt_ib_umem_remove(&umem_odp->interval_tree, &context->umem_tree); context->odp_mrs_count--; - if (!umem->odp_data->mn_counters_active) { - list_del(&umem->odp_data->no_private_counters); - complete_all(&umem->odp_data->notifier_completion); + if (!umem_odp->mn_counters_active) { + list_del(&umem_odp->no_private_counters); + complete_all(&umem_odp->notifier_completion); } /* @@ -523,9 +526,9 @@ out_put_task: out: up_read(&context->umem_rwsem); - vfree(umem->odp_data->dma_list); - vfree(umem->odp_data->page_list); - kfree(umem->odp_data); + vfree(umem_odp->dma_list); + vfree(umem_odp->page_list); + kfree(umem_odp); kfree(umem); } @@ -538,7 +541,7 @@ out: * @access_mask: access permissions needed for this page. * @current_seq: sequence number for synchronization with invalidations. * the sequence number is taken from - * umem->odp_data->notifiers_seq. + * umem_odp->notifiers_seq. * * The function returns -EFAULT if the DMA mapping operation fails. It returns * -EAGAIN if a concurrent invalidation prevents us from updating the page. @@ -548,12 +551,13 @@ out: * umem. */ static int ib_umem_odp_map_dma_single_page( - struct ib_umem *umem, + struct ib_umem_odp *umem_odp, int page_index, struct page *page, u64 access_mask, unsigned long current_seq) { + struct ib_umem *umem = umem_odp->umem; struct ib_device *dev = umem->context->device; dma_addr_t dma_addr; int stored_page = 0; @@ -565,11 +569,11 @@ static int ib_umem_odp_map_dma_single_page( * handle case of a racing notifier. This check also allows us to bail * early if we have a notifier running in parallel with us. */ - if (ib_umem_mmu_notifier_retry(umem, current_seq)) { + if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) { ret = -EAGAIN; goto out; } - if (!(umem->odp_data->dma_list[page_index])) { + if (!(umem_odp->dma_list[page_index])) { dma_addr = ib_dma_map_page(dev, page, 0, BIT(umem->page_shift), @@ -578,15 +582,15 @@ static int ib_umem_odp_map_dma_single_page( ret = -EFAULT; goto out; } - umem->odp_data->dma_list[page_index] = dma_addr | access_mask; - umem->odp_data->page_list[page_index] = page; + umem_odp->dma_list[page_index] = dma_addr | access_mask; + umem_odp->page_list[page_index] = page; umem->npages++; stored_page = 1; - } else if (umem->odp_data->page_list[page_index] == page) { - umem->odp_data->dma_list[page_index] |= access_mask; + } else if (umem_odp->page_list[page_index] == page) { + umem_odp->dma_list[page_index] |= access_mask; } else { pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", - umem->odp_data->page_list[page_index], page); + umem_odp->page_list[page_index], page); /* Better remove the mapping now, to prevent any further * damage. */ remove_existing_mapping = 1; @@ -599,7 +603,7 @@ out: if (remove_existing_mapping && umem->context->invalidate_range) { invalidate_page_trampoline( - umem, + umem_odp, ib_umem_start(umem) + (page_index >> umem->page_shift), ib_umem_start(umem) + ((page_index + 1) >> umem->page_shift), @@ -615,7 +619,7 @@ out: * * Pins the range of pages passed in the argument, and maps them to * DMA addresses. The DMA addresses of the mapped pages is updated in - * umem->odp_data->dma_list. + * umem_odp->dma_list. 
* * Returns the number of pages mapped in success, negative error code * for failure. @@ -623,7 +627,7 @@ out: * the function from completing its task. * An -ENOENT error code indicates that userspace process is being terminated * and mm was already destroyed. - * @umem: the umem to map and pin + * @umem_odp: the umem to map and pin * @user_virt: the address from which we need to map. * @bcnt: the minimal number of bytes to pin and map. The mapping might be * bigger due to alignment, and may also be smaller in case of an error @@ -633,11 +637,13 @@ out: * range. * @current_seq: the MMU notifiers sequance value for synchronization with * invalidations. the sequance number is read from - * umem->odp_data->notifiers_seq before calling this function + * umem_odp->notifiers_seq before calling this function */ -int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, - u64 access_mask, unsigned long current_seq) +int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, + u64 bcnt, u64 access_mask, + unsigned long current_seq) { + struct ib_umem *umem = umem_odp->umem; struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = NULL; struct page **local_page_list = NULL; @@ -703,7 +709,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, break; bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); - mutex_lock(&umem->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { if (user_virt & ~page_mask) { p += PAGE_SIZE; @@ -716,7 +722,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, } ret = ib_umem_odp_map_dma_single_page( - umem, k, local_page_list[j], + umem_odp, k, local_page_list[j], access_mask, current_seq); if (ret < 0) break; @@ -724,7 +730,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, p = page_to_phys(local_page_list[j]); k++; } - mutex_unlock(&umem->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); if (ret < 0) { /* Release left over pages when handling errors. */ @@ -750,9 +756,10 @@ out_no_task: } EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); -void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { + struct ib_umem *umem = umem_odp->umem; int idx; u64 addr; struct ib_device *dev = umem->context->device; @@ -764,12 +771,12 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, * faults from completion. We might be racing with other * invalidations, so we must make sure we free each page only * once. 
*/ - mutex_lock(&umem->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) { idx = (addr - ib_umem_start(umem)) >> umem->page_shift; - if (umem->odp_data->page_list[idx]) { - struct page *page = umem->odp_data->page_list[idx]; - dma_addr_t dma = umem->odp_data->dma_list[idx]; + if (umem_odp->page_list[idx]) { + struct page *page = umem_odp->page_list[idx]; + dma_addr_t dma = umem_odp->dma_list[idx]; dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; WARN_ON(!dma_addr); @@ -792,12 +799,12 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, /* on demand pinning support */ if (!umem->context->invalidate_range) put_page(page); - umem->odp_data->page_list[idx] = NULL; - umem->odp_data->dma_list[idx] = 0; + umem_odp->page_list[idx] = NULL; + umem_odp->dma_list[idx] = 0; umem->npages--; } } - mutex_unlock(&umem->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); } EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); @@ -824,7 +831,7 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, return -EAGAIN; next = rbt_ib_umem_iter_next(node, start, last - 1); umem = container_of(node, struct ib_umem_odp, interval_tree); - ret_val = cb(umem->umem, start, last, cookie) || ret_val; + ret_val = cb(umem, start, last, cookie) || ret_val; } return ret_val; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 81154b598266..dc34ffa4c8b3 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1150,7 +1150,7 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); -void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, +void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 9fb1d9cb9401..affbf2831ccd 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1631,7 +1631,8 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ if (umem->odp_data->page_list) - mlx5_ib_invalidate_range(umem, ib_umem_start(umem), + mlx5_ib_invalidate_range(to_ib_umem_odp(umem), + ib_umem_start(umem), ib_umem_end(umem)); else mlx5_ib_free_implicit_mr(mr); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index d216e0d2921d..8f4a4a8171eb 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -170,22 +170,24 @@ static void mr_leaf_free_action(struct work_struct *work) wake_up(&imr->q_leaf_free); } -void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, +void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end) { struct mlx5_ib_mr *mr; const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) - 1; u64 idx = 0, blk_start_idx = 0; + struct ib_umem *umem; int in_block = 0; u64 addr; - if (!umem || !umem->odp_data) { + if (!umem_odp) { pr_err("invalidation called on NULL umem or non-ODP umem\n"); return; } + umem = umem_odp->umem; - mr = umem->odp_data->private; + mr = umem_odp->private; if (!mr || !mr->ibmr.pd) 
return; @@ -208,7 +210,7 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, * estimate the cost of another UMR vs. the cost of bigger * UMR. */ - if (umem->odp_data->dma_list[idx] & + if (umem_odp->dma_list[idx] & (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { if (!in_block) { blk_start_idx = idx; @@ -237,13 +239,13 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, * needed. */ - ib_umem_odp_unmap_dma_pages(umem, start, end); + ib_umem_odp_unmap_dma_pages(umem_odp, start, end); if (unlikely(!umem->npages && mr->parent && - !umem->odp_data->dying)) { - WRITE_ONCE(umem->odp_data->dying, 1); + !umem_odp->dying)) { + WRITE_ONCE(umem_odp->dying, 1); atomic_inc(&mr->parent->num_leaf_free); - schedule_work(&umem->odp_data->work); + schedule_work(&umem_odp->work); } } @@ -372,7 +374,6 @@ static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, u64 addr = io_virt & MLX5_IMR_MTT_MASK; int nentries = 0, start_idx = 0, ret; struct mlx5_ib_mr *mtt; - struct ib_umem *umem; mutex_lock(&mr->umem->odp_data->umem_mutex); odp = odp_lookup(ctx, addr, 1, mr); @@ -385,22 +386,22 @@ next_mr: if (nentries) nentries++; } else { - umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); - if (IS_ERR(umem)) { + odp = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); + if (IS_ERR(odp)) { mutex_unlock(&mr->umem->odp_data->umem_mutex); - return ERR_CAST(umem); + return ERR_CAST(odp); } - mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags); + mtt = implicit_mr_alloc(mr->ibmr.pd, odp->umem, 0, + mr->access_flags); if (IS_ERR(mtt)) { mutex_unlock(&mr->umem->odp_data->umem_mutex); - ib_umem_release(umem); + ib_umem_release(odp->umem); return ERR_CAST(mtt); } - odp = umem->odp_data; odp->private = mtt; - mtt->umem = umem; + mtt->umem = odp->umem; mtt->mmkey.iova = addr; mtt->parent = mr; INIT_WORK(&odp->work, mr_leaf_free_action); @@ -460,24 +461,24 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, return imr; } -static int mr_leaf_free(struct ib_umem *umem, u64 start, - u64 end, void *cookie) +static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, + void *cookie) { - struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie; + struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; + struct ib_umem *umem = umem_odp->umem; if (mr->parent != imr) return 0; - ib_umem_odp_unmap_dma_pages(umem, - ib_umem_start(umem), + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); - if (umem->odp_data->dying) + if (umem_odp->dying) return 0; - WRITE_ONCE(umem->odp_data->dying, 1); + WRITE_ONCE(umem_odp->dying, 1); atomic_inc(&imr->num_leaf_free); - schedule_work(&umem->odp_data->work); + schedule_work(&umem_odp->work); return 0; } @@ -533,7 +534,7 @@ next_mr: */ smp_rmb(); - ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size, + ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size, access_mask, current_seq); if (ret < 0) @@ -542,7 +543,8 @@ next_mr: np = ret; mutex_lock(&odp->umem_mutex); - if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { + if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem), + current_seq)) { /* * No need to check whether the MTTs really belong to * this MR, since ib_umem_odp_map_dma_pages already diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 381cdf5a9bd1..3ef2975b5fb2 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -82,15 +82,18 @@ struct ib_umem_odp { struct work_struct work; 
}; +static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) +{ + return umem->odp_data; +} + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, int access); -struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size); - -void ib_umem_odp_release(struct ib_umem *umem); +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, + unsigned long addr, size_t size); +void ib_umem_odp_release(struct ib_umem_odp *umem_odp); /* * The lower 2 bits of the DMA address signal the R/W permissions for @@ -105,13 +108,14 @@ void ib_umem_odp_release(struct ib_umem *umem); #define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) -int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, - u64 access_mask, unsigned long current_seq); +int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, + u64 bcnt, u64 access_mask, + unsigned long current_seq); -void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, u64 bound); -typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, +typedef int (*umem_call_back)(struct ib_umem_odp *item, u64 start, u64 end, void *cookie); /* * Call the callback on each ib_umem in the range. Returns the logical or of @@ -129,25 +133,25 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length); -static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, +static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, unsigned long mmu_seq) { /* * This code is strongly based on the KVM code from * mmu_notifier_retry. Should be called with - * the relevant locks taken (item->odp_data->umem_mutex + * the relevant locks taken (umem_odp->umem_mutex * and the ucontext umem_mutex semaphore locked for read). */ /* Do not allow page faults while the new ib_umem hasn't seen a state * with zero notifiers yet, and doesn't have its own valid set of * private counters. */ - if (!item->odp_data->mn_counters_active) + if (!umem_odp->mn_counters_active) return 1; - if (unlikely(item->odp_data->notifiers_count)) + if (unlikely(umem_odp->notifiers_count)) return 1; - if (item->odp_data->notifiers_seq != mmu_seq) + if (umem_odp->notifiers_seq != mmu_seq) return 1; return 0; } @@ -161,14 +165,13 @@ static inline int ib_umem_odp_get(struct ib_ucontext *context, return -EINVAL; } -static inline struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size) +static inline struct ib_umem_odp * +ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) { return ERR_PTR(-EINVAL); } -static inline void ib_umem_odp_release(struct ib_umem *umem) {} +static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a66238d8a2a3..d611ce9df7fb 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -69,6 +69,8 @@ #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN +struct ib_umem_odp; + extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; extern struct workqueue_struct *ib_comp_unbound_wq; @@ -1506,7 +1508,7 @@ struct ib_ucontext { * mmu notifiers registration. 
*/ struct rw_semaphore umem_rwsem; - void (*invalidate_range)(struct ib_umem *umem, + void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); struct mmu_notifier mn; -- cgit v1.2.3 From 41b4deeaa123e62e1037af7a0be547af2e0e05f1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:05 +0300 Subject: RDMA/umem: Make ib_umem_odp into a sub structure of ib_umem These two structures are linked together, use the container_of pattern instead of a double allocation to make the code simpler and easier to follow. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 36 ++++++++++------- drivers/infiniband/core/umem_odp.c | 79 +++++++++++++++----------------------- drivers/infiniband/hw/mlx5/odp.c | 26 ++++++------- include/rdma/ib_umem_odp.h | 11 ++---- 4 files changed, 69 insertions(+), 83 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 971d92ddea8f..88b9b88f90e1 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -108,34 +108,39 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (!can_do_mlock()) return ERR_PTR(-EPERM); - umem = kzalloc(sizeof *umem, GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); + if (access & IB_ACCESS_ON_DEMAND) { + umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + umem->odp_data = to_ib_umem_odp(umem); + } else { + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + } umem->context = context; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = ib_access_writable(access); + umem->owning_mm = mm = current->mm; + mmgrab(mm); if (access & IB_ACCESS_ON_DEMAND) { - ret = ib_umem_odp_get(context, umem, access); + ret = ib_umem_odp_get(to_ib_umem_odp(umem), access); if (ret) goto umem_kfree; return umem; } - umem->owning_mm = mm = current->mm; - mmgrab(mm); - umem->odp_data = NULL; - /* We assume the memory is from hugetlb until proved otherwise */ umem->hugetlb = 1; page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { ret = -ENOMEM; - goto umem_kfree_drop; + goto umem_kfree; } /* @@ -226,12 +231,11 @@ out: if (vma_list) free_page((unsigned long) vma_list); free_page((unsigned long) page_list); -umem_kfree_drop: - if (ret) - mmdrop(umem->owning_mm); umem_kfree: - if (ret) + if (ret) { + mmdrop(umem->owning_mm); kfree(umem); + } return ret ? 
ERR_PTR(ret) : umem; } EXPORT_SYMBOL(ib_umem_get); @@ -239,7 +243,10 @@ EXPORT_SYMBOL(ib_umem_get); static void __ib_umem_release_tail(struct ib_umem *umem) { mmdrop(umem->owning_mm); - kfree(umem); + if (umem->odp_data) + kfree(to_ib_umem_odp(umem)); + else + kfree(umem); } static void ib_umem_release_defer(struct work_struct *work) @@ -263,6 +270,7 @@ void ib_umem_release(struct ib_umem *umem) if (umem->odp_data) { ib_umem_odp_release(to_ib_umem_odp(umem)); + __ib_umem_release_tail(umem); return; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 8405e9afd7dc..900fdedfe910 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -58,7 +58,7 @@ static u64 node_start(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_start(umem_odp->umem); + return ib_umem_start(&umem_odp->umem); } /* Note that the representation of the intervals in the interval tree @@ -71,7 +71,7 @@ static u64 node_last(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_end(umem_odp->umem) - 1; + return ib_umem_end(&umem_odp->umem) - 1; } INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, @@ -159,7 +159,7 @@ static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; /* * Increase the number of notifiers running, to @@ -198,7 +198,7 @@ static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->umem->context->invalidate_range(item, start, start + PAGE_SIZE); + item->umem.context->invalidate_range(item, start, start + PAGE_SIZE); ib_umem_notifier_end_account(item); return 0; } @@ -207,7 +207,7 @@ static int invalidate_range_start_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->umem->context->invalidate_range(item, start, end); + item->umem.context->invalidate_range(item, start, end); return 0; } @@ -277,28 +277,21 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) { - struct ib_umem *umem; struct ib_umem_odp *odp_data; + struct ib_umem *umem; int pages = size >> PAGE_SHIFT; int ret; - umem = kzalloc(sizeof(*umem), GFP_KERNEL); - if (!umem) + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); + if (!odp_data) return ERR_PTR(-ENOMEM); - + umem = &odp_data->umem; umem->context = context; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = 1; - odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); - if (!odp_data) { - ret = -ENOMEM; - goto out_umem; - } - odp_data->umem = umem; - mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -334,15 +327,14 @@ out_page_list: vfree(odp_data->page_list); out_odp_data: kfree(odp_data); -out_umem: - kfree(umem); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_alloc_odp_umem); -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, - int access) +int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { + struct ib_ucontext *context = umem_odp->umem.context; + struct ib_umem *umem = &umem_odp->umem; 
int ret_val; struct pid *our_pid; struct mm_struct *mm = get_task_mm(current); @@ -378,30 +370,23 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, goto out_mm; } - umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); - if (!umem->odp_data) { - ret_val = -ENOMEM; - goto out_mm; - } - umem->odp_data->umem = umem; - - mutex_init(&umem->odp_data->umem_mutex); + mutex_init(&umem_odp->umem_mutex); - init_completion(&umem->odp_data->notifier_completion); + init_completion(&umem_odp->notifier_completion); if (ib_umem_num_pages(umem)) { - umem->odp_data->page_list = - vzalloc(array_size(sizeof(*umem->odp_data->page_list), + umem_odp->page_list = + vzalloc(array_size(sizeof(*umem_odp->page_list), ib_umem_num_pages(umem))); - if (!umem->odp_data->page_list) { + if (!umem_odp->page_list) { ret_val = -ENOMEM; - goto out_odp_data; + goto out_mm; } - umem->odp_data->dma_list = - vzalloc(array_size(sizeof(*umem->odp_data->dma_list), + umem_odp->dma_list = + vzalloc(array_size(sizeof(*umem_odp->dma_list), ib_umem_num_pages(umem))); - if (!umem->odp_data->dma_list) { + if (!umem_odp->dma_list) { ret_val = -ENOMEM; goto out_page_list; } @@ -415,13 +400,13 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, down_write(&context->umem_rwsem); context->odp_mrs_count++; if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_insert(&umem->odp_data->interval_tree, + rbt_ib_umem_insert(&umem_odp->interval_tree, &context->umem_tree); if (likely(!atomic_read(&context->notifier_count)) || context->odp_mrs_count == 1) - umem->odp_data->mn_counters_active = true; + umem_odp->mn_counters_active = true; else - list_add(&umem->odp_data->no_private_counters, + list_add(&umem_odp->no_private_counters, &context->no_private_counters); downgrade_write(&context->umem_rwsem); @@ -454,11 +439,9 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, out_mutex: up_read(&context->umem_rwsem); - vfree(umem->odp_data->dma_list); + vfree(umem_odp->dma_list); out_page_list: - vfree(umem->odp_data->page_list); -out_odp_data: - kfree(umem->odp_data); + vfree(umem_odp->page_list); out_mm: mmput(mm); return ret_val; @@ -466,7 +449,7 @@ out_mm: void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; struct ib_ucontext *context = umem->context; /* @@ -528,8 +511,6 @@ out: vfree(umem_odp->dma_list); vfree(umem_odp->page_list); - kfree(umem_odp); - kfree(umem); } /* @@ -557,7 +538,7 @@ static int ib_umem_odp_map_dma_single_page( u64 access_mask, unsigned long current_seq) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; struct ib_device *dev = umem->context->device; dma_addr_t dma_addr; int stored_page = 0; @@ -643,7 +624,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, u64 bcnt, u64 access_mask, unsigned long current_seq) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = NULL; struct page **local_page_list = NULL; @@ -759,7 +740,7 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; int idx; u64 addr; struct ib_device *dev = umem->context->device; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 
8f4a4a8171eb..5b9fd56186bd 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -64,7 +64,7 @@ static int check_parent(struct ib_umem_odp *odp, static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) { struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; - struct ib_ucontext *ctx = odp->umem->context; + struct ib_ucontext *ctx = odp->umem.context; struct rb_node *rb; down_read(&ctx->umem_rwsem); @@ -102,7 +102,7 @@ static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, if (!rb) goto not_found; odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (ib_umem_start(odp->umem) > start + length) + if (ib_umem_start(&odp->umem) > start + length) goto not_found; } not_found: @@ -137,7 +137,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, for (i = 0; i < nentries; i++, pklm++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); va = (offset + i) * MLX5_IMR_MTT_SIZE; - if (odp && odp->umem->address == va) { + if (odp && odp->umem.address == va) { struct mlx5_ib_mr *mtt = odp->private; pklm->key = cpu_to_be32(mtt->ibmr.lkey); @@ -153,13 +153,13 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, static void mr_leaf_free_action(struct work_struct *work) { struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); - int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT; + int idx = ib_umem_start(&odp->umem) >> MLX5_IMR_MTT_SHIFT; struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; mr->parent = NULL; synchronize_srcu(&mr->dev->mr_srcu); - ib_umem_release(odp->umem); + ib_umem_release(&odp->umem); if (imr->live) mlx5_ib_update_xlt(imr, idx, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | @@ -185,7 +185,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, pr_err("invalidation called on NULL umem or non-ODP umem\n"); return; } - umem = umem_odp->umem; + umem = &umem_odp->umem; mr = umem_odp->private; @@ -392,16 +392,16 @@ next_mr: return ERR_CAST(odp); } - mtt = implicit_mr_alloc(mr->ibmr.pd, odp->umem, 0, + mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0, mr->access_flags); if (IS_ERR(mtt)) { mutex_unlock(&mr->umem->odp_data->umem_mutex); - ib_umem_release(odp->umem); + ib_umem_release(&odp->umem); return ERR_CAST(mtt); } odp->private = mtt; - mtt->umem = odp->umem; + mtt->umem = &odp->umem; mtt->mmkey.iova = addr; mtt->parent = mr; INIT_WORK(&odp->work, mr_leaf_free_action); @@ -418,7 +418,7 @@ next_mr: addr += MLX5_IMR_MTT_SIZE; if (unlikely(addr < io_virt + bcnt)) { odp = odp_next(odp); - if (odp && odp->umem->address != addr) + if (odp && odp->umem.address != addr) odp = NULL; goto next_mr; } @@ -465,7 +465,7 @@ static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; if (mr->parent != imr) return 0; @@ -518,7 +518,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, } next_mr: - size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt); + size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt); page_shift = mr->umem->page_shift; page_mask = ~(BIT(page_shift) - 1); @@ -577,7 +577,7 @@ next_mr: io_virt += size; next = odp_next(odp); - if (unlikely(!next || next->umem->address != io_virt)) { + if (unlikely(!next || next->umem.address != io_virt)) { mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. 
got %p\n", io_virt, next); return -EAGAIN; diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 3ef2975b5fb2..4519ea663df5 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -43,6 +43,7 @@ struct umem_odp_node { }; struct ib_umem_odp { + struct ib_umem umem; /* * An array of the pages included in the on-demand paging umem. * Indices of pages that are currently not mapped into the device will @@ -72,7 +73,6 @@ struct ib_umem_odp { /* A linked list of umems that don't have private mmu notifier * counters yet. */ struct list_head no_private_counters; - struct ib_umem *umem; /* Tree tracking */ struct umem_odp_node interval_tree; @@ -84,13 +84,12 @@ struct ib_umem_odp { static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) { - return umem->odp_data; + return container_of(umem, struct ib_umem_odp, umem); } #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, - int access); +int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size); void ib_umem_odp_release(struct ib_umem_odp *umem_odp); @@ -158,9 +157,7 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ -static inline int ib_umem_odp_get(struct ib_ucontext *context, - struct ib_umem *umem, - int access) +static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { return -EINVAL; } -- cgit v1.2.3 From 597ecc5a095406a668e53ab330495ddb65327f77 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:06 +0300 Subject: RDMA/umem: Get rid of struct ib_umem.odp_data This no longer has any use, we can use container_of to get to the umem_odp, and a simple flag to indicate if this is an odp MR. Remove the few remaining references to it. 
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 8 ++++---- drivers/infiniband/core/umem_odp.c | 3 +-- drivers/infiniband/hw/mlx5/mem.c | 9 ++++----- drivers/infiniband/hw/mlx5/mr.c | 13 +++++++------ drivers/infiniband/hw/mlx5/odp.c | 14 ++++++++------ include/rdma/ib_umem.h | 6 +++--- 6 files changed, 27 insertions(+), 26 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 88b9b88f90e1..fec5d489e311 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -112,7 +112,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); - umem->odp_data = to_ib_umem_odp(umem); + umem->is_odp = 1; } else { umem = kzalloc(sizeof(*umem), GFP_KERNEL); if (!umem) @@ -243,7 +243,7 @@ EXPORT_SYMBOL(ib_umem_get); static void __ib_umem_release_tail(struct ib_umem *umem) { mmdrop(umem->owning_mm); - if (umem->odp_data) + if (umem->is_odp) kfree(to_ib_umem_odp(umem)); else kfree(umem); @@ -268,7 +268,7 @@ void ib_umem_release(struct ib_umem *umem) { struct ib_ucontext *context = umem->context; - if (umem->odp_data) { + if (umem->is_odp) { ib_umem_odp_release(to_ib_umem_odp(umem)); __ib_umem_release_tail(umem); return; @@ -306,7 +306,7 @@ int ib_umem_page_count(struct ib_umem *umem) int n; struct scatterlist *sg; - if (umem->odp_data) + if (umem->is_odp) return ib_umem_num_pages(umem); n = 0; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 900fdedfe910..42272b2bf595 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -291,6 +291,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = 1; + umem->is_odp = 1; mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -319,8 +320,6 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, &context->no_private_counters); up_write(&context->umem_rwsem); - umem->odp_data = odp_data; - return odp_data; out_page_list: diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index f3dbd75a0a96..549234988bb4 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -57,7 +57,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int entry; unsigned long page_shift = umem->page_shift; - if (umem->odp_data) { + if (umem->is_odp) { *ncont = ib_umem_page_count(umem); *count = *ncont << (page_shift - PAGE_SHIFT); *shift = page_shift; @@ -152,14 +152,13 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, struct scatterlist *sg; int entry; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - const bool odp = umem->odp_data != NULL; - - if (odp) { + if (umem->is_odp) { WARN_ON(shift != 0); WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); for (i = 0; i < num_pages; ++i) { - dma_addr_t pa = umem->odp_data->dma_list[offset + i]; + dma_addr_t pa = + to_ib_umem_odp(umem)->dma_list[offset + i]; pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); } diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index affbf2831ccd..6aac3a107330 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -98,7 +98,7 @@ static bool use_umr_mtt_update(struct 
mlx5_ib_mr *mr, u64 start, u64 length) #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING static void update_odp_mr(struct mlx5_ib_mr *mr) { - if (mr->umem->odp_data) { + if (mr->umem->is_odp) { /* * This barrier prevents the compiler from moving the * setting of umem->odp_data->private to point to our @@ -107,7 +107,7 @@ static void update_odp_mr(struct mlx5_ib_mr *mr) * handle invalidations. */ smp_wmb(); - mr->umem->odp_data->private = mr; + to_ib_umem_odp(mr->umem)->private = mr; /* * Make sure we will see the new * umem->odp_data->private value in the invalidation @@ -1624,15 +1624,16 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) struct ib_umem *umem = mr->umem; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - if (umem && umem->odp_data) { + if (umem && umem->is_odp) { + struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem); + /* Prevent new page faults from succeeding */ mr->live = 0; /* Wait for all running page-fault handlers to finish. */ synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ - if (umem->odp_data->page_list) - mlx5_ib_invalidate_range(to_ib_umem_odp(umem), - ib_umem_start(umem), + if (umem_odp->page_list) + mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); else mlx5_ib_free_implicit_mr(mr); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 5b9fd56186bd..d4780bded74a 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -371,11 +371,12 @@ static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); struct ib_umem_odp *odp, *result = NULL; + struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); u64 addr = io_virt & MLX5_IMR_MTT_MASK; int nentries = 0, start_idx = 0, ret; struct mlx5_ib_mr *mtt; - mutex_lock(&mr->umem->odp_data->umem_mutex); + mutex_lock(&odp_mr->umem_mutex); odp = odp_lookup(ctx, addr, 1, mr); mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", @@ -388,14 +389,14 @@ next_mr: } else { odp = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); if (IS_ERR(odp)) { - mutex_unlock(&mr->umem->odp_data->umem_mutex); + mutex_unlock(&odp_mr->umem_mutex); return ERR_CAST(odp); } mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0, mr->access_flags); if (IS_ERR(mtt)) { - mutex_unlock(&mr->umem->odp_data->umem_mutex); + mutex_unlock(&odp_mr->umem_mutex); ib_umem_release(&odp->umem); return ERR_CAST(mtt); } @@ -433,7 +434,7 @@ next_mr: } } - mutex_unlock(&mr->umem->odp_data->umem_mutex); + mutex_unlock(&odp_mr->umem_mutex); return result; } @@ -498,6 +499,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, u32 *bytes_mapped) { + struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); u64 access_mask = ODP_READ_ALLOWED_BIT; int npages = 0, page_shift, np; u64 start_idx, page_mask; @@ -506,7 +508,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, size_t size; int ret; - if (!mr->umem->odp_data->page_list) { + if (!odp_mr->page_list) { odp = implicit_mr_get_data(mr, io_virt, bcnt); if (IS_ERR(odp)) @@ -514,7 +516,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, mr = odp->private; } else { - odp = mr->umem->odp_data; + odp = odp_mr; } next_mr: diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index e1c00b2ead19..5d3755ec5afa 100644 --- a/include/rdma/ib_umem.h 
+++ b/include/rdma/ib_umem.h @@ -46,10 +46,10 @@ struct ib_umem { size_t length; unsigned long address; int page_shift; - int writable; - int hugetlb; + u32 writable : 1; + u32 hugetlb : 1; + u32 is_odp : 1; struct work_struct work; - struct ib_umem_odp *odp_data; struct sg_table sg_head; int nmap; int npages; -- cgit v1.2.3 From c9990ab39b6e911003bab10a6da96e98ab1503a3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:07 +0300 Subject: RDMA/umem: Move all the ODP related stuff out of ucontext and into per_mm This is the first step to make ODP use the owning_mm that is now part of struct ib_umem. Each ODP umem is linked to a single per_mm structure, which, in turn, is linked to a single mm via the embedded mmu_notifier. This first patch introduces the structure and reworks everything to use it. This also needs to introduce tgid into the ib_ucontext_per_mm, as get_user_pages_remote() requires the originating task for statistics tracking. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 127 +++++++++++++++++++---------------- drivers/infiniband/core/uverbs_cmd.c | 9 +-- drivers/infiniband/hw/mlx5/odp.c | 43 +++++++----- include/rdma/ib_umem_odp.h | 2 + include/rdma/ib_verbs.h | 32 +++++---- 5 files changed, 120 insertions(+), 93 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 42272b2bf595..6bf3fc0c12a1 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -115,34 +115,35 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) } /* Account for a new mmu notifier in an ib_ucontext. */ -static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) +static void +ib_ucontext_notifier_start_account(struct ib_ucontext_per_mm *per_mm) { - atomic_inc(&context->notifier_count); + atomic_inc(&per_mm->notifier_count); } /* Account for a terminating mmu notifier in an ib_ucontext. * * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since * the function takes the semaphore itself. */ -static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) +static void ib_ucontext_notifier_end_account(struct ib_ucontext_per_mm *per_mm) { - int zero_notifiers = atomic_dec_and_test(&context->notifier_count); + int zero_notifiers = atomic_dec_and_test(&per_mm->notifier_count); if (zero_notifiers && - !list_empty(&context->no_private_counters)) { + !list_empty(&per_mm->no_private_counters)) { /* No currently running mmu notifiers. Now is the chance to * add private accounting to all previously added umems. */ struct ib_umem_odp *odp_data, *next; /* Prevent concurrent mmu notifiers from working on the * no_private_counters list. */ - down_write(&context->umem_rwsem); + down_write(&per_mm->umem_rwsem); /* Read the notifier_count again, with the umem_rwsem * semaphore taken for write.
*/ - if (!atomic_read(&context->notifier_count)) { + if (!atomic_read(&per_mm->notifier_count)) { list_for_each_entry_safe(odp_data, next, - &context->no_private_counters, + &per_mm->no_private_counters, no_private_counters) { mutex_lock(&odp_data->umem_mutex); odp_data->mn_counters_active = true; @@ -152,7 +153,7 @@ static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) } } - up_write(&context->umem_rwsem); + up_write(&per_mm->umem_rwsem); } } @@ -179,19 +180,20 @@ static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, static void ib_umem_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); - if (!context->invalidate_range) + if (!per_mm->context->invalidate_range) return; - ib_ucontext_notifier_start_account(context); - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, + ib_ucontext_notifier_start_account(per_mm); + down_read(&per_mm->umem_rwsem); + rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, ib_umem_notifier_release_trampoline, true, NULL); - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); } static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start, @@ -217,23 +219,24 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, unsigned long end, bool blockable) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); int ret; - if (!context->invalidate_range) + if (!per_mm->context->invalidate_range) return 0; if (blockable) - down_read(&context->umem_rwsem); - else if (!down_read_trylock(&context->umem_rwsem)) + down_read(&per_mm->umem_rwsem); + else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - ib_ucontext_notifier_start_account(context); - ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + ib_ucontext_notifier_start_account(per_mm); + ret = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_start_trampoline, blockable, NULL); - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); return ret; } @@ -250,9 +253,10 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, unsigned long start, unsigned long end) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); - if (!context->invalidate_range) + if (!per_mm->context->invalidate_range) return; /* @@ -260,12 +264,12 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, * in ib_umem_notifier_invalidate_range_start so we shouldn't really block * here. But this is ugly and fragile. 
*/ - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + down_read(&per_mm->umem_rwsem); + rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_end_trampoline, true, NULL); - up_read(&context->umem_rwsem); - ib_ucontext_notifier_end_account(context); + up_read(&per_mm->umem_rwsem); + ib_ucontext_notifier_end_account(per_mm); } static const struct mmu_notifier_ops ib_umem_notifiers = { @@ -277,6 +281,7 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) { + struct ib_ucontext_per_mm *per_mm; struct ib_umem_odp *odp_data; struct ib_umem *umem; int pages = size >> PAGE_SHIFT; @@ -292,6 +297,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, umem->page_shift = PAGE_SHIFT; umem->writable = 1; umem->is_odp = 1; + odp_data->per_mm = per_mm = &context->per_mm; mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -310,15 +316,15 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, goto out_page_list; } - down_write(&context->umem_rwsem); - context->odp_mrs_count++; - rbt_ib_umem_insert(&odp_data->interval_tree, &context->umem_tree); - if (likely(!atomic_read(&context->notifier_count))) + down_write(&per_mm->umem_rwsem); + per_mm->odp_mrs_count++; + rbt_ib_umem_insert(&odp_data->interval_tree, &per_mm->umem_tree); + if (likely(!atomic_read(&per_mm->notifier_count))) odp_data->mn_counters_active = true; else list_add(&odp_data->no_private_counters, - &context->no_private_counters); - up_write(&context->umem_rwsem); + &per_mm->no_private_counters); + up_write(&per_mm->umem_rwsem); return odp_data; @@ -334,6 +340,7 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { struct ib_ucontext *context = umem_odp->umem.context; struct ib_umem *umem = &umem_odp->umem; + struct ib_ucontext_per_mm *per_mm; int ret_val; struct pid *our_pid; struct mm_struct *mm = get_task_mm(current); @@ -396,28 +403,30 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) * notification before the "current" task (and MM) is * destroyed. We use the umem_rwsem semaphore to synchronize. */ - down_write(&context->umem_rwsem); - context->odp_mrs_count++; + umem_odp->per_mm = per_mm = &context->per_mm; + + down_write(&per_mm->umem_rwsem); + per_mm->odp_mrs_count++; if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_insert(&umem_odp->interval_tree, - &context->umem_tree); - if (likely(!atomic_read(&context->notifier_count)) || - context->odp_mrs_count == 1) + &per_mm->umem_tree); + if (likely(!atomic_read(&per_mm->notifier_count)) || + per_mm->odp_mrs_count == 1) umem_odp->mn_counters_active = true; else list_add(&umem_odp->no_private_counters, - &context->no_private_counters); - downgrade_write(&context->umem_rwsem); + &per_mm->no_private_counters); + downgrade_write(&per_mm->umem_rwsem); - if (context->odp_mrs_count == 1) { + if (per_mm->odp_mrs_count == 1) { /* * Note that at this point, no MMU notifier is running - * for this context! + * for this per_mm! 
*/ - atomic_set(&context->notifier_count, 0); - INIT_HLIST_NODE(&context->mn.hlist); - context->mn.ops = &ib_umem_notifiers; - ret_val = mmu_notifier_register(&context->mn, mm); + atomic_set(&per_mm->notifier_count, 0); + INIT_HLIST_NODE(&per_mm->mn.hlist); + per_mm->mn.ops = &ib_umem_notifiers; + ret_val = mmu_notifier_register(&per_mm->mn, mm); if (ret_val) { pr_err("Failed to register mmu_notifier %d\n", ret_val); ret_val = -EBUSY; @@ -425,7 +434,7 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) } } - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); /* * Note that doing an mmput can cause a notifier for the relevant mm. @@ -437,7 +446,7 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) return 0; out_mutex: - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); vfree(umem_odp->dma_list); out_page_list: vfree(umem_odp->page_list); @@ -449,7 +458,7 @@ out_mm: void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { struct ib_umem *umem = &umem_odp->umem; - struct ib_ucontext *context = umem->context; + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; /* * Ensure that no more pages are mapped in the umem. @@ -460,11 +469,11 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); - down_write(&context->umem_rwsem); + down_write(&per_mm->umem_rwsem); if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_remove(&umem_odp->interval_tree, - &context->umem_tree); - context->odp_mrs_count--; + &per_mm->umem_tree); + per_mm->odp_mrs_count--; if (!umem_odp->mn_counters_active) { list_del(&umem_odp->no_private_counters); complete_all(&umem_odp->notifier_completion); @@ -477,13 +486,13 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) * that since we are doing it atomically, no other user could register * and unregister while we do the check. */ - downgrade_write(&context->umem_rwsem); - if (!context->odp_mrs_count) { + downgrade_write(&per_mm->umem_rwsem); + if (!per_mm->odp_mrs_count) { struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = NULL; - owning_process = get_pid_task(context->tgid, - PIDTYPE_PID); + owning_process = + get_pid_task(umem_odp->umem.context->tgid, PIDTYPE_PID); if (owning_process == NULL) /* * The process is already dead, notifier were removed @@ -498,7 +507,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) * removed already. 
*/ goto out_put_task; - mmu_notifier_unregister(&context->mn, owning_mm); + mmu_notifier_unregister(&per_mm->mn, owning_mm); mmput(owning_mm); @@ -506,7 +515,7 @@ out_put_task: put_task_struct(owning_process); } out: - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); vfree(umem_odp->dma_list); vfree(umem_odp->page_list); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 9c87c98a0f19..ce678e1008a4 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -124,10 +124,11 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->cleanup_retryable = false; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - ucontext->umem_tree = RB_ROOT_CACHED; - init_rwsem(&ucontext->umem_rwsem); - ucontext->odp_mrs_count = 0; - INIT_LIST_HEAD(&ucontext->no_private_counters); + ucontext->per_mm.umem_tree = RB_ROOT_CACHED; + init_rwsem(&ucontext->per_mm.umem_rwsem); + ucontext->per_mm.odp_mrs_count = 0; + INIT_LIST_HEAD(&ucontext->per_mm.no_private_counters); + ucontext->per_mm.context = ucontext; if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index d4780bded74a..9982b5f4e598 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -61,13 +61,21 @@ static int check_parent(struct ib_umem_odp *odp, return mr && mr->parent == parent && !odp->dying; } +struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr) +{ + if (WARN_ON(!mr || !mr->umem || !mr->umem->is_odp)) + return NULL; + + return to_ib_umem_odp(mr->umem)->per_mm; +} + static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) { struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; - struct ib_ucontext *ctx = odp->umem.context; + struct ib_ucontext_per_mm *per_mm = odp->per_mm; struct rb_node *rb; - down_read(&ctx->umem_rwsem); + down_read(&per_mm->umem_rwsem); while (1) { rb = rb_next(&odp->interval_tree.rb); if (!rb) @@ -79,19 +87,19 @@ static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) not_found: odp = NULL; end: - up_read(&ctx->umem_rwsem); + up_read(&per_mm->umem_rwsem); return odp; } -static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, - u64 start, u64 length, +static struct ib_umem_odp *odp_lookup(u64 start, u64 length, struct mlx5_ib_mr *parent) { + struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent); struct ib_umem_odp *odp; struct rb_node *rb; - down_read(&ctx->umem_rwsem); - odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length); + down_read(&per_mm->umem_rwsem); + odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length); if (!odp) goto end; @@ -108,7 +116,7 @@ static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, not_found: odp = NULL; end: - up_read(&ctx->umem_rwsem); + up_read(&per_mm->umem_rwsem); return odp; } @@ -116,7 +124,6 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, size_t nentries, struct mlx5_ib_mr *mr, int flags) { struct ib_pd *pd = mr->ibmr.pd; - struct ib_ucontext *ctx = pd->uobject->context; struct mlx5_ib_dev *dev = to_mdev(pd->device); struct ib_umem_odp *odp; unsigned long va; @@ -131,8 +138,8 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, return; } - odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE, - nentries * MLX5_IMR_MTT_SIZE, mr); + odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE, + nentries * MLX5_IMR_MTT_SIZE, mr); for (i = 0; i < nentries; i++, pklm++) { 
pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); @@ -368,7 +375,6 @@ fail: static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt) { - struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); struct ib_umem_odp *odp, *result = NULL; struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); @@ -377,7 +383,7 @@ static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, struct mlx5_ib_mr *mtt; mutex_lock(&odp_mr->umem_mutex); - odp = odp_lookup(ctx, addr, 1, mr); + odp = odp_lookup(addr, 1, mr); mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", io_virt, bcnt, addr, odp); @@ -387,7 +393,8 @@ next_mr: if (nentries) nentries++; } else { - odp = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); + odp = ib_alloc_odp_umem(odp_mr->umem.context, addr, + MLX5_IMR_MTT_SIZE); if (IS_ERR(odp)) { mutex_unlock(&odp_mr->umem_mutex); return ERR_CAST(odp); @@ -486,12 +493,12 @@ static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) { - struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context; + struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); - down_read(&ctx->umem_rwsem); - rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, + down_read(&per_mm->umem_rwsem); + rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, mr_leaf_free, true, imr); - up_read(&ctx->umem_rwsem); + up_read(&per_mm->umem_rwsem); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); } diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 4519ea663df5..394ea6b68db7 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -44,6 +44,8 @@ struct umem_odp_node { struct ib_umem_odp { struct ib_umem umem; + struct ib_ucontext_per_mm *per_mm; + /* * An array of the pages included in the on-demand paging umem. * Indices of pages that are currently not mapped into the device will diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index d611ce9df7fb..2cf2cee5a753 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1488,6 +1488,25 @@ struct ib_rdmacg_object { #endif }; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +struct ib_ucontext_per_mm { + struct ib_ucontext *context; + + struct rb_root_cached umem_tree; + /* + * Protects .umem_rbroot and tree, as well as odp_mrs_count and + * mmu notifiers registration. + */ + struct rw_semaphore umem_rwsem; + + struct mmu_notifier mn; + atomic_t notifier_count; + /* A list of umems that don't have private mmu notifier counters yet. */ + struct list_head no_private_counters; + unsigned int odp_mrs_count; +}; +#endif + struct ib_ucontext { struct ib_device *device; struct ib_uverbs_file *ufile; @@ -1502,20 +1521,9 @@ struct ib_ucontext { struct pid *tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - struct rb_root_cached umem_tree; - /* - * Protects .umem_rbroot and tree, as well as odp_mrs_count and - * mmu notifiers registration. - */ - struct rw_semaphore umem_rwsem; void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); - - struct mmu_notifier mn; - atomic_t notifier_count; - /* A list of umems that don't have private mmu notifier counters yet. 
*/ - struct list_head no_private_counters; - int odp_mrs_count; + struct ib_ucontext_per_mm per_mm; #endif struct ib_rdmacg_object cg_obj; -- cgit v1.2.3 From f27a0d50a4bc2861b472c2e3740d63a29d1ac460 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:08 +0300 Subject: RDMA/umem: Use umem->owning_mm inside ODP Since ODP had a single struct mmu_notifier located in the ucontext it could only handle a single MM at a time, and this prevented it from using the new owning_mm system. With the prior rework it is now simple to let ODP track multiple MMs per ucontext, so finish the job: allocate the per_mm on an mm-by-mm basis and free it when the last umem is dropped from the ucontext. As a side effect the new saner locking removes the lockdep splat about nesting the umem_rwsem between mmu_notifier_unregister and ib_umem_odp_release. It also makes ODP work with multiple processes, across fork, etc. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 301 +++++++++++++++++++---------------- drivers/infiniband/core/uverbs_cmd.c | 8 +- drivers/infiniband/hw/mlx5/main.c | 7 + drivers/infiniband/hw/mlx5/odp.c | 2 +- include/rdma/ib_umem_odp.h | 20 ++- include/rdma/ib_verbs.h | 22 +-- 6 files changed, 191 insertions(+), 169 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 6bf3fc0c12a1..0577f9ff600f 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -278,10 +278,135 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { .invalidate_range_end = ib_umem_notifier_invalidate_range_end, }; -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, size_t size) +static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_umem *umem = &umem_odp->umem; + + down_write(&per_mm->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_insert(&umem_odp->interval_tree, + &per_mm->umem_tree); + + if (likely(!atomic_read(&per_mm->notifier_count))) + umem_odp->mn_counters_active = true; + else + list_add(&umem_odp->no_private_counters, + &per_mm->no_private_counters); + up_write(&per_mm->umem_rwsem); +} + +static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_umem *umem = &umem_odp->umem; + + down_write(&per_mm->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_remove(&umem_odp->interval_tree, + &per_mm->umem_tree); + if (!umem_odp->mn_counters_active) { + list_del(&umem_odp->no_private_counters); + complete_all(&umem_odp->notifier_completion); + } + + up_write(&per_mm->umem_rwsem); +} + +static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, + struct mm_struct *mm) { struct ib_ucontext_per_mm *per_mm; + int ret; + + per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); + if (!per_mm) + return ERR_PTR(-ENOMEM); + + per_mm->context = ctx; + per_mm->mm = mm; + per_mm->umem_tree = RB_ROOT_CACHED; + init_rwsem(&per_mm->umem_rwsem); + INIT_LIST_HEAD(&per_mm->no_private_counters); + + rcu_read_lock(); + per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + rcu_read_unlock(); + + WARN_ON(mm != current->mm); + + per_mm->mn.ops = &ib_umem_notifiers; + ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); + if (ret) {
dev_err(&ctx->device->dev, + "Failed to register mmu_notifier %d\n", ret); + goto out_pid; + } + + list_add(&per_mm->ucontext_list, &ctx->per_mm_list); + return per_mm; + +out_pid: + put_pid(per_mm->tgid); + kfree(per_mm); + return ERR_PTR(ret); +} + +static int get_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext *ctx = umem_odp->umem.context; + struct ib_ucontext_per_mm *per_mm; + + /* + * Generally speaking we expect only one or two per_mm in this list, + * so no reason to optimize this search today. + */ + mutex_lock(&ctx->per_mm_list_lock); + list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { + if (per_mm->mm == umem_odp->umem.owning_mm) + goto found; + } + + per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm); + if (IS_ERR(per_mm)) { + mutex_unlock(&ctx->per_mm_list_lock); + return PTR_ERR(per_mm); + } + +found: + umem_odp->per_mm = per_mm; + per_mm->odp_mrs_count++; + mutex_unlock(&ctx->per_mm_list_lock); + + return 0; +} + +void put_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_ucontext *ctx = umem_odp->umem.context; + bool need_free; + + mutex_lock(&ctx->per_mm_list_lock); + umem_odp->per_mm = NULL; + per_mm->odp_mrs_count--; + need_free = per_mm->odp_mrs_count == 0; + if (need_free) + list_del(&per_mm->ucontext_list); + mutex_unlock(&ctx->per_mm_list_lock); + + if (!need_free) + return; + + mmu_notifier_unregister(&per_mm->mn, per_mm->mm); + put_pid(per_mm->tgid); + kfree(per_mm); +} + +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, + unsigned long addr, size_t size) +{ + struct ib_ucontext *ctx = per_mm->context; struct ib_umem_odp *odp_data; struct ib_umem *umem; int pages = size >> PAGE_SHIFT; @@ -291,13 +416,13 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, if (!odp_data) return ERR_PTR(-ENOMEM); umem = &odp_data->umem; - umem->context = context; + umem->context = ctx; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = 1; umem->is_odp = 1; - odp_data->per_mm = per_mm = &context->per_mm; + odp_data->per_mm = per_mm; mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -316,15 +441,14 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, goto out_page_list; } - down_write(&per_mm->umem_rwsem); + /* + * Caller must ensure that the umem_odp that the per_mm came from + * cannot be freed during the call to ib_alloc_odp_umem. 
+ */ + mutex_lock(&ctx->per_mm_list_lock); per_mm->odp_mrs_count++; - rbt_ib_umem_insert(&odp_data->interval_tree, &per_mm->umem_tree); - if (likely(!atomic_read(&per_mm->notifier_count))) - odp_data->mn_counters_active = true; - else - list_add(&odp_data->no_private_counters, - &per_mm->no_private_counters); - up_write(&per_mm->umem_rwsem); + mutex_unlock(&ctx->per_mm_list_lock); + add_umem_to_per_mm(odp_data); return odp_data; @@ -338,15 +462,13 @@ EXPORT_SYMBOL(ib_alloc_odp_umem); int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { - struct ib_ucontext *context = umem_odp->umem.context; struct ib_umem *umem = &umem_odp->umem; - struct ib_ucontext_per_mm *per_mm; + /* + * NOTE: This must called in a process context where umem->owning_mm + * == current->mm + */ + struct mm_struct *mm = umem->owning_mm; int ret_val; - struct pid *our_pid; - struct mm_struct *mm = get_task_mm(current); - - if (!mm) - return -EINVAL; if (access & IB_ACCESS_HUGETLB) { struct vm_area_struct *vma; @@ -366,16 +488,6 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) umem->hugetlb = 0; } - /* Prevent creating ODP MRs in child processes */ - rcu_read_lock(); - our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); - rcu_read_unlock(); - put_pid(our_pid); - if (context->tgid != our_pid) { - ret_val = -EINVAL; - goto out_mm; - } - mutex_init(&umem_odp->umem_mutex); init_completion(&umem_odp->notifier_completion); @@ -384,10 +496,8 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) umem_odp->page_list = vzalloc(array_size(sizeof(*umem_odp->page_list), ib_umem_num_pages(umem))); - if (!umem_odp->page_list) { - ret_val = -ENOMEM; - goto out_mm; - } + if (!umem_odp->page_list) + return -ENOMEM; umem_odp->dma_list = vzalloc(array_size(sizeof(*umem_odp->dma_list), @@ -398,67 +508,23 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) } } - /* - * When using MMU notifiers, we will get a - * notification before the "current" task (and MM) is - * destroyed. We use the umem_rwsem semaphore to synchronize. - */ - umem_odp->per_mm = per_mm = &context->per_mm; - - down_write(&per_mm->umem_rwsem); - per_mm->odp_mrs_count++; - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_insert(&umem_odp->interval_tree, - &per_mm->umem_tree); - if (likely(!atomic_read(&per_mm->notifier_count)) || - per_mm->odp_mrs_count == 1) - umem_odp->mn_counters_active = true; - else - list_add(&umem_odp->no_private_counters, - &per_mm->no_private_counters); - downgrade_write(&per_mm->umem_rwsem); + ret_val = get_per_mm(umem_odp); + if (ret_val) + goto out_dma_list; + add_umem_to_per_mm(umem_odp); - if (per_mm->odp_mrs_count == 1) { - /* - * Note that at this point, no MMU notifier is running - * for this per_mm! - */ - atomic_set(&per_mm->notifier_count, 0); - INIT_HLIST_NODE(&per_mm->mn.hlist); - per_mm->mn.ops = &ib_umem_notifiers; - ret_val = mmu_notifier_register(&per_mm->mn, mm); - if (ret_val) { - pr_err("Failed to register mmu_notifier %d\n", ret_val); - ret_val = -EBUSY; - goto out_mutex; - } - } - - up_read(&per_mm->umem_rwsem); - - /* - * Note that doing an mmput can cause a notifier for the relevant mm. - * If the notifier is called while we hold the umem_rwsem, this will - * cause a deadlock. Therefore, we release the reference only after we - * released the semaphore. 
- */ - mmput(mm); return 0; -out_mutex: - up_read(&per_mm->umem_rwsem); +out_dma_list: vfree(umem_odp->dma_list); out_page_list: vfree(umem_odp->page_list); -out_mm: - mmput(mm); return ret_val; } void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { struct ib_umem *umem = &umem_odp->umem; - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; /* * Ensure that no more pages are mapped in the umem. @@ -469,54 +535,8 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); - down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_remove(&umem_odp->interval_tree, - &per_mm->umem_tree); - per_mm->odp_mrs_count--; - if (!umem_odp->mn_counters_active) { - list_del(&umem_odp->no_private_counters); - complete_all(&umem_odp->notifier_completion); - } - - /* - * Downgrade the lock to a read lock. This ensures that the notifiers - * (who lock the mutex for reading) will be able to finish, and we - * will be able to enventually obtain the mmu notifiers SRCU. Note - * that since we are doing it atomically, no other user could register - * and unregister while we do the check. - */ - downgrade_write(&per_mm->umem_rwsem); - if (!per_mm->odp_mrs_count) { - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - - owning_process = - get_pid_task(umem_odp->umem.context->tgid, PIDTYPE_PID); - if (owning_process == NULL) - /* - * The process is already dead, notifier were removed - * already. - */ - goto out; - - owning_mm = get_task_mm(owning_process); - if (owning_mm == NULL) - /* - * The process' mm is already dead, notifier were - * removed already. - */ - goto out_put_task; - mmu_notifier_unregister(&per_mm->mn, owning_mm); - - mmput(owning_mm); - -out_put_task: - put_task_struct(owning_process); - } -out: - up_read(&per_mm->umem_rwsem); - + remove_umem_from_per_mm(umem_odp); + put_per_mm(umem_odp); vfree(umem_odp->dma_list); vfree(umem_odp->page_list); } @@ -634,7 +654,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, { struct ib_umem *umem = &umem_odp->umem; struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; + struct mm_struct *owning_mm = umem_odp->umem.owning_mm; struct page **local_page_list = NULL; u64 page_mask, off; int j, k, ret = 0, start_idx, npages = 0, page_shift; @@ -658,15 +678,14 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, user_virt = user_virt & page_mask; bcnt += off; /* Charge for the first page offset as well. */ - owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); - if (owning_process == NULL) { + /* + * owning_process is allowed to be NULL, this means somehow the mm is + * existing beyond the lifetime of the originating process.. Presumably + * mmget_not_zero will fail in this case. 
+ */ + owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID); + if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) { ret = -EINVAL; - goto out_no_task; - } - - owning_mm = get_task_mm(owning_process); - if (owning_mm == NULL) { - ret = -ENOENT; goto out_put_task; } @@ -738,8 +757,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, mmput(owning_mm); out_put_task: - put_task_struct(owning_process); -out_no_task: + if (owning_process) + put_task_struct(owning_process); free_page((unsigned long)local_page_list); return ret; } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index ce678e1008a4..d77b0b9793c7 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -124,12 +124,8 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->cleanup_retryable = false; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - ucontext->per_mm.umem_tree = RB_ROOT_CACHED; - init_rwsem(&ucontext->per_mm.umem_rwsem); - ucontext->per_mm.odp_mrs_count = 0; - INIT_LIST_HEAD(&ucontext->per_mm.no_private_counters); - ucontext->per_mm.context = ucontext; - + mutex_init(&ucontext->per_mm_list_lock); + INIT_LIST_HEAD(&ucontext->per_mm_list); if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index aeb328100986..1348a08261a9 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1861,6 +1861,13 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_bfreg_info *bfregi; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* All umem's must be destroyed before destroying the ucontext. */ + mutex_lock(&ibcontext->per_mm_list_lock); + WARN_ON(!list_empty(&ibcontext->per_mm_list)); + mutex_unlock(&ibcontext->per_mm_list_lock); +#endif + if (context->devx_uid) mlx5_ib_devx_destroy(dev, context); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 9982b5f4e598..b04eb6775326 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -393,7 +393,7 @@ next_mr: if (nentries) nentries++; } else { - odp = ib_alloc_odp_umem(odp_mr->umem.context, addr, + odp = ib_alloc_odp_umem(odp_mr->per_mm, addr, MLX5_IMR_MTT_SIZE); if (IS_ERR(odp)) { mutex_unlock(&odp_mr->umem_mutex); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 394ea6b68db7..259eb08dfc9e 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -91,8 +91,26 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +struct ib_ucontext_per_mm { + struct ib_ucontext *context; + struct mm_struct *mm; + struct pid *tgid; + + struct rb_root_cached umem_tree; + /* Protects umem_tree */ + struct rw_semaphore umem_rwsem; + atomic_t notifier_count; + + struct mmu_notifier mn; + /* A list of umems that don't have private mmu notifier counters yet. 
*/ + struct list_head no_private_counters; + unsigned int odp_mrs_count; + + struct list_head ucontext_list; +}; + int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, unsigned long addr, size_t size); void ib_umem_odp_release(struct ib_umem_odp *umem_odp); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 2cf2cee5a753..6437e6af758d 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1488,25 +1488,6 @@ struct ib_rdmacg_object { #endif }; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -struct ib_ucontext_per_mm { - struct ib_ucontext *context; - - struct rb_root_cached umem_tree; - /* - * Protects .umem_rbroot and tree, as well as odp_mrs_count and - * mmu notifiers registration. - */ - struct rw_semaphore umem_rwsem; - - struct mmu_notifier mn; - atomic_t notifier_count; - /* A list of umems that don't have private mmu notifier counters yet. */ - struct list_head no_private_counters; - unsigned int odp_mrs_count; -}; -#endif - struct ib_ucontext { struct ib_device *device; struct ib_uverbs_file *ufile; @@ -1523,7 +1504,8 @@ struct ib_ucontext { #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); - struct ib_ucontext_per_mm per_mm; + struct mutex per_mm_list_lock; + struct list_head per_mm_list; #endif struct ib_rdmacg_object cg_obj; -- cgit v1.2.3 From ca748c39ea3f3c755295d64d69ba0b4375e34b5d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:09 +0300 Subject: RDMA/umem: Get rid of per_mm->notifier_count This is intrinsically racy and the scheme is simply unnecessary. New MR registration can wait for any ongoing invalidation to fully complete.

       CPU0                          CPU1
  if (atomic_read())
                                     if (atomic_dec_and_test() &&
                                         !list_empty())
  { /* not taken */ }
                                     list_add()

This leaves the new UMEM in a kind of purgatory until another invalidate rolls through. Instead hold the read side of the umem_rwsem across the paired start/end and get rid of the racy 'deferred add' approach. Since all umems in the rbt are always ready to go, also get rid of the mn_counters_active stuff. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 113 ++++++------------------------------- include/rdma/ib_umem_odp.h | 15 ----- 2 files changed, 18 insertions(+), 110 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 0577f9ff600f..1c0c4a431218 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -80,83 +80,29 @@ INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { mutex_lock(&umem_odp->umem_mutex); - - /* Only update private counters for this umem if it has them. - * Otherwise skip it. All page faults will be delayed for this umem. */ - if (umem_odp->mn_counters_active) { - int notifiers_count = umem_odp->notifiers_count++; - - if (notifiers_count == 0) - /* Initialize the completion object for waiting on - * notifiers. Since notifier_count is zero, no one - * should be waiting right now.
*/ - reinit_completion(&umem_odp->notifier_completion); - } + if (umem_odp->notifiers_count++ == 0) + /* + * Initialize the completion object for waiting on + * notifiers. Since notifier_count is zero, no one should be + * waiting right now. + */ + reinit_completion(&umem_odp->notifier_completion); mutex_unlock(&umem_odp->umem_mutex); } static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) { mutex_lock(&umem_odp->umem_mutex); - - /* Only update private counters for this umem if it has them. - * Otherwise skip it. All page faults will be delayed for this umem. */ - if (umem_odp->mn_counters_active) { - /* - * This sequence increase will notify the QP page fault that - * the page that is going to be mapped in the spte could have - * been freed. - */ - ++umem_odp->notifiers_seq; - if (--umem_odp->notifiers_count == 0) - complete_all(&umem_odp->notifier_completion); - } + /* + * This sequence increase will notify the QP page fault that the page + * that is going to be mapped in the spte could have been freed. + */ + ++umem_odp->notifiers_seq; + if (--umem_odp->notifiers_count == 0) + complete_all(&umem_odp->notifier_completion); mutex_unlock(&umem_odp->umem_mutex); } -/* Account for a new mmu notifier in an ib_ucontext. */ -static void -ib_ucontext_notifier_start_account(struct ib_ucontext_per_mm *per_mm) -{ - atomic_inc(&per_mm->notifier_count); -} - -/* Account for a terminating mmu notifier in an ib_ucontext. - * - * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since - * the function takes the semaphore itself. */ -static void ib_ucontext_notifier_end_account(struct ib_ucontext_per_mm *per_mm) -{ - int zero_notifiers = atomic_dec_and_test(&per_mm->notifier_count); - - if (zero_notifiers && - !list_empty(&per_mm->no_private_counters)) { - /* No currently running mmu notifiers. Now is the chance to - * add private accounting to all previously added umems. */ - struct ib_umem_odp *odp_data, *next; - - /* Prevent concurrent mmu notifiers from working on the - * no_private_counters list. */ - down_write(&per_mm->umem_rwsem); - - /* Read the notifier_count again, with the umem_rwsem - * semaphore taken for write. 
*/ - if (!atomic_read(&per_mm->notifier_count)) { - list_for_each_entry_safe(odp_data, next, - &per_mm->no_private_counters, - no_private_counters) { - mutex_lock(&odp_data->umem_mutex); - odp_data->mn_counters_active = true; - list_del(&odp_data->no_private_counters); - complete_all(&odp_data->notifier_completion); - mutex_unlock(&odp_data->umem_mutex); - } - } - - up_write(&per_mm->umem_rwsem); - } -} - static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { @@ -186,7 +132,6 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, if (!per_mm->context->invalidate_range) return; - ib_ucontext_notifier_start_account(per_mm); down_read(&per_mm->umem_rwsem); rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, @@ -231,14 +176,9 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - ib_ucontext_notifier_start_account(per_mm); - ret = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, - end, - invalidate_range_start_trampoline, - blockable, NULL); - up_read(&per_mm->umem_rwsem); - - return ret; + return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, + invalidate_range_start_trampoline, + blockable, NULL); } static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, @@ -259,17 +199,10 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, if (!per_mm->context->invalidate_range) return; - /* - * TODO: we currently bail out if there is any sleepable work to be done - * in ib_umem_notifier_invalidate_range_start so we shouldn't really block - * here. But this is ugly and fragile. - */ - down_read(&per_mm->umem_rwsem); rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_end_trampoline, true, NULL); up_read(&per_mm->umem_rwsem); - ib_ucontext_notifier_end_account(per_mm); } static const struct mmu_notifier_ops ib_umem_notifiers = { @@ -287,12 +220,6 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_insert(&umem_odp->interval_tree, &per_mm->umem_tree); - - if (likely(!atomic_read(&per_mm->notifier_count))) - umem_odp->mn_counters_active = true; - else - list_add(&umem_odp->no_private_counters, - &per_mm->no_private_counters); up_write(&per_mm->umem_rwsem); } @@ -305,10 +232,7 @@ static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_remove(&umem_odp->interval_tree, &per_mm->umem_tree); - if (!umem_odp->mn_counters_active) { - list_del(&umem_odp->no_private_counters); - complete_all(&umem_odp->notifier_completion); - } + complete_all(&umem_odp->notifier_completion); up_write(&per_mm->umem_rwsem); } @@ -327,7 +251,6 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, per_mm->mm = mm; per_mm->umem_tree = RB_ROOT_CACHED; init_rwsem(&per_mm->umem_rwsem); - INIT_LIST_HEAD(&per_mm->no_private_counters); rcu_read_lock(); per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 259eb08dfc9e..ce9502545903 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -67,15 +67,9 @@ struct ib_umem_odp { struct mutex umem_mutex; void *private; /* for the HW driver to use. */ - /* When false, use the notifier counter in the ucontext struct. 
*/ - bool mn_counters_active; int notifiers_seq; int notifiers_count; - /* A linked list of umems that don't have private mmu notifier * counters yet. */ - struct list_head no_private_counters; - /* Tree tracking */ struct umem_odp_node interval_tree; @@ -99,11 +93,8 @@ struct ib_ucontext_per_mm { struct rb_root_cached umem_tree; /* Protects umem_tree */ struct rw_semaphore umem_rwsem; - atomic_t notifier_count; struct mmu_notifier mn; - /* A list of umems that don't have private mmu notifier counters yet. */ - struct list_head no_private_counters; unsigned int odp_mrs_count; struct list_head ucontext_list; @@ -162,12 +153,6 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, * and the ucontext umem_mutex semaphore locked for read). */ - /* Do not allow page faults while the new ib_umem hasn't seen a state * with zero notifiers yet, and doesn't have its own valid set of * private counters. */ - if (!umem_odp->mn_counters_active) - return 1; - if (unlikely(umem_odp->notifiers_count)) return 1; if (umem_odp->notifiers_seq != mmu_seq) -- cgit v1.2.3 From be7a57b41ad824dbc59d1ffa91160ee73f2999ee Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:10 +0300 Subject: RDMA/umem: Handle a half-complete start/end sequence mmu_notifier_unregister() can race with an invalidate_start/end pair and cause the invalidate_end to be skipped. This causes an imbalance in the locking, which lockdep complains about. This is not actually a bug, as we immediately kfree the memory holding the lock, but it is simple enough to fix. Mark when the notifier is being destroyed and abort the start callback. This can be done under the lock we already obtained, and can re-purpose the invalidate_range test we already have. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 39 +++++++++++++++++++++++++------------- include/rdma/ib_umem_odp.h | 1 + 2 files changed, 27 insertions(+), 13 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 1c0c4a431218..d7b6422b9611 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -129,15 +129,11 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); - if (!per_mm->context->invalidate_range) - return; - down_read(&per_mm->umem_rwsem); - rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, - ULLONG_MAX, - ib_umem_notifier_release_trampoline, - true, - NULL); + if (per_mm->active) + rbt_ib_umem_for_each_in_range( + &per_mm->umem_tree, 0, ULLONG_MAX, + ib_umem_notifier_release_trampoline, true, NULL); up_read(&per_mm->umem_rwsem); } @@ -166,16 +162,22 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, { struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); - int ret; - - if (!per_mm->context->invalidate_range) - return 0; if (blockable) down_read(&per_mm->umem_rwsem); else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; + if (!per_mm->active) { + up_read(&per_mm->umem_rwsem); + /* + * At this point active is permanently false and visible to this + * CPU without a lock; that fact is relied on to skip the unlock + * in range_end.
+ */ + return 0; + } + return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_start_trampoline, blockable, NULL); @@ -196,7 +198,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); - if (!per_mm->context->invalidate_range) + if (unlikely(!per_mm->active)) return; rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, @@ -251,6 +253,7 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, per_mm->mm = mm; per_mm->umem_tree = RB_ROOT_CACHED; init_rwsem(&per_mm->umem_rwsem); + per_mm->active = ctx->invalidate_range; rcu_read_lock(); per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); @@ -321,6 +324,16 @@ void put_per_mm(struct ib_umem_odp *umem_odp) if (!need_free) return; + /* + * NOTE! mmu_notifier_unregister() can happen between a start/end + * callback pair, resulting in a start without a matching end, and + * thus an unbalanced lock. This doesn't really matter to us since we + * are about to kfree the memory that holds the lock, however LOCKDEP + * doesn't like this. + */ + down_write(&per_mm->umem_rwsem); + per_mm->active = false; + up_write(&per_mm->umem_rwsem); + mmu_notifier_unregister(&per_mm->mn, per_mm->mm); put_pid(per_mm->tgid); kfree(per_mm); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index ce9502545903..ec05c82ead7a 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -89,6 +89,7 @@ struct ib_ucontext_per_mm { struct ib_ucontext *context; struct mm_struct *mm; struct pid *tgid; + bool active; struct rb_root_cached umem_tree; /* Protects umem_tree */ -- cgit v1.2.3 From 56ac9dd9177ce451ac8176311915b29e8b5f0ac2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:11 +0300 Subject: RDMA/umem: Avoid synchronize_srcu in the ODP MR destruction path synchronize_srcu is slow enough that it should be avoided on the syscall path when user space is destroying MRs. After all the rework we can now trivially do this by having call_srcu kfree the per_mm.
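As an aside for readers unfamiliar with the pattern, a minimal sketch of SRCU-deferred freeing follows; the 'example_srcu' domain and 'struct example_obj' are hypothetical names for illustration, not part of this patch:

    #include <linux/srcu.h>
    #include <linux/slab.h>

    DEFINE_SRCU(example_srcu);		/* hypothetical SRCU domain */

    struct example_obj {
    	struct rcu_head rcu;
    	/* ... payload that readers access under srcu_read_lock() ... */
    };

    static void example_free_rcu(struct rcu_head *rcu)
    {
    	/* Runs only after all SRCU readers that could see the object drain */
    	kfree(container_of(rcu, struct example_obj, rcu));
    }

    static void example_destroy(struct example_obj *obj)
    {
    	/* Defer the kfree instead of blocking in synchronize_srcu() */
    	call_srcu(&example_srcu, &obj->rcu, example_free_rcu);
    }

The actual patch uses the mmu_notifier's own SRCU domain via mmu_notifier_call_srcu(), as the diff below shows.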
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 10 ++++++++-- include/rdma/ib_umem_odp.h | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index d7b6422b9611..2b4c5e7dd5a1 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -307,6 +307,11 @@ found: return 0; } +static void free_per_mm(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); +} + void put_per_mm(struct ib_umem_odp *umem_odp) { struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; @@ -334,9 +339,10 @@ void put_per_mm(struct ib_umem_odp *umem_odp) per_mm->active = false; up_write(&per_mm->umem_rwsem); - mmu_notifier_unregister(&per_mm->mn, per_mm->mm); + WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); + mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); put_pid(per_mm->tgid); - kfree(per_mm); + mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); } struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index ec05c82ead7a..0b1446fe2fab 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -99,6 +99,7 @@ struct ib_ucontext_per_mm { unsigned int odp_mrs_count; struct list_head ucontext_list; + struct rcu_head rcu; }; int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); -- cgit v1.2.3 From 2a3ccfdbeb6a5f832d7203e230799f1ffa46e0fc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:12 +0300 Subject: RDMA/uverbs: Get rid of ucontext->tgid Nothing uses this now, just delete it. 
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 1 - drivers/infiniband/core/uverbs_cmd.c | 4 ---- include/rdma/ib_verbs.h | 1 - 3 files changed, 6 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 6a3acf4bf78a..752a55c6bdce 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -816,7 +816,6 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, ib_dev->disassociate_ucontext(ucontext); } - put_pid(ucontext->tgid); ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index d77b0b9793c7..91d3e4029cd5 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -117,9 +117,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, /* ufile is required when some objects are released */ ucontext->ufile = file; - rcu_read_lock(); - ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); - rcu_read_unlock(); ucontext->closing = false; ucontext->cleanup_retryable = false; @@ -169,7 +166,6 @@ err_fd: put_unused_fd(resp.async_fd); err_free: - put_pid(ucontext->tgid); ib_dev->dealloc_ucontext(ucontext); err_alloc: diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6437e6af758d..0d822a9db300 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1500,7 +1500,6 @@ struct ib_ucontext { bool cleanup_retryable; - struct pid *tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); -- cgit v1.2.3 From 3312d1c6bdee6aa912c099c0ac0662d197c52842 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Fri, 21 Sep 2018 11:30:12 -0400 Subject: RDMA/umem: Minor optimizations Noticed while reviewing commit d4b4dd1b9706 ("RDMA/umem: Do not use current->tgid to track the mm_struct"). Why would we take a lock, adjust a protected variable, drop the lock, and *then* check the input into our protected variable adjustment? Then we have to take the lock again on our error unwind. Let's just check the input early and skip taking the locks needlessly if the input isn't valid. It was also noticed that we set mm = current->mm and then never modify mm, yet we still go back and reference current->mm a number of times needlessly. Be consistent in using the stored reference in mm.
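The reordering described above follows a generic validate-before-lock shape. A minimal sketch with illustrative names (not the actual ib_umem_get() code, which the diff below changes):

    static int example_account_pages(struct mm_struct *mm, unsigned long npages)
    {
    	/* Validate the input first: no lock held yet, so no unwind needed */
    	if (npages == 0 || npages > UINT_MAX)
    		return -EINVAL;

    	down_write(&mm->mmap_sem);	/* 4.19-era name of the mmap lock */
    	mm->pinned_vm += npages;	/* the protected adjustment */
    	up_write(&mm->mmap_sem);
    	return 0;
    }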
Signed-off-by: Doug Ledford Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index fec5d489e311..1886d7709911 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -152,6 +152,10 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem->hugetlb = 0; npages = ib_umem_num_pages(umem); + if (npages == 0 || npages > UINT_MAX) { + ret = -EINVAL; + goto out; + } lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -166,11 +170,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, cur_base = addr & PAGE_MASK; - if (npages == 0 || npages > UINT_MAX) { - ret = -EINVAL; - goto vma; - } - ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); if (ret) goto vma; @@ -224,9 +223,9 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem_release: __ib_umem_release(context->device, umem, 0); vma: - down_write(&current->mm->mmap_sem); - current->mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(&current->mm->mmap_sem); + down_write(&mm->mmap_sem); + mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&mm->mmap_sem); out: if (vma_list) free_page((unsigned long) vma_list); -- cgit v1.2.3 From c6ce580716372d71cd119bacf73f14a62e9af2ea Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Fri, 21 Sep 2018 11:30:13 -0400 Subject: RDMA/umem: Fix potential addition overflow Given a large enough memory allocation, it is possible to wrap the pinned_vm counter. Check for addition overflow to prevent such eventualities. Fixes: 40ddacf2dda9 ("RDMA/umem: Don't hold mmap_sem for too long") Reported-by: Jason Gunthorpe Signed-off-by: Doug Ledford Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 1886d7709911..8da1cf29a69f 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -85,6 +85,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct page **page_list; struct vm_area_struct **vma_list; unsigned long lock_limit; + unsigned long new_pinned; unsigned long cur_base; struct mm_struct *mm; unsigned long npages; @@ -160,12 +161,13 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; down_write(&mm->mmap_sem); - mm->pinned_vm += npages; - if ((mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) { + if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) || + (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) { up_write(&mm->mmap_sem); ret = -ENOMEM; - goto vma; + goto out; } + mm->pinned_vm = new_pinned; up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; -- cgit v1.2.3 From e349f858d29f300ad9ad327fd57735a1d15e147f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Sep 2018 16:58:09 -0600 Subject: RDMA: Fully setup the device name in ib_register_device The current code has two copies of the device name, ibdev->name and dev_name(&ibdev->dev), and they are set up at different times, which is very confusing.
Set them both up at the same time and make dev_name() the lead name, which is the proper use of the driver core APIs. To make it very clear that the name is not valid until registration, pass it in to the ib_register_device() call rather than messing with ibdev->name directly. The reorganization also checks that dev_name is unique even if it does not contain a %. Signed-off-by: Jason Gunthorpe Acked-by: Adit Ranadive Reviewed-by: Steve Wise Acked-by: Devesh Sharma Reviewed-by: Shiraz Saleem Reviewed-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Reviewed-by: Michael J. Ruhl --- drivers/infiniband/core/device.c | 35 +++++++++++++++++++----------- drivers/infiniband/core/sysfs.c | 4 --- drivers/infiniband/hw/bnxt_re/main.c | 3 +-- drivers/infiniband/hw/cxgb3/iwch_provider.c | 3 +-- drivers/infiniband/hw/cxgb4/provider.c | 3 +-- drivers/infiniband/hw/hns/hns_roce_main.c | 3 +-- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 3 +-- drivers/infiniband/hw/mlx4/main.c | 3 +-- drivers/infiniband/hw/mlx5/main.c | 15 ++++++----- drivers/infiniband/hw/mthca/mthca_provider.c | 3 +-- drivers/infiniband/hw/nes/nes_verbs.c | 3 +-- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 3 +-- drivers/infiniband/hw/qedr/main.c | 4 +-- drivers/infiniband/hw/usnic/usnic_ib_main.c | 3 +-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 3 +-- drivers/infiniband/sw/rdmavt/vt.c | 3 ++- drivers/infiniband/sw/rxe/rxe_verbs.c | 3 +-- include/rdma/ib_verbs.h | 6 ++--- include/rdma/rdma_vt.h | 9 ++++++- 19 files changed, 53 insertions(+), 59 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 5a680a88aa87..faacf95699d7 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -170,10 +170,9 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } -static int alloc_name(char *name) +static int alloc_name(struct ib_device *ibdev, const char *name) { unsigned long *inuse; - char buf[IB_DEVICE_NAME_MAX]; struct ib_device *device; int i; @@ -182,24 +181,21 @@ static int alloc_name(char *name) return -ENOMEM; list_for_each_entry(device, &device_list, core_list) { - if (!sscanf(device->name, name, &i)) + char buf[IB_DEVICE_NAME_MAX]; + + if (sscanf(device->name, name, &i) != 1) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; snprintf(buf, sizeof buf, name, i); - if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) + if (!strcmp(buf, dev_name(&device->dev))) set_bit(i, inuse); } i = find_first_zero_bit(inuse, PAGE_SIZE * 8); free_page((unsigned long) inuse); - snprintf(buf, sizeof buf, name, i); - - if (__ib_device_get_by_name(buf)) - return -ENFILE; - strlcpy(name, buf, IB_DEVICE_NAME_MAX); - return 0; + return dev_set_name(&ibdev->dev, name, i); } static void ib_device_release(struct device *device) @@ -454,9 +450,9 @@ static u32 __dev_new_index(void) * callback for each device that is added. @device must be allocated * with ib_alloc_device().
*/ -int ib_register_device(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +int ib_register_device(struct ib_device *device, const char *name, + int (*port_callback)(struct ib_device *, u8, + struct kobject *)) { int ret; struct ib_client *client; @@ -495,11 +491,20 @@ int ib_register_device(struct ib_device *device, mutex_lock(&device_mutex); - if (strchr(device->name, '%')) { - ret = alloc_name(device->name); + if (strchr(name, '%')) { + ret = alloc_name(device, name); + if (ret) + goto out; + } else { + ret = dev_set_name(&device->dev, name); if (ret) goto out; } + if (__ib_device_get_by_name(dev_name(&device->dev))) { + ret = -ENFILE; + goto out; + } + strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); if (ib_device_check_mandatory(device)) { ret = -EINVAL; diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 0b04dbff884f..bc947a863b34 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1311,10 +1311,6 @@ int ib_device_register_sysfs(struct ib_device *device, int ret; int i; - ret = dev_set_name(class_dev, "%s", device->name); - if (ret) - return ret; - device->groups[0] = &dev_attr_group; class_dev->groups = device->groups; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 20b9f31052bf..73632e5b819f 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -579,7 +579,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) /* ib device init */ ibdev->owner = THIS_MODULE; ibdev->node_type = RDMA_NODE_IB_CA; - strlcpy(ibdev->name, "bnxt_re%d", IB_DEVICE_NAME_MAX); strlcpy(ibdev->node_desc, BNXT_RE_DESC " HCA", strlen(BNXT_RE_DESC) + 5); ibdev->phys_port_cnt = 1; @@ -672,7 +671,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->alloc_hw_stats = bnxt_re_ib_alloc_hw_stats; ibdev->driver_id = RDMA_DRIVER_BNXT_RE; - return ib_register_device(ibdev, NULL); + return ib_register_device(ibdev, "bnxt_re%d", NULL); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 1b9ff21aa1d5..39530cc15f95 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1319,7 +1319,6 @@ int iwch_register_device(struct iwch_dev *dev) int i; pr_debug("%s iwch_dev %p\n", __func__, dev); - strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); dev->ibdev.owner = THIS_MODULE; @@ -1402,7 +1401,7 @@ int iwch_register_device(struct iwch_dev *dev) sizeof(dev->ibdev.iwcm->ifname)); dev->ibdev.driver_id = RDMA_DRIVER_CXGB3; - ret = ib_register_device(&dev->ibdev, NULL); + ret = ib_register_device(&dev->ibdev, "cxgb3_%d", NULL); if (ret) goto bail1; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 4eda6872e617..416f8d1af610 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -535,7 +535,6 @@ void c4iw_register_device(struct work_struct *work) struct c4iw_dev *dev = ctx->dev; pr_debug("c4iw_dev %p\n", dev); - strlcpy(dev->ibdev.name, "cxgb4_%d", IB_DEVICE_NAME_MAX); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, 
dev->rdev.lldi.ports[0]->dev_addr, 6); dev->ibdev.owner = THIS_MODULE; @@ -627,7 +626,7 @@ void c4iw_register_device(struct work_struct *work) sizeof(dev->ibdev.iwcm->ifname)); dev->ibdev.driver_id = RDMA_DRIVER_CXGB4; - ret = ib_register_device(&dev->ibdev, NULL); + ret = ib_register_device(&dev->ibdev, "cxgb4_%d", NULL); if (ret) goto err_kfree_iwcm; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 6edb547baee8..5a86a48cba13 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -449,7 +449,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) spin_lock_init(&iboe->lock); ib_dev = &hr_dev->ib_dev; - strlcpy(ib_dev->name, "hns_%d", IB_DEVICE_NAME_MAX); ib_dev->owner = THIS_MODULE; ib_dev->node_type = RDMA_NODE_IB_CA; @@ -530,7 +529,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_dev->disassociate_ucontext = hns_roce_disassociate_ucontext; ib_dev->driver_id = RDMA_DRIVER_HNS; - ret = ib_register_device(ib_dev, NULL); + ret = ib_register_device(ib_dev, "hns_%d", NULL); if (ret) { dev_err(dev, "ib_register_device failed!\n"); return ret; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index e2e6c74a7452..cb2aef874ca8 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2752,7 +2752,6 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev i40iw_pr_err("iwdev == NULL\n"); return NULL; } - strlcpy(iwibdev->ibdev.name, "i40iw%d", IB_DEVICE_NAME_MAX); iwibdev->ibdev.owner = THIS_MODULE; iwdev->iwibdev = iwibdev; iwibdev->iwdev = iwdev; @@ -2897,7 +2896,7 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev) iwibdev = iwdev->iwibdev; iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW; - ret = ib_register_device(&iwibdev->ibdev, NULL); + ret = ib_register_device(&iwibdev->ibdev, "i40iw%d", NULL); if (ret) goto error; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index bf3cdb88aaf5..fa5d20eccc21 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2540,7 +2540,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->dev = dev; ibdev->bond_next_port = 0; - strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; @@ -2803,7 +2802,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_steer_free_bitmap; ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4; - if (ib_register_device(&ibdev->ib_dev, NULL)) + if (ib_register_device(&ibdev->ib_dev, "mlx4_%d", NULL)) goto err_diag_counters; if (mlx4_ib_mad_init(ibdev)) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index fb1e3c546826..597cd3c171c9 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5671,7 +5671,6 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; - const char *name; int err; int i; @@ -5704,12 +5703,6 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); - if (!mlx5_lag_is_active(mdev)) - name = "mlx5_%d"; - else - name = "mlx5_bond_%d"; - - strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; 
dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; @@ -6122,7 +6115,13 @@ static int mlx5_ib_stage_populate_specs(struct mlx5_ib_dev *dev) int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) { - return ib_register_device(&dev->ib_dev, NULL); + const char *name; + + if (!mlx5_lag_is_active(dev->mdev)) + name = "mlx5_%d"; + else + name = "mlx5_bond_%d"; + return ib_register_device(&dev->ib_dev, name, NULL); } void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 0d3473b4596e..7bd7e2ad17e4 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1198,7 +1198,6 @@ int mthca_register_device(struct mthca_dev *dev) if (ret) return ret; - strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION; @@ -1297,7 +1296,7 @@ int mthca_register_device(struct mthca_dev *dev) mutex_init(&dev->cap_mask_mutex); dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA; - ret = ib_register_device(&dev->ib_dev, NULL); + ret = ib_register_device(&dev->ib_dev, "mthca%d", NULL); if (ret) return ret; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 6940c7215961..2127cd2f4bec 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3640,7 +3640,6 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) if (nesibdev == NULL) { return NULL; } - strlcpy(nesibdev->ibdev.name, "nes%d", IB_DEVICE_NAME_MAX); nesibdev->ibdev.owner = THIS_MODULE; nesibdev->ibdev.node_type = RDMA_NODE_RNIC; @@ -3798,7 +3797,7 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) int i, ret; nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES; - ret = ib_register_device(&nesvnic->nesibdev->ibdev, NULL); + ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d", NULL); if (ret) { return ret; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 7832ee3e0c84..4d3c27613351 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -116,7 +116,6 @@ static void get_dev_fw_str(struct ib_device *device, char *str) static int ocrdma_register_device(struct ocrdma_dev *dev) { - strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX); ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid); BUILD_BUG_ON(sizeof(OCRDMA_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX); memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC, @@ -214,7 +213,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.post_srq_recv = ocrdma_post_srq_recv; } dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA; - return ib_register_device(&dev->ibdev, NULL); + return ib_register_device(&dev->ibdev, "ocrdma%d", NULL); } static int ocrdma_alloc_resources(struct ocrdma_dev *dev) diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index a0af6d424aed..cd7b8b39a129 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -170,8 +170,6 @@ static int qedr_register_device(struct qedr_dev *dev) { int rc; - strlcpy(dev->ibdev.name, "qedr%d", IB_DEVICE_NAME_MAX); - dev->ibdev.node_guid = dev->attr.node_guid; memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC)); dev->ibdev.owner = THIS_MODULE; @@ -264,7 +262,7 @@ 
static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.get_dev_fw_str = qedr_get_dev_fw_str; dev->ibdev.driver_id = RDMA_DRIVER_QEDR; - return ib_register_device(&dev->ibdev, NULL); + return ib_register_device(&dev->ibdev, "qedr%d", NULL); } /* This function allocates fast-path status block memory */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index f0538a460328..3b9f12928314 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -364,7 +364,6 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; us_ibdev->ib_dev.dev.parent = &dev->dev; us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION; - strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX); us_ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | @@ -416,7 +415,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC; - if (ib_register_device(&us_ibdev->ib_dev, NULL)) + if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d", NULL)) goto err_fwd_dealloc; usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index a5719899f49a..6878107fc637 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -162,7 +162,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) int ret = -1; int i = 0; - strlcpy(dev->ib_dev.name, "vmw_pvrdma%d", IB_DEVICE_NAME_MAX); dev->ib_dev.node_guid = dev->dsr->caps.node_guid; dev->sys_image_guid = dev->dsr->caps.sys_image_guid; dev->flags = 0; @@ -267,7 +266,7 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA; spin_lock_init(&dev->srq_tbl_lock); - ret = ib_register_device(&dev->ib_dev, NULL); + ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d", NULL); if (ret) goto err_srq_free; diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 17e4abc067af..e3249d46bcef 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -828,7 +828,8 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ - ret = ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback); + ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev), + rdi->driver_f.port_callback); if (ret) { rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); goto bail_mr; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f5b1e0ad6142..e4da5b671e4a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1159,7 +1159,6 @@ int rxe_register_device(struct rxe_dev *rxe) struct ib_device *dev = &rxe->ib_dev; struct crypto_shash *tfm; - strlcpy(dev->name, "rxe%d", IB_DEVICE_NAME_MAX); strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); dev->owner = THIS_MODULE; @@ -1261,7 +1260,7 @@ int rxe_register_device(struct rxe_dev *rxe) rxe->tfm = tfm; dev->driver_id = RDMA_DRIVER_RXE; - err = ib_register_device(dev, NULL); + err = ib_register_device(dev, "rxe%d", NULL); if (err) { pr_warn("%s failed with error %d\n", __func__, err); goto err1; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 
0d822a9db300..9897d2329f2c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2625,9 +2625,9 @@ void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); -int ib_register_device(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)); +int ib_register_device(struct ib_device *device, const char *name, + int (*port_callback)(struct ib_device *, u8, + struct kobject *)); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index e32facdd9fd3..065c9fbe6589 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -429,7 +429,14 @@ static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi, const char *fmt, const char *name, const int unit) { - snprintf(rdi->ibdev.name, sizeof(rdi->ibdev.name), fmt, name, unit); + /* + * FIXME: rvt and its users want to touch the ibdev before + * registration and have things like the name work. We don't have the + * infrastructure in the core to support this directly today, hack it + * to work by setting the name manually here. + */ + dev_set_name(&rdi->ibdev.dev, fmt, name, unit); + strlcpy(rdi->ibdev.name, dev_name(&rdi->ibdev.dev), IB_DEVICE_NAME_MAX); } /** -- cgit v1.2.3 From 43c7c851b9bce9e6091f2c882871a3b388aa38c3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Sep 2018 16:42:23 -0600 Subject: RDMA/core: Use dev_err/dbg/etc instead of pr_* + ibdev->name Any messages related to a device should be printed with the dev_* formatters. This provides greater consistency for the user. The core does not set pr_fmt so this has no significant change. Signed-off-by: Jason Gunthorpe Reviewed-by: Steve Wise Reviewed-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro --- drivers/infiniband/core/cache.c | 46 ++++++++++++++++++-------------------- drivers/infiniband/core/cma.c | 11 ++++----- drivers/infiniband/core/device.c | 33 +++++++++++++++------------ drivers/infiniband/core/fmr_pool.c | 2 +- drivers/infiniband/core/restrack.c | 3 +-- drivers/infiniband/core/verbs.c | 10 +++++---- 6 files changed, 55 insertions(+), 50 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 8957d31d60ca..ebc64418d809 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -212,9 +212,8 @@ static void free_gid_entry_locked(struct ib_gid_table_entry *entry) u8 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - device->name, port_num, entry->attr.index, - entry->attr.gid.raw); + dev_dbg(&device->dev, "%s port=%d index=%d gid %pI6\n", __func__, + port_num, entry->attr.index, entry->attr.gid.raw); if (rdma_cap_roce_gid_table(device, port_num) && entry->state != GID_TABLE_ENTRY_INVALID) @@ -289,9 +288,9 @@ static void store_gid_entry(struct ib_gid_table *table, { entry->state = GID_TABLE_ENTRY_VALID; - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - entry->attr.device->name, entry->attr.port_num, - entry->attr.index, entry->attr.gid.raw); + dev_dbg(&entry->attr.device->dev, "%s port=%d index=%d gid %pI6\n", + __func__, entry->attr.port_num, entry->attr.index, + entry->attr.gid.raw); lockdep_assert_held(&table->lock); write_lock_irq(&table->rwlock); @@ -320,17 +319,16 @@ static int add_roce_gid(struct 
ib_gid_table_entry *entry) int ret; if (!attr->ndev) { - pr_err("%s NULL netdev device=%s port=%d index=%d\n", - __func__, attr->device->name, attr->port_num, - attr->index); + dev_err(&attr->device->dev, "%s NULL netdev port=%d index=%d\n", + __func__, attr->port_num, attr->index); return -EINVAL; } if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { ret = attr->device->add_gid(attr, &entry->context); if (ret) { - pr_err("%s GID add failed device=%s port=%d index=%d\n", - __func__, attr->device->name, attr->port_num, - attr->index); + dev_err(&attr->device->dev, + "%s GID add failed port=%d index=%d\n", + __func__, attr->port_num, attr->index); return ret; } } @@ -402,9 +400,8 @@ static void del_gid(struct ib_device *ib_dev, u8 port, lockdep_assert_held(&table->lock); - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - ib_dev->name, port, ix, - table->data_vec[ix]->attr.gid.raw); + dev_dbg(&ib_dev->dev, "%s port=%d index=%d gid %pI6\n", __func__, port, + ix, table->data_vec[ix]->attr.gid.raw); write_lock_irq(&table->rwlock); entry = table->data_vec[ix]; @@ -782,9 +779,9 @@ static void release_gid_table(struct ib_device *device, u8 port, if (is_gid_entry_free(table->data_vec[i])) continue; if (kref_read(&table->data_vec[i]->kref) > 1) { - pr_err("GID entry ref leak for %s (index %d) ref=%d\n", - device->name, i, - kref_read(&table->data_vec[i]->kref)); + dev_err(&device->dev, + "GID entry ref leak for index %d ref=%d\n", i, + kref_read(&table->data_vec[i]->kref)); leak = true; } } @@ -1303,8 +1300,9 @@ static int config_non_roce_gid_cache(struct ib_device *device, continue; ret = device->query_gid(device, port, i, &gid_attr.gid); if (ret) { - pr_warn("query_gid failed (%d) for %s (index %d)\n", - ret, device->name, i); + dev_warn(&device->dev, + "query_gid failed (%d) for index %d\n", ret, + i); goto err; } gid_attr.index = i; @@ -1333,8 +1331,7 @@ static void ib_cache_update(struct ib_device *device, ret = ib_query_port(device, port, tprops); if (ret) { - pr_warn("ib_query_port failed (%d) for %s\n", - ret, device->name); + dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret); goto err; } @@ -1356,8 +1353,9 @@ static void ib_cache_update(struct ib_device *device, for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); if (ret) { - pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n", - ret, device->name, i); + dev_warn(&device->dev, + "ib_query_pkey failed (%d) for index %d\n", + ret, i); goto err; } } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index a57c8b823302..c650223c52bf 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2352,8 +2352,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, ret = rdma_listen(id, id_priv->backlog); if (ret) - pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n", - ret, cma_dev->device->name); + dev_warn(&cma_dev->device->dev, + "RDMA CMA: cma_listen_on_dev, error %d\n", ret); } static void cma_listen_on_all(struct rdma_id_private *id_priv) @@ -4082,9 +4082,10 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, (!ib_sa_sendonly_fullmem_support(&sa_client, id_priv->id.device, id_priv->id.port_num))) { - pr_warn("RDMA CM: %s port %u Unable to multicast join\n" - "RDMA CM: SM doesn't support Send Only Full Member option\n", - id_priv->id.device->name, id_priv->id.port_num); + dev_warn( + &id_priv->id.device->dev, + "RDMA CM: port %u Unable to multicast join: SM 
doesn't support Send Only Full Member option\n", + id_priv->id.port_num); return -EOPNOTSUPP; } diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index faacf95699d7..7c3ff43092fd 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -123,8 +123,9 @@ static int ib_device_check_mandatory(struct ib_device *device) for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) device + mandatory_table[i].offset)) { - pr_warn("Device %s is missing mandatory function %s\n", - device->name, mandatory_table[i].name); + dev_warn(&device->dev, + "Device is missing mandatory function %s\n", + mandatory_table[i].name); return -EINVAL; } } @@ -513,20 +514,21 @@ int ib_register_device(struct ib_device *device, const char *name, ret = read_port_immutable(device); if (ret) { - pr_warn("Couldn't create per port immutable data %s\n", - device->name); + dev_warn(&device->dev, + "Couldn't create per port immutable data\n"); goto out; } ret = setup_port_pkey_list(device); if (ret) { - pr_warn("Couldn't create per port_pkey_list\n"); + dev_warn(&device->dev, "Couldn't create per port_pkey_list\n"); goto out; } ret = ib_cache_setup_one(device); if (ret) { - pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n"); + dev_warn(&device->dev, + "Couldn't set up InfiniBand P_Key/GID cache\n"); goto port_cleanup; } @@ -534,21 +536,23 @@ int ib_register_device(struct ib_device *device, const char *name, ret = ib_device_register_rdmacg(device); if (ret) { - pr_warn("Couldn't register device with rdma cgroup\n"); + dev_warn(&device->dev, + "Couldn't register device with rdma cgroup\n"); goto cache_cleanup; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->query_device(device, &device->attrs, &uhw); if (ret) { - pr_warn("Couldn't query the device attributes\n"); + dev_warn(&device->dev, + "Couldn't query the device attributes\n"); goto cg_cleanup; } ret = ib_device_register_sysfs(device, port_callback); if (ret) { - pr_warn("Couldn't register device %s with driver model\n", - device->name); + dev_warn(&device->dev, + "Couldn't register device with driver model\n"); goto cg_cleanup; } @@ -699,8 +703,9 @@ void ib_unregister_client(struct ib_client *client) found_context->data : NULL); if (!found_context) { - pr_warn("No client context found for %s/%s\n", - device->name, client->name); + dev_warn(&device->dev, + "No client context found for %s\n", + client->name); continue; } @@ -764,8 +769,8 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client, goto out; } - pr_warn("No client context found for %s/%s\n", - device->name, client->name); + dev_warn(&device->dev, "No client context found for %s\n", + client->name); out: write_unlock_irqrestore(&device->client_data_lock, flags); diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index a077500f7f32..d1f2eee7d5da 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -213,7 +213,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, device = pd->device; if (!device->alloc_fmr || !device->dealloc_fmr || !device->map_phys_fmr || !device->unmap_fmr) { - pr_info(PFX "Device %s does not support FMRs\n", device->name); + dev_info(&device->dev, "Device does not support FMRs\n"); return ERR_PTR(-ENOSYS); } diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 3b7fa0ccaa08..bcc693fffd4c 100644 --- a/drivers/infiniband/core/restrack.c +++ 
b/drivers/infiniband/core/restrack.c @@ -50,8 +50,7 @@ void rdma_restrack_clean(struct rdma_restrack_root *res) dev = container_of(res, struct ib_device, res); pr_err("restrack: %s", CUT_HERE); - pr_err("restrack: BUG: RESTRACK detected leak of resources on %s\n", - dev->name); + dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); hash_for_each(res->hash, bkt, e, node) { if (rdma_is_kernel_res(e)) { owner = e->kern_name; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index c36be384fe34..ee5fc8408add 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1629,14 +1629,16 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, if (rdma_ib_or_roce(qp->device, port)) { if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { - pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n", - __func__, qp->device->name); + dev_warn(&qp->device->dev, + "%s rq_psn overflow, masking to 24 bits\n", + __func__); attr->rq_psn &= 0xffffff; } if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) { - pr_warn("%s: %s sq_psn overflow, masking to 24 bits\n", - __func__, qp->device->name); + dev_warn(&qp->device->dev, + " %s sq_psn overflow, masking to 24 bits\n", + __func__); attr->sq_psn &= 0xffffff; } } -- cgit v1.2.3 From 896de0090a85f4c3a2b37fc0f46215a73c5b5429 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Sep 2018 16:42:25 -0600 Subject: RDMA/core: Use dev_name instead of ibdev->name These return the same thing but dev_name is a more conventional use of the kernel API. Signed-off-by: Jason Gunthorpe Reviewed-by: Steve Wise Reviewed-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro --- drivers/infiniband/core/cm.c | 2 +- drivers/infiniband/core/cma_configfs.c | 2 +- drivers/infiniband/core/device.c | 8 +++----- drivers/infiniband/core/fmr_pool.c | 3 ++- drivers/infiniband/core/iwcm.c | 2 +- drivers/infiniband/core/nldev.c | 3 ++- drivers/infiniband/core/sa_query.c | 2 +- drivers/infiniband/core/security.c | 7 +++---- drivers/infiniband/core/user_mad.c | 2 +- drivers/infiniband/core/uverbs_main.c | 2 +- 10 files changed, 16 insertions(+), 17 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 6e39c27dca8e..a6a20603ccea 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -4367,7 +4367,7 @@ static void cm_add_one(struct ib_device *ib_device) cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, - "%s", ib_device->name); + "%s", dev_name(&ib_device->dev)); if (IS_ERR(cm_dev->device)) { kfree(cm_dev); return; diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index eee38b40be99..8c2dfb3e294e 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -65,7 +65,7 @@ static struct cma_dev_port_group *to_dev_port_group(struct config_item *item) static bool filter_by_name(struct ib_device *ib_dev, void *cookie) { - return !strcmp(ib_dev->name, cookie); + return !strcmp(dev_name(&ib_dev->dev), cookie); } static int cma_configfs_params_get(struct config_item *item, diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 7c3ff43092fd..d105b9b2d118 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -165,7 +165,7 @@ static struct ib_device *__ib_device_get_by_name(const char *name) struct ib_device *device; 
list_for_each_entry(device, &device_list, core_list) - if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) + if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; @@ -184,7 +184,7 @@ static int alloc_name(struct ib_device *ibdev, const char *name) list_for_each_entry(device, &device_list, core_list) { char buf[IB_DEVICE_NAME_MAX]; - if (sscanf(device->name, name, &i) != 1) + if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; @@ -219,9 +219,7 @@ static void ib_device_release(struct device *device) static int ib_device_uevent(struct device *device, struct kobj_uevent_env *env) { - struct ib_device *dev = container_of(device, struct ib_device, dev); - - if (add_uevent_var(env, "NAME=%s", dev->name)) + if (add_uevent_var(env, "NAME=%s", dev_name(device))) return -ENOMEM; /* diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index d1f2eee7d5da..83ba0068e8bb 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -257,7 +257,8 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, atomic_set(&pool->flush_ser, 0); init_waitqueue_head(&pool->force_wait); - pool->worker = kthread_create_worker(0, "ib_fmr(%s)", device->name); + pool->worker = + kthread_create_worker(0, "ib_fmr(%s)", dev_name(&device->dev)); if (IS_ERR(pool->worker)) { pr_warn(PFX "couldn't start cleanup kthread worker\n"); ret = PTR_ERR(pool->worker); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 5d676cff41f4..ba668d49c751 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -509,7 +509,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->m_local_addr = cm_id->local_addr; cm_id->m_remote_addr = cm_id->remote_addr; - memcpy(pm_reg_msg.dev_name, cm_id->device->name, + memcpy(pm_reg_msg.dev_name, dev_name(&cm_id->device->dev), sizeof(pm_reg_msg.dev_name)); memcpy(pm_reg_msg.if_name, cm_id->device->iwcm->ifname, sizeof(pm_reg_msg.if_name)); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 0385ab438320..ba5403fbcd88 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -179,7 +179,8 @@ static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; - if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, + dev_name(&device->dev))) return -EMSGSIZE; return 0; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index d3d6275b3b7e..19e1833e13fc 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -761,7 +761,7 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, /* Construct the family header first */ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); - memcpy(header->device_name, query->port->agent->device->name, + memcpy(header->device_name, dev_name(&query->port->agent->device->dev), LS_DEVICE_NAME_MAX); header->port_num = query->port->port_num; diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 9b0bea8303e0..1143c0448666 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -685,9 +685,8 @@ static int ib_mad_agent_security_change(struct notifier_block *nb, if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; - ag->smp_allowed = 
!security_ib_endport_manage_subnet(ag->security, - ag->device->name, - ag->port_num); + ag->smp_allowed = !security_ib_endport_manage_subnet( + ag->security, dev_name(&ag->device->dev), ag->port_num); return NOTIFY_OK; } @@ -708,7 +707,7 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, return 0; ret = security_ib_endport_manage_subnet(agent->security, - agent->device->name, + dev_name(&agent->device->dev), agent->port_num); if (ret) return ret; diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index c34a6852d691..9961859da06a 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -1132,7 +1132,7 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, if (!port) return -ENODEV; - return sprintf(buf, "%s\n", port->ib_dev->name); + return sprintf(buf, "%s\n", dev_name(&port->ib_dev->dev)); } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 8d56773aac56..12d8f8097574 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1179,7 +1179,7 @@ static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) - ret = sprintf(buf, "%s\n", ib_dev->name); + ret = sprintf(buf, "%s\n", dev_name(&ib_dev->dev)); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; -- cgit v1.2.3 From c8b53d0c5eb89a5831b7a25f4bd5e742a85c293b Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Wed, 26 Sep 2018 10:02:32 -0700 Subject: IB/sa: simplify return code logic for ib_nl_send_msg() rdma_nl_multicast() returns either a negative error code or zero on success. Remove the unnecessary ret code checks and reassignments.
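A minimal sketch of the convention being relied on, with hypothetical helper names (the real functions are in the sa_query.c diff below):

    /* The callee already returns 0 or -errno, so just propagate it ... */
    static int example_send(struct sk_buff *skb, gfp_t gfp_mask)
    {
    	return rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask);
    }

    /* ... and the caller only needs a zero/nonzero test, no remapping */
    static int example_request(struct sk_buff *skb, gfp_t gfp_mask)
    {
    	int ret = example_send(skb, gfp_mask);

    	if (ret)
    		return -EIO;	/* single failure path */
    	return 0;
    }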
Reviewed-by: Kaike Wan Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/sa_query.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 19e1833e13fc..a5e76d432d3f 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -835,7 +835,6 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) struct sk_buff *skb = NULL; struct nlmsghdr *nlh; void *data; - int ret = 0; struct ib_sa_mad *mad; int len; @@ -862,13 +861,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); - if (!ret) - ret = len; - else - ret = 0; - - return ret; + return rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); } static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) @@ -891,14 +884,12 @@ static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) spin_unlock_irqrestore(&ib_nl_request_lock, flags); ret = ib_nl_send_msg(query, gfp_mask); - if (ret <= 0) { + if (ret) { ret = -EIO; /* Remove the request */ spin_lock_irqsave(&ib_nl_request_lock, flags); list_del(&query->list); spin_unlock_irqrestore(&ib_nl_request_lock, flags); - } else { - ret = 0; } return ret; -- cgit v1.2.3 From 3994586f4d7a1e8eb2a152405d0a1c9c8b947c4c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 25 Sep 2018 12:04:04 +0300 Subject: RDMA/core: Acquire and release mmap_sem on page range Currently mmap_sem is read locked while pinning the memory. In a multi-threaded process, holding the mmap_sem lock creates contention with other threads that might be registering memory, creating QPs, or simply doing mmap(), as such operations also require the mmap_sem write lock. None of these operations can make forward progress until the memory pin operation completes. It becomes even worse when the memory being unpinned and/or registered is large (in the GB range). Therefore, instead of holding mmap_sem for too long (for whole region pinning), acquire and release the lock for every few pages. For example, on x86 with a 4K page size, acquire and release mmap_sem for every 2MB memory chunk. This allows other competing threads, which may need mmap_sem for only a short duration, to make progress. When memory registration latency is measured using [1] for memory sizes ranging from 4K to 48GB, a degradation of at most 0.5% to 1% is noticed. In many runs no difference is seen other than run-to-run variance. In other targeted tests with users of large memory regions, the desired improvements are seen due to reduced contention on mmap_sem. [1] https://github.com/paravmellanox/rtool $ rdma_resource_lat -c 1 -s 48G -a -u L -i 500 -A This registers pinned memory of sizes from 4K to 48GB, with 500 iterations for each size. $ rdma_resource_lat -c 1 -s 12G -a -u L -i 500 -t 4 Four competing threads pin memory, each of 12GB, with 500 iterations.
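A minimal sketch of the chunked locking, using illustrative names and the 4.19-era get_user_pages_longterm() API (the real change to ib_umem_get() is in the diff below):

    #define EXAMPLE_BATCH	(PAGE_SIZE / sizeof(struct page *)) /* 512 pages (2MB) on x86 */

    static long example_pin(struct mm_struct *mm, unsigned long cur_base,
    			unsigned long npages, struct page **page_list)
    {
    	long ret;

    	while (npages) {
    		/* The lock covers only this batch, so writers can interleave */
    		down_read(&mm->mmap_sem);
    		ret = get_user_pages_longterm(cur_base,
    					      min_t(unsigned long, npages,
    						    EXAMPLE_BATCH),
    					      FOLL_WRITE, page_list, NULL);
    		up_read(&mm->mmap_sem);
    		if (ret < 0)
    			return ret;

    		cur_base += ret * PAGE_SIZE;
    		npages -= ret;
    	}
    	return 0;
    }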
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 8da1cf29a69f..c6144df47ea4 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -181,8 +181,8 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, sg_list_start = umem->sg_head.sgl; - down_read(&mm->mmap_sem); while (npages) { + down_read(&mm->mmap_sem); ret = get_user_pages_longterm(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), @@ -196,17 +196,20 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, cur_base += ret * PAGE_SIZE; npages -= ret; + /* Continue to hold the mmap_sem as vma_list access + * needs to be protected. + */ for_each_sg(sg_list_start, sg, ret, i) { if (vma_list && !is_vm_hugetlb_page(vma_list[i])) umem->hugetlb = 0; sg_set_page(sg, page_list[i], PAGE_SIZE, 0); } + up_read(&mm->mmap_sem); /* preparing for next loop */ sg_list_start = sg; } - up_read(&mm->mmap_sem); umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, -- cgit v1.2.3 From 78fb282b150c36269fcecf5d08e6de7117e9f4ab Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sat, 15 Sep 2018 12:07:55 +0300 Subject: RDMA/cma: Allow accepting requests for multi port rdma device When IP failover is used between multiple ports of a given rdma device, allow accepting CM requests from either of the ports. This is applicable to the IPv4 and IPv6 non-link-local addressing schemes; IPv6 link-local addresses remain bound to a specific netdevice. IP failover requests for listen cm_ids bound to specific netdev interfaces cannot be supported (similar to traditional sockets). Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index c650223c52bf..47e884162ce5 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1460,17 +1460,34 @@ static bool cma_protocol_roce(const struct rdma_cm_id *id) return rdma_protocol_roce(device, port_num); } +static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) +{ + const struct sockaddr *daddr = + (const struct sockaddr *)&req->listen_addr_storage; + const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; + + /* Returns true if the req is for IPv6 link local */ + return (daddr->sa_family == AF_INET6 && + (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); } + static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct net_device *net_dev, - u8 port_num) + const struct cma_req_info *req) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) /* This request is an AF_IB request */ - return (!id->port_num || id->port_num == port_num) && + return (!id->port_num || id->port_num == req->port) && (addr->src_addr.ss_family == AF_IB); + /* + * If the request is not for IPv6 link local, allow matching + * request to any netdevice of the one or multiport rdma device. + */ + if (!cma_is_req_ipv6_ll(req)) + return true; /* * Net namespaces must match, and if the listner is listening * on a specific netdevice than netdevice must match as well.
@@ -1498,13 +1515,14 @@ static struct rdma_id_private *cma_find_listener( hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && - cma_match_net_dev(&id_priv->id, net_dev, req->port)) + cma_match_net_dev(&id_priv->id, net_dev, req)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_list) { if (id_priv_dev->id.device == cm_id->device && - cma_match_net_dev(&id_priv_dev->id, net_dev, req->port)) + cma_match_net_dev(&id_priv_dev->id, + net_dev, req)) return id_priv_dev; } } -- cgit v1.2.3 From ff11c6cd521f4fd859c825976e4146dfb166029c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sat, 15 Sep 2018 12:07:56 +0300 Subject: RDMA/cma: Introduce and use cma_acquire_dev_by_src_ip() Light weight version of cma_acquire_dev() just for binding with rdma device based on source IP(v4/v6) address. This simplifies cma_acquire_dev() to avoid listen_id specific checks and also for subsequent simplification for IB vs iWarp. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 84 +++++++++++++++++++++++++++++++++---------- 1 file changed, 66 insertions(+), 18 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 47e884162ce5..11bce4909f54 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -639,6 +639,58 @@ static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } +/** + * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute + * based on source ip address. + * @id_priv: cm_id which should be bound to cma device + * + * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute + * based on source IP address. It returns 0 on success or error code otherwise. + * It is applicable to active and passive side cm_id. + */ +static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + union ib_gid gid, iboe_gid, *gidp; + struct cma_device *cma_dev; + enum ib_gid_type gid_type; + int ret = -ENODEV; + u8 port; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &iboe_gid); + + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + mutex_lock(&lock); + list_for_each_entry(cma_dev, &dev_list, list) { + for (port = rdma_start_port(cma_dev->device); + port <= rdma_end_port(cma_dev->device); port++) { + gidp = rdma_protocol_roce(cma_dev->device, port) ? 
+ &iboe_gid : &gid; + gid_type = cma_dev->default_gid_type[port - 1]; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, gidp, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + cma_attach_to_dev(id_priv, cma_dev); + ret = 0; + goto out; + } + } + } +out: + mutex_unlock(&lock); + return ret; +} + static int cma_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv) { @@ -661,26 +713,22 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); - if (listen_id_priv) { - cma_dev = listen_id_priv->cma_dev; - port = listen_id_priv->id.port_num; - gidp = rdma_protocol_roce(cma_dev->device, port) ? - &iboe_gid : &gid; - gid_type = listen_id_priv->gid_type; - sgid_attr = cma_validate_port(cma_dev->device, port, - gid_type, gidp, id_priv); - if (!IS_ERR(sgid_attr)) { - id_priv->id.port_num = port; - cma_bind_sgid_attr(id_priv, sgid_attr); - ret = 0; - goto out; - } + cma_dev = listen_id_priv->cma_dev; + port = listen_id_priv->id.port_num; + gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; + gid_type = listen_id_priv->gid_type; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, gidp, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + ret = 0; + goto out; } list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { - if (listen_id_priv && - listen_id_priv->cma_dev == cma_dev && + if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; @@ -2878,7 +2926,7 @@ static void addr_handler(int status, struct sockaddr *src_addr, memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { - status = cma_acquire_dev(id_priv, NULL); + status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); @@ -3427,7 +3475,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (ret) goto err1; - ret = cma_acquire_dev(id_priv, NULL); + ret = cma_acquire_dev_by_src_ip(id_priv); if (ret) goto err1; } -- cgit v1.2.3 From 41ab1cb7d1cd5d53d68bcf5fb3fddad77af15545 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sat, 15 Sep 2018 12:07:57 +0300 Subject: RDMA/cma: Introduce and use cma_ib_acquire_dev() When RDMA CM connect request arrives for IB transport, it already contains device, port, netdevice (optional). Instead of traversing all the cma devices, use the cma device already found by the cma_find_listener() for which a listener id is provided. iWarp devices doesn't need to derive RoCE GIDs, therefore drop RoCE specific checks from cma_acquire_dev() and rename it to cma_iw_acquire_dev(). 
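As a shape-only sketch of the asymmetry this split exploits (all types and names below are hypothetical, not the kernel's): for IB the request handler already knows the device and port, so the bind is O(1), whereas iWarp has no such hint and keeps the listener-first scan.

#include <stddef.h>

struct dev { int id; };
struct req { struct dev *device; unsigned int port; };	/* cma_req_info analogue */
struct binding { struct dev *device; unsigned int port; };

/* IB path: the connect request names device and port directly. */
static void ib_acquire(struct binding *b, const struct req *req)
{
	b->device = req->device;
	b->port = req->port;
}

/* iWarp path: try the listener's binding first (GID validation elided
 * here), then fall back to scanning every registered device. */
static int iw_acquire(struct binding *b, const struct binding *listener,
		      struct dev *devs[], size_t ndevs)
{
	size_t i;

	if (listener->device) {
		*b = *listener;
		return 0;
	}
	for (i = 0; i < ndevs; i++) {
		if (devs[i]) {
			b->device = devs[i];
			b->port = 1;
			return 0;
		}
	}
	return -1;
}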
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 93 ++++++++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 24 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 11bce4909f54..897aac68158b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -691,34 +691,80 @@ out: return ret; } -static int cma_acquire_dev(struct rdma_id_private *id_priv, - const struct rdma_id_private *listen_id_priv) +/** + * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute + * @id_priv: cm id to bind to cma device + * @listen_id_priv: listener cm id to match against + * @req: Pointer to req structure containaining incoming + * request information + * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when + * rdma device matches for listen_id and incoming request. It also verifies + * that a GID table entry is present for the source address. + * Returns 0 on success, or returns error code otherwise. + */ +static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, + const struct rdma_id_private *listen_id_priv, + struct cma_req_info *req) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + enum ib_gid_type gid_type; + union ib_gid gid; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + if (rdma_protocol_roce(req->device, req->port)) + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &gid); + else + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; + sgid_attr = cma_validate_port(req->device, req->port, + gid_type, &gid, id_priv); + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); + + id_priv->id.port_num = req->port; + cma_bind_sgid_attr(id_priv, sgid_attr); + /* Need to acquire lock to protect against reader + * of cma_dev->id_list such as cma_netdev_callback() and + * cma_process_remove(). + */ + mutex_lock(&lock); + cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); + mutex_unlock(&lock); + return 0; +} + +static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, + const struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; struct cma_device *cma_dev; - union ib_gid gid, iboe_gid, *gidp; enum ib_gid_type gid_type; int ret = -ENODEV; + union ib_gid gid; u8 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; - mutex_lock(&lock); - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, - &iboe_gid); - memcpy(&gid, dev_addr->src_dev_addr + - rdma_addr_gid_offset(dev_addr), sizeof gid); + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + mutex_lock(&lock); cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; - gidp = rdma_protocol_roce(cma_dev->device, port) ? 
&iboe_gid : &gid; gid_type = listen_id_priv->gid_type; sgid_attr = cma_validate_port(cma_dev->device, port, - gid_type, gidp, id_priv); + gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); @@ -732,11 +778,9 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, listen_id_priv->id.port_num == port) continue; - gidp = rdma_protocol_roce(cma_dev->device, port) ? - &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, - gid_type, gidp, id_priv); + gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); @@ -1582,18 +1626,18 @@ static struct rdma_id_private *cma_find_listener( static struct rdma_id_private * cma_ib_id_from_event(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, + struct cma_req_info *req, struct net_device **net_dev) { - struct cma_req_info req; struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; - err = cma_save_req_info(ib_event, &req); + err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err); - *net_dev = cma_get_net_dev(ib_event, &req); + *net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ @@ -1631,17 +1675,17 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, } if (!validate_net_dev(*net_dev, - (struct sockaddr *)&req.listen_addr_storage, - (struct sockaddr *)&req.src_addr_storage)) { + (struct sockaddr *)&req->listen_addr_storage, + (struct sockaddr *)&req->src_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } } bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net, - rdma_ps_from_service_id(req.service_id), - cma_port_from_service_id(req.service_id)); - id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev); + rdma_ps_from_service_id(req->service_id), + cma_port_from_service_id(req->service_id)); + id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); if (IS_ERR(id_priv) && *net_dev) { @@ -2063,11 +2107,12 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, { struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event = {}; + struct cma_req_info req = {}; struct net_device *net_dev; u8 offset; int ret; - listen_id = cma_ib_id_from_event(cm_id, ib_event, &net_dev); + listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); @@ -2100,7 +2145,7 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); - ret = cma_acquire_dev(conn_id, listen_id); + ret = cma_ib_acquire_dev(conn_id, listen_id, &req); if (ret) goto err2; @@ -2296,7 +2341,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, goto out; } - ret = cma_acquire_dev(conn_id, listen_id); + ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); -- cgit v1.2.3 From e73798f20ecb35f7d6c672d48d6b9da57c8cbf64 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 28 Sep 2018 16:28:02 -0600 Subject: RDMA/uverbs: Fix RCU annotation for radix slot deference The uapi radix tree is a write-once data structure protected by kref. Once we get to the ioctl() fop it is not possible for anything else to be writing to it, so the access should use rcu_dereference_protected. 
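A hedged kernel-style fragment illustrating the distinction (not part of the patch): srcu_dereference() is for readers that can race with updaters and must sit inside an SRCU read-side critical section, while rcu_dereference_protected() asserts, via its condition argument, that no update can be concurrent, which here is guaranteed by the write-once construction and the kref.

#include <linux/rcupdate.h>
#include <linux/errno.h>

struct elem { int val; };

/* Write-once: published during construction, never changed afterwards. */
static struct elem __rcu *slot;

static int read_after_publication(void)
{
	/* No updater can run concurrently, so skip the read-side
	 * machinery and document the invariant in the condition
	 * argument (a lockdep expression in less trivial cases). */
	struct elem *e = rcu_dereference_protected(slot, true);

	return e ? e->val : -ENOENT;
}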
Reported-by: Matthew Wilcox Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 0e95a5888274..b0e493e8d860 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -386,8 +386,7 @@ static int uverbs_set_attr(struct bundle_priv *pbundle, return -EPROTONOSUPPORT; return 0; } - attr = srcu_dereference( - *slot, &pbundle->bundle.ufile->device->disassociate_srcu); + attr = rcu_dereference_protected(*slot, true); /* Reject duplicate attributes from user-space */ if (test_bit(attr_bkey, pbundle->bundle.attr_present)) @@ -498,9 +497,7 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) if (WARN_ON(!slot)) continue; - attr_uapi = srcu_dereference( - *slot, - &pbundle->bundle.ufile->device->disassociate_srcu); + attr_uapi = rcu_dereference_protected(*slot, true); if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { current_ret = uverbs_free_idrs_array( @@ -542,7 +539,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, uapi_key_ioctl_method(hdr->method_id)); if (unlikely(!slot)) return -EPROTONOSUPPORT; - method_elm = srcu_dereference(*slot, &ufile->device->disassociate_srcu); + method_elm = rcu_dereference_protected(*slot, true); if (!method_elm->use_stack) { pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL); -- cgit v1.2.3 From d31131bba5a1630304c55ea775c48cc84912ab59 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Tue, 2 Oct 2018 16:11:21 +0300 Subject: RDMA: Remove unused parameter from ib_modify_qp_is_ok() The ll parameter is not used in ib_modify_qp_is_ok(), so remove it. 
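After this change, every driver's modify_qp path validates a transition with the trimmed signature. A hedged sketch of the common call shape (a fragment, with locals as they appear in the hunks below):

	enum ib_qp_state cur_state, new_state;

	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;

	/* The link layer no longer participates in the check. */
	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
		return -EINVAL;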
Signed-off-by: Kamal Heib Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 3 +-- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3 +-- drivers/infiniband/hw/hns/hns_roce_qp.c | 4 ++-- drivers/infiniband/hw/mlx4/qp.c | 8 +------- drivers/infiniband/hw/mlx5/qp.c | 5 ++--- drivers/infiniband/hw/mthca/mthca_qp.c | 4 ++-- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 3 +-- drivers/infiniband/hw/qedr/verbs.c | 3 +-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 2 +- drivers/infiniband/sw/rdmavt/qp.c | 5 +---- drivers/infiniband/sw/rxe/rxe_qp.c | 3 +-- include/rdma/ib_verbs.h | 4 +--- 12 files changed, 15 insertions(+), 32 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index ee5fc8408add..1e7ad5e0a46e 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1509,8 +1509,7 @@ static const struct { }; bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll) + enum ib_qp_type type, enum ib_qp_attr_mask mask) { enum ib_qp_attr_mask req_param, opt_param; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index bc2b9e038439..9d7c48466f10 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1598,8 +1598,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, curr_qp_state = __to_ib_qp_state(qp->qplib_qp.cur_qp_state); new_qp_state = qp_attr->qp_state; if (!ib_modify_qp_is_ok(curr_qp_state, new_qp_state, - ib_qp->qp_type, qp_attr_mask, - IB_LINK_LAYER_ETHERNET)) { + ib_qp->qp_type, qp_attr_mask)) { dev_err(rdev_to_dev(rdev), "Invalid attribute mask: %#x specified ", qp_attr_mask); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index efb7e961ca65..0378fc41fcfa 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -952,8 +952,8 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } } - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask)) { dev_err(dev, "ib_modify_qp_is_ok failed\n"); goto out; } diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 6dd3cd2c2f80..0711ca1dfb8f 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -2629,7 +2629,6 @@ enum { static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { - enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; @@ -2639,13 +2638,8 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (cur_state != new_state || cur_state != IB_QPS_RESET) { - int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; - ll = rdma_port_get_link_layer(&dev->ib_dev, port); - } - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask, ll)) { + attr_mask)) { pr_debug("qpn 0x%x: invalid attribute mask specified " "for transition %d to %d. 
qp_type %d," " attr_mask 0x%x\n", diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index c49a0815a12b..fa8e5dc65cb4 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3509,7 +3509,6 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, size_t required_cmd_sz; int err = -EINVAL; int port; - enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; if (ibqp->rwq_ind_tbl) return -ENOSYS; @@ -3555,7 +3554,6 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) { port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; - ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); } if (qp->flags & MLX5_IB_QP_UNDERLAY) { @@ -3566,7 +3564,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } } else if (qp_type != MLX5_IB_QPT_REG_UMR && qp_type != MLX5_IB_QPT_DCI && - !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, + attr_mask)) { mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", cur_state, new_state, ibqp->qp_type, attr_mask); goto out; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 3d37f2373d63..9d178ee3c96a 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -872,8 +872,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_UNSPECIFIED)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask)) { mthca_dbg(dev, "Bad QP transition (transport %d) " "%d->%d with attr 0x%08x\n", qp->transport, cur_state, new_state, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index c158ca9fde6d..06d2a7f3304c 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -1480,8 +1480,7 @@ int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_qps = old_qps; spin_unlock_irqrestore(&qp->q_lock, flags); - if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask)) { pr_err("%s(%d) invalid attribute mask=0x%x specified for\n" "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n", __func__, dev->id, attr_mask, qp->id, ibqp->qp_type, diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 9d4d165014d9..82ee4b4a7084 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -2238,8 +2238,7 @@ int qedr_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (rdma_protocol_roce(&dev->ibdev, 1)) { if (!ib_modify_qp_is_ok(old_qp_state, new_qp_state, - ibqp->qp_type, attr_mask, - IB_LINK_LAYER_ETHERNET)) { + ibqp->qp_type, attr_mask)) { DP_ERR(dev, "modify qp: invalid attribute mask=0x%x specified for\n" "qpn=0x%x of type=0x%x old_qp_state=0x%x, new_qp_state=0x%x\n", diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index 60083c0363a5..cf22f57a9f0d 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -499,7 +499,7 @@ int pvrdma_modify_qp(struct 
ib_qp *ibqp, struct ib_qp_attr *attr, next_state = (attr_mask & IB_QP_STATE) ? attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, next_state, ibqp->qp_type, - attr_mask, IB_LINK_LAYER_ETHERNET)) { + attr_mask)) { ret = -EINVAL; goto out; } diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 2db71e956d02..a036a5368103 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1164,11 +1164,8 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int lastwqe = 0; int mig = 0; int pmtu = 0; /* for gcc warning only */ - enum rdma_link_layer link; int opa_ah; - link = rdma_port_get_link_layer(ibqp->device, qp->port_num); - spin_lock_irq(&qp->r_lock); spin_lock(&qp->s_hlock); spin_lock(&qp->s_lock); @@ -1179,7 +1176,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num); if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask, link)) + attr_mask)) goto inval; if (rdi->driver_f.check_modify_qp && diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 45b392b7342f..b9710907dac2 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -419,8 +419,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, enum ib_qp_state new_state = (mask & IB_QP_STATE) ? attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask, - IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) { pr_warn("invalid mask or state for qp\n"); goto err1; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9897d2329f2c..f88c1071413a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2742,7 +2742,6 @@ static inline int ib_destroy_usecnt(atomic_t *usecnt, * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes - * @ll : link layer of port * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It @@ -2751,8 +2750,7 @@ static inline int ib_destroy_usecnt(atomic_t *usecnt, * and that the attribute mask supplied is allowed for the transition. */ bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll); + enum ib_qp_type type, enum ib_qp_attr_mask mask); void ib_register_event_handler(struct ib_event_handler *event_handler); void ib_unregister_event_handler(struct ib_event_handler *event_handler); -- cgit v1.2.3 From 38716732f161c3d107c4cc406a287f1201bed752 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:49:24 +0300 Subject: RDMA/netlink: Simplify netlink listener existence check All users of rdma_nl_chk_listeners() are interested to get boolean answer if netlink socket has listeners, so update all places to boolean function. 
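The subtle part of an int-to-bool conversion like this is call-site polarity: under the old convention 0 meant "listeners present", so every existing test has to be inverted, which is exactly what the addr.c and sa_query.c hunks below do. A small self-contained illustration (plain C, names hypothetical):

#include <stdbool.h>
#include <stdio.h>

/* Old convention: 0 == listeners present, -1 == none. */
static int chk_old(int have) { return have ? 0 : -1; }

/* New convention: the return value finally matches the name. */
static bool chk_new(int have) { return have != 0; }

int main(void)
{
	int have = 1;

	if (!chk_old(have))	/* inverted test: 0 means "yes" */
		puts("old: listeners present");

	if (chk_new(have))	/* reads naturally, but every caller
				 * had to flip its test in the conversion */
		puts("new: listeners present");

	return 0;
}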
Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/netlink.c | 4 ++-- drivers/infiniband/core/sa_query.c | 2 +- include/rdma/rdma_netlink.h | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index c2ca9e4b5160..1400a9d0d56d 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -315,7 +315,7 @@ static void queue_req(struct addr_req *req) static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { - if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) + if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 3ccaae18ad75..724f5a62e82f 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -47,9 +47,9 @@ static struct { const struct rdma_nl_cbs *cb_table; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; -int rdma_nl_chk_listeners(unsigned int group) +bool rdma_nl_chk_listeners(unsigned int group) { - return (netlink_has_listeners(nls, group)) ? 0 : -1; + return netlink_has_listeners(nls, group); } EXPORT_SYMBOL(rdma_nl_chk_listeners); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index a5e76d432d3f..f28f6fdb78cb 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1384,7 +1384,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && (!(query->flags & IB_SA_QUERY_OPA))) { - if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { if (!ib_nl_make_request(query, gfp_mask)) return id; } diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index c369703fcd69..70218e6b5187 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -96,7 +96,7 @@ int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); /** * Check if there are any listeners to the netlink group * @group: the netlink group ID - * Returns 0 on success or a negative for no listeners. + * Returns true on success or false if no listeners. */ -int rdma_nl_chk_listeners(unsigned int group); +bool rdma_nl_chk_listeners(unsigned int group); #endif /* _RDMA_NETLINK_H */ -- cgit v1.2.3 From fe33507ec38a8b2e8b782b83669943b7a5fefd4c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 21 Sep 2018 09:18:24 -0500 Subject: RDMA/core: Check error status of rdma_find_ndev_for_src_ip_rcu rdma_find_ndev_for_src_ip_rcu() returns either valid netdev pointer or ERR_PTR(). Instead of checking for NULL, check for error. 
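The underlying idiom is worth spelling out: functions following the ERR_PTR() convention never return NULL on failure, so a NULL check silently accepts an encoded errno as a valid pointer. A self-contained userspace re-implementation of the idiom (the kernel's own helpers live in include/linux/err.h):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical lookup mirroring rdma_find_ndev_for_src_ip_rcu(): it
 * never returns NULL; failure is encoded in the pointer itself. */
static void *find_ndev(int ok)
{
	static int dummy_netdev;
	return ok ? (void *)&dummy_netdev : ERR_PTR(-ENODEV);
}

int main(void)
{
	void *ndev = find_ndev(0);

	/* Checking for NULL here would wrongly treat the error cookie
	 * as a valid pointer and crash on a later dereference. */
	if (IS_ERR(ndev)) {
		printf("lookup failed: %ld\n", PTR_ERR(ndev));
		return 1;
	}
	return 0;
}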
Fixes: caf1e3ae9fa6 ("RDMA/core Introduce and use rdma_find_ndev_for_src_ip_rcu") Reported-by: syzbot+20c32fa6ff84a2d28c36@syzkaller.appspotmail.com Signed-off-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 1400a9d0d56d..07e0ffe74a8a 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -513,7 +513,7 @@ static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, * loopback IP address. */ ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in); - if (!ndev) + if (IS_ERR(ndev)) return -ENODEV; } -- cgit v1.2.3 From 363ad35577de3a73cf97006ec5f00fccaee73172 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:48:01 +0300 Subject: RDMA/restrack: Un-inline set task implementation Prepare rdma_restrack_set_task() call to accommodate more code by moving its implementation from *.h to *.c. Reviewed-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 10 ++++++++++ include/rdma/restrack.h | 10 ++-------- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index bcc693fffd4c..b02d43988e16 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -155,6 +155,16 @@ static bool res_is_user(struct rdma_restrack_entry *res) } } +void rdma_restrack_set_task(struct rdma_restrack_entry *res, + struct task_struct *task) +{ + if (res->task) + put_task_struct(res->task); + get_task_struct(task); + res->task = task; +} +EXPORT_SYMBOL(rdma_restrack_set_task); + void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 9654d33edd98..0bddbbdbaf7c 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -175,14 +175,8 @@ int rdma_restrack_put(struct rdma_restrack_entry *res); * @res: resource entry * @task: task struct */ -static inline void rdma_restrack_set_task(struct rdma_restrack_entry *res, - struct task_struct *task) -{ - if (res->task) - put_task_struct(res->task); - get_task_struct(task); - res->task = task; -} +void rdma_restrack_set_task(struct rdma_restrack_entry *res, + struct task_struct *task); /* * Helper functions for rdma drivers when filling out -- cgit v1.2.3 From 2165fc264079ecb7fbfa5e8b330a92eb3f0fcbe1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:48:02 +0300 Subject: RDMA/restrack: Consolidate task name updates in one place Unify task update and kernel name set in one place. 
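The core of rdma_restrack_set_task() is a reference hand-off, and moving it out of the header is what lets the following patches grow it. A minimal userspace sketch of the hand-off invariant (hypothetical types, plain counters standing in for task_struct refcounting):

#include <stdio.h>

struct obj { int refcount; };

static void obj_get(struct obj *o) { o->refcount++; }
static void obj_put(struct obj *o) { o->refcount--; }

struct res { struct obj *owner; };

/* Drop the old owner's reference before taking one on the new owner;
 * skipping the put would leak a reference on every reassignment. */
static void set_owner(struct res *r, struct obj *new_owner)
{
	if (r->owner)
		obj_put(r->owner);
	obj_get(new_owner);
	r->owner = new_owner;
}

int main(void)
{
	struct obj a = { 1 }, b = { 1 };
	struct res r = { NULL };

	set_owner(&r, &a);
	set_owner(&r, &b);	/* a's extra reference is dropped here */
	printf("a=%d b=%d\n", a.refcount, b.refcount);	/* prints a=1 b=2 */
	return 0;
}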
Reviewed-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 10 ++-------- drivers/infiniband/core/cq.c | 2 +- drivers/infiniband/core/restrack.c | 13 +++++++++---- drivers/infiniband/core/verbs.c | 4 ++-- include/rdma/restrack.h | 4 ++-- 5 files changed, 16 insertions(+), 17 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 897aac68158b..f117b755c4c2 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -875,10 +875,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, if (!id_priv) return ERR_PTR(-ENOMEM); - if (caller) - id_priv->res.kern_name = caller; - else - rdma_restrack_set_task(&id_priv->res, current); + rdma_restrack_set_task(&id_priv->res, caller); id_priv->res.type = RDMA_RESTRACK_CM_ID; id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; @@ -3945,10 +3942,7 @@ int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, id_priv = container_of(id, struct rdma_id_private, id); - if (caller) - id_priv->res.kern_name = caller; - else - rdma_restrack_set_task(&id_priv->res, current); + rdma_restrack_set_task(&id_priv->res, caller); if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index 9271f7290005..b1e5365ddafa 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -161,7 +161,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, goto out_destroy_cq; cq->res.type = RDMA_RESTRACK_CQ; - cq->res.kern_name = caller; + rdma_restrack_set_task(&cq->res, caller); rdma_restrack_add(&cq->res); switch (cq->poll_ctx) { diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index b02d43988e16..035af568ba64 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -156,12 +156,17 @@ static bool res_is_user(struct rdma_restrack_entry *res) } void rdma_restrack_set_task(struct rdma_restrack_entry *res, - struct task_struct *task) + const char *caller) { + if (caller) { + res->kern_name = caller; + return; + } + if (res->task) put_task_struct(res->task); - get_task_struct(task); - res->task = task; + get_task_struct(current); + res->task = current; } EXPORT_SYMBOL(rdma_restrack_set_task); @@ -177,7 +182,7 @@ void rdma_restrack_add(struct rdma_restrack_entry *res) if (res_is_user(res)) { if (!res->task) - rdma_restrack_set_task(res, current); + rdma_restrack_set_task(res, NULL); res->kern_name = NULL; } else { set_kern_name(res); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 1e7ad5e0a46e..65a7e0b44ad7 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -264,7 +264,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, } pd->res.type = RDMA_RESTRACK_PD; - pd->res.kern_name = caller; + rdma_restrack_set_task(&pd->res, caller); rdma_restrack_add(&pd->res); if (mr_access_flags) { @@ -1889,7 +1889,7 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); cq->res.type = RDMA_RESTRACK_CQ; - cq->res.kern_name = caller; + rdma_restrack_set_task(&cq->res, caller); rdma_restrack_add(&cq->res); } diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 0bddbbdbaf7c..2638fa7cd702 100644 --- 
a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -173,10 +173,10 @@ int rdma_restrack_put(struct rdma_restrack_entry *res); /** * rdma_restrack_set_task() - set the task for this resource * @res: resource entry - * @task: task struct + * @caller: kernel name, the current task will be used if the caller is NULL. */ void rdma_restrack_set_task(struct rdma_restrack_entry *res, - struct task_struct *task); + const char *caller); /* * Helper functions for rdma drivers when filling out -- cgit v1.2.3 From ed7a01fd3fd77f40b4ef2562b966a5decd8928d2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:48:03 +0300 Subject: RDMA/restrack: Release task struct which was hold by CM_ID object Tracking CM_ID resource is performed in two stages: creation of cm_id and connecting it to the cma_dev. It is needed because rdma-cm protocol exports two separate user-visible calls rdma_create_id and rdma_accept. At the time of CM_ID creation, the real owner of that object is unknown yet and we need to grab task_struct. This task_struct is released or reassigned in attach phase later on. but call to rdma_destroy_id left this task_struct unreleased. Such separation is unique to CM_ID and other restrack objects initialize in one shot. It means that it is safe to use "res->valid" check to catch unfinished CM_ID flow and release task_struct for that object. Fixes: 00313983cda6 ("RDMA/nldev: provide detailed CM_ID information") Reported-by: Artemy Kovalyov Reviewed-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 7 +++---- drivers/infiniband/core/restrack.c | 6 ++++-- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index f117b755c4c2..f98ddb5f4d59 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1815,8 +1815,8 @@ void rdma_destroy_id(struct rdma_cm_id *id) mutex_lock(&id_priv->handler_mutex); mutex_unlock(&id_priv->handler_mutex); + rdma_restrack_del(&id_priv->res); if (id_priv->cma_dev) { - rdma_restrack_del(&id_priv->res); if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); @@ -3542,10 +3542,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) return 0; err2: - if (id_priv->cma_dev) { - rdma_restrack_del(&id_priv->res); + rdma_restrack_del(&id_priv->res); + if (id_priv->cma_dev) cma_release_dev(id_priv); - } err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 035af568ba64..16b5f9949770 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -223,7 +223,7 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) struct ib_device *dev; if (!res->valid) - return; + goto out; dev = res_to_dev(res); if (!dev) @@ -236,8 +236,10 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) down_write(&dev->res.rwsem); hash_del(&res->node); res->valid = false; + up_write(&dev->res.rwsem); + +out: if (res->task) put_task_struct(res->task); - up_write(&dev->res.rwsem); } EXPORT_SYMBOL(rdma_restrack_del); -- cgit v1.2.3 From fe9bc1644918aa1d02a889b4ca788bfb67f90816 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Oct 2018 22:10:10 +0300 Subject: RDMA/restrack: Protect from reentry to resource return path Nullify 
the resource task struct pointer to ensure that subsequent calls won't try to release task_struct again. ------------[ cut here ]------------ ODEBUG: free active (active state 1) object type: rcu_head hint: (null) WARNING: CPU: 0 PID: 6048 at lib/debugobjects.c:329 debug_print_object+0x16a/0x210 lib/debugobjects.c:326 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 6048 Comm: syz-executor022 Not tainted 4.19.0-rc7-next-20181008+ #89 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x244/0x3ab lib/dump_stack.c:113 panic+0x238/0x4e7 kernel/panic.c:184 __warn.cold.8+0x163/0x1ba kernel/panic.c:536 report_bug+0x254/0x2d0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:178 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271 do_invalid_op+0x36/0x40 arch/x86/kernel/traps.c:290 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:969 RIP: 0010:debug_print_object+0x16a/0x210 lib/debugobjects.c:326 Code: 41 88 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 92 00 00 00 48 8b 14 dd 60 02 41 88 4c 89 fe 48 c7 c7 00 f8 40 88 e8 36 2f b4 fd <0f> 0b 83 05 a9 f4 5e 06 01 48 83 c4 18 5b 41 5c 41 5d 41 5e 41 5f RSP: 0018:ffff8801d8c3eda8 EFLAGS: 00010086 RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff8164d235 RDI: 0000000000000005 RBP: ffff8801d8c3ede8 R08: ffff8801d70aa280 R09: ffffed003b5c3eda R10: ffffed003b5c3eda R11: ffff8801dae1f6d7 R12: 0000000000000001 R13: ffffffff8939a760 R14: 0000000000000000 R15: ffffffff8840fca0 __debug_check_no_obj_freed lib/debugobjects.c:786 [inline] debug_check_no_obj_freed+0x3ae/0x58d lib/debugobjects.c:818 kmem_cache_free+0x202/0x290 mm/slab.c:3759 free_task_struct kernel/fork.c:163 [inline] free_task+0x16e/0x1f0 kernel/fork.c:457 __put_task_struct+0x2e6/0x620 kernel/fork.c:730 put_task_struct include/linux/sched/task.h:96 [inline] finish_task_switch+0x66c/0x900 kernel/sched/core.c:2715 context_switch kernel/sched/core.c:2834 [inline] __schedule+0x8d7/0x21d0 kernel/sched/core.c:3480 schedule+0xfe/0x460 kernel/sched/core.c:3524 freezable_schedule include/linux/freezer.h:172 [inline] futex_wait_queue_me+0x3f9/0x840 kernel/futex.c:2530 futex_wait+0x45c/0xa50 kernel/futex.c:2645 do_futex+0x31a/0x26d0 kernel/futex.c:3528 __do_sys_futex kernel/futex.c:3589 [inline] __se_sys_futex kernel/futex.c:3557 [inline] __x64_sys_futex+0x472/0x6a0 kernel/futex.c:3557 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x446549 Code: e8 2c b3 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 2b 09 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f3a998f5da8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: ffffffffffffffda RBX: 00000000006dbc38 RCX: 0000000000446549 RDX: 0000000000000000 RSI: 0000000000000080 RDI: 00000000006dbc38 RBP: 00000000006dbc30 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006dbc3c R13: 2f646e6162696e69 R14: 666e692f7665642f R15: 00000000006dbd2c Kernel Offset: disabled Reported-by: syzbot+71aff6ea121ffefc280f@syzkaller.appspotmail.com Fixes: ed7a01fd3fd7 ("RDMA/restrack: Release task struct which was hold by CM_ID object") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 
'drivers/infiniband/core') diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 16b5f9949770..06d8657ce583 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -239,7 +239,9 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) up_write(&dev->res.rwsem); out: - if (res->task) + if (res->task) { put_task_struct(res->task); + res->task = NULL; + } } EXPORT_SYMBOL(rdma_restrack_del); -- cgit v1.2.3 From e54b6a3bcd1ec972b25a164bdf495d9e7120b107 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Oct 2018 22:36:10 +0300 Subject: RDMA/cm: Respect returned status of cm_init_av_by_path Add missing check for failure of cm_init_av_by_path Fixes: e1444b5a163e ("IB/cm: Fix automatic path migration support") Reported-by: Slava Shwartsman Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index a6a20603ccea..edb2cb758be7 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3292,8 +3292,11 @@ static int cm_lap_handler(struct cm_work *work) if (ret) goto unlock; - cm_init_av_by_path(param->alternate_path, NULL, &cm_id_priv->alt_av, - cm_id_priv); + ret = cm_init_av_by_path(param->alternate_path, NULL, + &cm_id_priv->alt_av, cm_id_priv); + if (ret) + goto unlock; + cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; ret = atomic_inc_and_test(&cm_id_priv->work_count); -- cgit v1.2.3 From d6f9125207902ace40d36d6571cda251b43a8f95 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Oct 2018 17:30:03 +0300 Subject: RDMA/cma: Remove unused timeout_ms parameter from cma_resolve_iw_route() cma_resolve_iw_route() doesn't use timeout_ms parameter, so let's remove it. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 5c7e3bafdd4a..1156cb911a5c 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2751,7 +2751,7 @@ err: } EXPORT_SYMBOL(rdma_set_ib_path); -static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) +static int cma_resolve_iw_route(struct rdma_id_private *id_priv) { struct cma_work *work; @@ -2867,7 +2867,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) else if (rdma_protocol_roce(id->device, id->port_num)) ret = cma_resolve_iboe_route(id_priv); else if (rdma_protocol_iwarp(id->device, id->port_num)) - ret = cma_resolve_iw_route(id_priv, timeout_ms); + ret = cma_resolve_iw_route(id_priv); else ret = -ENOSYS; -- cgit v1.2.3 From 9549c2bd094f0f54b8827d64886f5b1de370dff3 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Oct 2018 17:30:04 +0300 Subject: RDMA/core: Align multiple functions to kernel coding style This patch changes the small number of functions to be aligned to kernel coding style. It is needed to minimize the diffstat of the following patch. It doesn't change any functionality. 
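For reference, this is the alignment rule being applied, shown on a hypothetical prototype rather than the real ones in the diff below: continuation lines are indented to the column just past the opening parenthesis, per the kernel coding style.

/* before: continuation lines drift to arbitrary columns */
int frob_query(struct frob_client *client,
	struct frob_device *device, u8 port,
		void (*cb)(int status, void *ctx),
	void *ctx);

/* after: continuations line up under the first argument */
int frob_query(struct frob_client *client, struct frob_device *device,
	       u8 port, void (*cb)(int status, void *ctx), void *ctx);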
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 3 +-- drivers/infiniband/core/sa.h | 10 ++++------ include/rdma/ib_addr.h | 3 +-- include/rdma/ib_sa.h | 36 +++++++++++++++--------------------- 4 files changed, 21 insertions(+), 31 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 07e0ffe74a8a..b6f7cde36c2d 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -662,8 +662,7 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), - bool resolve_by_gid_attr, - void *context) + bool resolve_by_gid_attr, void *context) { struct sockaddr *src_in, *dst_in; struct addr_req *req; diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h index b1d4bbf4ce5c..57d4496f6720 100644 --- a/drivers/infiniband/core/sa.h +++ b/drivers/infiniband/core/sa.h @@ -49,16 +49,14 @@ static inline void ib_sa_client_put(struct ib_sa_client *client) } int ib_sa_mcmember_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - u8 method, + struct ib_device *device, u8 port_num, u8 method, struct ib_sa_mcmember_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + ib_sa_comp_mask comp_mask, int timeout_ms, + gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), - void *context, - struct ib_sa_query **sa_query); + void *context, struct ib_sa_query **sa_query); int mcast_init(void); void mcast_cleanup(void); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index e09eca91eb18..eebbe63b530c 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -102,8 +102,7 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), - bool resolve_by_gid_attr, - void *context); + bool resolve_by_gid_attr, void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index b6ddf2a1b9d8..95ce625a49e3 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -449,28 +449,23 @@ struct ib_sa_query; void ib_sa_cancel_query(int id, struct ib_sa_query *query); -int ib_sa_path_rec_get(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct sa_path_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct sa_path_rec *resp, +int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, + u8 port_num, struct sa_path_rec *rec, + ib_sa_comp_mask comp_mask, int timeout_ms, + gfp_t gfp_mask, + void (*callback)(int status, struct sa_path_rec *resp, void *context), - void *context, - struct ib_sa_query **query); + void *context, struct ib_sa_query **query); int ib_sa_service_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - u8 method, - struct ib_sa_service_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_service_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query); + struct ib_device *device, u8 port_num, u8 method, + struct ib_sa_service_rec *rec, + 
ib_sa_comp_mask comp_mask, int timeout_ms, + gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_service_rec *resp, + void *context), + void *context, struct ib_sa_query **sa_query); struct ib_sa_multicast { struct ib_sa_mcmember_rec rec; @@ -577,8 +572,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), - void *context, - struct ib_sa_query **sa_query); + void *context, struct ib_sa_query **sa_query); bool ib_sa_sendonly_fullmem_support(struct ib_sa_client *client, struct ib_device *device, -- cgit v1.2.3 From dbace111e5b320682eee63d7173959a2b2bd9ccb Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Oct 2018 17:30:05 +0300 Subject: RDMA/core: Annotate timeout as unsigned long The ucma users supply timeout in u32 format, it means that any number with most significant bit set will be converted to negative value by various rdma_*, cma_* and sa_query functions, which treat timeout as int. In the lowest level, the timeout is converted back to be unsigned long. Remove this ambiguous conversion by updating all function signatures to receive unsigned long. Reported-by: Noa Osherovich Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/cma.c | 11 ++++++----- drivers/infiniband/core/mad.c | 2 +- drivers/infiniband/core/mad_priv.h | 2 +- drivers/infiniband/core/sa.h | 4 ++-- drivers/infiniband/core/sa_query.c | 13 +++++++------ include/rdma/ib_addr.h | 2 +- include/rdma/ib_cm.h | 2 +- include/rdma/ib_sa.h | 6 +++--- include/rdma/rdma_cm.h | 5 +++-- 10 files changed, 26 insertions(+), 23 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index b6f7cde36c2d..0dce94e3c495 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -659,7 +659,7 @@ static void process_one_req(struct work_struct *_work) } int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr, int timeout_ms, + struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), bool resolve_by_gid_attr, void *context) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 1156cb911a5c..15d5bb7bf6bb 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2510,8 +2510,8 @@ static void cma_query_handler(int status, struct sa_path_rec *path_rec, queue_work(cma_wq, &work->work); } -static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, - struct cma_work *work) +static int cma_query_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; @@ -2629,7 +2629,8 @@ static void cma_init_resolve_addr_work(struct cma_work *work, work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; } -static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) +static int cma_resolve_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; @@ -2852,7 +2853,7 @@ err1: return ret; } -int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private 
*id_priv; int ret; @@ -3072,7 +3073,7 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, int timeout_ms) + const struct sockaddr *dst_addr, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index c355379e7534..d7025cd5be28 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2414,7 +2414,7 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) } void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, - int timeout_ms) + unsigned long timeout_ms) { mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); wait_for_response(mad_send_wr); diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index d84ae1671898..216509036aa8 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -221,6 +221,6 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr); void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, - int timeout_ms); + unsigned long timeout_ms); #endif /* __IB_MAD_PRIV_H__ */ diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h index 57d4496f6720..cbaaaa92fff3 100644 --- a/drivers/infiniband/core/sa.h +++ b/drivers/infiniband/core/sa.h @@ -51,8 +51,8 @@ static inline void ib_sa_client_put(struct ib_sa_client *client) int ib_sa_mcmember_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_mcmember_rec *rec, - ib_sa_comp_mask comp_mask, int timeout_ms, - gfp_t gfp_mask, + ib_sa_comp_mask comp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index f28f6fdb78cb..be5ba5e15496 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1360,7 +1360,8 @@ static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent) spin_unlock_irqrestore(&tid_lock, flags); } -static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) +static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, + gfp_t gfp_mask) { bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; @@ -1550,7 +1551,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct sa_path_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, void *context), @@ -1704,7 +1705,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, void *context), @@ -1801,7 +1802,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), @@ -1892,7 +1893,7 @@ int 
ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), @@ -2059,7 +2060,7 @@ static void ib_sa_classport_info_rec_release(struct ib_sa_query *sa_query) } static int ib_sa_classport_info_rec_query(struct ib_sa_port *port, - int timeout_ms, + unsigned long timeout_ms, void (*callback)(void *context), void *context, struct ib_sa_query **sa_query) diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index eebbe63b530c..2734c895c1bf 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -99,7 +99,7 @@ int rdma_translate_ip(const struct sockaddr *addr, * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr, int timeout_ms, + struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), bool resolve_by_gid_attr, void *context); diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index c10f4b5ea8ab..49f4f75499b3 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -583,7 +583,7 @@ struct ib_cm_sidr_req_param { struct sa_path_rec *path; const struct ib_gid_attr *sgid_attr; __be64 service_id; - int timeout_ms; + unsigned long timeout_ms; const void *private_data; u8 private_data_len; u8 max_cm_retries; diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 95ce625a49e3..19520979b84c 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -451,7 +451,7 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query); int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct sa_path_rec *rec, - ib_sa_comp_mask comp_mask, int timeout_ms, + ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, void *context), @@ -460,7 +460,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, int ib_sa_service_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, - ib_sa_comp_mask comp_mask, int timeout_ms, + ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, @@ -568,7 +568,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 53d93c7d8e01..60987a5903b7 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -196,7 +196,8 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); * @timeout_ms: Time to wait for resolution to complete. 
*/ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, int timeout_ms); + const struct sockaddr *dst_addr, + unsigned long timeout_ms); /** * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier @@ -206,7 +207,7 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, * Users must have first called rdma_resolve_addr to resolve a dst_addr * into an RDMA address before calling this routine. */ -int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); +int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms); /** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA -- cgit v1.2.3 From d21943dd19b5c79dc09bb0e8bf80cd5ee09c41c2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 10 Oct 2018 09:19:11 +0300 Subject: RDMA/core: Implement IB device rename function Generic implementation of IB device rename function. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/core_priv.h | 1 + drivers/infiniband/core/device.c | 25 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index d7399d5b1cb6..c5881756b799 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -87,6 +87,7 @@ int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); void ib_device_unregister_sysfs(struct ib_device *device); +int ib_device_rename(struct ib_device *ibdev, const char *name); typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, struct net_device *idev, void *cookie); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index d105b9b2d118..5e70f5e1cfd9 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -171,6 +171,31 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } +int ib_device_rename(struct ib_device *ibdev, const char *name) +{ + struct ib_device *device; + int ret = 0; + + if (!strcmp(name, dev_name(&ibdev->dev))) + return ret; + + mutex_lock(&device_mutex); + list_for_each_entry(device, &device_list, core_list) { + if (!strcmp(name, dev_name(&device->dev))) { + ret = -EEXIST; + goto out; + } + } + + ret = device_rename(&ibdev->dev, name); + if (ret) + goto out; + strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); +out: + mutex_unlock(&device_mutex); + return ret; +} + static int alloc_name(struct ib_device *ibdev, const char *name) { unsigned long *inuse; -- cgit v1.2.3 From 05d940d3a3ec4e6d5d6a726aae4d73c5c64603c6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 10 Oct 2018 09:19:12 +0300 Subject: RDMA/nldev: Allow IB device rename through RDMA netlink Provide an option to rename an IB device through RDMA netlink, limited to users with the ADMIN capability only.
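As an illustration of the new uAPI, below is a minimal userspace sketch that drives this command over a raw NETLINK_RDMA socket. The command and attribute names come from the diff that follows; the helper names, buffer sizing, and the omission of ACK parsing are assumptions of the sketch (the iproute2 rdma tool is the usual consumer of this interface).

    /* Hypothetical sketch; ACK parsing and robust error handling omitted. */
    #include <linux/netlink.h>
    #include <rdma/rdma_netlink.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <unistd.h>

    static void put_attr(struct nlmsghdr *nlh, unsigned short type,
                         const void *data, unsigned short len)
    {
            /* Append one netlink attribute at the current end of the message. */
            struct nlattr *nla =
                    (struct nlattr *)((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));

            nla->nla_type = type;
            nla->nla_len = NLA_HDRLEN + len;
            memcpy((char *)nla + NLA_HDRLEN, data, len);
            nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + NLA_ALIGN(nla->nla_len);
    }

    static int rdma_dev_rename(__u32 dev_index, const char *new_name)
    {
            struct {
                    struct nlmsghdr nlh;
                    char payload[128];
            } req = {};
            int fd, ret;

            fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_RDMA);
            if (fd < 0)
                    return -1;

            req.nlh.nlmsg_len = NLMSG_LENGTH(0);
            req.nlh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_SET);
            req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

            /* DEV_INDEX selects the device, DEV_NAME carries the new name. */
            put_attr(&req.nlh, RDMA_NLDEV_ATTR_DEV_INDEX, &dev_index, sizeof(dev_index));
            put_attr(&req.nlh, RDMA_NLDEV_ATTR_DEV_NAME, new_name, strlen(new_name) + 1);

            ret = send(fd, &req, req.nlh.nlmsg_len, 0) < 0 ? -1 : 0;
            close(fd);
            return ret;
    }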
Signed-off-by: Leon Romanovsky Reviewed-by: Parav Pandit Signed-off-by: Doug Ledford --- drivers/infiniband/core/nldev.c | 34 ++++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 3 ++- 2 files changed, 36 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index ba5403fbcd88..573399e3ccc1 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -646,6 +646,36 @@ err: return err; } +static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { + char name[IB_DEVICE_NAME_MAX] = {}; + + nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + IB_DEVICE_NAME_MAX); + err = ib_device_rename(device, name); + } + + put_device(&device->dev); + return err; +} + static int _nldev_get_dumpit(struct ib_device *device, struct sk_buff *skb, struct netlink_callback *cb, @@ -1078,6 +1108,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_get_doit, .dump = nldev_get_dumpit, }, + [RDMA_NLDEV_CMD_SET] = { + .doit = nldev_set_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NLDEV_CMD_PORT_GET] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index edba6351ac13..f9c41bf59efc 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -227,8 +227,9 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_UNSPEC, RDMA_NLDEV_CMD_GET, /* can dump */ + RDMA_NLDEV_CMD_SET, - /* 2 - 4 are free to use */ + /* 3 - 4 are free to use */ RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */ -- cgit v1.2.3 From 0f6ef65d1c6ec8deb5d0f11f86631ec4cfe8f22e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 7 Oct 2018 12:12:40 +0300 Subject: RDMA/core: Do not expose unsupported counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the provider driver (such as rdma_rxe) doesn't support PMA counters, avoid exposing their sysfs directory, similar to the optional hw_counters directory. If the core fails to read a PMA counter, return an error so that the user can retry later if needed.
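The visible change is that a failed PMA query now reaches userspace as a read(2) error instead of a well-formed "N/A (no PMA)" string. A minimal sketch of the resulting retry pattern, assuming the conventional sysfs layout (the path format and counter name below are illustrative, not taken from the patch):

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    /* Returns the counter value, or -1 on error (caller may retry later). */
    static long read_pma_counter(const char *dev, int port, const char *counter)
    {
            char path[256], buf[64];
            ssize_t n;
            int fd;

            snprintf(path, sizeof(path),
                     "/sys/class/infiniband/%s/ports/%d/counters/%s",
                     dev, port, counter);
            fd = open(path, O_RDONLY);
            if (fd < 0)
                    return -1;      /* directory is absent when there is no PMA support */

            n = read(fd, buf, sizeof(buf) - 1);
            close(fd);
            if (n < 0)
                    return -1;      /* transient MAD failure: retry later */

            buf[n] = '\0';
            return strtol(buf, NULL, 0);
    }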
Fixes: 35c4cbb17811 ("IB/core: Create get_perf_mad function in sysfs.c") Reported-by: Holger Hoffstätte Tested-by: Holger Hoffstätte Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/sysfs.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index bc947a863b34..107c8ba2046c 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -512,7 +512,7 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, 40 + offset / 8, sizeof(data)); if (ret < 0) - return sprintf(buf, "N/A (no PMA)\n"); + return ret; switch (width) { case 4: @@ -1057,10 +1057,12 @@ static int add_port(struct ib_device *device, int port_num, goto err_put; } - p->pma_table = get_counter_table(device, port_num); - ret = sysfs_create_group(&p->kobj, p->pma_table); - if (ret) - goto err_put_gid_attrs; + if (device->process_mad) { + p->pma_table = get_counter_table(device, port_num); + ret = sysfs_create_group(&p->kobj, p->pma_table); + if (ret) + goto err_put_gid_attrs; + } p->gid_group.name = "gids"; p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); @@ -1173,7 +1175,8 @@ err_free_gid: p->gid_group.attrs = NULL; err_remove_pma: - sysfs_remove_group(&p->kobj, p->pma_table); + if (p->pma_table) + sysfs_remove_group(&p->kobj, p->pma_table); err_put_gid_attrs: kobject_put(&p->gid_attr_group->kobj); @@ -1289,7 +1292,9 @@ static void free_port_list_attributes(struct ib_device *device) kfree(port->hw_stats); free_hsag(&port->kobj, port->hw_stats_ag); } - sysfs_remove_group(p, port->pma_table); + + if (port->pma_table) + sysfs_remove_group(p, port->pma_table); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); sysfs_remove_group(&port->gid_attr_group->kobj, -- cgit v1.2.3 From 1ae4cfa03902c83d1d77123e5ac8f0812c61b90e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 7 Oct 2018 12:12:41 +0300 Subject: RDMA/core: Rename ports_parent to ports_kobj Normally, kobject fields carry a kobj suffix to reflect their type. Rename ports_parent to ports_kobj accordingly.
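The renamed field only ever holds the kobject behind the "ports" sysfs directory. Condensed from the register and teardown paths in the diff below (error handling trimmed), its lifecycle is the standard kobject pattern:

    /* registration: creates the "ports" directory under the class device */
    device->ports_kobj = kobject_create_and_add("ports", &class_dev->kobj);
    if (!device->ports_kobj)
            return -ENOMEM;

    /* teardown: drop the reference; the kobject core frees it on the last put */
    kobject_put(device->ports_kobj);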
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/sysfs.c | 9 ++++----- include/rdma/ib_verbs.h | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 107c8ba2046c..f54f107ef668 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1036,7 +1036,7 @@ static int add_port(struct ib_device *device, int port_num, p->port_num = port_num; ret = kobject_init_and_add(&p->kobj, &port_type, - device->ports_parent, + device->ports_kobj, "%d", port_num); if (ret) { kfree(p); @@ -1305,7 +1305,7 @@ static void free_port_list_attributes(struct ib_device *device) kobject_put(p); } - kobject_put(device->ports_parent); + kobject_put(device->ports_kobj); } int ib_device_register_sysfs(struct ib_device *device, @@ -1323,9 +1323,8 @@ int ib_device_register_sysfs(struct ib_device *device, if (ret) goto err; - device->ports_parent = kobject_create_and_add("ports", - &class_dev->kobj); - if (!device->ports_parent) { + device->ports_kobj = kobject_create_and_add("ports", &class_dev->kobj); + if (!device->ports_kobj) { ret = -ENOMEM; goto err_put; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 7ce617d77f8f..7d732cf87886 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2542,7 +2542,7 @@ struct ib_device { /* First group for device attributes, NULL terminated array */ const struct attribute_group *groups[2]; - struct kobject *ports_parent; + struct kobject *ports_kobj; struct list_head port_list; enum { -- cgit v1.2.3 From 7d65cbf0b0ac7d7eebf397ff9af6645b2b3004c2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:13:28 +0300 Subject: RDMA/core: Increase total number of RDMA ports across all devices IDA adds overhead to store the ID bitmap, and the maximal value of an IDA can be up to 2099202 (IDA_MAX = 0x80000000U / IDA_BITMAP_BITS - 1). However, there is no need to support such an enormous number of ports, and it is enough for now to limit it to 8192. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/core_priv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index c5881756b799..bb9007a0cca7 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -44,7 +44,7 @@ #include "mad_priv.h" /* Total number of ports combined across all struct ib_devices's */ -#define RDMA_MAX_PORTS 1024 +#define RDMA_MAX_PORTS 8192 struct pkey_index_qp_list { struct list_head pkey_index_list; -- cgit v1.2.3 From 90f6e41cc03a4055d56e94ad7c97df4b1add7f61 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:13:29 +0300 Subject: RDMA/uverbs: Use kernel API to allocate uverbs indexes Replace the custom index-allocation code with the generic kernel IDA API.
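This patch and the umad patch that follows perform the same conversion; a generic sketch of the IDA pattern they adopt (identifier names here are illustrative, not from either patch):

    #include <linux/idr.h>

    static DEFINE_IDA(example_ida);

    static int example_alloc_devnum(void)
    {
            /* Lowest free ID in [0, EXAMPLE_MAX_DEVICES - 1]; a negative
             * errno (-ENOSPC/-ENOMEM) replaces the old bitmap-full check. */
            return ida_alloc_max(&example_ida, EXAMPLE_MAX_DEVICES - 1,
                                 GFP_KERNEL);
    }

    static void example_free_devnum(int devnum)
    {
            ida_free(&example_ida, devnum);
    }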
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 4258cbd55ed7..6d373f5515b7 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -73,7 +73,7 @@ enum { static dev_t dynamic_uverbs_dev; static struct class *uverbs_class; -static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); +static DEFINE_IDA(uverbs_ida); static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, const char __user *buf, int in_len, @@ -1268,11 +1268,11 @@ static void ib_uverbs_add_one(struct ib_device *device) rcu_assign_pointer(uverbs_dev->ib_dev, device); uverbs_dev->num_comp_vectors = device->num_comp_vectors; - devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); - if (devnum >= IB_UVERBS_MAX_DEVICES) + devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1, + GFP_KERNEL); + if (devnum < 0) goto err; uverbs_dev->devnum = devnum; - set_bit(devnum, dev_map); if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; else @@ -1296,7 +1296,7 @@ static void ib_uverbs_add_one(struct ib_device *device) return; err_uapi: - clear_bit(devnum, dev_map); + ida_free(&uverbs_ida, devnum); err: if (atomic_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); @@ -1371,7 +1371,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) return; cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); - clear_bit(uverbs_dev->devnum, dev_map); + ida_free(&uverbs_ida, uverbs_dev->devnum); if (device->disassociate_ucontext) { /* We disassociate HW resources and immediately return. -- cgit v1.2.3 From 551d315e34a5e6961f8deaf2d6f37ad24fccaa08 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:13:30 +0300 Subject: RDMA/umad: Use kernel API to allocate umad indexes Replace the custom index-allocation code with the generic kernel IDA API.
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/user_mad.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 9961859da06a..f55f48f6b272 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -138,7 +138,7 @@ static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) + static dev_t dynamic_umad_dev; static dev_t dynamic_issm_dev; -static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); +static DEFINE_IDA(umad_ida); static void ib_umad_add_one(struct ib_device *device); static void ib_umad_remove_one(struct ib_device *device, void *client_data); @@ -1159,11 +1159,10 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, dev_t base_umad; dev_t base_issm; - devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); - if (devnum >= IB_UMAD_MAX_PORTS) + devnum = ida_alloc_max(&umad_ida, IB_UMAD_MAX_PORTS - 1, GFP_KERNEL); + if (devnum < 0) return -1; port->dev_num = devnum; - set_bit(devnum, dev_map); if (devnum >= IB_UMAD_NUM_FIXED_MINOR) { base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; @@ -1227,7 +1226,7 @@ err_dev: err_cdev: cdev_del(&port->cdev); - clear_bit(devnum, dev_map); + ida_free(&umad_ida, devnum); return -1; } @@ -1261,7 +1260,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port) } mutex_unlock(&port->file_mutex); - clear_bit(port->dev_num, dev_map); + ida_free(&umad_ida, port->dev_num); } static void ib_umad_add_one(struct ib_device *device) -- cgit v1.2.3 From 67fecaf8e9cc28812042f61194ac0e0a9737f897 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 17 Oct 2018 13:19:27 +0300 Subject: RDMA/core: Fix unwinding flow in case of error to register device If port pkey list initialization fails, free the port_immutable memory in the cleanup path; currently this is missed. If cache setup fails, free the pkey list in the cleanup path. Fixes: d291f1a65 ("IB/core: Enforce PKey security on QPs") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/device.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 5e70f5e1cfd9..d175b94ae952 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -545,14 +545,14 @@ int ib_register_device(struct ib_device *device, const char *name, ret = setup_port_pkey_list(device); if (ret) { dev_warn(&device->dev, "Couldn't create per port_pkey_list\n"); - goto out; + goto port_cleanup; } ret = ib_cache_setup_one(device); if (ret) { dev_warn(&device->dev, "Couldn't set up InfiniBand P_Key/GID cache\n"); - goto port_cleanup; + goto pkey_cleanup; } device->index = __dev_new_index(); @@ -596,6 +596,8 @@ cg_cleanup: cache_cleanup: ib_cache_cleanup_one(device); ib_cache_release_one(device); +pkey_cleanup: + kfree(device->port_pkey_list); port_cleanup: kfree(device->port_immutable); out: -- cgit v1.2.3 From 548cb4fbe80d68b9d1b8b30aca179636e74bec36 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 17 Oct 2018 13:20:20 +0300 Subject: RDMA/core: Refactor ib_register_device() function ib_register_device() does several allocation and initialization steps.
Split it into smaller, more readable functions for easier review and maintenance. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/device.c | 126 +++++++++++++++++++++++---------------- 1 file changed, 75 insertions(+), 51 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index d175b94ae952..87eb4f2cdd7d 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -465,22 +465,8 @@ static u32 __dev_new_index(void) } } -/** - * ib_register_device - Register an IB device with IB core - * @device:Device to register - * - * Low-level drivers use ib_register_device() to register their - * devices with the IB core. All registered clients will receive a - * callback for each device that is added. @device must be allocated - * with ib_alloc_device(). - */ -int ib_register_device(struct ib_device *device, const char *name, - int (*port_callback)(struct ib_device *, u8, - struct kobject *)) +static void setup_dma_device(struct ib_device *device) { - int ret; - struct ib_client *client; - struct ib_udata uhw = {.outlen = 0, .inlen = 0}; struct device *parent = device->dev.parent; WARN_ON_ONCE(device->dma_device); @@ -512,34 +498,38 @@ int ib_register_device(struct ib_device *device, const char *name, WARN_ON_ONCE(!parent); device->dma_device = parent; } +} - mutex_lock(&device_mutex); +static void cleanup_device(struct ib_device *device) +{ + ib_cache_cleanup_one(device); + ib_cache_release_one(device); + kfree(device->port_pkey_list); + kfree(device->port_immutable); +} - if (strchr(name, '%')) { - ret = alloc_name(device, name); - if (ret) - goto out; - } else { - ret = dev_set_name(&device->dev, name); - if (ret) - goto out; - } - if (__ib_device_get_by_name(dev_name(&device->dev))) { - ret = -ENFILE; - goto out; - } - strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); +static int setup_device(struct ib_device *device) +{ + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; + int ret; - if (ib_device_check_mandatory(device)) { - ret = -EINVAL; - goto out; - } + ret = ib_device_check_mandatory(device); + if (ret) + return ret; ret = read_port_immutable(device); if (ret) { dev_warn(&device->dev, "Couldn't create per port immutable data\n"); - goto out; + return ret; + } + + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->query_device(device, &device->attrs, &uhw); + if (ret) { + dev_warn(&device->dev, + "Couldn't query the device attributes\n"); + goto port_cleanup; } ret = setup_port_pkey_list(device); @@ -554,6 +544,53 @@ int ib_register_device(struct ib_device *device, const char *name, "Couldn't set up InfiniBand P_Key/GID cache\n"); goto pkey_cleanup; } + return 0; + +pkey_cleanup: + kfree(device->port_pkey_list); +port_cleanup: + kfree(device->port_immutable); + return ret; +} + +/** + * ib_register_device - Register an IB device with IB core + * @device:Device to register + * + * Low-level drivers use ib_register_device() to register their + * devices with the IB core. All registered clients will receive a + * callback for each device that is added. @device must be allocated + * with ib_alloc_device().
+ */ +int ib_register_device(struct ib_device *device, const char *name, + int (*port_callback)(struct ib_device *, u8, + struct kobject *)) +{ + int ret; + struct ib_client *client; + + setup_dma_device(device); + + mutex_lock(&device_mutex); + + if (strchr(name, '%')) { + ret = alloc_name(device, name); + if (ret) + goto out; + } else { + ret = dev_set_name(&device->dev, name); + if (ret) + goto out; + } + if (__ib_device_get_by_name(dev_name(&device->dev))) { + ret = -ENFILE; + goto out; + } + strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); + + ret = setup_device(device); + if (ret) + goto out; device->index = __dev_new_index(); @@ -561,15 +598,7 @@ int ib_register_device(struct ib_device *device, const char *name, ret = ib_device_register_rdmacg(device); if (ret) { dev_warn(&device->dev, "Couldn't register device with rdma cgroup\n"); - goto cache_cleanup; - } - - memset(&device->attrs, 0, sizeof(device->attrs)); - ret = device->query_device(device, &device->attrs, &uhw); - if (ret) { - dev_warn(&device->dev, - "Couldn't query the device attributes\n"); - goto cg_cleanup; + goto dev_cleanup; } ret = ib_device_register_sysfs(device, port_callback); @@ -593,13 +622,8 @@ cg_cleanup: ib_device_unregister_rdmacg(device); -cache_cleanup: - ib_cache_cleanup_one(device); - ib_cache_release_one(device); -pkey_cleanup: - kfree(device->port_pkey_list); -port_cleanup: - kfree(device->port_immutable); +dev_cleanup: + cleanup_device(device); out: mutex_unlock(&device_mutex); return ret; -- cgit v1.2.3 From 76d865b87c327b34c0e24f23e75828878022f899 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 17 Oct 2018 13:21:08 +0300 Subject: RDMA/core: Fix comment for hw stats init for port == 0 When add_port() is called for port == 0, it indicates that per-port hardware counter initialization should be skipped. Reflect this in the comment. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband/core') diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index f54f107ef668..6fcce2c206c6 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1120,9 +1120,9 @@ static int add_port(struct ib_device *device, int port_num, } /* - * If port == 0, it means we have only one port and the parent - * device, not this port device, should be the holder of the - * hw_counters + * If port == 0, it means hw_counters are per device and not per + * port, so holder should be device. Therefore skip per port counter + * initialization. */ if (device->alloc_hw_stats && port_num) setup_hw_stats(device, p, port_num); -- cgit v1.2.3
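Taken together, the unwind fix and the ib_register_device() refactor above are exercises in the kernel's goto-unwind idiom: each failure label releases exactly what was acquired before it, in reverse order of acquisition. A generic sketch of the idiom, with hypothetical names:

    static int example_setup(struct example_dev *dev)
    {
            int ret;

            ret = alloc_a(dev);
            if (ret)
                    return ret;     /* nothing to unwind yet */

            ret = alloc_b(dev);
            if (ret)
                    goto err_free_a;

            ret = alloc_c(dev);
            if (ret)
                    goto err_free_b;

            return 0;

    err_free_b:                     /* unwind in reverse order */
            free_b(dev);
    err_free_a:
            free_a(dev);
            return ret;
    }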