From 7cc2e18f21008f4093b49099264ca4d65b9aa223 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Aug 2019 14:16:59 +0300 Subject: RDMA/odp: Use the common interval tree library instead of generic ODP is working with userspace VA's in the interval tree which always fit into an unsigned long, so we can use the common code. This comes at a cost of a 16 byte increase in ib_umem_odp struct size due to storing the interval tree start/last in addition to the umem addr/length. However these values were computed and are performance critical for the interval lookup, so this seems like a worthwhile trade off. Removes 2k of .text from the kernel. Link: https://lore.kernel.org/r/20190819111710.18440-2-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/Kconfig | 1 + drivers/infiniband/core/umem_odp.c | 72 ++++++++++---------------------------- 2 files changed, 19 insertions(+), 54 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 85e103b147cc..b44b1c322ec8 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -55,6 +55,7 @@ config INFINIBAND_ON_DEMAND_PAGING bool "InfiniBand on-demand paging support" depends on INFINIBAND_USER_MEM select MMU_NOTIFIER + select INTERVAL_TREE default y ---help--- On demand paging support for the InfiniBand subsystem. diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index c0e15db34680..6c17a7c3a565 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -39,45 +39,13 @@ #include #include #include -#include +#include #include #include #include #include -/* - * The ib_umem list keeps track of memory regions for which the HW - * device request to receive notification when the related memory - * mapping is changed. - * - * ib_umem_lock protects the list. - */ - -static u64 node_start(struct umem_odp_node *n) -{ - struct ib_umem_odp *umem_odp = - container_of(n, struct ib_umem_odp, interval_tree); - - return ib_umem_start(umem_odp); -} - -/* Note that the representation of the intervals in the interval tree - * considers the ending point as contained in the interval, while the - * function ib_umem_end returns the first address which is not contained - * in the umem. - */ -static u64 node_last(struct umem_odp_node *n) -{ - struct ib_umem_odp *umem_odp = - container_of(n, struct ib_umem_odp, interval_tree); - - return ib_umem_end(umem_odp) - 1; -} - -INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, - node_start, node_last, static, rbt_ib_umem) - static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { mutex_lock(&umem_odp->umem_mutex); @@ -205,9 +173,18 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) - rbt_ib_umem_insert(&umem_odp->interval_tree, - &per_mm->umem_tree); + if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) { + /* + * Note that the representation of the intervals in the + * interval tree considers the ending point as contained in + * the interval, while the function ib_umem_end returns the + * first address which is not contained in the umem. + */ + umem_odp->interval_tree.start = ib_umem_start(umem_odp); + umem_odp->interval_tree.last = ib_umem_end(umem_odp) - 1; + interval_tree_insert(&umem_odp->interval_tree, + &per_mm->umem_tree); + } up_write(&per_mm->umem_rwsem); } @@ -217,8 +194,8 @@ static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) down_write(&per_mm->umem_rwsem); if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) - rbt_ib_umem_remove(&umem_odp->interval_tree, - &per_mm->umem_tree); + interval_tree_remove(&umem_odp->interval_tree, + &per_mm->umem_tree); complete_all(&umem_odp->notifier_completion); up_write(&per_mm->umem_rwsem); @@ -761,18 +738,18 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, void *cookie) { int ret_val = 0; - struct umem_odp_node *node, *next; + struct interval_tree_node *node, *next; struct ib_umem_odp *umem; if (unlikely(start == last)) return ret_val; - for (node = rbt_ib_umem_iter_first(root, start, last - 1); + for (node = interval_tree_iter_first(root, start, last - 1); node; node = next) { /* TODO move the blockable decision up to the callback */ if (!blockable) return -EAGAIN; - next = rbt_ib_umem_iter_next(node, start, last - 1); + next = interval_tree_iter_next(node, start, last - 1); umem = container_of(node, struct ib_umem_odp, interval_tree); ret_val = cb(umem, start, last, cookie) || ret_val; } @@ -780,16 +757,3 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, return ret_val; } EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); - -struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, - u64 addr, u64 length) -{ - struct umem_odp_node *node; - - node = rbt_ib_umem_iter_first(root, addr, addr + length - 1); - if (node) - return container_of(node, struct ib_umem_odp, interval_tree); - return NULL; - -} -EXPORT_SYMBOL(rbt_ib_umem_lookup); -- cgit v1.2.3 From f993de88a55f2dd56f17248d06c0dfda9a9799db Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Aug 2019 14:17:00 +0300 Subject: RDMA/odp: Iterate over the whole rbtree directly Instead of intersecting a full interval, just iterate over every element directly. This is faster and clearer. Link: https://lore.kernel.org/r/20190819111710.18440-3-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem_odp.c | 40 +++++++++++++++++++------------------ drivers/infiniband/hw/mlx5/odp.c | 41 ++++++++++++++++++-------------------- 2 files changed, 40 insertions(+), 41 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 6c17a7c3a565..77adf405e23c 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -72,31 +72,34 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) mutex_unlock(&umem_odp->umem_mutex); } -static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, - u64 start, u64 end, void *cookie) -{ - /* - * Increase the number of notifiers running, to - * prevent any further fault handling on this MR. - */ - ib_umem_notifier_start_account(umem_odp); - complete_all(&umem_odp->notifier_completion); - umem_odp->umem.context->invalidate_range( - umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); - return 0; -} - static void ib_umem_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); + struct rb_node *node; down_read(&per_mm->umem_rwsem); - if (per_mm->active) - rbt_ib_umem_for_each_in_range( - &per_mm->umem_tree, 0, ULLONG_MAX, - ib_umem_notifier_release_trampoline, true, NULL); + if (!per_mm->active) + goto out; + + for (node = rb_first_cached(&per_mm->umem_tree); node; + node = rb_next(node)) { + struct ib_umem_odp *umem_odp = + rb_entry(node, struct ib_umem_odp, interval_tree.rb); + + /* + * Increase the number of notifiers running, to prevent any + * further fault handling on this MR. + */ + ib_umem_notifier_start_account(umem_odp); + complete_all(&umem_odp->notifier_completion); + umem_odp->umem.context->invalidate_range( + umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + } + +out: up_read(&per_mm->umem_rwsem); } @@ -756,4 +759,3 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, return ret_val; } -EXPORT_SYMBOL(rbt_ib_umem_for_each_in_range); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 1d257d1b3b0d..82b716a28ec1 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -539,34 +539,31 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, return imr; } -static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, - void *cookie) +void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) { - struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; - - if (mr->parent != imr) - return 0; - - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); + struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); + struct rb_node *node; - if (umem_odp->dying) - return 0; + down_read(&per_mm->umem_rwsem); + for (node = rb_first_cached(&per_mm->umem_tree); node; + node = rb_next(node)) { + struct ib_umem_odp *umem_odp = + rb_entry(node, struct ib_umem_odp, interval_tree.rb); + struct mlx5_ib_mr *mr = umem_odp->private; - WRITE_ONCE(umem_odp->dying, 1); - atomic_inc(&imr->num_leaf_free); - schedule_work(&umem_odp->work); + if (mr->parent != imr) + continue; - return 0; -} + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); -void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) -{ - struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); + if (umem_odp->dying) + continue; - down_read(&per_mm->umem_rwsem); - rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, - mr_leaf_free, true, imr); + WRITE_ONCE(umem_odp->dying, 1); + atomic_inc(&imr->num_leaf_free); + schedule_work(&umem_odp->work); + } up_read(&per_mm->umem_rwsem); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); -- cgit v1.2.3 From fd7dbf035edcfb035977423e2a5102832c1427f4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Aug 2019 14:17:01 +0300 Subject: RDMA/odp: Make it clearer when a umem is an implicit ODP umem Implicit ODP umems are special, they don't have any page lists, they don't exist in the interval tree and they are never DMA mapped. Instead of trying to guess this based on a zero length use an explicit flag. Further, do not allow non-implicit umems to be 0 size. Link: https://lore.kernel.org/r/20190819111710.18440-4-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem_odp.c | 54 +++++++++++++++++++++----------------- drivers/infiniband/hw/mlx5/mr.c | 2 +- drivers/infiniband/hw/mlx5/odp.c | 2 +- include/rdma/ib_umem_odp.h | 8 ++++++ 4 files changed, 40 insertions(+), 26 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 77adf405e23c..7300d0a10d1e 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -176,18 +176,15 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) { - /* - * Note that the representation of the intervals in the - * interval tree considers the ending point as contained in - * the interval, while the function ib_umem_end returns the - * first address which is not contained in the umem. - */ - umem_odp->interval_tree.start = ib_umem_start(umem_odp); - umem_odp->interval_tree.last = ib_umem_end(umem_odp) - 1; - interval_tree_insert(&umem_odp->interval_tree, - &per_mm->umem_tree); - } + /* + * Note that the representation of the intervals in the interval tree + * considers the ending point as contained in the interval, while the + * function ib_umem_end returns the first address which is not + * contained in the umem. + */ + umem_odp->interval_tree.start = ib_umem_start(umem_odp); + umem_odp->interval_tree.last = ib_umem_end(umem_odp) - 1; + interval_tree_insert(&umem_odp->interval_tree, &per_mm->umem_tree); up_write(&per_mm->umem_rwsem); } @@ -196,11 +193,8 @@ static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem_odp) != ib_umem_end(umem_odp))) - interval_tree_remove(&umem_odp->interval_tree, - &per_mm->umem_tree); + interval_tree_remove(&umem_odp->interval_tree, &per_mm->umem_tree); complete_all(&umem_odp->notifier_completion); - up_write(&per_mm->umem_rwsem); } @@ -320,6 +314,9 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, int pages = size >> PAGE_SHIFT; int ret; + if (!size) + return ERR_PTR(-EINVAL); + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); if (!odp_data) return ERR_PTR(-ENOMEM); @@ -381,6 +378,9 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) struct mm_struct *mm = umem->owning_mm; int ret_val; + if (umem_odp->umem.address == 0 && umem_odp->umem.length == 0) + umem_odp->is_implicit_odp = 1; + umem_odp->page_shift = PAGE_SHIFT; if (access & IB_ACCESS_HUGETLB) { struct vm_area_struct *vma; @@ -401,7 +401,10 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) init_completion(&umem_odp->notifier_completion); - if (ib_umem_odp_num_pages(umem_odp)) { + if (!umem_odp->is_implicit_odp) { + if (!ib_umem_odp_num_pages(umem_odp)) + return -EINVAL; + umem_odp->page_list = vzalloc(array_size(sizeof(*umem_odp->page_list), ib_umem_odp_num_pages(umem_odp))); @@ -420,7 +423,9 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) ret_val = get_per_mm(umem_odp); if (ret_val) goto out_dma_list; - add_umem_to_per_mm(umem_odp); + + if (!umem_odp->is_implicit_odp) + add_umem_to_per_mm(umem_odp); return 0; @@ -439,13 +444,14 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. */ - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); - - remove_umem_from_per_mm(umem_odp); + if (!umem_odp->is_implicit_odp) { + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), + ib_umem_end(umem_odp)); + remove_umem_from_per_mm(umem_odp); + vfree(umem_odp->dma_list); + vfree(umem_odp->page_list); + } put_per_mm(umem_odp); - vfree(umem_odp->dma_list); - vfree(umem_odp->page_list); } /* diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index b74fad08412f..ba2ec495b6e3 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1600,7 +1600,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) /* Wait for all running page-fault handlers to finish. */ synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ - if (umem_odp->page_list) + if (!umem_odp->is_implicit_odp) mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 82b716a28ec1..80c07d85b966 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -584,7 +584,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, struct ib_umem_odp *odp; size_t size; - if (!odp_mr->page_list) { + if (odp_mr->is_implicit_odp) { odp = implicit_mr_get_data(mr, io_virt, bcnt); if (IS_ERR(odp)) diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 030d5cbad02c..14b38b4459c5 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -69,6 +69,14 @@ struct ib_umem_odp { /* Tree tracking */ struct interval_tree_node interval_tree; + /* + * An implicit odp umem cannot be DMA mapped, has 0 length, and serves + * only as an anchor for the driver to hold onto the per_mm. FIXME: + * This should be removed and drivers should work with the per_mm + * directly. + */ + bool is_implicit_odp; + struct completion notifier_completion; int dying; unsigned int page_shift; -- cgit v1.2.3 From 22d79c9a912c9d56258a0adf8a9c8cbfb6f2fd1a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Aug 2019 14:17:02 +0300 Subject: RMDA/odp: Consolidate umem_odp initialization This is done in two different places, consolidate all the post-allocation initialization into a single function. Link: https://lore.kernel.org/r/20190819111710.18440-5-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem_odp.c | 200 ++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 114 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 7300d0a10d1e..1643ff374b58 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -171,23 +171,6 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { .invalidate_range_end = ib_umem_notifier_invalidate_range_end, }; -static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) -{ - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - - down_write(&per_mm->umem_rwsem); - /* - * Note that the representation of the intervals in the interval tree - * considers the ending point as contained in the interval, while the - * function ib_umem_end returns the first address which is not - * contained in the umem. - */ - umem_odp->interval_tree.start = ib_umem_start(umem_odp); - umem_odp->interval_tree.last = ib_umem_end(umem_odp) - 1; - interval_tree_insert(&umem_odp->interval_tree, &per_mm->umem_tree); - up_write(&per_mm->umem_rwsem); -} - static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) { struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; @@ -237,33 +220,23 @@ out_pid: return ERR_PTR(ret); } -static int get_per_mm(struct ib_umem_odp *umem_odp) +static struct ib_ucontext_per_mm *get_per_mm(struct ib_umem_odp *umem_odp) { struct ib_ucontext *ctx = umem_odp->umem.context; struct ib_ucontext_per_mm *per_mm; + lockdep_assert_held(&ctx->per_mm_list_lock); + /* * Generally speaking we expect only one or two per_mm in this list, * so no reason to optimize this search today. */ - mutex_lock(&ctx->per_mm_list_lock); list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { if (per_mm->mm == umem_odp->umem.owning_mm) - goto found; - } - - per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm); - if (IS_ERR(per_mm)) { - mutex_unlock(&ctx->per_mm_list_lock); - return PTR_ERR(per_mm); + return per_mm; } -found: - umem_odp->per_mm = per_mm; - per_mm->odp_mrs_count++; - mutex_unlock(&ctx->per_mm_list_lock); - - return 0; + return alloc_per_mm(ctx, umem_odp->umem.owning_mm); } static void free_per_mm(struct rcu_head *rcu) @@ -304,79 +277,114 @@ static void put_per_mm(struct ib_umem_odp *umem_odp) mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); } +static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, + struct ib_ucontext_per_mm *per_mm) +{ + struct ib_ucontext *ctx = umem_odp->umem.context; + int ret; + + umem_odp->umem.is_odp = 1; + if (!umem_odp->is_implicit_odp) { + size_t pages = ib_umem_odp_num_pages(umem_odp); + + if (!pages) + return -EINVAL; + + /* + * Note that the representation of the intervals in the + * interval tree considers the ending point as contained in + * the interval, while the function ib_umem_end returns the + * first address which is not contained in the umem. + */ + umem_odp->interval_tree.start = ib_umem_start(umem_odp); + umem_odp->interval_tree.last = ib_umem_end(umem_odp) - 1; + + umem_odp->page_list = vzalloc( + array_size(sizeof(*umem_odp->page_list), pages)); + if (!umem_odp->page_list) + return -ENOMEM; + + umem_odp->dma_list = + vzalloc(array_size(sizeof(*umem_odp->dma_list), pages)); + if (!umem_odp->dma_list) { + ret = -ENOMEM; + goto out_page_list; + } + } + + mutex_lock(&ctx->per_mm_list_lock); + if (!per_mm) { + per_mm = get_per_mm(umem_odp); + if (IS_ERR(per_mm)) { + ret = PTR_ERR(per_mm); + goto out_unlock; + } + } + umem_odp->per_mm = per_mm; + per_mm->odp_mrs_count++; + mutex_unlock(&ctx->per_mm_list_lock); + + mutex_init(&umem_odp->umem_mutex); + init_completion(&umem_odp->notifier_completion); + + if (!umem_odp->is_implicit_odp) { + down_write(&per_mm->umem_rwsem); + interval_tree_insert(&umem_odp->interval_tree, + &per_mm->umem_tree); + up_write(&per_mm->umem_rwsem); + } + + return 0; + +out_unlock: + mutex_unlock(&ctx->per_mm_list_lock); + vfree(umem_odp->dma_list); +out_page_list: + vfree(umem_odp->page_list); + return ret; +} + struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, unsigned long addr, size_t size) { - struct ib_ucontext_per_mm *per_mm = root->per_mm; - struct ib_ucontext *ctx = per_mm->context; + /* + * Caller must ensure that root cannot be freed during the call to + * ib_alloc_odp_umem. + */ struct ib_umem_odp *odp_data; struct ib_umem *umem; - int pages = size >> PAGE_SHIFT; int ret; - if (!size) - return ERR_PTR(-EINVAL); - odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); if (!odp_data) return ERR_PTR(-ENOMEM); umem = &odp_data->umem; - umem->context = ctx; + umem->context = root->umem.context; umem->length = size; umem->address = addr; - odp_data->page_shift = PAGE_SHIFT; umem->writable = root->umem.writable; - umem->is_odp = 1; - odp_data->per_mm = per_mm; - umem->owning_mm = per_mm->mm; - mmgrab(umem->owning_mm); - - mutex_init(&odp_data->umem_mutex); - init_completion(&odp_data->notifier_completion); - - odp_data->page_list = - vzalloc(array_size(pages, sizeof(*odp_data->page_list))); - if (!odp_data->page_list) { - ret = -ENOMEM; - goto out_odp_data; - } + umem->owning_mm = root->umem.owning_mm; + odp_data->page_shift = PAGE_SHIFT; - odp_data->dma_list = - vzalloc(array_size(pages, sizeof(*odp_data->dma_list))); - if (!odp_data->dma_list) { - ret = -ENOMEM; - goto out_page_list; + ret = ib_init_umem_odp(odp_data, root->per_mm); + if (ret) { + kfree(odp_data); + return ERR_PTR(ret); } - /* - * Caller must ensure that the umem_odp that the per_mm came from - * cannot be freed during the call to ib_alloc_odp_umem. - */ - mutex_lock(&ctx->per_mm_list_lock); - per_mm->odp_mrs_count++; - mutex_unlock(&ctx->per_mm_list_lock); - add_umem_to_per_mm(odp_data); + mmgrab(umem->owning_mm); return odp_data; - -out_page_list: - vfree(odp_data->page_list); -out_odp_data: - mmdrop(umem->owning_mm); - kfree(odp_data); - return ERR_PTR(ret); } EXPORT_SYMBOL(ib_alloc_odp_umem); int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { - struct ib_umem *umem = &umem_odp->umem; /* * NOTE: This must called in a process context where umem->owning_mm * == current->mm */ - struct mm_struct *mm = umem->owning_mm; - int ret_val; + struct mm_struct *mm = umem_odp->umem.owning_mm; if (umem_odp->umem.address == 0 && umem_odp->umem.length == 0) umem_odp->is_implicit_odp = 1; @@ -397,43 +405,7 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) up_read(&mm->mmap_sem); } - mutex_init(&umem_odp->umem_mutex); - - init_completion(&umem_odp->notifier_completion); - - if (!umem_odp->is_implicit_odp) { - if (!ib_umem_odp_num_pages(umem_odp)) - return -EINVAL; - - umem_odp->page_list = - vzalloc(array_size(sizeof(*umem_odp->page_list), - ib_umem_odp_num_pages(umem_odp))); - if (!umem_odp->page_list) - return -ENOMEM; - - umem_odp->dma_list = - vzalloc(array_size(sizeof(*umem_odp->dma_list), - ib_umem_odp_num_pages(umem_odp))); - if (!umem_odp->dma_list) { - ret_val = -ENOMEM; - goto out_page_list; - } - } - - ret_val = get_per_mm(umem_odp); - if (ret_val) - goto out_dma_list; - - if (!umem_odp->is_implicit_odp) - add_umem_to_per_mm(umem_odp); - - return 0; - -out_dma_list: - vfree(umem_odp->dma_list); -out_page_list: - vfree(umem_odp->page_list); - return ret_val; + return ib_init_umem_odp(umem_odp, NULL); } void ib_umem_odp_release(struct ib_umem_odp *umem_odp) -- cgit v1.2.3 From f20bef6a951b6ef619655ed846113f706d0824d7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Aug 2019 14:17:03 +0300 Subject: RDMA/odp: Make the three ways to create a umem_odp clear The three paths to build the umem_odps are kind of muddled, they are: - As a normal ib_mr umem - As a child in an implicit ODP umem tree - As the root of an implicit ODP umem tree Only the first two are actually umem's, the last is an abuse. The implicit case can only be triggered by explicit driver request, it should never be co-mingled with the normal case. While we are here, make sensible function names and add some comments to make this clearer. Link: https://lore.kernel.org/r/20190819111710.18440-6-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem_odp.c | 80 +++++++++++++++++++++++++++++++++++--- drivers/infiniband/hw/mlx5/odp.c | 23 ++++++----- include/rdma/ib_umem_odp.h | 6 ++- 3 files changed, 89 insertions(+), 20 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 1643ff374b58..198c0ce04998 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -46,6 +46,8 @@ #include #include +#include "uverbs.h" + static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { mutex_lock(&umem_odp->umem_mutex); @@ -344,8 +346,67 @@ out_page_list: return ret; } -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, - unsigned long addr, size_t size) +/** + * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem + * + * Implicit ODP umems do not have a VA range and do not have any page lists. + * They exist only to hold the per_mm reference to help the driver create + * children umems. + * + * @udata: udata from the syscall being used to create the umem + * @access: ib_reg_mr access flags + */ +struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, + int access) +{ + struct ib_ucontext *context = + container_of(udata, struct uverbs_attr_bundle, driver_udata) + ->context; + struct ib_umem *umem; + struct ib_umem_odp *umem_odp; + int ret; + + if (access & IB_ACCESS_HUGETLB) + return ERR_PTR(-EINVAL); + + if (!context) + return ERR_PTR(-EIO); + if (WARN_ON_ONCE(!context->invalidate_range)) + return ERR_PTR(-EINVAL); + + umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL); + if (!umem_odp) + return ERR_PTR(-ENOMEM); + umem = &umem_odp->umem; + umem->context = context; + umem->writable = ib_access_writable(access); + umem->owning_mm = current->mm; + umem_odp->is_implicit_odp = 1; + umem_odp->page_shift = PAGE_SHIFT; + + ret = ib_init_umem_odp(umem_odp, NULL); + if (ret) { + kfree(umem_odp); + return ERR_PTR(ret); + } + + mmgrab(umem->owning_mm); + + return umem_odp; +} +EXPORT_SYMBOL(ib_umem_odp_alloc_implicit); + +/** + * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit + * parent ODP umem + * + * @root: The parent umem enclosing the child. This must be allocated using + * ib_alloc_implicit_odp_umem() + * @addr: The starting userspace VA + * @size: The length of the userspace VA + */ +struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root, + unsigned long addr, size_t size) { /* * Caller must ensure that root cannot be freed during the call to @@ -355,6 +416,9 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, struct ib_umem *umem; int ret; + if (WARN_ON(!root->is_implicit_odp)) + return ERR_PTR(-EINVAL); + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); if (!odp_data) return ERR_PTR(-ENOMEM); @@ -376,8 +440,15 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root, return odp_data; } -EXPORT_SYMBOL(ib_alloc_odp_umem); +EXPORT_SYMBOL(ib_umem_odp_alloc_child); +/** + * ib_umem_odp_get - Complete ib_umem_get() + * + * @umem_odp: The partially configured umem from ib_umem_get() + * @addr: The starting userspace VA + * @access: ib_reg_mr access flags + */ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { /* @@ -386,9 +457,6 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) */ struct mm_struct *mm = umem_odp->umem.owning_mm; - if (umem_odp->umem.address == 0 && umem_odp->umem.length == 0) - umem_odp->is_implicit_odp = 1; - umem_odp->page_shift = PAGE_SHIFT; if (access & IB_ACCESS_HUGETLB) { struct vm_area_struct *vma; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 80c07d85b966..ad209ae44f05 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -384,7 +384,7 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, } static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, - struct ib_umem *umem, + struct ib_umem_odp *umem_odp, bool ksm, int access_flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); @@ -402,7 +402,7 @@ static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, mr->dev = dev; mr->access_flags = access_flags; mr->mmkey.iova = 0; - mr->umem = umem; + mr->umem = &umem_odp->umem; if (ksm) { err = mlx5_ib_update_xlt(mr, 0, @@ -462,14 +462,13 @@ next_mr: if (nentries) nentries++; } else { - odp = ib_alloc_odp_umem(odp_mr, addr, - MLX5_IMR_MTT_SIZE); + odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE); if (IS_ERR(odp)) { mutex_unlock(&odp_mr->umem_mutex); return ERR_CAST(odp); } - mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0, + mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0, mr->access_flags); if (IS_ERR(mtt)) { mutex_unlock(&odp_mr->umem_mutex); @@ -519,19 +518,19 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags) { struct mlx5_ib_mr *imr; - struct ib_umem *umem; + struct ib_umem_odp *umem_odp; - umem = ib_umem_get(udata, 0, 0, access_flags, 0); - if (IS_ERR(umem)) - return ERR_CAST(umem); + umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags); + if (IS_ERR(umem_odp)) + return ERR_CAST(umem_odp); - imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags); + imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags); if (IS_ERR(imr)) { - ib_umem_release(umem); + ib_umem_release(&umem_odp->umem); return ERR_CAST(imr); } - imr->umem = umem; + imr->umem = &umem_odp->umem; init_waitqueue_head(&imr->q_leaf_free); atomic_set(&imr->num_leaf_free, 0); atomic_set(&imr->num_pending_prefetch, 0); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 14b38b4459c5..219fe7015e7d 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -140,8 +140,10 @@ struct ib_ucontext_per_mm { }; int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp *root_umem, - unsigned long addr, size_t size); +struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, + int access); +struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, + unsigned long addr, size_t size); void ib_umem_odp_release(struct ib_umem_odp *umem_odp); int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, -- cgit v1.2.3 From 261dc53f8ee037bf2fbf68f90c319b04062a126c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Aug 2019 14:17:04 +0300 Subject: RDMA/odp: Split creating a umem_odp from ib_umem_get This is the last creation API that is overloaded for both, there is very little code sharing and a driver has to be specifically ready for a umem_odp to be created to use the odp version. Link: https://lore.kernel.org/r/20190819111710.18440-7-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 30 +++-------------- drivers/infiniband/core/umem_odp.c | 67 ++++++++++++++++++++++++++++---------- drivers/infiniband/hw/mlx5/mem.c | 13 -------- drivers/infiniband/hw/mlx5/mr.c | 34 ++++++++++++++----- include/rdma/ib_umem_odp.h | 9 +++-- 5 files changed, 86 insertions(+), 67 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 56553668256f..9a39c45cd1e6 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -184,9 +184,6 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz); /** * ib_umem_get - Pin and DMA map userspace memory. * - * If access flags indicate ODP memory, avoid pinning. Instead, stores - * the mm for future page fault handling in conjunction with MMU notifiers. - * * @udata: userspace context to pin memory for * @addr: userspace virtual address to start at * @size: length of region to pin @@ -231,17 +228,12 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, if (!can_do_mlock()) return ERR_PTR(-EPERM); - if (access & IB_ACCESS_ON_DEMAND) { - umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); - umem->is_odp = 1; - } else { - umem = kzalloc(sizeof(*umem), GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); - } + if (access & IB_ACCESS_ON_DEMAND) + return ERR_PTR(-EOPNOTSUPP); + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); umem->context = context; umem->length = size; umem->address = addr; @@ -249,18 +241,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, umem->owning_mm = mm = current->mm; mmgrab(mm); - if (access & IB_ACCESS_ON_DEMAND) { - if (WARN_ON_ONCE(!context->invalidate_range)) { - ret = -EINVAL; - goto umem_kfree; - } - - ret = ib_umem_odp_get(to_ib_umem_odp(umem), access); - if (ret) - goto umem_kfree; - return umem; - } - page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { ret = -ENOMEM; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 198c0ce04998..6a88bd0fcb33 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -335,6 +335,7 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, &per_mm->umem_tree); up_write(&per_mm->umem_rwsem); } + mmgrab(umem_odp->umem.owning_mm); return 0; @@ -389,9 +390,6 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, kfree(umem_odp); return ERR_PTR(ret); } - - mmgrab(umem->owning_mm); - return umem_odp; } EXPORT_SYMBOL(ib_umem_odp_alloc_implicit); @@ -435,27 +433,51 @@ struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root, kfree(odp_data); return ERR_PTR(ret); } - - mmgrab(umem->owning_mm); - return odp_data; } EXPORT_SYMBOL(ib_umem_odp_alloc_child); /** - * ib_umem_odp_get - Complete ib_umem_get() + * ib_umem_odp_get - Create a umem_odp for a userspace va * - * @umem_odp: The partially configured umem from ib_umem_get() - * @addr: The starting userspace VA - * @access: ib_reg_mr access flags + * @udata: userspace context to pin memory for + * @addr: userspace virtual address to start at + * @size: length of region to pin + * @access: IB_ACCESS_xxx flags for memory being pinned + * + * The driver should use when the access flags indicate ODP memory. It avoids + * pinning, instead, stores the mm for future page fault handling in + * conjunction with MMU notifiers. */ -int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) +struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, + size_t size, int access) { - /* - * NOTE: This must called in a process context where umem->owning_mm - * == current->mm - */ - struct mm_struct *mm = umem_odp->umem.owning_mm; + struct ib_umem_odp *umem_odp; + struct ib_ucontext *context; + struct mm_struct *mm; + int ret; + + if (!udata) + return ERR_PTR(-EIO); + + context = container_of(udata, struct uverbs_attr_bundle, driver_udata) + ->context; + if (!context) + return ERR_PTR(-EIO); + + if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) || + WARN_ON_ONCE(!context->invalidate_range)) + return ERR_PTR(-EINVAL); + + umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); + if (!umem_odp) + return ERR_PTR(-ENOMEM); + + umem_odp->umem.context = context; + umem_odp->umem.length = size; + umem_odp->umem.address = addr; + umem_odp->umem.writable = ib_access_writable(access); + umem_odp->umem.owning_mm = mm = current->mm; umem_odp->page_shift = PAGE_SHIFT; if (access & IB_ACCESS_HUGETLB) { @@ -466,15 +488,24 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) vma = find_vma(mm, ib_umem_start(umem_odp)); if (!vma || !is_vm_hugetlb_page(vma)) { up_read(&mm->mmap_sem); - return -EINVAL; + ret = -EINVAL; + goto err_free; } h = hstate_vma(vma); umem_odp->page_shift = huge_page_shift(h); up_read(&mm->mmap_sem); } - return ib_init_umem_odp(umem_odp, NULL); + ret = ib_init_umem_odp(umem_odp, NULL); + if (ret) + goto err_free; + return umem_odp; + +err_free: + kfree(umem_odp); + return ERR_PTR(ret); } +EXPORT_SYMBOL(ib_umem_odp_get); void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index a40e0abf2338..b5aece786b36 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -56,19 +56,6 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, struct scatterlist *sg; int entry; - if (umem->is_odp) { - struct ib_umem_odp *odp = to_ib_umem_odp(umem); - unsigned int page_shift = odp->page_shift; - - *ncont = ib_umem_odp_num_pages(odp); - *count = *ncont << (page_shift - PAGE_SHIFT); - *shift = page_shift; - if (order) - *order = ilog2(roundup_pow_of_two(*ncont)); - - return; - } - addr = addr >> PAGE_SHIFT; tmp = (unsigned long)addr; m = find_first_bit(&tmp, BITS_PER_LONG); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index ba2ec495b6e3..fc1106ddc3f6 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -784,19 +784,37 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, int *ncont, int *order) { struct ib_umem *u; - int err; *umem = NULL; - u = ib_umem_get(udata, start, length, access_flags, 0); - err = PTR_ERR_OR_ZERO(u); - if (err) { - mlx5_ib_dbg(dev, "umem get failed (%d)\n", err); - return err; + if (access_flags & IB_ACCESS_ON_DEMAND) { + struct ib_umem_odp *odp; + + odp = ib_umem_odp_get(udata, start, length, access_flags); + if (IS_ERR(odp)) { + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", + PTR_ERR(odp)); + return PTR_ERR(odp); + } + + u = &odp->umem; + + *page_shift = odp->page_shift; + *ncont = ib_umem_odp_num_pages(odp); + *npages = *ncont << (*page_shift - PAGE_SHIFT); + if (order) + *order = ilog2(roundup_pow_of_two(*ncont)); + } else { + u = ib_umem_get(udata, start, length, access_flags, 0); + if (IS_ERR(u)) { + mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u)); + return PTR_ERR(u); + } + + mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages, + page_shift, ncont, order); } - mlx5_ib_cont_pages(u, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages, - page_shift, ncont, order); if (!*npages) { mlx5_ib_warn(dev, "avoid zero region\n"); ib_umem_release(u); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 219fe7015e7d..5efb67f97b0a 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -139,7 +139,8 @@ struct ib_ucontext_per_mm { struct rcu_head rcu; }; -int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); +struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, + size_t size, int access); struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, int access); struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, @@ -199,9 +200,11 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ -static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) +static inline struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, + unsigned long addr, + size_t size, int access) { - return -EINVAL; + return ERR_PTR(-EINVAL); } static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} -- cgit v1.2.3 From 0446cad9ca385c3b8d6e5a1184e59650fa7a7a6d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Aug 2019 14:17:05 +0300 Subject: RDMA/odp: Provide ib_umem_odp_release() to undo the allocs Now that there are allocator APIs that return the ib_umem_odp directly it should be freed through a umem_odp free'er as well. Link: https://lore.kernel.org/r/20190819111710.18440-8-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 20 ++++---------------- drivers/infiniband/core/umem_odp.c | 3 +++ drivers/infiniband/hw/mlx5/mr.c | 2 +- drivers/infiniband/hw/mlx5/odp.c | 6 +++--- 4 files changed, 11 insertions(+), 20 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 9a39c45cd1e6..37eb8643ec29 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -326,15 +326,6 @@ umem_kfree: } EXPORT_SYMBOL(ib_umem_get); -static void __ib_umem_release_tail(struct ib_umem *umem) -{ - mmdrop(umem->owning_mm); - if (umem->is_odp) - kfree(to_ib_umem_odp(umem)); - else - kfree(umem); -} - /** * ib_umem_release - release memory pinned with ib_umem_get * @umem: umem struct to release @@ -343,17 +334,14 @@ void ib_umem_release(struct ib_umem *umem) { if (!umem) return; - - if (umem->is_odp) { - ib_umem_odp_release(to_ib_umem_odp(umem)); - __ib_umem_release_tail(umem); - return; - } + if (umem->is_odp) + return ib_umem_odp_release(to_ib_umem_odp(umem)); __ib_umem_release(umem->context->device, umem, 1); atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); - __ib_umem_release_tail(umem); + mmdrop(umem->owning_mm); + kfree(umem); } EXPORT_SYMBOL(ib_umem_release); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 6a88bd0fcb33..3d4bbafa441c 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -523,7 +523,10 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) vfree(umem_odp->page_list); } put_per_mm(umem_odp); + mmdrop(umem_odp->umem.owning_mm); + kfree(umem_odp); } +EXPORT_SYMBOL(ib_umem_odp_release); /* * Map for DMA and insert a single page into the on-demand paging page tables. diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index fc1106ddc3f6..b7da619614e4 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1629,7 +1629,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) * so that there will not be any invalidations in * flight, looking at the *mr struct. */ - ib_umem_release(umem); + ib_umem_odp_release(umem_odp); atomic_sub(npages, &dev->mdev->priv.reg_pages); /* Avoid double-freeing the umem. */ diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index ad209ae44f05..a660dc2b21f4 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -206,7 +206,7 @@ static void mr_leaf_free_action(struct work_struct *work) mr->parent = NULL; synchronize_srcu(&mr->dev->mr_srcu); - ib_umem_release(&odp->umem); + ib_umem_odp_release(odp); if (imr->live) mlx5_ib_update_xlt(imr, idx, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | @@ -472,7 +472,7 @@ next_mr: mr->access_flags); if (IS_ERR(mtt)) { mutex_unlock(&odp_mr->umem_mutex); - ib_umem_release(&odp->umem); + ib_umem_odp_release(odp); return ERR_CAST(mtt); } @@ -526,7 +526,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags); if (IS_ERR(imr)) { - ib_umem_release(&umem_odp->umem); + ib_umem_odp_release(umem_odp); return ERR_CAST(imr); } -- cgit v1.2.3 From 204e3e5630c5d41948fc11d8419c07da8f3e5a4d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Aug 2019 14:17:06 +0300 Subject: RDMA/odp: Check for overflow when computing the umem_odp end Since the page size can be extended in the ODP case by IB_ACCESS_HUGETLB the existing overflow checks done by ib_umem_get() are not sufficient. Check for overflow again. Further, remove the unchecked math from the inlines and just use the precomputed value stored in the interval_tree_node. Link: https://lore.kernel.org/r/20190819111710.18440-9-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem_odp.c | 25 +++++++++++++++++++------ include/rdma/ib_umem_odp.h | 5 ++--- 2 files changed, 21 insertions(+), 9 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 3d4bbafa441c..d0ef7d86213e 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -287,19 +287,32 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, umem_odp->umem.is_odp = 1; if (!umem_odp->is_implicit_odp) { - size_t pages = ib_umem_odp_num_pages(umem_odp); - + size_t page_size = 1UL << umem_odp->page_shift; + size_t pages; + + umem_odp->interval_tree.start = + ALIGN_DOWN(umem_odp->umem.address, page_size); + if (check_add_overflow(umem_odp->umem.address, + umem_odp->umem.length, + &umem_odp->interval_tree.last)) + return -EOVERFLOW; + umem_odp->interval_tree.last = + ALIGN(umem_odp->interval_tree.last, page_size); + if (unlikely(umem_odp->interval_tree.last < page_size)) + return -EOVERFLOW; + + pages = (umem_odp->interval_tree.last - + umem_odp->interval_tree.start) >> + umem_odp->page_shift; if (!pages) return -EINVAL; /* * Note that the representation of the intervals in the * interval tree considers the ending point as contained in - * the interval, while the function ib_umem_end returns the - * first address which is not contained in the umem. + * the interval. */ - umem_odp->interval_tree.start = ib_umem_start(umem_odp); - umem_odp->interval_tree.last = ib_umem_end(umem_odp) - 1; + umem_odp->interval_tree.last--; umem_odp->page_list = vzalloc( array_size(sizeof(*umem_odp->page_list), pages)); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 5efb67f97b0a..b37c674b7fe6 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -91,14 +91,13 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) /* Returns the first page of an ODP umem. */ static inline unsigned long ib_umem_start(struct ib_umem_odp *umem_odp) { - return ALIGN_DOWN(umem_odp->umem.address, 1UL << umem_odp->page_shift); + return umem_odp->interval_tree.start; } /* Returns the address of the page after the last one of an ODP umem. */ static inline unsigned long ib_umem_end(struct ib_umem_odp *umem_odp) { - return ALIGN(umem_odp->umem.address + umem_odp->umem.length, - 1UL << umem_odp->page_shift); + return umem_odp->interval_tree.last + 1; } static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) -- cgit v1.2.3 From 37824952dc8fcd96e5c5a1ce9abf3f0ba09b1e5e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Aug 2019 14:17:07 +0300 Subject: RDMA/odp: Use kvcalloc for the dma_list and page_list There is no specific need for these to be in the valloc space, let the system decide automatically how to do the allocation. Link: https://lore.kernel.org/r/20190819111710.18440-10-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem_odp.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index d0ef7d86213e..72765b110f4b 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -314,13 +314,13 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, */ umem_odp->interval_tree.last--; - umem_odp->page_list = vzalloc( - array_size(sizeof(*umem_odp->page_list), pages)); + umem_odp->page_list = kvcalloc( + pages, sizeof(*umem_odp->page_list), GFP_KERNEL); if (!umem_odp->page_list) return -ENOMEM; - umem_odp->dma_list = - vzalloc(array_size(sizeof(*umem_odp->dma_list), pages)); + umem_odp->dma_list = kvcalloc( + pages, sizeof(*umem_odp->dma_list), GFP_KERNEL); if (!umem_odp->dma_list) { ret = -ENOMEM; goto out_page_list; @@ -354,9 +354,9 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, out_unlock: mutex_unlock(&ctx->per_mm_list_lock); - vfree(umem_odp->dma_list); + kvfree(umem_odp->dma_list); out_page_list: - vfree(umem_odp->page_list); + kvfree(umem_odp->page_list); return ret; } @@ -532,8 +532,8 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); remove_umem_from_per_mm(umem_odp); - vfree(umem_odp->dma_list); - vfree(umem_odp->page_list); + kvfree(umem_odp->dma_list); + kvfree(umem_odp->page_list); } put_per_mm(umem_odp); mmdrop(umem_odp->umem.owning_mm); -- cgit v1.2.3 From ce51346feede2ea41de0ad58af2b514223e11dad Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Mon, 19 Aug 2019 14:17:08 +0300 Subject: RDMA/core: Make invalidate_range a device operation The callback function 'invalidate_range' is implemented in a driver so the place for it is in the ib_device_ops structure and not in ib_ucontext. Signed-off-by: Moni Shoua Reviewed-by: Guy Levi Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20190819111710.18440-11-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/umem_odp.c | 10 +++++----- drivers/infiniband/core/uverbs_cmd.c | 2 -- drivers/infiniband/hw/mlx5/main.c | 4 ---- drivers/infiniband/hw/mlx5/odp.c | 1 + include/rdma/ib_verbs.h | 4 ++-- 6 files changed, 9 insertions(+), 13 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index ea8661a00651..b5631b8a0397 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2562,6 +2562,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, get_vf_config); SET_DEVICE_OP(dev_ops, get_vf_stats); SET_DEVICE_OP(dev_ops, init_port); + SET_DEVICE_OP(dev_ops, invalidate_range); SET_DEVICE_OP(dev_ops, iw_accept); SET_DEVICE_OP(dev_ops, iw_add_ref); SET_DEVICE_OP(dev_ops, iw_connect); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 72765b110f4b..32fb0b579dec 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -96,7 +96,7 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, */ ib_umem_notifier_start_account(umem_odp); complete_all(&umem_odp->notifier_completion); - umem_odp->umem.context->invalidate_range( + umem_odp->umem.context->device->ops.invalidate_range( umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); } @@ -109,7 +109,7 @@ static int invalidate_range_start_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->umem.context->invalidate_range(item, start, end); + item->umem.context->device->ops.invalidate_range(item, start, end); return 0; } @@ -385,7 +385,7 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, if (!context) return ERR_PTR(-EIO); - if (WARN_ON_ONCE(!context->invalidate_range)) + if (WARN_ON_ONCE(!context->device->ops.invalidate_range)) return ERR_PTR(-EINVAL); umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL); @@ -479,7 +479,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, return ERR_PTR(-EIO); if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)) || - WARN_ON_ONCE(!context->invalidate_range)) + WARN_ON_ONCE(!context->device->ops.invalidate_range)) return ERR_PTR(-EINVAL); umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); @@ -607,7 +607,7 @@ out: if (remove_existing_mapping) { ib_umem_notifier_start_account(umem_odp); - context->invalidate_range( + dev->ops.invalidate_range( umem_odp, ib_umem_start(umem_odp) + (page_index << umem_odp->page_shift), diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 7ddd0e5bc6b3..8f4fd4fac159 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -275,8 +275,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) ret = ib_dev->ops.alloc_ucontext(ucontext, &attrs->driver_udata); if (ret) goto err_file; - if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) - ucontext->invalidate_range = NULL; rdma_restrack_uadd(&ucontext->res); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e12a4404096b..7d5c2763b691 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1867,10 +1867,6 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, if (err) goto out_sys_pages; - if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING) - context->ibucontext.invalidate_range = - &mlx5_ib_invalidate_range; - if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { err = mlx5_ib_devx_create(dev, true); if (err < 0) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index a660dc2b21f4..6d940d8f5247 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -1612,6 +1612,7 @@ void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) static const struct ib_device_ops mlx5_ib_dev_odp_ops = { .advise_mr = mlx5_ib_advise_mr, + .invalidate_range = mlx5_ib_invalidate_range, }; int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4f225175cb91..c2b39dda44cc 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1417,8 +1417,6 @@ struct ib_ucontext { bool cleanup_retryable; - void (*invalidate_range)(struct ib_umem_odp *umem_odp, - unsigned long start, unsigned long end); struct mutex per_mm_list_lock; struct list_head per_mm_list; @@ -2378,6 +2376,8 @@ struct ib_device_ops { u64 iova); int (*unmap_fmr)(struct list_head *fmr_list); int (*dealloc_fmr)(struct ib_fmr *fmr); + void (*invalidate_range)(struct ib_umem_odp *umem_odp, + unsigned long start, unsigned long end); int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid); struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device, -- cgit v1.2.3 From a705f3e3a1a8cbe0064730024398b2320ae1ce74 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Aug 2019 14:17:09 +0300 Subject: RDMA/mlx5: Use ib_umem_start instead of umem.address These are subtly different, the address is the original VA requested during umem_get, while ib_umem_start() is the version that is rounded to the proper page size, ie is the true start of the umem's dma map. Link: https://lore.kernel.org/r/20190819111710.18440-12-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/odp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 6d940d8f5247..7603b4926480 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -184,7 +184,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, for (i = 0; i < nentries; i++, pklm++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); va = (offset + i) * MLX5_IMR_MTT_SIZE; - if (odp && odp->umem.address == va) { + if (odp && ib_umem_start(odp) == va) { struct mlx5_ib_mr *mtt = odp->private; pklm->key = cpu_to_be32(mtt->ibmr.lkey); @@ -494,7 +494,7 @@ next_mr: addr += MLX5_IMR_MTT_SIZE; if (unlikely(addr < io_virt + bcnt)) { odp = odp_next(odp); - if (odp && odp->umem.address != addr) + if (odp && ib_umem_start(odp) != addr) odp = NULL; goto next_mr; } @@ -662,7 +662,7 @@ next_mr: io_virt += size; next = odp_next(odp); - if (unlikely(!next || next->umem.address != io_virt)) { + if (unlikely(!next || ib_umem_start(next) != io_virt)) { mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", io_virt, next); return -EAGAIN; -- cgit v1.2.3 From fba0e448a2c5b297a4ddc1ec4e48f4aa6600a1c9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Aug 2019 14:17:10 +0300 Subject: RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr These are the same thing since mr always comes from odp->private. It is confusing to reference the same memory via two names. Link: https://lore.kernel.org/r/20190819111710.18440-13-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/odp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 7603b4926480..762038ab83e2 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -601,7 +601,7 @@ next_mr: start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; access_mask = ODP_READ_ALLOWED_BIT; - if (prefetch && !downgrade && !mr->umem->writable) { + if (prefetch && !downgrade && !odp->umem.writable) { /* prefetch with write-access must * be supported by the MR */ @@ -609,7 +609,7 @@ next_mr: goto out; } - if (mr->umem->writable && !downgrade) + if (odp->umem.writable && !downgrade) access_mask |= ODP_WRITE_ALLOWED_BIT; current_seq = READ_ONCE(odp->notifiers_seq); @@ -619,8 +619,8 @@ next_mr: */ smp_rmb(); - ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size, - access_mask, current_seq); + ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask, + current_seq); if (ret < 0) goto out; @@ -628,8 +628,7 @@ next_mr: np = ret; mutex_lock(&odp->umem_mutex); - if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem), - current_seq)) { + if (!ib_umem_mmu_notifier_retry(odp, current_seq)) { /* * No need to check whether the MTTs really belong to * this MR, since ib_umem_odp_map_dma_pages already -- cgit v1.2.3 From c571feca2dc972dc5afeba9036d08239f1c51af1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Aug 2019 20:15:43 -0300 Subject: RDMA/odp: use mmu_notifier_get/put for 'struct ib_ucontext_per_mm' This is a significant simplification, no extra list is kept per FD, and the interval tree is now shared between all the ucontexts, reducing overhead if there are multiple ucontexts active. Link: https://lore.kernel.org/r/20190806231548.25242-7-jgg@ziepe.ca Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem_odp.c | 168 +++++++++++----------------------- drivers/infiniband/core/uverbs_cmd.c | 3 - drivers/infiniband/core/uverbs_main.c | 1 + drivers/infiniband/hw/mlx5/main.c | 5 - include/rdma/ib_umem_odp.h | 10 +- include/rdma/ib_verbs.h | 3 - 6 files changed, 55 insertions(+), 135 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 32fb0b579dec..bf36f2c5fb3a 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -82,7 +82,7 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, struct rb_node *node; down_read(&per_mm->umem_rwsem); - if (!per_mm->active) + if (!per_mm->mn.users) goto out; for (node = rb_first_cached(&per_mm->umem_tree); node; @@ -125,10 +125,10 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - if (!per_mm->active) { + if (!per_mm->mn.users) { up_read(&per_mm->umem_rwsem); /* - * At this point active is permanently set and visible to this + * At this point users is permanently zero and visible to this * CPU without a lock, that fact is relied on to skip the unlock * in range_end. */ @@ -158,7 +158,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); - if (unlikely(!per_mm->active)) + if (unlikely(!per_mm->mn.users)) return; rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, @@ -167,122 +167,47 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, up_read(&per_mm->umem_rwsem); } -static const struct mmu_notifier_ops ib_umem_notifiers = { - .release = ib_umem_notifier_release, - .invalidate_range_start = ib_umem_notifier_invalidate_range_start, - .invalidate_range_end = ib_umem_notifier_invalidate_range_end, -}; - -static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) -{ - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - - down_write(&per_mm->umem_rwsem); - interval_tree_remove(&umem_odp->interval_tree, &per_mm->umem_tree); - complete_all(&umem_odp->notifier_completion); - up_write(&per_mm->umem_rwsem); -} - -static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, - struct mm_struct *mm) +static struct mmu_notifier *ib_umem_alloc_notifier(struct mm_struct *mm) { struct ib_ucontext_per_mm *per_mm; - int ret; per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); if (!per_mm) return ERR_PTR(-ENOMEM); - per_mm->context = ctx; - per_mm->mm = mm; per_mm->umem_tree = RB_ROOT_CACHED; init_rwsem(&per_mm->umem_rwsem); - per_mm->active = true; + WARN_ON(mm != current->mm); rcu_read_lock(); per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); rcu_read_unlock(); - - WARN_ON(mm != current->mm); - - per_mm->mn.ops = &ib_umem_notifiers; - ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); - if (ret) { - dev_err(&ctx->device->dev, - "Failed to register mmu_notifier %d\n", ret); - goto out_pid; - } - - list_add(&per_mm->ucontext_list, &ctx->per_mm_list); - return per_mm; - -out_pid: - put_pid(per_mm->tgid); - kfree(per_mm); - return ERR_PTR(ret); -} - -static struct ib_ucontext_per_mm *get_per_mm(struct ib_umem_odp *umem_odp) -{ - struct ib_ucontext *ctx = umem_odp->umem.context; - struct ib_ucontext_per_mm *per_mm; - - lockdep_assert_held(&ctx->per_mm_list_lock); - - /* - * Generally speaking we expect only one or two per_mm in this list, - * so no reason to optimize this search today. - */ - list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { - if (per_mm->mm == umem_odp->umem.owning_mm) - return per_mm; - } - - return alloc_per_mm(ctx, umem_odp->umem.owning_mm); -} - -static void free_per_mm(struct rcu_head *rcu) -{ - kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); + return &per_mm->mn; } -static void put_per_mm(struct ib_umem_odp *umem_odp) +static void ib_umem_free_notifier(struct mmu_notifier *mn) { - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; - struct ib_ucontext *ctx = umem_odp->umem.context; - bool need_free; - - mutex_lock(&ctx->per_mm_list_lock); - umem_odp->per_mm = NULL; - per_mm->odp_mrs_count--; - need_free = per_mm->odp_mrs_count == 0; - if (need_free) - list_del(&per_mm->ucontext_list); - mutex_unlock(&ctx->per_mm_list_lock); - - if (!need_free) - return; - - /* - * NOTE! mmu_notifier_unregister() can happen between a start/end - * callback, resulting in an start/end, and thus an unbalanced - * lock. This doesn't really matter to us since we are about to kfree - * the memory that holds the lock, however LOCKDEP doesn't like this. - */ - down_write(&per_mm->umem_rwsem); - per_mm->active = false; - up_write(&per_mm->umem_rwsem); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); - mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); + put_pid(per_mm->tgid); - mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); + kfree(per_mm); } -static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, - struct ib_ucontext_per_mm *per_mm) +static const struct mmu_notifier_ops ib_umem_notifiers = { + .release = ib_umem_notifier_release, + .invalidate_range_start = ib_umem_notifier_invalidate_range_start, + .invalidate_range_end = ib_umem_notifier_invalidate_range_end, + .alloc_notifier = ib_umem_alloc_notifier, + .free_notifier = ib_umem_free_notifier, +}; + +static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp) { - struct ib_ucontext *ctx = umem_odp->umem.context; + struct ib_ucontext_per_mm *per_mm; + struct mmu_notifier *mn; int ret; umem_odp->umem.is_odp = 1; @@ -327,17 +252,13 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, } } - mutex_lock(&ctx->per_mm_list_lock); - if (!per_mm) { - per_mm = get_per_mm(umem_odp); - if (IS_ERR(per_mm)) { - ret = PTR_ERR(per_mm); - goto out_unlock; - } + mn = mmu_notifier_get(&ib_umem_notifiers, umem_odp->umem.owning_mm); + if (IS_ERR(mn)) { + ret = PTR_ERR(mn); + goto out_dma_list; } - umem_odp->per_mm = per_mm; - per_mm->odp_mrs_count++; - mutex_unlock(&ctx->per_mm_list_lock); + umem_odp->per_mm = per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); mutex_init(&umem_odp->umem_mutex); init_completion(&umem_odp->notifier_completion); @@ -352,8 +273,7 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, return 0; -out_unlock: - mutex_unlock(&ctx->per_mm_list_lock); +out_dma_list: kvfree(umem_odp->dma_list); out_page_list: kvfree(umem_odp->page_list); @@ -398,7 +318,7 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, umem_odp->is_implicit_odp = 1; umem_odp->page_shift = PAGE_SHIFT; - ret = ib_init_umem_odp(umem_odp, NULL); + ret = ib_init_umem_odp(umem_odp); if (ret) { kfree(umem_odp); return ERR_PTR(ret); @@ -441,7 +361,7 @@ struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root, umem->owning_mm = root->umem.owning_mm; odp_data->page_shift = PAGE_SHIFT; - ret = ib_init_umem_odp(odp_data, root->per_mm); + ret = ib_init_umem_odp(odp_data); if (ret) { kfree(odp_data); return ERR_PTR(ret); @@ -509,7 +429,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, up_read(&mm->mmap_sem); } - ret = ib_init_umem_odp(umem_odp, NULL); + ret = ib_init_umem_odp(umem_odp); if (ret) goto err_free; return umem_odp; @@ -522,6 +442,8 @@ EXPORT_SYMBOL(ib_umem_odp_get); void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + /* * Ensure that no more pages are mapped in the umem. * @@ -531,11 +453,27 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) if (!umem_odp->is_implicit_odp) { ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); - remove_umem_from_per_mm(umem_odp); kvfree(umem_odp->dma_list); kvfree(umem_odp->page_list); } - put_per_mm(umem_odp); + + down_write(&per_mm->umem_rwsem); + if (!umem_odp->is_implicit_odp) { + interval_tree_remove(&umem_odp->interval_tree, + &per_mm->umem_tree); + complete_all(&umem_odp->notifier_completion); + } + /* + * NOTE! mmu_notifier_unregister() can happen between a start/end + * callback, resulting in a missing end, and thus an unbalanced + * lock. This doesn't really matter to us since we are about to kfree + * the memory that holds the lock, however LOCKDEP doesn't like this. + * Thus we call the mmu_notifier_put under the rwsem and test the + * internal users count to reliably see if we are past this point. + */ + mmu_notifier_put(&per_mm->mn); + up_write(&per_mm->umem_rwsem); + mmdrop(umem_odp->umem.owning_mm); kfree(umem_odp); } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 8f4fd4fac159..7c10dfe417a4 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -252,9 +252,6 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs) ucontext->closing = false; ucontext->cleanup_retryable = false; - mutex_init(&ucontext->per_mm_list_lock); - INIT_LIST_HEAD(&ucontext->per_mm_list); - ret = get_unused_fd_flags(O_CLOEXEC); if (ret < 0) goto err_free; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 11c13c1381cf..e369ac0d6f51 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1487,6 +1487,7 @@ static void __exit ib_uverbs_cleanup(void) IB_UVERBS_NUM_FIXED_MINOR); unregister_chrdev_region(dynamic_uverbs_dev, IB_UVERBS_NUM_DYNAMIC_MINOR); + mmu_notifier_synchronize(); } module_init(ib_uverbs_init); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 7d5c2763b691..4ba73a95475a 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1995,11 +1995,6 @@ static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_bfreg_info *bfregi; - /* All umem's must be destroyed before destroying the ucontext. */ - mutex_lock(&ibcontext->per_mm_list_lock); - WARN_ON(!list_empty(&ibcontext->per_mm_list)); - mutex_unlock(&ibcontext->per_mm_list_lock); - bfregi = &context->bfregi; mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index b37c674b7fe6..253df1a1fa54 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -122,20 +122,12 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_ucontext_per_mm { - struct ib_ucontext *context; - struct mm_struct *mm; + struct mmu_notifier mn; struct pid *tgid; - bool active; struct rb_root_cached umem_tree; /* Protects umem_tree */ struct rw_semaphore umem_rwsem; - - struct mmu_notifier mn; - unsigned int odp_mrs_count; - - struct list_head ucontext_list; - struct rcu_head rcu; }; struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c2b39dda44cc..f659f4a02aa9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1417,9 +1417,6 @@ struct ib_ucontext { bool cleanup_retryable; - struct mutex per_mm_list_lock; - struct list_head per_mm_list; - struct ib_rdmacg_object cg_obj; /* * Implementation details of the RDMA core, don't use in drivers: -- cgit v1.2.3 From 47f725ee7b5f5cae1f83512961bcf8b41a7a5794 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Aug 2019 20:15:44 -0300 Subject: RDMA/odp: remove ib_ucontext from ib_umem At this point the ucontext is only being stored to access the ib_device, so just store the ib_device directly instead. This is more natural and logical as the umem has nothing to do with the ucontext. Link: https://lore.kernel.org/r/20190806231548.25242-8-jgg@ziepe.ca Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 4 ++-- drivers/infiniband/core/umem_odp.c | 15 +++++++-------- include/rdma/ib_umem.h | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 37eb8643ec29..41f9e268e3fb 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -234,7 +234,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, umem = kzalloc(sizeof(*umem), GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); - umem->context = context; + umem->ibdev = context->device; umem->length = size; umem->address = addr; umem->writable = ib_access_writable(access); @@ -337,7 +337,7 @@ void ib_umem_release(struct ib_umem *umem) if (umem->is_odp) return ib_umem_odp_release(to_ib_umem_odp(umem)); - __ib_umem_release(umem->context->device, umem, 1); + __ib_umem_release(umem->ibdev, umem, 1); atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm); mmdrop(umem->owning_mm); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index bf36f2c5fb3a..9aebe9ce8b07 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -96,7 +96,7 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, */ ib_umem_notifier_start_account(umem_odp); complete_all(&umem_odp->notifier_completion); - umem_odp->umem.context->device->ops.invalidate_range( + umem_odp->umem.ibdev->ops.invalidate_range( umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); } @@ -109,7 +109,7 @@ static int invalidate_range_start_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->umem.context->device->ops.invalidate_range(item, start, end); + item->umem.ibdev->ops.invalidate_range(item, start, end); return 0; } @@ -312,7 +312,7 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata, if (!umem_odp) return ERR_PTR(-ENOMEM); umem = &umem_odp->umem; - umem->context = context; + umem->ibdev = context->device; umem->writable = ib_access_writable(access); umem->owning_mm = current->mm; umem_odp->is_implicit_odp = 1; @@ -354,7 +354,7 @@ struct ib_umem_odp *ib_umem_odp_alloc_child(struct ib_umem_odp *root, if (!odp_data) return ERR_PTR(-ENOMEM); umem = &odp_data->umem; - umem->context = root->umem.context; + umem->ibdev = root->umem.ibdev; umem->length = size; umem->address = addr; umem->writable = root->umem.writable; @@ -406,7 +406,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, if (!umem_odp) return ERR_PTR(-ENOMEM); - umem_odp->umem.context = context; + umem_odp->umem.ibdev = context->device; umem_odp->umem.length = size; umem_odp->umem.address = addr; umem_odp->umem.writable = ib_access_writable(access); @@ -504,8 +504,7 @@ static int ib_umem_odp_map_dma_single_page( u64 access_mask, unsigned long current_seq) { - struct ib_ucontext *context = umem_odp->umem.context; - struct ib_device *dev = context->device; + struct ib_device *dev = umem_odp->umem.ibdev; dma_addr_t dma_addr; int remove_existing_mapping = 0; int ret = 0; @@ -718,7 +717,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, { int idx; u64 addr; - struct ib_device *dev = umem_odp->umem.context->device; + struct ib_device *dev = umem_odp->umem.ibdev; virt = max_t(u64, virt, ib_umem_start(umem_odp)); bound = min_t(u64, bound, ib_umem_end(umem_odp)); diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 1052d0d62be7..a91b2af64ec4 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -42,7 +42,7 @@ struct ib_ucontext; struct ib_umem_odp; struct ib_umem { - struct ib_ucontext *context; + struct ib_device *ibdev; struct mm_struct *owning_mm; size_t length; unsigned long address; -- cgit v1.2.3