summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-01-20 14:15:51 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2023-01-20 14:15:51 -0800
commit8974efaa3385959e7ea1019a4b63acff28631e6d (patch)
tree5abc7ad295f8e96ca54932b99d89ada18bde359f
parentedc00350d205d2de8871b514c8f9b403d588e5d1 (diff)
parent0f097f08c9b3c1fdb6cc9f2dd423abc17d13f1a2 (diff)
downloadlinux-8974efaa3385959e7ea1019a4b63acff28631e6d.tar.bz2
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma fixes from Jason Gunthorpe: - Several hfi1 patches fixing some long standing driver bugs - Overflow when working with sg lists with elements greater than 4G - An rxe regression with object numbering after the mrs reach their limit - A theoretical problem with the scatterlist merging code * tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: lib/scatterlist: Fix to calculate the last_pg properly IB/hfi1: Remove user expected buffer invalidate race IB/hfi1: Immediately remove invalid memory from hardware IB/hfi1: Fix expected receive setup error exit issues IB/hfi1: Reserve user expected TIDs IB/hfi1: Reject a zero-length user expected buffer RDMA/core: Fix ib block iterator counter overflow RDMA/rxe: Prevent faulty rkey generation RDMA/rxe: Fix inaccurate constants in rxe_type_info
-rw-r--r--drivers/infiniband/core/verbs.c7
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.c200
-rw-r--r--drivers/infiniband/hw/hfi1/user_exp_rcv.h3
-rw-r--r--drivers/infiniband/sw/rxe/rxe_param.h10
-rw-r--r--drivers/infiniband/sw/rxe/rxe_pool.c22
-rw-r--r--lib/scatterlist.c25
6 files changed, 179 insertions, 88 deletions
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 26b021f43ba4..11b1c1603aeb 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -2957,15 +2957,18 @@ EXPORT_SYMBOL(__rdma_block_iter_start);
bool __rdma_block_iter_next(struct ib_block_iter *biter)
{
unsigned int block_offset;
+ unsigned int sg_delta;
if (!biter->__sg_nents || !biter->__sg)
return false;
biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
- biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset;
+ sg_delta = BIT_ULL(biter->__pg_bit) - block_offset;
- if (biter->__sg_advance >= sg_dma_len(biter->__sg)) {
+ if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) {
+ biter->__sg_advance += sg_delta;
+ } else {
biter->__sg_advance = 0;
biter->__sg = sg_next(biter->__sg);
biter->__sg_nents--;
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index 186d30291260..b02f2f0809c8 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -23,18 +23,25 @@ static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
const struct mmu_notifier_range *range,
unsigned long cur_seq);
+static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
struct tid_group *grp,
unsigned int start, u16 count,
u32 *tidlist, unsigned int *tididx,
unsigned int *pmapped);
-static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
- struct tid_group **grp);
+static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
+static void __clear_tid_node(struct hfi1_filedata *fd,
+ struct tid_rb_node *node);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
static const struct mmu_interval_notifier_ops tid_mn_ops = {
.invalidate = tid_rb_invalidate,
};
+static const struct mmu_interval_notifier_ops tid_cover_ops = {
+ .invalidate = tid_cover_invalidate,
+};
/*
* Initialize context and file private data needed for Expected
@@ -253,53 +260,65 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
tididx = 0, mapped, mapped_pages = 0;
u32 *tidlist = NULL;
struct tid_user_buf *tidbuf;
+ unsigned long mmu_seq = 0;
if (!PAGE_ALIGNED(tinfo->vaddr))
return -EINVAL;
+ if (tinfo->length == 0)
+ return -EINVAL;
tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
if (!tidbuf)
return -ENOMEM;
+ mutex_init(&tidbuf->cover_mutex);
tidbuf->vaddr = tinfo->vaddr;
tidbuf->length = tinfo->length;
tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
GFP_KERNEL);
if (!tidbuf->psets) {
- kfree(tidbuf);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto fail_release_mem;
+ }
+
+ if (fd->use_mn) {
+ ret = mmu_interval_notifier_insert(
+ &tidbuf->notifier, current->mm,
+ tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
+ &tid_cover_ops);
+ if (ret)
+ goto fail_release_mem;
+ mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
}
pinned = pin_rcv_pages(fd, tidbuf);
if (pinned <= 0) {
- kfree(tidbuf->psets);
- kfree(tidbuf);
- return pinned;
+ ret = (pinned < 0) ? pinned : -ENOSPC;
+ goto fail_unpin;
}
/* Find sets of physically contiguous pages */
tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);
- /*
- * We don't need to access this under a lock since tid_used is per
- * process and the same process cannot be in hfi1_user_exp_rcv_clear()
- * and hfi1_user_exp_rcv_setup() at the same time.
- */
+ /* Reserve the number of expected tids to be used. */
spin_lock(&fd->tid_lock);
if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
pageset_count = fd->tid_limit - fd->tid_used;
else
pageset_count = tidbuf->n_psets;
+ fd->tid_used += pageset_count;
spin_unlock(&fd->tid_lock);
- if (!pageset_count)
- goto bail;
+ if (!pageset_count) {
+ ret = -ENOSPC;
+ goto fail_unreserve;
+ }
ngroups = pageset_count / dd->rcv_entries.group_size;
tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
if (!tidlist) {
ret = -ENOMEM;
- goto nomem;
+ goto fail_unreserve;
}
tididx = 0;
@@ -395,43 +414,78 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
}
unlock:
mutex_unlock(&uctxt->exp_mutex);
-nomem:
hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
mapped_pages, ret);
- if (tididx) {
- spin_lock(&fd->tid_lock);
- fd->tid_used += tididx;
- spin_unlock(&fd->tid_lock);
- tinfo->tidcnt = tididx;
- tinfo->length = mapped_pages * PAGE_SIZE;
-
- if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
- tidlist, sizeof(tidlist[0]) * tididx)) {
- /*
- * On failure to copy to the user level, we need to undo
- * everything done so far so we don't leak resources.
- */
- tinfo->tidlist = (unsigned long)&tidlist;
- hfi1_user_exp_rcv_clear(fd, tinfo);
- tinfo->tidlist = 0;
- ret = -EFAULT;
- goto bail;
+
+ /* fail if nothing was programmed, set error if none provided */
+ if (tididx == 0) {
+ if (ret >= 0)
+ ret = -ENOSPC;
+ goto fail_unreserve;
+ }
+
+ /* adjust reserved tid_used to actual count */
+ spin_lock(&fd->tid_lock);
+ fd->tid_used -= pageset_count - tididx;
+ spin_unlock(&fd->tid_lock);
+
+ /* unpin all pages not covered by a TID */
+ unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
+ false);
+
+ if (fd->use_mn) {
+ /* check for an invalidate during setup */
+ bool fail = false;
+
+ mutex_lock(&tidbuf->cover_mutex);
+ fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
+ mutex_unlock(&tidbuf->cover_mutex);
+
+ if (fail) {
+ ret = -EBUSY;
+ goto fail_unprogram;
}
}
- /*
- * If not everything was mapped (due to insufficient RcvArray entries,
- * for example), unpin all unmapped pages so we can pin them nex time.
- */
- if (mapped_pages != pinned)
- unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages,
- (pinned - mapped_pages), false);
-bail:
+ tinfo->tidcnt = tididx;
+ tinfo->length = mapped_pages * PAGE_SIZE;
+
+ if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
+ tidlist, sizeof(tidlist[0]) * tididx)) {
+ ret = -EFAULT;
+ goto fail_unprogram;
+ }
+
+ if (fd->use_mn)
+ mmu_interval_notifier_remove(&tidbuf->notifier);
+ kfree(tidbuf->pages);
kfree(tidbuf->psets);
+ kfree(tidbuf);
kfree(tidlist);
+ return 0;
+
+fail_unprogram:
+ /* unprogram, unmap, and unpin all allocated TIDs */
+ tinfo->tidlist = (unsigned long)tidlist;
+ hfi1_user_exp_rcv_clear(fd, tinfo);
+ tinfo->tidlist = 0;
+ pinned = 0; /* nothing left to unpin */
+ pageset_count = 0; /* nothing left reserved */
+fail_unreserve:
+ spin_lock(&fd->tid_lock);
+ fd->tid_used -= pageset_count;
+ spin_unlock(&fd->tid_lock);
+fail_unpin:
+ if (fd->use_mn)
+ mmu_interval_notifier_remove(&tidbuf->notifier);
+ if (pinned > 0)
+ unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
+fail_release_mem:
kfree(tidbuf->pages);
+ kfree(tidbuf->psets);
kfree(tidbuf);
- return ret > 0 ? 0 : ret;
+ kfree(tidlist);
+ return ret;
}
int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
@@ -452,7 +506,7 @@ int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
mutex_lock(&uctxt->exp_mutex);
for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
- ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL);
+ ret = unprogram_rcvarray(fd, tidinfo[tididx]);
if (ret) {
hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
ret);
@@ -706,6 +760,7 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
}
node->fdata = fd;
+ mutex_init(&node->invalidate_mutex);
node->phys = page_to_phys(pages[0]);
node->npages = npages;
node->rcventry = rcventry;
@@ -721,11 +776,6 @@ static int set_rcvarray_entry(struct hfi1_filedata *fd,
&tid_mn_ops);
if (ret)
goto out_unmap;
- /*
- * FIXME: This is in the wrong order, the notifier should be
- * established before the pages are pinned by pin_rcv_pages.
- */
- mmu_interval_read_begin(&node->notifier);
}
fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
@@ -745,8 +795,7 @@ out_unmap:
return -EFAULT;
}
-static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
- struct tid_group **grp)
+static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct hfi1_devdata *dd = uctxt->dd;
@@ -769,9 +818,6 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
if (!node || node->rcventry != (uctxt->expected_base + rcventry))
return -EBADF;
- if (grp)
- *grp = node->grp;
-
if (fd->use_mn)
mmu_interval_notifier_remove(&node->notifier);
cacheless_tid_rb_remove(fd, node);
@@ -779,23 +825,34 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
return 0;
}
-static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
+static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
struct hfi1_ctxtdata *uctxt = fd->uctxt;
struct hfi1_devdata *dd = uctxt->dd;
+ mutex_lock(&node->invalidate_mutex);
+ if (node->freed)
+ goto done;
+ node->freed = true;
+
trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
node->npages,
node->notifier.interval_tree.start, node->phys,
node->dma_addr);
- /*
- * Make sure device has seen the write before we unpin the
- * pages.
- */
+ /* Make sure device has seen the write before pages are unpinned */
hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);
unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
+done:
+ mutex_unlock(&node->invalidate_mutex);
+}
+
+static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
+{
+ struct hfi1_ctxtdata *uctxt = fd->uctxt;
+
+ __clear_tid_node(fd, node);
node->grp->used--;
node->grp->map &= ~(1 << (node->rcventry - node->grp->base));
@@ -854,10 +911,16 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
if (node->freed)
return true;
+ /* take action only if unmapping */
+ if (range->event != MMU_NOTIFY_UNMAP)
+ return true;
+
trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
node->notifier.interval_tree.start,
node->rcventry, node->npages, node->dma_addr);
- node->freed = true;
+
+ /* clear the hardware rcvarray entry */
+ __clear_tid_node(fdata, node);
spin_lock(&fdata->invalid_lock);
if (fdata->invalid_tid_idx < uctxt->expected_count) {
@@ -887,6 +950,23 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
return true;
}
+static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
+ const struct mmu_notifier_range *range,
+ unsigned long cur_seq)
+{
+ struct tid_user_buf *tidbuf =
+ container_of(mni, struct tid_user_buf, notifier);
+
+ /* take action only if unmapping */
+ if (range->event == MMU_NOTIFY_UNMAP) {
+ mutex_lock(&tidbuf->cover_mutex);
+ mmu_interval_set_seq(mni, cur_seq);
+ mutex_unlock(&tidbuf->cover_mutex);
+ }
+
+ return true;
+}
+
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
struct tid_rb_node *tnode)
{
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
index 8c53e416bf84..f8ee997d0050 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -16,6 +16,8 @@ struct tid_pageset {
};
struct tid_user_buf {
+ struct mmu_interval_notifier notifier;
+ struct mutex cover_mutex;
unsigned long vaddr;
unsigned long length;
unsigned int npages;
@@ -27,6 +29,7 @@ struct tid_user_buf {
struct tid_rb_node {
struct mmu_interval_notifier notifier;
struct hfi1_filedata *fdata;
+ struct mutex invalidate_mutex; /* covers hw removal */
unsigned long phys;
struct tid_group *grp;
u32 rcventry;
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h
index a754fc902e3d..7b41d79e72b2 100644
--- a/drivers/infiniband/sw/rxe/rxe_param.h
+++ b/drivers/infiniband/sw/rxe/rxe_param.h
@@ -98,11 +98,11 @@ enum rxe_device_param {
RXE_MAX_SRQ = DEFAULT_MAX_VALUE - RXE_MIN_SRQ_INDEX,
RXE_MIN_MR_INDEX = 0x00000001,
- RXE_MAX_MR_INDEX = DEFAULT_MAX_VALUE,
- RXE_MAX_MR = DEFAULT_MAX_VALUE - RXE_MIN_MR_INDEX,
- RXE_MIN_MW_INDEX = 0x00010001,
- RXE_MAX_MW_INDEX = 0x00020000,
- RXE_MAX_MW = 0x00001000,
+ RXE_MAX_MR_INDEX = DEFAULT_MAX_VALUE >> 1,
+ RXE_MAX_MR = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX,
+ RXE_MIN_MW_INDEX = RXE_MAX_MR_INDEX + 1,
+ RXE_MAX_MW_INDEX = DEFAULT_MAX_VALUE,
+ RXE_MAX_MW = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX,
RXE_MAX_PKT_PER_ACK = 64,
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index f50620f5a0a1..1151c0b5ccea 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -23,16 +23,16 @@ static const struct rxe_type_info {
.size = sizeof(struct rxe_ucontext),
.elem_offset = offsetof(struct rxe_ucontext, elem),
.min_index = 1,
- .max_index = UINT_MAX,
- .max_elem = UINT_MAX,
+ .max_index = RXE_MAX_UCONTEXT,
+ .max_elem = RXE_MAX_UCONTEXT,
},
[RXE_TYPE_PD] = {
.name = "pd",
.size = sizeof(struct rxe_pd),
.elem_offset = offsetof(struct rxe_pd, elem),
.min_index = 1,
- .max_index = UINT_MAX,
- .max_elem = UINT_MAX,
+ .max_index = RXE_MAX_PD,
+ .max_elem = RXE_MAX_PD,
},
[RXE_TYPE_AH] = {
.name = "ah",
@@ -40,7 +40,7 @@ static const struct rxe_type_info {
.elem_offset = offsetof(struct rxe_ah, elem),
.min_index = RXE_MIN_AH_INDEX,
.max_index = RXE_MAX_AH_INDEX,
- .max_elem = RXE_MAX_AH_INDEX - RXE_MIN_AH_INDEX + 1,
+ .max_elem = RXE_MAX_AH,
},
[RXE_TYPE_SRQ] = {
.name = "srq",
@@ -49,7 +49,7 @@ static const struct rxe_type_info {
.cleanup = rxe_srq_cleanup,
.min_index = RXE_MIN_SRQ_INDEX,
.max_index = RXE_MAX_SRQ_INDEX,
- .max_elem = RXE_MAX_SRQ_INDEX - RXE_MIN_SRQ_INDEX + 1,
+ .max_elem = RXE_MAX_SRQ,
},
[RXE_TYPE_QP] = {
.name = "qp",
@@ -58,7 +58,7 @@ static const struct rxe_type_info {
.cleanup = rxe_qp_cleanup,
.min_index = RXE_MIN_QP_INDEX,
.max_index = RXE_MAX_QP_INDEX,
- .max_elem = RXE_MAX_QP_INDEX - RXE_MIN_QP_INDEX + 1,
+ .max_elem = RXE_MAX_QP,
},
[RXE_TYPE_CQ] = {
.name = "cq",
@@ -66,8 +66,8 @@ static const struct rxe_type_info {
.elem_offset = offsetof(struct rxe_cq, elem),
.cleanup = rxe_cq_cleanup,
.min_index = 1,
- .max_index = UINT_MAX,
- .max_elem = UINT_MAX,
+ .max_index = RXE_MAX_CQ,
+ .max_elem = RXE_MAX_CQ,
},
[RXE_TYPE_MR] = {
.name = "mr",
@@ -76,7 +76,7 @@ static const struct rxe_type_info {
.cleanup = rxe_mr_cleanup,
.min_index = RXE_MIN_MR_INDEX,
.max_index = RXE_MAX_MR_INDEX,
- .max_elem = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX + 1,
+ .max_elem = RXE_MAX_MR,
},
[RXE_TYPE_MW] = {
.name = "mw",
@@ -85,7 +85,7 @@ static const struct rxe_type_info {
.cleanup = rxe_mw_cleanup,
.min_index = RXE_MIN_MW_INDEX,
.max_index = RXE_MAX_MW_INDEX,
- .max_elem = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX + 1,
+ .max_elem = RXE_MAX_MW,
},
};
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index f72aa50c6654..8d7519a8f308 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -470,22 +470,27 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append,
return -EOPNOTSUPP;
if (sgt_append->prv) {
+ unsigned long next_pfn = (page_to_phys(sg_page(sgt_append->prv)) +
+ sgt_append->prv->offset + sgt_append->prv->length) / PAGE_SIZE;
+
if (WARN_ON(offset))
return -EINVAL;
/* Merge contiguous pages into the last SG */
prv_len = sgt_append->prv->length;
- last_pg = sg_page(sgt_append->prv);
- while (n_pages && pages_are_mergeable(pages[0], last_pg)) {
- if (sgt_append->prv->length + PAGE_SIZE > max_segment)
- break;
- sgt_append->prv->length += PAGE_SIZE;
- last_pg = pages[0];
- pages++;
- n_pages--;
+ if (page_to_pfn(pages[0]) == next_pfn) {
+ last_pg = pfn_to_page(next_pfn - 1);
+ while (n_pages && pages_are_mergeable(pages[0], last_pg)) {
+ if (sgt_append->prv->length + PAGE_SIZE > max_segment)
+ break;
+ sgt_append->prv->length += PAGE_SIZE;
+ last_pg = pages[0];
+ pages++;
+ n_pages--;
+ }
+ if (!n_pages)
+ goto out;
}
- if (!n_pages)
- goto out;
}
/* compute number of contiguous chunks */