summaryrefslogtreecommitdiffstats
path: root/drivers/infiniband/sw
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-07-15 20:38:15 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2019-07-15 20:38:15 -0700
commit2a3c389a0fde49b241430df806a34276568cfb29 (patch)
tree9cf35829317e8cc2aaffc4341fb824dad63fce02 /drivers/infiniband/sw
parent8de262531f5fbb7458463224a7587429800c24bf (diff)
parent0b043644c0ca601cb19943a81aa1f1455dbe9461 (diff)
downloadlinux-2a3c389a0fde49b241430df806a34276568cfb29.tar.bz2
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe: "A smaller cycle this time. Notably we see another new driver, 'Soft iWarp', and the deletion of an ancient unused driver for nes. - Revise and simplify the signature offload RDMA MR APIs - More progress on hoisting object allocation boiler plate code out of the drivers - Driver bug fixes and revisions for hns, hfi1, efa, cxgb4, qib, i40iw - Tree wide cleanups: struct_size, put_user_page, xarray, rst doc conversion - Removal of obsolete ib_ucm chardev and nes driver - netlink based discovery of chardevs and autoloading of the modules providing them - Move more of the rdamvt/hfi1 uapi to include/uapi/rdma - New driver 'siw' for software based iWarp running on top of netdev, much like rxe's software RoCE. - mlx5 feature to report events in their raw devx format to userspace - Expose per-object counters through rdma tool - Adaptive interrupt moderation for RDMA (DIM), sharing the DIM core from netdev" * tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (194 commits) RMDA/siw: Require a 64 bit arch RDMA/siw: Mark expected switch fall-throughs RDMA/core: Fix -Wunused-const-variable warnings rdma/siw: Remove set but not used variable 's' rdma/siw: Add missing dependencies on LIBCRC32C and DMA_VIRT_OPS RDMA/siw: Add missing rtnl_lock around access to ifa rdma/siw: Use proper enumerated type in map_cqe_status RDMA/siw: Remove unnecessary kthread create/destroy printouts IB/rdmavt: Fix variable shadowing issue in rvt_create_cq RDMA/core: Fix race when resolving IP address RDMA/core: Make rdma_counter.h compile stand alone IB/core: Work on the caller socket net namespace in nldev_newlink() RDMA/rxe: Fill in wc byte_len with IB_WC_RECV_RDMA_WITH_IMM RDMA/mlx5: Set RDMA DIM to be enabled by default RDMA/nldev: Added configuration of RDMA dynamic interrupt moderation to netlink RDMA/core: Provide RDMA DIM support for ULPs linux/dim: Implement RDMA adaptive moderation (DIM) IB/mlx5: Report correctly tag matching 
rendezvous capability docs: infiniband: add it to the driver-api bookset IB/mlx5: Implement VHCA tunnel mechanism in DEVX ...
Diffstat (limited to 'drivers/infiniband/sw')
-rw-r--r--drivers/infiniband/sw/Makefile1
-rw-r--r--drivers/infiniband/sw/rdmavt/ah.c6
-rw-r--r--drivers/infiniband/sw/rdmavt/cq.c250
-rw-r--r--drivers/infiniband/sw/rdmavt/cq.h7
-rw-r--r--drivers/infiniband/sw/rdmavt/mr.c6
-rw-r--r--drivers/infiniband/sw/rdmavt/qp.c402
-rw-r--r--drivers/infiniband/sw/rdmavt/qp.h2
-rw-r--r--drivers/infiniband/sw/rdmavt/rc.c41
-rw-r--r--drivers/infiniband/sw/rdmavt/srq.c69
-rw-r--r--drivers/infiniband/sw/rdmavt/trace_mr.h56
-rw-r--r--drivers/infiniband/sw/rdmavt/vt.c7
-rw-r--r--drivers/infiniband/sw/rdmavt/vt.h9
-rw-r--r--drivers/infiniband/sw/rxe/rxe_comp.c2
-rw-r--r--drivers/infiniband/sw/rxe/rxe_mr.c3
-rw-r--r--drivers/infiniband/sw/rxe/rxe_pool.c1
-rw-r--r--drivers/infiniband/sw/rxe/rxe_resp.c5
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.c40
-rw-r--r--drivers/infiniband/sw/rxe/rxe_verbs.h3
-rw-r--r--drivers/infiniband/sw/siw/Kconfig18
-rw-r--r--drivers/infiniband/sw/siw/Makefile11
-rw-r--r--drivers/infiniband/sw/siw/iwarp.h380
-rw-r--r--drivers/infiniband/sw/siw/siw.h745
-rw-r--r--drivers/infiniband/sw/siw/siw_cm.c2070
-rw-r--r--drivers/infiniband/sw/siw/siw_cm.h133
-rw-r--r--drivers/infiniband/sw/siw/siw_cq.c101
-rw-r--r--drivers/infiniband/sw/siw/siw_main.c685
-rw-r--r--drivers/infiniband/sw/siw/siw_mem.c460
-rw-r--r--drivers/infiniband/sw/siw/siw_mem.h74
-rw-r--r--drivers/infiniband/sw/siw/siw_qp.c1322
-rw-r--r--drivers/infiniband/sw/siw/siw_qp_rx.c1458
-rw-r--r--drivers/infiniband/sw/siw/siw_qp_tx.c1269
-rw-r--r--drivers/infiniband/sw/siw/siw_verbs.c1760
-rw-r--r--drivers/infiniband/sw/siw/siw_verbs.h91
33 files changed, 11155 insertions, 332 deletions
diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile
index ab48a9b60844..68e0230f8f31 100644
--- a/drivers/infiniband/sw/Makefile
+++ b/drivers/infiniband/sw/Makefile
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/
obj-$(CONFIG_RDMA_RXE) += rxe/
+obj-$(CONFIG_RDMA_SIW) += siw/
diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c
index 0e147b32cbe9..fe99da0ff060 100644
--- a/drivers/infiniband/sw/rdmavt/ah.c
+++ b/drivers/infiniband/sw/rdmavt/ah.c
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2016 Intel Corporation.
+ * Copyright(c) 2016 - 2019 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -119,8 +119,6 @@ int rvt_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
rdma_copy_ah_attr(&ah->attr, ah_attr);
- atomic_set(&ah->refcount, 0);
-
if (dev->driver_f.notify_new_ah)
dev->driver_f.notify_new_ah(ibah->device, ah_attr, ah);
@@ -141,8 +139,6 @@ void rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags)
struct rvt_ah *ah = ibah_to_rvtah(ibah);
unsigned long flags;
- WARN_ON_ONCE(atomic_read(&ah->refcount));
-
spin_lock_irqsave(&dev->n_ahs_lock, flags);
dev->n_ahs_allocated--;
spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c
index a06e6da7a026..a85571a4cf57 100644
--- a/drivers/infiniband/sw/rdmavt/cq.c
+++ b/drivers/infiniband/sw/rdmavt/cq.c
@@ -60,22 +60,39 @@ static struct workqueue_struct *comp_vector_wq;
* @solicited: true if @entry is solicited
*
* This may be called with qp->s_lock held.
+ *
+ * Return: return true on success, else return
+ * false if cq is full.
*/
-void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
+bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
{
- struct rvt_cq_wc *wc;
+ struct ib_uverbs_wc *uqueue = NULL;
+ struct ib_wc *kqueue = NULL;
+ struct rvt_cq_wc *u_wc = NULL;
+ struct rvt_k_cq_wc *k_wc = NULL;
unsigned long flags;
u32 head;
u32 next;
+ u32 tail;
spin_lock_irqsave(&cq->lock, flags);
+ if (cq->ip) {
+ u_wc = cq->queue;
+ uqueue = &u_wc->uqueue[0];
+ head = RDMA_READ_UAPI_ATOMIC(u_wc->head);
+ tail = RDMA_READ_UAPI_ATOMIC(u_wc->tail);
+ } else {
+ k_wc = cq->kqueue;
+ kqueue = &k_wc->kqueue[0];
+ head = k_wc->head;
+ tail = k_wc->tail;
+ }
+
/*
- * Note that the head pointer might be writable by user processes.
- * Take care to verify it is a sane value.
+ * Note that the head pointer might be writable by
+ * user processes. Take care to verify it is a sane value.
*/
- wc = cq->queue;
- head = wc->head;
if (head >= (unsigned)cq->ibcq.cqe) {
head = cq->ibcq.cqe;
next = 0;
@@ -83,7 +100,12 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
next = head + 1;
}
- if (unlikely(next == wc->tail)) {
+ if (unlikely(next == tail || cq->cq_full)) {
+ struct rvt_dev_info *rdi = cq->rdi;
+
+ if (!cq->cq_full)
+ rvt_pr_err_ratelimited(rdi, "CQ is full!\n");
+ cq->cq_full = true;
spin_unlock_irqrestore(&cq->lock, flags);
if (cq->ibcq.event_handler) {
struct ib_event ev;
@@ -93,30 +115,30 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
ev.event = IB_EVENT_CQ_ERR;
cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
}
- return;
+ return false;
}
trace_rvt_cq_enter(cq, entry, head);
- if (cq->ip) {
- wc->uqueue[head].wr_id = entry->wr_id;
- wc->uqueue[head].status = entry->status;
- wc->uqueue[head].opcode = entry->opcode;
- wc->uqueue[head].vendor_err = entry->vendor_err;
- wc->uqueue[head].byte_len = entry->byte_len;
- wc->uqueue[head].ex.imm_data = entry->ex.imm_data;
- wc->uqueue[head].qp_num = entry->qp->qp_num;
- wc->uqueue[head].src_qp = entry->src_qp;
- wc->uqueue[head].wc_flags = entry->wc_flags;
- wc->uqueue[head].pkey_index = entry->pkey_index;
- wc->uqueue[head].slid = ib_lid_cpu16(entry->slid);
- wc->uqueue[head].sl = entry->sl;
- wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
- wc->uqueue[head].port_num = entry->port_num;
+ if (uqueue) {
+ uqueue[head].wr_id = entry->wr_id;
+ uqueue[head].status = entry->status;
+ uqueue[head].opcode = entry->opcode;
+ uqueue[head].vendor_err = entry->vendor_err;
+ uqueue[head].byte_len = entry->byte_len;
+ uqueue[head].ex.imm_data = entry->ex.imm_data;
+ uqueue[head].qp_num = entry->qp->qp_num;
+ uqueue[head].src_qp = entry->src_qp;
+ uqueue[head].wc_flags = entry->wc_flags;
+ uqueue[head].pkey_index = entry->pkey_index;
+ uqueue[head].slid = ib_lid_cpu16(entry->slid);
+ uqueue[head].sl = entry->sl;
+ uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+ uqueue[head].port_num = entry->port_num;
/* Make sure entry is written before the head index. */
- smp_wmb();
+ RDMA_WRITE_UAPI_ATOMIC(u_wc->head, next);
} else {
- wc->kqueue[head] = *entry;
+ kqueue[head] = *entry;
+ k_wc->head = next;
}
- wc->head = next;
if (cq->notify == IB_CQ_NEXT_COMP ||
(cq->notify == IB_CQ_SOLICITED &&
@@ -132,6 +154,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited)
}
spin_unlock_irqrestore(&cq->lock, flags);
+ return true;
}
EXPORT_SYMBOL(rvt_cq_enter);
@@ -166,43 +189,38 @@ static void send_complete(struct work_struct *work)
/**
* rvt_create_cq - create a completion queue
- * @ibdev: the device this completion queue is attached to
+ * @ibcq: Allocated CQ
* @attr: creation attributes
* @udata: user data for libibverbs.so
*
* Called by ib_create_cq() in the generic verbs code.
*
- * Return: pointer to the completion queue or negative errno values
- * for failure.
+ * Return: 0 on success, negative errno on failure
*/
-struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
- const struct ib_cq_init_attr *attr,
- struct ib_udata *udata)
+int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+ struct ib_udata *udata)
{
+ struct ib_device *ibdev = ibcq->device;
struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
- struct rvt_cq *cq;
- struct rvt_cq_wc *wc;
- struct ib_cq *ret;
+ struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
+ struct rvt_cq_wc *u_wc = NULL;
+ struct rvt_k_cq_wc *k_wc = NULL;
u32 sz;
unsigned int entries = attr->cqe;
int comp_vector = attr->comp_vector;
+ int err;
if (attr->flags)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (entries < 1 || entries > rdi->dparms.props.max_cqe)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
if (comp_vector < 0)
comp_vector = 0;
comp_vector = comp_vector % rdi->ibdev.num_comp_vectors;
- /* Allocate the completion queue structure. */
- cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node);
- if (!cq)
- return ERR_PTR(-ENOMEM);
-
/*
* Allocate the completion queue entries and head/tail pointers.
* This is allocated separately so that it can be resized and
@@ -210,17 +228,18 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
* We need to use vmalloc() in order to support mmap and large
* numbers of entries.
*/
- sz = sizeof(*wc);
- if (udata && udata->outlen >= sizeof(__u64))
- sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
- else
- sz += sizeof(struct ib_wc) * (entries + 1);
- wc = udata ?
- vmalloc_user(sz) :
- vzalloc_node(sz, rdi->dparms.node);
- if (!wc) {
- ret = ERR_PTR(-ENOMEM);
- goto bail_cq;
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ sz = sizeof(struct ib_uverbs_wc) * (entries + 1);
+ sz += sizeof(*u_wc);
+ u_wc = vmalloc_user(sz);
+ if (!u_wc)
+ return -ENOMEM;
+ } else {
+ sz = sizeof(struct ib_wc) * (entries + 1);
+ sz += sizeof(*k_wc);
+ k_wc = vzalloc_node(sz, rdi->dparms.node);
+ if (!k_wc)
+ return -ENOMEM;
}
/*
@@ -228,26 +247,22 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
* See rvt_mmap() for details.
*/
if (udata && udata->outlen >= sizeof(__u64)) {
- int err;
-
- cq->ip = rvt_create_mmap_info(rdi, sz, udata, wc);
+ cq->ip = rvt_create_mmap_info(rdi, sz, udata, u_wc);
if (!cq->ip) {
- ret = ERR_PTR(-ENOMEM);
+ err = -ENOMEM;
goto bail_wc;
}
err = ib_copy_to_udata(udata, &cq->ip->offset,
sizeof(cq->ip->offset));
- if (err) {
- ret = ERR_PTR(err);
+ if (err)
goto bail_ip;
- }
}
spin_lock_irq(&rdi->n_cqs_lock);
if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) {
spin_unlock_irq(&rdi->n_cqs_lock);
- ret = ERR_PTR(-ENOMEM);
+ err = -ENOMEM;
goto bail_ip;
}
@@ -277,21 +292,20 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
cq->notify = RVT_CQ_NONE;
spin_lock_init(&cq->lock);
INIT_WORK(&cq->comptask, send_complete);
- cq->queue = wc;
-
- ret = &cq->ibcq;
+ if (u_wc)
+ cq->queue = u_wc;
+ else
+ cq->kqueue = k_wc;
trace_rvt_create_cq(cq, attr);
- goto done;
+ return 0;
bail_ip:
kfree(cq->ip);
bail_wc:
- vfree(wc);
-bail_cq:
- kfree(cq);
-done:
- return ret;
+ vfree(u_wc);
+ vfree(k_wc);
+ return err;
}
/**
@@ -300,10 +314,8 @@ done:
* @udata: user data or NULL for kernel object
*
* Called by ib_destroy_cq() in the generic verbs code.
- *
- * Return: always 0
*/
-int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
{
struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
struct rvt_dev_info *rdi = cq->rdi;
@@ -316,9 +328,6 @@ int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
kref_put(&cq->ip->ref, rvt_release_mmap_info);
else
vfree(cq->queue);
- kfree(cq);
-
- return 0;
}
/**
@@ -345,9 +354,16 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
if (cq->notify != IB_CQ_NEXT_COMP)
cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
- if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
- cq->queue->head != cq->queue->tail)
- ret = 1;
+ if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+ if (cq->queue) {
+ if (RDMA_READ_UAPI_ATOMIC(cq->queue->head) !=
+ RDMA_READ_UAPI_ATOMIC(cq->queue->tail))
+ ret = 1;
+ } else {
+ if (cq->kqueue->head != cq->kqueue->tail)
+ ret = 1;
+ }
+ }
spin_unlock_irqrestore(&cq->lock, flags);
@@ -363,12 +379,14 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
{
struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
- struct rvt_cq_wc *old_wc;
- struct rvt_cq_wc *wc;
u32 head, tail, n;
int ret;
u32 sz;
struct rvt_dev_info *rdi = cq->rdi;
+ struct rvt_cq_wc *u_wc = NULL;
+ struct rvt_cq_wc *old_u_wc = NULL;
+ struct rvt_k_cq_wc *k_wc = NULL;
+ struct rvt_k_cq_wc *old_k_wc = NULL;
if (cqe < 1 || cqe > rdi->dparms.props.max_cqe)
return -EINVAL;
@@ -376,17 +394,19 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
/*
* Need to use vmalloc() if we want to support large #s of entries.
*/
- sz = sizeof(*wc);
- if (udata && udata->outlen >= sizeof(__u64))
- sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
- else
- sz += sizeof(struct ib_wc) * (cqe + 1);
- wc = udata ?
- vmalloc_user(sz) :
- vzalloc_node(sz, rdi->dparms.node);
- if (!wc)
- return -ENOMEM;
-
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ sz = sizeof(struct ib_uverbs_wc) * (cqe + 1);
+ sz += sizeof(*u_wc);
+ u_wc = vmalloc_user(sz);
+ if (!u_wc)
+ return -ENOMEM;
+ } else {
+ sz = sizeof(struct ib_wc) * (cqe + 1);
+ sz += sizeof(*k_wc);
+ k_wc = vzalloc_node(sz, rdi->dparms.node);
+ if (!k_wc)
+ return -ENOMEM;
+ }
/* Check that we can write the offset to mmap. */
if (udata && udata->outlen >= sizeof(__u64)) {
__u64 offset = 0;
@@ -401,11 +421,18 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
* Make sure head and tail are sane since they
* might be user writable.
*/
- old_wc = cq->queue;
- head = old_wc->head;
+ if (u_wc) {
+ old_u_wc = cq->queue;
+ head = RDMA_READ_UAPI_ATOMIC(old_u_wc->head);
+ tail = RDMA_READ_UAPI_ATOMIC(old_u_wc->tail);
+ } else {
+ old_k_wc = cq->kqueue;
+ head = old_k_wc->head;
+ tail = old_k_wc->tail;
+ }
+
if (head > (u32)cq->ibcq.cqe)
head = (u32)cq->ibcq.cqe;
- tail = old_wc->tail;
if (tail > (u32)cq->ibcq.cqe)
tail = (u32)cq->ibcq.cqe;
if (head < tail)
@@ -417,27 +444,36 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
goto bail_unlock;
}
for (n = 0; tail != head; n++) {
- if (cq->ip)
- wc->uqueue[n] = old_wc->uqueue[tail];
+ if (u_wc)
+ u_wc->uqueue[n] = old_u_wc->uqueue[tail];
else
- wc->kqueue[n] = old_wc->kqueue[tail];
+ k_wc->kqueue[n] = old_k_wc->kqueue[tail];
if (tail == (u32)cq->ibcq.cqe)
tail = 0;
else
tail++;
}
cq->ibcq.cqe = cqe;
- wc->head = n;
- wc->tail = 0;
- cq->queue = wc;
+ if (u_wc) {
+ RDMA_WRITE_UAPI_ATOMIC(u_wc->head, n);
+ RDMA_WRITE_UAPI_ATOMIC(u_wc->tail, 0);
+ cq->queue = u_wc;
+ } else {
+ k_wc->head = n;
+ k_wc->tail = 0;
+ cq->kqueue = k_wc;
+ }
spin_unlock_irq(&cq->lock);
- vfree(old_wc);
+ if (u_wc)
+ vfree(old_u_wc);
+ else
+ vfree(old_k_wc);
if (cq->ip) {
struct rvt_mmap_info *ip = cq->ip;
- rvt_update_mmap_info(rdi, ip, sz, wc);
+ rvt_update_mmap_info(rdi, ip, sz, u_wc);
/*
* Return the offset to mmap.
@@ -461,7 +497,9 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
bail_unlock:
spin_unlock_irq(&cq->lock);
bail_free:
- vfree(wc);
+ vfree(u_wc);
+ vfree(k_wc);
+
return ret;
}
@@ -479,7 +517,7 @@ bail_free:
int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
{
struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
- struct rvt_cq_wc *wc;
+ struct rvt_k_cq_wc *wc;
unsigned long flags;
int npolled;
u32 tail;
@@ -490,7 +528,7 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
spin_lock_irqsave(&cq->lock, flags);
- wc = cq->queue;
+ wc = cq->kqueue;
tail = wc->tail;
if (tail > (u32)cq->ibcq.cqe)
tail = (u32)cq->ibcq.cqe;
diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h
index 3ad6faf18ecb..5e26a2eb19a4 100644
--- a/drivers/infiniband/sw/rdmavt/cq.h
+++ b/drivers/infiniband/sw/rdmavt/cq.h
@@ -51,10 +51,9 @@
#include <rdma/rdma_vt.h>
#include <rdma/rdmavt_cq.h>
-struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
- const struct ib_cq_init_attr *attr,
- struct ib_udata *udata);
-int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+ struct ib_udata *udata);
+void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index f48240f66b8f..a6a39f01dca3 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -562,8 +562,7 @@ int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
if (ret)
goto out;
rvt_deinit_mregion(&mr->mr);
- if (mr->umem)
- ib_umem_release(mr->umem);
+ ib_umem_release(mr->umem);
kfree(mr);
out:
return ret;
@@ -613,8 +612,8 @@ static int rvt_set_page(struct ib_mr *ibmr, u64 addr)
n = mapped_segs % RVT_SEGSZ;
mr->mr.map[m]->segs[n].vaddr = (void *)addr;
mr->mr.map[m]->segs[n].length = ps;
- trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps);
mr->mr.length += ps;
+ trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps);
return 0;
}
@@ -643,6 +642,7 @@ int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
mr->mr.iova = ibmr->iova;
mr->mr.offset = ibmr->iova - (u64)mr->mr.map[0]->segs[0].vaddr;
mr->mr.length = (size_t)ibmr->length;
+ trace_rvt_map_mr_sg(ibmr, sg_nents, sg_offset);
return ret;
}
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index c5a50614a6c6..0b0a241c57ff 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2016 - 2018 Intel Corporation.
+ * Copyright(c) 2016 - 2019 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -58,6 +58,8 @@
#include "vt.h"
#include "trace.h"
+#define RVT_RWQ_COUNT_THRESHOLD 16
+
static void rvt_rc_timeout(struct timer_list *t);
/*
@@ -803,6 +805,47 @@ static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp)
}
/**
+ * rvt_alloc_rq - allocate memory for user or kernel buffer
+ * @rq: receive queue data structure
+ * @size: size in bytes of the receive work queue entries
+ * @node: The NUMA node
+ * @udata: user data; when non-NULL the queue is allocated with vmalloc_user()
+ *
+ * This function is used by both shared receive
+ * queues and non-shared receive queues to allocate
+ * memory.
+ *
+ * Return: 0 on success, -ENOMEM if memory allocation failed
+ */
+int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
+ struct ib_udata *udata)
+{
+ if (udata) {
+ rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size);
+ if (!rq->wq)
+ goto bail;
+ /* need kwq with no buffers */
+ rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node);
+ if (!rq->kwq)
+ goto bail;
+ rq->kwq->curr_wq = rq->wq->wq;
+ } else {
+ /* need kwq with buffers */
+ rq->kwq =
+ vzalloc_node(sizeof(struct rvt_krwq) + size, node);
+ if (!rq->kwq)
+ goto bail;
+ rq->kwq->curr_wq = rq->kwq->wq;
+ }
+
+ spin_lock_init(&rq->kwq->p_lock);
+ spin_lock_init(&rq->kwq->c_lock);
+ return 0;
+bail:
+ rvt_free_rq(rq);
+ return -ENOMEM;
+}
+
+/**
* rvt_init_qp - initialize the QP state to the reset state
* @qp: the QP to init or reinit
* @type: the QP type
@@ -852,10 +895,8 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp,
qp->s_tail_ack_queue = 0;
qp->s_acked_ack_queue = 0;
qp->s_num_rd_atomic = 0;
- if (qp->r_rq.wq) {
- qp->r_rq.wq->head = 0;
- qp->r_rq.wq->tail = 0;
- }
+ if (qp->r_rq.kwq)
+ qp->r_rq.kwq->count = qp->r_rq.size;
qp->r_sge.num_sge = 0;
atomic_set(&qp->s_reserved_used, 0);
}
@@ -928,6 +969,61 @@ static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn)
}
/**
+ * get_allowed_ops - Given a QP type return the appropriate allowed OP
+ * @type: valid, supported, QP type
+ *
+ * Return: IB_OPCODE_RC for IB_QPT_RC, IB_OPCODE_UC for IB_QPT_UC and
+ * IB_OPCODE_UD for every other QP type.
+ */
+static u8 get_allowed_ops(enum ib_qp_type type)
+{
+ return type == IB_QPT_RC ? IB_OPCODE_RC : type == IB_QPT_UC ?
+ IB_OPCODE_UC : IB_OPCODE_UD;
+}
+
+/**
+ * free_ud_wq_attr - Clean up AH attribute cache for UD QPs
+ * @qp: Valid QP with allowed_ops set
+ *
+ * The rvt_swqe data structure being used is a union, so this is
+ * only valid for UD QPs.
+ *
+ * For a QP whose allowed_ops is not IB_OPCODE_UD the loop body never
+ * runs and nothing is freed.
+ */
+static void free_ud_wq_attr(struct rvt_qp *qp)
+{
+ struct rvt_swqe *wqe;
+ int i;
+
+ for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
+ wqe = rvt_get_swqe_ptr(qp, i);
+ kfree(wqe->ud_wr.attr);
+ /* leave no stale pointer behind in the slot */
+ wqe->ud_wr.attr = NULL;
+ }
+}
+
+/**
+ * alloc_ud_wq_attr - AH attribute cache for UD QPs
+ * @qp: Valid QP with allowed_ops set
+ * @node: Numa node for allocation
+ *
+ * The rvt_swqe data structure being used is a union, so this is
+ * only valid for UD QPs.
+ *
+ * Return: 0 on success (including QPs that are not UD, for which nothing
+ * is allocated), -ENOMEM if an allocation fails.
+ */
+static int alloc_ud_wq_attr(struct rvt_qp *qp, int node)
+{
+ struct rvt_swqe *wqe;
+ int i;
+
+ for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) {
+ wqe = rvt_get_swqe_ptr(qp, i);
+ wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr),
+ GFP_KERNEL, node);
+ if (!wqe->ud_wr.attr) {
+ /* undo the allocations made for the earlier entries */
+ free_ud_wq_attr(qp);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+/**
* rvt_create_qp - create a queue pair for a device
* @ibpd: the protection domain who's device we create the queue pair for
* @init_attr: the attributes of the queue pair
@@ -989,9 +1085,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
case IB_QPT_UC:
case IB_QPT_RC:
case IB_QPT_UD:
- sz = sizeof(struct rvt_sge) *
- init_attr->cap.max_send_sge +
- sizeof(struct rvt_swqe);
+ sz = struct_size(swq, sg_list, init_attr->cap.max_send_sge);
swq = vzalloc_node(array_size(sz, sqsize), rdi->dparms.node);
if (!swq)
return ERR_PTR(-ENOMEM);
@@ -1011,6 +1105,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
rdi->dparms.node);
if (!qp)
goto bail_swq;
+ qp->allowed_ops = get_allowed_ops(init_attr->qp_type);
RCU_INIT_POINTER(qp->next, NULL);
if (init_attr->qp_type == IB_QPT_RC) {
@@ -1048,17 +1143,12 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
sizeof(struct rvt_rwqe);
- if (udata)
- qp->r_rq.wq = vmalloc_user(
- sizeof(struct rvt_rwq) +
- qp->r_rq.size * sz);
- else
- qp->r_rq.wq = vzalloc_node(
- sizeof(struct rvt_rwq) +
- qp->r_rq.size * sz,
- rdi->dparms.node);
- if (!qp->r_rq.wq)
+ err = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz,
+ rdi->dparms.node, udata);
+ if (err) {
+ ret = ERR_PTR(err);
goto bail_driver_priv;
+ }
}
/*
@@ -1068,7 +1158,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
spin_lock_init(&qp->r_lock);
spin_lock_init(&qp->s_hlock);
spin_lock_init(&qp->s_lock);
- spin_lock_init(&qp->r_rq.lock);
atomic_set(&qp->refcount, 0);
atomic_set(&qp->local_ops_pending, 0);
init_waitqueue_head(&qp->wait);
@@ -1080,6 +1169,11 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
qp->s_max_sge = init_attr->cap.max_send_sge;
if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
qp->s_flags = RVT_S_SIGNAL_REQ_WR;
+ err = alloc_ud_wq_attr(qp, rdi->dparms.node);
+ if (err) {
+ ret = (ERR_PTR(err));
+ goto bail_driver_priv;
+ }
err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table,
init_attr->qp_type,
@@ -1172,28 +1266,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
ret = &qp->ibqp;
- /*
- * We have our QP and its good, now keep track of what types of opcodes
- * can be processed on this QP. We do this by keeping track of what the
- * 3 high order bits of the opcode are.
- */
- switch (init_attr->qp_type) {
- case IB_QPT_SMI:
- case IB_QPT_GSI:
- case IB_QPT_UD:
- qp->allowed_ops = IB_OPCODE_UD;
- break;
- case IB_QPT_RC:
- qp->allowed_ops = IB_OPCODE_RC;
- break;
- case IB_QPT_UC:
- qp->allowed_ops = IB_OPCODE_UC;
- break;
- default:
- ret = ERR_PTR(-EINVAL);
- goto bail_ip;
- }
-
return ret;
bail_ip:
@@ -1204,8 +1276,8 @@ bail_qpn:
rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num);
bail_rq_wq:
- if (!qp->ip)
- vfree(qp->r_rq.wq);
+ rvt_free_rq(&qp->r_rq);
+ free_ud_wq_attr(qp);
bail_driver_priv:
rdi->driver_f.qp_priv_free(rdi, qp);
@@ -1271,19 +1343,26 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
}
wc.status = IB_WC_WR_FLUSH_ERR;
- if (qp->r_rq.wq) {
- struct rvt_rwq *wq;
+ if (qp->r_rq.kwq) {
u32 head;
u32 tail;
-
- spin_lock(&qp->r_rq.lock);
-
+ struct rvt_rwq *wq = NULL;
+ struct rvt_krwq *kwq = NULL;
+
+ spin_lock(&qp->r_rq.kwq->c_lock);
+ /* qp->ip used to validate if there is a user buffer mmaped */
+ if (qp->ip) {
+ wq = qp->r_rq.wq;
+ head = RDMA_READ_UAPI_ATOMIC(wq->head);
+ tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
+ } else {
+ kwq = qp->r_rq.kwq;
+ head = kwq->head;
+ tail = kwq->tail;
+ }
/* sanity check pointers before trusting them */
- wq = qp->r_rq.wq;
- head = wq->head;
if (head >= qp->r_rq.size)
head = 0;
- tail = wq->tail;
if (tail >= qp->r_rq.size)
tail = 0;
while (tail != head) {
@@ -1292,9 +1371,11 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err)
tail = 0;
rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
}
- wq->tail = tail;
-
- spin_unlock(&qp->r_rq.lock);
+ if (qp->ip)
+ RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
+ else
+ kwq->tail = tail;
+ spin_unlock(&qp->r_rq.kwq->c_lock);
} else if (qp->ibqp.event_handler) {
ret = 1;
}
@@ -1636,12 +1717,12 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
if (qp->ip)
kref_put(&qp->ip->ref, rvt_release_mmap_info);
- else
- vfree(qp->r_rq.wq);
+ kvfree(qp->r_rq.kwq);
rdi->driver_f.qp_priv_free(rdi, qp);
kfree(qp->s_ack_queue);
rdma_destroy_ah_attr(&qp->remote_ah_attr);
rdma_destroy_ah_attr(&qp->alt_ah_attr);
+ free_ud_wq_attr(qp);
vfree(qp->s_wq);
kfree(qp);
return 0;
@@ -1723,7 +1804,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
const struct ib_recv_wr **bad_wr)
{
struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
- struct rvt_rwq *wq = qp->r_rq.wq;
+ struct rvt_krwq *wq = qp->r_rq.kwq;
unsigned long flags;
int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) &&
!qp->ibqp.srq;
@@ -1744,12 +1825,12 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
return -EINVAL;
}
- spin_lock_irqsave(&qp->r_rq.lock, flags);
+ spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags);
next = wq->head + 1;
if (next >= qp->r_rq.size)
next = 0;
- if (next == wq->tail) {
- spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ if (next == READ_ONCE(wq->tail)) {
+ spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
*bad_wr = wr;
return -ENOMEM;
}
@@ -1766,16 +1847,18 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head);
wqe->wr_id = wr->wr_id;
wqe->num_sge = wr->num_sge;
- for (i = 0; i < wr->num_sge; i++)
- wqe->sg_list[i] = wr->sg_list[i];
+ for (i = 0; i < wr->num_sge; i++) {
+ wqe->sg_list[i].addr = wr->sg_list[i].addr;
+ wqe->sg_list[i].length = wr->sg_list[i].length;
+ wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
+ }
/*
* Make sure queue entry is written
* before the head index.
*/
- smp_wmb();
- wq->head = next;
+ smp_store_release(&wq->head, next);
}
- spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags);
}
return 0;
}
@@ -1856,10 +1939,9 @@ static inline int rvt_qp_is_avail(
/* see rvt_qp_wqe_unreserve() */
smp_mb__before_atomic();
- reserved_used = atomic_read(&qp->s_reserved_used);
if (unlikely(reserved_op)) {
/* see rvt_qp_wqe_unreserve() */
- smp_mb__before_atomic();
+ reserved_used = atomic_read(&qp->s_reserved_used);
if (reserved_used >= rdi->dparms.reserved_operations)
return -ENOMEM;
return 0;
@@ -1867,14 +1949,13 @@ static inline int rvt_qp_is_avail(
/* non-reserved operations */
if (likely(qp->s_avail))
return 0;
- slast = READ_ONCE(qp->s_last);
+ /* See rvt_qp_complete_swqe() */
+ slast = smp_load_acquire(&qp->s_last);
if (qp->s_head >= slast)
avail = qp->s_size - (qp->s_head - slast);
else
avail = slast - qp->s_head;
- /* see rvt_qp_wqe_unreserve() */
- smp_mb__before_atomic();
reserved_used = atomic_read(&qp->s_reserved_used);
avail = avail - 1 -
(rdi->dparms.reserved_operations - reserved_used);
@@ -2011,10 +2092,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
*/
log_pmtu = qp->log_pmtu;
if (qp->allowed_ops == IB_OPCODE_UD) {
- struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);
+ struct rvt_ah *ah = rvt_get_swqe_ah(wqe);
log_pmtu = ah->log_pmtu;
- atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
+ rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr);
}
if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) {
@@ -2059,7 +2140,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
bail_inval_free_ref:
if (qp->allowed_ops == IB_OPCODE_UD)
- atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
+ rdma_destroy_ah_attr(wqe->ud_wr.attr);
bail_inval_free:
/* release mr holds */
while (j) {
@@ -2145,7 +2226,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
const struct ib_recv_wr **bad_wr)
{
struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
- struct rvt_rwq *wq;
+ struct rvt_krwq *wq;
unsigned long flags;
for (; wr; wr = wr->next) {
@@ -2158,13 +2239,13 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
return -EINVAL;
}
- spin_lock_irqsave(&srq->rq.lock, flags);
- wq = srq->rq.wq;
+ spin_lock_irqsave(&srq->rq.kwq->p_lock, flags);
+ wq = srq->rq.kwq;
next = wq->head + 1;
if (next >= srq->rq.size)
next = 0;
- if (next == wq->tail) {
- spin_unlock_irqrestore(&srq->rq.lock, flags);
+ if (next == READ_ONCE(wq->tail)) {
+ spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
*bad_wr = wr;
return -ENOMEM;
}
@@ -2172,17 +2253,35 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head);
wqe->wr_id = wr->wr_id;
wqe->num_sge = wr->num_sge;
- for (i = 0; i < wr->num_sge; i++)
- wqe->sg_list[i] = wr->sg_list[i];
+ for (i = 0; i < wr->num_sge; i++) {
+ wqe->sg_list[i].addr = wr->sg_list[i].addr;
+ wqe->sg_list[i].length = wr->sg_list[i].length;
+ wqe->sg_list[i].lkey = wr->sg_list[i].lkey;
+ }
/* Make sure queue entry is written before the head index. */
- smp_wmb();
- wq->head = next;
- spin_unlock_irqrestore(&srq->rq.lock, flags);
+ smp_store_release(&wq->head, next);
+ spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags);
}
return 0;
}
/*
+ * rvt used the internal kernel struct as part of its ABI, for now make sure
+ * the kernel struct does not change layout. FIXME: rvt should never cast the
+ * user struct to a kernel struct.
+ */
+static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge)
+{
+ BUILD_BUG_ON(offsetof(struct ib_sge, addr) !=
+ offsetof(struct rvt_wqe_sge, addr));
+ BUILD_BUG_ON(offsetof(struct ib_sge, length) !=
+ offsetof(struct rvt_wqe_sge, length));
+ BUILD_BUG_ON(offsetof(struct ib_sge, lkey) !=
+ offsetof(struct rvt_wqe_sge, lkey));
+ return (struct ib_sge *)sge;
+}
+
+/*
* Validate a RWQE and fill in the SGE state.
* Return 1 if OK.
*/
@@ -2205,7 +2304,7 @@ static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
continue;
/* Check LKEY */
ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
- NULL, &wqe->sg_list[i],
+ NULL, rvt_cast_sge(&wqe->sg_list[i]),
IB_ACCESS_LOCAL_WRITE);
if (unlikely(ret <= 0))
goto bad_lkey;
@@ -2234,6 +2333,50 @@ bad_lkey:
}
/**
+ * get_count - count the number of request work queue entries
+ * in the circular buffer
+ * @rq: data structure for request queue entry
+ * @tail: tail index of the circular buffer
+ * @head: head index of the circular buffer
+ *
+ * Return - total number of entries in the circular buffer
+ */
+static u32 get_count(struct rvt_rq *rq, u32 tail, u32 head)
+{
+ u32 count;
+
+ count = head;
+
+ if (count >= rq->size)
+ count = 0;
+ if (count < tail)
+ count += rq->size - tail;
+ else
+ count -= tail;
+
+ return count;
+}
+
+/**
+ * get_rvt_head - get the head index of the circular buffer
+ * @rq: data structure for request queue entry
+ * @ip: mmap info of the QP or SRQ; if non-NULL, read the user-shared wq head
+ *
+ * Return - head index value
+ */
+static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip)
+{
+ u32 head;
+
+ if (ip)
+ head = RDMA_READ_UAPI_ATOMIC(rq->wq->head);
+ else
+ head = rq->kwq->head;
+
+ return head;
+}
+
+/**
* rvt_get_rwqe - copy the next RWQE into the QP's RWQE
* @qp: the QP
* @wr_id_only: update qp->r_wr_id only, not qp->r_sge
@@ -2247,39 +2390,54 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
{
unsigned long flags;
struct rvt_rq *rq;
+ struct rvt_krwq *kwq = NULL;
struct rvt_rwq *wq;
struct rvt_srq *srq;
struct rvt_rwqe *wqe;
void (*handler)(struct ib_event *, void *);
u32 tail;
+ u32 head;
int ret;
+ void *ip = NULL;
if (qp->ibqp.srq) {
srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
handler = srq->ibsrq.event_handler;
rq = &srq->rq;
+ ip = srq->ip;
} else {
srq = NULL;
handler = NULL;
rq = &qp->r_rq;
+ ip = qp->ip;
}
- spin_lock_irqsave(&rq->lock, flags);
+ spin_lock_irqsave(&rq->kwq->c_lock, flags);
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
ret = 0;
goto unlock;
}
+ kwq = rq->kwq;
+ if (ip) {
+ wq = rq->wq;
+ tail = RDMA_READ_UAPI_ATOMIC(wq->tail);
+ } else {
+ tail = kwq->tail;
+ }
- wq = rq->wq;
- tail = wq->tail;
/* Validate tail before using it since it is user writable. */
if (tail >= rq->size)
tail = 0;
- if (unlikely(tail == wq->head)) {
+
+ if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) {
+ head = get_rvt_head(rq, ip);
+ kwq->count = get_count(rq, tail, head);
+ }
+ if (unlikely(kwq->count == 0)) {
ret = 0;
goto unlock;
}
- /* Make sure entry is read after head index is read. */
+ /* Make sure entry is read after the count is read. */
smp_rmb();
wqe = rvt_get_rwqe_ptr(rq, tail);
/*
@@ -2289,43 +2447,41 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only)
*/
if (++tail >= rq->size)
tail = 0;
- wq->tail = tail;
+ if (ip)
+ RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail);
+ else
+ kwq->tail = tail;
if (!wr_id_only && !init_sge(qp, wqe)) {
ret = -1;
goto unlock;
}
qp->r_wr_id = wqe->wr_id;
+ kwq->count--;
ret = 1;
set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
if (handler) {
- u32 n;
-
/*
* Validate head pointer value and compute
* the number of remaining WQEs.
*/
- n = wq->head;
- if (n >= rq->size)
- n = 0;
- if (n < tail)
- n += rq->size - tail;
- else
- n -= tail;
- if (n < srq->limit) {
- struct ib_event ev;
-
- srq->limit = 0;
- spin_unlock_irqrestore(&rq->lock, flags);
- ev.device = qp->ibqp.device;
- ev.element.srq = qp->ibqp.srq;
- ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
- handler(&ev, srq->ibsrq.srq_context);
- goto bail;
+ if (kwq->count < srq->limit) {
+ kwq->count = get_count(rq, tail, get_rvt_head(rq, ip));
+ if (kwq->count < srq->limit) {
+ struct ib_event ev;
+
+ srq->limit = 0;
+ spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
+ ev.device = qp->ibqp.device;
+ ev.element.srq = qp->ibqp.srq;
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ handler(&ev, srq->ibsrq.srq_context);
+ goto bail;
+ }
}
}
unlock:
- spin_unlock_irqrestore(&rq->lock, flags);
+ spin_unlock_irqrestore(&rq->kwq->c_lock, flags);
bail:
return ret;
}
@@ -2667,27 +2823,16 @@ void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
enum ib_wc_status status)
{
u32 old_last, last;
- struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ struct rvt_dev_info *rdi;
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
return;
+ rdi = ib_to_rvt(qp->ibqp.device);
- last = qp->s_last;
- old_last = last;
- trace_rvt_qp_send_completion(qp, wqe, last);
- if (++last >= qp->s_size)
- last = 0;
- trace_rvt_qp_send_completion(qp, wqe, last);
- qp->s_last = last;
- /* See post_send() */
- barrier();
- rvt_put_qp_swqe(qp, wqe);
-
- rvt_qp_swqe_complete(qp,
- wqe,
- rdi->wc_opcode[wqe->wr.opcode],
- status);
-
+ old_last = qp->s_last;
+ trace_rvt_qp_send_completion(qp, wqe, old_last);
+ last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode],
+ status);
if (qp->s_acked == old_last)
qp->s_acked = last;
if (qp->s_cur == old_last)
@@ -3021,8 +3166,7 @@ do_write:
wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
wc.port_num = 1;
/* Signal completion event if the solicited bit is set. */
- rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
- wqe->wr.send_flags & IB_SEND_SOLICITED);
+ rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED);
send_comp:
spin_unlock_irqrestore(&qp->r_lock, flags);
diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h
index 6db1619389b0..2cdba1283bf6 100644
--- a/drivers/infiniband/sw/rdmavt/qp.h
+++ b/drivers/infiniband/sw/rdmavt/qp.h
@@ -68,4 +68,6 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
const struct ib_recv_wr **bad_wr);
int rvt_wss_init(struct rvt_dev_info *rdi);
void rvt_wss_exit(struct rvt_dev_info *rdi);
+int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node,
+ struct ib_udata *udata);
#endif /* DEF_RVTQP_H */
diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c
index 09f0cf538be6..890d7b760d2e 100644
--- a/drivers/infiniband/sw/rdmavt/rc.c
+++ b/drivers/infiniband/sw/rdmavt/rc.c
@@ -104,26 +104,33 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp)
} else {
u32 min, max, x;
u32 credits;
- struct rvt_rwq *wq = qp->r_rq.wq;
u32 head;
u32 tail;
- /* sanity check pointers before trusting them */
- head = wq->head;
- if (head >= qp->r_rq.size)
- head = 0;
- tail = wq->tail;
- if (tail >= qp->r_rq.size)
- tail = 0;
- /*
- * Compute the number of credits available (RWQEs).
- * There is a small chance that the pair of reads are
- * not atomic, which is OK, since the fuzziness is
- * resolved as further ACKs go out.
- */
- credits = head - tail;
- if ((int)credits < 0)
- credits += qp->r_rq.size;
+ credits = READ_ONCE(qp->r_rq.kwq->count);
+ if (credits == 0) {
+ /* sanity check pointers before trusting them */
+ if (qp->ip) {
+ head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head);
+ tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail);
+ } else {
+ head = READ_ONCE(qp->r_rq.kwq->head);
+ tail = READ_ONCE(qp->r_rq.kwq->tail);
+ }
+ if (head >= qp->r_rq.size)
+ head = 0;
+ if (tail >= qp->r_rq.size)
+ tail = 0;
+ /*
+ * Compute the number of credits available (RWQEs).
+ * There is a small chance that the pair of reads is
+ * not atomic, which is OK, since the fuzziness is
+ * resolved as further ACKs go out.
+ */
+ credits = head - tail;
+ if ((int)credits < 0)
+ credits += qp->r_rq.size;
+ }
/*
* Binary search the credit table to find the code to
* use.
diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c
index 8d6b3e764255..24fef021d51d 100644
--- a/drivers/infiniband/sw/rdmavt/srq.c
+++ b/drivers/infiniband/sw/rdmavt/srq.c
@@ -52,7 +52,7 @@
#include "srq.h"
#include "vt.h"
-
+#include "qp.h"
/**
* rvt_driver_srq_init - init srq resources on a per driver basis
* @rdi: rvt dev structure
@@ -97,11 +97,8 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr,
srq->rq.max_sge = srq_init_attr->attr.max_sge;
sz = sizeof(struct ib_sge) * srq->rq.max_sge +
sizeof(struct rvt_rwqe);
- srq->rq.wq = udata ?
- vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz) :
- vzalloc_node(sizeof(struct rvt_rwq) + srq->rq.size * sz,
- dev->dparms.node);
- if (!srq->rq.wq) {
+ if (rvt_alloc_rq(&srq->rq, srq->rq.size * sz,
+ dev->dparms.node, udata)) {
ret = -ENOMEM;
goto bail_srq;
}
@@ -152,7 +149,7 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr,
bail_ip:
kfree(srq->ip);
bail_wq:
- vfree(srq->rq.wq);
+ rvt_free_rq(&srq->rq);
bail_srq:
return ret;
}
@@ -172,11 +169,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
{
struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device);
- struct rvt_rwq *wq;
+ struct rvt_rq tmp_rq = {};
int ret = 0;
if (attr_mask & IB_SRQ_MAX_WR) {
- struct rvt_rwq *owq;
+ struct rvt_krwq *okwq = NULL;
+ struct rvt_rwq *owq = NULL;
struct rvt_rwqe *p;
u32 sz, size, n, head, tail;
@@ -185,17 +183,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
((attr_mask & IB_SRQ_LIMIT) ?
attr->srq_limit : srq->limit) > attr->max_wr)
return -EINVAL;
-
sz = sizeof(struct rvt_rwqe) +
srq->rq.max_sge * sizeof(struct ib_sge);
size = attr->max_wr + 1;
- wq = udata ?
- vmalloc_user(sizeof(struct rvt_rwq) + size * sz) :
- vzalloc_node(sizeof(struct rvt_rwq) + size * sz,
- dev->dparms.node);
- if (!wq)
+ if (rvt_alloc_rq(&tmp_rq, size * sz, dev->dparms.node,
+ udata))
return -ENOMEM;
-
/* Check that we can write the offset to mmap. */
if (udata && udata->inlen >= sizeof(__u64)) {
__u64 offset_addr;
@@ -213,14 +206,20 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
goto bail_free;
}
- spin_lock_irq(&srq->rq.lock);
+ spin_lock_irq(&srq->rq.kwq->c_lock);
/*
* validate head and tail pointer values and compute
* the number of remaining WQEs.
*/
- owq = srq->rq.wq;
- head = owq->head;
- tail = owq->tail;
+ if (udata) {
+ owq = srq->rq.wq;
+ head = RDMA_READ_UAPI_ATOMIC(owq->head);
+ tail = RDMA_READ_UAPI_ATOMIC(owq->tail);
+ } else {
+ okwq = srq->rq.kwq;
+ head = okwq->head;
+ tail = okwq->tail;
+ }
if (head >= srq->rq.size || tail >= srq->rq.size) {
ret = -EINVAL;
goto bail_unlock;
@@ -235,7 +234,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
goto bail_unlock;
}
n = 0;
- p = wq->wq;
+ p = tmp_rq.kwq->curr_wq;
while (tail != head) {
struct rvt_rwqe *wqe;
int i;
@@ -250,22 +249,29 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
if (++tail >= srq->rq.size)
tail = 0;
}
- srq->rq.wq = wq;
+ srq->rq.kwq = tmp_rq.kwq;
+ if (udata) {
+ srq->rq.wq = tmp_rq.wq;
+ RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->head, n);
+ RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->tail, 0);
+ } else {
+ tmp_rq.kwq->head = n;
+ tmp_rq.kwq->tail = 0;
+ }
srq->rq.size = size;
- wq->head = n;
- wq->tail = 0;
if (attr_mask & IB_SRQ_LIMIT)
srq->limit = attr->srq_limit;
- spin_unlock_irq(&srq->rq.lock);
+ spin_unlock_irq(&srq->rq.kwq->c_lock);
vfree(owq);
+ kvfree(okwq);
if (srq->ip) {
struct rvt_mmap_info *ip = srq->ip;
struct rvt_dev_info *dev = ib_to_rvt(srq->ibsrq.device);
u32 s = sizeof(struct rvt_rwq) + size * sz;
- rvt_update_mmap_info(dev, ip, s, wq);
+ rvt_update_mmap_info(dev, ip, s, tmp_rq.wq);
/*
* Return the offset to mmap.
@@ -289,19 +295,19 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
spin_unlock_irq(&dev->pending_lock);
}
} else if (attr_mask & IB_SRQ_LIMIT) {
- spin_lock_irq(&srq->rq.lock);
+ spin_lock_irq(&srq->rq.kwq->c_lock);
if (attr->srq_limit >= srq->rq.size)
ret = -EINVAL;
else
srq->limit = attr->srq_limit;
- spin_unlock_irq(&srq->rq.lock);
+ spin_unlock_irq(&srq->rq.kwq->c_lock);
}
return ret;
bail_unlock:
- spin_unlock_irq(&srq->rq.lock);
+ spin_unlock_irq(&srq->rq.kwq->c_lock);
bail_free:
- vfree(wq);
+ rvt_free_rq(&tmp_rq);
return ret;
}
@@ -336,6 +342,5 @@ void rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
spin_unlock(&dev->n_srqs_lock);
if (srq->ip)
kref_put(&srq->ip->ref, rvt_release_mmap_info);
- else
- vfree(srq->rq.wq);
+ kvfree(srq->rq.kwq);
}
diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h
index 976e482930a3..95b8a0e3b8bd 100644
--- a/drivers/infiniband/sw/rdmavt/trace_mr.h
+++ b/drivers/infiniband/sw/rdmavt/trace_mr.h
@@ -54,6 +54,8 @@
#include <rdma/rdma_vt.h>
#include <rdma/rdmavt_mr.h>
+#include "mr.h"
+
#undef TRACE_SYSTEM
#define TRACE_SYSTEM rvt_mr
DECLARE_EVENT_CLASS(
@@ -64,8 +66,12 @@ DECLARE_EVENT_CLASS(
RDI_DEV_ENTRY(ib_to_rvt(mr->pd->device))
__field(void *, vaddr)
__field(struct page *, page)
+ __field(u64, iova)
+ __field(u64, user_base)
__field(size_t, len)
+ __field(size_t, length)
__field(u32, lkey)
+ __field(u32, offset)
__field(u16, m)
__field(u16, n)
),
@@ -73,18 +79,28 @@ DECLARE_EVENT_CLASS(
RDI_DEV_ASSIGN(ib_to_rvt(mr->pd->device));
__entry->vaddr = v;
__entry->page = virt_to_page(v);
+ __entry->iova = mr->iova;
+ __entry->user_base = mr->user_base;
+ __entry->lkey = mr->lkey;
__entry->m = m;
__entry->n = n;
__entry->len = len;
+ __entry->length = mr->length;
+ __entry->offset = mr->offset;
),
TP_printk(
- "[%s] vaddr %p page %p m %u n %u len %ld",
+ "[%s] lkey %x iova %llx user_base %llx mr_len %lu vaddr %llx page %p m %u n %u len %lu off %u",
__get_str(dev),
- __entry->vaddr,
+ __entry->lkey,
+ __entry->iova,
+ __entry->user_base,
+ __entry->length,
+ (unsigned long long)__entry->vaddr,
__entry->page,
__entry->m,
__entry->n,
- __entry->len
+ __entry->len,
+ __entry->offset
)
);
@@ -165,6 +181,40 @@ DEFINE_EVENT(
TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge),
TP_ARGS(sge, isge));
+TRACE_EVENT(
+ rvt_map_mr_sg,
+ TP_PROTO(struct ib_mr *ibmr, int sg_nents, unsigned int *sg_offset),
+ TP_ARGS(ibmr, sg_nents, sg_offset),
+ TP_STRUCT__entry(
+ RDI_DEV_ENTRY(ib_to_rvt(to_imr(ibmr)->mr.pd->device))
+ __field(u64, iova)
+ __field(u64, ibmr_iova)
+ __field(u64, user_base)
+ __field(u64, ibmr_length)
+ __field(int, sg_nents)
+ __field(uint, sg_offset)
+ ),
+ TP_fast_assign(
+ RDI_DEV_ASSIGN(ib_to_rvt(to_imr(ibmr)->mr.pd->device))
+ __entry->ibmr_iova = ibmr->iova;
+ __entry->iova = to_imr(ibmr)->mr.iova;
+ __entry->user_base = to_imr(ibmr)->mr.user_base;
+ __entry->ibmr_length = to_imr(ibmr)->mr.length;
+ __entry->sg_nents = sg_nents;
+ __entry->sg_offset = sg_offset ? *sg_offset : 0;
+ ),
+ TP_printk(
+ "[%s] ibmr_iova %llx iova %llx user_base %llx length %llx sg_nents %d sg_offset %u",
+ __get_str(dev),
+ __entry->ibmr_iova,
+ __entry->iova,
+ __entry->user_base,
+ __entry->ibmr_length,
+ __entry->sg_nents,
+ __entry->sg_offset
+ )
+);
+
#endif /* __RVT_TRACE_MR_H */
#undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c
index 9546a837a8ac..18da1e1ea979 100644
--- a/drivers/infiniband/sw/rdmavt/vt.c
+++ b/drivers/infiniband/sw/rdmavt/vt.c
@@ -382,6 +382,8 @@ enum {
};
static const struct ib_device_ops rvt_dev_ops = {
+ .uverbs_abi_ver = RVT_UVERBS_ABI_VERSION,
+
.alloc_fmr = rvt_alloc_fmr,
.alloc_mr = rvt_alloc_mr,
.alloc_pd = rvt_alloc_pd,
@@ -427,6 +429,7 @@ static const struct ib_device_ops rvt_dev_ops = {
.unmap_fmr = rvt_unmap_fmr,
INIT_RDMA_OBJ_SIZE(ib_ah, rvt_ah, ibah),
+ INIT_RDMA_OBJ_SIZE(ib_cq, rvt_cq, ibcq),
INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd),
INIT_RDMA_OBJ_SIZE(ib_srq, rvt_srq, ibsrq),
INIT_RDMA_OBJ_SIZE(ib_ucontext, rvt_ucontext, ibucontext),
@@ -530,7 +533,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb)
*
* Return: 0 on success otherwise an errno.
*/
-int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id)
+int rvt_register_device(struct rvt_dev_info *rdi)
{
int ret = 0, i;
@@ -600,7 +603,6 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id)
* exactly which functions rdmavt supports, nor do they know the ABI
* version, so we do all of this sort of stuff here.
*/
- rdi->ibdev.uverbs_abi_ver = RVT_UVERBS_ABI_VERSION;
rdi->ibdev.uverbs_cmd_mask =
(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
@@ -636,7 +638,6 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id)
if (!rdi->ibdev.num_comp_vectors)
rdi->ibdev.num_comp_vectors = 1;
- rdi->ibdev.driver_id = driver_id;
/* We are now good to announce we exist */
ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev));
if (ret) {
diff --git a/drivers/infiniband/sw/rdmavt/vt.h b/drivers/infiniband/sw/rdmavt/vt.h
index 0675ea6c3872..d19ff817c2c7 100644
--- a/drivers/infiniband/sw/rdmavt/vt.h
+++ b/drivers/infiniband/sw/rdmavt/vt.h
@@ -78,6 +78,12 @@
fmt, \
##__VA_ARGS__)
+#define rvt_pr_err_ratelimited(rdi, fmt, ...) \
+ __rvt_pr_err_ratelimited((rdi)->driver_f.get_pci_dev(rdi), \
+ rvt_get_ibdev_name(rdi), \
+ fmt, \
+ ##__VA_ARGS__)
+
#define __rvt_pr_info(pdev, name, fmt, ...) \
dev_info(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__)
@@ -87,6 +93,9 @@
#define __rvt_pr_err(pdev, name, fmt, ...) \
dev_err(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__)
+#define __rvt_pr_err_ratelimited(pdev, name, fmt, ...) \
+ dev_err_ratelimited(&(pdev)->dev, "%s: " fmt, name, ##__VA_ARGS__)
+
static inline int ibport_num_to_idx(struct ib_device *ibdev, u8 port_num)
{
struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index 00eb99d3df86..116cafc9afcf 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -558,7 +558,7 @@ int rxe_completer(void *arg)
{
struct rxe_qp *qp = (struct rxe_qp *)arg;
struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
- struct rxe_send_wqe *wqe = wqe;
+ struct rxe_send_wqe *wqe = NULL;
struct sk_buff *skb = NULL;
struct rxe_pkt_info *pkt = NULL;
enum comp_state state;
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index f501f72489d8..ea6a819b7167 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -96,8 +96,7 @@ void rxe_mem_cleanup(struct rxe_pool_entry *arg)
struct rxe_mem *mem = container_of(arg, typeof(*mem), pelem);
int i;
- if (mem->umem)
- ib_umem_release(mem->umem);
+ ib_umem_release(mem->umem);
if (mem->map) {
for (i = 0; i < mem->num_map; i++)
diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c
index 56cf18af016a..fbcbac52290b 100644
--- a/drivers/infiniband/sw/rxe/rxe_pool.c
+++ b/drivers/infiniband/sw/rxe/rxe_pool.c
@@ -72,6 +72,7 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
[RXE_TYPE_CQ] = {
.name = "rxe-cq",
.size = sizeof(struct rxe_cq),
+ .flags = RXE_POOL_NO_ALLOC,
.cleanup = rxe_cq_cleanup,
},
[RXE_TYPE_MR] = {
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index aca9f60f9b21..1cbfbd98eb22 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -431,6 +431,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp,
qp->resp.va = reth_va(pkt);
qp->resp.rkey = reth_rkey(pkt);
qp->resp.resid = reth_len(pkt);
+ qp->resp.length = reth_len(pkt);
}
access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
: IB_ACCESS_REMOTE_WRITE;
@@ -856,7 +857,9 @@ static enum resp_states do_complete(struct rxe_qp *qp,
pkt->mask & RXE_WRITE_MASK) ?
IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
wc->vendor_err = 0;
- wc->byte_len = wqe->dma.length - wqe->dma.resid;
+ wc->byte_len = (pkt->mask & RXE_IMMDT_MASK &&
+ pkt->mask & RXE_WRITE_MASK) ?
+ qp->resp.length : wqe->dma.length - wqe->dma.resid;
/* fields after byte_len are different between kernel and user
* space
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
index 8c3e2a18cfe4..4ebdfcf4d33e 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.c
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
@@ -778,55 +778,43 @@ err1:
return err;
}
-static struct ib_cq *rxe_create_cq(struct ib_device *dev,
- const struct ib_cq_init_attr *attr,
- struct ib_udata *udata)
+static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
+ struct ib_udata *udata)
{
int err;
+ struct ib_device *dev = ibcq->device;
struct rxe_dev *rxe = to_rdev(dev);
- struct rxe_cq *cq;
+ struct rxe_cq *cq = to_rcq(ibcq);
struct rxe_create_cq_resp __user *uresp = NULL;
if (udata) {
if (udata->outlen < sizeof(*uresp))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
uresp = udata->outbuf;
}
if (attr->flags)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector);
if (err)
- goto err1;
-
- cq = rxe_alloc(&rxe->cq_pool);
- if (!cq) {
- err = -ENOMEM;
- goto err1;
- }
+ return err;
err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata,
uresp);
if (err)
- goto err2;
-
- return &cq->ibcq;
+ return err;
-err2:
- rxe_drop_ref(cq);
-err1:
- return ERR_PTR(err);
+ return rxe_add_to_pool(&rxe->cq_pool, &cq->pelem);
}
-static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+static void rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
{
struct rxe_cq *cq = to_rcq(ibcq);
rxe_cq_disable(cq);
rxe_drop_ref(cq);
- return 0;
}
static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
@@ -1111,6 +1099,10 @@ static int rxe_enable_driver(struct ib_device *ib_dev)
}
static const struct ib_device_ops rxe_dev_ops = {
+ .owner = THIS_MODULE,
+ .driver_id = RDMA_DRIVER_RXE,
+ .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION,
+
.alloc_hw_stats = rxe_ib_alloc_hw_stats,
.alloc_mr = rxe_alloc_mr,
.alloc_pd = rxe_alloc_pd,
@@ -1157,6 +1149,7 @@ static const struct ib_device_ops rxe_dev_ops = {
.resize_cq = rxe_resize_cq,
INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah),
+ INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq),
INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd),
INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq),
INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
@@ -1170,7 +1163,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc));
- dev->owner = THIS_MODULE;
dev->node_type = RDMA_NODE_IB_CA;
dev->phys_port_cnt = 1;
dev->num_comp_vectors = num_possible_cpus();
@@ -1182,7 +1174,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
dma_coerce_mask_and_coherent(&dev->dev,
dma_get_required_mask(&dev->dev));
- dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION;
dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT)
| BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)
| BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE)
@@ -1230,7 +1221,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name)
rxe->tfm = tfm;
rdma_set_device_sysfs_group(dev, &rxe_attr_group);
- dev->driver_id = RDMA_DRIVER_RXE;
err = ib_register_device(dev, ibdev_name);
if (err)
pr_warn("%s failed with error %d\n", __func__, err);
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index e8be7f44e3be..5c4b2239129c 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -85,8 +85,8 @@ struct rxe_cqe {
};
struct rxe_cq {
- struct rxe_pool_entry pelem;
struct ib_cq ibcq;
+ struct rxe_pool_entry pelem;
struct rxe_queue *queue;
spinlock_t cq_lock;
u8 notify;
@@ -213,6 +213,7 @@ struct rxe_resp_info {
struct rxe_mem *mr;
u32 resid;
u32 rkey;
+ u32 length;
u64 atomic_orig;
/* SRQ only */
diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig
new file mode 100644
index 000000000000..dace276aea14
--- /dev/null
+++ b/drivers/infiniband/sw/siw/Kconfig
@@ -0,0 +1,18 @@
+config RDMA_SIW
+ tristate "Software RDMA over TCP/IP (iWARP) driver"
+ depends on INET && INFINIBAND && LIBCRC32C && 64BIT
+ select DMA_VIRT_OPS
+ help
+ This driver implements the iWARP RDMA transport over
+ the Linux TCP/IP network stack. It enables a system with a
+ standard Ethernet adapter to interoperate with an iWARP
+ adapter or with another system running the SIW driver.
+ (See also RXE which is a similar software driver for RoCE.)
+
+ The driver interfaces with the Linux RDMA stack and
+ implements both a kernel and user space RDMA verbs API.
+ The user space verbs API requires a support
+ library named libsiw which is loaded by the generic user
+ space verbs API, libibverbs. To implement RDMA over
+ TCP/IP, the driver further interfaces with the Linux
+ in-kernel TCP socket layer.
diff --git a/drivers/infiniband/sw/siw/Makefile b/drivers/infiniband/sw/siw/Makefile
new file mode 100644
index 000000000000..f5f7e3867889
--- /dev/null
+++ b/drivers/infiniband/sw/siw/Makefile
@@ -0,0 +1,11 @@
+obj-$(CONFIG_RDMA_SIW) += siw.o
+
+siw-y := \
+ siw_cm.o \
+ siw_cq.o \
+ siw_main.o \
+ siw_mem.o \
+ siw_qp.o \
+ siw_qp_tx.o \
+ siw_qp_rx.o \
+ siw_verbs.o
diff --git a/drivers/infiniband/sw/siw/iwarp.h b/drivers/infiniband/sw/siw/iwarp.h
new file mode 100644
index 000000000000..e8a04d9c89cb
--- /dev/null
+++ b/drivers/infiniband/sw/siw/iwarp.h
@@ -0,0 +1,380 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _IWARP_H
+#define _IWARP_H
+
+#include <rdma/rdma_user_cm.h> /* RDMA_MAX_PRIVATE_DATA */
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+#define RDMAP_VERSION 1
+#define DDP_VERSION 1
+#define MPA_REVISION_1 1
+#define MPA_REVISION_2 2
+#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA
+#define MPA_KEY_REQ "MPA ID Req Frame"
+#define MPA_KEY_REP "MPA ID Rep Frame"
+#define MPA_IRD_ORD_MASK 0x3fff
+
+struct mpa_rr_params {
+ __be16 bits;
+ __be16 pd_len;
+};
+
+/*
+ * MPA request/response header bits & fields
+ */
+enum {
+ MPA_RR_FLAG_MARKERS = cpu_to_be16(0x8000),
+ MPA_RR_FLAG_CRC = cpu_to_be16(0x4000),
+ MPA_RR_FLAG_REJECT = cpu_to_be16(0x2000),
+ MPA_RR_FLAG_ENHANCED = cpu_to_be16(0x1000),
+ MPA_RR_FLAG_GSO_EXP = cpu_to_be16(0x0800),
+ MPA_RR_MASK_REVISION = cpu_to_be16(0x00ff)
+};
+
+/*
+ * MPA request/reply header
+ */
+struct mpa_rr {
+ __u8 key[16];
+ struct mpa_rr_params params;
+};
+
+static inline void __mpa_rr_set_revision(__be16 *bits, u8 rev)
+{
+ *bits = (*bits & ~MPA_RR_MASK_REVISION) |
+ (cpu_to_be16(rev) & MPA_RR_MASK_REVISION);
+}
+
+static inline u8 __mpa_rr_revision(__be16 mpa_rr_bits)
+{
+ __be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION;
+
+ return be16_to_cpu(rev);
+}
+
+enum mpa_v2_ctrl {
+ MPA_V2_PEER_TO_PEER = cpu_to_be16(0x8000),
+ MPA_V2_ZERO_LENGTH_RTR = cpu_to_be16(0x4000),
+ MPA_V2_RDMA_WRITE_RTR = cpu_to_be16(0x8000),
+ MPA_V2_RDMA_READ_RTR = cpu_to_be16(0x4000),
+ MPA_V2_RDMA_NO_RTR = cpu_to_be16(0x0000),
+ MPA_V2_MASK_IRD_ORD = cpu_to_be16(0x3fff)
+};
+
+struct mpa_v2_data {
+ __be16 ird;
+ __be16 ord;
+};
+
+struct mpa_marker {
+ __be16 rsvd;
+ __be16 fpdu_hmd; /* FPDU header-marker distance (= MPA's FPDUPTR) */
+};
+
+/*
+ * maximum MPA trailer
+ */
+struct mpa_trailer {
+ __u8 pad[4];
+ __be32 crc;
+};
+
+#define MPA_HDR_SIZE 2
+#define MPA_CRC_SIZE 4
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for any FPDU
+ */
+struct iwarp_ctrl {
+ __be16 mpa_len;
+ __be16 ddp_rdmap_ctrl;
+};
+
+/*
+ * DDP/RDMAP Hdr bits & fields
+ */
+enum {
+ DDP_FLAG_TAGGED = cpu_to_be16(0x8000),
+ DDP_FLAG_LAST = cpu_to_be16(0x4000),
+ DDP_MASK_RESERVED = cpu_to_be16(0x3C00),
+ DDP_MASK_VERSION = cpu_to_be16(0x0300),
+ RDMAP_MASK_VERSION = cpu_to_be16(0x00C0),
+ RDMAP_MASK_RESERVED = cpu_to_be16(0x0030),
+ RDMAP_MASK_OPCODE = cpu_to_be16(0x000f)
+};
+
+static inline u8 __ddp_get_version(struct iwarp_ctrl *ctrl)
+{
+ return be16_to_cpu(ctrl->ddp_rdmap_ctrl & DDP_MASK_VERSION) >> 8;
+}
+
+static inline void __ddp_set_version(struct iwarp_ctrl *ctrl, u8 version)
+{
+ ctrl->ddp_rdmap_ctrl =
+ (ctrl->ddp_rdmap_ctrl & ~DDP_MASK_VERSION) |
+ (cpu_to_be16((u16)version << 8) & DDP_MASK_VERSION);
+}
+
+static inline u8 __rdmap_get_version(struct iwarp_ctrl *ctrl)
+{
+ __be16 ver = ctrl->ddp_rdmap_ctrl & RDMAP_MASK_VERSION;
+
+ return be16_to_cpu(ver) >> 6;
+}
+
+static inline void __rdmap_set_version(struct iwarp_ctrl *ctrl, u8 version)
+{
+ ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_VERSION) |
+ (cpu_to_be16(version << 6) & RDMAP_MASK_VERSION);
+}
+
+static inline u8 __rdmap_get_opcode(struct iwarp_ctrl *ctrl)
+{
+ return be16_to_cpu(ctrl->ddp_rdmap_ctrl & RDMAP_MASK_OPCODE);
+}
+
+static inline void __rdmap_set_opcode(struct iwarp_ctrl *ctrl, u8 opcode)
+{
+ ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_OPCODE) |
+ (cpu_to_be16(opcode) & RDMAP_MASK_OPCODE);
+}
+
+struct iwarp_rdma_write {
+ struct iwarp_ctrl ctrl;
+ __be32 sink_stag;
+ __be64 sink_to;
+};
+
+struct iwarp_rdma_rreq {
+ struct iwarp_ctrl ctrl;
+ __be32 rsvd;
+ __be32 ddp_qn;
+ __be32 ddp_msn;
+ __be32 ddp_mo;
+ __be32 sink_stag;
+ __be64 sink_to;
+ __be32 read_size;
+ __be32 source_stag;
+ __be64 source_to;
+};
+
+struct iwarp_rdma_rresp {
+ struct iwarp_ctrl ctrl;
+ __be32 sink_stag;
+ __be64 sink_to;
+};
+
+struct iwarp_send {
+ struct iwarp_ctrl ctrl;
+ __be32 rsvd;
+ __be32 ddp_qn;
+ __be32 ddp_msn;
+ __be32 ddp_mo;
+};
+
+struct iwarp_send_inv {
+ struct iwarp_ctrl ctrl;
+ __be32 inval_stag;
+ __be32 ddp_qn;
+ __be32 ddp_msn;
+ __be32 ddp_mo;
+};
+
+struct iwarp_terminate {
+ struct iwarp_ctrl ctrl;
+ __be32 rsvd;
+ __be32 ddp_qn;
+ __be32 ddp_msn;
+ __be32 ddp_mo;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __be32 layer : 4;
+ __be32 etype : 4;
+ __be32 ecode : 8;
+ __be32 flag_m : 1;
+ __be32 flag_d : 1;
+ __be32 flag_r : 1;
+ __be32 reserved : 13;
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ __be32 reserved : 13;
+ __be32 flag_r : 1;
+ __be32 flag_d : 1;
+ __be32 flag_m : 1;
+ __be32 ecode : 8;
+ __be32 etype : 4;
+ __be32 layer : 4;
+#else
+#error "undefined byte order"
+#endif
+};
+
+/*
+ * Terminate Hdr bits & fields
+ */
+enum {
+ TERM_MASK_LAYER = cpu_to_be32(0xf0000000),
+ TERM_MASK_ETYPE = cpu_to_be32(0x0f000000),
+ TERM_MASK_ECODE = cpu_to_be32(0x00ff0000),
+ TERM_FLAG_M = cpu_to_be32(0x00008000),
+ TERM_FLAG_D = cpu_to_be32(0x00004000),
+ TERM_FLAG_R = cpu_to_be32(0x00002000),
+ TERM_MASK_RESVD = cpu_to_be32(0x00001fff)
+};
+
+static inline u8 __rdmap_term_layer(struct iwarp_terminate *term)
+{
+ return term->layer;
+}
+
+static inline void __rdmap_term_set_layer(struct iwarp_terminate *term,
+ u8 layer)
+{
+ term->layer = layer & 0xf;
+}
+
+static inline u8 __rdmap_term_etype(struct iwarp_terminate *term)
+{
+ return term->etype;
+}
+
+static inline void __rdmap_term_set_etype(struct iwarp_terminate *term,
+ u8 etype)
+{
+ term->etype = etype & 0xf;
+}
+
+static inline u8 __rdmap_term_ecode(struct iwarp_terminate *term)
+{
+ return term->ecode;
+}
+
+static inline void __rdmap_term_set_ecode(struct iwarp_terminate *term,
+ u8 ecode)
+{
+ term->ecode = ecode;
+}
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for an FPDU carrying an untagged DDP segment
+ */
+struct iwarp_ctrl_untagged {
+ struct iwarp_ctrl ctrl;
+ __be32 rsvd;
+ __be32 ddp_qn;
+ __be32 ddp_msn;
+ __be32 ddp_mo;
+};
+
+/*
+ * Common portion of iWARP headers (MPA, DDP, RDMAP)
+ * for an FPDU carrying a tagged DDP segment
+ */
+struct iwarp_ctrl_tagged {
+ struct iwarp_ctrl ctrl;
+ __be32 ddp_stag;
+ __be64 ddp_to;
+};
+
+union iwarp_hdr {
+ struct iwarp_ctrl ctrl;
+ struct iwarp_ctrl_untagged c_untagged;
+ struct iwarp_ctrl_tagged c_tagged;
+ struct iwarp_rdma_write rwrite;
+ struct iwarp_rdma_rreq rreq;
+ struct iwarp_rdma_rresp rresp;
+ struct iwarp_terminate terminate;
+ struct iwarp_send send;
+ struct iwarp_send_inv send_inv;
+};
+
+enum term_elayer {
+ TERM_ERROR_LAYER_RDMAP = 0x00,
+ TERM_ERROR_LAYER_DDP = 0x01,
+ TERM_ERROR_LAYER_LLP = 0x02 /* eg., MPA */
+};
+
+enum ddp_etype {
+ DDP_ETYPE_CATASTROPHIC = 0x0,
+ DDP_ETYPE_TAGGED_BUF = 0x1,
+ DDP_ETYPE_UNTAGGED_BUF = 0x2,
+ DDP_ETYPE_RSVD = 0x3
+};
+
+enum ddp_ecode {
+ /* unspecified, set to zero */
+ DDP_ECODE_CATASTROPHIC = 0x00,
+ /* Tagged Buffer Errors */
+ DDP_ECODE_T_INVALID_STAG = 0x00,
+ DDP_ECODE_T_BASE_BOUNDS = 0x01,
+ DDP_ECODE_T_STAG_NOT_ASSOC = 0x02,
+ DDP_ECODE_T_TO_WRAP = 0x03,
+ DDP_ECODE_T_VERSION = 0x04,
+ /* Untagged Buffer Errors */
+ DDP_ECODE_UT_INVALID_QN = 0x01,
+ DDP_ECODE_UT_INVALID_MSN_NOBUF = 0x02,
+ DDP_ECODE_UT_INVALID_MSN_RANGE = 0x03,
+ DDP_ECODE_UT_INVALID_MO = 0x04,
+ DDP_ECODE_UT_MSG_TOOLONG = 0x05,
+ DDP_ECODE_UT_VERSION = 0x06
+};
+
+enum rdmap_untagged_qn {
+ RDMAP_UNTAGGED_QN_SEND = 0,
+ RDMAP_UNTAGGED_QN_RDMA_READ = 1,
+ RDMAP_UNTAGGED_QN_TERMINATE = 2,
+ RDMAP_UNTAGGED_QN_COUNT = 3
+};
+
+enum rdmap_etype {
+ RDMAP_ETYPE_CATASTROPHIC = 0x0,
+ RDMAP_ETYPE_REMOTE_PROTECTION = 0x1,
+ RDMAP_ETYPE_REMOTE_OPERATION = 0x2
+};
+
+enum rdmap_ecode {
+ RDMAP_ECODE_INVALID_STAG = 0x00,
+ RDMAP_ECODE_BASE_BOUNDS = 0x01,
+ RDMAP_ECODE_ACCESS_RIGHTS = 0x02,
+ RDMAP_ECODE_STAG_NOT_ASSOC = 0x03,
+ RDMAP_ECODE_TO_WRAP = 0x04,
+ RDMAP_ECODE_VERSION = 0x05,
+ RDMAP_ECODE_OPCODE = 0x06,
+ RDMAP_ECODE_CATASTROPHIC_STREAM = 0x07,
+ RDMAP_ECODE_CATASTROPHIC_GLOBAL = 0x08,
+ RDMAP_ECODE_CANNOT_INVALIDATE = 0x09,
+ RDMAP_ECODE_UNSPECIFIED = 0xff
+};
+
+enum llp_ecode {
+ LLP_ECODE_TCP_STREAM_LOST = 0x01, /* How to transfer this ?? */
+ LLP_ECODE_RECEIVED_CRC = 0x02,
+ LLP_ECODE_FPDU_START = 0x03,
+ LLP_ECODE_INVALID_REQ_RESP = 0x04,
+
+ /* Errors for Enhanced Connection Establishment only */
+ LLP_ECODE_LOCAL_CATASTROPHIC = 0x05,
+ LLP_ECODE_INSUFFICIENT_IRD = 0x06,
+ LLP_ECODE_NO_MATCHING_RTR = 0x07
+};
+
+enum llp_etype { LLP_ETYPE_MPA = 0x00 };
+
+enum rdma_opcode {
+ RDMAP_RDMA_WRITE = 0x0,
+ RDMAP_RDMA_READ_REQ = 0x1,
+ RDMAP_RDMA_READ_RESP = 0x2,
+ RDMAP_SEND = 0x3,
+ RDMAP_SEND_INVAL = 0x4,
+ RDMAP_SEND_SE = 0x5,
+ RDMAP_SEND_SE_INVAL = 0x6,
+ RDMAP_TERMINATE = 0x7,
+ RDMAP_NOT_SUPPORTED = RDMAP_TERMINATE + 1
+};
+
+#endif
diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h
new file mode 100644
index 000000000000..03fd7b2f595f
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw.h
@@ -0,0 +1,745 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_H
+#define _SIW_H
+
+#include <rdma/ib_verbs.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <crypto/hash.h>
+#include <linux/crc32.h>
+#include <linux/crc32c.h>
+
+#include <rdma/siw-abi.h>
+#include "iwarp.h"
+
+/*
+ * Vendor identification and upper limits for siw resources.
+ * Most limits are derived from SIW_MAX_QP and SIW_MAX_QP_WR.
+ */
+#define SIW_VENDOR_ID 0x626d74 /* ascii 'bmt' for now */
+#define SIW_VENDORT_PART_ID 0 /* NOTE(review): "VENDORT" looks like a typo of "VENDOR" - check users before renaming */
+#define SIW_MAX_QP (1024 * 100)
+#define SIW_MAX_QP_WR (1024 * 32)
+#define SIW_MAX_ORD_QP 128
+#define SIW_MAX_IRD_QP 128
+#define SIW_MAX_SGE_PBL 256 /* max num sge's for PBL */
+#define SIW_MAX_SGE_RD 1 /* iwarp limitation. we could relax */
+#define SIW_MAX_CQ (1024 * 100)
+#define SIW_MAX_CQE (SIW_MAX_QP_WR * 100)
+#define SIW_MAX_MR (SIW_MAX_QP * 10)
+#define SIW_MAX_PD SIW_MAX_QP
+#define SIW_MAX_MW 0 /* to be set if MW's are supported */
+#define SIW_MAX_FMR SIW_MAX_MR
+#define SIW_MAX_SRQ SIW_MAX_QP
+#define SIW_MAX_SRQ_WR (SIW_MAX_QP_WR * 10)
+#define SIW_MAX_CONTEXT SIW_MAX_PD
+
+/* Min number of bytes for using zero copy transmit */
+#define SENDPAGE_THRESH PAGE_SIZE
+
+/* Maximum number of frames which can be sent in one SQ processing */
+#define SQ_USER_MAXBURST 100
+
+/* Maximum number of consecutive IRQ elements which get served
+ * if SQ has pending work. Prevents starving local SQ processing
+ * by serving peer Read Requests.
+ */
+#define SIW_IRQ_MAXBURST_SQ_ACTIVE 4
+
+/* Capability limits of one siw device (kept in siw_device.attrs) */
+struct siw_dev_cap {
+ int max_qp;
+ int max_qp_wr;
+ int max_ord; /* max. outbound read queue depth */
+ int max_ird; /* max. inbound read queue depth */
+ int max_sge;
+ int max_sge_rd;
+ int max_cq;
+ int max_cqe;
+ int max_mr;
+ int max_pd;
+ int max_mw;
+ int max_fmr;
+ int max_srq;
+ int max_srq_wr;
+ int max_srq_sge;
+};
+
+/* siw protection domain: wraps the core ib_pd, no private state */
+struct siw_pd {
+ struct ib_pd base_pd;
+};
+
+/*
+ * Per-device state. Embeds the core ib_device (see to_siw_dev()) and
+ * references the net_device it is attached to.
+ */
+struct siw_device {
+ struct ib_device base_dev;
+ struct net_device *netdev;
+ struct siw_dev_cap attrs;
+
+ u32 vendor_part_id;
+ int numa_node;
+
+ /* physical port state (only one port per device) */
+ enum ib_port_state state;
+
+ spinlock_t lock; /* taken around cep_list updates in siw_cm.c */
+
+ struct xarray qp_xa; /* QP lookup by id, see siw_qp_id2obj() */
+ struct xarray mem_xa;
+
+ struct list_head cep_list;
+ struct list_head qp_list;
+
+ /* active objects statistics to enforce limits */
+ atomic_t num_qp;
+ atomic_t num_cq;
+ atomic_t num_pd;
+ atomic_t num_mr;
+ atomic_t num_srq;
+ atomic_t num_ctx;
+
+ struct work_struct netdev_down;
+};
+
+/*
+ * Address and size of one object.
+ * NOTE(review): presumably the user mappable objects tracked in
+ * siw_ucontext.xa - confirm against the mmap code.
+ */
+struct siw_uobj {
+ void *addr;
+ u32 size;
+};
+
+/* siw user context: embeds the core ib_ucontext (see to_siw_ctx()) */
+struct siw_ucontext {
+ struct ib_ucontext base_ucontext;
+ struct siw_device *sdev;
+
+ /* xarray of user mappable objects */
+ struct xarray xa;
+ u32 uobj_nextkey;
+};
+
+/*
+ * The RDMA core does not define LOCAL_READ access, which is always
+ * enabled implicitly.
+ */
+#define IWARP_ACCESS_MASK \
+ (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | \
+ IB_ACCESS_REMOTE_READ)
+
+/*
+ * siw representation of user memory registered as source
+ * or target of RDMA operations.
+ */
+
+struct siw_page_chunk {
+ struct page **plist; /* array of page pointers */
+};
+
+/* User memory object: page chunks plus bookkeeping about the mapping */
+struct siw_umem {
+ struct siw_page_chunk *page_chunk;
+ int num_pages;
+ bool writable;
+ u64 fp_addr; /* First page base address */
+ struct mm_struct *owning_mm;
+};
+
+/* One entry of a siw_pbl */
+struct siw_pble {
+ u64 addr; /* Address of assigned user buffer */
+ u64 size; /* Size of this entry */
+ u64 pbl_off; /* Total offset from start of PBL */
+};
+
+/*
+ * List of siw_pble entries.
+ * NOTE(review): pbe[1] looks like the pre-C99 idiom for a trailing
+ * variable-length array. Converting it to a flexible array member
+ * changes sizeof(struct siw_pbl), so check the allocator first.
+ */
+struct siw_pbl {
+ unsigned int num_buf;
+ unsigned int max_buf;
+ struct siw_pble pbe[1];
+};
+
+struct siw_mr;
+
+/*
+ * Generic memory representation for registered siw memory.
+ * Memory lookup always via higher 24 bit of STag (STag index).
+ * The object is reference counted via @ref.
+ */
+struct siw_mem {
+ struct siw_device *sdev;
+ struct kref ref;
+ u64 va; /* VA of memory */
+ u64 len; /* length of the memory buffer in bytes */
+ u32 stag; /* iWarp memory access steering tag */
+ u8 stag_valid; /* VALID or INVALID */
+ u8 is_pbl; /* PBL or user space mem */
+ u8 is_mw; /* Memory Region or Memory Window */
+ enum ib_access_flags perms; /* local/remote READ & WRITE */
+ union { /* backing object; mem_obj aliases umem and pbl as untyped pointer */
+ struct siw_umem *umem;
+ struct siw_pbl *pbl;
+ void *mem_obj;
+ };
+ struct ib_pd *pd;
+};
+
+/* siw memory region: embeds the core ib_mr (see to_siw_mr()) */
+struct siw_mr {
+ struct ib_mr base_mr;
+ struct siw_mem *mem;
+ struct rcu_head rcu;
+};
+
+/*
+ * Error codes for local or remote
+ * access to registered memory.
+ * Converted to wire error codes by siw_tagged_error() and
+ * siw_rdmap_error().
+ */
+enum siw_access_state {
+ E_ACCESS_OK,
+ E_STAG_INVALID,
+ E_BASE_BOUNDS,
+ E_ACCESS_PERM,
+ E_PD_MISMATCH
+};
+
+/* Processing state of a work request (stored in siw_wqe.wr_status) */
+enum siw_wr_state {
+ SIW_WR_IDLE,
+ SIW_WR_QUEUED, /* processing has not started yet */
+ SIW_WR_INPROGRESS /* initiated processing of the WR */
+};
+
+/* The WQE currently being processed (RX or TX) */
+struct siw_wqe {
+ /* Copy of applications SQE or RQE */
+ union {
+ struct siw_sqe sqe;
+ struct siw_rqe rqe;
+ };
+ struct siw_mem *mem[SIW_MAX_SGE]; /* per sge's resolved mem */
+ enum siw_wr_state wr_status;
+ enum siw_wc_status wc_status;
+ u32 bytes; /* total bytes to process */
+ u32 processed; /* bytes processed */
+};
+
+/* siw completion queue: embeds the core ib_cq (see to_siw_cq()) */
+struct siw_cq {
+ struct ib_cq base_cq;
+ spinlock_t lock;
+ u64 *notify;
+ struct siw_cqe *queue; /* CQE array */
+ u32 cq_put;
+ u32 cq_get;
+ u32 num_cqe;
+ bool kernel_verbs;
+ u32 xa_cq_index; /* mmap information for CQE array */
+ u32 id; /* For debugging only */
+};
+
+/* QP states; SIW_QP_STATE_COUNT is the number of states */
+enum siw_qp_state {
+ SIW_QP_STATE_IDLE,
+ SIW_QP_STATE_RTR,
+ SIW_QP_STATE_RTS,
+ SIW_QP_STATE_CLOSING,
+ SIW_QP_STATE_TERMINATE,
+ SIW_QP_STATE_ERROR,
+ SIW_QP_STATE_COUNT
+};
+
+/* Bit flags kept in siw_qp_attrs.flags */
+enum siw_qp_flags {
+ SIW_RDMA_BIND_ENABLED = (1 << 0),
+ SIW_RDMA_WRITE_ENABLED = (1 << 1),
+ SIW_RDMA_READ_ENABLED = (1 << 2),
+ SIW_SIGNAL_ALL_WR = (1 << 3),
+ SIW_MPA_CRC = (1 << 4),
+ SIW_QP_IN_DESTROY = (1 << 5)
+};
+
+/* Bit mask passed to siw_qp_modify() as its @mask argument */
+enum siw_qp_attr_mask {
+ SIW_QP_ATTR_STATE = (1 << 0),
+ SIW_QP_ATTR_ACCESS_FLAGS = (1 << 1),
+ SIW_QP_ATTR_LLP_HANDLE = (1 << 2),
+ SIW_QP_ATTR_ORD = (1 << 3),
+ SIW_QP_ATTR_IRD = (1 << 4),
+ SIW_QP_ATTR_SQ_SIZE = (1 << 5),
+ SIW_QP_ATTR_RQ_SIZE = (1 << 6),
+ SIW_QP_ATTR_MPA = (1 << 7)
+};
+
+/* siw shared receive queue: embeds the core ib_srq (see to_siw_srq()) */
+struct siw_srq {
+ struct ib_srq base_srq;
+ spinlock_t lock;
+ u32 max_sge;
+ u32 limit; /* low watermark for async event */
+ struct siw_rqe *recvq;
+ u32 rq_put;
+ u32 rq_get;
+ u32 num_rqe; /* max # of wqe's allowed */
+ u32 xa_srq_index; /* mmap information for SRQ array */
+ char armed; /* inform user if limit hit */
+ char kernel_verbs; /* '1' if kernel client */
+};
+
+/* QP attributes; also the argument type of siw_qp_modify() */
+struct siw_qp_attrs {
+ enum siw_qp_state state;
+ u32 sq_size;
+ u32 rq_size;
+ u32 orq_size;
+ u32 irq_size;
+ u32 sq_max_sges;
+ u32 rq_max_sges;
+ enum siw_qp_flags flags;
+
+ struct socket *sk; /* set by siw_qp_socket_assoc() in siw_cm.c */
+};
+
+/* Transmit state (stored in siw_iwarp_tx.state) */
+enum siw_tx_ctx {
+ SIW_SEND_HDR, /* start or continue sending HDR */
+ SIW_SEND_DATA, /* start or continue sending DDP payload */
+ SIW_SEND_TRAILER, /* start or continue sending TRAILER */
+ SIW_SEND_SHORT_FPDU/* send whole FPDU hdr|data|trailer at once */
+};
+
+/* Receive state (stored in siw_rx_stream.state) */
+enum siw_rx_state {
+ SIW_GET_HDR, /* await new hdr or within hdr */
+ SIW_GET_DATA_START, /* start of inbound DDP payload */
+ SIW_GET_DATA_MORE, /* continuation of (misaligned) DDP payload */
+ SIW_GET_TRAILER/* await new trailer or within trailer */
+};
+
+/* Per-QP receive stream state (embedded in siw_qp as rx_stream) */
+struct siw_rx_stream {
+ struct sk_buff *skb;
+ int skb_new; /* pending unread bytes in skb */
+ int skb_offset; /* offset in skb */
+ int skb_copied; /* processed bytes in skb */
+
+ union iwarp_hdr hdr;
+ struct mpa_trailer trailer;
+
+ enum siw_rx_state state;
+
+ /*
+ * For each FPDU, main RX loop runs through 3 stages:
+ * Receiving protocol headers, placing DDP payload and receiving
+ * trailer information (CRC + possibly padding).
+ * Next two variables keep state on receive status of the
+ * current FPDU part (hdr, data, trailer).
+ */
+ int fpdu_part_rcvd; /* bytes in pkt part copied */
+ int fpdu_part_rem; /* bytes in pkt part not seen */
+
+ /*
+ * Next expected DDP MSN for each QN +
+ * expected steering tag +
+ * expected DDP tagged offset (all HBO)
+ */
+ u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT];
+ u32 ddp_stag;
+ u64 ddp_to;
+ u32 inval_stag; /* Stag to be invalidated */
+
+ struct shash_desc *mpa_crc_hd; /* CRC state, updated by siw_crc_skb() */
+ u8 rx_suspend : 1;
+ u8 pad : 2; /* # of pad bytes expected */
+ u8 rdmap_op : 4; /* opcode of current frame */
+};
+
+/*
+ * Receive context of an inbound message. A QP carries one for tagged
+ * and one for untagged messages; see set_rx_fpdu_context().
+ */
+struct siw_rx_fpdu {
+ /*
+ * Local destination memory of inbound RDMA operation.
+ * Valid, according to wqe->wr_status
+ */
+ struct siw_wqe wqe_active;
+
+ unsigned int pbl_idx; /* Index into current PBL */
+ unsigned int sge_idx; /* current sge in rx */
+ unsigned int sge_off; /* already rcvd in curr. sge */
+
+ char first_ddp_seg; /* this is the first DDP seg */
+ char more_ddp_segs; /* more DDP segs expected */
+ u8 prev_rdmap_op : 4; /* opcode of prev frame */
+};
+
+/*
+ * Shorthands for short packets w/o payload
+ * to be transmitted more efficiently.
+ * Each one is the respective iWARP header directly followed by the CRC.
+ */
+struct siw_send_pkt {
+ struct iwarp_send send;
+ __be32 crc;
+};
+
+struct siw_write_pkt {
+ struct iwarp_rdma_write write;
+ __be32 crc;
+};
+
+struct siw_rreq_pkt {
+ struct iwarp_rdma_rreq rreq;
+ __be32 crc;
+};
+
+struct siw_rresp_pkt {
+ struct iwarp_rdma_rresp rresp;
+ __be32 crc;
+};
+
+/*
+ * Per-QP transmit context (embedded in siw_qp as tx_ctx). The pkt union
+ * overlays all header layouts of the frame currently being built.
+ */
+struct siw_iwarp_tx {
+ union {
+ union iwarp_hdr hdr;
+
+ /* Generic part of FPDU header */
+ struct iwarp_ctrl ctrl;
+ struct iwarp_ctrl_untagged c_untagged;
+ struct iwarp_ctrl_tagged c_tagged;
+
+ /* FPDU headers */
+ struct iwarp_rdma_write rwrite;
+ struct iwarp_rdma_rreq rreq;
+ struct iwarp_rdma_rresp rresp;
+ struct iwarp_terminate terminate;
+ struct iwarp_send send;
+ struct iwarp_send_inv send_inv;
+
+ /* complete short FPDUs */
+ struct siw_send_pkt send_pkt;
+ struct siw_write_pkt write_pkt;
+ struct siw_rreq_pkt rreq_pkt;
+ struct siw_rresp_pkt rresp_pkt;
+ } pkt;
+
+ struct mpa_trailer trailer;
+ /* DDP MSN for untagged messages */
+ u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT];
+
+ enum siw_tx_ctx state;
+ u16 ctrl_len; /* ddp+rdmap hdr */
+ u16 ctrl_sent;
+ int burst;
+ int bytes_unsent; /* ddp payload bytes */
+
+ struct shash_desc *mpa_crc_hd;
+
+ u8 do_crc : 1; /* do crc for segment */
+ u8 use_sendpage : 1; /* send w/o copy */
+ u8 tx_suspend : 1; /* stop sending DDP segs. */
+ u8 pad : 2; /* # pad in current fpdu */
+ u8 orq_fence : 1; /* ORQ full or Send fenced */
+ u8 in_syscall : 1; /* TX out of user context */
+ u8 zcopy_tx : 1; /* Use TCP_SENDPAGE if possible */
+ u8 gso_seg_limit; /* Maximum segments for GSO, 0 = unbound */
+
+ u16 fpdu_len; /* len of FPDU to tx */
+ unsigned int tcp_seglen; /* remaining tcp seg space */
+
+ struct siw_wqe wqe_active;
+
+ int pbl_idx; /* Index into current PBL */
+ int sge_idx; /* current sge in tx */
+ u32 sge_off; /* already sent in curr. sge */
+};
+
+/*
+ * siw queue pair. Reference counted via @ref (siw_qp_get()/siw_qp_put(),
+ * released through siw_free_qp()). rx_qp()/tx_qp() recover the QP from
+ * its embedded rx_stream/tx_ctx.
+ */
+struct siw_qp {
+ struct siw_device *sdev;
+ struct ib_qp *ib_qp;
+ struct kref ref;
+ u32 qp_num;
+ struct list_head devq;
+ int tx_cpu;
+ bool kernel_verbs;
+ struct siw_qp_attrs attrs;
+
+ struct siw_cep *cep;
+ struct rw_semaphore state_lock;
+
+ struct ib_pd *pd;
+ struct siw_cq *scq;
+ struct siw_cq *rcq;
+ struct siw_srq *srq;
+
+ struct siw_iwarp_tx tx_ctx; /* Transmit context */
+ spinlock_t sq_lock;
+ struct siw_sqe *sendq; /* send queue element array */
+ uint32_t sq_get; /* consumer index into sq array */
+ uint32_t sq_put; /* kernel prod. index into sq array */
+ struct llist_node tx_list;
+
+ struct siw_sqe *orq; /* outbound read queue element array */
+ spinlock_t orq_lock;
+ uint32_t orq_get; /* consumer index into orq array */
+ uint32_t orq_put; /* shared producer index for ORQ */
+
+ struct siw_rx_stream rx_stream;
+ struct siw_rx_fpdu *rx_fpdu; /* points to rx_tagged or rx_untagged */
+ struct siw_rx_fpdu rx_tagged;
+ struct siw_rx_fpdu rx_untagged;
+ spinlock_t rq_lock;
+ struct siw_rqe *recvq; /* recv queue element array */
+ uint32_t rq_get; /* consumer index into rq array */
+ uint32_t rq_put; /* kernel prod. index into rq array */
+
+ struct siw_sqe *irq; /* inbound read queue element array */
+ uint32_t irq_get; /* consumer index into irq array */
+ uint32_t irq_put; /* producer index into irq array */
+ int irq_burst;
+
+ struct { /* information to be carried in TERMINATE pkt, if valid */
+ u8 valid;
+ u8 in_tx;
+ u8 layer : 4, etype : 4;
+ u8 ecode;
+ } term_info;
+ u32 xa_sq_index; /* mmap information for SQE array */
+ u32 xa_rq_index; /* mmap information for RQE array */
+ struct rcu_head rcu;
+};
+
+/* Wrapper embedding the core ib_qp; points to the siw_qp (see to_siw_qp()) */
+struct siw_base_qp {
+ struct ib_qp base_qp;
+ struct siw_qp *qp;
+};
+
+/*
+ * helper macros
+ *
+ * rx_qp()/tx_qp(): get the siw_qp embedding a receive stream / tx context.
+ * tx_wqe()/rx_wqe(): address of the active WQE of a QP's tx context /
+ * of a receive FPDU context.
+ * rx_mem(): first resolved memory object of a receive FPDU context.
+ * tx_type()/rx_type()/tx_flags(): opcode and flags of a WQE.
+ */
+#define rx_qp(rx) container_of(rx, struct siw_qp, rx_stream)
+#define tx_qp(tx) container_of(tx, struct siw_qp, tx_ctx)
+#define tx_wqe(qp) (&(qp)->tx_ctx.wqe_active)
+#define rx_wqe(rctx) (&(rctx)->wqe_active)
+#define rx_mem(rctx) ((rctx)->wqe_active.mem[0])
+#define tx_type(wqe) ((wqe)->sqe.opcode)
+#define rx_type(wqe) ((wqe)->rqe.opcode)
+#define tx_flags(wqe) ((wqe)->sqe.flags)
+
+/*
+ * Per-opcode message description: header length, control header and
+ * receive handler. The iwarp_pktinfo[] table holds one entry per opcode
+ * up to and including RDMAP_TERMINATE.
+ */
+struct iwarp_msg_info {
+ int hdr_len;
+ struct iwarp_ctrl ctrl;
+ int (*rx_data)(struct siw_qp *qp);
+};
+
+/* Global siw parameters. Currently set in siw_main.c */
+extern const bool zcopy_tx;
+extern const bool try_gso;
+extern const bool loopback_enabled;
+extern const bool mpa_crc_required;
+extern const bool mpa_crc_strict;
+extern const bool siw_tcp_nagle;
+extern u_char mpa_version;
+extern const bool peer_to_peer;
+extern struct task_struct *siw_tx_thread[];
+
+extern struct crypto_shash *siw_crypto_shash;
+extern struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1];
+
+/* QP general functions */
+int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attr,
+ enum siw_qp_attr_mask mask);
+int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl);
+void siw_qp_llp_close(struct siw_qp *qp);
+void siw_qp_cm_drop(struct siw_qp *qp, int schedule);
+void siw_send_terminate(struct siw_qp *qp);
+
+void siw_qp_get_ref(struct ib_qp *qp);
+void siw_qp_put_ref(struct ib_qp *qp);
+int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp);
+void siw_free_qp(struct kref *ref);
+
+void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer,
+ u8 etype, u8 ecode, int in_tx);
+enum ddp_ecode siw_tagged_error(enum siw_access_state state);
+enum rdmap_ecode siw_rdmap_error(enum siw_access_state state);
+
+void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe);
+int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
+ enum siw_wc_status status);
+int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
+ u32 inval_stag, enum siw_wc_status status);
+void siw_qp_llp_data_ready(struct sock *sk);
+void siw_qp_llp_write_space(struct sock *sk);
+
+/* QP TX path functions */
+int siw_run_sq(void *arg);
+int siw_qp_sq_process(struct siw_qp *qp);
+int siw_sq_start(struct siw_qp *qp);
+int siw_activate_tx(struct siw_qp *qp);
+void siw_stop_tx_thread(int nr_cpu);
+int siw_get_tx_cpu(struct siw_device *sdev);
+void siw_put_tx_cpu(int cpu);
+
+/* QP RX path functions */
+int siw_proc_send(struct siw_qp *qp);
+int siw_proc_rreq(struct siw_qp *qp);
+int siw_proc_rresp(struct siw_qp *qp);
+int siw_proc_write(struct siw_qp *qp);
+int siw_proc_terminate(struct siw_qp *qp);
+
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int off, size_t len);
+
+/*
+ * Select the receive FPDU context matching @opcode: the tagged context
+ * for RDMA Write and RDMA Read Response, the untagged one for any other
+ * opcode. Also records @opcode as current operation of the rx stream.
+ */
+static inline void set_rx_fpdu_context(struct siw_qp *qp, u8 opcode)
+{
+	switch (opcode) {
+	case RDMAP_RDMA_WRITE:
+	case RDMAP_RDMA_READ_RESP:
+		qp->rx_fpdu = &qp->rx_tagged;
+		break;
+	default:
+		qp->rx_fpdu = &qp->rx_untagged;
+		break;
+	}
+	qp->rx_stream.rdmap_op = opcode;
+}
+
+/* Get the siw_ucontext embedding the given core ib_ucontext */
+static inline struct siw_ucontext *to_siw_ctx(struct ib_ucontext *base_ctx)
+{
+	struct siw_ucontext *uctx =
+		container_of(base_ctx, struct siw_ucontext, base_ucontext);
+
+	return uctx;
+}
+
+/* Get the siw_base_qp embedding the given core ib_qp */
+static inline struct siw_base_qp *to_siw_base_qp(struct ib_qp *base_qp)
+{
+	struct siw_base_qp *wrapper =
+		container_of(base_qp, struct siw_base_qp, base_qp);
+
+	return wrapper;
+}
+
+/* Get the siw_qp referenced by the wrapper of the given core ib_qp */
+static inline struct siw_qp *to_siw_qp(struct ib_qp *base_qp)
+{
+	struct siw_base_qp *wrapper = to_siw_base_qp(base_qp);
+
+	return wrapper->qp;
+}
+
+/* Get the siw_cq embedding the given core ib_cq */
+static inline struct siw_cq *to_siw_cq(struct ib_cq *base_cq)
+{
+	struct siw_cq *cq = container_of(base_cq, struct siw_cq, base_cq);
+
+	return cq;
+}
+
+/* Get the siw_srq embedding the given core ib_srq */
+static inline struct siw_srq *to_siw_srq(struct ib_srq *base_srq)
+{
+	struct siw_srq *srq = container_of(base_srq, struct siw_srq, base_srq);
+
+	return srq;
+}
+
+/* Get the siw_device embedding the given core ib_device */
+static inline struct siw_device *to_siw_dev(struct ib_device *base_dev)
+{
+	struct siw_device *sdev =
+		container_of(base_dev, struct siw_device, base_dev);
+
+	return sdev;
+}
+
+/* Get the siw_mr embedding the given core ib_mr */
+static inline struct siw_mr *to_siw_mr(struct ib_mr *base_mr)
+{
+	struct siw_mr *mr = container_of(base_mr, struct siw_mr, base_mr);
+
+	return mr;
+}
+
+/*
+ * Look up a QP by @id in the device's qp_xa and take a reference on it.
+ *
+ * Returns the QP with its reference count raised, or NULL if no QP is
+ * stored under @id or its reference count already dropped to zero.
+ * The lookup and the reference grab happen under rcu_read_lock().
+ */
+static inline struct siw_qp *siw_qp_id2obj(struct siw_device *sdev, int id)
+{
+	struct siw_qp *found;
+
+	rcu_read_lock();
+	found = xa_load(&sdev->qp_xa, id);
+	if (unlikely(!found || !kref_get_unless_zero(&found->ref)))
+		found = NULL;
+	rcu_read_unlock();
+
+	return found;
+}
+
+/* Return the QP number of @qp */
+static inline u32 qp_id(struct siw_qp *qp)
+{
+	u32 num = qp->qp_num;
+
+	return num;
+}
+
+/* Take an additional reference on @qp */
+static inline void siw_qp_get(struct siw_qp *qp)
+{
+	struct kref *ref = &qp->ref;
+
+	kref_get(ref);
+}
+
+/* Drop a reference on @qp; siw_free_qp() runs when the last one is gone */
+static inline void siw_qp_put(struct siw_qp *qp)
+{
+	struct kref *ref = &qp->ref;
+
+	kref_put(ref, siw_free_qp);
+}
+
+/*
+ * Return non-zero if the send queue element at the current consumer
+ * index has no flags set.
+ */
+static inline int siw_sq_empty(struct siw_qp *qp)
+{
+	u32 idx = qp->sq_get % qp->attrs.sq_size;
+
+	return READ_ONCE(qp->sendq[idx].flags) == 0;
+}
+
+/*
+ * Return the send queue element at the current consumer index if it is
+ * flagged SIW_WQE_VALID, NULL otherwise. Does not advance sq_get.
+ */
+static inline struct siw_sqe *sq_get_next(struct siw_qp *qp)
+{
+	struct siw_sqe *head = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+
+	return (READ_ONCE(head->flags) & SIW_WQE_VALID) ? head : NULL;
+}
+
+/* Return the ORQ element at the current consumer index (orq_get) */
+static inline struct siw_sqe *orq_get_current(struct siw_qp *qp)
+{
+	u32 idx = qp->orq_get % qp->attrs.orq_size;
+
+	return &qp->orq[idx];
+}
+
+/* Return the ORQ element at the current producer index (orq_put) */
+static inline struct siw_sqe *orq_get_tail(struct siw_qp *qp)
+{
+	u32 idx = qp->orq_put % qp->attrs.orq_size;
+
+	return &qp->orq[idx];
+}
+
+/*
+ * Return the ORQ element at the producer index if it has no flags set,
+ * NULL otherwise. Does not advance orq_put.
+ */
+static inline struct siw_sqe *orq_get_free(struct siw_qp *qp)
+{
+	struct siw_sqe *tail = orq_get_tail(qp);
+
+	if (!tail || READ_ONCE(tail->flags) != 0)
+		return NULL;
+
+	return tail;
+}
+
+/*
+ * Return 1 if the ORQ element at the current consumer index has no
+ * flags set, 0 otherwise.
+ *
+ * The flags word is read with READ_ONCE(), matching siw_sq_empty(),
+ * orq_get_free() and irq_alloc_free().
+ */
+static inline int siw_orq_empty(struct siw_qp *qp)
+{
+	u32 idx = qp->orq_get % qp->attrs.orq_size;
+
+	return READ_ONCE(qp->orq[idx].flags) == 0 ? 1 : 0;
+}
+
+/*
+ * Claim the IRQ element at the producer index if it has no flags set:
+ * advances irq_put and returns the element. Returns NULL, leaving
+ * irq_put untouched, if the element is still in use.
+ */
+static inline struct siw_sqe *irq_alloc_free(struct siw_qp *qp)
+{
+	struct siw_sqe *slot = &qp->irq[qp->irq_put % qp->attrs.irq_size];
+
+	if (READ_ONCE(slot->flags) != 0)
+		return NULL;
+
+	qp->irq_put++;
+	return slot;
+}
+
+/*
+ * skb_checksum_ops.update callback: continue CRC32C @sum over @len
+ * bytes at @buff.
+ */
+static inline __wsum siw_csum_update(const void *buff, int len, __wsum sum)
+{
+	u32 seed = (__force __u32)sum;
+
+	return (__force __wsum)crc32c(seed, buff, len);
+}
+
+/*
+ * skb_checksum_ops.combine callback: combine two CRC32C values, where
+ * @csum2 covers @len bytes. @offset is not used.
+ */
+static inline __wsum siw_csum_combine(__wsum csum, __wsum csum2, int offset,
+				      int len)
+{
+	u32 crc1 = (__force __u32)csum;
+	u32 crc2 = (__force __u32)csum2;
+
+	return (__force __wsum)__crc32c_le_combine(crc1, crc2, len);
+}
+
+/*
+ * Fold @len bytes of the receive stream's current skb, starting at
+ * srx->skb_offset, into the running CRC stored in the context of
+ * srx->mpa_crc_hd.
+ */
+static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len)
+{
+	const struct skb_checksum_ops siw_cs_ops = {
+		.update = siw_csum_update,
+		.combine = siw_csum_combine,
+	};
+	u32 *crc_ctx = shash_desc_ctx(srx->mpa_crc_hd);
+	__wsum crc = *crc_ctx;
+
+	crc = __skb_checksum(srx->skb, srx->skb_offset, len, crc,
+			     &siw_cs_ops);
+	*crc_ctx = crc;
+}
+
+/*
+ * Debug print helpers. Each prefixes the message with the object kind,
+ * its identifier and the calling function, and prints via ibdev_dbg().
+ * Object arguments are parenthesized so that any pointer expression
+ * can be passed.
+ */
+#define siw_dbg(ibdev, fmt, ...)                                               \
+	ibdev_dbg(ibdev, "%s: " fmt, __func__, ##__VA_ARGS__)
+
+#define siw_dbg_qp(qp, fmt, ...)                                               \
+	ibdev_dbg(&(qp)->sdev->base_dev, "QP[%u] %s: " fmt, qp_id(qp),         \
+		  __func__, ##__VA_ARGS__)
+
+#define siw_dbg_cq(cq, fmt, ...)                                               \
+	ibdev_dbg((cq)->base_cq.device, "CQ[%u] %s: " fmt, (cq)->id,           \
+		  __func__, ##__VA_ARGS__)
+
+#define siw_dbg_pd(pd, fmt, ...)                                               \
+	ibdev_dbg((pd)->device, "PD[%u] %s: " fmt, (pd)->res.id, __func__,     \
+		  ##__VA_ARGS__)
+
+#define siw_dbg_mem(mem, fmt, ...)                                             \
+	ibdev_dbg(&(mem)->sdev->base_dev,                                      \
+		  "MEM[0x%08x] %s: " fmt, (mem)->stag, __func__, ##__VA_ARGS__)
+
+#define siw_dbg_cep(cep, fmt, ...)                                             \
+	ibdev_dbg(&(cep)->sdev->base_dev, "CEP[0x%p] %s: " fmt,                \
+		  (cep), __func__, ##__VA_ARGS__)
+
+void siw_cq_flush(struct siw_cq *cq);
+void siw_sq_flush(struct siw_qp *qp);
+void siw_rq_flush(struct siw_qp *qp);
+int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc);
+
+#endif
diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
new file mode 100644
index 000000000000..a7cde98e73e8
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_cm.c
@@ -0,0 +1,2070 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Fredy Neeser */
+/* Greg Joyce <greg@opengridcomputing.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+/* Copyright (c) 2017, Open Grid Computing, Inc. */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <linux/workqueue.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/inet.h>
+#include <linux/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_cm.h"
+
+/*
+ * Set to any combination of
+ * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR
+ */
+static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR;
+static const bool relaxed_ird_negotiation = 1; /* compile-time constant switch */
+
+static void siw_cm_llp_state_change(struct sock *s);
+static void siw_cm_llp_data_ready(struct sock *s);
+static void siw_cm_llp_write_space(struct sock *s);
+static void siw_cm_llp_error_report(struct sock *s);
+static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
+ int status);
+
+/*
+ * Install the siw connection manager callbacks on @sk.
+ * All four socket callbacks are replaced under sk_callback_lock.
+ */
+static void siw_sk_assign_cm_upcalls(struct sock *sk)
+{
+	write_lock_bh(&sk->sk_callback_lock);
+
+	sk->sk_error_report = siw_cm_llp_error_report;
+	sk->sk_write_space = siw_cm_llp_write_space;
+	sk->sk_data_ready = siw_cm_llp_data_ready;
+	sk->sk_state_change = siw_cm_llp_state_change;
+
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+/*
+ * Remember the callbacks currently installed on @sk in the connection
+ * endpoint associated with it, so siw_sk_restore_upcalls() can put
+ * them back later.
+ */
+static void siw_sk_save_upcalls(struct sock *sk)
+{
+	struct siw_cep *cep;
+
+	cep = sk_to_cep(sk);
+
+	write_lock_bh(&sk->sk_callback_lock);
+
+	cep->sk_error_report = sk->sk_error_report;
+	cep->sk_write_space = sk->sk_write_space;
+	cep->sk_data_ready = sk->sk_data_ready;
+	cep->sk_state_change = sk->sk_state_change;
+
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+/*
+ * Put back the socket callbacks saved in @cep and detach the endpoint
+ * from @sk by clearing sk_user_data.
+ *
+ * Takes no lock itself; siw_socket_disassoc() calls it with
+ * sk_callback_lock write-held.
+ */
+static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep)
+{
+	sk->sk_error_report = cep->sk_error_report;
+	sk->sk_write_space = cep->sk_write_space;
+	sk->sk_data_ready = cep->sk_data_ready;
+	sk->sk_state_change = cep->sk_state_change;
+
+	sk->sk_user_data = NULL;
+}
+
+/*
+ * Hand the endpoint's socket over to @qp: record the socket in the QP
+ * attributes and switch the data_ready/write_space callbacks to the QP
+ * handlers, all under sk_callback_lock.
+ */
+static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp)
+{
+	struct sock *sk = cep->sock->sk;
+
+	write_lock_bh(&sk->sk_callback_lock);
+
+	qp->attrs.sk = cep->sock;
+	sk->sk_write_space = siw_qp_llp_write_space;
+	sk->sk_data_ready = siw_qp_llp_data_ready;
+
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+/*
+ * Detach socket @s from its connection endpoint: restore the saved
+ * socket callbacks and drop the endpoint reference held on behalf of
+ * the socket. Only warns if the socket has no sock or no endpoint.
+ */
+static void siw_socket_disassoc(struct socket *s)
+{
+	struct sock *sk = s->sk;
+	struct siw_cep *cep;
+
+	if (!sk) {
+		pr_warn("siw: cannot restore sk callbacks: no sk\n");
+		return;
+	}
+
+	write_lock_bh(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (!cep) {
+		pr_warn("siw: cannot restore sk callbacks: no ep\n");
+	} else {
+		siw_sk_restore_upcalls(sk, cep);
+		siw_cep_put(cep);
+	}
+
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+/*
+ * sk_data_ready callback installed by siw_sk_assign_rtr_upcalls().
+ *
+ * Feeds the pending data to siw_tcp_rx_data() and, if receive processing
+ * did not get suspended, reports IW_CM_EVENT_ESTABLISHED. Afterwards the
+ * socket callbacks are switched to the regular QP handlers.
+ */
+static void siw_rtr_data_ready(struct sock *sk)
+{
+ struct siw_cep *cep;
+ struct siw_qp *qp = NULL;
+ read_descriptor_t rd_desc;
+
+ read_lock(&sk->sk_callback_lock);
+
+ cep = sk_to_cep(sk);
+ if (!cep) {
+ WARN(1, "No connection endpoint\n");
+ goto out; /* qp is still NULL: no re-association below */
+ }
+ qp = sk_to_qp(sk);
+
+ memset(&rd_desc, 0, sizeof(rd_desc));
+ rd_desc.arg.data = qp;
+ rd_desc.count = 1;
+
+ tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
+ /*
+ * Check if first frame was successfully processed.
+ * Signal connection full establishment if yes.
+ * Failed data processing would have already scheduled
+ * connection drop.
+ */
+ if (!qp->rx_stream.rx_suspend)
+ siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
+out:
+ read_unlock(&sk->sk_callback_lock);
+ /* done after unlock: siw_qp_socket_assoc() write-locks sk_callback_lock */
+ if (qp)
+ siw_qp_socket_assoc(cep, qp);
+}
+
+/*
+ * Install siw_rtr_data_ready() as data_ready handler and the QP
+ * write_space handler on the endpoint's socket.
+ */
+static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep)
+{
+	struct socket *s = cep->sock;
+	struct sock *sk = s->sk;
+
+	write_lock_bh(&sk->sk_callback_lock);
+
+	sk->sk_write_space = siw_qp_llp_write_space;
+	sk->sk_data_ready = siw_rtr_data_ready;
+
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+/*
+ * Bind socket @s to endpoint @cep: the endpoint takes an extra
+ * reference for the socket, becomes the socket's sk_user_data, saves
+ * the socket's current callbacks and installs the CM callbacks.
+ */
+static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s)
+{
+	struct sock *sk = s->sk;
+
+	cep->sock = s;
+	siw_cep_get(cep);
+	/* must be set before saving: the save helper looks up the cep via sk */
+	sk->sk_user_data = cep;
+
+	siw_sk_save_upcalls(sk);
+	siw_sk_assign_cm_upcalls(sk);
+}
+
+/*
+ * Allocate and initialize a connection endpoint for @sdev and link it
+ * into the device's cep_list.
+ *
+ * Returns the new endpoint holding one reference and in state
+ * SIW_EPSTATE_IDLE, or NULL if the allocation failed.
+ */
+static struct siw_cep *siw_cep_alloc(struct siw_device *sdev)
+{
+	struct siw_cep *cep;
+	unsigned long flags;
+
+	cep = kzalloc(sizeof(*cep), GFP_KERNEL);
+	if (!cep)
+		return NULL;
+
+	INIT_LIST_HEAD(&cep->listenq);
+	INIT_LIST_HEAD(&cep->devq);
+	INIT_LIST_HEAD(&cep->work_freelist);
+
+	kref_init(&cep->ref);
+	cep->state = SIW_EPSTATE_IDLE;
+	init_waitqueue_head(&cep->waitq);
+	spin_lock_init(&cep->lock);
+	cep->sdev = sdev;
+	cep->enhanced_rdma_conn_est = false;
+
+	/* make the endpoint visible on the device's list */
+	spin_lock_irqsave(&sdev->lock, flags);
+	list_add_tail(&cep->devq, &sdev->cep_list);
+	spin_unlock_irqrestore(&sdev->lock, flags);
+
+	siw_dbg_cep(cep, "new endpoint\n");
+
+	return cep;
+}
+
+/*
+ * Unlink and free every work element on the endpoint's free list.
+ * Takes no lock itself.
+ */
+static void siw_cm_free_work(struct siw_cep *cep)
+{
+	struct siw_cm_work *work, *next;
+
+	list_for_each_entry_safe(work, next, &cep->work_freelist, list) {
+		list_del(&work->list);
+		kfree(work);
+	}
+}
+
+/*
+ * Cancel the endpoint's MPA timer, if one is set.
+ *
+ * If the delayed work could still be cancelled, the endpoint reference
+ * taken for it is dropped and the timer object freed. In either case
+ * cep->mpa_timer is cleared.
+ */
+static void siw_cancel_mpatimer(struct siw_cep *cep)
+{
+	spin_lock_bh(&cep->lock);
+
+	if (!cep->mpa_timer)
+		goto unlock;
+
+	if (cancel_delayed_work(&cep->mpa_timer->work)) {
+		siw_cep_put(cep);
+		kfree(cep->mpa_timer); /* not needed again */
+	}
+	cep->mpa_timer = NULL;
+unlock:
+	spin_unlock_bh(&cep->lock);
+}
+
+/* Return @work to the free list of the endpoint it belongs to */
+static void siw_put_work(struct siw_cm_work *work)
+{
+	struct siw_cep *cep = work->cep;
+
+	INIT_LIST_HEAD(&work->list);
+
+	spin_lock_bh(&cep->lock);
+	list_add(&work->list, &cep->work_freelist);
+	spin_unlock_bh(&cep->lock);
+}
+
+/*
+ * Mark @cep as in use, sleeping until a current user releases it via
+ * siw_cep_set_free().
+ *
+ * The wait is interruptible, but a pending signal is flushed and the
+ * wait retried, so the function only returns once the endpoint has
+ * been claimed. The result of the wait itself is not needed.
+ */
+static void siw_cep_set_inuse(struct siw_cep *cep)
+{
+	unsigned long flags;
+retry:
+	spin_lock_irqsave(&cep->lock, flags);
+
+	if (cep->in_use) {
+		spin_unlock_irqrestore(&cep->lock, flags);
+		wait_event_interruptible(cep->waitq, !cep->in_use);
+		if (signal_pending(current))
+			flush_signals(current);
+		/* re-check under the lock: somebody else may have won */
+		goto retry;
+	}
+	cep->in_use = 1;
+	spin_unlock_irqrestore(&cep->lock, flags);
+}
+
+/*
+ * Release @cep after siw_cep_set_inuse() and wake up anybody waiting
+ * for it to become free.
+ */
+static void siw_cep_set_free(struct siw_cep *cep)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&cep->lock, irq_flags);
+	cep->in_use = 0;
+	spin_unlock_irqrestore(&cep->lock, irq_flags);
+
+	/* waiters re-check in_use under the lock */
+	wake_up(&cep->waitq);
+}
+
+/*
+ * kref release function of a connection endpoint (see siw_cep_put()).
+ * Frees the MPA private data buffer and any work elements left on the
+ * free list, unlinks the endpoint from its device and frees it.
+ */
+static void __siw_cep_dealloc(struct kref *ref)
+{
+ struct siw_cep *cep = container_of(ref, struct siw_cep, ref);
+ struct siw_device *sdev = cep->sdev;
+ unsigned long flags;
+
+ WARN_ON(cep->listen_cep);
+
+ /* kfree(NULL) is safe */
+ kfree(cep->mpa.pdata);
+ spin_lock_bh(&cep->lock);
+ if (!list_empty(&cep->work_freelist))
+ siw_cm_free_work(cep);
+ spin_unlock_bh(&cep->lock);
+
+ spin_lock_irqsave(&sdev->lock, flags);
+ list_del(&cep->devq);
+ spin_unlock_irqrestore(&sdev->lock, flags);
+
+ /* last use of cep: log before the memory is released */
+ siw_dbg_cep(cep, "free endpoint\n");
+ kfree(cep);
+}
+
+/*
+ * Take the first work element off the endpoint's free list.
+ * Returns NULL if the list is empty.
+ */
+static struct siw_cm_work *siw_get_work(struct siw_cep *cep)
+{
+	struct siw_cm_work *work;
+
+	spin_lock_bh(&cep->lock);
+	work = list_first_entry_or_null(&cep->work_freelist,
+					struct siw_cm_work, list);
+	if (work)
+		list_del_init(&work->list);
+	spin_unlock_bh(&cep->lock);
+
+	return work;
+}
+
+/*
+ * Pre-allocate @num work elements for @cep and put them on its free
+ * list.
+ *
+ * Returns 0 on success. On allocation failure all elements on the free
+ * list are released again and -ENOMEM is returned.
+ */
+static int siw_cm_alloc_work(struct siw_cep *cep, int num)
+{
+	struct siw_cm_work *work;
+
+	while (num--) {
+		work = kmalloc(sizeof(*work), GFP_KERNEL);
+		if (work) {
+			work->cep = cep;
+			INIT_LIST_HEAD(&work->list);
+			list_add(&work->list, &cep->work_freelist);
+			continue;
+		}
+		/* out of memory: undo what was queued so far */
+		if (!list_empty(&cep->work_freelist))
+			siw_cm_free_work(cep);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+/*
+ * siw_cm_upcall()
+ *
+ * Upcall to IWCM to inform about async connection events
+ *
+ * Builds an iw_cm_event for @reason/@status from the endpoint state and
+ * passes it to the event handler of the relevant cm_id. Returns what
+ * that handler returns.
+ */
+static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
+ int status)
+{
+ struct iw_cm_event event;
+ struct iw_cm_id *id;
+
+ memset(&event, 0, sizeof(event));
+ event.status = status;
+ event.event = reason;
+
+ /*
+ * A connect request is signalled on the listening endpoint's cm_id
+ * and carries the new endpoint as provider data.
+ */
+ if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
+ event.provider_data = cep;
+ id = cep->listen_cep->cm_id;
+ } else {
+ id = cep->cm_id;
+ }
+ /* Signal IRD and ORD */
+ if (reason == IW_CM_EVENT_ESTABLISHED ||
+ reason == IW_CM_EVENT_CONNECT_REPLY) {
+ /* Signal negotiated IRD/ORD values we will use */
+ event.ird = cep->ird;
+ event.ord = cep->ord;
+ } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
+ /* note: IRD and ORD are swapped for a connect request */
+ event.ird = cep->ord;
+ event.ord = cep->ird;
+ }
+ /* Signal private data and address information */
+ if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
+ reason == IW_CM_EVENT_CONNECT_REPLY) {
+ u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
+
+ if (pd_len) {
+ /*
+ * hand over MPA private data
+ */
+ event.private_data_len = pd_len;
+ event.private_data = cep->mpa.pdata;
+
+ /* Hide MPA V2 IRD/ORD control */
+ /* NOTE(review): assumes pd_len >= sizeof(struct mpa_v2_data) here - confirm the receive path guarantees it */
+ if (cep->enhanced_rdma_conn_est) {
+ event.private_data_len -=
+ sizeof(struct mpa_v2_data);
+ event.private_data +=
+ sizeof(struct mpa_v2_data);
+ }
+ }
+ getname_local(cep->sock, &event.local_addr);
+ getname_peer(cep->sock, &event.remote_addr);
+ }
+ siw_dbg_cep(cep, "[QP %u]: id 0x%p, reason=%d, status=%d\n",
+ cep->qp ? qp_id(cep->qp) : -1, id, reason, status);
+
+ return id->event_handler(id, &event);
+}
+
+/*
+ * siw_qp_cm_drop()
+ *
+ * Drops established LLP connection if present and not already
+ * scheduled for dropping. Called from user context, SQ workqueue
+ * or receive IRQ. Caller signals if socket can be immediately
+ * closed (basically, if not in IRQ).
+ *
+ * RX and TX of the QP are suspended in any case. With @schedule set,
+ * closing is deferred by queueing SIW_CM_WORK_CLOSE_LLP; otherwise the
+ * endpoint is torn down right here.
+ */
+void siw_qp_cm_drop(struct siw_qp *qp, int schedule)
+{
+ struct siw_cep *cep = qp->cep;
+
+ qp->rx_stream.rx_suspend = 1;
+ qp->tx_ctx.tx_suspend = 1;
+
+ if (!qp->cep)
+ return;
+
+ if (schedule) {
+ siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP);
+ } else {
+ /* serialize against other users of the endpoint */
+ siw_cep_set_inuse(cep);
+
+ if (cep->state == SIW_EPSTATE_CLOSED) {
+ siw_dbg_cep(cep, "already closed\n");
+ goto out;
+ }
+ siw_dbg_cep(cep, "immediate close, state %d\n", cep->state);
+
+ if (qp->term_info.valid)
+ siw_send_terminate(qp);
+
+ if (cep->cm_id) {
+ /* tell the IWCM, depending on how far the connection got */
+ switch (cep->state) {
+ case SIW_EPSTATE_AWAIT_MPAREP:
+ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+ -EINVAL);
+ break;
+
+ case SIW_EPSTATE_RDMA_MODE:
+ siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+ break;
+
+ case SIW_EPSTATE_IDLE:
+ case SIW_EPSTATE_LISTENING:
+ case SIW_EPSTATE_CONNECTING:
+ case SIW_EPSTATE_AWAIT_MPAREQ:
+ case SIW_EPSTATE_RECVD_MPAREQ:
+ case SIW_EPSTATE_CLOSED:
+ default:
+ break;
+ }
+ /* detach from the cm_id and drop the references involved */
+ cep->cm_id->rem_ref(cep->cm_id);
+ cep->cm_id = NULL;
+ siw_cep_put(cep);
+ }
+ cep->state = SIW_EPSTATE_CLOSED;
+
+ if (cep->sock) {
+ siw_socket_disassoc(cep->sock);
+ /*
+ * Immediately close socket
+ */
+ sock_release(cep->sock);
+ cep->sock = NULL;
+ }
+ if (cep->qp) {
+ cep->qp = NULL;
+ siw_qp_put(qp);
+ }
+out:
+ siw_cep_set_free(cep);
+ }
+}
+
+/*
+ * Drop a reference on @cep; the endpoint is released through
+ * __siw_cep_dealloc() when the last reference is gone. Warns if the
+ * reference count is already below one.
+ */
+void siw_cep_put(struct siw_cep *cep)
+{
+	struct kref *ref = &cep->ref;
+
+	WARN_ON(kref_read(ref) < 1);
+	kref_put(ref, __siw_cep_dealloc);
+}
+
+/* Take an additional reference on @cep */
+void siw_cep_get(struct siw_cep *cep)
+{
+	struct kref *ref = &cep->ref;
+
+	kref_get(ref);
+}
+
+/*
+ * Send an MPA Request/Reply frame on the endpoint's socket.
+ *
+ * The frame consists of the MPA header kept in cep->mpa.hdr, the MPA v2
+ * control data if enhanced connection establishment is in use, and
+ * @pd_len bytes of private data from @pdata.
+ *
+ * Expects @pd_len in host byte order. The private data length written
+ * to the header includes the v2 control data and is computed in a
+ * 16 bit variable, since the sum may exceed what fits into the u8
+ * argument.
+ *
+ * Returns 0 on success or the negative error of kernel_sendmsg().
+ */
+static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len)
+{
+	struct socket *s = cep->sock;
+	struct mpa_rr *rr = &cep->mpa.hdr;
+	struct kvec iov[3];
+	struct msghdr msg;
+	int rv;
+	int iovec_num = 0;
+	int mpa_len;
+	u16 hdr_pd_len = pd_len; /* length announced in the MPA header */
+
+	memset(&msg, 0, sizeof(msg));
+
+	iov[iovec_num].iov_base = rr;
+	iov[iovec_num].iov_len = sizeof(*rr);
+	mpa_len = sizeof(*rr);
+
+	if (cep->enhanced_rdma_conn_est) {
+		iovec_num++;
+		iov[iovec_num].iov_base = &cep->mpa.v2_ctrl;
+		iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl);
+		mpa_len += sizeof(cep->mpa.v2_ctrl);
+		/* v2 control data travels as part of the private data */
+		hdr_pd_len += sizeof(cep->mpa.v2_ctrl);
+	}
+	if (pd_len) {
+		iovec_num++;
+		iov[iovec_num].iov_base = (char *)pdata;
+		iov[iovec_num].iov_len = pd_len;
+		mpa_len += pd_len;
+	}
+	rr->params.pd_len = cpu_to_be16(hdr_pd_len);
+
+	rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len);
+
+	return rv < 0 ? rv : 0;
+}
+
+/*
+ * Receive MPA Request/Reply header.
+ *
+ * Returns 0 if complete MPA Request/Reply header including
+ * eventual private data was received. Returns -EAGAIN if
+ * header was partially received or negative error code otherwise.
+ *
+ * Progress is kept in cep->mpa.bytes_rcvd, so the function can be
+ * called again to continue a partially received frame.
+ *
+ * Context: May be called in process context only
+ */
+static int siw_recv_mpa_rr(struct siw_cep *cep)
+{
+ struct mpa_rr *hdr = &cep->mpa.hdr;
+ struct socket *s = cep->sock;
+ u16 pd_len;
+ int rcvd, to_rcv;
+
+ /* Stage 1: the fixed size header */
+ if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
+ rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd,
+ sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd,
+ 0);
+ if (rcvd <= 0)
+ return -ECONNABORTED;
+
+ cep->mpa.bytes_rcvd += rcvd;
+
+ if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr))
+ return -EAGAIN;
+
+ /* header complete: validate the announced private data length */
+ if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA)
+ return -EPROTO;
+ }
+ pd_len = be16_to_cpu(hdr->params.pd_len);
+
+ /*
+ * At least the MPA Request/Reply header (frame not including
+ * private data) has been received.
+ * Receive (or continue receiving) any private data.
+ */
+ to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr));
+
+ if (!to_rcv) {
+ /*
+ * We must have hdr->params.pd_len == 0 and thus received a
+ * complete MPA Request/Reply frame.
+ * Check against peer protocol violation.
+ */
+ u32 word;
+
+ /* non-blocking probe: any further data is a protocol error */
+ rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT);
+ if (rcvd == -EAGAIN)
+ return 0;
+
+ if (rcvd == 0) {
+ siw_dbg_cep(cep, "peer EOF\n");
+ return -EPIPE;
+ }
+ if (rcvd < 0) {
+ siw_dbg_cep(cep, "error: %d\n", rcvd);
+ return rcvd;
+ }
+ siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd);
+
+ return -EPROTO;
+ }
+
+ /*
+ * At this point, we must have hdr->params.pd_len != 0.
+ * A private data buffer gets allocated if hdr->params.pd_len != 0.
+ */
+ if (!cep->mpa.pdata) {
+ /* 4 spare bytes: room to detect data beyond the private data */
+ cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
+ if (!cep->mpa.pdata)
+ return -ENOMEM;
+ }
+ /* Stage 2: private data; ask for 4 bytes more than expected */
+ rcvd = ksock_recv(
+ s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
+ to_rcv + 4, MSG_DONTWAIT);
+
+ if (rcvd < 0)
+ return rcvd;
+
+ /* peer sent more than the announced private data */
+ if (rcvd > to_rcv)
+ return -EPROTO;
+
+ cep->mpa.bytes_rcvd += rcvd;
+
+ if (to_rcv == rcvd) {
+ siw_dbg_cep(cep, "%d bytes private data received\n", pd_len);
+ return 0;
+ }
+ return -EAGAIN;
+}
+
+/*
+ * siw_proc_mpareq()
+ *
+ * Read MPA Request from socket and signal new connection to IWCM
+ * if success. Caller must hold lock on corresponding listening CEP.
+ *
+ * Returns 0 if the connect request was signalled to the IWCM. Any
+ * non-zero result of siw_recv_mpa_rr() (including -EAGAIN for an
+ * incomplete request) is returned unchanged. Returns -EPROTO for an
+ * unsupported MPA revision or a wrong key, -EOPNOTSUPP if the request
+ * got rejected with an MPA reply, or the error of siw_cm_upcall().
+ */
+static int siw_proc_mpareq(struct siw_cep *cep)
+{
+ struct mpa_rr *req;
+ int version, rv;
+ u16 pd_len;
+
+ rv = siw_recv_mpa_rr(cep);
+ if (rv)
+ return rv;
+
+ req = &cep->mpa.hdr;
+
+ version = __mpa_rr_revision(req->params.bits);
+ pd_len = be16_to_cpu(req->params.pd_len);
+
+ if (version > MPA_REVISION_2)
+ /* allow for 0, 1, and 2 only */
+ return -EPROTO;
+
+ if (memcmp(req->key, MPA_KEY_REQ, 16))
+ return -EPROTO;
+
+ /*
+ * Prepare for sending MPA reply: the received header in
+ * cep->mpa.hdr is reused for the reply from here on.
+ */
+ memcpy(req->key, MPA_KEY_REP, 16);
+
+ if (version == MPA_REVISION_2 &&
+ (req->params.bits & MPA_RR_FLAG_ENHANCED)) {
+ /*
+ * MPA version 2 must signal IRD/ORD values and P2P mode
+ * in private data if header flag MPA_RR_FLAG_ENHANCED
+ * is set.
+ */
+ if (pd_len < sizeof(struct mpa_v2_data))
+ goto reject_conn;
+
+ cep->enhanced_rdma_conn_est = true;
+ }
+
+ /* MPA Markers: currently not supported. Marker TX to be added. */
+ if (req->params.bits & MPA_RR_FLAG_MARKERS)
+ goto reject_conn;
+
+ if (req->params.bits & MPA_RR_FLAG_CRC) {
+ /*
+ * RFC 5044, page 27: CRC MUST be used if peer requests it.
+ * siw specific: 'mpa_crc_strict' parameter to reject
+ * connection with CRC if local CRC off enforced by
+ * 'mpa_crc_strict' module parameter.
+ */
+ if (!mpa_crc_required && mpa_crc_strict)
+ goto reject_conn;
+
+ /*
+ * Enable CRC if requested by module parameter
+ *
+ * NOTE(review): MPA_RR_FLAG_CRC is already set whenever this
+ * branch is entered, so the OR below leaves the bits unchanged.
+ */
+ if (mpa_crc_required)
+ req->params.bits |= MPA_RR_FLAG_CRC;
+ }
+ if (cep->enhanced_rdma_conn_est) {
+ /* pd_len >= sizeof(struct mpa_v2_data) was checked above */
+ struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata;
+
+ /*
+ * Peer requested ORD becomes requested local IRD,
+ * peer requested IRD becomes requested local ORD.
+ * IRD and ORD get limited by global maximum values.
+ */
+ cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
+ cep->ord = min(cep->ord, SIW_MAX_ORD_QP);
+ cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
+ cep->ird = min(cep->ird, SIW_MAX_IRD_QP);
+
+ /* May get overwritten by locally negotiated values */
+ cep->mpa.v2_ctrl.ird = htons(cep->ird);
+ cep->mpa.v2_ctrl.ord = htons(cep->ord);
+
+ /*
+ * Support for peer sent zero length Write or Read to
+ * let local side enter RTS. Writes are preferred.
+ * Sends would require pre-posting a Receive and are
+ * not supported.
+ * Propose zero length Write if none of Read and Write
+ * is indicated.
+ */
+ if (v2->ird & MPA_V2_PEER_TO_PEER) {
+ cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
+
+ if (v2->ord & MPA_V2_RDMA_WRITE_RTR)
+ cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
+ else if (v2->ord & MPA_V2_RDMA_READ_RTR)
+ cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR;
+ else
+ cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
+ }
+ }
+
+ cep->state = SIW_EPSTATE_RECVD_MPAREQ;
+
+ /* Keep reference until IWCM accepts/rejects */
+ siw_cep_get(cep);
+ rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
+ if (rv)
+ siw_cep_put(cep);
+
+ return rv;
+
+reject_conn:
+ /*
+ * Reject: answer with the reused header (key already switched to
+ * MPA_KEY_REP), the REJECT flag set, markers cleared and without
+ * any private data.
+ */
+ siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n",
+ req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
+ mpa_crc_required, mpa_crc_strict,
+ req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
+
+ req->params.bits &= ~MPA_RR_FLAG_MARKERS;
+ req->params.bits |= MPA_RR_FLAG_REJECT;
+
+ if (!mpa_crc_required && mpa_crc_strict)
+ req->params.bits &= ~MPA_RR_FLAG_CRC;
+
+ /* Received private data is not needed any longer */
+ if (pd_len)
+ kfree(cep->mpa.pdata);
+
+ cep->mpa.pdata = NULL;
+
+ /* Best effort: the result of sending the reject is not evaluated */
+ siw_send_mpareqrep(cep, NULL, 0);
+
+ return -EOPNOTSUPP;
+}
+
+/*
+ * siw_proc_mpareply()
+ *
+ * Read the MPA Reply from the socket, validate it against the request
+ * sent before and, if acceptable, move the QP to RTS and signal
+ * IW_CM_EVENT_CONNECT_REPLY to the IWCM.
+ *
+ * With enhanced connection establishment the reply must carry at least
+ * a struct mpa_v2_data as private data. A shorter reply is handled as
+ * protocol failure before cep->mpa.pdata gets interpreted, since
+ * siw_recv_mpa_rr() only allocates that buffer for private data
+ * announced by the peer.
+ *
+ * Returns 0 on success and -EAGAIN if the reply is not yet completely
+ * received; in the latter case the MPA timer stays armed and nothing is
+ * reported to the IWCM, since the handshake is still in progress.
+ * Any other negative return code comes with a failing
+ * IW_CM_EVENT_CONNECT_REPLY upcall.
+ */
+static int siw_proc_mpareply(struct siw_cep *cep)
+{
+	struct siw_qp_attrs qp_attrs;
+	enum siw_qp_attr_mask qp_attr_mask;
+	struct siw_qp *qp = cep->qp;
+	struct mpa_rr *rep;
+	int rv;
+	u16 rep_ord;
+	u16 rep_ird;
+	bool ird_insufficient = false;
+	enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR;
+
+	rv = siw_recv_mpa_rr(cep);
+	if (rv != -EAGAIN)
+		siw_cancel_mpatimer(cep);
+	if (rv)
+		goto out_err;
+
+	rep = &cep->mpa.hdr;
+
+	if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) {
+		/* allow for 0, 1, and 2 only */
+		rv = -EPROTO;
+		goto out_err;
+	}
+	if (memcmp(rep->key, MPA_KEY_REP, 16)) {
+		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA,
+				   LLP_ECODE_INVALID_REQ_RESP, 0);
+		siw_send_terminate(qp);
+		rv = -EPROTO;
+		goto out_err;
+	}
+	if (rep->params.bits & MPA_RR_FLAG_REJECT) {
+		siw_dbg_cep(cep, "got mpa reject\n");
+		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
+
+		return -ECONNRESET;
+	}
+	if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) {
+		siw_dbg_cep(cep, "peer allows GSO on TX\n");
+		qp->tx_ctx.gso_seg_limit = 0;
+	}
+	if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
+	    (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) ||
+	    (mpa_crc_strict && !mpa_crc_required &&
+	     (rep->params.bits & MPA_RR_FLAG_CRC))) {
+		siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n",
+			    rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
+			    mpa_crc_required, mpa_crc_strict,
+			    rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
+
+		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
+
+		return -EINVAL;
+	}
+	if (cep->enhanced_rdma_conn_est) {
+		struct mpa_v2_data *v2;
+
+		if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 ||
+		    !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) {
+			/*
+			 * Protocol failure: The responder MUST reply with
+			 * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED.
+			 */
+			siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n",
+				    __mpa_rr_revision(rep->params.bits),
+				    rep->params.bits & MPA_RR_FLAG_ENHANCED ?
+					    1 :
+					    0);
+
+			siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+				      -ECONNRESET);
+			return -EINVAL;
+		}
+		if (be16_to_cpu(rep->params.pd_len) <
+		    sizeof(struct mpa_v2_data)) {
+			/*
+			 * Protocol failure: an enhanced reply must carry the
+			 * IRD/ORD control data as private data. Without this
+			 * check, cep->mpa.pdata may be unset or too short to
+			 * be read as struct mpa_v2_data.
+			 */
+			siw_dbg_cep(cep, "mpa reply error: pd_len %d\n",
+				    be16_to_cpu(rep->params.pd_len));
+
+			siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+				      -ECONNRESET);
+			return -EINVAL;
+		}
+		v2 = (struct mpa_v2_data *)cep->mpa.pdata;
+		rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
+		rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
+
+		if (cep->ird < rep_ord &&
+		    (relaxed_ird_negotiation == false ||
+		     rep_ord > cep->sdev->attrs.max_ird)) {
+			siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n",
+				    cep->ird, rep_ord,
+				    cep->sdev->attrs.max_ord);
+			ird_insufficient = true;
+		}
+		if (cep->ord > rep_ird && relaxed_ird_negotiation == false) {
+			siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord,
+				    rep_ird);
+			ird_insufficient = true;
+		}
+		/*
+		 * Always report negotiated peer values to user,
+		 * even if IRD/ORD negotiation failed
+		 */
+		cep->ird = rep_ord;
+		cep->ord = rep_ird;
+
+		if (ird_insufficient) {
+			/*
+			 * If the initiator IRD is insufficient for the
+			 * responder ORD, send a TERM.
+			 */
+			siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+					   LLP_ETYPE_MPA,
+					   LLP_ECODE_INSUFFICIENT_IRD, 0);
+			siw_send_terminate(qp);
+			rv = -ENOMEM;
+			goto out_err;
+		}
+		if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER)
+			mpa_p2p_mode =
+				cep->mpa.v2_ctrl_req.ord &
+				(MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR);
+
+		/*
+		 * Check if we requested P2P mode, and if peer agrees
+		 */
+		if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
+			if ((mpa_p2p_mode & v2->ord) == 0) {
+				/*
+				 * We requested RTR mode(s), but the peer
+				 * did not pick any mode we support.
+				 */
+				siw_dbg_cep(cep,
+					    "rtr mode:  req %2x, got %2x\n",
+					    mpa_p2p_mode,
+					    v2->ord & (MPA_V2_RDMA_WRITE_RTR |
+						       MPA_V2_RDMA_READ_RTR));
+
+				siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+						   LLP_ETYPE_MPA,
+						   LLP_ECODE_NO_MATCHING_RTR,
+						   0);
+				siw_send_terminate(qp);
+				rv = -EPROTO;
+				goto out_err;
+			}
+			mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR |
+						  MPA_V2_RDMA_READ_RTR);
+		}
+	}
+	memset(&qp_attrs, 0, sizeof(qp_attrs));
+
+	if (rep->params.bits & MPA_RR_FLAG_CRC)
+		qp_attrs.flags = SIW_MPA_CRC;
+
+	qp_attrs.irq_size = cep->ird;
+	qp_attrs.orq_size = cep->ord;
+	qp_attrs.sk = cep->sock;
+	qp_attrs.state = SIW_QP_STATE_RTS;
+
+	qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
+		       SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA;
+
+	/* Move socket RX/TX under QP control */
+	down_write(&qp->state_lock);
+	if (qp->attrs.state > SIW_QP_STATE_RTR) {
+		rv = -EINVAL;
+		up_write(&qp->state_lock);
+		goto out_err;
+	}
+	rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask);
+
+	siw_qp_socket_assoc(cep, qp);
+
+	up_write(&qp->state_lock);
+
+	/* Send extra RDMA frame to trigger peer RTS if negotiated */
+	if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
+		rv = siw_qp_mpa_rts(qp, mpa_p2p_mode);
+		if (rv)
+			goto out_err;
+	}
+	if (!rv) {
+		rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
+		if (!rv)
+			cep->state = SIW_EPSTATE_RDMA_MODE;
+
+		return 0;
+	}
+
+out_err:
+	/* A partially received reply is no failure: wait for more data */
+	if (rv != -EAGAIN)
+		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
+
+	return rv;
+}
+
+/*
+ * siw_accept_newconn - accept an incoming pending connection
+ *
+ * @cep: listening CEP whose socket has a connection to accept
+ *
+ * Allocates a new CEP, accepts the pending TCP connection on it, starts
+ * the MPA timeout and, if data is already queued on the new socket,
+ * immediately tries to process the MPA request.
+ *
+ * On failure everything set up so far is released again; nothing is
+ * reported to the caller.
+ */
+static void siw_accept_newconn(struct siw_cep *cep)
+{
+	struct socket *s = cep->sock;
+	struct socket *new_s = NULL;
+	struct siw_cep *new_cep = NULL;
+	int rv = 0; /* debug only. should disappear */
+
+	if (cep->state != SIW_EPSTATE_LISTENING)
+		goto error;
+
+	new_cep = siw_cep_alloc(cep->sdev);
+	if (!new_cep)
+		goto error;
+
+	/*
+	 * 4: Allocate a sufficient number of work elements
+	 * to allow concurrent handling of local + peer close
+	 * events, MPA header processing + MPA timeout.
+	 */
+	if (siw_cm_alloc_work(new_cep, 4) != 0)
+		goto error;
+
+	/*
+	 * Copy saved socket callbacks from listening CEP
+	 * and assign new socket with new CEP
+	 */
+	new_cep->sk_state_change = cep->sk_state_change;
+	new_cep->sk_data_ready = cep->sk_data_ready;
+	new_cep->sk_write_space = cep->sk_write_space;
+	new_cep->sk_error_report = cep->sk_error_report;
+
+	rv = kernel_accept(s, &new_s, O_NONBLOCK);
+	if (rv != 0) {
+		/*
+		 * Connection already aborted by peer..?
+		 */
+		siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv);
+		goto error;
+	}
+	new_cep->sock = new_s;
+	/* Reference held on behalf of the socket's sk_user_data pointer */
+	siw_cep_get(new_cep);
+	new_s->sk->sk_user_data = new_cep;
+
+	siw_dbg_cep(cep, "listen socket 0x%p, new 0x%p\n", s, new_s);
+
+	if (siw_tcp_nagle == false) {
+		int val = 1;
+
+		rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY,
+				       (char *)&val, sizeof(val));
+		if (rv) {
+			siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv);
+			goto error;
+		}
+	}
+	new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;
+
+	rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT);
+	if (rv)
+		goto error;
+	/*
+	 * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep.
+	 */
+	new_cep->listen_cep = cep;
+	siw_cep_get(cep);
+
+	if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
+		/*
+		 * MPA REQ already queued
+		 */
+		siw_dbg_cep(cep, "immediate mpa request\n");
+
+		siw_cep_set_inuse(new_cep);
+		rv = siw_proc_mpareq(new_cep);
+		siw_cep_set_free(new_cep);
+
+		if (rv != -EAGAIN) {
+			/* Listener reference is needed while waiting only */
+			siw_cep_put(cep);
+			new_cep->listen_cep = NULL;
+			if (rv)
+				goto error;
+		}
+	}
+	return;
+
+error:
+	if (new_cep) {
+		/*
+		 * Clear the socket pointer while a reference is still
+		 * held: new_cep must not be written to after the
+		 * references are dropped below.
+		 * NOTE(review): assumes siw_socket_disassoc() drops the
+		 * reference taken for sk_user_data and does not rely on
+		 * new_cep->sock - confirm against its definition.
+		 */
+		if (new_s)
+			new_cep->sock = NULL;
+
+		siw_cep_put(new_cep);
+	}
+	if (new_s) {
+		siw_socket_disassoc(new_s);
+		sock_release(new_s);
+	}
+	siw_dbg_cep(cep, "error %d\n", rv);
+}
+
+/*
+ * siw_cm_work_handler()
+ *
+ * Workqueue handler for all connection management work queued via
+ * siw_cm_queue_work(). Runs with the CEP marked in use and dispatches
+ * on the work type:
+ *
+ * SIW_CM_WORK_ACCEPT:      accept a pending connection on a listener
+ * SIW_CM_WORK_READ_MPAHDR: (continue to) read MPA request or reply
+ * SIW_CM_WORK_CLOSE_LLP:   QP scheduled close of the connection
+ * SIW_CM_WORK_PEER_CLOSE:  socket reported close
+ * SIW_CM_WORK_MPATIMEOUT:  MPA handshake timed out
+ *
+ * If the work results in the end of the connection, QP, socket and
+ * IWCM id get detached from the CEP. Finally the work element is
+ * returned and the CEP reference taken when queueing it is dropped.
+ */
+static void siw_cm_work_handler(struct work_struct *w)
+{
+	struct siw_cm_work *work;
+	struct siw_cep *cep;
+	int release_cep = 0, rv = 0;
+
+	work = container_of(w, struct siw_cm_work, work.work);
+	cep = work->cep;
+
+	siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n",
+		    cep->qp ? qp_id(cep->qp) : -1, work->type, cep->state);
+
+	siw_cep_set_inuse(cep);
+
+	switch (work->type) {
+	case SIW_CM_WORK_ACCEPT:
+		siw_accept_newconn(cep);
+		break;
+
+	case SIW_CM_WORK_READ_MPAHDR:
+		if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
+			if (cep->listen_cep) {
+				siw_cep_set_inuse(cep->listen_cep);
+
+				if (cep->listen_cep->state ==
+				    SIW_EPSTATE_LISTENING)
+					rv = siw_proc_mpareq(cep);
+				else
+					rv = -EFAULT;
+
+				siw_cep_set_free(cep->listen_cep);
+
+				if (rv != -EAGAIN) {
+					siw_cep_put(cep->listen_cep);
+					cep->listen_cep = NULL;
+					if (rv)
+						siw_cep_put(cep);
+				}
+			}
+		} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
+			rv = siw_proc_mpareply(cep);
+		} else {
+			/*
+			 * CEP already moved out of MPA handshake.
+			 * any connection management already done.
+			 * silently ignore the mpa packet.
+			 */
+			if (cep->state == SIW_EPSTATE_RDMA_MODE) {
+				cep->sock->sk->sk_data_ready(cep->sock->sk);
+				siw_dbg_cep(cep, "already in RDMA mode");
+			} else {
+				siw_dbg_cep(cep, "out of state: %d\n",
+					    cep->state);
+			}
+		}
+		/*
+		 * Error codes are negative: -EAGAIN signals a not yet
+		 * complete MPA header and must not close the endpoint.
+		 */
+		if (rv && rv != -EAGAIN)
+			release_cep = 1;
+		break;
+
+	case SIW_CM_WORK_CLOSE_LLP:
+		/*
+		 * QP scheduled LLP close
+		 */
+		if (cep->qp && cep->qp->term_info.valid)
+			siw_send_terminate(cep->qp);
+
+		if (cep->cm_id)
+			siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+
+		release_cep = 1;
+		break;
+
+	case SIW_CM_WORK_PEER_CLOSE:
+		if (cep->cm_id) {
+			if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
+				/*
+				 * MPA reply not received, but connection drop
+				 */
+				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+					      -ECONNRESET);
+			} else if (cep->state == SIW_EPSTATE_RDMA_MODE) {
+				/*
+				 * NOTE: IW_CM_EVENT_DISCONNECT is given just
+				 *       to transition IWCM into CLOSING.
+				 */
+				siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
+				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
+			}
+			/*
+			 * for other states there is no connection
+			 * known to the IWCM.
+			 */
+		} else {
+			if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) {
+				/*
+				 * Wait for the ulp/CM to call accept/reject
+				 */
+				siw_dbg_cep(cep,
+					    "mpa req recvd, wait for ULP\n");
+			} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
+				/*
+				 * Socket close before MPA request received.
+				 */
+				siw_dbg_cep(cep, "no mpareq: drop listener\n");
+				siw_cep_put(cep->listen_cep);
+				cep->listen_cep = NULL;
+			}
+		}
+		release_cep = 1;
+		break;
+
+	case SIW_CM_WORK_MPATIMEOUT:
+		/* This work element is the timer: nothing left to cancel */
+		cep->mpa_timer = NULL;
+
+		if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
+			/*
+			 * MPA request timed out:
+			 * Hide any partially received private data and signal
+			 * timeout
+			 */
+			cep->mpa.hdr.params.pd_len = 0;
+
+			if (cep->cm_id)
+				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
+					      -ETIMEDOUT);
+			release_cep = 1;
+
+		} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
+			/*
+			 * No MPA request received after peer TCP stream setup.
+			 */
+			if (cep->listen_cep) {
+				siw_cep_put(cep->listen_cep);
+				cep->listen_cep = NULL;
+			}
+			release_cep = 1;
+		}
+		break;
+
+	default:
+		WARN(1, "Undefined CM work type: %d\n", work->type);
+	}
+	if (release_cep) {
+		siw_dbg_cep(cep,
+			    "release: timer=%s, QP[%u], id 0x%p\n",
+			    cep->mpa_timer ? "y" : "n",
+			    cep->qp ? qp_id(cep->qp) : -1, cep->cm_id);
+
+		siw_cancel_mpatimer(cep);
+
+		cep->state = SIW_EPSTATE_CLOSED;
+
+		if (cep->qp) {
+			struct siw_qp *qp = cep->qp;
+			/*
+			 * Serialize a potential race with application
+			 * closing the QP and calling siw_qp_cm_drop()
+			 */
+			siw_qp_get(qp);
+			siw_cep_set_free(cep);
+
+			siw_qp_llp_close(qp);
+			siw_qp_put(qp);
+
+			siw_cep_set_inuse(cep);
+			cep->qp = NULL;
+			siw_qp_put(qp);
+		}
+		if (cep->sock) {
+			siw_socket_disassoc(cep->sock);
+			sock_release(cep->sock);
+			cep->sock = NULL;
+		}
+		if (cep->cm_id) {
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+			siw_cep_put(cep);
+		}
+	}
+	siw_cep_set_free(cep);
+	siw_put_work(work);
+	/* Drop the reference siw_cm_queue_work() took for this work */
+	siw_cep_put(cep);
+}
+
+/* Workqueue on which siw_cm_queue_work() queues the delayed CM work */
+static struct workqueue_struct *siw_cm_wq;
+
+/*
+ * siw_cm_queue_work()
+ *
+ * Fetch a work element of @cep, set it up for work of kind @type and
+ * queue it to the CM workqueue. A CEP reference is taken for the
+ * queued work. Work of type SIW_CM_WORK_MPATIMEOUT is remembered as
+ * the CEP's MPA timer and gets delayed by a timeout selected by the
+ * current CEP state. All other work is queued without delay.
+ *
+ * Returns 0 on success or -ENOMEM if no work element is available.
+ */
+int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type)
+{
+	unsigned long timeout = 0;
+	struct siw_cm_work *w;
+
+	w = siw_get_work(cep);
+	if (!w) {
+		siw_dbg_cep(cep, "failed with no work available\n");
+		return -ENOMEM;
+	}
+	w->type = type;
+	w->cep = cep;
+
+	siw_cep_get(cep);
+
+	INIT_DELAYED_WORK(&w->work, siw_cm_work_handler);
+
+	if (type == SIW_CM_WORK_MPATIMEOUT) {
+		cep->mpa_timer = w;
+
+		timeout = cep->state == SIW_EPSTATE_AWAIT_MPAREP ?
+				  MPAREQ_TIMEOUT :
+				  MPAREP_TIMEOUT;
+	}
+	siw_dbg_cep(cep, "[QP %u]: work type: %d, work 0x%p, timeout %lu\n",
+		    cep->qp ? qp_id(cep->qp) : -1, type, w, timeout);
+
+	queue_delayed_work(siw_cm_wq, &w->work, timeout);
+
+	return 0;
+}
+
+/*
+ * siw_cm_llp_data_ready()
+ *
+ * Data arrived on socket @sk. While the CEP waits for an MPA request
+ * or reply, reading the MPA header gets scheduled as CM work. Nothing
+ * is done for a CEP in RDMA mode or in listening state; any other
+ * state is logged as unexpected.
+ *
+ * Runs under the socket's callback read lock. Warns if no CEP is
+ * attached to the socket.
+ */
+static void siw_cm_llp_data_ready(struct sock *sk)
+{
+	struct siw_cep *cep;
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (cep) {
+		siw_dbg_cep(cep, "state: %d\n", cep->state);
+
+		switch (cep->state) {
+		case SIW_EPSTATE_AWAIT_MPAREQ:
+		case SIW_EPSTATE_AWAIT_MPAREP:
+			/* MPA handshake in progress: go read the header */
+			siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR);
+			break;
+
+		case SIW_EPSTATE_RDMA_MODE:
+		case SIW_EPSTATE_LISTENING:
+			/* Nothing to do for the CM */
+			break;
+
+		default:
+			siw_dbg_cep(cep, "unexpected data, state %d\n",
+				    cep->state);
+			break;
+		}
+	} else {
+		WARN_ON(1);
+	}
+	read_unlock(&sk->sk_callback_lock);
+}
+
+/*
+ * siw_cm_llp_write_space()
+ *
+ * Only logs the state of the CEP attached to @sk, if there is one.
+ */
+static void siw_cm_llp_write_space(struct sock *sk)
+{
+	struct siw_cep *cep = sk_to_cep(sk);
+
+	if (!cep)
+		return;
+
+	siw_dbg_cep(cep, "state: %d\n", cep->state);
+}
+
+/*
+ * siw_cm_llp_error_report()
+ *
+ * Log socket error, socket state and CEP state, then hand over to the
+ * error report callback saved in the CEP. Does nothing if no CEP is
+ * attached to @sk.
+ */
+static void siw_cm_llp_error_report(struct sock *sk)
+{
+	struct siw_cep *cep = sk_to_cep(sk);
+
+	if (!cep)
+		return;
+
+	siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n",
+		    sk->sk_err, sk->sk_state, cep->state);
+	cep->sk_error_report(sk);
+}
+
+/*
+ * siw_cm_llp_state_change()
+ *
+ * Socket @sk changed its TCP state. Schedules accepting a connection
+ * for TCP_ESTABLISHED and peer close handling for TCP_CLOSE and
+ * TCP_CLOSE_WAIT. Afterwards the state change callback saved in the
+ * CEP is called, outside of the socket's callback lock.
+ *
+ * Returns silently if the socket has no CEP attached anymore.
+ */
+static void siw_cm_llp_state_change(struct sock *sk)
+{
+	void (*saved_state_change)(struct sock *s);
+	struct siw_cep *cep;
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (!cep) {
+		/* endpoint already disassociated */
+		read_unlock(&sk->sk_callback_lock);
+		return;
+	}
+	/* Fetch under the lock, call after dropping it */
+	saved_state_change = cep->sk_state_change;
+
+	siw_dbg_cep(cep, "state: %d\n", cep->state);
+
+	switch (sk->sk_state) {
+	case TCP_ESTABLISHED:
+		/*
+		 * handle accepting socket as special case where only
+		 * new connection is possible
+		 */
+		siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT);
+		break;
+
+	case TCP_CLOSE:
+	case TCP_CLOSE_WAIT:
+		/* Suspend QP transmit before the close gets processed */
+		if (cep->qp)
+			cep->qp->tx_ctx.tx_suspend = 1;
+		siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE);
+		break;
+
+	default:
+		siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state);
+	}
+	read_unlock(&sk->sk_callback_lock);
+	saved_state_change(sk);
+}
+
+/*
+ * kernel_bindconnect()
+ *
+ * Enable SO_REUSEADDR on socket @s, bind it to @laddr and connect it
+ * to @raddr. The address length used for both bind and connect is
+ * derived from the address family of @laddr.
+ *
+ * Returns 0 on success or the negative error of the failing step.
+ */
+static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
+			      struct sockaddr *raddr)
+{
+	size_t addr_len;
+	int reuse = 1;
+	int rv;
+
+	if (laddr->sa_family == AF_INET)
+		addr_len = sizeof(struct sockaddr_in);
+	else
+		addr_len = sizeof(struct sockaddr_in6);
+
+	/*
+	 * Make address available again asap.
+	 */
+	rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&reuse,
+			       sizeof(reuse));
+	if (rv < 0)
+		return rv;
+
+	rv = s->ops->bind(s, laddr, addr_len);
+	if (rv < 0)
+		return rv;
+
+	rv = s->ops->connect(s, raddr, addr_len, 0);
+	if (rv < 0)
+		return rv;
+
+	return 0;
+}
+
+/*
+ * siw_connect - active side connection setup
+ *
+ * @id:		connection management id of the connection to set up
+ * @params:	connection parameters: QP number, IRD/ORD and private data
+ *
+ * Creates a TCP socket, connects it (blocking) to the remote address,
+ * allocates a CEP, associates it with QP, @id and socket, sends the
+ * MPA request and starts the MPA timeout. The MPA reply is processed
+ * later from CM work.
+ *
+ * Returns 0 if the MPA request was sent and the timeout was started,
+ * -EINVAL for too much private data or an unknown QP, -ENOMEM if
+ * IRD/ORD exceed the device limits or on allocation failure,
+ * -EAFNOSUPPORT for address families other than AF_INET/AF_INET6, or
+ * the error of the failing socket operation.
+ */
+int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+	struct siw_device *sdev = to_siw_dev(id->device);
+	struct siw_qp *qp;
+	struct siw_cep *cep = NULL;
+	struct socket *s = NULL;
+	struct sockaddr *laddr = (struct sockaddr *)&id->local_addr,
+			*raddr = (struct sockaddr *)&id->remote_addr;
+	bool p2p_mode = peer_to_peer, v4 = true;
+	u16 pd_len = params->private_data_len;
+	int version = mpa_version, rv;
+
+	if (pd_len > MPA_MAX_PRIVDATA)
+		return -EINVAL;
+
+	if (params->ird > sdev->attrs.max_ird ||
+	    params->ord > sdev->attrs.max_ord)
+		return -ENOMEM;
+
+	if (laddr->sa_family == AF_INET6)
+		v4 = false;
+	else if (laddr->sa_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	/*
+	 * Respect any iwarp port mapping: Use mapped remote address
+	 * if valid. Local address must not be mapped, since siw
+	 * uses kernel TCP stack.
+	 */
+	if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) ||
+	     to_sockaddr_in6(id->remote_addr).sin6_port != 0)
+		raddr = (struct sockaddr *)&id->m_remote_addr;
+
+	qp = siw_qp_id2obj(sdev, params->qpn);
+	if (!qp) {
+		WARN(1, "[QP %u] does not exist\n", params->qpn);
+		rv = -EINVAL;
+		goto error;
+	}
+	if (v4)
+		siw_dbg_qp(qp,
+			   "id 0x%p, pd_len %d, laddr %pI4 %d, raddr %pI4 %d\n",
+			   id, pd_len,
+			   &((struct sockaddr_in *)(laddr))->sin_addr,
+			   ntohs(((struct sockaddr_in *)(laddr))->sin_port),
+			   &((struct sockaddr_in *)(raddr))->sin_addr,
+			   ntohs(((struct sockaddr_in *)(raddr))->sin_port));
+	else
+		siw_dbg_qp(qp,
+			   "id 0x%p, pd_len %d, laddr %pI6 %d, raddr %pI6 %d\n",
+			   id, pd_len,
+			   &((struct sockaddr_in6 *)(laddr))->sin6_addr,
+			   ntohs(((struct sockaddr_in6 *)(laddr))->sin6_port),
+			   &((struct sockaddr_in6 *)(raddr))->sin6_addr,
+			   ntohs(((struct sockaddr_in6 *)(raddr))->sin6_port));
+
+	rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
+	if (rv < 0)
+		goto error;
+
+	/*
+	 * NOTE: For simplification, connect() is called in blocking
+	 * mode. Might be reconsidered for async connection setup at
+	 * TCP level.
+	 */
+	rv = kernel_bindconnect(s, laddr, raddr);
+	if (rv != 0) {
+		siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
+		goto error;
+	}
+	if (siw_tcp_nagle == false) {
+		int val = 1;
+
+		rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val,
+				       sizeof(val));
+		if (rv) {
+			siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv);
+			goto error;
+		}
+	}
+	cep = siw_cep_alloc(sdev);
+	if (!cep) {
+		rv = -ENOMEM;
+		goto error;
+	}
+	siw_cep_set_inuse(cep);
+
+	/* Associate QP with CEP */
+	siw_cep_get(cep);
+	qp->cep = cep;
+
+	/* siw_qp_get(qp) already done by QP lookup */
+	cep->qp = qp;
+
+	id->add_ref(id);
+	cep->cm_id = id;
+
+	/*
+	 * 4: Allocate a sufficient number of work elements
+	 * to allow concurrent handling of local + peer close
+	 * events, MPA header processing + MPA timeout.
+	 */
+	rv = siw_cm_alloc_work(cep, 4);
+	if (rv != 0) {
+		rv = -ENOMEM;
+		goto error;
+	}
+	cep->ird = params->ird;
+	cep->ord = params->ord;
+
+	if (p2p_mode && cep->ord == 0)
+		cep->ord = 1;
+
+	cep->state = SIW_EPSTATE_CONNECTING;
+
+	/*
+	 * Associate CEP with socket
+	 */
+	siw_cep_socket_assoc(cep, s);
+
+	cep->state = SIW_EPSTATE_AWAIT_MPAREP;
+
+	/*
+	 * Set MPA Request bits: CRC if required, no MPA Markers,
+	 * MPA Rev. according to module parameter 'mpa_version', Key 'Request'.
+	 */
+	cep->mpa.hdr.params.bits = 0;
+	if (version > MPA_REVISION_2) {
+		pr_warn("Setting MPA version to %u\n", MPA_REVISION_2);
+		version = MPA_REVISION_2;
+		/* Adjust also module parameter */
+		mpa_version = MPA_REVISION_2;
+	}
+	__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);
+
+	if (try_gso)
+		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;
+
+	if (mpa_crc_required)
+		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;
+
+	/*
+	 * If MPA version == 2:
+	 * o Include ORD and IRD.
+	 * o Indicate peer-to-peer mode, if required by module
+	 *   parameter 'peer_to_peer'.
+	 */
+	if (version == MPA_REVISION_2) {
+		cep->enhanced_rdma_conn_est = true;
+		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED;
+
+		cep->mpa.v2_ctrl.ird = htons(cep->ird);
+		cep->mpa.v2_ctrl.ord = htons(cep->ord);
+
+		if (p2p_mode) {
+			cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
+			cep->mpa.v2_ctrl.ord |= rtr_type;
+		}
+		/* Remember own P2P mode requested */
+		cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird;
+		cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord;
+	}
+	memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16);
+
+	rv = siw_send_mpareqrep(cep, params->private_data, pd_len);
+	/*
+	 * Reset private data.
+	 */
+	cep->mpa.hdr.params.pd_len = 0;
+
+	if (rv >= 0) {
+		rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT);
+		if (!rv) {
+			siw_dbg_cep(cep, "id 0x%p, [QP %u]: exit\n", id,
+				    qp_id(qp));
+			siw_cep_set_free(cep);
+			return 0;
+		}
+	}
+error:
+	/* The QP lookup itself may have failed: no QP to refer to then */
+	if (qp)
+		siw_dbg_qp(qp, "failed: %d\n", rv);
+	else
+		siw_dbg(id->device, "failed: %d\n", rv);
+
+	if (cep) {
+		/*
+		 * NOTE(review): this path calls siw_cep_put() three times
+		 * and writes to cep in between, while only one
+		 * siw_cep_get() is visible in this function. Confirm the
+		 * reference balance against siw_cep_alloc(),
+		 * siw_cep_socket_assoc() and siw_socket_disassoc().
+		 */
+		siw_socket_disassoc(s);
+		sock_release(s);
+		cep->sock = NULL;
+
+		cep->qp = NULL;
+
+		cep->cm_id = NULL;
+		id->rem_ref(id);
+		siw_cep_put(cep);
+
+		qp->cep = NULL;
+		siw_cep_put(cep);
+
+		cep->state = SIW_EPSTATE_CLOSED;
+
+		siw_cep_set_free(cep);
+
+		siw_cep_put(cep);
+
+	} else if (s) {
+		sock_release(s);
+	}
+	/* Drop the reference of the QP lookup, if the lookup succeeded */
+	if (qp)
+		siw_qp_put(qp);
+
+	return rv;
+}
+
+/*
+ * siw_accept - Let SoftiWARP accept an RDMA connection request
+ *
+ * @id: New connection management id to be used for accepted
+ * connection request
+ * @params: Connection parameters provided by ULP for accepting connection
+ *
+ * Transition QP to RTS state, associate new CM id @id with accepted CEP
+ * and get prepared for TCP input by installing socket callbacks.
+ * Then send MPA Reply and generate the "connection established" event.
+ * Socket callbacks must be installed before sending MPA Reply, because
+ * the latter may cause a first RDMA message to arrive from the RDMA Initiator
+ * side very quickly, at which time the socket callbacks must be ready.
+ *
+ * Returns 0 on success. Returns -ECONNRESET if the CEP is not in state
+ * SIW_EPSTATE_RECVD_MPAREQ, -EINVAL or -ENOMEM if QP, IRD/ORD or private
+ * data length are not acceptable, or the error of a failing QP
+ * modification, MPA reply transmission or IWCM upcall. Except for the
+ * two early returns, a failure closes the socket and detaches QP and
+ * @id from the CEP.
+ */
+int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
+{
+ struct siw_device *sdev = to_siw_dev(id->device);
+ struct siw_cep *cep = (struct siw_cep *)id->provider_data;
+ struct siw_qp *qp;
+ struct siw_qp_attrs qp_attrs;
+ int rv, max_priv_data = MPA_MAX_PRIVDATA;
+ bool wait_for_peer_rts = false;
+
+ siw_cep_set_inuse(cep);
+ /*
+ * Presumably drops the reference siw_proc_mpareq() keeps
+ * "until IWCM accepts/rejects" - confirm.
+ */
+ siw_cep_put(cep);
+
+ /* Free lingering inbound private data */
+ if (cep->mpa.hdr.params.pd_len) {
+ cep->mpa.hdr.params.pd_len = 0;
+ kfree(cep->mpa.pdata);
+ cep->mpa.pdata = NULL;
+ }
+ siw_cancel_mpatimer(cep);
+
+ if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
+ siw_dbg_cep(cep, "id 0x%p: out of state\n", id);
+
+ siw_cep_set_free(cep);
+ siw_cep_put(cep);
+
+ return -ECONNRESET;
+ }
+ qp = siw_qp_id2obj(sdev, params->qpn);
+ if (!qp) {
+ WARN(1, "[QP %d] does not exist\n", params->qpn);
+ siw_cep_set_free(cep);
+ siw_cep_put(cep);
+
+ return -EINVAL;
+ }
+ /* QP state lock is held until siw_qp_modify() below is done */
+ down_write(&qp->state_lock);
+ if (qp->attrs.state > SIW_QP_STATE_RTR) {
+ rv = -EINVAL;
+ up_write(&qp->state_lock);
+ goto error;
+ }
+ siw_dbg_cep(cep, "id 0x%p\n", id);
+
+ if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) {
+ siw_dbg_cep(cep, "peer allows GSO on TX\n");
+ qp->tx_ctx.gso_seg_limit = 0;
+ }
+ if (params->ord > sdev->attrs.max_ord ||
+ params->ird > sdev->attrs.max_ird) {
+ siw_dbg_cep(
+ cep,
+ "id 0x%p, [QP %u]: ord %d (max %d), ird %d (max %d)\n",
+ id, qp_id(qp), params->ord, sdev->attrs.max_ord,
+ params->ird, sdev->attrs.max_ird);
+ rv = -EINVAL;
+ up_write(&qp->state_lock);
+ goto error;
+ }
+ /* MPA v2 control data shares the private data space of the reply */
+ if (cep->enhanced_rdma_conn_est)
+ max_priv_data -= sizeof(struct mpa_v2_data);
+
+ if (params->private_data_len > max_priv_data) {
+ siw_dbg_cep(
+ cep,
+ "id 0x%p, [QP %u]: private data length: %d (max %d)\n",
+ id, qp_id(qp), params->private_data_len, max_priv_data);
+ rv = -EINVAL;
+ up_write(&qp->state_lock);
+ goto error;
+ }
+ if (cep->enhanced_rdma_conn_est) {
+ /*
+ * cep->ord and cep->ird hold the values derived from the
+ * peer's MPA request. Check the ULP's values against them.
+ */
+ if (params->ord > cep->ord) {
+ if (relaxed_ird_negotiation) {
+ params->ord = cep->ord;
+ } else {
+ cep->ird = params->ird;
+ cep->ord = params->ord;
+ rv = -EINVAL;
+ up_write(&qp->state_lock);
+ goto error;
+ }
+ }
+ if (params->ird < cep->ird) {
+ if (relaxed_ird_negotiation &&
+ cep->ird <= sdev->attrs.max_ird)
+ params->ird = cep->ird;
+ else {
+ rv = -ENOMEM;
+ up_write(&qp->state_lock);
+ goto error;
+ }
+ }
+ /* An RTR mode was selected while processing the MPA request */
+ if (cep->mpa.v2_ctrl.ord &
+ (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR))
+ wait_for_peer_rts = true;
+ /*
+ * Signal back negotiated IRD and ORD values
+ */
+ cep->mpa.v2_ctrl.ord =
+ htons(params->ord & MPA_IRD_ORD_MASK) |
+ (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD);
+ cep->mpa.v2_ctrl.ird =
+ htons(params->ird & MPA_IRD_ORD_MASK) |
+ (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD);
+ }
+ cep->ird = params->ird;
+ cep->ord = params->ord;
+
+ cep->cm_id = id;
+ id->add_ref(id);
+
+ memset(&qp_attrs, 0, sizeof(qp_attrs));
+ qp_attrs.orq_size = cep->ord;
+ qp_attrs.irq_size = cep->ird;
+ qp_attrs.sk = cep->sock;
+ if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC)
+ qp_attrs.flags = SIW_MPA_CRC;
+ qp_attrs.state = SIW_QP_STATE_RTS;
+
+ siw_dbg_cep(cep, "id 0x%p, [QP%u]: moving to rts\n", id, qp_id(qp));
+
+ /* Associate QP with CEP */
+ siw_cep_get(cep);
+ qp->cep = cep;
+
+ /* siw_qp_get(qp) already done by QP lookup */
+ cep->qp = qp;
+
+ cep->state = SIW_EPSTATE_RDMA_MODE;
+
+ /* Move socket RX/TX under QP control */
+ rv = siw_qp_modify(qp, &qp_attrs,
+ SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
+ SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD |
+ SIW_QP_ATTR_MPA);
+ up_write(&qp->state_lock);
+
+ if (rv)
+ goto error;
+
+ siw_dbg_cep(cep, "id 0x%p, [QP %u]: send mpa reply, %d byte pdata\n",
+ id, qp_id(qp), params->private_data_len);
+
+ rv = siw_send_mpareqrep(cep, params->private_data,
+ params->private_data_len);
+ if (rv != 0)
+ goto error;
+
+ /*
+ * With an RTR mode negotiated, IW_CM_EVENT_ESTABLISHED is not
+ * signalled from here.
+ */
+ if (wait_for_peer_rts) {
+ siw_sk_assign_rtr_upcalls(cep);
+ } else {
+ siw_qp_socket_assoc(cep, qp);
+ rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
+ if (rv)
+ goto error;
+ }
+ siw_cep_set_free(cep);
+
+ return 0;
+error:
+ /* All jumps to here have released qp->state_lock already */
+ siw_socket_disassoc(cep->sock);
+ sock_release(cep->sock);
+ cep->sock = NULL;
+
+ cep->state = SIW_EPSTATE_CLOSED;
+
+ /* Undo only those associations which were established already */
+ if (cep->cm_id) {
+ cep->cm_id->rem_ref(id);
+ cep->cm_id = NULL;
+ }
+ if (qp->cep) {
+ siw_cep_put(cep);
+ qp->cep = NULL;
+ }
+ cep->qp = NULL;
+ /* Drop the QP reference of the lookup above */
+ siw_qp_put(qp);
+
+ siw_cep_set_free(cep);
+ siw_cep_put(cep);
+
+ return rv;
+}
+
+/*
+ * siw_reject()
+ *
+ * Local connection reject case. Send private data back to peer,
+ * close connection and dereference connection id.
+ *
+ * @id:		connection management id, id->provider_data is the CEP
+ * @pdata:	private data to send along with the reject
+ * @pd_len:	length of @pdata in bytes
+ *
+ * An MPA reply with the reject flag set is sent only if the peer's
+ * MPA revision is at least MPA_REVISION_1.
+ *
+ * Returns 0, or -ECONNRESET if the CEP is not waiting for the ULP's
+ * decision (state SIW_EPSTATE_RECVD_MPAREQ) anymore.
+ */
+int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len)
+{
+	struct siw_cep *cep = (struct siw_cep *)id->provider_data;
+	int rv = 0;
+
+	siw_cep_set_inuse(cep);
+	siw_cep_put(cep);
+
+	siw_cancel_mpatimer(cep);
+
+	if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
+		siw_dbg_cep(cep, "id 0x%p: out of state\n", id);
+		rv = -ECONNRESET;
+		goto out;
+	}
+	siw_dbg_cep(cep, "id 0x%p, cep->state %d, pd_len %d\n", id, cep->state,
+		    pd_len);
+
+	if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) {
+		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
+		siw_send_mpareqrep(cep, pdata, pd_len);
+	}
+	siw_socket_disassoc(cep->sock);
+	sock_release(cep->sock);
+	cep->sock = NULL;
+
+	cep->state = SIW_EPSTATE_CLOSED;
+out:
+	siw_cep_set_free(cep);
+	siw_cep_put(cep); /* put last reference */
+
+	return rv;
+}
+
+/*
+ * siw_listen_address - create one listening endpoint for IWCM id @id
+ *
+ * @id:          listener's IWCM id
+ * @backlog:     handed to siw_cm_alloc_work() and to the socket's listen()
+ * @laddr:       local address to bind the new socket to
+ * @addr_family: AF_INET selects sizeof(struct sockaddr_in) for bind(),
+ *               anything else sizeof(struct sockaddr_in6)
+ *
+ * Creates a TCP socket, sets SO_REUSEADDR, binds it to @laddr, associates
+ * it with a newly allocated CEP and starts listening. On success the CEP
+ * holds a reference on @id (taken via id->add_ref()) and is appended to
+ * the list anchored at id->provider_data, which is allocated on first use.
+ *
+ * May sleep (socket creation, GFP_KERNEL allocation).
+ *
+ * Returns 0 on success or a negative errno. On failure the socket is
+ * released and the CEP, if already allocated, is closed and put.
+ */
+static int siw_listen_address(struct iw_cm_id *id, int backlog,
+			      struct sockaddr *laddr, int addr_family)
+{
+	struct socket *s;
+	struct siw_cep *cep = NULL;
+	struct siw_device *sdev = to_siw_dev(id->device);
+	int rv = 0, s_val;
+
+	rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
+	if (rv < 0)
+		return rv;
+
+	/*
+	 * Allow binding local port when still in TIME_WAIT from last close.
+	 */
+	s_val = 1;
+	rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val,
+			       sizeof(s_val));
+	if (rv) {
+		siw_dbg(id->device, "id 0x%p: setsockopt error: %d\n", id, rv);
+		goto error;
+	}
+	rv = s->ops->bind(s, laddr, addr_family == AF_INET ?
+				    sizeof(struct sockaddr_in) :
+				    sizeof(struct sockaddr_in6));
+	if (rv) {
+		siw_dbg(id->device, "id 0x%p: socket bind error: %d\n", id, rv);
+		goto error;
+	}
+	cep = siw_cep_alloc(sdev);
+	if (!cep) {
+		rv = -ENOMEM;
+		goto error;
+	}
+	siw_cep_socket_assoc(cep, s);
+
+	rv = siw_cm_alloc_work(cep, backlog);
+	if (rv) {
+		siw_dbg(id->device,
+			"id 0x%p: alloc_work error %d, backlog %d\n", id,
+			rv, backlog);
+		goto error;
+	}
+	rv = s->ops->listen(s, backlog);
+	if (rv) {
+		siw_dbg(id->device, "id 0x%p: listen error %d\n", id, rv);
+		goto error;
+	}
+	cep->cm_id = id;
+	id->add_ref(id);
+
+	/*
+	 * In case of a wildcard rdma_listen on a multi-homed device,
+	 * a listener's IWCM id is associated with more than one listening CEP.
+	 *
+	 * We currently use id->provider_data in three different ways:
+	 *
+	 * o For a listener's IWCM id, id->provider_data points to
+	 *   the list_head of the list of listening CEPs.
+	 *   Uses: siw_create_listen(), siw_destroy_listen()
+	 *
+	 * o For each accepted passive-side IWCM id, id->provider_data
+	 *   points to the CEP itself. This is a consequence of
+	 *   - siw_cm_upcall() setting event.provider_data = cep and
+	 *   - the IWCM's cm_conn_req_handler() setting provider_data of the
+	 *     new passive-side IWCM id equal to event.provider_data
+	 *   Uses: siw_accept(), siw_reject()
+	 *
+	 * o For an active-side IWCM id, id->provider_data is not used at all.
+	 *
+	 */
+	if (!id->provider_data) {
+		id->provider_data =
+			kmalloc(sizeof(struct list_head), GFP_KERNEL);
+		if (!id->provider_data) {
+			rv = -ENOMEM;
+			goto error;
+		}
+		INIT_LIST_HEAD((struct list_head *)id->provider_data);
+	}
+	list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
+	cep->state = SIW_EPSTATE_LISTENING;
+
+	/*
+	 * Ports are kept in network byte order inside the sockaddr;
+	 * convert for printing, as siw_create_listen() does.
+	 */
+	if (addr_family == AF_INET)
+		siw_dbg(id->device, "Listen at laddr %pI4 %u\n",
+			&(((struct sockaddr_in *)laddr)->sin_addr),
+			ntohs(((struct sockaddr_in *)laddr)->sin_port));
+	else
+		siw_dbg(id->device, "Listen at laddr %pI6 %u\n",
+			&(((struct sockaddr_in6 *)laddr)->sin6_addr),
+			ntohs(((struct sockaddr_in6 *)laddr)->sin6_port));
+
+	return 0;
+
+error:
+	siw_dbg(id->device, "failed: %d\n", rv);
+
+	if (cep) {
+		siw_cep_set_inuse(cep);
+
+		/* Undo the id reference taken after a successful listen() */
+		if (cep->cm_id) {
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+		}
+		cep->sock = NULL;
+		siw_socket_disassoc(s);
+		cep->state = SIW_EPSTATE_CLOSED;
+
+		siw_cep_set_free(cep);
+		siw_cep_put(cep);
+	}
+	sock_release(s);
+
+	return rv;
+}
+
+/*
+ * Tear down every listening CEP linked to @id: unlink it from the list
+ * at id->provider_data, drop its reference on the IWCM id, release its
+ * socket, mark it closed and put it.
+ */
+static void siw_drop_listeners(struct iw_cm_id *id)
+{
+	struct list_head *listeners = (struct list_head *)id->provider_data;
+	struct siw_cep *cep, *next;
+
+	/*
+	 * In case of a wildcard rdma_listen on a multi-homed device,
+	 * a listener's IWCM id is associated with more than one listening CEP.
+	 */
+	list_for_each_entry_safe(cep, next, listeners, listenq) {
+		list_del(&cep->listenq);
+
+		siw_dbg_cep(cep, "id 0x%p: drop cep, state %d\n", id,
+			    cep->state);
+
+		siw_cep_set_inuse(cep);
+
+		if (cep->cm_id) {
+			cep->cm_id->rem_ref(cep->cm_id);
+			cep->cm_id = NULL;
+		}
+		if (cep->sock) {
+			siw_socket_disassoc(cep->sock);
+			sock_release(cep->sock);
+			cep->sock = NULL;
+		}
+		cep->state = SIW_EPSTATE_CLOSED;
+		siw_cep_set_free(cep);
+		siw_cep_put(cep);
+	}
+}
+
+/*
+ * siw_create_listen - Create resources for a listener's IWCM ID @id
+ *
+ * @id:      listener's IWCM id; id->local_addr selects family, address
+ *           and port to listen on
+ * @backlog: passed through to siw_listen_address()
+ *
+ * If the listener's @id provides a specific local IP address, at most one
+ * listening socket is created and associated with @id.
+ *
+ * If the listener's @id provides the wildcard (zero) local IP address,
+ * a separate listen is performed for each local IP address of the device
+ * by creating a listening socket and binding to that local IP address.
+ *
+ * Both address lists are walked under rtnl_lock(): siw_listen_address()
+ * may sleep (socket creation, GFP_KERNEL allocation), so a BH-disabling
+ * spinning lock must not be held around it.
+ *
+ * Returns 0 if at least one listener was created, -EAFNOSUPPORT for an
+ * unsupported address family, -ENODEV if the net_device has no IPv4/IPv6
+ * configuration object, -EINVAL if no address matched, or the error of
+ * the last failed siw_listen_address() call.
+ */
+int siw_create_listen(struct iw_cm_id *id, int backlog)
+{
+	struct net_device *dev = to_siw_dev(id->device)->netdev;
+	int rv = 0, listeners = 0;
+
+	siw_dbg(id->device, "id 0x%p: backlog %d\n", id, backlog);
+
+	/*
+	 * For each attached address of the interface, create a
+	 * listening socket, if id->local_addr is the wildcard
+	 * IP address or matches the IP address.
+	 */
+	if (id->local_addr.ss_family == AF_INET) {
+		struct in_device *in_dev = in_dev_get(dev);
+		struct sockaddr_in s_laddr, *s_raddr;
+		const struct in_ifaddr *ifa;
+
+		/* in_dev_get() returns NULL if no IPv4 state is attached */
+		if (!in_dev) {
+			rv = -ENODEV;
+			goto out;
+		}
+		memcpy(&s_laddr, &id->local_addr, sizeof(s_laddr));
+		s_raddr = (struct sockaddr_in *)&id->remote_addr;
+
+		siw_dbg(id->device,
+			"id 0x%p: laddr %pI4:%d, raddr %pI4:%d\n",
+			id, &s_laddr.sin_addr, ntohs(s_laddr.sin_port),
+			&s_raddr->sin_addr, ntohs(s_raddr->sin_port));
+
+		rtnl_lock();
+		in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+			if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) ||
+			    s_laddr.sin_addr.s_addr == ifa->ifa_address) {
+				s_laddr.sin_addr.s_addr = ifa->ifa_address;
+
+				rv = siw_listen_address(id, backlog,
+						(struct sockaddr *)&s_laddr,
+						AF_INET);
+				if (!rv)
+					listeners++;
+			}
+		}
+		rtnl_unlock();
+		in_dev_put(in_dev);
+	} else if (id->local_addr.ss_family == AF_INET6) {
+		struct inet6_dev *in6_dev = in6_dev_get(dev);
+		struct inet6_ifaddr *ifp;
+		struct sockaddr_in6 *s_laddr = &to_sockaddr_in6(id->local_addr),
+			*s_raddr = &to_sockaddr_in6(id->remote_addr);
+
+		/* in6_dev_get() returns NULL if no IPv6 state is attached */
+		if (!in6_dev) {
+			rv = -ENODEV;
+			goto out;
+		}
+		siw_dbg(id->device,
+			"id 0x%p: laddr %pI6:%d, raddr %pI6:%d\n",
+			id, &s_laddr->sin6_addr, ntohs(s_laddr->sin6_port),
+			&s_raddr->sin6_addr, ntohs(s_raddr->sin6_port));
+
+		/*
+		 * siw_listen_address() sleeps, so the address list must
+		 * not be walked under read_lock_bh(); use the RTNL mutex,
+		 * matching the IPv4 branch.
+		 */
+		rtnl_lock();
+		list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+			struct sockaddr_in6 bind_addr;
+
+			if (ipv6_addr_any(&s_laddr->sin6_addr) ||
+			    ipv6_addr_equal(&s_laddr->sin6_addr, &ifp->addr)) {
+				bind_addr.sin6_family = AF_INET6;
+				bind_addr.sin6_port = s_laddr->sin6_port;
+				bind_addr.sin6_flowinfo = 0;
+				bind_addr.sin6_addr = ifp->addr;
+				bind_addr.sin6_scope_id = dev->ifindex;
+
+				rv = siw_listen_address(id, backlog,
+						(struct sockaddr *)&bind_addr,
+						AF_INET6);
+				if (!rv)
+					listeners++;
+			}
+		}
+		rtnl_unlock();
+
+		in6_dev_put(in6_dev);
+	} else {
+		return -EAFNOSUPPORT;
+	}
+	/* One working listener is enough; otherwise report why none exists */
+	if (listeners)
+		rv = 0;
+	else if (!rv)
+		rv = -EINVAL;
+
+out:
+	siw_dbg(id->device, "id 0x%p: %s\n", id, rv ? "FAIL" : "OK");
+
+	return rv;
+}
+
+/*
+ * Drop all listening CEPs attached to @id and free the list head kept in
+ * id->provider_data. A listener without provider_data is a no-op.
+ * Always returns 0.
+ */
+int siw_destroy_listen(struct iw_cm_id *id)
+{
+	siw_dbg(id->device, "id 0x%p\n", id);
+
+	if (id->provider_data) {
+		siw_drop_listeners(id);
+		kfree(id->provider_data);
+		id->provider_data = NULL;
+	} else {
+		siw_dbg(id->device, "id 0x%p: no cep(s)\n", id);
+	}
+	return 0;
+}
+
+/*
+ * Set up the connection manager workqueue.
+ * Returns 0 on success, -ENOMEM if the workqueue cannot be created.
+ */
+int siw_cm_init(void)
+{
+	/* Single-threaded workqueue: strict ordering of work items */
+	siw_cm_wq = create_singlethread_workqueue("siw_cm_wq");
+
+	return siw_cm_wq ? 0 : -ENOMEM;
+}
+
+/*
+ * Flush and destroy the connection manager workqueue, if one exists.
+ */
+void siw_cm_exit(void)
+{
+	if (!siw_cm_wq)
+		return;
+
+	flush_workqueue(siw_cm_wq);
+	destroy_workqueue(siw_cm_wq);
+}
diff --git a/drivers/infiniband/sw/siw/siw_cm.h b/drivers/infiniband/sw/siw/siw_cm.h
new file mode 100644
index 000000000000..8c59cb3e2868
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_cm.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Greg Joyce <greg@opengridcomputing.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+/* Copyright (c) 2017, Open Grid Computing, Inc. */
+
+#ifndef _SIW_CM_H
+#define _SIW_CM_H
+
+#include <net/sock.h>
+#include <linux/tcp.h>
+
+#include <rdma/iw_cm.h>
+
+enum siw_cep_state { /* connection endpoint (CEP) state, stored in siw_cep.state */
+ SIW_EPSTATE_IDLE = 1,
+ SIW_EPSTATE_LISTENING, /* set by siw_listen_address() after listen() succeeded */
+ SIW_EPSTATE_CONNECTING,
+ SIW_EPSTATE_AWAIT_MPAREQ,
+ SIW_EPSTATE_RECVD_MPAREQ, /* the only state in which siw_reject() proceeds */
+ SIW_EPSTATE_AWAIT_MPAREP,
+ SIW_EPSTATE_RDMA_MODE,
+ SIW_EPSTATE_CLOSED /* set when the reject / listener teardown paths are done with the CEP */
+};
+
+struct siw_mpa_info { /* MPA state kept per endpoint, embedded as siw_cep.mpa */
+ struct mpa_rr hdr; /* peer mpa hdr in host byte order */
+ struct mpa_v2_data v2_ctrl;
+ struct mpa_v2_data v2_ctrl_req;
+ char *pdata; /* NOTE(review): presumably received private data - confirm ownership */
+ int bytes_rcvd; /* NOTE(review): presumably MPA bytes received so far - confirm */
+};
+
+struct siw_device;
+
+struct siw_cep { /* connection endpoint: ties an IWCM id to a socket and, possibly, a QP */
+ struct iw_cm_id *cm_id; /* IWCM id; listen code pairs it with add_ref()/rem_ref() */
+ struct siw_device *sdev;
+ struct list_head devq;
+ spinlock_t lock;
+ struct kref ref;
+ int in_use; /* NOTE(review): presumably toggled by siw_cep_set_inuse()/siw_cep_set_free() - confirm */
+ wait_queue_head_t waitq;
+ enum siw_cep_state state;
+
+ struct list_head listenq; /* links listening CEPs into the list at id->provider_data */
+ struct siw_cep *listen_cep;
+
+ struct siw_qp *qp;
+ struct socket *sock; /* associated socket; set to NULL once released */
+
+ struct siw_cm_work *mpa_timer;
+ struct list_head work_freelist;
+
+ struct siw_mpa_info mpa;
+ int ord;
+ int ird;
+ bool enhanced_rdma_conn_est;
+
+ /* Saved upcalls of socket */
+ void (*sk_state_change)(struct sock *sk);
+ void (*sk_data_ready)(struct sock *sk);
+ void (*sk_write_space)(struct sock *sk);
+ void (*sk_error_report)(struct sock *sk);
+};
+
+/*
+ * Connection initiator waits 10 seconds to receive an
+ * MPA reply after sending out MPA request. Responder waits for
+ * 5 seconds for MPA request to arrive if new TCP connection
+ * was set up.
+ */
+#define MPAREQ_TIMEOUT (HZ * 10)
+#define MPAREP_TIMEOUT (HZ * 5)
+
+enum siw_work_type { /* kind of CM work item, stored in siw_cm_work.type */
+ SIW_CM_WORK_ACCEPT = 1,
+ SIW_CM_WORK_READ_MPAHDR,
+ SIW_CM_WORK_CLOSE_LLP, /* close socket */
+ SIW_CM_WORK_PEER_CLOSE, /* socket indicated peer close */
+ SIW_CM_WORK_MPATIMEOUT /* NOTE(review): presumably fired after MPAREQ_TIMEOUT/MPAREP_TIMEOUT - confirm */
+};
+
+struct siw_cm_work { /* one connection management work item */
+ struct delayed_work work;
+ struct list_head list; /* NOTE(review): presumably links into siw_cep.work_freelist - confirm */
+ enum siw_work_type type;
+ struct siw_cep *cep; /* endpoint this work item belongs to */
+};
+
+#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a))) /* view lvalue a as struct sockaddr_in; result is an lvalue */
+#define to_sockaddr_in6(a) (*(struct sockaddr_in6 *)(&(a))) /* same, as struct sockaddr_in6 */
+
+/* Query the peer address of socket @s into @a (getname() with peer = 1). */
+static inline int getname_peer(struct socket *s, struct sockaddr_storage *a)
+{
+	struct sockaddr *addr = (struct sockaddr *)a;
+
+	return s->ops->getname(s, addr, 1);
+}
+
+/* Query the local address of socket @s into @a (getname() with peer = 0). */
+static inline int getname_local(struct socket *s, struct sockaddr_storage *a)
+{
+	struct sockaddr *addr = (struct sockaddr *)a;
+
+	return s->ops->getname(s, addr, 0);
+}
+
+/*
+ * Receive up to @size bytes from @sock into the kernel buffer @buf.
+ * @flags is used both as msg_flags and as the kernel_recvmsg() flags.
+ * Returns whatever kernel_recvmsg() returns.
+ */
+static inline int ksock_recv(struct socket *sock, char *buf, size_t size,
+			     int flags)
+{
+	struct kvec vec = { .iov_base = buf, .iov_len = size };
+	struct msghdr hdr = { .msg_name = NULL, .msg_flags = flags };
+
+	return kernel_recvmsg(sock, &hdr, &vec, 1, size, flags);
+}
+
+int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *parm);
+int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param);
+int siw_reject(struct iw_cm_id *id, const void *data, u8 len);
+int siw_create_listen(struct iw_cm_id *id, int backlog);
+int siw_destroy_listen(struct iw_cm_id *id);
+
+void siw_cep_get(struct siw_cep *cep);
+void siw_cep_put(struct siw_cep *cep);
+int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type);
+
+int siw_cm_init(void);
+void siw_cm_exit(void);
+
+/*
+ * TCP socket interface
+ *
+ * Both macros interpret sk->sk_user_data as a struct siw_cep pointer.
+ * sk_to_qp() dereferences it without a NULL check, so callers must
+ * make sure sk_user_data is set.
+ */
+#define sk_to_qp(sk) (((struct siw_cep *)((sk)->sk_user_data))->qp)
+#define sk_to_cep(sk) ((struct siw_cep *)((sk)->sk_user_data))
+
+#endif
diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c
new file mode 100644
index 000000000000..e381ae9b7d62
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_cq.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "siw.h"
+
+static int map_wc_opcode[SIW_NUM_OPCODES] = { /* siw opcode (index) -> ib_wc opcode; read by siw_reap_cqe() */
+ [SIW_OP_WRITE] = IB_WC_RDMA_WRITE,
+ [SIW_OP_SEND] = IB_WC_SEND,
+ [SIW_OP_SEND_WITH_IMM] = IB_WC_SEND,
+ [SIW_OP_READ] = IB_WC_RDMA_READ,
+ [SIW_OP_READ_LOCAL_INV] = IB_WC_RDMA_READ,
+ [SIW_OP_COMP_AND_SWAP] = IB_WC_COMP_SWAP,
+ [SIW_OP_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
+ [SIW_OP_INVAL_STAG] = IB_WC_LOCAL_INV,
+ [SIW_OP_REG_MR] = IB_WC_REG_MR,
+ [SIW_OP_RECEIVE] = IB_WC_RECV,
+ [SIW_OP_READ_RESPONSE] = -1 /* not used */
+};
+
+static struct { /* siw -> ib completion status; siw_reap_cqe() indexes this directly by cqe->status */
+ enum siw_wc_status siw;
+ enum ib_wc_status ib;
+} map_cqe_status[SIW_NUM_WC_STATUS] = { /* NOTE(review): entry order presumably has to follow enum siw_wc_status values - confirm */
+ { SIW_WC_SUCCESS, IB_WC_SUCCESS },
+ { SIW_WC_LOC_LEN_ERR, IB_WC_LOC_LEN_ERR },
+ { SIW_WC_LOC_PROT_ERR, IB_WC_LOC_PROT_ERR },
+ { SIW_WC_LOC_QP_OP_ERR, IB_WC_LOC_QP_OP_ERR },
+ { SIW_WC_WR_FLUSH_ERR, IB_WC_WR_FLUSH_ERR },
+ { SIW_WC_BAD_RESP_ERR, IB_WC_BAD_RESP_ERR },
+ { SIW_WC_LOC_ACCESS_ERR, IB_WC_LOC_ACCESS_ERR },
+ { SIW_WC_REM_ACCESS_ERR, IB_WC_REM_ACCESS_ERR },
+ { SIW_WC_REM_INV_REQ_ERR, IB_WC_REM_INV_REQ_ERR },
+ { SIW_WC_GENERAL_ERR, IB_WC_GENERAL_ERR }
+};
+
+/*
+ * Reap one CQE from the CQ. Only used by kernel clients
+ * during CQ normal operation. Might be called during CQ
+ * flush for user mapped CQE array as well.
+ *
+ * @cq: completion queue to reap from
+ * @wc: work completion to fill; zeroed before use if a CQE is found
+ *
+ * Since user mapped CQE's may be reaped here, opcode and status are
+ * sanity checked before they are used as index into the global
+ * mapping tables. Out of range values are reported as a general
+ * error instead of reading beyond the tables.
+ *
+ * Returns 1 if a valid CQE was consumed, 0 if the CQ is empty.
+ */
+int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc)
+{
+	struct siw_cqe *cqe;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	cqe = &cq->queue[cq->cq_get % cq->num_cqe];
+	if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) {
+		/*
+		 * Snapshot both values once, so the range check and the
+		 * table lookup operate on the same data.
+		 */
+		unsigned int opcode = READ_ONCE(cqe->opcode);
+		unsigned int status = READ_ONCE(cqe->status);
+
+		if (opcode >= SIW_NUM_OPCODES) {
+			opcode = 0;
+			status = SIW_WC_GENERAL_ERR;
+		} else if (status >= SIW_NUM_WC_STATUS) {
+			status = SIW_WC_GENERAL_ERR;
+		}
+		memset(wc, 0, sizeof(*wc));
+		wc->wr_id = cqe->id;
+		wc->status = map_cqe_status[status].ib;
+		wc->opcode = map_wc_opcode[opcode];
+		wc->byte_len = cqe->bytes;
+
+		/*
+		 * During CQ flush, also user land CQE's may get
+		 * reaped here, which do not hold a QP reference
+		 * and do not qualify for memory extension verbs.
+		 */
+		if (likely(cq->kernel_verbs)) {
+			if (cqe->flags & SIW_WQE_REM_INVAL) {
+				wc->ex.invalidate_rkey = cqe->inval_stag;
+				wc->wc_flags = IB_WC_WITH_INVALIDATE;
+			}
+			wc->qp = cqe->base_qp;
+			siw_dbg_cq(cq, "idx %u, type %d, flags %2x, id 0x%p\n",
+				   cq->cq_get % cq->num_cqe, cqe->opcode,
+				   cqe->flags, (void *)cqe->id);
+		}
+		/* Hand the slot back to the producer and advance */
+		WRITE_ONCE(cqe->flags, 0);
+		cq->cq_get++;
+
+		spin_unlock_irqrestore(&cq->lock, flags);
+
+		return 1;
+	}
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	return 0;
+}
+
+/*
+ * siw_cq_flush()
+ *
+ * Flush all CQ elements: reap and discard CQE's until
+ * siw_reap_cqe() reports an empty CQ.
+ */
+void siw_cq_flush(struct siw_cq *cq)
+{
+	struct ib_wc discard;
+
+	for (;;) {
+		if (!siw_reap_cqe(cq, &discard))
+			break;
+	}
+}
diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
new file mode 100644
index 000000000000..f55c4e80aea4
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_main.c
@@ -0,0 +1,685 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <net/net_namespace.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_arp.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/rdma_netlink.h>
+#include <linux/kthread.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+
+MODULE_AUTHOR("Bernard Metzler");
+MODULE_DESCRIPTION("Software iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/* transmit from user buffer, if possible */
+const bool zcopy_tx = true;
+
+/* Restrict usage of GSO, if hardware peer iwarp is unable to process
+ * large packets. try_gso = true lets siw try to use local GSO,
+ * if peer agrees. Not using GSO severely limits siw maximum tx bandwidth.
+ */
+const bool try_gso;
+
+/* Attach siw also with loopback devices */
+const bool loopback_enabled = true;
+
+/* We try to negotiate CRC on, if true */
+const bool mpa_crc_required;
+
+/* MPA CRC on/off enforced */
+const bool mpa_crc_strict;
+
+/* Control TCP_NODELAY socket option */
+const bool siw_tcp_nagle;
+
+/* Select MPA version to be used during connection setup */
+u_char mpa_version = MPA_REVISION_2;
+
+/* Selects MPA P2P mode (additional handshake during connection
+ * setup), if true.
+ */
+const bool peer_to_peer;
+
+struct task_struct *siw_tx_thread[NR_CPUS];
+struct crypto_shash *siw_crypto_shash;
+
+/*
+ * Register @sdev with the RDMA core under @name. On success the device
+ * gets the next value of a function-local counter as vendor_part_id.
+ * Returns 0 or the error reported by ib_register_device().
+ */
+static int siw_device_register(struct siw_device *sdev, const char *name)
+{
+	static int dev_id = 1;
+	struct ib_device *base_dev = &sdev->base_dev;
+	int err = ib_register_device(base_dev, name);
+
+	if (err) {
+		pr_warn("siw: device registration error %d\n", err);
+		return err;
+	}
+	sdev->vendor_part_id = dev_id;
+	dev_id++;
+
+	siw_dbg(base_dev, "HWaddr=%pM\n", sdev->netdev->dev_addr);
+
+	return 0;
+}
+
+/* dealloc_driver callback: destroy the QP and memory object xarrays. */
+static void siw_device_cleanup(struct ib_device *base_dev)
+{
+	struct siw_device *siw_dev = to_siw_dev(base_dev);
+
+	xa_destroy(&siw_dev->qp_xa);
+	xa_destroy(&siw_dev->mem_xa);
+}
+
+/*
+ * Start one TX kthread per eligible online CPU, bound to that CPU.
+ * CPUs for which kthread creation fails are left without a thread.
+ * Returns the number of threads started.
+ */
+static int siw_create_tx_threads(void)
+{
+	int cpu, started = 0;
+
+	for_each_online_cpu(cpu) {
+		struct task_struct *t;
+
+		/* Skip HT cores */
+		if (cpu % cpumask_weight(topology_sibling_cpumask(cpu)))
+			continue;
+
+		t = kthread_create(siw_run_sq, (unsigned long *)(long)cpu,
+				   "siw_tx/%d", cpu);
+		if (IS_ERR(t)) {
+			siw_tx_thread[cpu] = NULL;
+			continue;
+		}
+		siw_tx_thread[cpu] = t;
+		kthread_bind(siw_tx_thread[cpu], cpu);
+
+		wake_up_process(siw_tx_thread[cpu]);
+		started++;
+	}
+	return started;
+}
+
+/*
+ * Tell whether siw may attach to @netdev.
+ * Returns 1 for Ethernet and IEEE 802 devices, and for loopback
+ * devices if loopback_enabled is set; 0 otherwise.
+ */
+static int siw_dev_qualified(struct net_device *netdev)
+{
+	/*
+	 * Additional hardware support can be added here
+	 * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see
+	 * <linux/if_arp.h> for type identifiers.
+	 */
+	switch (netdev->type) {
+	case ARPHRD_ETHER:
+	case ARPHRD_IEEE802:
+		return 1;
+	case ARPHRD_LOOPBACK:
+		return loopback_enabled ? 1 : 0;
+	default:
+		return 0;
+	}
+}
+
+static DEFINE_PER_CPU(atomic_t, siw_use_cnt); /* per-CPU use count: incremented in siw_get_tx_cpu(), decremented in siw_put_tx_cpu() */
+
+static struct {
+ struct cpumask **tx_valid_cpus; /* one cpumask per node, indexed by cpu_to_node(); set up in siw_init_cpulist() */
+ int num_nodes; /* number of entries in tx_valid_cpus[]; 0 if setup failed */
+} siw_cpu_info;
+
+/*
+ * Build the per-node CPU masks used to select a TX CPU.
+ *
+ * Clears siw_tx_thread[], allocates one cpumask per node id and sets
+ * each possible CPU in the mask of its node.
+ *
+ * The array is sized by nr_node_ids (highest possible node id + 1),
+ * since it is indexed by cpu_to_node(): node ids may be sparse, in
+ * which case the count of possible nodes is smaller than the largest id.
+ *
+ * Returns 0 on success. On -ENOMEM everything allocated so far is
+ * freed and siw_cpu_info is reset to the empty state.
+ */
+static int siw_init_cpulist(void)
+{
+	int i, num_nodes = nr_node_ids;
+
+	memset(siw_tx_thread, 0, sizeof(siw_tx_thread));
+
+	siw_cpu_info.num_nodes = num_nodes;
+
+	siw_cpu_info.tx_valid_cpus =
+		kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL);
+	if (!siw_cpu_info.tx_valid_cpus) {
+		siw_cpu_info.num_nodes = 0;
+		return -ENOMEM;
+	}
+	for (i = 0; i < siw_cpu_info.num_nodes; i++) {
+		/* kzalloc() already yields an empty mask */
+		siw_cpu_info.tx_valid_cpus[i] =
+			kzalloc(sizeof(struct cpumask), GFP_KERNEL);
+		if (!siw_cpu_info.tx_valid_cpus[i])
+			goto out_err;
+	}
+	for_each_possible_cpu(i)
+		cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]);
+
+	return 0;
+
+out_err:
+	siw_cpu_info.num_nodes = 0;
+	/* Entry i failed; free entries i - 1 down to and including 0 */
+	while (--i >= 0)
+		kfree(siw_cpu_info.tx_valid_cpus[i]);
+	kfree(siw_cpu_info.tx_valid_cpus);
+	siw_cpu_info.tx_valid_cpus = NULL;
+
+	return -ENOMEM;
+}
+
+/* Free all per-node CPU masks and the array holding them. */
+static void siw_destroy_cpulist(void)
+{
+	int node;
+
+	for (node = 0; node < siw_cpu_info.num_nodes; node++)
+		kfree(siw_cpu_info.tx_valid_cpus[node]);
+
+	kfree(siw_cpu_info.tx_valid_cpus);
+}
+
+/*
+ * Choose CPU with least number of active QP's from NUMA node of
+ * TX interface.
+ *
+ * Candidates are taken from siw_cpu_info.tx_valid_cpus[node] if
+ * sdev->numa_node is not negative, else from cpu_online_mask. An empty
+ * node mask falls back to cpu_online_mask as well. CPUs without an
+ * entry in siw_tx_thread[] are skipped.
+ *
+ * Returns the selected CPU after incrementing its siw_use_cnt, or -1
+ * (after printing a warning) if no CPU qualified. A successful call is
+ * undone with siw_put_tx_cpu().
+ */
+int siw_get_tx_cpu(struct siw_device *sdev)
+{
+ const struct cpumask *tx_cpumask;
+ int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1;
+
+ if (node < 0)
+ tx_cpumask = cpu_online_mask;
+ else
+ tx_cpumask = siw_cpu_info.tx_valid_cpus[node];
+
+ num_cpus = cpumask_weight(tx_cpumask);
+ if (!num_cpus) {
+ /* no CPU on this NUMA node */
+ tx_cpumask = cpu_online_mask;
+ num_cpus = cpumask_weight(tx_cpumask);
+ }
+ if (!num_cpus)
+ goto out;
+
+ cpu = cpumask_first(tx_cpumask);
+
+ for (i = 0, min_use = SIW_MAX_QP; i < num_cpus;
+ i++, cpu = cpumask_next(cpu, tx_cpumask)) {
+ int usage;
+
+ /* Skip any cores which have no TX thread */
+ if (!siw_tx_thread[cpu])
+ continue;
+
+ usage = atomic_read(&per_cpu(siw_use_cnt, cpu));
+ if (usage <= min_use) { /* '<=': on equal usage the later CPU of the mask wins */
+ tx_cpu = cpu;
+ min_use = usage;
+ }
+ }
+ siw_dbg(&sdev->base_dev,
+ "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use);
+
+out:
+ if (tx_cpu >= 0)
+ atomic_inc(&per_cpu(siw_use_cnt, tx_cpu));
+ else
+ pr_warn("siw: no tx cpu found\n");
+
+ return tx_cpu;
+}
+
+/* Drop one use of TX CPU @cpu, as accounted by siw_get_tx_cpu(). */
+void siw_put_tx_cpu(int cpu)
+{
+	atomic_t *use_cnt = &per_cpu(siw_use_cnt, cpu);
+
+	atomic_dec(use_cnt);
+}
+
+/*
+ * iw_get_qp callback: translate QP number @id into its ib_qp.
+ * Returns NULL if no QP with that id exists. The reference taken by
+ * the lookup is dropped again before returning.
+ */
+static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
+{
+	struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id);
+
+	if (!qp)
+		return NULL;
+
+	/*
+	 * siw_qp_id2obj() increments object reference count
+	 */
+	siw_qp_put(qp);
+
+	return qp->ib_qp;
+}
+
+/* drain_sq callback: flush the send queue with the QP state lock held for writing. */
+static void siw_verbs_sq_flush(struct ib_qp *base_qp)
+{
+	struct siw_qp *sqp = to_siw_qp(base_qp);
+
+	down_write(&sqp->state_lock);
+	siw_sq_flush(sqp);
+	up_write(&sqp->state_lock);
+}
+
+/* drain_rq callback: flush the receive queue with the QP state lock held for writing. */
+static void siw_verbs_rq_flush(struct ib_qp *base_qp)
+{
+	struct siw_qp *sqp = to_siw_qp(base_qp);
+
+	down_write(&sqp->state_lock);
+	siw_rq_flush(sqp);
+	up_write(&sqp->state_lock);
+}
+
+static const struct ib_device_ops siw_device_ops = { /* verbs and iWarp CM entry points; installed by siw_device_create() via ib_set_device_ops() */
+ .owner = THIS_MODULE,
+ .uverbs_abi_ver = SIW_ABI_VERSION,
+ .driver_id = RDMA_DRIVER_SIW,
+
+ .alloc_mr = siw_alloc_mr,
+ .alloc_pd = siw_alloc_pd,
+ .alloc_ucontext = siw_alloc_ucontext,
+ .create_cq = siw_create_cq,
+ .create_qp = siw_create_qp,
+ .create_srq = siw_create_srq,
+ .dealloc_driver = siw_device_cleanup,
+ .dealloc_pd = siw_dealloc_pd,
+ .dealloc_ucontext = siw_dealloc_ucontext,
+ .dereg_mr = siw_dereg_mr,
+ .destroy_cq = siw_destroy_cq,
+ .destroy_qp = siw_destroy_qp,
+ .destroy_srq = siw_destroy_srq,
+ .drain_rq = siw_verbs_rq_flush,
+ .drain_sq = siw_verbs_sq_flush,
+ .get_dma_mr = siw_get_dma_mr,
+ .get_port_immutable = siw_get_port_immutable,
+ .iw_accept = siw_accept,
+ .iw_add_ref = siw_qp_get_ref,
+ .iw_connect = siw_connect,
+ .iw_create_listen = siw_create_listen,
+ .iw_destroy_listen = siw_destroy_listen,
+ .iw_get_qp = siw_get_base_qp,
+ .iw_reject = siw_reject,
+ .iw_rem_ref = siw_qp_put_ref,
+ .map_mr_sg = siw_map_mr_sg,
+ .mmap = siw_mmap,
+ .modify_qp = siw_verbs_modify_qp,
+ .modify_srq = siw_modify_srq,
+ .poll_cq = siw_poll_cq,
+ .post_recv = siw_post_receive,
+ .post_send = siw_post_send,
+ .post_srq_recv = siw_post_srq_recv,
+ .query_device = siw_query_device,
+ .query_gid = siw_query_gid,
+ .query_pkey = siw_query_pkey,
+ .query_port = siw_query_port,
+ .query_qp = siw_query_qp,
+ .query_srq = siw_query_srq,
+ .req_notify_cq = siw_req_notify_cq,
+ .reg_user_mr = siw_reg_user_mr,
+
+ INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq),
+ INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd),
+ INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext),
+};
+
+/*
+ * siw_device_create - allocate and initialize a siw device for @netdev
+ *
+ * Allocates the ib_device, derives the node GUID, sets the supported
+ * uverbs commands, installs siw_device_ops, binds @netdev as port 1 and
+ * initializes device limits, id xarrays, lists and counters. The device
+ * is not registered here; see siw_device_register().
+ *
+ * Returns the new device, or NULL if @netdev has no parent device and
+ * is not a loopback device, if allocation fails, or if binding the
+ * netdev fails.
+ */
+static struct siw_device *siw_device_create(struct net_device *netdev)
+{
+	struct siw_device *sdev = NULL;
+	struct ib_device *base_dev;
+	struct device *parent = netdev->dev.parent;
+	int rv;
+
+	if (!parent) {
+		/*
+		 * The loopback device has no parent device,
+		 * so it appears as a top-level device. To support
+		 * loopback device connectivity, take this device
+		 * as the parent device. Skip all other devices
+		 * w/o parent device.
+		 */
+		if (netdev->type != ARPHRD_LOOPBACK) {
+			pr_warn("siw: device %s error: no parent device\n",
+				netdev->name);
+			return NULL;
+		}
+		parent = &netdev->dev;
+	}
+	sdev = ib_alloc_device(siw_device, base_dev);
+	if (!sdev)
+		return NULL;
+
+	base_dev = &sdev->base_dev;
+
+	sdev->netdev = netdev;
+
+	if (netdev->type != ARPHRD_LOOPBACK) {
+		memcpy(&base_dev->node_guid, netdev->dev_addr, 6);
+	} else {
+		/*
+		 * The loopback device does not have a HW address,
+		 * but connection management lib expects gid != 0
+		 */
+		size_t gidlen = min_t(size_t, strlen(base_dev->name), 6);
+
+		memcpy(&base_dev->node_guid, base_dev->name, gidlen);
+	}
+	base_dev->uverbs_cmd_mask =
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+		(1ull << IB_USER_VERBS_CMD_REG_MR) |
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+		(1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+		(1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+		(1ull << IB_USER_VERBS_CMD_POST_SEND) |
+		(1ull << IB_USER_VERBS_CMD_POST_RECV) |
+		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+		(1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV) |
+		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+	base_dev->node_type = RDMA_NODE_RNIC;
+	memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON,
+	       sizeof(SIW_NODE_DESC_COMMON));
+
+	/*
+	 * Current model (one-to-one device association):
+	 * One Softiwarp device per net_device or, equivalently,
+	 * per physical port.
+	 */
+	base_dev->phys_port_cnt = 1;
+	base_dev->dev.parent = parent;
+	base_dev->dev.dma_ops = &dma_virt_ops;
+	base_dev->num_comp_vectors = num_possible_cpus();
+
+	ib_set_device_ops(base_dev, &siw_device_ops);
+	rv = ib_device_set_netdev(base_dev, netdev, 1);
+	if (rv)
+		goto error;
+
+	memcpy(base_dev->iw_ifname, netdev->name,
+	       sizeof(base_dev->iw_ifname));
+
+	/* Disable TCP port mapping */
+	base_dev->iw_driver_flags = IW_F_NO_PORT_MAP;
+
+	sdev->attrs.max_qp = SIW_MAX_QP;
+	sdev->attrs.max_qp_wr = SIW_MAX_QP_WR;
+	sdev->attrs.max_ord = SIW_MAX_ORD_QP;
+	sdev->attrs.max_ird = SIW_MAX_IRD_QP;
+	sdev->attrs.max_sge = SIW_MAX_SGE;
+	sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD;
+	sdev->attrs.max_cq = SIW_MAX_CQ;
+	sdev->attrs.max_cqe = SIW_MAX_CQE;
+	sdev->attrs.max_mr = SIW_MAX_MR;
+	sdev->attrs.max_pd = SIW_MAX_PD;
+	sdev->attrs.max_mw = SIW_MAX_MW;
+	sdev->attrs.max_fmr = SIW_MAX_FMR;
+	sdev->attrs.max_srq = SIW_MAX_SRQ;
+	sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
+	sdev->attrs.max_srq_sge = SIW_MAX_SGE;
+
+	/* Both id spaces start at 1 */
+	xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
+	xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);
+
+	INIT_LIST_HEAD(&sdev->cep_list);
+	INIT_LIST_HEAD(&sdev->qp_list);
+
+	atomic_set(&sdev->num_ctx, 0);
+	atomic_set(&sdev->num_srq, 0);
+	atomic_set(&sdev->num_qp, 0);
+	atomic_set(&sdev->num_cq, 0);
+	atomic_set(&sdev->num_mr, 0);
+	atomic_set(&sdev->num_pd, 0);
+
+	sdev->numa_node = dev_to_node(parent);
+	spin_lock_init(&sdev->lock);
+
+	return sdev;
+error:
+	ib_dealloc_device(base_dev);
+
+	return NULL;
+}
+
+/*
+ * Work handler run when the network link becomes unavailable:
+ * move every QP on the device's QP list to the error state, then
+ * drop the device reference the scheduling side took.
+ */
+static void siw_netdev_down(struct work_struct *work)
+{
+	struct siw_device *sdev =
+		container_of(work, struct siw_device, netdev_down);
+	struct siw_qp_attrs err_attrs;
+	struct siw_qp *qp, *next;
+
+	memset(&err_attrs, 0, sizeof(err_attrs));
+	err_attrs.state = SIW_QP_STATE_ERROR;
+
+	list_for_each_entry_safe(qp, next, &sdev->qp_list, devq) {
+		down_write(&qp->state_lock);
+		WARN_ON(siw_qp_modify(qp, &err_attrs, SIW_QP_ATTR_STATE));
+		up_write(&qp->state_lock);
+	}
+	ib_device_put(&sdev->base_dev);
+}
+
+/*
+ * Schedule siw_netdev_down() for @sdev. Nothing is scheduled if a
+ * device reference cannot be obtained; otherwise the reference is
+ * handed to the work item, which puts it when done.
+ */
+static void siw_device_goes_down(struct siw_device *sdev)
+{
+	if (!ib_device_try_get(&sdev->base_dev))
+		return;
+
+	INIT_WORK(&sdev->netdev_down, siw_netdev_down);
+	schedule_work(&sdev->netdev_down);
+}
+
+/*
+ * Netdevice notifier callback.
+ *
+ * Events for devices outside init_net, or for net devices without an
+ * attached siw device, are ignored. For all others the event is mapped
+ * to a port state change and/or an RDMA port event. Always returns
+ * NOTIFY_OK.
+ */
+static int siw_netdev_event(struct notifier_block *nb, unsigned long event,
+			    void *arg)
+{
+	struct net_device *ndev = netdev_notifier_info_to_dev(arg);
+	struct ib_device *ibdev;
+	struct siw_device *sdev;
+
+	dev_dbg(&ndev->dev, "siw: event %lu\n", event);
+
+	if (dev_net(ndev) != &init_net)
+		return NOTIFY_OK;
+
+	/* Takes a device reference, put again at the end */
+	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_SIW);
+	if (!ibdev)
+		return NOTIFY_OK;
+
+	sdev = to_siw_dev(ibdev);
+
+	switch (event) {
+	case NETDEV_UP:
+		sdev->state = IB_PORT_ACTIVE;
+		siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE);
+		break;
+
+	case NETDEV_GOING_DOWN:
+		siw_device_goes_down(sdev);
+		break;
+
+	case NETDEV_DOWN:
+		sdev->state = IB_PORT_DOWN;
+		siw_port_event(sdev, 1, IB_EVENT_PORT_ERR);
+		break;
+
+	case NETDEV_REGISTER:
+		/*
+		 * Device registration now handled only by
+		 * rdma netlink commands. So it shall be impossible
+		 * to end up here with a valid siw device.
+		 */
+		siw_dbg(ibdev, "unexpected NETDEV_REGISTER event\n");
+		break;
+
+	case NETDEV_UNREGISTER:
+		ib_unregister_device_queued(&sdev->base_dev);
+		break;
+
+	case NETDEV_CHANGEADDR:
+		siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE);
+		break;
+
+	default:
+		/*
+		 * Todo: NETDEV_CHANGEMTU and NETDEV_CHANGE are currently
+		 * not handled and end up here like any other event.
+		 */
+		break;
+	}
+	ib_device_put(&sdev->base_dev);
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block siw_netdev_nb = { /* registered in siw_init_module(), unregistered in siw_exit_module() */
+ .notifier_call = siw_netdev_event,
+};
+
+/*
+ * rdma link "newlink" handler: create and register a siw device named
+ * @basedev_name on top of @netdev.
+ *
+ * Returns 0 on success, -EINVAL if @netdev does not qualify, -EEXIST if
+ * a siw device is already attached to it, -ENOMEM if device creation
+ * fails, or the error from device registration.
+ */
+static int siw_newlink(const char *basedev_name, struct net_device *netdev)
+{
+	struct ib_device *existing;
+	struct siw_device *sdev;
+	int rv;
+
+	if (!siw_dev_qualified(netdev))
+		return -EINVAL;
+
+	existing = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
+	if (existing) {
+		ib_device_put(existing);
+		return -EEXIST;
+	}
+	sdev = siw_device_create(netdev);
+	if (!sdev)
+		return -ENOMEM;
+
+	dev_dbg(&netdev->dev, "siw: new device\n");
+
+	/* Initial port state follows the current link state */
+	if (netif_running(netdev) && netif_carrier_ok(netdev))
+		sdev->state = IB_PORT_ACTIVE;
+	else
+		sdev->state = IB_PORT_DOWN;
+
+	rv = siw_device_register(sdev, basedev_name);
+	if (rv)
+		ib_dealloc_device(&sdev->base_dev);
+
+	return rv;
+}
+
+static struct rdma_link_ops siw_link_ops = { /* registered in siw_init_module() via rdma_link_register() */
+ .type = "siw",
+ .newlink = siw_newlink,
+};
+
+/*
+ * siw_init_module - Initialize Softiwarp module and register with netdev
+ * subsystem.
+ *
+ * Sets up the per-node CPU list, the CM workqueue, the TX threads and
+ * the optional CRC32c transform, then registers the netdevice notifier
+ * and the rdma link ops.
+ *
+ * Returns 0 on success. On any failure all resources set up so far are
+ * released and a negative errno is returned; that includes the case
+ * that not a single TX thread could be started (-ENOMEM).
+ */
+static __init int siw_init_module(void)
+{
+	int rv;
+	int nr_cpu;
+
+	if (SENDPAGE_THRESH < SIW_MAX_INLINE) {
+		pr_info("siw: sendpage threshold too small: %u\n",
+			(int)SENDPAGE_THRESH);
+		rv = -EINVAL;
+		goto out_error;
+	}
+	rv = siw_init_cpulist();
+	if (rv)
+		goto out_error;
+
+	rv = siw_cm_init();
+	if (rv)
+		goto out_error;
+
+	if (!siw_create_tx_threads()) {
+		pr_info("siw: Could not start any TX thread\n");
+		/* rv is still 0 here; do not report success after cleanup */
+		rv = -ENOMEM;
+		goto out_error;
+	}
+	/*
+	 * Locate CRC32 algorithm. If unsuccessful, fail
+	 * loading siw only, if CRC is required.
+	 */
+	siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0);
+	if (IS_ERR(siw_crypto_shash)) {
+		pr_info("siw: Loading CRC32c failed: %ld\n",
+			PTR_ERR(siw_crypto_shash));
+		siw_crypto_shash = NULL;
+		if (mpa_crc_required) {
+			rv = -EOPNOTSUPP;
+			goto out_error;
+		}
+	}
+	rv = register_netdevice_notifier(&siw_netdev_nb);
+	if (rv)
+		goto out_error;
+
+	rdma_link_register(&siw_link_ops);
+
+	pr_info("SoftiWARP attached\n");
+	return 0;
+
+out_error:
+	for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) {
+		if (siw_tx_thread[nr_cpu]) {
+			siw_stop_tx_thread(nr_cpu);
+			siw_tx_thread[nr_cpu] = NULL;
+		}
+	}
+	if (siw_crypto_shash)
+		crypto_free_shash(siw_crypto_shash);
+
+	pr_info("SoftIWARP attach failed. Error: %d\n", rv);
+
+	siw_cm_exit();
+	siw_destroy_cpulist();
+
+	return rv;
+}
+
+/*
+ * Module unload: stop all TX threads, unregister from netdev and rdma
+ * link handling, unregister all siw devices, then release the CM
+ * workqueue, the CPU list and the CRC transform.
+ */
+static void __exit siw_exit_module(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (!siw_tx_thread[cpu])
+			continue;
+
+		siw_stop_tx_thread(cpu);
+		siw_tx_thread[cpu] = NULL;
+	}
+	unregister_netdevice_notifier(&siw_netdev_nb);
+	rdma_link_unregister(&siw_link_ops);
+	ib_unregister_driver(RDMA_DRIVER_SIW);
+
+	siw_cm_exit();
+
+	siw_destroy_cpulist();
+
+	if (siw_crypto_shash)
+		crypto_free_shash(siw_crypto_shash);
+
+	pr_info("SoftiWARP detached\n");
+}
+
+module_init(siw_init_module);
+module_exit(siw_exit_module);
+
+MODULE_ALIAS_RDMA_LINK("siw");
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
new file mode 100644
index 000000000000..67171c82b0c4
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_mem.c
@@ -0,0 +1,460 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/gfp.h>
+#include <rdma/ib_verbs.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/sched/mm.h>
+#include <linux/resource.h>
+
+#include "siw.h"
+#include "siw_mem.h"
+
+/*
+ * Insert memory object @m into the device's STag table.
+ *
+ * Only the 24 bit index part of an STag is used for lookup. The index is
+ * allocated cyclically out of [1, 0x00ffffff] with a randomized starting
+ * hint, so the special STag of zero is never handed out. On success the
+ * index is stored in the upper 24 bits of m->stag.
+ *
+ * Returns 0 on success, -ENOMEM if no index could be allocated.
+ */
+int siw_mem_add(struct siw_device *sdev, struct siw_mem *m)
+{
+	struct xa_limit stag_range = XA_LIMIT(1, 0x00ffffff);
+	u32 stag_idx, hint;
+	int rv;
+
+	get_random_bytes(&hint, sizeof(hint));
+	hint &= 0x00ffffff;
+
+	rv = xa_alloc_cyclic(&sdev->mem_xa, &stag_idx, m, stag_range, &hint,
+			     GFP_KERNEL);
+	if (rv < 0)
+		return -ENOMEM;
+
+	/* Index goes into the upper 24 bits, the low 8 bits stay zero */
+	m->stag = stag_idx << 8;
+
+	siw_dbg_mem(m, "new MEM object\n");
+
+	return 0;
+}
+
+/*
+ * siw_mem_id2obj()
+ *
+ * Look up the memory object stored under @stag_index and take a reference
+ * on it. Lookup and reference grab both happen inside one RCU read-side
+ * section; an object whose reference count already dropped to zero is
+ * treated as not found.
+ *
+ * Might be called from:
+ * o process context before sending out of sgl, or
+ * o in softirq when resolving target memory
+ *
+ * Returns the referenced object or NULL.
+ */
+struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
+{
+	struct siw_mem *obj;
+
+	rcu_read_lock();
+
+	obj = xa_load(&sdev->mem_xa, stag_index);
+	if (unlikely(!obj || !kref_get_unless_zero(&obj->ref)))
+		obj = NULL;
+
+	rcu_read_unlock();
+
+	return obj;
+}
+
+/*
+ * Release @num_pages user page references held in @chunk->plist, one page
+ * at a time. With @dirty set, a page not yet flagged dirty is released via
+ * put_user_pages_dirty_lock(); every other page goes through
+ * put_user_page().
+ */
+static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
+			   bool dirty)
+{
+	struct page **entry;
+
+	for (entry = chunk->plist; num_pages; num_pages--, entry++) {
+		if (!PageDirty(*entry) && dirty)
+			put_user_pages_dirty_lock(entry, 1);
+		else
+			put_user_page(*entry);
+	}
+}
+
+/*
+ * Tear down a umem as built by siw_umem_get(): release the pinned pages
+ * chunk by chunk, free the per-chunk page lists, subtract the page count
+ * from the owning mm's pinned_vm, drop the mm reference and free the umem.
+ *
+ * Pages are only considered for dirty marking if @dirty is set and the
+ * umem was registered writable.
+ */
+void siw_umem_release(struct siw_umem *umem, bool dirty)
+{
+	struct mm_struct *mm_s = umem->owning_mm;
+	int remaining = umem->num_pages;
+	int idx = 0;
+
+	while (remaining) {
+		struct siw_page_chunk *chunk = &umem->page_chunk[idx++];
+		int batch = min_t(int, PAGES_PER_CHUNK, remaining);
+
+		siw_free_plist(chunk, batch, umem->writable && dirty);
+		kfree(chunk->plist);
+		remaining -= batch;
+	}
+	atomic64_sub(umem->num_pages, &mm_s->pinned_vm);
+
+	mmdrop(mm_s);
+	kfree(umem->page_chunk);
+	kfree(umem);
+}
+
+/*
+ * siw_mr_add_mem()
+ *
+ * Allocate a memory object describing the region of @len bytes at @start,
+ * register it in the device's STag table and derive the MR's lkey/rkey
+ * from the allocated STag index. The new object starts out with an
+ * invalid STag state and a single reference.
+ *
+ * @mr:      memory region the new object gets attached to
+ * @pd:      protection domain the memory belongs to
+ * @mem_obj: backing object, stored in mem->mem_obj
+ * @start:   start address of the region
+ * @len:     length of the region
+ * @rights:  access rights; only bits within IWARP_ACCESS_MASK are kept
+ *
+ * Returns 0 on success or -ENOMEM. On failure @mr is left untouched.
+ */
+int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
+		   u64 start, u64 len, int rights)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+	struct xa_limit limit = XA_LIMIT(1, 0x00ffffff);
+	u32 id, next;
+
+	if (!mem)
+		return -ENOMEM;
+
+	mem->mem_obj = mem_obj;
+	mem->stag_valid = 0;
+	mem->sdev = sdev;
+	mem->va = start;
+	mem->len = len;
+	mem->pd = pd;
+	mem->perms = rights & IWARP_ACCESS_MASK;
+	kref_init(&mem->ref);
+
+	get_random_bytes(&next, 4);
+	next &= 0x00ffffff;
+
+	if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next,
+			    GFP_KERNEL) < 0) {
+		kfree(mem);
+		return -ENOMEM;
+	}
+	/*
+	 * Attach to the MR only after registration succeeded, so that a
+	 * failure never leaves mr->mem pointing at the freed object.
+	 */
+	mr->mem = mem;
+
+	/* Set the STag index part */
+	mem->stag = id << 8;
+	mr->base_mr.lkey = mr->base_mr.rkey = mem->stag;
+
+	return 0;
+}
+
+/*
+ * Detach the memory object of @mr: mark its STag invalid, remove it from
+ * the device's STag table and drop the MR's reference on it.
+ */
+void siw_mr_drop_mem(struct siw_mr *mr)
+{
+	struct siw_mem *mem = mr->mem;
+	struct siw_mem *erased;
+
+	mem->stag_valid = 0;
+
+	/* make STag invalid visible asap */
+	smp_mb();
+
+	erased = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8);
+	WARN_ON(erased != mem);
+
+	siw_mem_put(mem);
+}
+
+/*
+ * kref release function for a siw_mem object (see siw_mem_put()).
+ * Unless the object is a memory window or carries no backing object, the
+ * backing PBL or umem is released together with it.
+ */
+void siw_free_mem(struct kref *ref)
+{
+	struct siw_mem *mem = container_of(ref, struct siw_mem, ref);
+
+	siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n");
+
+	if (mem->mem_obj && !mem->is_mw) {
+		if (mem->is_pbl)
+			kfree(mem->pbl);
+		else
+			siw_umem_release(mem->umem, true);
+	}
+	kfree(mem);
+}
+
+/*
+ * siw_check_mem()
+ *
+ * Validate an access to a memory object. Checks are done in this order:
+ * STag state, protection domain, access permissions, address range.
+ *
+ * @pd:    Protection Domain memory should belong to
+ * @mem:   memory to be checked
+ * @addr:  starting addr of mem
+ * @perms: requested access permissions
+ * @len:   len of memory interval to be checked
+ *
+ * Returns E_ACCESS_OK, or the negated access state describing the first
+ * failing check.
+ */
+int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
+		  enum ib_access_flags perms, int len)
+{
+	u64 end;
+
+	if (!mem->stag_valid) {
+		siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag);
+		return -E_STAG_INVALID;
+	}
+	if (mem->pd != pd) {
+		siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag);
+		return -E_PD_MISMATCH;
+	}
+	/* All requested permission bits must be granted */
+	if ((mem->perms & perms) < perms) {
+		siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n",
+			   mem->perms, perms);
+		return -E_ACCESS_PERM;
+	}
+	/* The accessed interval must lie completely within the object */
+	end = addr + len;
+	if (addr >= mem->va && end <= mem->va + mem->len)
+		return E_ACCESS_OK;
+
+	siw_dbg_pd(pd, "MEM interval len %d\n", len);
+	siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] out of bounds\n",
+		   (unsigned long long)addr,
+		   (unsigned long long)end);
+	siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] STag=0x%08x\n",
+		   (unsigned long long)mem->va,
+		   (unsigned long long)(mem->va + mem->len),
+		   mem->stag);
+
+	return -E_BASE_BOUNDS;
+}
+
+/*
+ * siw_check_sge()
+ *
+ * Check SGE for access rights in given interval
+ *
+ * @pd:    Protection Domain memory should belong to
+ * @sge:   SGE to be checked
+ * @mem:   location of memory reference within array
+ * @perms: requested access permissions
+ * @off:   starting offset in SGE
+ * @len:   len of memory interval to be checked
+ *
+ * NOTE: If *mem is still NULL, the SGE's memory object is looked up by
+ * its lkey and referenced. That new reference is kept if the check went
+ * ok, and released (with *mem reset to NULL) if the check failed. If *mem
+ * is already set, no lookup is done and the object is not released if
+ * the check fails.
+ *
+ * Returns 0 on success or a negated access state.
+ */
+int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[],
+		  enum ib_access_flags perms, u32 off, int len)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mem *looked_up = NULL;
+	int rv;
+
+	/* No reference taken yet, so these two failures can return directly */
+	if (len + off > sge->length)
+		return -E_BASE_BOUNDS;
+
+	if (!*mem) {
+		looked_up = siw_mem_id2obj(sdev, sge->lkey >> 8);
+		if (unlikely(!looked_up)) {
+			siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey);
+			return -E_STAG_INVALID;
+		}
+		*mem = looked_up;
+	}
+	/* Check if user re-registered with different STag key */
+	if (unlikely((*mem)->stag != sge->lkey)) {
+		siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey);
+		rv = -E_STAG_INVALID;
+		goto drop_ref;
+	}
+	rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len);
+	if (likely(!rv))
+		return 0;
+
+drop_ref:
+	/* Only undo a reference obtained by this very call */
+	if (looked_up) {
+		*mem = NULL;
+		siw_mem_put(looked_up);
+	}
+	return rv;
+}
+
+/*
+ * Drop the memory references held by @wqe. How many entries of wqe->mem
+ * are considered depends on opcode @op.
+ */
+void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op)
+{
+	switch (op) {
+	case SIW_OP_RECEIVE:
+		siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge);
+		return;
+
+	case SIW_OP_READ_RESPONSE:
+		siw_unref_mem_sgl(wqe->mem, 1);
+		return;
+
+	case SIW_OP_SEND:
+	case SIW_OP_WRITE:
+	case SIW_OP_SEND_WITH_IMM:
+	case SIW_OP_SEND_REMOTE_INV:
+	case SIW_OP_READ:
+	case SIW_OP_READ_LOCAL_INV:
+		/* nothing to drop for an inline WQE */
+		if (wqe->sqe.flags & SIW_WQE_INLINE)
+			return;
+
+		siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge);
+		return;
+
+	default:
+		/*
+		 * SIW_OP_INVAL_STAG and SIW_OP_REG_MR
+		 * do not hold memory references
+		 */
+		return;
+	}
+}
+
+/*
+ * Mark the memory object registered under @stag invalid.
+ *
+ * Returns 0 on success, -EINVAL if the STag is unknown, or -EACCES if the
+ * memory object belongs to a protection domain other than @pd.
+ */
+int siw_invalidate_stag(struct ib_pd *pd, u32 stag)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mem *mem;
+	int rv;
+
+	mem = siw_mem_id2obj(sdev, stag >> 8);
+	if (unlikely(!mem)) {
+		siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag);
+		return -EINVAL;
+	}
+	if (likely(mem->pd == pd)) {
+		/*
+		 * Per RDMA verbs definition, an STag may already be in
+		 * invalid state if invalidation is requested. So no state
+		 * check here.
+		 */
+		mem->stag_valid = 0;
+		rv = 0;
+
+		siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag);
+	} else {
+		siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag);
+		rv = -EACCES;
+	}
+	/* release the lookup reference */
+	siw_mem_put(mem);
+
+	return rv;
+}
+
+/*
+ * Resolve linear byte offset @off into the list of variably sized PBL
+ * elements and return the address backing it.
+ *
+ * @len: if non-NULL, receives the number of bytes left within the element
+ *       found (0 if @off is not covered by the list)
+ * @idx: if non-NULL, provides the element to start searching at and
+ *       receives the index of the element found, for later resume
+ *
+ * Returns the address, or 0 if @off lies beyond the last element.
+ */
+u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx)
+{
+	int pos;
+
+	for (pos = idx ? *idx : 0; pos < pbl->num_buf; pos++) {
+		struct siw_pble *pble = &pbl->pbe[pos];
+		u64 rel;
+
+		/* element ends at or before @off: keep looking */
+		if (pble->pbl_off + pble->size <= off)
+			continue;
+
+		rel = off - pble->pbl_off;
+		if (len)
+			*len = pble->size - rel;
+		if (idx)
+			*idx = pos;
+
+		return pble->addr + rel;
+	}
+	if (len)
+		*len = 0;
+
+	return 0;
+}
+
+/*
+ * Allocate a zeroed physical buffer list with room for @num_buf elements
+ * and record that capacity in pbl->max_buf.
+ *
+ * The allocation covers sizeof(struct siw_pbl) plus @num_buf - 1 further
+ * elements, i.e. the structure itself is taken to carry the first one.
+ * The size is computed as size_t: an int would silently truncate for a
+ * large @num_buf and yield a buffer smaller than max_buf claims.
+ *
+ * Returns the list, ERR_PTR(-EINVAL) for @num_buf == 0, or
+ * ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+struct siw_pbl *siw_pbl_alloc(u32 num_buf)
+{
+	struct siw_pbl *pbl;
+	size_t buf_size = sizeof(*pbl);
+
+	if (num_buf == 0)
+		return ERR_PTR(-EINVAL);
+
+	buf_size += (size_t)(num_buf - 1) * sizeof(struct siw_pble);
+
+	pbl = kzalloc(buf_size, GFP_KERNEL);
+	if (!pbl)
+		return ERR_PTR(-ENOMEM);
+
+	pbl->max_buf = num_buf;
+
+	return pbl;
+}
+/*
+ * siw_umem_get()
+ *
+ * Pin the user pages backing the @len bytes at @start and record them in
+ * a newly allocated siw_umem, organized in chunks of up to
+ * PAGES_PER_CHUNK page pointers.
+ *
+ * @start: user virtual start address, rounded down to a page boundary
+ * @len: length of the region in bytes, must be non-zero
+ * @writable: stored in umem->writable; also selects the GUP flags used
+ *
+ * Pinned pages are accounted in the current mm's pinned_vm, which is
+ * checked against RLIMIT_MEMLOCK before pinning starts.
+ *
+ * Returns the umem on success or an ERR_PTR(): -EPERM if can_do_mlock()
+ * fails, -EINVAL for a zero length, -ENOMEM on allocation failure or if
+ * the memlock limit would be exceeded, or the error returned by
+ * get_user_pages(). On failure, whatever was pinned and allocated so far
+ * is undone via siw_umem_release().
+ */
+struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable)
+{
+ struct siw_umem *umem;
+ struct mm_struct *mm_s;
+ u64 first_page_va;
+ unsigned long mlock_limit;
+ unsigned int foll_flags = FOLL_WRITE;
+ int num_pages, num_chunks, i, rv = 0;
+
+ if (!can_do_mlock())
+ return ERR_PTR(-EPERM);
+
+ if (!len)
+ return ERR_PTR(-EINVAL);
+ /* Count whole pages from the page holding @start to the end of the range */
+ first_page_va = start & PAGE_MASK;
+ num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT;
+ num_chunks = (num_pages >> CHUNK_SHIFT) + 1;
+
+ umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+
+ mm_s = current->mm;
+ umem->owning_mm = mm_s;
+ umem->writable = writable;
+
+ mmgrab(mm_s);
+ /* FOLL_WRITE is always requested; a non-writable umem adds FOLL_FORCE */
+ if (!writable)
+ foll_flags |= FOLL_FORCE;
+
+ down_read(&mm_s->mmap_sem);
+
+ mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ /* Refuse up front if pinning all pages would exceed the memlock limit */
+ if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) {
+ rv = -ENOMEM;
+ goto out_sem_up;
+ }
+ umem->fp_addr = first_page_va;
+
+ umem->page_chunk =
+ kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL);
+ if (!umem->page_chunk) {
+ rv = -ENOMEM;
+ goto out_sem_up;
+ }
+ for (i = 0; num_pages; i++) { /* one iteration per chunk */
+ int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK);
+
+ umem->page_chunk[i].plist =
+ kcalloc(nents, sizeof(struct page *), GFP_KERNEL);
+ if (!umem->page_chunk[i].plist) {
+ rv = -ENOMEM;
+ goto out_sem_up;
+ }
+ got = 0;
+ while (nents) { /* a call may pin fewer pages than asked for */
+ struct page **plist = &umem->page_chunk[i].plist[got];
+
+ rv = get_user_pages(first_page_va, nents,
+ foll_flags | FOLL_LONGTERM,
+ plist, NULL);
+ if (rv < 0)
+ goto out_sem_up;
+
+ umem->num_pages += rv;
+ atomic64_add(rv, &mm_s->pinned_vm);
+ first_page_va += rv * PAGE_SIZE;
+ nents -= rv;
+ got += rv;
+ }
+ num_pages -= got;
+ }
+out_sem_up:
+ up_read(&mm_s->mmap_sem);
+ /* rv > 0 only if the pinning loop above ran to completion */
+ if (rv > 0)
+ return umem;
+
+ siw_umem_release(umem, false);
+
+ return ERR_PTR(rv);
+}
diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h
new file mode 100644
index 000000000000..f43daf280891
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_mem.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_MEM_H
+#define _SIW_MEM_H
+
+struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable);
+void siw_umem_release(struct siw_umem *umem, bool dirty);
+struct siw_pbl *siw_pbl_alloc(u32 num_buf);
+u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx);
+struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index);
+int siw_mem_add(struct siw_device *sdev, struct siw_mem *m);
+int siw_invalidate_stag(struct ib_pd *pd, u32 stag);
+int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr,
+ enum ib_access_flags perms, int len);
+int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge,
+ struct siw_mem *mem[], enum ib_access_flags perms,
+ u32 off, int len);
+void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op);
+int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj,
+ u64 start, u64 len, int rights);
+void siw_mr_drop_mem(struct siw_mr *mr);
+void siw_free_mem(struct kref *ref);
+/* Drop one reference on @mem; the final put releases it via siw_free_mem(). */
+static inline void siw_mem_put(struct siw_mem *mem)
+{
+ kref_put(&mem->ref, siw_free_mem);
+}
+/*
+ * Map memory object @m back to a siw_mr by applying container_of() to the
+ * MR's 'mem' member.
+ *
+ * NOTE(review): siw_mr_add_mem() assigns a separately kzalloc'ed object
+ * to mr->mem, which suggests 'mem' is a pointer member and not an
+ * embedded struct. If so, container_of() cannot yield the owning MR
+ * here - confirm before using this helper.
+ */
+static inline struct siw_mr *siw_mem2mr(struct siw_mem *m)
+{
+ return container_of(m, struct siw_mr, mem);
+}
+
+/*
+ * Drop the references held in the first @num_sge entries of array @mem
+ * and reset those entries to NULL. Processing stops early at the first
+ * entry which is NULL already.
+ */
+static inline void siw_unref_mem_sgl(struct siw_mem **mem, unsigned int num_sge)
+{
+	unsigned int i;
+
+	for (i = 0; i < num_sge && mem[i]; i++) {
+		siw_mem_put(mem[i]);
+		mem[i] = NULL;
+	}
+}
+/* A umem keeps its page pointers in chunks of PAGES_PER_CHUNK entries each */
+#define CHUNK_SHIFT 9 /* sets number of pages per chunk */
+#define PAGES_PER_CHUNK (_AC(1, UL) << CHUNK_SHIFT) /* 512 */
+#define CHUNK_MASK (~(PAGES_PER_CHUNK - 1)) /* clears the page-in-chunk bits */
+#define PAGE_CHUNK_SIZE (PAGES_PER_CHUNK * sizeof(struct page *)) /* bytes of one full page list */
+
+/*
+ * siw_get_upage()
+ *
+ * Translate user virtual address @addr into the page pointer recorded in
+ * @umem. Returns NULL if @addr lies beyond the pages the umem holds.
+ *
+ * @umem: two dimensional list of page pointers
+ * @addr: user virtual address
+ */
+static inline struct page *siw_get_upage(struct siw_umem *umem, u64 addr)
+{
+	unsigned int page_idx = (addr - umem->fp_addr) >> PAGE_SHIFT;
+
+	if (unlikely(page_idx >= umem->num_pages))
+		return NULL;
+
+	/* upper bits select the chunk, the low CHUNK_SHIFT bits the slot */
+	return umem->page_chunk[page_idx >> CHUNK_SHIFT]
+		.plist[page_idx & ~CHUNK_MASK];
+}
+#endif
diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c
new file mode 100644
index 000000000000..11383d9f95ef
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_qp.c
@@ -0,0 +1,1322 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/llist.h>
+#include <asm/barrier.h>
+#include <net/tcp.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+/* Printable QP state names, indexed by SIW_QP_STATE_*; used for debug output */
+static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
+ [SIW_QP_STATE_IDLE] = "IDLE",
+ [SIW_QP_STATE_RTR] = "RTR",
+ [SIW_QP_STATE_RTS] = "RTS",
+ [SIW_QP_STATE_CLOSING] = "CLOSING",
+ [SIW_QP_STATE_TERMINATE] = "TERMINATE",
+ [SIW_QP_STATE_ERROR] = "ERROR"
+};
+
+/*
+ * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
+ * per-RDMAP message basis. Please keep order of initializer. All MPA len
+ * is initialized to minimum packet size.
+ *
+ * The table is indexed by RDMAP opcode and the initializers are positional,
+ * so each entry's position must match the opcode named in its leading
+ * comment. Per opcode it holds the header length (hdr_len), a template of
+ * the control word (ctrl) and the receive handler (rx_data). The mpa_len
+ * template is the header size minus 2.
+ */
+struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = {
+ { /* RDMAP_RDMA_WRITE */
+ .hdr_len = sizeof(struct iwarp_rdma_write),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
+ cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_RDMA_WRITE),
+ .rx_data = siw_proc_write },
+ { /* RDMAP_RDMA_READ_REQ */
+ .hdr_len = sizeof(struct iwarp_rdma_rreq),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_RDMA_READ_REQ),
+ .rx_data = siw_proc_rreq },
+ { /* RDMAP_RDMA_READ_RESP */
+ .hdr_len = sizeof(struct iwarp_rdma_rresp),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
+ cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_RDMA_READ_RESP),
+ .rx_data = siw_proc_rresp },
+ { /* RDMAP_SEND */
+ .hdr_len = sizeof(struct iwarp_send),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_SEND),
+ .rx_data = siw_proc_send },
+ { /* RDMAP_SEND_INVAL */
+ .hdr_len = sizeof(struct iwarp_send_inv),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_SEND_INVAL),
+ .rx_data = siw_proc_send },
+ { /* RDMAP_SEND_SE */
+ .hdr_len = sizeof(struct iwarp_send),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_SEND_SE),
+ .rx_data = siw_proc_send },
+ { /* RDMAP_SEND_SE_INVAL */
+ .hdr_len = sizeof(struct iwarp_send_inv),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_SEND_SE_INVAL),
+ .rx_data = siw_proc_send },
+ { /* RDMAP_TERMINATE */
+ .hdr_len = sizeof(struct iwarp_terminate),
+ .ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
+ .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
+ cpu_to_be16(RDMAP_VERSION << 6) |
+ cpu_to_be16(RDMAP_TERMINATE),
+ .rx_data = siw_proc_terminate }
+};
+
+/*
+ * Socket callback for newly arrived data. Runs with sk_callback_lock
+ * held for reading. Data is only consumed if the socket is still bound
+ * to a QP, RX is not suspended, the QP state lock can be taken for
+ * reading without blocking, and the QP is in RTS.
+ */
+void siw_qp_llp_data_ready(struct sock *sk)
+{
+	struct siw_qp *qp;
+
+	read_lock(&sk->sk_callback_lock);
+
+	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
+		goto unlock;
+
+	qp = sk_to_qp(sk);
+
+	if (unlikely(qp->rx_stream.rx_suspend ||
+		     !down_read_trylock(&qp->state_lock))) {
+		siw_dbg_qp(qp, "unable to process RX, suspend: %d\n",
+			   qp->rx_stream.rx_suspend);
+		goto unlock;
+	}
+	if (likely(qp->attrs.state == SIW_QP_STATE_RTS)) {
+		read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 };
+
+		/*
+		 * Implements data receive operation during
+		 * socket callback. TCP gracefully catches
+		 * the case where there is nothing to receive
+		 * (not calling siw_tcp_rx_data() then).
+		 */
+		tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
+	}
+	up_read(&qp->state_lock);
+unlock:
+	read_unlock(&sk->sk_callback_lock);
+}
+/*
+ * siw_qp_llp_close()
+ *
+ * Detach @qp from its lower layer connection. With the QP state lock
+ * held for writing, RX and TX processing get suspended, the socket
+ * reference in the QP attributes is cleared, the QP state is advanced,
+ * both work queues are flushed and the QP's reference on its CEP is
+ * dropped.
+ */
+void siw_qp_llp_close(struct siw_qp *qp)
+{
+ siw_dbg_qp(qp, "enter llp close, state = %s\n",
+ siw_qp_state_to_string[qp->attrs.state]);
+
+ down_write(&qp->state_lock);
+
+ qp->rx_stream.rx_suspend = 1;
+ qp->tx_ctx.tx_suspend = 1;
+ qp->attrs.sk = NULL;
+
+ switch (qp->attrs.state) {
+ case SIW_QP_STATE_RTS:
+ case SIW_QP_STATE_RTR:
+ case SIW_QP_STATE_IDLE:
+ case SIW_QP_STATE_TERMINATE:
+ qp->attrs.state = SIW_QP_STATE_ERROR;
+ break;
+ /*
+ * SIW_QP_STATE_CLOSING:
+ *
+ * This is a forced close. shall the QP be moved to
+ * ERROR or IDLE ?
+ *
+ * NOTE(review): as coded, an idle TX WQE leads to ERROR and a
+ * busy one to IDLE. That reads inverted relative to the Verbs
+ * rule quoted in siw_qp_nextstate_from_rts() (IDLE if the SQ
+ * is empty, ERROR otherwise) - confirm this is intended.
+ */
+ case SIW_QP_STATE_CLOSING:
+ if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
+ qp->attrs.state = SIW_QP_STATE_ERROR;
+ else
+ qp->attrs.state = SIW_QP_STATE_IDLE;
+ break;
+
+ default:
+ siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
+ siw_qp_state_to_string[qp->attrs.state]);
+ break;
+ }
+ siw_sq_flush(qp);
+ siw_rq_flush(qp);
+
+ /*
+ * Dereference closing CEP
+ */
+ if (qp->cep) {
+ siw_cep_put(qp->cep);
+ qp->cep = NULL;
+ }
+
+ up_write(&qp->state_lock);
+
+ siw_dbg_qp(qp, "llp close exit: state %s\n",
+ siw_qp_state_to_string[qp->attrs.state]);
+}
+
+/*
+ * Socket callback routine informing about newly available send space.
+ * Chains to the write_space handler saved in the CEP and, if the socket
+ * is no longer flagged SOCK_NOSPACE, schedules SQ work for processing SQ
+ * items.
+ *
+ * Like siw_qp_llp_data_ready(), the callback holds sk_callback_lock for
+ * reading while it resolves and uses the connection endpoint, and it
+ * tolerates a socket that has no endpoint attached (anymore) instead of
+ * dereferencing a NULL CEP.
+ */
+void siw_qp_llp_write_space(struct sock *sk)
+{
+	struct siw_cep *cep;
+
+	read_lock(&sk->sk_callback_lock);
+
+	cep = sk_to_cep(sk);
+	if (cep) {
+		cep->sk_write_space(sk);
+
+		if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+			(void)siw_sq_start(cep->qp);
+	}
+	read_unlock(&sk->sk_callback_lock);
+}
+
+/*
+ * Allocate the inbound (IRQ) and outbound (ORQ) read queues of @qp.
+ * Both sizes get rounded up to a power of two and are recorded in
+ * qp->attrs.
+ *
+ * Returns 0 on success or -ENOMEM. After a failure no queue memory is
+ * left allocated, both recorded sizes are zero, and qp->irq is reset to
+ * NULL so that a later vfree(qp->irq) cannot free the queue twice.
+ */
+static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
+{
+	irq_size = roundup_pow_of_two(irq_size);
+	orq_size = roundup_pow_of_two(orq_size);
+
+	qp->attrs.irq_size = irq_size;
+	qp->attrs.orq_size = orq_size;
+
+	qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
+	if (!qp->irq) {
+		siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
+		qp->attrs.irq_size = 0;
+		/* no ORQ was allocated either */
+		qp->attrs.orq_size = 0;
+		return -ENOMEM;
+	}
+	qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
+	if (!qp->orq) {
+		siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
+		qp->attrs.orq_size = 0;
+		qp->attrs.irq_size = 0;
+		vfree(qp->irq);
+		/* do not leave a dangling pointer behind */
+		qp->irq = NULL;
+		return -ENOMEM;
+	}
+	siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size);
+	return 0;
+}
+
+/*
+ * Allocate the CRC32c hash descriptors for the TX and RX path of @qp and
+ * bind both to the module wide transform.
+ *
+ * Returns 0 on success, -ENOENT if no CRC transform is available, or
+ * -ENOMEM if a descriptor cannot be allocated. In the -ENOMEM case both
+ * descriptor pointers are reset to NULL.
+ */
+static int siw_qp_enable_crc(struct siw_qp *qp)
+{
+	struct siw_rx_stream *c_rx = &qp->rx_stream;
+	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+	int size;
+
+	/*
+	 * Module init may leave siw_crypto_shash NULL if loading crc32c
+	 * failed. Check before asking the transform for its descriptor
+	 * size, which would dereference it.
+	 */
+	if (siw_crypto_shash == NULL)
+		return -ENOENT;
+
+	size = crypto_shash_descsize(siw_crypto_shash) +
+		sizeof(struct shash_desc);
+
+	c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
+	c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
+	if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
+		kfree(c_tx->mpa_crc_hd);
+		kfree(c_rx->mpa_crc_hd);
+		c_tx->mpa_crc_hd = NULL;
+		c_rx->mpa_crc_hd = NULL;
+		return -ENOMEM;
+	}
+	c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
+	c_rx->mpa_crc_hd->tfm = siw_crypto_shash;
+
+	return 0;
+}
+
+/*
+ * Send a non signalled READ or WRITE to peer side as negotiated
+ * with MPAv2 P2P setup protocol. The work request is only created
+ * as a current active WR and does not consume Send Queue space.
+ *
+ * @ctrl selects the operation: MPA_V2_RDMA_WRITE_RTR yields a zero length
+ * WRITE, MPA_V2_RDMA_READ_RTR a zero length READ, which additionally
+ * occupies one ORQ entry.
+ *
+ * Returns -EIO if the current TX WQE is not idle or no ORQ entry is free,
+ * -EINVAL if @ctrl requests neither operation, otherwise the result of
+ * siw_sq_start(). On error the TX WQE is left (or put back) idle.
+ *
+ * Caller must hold QP state lock.
+ */
+int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
+{
+ struct siw_wqe *wqe = tx_wqe(qp);
+ unsigned long flags;
+ int rv = 0;
+
+ spin_lock_irqsave(&qp->sq_lock, flags);
+
+ if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+ return -EIO;
+ }
+ memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+
+ wqe->wr_status = SIW_WR_QUEUED;
+ wqe->sqe.flags = 0;
+ wqe->sqe.num_sge = 1;
+ wqe->sqe.sge[0].length = 0;
+ wqe->sqe.sge[0].laddr = 0;
+ wqe->sqe.sge[0].lkey = 0;
+ /*
+ * While it must not be checked for inbound zero length
+ * READ/WRITE, some HW may treat STag 0 special.
+ */
+ wqe->sqe.rkey = 1;
+ wqe->sqe.raddr = 0;
+ wqe->processed = 0;
+
+ if (ctrl & MPA_V2_RDMA_WRITE_RTR)
+ wqe->sqe.opcode = SIW_OP_WRITE;
+ else if (ctrl & MPA_V2_RDMA_READ_RTR) {
+ struct siw_sqe *rreq;
+
+ wqe->sqe.opcode = SIW_OP_READ;
+
+ spin_lock(&qp->orq_lock);
+
+ rreq = orq_get_free(qp);
+ if (rreq) {
+ siw_read_to_orq(rreq, &wqe->sqe);
+ qp->orq_put++;
+ } else
+ rv = -EIO;
+
+ spin_unlock(&qp->orq_lock);
+ } else
+ rv = -EINVAL;
+
+ if (rv)
+ wqe->wr_status = SIW_WR_IDLE;
+
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+ if (!rv)
+ rv = siw_sq_start(qp);
+
+ return rv;
+}
+
+/*
+ * Map memory access error to DDP tagged error. Access states without a
+ * dedicated mapping trigger a warning and are reported as invalid STag.
+ */
+enum ddp_ecode siw_tagged_error(enum siw_access_state state)
+{
+	switch (state) {
+	case E_BASE_BOUNDS:
+		return DDP_ECODE_T_BASE_BOUNDS;
+	case E_PD_MISMATCH:
+		return DDP_ECODE_T_STAG_NOT_ASSOC;
+	case E_ACCESS_PERM:
+		/*
+		 * RFC 5041 (DDP) lacks an ecode for insufficient access
+		 * permissions. 'Invalid STag' seem to be the closest
+		 * match though.
+		 */
+	case E_STAG_INVALID:
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+	return DDP_ECODE_T_INVALID_STAG;
+}
+
+/*
+ * Map memory access error to RDMAP protection error. Any access state
+ * without a dedicated mapping yields RDMAP_ECODE_UNSPECIFIED.
+ */
+enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
+{
+	enum rdmap_ecode ecode = RDMAP_ECODE_UNSPECIFIED;
+
+	switch (state) {
+	case E_STAG_INVALID:
+		ecode = RDMAP_ECODE_INVALID_STAG;
+		break;
+	case E_BASE_BOUNDS:
+		ecode = RDMAP_ECODE_BASE_BOUNDS;
+		break;
+	case E_PD_MISMATCH:
+		ecode = RDMAP_ECODE_STAG_NOT_ASSOC;
+		break;
+	case E_ACCESS_PERM:
+		ecode = RDMAP_ECODE_ACCESS_RIGHTS;
+		break;
+	default:
+		break;
+	}
+	return ecode;
+}
+
+/*
+ * Record the parameters of a TERMINATE message to be sent for @qp.
+ * Termination info which is already marked valid is not overwritten;
+ * the request is logged in either case.
+ */
+void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, u8 etype,
+			u8 ecode, int in_tx)
+{
+	if (qp->term_info.valid)
+		goto log;
+
+	memset(&qp->term_info, 0, sizeof(qp->term_info));
+	qp->term_info.layer = layer;
+	qp->term_info.etype = etype;
+	qp->term_info.ecode = ecode;
+	qp->term_info.in_tx = in_tx;
+	/* mark valid last, after all other fields are filled in */
+	qp->term_info.valid = 1;
+log:
+	siw_dbg_qp(qp, "init TERM: layer %d, type %d, code %d, in tx %s\n",
+		   layer, etype, ecode, in_tx ? "yes" : "no");
+}
+
+/*
+ * Send a TERMINATE message, as defined in RFC's 5040/5041/5044/6581.
+ * Sending TERMINATE messages is best effort - such messages
+ * can only be sent if the QP is still connected and it does
+ * not have another outbound message in-progress, i.e. the
+ * TERMINATE message must not interfere with an incomplete current
+ * transmit operation.
+ *
+ * The message is built from qp->term_info, which is consumed (marked
+ * invalid) by this call. Depending on error layer and type, a copy of
+ * the offending DDP/RDMAP header gets appended: either the header of the
+ * current inbound frame, or - for an error detected while creating a
+ * READ response - a READ request header reconstructed from the current
+ * TX work queue element. The trailing CRC word is computed only if the
+ * QP has a TX CRC context, and is sent as zero otherwise.
+ *
+ * All failures (no valid term info, TX in progress, no socket,
+ * allocation or CRC update failure) silently skip the transmission.
+ */
+void siw_send_terminate(struct siw_qp *qp)
+{
+ struct kvec iov[3];
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+ struct iwarp_terminate *term = NULL;
+ union iwarp_hdr *err_hdr = NULL;
+ struct socket *s = qp->attrs.sk;
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ union iwarp_hdr *rx_hdr = &srx->hdr;
+ u32 crc = 0;
+ int num_frags, len_terminate, rv;
+
+ if (!qp->term_info.valid)
+ return;
+
+ qp->term_info.valid = 0;
+
+ if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
+ siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
+ tx_type(tx_wqe(qp)));
+ return;
+ }
+ if (!s && qp->cep)
+ /* QP not yet in RTS. Take socket from connection end point */
+ s = qp->cep->sock;
+
+ if (!s) {
+ siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
+ return;
+ }
+
+ term = kzalloc(sizeof(*term), GFP_KERNEL);
+ if (!term)
+ return;
+
+ term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
+ term->ddp_mo = 0;
+ term->ddp_msn = cpu_to_be32(1);
+
+ iov[0].iov_base = term;
+ iov[0].iov_len = sizeof(*term);
+
+ if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
+ ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
+ (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
+ err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
+ if (!err_hdr) {
+ kfree(term);
+ return;
+ }
+ }
+ memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
+ sizeof(struct iwarp_ctrl));
+
+ __rdmap_term_set_layer(term, qp->term_info.layer);
+ __rdmap_term_set_etype(term, qp->term_info.etype);
+ __rdmap_term_set_ecode(term, qp->term_info.ecode);
+
+ switch (qp->term_info.layer) {
+ case TERM_ERROR_LAYER_RDMAP:
+ if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
+ /* No additional DDP/RDMAP header to be included */
+ break;
+
+ if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
+ /*
+ * Complete RDMAP frame will get attached, and
+ * DDP segment length is valid
+ */
+ term->flag_m = 1;
+ term->flag_d = 1;
+ term->flag_r = 1;
+
+ if (qp->term_info.in_tx) {
+ struct iwarp_rdma_rreq *rreq;
+ struct siw_wqe *wqe = tx_wqe(qp);
+
+ /* Inbound RREQ error, detected during
+ * RRESP creation. Take state from
+ * current TX work queue element to
+ * reconstruct peers RREQ.
+ */
+ rreq = (struct iwarp_rdma_rreq *)err_hdr;
+
+ memcpy(&rreq->ctrl,
+ &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
+ sizeof(struct iwarp_ctrl));
+
+ rreq->rsvd = 0;
+ rreq->ddp_qn =
+ htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
+
+ /* Provide RREQ's MSN as kept aside */
+ rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);
+
+ rreq->ddp_mo = htonl(wqe->processed);
+ rreq->sink_stag = htonl(wqe->sqe.rkey);
+ rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
+ rreq->read_size = htonl(wqe->sqe.sge[0].length);
+ rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
+ rreq->source_to =
+ cpu_to_be64(wqe->sqe.sge[0].laddr);
+
+ iov[1].iov_base = rreq;
+ iov[1].iov_len = sizeof(*rreq);
+
+ rx_hdr = (union iwarp_hdr *)rreq;
+ } else {
+ /* Take RDMAP/DDP information from
+ * current (failed) inbound frame.
+ */
+ iov[1].iov_base = rx_hdr;
+
+ if (__rdmap_get_opcode(&rx_hdr->ctrl) ==
+ RDMAP_RDMA_READ_REQ)
+ iov[1].iov_len =
+ sizeof(struct iwarp_rdma_rreq);
+ else /* SEND type */
+ iov[1].iov_len =
+ sizeof(struct iwarp_send);
+ }
+ } else {
+ /* Do not report DDP hdr information if packet
+ * layout is unknown
+ */
+ if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
+ (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
+ break;
+
+ iov[1].iov_base = rx_hdr;
+
+ /* Only DDP frame will get attached */
+ if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
+ iov[1].iov_len =
+ sizeof(struct iwarp_rdma_write);
+ else
+ iov[1].iov_len = sizeof(struct iwarp_send);
+
+ term->flag_m = 1;
+ term->flag_d = 1;
+ }
+ term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
+ break;
+
+ case TERM_ERROR_LAYER_DDP:
+ /* Report error encountered while DDP processing.
+ * This can only happen as a result of inbound
+ * DDP processing
+ */
+
+ /* Do not report DDP hdr information if packet
+ * layout is unknown
+ */
+ if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
+ (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
+ ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
+ (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
+ break;
+
+ iov[1].iov_base = rx_hdr;
+
+ if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
+ iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
+ else
+ iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);
+
+ term->flag_m = 1;
+ term->flag_d = 1;
+ break;
+
+ default:
+ break;
+ }
+ if (term->flag_m || term->flag_d || term->flag_r) { /* a header copy follows the TERMINATE header */
+ iov[2].iov_base = &crc;
+ iov[2].iov_len = sizeof(crc);
+ len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
+ num_frags = 3;
+ } else {
+ iov[1].iov_base = &crc;
+ iov[1].iov_len = sizeof(crc);
+ len_terminate = sizeof(*term) + MPA_CRC_SIZE;
+ num_frags = 2;
+ }
+
+ /* Adjust DDP Segment Length parameter, if valid */
+ if (term->flag_m) {
+ u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
+ enum rdma_opcode op = __rdmap_get_opcode(&rx_hdr->ctrl);
+
+ real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
+ rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
+ }
+
+ term->ctrl.mpa_len =
+ cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE));
+ if (qp->tx_ctx.mpa_crc_hd) {
+ crypto_shash_init(qp->tx_ctx.mpa_crc_hd);
+ if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
+ (u8 *)iov[0].iov_base,
+ iov[0].iov_len))
+ goto out;
+
+ if (num_frags == 3) {
+ if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
+ (u8 *)iov[1].iov_base,
+ iov[1].iov_len))
+ goto out;
+ }
+ crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc);
+ }
+
+ rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
+ siw_dbg_qp(qp, "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
+ rv == len_terminate ? "success" : "failure",
+ __rdmap_term_layer(term), __rdmap_term_etype(term),
+ __rdmap_term_ecode(term), rv);
+out:
+ kfree(term);
+ kfree(err_hdr);
+}
+
+/*
+ * Handle all attrs other than state.
+ *
+ * Currently this covers the access flags only: if @mask selects them,
+ * the bind/write/read enable bits of the QP are each set or cleared to
+ * match @attrs->flags. Other QP flag bits are left alone.
+ */
+static void siw_qp_modify_nonstate(struct siw_qp *qp,
+				   struct siw_qp_attrs *attrs,
+				   enum siw_qp_attr_mask mask)
+{
+	static const int access_flag[] = {
+		SIW_RDMA_BIND_ENABLED,
+		SIW_RDMA_WRITE_ENABLED,
+		SIW_RDMA_READ_ENABLED,
+	};
+	int i;
+
+	if (!(mask & SIW_QP_ATTR_ACCESS_FLAGS))
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(access_flag); i++) {
+		if (attrs->flags & access_flag[i])
+			qp->attrs.flags |= access_flag[i];
+		else
+			qp->attrs.flags &= ~access_flag[i];
+	}
+}
+/*
+ * Handle a requested QP state transition out of IDLE.
+ *
+ * IDLE -> RTS requires both SIW_QP_ATTR_LLP_HANDLE and SIW_QP_ATTR_MPA
+ * to be set in @mask. It enables CRC if requested in @attrs->flags,
+ * resets the DDP message sequence numbers of all untagged queues,
+ * allocates the read queues and attaches the socket.
+ * IDLE -> ERROR flushes the RQ and drops the CEP reference.
+ * Any other target state is ignored.
+ *
+ * Returns 0 on success or if nothing was done, -EINVAL for a missing
+ * attribute, or the error from siw_qp_enable_crc()/siw_qp_readq_init().
+ *
+ * NOTE(review): CRC gets enabled before the @mask checks, so the CRC
+ * descriptors stay allocated on the QP if the transition is rejected
+ * afterwards - presumably they are freed with the QP; confirm.
+ */
+static int siw_qp_nextstate_from_idle(struct siw_qp *qp,
+ struct siw_qp_attrs *attrs,
+ enum siw_qp_attr_mask mask)
+{
+ int rv = 0;
+
+ switch (attrs->state) {
+ case SIW_QP_STATE_RTS:
+ if (attrs->flags & SIW_MPA_CRC) {
+ rv = siw_qp_enable_crc(qp);
+ if (rv)
+ break;
+ }
+ if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
+ siw_dbg_qp(qp, "no socket\n");
+ rv = -EINVAL;
+ break;
+ }
+ if (!(mask & SIW_QP_ATTR_MPA)) {
+ siw_dbg_qp(qp, "no MPA\n");
+ rv = -EINVAL;
+ break;
+ }
+ /*
+ * Initialize iWARP TX state
+ */
+ qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
+ qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
+ qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
+
+ /*
+ * Initialize iWARP RX state
+ */
+ qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
+ qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
+ qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
+
+ /*
+ * init IRD free queue, caller has already checked
+ * limits.
+ */
+ rv = siw_qp_readq_init(qp, attrs->irq_size,
+ attrs->orq_size);
+ if (rv)
+ break;
+
+ qp->attrs.sk = attrs->sk;
+ qp->attrs.state = SIW_QP_STATE_RTS;
+
+ siw_dbg_qp(qp, "enter RTS: crc=%s, ord=%u, ird=%u\n",
+ attrs->flags & SIW_MPA_CRC ? "y" : "n",
+ qp->attrs.orq_size, qp->attrs.irq_size);
+ break;
+
+ case SIW_QP_STATE_ERROR:
+ siw_rq_flush(qp);
+ qp->attrs.state = SIW_QP_STATE_ERROR;
+ if (qp->cep) {
+ siw_cep_put(qp->cep);
+ qp->cep = NULL;
+ }
+ break;
+
+ default:
+ break;
+ }
+ return rv;
+}
+/*
+ * Handle a requested QP state transition out of RTS.
+ *
+ * Returns the drop_conn indication: 1 for every handled target state
+ * (CLOSING, TERMINATE, ERROR), 0 if the requested state is ignored.
+ */
+static int siw_qp_nextstate_from_rts(struct siw_qp *qp,
+ struct siw_qp_attrs *attrs)
+{
+ int drop_conn = 0;
+
+ switch (attrs->state) {
+ case SIW_QP_STATE_CLOSING:
+ /*
+ * Verbs: move to IDLE if SQ and ORQ are empty.
+ * Move to ERROR otherwise. But first of all we must
+ * close the connection. So we keep CLOSING or ERROR
+ * as a transient state, schedule connection drop work
+ * and wait for the socket state change upcall to
+ * come back closed.
+ */
+ if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
+ qp->attrs.state = SIW_QP_STATE_CLOSING;
+ } else {
+ qp->attrs.state = SIW_QP_STATE_ERROR;
+ siw_sq_flush(qp);
+ }
+ siw_rq_flush(qp);
+
+ drop_conn = 1;
+ break;
+
+ case SIW_QP_STATE_TERMINATE:
+ qp->attrs.state = SIW_QP_STATE_TERMINATE;
+
+ siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+ RDMAP_ETYPE_CATASTROPHIC,
+ RDMAP_ECODE_UNSPECIFIED, 1);
+ drop_conn = 1;
+ break;
+
+ case SIW_QP_STATE_ERROR:
+ /*
+ * This is an emergency close.
+ *
+ * Any in progress transmit operation will get
+ * cancelled.
+ * This will likely result in a protocol failure,
+ * if a TX operation is in transit. The caller
+ * could unconditional wait to give the current
+ * operation a chance to complete.
+ * Esp., how to handle the non-empty IRQ case?
+ * The peer was asking for data transfer at a valid
+ * point in time.
+ */
+ siw_sq_flush(qp);
+ siw_rq_flush(qp);
+ qp->attrs.state = SIW_QP_STATE_ERROR;
+ drop_conn = 1;
+ break;
+
+ default:
+ break;
+ }
+ return drop_conn;
+}
+
+/*
+ * siw_qp_nextstate_from_term()
+ *
+ * State change requested while the QP is in TERMINATE. Only the
+ * transition to ERROR is acted upon: the RQ gets flushed, and the SQ
+ * as well if the current TX WQE is not idle.
+ */
+static void siw_qp_nextstate_from_term(struct siw_qp *qp,
+				       struct siw_qp_attrs *attrs)
+{
+	if (attrs->state != SIW_QP_STATE_ERROR)
+		return;
+
+	siw_rq_flush(qp);
+	qp->attrs.state = SIW_QP_STATE_ERROR;
+
+	if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
+		siw_sq_flush(qp);
+}
+
+/*
+ * siw_qp_nextstate_from_close()
+ *
+ * State change requested while the QP is in CLOSING.
+ *
+ * return value:
+ *	0: transition to IDLE, CLOSING or ERROR handled
+ *	-ECONNABORTED: requested transition is undefined
+ */
+static int siw_qp_nextstate_from_close(struct siw_qp *qp,
+				       struct siw_qp_attrs *attrs)
+{
+	switch (attrs->state) {
+	case SIW_QP_STATE_IDLE:
+		WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
+		qp->attrs.state = SIW_QP_STATE_IDLE;
+		return 0;
+
+	case SIW_QP_STATE_CLOSING:
+		/*
+		 * The LLP may have already moved the QP to CLOSING
+		 * due to a graceful close initiated by the peer.
+		 */
+		return 0;
+
+	case SIW_QP_STATE_ERROR:
+		/*
+		 * QP was moved to CLOSING by an LLP event
+		 * not yet seen by the user.
+		 */
+		qp->attrs.state = SIW_QP_STATE_ERROR;
+
+		if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
+			siw_sq_flush(qp);
+
+		siw_rq_flush(qp);
+		return 0;
+
+	default:
+		siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
+			   siw_qp_state_to_string[qp->attrs.state],
+			   siw_qp_state_to_string[attrs->state]);
+		return -ECONNABORTED;
+	}
+}
+
+/*
+ * siw_qp_modify()
+ *
+ * Apply the attribute changes selected by @mask to the QP. Non-state
+ * attributes are applied first; a state change, if requested, is then
+ * dispatched on the current QP state.
+ *
+ * Caller must hold qp->state_lock
+ *
+ * return value:
+ *	0: success, or nothing to do
+ *	otherwise: failure code of the transition out of IDLE/RTR
+ */
+int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
+		  enum siw_qp_attr_mask mask)
+{
+	int rv = 0;
+
+	if (!mask)
+		return 0;
+
+	siw_dbg_qp(qp, "state: %s => %s\n",
+		   siw_qp_state_to_string[qp->attrs.state],
+		   siw_qp_state_to_string[attrs->state]);
+
+	if (mask != SIW_QP_ATTR_STATE)
+		siw_qp_modify_nonstate(qp, attrs, mask);
+
+	if (!(mask & SIW_QP_ATTR_STATE))
+		return 0;
+
+	switch (qp->attrs.state) {
+	case SIW_QP_STATE_IDLE:
+	case SIW_QP_STATE_RTR:
+		rv = siw_qp_nextstate_from_idle(qp, attrs, mask);
+		break;
+
+	case SIW_QP_STATE_RTS:
+		/* leaving RTS is the only path which may drop the connection */
+		if (siw_qp_nextstate_from_rts(qp, attrs))
+			siw_qp_cm_drop(qp, 0);
+		break;
+
+	case SIW_QP_STATE_TERMINATE:
+		siw_qp_nextstate_from_term(qp, attrs);
+		break;
+
+	case SIW_QP_STATE_CLOSING:
+		/* return value is not propagated to the caller */
+		siw_qp_nextstate_from_close(qp, attrs);
+		break;
+
+	default:
+		break;
+	}
+	return rv;
+}
+
+/*
+ * siw_read_to_orq()
+ *
+ * Set up ORQ element @rreq from READ work request @sqe: id, opcode,
+ * the first SGE and the lkey of the second SGE are taken over. The
+ * resulting element carries one SGE and is flagged valid.
+ */
+void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
+{
+	rreq->sge[0].lkey = sqe->sge[0].lkey;
+	rreq->sge[0].laddr = sqe->sge[0].laddr;
+	rreq->sge[0].length = sqe->sge[0].length;
+	rreq->sge[1].lkey = sqe->sge[1].lkey;
+	rreq->num_sge = 1;
+
+	rreq->opcode = sqe->opcode;
+	rreq->id = sqe->id;
+	rreq->flags = sqe->flags | SIW_WQE_VALID;
+}
+
+/*
+ * siw_activate_tx()
+ *
+ * Select the next work item for transmit processing and set it up as
+ * the current TX WQE of the QP: either a pending inbound READ request
+ * taken from the IRQ (to be answered by a READ RESPONSE), or the next
+ * element of the SQ.
+ *
+ * Must be called with SQ locked.
+ * To avoid complete SQ starvation by constant inbound READ requests,
+ * the IRQ is skipped once SIW_IRQ_MAXBURST_SQ_ACTIVE IRQ elements
+ * were counted in qp->irq_burst while the SQ had pending work.
+ *
+ * A QP with a zero-sized IRQ has no IRQ element to look at. In that
+ * case only the SQ is served; indexing the IRQ would divide by zero.
+ *
+ * return value:
+ *	1: a WQE got activated
+ *	0: nothing to send, or the activated WQE has to wait for the
+ *	   ORQ (fence set)
+ *	-EINVAL: malformed SQE, current TX WQE is reset to idle
+ */
+int siw_activate_tx(struct siw_qp *qp)
+{
+	struct siw_sqe *irqe = NULL, *sqe;
+	struct siw_wqe *wqe = tx_wqe(qp);
+	int rv = 1;
+
+	if (qp->attrs.irq_size)
+		irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
+
+	if (irqe && (irqe->flags & SIW_WQE_VALID)) {
+		sqe = sq_get_next(qp);
+
+		/*
+		 * Avoid local WQE processing starvation in case
+		 * of constant inbound READ request stream
+		 */
+		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
+			qp->irq_burst = 0;
+			goto skip_irq;
+		}
+		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+		wqe->wr_status = SIW_WR_QUEUED;
+
+		/* start READ RESPONSE */
+		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
+		wqe->sqe.flags = 0;
+		if (irqe->num_sge) {
+			wqe->sqe.num_sge = 1;
+			wqe->sqe.sge[0].length = irqe->sge[0].length;
+			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
+			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
+		} else {
+			wqe->sqe.num_sge = 0;
+		}
+
+		/* Retain original RREQ's message sequence number for
+		 * potential error reporting cases.
+		 */
+		wqe->sqe.sge[1].length = irqe->sge[1].length;
+
+		wqe->sqe.rkey = irqe->rkey;
+		wqe->sqe.raddr = irqe->raddr;
+
+		wqe->processed = 0;
+		qp->irq_get++;
+
+		/* mark current IRQ entry free */
+		smp_store_mb(irqe->flags, 0);
+
+		goto out;
+	}
+	sqe = sq_get_next(qp);
+	if (sqe) {
+skip_irq:
+		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
+		wqe->wr_status = SIW_WR_QUEUED;
+
+		/* First copy SQE to kernel private memory */
+		memcpy(&wqe->sqe, sqe, sizeof(*sqe));
+
+		if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
+			rv = -EINVAL;
+			goto out;
+		}
+		if (wqe->sqe.flags & SIW_WQE_INLINE) {
+			/* only SEND and WRITE may carry inline data */
+			if (wqe->sqe.opcode != SIW_OP_SEND &&
+			    wqe->sqe.opcode != SIW_OP_WRITE) {
+				rv = -EINVAL;
+				goto out;
+			}
+			if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
+				rv = -EINVAL;
+				goto out;
+			}
+			/* inline data starts at the second SGE slot */
+			wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1];
+			wqe->sqe.sge[0].lkey = 0;
+			wqe->sqe.num_sge = 1;
+		}
+		if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
+			/* A READ cannot be fenced */
+			if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
+				     wqe->sqe.opcode ==
+					     SIW_OP_READ_LOCAL_INV)) {
+				siw_dbg_qp(qp, "cannot fence read\n");
+				rv = -EINVAL;
+				goto out;
+			}
+			spin_lock(&qp->orq_lock);
+
+			/* fenced WQE must wait until the ORQ has drained */
+			if (!siw_orq_empty(qp)) {
+				qp->tx_ctx.orq_fence = 1;
+				rv = 0;
+			}
+			spin_unlock(&qp->orq_lock);
+
+		} else if (wqe->sqe.opcode == SIW_OP_READ ||
+			   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
+			struct siw_sqe *rreq;
+
+			wqe->sqe.num_sge = 1;
+
+			spin_lock(&qp->orq_lock);
+
+			rreq = orq_get_free(qp);
+			if (rreq) {
+				/*
+				 * Make an immediate copy in ORQ to be ready
+				 * to process loopback READ reply
+				 */
+				siw_read_to_orq(rreq, &wqe->sqe);
+				qp->orq_put++;
+			} else {
+				/* no free ORQ element: READ has to wait */
+				qp->tx_ctx.orq_fence = 1;
+				rv = 0;
+			}
+			spin_unlock(&qp->orq_lock);
+		}
+
+		/* Clear SQE, can be re-used by application */
+		smp_store_mb(sqe->flags, 0);
+		qp->sq_get++;
+	} else {
+		rv = 0;
+	}
+out:
+	if (unlikely(rv < 0)) {
+		siw_dbg_qp(qp, "error %d\n", rv);
+		wqe->wr_status = SIW_WR_IDLE;
+	}
+	return rv;
+}
+
+/*
+ * siw_cq_notify_now()
+ *
+ * Decide whether the completion handler of the CQ is to be called for
+ * a completion carrying @flags. That is the case if the CQ is armed
+ * for the next completion, or armed for solicited completions and
+ * @flags has SIW_WQE_SOLICITED set. A CQ which qualifies gets
+ * dis-armed. Without a completion handler the answer is always no.
+ *
+ * Must be called with CQ lock held.
+ */
+static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
+{
+	u64 armed;
+	bool fire;
+
+	if (!cq->base_cq.comp_handler)
+		return false;
+
+	armed = READ_ONCE(*cq->notify);
+
+	fire = (armed & SIW_NOTIFY_NEXT_COMPLETION) ||
+	       ((armed & SIW_NOTIFY_SOLICITED) &&
+		(flags & SIW_WQE_SOLICITED));
+	if (!fire)
+		return false;
+
+	/* dis-arm CQ */
+	smp_store_mb(*cq->notify, SIW_NOTIFY_NOT);
+
+	return true;
+}
+
+/*
+ * siw_sqe_complete()
+ *
+ * Report completion of send work request @sqe with @status and
+ * @bytes to the send CQ of the QP, and recycle the SQE. Calls the
+ * completion handler if the CQ state asks for it. Without a send CQ
+ * the SQE is just recycled.
+ *
+ * return value:
+ *	0: success
+ *	-ENOMEM: next CQ element still in use; a CQ error event was
+ *		 generated and the SQE is left untouched
+ */
+int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
+		     enum siw_wc_status status)
+{
+	struct siw_cq *cq = qp->scq;
+	struct siw_cqe *cqe;
+	unsigned long irq_flags;
+	u32 sqe_flags, idx;
+	bool notify;
+
+	if (!cq) {
+		/* recycle SQE */
+		smp_store_mb(sqe->flags, 0);
+		return 0;
+	}
+	/* sample SQE flags before the SQE gets recycled below */
+	sqe_flags = sqe->flags;
+
+	spin_lock_irqsave(&cq->lock, irq_flags);
+
+	idx = cq->cq_put % cq->num_cqe;
+	cqe = &cq->queue[idx];
+
+	if (READ_ONCE(cqe->flags)) {
+		spin_unlock_irqrestore(&cq->lock, irq_flags);
+		siw_cq_event(cq, IB_EVENT_CQ_ERR);
+		return -ENOMEM;
+	}
+	cqe->id = sqe->id;
+	cqe->opcode = sqe->opcode;
+	cqe->status = status;
+	cqe->imm_data = 0;
+	cqe->bytes = bytes;
+
+	if (cq->kernel_verbs)
+		cqe->base_qp = qp->ib_qp;
+	else
+		cqe->qp_id = qp_id(qp);
+
+	/* mark CQE valid for application */
+	WRITE_ONCE(cqe->flags, SIW_WQE_VALID);
+	/* recycle SQE */
+	smp_store_mb(sqe->flags, 0);
+
+	cq->cq_put++;
+	notify = siw_cq_notify_now(cq, sqe_flags);
+
+	spin_unlock_irqrestore(&cq->lock, irq_flags);
+
+	if (notify) {
+		siw_dbg_cq(cq, "Call completion handler\n");
+		cq->base_cq.comp_handler(&cq->base_cq,
+					 cq->base_cq.cq_context);
+	}
+	return 0;
+}
+
+/*
+ * siw_rqe_complete()
+ *
+ * Report completion of receive work request @rqe with @status and
+ * @bytes to the receive CQ of the QP, and recycle the RQE. For a
+ * kernel verbs CQ, a non-zero @inval_stag is reported with the
+ * completion. Calls the completion handler if the CQ state asks for
+ * it. Without a receive CQ the RQE is just recycled.
+ *
+ * return value:
+ *	0: success
+ *	-ENOMEM: next CQ element still in use; a CQ error event was
+ *		 generated and the RQE is left untouched
+ */
+int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
+		     u32 inval_stag, enum siw_wc_status status)
+{
+	struct siw_cq *cq = qp->rcq;
+	struct siw_cqe *cqe;
+	unsigned long irq_flags;
+	u8 cqe_flags = SIW_WQE_VALID;
+	u32 idx;
+	bool notify;
+
+	if (!cq) {
+		/* recycle RQE */
+		smp_store_mb(rqe->flags, 0);
+		return 0;
+	}
+	spin_lock_irqsave(&cq->lock, irq_flags);
+
+	idx = cq->cq_put % cq->num_cqe;
+	cqe = &cq->queue[idx];
+
+	if (READ_ONCE(cqe->flags)) {
+		spin_unlock_irqrestore(&cq->lock, irq_flags);
+		siw_cq_event(cq, IB_EVENT_CQ_ERR);
+		return -ENOMEM;
+	}
+	cqe->id = rqe->id;
+	cqe->opcode = SIW_OP_RECEIVE;
+	cqe->status = status;
+	cqe->imm_data = 0;
+	cqe->bytes = bytes;
+
+	if (!cq->kernel_verbs) {
+		cqe->qp_id = qp_id(qp);
+	} else {
+		cqe->base_qp = qp->ib_qp;
+		if (inval_stag) {
+			cqe_flags |= SIW_WQE_REM_INVAL;
+			cqe->inval_stag = inval_stag;
+		}
+	}
+	/* mark CQE valid for application */
+	WRITE_ONCE(cqe->flags, cqe_flags);
+	/* recycle RQE */
+	smp_store_mb(rqe->flags, 0);
+
+	cq->cq_put++;
+	notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED);
+
+	spin_unlock_irqrestore(&cq->lock, irq_flags);
+
+	if (notify) {
+		siw_dbg_cq(cq, "Call completion handler\n");
+		cq->base_cq.comp_handler(&cq->base_cq,
+					 cq->base_cq.cq_context);
+	}
+	return 0;
+}
+
+/*
+ * siw_sq_flush()
+ *
+ * Flush SQ and ORQ entries to CQ: all valid ORQ elements, then the
+ * current TX WQE if it is not idle, then all valid SQ elements are
+ * completed with status SIW_WC_WR_FLUSH_ERR. If at least one SQ
+ * element was found, an IB_EVENT_SQ_DRAINED event is generated.
+ *
+ * Must be called with QP state write lock held.
+ * Therefore, SQ and ORQ lock must not be taken.
+ */
+void siw_sq_flush(struct siw_qp *qp)
+{
+ struct siw_sqe *sqe;
+ struct siw_wqe *wqe = tx_wqe(qp);
+ int async_event = 0;
+
+ /*
+ * Start with completing any work currently on the ORQ.
+ * The loop is not entered for a zero-sized ORQ, and ends at the
+ * first invalid element or if a completion cannot be generated.
+ */
+ while (qp->attrs.orq_size) {
+ sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size];
+ if (!READ_ONCE(sqe->flags))
+ break;
+
+ if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+ break;
+
+ WRITE_ONCE(sqe->flags, 0);
+ qp->orq_get++;
+ }
+ /*
+ * Flush an in-progress WQE if present
+ */
+ if (wqe->wr_status != SIW_WR_IDLE) {
+ siw_dbg_qp(qp, "flush current SQE, type %d, status %d\n",
+ tx_type(wqe), wqe->wr_status);
+
+ siw_wqe_put_mem(wqe, tx_type(wqe));
+
+ /*
+ * No completion for a READ RESPONSE. A READ or
+ * READ_LOCAL_INV is completed here only while still queued.
+ */
+ if (tx_type(wqe) != SIW_OP_READ_RESPONSE &&
+ ((tx_type(wqe) != SIW_OP_READ &&
+ tx_type(wqe) != SIW_OP_READ_LOCAL_INV) ||
+ wqe->wr_status == SIW_WR_QUEUED))
+ /*
+ * An in-progress Read Request is already in
+ * the ORQ
+ */
+ siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+ SIW_WC_WR_FLUSH_ERR);
+
+ wqe->wr_status = SIW_WR_IDLE;
+ }
+ /*
+ * Flush the Send Queue
+ */
+ while (qp->attrs.sq_size) {
+ sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
+ if (!READ_ONCE(sqe->flags))
+ break;
+
+ async_event = 1;
+ if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+ /*
+ * Shall IB_EVENT_SQ_DRAINED be suppressed if work
+ * completion fails?
+ */
+ break;
+
+ WRITE_ONCE(sqe->flags, 0);
+ qp->sq_get++;
+ }
+ if (async_event)
+ siw_qp_event(qp, IB_EVENT_SQ_DRAINED);
+}
+
+/*
+ * siw_rq_flush()
+ *
+ * Flush receive queue entries to CQ. Before that, pending untagged
+ * and tagged inbound operations are cancelled and the target memory
+ * they reference is released.
+ *
+ * Must be called with QP state write lock held.
+ * Therefore, RQ lock must not be taken.
+ */
+void siw_rq_flush(struct siw_qp *qp)
+{
+	struct siw_wqe *untagged = &qp->rx_untagged.wqe_active;
+	struct siw_wqe *tagged = &qp->rx_tagged.wqe_active;
+	struct siw_rqe *rqe;
+
+	/* Flush an in-progress untagged operation if present */
+	if (untagged->wr_status != SIW_WR_IDLE) {
+		siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n",
+			   rx_type(untagged), untagged->wr_status);
+
+		siw_wqe_put_mem(untagged, rx_type(untagged));
+
+		if (rx_type(untagged) == SIW_OP_RECEIVE) {
+			siw_rqe_complete(qp, &untagged->rqe, untagged->bytes,
+					 0, SIW_WC_WR_FLUSH_ERR);
+		} else if (rx_type(untagged) != SIW_OP_READ &&
+			   rx_type(untagged) != SIW_OP_READ_RESPONSE &&
+			   rx_type(untagged) != SIW_OP_WRITE) {
+			siw_sqe_complete(qp, &untagged->sqe, 0,
+					 SIW_WC_WR_FLUSH_ERR);
+		}
+		untagged->wr_status = SIW_WR_IDLE;
+	}
+	/* An in-progress tagged operation does not get a completion */
+	if (tagged->wr_status != SIW_WR_IDLE) {
+		siw_wqe_put_mem(tagged, rx_type(tagged));
+		tagged->wr_status = SIW_WR_IDLE;
+	}
+	/*
+	 * Flush the Receive Queue, up to the first invalid element or
+	 * until a completion cannot be generated.
+	 */
+	while (qp->attrs.rq_size) {
+		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
+
+		if (!READ_ONCE(rqe->flags))
+			break;
+
+		if (siw_rqe_complete(qp, rqe, 0, 0, SIW_WC_WR_FLUSH_ERR) != 0)
+			break;
+
+		WRITE_ONCE(rqe->flags, 0);
+		qp->rq_get++;
+	}
+}
+
+/*
+ * siw_qp_add()
+ *
+ * Insert the QP into the QP xarray of the device, which assigns the
+ * QP number. On success, the QP reference count is initialized and
+ * the QP is bound to @sdev.
+ *
+ * return value:
+ *	0: success
+ *	otherwise: failure code of xa_alloc()
+ */
+int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp)
+{
+	int rv;
+
+	rv = xa_alloc(&sdev->qp_xa, &qp->ib_qp->qp_num, qp, xa_limit_32b,
+		      GFP_KERNEL);
+	if (rv)
+		return rv;
+
+	kref_init(&qp->ref);
+	qp->sdev = sdev;
+	qp->qp_num = qp->ib_qp->qp_num;
+	siw_dbg_qp(qp, "new QP\n");
+
+	return 0;
+}
+
+/*
+ * siw_free_qp()
+ *
+ * kref release function of a QP: @ref is the reference counter
+ * embedded in the QP. Drops the reference to the connection endpoint
+ * if one is still attached, removes the QP from the QP xarray and the
+ * QP list of the device, frees all work queues and finally the QP
+ * itself.
+ */
+void siw_free_qp(struct kref *ref)
+{
+ struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref);
+ struct siw_device *sdev = qp->sdev;
+ unsigned long flags;
+
+ if (qp->cep)
+ siw_cep_put(qp->cep);
+
+ /* the xarray entry stored under the QP id must be this very QP */
+ found = xa_erase(&sdev->qp_xa, qp_id(qp));
+ WARN_ON(found != qp);
+ spin_lock_irqsave(&sdev->lock, flags);
+ list_del(&qp->devq);
+ spin_unlock_irqrestore(&sdev->lock, flags);
+
+ vfree(qp->sendq);
+ vfree(qp->recvq);
+ vfree(qp->irq);
+ vfree(qp->orq);
+
+ siw_put_tx_cpu(qp->tx_cpu);
+
+ atomic_dec(&sdev->num_qp);
+ siw_dbg_qp(qp, "free QP\n");
+ /* QP memory itself is released via RCU */
+ kfree_rcu(qp, rcu);
+}
diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c
new file mode 100644
index 000000000000..f87657a11657
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_qp_rx.c
@@ -0,0 +1,1458 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+/*
+ * siw_rx_umem()
+ *
+ * Receive data of @len into target referenced by @dest_addr.
+ * Data is copied page by page from the current skb position. If
+ * MPA CRC is enabled, the CRC is updated along with each copy.
+ *
+ * @srx: Receive Context
+ * @umem: siw representation of target memory
+ * @dest_addr: user virtual address
+ * @len: number of bytes to place
+ *
+ * Returns the number of bytes placed, or -EFAULT if a target page
+ * cannot be resolved or copying from the skb fails. In the failure
+ * case, the skb accounting reflects what was placed so far.
+ */
+static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
+ u64 dest_addr, int len)
+{
+ int copied = 0;
+
+ while (len) {
+ struct page *p;
+ int pg_off, bytes, rv;
+ void *dest;
+
+ p = siw_get_upage(umem, dest_addr);
+ if (unlikely(!p)) {
+ pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n",
+ __func__, qp_id(rx_qp(srx)),
+ (void *)dest_addr, (void *)umem->fp_addr);
+ /* siw internal error */
+ srx->skb_copied += copied;
+ srx->skb_new -= copied;
+
+ return -EFAULT;
+ }
+ /* never copy beyond the end of the current page */
+ pg_off = dest_addr & ~PAGE_MASK;
+ bytes = min(len, (int)PAGE_SIZE - pg_off);
+
+ siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes);
+
+ dest = kmap_atomic(p);
+ rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
+ bytes);
+
+ if (unlikely(rv)) {
+ kunmap_atomic(dest);
+ srx->skb_copied += copied;
+ srx->skb_new -= copied;
+
+ pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
+ qp_id(rx_qp(srx)), __func__, len, p, rv);
+
+ return -EFAULT;
+ }
+ if (srx->mpa_crc_hd) {
+ if (rx_qp(srx)->kernel_verbs) {
+ /* CRC over the target buffer while it is still mapped */
+ crypto_shash_update(srx->mpa_crc_hd,
+ (u8 *)(dest + pg_off), bytes);
+ kunmap_atomic(dest);
+ } else {
+ kunmap_atomic(dest);
+ /*
+ * Do CRC on original, not target buffer.
+ * Some user land applications may
+ * concurrently write the target buffer,
+ * which would yield a broken CRC.
+ * Walking the skb twice is very inefficient.
+ * Folding the CRC into skb_copy_bits()
+ * would be much better, but is currently
+ * not supported.
+ */
+ siw_crc_skb(srx, bytes);
+ }
+ } else {
+ kunmap_atomic(dest);
+ }
+ srx->skb_offset += bytes;
+ copied += bytes;
+ len -= bytes;
+ dest_addr += bytes;
+ pg_off = 0;
+ }
+ srx->skb_copied += copied;
+ srx->skb_new -= copied;
+
+ return copied;
+}
+
+/*
+ * siw_rx_kva()
+ *
+ * Receive data of @len from the current skb position into the buffer
+ * at kernel virtual address @kva. Updates the MPA CRC if enabled.
+ *
+ * Returns @len on success, or the failure code of skb_copy_bits().
+ * Receive stream accounting is advanced on success only.
+ */
+static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
+{
+	int err;
+
+	siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len);
+
+	err = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
+	if (unlikely(err)) {
+		pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n",
+			qp_id(rx_qp(srx)), __func__, len, kva, err);
+
+		return err;
+	}
+	if (srx->mpa_crc_hd)
+		crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
+
+	srx->skb_new -= len;
+	srx->skb_copied += len;
+	srx->skb_offset += len;
+
+	return len;
+}
+
+/*
+ * siw_rx_pbl()
+ *
+ * Receive data of @len into memory described by the physical buffer
+ * list of @mem, starting at address @addr. Data is placed buffer by
+ * buffer; @pbl_idx is handed to the buffer lookup for each step.
+ *
+ * Returns the number of bytes placed. This is less than @len if a
+ * buffer cannot be resolved or placing data into a buffer fails.
+ */
+static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
+		      struct siw_mem *mem, u64 addr, int len)
+{
+	struct siw_pbl *pbl = mem->pbl;
+	u64 offset = addr - mem->va;
+	int copied = 0;
+
+	while (len) {
+		int chunk;
+		u64 buf = siw_pbl_get_buffer(pbl, offset, &chunk, pbl_idx);
+
+		if (!buf)
+			break;
+
+		/* do not place more than requested */
+		if (chunk > len)
+			chunk = len;
+
+		if (siw_rx_kva(srx, (void *)buf, chunk) != chunk)
+			break;
+
+		len -= chunk;
+		offset += chunk;
+		copied += chunk;
+	}
+	return copied;
+}
+
+/*
+ * siw_rresp_check_ntoh()
+ *
+ * Check incoming RRESP fragment header against expected
+ * header values. For the first DDP segment, the expected values are
+ * taken from the READ work request the response belongs to.
+ *
+ * NOTE: This function must be called only if a RRESP DDP segment
+ * starts but not for fragmented consecutive pieces of an
+ * already started DDP segment.
+ *
+ * return value:
+ *	0: header matches
+ *	-EINVAL: mismatch, TERMINATE information got initialized
+ */
+static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
+				struct siw_rx_fpdu *frx)
+{
+	struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
+	struct siw_wqe *wqe = &frx->wqe_active;
+	enum ddp_ecode ecode;
+
+	u32 sink_stag = be32_to_cpu(rresp->sink_stag);
+	u64 sink_to = be64_to_cpu(rresp->sink_to);
+
+	if (frx->first_ddp_seg) {
+		srx->ddp_stag = wqe->sqe.sge[0].lkey;
+		srx->ddp_to = wqe->sqe.sge[0].laddr;
+		frx->pbl_idx = 0;
+	}
+	/* Below checks extend beyond the semantics of DDP, and
+	 * into RDMAP:
+	 * We check if the read response matches exactly the
+	 * read request which was sent to the remote peer to
+	 * trigger this read response. RFC5040/5041 do not
+	 * always have a proper error code for the detected
+	 * error cases. We choose 'base or bounds error' for
+	 * cases where the inbound STag is valid, but offset
+	 * or length do not match our response receive state.
+	 */
+	if (unlikely(srx->ddp_stag != sink_stag)) {
+		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
+			qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
+		ecode = DDP_ECODE_T_INVALID_STAG;
+	} else if (unlikely(srx->ddp_to != sink_to)) {
+		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
+			qp_id(rx_qp(srx)), (unsigned long long)sink_to,
+			(unsigned long long)srx->ddp_to);
+		ecode = DDP_ECODE_T_BASE_BOUNDS;
+	} else if (unlikely(!frx->more_ddp_segs &&
+			    (wqe->processed + srx->fpdu_part_rem !=
+			     wqe->bytes))) {
+		/* last segment must complete the requested length */
+		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
+			qp_id(rx_qp(srx)),
+			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
+		ecode = DDP_ECODE_T_BASE_BOUNDS;
+	} else {
+		return 0;
+	}
+	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
+	return -EINVAL;
+}
+
+/*
+ * siw_write_check_ntoh()
+ *
+ * Check incoming WRITE fragment header against expected
+ * header values. The first DDP segment defines the expected STag
+ * and target offset; any later segment must match them.
+ *
+ * NOTE: This function must be called only if a WRITE DDP segment
+ * starts but not for fragmented consecutive pieces of an
+ * already started DDP segment.
+ *
+ * return value:
+ *	0: header accepted
+ *	-EINVAL: mismatch, TERMINATE information got initialized
+ */
+static int siw_write_check_ntoh(struct siw_rx_stream *srx,
+				struct siw_rx_fpdu *frx)
+{
+	struct iwarp_rdma_write *write = &srx->hdr.rwrite;
+	enum ddp_ecode ecode;
+
+	u32 sink_stag = be32_to_cpu(write->sink_stag);
+	u64 sink_to = be64_to_cpu(write->sink_to);
+
+	if (frx->first_ddp_seg) {
+		srx->ddp_stag = sink_stag;
+		srx->ddp_to = sink_to;
+		frx->pbl_idx = 0;
+		return 0;
+	}
+	if (unlikely(srx->ddp_stag != sink_stag)) {
+		pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
+			qp_id(rx_qp(srx)), sink_stag,
+			srx->ddp_stag);
+		ecode = DDP_ECODE_T_INVALID_STAG;
+	} else if (unlikely(srx->ddp_to != sink_to)) {
+		pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
+			qp_id(rx_qp(srx)),
+			(unsigned long long)sink_to,
+			(unsigned long long)srx->ddp_to);
+		ecode = DDP_ECODE_T_BASE_BOUNDS;
+	} else {
+		return 0;
+	}
+	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
+	return -EINVAL;
+}
+
+/*
+ * siw_send_check_ntoh()
+ *
+ * Check incoming SEND fragment header against expected
+ * header values: queue number, message sequence number and message
+ * offset. For the first DDP segment, the write position into
+ * receive memory is initialized. Finally, it is checked whether the
+ * receive WQE provides enough space for the incoming data.
+ *
+ * NOTE: This function must be called only if a SEND DDP segment
+ * starts but not for fragmented consecutive pieces of an
+ * already started DDP segment.
+ *
+ * Returns 0 if the header is accepted. Otherwise, TERMINATE
+ * information is initialized and -EINVAL is returned.
+ */
+static int siw_send_check_ntoh(struct siw_rx_stream *srx,
+ struct siw_rx_fpdu *frx)
+{
+ struct iwarp_send_inv *send = &srx->hdr.send_inv;
+ struct siw_wqe *wqe = &frx->wqe_active;
+ enum ddp_ecode ecode;
+
+ u32 ddp_msn = be32_to_cpu(send->ddp_msn);
+ u32 ddp_mo = be32_to_cpu(send->ddp_mo);
+ u32 ddp_qn = be32_to_cpu(send->ddp_qn);
+
+ if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
+ pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
+ qp_id(rx_qp(srx)), ddp_qn);
+ ecode = DDP_ECODE_UT_INVALID_QN;
+ goto error;
+ }
+ if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
+ pr_warn("siw: [QP %u]: send msn: %u != %u\n",
+ qp_id(rx_qp(srx)), ddp_msn,
+ srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+ ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
+ goto error;
+ }
+ /* message offset must match what was placed for this message so far */
+ if (unlikely(ddp_mo != wqe->processed)) {
+ pr_warn("siw: [QP %u], send mo: %u != %u\n",
+ qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
+ ecode = DDP_ECODE_UT_INVALID_MO;
+ goto error;
+ }
+ if (frx->first_ddp_seg) {
+ /* initialize user memory write position */
+ frx->sge_idx = 0;
+ frx->sge_off = 0;
+ frx->pbl_idx = 0;
+
+ /* only valid for SEND_INV and SEND_SE_INV operations */
+ srx->inval_stag = be32_to_cpu(send->inval_stag);
+ }
+ /* remaining receive buffer space must take all data of this segment */
+ if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
+ siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
+ wqe->bytes, wqe->processed, srx->fpdu_part_rem);
+ wqe->wc_status = SIW_WC_LOC_LEN_ERR;
+ ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
+ goto error;
+ }
+ return 0;
+error:
+ siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
+ return -EINVAL;
+}
+
+/*
+ * siw_rqe_get()
+ *
+ * Fetch the next receive work request, from the SRQ if the QP is
+ * attached to one, from the RQ of the QP otherwise, and set it up
+ * as the current untagged receive WQE. The SRQ lock is held while
+ * the SRQ is accessed. If the SRQ is armed and the element at the
+ * limit offset is found not valid, an IB_EVENT_SRQ_LIMIT_REACHED
+ * event is generated after dropping the lock.
+ *
+ * Returns the receive WQE, or NULL if no valid receive work request
+ * is available or the one found has more than SIW_MAX_SGE SGEs.
+ */
+static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
+{
+ struct siw_rqe *rqe;
+ struct siw_srq *srq;
+ struct siw_wqe *wqe = NULL;
+ bool srq_event = false;
+ unsigned long flags;
+
+ srq = qp->srq;
+ if (srq) {
+ spin_lock_irqsave(&srq->lock, flags);
+ if (unlikely(!srq->num_rqe))
+ goto out;
+
+ rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
+ } else {
+ if (unlikely(!qp->recvq))
+ goto out;
+
+ rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
+ }
+ if (likely(rqe->flags == SIW_WQE_VALID)) {
+ int num_sge = rqe->num_sge;
+
+ if (likely(num_sge <= SIW_MAX_SGE)) {
+ int i = 0;
+
+ wqe = rx_wqe(&qp->rx_untagged);
+ rx_type(wqe) = SIW_OP_RECEIVE;
+ wqe->wr_status = SIW_WR_INPROGRESS;
+ wqe->bytes = 0;
+ wqe->processed = 0;
+
+ wqe->rqe.id = rqe->id;
+ wqe->rqe.num_sge = num_sge;
+
+ /* copy SGEs, wqe->bytes sums up the receive buffer space */
+ while (i < num_sge) {
+ wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
+ wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
+ wqe->rqe.sge[i].length = rqe->sge[i].length;
+ wqe->bytes += wqe->rqe.sge[i].length;
+ wqe->mem[i] = NULL;
+ i++;
+ }
+ /* can be re-used by appl */
+ smp_store_mb(rqe->flags, 0);
+ } else {
+ /* malformed RQE is left in place, nothing gets consumed */
+ siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
+ if (srq)
+ spin_unlock_irqrestore(&srq->lock, flags);
+ return NULL;
+ }
+ if (!srq) {
+ qp->rq_get++;
+ } else {
+ if (srq->armed) {
+ /* Test SRQ limit */
+ u32 off = (srq->rq_get + srq->limit) %
+ srq->num_rqe;
+ struct siw_rqe *rqe2 = &srq->recvq[off];
+
+ if (!(rqe2->flags & SIW_WQE_VALID)) {
+ srq->armed = 0;
+ srq_event = true;
+ }
+ }
+ srq->rq_get++;
+ }
+ }
+out:
+ if (srq) {
+ spin_unlock_irqrestore(&srq->lock, flags);
+ /* event is reported with the SRQ lock released */
+ if (srq_event)
+ siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
+ }
+ return wqe;
+}
+
+/*
+ * siw_proc_send:
+ *
+ * Process one incoming SEND and place data into memory referenced by
+ * receive wqe.
+ *
+ * Function supports partially received sends (suspending/resuming
+ * current receive wqe processing)
+ *
+ * return value:
+ * 0: reached the end of a DDP segment
+ * -EAGAIN: to be called again to finish the DDP segment
+ * -ENOENT: no receive work request available
+ * -EINVAL: data placement failed
+ * otherwise: failure code of header or SGE checking
+ */
+int siw_proc_send(struct siw_qp *qp)
+{
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ struct siw_rx_fpdu *frx = &qp->rx_untagged;
+ struct siw_wqe *wqe;
+ u32 data_bytes; /* all data bytes available */
+ u32 rcvd_bytes; /* sum of data bytes rcvd */
+ int rv = 0;
+
+ /* a new message needs a fresh receive WQE, otherwise resume current one */
+ if (frx->first_ddp_seg) {
+ wqe = siw_rqe_get(qp);
+ if (unlikely(!wqe)) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_UNTAGGED_BUF,
+ DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
+ return -ENOENT;
+ }
+ } else {
+ wqe = rx_wqe(frx);
+ }
+ if (srx->state == SIW_GET_DATA_START) {
+ rv = siw_send_check_ntoh(srx, frx);
+ if (unlikely(rv)) {
+ siw_qp_event(qp, IB_EVENT_QP_FATAL);
+ return rv;
+ }
+ if (!srx->fpdu_part_rem) /* zero length SEND */
+ return 0;
+ }
+ /* place what is both outstanding for this FPDU and present in the skb */
+ data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
+ rcvd_bytes = 0;
+
+ /* A zero length SEND will skip below loop */
+ while (data_bytes) {
+ struct ib_pd *pd;
+ struct siw_mem **mem, *mem_p;
+ struct siw_sge *sge;
+ u32 sge_bytes; /* data bytes avail for SGE */
+
+ sge = &wqe->rqe.sge[frx->sge_idx];
+
+ if (!sge->length) {
+ /* just skip empty sge's */
+ frx->sge_idx++;
+ frx->sge_off = 0;
+ frx->pbl_idx = 0;
+ continue;
+ }
+ sge_bytes = min(data_bytes, sge->length - frx->sge_off);
+ mem = &wqe->mem[frx->sge_idx];
+
+ /*
+ * check with QP's PD if no SRQ present, SRQ's PD otherwise
+ */
+ pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
+
+ rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
+ frx->sge_off, sge_bytes);
+ if (unlikely(rv)) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_CATASTROPHIC,
+ DDP_ECODE_CATASTROPHIC, 0);
+
+ siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+ break;
+ }
+ /* select placement routine by type of the target memory */
+ mem_p = *mem;
+ if (mem_p->mem_obj == NULL)
+ rv = siw_rx_kva(srx,
+ (void *)(sge->laddr + frx->sge_off),
+ sge_bytes);
+ else if (!mem_p->is_pbl)
+ rv = siw_rx_umem(srx, mem_p->umem,
+ sge->laddr + frx->sge_off, sge_bytes);
+ else
+ rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
+ sge->laddr + frx->sge_off, sge_bytes);
+
+ if (unlikely(rv != sge_bytes)) {
+ wqe->processed += rcvd_bytes;
+
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_CATASTROPHIC,
+ DDP_ECODE_CATASTROPHIC, 0);
+ return -EINVAL;
+ }
+ frx->sge_off += rv;
+
+ /* current SGE filled up: continue with next one */
+ if (frx->sge_off == sge->length) {
+ frx->sge_idx++;
+ frx->sge_off = 0;
+ frx->pbl_idx = 0;
+ }
+ data_bytes -= rv;
+ rcvd_bytes += rv;
+
+ srx->fpdu_part_rem -= rv;
+ srx->fpdu_part_rcvd += rv;
+ }
+ wqe->processed += rcvd_bytes;
+
+ if (!srx->fpdu_part_rem)
+ return 0;
+
+ /* a failed SGE check ends the loop early with rv < 0 */
+ return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_proc_write:
+ *
+ * Place incoming WRITE after referencing and checking target buffer
+ *
+ * Function supports partially received WRITEs (suspending/resuming
+ * current receive processing)
+ *
+ * return value:
+ * 0: reached the end of a DDP segment
+ * -EAGAIN: to be called again to finish the DDP segment
+ * -EINVAL: STag or target memory check failed, or data
+ * placement failed
+ * otherwise: failure code of header checking
+ */
+int siw_proc_write(struct siw_qp *qp)
+{
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ struct siw_rx_fpdu *frx = &qp->rx_tagged;
+ struct siw_mem *mem;
+ int bytes, rv;
+
+ if (srx->state == SIW_GET_DATA_START) {
+ if (!srx->fpdu_part_rem) /* zero length WRITE */
+ return 0;
+
+ rv = siw_write_check_ntoh(srx, frx);
+ if (unlikely(rv)) {
+ siw_qp_event(qp, IB_EVENT_QP_FATAL);
+ return rv;
+ }
+ }
+ /* place what is both outstanding for this FPDU and present in the skb */
+ bytes = min(srx->fpdu_part_rem, srx->skb_new);
+
+ if (frx->first_ddp_seg) {
+ struct siw_wqe *wqe = rx_wqe(frx);
+
+ /* memory object lookup by STag without its lower 8 bits */
+ rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
+ if (unlikely(!rx_mem(frx))) {
+ siw_dbg_qp(qp,
+ "sink stag not found/invalid, stag 0x%08x\n",
+ srx->ddp_stag);
+
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_TAGGED_BUF,
+ DDP_ECODE_T_INVALID_STAG, 0);
+ return -EINVAL;
+ }
+ wqe->rqe.num_sge = 1;
+ rx_type(wqe) = SIW_OP_WRITE;
+ wqe->wr_status = SIW_WR_INPROGRESS;
+ }
+ mem = rx_mem(frx);
+
+ /*
+ * Check if application re-registered memory with different
+ * key field of STag.
+ */
+ if (unlikely(mem->stag != srx->ddp_stag)) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_TAGGED_BUF,
+ DDP_ECODE_T_INVALID_STAG, 0);
+ return -EINVAL;
+ }
+ rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
+ IB_ACCESS_REMOTE_WRITE, bytes);
+ if (unlikely(rv)) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
+ 0);
+
+ siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+
+ return -EINVAL;
+ }
+
+ /* select placement routine by type of the target memory */
+ if (mem->mem_obj == NULL)
+ rv = siw_rx_kva(srx,
+ (void *)(srx->ddp_to + srx->fpdu_part_rcvd),
+ bytes);
+ else if (!mem->is_pbl)
+ rv = siw_rx_umem(srx, mem->umem,
+ srx->ddp_to + srx->fpdu_part_rcvd, bytes);
+ else
+ rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
+ srx->ddp_to + srx->fpdu_part_rcvd, bytes);
+
+ if (unlikely(rv != bytes)) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_CATASTROPHIC,
+ DDP_ECODE_CATASTROPHIC, 0);
+ return -EINVAL;
+ }
+ srx->fpdu_part_rem -= rv;
+ srx->fpdu_part_rcvd += rv;
+
+ if (!srx->fpdu_part_rem) {
+ /* segment complete: advance expected offset for a next segment */
+ srx->ddp_to += srx->fpdu_part_rcvd;
+ return 0;
+ }
+ return -EAGAIN;
+}
+
+/*
+ * siw_proc_rreq:
+ *
+ * Inbound RREQ's cannot carry user data.
+ *
+ * return value:
+ *	0: RREQ without payload
+ *	-EPROTO: RREQ announces payload
+ */
+int siw_proc_rreq(struct siw_qp *qp)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+
+	if (srx->fpdu_part_rem) {
+		pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
+			be16_to_cpu(srx->hdr.ctrl.mpa_len));
+
+		return -EPROTO;
+	}
+	return 0;
+}
+
+/*
+ * siw_init_rresp:
+ *
+ * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
+ * Put it at the tail of the IRQ, if there is another WQE currently in
+ * transmit processing. If not, make it the current WQE to be processed
+ * and schedule transmit processing.
+ *
+ * A QP with a zero-sized IRQ cannot queue a READ REQ: the request is
+ * then handled like IRQ overflow. Neither the IRQ slot lookup nor the
+ * overflow warning touch the IRQ in that case, since computing an
+ * IRQ index would divide by zero.
+ *
+ * Can be called from softirq context and from process
+ * context (RREAD socket loopback case!)
+ *
+ * return value:
+ * 0: success,
+ * failure code otherwise
+ */
+
+static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
+{
+	struct siw_wqe *tx_work = tx_wqe(qp);
+	struct siw_sqe *resp;
+
+	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
+		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
+	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
+		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
+		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
+		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
+
+	int run_sq = 1, rv = 0;
+	unsigned long flags;
+
+	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
+		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+				   DDP_ETYPE_UNTAGGED_BUF,
+				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
+		return -EPROTO;
+	}
+	spin_lock_irqsave(&qp->sq_lock, flags);
+
+	if (tx_work->wr_status == SIW_WR_IDLE) {
+		/*
+		 * immediately schedule READ response w/o
+		 * consuming IRQ entry: IRQ must be empty.
+		 */
+		tx_work->processed = 0;
+		tx_work->mem[0] = NULL;
+		tx_work->wr_status = SIW_WR_QUEUED;
+		resp = &tx_work->sqe;
+	} else {
+		/* no slot can be handed out by a zero-sized IRQ */
+		resp = qp->attrs.irq_size ? irq_alloc_free(qp) : NULL;
+		run_sq = 0;
+	}
+	if (likely(resp)) {
+		resp->opcode = SIW_OP_READ_RESPONSE;
+
+		resp->sge[0].length = length;
+		resp->sge[0].laddr = laddr;
+		resp->sge[0].lkey = lkey;
+
+		/* Keep aside message sequence number for potential
+		 * error reporting during Read Response generation.
+		 */
+		resp->sge[1].length = msn;
+
+		resp->raddr = raddr;
+		resp->rkey = rkey;
+		resp->num_sge = length ? 1 : 0;
+
+		/* RRESP now valid as current TX wqe or placed into IRQ */
+		smp_store_mb(resp->flags, SIW_WQE_VALID);
+	} else {
+		/* report index 0 for a zero-sized IRQ */
+		pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp),
+			qp->attrs.irq_size ?
+				qp->irq_put % qp->attrs.irq_size : 0,
+			qp->attrs.irq_size);
+
+		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+				   RDMAP_ETYPE_REMOTE_OPERATION,
+				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
+		rv = -EPROTO;
+	}
+
+	spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+	if (run_sq)
+		rv = siw_sq_start(qp);
+
+	return rv;
+}
+
+/*
+ * Only called at start of Read.Response processing.
+ * Copies the pending Read found at the current ORQ position into the
+ * current tagged rx wqe. The ORQ entry itself is left untouched here,
+ * so it stays valid until Read.Response processing is done.
+ * No Queue locking needed.
+ *
+ * Returns 0 on success, or -EPROTO if the current ORQ entry is not
+ * marked valid.
+ */
+static int siw_orqe_start_rx(struct siw_qp *qp)
+{
+	struct siw_sqe *orqe;
+	struct siw_wqe *wqe;
+
+	/* make sure ORQ indices are current */
+	smp_mb();
+
+	orqe = orq_get_current(qp);
+	if (!(READ_ONCE(orqe->flags) & SIW_WQE_VALID))
+		return -EPROTO;
+
+	/* RRESP is a TAGGED RDMAP operation */
+	wqe = rx_wqe(&qp->rx_tagged);
+
+	wqe->sqe.id = orqe->id;
+	wqe->sqe.opcode = orqe->opcode;
+	wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
+	wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
+	wqe->sqe.sge[0].length = orqe->sge[0].length;
+	wqe->sqe.flags = orqe->flags;
+	wqe->sqe.num_sge = 1;
+
+	wqe->bytes = orqe->sge[0].length;
+	wqe->processed = 0;
+	wqe->mem[0] = NULL;
+
+	/* make sure WQE is completely written before valid */
+	smp_wmb();
+	wqe->wr_status = SIW_WR_INPROGRESS;
+
+	return 0;
+}
+
+/*
+ * siw_proc_rresp:
+ *
+ * Place incoming RRESP data into memory referenced by RREQ WQE
+ * which is at the tip of the ORQ
+ *
+ * Function supports partially received RRESP's (suspending/resuming
+ * current receive processing)
+ *
+ * return value:
+ * 0: all payload of the current FPDU got placed, or the FPDU
+ *    carries no payload,
+ * -EAGAIN: part of the FPDU payload is still outstanding,
+ * other negative value: failure. Except for a failing
+ *    siw_rresp_check_ntoh(), a TERMINATE gets initialized here.
+ */
+int siw_proc_rresp(struct siw_qp *qp)
+{
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ struct siw_rx_fpdu *frx = &qp->rx_tagged;
+ struct siw_wqe *wqe = rx_wqe(frx);
+ struct siw_mem **mem, *mem_p;
+ struct siw_sge *sge;
+ int bytes, rv;
+
+ if (frx->first_ddp_seg) {
+ /* Start of a new RRESP: the tagged rx WQE must be unused */
+ if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
+ pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
+ qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
+ rv = -EPROTO;
+ goto error_term;
+ }
+ /*
+ * fetch pending RREQ from orq
+ */
+ rv = siw_orqe_start_rx(qp);
+ if (rv) {
+ pr_warn("siw: [QP %u]: ORQ empty at idx %d\n",
+ qp_id(qp), qp->orq_get % qp->attrs.orq_size);
+ goto error_term;
+ }
+ rv = siw_rresp_check_ntoh(srx, frx);
+ if (unlikely(rv)) {
+ /* no TERMINATE initialized on this path */
+ siw_qp_event(qp, IB_EVENT_QP_FATAL);
+ return rv;
+ }
+ } else {
+ /* Resuming: the WQE must have been set up by an earlier call */
+ if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
+ pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
+ qp_id(qp), wqe->wr_status);
+ rv = -EPROTO;
+ goto error_term;
+ }
+ }
+ if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
+ return 0;
+
+ sge = wqe->sqe.sge; /* there is only one */
+ mem = &wqe->mem[0];
+
+ if (!(*mem)) {
+ /*
+ * check target memory which resolves memory on first fragment
+ */
+ rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
+ wqe->bytes);
+ if (unlikely(rv)) {
+ siw_dbg_qp(qp, "target mem check: %d\n", rv);
+ wqe->wc_status = SIW_WC_LOC_PROT_ERR;
+
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+ DDP_ETYPE_TAGGED_BUF,
+ siw_tagged_error(-rv), 0);
+
+ siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+
+ /* TERMINATE already set up: bypass error_term */
+ return -EINVAL;
+ }
+ }
+ mem_p = *mem;
+
+ /* Place no more than what is left of the FPDU and present in the skb */
+ bytes = min(srx->fpdu_part_rem, srx->skb_new);
+
+ /*
+ * Select the placement routine by memory type: no memory object
+ * attached (address used directly), PBL based, or umem based.
+ */
+ if (mem_p->mem_obj == NULL)
+ rv = siw_rx_kva(srx, (void *)(sge->laddr + wqe->processed),
+ bytes);
+ else if (!mem_p->is_pbl)
+ rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
+ bytes);
+ else
+ rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
+ sge->laddr + wqe->processed, bytes);
+ /* Anything but complete placement of 'bytes' is treated as error */
+ if (rv != bytes) {
+ wqe->wc_status = SIW_WC_GENERAL_ERR;
+ rv = -EINVAL;
+ goto error_term;
+ }
+ srx->fpdu_part_rem -= rv;
+ srx->fpdu_part_rcvd += rv;
+ wqe->processed += rv;
+
+ if (!srx->fpdu_part_rem) {
+ /* FPDU payload complete: advance the expected tagged offset */
+ srx->ddp_to += srx->fpdu_part_rcvd;
+ return 0;
+ }
+ return -EAGAIN;
+
+error_term:
+ siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
+ DDP_ECODE_CATASTROPHIC, 0);
+ return rv;
+}
+
+/*
+ * siw_proc_terminate:
+ *
+ * Process an inbound TERMINATE message. The reported layer, error type
+ * and error code are logged. If the TERMINATE indicates that header
+ * information of the offending message is included, and that
+ * information is present in the current skb, it is copied out for
+ * debug reporting.
+ *
+ * Always returns -ECONNRESET.
+ */
+int siw_proc_terminate(struct siw_qp *qp)
+{
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ struct sk_buff *skb = srx->skb;
+ struct iwarp_terminate *term = &srx->hdr.terminate;
+ union iwarp_hdr term_info;
+ u8 *infop = (u8 *)&term_info;
+ enum rdma_opcode op;
+ u16 to_copy = sizeof(struct iwarp_ctrl);
+
+ pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
+ __rdmap_term_layer(term), __rdmap_term_etype(term),
+ __rdmap_term_ecode(term));
+
+ /* Validate the untagged DDP header fields of the TERMINATE itself */
+ if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
+ be32_to_cpu(term->ddp_msn) !=
+ qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
+ be32_to_cpu(term->ddp_mo) != 0) {
+ pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
+ be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
+ be32_to_cpu(term->ddp_mo));
+ return -ECONNRESET;
+ }
+ /*
+ * Receive remaining pieces of TERM if indicated
+ */
+ if (!term->flag_m)
+ return -ECONNRESET;
+
+ /* Do not take the effort to reassemble a network fragmented
+ * TERM message
+ */
+ if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
+ return -ECONNRESET;
+
+ memset(infop, 0, sizeof(term_info));
+
+ /* First step: fetch the control part of the reported header */
+ skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
+
+ op = __rdmap_get_opcode(&term_info.ctrl);
+ if (op >= RDMAP_TERMINATE)
+ goto out;
+
+ infop += to_copy;
+ srx->skb_offset += to_copy;
+ srx->skb_new -= to_copy;
+ srx->skb_copied += to_copy;
+ srx->fpdu_part_rcvd += to_copy;
+ srx->fpdu_part_rem -= to_copy;
+
+ /* Second step: remainder of the header, length given by opcode */
+ to_copy = iwarp_pktinfo[op].hdr_len - to_copy;
+
+ /* Again, no network fragmented TERM's */
+ if (to_copy + MPA_CRC_SIZE > srx->skb_new)
+ return -ECONNRESET;
+
+ skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
+
+ /* flag_m is known to be set at this point (checked above) */
+ if (term->flag_r) {
+ siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
+ op, be16_to_cpu(term_info.ctrl.mpa_len),
+ term->flag_m ? "valid" : "invalid");
+ } else if (term->flag_d) {
+ siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
+ op, be16_to_cpu(term_info.ctrl.mpa_len),
+ term->flag_m ? "valid" : "invalid");
+ }
+out:
+ /*
+ * Account for the bytes of the last copy step; to_copy holds
+ * either the control part size (direct jump) or the header
+ * remainder.
+ */
+ srx->skb_new -= to_copy;
+ srx->skb_offset += to_copy;
+ srx->skb_copied += to_copy;
+ srx->fpdu_part_rcvd += to_copy;
+ srx->fpdu_part_rem -= to_copy;
+
+ return -ECONNRESET;
+}
+
+/*
+ * siw_get_trailer()
+ *
+ * Receive the FPDU trailer (pad bytes, if any, plus CRC) once it is
+ * completely available in the current skb. If a CRC handle is set,
+ * the pad bytes are added to the running CRC, which is then finalized
+ * and compared against the received CRC.
+ *
+ * Returns 0 on success, -EAGAIN if the trailer is not yet completely
+ * available, or -EINVAL on CRC mismatch (TERMINATE initialized).
+ */
+static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
+{
+	/* pad bytes are placed directly in front of the CRC field */
+	u8 *trailer = (u8 *)&srx->trailer.crc - srx->pad;
+	struct sk_buff *skb = srx->skb;
+	__wsum crc_own = 0;
+	__wsum crc_in;
+
+	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
+		   srx->fpdu_part_rem, srx->skb_new, srx->pad);
+
+	if (srx->skb_new < srx->fpdu_part_rem)
+		return -EAGAIN;
+
+	skb_copy_bits(skb, srx->skb_offset, trailer, srx->fpdu_part_rem);
+
+	srx->skb_new -= srx->fpdu_part_rem;
+	srx->skb_offset += srx->fpdu_part_rem;
+	srx->skb_copied += srx->fpdu_part_rem;
+
+	/* Nothing to verify if CRC is not in use */
+	if (!srx->mpa_crc_hd)
+		return 0;
+
+	if (srx->pad)
+		crypto_shash_update(srx->mpa_crc_hd, trailer, srx->pad);
+
+	/*
+	 * CRC32 is computed, transmitted and received directly in NBO,
+	 * so there's never a reason to convert byte order.
+	 */
+	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
+	crc_in = (__force __wsum)srx->trailer.crc;
+
+	if (likely(crc_in == crc_own))
+		return 0;
+
+	pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
+		crc_in, crc_own, qp->rx_stream.rdmap_op);
+
+	siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+			   LLP_ETYPE_MPA,
+			   LLP_ECODE_RECEIVED_CRC, 0);
+	return -EINVAL;
+}
+
+#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
+
+/*
+ * siw_get_hdr()
+ *
+ * Receive the DDP/RDMAP header of the current FPDU into srx->hdr.
+ *
+ * The header is fetched in up to two steps: first a minimum sized
+ * (tagged) control part, which is checked for DDP/RDMAP version and
+ * opcode, then - for opcodes with a longer header - the remainder.
+ * Both steps consume whatever header bytes the current skb provides
+ * and keep track of progress in srx->fpdu_part_rcvd, so a header
+ * spread over several skbs gets assembled piece by piece.
+ *
+ * After the header is complete, the rx context is checked for a
+ * started but unfinished message of the same type (tagged/untagged).
+ *
+ * Returns 0 if the header is complete and consistent, -EAGAIN if
+ * more header bytes are needed, -EINVAL on unsupported version or
+ * opcode (TERMINATE initialized), or -EPROTO if the new segment
+ * intersects an unfinished message.
+ */
+static int siw_get_hdr(struct siw_rx_stream *srx)
+{
+	struct sk_buff *skb = srx->skb;
+	struct siw_qp *qp = rx_qp(srx);
+	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
+	struct siw_rx_fpdu *frx;
+	u8 opcode;
+	int bytes;
+
+	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
+		/*
+		 * copy a minimum sized (tagged) DDP frame control part
+		 */
+		bytes = min_t(int, srx->skb_new,
+			      MIN_DDP_HDR - srx->fpdu_part_rcvd);
+
+		skb_copy_bits(skb, srx->skb_offset,
+			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
+
+		srx->fpdu_part_rcvd += bytes;
+
+		srx->skb_new -= bytes;
+		srx->skb_offset += bytes;
+		srx->skb_copied += bytes;
+
+		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
+			return -EAGAIN;
+
+		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
+			enum ddp_etype etype;
+			enum ddp_ecode ecode;
+
+			pr_warn("siw: received ddp version unsupported %d\n",
+				__ddp_get_version(c_hdr));
+
+			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
+				etype = DDP_ETYPE_TAGGED_BUF;
+				ecode = DDP_ECODE_T_VERSION;
+			} else {
+				etype = DDP_ETYPE_UNTAGGED_BUF;
+				ecode = DDP_ECODE_UT_VERSION;
+			}
+			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+					   etype, ecode, 0);
+			return -EINVAL;
+		}
+		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
+			pr_warn("siw: received rdmap version unsupported %d\n",
+				__rdmap_get_version(c_hdr));
+
+			siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+					   RDMAP_ETYPE_REMOTE_OPERATION,
+					   RDMAP_ECODE_VERSION, 0);
+			return -EINVAL;
+		}
+		opcode = __rdmap_get_opcode(c_hdr);
+
+		if (opcode > RDMAP_TERMINATE) {
+			pr_warn("siw: received unknown packet type %u\n",
+				opcode);
+
+			siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+					   RDMAP_ETYPE_REMOTE_OPERATION,
+					   RDMAP_ECODE_OPCODE, 0);
+			return -EINVAL;
+		}
+		siw_dbg_qp(qp, "new header, opcode %u\n", opcode);
+	} else {
+		/* Control part already validated by an earlier call */
+		opcode = __rdmap_get_opcode(c_hdr);
+	}
+	set_rx_fpdu_context(qp, opcode);
+	frx = qp->rx_fpdu;
+
+	/*
+	 * Figure out len of current hdr: variable length of
+	 * iwarp hdr may force us to copy hdr information in
+	 * two steps. Only tagged DDP messages are already
+	 * completely received.
+	 */
+	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
+		int hdrlen = iwarp_pktinfo[opcode].hdr_len;
+
+		/*
+		 * Take what is available of the outstanding header
+		 * bytes instead of waiting for all of them to show up
+		 * within a single skb. Unconsumed bytes are not
+		 * guaranteed to be presented again together with
+		 * subsequently arriving data.
+		 */
+		bytes = min_t(int, hdrlen - srx->fpdu_part_rcvd,
+			      srx->skb_new);
+		if (bytes > 0) {
+			skb_copy_bits(skb, srx->skb_offset,
+				      (char *)c_hdr + srx->fpdu_part_rcvd,
+				      bytes);
+
+			srx->fpdu_part_rcvd += bytes;
+
+			srx->skb_new -= bytes;
+			srx->skb_offset += bytes;
+			srx->skb_copied += bytes;
+		}
+		if (srx->fpdu_part_rcvd < hdrlen)
+			return -EAGAIN;
+	}
+
+	/*
+	 * DDP/RDMAP header receive completed. Check if the current
+	 * DDP segment starts a new RDMAP message or continues a previously
+	 * started RDMAP message.
+	 *
+	 * Alternating reception of DDP segments (or FPDUs) from incomplete
+	 * tagged and untagged RDMAP messages is supported, as long as
+	 * the current tagged or untagged message gets eventually completed
+	 * w/o intersection from another message of the same type
+	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
+	 * but not by a READ RESPONSE etc.
+	 */
+	if (srx->mpa_crc_hd) {
+		/*
+		 * Restart CRC computation
+		 */
+		crypto_shash_init(srx->mpa_crc_hd);
+		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
+				    srx->fpdu_part_rcvd);
+	}
+	if (frx->more_ddp_segs) {
+		frx->first_ddp_seg = 0;
+		if (frx->prev_rdmap_op != opcode) {
+			pr_warn("siw: packet intersection: %u : %u\n",
+				frx->prev_rdmap_op, opcode);
+			/*
+			 * The last inbound RDMA operation of same type
+			 * (tagged or untagged) is left unfinished.
+			 * To complete it in error, make it the current
+			 * operation again, even with the header already
+			 * overwritten. For error handling, only the opcode
+			 * and current rx context are relevant.
+			 */
+			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
+			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
+			return -EPROTO;
+		}
+	} else {
+		frx->prev_rdmap_op = opcode;
+		frx->first_ddp_seg = 1;
+	}
+	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
+
+	return 0;
+}
+
+/*
+ * siw_check_tx_fence()
+ *
+ * Called after a READ RESPONSE got completed without error.
+ * Invalidates the current ORQ entry and, if SQ processing is
+ * currently fenced (qp->tx_ctx.orq_fence set), checks whether the
+ * waiting TX work request can be resumed.
+ *
+ * Returns 0 on success, -EPROTO on inconsistent TX/ORQ state, or the
+ * result of siw_sq_start() if SQ processing got resumed.
+ */
+static int siw_check_tx_fence(struct siw_qp *qp)
+{
+ struct siw_wqe *tx_waiting = tx_wqe(qp);
+ struct siw_sqe *rreq;
+ int resume_tx = 0, rv = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->orq_lock, flags);
+
+ rreq = orq_get_current(qp);
+
+ /* free current orq entry */
+ WRITE_ONCE(rreq->flags, 0);
+
+ if (qp->tx_ctx.orq_fence) {
+ /* A fenced work request is expected to wait in QUEUED state */
+ if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
+ pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
+ qp_id(qp), tx_waiting->wr_status);
+ rv = -EPROTO;
+ /*
+ * NOTE(review): this exit skips the qp->orq_get++
+ * below although the current ORQ entry has been
+ * invalidated above - confirm this is intended.
+ */
+ goto out;
+ }
+ /* resume SQ processing */
+ if (tx_waiting->sqe.opcode == SIW_OP_READ ||
+ tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
+ /* Waiting request is a READ: move it into the ORQ */
+ rreq = orq_get_tail(qp);
+ if (unlikely(!rreq)) {
+ pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
+ rv = -EPROTO;
+ /* NOTE(review): orq_get not advanced here either */
+ goto out;
+ }
+ siw_read_to_orq(rreq, &tx_waiting->sqe);
+
+ qp->orq_put++;
+ qp->tx_ctx.orq_fence = 0;
+ resume_tx = 1;
+
+ } else if (siw_orq_empty(qp)) {
+ /*
+ * NOTE(review): siw_orq_empty() is evaluated before
+ * qp->orq_get is advanced below - verify against the
+ * helper's definition that it accounts for the entry
+ * just invalidated.
+ */
+ qp->tx_ctx.orq_fence = 0;
+ resume_tx = 1;
+ } else {
+ pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n",
+ qp_id(qp), qp->orq_get, qp->orq_put);
+ rv = -EPROTO;
+ }
+ }
+ /* Advance to the next ORQ entry */
+ qp->orq_get++;
+out:
+ spin_unlock_irqrestore(&qp->orq_lock, flags);
+
+ /* Restart SQ processing outside of the ORQ lock */
+ if (resume_tx)
+ rv = siw_sq_start(qp);
+
+ return rv;
+}
+
+/*
+ * siw_rdmap_complete()
+ *
+ * Complete processing of an RDMA message after receiving all
+ * DDP segments, or abort processing after encountering an error.
+ *
+ * o SENDs + RRESPs need work completion generation,
+ * o RREQs need READ RESPONSE initialization,
+ * o WRITEs need memory dereferencing.
+ *
+ * @qp: QP the message was received on
+ * @error: 0 for regular completion, otherwise the error code which
+ *         caused receive processing to stop
+ *
+ * The rx WQE of the current rx context is set back to SIW_WR_IDLE in
+ * all cases. Returns the result of the last completion step executed.
+ *
+ * TODO: Failed WRITEs need local error to be surfaced.
+ */
+static int siw_rdmap_complete(struct siw_qp *qp, int error)
+{
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
+ enum siw_wc_status wc_status = wqe->wc_status;
+ u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
+ int rv = 0;
+
+ switch (opcode) {
+ case RDMAP_SEND_SE:
+ case RDMAP_SEND_SE_INVAL:
+ wqe->rqe.flags |= SIW_WQE_SOLICITED;
+ /* Fall through */
+
+ case RDMAP_SEND:
+ case RDMAP_SEND_INVAL:
+ /* Nothing to complete if no receive WQE is in use */
+ if (wqe->wr_status == SIW_WR_IDLE)
+ break;
+
+ srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
+
+ /* Do not report success for a message completed in error */
+ if (error != 0 && wc_status == SIW_WC_SUCCESS)
+ wc_status = SIW_WC_GENERAL_ERR;
+ /*
+ * Handle STag invalidation request
+ */
+ if (wc_status == SIW_WC_SUCCESS &&
+ (opcode == RDMAP_SEND_INVAL ||
+ opcode == RDMAP_SEND_SE_INVAL)) {
+ rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
+ if (rv) {
+ siw_init_terminate(
+ qp, TERM_ERROR_LAYER_RDMAP,
+ rv == -EACCES ?
+ RDMAP_ETYPE_REMOTE_PROTECTION :
+ RDMAP_ETYPE_REMOTE_OPERATION,
+ RDMAP_ECODE_CANNOT_INVALIDATE, 0);
+
+ wc_status = SIW_WC_REM_INV_REQ_ERR;
+ }
+ /* Report the STag only if it got invalidated */
+ rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
+ rv ? 0 : srx->inval_stag,
+ wc_status);
+ } else {
+ rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
+ 0, wc_status);
+ }
+ siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
+ break;
+
+ case RDMAP_RDMA_READ_RESP:
+ if (wqe->wr_status == SIW_WR_IDLE)
+ break;
+
+ if (error != 0) {
+ if ((srx->state == SIW_GET_HDR &&
+ qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
+ /* possible RREQ in ORQ left untouched */
+ break;
+
+ if (wc_status == SIW_WC_SUCCESS)
+ wc_status = SIW_WC_GENERAL_ERR;
+ } else if (qp->kernel_verbs &&
+ rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
+ /*
+ * Handle any STag invalidation request
+ */
+ rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
+ if (rv) {
+ siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+ RDMAP_ETYPE_CATASTROPHIC,
+ RDMAP_ECODE_UNSPECIFIED, 0);
+
+ if (wc_status == SIW_WC_SUCCESS) {
+ wc_status = SIW_WC_GENERAL_ERR;
+ error = rv;
+ }
+ }
+ }
+ /*
+ * All errors turn the wqe into signalled.
+ */
+ if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
+ rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
+ wc_status);
+ siw_wqe_put_mem(wqe, SIW_OP_READ);
+
+ if (!error)
+ /* Releases the ORQ entry and may resume a fenced SQ */
+ rv = siw_check_tx_fence(qp);
+ else
+ /* Disable current ORQ element */
+ WRITE_ONCE(orq_get_current(qp)->flags, 0);
+ break;
+
+ case RDMAP_RDMA_READ_REQ:
+ if (!error) {
+ rv = siw_init_rresp(qp, srx);
+ srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
+ }
+ break;
+
+ case RDMAP_RDMA_WRITE:
+ if (wqe->wr_status == SIW_WR_IDLE)
+ break;
+
+ /*
+ * Free References from memory object if
+ * attached to receive context (inbound WRITE).
+ * While a zero-length WRITE is allowed,
+ * no memory reference got created.
+ */
+ if (rx_mem(&qp->rx_tagged)) {
+ siw_mem_put(rx_mem(&qp->rx_tagged));
+ rx_mem(&qp->rx_tagged) = NULL;
+ }
+ break;
+
+ default:
+ break;
+ }
+ wqe->wr_status = SIW_WR_IDLE;
+
+ return rv;
+}
+
+/*
+ * siw_tcp_rx_data()
+ *
+ * Main routine to consume inbound TCP payload
+ *
+ * Runs the per-FPDU receive state machine
+ * (SIW_GET_HDR -> SIW_GET_DATA_START/SIW_GET_DATA_MORE ->
+ * SIW_GET_TRAILER) until the skb is drained, an FPDU fragment is
+ * incomplete (-EAGAIN from a sub-step), or an error occurs.
+ *
+ * @rd_desc: read descriptor
+ * @skb: socket buffer
+ * @off: offset in skb
+ * @len: skb->len - offset : payload in skb
+ *
+ * Returns the number of bytes accounted as consumed from the skb
+ * (srx->skb_copied).
+ */
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int off, size_t len)
+{
+ struct siw_qp *qp = rd_desc->arg.data;
+ struct siw_rx_stream *srx = &qp->rx_stream;
+ int rv;
+
+ /* Set up the rx stream for the new skb; 'len' itself is not used */
+ srx->skb = skb;
+ srx->skb_new = skb->len - off;
+ srx->skb_offset = off;
+ srx->skb_copied = 0;
+
+ siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
+
+ while (srx->skb_new) {
+ /* whether a failing step requires siw_rdmap_complete() */
+ int run_completion = 1;
+
+ if (unlikely(srx->rx_suspend)) {
+ /* Do not process any more data */
+ srx->skb_copied += srx->skb_new;
+ break;
+ }
+ switch (srx->state) {
+ case SIW_GET_HDR:
+ rv = siw_get_hdr(srx);
+ if (!rv) {
+ /* payload length: MPA length minus header received */
+ srx->fpdu_part_rem =
+ be16_to_cpu(srx->hdr.ctrl.mpa_len) -
+ srx->fpdu_part_rcvd + MPA_HDR_SIZE;
+
+ /* pad bytes needed to reach a 4 byte boundary */
+ if (srx->fpdu_part_rem)
+ srx->pad = -srx->fpdu_part_rem & 0x3;
+ else
+ srx->pad = 0;
+
+ srx->state = SIW_GET_DATA_START;
+ srx->fpdu_part_rcvd = 0;
+ }
+ break;
+
+ case SIW_GET_DATA_MORE:
+ /*
+ * Another data fragment of the same DDP segment.
+ * Setting first_ddp_seg = 0 avoids repeating
+ * initializations that shall occur only once per
+ * DDP segment.
+ */
+ qp->rx_fpdu->first_ddp_seg = 0;
+ /* Fall through */
+
+ case SIW_GET_DATA_START:
+ /*
+ * Headers will be checked by the opcode-specific
+ * data receive function below.
+ */
+ rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
+ if (!rv) {
+ int mpa_len =
+ be16_to_cpu(srx->hdr.ctrl.mpa_len)
+ + MPA_HDR_SIZE;
+
+ /* trailer to receive next: pad bytes plus CRC */
+ srx->fpdu_part_rem = (-mpa_len & 0x3)
+ + MPA_CRC_SIZE;
+ srx->fpdu_part_rcvd = 0;
+ srx->state = SIW_GET_TRAILER;
+ } else {
+ /* -ECONNRESET: no message completion to run */
+ if (unlikely(rv == -ECONNRESET))
+ run_completion = 0;
+ else
+ srx->state = SIW_GET_DATA_MORE;
+ }
+ break;
+
+ case SIW_GET_TRAILER:
+ /*
+ * read CRC + any padding
+ */
+ rv = siw_get_trailer(qp, srx);
+ if (likely(!rv)) {
+ /*
+ * FPDU completed.
+ * complete RDMAP message if last fragment
+ */
+ srx->state = SIW_GET_HDR;
+ srx->fpdu_part_rcvd = 0;
+
+ if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
+ DDP_FLAG_LAST))
+ /* more frags */
+ break;
+
+ rv = siw_rdmap_complete(qp, 0);
+ /* completion already done, do not repeat on error */
+ run_completion = 0;
+ }
+ break;
+
+ default:
+ pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
+ rv = -EPROTO;
+ run_completion = 0;
+ }
+ if (unlikely(rv != 0 && rv != -EAGAIN)) {
+ /*
+ * Hard error: complete an RDMAP message in progress
+ * in error, then drop the connection.
+ */
+ if ((srx->state > SIW_GET_HDR ||
+ qp->rx_fpdu->more_ddp_segs) && run_completion)
+ siw_rdmap_complete(qp, rv);
+
+ siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
+ srx->state);
+
+ siw_qp_cm_drop(qp, 1);
+
+ break;
+ }
+ if (rv) {
+ /* -EAGAIN: wait for more data to arrive */
+ siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
+ srx->state, srx->fpdu_part_rem);
+ break;
+ }
+ }
+ return srx->skb_copied;
+}
diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c
new file mode 100644
index 000000000000..43020d2040fc
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_qp_tx.c
@@ -0,0 +1,1269 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+#define MAX_HDR_INLINE \
+ (((uint32_t)(sizeof(struct siw_rreq_pkt) - \
+ sizeof(struct iwarp_send))) & 0xF8)
+
+/*
+ * Look up the page backing address @addr of a PBL based memory object.
+ * The offset of @addr relative to mem->va is resolved via
+ * siw_pbl_get_buffer(); @idx is handed through to that lookup.
+ *
+ * Returns the page, or NULL if the lookup yields no buffer address.
+ */
+static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx)
+{
+	u64 paddr = siw_pbl_get_buffer(mem->pbl, addr - mem->va, NULL, idx);
+
+	return paddr ? virt_to_page(paddr) : NULL;
+}
+
+/*
+ * siw_try_1seg()
+ *
+ * Copy short payload at provided destination payload address.
+ *
+ * Only applicable to work requests with exactly one SGE whose length
+ * does not exceed MAX_HDR_INLINE.
+ *
+ * @c_tx: transmit context; its active WQE describes the payload
+ * @paddr: destination address the payload is copied to
+ *
+ * Returns the number of bytes copied (0 for a zero length payload),
+ * MAX_HDR_INLINE + 1 if the payload does not qualify, or -EFAULT if
+ * the source memory cannot be accessed.
+ */
+static int siw_try_1seg(struct siw_iwarp_tx *c_tx, u64 paddr)
+{
+	struct siw_wqe *wqe = &c_tx->wqe_active;
+	struct siw_sge *sge = &wqe->sqe.sge[0];
+	u32 bytes = sge->length;
+
+	if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1)
+		return MAX_HDR_INLINE + 1;
+
+	if (!bytes)
+		return 0;
+
+	if (tx_flags(wqe) & SIW_WQE_INLINE) {
+		/* Inline data is read starting at the second SGE slot */
+		memcpy((void *)paddr, &wqe->sqe.sge[1], bytes);
+	} else {
+		struct siw_mem *mem = wqe->mem[0];
+
+		if (!mem->mem_obj) {
+			/* Kernel client using kva */
+			memcpy((void *)paddr, (void *)sge->laddr, bytes);
+		} else if (c_tx->in_syscall) {
+			if (copy_from_user((void *)paddr,
+					   (const void __user *)sge->laddr,
+					   bytes))
+				return -EFAULT;
+		} else {
+			unsigned int off = sge->laddr & ~PAGE_MASK;
+			struct page *p;
+			char *buffer;
+			int pbl_idx = 0;
+
+			if (!mem->is_pbl)
+				p = siw_get_upage(mem->umem, sge->laddr);
+			else
+				p = siw_get_pblpage(mem, sge->laddr, &pbl_idx);
+
+			if (unlikely(!p))
+				return -EFAULT;
+
+			buffer = kmap_atomic(p);
+
+			if (likely(PAGE_SIZE - off >= bytes)) {
+				memcpy((void *)paddr, buffer + off, bytes);
+				kunmap_atomic(buffer);
+			} else {
+				/*
+				 * Payload crosses a page boundary: 'part' is
+				 * the share located in the first page, so
+				 * that laddr + part is the start of the
+				 * following page holding the remainder.
+				 */
+				unsigned long part = PAGE_SIZE - off;
+
+				memcpy((void *)paddr, buffer + off, part);
+				kunmap_atomic(buffer);
+
+				if (!mem->is_pbl)
+					p = siw_get_upage(mem->umem,
+							  sge->laddr + part);
+				else
+					p = siw_get_pblpage(mem,
+							    sge->laddr + part,
+							    &pbl_idx);
+				if (unlikely(!p))
+					return -EFAULT;
+
+				buffer = kmap_atomic(p);
+				memcpy((void *)(paddr + part), buffer,
+				       bytes - part);
+				kunmap_atomic(buffer);
+			}
+		}
+	}
+	return (int)bytes;
+}
+
+#define PKT_FRAGMENTED 1
+#define PKT_COMPLETE 0
+
+/*
+ * siw_qp_prepare_tx()
+ *
+ * Prepare tx state for sending out one fpdu. Builds complete pkt
+ * if no user data or only immediate data are present.
+ *
+ * The iWARP header matching the type of the active WQE is set up in
+ * c_tx->pkt. For all types but READ requests, siw_try_1seg() is used
+ * to try placing short payload right behind the header.
+ *
+ * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise.
+ * A negative value is returned on failure: -EOPNOTSUPP for an
+ * unexpected WQE type, -EINVAL if the CRC update fails, or the error
+ * reported by siw_try_1seg().
+ */
+static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
+{
+	struct siw_wqe *wqe = &c_tx->wqe_active;
+	char *crc = NULL;
+	int data = 0;
+
+	switch (tx_type(wqe)) {
+	case SIW_OP_READ:
+	case SIW_OP_READ_LOCAL_INV:
+		memcpy(&c_tx->pkt.ctrl,
+		       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
+		       sizeof(struct iwarp_ctrl));
+
+		c_tx->pkt.rreq.rsvd = 0;
+		c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
+		c_tx->pkt.rreq.ddp_msn =
+			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]);
+		c_tx->pkt.rreq.ddp_mo = 0;
+		/* local buffer is the sink, remote buffer the source */
+		c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey);
+		c_tx->pkt.rreq.sink_to =
+			cpu_to_be64(wqe->sqe.sge[0].laddr);
+		c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey);
+		c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr);
+		c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length);
+
+		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq);
+		crc = (char *)&c_tx->pkt.rreq_pkt.crc;
+		break;
+
+	case SIW_OP_SEND:
+		if (tx_flags(wqe) & SIW_WQE_SOLICITED)
+			memcpy(&c_tx->pkt.ctrl,
+			       &iwarp_pktinfo[RDMAP_SEND_SE].ctrl,
+			       sizeof(struct iwarp_ctrl));
+		else
+			memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl,
+			       sizeof(struct iwarp_ctrl));
+
+		/* header fields are kept in network byte order */
+		c_tx->pkt.send.ddp_qn = htonl(RDMAP_UNTAGGED_QN_SEND);
+		c_tx->pkt.send.ddp_msn =
+			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+		c_tx->pkt.send.ddp_mo = 0;
+
+		c_tx->pkt.send_inv.inval_stag = 0;
+
+		c_tx->ctrl_len = sizeof(struct iwarp_send);
+
+		crc = (char *)&c_tx->pkt.send_pkt.crc;
+		data = siw_try_1seg(c_tx, (u64)crc);
+		break;
+
+	case SIW_OP_SEND_REMOTE_INV:
+		if (tx_flags(wqe) & SIW_WQE_SOLICITED)
+			memcpy(&c_tx->pkt.ctrl,
+			       &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl,
+			       sizeof(struct iwarp_ctrl));
+		else
+			memcpy(&c_tx->pkt.ctrl,
+			       &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl,
+			       sizeof(struct iwarp_ctrl));
+
+		/* header fields are kept in network byte order */
+		c_tx->pkt.send.ddp_qn = htonl(RDMAP_UNTAGGED_QN_SEND);
+		c_tx->pkt.send.ddp_msn =
+			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+		c_tx->pkt.send.ddp_mo = 0;
+
+		c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey);
+
+		c_tx->ctrl_len = sizeof(struct iwarp_send_inv);
+
+		crc = (char *)&c_tx->pkt.send_pkt.crc;
+		data = siw_try_1seg(c_tx, (u64)crc);
+		break;
+
+	case SIW_OP_WRITE:
+		memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl,
+		       sizeof(struct iwarp_ctrl));
+
+		c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey);
+		c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr);
+		c_tx->ctrl_len = sizeof(struct iwarp_rdma_write);
+
+		crc = (char *)&c_tx->pkt.write_pkt.crc;
+		data = siw_try_1seg(c_tx, (u64)crc);
+		break;
+
+	case SIW_OP_READ_RESPONSE:
+		memcpy(&c_tx->pkt.ctrl,
+		       &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl,
+		       sizeof(struct iwarp_ctrl));
+
+		/* NBO */
+		c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey);
+		c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr);
+
+		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp);
+
+		/*
+		 * NOTE(review): uses the write_pkt view for the CRC
+		 * position; presumably WRITE and READ RESPONSE headers
+		 * share their layout - confirm.
+		 */
+		crc = (char *)&c_tx->pkt.write_pkt.crc;
+		data = siw_try_1seg(c_tx, (u64)crc);
+		break;
+
+	default:
+		siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe));
+		return -EOPNOTSUPP;
+	}
+	if (unlikely(data < 0))
+		return data;
+
+	c_tx->ctrl_sent = 0;
+
+	if (data <= MAX_HDR_INLINE) {
+		/* No payload, or payload already placed behind the header */
+		if (data) {
+			wqe->processed = data;
+
+			c_tx->pkt.ctrl.mpa_len =
+				htons(c_tx->ctrl_len + data - MPA_HDR_SIZE);
+
+			/* Add pad, if needed */
+			data += -(int)data & 0x3;
+			/* advance CRC location after payload */
+			crc += data;
+			c_tx->ctrl_len += data;
+
+			if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
+				c_tx->pkt.c_untagged.ddp_mo = 0;
+			else
+				c_tx->pkt.c_tagged.ddp_to =
+					cpu_to_be64(wqe->sqe.raddr);
+		}
+
+		*(u32 *)crc = 0;
+		/*
+		 * Do complete CRC if enabled and short packet
+		 */
+		if (c_tx->mpa_crc_hd) {
+			crypto_shash_init(c_tx->mpa_crc_hd);
+			if (crypto_shash_update(c_tx->mpa_crc_hd,
+						(u8 *)&c_tx->pkt,
+						c_tx->ctrl_len))
+				return -EINVAL;
+			crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)crc);
+		}
+		c_tx->ctrl_len += MPA_CRC_SIZE;
+
+		return PKT_COMPLETE;
+	}
+	/* Payload must be sent separately: start at the first SGE */
+	c_tx->ctrl_len += MPA_CRC_SIZE;
+	c_tx->sge_idx = 0;
+	c_tx->sge_off = 0;
+	c_tx->pbl_idx = 0;
+
+	/*
+	 * Allow direct sending out of user buffer if WR is non signalled
+	 * and payload is over threshold.
+	 * Per RDMA verbs, the application should not change the send buffer
+	 * until the work completed. In iWarp, work completion is only
+	 * local delivery to TCP. TCP may reuse the buffer for
+	 * retransmission. Changing unsent data also breaks the CRC,
+	 * if applied.
+	 */
+	if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH &&
+	    !(tx_flags(wqe) & SIW_WQE_SIGNALLED))
+		c_tx->use_sendpage = 1;
+	else
+		c_tx->use_sendpage = 0;
+
+	return PKT_FRAGMENTED;
+}
+
+/*
+ * Send out one complete control type FPDU, or header of FPDU carrying
+ * data. Used for fixed sized packets like Read.Requests or zero length
+ * SENDs, WRITEs, READ.Responses, or header only.
+ *
+ * Transmission continues at c_tx->ctrl_sent, which gets advanced by
+ * the number of bytes accepted by the socket.
+ *
+ * Returns 0 if all c_tx->ctrl_len bytes are sent, -EAGAIN if part of
+ * it is still outstanding, or the negative kernel_sendmsg() result.
+ */
+static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
+		       int flags)
+{
+	struct msghdr msg = { .msg_flags = flags };
+	struct kvec iov = {
+		.iov_base = (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent,
+		.iov_len = c_tx->ctrl_len - c_tx->ctrl_sent
+	};
+	int rv;
+
+	rv = kernel_sendmsg(s, &msg, &iov, 1,
+			    c_tx->ctrl_len - c_tx->ctrl_sent);
+	if (rv < 0)
+		return rv;
+
+	c_tx->ctrl_sent += rv;
+
+	return c_tx->ctrl_sent == c_tx->ctrl_len ? 0 : -EAGAIN;
+}
+
+/*
+ * 0copy TCP transmit interface: Use do_tcp_sendpages.
+ *
+ * Using sendpage to push page by page appears to be less efficient
+ * than using sendmsg, even if data are copied.
+ *
+ * A general performance limitation might be the extra four bytes
+ * trailer checksum segment to be pushed after user data.
+ *
+ * Pushes @size bytes, starting at @offset within page[0], page by
+ * page. A partially accepted page is retried for its remainder.
+ *
+ * Returns the number of bytes sent, which is less than @size if the
+ * socket stopped accepting data (-EAGAIN or 0 from the send call),
+ * or any other negative send error.
+ */
+static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
+			     size_t size)
+{
+	struct sock *sk = s->sk;
+	int flags = MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST;
+	int idx = 0, sent = 0, rv = 0;
+
+	while (size) {
+		size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);
+
+		/* Remaining data fits into this page: it is the last one */
+		if (size + offset <= PAGE_SIZE)
+			flags = MSG_MORE | MSG_DONTWAIT;
+
+		tcp_rate_check_app_limited(sk);
+
+		/* Repeat until the current page chunk is completely sent */
+		for (;;) {
+			lock_sock(sk);
+			rv = do_tcp_sendpages(sk, page[idx], offset, bytes,
+					      flags);
+			release_sock(sk);
+
+			if (rv <= 0) {
+				if (rv == -EAGAIN || rv == 0)
+					return sent;
+				return rv;
+			}
+			size -= rv;
+			sent += rv;
+
+			if (rv == bytes)
+				break;
+
+			offset += rv;
+			bytes -= rv;
+		}
+		offset = 0;
+		idx++;
+	}
+	return sent;
+}
+
+/*
+ * siw_0copy_tx()
+ *
+ * Pushes list of pages to TCP socket. If pages from multiple
+ * SGE's, all referenced pages of each SGE are pushed in one
+ * shot.
+ *
+ * @offset is the byte offset into the first SGE, @size the overall
+ * number of bytes to push.
+ *
+ * Returns the number of bytes sent, which may be less than @size, or
+ * the negative error reported by siw_tcp_sendpages().
+ */
+static int siw_0copy_tx(struct socket *s, struct page **page,
+			struct siw_sge *sge, unsigned int offset,
+			unsigned int size)
+{
+	int pg_idx = 0, sent = 0, rv;
+	int sge_bytes = min(sge->length - offset, size);
+
+	/* from here on, offset is relative to the first page of the SGE */
+	offset = (sge->laddr + offset) & ~PAGE_MASK;
+
+	while (sent != size) {
+		rv = siw_tcp_sendpages(s, &page[pg_idx], offset, sge_bytes);
+		if (rv < 0)
+			return rv;
+
+		sent += rv;
+
+		/* Done, or the socket took less than the current SGE */
+		if (size == sent || sge_bytes > rv)
+			break;
+
+		/* Skip the pages of the finished SGE, continue with next */
+		pg_idx += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT;
+		sge++;
+		sge_bytes = min(sge->length, size - sent);
+		offset = sge->laddr & ~PAGE_MASK;
+	}
+	return sent;
+}
+
+#define MAX_TRAILER (MPA_CRC_SIZE + 4)
+
+/*
+ * kunmap() the pages referenced by @pages. @num_maps is the number
+ * of array slots in use. If @hdr_len is non-zero, the first slot is
+ * skipped and not unmapped.
+ */
+static void siw_unmap_pages(struct page **pages, int hdr_len, int num_maps)
+{
+	int i;
+
+	for (i = hdr_len ? 1 : 0; i < num_maps; i++)
+		kunmap(pages[i]);
+}
+
+/*
+ * siw_tx_hdt() tries to push a complete packet to TCP where all
+ * packet fragments are referenced by the elements of one iovec.
+ * For the data portion, each involved page must be referenced by
+ * one extra element. All sge's data can be non-aligned to page
+ * boundaries. Two more elements are referencing iWARP header
+ * and trailer:
+ * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + (2 * (SIW_MAX_SGE - 1) + HDR + TRL)
+ */
+#define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2))
+
+/*
+ * Write out iov referencing hdr, data and trailer of current FPDU.
+ * Update transmit state dependent on write return status
+ */
+static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
+{
+ struct siw_wqe *wqe = &c_tx->wqe_active;
+ struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx];
+ struct kvec iov[MAX_ARRAY];
+ struct page *page_array[MAX_ARRAY];
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+
+ int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv;
+ unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0,
+ sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx,
+ pbl_idx = c_tx->pbl_idx;
+
+ if (c_tx->state == SIW_SEND_HDR) {
+ if (c_tx->use_sendpage) {
+ rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE);
+ if (rv)
+ goto done;
+
+ c_tx->state = SIW_SEND_DATA;
+ } else {
+ iov[0].iov_base =
+ (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent;
+ iov[0].iov_len = hdr_len =
+ c_tx->ctrl_len - c_tx->ctrl_sent;
+ seg = 1;
+ }
+ }
+
+ wqe->processed += data_len;
+
+ while (data_len) { /* walk the list of SGE's */
+ unsigned int sge_len = min(sge->length - sge_off, data_len);
+ unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK;
+ struct siw_mem *mem;
+
+ if (!(tx_flags(wqe) & SIW_WQE_INLINE)) {
+ mem = wqe->mem[sge_idx];
+ if (!mem->mem_obj)
+ is_kva = 1;
+ } else {
+ is_kva = 1;
+ }
+ if (is_kva && !c_tx->use_sendpage) {
+ /*
+ * tx from kernel virtual address: either inline data
+ * or memory region with assigned kernel buffer
+ */
+ iov[seg].iov_base = (void *)(sge->laddr + sge_off);
+ iov[seg].iov_len = sge_len;
+
+ if (do_crc)
+ crypto_shash_update(c_tx->mpa_crc_hd,
+ iov[seg].iov_base,
+ sge_len);
+ sge_off += sge_len;
+ data_len -= sge_len;
+ seg++;
+ goto sge_done;
+ }
+
+ while (sge_len) {
+ size_t plen = min((int)PAGE_SIZE - fp_off, sge_len);
+
+ if (!is_kva) {
+ struct page *p;
+
+ if (mem->is_pbl)
+ p = siw_get_pblpage(
+ mem, sge->laddr + sge_off,
+ &pbl_idx);
+ else
+ p = siw_get_upage(mem->umem,
+ sge->laddr + sge_off);
+ if (unlikely(!p)) {
+ if (hdr_len)
+ seg--;
+ if (!c_tx->use_sendpage && seg) {
+ siw_unmap_pages(page_array,
+ hdr_len, seg);
+ }
+ wqe->processed -= c_tx->bytes_unsent;
+ rv = -EFAULT;
+ goto done_crc;
+ }
+ page_array[seg] = p;
+
+ if (!c_tx->use_sendpage) {
+ iov[seg].iov_base = kmap(p) + fp_off;
+ iov[seg].iov_len = plen;
+ if (do_crc)
+ crypto_shash_update(
+ c_tx->mpa_crc_hd,
+ iov[seg].iov_base,
+ plen);
+ } else if (do_crc)
+ crypto_shash_update(
+ c_tx->mpa_crc_hd,
+ page_address(p) + fp_off,
+ plen);
+ } else {
+ u64 pa = ((sge->laddr + sge_off) & PAGE_MASK);
+
+ page_array[seg] = virt_to_page(pa);
+ if (do_crc)
+ crypto_shash_update(
+ c_tx->mpa_crc_hd,
+ (void *)(sge->laddr + sge_off),
+ plen);
+ }
+
+ sge_len -= plen;
+ sge_off += plen;
+ data_len -= plen;
+ fp_off = 0;
+
+ if (++seg > (int)MAX_ARRAY) {
+ siw_dbg_qp(tx_qp(c_tx), "to many fragments\n");
+ if (!is_kva && !c_tx->use_sendpage) {
+ siw_unmap_pages(page_array, hdr_len,
+ seg - 1);
+ }
+ wqe->processed -= c_tx->bytes_unsent;
+ rv = -EMSGSIZE;
+ goto done_crc;
+ }
+ }
+sge_done:
+ /* Update SGE variables at end of SGE */
+ if (sge_off == sge->length &&
+ (data_len != 0 || wqe->processed < wqe->bytes)) {
+ sge_idx++;
+ sge++;
+ sge_off = 0;
+ }
+ }
+ /* trailer */
+ if (likely(c_tx->state != SIW_SEND_TRAILER)) {
+ iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad];
+ iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad);
+ } else {
+ iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent];
+ iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent;
+ }
+
+ if (c_tx->pad) {
+ *(u32 *)c_tx->trailer.pad = 0;
+ if (do_crc)
+ crypto_shash_update(c_tx->mpa_crc_hd,
+ (u8 *)&c_tx->trailer.crc - c_tx->pad,
+ c_tx->pad);
+ }
+ if (!c_tx->mpa_crc_hd)
+ c_tx->trailer.crc = 0;
+ else if (do_crc)
+ crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc);
+
+ data_len = c_tx->bytes_unsent;
+
+ if (c_tx->use_sendpage) {
+ rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx],
+ c_tx->sge_off, data_len);
+ if (rv == data_len) {
+ rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len);
+ if (rv > 0)
+ rv += data_len;
+ else
+ rv = data_len;
+ }
+ } else {
+ rv = kernel_sendmsg(s, &msg, iov, seg + 1,
+ hdr_len + data_len + trl_len);
+ if (!is_kva)
+ siw_unmap_pages(page_array, hdr_len, seg);
+ }
+ if (rv < (int)hdr_len) {
+ /* Not even complete hdr pushed or negative rv */
+ wqe->processed -= data_len;
+ if (rv >= 0) {
+ c_tx->ctrl_sent += rv;
+ rv = -EAGAIN;
+ }
+ goto done_crc;
+ }
+ rv -= hdr_len;
+
+ if (rv >= (int)data_len) {
+ /* all user data pushed to TCP or no data to push */
+ if (data_len > 0 && wqe->processed < wqe->bytes) {
+ /* Save the current state for next tx */
+ c_tx->sge_idx = sge_idx;
+ c_tx->sge_off = sge_off;
+ c_tx->pbl_idx = pbl_idx;
+ }
+ rv -= data_len;
+
+ if (rv == trl_len) /* all pushed */
+ rv = 0;
+ else {
+ c_tx->state = SIW_SEND_TRAILER;
+ c_tx->ctrl_len = MAX_TRAILER;
+ c_tx->ctrl_sent = rv + 4 - c_tx->pad;
+ c_tx->bytes_unsent = 0;
+ rv = -EAGAIN;
+ }
+
+ } else if (data_len > 0) {
+ /* Maybe some user data pushed to TCP */
+ c_tx->state = SIW_SEND_DATA;
+ wqe->processed -= data_len - rv;
+
+ if (rv) {
+ /*
+ * Some bytes out. Recompute tx state based
+ * on old state and bytes pushed
+ */
+ unsigned int sge_unsent;
+
+ c_tx->bytes_unsent -= rv;
+ sge = &wqe->sqe.sge[c_tx->sge_idx];
+ sge_unsent = sge->length - c_tx->sge_off;
+
+ while (sge_unsent <= rv) {
+ rv -= sge_unsent;
+ c_tx->sge_idx++;
+ c_tx->sge_off = 0;
+ sge++;
+ sge_unsent = sge->length;
+ }
+ c_tx->sge_off += rv;
+ }
+ rv = -EAGAIN;
+ }
+done_crc:
+ c_tx->do_crc = 0;
+done:
+ return rv;
+}
+
+/*
+ * siw_update_tcpseg()
+ *
+ * Recompute c_tx->tcp_seglen from the socket's current mss_cache and
+ * gso_segs values. If the socket reports GSO segments, the segment
+ * count is capped by c_tx->gso_seg_limit unless that limit is zero.
+ * The result is rounded down to a multiple of 8 bytes.
+ */
+static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx,
+			      struct socket *s)
+{
+	struct tcp_sock *tp = tcp_sk(s->sk);
+	u16 segs = 1;
+
+	if (tp->gso_segs) {
+		segs = tp->gso_segs;
+		if (c_tx->gso_seg_limit != 0)
+			segs = min_t(u16, c_tx->gso_seg_limit, segs);
+	}
+	/* Loopback may give odd numbers */
+	c_tx->tcp_seglen = (tp->mss_cache * segs) & 0xfffffff8;
+}
+
+/*
+ * siw_prepare_fpdu()
+ *
+ * Set up the transmit context for sending one FPDU which carries
+ * non-immediate user data. The DDP payload is limited such that
+ * header, payload and MPA CRC fit into c_tx->tcp_seglen.
+ *
+ * @qp:		QP from which to transmit
+ * @wqe:	Current WQE causing transmission
+ *
+ * TODO: Take into account real available sendspace on socket
+ *       to avoid header misalignment due to send pausing within
+ *       fpdu transmission
+ */
+static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+	int payload;
+
+	c_tx->ctrl_len =
+		iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len;
+	c_tx->ctrl_sent = 0;
+
+	/* Update target buffer offset if any */
+	if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
+		/* Tagged message */
+		c_tx->pkt.c_tagged.ddp_to =
+			cpu_to_be64(wqe->sqe.raddr + wqe->processed);
+	else
+		/* Untagged message */
+		c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed);
+
+	payload = wqe->bytes - wqe->processed;
+	if (payload + c_tx->ctrl_len + MPA_CRC_SIZE <= c_tx->tcp_seglen) {
+		/* Remaining data fits: last segment, pad to 4 byte boundary */
+		c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST;
+		c_tx->pad = -payload & 0x3;
+	} else {
+		/* Trim DDP payload to fit into current TCP segment */
+		payload = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE);
+		c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST;
+		c_tx->pad = 0;
+	}
+	c_tx->bytes_unsent = payload;
+
+	c_tx->pkt.ctrl.mpa_len =
+		htons(c_tx->ctrl_len + payload - MPA_HDR_SIZE);
+
+	/* No CRC handle, no CRC to compute */
+	if (!c_tx->mpa_crc_hd)
+		return;
+
+	/* Init MPA CRC computation, starting with the header */
+	crypto_shash_init(c_tx->mpa_crc_hd);
+	crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt,
+			    c_tx->ctrl_len);
+	c_tx->do_crc = 1;
+}
+
+/*
+ * siw_check_sgl_tx()
+ *
+ * Validate all SGE's of a WQE's scatter/gather list via siw_check_sge().
+ * Memory resolved by that check is stored in the matching wqe->mem[]
+ * slot.
+ *
+ * @pd:		Protection Domain SGL should belong to
+ * @wqe:	WQE to be checked
+ * @perms:	requested access permissions
+ *
+ * Returns the accumulated length of all SGE's, -EINVAL if the WQE
+ * has more than SIW_MAX_SGE elements, or the result of the first
+ * siw_check_sge() call which did not yield E_ACCESS_OK.
+ */
+
+static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe,
+			    enum ib_access_flags perms)
+{
+	int num_sge = wqe->sqe.num_sge;
+	int idx, total = 0;
+
+	if (unlikely(num_sge > SIW_MAX_SGE))
+		return -EINVAL;
+
+	for (idx = 0; num_sge; num_sge--, idx++) {
+		struct siw_sge *sge = &wqe->sqe.sge[idx];
+		int rv;
+
+		/* rdma verbs: do not check stag for a zero length sge */
+		if (!sge->length)
+			continue;
+
+		rv = siw_check_sge(pd, sge, &wqe->mem[idx], perms, 0,
+				   sge->length);
+		if (unlikely(rv != E_ACCESS_OK))
+			return rv;
+
+		total += sge->length;
+	}
+	return total;
+}
+
+/*
+ * siw_qp_sq_proc_tx()
+ *
+ * Process one WQE which needs transmission on the wire.
+ *
+ * A WQE found in SIW_WR_QUEUED state is set up first: for non-inline
+ * WQE's other than READ and READ_LOCAL_INV its SGL is resolved via
+ * siw_check_sgl_tx() and the result becomes wqe->bytes. For inline
+ * WQE's wqe->bytes is taken from the first SGE. The WQE is then
+ * marked SIW_WR_INPROGRESS and the first packet is prepared.
+ * Afterwards FPDUs are sent until the WQE is done, sending fails or
+ * pauses, or the burst credit is used up.
+ *
+ * Returns:
+ *   0             WQE is idle, or WQE got completely sent
+ *   -EINPROGRESS  burst credit exhausted before the WQE was done
+ *   -ECONNABORTED tx got suspended after a complete segment was sent
+ *   -EINVAL       SGL check failed or inline data too large
+ *   otherwise the non-zero result of siw_qp_prepare_tx(),
+ *   siw_tx_ctrl() or siw_tx_hdt().
+ *
+ * Failures during WQE setup additionally call siw_init_terminate().
+ */
+static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+ struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
+ struct socket *s = qp->attrs.sk;
+ int rv = 0, burst_len = qp->tx_ctx.burst;
+ enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM;
+
+ if (unlikely(wqe->wr_status == SIW_WR_IDLE))
+ return 0;
+
+ if (!burst_len) /* no credit left over from a previous call */
+ burst_len = SQ_USER_MAXBURST;
+
+ if (wqe->wr_status == SIW_WR_QUEUED) {
+ if (!(wqe->sqe.flags & SIW_WQE_INLINE)) {
+ if (tx_type(wqe) == SIW_OP_READ_RESPONSE)
+ wqe->sqe.num_sge = 1;
+
+ if (tx_type(wqe) != SIW_OP_READ &&
+ tx_type(wqe) != SIW_OP_READ_LOCAL_INV) {
+ /*
+ * Reference memory to be tx'd w/o checking
+ * access for LOCAL_READ permission, since
+ * not defined in RDMA core.
+ */
+ rv = siw_check_sgl_tx(qp->pd, wqe, 0);
+ if (rv < 0) {
+ if (tx_type(wqe) ==
+ SIW_OP_READ_RESPONSE)
+ ecode = siw_rdmap_error(-rv);
+ rv = -EINVAL;
+ goto tx_error;
+ }
+ wqe->bytes = rv; /* total SGL length */
+ } else {
+ wqe->bytes = 0;
+ }
+ } else {
+ wqe->bytes = wqe->sqe.sge[0].length;
+ if (!qp->kernel_verbs) {
+ if (wqe->bytes > SIW_MAX_INLINE) {
+ rv = -EINVAL;
+ goto tx_error;
+ }
+ wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1]; /* source address is the storage starting at sge[1] */
+ }
+ }
+ wqe->wr_status = SIW_WR_INPROGRESS;
+ wqe->processed = 0;
+
+ siw_update_tcpseg(c_tx, s);
+
+ rv = siw_qp_prepare_tx(c_tx);
+ if (rv == PKT_FRAGMENTED) {
+ c_tx->state = SIW_SEND_HDR;
+ siw_prepare_fpdu(qp, wqe);
+ } else if (rv == PKT_COMPLETE) {
+ c_tx->state = SIW_SEND_SHORT_FPDU;
+ } else {
+ goto tx_error;
+ }
+ }
+
+next_segment:
+ siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n",
+ tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed,
+ wqe->sqe.id);
+
+ if (--burst_len == 0) { /* burst credit used up: stop here */
+ rv = -EINPROGRESS;
+ goto tx_done;
+ }
+ if (c_tx->state == SIW_SEND_SHORT_FPDU) {
+ enum siw_opcode tx_type = tx_type(wqe);
+ unsigned int msg_flags;
+
+ if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1)
+ /*
+ * End current TCP segment, if SQ runs empty,
+ * or siw_tcp_nagle is not set, or we bail out
+ * soon due to no burst credit left.
+ */
+ msg_flags = MSG_DONTWAIT;
+ else
+ msg_flags = MSG_DONTWAIT | MSG_MORE;
+
+ rv = siw_tx_ctrl(c_tx, s, msg_flags);
+
+ if (!rv && tx_type != SIW_OP_READ &&
+ tx_type != SIW_OP_READ_LOCAL_INV)
+ wqe->processed = wqe->bytes;
+
+ goto tx_done;
+
+ } else {
+ rv = siw_tx_hdt(c_tx, s);
+ }
+ if (!rv) {
+ /*
+ * One segment sent. Processing completed if last
+ * segment, Do next segment otherwise.
+ */
+ if (unlikely(c_tx->tx_suspend)) {
+ /*
+ * Verbs, 6.4.: Try stopping sending after a full
+ * DDP segment if the connection goes down
+ * (== peer halfclose)
+ */
+ rv = -ECONNABORTED;
+ goto tx_done;
+ }
+ if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) {
+ siw_dbg_qp(qp, "WQE completed\n");
+ goto tx_done;
+ }
+ c_tx->state = SIW_SEND_HDR;
+
+ siw_update_tcpseg(c_tx, s);
+
+ siw_prepare_fpdu(qp, wqe);
+ goto next_segment;
+ }
+tx_done:
+ qp->tx_ctx.burst = burst_len; /* keep remaining credit for the next call */
+ return rv;
+
+tx_error:
+ if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM)
+ siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+ RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1);
+ else
+ siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+ RDMAP_ETYPE_CATASTROPHIC,
+ RDMAP_ECODE_UNSPECIFIED, 1);
+ return rv;
+}
+
+/*
+ * siw_fastreg_mr()
+ *
+ * Process a REG_MR work request: check the memory object referenced
+ * by the upper 24 bits of sqe->rkey against the base MR and the PD,
+ * then update its STag, access permissions and virtual address and
+ * mark the STag valid.
+ *
+ * @pd:		Protection Domain the memory object must belong to
+ * @sqe:	Send queue element carrying rkey, access flags and base MR
+ *
+ * Returns 0 on success and -EINVAL if the base MR is missing, the
+ * STag is unknown, does not match the base MR, belongs to another
+ * PD or is already valid.
+ */
+static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe)
+{
+	struct ib_mr *base_mr = (struct ib_mr *)sqe->base_mr;
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mem *mem;
+	int rv = 0;
+
+	siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey);
+
+	/*
+	 * Check the base MR before looking up the memory object.
+	 * The lookup result is released with siw_mem_put() below, so
+	 * bailing out on a missing base MR after a successful lookup
+	 * would leave that put undone.
+	 */
+	if (unlikely(!base_mr)) {
+		pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
+		return -EINVAL;
+	}
+	mem = siw_mem_id2obj(sdev, sqe->rkey >> 8);
+	if (unlikely(!mem)) {
+		pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
+		return -EINVAL;
+	}
+	if (unlikely(base_mr->rkey >> 8 != sqe->rkey >> 8)) {
+		pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey);
+		rv = -EINVAL;
+		goto out;
+	}
+	if (unlikely(mem->pd != pd)) {
+		pr_warn("siw: fastreg: PD mismatch\n");
+		rv = -EINVAL;
+		goto out;
+	}
+	if (unlikely(mem->stag_valid)) {
+		pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey);
+		rv = -EINVAL;
+		goto out;
+	}
+	/* Refresh STag since user may have changed key part */
+	mem->stag = sqe->rkey;
+	mem->perms = sqe->access;
+
+	siw_dbg_mem(mem, "STag now valid, MR va: 0x%016llx -> 0x%016llx\n",
+		    mem->va, base_mr->iova);
+	mem->va = base_mr->iova;
+	mem->stag_valid = 1;
+out:
+	siw_mem_put(mem);
+	return rv;
+}
+
+/*
+ * siw_qp_sq_proc_local()
+ *
+ * Process a WQE which is handled locally: REG_MR is passed to
+ * siw_fastreg_mr() and INVAL_STAG to siw_invalidate_stag(). Any
+ * other opcode yields -EINVAL.
+ */
+static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+	enum siw_opcode op = tx_type(wqe);
+
+	if (op == SIW_OP_REG_MR)
+		return siw_fastreg_mr(qp->pd, &wqe->sqe);
+
+	if (op == SIW_OP_INVAL_STAG)
+		return siw_invalidate_stag(qp->pd, wqe->sqe.rkey);
+
+	return -EINVAL;
+}
+
+/*
+ * siw_qp_sq_process()
+ *
+ * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket.
+ * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more
+ * MPA FPDUs, each containing a DDP segment.
+ *
+ * SQ processing may occur in user context as a result of posting
+ * new WQE's or from siw_sq_work_handler() context. Processing in
+ * user context is limited to non-kernel verbs users.
+ *
+ * SQ processing may get paused anytime, possibly in the middle of a WR
+ * or FPDU, if insufficient send space is available. SQ processing
+ * gets resumed from siw_sq_work_handler(), if send space becomes
+ * available again.
+ *
+ * Must be called with the QP state read-locked.
+ *
+ * Note:
+ * An outbound RREQ can be satisfied by the corresponding RRESP
+ * _before_ it gets assigned to the ORQ. This happens regularly
+ * in RDMA READ via loopback case. Since both outbound RREQ and
+ * inbound RRESP can be handled by the same CPU, locking the ORQ
+ * is dead-lock prone and thus not an option. With that, the
+ * RREQ gets assigned to the ORQ _before_ being sent - see
+ * siw_activate_tx() - and pulled back in case of send failure.
+ *
+ * Returns:
+ *   0 if TX is suspended or if sending paused with -EAGAIN,
+ *   the result of siw_activate_tx() if that is <= 0 after a WQE
+ *   got completed,
+ *   the result of siw_sq_start() if the burst credit ran out,
+ *   the negative WQE processing result on failure, or -EINVAL for
+ *   an undefined WQE type.
+ */
+int siw_qp_sq_process(struct siw_qp *qp)
+{
+ struct siw_wqe *wqe = tx_wqe(qp);
+ enum siw_opcode tx_type;
+ unsigned long flags;
+ int rv = 0;
+
+ siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe));
+
+next_wqe:
+ /*
+ * Stop QP processing if SQ state changed
+ */
+ if (unlikely(qp->tx_ctx.tx_suspend)) {
+ siw_dbg_qp(qp, "tx suspended\n");
+ goto done;
+ }
+ tx_type = tx_type(wqe);
+
+ if (tx_type <= SIW_OP_READ_RESPONSE) /* WQE needs transmission on the wire */
+ rv = siw_qp_sq_proc_tx(qp, wqe);
+ else
+ rv = siw_qp_sq_proc_local(qp, wqe);
+
+ if (!rv) {
+ /*
+ * WQE processing done
+ */
+ switch (tx_type) {
+ case SIW_OP_SEND:
+ case SIW_OP_SEND_REMOTE_INV:
+ case SIW_OP_WRITE:
+ siw_wqe_put_mem(wqe, tx_type);
+ /* Fall through */
+
+ case SIW_OP_INVAL_STAG:
+ case SIW_OP_REG_MR:
+ if (tx_flags(wqe) & SIW_WQE_SIGNALLED)
+ siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+ SIW_WC_SUCCESS);
+ break;
+
+ case SIW_OP_READ:
+ case SIW_OP_READ_LOCAL_INV:
+ /*
+ * already enqueued to ORQ queue
+ */
+ break;
+
+ case SIW_OP_READ_RESPONSE:
+ siw_wqe_put_mem(wqe, tx_type);
+ break;
+
+ default:
+ WARN(1, "undefined WQE type %d\n", tx_type);
+ rv = -EINVAL;
+ goto done;
+ }
+
+ spin_lock_irqsave(&qp->sq_lock, flags);
+ wqe->wr_status = SIW_WR_IDLE;
+ rv = siw_activate_tx(qp); /* NOTE(review): presumably > 0 means another WQE got activated - confirm */
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+ if (rv <= 0)
+ goto done;
+
+ goto next_wqe;
+
+ } else if (rv == -EAGAIN) {
+ siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n",
+ qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len,
+ qp->tx_ctx.bytes_unsent);
+ rv = 0; /* pausing is not reported as an error */
+ goto done;
+ } else if (rv == -EINPROGRESS) {
+ rv = siw_sq_start(qp); /* hand remaining work to the TX thread */
+ goto done;
+ } else {
+ /*
+ * WQE processing failed.
+ * Verbs 8.3.2:
+ * o It turns any WQE into a signalled WQE.
+ * o Local catastrophic error must be surfaced
+ * o QP must be moved into Terminate state: done by code
+ * doing socket state change processing
+ *
+ * o TODO: Termination message must be sent.
+ * o TODO: Implement more precise work completion errors,
+ * see enum ib_wc_status in ib_verbs.h
+ */
+ siw_dbg_qp(qp, "wqe type %d processing failed: %d\n",
+ tx_type(wqe), rv);
+
+ spin_lock_irqsave(&qp->sq_lock, flags);
+ /*
+ * RREQ may have already been completed by inbound RRESP!
+ */
+ if (tx_type == SIW_OP_READ ||
+ tx_type == SIW_OP_READ_LOCAL_INV) {
+ /* Cleanup pending entry in ORQ */
+ qp->orq_put--;
+ qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0;
+ }
+ spin_unlock_irqrestore(&qp->sq_lock, flags);
+ /*
+ * immediately suspends further TX processing
+ */
+ if (!qp->tx_ctx.tx_suspend)
+ siw_qp_cm_drop(qp, 0);
+
+ switch (tx_type) {
+ case SIW_OP_SEND:
+ case SIW_OP_SEND_REMOTE_INV:
+ case SIW_OP_SEND_WITH_IMM:
+ case SIW_OP_WRITE:
+ case SIW_OP_READ:
+ case SIW_OP_READ_LOCAL_INV:
+ siw_wqe_put_mem(wqe, tx_type);
+ /* Fall through */
+
+ case SIW_OP_INVAL_STAG:
+ case SIW_OP_REG_MR:
+ siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
+ SIW_WC_LOC_QP_OP_ERR);
+
+ siw_qp_event(qp, IB_EVENT_QP_FATAL);
+
+ break;
+
+ case SIW_OP_READ_RESPONSE:
+ siw_dbg_qp(qp, "proc. read.response failed: %d\n", rv);
+
+ siw_qp_event(qp, IB_EVENT_QP_REQ_ERR);
+
+ siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE);
+
+ break;
+
+ default:
+ WARN(1, "undefined WQE type %d\n", tx_type);
+ rv = -EINVAL;
+ }
+ wqe->wr_status = SIW_WR_IDLE;
+ }
+done:
+ return rv;
+}
+
+/*
+ * siw_sq_resume()
+ *
+ * Resume SQ processing of @qp, provided the QP state lock can be
+ * read-locked without waiting, the QP is in RTS state and TX is not
+ * suspended. A failing SQ run drops the connection unless TX got
+ * suspended meanwhile. Always releases one QP reference.
+ */
+static void siw_sq_resume(struct siw_qp *qp)
+{
+	int rv;
+
+	if (!down_read_trylock(&qp->state_lock)) {
+		siw_dbg_qp(qp, "Resume SQ while QP locked\n");
+		goto out_put;
+	}
+	if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS ||
+		     qp->tx_ctx.tx_suspend)) {
+		up_read(&qp->state_lock);
+		goto out_put;
+	}
+	rv = siw_qp_sq_process(qp);
+
+	up_read(&qp->state_lock);
+
+	if (unlikely(rv < 0)) {
+		siw_dbg_qp(qp, "SQ task failed: err %d\n", rv);
+
+		if (!qp->tx_ctx.tx_suspend)
+			siw_qp_cm_drop(qp, 0);
+	}
+out_put:
+	siw_qp_put(qp);
+}
+
+struct tx_task_t { /* work state of one TX thread, see siw_run_sq() */
+ struct llist_head active; /* QPs queued for SQ processing by siw_sq_start() */
+ wait_queue_head_t waiting; /* siw_run_sq() sleeps here until work arrives or it shall stop */
+};
+
+static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g); /* one instance per CPU, indexed by the CPU number */
+
+/*
+ * siw_stop_tx_thread()
+ *
+ * Stop the TX thread of CPU @nr_cpu and wake up waiters on that
+ * CPU's TX task wait queue afterwards.
+ */
+void siw_stop_tx_thread(int nr_cpu)
+{
+	struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu);
+
+	kthread_stop(siw_tx_thread[nr_cpu]);
+	wake_up(&tx_task->waiting);
+}
+
+/*
+ * Resume SQ processing for each QP on a list which is already
+ * detached from the per-CPU active list.
+ */
+static void siw_sq_resume_list(struct llist_node *node)
+{
+	while (node) {
+		struct siw_qp *qp = container_of(node, struct siw_qp, tx_list);
+
+		/*
+		 * Step to the successor before unlinking the entry:
+		 * siw_sq_resume() ends with siw_qp_put(), so @qp must
+		 * not be dereferenced once it has been called.
+		 */
+		node = llist_next(node);
+		qp->tx_list.next = NULL;
+
+		siw_sq_resume(qp);
+	}
+}
+
+/*
+ * siw_run_sq()
+ *
+ * Thread function of the per-CPU TX thread. Sleeps until QPs got
+ * queued to this CPU's active list or the thread shall stop, and
+ * resumes SQ processing for all queued QPs in the order they were
+ * added. QPs still queued when the thread stops are resumed once
+ * more before the thread returns.
+ *
+ * @data:	CPU number the thread is serving, passed as pointer value
+ *
+ * Always returns 0.
+ */
+int siw_run_sq(void *data)
+{
+	const int nr_cpu = (unsigned int)(long)data;
+	struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu);
+
+	init_llist_head(&tx_task->active);
+	init_waitqueue_head(&tx_task->waiting);
+
+	while (1) {
+		struct llist_node *active, *fifo_list = NULL;
+
+		wait_event_interruptible(tx_task->waiting,
+					 !llist_empty(&tx_task->active) ||
+						 kthread_should_stop());
+
+		if (kthread_should_stop())
+			break;
+
+		active = llist_del_all(&tx_task->active);
+		/*
+		 * llist_del_all returns a list with newest entry first.
+		 * Re-order list for fairness among QP's.
+		 */
+		while (active) {
+			struct llist_node *tmp = active;
+
+			active = llist_next(active);
+			tmp->next = fifo_list;
+			fifo_list = tmp;
+		}
+		siw_sq_resume_list(fifo_list);
+	}
+	/*
+	 * Final drain. Do not iterate with llist_for_each_entry() here:
+	 * each entry gets unlinked and its QP reference dropped while
+	 * walking, which would end the walk after the first entry and
+	 * touch a QP that may already be gone.
+	 */
+	siw_sq_resume_list(llist_del_all(&tx_task->active));
+
+	return 0;
+}
+
+/*
+ * siw_sq_start()
+ *
+ * Queue @qp to the TX thread of its TX CPU and wake that thread up.
+ * Nothing is done if the current TX WQE is idle. If the QP's TX CPU
+ * went offline, a new one is picked first.
+ *
+ * Takes a QP reference for the queued work.
+ *
+ * Returns 0 on success, or -EIO if no TX CPU is available.
+ */
+int siw_sq_start(struct siw_qp *qp)
+{
+	struct tx_task_t *tx_task;
+
+	if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
+		return 0;
+
+	if (unlikely(!cpu_online(qp->tx_cpu))) {
+		siw_put_tx_cpu(qp->tx_cpu);
+		qp->tx_cpu = siw_get_tx_cpu(qp->sdev);
+		if (qp->tx_cpu < 0) {
+			pr_warn("siw: no tx cpu available\n");
+
+			return -EIO;
+		}
+	}
+	tx_task = &per_cpu(siw_tx_task_g, qp->tx_cpu);
+
+	siw_qp_get(qp);
+	llist_add(&qp->tx_list, &tx_task->active);
+	wake_up(&tx_task->waiting);
+
+	return 0;
+}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c
new file mode 100644
index 000000000000..32dc79d0e898
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_verbs.c
@@ -0,0 +1,1760 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/xarray.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+/* Maps enum ib_qp_state (used as array index) to the siw QP state. */
+static const int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = SIW_QP_STATE_IDLE,
+	[IB_QPS_INIT] = SIW_QP_STATE_IDLE,
+	[IB_QPS_RTR] = SIW_QP_STATE_RTR,
+	[IB_QPS_RTS] = SIW_QP_STATE_RTS,
+	[IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
+	[IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
+	[IB_QPS_ERR] = SIW_QP_STATE_ERROR
+};
+
+/*
+ * Printable name per enum ib_qp_state (used as array index). Each
+ * row is sized for the longest name, "RESET", including its NUL.
+ */
+static const char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
+	[IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
+	[IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE",
+	[IB_QPS_ERR] = "ERR"
+};
+
+/*
+ * siw_create_uobj()
+ *
+ * Allocate a user mmap object describing @size bytes at @vaddr and
+ * insert it into the user context's xarray under a cyclically
+ * allocated key in the range 0..SIW_UOBJ_MAX_KEY. The recorded size
+ * is @size rounded up to a multiple of the page size.
+ *
+ * Returns the key of the new object, or SIW_INVAL_UOBJ_KEY if the
+ * object could not be allocated or inserted.
+ */
+static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size)
+{
+	struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY);
+	struct siw_uobj *uobj = kzalloc(sizeof(*uobj), GFP_KERNEL);
+	u32 key;
+	int rv;
+
+	if (!uobj)
+		return SIW_INVAL_UOBJ_KEY;
+
+	rv = xa_alloc_cyclic(&uctx->xa, &key, uobj, limit,
+			     &uctx->uobj_nextkey, GFP_KERNEL);
+	if (rv < 0) {
+		kfree(uobj);
+		return SIW_INVAL_UOBJ_KEY;
+	}
+	uobj->size = PAGE_ALIGN(size);
+	uobj->addr = vaddr;
+
+	return key;
+}
+
+/*
+ * siw_get_uobj()
+ *
+ * Look up the user mmap object stored under key @off. The object is
+ * returned only if its recorded size equals @size, NULL otherwise.
+ */
+static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx,
+				     unsigned long off, u32 size)
+{
+	struct siw_uobj *uobj = xa_load(&uctx->xa, off);
+
+	if (!uobj || uobj->size != size)
+		return NULL;
+
+	return uobj;
+}
+
+/*
+ * siw_mmap()
+ *
+ * Map a user mmap object into the given VMA. The object is selected
+ * by the VMA's page offset and must match the VMA's length exactly.
+ *
+ * Returns 0 on success, -EINVAL if the VMA start is not page aligned
+ * or no matching object exists, or the error returned by
+ * remap_vmalloc_range().
+ */
+int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
+{
+	struct siw_ucontext *uctx = to_siw_ctx(ctx);
+	unsigned long off = vma->vm_pgoff;
+	int size = vma->vm_end - vma->vm_start;
+	struct siw_uobj *uobj;
+	int rv;
+
+	/* Must be page aligned */
+	if (vma->vm_start & (PAGE_SIZE - 1)) {
+		pr_warn("siw: mmap not page aligned\n");
+		return -EINVAL;
+	}
+	uobj = siw_get_uobj(uctx, off, size);
+	if (!uobj) {
+		siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n",
+			off, size);
+		return -EINVAL;
+	}
+	rv = remap_vmalloc_range(vma, uobj->addr, 0);
+	if (rv)
+		pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size);
+
+	return rv;
+}
+
+/*
+ * siw_alloc_ucontext()
+ *
+ * Initialize a new user context and report the device's vendor part
+ * id back to user space. The device's context counter is incremented
+ * first and decremented again on any failure.
+ *
+ * Returns 0 on success, -ENOMEM if more than SIW_MAX_CONTEXT contexts
+ * would exist, -EINVAL if the user response buffer is too small, or
+ * the error returned by ib_copy_to_udata().
+ */
+int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(base_ctx->device);
+	struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
+	struct siw_uresp_alloc_ctx uresp = {};
+	int rv = -ENOMEM;
+
+	if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT)
+		goto fail;
+
+	xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC);
+	ctx->uobj_nextkey = 0;
+	ctx->sdev = sdev;
+
+	uresp.dev_id = sdev->vendor_part_id;
+
+	rv = -EINVAL;
+	if (udata->outlen < sizeof(uresp))
+		goto fail;
+
+	rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+	if (rv)
+		goto fail;
+
+	siw_dbg(base_ctx->device, "success. now %d context(s)\n",
+		atomic_read(&sdev->num_ctx));
+
+	return 0;
+
+fail:
+	atomic_dec(&sdev->num_ctx);
+	siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
+		atomic_read(&sdev->num_ctx));
+
+	return rv;
+}
+
+/*
+ * siw_dealloc_ucontext()
+ *
+ * Release a user context: free any user mmap object still found in
+ * the context's xarray, destroy the xarray and decrement the
+ * device's context counter.
+ */
+void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
+{
+	struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
+	unsigned long index;
+	void *entry;
+
+	/*
+	 * Make sure all user mmap objects are gone. Since QP, CQ
+	 * and SRQ destroy routines destroy related objects, nothing
+	 * should be found here.
+	 */
+	xa_for_each(&uctx->xa, index, entry) {
+		void *orphan = xa_erase(&uctx->xa, index);
+
+		kfree(orphan);
+		pr_warn("siw: dropping orphaned uobj at %lu\n", index);
+	}
+	xa_destroy(&uctx->xa);
+	atomic_dec(&uctx->sdev->num_ctx);
+}
+
+/*
+ * siw_query_device()
+ *
+ * Fill @attr with the device's limits and capabilities. Any driver
+ * specific udata, inbound or outbound, is rejected with -EINVAL.
+ *
+ * Returns 0 on success.
+ */
+int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
+		     struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(base_dev);
+
+	if (udata->inlen || udata->outlen)
+		return -EINVAL;
+
+	memset(attr, 0, sizeof(*attr));
+
+	/* Device identity: first 6 GUID bytes are the netdev address */
+	attr->vendor_id = SIW_VENDOR_ID;
+	attr->vendor_part_id = sdev->vendor_part_id;
+	memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6);
+
+	/* Capabilities. Revisit atomic caps if RFC 7306 gets supported */
+	attr->atomic_cap = 0;
+	attr->device_cap_flags =
+		IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG;
+	attr->page_size_cap = PAGE_SIZE;
+
+	/* Queue pairs and work requests */
+	attr->max_qp = sdev->attrs.max_qp;
+	attr->max_qp_wr = sdev->attrs.max_qp_wr;
+	attr->max_send_sge = sdev->attrs.max_sge;
+	attr->max_recv_sge = sdev->attrs.max_sge;
+	attr->max_sge_rd = sdev->attrs.max_sge_rd;
+
+	/* RDMA Read resources */
+	attr->max_qp_rd_atom = sdev->attrs.max_ord;
+	attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
+	attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
+
+	/* Completion queues */
+	attr->max_cq = sdev->attrs.max_cq;
+	attr->max_cqe = sdev->attrs.max_cqe;
+
+	/* Shared receive queues */
+	attr->max_srq = sdev->attrs.max_srq;
+	attr->max_srq_wr = sdev->attrs.max_srq_wr;
+	attr->max_srq_sge = sdev->attrs.max_srq_sge;
+
+	/* Memory management */
+	attr->max_pd = sdev->attrs.max_pd;
+	attr->max_mr = sdev->attrs.max_mr;
+	attr->max_mr_size = ~0ull;
+	attr->max_mw = sdev->attrs.max_mw;
+	attr->max_fmr = sdev->attrs.max_fmr;
+	attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
+
+	return 0;
+}
+
+/*
+ * siw_query_port()
+ *
+ * Fill @attr with the port attributes. MTU values are derived from
+ * the MTU of the associated network device, port state from the
+ * siw device state.
+ *
+ * @base_dev:	Base device
+ * @port:	Port number (not evaluated)
+ * @attr:	Port attributes to be filled in
+ *
+ * Always returns 0.
+ */
+int siw_query_port(struct ib_device *base_dev, u8 port,
+		   struct ib_port_attr *attr)
+{
+	struct siw_device *sdev = to_siw_dev(base_dev);
+
+	memset(attr, 0, sizeof(*attr));
+
+	/*
+	 * max_mtu must be set before active_mtu gets derived from it,
+	 * otherwise active_mtu ends up with the zero left by memset().
+	 */
+	attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+	attr->active_mtu = attr->max_mtu;
+	attr->active_speed = 2;
+	attr->active_width = 2;
+	attr->gid_tbl_len = 1;
+	attr->max_msg_sz = -1;
+	attr->phys_state = sdev->state == IB_PORT_ACTIVE ? 5 : 3;
+	attr->pkey_tbl_len = 1;
+	attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
+	attr->state = sdev->state;
+	/*
+	 * All zero
+	 *
+	 * attr->lid = 0;
+	 * attr->bad_pkey_cntr = 0;
+	 * attr->qkey_viol_cntr = 0;
+	 * attr->sm_lid = 0;
+	 * attr->lmc = 0;
+	 * attr->max_vl_num = 0;
+	 * attr->sm_sl = 0;
+	 * attr->subnet_timeout = 0;
+	 * attr->init_type_reply = 0;
+	 */
+	return 0;
+}
+
+/*
+ * siw_get_port_immutable()
+ *
+ * Report the immutable port attributes: pkey and gid table length
+ * as provided by siw_query_port(), and iWARP as core capability.
+ *
+ * Returns 0 on success or the error returned by siw_query_port().
+ */
+int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
+			   struct ib_port_immutable *port_immutable)
+{
+	struct ib_port_attr attr;
+	int rv;
+
+	rv = siw_query_port(base_dev, port, &attr);
+	if (rv != 0)
+		return rv;
+
+	port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
+	port_immutable->gid_tbl_len = attr.gid_tbl_len;
+	port_immutable->pkey_tbl_len = attr.pkey_tbl_len;
+
+	return 0;
+}
+
+/*
+ * siw_query_pkey()
+ *
+ * Report the default pkey 0xffff, regardless of device, port and
+ * table index. Always returns 0.
+ */
+int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey)
+{
+	*pkey = 0xffff;
+
+	return 0;
+}
+
+/*
+ * siw_query_gid()
+ *
+ * Report the GID: the first 6 bytes carry the address of the
+ * associated network device, all remaining bytes are zero.
+ * Port and table index are not evaluated. Always returns 0.
+ */
+int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
+		  union ib_gid *gid)
+{
+	struct siw_device *sdev = to_siw_dev(base_dev);
+
+	/* subnet_prefix == interface_id == 0; */
+	memset(gid, 0, sizeof(*gid));
+	memcpy(gid->raw, sdev->netdev->dev_addr, 6);
+
+	return 0;
+}
+
+/*
+ * siw_alloc_pd()
+ *
+ * Account for a new Protection Domain on the device.
+ *
+ * Returns 0 on success, or -ENOMEM if more than SIW_MAX_PD PDs
+ * would exist; the PD counter is left unchanged in that case.
+ */
+int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+
+	if (atomic_inc_return(&sdev->num_pd) <= SIW_MAX_PD) {
+		siw_dbg_pd(pd, "now %d PD's(s)\n",
+			   atomic_read(&sdev->num_pd));
+		return 0;
+	}
+	atomic_dec(&sdev->num_pd);
+
+	return -ENOMEM;
+}
+
+/*
+ * siw_dealloc_pd()
+ *
+ * Release a Protection Domain by decrementing the device's PD
+ * counter.
+ */
+void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+{
+	struct siw_device *sdev;
+
+	sdev = to_siw_dev(pd->device);
+
+	siw_dbg_pd(pd, "free PD\n");
+	atomic_dec(&sdev->num_pd);
+}
+
+/* Take a reference on the siw QP behind @base_qp. */
+void siw_qp_get_ref(struct ib_qp *base_qp)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+
+	siw_qp_get(qp);
+}
+
+/* Drop a reference on the siw QP behind @base_qp. */
+void siw_qp_put_ref(struct ib_qp *base_qp)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+
+	siw_qp_put(qp);
+}
+
+/*
+ * siw_create_qp()
+ *
+ * Create QP of requested size on given device.
+ *
+ * @pd:		Protection Domain
+ * @attrs:	Initial QP attributes.
+ * @udata:	used to provide QP ID, SQ and RQ size back to user.
+ *
+ * Queue sizes are rounded up to a power of two. A QP must have a
+ * send queue; a receive queue is optional and never allocated if an
+ * SRQ is attached. For user level QPs each allocated queue gets a
+ * user mmap object whose key is passed back via @udata.
+ *
+ * Returns the new base QP, or an ERR_PTR() carrying -ENOMEM on
+ * resource shortage, -EINVAL on invalid attributes, or the error
+ * returned by siw_qp_add() or ib_copy_to_udata().
+ */
+
+struct ib_qp *siw_create_qp(struct ib_pd *pd,
+			    struct ib_qp_init_attr *attrs,
+			    struct ib_udata *udata)
+{
+	struct siw_qp *qp = NULL;
+	struct siw_base_qp *siw_base_qp = NULL;
+	struct ib_device *base_dev = pd->device;
+	struct siw_device *sdev = to_siw_dev(base_dev);
+	struct siw_ucontext *uctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+	struct siw_cq *scq = NULL, *rcq = NULL;
+	unsigned long flags;
+	int num_sqe, num_rqe, rv = 0;
+
+	siw_dbg(base_dev, "create new QP\n");
+
+	if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
+		siw_dbg(base_dev, "too many QP's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (attrs->qp_type != IB_QPT_RC) {
+		siw_dbg(base_dev, "only RC QP's supported\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
+	    (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
+	    (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
+	    (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
+		siw_dbg(base_dev, "QP size error\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
+		siw_dbg(base_dev, "max inline send: %d > %d\n",
+			attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
+		rv = -EINVAL;
+		goto err_out;
+	}
+	/*
+	 * NOTE: we allow for zero element SQ and RQ WQE's SGL's
+	 * but not for a QP unable to hold any WQE (SQ + RQ)
+	 */
+	if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
+		siw_dbg(base_dev, "QP must have send or receive queue\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	scq = to_siw_cq(attrs->send_cq);
+	rcq = to_siw_cq(attrs->recv_cq);
+
+	if (!scq || (!rcq && !attrs->srq)) {
+		siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
+		rv = -EINVAL;
+		goto err_out;
+	}
+	siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL);
+	if (!siw_base_qp) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+	if (!qp) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	siw_base_qp->qp = qp;
+	qp->ib_qp = &siw_base_qp->base_qp;
+
+	init_rwsem(&qp->state_lock);
+	spin_lock_init(&qp->sq_lock);
+	spin_lock_init(&qp->rq_lock);
+	spin_lock_init(&qp->orq_lock);
+
+	qp->kernel_verbs = !udata;
+	qp->xa_sq_index = SIW_INVAL_UOBJ_KEY;
+	qp->xa_rq_index = SIW_INVAL_UOBJ_KEY;
+
+	rv = siw_qp_add(sdev, qp);
+	if (rv)
+		goto err_out;
+
+	/* All queue indices are derived from modulo operations
+	 * on a free running 'get' (consumer) and 'put' (producer)
+	 * unsigned counter. Having queue sizes at power of two
+	 * avoids handling counter wrap around.
+	 *
+	 * roundup_pow_of_two() is undefined for zero, so zero sizes
+	 * must not be passed to it. The remainder of this function
+	 * depends on an allocated send queue, hence a zero sized SQ
+	 * is rejected explicitly.
+	 */
+	num_sqe = attrs->cap.max_send_wr;
+	if (!num_sqe) {
+		siw_dbg(base_dev, "zero sized SQ not supported\n");
+		rv = -EINVAL;
+		goto err_out_xa;
+	}
+	num_sqe = roundup_pow_of_two(num_sqe);
+
+	num_rqe = attrs->cap.max_recv_wr;
+	if (num_rqe)
+		num_rqe = roundup_pow_of_two(num_rqe);
+
+	if (qp->kernel_verbs)
+		qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe));
+	else
+		qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
+
+	if (qp->sendq == NULL) {
+		siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe);
+		rv = -ENOMEM;
+		goto err_out_xa;
+	}
+	if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
+		if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
+			qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
+		else {
+			rv = -EINVAL;
+			goto err_out_xa;
+		}
+	}
+	qp->pd = pd;
+	qp->scq = scq;
+	qp->rcq = rcq;
+
+	if (attrs->srq) {
+		/*
+		 * SRQ support.
+		 * Verbs 6.3.7: ignore RQ size, if SRQ present
+		 * Verbs 6.3.5: do not check PD of SRQ against PD of QP
+		 */
+		qp->srq = to_siw_srq(attrs->srq);
+		qp->attrs.rq_size = 0;
+		siw_dbg(base_dev, "QP [%u]: [SRQ 0x%p] attached\n",
+			qp->qp_num, qp->srq);
+	} else if (num_rqe) {
+		if (qp->kernel_verbs)
+			qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe));
+		else
+			qp->recvq =
+				vmalloc_user(num_rqe * sizeof(struct siw_rqe));
+
+		if (qp->recvq == NULL) {
+			siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe);
+			rv = -ENOMEM;
+			goto err_out_xa;
+		}
+		qp->attrs.rq_size = num_rqe;
+	}
+	qp->attrs.sq_size = num_sqe;
+	qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
+	qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;
+
+	/* Make those two tunables fixed for now. */
+	qp->tx_ctx.gso_seg_limit = 1;
+	qp->tx_ctx.zcopy_tx = zcopy_tx;
+
+	qp->attrs.state = SIW_QP_STATE_IDLE;
+
+	if (udata) {
+		struct siw_uresp_create_qp uresp = {};
+
+		uresp.num_sqe = num_sqe;
+		uresp.num_rqe = num_rqe;
+		uresp.qp_id = qp_id(qp);
+
+		/*
+		 * Create a mmap object per allocated queue and check
+		 * each result on its own. A QP attached to an SRQ, or
+		 * one with a zero sized RQ, has no receive queue of
+		 * its own; its RQ index legitimately stays invalid and
+		 * must not be taken for an allocation failure.
+		 */
+		qp->xa_sq_index =
+			siw_create_uobj(uctx, qp->sendq,
+					num_sqe * sizeof(struct siw_sqe));
+		if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY) {
+			rv = -ENOMEM;
+			goto err_out_xa;
+		}
+		uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT;
+
+		if (qp->recvq) {
+			qp->xa_rq_index =
+				siw_create_uobj(uctx, qp->recvq,
+					num_rqe * sizeof(struct siw_rqe));
+			if (qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) {
+				rv = -ENOMEM;
+				goto err_out_xa;
+			}
+			uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT;
+		} else {
+			/*
+			 * NOTE(review): assumes user space does not map
+			 * an RQ if there is none - confirm with the
+			 * user library.
+			 */
+			uresp.rq_key = SIW_INVAL_UOBJ_KEY;
+		}
+
+		if (udata->outlen < sizeof(uresp)) {
+			rv = -EINVAL;
+			goto err_out_xa;
+		}
+		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (rv)
+			goto err_out_xa;
+	}
+	qp->tx_cpu = siw_get_tx_cpu(sdev);
+	if (qp->tx_cpu < 0) {
+		rv = -EINVAL;
+		goto err_out_xa;
+	}
+	INIT_LIST_HEAD(&qp->devq);
+	spin_lock_irqsave(&sdev->lock, flags);
+	list_add_tail(&qp->devq, &sdev->qp_list);
+	spin_unlock_irqrestore(&sdev->lock, flags);
+
+	return qp->ib_qp;
+
+err_out_xa:
+	xa_erase(&sdev->qp_xa, qp_id(qp));
+err_out:
+	kfree(siw_base_qp);
+
+	if (qp) {
+		/* Indices are only valid if set for a user level QP */
+		if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
+			kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
+		if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
+			kfree(xa_erase(&uctx->xa, qp->xa_rq_index));
+
+		vfree(qp->sendq);
+		vfree(qp->recvq);
+		kfree(qp);
+	}
+	atomic_dec(&sdev->num_qp);
+
+	return ERR_PTR(rv);
+}
+
+/*
+ * Minimum siw_query_qp() verb interface.
+ *
+ * @qp_attr_mask is not used but all available information is provided
+ *
+ * Returns 0 on success, or -EINVAL if @base_qp, @qp_attr or
+ * @qp_init_attr is missing.
+ */
+int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+	struct siw_qp *qp;
+	struct siw_device *sdev;
+
+	if (!base_qp || !qp_attr || !qp_init_attr)
+		return -EINVAL;
+
+	qp = to_siw_qp(base_qp);
+	sdev = to_siw_dev(base_qp->device);
+
+	/* Queue capabilities */
+	qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
+	qp_attr->cap.max_send_wr = qp->attrs.sq_size;
+	qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
+	qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
+	qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
+
+	qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
+	qp_attr->max_rd_atomic = qp->attrs.irq_size;
+	qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;
+
+	qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
+				   IB_ACCESS_REMOTE_WRITE |
+				   IB_ACCESS_REMOTE_READ;
+
+	/* Creation time attributes as kept by the base QP */
+	qp_init_attr->qp_type = base_qp->qp_type;
+	qp_init_attr->send_cq = base_qp->send_cq;
+	qp_init_attr->recv_cq = base_qp->recv_cq;
+	qp_init_attr->srq = base_qp->srq;
+
+	qp_init_attr->cap = qp_attr->cap;
+
+	return 0;
+}
+
+/*
+ * siw_verbs_modify_qp()
+ *
+ * Translate the IB_QP_ACCESS_FLAGS and IB_QP_STATE parts of a modify QP
+ * request into siw QP attributes and apply them via siw_qp_modify()
+ * while holding the QP state lock for writing. Other bits of
+ * @attr_mask are not evaluated.
+ *
+ * @base_qp:	Base QP contained in siw QP
+ * @attr:	Requested attributes
+ * @attr_mask:	Selects the valid fields of @attr
+ * @udata:	Not used
+ *
+ * Returns 0 if there is nothing to change, otherwise the result of
+ * siw_qp_modify().
+ */
+int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
+			int attr_mask, struct ib_udata *udata)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+	struct siw_qp_attrs new_attrs;
+	enum siw_qp_attr_mask siw_attr_mask = 0;
+	int rv;
+
+	if (!attr_mask)
+		return 0;
+
+	memset(&new_attrs, 0, sizeof(new_attrs));
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS) {
+		int access = attr->qp_access_flags;
+
+		siw_attr_mask |= SIW_QP_ATTR_ACCESS_FLAGS;
+
+		if (access & IB_ACCESS_REMOTE_READ)
+			new_attrs.flags |= SIW_RDMA_READ_ENABLED;
+		if (access & IB_ACCESS_REMOTE_WRITE)
+			new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
+		if (access & IB_ACCESS_MW_BIND)
+			new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
+	}
+	if (attr_mask & IB_QP_STATE) {
+		siw_dbg_qp(qp, "desired IB QP state: %s\n",
+			   ib_qp_state_to_string[attr->qp_state]);
+
+		new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];
+		siw_attr_mask |= SIW_QP_ATTR_STATE;
+
+		/* Suspend transmit before the state change is applied */
+		if (new_attrs.state > SIW_QP_STATE_RTS)
+			qp->tx_ctx.tx_suspend = 1;
+	}
+	if (!siw_attr_mask)
+		return 0;
+
+	down_write(&qp->state_lock);
+	rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);
+	up_write(&qp->state_lock);
+
+	return rv;
+}
+
+/*
+ * siw_destroy_qp()
+ *
+ * Destroy a QP: flag it as being destroyed, drop the user mappings of
+ * its send and receive queue (if a user context is given), move it to
+ * ERROR state, release its connection endpoint reference and finally
+ * drop the QP reference and free the base QP container.
+ *
+ * @base_qp:	Base QP contained in siw QP
+ * @udata:	Used to look up the user context, may yield no context
+ *
+ * Always returns 0.
+ */
+int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+	struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp);
+	struct siw_ucontext *uctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+	struct siw_qp_attrs qp_attrs;
+
+	siw_dbg_qp(qp, "state %d, cep 0x%p\n", qp->attrs.state, qp->cep);
+
+	/*
+	 * Mark QP as in process of destruction to prevent from
+	 * any async callbacks to RDMA core
+	 */
+	qp->attrs.flags |= SIW_QP_IN_DESTROY;
+	qp->rx_stream.rx_suspend = 1;
+
+	if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
+		kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
+	if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
+		kfree(xa_erase(&uctx->xa, qp->xa_rq_index));
+
+	/*
+	 * Hand a fully initialized attribute set to siw_qp_modify(),
+	 * as siw_verbs_modify_qp() does, instead of stack garbage in
+	 * all fields but the state.
+	 */
+	memset(&qp_attrs, 0, sizeof(qp_attrs));
+	qp_attrs.state = SIW_QP_STATE_ERROR;
+
+	down_write(&qp->state_lock);
+
+	siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
+
+	if (qp->cep) {
+		siw_cep_put(qp->cep);
+		qp->cep = NULL;
+	}
+	up_write(&qp->state_lock);
+
+	kfree(qp->tx_ctx.mpa_crc_hd);
+	kfree(qp->rx_stream.mpa_crc_hd);
+
+	qp->scq = qp->rcq = NULL;
+
+	siw_qp_put(qp);
+	kfree(siw_base_qp);
+
+	return 0;
+}
+
+/*
+ * siw_copy_inline_sgl()
+ *
+ * Prepare sgl of inlined data for sending. Data from all provided
+ * sge's are copied together into the wqe, starting at the second sge
+ * slot, and are referenced by the first sge only. Zero length sge's
+ * are skipped.
+ *
+ * The source addresses are dereferenced directly via memcpy(); no
+ * address validation is done here.
+ *
+ * @core_wr:	Send WR whose sgl is to be inlined
+ * @sqe:	Send queue element receiving the data
+ *
+ * Returns the number of bytes copied (may be zero), or -EINVAL if the
+ * data does not fit into SIW_MAX_INLINE bytes. In the error case the
+ * sqe is left with zero sge's.
+ */
+static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
+			       struct siw_sqe *sqe)
+{
+	struct ib_sge *core_sge = core_wr->sg_list;
+	void *kbuf = &sqe->sge[1];
+	int num_sge = core_wr->num_sge, bytes = 0;
+
+	sqe->sge[0].laddr = (uintptr_t)kbuf;
+	sqe->sge[0].lkey = 0;
+
+	while (num_sge--) {
+		if (!core_sge->length) {
+			core_sge++;
+			continue;
+		}
+		/*
+		 * Check against the room left before accumulating:
+		 * adding an unchecked 32 bit length to the byte count
+		 * first could wrap around and let an oversized sge
+		 * pass the limit test.
+		 */
+		if (core_sge->length > SIW_MAX_INLINE - bytes) {
+			bytes = -EINVAL;
+			break;
+		}
+		memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
+		       core_sge->length);
+
+		bytes += core_sge->length;
+		kbuf += core_sge->length;
+		core_sge++;
+	}
+	sqe->sge[0].length = bytes > 0 ? bytes : 0;
+	sqe->num_sge = bytes > 0 ? 1 : 0;
+
+	return bytes;
+}
+
+/*
+ * siw_post_send()
+ *
+ * Post a list of S-WR's to a SQ.
+ *
+ * Each WR is translated into the next SQ element. An element is taken
+ * as free while its flags are zero, and it is marked SIW_WQE_VALID only
+ * after it is completely written. After queueing, SQ processing is
+ * started directly if no send WQE is currently active.
+ *
+ * @base_qp:	Base QP contained in siw QP
+ * @wr:		Null terminated list of user WR's
+ * @bad_wr:	Points to failing WR in case of synchronous failure.
+ *
+ * Returns 0 on success, otherwise a negative errno:
+ * -ENOTCONN if the QP state lock cannot be taken or the QP is not in
+ * RTS state, -ENOMEM if the SQ is full, -EINVAL for an invalid or
+ * unsupported WR or for a non-empty WR list on a user mapped SQ, or
+ * the error reported when activating or starting SQ processing.
+ */
+int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
+		  const struct ib_send_wr **bad_wr)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+	struct siw_wqe *wqe = tx_wqe(qp);
+
+	unsigned long flags;
+	int rv = 0, tx_rv;
+
+	/*
+	 * Try to acquire QP state lock. Must be non-blocking
+	 * to accommodate kernel clients needs.
+	 */
+	if (!down_read_trylock(&qp->state_lock)) {
+		*bad_wr = wr;
+		siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state);
+		return -ENOTCONN;
+	}
+	if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
+		up_read(&qp->state_lock);
+		*bad_wr = wr;
+		siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state);
+		return -ENOTCONN;
+	}
+	if (wr && !qp->kernel_verbs) {
+		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
+		up_read(&qp->state_lock);
+		*bad_wr = wr;
+		return -EINVAL;
+	}
+	spin_lock_irqsave(&qp->sq_lock, flags);
+
+	while (wr) {
+		u32 idx = qp->sq_put % qp->attrs.sq_size;
+		struct siw_sqe *sqe = &qp->sendq[idx];
+
+		if (sqe->flags) {
+			siw_dbg_qp(qp, "sq full\n");
+			rv = -ENOMEM;
+			break;
+		}
+		if (wr->num_sge > qp->attrs.sq_max_sges) {
+			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
+			rv = -EINVAL;
+			break;
+		}
+		sqe->id = wr->wr_id;
+
+		if ((wr->send_flags & IB_SEND_SIGNALED) ||
+		    (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
+			sqe->flags |= SIW_WQE_SIGNALLED;
+
+		if (wr->send_flags & IB_SEND_FENCE)
+			sqe->flags |= SIW_WQE_READ_FENCE;
+
+		switch (wr->opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_INV:
+			if (wr->send_flags & IB_SEND_SOLICITED)
+				sqe->flags |= SIW_WQE_SOLICITED;
+
+			if (!(wr->send_flags & IB_SEND_INLINE)) {
+				siw_copy_sgl(wr->sg_list, sqe->sge,
+					     wr->num_sge);
+				sqe->num_sge = wr->num_sge;
+			} else {
+				rv = siw_copy_inline_sgl(wr, sqe);
+				if (rv <= 0) {
+					rv = -EINVAL;
+					break;
+				}
+				sqe->flags |= SIW_WQE_INLINE;
+				sqe->num_sge = 1;
+			}
+			if (wr->opcode == IB_WR_SEND)
+				sqe->opcode = SIW_OP_SEND;
+			else {
+				sqe->opcode = SIW_OP_SEND_REMOTE_INV;
+				sqe->rkey = wr->ex.invalidate_rkey;
+			}
+			break;
+
+		case IB_WR_RDMA_READ_WITH_INV:
+		case IB_WR_RDMA_READ:
+			/*
+			 * iWarp restricts RREAD sink to SGL containing
+			 * 1 SGE only. we could relax to SGL with multiple
+			 * elements referring the SAME ltag or even sending
+			 * a private per-rreq tag referring to a checked
+			 * local sgl with MULTIPLE ltag's.
+			 */
+			if (unlikely(wr->num_sge != 1)) {
+				rv = -EINVAL;
+				break;
+			}
+			siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
+			/*
+			 * NOTE: zero length RREAD is allowed!
+			 */
+			sqe->raddr = rdma_wr(wr)->remote_addr;
+			sqe->rkey = rdma_wr(wr)->rkey;
+			sqe->num_sge = 1;
+
+			if (wr->opcode == IB_WR_RDMA_READ)
+				sqe->opcode = SIW_OP_READ;
+			else
+				sqe->opcode = SIW_OP_READ_LOCAL_INV;
+			break;
+
+		case IB_WR_RDMA_WRITE:
+			if (!(wr->send_flags & IB_SEND_INLINE)) {
+				siw_copy_sgl(wr->sg_list, &sqe->sge[0],
+					     wr->num_sge);
+				sqe->num_sge = wr->num_sge;
+			} else {
+				rv = siw_copy_inline_sgl(wr, sqe);
+				if (unlikely(rv < 0)) {
+					rv = -EINVAL;
+					break;
+				}
+				sqe->flags |= SIW_WQE_INLINE;
+				sqe->num_sge = 1;
+			}
+			sqe->raddr = rdma_wr(wr)->remote_addr;
+			sqe->rkey = rdma_wr(wr)->rkey;
+			sqe->opcode = SIW_OP_WRITE;
+			break;
+
+		case IB_WR_REG_MR:
+			sqe->base_mr = (uint64_t)reg_wr(wr)->mr;
+			sqe->rkey = reg_wr(wr)->key;
+			sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
+			sqe->opcode = SIW_OP_REG_MR;
+			break;
+
+		case IB_WR_LOCAL_INV:
+			sqe->rkey = wr->ex.invalidate_rkey;
+			sqe->opcode = SIW_OP_INVAL_STAG;
+			break;
+
+		default:
+			siw_dbg_qp(qp, "ib wr type %d unsupported\n",
+				   wr->opcode);
+			rv = -EINVAL;
+			break;
+		}
+		siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n",
+			   sqe->opcode, sqe->flags, (void *)sqe->id);
+
+		if (unlikely(rv < 0)) {
+			/*
+			 * The element was free when this iteration
+			 * started. Clear the flags set so far, otherwise
+			 * the slot would be taken as occupied ("sq full")
+			 * by all subsequent posts.
+			 */
+			sqe->flags = 0;
+			break;
+		}
+
+		/* make SQE only valid after completely written */
+		smp_wmb();
+		sqe->flags |= SIW_WQE_VALID;
+
+		qp->sq_put++;
+		wr = wr->next;
+	}
+
+	/*
+	 * Send directly if SQ processing is not in progress.
+	 * Eventual immediate errors (rv < 0) do not affect the involved
+	 * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ
+	 * processing, if new work is already pending. But rv must be passed
+	 * to caller.
+	 */
+	if (wqe->wr_status != SIW_WR_IDLE) {
+		spin_unlock_irqrestore(&qp->sq_lock, flags);
+		goto skip_direct_sending;
+	}
+	tx_rv = siw_activate_tx(qp);
+	spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+	if (tx_rv > 0) {
+		if (qp->kernel_verbs) {
+			tx_rv = siw_sq_start(qp);
+		} else {
+			qp->tx_ctx.in_syscall = 1;
+
+			if (siw_qp_sq_process(qp) != 0 &&
+			    !(qp->tx_ctx.tx_suspend))
+				siw_qp_cm_drop(qp, 0);
+
+			qp->tx_ctx.in_syscall = 0;
+		}
+	}
+	/*
+	 * An error hit while posting takes precedence over the result
+	 * of starting SQ processing, so that a rejected WR is never
+	 * reported as success.
+	 */
+	if (rv >= 0)
+		rv = tx_rv;
+
+skip_direct_sending:
+
+	up_read(&qp->state_lock);
+
+	if (rv >= 0)
+		return 0;
+	/*
+	 * Immediate error
+	 */
+	siw_dbg_qp(qp, "error %d\n", rv);
+
+	*bad_wr = wr;
+	return rv;
+}
+
+/*
+ * siw_post_receive()
+ *
+ * Post a list of R-WR's to a RQ.
+ *
+ * Rejected up front: QP's attached to an SRQ (-EOPNOTSUPP), a QP state
+ * lock which cannot be taken without blocking (-ENOTCONN), QP's which
+ * are not kernel verbs QP's and QP's in a state beyond RTS (-EINVAL).
+ *
+ * @base_qp:	Base QP contained in siw QP
+ * @wr:		Null terminated list of user WR's
+ * @bad_wr:	Points to failing WR in case of synchronous failure.
+ *
+ * Returns 0 on success, -ENOMEM if the RQ is full or -EINVAL if a WR
+ * carries too many sge's.
+ */
+int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
+		     const struct ib_recv_wr **bad_wr)
+{
+	struct siw_qp *qp = to_siw_qp(base_qp);
+	unsigned long flags;
+	int rv = 0;
+
+	if (qp->srq) {
+		*bad_wr = wr;
+		return -EOPNOTSUPP; /* what else from errno.h? */
+	}
+	/*
+	 * The QP state lock must be taken non-blocking to
+	 * accommodate kernel clients needs.
+	 */
+	if (!down_read_trylock(&qp->state_lock)) {
+		*bad_wr = wr;
+		return -ENOTCONN;
+	}
+	if (!qp->kernel_verbs) {
+		siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n");
+		rv = -EINVAL;
+		goto reject;
+	}
+	if (qp->attrs.state > SIW_QP_STATE_RTS) {
+		rv = -EINVAL;
+		goto reject;
+	}
+	/*
+	 * Producers may run concurrently and get serialized here.
+	 * The consumer side is single threaded and needs no lock.
+	 */
+	spin_lock_irqsave(&qp->rq_lock, flags);
+
+	for (; wr; wr = wr->next) {
+		struct siw_rqe *rqe =
+			&qp->recvq[qp->rq_put % qp->attrs.rq_size];
+
+		if (rqe->flags) {
+			siw_dbg_qp(qp, "RQ full\n");
+			rv = -ENOMEM;
+			break;
+		}
+		if (wr->num_sge > qp->attrs.rq_max_sges) {
+			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
+			rv = -EINVAL;
+			break;
+		}
+		rqe->id = wr->wr_id;
+		rqe->num_sge = wr->num_sge;
+		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
+
+		/* RQE content must be visible before it becomes valid */
+		smp_wmb();
+
+		rqe->flags = SIW_WQE_VALID;
+		qp->rq_put++;
+	}
+	spin_unlock_irqrestore(&qp->rq_lock, flags);
+
+	up_read(&qp->state_lock);
+
+	if (rv < 0) {
+		siw_dbg_qp(qp, "error %d\n", rv);
+		*bad_wr = wr;
+		return rv;
+	}
+	return 0;
+
+reject:
+	up_read(&qp->state_lock);
+	*bad_wr = wr;
+
+	return rv;
+}
+
+/*
+ * siw_destroy_cq()
+ *
+ * Flush the CQ, remove its user mapping object (if a user context is
+ * given and a mapping exists), and free the CQ queue memory.
+ *
+ * @base_cq:	Base CQ contained in siw CQ.
+ * @udata:	Used to look up the user context.
+ */
+void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(base_cq->device);
+	struct siw_cq *cq = to_siw_cq(base_cq);
+	struct siw_ucontext *ctx;
+
+	ctx = rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					base_ucontext);
+
+	siw_dbg_cq(cq, "free CQ resources\n");
+
+	siw_cq_flush(cq);
+
+	if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY) {
+		void *uobj = xa_erase(&ctx->xa, cq->xa_cq_index);
+
+		kfree(uobj);
+	}
+	atomic_dec(&sdev->num_cq);
+
+	vfree(cq->queue);
+}
+
+/*
+ * siw_create_cq()
+ *
+ * Populate CQ of requested size. The number of CQE's is rounded up to
+ * a power of two. The queue memory holds the CQE array followed by a
+ * struct siw_cq_ctrl. For user level CQ's a mapping object is created
+ * and its key, the CQ id and the CQ size are returned via @udata.
+ *
+ * @base_cq:	CQ as allocated by RDMA midlayer
+ * @attr:	Initial CQ attributes
+ * @udata:	relates to user context
+ *
+ * Returns 0 on success. -ENOMEM if the CQ limit is exceeded or memory
+ * or the mapping object cannot be allocated, -EINVAL for an invalid
+ * CQ size or a too small response buffer, or the error reported by
+ * ib_copy_to_udata().
+ */
+
+int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(base_cq->device);
+	struct siw_cq *cq = to_siw_cq(base_cq);
+	struct siw_ucontext *ctx = NULL;
+	size_t queue_len;
+	int rv, size = attr->cqe;
+
+	if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
+		siw_dbg(base_cq->device, "too many CQ's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (size < 1 || size > sdev->attrs.max_cqe) {
+		siw_dbg(base_cq->device, "CQ size error: %d\n", size);
+		rv = -EINVAL;
+		goto err_out;
+	}
+	size = roundup_pow_of_two(size);
+	cq->base_cq.cqe = size;
+	cq->num_cqe = size;
+	cq->xa_cq_index = SIW_INVAL_UOBJ_KEY;
+
+	/* CQE array plus trailing control structure */
+	queue_len = size * sizeof(struct siw_cqe) +
+		    sizeof(struct siw_cq_ctrl);
+
+	if (!udata) {
+		cq->kernel_verbs = 1;
+		cq->queue = vzalloc(queue_len);
+	} else {
+		cq->queue = vmalloc_user(queue_len);
+	}
+	if (cq->queue == NULL) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	get_random_bytes(&cq->id, 4);
+	siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);
+
+	spin_lock_init(&cq->lock);
+
+	/* The notification state lives right behind the last CQE */
+	cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify;
+
+	if (udata) {
+		struct siw_uresp_create_cq uresp = {};
+
+		ctx = rdma_udata_to_drv_context(udata, struct siw_ucontext,
+						base_ucontext);
+
+		cq->xa_cq_index = siw_create_uobj(ctx, cq->queue, queue_len);
+		if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) {
+			rv = -ENOMEM;
+			goto err_out_queue;
+		}
+		uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT;
+		uresp.cq_id = cq->id;
+		uresp.num_cqe = size;
+
+		if (udata->outlen < sizeof(uresp)) {
+			rv = -EINVAL;
+			goto err_out_queue;
+		}
+		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (rv)
+			goto err_out_queue;
+	}
+	return 0;
+
+err_out_queue:
+	/*
+	 * Only reached with the queue allocated. A valid mapping index
+	 * implies the user context was looked up above.
+	 */
+	if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
+		kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
+	vfree(cq->queue);
+	cq->queue = NULL;
+err_out:
+	siw_dbg(base_cq->device, "CQ creation failed: %d\n", rv);
+
+	atomic_dec(&sdev->num_cq);
+
+	return rv;
+}
+
+/*
+ * siw_poll_cq()
+ *
+ * Reap up to @num_cqe CQ entries via siw_reap_cqe(), filling one
+ * element of the caller provided WC array per reaped entry. Reaping
+ * stops early as soon as siw_reap_cqe() returns zero.
+ *
+ * @base_cq:	Base CQ contained in siw CQ.
+ * @num_cqe:	Maximum number of CQE's to reap.
+ * @wc:		Array of work completions to be filled by siw.
+ *
+ * Returns the number of reaped CQE's.
+ */
+int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
+{
+	struct siw_cq *cq = to_siw_cq(base_cq);
+	int reaped = 0;
+
+	while (reaped < num_cqe && siw_reap_cqe(cq, wc)) {
+		wc++;
+		reaped++;
+	}
+	return reaped;
+}
+
+/*
+ * siw_req_notify_cq()
+ *
+ * Request notification for new CQE's added to that CQ by updating the
+ * CQ notification state:
+ *   o IB_CQ_SOLICITED selected in @flags: the state is set to
+ *     SIW_NOTIFY_SOLICITED.
+ *   o otherwise: the state is set to SIW_NOTIFY_ALL.
+ *   o IB_CQ_REPORT_MISSED_EVENTS: the return value provides the
+ *     difference between the CQ put and get counters, regardless of
+ *     the notification type.
+ *
+ * @base_cq:	Base CQ contained in siw CQ.
+ * @flags:	Requested notification flags.
+ *
+ * Returns 0 unless IB_CQ_REPORT_MISSED_EVENTS is set.
+ */
+int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
+{
+	struct siw_cq *cq = to_siw_cq(base_cq);
+
+	siw_dbg_cq(cq, "flags: 0x%02x\n", flags);
+
+	if ((flags & IB_CQ_SOLICITED_MASK) != IB_CQ_SOLICITED) {
+		/* CQ event for any signalled completion */
+		smp_store_mb(*cq->notify, SIW_NOTIFY_ALL);
+	} else {
+		/* CQ event for next solicited completion */
+		smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED);
+	}
+	if (!(flags & IB_CQ_REPORT_MISSED_EVENTS))
+		return 0;
+
+	return cq->cq_put - cq->cq_get;
+}
+
+/*
+ * siw_dereg_mr()
+ *
+ * Release Memory Region: decrement the device MR count, drop the
+ * memory object attached to the MR and free the MR via kfree_rcu().
+ *
+ * @base_mr:	Base MR contained in siw MR.
+ * @udata:	points to user context, unused.
+ *
+ * Always returns 0.
+ */
+int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(base_mr->device);
+	struct siw_mr *mr = to_siw_mr(base_mr);
+
+	siw_dbg_mem(mr->mem, "deregister MR\n");
+
+	atomic_dec(&sdev->num_mr);
+	siw_mr_drop_mem(mr);
+
+	/* Free the MR container via RCU */
+	kfree_rcu(mr, rcu);
+
+	return 0;
+}
+
+/*
+ * siw_reg_user_mr()
+ *
+ * Register Memory Region.
+ *
+ * Checks the request against RLIMIT_MEMLOCK, gets the user memory via
+ * siw_umem_get(), attaches it to a new MR and, if @udata is given,
+ * merges the STag key provided by the user into lkey, rkey and STag
+ * and returns the resulting STag to the user.
+ *
+ * @pd:		Protection Domain
+ * @start:	starting address of MR (virtual address)
+ * @len:	len of MR
+ * @rnic_va:	not used by siw, except for the debug message
+ * @rights:	MR access rights
+ * @udata:	user buffer to communicate STag and Key.
+ *
+ * Returns the base MR, or an ERR_PTR() on failure.
+ */
+struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
+			      u64 rnic_va, int rights, struct ib_udata *udata)
+{
+	struct siw_mr *mr = NULL;
+	struct siw_umem *umem = NULL;
+	struct siw_ureq_reg_mr ureq;
+	struct siw_device *sdev = to_siw_dev(pd->device);
+
+	unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
+	int rv;
+
+	siw_dbg_pd(pd, "start: 0x%016llx, va: 0x%016llx, len: %llu\n",
+		   (unsigned long long)start, (unsigned long long)rnic_va,
+		   (unsigned long long)len);
+
+	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+		siw_dbg_pd(pd, "too many mr's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (!len) {
+		rv = -EINVAL;
+		goto err_out;
+	}
+	if (mem_limit != RLIM_INFINITY) {
+		unsigned long num_pages =
+			(PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
+		unsigned long locked = current->mm->locked_vm;
+
+		mem_limit >>= PAGE_SHIFT;
+
+		/*
+		 * Test for an already exceeded limit first: the
+		 * unsigned subtraction below would wrap around
+		 * otherwise and let any request pass.
+		 */
+		if (locked > mem_limit || num_pages > mem_limit - locked) {
+			siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
+				   num_pages, mem_limit, locked);
+			rv = -ENOMEM;
+			goto err_out;
+		}
+	}
+	umem = siw_umem_get(start, len, ib_access_writable(rights));
+	if (IS_ERR(umem)) {
+		rv = PTR_ERR(umem);
+		siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
+		umem = NULL;
+		goto err_out;
+	}
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
+	if (rv)
+		goto err_out;
+
+	if (udata) {
+		struct siw_uresp_reg_mr uresp = {};
+		struct siw_mem *mem = mr->mem;
+
+		if (udata->inlen < sizeof(ureq)) {
+			rv = -EINVAL;
+			goto err_out;
+		}
+		rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
+		if (rv)
+			goto err_out;
+
+		mr->base_mr.lkey |= ureq.stag_key;
+		mr->base_mr.rkey |= ureq.stag_key;
+		mem->stag |= ureq.stag_key;
+		uresp.stag = mem->stag;
+
+		if (udata->outlen < sizeof(uresp)) {
+			rv = -EINVAL;
+			goto err_out;
+		}
+		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (rv)
+			goto err_out;
+	}
+	mr->mem->stag_valid = 1;
+
+	return &mr->base_mr;
+
+err_out:
+	atomic_dec(&sdev->num_mr);
+
+	if (mr && mr->mem) {
+		/* Memory object is attached to the MR: drop it there */
+		siw_mr_drop_mem(mr);
+	} else if (umem) {
+		/*
+		 * No memory object got attached, so the umem is still
+		 * owned here and must not be leaked.
+		 * NOTE(review): assumes a failing siw_mr_add_mem()
+		 * leaves mr->mem NULL and does not release the umem
+		 * itself - confirm against siw_mr_add_mem().
+		 */
+		siw_umem_release(umem, false);
+	}
+	if (mr)
+		kfree_rcu(mr, rcu);
+
+	return ERR_PTR(rv);
+}
+
+/*
+ * siw_alloc_mr()
+ *
+ * Allocate a Memory Region of type IB_MR_TYPE_MEM_REG, backed by a
+ * physical buffer list (PBL) with room for @max_sge entries. The
+ * memory object attached to the MR is marked as PBL based.
+ *
+ * @pd:		Protection Domain
+ * @mr_type:	MR type, only IB_MR_TYPE_MEM_REG is supported
+ * @max_sge:	Maximum number of PBL entries, at most SIW_MAX_SGE_PBL
+ * @udata:	Not used
+ *
+ * Returns the base MR, or an ERR_PTR() on failure.
+ */
+struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+			   u32 max_sge, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mr *mr = NULL;
+	struct siw_pbl *pbl = NULL;
+	int rv;
+
+	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+		siw_dbg_pd(pd, "too many mr's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (mr_type != IB_MR_TYPE_MEM_REG) {
+		siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
+		rv = -EOPNOTSUPP;
+		goto err_out;
+	}
+	if (max_sge > SIW_MAX_SGE_PBL) {
+		/* max_sge is unsigned, print it as such */
+		siw_dbg_pd(pd, "too many sge's: %u\n", max_sge);
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	pbl = siw_pbl_alloc(max_sge);
+	if (IS_ERR(pbl)) {
+		rv = PTR_ERR(pbl);
+		siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
+		pbl = NULL;
+		goto err_out;
+	}
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
+	if (rv)
+		goto err_out;
+
+	mr->mem->is_pbl = 1;
+
+	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
+
+	return &mr->base_mr;
+
+err_out:
+	atomic_dec(&sdev->num_mr);
+
+	if (mr && mr->mem) {
+		/* Memory object is attached to the MR: drop it there */
+		siw_mr_drop_mem(mr);
+	} else {
+		/*
+		 * No memory object got attached, so the PBL (if any)
+		 * is still owned here and must not be leaked.
+		 * NOTE(review): assumes a failing siw_mr_add_mem()
+		 * leaves mr->mem NULL and does not free the PBL
+		 * itself - confirm against siw_mr_add_mem().
+		 */
+		kfree(pbl);
+	}
+	if (mr)
+		kfree_rcu(mr, rcu);
+
+	siw_dbg_pd(pd, "failed: %d\n", rv);
+
+	return ERR_PTR(rv);
+}
+
+/*
+ * siw_set_pbl_page()
+ *
+ * Page callback handed to ib_sg_to_pages() by siw_map_mr_sg().
+ * Just used to count number of pages being mapped: both arguments
+ * are ignored and 0 is returned unconditionally.
+ */
+static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
+{
+ return 0;
+}
+
+/*
+ * siw_map_mr_sg()
+ *
+ * Fill the PBL of a memory region from a scatterlist. Scatterlist
+ * elements which are adjacent in DMA address space are merged into one
+ * PBL entry. Afterwards, length and start address of the memory object
+ * are taken over from what ib_sg_to_pages() stored in the base MR.
+ *
+ * @base_mr:	Base MR contained in siw MR
+ * @sl:		Scatterlist to map
+ * @num_sle:	Number of scatterlist elements
+ * @sg_off:	Passed through to ib_sg_to_pages()
+ *
+ * Returns the result of ib_sg_to_pages(), -EINVAL if the MR has no
+ * PBL or the list contains an empty element, or -ENOMEM if the list
+ * has more elements than the PBL can hold.
+ */
+int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
+		  unsigned int *sg_off)
+{
+	struct scatterlist *slp;
+	struct siw_mr *mr = to_siw_mr(base_mr);
+	struct siw_mem *mem = mr->mem;
+	struct siw_pbl *pbl = mem->pbl;
+	struct siw_pble *pble;
+	u64 pbl_size = 0;
+	int i, rv;
+
+	if (!pbl) {
+		siw_dbg_mem(mem, "no PBL allocated\n");
+		return -EINVAL;
+	}
+	pble = pbl->pbe;
+
+	if (pbl->max_buf < num_sle) {
+		/* Report requested count first, then the PBL limit */
+		siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
+			    num_sle, pbl->max_buf);
+		return -ENOMEM;
+	}
+	for_each_sg(sl, slp, num_sle, i) {
+		if (sg_dma_len(slp) == 0) {
+			siw_dbg_mem(mem, "empty SGE\n");
+			return -EINVAL;
+		}
+		if (i == 0) {
+			pble->addr = sg_dma_address(slp);
+			pble->size = sg_dma_len(slp);
+			pble->pbl_off = 0;
+			pbl_size = pble->size;
+			pbl->num_buf = 1;
+		} else {
+			/* Merge PBL entries if adjacent */
+			if (pble->addr + pble->size == sg_dma_address(slp)) {
+				pble->size += sg_dma_len(slp);
+			} else {
+				pble++;
+				pbl->num_buf++;
+				pble->addr = sg_dma_address(slp);
+				pble->size = sg_dma_len(slp);
+				pble->pbl_off = pbl_size;
+			}
+			pbl_size += sg_dma_len(slp);
+		}
+		siw_dbg_mem(mem,
+			"sge[%d], size %llu, addr 0x%016llx, total %llu\n",
+			i, pble->size, pble->addr, pbl_size);
+	}
+	rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
+	if (rv > 0) {
+		mem->len = base_mr->length;
+		mem->va = base_mr->iova;
+		siw_dbg_mem(mem,
+			"%llu bytes, start 0x%016llx, %u SLE to %u entries\n",
+			mem->len, mem->va, num_sle, pbl->num_buf);
+	}
+	return rv;
+}
+
+/*
+ * siw_get_dma_mr()
+ *
+ * Create a (empty) DMA memory region, where no umem is attached.
+ * The region starts at address 0, has a length of ULONG_MAX and its
+ * STag is marked valid right away.
+ *
+ * @pd:		Protection Domain
+ * @rights:	MR access rights
+ *
+ * Returns the base MR, or an ERR_PTR() on failure.
+ */
+struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
+{
+	struct siw_device *sdev = to_siw_dev(pd->device);
+	struct siw_mr *mr;
+	int rv;
+
+	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
+		siw_dbg_pd(pd, "too many mr's\n");
+		rv = -ENOMEM;
+		goto out_dec;
+	}
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr) {
+		rv = -ENOMEM;
+		goto out_dec;
+	}
+	rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
+	if (rv)
+		goto out_free;
+
+	mr->mem->stag_valid = 1;
+
+	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);
+
+	return &mr->base_mr;
+
+out_free:
+	kfree(mr);
+out_dec:
+	atomic_dec(&sdev->num_mr);
+
+	return ERR_PTR(rv);
+}
+
+/*
+ * siw_create_srq()
+ *
+ * Create Shared Receive Queue of attributes @init_attrs
+ * within protection domain given by @pd.
+ *
+ * The number of receive queue elements is rounded up to a power of
+ * two. A non-zero SRQ limit arms the SRQ. For user level SRQ's a
+ * mapping object is created for the receive queue, and its key and
+ * the queue size are returned via @udata.
+ *
+ * @base_srq:	Base SRQ contained in siw SRQ.
+ * @init_attrs:	SRQ init attributes.
+ * @udata:	points to user context
+ *
+ * Returns 0 on success. -ENOMEM if the SRQ limit is exceeded or memory
+ * or the mapping object cannot be allocated, -EINVAL for invalid
+ * attributes or a too small response buffer, or the error reported by
+ * ib_copy_to_udata().
+ */
+int siw_create_srq(struct ib_srq *base_srq,
+		   struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	struct ib_srq_attr *attrs = &init_attrs->attr;
+	struct siw_device *sdev = to_siw_dev(base_srq->device);
+	struct siw_ucontext *ctx =
+		rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					  base_ucontext);
+	int rv;
+
+	if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
+		siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
+	    attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
+		rv = -EINVAL;
+		goto err_out;
+	}
+	srq->max_sge = attrs->max_sge;
+	srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
+	srq->xa_srq_index = SIW_INVAL_UOBJ_KEY;
+	srq->limit = attrs->srq_limit;
+	if (srq->limit)
+		srq->armed = 1;
+
+	srq->kernel_verbs = !udata;
+
+	if (udata)
+		srq->recvq =
+			vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
+	else
+		srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe));
+
+	if (srq->recvq == NULL) {
+		rv = -ENOMEM;
+		goto err_out;
+	}
+	if (udata) {
+		struct siw_uresp_create_srq uresp = {};
+
+		srq->xa_srq_index = siw_create_uobj(
+			ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe));
+
+		if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) {
+			rv = -ENOMEM;
+			goto err_out;
+		}
+		/*
+		 * Report the key in the same format as the QP and CQ
+		 * creation paths do: mapping index shifted by
+		 * PAGE_SHIFT.
+		 */
+		uresp.srq_key = srq->xa_srq_index << PAGE_SHIFT;
+		uresp.num_rqe = srq->num_rqe;
+
+		if (udata->outlen < sizeof(uresp)) {
+			rv = -EINVAL;
+			goto err_out;
+		}
+		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+		if (rv)
+			goto err_out;
+	}
+	spin_lock_init(&srq->lock);
+
+	siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: success\n", srq);
+
+	return 0;
+
+err_out:
+	if (srq->recvq) {
+		if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
+			kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
+		vfree(srq->recvq);
+	}
+	atomic_dec(&sdev->num_srq);
+
+	return rv;
+}
+
+/*
+ * siw_modify_srq()
+ *
+ * Modify SRQ. Resizing (IB_SRQ_MAX_WR) is rejected with -EOPNOTSUPP.
+ * With IB_SRQ_LIMIT, a non-zero limit is stored and arms the SRQ,
+ * a zero limit is stored and disarms it. A limit exceeding the number
+ * of SRQ elements is rejected with -EINVAL.
+ *
+ * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE
+ * parameter. siw_modify_srq() does not check the attrs->max_sge param.
+ *
+ * @base_srq:	Base SRQ contained in siw SRQ
+ * @attrs:	Requested SRQ attributes
+ * @attr_mask:	Selects the valid fields of @attrs
+ * @udata:	Not used
+ */
+int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
+		   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	unsigned long flags;
+	int rv = 0;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	if (attr_mask & IB_SRQ_MAX_WR) {
+		/* resize request not yet supported */
+		rv = -EOPNOTSUPP;
+	} else if (attr_mask & IB_SRQ_LIMIT) {
+		u32 limit = attrs->srq_limit;
+
+		if (unlikely(limit > srq->num_rqe)) {
+			rv = -EINVAL;
+		} else {
+			srq->armed = limit ? 1 : 0;
+			srq->limit = limit;
+		}
+	}
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return rv;
+}
+
+/*
+ * siw_query_srq()
+ *
+ * Query SRQ attributes: number of SRQ elements, maximum number of
+ * sge's per element and the current SRQ limit, sampled while holding
+ * the SRQ lock.
+ *
+ * @base_srq:	Base SRQ contained in siw SRQ
+ * @attrs:	Filled with the SRQ attributes
+ *
+ * Always returns 0.
+ */
+int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&srq->lock, irq_flags);
+	attrs->srq_limit = srq->limit;
+	attrs->max_sge = srq->max_sge;
+	attrs->max_wr = srq->num_rqe;
+	spin_unlock_irqrestore(&srq->lock, irq_flags);
+
+	return 0;
+}
+
+/*
+ * siw_destroy_srq()
+ *
+ * Destroy SRQ: remove its user mapping object (if a user context is
+ * given and a mapping exists), free the receive queue memory and
+ * decrement the device SRQ count.
+ *
+ * It is assumed that the SRQ is not referenced by any
+ * QP anymore - the code trusts the RDMA core environment to keep track
+ * of QP references.
+ *
+ * @base_srq:	Base SRQ contained in siw SRQ
+ * @udata:	Used to look up the user context
+ */
+void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
+{
+	struct siw_device *sdev = to_siw_dev(base_srq->device);
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	struct siw_ucontext *ctx;
+
+	ctx = rdma_udata_to_drv_context(udata, struct siw_ucontext,
+					base_ucontext);
+
+	if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY) {
+		void *uobj = xa_erase(&ctx->xa, srq->xa_srq_index);
+
+		kfree(uobj);
+	}
+	vfree(srq->recvq);
+	atomic_dec(&sdev->num_srq);
+}
+
+/*
+ * siw_post_srq_recv()
+ *
+ * Post a list of receive queue elements to SRQ. Only SRQ's created
+ * for kernel verbs are accepted.
+ * NOTE: The function does not check or lock a certain SRQ state
+ *       during the post operation. The code simply trusts the
+ *       RDMA core environment.
+ *
+ * @base_srq:	Base SRQ contained in siw SRQ
+ * @wr:		List of R-WR's
+ * @bad_wr:	Updated to failing WR if posting fails.
+ *
+ * Returns 0 on success, -ENOMEM if the SRQ is full, or -EINVAL if the
+ * SRQ is not a kernel verbs SRQ or a WR carries too many sge's.
+ */
+int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
+		      const struct ib_recv_wr **bad_wr)
+{
+	struct siw_srq *srq = to_siw_srq(base_srq);
+	unsigned long flags;
+	int rv = 0;
+
+	if (unlikely(!srq->kernel_verbs)) {
+		siw_dbg_pd(base_srq->pd,
+			   "[SRQ 0x%p]: no kernel post_recv for mapped srq\n",
+			   srq);
+		rv = -EINVAL;
+		goto out;
+	}
+	/*
+	 * The SRQ lock serializes potentially multiple producers
+	 * as well as potentially multiple consumers.
+	 */
+	spin_lock_irqsave(&srq->lock, flags);
+
+	for (; wr; wr = wr->next) {
+		struct siw_rqe *rqe =
+			&srq->recvq[srq->rq_put % srq->num_rqe];
+
+		if (rqe->flags) {
+			siw_dbg_pd(base_srq->pd, "SRQ full\n");
+			rv = -ENOMEM;
+			break;
+		}
+		if (unlikely(wr->num_sge > srq->max_sge)) {
+			siw_dbg_pd(base_srq->pd,
+				   "[SRQ 0x%p]: too many sge's: %d\n", srq,
+				   wr->num_sge);
+			rv = -EINVAL;
+			break;
+		}
+		rqe->id = wr->wr_id;
+		rqe->num_sge = wr->num_sge;
+		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);
+
+		/* S-RQE content must be visible before it becomes valid */
+		smp_wmb();
+
+		rqe->flags = SIW_WQE_VALID;
+		srq->rq_put++;
+	}
+	spin_unlock_irqrestore(&srq->lock, flags);
+out:
+	if (likely(rv >= 0))
+		return rv;
+
+	siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: error %d\n", srq, rv);
+	*bad_wr = wr;
+
+	return rv;
+}
+
+/*
+ * siw_qp_event()
+ *
+ * Report an asynchronous QP event to the event handler registered
+ * with the base QP, if any. Nothing is reported for a QP flagged
+ * SIW_QP_IN_DESTROY, i.e. one being destroyed via siw_destroy_qp().
+ *
+ * @qp:		siw QP the event relates to
+ * @etype:	Event type to report
+ */
+void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
+{
+	struct ib_qp *base_qp = qp->ib_qp;
+	struct ib_event event;
+
+	if (qp->attrs.flags & SIW_QP_IN_DESTROY)
+		return;
+
+	if (!base_qp->event_handler)
+		return;
+
+	event.event = etype;
+	event.device = base_qp->device;
+	event.element.qp = base_qp;
+
+	siw_dbg_qp(qp, "reporting event %d\n", etype);
+	base_qp->event_handler(&event, base_qp->qp_context);
+}
+
+/*
+ * siw_cq_event()
+ *
+ * Report an asynchronous CQ event to the event handler registered
+ * with the base CQ, if any.
+ *
+ * @cq:		siw CQ the event relates to
+ * @etype:	Event type to report
+ */
+void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
+{
+	struct ib_cq *base_cq = &cq->base_cq;
+	struct ib_event event;
+
+	if (!base_cq->event_handler)
+		return;
+
+	event.event = etype;
+	event.device = base_cq->device;
+	event.element.cq = base_cq;
+
+	siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
+	base_cq->event_handler(&event, base_cq->cq_context);
+}
+
+/*
+ * siw_srq_event()
+ *
+ * Report an asynchronous SRQ event to the event handler registered
+ * with the base SRQ, if any.
+ *
+ * @srq:	siw SRQ the event relates to
+ * @etype:	Event type to report
+ */
+void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
+{
+	struct ib_srq *base_srq = &srq->base_srq;
+	struct ib_event event;
+
+	if (!base_srq->event_handler)
+		return;
+
+	event.event = etype;
+	event.device = base_srq->device;
+	event.element.srq = base_srq;
+
+	siw_dbg_pd(srq->base_srq.pd,
+		   "reporting SRQ event %d\n", etype);
+	base_srq->event_handler(&event, base_srq->srq_context);
+}
+
+/*
+ * siw_port_event()
+ *
+ * Dispatch a port related event for the given siw device via
+ * ib_dispatch_event().
+ *
+ * @sdev:	siw device the event relates to
+ * @port:	Port number to report
+ * @etype:	Event type to report
+ */
+void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype)
+{
+	struct ib_device *base_dev = &sdev->base_dev;
+	struct ib_event event;
+
+	event.element.port_num = port;
+	event.device = base_dev;
+	event.event = etype;
+
+	siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);
+
+	ib_dispatch_event(&event);
+}
diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h
new file mode 100644
index 000000000000..1910869281cb
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_verbs.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#ifndef _SIW_VERBS_H
+#define _SIW_VERBS_H
+
+#include <linux/errno.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_cm.h"
+
+/*
+ * siw_copy_sgl()
+ *
+ * Copy an SGL from RDMA core representation (struct ib_sge) to
+ * local representation (struct siw_sge), element by element.
+ *
+ * @sge:	first source element; only read
+ * @siw_sge:	first destination element
+ * @num_sge:	number of elements to copy; nothing is copied if
+ *		it is zero or negative
+ *
+ * Both arrays must hold at least @num_sge elements.
+ */
+static inline void siw_copy_sgl(const struct ib_sge *sge,
+				struct siw_sge *siw_sge, int num_sge)
+{
+	int i;
+
+	/*
+	 * Bound the loop with '<' instead of counting num_sge down to
+	 * zero: a negative count then copies nothing rather than
+	 * running away over both arrays.
+	 */
+	for (i = 0; i < num_sge; i++) {
+		siw_sge[i].laddr = sge[i].addr;
+		siw_sge[i].length = sge[i].length;
+		siw_sge[i].lkey = sge[i].lkey;
+	}
+}
+
+/*
+ * Prototypes of the siw verbs functions. The definitions are not
+ * part of this header; each function is declared exactly once.
+ */
+int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata);
+void siw_dealloc_ucontext(struct ib_ucontext *base_ctx);
+int siw_query_port(struct ib_device *base_dev, u8 port,
+		   struct ib_port_attr *attr);
+int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
+			   struct ib_port_immutable *port_immutable);
+int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
+		     struct ib_udata *udata);
+int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
+		  struct ib_udata *udata);
+int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey);
+int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
+		  union ib_gid *gid);
+int siw_alloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
+void siw_dealloc_pd(struct ib_pd *base_pd, struct ib_udata *udata);
+struct ib_qp *siw_create_qp(struct ib_pd *base_pd,
+			    struct ib_qp_init_attr *attr,
+			    struct ib_udata *udata);
+int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
+			int attr_mask, struct ib_udata *udata);
+int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata);
+int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
+		  const struct ib_send_wr **bad_wr);
+int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
+		     const struct ib_recv_wr **bad_wr);
+void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata);
+int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc);
+int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags);
+struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len,
+			      u64 rnic_va, int rights, struct ib_udata *udata);
+struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type,
+			   u32 max_sge, struct ib_udata *udata);
+struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights);
+int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
+		  unsigned int *sg_off);
+int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata);
+int siw_create_srq(struct ib_srq *base_srq, struct ib_srq_init_attr *attr,
+		   struct ib_udata *udata);
+int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr,
+		   enum ib_srq_attr_mask mask, struct ib_udata *udata);
+int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr);
+void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata);
+int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
+		      const struct ib_recv_wr **bad_wr);
+int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma);
+/*
+ * Asynchronous event reporting.
+ *
+ * siw_qp_event(), siw_cq_event() and siw_srq_event() pass an ib_event
+ * of type @etype to the event handler of the affected object, if one
+ * is set; siw_qp_event() reports nothing for a QP that is flagged
+ * SIW_QP_IN_DESTROY. siw_port_event() hands a port event for @port of
+ * @sdev to ib_dispatch_event().
+ */
+void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype);
+void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype);
+void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype);
+void siw_port_event(struct siw_device *sdev, u8 port,
+		    enum ib_event_type etype);
+
+#endif