diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-15 20:38:15 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-07-15 20:38:15 -0700 |
commit | 2a3c389a0fde49b241430df806a34276568cfb29 (patch) | |
tree | 9cf35829317e8cc2aaffc4341fb824dad63fce02 /drivers/infiniband/sw | |
parent | 8de262531f5fbb7458463224a7587429800c24bf (diff) | |
parent | 0b043644c0ca601cb19943a81aa1f1455dbe9461 (diff) | |
download | linux-2a3c389a0fde49b241430df806a34276568cfb29.tar.bz2 |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"A smaller cycle this time. Notably we see another new driver, 'Soft
iWarp', and the deletion of an ancient unused driver for nes.
- Revise and simplify the signature offload RDMA MR APIs
- More progress on hoisting object allocation boiler plate code out
of the drivers
- Driver bug fixes and revisions for hns, hfi1, efa, cxgb4, qib,
i40iw
- Tree wide cleanups: struct_size, put_user_page, xarray, rst doc
conversion
- Removal of obsolete ib_ucm chardev and nes driver
- netlink based discovery of chardevs and autoloading of the modules
providing them
- Move more of the rdamvt/hfi1 uapi to include/uapi/rdma
- New driver 'siw' for software based iWarp running on top of netdev,
much like rxe's software RoCE.
- mlx5 feature to report events in their raw devx format to userspace
- Expose per-object counters through rdma tool
- Adaptive interrupt moderation for RDMA (DIM), sharing the DIM core
from netdev"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (194 commits)
RMDA/siw: Require a 64 bit arch
RDMA/siw: Mark expected switch fall-throughs
RDMA/core: Fix -Wunused-const-variable warnings
rdma/siw: Remove set but not used variable 's'
rdma/siw: Add missing dependencies on LIBCRC32C and DMA_VIRT_OPS
RDMA/siw: Add missing rtnl_lock around access to ifa
rdma/siw: Use proper enumerated type in map_cqe_status
RDMA/siw: Remove unnecessary kthread create/destroy printouts
IB/rdmavt: Fix variable shadowing issue in rvt_create_cq
RDMA/core: Fix race when resolving IP address
RDMA/core: Make rdma_counter.h compile stand alone
IB/core: Work on the caller socket net namespace in nldev_newlink()
RDMA/rxe: Fill in wc byte_len with IB_WC_RECV_RDMA_WITH_IMM
RDMA/mlx5: Set RDMA DIM to be enabled by default
RDMA/nldev: Added configuration of RDMA dynamic interrupt moderation to netlink
RDMA/core: Provide RDMA DIM support for ULPs
linux/dim: Implement RDMA adaptive moderation (DIM)
IB/mlx5: Report correctly tag matching rendezvous capability
docs: infiniband: add it to the driver-api bookset
IB/mlx5: Implement VHCA tunnel mechanism in DEVX
...
Diffstat (limited to 'drivers/infiniband/sw')
33 files changed, 11155 insertions, 332 deletions
diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile index ab48a9b60844..68e0230f8f31 100644 --- a/drivers/infiniband/sw/Makefile +++ b/drivers/infiniband/sw/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/ obj-$(CONFIG_RDMA_RXE) += rxe/ +obj-$(CONFIG_RDMA_SIW) += siw/ diff --git a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c index 0e147b32cbe9..fe99da0ff060 100644 --- a/drivers/infiniband/sw/rdmavt/ah.c +++ b/drivers/infiniband/sw/rdmavt/ah.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -119,8 +119,6 @@ int rvt_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr, rdma_copy_ah_attr(&ah->attr, ah_attr); - atomic_set(&ah->refcount, 0); - if (dev->driver_f.notify_new_ah) dev->driver_f.notify_new_ah(ibah->device, ah_attr, ah); @@ -141,8 +139,6 @@ void rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags) struct rvt_ah *ah = ibah_to_rvtah(ibah); unsigned long flags; - WARN_ON_ONCE(atomic_read(&ah->refcount)); - spin_lock_irqsave(&dev->n_ahs_lock, flags); dev->n_ahs_allocated--; spin_unlock_irqrestore(&dev->n_ahs_lock, flags); diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index a06e6da7a026..a85571a4cf57 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -60,22 +60,39 @@ static struct workqueue_struct *comp_vector_wq; * @solicited: true if @entry is solicited * * This may be called with qp->s_lock held. + * + * Return: return true on success, else return + * false if cq is full. */ -void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) +bool rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) { - struct rvt_cq_wc *wc; + struct ib_uverbs_wc *uqueue = NULL; + struct ib_wc *kqueue = NULL; + struct rvt_cq_wc *u_wc = NULL; + struct rvt_k_cq_wc *k_wc = NULL; unsigned long flags; u32 head; u32 next; + u32 tail; spin_lock_irqsave(&cq->lock, flags); + if (cq->ip) { + u_wc = cq->queue; + uqueue = &u_wc->uqueue[0]; + head = RDMA_READ_UAPI_ATOMIC(u_wc->head); + tail = RDMA_READ_UAPI_ATOMIC(u_wc->tail); + } else { + k_wc = cq->kqueue; + kqueue = &k_wc->kqueue[0]; + head = k_wc->head; + tail = k_wc->tail; + } + /* - * Note that the head pointer might be writable by user processes. - * Take care to verify it is a sane value. + * Note that the head pointer might be writable by + * user processes.Take care to verify it is a sane value. */ - wc = cq->queue; - head = wc->head; if (head >= (unsigned)cq->ibcq.cqe) { head = cq->ibcq.cqe; next = 0; @@ -83,7 +100,12 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) next = head + 1; } - if (unlikely(next == wc->tail)) { + if (unlikely(next == tail || cq->cq_full)) { + struct rvt_dev_info *rdi = cq->rdi; + + if (!cq->cq_full) + rvt_pr_err_ratelimited(rdi, "CQ is full!\n"); + cq->cq_full = true; spin_unlock_irqrestore(&cq->lock, flags); if (cq->ibcq.event_handler) { struct ib_event ev; @@ -93,30 +115,30 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) ev.event = IB_EVENT_CQ_ERR; cq->ibcq.event_handler(&ev, cq->ibcq.cq_context); } - return; + return false; } trace_rvt_cq_enter(cq, entry, head); - if (cq->ip) { - wc->uqueue[head].wr_id = entry->wr_id; - wc->uqueue[head].status = entry->status; - wc->uqueue[head].opcode = entry->opcode; - wc->uqueue[head].vendor_err = entry->vendor_err; - wc->uqueue[head].byte_len = entry->byte_len; - wc->uqueue[head].ex.imm_data = entry->ex.imm_data; - wc->uqueue[head].qp_num = entry->qp->qp_num; - wc->uqueue[head].src_qp = entry->src_qp; - wc->uqueue[head].wc_flags = entry->wc_flags; - wc->uqueue[head].pkey_index = entry->pkey_index; - wc->uqueue[head].slid = ib_lid_cpu16(entry->slid); - wc->uqueue[head].sl = entry->sl; - wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; - wc->uqueue[head].port_num = entry->port_num; + if (uqueue) { + uqueue[head].wr_id = entry->wr_id; + uqueue[head].status = entry->status; + uqueue[head].opcode = entry->opcode; + uqueue[head].vendor_err = entry->vendor_err; + uqueue[head].byte_len = entry->byte_len; + uqueue[head].ex.imm_data = entry->ex.imm_data; + uqueue[head].qp_num = entry->qp->qp_num; + uqueue[head].src_qp = entry->src_qp; + uqueue[head].wc_flags = entry->wc_flags; + uqueue[head].pkey_index = entry->pkey_index; + uqueue[head].slid = ib_lid_cpu16(entry->slid); + uqueue[head].sl = entry->sl; + uqueue[head].dlid_path_bits = entry->dlid_path_bits; + uqueue[head].port_num = entry->port_num; /* Make sure entry is written before the head index. */ - smp_wmb(); + RDMA_WRITE_UAPI_ATOMIC(u_wc->head, next); } else { - wc->kqueue[head] = *entry; + kqueue[head] = *entry; + k_wc->head = next; } - wc->head = next; if (cq->notify == IB_CQ_NEXT_COMP || (cq->notify == IB_CQ_SOLICITED && @@ -132,6 +154,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) } spin_unlock_irqrestore(&cq->lock, flags); + return true; } EXPORT_SYMBOL(rvt_cq_enter); @@ -166,43 +189,38 @@ static void send_complete(struct work_struct *work) /** * rvt_create_cq - create a completion queue - * @ibdev: the device this completion queue is attached to + * @ibcq: Allocated CQ * @attr: creation attributes * @udata: user data for libibverbs.so * * Called by ib_create_cq() in the generic verbs code. * - * Return: pointer to the completion queue or negative errno values - * for failure. + * Return: 0 on success */ -struct ib_cq *rvt_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { + struct ib_device *ibdev = ibcq->device; struct rvt_dev_info *rdi = ib_to_rvt(ibdev); - struct rvt_cq *cq; - struct rvt_cq_wc *wc; - struct ib_cq *ret; + struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); + struct rvt_cq_wc *u_wc = NULL; + struct rvt_k_cq_wc *k_wc = NULL; u32 sz; unsigned int entries = attr->cqe; int comp_vector = attr->comp_vector; + int err; if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; if (entries < 1 || entries > rdi->dparms.props.max_cqe) - return ERR_PTR(-EINVAL); + return -EINVAL; if (comp_vector < 0) comp_vector = 0; comp_vector = comp_vector % rdi->ibdev.num_comp_vectors; - /* Allocate the completion queue structure. */ - cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node); - if (!cq) - return ERR_PTR(-ENOMEM); - /* * Allocate the completion queue entries and head/tail pointers. * This is allocated separately so that it can be resized and @@ -210,17 +228,18 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, * We need to use vmalloc() in order to support mmap and large * numbers of entries. */ - sz = sizeof(*wc); - if (udata && udata->outlen >= sizeof(__u64)) - sz += sizeof(struct ib_uverbs_wc) * (entries + 1); - else - sz += sizeof(struct ib_wc) * (entries + 1); - wc = udata ? - vmalloc_user(sz) : - vzalloc_node(sz, rdi->dparms.node); - if (!wc) { - ret = ERR_PTR(-ENOMEM); - goto bail_cq; + if (udata && udata->outlen >= sizeof(__u64)) { + sz = sizeof(struct ib_uverbs_wc) * (entries + 1); + sz += sizeof(*u_wc); + u_wc = vmalloc_user(sz); + if (!u_wc) + return -ENOMEM; + } else { + sz = sizeof(struct ib_wc) * (entries + 1); + sz += sizeof(*k_wc); + k_wc = vzalloc_node(sz, rdi->dparms.node); + if (!k_wc) + return -ENOMEM; } /* @@ -228,26 +247,22 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, * See rvt_mmap() for details. */ if (udata && udata->outlen >= sizeof(__u64)) { - int err; - - cq->ip = rvt_create_mmap_info(rdi, sz, udata, wc); + cq->ip = rvt_create_mmap_info(rdi, sz, udata, u_wc); if (!cq->ip) { - ret = ERR_PTR(-ENOMEM); + err = -ENOMEM; goto bail_wc; } err = ib_copy_to_udata(udata, &cq->ip->offset, sizeof(cq->ip->offset)); - if (err) { - ret = ERR_PTR(err); + if (err) goto bail_ip; - } } spin_lock_irq(&rdi->n_cqs_lock); if (rdi->n_cqs_allocated == rdi->dparms.props.max_cq) { spin_unlock_irq(&rdi->n_cqs_lock); - ret = ERR_PTR(-ENOMEM); + err = -ENOMEM; goto bail_ip; } @@ -277,21 +292,20 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, cq->notify = RVT_CQ_NONE; spin_lock_init(&cq->lock); INIT_WORK(&cq->comptask, send_complete); - cq->queue = wc; - - ret = &cq->ibcq; + if (u_wc) + cq->queue = u_wc; + else + cq->kqueue = k_wc; trace_rvt_create_cq(cq, attr); - goto done; + return 0; bail_ip: kfree(cq->ip); bail_wc: - vfree(wc); -bail_cq: - kfree(cq); -done: - return ret; + vfree(u_wc); + vfree(k_wc); + return err; } /** @@ -300,10 +314,8 @@ done: * @udata: user data or NULL for kernel object * * Called by ib_destroy_cq() in the generic verbs code. - * - * Return: always 0 */ -int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); struct rvt_dev_info *rdi = cq->rdi; @@ -316,9 +328,6 @@ int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) kref_put(&cq->ip->ref, rvt_release_mmap_info); else vfree(cq->queue); - kfree(cq); - - return 0; } /** @@ -345,9 +354,16 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) if (cq->notify != IB_CQ_NEXT_COMP) cq->notify = notify_flags & IB_CQ_SOLICITED_MASK; - if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) && - cq->queue->head != cq->queue->tail) - ret = 1; + if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) { + if (cq->queue) { + if (RDMA_READ_UAPI_ATOMIC(cq->queue->head) != + RDMA_READ_UAPI_ATOMIC(cq->queue->tail)) + ret = 1; + } else { + if (cq->kqueue->head != cq->kqueue->tail) + ret = 1; + } + } spin_unlock_irqrestore(&cq->lock, flags); @@ -363,12 +379,14 @@ int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags) int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); - struct rvt_cq_wc *old_wc; - struct rvt_cq_wc *wc; u32 head, tail, n; int ret; u32 sz; struct rvt_dev_info *rdi = cq->rdi; + struct rvt_cq_wc *u_wc = NULL; + struct rvt_cq_wc *old_u_wc = NULL; + struct rvt_k_cq_wc *k_wc = NULL; + struct rvt_k_cq_wc *old_k_wc = NULL; if (cqe < 1 || cqe > rdi->dparms.props.max_cqe) return -EINVAL; @@ -376,17 +394,19 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) /* * Need to use vmalloc() if we want to support large #s of entries. */ - sz = sizeof(*wc); - if (udata && udata->outlen >= sizeof(__u64)) - sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); - else - sz += sizeof(struct ib_wc) * (cqe + 1); - wc = udata ? - vmalloc_user(sz) : - vzalloc_node(sz, rdi->dparms.node); - if (!wc) - return -ENOMEM; - + if (udata && udata->outlen >= sizeof(__u64)) { + sz = sizeof(struct ib_uverbs_wc) * (cqe + 1); + sz += sizeof(*u_wc); + u_wc = vmalloc_user(sz); + if (!u_wc) + return -ENOMEM; + } else { + sz = sizeof(struct ib_wc) * (cqe + 1); + sz += sizeof(*k_wc); + k_wc = vzalloc_node(sz, rdi->dparms.node); + if (!k_wc) + return -ENOMEM; + } /* Check that we can write the offset to mmap. */ if (udata && udata->outlen >= sizeof(__u64)) { __u64 offset = 0; @@ -401,11 +421,18 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) * Make sure head and tail are sane since they * might be user writable. */ - old_wc = cq->queue; - head = old_wc->head; + if (u_wc) { + old_u_wc = cq->queue; + head = RDMA_READ_UAPI_ATOMIC(old_u_wc->head); + tail = RDMA_READ_UAPI_ATOMIC(old_u_wc->tail); + } else { + old_k_wc = cq->kqueue; + head = old_k_wc->head; + tail = old_k_wc->tail; + } + if (head > (u32)cq->ibcq.cqe) head = (u32)cq->ibcq.cqe; - tail = old_wc->tail; if (tail > (u32)cq->ibcq.cqe) tail = (u32)cq->ibcq.cqe; if (head < tail) @@ -417,27 +444,36 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) goto bail_unlock; } for (n = 0; tail != head; n++) { - if (cq->ip) - wc->uqueue[n] = old_wc->uqueue[tail]; + if (u_wc) + u_wc->uqueue[n] = old_u_wc->uqueue[tail]; else - wc->kqueue[n] = old_wc->kqueue[tail]; + k_wc->kqueue[n] = old_k_wc->kqueue[tail]; if (tail == (u32)cq->ibcq.cqe) tail = 0; else tail++; } cq->ibcq.cqe = cqe; - wc->head = n; - wc->tail = 0; - cq->queue = wc; + if (u_wc) { + RDMA_WRITE_UAPI_ATOMIC(u_wc->head, n); + RDMA_WRITE_UAPI_ATOMIC(u_wc->tail, 0); + cq->queue = u_wc; + } else { + k_wc->head = n; + k_wc->tail = 0; + cq->kqueue = k_wc; + } spin_unlock_irq(&cq->lock); - vfree(old_wc); + if (u_wc) + vfree(old_u_wc); + else + vfree(old_k_wc); if (cq->ip) { struct rvt_mmap_info *ip = cq->ip; - rvt_update_mmap_info(rdi, ip, sz, wc); + rvt_update_mmap_info(rdi, ip, sz, u_wc); /* * Return the offset to mmap. @@ -461,7 +497,9 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) bail_unlock: spin_unlock_irq(&cq->lock); bail_free: - vfree(wc); + vfree(u_wc); + vfree(k_wc); + return ret; } @@ -479,7 +517,7 @@ bail_free: int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) { struct rvt_cq *cq = ibcq_to_rvtcq(ibcq); - struct rvt_cq_wc *wc; + struct rvt_k_cq_wc *wc; unsigned long flags; int npolled; u32 tail; @@ -490,7 +528,7 @@ int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) spin_lock_irqsave(&cq->lock, flags); - wc = cq->queue; + wc = cq->kqueue; tail = wc->tail; if (tail > (u32)cq->ibcq.cqe) tail = (u32)cq->ibcq.cqe; diff --git a/drivers/infiniband/sw/rdmavt/cq.h b/drivers/infiniband/sw/rdmavt/cq.h index 3ad6faf18ecb..5e26a2eb19a4 100644 --- a/drivers/infiniband/sw/rdmavt/cq.h +++ b/drivers/infiniband/sw/rdmavt/cq.h @@ -51,10 +51,9 @@ #include <rdma/rdma_vt.h> #include <rdma/rdmavt_cq.h> -struct ib_cq *rvt_create_cq(struct ib_device *ibdev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata); -int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +int rvt_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +void rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags); int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry); diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index f48240f66b8f..a6a39f01dca3 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -562,8 +562,7 @@ int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) if (ret) goto out; rvt_deinit_mregion(&mr->mr); - if (mr->umem) - ib_umem_release(mr->umem); + ib_umem_release(mr->umem); kfree(mr); out: return ret; @@ -613,8 +612,8 @@ static int rvt_set_page(struct ib_mr *ibmr, u64 addr) n = mapped_segs % RVT_SEGSZ; mr->mr.map[m]->segs[n].vaddr = (void *)addr; mr->mr.map[m]->segs[n].length = ps; - trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps); mr->mr.length += ps; + trace_rvt_mr_page_seg(&mr->mr, m, n, (void *)addr, ps); return 0; } @@ -643,6 +642,7 @@ int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, mr->mr.iova = ibmr->iova; mr->mr.offset = ibmr->iova - (u64)mr->mr.map[0]->segs[0].vaddr; mr->mr.length = (size_t)ibmr->length; + trace_rvt_map_mr_sg(ibmr, sg_nents, sg_offset); return ret; } diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index c5a50614a6c6..0b0a241c57ff 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2016 - 2018 Intel Corporation. + * Copyright(c) 2016 - 2019 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -58,6 +58,8 @@ #include "vt.h" #include "trace.h" +#define RVT_RWQ_COUNT_THRESHOLD 16 + static void rvt_rc_timeout(struct timer_list *t); /* @@ -803,6 +805,47 @@ static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp) } /** + * rvt_alloc_rq - allocate memory for user or kernel buffer + * @rq: receive queue data structure + * @size: number of request queue entries + * @node: The NUMA node + * @udata: True if user data is available or not false + * + * Return: If memory allocation failed, return -ENONEM + * This function is used by both shared receive + * queues and non-shared receive queues to allocate + * memory. + */ +int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node, + struct ib_udata *udata) +{ + if (udata) { + rq->wq = vmalloc_user(sizeof(struct rvt_rwq) + size); + if (!rq->wq) + goto bail; + /* need kwq with no buffers */ + rq->kwq = kzalloc_node(sizeof(*rq->kwq), GFP_KERNEL, node); + if (!rq->kwq) + goto bail; + rq->kwq->curr_wq = rq->wq->wq; + } else { + /* need kwq with buffers */ + rq->kwq = + vzalloc_node(sizeof(struct rvt_krwq) + size, node); + if (!rq->kwq) + goto bail; + rq->kwq->curr_wq = rq->kwq->wq; + } + + spin_lock_init(&rq->kwq->p_lock); + spin_lock_init(&rq->kwq->c_lock); + return 0; +bail: + rvt_free_rq(rq); + return -ENOMEM; +} + +/** * rvt_init_qp - initialize the QP state to the reset state * @qp: the QP to init or reinit * @type: the QP type @@ -852,10 +895,8 @@ static void rvt_init_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->s_tail_ack_queue = 0; qp->s_acked_ack_queue = 0; qp->s_num_rd_atomic = 0; - if (qp->r_rq.wq) { - qp->r_rq.wq->head = 0; - qp->r_rq.wq->tail = 0; - } + if (qp->r_rq.kwq) + qp->r_rq.kwq->count = qp->r_rq.size; qp->r_sge.num_sge = 0; atomic_set(&qp->s_reserved_used, 0); } @@ -928,6 +969,61 @@ static void rvt_free_qpn(struct rvt_qpn_table *qpt, u32 qpn) } /** + * get_allowed_ops - Given a QP type return the appropriate allowed OP + * @type: valid, supported, QP type + */ +static u8 get_allowed_ops(enum ib_qp_type type) +{ + return type == IB_QPT_RC ? IB_OPCODE_RC : type == IB_QPT_UC ? + IB_OPCODE_UC : IB_OPCODE_UD; +} + +/** + * free_ud_wq_attr - Clean up AH attribute cache for UD QPs + * @qp: Valid QP with allowed_ops set + * + * The rvt_swqe data structure being used is a union, so this is + * only valid for UD QPs. + */ +static void free_ud_wq_attr(struct rvt_qp *qp) +{ + struct rvt_swqe *wqe; + int i; + + for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) { + wqe = rvt_get_swqe_ptr(qp, i); + kfree(wqe->ud_wr.attr); + wqe->ud_wr.attr = NULL; + } +} + +/** + * alloc_ud_wq_attr - AH attribute cache for UD QPs + * @qp: Valid QP with allowed_ops set + * @node: Numa node for allocation + * + * The rvt_swqe data structure being used is a union, so this is + * only valid for UD QPs. + */ +static int alloc_ud_wq_attr(struct rvt_qp *qp, int node) +{ + struct rvt_swqe *wqe; + int i; + + for (i = 0; qp->allowed_ops == IB_OPCODE_UD && i < qp->s_size; i++) { + wqe = rvt_get_swqe_ptr(qp, i); + wqe->ud_wr.attr = kzalloc_node(sizeof(*wqe->ud_wr.attr), + GFP_KERNEL, node); + if (!wqe->ud_wr.attr) { + free_ud_wq_attr(qp); + return -ENOMEM; + } + } + + return 0; +} + +/** * rvt_create_qp - create a queue pair for a device * @ibpd: the protection domain who's device we create the queue pair for * @init_attr: the attributes of the queue pair @@ -989,9 +1085,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, case IB_QPT_UC: case IB_QPT_RC: case IB_QPT_UD: - sz = sizeof(struct rvt_sge) * - init_attr->cap.max_send_sge + - sizeof(struct rvt_swqe); + sz = struct_size(swq, sg_list, init_attr->cap.max_send_sge); swq = vzalloc_node(array_size(sz, sqsize), rdi->dparms.node); if (!swq) return ERR_PTR(-ENOMEM); @@ -1011,6 +1105,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, rdi->dparms.node); if (!qp) goto bail_swq; + qp->allowed_ops = get_allowed_ops(init_attr->qp_type); RCU_INIT_POINTER(qp->next, NULL); if (init_attr->qp_type == IB_QPT_RC) { @@ -1048,17 +1143,12 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, qp->r_rq.max_sge = init_attr->cap.max_recv_sge; sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + sizeof(struct rvt_rwqe); - if (udata) - qp->r_rq.wq = vmalloc_user( - sizeof(struct rvt_rwq) + - qp->r_rq.size * sz); - else - qp->r_rq.wq = vzalloc_node( - sizeof(struct rvt_rwq) + - qp->r_rq.size * sz, - rdi->dparms.node); - if (!qp->r_rq.wq) + err = rvt_alloc_rq(&qp->r_rq, qp->r_rq.size * sz, + rdi->dparms.node, udata); + if (err) { + ret = ERR_PTR(err); goto bail_driver_priv; + } } /* @@ -1068,7 +1158,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, spin_lock_init(&qp->r_lock); spin_lock_init(&qp->s_hlock); spin_lock_init(&qp->s_lock); - spin_lock_init(&qp->r_rq.lock); atomic_set(&qp->refcount, 0); atomic_set(&qp->local_ops_pending, 0); init_waitqueue_head(&qp->wait); @@ -1080,6 +1169,11 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, qp->s_max_sge = init_attr->cap.max_send_sge; if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) qp->s_flags = RVT_S_SIGNAL_REQ_WR; + err = alloc_ud_wq_attr(qp, rdi->dparms.node); + if (err) { + ret = (ERR_PTR(err)); + goto bail_driver_priv; + } err = alloc_qpn(rdi, &rdi->qp_dev->qpn_table, init_attr->qp_type, @@ -1172,28 +1266,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, ret = &qp->ibqp; - /* - * We have our QP and its good, now keep track of what types of opcodes - * can be processed on this QP. We do this by keeping track of what the - * 3 high order bits of the opcode are. - */ - switch (init_attr->qp_type) { - case IB_QPT_SMI: - case IB_QPT_GSI: - case IB_QPT_UD: - qp->allowed_ops = IB_OPCODE_UD; - break; - case IB_QPT_RC: - qp->allowed_ops = IB_OPCODE_RC; - break; - case IB_QPT_UC: - qp->allowed_ops = IB_OPCODE_UC; - break; - default: - ret = ERR_PTR(-EINVAL); - goto bail_ip; - } - return ret; bail_ip: @@ -1204,8 +1276,8 @@ bail_qpn: rvt_free_qpn(&rdi->qp_dev->qpn_table, qp->ibqp.qp_num); bail_rq_wq: - if (!qp->ip) - vfree(qp->r_rq.wq); + rvt_free_rq(&qp->r_rq); + free_ud_wq_attr(qp); bail_driver_priv: rdi->driver_f.qp_priv_free(rdi, qp); @@ -1271,19 +1343,26 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) } wc.status = IB_WC_WR_FLUSH_ERR; - if (qp->r_rq.wq) { - struct rvt_rwq *wq; + if (qp->r_rq.kwq) { u32 head; u32 tail; - - spin_lock(&qp->r_rq.lock); - + struct rvt_rwq *wq = NULL; + struct rvt_krwq *kwq = NULL; + + spin_lock(&qp->r_rq.kwq->c_lock); + /* qp->ip used to validate if there is a user buffer mmaped */ + if (qp->ip) { + wq = qp->r_rq.wq; + head = RDMA_READ_UAPI_ATOMIC(wq->head); + tail = RDMA_READ_UAPI_ATOMIC(wq->tail); + } else { + kwq = qp->r_rq.kwq; + head = kwq->head; + tail = kwq->tail; + } /* sanity check pointers before trusting them */ - wq = qp->r_rq.wq; - head = wq->head; if (head >= qp->r_rq.size) head = 0; - tail = wq->tail; if (tail >= qp->r_rq.size) tail = 0; while (tail != head) { @@ -1292,9 +1371,11 @@ int rvt_error_qp(struct rvt_qp *qp, enum ib_wc_status err) tail = 0; rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); } - wq->tail = tail; - - spin_unlock(&qp->r_rq.lock); + if (qp->ip) + RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail); + else + kwq->tail = tail; + spin_unlock(&qp->r_rq.kwq->c_lock); } else if (qp->ibqp.event_handler) { ret = 1; } @@ -1636,12 +1717,12 @@ int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) if (qp->ip) kref_put(&qp->ip->ref, rvt_release_mmap_info); - else - vfree(qp->r_rq.wq); + kvfree(qp->r_rq.kwq); rdi->driver_f.qp_priv_free(rdi, qp); kfree(qp->s_ack_queue); rdma_destroy_ah_attr(&qp->remote_ah_attr); rdma_destroy_ah_attr(&qp->alt_ah_attr); + free_ud_wq_attr(qp); vfree(qp->s_wq); kfree(qp); return 0; @@ -1723,7 +1804,7 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); - struct rvt_rwq *wq = qp->r_rq.wq; + struct rvt_krwq *wq = qp->r_rq.kwq; unsigned long flags; int qp_err_flush = (ib_rvt_state_ops[qp->state] & RVT_FLUSH_RECV) && !qp->ibqp.srq; @@ -1744,12 +1825,12 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, return -EINVAL; } - spin_lock_irqsave(&qp->r_rq.lock, flags); + spin_lock_irqsave(&qp->r_rq.kwq->p_lock, flags); next = wq->head + 1; if (next >= qp->r_rq.size) next = 0; - if (next == wq->tail) { - spin_unlock_irqrestore(&qp->r_rq.lock, flags); + if (next == READ_ONCE(wq->tail)) { + spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags); *bad_wr = wr; return -ENOMEM; } @@ -1766,16 +1847,18 @@ int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, wqe = rvt_get_rwqe_ptr(&qp->r_rq, wq->head); wqe->wr_id = wr->wr_id; wqe->num_sge = wr->num_sge; - for (i = 0; i < wr->num_sge; i++) - wqe->sg_list[i] = wr->sg_list[i]; + for (i = 0; i < wr->num_sge; i++) { + wqe->sg_list[i].addr = wr->sg_list[i].addr; + wqe->sg_list[i].length = wr->sg_list[i].length; + wqe->sg_list[i].lkey = wr->sg_list[i].lkey; + } /* * Make sure queue entry is written * before the head index. */ - smp_wmb(); - wq->head = next; + smp_store_release(&wq->head, next); } - spin_unlock_irqrestore(&qp->r_rq.lock, flags); + spin_unlock_irqrestore(&qp->r_rq.kwq->p_lock, flags); } return 0; } @@ -1856,10 +1939,9 @@ static inline int rvt_qp_is_avail( /* see rvt_qp_wqe_unreserve() */ smp_mb__before_atomic(); - reserved_used = atomic_read(&qp->s_reserved_used); if (unlikely(reserved_op)) { /* see rvt_qp_wqe_unreserve() */ - smp_mb__before_atomic(); + reserved_used = atomic_read(&qp->s_reserved_used); if (reserved_used >= rdi->dparms.reserved_operations) return -ENOMEM; return 0; @@ -1867,14 +1949,13 @@ static inline int rvt_qp_is_avail( /* non-reserved operations */ if (likely(qp->s_avail)) return 0; - slast = READ_ONCE(qp->s_last); + /* See rvt_qp_complete_swqe() */ + slast = smp_load_acquire(&qp->s_last); if (qp->s_head >= slast) avail = qp->s_size - (qp->s_head - slast); else avail = slast - qp->s_head; - /* see rvt_qp_wqe_unreserve() */ - smp_mb__before_atomic(); reserved_used = atomic_read(&qp->s_reserved_used); avail = avail - 1 - (rdi->dparms.reserved_operations - reserved_used); @@ -2011,10 +2092,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp, */ log_pmtu = qp->log_pmtu; if (qp->allowed_ops == IB_OPCODE_UD) { - struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah); + struct rvt_ah *ah = rvt_get_swqe_ah(wqe); log_pmtu = ah->log_pmtu; - atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); + rdma_copy_ah_attr(wqe->ud_wr.attr, &ah->attr); } if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) { @@ -2059,7 +2140,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, bail_inval_free_ref: if (qp->allowed_ops == IB_OPCODE_UD) - atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); + rdma_destroy_ah_attr(wqe->ud_wr.attr); bail_inval_free: /* release mr holds */ while (j) { @@ -2145,7 +2226,7 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); - struct rvt_rwq *wq; + struct rvt_krwq *wq; unsigned long flags; for (; wr; wr = wr->next) { @@ -2158,13 +2239,13 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, return -EINVAL; } - spin_lock_irqsave(&srq->rq.lock, flags); - wq = srq->rq.wq; + spin_lock_irqsave(&srq->rq.kwq->p_lock, flags); + wq = srq->rq.kwq; next = wq->head + 1; if (next >= srq->rq.size) next = 0; - if (next == wq->tail) { - spin_unlock_irqrestore(&srq->rq.lock, flags); + if (next == READ_ONCE(wq->tail)) { + spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags); *bad_wr = wr; return -ENOMEM; } @@ -2172,17 +2253,35 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, wqe = rvt_get_rwqe_ptr(&srq->rq, wq->head); wqe->wr_id = wr->wr_id; wqe->num_sge = wr->num_sge; - for (i = 0; i < wr->num_sge; i++) - wqe->sg_list[i] = wr->sg_list[i]; + for (i = 0; i < wr->num_sge; i++) { + wqe->sg_list[i].addr = wr->sg_list[i].addr; + wqe->sg_list[i].length = wr->sg_list[i].length; + wqe->sg_list[i].lkey = wr->sg_list[i].lkey; + } /* Make sure queue entry is written before the head index. */ - smp_wmb(); - wq->head = next; - spin_unlock_irqrestore(&srq->rq.lock, flags); + smp_store_release(&wq->head, next); + spin_unlock_irqrestore(&srq->rq.kwq->p_lock, flags); } return 0; } /* + * rvt used the internal kernel struct as part of its ABI, for now make sure + * the kernel struct does not change layout. FIXME: rvt should never cast the + * user struct to a kernel struct. + */ +static struct ib_sge *rvt_cast_sge(struct rvt_wqe_sge *sge) +{ + BUILD_BUG_ON(offsetof(struct ib_sge, addr) != + offsetof(struct rvt_wqe_sge, addr)); + BUILD_BUG_ON(offsetof(struct ib_sge, length) != + offsetof(struct rvt_wqe_sge, length)); + BUILD_BUG_ON(offsetof(struct ib_sge, lkey) != + offsetof(struct rvt_wqe_sge, lkey)); + return (struct ib_sge *)sge; +} + +/* * Validate a RWQE and fill in the SGE state. * Return 1 if OK. */ @@ -2205,7 +2304,7 @@ static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) continue; /* Check LKEY */ ret = rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - NULL, &wqe->sg_list[i], + NULL, rvt_cast_sge(&wqe->sg_list[i]), IB_ACCESS_LOCAL_WRITE); if (unlikely(ret <= 0)) goto bad_lkey; @@ -2234,6 +2333,50 @@ bad_lkey: } /** + * get_count - count numbers of request work queue entries + * in circular buffer + * @rq: data structure for request queue entry + * @tail: tail indices of the circular buffer + * @head: head indices of the circular buffer + * + * Return - total number of entries in the circular buffer + */ +static u32 get_count(struct rvt_rq *rq, u32 tail, u32 head) +{ + u32 count; + + count = head; + + if (count >= rq->size) + count = 0; + if (count < tail) + count += rq->size - tail; + else + count -= tail; + + return count; +} + +/** + * get_rvt_head - get head indices of the circular buffer + * @rq: data structure for request queue entry + * @ip: the QP + * + * Return - head index value + */ +static inline u32 get_rvt_head(struct rvt_rq *rq, void *ip) +{ + u32 head; + + if (ip) + head = RDMA_READ_UAPI_ATOMIC(rq->wq->head); + else + head = rq->kwq->head; + + return head; +} + +/** * rvt_get_rwqe - copy the next RWQE into the QP's RWQE * @qp: the QP * @wr_id_only: update qp->r_wr_id only, not qp->r_sge @@ -2247,39 +2390,54 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) { unsigned long flags; struct rvt_rq *rq; + struct rvt_krwq *kwq = NULL; struct rvt_rwq *wq; struct rvt_srq *srq; struct rvt_rwqe *wqe; void (*handler)(struct ib_event *, void *); u32 tail; + u32 head; int ret; + void *ip = NULL; if (qp->ibqp.srq) { srq = ibsrq_to_rvtsrq(qp->ibqp.srq); handler = srq->ibsrq.event_handler; rq = &srq->rq; + ip = srq->ip; } else { srq = NULL; handler = NULL; rq = &qp->r_rq; + ip = qp->ip; } - spin_lock_irqsave(&rq->lock, flags); + spin_lock_irqsave(&rq->kwq->c_lock, flags); if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { ret = 0; goto unlock; } + kwq = rq->kwq; + if (ip) { + wq = rq->wq; + tail = RDMA_READ_UAPI_ATOMIC(wq->tail); + } else { + tail = kwq->tail; + } - wq = rq->wq; - tail = wq->tail; /* Validate tail before using it since it is user writable. */ if (tail >= rq->size) tail = 0; - if (unlikely(tail == wq->head)) { + + if (kwq->count < RVT_RWQ_COUNT_THRESHOLD) { + head = get_rvt_head(rq, ip); + kwq->count = get_count(rq, tail, head); + } + if (unlikely(kwq->count == 0)) { ret = 0; goto unlock; } - /* Make sure entry is read after head index is read. */ + /* Make sure entry is read after the count is read. */ smp_rmb(); wqe = rvt_get_rwqe_ptr(rq, tail); /* @@ -2289,43 +2447,41 @@ int rvt_get_rwqe(struct rvt_qp *qp, bool wr_id_only) */ if (++tail >= rq->size) tail = 0; - wq->tail = tail; + if (ip) + RDMA_WRITE_UAPI_ATOMIC(wq->tail, tail); + else + kwq->tail = tail; if (!wr_id_only && !init_sge(qp, wqe)) { ret = -1; goto unlock; } qp->r_wr_id = wqe->wr_id; + kwq->count--; ret = 1; set_bit(RVT_R_WRID_VALID, &qp->r_aflags); if (handler) { - u32 n; - /* * Validate head pointer value and compute * the number of remaining WQEs. */ - n = wq->head; - if (n >= rq->size) - n = 0; - if (n < tail) - n += rq->size - tail; - else - n -= tail; - if (n < srq->limit) { - struct ib_event ev; - - srq->limit = 0; - spin_unlock_irqrestore(&rq->lock, flags); - ev.device = qp->ibqp.device; - ev.element.srq = qp->ibqp.srq; - ev.event = IB_EVENT_SRQ_LIMIT_REACHED; - handler(&ev, srq->ibsrq.srq_context); - goto bail; + if (kwq->count < srq->limit) { + kwq->count = get_count(rq, tail, get_rvt_head(rq, ip)); + if (kwq->count < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->kwq->c_lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } } } unlock: - spin_unlock_irqrestore(&rq->lock, flags); + spin_unlock_irqrestore(&rq->kwq->c_lock, flags); bail: return ret; } @@ -2667,27 +2823,16 @@ void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, enum ib_wc_status status) { u32 old_last, last; - struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + struct rvt_dev_info *rdi; if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) return; + rdi = ib_to_rvt(qp->ibqp.device); - last = qp->s_last; - old_last = last; - trace_rvt_qp_send_completion(qp, wqe, last); - if (++last >= qp->s_size) - last = 0; - trace_rvt_qp_send_completion(qp, wqe, last); - qp->s_last = last; - /* See post_send() */ - barrier(); - rvt_put_qp_swqe(qp, wqe); - - rvt_qp_swqe_complete(qp, - wqe, - rdi->wc_opcode[wqe->wr.opcode], - status); - + old_last = qp->s_last; + trace_rvt_qp_send_completion(qp, wqe, old_last); + last = rvt_qp_complete_swqe(qp, wqe, rdi->wc_opcode[wqe->wr.opcode], + status); if (qp->s_acked == old_last) qp->s_acked = last; if (qp->s_cur == old_last) @@ -3021,8 +3166,7 @@ do_write: wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); wc.port_num = 1; /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - wqe->wr.send_flags & IB_SEND_SOLICITED); + rvt_recv_cq(qp, &wc, wqe->wr.send_flags & IB_SEND_SOLICITED); send_comp: spin_unlock_irqrestore(&qp->r_lock, flags); diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h index 6db1619389b0..2cdba1283bf6 100644 --- a/drivers/infiniband/sw/rdmavt/qp.h +++ b/drivers/infiniband/sw/rdmavt/qp.h @@ -68,4 +68,6 @@ int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); int rvt_wss_init(struct rvt_dev_info *rdi); void rvt_wss_exit(struct rvt_dev_info *rdi); +int rvt_alloc_rq(struct rvt_rq *rq, u32 size, int node, + struct ib_udata *udata); #endif /* DEF_RVTQP_H */ diff --git a/drivers/infiniband/sw/rdmavt/rc.c b/drivers/infiniband/sw/rdmavt/rc.c index 09f0cf538be6..890d7b760d2e 100644 --- a/drivers/infiniband/sw/rdmavt/rc.c +++ b/drivers/infiniband/sw/rdmavt/rc.c @@ -104,26 +104,33 @@ __be32 rvt_compute_aeth(struct rvt_qp *qp) } else { u32 min, max, x; u32 credits; - struct rvt_rwq *wq = qp->r_rq.wq; u32 head; u32 tail; - /* sanity check pointers before trusting them */ - head = wq->head; - if (head >= qp->r_rq.size) - head = 0; - tail = wq->tail; - if (tail >= qp->r_rq.size) - tail = 0; - /* - * Compute the number of credits available (RWQEs). - * There is a small chance that the pair of reads are - * not atomic, which is OK, since the fuzziness is - * resolved as further ACKs go out. - */ - credits = head - tail; - if ((int)credits < 0) - credits += qp->r_rq.size; + credits = READ_ONCE(qp->r_rq.kwq->count); + if (credits == 0) { + /* sanity check pointers before trusting them */ + if (qp->ip) { + head = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->head); + tail = RDMA_READ_UAPI_ATOMIC(qp->r_rq.wq->tail); + } else { + head = READ_ONCE(qp->r_rq.kwq->head); + tail = READ_ONCE(qp->r_rq.kwq->tail); + } + if (head >= qp->r_rq.size) + head = 0; + if (tail >= qp->r_rq.size) + tail = 0; + /* + * Compute the number of credits available (RWQEs). + * There is a small chance that the pair of reads are + * not atomic, which is OK, since the fuzziness is + * resolved as further ACKs go out. + */ + credits = head - tail; + if ((int)credits < 0) + credits += qp->r_rq.size; + } /* * Binary search the credit table to find the code to * use. diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c index 8d6b3e764255..24fef021d51d 100644 --- a/drivers/infiniband/sw/rdmavt/srq.c +++ b/drivers/infiniband/sw/rdmavt/srq.c @@ -52,7 +52,7 @@ #include "srq.h" #include "vt.h" - +#include "qp.h" /** * rvt_driver_srq_init - init srq resources on a per driver basis * @rdi: rvt dev structure @@ -97,11 +97,8 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr, srq->rq.max_sge = srq_init_attr->attr.max_sge; sz = sizeof(struct ib_sge) * srq->rq.max_sge + sizeof(struct rvt_rwqe); - srq->rq.wq = udata ? - vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz) : - vzalloc_node(sizeof(struct rvt_rwq) + srq->rq.size * sz, - dev->dparms.node); - if (!srq->rq.wq) { + if (rvt_alloc_rq(&srq->rq, srq->rq.size * sz, + dev->dparms.node, udata)) { ret = -ENOMEM; goto bail_srq; } @@ -152,7 +149,7 @@ int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr, bail_ip: kfree(srq->ip); bail_wq: - vfree(srq->rq.wq); + rvt_free_rq(&srq->rq); bail_srq: return ret; } @@ -172,11 +169,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, { struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq); struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device); - struct rvt_rwq *wq; + struct rvt_rq tmp_rq = {}; int ret = 0; if (attr_mask & IB_SRQ_MAX_WR) { - struct rvt_rwq *owq; + struct rvt_krwq *okwq = NULL; + struct rvt_rwq *owq = NULL; struct rvt_rwqe *p; u32 sz, size, n, head, tail; @@ -185,17 +183,12 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, ((attr_mask & IB_SRQ_LIMIT) ? attr->srq_limit : srq->limit) > attr->max_wr) return -EINVAL; - sz = sizeof(struct rvt_rwqe) + srq->rq.max_sge * sizeof(struct ib_sge); size = attr->max_wr + 1; - wq = udata ? - vmalloc_user(sizeof(struct rvt_rwq) + size * sz) : - vzalloc_node(sizeof(struct rvt_rwq) + size * sz, - dev->dparms.node); - if (!wq) + if (rvt_alloc_rq(&tmp_rq, size * sz, dev->dparms.node, + udata)) return -ENOMEM; - /* Check that we can write the offset to mmap. */ if (udata && udata->inlen >= sizeof(__u64)) { __u64 offset_addr; @@ -213,14 +206,20 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, goto bail_free; } - spin_lock_irq(&srq->rq.lock); + spin_lock_irq(&srq->rq.kwq->c_lock); /* * validate head and tail pointer values and compute * the number of remaining WQEs. */ - owq = srq->rq.wq; - head = owq->head; - tail = owq->tail; + if (udata) { + owq = srq->rq.wq; + head = RDMA_READ_UAPI_ATOMIC(owq->head); + tail = RDMA_READ_UAPI_ATOMIC(owq->tail); + } else { + okwq = srq->rq.kwq; + head = okwq->head; + tail = okwq->tail; + } if (head >= srq->rq.size || tail >= srq->rq.size) { ret = -EINVAL; goto bail_unlock; @@ -235,7 +234,7 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, goto bail_unlock; } n = 0; - p = wq->wq; + p = tmp_rq.kwq->curr_wq; while (tail != head) { struct rvt_rwqe *wqe; int i; @@ -250,22 +249,29 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, if (++tail >= srq->rq.size) tail = 0; } - srq->rq.wq = wq; + srq->rq.kwq = tmp_rq.kwq; + if (udata) { + srq->rq.wq = tmp_rq.wq; + RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->head, n); + RDMA_WRITE_UAPI_ATOMIC(tmp_rq.wq->tail, 0); + } else { + tmp_rq.kwq->head = n; + tmp_rq.kwq->tail = 0; + } srq->rq.size = size; - wq->head = n; - wq->tail = 0; if (attr_mask & IB_SRQ_LIMIT) srq->limit = attr->srq_limit; - spin_unlock_irq(&srq->rq.lock); + spin_unlock_irq(&srq->rq.kwq->c_lock); vfree(owq); + kvfree(okwq); if (srq->ip) { struct rvt_mmap_info *ip = srq->ip; struct rvt_dev_info *dev = ib_to_rvt(srq->ibsrq.device); u32 s = sizeof(struct rvt_rwq) + size * sz; - rvt_update_mmap_info(dev, ip, s, wq); + rvt_update_mmap_info(dev, ip, s, tmp_rq.wq); /* * Return the offset to mmap. @@ -289,19 +295,19 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, spin_unlock_irq(&dev->pending_lock); } } else if (attr_mask & IB_SRQ_LIMIT) { - spin_lock_irq(&srq->rq.lock); + spin_lock_irq(&srq->rq.kwq->c_lock); if (attr->srq_limit >= srq->rq.size) ret = -EINVAL; else srq->limit = attr->srq_limit; - spin_unlock_irq(&srq->rq.lock); + spin_unlock_irq(&srq->rq.kwq->c_lock); } return ret; bail_unlock: - spin_unlock_irq(&srq->rq.lock); + spin_unlock_irq(&srq->rq.kwq->c_lock); bail_free: - vfree(wq); + rvt_free_rq(&tmp_rq); return ret; } @@ -336,6 +342,5 @@ void rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) spin_unlock(&dev->n_srqs_lock); if (srq->ip) kref_put(&srq->ip->ref, rvt_release_mmap_info); - else - vfree(srq->rq.wq); + kvfree(srq->rq.kwq); } diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h index 976e482930a3..95b8a0e3b8bd 100644 --- a/drivers/infiniband/sw/rdmavt/trace_mr.h +++ b/drivers/infiniband/sw/rdmavt/trace_mr.h @@ -54,6 +54,8 @@ #include <rdma/rdma_vt.h> #include <rdma/rdmavt_mr.h> +#include "mr.h" + #undef TRACE_SYSTEM #define TRACE_SYSTEM rvt_mr DECLARE_EVENT_CLASS( @@ -64,8 +66,12 @@ DECLARE_EVENT_CLASS( RDI_DEV_ENTRY(ib_to_rvt(mr->pd->device)) __field(void *, vaddr) __field(struct page *, page) + __field(u64, iova) + __field(u64, user_base) __field(size_t, len) + __field(size_t, length) __field(u32, lkey) + __field(u32, offset) __field(u16, m) __field(u16, n) ), @@ -73,18 +79,28 @@ DECLARE_EVENT_CLASS( RDI_DEV_ASSIGN(ib_to_rvt(mr->pd->device)); __entry->vaddr = v; __entry->page = virt_to_page(v); + __entry->iova = mr->iova; + __entry->user_base = mr->user_base; + __entry->lkey = mr->lkey; __entry->m = m; __entry->n = n; __entry->len = len; + __entry->length = mr->length; + __entry->offset = mr->offset; ), TP_printk( - "[%s] vaddr %p page %p m %u n %u len %ld", + "[%s] lkey %x iova %llx user_base %llx mr_len %lu vaddr %llx page %p m %u n %u len %lu off %u", __get_str(dev), - __entry->vaddr, + __entry->lkey, + __entry->iova, + __entry->user_base, + __entry->length, + (unsigned long long)__entry->vaddr, __entry->page, __entry->m, __entry->n, - __entry->len + __entry->len, + __entry->offset ) ); @@ -165,6 +181,40 @@ DEFINE_EVENT( TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), TP_ARGS(sge, isge)); +TRACE_EVENT( + rvt_map_mr_sg, + TP_PROTO(struct ib_mr *ibmr, int sg_nents, unsigned int *sg_offset), + TP_ARGS(ibmr, sg_nents, sg_offset), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(to_imr(ibmr)->mr.pd->device)) + __field(u64, iova) + __field(u64, ibmr_iova) + __field(u64, user_base) + __field(u64, ibmr_length) + __field(int, sg_nents) + __field(uint, sg_offset) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(to_imr(ibmr)->mr.pd->device)) + __entry->ibmr_iova = ibmr->iova; + __entry->iova = to_imr(ibmr)->mr.iova; + __entry->user_base = to_imr(ibmr)->mr.user_base; + __entry->ibmr_length = to_imr(ibmr)->mr.length; + __entry->sg_nents = sg_nents; + __entry->sg_offset = sg_offset ? *sg_offset : 0; + ), + TP_printk( + "[%s] ibmr_iova %llx iova %llx user_base %llx length %llx sg_nents %d sg_offset %u", + __get_str(dev), + __entry->ibmr_iova, + __entry->iova, + __entry->user_base, + __entry->ibmr_length, + __entry->sg_nents, + __entry->sg_offset + ) +); + #endif /* __RVT_TRACE_MR_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 9546a837a8ac..18da1e1ea979 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -382,6 +382,8 @@ enum { }; static const struct ib_device_ops rvt_dev_ops = { + .uverbs_abi_ver = RVT_UVERBS_ABI_VERSION, + .alloc_fmr = rvt_alloc_fmr, .alloc_mr = rvt_alloc_mr, .alloc_pd = rvt_alloc_pd, @@ -427,6 +429,7 @@ static const struct ib_device_ops rvt_dev_ops = { .unmap_fmr = rvt_unmap_fmr, INIT_RDMA_OBJ_SIZE(ib_ah, rvt_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, rvt_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_srq, rvt_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, rvt_ucontext, ibucontext), @@ -530,7 +533,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb) * * Return: 0 on success otherwise an errno. */ -int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) +int rvt_register_device(struct rvt_dev_info *rdi) { int ret = 0, i; @@ -600,7 +603,6 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) * exactly which functions rdmavt supports, nor do they know the ABI * version, so we do all of this sort of stuff here. */ - rdi->ibdev.uverbs_abi_ver = RVT_UVERBS_ABI_VERSION; rdi->ibdev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | @@ -636,7 +638,6 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) if (!rdi->ibdev.num_comp_vectors) rdi->ibdev.num_comp_vectors = 1; - rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev)); if (ret) { diff --git a/drivers/infiniband/sw/rdmavt/vt.h b/drivers/infiniband/sw/rdmavt/vt.h index 0675ea6c3872..d19ff817c2c7 100644 --- a/drivers/infiniband/sw/rdmavt/vt.h +++ b/drivers/infiniband/sw/rdmavt/vt.h @@ -78,6 +78,12 @@ fmt, \ ##__VA_ARGS__) +#define rvt_pr_err_ratelimited(rdi, fmt, ...) \ + __rvt_pr_err_ratelimited((rdi)->driver_f.get_pci_dev(rdi), \ + rvt_get_ibdev_name(rdi), \ + fmt, \ + ##__VA_ARGS__) + #define __rvt_pr_info(pdev, name, fmt, ...) \ dev_info(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__) @@ -87,6 +93,9 @@ #define __rvt_pr_err(pdev, name, fmt, ...) \ dev_err(&pdev->dev, "%s: " fmt, name, ##__VA_ARGS__) +#define __rvt_pr_err_ratelimited(pdev, name, fmt, ...) \ + dev_err_ratelimited(&(pdev)->dev, "%s: " fmt, name, ##__VA_ARGS__) + static inline int ibport_num_to_idx(struct ib_device *ibdev, u8 port_num) { struct rvt_dev_info *rdi = ib_to_rvt(ibdev); diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 00eb99d3df86..116cafc9afcf 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -558,7 +558,7 @@ int rxe_completer(void *arg) { struct rxe_qp *qp = (struct rxe_qp *)arg; struct rxe_dev *rxe = to_rdev(qp->ibqp.device); - struct rxe_send_wqe *wqe = wqe; + struct rxe_send_wqe *wqe = NULL; struct sk_buff *skb = NULL; struct rxe_pkt_info *pkt = NULL; enum comp_state state; diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index f501f72489d8..ea6a819b7167 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -96,8 +96,7 @@ void rxe_mem_cleanup(struct rxe_pool_entry *arg) struct rxe_mem *mem = container_of(arg, typeof(*mem), pelem); int i; - if (mem->umem) - ib_umem_release(mem->umem); + ib_umem_release(mem->umem); if (mem->map) { for (i = 0; i < mem->num_map; i++) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 56cf18af016a..fbcbac52290b 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -72,6 +72,7 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { [RXE_TYPE_CQ] = { .name = "rxe-cq", .size = sizeof(struct rxe_cq), + .flags = RXE_POOL_NO_ALLOC, .cleanup = rxe_cq_cleanup, }, [RXE_TYPE_MR] = { diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index aca9f60f9b21..1cbfbd98eb22 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -431,6 +431,7 @@ static enum resp_states check_rkey(struct rxe_qp *qp, qp->resp.va = reth_va(pkt); qp->resp.rkey = reth_rkey(pkt); qp->resp.resid = reth_len(pkt); + qp->resp.length = reth_len(pkt); } access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ : IB_ACCESS_REMOTE_WRITE; @@ -856,7 +857,9 @@ static enum resp_states do_complete(struct rxe_qp *qp, pkt->mask & RXE_WRITE_MASK) ? IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV; wc->vendor_err = 0; - wc->byte_len = wqe->dma.length - wqe->dma.resid; + wc->byte_len = (pkt->mask & RXE_IMMDT_MASK && + pkt->mask & RXE_WRITE_MASK) ? + qp->resp.length : wqe->dma.length - wqe->dma.resid; /* fields after byte_len are different between kernel and user * space diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 8c3e2a18cfe4..4ebdfcf4d33e 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -778,55 +778,43 @@ err1: return err; } -static struct ib_cq *rxe_create_cq(struct ib_device *dev, - const struct ib_cq_init_attr *attr, - struct ib_udata *udata) +static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) { int err; + struct ib_device *dev = ibcq->device; struct rxe_dev *rxe = to_rdev(dev); - struct rxe_cq *cq; + struct rxe_cq *cq = to_rcq(ibcq); struct rxe_create_cq_resp __user *uresp = NULL; if (udata) { if (udata->outlen < sizeof(*uresp)) - return ERR_PTR(-EINVAL); + return -EINVAL; uresp = udata->outbuf; } if (attr->flags) - return ERR_PTR(-EINVAL); + return -EINVAL; err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector); if (err) - goto err1; - - cq = rxe_alloc(&rxe->cq_pool); - if (!cq) { - err = -ENOMEM; - goto err1; - } + return err; err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata, uresp); if (err) - goto err2; - - return &cq->ibcq; + return err; -err2: - rxe_drop_ref(cq); -err1: - return ERR_PTR(err); + return rxe_add_to_pool(&rxe->cq_pool, &cq->pelem); } -static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +static void rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); rxe_cq_disable(cq); rxe_drop_ref(cq); - return 0; } static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) @@ -1111,6 +1099,10 @@ static int rxe_enable_driver(struct ib_device *ib_dev) } static const struct ib_device_ops rxe_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_RXE, + .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION, + .alloc_hw_stats = rxe_ib_alloc_hw_stats, .alloc_mr = rxe_alloc_mr, .alloc_pd = rxe_alloc_pd, @@ -1157,6 +1149,7 @@ static const struct ib_device_ops rxe_dev_ops = { .resize_cq = rxe_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah), + INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc), @@ -1170,7 +1163,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); - dev->owner = THIS_MODULE; dev->node_type = RDMA_NODE_IB_CA; dev->phys_port_cnt = 1; dev->num_comp_vectors = num_possible_cpus(); @@ -1182,7 +1174,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) dma_coerce_mask_and_coherent(&dev->dev, dma_get_required_mask(&dev->dev)); - dev->uverbs_abi_ver = RXE_UVERBS_ABI_VERSION; dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) @@ -1230,7 +1221,6 @@ int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) rxe->tfm = tfm; rdma_set_device_sysfs_group(dev, &rxe_attr_group); - dev->driver_id = RDMA_DRIVER_RXE; err = ib_register_device(dev, ibdev_name); if (err) pr_warn("%s failed with error %d\n", __func__, err); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index e8be7f44e3be..5c4b2239129c 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -85,8 +85,8 @@ struct rxe_cqe { }; struct rxe_cq { - struct rxe_pool_entry pelem; struct ib_cq ibcq; + struct rxe_pool_entry pelem; struct rxe_queue *queue; spinlock_t cq_lock; u8 notify; @@ -213,6 +213,7 @@ struct rxe_resp_info { struct rxe_mem *mr; u32 resid; u32 rkey; + u32 length; u64 atomic_orig; /* SRQ only */ diff --git a/drivers/infiniband/sw/siw/Kconfig b/drivers/infiniband/sw/siw/Kconfig new file mode 100644 index 000000000000..dace276aea14 --- /dev/null +++ b/drivers/infiniband/sw/siw/Kconfig @@ -0,0 +1,18 @@ +config RDMA_SIW + tristate "Software RDMA over TCP/IP (iWARP) driver" + depends on INET && INFINIBAND && LIBCRC32C && 64BIT + select DMA_VIRT_OPS + help + This driver implements the iWARP RDMA transport over + the Linux TCP/IP network stack. It enables a system with a + standard Ethernet adapter to interoperate with a iWARP + adapter or with another system running the SIW driver. + (See also RXE which is a similar software driver for RoCE.) + + The driver interfaces with the Linux RDMA stack and + implements both a kernel and user space RDMA verbs API. + The user space verbs API requires a support + library named libsiw which is loaded by the generic user + space verbs API, libibverbs. To implement RDMA over + TCP/IP, the driver further interfaces with the Linux + in-kernel TCP socket layer. diff --git a/drivers/infiniband/sw/siw/Makefile b/drivers/infiniband/sw/siw/Makefile new file mode 100644 index 000000000000..f5f7e3867889 --- /dev/null +++ b/drivers/infiniband/sw/siw/Makefile @@ -0,0 +1,11 @@ +obj-$(CONFIG_RDMA_SIW) += siw.o + +siw-y := \ + siw_cm.o \ + siw_cq.o \ + siw_main.o \ + siw_mem.o \ + siw_qp.o \ + siw_qp_tx.o \ + siw_qp_rx.o \ + siw_verbs.o diff --git a/drivers/infiniband/sw/siw/iwarp.h b/drivers/infiniband/sw/siw/iwarp.h new file mode 100644 index 000000000000..e8a04d9c89cb --- /dev/null +++ b/drivers/infiniband/sw/siw/iwarp.h @@ -0,0 +1,380 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _IWARP_H +#define _IWARP_H + +#include <rdma/rdma_user_cm.h> /* RDMA_MAX_PRIVATE_DATA */ +#include <linux/types.h> +#include <asm/byteorder.h> + +#define RDMAP_VERSION 1 +#define DDP_VERSION 1 +#define MPA_REVISION_1 1 +#define MPA_REVISION_2 2 +#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" +#define MPA_IRD_ORD_MASK 0x3fff + +struct mpa_rr_params { + __be16 bits; + __be16 pd_len; +}; + +/* + * MPA request/response header bits & fields + */ +enum { + MPA_RR_FLAG_MARKERS = cpu_to_be16(0x8000), + MPA_RR_FLAG_CRC = cpu_to_be16(0x4000), + MPA_RR_FLAG_REJECT = cpu_to_be16(0x2000), + MPA_RR_FLAG_ENHANCED = cpu_to_be16(0x1000), + MPA_RR_FLAG_GSO_EXP = cpu_to_be16(0x0800), + MPA_RR_MASK_REVISION = cpu_to_be16(0x00ff) +}; + +/* + * MPA request/reply header + */ +struct mpa_rr { + __u8 key[16]; + struct mpa_rr_params params; +}; + +static inline void __mpa_rr_set_revision(__be16 *bits, u8 rev) +{ + *bits = (*bits & ~MPA_RR_MASK_REVISION) | + (cpu_to_be16(rev) & MPA_RR_MASK_REVISION); +} + +static inline u8 __mpa_rr_revision(__be16 mpa_rr_bits) +{ + __be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION; + + return be16_to_cpu(rev); +} + +enum mpa_v2_ctrl { + MPA_V2_PEER_TO_PEER = cpu_to_be16(0x8000), + MPA_V2_ZERO_LENGTH_RTR = cpu_to_be16(0x4000), + MPA_V2_RDMA_WRITE_RTR = cpu_to_be16(0x8000), + MPA_V2_RDMA_READ_RTR = cpu_to_be16(0x4000), + MPA_V2_RDMA_NO_RTR = cpu_to_be16(0x0000), + MPA_V2_MASK_IRD_ORD = cpu_to_be16(0x3fff) +}; + +struct mpa_v2_data { + __be16 ird; + __be16 ord; +}; + +struct mpa_marker { + __be16 rsvd; + __be16 fpdu_hmd; /* FPDU header-marker distance (= MPA's FPDUPTR) */ +}; + +/* + * maximum MPA trailer + */ +struct mpa_trailer { + __u8 pad[4]; + __be32 crc; +}; + +#define MPA_HDR_SIZE 2 +#define MPA_CRC_SIZE 4 + +/* + * Common portion of iWARP headers (MPA, DDP, RDMAP) + * for any FPDU + */ +struct iwarp_ctrl { + __be16 mpa_len; + __be16 ddp_rdmap_ctrl; +}; + +/* + * DDP/RDMAP Hdr bits & fields + */ +enum { + DDP_FLAG_TAGGED = cpu_to_be16(0x8000), + DDP_FLAG_LAST = cpu_to_be16(0x4000), + DDP_MASK_RESERVED = cpu_to_be16(0x3C00), + DDP_MASK_VERSION = cpu_to_be16(0x0300), + RDMAP_MASK_VERSION = cpu_to_be16(0x00C0), + RDMAP_MASK_RESERVED = cpu_to_be16(0x0030), + RDMAP_MASK_OPCODE = cpu_to_be16(0x000f) +}; + +static inline u8 __ddp_get_version(struct iwarp_ctrl *ctrl) +{ + return be16_to_cpu(ctrl->ddp_rdmap_ctrl & DDP_MASK_VERSION) >> 8; +} + +static inline void __ddp_set_version(struct iwarp_ctrl *ctrl, u8 version) +{ + ctrl->ddp_rdmap_ctrl = + (ctrl->ddp_rdmap_ctrl & ~DDP_MASK_VERSION) | + (cpu_to_be16((u16)version << 8) & DDP_MASK_VERSION); +} + +static inline u8 __rdmap_get_version(struct iwarp_ctrl *ctrl) +{ + __be16 ver = ctrl->ddp_rdmap_ctrl & RDMAP_MASK_VERSION; + + return be16_to_cpu(ver) >> 6; +} + +static inline void __rdmap_set_version(struct iwarp_ctrl *ctrl, u8 version) +{ + ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_VERSION) | + (cpu_to_be16(version << 6) & RDMAP_MASK_VERSION); +} + +static inline u8 __rdmap_get_opcode(struct iwarp_ctrl *ctrl) +{ + return be16_to_cpu(ctrl->ddp_rdmap_ctrl & RDMAP_MASK_OPCODE); +} + +static inline void __rdmap_set_opcode(struct iwarp_ctrl *ctrl, u8 opcode) +{ + ctrl->ddp_rdmap_ctrl = (ctrl->ddp_rdmap_ctrl & ~RDMAP_MASK_OPCODE) | + (cpu_to_be16(opcode) & RDMAP_MASK_OPCODE); +} + +struct iwarp_rdma_write { + struct iwarp_ctrl ctrl; + __be32 sink_stag; + __be64 sink_to; +}; + +struct iwarp_rdma_rreq { + struct iwarp_ctrl ctrl; + __be32 rsvd; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; + __be32 sink_stag; + __be64 sink_to; + __be32 read_size; + __be32 source_stag; + __be64 source_to; +}; + +struct iwarp_rdma_rresp { + struct iwarp_ctrl ctrl; + __be32 sink_stag; + __be64 sink_to; +}; + +struct iwarp_send { + struct iwarp_ctrl ctrl; + __be32 rsvd; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; +}; + +struct iwarp_send_inv { + struct iwarp_ctrl ctrl; + __be32 inval_stag; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; +}; + +struct iwarp_terminate { + struct iwarp_ctrl ctrl; + __be32 rsvd; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __be32 layer : 4; + __be32 etype : 4; + __be32 ecode : 8; + __be32 flag_m : 1; + __be32 flag_d : 1; + __be32 flag_r : 1; + __be32 reserved : 13; +#elif defined(__BIG_ENDIAN_BITFIELD) + __be32 reserved : 13; + __be32 flag_r : 1; + __be32 flag_d : 1; + __be32 flag_m : 1; + __be32 ecode : 8; + __be32 etype : 4; + __be32 layer : 4; +#else +#error "undefined byte order" +#endif +}; + +/* + * Terminate Hdr bits & fields + */ +enum { + TERM_MASK_LAYER = cpu_to_be32(0xf0000000), + TERM_MASK_ETYPE = cpu_to_be32(0x0f000000), + TERM_MASK_ECODE = cpu_to_be32(0x00ff0000), + TERM_FLAG_M = cpu_to_be32(0x00008000), + TERM_FLAG_D = cpu_to_be32(0x00004000), + TERM_FLAG_R = cpu_to_be32(0x00002000), + TERM_MASK_RESVD = cpu_to_be32(0x00001fff) +}; + +static inline u8 __rdmap_term_layer(struct iwarp_terminate *term) +{ + return term->layer; +} + +static inline void __rdmap_term_set_layer(struct iwarp_terminate *term, + u8 layer) +{ + term->layer = layer & 0xf; +} + +static inline u8 __rdmap_term_etype(struct iwarp_terminate *term) +{ + return term->etype; +} + +static inline void __rdmap_term_set_etype(struct iwarp_terminate *term, + u8 etype) +{ + term->etype = etype & 0xf; +} + +static inline u8 __rdmap_term_ecode(struct iwarp_terminate *term) +{ + return term->ecode; +} + +static inline void __rdmap_term_set_ecode(struct iwarp_terminate *term, + u8 ecode) +{ + term->ecode = ecode; +} + +/* + * Common portion of iWARP headers (MPA, DDP, RDMAP) + * for an FPDU carrying an untagged DDP segment + */ +struct iwarp_ctrl_untagged { + struct iwarp_ctrl ctrl; + __be32 rsvd; + __be32 ddp_qn; + __be32 ddp_msn; + __be32 ddp_mo; +}; + +/* + * Common portion of iWARP headers (MPA, DDP, RDMAP) + * for an FPDU carrying a tagged DDP segment + */ +struct iwarp_ctrl_tagged { + struct iwarp_ctrl ctrl; + __be32 ddp_stag; + __be64 ddp_to; +}; + +union iwarp_hdr { + struct iwarp_ctrl ctrl; + struct iwarp_ctrl_untagged c_untagged; + struct iwarp_ctrl_tagged c_tagged; + struct iwarp_rdma_write rwrite; + struct iwarp_rdma_rreq rreq; + struct iwarp_rdma_rresp rresp; + struct iwarp_terminate terminate; + struct iwarp_send send; + struct iwarp_send_inv send_inv; +}; + +enum term_elayer { + TERM_ERROR_LAYER_RDMAP = 0x00, + TERM_ERROR_LAYER_DDP = 0x01, + TERM_ERROR_LAYER_LLP = 0x02 /* eg., MPA */ +}; + +enum ddp_etype { + DDP_ETYPE_CATASTROPHIC = 0x0, + DDP_ETYPE_TAGGED_BUF = 0x1, + DDP_ETYPE_UNTAGGED_BUF = 0x2, + DDP_ETYPE_RSVD = 0x3 +}; + +enum ddp_ecode { + /* unspecified, set to zero */ + DDP_ECODE_CATASTROPHIC = 0x00, + /* Tagged Buffer Errors */ + DDP_ECODE_T_INVALID_STAG = 0x00, + DDP_ECODE_T_BASE_BOUNDS = 0x01, + DDP_ECODE_T_STAG_NOT_ASSOC = 0x02, + DDP_ECODE_T_TO_WRAP = 0x03, + DDP_ECODE_T_VERSION = 0x04, + /* Untagged Buffer Errors */ + DDP_ECODE_UT_INVALID_QN = 0x01, + DDP_ECODE_UT_INVALID_MSN_NOBUF = 0x02, + DDP_ECODE_UT_INVALID_MSN_RANGE = 0x03, + DDP_ECODE_UT_INVALID_MO = 0x04, + DDP_ECODE_UT_MSG_TOOLONG = 0x05, + DDP_ECODE_UT_VERSION = 0x06 +}; + +enum rdmap_untagged_qn { + RDMAP_UNTAGGED_QN_SEND = 0, + RDMAP_UNTAGGED_QN_RDMA_READ = 1, + RDMAP_UNTAGGED_QN_TERMINATE = 2, + RDMAP_UNTAGGED_QN_COUNT = 3 +}; + +enum rdmap_etype { + RDMAP_ETYPE_CATASTROPHIC = 0x0, + RDMAP_ETYPE_REMOTE_PROTECTION = 0x1, + RDMAP_ETYPE_REMOTE_OPERATION = 0x2 +}; + +enum rdmap_ecode { + RDMAP_ECODE_INVALID_STAG = 0x00, + RDMAP_ECODE_BASE_BOUNDS = 0x01, + RDMAP_ECODE_ACCESS_RIGHTS = 0x02, + RDMAP_ECODE_STAG_NOT_ASSOC = 0x03, + RDMAP_ECODE_TO_WRAP = 0x04, + RDMAP_ECODE_VERSION = 0x05, + RDMAP_ECODE_OPCODE = 0x06, + RDMAP_ECODE_CATASTROPHIC_STREAM = 0x07, + RDMAP_ECODE_CATASTROPHIC_GLOBAL = 0x08, + RDMAP_ECODE_CANNOT_INVALIDATE = 0x09, + RDMAP_ECODE_UNSPECIFIED = 0xff +}; + +enum llp_ecode { + LLP_ECODE_TCP_STREAM_LOST = 0x01, /* How to transfer this ?? */ + LLP_ECODE_RECEIVED_CRC = 0x02, + LLP_ECODE_FPDU_START = 0x03, + LLP_ECODE_INVALID_REQ_RESP = 0x04, + + /* Errors for Enhanced Connection Establishment only */ + LLP_ECODE_LOCAL_CATASTROPHIC = 0x05, + LLP_ECODE_INSUFFICIENT_IRD = 0x06, + LLP_ECODE_NO_MATCHING_RTR = 0x07 +}; + +enum llp_etype { LLP_ETYPE_MPA = 0x00 }; + +enum rdma_opcode { + RDMAP_RDMA_WRITE = 0x0, + RDMAP_RDMA_READ_REQ = 0x1, + RDMAP_RDMA_READ_RESP = 0x2, + RDMAP_SEND = 0x3, + RDMAP_SEND_INVAL = 0x4, + RDMAP_SEND_SE = 0x5, + RDMAP_SEND_SE_INVAL = 0x6, + RDMAP_TERMINATE = 0x7, + RDMAP_NOT_SUPPORTED = RDMAP_TERMINATE + 1 +}; + +#endif diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h new file mode 100644 index 000000000000..03fd7b2f595f --- /dev/null +++ b/drivers/infiniband/sw/siw/siw.h @@ -0,0 +1,745 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_H +#define _SIW_H + +#include <rdma/ib_verbs.h> +#include <linux/socket.h> +#include <linux/skbuff.h> +#include <crypto/hash.h> +#include <linux/crc32.h> +#include <linux/crc32c.h> + +#include <rdma/siw-abi.h> +#include "iwarp.h" + +#define SIW_VENDOR_ID 0x626d74 /* ascii 'bmt' for now */ +#define SIW_VENDORT_PART_ID 0 +#define SIW_MAX_QP (1024 * 100) +#define SIW_MAX_QP_WR (1024 * 32) +#define SIW_MAX_ORD_QP 128 +#define SIW_MAX_IRD_QP 128 +#define SIW_MAX_SGE_PBL 256 /* max num sge's for PBL */ +#define SIW_MAX_SGE_RD 1 /* iwarp limitation. we could relax */ +#define SIW_MAX_CQ (1024 * 100) +#define SIW_MAX_CQE (SIW_MAX_QP_WR * 100) +#define SIW_MAX_MR (SIW_MAX_QP * 10) +#define SIW_MAX_PD SIW_MAX_QP +#define SIW_MAX_MW 0 /* to be set if MW's are supported */ +#define SIW_MAX_FMR SIW_MAX_MR +#define SIW_MAX_SRQ SIW_MAX_QP +#define SIW_MAX_SRQ_WR (SIW_MAX_QP_WR * 10) +#define SIW_MAX_CONTEXT SIW_MAX_PD + +/* Min number of bytes for using zero copy transmit */ +#define SENDPAGE_THRESH PAGE_SIZE + +/* Maximum number of frames which can be send in one SQ processing */ +#define SQ_USER_MAXBURST 100 + +/* Maximum number of consecutive IRQ elements which get served + * if SQ has pending work. Prevents starving local SQ processing + * by serving peer Read Requests. + */ +#define SIW_IRQ_MAXBURST_SQ_ACTIVE 4 + +struct siw_dev_cap { + int max_qp; + int max_qp_wr; + int max_ord; /* max. outbound read queue depth */ + int max_ird; /* max. inbound read queue depth */ + int max_sge; + int max_sge_rd; + int max_cq; + int max_cqe; + int max_mr; + int max_pd; + int max_mw; + int max_fmr; + int max_srq; + int max_srq_wr; + int max_srq_sge; +}; + +struct siw_pd { + struct ib_pd base_pd; +}; + +struct siw_device { + struct ib_device base_dev; + struct net_device *netdev; + struct siw_dev_cap attrs; + + u32 vendor_part_id; + int numa_node; + + /* physical port state (only one port per device) */ + enum ib_port_state state; + + spinlock_t lock; + + struct xarray qp_xa; + struct xarray mem_xa; + + struct list_head cep_list; + struct list_head qp_list; + + /* active objects statistics to enforce limits */ + atomic_t num_qp; + atomic_t num_cq; + atomic_t num_pd; + atomic_t num_mr; + atomic_t num_srq; + atomic_t num_ctx; + + struct work_struct netdev_down; +}; + +struct siw_uobj { + void *addr; + u32 size; +}; + +struct siw_ucontext { + struct ib_ucontext base_ucontext; + struct siw_device *sdev; + + /* xarray of user mappable objects */ + struct xarray xa; + u32 uobj_nextkey; +}; + +/* + * The RDMA core does not define LOCAL_READ access, which is always + * enabled implictely. + */ +#define IWARP_ACCESS_MASK \ + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | \ + IB_ACCESS_REMOTE_READ) + +/* + * siw presentation of user memory registered as source + * or target of RDMA operations. + */ + +struct siw_page_chunk { + struct page **plist; +}; + +struct siw_umem { + struct siw_page_chunk *page_chunk; + int num_pages; + bool writable; + u64 fp_addr; /* First page base address */ + struct mm_struct *owning_mm; +}; + +struct siw_pble { + u64 addr; /* Address of assigned user buffer */ + u64 size; /* Size of this entry */ + u64 pbl_off; /* Total offset from start of PBL */ +}; + +struct siw_pbl { + unsigned int num_buf; + unsigned int max_buf; + struct siw_pble pbe[1]; +}; + +struct siw_mr; + +/* + * Generic memory representation for registered siw memory. + * Memory lookup always via higher 24 bit of STag (STag index). + */ +struct siw_mem { + struct siw_device *sdev; + struct kref ref; + u64 va; /* VA of memory */ + u64 len; /* length of the memory buffer in bytes */ + u32 stag; /* iWarp memory access steering tag */ + u8 stag_valid; /* VALID or INVALID */ + u8 is_pbl; /* PBL or user space mem */ + u8 is_mw; /* Memory Region or Memory Window */ + enum ib_access_flags perms; /* local/remote READ & WRITE */ + union { + struct siw_umem *umem; + struct siw_pbl *pbl; + void *mem_obj; + }; + struct ib_pd *pd; +}; + +struct siw_mr { + struct ib_mr base_mr; + struct siw_mem *mem; + struct rcu_head rcu; +}; + +/* + * Error codes for local or remote + * access to registered memory + */ +enum siw_access_state { + E_ACCESS_OK, + E_STAG_INVALID, + E_BASE_BOUNDS, + E_ACCESS_PERM, + E_PD_MISMATCH +}; + +enum siw_wr_state { + SIW_WR_IDLE, + SIW_WR_QUEUED, /* processing has not started yet */ + SIW_WR_INPROGRESS /* initiated processing of the WR */ +}; + +/* The WQE currently being processed (RX or TX) */ +struct siw_wqe { + /* Copy of applications SQE or RQE */ + union { + struct siw_sqe sqe; + struct siw_rqe rqe; + }; + struct siw_mem *mem[SIW_MAX_SGE]; /* per sge's resolved mem */ + enum siw_wr_state wr_status; + enum siw_wc_status wc_status; + u32 bytes; /* total bytes to process */ + u32 processed; /* bytes processed */ +}; + +struct siw_cq { + struct ib_cq base_cq; + spinlock_t lock; + u64 *notify; + struct siw_cqe *queue; + u32 cq_put; + u32 cq_get; + u32 num_cqe; + bool kernel_verbs; + u32 xa_cq_index; /* mmap information for CQE array */ + u32 id; /* For debugging only */ +}; + +enum siw_qp_state { + SIW_QP_STATE_IDLE, + SIW_QP_STATE_RTR, + SIW_QP_STATE_RTS, + SIW_QP_STATE_CLOSING, + SIW_QP_STATE_TERMINATE, + SIW_QP_STATE_ERROR, + SIW_QP_STATE_COUNT +}; + +enum siw_qp_flags { + SIW_RDMA_BIND_ENABLED = (1 << 0), + SIW_RDMA_WRITE_ENABLED = (1 << 1), + SIW_RDMA_READ_ENABLED = (1 << 2), + SIW_SIGNAL_ALL_WR = (1 << 3), + SIW_MPA_CRC = (1 << 4), + SIW_QP_IN_DESTROY = (1 << 5) +}; + +enum siw_qp_attr_mask { + SIW_QP_ATTR_STATE = (1 << 0), + SIW_QP_ATTR_ACCESS_FLAGS = (1 << 1), + SIW_QP_ATTR_LLP_HANDLE = (1 << 2), + SIW_QP_ATTR_ORD = (1 << 3), + SIW_QP_ATTR_IRD = (1 << 4), + SIW_QP_ATTR_SQ_SIZE = (1 << 5), + SIW_QP_ATTR_RQ_SIZE = (1 << 6), + SIW_QP_ATTR_MPA = (1 << 7) +}; + +struct siw_srq { + struct ib_srq base_srq; + spinlock_t lock; + u32 max_sge; + u32 limit; /* low watermark for async event */ + struct siw_rqe *recvq; + u32 rq_put; + u32 rq_get; + u32 num_rqe; /* max # of wqe's allowed */ + u32 xa_srq_index; /* mmap information for SRQ array */ + char armed; /* inform user if limit hit */ + char kernel_verbs; /* '1' if kernel client */ +}; + +struct siw_qp_attrs { + enum siw_qp_state state; + u32 sq_size; + u32 rq_size; + u32 orq_size; + u32 irq_size; + u32 sq_max_sges; + u32 rq_max_sges; + enum siw_qp_flags flags; + + struct socket *sk; +}; + +enum siw_tx_ctx { + SIW_SEND_HDR, /* start or continue sending HDR */ + SIW_SEND_DATA, /* start or continue sending DDP payload */ + SIW_SEND_TRAILER, /* start or continue sending TRAILER */ + SIW_SEND_SHORT_FPDU/* send whole FPDU hdr|data|trailer at once */ +}; + +enum siw_rx_state { + SIW_GET_HDR, /* await new hdr or within hdr */ + SIW_GET_DATA_START, /* start of inbound DDP payload */ + SIW_GET_DATA_MORE, /* continuation of (misaligned) DDP payload */ + SIW_GET_TRAILER/* await new trailer or within trailer */ +}; + +struct siw_rx_stream { + struct sk_buff *skb; + int skb_new; /* pending unread bytes in skb */ + int skb_offset; /* offset in skb */ + int skb_copied; /* processed bytes in skb */ + + union iwarp_hdr hdr; + struct mpa_trailer trailer; + + enum siw_rx_state state; + + /* + * For each FPDU, main RX loop runs through 3 stages: + * Receiving protocol headers, placing DDP payload and receiving + * trailer information (CRC + possibly padding). + * Next two variables keep state on receive status of the + * current FPDU part (hdr, data, trailer). + */ + int fpdu_part_rcvd; /* bytes in pkt part copied */ + int fpdu_part_rem; /* bytes in pkt part not seen */ + + /* + * Next expected DDP MSN for each QN + + * expected steering tag + + * expected DDP tagget offset (all HBO) + */ + u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT]; + u32 ddp_stag; + u64 ddp_to; + u32 inval_stag; /* Stag to be invalidated */ + + struct shash_desc *mpa_crc_hd; + u8 rx_suspend : 1; + u8 pad : 2; /* # of pad bytes expected */ + u8 rdmap_op : 4; /* opcode of current frame */ +}; + +struct siw_rx_fpdu { + /* + * Local destination memory of inbound RDMA operation. + * Valid, according to wqe->wr_status + */ + struct siw_wqe wqe_active; + + unsigned int pbl_idx; /* Index into current PBL */ + unsigned int sge_idx; /* current sge in rx */ + unsigned int sge_off; /* already rcvd in curr. sge */ + + char first_ddp_seg; /* this is the first DDP seg */ + char more_ddp_segs; /* more DDP segs expected */ + u8 prev_rdmap_op : 4; /* opcode of prev frame */ +}; + +/* + * Shorthands for short packets w/o payload + * to be transmitted more efficient. + */ +struct siw_send_pkt { + struct iwarp_send send; + __be32 crc; +}; + +struct siw_write_pkt { + struct iwarp_rdma_write write; + __be32 crc; +}; + +struct siw_rreq_pkt { + struct iwarp_rdma_rreq rreq; + __be32 crc; +}; + +struct siw_rresp_pkt { + struct iwarp_rdma_rresp rresp; + __be32 crc; +}; + +struct siw_iwarp_tx { + union { + union iwarp_hdr hdr; + + /* Generic part of FPDU header */ + struct iwarp_ctrl ctrl; + struct iwarp_ctrl_untagged c_untagged; + struct iwarp_ctrl_tagged c_tagged; + + /* FPDU headers */ + struct iwarp_rdma_write rwrite; + struct iwarp_rdma_rreq rreq; + struct iwarp_rdma_rresp rresp; + struct iwarp_terminate terminate; + struct iwarp_send send; + struct iwarp_send_inv send_inv; + + /* complete short FPDUs */ + struct siw_send_pkt send_pkt; + struct siw_write_pkt write_pkt; + struct siw_rreq_pkt rreq_pkt; + struct siw_rresp_pkt rresp_pkt; + } pkt; + + struct mpa_trailer trailer; + /* DDP MSN for untagged messages */ + u32 ddp_msn[RDMAP_UNTAGGED_QN_COUNT]; + + enum siw_tx_ctx state; + u16 ctrl_len; /* ddp+rdmap hdr */ + u16 ctrl_sent; + int burst; + int bytes_unsent; /* ddp payload bytes */ + + struct shash_desc *mpa_crc_hd; + + u8 do_crc : 1; /* do crc for segment */ + u8 use_sendpage : 1; /* send w/o copy */ + u8 tx_suspend : 1; /* stop sending DDP segs. */ + u8 pad : 2; /* # pad in current fpdu */ + u8 orq_fence : 1; /* ORQ full or Send fenced */ + u8 in_syscall : 1; /* TX out of user context */ + u8 zcopy_tx : 1; /* Use TCP_SENDPAGE if possible */ + u8 gso_seg_limit; /* Maximum segments for GSO, 0 = unbound */ + + u16 fpdu_len; /* len of FPDU to tx */ + unsigned int tcp_seglen; /* remaining tcp seg space */ + + struct siw_wqe wqe_active; + + int pbl_idx; /* Index into current PBL */ + int sge_idx; /* current sge in tx */ + u32 sge_off; /* already sent in curr. sge */ +}; + +struct siw_qp { + struct siw_device *sdev; + struct ib_qp *ib_qp; + struct kref ref; + u32 qp_num; + struct list_head devq; + int tx_cpu; + bool kernel_verbs; + struct siw_qp_attrs attrs; + + struct siw_cep *cep; + struct rw_semaphore state_lock; + + struct ib_pd *pd; + struct siw_cq *scq; + struct siw_cq *rcq; + struct siw_srq *srq; + + struct siw_iwarp_tx tx_ctx; /* Transmit context */ + spinlock_t sq_lock; + struct siw_sqe *sendq; /* send queue element array */ + uint32_t sq_get; /* consumer index into sq array */ + uint32_t sq_put; /* kernel prod. index into sq array */ + struct llist_node tx_list; + + struct siw_sqe *orq; /* outbound read queue element array */ + spinlock_t orq_lock; + uint32_t orq_get; /* consumer index into orq array */ + uint32_t orq_put; /* shared producer index for ORQ */ + + struct siw_rx_stream rx_stream; + struct siw_rx_fpdu *rx_fpdu; + struct siw_rx_fpdu rx_tagged; + struct siw_rx_fpdu rx_untagged; + spinlock_t rq_lock; + struct siw_rqe *recvq; /* recv queue element array */ + uint32_t rq_get; /* consumer index into rq array */ + uint32_t rq_put; /* kernel prod. index into rq array */ + + struct siw_sqe *irq; /* inbound read queue element array */ + uint32_t irq_get; /* consumer index into irq array */ + uint32_t irq_put; /* producer index into irq array */ + int irq_burst; + + struct { /* information to be carried in TERMINATE pkt, if valid */ + u8 valid; + u8 in_tx; + u8 layer : 4, etype : 4; + u8 ecode; + } term_info; + u32 xa_sq_index; /* mmap information for SQE array */ + u32 xa_rq_index; /* mmap information for RQE array */ + struct rcu_head rcu; +}; + +struct siw_base_qp { + struct ib_qp base_qp; + struct siw_qp *qp; +}; + +/* helper macros */ +#define rx_qp(rx) container_of(rx, struct siw_qp, rx_stream) +#define tx_qp(tx) container_of(tx, struct siw_qp, tx_ctx) +#define tx_wqe(qp) (&(qp)->tx_ctx.wqe_active) +#define rx_wqe(rctx) (&(rctx)->wqe_active) +#define rx_mem(rctx) ((rctx)->wqe_active.mem[0]) +#define tx_type(wqe) ((wqe)->sqe.opcode) +#define rx_type(wqe) ((wqe)->rqe.opcode) +#define tx_flags(wqe) ((wqe)->sqe.flags) + +struct iwarp_msg_info { + int hdr_len; + struct iwarp_ctrl ctrl; + int (*rx_data)(struct siw_qp *qp); +}; + +/* Global siw parameters. Currently set in siw_main.c */ +extern const bool zcopy_tx; +extern const bool try_gso; +extern const bool loopback_enabled; +extern const bool mpa_crc_required; +extern const bool mpa_crc_strict; +extern const bool siw_tcp_nagle; +extern u_char mpa_version; +extern const bool peer_to_peer; +extern struct task_struct *siw_tx_thread[]; + +extern struct crypto_shash *siw_crypto_shash; +extern struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1]; + +/* QP general functions */ +int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attr, + enum siw_qp_attr_mask mask); +int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl); +void siw_qp_llp_close(struct siw_qp *qp); +void siw_qp_cm_drop(struct siw_qp *qp, int schedule); +void siw_send_terminate(struct siw_qp *qp); + +void siw_qp_get_ref(struct ib_qp *qp); +void siw_qp_put_ref(struct ib_qp *qp); +int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp); +void siw_free_qp(struct kref *ref); + +void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, + u8 etype, u8 ecode, int in_tx); +enum ddp_ecode siw_tagged_error(enum siw_access_state state); +enum rdmap_ecode siw_rdmap_error(enum siw_access_state state); + +void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe); +int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes, + enum siw_wc_status status); +int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes, + u32 inval_stag, enum siw_wc_status status); +void siw_qp_llp_data_ready(struct sock *sk); +void siw_qp_llp_write_space(struct sock *sk); + +/* QP TX path functions */ +int siw_run_sq(void *arg); +int siw_qp_sq_process(struct siw_qp *qp); +int siw_sq_start(struct siw_qp *qp); +int siw_activate_tx(struct siw_qp *qp); +void siw_stop_tx_thread(int nr_cpu); +int siw_get_tx_cpu(struct siw_device *sdev); +void siw_put_tx_cpu(int cpu); + +/* QP RX path functions */ +int siw_proc_send(struct siw_qp *qp); +int siw_proc_rreq(struct siw_qp *qp); +int siw_proc_rresp(struct siw_qp *qp); +int siw_proc_write(struct siw_qp *qp); +int siw_proc_terminate(struct siw_qp *qp); + +int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int off, size_t len); + +static inline void set_rx_fpdu_context(struct siw_qp *qp, u8 opcode) +{ + if (opcode == RDMAP_RDMA_WRITE || opcode == RDMAP_RDMA_READ_RESP) + qp->rx_fpdu = &qp->rx_tagged; + else + qp->rx_fpdu = &qp->rx_untagged; + + qp->rx_stream.rdmap_op = opcode; +} + +static inline struct siw_ucontext *to_siw_ctx(struct ib_ucontext *base_ctx) +{ + return container_of(base_ctx, struct siw_ucontext, base_ucontext); +} + +static inline struct siw_base_qp *to_siw_base_qp(struct ib_qp *base_qp) +{ + return container_of(base_qp, struct siw_base_qp, base_qp); +} + +static inline struct siw_qp *to_siw_qp(struct ib_qp *base_qp) +{ + return to_siw_base_qp(base_qp)->qp; +} + +static inline struct siw_cq *to_siw_cq(struct ib_cq *base_cq) +{ + return container_of(base_cq, struct siw_cq, base_cq); +} + +static inline struct siw_srq *to_siw_srq(struct ib_srq *base_srq) +{ + return container_of(base_srq, struct siw_srq, base_srq); +} + +static inline struct siw_device *to_siw_dev(struct ib_device *base_dev) +{ + return container_of(base_dev, struct siw_device, base_dev); +} + +static inline struct siw_mr *to_siw_mr(struct ib_mr *base_mr) +{ + return container_of(base_mr, struct siw_mr, base_mr); +} + +static inline struct siw_qp *siw_qp_id2obj(struct siw_device *sdev, int id) +{ + struct siw_qp *qp; + + rcu_read_lock(); + qp = xa_load(&sdev->qp_xa, id); + if (likely(qp && kref_get_unless_zero(&qp->ref))) { + rcu_read_unlock(); + return qp; + } + rcu_read_unlock(); + return NULL; +} + +static inline u32 qp_id(struct siw_qp *qp) +{ + return qp->qp_num; +} + +static inline void siw_qp_get(struct siw_qp *qp) +{ + kref_get(&qp->ref); +} + +static inline void siw_qp_put(struct siw_qp *qp) +{ + kref_put(&qp->ref, siw_free_qp); +} + +static inline int siw_sq_empty(struct siw_qp *qp) +{ + struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size]; + + return READ_ONCE(sqe->flags) == 0; +} + +static inline struct siw_sqe *sq_get_next(struct siw_qp *qp) +{ + struct siw_sqe *sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size]; + + if (READ_ONCE(sqe->flags) & SIW_WQE_VALID) + return sqe; + + return NULL; +} + +static inline struct siw_sqe *orq_get_current(struct siw_qp *qp) +{ + return &qp->orq[qp->orq_get % qp->attrs.orq_size]; +} + +static inline struct siw_sqe *orq_get_tail(struct siw_qp *qp) +{ + return &qp->orq[qp->orq_put % qp->attrs.orq_size]; +} + +static inline struct siw_sqe *orq_get_free(struct siw_qp *qp) +{ + struct siw_sqe *orq_e = orq_get_tail(qp); + + if (orq_e && READ_ONCE(orq_e->flags) == 0) + return orq_e; + + return NULL; +} + +static inline int siw_orq_empty(struct siw_qp *qp) +{ + return qp->orq[qp->orq_get % qp->attrs.orq_size].flags == 0 ? 1 : 0; +} + +static inline struct siw_sqe *irq_alloc_free(struct siw_qp *qp) +{ + struct siw_sqe *irq_e = &qp->irq[qp->irq_put % qp->attrs.irq_size]; + + if (READ_ONCE(irq_e->flags) == 0) { + qp->irq_put++; + return irq_e; + } + return NULL; +} + +static inline __wsum siw_csum_update(const void *buff, int len, __wsum sum) +{ + return (__force __wsum)crc32c((__force __u32)sum, buff, len); +} + +static inline __wsum siw_csum_combine(__wsum csum, __wsum csum2, int offset, + int len) +{ + return (__force __wsum)__crc32c_le_combine((__force __u32)csum, + (__force __u32)csum2, len); +} + +static inline void siw_crc_skb(struct siw_rx_stream *srx, unsigned int len) +{ + const struct skb_checksum_ops siw_cs_ops = { + .update = siw_csum_update, + .combine = siw_csum_combine, + }; + __wsum crc = *(u32 *)shash_desc_ctx(srx->mpa_crc_hd); + + crc = __skb_checksum(srx->skb, srx->skb_offset, len, crc, + &siw_cs_ops); + *(u32 *)shash_desc_ctx(srx->mpa_crc_hd) = crc; +} + +#define siw_dbg(ibdev, fmt, ...) \ + ibdev_dbg(ibdev, "%s: " fmt, __func__, ##__VA_ARGS__) + +#define siw_dbg_qp(qp, fmt, ...) \ + ibdev_dbg(&qp->sdev->base_dev, "QP[%u] %s: " fmt, qp_id(qp), __func__, \ + ##__VA_ARGS__) + +#define siw_dbg_cq(cq, fmt, ...) \ + ibdev_dbg(cq->base_cq.device, "CQ[%u] %s: " fmt, cq->id, __func__, \ + ##__VA_ARGS__) + +#define siw_dbg_pd(pd, fmt, ...) \ + ibdev_dbg(pd->device, "PD[%u] %s: " fmt, pd->res.id, __func__, \ + ##__VA_ARGS__) + +#define siw_dbg_mem(mem, fmt, ...) \ + ibdev_dbg(&mem->sdev->base_dev, \ + "MEM[0x%08x] %s: " fmt, mem->stag, __func__, ##__VA_ARGS__) + +#define siw_dbg_cep(cep, fmt, ...) \ + ibdev_dbg(&cep->sdev->base_dev, "CEP[0x%p] %s: " fmt, \ + cep, __func__, ##__VA_ARGS__) + +void siw_cq_flush(struct siw_cq *cq); +void siw_sq_flush(struct siw_qp *qp); +void siw_rq_flush(struct siw_qp *qp); +int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc); + +#endif diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c new file mode 100644 index 000000000000..a7cde98e73e8 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -0,0 +1,2070 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Fredy Neeser */ +/* Greg Joyce <greg@opengridcomputing.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ +/* Copyright (c) 2017, Open Grid Computing, Inc. */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/net.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#include <linux/workqueue.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <linux/inet.h> +#include <linux/tcp.h> + +#include <rdma/iw_cm.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_user_verbs.h> + +#include "siw.h" +#include "siw_cm.h" + +/* + * Set to any combination of + * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR + */ +static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR; +static const bool relaxed_ird_negotiation = 1; + +static void siw_cm_llp_state_change(struct sock *s); +static void siw_cm_llp_data_ready(struct sock *s); +static void siw_cm_llp_write_space(struct sock *s); +static void siw_cm_llp_error_report(struct sock *s); +static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, + int status); + +static void siw_sk_assign_cm_upcalls(struct sock *sk) +{ + write_lock_bh(&sk->sk_callback_lock); + sk->sk_state_change = siw_cm_llp_state_change; + sk->sk_data_ready = siw_cm_llp_data_ready; + sk->sk_write_space = siw_cm_llp_write_space; + sk->sk_error_report = siw_cm_llp_error_report; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void siw_sk_save_upcalls(struct sock *sk) +{ + struct siw_cep *cep = sk_to_cep(sk); + + write_lock_bh(&sk->sk_callback_lock); + cep->sk_state_change = sk->sk_state_change; + cep->sk_data_ready = sk->sk_data_ready; + cep->sk_write_space = sk->sk_write_space; + cep->sk_error_report = sk->sk_error_report; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep) +{ + sk->sk_state_change = cep->sk_state_change; + sk->sk_data_ready = cep->sk_data_ready; + sk->sk_write_space = cep->sk_write_space; + sk->sk_error_report = cep->sk_error_report; + sk->sk_user_data = NULL; +} + +static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp) +{ + struct socket *s = cep->sock; + struct sock *sk = s->sk; + + write_lock_bh(&sk->sk_callback_lock); + + qp->attrs.sk = s; + sk->sk_data_ready = siw_qp_llp_data_ready; + sk->sk_write_space = siw_qp_llp_write_space; + + write_unlock_bh(&sk->sk_callback_lock); +} + +static void siw_socket_disassoc(struct socket *s) +{ + struct sock *sk = s->sk; + struct siw_cep *cep; + + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + cep = sk_to_cep(sk); + if (cep) { + siw_sk_restore_upcalls(sk, cep); + siw_cep_put(cep); + } else { + pr_warn("siw: cannot restore sk callbacks: no ep\n"); + } + write_unlock_bh(&sk->sk_callback_lock); + } else { + pr_warn("siw: cannot restore sk callbacks: no sk\n"); + } +} + +static void siw_rtr_data_ready(struct sock *sk) +{ + struct siw_cep *cep; + struct siw_qp *qp = NULL; + read_descriptor_t rd_desc; + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) { + WARN(1, "No connection endpoint\n"); + goto out; + } + qp = sk_to_qp(sk); + + memset(&rd_desc, 0, sizeof(rd_desc)); + rd_desc.arg.data = qp; + rd_desc.count = 1; + + tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); + /* + * Check if first frame was successfully processed. + * Signal connection full establishment if yes. + * Failed data processing would have already scheduled + * connection drop. + */ + if (!qp->rx_stream.rx_suspend) + siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); +out: + read_unlock(&sk->sk_callback_lock); + if (qp) + siw_qp_socket_assoc(cep, qp); +} + +static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep) +{ + struct sock *sk = cep->sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + sk->sk_data_ready = siw_rtr_data_ready; + sk->sk_write_space = siw_qp_llp_write_space; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s) +{ + cep->sock = s; + siw_cep_get(cep); + s->sk->sk_user_data = cep; + + siw_sk_save_upcalls(s->sk); + siw_sk_assign_cm_upcalls(s->sk); +} + +static struct siw_cep *siw_cep_alloc(struct siw_device *sdev) +{ + struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL); + unsigned long flags; + + if (!cep) + return NULL; + + INIT_LIST_HEAD(&cep->listenq); + INIT_LIST_HEAD(&cep->devq); + INIT_LIST_HEAD(&cep->work_freelist); + + kref_init(&cep->ref); + cep->state = SIW_EPSTATE_IDLE; + init_waitqueue_head(&cep->waitq); + spin_lock_init(&cep->lock); + cep->sdev = sdev; + cep->enhanced_rdma_conn_est = false; + + spin_lock_irqsave(&sdev->lock, flags); + list_add_tail(&cep->devq, &sdev->cep_list); + spin_unlock_irqrestore(&sdev->lock, flags); + + siw_dbg_cep(cep, "new endpoint\n"); + return cep; +} + +static void siw_cm_free_work(struct siw_cep *cep) +{ + struct list_head *w, *tmp; + struct siw_cm_work *work; + + list_for_each_safe(w, tmp, &cep->work_freelist) { + work = list_entry(w, struct siw_cm_work, list); + list_del(&work->list); + kfree(work); + } +} + +static void siw_cancel_mpatimer(struct siw_cep *cep) +{ + spin_lock_bh(&cep->lock); + if (cep->mpa_timer) { + if (cancel_delayed_work(&cep->mpa_timer->work)) { + siw_cep_put(cep); + kfree(cep->mpa_timer); /* not needed again */ + } + cep->mpa_timer = NULL; + } + spin_unlock_bh(&cep->lock); +} + +static void siw_put_work(struct siw_cm_work *work) +{ + INIT_LIST_HEAD(&work->list); + spin_lock_bh(&work->cep->lock); + list_add(&work->list, &work->cep->work_freelist); + spin_unlock_bh(&work->cep->lock); +} + +static void siw_cep_set_inuse(struct siw_cep *cep) +{ + unsigned long flags; + int rv; +retry: + spin_lock_irqsave(&cep->lock, flags); + + if (cep->in_use) { + spin_unlock_irqrestore(&cep->lock, flags); + rv = wait_event_interruptible(cep->waitq, !cep->in_use); + if (signal_pending(current)) + flush_signals(current); + goto retry; + } else { + cep->in_use = 1; + spin_unlock_irqrestore(&cep->lock, flags); + } +} + +static void siw_cep_set_free(struct siw_cep *cep) +{ + unsigned long flags; + + spin_lock_irqsave(&cep->lock, flags); + cep->in_use = 0; + spin_unlock_irqrestore(&cep->lock, flags); + + wake_up(&cep->waitq); +} + +static void __siw_cep_dealloc(struct kref *ref) +{ + struct siw_cep *cep = container_of(ref, struct siw_cep, ref); + struct siw_device *sdev = cep->sdev; + unsigned long flags; + + WARN_ON(cep->listen_cep); + + /* kfree(NULL) is safe */ + kfree(cep->mpa.pdata); + spin_lock_bh(&cep->lock); + if (!list_empty(&cep->work_freelist)) + siw_cm_free_work(cep); + spin_unlock_bh(&cep->lock); + + spin_lock_irqsave(&sdev->lock, flags); + list_del(&cep->devq); + spin_unlock_irqrestore(&sdev->lock, flags); + + siw_dbg_cep(cep, "free endpoint\n"); + kfree(cep); +} + +static struct siw_cm_work *siw_get_work(struct siw_cep *cep) +{ + struct siw_cm_work *work = NULL; + + spin_lock_bh(&cep->lock); + if (!list_empty(&cep->work_freelist)) { + work = list_entry(cep->work_freelist.next, struct siw_cm_work, + list); + list_del_init(&work->list); + } + spin_unlock_bh(&cep->lock); + return work; +} + +static int siw_cm_alloc_work(struct siw_cep *cep, int num) +{ + struct siw_cm_work *work; + + while (num--) { + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) { + if (!(list_empty(&cep->work_freelist))) + siw_cm_free_work(cep); + return -ENOMEM; + } + work->cep = cep; + INIT_LIST_HEAD(&work->list); + list_add(&work->list, &cep->work_freelist); + } + return 0; +} + +/* + * siw_cm_upcall() + * + * Upcall to IWCM to inform about async connection events + */ +static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, + int status) +{ + struct iw_cm_event event; + struct iw_cm_id *id; + + memset(&event, 0, sizeof(event)); + event.status = status; + event.event = reason; + + if (reason == IW_CM_EVENT_CONNECT_REQUEST) { + event.provider_data = cep; + id = cep->listen_cep->cm_id; + } else { + id = cep->cm_id; + } + /* Signal IRD and ORD */ + if (reason == IW_CM_EVENT_ESTABLISHED || + reason == IW_CM_EVENT_CONNECT_REPLY) { + /* Signal negotiated IRD/ORD values we will use */ + event.ird = cep->ird; + event.ord = cep->ord; + } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) { + event.ird = cep->ord; + event.ord = cep->ird; + } + /* Signal private data and address information */ + if (reason == IW_CM_EVENT_CONNECT_REQUEST || + reason == IW_CM_EVENT_CONNECT_REPLY) { + u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); + + if (pd_len) { + /* + * hand over MPA private data + */ + event.private_data_len = pd_len; + event.private_data = cep->mpa.pdata; + + /* Hide MPA V2 IRD/ORD control */ + if (cep->enhanced_rdma_conn_est) { + event.private_data_len -= + sizeof(struct mpa_v2_data); + event.private_data += + sizeof(struct mpa_v2_data); + } + } + getname_local(cep->sock, &event.local_addr); + getname_peer(cep->sock, &event.remote_addr); + } + siw_dbg_cep(cep, "[QP %u]: id 0x%p, reason=%d, status=%d\n", + cep->qp ? qp_id(cep->qp) : -1, id, reason, status); + + return id->event_handler(id, &event); +} + +/* + * siw_qp_cm_drop() + * + * Drops established LLP connection if present and not already + * scheduled for dropping. Called from user context, SQ workqueue + * or receive IRQ. Caller signals if socket can be immediately + * closed (basically, if not in IRQ). + */ +void siw_qp_cm_drop(struct siw_qp *qp, int schedule) +{ + struct siw_cep *cep = qp->cep; + + qp->rx_stream.rx_suspend = 1; + qp->tx_ctx.tx_suspend = 1; + + if (!qp->cep) + return; + + if (schedule) { + siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP); + } else { + siw_cep_set_inuse(cep); + + if (cep->state == SIW_EPSTATE_CLOSED) { + siw_dbg_cep(cep, "already closed\n"); + goto out; + } + siw_dbg_cep(cep, "immediate close, state %d\n", cep->state); + + if (qp->term_info.valid) + siw_send_terminate(qp); + + if (cep->cm_id) { + switch (cep->state) { + case SIW_EPSTATE_AWAIT_MPAREP: + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -EINVAL); + break; + + case SIW_EPSTATE_RDMA_MODE: + siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + break; + + case SIW_EPSTATE_IDLE: + case SIW_EPSTATE_LISTENING: + case SIW_EPSTATE_CONNECTING: + case SIW_EPSTATE_AWAIT_MPAREQ: + case SIW_EPSTATE_RECVD_MPAREQ: + case SIW_EPSTATE_CLOSED: + default: + break; + } + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + siw_cep_put(cep); + } + cep->state = SIW_EPSTATE_CLOSED; + + if (cep->sock) { + siw_socket_disassoc(cep->sock); + /* + * Immediately close socket + */ + sock_release(cep->sock); + cep->sock = NULL; + } + if (cep->qp) { + cep->qp = NULL; + siw_qp_put(qp); + } +out: + siw_cep_set_free(cep); + } +} + +void siw_cep_put(struct siw_cep *cep) +{ + WARN_ON(kref_read(&cep->ref) < 1); + kref_put(&cep->ref, __siw_cep_dealloc); +} + +void siw_cep_get(struct siw_cep *cep) +{ + kref_get(&cep->ref); +} + +/* + * Expects params->pd_len in host byte order + */ +static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len) +{ + struct socket *s = cep->sock; + struct mpa_rr *rr = &cep->mpa.hdr; + struct kvec iov[3]; + struct msghdr msg; + int rv; + int iovec_num = 0; + int mpa_len; + + memset(&msg, 0, sizeof(msg)); + + iov[iovec_num].iov_base = rr; + iov[iovec_num].iov_len = sizeof(*rr); + mpa_len = sizeof(*rr); + + if (cep->enhanced_rdma_conn_est) { + iovec_num++; + iov[iovec_num].iov_base = &cep->mpa.v2_ctrl; + iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl); + mpa_len += sizeof(cep->mpa.v2_ctrl); + } + if (pd_len) { + iovec_num++; + iov[iovec_num].iov_base = (char *)pdata; + iov[iovec_num].iov_len = pd_len; + mpa_len += pd_len; + } + if (cep->enhanced_rdma_conn_est) + pd_len += sizeof(cep->mpa.v2_ctrl); + + rr->params.pd_len = cpu_to_be16(pd_len); + + rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len); + + return rv < 0 ? rv : 0; +} + +/* + * Receive MPA Request/Reply header. + * + * Returns 0 if complete MPA Request/Reply header including + * eventual private data was received. Returns -EAGAIN if + * header was partially received or negative error code otherwise. + * + * Context: May be called in process context only + */ +static int siw_recv_mpa_rr(struct siw_cep *cep) +{ + struct mpa_rr *hdr = &cep->mpa.hdr; + struct socket *s = cep->sock; + u16 pd_len; + int rcvd, to_rcv; + + if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { + rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd, + sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd, + 0); + if (rcvd <= 0) + return -ECONNABORTED; + + cep->mpa.bytes_rcvd += rcvd; + + if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) + return -EAGAIN; + + if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA) + return -EPROTO; + } + pd_len = be16_to_cpu(hdr->params.pd_len); + + /* + * At least the MPA Request/Reply header (frame not including + * private data) has been received. + * Receive (or continue receiving) any private data. + */ + to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr)); + + if (!to_rcv) { + /* + * We must have hdr->params.pd_len == 0 and thus received a + * complete MPA Request/Reply frame. + * Check against peer protocol violation. + */ + u32 word; + + rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT); + if (rcvd == -EAGAIN) + return 0; + + if (rcvd == 0) { + siw_dbg_cep(cep, "peer EOF\n"); + return -EPIPE; + } + if (rcvd < 0) { + siw_dbg_cep(cep, "error: %d\n", rcvd); + return rcvd; + } + siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd); + + return -EPROTO; + } + + /* + * At this point, we must have hdr->params.pd_len != 0. + * A private data buffer gets allocated if hdr->params.pd_len != 0. + */ + if (!cep->mpa.pdata) { + cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); + if (!cep->mpa.pdata) + return -ENOMEM; + } + rcvd = ksock_recv( + s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), + to_rcv + 4, MSG_DONTWAIT); + + if (rcvd < 0) + return rcvd; + + if (rcvd > to_rcv) + return -EPROTO; + + cep->mpa.bytes_rcvd += rcvd; + + if (to_rcv == rcvd) { + siw_dbg_cep(cep, "%d bytes private data received\n", pd_len); + return 0; + } + return -EAGAIN; +} + +/* + * siw_proc_mpareq() + * + * Read MPA Request from socket and signal new connection to IWCM + * if success. Caller must hold lock on corresponding listening CEP. + */ +static int siw_proc_mpareq(struct siw_cep *cep) +{ + struct mpa_rr *req; + int version, rv; + u16 pd_len; + + rv = siw_recv_mpa_rr(cep); + if (rv) + return rv; + + req = &cep->mpa.hdr; + + version = __mpa_rr_revision(req->params.bits); + pd_len = be16_to_cpu(req->params.pd_len); + + if (version > MPA_REVISION_2) + /* allow for 0, 1, and 2 only */ + return -EPROTO; + + if (memcmp(req->key, MPA_KEY_REQ, 16)) + return -EPROTO; + + /* Prepare for sending MPA reply */ + memcpy(req->key, MPA_KEY_REP, 16); + + if (version == MPA_REVISION_2 && + (req->params.bits & MPA_RR_FLAG_ENHANCED)) { + /* + * MPA version 2 must signal IRD/ORD values and P2P mode + * in private data if header flag MPA_RR_FLAG_ENHANCED + * is set. + */ + if (pd_len < sizeof(struct mpa_v2_data)) + goto reject_conn; + + cep->enhanced_rdma_conn_est = true; + } + + /* MPA Markers: currently not supported. Marker TX to be added. */ + if (req->params.bits & MPA_RR_FLAG_MARKERS) + goto reject_conn; + + if (req->params.bits & MPA_RR_FLAG_CRC) { + /* + * RFC 5044, page 27: CRC MUST be used if peer requests it. + * siw specific: 'mpa_crc_strict' parameter to reject + * connection with CRC if local CRC off enforced by + * 'mpa_crc_strict' module parameter. + */ + if (!mpa_crc_required && mpa_crc_strict) + goto reject_conn; + + /* Enable CRC if requested by module parameter */ + if (mpa_crc_required) + req->params.bits |= MPA_RR_FLAG_CRC; + } + if (cep->enhanced_rdma_conn_est) { + struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata; + + /* + * Peer requested ORD becomes requested local IRD, + * peer requested IRD becomes requested local ORD. + * IRD and ORD get limited by global maximum values. + */ + cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK; + cep->ord = min(cep->ord, SIW_MAX_ORD_QP); + cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK; + cep->ird = min(cep->ird, SIW_MAX_IRD_QP); + + /* May get overwritten by locally negotiated values */ + cep->mpa.v2_ctrl.ird = htons(cep->ird); + cep->mpa.v2_ctrl.ord = htons(cep->ord); + + /* + * Support for peer sent zero length Write or Read to + * let local side enter RTS. Writes are preferred. + * Sends would require pre-posting a Receive and are + * not supported. + * Propose zero length Write if none of Read and Write + * is indicated. + */ + if (v2->ird & MPA_V2_PEER_TO_PEER) { + cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; + + if (v2->ord & MPA_V2_RDMA_WRITE_RTR) + cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; + else if (v2->ord & MPA_V2_RDMA_READ_RTR) + cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR; + else + cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; + } + } + + cep->state = SIW_EPSTATE_RECVD_MPAREQ; + + /* Keep reference until IWCM accepts/rejects */ + siw_cep_get(cep); + rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); + if (rv) + siw_cep_put(cep); + + return rv; + +reject_conn: + siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n", + req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, + mpa_crc_required, mpa_crc_strict, + req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); + + req->params.bits &= ~MPA_RR_FLAG_MARKERS; + req->params.bits |= MPA_RR_FLAG_REJECT; + + if (!mpa_crc_required && mpa_crc_strict) + req->params.bits &= ~MPA_RR_FLAG_CRC; + + if (pd_len) + kfree(cep->mpa.pdata); + + cep->mpa.pdata = NULL; + + siw_send_mpareqrep(cep, NULL, 0); + + return -EOPNOTSUPP; +} + +static int siw_proc_mpareply(struct siw_cep *cep) +{ + struct siw_qp_attrs qp_attrs; + enum siw_qp_attr_mask qp_attr_mask; + struct siw_qp *qp = cep->qp; + struct mpa_rr *rep; + int rv; + u16 rep_ord; + u16 rep_ird; + bool ird_insufficient = false; + enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR; + + rv = siw_recv_mpa_rr(cep); + if (rv != -EAGAIN) + siw_cancel_mpatimer(cep); + if (rv) + goto out_err; + + rep = &cep->mpa.hdr; + + if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) { + /* allow for 0, 1, and 2 only */ + rv = -EPROTO; + goto out_err; + } + if (memcmp(rep->key, MPA_KEY_REP, 16)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, + LLP_ECODE_INVALID_REQ_RESP, 0); + siw_send_terminate(qp); + rv = -EPROTO; + goto out_err; + } + if (rep->params.bits & MPA_RR_FLAG_REJECT) { + siw_dbg_cep(cep, "got mpa reject\n"); + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); + + return -ECONNRESET; + } + if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) { + siw_dbg_cep(cep, "peer allows GSO on TX\n"); + qp->tx_ctx.gso_seg_limit = 0; + } + if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || + (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) || + (mpa_crc_strict && !mpa_crc_required && + (rep->params.bits & MPA_RR_FLAG_CRC))) { + siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n", + rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, + mpa_crc_required, mpa_crc_strict, + rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); + + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); + + return -EINVAL; + } + if (cep->enhanced_rdma_conn_est) { + struct mpa_v2_data *v2; + + if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 || + !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) { + /* + * Protocol failure: The responder MUST reply with + * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED. + */ + siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n", + __mpa_rr_revision(rep->params.bits), + rep->params.bits & MPA_RR_FLAG_ENHANCED ? + 1 : + 0); + + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ECONNRESET); + return -EINVAL; + } + v2 = (struct mpa_v2_data *)cep->mpa.pdata; + rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK; + rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK; + + if (cep->ird < rep_ord && + (relaxed_ird_negotiation == false || + rep_ord > cep->sdev->attrs.max_ird)) { + siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n", + cep->ird, rep_ord, + cep->sdev->attrs.max_ord); + ird_insufficient = true; + } + if (cep->ord > rep_ird && relaxed_ird_negotiation == false) { + siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord, + rep_ird); + ird_insufficient = true; + } + /* + * Always report negotiated peer values to user, + * even if IRD/ORD negotiation failed + */ + cep->ird = rep_ord; + cep->ord = rep_ird; + + if (ird_insufficient) { + /* + * If the initiator IRD is insuffient for the + * responder ORD, send a TERM. + */ + siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, + LLP_ETYPE_MPA, + LLP_ECODE_INSUFFICIENT_IRD, 0); + siw_send_terminate(qp); + rv = -ENOMEM; + goto out_err; + } + if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER) + mpa_p2p_mode = + cep->mpa.v2_ctrl_req.ord & + (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); + + /* + * Check if we requested P2P mode, and if peer agrees + */ + if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { + if ((mpa_p2p_mode & v2->ord) == 0) { + /* + * We requested RTR mode(s), but the peer + * did not pick any mode we support. + */ + siw_dbg_cep(cep, + "rtr mode: req %2x, got %2x\n", + mpa_p2p_mode, + v2->ord & (MPA_V2_RDMA_WRITE_RTR | + MPA_V2_RDMA_READ_RTR)); + + siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, + LLP_ETYPE_MPA, + LLP_ECODE_NO_MATCHING_RTR, + 0); + siw_send_terminate(qp); + rv = -EPROTO; + goto out_err; + } + mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR | + MPA_V2_RDMA_READ_RTR); + } + } + memset(&qp_attrs, 0, sizeof(qp_attrs)); + + if (rep->params.bits & MPA_RR_FLAG_CRC) + qp_attrs.flags = SIW_MPA_CRC; + + qp_attrs.irq_size = cep->ird; + qp_attrs.orq_size = cep->ord; + qp_attrs.sk = cep->sock; + qp_attrs.state = SIW_QP_STATE_RTS; + + qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | + SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA; + + /* Move socket RX/TX under QP control */ + down_write(&qp->state_lock); + if (qp->attrs.state > SIW_QP_STATE_RTR) { + rv = -EINVAL; + up_write(&qp->state_lock); + goto out_err; + } + rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask); + + siw_qp_socket_assoc(cep, qp); + + up_write(&qp->state_lock); + + /* Send extra RDMA frame to trigger peer RTS if negotiated */ + if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { + rv = siw_qp_mpa_rts(qp, mpa_p2p_mode); + if (rv) + goto out_err; + } + if (!rv) { + rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0); + if (!rv) + cep->state = SIW_EPSTATE_RDMA_MODE; + + return 0; + } + +out_err: + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); + + return rv; +} + +/* + * siw_accept_newconn - accept an incoming pending connection + * + */ +static void siw_accept_newconn(struct siw_cep *cep) +{ + struct socket *s = cep->sock; + struct socket *new_s = NULL; + struct siw_cep *new_cep = NULL; + int rv = 0; /* debug only. should disappear */ + + if (cep->state != SIW_EPSTATE_LISTENING) + goto error; + + new_cep = siw_cep_alloc(cep->sdev); + if (!new_cep) + goto error; + + /* + * 4: Allocate a sufficient number of work elements + * to allow concurrent handling of local + peer close + * events, MPA header processing + MPA timeout. + */ + if (siw_cm_alloc_work(new_cep, 4) != 0) + goto error; + + /* + * Copy saved socket callbacks from listening CEP + * and assign new socket with new CEP + */ + new_cep->sk_state_change = cep->sk_state_change; + new_cep->sk_data_ready = cep->sk_data_ready; + new_cep->sk_write_space = cep->sk_write_space; + new_cep->sk_error_report = cep->sk_error_report; + + rv = kernel_accept(s, &new_s, O_NONBLOCK); + if (rv != 0) { + /* + * Connection already aborted by peer..? + */ + siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv); + goto error; + } + new_cep->sock = new_s; + siw_cep_get(new_cep); + new_s->sk->sk_user_data = new_cep; + + siw_dbg_cep(cep, "listen socket 0x%p, new 0x%p\n", s, new_s); + + if (siw_tcp_nagle == false) { + int val = 1; + + rv = kernel_setsockopt(new_s, SOL_TCP, TCP_NODELAY, + (char *)&val, sizeof(val)); + if (rv) { + siw_dbg_cep(cep, "setsockopt NODELAY error: %d\n", rv); + goto error; + } + } + new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; + + rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); + if (rv) + goto error; + /* + * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep. + */ + new_cep->listen_cep = cep; + siw_cep_get(cep); + + if (atomic_read(&new_s->sk->sk_rmem_alloc)) { + /* + * MPA REQ already queued + */ + siw_dbg_cep(cep, "immediate mpa request\n"); + + siw_cep_set_inuse(new_cep); + rv = siw_proc_mpareq(new_cep); + siw_cep_set_free(new_cep); + + if (rv != -EAGAIN) { + siw_cep_put(cep); + new_cep->listen_cep = NULL; + if (rv) + goto error; + } + } + return; + +error: + if (new_cep) + siw_cep_put(new_cep); + + if (new_s) { + siw_socket_disassoc(new_s); + sock_release(new_s); + new_cep->sock = NULL; + } + siw_dbg_cep(cep, "error %d\n", rv); +} + +static void siw_cm_work_handler(struct work_struct *w) +{ + struct siw_cm_work *work; + struct siw_cep *cep; + int release_cep = 0, rv = 0; + + work = container_of(w, struct siw_cm_work, work.work); + cep = work->cep; + + siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n", + cep->qp ? qp_id(cep->qp) : -1, work->type, cep->state); + + siw_cep_set_inuse(cep); + + switch (work->type) { + case SIW_CM_WORK_ACCEPT: + siw_accept_newconn(cep); + break; + + case SIW_CM_WORK_READ_MPAHDR: + if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { + if (cep->listen_cep) { + siw_cep_set_inuse(cep->listen_cep); + + if (cep->listen_cep->state == + SIW_EPSTATE_LISTENING) + rv = siw_proc_mpareq(cep); + else + rv = -EFAULT; + + siw_cep_set_free(cep->listen_cep); + + if (rv != -EAGAIN) { + siw_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + if (rv) + siw_cep_put(cep); + } + } + } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { + rv = siw_proc_mpareply(cep); + } else { + /* + * CEP already moved out of MPA handshake. + * any connection management already done. + * silently ignore the mpa packet. + */ + if (cep->state == SIW_EPSTATE_RDMA_MODE) { + cep->sock->sk->sk_data_ready(cep->sock->sk); + siw_dbg_cep(cep, "already in RDMA mode"); + } else { + siw_dbg_cep(cep, "out of state: %d\n", + cep->state); + } + } + if (rv && rv != EAGAIN) + release_cep = 1; + break; + + case SIW_CM_WORK_CLOSE_LLP: + /* + * QP scheduled LLP close + */ + if (cep->qp && cep->qp->term_info.valid) + siw_send_terminate(cep->qp); + + if (cep->cm_id) + siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + + release_cep = 1; + break; + + case SIW_CM_WORK_PEER_CLOSE: + if (cep->cm_id) { + if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { + /* + * MPA reply not received, but connection drop + */ + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ECONNRESET); + } else if (cep->state == SIW_EPSTATE_RDMA_MODE) { + /* + * NOTE: IW_CM_EVENT_DISCONNECT is given just + * to transition IWCM into CLOSING. + */ + siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); + siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + } + /* + * for other states there is no connection + * known to the IWCM. + */ + } else { + if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) { + /* + * Wait for the ulp/CM to call accept/reject + */ + siw_dbg_cep(cep, + "mpa req recvd, wait for ULP\n"); + } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { + /* + * Socket close before MPA request received. + */ + siw_dbg_cep(cep, "no mpareq: drop listener\n"); + siw_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + } + } + release_cep = 1; + break; + + case SIW_CM_WORK_MPATIMEOUT: + cep->mpa_timer = NULL; + + if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { + /* + * MPA request timed out: + * Hide any partially received private data and signal + * timeout + */ + cep->mpa.hdr.params.pd_len = 0; + + if (cep->cm_id) + siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ETIMEDOUT); + release_cep = 1; + + } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { + /* + * No MPA request received after peer TCP stream setup. + */ + if (cep->listen_cep) { + siw_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + } + release_cep = 1; + } + break; + + default: + WARN(1, "Undefined CM work type: %d\n", work->type); + } + if (release_cep) { + siw_dbg_cep(cep, + "release: timer=%s, QP[%u], id 0x%p\n", + cep->mpa_timer ? "y" : "n", + cep->qp ? qp_id(cep->qp) : -1, cep->cm_id); + + siw_cancel_mpatimer(cep); + + cep->state = SIW_EPSTATE_CLOSED; + + if (cep->qp) { + struct siw_qp *qp = cep->qp; + /* + * Serialize a potential race with application + * closing the QP and calling siw_qp_cm_drop() + */ + siw_qp_get(qp); + siw_cep_set_free(cep); + + siw_qp_llp_close(qp); + siw_qp_put(qp); + + siw_cep_set_inuse(cep); + cep->qp = NULL; + siw_qp_put(qp); + } + if (cep->sock) { + siw_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + siw_cep_put(cep); + } + } + siw_cep_set_free(cep); + siw_put_work(work); + siw_cep_put(cep); +} + +static struct workqueue_struct *siw_cm_wq; + +int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type) +{ + struct siw_cm_work *work = siw_get_work(cep); + unsigned long delay = 0; + + if (!work) { + siw_dbg_cep(cep, "failed with no work available\n"); + return -ENOMEM; + } + work->type = type; + work->cep = cep; + + siw_cep_get(cep); + + INIT_DELAYED_WORK(&work->work, siw_cm_work_handler); + + if (type == SIW_CM_WORK_MPATIMEOUT) { + cep->mpa_timer = work; + + if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) + delay = MPAREQ_TIMEOUT; + else + delay = MPAREP_TIMEOUT; + } + siw_dbg_cep(cep, "[QP %u]: work type: %d, work 0x%p, timeout %lu\n", + cep->qp ? qp_id(cep->qp) : -1, type, work, delay); + + queue_delayed_work(siw_cm_wq, &work->work, delay); + + return 0; +} + +static void siw_cm_llp_data_ready(struct sock *sk) +{ + struct siw_cep *cep; + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) { + WARN_ON(1); + goto out; + } + siw_dbg_cep(cep, "state: %d\n", cep->state); + + switch (cep->state) { + case SIW_EPSTATE_RDMA_MODE: + /* fall through */ + case SIW_EPSTATE_LISTENING: + break; + + case SIW_EPSTATE_AWAIT_MPAREQ: + /* fall through */ + case SIW_EPSTATE_AWAIT_MPAREP: + siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR); + break; + + default: + siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state); + break; + } +out: + read_unlock(&sk->sk_callback_lock); +} + +static void siw_cm_llp_write_space(struct sock *sk) +{ + struct siw_cep *cep = sk_to_cep(sk); + + if (cep) + siw_dbg_cep(cep, "state: %d\n", cep->state); +} + +static void siw_cm_llp_error_report(struct sock *sk) +{ + struct siw_cep *cep = sk_to_cep(sk); + + if (cep) { + siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n", + sk->sk_err, sk->sk_state, cep->state); + cep->sk_error_report(sk); + } +} + +static void siw_cm_llp_state_change(struct sock *sk) +{ + struct siw_cep *cep; + void (*orig_state_change)(struct sock *s); + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) { + /* endpoint already disassociated */ + read_unlock(&sk->sk_callback_lock); + return; + } + orig_state_change = cep->sk_state_change; + + siw_dbg_cep(cep, "state: %d\n", cep->state); + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + /* + * handle accepting socket as special case where only + * new connection is possible + */ + siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT); + break; + + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + if (cep->qp) + cep->qp->tx_ctx.tx_suspend = 1; + siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE); + break; + + default: + siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state); + } + read_unlock(&sk->sk_callback_lock); + orig_state_change(sk); +} + +static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, + struct sockaddr *raddr) +{ + int rv, flags = 0, s_val = 1; + size_t size = laddr->sa_family == AF_INET ? + sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); + + /* + * Make address available again asap. + */ + rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, + sizeof(s_val)); + if (rv < 0) + return rv; + + rv = s->ops->bind(s, laddr, size); + if (rv < 0) + return rv; + + rv = s->ops->connect(s, raddr, size, flags); + + return rv < 0 ? rv : 0; +} + +int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) +{ + struct siw_device *sdev = to_siw_dev(id->device); + struct siw_qp *qp; + struct siw_cep *cep = NULL; + struct socket *s = NULL; + struct sockaddr *laddr = (struct sockaddr *)&id->local_addr, + *raddr = (struct sockaddr *)&id->remote_addr; + bool p2p_mode = peer_to_peer, v4 = true; + u16 pd_len = params->private_data_len; + int version = mpa_version, rv; + + if (pd_len > MPA_MAX_PRIVDATA) + return -EINVAL; + + if (params->ird > sdev->attrs.max_ird || + params->ord > sdev->attrs.max_ord) + return -ENOMEM; + + if (laddr->sa_family == AF_INET6) + v4 = false; + else if (laddr->sa_family != AF_INET) + return -EAFNOSUPPORT; + + /* + * Respect any iwarp port mapping: Use mapped remote address + * if valid. Local address must not be mapped, since siw + * uses kernel TCP stack. + */ + if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) || + to_sockaddr_in6(id->remote_addr).sin6_port != 0) + raddr = (struct sockaddr *)&id->m_remote_addr; + + qp = siw_qp_id2obj(sdev, params->qpn); + if (!qp) { + WARN(1, "[QP %u] does not exist\n", params->qpn); + rv = -EINVAL; + goto error; + } + if (v4) + siw_dbg_qp(qp, + "id 0x%p, pd_len %d, laddr %pI4 %d, raddr %pI4 %d\n", + id, pd_len, + &((struct sockaddr_in *)(laddr))->sin_addr, + ntohs(((struct sockaddr_in *)(laddr))->sin_port), + &((struct sockaddr_in *)(raddr))->sin_addr, + ntohs(((struct sockaddr_in *)(raddr))->sin_port)); + else + siw_dbg_qp(qp, + "id 0x%p, pd_len %d, laddr %pI6 %d, raddr %pI6 %d\n", + id, pd_len, + &((struct sockaddr_in6 *)(laddr))->sin6_addr, + ntohs(((struct sockaddr_in6 *)(laddr))->sin6_port), + &((struct sockaddr_in6 *)(raddr))->sin6_addr, + ntohs(((struct sockaddr_in6 *)(raddr))->sin6_port)); + + rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s); + if (rv < 0) + goto error; + + /* + * NOTE: For simplification, connect() is called in blocking + * mode. Might be reconsidered for async connection setup at + * TCP level. + */ + rv = kernel_bindconnect(s, laddr, raddr); + if (rv != 0) { + siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); + goto error; + } + if (siw_tcp_nagle == false) { + int val = 1; + + rv = kernel_setsockopt(s, SOL_TCP, TCP_NODELAY, (char *)&val, + sizeof(val)); + if (rv) { + siw_dbg_qp(qp, "setsockopt NODELAY error: %d\n", rv); + goto error; + } + } + cep = siw_cep_alloc(sdev); + if (!cep) { + rv = -ENOMEM; + goto error; + } + siw_cep_set_inuse(cep); + + /* Associate QP with CEP */ + siw_cep_get(cep); + qp->cep = cep; + + /* siw_qp_get(qp) already done by QP lookup */ + cep->qp = qp; + + id->add_ref(id); + cep->cm_id = id; + + /* + * 4: Allocate a sufficient number of work elements + * to allow concurrent handling of local + peer close + * events, MPA header processing + MPA timeout. + */ + rv = siw_cm_alloc_work(cep, 4); + if (rv != 0) { + rv = -ENOMEM; + goto error; + } + cep->ird = params->ird; + cep->ord = params->ord; + + if (p2p_mode && cep->ord == 0) + cep->ord = 1; + + cep->state = SIW_EPSTATE_CONNECTING; + + /* + * Associate CEP with socket + */ + siw_cep_socket_assoc(cep, s); + + cep->state = SIW_EPSTATE_AWAIT_MPAREP; + + /* + * Set MPA Request bits: CRC if required, no MPA Markers, + * MPA Rev. according to module parameter 'mpa_version', Key 'Request'. + */ + cep->mpa.hdr.params.bits = 0; + if (version > MPA_REVISION_2) { + pr_warn("Setting MPA version to %u\n", MPA_REVISION_2); + version = MPA_REVISION_2; + /* Adjust also module parameter */ + mpa_version = MPA_REVISION_2; + } + __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version); + + if (try_gso) + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP; + + if (mpa_crc_required) + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC; + + /* + * If MPA version == 2: + * o Include ORD and IRD. + * o Indicate peer-to-peer mode, if required by module + * parameter 'peer_to_peer'. + */ + if (version == MPA_REVISION_2) { + cep->enhanced_rdma_conn_est = true; + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED; + + cep->mpa.v2_ctrl.ird = htons(cep->ird); + cep->mpa.v2_ctrl.ord = htons(cep->ord); + + if (p2p_mode) { + cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; + cep->mpa.v2_ctrl.ord |= rtr_type; + } + /* Remember own P2P mode requested */ + cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird; + cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord; + } + memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16); + + rv = siw_send_mpareqrep(cep, params->private_data, pd_len); + /* + * Reset private data. + */ + cep->mpa.hdr.params.pd_len = 0; + + if (rv >= 0) { + rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT); + if (!rv) { + siw_dbg_cep(cep, "id 0x%p, [QP %u]: exit\n", id, + qp_id(qp)); + siw_cep_set_free(cep); + return 0; + } + } +error: + siw_dbg_qp(qp, "failed: %d\n", rv); + + if (cep) { + siw_socket_disassoc(s); + sock_release(s); + cep->sock = NULL; + + cep->qp = NULL; + + cep->cm_id = NULL; + id->rem_ref(id); + siw_cep_put(cep); + + qp->cep = NULL; + siw_cep_put(cep); + + cep->state = SIW_EPSTATE_CLOSED; + + siw_cep_set_free(cep); + + siw_cep_put(cep); + + } else if (s) { + sock_release(s); + } + siw_qp_put(qp); + + return rv; +} + +/* + * siw_accept - Let SoftiWARP accept an RDMA connection request + * + * @id: New connection management id to be used for accepted + * connection request + * @params: Connection parameters provided by ULP for accepting connection + * + * Transition QP to RTS state, associate new CM id @id with accepted CEP + * and get prepared for TCP input by installing socket callbacks. + * Then send MPA Reply and generate the "connection established" event. + * Socket callbacks must be installed before sending MPA Reply, because + * the latter may cause a first RDMA message to arrive from the RDMA Initiator + * side very quickly, at which time the socket callbacks must be ready. + */ +int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) +{ + struct siw_device *sdev = to_siw_dev(id->device); + struct siw_cep *cep = (struct siw_cep *)id->provider_data; + struct siw_qp *qp; + struct siw_qp_attrs qp_attrs; + int rv, max_priv_data = MPA_MAX_PRIVDATA; + bool wait_for_peer_rts = false; + + siw_cep_set_inuse(cep); + siw_cep_put(cep); + + /* Free lingering inbound private data */ + if (cep->mpa.hdr.params.pd_len) { + cep->mpa.hdr.params.pd_len = 0; + kfree(cep->mpa.pdata); + cep->mpa.pdata = NULL; + } + siw_cancel_mpatimer(cep); + + if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { + siw_dbg_cep(cep, "id 0x%p: out of state\n", id); + + siw_cep_set_free(cep); + siw_cep_put(cep); + + return -ECONNRESET; + } + qp = siw_qp_id2obj(sdev, params->qpn); + if (!qp) { + WARN(1, "[QP %d] does not exist\n", params->qpn); + siw_cep_set_free(cep); + siw_cep_put(cep); + + return -EINVAL; + } + down_write(&qp->state_lock); + if (qp->attrs.state > SIW_QP_STATE_RTR) { + rv = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + siw_dbg_cep(cep, "id 0x%p\n", id); + + if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) { + siw_dbg_cep(cep, "peer allows GSO on TX\n"); + qp->tx_ctx.gso_seg_limit = 0; + } + if (params->ord > sdev->attrs.max_ord || + params->ird > sdev->attrs.max_ird) { + siw_dbg_cep( + cep, + "id 0x%p, [QP %u]: ord %d (max %d), ird %d (max %d)\n", + id, qp_id(qp), params->ord, sdev->attrs.max_ord, + params->ird, sdev->attrs.max_ird); + rv = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + if (cep->enhanced_rdma_conn_est) + max_priv_data -= sizeof(struct mpa_v2_data); + + if (params->private_data_len > max_priv_data) { + siw_dbg_cep( + cep, + "id 0x%p, [QP %u]: private data length: %d (max %d)\n", + id, qp_id(qp), params->private_data_len, max_priv_data); + rv = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + if (cep->enhanced_rdma_conn_est) { + if (params->ord > cep->ord) { + if (relaxed_ird_negotiation) { + params->ord = cep->ord; + } else { + cep->ird = params->ird; + cep->ord = params->ord; + rv = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + } + if (params->ird < cep->ird) { + if (relaxed_ird_negotiation && + cep->ird <= sdev->attrs.max_ird) + params->ird = cep->ird; + else { + rv = -ENOMEM; + up_write(&qp->state_lock); + goto error; + } + } + if (cep->mpa.v2_ctrl.ord & + (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)) + wait_for_peer_rts = true; + /* + * Signal back negotiated IRD and ORD values + */ + cep->mpa.v2_ctrl.ord = + htons(params->ord & MPA_IRD_ORD_MASK) | + (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD); + cep->mpa.v2_ctrl.ird = + htons(params->ird & MPA_IRD_ORD_MASK) | + (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD); + } + cep->ird = params->ird; + cep->ord = params->ord; + + cep->cm_id = id; + id->add_ref(id); + + memset(&qp_attrs, 0, sizeof(qp_attrs)); + qp_attrs.orq_size = cep->ord; + qp_attrs.irq_size = cep->ird; + qp_attrs.sk = cep->sock; + if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC) + qp_attrs.flags = SIW_MPA_CRC; + qp_attrs.state = SIW_QP_STATE_RTS; + + siw_dbg_cep(cep, "id 0x%p, [QP%u]: moving to rts\n", id, qp_id(qp)); + + /* Associate QP with CEP */ + siw_cep_get(cep); + qp->cep = cep; + + /* siw_qp_get(qp) already done by QP lookup */ + cep->qp = qp; + + cep->state = SIW_EPSTATE_RDMA_MODE; + + /* Move socket RX/TX under QP control */ + rv = siw_qp_modify(qp, &qp_attrs, + SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | + SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | + SIW_QP_ATTR_MPA); + up_write(&qp->state_lock); + + if (rv) + goto error; + + siw_dbg_cep(cep, "id 0x%p, [QP %u]: send mpa reply, %d byte pdata\n", + id, qp_id(qp), params->private_data_len); + + rv = siw_send_mpareqrep(cep, params->private_data, + params->private_data_len); + if (rv != 0) + goto error; + + if (wait_for_peer_rts) { + siw_sk_assign_rtr_upcalls(cep); + } else { + siw_qp_socket_assoc(cep, qp); + rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); + if (rv) + goto error; + } + siw_cep_set_free(cep); + + return 0; +error: + siw_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + + cep->state = SIW_EPSTATE_CLOSED; + + if (cep->cm_id) { + cep->cm_id->rem_ref(id); + cep->cm_id = NULL; + } + if (qp->cep) { + siw_cep_put(cep); + qp->cep = NULL; + } + cep->qp = NULL; + siw_qp_put(qp); + + siw_cep_set_free(cep); + siw_cep_put(cep); + + return rv; +} + +/* + * siw_reject() + * + * Local connection reject case. Send private data back to peer, + * close connection and dereference connection id. + */ +int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len) +{ + struct siw_cep *cep = (struct siw_cep *)id->provider_data; + + siw_cep_set_inuse(cep); + siw_cep_put(cep); + + siw_cancel_mpatimer(cep); + + if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { + siw_dbg_cep(cep, "id 0x%p: out of state\n", id); + + siw_cep_set_free(cep); + siw_cep_put(cep); /* put last reference */ + + return -ECONNRESET; + } + siw_dbg_cep(cep, "id 0x%p, cep->state %d, pd_len %d\n", id, cep->state, + pd_len); + + if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) { + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ + siw_send_mpareqrep(cep, pdata, pd_len); + } + siw_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + + cep->state = SIW_EPSTATE_CLOSED; + + siw_cep_set_free(cep); + siw_cep_put(cep); + + return 0; +} + +static int siw_listen_address(struct iw_cm_id *id, int backlog, + struct sockaddr *laddr, int addr_family) +{ + struct socket *s; + struct siw_cep *cep = NULL; + struct siw_device *sdev = to_siw_dev(id->device); + int rv = 0, s_val; + + rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); + if (rv < 0) + return rv; + + /* + * Allow binding local port when still in TIME_WAIT from last close. + */ + s_val = 1; + rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, + sizeof(s_val)); + if (rv) { + siw_dbg(id->device, "id 0x%p: setsockopt error: %d\n", id, rv); + goto error; + } + rv = s->ops->bind(s, laddr, addr_family == AF_INET ? + sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + if (rv) { + siw_dbg(id->device, "id 0x%p: socket bind error: %d\n", id, rv); + goto error; + } + cep = siw_cep_alloc(sdev); + if (!cep) { + rv = -ENOMEM; + goto error; + } + siw_cep_socket_assoc(cep, s); + + rv = siw_cm_alloc_work(cep, backlog); + if (rv) { + siw_dbg(id->device, + "id 0x%p: alloc_work error %d, backlog %d\n", id, + rv, backlog); + goto error; + } + rv = s->ops->listen(s, backlog); + if (rv) { + siw_dbg(id->device, "id 0x%p: listen error %d\n", id, rv); + goto error; + } + cep->cm_id = id; + id->add_ref(id); + + /* + * In case of a wildcard rdma_listen on a multi-homed device, + * a listener's IWCM id is associated with more than one listening CEP. + * + * We currently use id->provider_data in three different ways: + * + * o For a listener's IWCM id, id->provider_data points to + * the list_head of the list of listening CEPs. + * Uses: siw_create_listen(), siw_destroy_listen() + * + * o For each accepted passive-side IWCM id, id->provider_data + * points to the CEP itself. This is a consequence of + * - siw_cm_upcall() setting event.provider_data = cep and + * - the IWCM's cm_conn_req_handler() setting provider_data of the + * new passive-side IWCM id equal to event.provider_data + * Uses: siw_accept(), siw_reject() + * + * o For an active-side IWCM id, id->provider_data is not used at all. + * + */ + if (!id->provider_data) { + id->provider_data = + kmalloc(sizeof(struct list_head), GFP_KERNEL); + if (!id->provider_data) { + rv = -ENOMEM; + goto error; + } + INIT_LIST_HEAD((struct list_head *)id->provider_data); + } + list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); + cep->state = SIW_EPSTATE_LISTENING; + + if (addr_family == AF_INET) + siw_dbg(id->device, "Listen at laddr %pI4 %u\n", + &(((struct sockaddr_in *)laddr)->sin_addr), + ((struct sockaddr_in *)laddr)->sin_port); + else + siw_dbg(id->device, "Listen at laddr %pI6 %u\n", + &(((struct sockaddr_in6 *)laddr)->sin6_addr), + ((struct sockaddr_in6 *)laddr)->sin6_port); + + return 0; + +error: + siw_dbg(id->device, "failed: %d\n", rv); + + if (cep) { + siw_cep_set_inuse(cep); + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + } + cep->sock = NULL; + siw_socket_disassoc(s); + cep->state = SIW_EPSTATE_CLOSED; + + siw_cep_set_free(cep); + siw_cep_put(cep); + } + sock_release(s); + + return rv; +} + +static void siw_drop_listeners(struct iw_cm_id *id) +{ + struct list_head *p, *tmp; + + /* + * In case of a wildcard rdma_listen on a multi-homed device, + * a listener's IWCM id is associated with more than one listening CEP. + */ + list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { + struct siw_cep *cep = list_entry(p, struct siw_cep, listenq); + + list_del(p); + + siw_dbg_cep(cep, "id 0x%p: drop cep, state %d\n", id, + cep->state); + + siw_cep_set_inuse(cep); + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + } + if (cep->sock) { + siw_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + cep->state = SIW_EPSTATE_CLOSED; + siw_cep_set_free(cep); + siw_cep_put(cep); + } +} + +/* + * siw_create_listen - Create resources for a listener's IWCM ID @id + * + * Listens on the socket addresses id->local_addr and id->remote_addr. + * + * If the listener's @id provides a specific local IP address, at most one + * listening socket is created and associated with @id. + * + * If the listener's @id provides the wildcard (zero) local IP address, + * a separate listen is performed for each local IP address of the device + * by creating a listening socket and binding to that local IP address. + * + */ +int siw_create_listen(struct iw_cm_id *id, int backlog) +{ + struct net_device *dev = to_siw_dev(id->device)->netdev; + int rv = 0, listeners = 0; + + siw_dbg(id->device, "id 0x%p: backlog %d\n", id, backlog); + + /* + * For each attached address of the interface, create a + * listening socket, if id->local_addr is the wildcard + * IP address or matches the IP address. + */ + if (id->local_addr.ss_family == AF_INET) { + struct in_device *in_dev = in_dev_get(dev); + struct sockaddr_in s_laddr, *s_raddr; + const struct in_ifaddr *ifa; + + memcpy(&s_laddr, &id->local_addr, sizeof(s_laddr)); + s_raddr = (struct sockaddr_in *)&id->remote_addr; + + siw_dbg(id->device, + "id 0x%p: laddr %pI4:%d, raddr %pI4:%d\n", + id, &s_laddr.sin_addr, ntohs(s_laddr.sin_port), + &s_raddr->sin_addr, ntohs(s_raddr->sin_port)); + + rtnl_lock(); + in_dev_for_each_ifa_rtnl(ifa, in_dev) { + if (ipv4_is_zeronet(s_laddr.sin_addr.s_addr) || + s_laddr.sin_addr.s_addr == ifa->ifa_address) { + s_laddr.sin_addr.s_addr = ifa->ifa_address; + + rv = siw_listen_address(id, backlog, + (struct sockaddr *)&s_laddr, + AF_INET); + if (!rv) + listeners++; + } + } + rtnl_unlock(); + in_dev_put(in_dev); + } else if (id->local_addr.ss_family == AF_INET6) { + struct inet6_dev *in6_dev = in6_dev_get(dev); + struct inet6_ifaddr *ifp; + struct sockaddr_in6 *s_laddr = &to_sockaddr_in6(id->local_addr), + *s_raddr = &to_sockaddr_in6(id->remote_addr); + + siw_dbg(id->device, + "id 0x%p: laddr %pI6:%d, raddr %pI6:%d\n", + id, &s_laddr->sin6_addr, ntohs(s_laddr->sin6_port), + &s_raddr->sin6_addr, ntohs(s_raddr->sin6_port)); + + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + struct sockaddr_in6 bind_addr; + + if (ipv6_addr_any(&s_laddr->sin6_addr) || + ipv6_addr_equal(&s_laddr->sin6_addr, &ifp->addr)) { + bind_addr.sin6_family = AF_INET6; + bind_addr.sin6_port = s_laddr->sin6_port; + bind_addr.sin6_flowinfo = 0; + bind_addr.sin6_addr = ifp->addr; + bind_addr.sin6_scope_id = dev->ifindex; + + rv = siw_listen_address(id, backlog, + (struct sockaddr *)&bind_addr, + AF_INET6); + if (!rv) + listeners++; + } + } + read_unlock_bh(&in6_dev->lock); + + in6_dev_put(in6_dev); + } else { + return -EAFNOSUPPORT; + } + if (listeners) + rv = 0; + else if (!rv) + rv = -EINVAL; + + siw_dbg(id->device, "id 0x%p: %s\n", id, rv ? "FAIL" : "OK"); + + return rv; +} + +int siw_destroy_listen(struct iw_cm_id *id) +{ + siw_dbg(id->device, "id 0x%p\n", id); + + if (!id->provider_data) { + siw_dbg(id->device, "id 0x%p: no cep(s)\n", id); + return 0; + } + siw_drop_listeners(id); + kfree(id->provider_data); + id->provider_data = NULL; + + return 0; +} + +int siw_cm_init(void) +{ + /* + * create_single_workqueue for strict ordering + */ + siw_cm_wq = create_singlethread_workqueue("siw_cm_wq"); + if (!siw_cm_wq) + return -ENOMEM; + + return 0; +} + +void siw_cm_exit(void) +{ + if (siw_cm_wq) { + flush_workqueue(siw_cm_wq); + destroy_workqueue(siw_cm_wq); + } +} diff --git a/drivers/infiniband/sw/siw/siw_cm.h b/drivers/infiniband/sw/siw/siw_cm.h new file mode 100644 index 000000000000..8c59cb3e2868 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_cm.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Greg Joyce <greg@opengridcomputing.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ +/* Copyright (c) 2017, Open Grid Computing, Inc. */ + +#ifndef _SIW_CM_H +#define _SIW_CM_H + +#include <net/sock.h> +#include <linux/tcp.h> + +#include <rdma/iw_cm.h> + +enum siw_cep_state { + SIW_EPSTATE_IDLE = 1, + SIW_EPSTATE_LISTENING, + SIW_EPSTATE_CONNECTING, + SIW_EPSTATE_AWAIT_MPAREQ, + SIW_EPSTATE_RECVD_MPAREQ, + SIW_EPSTATE_AWAIT_MPAREP, + SIW_EPSTATE_RDMA_MODE, + SIW_EPSTATE_CLOSED +}; + +struct siw_mpa_info { + struct mpa_rr hdr; /* peer mpa hdr in host byte order */ + struct mpa_v2_data v2_ctrl; + struct mpa_v2_data v2_ctrl_req; + char *pdata; + int bytes_rcvd; +}; + +struct siw_device; + +struct siw_cep { + struct iw_cm_id *cm_id; + struct siw_device *sdev; + struct list_head devq; + spinlock_t lock; + struct kref ref; + int in_use; + wait_queue_head_t waitq; + enum siw_cep_state state; + + struct list_head listenq; + struct siw_cep *listen_cep; + + struct siw_qp *qp; + struct socket *sock; + + struct siw_cm_work *mpa_timer; + struct list_head work_freelist; + + struct siw_mpa_info mpa; + int ord; + int ird; + bool enhanced_rdma_conn_est; + + /* Saved upcalls of socket */ + void (*sk_state_change)(struct sock *sk); + void (*sk_data_ready)(struct sock *sk); + void (*sk_write_space)(struct sock *sk); + void (*sk_error_report)(struct sock *sk); +}; + +/* + * Connection initiator waits 10 seconds to receive an + * MPA reply after sending out MPA request. Reponder waits for + * 5 seconds for MPA request to arrive if new TCP connection + * was set up. + */ +#define MPAREQ_TIMEOUT (HZ * 10) +#define MPAREP_TIMEOUT (HZ * 5) + +enum siw_work_type { + SIW_CM_WORK_ACCEPT = 1, + SIW_CM_WORK_READ_MPAHDR, + SIW_CM_WORK_CLOSE_LLP, /* close socket */ + SIW_CM_WORK_PEER_CLOSE, /* socket indicated peer close */ + SIW_CM_WORK_MPATIMEOUT +}; + +struct siw_cm_work { + struct delayed_work work; + struct list_head list; + enum siw_work_type type; + struct siw_cep *cep; +}; + +#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a))) +#define to_sockaddr_in6(a) (*(struct sockaddr_in6 *)(&(a))) + +static inline int getname_peer(struct socket *s, struct sockaddr_storage *a) +{ + return s->ops->getname(s, (struct sockaddr *)a, 1); +} + +static inline int getname_local(struct socket *s, struct sockaddr_storage *a) +{ + return s->ops->getname(s, (struct sockaddr *)a, 0); +} + +static inline int ksock_recv(struct socket *sock, char *buf, size_t size, + int flags) +{ + struct kvec iov = { buf, size }; + struct msghdr msg = { .msg_name = NULL, .msg_flags = flags }; + + return kernel_recvmsg(sock, &msg, &iov, 1, size, flags); +} + +int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *parm); +int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param); +int siw_reject(struct iw_cm_id *id, const void *data, u8 len); +int siw_create_listen(struct iw_cm_id *id, int backlog); +int siw_destroy_listen(struct iw_cm_id *id); + +void siw_cep_get(struct siw_cep *cep); +void siw_cep_put(struct siw_cep *cep); +int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type); + +int siw_cm_init(void); +void siw_cm_exit(void); + +/* + * TCP socket interface + */ +#define sk_to_qp(sk) (((struct siw_cep *)((sk)->sk_user_data))->qp) +#define sk_to_cep(sk) ((struct siw_cep *)((sk)->sk_user_data)) + +#endif diff --git a/drivers/infiniband/sw/siw/siw_cq.c b/drivers/infiniband/sw/siw/siw_cq.c new file mode 100644 index 000000000000..e381ae9b7d62 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_cq.c @@ -0,0 +1,101 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include <linux/errno.h> +#include <linux/types.h> + +#include <rdma/ib_verbs.h> + +#include "siw.h" + +static int map_wc_opcode[SIW_NUM_OPCODES] = { + [SIW_OP_WRITE] = IB_WC_RDMA_WRITE, + [SIW_OP_SEND] = IB_WC_SEND, + [SIW_OP_SEND_WITH_IMM] = IB_WC_SEND, + [SIW_OP_READ] = IB_WC_RDMA_READ, + [SIW_OP_READ_LOCAL_INV] = IB_WC_RDMA_READ, + [SIW_OP_COMP_AND_SWAP] = IB_WC_COMP_SWAP, + [SIW_OP_FETCH_AND_ADD] = IB_WC_FETCH_ADD, + [SIW_OP_INVAL_STAG] = IB_WC_LOCAL_INV, + [SIW_OP_REG_MR] = IB_WC_REG_MR, + [SIW_OP_RECEIVE] = IB_WC_RECV, + [SIW_OP_READ_RESPONSE] = -1 /* not used */ +}; + +static struct { + enum siw_wc_status siw; + enum ib_wc_status ib; +} map_cqe_status[SIW_NUM_WC_STATUS] = { + { SIW_WC_SUCCESS, IB_WC_SUCCESS }, + { SIW_WC_LOC_LEN_ERR, IB_WC_LOC_LEN_ERR }, + { SIW_WC_LOC_PROT_ERR, IB_WC_LOC_PROT_ERR }, + { SIW_WC_LOC_QP_OP_ERR, IB_WC_LOC_QP_OP_ERR }, + { SIW_WC_WR_FLUSH_ERR, IB_WC_WR_FLUSH_ERR }, + { SIW_WC_BAD_RESP_ERR, IB_WC_BAD_RESP_ERR }, + { SIW_WC_LOC_ACCESS_ERR, IB_WC_LOC_ACCESS_ERR }, + { SIW_WC_REM_ACCESS_ERR, IB_WC_REM_ACCESS_ERR }, + { SIW_WC_REM_INV_REQ_ERR, IB_WC_REM_INV_REQ_ERR }, + { SIW_WC_GENERAL_ERR, IB_WC_GENERAL_ERR } +}; + +/* + * Reap one CQE from the CQ. Only used by kernel clients + * during CQ normal operation. Might be called during CQ + * flush for user mapped CQE array as well. + */ +int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *wc) +{ + struct siw_cqe *cqe; + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + + cqe = &cq->queue[cq->cq_get % cq->num_cqe]; + if (READ_ONCE(cqe->flags) & SIW_WQE_VALID) { + memset(wc, 0, sizeof(*wc)); + wc->wr_id = cqe->id; + wc->status = map_cqe_status[cqe->status].ib; + wc->opcode = map_wc_opcode[cqe->opcode]; + wc->byte_len = cqe->bytes; + + /* + * During CQ flush, also user land CQE's may get + * reaped here, which do not hold a QP reference + * and do not qualify for memory extension verbs. + */ + if (likely(cq->kernel_verbs)) { + if (cqe->flags & SIW_WQE_REM_INVAL) { + wc->ex.invalidate_rkey = cqe->inval_stag; + wc->wc_flags = IB_WC_WITH_INVALIDATE; + } + wc->qp = cqe->base_qp; + siw_dbg_cq(cq, "idx %u, type %d, flags %2x, id 0x%p\n", + cq->cq_get % cq->num_cqe, cqe->opcode, + cqe->flags, (void *)cqe->id); + } + WRITE_ONCE(cqe->flags, 0); + cq->cq_get++; + + spin_unlock_irqrestore(&cq->lock, flags); + + return 1; + } + spin_unlock_irqrestore(&cq->lock, flags); + + return 0; +} + +/* + * siw_cq_flush() + * + * Flush all CQ elements. + */ +void siw_cq_flush(struct siw_cq *cq) +{ + struct ib_wc wc; + + while (siw_reap_cqe(cq, &wc)) + ; +} diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c new file mode 100644 index 000000000000..f55c4e80aea4 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -0,0 +1,685 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <net/net_namespace.h> +#include <linux/rtnetlink.h> +#include <linux/if_arp.h> +#include <linux/list.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/dma-mapping.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/rdma_netlink.h> +#include <linux/kthread.h> + +#include "siw.h" +#include "siw_verbs.h" + +MODULE_AUTHOR("Bernard Metzler"); +MODULE_DESCRIPTION("Software iWARP Driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +/* transmit from user buffer, if possible */ +const bool zcopy_tx = true; + +/* Restrict usage of GSO, if hardware peer iwarp is unable to process + * large packets. try_gso = true lets siw try to use local GSO, + * if peer agrees. Not using GSO severly limits siw maximum tx bandwidth. + */ +const bool try_gso; + +/* Attach siw also with loopback devices */ +const bool loopback_enabled = true; + +/* We try to negotiate CRC on, if true */ +const bool mpa_crc_required; + +/* MPA CRC on/off enforced */ +const bool mpa_crc_strict; + +/* Control TCP_NODELAY socket option */ +const bool siw_tcp_nagle; + +/* Select MPA version to be used during connection setup */ +u_char mpa_version = MPA_REVISION_2; + +/* Selects MPA P2P mode (additional handshake during connection + * setup, if true. + */ +const bool peer_to_peer; + +struct task_struct *siw_tx_thread[NR_CPUS]; +struct crypto_shash *siw_crypto_shash; + +static int siw_device_register(struct siw_device *sdev, const char *name) +{ + struct ib_device *base_dev = &sdev->base_dev; + static int dev_id = 1; + int rv; + + rv = ib_register_device(base_dev, name); + if (rv) { + pr_warn("siw: device registration error %d\n", rv); + return rv; + } + sdev->vendor_part_id = dev_id++; + + siw_dbg(base_dev, "HWaddr=%pM\n", sdev->netdev->dev_addr); + + return 0; +} + +static void siw_device_cleanup(struct ib_device *base_dev) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + xa_destroy(&sdev->qp_xa); + xa_destroy(&sdev->mem_xa); +} + +static int siw_create_tx_threads(void) +{ + int cpu, assigned = 0; + + for_each_online_cpu(cpu) { + /* Skip HT cores */ + if (cpu % cpumask_weight(topology_sibling_cpumask(cpu))) + continue; + + siw_tx_thread[cpu] = + kthread_create(siw_run_sq, (unsigned long *)(long)cpu, + "siw_tx/%d", cpu); + if (IS_ERR(siw_tx_thread[cpu])) { + siw_tx_thread[cpu] = NULL; + continue; + } + kthread_bind(siw_tx_thread[cpu], cpu); + + wake_up_process(siw_tx_thread[cpu]); + assigned++; + } + return assigned; +} + +static int siw_dev_qualified(struct net_device *netdev) +{ + /* + * Additional hardware support can be added here + * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see + * <linux/if_arp.h> for type identifiers. + */ + if (netdev->type == ARPHRD_ETHER || netdev->type == ARPHRD_IEEE802 || + (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) + return 1; + + return 0; +} + +static DEFINE_PER_CPU(atomic_t, siw_use_cnt); + +static struct { + struct cpumask **tx_valid_cpus; + int num_nodes; +} siw_cpu_info; + +static int siw_init_cpulist(void) +{ + int i, num_nodes = num_possible_nodes(); + + memset(siw_tx_thread, 0, sizeof(siw_tx_thread)); + + siw_cpu_info.num_nodes = num_nodes; + + siw_cpu_info.tx_valid_cpus = + kcalloc(num_nodes, sizeof(struct cpumask *), GFP_KERNEL); + if (!siw_cpu_info.tx_valid_cpus) { + siw_cpu_info.num_nodes = 0; + return -ENOMEM; + } + for (i = 0; i < siw_cpu_info.num_nodes; i++) { + siw_cpu_info.tx_valid_cpus[i] = + kzalloc(sizeof(struct cpumask), GFP_KERNEL); + if (!siw_cpu_info.tx_valid_cpus[i]) + goto out_err; + + cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); + } + for_each_possible_cpu(i) + cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); + + return 0; + +out_err: + siw_cpu_info.num_nodes = 0; + while (i) { + kfree(siw_cpu_info.tx_valid_cpus[i]); + siw_cpu_info.tx_valid_cpus[i--] = NULL; + } + kfree(siw_cpu_info.tx_valid_cpus); + siw_cpu_info.tx_valid_cpus = NULL; + + return -ENOMEM; +} + +static void siw_destroy_cpulist(void) +{ + int i = 0; + + while (i < siw_cpu_info.num_nodes) + kfree(siw_cpu_info.tx_valid_cpus[i++]); + + kfree(siw_cpu_info.tx_valid_cpus); +} + +/* + * Choose CPU with least number of active QP's from NUMA node of + * TX interface. + */ +int siw_get_tx_cpu(struct siw_device *sdev) +{ + const struct cpumask *tx_cpumask; + int i, num_cpus, cpu, min_use, node = sdev->numa_node, tx_cpu = -1; + + if (node < 0) + tx_cpumask = cpu_online_mask; + else + tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; + + num_cpus = cpumask_weight(tx_cpumask); + if (!num_cpus) { + /* no CPU on this NUMA node */ + tx_cpumask = cpu_online_mask; + num_cpus = cpumask_weight(tx_cpumask); + } + if (!num_cpus) + goto out; + + cpu = cpumask_first(tx_cpumask); + + for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; + i++, cpu = cpumask_next(cpu, tx_cpumask)) { + int usage; + + /* Skip any cores which have no TX thread */ + if (!siw_tx_thread[cpu]) + continue; + + usage = atomic_read(&per_cpu(siw_use_cnt, cpu)); + if (usage <= min_use) { + tx_cpu = cpu; + min_use = usage; + } + } + siw_dbg(&sdev->base_dev, + "tx cpu %d, node %d, %d qp's\n", tx_cpu, node, min_use); + +out: + if (tx_cpu >= 0) + atomic_inc(&per_cpu(siw_use_cnt, tx_cpu)); + else + pr_warn("siw: no tx cpu found\n"); + + return tx_cpu; +} + +void siw_put_tx_cpu(int cpu) +{ + atomic_dec(&per_cpu(siw_use_cnt, cpu)); +} + +static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id) +{ + struct siw_qp *qp = siw_qp_id2obj(to_siw_dev(base_dev), id); + + if (qp) { + /* + * siw_qp_id2obj() increments object reference count + */ + siw_qp_put(qp); + return qp->ib_qp; + } + return NULL; +} + +static void siw_verbs_sq_flush(struct ib_qp *base_qp) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + + down_write(&qp->state_lock); + siw_sq_flush(qp); + up_write(&qp->state_lock); +} + +static void siw_verbs_rq_flush(struct ib_qp *base_qp) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + + down_write(&qp->state_lock); + siw_rq_flush(qp); + up_write(&qp->state_lock); +} + +static const struct ib_device_ops siw_device_ops = { + .owner = THIS_MODULE, + .uverbs_abi_ver = SIW_ABI_VERSION, + .driver_id = RDMA_DRIVER_SIW, + + .alloc_mr = siw_alloc_mr, + .alloc_pd = siw_alloc_pd, + .alloc_ucontext = siw_alloc_ucontext, + .create_cq = siw_create_cq, + .create_qp = siw_create_qp, + .create_srq = siw_create_srq, + .dealloc_driver = siw_device_cleanup, + .dealloc_pd = siw_dealloc_pd, + .dealloc_ucontext = siw_dealloc_ucontext, + .dereg_mr = siw_dereg_mr, + .destroy_cq = siw_destroy_cq, + .destroy_qp = siw_destroy_qp, + .destroy_srq = siw_destroy_srq, + .drain_rq = siw_verbs_rq_flush, + .drain_sq = siw_verbs_sq_flush, + .get_dma_mr = siw_get_dma_mr, + .get_port_immutable = siw_get_port_immutable, + .iw_accept = siw_accept, + .iw_add_ref = siw_qp_get_ref, + .iw_connect = siw_connect, + .iw_create_listen = siw_create_listen, + .iw_destroy_listen = siw_destroy_listen, + .iw_get_qp = siw_get_base_qp, + .iw_reject = siw_reject, + .iw_rem_ref = siw_qp_put_ref, + .map_mr_sg = siw_map_mr_sg, + .mmap = siw_mmap, + .modify_qp = siw_verbs_modify_qp, + .modify_srq = siw_modify_srq, + .poll_cq = siw_poll_cq, + .post_recv = siw_post_receive, + .post_send = siw_post_send, + .post_srq_recv = siw_post_srq_recv, + .query_device = siw_query_device, + .query_gid = siw_query_gid, + .query_pkey = siw_query_pkey, + .query_port = siw_query_port, + .query_qp = siw_query_qp, + .query_srq = siw_query_srq, + .req_notify_cq = siw_req_notify_cq, + .reg_user_mr = siw_reg_user_mr, + + INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq), + INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd), + INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq), + INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext), +}; + +static struct siw_device *siw_device_create(struct net_device *netdev) +{ + struct siw_device *sdev = NULL; + struct ib_device *base_dev; + struct device *parent = netdev->dev.parent; + int rv; + + if (!parent) { + /* + * The loopback device has no parent device, + * so it appears as a top-level device. To support + * loopback device connectivity, take this device + * as the parent device. Skip all other devices + * w/o parent device. + */ + if (netdev->type != ARPHRD_LOOPBACK) { + pr_warn("siw: device %s error: no parent device\n", + netdev->name); + return NULL; + } + parent = &netdev->dev; + } + sdev = ib_alloc_device(siw_device, base_dev); + if (!sdev) + return NULL; + + base_dev = &sdev->base_dev; + + sdev->netdev = netdev; + + if (netdev->type != ARPHRD_LOOPBACK) { + memcpy(&base_dev->node_guid, netdev->dev_addr, 6); + } else { + /* + * The loopback device does not have a HW address, + * but connection mangagement lib expects gid != 0 + */ + size_t gidlen = min_t(size_t, strlen(base_dev->name), 6); + + memcpy(&base_dev->node_guid, base_dev->name, gidlen); + } + base_dev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ); + + base_dev->node_type = RDMA_NODE_RNIC; + memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, + sizeof(SIW_NODE_DESC_COMMON)); + + /* + * Current model (one-to-one device association): + * One Softiwarp device per net_device or, equivalently, + * per physical port. + */ + base_dev->phys_port_cnt = 1; + base_dev->dev.parent = parent; + base_dev->dev.dma_ops = &dma_virt_ops; + base_dev->num_comp_vectors = num_possible_cpus(); + + ib_set_device_ops(base_dev, &siw_device_ops); + rv = ib_device_set_netdev(base_dev, netdev, 1); + if (rv) + goto error; + + memcpy(base_dev->iw_ifname, netdev->name, + sizeof(base_dev->iw_ifname)); + + /* Disable TCP port mapping */ + base_dev->iw_driver_flags = IW_F_NO_PORT_MAP, + + sdev->attrs.max_qp = SIW_MAX_QP; + sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; + sdev->attrs.max_ord = SIW_MAX_ORD_QP; + sdev->attrs.max_ird = SIW_MAX_IRD_QP; + sdev->attrs.max_sge = SIW_MAX_SGE; + sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; + sdev->attrs.max_cq = SIW_MAX_CQ; + sdev->attrs.max_cqe = SIW_MAX_CQE; + sdev->attrs.max_mr = SIW_MAX_MR; + sdev->attrs.max_pd = SIW_MAX_PD; + sdev->attrs.max_mw = SIW_MAX_MW; + sdev->attrs.max_fmr = SIW_MAX_FMR; + sdev->attrs.max_srq = SIW_MAX_SRQ; + sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; + sdev->attrs.max_srq_sge = SIW_MAX_SGE; + + xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1); + xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1); + + INIT_LIST_HEAD(&sdev->cep_list); + INIT_LIST_HEAD(&sdev->qp_list); + + atomic_set(&sdev->num_ctx, 0); + atomic_set(&sdev->num_srq, 0); + atomic_set(&sdev->num_qp, 0); + atomic_set(&sdev->num_cq, 0); + atomic_set(&sdev->num_mr, 0); + atomic_set(&sdev->num_pd, 0); + + sdev->numa_node = dev_to_node(parent); + spin_lock_init(&sdev->lock); + + return sdev; +error: + ib_dealloc_device(base_dev); + + return NULL; +} + +/* + * Network link becomes unavailable. Mark all + * affected QP's accordingly. + */ +static void siw_netdev_down(struct work_struct *work) +{ + struct siw_device *sdev = + container_of(work, struct siw_device, netdev_down); + + struct siw_qp_attrs qp_attrs; + struct list_head *pos, *tmp; + + memset(&qp_attrs, 0, sizeof(qp_attrs)); + qp_attrs.state = SIW_QP_STATE_ERROR; + + list_for_each_safe(pos, tmp, &sdev->qp_list) { + struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); + + down_write(&qp->state_lock); + WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); + up_write(&qp->state_lock); + } + ib_device_put(&sdev->base_dev); +} + +static void siw_device_goes_down(struct siw_device *sdev) +{ + if (ib_device_try_get(&sdev->base_dev)) { + INIT_WORK(&sdev->netdev_down, siw_netdev_down); + schedule_work(&sdev->netdev_down); + } +} + +static int siw_netdev_event(struct notifier_block *nb, unsigned long event, + void *arg) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(arg); + struct ib_device *base_dev; + struct siw_device *sdev; + + dev_dbg(&netdev->dev, "siw: event %lu\n", event); + + if (dev_net(netdev) != &init_net) + return NOTIFY_OK; + + base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); + if (!base_dev) + return NOTIFY_OK; + + sdev = to_siw_dev(base_dev); + + switch (event) { + case NETDEV_UP: + sdev->state = IB_PORT_ACTIVE; + siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); + break; + + case NETDEV_GOING_DOWN: + siw_device_goes_down(sdev); + break; + + case NETDEV_DOWN: + sdev->state = IB_PORT_DOWN; + siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); + break; + + case NETDEV_REGISTER: + /* + * Device registration now handled only by + * rdma netlink commands. So it shall be impossible + * to end up here with a valid siw device. + */ + siw_dbg(base_dev, "unexpected NETDEV_REGISTER event\n"); + break; + + case NETDEV_UNREGISTER: + ib_unregister_device_queued(&sdev->base_dev); + break; + + case NETDEV_CHANGEADDR: + siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); + break; + /* + * Todo: Below netdev events are currently not handled. + */ + case NETDEV_CHANGEMTU: + case NETDEV_CHANGE: + break; + + default: + break; + } + ib_device_put(&sdev->base_dev); + + return NOTIFY_OK; +} + +static struct notifier_block siw_netdev_nb = { + .notifier_call = siw_netdev_event, +}; + +static int siw_newlink(const char *basedev_name, struct net_device *netdev) +{ + struct ib_device *base_dev; + struct siw_device *sdev = NULL; + int rv = -ENOMEM; + + if (!siw_dev_qualified(netdev)) + return -EINVAL; + + base_dev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW); + if (base_dev) { + ib_device_put(base_dev); + return -EEXIST; + } + sdev = siw_device_create(netdev); + if (sdev) { + dev_dbg(&netdev->dev, "siw: new device\n"); + + if (netif_running(netdev) && netif_carrier_ok(netdev)) + sdev->state = IB_PORT_ACTIVE; + else + sdev->state = IB_PORT_DOWN; + + rv = siw_device_register(sdev, basedev_name); + if (rv) + ib_dealloc_device(&sdev->base_dev); + } + return rv; +} + +static struct rdma_link_ops siw_link_ops = { + .type = "siw", + .newlink = siw_newlink, +}; + +/* + * siw_init_module - Initialize Softiwarp module and register with netdev + * subsystem. + */ +static __init int siw_init_module(void) +{ + int rv; + int nr_cpu; + + if (SENDPAGE_THRESH < SIW_MAX_INLINE) { + pr_info("siw: sendpage threshold too small: %u\n", + (int)SENDPAGE_THRESH); + rv = -EINVAL; + goto out_error; + } + rv = siw_init_cpulist(); + if (rv) + goto out_error; + + rv = siw_cm_init(); + if (rv) + goto out_error; + + if (!siw_create_tx_threads()) { + pr_info("siw: Could not start any TX thread\n"); + goto out_error; + } + /* + * Locate CRC32 algorithm. If unsuccessful, fail + * loading siw only, if CRC is required. + */ + siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(siw_crypto_shash)) { + pr_info("siw: Loading CRC32c failed: %ld\n", + PTR_ERR(siw_crypto_shash)); + siw_crypto_shash = NULL; + if (mpa_crc_required) { + rv = -EOPNOTSUPP; + goto out_error; + } + } + rv = register_netdevice_notifier(&siw_netdev_nb); + if (rv) + goto out_error; + + rdma_link_register(&siw_link_ops); + + pr_info("SoftiWARP attached\n"); + return 0; + +out_error: + for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) { + if (siw_tx_thread[nr_cpu]) { + siw_stop_tx_thread(nr_cpu); + siw_tx_thread[nr_cpu] = NULL; + } + } + if (siw_crypto_shash) + crypto_free_shash(siw_crypto_shash); + + pr_info("SoftIWARP attach failed. Error: %d\n", rv); + + siw_cm_exit(); + siw_destroy_cpulist(); + + return rv; +} + +static void __exit siw_exit_module(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (siw_tx_thread[cpu]) { + siw_stop_tx_thread(cpu); + siw_tx_thread[cpu] = NULL; + } + } + unregister_netdevice_notifier(&siw_netdev_nb); + rdma_link_unregister(&siw_link_ops); + ib_unregister_driver(RDMA_DRIVER_SIW); + + siw_cm_exit(); + + siw_destroy_cpulist(); + + if (siw_crypto_shash) + crypto_free_shash(siw_crypto_shash); + + pr_info("SoftiWARP detached\n"); +} + +module_init(siw_init_module); +module_exit(siw_exit_module); + +MODULE_ALIAS_RDMA_LINK("siw"); diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c new file mode 100644 index 000000000000..67171c82b0c4 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -0,0 +1,460 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include <linux/gfp.h> +#include <rdma/ib_verbs.h> +#include <linux/dma-mapping.h> +#include <linux/slab.h> +#include <linux/sched/mm.h> +#include <linux/resource.h> + +#include "siw.h" +#include "siw_mem.h" + +/* + * Stag lookup is based on its index part only (24 bits). + * The code avoids special Stag of zero and tries to randomize + * STag values between 1 and SIW_STAG_MAX_INDEX. + */ +int siw_mem_add(struct siw_device *sdev, struct siw_mem *m) +{ + struct xa_limit limit = XA_LIMIT(1, 0x00ffffff); + u32 id, next; + + get_random_bytes(&next, 4); + next &= 0x00ffffff; + + if (xa_alloc_cyclic(&sdev->mem_xa, &id, m, limit, &next, + GFP_KERNEL) < 0) + return -ENOMEM; + + /* Set the STag index part */ + m->stag = id << 8; + + siw_dbg_mem(m, "new MEM object\n"); + + return 0; +} + +/* + * siw_mem_id2obj() + * + * resolves memory from stag given by id. might be called from: + * o process context before sending out of sgl, or + * o in softirq when resolving target memory + */ +struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index) +{ + struct siw_mem *mem; + + rcu_read_lock(); + mem = xa_load(&sdev->mem_xa, stag_index); + if (likely(mem && kref_get_unless_zero(&mem->ref))) { + rcu_read_unlock(); + return mem; + } + rcu_read_unlock(); + + return NULL; +} + +static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages, + bool dirty) +{ + struct page **p = chunk->plist; + + while (num_pages--) { + if (!PageDirty(*p) && dirty) + put_user_pages_dirty_lock(p, 1); + else + put_user_page(*p); + p++; + } +} + +void siw_umem_release(struct siw_umem *umem, bool dirty) +{ + struct mm_struct *mm_s = umem->owning_mm; + int i, num_pages = umem->num_pages; + + for (i = 0; num_pages; i++) { + int to_free = min_t(int, PAGES_PER_CHUNK, num_pages); + + siw_free_plist(&umem->page_chunk[i], to_free, + umem->writable && dirty); + kfree(umem->page_chunk[i].plist); + num_pages -= to_free; + } + atomic64_sub(umem->num_pages, &mm_s->pinned_vm); + + mmdrop(mm_s); + kfree(umem->page_chunk); + kfree(umem); +} + +int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj, + u64 start, u64 len, int rights) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mem *mem = kzalloc(sizeof(*mem), GFP_KERNEL); + struct xa_limit limit = XA_LIMIT(1, 0x00ffffff); + u32 id, next; + + if (!mem) + return -ENOMEM; + + mem->mem_obj = mem_obj; + mem->stag_valid = 0; + mem->sdev = sdev; + mem->va = start; + mem->len = len; + mem->pd = pd; + mem->perms = rights & IWARP_ACCESS_MASK; + kref_init(&mem->ref); + + mr->mem = mem; + + get_random_bytes(&next, 4); + next &= 0x00ffffff; + + if (xa_alloc_cyclic(&sdev->mem_xa, &id, mem, limit, &next, + GFP_KERNEL) < 0) { + kfree(mem); + return -ENOMEM; + } + /* Set the STag index part */ + mem->stag = id << 8; + mr->base_mr.lkey = mr->base_mr.rkey = mem->stag; + + return 0; +} + +void siw_mr_drop_mem(struct siw_mr *mr) +{ + struct siw_mem *mem = mr->mem, *found; + + mem->stag_valid = 0; + + /* make STag invalid visible asap */ + smp_mb(); + + found = xa_erase(&mem->sdev->mem_xa, mem->stag >> 8); + WARN_ON(found != mem); + siw_mem_put(mem); +} + +void siw_free_mem(struct kref *ref) +{ + struct siw_mem *mem = container_of(ref, struct siw_mem, ref); + + siw_dbg_mem(mem, "free mem, pbl: %s\n", mem->is_pbl ? "y" : "n"); + + if (!mem->is_mw && mem->mem_obj) { + if (mem->is_pbl == 0) + siw_umem_release(mem->umem, true); + else + kfree(mem->pbl); + } + kfree(mem); +} + +/* + * siw_check_mem() + * + * Check protection domain, STAG state, access permissions and + * address range for memory object. + * + * @pd: Protection Domain memory should belong to + * @mem: memory to be checked + * @addr: starting addr of mem + * @perms: requested access permissions + * @len: len of memory interval to be checked + * + */ +int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, + enum ib_access_flags perms, int len) +{ + if (!mem->stag_valid) { + siw_dbg_pd(pd, "STag 0x%08x invalid\n", mem->stag); + return -E_STAG_INVALID; + } + if (mem->pd != pd) { + siw_dbg_pd(pd, "STag 0x%08x: PD mismatch\n", mem->stag); + return -E_PD_MISMATCH; + } + /* + * check access permissions + */ + if ((mem->perms & perms) < perms) { + siw_dbg_pd(pd, "permissions 0x%08x < 0x%08x\n", + mem->perms, perms); + return -E_ACCESS_PERM; + } + /* + * Check if access falls into valid memory interval. + */ + if (addr < mem->va || addr + len > mem->va + mem->len) { + siw_dbg_pd(pd, "MEM interval len %d\n", len); + siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] out of bounds\n", + (unsigned long long)addr, + (unsigned long long)(addr + len)); + siw_dbg_pd(pd, "[0x%016llx, 0x%016llx] STag=0x%08x\n", + (unsigned long long)mem->va, + (unsigned long long)(mem->va + mem->len), + mem->stag); + + return -E_BASE_BOUNDS; + } + return E_ACCESS_OK; +} + +/* + * siw_check_sge() + * + * Check SGE for access rights in given interval + * + * @pd: Protection Domain memory should belong to + * @sge: SGE to be checked + * @mem: location of memory reference within array + * @perms: requested access permissions + * @off: starting offset in SGE + * @len: len of memory interval to be checked + * + * NOTE: Function references SGE's memory object (mem->obj) + * if not yet done. New reference is kept if check went ok and + * released if check failed. If mem->obj is already valid, no new + * lookup is being done and mem is not released it check fails. + */ +int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, struct siw_mem *mem[], + enum ib_access_flags perms, u32 off, int len) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mem *new = NULL; + int rv = E_ACCESS_OK; + + if (len + off > sge->length) { + rv = -E_BASE_BOUNDS; + goto fail; + } + if (*mem == NULL) { + new = siw_mem_id2obj(sdev, sge->lkey >> 8); + if (unlikely(!new)) { + siw_dbg_pd(pd, "STag unknown: 0x%08x\n", sge->lkey); + rv = -E_STAG_INVALID; + goto fail; + } + *mem = new; + } + /* Check if user re-registered with different STag key */ + if (unlikely((*mem)->stag != sge->lkey)) { + siw_dbg_mem((*mem), "STag mismatch: 0x%08x\n", sge->lkey); + rv = -E_STAG_INVALID; + goto fail; + } + rv = siw_check_mem(pd, *mem, sge->laddr + off, perms, len); + if (unlikely(rv)) + goto fail; + + return 0; + +fail: + if (new) { + *mem = NULL; + siw_mem_put(new); + } + return rv; +} + +void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op) +{ + switch (op) { + case SIW_OP_SEND: + case SIW_OP_WRITE: + case SIW_OP_SEND_WITH_IMM: + case SIW_OP_SEND_REMOTE_INV: + case SIW_OP_READ: + case SIW_OP_READ_LOCAL_INV: + if (!(wqe->sqe.flags & SIW_WQE_INLINE)) + siw_unref_mem_sgl(wqe->mem, wqe->sqe.num_sge); + break; + + case SIW_OP_RECEIVE: + siw_unref_mem_sgl(wqe->mem, wqe->rqe.num_sge); + break; + + case SIW_OP_READ_RESPONSE: + siw_unref_mem_sgl(wqe->mem, 1); + break; + + default: + /* + * SIW_OP_INVAL_STAG and SIW_OP_REG_MR + * do not hold memory references + */ + break; + } +} + +int siw_invalidate_stag(struct ib_pd *pd, u32 stag) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mem *mem = siw_mem_id2obj(sdev, stag >> 8); + int rv = 0; + + if (unlikely(!mem)) { + siw_dbg_pd(pd, "STag 0x%08x unknown\n", stag); + return -EINVAL; + } + if (unlikely(mem->pd != pd)) { + siw_dbg_pd(pd, "PD mismatch for STag 0x%08x\n", stag); + rv = -EACCES; + goto out; + } + /* + * Per RDMA verbs definition, an STag may already be in invalid + * state if invalidation is requested. So no state check here. + */ + mem->stag_valid = 0; + + siw_dbg_pd(pd, "STag 0x%08x now invalid\n", stag); +out: + siw_mem_put(mem); + return rv; +} + +/* + * Gets physical address backed by PBL element. Address is referenced + * by linear byte offset into list of variably sized PB elements. + * Optionally, provides remaining len within current element, and + * current PBL index for later resume at same element. + */ +u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx) +{ + int i = idx ? *idx : 0; + + while (i < pbl->num_buf) { + struct siw_pble *pble = &pbl->pbe[i]; + + if (pble->pbl_off + pble->size > off) { + u64 pble_off = off - pble->pbl_off; + + if (len) + *len = pble->size - pble_off; + if (idx) + *idx = i; + + return pble->addr + pble_off; + } + i++; + } + if (len) + *len = 0; + return 0; +} + +struct siw_pbl *siw_pbl_alloc(u32 num_buf) +{ + struct siw_pbl *pbl; + int buf_size = sizeof(*pbl); + + if (num_buf == 0) + return ERR_PTR(-EINVAL); + + buf_size += ((num_buf - 1) * sizeof(struct siw_pble)); + + pbl = kzalloc(buf_size, GFP_KERNEL); + if (!pbl) + return ERR_PTR(-ENOMEM); + + pbl->max_buf = num_buf; + + return pbl; +} + +struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) +{ + struct siw_umem *umem; + struct mm_struct *mm_s; + u64 first_page_va; + unsigned long mlock_limit; + unsigned int foll_flags = FOLL_WRITE; + int num_pages, num_chunks, i, rv = 0; + + if (!can_do_mlock()) + return ERR_PTR(-EPERM); + + if (!len) + return ERR_PTR(-EINVAL); + + first_page_va = start & PAGE_MASK; + num_pages = PAGE_ALIGN(start + len - first_page_va) >> PAGE_SHIFT; + num_chunks = (num_pages >> CHUNK_SHIFT) + 1; + + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + + mm_s = current->mm; + umem->owning_mm = mm_s; + umem->writable = writable; + + mmgrab(mm_s); + + if (!writable) + foll_flags |= FOLL_FORCE; + + down_read(&mm_s->mmap_sem); + + mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) { + rv = -ENOMEM; + goto out_sem_up; + } + umem->fp_addr = first_page_va; + + umem->page_chunk = + kcalloc(num_chunks, sizeof(struct siw_page_chunk), GFP_KERNEL); + if (!umem->page_chunk) { + rv = -ENOMEM; + goto out_sem_up; + } + for (i = 0; num_pages; i++) { + int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK); + + umem->page_chunk[i].plist = + kcalloc(nents, sizeof(struct page *), GFP_KERNEL); + if (!umem->page_chunk[i].plist) { + rv = -ENOMEM; + goto out_sem_up; + } + got = 0; + while (nents) { + struct page **plist = &umem->page_chunk[i].plist[got]; + + rv = get_user_pages(first_page_va, nents, + foll_flags | FOLL_LONGTERM, + plist, NULL); + if (rv < 0) + goto out_sem_up; + + umem->num_pages += rv; + atomic64_add(rv, &mm_s->pinned_vm); + first_page_va += rv * PAGE_SIZE; + nents -= rv; + got += rv; + } + num_pages -= got; + } +out_sem_up: + up_read(&mm_s->mmap_sem); + + if (rv > 0) + return umem; + + siw_umem_release(umem, false); + + return ERR_PTR(rv); +} diff --git a/drivers/infiniband/sw/siw/siw_mem.h b/drivers/infiniband/sw/siw/siw_mem.h new file mode 100644 index 000000000000..f43daf280891 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_mem.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_MEM_H +#define _SIW_MEM_H + +struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable); +void siw_umem_release(struct siw_umem *umem, bool dirty); +struct siw_pbl *siw_pbl_alloc(u32 num_buf); +u64 siw_pbl_get_buffer(struct siw_pbl *pbl, u64 off, int *len, int *idx); +struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index); +int siw_mem_add(struct siw_device *sdev, struct siw_mem *m); +int siw_invalidate_stag(struct ib_pd *pd, u32 stag); +int siw_check_mem(struct ib_pd *pd, struct siw_mem *mem, u64 addr, + enum ib_access_flags perms, int len); +int siw_check_sge(struct ib_pd *pd, struct siw_sge *sge, + struct siw_mem *mem[], enum ib_access_flags perms, + u32 off, int len); +void siw_wqe_put_mem(struct siw_wqe *wqe, enum siw_opcode op); +int siw_mr_add_mem(struct siw_mr *mr, struct ib_pd *pd, void *mem_obj, + u64 start, u64 len, int rights); +void siw_mr_drop_mem(struct siw_mr *mr); +void siw_free_mem(struct kref *ref); + +static inline void siw_mem_put(struct siw_mem *mem) +{ + kref_put(&mem->ref, siw_free_mem); +} + +static inline struct siw_mr *siw_mem2mr(struct siw_mem *m) +{ + return container_of(m, struct siw_mr, mem); +} + +static inline void siw_unref_mem_sgl(struct siw_mem **mem, unsigned int num_sge) +{ + while (num_sge) { + if (*mem == NULL) + break; + + siw_mem_put(*mem); + *mem = NULL; + mem++; + num_sge--; + } +} + +#define CHUNK_SHIFT 9 /* sets number of pages per chunk */ +#define PAGES_PER_CHUNK (_AC(1, UL) << CHUNK_SHIFT) +#define CHUNK_MASK (~(PAGES_PER_CHUNK - 1)) +#define PAGE_CHUNK_SIZE (PAGES_PER_CHUNK * sizeof(struct page *)) + +/* + * siw_get_upage() + * + * Get page pointer for address on given umem. + * + * @umem: two dimensional list of page pointers + * @addr: user virtual address + */ +static inline struct page *siw_get_upage(struct siw_umem *umem, u64 addr) +{ + unsigned int page_idx = (addr - umem->fp_addr) >> PAGE_SHIFT, + chunk_idx = page_idx >> CHUNK_SHIFT, + page_in_chunk = page_idx & ~CHUNK_MASK; + + if (likely(page_idx < umem->num_pages)) + return umem->page_chunk[chunk_idx].plist[page_in_chunk]; + + return NULL; +} +#endif diff --git a/drivers/infiniband/sw/siw/siw_qp.c b/drivers/infiniband/sw/siw/siw_qp.c new file mode 100644 index 000000000000..11383d9f95ef --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_qp.c @@ -0,0 +1,1322 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/net.h> +#include <linux/scatterlist.h> +#include <linux/llist.h> +#include <asm/barrier.h> +#include <net/tcp.h> + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = { + [SIW_QP_STATE_IDLE] = "IDLE", + [SIW_QP_STATE_RTR] = "RTR", + [SIW_QP_STATE_RTS] = "RTS", + [SIW_QP_STATE_CLOSING] = "CLOSING", + [SIW_QP_STATE_TERMINATE] = "TERMINATE", + [SIW_QP_STATE_ERROR] = "ERROR" +}; + +/* + * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a + * per-RDMAP message basis. Please keep order of initializer. All MPA len + * is initialized to minimum packet size. + */ +struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = { + { /* RDMAP_RDMA_WRITE */ + .hdr_len = sizeof(struct iwarp_rdma_write), + .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST | + cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_RDMA_WRITE), + .rx_data = siw_proc_write }, + { /* RDMAP_RDMA_READ_REQ */ + .hdr_len = sizeof(struct iwarp_rdma_rreq), + .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_RDMA_READ_REQ), + .rx_data = siw_proc_rreq }, + { /* RDMAP_RDMA_READ_RESP */ + .hdr_len = sizeof(struct iwarp_rdma_rresp), + .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST | + cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_RDMA_READ_RESP), + .rx_data = siw_proc_rresp }, + { /* RDMAP_SEND */ + .hdr_len = sizeof(struct iwarp_send), + .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_SEND), + .rx_data = siw_proc_send }, + { /* RDMAP_SEND_INVAL */ + .hdr_len = sizeof(struct iwarp_send_inv), + .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_SEND_INVAL), + .rx_data = siw_proc_send }, + { /* RDMAP_SEND_SE */ + .hdr_len = sizeof(struct iwarp_send), + .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_SEND_SE), + .rx_data = siw_proc_send }, + { /* RDMAP_SEND_SE_INVAL */ + .hdr_len = sizeof(struct iwarp_send_inv), + .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_SEND_SE_INVAL), + .rx_data = siw_proc_send }, + { /* RDMAP_TERMINATE */ + .hdr_len = sizeof(struct iwarp_terminate), + .ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2), + .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) | + cpu_to_be16(RDMAP_VERSION << 6) | + cpu_to_be16(RDMAP_TERMINATE), + .rx_data = siw_proc_terminate } +}; + +void siw_qp_llp_data_ready(struct sock *sk) +{ + struct siw_qp *qp; + + read_lock(&sk->sk_callback_lock); + + if (unlikely(!sk->sk_user_data || !sk_to_qp(sk))) + goto done; + + qp = sk_to_qp(sk); + + if (likely(!qp->rx_stream.rx_suspend && + down_read_trylock(&qp->state_lock))) { + read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 }; + + if (likely(qp->attrs.state == SIW_QP_STATE_RTS)) + /* + * Implements data receive operation during + * socket callback. TCP gracefully catches + * the case where there is nothing to receive + * (not calling siw_tcp_rx_data() then). + */ + tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); + + up_read(&qp->state_lock); + } else { + siw_dbg_qp(qp, "unable to process RX, suspend: %d\n", + qp->rx_stream.rx_suspend); + } +done: + read_unlock(&sk->sk_callback_lock); +} + +void siw_qp_llp_close(struct siw_qp *qp) +{ + siw_dbg_qp(qp, "enter llp close, state = %s\n", + siw_qp_state_to_string[qp->attrs.state]); + + down_write(&qp->state_lock); + + qp->rx_stream.rx_suspend = 1; + qp->tx_ctx.tx_suspend = 1; + qp->attrs.sk = NULL; + + switch (qp->attrs.state) { + case SIW_QP_STATE_RTS: + case SIW_QP_STATE_RTR: + case SIW_QP_STATE_IDLE: + case SIW_QP_STATE_TERMINATE: + qp->attrs.state = SIW_QP_STATE_ERROR; + break; + /* + * SIW_QP_STATE_CLOSING: + * + * This is a forced close. shall the QP be moved to + * ERROR or IDLE ? + */ + case SIW_QP_STATE_CLOSING: + if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) + qp->attrs.state = SIW_QP_STATE_ERROR; + else + qp->attrs.state = SIW_QP_STATE_IDLE; + break; + + default: + siw_dbg_qp(qp, "llp close: no state transition needed: %s\n", + siw_qp_state_to_string[qp->attrs.state]); + break; + } + siw_sq_flush(qp); + siw_rq_flush(qp); + + /* + * Dereference closing CEP + */ + if (qp->cep) { + siw_cep_put(qp->cep); + qp->cep = NULL; + } + + up_write(&qp->state_lock); + + siw_dbg_qp(qp, "llp close exit: state %s\n", + siw_qp_state_to_string[qp->attrs.state]); +} + +/* + * socket callback routine informing about newly available send space. + * Function schedules SQ work for processing SQ items. + */ +void siw_qp_llp_write_space(struct sock *sk) +{ + struct siw_cep *cep = sk_to_cep(sk); + + cep->sk_write_space(sk); + + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + (void)siw_sq_start(cep->qp); +} + +static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size) +{ + irq_size = roundup_pow_of_two(irq_size); + orq_size = roundup_pow_of_two(orq_size); + + qp->attrs.irq_size = irq_size; + qp->attrs.orq_size = orq_size; + + qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe)); + if (!qp->irq) { + siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size); + qp->attrs.irq_size = 0; + return -ENOMEM; + } + qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe)); + if (!qp->orq) { + siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size); + qp->attrs.orq_size = 0; + qp->attrs.irq_size = 0; + vfree(qp->irq); + return -ENOMEM; + } + siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size); + return 0; +} + +static int siw_qp_enable_crc(struct siw_qp *qp) +{ + struct siw_rx_stream *c_rx = &qp->rx_stream; + struct siw_iwarp_tx *c_tx = &qp->tx_ctx; + int size = crypto_shash_descsize(siw_crypto_shash) + + sizeof(struct shash_desc); + + if (siw_crypto_shash == NULL) + return -ENOENT; + + c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL); + c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL); + if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) { + kfree(c_tx->mpa_crc_hd); + kfree(c_rx->mpa_crc_hd); + c_tx->mpa_crc_hd = NULL; + c_rx->mpa_crc_hd = NULL; + return -ENOMEM; + } + c_tx->mpa_crc_hd->tfm = siw_crypto_shash; + c_rx->mpa_crc_hd->tfm = siw_crypto_shash; + + return 0; +} + +/* + * Send a non signalled READ or WRITE to peer side as negotiated + * with MPAv2 P2P setup protocol. The work request is only created + * as a current active WR and does not consume Send Queue space. + * + * Caller must hold QP state lock. + */ +int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl) +{ + struct siw_wqe *wqe = tx_wqe(qp); + unsigned long flags; + int rv = 0; + + spin_lock_irqsave(&qp->sq_lock, flags); + + if (unlikely(wqe->wr_status != SIW_WR_IDLE)) { + spin_unlock_irqrestore(&qp->sq_lock, flags); + return -EIO; + } + memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE); + + wqe->wr_status = SIW_WR_QUEUED; + wqe->sqe.flags = 0; + wqe->sqe.num_sge = 1; + wqe->sqe.sge[0].length = 0; + wqe->sqe.sge[0].laddr = 0; + wqe->sqe.sge[0].lkey = 0; + /* + * While it must not be checked for inbound zero length + * READ/WRITE, some HW may treat STag 0 special. + */ + wqe->sqe.rkey = 1; + wqe->sqe.raddr = 0; + wqe->processed = 0; + + if (ctrl & MPA_V2_RDMA_WRITE_RTR) + wqe->sqe.opcode = SIW_OP_WRITE; + else if (ctrl & MPA_V2_RDMA_READ_RTR) { + struct siw_sqe *rreq; + + wqe->sqe.opcode = SIW_OP_READ; + + spin_lock(&qp->orq_lock); + + rreq = orq_get_free(qp); + if (rreq) { + siw_read_to_orq(rreq, &wqe->sqe); + qp->orq_put++; + } else + rv = -EIO; + + spin_unlock(&qp->orq_lock); + } else + rv = -EINVAL; + + if (rv) + wqe->wr_status = SIW_WR_IDLE; + + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (!rv) + rv = siw_sq_start(qp); + + return rv; +} + +/* + * Map memory access error to DDP tagged error + */ +enum ddp_ecode siw_tagged_error(enum siw_access_state state) +{ + switch (state) { + case E_STAG_INVALID: + return DDP_ECODE_T_INVALID_STAG; + case E_BASE_BOUNDS: + return DDP_ECODE_T_BASE_BOUNDS; + case E_PD_MISMATCH: + return DDP_ECODE_T_STAG_NOT_ASSOC; + case E_ACCESS_PERM: + /* + * RFC 5041 (DDP) lacks an ecode for insufficient access + * permissions. 'Invalid STag' seem to be the closest + * match though. + */ + return DDP_ECODE_T_INVALID_STAG; + default: + WARN_ON(1); + return DDP_ECODE_T_INVALID_STAG; + } +} + +/* + * Map memory access error to RDMAP protection error + */ +enum rdmap_ecode siw_rdmap_error(enum siw_access_state state) +{ + switch (state) { + case E_STAG_INVALID: + return RDMAP_ECODE_INVALID_STAG; + case E_BASE_BOUNDS: + return RDMAP_ECODE_BASE_BOUNDS; + case E_PD_MISMATCH: + return RDMAP_ECODE_STAG_NOT_ASSOC; + case E_ACCESS_PERM: + return RDMAP_ECODE_ACCESS_RIGHTS; + default: + return RDMAP_ECODE_UNSPECIFIED; + } +} + +void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, u8 etype, + u8 ecode, int in_tx) +{ + if (!qp->term_info.valid) { + memset(&qp->term_info, 0, sizeof(qp->term_info)); + qp->term_info.layer = layer; + qp->term_info.etype = etype; + qp->term_info.ecode = ecode; + qp->term_info.in_tx = in_tx; + qp->term_info.valid = 1; + } + siw_dbg_qp(qp, "init TERM: layer %d, type %d, code %d, in tx %s\n", + layer, etype, ecode, in_tx ? "yes" : "no"); +} + +/* + * Send a TERMINATE message, as defined in RFC's 5040/5041/5044/6581. + * Sending TERMINATE messages is best effort - such messages + * can only be send if the QP is still connected and it does + * not have another outbound message in-progress, i.e. the + * TERMINATE message must not interfer with an incomplete current + * transmit operation. + */ +void siw_send_terminate(struct siw_qp *qp) +{ + struct kvec iov[3]; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + struct iwarp_terminate *term = NULL; + union iwarp_hdr *err_hdr = NULL; + struct socket *s = qp->attrs.sk; + struct siw_rx_stream *srx = &qp->rx_stream; + union iwarp_hdr *rx_hdr = &srx->hdr; + u32 crc = 0; + int num_frags, len_terminate, rv; + + if (!qp->term_info.valid) + return; + + qp->term_info.valid = 0; + + if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) { + siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n", + tx_type(tx_wqe(qp))); + return; + } + if (!s && qp->cep) + /* QP not yet in RTS. Take socket from connection end point */ + s = qp->cep->sock; + + if (!s) { + siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n"); + return; + } + + term = kzalloc(sizeof(*term), GFP_KERNEL); + if (!term) + return; + + term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE); + term->ddp_mo = 0; + term->ddp_msn = cpu_to_be32(1); + + iov[0].iov_base = term; + iov[0].iov_len = sizeof(*term); + + if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) || + ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) && + (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) { + err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL); + if (!err_hdr) { + kfree(term); + return; + } + } + memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl, + sizeof(struct iwarp_ctrl)); + + __rdmap_term_set_layer(term, qp->term_info.layer); + __rdmap_term_set_etype(term, qp->term_info.etype); + __rdmap_term_set_ecode(term, qp->term_info.ecode); + + switch (qp->term_info.layer) { + case TERM_ERROR_LAYER_RDMAP: + if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC) + /* No additional DDP/RDMAP header to be included */ + break; + + if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) { + /* + * Complete RDMAP frame will get attached, and + * DDP segment length is valid + */ + term->flag_m = 1; + term->flag_d = 1; + term->flag_r = 1; + + if (qp->term_info.in_tx) { + struct iwarp_rdma_rreq *rreq; + struct siw_wqe *wqe = tx_wqe(qp); + + /* Inbound RREQ error, detected during + * RRESP creation. Take state from + * current TX work queue element to + * reconstruct peers RREQ. + */ + rreq = (struct iwarp_rdma_rreq *)err_hdr; + + memcpy(&rreq->ctrl, + &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl, + sizeof(struct iwarp_ctrl)); + + rreq->rsvd = 0; + rreq->ddp_qn = + htonl(RDMAP_UNTAGGED_QN_RDMA_READ); + + /* Provide RREQ's MSN as kept aside */ + rreq->ddp_msn = htonl(wqe->sqe.sge[0].length); + + rreq->ddp_mo = htonl(wqe->processed); + rreq->sink_stag = htonl(wqe->sqe.rkey); + rreq->sink_to = cpu_to_be64(wqe->sqe.raddr); + rreq->read_size = htonl(wqe->sqe.sge[0].length); + rreq->source_stag = htonl(wqe->sqe.sge[0].lkey); + rreq->source_to = + cpu_to_be64(wqe->sqe.sge[0].laddr); + + iov[1].iov_base = rreq; + iov[1].iov_len = sizeof(*rreq); + + rx_hdr = (union iwarp_hdr *)rreq; + } else { + /* Take RDMAP/DDP information from + * current (failed) inbound frame. + */ + iov[1].iov_base = rx_hdr; + + if (__rdmap_get_opcode(&rx_hdr->ctrl) == + RDMAP_RDMA_READ_REQ) + iov[1].iov_len = + sizeof(struct iwarp_rdma_rreq); + else /* SEND type */ + iov[1].iov_len = + sizeof(struct iwarp_send); + } + } else { + /* Do not report DDP hdr information if packet + * layout is unknown + */ + if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) || + (qp->term_info.ecode == RDMAP_ECODE_OPCODE)) + break; + + iov[1].iov_base = rx_hdr; + + /* Only DDP frame will get attached */ + if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED) + iov[1].iov_len = + sizeof(struct iwarp_rdma_write); + else + iov[1].iov_len = sizeof(struct iwarp_send); + + term->flag_m = 1; + term->flag_d = 1; + } + term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len); + break; + + case TERM_ERROR_LAYER_DDP: + /* Report error encountered while DDP processing. + * This can only happen as a result of inbound + * DDP processing + */ + + /* Do not report DDP hdr information if packet + * layout is unknown + */ + if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) && + (qp->term_info.ecode == DDP_ECODE_T_VERSION)) || + ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) && + (qp->term_info.ecode == DDP_ECODE_UT_VERSION))) + break; + + iov[1].iov_base = rx_hdr; + + if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED) + iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged); + else + iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged); + + term->flag_m = 1; + term->flag_d = 1; + break; + + default: + break; + } + if (term->flag_m || term->flag_d || term->flag_r) { + iov[2].iov_base = &crc; + iov[2].iov_len = sizeof(crc); + len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE; + num_frags = 3; + } else { + iov[1].iov_base = &crc; + iov[1].iov_len = sizeof(crc); + len_terminate = sizeof(*term) + MPA_CRC_SIZE; + num_frags = 2; + } + + /* Adjust DDP Segment Length parameter, if valid */ + if (term->flag_m) { + u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len); + enum rdma_opcode op = __rdmap_get_opcode(&rx_hdr->ctrl); + + real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE; + rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len); + } + + term->ctrl.mpa_len = + cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE)); + if (qp->tx_ctx.mpa_crc_hd) { + crypto_shash_init(qp->tx_ctx.mpa_crc_hd); + if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd, + (u8 *)iov[0].iov_base, + iov[0].iov_len)) + goto out; + + if (num_frags == 3) { + if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd, + (u8 *)iov[1].iov_base, + iov[1].iov_len)) + goto out; + } + crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc); + } + + rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate); + siw_dbg_qp(qp, "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n", + rv == len_terminate ? "success" : "failure", + __rdmap_term_layer(term), __rdmap_term_etype(term), + __rdmap_term_ecode(term), rv); +out: + kfree(term); + kfree(err_hdr); +} + +/* + * Handle all attrs other than state + */ +static void siw_qp_modify_nonstate(struct siw_qp *qp, + struct siw_qp_attrs *attrs, + enum siw_qp_attr_mask mask) +{ + if (mask & SIW_QP_ATTR_ACCESS_FLAGS) { + if (attrs->flags & SIW_RDMA_BIND_ENABLED) + qp->attrs.flags |= SIW_RDMA_BIND_ENABLED; + else + qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED; + + if (attrs->flags & SIW_RDMA_WRITE_ENABLED) + qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED; + else + qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED; + + if (attrs->flags & SIW_RDMA_READ_ENABLED) + qp->attrs.flags |= SIW_RDMA_READ_ENABLED; + else + qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED; + } +} + +static int siw_qp_nextstate_from_idle(struct siw_qp *qp, + struct siw_qp_attrs *attrs, + enum siw_qp_attr_mask mask) +{ + int rv = 0; + + switch (attrs->state) { + case SIW_QP_STATE_RTS: + if (attrs->flags & SIW_MPA_CRC) { + rv = siw_qp_enable_crc(qp); + if (rv) + break; + } + if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) { + siw_dbg_qp(qp, "no socket\n"); + rv = -EINVAL; + break; + } + if (!(mask & SIW_QP_ATTR_MPA)) { + siw_dbg_qp(qp, "no MPA\n"); + rv = -EINVAL; + break; + } + /* + * Initialize iWARP TX state + */ + qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0; + qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0; + qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0; + + /* + * Initialize iWARP RX state + */ + qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1; + qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1; + qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1; + + /* + * init IRD free queue, caller has already checked + * limits. + */ + rv = siw_qp_readq_init(qp, attrs->irq_size, + attrs->orq_size); + if (rv) + break; + + qp->attrs.sk = attrs->sk; + qp->attrs.state = SIW_QP_STATE_RTS; + + siw_dbg_qp(qp, "enter RTS: crc=%s, ord=%u, ird=%u\n", + attrs->flags & SIW_MPA_CRC ? "y" : "n", + qp->attrs.orq_size, qp->attrs.irq_size); + break; + + case SIW_QP_STATE_ERROR: + siw_rq_flush(qp); + qp->attrs.state = SIW_QP_STATE_ERROR; + if (qp->cep) { + siw_cep_put(qp->cep); + qp->cep = NULL; + } + break; + + default: + break; + } + return rv; +} + +static int siw_qp_nextstate_from_rts(struct siw_qp *qp, + struct siw_qp_attrs *attrs) +{ + int drop_conn = 0; + + switch (attrs->state) { + case SIW_QP_STATE_CLOSING: + /* + * Verbs: move to IDLE if SQ and ORQ are empty. + * Move to ERROR otherwise. But first of all we must + * close the connection. So we keep CLOSING or ERROR + * as a transient state, schedule connection drop work + * and wait for the socket state change upcall to + * come back closed. + */ + if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) { + qp->attrs.state = SIW_QP_STATE_CLOSING; + } else { + qp->attrs.state = SIW_QP_STATE_ERROR; + siw_sq_flush(qp); + } + siw_rq_flush(qp); + + drop_conn = 1; + break; + + case SIW_QP_STATE_TERMINATE: + qp->attrs.state = SIW_QP_STATE_TERMINATE; + + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_CATASTROPHIC, + RDMAP_ECODE_UNSPECIFIED, 1); + drop_conn = 1; + break; + + case SIW_QP_STATE_ERROR: + /* + * This is an emergency close. + * + * Any in progress transmit operation will get + * cancelled. + * This will likely result in a protocol failure, + * if a TX operation is in transit. The caller + * could unconditional wait to give the current + * operation a chance to complete. + * Esp., how to handle the non-empty IRQ case? + * The peer was asking for data transfer at a valid + * point in time. + */ + siw_sq_flush(qp); + siw_rq_flush(qp); + qp->attrs.state = SIW_QP_STATE_ERROR; + drop_conn = 1; + break; + + default: + break; + } + return drop_conn; +} + +static void siw_qp_nextstate_from_term(struct siw_qp *qp, + struct siw_qp_attrs *attrs) +{ + switch (attrs->state) { + case SIW_QP_STATE_ERROR: + siw_rq_flush(qp); + qp->attrs.state = SIW_QP_STATE_ERROR; + + if (tx_wqe(qp)->wr_status != SIW_WR_IDLE) + siw_sq_flush(qp); + break; + + default: + break; + } +} + +static int siw_qp_nextstate_from_close(struct siw_qp *qp, + struct siw_qp_attrs *attrs) +{ + int rv = 0; + + switch (attrs->state) { + case SIW_QP_STATE_IDLE: + WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE); + qp->attrs.state = SIW_QP_STATE_IDLE; + break; + + case SIW_QP_STATE_CLOSING: + /* + * The LLP may already moved the QP to closing + * due to graceful peer close init + */ + break; + + case SIW_QP_STATE_ERROR: + /* + * QP was moved to CLOSING by LLP event + * not yet seen by user. + */ + qp->attrs.state = SIW_QP_STATE_ERROR; + + if (tx_wqe(qp)->wr_status != SIW_WR_IDLE) + siw_sq_flush(qp); + + siw_rq_flush(qp); + break; + + default: + siw_dbg_qp(qp, "state transition undefined: %s => %s\n", + siw_qp_state_to_string[qp->attrs.state], + siw_qp_state_to_string[attrs->state]); + + rv = -ECONNABORTED; + } + return rv; +} + +/* + * Caller must hold qp->state_lock + */ +int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs, + enum siw_qp_attr_mask mask) +{ + int drop_conn = 0, rv = 0; + + if (!mask) + return 0; + + siw_dbg_qp(qp, "state: %s => %s\n", + siw_qp_state_to_string[qp->attrs.state], + siw_qp_state_to_string[attrs->state]); + + if (mask != SIW_QP_ATTR_STATE) + siw_qp_modify_nonstate(qp, attrs, mask); + + if (!(mask & SIW_QP_ATTR_STATE)) + return 0; + + switch (qp->attrs.state) { + case SIW_QP_STATE_IDLE: + case SIW_QP_STATE_RTR: + rv = siw_qp_nextstate_from_idle(qp, attrs, mask); + break; + + case SIW_QP_STATE_RTS: + drop_conn = siw_qp_nextstate_from_rts(qp, attrs); + break; + + case SIW_QP_STATE_TERMINATE: + siw_qp_nextstate_from_term(qp, attrs); + break; + + case SIW_QP_STATE_CLOSING: + siw_qp_nextstate_from_close(qp, attrs); + break; + default: + break; + } + if (drop_conn) + siw_qp_cm_drop(qp, 0); + + return rv; +} + +void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe) +{ + rreq->id = sqe->id; + rreq->opcode = sqe->opcode; + rreq->sge[0].laddr = sqe->sge[0].laddr; + rreq->sge[0].length = sqe->sge[0].length; + rreq->sge[0].lkey = sqe->sge[0].lkey; + rreq->sge[1].lkey = sqe->sge[1].lkey; + rreq->flags = sqe->flags | SIW_WQE_VALID; + rreq->num_sge = 1; +} + +/* + * Must be called with SQ locked. + * To avoid complete SQ starvation by constant inbound READ requests, + * the active IRQ will not be served after qp->irq_burst, if the + * SQ has pending work. + */ +int siw_activate_tx(struct siw_qp *qp) +{ + struct siw_sqe *irqe, *sqe; + struct siw_wqe *wqe = tx_wqe(qp); + int rv = 1; + + irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size]; + + if (irqe->flags & SIW_WQE_VALID) { + sqe = sq_get_next(qp); + + /* + * Avoid local WQE processing starvation in case + * of constant inbound READ request stream + */ + if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) { + qp->irq_burst = 0; + goto skip_irq; + } + memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE); + wqe->wr_status = SIW_WR_QUEUED; + + /* start READ RESPONSE */ + wqe->sqe.opcode = SIW_OP_READ_RESPONSE; + wqe->sqe.flags = 0; + if (irqe->num_sge) { + wqe->sqe.num_sge = 1; + wqe->sqe.sge[0].length = irqe->sge[0].length; + wqe->sqe.sge[0].laddr = irqe->sge[0].laddr; + wqe->sqe.sge[0].lkey = irqe->sge[0].lkey; + } else { + wqe->sqe.num_sge = 0; + } + + /* Retain original RREQ's message sequence number for + * potential error reporting cases. + */ + wqe->sqe.sge[1].length = irqe->sge[1].length; + + wqe->sqe.rkey = irqe->rkey; + wqe->sqe.raddr = irqe->raddr; + + wqe->processed = 0; + qp->irq_get++; + + /* mark current IRQ entry free */ + smp_store_mb(irqe->flags, 0); + + goto out; + } + sqe = sq_get_next(qp); + if (sqe) { +skip_irq: + memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE); + wqe->wr_status = SIW_WR_QUEUED; + + /* First copy SQE to kernel private memory */ + memcpy(&wqe->sqe, sqe, sizeof(*sqe)); + + if (wqe->sqe.opcode >= SIW_NUM_OPCODES) { + rv = -EINVAL; + goto out; + } + if (wqe->sqe.flags & SIW_WQE_INLINE) { + if (wqe->sqe.opcode != SIW_OP_SEND && + wqe->sqe.opcode != SIW_OP_WRITE) { + rv = -EINVAL; + goto out; + } + if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) { + rv = -EINVAL; + goto out; + } + wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1]; + wqe->sqe.sge[0].lkey = 0; + wqe->sqe.num_sge = 1; + } + if (wqe->sqe.flags & SIW_WQE_READ_FENCE) { + /* A READ cannot be fenced */ + if (unlikely(wqe->sqe.opcode == SIW_OP_READ || + wqe->sqe.opcode == + SIW_OP_READ_LOCAL_INV)) { + siw_dbg_qp(qp, "cannot fence read\n"); + rv = -EINVAL; + goto out; + } + spin_lock(&qp->orq_lock); + + if (!siw_orq_empty(qp)) { + qp->tx_ctx.orq_fence = 1; + rv = 0; + } + spin_unlock(&qp->orq_lock); + + } else if (wqe->sqe.opcode == SIW_OP_READ || + wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) { + struct siw_sqe *rreq; + + wqe->sqe.num_sge = 1; + + spin_lock(&qp->orq_lock); + + rreq = orq_get_free(qp); + if (rreq) { + /* + * Make an immediate copy in ORQ to be ready + * to process loopback READ reply + */ + siw_read_to_orq(rreq, &wqe->sqe); + qp->orq_put++; + } else { + qp->tx_ctx.orq_fence = 1; + rv = 0; + } + spin_unlock(&qp->orq_lock); + } + + /* Clear SQE, can be re-used by application */ + smp_store_mb(sqe->flags, 0); + qp->sq_get++; + } else { + rv = 0; + } +out: + if (unlikely(rv < 0)) { + siw_dbg_qp(qp, "error %d\n", rv); + wqe->wr_status = SIW_WR_IDLE; + } + return rv; +} + +/* + * Check if current CQ state qualifies for calling CQ completion + * handler. Must be called with CQ lock held. + */ +static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags) +{ + u64 cq_notify; + + if (!cq->base_cq.comp_handler) + return false; + + cq_notify = READ_ONCE(*cq->notify); + + if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) || + ((cq_notify & SIW_NOTIFY_SOLICITED) && + (flags & SIW_WQE_SOLICITED))) { + /* dis-arm CQ */ + smp_store_mb(*cq->notify, SIW_NOTIFY_NOT); + + return true; + } + return false; +} + +int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes, + enum siw_wc_status status) +{ + struct siw_cq *cq = qp->scq; + int rv = 0; + + if (cq) { + u32 sqe_flags = sqe->flags; + struct siw_cqe *cqe; + u32 idx; + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + + idx = cq->cq_put % cq->num_cqe; + cqe = &cq->queue[idx]; + + if (!READ_ONCE(cqe->flags)) { + bool notify; + + cqe->id = sqe->id; + cqe->opcode = sqe->opcode; + cqe->status = status; + cqe->imm_data = 0; + cqe->bytes = bytes; + + if (cq->kernel_verbs) + cqe->base_qp = qp->ib_qp; + else + cqe->qp_id = qp_id(qp); + + /* mark CQE valid for application */ + WRITE_ONCE(cqe->flags, SIW_WQE_VALID); + /* recycle SQE */ + smp_store_mb(sqe->flags, 0); + + cq->cq_put++; + notify = siw_cq_notify_now(cq, sqe_flags); + + spin_unlock_irqrestore(&cq->lock, flags); + + if (notify) { + siw_dbg_cq(cq, "Call completion handler\n"); + cq->base_cq.comp_handler(&cq->base_cq, + cq->base_cq.cq_context); + } + } else { + spin_unlock_irqrestore(&cq->lock, flags); + rv = -ENOMEM; + siw_cq_event(cq, IB_EVENT_CQ_ERR); + } + } else { + /* recycle SQE */ + smp_store_mb(sqe->flags, 0); + } + return rv; +} + +int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes, + u32 inval_stag, enum siw_wc_status status) +{ + struct siw_cq *cq = qp->rcq; + int rv = 0; + + if (cq) { + struct siw_cqe *cqe; + u32 idx; + unsigned long flags; + + spin_lock_irqsave(&cq->lock, flags); + + idx = cq->cq_put % cq->num_cqe; + cqe = &cq->queue[idx]; + + if (!READ_ONCE(cqe->flags)) { + bool notify; + u8 cqe_flags = SIW_WQE_VALID; + + cqe->id = rqe->id; + cqe->opcode = SIW_OP_RECEIVE; + cqe->status = status; + cqe->imm_data = 0; + cqe->bytes = bytes; + + if (cq->kernel_verbs) { + cqe->base_qp = qp->ib_qp; + if (inval_stag) { + cqe_flags |= SIW_WQE_REM_INVAL; + cqe->inval_stag = inval_stag; + } + } else { + cqe->qp_id = qp_id(qp); + } + /* mark CQE valid for application */ + WRITE_ONCE(cqe->flags, cqe_flags); + /* recycle RQE */ + smp_store_mb(rqe->flags, 0); + + cq->cq_put++; + notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED); + + spin_unlock_irqrestore(&cq->lock, flags); + + if (notify) { + siw_dbg_cq(cq, "Call completion handler\n"); + cq->base_cq.comp_handler(&cq->base_cq, + cq->base_cq.cq_context); + } + } else { + spin_unlock_irqrestore(&cq->lock, flags); + rv = -ENOMEM; + siw_cq_event(cq, IB_EVENT_CQ_ERR); + } + } else { + /* recycle RQE */ + smp_store_mb(rqe->flags, 0); + } + return rv; +} + +/* + * siw_sq_flush() + * + * Flush SQ and ORRQ entries to CQ. + * + * Must be called with QP state write lock held. + * Therefore, SQ and ORQ lock must not be taken. + */ +void siw_sq_flush(struct siw_qp *qp) +{ + struct siw_sqe *sqe; + struct siw_wqe *wqe = tx_wqe(qp); + int async_event = 0; + + /* + * Start with completing any work currently on the ORQ + */ + while (qp->attrs.orq_size) { + sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size]; + if (!READ_ONCE(sqe->flags)) + break; + + if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0) + break; + + WRITE_ONCE(sqe->flags, 0); + qp->orq_get++; + } + /* + * Flush an in-progress WQE if present + */ + if (wqe->wr_status != SIW_WR_IDLE) { + siw_dbg_qp(qp, "flush current SQE, type %d, status %d\n", + tx_type(wqe), wqe->wr_status); + + siw_wqe_put_mem(wqe, tx_type(wqe)); + + if (tx_type(wqe) != SIW_OP_READ_RESPONSE && + ((tx_type(wqe) != SIW_OP_READ && + tx_type(wqe) != SIW_OP_READ_LOCAL_INV) || + wqe->wr_status == SIW_WR_QUEUED)) + /* + * An in-progress Read Request is already in + * the ORQ + */ + siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, + SIW_WC_WR_FLUSH_ERR); + + wqe->wr_status = SIW_WR_IDLE; + } + /* + * Flush the Send Queue + */ + while (qp->attrs.sq_size) { + sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size]; + if (!READ_ONCE(sqe->flags)) + break; + + async_event = 1; + if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0) + /* + * Shall IB_EVENT_SQ_DRAINED be supressed if work + * completion fails? + */ + break; + + WRITE_ONCE(sqe->flags, 0); + qp->sq_get++; + } + if (async_event) + siw_qp_event(qp, IB_EVENT_SQ_DRAINED); +} + +/* + * siw_rq_flush() + * + * Flush recv queue entries to CQ. Also + * takes care of pending active tagged and untagged + * inbound transfers, which have target memory + * referenced. + * + * Must be called with QP state write lock held. + * Therefore, RQ lock must not be taken. + */ +void siw_rq_flush(struct siw_qp *qp) +{ + struct siw_wqe *wqe = &qp->rx_untagged.wqe_active; + + /* + * Flush an in-progress untagged operation if present + */ + if (wqe->wr_status != SIW_WR_IDLE) { + siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n", + rx_type(wqe), wqe->wr_status); + + siw_wqe_put_mem(wqe, rx_type(wqe)); + + if (rx_type(wqe) == SIW_OP_RECEIVE) { + siw_rqe_complete(qp, &wqe->rqe, wqe->bytes, + 0, SIW_WC_WR_FLUSH_ERR); + } else if (rx_type(wqe) != SIW_OP_READ && + rx_type(wqe) != SIW_OP_READ_RESPONSE && + rx_type(wqe) != SIW_OP_WRITE) { + siw_sqe_complete(qp, &wqe->sqe, 0, SIW_WC_WR_FLUSH_ERR); + } + wqe->wr_status = SIW_WR_IDLE; + } + wqe = &qp->rx_tagged.wqe_active; + + if (wqe->wr_status != SIW_WR_IDLE) { + siw_wqe_put_mem(wqe, rx_type(wqe)); + wqe->wr_status = SIW_WR_IDLE; + } + /* + * Flush the Receive Queue + */ + while (qp->attrs.rq_size) { + struct siw_rqe *rqe = + &qp->recvq[qp->rq_get % qp->attrs.rq_size]; + + if (!READ_ONCE(rqe->flags)) + break; + + if (siw_rqe_complete(qp, rqe, 0, 0, SIW_WC_WR_FLUSH_ERR) != 0) + break; + + WRITE_ONCE(rqe->flags, 0); + qp->rq_get++; + } +} + +int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp) +{ + int rv = xa_alloc(&sdev->qp_xa, &qp->ib_qp->qp_num, qp, xa_limit_32b, + GFP_KERNEL); + + if (!rv) { + kref_init(&qp->ref); + qp->sdev = sdev; + qp->qp_num = qp->ib_qp->qp_num; + siw_dbg_qp(qp, "new QP\n"); + } + return rv; +} + +void siw_free_qp(struct kref *ref) +{ + struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref); + struct siw_device *sdev = qp->sdev; + unsigned long flags; + + if (qp->cep) + siw_cep_put(qp->cep); + + found = xa_erase(&sdev->qp_xa, qp_id(qp)); + WARN_ON(found != qp); + spin_lock_irqsave(&sdev->lock, flags); + list_del(&qp->devq); + spin_unlock_irqrestore(&sdev->lock, flags); + + vfree(qp->sendq); + vfree(qp->recvq); + vfree(qp->irq); + vfree(qp->orq); + + siw_put_tx_cpu(qp->tx_cpu); + + atomic_dec(&sdev->num_qp); + siw_dbg_qp(qp, "free QP\n"); + kfree_rcu(qp, rcu); +} diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c new file mode 100644 index 000000000000..f87657a11657 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_qp_rx.c @@ -0,0 +1,1458 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/net.h> +#include <linux/scatterlist.h> +#include <linux/highmem.h> + +#include <rdma/iw_cm.h> +#include <rdma/ib_verbs.h> + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +/* + * siw_rx_umem() + * + * Receive data of @len into target referenced by @dest_addr. + * + * @srx: Receive Context + * @umem: siw representation of target memory + * @dest_addr: user virtual address + * @len: number of bytes to place + */ +static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem, + u64 dest_addr, int len) +{ + int copied = 0; + + while (len) { + struct page *p; + int pg_off, bytes, rv; + void *dest; + + p = siw_get_upage(umem, dest_addr); + if (unlikely(!p)) { + pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n", + __func__, qp_id(rx_qp(srx)), + (void *)dest_addr, (void *)umem->fp_addr); + /* siw internal error */ + srx->skb_copied += copied; + srx->skb_new -= copied; + + return -EFAULT; + } + pg_off = dest_addr & ~PAGE_MASK; + bytes = min(len, (int)PAGE_SIZE - pg_off); + + siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes); + + dest = kmap_atomic(p); + rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off, + bytes); + + if (unlikely(rv)) { + kunmap_atomic(dest); + srx->skb_copied += copied; + srx->skb_new -= copied; + + pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n", + qp_id(rx_qp(srx)), __func__, len, p, rv); + + return -EFAULT; + } + if (srx->mpa_crc_hd) { + if (rx_qp(srx)->kernel_verbs) { + crypto_shash_update(srx->mpa_crc_hd, + (u8 *)(dest + pg_off), bytes); + kunmap_atomic(dest); + } else { + kunmap_atomic(dest); + /* + * Do CRC on original, not target buffer. + * Some user land applications may + * concurrently write the target buffer, + * which would yield a broken CRC. + * Walking the skb twice is very ineffcient. + * Folding the CRC into skb_copy_bits() + * would be much better, but is currently + * not supported. + */ + siw_crc_skb(srx, bytes); + } + } else { + kunmap_atomic(dest); + } + srx->skb_offset += bytes; + copied += bytes; + len -= bytes; + dest_addr += bytes; + pg_off = 0; + } + srx->skb_copied += copied; + srx->skb_new -= copied; + + return copied; +} + +static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len) +{ + int rv; + + siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len); + + rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len); + if (unlikely(rv)) { + pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n", + qp_id(rx_qp(srx)), __func__, len, kva, rv); + + return rv; + } + if (srx->mpa_crc_hd) + crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len); + + srx->skb_offset += len; + srx->skb_copied += len; + srx->skb_new -= len; + + return len; +} + +static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx, + struct siw_mem *mem, u64 addr, int len) +{ + struct siw_pbl *pbl = mem->pbl; + u64 offset = addr - mem->va; + int copied = 0; + + while (len) { + int bytes; + u64 buf_addr = + siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx); + if (!buf_addr) + break; + + bytes = min(bytes, len); + if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) { + copied += bytes; + offset += bytes; + len -= bytes; + } else { + break; + } + } + return copied; +} + +/* + * siw_rresp_check_ntoh() + * + * Check incoming RRESP fragment header against expected + * header values and update expected values for potential next + * fragment. + * + * NOTE: This function must be called only if a RRESP DDP segment + * starts but not for fragmented consecutive pieces of an + * already started DDP segment. + */ +static int siw_rresp_check_ntoh(struct siw_rx_stream *srx, + struct siw_rx_fpdu *frx) +{ + struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp; + struct siw_wqe *wqe = &frx->wqe_active; + enum ddp_ecode ecode; + + u32 sink_stag = be32_to_cpu(rresp->sink_stag); + u64 sink_to = be64_to_cpu(rresp->sink_to); + + if (frx->first_ddp_seg) { + srx->ddp_stag = wqe->sqe.sge[0].lkey; + srx->ddp_to = wqe->sqe.sge[0].laddr; + frx->pbl_idx = 0; + } + /* Below checks extend beyond the semantics of DDP, and + * into RDMAP: + * We check if the read response matches exactly the + * read request which was send to the remote peer to + * trigger this read response. RFC5040/5041 do not + * always have a proper error code for the detected + * error cases. We choose 'base or bounds error' for + * cases where the inbound STag is valid, but offset + * or length do not match our response receive state. + */ + if (unlikely(srx->ddp_stag != sink_stag)) { + pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n", + qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag); + ecode = DDP_ECODE_T_INVALID_STAG; + goto error; + } + if (unlikely(srx->ddp_to != sink_to)) { + pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n", + qp_id(rx_qp(srx)), (unsigned long long)sink_to, + (unsigned long long)srx->ddp_to); + ecode = DDP_ECODE_T_BASE_BOUNDS; + goto error; + } + if (unlikely(!frx->more_ddp_segs && + (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) { + pr_warn("siw: [QP %u]: rresp len: %d != %d\n", + qp_id(rx_qp(srx)), + wqe->processed + srx->fpdu_part_rem, wqe->bytes); + ecode = DDP_ECODE_T_BASE_BOUNDS; + goto error; + } + return 0; +error: + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, ecode, 0); + return -EINVAL; +} + +/* + * siw_write_check_ntoh() + * + * Check incoming WRITE fragment header against expected + * header values and update expected values for potential next + * fragment + * + * NOTE: This function must be called only if a WRITE DDP segment + * starts but not for fragmented consecutive pieces of an + * already started DDP segment. + */ +static int siw_write_check_ntoh(struct siw_rx_stream *srx, + struct siw_rx_fpdu *frx) +{ + struct iwarp_rdma_write *write = &srx->hdr.rwrite; + enum ddp_ecode ecode; + + u32 sink_stag = be32_to_cpu(write->sink_stag); + u64 sink_to = be64_to_cpu(write->sink_to); + + if (frx->first_ddp_seg) { + srx->ddp_stag = sink_stag; + srx->ddp_to = sink_to; + frx->pbl_idx = 0; + } else { + if (unlikely(srx->ddp_stag != sink_stag)) { + pr_warn("siw: [QP %u]: write stag: %08x != %08x\n", + qp_id(rx_qp(srx)), sink_stag, + srx->ddp_stag); + ecode = DDP_ECODE_T_INVALID_STAG; + goto error; + } + if (unlikely(srx->ddp_to != sink_to)) { + pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n", + qp_id(rx_qp(srx)), + (unsigned long long)sink_to, + (unsigned long long)srx->ddp_to); + ecode = DDP_ECODE_T_BASE_BOUNDS; + goto error; + } + } + return 0; +error: + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, ecode, 0); + return -EINVAL; +} + +/* + * siw_send_check_ntoh() + * + * Check incoming SEND fragment header against expected + * header values and update expected MSN if no next + * fragment expected + * + * NOTE: This function must be called only if a SEND DDP segment + * starts but not for fragmented consecutive pieces of an + * already started DDP segment. + */ +static int siw_send_check_ntoh(struct siw_rx_stream *srx, + struct siw_rx_fpdu *frx) +{ + struct iwarp_send_inv *send = &srx->hdr.send_inv; + struct siw_wqe *wqe = &frx->wqe_active; + enum ddp_ecode ecode; + + u32 ddp_msn = be32_to_cpu(send->ddp_msn); + u32 ddp_mo = be32_to_cpu(send->ddp_mo); + u32 ddp_qn = be32_to_cpu(send->ddp_qn); + + if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) { + pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n", + qp_id(rx_qp(srx)), ddp_qn); + ecode = DDP_ECODE_UT_INVALID_QN; + goto error; + } + if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) { + pr_warn("siw: [QP %u]: send msn: %u != %u\n", + qp_id(rx_qp(srx)), ddp_msn, + srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); + ecode = DDP_ECODE_UT_INVALID_MSN_RANGE; + goto error; + } + if (unlikely(ddp_mo != wqe->processed)) { + pr_warn("siw: [QP %u], send mo: %u != %u\n", + qp_id(rx_qp(srx)), ddp_mo, wqe->processed); + ecode = DDP_ECODE_UT_INVALID_MO; + goto error; + } + if (frx->first_ddp_seg) { + /* initialize user memory write position */ + frx->sge_idx = 0; + frx->sge_off = 0; + frx->pbl_idx = 0; + + /* only valid for SEND_INV and SEND_SE_INV operations */ + srx->inval_stag = be32_to_cpu(send->inval_stag); + } + if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) { + siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n", + wqe->bytes, wqe->processed, srx->fpdu_part_rem); + wqe->wc_status = SIW_WC_LOC_LEN_ERR; + ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF; + goto error; + } + return 0; +error: + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, + DDP_ETYPE_UNTAGGED_BUF, ecode, 0); + return -EINVAL; +} + +static struct siw_wqe *siw_rqe_get(struct siw_qp *qp) +{ + struct siw_rqe *rqe; + struct siw_srq *srq; + struct siw_wqe *wqe = NULL; + bool srq_event = false; + unsigned long flags; + + srq = qp->srq; + if (srq) { + spin_lock_irqsave(&srq->lock, flags); + if (unlikely(!srq->num_rqe)) + goto out; + + rqe = &srq->recvq[srq->rq_get % srq->num_rqe]; + } else { + if (unlikely(!qp->recvq)) + goto out; + + rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size]; + } + if (likely(rqe->flags == SIW_WQE_VALID)) { + int num_sge = rqe->num_sge; + + if (likely(num_sge <= SIW_MAX_SGE)) { + int i = 0; + + wqe = rx_wqe(&qp->rx_untagged); + rx_type(wqe) = SIW_OP_RECEIVE; + wqe->wr_status = SIW_WR_INPROGRESS; + wqe->bytes = 0; + wqe->processed = 0; + + wqe->rqe.id = rqe->id; + wqe->rqe.num_sge = num_sge; + + while (i < num_sge) { + wqe->rqe.sge[i].laddr = rqe->sge[i].laddr; + wqe->rqe.sge[i].lkey = rqe->sge[i].lkey; + wqe->rqe.sge[i].length = rqe->sge[i].length; + wqe->bytes += wqe->rqe.sge[i].length; + wqe->mem[i] = NULL; + i++; + } + /* can be re-used by appl */ + smp_store_mb(rqe->flags, 0); + } else { + siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge); + if (srq) + spin_unlock_irqrestore(&srq->lock, flags); + return NULL; + } + if (!srq) { + qp->rq_get++; + } else { + if (srq->armed) { + /* Test SRQ limit */ + u32 off = (srq->rq_get + srq->limit) % + srq->num_rqe; + struct siw_rqe *rqe2 = &srq->recvq[off]; + + if (!(rqe2->flags & SIW_WQE_VALID)) { + srq->armed = 0; + srq_event = true; + } + } + srq->rq_get++; + } + } +out: + if (srq) { + spin_unlock_irqrestore(&srq->lock, flags); + if (srq_event) + siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED); + } + return wqe; +} + +/* + * siw_proc_send: + * + * Process one incoming SEND and place data into memory referenced by + * receive wqe. + * + * Function supports partially received sends (suspending/resuming + * current receive wqe processing) + * + * return value: + * 0: reached the end of a DDP segment + * -EAGAIN: to be called again to finish the DDP segment + */ +int siw_proc_send(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct siw_rx_fpdu *frx = &qp->rx_untagged; + struct siw_wqe *wqe; + u32 data_bytes; /* all data bytes available */ + u32 rcvd_bytes; /* sum of data bytes rcvd */ + int rv = 0; + + if (frx->first_ddp_seg) { + wqe = siw_rqe_get(qp); + if (unlikely(!wqe)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_UNTAGGED_BUF, + DDP_ECODE_UT_INVALID_MSN_NOBUF, 0); + return -ENOENT; + } + } else { + wqe = rx_wqe(frx); + } + if (srx->state == SIW_GET_DATA_START) { + rv = siw_send_check_ntoh(srx, frx); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_FATAL); + return rv; + } + if (!srx->fpdu_part_rem) /* zero length SEND */ + return 0; + } + data_bytes = min(srx->fpdu_part_rem, srx->skb_new); + rcvd_bytes = 0; + + /* A zero length SEND will skip below loop */ + while (data_bytes) { + struct ib_pd *pd; + struct siw_mem **mem, *mem_p; + struct siw_sge *sge; + u32 sge_bytes; /* data bytes avail for SGE */ + + sge = &wqe->rqe.sge[frx->sge_idx]; + + if (!sge->length) { + /* just skip empty sge's */ + frx->sge_idx++; + frx->sge_off = 0; + frx->pbl_idx = 0; + continue; + } + sge_bytes = min(data_bytes, sge->length - frx->sge_off); + mem = &wqe->mem[frx->sge_idx]; + + /* + * check with QP's PD if no SRQ present, SRQ's PD otherwise + */ + pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd; + + rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE, + frx->sge_off, sge_bytes); + if (unlikely(rv)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_CATASTROPHIC, + DDP_ECODE_CATASTROPHIC, 0); + + siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + break; + } + mem_p = *mem; + if (mem_p->mem_obj == NULL) + rv = siw_rx_kva(srx, + (void *)(sge->laddr + frx->sge_off), + sge_bytes); + else if (!mem_p->is_pbl) + rv = siw_rx_umem(srx, mem_p->umem, + sge->laddr + frx->sge_off, sge_bytes); + else + rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p, + sge->laddr + frx->sge_off, sge_bytes); + + if (unlikely(rv != sge_bytes)) { + wqe->processed += rcvd_bytes; + + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_CATASTROPHIC, + DDP_ECODE_CATASTROPHIC, 0); + return -EINVAL; + } + frx->sge_off += rv; + + if (frx->sge_off == sge->length) { + frx->sge_idx++; + frx->sge_off = 0; + frx->pbl_idx = 0; + } + data_bytes -= rv; + rcvd_bytes += rv; + + srx->fpdu_part_rem -= rv; + srx->fpdu_part_rcvd += rv; + } + wqe->processed += rcvd_bytes; + + if (!srx->fpdu_part_rem) + return 0; + + return (rv < 0) ? rv : -EAGAIN; +} + +/* + * siw_proc_write: + * + * Place incoming WRITE after referencing and checking target buffer + + * Function supports partially received WRITEs (suspending/resuming + * current receive processing) + * + * return value: + * 0: reached the end of a DDP segment + * -EAGAIN: to be called again to finish the DDP segment + */ +int siw_proc_write(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct siw_rx_fpdu *frx = &qp->rx_tagged; + struct siw_mem *mem; + int bytes, rv; + + if (srx->state == SIW_GET_DATA_START) { + if (!srx->fpdu_part_rem) /* zero length WRITE */ + return 0; + + rv = siw_write_check_ntoh(srx, frx); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_FATAL); + return rv; + } + } + bytes = min(srx->fpdu_part_rem, srx->skb_new); + + if (frx->first_ddp_seg) { + struct siw_wqe *wqe = rx_wqe(frx); + + rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8); + if (unlikely(!rx_mem(frx))) { + siw_dbg_qp(qp, + "sink stag not found/invalid, stag 0x%08x\n", + srx->ddp_stag); + + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, + DDP_ECODE_T_INVALID_STAG, 0); + return -EINVAL; + } + wqe->rqe.num_sge = 1; + rx_type(wqe) = SIW_OP_WRITE; + wqe->wr_status = SIW_WR_INPROGRESS; + } + mem = rx_mem(frx); + + /* + * Check if application re-registered memory with different + * key field of STag. + */ + if (unlikely(mem->stag != srx->ddp_stag)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, + DDP_ECODE_T_INVALID_STAG, 0); + return -EINVAL; + } + rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd, + IB_ACCESS_REMOTE_WRITE, bytes); + if (unlikely(rv)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv), + 0); + + siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + + return -EINVAL; + } + + if (mem->mem_obj == NULL) + rv = siw_rx_kva(srx, + (void *)(srx->ddp_to + srx->fpdu_part_rcvd), + bytes); + else if (!mem->is_pbl) + rv = siw_rx_umem(srx, mem->umem, + srx->ddp_to + srx->fpdu_part_rcvd, bytes); + else + rv = siw_rx_pbl(srx, &frx->pbl_idx, mem, + srx->ddp_to + srx->fpdu_part_rcvd, bytes); + + if (unlikely(rv != bytes)) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_CATASTROPHIC, + DDP_ECODE_CATASTROPHIC, 0); + return -EINVAL; + } + srx->fpdu_part_rem -= rv; + srx->fpdu_part_rcvd += rv; + + if (!srx->fpdu_part_rem) { + srx->ddp_to += srx->fpdu_part_rcvd; + return 0; + } + return -EAGAIN; +} + +/* + * Inbound RREQ's cannot carry user data. + */ +int siw_proc_rreq(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + + if (!srx->fpdu_part_rem) + return 0; + + pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp), + be16_to_cpu(srx->hdr.ctrl.mpa_len)); + + return -EPROTO; +} + +/* + * siw_init_rresp: + * + * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE. + * Put it at the tail of the IRQ, if there is another WQE currently in + * transmit processing. If not, make it the current WQE to be processed + * and schedule transmit processing. + * + * Can be called from softirq context and from process + * context (RREAD socket loopback case!) + * + * return value: + * 0: success, + * failure code otherwise + */ + +static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx) +{ + struct siw_wqe *tx_work = tx_wqe(qp); + struct siw_sqe *resp; + + uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to), + laddr = be64_to_cpu(srx->hdr.rreq.source_to); + uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size), + lkey = be32_to_cpu(srx->hdr.rreq.source_stag), + rkey = be32_to_cpu(srx->hdr.rreq.sink_stag), + msn = be32_to_cpu(srx->hdr.rreq.ddp_msn); + + int run_sq = 1, rv = 0; + unsigned long flags; + + if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) { + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_UNTAGGED_BUF, + DDP_ECODE_UT_INVALID_MSN_RANGE, 0); + return -EPROTO; + } + spin_lock_irqsave(&qp->sq_lock, flags); + + if (tx_work->wr_status == SIW_WR_IDLE) { + /* + * immediately schedule READ response w/o + * consuming IRQ entry: IRQ must be empty. + */ + tx_work->processed = 0; + tx_work->mem[0] = NULL; + tx_work->wr_status = SIW_WR_QUEUED; + resp = &tx_work->sqe; + } else { + resp = irq_alloc_free(qp); + run_sq = 0; + } + if (likely(resp)) { + resp->opcode = SIW_OP_READ_RESPONSE; + + resp->sge[0].length = length; + resp->sge[0].laddr = laddr; + resp->sge[0].lkey = lkey; + + /* Keep aside message sequence number for potential + * error reporting during Read Response generation. + */ + resp->sge[1].length = msn; + + resp->raddr = raddr; + resp->rkey = rkey; + resp->num_sge = length ? 1 : 0; + + /* RRESP now valid as current TX wqe or placed into IRQ */ + smp_store_mb(resp->flags, SIW_WQE_VALID); + } else { + pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp), + qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size); + + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_REMOTE_OPERATION, + RDMAP_ECODE_CATASTROPHIC_STREAM, 0); + rv = -EPROTO; + } + + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (run_sq) + rv = siw_sq_start(qp); + + return rv; +} + +/* + * Only called at start of Read.Resonse processing. + * Transfer pending Read from tip of ORQ into currrent rx wqe, + * but keep ORQ entry valid until Read.Response processing done. + * No Queue locking needed. + */ +static int siw_orqe_start_rx(struct siw_qp *qp) +{ + struct siw_sqe *orqe; + struct siw_wqe *wqe = NULL; + + /* make sure ORQ indices are current */ + smp_mb(); + + orqe = orq_get_current(qp); + if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) { + /* RRESP is a TAGGED RDMAP operation */ + wqe = rx_wqe(&qp->rx_tagged); + wqe->sqe.id = orqe->id; + wqe->sqe.opcode = orqe->opcode; + wqe->sqe.sge[0].laddr = orqe->sge[0].laddr; + wqe->sqe.sge[0].lkey = orqe->sge[0].lkey; + wqe->sqe.sge[0].length = orqe->sge[0].length; + wqe->sqe.flags = orqe->flags; + wqe->sqe.num_sge = 1; + wqe->bytes = orqe->sge[0].length; + wqe->processed = 0; + wqe->mem[0] = NULL; + /* make sure WQE is completely written before valid */ + smp_wmb(); + wqe->wr_status = SIW_WR_INPROGRESS; + + return 0; + } + return -EPROTO; +} + +/* + * siw_proc_rresp: + * + * Place incoming RRESP data into memory referenced by RREQ WQE + * which is at the tip of the ORQ + * + * Function supports partially received RRESP's (suspending/resuming + * current receive processing) + */ +int siw_proc_rresp(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct siw_rx_fpdu *frx = &qp->rx_tagged; + struct siw_wqe *wqe = rx_wqe(frx); + struct siw_mem **mem, *mem_p; + struct siw_sge *sge; + int bytes, rv; + + if (frx->first_ddp_seg) { + if (unlikely(wqe->wr_status != SIW_WR_IDLE)) { + pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n", + qp_id(qp), wqe->wr_status, wqe->sqe.opcode); + rv = -EPROTO; + goto error_term; + } + /* + * fetch pending RREQ from orq + */ + rv = siw_orqe_start_rx(qp); + if (rv) { + pr_warn("siw: [QP %u]: ORQ empty at idx %d\n", + qp_id(qp), qp->orq_get % qp->attrs.orq_size); + goto error_term; + } + rv = siw_rresp_check_ntoh(srx, frx); + if (unlikely(rv)) { + siw_qp_event(qp, IB_EVENT_QP_FATAL); + return rv; + } + } else { + if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) { + pr_warn("siw: [QP %u]: resume RRESP: status %d\n", + qp_id(qp), wqe->wr_status); + rv = -EPROTO; + goto error_term; + } + } + if (!srx->fpdu_part_rem) /* zero length RRESPONSE */ + return 0; + + sge = wqe->sqe.sge; /* there is only one */ + mem = &wqe->mem[0]; + + if (!(*mem)) { + /* + * check target memory which resolves memory on first fragment + */ + rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0, + wqe->bytes); + if (unlikely(rv)) { + siw_dbg_qp(qp, "target mem check: %d\n", rv); + wqe->wc_status = SIW_WC_LOC_PROT_ERR; + + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, + DDP_ETYPE_TAGGED_BUF, + siw_tagged_error(-rv), 0); + + siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR); + + return -EINVAL; + } + } + mem_p = *mem; + + bytes = min(srx->fpdu_part_rem, srx->skb_new); + + if (mem_p->mem_obj == NULL) + rv = siw_rx_kva(srx, (void *)(sge->laddr + wqe->processed), + bytes); + else if (!mem_p->is_pbl) + rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed, + bytes); + else + rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p, + sge->laddr + wqe->processed, bytes); + if (rv != bytes) { + wqe->wc_status = SIW_WC_GENERAL_ERR; + rv = -EINVAL; + goto error_term; + } + srx->fpdu_part_rem -= rv; + srx->fpdu_part_rcvd += rv; + wqe->processed += rv; + + if (!srx->fpdu_part_rem) { + srx->ddp_to += srx->fpdu_part_rcvd; + return 0; + } + return -EAGAIN; + +error_term: + siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC, + DDP_ECODE_CATASTROPHIC, 0); + return rv; +} + +int siw_proc_terminate(struct siw_qp *qp) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct sk_buff *skb = srx->skb; + struct iwarp_terminate *term = &srx->hdr.terminate; + union iwarp_hdr term_info; + u8 *infop = (u8 *)&term_info; + enum rdma_opcode op; + u16 to_copy = sizeof(struct iwarp_ctrl); + + pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n", + __rdmap_term_layer(term), __rdmap_term_etype(term), + __rdmap_term_ecode(term)); + + if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE || + be32_to_cpu(term->ddp_msn) != + qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] || + be32_to_cpu(term->ddp_mo) != 0) { + pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n", + be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn), + be32_to_cpu(term->ddp_mo)); + return -ECONNRESET; + } + /* + * Receive remaining pieces of TERM if indicated + */ + if (!term->flag_m) + return -ECONNRESET; + + /* Do not take the effort to reassemble a network fragmented + * TERM message + */ + if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged)) + return -ECONNRESET; + + memset(infop, 0, sizeof(term_info)); + + skb_copy_bits(skb, srx->skb_offset, infop, to_copy); + + op = __rdmap_get_opcode(&term_info.ctrl); + if (op >= RDMAP_TERMINATE) + goto out; + + infop += to_copy; + srx->skb_offset += to_copy; + srx->skb_new -= to_copy; + srx->skb_copied += to_copy; + srx->fpdu_part_rcvd += to_copy; + srx->fpdu_part_rem -= to_copy; + + to_copy = iwarp_pktinfo[op].hdr_len - to_copy; + + /* Again, no network fragmented TERM's */ + if (to_copy + MPA_CRC_SIZE > srx->skb_new) + return -ECONNRESET; + + skb_copy_bits(skb, srx->skb_offset, infop, to_copy); + + if (term->flag_r) { + siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n", + op, be16_to_cpu(term_info.ctrl.mpa_len), + term->flag_m ? "valid" : "invalid"); + } else if (term->flag_d) { + siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n", + op, be16_to_cpu(term_info.ctrl.mpa_len), + term->flag_m ? "valid" : "invalid"); + } +out: + srx->skb_new -= to_copy; + srx->skb_offset += to_copy; + srx->skb_copied += to_copy; + srx->fpdu_part_rcvd += to_copy; + srx->fpdu_part_rem -= to_copy; + + return -ECONNRESET; +} + +static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx) +{ + struct sk_buff *skb = srx->skb; + u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad; + __wsum crc_in, crc_own = 0; + + siw_dbg_qp(qp, "expected %d, available %d, pad %u\n", + srx->fpdu_part_rem, srx->skb_new, srx->pad); + + if (srx->skb_new < srx->fpdu_part_rem) + return -EAGAIN; + + skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem); + + if (srx->mpa_crc_hd && srx->pad) + crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad); + + srx->skb_new -= srx->fpdu_part_rem; + srx->skb_offset += srx->fpdu_part_rem; + srx->skb_copied += srx->fpdu_part_rem; + + if (!srx->mpa_crc_hd) + return 0; + + /* + * CRC32 is computed, transmitted and received directly in NBO, + * so there's never a reason to convert byte order. + */ + crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own); + crc_in = (__force __wsum)srx->trailer.crc; + + if (unlikely(crc_in != crc_own)) { + pr_warn("siw: crc error. in: %08x, own %08x, op %u\n", + crc_in, crc_own, qp->rx_stream.rdmap_op); + + siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, + LLP_ETYPE_MPA, + LLP_ECODE_RECEIVED_CRC, 0); + return -EINVAL; + } + return 0; +} + +#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged) + +static int siw_get_hdr(struct siw_rx_stream *srx) +{ + struct sk_buff *skb = srx->skb; + struct siw_qp *qp = rx_qp(srx); + struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl; + struct siw_rx_fpdu *frx; + u8 opcode; + int bytes; + + if (srx->fpdu_part_rcvd < MIN_DDP_HDR) { + /* + * copy a mimimum sized (tagged) DDP frame control part + */ + bytes = min_t(int, srx->skb_new, + MIN_DDP_HDR - srx->fpdu_part_rcvd); + + skb_copy_bits(skb, srx->skb_offset, + (char *)c_hdr + srx->fpdu_part_rcvd, bytes); + + srx->fpdu_part_rcvd += bytes; + + srx->skb_new -= bytes; + srx->skb_offset += bytes; + srx->skb_copied += bytes; + + if (srx->fpdu_part_rcvd < MIN_DDP_HDR) + return -EAGAIN; + + if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) { + enum ddp_etype etype; + enum ddp_ecode ecode; + + pr_warn("siw: received ddp version unsupported %d\n", + __ddp_get_version(c_hdr)); + + if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) { + etype = DDP_ETYPE_TAGGED_BUF; + ecode = DDP_ECODE_T_VERSION; + } else { + etype = DDP_ETYPE_UNTAGGED_BUF; + ecode = DDP_ECODE_UT_VERSION; + } + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP, + etype, ecode, 0); + return -EINVAL; + } + if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) { + pr_warn("siw: received rdmap version unsupported %d\n", + __rdmap_get_version(c_hdr)); + + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_REMOTE_OPERATION, + RDMAP_ECODE_VERSION, 0); + return -EINVAL; + } + opcode = __rdmap_get_opcode(c_hdr); + + if (opcode > RDMAP_TERMINATE) { + pr_warn("siw: received unknown packet type %u\n", + opcode); + + siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_REMOTE_OPERATION, + RDMAP_ECODE_OPCODE, 0); + return -EINVAL; + } + siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode); + } else { + opcode = __rdmap_get_opcode(c_hdr); + } + set_rx_fpdu_context(qp, opcode); + frx = qp->rx_fpdu; + + /* + * Figure out len of current hdr: variable length of + * iwarp hdr may force us to copy hdr information in + * two steps. Only tagged DDP messages are already + * completely received. + */ + if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) { + bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR; + + if (srx->skb_new < bytes) + return -EAGAIN; + + skb_copy_bits(skb, srx->skb_offset, + (char *)c_hdr + srx->fpdu_part_rcvd, bytes); + + srx->fpdu_part_rcvd += bytes; + + srx->skb_new -= bytes; + srx->skb_offset += bytes; + srx->skb_copied += bytes; + } + + /* + * DDP/RDMAP header receive completed. Check if the current + * DDP segment starts a new RDMAP message or continues a previously + * started RDMAP message. + * + * Alternating reception of DDP segments (or FPDUs) from incomplete + * tagged and untagged RDMAP messages is supported, as long as + * the current tagged or untagged message gets eventually completed + * w/o intersection from another message of the same type + * (tagged/untagged). E.g., a WRITE can get intersected by a SEND, + * but not by a READ RESPONSE etc. + */ + if (srx->mpa_crc_hd) { + /* + * Restart CRC computation + */ + crypto_shash_init(srx->mpa_crc_hd); + crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr, + srx->fpdu_part_rcvd); + } + if (frx->more_ddp_segs) { + frx->first_ddp_seg = 0; + if (frx->prev_rdmap_op != opcode) { + pr_warn("siw: packet intersection: %u : %u\n", + frx->prev_rdmap_op, opcode); + /* + * The last inbound RDMA operation of same type + * (tagged or untagged) is left unfinished. + * To complete it in error, make it the current + * operation again, even with the header already + * overwritten. For error handling, only the opcode + * and current rx context are relevant. + */ + set_rx_fpdu_context(qp, frx->prev_rdmap_op); + __rdmap_set_opcode(c_hdr, frx->prev_rdmap_op); + return -EPROTO; + } + } else { + frx->prev_rdmap_op = opcode; + frx->first_ddp_seg = 1; + } + frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1; + + return 0; +} + +static int siw_check_tx_fence(struct siw_qp *qp) +{ + struct siw_wqe *tx_waiting = tx_wqe(qp); + struct siw_sqe *rreq; + int resume_tx = 0, rv = 0; + unsigned long flags; + + spin_lock_irqsave(&qp->orq_lock, flags); + + rreq = orq_get_current(qp); + + /* free current orq entry */ + WRITE_ONCE(rreq->flags, 0); + + if (qp->tx_ctx.orq_fence) { + if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) { + pr_warn("siw: [QP %u]: fence resume: bad status %d\n", + qp_id(qp), tx_waiting->wr_status); + rv = -EPROTO; + goto out; + } + /* resume SQ processing */ + if (tx_waiting->sqe.opcode == SIW_OP_READ || + tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) { + rreq = orq_get_tail(qp); + if (unlikely(!rreq)) { + pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp)); + rv = -EPROTO; + goto out; + } + siw_read_to_orq(rreq, &tx_waiting->sqe); + + qp->orq_put++; + qp->tx_ctx.orq_fence = 0; + resume_tx = 1; + + } else if (siw_orq_empty(qp)) { + qp->tx_ctx.orq_fence = 0; + resume_tx = 1; + } else { + pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n", + qp_id(qp), qp->orq_get, qp->orq_put); + rv = -EPROTO; + } + } + qp->orq_get++; +out: + spin_unlock_irqrestore(&qp->orq_lock, flags); + + if (resume_tx) + rv = siw_sq_start(qp); + + return rv; +} + +/* + * siw_rdmap_complete() + * + * Complete processing of an RDMA message after receiving all + * DDP segmens or ABort processing after encountering error case. + * + * o SENDs + RRESPs will need for completion, + * o RREQs need for READ RESPONSE initialization + * o WRITEs need memory dereferencing + * + * TODO: Failed WRITEs need local error to be surfaced. + */ +static int siw_rdmap_complete(struct siw_qp *qp, int error) +{ + struct siw_rx_stream *srx = &qp->rx_stream; + struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu); + enum siw_wc_status wc_status = wqe->wc_status; + u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl); + int rv = 0; + + switch (opcode) { + case RDMAP_SEND_SE: + case RDMAP_SEND_SE_INVAL: + wqe->rqe.flags |= SIW_WQE_SOLICITED; + /* Fall through */ + + case RDMAP_SEND: + case RDMAP_SEND_INVAL: + if (wqe->wr_status == SIW_WR_IDLE) + break; + + srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++; + + if (error != 0 && wc_status == SIW_WC_SUCCESS) + wc_status = SIW_WC_GENERAL_ERR; + /* + * Handle STag invalidation request + */ + if (wc_status == SIW_WC_SUCCESS && + (opcode == RDMAP_SEND_INVAL || + opcode == RDMAP_SEND_SE_INVAL)) { + rv = siw_invalidate_stag(qp->pd, srx->inval_stag); + if (rv) { + siw_init_terminate( + qp, TERM_ERROR_LAYER_RDMAP, + rv == -EACCES ? + RDMAP_ETYPE_REMOTE_PROTECTION : + RDMAP_ETYPE_REMOTE_OPERATION, + RDMAP_ECODE_CANNOT_INVALIDATE, 0); + + wc_status = SIW_WC_REM_INV_REQ_ERR; + } + rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed, + rv ? 0 : srx->inval_stag, + wc_status); + } else { + rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed, + 0, wc_status); + } + siw_wqe_put_mem(wqe, SIW_OP_RECEIVE); + break; + + case RDMAP_RDMA_READ_RESP: + if (wqe->wr_status == SIW_WR_IDLE) + break; + + if (error != 0) { + if ((srx->state == SIW_GET_HDR && + qp->rx_fpdu->first_ddp_seg) || error == -ENODATA) + /* possible RREQ in ORQ left untouched */ + break; + + if (wc_status == SIW_WC_SUCCESS) + wc_status = SIW_WC_GENERAL_ERR; + } else if (qp->kernel_verbs && + rx_type(wqe) == SIW_OP_READ_LOCAL_INV) { + /* + * Handle any STag invalidation request + */ + rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey); + if (rv) { + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_CATASTROPHIC, + RDMAP_ECODE_UNSPECIFIED, 0); + + if (wc_status == SIW_WC_SUCCESS) { + wc_status = SIW_WC_GENERAL_ERR; + error = rv; + } + } + } + /* + * All errors turn the wqe into signalled. + */ + if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0) + rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed, + wc_status); + siw_wqe_put_mem(wqe, SIW_OP_READ); + + if (!error) + rv = siw_check_tx_fence(qp); + else + /* Disable current ORQ eleement */ + WRITE_ONCE(orq_get_current(qp)->flags, 0); + break; + + case RDMAP_RDMA_READ_REQ: + if (!error) { + rv = siw_init_rresp(qp, srx); + srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++; + } + break; + + case RDMAP_RDMA_WRITE: + if (wqe->wr_status == SIW_WR_IDLE) + break; + + /* + * Free References from memory object if + * attached to receive context (inbound WRITE). + * While a zero-length WRITE is allowed, + * no memory reference got created. + */ + if (rx_mem(&qp->rx_tagged)) { + siw_mem_put(rx_mem(&qp->rx_tagged)); + rx_mem(&qp->rx_tagged) = NULL; + } + break; + + default: + break; + } + wqe->wr_status = SIW_WR_IDLE; + + return rv; +} + +/* + * siw_tcp_rx_data() + * + * Main routine to consume inbound TCP payload + * + * @rd_desc: read descriptor + * @skb: socket buffer + * @off: offset in skb + * @len: skb->len - offset : payload in skb + */ +int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int off, size_t len) +{ + struct siw_qp *qp = rd_desc->arg.data; + struct siw_rx_stream *srx = &qp->rx_stream; + int rv; + + srx->skb = skb; + srx->skb_new = skb->len - off; + srx->skb_offset = off; + srx->skb_copied = 0; + + siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new); + + while (srx->skb_new) { + int run_completion = 1; + + if (unlikely(srx->rx_suspend)) { + /* Do not process any more data */ + srx->skb_copied += srx->skb_new; + break; + } + switch (srx->state) { + case SIW_GET_HDR: + rv = siw_get_hdr(srx); + if (!rv) { + srx->fpdu_part_rem = + be16_to_cpu(srx->hdr.ctrl.mpa_len) - + srx->fpdu_part_rcvd + MPA_HDR_SIZE; + + if (srx->fpdu_part_rem) + srx->pad = -srx->fpdu_part_rem & 0x3; + else + srx->pad = 0; + + srx->state = SIW_GET_DATA_START; + srx->fpdu_part_rcvd = 0; + } + break; + + case SIW_GET_DATA_MORE: + /* + * Another data fragment of the same DDP segment. + * Setting first_ddp_seg = 0 avoids repeating + * initializations that shall occur only once per + * DDP segment. + */ + qp->rx_fpdu->first_ddp_seg = 0; + /* Fall through */ + + case SIW_GET_DATA_START: + /* + * Headers will be checked by the opcode-specific + * data receive function below. + */ + rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp); + if (!rv) { + int mpa_len = + be16_to_cpu(srx->hdr.ctrl.mpa_len) + + MPA_HDR_SIZE; + + srx->fpdu_part_rem = (-mpa_len & 0x3) + + MPA_CRC_SIZE; + srx->fpdu_part_rcvd = 0; + srx->state = SIW_GET_TRAILER; + } else { + if (unlikely(rv == -ECONNRESET)) + run_completion = 0; + else + srx->state = SIW_GET_DATA_MORE; + } + break; + + case SIW_GET_TRAILER: + /* + * read CRC + any padding + */ + rv = siw_get_trailer(qp, srx); + if (likely(!rv)) { + /* + * FPDU completed. + * complete RDMAP message if last fragment + */ + srx->state = SIW_GET_HDR; + srx->fpdu_part_rcvd = 0; + + if (!(srx->hdr.ctrl.ddp_rdmap_ctrl & + DDP_FLAG_LAST)) + /* more frags */ + break; + + rv = siw_rdmap_complete(qp, 0); + run_completion = 0; + } + break; + + default: + pr_warn("QP[%u]: RX out of state\n", qp_id(qp)); + rv = -EPROTO; + run_completion = 0; + } + if (unlikely(rv != 0 && rv != -EAGAIN)) { + if ((srx->state > SIW_GET_HDR || + qp->rx_fpdu->more_ddp_segs) && run_completion) + siw_rdmap_complete(qp, rv); + + siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv, + srx->state); + + siw_qp_cm_drop(qp, 1); + + break; + } + if (rv) { + siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n", + srx->state, srx->fpdu_part_rem); + break; + } + } + return srx->skb_copied; +} diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c b/drivers/infiniband/sw/siw/siw_qp_tx.c new file mode 100644 index 000000000000..43020d2040fc --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_qp_tx.c @@ -0,0 +1,1269 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/net.h> +#include <linux/scatterlist.h> +#include <linux/highmem.h> +#include <net/tcp.h> + +#include <rdma/iw_cm.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_user_verbs.h> + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +#define MAX_HDR_INLINE \ + (((uint32_t)(sizeof(struct siw_rreq_pkt) - \ + sizeof(struct iwarp_send))) & 0xF8) + +static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx) +{ + struct siw_pbl *pbl = mem->pbl; + u64 offset = addr - mem->va; + u64 paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx); + + if (paddr) + return virt_to_page(paddr); + + return NULL; +} + +/* + * Copy short payload at provided destination payload address + */ +static int siw_try_1seg(struct siw_iwarp_tx *c_tx, u64 paddr) +{ + struct siw_wqe *wqe = &c_tx->wqe_active; + struct siw_sge *sge = &wqe->sqe.sge[0]; + u32 bytes = sge->length; + + if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1) + return MAX_HDR_INLINE + 1; + + if (!bytes) + return 0; + + if (tx_flags(wqe) & SIW_WQE_INLINE) { + memcpy((void *)paddr, &wqe->sqe.sge[1], bytes); + } else { + struct siw_mem *mem = wqe->mem[0]; + + if (!mem->mem_obj) { + /* Kernel client using kva */ + memcpy((void *)paddr, (void *)sge->laddr, bytes); + } else if (c_tx->in_syscall) { + if (copy_from_user((void *)paddr, + (const void __user *)sge->laddr, + bytes)) + return -EFAULT; + } else { + unsigned int off = sge->laddr & ~PAGE_MASK; + struct page *p; + char *buffer; + int pbl_idx = 0; + + if (!mem->is_pbl) + p = siw_get_upage(mem->umem, sge->laddr); + else + p = siw_get_pblpage(mem, sge->laddr, &pbl_idx); + + if (unlikely(!p)) + return -EFAULT; + + buffer = kmap_atomic(p); + + if (likely(PAGE_SIZE - off >= bytes)) { + memcpy((void *)paddr, buffer + off, bytes); + kunmap_atomic(buffer); + } else { + unsigned long part = bytes - (PAGE_SIZE - off); + + memcpy((void *)paddr, buffer + off, part); + kunmap_atomic(buffer); + + if (!mem->is_pbl) + p = siw_get_upage(mem->umem, + sge->laddr + part); + else + p = siw_get_pblpage(mem, + sge->laddr + part, + &pbl_idx); + if (unlikely(!p)) + return -EFAULT; + + buffer = kmap_atomic(p); + memcpy((void *)(paddr + part), buffer, + bytes - part); + kunmap_atomic(buffer); + } + } + } + return (int)bytes; +} + +#define PKT_FRAGMENTED 1 +#define PKT_COMPLETE 0 + +/* + * siw_qp_prepare_tx() + * + * Prepare tx state for sending out one fpdu. Builds complete pkt + * if no user data or only immediate data are present. + * + * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise. + */ +static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx) +{ + struct siw_wqe *wqe = &c_tx->wqe_active; + char *crc = NULL; + int data = 0; + + switch (tx_type(wqe)) { + case SIW_OP_READ: + case SIW_OP_READ_LOCAL_INV: + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl, + sizeof(struct iwarp_ctrl)); + + c_tx->pkt.rreq.rsvd = 0; + c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ); + c_tx->pkt.rreq.ddp_msn = + htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]); + c_tx->pkt.rreq.ddp_mo = 0; + c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey); + c_tx->pkt.rreq.sink_to = + cpu_to_be64(wqe->sqe.sge[0].laddr); + c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey); + c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr); + c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length); + + c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq); + crc = (char *)&c_tx->pkt.rreq_pkt.crc; + break; + + case SIW_OP_SEND: + if (tx_flags(wqe) & SIW_WQE_SOLICITED) + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_SEND_SE].ctrl, + sizeof(struct iwarp_ctrl)); + else + memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl, + sizeof(struct iwarp_ctrl)); + + c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND; + c_tx->pkt.send.ddp_msn = + htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); + c_tx->pkt.send.ddp_mo = 0; + + c_tx->pkt.send_inv.inval_stag = 0; + + c_tx->ctrl_len = sizeof(struct iwarp_send); + + crc = (char *)&c_tx->pkt.send_pkt.crc; + data = siw_try_1seg(c_tx, (u64)crc); + break; + + case SIW_OP_SEND_REMOTE_INV: + if (tx_flags(wqe) & SIW_WQE_SOLICITED) + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl, + sizeof(struct iwarp_ctrl)); + else + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl, + sizeof(struct iwarp_ctrl)); + + c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND; + c_tx->pkt.send.ddp_msn = + htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]); + c_tx->pkt.send.ddp_mo = 0; + + c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey); + + c_tx->ctrl_len = sizeof(struct iwarp_send_inv); + + crc = (char *)&c_tx->pkt.send_pkt.crc; + data = siw_try_1seg(c_tx, (u64)crc); + break; + + case SIW_OP_WRITE: + memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl, + sizeof(struct iwarp_ctrl)); + + c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey); + c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr); + c_tx->ctrl_len = sizeof(struct iwarp_rdma_write); + + crc = (char *)&c_tx->pkt.write_pkt.crc; + data = siw_try_1seg(c_tx, (u64)crc); + break; + + case SIW_OP_READ_RESPONSE: + memcpy(&c_tx->pkt.ctrl, + &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl, + sizeof(struct iwarp_ctrl)); + + /* NBO */ + c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey); + c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr); + + c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp); + + crc = (char *)&c_tx->pkt.write_pkt.crc; + data = siw_try_1seg(c_tx, (u64)crc); + break; + + default: + siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe)); + return -EOPNOTSUPP; + } + if (unlikely(data < 0)) + return data; + + c_tx->ctrl_sent = 0; + + if (data <= MAX_HDR_INLINE) { + if (data) { + wqe->processed = data; + + c_tx->pkt.ctrl.mpa_len = + htons(c_tx->ctrl_len + data - MPA_HDR_SIZE); + + /* Add pad, if needed */ + data += -(int)data & 0x3; + /* advance CRC location after payload */ + crc += data; + c_tx->ctrl_len += data; + + if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)) + c_tx->pkt.c_untagged.ddp_mo = 0; + else + c_tx->pkt.c_tagged.ddp_to = + cpu_to_be64(wqe->sqe.raddr); + } + + *(u32 *)crc = 0; + /* + * Do complete CRC if enabled and short packet + */ + if (c_tx->mpa_crc_hd) { + crypto_shash_init(c_tx->mpa_crc_hd); + if (crypto_shash_update(c_tx->mpa_crc_hd, + (u8 *)&c_tx->pkt, + c_tx->ctrl_len)) + return -EINVAL; + crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)crc); + } + c_tx->ctrl_len += MPA_CRC_SIZE; + + return PKT_COMPLETE; + } + c_tx->ctrl_len += MPA_CRC_SIZE; + c_tx->sge_idx = 0; + c_tx->sge_off = 0; + c_tx->pbl_idx = 0; + + /* + * Allow direct sending out of user buffer if WR is non signalled + * and payload is over threshold. + * Per RDMA verbs, the application should not change the send buffer + * until the work completed. In iWarp, work completion is only + * local delivery to TCP. TCP may reuse the buffer for + * retransmission. Changing unsent data also breaks the CRC, + * if applied. + */ + if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH && + !(tx_flags(wqe) & SIW_WQE_SIGNALLED)) + c_tx->use_sendpage = 1; + else + c_tx->use_sendpage = 0; + + return PKT_FRAGMENTED; +} + +/* + * Send out one complete control type FPDU, or header of FPDU carrying + * data. Used for fixed sized packets like Read.Requests or zero length + * SENDs, WRITEs, READ.Responses, or header only. + */ +static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s, + int flags) +{ + struct msghdr msg = { .msg_flags = flags }; + struct kvec iov = { .iov_base = + (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent, + .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent }; + + int rv = kernel_sendmsg(s, &msg, &iov, 1, + c_tx->ctrl_len - c_tx->ctrl_sent); + + if (rv >= 0) { + c_tx->ctrl_sent += rv; + + if (c_tx->ctrl_sent == c_tx->ctrl_len) + rv = 0; + else + rv = -EAGAIN; + } + return rv; +} + +/* + * 0copy TCP transmit interface: Use do_tcp_sendpages. + * + * Using sendpage to push page by page appears to be less efficient + * than using sendmsg, even if data are copied. + * + * A general performance limitation might be the extra four bytes + * trailer checksum segment to be pushed after user data. + */ +static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset, + size_t size) +{ + struct sock *sk = s->sk; + int i = 0, rv = 0, sent = 0, + flags = MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST; + + while (size) { + size_t bytes = min_t(size_t, PAGE_SIZE - offset, size); + + if (size + offset <= PAGE_SIZE) + flags = MSG_MORE | MSG_DONTWAIT; + + tcp_rate_check_app_limited(sk); +try_page_again: + lock_sock(sk); + rv = do_tcp_sendpages(sk, page[i], offset, bytes, flags); + release_sock(sk); + + if (rv > 0) { + size -= rv; + sent += rv; + if (rv != bytes) { + offset += rv; + bytes -= rv; + goto try_page_again; + } + offset = 0; + } else { + if (rv == -EAGAIN || rv == 0) + break; + return rv; + } + i++; + } + return sent; +} + +/* + * siw_0copy_tx() + * + * Pushes list of pages to TCP socket. If pages from multiple + * SGE's, all referenced pages of each SGE are pushed in one + * shot. + */ +static int siw_0copy_tx(struct socket *s, struct page **page, + struct siw_sge *sge, unsigned int offset, + unsigned int size) +{ + int i = 0, sent = 0, rv; + int sge_bytes = min(sge->length - offset, size); + + offset = (sge->laddr + offset) & ~PAGE_MASK; + + while (sent != size) { + rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes); + if (rv >= 0) { + sent += rv; + if (size == sent || sge_bytes > rv) + break; + + i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT; + sge++; + sge_bytes = min(sge->length, size - sent); + offset = sge->laddr & ~PAGE_MASK; + } else { + sent = rv; + break; + } + } + return sent; +} + +#define MAX_TRAILER (MPA_CRC_SIZE + 4) + +static void siw_unmap_pages(struct page **pages, int hdr_len, int num_maps) +{ + if (hdr_len) { + ++pages; + --num_maps; + } + while (num_maps-- > 0) { + kunmap(*pages); + pages++; + } +} + +/* + * siw_tx_hdt() tries to push a complete packet to TCP where all + * packet fragments are referenced by the elements of one iovec. + * For the data portion, each involved page must be referenced by + * one extra element. All sge's data can be non-aligned to page + * boundaries. Two more elements are referencing iWARP header + * and trailer: + * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + (2 * (SIW_MAX_SGE - 1) + HDR + TRL + */ +#define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2)) + +/* + * Write out iov referencing hdr, data and trailer of current FPDU. + * Update transmit state dependent on write return status + */ +static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s) +{ + struct siw_wqe *wqe = &c_tx->wqe_active; + struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx]; + struct kvec iov[MAX_ARRAY]; + struct page *page_array[MAX_ARRAY]; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + + int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv; + unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0, + sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx, + pbl_idx = c_tx->pbl_idx; + + if (c_tx->state == SIW_SEND_HDR) { + if (c_tx->use_sendpage) { + rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE); + if (rv) + goto done; + + c_tx->state = SIW_SEND_DATA; + } else { + iov[0].iov_base = + (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent; + iov[0].iov_len = hdr_len = + c_tx->ctrl_len - c_tx->ctrl_sent; + seg = 1; + } + } + + wqe->processed += data_len; + + while (data_len) { /* walk the list of SGE's */ + unsigned int sge_len = min(sge->length - sge_off, data_len); + unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK; + struct siw_mem *mem; + + if (!(tx_flags(wqe) & SIW_WQE_INLINE)) { + mem = wqe->mem[sge_idx]; + if (!mem->mem_obj) + is_kva = 1; + } else { + is_kva = 1; + } + if (is_kva && !c_tx->use_sendpage) { + /* + * tx from kernel virtual address: either inline data + * or memory region with assigned kernel buffer + */ + iov[seg].iov_base = (void *)(sge->laddr + sge_off); + iov[seg].iov_len = sge_len; + + if (do_crc) + crypto_shash_update(c_tx->mpa_crc_hd, + iov[seg].iov_base, + sge_len); + sge_off += sge_len; + data_len -= sge_len; + seg++; + goto sge_done; + } + + while (sge_len) { + size_t plen = min((int)PAGE_SIZE - fp_off, sge_len); + + if (!is_kva) { + struct page *p; + + if (mem->is_pbl) + p = siw_get_pblpage( + mem, sge->laddr + sge_off, + &pbl_idx); + else + p = siw_get_upage(mem->umem, + sge->laddr + sge_off); + if (unlikely(!p)) { + if (hdr_len) + seg--; + if (!c_tx->use_sendpage && seg) { + siw_unmap_pages(page_array, + hdr_len, seg); + } + wqe->processed -= c_tx->bytes_unsent; + rv = -EFAULT; + goto done_crc; + } + page_array[seg] = p; + + if (!c_tx->use_sendpage) { + iov[seg].iov_base = kmap(p) + fp_off; + iov[seg].iov_len = plen; + if (do_crc) + crypto_shash_update( + c_tx->mpa_crc_hd, + iov[seg].iov_base, + plen); + } else if (do_crc) + crypto_shash_update( + c_tx->mpa_crc_hd, + page_address(p) + fp_off, + plen); + } else { + u64 pa = ((sge->laddr + sge_off) & PAGE_MASK); + + page_array[seg] = virt_to_page(pa); + if (do_crc) + crypto_shash_update( + c_tx->mpa_crc_hd, + (void *)(sge->laddr + sge_off), + plen); + } + + sge_len -= plen; + sge_off += plen; + data_len -= plen; + fp_off = 0; + + if (++seg > (int)MAX_ARRAY) { + siw_dbg_qp(tx_qp(c_tx), "to many fragments\n"); + if (!is_kva && !c_tx->use_sendpage) { + siw_unmap_pages(page_array, hdr_len, + seg - 1); + } + wqe->processed -= c_tx->bytes_unsent; + rv = -EMSGSIZE; + goto done_crc; + } + } +sge_done: + /* Update SGE variables at end of SGE */ + if (sge_off == sge->length && + (data_len != 0 || wqe->processed < wqe->bytes)) { + sge_idx++; + sge++; + sge_off = 0; + } + } + /* trailer */ + if (likely(c_tx->state != SIW_SEND_TRAILER)) { + iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad]; + iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad); + } else { + iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent]; + iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent; + } + + if (c_tx->pad) { + *(u32 *)c_tx->trailer.pad = 0; + if (do_crc) + crypto_shash_update(c_tx->mpa_crc_hd, + (u8 *)&c_tx->trailer.crc - c_tx->pad, + c_tx->pad); + } + if (!c_tx->mpa_crc_hd) + c_tx->trailer.crc = 0; + else if (do_crc) + crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc); + + data_len = c_tx->bytes_unsent; + + if (c_tx->use_sendpage) { + rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx], + c_tx->sge_off, data_len); + if (rv == data_len) { + rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len); + if (rv > 0) + rv += data_len; + else + rv = data_len; + } + } else { + rv = kernel_sendmsg(s, &msg, iov, seg + 1, + hdr_len + data_len + trl_len); + if (!is_kva) + siw_unmap_pages(page_array, hdr_len, seg); + } + if (rv < (int)hdr_len) { + /* Not even complete hdr pushed or negative rv */ + wqe->processed -= data_len; + if (rv >= 0) { + c_tx->ctrl_sent += rv; + rv = -EAGAIN; + } + goto done_crc; + } + rv -= hdr_len; + + if (rv >= (int)data_len) { + /* all user data pushed to TCP or no data to push */ + if (data_len > 0 && wqe->processed < wqe->bytes) { + /* Save the current state for next tx */ + c_tx->sge_idx = sge_idx; + c_tx->sge_off = sge_off; + c_tx->pbl_idx = pbl_idx; + } + rv -= data_len; + + if (rv == trl_len) /* all pushed */ + rv = 0; + else { + c_tx->state = SIW_SEND_TRAILER; + c_tx->ctrl_len = MAX_TRAILER; + c_tx->ctrl_sent = rv + 4 - c_tx->pad; + c_tx->bytes_unsent = 0; + rv = -EAGAIN; + } + + } else if (data_len > 0) { + /* Maybe some user data pushed to TCP */ + c_tx->state = SIW_SEND_DATA; + wqe->processed -= data_len - rv; + + if (rv) { + /* + * Some bytes out. Recompute tx state based + * on old state and bytes pushed + */ + unsigned int sge_unsent; + + c_tx->bytes_unsent -= rv; + sge = &wqe->sqe.sge[c_tx->sge_idx]; + sge_unsent = sge->length - c_tx->sge_off; + + while (sge_unsent <= rv) { + rv -= sge_unsent; + c_tx->sge_idx++; + c_tx->sge_off = 0; + sge++; + sge_unsent = sge->length; + } + c_tx->sge_off += rv; + } + rv = -EAGAIN; + } +done_crc: + c_tx->do_crc = 0; +done: + return rv; +} + +static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx, + struct socket *s) +{ + struct tcp_sock *tp = tcp_sk(s->sk); + + if (tp->gso_segs) { + if (c_tx->gso_seg_limit == 0) + c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs; + else + c_tx->tcp_seglen = + tp->mss_cache * + min_t(u16, c_tx->gso_seg_limit, tp->gso_segs); + } else { + c_tx->tcp_seglen = tp->mss_cache; + } + /* Loopback may give odd numbers */ + c_tx->tcp_seglen &= 0xfffffff8; +} + +/* + * siw_prepare_fpdu() + * + * Prepares transmit context to send out one FPDU if FPDU will contain + * user data and user data are not immediate data. + * Computes maximum FPDU length to fill up TCP MSS if possible. + * + * @qp: QP from which to transmit + * @wqe: Current WQE causing transmission + * + * TODO: Take into account real available sendspace on socket + * to avoid header misalignment due to send pausing within + * fpdu transmission + */ +static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe) +{ + struct siw_iwarp_tx *c_tx = &qp->tx_ctx; + int data_len; + + c_tx->ctrl_len = + iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len; + c_tx->ctrl_sent = 0; + + /* + * Update target buffer offset if any + */ + if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)) + /* Untagged message */ + c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed); + else /* Tagged message */ + c_tx->pkt.c_tagged.ddp_to = + cpu_to_be64(wqe->sqe.raddr + wqe->processed); + + data_len = wqe->bytes - wqe->processed; + if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) { + /* Trim DDP payload to fit into current TCP segment */ + data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE); + c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST; + c_tx->pad = 0; + } else { + c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST; + c_tx->pad = -data_len & 0x3; + } + c_tx->bytes_unsent = data_len; + + c_tx->pkt.ctrl.mpa_len = + htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE); + + /* + * Init MPA CRC computation + */ + if (c_tx->mpa_crc_hd) { + crypto_shash_init(c_tx->mpa_crc_hd); + crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt, + c_tx->ctrl_len); + c_tx->do_crc = 1; + } +} + +/* + * siw_check_sgl_tx() + * + * Check permissions for a list of SGE's (SGL). + * A successful check will have all memory referenced + * for transmission resolved and assigned to the WQE. + * + * @pd: Protection Domain SGL should belong to + * @wqe: WQE to be checked + * @perms: requested access permissions + * + */ + +static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe, + enum ib_access_flags perms) +{ + struct siw_sge *sge = &wqe->sqe.sge[0]; + int i, len, num_sge = wqe->sqe.num_sge; + + if (unlikely(num_sge > SIW_MAX_SGE)) + return -EINVAL; + + for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) { + /* + * rdma verbs: do not check stag for a zero length sge + */ + if (sge->length) { + int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0, + sge->length); + + if (unlikely(rv != E_ACCESS_OK)) + return rv; + } + len += sge->length; + } + return len; +} + +/* + * siw_qp_sq_proc_tx() + * + * Process one WQE which needs transmission on the wire. + */ +static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe) +{ + struct siw_iwarp_tx *c_tx = &qp->tx_ctx; + struct socket *s = qp->attrs.sk; + int rv = 0, burst_len = qp->tx_ctx.burst; + enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM; + + if (unlikely(wqe->wr_status == SIW_WR_IDLE)) + return 0; + + if (!burst_len) + burst_len = SQ_USER_MAXBURST; + + if (wqe->wr_status == SIW_WR_QUEUED) { + if (!(wqe->sqe.flags & SIW_WQE_INLINE)) { + if (tx_type(wqe) == SIW_OP_READ_RESPONSE) + wqe->sqe.num_sge = 1; + + if (tx_type(wqe) != SIW_OP_READ && + tx_type(wqe) != SIW_OP_READ_LOCAL_INV) { + /* + * Reference memory to be tx'd w/o checking + * access for LOCAL_READ permission, since + * not defined in RDMA core. + */ + rv = siw_check_sgl_tx(qp->pd, wqe, 0); + if (rv < 0) { + if (tx_type(wqe) == + SIW_OP_READ_RESPONSE) + ecode = siw_rdmap_error(-rv); + rv = -EINVAL; + goto tx_error; + } + wqe->bytes = rv; + } else { + wqe->bytes = 0; + } + } else { + wqe->bytes = wqe->sqe.sge[0].length; + if (!qp->kernel_verbs) { + if (wqe->bytes > SIW_MAX_INLINE) { + rv = -EINVAL; + goto tx_error; + } + wqe->sqe.sge[0].laddr = (u64)&wqe->sqe.sge[1]; + } + } + wqe->wr_status = SIW_WR_INPROGRESS; + wqe->processed = 0; + + siw_update_tcpseg(c_tx, s); + + rv = siw_qp_prepare_tx(c_tx); + if (rv == PKT_FRAGMENTED) { + c_tx->state = SIW_SEND_HDR; + siw_prepare_fpdu(qp, wqe); + } else if (rv == PKT_COMPLETE) { + c_tx->state = SIW_SEND_SHORT_FPDU; + } else { + goto tx_error; + } + } + +next_segment: + siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n", + tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed, + wqe->sqe.id); + + if (--burst_len == 0) { + rv = -EINPROGRESS; + goto tx_done; + } + if (c_tx->state == SIW_SEND_SHORT_FPDU) { + enum siw_opcode tx_type = tx_type(wqe); + unsigned int msg_flags; + + if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1) + /* + * End current TCP segment, if SQ runs empty, + * or siw_tcp_nagle is not set, or we bail out + * soon due to no burst credit left. + */ + msg_flags = MSG_DONTWAIT; + else + msg_flags = MSG_DONTWAIT | MSG_MORE; + + rv = siw_tx_ctrl(c_tx, s, msg_flags); + + if (!rv && tx_type != SIW_OP_READ && + tx_type != SIW_OP_READ_LOCAL_INV) + wqe->processed = wqe->bytes; + + goto tx_done; + + } else { + rv = siw_tx_hdt(c_tx, s); + } + if (!rv) { + /* + * One segment sent. Processing completed if last + * segment, Do next segment otherwise. + */ + if (unlikely(c_tx->tx_suspend)) { + /* + * Verbs, 6.4.: Try stopping sending after a full + * DDP segment if the connection goes down + * (== peer halfclose) + */ + rv = -ECONNABORTED; + goto tx_done; + } + if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) { + siw_dbg_qp(qp, "WQE completed\n"); + goto tx_done; + } + c_tx->state = SIW_SEND_HDR; + + siw_update_tcpseg(c_tx, s); + + siw_prepare_fpdu(qp, wqe); + goto next_segment; + } +tx_done: + qp->tx_ctx.burst = burst_len; + return rv; + +tx_error: + if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM) + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1); + else + siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP, + RDMAP_ETYPE_CATASTROPHIC, + RDMAP_ECODE_UNSPECIFIED, 1); + return rv; +} + +static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe) +{ + struct ib_mr *base_mr = (struct ib_mr *)sqe->base_mr; + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mem *mem = siw_mem_id2obj(sdev, sqe->rkey >> 8); + int rv = 0; + + siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey); + + if (unlikely(!mem || !base_mr)) { + pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey); + return -EINVAL; + } + if (unlikely(base_mr->rkey >> 8 != sqe->rkey >> 8)) { + pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey); + rv = -EINVAL; + goto out; + } + if (unlikely(mem->pd != pd)) { + pr_warn("siw: fastreg: PD mismatch\n"); + rv = -EINVAL; + goto out; + } + if (unlikely(mem->stag_valid)) { + pr_warn("siw: fastreg: STag 0x%08x already valid\n", sqe->rkey); + rv = -EINVAL; + goto out; + } + /* Refresh STag since user may have changed key part */ + mem->stag = sqe->rkey; + mem->perms = sqe->access; + + siw_dbg_mem(mem, "STag now valid, MR va: 0x%016llx -> 0x%016llx\n", + mem->va, base_mr->iova); + mem->va = base_mr->iova; + mem->stag_valid = 1; +out: + siw_mem_put(mem); + return rv; +} + +static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe) +{ + int rv; + + switch (tx_type(wqe)) { + case SIW_OP_REG_MR: + rv = siw_fastreg_mr(qp->pd, &wqe->sqe); + break; + + case SIW_OP_INVAL_STAG: + rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey); + break; + + default: + rv = -EINVAL; + } + return rv; +} + +/* + * siw_qp_sq_process() + * + * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket. + * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more + * MPA FPDUs, each containing a DDP segment. + * + * SQ processing may occur in user context as a result of posting + * new WQE's or from siw_sq_work_handler() context. Processing in + * user context is limited to non-kernel verbs users. + * + * SQ processing may get paused anytime, possibly in the middle of a WR + * or FPDU, if insufficient send space is available. SQ processing + * gets resumed from siw_sq_work_handler(), if send space becomes + * available again. + * + * Must be called with the QP state read-locked. + * + * Note: + * An outbound RREQ can be satisfied by the corresponding RRESP + * _before_ it gets assigned to the ORQ. This happens regularly + * in RDMA READ via loopback case. Since both outbound RREQ and + * inbound RRESP can be handled by the same CPU, locking the ORQ + * is dead-lock prone and thus not an option. With that, the + * RREQ gets assigned to the ORQ _before_ being sent - see + * siw_activate_tx() - and pulled back in case of send failure. + */ +int siw_qp_sq_process(struct siw_qp *qp) +{ + struct siw_wqe *wqe = tx_wqe(qp); + enum siw_opcode tx_type; + unsigned long flags; + int rv = 0; + + siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe)); + +next_wqe: + /* + * Stop QP processing if SQ state changed + */ + if (unlikely(qp->tx_ctx.tx_suspend)) { + siw_dbg_qp(qp, "tx suspended\n"); + goto done; + } + tx_type = tx_type(wqe); + + if (tx_type <= SIW_OP_READ_RESPONSE) + rv = siw_qp_sq_proc_tx(qp, wqe); + else + rv = siw_qp_sq_proc_local(qp, wqe); + + if (!rv) { + /* + * WQE processing done + */ + switch (tx_type) { + case SIW_OP_SEND: + case SIW_OP_SEND_REMOTE_INV: + case SIW_OP_WRITE: + siw_wqe_put_mem(wqe, tx_type); + /* Fall through */ + + case SIW_OP_INVAL_STAG: + case SIW_OP_REG_MR: + if (tx_flags(wqe) & SIW_WQE_SIGNALLED) + siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, + SIW_WC_SUCCESS); + break; + + case SIW_OP_READ: + case SIW_OP_READ_LOCAL_INV: + /* + * already enqueued to ORQ queue + */ + break; + + case SIW_OP_READ_RESPONSE: + siw_wqe_put_mem(wqe, tx_type); + break; + + default: + WARN(1, "undefined WQE type %d\n", tx_type); + rv = -EINVAL; + goto done; + } + + spin_lock_irqsave(&qp->sq_lock, flags); + wqe->wr_status = SIW_WR_IDLE; + rv = siw_activate_tx(qp); + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (rv <= 0) + goto done; + + goto next_wqe; + + } else if (rv == -EAGAIN) { + siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n", + qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len, + qp->tx_ctx.bytes_unsent); + rv = 0; + goto done; + } else if (rv == -EINPROGRESS) { + rv = siw_sq_start(qp); + goto done; + } else { + /* + * WQE processing failed. + * Verbs 8.3.2: + * o It turns any WQE into a signalled WQE. + * o Local catastrophic error must be surfaced + * o QP must be moved into Terminate state: done by code + * doing socket state change processing + * + * o TODO: Termination message must be sent. + * o TODO: Implement more precise work completion errors, + * see enum ib_wc_status in ib_verbs.h + */ + siw_dbg_qp(qp, "wqe type %d processing failed: %d\n", + tx_type(wqe), rv); + + spin_lock_irqsave(&qp->sq_lock, flags); + /* + * RREQ may have already been completed by inbound RRESP! + */ + if (tx_type == SIW_OP_READ || + tx_type == SIW_OP_READ_LOCAL_INV) { + /* Cleanup pending entry in ORQ */ + qp->orq_put--; + qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0; + } + spin_unlock_irqrestore(&qp->sq_lock, flags); + /* + * immediately suspends further TX processing + */ + if (!qp->tx_ctx.tx_suspend) + siw_qp_cm_drop(qp, 0); + + switch (tx_type) { + case SIW_OP_SEND: + case SIW_OP_SEND_REMOTE_INV: + case SIW_OP_SEND_WITH_IMM: + case SIW_OP_WRITE: + case SIW_OP_READ: + case SIW_OP_READ_LOCAL_INV: + siw_wqe_put_mem(wqe, tx_type); + /* Fall through */ + + case SIW_OP_INVAL_STAG: + case SIW_OP_REG_MR: + siw_sqe_complete(qp, &wqe->sqe, wqe->bytes, + SIW_WC_LOC_QP_OP_ERR); + + siw_qp_event(qp, IB_EVENT_QP_FATAL); + + break; + + case SIW_OP_READ_RESPONSE: + siw_dbg_qp(qp, "proc. read.response failed: %d\n", rv); + + siw_qp_event(qp, IB_EVENT_QP_REQ_ERR); + + siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE); + + break; + + default: + WARN(1, "undefined WQE type %d\n", tx_type); + rv = -EINVAL; + } + wqe->wr_status = SIW_WR_IDLE; + } +done: + return rv; +} + +static void siw_sq_resume(struct siw_qp *qp) +{ + if (down_read_trylock(&qp->state_lock)) { + if (likely(qp->attrs.state == SIW_QP_STATE_RTS && + !qp->tx_ctx.tx_suspend)) { + int rv = siw_qp_sq_process(qp); + + up_read(&qp->state_lock); + + if (unlikely(rv < 0)) { + siw_dbg_qp(qp, "SQ task failed: err %d\n", rv); + + if (!qp->tx_ctx.tx_suspend) + siw_qp_cm_drop(qp, 0); + } + } else { + up_read(&qp->state_lock); + } + } else { + siw_dbg_qp(qp, "Resume SQ while QP locked\n"); + } + siw_qp_put(qp); +} + +struct tx_task_t { + struct llist_head active; + wait_queue_head_t waiting; +}; + +static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g); + +void siw_stop_tx_thread(int nr_cpu) +{ + kthread_stop(siw_tx_thread[nr_cpu]); + wake_up(&per_cpu(siw_tx_task_g, nr_cpu).waiting); +} + +int siw_run_sq(void *data) +{ + const int nr_cpu = (unsigned int)(long)data; + struct llist_node *active; + struct siw_qp *qp; + struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu); + + init_llist_head(&tx_task->active); + init_waitqueue_head(&tx_task->waiting); + + while (1) { + struct llist_node *fifo_list = NULL; + + wait_event_interruptible(tx_task->waiting, + !llist_empty(&tx_task->active) || + kthread_should_stop()); + + if (kthread_should_stop()) + break; + + active = llist_del_all(&tx_task->active); + /* + * llist_del_all returns a list with newest entry first. + * Re-order list for fairness among QP's. + */ + while (active) { + struct llist_node *tmp = active; + + active = llist_next(active); + tmp->next = fifo_list; + fifo_list = tmp; + } + while (fifo_list) { + qp = container_of(fifo_list, struct siw_qp, tx_list); + fifo_list = llist_next(fifo_list); + qp->tx_list.next = NULL; + + siw_sq_resume(qp); + } + } + active = llist_del_all(&tx_task->active); + if (active) { + llist_for_each_entry(qp, active, tx_list) { + qp->tx_list.next = NULL; + siw_sq_resume(qp); + } + } + return 0; +} + +int siw_sq_start(struct siw_qp *qp) +{ + if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) + return 0; + + if (unlikely(!cpu_online(qp->tx_cpu))) { + siw_put_tx_cpu(qp->tx_cpu); + qp->tx_cpu = siw_get_tx_cpu(qp->sdev); + if (qp->tx_cpu < 0) { + pr_warn("siw: no tx cpu available\n"); + + return -EIO; + } + } + siw_qp_get(qp); + + llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active); + + wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting); + + return 0; +} diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c new file mode 100644 index 000000000000..32dc79d0e898 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -0,0 +1,1760 @@ +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/xarray.h> + +#include <rdma/iw_cm.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/uverbs_ioctl.h> + +#include "siw.h" +#include "siw_verbs.h" +#include "siw_mem.h" + +static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = SIW_QP_STATE_IDLE, + [IB_QPS_INIT] = SIW_QP_STATE_IDLE, + [IB_QPS_RTR] = SIW_QP_STATE_RTR, + [IB_QPS_RTS] = SIW_QP_STATE_RTS, + [IB_QPS_SQD] = SIW_QP_STATE_CLOSING, + [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE, + [IB_QPS_ERR] = SIW_QP_STATE_ERROR +}; + +static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = { + [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR", + [IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE", + [IB_QPS_ERR] = "ERR" +}; + +static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size) +{ + struct siw_uobj *uobj; + struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY); + u32 key; + + uobj = kzalloc(sizeof(*uobj), GFP_KERNEL); + if (!uobj) + return SIW_INVAL_UOBJ_KEY; + + if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey, + GFP_KERNEL) < 0) { + kfree(uobj); + return SIW_INVAL_UOBJ_KEY; + } + uobj->size = PAGE_ALIGN(size); + uobj->addr = vaddr; + + return key; +} + +static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx, + unsigned long off, u32 size) +{ + struct siw_uobj *uobj = xa_load(&uctx->xa, off); + + if (uobj && uobj->size == size) + return uobj; + + return NULL; +} + +int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) +{ + struct siw_ucontext *uctx = to_siw_ctx(ctx); + struct siw_uobj *uobj; + unsigned long off = vma->vm_pgoff; + int size = vma->vm_end - vma->vm_start; + int rv = -EINVAL; + + /* + * Must be page aligned + */ + if (vma->vm_start & (PAGE_SIZE - 1)) { + pr_warn("siw: mmap not page aligned\n"); + goto out; + } + uobj = siw_get_uobj(uctx, off, size); + if (!uobj) { + siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n", + off, size); + goto out; + } + rv = remap_vmalloc_range(vma, uobj->addr, 0); + if (rv) + pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size); +out: + return rv; +} + +int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(base_ctx->device); + struct siw_ucontext *ctx = to_siw_ctx(base_ctx); + struct siw_uresp_alloc_ctx uresp = {}; + int rv; + + if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) { + rv = -ENOMEM; + goto err_out; + } + xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC); + ctx->uobj_nextkey = 0; + ctx->sdev = sdev; + + uresp.dev_id = sdev->vendor_part_id; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + + siw_dbg(base_ctx->device, "success. now %d context(s)\n", + atomic_read(&sdev->num_ctx)); + + return 0; + +err_out: + atomic_dec(&sdev->num_ctx); + siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv, + atomic_read(&sdev->num_ctx)); + + return rv; +} + +void siw_dealloc_ucontext(struct ib_ucontext *base_ctx) +{ + struct siw_ucontext *uctx = to_siw_ctx(base_ctx); + void *entry; + unsigned long index; + + /* + * Make sure all user mmap objects are gone. Since QP, CQ + * and SRQ destroy routines destroy related objects, nothing + * should be found here. + */ + xa_for_each(&uctx->xa, index, entry) { + kfree(xa_erase(&uctx->xa, index)); + pr_warn("siw: dropping orphaned uobj at %lu\n", index); + } + xa_destroy(&uctx->xa); + atomic_dec(&uctx->sdev->num_ctx); +} + +int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, + struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + if (udata->inlen || udata->outlen) + return -EINVAL; + + memset(attr, 0, sizeof(*attr)); + + /* Revisit atomic caps if RFC 7306 gets supported */ + attr->atomic_cap = 0; + attr->device_cap_flags = + IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG; + attr->max_cq = sdev->attrs.max_cq; + attr->max_cqe = sdev->attrs.max_cqe; + attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL; + attr->max_fmr = sdev->attrs.max_fmr; + attr->max_mr = sdev->attrs.max_mr; + attr->max_mw = sdev->attrs.max_mw; + attr->max_mr_size = ~0ull; + attr->max_pd = sdev->attrs.max_pd; + attr->max_qp = sdev->attrs.max_qp; + attr->max_qp_init_rd_atom = sdev->attrs.max_ird; + attr->max_qp_rd_atom = sdev->attrs.max_ord; + attr->max_qp_wr = sdev->attrs.max_qp_wr; + attr->max_recv_sge = sdev->attrs.max_sge; + attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird; + attr->max_send_sge = sdev->attrs.max_sge; + attr->max_sge_rd = sdev->attrs.max_sge_rd; + attr->max_srq = sdev->attrs.max_srq; + attr->max_srq_sge = sdev->attrs.max_srq_sge; + attr->max_srq_wr = sdev->attrs.max_srq_wr; + attr->page_size_cap = PAGE_SIZE; + attr->vendor_id = SIW_VENDOR_ID; + attr->vendor_part_id = sdev->vendor_part_id; + + memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6); + + return 0; +} + +int siw_query_port(struct ib_device *base_dev, u8 port, + struct ib_port_attr *attr) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + memset(attr, 0, sizeof(*attr)); + + attr->active_mtu = attr->max_mtu; + attr->active_speed = 2; + attr->active_width = 2; + attr->gid_tbl_len = 1; + attr->max_msg_sz = -1; + attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); + attr->phys_state = sdev->state == IB_PORT_ACTIVE ? 5 : 3; + attr->pkey_tbl_len = 1; + attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; + attr->state = sdev->state; + /* + * All zero + * + * attr->lid = 0; + * attr->bad_pkey_cntr = 0; + * attr->qkey_viol_cntr = 0; + * attr->sm_lid = 0; + * attr->lmc = 0; + * attr->max_vl_num = 0; + * attr->sm_sl = 0; + * attr->subnet_timeout = 0; + * attr->init_type_repy = 0; + */ + return 0; +} + +int siw_get_port_immutable(struct ib_device *base_dev, u8 port, + struct ib_port_immutable *port_immutable) +{ + struct ib_port_attr attr; + int rv = siw_query_port(base_dev, port, &attr); + + if (rv) + return rv; + + port_immutable->pkey_tbl_len = attr.pkey_tbl_len; + port_immutable->gid_tbl_len = attr.gid_tbl_len; + port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + return 0; +} + +int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey) +{ + /* Report the default pkey */ + *pkey = 0xffff; + return 0; +} + +int siw_query_gid(struct ib_device *base_dev, u8 port, int idx, + union ib_gid *gid) +{ + struct siw_device *sdev = to_siw_dev(base_dev); + + /* subnet_prefix == interface_id == 0; */ + memset(gid, 0, sizeof(*gid)); + memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6); + + return 0; +} + +int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + + if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) { + atomic_dec(&sdev->num_pd); + return -ENOMEM; + } + siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd)); + + return 0; +} + +void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + + siw_dbg_pd(pd, "free PD\n"); + atomic_dec(&sdev->num_pd); +} + +void siw_qp_get_ref(struct ib_qp *base_qp) +{ + siw_qp_get(to_siw_qp(base_qp)); +} + +void siw_qp_put_ref(struct ib_qp *base_qp) +{ + siw_qp_put(to_siw_qp(base_qp)); +} + +/* + * siw_create_qp() + * + * Create QP of requested size on given device. + * + * @pd: Protection Domain + * @attrs: Initial QP attributes. + * @udata: used to provide QP ID, SQ and RQ size back to user. + */ + +struct ib_qp *siw_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct siw_qp *qp = NULL; + struct siw_base_qp *siw_base_qp = NULL; + struct ib_device *base_dev = pd->device; + struct siw_device *sdev = to_siw_dev(base_dev); + struct siw_ucontext *uctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + struct siw_cq *scq = NULL, *rcq = NULL; + unsigned long flags; + int num_sqe, num_rqe, rv = 0; + + siw_dbg(base_dev, "create new QP\n"); + + if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) { + siw_dbg(base_dev, "too many QP's\n"); + rv = -ENOMEM; + goto err_out; + } + if (attrs->qp_type != IB_QPT_RC) { + siw_dbg(base_dev, "only RC QP's supported\n"); + rv = -EINVAL; + goto err_out; + } + if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) || + (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) || + (attrs->cap.max_send_sge > SIW_MAX_SGE) || + (attrs->cap.max_recv_sge > SIW_MAX_SGE)) { + siw_dbg(base_dev, "QP size error\n"); + rv = -EINVAL; + goto err_out; + } + if (attrs->cap.max_inline_data > SIW_MAX_INLINE) { + siw_dbg(base_dev, "max inline send: %d > %d\n", + attrs->cap.max_inline_data, (int)SIW_MAX_INLINE); + rv = -EINVAL; + goto err_out; + } + /* + * NOTE: we allow for zero element SQ and RQ WQE's SGL's + * but not for a QP unable to hold any WQE (SQ + RQ) + */ + if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) { + siw_dbg(base_dev, "QP must have send or receive queue\n"); + rv = -EINVAL; + goto err_out; + } + scq = to_siw_cq(attrs->send_cq); + rcq = to_siw_cq(attrs->recv_cq); + + if (!scq || (!rcq && !attrs->srq)) { + siw_dbg(base_dev, "send CQ or receive CQ invalid\n"); + rv = -EINVAL; + goto err_out; + } + siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL); + if (!siw_base_qp) { + rv = -ENOMEM; + goto err_out; + } + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) { + rv = -ENOMEM; + goto err_out; + } + siw_base_qp->qp = qp; + qp->ib_qp = &siw_base_qp->base_qp; + + init_rwsem(&qp->state_lock); + spin_lock_init(&qp->sq_lock); + spin_lock_init(&qp->rq_lock); + spin_lock_init(&qp->orq_lock); + + qp->kernel_verbs = !udata; + qp->xa_sq_index = SIW_INVAL_UOBJ_KEY; + qp->xa_rq_index = SIW_INVAL_UOBJ_KEY; + + rv = siw_qp_add(sdev, qp); + if (rv) + goto err_out; + + /* All queue indices are derived from modulo operations + * on a free running 'get' (consumer) and 'put' (producer) + * unsigned counter. Having queue sizes at power of two + * avoids handling counter wrap around. + */ + num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr); + num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr); + + if (qp->kernel_verbs) + qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe)); + else + qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe)); + + if (qp->sendq == NULL) { + siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe); + rv = -ENOMEM; + goto err_out_xa; + } + if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) { + if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) + qp->attrs.flags |= SIW_SIGNAL_ALL_WR; + else { + rv = -EINVAL; + goto err_out_xa; + } + } + qp->pd = pd; + qp->scq = scq; + qp->rcq = rcq; + + if (attrs->srq) { + /* + * SRQ support. + * Verbs 6.3.7: ignore RQ size, if SRQ present + * Verbs 6.3.5: do not check PD of SRQ against PD of QP + */ + qp->srq = to_siw_srq(attrs->srq); + qp->attrs.rq_size = 0; + siw_dbg(base_dev, "QP [%u]: [SRQ 0x%p] attached\n", + qp->qp_num, qp->srq); + } else if (num_rqe) { + if (qp->kernel_verbs) + qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe)); + else + qp->recvq = + vmalloc_user(num_rqe * sizeof(struct siw_rqe)); + + if (qp->recvq == NULL) { + siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe); + rv = -ENOMEM; + goto err_out_xa; + } + qp->attrs.rq_size = num_rqe; + } + qp->attrs.sq_size = num_sqe; + qp->attrs.sq_max_sges = attrs->cap.max_send_sge; + qp->attrs.rq_max_sges = attrs->cap.max_recv_sge; + + /* Make those two tunables fixed for now. */ + qp->tx_ctx.gso_seg_limit = 1; + qp->tx_ctx.zcopy_tx = zcopy_tx; + + qp->attrs.state = SIW_QP_STATE_IDLE; + + if (udata) { + struct siw_uresp_create_qp uresp = {}; + + uresp.num_sqe = num_sqe; + uresp.num_rqe = num_rqe; + uresp.qp_id = qp_id(qp); + + if (qp->sendq) { + qp->xa_sq_index = + siw_create_uobj(uctx, qp->sendq, + num_sqe * sizeof(struct siw_sqe)); + } + if (qp->recvq) { + qp->xa_rq_index = + siw_create_uobj(uctx, qp->recvq, + num_rqe * sizeof(struct siw_rqe)); + } + if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY || + qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) { + rv = -ENOMEM; + goto err_out_xa; + } + uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT; + uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out_xa; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out_xa; + } + qp->tx_cpu = siw_get_tx_cpu(sdev); + if (qp->tx_cpu < 0) { + rv = -EINVAL; + goto err_out_xa; + } + INIT_LIST_HEAD(&qp->devq); + spin_lock_irqsave(&sdev->lock, flags); + list_add_tail(&qp->devq, &sdev->qp_list); + spin_unlock_irqrestore(&sdev->lock, flags); + + return qp->ib_qp; + +err_out_xa: + xa_erase(&sdev->qp_xa, qp_id(qp)); +err_out: + kfree(siw_base_qp); + + if (qp) { + if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_sq_index)); + if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_rq_index)); + + vfree(qp->sendq); + vfree(qp->recvq); + kfree(qp); + } + atomic_dec(&sdev->num_qp); + + return ERR_PTR(rv); +} + +/* + * Minimum siw_query_qp() verb interface. + * + * @qp_attr_mask is not used but all available information is provided + */ +int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct siw_qp *qp; + struct siw_device *sdev; + + if (base_qp && qp_attr && qp_init_attr) { + qp = to_siw_qp(base_qp); + sdev = to_siw_dev(base_qp->device); + } else { + return -EINVAL; + } + qp_attr->cap.max_inline_data = SIW_MAX_INLINE; + qp_attr->cap.max_send_wr = qp->attrs.sq_size; + qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; + qp_attr->cap.max_recv_wr = qp->attrs.rq_size; + qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; + qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); + qp_attr->max_rd_atomic = qp->attrs.irq_size; + qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; + + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + + qp_init_attr->qp_type = base_qp->qp_type; + qp_init_attr->send_cq = base_qp->send_cq; + qp_init_attr->recv_cq = base_qp->recv_cq; + qp_init_attr->srq = base_qp->srq; + + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct siw_qp_attrs new_attrs; + enum siw_qp_attr_mask siw_attr_mask = 0; + struct siw_qp *qp = to_siw_qp(base_qp); + int rv = 0; + + if (!attr_mask) + return 0; + + memset(&new_attrs, 0, sizeof(new_attrs)); + + if (attr_mask & IB_QP_ACCESS_FLAGS) { + siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS; + + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) + new_attrs.flags |= SIW_RDMA_READ_ENABLED; + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + new_attrs.flags |= SIW_RDMA_WRITE_ENABLED; + if (attr->qp_access_flags & IB_ACCESS_MW_BIND) + new_attrs.flags |= SIW_RDMA_BIND_ENABLED; + } + if (attr_mask & IB_QP_STATE) { + siw_dbg_qp(qp, "desired IB QP state: %s\n", + ib_qp_state_to_string[attr->qp_state]); + + new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state]; + + if (new_attrs.state > SIW_QP_STATE_RTS) + qp->tx_ctx.tx_suspend = 1; + + siw_attr_mask |= SIW_QP_ATTR_STATE; + } + if (!siw_attr_mask) + goto out; + + down_write(&qp->state_lock); + + rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask); + + up_write(&qp->state_lock); +out: + return rv; +} + +int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + struct siw_base_qp *siw_base_qp = to_siw_base_qp(base_qp); + struct siw_ucontext *uctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + struct siw_qp_attrs qp_attrs; + + siw_dbg_qp(qp, "state %d, cep 0x%p\n", qp->attrs.state, qp->cep); + + /* + * Mark QP as in process of destruction to prevent from + * any async callbacks to RDMA core + */ + qp->attrs.flags |= SIW_QP_IN_DESTROY; + qp->rx_stream.rx_suspend = 1; + + if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_sq_index)); + if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&uctx->xa, qp->xa_rq_index)); + + down_write(&qp->state_lock); + + qp_attrs.state = SIW_QP_STATE_ERROR; + siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE); + + if (qp->cep) { + siw_cep_put(qp->cep); + qp->cep = NULL; + } + up_write(&qp->state_lock); + + kfree(qp->tx_ctx.mpa_crc_hd); + kfree(qp->rx_stream.mpa_crc_hd); + + qp->scq = qp->rcq = NULL; + + siw_qp_put(qp); + kfree(siw_base_qp); + + return 0; +} + +/* + * siw_copy_inline_sgl() + * + * Prepare sgl of inlined data for sending. For userland callers + * function checks if given buffer addresses and len's are within + * process context bounds. + * Data from all provided sge's are copied together into the wqe, + * referenced by a single sge. + */ +static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, + struct siw_sqe *sqe) +{ + struct ib_sge *core_sge = core_wr->sg_list; + void *kbuf = &sqe->sge[1]; + int num_sge = core_wr->num_sge, bytes = 0; + + sqe->sge[0].laddr = (u64)kbuf; + sqe->sge[0].lkey = 0; + + while (num_sge--) { + if (!core_sge->length) { + core_sge++; + continue; + } + bytes += core_sge->length; + if (bytes > SIW_MAX_INLINE) { + bytes = -EINVAL; + break; + } + memcpy(kbuf, (void *)(uintptr_t)core_sge->addr, + core_sge->length); + + kbuf += core_sge->length; + core_sge++; + } + sqe->sge[0].length = bytes > 0 ? bytes : 0; + sqe->num_sge = bytes > 0 ? 1 : 0; + + return bytes; +} + +/* + * siw_post_send() + * + * Post a list of S-WR's to a SQ. + * + * @base_qp: Base QP contained in siw QP + * @wr: Null terminated list of user WR's + * @bad_wr: Points to failing WR in case of synchronous failure. + */ +int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + struct siw_wqe *wqe = tx_wqe(qp); + + unsigned long flags; + int rv = 0; + + /* + * Try to acquire QP state lock. Must be non-blocking + * to accommodate kernel clients needs. + */ + if (!down_read_trylock(&qp->state_lock)) { + *bad_wr = wr; + siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state); + return -ENOTCONN; + } + if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { + up_read(&qp->state_lock); + *bad_wr = wr; + siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state); + return -ENOTCONN; + } + if (wr && !qp->kernel_verbs) { + siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); + up_read(&qp->state_lock); + *bad_wr = wr; + return -EINVAL; + } + spin_lock_irqsave(&qp->sq_lock, flags); + + while (wr) { + u32 idx = qp->sq_put % qp->attrs.sq_size; + struct siw_sqe *sqe = &qp->sendq[idx]; + + if (sqe->flags) { + siw_dbg_qp(qp, "sq full\n"); + rv = -ENOMEM; + break; + } + if (wr->num_sge > qp->attrs.sq_max_sges) { + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); + rv = -EINVAL; + break; + } + sqe->id = wr->wr_id; + + if ((wr->send_flags & IB_SEND_SIGNALED) || + (qp->attrs.flags & SIW_SIGNAL_ALL_WR)) + sqe->flags |= SIW_WQE_SIGNALLED; + + if (wr->send_flags & IB_SEND_FENCE) + sqe->flags |= SIW_WQE_READ_FENCE; + + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + if (wr->send_flags & IB_SEND_SOLICITED) + sqe->flags |= SIW_WQE_SOLICITED; + + if (!(wr->send_flags & IB_SEND_INLINE)) { + siw_copy_sgl(wr->sg_list, sqe->sge, + wr->num_sge); + sqe->num_sge = wr->num_sge; + } else { + rv = siw_copy_inline_sgl(wr, sqe); + if (rv <= 0) { + rv = -EINVAL; + break; + } + sqe->flags |= SIW_WQE_INLINE; + sqe->num_sge = 1; + } + if (wr->opcode == IB_WR_SEND) + sqe->opcode = SIW_OP_SEND; + else { + sqe->opcode = SIW_OP_SEND_REMOTE_INV; + sqe->rkey = wr->ex.invalidate_rkey; + } + break; + + case IB_WR_RDMA_READ_WITH_INV: + case IB_WR_RDMA_READ: + /* + * iWarp restricts RREAD sink to SGL containing + * 1 SGE only. we could relax to SGL with multiple + * elements referring the SAME ltag or even sending + * a private per-rreq tag referring to a checked + * local sgl with MULTIPLE ltag's. + */ + if (unlikely(wr->num_sge != 1)) { + rv = -EINVAL; + break; + } + siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1); + /* + * NOTE: zero length RREAD is allowed! + */ + sqe->raddr = rdma_wr(wr)->remote_addr; + sqe->rkey = rdma_wr(wr)->rkey; + sqe->num_sge = 1; + + if (wr->opcode == IB_WR_RDMA_READ) + sqe->opcode = SIW_OP_READ; + else + sqe->opcode = SIW_OP_READ_LOCAL_INV; + break; + + case IB_WR_RDMA_WRITE: + if (!(wr->send_flags & IB_SEND_INLINE)) { + siw_copy_sgl(wr->sg_list, &sqe->sge[0], + wr->num_sge); + sqe->num_sge = wr->num_sge; + } else { + rv = siw_copy_inline_sgl(wr, sqe); + if (unlikely(rv < 0)) { + rv = -EINVAL; + break; + } + sqe->flags |= SIW_WQE_INLINE; + sqe->num_sge = 1; + } + sqe->raddr = rdma_wr(wr)->remote_addr; + sqe->rkey = rdma_wr(wr)->rkey; + sqe->opcode = SIW_OP_WRITE; + break; + + case IB_WR_REG_MR: + sqe->base_mr = (uint64_t)reg_wr(wr)->mr; + sqe->rkey = reg_wr(wr)->key; + sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK; + sqe->opcode = SIW_OP_REG_MR; + break; + + case IB_WR_LOCAL_INV: + sqe->rkey = wr->ex.invalidate_rkey; + sqe->opcode = SIW_OP_INVAL_STAG; + break; + + default: + siw_dbg_qp(qp, "ib wr type %d unsupported\n", + wr->opcode); + rv = -EINVAL; + break; + } + siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%p\n", + sqe->opcode, sqe->flags, (void *)sqe->id); + + if (unlikely(rv < 0)) + break; + + /* make SQE only valid after completely written */ + smp_wmb(); + sqe->flags |= SIW_WQE_VALID; + + qp->sq_put++; + wr = wr->next; + } + + /* + * Send directly if SQ processing is not in progress. + * Eventual immediate errors (rv < 0) do not affect the involved + * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ + * processing, if new work is already pending. But rv must be passed + * to caller. + */ + if (wqe->wr_status != SIW_WR_IDLE) { + spin_unlock_irqrestore(&qp->sq_lock, flags); + goto skip_direct_sending; + } + rv = siw_activate_tx(qp); + spin_unlock_irqrestore(&qp->sq_lock, flags); + + if (rv <= 0) + goto skip_direct_sending; + + if (qp->kernel_verbs) { + rv = siw_sq_start(qp); + } else { + qp->tx_ctx.in_syscall = 1; + + if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend)) + siw_qp_cm_drop(qp, 0); + + qp->tx_ctx.in_syscall = 0; + } +skip_direct_sending: + + up_read(&qp->state_lock); + + if (rv >= 0) + return 0; + /* + * Immediate error + */ + siw_dbg_qp(qp, "error %d\n", rv); + + *bad_wr = wr; + return rv; +} + +/* + * siw_post_receive() + * + * Post a list of R-WR's to a RQ. + * + * @base_qp: Base QP contained in siw QP + * @wr: Null terminated list of user WR's + * @bad_wr: Points to failing WR in case of synchronous failure. + */ +int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct siw_qp *qp = to_siw_qp(base_qp); + unsigned long flags; + int rv = 0; + + if (qp->srq) { + *bad_wr = wr; + return -EOPNOTSUPP; /* what else from errno.h? */ + } + /* + * Try to acquire QP state lock. Must be non-blocking + * to accommodate kernel clients needs. + */ + if (!down_read_trylock(&qp->state_lock)) { + *bad_wr = wr; + return -ENOTCONN; + } + if (!qp->kernel_verbs) { + siw_dbg_qp(qp, "no kernel post_recv for user mapped sq\n"); + up_read(&qp->state_lock); + *bad_wr = wr; + return -EINVAL; + } + if (qp->attrs.state > SIW_QP_STATE_RTS) { + up_read(&qp->state_lock); + *bad_wr = wr; + return -EINVAL; + } + /* + * Serialize potentially multiple producers. + * Not needed for single threaded consumer side. + */ + spin_lock_irqsave(&qp->rq_lock, flags); + + while (wr) { + u32 idx = qp->rq_put % qp->attrs.rq_size; + struct siw_rqe *rqe = &qp->recvq[idx]; + + if (rqe->flags) { + siw_dbg_qp(qp, "RQ full\n"); + rv = -ENOMEM; + break; + } + if (wr->num_sge > qp->attrs.rq_max_sges) { + siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); + rv = -EINVAL; + break; + } + rqe->id = wr->wr_id; + rqe->num_sge = wr->num_sge; + siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); + + /* make sure RQE is completely written before valid */ + smp_wmb(); + + rqe->flags = SIW_WQE_VALID; + + qp->rq_put++; + wr = wr->next; + } + spin_unlock_irqrestore(&qp->rq_lock, flags); + + up_read(&qp->state_lock); + + if (rv < 0) { + siw_dbg_qp(qp, "error %d\n", rv); + *bad_wr = wr; + } + return rv > 0 ? 0 : rv; +} + +void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata) +{ + struct siw_cq *cq = to_siw_cq(base_cq); + struct siw_device *sdev = to_siw_dev(base_cq->device); + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + + siw_dbg_cq(cq, "free CQ resources\n"); + + siw_cq_flush(cq); + + if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, cq->xa_cq_index)); + + atomic_dec(&sdev->num_cq); + + vfree(cq->queue); +} + +/* + * siw_create_cq() + * + * Populate CQ of requested size + * + * @base_cq: CQ as allocated by RDMA midlayer + * @attr: Initial CQ attributes + * @udata: relates to user context + */ + +int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(base_cq->device); + struct siw_cq *cq = to_siw_cq(base_cq); + int rv, size = attr->cqe; + + if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) { + siw_dbg(base_cq->device, "too many CQ's\n"); + rv = -ENOMEM; + goto err_out; + } + if (size < 1 || size > sdev->attrs.max_cqe) { + siw_dbg(base_cq->device, "CQ size error: %d\n", size); + rv = -EINVAL; + goto err_out; + } + size = roundup_pow_of_two(size); + cq->base_cq.cqe = size; + cq->num_cqe = size; + cq->xa_cq_index = SIW_INVAL_UOBJ_KEY; + + if (!udata) { + cq->kernel_verbs = 1; + cq->queue = vzalloc(size * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + } else { + cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + } + if (cq->queue == NULL) { + rv = -ENOMEM; + goto err_out; + } + get_random_bytes(&cq->id, 4); + siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id); + + spin_lock_init(&cq->lock); + + cq->notify = &((struct siw_cq_ctrl *)&cq->queue[size])->notify; + + if (udata) { + struct siw_uresp_create_cq uresp = {}; + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + + cq->xa_cq_index = + siw_create_uobj(ctx, cq->queue, + size * sizeof(struct siw_cqe) + + sizeof(struct siw_cq_ctrl)); + if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) { + rv = -ENOMEM; + goto err_out; + } + uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT; + uresp.cq_id = cq->id; + uresp.num_cqe = size; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + } + return 0; + +err_out: + siw_dbg(base_cq->device, "CQ creation failed: %d", rv); + + if (cq && cq->queue) { + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, cq->xa_cq_index)); + vfree(cq->queue); + } + atomic_dec(&sdev->num_cq); + + return rv; +} + +/* + * siw_poll_cq() + * + * Reap CQ entries if available and copy work completion status into + * array of WC's provided by caller. Returns number of reaped CQE's. + * + * @base_cq: Base CQ contained in siw CQ. + * @num_cqe: Maximum number of CQE's to reap. + * @wc: Array of work completions to be filled by siw. + */ +int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc) +{ + struct siw_cq *cq = to_siw_cq(base_cq); + int i; + + for (i = 0; i < num_cqe; i++) { + if (!siw_reap_cqe(cq, wc)) + break; + wc++; + } + return i; +} + +/* + * siw_req_notify_cq() + * + * Request notification for new CQE's added to that CQ. + * Defined flags: + * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification + * event if a WQE with notification flag set enters the CQ + * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification + * event if a WQE enters the CQ. + * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the + * number of not reaped CQE's regardless of its notification + * type and current or new CQ notification settings. + * + * @base_cq: Base CQ contained in siw CQ. + * @flags: Requested notification flags. + */ +int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags) +{ + struct siw_cq *cq = to_siw_cq(base_cq); + + siw_dbg_cq(cq, "flags: 0x%02x\n", flags); + + if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) + /* CQ event for next solicited completion */ + smp_store_mb(*cq->notify, SIW_NOTIFY_SOLICITED); + else + /* CQ event for any signalled completion */ + smp_store_mb(*cq->notify, SIW_NOTIFY_ALL); + + if (flags & IB_CQ_REPORT_MISSED_EVENTS) + return cq->cq_put - cq->cq_get; + + return 0; +} + +/* + * siw_dereg_mr() + * + * Release Memory Region. + * + * @base_mr: Base MR contained in siw MR. + * @udata: points to user context, unused. + */ +int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata) +{ + struct siw_mr *mr = to_siw_mr(base_mr); + struct siw_device *sdev = to_siw_dev(base_mr->device); + + siw_dbg_mem(mr->mem, "deregister MR\n"); + + atomic_dec(&sdev->num_mr); + + siw_mr_drop_mem(mr); + kfree_rcu(mr, rcu); + + return 0; +} + +/* + * siw_reg_user_mr() + * + * Register Memory Region. + * + * @pd: Protection Domain + * @start: starting address of MR (virtual address) + * @len: len of MR + * @rnic_va: not used by siw + * @rights: MR access rights + * @udata: user buffer to communicate STag and Key. + */ +struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, + u64 rnic_va, int rights, struct ib_udata *udata) +{ + struct siw_mr *mr = NULL; + struct siw_umem *umem = NULL; + struct siw_ureq_reg_mr ureq; + struct siw_device *sdev = to_siw_dev(pd->device); + + unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK); + int rv; + + siw_dbg_pd(pd, "start: 0x%016llx, va: 0x%016llx, len: %llu\n", + (unsigned long long)start, (unsigned long long)rnic_va, + (unsigned long long)len); + + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { + siw_dbg_pd(pd, "too many mr's\n"); + rv = -ENOMEM; + goto err_out; + } + if (!len) { + rv = -EINVAL; + goto err_out; + } + if (mem_limit != RLIM_INFINITY) { + unsigned long num_pages = + (PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT; + mem_limit >>= PAGE_SHIFT; + + if (num_pages > mem_limit - current->mm->locked_vm) { + siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n", + num_pages, mem_limit, + current->mm->locked_vm); + rv = -ENOMEM; + goto err_out; + } + } + umem = siw_umem_get(start, len, ib_access_writable(rights)); + if (IS_ERR(umem)) { + rv = PTR_ERR(umem); + siw_dbg_pd(pd, "getting user memory failed: %d\n", rv); + umem = NULL; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rv = -ENOMEM; + goto err_out; + } + rv = siw_mr_add_mem(mr, pd, umem, start, len, rights); + if (rv) + goto err_out; + + if (udata) { + struct siw_uresp_reg_mr uresp = {}; + struct siw_mem *mem = mr->mem; + + if (udata->inlen < sizeof(ureq)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); + if (rv) + goto err_out; + + mr->base_mr.lkey |= ureq.stag_key; + mr->base_mr.rkey |= ureq.stag_key; + mem->stag |= ureq.stag_key; + uresp.stag = mem->stag; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + } + mr->mem->stag_valid = 1; + + return &mr->base_mr; + +err_out: + atomic_dec(&sdev->num_mr); + if (mr) { + if (mr->mem) + siw_mr_drop_mem(mr); + kfree_rcu(mr, rcu); + } else { + if (umem) + siw_umem_release(umem, false); + } + return ERR_PTR(rv); +} + +struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_sge, struct ib_udata *udata) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mr *mr = NULL; + struct siw_pbl *pbl = NULL; + int rv; + + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { + siw_dbg_pd(pd, "too many mr's\n"); + rv = -ENOMEM; + goto err_out; + } + if (mr_type != IB_MR_TYPE_MEM_REG) { + siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type); + rv = -EOPNOTSUPP; + goto err_out; + } + if (max_sge > SIW_MAX_SGE_PBL) { + siw_dbg_pd(pd, "too many sge's: %d\n", max_sge); + rv = -ENOMEM; + goto err_out; + } + pbl = siw_pbl_alloc(max_sge); + if (IS_ERR(pbl)) { + rv = PTR_ERR(pbl); + siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv); + pbl = NULL; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rv = -ENOMEM; + goto err_out; + } + rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0); + if (rv) + goto err_out; + + mr->mem->is_pbl = 1; + + siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); + + return &mr->base_mr; + +err_out: + atomic_dec(&sdev->num_mr); + + if (!mr) { + kfree(pbl); + } else { + if (mr->mem) + siw_mr_drop_mem(mr); + kfree_rcu(mr, rcu); + } + siw_dbg_pd(pd, "failed: %d\n", rv); + + return ERR_PTR(rv); +} + +/* Just used to count number of pages being mapped */ +static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr) +{ + return 0; +} + +int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, + unsigned int *sg_off) +{ + struct scatterlist *slp; + struct siw_mr *mr = to_siw_mr(base_mr); + struct siw_mem *mem = mr->mem; + struct siw_pbl *pbl = mem->pbl; + struct siw_pble *pble; + u64 pbl_size; + int i, rv; + + if (!pbl) { + siw_dbg_mem(mem, "no PBL allocated\n"); + return -EINVAL; + } + pble = pbl->pbe; + + if (pbl->max_buf < num_sle) { + siw_dbg_mem(mem, "too many SGE's: %d > %d\n", + mem->pbl->max_buf, num_sle); + return -ENOMEM; + } + for_each_sg(sl, slp, num_sle, i) { + if (sg_dma_len(slp) == 0) { + siw_dbg_mem(mem, "empty SGE\n"); + return -EINVAL; + } + if (i == 0) { + pble->addr = sg_dma_address(slp); + pble->size = sg_dma_len(slp); + pble->pbl_off = 0; + pbl_size = pble->size; + pbl->num_buf = 1; + } else { + /* Merge PBL entries if adjacent */ + if (pble->addr + pble->size == sg_dma_address(slp)) { + pble->size += sg_dma_len(slp); + } else { + pble++; + pbl->num_buf++; + pble->addr = sg_dma_address(slp); + pble->size = sg_dma_len(slp); + pble->pbl_off = pbl_size; + } + pbl_size += sg_dma_len(slp); + } + siw_dbg_mem(mem, + "sge[%d], size %llu, addr 0x%016llx, total %llu\n", + i, pble->size, pble->addr, pbl_size); + } + rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page); + if (rv > 0) { + mem->len = base_mr->length; + mem->va = base_mr->iova; + siw_dbg_mem(mem, + "%llu bytes, start 0x%016llx, %u SLE to %u entries\n", + mem->len, mem->va, num_sle, pbl->num_buf); + } + return rv; +} + +/* + * siw_get_dma_mr() + * + * Create a (empty) DMA memory region, where no umem is attached. + */ +struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights) +{ + struct siw_device *sdev = to_siw_dev(pd->device); + struct siw_mr *mr = NULL; + int rv; + + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { + siw_dbg_pd(pd, "too many mr's\n"); + rv = -ENOMEM; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + rv = -ENOMEM; + goto err_out; + } + rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights); + if (rv) + goto err_out; + + mr->mem->stag_valid = 1; + + siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); + + return &mr->base_mr; + +err_out: + if (rv) + kfree(mr); + + atomic_dec(&sdev->num_mr); + + return ERR_PTR(rv); +} + +/* + * siw_create_srq() + * + * Create Shared Receive Queue of attributes @init_attrs + * within protection domain given by @pd. + * + * @base_srq: Base SRQ contained in siw SRQ. + * @init_attrs: SRQ init attributes. + * @udata: points to user context + */ +int siw_create_srq(struct ib_srq *base_srq, + struct ib_srq_init_attr *init_attrs, struct ib_udata *udata) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + struct ib_srq_attr *attrs = &init_attrs->attr; + struct siw_device *sdev = to_siw_dev(base_srq->device); + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + int rv; + + if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) { + siw_dbg_pd(base_srq->pd, "too many SRQ's\n"); + rv = -ENOMEM; + goto err_out; + } + if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR || + attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) { + rv = -EINVAL; + goto err_out; + } + srq->max_sge = attrs->max_sge; + srq->num_rqe = roundup_pow_of_two(attrs->max_wr); + srq->xa_srq_index = SIW_INVAL_UOBJ_KEY; + srq->limit = attrs->srq_limit; + if (srq->limit) + srq->armed = 1; + + srq->kernel_verbs = !udata; + + if (udata) + srq->recvq = + vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe)); + else + srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe)); + + if (srq->recvq == NULL) { + rv = -ENOMEM; + goto err_out; + } + if (udata) { + struct siw_uresp_create_srq uresp = {}; + + srq->xa_srq_index = siw_create_uobj( + ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe)); + + if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) { + rv = -ENOMEM; + goto err_out; + } + uresp.srq_key = srq->xa_srq_index; + uresp.num_rqe = srq->num_rqe; + + if (udata->outlen < sizeof(uresp)) { + rv = -EINVAL; + goto err_out; + } + rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (rv) + goto err_out; + } + spin_lock_init(&srq->lock); + + siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: success\n", srq); + + return 0; + +err_out: + if (srq->recvq) { + if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, srq->xa_srq_index)); + vfree(srq->recvq); + } + atomic_dec(&sdev->num_srq); + + return rv; +} + +/* + * siw_modify_srq() + * + * Modify SRQ. The caller may resize SRQ and/or set/reset notification + * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification. + * + * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE + * parameter. siw_modify_srq() does not check the attrs->max_sge param. + */ +int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs, + enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + unsigned long flags; + int rv = 0; + + spin_lock_irqsave(&srq->lock, flags); + + if (attr_mask & IB_SRQ_MAX_WR) { + /* resize request not yet supported */ + rv = -EOPNOTSUPP; + goto out; + } + if (attr_mask & IB_SRQ_LIMIT) { + if (attrs->srq_limit) { + if (unlikely(attrs->srq_limit > srq->num_rqe)) { + rv = -EINVAL; + goto out; + } + srq->armed = 1; + } else { + srq->armed = 0; + } + srq->limit = attrs->srq_limit; + } +out: + spin_unlock_irqrestore(&srq->lock, flags); + + return rv; +} + +/* + * siw_query_srq() + * + * Query SRQ attributes. + */ +int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + unsigned long flags; + + spin_lock_irqsave(&srq->lock, flags); + + attrs->max_wr = srq->num_rqe; + attrs->max_sge = srq->max_sge; + attrs->srq_limit = srq->limit; + + spin_unlock_irqrestore(&srq->lock, flags); + + return 0; +} + +/* + * siw_destroy_srq() + * + * Destroy SRQ. + * It is assumed that the SRQ is not referenced by any + * QP anymore - the code trusts the RDMA core environment to keep track + * of QP references. + */ +void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + struct siw_device *sdev = to_siw_dev(base_srq->device); + struct siw_ucontext *ctx = + rdma_udata_to_drv_context(udata, struct siw_ucontext, + base_ucontext); + + if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY) + kfree(xa_erase(&ctx->xa, srq->xa_srq_index)); + + vfree(srq->recvq); + atomic_dec(&sdev->num_srq); +} + +/* + * siw_post_srq_recv() + * + * Post a list of receive queue elements to SRQ. + * NOTE: The function does not check or lock a certain SRQ state + * during the post operation. The code simply trusts the + * RDMA core environment. + * + * @base_srq: Base SRQ contained in siw SRQ + * @wr: List of R-WR's + * @bad_wr: Updated to failing WR if posting fails. + */ +int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct siw_srq *srq = to_siw_srq(base_srq); + unsigned long flags; + int rv = 0; + + if (unlikely(!srq->kernel_verbs)) { + siw_dbg_pd(base_srq->pd, + "[SRQ 0x%p]: no kernel post_recv for mapped srq\n", + srq); + rv = -EINVAL; + goto out; + } + /* + * Serialize potentially multiple producers. + * Also needed to serialize potentially multiple + * consumers. + */ + spin_lock_irqsave(&srq->lock, flags); + + while (wr) { + u32 idx = srq->rq_put % srq->num_rqe; + struct siw_rqe *rqe = &srq->recvq[idx]; + + if (rqe->flags) { + siw_dbg_pd(base_srq->pd, "SRQ full\n"); + rv = -ENOMEM; + break; + } + if (unlikely(wr->num_sge > srq->max_sge)) { + siw_dbg_pd(base_srq->pd, + "[SRQ 0x%p]: too many sge's: %d\n", srq, + wr->num_sge); + rv = -EINVAL; + break; + } + rqe->id = wr->wr_id; + rqe->num_sge = wr->num_sge; + siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); + + /* Make sure S-RQE is completely written before valid */ + smp_wmb(); + + rqe->flags = SIW_WQE_VALID; + + srq->rq_put++; + wr = wr->next; + } + spin_unlock_irqrestore(&srq->lock, flags); +out: + if (unlikely(rv < 0)) { + siw_dbg_pd(base_srq->pd, "[SRQ 0x%p]: error %d\n", srq, rv); + *bad_wr = wr; + } + return rv; +} + +void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_qp *base_qp = qp->ib_qp; + + /* + * Do not report asynchronous errors on QP which gets + * destroyed via verbs interface (siw_destroy_qp()) + */ + if (qp->attrs.flags & SIW_QP_IN_DESTROY) + return; + + event.event = etype; + event.device = base_qp->device; + event.element.qp = base_qp; + + if (base_qp->event_handler) { + siw_dbg_qp(qp, "reporting event %d\n", etype); + base_qp->event_handler(&event, base_qp->qp_context); + } +} + +void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_cq *base_cq = &cq->base_cq; + + event.event = etype; + event.device = base_cq->device; + event.element.cq = base_cq; + + if (base_cq->event_handler) { + siw_dbg_cq(cq, "reporting CQ event %d\n", etype); + base_cq->event_handler(&event, base_cq->cq_context); + } +} + +void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype) +{ + struct ib_event event; + struct ib_srq *base_srq = &srq->base_srq; + + event.event = etype; + event.device = base_srq->device; + event.element.srq = base_srq; + + if (base_srq->event_handler) { + siw_dbg_pd(srq->base_srq.pd, + "reporting SRQ event %d\n", etype); + base_srq->event_handler(&event, base_srq->srq_context); + } +} + +void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype) +{ + struct ib_event event; + + event.event = etype; + event.device = &sdev->base_dev; + event.element.port_num = port; + + siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype); + + ib_dispatch_event(&event); +} diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h new file mode 100644 index 000000000000..1910869281cb --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_verbs.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause */ + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#ifndef _SIW_VERBS_H +#define _SIW_VERBS_H + +#include <linux/errno.h> + +#include <rdma/iw_cm.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_user_verbs.h> + +#include "siw.h" +#include "siw_cm.h" + +/* + * siw_copy_sgl() + * + * Copy SGL from RDMA core representation to local + * representation. + */ +static inline void siw_copy_sgl(struct ib_sge *sge, struct siw_sge *siw_sge, + int num_sge) +{ + while (num_sge--) { + siw_sge->laddr = sge->addr; + siw_sge->length = sge->length; + siw_sge->lkey = sge->lkey; + + siw_sge++; + sge++; + } +} + +int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata); +void siw_dealloc_ucontext(struct ib_ucontext *base_ctx); +int siw_query_port(struct ib_device *base_dev, u8 port, + struct ib_port_attr *attr); +int siw_get_port_immutable(struct ib_device *base_dev, u8 port, + struct ib_port_immutable *port_immutable); +int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, + struct ib_udata *udata); +int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); +int siw_query_port(struct ib_device *base_dev, u8 port, + struct ib_port_attr *attr); +int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey); +int siw_query_gid(struct ib_device *base_dev, u8 port, int idx, + union ib_gid *gid); +int siw_alloc_pd(struct ib_pd *base_pd, struct ib_udata *udata); +void siw_dealloc_pd(struct ib_pd *base_pd, struct ib_udata *udata); +struct ib_qp *siw_create_qp(struct ib_pd *base_pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata); +int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); +int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); +int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata); +int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr); +int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata); +int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc); +int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags); +struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len, + u64 rnic_va, int rights, struct ib_udata *udata); +struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type, + u32 max_sge, struct ib_udata *udata); +struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights); +int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, + unsigned int *sg_off); +int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata); +int siw_create_srq(struct ib_srq *base_srq, struct ib_srq_init_attr *attr, + struct ib_udata *udata); +int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr, + enum ib_srq_attr_mask mask, struct ib_udata *udata); +int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attr); +void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata); +int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr); +int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma); +void siw_qp_event(struct siw_qp *qp, enum ib_event_type type); +void siw_cq_event(struct siw_cq *cq, enum ib_event_type type); +void siw_srq_event(struct siw_srq *srq, enum ib_event_type type); +void siw_port_event(struct siw_device *dev, u8 port, enum ib_event_type type); + +#endif |