From f8181697fd09bb3b6d857cef31d9af93ea2b0758 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 1 Jul 2016 16:00:49 -0700 Subject: IB/hfi1: Clean up port state structure definition The definition of port state changed mid development and the old structure was kept accidentally. Remove this dead code. Reviewed-by: Dennis Dalessandro Signed-off-by: Ira Weiny Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/opa_port_info.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include') diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h index 2b95c2c336eb..9303e0e4f508 100644 --- a/include/rdma/opa_port_info.h +++ b/include/rdma/opa_port_info.h @@ -33,11 +33,6 @@ #if !defined(OPA_PORT_INFO_H) #define OPA_PORT_INFO_H -/* Temporary until HFI driver is updated */ -#ifndef USE_PI_LED_ENABLE -#define USE_PI_LED_ENABLE 0 -#endif - #define OPA_PORT_LINK_MODE_NOP 0 /* No change */ #define OPA_PORT_LINK_MODE_OPA 4 /* Port mode is OPA */ @@ -274,23 +269,12 @@ enum port_info_field_masks { OPA_PI_MASK_MTU_CAP = 0x0F, }; -#if USE_PI_LED_ENABLE struct opa_port_states { u8 reserved; u8 ledenable_offlinereason; /* 1 res, 1 bit, 6 bits */ u8 reserved2; u8 portphysstate_portstate; /* 4 bits, 4 bits */ }; -#define PI_LED_ENABLE_SUP 1 -#else -struct opa_port_states { - u8 reserved; - u8 offline_reason; /* 2 res, 6 bits */ - u8 reserved2; - u8 portphysstate_portstate; /* 4 bits, 4 bits */ -}; -#define PI_LED_ENABLE_SUP 0 -#endif struct opa_port_state_info { struct opa_port_states port_states; -- cgit v1.2.3 From 632bc3f65081dd1e2e5394a9161580a0f78e8839 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 21 Jul 2016 13:03:30 -0700 Subject: IB/core, RDMA RW API: Do not exceed QP SGE send limit Compute the SGE limit for RDMA READ and WRITE requests in ib_create_qp(). Use that limit in the RDMA RW API implementation. Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: Steve Wise Cc: Parav Pandit Cc: Nicholas Bellinger Cc: Laurence Oberman Cc: #v4.7+ Reviewed-by: Christoph Hellwig Signed-off-by: Doug Ledford --- drivers/infiniband/core/rw.c | 10 ++-------- drivers/infiniband/core/verbs.c | 9 +++++++++ include/rdma/ib_verbs.h | 6 ++++++ 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c index 1ad2baaa6c8c..dbfd854c32c9 100644 --- a/drivers/infiniband/core/rw.c +++ b/drivers/infiniband/core/rw.c @@ -58,13 +58,6 @@ static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num, return false; } -static inline u32 rdma_rw_max_sge(struct ib_device *dev, - enum dma_data_direction dir) -{ - return dir == DMA_TO_DEVICE ? - dev->attrs.max_sge : dev->attrs.max_sge_rd; -} - static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) { /* arbitrary limit to avoid allocating gigantic resources */ @@ -186,7 +179,8 @@ static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u64 remote_addr, u32 rkey, enum dma_data_direction dir) { struct ib_device *dev = qp->pd->device; - u32 max_sge = rdma_rw_max_sge(dev, dir); + u32 max_sge = dir == DMA_TO_DEVICE ? 
qp->max_write_sge : + qp->max_read_sge; struct ib_sge *sge; u32 total_len = 0, i, j; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 6298f54b4137..e39a0b597234 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -814,6 +814,15 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, } } + /* + * Note: all hw drivers guarantee that max_send_sge is lower than + * the device RDMA WRITE SGE limit but not all hw drivers ensure that + * max_send_sge <= max_sge_rd. + */ + qp->max_write_sge = qp_init_attr->cap.max_send_sge; + qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, + device->attrs.max_sge_rd); + return qp; } EXPORT_SYMBOL(ib_create_qp); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 7e440d41487a..e694f02d42e3 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1428,6 +1428,10 @@ struct ib_srq { } ext; }; +/* + * @max_write_sge: Maximum SGE elements per RDMA WRITE request. + * @max_read_sge: Maximum SGE elements per RDMA READ request. + */ struct ib_qp { struct ib_device *device; struct ib_pd *pd; @@ -1449,6 +1453,8 @@ struct ib_qp { void (*event_handler)(struct ib_event *, void *); void *qp_context; u32 qp_num; + u32 max_write_sge; + u32 max_read_sge; enum ib_qp_type qp_type; }; -- cgit v1.2.3 From afcf8f7647780aa147ad68be48d223cd50311b9a Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 1 Jul 2016 16:02:07 -0700 Subject: IB/rdmavt: Add data structures and routines for table driven post send Add flexibility for driver dependent operations in post send because different drivers will have differing post send operation support. This includes data structure definitions to support a table driven scheme along with the necessary validation routine using the new table. 
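As a rough illustration of the scheme (the entries below are hypothetical, not taken
from any in-tree driver), a driver would describe each opcode it supports in a table
of the rvt_operation_params structures defined below and point rdi->post_parms at it:

	static const struct rvt_operation_params example_post_parms[RVT_OPERATION_MAX] = {
		/* ordinary RDMA WRITE: UC and RC, the swqe copies an ib_rdma_wr */
		[IB_WR_RDMA_WRITE] = {
			.length = sizeof(struct ib_rdma_wr),
			.qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
		},
		/* atomic compare & swap: RC only; the ATOMIC flags make the
		 * validation routine check for a single 8-byte aligned SGE
		 * and a non-zero max_rd_atomic */
		[IB_WR_ATOMIC_CMP_AND_SWP] = {
			.length = sizeof(struct ib_atomic_wr),
			.qpt_support = BIT(IB_QPT_RC),
			.flags = RVT_OPERATION_ATOMIC | RVT_OPERATION_ATOMIC_SGE,
		},
		/* SEND: all QP types; for UD QPs the validation routine
		 * overrides the length to sizeof(struct ib_ud_wr) */
		[IB_WR_SEND] = {
			.length = sizeof(struct ib_send_wr),
			.qpt_support = BIT(IB_QPT_UD) | BIT(IB_QPT_SMI) |
				       BIT(IB_QPT_GSI) | BIT(IB_QPT_UC) |
				       BIT(IB_QPT_RC),
		},
	};

Post send can then validate every work request against such a table and knows how
many bytes of the incoming ib_send_wr to copy into the swqe.
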
Reviewed-by: Ashutosh Dixit Reviewed-by: Jianxin Xiong Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 67 ++++++++++++++++++++++++++++++++++++--- include/rdma/rdma_vt.h | 3 ++ include/rdma/rdmavt_qp.h | 28 +++++++++++++--- 3 files changed, 89 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 41ba7e9cadaa..d2b5b547c5a0 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -613,6 +613,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device); void *priv = NULL; gfp_t gfp; + size_t sqsize; if (!rdi) return ERR_PTR(-EINVAL); @@ -643,7 +644,8 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, init_attr->cap.max_recv_wr == 0) return ERR_PTR(-EINVAL); } - + sqsize = + init_attr->cap.max_send_wr + 1; switch (init_attr->qp_type) { case IB_QPT_SMI: case IB_QPT_GSI: @@ -658,11 +660,11 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, sizeof(struct rvt_swqe); if (gfp == GFP_NOIO) swq = __vmalloc( - (init_attr->cap.max_send_wr + 1) * sz, + sqsize * sz, gfp | __GFP_ZERO, PAGE_KERNEL); else swq = vzalloc_node( - (init_attr->cap.max_send_wr + 1) * sz, + sqsize * sz, rdi->dparms.node); if (!swq) return ERR_PTR(-ENOMEM); @@ -747,7 +749,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, INIT_LIST_HEAD(&qp->rspwait); qp->state = IB_QPS_RESET; qp->s_wq = swq; - qp->s_size = init_attr->cap.max_send_wr + 1; + qp->s_size = sqsize; qp->s_avail = init_attr->cap.max_send_wr; qp->s_max_sge = init_attr->cap.max_send_sge; if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR) @@ -1440,12 +1442,65 @@ int rvt_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, } /** - * qp_get_savail - return number of avail send entries + * rvt_qp_valid_operation - validate post send wr request + * @qp - the qp + * @post-parms - the post send table for the driver + * @wr - the work request * + * The routine validates the operation based on the + * validation table an returns the length of the operation + * which can extend beyond the ib_send_bw. Operation + * dependent flags key atomic operation validation. + * + * There is an exception for UD qps that validates the pd and + * overrides the length to include the additional UD specific + * length. + * + * Returns a negative error or the length of the work request + * for building the swqe. 
+ */ +static inline int rvt_qp_valid_operation( + struct rvt_qp *qp, + const struct rvt_operation_params *post_parms, + struct ib_send_wr *wr) +{ + int len; + + if (wr->opcode >= RVT_OPERATION_MAX || !post_parms[wr->opcode].length) + return -EINVAL; + if (!(post_parms[wr->opcode].qpt_support & BIT(qp->ibqp.qp_type))) + return -EINVAL; + if ((post_parms[wr->opcode].flags & RVT_OPERATION_PRIV) && + ibpd_to_rvtpd(qp->ibqp.pd)->user) + return -EINVAL; + if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC_SGE && + (wr->num_sge == 0 || + wr->sg_list[0].length < sizeof(u64) || + wr->sg_list[0].addr & (sizeof(u64) - 1))) + return -EINVAL; + if (post_parms[wr->opcode].flags & RVT_OPERATION_ATOMIC && + !qp->s_max_rd_atomic) + return -EINVAL; + len = post_parms[wr->opcode].length; + /* UD specific */ + if (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) { + if (qp->ibqp.pd != ud_wr(wr)->ah->pd) + return -EINVAL; + len = sizeof(struct ib_ud_wr); + } + return len; +} + +/** + * qp_get_savail - return number of avail send entries * @qp - the qp * * This assumes the s_hlock is held but the s_last * qp variable is uncontrolled. + * + * The return is adjusted to not count device specific + * reserved operations. */ static inline u32 qp_get_savail(struct rvt_qp *qp) { @@ -1481,6 +1536,8 @@ static int rvt_post_one_wr(struct rvt_qp *qp, u8 log_pmtu; int ret; + BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE)); + /* IB spec says that num_sge == 0 is OK. */ if (unlikely(wr->num_sge > qp->s_max_sge)) return -EINVAL; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 9c9a27d42aaa..3a70dc047314 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -351,6 +351,9 @@ struct rvt_dev_info { /* Driver specific properties */ struct rvt_driver_params dparms; + /* post send table */ + const struct rvt_operation_params *post_parms; + struct rvt_mregion __rcu *dma_mr; struct rvt_lkey_table lkey_table; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 6d23b879416a..a90d1e941504 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -228,11 +228,31 @@ struct rvt_ack_entry { #define RC_QP_SCALING_INTERVAL 5 -/* - * Variables prefixed with s_ are for the requester (sender). - * Variables prefixed with r_ are for the responder (receiver). - * Variables prefixed with ack_ are for responder replies. +#define RVT_OPERATION_PRIV 0x00000001 +#define RVT_OPERATION_ATOMIC 0x00000002 +#define RVT_OPERATION_ATOMIC_SGE 0x00000004 + +#define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) + +/** + * rvt_operation_params - op table entry + * @length - the length to copy into the swqe entry + * @qpt_support - a bit mask indicating QP type support + * @flags - RVT_OPERATION flags (see above) + * + * This supports table driven post send so that + * the driver can have differing an potentially + * different sets of operations. * + **/ + +struct rvt_operation_params { + size_t length; + u32 qpt_support; + u32 flags; +}; + +/* * Common variables are protected by both r_rq.lock and s_lock in that order * which only happens in modify_qp() or changing the QP 'state'. */ -- cgit v1.2.3 From e8f8b098a44a66d3da81e460aed465f26693e120 Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 25 Jul 2016 13:38:19 -0700 Subject: IB/rdmavt: Add mechanism to invalidate MR keys In order to support extended memory management, add the mechanism to invalidate MR keys. 
This includes a flag "lkey_invalid" in the MR data structure that is to be checked when validating access to the MR via the associated key, and two utility functions to perform fast memory registration and memory key invalidate operations. Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/mr.c | 73 +++++++++++++++++++++++++++++++++++++-- include/rdma/rdma_vt.h | 3 ++ include/rdma/rdmavt_mr.h | 1 + 3 files changed, 75 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 75f158aa853d..80c4b6b401b8 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -140,6 +140,7 @@ static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd, init_completion(&mr->comp); /* count returning the ptr to user */ atomic_set(&mr->refcount, 1); + atomic_set(&mr->lkey_invalid, 0); mr->pd = pd; mr->max_segs = count; return 0; @@ -530,6 +531,72 @@ int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, rvt_set_page); } +/** + * rvt_fast_reg_mr - fast register physical MR + * @qp: the queue pair where the work request comes from + * @ibmr: the memory region to be registered + * @key: updated key for this memory region + * @access: access flags for this memory region + * + * Returns 0 on success. + */ +int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, + int access) +{ + struct rvt_mr *mr = to_imr(ibmr); + + if (qp->ibqp.pd != mr->mr.pd) + return -EACCES; + + /* not applicable to dma MR or user MR */ + if (!mr->mr.lkey || mr->umem) + return -EINVAL; + + if ((key & 0xFFFFFF00) != (mr->mr.lkey & 0xFFFFFF00)) + return -EINVAL; + + ibmr->lkey = key; + ibmr->rkey = key; + mr->mr.lkey = key; + mr->mr.access_flags = access; + atomic_set(&mr->mr.lkey_invalid, 0); + + return 0; +} +EXPORT_SYMBOL(rvt_fast_reg_mr); + +/** + * rvt_invalidate_rkey - invalidate an MR rkey + * @qp: queue pair associated with the invalidate op + * @rkey: rkey to invalidate + * + * Returns 0 on success. 
+ */ +int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey) +{ + struct rvt_dev_info *dev = ib_to_rvt(qp->ibqp.device); + struct rvt_lkey_table *rkt = &dev->lkey_table; + struct rvt_mregion *mr; + + if (rkey == 0) + return -EINVAL; + + rcu_read_lock(); + mr = rcu_dereference( + rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]); + if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) + goto bail; + + atomic_set(&mr->lkey_invalid, 1); + rcu_read_unlock(); + return 0; + +bail: + rcu_read_unlock(); + return -EINVAL; +} +EXPORT_SYMBOL(rvt_invalidate_rkey); + /** * rvt_alloc_fmr - allocate a fast memory region * @pd: the protection domain for this memory region @@ -733,7 +800,8 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, } mr = rcu_dereference( rkt->table[(sge->lkey >> (32 - dev->dparms.lkey_table_size))]); - if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) + if (unlikely(!mr || atomic_read(&mr->lkey_invalid) || + mr->lkey != sge->lkey || mr->pd != &pd->ibpd)) goto bail; off = sge->addr - mr->user_base; @@ -833,7 +901,8 @@ int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, mr = rcu_dereference( rkt->table[(rkey >> (32 - dev->dparms.lkey_table_size))]); - if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) + if (unlikely(!mr || atomic_read(&mr->lkey_invalid) || + mr->lkey != rkey || qp->ibqp.pd != mr->pd)) goto bail; off = vaddr - mr->iova; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 3a70dc047314..7fdba92d4c05 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -487,6 +487,9 @@ void rvt_unregister_device(struct rvt_dev_info *rvd); int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, int port_index, u16 *pkey_table); +int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, + int access); +int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey); int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, u32 len, u64 vaddr, u32 rkey, int acc); int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h index 5edffdca8c53..6b3c6c8b6b77 100644 --- a/include/rdma/rdmavt_mr.h +++ b/include/rdma/rdmavt_mr.h @@ -81,6 +81,7 @@ struct rvt_mregion { u32 mapsz; /* size of the map array */ u8 page_shift; /* 0 - non unform/non powerof2 sizes */ u8 lkey_published; /* in global table */ + atomic_t lkey_invalid; /* true if current lkey is invalid */ struct completion comp; /* complete when refcount goes to zero */ atomic_t refcount; struct rvt_segarray *map[0]; /* the segments */ -- cgit v1.2.3 From d9f8723924d5955979d05cb7f4f10d9ebac39b7d Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 25 Jul 2016 13:38:25 -0700 Subject: IB/rdmavt: Handle local operations in post send Some work requests are local operations, such as IB_WR_REG_MR and IB_WR_LOCAL_INV. They differ from non-local operations in that: (1) Local operations can be processed immediately without being posted to the send queue if neither fencing nor completion generation is needed. However, to ensure correct ordering, once a local operation is posted to the work queue due to fencing or completion requiement, all subsequent local operations must also be posted to the work queue until all the local operations on the work queue have completed. (2) Local operations don't send packets over the wire and thus don't need (and shouldn't update) the packet sequence numbers. 
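As a sketch (illustrative entries only, not copied from a real driver), the two local
operations would be additional entries in a post-send table like the one sketched for
the previous patch, marked with the new flag so the post send code can tell them apart:

	[IB_WR_REG_MR] = {
		.length = sizeof(struct ib_reg_wr),
		.qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
		.flags = RVT_OPERATION_LOCAL,
	},
	[IB_WR_LOCAL_INV] = {
		.length = sizeof(struct ib_send_wr),
		.qpt_support = BIT(IB_QPT_UC) | BIT(IB_QPT_RC),
		.flags = RVT_OPERATION_LOCAL,
	},

With the flag in place, post send either executes these requests immediately or, when
fencing or completion generation forces queuing, posts them with zeroed PSNs so they
do not disturb the packet sequence numbering.
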
Define a new a flag bit for the post send table to identify local operations. Add a new field to the QP structure to track the number of local operations on the send queue to determine if direct processing of new local operations should be enabled/disabled. Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 45 ++++++++++++++++++++++++++++++++++----- include/rdma/rdmavt_qp.h | 3 +++ 2 files changed, 43 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index ebc37f55ac55..f79b809241e0 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -743,6 +743,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, spin_lock_init(&qp->s_lock); spin_lock_init(&qp->r_rq.lock); atomic_set(&qp->refcount, 0); + atomic_set(&qp->local_ops_pending, 0); init_waitqueue_head(&qp->wait); init_timer(&qp->s_timer); qp->s_timer.data = (unsigned long)qp; @@ -1548,6 +1549,31 @@ static int rvt_post_one_wr(struct rvt_qp *qp, return ret; cplen = ret; + /* + * Local operations including fast register and local invalidate + * can be processed immediately w/o being posted to the send queue + * if neither fencing nor completion generation is needed. However, + * once fencing or completion is requested, direct processing of + * following local operations must be disabled until all the local + * operations posted to the send queue have completed. This is + * necessary to ensure the correct ordering. + */ + if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) && + !(wr->send_flags & (IB_SEND_FENCE | IB_SEND_SIGNALED)) && + !atomic_read(&qp->local_ops_pending)) { + struct ib_reg_wr *reg = reg_wr(wr); + + switch (wr->opcode) { + case IB_WR_REG_MR: + return rvt_fast_reg_mr(qp, reg->mr, reg->key, + reg->access); + case IB_WR_LOCAL_INV: + return rvt_invalidate_rkey(qp, wr->ex.invalidate_rkey); + default: + return -EINVAL; + } + } + /* check for avail */ if (unlikely(!qp->s_avail)) { qp->s_avail = qp_get_savail(qp); @@ -1612,11 +1638,20 @@ static int rvt_post_one_wr(struct rvt_qp *qp, atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); } - wqe->ssn = qp->s_ssn++; - wqe->psn = qp->s_next_psn; - wqe->lpsn = wqe->psn + - (wqe->length ? ((wqe->length - 1) >> log_pmtu) : 0); - qp->s_next_psn = wqe->lpsn + 1; + if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) { + atomic_inc(&qp->local_ops_pending); + wqe->ssn = 0; + wqe->psn = 0; + wqe->lpsn = 0; + } else { + wqe->ssn = qp->s_ssn++; + wqe->psn = qp->s_next_psn; + wqe->lpsn = wqe->psn + + (wqe->length ? + ((wqe->length - 1) >> log_pmtu) : + 0); + qp->s_next_psn = wqe->lpsn + 1; + } trace_rvt_post_one_wr(qp, wqe); smp_wmb(); /* see request builders */ qp->s_avail--; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index a90d1e941504..b0ab12b30f1e 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -231,6 +231,7 @@ struct rvt_ack_entry { #define RVT_OPERATION_PRIV 0x00000001 #define RVT_OPERATION_ATOMIC 0x00000002 #define RVT_OPERATION_ATOMIC_SGE 0x00000004 +#define RVT_OPERATION_LOCAL 0x00000008 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) @@ -363,6 +364,8 @@ struct rvt_qp { struct rvt_sge_state s_ack_rdma_sge; struct timer_list s_timer; + atomic_t local_ops_pending; /* number of fast_reg/local_inv reqs */ + /* * This sge list MUST be last. Do not add anything below here. 
*/ -- cgit v1.2.3 From 856cc4c237add46510c8ae91764f4eda31a9e1cf Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 25 Jul 2016 13:39:39 -0700 Subject: IB/hfi1: Add the capability for reserved operations This fix allows for support of in-kernel reserved operations without impacting the ULP user. The low level driver can register a non-zero value which will be transparently added to the send queue size and hidden from the ULP in every respect. ULP post sends will never see a full queue due to a reserved post send and reserved operations will never exceed that registered value. The s_avail will continue to track the ULP swqe availability and the difference between the reserved value and the reserved in use will track reserved availabity. Reviewed-by: Ashutosh Dixit Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 85 ++++++++++++++++++++++++++++----------- include/rdma/rdma_vt.h | 1 + include/rdma/rdmavt_qp.h | 50 +++++++++++++++++++++++ 3 files changed, 113 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index f79b809241e0..218494c6afe2 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -584,6 +584,7 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->r_rq.wq->tail = 0; } qp->r_sge.num_sge = 0; + atomic_set(&qp->s_reserved_used, 0); } /** @@ -645,7 +646,8 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, return ERR_PTR(-EINVAL); } sqsize = - init_attr->cap.max_send_wr + 1; + init_attr->cap.max_send_wr + 1 + + rdi->dparms.reserved_operations; switch (init_attr->qp_type) { case IB_QPT_SMI: case IB_QPT_GSI: @@ -1335,7 +1337,8 @@ int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->sq_psn = qp->s_next_psn & rdi->dparms.psn_mask; attr->dest_qp_num = qp->remote_qpn; attr->qp_access_flags = qp->qp_access_flags; - attr->cap.max_send_wr = qp->s_size - 1; + attr->cap.max_send_wr = qp->s_size - 1 - + rdi->dparms.reserved_operations; attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1; attr->cap.max_send_sge = qp->s_max_sge; attr->cap.max_recv_sge = qp->r_rq.max_sge; @@ -1494,27 +1497,65 @@ static inline int rvt_qp_valid_operation( } /** - * qp_get_savail - return number of avail send entries + * rvt_qp_is_avail - determine queue capacity * @qp - the qp + * @rdi - the rdmavt device + * @reserved_op - is reserved operation * * This assumes the s_hlock is held but the s_last * qp variable is uncontrolled. * - * The return is adjusted to not count device specific - * reserved operations. + * For non reserved operations, the qp->s_avail + * may be changed. + * + * The return value is zero or a -ENOMEM. 
*/ -static inline u32 qp_get_savail(struct rvt_qp *qp) +static inline int rvt_qp_is_avail( + struct rvt_qp *qp, + struct rvt_dev_info *rdi, + bool reserved_op) { u32 slast; - u32 ret; - + u32 avail; + u32 reserved_used; + + /* see rvt_qp_wqe_unreserve() */ + smp_mb__before_atomic(); + reserved_used = atomic_read(&qp->s_reserved_used); + if (unlikely(reserved_op)) { + /* see rvt_qp_wqe_unreserve() */ + smp_mb__before_atomic(); + if (reserved_used >= rdi->dparms.reserved_operations) + return -ENOMEM; + return 0; + } + /* non-reserved operations */ + if (likely(qp->s_avail)) + return 0; smp_read_barrier_depends(); /* see rc.c */ slast = ACCESS_ONCE(qp->s_last); if (qp->s_head >= slast) - ret = qp->s_size - (qp->s_head - slast); + avail = qp->s_size - (qp->s_head - slast); else - ret = slast - qp->s_head; - return ret - 1; + avail = slast - qp->s_head; + + /* see rvt_qp_wqe_unreserve() */ + smp_mb__before_atomic(); + reserved_used = atomic_read(&qp->s_reserved_used); + avail = avail - 1 - + (rdi->dparms.reserved_operations - reserved_used); + /* insure we don't assign a negative s_avail */ + if ((s32)avail <= 0) + return -ENOMEM; + qp->s_avail = avail; + if (WARN_ON(qp->s_avail > + (qp->s_size - 1 - rdi->dparms.reserved_operations))) + rvt_pr_err(rdi, + "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u", + qp->ibqp.qp_num, qp->s_size, qp->s_avail, + qp->s_head, qp->s_tail, qp->s_cur, + qp->s_acked, qp->s_last); + return 0; } /** @@ -1537,6 +1578,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, u8 log_pmtu; int ret; size_t cplen; + bool reserved_op; BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE)); @@ -1574,18 +1616,12 @@ static int rvt_post_one_wr(struct rvt_qp *qp, } } + reserved_op = rdi->post_parms[wr->opcode].flags & + RVT_OPERATION_USE_RESERVE; /* check for avail */ - if (unlikely(!qp->s_avail)) { - qp->s_avail = qp_get_savail(qp); - if (WARN_ON(qp->s_avail > (qp->s_size - 1))) - rvt_pr_err(rdi, - "More avail entries than QP RB size.\nQP: %u, size: %u, avail: %u\nhead: %u, tail: %u, cur: %u, acked: %u, last: %u", - qp->ibqp.qp_num, qp->s_size, qp->s_avail, - qp->s_head, qp->s_tail, qp->s_cur, - qp->s_acked, qp->s_last); - if (!qp->s_avail) - return -ENOMEM; - } + ret = rvt_qp_is_avail(qp, rdi, reserved_op); + if (ret) + return ret; next = qp->s_head + 1; if (next >= qp->s_size) next = 0; @@ -1653,8 +1689,11 @@ static int rvt_post_one_wr(struct rvt_qp *qp, qp->s_next_psn = wqe->lpsn + 1; } trace_rvt_post_one_wr(qp, wqe); + if (unlikely(reserved_op)) + rvt_qp_wqe_reserve(qp, wqe); + else + qp->s_avail--; smp_wmb(); /* see request builders */ - qp->s_avail--; qp->s_head = next; return 0; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 7fdba92d4c05..e31502107a58 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -158,6 +158,7 @@ struct rvt_driver_params { u32 max_mad_size; u8 qos_shift; u8 max_rdma_atomic; + u8 reserved_operations; }; /* Protection domain */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index b0ab12b30f1e..56adcfcabe0b 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -144,6 +144,11 @@ #define RVT_PROCESS_OR_FLUSH_SEND \ (RVT_PROCESS_SEND_OK | RVT_FLUSH_SEND) +/* + * Internal send flags + */ +#define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START + /* * Send work request queue entry. 
* The size of the sg_list is determined when the QP is created and stored @@ -232,6 +237,7 @@ struct rvt_ack_entry { #define RVT_OPERATION_ATOMIC 0x00000002 #define RVT_OPERATION_ATOMIC_SGE 0x00000004 #define RVT_OPERATION_LOCAL 0x00000008 +#define RVT_OPERATION_USE_RESERVE 0x00000010 #define RVT_OPERATION_MAX (IB_WR_RESERVED10 + 1) @@ -328,6 +334,7 @@ struct rvt_qp { u32 s_next_psn; /* PSN for next request */ u32 s_avail; /* number of entries avail */ u32 s_ssn; /* SSN of tail entry */ + atomic_t s_reserved_used; /* reserved entries in use */ spinlock_t s_lock ____cacheline_aligned_in_smp; u32 s_flags; @@ -459,6 +466,49 @@ static inline struct rvt_rwqe *rvt_get_rwqe_ptr(struct rvt_rq *rq, unsigned n) rq->max_sge * sizeof(struct ib_sge)) * n); } +/** + * rvt_qp_wqe_reserve - reserve operation + * @qp - the rvt qp + * @wqe - the send wqe + * + * This routine used in post send to record + * a wqe relative reserved operation use. + */ +static inline void rvt_qp_wqe_reserve( + struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + wqe->wr.send_flags |= RVT_SEND_RESERVE_USED; + atomic_inc(&qp->s_reserved_used); +} + +/** + * rvt_qp_wqe_unreserve - clean reserved operation + * @qp - the rvt qp + * @wqe - the send wqe + * + * This decrements the reserve use count. + * + * This call MUST precede the change to + * s_last to insure that post send sees a stable + * s_avail. + * + * An smp_mp__after_atomic() is used to insure + * the compiler does not juggle the order of the s_last + * ring index and the decrementing of s_reserved_used. + */ +static inline void rvt_qp_wqe_unreserve( + struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) { + wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED; + atomic_dec(&qp->s_reserved_used); + /* insure no compiler re-order up to s_last change */ + smp_mb__after_atomic(); + } +} + extern const int ib_rvt_state_ops[]; struct rvt_dev_info; -- cgit v1.2.3 From d9b13c203003cfb78c1f216049a204d385ccaeff Mon Sep 17 00:00:00 2001 From: Jianxin Xiong Date: Mon, 25 Jul 2016 13:39:45 -0700 Subject: IB/rdmavt, hfi1: Fix NFSoRDMA failure with FRMR enabled Hanging has been observed while writing a file over NFSoRDMA. Dmesg on the server contains messages like these: [ 931.992501] svcrdma: Error -22 posting RDMA_READ [ 952.076879] svcrdma: Error -22 posting RDMA_READ [ 982.154127] svcrdma: Error -22 posting RDMA_READ [ 1012.235884] svcrdma: Error -22 posting RDMA_READ [ 1042.319194] svcrdma: Error -22 posting RDMA_READ Here is why: With the base memory management extension enabled, FRMR is used instead of FMR. The xprtrdma server issues each RDMA read request as the following bundle: (1)IB_WR_REG_MR, signaled; (2)IB_WR_RDMA_READ, signaled; (3)IB_WR_LOCAL_INV, signaled & fencing. These requests are signaled. In order to generate completion, the fast register work request is processed by the hfi1 send engine after being posted to the work queue, and the corresponding lkey is not valid until the request is processed. However, the rdmavt driver validates lkey when the RDMA read request is posted and thus it fails immediately with error -EINVAL (-22). This patch changes the work flow of local operations (fast register and local invalidate) so that fast register work requests are always processed immediately to ensure that the corresponding lkey is valid when subsequent work requests are posted. Local invalidate requests are processed immediately if fencing is not required and no previous local invalidate request is pending. 
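For reference, the RDMA read bundle described above looks roughly like the following
when expressed with the core verbs API (a simplified sketch; frmr, sge, remote_addr,
remote_rkey and qp are assumed to have been set up by the caller):

	struct ib_reg_wr reg = {
		.wr = {
			.opcode = IB_WR_REG_MR,
			.send_flags = IB_SEND_SIGNALED,
		},
		.mr = frmr,
		.key = frmr->rkey,
		.access = IB_ACCESS_LOCAL_WRITE,
	};
	struct ib_rdma_wr read = {
		.wr = {
			.opcode = IB_WR_RDMA_READ,
			.send_flags = IB_SEND_SIGNALED,
			.sg_list = &sge,	/* sge.lkey refers to frmr */
			.num_sge = 1,
		},
		.remote_addr = remote_addr,
		.rkey = remote_rkey,
	};
	struct ib_send_wr inv = {
		.opcode = IB_WR_LOCAL_INV,
		.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE,
		.ex.invalidate_rkey = frmr->rkey,
	};
	struct ib_send_wr *bad_wr;
	int ret;

	reg.wr.next = &read.wr;
	read.wr.next = &inv;
	ret = ib_post_send(qp, &reg.wr, &bad_wr);

	/*
	 * The READ uses the lkey being registered by the first request, and
	 * rdmavt validates that lkey at post time, which is why fast register
	 * must be executed immediately rather than deferred to the send engine.
	 */
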
To allow completion generation for signaled local operations that have been processed before posting to the work queue, an internal send flag RVT_SEND_COMPLETION_ONLY is added. The hfi1 send engine checks this flag and only generates completion for such requests. Reviewed-by: Mike Marciniszyn Signed-off-by: Jianxin Xiong Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/rc.c | 17 +++++++------- drivers/infiniband/hw/hfi1/ruc.c | 13 +++++------ drivers/infiniband/hw/hfi1/uc.c | 15 ++++++------ drivers/infiniband/sw/rdmavt/qp.c | 48 ++++++++++++++++++++++++++------------- include/rdma/rdmavt_qp.h | 1 + 5 files changed, 56 insertions(+), 38 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 0bc43b67d0b8..5da190e6011b 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -402,7 +402,6 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) char newreq; int middle = 0; int delta; - int err; ps->s_txreq = get_txreq(ps->dev, qp); if (IS_ERR(ps->s_txreq)) @@ -484,25 +483,27 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) */ if (wqe->wr.opcode == IB_WR_REG_MR || wqe->wr.opcode == IB_WR_LOCAL_INV) { + int local_ops = 0; + int err = 0; + if (qp->s_last != qp->s_cur) goto bail; if (++qp->s_cur == qp->s_size) qp->s_cur = 0; if (++qp->s_tail == qp->s_size) qp->s_tail = 0; - if (wqe->wr.opcode == IB_WR_REG_MR) - err = rvt_fast_reg_mr( - qp, wqe->reg_wr.mr, - wqe->reg_wr.key, - wqe->reg_wr.access); - else + if (!(wqe->wr.send_flags & + RVT_SEND_COMPLETION_ONLY)) { err = rvt_invalidate_rkey( qp, wqe->wr.ex.invalidate_rkey); + local_ops = 1; + } hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR : IB_WC_SUCCESS); - atomic_dec(&qp->local_ops_pending); + if (local_ops) + atomic_dec(&qp->local_ops_pending); qp->s_hdrwords = 0; goto done_free_tx; } diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 76b9c9e42d86..7e76d33a5774 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -442,16 +442,15 @@ again: sqp->s_len = wqe->length; switch (wqe->wr.opcode) { case IB_WR_REG_MR: - if (rvt_fast_reg_mr(sqp, wqe->reg_wr.mr, wqe->reg_wr.key, - wqe->reg_wr.access)) - send_status = IB_WC_LOC_PROT_ERR; - local_ops = 1; goto send_comp; case IB_WR_LOCAL_INV: - if (rvt_invalidate_rkey(sqp, wqe->wr.ex.invalidate_rkey)) - send_status = IB_WC_LOC_PROT_ERR; - local_ops = 1; + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { + if (rvt_invalidate_rkey(sqp, + wqe->wr.ex.invalidate_rkey)) + send_status = IB_WC_LOC_PROT_ERR; + local_ops = 1; + } goto send_comp; case IB_WR_SEND_WITH_INV: diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index ef6c96cd3d68..a726d96d185f 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -77,7 +77,6 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) u32 len; u32 pmtu = qp->pmtu; int middle = 0; - int err; ps->s_txreq = get_txreq(ps->dev, qp); if (IS_ERR(ps->s_txreq)) @@ -125,20 +124,22 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) */ if (wqe->wr.opcode == IB_WR_REG_MR || wqe->wr.opcode == IB_WR_LOCAL_INV) { + int local_ops = 0; + int err = 0; + if (qp->s_last != qp->s_cur) goto bail; if (++qp->s_cur == qp->s_size) qp->s_cur = 0; - if (wqe->wr.opcode == IB_WR_REG_MR) - err = rvt_fast_reg_mr(qp, wqe->reg_wr.mr, - wqe->reg_wr.key, - 
wqe->reg_wr.access); - else + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { err = rvt_invalidate_rkey( qp, wqe->wr.ex.invalidate_rkey); + local_ops = 1; + } hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR : IB_WC_SUCCESS); - atomic_dec(&qp->local_ops_pending); + if (local_ops) + atomic_dec(&qp->local_ops_pending); qp->s_hdrwords = 0; goto done_free_tx; } diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 218494c6afe2..8ccf1b970b2c 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1579,6 +1579,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, int ret; size_t cplen; bool reserved_op; + int local_ops_delayed = 0; BUILD_BUG_ON(IB_QPT_MAX >= (sizeof(u32) * BITS_PER_BYTE)); @@ -1592,25 +1593,37 @@ static int rvt_post_one_wr(struct rvt_qp *qp, cplen = ret; /* - * Local operations including fast register and local invalidate - * can be processed immediately w/o being posted to the send queue - * if neither fencing nor completion generation is needed. However, - * once fencing or completion is requested, direct processing of - * following local operations must be disabled until all the local - * operations posted to the send queue have completed. This is - * necessary to ensure the correct ordering. + * Local operations include fast register and local invalidate. + * Fast register needs to be processed immediately because the + * registered lkey may be used by following work requests and the + * lkey needs to be valid at the time those requests are posted. + * Local invalidate can be processed immediately if fencing is + * not required and no previous local invalidate ops are pending. + * Signaled local operations that have been processed immediately + * need to have requests with "completion only" flags set posted + * to the send queue in order to generate completions. */ - if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) && - !(wr->send_flags & (IB_SEND_FENCE | IB_SEND_SIGNALED)) && - !atomic_read(&qp->local_ops_pending)) { - struct ib_reg_wr *reg = reg_wr(wr); - + if ((rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) { switch (wr->opcode) { case IB_WR_REG_MR: - return rvt_fast_reg_mr(qp, reg->mr, reg->key, - reg->access); + ret = rvt_fast_reg_mr(qp, + reg_wr(wr)->mr, + reg_wr(wr)->key, + reg_wr(wr)->access); + if (ret || !(wr->send_flags & IB_SEND_SIGNALED)) + return ret; + break; case IB_WR_LOCAL_INV: - return rvt_invalidate_rkey(qp, wr->ex.invalidate_rkey); + if ((wr->send_flags & IB_SEND_FENCE) || + atomic_read(&qp->local_ops_pending)) { + local_ops_delayed = 1; + } else { + ret = rvt_invalidate_rkey( + qp, wr->ex.invalidate_rkey); + if (ret || !(wr->send_flags & IB_SEND_SIGNALED)) + return ret; + } + break; default: return -EINVAL; } @@ -1675,7 +1688,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp, } if (rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL) { - atomic_inc(&qp->local_ops_pending); + if (local_ops_delayed) + atomic_inc(&qp->local_ops_pending); + else + wqe->wr.send_flags |= RVT_SEND_COMPLETION_ONLY; wqe->ssn = 0; wqe->psn = 0; wqe->lpsn = 0; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 56adcfcabe0b..13902dd319a9 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -148,6 +148,7 @@ * Internal send flags */ #define RVT_SEND_RESERVE_USED IB_SEND_RESERVED_START +#define RVT_SEND_COMPLETION_ONLY (IB_SEND_RESERVED_START << 1) /* * Send work request queue entry. 
-- cgit v1.2.3 From fe508272c963d62de4183c32b6883c3d54c557ef Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 27 Jul 2016 21:07:36 -0400 Subject: IB/rdmavt: Eliminate redundant opcode test in mr ref clear The use of the specific opcode test is redundant since all ack entry users correctly manipulate the mr pointer to selectively trigger the reference clearing. The overly specific test hinders the use of implementation specific operations. The change needs to get rid of the union to insure that an atomic value is not seen as an MR pointer. Reviewed-by: Ashutosh Dixit Signed-off-by: Mike Marciniszyn Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 3 +-- include/rdma/rdmavt_qp.h | 10 ++++------ 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 8ccf1b970b2c..bdb540f25a88 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -435,8 +435,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) for (n = 0; n < rvt_max_atomic(rdi); n++) { struct rvt_ack_entry *e = &qp->s_ack_queue[n]; - if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST && - e->rdma_sge.mr) { + if (e->rdma_sge.mr) { rvt_put_mr(e->rdma_sge.mr); e->rdma_sge.mr = NULL; } diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 13902dd319a9..bd34d0b56bf7 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -222,14 +222,12 @@ struct rvt_mmap_info { * to send a RDMA read response or atomic operation. */ struct rvt_ack_entry { - u8 opcode; - u8 sent; + struct rvt_sge rdma_sge; + u64 atomic_data; u32 psn; u32 lpsn; - union { - struct rvt_sge rdma_sge; - u64 atomic_data; - }; + u8 opcode; + u8 sent; }; #define RC_QP_SCALING_INTERVAL 5 -- cgit v1.2.3 From c49298026908a8ce9dcf01ed68734ad171cef98b Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 27 Jul 2016 21:08:42 -0400 Subject: IB/hfi1: Allow for non-double word multiple message sizes for user SDMA The driver pads non-double word multiple message sizes but it doesn't account for this padding when the packet length is calculated. Also, the data length is miscalculated for message sizes less than 4 bytes due to the bit representation in LRH. And there's a check for non-double word multiple message sizes that prevents these messages from being sent. This patch fixes length miscalculations and enables the functionality to send non-double word multiple message sizes. Reviewed-by: Harish Chegondi Signed-off-by: Sebastian Sanchez Signed-off-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/user_sdma.c | 31 ++++++++++++++++++++++--------- include/uapi/rdma/hfi/hfi1_user.h | 2 +- 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index d16ed52a2cb1..1e266c95056a 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -793,14 +793,21 @@ static inline u32 compute_data_length(struct user_sdma_request *req, * The size of the data of the first packet is in the header * template. However, it includes the header and ICRC, which need * to be subtracted. + * The minimum representable packet data length in a header is 4 bytes, + * therefore, when the data length request is less than 4 bytes, there's + * only one packet, and the packet data length is equal to that of the + * request data length. 
* The size of the remaining packets is the minimum of the frag * size (MTU) or remaining data in the request. */ u32 len; if (!req->seqnum) { - len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - - (sizeof(tx->hdr) - 4)); + if (req->data_len < sizeof(u32)) + len = req->data_len; + else + len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - + (sizeof(tx->hdr) - 4)); } else if (req_opcode(req->info.ctrl) == EXPECTED) { u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * PAGE_SIZE; @@ -830,6 +837,13 @@ static inline u32 compute_data_length(struct user_sdma_request *req, return len; } +static inline u32 pad_len(u32 len) +{ + if (len & (sizeof(u32) - 1)) + len += sizeof(u32) - (len & (sizeof(u32) - 1)); + return len; +} + static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) { /* (Size of complete header - size of PBC) + 4B ICRC + data length */ @@ -921,7 +935,8 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) { if (!req->seqnum) { u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); - u32 lrhlen = get_lrh_len(req->hdr, datalen); + u32 lrhlen = get_lrh_len(req->hdr, + pad_len(datalen)); /* * Copy the request header into the tx header * because the HW needs a cacheline-aligned @@ -1219,16 +1234,14 @@ static int check_header_template(struct user_sdma_request *req, /* * Perform safety checks for any type of packet: * - transfer size is multiple of 64bytes - * - packet length is multiple of 4bytes - * - entire request length is multiple of 4bytes + * - packet length is multiple of 4 bytes * - packet length is not larger than MTU size * * These checks are only done for the first packet of the * transfer since the header is "given" to us by user space. * For the remainder of the packets we compute the values. */ - if (req->info.fragsize % PIO_BLOCK_SIZE || - lrhlen & 0x3 || req->data_len & 0x3 || + if (req->info.fragsize % PIO_BLOCK_SIZE || lrhlen & 0x3 || lrhlen > get_lrh_len(*hdr, req->info.fragsize)) return -EINVAL; @@ -1290,7 +1303,7 @@ static int set_txreq_header(struct user_sdma_request *req, struct hfi1_pkt_header *hdr = &tx->hdr; u16 pbclen; int ret; - u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen); + u32 tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(datalen)); /* Copy the header template to the request before modification */ memcpy(hdr, &req->hdr, sizeof(*hdr)); @@ -1401,7 +1414,7 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, struct hfi1_user_sdma_pkt_q *pq = req->pq; struct hfi1_pkt_header *hdr = &req->hdr; u16 pbclen = le16_to_cpu(hdr->pbc[0]); - u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len); + u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, pad_len(len)); if (PBC2LRH(pbclen) != lrhlen) { /* PBC.PbcLengthDWs */ diff --git a/include/uapi/rdma/hfi/hfi1_user.h b/include/uapi/rdma/hfi/hfi1_user.h index 98bebf8bef55..d15e7289d835 100644 --- a/include/uapi/rdma/hfi/hfi1_user.h +++ b/include/uapi/rdma/hfi/hfi1_user.h @@ -75,7 +75,7 @@ * may not be implemented; the user code must deal with this if it * cares, or it must abort after initialization reports the difference. */ -#define HFI1_USER_SWMINOR 1 +#define HFI1_USER_SWMINOR 2 /* * We will encode the major/minor inside a single 32bit version number. -- cgit v1.2.3
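As a closing usage sketch for the reserved-operation support added in the
"IB/hfi1: Add the capability for reserved operations" patch above (the driver hook
name, opcode choice and count here are hypothetical):

	/* Illustrative lower-driver initialization, assuming one reserved slot. */
	static void example_driver_setup_reserved_ops(struct rvt_dev_info *rdi)
	{
		/*
		 * Hide one extra swqe slot per QP from the ULP:
		 * rvt_create_qp() adds this to sqsize and rvt_query_qp()
		 * subtracts it from the reported max_send_wr.
		 */
		rdi->dparms.reserved_operations = 1;
	}

The driver-private opcode allowed to consume the hidden slot (one of the
IB_WR_RESERVEDx values) would carry RVT_OPERATION_USE_RESERVE in its post-send table
entry, so rvt_post_one_wr() calls rvt_qp_wqe_reserve() for it instead of decrementing
s_avail, and the driver's send engine releases the slot again with
rvt_qp_wqe_unreserve() when the wqe completes.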