From 7dafbab3753fcf59bc81748e5b2c5bf04e1c62c7 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Fri, 12 May 2017 09:19:55 -0700 Subject: IB/hfi1: Add functions to parse BTH/IB headers Improve code readability by adding inline functions to read specific BTH/IB fields without knowledge of byte offsets. Reviewed-by: Brian Welty Reviewed-by: Dasaratharaman Chandramouli Reviewed-by: Dennis Dalessandro Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/ib_hdrs.h | 84 ++++++++++++++++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 2 ++ include/rdma/rdmavt_qp.h | 2 +- 3 files changed, 87 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h index 5519f31f043a..c124d515f7d5 100644 --- a/include/rdma/ib_hdrs.h +++ b/include/rdma/ib_hdrs.h @@ -193,8 +193,12 @@ static inline void put_ib_ateth_compare(u64 val, struct ib_atomic_eth *ateth) #define IB_LNH_MASK 3 #define IB_SC_MASK 0xf #define IB_SC_SHIFT 12 +#define IB_SC5_MASK 0x10 #define IB_SL_MASK 0xf #define IB_SL_SHIFT 4 +#define IB_SL_SHIFT 4 +#define IB_LVER_MASK 0xf +#define IB_LVER_SHIFT 8 static inline u8 ib_get_lnh(struct ib_header *hdr) { @@ -206,6 +210,11 @@ static inline u8 ib_get_sc(struct ib_header *hdr) return ((be16_to_cpu(hdr->lrh[0]) >> IB_SC_SHIFT) & IB_SC_MASK); } +static inline bool ib_is_sc5(u16 sc5) +{ + return !!(sc5 & IB_SC5_MASK); +} + static inline u8 ib_get_sl(struct ib_header *hdr) { return ((be16_to_cpu(hdr->lrh[0]) >> IB_SL_SHIFT) & IB_SL_MASK); @@ -221,6 +230,27 @@ static inline u16 ib_get_slid(struct ib_header *hdr) return (be16_to_cpu(hdr->lrh[3])); } +static inline u8 ib_get_lver(struct ib_header *hdr) +{ + return (u8)((be16_to_cpu(hdr->lrh[0]) >> IB_LVER_SHIFT) & + IB_LVER_MASK); +} + +static inline u16 ib_get_len(struct ib_header *hdr) +{ + return (u16)(be16_to_cpu(hdr->lrh[2])); +} + +static inline u32 ib_get_qkey(struct ib_other_headers *ohdr) +{ + return 
be32_to_cpu(ohdr->u.ud.deth[0]); +} + +static inline u32 ib_get_sqpn(struct ib_other_headers *ohdr) +{ + return ((be32_to_cpu(ohdr->u.ud.deth[1])) & IB_QPN_MASK); +} + /* * BTH */ @@ -229,6 +259,14 @@ static inline u16 ib_get_slid(struct ib_header *hdr) #define IB_BTH_PAD_MASK 3 #define IB_BTH_PKEY_MASK 0xffff #define IB_BTH_PAD_SHIFT 20 +#define IB_BTH_A_MASK 1 +#define IB_BTH_A_SHIFT 31 +#define IB_BTH_M_MASK 1 +#define IB_BTH_M_SHIFT 22 +#define IB_BTH_SE_MASK 1 +#define IB_BTH_SE_SHIFT 23 +#define IB_BTH_TVER_MASK 0xf +#define IB_BTH_TVER_SHIFT 16 static inline u8 ib_bth_get_pad(struct ib_other_headers *ohdr) { @@ -247,4 +285,50 @@ static inline u8 ib_bth_get_opcode(struct ib_other_headers *ohdr) IB_BTH_OPCODE_MASK); } +static inline u8 ib_bth_get_ackreq(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[2]) >> IB_BTH_A_SHIFT) & + IB_BTH_A_MASK); +} + +static inline u8 ib_bth_get_migreq(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_M_SHIFT) & + IB_BTH_M_MASK); +} + +static inline u8 ib_bth_get_se(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_SE_SHIFT) & + IB_BTH_SE_MASK); +} + +static inline u32 ib_bth_get_psn(struct ib_other_headers *ohdr) +{ + return (u32)(be32_to_cpu(ohdr->bth[2])); +} + +static inline u32 ib_bth_get_qpn(struct ib_other_headers *ohdr) +{ + return (u32)((be32_to_cpu(ohdr->bth[1])) & IB_QPN_MASK); +} + +static inline u8 ib_bth_get_becn(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[1]) >> IB_BECN_SHIFT) & + IB_BECN_MASK); +} + +static inline u8 ib_bth_get_fecn(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[1]) >> IB_FECN_SHIFT) & + IB_FECN_MASK); +} + +static inline u8 ib_bth_get_tver(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_TVER_SHIFT) & + IB_BTH_TVER_MASK); +} + #endif /* IB_HDRS_H */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 
ba8314ec5768..8f1ce4e27bbd 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -664,6 +664,8 @@ union rdma_network_hdr { }; }; +#define IB_QPN_MASK 0xFFFFFF + enum { IB_MULTICAST_QPN = 0xffffff }; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index be6472e5b06b..13f43b3527a8 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -396,7 +396,7 @@ struct rvt_srq { #define RVT_QPNMAP_ENTRIES (RVT_QPN_MAX / PAGE_SIZE / BITS_PER_BYTE) #define RVT_BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) #define RVT_BITS_PER_PAGE_MASK (RVT_BITS_PER_PAGE - 1) -#define RVT_QPN_MASK 0xFFFFFF +#define RVT_QPN_MASK IB_QPN_MASK /* * QPN-map pages start out as NULL, they get allocated upon -- cgit v1.2.3 From 14fe13fcd3afb96b06809f280b586be1c998332c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 12 May 2017 09:20:31 -0700 Subject: IB/rdmavt: Compress adjacent SGEs in rvt_lkey_ok() SGEs that are contiguous needlessly consume driver dependent TX resources. The lkey validation logic is enhanced to compress the SGE that ends up in the send wqe when consecutive addresses are detected. The lkey validation API used to return 1 (success) or 0 (fail). The return value is now an -errno, 0 (compressed), or 1 (uncompressed). An additional argument is added to pass the last SGE for the compression. Loopback callers always pass a NULL to last_sge since the optimization is of little benefit in that situation. 
Reviewed-by: Dennis Dalessandro Signed-off-by: Brian Welty Signed-off-by: Venkata Sandeep Dhanalakota Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/ruc.c | 2 +- drivers/infiniband/hw/qib/qib_ruc.c | 2 +- drivers/infiniband/sw/rdmavt/mr.c | 51 +++++++++++++++++++++++---- drivers/infiniband/sw/rdmavt/qp.c | 23 ++++++------ drivers/infiniband/sw/rdmavt/trace_mr.h | 62 +++++++++++++++++++++++++++++++++ drivers/infiniband/sw/rdmavt/trace_tx.h | 11 +++--- include/rdma/rdma_vt.h | 3 +- 7 files changed, 130 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 9cc9c7be9dd4..476fe5da2992 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -75,7 +75,7 @@ static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) continue; /* Check LKEY */ if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, - &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + NULL, &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) goto bad_lkey; qp->r_len += wqe->sg_list[i].length; j++; diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index bd09de7c6e56..88d84cbf7e5a 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -59,7 +59,7 @@ static int qib_init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) continue; /* Check LKEY */ if (!rvt_lkey_ok(rkt, pd, j ? 
&ss->sg_list[j - 1] : &ss->sge, - &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + NULL, &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) goto bad_lkey; qp->r_len += wqe->sg_list[i].length; j++; diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index aa5f9ea318e4..ea95672d9675 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -777,24 +777,55 @@ out: return ret; } +/** + * rvt_sge_adjacent - is isge compressible + * @isge: outgoing internal SGE + * @last_sge: last outgoing SGE written + * @sge: SGE to check + * + * If adjacent will update last_sge to add length. + * + * Return: true if isge is adjacent to last sge + */ +static inline bool rvt_sge_adjacent(struct rvt_sge *isge, + struct rvt_sge *last_sge, + struct ib_sge *sge) +{ + if (last_sge && sge->lkey == last_sge->mr->lkey && + ((uint64_t)(last_sge->vaddr + last_sge->length) == sge->addr)) { + if (sge->lkey) { + if (unlikely((sge->addr - last_sge->mr->user_base + + sge->length > last_sge->mr->length))) + return false; /* overrun, caller will catch */ + } else { + last_sge->length += sge->length; + } + last_sge->sge_length += sge->length; + trace_rvt_sge_adjacent(last_sge, sge); + return true; + } + return false; +} + /** * rvt_lkey_ok - check IB SGE for validity and initialize * @rkt: table containing lkey to check SGE against * @pd: protection domain * @isge: outgoing internal SGE + * @last_sge: last outgoing SGE written * @sge: SGE to check * @acc: access flags * * Check the IB SGE for validity and initialize our internal version * of it. * - * Return: 1 if valid and successful, otherwise returns 0. - * - * increments the reference count upon success + * Increments the reference count when a new sge is stored. * + * Return: 0 if compressed, 1 if added , otherwise returns -errno. 
*/ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, - struct rvt_sge *isge, struct ib_sge *sge, int acc) + struct rvt_sge *isge, struct rvt_sge *last_sge, + struct ib_sge *sge, int acc) { struct rvt_mregion *mr; unsigned n, m; @@ -804,12 +835,14 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, * We use LKEY == zero for kernel virtual addresses * (see rvt_get_dma_mr() and dma_virt_ops). */ - rcu_read_lock(); if (sge->lkey == 0) { struct rvt_dev_info *dev = ib_to_rvt(pd->ibpd.device); if (pd->user) - goto bail; + return -EINVAL; + if (rvt_sge_adjacent(isge, last_sge, sge)) + return 0; + rcu_read_lock(); mr = rcu_dereference(dev->dma_mr); if (!mr) goto bail; @@ -824,6 +857,9 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, isge->n = 0; goto ok; } + if (rvt_sge_adjacent(isge, last_sge, sge)) + return 0; + rcu_read_lock(); mr = rcu_dereference(rkt->table[sge->lkey >> rkt->shift]); if (!mr) goto bail; @@ -874,12 +910,13 @@ int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, isge->m = m; isge->n = n; ok: + trace_rvt_sge_new(isge, sge); return 1; bail_unref: rvt_put_mr(mr); bail: rcu_read_unlock(); - return 0; + return -EINVAL; } EXPORT_SYMBOL(rvt_lkey_ok); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 727e81cc2c8f..a3dd1e536860 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1646,7 +1646,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, struct rvt_pd *pd; struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); u8 log_pmtu; - int ret; + int ret, incr; size_t cplen; bool reserved_op; int local_ops_delayed = 0; @@ -1719,22 +1719,23 @@ static int rvt_post_one_wr(struct rvt_qp *qp, wqe->length = 0; j = 0; if (wr->num_sge) { + struct rvt_sge *last_sge = NULL; + acc = wr->opcode >= IB_WR_RDMA_READ ? 
IB_ACCESS_LOCAL_WRITE : 0; for (i = 0; i < wr->num_sge; i++) { u32 length = wr->sg_list[i].length; - int ok; if (length == 0) continue; - ok = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], - &wr->sg_list[i], acc); - if (!ok) { - ret = -EINVAL; - goto bail_inval_free; - } + incr = rvt_lkey_ok(rkt, pd, &wqe->sg_list[j], last_sge, + &wr->sg_list[i], acc); + if (unlikely(incr < 0)) + goto bail_lkey_error; wqe->length += length; - j++; + if (incr) + last_sge = &wqe->sg_list[j]; + j += incr; } wqe->wr.num_sge = j; } @@ -1781,12 +1782,14 @@ static int rvt_post_one_wr(struct rvt_qp *qp, wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED; qp->s_avail--; } - trace_rvt_post_one_wr(qp, wqe); + trace_rvt_post_one_wr(qp, wqe, wr->num_sge); smp_wmb(); /* see request builders */ qp->s_head = next; return 0; +bail_lkey_error: + ret = incr; bail_inval_free: /* release mr holds */ while (j) { diff --git a/drivers/infiniband/sw/rdmavt/trace_mr.h b/drivers/infiniband/sw/rdmavt/trace_mr.h index 3318a6c36373..976e482930a3 100644 --- a/drivers/infiniband/sw/rdmavt/trace_mr.h +++ b/drivers/infiniband/sw/rdmavt/trace_mr.h @@ -103,6 +103,68 @@ DEFINE_EVENT( TP_PROTO(struct rvt_mregion *mr, u16 m, u16 n, void *v, size_t len), TP_ARGS(mr, m, n, v, len)); +DECLARE_EVENT_CLASS( + rvt_sge_template, + TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), + TP_ARGS(sge, isge), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(sge->mr->pd->device)) + __field(struct rvt_mregion *, mr) + __field(struct rvt_sge *, sge) + __field(struct ib_sge *, isge) + __field(void *, vaddr) + __field(u64, ivaddr) + __field(u32, lkey) + __field(u32, sge_length) + __field(u32, length) + __field(u32, ilength) + __field(int, user) + __field(u16, m) + __field(u16, n) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(sge->mr->pd->device)); + __entry->mr = sge->mr; + __entry->sge = sge; + __entry->isge = isge; + __entry->vaddr = sge->vaddr; + __entry->ivaddr = isge->addr; + __entry->lkey = sge->mr->lkey; + __entry->sge_length = 
sge->sge_length; + __entry->length = sge->length; + __entry->ilength = isge->length; + __entry->m = sge->m; + __entry->n = sge->m; + __entry->user = ibpd_to_rvtpd(sge->mr->pd)->user; + ), + TP_printk( + "[%s] mr %p sge %p isge %p vaddr %p ivaddr %llx lkey %x sge_length %u length %u ilength %u m %u n %u user %u", + __get_str(dev), + __entry->mr, + __entry->sge, + __entry->isge, + __entry->vaddr, + __entry->ivaddr, + __entry->lkey, + __entry->sge_length, + __entry->length, + __entry->ilength, + __entry->m, + __entry->n, + __entry->user + ) +); + +DEFINE_EVENT( + rvt_sge_template, rvt_sge_adjacent, + TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), + TP_ARGS(sge, isge)); + +DEFINE_EVENT( + rvt_sge_template, rvt_sge_new, + TP_PROTO(struct rvt_sge *sge, struct ib_sge *isge), + TP_ARGS(sge, isge)); + #endif /* __RVT_TRACE_MR_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/sw/rdmavt/trace_tx.h b/drivers/infiniband/sw/rdmavt/trace_tx.h index a613a2223751..0ef25fc49f25 100644 --- a/drivers/infiniband/sw/rdmavt/trace_tx.h +++ b/drivers/infiniband/sw/rdmavt/trace_tx.h @@ -84,12 +84,12 @@ __print_symbolic(opcode, \ wr_opcode_name(RESERVED10)) #define POS_PRN \ -"[%s] wqe %p wr_id %llx send_flags %x qpn %x qpt %u psn %x lpsn %x ssn %x length %u opcode 0x%.2x,%s size %u avail %u head %u last %u pid %u num_sge %u" +"[%s] wqe %p wr_id %llx send_flags %x qpn %x qpt %u psn %x lpsn %x ssn %x length %u opcode 0x%.2x,%s size %u avail %u head %u last %u pid %u num_sge %u wr_num_sge %u" TRACE_EVENT( rvt_post_one_wr, - TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe), - TP_ARGS(qp, wqe), + TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, int wr_num_sge), + TP_ARGS(qp, wqe, wr_num_sge), TP_STRUCT__entry( RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) __field(u64, wr_id) @@ -108,6 +108,7 @@ TRACE_EVENT( __field(int, send_flags) __field(pid_t, pid) __field(int, num_sge) + __field(int, wr_num_sge) ), TP_fast_assign( RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) @@ -127,6 +128,7 @@ 
TRACE_EVENT( __entry->ssn = wqe->ssn; __entry->send_flags = wqe->wr.send_flags; __entry->num_sge = wqe->wr.num_sge; + __entry->wr_num_sge = wr_num_sge; ), TP_printk( POS_PRN, @@ -146,7 +148,8 @@ TRACE_EVENT( __entry->head, __entry->last, __entry->pid, - __entry->num_sge + __entry->num_sge, + __entry->wr_num_sge ) ); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 4878aaf7bdff..d0b9f91e5f4d 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -515,7 +515,8 @@ int rvt_invalidate_rkey(struct rvt_qp *qp, u32 rkey); int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, u32 len, u64 vaddr, u32 rkey, int acc); int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, - struct rvt_sge *isge, struct ib_sge *sge, int acc); + struct rvt_sge *isge, struct rvt_sge *last_sge, + struct ib_sge *sge, int acc); struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid, u16 lid); -- cgit v1.2.3 From cb49366f3616fdf197893c24a5b2677b8c26ce29 Mon Sep 17 00:00:00 2001 From: "Vishwanathapura, Niranjana" Date: Thu, 1 Jun 2017 17:04:02 -0700 Subject: IB/core,rdmavt,hfi1,opa-vnic: Send OPA cap_mask3 in trap Provide the ability for IB clients to modify the OPA specific capability mask and include this mask in the subsequent trap data. Reviewed-by: Niranjana Vishwanathapura Signed-off-by: Michael N. 
Henry Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 7 ++----- drivers/infiniband/hw/hfi1/mad.h | 2 +- drivers/infiniband/hw/hfi1/verbs.c | 6 +++++- drivers/infiniband/sw/rdmavt/vt.c | 9 +++++++-- drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c | 27 ++++++++++++++++++++++++- include/rdma/ib_verbs.h | 3 ++- include/rdma/rdma_vt.h | 1 + 7 files changed, 44 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 5977673a52d4..70831ad621b0 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -260,6 +260,7 @@ void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) data.issuer_lid = cpu_to_be32(lid); data.ntc_144.lid = data.issuer_lid; data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + data.ntc_144.cap_mask3 = cpu_to_be16(ibp->rvp.port_cap3_flags); send_trap(ibp, &data, sizeof(data)); } @@ -704,11 +705,7 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT; pi->buffer_units = cpu_to_be32(buffer_units); - pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported | - OPA_CAP_MASK3_IsEthOnFabricSupported); - /* Driver does not support mcast/collective configuration */ - pi->opa_cap_mask &= - cpu_to_be16(~OPA_CAP_MASK3_IsAddrRangeConfigSupported); + pi->opa_cap_mask = cpu_to_be16(ibp->rvp.port_cap3_flags); pi->collectivemask_multicastmask = ((HFI1_COLLECTIVE_NR & 0x7) << 3 | (HFI1_MCAST_NR & 0x7)); diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h index 5aa3fd1be653..a4e2506bd5ca 100644 --- a/drivers/infiniband/hw/hfi1/mad.h +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -115,7 +115,7 @@ struct opa_mad_notice_attr { __be32 lid; /* LID where change occurred */ __be32 new_cap_mask; /* new capability mask */ __be16 reserved2; - __be16 cap_mask; + __be16 cap_mask3; __be16 change_flags; /* 
low 4 bits only */ } __packed ntc_144; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index af54d3f4696a..2d7759f0c6b4 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1537,9 +1537,13 @@ static void init_ibport(struct hfi1_pportdata *ppd) /* Set the prefix to the default value (see ch. 4.1.1) */ ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX; ibp->rvp.sm_lid = 0; - /* Below should only set bits defined in OPA PortInfo.CapabilityMask */ + /* + * Below should only set bits defined in OPA PortInfo.CapabilityMask + * and PortInfo.CapabilityMask3 + */ ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP | IB_PORT_CAP_MASK_NOTICE_SUP; + ibp->rvp.port_cap3_flags = OPA_CAP_MASK3_IsSharedSpaceSupported; ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 0d7c6bb551d9..64bdd442078a 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -202,8 +202,13 @@ static int rvt_modify_port(struct ib_device *ibdev, u8 port_num, return -EINVAL; rvp = rdi->ports[port_index]; - rvp->port_cap_flags |= props->set_port_cap_mask; - rvp->port_cap_flags &= ~props->clr_port_cap_mask; + if (port_modify_mask & IB_PORT_OPA_MASK_CHG) { + rvp->port_cap3_flags |= props->set_port_cap_mask; + rvp->port_cap3_flags &= ~props->clr_port_cap_mask; + } else { + rvp->port_cap_flags |= props->set_port_cap_mask; + rvp->port_cap_flags &= ~props->clr_port_cap_mask; + } if (props->set_port_cap_mask || props->clr_port_cap_mask) rdi->driver_f.cap_mask_chg(rdi, port_num); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c index 875694f9a7f9..32cdd7a35415 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c +++ 
b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -52,7 +52,9 @@ #include #include -#include +#include +#include +#include #include "opa_vnic_internal.h" @@ -979,6 +981,27 @@ static int vema_register(struct opa_vnic_ctrl_port *cport) return 0; } +/** + * opa_vnic_ctrl_config_dev -- This function sends a trap to the EM + * by way of ib_modify_port to indicate support for ethernet on the + * fabric. + * @cport: pointer to control port + * @en: enable or disable ethernet on fabric support + */ +static void opa_vnic_ctrl_config_dev(struct opa_vnic_ctrl_port *cport, bool en) +{ + struct ib_port_modify pm = { 0 }; + int i; + + if (en) + pm.set_port_cap_mask = OPA_CAP_MASK3_IsEthOnFabricSupported; + else + pm.clr_port_cap_mask = OPA_CAP_MASK3_IsEthOnFabricSupported; + + for (i = 1; i <= cport->num_ports; i++) + ib_modify_port(cport->ibdev, i, IB_PORT_OPA_MASK_CHG, &pm); +} + /** * opa_vnic_vema_add_one -- Handle new ib device * @device: ib device pointer @@ -1007,6 +1030,7 @@ static void opa_vnic_vema_add_one(struct ib_device *device) c_info("VNIC client initialized\n"); ib_set_client_data(device, &opa_vnic_client, cport); + opa_vnic_ctrl_config_dev(cport, true); } /** @@ -1025,6 +1049,7 @@ static void opa_vnic_vema_rem_one(struct ib_device *device, return; c_info("removing VNIC client\n"); + opa_vnic_ctrl_config_dev(cport, false); vema_unregister(cport); kfree(cport); } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 8f1ce4e27bbd..9d4d2a74c95e 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -577,7 +577,8 @@ struct ib_device_modify { enum ib_port_modify_flags { IB_PORT_SHUTDOWN = 1, IB_PORT_INIT_TYPE = (1<<2), - IB_PORT_RESET_QKEY_CNTR = (1<<3) + IB_PORT_RESET_QKEY_CNTR = (1<<3), + IB_PORT_OPA_MASK_CHG = (1<<4) }; struct ib_port_modify { diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index d0b9f91e5f4d..0f18ffd98dd7 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -75,6 +75,7 @@ struct 
rvt_ibport { __be64 mkey; u64 tid; u32 port_cap_flags; + u16 port_cap3_flags; u32 pma_sample_start; u32 pma_sample_interval; __be16 pma_counter_select[5]; -- cgit v1.2.3 From d41861942fc55c14b6280d9568a0d0112037f065 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Wed, 14 Jun 2017 23:13:34 +0300 Subject: IB/core: Add generic function to extract IB speed from netdev Logic of retrieving netdev speed from net_device and translating it to IB speed is implemented in rxe, in usnic and in bnxt drivers. Define new function which merges all. Signed-off-by: Yuval Shaia Reviewed-by: Christian Benvenuti Reviewed-by: Selvin Xavier Reviewed-by: Moni Shoua Signed-off-by: Doug Ledford --- drivers/infiniband/core/roce_gid_mgmt.c | 2 + drivers/infiniband/core/verbs.c | 55 ++++++++++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 49 ++----------------------- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 31 +++------------- drivers/infiniband/sw/rxe/rxe_verbs.c | 53 +++------------------------ include/rdma/ib_verbs.h | 1 + 6 files changed, 73 insertions(+), 118 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 94a9eefb3cfc..90e3889b7fbe 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -44,6 +44,8 @@ static struct workqueue_struct *gid_cache_wq; +static struct workqueue_struct *gid_cache_wq; + enum gid_op_type { GID_DEL = 0, GID_ADD diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index fb98ed67d5bc..40de69bf07cd 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1302,6 +1302,61 @@ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, } EXPORT_SYMBOL(ib_modify_qp_with_udata); +int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) +{ + int rc; + u32 netdev_speed; + struct net_device *netdev; + struct ethtool_link_ksettings 
lksettings; + + if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + if (!dev->get_netdev) + return -EOPNOTSUPP; + + netdev = dev->get_netdev(dev, port_num); + if (!netdev) + return -ENODEV; + + rtnl_lock(); + rc = __ethtool_get_link_ksettings(netdev, &lksettings); + rtnl_unlock(); + + dev_put(netdev); + + if (!rc) { + netdev_speed = lksettings.base.speed; + } else { + netdev_speed = SPEED_1000; + pr_warn("%s speed is unknown, defaulting to %d\n", netdev->name, + netdev_speed); + } + + if (netdev_speed <= SPEED_1000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_SDR; + } else if (netdev_speed <= SPEED_10000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_FDR10; + } else if (netdev_speed <= SPEED_20000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_DDR; + } else if (netdev_speed <= SPEED_25000) { + *width = IB_WIDTH_1X; + *speed = IB_SPEED_EDR; + } else if (netdev_speed <= SPEED_40000) { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_FDR10; + } else { + *width = IB_WIDTH_4X; + *speed = IB_SPEED_EDR; + } + + return 0; +} +EXPORT_SYMBOL(ib_get_eth_speed); + int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 5dc6e7ce3ab9..b10e1a6dce84 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -223,50 +223,6 @@ int bnxt_re_modify_device(struct ib_device *ibdev, return 0; } -static void __to_ib_speed_width(struct net_device *netdev, u8 *speed, u8 *width) -{ - struct ethtool_link_ksettings lksettings; - u32 espeed; - - if (netdev->ethtool_ops && netdev->ethtool_ops->get_link_ksettings) { - memset(&lksettings, 0, sizeof(lksettings)); - rtnl_lock(); - netdev->ethtool_ops->get_link_ksettings(netdev, &lksettings); - rtnl_unlock(); - espeed = lksettings.base.speed; - } else { - espeed = SPEED_UNKNOWN; - } - switch (espeed) { - case SPEED_1000: - *speed = 
IB_SPEED_SDR; - *width = IB_WIDTH_1X; - break; - case SPEED_10000: - *speed = IB_SPEED_QDR; - *width = IB_WIDTH_1X; - break; - case SPEED_20000: - *speed = IB_SPEED_DDR; - *width = IB_WIDTH_4X; - break; - case SPEED_25000: - *speed = IB_SPEED_EDR; - *width = IB_WIDTH_1X; - break; - case SPEED_40000: - *speed = IB_SPEED_QDR; - *width = IB_WIDTH_4X; - break; - case SPEED_50000: - break; - default: - *speed = IB_SPEED_SDR; - *width = IB_WIDTH_1X; - break; - } -} - /* Port */ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, struct ib_port_attr *port_attr) @@ -308,8 +264,9 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, * IB stack to avoid race in the NETDEV_UNREG path */ if (test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) - __to_ib_speed_width(rdev->netdev, &port_attr->active_speed, - &port_attr->active_width); + if (!ib_get_eth_speed(ibdev, port_num, &port_attr->active_speed, + &port_attr->active_width)) + return -EINVAL; return 0; } diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index f9dc1e80c3b7..e5f57dd49980 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -226,27 +226,6 @@ static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp) spin_unlock(&vf->lock); } -static void eth_speed_to_ib_speed(int speed, u8 *active_speed, - u8 *active_width) -{ - if (speed <= 10000) { - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_FDR10; - } else if (speed <= 20000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_DDR; - } else if (speed <= 30000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_QDR; - } else if (speed <= 40000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_FDR10; - } else { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_EDR; - } -} - static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd) { if (cmd.spec.trans_type <= 
USNIC_TRANSPORT_UNKNOWN || @@ -326,12 +305,16 @@ int usnic_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { struct usnic_ib_dev *us_ibdev = to_usdev(ibdev); - struct ethtool_link_ksettings cmd; usnic_dbg("\n"); mutex_lock(&us_ibdev->usdev_lock); - __ethtool_get_link_ksettings(us_ibdev->netdev, &cmd); + if (!ib_get_eth_speed(ibdev, port, &props->active_speed, + &props->active_width)) { + mutex_unlock(&us_ibdev->usdev_lock); + return -EINVAL; + } + /* props being zeroed by the caller, avoid zeroing it here */ props->lid = 0; @@ -355,8 +338,6 @@ int usnic_ib_query_port(struct ib_device *ibdev, u8 port, props->pkey_tbl_len = 1; props->bad_pkey_cntr = 0; props->qkey_viol_cntr = 0; - eth_speed_to_ib_speed(cmd.base.speed, &props->active_speed, - &props->active_width); props->max_mtu = IB_MTU_4096; props->active_mtu = iboe_get_mtu(us_ibdev->ufdev->mtu); /* Userspace will adjust for hdrs */ diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index af90a7d42b96..e6c10e43a6d7 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -51,40 +51,16 @@ static int rxe_query_device(struct ib_device *dev, return 0; } -static void rxe_eth_speed_to_ib_speed(int speed, u8 *active_speed, - u8 *active_width) -{ - if (speed <= 1000) { - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_SDR; - } else if (speed <= 10000) { - *active_width = IB_WIDTH_1X; - *active_speed = IB_SPEED_FDR10; - } else if (speed <= 20000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_DDR; - } else if (speed <= 30000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_QDR; - } else if (speed <= 40000) { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_FDR10; - } else { - *active_width = IB_WIDTH_4X; - *active_speed = IB_SPEED_EDR; - } -} - static int rxe_query_port(struct ib_device *dev, u8 port_num, struct ib_port_attr *attr) { struct rxe_dev *rxe = to_rdev(dev); struct 
rxe_port *port; - u32 speed; + int rc = -EINVAL; if (unlikely(port_num != 1)) { pr_warn("invalid port_number %d\n", port_num); - goto err1; + goto out; } port = &rxe->port; @@ -93,29 +69,12 @@ static int rxe_query_port(struct ib_device *dev, *attr = port->attr; mutex_lock(&rxe->usdev_lock); - if (rxe->ndev->ethtool_ops->get_link_ksettings) { - struct ethtool_link_ksettings ks; - - rxe->ndev->ethtool_ops->get_link_ksettings(rxe->ndev, &ks); - speed = ks.base.speed; - } else if (rxe->ndev->ethtool_ops->get_settings) { - struct ethtool_cmd cmd; - - rxe->ndev->ethtool_ops->get_settings(rxe->ndev, &cmd); - speed = cmd.speed; - } else { - pr_warn("%s speed is unknown, defaulting to 1000\n", - rxe->ndev->name); - speed = 1000; - } - rxe_eth_speed_to_ib_speed(speed, &attr->active_speed, - &attr->active_width); + rc = ib_get_eth_speed(dev, port_num, &attr->active_speed, + &attr->active_width); mutex_unlock(&rxe->usdev_lock); - return 0; - -err1: - return -EINVAL; +out: + return rc; } static int rxe_query_gid(struct ib_device *device, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b5732432bb29..68d947dac9a2 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3555,6 +3555,7 @@ void ib_drain_qp(struct ib_qp *qp); int ib_resolve_eth_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr); +int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width); static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr) { -- cgit v1.2.3 From bded747bb432bc5f7ad6d84ea747368b70ed9df2 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Tue, 30 May 2017 09:42:53 +0300 Subject: net/mlx5: Add raw ethernet local loopback firmware command Add support for raw ethernet local loopback firmware command. 
Signed-off-by: Huy Nguyen Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 62 +++++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 11 +++-- include/linux/mlx5/vport.h | 3 +- 3 files changed, 72 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 5abfec1c3399..d653b0025b13 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -897,6 +897,68 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, } EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_promisc); +enum { + UC_LOCAL_LB, + MC_LOCAL_LB +}; + +int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + mlx5_core_dbg(mdev, "%s local_lb\n", enable ? "enable" : "disable"); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_mc_local_lb, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.disable_mc_local_lb, !enable); + + MLX5_SET(modify_nic_vport_context_in, in, + field_select.disable_uc_local_lb, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.disable_uc_local_lb, !enable); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_update_local_lb); + +int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status) +{ + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + u32 *out; + int value; + int err; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_query_nic_vport_context(mdev, 0, out, outlen); + if (err) + goto out; + + value = MLX5_GET(query_nic_vport_context_out, out, + 
nic_vport_context.disable_mc_local_lb) << MC_LOCAL_LB; + + value |= MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.disable_uc_local_lb) << UC_LOCAL_LB; + + *status = !value; + +out: + kfree(out); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_query_local_lb); + enum mlx5_vport_roce_state { MLX5_VPORT_ROCE_DISABLED = 0, MLX5_VPORT_ROCE_ENABLED = 1, diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 87869c04849a..57c75e8b3c19 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1016,7 +1016,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_max_wq_sz[0x5]; u8 nic_vport_change_event[0x1]; - u8 reserved_at_3e1[0xa]; + u8 disable_local_lb[0x1]; + u8 reserved_at_3e2[0x9]; u8 log_max_vlan_list[0x5]; u8 reserved_at_3f0[0x3]; u8 log_max_current_mc_list[0x5]; @@ -2562,7 +2563,9 @@ struct mlx5_ifc_rmpc_bits { struct mlx5_ifc_nic_vport_context_bits { u8 reserved_at_0[0x5]; u8 min_wqe_inline_mode[0x3]; - u8 reserved_at_8[0x17]; + u8 reserved_at_8[0x15]; + u8 disable_mc_local_lb[0x1]; + u8 disable_uc_local_lb[0x1]; u8 roce_en[0x1]; u8 arm_change_event[0x1]; @@ -5229,7 +5232,9 @@ struct mlx5_ifc_modify_nic_vport_context_out_bits { }; struct mlx5_ifc_modify_nic_vport_field_select_bits { - u8 reserved_at_0[0x16]; + u8 reserved_at_0[0x14]; + u8 disable_uc_local_lb[0x1]; + u8 disable_mc_local_lb[0x1]; u8 node_guid[0x1]; u8 port_guid[0x1]; u8 min_inline[0x1]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 656c70b65dd2..aaa0bb9e7655 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -114,5 +114,6 @@ int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, int vf, struct mlx5_hca_vport_context *req); - +int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable); +int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status); #endif /* __MLX5_VPORT_H__ */ -- cgit v1.2.3 From 
4a2da0b8c0782816f3ae6846ae7942fcbb0f8172 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 30 May 2017 10:05:15 +0300 Subject: IB/mlx5: Add debug control parameters for congestion control This patch adds debug control parameters for congestion control which can be read or written through debugfs. They are for reaction point and notification point nodes. These control parameters are as below: +------------------------------+-----------------------------------------+ | Name | Description | |------------------------------+-----------------------------------------| |rp_clamp_tgt_rate | When set target rate is updated to | | | current rate | |------------------------------+-----------------------------------------| |rp_clamp_tgt_rate_ati | When set update target rate based on | | | timer as well | |------------------------------+-----------------------------------------| |rp_time_reset | time between rate increase if no | | | CNP is received unit in usec | |------------------------------+-----------------------------------------| |rp_byte_reset | Number of bytes between rate increase if | | | no CNP is received | |------------------------------+-----------------------------------------| |rp_threshold | Threshold for reaction point rate | | | control | |------------------------------+-----------------------------------------| |rp_ai_rate | Rate for target rate, unit in Mbps | |------------------------------+-----------------------------------------| |rp_hai_rate | Rate for hyper increase state | | | unit in Mbps | |------------------------------+-----------------------------------------| |rp_min_dec_fac | Minimum factor by which the current | | | transmit rate can be changed when | | | processing a CNP, unit is percentage | |------------------------------+-----------------------------------------| |rp_min_rate | Minimum value for rate limit, | | | unit in Mbps | |------------------------------+-----------------------------------------| |rp_rate_to_set_on_first_cnp | 
Rate that is set when first CNP is | | | received, unit is Mbps | |------------------------------+-----------------------------------------| |rp_dce_tcp_g | Used to calculate alpha | |------------------------------+-----------------------------------------| |rp_dce_tcp_rtt | Time between updates of alpha value, | | | unit is usec | |------------------------------+-----------------------------------------| |rp_rate_reduce_monitor_period | Minimum time between consecutive rate | | | reductions | |------------------------------+-----------------------------------------| |rp_initial_alpha_value | Initial value of alpha | |------------------------------+-----------------------------------------| |rp_gd | When CNP is received, flow rate is | | | reduced based on gd, rp_gd is given as | | | log2(rp_gd) | |------------------------------+-----------------------------------------| |np_cnp_dscp | dscp code point for generated cnp | |------------------------------+-----------------------------------------| |np_cnp_prio_mode | cnp priority mode | |------------------------------+-----------------------------------------| |np_cnp_prio | 802.1p priority for generated cnp | +------------------------------+-----------------------------------------+ Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Reviewed-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/Makefile | 2 +- drivers/infiniband/hw/mlx5/cmd.c | 20 ++ drivers/infiniband/hw/mlx5/cmd.h | 4 + drivers/infiniband/hw/mlx5/cong.c | 421 +++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/main.c | 9 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 37 +++ include/linux/mlx5/mlx5_ifc.h | 3 +- 7 files changed, 493 insertions(+), 3 deletions(-) create mode 100644 drivers/infiniband/hw/mlx5/cong.c (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 90ad2adc752f..bc6299697dda 100644 --- 
a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o -mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o cong.o mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 18d5e1db93ed..470995fa38d2 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -57,3 +57,23 @@ int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, MLX5_SET(query_cong_statistics_in, in, clear, reset); return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); } + +int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, + void *out, int out_size) +{ + u32 in[MLX5_ST_SZ_DW(query_cong_params_in)] = { }; + + MLX5_SET(query_cong_params_in, in, opcode, + MLX5_CMD_OP_QUERY_CONG_PARAMS); + MLX5_SET(query_cong_params_in, in, cong_protocol, cong_point); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size); +} + +int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *dev, + void *in, int in_size) +{ + u32 out[MLX5_ST_SZ_DW(modify_cong_params_out)] = { }; + + return mlx5_cmd_exec(dev, in, in_size, out, sizeof(out)); +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index fa09228193a6..af4c24596274 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -39,4 +39,8 @@ int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); int mlx5_cmd_query_cong_counter(struct mlx5_core_dev *dev, bool reset, void *out, int out_size); +int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, + void *out, int out_size); +int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev, + void *in, int in_size); #endif /* MLX5_IB_CMD_H */ diff --git 
a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c new file mode 100644 index 000000000000..2d32b519bb61 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cong.c @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2013-2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include "mlx5_ib.h" +#include "cmd.h" + +enum mlx5_ib_cong_node_type { + MLX5_IB_RROCE_ECN_RP = 1, + MLX5_IB_RROCE_ECN_NP = 2, +}; + +static const char * const mlx5_ib_dbg_cc_name[] = { + "rp_clamp_tgt_rate", + "rp_clamp_tgt_rate_ati", + "rp_time_reset", + "rp_byte_reset", + "rp_threshold", + "rp_ai_rate", + "rp_hai_rate", + "rp_min_dec_fac", + "rp_min_rate", + "rp_rate_to_set_on_first_cnp", + "rp_dce_tcp_g", + "rp_dce_tcp_rtt", + "rp_rate_reduce_monitor_period", + "rp_initial_alpha_value", + "rp_gd", + "np_cnp_dscp", + "np_cnp_prio_mode", + "np_cnp_prio", +}; + +#define MLX5_IB_RP_CLAMP_TGT_RATE_ATTR BIT(1) +#define MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR BIT(2) +#define MLX5_IB_RP_TIME_RESET_ATTR BIT(3) +#define MLX5_IB_RP_BYTE_RESET_ATTR BIT(4) +#define MLX5_IB_RP_THRESHOLD_ATTR BIT(5) +#define MLX5_IB_RP_AI_RATE_ATTR BIT(7) +#define MLX5_IB_RP_HAI_RATE_ATTR BIT(8) +#define MLX5_IB_RP_MIN_DEC_FAC_ATTR BIT(9) +#define MLX5_IB_RP_MIN_RATE_ATTR BIT(10) +#define MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR BIT(11) +#define MLX5_IB_RP_DCE_TCP_G_ATTR BIT(12) +#define MLX5_IB_RP_DCE_TCP_RTT_ATTR BIT(13) +#define MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR BIT(14) +#define MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR BIT(15) +#define MLX5_IB_RP_GD_ATTR BIT(16) + +#define MLX5_IB_NP_CNP_DSCP_ATTR BIT(3) +#define MLX5_IB_NP_CNP_PRIO_MODE_ATTR BIT(4) + +static enum mlx5_ib_cong_node_type +mlx5_ib_param_to_node(enum mlx5_ib_dbg_cc_types param_offset) +{ + if (param_offset >= MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE && + param_offset <= MLX5_IB_DBG_CC_RP_GD) + return MLX5_IB_RROCE_ECN_RP; + else + return MLX5_IB_RROCE_ECN_NP; +} + +static u32 mlx5_get_cc_param_val(void *field, int offset) +{ + switch (offset) { + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate); + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate_after_time_inc); + case 
MLX5_IB_DBG_CC_RP_TIME_RESET: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_time_reset); + case MLX5_IB_DBG_CC_RP_BYTE_RESET: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_byte_reset); + case MLX5_IB_DBG_CC_RP_THRESHOLD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_threshold); + case MLX5_IB_DBG_CC_RP_AI_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_ai_rate); + case MLX5_IB_DBG_CC_RP_HAI_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_hai_rate); + case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_min_dec_fac); + case MLX5_IB_DBG_CC_RP_MIN_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_min_rate); + case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rate_to_set_on_first_cnp); + case MLX5_IB_DBG_CC_RP_DCE_TCP_G: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + dce_tcp_g); + case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + dce_tcp_rtt); + case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rate_reduce_monitor_period); + case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + initial_alpha_value); + case MLX5_IB_DBG_CC_RP_GD: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_gd); + case MLX5_IB_DBG_CC_NP_CNP_DSCP: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_dscp); + case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_prio_mode); + case MLX5_IB_DBG_CC_NP_CNP_PRIO: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + cnp_802p_prio); + default: + return 0; + } +} + +static void mlx5_ib_set_cc_param_mask_val(void *field, int offset, + u32 var, u32 *attr_mask) +{ + switch (offset) { + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE: + *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR; + 
MLX5_SET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate, var); + break; + case MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI: + *attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + clamp_tgt_rate_after_time_inc, var); + break; + case MLX5_IB_DBG_CC_RP_TIME_RESET: + *attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_time_reset, var); + break; + case MLX5_IB_DBG_CC_RP_BYTE_RESET: + *attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_byte_reset, var); + break; + case MLX5_IB_DBG_CC_RP_THRESHOLD: + *attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_threshold, var); + break; + case MLX5_IB_DBG_CC_RP_AI_RATE: + *attr_mask |= MLX5_IB_RP_AI_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_ai_rate, var); + break; + case MLX5_IB_DBG_CC_RP_HAI_RATE: + *attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_hai_rate, var); + break; + case MLX5_IB_DBG_CC_RP_MIN_DEC_FAC: + *attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_min_dec_fac, var); + break; + case MLX5_IB_DBG_CC_RP_MIN_RATE: + *attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_min_rate, var); + break; + case MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP: + *attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rate_to_set_on_first_cnp, var); + break; + case MLX5_IB_DBG_CC_RP_DCE_TCP_G: + *attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + dce_tcp_g, var); + break; + case MLX5_IB_DBG_CC_RP_DCE_TCP_RTT: + *attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + dce_tcp_rtt, var); + break; + case MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD: + *attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR; + 
MLX5_SET(cong_control_r_roce_ecn_rp, field, + rate_reduce_monitor_period, var); + break; + case MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE: + *attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + initial_alpha_value, var); + break; + case MLX5_IB_DBG_CC_RP_GD: + *attr_mask |= MLX5_IB_RP_GD_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_gd, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_DSCP: + *attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_dscp, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE: + *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var); + break; + case MLX5_IB_DBG_CC_NP_CNP_PRIO: + *attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0); + MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var); + break; + } +} + +static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, int offset, u32 *var) +{ + int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out); + void *out; + void *field; + int err; + enum mlx5_ib_cong_node_type node; + + out = kvzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + node = mlx5_ib_param_to_node(offset); + + err = mlx5_cmd_query_cong_params(dev->mdev, node, out, outlen); + if (err) + goto free; + + field = MLX5_ADDR_OF(query_cong_params_out, out, congestion_parameters); + *var = mlx5_get_cc_param_val(field, offset); + +free: + kvfree(out); + return err; +} + +static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, int offset, u32 var) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in); + void *in; + void *field; + enum mlx5_ib_cong_node_type node; + u32 attr_mask = 0; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_cong_params_in, in, opcode, + MLX5_CMD_OP_MODIFY_CONG_PARAMS); + + node = mlx5_ib_param_to_node(offset); + MLX5_SET(modify_cong_params_in, in, 
cong_protocol, node); + + field = MLX5_ADDR_OF(modify_cong_params_in, in, congestion_parameters); + mlx5_ib_set_cc_param_mask_val(field, offset, var, &attr_mask); + + field = MLX5_ADDR_OF(modify_cong_params_in, in, field_select); + MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp, + attr_mask); + + err = mlx5_cmd_modify_cong_params(dev->mdev, in, inlen); + kvfree(in); + return err; +} + +static ssize_t set_param(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_ib_dbg_param *param = filp->private_data; + int offset = param->offset; + char lbuf[11] = { }; + u32 var; + int ret; + + if (count > sizeof(lbuf)) + return -EINVAL; + + if (copy_from_user(lbuf, buf, count)) + return -EFAULT; + + lbuf[sizeof(lbuf) - 1] = '\0'; + + if (kstrtou32(lbuf, 0, &var)) + return -EINVAL; + + ret = mlx5_ib_set_cc_params(param->dev, offset, var); + return ret ? ret : count; +} + +static ssize_t get_param(struct file *filp, char __user *buf, size_t count, + loff_t *pos) +{ + struct mlx5_ib_dbg_param *param = filp->private_data; + int offset = param->offset; + u32 var = 0; + int ret; + char lbuf[11]; + + if (*pos) + return 0; + + ret = mlx5_ib_get_cc_params(param->dev, offset, &var); + if (ret) + return ret; + + ret = snprintf(lbuf, sizeof(lbuf), "%d\n", var); + if (ret < 0) + return ret; + + if (copy_to_user(buf, lbuf, ret)) + return -EFAULT; + + *pos += ret; + return ret; +} + +static const struct file_operations dbg_cc_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .write = set_param, + .read = get_param, +}; + +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev) +{ + if (!mlx5_debugfs_root || + !dev->dbg_cc_params || + !dev->dbg_cc_params->root) + return; + + debugfs_remove_recursive(dev->dbg_cc_params->root); + kfree(dev->dbg_cc_params); + dev->dbg_cc_params = NULL; +} + +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_dbg_cc_params *dbg_cc_params; + int i; + + if (!mlx5_debugfs_root) + 
goto out; + + if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed) || + !MLX5_CAP_GEN(dev->mdev, cc_modify_allowed)) + goto out; + + dbg_cc_params = kzalloc(sizeof(*dbg_cc_params), GFP_KERNEL); + if (!dbg_cc_params) + goto out; + + dev->dbg_cc_params = dbg_cc_params; + + dbg_cc_params->root = debugfs_create_dir("cc_params", + dev->mdev->priv.dbg_root); + if (!dbg_cc_params->root) + goto err; + + for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) { + dbg_cc_params->params[i].offset = i; + dbg_cc_params->params[i].dev = dev; + dbg_cc_params->params[i].dentry = + debugfs_create_file(mlx5_ib_dbg_cc_name[i], + 0600, dbg_cc_params->root, + &dbg_cc_params->params[i], + &dbg_cc_fops); + if (!dbg_cc_params->params[i].dentry) + goto err; + } +out: return 0; + +err: + mlx5_ib_warn(dev, "cong debugfs failure\n"); + mlx5_ib_cleanup_cong_debugfs(dev); + /* + * We don't want to fail driver if debugfs failed to initialize, + * so we are not forwarding error to the user. + */ + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bfd9117da10c..a903728627f1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3838,9 +3838,13 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) goto err_odp; } + err = mlx5_ib_init_cong_debugfs(dev); + if (err) + goto err_cnt; + dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); if (!dev->mdev->priv.uar) - goto err_cnt; + goto err_cong; err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); if (err) @@ -3889,6 +3893,8 @@ err_uar_page: mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); err_cnt: + mlx5_ib_cleanup_cong_debugfs(dev); +err_cong: if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) mlx5_ib_dealloc_counters(dev); @@ -3923,6 +3929,7 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); mlx5_free_bfreg(dev->mdev, &dev->bfreg); mlx5_put_uars_page(dev->mdev, mdev->priv.uar); + mlx5_ib_cleanup_cong_debugfs(dev); 
if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) mlx5_ib_dealloc_counters(dev); destroy_umrc_res(dev); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 5fb4547bbcb8..f0682f383b52 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -619,6 +619,39 @@ struct mlx5_roce { enum ib_port_state last_port_state; }; +struct mlx5_ib_dbg_param { + int offset; + struct mlx5_ib_dev *dev; + struct dentry *dentry; +}; + +enum mlx5_ib_dbg_cc_types { + MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE, + MLX5_IB_DBG_CC_RP_CLAMP_TGT_RATE_ATI, + MLX5_IB_DBG_CC_RP_TIME_RESET, + MLX5_IB_DBG_CC_RP_BYTE_RESET, + MLX5_IB_DBG_CC_RP_THRESHOLD, + MLX5_IB_DBG_CC_RP_AI_RATE, + MLX5_IB_DBG_CC_RP_HAI_RATE, + MLX5_IB_DBG_CC_RP_MIN_DEC_FAC, + MLX5_IB_DBG_CC_RP_MIN_RATE, + MLX5_IB_DBG_CC_RP_RATE_TO_SET_ON_FIRST_CNP, + MLX5_IB_DBG_CC_RP_DCE_TCP_G, + MLX5_IB_DBG_CC_RP_DCE_TCP_RTT, + MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD, + MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE, + MLX5_IB_DBG_CC_RP_GD, + MLX5_IB_DBG_CC_NP_CNP_DSCP, + MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE, + MLX5_IB_DBG_CC_NP_CNP_PRIO, + MLX5_IB_DBG_CC_MAX, +}; + +struct mlx5_ib_dbg_cc_params { + struct dentry *root; + struct mlx5_ib_dbg_param params[MLX5_IB_DBG_CC_MAX]; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; @@ -655,6 +688,7 @@ struct mlx5_ib_dev { struct mlx5_ib_port *port; struct mlx5_sq_bfreg bfreg; struct mlx5_sq_bfreg fp_bfreg; + struct mlx5_ib_dbg_cc_params *dbg_cc_params; /* protect the user_td */ struct mutex lb_mutex; @@ -909,6 +943,9 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, int index, enum ib_gid_type *gid_type); +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev); +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev); + /* GSI QP helper functions */ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, struct ib_qp_init_attr 
*init_attr); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 57c75e8b3c19..3e697f672de5 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1188,7 +1188,8 @@ struct mlx5_ifc_cong_control_r_roce_ecn_np_bits { u8 reserved_at_c0[0x12]; u8 cnp_dscp[0x6]; - u8 reserved_at_d8[0x5]; + u8 reserved_at_d8[0x4]; + u8 cnp_prio_mode[0x1]; u8 cnp_802p_prio[0x3]; u8 reserved_at_e0[0x720]; -- cgit v1.2.3 From 7ecf6d8ff154e6f7471ee537a400d43a5f3b1c57 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Tue, 30 May 2017 10:18:24 +0300 Subject: IB/mlx5: Restore IB guid/policy for virtual functions When a user sets port_guid, node_guid or policy of an IB virtual function, save this information in "struct mlx5_vf_context". This information will be restored later when pci_resume is called. To make sure this works, one can use aer-inject to generate PCI errors on mlx5 devices and verify if relevant fields are restored after PCI resume. Signed-off-by: Bodong Wang Reviewed-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/ib_virt.c | 9 ++++++ drivers/net/ethernet/mellanox/mlx5/core/sriov.c | 42 +++++++++++++++++++++++++ include/linux/mlx5/driver.h | 17 +++++----- 3 files changed, 61 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/ib_virt.c b/drivers/infiniband/hw/mlx5/ib_virt.c index c1b9de800fe5..649a3364f838 100644 --- a/drivers/infiniband/hw/mlx5/ib_virt.c +++ b/drivers/infiniband/hw/mlx5/ib_virt.c @@ -96,6 +96,7 @@ int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, struct mlx5_ib_dev *dev = to_mdev(device); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; int err; in = kzalloc(sizeof(*in), GFP_KERNEL); @@ -109,6 +110,8 @@ int mlx5_ib_set_vf_link_state(struct ib_device *device, int vf, } in->field_select = 
MLX5_HCA_VPORT_SEL_STATE_POLICY; err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].policy = in->policy; out: kfree(in); @@ -151,6 +154,7 @@ static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid) struct mlx5_ib_dev *dev = to_mdev(device); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; int err; in = kzalloc(sizeof(*in), GFP_KERNEL); @@ -160,6 +164,8 @@ static int set_vf_node_guid(struct ib_device *device, int vf, u8 port, u64 guid) in->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID; in->node_guid = guid; err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].node_guid = guid; kfree(in); return err; } @@ -169,6 +175,7 @@ static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid) struct mlx5_ib_dev *dev = to_mdev(device); struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_hca_vport_context *in; + struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx; int err; in = kzalloc(sizeof(*in), GFP_KERNEL); @@ -178,6 +185,8 @@ static int set_vf_port_guid(struct ib_device *device, int vf, u8 port, u64 guid) in->field_select = MLX5_HCA_VPORT_SEL_PORT_GUID; in->port_guid = guid; err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in); + if (!err) + vfs_ctx[vf].port_guid = guid; kfree(in); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c index bcdf7779c48d..090b29e05a6a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/sriov.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/sriov.c @@ -32,6 +32,7 @@ #include #include +#include #include "mlx5_core.h" #ifdef CONFIG_MLX5_CORE_EN #include "eswitch.h" @@ -44,6 +45,38 @@ bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev) return !!sriov->num_vfs; } +static int sriov_restore_guids(struct mlx5_core_dev *dev, int vf) +{ + struct 
mlx5_core_sriov *sriov = &dev->priv.sriov; + struct mlx5_hca_vport_context *in; + int err = 0; + + /* Restore sriov guid and policy settings */ + if (sriov->vfs_ctx[vf].node_guid || + sriov->vfs_ctx[vf].port_guid || + sriov->vfs_ctx[vf].policy != MLX5_POLICY_INVALID) { + in = kzalloc(sizeof(*in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + in->node_guid = sriov->vfs_ctx[vf].node_guid; + in->port_guid = sriov->vfs_ctx[vf].port_guid; + in->policy = sriov->vfs_ctx[vf].policy; + in->field_select = + !!(in->port_guid) * MLX5_HCA_VPORT_SEL_PORT_GUID | + !!(in->node_guid) * MLX5_HCA_VPORT_SEL_NODE_GUID | + !!(in->policy) * MLX5_HCA_VPORT_SEL_STATE_POLICY; + + err = mlx5_core_modify_hca_vport_context(dev, 1, 1, vf + 1, in); + if (err) + mlx5_core_warn(dev, "modify vport context failed, unable to restore VF %d settings\n", vf); + + kfree(in); + } + + return err; +} + static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) { struct mlx5_core_sriov *sriov = &dev->priv.sriov; @@ -74,6 +107,15 @@ static int mlx5_device_enable_sriov(struct mlx5_core_dev *dev, int num_vfs) } sriov->vfs_ctx[vf].enabled = 1; sriov->enabled_vfs++; + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) { + err = sriov_restore_guids(dev, vf); + if (err) { + mlx5_core_warn(dev, + "failed to restore VF %d settings, err %d\n", + vf, err); + continue; + } + } mlx5_core_dbg(dev, "successfully enabled VF* %d\n", vf); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index df6ce59a1f95..54221be5f69e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -162,6 +162,13 @@ enum dbg_rsc_type { MLX5_DBG_RSC_CQ, }; +enum port_state_policy { + MLX5_POLICY_DOWN = 0, + MLX5_POLICY_UP = 1, + MLX5_POLICY_FOLLOW = 2, + MLX5_POLICY_INVALID = 0xffffffff +}; + struct mlx5_field_desc { struct dentry *dent; int i; @@ -525,6 +532,9 @@ struct mlx5_mkey_table { struct mlx5_vf_context { int enabled; + u64 port_guid; + u64 node_guid; + enum 
port_state_policy policy; }; struct mlx5_core_sriov { @@ -842,13 +852,6 @@ struct mlx5_pas { u8 log_sz; }; -enum port_state_policy { - MLX5_POLICY_DOWN = 0, - MLX5_POLICY_UP = 1, - MLX5_POLICY_FOLLOW = 2, - MLX5_POLICY_INVALID = 0xffffffff -}; - enum phy_port_state { MLX5_AAA_111 }; -- cgit v1.2.3 From 7d9336d80b0b35d3537d37ff35e08dcb425073ed Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 30 May 2017 10:29:10 +0300 Subject: IB/core: Introduce delay drop for a WQ Work queue which is created with IB_WQ_FLAGS_DELAY_DROP won't cause packet drops when there aren't receive WQEs, but will wait until posting of receive WQEs or for some period of time that the device was configured with. It includes: * Add a new creation flag to enable delay drop functionality in a WQ. * A new capability was introduced - IB_RAW_PACKET_CAP_DELAY_DROP, which is the device's ability to delay packet drops when there aren't receive WQEs. Signed-off-by: Maor Gottlieb Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b5732432bb29..4a1444456af7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1546,6 +1546,10 @@ enum ib_raw_packet_caps { IB_RAW_PACKET_CAP_SCATTER_FCS = (1 << 1), /* Checksum offloads are supported (for both send and receive). */ IB_RAW_PACKET_CAP_IP_CSUM = (1 << 2), + /* When a packet is received for an RQ with no receive WQEs, the + * packet processing is delayed. 
+ */ + IB_RAW_PACKET_CAP_DELAY_DROP = (1 << 3), }; enum ib_wq_type { @@ -1574,6 +1578,7 @@ struct ib_wq { enum ib_wq_flags { IB_WQ_FLAGS_CVLAN_STRIPPING = 1 << 0, IB_WQ_FLAGS_SCATTER_FCS = 1 << 1, + IB_WQ_FLAGS_DELAY_DROP = 1 << 2, }; struct ib_wq_init_attr { -- cgit v1.2.3 From c1e0bfc1312d0e06bdb24e6e4e7e10b0b4313ec6 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 30 May 2017 10:29:11 +0300 Subject: net/mlx5: Introduce set delay drop command Add support to SET_DELAY_DROP command. This command will be used in downstream patches for delay packet drop. The timeout value should be indicated by delay_drop_timeout field. Packet processing will be delayed till timeout value passed or until more WQEs are posted. Setting this value to 0 disables the feature. Signed-off-by: Maor Gottlieb Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 14 ++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 25 ++++++++++++++++++++++++- include/linux/mlx5/qp.h | 3 +++ 3 files changed, 41 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 340f281c9801..db9e665ab104 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -242,6 +242,20 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp); +int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, + u32 timeout_usec) +{ + u32 out[MLX5_ST_SZ_DW(set_delay_drop_params_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(set_delay_drop_params_in)] = {0}; + + MLX5_SET(set_delay_drop_params_in, in, opcode, + MLX5_CMD_OP_SET_DELAY_DROP_PARAMS); + MLX5_SET(set_delay_drop_params_in, in, delay_drop_timeout, + timeout_usec / 100); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} +EXPORT_SYMBOL_GPL(mlx5_core_set_delay_drop); + struct mbox_info { u32 *in; u32 
*out; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 3e697f672de5..40c05d29d5bb 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -200,6 +200,7 @@ enum { MLX5_CMD_OP_QUERY_SQ = 0x907, MLX5_CMD_OP_CREATE_RQ = 0x908, MLX5_CMD_OP_MODIFY_RQ = 0x909, + MLX5_CMD_OP_SET_DELAY_DROP_PARAMS = 0x910, MLX5_CMD_OP_DESTROY_RQ = 0x90a, MLX5_CMD_OP_QUERY_RQ = 0x90b, MLX5_CMD_OP_CREATE_RMP = 0x90c, @@ -840,7 +841,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 retransmission_q_counters[0x1]; u8 reserved_at_183[0x1]; u8 modify_rq_counter_set_id[0x1]; - u8 reserved_at_185[0x1]; + u8 rq_delay_drop[0x1]; u8 max_qp_cnt[0xa]; u8 pkey_table_size[0x10]; @@ -5853,6 +5854,28 @@ struct mlx5_ifc_destroy_rq_in_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_set_delay_drop_params_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x10]; + u8 delay_drop_timeout[0x10]; +}; + +struct mlx5_ifc_set_delay_drop_params_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + struct mlx5_ifc_destroy_rmp_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 6f41270d80c0..fff4ec13f620 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -561,6 +561,9 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, u32 *out, int outlen); +int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, + u32 timeout_usec); + int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn); int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn); void mlx5_init_qp_table(struct mlx5_core_dev *dev); -- cgit v1.2.3 From 246ac9814c5b2c0e9916dca5fbf8d6a40245fad1 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 30 May 2017 10:29:12 +0300 
Subject: net/mlx5: Introduce general notification event When the delay drop timeout expires, the firmware raises a general notification event of the DELAY_DROP_TIMEOUT subtype. In addition, the feature is disabled, so the driver has to reactivate the timeout. Signed-off-by: Maor Gottlieb Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 23 +++++++++++++++++++++++ include/linux/mlx5/device.h | 5 +++++ include/linux/mlx5/driver.h | 1 + include/linux/mlx5/mlx5_ifc.h | 3 ++- 4 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index af51a5d2b912..849417425811 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -161,6 +161,8 @@ static const char *eqe_type_str(u8 type) return "MLX5_EVENT_TYPE_NIC_VPORT_CHANGE"; case MLX5_EVENT_TYPE_FPGA_ERROR: return "MLX5_EVENT_TYPE_FPGA_ERROR"; + case MLX5_EVENT_TYPE_GENERAL_EVENT: + return "MLX5_EVENT_TYPE_GENERAL_EVENT"; default: return "Unrecognized event"; } @@ -378,6 +380,20 @@ int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token, EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume); #endif +static void general_event_handler(struct mlx5_core_dev *dev, + struct mlx5_eqe *eqe) +{ + switch (eqe->sub_type) { + case MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT: + if (dev->event) + dev->event(dev, MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT, 0); + break; + default: + mlx5_core_dbg(dev, "General event with unrecognized subtype: sub_type %d\n", + eqe->sub_type); + } +} + static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) { struct mlx5_eq *eq = eq_ptr; @@ -486,6 +502,9 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) mlx5_fpga_event(dev, eqe->type, &eqe->data.raw); break; + case MLX5_EVENT_TYPE_GENERAL_EVENT: + general_event_handler(dev, eqe); + break; default: 
mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", eqe->type, eq->eqn); @@ -693,6 +712,10 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) mlx5_core_is_pf(dev)) async_event_mask |= (1ull << MLX5_EVENT_TYPE_NIC_VPORT_CHANGE); + if (MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH && + MLX5_CAP_GEN(dev, general_notification_event)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_GENERAL_EVENT); + if (MLX5_CAP_GEN(dev, port_module_event)) async_event_mask |= (1ull << MLX5_EVENT_TYPE_PORT_MODULE_EVENT); else diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index f31a0b5377e1..a47b9ab9f2c9 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -290,6 +290,7 @@ enum mlx5_event { MLX5_EVENT_TYPE_GPIO_EVENT = 0x15, MLX5_EVENT_TYPE_PORT_MODULE_EVENT = 0x16, MLX5_EVENT_TYPE_REMOTE_CONFIG = 0x19, + MLX5_EVENT_TYPE_GENERAL_EVENT = 0x22, MLX5_EVENT_TYPE_PPS_EVENT = 0x25, MLX5_EVENT_TYPE_DB_BF_CONGESTION = 0x1a, @@ -304,6 +305,10 @@ enum mlx5_event { MLX5_EVENT_TYPE_FPGA_ERROR = 0x20, }; +enum { + MLX5_GENERAL_SUBTYPE_DELAY_DROP_TIMEOUT = 0x1, +}; + enum { MLX5_PORT_CHANGE_SUBTYPE_DOWN = 1, MLX5_PORT_CHANGE_SUBTYPE_ACTIVE = 4, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 54221be5f69e..758ef40f9316 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -192,6 +192,7 @@ enum mlx5_dev_event { MLX5_DEV_EVENT_GUID_CHANGE, MLX5_DEV_EVENT_CLIENT_REREG, MLX5_DEV_EVENT_PPS, + MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT, }; enum mlx5_port_status { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 40c05d29d5bb..4bc57647fad8 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -874,7 +874,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 max_tc[0x4]; u8 reserved_at_1d0[0x1]; u8 dcbx[0x1]; - u8 reserved_at_1d2[0x3]; + u8 general_notification_event[0x1]; + u8 reserved_at_1d3[0x2]; u8 fpga[0x1]; u8 rol_s[0x1]; u8 rol_g[0x1]; -- cgit v1.2.3 
From 03404e8ae652e02a5e3388224836cef53d7a0988 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Tue, 30 May 2017 10:29:13 +0300 Subject: IB/mlx5: Add support to dropless RQ RQs that were configured for "delay drop" will prevent packet drops when their WQEs are depleted. Marking an RQ as drop-less is done by setting delay_drop_en in the RQ context using the CREATE_RQ command. Since this feature is globally activated/deactivated by using the SET_DELAY_DROP command on all the marked RQs, we activate/deactivate it according to the number of RQs with 'delay_drop' enabled. When the timeout expires, the feature is deactivated. Therefore, the driver handles the delay drop timeout event and reactivates the feature. Signed-off-by: Maor Gottlieb Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 60 +++++++++++++++++++++++++++++++++--- drivers/infiniband/hw/mlx5/mlx5_ib.h | 19 ++++++++++++ drivers/infiniband/hw/mlx5/qp.c | 37 ++++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 2 +- 4 files changed, 113 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index a903728627f1..ad4b12decc23 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -697,6 +697,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_TSO; } + if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) && + MLX5_CAP_GEN(dev->mdev, general_notification_event)) + props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP; + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { /* Legacy bit to support old userspace libraries */ @@ -2752,6 +2756,24 @@ static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev) spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); } +static void delay_drop_handler(struct work_struct *work) +{ + int err; + struct 
mlx5_ib_delay_drop *delay_drop = + container_of(work, struct mlx5_ib_delay_drop, + delay_drop_work); + + mutex_lock(&delay_drop->lock); + err = mlx5_core_set_delay_drop(delay_drop->dev->mdev, + delay_drop->timeout); + if (err) { + mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n", + delay_drop->timeout); + delay_drop->activate = false; + } + mutex_unlock(&delay_drop->lock); +} + static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, enum mlx5_dev_event event, unsigned long param) { @@ -2804,8 +2826,11 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, ibev.event = IB_EVENT_CLIENT_REREGISTER; port = (u8)param; break; + case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT: + schedule_work(&ibdev->delay_drop.delay_drop_work); + goto out; default: - return; + goto out; } ibev.device = &ibdev->ib_dev; @@ -2813,7 +2838,7 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, if (port < 1 || port > ibdev->num_ports) { mlx5_ib_warn(ibdev, "warning: event on port %d\n", port); - return; + goto out; } if (ibdev->ib_active) @@ -2821,6 +2846,9 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, if (fatal) ibdev->ib_active = false; + +out: + return; } static int set_has_smi_cap(struct mlx5_ib_dev *dev) @@ -3623,6 +3651,26 @@ mlx5_ib_alloc_rdma_netdev(struct ib_device *hca, return netdev; } +static void cancel_delay_drop(struct mlx5_ib_dev *dev) +{ + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return; + + cancel_work_sync(&dev->delay_drop.delay_drop_work); +} + +static void init_delay_drop(struct mlx5_ib_dev *dev) +{ + if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP)) + return; + + mutex_init(&dev->delay_drop.lock); + dev->delay_drop.dev = dev; + dev->delay_drop.activate = false; + dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000; + INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler); +} + static void *mlx5_ib_add(struct 
mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; @@ -3862,11 +3910,13 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) if (err) goto err_dev; + init_delay_drop(dev); + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { err = device_create_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); if (err) - goto err_umrc; + goto err_delay_drop; } if ((MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && @@ -3877,7 +3927,8 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) return dev; -err_umrc: +err_delay_drop: + cancel_delay_drop(dev); destroy_umrc_res(dev); err_dev: @@ -3924,6 +3975,7 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) struct mlx5_ib_dev *dev = context; enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); + cancel_delay_drop(dev); mlx5_remove_netdev_notifier(dev); ib_unregister_device(&dev->ib_dev); mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index f0682f383b52..097f12dc65a6 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -247,6 +247,10 @@ struct mlx5_ib_wq { void *qend; }; +enum mlx5_ib_wq_flags { + MLX5_IB_WQ_FLAGS_DELAY_DROP = 0x1, +}; + struct mlx5_ib_rwq { struct ib_wq ibwq; struct mlx5_core_qp core_qp; @@ -264,6 +268,7 @@ struct mlx5_ib_rwq { u32 wqe_count; u32 wqe_shift; int wq_sig; + u32 create_flags; /* Use enum mlx5_ib_wq_flags */ }; enum { @@ -652,6 +657,19 @@ struct mlx5_ib_dbg_cc_params { struct mlx5_ib_dbg_param params[MLX5_IB_DBG_CC_MAX]; }; +enum { + MLX5_MAX_DELAY_DROP_TIMEOUT_MS = 100, +}; + +struct mlx5_ib_delay_drop { + struct mlx5_ib_dev *dev; + struct work_struct delay_drop_work; + /* serialize setting of delay drop */ + struct mutex lock; + u32 timeout; + bool activate; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; @@ -688,6 +706,7 @@ struct mlx5_ib_dev { struct mlx5_ib_port *port; struct mlx5_sq_bfreg 
bfreg; struct mlx5_sq_bfreg fp_bfreg; + struct mlx5_ib_delay_drop delay_drop; struct mlx5_ib_dbg_cc_params *dbg_cc_params; /* protect the user_td */ diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 0889ff367c86..939553d5c25f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4597,6 +4597,24 @@ static void mlx5_ib_wq_event(struct mlx5_core_qp *core_qp, int type) } } +static int set_delay_drop(struct mlx5_ib_dev *dev) +{ + int err = 0; + + mutex_lock(&dev->delay_drop.lock); + if (dev->delay_drop.activate) + goto out; + + err = mlx5_core_set_delay_drop(dev->mdev, dev->delay_drop.timeout); + if (err) + goto out; + + dev->delay_drop.activate = true; +out: + mutex_unlock(&dev->delay_drop.lock); + return err; +} + static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, struct ib_wq_init_attr *init_attr) { @@ -4651,9 +4669,28 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, } MLX5_SET(rqc, rqc, scatter_fcs, 1); } + if (init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { + if (!(dev->ib_dev.attrs.raw_packet_caps & + IB_RAW_PACKET_CAP_DELAY_DROP)) { + mlx5_ib_dbg(dev, "Delay drop is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET(rqc, rqc, delay_drop_en, 1); + } rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0); err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp); + if (!err && init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) { + err = set_delay_drop(dev); + if (err) { + mlx5_ib_warn(dev, "Failed to enable delay drop err=%d\n", + err); + mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp); + } else { + rwq->create_flags |= MLX5_IB_WQ_FLAGS_DELAY_DROP; + } + } out: kvfree(in); return err; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4bc57647fad8..f350688a12fa 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ 
-2519,7 +2519,7 @@ enum { struct mlx5_ifc_rqc_bits { u8 rlky[0x1]; - u8 reserved_at_1[0x1]; + u8 delay_drop_en[0x1]; u8 scatter_fcs[0x1]; u8 vsd[0x1]; u8 mem_rq_type[0x4]; -- cgit v1.2.3 From be1d325a335840a86c133a56c6a911c368bac0fd Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Mon, 12 Jun 2017 11:14:03 +0300 Subject: IB/core: Set RoCEv2 MGID according to spec RoCEv2 Annex states that for RoCEv2 over IPv4, the corresponding IPv4 address is encoded into the GID according to the following rule: GID = ::ffff:a.b.c.d, where a.b.c.d is the IPv4 address. Remove the 0xff0e prefix for RoCEv2 packets with IPv4, leaving it zeroed, and change rdma_is_multicast_addr() to account for the new encoding. Signed-off-by: Noa Osherovich Reviewed-by: Moni Shoua Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 13 +++++++------ drivers/infiniband/core/verbs.c | 10 ++++++---- include/rdma/ib_addr.h | 8 +++++++- 3 files changed, 20 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 0eb393237ba2..a8c2f0ccd225 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3998,7 +3998,8 @@ static void iboe_mcast_work_handler(struct work_struct *work) kfree(mw); } -static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) +static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, + enum ib_gid_type gid_type) { struct sockaddr_in *sin = (struct sockaddr_in *)addr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; @@ -4008,8 +4009,8 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid) } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { - mgid->raw[0] = 0xff; - mgid->raw[1] = 0x0e; + mgid->raw[0] = (gid_type == IB_GID_TYPE_IB) ? 0xff : 0; + mgid->raw[1] = (gid_type == IB_GID_TYPE_IB) ? 
0x0e : 0; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; @@ -4050,7 +4051,9 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, goto out1; } - cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid); + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid, gid_type); mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff); if (id_priv->id.ps == RDMA_PS_UDP) @@ -4066,8 +4069,6 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); - gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - - rdma_start_port(id_priv->cma_dev->device)]; if (addr->sa_family == AF_INET) { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 802bdc397a57..30fdc3ae1bbd 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1613,8 +1613,9 @@ int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) if (!qp->device->attach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || - !is_valid_mcast_lid(qp, lid)) + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->attach_mcast(qp, gid, lid); @@ -1630,8 +1631,9 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) if (!qp->device->detach_mcast) return -ENOSYS; - if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD || - !is_valid_mcast_lid(qp, lid)) + + if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || + qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->detach_mcast(qp, gid, lid); diff --git a/include/rdma/ib_addr.h 
b/include/rdma/ib_addr.h index b73a14edc85e..7aca12188ef3 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -304,7 +304,13 @@ static inline void rdma_get_ll_mac(struct in6_addr *addr, u8 *mac) static inline int rdma_is_multicast_addr(struct in6_addr *addr) { - return addr->s6_addr[0] == 0xff; + u32 ipv4_addr; + + if (addr->s6_addr[0] == 0xff) + return 1; + + memcpy(&ipv4_addr, addr->s6_addr + 12, 4); + return (ipv6_addr_v4mapped(addr) && ipv4_is_multicast(ipv4_addr)); } static inline void rdma_get_mcast_mac(struct in6_addr *addr, u8 *mac) -- cgit v1.2.3 From 02984cc7b3d62418bd72abacaf875c3a9eccdb66 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Jun 2017 16:15:06 +0300 Subject: IB/core: Enable QP creation with a given source QP number Enable QP creation with a given source QP number. The created QP will use the source QPN as its wire QP number. This comes as a pre-patch for downstream patches in this series to allow user space applications to accelerate traffic which is typically handled by IPoIB ULP. 
Signed-off-by: Yishai Hadas Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4a1444456af7..76a74c783c50 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1059,6 +1059,7 @@ enum ib_qp_create_flags { /* FREE = 1 << 7, */ IB_QP_CREATE_SCATTER_FCS = 1 << 8, IB_QP_CREATE_CVLAN_STRIPPING = 1 << 9, + IB_QP_CREATE_SOURCE_QPN = 1 << 10, /* reserve bits 26-31 for low level drivers' internal use */ IB_QP_CREATE_RESERVED_START = 1 << 26, IB_QP_CREATE_RESERVED_END = 1 << 31, @@ -1086,6 +1087,7 @@ struct ib_qp_init_attr { */ u8 port_num; struct ib_rwq_ind_table *rwq_ind_tbl; + u32 source_qpn; }; struct ib_qp_open_attr { -- cgit v1.2.3 From 2dee0e545894c23b1a2cc2019ac87dffb42e5984 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Jun 2017 16:15:07 +0300 Subject: IB/uverbs: Enable QP creation with a given source QP number Enable QP creation with a given source QP number, the created QP will use the source QPN as its wire QP number. To create such a QP, root privileges (i.e. CAP_NET_RAW) are required from the user application. This comes as a pre-patch for downstream patches in this series to allow user space applications to accelerate traffic which is typically handled by IPoIB ULP. 
Signed-off-by: Yishai Hadas Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 17 ++++++++++++++--- include/uapi/rdma/ib_user_verbs.h | 2 +- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2c98533a0203..60535c754db3 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1383,8 +1383,9 @@ static int create_qp(struct ib_uverbs_file *file, attr.rwq_ind_tbl = ind_tbl; } - if ((cmd_sz >= offsetof(typeof(*cmd), reserved1) + - sizeof(cmd->reserved1)) && cmd->reserved1) { + if (cmd_sz > sizeof(*cmd) && + !ib_is_udata_cleared(ucore, sizeof(*cmd), + cmd_sz - sizeof(*cmd))) { ret = -EOPNOTSUPP; goto err_put; } @@ -1482,11 +1483,21 @@ static int create_qp(struct ib_uverbs_file *file, IB_QP_CREATE_MANAGED_SEND | IB_QP_CREATE_MANAGED_RECV | IB_QP_CREATE_SCATTER_FCS | - IB_QP_CREATE_CVLAN_STRIPPING)) { + IB_QP_CREATE_CVLAN_STRIPPING | + IB_QP_CREATE_SOURCE_QPN)) { ret = -EINVAL; goto err_put; } + if (attr.create_flags & IB_QP_CREATE_SOURCE_QPN) { + if (!capable(CAP_NET_RAW)) { + ret = -EPERM; + goto err_put; + } + + attr.source_qpn = cmd->source_qpn; + } + buf = (void *)cmd + sizeof(*cmd); if (cmd_sz > sizeof(*cmd)) if (!(buf[0] == 0 && !memcmp(buf, buf + 1, diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 270c350bedc6..63656d2e8705 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -578,7 +578,7 @@ struct ib_uverbs_ex_create_qp { __u32 comp_mask; __u32 create_flags; __u32 rwq_ind_tbl_handle; - __u32 reserved1; + __u32 source_qpn; }; struct ib_uverbs_open_qp { -- cgit v1.2.3 From 4ce749bd94f697772ac2be4fb7e7a92726e94bfb Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Jun 2017 16:15:10 +0300 Subject: net/mlx5: Report enhanced capabilities for IPoIB Report 
'ipoib_enhanced_offloads' capabilities from the core layer, it will be used in the next patch from this series. Signed-off-by: Yishai Hadas Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 6 ++++++ include/linux/mlx5/device.h | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index fa33d59ab485..2c71557d1cee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -120,6 +120,12 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return err; } + if (MLX5_CAP_GEN(dev, ipoib_enhanced_offloads)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_IPOIB_ENHANCED_OFFLOADS); + if (err) + return err; + } + if (MLX5_CAP_GEN(dev, pg)) { err = mlx5_core_get_caps(dev, MLX5_CAP_ODP); if (err) diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index a47b9ab9f2c9..c13d71deaeca 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -973,7 +973,7 @@ enum mlx5_cap_type { MLX5_CAP_ATOMIC, MLX5_CAP_ROCE, MLX5_CAP_IPOIB_OFFLOADS, - MLX5_CAP_EOIB_OFFLOADS, + MLX5_CAP_IPOIB_ENHANCED_OFFLOADS, MLX5_CAP_FLOW_TABLE, MLX5_CAP_ESWITCH_FLOW_TABLE, MLX5_CAP_ESWITCH, @@ -1016,6 +1016,10 @@ enum mlx5_mcam_feature_groups { MLX5_GET(per_protocol_networking_offload_caps,\ mdev->caps.hca_max[MLX5_CAP_ETHERNET_OFFLOADS], cap) +#define MLX5_CAP_IPOIB_ENHANCED(mdev, cap) \ + MLX5_GET(per_protocol_networking_offload_caps,\ + mdev->caps.hca_cur[MLX5_CAP_IPOIB_ENHANCED_OFFLOADS], cap) + #define MLX5_CAP_ROCE(mdev, cap) \ MLX5_GET(roce_cap, mdev->caps.hca_cur[MLX5_CAP_ROCE], cap) -- cgit v1.2.3 From 3fffc82ad6c78fcc9d5d4eca089f00db14ab0358 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 12 Jun 2017 10:36:16 +0300 Subject: IB/mlx5: Fix existence check for extended address vector The 
extended address vector flag is the highest bit of a be32 variable, but it was compared with the lowest. This patch fixes the endianness of that check and removes the MLX5_WQE_AV_EXT define in favor of the already declared MLX5_EXTENDED_UD_AV. Fixes: 17d2f88f92ce ("IB/mlx5: Add ODP atomics support") Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/odp.c | 2 +- include/linux/mlx5/qp.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index ae0746754008..3d701c7a4c91 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -939,7 +939,7 @@ static int mlx5_ib_mr_initiator_pfault_handler( if (qp->ibqp.qp_type != IB_QPT_RC) { av = *wqe; - if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT)) + if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV)) *wqe += sizeof(struct mlx5_av); else *wqe += sizeof(struct mlx5_base_av); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index fff4ec13f620..66d19b611fe4 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -212,7 +212,6 @@ struct mlx5_wqe_ctrl_seg { #define MLX5_WQE_CTRL_OPCODE_MASK 0xff #define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 -#define MLX5_WQE_AV_EXT 0x80000000 enum { MLX5_ETH_WQE_L3_INNER_CSUM = 1 << 4, -- cgit v1.2.3 From 58dcb60a226ee48cc66c96c27b751f06ec2bc5a9 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 19 Jun 2017 07:19:37 +0300 Subject: IB/mlx5: Expose extended error counters This patch adds below requester and responder side error counters, which will be exposed by hardware counters interface and are supported as part of query Q counters command extension. 
+---------------------------+-------------------------------------+ | Name | Description | |---------------------------+-------------------------------------| |resp_local_length_error | Number of times responder detected | | | local length errors | |---------------------------+-------------------------------------| |resp_cqe_error | Number of CQEs completed with error | | | at responder | |---------------------------+-------------------------------------| |req_cqe_error | Number of CQEs completed with error | | | at requester | |---------------------------+-------------------------------------| |req_remote_invalid_request | Number of times requester detected | | | remote invalid request error | |---------------------------+-------------------------------------| |req_remote_access_error | Number of times requester detected | | | remote access error | |---------------------------+-------------------------------------| |resp_remote_access_error | Number of times responder detected | | | remote access error | |---------------------------+-------------------------------------| |resp_cqe_flush_error | Number of CQEs completed with | | | flushed with error at responder | |---------------------------+-------------------------------------| |req_cqe_flush_error | Number of CQEs completed with | | | flushed with error at requester | +---------------------------+-------------------------------------+ Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Reviewed-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 22 ++++++++++++++++++++ include/linux/mlx5/mlx5_ifc.h | 44 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 2757d445b042..9dd9759459fb 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3432,6 +3432,17 @@ static 
const struct mlx5_ib_counter cong_cnts[] = { INIT_CONG_COUNTER(np_cnp_sent), }; +static const struct mlx5_ib_counter extended_err_cnts[] = { + INIT_Q_COUNTER(resp_local_length_error), + INIT_Q_COUNTER(resp_cqe_error), + INIT_Q_COUNTER(req_cqe_error), + INIT_Q_COUNTER(req_remote_invalid_request), + INIT_Q_COUNTER(req_remote_access_errors), + INIT_Q_COUNTER(resp_remote_access_errors), + INIT_Q_COUNTER(resp_cqe_flush_error), + INIT_Q_COUNTER(req_cqe_flush_error), +}; + static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { unsigned int i; @@ -3456,6 +3467,10 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) num_counters += ARRAY_SIZE(retrans_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) + num_counters += ARRAY_SIZE(extended_err_cnts); + cnts->num_q_counters = num_counters; if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { @@ -3505,6 +3520,13 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, } } + if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) { + for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) { + names[j] = extended_err_cnts[i].name; + offsets[j] = extended_err_cnts[i].offset; + } + } + if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) { names[j] = cong_cnts[i].name; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index f350688a12fa..5bae70eb25af 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -858,7 +858,7 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 pcam_reg[0x1]; u8 local_ca_ack_delay[0x5]; u8 port_module_event[0x1]; - u8 reserved_at_1b1[0x1]; + u8 enhanced_error_q_counters[0x1]; u8 ports_check[0x1]; u8 reserved_at_1b3[0x1]; u8 disable_link_up[0x1]; @@ -3953,7 +3953,47 @@ struct mlx5_ifc_query_q_counter_out_bits { u8 local_ack_timeout_err[0x20]; - u8 reserved_at_320[0x4e0]; + u8 reserved_at_320[0xa0]; + + u8 
resp_local_length_error[0x20]; + + u8 req_local_length_error[0x20]; + + u8 resp_local_qp_error[0x20]; + + u8 local_operation_error[0x20]; + + u8 resp_local_protection[0x20]; + + u8 req_local_protection[0x20]; + + u8 resp_cqe_error[0x20]; + + u8 req_cqe_error[0x20]; + + u8 req_mw_binding[0x20]; + + u8 req_bad_response[0x20]; + + u8 req_remote_invalid_request[0x20]; + + u8 resp_remote_invalid_request[0x20]; + + u8 req_remote_access_errors[0x20]; + + u8 resp_remote_access_errors[0x20]; + + u8 req_remote_operation_errors[0x20]; + + u8 req_transport_retries_exceeded[0x20]; + + u8 cq_overflow[0x20]; + + u8 resp_cqe_flush_error[0x20]; + + u8 req_cqe_flush_error[0x20]; + + u8 reserved_at_620[0x1e0]; }; struct mlx5_ifc_query_q_counter_in_bits { -- cgit v1.2.3 From ea30b966f7dd6bcfb20c98a7f99608c7bb10bfac Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 21 Jun 2017 09:26:28 +0300 Subject: IB/mlx4: Add inline-receive support When inline-receive is enabled, the HCA may write received data into the receive WQE. Inline-receive is enabled by setting its matching bit in the QP context and each single-packet message with payload not exceeding the receive WQE size will be delivered to the WQE. The completion report will indicate that the payload was placed to the WQE. It includes: 1) Return maximum supported size of inline-receive by the hardware in query_device vendor's data part. 2) Enable the feature when requested by the vendor data input. 
Signed-off-by: Maor Gottlieb Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 7 +++++++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 3 +++ drivers/infiniband/hw/mlx4/qp.c | 32 +++++++++++++++++++++++++------- include/uapi/rdma/mlx4-abi.h | 3 ++- 4 files changed, 37 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d1b43cbbfea7..e8c290edb1e1 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -563,6 +563,13 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, } } + if (uhw->outlen >= resp.response_length + + sizeof(resp.max_inl_recv_sz)) { + resp.response_length += sizeof(resp.max_inl_recv_sz); + resp.max_inl_recv_sz = dev->dev->caps.max_rq_sg * + sizeof(struct mlx4_wqe_data_seg); + } + if (uhw->outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); if (err) diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 9db82e67e959..a6337f3161cf 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -318,6 +318,7 @@ struct mlx4_ib_qp { u8 sq_no_prefetch; u8 state; int mlx_type; + u32 inl_recv_sz; struct list_head gid_list; struct list_head steering_rules; struct mlx4_ib_buf *sqp_proxy_rcv; @@ -623,6 +624,8 @@ struct mlx4_uverbs_ex_query_device_resp { __u32 comp_mask; __u32 response_length; __u64 hca_core_clock_offset; + __u32 max_inl_recv_sz; + __u32 reserved; }; static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 75c0e6c5dd56..d1caa39a3943 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -377,7 +377,8 @@ static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) } static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, 
- int is_user, int has_rq, struct mlx4_ib_qp *qp) + int is_user, int has_rq, struct mlx4_ib_qp *qp, + u32 inl_recv_sz) { /* Sanity check RQ size before proceeding */ if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE || @@ -385,18 +386,24 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, return -EINVAL; if (!has_rq) { - if (cap->max_recv_wr) + if (cap->max_recv_wr || inl_recv_sz) return -EINVAL; qp->rq.wqe_cnt = qp->rq.max_gs = 0; } else { + u32 max_inl_recv_sz = dev->dev->caps.max_rq_sg * + sizeof(struct mlx4_wqe_data_seg); + u32 wqe_size; + /* HW requires >= 1 RQ entry with >= 1 gather entry */ - if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) + if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge || + inl_recv_sz > max_inl_recv_sz)) return -EINVAL; qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr)); qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge)); - qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg)); + wqe_size = qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg); + qp->rq.wqe_shift = ilog2(max_t(u32, wqe_size, inl_recv_sz)); } /* leave userspace return values as they were, so as not to break ABI */ @@ -719,9 +726,6 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); - err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp); - if (err) - goto err; if (pd->uobject) { struct mlx4_ib_create_qp ucmd; @@ -731,6 +735,12 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err; } + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp, ucmd.inl_recv_sz); + if (err) + goto err; + + qp->inl_recv_sz = ucmd.inl_recv_sz; qp->sq_no_prefetch = ucmd.sq_no_prefetch; err = set_user_sq_size(dev, qp, &ucmd); @@ -760,6 +770,11 @@ static int create_qp_common(struct 
mlx4_ib_dev *dev, struct ib_pd *pd, goto err_mtt; } } else { + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp, 0); + if (err) + goto err; + qp->sq_no_prefetch = 0; if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) @@ -1651,6 +1666,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } + if (qp->inl_recv_sz) + context->param3 |= cpu_to_be32(1 << 25); + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; else if (ibqp->qp_type == IB_QPT_RAW_PACKET) diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index af431752655c..bf3bdba2f326 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -101,7 +101,8 @@ struct mlx4_ib_create_qp { __u8 log_sq_bb_count; __u8 log_sq_stride; __u8 sq_no_prefetch; - __u8 reserved[5]; + __u32 inl_recv_sz; + __u8 reserved; }; #endif /* MLX4_ABI_USER_H */ -- cgit v1.2.3 From f3301870161ca293cd14b20a802c5646da02407f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 21 Jun 2017 09:29:36 +0300 Subject: (IB, net)/mlx4: Add resource utilization support Adding visibility of resource usage of QPs, CQs and counters used by virtual functions. This feature will be used to give the PF administrator more data while debugging VF status. Usage info was added to ALLOC_RES command, to notify the PF if the resource which is being reserved or allocated for the VF will be used by kernel driver or by user verbs. Updated reservation and allocation functions of QP, CQ and counter with additional usage parameter. 
Signed-off-by: Moshe Shemesh Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/cq.c | 2 ++ drivers/infiniband/hw/mlx4/main.c | 6 ++++-- drivers/infiniband/hw/mlx4/qp.c | 13 +++++++++---- drivers/net/ethernet/mellanox/mlx4/cq.c | 7 ++++--- drivers/net/ethernet/mellanox/mlx4/en_cq.c | 1 + drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 3 ++- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 ++++-- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 3 ++- drivers/net/ethernet/mellanox/mlx4/main.c | 7 ++++--- drivers/net/ethernet/mellanox/mlx4/qp.c | 5 +++-- include/linux/mlx4/device.h | 12 ++++++++++-- 11 files changed, 45 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index ff931c580557..95382faa7ad1 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -218,6 +218,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, goto err_mtt; uar = &to_mucontext(context)->uar; + cq->mcq.usage = MLX4_RES_USAGE_USER_VERBS; } else { err = mlx4_db_alloc(dev->dev, &cq->db, 1); if (err) @@ -233,6 +234,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, goto err_db; uar = &dev->priv_uar; + cq->mcq.usage = MLX4_RES_USAGE_DRIVER; } if (dev->eq_table) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index e8c290edb1e1..0944e224c0df 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2779,7 +2779,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) allocated = 0; if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { - err = mlx4_counter_alloc(ibdev->dev, &counter_index); + err = mlx4_counter_alloc(ibdev->dev, &counter_index, + MLX4_RES_USAGE_DRIVER); /* if failed to allocate a new counter, use default */ if (err) counter_index = @@ -2834,7 +2835,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->steer_qpn_count = 
MLX4_IB_UC_MAX_NUM_QPS; err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, MLX4_IB_UC_STEER_QPN_ALIGN, - &ibdev->steer_qpn_base, 0); + &ibdev->steer_qpn_base, 0, + MLX4_RES_USAGE_DRIVER); if (err) goto err_counter; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index d1caa39a3943..247b9132e9de 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -769,6 +769,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (err) goto err_mtt; } + qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS; } else { err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp, 0); @@ -841,6 +842,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, err = -ENOMEM; goto err_wrid; } + qp->mqp.usage = MLX4_RES_USAGE_DRIVER; } if (sqpn) { @@ -860,13 +862,14 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, (init_attr->cap.max_send_wr ? MLX4_RESERVE_ETH_BF_QP : 0) | (init_attr->cap.max_recv_wr ? 
- MLX4_RESERVE_A0_QP : 0)); + MLX4_RESERVE_A0_QP : 0), + qp->mqp.usage); else if (qp->flags & MLX4_IB_QP_NETIF) err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn); else err = mlx4_qp_reserve_range(dev->dev, 1, 1, - &qpn, 0); + &qpn, 0, qp->mqp.usage); if (err) goto err_proxy; } @@ -1218,7 +1221,9 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, if (udata) return ERR_PTR(-EINVAL); if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) { - int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, 1, 1, &sqpn, 0); + int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, + 1, 1, &sqpn, 0, + MLX4_RES_USAGE_DRIVER); if (res) return ERR_PTR(res); @@ -1581,7 +1586,7 @@ static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) !(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_LB_SRC_CHK)) return 0; - err = mlx4_counter_alloc(dev->dev, &tmp_idx); + err = mlx4_counter_alloc(dev->dev, &tmp_idx, MLX4_RES_USAGE_DRIVER); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/cq.c b/drivers/net/ethernet/mellanox/mlx4/cq.c index c56a511b918e..72eb50cd5ecd 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/cq.c @@ -241,13 +241,14 @@ err_out: return err; } -static int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn) +static int mlx4_cq_alloc_icm(struct mlx4_dev *dev, int *cqn, u8 usage) { + u32 in_modifier = RES_CQ | (((u32)usage & 3) << 30); u64 out_param; int err; if (mlx4_is_mfunc(dev)) { - err = mlx4_cmd_imm(dev, 0, &out_param, RES_CQ, + err = mlx4_cmd_imm(dev, 0, &out_param, in_modifier, RES_OP_RESERVE_AND_MAP, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (err) @@ -303,7 +304,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, cq->vector = vector; - err = mlx4_cq_alloc_icm(dev, &cq->cqn); + err = mlx4_cq_alloc_icm(dev, &cq->cqn, cq->usage); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_cq.c b/drivers/net/ethernet/mellanox/mlx4/en_cq.c index 
85fe17e4dcfb..f849eec21824 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_cq.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_cq.c @@ -140,6 +140,7 @@ int mlx4_en_activate_cq(struct mlx4_en_priv *priv, struct mlx4_en_cq *cq, (cq->type == RX && priv->hwtstamp_config.rx_filter)) timestamp_en = 1; + cq->mcq.usage = MLX4_RES_USAGE_DRIVER; err = mlx4_cq_alloc(mdev->dev, cq->size, &cq->wqres.mtt, &mdev->priv_uar, cq->wqres.db.dma, &cq->mcq, cq->vector, 0, timestamp_en); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 3a291fc1780a..e3e6d9fa69fd 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -651,7 +651,8 @@ static int mlx4_en_get_qp(struct mlx4_en_priv *priv) return 0; } - err = mlx4_qp_reserve_range(dev, 1, 1, qpn, MLX4_RESERVE_A0_QP); + err = mlx4_qp_reserve_range(dev, 1, 1, qpn, MLX4_RESERVE_A0_QP, + MLX4_RES_USAGE_DRIVER); en_dbg(DRV, priv, "Reserved qp %d\n", *qpn); if (err) { en_err(priv, "Failed to reserve qp for mac registration\n"); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 436f7689a032..ad1ffd5857cb 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -1081,7 +1081,8 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv) u32 qpn; err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn, - MLX4_RESERVE_A0_QP); + MLX4_RESERVE_A0_QP, + MLX4_RES_USAGE_DRIVER); if (err) { en_err(priv, "Failed reserving drop qpn\n"); return err; @@ -1127,7 +1128,8 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv) flags = priv->rx_ring_num == 1 ? 
MLX4_RESERVE_A0_QP : 0; err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num, priv->rx_ring_num, - &rss_map->base_qpn, flags); + &rss_map->base_qpn, flags, + MLX4_RES_USAGE_DRIVER); if (err) { en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num); return err; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 73faa3d77921..a81db2582555 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -105,7 +105,8 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, (unsigned long long) ring->sp_wqres.buf.direct.map); err = mlx4_qp_reserve_range(mdev->dev, 1, 1, &ring->qpn, - MLX4_RESERVE_ETH_BF_QP); + MLX4_RESERVE_ETH_BF_QP, + MLX4_RES_USAGE_DRIVER); if (err) { en_err(priv, "failed reserving qp for TX ring\n"); goto err_hwq_res; diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index a27c9c13a36e..fb2591d0e735 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -2475,7 +2475,7 @@ static int mlx4_allocate_default_counters(struct mlx4_dev *dev) priv->def_counter[port] = -1; for (port = 0; port < dev->caps.num_ports; port++) { - err = mlx4_counter_alloc(dev, &idx); + err = mlx4_counter_alloc(dev, &idx, MLX4_RES_USAGE_DRIVER); if (!err || err == -ENOSPC) { priv->def_counter[port] = idx; @@ -2517,13 +2517,14 @@ int __mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) return 0; } -int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx) +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx, u8 usage) { + u32 in_modifier = RES_COUNTER | (((u32)usage & 3) << 30); u64 out_param; int err; if (mlx4_is_mfunc(dev)) { - err = mlx4_cmd_imm(dev, 0, &out_param, RES_COUNTER, + err = mlx4_cmd_imm(dev, 0, &out_param, in_modifier, RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (!err) diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c 
b/drivers/net/ethernet/mellanox/mlx4/qp.c index 26747212526b..5e5b4475b85e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -245,8 +245,9 @@ int __mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, } int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, - int *base, u8 flags) + int *base, u8 flags, u8 usage) { + u32 in_modifier = RES_QP | (((u32)usage & 3) << 30); u64 in_param = 0; u64 out_param; int err; @@ -258,7 +259,7 @@ int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, set_param_l(&in_param, (((u32)flags) << 24) | (u32)cnt); set_param_h(&in_param, align); err = mlx4_cmd_imm(dev, in_param, &out_param, - RES_QP, RES_OP_RESERVE, + in_modifier, RES_OP_RESERVE, MLX4_CMD_ALLOC_RES, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_WRAPPED); if (err) diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index aad5d81dfb44..3607da001ad3 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -428,6 +428,12 @@ enum mlx4_steer_type { MLX4_NUM_STEERS }; +enum mlx4_resource_usage { + MLX4_RES_USAGE_NONE, + MLX4_RES_USAGE_DRIVER, + MLX4_RES_USAGE_USER_VERBS, +}; + enum { MLX4_NUM_FEXCH = 64 * 1024, }; @@ -748,6 +754,7 @@ struct mlx4_cq { } tasklet_ctx; int reset_notify_added; struct list_head reset_notify; + u8 usage; }; struct mlx4_qp { @@ -757,6 +764,7 @@ struct mlx4_qp { atomic_t refcount; struct completion free; + u8 usage; }; struct mlx4_srq { @@ -1120,7 +1128,7 @@ int mlx4_cq_alloc(struct mlx4_dev *dev, int nent, struct mlx4_mtt *mtt, unsigned vector, int collapsed, int timestamp_en); void mlx4_cq_free(struct mlx4_dev *dev, struct mlx4_cq *cq); int mlx4_qp_reserve_range(struct mlx4_dev *dev, int cnt, int align, - int *base, u8 flags); + int *base, u8 flags, u8 usage); void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt); int mlx4_qp_alloc(struct mlx4_dev *dev, int qpn, struct mlx4_qp *qp); @@ -1417,7 +1425,7 @@ int 
mlx4_get_phys_port_id(struct mlx4_dev *dev); int mlx4_wol_read(struct mlx4_dev *dev, u64 *config, int port); int mlx4_wol_write(struct mlx4_dev *dev, u64 config, int port); -int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx); +int mlx4_counter_alloc(struct mlx4_dev *dev, u32 *idx, u8 usage); void mlx4_counter_free(struct mlx4_dev *dev, u32 idx); int mlx4_get_default_counter_index(struct mlx4_dev *dev, int port); -- cgit v1.2.3 From 400b1ebcfe31279895f1baa8ecaa390d9a4a0eef Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Tue, 4 Jul 2017 16:24:24 +0300 Subject: IB/mlx4: Add support for WQ related verbs Support create/modify/destroy WQ related verbs. The base IB object to enable RSS functionality is a WQ (i.e. ib_wq). This patch implements the related WQ verbs: create, modify and destroy. In downstream patches the WQ will be used as part of an indirection table (i.e. ib_rwq_ind_table) to enable RSS QP creation. Notes: ConnectX-3 hardware requires a list of consecutive WQNs as receive descriptor queues for the RSS QP. Hence, the driver manages lists of consecutive WQN ranges per context which the user must respect. Destroying the WQ does not return its WQN back to its range for reusing. However, destroying all WQs from the same range releases the range and in turn releases its WQNs for reusing. Since the WQ object is not a natural object in the hardware, the driver implements the WQ by the hardware QP. As such, the WQ inherits its port from its RSS QP parent upon its RST->INIT transition, at which time its state is applied to the hardware.
Signed-off-by: Guy Levi Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 24 ++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 25 +- drivers/infiniband/hw/mlx4/qp.c | 505 +++++++++++++++++++++++++++++++---- include/uapi/rdma/mlx4-abi.h | 14 + 4 files changed, 511 insertions(+), 57 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 0944e224c0df..7c6f929ebd3e 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -81,6 +81,8 @@ static const char mlx4_ib_version[] = DRV_VERSION "\n"; static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); +static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device, + u8 port_num); static struct workqueue_struct *wq; @@ -552,6 +554,11 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->timestamp_mask = 0xFFFFFFFFFFFFULL; props->max_ah = INT_MAX; + if ((dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) && + (mlx4_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET || + mlx4_ib_port_link_layer(ibdev, 2) == IB_LINK_LAYER_ETHERNET)) + props->max_wq_type_rq = props->max_qp; + if (!mlx4_is_slave(dev->dev)) err = mlx4_get_internal_clock_params(dev->dev, &clock_params); @@ -1076,6 +1083,9 @@ static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); + INIT_LIST_HEAD(&context->wqn_ranges_list); + mutex_init(&context->wqn_ranges_mutex); + if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); else @@ -2720,6 +2730,20 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.get_dev_fw_str = get_fw_ver_str; ibdev->ib_dev.disassociate_ucontext = mlx4_ib_disassociate_ucontext; + if ((dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) && + 
((mlx4_ib_port_link_layer(&ibdev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) || + (mlx4_ib_port_link_layer(&ibdev->ib_dev, 2) == + IB_LINK_LAYER_ETHERNET))) { + ibdev->ib_dev.create_wq = mlx4_ib_create_wq; + ibdev->ib_dev.modify_wq = mlx4_ib_modify_wq; + ibdev->ib_dev.destroy_wq = mlx4_ib_destroy_wq; + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ); + } + if (!mlx4_is_slave(ibdev->dev)) { ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index a6337f3161cf..ba4e78c064d2 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -88,6 +88,8 @@ struct mlx4_ib_ucontext { struct list_head db_page_list; struct mutex db_page_mutex; struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT]; + struct list_head wqn_ranges_list; + struct mutex wqn_ranges_mutex; /* protect wqn_ranges_list */ }; struct mlx4_ib_pd { @@ -289,8 +291,19 @@ struct mlx4_roce_smac_vlan_info { int update_vid; }; +struct mlx4_wqn_range { + int base_wqn; + int size; + int refcount; + bool dirty; + struct list_head list; +}; + struct mlx4_ib_qp { - struct ib_qp ibqp; + union { + struct ib_qp ibqp; + struct ib_wq ibwq; + }; struct mlx4_qp mqp; struct mlx4_buf buf; @@ -329,6 +342,9 @@ struct mlx4_ib_qp { struct list_head cq_recv_list; struct list_head cq_send_list; struct counter_index *counter_index; + struct mlx4_wqn_range *wqn_range; + /* Number of RSS QP parents that uses this WQ */ + u32 rss_usecnt; }; struct mlx4_ib_srq { @@ -893,4 +909,11 @@ void mlx4_sched_ib_sl2vl_update_work(struct mlx4_ib_dev *ibdev, void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port); +struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_wq(struct 
ib_wq *wq); +int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); + #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 247b9132e9de..65e4ec549368 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -116,6 +116,11 @@ static const __be32 mlx4_ib_opcode[] = { [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), }; +enum mlx4_ib_source_type { + MLX4_IB_QP_SRC = 0, + MLX4_IB_RWQ_SRC = 1, +}; + static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) { return container_of(mqp, struct mlx4_ib_sqp, qp); @@ -330,6 +335,12 @@ static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) } } +static void mlx4_ib_wq_event(struct mlx4_qp *qp, enum mlx4_event type) +{ + pr_warn_ratelimited("Unexpected event type %d on WQ 0x%06x. Events are not supported for WQs\n", + type, qp->qpn); +} + static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) { /* @@ -639,7 +650,91 @@ static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev, qp->counter_index = NULL; } +/* + * This function allocates a WQN from a range which is consecutive and aligned + * to its size. In case the range is full, then it creates a new range and + * allocates WQN from it. The new range will be used for following allocations. 
+ */ +static int mlx4_ib_alloc_wqn(struct mlx4_ib_ucontext *context, + struct mlx4_ib_qp *qp, int range_size, int *wqn) +{ + struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device); + struct mlx4_wqn_range *range; + int err = 0; + + mutex_lock(&context->wqn_ranges_mutex); + + range = list_first_entry_or_null(&context->wqn_ranges_list, + struct mlx4_wqn_range, list); + + if (!range || (range->refcount == range->size) || range->dirty) { + range = kzalloc(sizeof(*range), GFP_KERNEL); + if (!range) { + err = -ENOMEM; + goto out; + } + + err = mlx4_qp_reserve_range(dev->dev, range_size, + range_size, &range->base_wqn, 0, + qp->mqp.usage); + if (err) { + kfree(range); + goto out; + } + + range->size = range_size; + list_add(&range->list, &context->wqn_ranges_list); + } else if (range_size != 1) { + /* + * Requesting a new range (>1) when last range is still open, is + * not valid. + */ + err = -EINVAL; + goto out; + } + + qp->wqn_range = range; + + *wqn = range->base_wqn + range->refcount; + + range->refcount++; + +out: + mutex_unlock(&context->wqn_ranges_mutex); + + return err; +} + +static void mlx4_ib_release_wqn(struct mlx4_ib_ucontext *context, + struct mlx4_ib_qp *qp, bool dirty_release) +{ + struct mlx4_ib_dev *dev = to_mdev(context->ibucontext.device); + struct mlx4_wqn_range *range; + + mutex_lock(&context->wqn_ranges_mutex); + + range = qp->wqn_range; + + range->refcount--; + if (!range->refcount) { + mlx4_qp_release_range(dev->dev, range->base_wqn, + range->size); + list_del(&range->list); + kfree(range); + } else if (dirty_release) { + /* + * A range which one of its WQNs is destroyed, won't be able to be + * reused for further WQN allocations. + * The next created WQ will allocate a new range. 
+ */ + range->dirty = 1; + } + + mutex_unlock(&context->wqn_ranges_mutex); +} + static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, + enum mlx4_ib_source_type src, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp) @@ -652,6 +747,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; struct mlx4_ib_cq *mcq; unsigned long flags; + int range_size = 0; /* When tunneling special qps, we use a plain UD qp */ if (sqpn) { @@ -728,27 +824,69 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (pd->uobject) { - struct mlx4_ib_create_qp ucmd; + union { + struct mlx4_ib_create_qp qp; + struct mlx4_ib_create_wq wq; + } ucmd; + size_t copy_len; + + copy_len = (src == MLX4_IB_QP_SRC) ? + sizeof(struct mlx4_ib_create_qp) : + min(sizeof(struct mlx4_ib_create_wq), udata->inlen); - if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) { + if (ib_copy_from_udata(&ucmd, udata, copy_len)) { err = -EFAULT; goto err; } - err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, - qp_has_rq(init_attr), qp, ucmd.inl_recv_sz); - if (err) - goto err; + if (src == MLX4_IB_RWQ_SRC) { + if (ucmd.wq.comp_mask || ucmd.wq.reserved1 || + ucmd.wq.reserved[0] || ucmd.wq.reserved[1] || + ucmd.wq.reserved[2]) { + pr_debug("user command isn't supported\n"); + err = -EOPNOTSUPP; + goto err; + } - qp->inl_recv_sz = ucmd.inl_recv_sz; - qp->sq_no_prefetch = ucmd.sq_no_prefetch; + if (ucmd.wq.log_range_size > + ilog2(dev->dev->caps.max_rss_tbl_sz)) { + pr_debug("WQN range size must be equal or smaller than %d\n", + dev->dev->caps.max_rss_tbl_sz); + err = -EOPNOTSUPP; + goto err; + } + range_size = 1 << ucmd.wq.log_range_size; + } else { + qp->inl_recv_sz = ucmd.qp.inl_recv_sz; + } - err = set_user_sq_size(dev, qp, &ucmd); + err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, + qp_has_rq(init_attr), qp, qp->inl_recv_sz); if 
(err) goto err; - qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, - qp->buf_size, 0, 0); + if (src == MLX4_IB_QP_SRC) { + qp->sq_no_prefetch = ucmd.qp.sq_no_prefetch; + + err = set_user_sq_size(dev, qp, + (struct mlx4_ib_create_qp *) + &ucmd); + if (err) + goto err; + } else { + qp->sq_no_prefetch = 1; + qp->sq.wqe_cnt = 1; + qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE; + /* Allocated buffer expects to have at least that SQ + * size. + */ + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << qp->sq.wqe_shift); + } + + qp->umem = ib_umem_get(pd->uobject->context, + (src == MLX4_IB_QP_SRC) ? ucmd.qp.buf_addr : + ucmd.wq.buf_addr, qp->buf_size, 0, 0); if (IS_ERR(qp->umem)) { err = PTR_ERR(qp->umem); goto err; @@ -765,7 +903,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, if (qp_has_rq(init_attr)) { err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context), - ucmd.db_addr, &qp->db); + (src == MLX4_IB_QP_SRC) ? ucmd.qp.db_addr : + ucmd.wq.db_addr, &qp->db); if (err) goto err_mtt; } @@ -853,6 +992,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err_wrid; } } + } else if (src == MLX4_IB_RWQ_SRC) { + err = mlx4_ib_alloc_wqn(to_mucontext(pd->uobject->context), qp, + range_size, &qpn); + if (err) + goto err_wrid; } else { /* Raw packet QPNs may not have bits 6,7 set in their qp_num; * otherwise, the WQE BlueFlame setup flow wrongly causes @@ -891,7 +1035,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, */ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); - qp->mqp.event = mlx4_ib_qp_event; + qp->mqp.event = (src == MLX4_IB_QP_SRC) ? 
mlx4_ib_qp_event : + mlx4_ib_wq_event; + if (!*caller_qp) *caller_qp = qp; @@ -918,6 +1064,9 @@ err_qpn: if (!sqpn) { if (qp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_free(dev, qpn, 1); + else if (src == MLX4_IB_RWQ_SRC) + mlx4_ib_release_wqn(to_mucontext(pd->uobject->context), + qp, 0); else mlx4_qp_release_range(dev->dev, qpn, 1); } @@ -1016,7 +1165,7 @@ static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp) return to_mpd(qp->ibqp.pd); } -static void get_cqs(struct mlx4_ib_qp *qp, +static void get_cqs(struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src, struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq) { switch (qp->ibqp.qp_type) { @@ -1029,14 +1178,16 @@ static void get_cqs(struct mlx4_ib_qp *qp, *recv_cq = *send_cq; break; default: - *send_cq = to_mcq(qp->ibqp.send_cq); - *recv_cq = to_mcq(qp->ibqp.recv_cq); + *recv_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.recv_cq) : + to_mcq(qp->ibwq.cq); + *send_cq = (src == MLX4_IB_QP_SRC) ? to_mcq(qp->ibqp.send_cq) : + *recv_cq; break; } } static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, - int is_user) + enum mlx4_ib_source_type src, int is_user) { struct mlx4_ib_cq *send_cq, *recv_cq; unsigned long flags; @@ -1069,7 +1220,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, } } - get_cqs(qp, &send_cq, &recv_cq); + get_cqs(qp, src, &send_cq, &recv_cq); spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx4_ib_lock_cqs(send_cq, recv_cq); @@ -1095,6 +1246,9 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) { if (qp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1); + else if (src == MLX4_IB_RWQ_SRC) + mlx4_ib_release_wqn(to_mucontext( + qp->ibwq.uobject->context), qp, 1); else mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); } @@ -1102,9 +1256,12 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, 
mlx4_mtt_cleanup(dev->dev, &qp->mtt); if (is_user) { - if (qp->rq.wqe_cnt) - mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context), - &qp->db); + if (qp->rq.wqe_cnt) { + struct mlx4_ib_ucontext *mcontext = !src ? + to_mucontext(qp->ibqp.uobject->context) : + to_mucontext(qp->ibwq.uobject->context); + mlx4_ib_db_unmap_user(mcontext, &qp->db); + } ib_umem_release(qp->umem); } else { kvfree(qp->sq.wrid); @@ -1200,8 +1357,8 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, /* fall through */ case IB_QPT_UD: { - err = create_qp_common(to_mdev(pd->device), pd, init_attr, - udata, 0, &qp); + err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC, + init_attr, udata, 0, &qp); if (err) { kfree(qp); return ERR_PTR(err); @@ -1231,8 +1388,8 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, sqpn = get_sqp_num(to_mdev(pd->device), init_attr); } - err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, - sqpn, &qp); + err = create_qp_common(to_mdev(pd->device), pd, MLX4_IB_QP_SRC, + init_attr, udata, sqpn, &qp); if (err) return ERR_PTR(err); @@ -1303,7 +1460,7 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp) mlx4_ib_free_qp_counter(dev, mqp); pd = get_pd(mqp); - destroy_qp_common(dev, mqp, !!pd->ibpd.uobject); + destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, !!pd->ibpd.uobject); if (is_sqp(dev, mqp)) kfree(to_msqp(mqp)); @@ -1626,12 +1783,15 @@ static u8 gid_type_to_qpc(enum ib_gid_type gid_type) } } -static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, +static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { - struct mlx4_ib_dev *dev = to_mdev(ibqp->device); - struct mlx4_ib_qp *qp = to_mqp(ibqp); + struct ib_uobject *ibuobject; + struct ib_srq *ibsrq; + enum ib_qp_type qp_type; + struct mlx4_ib_dev *dev; + struct mlx4_ib_qp *qp; struct mlx4_ib_pd *pd; struct mlx4_ib_cq *send_cq, *recv_cq; struct 
mlx4_qp_context *context; @@ -1641,6 +1801,28 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, int err = -EINVAL; int counter_index; + if (src_type == MLX4_IB_RWQ_SRC) { + struct ib_wq *ibwq; + + ibwq = (struct ib_wq *)src; + ibuobject = ibwq->uobject; + ibsrq = NULL; + qp_type = IB_QPT_RAW_PACKET; + qp = to_mqp((struct ib_qp *)ibwq); + dev = to_mdev(ibwq->device); + pd = to_mpd(ibwq->pd); + } else { + struct ib_qp *ibqp; + + ibqp = (struct ib_qp *)src; + ibuobject = ibqp->uobject; + ibsrq = ibqp->srq; + qp_type = ibqp->qp_type; + qp = to_mqp(ibqp); + dev = to_mdev(ibqp->device); + pd = get_pd(qp); + } + /* APM is not supported under RoCE */ if (attr_mask & IB_QP_ALT_PATH && rdma_port_get_link_layer(&dev->ib_dev, qp->port) == @@ -1674,11 +1856,11 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (qp->inl_recv_sz) context->param3 |= cpu_to_be32(1 << 25); - if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) + if (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI) context->mtu_msgmax = (IB_MTU_4096 << 5) | 11; - else if (ibqp->qp_type == IB_QPT_RAW_PACKET) + else if (qp_type == IB_QPT_RAW_PACKET) context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX; - else if (ibqp->qp_type == IB_QPT_UD) { + else if (qp_type == IB_QPT_UD) { if (qp->flags & MLX4_IB_QP_LSO) context->mtu_msgmax = (IB_MTU_4096 << 5) | ilog2(dev->dev->caps.max_gso_sz); @@ -1708,14 +1890,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { context->sq_size_stride |= !!qp->sq_no_prefetch << 7; context->xrcd = cpu_to_be32((u32) qp->xrcdn); - if (ibqp->qp_type == IB_QPT_RAW_PACKET) + if (qp_type == IB_QPT_RAW_PACKET) context->param3 |= cpu_to_be32(1 << 30); } - if (qp->ibqp.uobject) + if (ibuobject) context->usr_page = cpu_to_be32( mlx4_to_hw_uar_index(dev->dev, - to_mucontext(ibqp->uobject->context)->uar.index)); + to_mucontext(ibuobject->context) + ->uar.index)); else context->usr_page = cpu_to_be32( 
mlx4_to_hw_uar_index(dev->dev, dev->priv_uar.index)); @@ -1759,7 +1942,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, steer_qp = 1; } - if (ibqp->qp_type == IB_QPT_GSI) { + if (qp_type == IB_QPT_GSI) { enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ? IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE; u8 qpc_roce_mode = gid_type_to_qpc(gid_type); @@ -1776,7 +1959,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } if (attr_mask & IB_QP_AV) { - u8 port_num = mlx4_is_bonded(to_mdev(ibqp->device)->dev) ? 1 : + u8 port_num = mlx4_is_bonded(dev->dev) ? 1 : attr_mask & IB_QP_PORT ? attr->port_num : qp->port; union ib_gid gid; struct ib_gid_attr gid_attr = {.gid_type = IB_GID_TYPE_IB}; @@ -1791,7 +1974,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, int index = rdma_ah_read_grh(&attr->ah_attr)->sgid_index; - status = ib_get_cached_gid(ibqp->device, port_num, + status = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &gid_attr); if (!status && !memcmp(&gid, &zgid, sizeof(gid))) status = -ENOENT; @@ -1848,15 +2031,14 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; } - pd = get_pd(qp); - get_cqs(qp, &send_cq, &recv_cq); + get_cqs(qp, src_type, &send_cq, &recv_cq); context->pd = cpu_to_be32(pd->pdn); context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); /* Set "fast registration enabled" for all kernel QPs */ - if (!qp->ibqp.uobject) + if (!ibuobject) context->params1 |= cpu_to_be32(1 << 11); if (attr_mask & IB_QP_RNR_RETRY) { @@ -1891,7 +2073,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE; } - if (ibqp->srq) + if (ibsrq) context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC); if (attr_mask & IB_QP_MIN_RNR_TIMER) { @@ -1922,17 +2104,19 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= 
MLX4_QP_OPTPAR_Q_KEY; } - if (ibqp->srq) - context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn); + if (ibsrq) + context->srqn = cpu_to_be32(1 << 24 | + to_msrq(ibsrq)->msrq.srqn); - if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (qp->rq.wqe_cnt && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) context->db_rec_addr = cpu_to_be64(qp->db.dma); if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR && - (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI || - ibqp->qp_type == IB_QPT_UD || - ibqp->qp_type == IB_QPT_RAW_PACKET)) { + (qp_type == IB_QPT_GSI || qp_type == IB_QPT_SMI || + qp_type == IB_QPT_UD || qp_type == IB_QPT_RAW_PACKET)) { context->pri_path.sched_queue = (qp->port - 1) << 6; if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI || qp->mlx4_ib_qp_type & @@ -1965,7 +2149,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } - if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + if (qp_type == IB_QPT_RAW_PACKET) { context->pri_path.ackto = (context->pri_path.ackto & 0xf8) | MLX4_IB_LINK_TYPE_ETH; if (dev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { @@ -1975,7 +2159,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } - if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { + if (qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) { int is_eth = rdma_port_get_link_layer( &dev->ib_dev, qp->port) == IB_LINK_LAYER_ETHERNET; @@ -1985,14 +2169,15 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, } } - if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD && attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify) sqd_event = 1; else sqd_event = 0; - if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (!ibuobject && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) context->rlkey_roce_mode |= (1 << 4); /* @@ -2001,7 +2186,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, * headroom is stamped so that the 
hardware doesn't start * processing stale work requests. */ - if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + if (!ibuobject && + cur_state == IB_QPS_RESET && + new_state == IB_QPS_INIT) { struct mlx4_wqe_ctrl_seg *ctrl; int i; @@ -2058,9 +2245,9 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET) { - if (!ibqp->uobject) { + if (!ibuobject) { mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, - ibqp->srq ? to_msrq(ibqp->srq) : NULL); + ibsrq ? to_msrq(ibsrq) : NULL); if (send_cq != recv_cq) mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); @@ -2265,7 +2452,8 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } - err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask, + cur_state, new_state); if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) attr->port_num = 1; @@ -3550,3 +3738,208 @@ out: return err; } +struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev; + struct ib_qp_init_attr ib_qp_init_attr; + struct mlx4_ib_qp *qp; + struct mlx4_ib_create_wq ucmd; + int err, required_cmd_sz; + + if (!(udata && pd->uobject)) + return ERR_PTR(-EINVAL); + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) { + pr_debug("invalid inlen\n"); + return ERR_PTR(-EINVAL); + } + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + pr_debug("inlen is not supported\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (udata->outlen) + return ERR_PTR(-EOPNOTSUPP); + + dev = to_mdev(pd->device); + + if (init_attr->wq_type != IB_WQT_RQ) { + pr_debug("unsupported wq type %d\n", init_attr->wq_type); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->create_flags) { + 
pr_debug("unsupported create_flags %u\n", + init_attr->create_flags); + return ERR_PTR(-EOPNOTSUPP); + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + + memset(&ib_qp_init_attr, 0, sizeof(ib_qp_init_attr)); + ib_qp_init_attr.qp_context = init_attr->wq_context; + ib_qp_init_attr.qp_type = IB_QPT_RAW_PACKET; + ib_qp_init_attr.cap.max_recv_wr = init_attr->max_wr; + ib_qp_init_attr.cap.max_recv_sge = init_attr->max_sge; + ib_qp_init_attr.recv_cq = init_attr->cq; + ib_qp_init_attr.send_cq = ib_qp_init_attr.recv_cq; /* Dummy CQ */ + + err = create_qp_common(dev, pd, MLX4_IB_RWQ_SRC, &ib_qp_init_attr, + udata, 0, &qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibwq.event_handler = init_attr->event_handler; + qp->ibwq.wq_num = qp->mqp.qpn; + qp->ibwq.state = IB_WQS_RESET; + + return &qp->ibwq; +} + +static int ib_wq2qp_state(enum ib_wq_state state) +{ + switch (state) { + case IB_WQS_RESET: + return IB_QPS_RESET; + case IB_WQS_RDY: + return IB_QPS_RTR; + default: + return IB_QPS_ERR; + } +} + +static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state) +{ + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + enum ib_qp_state qp_cur_state; + enum ib_qp_state qp_new_state; + int attr_mask; + int err; + + /* ib_qp.state represents the WQ HW state while ib_wq.state represents + * the WQ logic state. 
+ */ + qp_cur_state = qp->state; + qp_new_state = ib_wq2qp_state(new_state); + + if (ib_wq2qp_state(new_state) == qp_cur_state) + return 0; + + if (new_state == IB_WQS_RDY) { + struct ib_qp_attr attr = {}; + + attr.port_num = qp->port; + attr_mask = IB_QP_PORT; + + err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, &attr, + attr_mask, IB_QPS_RESET, IB_QPS_INIT); + if (err) { + pr_debug("WQN=0x%06x failed to apply RST->INIT on the HW QP\n", + ibwq->wq_num); + return err; + } + + qp_cur_state = IB_QPS_INIT; + } + + attr_mask = 0; + err = __mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, attr_mask, + qp_cur_state, qp_new_state); + + if (err && (qp_cur_state == IB_QPS_INIT)) { + qp_new_state = IB_QPS_RESET; + if (__mlx4_ib_modify_qp(ibwq, MLX4_IB_RWQ_SRC, NULL, + attr_mask, IB_QPS_INIT, IB_QPS_RESET)) { + pr_warn("WQN=0x%06x failed with reverting HW's resources failure\n", + ibwq->wq_num); + qp_new_state = IB_QPS_INIT; + } + } + + qp->state = qp_new_state; + + return err; +} + +int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + struct mlx4_ib_modify_wq ucmd = {}; + size_t required_cmd_sz; + enum ib_wq_state cur_state, new_state; + int err = 0; + + required_cmd_sz = offsetof(typeof(ucmd), reserved) + + sizeof(ucmd.reserved); + if (udata->inlen < required_cmd_sz) + return -EINVAL; + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EOPNOTSUPP; + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) + return -EFAULT; + + if (ucmd.comp_mask || ucmd.reserved) + return -EOPNOTSUPP; + + if (wq_attr_mask & IB_WQ_FLAGS) + return -EOPNOTSUPP; + + cur_state = wq_attr_mask & IB_WQ_CUR_STATE ? wq_attr->curr_wq_state : + ibwq->state; + new_state = wq_attr_mask & IB_WQ_STATE ? 
wq_attr->wq_state : cur_state; + + if (cur_state < IB_WQS_RESET || cur_state > IB_WQS_ERR || + new_state < IB_WQS_RESET || new_state > IB_WQS_ERR) + return -EINVAL; + + if ((new_state == IB_WQS_RDY) && (cur_state == IB_WQS_ERR)) + return -EINVAL; + + if ((new_state == IB_WQS_ERR) && (cur_state == IB_WQS_RESET)) + return -EINVAL; + + /* Can update HW state only if a RSS QP has already associated to this + * WQ, so we can apply its port on the WQ. + */ + if (qp->rss_usecnt) + err = _mlx4_ib_modify_wq(ibwq, new_state); + + if (!err) + ibwq->state = new_state; + + return err; +} + +int mlx4_ib_destroy_wq(struct ib_wq *ibwq) +{ + struct mlx4_ib_dev *dev = to_mdev(ibwq->device); + struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq); + + if (qp->counter_index) + mlx4_ib_free_qp_counter(dev, qp); + + destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, 1); + + kfree(qp); + + return 0; +} diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index bf3bdba2f326..c9702a5f0bda 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -105,4 +105,18 @@ struct mlx4_ib_create_qp { __u8 reserved; }; +struct mlx4_ib_create_wq { + __u64 buf_addr; + __u64 db_addr; + __u8 log_range_size; + __u8 reserved[3]; + __u32 comp_mask; + __u32 reserved1; +}; + +struct mlx4_ib_modify_wq { + __u32 comp_mask; + __u32 reserved; +}; + #endif /* MLX4_ABI_USER_H */ -- cgit v1.2.3 From b8d46ca035060e70f5f0da849d86720752d5aa17 Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Tue, 4 Jul 2017 16:24:25 +0300 Subject: IB/mlx4: Add support for WQ indirection table related verbs To enable RSS functionality the IB indirection table object (i.e. ib_rwq_ind_table) should be used. This patch implements the related verbs as of create and destroy an indirection table. In downstream patches the indirection table will be used as part of RSS QP creation. 
Signed-off-by: Guy Levi Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 12 +++++-- drivers/infiniband/hw/mlx4/mlx4_ib.h | 6 ++++ drivers/infiniband/hw/mlx4/qp.c | 70 ++++++++++++++++++++++++++++++++++++ include/uapi/rdma/mlx4-abi.h | 4 +++ 4 files changed, 89 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 7c6f929ebd3e..b42234571a8c 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2738,10 +2738,16 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.create_wq = mlx4_ib_create_wq; ibdev->ib_dev.modify_wq = mlx4_ib_modify_wq; ibdev->ib_dev.destroy_wq = mlx4_ib_destroy_wq; + ibdev->ib_dev.create_rwq_ind_table = + mlx4_ib_create_rwq_ind_table; + ibdev->ib_dev.destroy_rwq_ind_table = + mlx4_ib_destroy_rwq_ind_table; ibdev->ib_dev.uverbs_ex_cmd_mask |= - (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | - (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | - (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ); + (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); } if (!mlx4_is_slave(ibdev->dev)) { diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index ba4e78c064d2..85525bc400a0 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -916,4 +916,10 @@ int mlx4_ib_destroy_wq(struct ib_wq *wq); int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, u32 wq_attr_mask, struct ib_udata *udata); +struct ib_rwq_ind_table +*mlx4_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); +int mlx4_ib_destroy_rwq_ind_table(struct 
ib_rwq_ind_table *wq_ind_table); + #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 65e4ec549368..519919d15474 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -3943,3 +3943,73 @@ int mlx4_ib_destroy_wq(struct ib_wq *ibwq) return 0; } + +struct ib_rwq_ind_table +*mlx4_ib_create_rwq_ind_table(struct ib_device *device, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + struct ib_rwq_ind_table *rwq_ind_table; + struct mlx4_ib_create_rwq_ind_tbl_resp resp = {}; + unsigned int ind_tbl_size = 1 << init_attr->log_ind_tbl_size; + unsigned int base_wqn; + size_t min_resp_len; + int i; + int err; + + if (udata->inlen > 0 && + !ib_is_udata_cleared(udata, 0, + udata->inlen)) + return ERR_PTR(-EOPNOTSUPP); + + min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved); + if (udata->outlen && udata->outlen < min_resp_len) + return ERR_PTR(-EINVAL); + + if (ind_tbl_size > + device->attrs.rss_caps.max_rwq_indirection_table_size) { + pr_debug("log_ind_tbl_size = %d is bigger than supported = %d\n", + ind_tbl_size, + device->attrs.rss_caps.max_rwq_indirection_table_size); + return ERR_PTR(-EINVAL); + } + + base_wqn = init_attr->ind_tbl[0]->wq_num; + + if (base_wqn % ind_tbl_size) { + pr_debug("WQN=0x%x isn't aligned with indirection table size\n", + base_wqn); + return ERR_PTR(-EINVAL); + } + + for (i = 1; i < ind_tbl_size; i++) { + if (++base_wqn != init_attr->ind_tbl[i]->wq_num) { + pr_debug("indirection table's WQNs aren't consecutive\n"); + return ERR_PTR(-EINVAL); + } + } + + rwq_ind_table = kzalloc(sizeof(*rwq_ind_table), GFP_KERNEL); + if (!rwq_ind_table) + return ERR_PTR(-ENOMEM); + + if (udata->outlen) { + resp.response_length = offsetof(typeof(resp), response_length) + + sizeof(resp.response_length); + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) + goto err; + } + + return rwq_ind_table; + +err: + 
kfree(rwq_ind_table); + return ERR_PTR(err); +} + +int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + kfree(ib_rwq_ind_tbl); + return 0; +} diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index c9702a5f0bda..5591d955ba00 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -119,4 +119,8 @@ struct mlx4_ib_modify_wq { __u32 reserved; }; +struct mlx4_ib_create_rwq_ind_tbl_resp { + __u32 response_length; + __u32 reserved; +}; #endif /* MLX4_ABI_USER_H */ -- cgit v1.2.3 From 3078f5f1bd8b6c8aef77b8ef4d49671fa6eb058e Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Tue, 4 Jul 2017 16:24:26 +0300 Subject: IB/mlx4: Add support for RSS QP Add support to work with a RSS QP by using an indirection table object upon QP creation. Other related QP verbs (e.g. modify/destroy/query) were updated as well for that QP mode. Notes: - The RX hash properties are supplied as driver private data. - The RSS QP port is used on the associated WQs in its indirection table. Applying different ports during WQ life time is not allowed. - The expected RSS QP flow is: create, modify(RST->INIT), modify(RST->RTR), destroy. 
Signed-off-by: Guy Levi Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/mlx4_ib.h | 8 + drivers/infiniband/hw/mlx4/qp.c | 453 +++++++++++++++++++++++++++++++++-- include/uapi/rdma/mlx4-abi.h | 33 +++ 3 files changed, 472 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 85525bc400a0..1fa19820355a 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -46,6 +46,7 @@ #include #include +#include #define MLX4_IB_DRV_NAME "mlx4_ib" @@ -299,6 +300,12 @@ struct mlx4_wqn_range { struct list_head list; }; +struct mlx4_ib_rss { + unsigned int base_qpn_tbl_sz; + u8 flags; + u8 rss_key[MLX4_EN_RSS_KEY_SIZE]; +}; + struct mlx4_ib_qp { union { struct ib_qp ibqp; @@ -345,6 +352,7 @@ struct mlx4_ib_qp { struct mlx4_wqn_range *wqn_range; /* Number of RSS QP parents that uses this WQ */ u32 rss_usecnt; + struct mlx4_ib_rss *rss_ctx; }; struct mlx4_ib_srq { diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 519919d15474..e42acfb20588 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -53,6 +53,7 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq); static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq); +static int _mlx4_ib_modify_wq(struct ib_wq *ibwq, enum ib_wq_state new_state); enum { MLX4_IB_ACK_REQ_FREQ = 8, @@ -650,6 +651,212 @@ static void mlx4_ib_free_qp_counter(struct mlx4_ib_dev *dev, qp->counter_index = NULL; } +static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx, + struct ib_qp_init_attr *init_attr, + struct mlx4_ib_create_qp_rss *ucmd) +{ + rss_ctx->base_qpn_tbl_sz = init_attr->rwq_ind_tbl->ind_tbl[0]->wq_num | + (init_attr->rwq_ind_tbl->log_ind_tbl_size << 24); + + if ((ucmd->rx_hash_function == 
MLX4_IB_RX_HASH_FUNC_TOEPLITZ) && + (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP)) { + memcpy(rss_ctx->rss_key, ucmd->rx_hash_key, + MLX4_EN_RSS_KEY_SIZE); + } else { + pr_debug("RX Hash function is not supported\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) { + rss_ctx->flags = MLX4_RSS_IPV4; + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV4) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV4)) { + pr_debug("RX Hash fields_mask is not supported - both IPv4 SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) { + rss_ctx->flags |= MLX4_RSS_IPV6; + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_IPV6) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_IPV6)) { + pr_debug("RX Hash fields_mask is not supported - both IPv6 SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) && + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) { + if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS)) { + pr_debug("RX Hash fields_mask for UDP is not supported\n"); + return (-EOPNOTSUPP); + } + + if (rss_ctx->flags & MLX4_RSS_IPV4) { + rss_ctx->flags |= MLX4_RSS_UDP_IPV4; + } else if (rss_ctx->flags & MLX4_RSS_IPV6) { + rss_ctx->flags |= MLX4_RSS_UDP_IPV6; + } else { + pr_debug("RX Hash fields_mask is not supported - UDP must be set with IPv4 or IPv6\n"); + return (-EOPNOTSUPP); + } + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_UDP) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_UDP)) { + pr_debug("RX Hash fields_mask is not supported - both UDP SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) && + 
(ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) { + if (rss_ctx->flags & MLX4_RSS_IPV4) { + rss_ctx->flags |= MLX4_RSS_TCP_IPV4; + } else if (rss_ctx->flags & MLX4_RSS_IPV6) { + rss_ctx->flags |= MLX4_RSS_TCP_IPV6; + } else { + pr_debug("RX Hash fields_mask is not supported - TCP must be set with IPv4 or IPv6\n"); + return (-EOPNOTSUPP); + } + + } else if ((ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_SRC_PORT_TCP) || + (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_DST_PORT_TCP)) { + pr_debug("RX Hash fields_mask is not supported - both TCP SRC and DST must be set\n"); + return (-EOPNOTSUPP); + } + + return 0; +} + +static int create_qp_rss(struct mlx4_ib_dev *dev, struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct mlx4_ib_create_qp_rss *ucmd, + struct mlx4_ib_qp *qp) +{ + int qpn; + int err; + + qp->mqp.usage = MLX4_RES_USAGE_USER_VERBS; + + err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn, 0, qp->mqp.usage); + if (err) + return err; + + err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp); + if (err) + goto err_qpn; + + mutex_init(&qp->mutex); + + INIT_LIST_HEAD(&qp->gid_list); + INIT_LIST_HEAD(&qp->steering_rules); + + qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_ETHERTYPE; + qp->state = IB_QPS_RESET; + + /* Set dummy send resources to be compatible with HV and PRM */ + qp->sq_no_prefetch = 1; + qp->sq.wqe_cnt = 1; + qp->sq.wqe_shift = MLX4_IB_MIN_SQ_STRIDE; + qp->buf_size = qp->sq.wqe_cnt << MLX4_IB_MIN_SQ_STRIDE; + qp->mtt = (to_mqp( + (struct ib_qp *)init_attr->rwq_ind_tbl->ind_tbl[0]))->mtt; + + qp->rss_ctx = kzalloc(sizeof(*qp->rss_ctx), GFP_KERNEL); + if (!qp->rss_ctx) { + err = -ENOMEM; + goto err_qp_alloc; + } + + err = set_qp_rss(dev, qp->rss_ctx, init_attr, ucmd); + if (err) + goto err; + + return 0; + +err: + kfree(qp->rss_ctx); + +err_qp_alloc: + mlx4_qp_remove(dev->dev, &qp->mqp); + mlx4_qp_free(dev->dev, &qp->mqp); + +err_qpn: + mlx4_qp_release_range(dev->dev, qpn, 1); + return err; +} + +static struct ib_qp *_mlx4_ib_create_qp_rss(struct 
ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mlx4_ib_qp *qp; + struct mlx4_ib_create_qp_rss ucmd = {}; + size_t required_cmd_sz; + int err; + + if (!udata) { + pr_debug("RSS QP with NULL udata\n"); + return ERR_PTR(-EINVAL); + } + + if (udata->outlen) + return ERR_PTR(-EOPNOTSUPP); + + required_cmd_sz = offsetof(typeof(ucmd), reserved1) + + sizeof(ucmd.reserved1); + if (udata->inlen < required_cmd_sz) { + pr_debug("invalid inlen\n"); + return ERR_PTR(-EINVAL); + } + + if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) { + pr_debug("copy failed\n"); + return ERR_PTR(-EFAULT); + } + + if (ucmd.comp_mask || ucmd.reserved1) + return ERR_PTR(-EOPNOTSUPP); + + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) { + pr_debug("inlen is not supported\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + pr_debug("RSS QP with unsupported QP type %d\n", + init_attr->qp_type); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->create_flags) { + pr_debug("RSS QP doesn't support create flags\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + if (init_attr->send_cq || init_attr->cap.max_send_wr) { + pr_debug("RSS QP with unsupported send attributes\n"); + return ERR_PTR(-EOPNOTSUPP); + } + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->pri.vid = 0xFFFF; + qp->alt.vid = 0xFFFF; + + err = create_qp_rss(to_mdev(pd->device), pd, init_attr, &ucmd, qp); + if (err) { + kfree(qp); + return ERR_PTR(err); + } + + qp->ibqp.qp_num = qp->mqp.qpn; + + return &qp->ibqp; +} + /* * This function allocates a WQN from a range which is consecutive and aligned * to its size. 
In case the range is full, then it creates a new range and @@ -1186,6 +1393,36 @@ static void get_cqs(struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src, } } +static void destroy_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) +{ + if (qp->state != IB_QPS_RESET) { + int i; + + for (i = 0; i < (1 << qp->ibqp.rwq_ind_tbl->log_ind_tbl_size); + i++) { + struct ib_wq *ibwq = qp->ibqp.rwq_ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } + + if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), + MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) + pr_warn("modify QP %06x to RESET failed.\n", + qp->mqp.qpn); + } + + mlx4_qp_remove(dev->dev, &qp->mqp); + mlx4_qp_free(dev->dev, &qp->mqp); + mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1); + del_gid_entries(qp); + kfree(qp->rss_ctx); +} + static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, enum mlx4_ib_source_type src, int is_user) { @@ -1303,6 +1540,9 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK; u16 xrcdn = 0; + if (init_attr->rwq_ind_tbl) + return _mlx4_ib_create_qp_rss(pd, init_attr, udata); + /* * We only support LSO, vendor flag1, and multicast loopback blocking, * and only for kernel UD QPs. 
@@ -1444,7 +1684,6 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp) { struct mlx4_ib_dev *dev = to_mdev(qp->device); struct mlx4_ib_qp *mqp = to_mqp(qp); - struct mlx4_ib_pd *pd; if (is_qp0(dev, mqp)) mlx4_CLOSE_PORT(dev->dev, mqp->port); @@ -1459,8 +1698,14 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp) if (mqp->counter_index) mlx4_ib_free_qp_counter(dev, mqp); - pd = get_pd(mqp); - destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, !!pd->ibpd.uobject); + if (qp->rwq_ind_tbl) { + destroy_qp_rss(dev, mqp); + } else { + struct mlx4_ib_pd *pd; + + pd = get_pd(mqp); + destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, !!pd->ibpd.uobject); + } if (is_sqp(dev, mqp)) kfree(to_msqp(mqp)); @@ -1783,12 +2028,116 @@ static u8 gid_type_to_qpc(enum ib_gid_type gid_type) } } +/* + * Go over all RSS QP's childes (WQs) and apply their HW state according to + * their logic state if the RSS QP is the first RSS QP associated for the WQ. + */ +static int bringup_rss_rwqs(struct ib_rwq_ind_table *ind_tbl, u8 port_num) +{ + int i; + int err; + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + /* Mlx4_ib restrictions: + * WQ's is associated to a port according to the RSS QP it is + * associates to. + * In case the WQ is associated to a different port by another + * RSS QP, return a failure. 
+ */ + if ((wq->rss_usecnt > 0) && (wq->port != port_num)) { + err = -EINVAL; + mutex_unlock(&wq->mutex); + break; + } + wq->port = port_num; + if ((wq->rss_usecnt == 0) && (ibwq->state == IB_WQS_RDY)) { + err = _mlx4_ib_modify_wq(ibwq, IB_WQS_RDY); + if (err) { + mutex_unlock(&wq->mutex); + break; + } + } + wq->rss_usecnt++; + + mutex_unlock(&wq->mutex); + } + + if (i && err) { + int j; + + for (j = (i - 1); j >= 0; j--) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[j]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + if ((wq->rss_usecnt == 1) && + (ibwq->state == IB_WQS_RDY)) + if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET)) + pr_warn("failed to reverse WQN=0x%06x\n", + ibwq->wq_num); + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } + } + + return err; +} + +static void bring_down_rss_rwqs(struct ib_rwq_ind_table *ind_tbl) +{ + int i; + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + struct ib_wq *ibwq = ind_tbl->ind_tbl[i]; + struct mlx4_ib_qp *wq = to_mqp((struct ib_qp *)ibwq); + + mutex_lock(&wq->mutex); + + if ((wq->rss_usecnt == 1) && (ibwq->state == IB_WQS_RDY)) + if (_mlx4_ib_modify_wq(ibwq, IB_WQS_RESET)) + pr_warn("failed to reverse WQN=%x\n", + ibwq->wq_num); + wq->rss_usecnt--; + + mutex_unlock(&wq->mutex); + } +} + +static void fill_qp_rss_context(struct mlx4_qp_context *context, + struct mlx4_ib_qp *qp) +{ + struct mlx4_rss_context *rss_context; + + rss_context = (void *)context + offsetof(struct mlx4_qp_context, + pri_path) + MLX4_RSS_OFFSET_IN_QPC_PRI_PATH; + + rss_context->base_qpn = cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz); + rss_context->default_qpn = + cpu_to_be32(qp->rss_ctx->base_qpn_tbl_sz & 0xffffff); + if (qp->rss_ctx->flags & (MLX4_RSS_UDP_IPV4 | MLX4_RSS_UDP_IPV6)) + rss_context->base_qpn_udp = rss_context->default_qpn; + rss_context->flags = qp->rss_ctx->flags; + /* Currently support just toeplitz */ + rss_context->hash_fn = MLX4_RSS_HASH_TOP; + + memcpy(rss_context->rss_key, 
qp->rss_ctx->rss_key, + MLX4_EN_RSS_KEY_SIZE); +} + static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { struct ib_uobject *ibuobject; struct ib_srq *ibsrq; + struct ib_rwq_ind_table *rwq_ind_tbl; enum ib_qp_type qp_type; struct mlx4_ib_dev *dev; struct mlx4_ib_qp *qp; @@ -1804,23 +2153,25 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, if (src_type == MLX4_IB_RWQ_SRC) { struct ib_wq *ibwq; - ibwq = (struct ib_wq *)src; - ibuobject = ibwq->uobject; - ibsrq = NULL; - qp_type = IB_QPT_RAW_PACKET; - qp = to_mqp((struct ib_qp *)ibwq); - dev = to_mdev(ibwq->device); - pd = to_mpd(ibwq->pd); + ibwq = (struct ib_wq *)src; + ibuobject = ibwq->uobject; + ibsrq = NULL; + rwq_ind_tbl = NULL; + qp_type = IB_QPT_RAW_PACKET; + qp = to_mqp((struct ib_qp *)ibwq); + dev = to_mdev(ibwq->device); + pd = to_mpd(ibwq->pd); } else { struct ib_qp *ibqp; - ibqp = (struct ib_qp *)src; - ibuobject = ibqp->uobject; - ibsrq = ibqp->srq; - qp_type = ibqp->qp_type; - qp = to_mqp(ibqp); - dev = to_mdev(ibqp->device); - pd = get_pd(qp); + ibqp = (struct ib_qp *)src; + ibuobject = ibqp->uobject; + ibsrq = ibqp->srq; + rwq_ind_tbl = ibqp->rwq_ind_tbl; + qp_type = ibqp->qp_type; + qp = to_mqp(ibqp); + dev = to_mdev(ibqp->device); + pd = get_pd(qp); } /* APM is not supported under RoCE */ @@ -1836,6 +2187,11 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) | (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16)); + if (rwq_ind_tbl) { + fill_qp_rss_context(context, qp); + context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET); + } + if (!(attr_mask & IB_QP_PATH_MIG_STATE)) context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11); else { @@ -1876,9 +2232,11 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, 
ilog2(dev->dev->caps.max_msg_sz); } - if (qp->rq.wqe_cnt) - context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; - context->rq_size_stride |= qp->rq.wqe_shift - 4; + if (!rwq_ind_tbl) { /* PRM RSS receive side should be left zeros */ + if (qp->rq.wqe_cnt) + context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3; + context->rq_size_stride |= qp->rq.wqe_shift - 4; + } if (qp->sq.wqe_cnt) context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3; @@ -2031,8 +2389,14 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type, optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH; } - get_cqs(qp, src_type, &send_cq, &recv_cq); - context->pd = cpu_to_be32(pd->pdn); + context->pd = cpu_to_be32(pd->pdn); + + if (!rwq_ind_tbl) { + get_cqs(qp, src_type, &send_cq, &recv_cq); + } else { /* Set dummy CQs to be compatible with HV and PRM */ + send_cq = to_mcq(rwq_ind_tbl->ind_tbl[0]->cq); + recv_cq = send_cq; + } context->cqn_send = cpu_to_be32(send_cq->mcq.cqn); context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn); context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28); @@ -2358,6 +2722,11 @@ out: return err; } +enum { + MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK = (IB_QP_STATE | + IB_QP_PORT), +}; + static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { @@ -2388,6 +2757,27 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } + if (ibqp->rwq_ind_tbl) { + if (!(((cur_state == IB_QPS_RESET) && + (new_state == IB_QPS_INIT)) || + ((cur_state == IB_QPS_INIT) && + (new_state == IB_QPS_RTR)))) { + pr_debug("qpn 0x%x: RSS QP unsupported transition %d to %d\n", + ibqp->qp_num, cur_state, new_state); + + err = -EOPNOTSUPP; + goto out; + } + + if (attr_mask & ~MLX4_IB_MODIFY_QP_RSS_SUP_ATTR_MSK) { + pr_debug("qpn 0x%x: RSS QP unsupported attribute mask 0x%x for transition %d to %d\n", + ibqp->qp_num, attr_mask, cur_state, new_state); + + err = -EOPNOTSUPP; + goto out; + } + } + if 
(mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) { if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) { if ((ibqp->qp_type == IB_QPT_RC) || @@ -2452,9 +2842,18 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } + if (ibqp->rwq_ind_tbl && (new_state == IB_QPS_INIT)) { + err = bringup_rss_rwqs(ibqp->rwq_ind_tbl, attr->port_num); + if (err) + goto out; + } + err = __mlx4_ib_modify_qp(ibqp, MLX4_IB_QP_SRC, attr, attr_mask, cur_state, new_state); + if (ibqp->rwq_ind_tbl && err) + bring_down_rss_rwqs(ibqp->rwq_ind_tbl); + if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) attr->port_num = 1; @@ -3643,6 +4042,9 @@ int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr int mlx4_state; int err = 0; + if (ibqp->rwq_ind_tbl) + return -EOPNOTSUPP; + mutex_lock(&qp->mutex); if (qp->state == IB_QPS_RESET) { @@ -3917,6 +4319,11 @@ int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr, if ((new_state == IB_WQS_ERR) && (cur_state == IB_WQS_RESET)) return -EINVAL; + /* Need to protect against the parent RSS which also may modify WQ + * state. + */ + mutex_lock(&qp->mutex); + /* Can update HW state only if a RSS QP has already associated to this * WQ, so we can apply its port on the WQ. 
*/ @@ -3926,6 +4333,8 @@ int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr, if (!err) ibwq->state = new_state; + mutex_unlock(&qp->mutex); + return err; } diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index 5591d955ba00..d915cab37ec3 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -95,6 +95,16 @@ struct mlx4_ib_create_srq_resp { __u32 reserved; }; +struct mlx4_ib_create_qp_rss { + __u64 rx_hash_fields_mask; + __u8 rx_hash_function; + __u8 rx_key_len; + __u8 reserved[6]; + __u8 rx_hash_key[40]; + __u32 comp_mask; + __u32 reserved1; +}; + struct mlx4_ib_create_qp { __u64 buf_addr; __u64 db_addr; @@ -123,4 +133,27 @@ struct mlx4_ib_create_rwq_ind_tbl_resp { __u32 response_length; __u32 reserved; }; + +/* RX Hash function flags */ +enum mlx4_ib_rx_hash_function_flags { + MLX4_IB_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +/* + * RX Hash flags, these flags allows to set which incoming packet's field should + * participates in RX Hash. Each flag represent certain packet's field, + * when the flag is set the field that is represented by the flag will + * participate in RX Hash calculation. + */ +enum mlx4_ib_rx_hash_fields { + MLX4_IB_RX_HASH_SRC_IPV4 = 1 << 0, + MLX4_IB_RX_HASH_DST_IPV4 = 1 << 1, + MLX4_IB_RX_HASH_SRC_IPV6 = 1 << 2, + MLX4_IB_RX_HASH_DST_IPV6 = 1 << 3, + MLX4_IB_RX_HASH_SRC_PORT_TCP = 1 << 4, + MLX4_IB_RX_HASH_DST_PORT_TCP = 1 << 5, + MLX4_IB_RX_HASH_SRC_PORT_UDP = 1 << 6, + MLX4_IB_RX_HASH_DST_PORT_UDP = 1 << 7 +}; + #endif /* MLX4_ABI_USER_H */ -- cgit v1.2.3 From ad84dad2160d5f36bb471b391462d651c887d693 Mon Sep 17 00:00:00 2001 From: "Amrani, Ram" Date: Mon, 26 Jun 2017 19:05:05 +0300 Subject: RDMA/qedr: notify user application if DPM is supported Direct Packet Mode support may be disabled, e.g, due to limited resources. Notifying the user application prevents wasting cycles on attempting to send these kind of packets. 
Signed-off-by: Ram Amrani Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/main.c | 1 + drivers/infiniband/hw/qedr/qedr.h | 2 ++ drivers/infiniband/hw/qedr/verbs.c | 1 + include/uapi/rdma/qedr-abi.h | 1 + 4 files changed, 5 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 0ae30f5c8cbc..199b6edbef92 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -777,6 +777,7 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev, if (rc) goto init_err; + dev->user_dpm_enabled = dev_info.user_dpm_enabled; dev->num_hwfns = dev_info.common.num_hwfns; dev->rdma_ctx = dev->ops->rdma_get_rdma_ctx(cdev); diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index 392e76e26840..b2bb42e2805d 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -162,6 +162,8 @@ struct qedr_dev { struct qedr_qp *gsi_qp; unsigned long enet_state; + + u8 user_dpm_enabled; }; #define QEDR_MAX_SQ_PBL (0x8000) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 2ae71b8f1ba8..4322ee00498e 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -376,6 +376,7 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev, memset(&uresp, 0, sizeof(uresp)); + uresp.dpm_enabled = dev->user_dpm_enabled; uresp.db_pa = ctx->dpi_phys_addr; uresp.db_size = ctx->dpi_size; uresp.max_send_wr = dev->attr.max_sqe; diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index 75c270d839c8..2684004ec4fd 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -49,6 +49,7 @@ struct qedr_alloc_ucontext_resp { __u32 sges_per_recv_wr; __u32 sges_per_srq_wr; __u32 max_cqes; + __u8 dpm_enabled; }; struct qedr_alloc_pd_ureq { -- cgit v1.2.3 From 67cbe3532c2cd84303a2073cedad6b8bcad13be3 Mon Sep 17 00:00:00 
2001 From: "Amrani, Ram" Date: Mon, 26 Jun 2017 19:05:06 +0300 Subject: RDMA/qedr: notify user application of supported WIDs The number of supported WIDs, if they are supported at all, can be limited due to resources. Notifying the user space application of the number of available WIDs allows it to utilize them correctly. Signed-off-by: Ram Amrani Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/verbs.c | 2 ++ include/uapi/rdma/qedr-abi.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 4322ee00498e..9ee2dce3e5bb 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -377,6 +377,8 @@ struct ib_ucontext *qedr_alloc_ucontext(struct ib_device *ibdev, memset(&uresp, 0, sizeof(uresp)); uresp.dpm_enabled = dev->user_dpm_enabled; + uresp.wids_enabled = 1; + uresp.wid_count = oparams.wid_count; uresp.db_pa = ctx->dpi_phys_addr; uresp.db_size = ctx->dpi_size; uresp.max_send_wr = dev->attr.max_sqe; diff --git a/include/uapi/rdma/qedr-abi.h b/include/uapi/rdma/qedr-abi.h index 2684004ec4fd..54b64357ab24 100644 --- a/include/uapi/rdma/qedr-abi.h +++ b/include/uapi/rdma/qedr-abi.h @@ -50,6 +50,8 @@ struct qedr_alloc_ucontext_resp { __u32 sges_per_srq_wr; __u32 max_cqes; __u8 dpm_enabled; + __u8 wids_enabled; + __u16 wid_count; }; struct qedr_alloc_pd_ureq { -- cgit v1.2.3 From bf90aadd630c2c9f7f965ba1e90d41b5b46db7c9 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Mon, 24 Jul 2017 07:46:12 -0700 Subject: IB/hfi1: Send MAD traps until repressed A trap should be sent to the FM until the FM sends a repress message. This is in line with the IBTA 13.4.9. Add the ability to resend traps until a repress message is received. Reviewed-by: Dennis Dalessandro Reviewed-by: Michael N. Henry Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 373 ++++++++++++++++++++++++++++--------- drivers/infiniband/hw/hfi1/mad.h | 3 +- drivers/infiniband/hw/hfi1/verbs.c | 5 + include/rdma/rdma_vt.h | 17 ++ 4 files changed, 310 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index a081a98d728a..0a3e2dfdf56e 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -59,6 +59,16 @@ #define OPA_LINK_WIDTH_RESET_OLD 0x0fff #define OPA_LINK_WIDTH_RESET 0xffff +struct trap_node { + struct list_head list; + struct opa_mad_notice_attr data; + __be64 tid; + int len; + u32 retry; + u8 in_use; + u8 repress; +}; + static int smp_length_check(u32 data_size, u32 request_len) { if (unlikely(request_len < data_size)) @@ -97,28 +107,156 @@ void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port) ib_dispatch_event(&event); } -static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) +/* + * If the port is down, clean up all pending traps. We need to be careful + * with the given trap, because it may be queued. + */ +static void cleanup_traps(struct hfi1_ibport *ibp, struct trap_node *trap) +{ + struct trap_node *node, *q; + unsigned long flags; + struct list_head trap_list; + int i; + + for (i = 0; i < RVT_MAX_TRAP_LISTS; i++) { + spin_lock_irqsave(&ibp->rvp.lock, flags); + list_replace_init(&ibp->rvp.trap_lists[i].list, &trap_list); + ibp->rvp.trap_lists[i].list_len = 0; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + /* + * Remove all items from the list, freeing all the non-given + * traps. + */ + list_for_each_entry_safe(node, q, &trap_list, list) { + list_del(&node->list); + if (node != trap) + kfree(node); + } + } + + /* + * If this wasn't on one of the lists it would not be freed. If it + * was on the list, it is now safe to free. 
+ */ + kfree(trap); +} + +static struct trap_node *check_and_add_trap(struct hfi1_ibport *ibp, + struct trap_node *trap) +{ + struct trap_node *node; + struct trap_list *trap_list; + unsigned long flags; + unsigned long timeout; + int found = 0; + + /* + * Since the retry (handle timeout) does not remove a trap request + * from the list, all we have to do is compare the node. + */ + spin_lock_irqsave(&ibp->rvp.lock, flags); + trap_list = &ibp->rvp.trap_lists[trap->data.generic_type & 0x0F]; + + list_for_each_entry(node, &trap_list->list, list) { + if (node == trap) { + node->retry++; + found = 1; + break; + } + } + + /* If it is not on the list, add it, limited to RVT-MAX_TRAP_LEN. */ + if (!found) { + if (trap_list->list_len < RVT_MAX_TRAP_LEN) { + trap_list->list_len++; + list_add_tail(&trap->list, &trap_list->list); + } else { + pr_warn_ratelimited("hfi1: Maximim trap limit reached for 0x%0x traps\n", + trap->data.generic_type); + kfree(trap); + } + } + + /* + * Next check to see if there is a timer pending. If not, set it up + * and get the first trap from the list. + */ + node = NULL; + if (!timer_pending(&ibp->rvp.trap_timer)) { + /* + * o14-2 + * If the time out is set we have to wait until it expires + * before the trap can be sent. 
+ * This should be > RVT_TRAP_TIMEOUT + */ + timeout = (RVT_TRAP_TIMEOUT * + (1UL << ibp->rvp.subnet_timeout)) / 1000; + mod_timer(&ibp->rvp.trap_timer, + jiffies + usecs_to_jiffies(timeout)); + node = list_first_entry(&trap_list->list, struct trap_node, + list); + node->in_use = 1; + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + return node; +} + +static void subn_handle_opa_trap_repress(struct hfi1_ibport *ibp, + struct opa_smp *smp) +{ + struct trap_list *trap_list; + struct trap_node *trap; + unsigned long flags; + int i; + + if (smp->attr_id != IB_SMP_ATTR_NOTICE) + return; + + spin_lock_irqsave(&ibp->rvp.lock, flags); + for (i = 0; i < RVT_MAX_TRAP_LISTS; i++) { + trap_list = &ibp->rvp.trap_lists[i]; + trap = list_first_entry_or_null(&trap_list->list, + struct trap_node, list); + if (trap && trap->tid == smp->tid) { + if (trap->in_use) { + trap->repress = 1; + } else { + trap_list->list_len--; + list_del(&trap->list); + kfree(trap); + } + break; + } + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); +} + +static void send_trap(struct hfi1_ibport *ibp, struct trap_node *trap) { struct ib_mad_send_buf *send_buf; struct ib_mad_agent *agent; struct opa_smp *smp; - int ret; unsigned long flags; - unsigned long timeout; int pkey_idx; u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp; agent = ibp->rvp.send_agent; - if (!agent) + if (!agent) { + cleanup_traps(ibp, trap); return; + } /* o14-3.2.1 */ - if (driver_lstate(ppd_from_ibp(ibp)) != IB_PORT_ACTIVE) + if (driver_lstate(ppd_from_ibp(ibp)) != IB_PORT_ACTIVE) { + cleanup_traps(ibp, trap); return; + } - /* o14-2 */ - if (ibp->rvp.trap_timeout && time_before(jiffies, - ibp->rvp.trap_timeout)) + /* Add the trap to the list if necessary and see if we can send it */ + trap = check_and_add_trap(ibp, trap); + if (!trap) return; pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); @@ -139,11 +277,21 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) smp->mgmt_class = 
IB_MGMT_CLASS_SUBN_LID_ROUTED; smp->class_version = OPA_SM_CLASS_VERSION; smp->method = IB_MGMT_METHOD_TRAP; - ibp->rvp.tid++; - smp->tid = cpu_to_be64(ibp->rvp.tid); + + /* Only update the transaction ID for new traps (o13-5). */ + if (trap->tid == 0) { + ibp->rvp.tid++; + /* make sure that tid != 0 */ + if (ibp->rvp.tid == 0) + ibp->rvp.tid++; + trap->tid = cpu_to_be64(ibp->rvp.tid); + } + smp->tid = trap->tid; + smp->attr_id = IB_SMP_ATTR_NOTICE; /* o14-1: smp->mkey = 0; */ - memcpy(smp->route.lid.data, data, len); + + memcpy(smp->route.lid.data, &trap->data, trap->len); spin_lock_irqsave(&ibp->rvp.lock, flags); if (!ibp->rvp.sm_ah) { @@ -152,31 +300,72 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid); if (IS_ERR(ah)) { - ret = PTR_ERR(ah); - } else { - send_buf->ah = ah; - ibp->rvp.sm_ah = ibah_to_rvtah(ah); - ret = 0; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + return; } + send_buf->ah = ah; + ibp->rvp.sm_ah = ibah_to_rvtah(ah); } else { - ret = -EINVAL; + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + return; } } else { send_buf->ah = &ibp->rvp.sm_ah->ibah; - ret = 0; } + + /* + * If the trap was repressed while things were getting set up, don't + * bother sending it. This could happen for a retry. + */ + if (trap->repress) { + list_del(&trap->list); + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + kfree(trap); + ib_free_send_mad(send_buf); + return; + } + + trap->in_use = 0; spin_unlock_irqrestore(&ibp->rvp.lock, flags); - if (!ret) - ret = ib_post_send_mad(send_buf, NULL); - if (!ret) { - /* 4.096 usec. 
*/ - timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000; - ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout); - } else { + if (ib_post_send_mad(send_buf, NULL)) ib_free_send_mad(send_buf); - ibp->rvp.trap_timeout = 0; +} + +void hfi1_handle_trap_timer(unsigned long data) +{ + struct hfi1_ibport *ibp = (struct hfi1_ibport *)data; + struct trap_node *trap = NULL; + unsigned long flags; + int i; + + /* Find the trap with the highest priority */ + spin_lock_irqsave(&ibp->rvp.lock, flags); + for (i = 0; !trap && i < RVT_MAX_TRAP_LISTS; i++) { + trap = list_first_entry_or_null(&ibp->rvp.trap_lists[i].list, + struct trap_node, list); } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + if (trap) + send_trap(ibp, trap); +} + +static struct trap_node *create_trap_node(u8 type, __be16 trap_num, u32 lid) +{ + struct trap_node *trap; + + trap = kzalloc(sizeof(*trap), GFP_ATOMIC); + if (!trap) + return NULL; + + INIT_LIST_HEAD(&trap->list); + trap->data.generic_type = type; + trap->data.prod_type_lsb = IB_NOTICE_PROD_CA; + trap->data.trap_num = trap_num; + trap->data.issuer_lid = cpu_to_be32(lid); + + return trap; } /* @@ -185,28 +374,29 @@ static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, u32 qp1, u32 qp2, u16 lid1, u16 lid2) { - struct opa_mad_notice_attr data; + struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; u32 _lid1 = lid1; u32 _lid2 = lid2; - memset(&data, 0, sizeof(data)); ibp->rvp.n_pkt_drops++; ibp->rvp.pkey_violations++; + trap = create_trap_node(IB_NOTICE_TYPE_SECURITY, OPA_TRAP_BAD_P_KEY, + lid); + if (!trap) + return; + /* Send violation trap */ - data.generic_type = IB_NOTICE_TYPE_SECURITY; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_BAD_P_KEY; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_257_258.lid1 = cpu_to_be32(_lid1); - data.ntc_257_258.lid2 = cpu_to_be32(_lid2); - data.ntc_257_258.key = cpu_to_be32(key); - 
data.ntc_257_258.sl = sl << 3; - data.ntc_257_258.qp1 = cpu_to_be32(qp1); - data.ntc_257_258.qp2 = cpu_to_be32(qp2); - - send_trap(ibp, &data, sizeof(data)); + trap->data.ntc_257_258.lid1 = cpu_to_be32(_lid1); + trap->data.ntc_257_258.lid2 = cpu_to_be32(_lid2); + trap->data.ntc_257_258.key = cpu_to_be32(key); + trap->data.ntc_257_258.sl = sl << 3; + trap->data.ntc_257_258.qp1 = cpu_to_be32(qp1); + trap->data.ntc_257_258.qp2 = cpu_to_be32(qp2); + + trap->len = sizeof(trap->data); + send_trap(ibp, trap); } /* @@ -215,34 +405,36 @@ void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt) { - struct opa_mad_notice_attr data; + struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; - memset(&data, 0, sizeof(data)); + trap = create_trap_node(IB_NOTICE_TYPE_SECURITY, OPA_TRAP_BAD_M_KEY, + lid); + if (!trap) + return; + /* Send violation trap */ - data.generic_type = IB_NOTICE_TYPE_SECURITY; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_BAD_M_KEY; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_256.lid = data.issuer_lid; - data.ntc_256.method = mad->method; - data.ntc_256.attr_id = mad->attr_id; - data.ntc_256.attr_mod = mad->attr_mod; - data.ntc_256.mkey = mkey; + trap->data.ntc_256.lid = trap->data.issuer_lid; + trap->data.ntc_256.method = mad->method; + trap->data.ntc_256.attr_id = mad->attr_id; + trap->data.ntc_256.attr_mod = mad->attr_mod; + trap->data.ntc_256.mkey = mkey; if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { - data.ntc_256.dr_slid = dr_slid; - data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; - if (hop_cnt > ARRAY_SIZE(data.ntc_256.dr_rtn_path)) { - data.ntc_256.dr_trunc_hop |= + trap->data.ntc_256.dr_slid = dr_slid; + trap->data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; + if (hop_cnt > ARRAY_SIZE(trap->data.ntc_256.dr_rtn_path)) { + trap->data.ntc_256.dr_trunc_hop |= 
IB_NOTICE_TRAP_DR_TRUNC; - hop_cnt = ARRAY_SIZE(data.ntc_256.dr_rtn_path); + hop_cnt = ARRAY_SIZE(trap->data.ntc_256.dr_rtn_path); } - data.ntc_256.dr_trunc_hop |= hop_cnt; - memcpy(data.ntc_256.dr_rtn_path, return_path, + trap->data.ntc_256.dr_trunc_hop |= hop_cnt; + memcpy(trap->data.ntc_256.dr_rtn_path, return_path, hop_cnt); } - send_trap(ibp, &data, sizeof(data)); + trap->len = sizeof(trap->data); + + send_trap(ibp, trap); } /* @@ -250,23 +442,24 @@ static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, */ void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) { - struct opa_mad_notice_attr data; + struct trap_node *trap; struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); struct hfi1_devdata *dd = dd_from_dev(verbs_dev); struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data; u32 lid = ppd_from_ibp(ibp)->lid; - memset(&data, 0, sizeof(data)); + trap = create_trap_node(IB_NOTICE_TYPE_INFO, + OPA_TRAP_CHANGE_CAPABILITY, + lid); + if (!trap) + return; - data.generic_type = IB_NOTICE_TYPE_INFO; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_CHANGE_CAPABILITY; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_144.lid = data.issuer_lid; - data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); - data.ntc_144.cap_mask3 = cpu_to_be16(ibp->rvp.port_cap3_flags); + trap->data.ntc_144.lid = trap->data.issuer_lid; + trap->data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + trap->data.ntc_144.cap_mask3 = cpu_to_be16(ibp->rvp.port_cap3_flags); - send_trap(ibp, &data, sizeof(data)); + trap->len = sizeof(trap->data); + send_trap(ibp, trap); } /* @@ -274,19 +467,19 @@ void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) */ void hfi1_sys_guid_chg(struct hfi1_ibport *ibp) { - struct opa_mad_notice_attr data; + struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; - memset(&data, 0, sizeof(data)); + trap = create_trap_node(IB_NOTICE_TYPE_INFO, OPA_TRAP_CHANGE_SYSGUID, + lid); + if (!trap) + 
return; - data.generic_type = IB_NOTICE_TYPE_INFO; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_CHANGE_SYSGUID; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid; - data.ntc_145.lid = data.issuer_lid; + trap->data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid; + trap->data.ntc_145.lid = trap->data.issuer_lid; - send_trap(ibp, &data, sizeof(data)); + trap->len = sizeof(trap->data); + send_trap(ibp, trap); } /* @@ -294,20 +487,21 @@ void hfi1_sys_guid_chg(struct hfi1_ibport *ibp) */ void hfi1_node_desc_chg(struct hfi1_ibport *ibp) { - struct opa_mad_notice_attr data; + struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; - memset(&data, 0, sizeof(data)); + trap = create_trap_node(IB_NOTICE_TYPE_INFO, + OPA_TRAP_CHANGE_CAPABILITY, + lid); + if (!trap) + return; - data.generic_type = IB_NOTICE_TYPE_INFO; - data.prod_type_lsb = IB_NOTICE_PROD_CA; - data.trap_num = OPA_TRAP_CHANGE_CAPABILITY; - data.issuer_lid = cpu_to_be32(lid); - data.ntc_144.lid = data.issuer_lid; - data.ntc_144.change_flags = + trap->data.ntc_144.lid = trap->data.issuer_lid; + trap->data.ntc_144.change_flags = cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG); - send_trap(ibp, &data, sizeof(data)); + trap->len = sizeof(trap->data); + send_trap(ibp, trap); } static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am, @@ -4144,6 +4338,11 @@ static int process_subn_opa(struct ib_device *ibdev, int mad_flags, */ ret = IB_MAD_RESULT_SUCCESS; break; + case IB_MGMT_METHOD_TRAP_REPRESS: + subn_handle_opa_trap_repress(ibp, smp); + /* Always successful */ + ret = IB_MAD_RESULT_SUCCESS; + break; default: smp->status |= IB_SMP_UNSUP_METHOD; ret = reply((struct ib_mad_hdr *)smp); diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h index a4e2506bd5ca..4c1245072093 100644 --- a/drivers/infiniband/hw/hfi1/mad.h +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015, 2016 Intel Corporation. 
+ * Copyright(c) 2015 - 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -428,5 +428,6 @@ struct sc2vlnt { COUNTER_MASK(1, 4)) void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port); +void hfi1_handle_trap_timer(unsigned long data); #endif /* _HFI1_MAD_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 3ef6384eae40..dc51bf247006 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1535,6 +1535,11 @@ static void init_ibport(struct hfi1_pportdata *ppd) ibp->sc_to_sl[i] = i; } + for (i = 0; i < RVT_MAX_TRAP_LISTS ; i++) + INIT_LIST_HEAD(&ibp->rvp.trap_lists[i].list); + setup_timer(&ibp->rvp.trap_timer, hfi1_handle_trap_timer, + (unsigned long)ibp); + spin_lock_init(&ibp->rvp.lock); /* Set the prefix to the default value (see ch. 4.1.1) */ ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 22fb15ff5e8b..fdfac0fd2f82 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -57,11 +57,21 @@ #include #include #include +#include #include #include #define RVT_MAX_PKEY_VALUES 16 +#define RVT_MAX_TRAP_LEN 100 /* Limit pending trap list */ +#define RVT_MAX_TRAP_LISTS ((IB_NOTICE_TYPE_INFO & 0x0F) + 1) +#define RVT_TRAP_TIMEOUT 4096 /* 4.096 usec */ + +struct trap_list { + u32 list_len; + struct list_head list; +}; + struct rvt_ibport { struct rvt_qp __rcu *qp[2]; struct ib_mad_agent *send_agent; /* agent for SMI (traps) */ @@ -128,6 +138,13 @@ struct rvt_ibport { u16 *pkey_table; struct rvt_ah *sm_ah; + + /* + * Keep a list of traps that have not been repressed. They will be + * resent based on trap_timer. 
+ */ + struct trap_list trap_lists[RVT_MAX_TRAP_LISTS]; + struct timer_list trap_timer; }; #define RVT_CQN_MAX 16 /* maximum length of cq name */ -- cgit v1.2.3 From d541e45500bd269060c26387902e1bec9783c07c Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:37:43 -0400 Subject: IB/core: Convert ah_attr from OPA to IB when copying to user OPA address handle attributes that have 32-bit LIDs must be converted to IB address handle attributes, with the LID field programmed in the GID, before being copied to user space. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Don Hiatt Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/core/ucm.c | 2 +- drivers/infiniband/core/ucma.c | 10 ++++--- drivers/infiniband/core/uverbs_marshall.c | 48 +++++++++++++++++++++++++++---- include/rdma/ib_marshall.h | 6 ++-- 4 files changed, 54 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index 112099c86a19..f2a7f62c2834 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -618,7 +618,7 @@ static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file, if (result) goto out; - ib_copy_qp_attr_to_user(&resp, &qp_attr); + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 276f0ef835bd..eb85b546e223 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -248,14 +248,15 @@ static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, dst->qp_num = src->qp_num; } -static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst, +static void ucma_copy_ud_event(struct ib_device *device, + struct rdma_ucm_ud_param *dst, struct rdma_ud_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, 
src->private_data_len); dst->private_data_len = src->private_data_len; - ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); dst->qp_num = src->qp_num; dst->qkey = src->qkey; } @@ -335,7 +336,8 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, uevent->resp.event = event->event; uevent->resp.status = event->status; if (cm_id->qp_type == IB_QPT_UD) - ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud); + ucma_copy_ud_event(cm_id->device, &uevent->resp.param.ud, + &event->param.ud); else ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); @@ -1157,7 +1159,7 @@ static ssize_t ucma_init_qp_attr(struct ucma_file *file, if (ret) goto out; - ib_copy_qp_attr_to_user(&resp, &qp_attr); + ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user((void __user *)(unsigned long)cmd.response, &resp, sizeof(resp))) ret = -EFAULT; diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index 94fd989c9060..bd0acf376af0 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -33,10 +33,47 @@ #include #include -void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, - struct rdma_ah_attr *src) +#define OPA_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) +static int rdma_ah_conv_opa_to_ib(struct ib_device *dev, + struct rdma_ah_attr *ib, + struct rdma_ah_attr *opa) { + struct ib_port_attr port_attr; + int ret = 0; + + /* Do structure copy and the over-write fields */ + *ib = *opa; + + ib->type = RDMA_AH_ATTR_TYPE_IB; + rdma_ah_set_grh(ib, NULL, 0, 0, 1, 0); + + if (ib_query_port(dev, opa->port_num, &port_attr)) { + /* Set to default subnet to indicate error */ + rdma_ah_set_subnet_prefix(ib, OPA_DEFAULT_GID_PREFIX); + ret = -EINVAL; + } else { + rdma_ah_set_subnet_prefix(ib, + cpu_to_be64(port_attr.subnet_prefix)); + } + rdma_ah_set_interface_id(ib, 
OPA_MAKE_ID(rdma_ah_get_dlid(opa))); + return ret; +} + +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, + struct rdma_ah_attr *ah_attr) +{ + struct rdma_ah_attr *src = ah_attr; + struct rdma_ah_attr conv_ah; + memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved)); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (!rdma_ah_conv_opa_to_ib(device, &conv_ah, ah_attr))) + src = &conv_ah; + dst->dlid = rdma_ah_get_dlid(src); dst->sl = rdma_ah_get_sl(src); dst->src_path_bits = rdma_ah_get_path_bits(src); @@ -57,7 +94,8 @@ void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, } EXPORT_SYMBOL(ib_copy_ah_attr_to_user); -void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src) { dst->qp_state = src->qp_state; @@ -76,8 +114,8 @@ void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, dst->max_recv_sge = src->cap.max_recv_sge; dst->max_inline_data = src->cap.max_inline_data; - ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr); - ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr); + ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); + ib_copy_ah_attr_to_user(device, &dst->alt_ah_attr, &src->alt_ah_attr); dst->pkey_index = src->pkey_index; dst->alt_pkey_index = src->alt_pkey_index; diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h index 68cef3bd50fb..8ebf84ae9ed1 100644 --- a/include/rdma/ib_marshall.h +++ b/include/rdma/ib_marshall.h @@ -38,10 +38,12 @@ #include #include -void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, +void ib_copy_qp_attr_to_user(struct ib_device *device, + struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src); -void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, +void ib_copy_ah_attr_to_user(struct ib_device *device, + struct ib_uverbs_ah_attr *dst, 
struct rdma_ah_attr *src); void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, -- cgit v1.2.3 From 582faf3150f57b8364ac9d2aa731d7368ada7a4b Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:37:47 -0400 Subject: IB/core: Change port_attr.lid size from 16 to 32 bits lid field in struct ib_port_attr is increased to 32 bits. This enables core components to use larger LIDs if needed. The user ABI is unchanged and return 16 bit values when queried. Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Doug Ledford --- drivers/infiniband/core/core_priv.h | 1 + drivers/infiniband/core/uverbs_cmd.c | 5 ++++- drivers/infiniband/hw/mlx4/alias_GUID.c | 2 +- drivers/infiniband/hw/mlx4/mad.c | 2 +- drivers/infiniband/hw/mthca/mthca_mad.c | 2 +- include/rdma/ib_verbs.h | 2 +- include/rdma/opa_addr.h | 3 ++- 7 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 11ae67514e13..6b54280530c9 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -38,6 +38,7 @@ #include #include +#include #include #include "mad_priv.h" diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2c98533a0203..eef2623406cc 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -275,8 +275,11 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; - resp.lid = attr.lid; resp.sm_lid = attr.sm_lid; + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) + resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); + else + resp.lid = (u16)attr.lid; resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; resp.sm_sl = attr.sm_sl; diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c 
b/drivers/infiniband/hw/mlx4/alias_GUID.c index ea24230ea0d4..5a897b0106a9 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -528,7 +528,7 @@ static int set_guid_rec(struct ib_device *ibdev, memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); - guid_info_rec.lid = cpu_to_be16(attr.lid); + guid_info_rec.lid = cpu_to_be16((u16)attr.lid); guid_info_rec.block_num = index; memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 21d31cb1325f..00f057033cb9 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -860,7 +860,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) - prev_lid = pattr.lid; + prev_lid = (u16)pattr.lid; err = mlx4_MAD_IFC(to_mdev(ibdev), (mad_flags & IB_MAD_IGNORE_MKEY ? 
MLX4_MAD_IFC_IGNORE_MKEY : 0) | diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 7df3db71777a..617531f1bfc6 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -256,7 +256,7 @@ int mthca_process_mad(struct ib_device *ibdev, in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) - prev_lid = pattr.lid; + prev_lid = (u16)pattr.lid; err = mthca_MAD_IFC(to_mdev(ibdev), mad_flags & IB_MAD_IGNORE_MKEY, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b5732432bb29..4fa94e69b1fc 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -549,8 +549,8 @@ struct ib_port_attr { u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; - u16 lid; u16 sm_lid; + u32 lid; u8 lmc; u8 max_vl_num; u8 sm_sl; diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index eace28f1555d..46d0567fffea 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -50,7 +50,8 @@ #define OPA_SPECIAL_OUI (0x00066AULL) #define OPA_MAKE_ID(x) (cpu_to_be64(OPA_SPECIAL_OUI << 40 | (x))) - +#define OPA_TO_IB_UCAST_LID(x) (((x) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) \ + ? 0 : x) /** * ib_is_opa_gid: Returns true if the top 24 bits of the gid * contains the OPA_STL_OUI identifier. This identifies that -- cgit v1.2.3 From db58540b021a17e0ede64f761b740556d77f1679 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 8 Jun 2017 13:37:48 -0400 Subject: IB/core: Change port_attr.sm_lid from 16 to 32 bits sm_lid field in struct ib_port_attr is increased to 32 bits. This enables core components to use larger LIDs if needed. The user ABI is unchanged and return 16 bit values when queried. 
Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 8 +++++--- include/rdma/ib_verbs.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index eef2623406cc..01e2ff023980 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -275,11 +275,13 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; - resp.sm_lid = attr.sm_lid; - if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) { resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); - else + resp.sm_lid = OPA_TO_IB_UCAST_LID(attr.sm_lid); + } else { resp.lid = (u16)attr.lid; + resp.sm_lid = (u16)attr.sm_lid; + } resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; resp.sm_sl = attr.sm_sl; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4fa94e69b1fc..620535908118 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -549,7 +549,7 @@ struct ib_port_attr { u32 bad_pkey_cntr; u32 qkey_viol_cntr; u16 pkey_tbl_len; - u16 sm_lid; + u32 sm_lid; u32 lid; u8 lmc; u8 max_vl_num; -- cgit v1.2.3 From 7db20ecd1d9700e2c240dee505162eb56ab55b5b Mon Sep 17 00:00:00 2001 From: "Hiatt, Don" Date: Thu, 8 Jun 2017 13:37:49 -0400 Subject: IB/core: Change wc.slid from 16 to 32 bits slid field in struct ib_wc is increased to 32 bits. This enables core components to use larger LIDs if needed. The user ABI is unchanged and return 16 bit values when queried. 
Signed-off-by: Dasaratharaman Chandramouli Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 4 ++-- drivers/infiniband/core/user_mad.c | 2 +- drivers/infiniband/core/uverbs_cmd.c | 10 +++++++--- drivers/infiniband/hw/hfi1/mad.c | 2 +- drivers/infiniband/hw/mlx4/mad.c | 6 +++--- drivers/infiniband/hw/mlx5/mad.c | 2 +- drivers/infiniband/hw/mthca/mthca_cmd.c | 4 ++-- drivers/infiniband/hw/mthca/mthca_mad.c | 2 +- drivers/infiniband/sw/rdmavt/cq.c | 2 +- include/rdma/ib_verbs.h | 14 +++++++++++++- 10 files changed, 32 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 2b4d613a3474..b39ee16aa479 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1703,7 +1703,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) { if (!cm_req_get_primary_subnet_local(req_msg)) { if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { - req_msg->primary_local_lid = cpu_to_be16(wc->slid); + req_msg->primary_local_lid = ib_slid_be16(wc->slid); cm_req_set_primary_sl(req_msg, wc->sl); } @@ -1713,7 +1713,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) if (!cm_req_get_alt_subnet_local(req_msg)) { if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { - req_msg->alt_local_lid = cpu_to_be16(wc->slid); + req_msg->alt_local_lid = ib_slid_be16(wc->slid); cm_req_set_alt_sl(req_msg, wc->sl); } diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 36a6f5c8914c..ff3c67a7aaad 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -229,7 +229,7 @@ static void recv_handler(struct ib_mad_agent *agent, packet->mad.hdr.status = 0; packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); - packet->mad.hdr.lid = 
cpu_to_be16(mad_recv_wc->wc->slid); + packet->mad.hdr.lid = ib_slid_be16(mad_recv_wc->wc->slid); packet->mad.hdr.sl = mad_recv_wc->wc->sl; packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 01e2ff023980..eb0da3784bf4 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1190,7 +1190,8 @@ out: return ret ? ret : in_len; } -static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) +static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, + struct ib_wc *wc) { struct ib_uverbs_wc tmp; @@ -1204,7 +1205,10 @@ static int copy_wc_to_user(void __user *dest, struct ib_wc *wc) tmp.src_qp = wc->src_qp; tmp.wc_flags = wc->wc_flags; tmp.pkey_index = wc->pkey_index; - tmp.slid = wc->slid; + if (rdma_cap_opa_ah(ib_dev, wc->port_num)) + tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid); + else + tmp.slid = ib_slid_cpu16(wc->slid); tmp.sl = wc->sl; tmp.dlid_path_bits = wc->dlid_path_bits; tmp.port_num = wc->port_num; @@ -1248,7 +1252,7 @@ ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file, if (!ret) break; - ret = copy_wc_to_user(data_ptr, &wc); + ret = copy_wc_to_user(ib_dev, data_ptr, &wc); if (ret) goto out_put; diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 5977673a52d4..00ebc26cd187 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -3958,7 +3958,7 @@ static int opa_local_smp_check(struct hfi1_ibport *ibp, const struct ib_wc *in_wc) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u16 slid = in_wc->slid; + u16 slid = ib_slid_cpu16(in_wc->slid); u16 pkey; if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys)) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 00f057033cb9..04fb44e7699e 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ 
b/drivers/infiniband/hw/mlx4/mad.c @@ -169,7 +169,7 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, op_modifier |= 0x4; - in_modifier |= in_wc->slid << 16; + in_modifier |= ib_slid_cpu16(in_wc->slid) << 16; } err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, @@ -625,7 +625,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); } else { tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); - tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid); + tun_mad->hdr.slid_mac_47_32 = ib_slid_be16(wc->slid); } ib_dma_sync_single_for_device(&dev->ib_dev, @@ -826,7 +826,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, } } - slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { forward_trap(to_mdev(ibdev), port_num, in_mad); diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 95db929bdc34..cd2264ac88ae 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -78,7 +78,7 @@ static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, u16 slid; int err; - slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index 9d83a53c0c67..e19ae0b9b439 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -1921,7 +1921,7 @@ int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, (in_wc->wc_flags & IB_WC_GRH ? 
0x80 : 0); MTHCA_PUT(inbox, val, MAD_IFC_G_PATH_OFFSET); - MTHCA_PUT(inbox, in_wc->slid, MAD_IFC_RLID_OFFSET); + MTHCA_PUT(inbox, ib_slid_cpu16(in_wc->slid), MAD_IFC_RLID_OFFSET); MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET); if (in_grh) @@ -1929,7 +1929,7 @@ int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, op_modifier |= 0x4; - in_modifier |= in_wc->slid << 16; + in_modifier |= ib_slid_cpu16(in_wc->slid) << 16; } err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma, diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 617531f1bfc6..a9caadab22cf 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -205,7 +205,7 @@ int mthca_process_mad(struct ib_device *ibdev, u16 *out_mad_pkey_index) { int err; - u16 slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE); + u16 slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); u16 prev_lid = 0; struct ib_port_attr pattr; const struct ib_mad *in_mad = (const struct ib_mad *)in; diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 0ae2ff8cf81e..0335a3df74d5 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -107,7 +107,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) wc->uqueue[head].src_qp = entry->src_qp; wc->uqueue[head].wc_flags = entry->wc_flags; wc->uqueue[head].pkey_index = entry->pkey_index; - wc->uqueue[head].slid = entry->slid; + wc->uqueue[head].slid = ib_slid_cpu16(entry->slid); wc->uqueue[head].sl = entry->sl; wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; wc->uqueue[head].port_num = entry->port_num; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 620535908118..7eaf7d2ab424 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -948,7 +948,7 @@ struct ib_wc { u32 src_qp; int wc_flags; u16 pkey_index; - 
u16 slid; + u32 slid; u8 sl; u8 dlid_path_bits; u8 port_num; /* valid only for DR SMPs on switches */ @@ -3706,4 +3706,16 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, else return RDMA_AH_ATTR_TYPE_IB; } + +/* Return slid in 16bit CPU encoding */ +static inline u16 ib_slid_cpu16(u32 slid) +{ + return (u16)slid; +} + +/* Return slid in 16bit BE encoding */ +static inline u16 ib_slid_be16(u32 slid) +{ + return cpu_to_be16((u16)slid); +} #endif /* IB_VERBS_H */ -- cgit v1.2.3 From e92aa00a518971fca6b79aa87a1a9c5e5aa51f3b Mon Sep 17 00:00:00 2001 From: "Hiatt, Don" Date: Thu, 8 Jun 2017 13:38:02 -0400 Subject: IB/CM: Add OPA Path record support to CM Add OPA path record support to the Connection Manager. Signed-off-by: Don Hiatt Reviewed-by: Ira Weiny Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 50 +++++++++++++++++++++++++++++++++++++------- include/rdma/opa_addr.h | 18 ++++++++++++++++ 2 files changed, 60 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index b39ee16aa479..885c429b4942 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1175,6 +1175,11 @@ static void cm_format_req(struct cm_req_msg *req_msg, { struct sa_path_rec *pri_path = param->primary_path; struct sa_path_rec *alt_path = param->alternate_path; + bool pri_ext = false; + + if (pri_path->rec_type == SA_PATH_REC_TYPE_OPA) + pri_ext = opa_is_extended_lid(pri_path->opa.dlid, + pri_path->opa.slid); cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ)); @@ -1202,18 +1207,24 @@ static void cm_format_req(struct cm_req_msg *req_msg, cm_req_set_srq(req_msg, param->srq); } + req_msg->primary_local_gid = pri_path->sgid; + req_msg->primary_remote_gid = pri_path->dgid; + if (pri_ext) { + req_msg->primary_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.slid)); + 
req_msg->primary_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(pri_path->opa.dlid)); + } if (pri_path->hop_limit <= 1) { - req_msg->primary_local_lid = + req_msg->primary_local_lid = pri_ext ? 0 : htons(ntohl(sa_path_get_slid(pri_path))); - req_msg->primary_remote_lid = + req_msg->primary_remote_lid = pri_ext ? 0 : htons(ntohl(sa_path_get_dlid(pri_path))); } else { /* Work-around until there's a way to obtain remote LID info */ req_msg->primary_local_lid = IB_LID_PERMISSIVE; req_msg->primary_remote_lid = IB_LID_PERMISSIVE; } - req_msg->primary_local_gid = pri_path->sgid; - req_msg->primary_remote_gid = pri_path->dgid; cm_req_set_primary_flow_label(req_msg, pri_path->flow_label); cm_req_set_primary_packet_rate(req_msg, pri_path->rate); req_msg->primary_traffic_class = pri_path->traffic_class; @@ -1225,17 +1236,29 @@ static void cm_format_req(struct cm_req_msg *req_msg, pri_path->packet_life_time)); if (alt_path) { + bool alt_ext = false; + + if (alt_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alt_path->opa.dlid, + alt_path->opa.slid); + + req_msg->alt_local_gid = alt_path->sgid; + req_msg->alt_remote_gid = alt_path->dgid; + if (alt_ext) { + req_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.slid)); + req_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alt_path->opa.dlid)); + } if (alt_path->hop_limit <= 1) { - req_msg->alt_local_lid = + req_msg->alt_local_lid = alt_ext ? 0 : htons(ntohl(sa_path_get_slid(alt_path))); - req_msg->alt_remote_lid = + req_msg->alt_remote_lid = alt_ext ? 
0 : htons(ntohl(sa_path_get_dlid(alt_path))); } else { req_msg->alt_local_lid = IB_LID_PERMISSIVE; req_msg->alt_remote_lid = IB_LID_PERMISSIVE; } - req_msg->alt_local_gid = alt_path->sgid; - req_msg->alt_remote_gid = alt_path->dgid; cm_req_set_alt_flow_label(req_msg, alt_path->flow_label); cm_req_set_alt_packet_rate(req_msg, alt_path->rate); @@ -2843,6 +2866,11 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, const void *private_data, u8 private_data_len) { + bool alt_ext = false; + + if (alternate_path->rec_type == SA_PATH_REC_TYPE_OPA) + alt_ext = opa_is_extended_lid(alternate_path->opa.dlid, + alternate_path->opa.slid); cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID, cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP)); lap_msg->local_comm_id = cm_id_priv->id.local_id; @@ -2856,6 +2884,12 @@ static void cm_format_lap(struct cm_lap_msg *lap_msg, htons(ntohl(sa_path_get_dlid(alternate_path))); lap_msg->alt_local_gid = alternate_path->sgid; lap_msg->alt_remote_gid = alternate_path->dgid; + if (alt_ext) { + lap_msg->alt_local_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.slid)); + lap_msg->alt_remote_gid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(alternate_path->opa.dlid)); + } cm_lap_set_flow_label(lap_msg, alternate_path->flow_label); cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class); lap_msg->alt_hop_limit = alternate_path->hop_limit; diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index 46d0567fffea..9b5e642cf550 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -77,4 +77,22 @@ static inline u32 opa_get_lid_from_gid(union ib_gid *gid) { return be64_to_cpu(gid->global.interface_id) & 0xFFFFFFFF; } + +/** + * opa_is_extended_lid: Returns true if dlid or slid are + * extended. 
+ * + * @dlid: The DLID + * @slid: The SLID + */ +static inline bool opa_is_extended_lid(u32 dlid, u32 slid) +{ + if ((be32_to_cpu(dlid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) || + (be32_to_cpu(slid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE))) + return true; + else + return false; +} #endif /* OPA_ADDR_H */ -- cgit v1.2.3 From 78249c4215840edb95447ec6867b69a7ac1d7a0d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:38 +0300 Subject: mlx5: convert to generic pci_alloc_irq_vectors Now that we have generic code to allocate an array of irq vectors and even correctly spread their affinity, correctly handle cpu hotplug events and more, we're much better off using it. Reviewed-by: Christoph Hellwig Acked-by: Leon Romanovsky Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 9 ++--- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/health.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/main.c | 39 +++++++++------------- .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 1 - include/linux/mlx5/driver.h | 1 - 7 files changed, 20 insertions(+), 36 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 1eac5003084f..d0e572df3a1b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -397,7 +397,7 @@ static void mlx5e_enable_async_events(struct mlx5e_priv *priv) static void mlx5e_disable_async_events(struct mlx5e_priv *priv) { clear_bit(MLX5E_STATE_ASYNC_EVENTS_ENABLED, &priv->state); - synchronize_irq(mlx5_get_msix_vec(priv->mdev, MLX5_EQ_VEC_ASYNC)); + synchronize_irq(pci_irq_vector(priv->mdev->pdev, MLX5_EQ_VEC_ASYNC)); } static inline int mlx5e_get_wqe_mtt_sz(void) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index af51a5d2b912..8a09d7197d70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -585,7 +585,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, name, pci_name(dev->pdev)); eq->eqn = MLX5_GET(create_eq_out, out, eq_number); - eq->irqn = priv->msix_arr[vecidx].vector; + eq->irqn = pci_irq_vector(dev->pdev, vecidx); eq->dev = dev; eq->doorbell = priv->uar->map + MLX5_EQ_DOORBEL_OFFSET; err = request_irq(eq->irqn, handler, 0, @@ -620,7 +620,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, return 0; err_irq: - free_irq(priv->msix_arr[vecidx].vector, eq); + free_irq(eq->irqn, eq); err_eq: mlx5_cmd_destroy_eq(dev, eq->eqn); @@ -661,11 +661,6 @@ int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq) } EXPORT_SYMBOL_GPL(mlx5_destroy_unmap_eq); -u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx) -{ - return dev->priv.msix_arr[MLX5_EQ_VEC_ASYNC].vector; -} - int mlx5_eq_init(struct mlx5_core_dev *dev) { int err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 89bfda419efe..1ce2543e3889 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1585,7 +1585,7 @@ static void esw_disable_vport(struct mlx5_eswitch *esw, int vport_num) /* Mark this vport as disabled to discard new events */ vport->enabled = false; - synchronize_irq(mlx5_get_msix_vec(esw->dev, MLX5_EQ_VEC_ASYNC)); + synchronize_irq(pci_irq_vector(esw->dev->pdev, MLX5_EQ_VEC_ASYNC)); /* Wait for current already scheduled events to complete */ flush_workqueue(esw->work_queue); /* Disable events from this vport */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 4b6b03d6297f..8aea0a065e56 100644 --- 
a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -81,7 +81,7 @@ static void trigger_cmd_completions(struct mlx5_core_dev *dev) u64 vector; /* wait for pending handlers to complete */ - synchronize_irq(dev->priv.msix_arr[MLX5_EQ_VEC_CMD].vector); + synchronize_irq(pci_irq_vector(dev->pdev, MLX5_EQ_VEC_CMD)); spin_lock_irqsave(&dev->cmd.alloc_lock, flags); vector = ~dev->cmd.bitmask & ((1ul << (1 << dev->cmd.log_sz)) - 1); if (!vector) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index c065132b956d..d2fd55e5c68b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -312,13 +312,12 @@ static void release_bar(struct pci_dev *pdev) pci_release_regions(pdev); } -static int mlx5_enable_msix(struct mlx5_core_dev *dev) +static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; struct mlx5_eq_table *table = &priv->eq_table; int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); int nvec; - int i; nvec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE; @@ -326,17 +325,13 @@ static int mlx5_enable_msix(struct mlx5_core_dev *dev) if (nvec <= MLX5_EQ_VEC_COMP_BASE) return -ENOMEM; - priv->msix_arr = kcalloc(nvec, sizeof(*priv->msix_arr), GFP_KERNEL); - priv->irq_info = kcalloc(nvec, sizeof(*priv->irq_info), GFP_KERNEL); - if (!priv->msix_arr || !priv->irq_info) + if (!priv->irq_info) goto err_free_msix; - for (i = 0; i < nvec; i++) - priv->msix_arr[i].entry = i; - - nvec = pci_enable_msix_range(dev->pdev, priv->msix_arr, - MLX5_EQ_VEC_COMP_BASE + 1, nvec); + nvec = pci_alloc_irq_vectors(dev->pdev, + MLX5_EQ_VEC_COMP_BASE + 1, nvec, + PCI_IRQ_MSIX); if (nvec < 0) return nvec; @@ -346,17 +341,15 @@ static int mlx5_enable_msix(struct mlx5_core_dev *dev) err_free_msix: kfree(priv->irq_info); - kfree(priv->msix_arr); return -ENOMEM; } 
-static void mlx5_disable_msix(struct mlx5_core_dev *dev) +static void mlx5_free_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; - pci_disable_msix(dev->pdev); + pci_free_irq_vectors(dev->pdev); kfree(priv->irq_info); - kfree(priv->msix_arr); } struct mlx5_reg_host_endianness { @@ -615,8 +608,7 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev) static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) { struct mlx5_priv *priv = &mdev->priv; - struct msix_entry *msix = priv->msix_arr; - int irq = msix[i + MLX5_EQ_VEC_COMP_BASE].vector; + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { mlx5_core_warn(mdev, "zalloc_cpumask_var failed"); @@ -636,8 +628,7 @@ static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i) { struct mlx5_priv *priv = &mdev->priv; - struct msix_entry *msix = priv->msix_arr; - int irq = msix[i + MLX5_EQ_VEC_COMP_BASE].vector; + int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); irq_set_affinity_hint(irq, NULL); free_cpumask_var(priv->irq_info[i].mask); @@ -760,8 +751,8 @@ static int alloc_comp_eqs(struct mlx5_core_dev *dev) } #ifdef CONFIG_RFS_ACCEL - irq_cpu_rmap_add(dev->rmap, - dev->priv.msix_arr[i + MLX5_EQ_VEC_COMP_BASE].vector); + irq_cpu_rmap_add(dev->rmap, pci_irq_vector(dev->pdev, + MLX5_EQ_VEC_COMP_BASE + i)); #endif snprintf(name, MLX5_MAX_IRQ_NAME, "mlx5_comp%d", i); err = mlx5_create_map_eq(dev, eq, @@ -1119,9 +1110,9 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_stop_poll; } - err = mlx5_enable_msix(dev); + err = mlx5_alloc_irq_vectors(dev); if (err) { - dev_err(&pdev->dev, "enable msix failed\n"); + dev_err(&pdev->dev, "alloc irq vectors failed\n"); goto err_cleanup_once; } @@ -1220,7 +1211,7 @@ err_put_uars: mlx5_put_uars_page(dev, priv->uar); 
err_disable_msix: - mlx5_disable_msix(dev); + mlx5_free_irq_vectors(dev); err_cleanup_once: if (boot) @@ -1287,7 +1278,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, free_comp_eqs(dev); mlx5_stop_eqs(dev); mlx5_put_uars_page(dev, priv->uar); - mlx5_disable_msix(dev); + mlx5_free_irq_vectors(dev); if (cleanup) mlx5_cleanup_once(dev); mlx5_stop_health_poll(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 6a3d6bef7dd4..ba1d494b016d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -110,7 +110,6 @@ int mlx5_destroy_scheduling_element_cmd(struct mlx5_core_dev *dev, u8 hierarchy, u32 element_id); int mlx5_wait_for_vf_pages(struct mlx5_core_dev *dev); u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev); -u32 mlx5_get_msix_vec(struct mlx5_core_dev *dev, int vecidx); struct mlx5_eq *mlx5_eqn2eq(struct mlx5_core_dev *dev, int eqn); void mlx5_cq_tasklet_cb(unsigned long data); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index df6ce59a1f95..5bac7f53b4f9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -597,7 +597,6 @@ struct mlx5_port_module_event_stats { struct mlx5_priv { char name[MLX5_MAX_NAME_LEN]; struct mlx5_eq_table eq_table; - struct msix_entry *msix_arr; struct mlx5_irq_info *irq_info; /* pages stuff */ -- cgit v1.2.3 From a435393acafbf0ecff4deb3e3cb554b34f0d0664 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:40 +0300 Subject: mlx5: move affinity hints assignments to generic code The generic API takes care of spreading affinity similar to what mlx5 open coded (and even handles asymmetric configurations better). Ask the generic API to spread affinity for us, and feed it pre_vectors that do not participate in affinity settings (which is an improvement to what we had before).
The affinity assignments should match what mlx5 tried to do earlier but now we do not set affinity to async, cmd and pages dedicated vectors. Also, remove mlx5e_get_cpu and introduce mlx5e_get_node (used for allocation purposes) and mlx5_get_vector_affinity (for indirection table construction) as they provide the needed information. Luckily, we have generic helpers to get cpumask and node given a irq vector. mlx5_get_vector_affinity will be used by mlx5_ib in a subsequent patch. Reviewed-by: Christoph Hellwig Acked-by: Leon Romanovsky Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/en.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 45 +++++++------- drivers/net/ethernet/mellanox/mlx5/core/main.c | 75 ++--------------------- include/linux/mlx5/driver.h | 7 ++- 4 files changed, 35 insertions(+), 93 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index e1b7ddfecd01..909123243a85 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -587,7 +587,6 @@ struct mlx5e_channel { struct mlx5_core_dev *mdev; struct mlx5e_tstamp *tstamp; int ix; - int cpu; }; struct mlx5e_channels { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 2c4e41833e55..fb647561c592 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -71,6 +71,11 @@ struct mlx5e_channel_param { struct mlx5e_cq_param icosq_cq; }; +static int mlx5e_get_node(struct mlx5e_priv *priv, int ix) +{ + return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix); +} + static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev) { return MLX5_CAP_GEN(mdev, striding_rq) && @@ -444,16 +449,17 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq, int 
wq_sz = mlx5_wq_ll_get_size(&rq->wq); int mtt_sz = mlx5e_get_wqe_mtt_sz(); int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1; + int node = mlx5e_get_node(c->priv, c->ix); int i; rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info), - GFP_KERNEL, cpu_to_node(c->cpu)); + GFP_KERNEL, node); if (!rq->mpwqe.info) goto err_out; /* We allocate more than mtt_sz as we will align the pointer */ - rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL, - cpu_to_node(c->cpu)); + rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, + GFP_KERNEL, node); if (unlikely(!rq->mpwqe.mtt_no_align)) goto err_free_wqe_info; @@ -561,7 +567,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, int err; int i; - rqp->wq.db_numa_node = cpu_to_node(c->cpu); + rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq, &rq->wq_ctrl); @@ -628,7 +634,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c, default: /* MLX5_WQ_TYPE_LINKED_LIST */ rq->wqe.frag_info = kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info), - GFP_KERNEL, cpu_to_node(c->cpu)); + GFP_KERNEL, + mlx5e_get_node(c->priv, c->ix)); if (!rq->wqe.frag_info) { err = -ENOMEM; goto err_rq_wq_destroy; @@ -993,13 +1000,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c, sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu)); + err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix)); if (err) goto err_sq_wq_destroy; @@ -1047,13 +1054,13 @@ static int mlx5e_alloc_icosq(struct mlx5e_channel *c, sq->channel = c; sq->uar_map = mdev->mlx5e_res.bfreg.map; - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node =
mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu)); + err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix)); if (err) goto err_sq_wq_destroy; @@ -1119,13 +1126,13 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, if (MLX5_IPSEC_DEV(c->priv->mdev)) set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state); - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl); if (err) return err; sq->wq.db = &sq->wq.db[MLX5_SND_DBR]; - err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu)); + err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix)); if (err) goto err_sq_wq_destroy; @@ -1497,8 +1504,8 @@ static int mlx5e_alloc_cq(struct mlx5e_channel *c, struct mlx5_core_dev *mdev = c->priv->mdev; int err; - param->wq.buf_numa_node = cpu_to_node(c->cpu); - param->wq.db_numa_node = cpu_to_node(c->cpu); + param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix); + param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix); param->eq_ix = c->ix; err = mlx5e_alloc_cq_common(mdev, param, cq); @@ -1597,11 +1604,6 @@ static void mlx5e_close_cq(struct mlx5e_cq *cq) mlx5e_free_cq(cq); } -static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix) -{ - return cpumask_first(priv->mdev->priv.irq_info[ix].mask); -} - static int mlx5e_open_tx_cqs(struct mlx5e_channel *c, struct mlx5e_params *params, struct mlx5e_channel_param *cparam) @@ -1750,11 +1752,10 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, { struct mlx5e_cq_moder icocq_moder = {0, 0}; struct net_device *netdev = priv->netdev; - int cpu = mlx5e_get_cpu(priv, ix); struct mlx5e_channel *c; int err; - c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu)); + c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix)); if (!c)
return -ENOMEM; @@ -1762,7 +1763,6 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->mdev = priv->mdev; c->tstamp = &priv->tstamp; c->ix = ix; - c->cpu = cpu; c->pdev = &priv->mdev->pdev->dev; c->netdev = priv->netdev; c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); @@ -1848,7 +1848,8 @@ static void mlx5e_activate_channel(struct mlx5e_channel *c) for (tc = 0; tc < c->num_tc; tc++) mlx5e_activate_txqsq(&c->sq[tc]); mlx5e_activate_rq(&c->rq); - netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix); + netif_set_xps_queue(c->netdev, + mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix); } static void mlx5e_deactivate_channel(struct mlx5e_channel *c) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index d2fd55e5c68b..e464e8179655 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -316,6 +316,9 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) { struct mlx5_priv *priv = &dev->priv; struct mlx5_eq_table *table = &priv->eq_table; + struct irq_affinity irqdesc = { + .pre_vectors = MLX5_EQ_VEC_COMP_BASE, + }; int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq); int nvec; @@ -329,9 +332,10 @@ static int mlx5_alloc_irq_vectors(struct mlx5_core_dev *dev) if (!priv->irq_info) goto err_free_msix; - nvec = pci_alloc_irq_vectors(dev->pdev, + nvec = pci_alloc_irq_vectors_affinity(dev->pdev, MLX5_EQ_VEC_COMP_BASE + 1, nvec, - PCI_IRQ_MSIX); + PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, + &irqdesc); if (nvec < 0) return nvec; @@ -605,63 +609,6 @@ u64 mlx5_read_internal_timer(struct mlx5_core_dev *dev) return (u64)timer_l | (u64)timer_h1 << 32; } -static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i) -{ - struct mlx5_priv *priv = &mdev->priv; - int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); - - if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) { - mlx5_core_warn(mdev, 
"zalloc_cpumask_var failed"); - return -ENOMEM; - } - - cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node), - priv->irq_info[i].mask); - - if (IS_ENABLED(CONFIG_SMP) && - irq_set_affinity_hint(irq, priv->irq_info[i].mask)) - mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq); - - return 0; -} - -static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i) -{ - struct mlx5_priv *priv = &mdev->priv; - int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i); - - irq_set_affinity_hint(irq, NULL); - free_cpumask_var(priv->irq_info[i].mask); -} - -static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev) -{ - int err; - int i; - - for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) { - err = mlx5_irq_set_affinity_hint(mdev, i); - if (err) - goto err_out; - } - - return 0; - -err_out: - for (i--; i >= 0; i--) - mlx5_irq_clear_affinity_hint(mdev, i); - - return err; -} - -static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev) -{ - int i; - - for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) - mlx5_irq_clear_affinity_hint(mdev, i); -} - int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, unsigned int *irqn) { @@ -1134,12 +1081,6 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto err_stop_eqs; } - err = mlx5_irq_set_affinity_hints(dev); - if (err) { - dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n"); - goto err_affinity_hints; - } - err = mlx5_init_fs(dev); if (err) { dev_err(&pdev->dev, "Failed to init flow steering\n"); @@ -1199,9 +1140,6 @@ err_sriov: mlx5_cleanup_fs(dev); err_fs: - mlx5_irq_clear_affinity_hints(dev); - -err_affinity_hints: free_comp_eqs(dev); err_stop_eqs: @@ -1274,7 +1212,6 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, mlx5_eswitch_detach(dev->priv.eswitch); #endif mlx5_cleanup_fs(dev); - mlx5_irq_clear_affinity_hints(dev); free_comp_eqs(dev); mlx5_stop_eqs(dev); 
mlx5_put_uars_page(dev, priv->uar); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5bac7f53b4f9..579731842c94 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -534,7 +534,6 @@ struct mlx5_core_sriov { }; struct mlx5_irq_info { - cpumask_var_t mask; char name[MLX5_MAX_IRQ_NAME]; }; @@ -1184,4 +1183,10 @@ enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, }; +static inline const struct cpumask * +mlx5_get_vector_affinity(struct mlx5_core_dev *dev, int vector) +{ + return pci_irq_get_affinity(dev->pdev, MLX5_EQ_VEC_COMP_BASE + vector); +} + #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From c66cd353bbe6869a059869a7a1518ec619afdc9d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:41 +0300 Subject: RDMA/core: expose affinity mappings per completion vector MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will allow ULPs to intelligently locate threads based on completion vector cpu affinity mappings. In case the driver does not expose a get_vector_affinity callout, return NULL so the caller can maintain a fallback logic. 
Reviewed-by: Christoph Hellwig Reviewed-by: Håkon Bugge Acked-by: Doug Ledford Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b5732432bb29..73ed2e4e802f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2296,6 +2296,8 @@ struct ib_device { */ int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len); + const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, + int comp_vector); }; struct ib_client { @@ -3706,4 +3708,26 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, else return RDMA_AH_ATTR_TYPE_IB; } + +/** + * ib_get_vector_affinity - Get the affinity mappings of a given completion + * vector + * @device: the rdma device + * @comp_vector: index of completion vector + * + * Returns NULL on failure, otherwise a corresponding cpu map of the + * completion vector (returns all-cpus map if the device driver doesn't + * implement get_vector_affinity). + */ +static inline const struct cpumask * +ib_get_vector_affinity(struct ib_device *device, int comp_vector) +{ + if (comp_vector < 0 || comp_vector >= device->num_comp_vectors || + !device->get_vector_affinity) + return NULL; + + return device->get_vector_affinity(device, comp_vector); + +} + #endif /* IB_VERBS_H */ -- cgit v1.2.3 From 24c5dc6610e8a3764fcb885cc3284c12ff1513de Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 13 Jul 2017 11:09:43 +0300 Subject: block: Add rdma affinity based queue mapping helper Like pci and virtio, we add a rdma helper for affinity spreading. This achieves optimal mq affinity assignments according to the underlying rdma device affinity maps.
Reviewed-by: Jens Axboe Reviewed-by: Christoph Hellwig Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg Signed-off-by: Doug Ledford --- block/Kconfig | 5 +++++ block/Makefile | 1 + block/blk-mq-rdma.c | 52 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/blk-mq-rdma.h | 10 +++++++++ 4 files changed, 68 insertions(+) create mode 100644 block/blk-mq-rdma.c create mode 100644 include/linux/blk-mq-rdma.h (limited to 'include') diff --git a/block/Kconfig b/block/Kconfig index 89cd28f8d051..3ab42bbb06d5 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -206,4 +206,9 @@ config BLK_MQ_VIRTIO depends on BLOCK && VIRTIO default y +config BLK_MQ_RDMA + bool + depends on BLOCK && INFINIBAND + default y + source block/Kconfig.iosched diff --git a/block/Makefile b/block/Makefile index 2b281cf258a0..9396ebc85d24 100644 --- a/block/Makefile +++ b/block/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o +obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c new file mode 100644 index 000000000000..996167f1de18 --- /dev/null +++ b/block/blk-mq-rdma.c @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017 Sagi Grimberg. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ +#include +#include +#include + +/** + * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device + * @set: tagset to provide the mapping for + * @dev: rdma device associated with @set. + * @first_vec: first interrupt vectors to use for queues (usually 0) + * + * This function assumes the rdma device @dev has at least as many available + * interrupt vetors as @set has queues. It will then query it's affinity mask + * and built queue mapping that maps a queue to the CPUs that have irq affinity + * for the corresponding vector. + * + * In case either the driver passed a @dev with less vectors than + * @set->nr_hw_queues, or @dev does not provide an affinity mask for a + * vector, we fallback to the naive mapping. + */ +int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, + struct ib_device *dev, int first_vec) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < set->nr_hw_queues; queue++) { + mask = ib_get_vector_affinity(dev, first_vec + queue); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + set->mq_map[cpu] = queue; + } + + return 0; + +fallback: + return blk_mq_map_queues(set); +} +EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h new file mode 100644 index 000000000000..b4ade198007d --- /dev/null +++ b/include/linux/blk-mq-rdma.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_BLK_MQ_RDMA_H +#define _LINUX_BLK_MQ_RDMA_H + +struct blk_mq_tag_set; +struct ib_device; + +int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, + struct ib_device *dev, int first_vec); + +#endif /* _LINUX_BLK_MQ_RDMA_H */ -- cgit v1.2.3 From 9047811b776ce09ba06623dd2a846cc501f0065b Mon Sep 17 00:00:00 2001 From: "Ismail, Mustafa" Date: Wed, 28 Jun 2017 09:02:45 -0500 Subject: RDMA/core: Add wait/retry version of ibnl_unicast Add a wait/retry version of ibnl_unicast, ibnl_unicast_wait, and modify ibnl_unicast to not wait/retry. 
This eliminates the undesirable wait for future users of ibnl_unicast. Change Portmapper calls originating from kernel to user-space to use ibnl_unicast_wait and take advantage of the wait/retry logic in netlink_unicast. Signed-off-by: Mustafa Ismail Signed-off-by: Chien Tin Tung Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/iwpm_msg.c | 6 +++--- drivers/infiniband/core/netlink.c | 12 +++++++++++- include/rdma/rdma_netlink.h | 10 ++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index a0e7c16d8bd8..add99b92afdf 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -174,7 +174,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) goto add_mapping_error; nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = ibnl_unicast_wait(skb, nlh, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -251,7 +251,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) goto query_mapping_error; nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = ibnl_unicast_wait(skb, nlh, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ err_str = "Unable to send a nlmsg"; @@ -312,7 +312,7 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) if (ret) goto remove_mapping_error; - ret = ibnl_unicast(skb, nlh, iwpm_user_pid); + ret = ibnl_unicast_wait(skb, nlh, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 94931c474d41..0fc50e15ae22 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c 
@@ -232,11 +232,21 @@ int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, { int err; - err = netlink_unicast(nls, skb, pid, 0); + err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT); return (err < 0) ? err : 0; } EXPORT_SYMBOL(ibnl_unicast); +int ibnl_unicast_wait(struct sk_buff *skb, struct nlmsghdr *nlh, + __u32 pid) +{ + int err; + + err = netlink_unicast(nls, skb, pid, 0); + return (err < 0) ? err : 0; +} +EXPORT_SYMBOL(ibnl_unicast_wait); + int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int group, gfp_t flags) { diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 348c102cb5f6..5b1466770917 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -63,6 +63,16 @@ int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, __u32 pid); +/** + * Send, with wait/1 retry, the supplied skb to a specific userspace PID. + * @skb: The netlink skb + * @nlh: Header of the netlink message to send + * @pid: Userspace netlink process ID + * Returns 0 on success or a negative error code. + */ +int ibnl_unicast_wait(struct sk_buff *skb, struct nlmsghdr *nlh, + __u32 pid); + /** * Send the supplied skb to a netlink group. * @skb: The netlink skb -- cgit v1.2.3 From c9901724a2f14128ef6a57986babcbfbcf61a257 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 5 Jun 2017 10:20:11 +0300 Subject: RDMA/netlink: Remove netlink clients infrastructure RDMA netlink has a complicated infrastructure for dynamically registering and de-registering netlink clients to the NETLINK_RDMA group. The complicated portion of this code is not widely used because 2 of the 3 current clients are statically compiled together with netlink.c. The infrastructure, therefore, is deemed overkill. Refactor the code to eliminate the dynamically added clients. 
Now all clients are pre-registered in a client array at compile time, and at run time they merely check-in with the infrastructure to pass their callback table for inclusion in the pre-sized client array. This also allows for future cleanups and removal of unneeded code in the iwcm* netlink handler. Signed-off-by: Leon Romanovsky Reviewed-by: Chien Tin Tung --- drivers/infiniband/core/cma.c | 6 +- drivers/infiniband/core/core_priv.h | 4 +- drivers/infiniband/core/device.c | 45 +++------ drivers/infiniband/core/iwcm.c | 10 +- drivers/infiniband/core/netlink.c | 185 +++++++++++++++++------------------- include/rdma/rdma_netlink.h | 13 +-- 6 files changed, 112 insertions(+), 151 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index ca4135c596ba..2a16a559bdda 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4512,9 +4512,7 @@ static int __init cma_init(void) if (ret) goto err; - if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table), - cma_cb_table)) - pr_warn("RDMA CMA: failed to add netlink callback\n"); + rdma_nl_register(RDMA_NL_RDMA_CM, cma_cb_table); cma_configfs_init(); return 0; @@ -4531,7 +4529,7 @@ err_wq: static void __exit cma_cleanup(void) { cma_configfs_exit(); - ibnl_remove_client(RDMA_NL_RDMA_CM); + rdma_nl_unregister(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); rdma_addr_unregister_client(&addr_client); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 11ae67514e13..e759c27113cd 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -179,8 +179,8 @@ void ib_mad_cleanup(void); int ib_sa_init(void); void ib_sa_cleanup(void); -int ibnl_init(void); -void ibnl_cleanup(void); +int rdma_nl_init(void); +void rdma_nl_exit(void); /** * Check if there are any listeners to the netlink group diff --git a/drivers/infiniband/core/device.c 
b/drivers/infiniband/core/device.c index a5dfab6adf49..d0994cd30eae 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1086,29 +1086,15 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, } EXPORT_SYMBOL(ib_get_net_dev_by_params); -static struct ibnl_client_cbs ibnl_ls_cb_table[] = { +static const struct ibnl_client_cbs ibnl_ls_cb_table[] = { [RDMA_NL_LS_OP_RESOLVE] = { - .dump = ib_nl_handle_resolve_resp, - .module = THIS_MODULE }, + .dump = ib_nl_handle_resolve_resp}, [RDMA_NL_LS_OP_SET_TIMEOUT] = { - .dump = ib_nl_handle_set_timeout, - .module = THIS_MODULE }, + .dump = ib_nl_handle_set_timeout}, [RDMA_NL_LS_OP_IP_RESOLVE] = { - .dump = ib_nl_handle_ip_res_resp, - .module = THIS_MODULE }, + .dump = ib_nl_handle_ip_res_resp}, }; -static int ib_add_ibnl_clients(void) -{ - return ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ibnl_ls_cb_table), - ibnl_ls_cb_table); -} - -static void ib_remove_ibnl_clients(void) -{ - ibnl_remove_client(RDMA_NL_LS); -} - static int __init ib_core_init(void) { int ret; @@ -1130,9 +1116,9 @@ static int __init ib_core_init(void) goto err_comp; } - ret = ibnl_init(); + ret = rdma_nl_init(); if (ret) { - pr_warn("Couldn't init IB netlink interface\n"); + pr_warn("Couldn't init IB netlink interface: err %d\n", ret); goto err_sysfs; } @@ -1154,24 +1140,17 @@ static int __init ib_core_init(void) goto err_mad; } - ret = ib_add_ibnl_clients(); - if (ret) { - pr_warn("Couldn't register ibnl clients\n"); - goto err_sa; - } - ret = register_lsm_notifier(&ibdev_lsm_nb); if (ret) { pr_warn("Couldn't register LSM notifier. 
ret %d\n", ret); - goto err_ibnl_clients; + goto err_sa; } + rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); ib_cache_setup(); return 0; -err_ibnl_clients: - ib_remove_ibnl_clients(); err_sa: ib_sa_cleanup(); err_mad: @@ -1179,7 +1158,7 @@ err_mad: err_addr: addr_cleanup(); err_ibnl: - ibnl_cleanup(); + rdma_nl_exit(); err_sysfs: class_unregister(&ib_class); err_comp: @@ -1191,13 +1170,13 @@ err: static void __exit ib_core_cleanup(void) { - unregister_lsm_notifier(&ibdev_lsm_nb); ib_cache_cleanup(); - ib_remove_ibnl_clients(); + rdma_nl_unregister(RDMA_NL_LS); + unregister_lsm_notifier(&ibdev_lsm_nb); ib_sa_cleanup(); ib_mad_cleanup(); addr_cleanup(); - ibnl_cleanup(); + rdma_nl_exit(); class_unregister(&ib_class); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. */ diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 31661b5c1743..8599271d8be6 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -1175,12 +1175,8 @@ static int __init iw_cm_init(void) ret = iwpm_init(RDMA_NL_IWCM); if (ret) pr_err("iw_cm: couldn't init iwpm\n"); - - ret = ibnl_add_client(RDMA_NL_IWCM, ARRAY_SIZE(iwcm_nl_cb_table), - iwcm_nl_cb_table); - if (ret) - pr_err("iw_cm: couldn't register netlink callbacks\n"); - + else + rdma_nl_register(RDMA_NL_IWCM, iwcm_nl_cb_table); iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); if (!iwcm_wq) return -ENOMEM; @@ -1200,7 +1196,7 @@ static void __exit iw_cm_cleanup(void) { unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); - ibnl_remove_client(RDMA_NL_IWCM); + rdma_nl_unregister(RDMA_NL_IWCM); iwpm_exit(RDMA_NL_IWCM); } diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 0fc50e15ae22..06f7ba31fbdd 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -39,16 +39,13 @@ #include #include "core_priv.h" -struct ibnl_client { - struct list_head 
list; - int index; - int nops; - const struct ibnl_client_cbs *cb_table; -}; +#include "core_priv.h" -static DEFINE_MUTEX(ibnl_mutex); +static DEFINE_MUTEX(rdma_nl_mutex); static struct sock *nls; -static LIST_HEAD(client_list); +static struct { + const struct ibnl_client_cbs *cb_table; +} rdma_nl_types[RDMA_NL_NUM_CLIENTS]; int ibnl_chk_listeners(unsigned int group) { @@ -57,58 +54,74 @@ int ibnl_chk_listeners(unsigned int group) return 0; } -int ibnl_add_client(int index, int nops, - const struct ibnl_client_cbs cb_table[]) +static bool is_nl_msg_valid(unsigned int type, unsigned int op) { - struct ibnl_client *cur; - struct ibnl_client *nl_client; + static const unsigned int max_num_ops[RDMA_NL_NUM_CLIENTS - 1] = { + RDMA_NL_RDMA_CM_NUM_OPS, + RDMA_NL_IWPM_NUM_OPS, + 0, + RDMA_NL_LS_NUM_OPS, + 0 }; - nl_client = kmalloc(sizeof *nl_client, GFP_KERNEL); - if (!nl_client) - return -ENOMEM; + /* + * This BUILD_BUG_ON is intended to catch addition of new + * RDMA netlink protocol without updating the array above. + */ + BUILD_BUG_ON(RDMA_NL_NUM_CLIENTS != 6); - nl_client->index = index; - nl_client->nops = nops; - nl_client->cb_table = cb_table; + if (type > RDMA_NL_NUM_CLIENTS - 1) + return false; - mutex_lock(&ibnl_mutex); + return (op < max_num_ops[type - 1]) ? true : false; +} - list_for_each_entry(cur, &client_list, list) { - if (cur->index == index) { - pr_warn("Client for %d already exists\n", index); - mutex_unlock(&ibnl_mutex); - kfree(nl_client); - return -EINVAL; - } - } +static bool is_nl_valid(unsigned int type, unsigned int op) +{ + if (!is_nl_msg_valid(type, op) || + !rdma_nl_types[type].cb_table || + !rdma_nl_types[type].cb_table[op].dump) + return false; + return true; +} - list_add_tail(&nl_client->list, &client_list); +void rdma_nl_register(unsigned int index, + const struct ibnl_client_cbs cb_table[]) +{ + mutex_lock(&rdma_nl_mutex); + if (!is_nl_msg_valid(index, 0)) { + /* + * All clients are not interesting in success/failure of + * this call. 
They want to see the print to error log and + * continue their initialization. Print warning for them, + * because it is programmer's error to be here. + */ + mutex_unlock(&rdma_nl_mutex); + WARN(true, + "The not-valid %u index was supplied to RDMA netlink\n", + index); + return; + } - mutex_unlock(&ibnl_mutex); + if (rdma_nl_types[index].cb_table) { + mutex_unlock(&rdma_nl_mutex); + WARN(true, + "The %u index is already registered in RDMA netlink\n", + index); + return; + } - return 0; + rdma_nl_types[index].cb_table = cb_table; + mutex_unlock(&rdma_nl_mutex); } -EXPORT_SYMBOL(ibnl_add_client); +EXPORT_SYMBOL(rdma_nl_register); -int ibnl_remove_client(int index) +void rdma_nl_unregister(unsigned int index) { - struct ibnl_client *cur, *next; - - mutex_lock(&ibnl_mutex); - list_for_each_entry_safe(cur, next, &client_list, list) { - if (cur->index == index) { - list_del(&(cur->list)); - mutex_unlock(&ibnl_mutex); - kfree(cur); - return 0; - } - } - pr_warn("Can't remove callback for client idx %d. 
Not found\n", index); - mutex_unlock(&ibnl_mutex); - - return -EINVAL; + mutex_lock(&rdma_nl_mutex); + rdma_nl_types[index].cb_table = NULL; + mutex_unlock(&rdma_nl_mutex); } -EXPORT_SYMBOL(ibnl_remove_client); +EXPORT_SYMBOL(rdma_nl_unregister); void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq, int len, int client, int op, int flags) @@ -149,45 +162,31 @@ EXPORT_SYMBOL(ibnl_put_attr); static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - struct ibnl_client *client; int type = nlh->nlmsg_type; - int index = RDMA_NL_GET_CLIENT(type); + unsigned int index = RDMA_NL_GET_CLIENT(type); unsigned int op = RDMA_NL_GET_OP(type); + struct netlink_callback cb = {}; + struct netlink_dump_control c = {}; - list_for_each_entry(client, &client_list, list) { - if (client->index == index) { - if (op >= client->nops || !client->cb_table[op].dump) - return -EINVAL; - - /* - * For response or local service set_timeout request, - * there is no need to use netlink_dump_start. - */ - if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || - (index == RDMA_NL_LS && - op == RDMA_NL_LS_OP_SET_TIMEOUT)) { - struct netlink_callback cb = { - .skb = skb, - .nlh = nlh, - .dump = client->cb_table[op].dump, - .module = client->cb_table[op].module, - }; - - return cb.dump(skb, &cb); - } - - { - struct netlink_dump_control c = { - .dump = client->cb_table[op].dump, - .module = client->cb_table[op].module, - }; - return netlink_dump_start(nls, skb, nlh, &c); - } - } + if (!is_nl_valid(index, op)) + return -EINVAL; + + /* + * For response or local service set_timeout request, + * there is no need to use netlink_dump_start. 
+ */ + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || + (index == RDMA_NL_LS && op == RDMA_NL_LS_OP_SET_TIMEOUT)) { + cb.skb = skb; + cb.nlh = nlh; + cb.dump = rdma_nl_types[index].cb_table[op].dump; + cb.module = rdma_nl_types[index].cb_table[op].module; + return cb.dump(skb, &cb); } - pr_info("Index %d wasn't found in client list\n", index); - return -EINVAL; + c.dump = rdma_nl_types[index].cb_table[op].dump; + c.module = rdma_nl_types[index].cb_table[op].module; + return netlink_dump_start(nls, skb, nlh, &c); } static void ibnl_rcv_reply_skb(struct sk_buff *skb) @@ -221,10 +220,10 @@ static void ibnl_rcv_reply_skb(struct sk_buff *skb) static void ibnl_rcv(struct sk_buff *skb) { - mutex_lock(&ibnl_mutex); + mutex_lock(&rdma_nl_mutex); ibnl_rcv_reply_skb(skb); netlink_rcv_skb(skb, &ibnl_rcv_msg); - mutex_unlock(&ibnl_mutex); + mutex_unlock(&rdma_nl_mutex); } int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -254,32 +253,26 @@ int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, } EXPORT_SYMBOL(ibnl_multicast); -int __init ibnl_init(void) +int __init rdma_nl_init(void) { struct netlink_kernel_cfg cfg = { .input = ibnl_rcv, }; nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg); - if (!nls) { - pr_warn("Failed to create netlink socket\n"); + if (!nls) return -ENOMEM; - } nls->sk_sndtimeo = 10 * HZ; return 0; } -void ibnl_cleanup(void) +void rdma_nl_exit(void) { - struct ibnl_client *cur, *next; + int idx; - mutex_lock(&ibnl_mutex); - list_for_each_entry_safe(cur, next, &client_list, list) { - list_del(&(cur->list)); - kfree(cur); - } - mutex_unlock(&ibnl_mutex); + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + rdma_nl_unregister(idx); netlink_kernel_release(nls); } diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 5b1466770917..aadf0ab963b2 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -11,23 +11,18 @@ struct ibnl_client_cbs { }; /** - * Add a a client to the list of IB netlink 
exporters. + * Register client in RDMA netlink. * @index: Index of the added client - * @nops: Number of supported ops by the added client. * @cb_table: A table for op->callback - * - * Returns 0 on success or a negative error code. */ -int ibnl_add_client(int index, int nops, - const struct ibnl_client_cbs cb_table[]); +void rdma_nl_register(unsigned int index, + const struct ibnl_client_cbs cb_table[]); /** * Remove a client from IB netlink. * @index: Index of the removed IB client. - * - * Returns 0 on success or a negative error code. */ -int ibnl_remove_client(int index); +void rdma_nl_unregister(unsigned int index); /** * Put a new message in a supplied skb. -- cgit v1.2.3 From 64401b69b29164c5731018cc44fc9b144ac9c5ae Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 30 May 2017 11:29:56 +0300 Subject: RDMA/netlink: Remove redundant owner option for netlink callbacks Owner field is not needed to be set because netlink is part of ib_core which will be unloaded last after all other modules are unloaded. 
Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cma.c | 3 +-- drivers/infiniband/core/netlink.c | 2 -- include/rdma/rdma_netlink.h | 1 - 3 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 2a16a559bdda..0c85f140e616 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4459,8 +4459,7 @@ out: } static const struct ibnl_client_cbs cma_cb_table[] = { - [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats, - .module = THIS_MODULE }, + [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, }; static int cma_init_net(struct net *net) diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 06f7ba31fbdd..cd9b7e7b7d2c 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -180,12 +180,10 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, cb.skb = skb; cb.nlh = nlh; cb.dump = rdma_nl_types[index].cb_table[op].dump; - cb.module = rdma_nl_types[index].cb_table[op].module; return cb.dump(skb, &cb); } c.dump = rdma_nl_types[index].cb_table[op].dump; - c.module = rdma_nl_types[index].cb_table[op].module; return netlink_dump_start(nls, skb, nlh, &c); } diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index aadf0ab963b2..c124d8e43fc8 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -7,7 +7,6 @@ struct ibnl_client_cbs { int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); - struct module *module; }; /** -- cgit v1.2.3 From e3a2b93dddad315f01a4b67faee738954c084072 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 12 Jun 2017 16:00:19 +0300 Subject: RDMA/netlink: Add flag to consolidate common handling Add ability to provide flags to control RDMA netlink callbacks and convert addr.c and sa_query.c to be first users of such infrastructure. 
It allows to move their CAP_NET_ADMIN checks into netlink core. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/addr.c | 3 +-- drivers/infiniband/core/device.c | 12 +++++++++--- drivers/infiniband/core/netlink.c | 4 ++++ drivers/infiniband/core/sa_query.c | 6 ++---- include/rdma/rdma_netlink.h | 6 ++++++ 5 files changed, 22 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 01236cef7bfb..9f3339861ec5 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -134,8 +134,7 @@ int ib_nl_handle_ip_res_resp(struct sk_buff *skb, const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; if ((nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; if (ib_nl_is_good_ip_resp(nlh)) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index d0994cd30eae..7ae29cc49a5e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1088,11 +1088,17 @@ EXPORT_SYMBOL(ib_get_net_dev_by_params); static const struct ibnl_client_cbs ibnl_ls_cb_table[] = { [RDMA_NL_LS_OP_RESOLVE] = { - .dump = ib_nl_handle_resolve_resp}, + .dump = ib_nl_handle_resolve_resp, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NL_LS_OP_SET_TIMEOUT] = { - .dump = ib_nl_handle_set_timeout}, + .dump = ib_nl_handle_set_timeout, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NL_LS_OP_IP_RESOLVE] = { - .dump = ib_nl_handle_ip_res_resp}, + .dump = ib_nl_handle_ip_res_resp, + .flags = RDMA_NL_ADMIN_PERM, + }, }; static int __init ib_core_init(void) diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 826fbd612c7d..c5ee62a24960 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -171,6 +171,10 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if 
(!is_nl_valid(index, op)) return -EINVAL; + if ((rdma_nl_types[index].cb_table[op].flags & RDMA_NL_ADMIN_PERM) && + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + /* * For response or local service set_timeout request, * there is no need to use netlink_dump_start. diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 70fa4cabe48e..b499f4422f41 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1033,8 +1033,7 @@ int ib_nl_handle_set_timeout(struct sk_buff *skb, int ret; if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), @@ -1109,8 +1108,7 @@ int ib_nl_handle_resolve_resp(struct sk_buff *skb, int ret; if ((nlh->nlmsg_flags & NLM_F_REQUEST) || - !(NETLINK_CB(skb).sk) || - !netlink_capable(skb, CAP_NET_ADMIN)) + !(NETLINK_CB(skb).sk)) return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index c124d8e43fc8..6ea36ec45401 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -7,6 +7,12 @@ struct ibnl_client_cbs { int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); + u8 flags; +}; + +enum rdma_nl_flags { + /* Require CAP_NET_ADMIN */ + RDMA_NL_ADMIN_PERM = 1 << 0, }; /** -- cgit v1.2.3 From f00e64637061876ec7b6383b0bd80197c51e7312 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 18 Jun 2017 15:35:20 +0300 Subject: RDMA/netlink: Rename and remove redundant parameter from ibnl_unicast* Netlink message header is not needed for unicast reply, hence remove it. 
Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/iwpm_msg.c | 6 +++--- drivers/infiniband/core/iwpm_util.c | 4 ++-- drivers/infiniband/core/netlink.c | 10 ++++------ include/rdma/rdma_netlink.h | 8 ++------ 4 files changed, 11 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 45de263305f5..ca3c160bb9da 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -172,7 +172,7 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) goto add_mapping_error; nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast_wait(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; @@ -248,7 +248,7 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) goto query_mapping_error; nlmsg_request->req_buffer = pm_msg; - ret = ibnl_unicast_wait(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ err_str = "Unable to send a nlmsg"; @@ -308,7 +308,7 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) if (ret) goto remove_mapping_error; - ret = ibnl_unicast_wait(skb, nlh, iwpm_user_pid); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNDEFINED; diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index c46442ac71a2..c81c55942626 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -597,7 +597,7 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); if (ret) goto mapinfo_num_error; - ret = ibnl_unicast(skb, nlh, 
iwpm_pid); + ret = rdma_nl_unicast(skb, iwpm_pid); if (ret) { skb = NULL; err_str = "Unable to send a nlmsg"; @@ -626,7 +626,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; - ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid); + ret = rdma_nl_unicast(skb, iwpm_pid); if (ret) pr_warn("%s Unable to send a nlmsg\n", __func__); return ret; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index e2395a1d9f45..b95a70013f19 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -239,25 +239,23 @@ static void rdma_nl_rcv(struct sk_buff *skb) mutex_unlock(&rdma_nl_mutex); } -int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, - __u32 pid) +int rdma_nl_unicast(struct sk_buff *skb, u32 pid) { int err; err = netlink_unicast(nls, skb, pid, MSG_DONTWAIT); return (err < 0) ? err : 0; } -EXPORT_SYMBOL(ibnl_unicast); +EXPORT_SYMBOL(rdma_nl_unicast); -int ibnl_unicast_wait(struct sk_buff *skb, struct nlmsghdr *nlh, - __u32 pid) +int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid) { int err; err = netlink_unicast(nls, skb, pid, 0); return (err < 0) ? err : 0; } -EXPORT_SYMBOL(ibnl_unicast_wait); +EXPORT_SYMBOL(rdma_nl_unicast_wait); int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int group, gfp_t flags) diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 6ea36ec45401..e7b0779385e9 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -56,22 +56,18 @@ int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh, /** * Send the supplied skb to a specific userspace PID. * @skb: The netlink skb - * @nlh: Header of the netlink message to send * @pid: Userspace netlink process ID * Returns 0 on success or a negative error code. 
*/ -int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh, - __u32 pid); +int rdma_nl_unicast(struct sk_buff *skb, u32 pid); /** * Send, with wait/1 retry, the supplied skb to a specific userspace PID. * @skb: The netlink skb - * @nlh: Header of the netlink message to send * @pid: Userspace netlink process ID * Returns 0 on success or a negative error code. */ -int ibnl_unicast_wait(struct sk_buff *skb, struct nlmsghdr *nlh, - __u32 pid); +int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid); /** * Send the supplied skb to a netlink group. -- cgit v1.2.3 From 4d7f693af0c9d0d6940ff36f5adca1adfa0e7e6e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 18 Jun 2017 15:44:32 +0300 Subject: RDMA/netlink: Rename and remove redundant parameter from ibnl_multicast The pointer to netlink header was not used in the ibnl_multicast function, so let's remove it and simplify the function signature. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/iwpm_msg.c | 2 +- drivers/infiniband/core/netlink.c | 5 ++--- drivers/infiniband/core/sa_query.c | 2 +- include/rdma/rdma_netlink.h | 4 +--- 5 files changed, 6 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9f3339861ec5..30cf764824ec 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -184,7 +184,7 @@ static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL); + rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, GFP_KERNEL); /* Make the request retry, so when we get the response from userspace * we will have something. 
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index ca3c160bb9da..30825bb9b8e9 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -103,7 +103,7 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); - ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL); + ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_IWPM, GFP_KERNEL); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ iwpm_user_pid = IWPM_PID_UNAVAILABLE; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index b95a70013f19..5c627d1fbaa9 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -257,12 +257,11 @@ int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid) } EXPORT_SYMBOL(rdma_nl_unicast_wait); -int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, - unsigned int group, gfp_t flags) +int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags) { return nlmsg_multicast(nls, skb, 0, group, flags); } -EXPORT_SYMBOL(ibnl_multicast); +EXPORT_SYMBOL(rdma_nl_multicast); int __init rdma_nl_init(void) { diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index b499f4422f41..977f64d0e983 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -861,7 +861,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, gfp_mask); + ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); if (!ret) ret = len; else diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index e7b0779385e9..16a94c425938 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h 
@@ -72,12 +72,10 @@ int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid); /** * Send the supplied skb to a netlink group. * @skb: The netlink skb - * @nlh: Header of the netlink message to send * @group: Netlink group ID * @flags: allocation flags * Returns 0 on success or a negative error code. */ -int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh, - unsigned int group, gfp_t flags); +int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); #endif /* _RDMA_NETLINK_H */ -- cgit v1.2.3 From ff61c425c1c563f1d688d59caf3b18a395cbf9c4 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 18 Jun 2017 15:51:16 +0300 Subject: RDMA/netlink: Simplify and rename ibnl_chk_listeners Make ibnl_chk_listeners function to be one line by removing unneeded comparison. Rename that function to be compliant with other functions in RDMA netlink. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/netlink.c | 7 +++---- drivers/infiniband/core/sa_query.c | 2 +- include/rdma/rdma_netlink.h | 6 ++++++ 4 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 30cf764824ec..7310ece99cd9 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -325,7 +325,7 @@ static void queue_req(struct addr_req *req) static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { - if (ibnl_chk_listeners(RDMA_NL_GROUP_LS)) + if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; /* We fill in what we can, the response will fill the rest */ diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 5c627d1fbaa9..514959ccaf2d 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -47,12 +47,11 @@ static struct { const struct ibnl_client_cbs
*cb_table; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; -int ibnl_chk_listeners(unsigned int group) +int rdma_nl_chk_listeners(unsigned int group) { - if (netlink_has_listeners(nls, group) == 0) - return -1; - return 0; + return (netlink_has_listeners(nls, group)) ? 0 : -1; } +EXPORT_SYMBOL(rdma_nl_chk_listeners); static bool is_nl_msg_valid(unsigned int type, unsigned int op) { diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 977f64d0e983..2cc85c2b74b7 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1418,7 +1418,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && (!(query->flags & IB_SA_QUERY_OPA))) { - if (!ibnl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { if (!ib_nl_make_request(query, gfp_mask)) return id; } diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 16a94c425938..348e0bbe0fc9 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -78,4 +78,10 @@ int rdma_nl_unicast_wait(struct sk_buff *skb, __u32 pid); */ int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); +/** + * Check if there are any listeners to the netlink group + * @group: the netlink group ID + * Returns 0 on success or a negative for no listeners. + */ +int rdma_nl_chk_listeners(unsigned int group); #endif /* _RDMA_NETLINK_H */ -- cgit v1.2.3 From 3250b4dbd87aa08c21891cabfc6f6b48b36fd7e5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 19 Jun 2017 18:23:45 +0300 Subject: RDMA/netlink: Rename netlink callback struct The RDMA netlink client infrastructure was removed and made obsolete. The old infrastructure defined struct ibnl_client_cbs. Now that all uses of this have been updated to the new infrastructure, rename the struct to be compliant with the current stack naming standards: struct rdma_nl_cbs. 
Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/cma.c | 2 +- drivers/infiniband/core/device.c | 2 +- drivers/infiniband/core/iwcm.c | 2 +- drivers/infiniband/core/netlink.c | 4 ++-- include/rdma/rdma_netlink.h | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 0c85f140e616..d8edd8b11561 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4458,7 +4458,7 @@ out: return skb->len; } -static const struct ibnl_client_cbs cma_cb_table[] = { +static const struct rdma_nl_cbs cma_cb_table[] = { [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats}, }; diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 7ae29cc49a5e..33a39518848c 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1086,7 +1086,7 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, } EXPORT_SYMBOL(ib_get_net_dev_by_params); -static const struct ibnl_client_cbs ibnl_ls_cb_table[] = { +static const struct rdma_nl_cbs ibnl_ls_cb_table[] = { [RDMA_NL_LS_OP_RESOLVE] = { .dump = ib_nl_handle_resolve_resp, .flags = RDMA_NL_ADMIN_PERM, diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 8599271d8be6..452a3115e3e6 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -80,7 +80,7 @@ const char *__attribute_const__ iwcm_reject_msg(int reason) } EXPORT_SYMBOL(iwcm_reject_msg); -static struct ibnl_client_cbs iwcm_nl_cb_table[] = { +static struct rdma_nl_cbs iwcm_nl_cb_table[] = { [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb}, [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb}, [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb}, diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 514959ccaf2d..a7082adae16b 100644 --- 
a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -44,7 +44,7 @@ static DEFINE_MUTEX(rdma_nl_mutex); static struct sock *nls; static struct { - const struct ibnl_client_cbs *cb_table; + const struct rdma_nl_cbs *cb_table; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; int rdma_nl_chk_listeners(unsigned int group) @@ -84,7 +84,7 @@ static bool is_nl_valid(unsigned int type, unsigned int op) } void rdma_nl_register(unsigned int index, - const struct ibnl_client_cbs cb_table[]) + const struct rdma_nl_cbs cb_table[]) { mutex_lock(&rdma_nl_mutex); if (!is_nl_msg_valid(index, 0)) { diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 348e0bbe0fc9..92f8832297ab 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -5,7 +5,7 @@ #include #include -struct ibnl_client_cbs { +struct rdma_nl_cbs { int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); u8 flags; }; @@ -21,7 +21,7 @@ enum rdma_nl_flags { * @cb_table: A table for op->callback */ void rdma_nl_register(unsigned int index, - const struct ibnl_client_cbs cb_table[]); + const struct rdma_nl_cbs cb_table[]); /** * Remove a client from IB netlink. -- cgit v1.2.3 From ecc82c53f9a4ce08ba7df626a4262c86841ced8f Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 18 Jun 2017 14:39:59 +0300 Subject: RDMA/core: Add and expose static device index This patch adds static device index in similar fashion to already available in netdev world (struct net->ifindex). In downstream patches, the RDMA netlink will use this idx-to-ib_device conversion, so as part of this commit, we are exposing the translation function to be visible for IB/core users.
Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/core_priv.h | 5 +++++ drivers/infiniband/core/device.c | 37 ++++++++++++++++++++++++++++++++++++- include/rdma/ib_verbs.h | 2 ++ 3 files changed, 43 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 0c175590cf92..cbdcc81e1df8 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -309,4 +309,9 @@ static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, return 0; } #endif + +struct ib_device *__ib_device_get_by_index(u32 ifindex); +/* RDMA device netlink */ +void nldev_init(void); +void nldev_exit(void); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 8828f26250a8..deae8b940994 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -134,6 +134,17 @@ static int ib_device_check_mandatory(struct ib_device *device) return 0; } +struct ib_device *__ib_device_get_by_index(u32 index) +{ + struct ib_device *device; + + list_for_each_entry(device, &device_list, core_list) + if (device->index == index) + return device; + + return NULL; +} + static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; @@ -145,7 +156,6 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } - static int alloc_name(char *name) { unsigned long *inuse; @@ -394,6 +404,30 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event, return NOTIFY_OK; } +/** + * __dev_new_index - allocate an device index + * + * Returns a suitable unique value for a new device interface + * number. It assumes that there are less than 2^32-1 ib devices + * will be present in the system. + */ +static u32 __dev_new_index(void) +{ + /* + * The device index to allow stable naming. + * Similar to struct net -> ifindex. 
+ */ + static u32 index; + + for (;;) { + if (!(++index)) + index = 1; + + if (!__ib_device_get_by_index(index)) + return index; + } +} + /** * ib_register_device - Register an IB device with IB core * @device:Device to register @@ -492,6 +526,7 @@ int ib_register_device(struct ib_device *device, if (client->add && !add_client_context(device, client)) client->add(device); + device->index = __dev_new_index(); down_write(&lists_rwsem); list_add_tail(&device->core_list, &device_list); up_write(&lists_rwsem); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 1082b4c81b2c..3391df5fdc9c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2298,6 +2298,8 @@ struct ib_device { struct rdmacg_device cg_device; #endif + u32 index; + /** * The following mandatory functions are used only at device * registration. Keep functions such as these at the end of this -- cgit v1.2.3 From 1830ba21b9a475cfc6159e6cfe532c75fe7682a4 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 15 Jun 2017 12:46:33 +0300 Subject: RDMA/netlink: Add and implement doit netlink callback The .doit callback is used by netlink core to differentiate between get and set operations. Common convention is to use that call for command operations like (SET, ADD, etc.) and/or access without NLM_F_DUMP flag. This commit adds proper declaration and implementation to RDMA netlink.
Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/netlink.c | 19 ++++++++++++++----- include/rdma/rdma_netlink.h | 2 ++ 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index a7082adae16b..484d6a8a2811 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -76,9 +76,13 @@ static bool is_nl_msg_valid(unsigned int type, unsigned int op) static bool is_nl_valid(unsigned int type, unsigned int op) { - if (!is_nl_msg_valid(type, op) || - !rdma_nl_types[type].cb_table || - !rdma_nl_types[type].cb_table[op].dump) + const struct rdma_nl_cbs *cb_table; + + if (!is_nl_msg_valid(type, op)) + return false; + + cb_table = rdma_nl_types[type].cb_table; + if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) return false; return true; } @@ -151,6 +155,7 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int op = RDMA_NL_GET_OP(type); struct netlink_callback cb = {}; struct netlink_dump_control c = {}; + int ret; if (!is_nl_valid(index, op)) return -EINVAL; @@ -169,10 +174,14 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, cb.nlh = nlh; cb.dump = rdma_nl_types[index].cb_table[op].dump; return cb.dump(skb, &cb); + } else { + c.dump = rdma_nl_types[index].cb_table[op].dump; + return netlink_dump_start(nls, skb, nlh, &c); } + if (rdma_nl_types[index].cb_table[op].doit) + ret = rdma_nl_types[index].cb_table[op].doit(skb, nlh, extack); + return ret; - c.dump = rdma_nl_types[index].cb_table[op].dump; - return netlink_dump_start(nls, skb, nlh, &c); } /* diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index 92f8832297ab..e25bf1988846 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -6,6 +6,8 @@ #include struct rdma_nl_cbs { + int (*doit)(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack 
*extack); int (*dump)(struct sk_buff *skb, struct netlink_callback *nlcb); u8 flags; }; -- cgit v1.2.3 From 1a6e7c31d71db34d1b9bc3acc87eaea6c2ecc997 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 20 Jun 2017 07:55:53 +0300 Subject: RDMA/netlink: Add netlink device definitions to UAPI Introduce new defines to rdma_netlink.h, so the RDMA configuration tool will be able to communicate with RDMA subsystem by using the shared defines. The addition of new client (NLDEV) revealed the fact that we exposed by mistake the RDMA_NL_I40IW define which is not backed by any RDMA netlink by now and it won't be exposed in the future too. So this patch reuses the value and deletes the old defines. The NLDEV operates with objects. The struct ib_device has two straightforward objects: device itself and ports of that device. This brings us to propose the following commands to work on those objects: * RDMA_NLDEV_CMD_{GET,SET,NEW,DEL} - works on ib_device itself * RDMA_NLDEV_CMD_PORT_{GET,SET,NEW,DEL} - works on ports of specific ib_device Those commands receive/return the device index (RDMA_NLDEV_ATTR_DEV_INDEX) and port index (RDMA_NLDEV_ATTR_PORT_INDEX). For device object accesses, the RDMA_NLDEV_ATTR_PORT_INDEX will return the maximum number of ports for specific ib_device and for port access the actual port index. The port index starts from 1 to follow RDMA/core internal semantics and the sysfs exposed knobs. 
Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/netlink.c | 2 +- include/uapi/rdma/rdma_netlink.h | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index cd692bd73793..27352a352770 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -61,7 +61,7 @@ static bool is_nl_msg_valid(unsigned int type, unsigned int op) RDMA_NL_IWPM_NUM_OPS, 0, RDMA_NL_LS_NUM_OPS, - 0 }; + RDMA_NLDEV_NUM_OPS }; /* * This BUILD_BUG_ON is intended to catch addition of new diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 02fe8390c18f..a44229fa5eca 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -8,7 +8,7 @@ enum { RDMA_NL_IWCM, RDMA_NL_RSVD, RDMA_NL_LS, /* RDMA Local Services */ - RDMA_NL_I40IW, + RDMA_NL_NLDEV, /* RDMA device interface */ RDMA_NL_NUM_CLIENTS }; @@ -222,4 +222,41 @@ struct rdma_nla_ls_gid { __u8 gid[16]; }; +enum rdma_nldev_command { + RDMA_NLDEV_CMD_UNSPEC, + + RDMA_NLDEV_CMD_GET, /* can dump */ + RDMA_NLDEV_CMD_SET, + RDMA_NLDEV_CMD_NEW, + RDMA_NLDEV_CMD_DEL, + + RDMA_NLDEV_CMD_PORT_GET, /* can dump */ + RDMA_NLDEV_CMD_PORT_SET, + RDMA_NLDEV_CMD_PORT_NEW, + RDMA_NLDEV_CMD_PORT_DEL, + + RDMA_NLDEV_NUM_OPS +}; + +enum rdma_nldev_attr { + /* don't change the order or add anything between, this is ABI! */ + RDMA_NLDEV_ATTR_UNSPEC, + + /* Identifier for ib_device */ + RDMA_NLDEV_ATTR_DEV_INDEX, /* u32 */ + + RDMA_NLDEV_ATTR_DEV_NAME, /* string */ + /* + * Device index together with port index are identifiers + * for port/link properties. + * + * For RDMA_NLDEV_CMD_GET commamnd, port index will return number + * of available ports in ib_device, while for port specific operations, + * it will be real port index as it appears in sysfs. 
Port index follows + * sysfs notation and starts from 1 for the first port. + */ + RDMA_NLDEV_ATTR_PORT_INDEX, /* u32 */ + + RDMA_NLDEV_ATTR_MAX +}; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From ac50525374315b9b609747f83b07f8dccb06b722 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 20 Jun 2017 14:47:08 +0300 Subject: RDMA/netlink: Expose device and port capability masks The port capability mask is exposed to user space via sysfs interface, while device capabilities are available for verbs only. This patch provides those capabilities through netlink interface. Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise --- drivers/infiniband/core/nldev.c | 19 +++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 5 +++++ 2 files changed, 24 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index db9d9ffc1415..94c1e49074f5 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -50,18 +50,37 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) return -EMSGSIZE; + + BUILD_BUG_ON(sizeof(device->attrs.device_cap_flags) != sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + device->attrs.device_cap_flags, 0)) + return -EMSGSIZE; + return 0; } static int fill_port_info(struct sk_buff *msg, struct ib_device *device, u32 port) { + struct ib_port_attr attr; + int ret; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) return -EMSGSIZE; + + ret = ib_query_port(device, port, &attr); + if (ret) + return ret; + + BUILD_BUG_ON(sizeof(attr.port_cap_flags) > sizeof(u64)); + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, + (u64)attr.port_cap_flags, 0)) + return 
-EMSGSIZE; + return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index a44229fa5eca..90de11db6580 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -257,6 +257,11 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_PORT_INDEX, /* u32 */ + /* + * Device and port capabilities + */ + RDMA_NLDEV_ATTR_CAP_FLAGS, /* u64 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 9abb0d1bbd9529c574eacd8586e2bf68d17966cd Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 27 Jun 2017 16:49:53 +0300 Subject: RDMA: Simplify get firmware interface There is a need to forward FW version to user space application through RDMA netlink. In order to make it safe, there is need to declare nla_policy and limit the size of FW string. The new define IB_FW_VERSION_NAME_MAX will limit the size of FW version string. That define was chosen to be equal to ETHTOOL_FWVERS_LEN, because many drivers anyway are limited by that value indirectly. The introduction of this define allows us to remove the string size from get_fw_str function signature. 
Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 4 ++-- drivers/infiniband/core/sysfs.c | 4 ++-- drivers/infiniband/hw/cxgb3/iwch_provider.c | 5 ++--- drivers/infiniband/hw/cxgb4/provider.c | 5 ++--- drivers/infiniband/hw/hfi1/verbs.c | 5 ++--- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 7 +++---- drivers/infiniband/hw/mlx4/main.c | 5 ++--- drivers/infiniband/hw/mlx5/main.c | 8 ++++---- drivers/infiniband/hw/mthca/mthca_provider.c | 5 ++--- drivers/infiniband/hw/nes/nes_verbs.c | 5 ++--- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 5 ++--- drivers/infiniband/hw/qedr/main.c | 5 ++--- drivers/infiniband/hw/usnic/usnic_ib_main.c | 6 ++---- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 5 ++--- drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 3 +-- include/rdma/ib_verbs.h | 6 ++++-- 16 files changed, 36 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 66b109bc6753..fbc92c649be8 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -336,10 +336,10 @@ static int read_port_immutable(struct ib_device *device) return 0; } -void ib_get_device_fw_str(struct ib_device *dev, char *str, size_t str_len) +void ib_get_device_fw_str(struct ib_device *dev, char *str) { if (dev->get_dev_fw_str) - dev->get_dev_fw_str(dev, str, str_len); + dev->get_dev_fw_str(dev, str); else str[0] = '\0'; } diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7ebe1ef23652..abc5ab581f82 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1210,8 +1210,8 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, { struct ib_device *dev = container_of(device, struct ib_device, dev); - ib_get_device_fw_str(dev, buf, PAGE_SIZE); - strlcat(buf, "\n", PAGE_SIZE); + ib_get_device_fw_str(dev, buf); + strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); return strlen(buf); } diff --git 
a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 0cd0c1fa27d4..099e76f3758a 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1336,8 +1336,7 @@ static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str, - size_t str_len) +static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str) { struct iwch_dev *iwch_dev = to_iwch_dev(ibdev); struct ethtool_drvinfo info; @@ -1345,7 +1344,7 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str, pr_debug("%s dev 0x%p\n", __func__, iwch_dev); lldev->ethtool_ops->get_drvinfo(lldev, &info); - snprintf(str, str_len, "%s", info.fw_version); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version); } int iwch_register_device(struct iwch_dev *dev) diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 0771e9a4d061..346e8334279a 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -517,14 +517,13 @@ static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *dev, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *dev, char *str) { struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev); pr_debug("%s dev 0x%p\n", __func__, dev); - snprintf(str, str_len, "%u.%u.%u.%u", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u.%u", FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers), FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers), FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers), diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index dc51bf247006..c88c03c11555 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1561,14 +1561,13 @@ static void 
init_ibport(struct hfi1_pportdata *ppd) RCU_INIT_POINTER(ibp->rvp.qp[1], NULL); } -static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str, - size_t str_len) +static void hfi1_get_dev_fw_str(struct ib_device *ibdev, char *str) { struct rvt_dev_info *rdi = ib_to_rvt(ibdev); struct hfi1_ibdev *dev = dev_from_rdi(rdi); u32 ver = dd_from_dev(dev)->dc8051_ver; - snprintf(str, str_len, "%u.%u.%u", dc8051_ver_maj(ver), + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u.%u", dc8051_ver_maj(ver), dc8051_ver_min(ver), dc8051_ver_patch(ver)); } diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 02d871db7ca5..1aa411034a27 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2584,13 +2584,12 @@ static const char * const i40iw_hw_stat_names[] = { "iwRdmaInv" }; -static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str, - size_t str_len) +static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str) { u32 firmware_version = I40IW_FW_VERSION; - snprintf(str, str_len, "%u.%u", firmware_version, - (firmware_version & 0x000000ff)); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", firmware_version, + (firmware_version & 0x000000ff)); } /** diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 1f25a37eb056..c636842c5be0 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2587,12 +2587,11 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_fw_ver_str(struct ib_device *device, char *str, - size_t str_len) +static void get_fw_ver_str(struct ib_device *device, char *str) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev); - snprintf(str, str_len, "%d.%d.%d", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d", (int) (dev->dev->caps.fw_ver >> 32), (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, (int) 
dev->dev->caps.fw_ver & 0xffff); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 9279631d8da0..0a5a4e3fa66d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3285,13 +3285,13 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *ibdev, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *ibdev, char *str) { struct mlx5_ib_dev *dev = container_of(ibdev, struct mlx5_ib_dev, ib_dev); - snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev), - fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%04d", + fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev), + fw_rev_sub(dev->mdev)); } static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index c197cd9b193f..eae9bffd45d4 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1178,12 +1178,11 @@ static int mthca_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *device, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *device, char *str) { struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev); - snprintf(str, str_len, "%d.%d.%d", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d", (int) (dev->fw_ver >> 32), (int) (dev->fw_ver >> 16) & 0xffff, (int) dev->fw_ver & 0xffff); diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 25dcd7573df9..c2943e39d2f9 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3672,15 +3672,14 @@ static int nes_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *dev, char *str, 
- size_t str_len) +static void get_dev_fw_str(struct ib_device *dev, char *str) { struct nes_ib_device *nesibdev = container_of(dev, struct nes_ib_device, ibdev); struct nes_vnic *nesvnic = nesibdev->nesvnic; nes_debug(NES_DBG_INIT, "\n"); - snprintf(str, str_len, "%u.%u", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", (nesvnic->nesdev->nesadapter->firmware_version >> 16), (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff)); } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 757c65816295..fbfbd9e96147 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -107,12 +107,11 @@ static int ocrdma_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void get_dev_fw_str(struct ib_device *device, char *str, - size_t str_len) +static void get_dev_fw_str(struct ib_device *device, char *str) { struct ocrdma_dev *dev = get_ocrdma_dev(device); - snprintf(str, str_len, "%s", &dev->attr.fw_ver[0]); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", &dev->attr.fw_ver[0]); } static int ocrdma_register_device(struct ocrdma_dev *dev) diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index 199b6edbef92..97d033f51dc9 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -68,13 +68,12 @@ static enum rdma_link_layer qedr_link_layer(struct ib_device *device, return IB_LINK_LAYER_ETHERNET; } -static void qedr_get_dev_fw_str(struct ib_device *ibdev, char *str, - size_t str_len) +static void qedr_get_dev_fw_str(struct ib_device *ibdev, char *str) { struct qedr_dev *qedr = get_qedr_dev(ibdev); u32 fw_ver = (u32)qedr->attr.fw_ver; - snprintf(str, str_len, "%d. %d. %d. %d", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d. %d. %d. 
%d", (fw_ver >> 24) & 0xFF, (fw_ver >> 16) & 0xFF, (fw_ver >> 8) & 0xFF, fw_ver & 0xFF); } diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index e69c8e476a2b..e86700f994cb 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -333,9 +333,7 @@ static int usnic_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } -static void usnic_get_dev_fw_str(struct ib_device *device, - char *str, - size_t str_len) +static void usnic_get_dev_fw_str(struct ib_device *device, char *str) { struct usnic_ib_dev *us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev); @@ -345,7 +343,7 @@ static void usnic_get_dev_fw_str(struct ib_device *device, us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info); mutex_unlock(&us_ibdev->usdev_lock); - snprintf(str, str_len, "%s", info.fw_version); + snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version); } /* Start of PF discovery section */ diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index e76565280afa..7f29e4db28a1 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -102,12 +102,11 @@ static struct device_attribute *pvrdma_class_attributes[] = { &dev_attr_board_id }; -static void pvrdma_get_fw_ver_str(struct ib_device *device, char *str, - size_t str_len) +static void pvrdma_get_fw_ver_str(struct ib_device *device, char *str) { struct pvrdma_dev *dev = container_of(device, struct pvrdma_dev, ib_dev); - snprintf(str, str_len, "%d.%d.%d\n", + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d\n", (int) (dev->dsr->caps.fw_ver >> 32), (int) (dev->dsr->caps.fw_ver >> 16) & 0xffff, (int) dev->dsr->caps.fw_ver & 0xffff); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 7871379342f4..98e30b41e436 100644 --- 
a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -62,8 +62,7 @@ static void ipoib_get_drvinfo(struct net_device *netdev, { struct ipoib_dev_priv *priv = ipoib_priv(netdev); - ib_get_device_fw_str(priv->ca, drvinfo->fw_version, - sizeof(drvinfo->fw_version)); + ib_get_device_fw_str(priv->ca, drvinfo->fw_version); strlcpy(drvinfo->bus_info, dev_name(priv->ca->dev.parent), sizeof(drvinfo->bus_info)); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3391df5fdc9c..e0e87a1f66fb 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -64,6 +64,8 @@ #include #include +#define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN + extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; @@ -2307,7 +2309,7 @@ struct ib_device { * in fast paths. */ int (*get_port_immutable)(struct ib_device *, u8, struct ib_port_immutable *); - void (*get_dev_fw_str)(struct ib_device *, char *str, size_t str_len); + void (*get_dev_fw_str)(struct ib_device *, char *str); }; struct ib_client { @@ -2343,7 +2345,7 @@ struct ib_client { struct ib_device *ib_alloc_device(size_t size); void ib_dealloc_device(struct ib_device *device); -void ib_get_device_fw_str(struct ib_device *device, char *str, size_t str_len); +void ib_get_device_fw_str(struct ib_device *device, char *str); int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, -- cgit v1.2.3 From 8621a7e3c1c22e18385c9ced1647363884ea2aa1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 27 Jun 2017 16:58:59 +0300 Subject: RDMA/netlink: Export FW version Add FW version to the device properties exported by RDMA netlink, to be used by RDMAtool. 
Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 9 +++++++++ include/uapi/rdma/rdma_netlink.h | 4 ++++ 2 files changed, 13 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 94c1e49074f5..cdc970ca5a1b 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -40,10 +40,14 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, .len = IB_DEVICE_NAME_MAX - 1}, [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, + .len = IB_FW_VERSION_NAME_MAX - 1}, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) { + char fw[IB_FW_VERSION_NAME_MAX]; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) @@ -56,6 +60,11 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) device->attrs.device_cap_flags, 0)) return -EMSGSIZE; + ib_get_device_fw_str(device, fw); + /* Device without FW has strlen(fw) = 0 */ + if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) + return -EMSGSIZE; + return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 90de11db6580..5159858730b0 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -262,6 +262,10 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_CAP_FLAGS, /* u64 */ + /* + * FW version + */ + RDMA_NLDEV_ATTR_FW_VERSION, /* string */ RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 1aaff896ca6b968a639e3e1e72ba6146ba332501 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 28 Jun 2017 14:01:37 +0300 Subject: RDMA/netlink: Export node_guid and sys_image_guid Add Node GUID and system image GUID to the device properties exported by RDMA netlink, to be
used by RDMAtool. Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 8 ++++++++ include/uapi/rdma/rdma_netlink.h | 13 +++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index cdc970ca5a1b..f932c2c3fad0 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -42,6 +42,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_PORT_INDEX] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_FW_VERSION] = { .type = NLA_NUL_STRING, .len = IB_FW_VERSION_NAME_MAX - 1}, + [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) @@ -65,6 +67,12 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) if (strlen(fw) && nla_put_string(msg, RDMA_NLDEV_ATTR_FW_VERSION, fw)) return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_NODE_GUID, + be64_to_cpu(device->node_guid), 0)) + return -EMSGSIZE; + if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, + be64_to_cpu(device->attrs.sys_image_guid), 0)) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 5159858730b0..fe3a7429e7a1 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -266,6 +266,19 @@ enum rdma_nldev_attr { * FW version */ RDMA_NLDEV_ATTR_FW_VERSION, /* string */ + + /* + * Node GUID (in host byte order) associated with the RDMA device. + */ + RDMA_NLDEV_ATTR_NODE_GUID, /* u64 */ + + /* + * System image GUID (in host byte order) associated with + * this RDMA device and other devices which are part of a + * single system. 
+ */ + RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, /* u64 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 12026fbba6af2fc53c3c6cf88bdfc6561986ba82 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 28 Jun 2017 15:05:14 +0300 Subject: RDMA/netlink: Advertise IB subnet prefix Add IB subnet prefix to the port properties exported by RDMA netlink. Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 5 +++++ include/uapi/rdma/rdma_netlink.h | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index f932c2c3fad0..7af71d5e52c8 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -44,6 +44,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { .len = IB_FW_VERSION_NAME_MAX - 1}, [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) @@ -97,6 +98,10 @@ static int fill_port_info(struct sk_buff *msg, if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_CAP_FLAGS, (u64)attr.port_cap_flags, 0)) return -EMSGSIZE; + if (rdma_protocol_ib(device, port) && + nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, + attr.subnet_prefix, 0)) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index fe3a7429e7a1..481003182a35 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -279,6 +279,11 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, /* u64 */ + /* + * Subnet prefix (in host byte order) + */ + RDMA_NLDEV_ATTR_SUBNET_PREFIX, /* u64 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 80a06dd36f79de7007f21f5cbe42181a4e5c7d6d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 28 Jun 2017 
15:38:36 +0300 Subject: RDMA/netlink: Export lids and sm_lids According to the IB specification, the LID and SM_LID are 16-bit wide, but to support OmniPath users, export it as 32-bit value from the beginning. Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 9 ++++++++- include/uapi/rdma/rdma_netlink.h | 8 ++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 7af71d5e52c8..16f1d28bea69 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -45,6 +45,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_NODE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SYS_IMAGE_GUID] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) @@ -102,7 +104,12 @@ static int fill_port_info(struct sk_buff *msg, nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SUBNET_PREFIX, attr.subnet_prefix, 0)) return -EMSGSIZE; - + if (rdma_protocol_ib(device, port)) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_LID, attr.lid)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) + return -EMSGSIZE; + } return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 481003182a35..7d5caaf54126 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -284,6 +284,14 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_SUBNET_PREFIX, /* u64 */ + /* + * Local Identifier (LID), + * According to IB specification, It is 16-bit address assigned + * by the Subnet Manager. Extended to be 32-bit for OmniPath users.
+ */ + RDMA_NLDEV_ATTR_LID, /* u32 */ + RDMA_NLDEV_ATTR_SM_LID, /* u32 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 34840fea112d36507c19dc6052b8c6d88bdd9c16 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 28 Jun 2017 15:49:30 +0300 Subject: RDMA/netlink: Export LID mask control (LMC) Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 3 +++ include/uapi/rdma/rdma_netlink.h | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 16f1d28bea69..11546f87c5dc 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -47,6 +47,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_SUBNET_PREFIX] = { .type = NLA_U64 }, [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) @@ -109,6 +110,8 @@ static int fill_port_info(struct sk_buff *msg, return -EMSGSIZE; if (nla_put_u32(msg, RDMA_NLDEV_ATTR_SM_LID, attr.sm_lid)) return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc)) + return -EMSGSIZE; } return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 7d5caaf54126..035706e6b016 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -292,6 +292,11 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_LID, /* u32 */ RDMA_NLDEV_ATTR_SM_LID, /* u32 */ + /* + * LID mask control (LMC) + */ + RDMA_NLDEV_ATTR_LMC, /* u8 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 5654e49db0b2d87c12b6e120b6a830abe3d3921b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 29 Jun 2017 13:12:45 +0300 Subject: RDMA/netlink: Provide port state and physical link state Add port state and physical link state to 
the users of RDMA netlink. Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 6 ++++++ include/uapi/rdma/rdma_netlink.h | 3 +++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 11546f87c5dc..32ccb2b88933 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -48,6 +48,8 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_SM_LID] = { .type = NLA_U32 }, [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) @@ -113,6 +115,10 @@ static int fill_port_info(struct sk_buff *msg, if (nla_put_u8(msg, RDMA_NLDEV_ATTR_LMC, attr.lmc)) return -EMSGSIZE; } + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_STATE, attr.state)) + return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_PORT_PHYS_STATE, attr.phys_state)) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 035706e6b016..c488c3cf361b 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -297,6 +297,9 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_LMC, /* u8 */ + RDMA_NLDEV_ATTR_PORT_STATE, /* u8 */ + RDMA_NLDEV_ATTR_PORT_PHYS_STATE, /* u8 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 1bb77b8c1d57149ed0aa6825255ead80ae584034 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 29 Jun 2017 16:01:29 +0300 Subject: RDMA/netlink: Export node_type Add ability to get node_type for RDMA netlink users.
Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/nldev.c | 3 +++ include/uapi/rdma/rdma_netlink.h | 2 ++ 2 files changed, 5 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 32ccb2b88933..474022274e09 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -50,6 +50,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_LMC] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, }; static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) @@ -79,6 +80,8 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) if (nla_put_u64_64bit(msg, RDMA_NLDEV_ATTR_SYS_IMAGE_GUID, be64_to_cpu(device->attrs.sys_image_guid), 0)) return -EMSGSIZE; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type)) + return -EMSGSIZE; return 0; } diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index c488c3cf361b..861440a87e7c 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -300,6 +300,8 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_PORT_STATE, /* u8 */ RDMA_NLDEV_ATTR_PORT_PHYS_STATE, /* u8 */ + RDMA_NLDEV_ATTR_DEV_NODE_TYPE, /* u8 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From 92d50fc1602ecef44babe411c475344e55e1cdd9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 19 Jul 2017 15:01:06 +0200 Subject: PCI/IB: add support for pci driver attribute groups Some drivers (specifically the nes IB driver), want to create a lot of sysfs driver attributes. Instead of open-coding the creation and removal of these files (and getting it wrong btw), it's a better idea to let the driver core handle all of this logic for us. 
So add a new field to the pci driver structure, **groups, that allows pci drivers to specify an attribute group list it wishes to have created when it is registered with the driver core. Big bonus is now the driver doesn't race with userspace when the sysfs files are created vs. when the kobject is announced, so any script/tool that actually wanted to use these files will not have to poll waiting for them to show up. Cc: Faisal Latif Cc: Doug Ledford Cc: Sean Hefty Cc: Hal Rosenstock Cc: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman Acked-by: Bjorn Helgaas Signed-off-by: Doug Ledford --- drivers/infiniband/hw/nes/nes.c | 67 ++++++++++++++--------------------------- drivers/pci/pci-driver.c | 1 + include/linux/pci.h | 1 + 3 files changed, 25 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index a30aa6527f7e..84162e9ae5c0 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -808,13 +808,6 @@ static void nes_remove(struct pci_dev *pcidev) } -static struct pci_driver nes_pci_driver = { - .name = DRV_NAME, - .id_table = nes_pci_table, - .probe = nes_probe, - .remove = nes_remove, -}; - static ssize_t adapter_show(struct device_driver *ddp, char *buf) { unsigned int devfn = 0xffffffff; @@ -1156,35 +1149,29 @@ static DRIVER_ATTR_RW(idx_addr); static DRIVER_ATTR_RW(idx_data); static DRIVER_ATTR_RW(wqm_quanta); -static int nes_create_driver_sysfs(struct pci_driver *drv) -{ - int error; - error = driver_create_file(&drv->driver, &driver_attr_adapter); - error |= driver_create_file(&drv->driver, &driver_attr_eeprom_cmd); - error |= driver_create_file(&drv->driver, &driver_attr_eeprom_data); - error |= driver_create_file(&drv->driver, &driver_attr_flash_cmd); - error |= driver_create_file(&drv->driver, &driver_attr_flash_data); - error |= driver_create_file(&drv->driver, &driver_attr_nonidx_addr); - error |= driver_create_file(&drv->driver, 
&driver_attr_nonidx_data); - error |= driver_create_file(&drv->driver, &driver_attr_idx_addr); - error |= driver_create_file(&drv->driver, &driver_attr_idx_data); - error |= driver_create_file(&drv->driver, &driver_attr_wqm_quanta); - return error; -} +static struct attribute *nes_attrs[] = { + &driver_attr_adapter.attr, + &driver_attr_eeprom_cmd.attr, + &driver_attr_eeprom_data.attr, + &driver_attr_flash_cmd.attr, + &driver_attr_flash_data.attr, + &driver_attr_nonidx_addr.attr, + &driver_attr_nonidx_data.attr, + &driver_attr_idx_addr.attr, + &driver_attr_idx_data.attr, + &driver_attr_wqm_quanta.attr, + NULL, +}; +ATTRIBUTE_GROUPS(nes); + +static struct pci_driver nes_pci_driver = { + .name = DRV_NAME, + .id_table = nes_pci_table, + .probe = nes_probe, + .remove = nes_remove, + .groups = nes_groups, +}; -static void nes_remove_driver_sysfs(struct pci_driver *drv) -{ - driver_remove_file(&drv->driver, &driver_attr_adapter); - driver_remove_file(&drv->driver, &driver_attr_eeprom_cmd); - driver_remove_file(&drv->driver, &driver_attr_eeprom_data); - driver_remove_file(&drv->driver, &driver_attr_flash_cmd); - driver_remove_file(&drv->driver, &driver_attr_flash_data); - driver_remove_file(&drv->driver, &driver_attr_nonidx_addr); - driver_remove_file(&drv->driver, &driver_attr_nonidx_data); - driver_remove_file(&drv->driver, &driver_attr_idx_addr); - driver_remove_file(&drv->driver, &driver_attr_idx_data); - driver_remove_file(&drv->driver, &driver_attr_wqm_quanta); -} /** * nes_init_module - module initialization entry point @@ -1192,20 +1179,13 @@ static void nes_remove_driver_sysfs(struct pci_driver *drv) static int __init nes_init_module(void) { int retval; - int retval1; retval = nes_cm_start(); if (retval) { printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n"); return retval; } - retval = pci_register_driver(&nes_pci_driver); - if (retval >= 0) { - retval1 = nes_create_driver_sysfs(&nes_pci_driver); - if (retval1 < 0) - printk(KERN_ERR PFX "Unable to create 
NetEffect sys files.\n"); - } - return retval; + return pci_register_driver(&nes_pci_driver); } @@ -1215,7 +1195,6 @@ static int __init nes_init_module(void) static void __exit nes_exit_module(void) { nes_cm_stop(); - nes_remove_driver_sysfs(&nes_pci_driver); pci_unregister_driver(&nes_pci_driver); } diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index d51e8738f9c2..4450feaf5c00 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -1307,6 +1307,7 @@ int __pci_register_driver(struct pci_driver *drv, struct module *owner, drv->driver.bus = &pci_bus_type; drv->driver.owner = owner; drv->driver.mod_name = mod_name; + drv->driver.groups = drv->groups; spin_lock_init(&drv->dynids.lock); INIT_LIST_HEAD(&drv->dynids.list); diff --git a/include/linux/pci.h b/include/linux/pci.h index 4869e66dd659..c421af60f2df 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -729,6 +729,7 @@ struct pci_driver { void (*shutdown) (struct pci_dev *dev); int (*sriov_configure) (struct pci_dev *dev, int num_vfs); /* PF pdev */ const struct pci_error_handlers *err_handler; + const struct attribute_group **groups; struct device_driver driver; struct pci_dynids dynids; }; -- cgit v1.2.3 From 5f8a4db715ca21c3195b5eea24e26574c0f5acfa Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 31 Jul 2017 08:50:05 +0200 Subject: infiniband: avoid overflow warning A sockaddr_in structure on the stack getting passed into rdma_ip2gid triggers this warning, since we memcpy into a larger sockaddr_in6 structure: In function 'memcpy', inlined from 'rdma_ip2gid' at include/rdma/ib_addr.h:175:3, inlined from 'addr_event.isra.4.constprop' at drivers/infiniband/core/roce_gid_mgmt.c:693:2, inlined from 'inetaddr_event' at drivers/infiniband/core/roce_gid_mgmt.c:716:9: include/linux/string.h:305:4: error: call to '__read_overflow2' declared with attribute error: detected read beyond size of object passed as 2nd parameter The warning seems appropriate here, but the code 
is also clearly correct, so we really just want to shut up this instance of the output. The best way I found so far is to avoid the memcpy() call and instead replace it with a struct assignment. Fixes: 6974f0c4555e ("include/linux/string.h: add the option of fortified string.h functions") Cc: Daniel Micay Cc: Kees Cook Signed-off-by: Arnd Bergmann Signed-off-by: Doug Ledford --- include/rdma/ib_addr.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index b73a14edc85e..454e6ea742a5 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -172,7 +172,8 @@ static inline int rdma_ip2gid(struct sockaddr *addr, union ib_gid *gid) (struct in6_addr *)gid); break; case AF_INET6: - memcpy(gid->raw, &((struct sockaddr_in6 *)addr)->sin6_addr, 16); + *(struct in6_addr *)&gid->raw = + ((struct sockaddr_in6 *)addr)->sin6_addr; break; default: return -EINVAL; -- cgit v1.2.3 From 62ede7779904bc75bdd84f1ff0016113956ce3b4 Mon Sep 17 00:00:00 2001 From: "Hiatt, Don" Date: Mon, 14 Aug 2017 14:17:43 -0400 Subject: Add OPA extended LID support This patch series primarily increases sizes of variables that hold lid values from 16 to 32 bits. Additionally, it adds a check in the IB mad stack to verify a properly formatted MAD when OPA extended LIDs are used. 
Signed-off-by: Don Hiatt Reviewed-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/cm.c | 4 ++-- drivers/infiniband/core/user_mad.c | 2 +- drivers/infiniband/core/uverbs_cmd.c | 11 ++++++----- drivers/infiniband/hw/hfi1/mad.c | 2 +- drivers/infiniband/hw/mlx4/alias_GUID.c | 2 +- drivers/infiniband/hw/mlx4/mad.c | 8 ++++---- drivers/infiniband/hw/mlx5/mad.c | 2 +- drivers/infiniband/hw/mthca/mthca_cmd.c | 4 ++-- drivers/infiniband/hw/mthca/mthca_mad.c | 4 ++-- drivers/infiniband/sw/rdmavt/cq.c | 2 +- include/rdma/ib_verbs.h | 26 ++++++++++++++++++++------ 11 files changed, 41 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 7a389697e2ec..fa3b0a428195 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1770,7 +1770,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) { if (!cm_req_get_primary_subnet_local(req_msg)) { if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) { - req_msg->primary_local_lid = ib_slid_be16(wc->slid); + req_msg->primary_local_lid = ib_lid_be16(wc->slid); cm_req_set_primary_sl(req_msg, wc->sl); } @@ -1780,7 +1780,7 @@ static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc) if (!cm_req_get_alt_subnet_local(req_msg)) { if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) { - req_msg->alt_local_lid = ib_slid_be16(wc->slid); + req_msg->alt_local_lid = ib_lid_be16(wc->slid); cm_req_set_alt_sl(req_msg, wc->sl); } diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index ff3c67a7aaad..c1696e6084b2 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -229,7 +229,7 @@ static void recv_handler(struct ib_mad_agent *agent, packet->mad.hdr.status = 0; packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len; packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp); - packet->mad.hdr.lid 
= ib_slid_be16(mad_recv_wc->wc->slid); + packet->mad.hdr.lid = ib_lid_be16(mad_recv_wc->wc->slid); packet->mad.hdr.sl = mad_recv_wc->wc->sl; packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits; packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index; diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 39a0f1dc84e4..a21881e22bad 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -275,12 +275,13 @@ ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file, resp.bad_pkey_cntr = attr.bad_pkey_cntr; resp.qkey_viol_cntr = attr.qkey_viol_cntr; resp.pkey_tbl_len = attr.pkey_tbl_len; + if (rdma_cap_opa_ah(ib_dev, cmd.port_num)) { - resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); + resp.lid = OPA_TO_IB_UCAST_LID(attr.lid); resp.sm_lid = OPA_TO_IB_UCAST_LID(attr.sm_lid); } else { - resp.lid = (u16)attr.lid; - resp.sm_lid = (u16)attr.sm_lid; + resp.lid = ib_lid_cpu16(attr.lid); + resp.sm_lid = ib_lid_cpu16(attr.sm_lid); } resp.lmc = attr.lmc; resp.max_vl_num = attr.max_vl_num; @@ -1206,9 +1207,9 @@ static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, tmp.wc_flags = wc->wc_flags; tmp.pkey_index = wc->pkey_index; if (rdma_cap_opa_ah(ib_dev, wc->port_num)) - tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid); + tmp.slid = OPA_TO_IB_UCAST_LID(wc->slid); else - tmp.slid = ib_slid_cpu16(wc->slid); + tmp.slid = ib_lid_cpu16(wc->slid); tmp.sl = wc->sl; tmp.dlid_path_bits = wc->dlid_path_bits; tmp.port_num = wc->port_num; diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 8daa3a5f7e95..11be4d19e607 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -4216,7 +4216,7 @@ static int opa_local_smp_check(struct hfi1_ibport *ibp, const struct ib_wc *in_wc) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - u16 slid = ib_slid_cpu16(in_wc->slid); + u16 slid = ib_lid_cpu16(in_wc->slid); u16 pkey; if (in_wc->pkey_index >= 
ARRAY_SIZE(ppd->pkeys)) diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 5a897b0106a9..0e4f60cfc59d 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -528,7 +528,7 @@ static int set_guid_rec(struct ib_device *ibdev, memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec)); - guid_info_rec.lid = cpu_to_be16((u16)attr.lid); + guid_info_rec.lid = ib_lid_be16(attr.lid); guid_info_rec.block_num = index; memcpy(guid_info_rec.guid_info_list, rec_det->all_recs, diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 04fb44e7699e..0793a21d76f4 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -169,7 +169,7 @@ int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags, op_modifier |= 0x4; - in_modifier |= ib_slid_cpu16(in_wc->slid) << 16; + in_modifier |= ib_lid_cpu16(in_wc->slid) << 16; } err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier, @@ -625,7 +625,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2); } else { tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12); - tun_mad->hdr.slid_mac_47_32 = ib_slid_be16(wc->slid); + tun_mad->hdr.slid_mac_47_32 = ib_lid_be16(wc->slid); } ib_dma_sync_single_for_device(&dev->ib_dev, @@ -826,7 +826,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, } } - slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); + slid = in_wc ? 
ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) { forward_trap(to_mdev(ibdev), port_num, in_mad); @@ -860,7 +860,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) - prev_lid = (u16)pattr.lid; + prev_lid = ib_lid_cpu16(pattr.lid); err = mlx4_MAD_IFC(to_mdev(ibdev), (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) | diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index cd2264ac88ae..18cfe5bf0fa3 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -78,7 +78,7 @@ static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, u16 slid; int err; - slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); + slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c index e19ae0b9b439..d0f062fc2a4b 100644 --- a/drivers/infiniband/hw/mthca/mthca_cmd.c +++ b/drivers/infiniband/hw/mthca/mthca_cmd.c @@ -1921,7 +1921,7 @@ int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, (in_wc->wc_flags & IB_WC_GRH ? 
0x80 : 0); MTHCA_PUT(inbox, val, MAD_IFC_G_PATH_OFFSET); - MTHCA_PUT(inbox, ib_slid_cpu16(in_wc->slid), MAD_IFC_RLID_OFFSET); + MTHCA_PUT(inbox, ib_lid_cpu16(in_wc->slid), MAD_IFC_RLID_OFFSET); MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET); if (in_grh) @@ -1929,7 +1929,7 @@ int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey, op_modifier |= 0x4; - in_modifier |= ib_slid_cpu16(in_wc->slid) << 16; + in_modifier |= ib_lid_cpu16(in_wc->slid) << 16; } err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma, diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index a9caadab22cf..093f7755c843 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -205,7 +205,7 @@ int mthca_process_mad(struct ib_device *ibdev, u16 *out_mad_pkey_index) { int err; - u16 slid = in_wc ? ib_slid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); + u16 slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE); u16 prev_lid = 0; struct ib_port_attr pattr; const struct ib_mad *in_mad = (const struct ib_mad *)in; @@ -256,7 +256,7 @@ int mthca_process_mad(struct ib_device *ibdev, in_mad->mad_hdr.method == IB_MGMT_METHOD_SET && in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && !ib_query_port(ibdev, port_num, &pattr)) - prev_lid = (u16)pattr.lid; + prev_lid = ib_lid_cpu16(pattr.lid); err = mthca_MAD_IFC(to_mdev(ibdev), mad_flags & IB_MAD_IGNORE_MKEY, diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 0335a3df74d5..97d71e49c092 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -107,7 +107,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) wc->uqueue[head].src_qp = entry->src_qp; wc->uqueue[head].wc_flags = entry->wc_flags; wc->uqueue[head].pkey_index = entry->pkey_index; - wc->uqueue[head].slid = ib_slid_cpu16(entry->slid); + wc->uqueue[head].slid = 
ib_lid_cpu16(entry->slid); wc->uqueue[head].sl = entry->sl; wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits; wc->uqueue[head].port_num = entry->port_num; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 4db4ad56ace6..70a183179224 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3724,16 +3724,30 @@ static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, return RDMA_AH_ATTR_TYPE_IB; } -/* Return slid in 16bit CPU encoding */ -static inline u16 ib_slid_cpu16(u32 slid) +/** + * ib_lid_cpu16 - Return lid in 16bit CPU encoding. + * In the current implementation the only way to get + * the 32bit lid is from other sources for OPA. + * For IB, lids will always be 16bits so cast the + * value accordingly. + * + * @lid: A 32bit LID + */ +static inline u16 ib_lid_cpu16(u32 lid) { - return (u16)slid; + WARN_ON_ONCE(lid & 0xFFFF0000); + return (u16)lid; } -/* Return slid in 16bit BE encoding */ -static inline u16 ib_slid_be16(u32 slid) +/** + * ib_lid_be16 - Return lid in 16bit BE encoding. + * + * @lid: A 32bit LID + */ +static inline __be16 ib_lid_be16(u32 lid) { - return cpu_to_be16((u16)slid); + WARN_ON_ONCE(lid & 0xFFFF0000); + return cpu_to_be16((u16)lid); } /** -- cgit v1.2.3 From 18c90df9f2c00cb35ab8ba747aa0f742ee6bbf6a Mon Sep 17 00:00:00 2001 From: Romain Perier Date: Tue, 22 Aug 2017 13:46:59 +0200 Subject: mlx5: Replace PCI pool old API The PCI pool API is deprecated. This commit replaces the PCI pool old API by the appropriate function with the DMA pool API.
Signed-off-by: Romain Perier Reviewed-by: Peter Senna Tschudin Acked-by: Doug Ledford Tested-by: Doug Ledford Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 11 ++++++----- include/linux/mlx5/driver.h | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index f5a2c605749f..7a40e8790f75 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -1095,7 +1095,7 @@ static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev, if (!mailbox) return ERR_PTR(-ENOMEM); - mailbox->buf = pci_pool_zalloc(dev->cmd.pool, flags, + mailbox->buf = dma_pool_zalloc(dev->cmd.pool, flags, &mailbox->dma); if (!mailbox->buf) { mlx5_core_dbg(dev, "failed allocation\n"); @@ -1110,7 +1110,7 @@ static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev, static void free_cmd_box(struct mlx5_core_dev *dev, struct mlx5_cmd_mailbox *mailbox) { - pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); + dma_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma); kfree(mailbox); } @@ -1759,7 +1759,8 @@ int mlx5_cmd_init(struct mlx5_core_dev *dev) return -EINVAL; } - cmd->pool = pci_pool_create("mlx5_cmd", dev->pdev, size, align, 0); + cmd->pool = dma_pool_create("mlx5_cmd", &dev->pdev->dev, size, align, + 0); if (!cmd->pool) return -ENOMEM; @@ -1849,7 +1850,7 @@ err_free_page: free_cmd_page(dev, cmd); err_free_pool: - pci_pool_destroy(cmd->pool); + dma_pool_destroy(cmd->pool); return err; } @@ -1863,6 +1864,6 @@ void mlx5_cmd_cleanup(struct mlx5_core_dev *dev) destroy_workqueue(cmd->wq); destroy_msg_cache(dev); free_cmd_page(dev, cmd); - pci_pool_destroy(cmd->pool); + dma_pool_destroy(cmd->pool); } EXPORT_SYMBOL(mlx5_cmd_cleanup); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 099fe311f272..db40bc4055c7 100644 --- 
a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -299,7 +299,7 @@ struct mlx5_cmd { struct semaphore pages_sem; int mode; struct mlx5_cmd_work_ent *ent_arr[MLX5_MAX_COMMANDS]; - struct pci_pool *pool; + struct dma_pool *pool; struct mlx5_cmd_debug dbg; struct cmd_msg_cache cache[MLX5_NUM_COMMAND_CACHES]; int checksum_disabled; -- cgit v1.2.3 From 16570d3da0938e0c46c31e5f97c9c8452025d2e7 Mon Sep 17 00:00:00 2001 From: Sebastian Sanchez Date: Fri, 4 Aug 2017 13:52:20 -0700 Subject: IB/hfi1: Remove pmtu from the QP structure The pmtu field doesn't have to be stored in the QP structure as it can easily be calculated when needed. Reviewed-by: Mike Marciniszyn Signed-off-by: Sebastian Sanchez Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 3 +-- include/rdma/rdmavt_qp.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 1878a97364aa..eb0c3d60c584 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1243,7 +1243,6 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr_mask & IB_QP_PATH_MTU) { qp->pmtu = rdi->driver_f.mtu_from_qp(rdi, qp, pmtu); - qp->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu); qp->log_pmtu = ilog2(qp->pmtu); } @@ -1366,7 +1365,7 @@ int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, attr->qp_state = qp->state; attr->cur_qp_state = attr->qp_state; - attr->path_mtu = qp->path_mtu; + attr->path_mtu = rdi->driver_f.mtu_to_path_mtu(qp->pmtu); attr->path_mig_state = qp->s_mig_state; attr->qkey = qp->qkey; attr->rq_psn = qp->r_psn & rdi->dparms.psn_mask; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 07e2fffa6de6..8fbafb0ce674 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -277,7 +277,6 @@ struct rvt_qp { unsigned long timeout_jiffies; /* computed from
timeout */ - enum ib_mtu path_mtu; int srate_mbps; /* s_srate (below) converted to Mbit/s */ pid_t pid; /* pid for user mode QPs */ u32 remote_qpn; -- cgit v1.2.3 From 13c19222889daf91da36b7fb63b5d5d9ce89b377 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Fri, 4 Aug 2017 13:53:51 -0700 Subject: IB/rdmavt, hfi1, qib: Modify check_ah() to account for extended LIDs rvt_check_ah() delegates lid verification to underlying driver. Underlying driver uses different conditions to check for dlid depending on whether the device supports extended LIDs Reviewed-by: Dennis Dalessandro Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/common.h | 9 --------- drivers/infiniband/hw/hfi1/mad.c | 5 +++-- drivers/infiniband/hw/hfi1/verbs.c | 5 +++++ drivers/infiniband/hw/qib/qib_verbs.c | 9 +++++++++ drivers/infiniband/sw/rdmavt/ah.c | 10 ---------- drivers/infiniband/sw/rdmavt/qp.c | 29 +++++++++++++++++++++++------ include/rdma/opa_addr.h | 18 ++++++++++++++++++ 7 files changed, 58 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h index ba9ab971ced9..aa416ef93c1a 100644 --- a/drivers/infiniband/hw/hfi1/common.h +++ b/drivers/infiniband/hw/hfi1/common.h @@ -333,15 +333,6 @@ struct diag_pkt { #define DEFAULT_P_KEY LIM_MGMT_P_KEY -/** - * 0xF8 - 4 bits of multicast range and 1 bit for collective range - * Example: For 24 bit LID space, - * Multicast range: 0xF00000 to 0xF7FFFF - * Collective range: 0xF80000 to 0xFFFFFE - */ -#define HFI1_MCAST_NR 0x4 /* Number of top bits set */ -#define HFI1_COLLECTIVE_NR 0x1 /* Number of bits after MCAST_NR */ - #define HFI1_PSM_IOC_BASE_SEQ 0x0 static inline __u64 rhf_to_cpu(const __le32 *rbuf) diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index cd1f6f841f34..21fadb4b510c 100644 --- a/drivers/infiniband/hw/hfi1/mad.c 
+++ b/drivers/infiniband/hw/hfi1/mad.c @@ -46,6 +46,7 @@ */ #include +#include #define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \ / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16))) @@ -905,8 +906,8 @@ static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, pi->buffer_units = cpu_to_be32(buffer_units); pi->opa_cap_mask = cpu_to_be16(ibp->rvp.port_cap3_flags); - pi->collectivemask_multicastmask = ((HFI1_COLLECTIVE_NR & 0x7) - << 3 | (HFI1_MCAST_NR & 0x7)); + pi->collectivemask_multicastmask = ((OPA_COLLECTIVE_NR & 0x7) + << 3 | (OPA_MCAST_NR & 0x7)); /* HFI supports a replay buffer 128 LTPs in size */ pi->replay_depth.buffer = 0x80; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index c88c03c11555..97ca42beb023 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "hfi.h" #include "common.h" @@ -1461,6 +1462,10 @@ static int hfi1_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) struct hfi1_devdata *dd; u8 sc5; + if (hfi1_check_mcast(rdma_ah_get_dlid(ah_attr)) && + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + /* test the mapping for validity */ ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); ppd = ppd_from_ibp(ibp); diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index ac42dce7e281..9d92aeb8d9a1 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1341,6 +1341,15 @@ int qib_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr) if (rdma_ah_get_sl(ah_attr) > 15) return -EINVAL; + if (rdma_ah_get_dlid(ah_attr) == 0) + return -EINVAL; + if (rdma_ah_get_dlid(ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE) && + rdma_ah_get_dlid(ah_attr) != + be16_to_cpu(IB_LID_PERMISSIVE) && + !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) + return -EINVAL; + return 0; } diff --git 
a/drivers/infiniband/sw/rdmavt/ah.c b/drivers/infiniband/sw/rdmavt/ah.c index a96d4aa80ae8..ba3639a0d77c 100644 --- a/drivers/infiniband/sw/rdmavt/ah.c +++ b/drivers/infiniband/sw/rdmavt/ah.c @@ -66,8 +66,6 @@ int rvt_check_ah(struct ib_device *ibdev, int port_num = rdma_ah_get_port_num(ah_attr); struct ib_port_attr port_attr; struct rvt_dev_info *rdi = ib_to_rvt(ibdev); - enum rdma_link_layer link = rdma_port_get_link_layer(ibdev, port_num); - u32 dlid = rdma_ah_get_dlid(ah_attr); u8 ah_flags = rdma_ah_get_ah_flags(ah_attr); u8 static_rate = rdma_ah_get_static_rate(ah_attr); @@ -83,14 +81,6 @@ int rvt_check_ah(struct ib_device *ibdev, if ((ah_flags & IB_AH_GRH) && rdma_ah_read_grh(ah_attr)->sgid_index >= port_attr.gid_tbl_len) return -EINVAL; - if (link != IB_LINK_LAYER_ETHERNET) { - if (dlid == 0) - return -EINVAL; - if (dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) && - dlid != be16_to_cpu(IB_LID_PERMISSIVE) && - !(ah_flags & IB_AH_GRH)) - return -EINVAL; - } if (rdi->driver_f.check_ah) return rdi->driver_f.check_ah(ibdev, ah_attr); return 0; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index eb0c3d60c584..6f6525d24a2f 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "qp.h" #include "vt.h" #include "trace.h" @@ -1066,6 +1067,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mig = 0; int pmtu = 0; /* for gcc warning only */ enum rdma_link_layer link; + int opa_ah; link = rdma_port_get_link_layer(ibqp->device, qp->port_num); @@ -1076,6 +1078,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? 
attr->qp_state : cur_state; + opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num); if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, link)) @@ -1086,17 +1089,31 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto inval; if (attr_mask & IB_QP_AV) { - if (rdma_ah_get_dlid(&attr->ah_attr) >= - be16_to_cpu(IB_MULTICAST_LID_BASE)) - goto inval; + if (opa_ah) { + if (rdma_ah_get_dlid(&attr->ah_attr) >= + opa_get_mcast_base(OPA_MCAST_NR)) + goto inval; + } else { + if (rdma_ah_get_dlid(&attr->ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) + goto inval; + } + if (rvt_check_ah(qp->ibqp.device, &attr->ah_attr)) goto inval; } if (attr_mask & IB_QP_ALT_PATH) { - if (rdma_ah_get_dlid(&attr->alt_ah_attr) >= - be16_to_cpu(IB_MULTICAST_LID_BASE)) - goto inval; + if (opa_ah) { + if (rdma_ah_get_dlid(&attr->alt_ah_attr) >= + opa_get_mcast_base(OPA_MCAST_NR)) + goto inval; + } else { + if (rdma_ah_get_dlid(&attr->alt_ah_attr) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) + goto inval; + } + if (rvt_check_ah(qp->ibqp.device, &attr->alt_ah_attr)) goto inval; if (attr->alt_pkey_index >= rvt_get_npkeys(rdi)) diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index 9b5e642cf550..8d3ad4ecbea1 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -48,10 +48,21 @@ #ifndef OPA_ADDR_H #define OPA_ADDR_H +#include + #define OPA_SPECIAL_OUI (0x00066AULL) #define OPA_MAKE_ID(x) (cpu_to_be64(OPA_SPECIAL_OUI << 40 | (x))) #define OPA_TO_IB_UCAST_LID(x) (((x) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) \ ? 0 : x) +/** + * 0xF8 - 4 bits of multicast range and 1 bit for collective range + * Example: For 24 bit LID space, + * Multicast range: 0xF00000 to 0xF7FFFF + * Collective range: 0xF80000 to 0xFFFFFE + */ +#define OPA_MCAST_NR 0x4 /* Number of top bits set */ +#define OPA_COLLECTIVE_NR 0x1 /* Number of bits after MCAST_NR */ + /** * ib_is_opa_gid: Returns true if the top 24 bits of the gid * contains the OPA_STL_OUI identifier. 
This identifies that @@ -95,4 +106,11 @@ static inline bool opa_is_extended_lid(u32 dlid, u32 slid) else return false; } + +/* Get multicast lid base */ +static inline u32 opa_get_mcast_base(u32 nr_top_bits) +{ + return (be32_to_cpu(OPA_LID_PERMISSIVE) << (32 - nr_top_bits)); +} + #endif /* OPA_ADDR_H */ -- cgit v1.2.3 From 72c07e2b671eda1cf3e8ebabc664f542f673b997 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Fri, 4 Aug 2017 13:53:58 -0700 Subject: IB/hfi1: Add support to receive 16B bypass packets We introduce a struct hfi1_16b_header to support 16B headers. 16B bypass packets are received by the driver and processed similar to 9B packets. Add basic support to handle 16B packets. Reviewed-by: Dennis Dalessandro Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 6 ++ drivers/infiniband/hw/hfi1/common.h | 1 + drivers/infiniband/hw/hfi1/driver.c | 127 ++++++++++++++++++++++++++++---- drivers/infiniband/hw/hfi1/hfi.h | 131 ++++++++++++++++++++++++++++++++- drivers/infiniband/hw/hfi1/rc.c | 2 +- drivers/infiniband/hw/hfi1/uc.c | 2 +- drivers/infiniband/hw/hfi1/ud.c | 4 +- drivers/infiniband/hw/hfi1/verbs.c | 17 +++-- drivers/infiniband/hw/hfi1/verbs.h | 13 ++++ drivers/infiniband/hw/hfi1/vnic.h | 15 ---- drivers/infiniband/hw/hfi1/vnic_main.c | 4 +- include/rdma/opa_vnic.h | 3 - 12 files changed, 274 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 1446c16bc8a8..ee1324cce25a 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -14468,6 +14468,7 @@ void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd) static void init_rxe(struct hfi1_devdata *dd) { struct rsm_map_table *rmt; + u64 val; /* enable all receive errors */ write_csr(dd, RCV_ERR_MASK, ~0ull); @@ -14492,6 +14493,11 @@ static void init_rxe(struct hfi1_devdata *dd) * (64 bytes). 
Max_Payload_Size is possibly modified upward in * tune_pcie_caps() which is called after this routine. */ + + /* Have 16 bytes (4DW) of bypass header available in header queue */ + val = read_csr(dd, RCV_BYPASS); + val |= (4ull << 16); + write_csr(dd, RCV_BYPASS, val); } static void init_other(struct hfi1_devdata *dd) diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h index aa416ef93c1a..3e27794ec750 100644 --- a/drivers/infiniband/hw/hfi1/common.h +++ b/drivers/infiniband/hw/hfi1/common.h @@ -327,6 +327,7 @@ struct diag_pkt { /* misc. */ #define SC15_PACKET 0xF #define SIZE_OF_CRC 1 +#define SIZE_OF_LT 1 #define LIM_MGMT_P_KEY 0x7FFF #define FULL_MGMT_P_KEY 0xFFFF diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 14f2a00c13c2..5280d82c344e 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -237,6 +237,13 @@ static inline struct ib_header *hfi1_get_msgheader(struct hfi1_devdata *dd, return (struct ib_header *)hfi1_get_header(dd, rhf_addr); } +static inline struct hfi1_16b_header + *hfi1_get_16B_header(struct hfi1_devdata *dd, + __le32 *rhf_addr) +{ + return (struct hfi1_16b_header *)hfi1_get_header(dd, rhf_addr); +} + /* * Validate and encode the a given RcvArray Buffer size. 
* The function will check whether the given size falls within @@ -925,6 +932,11 @@ static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd, struct ib_header *hdr = hfi1_get_msgheader(packet->rcd->dd, packet->rhf_addr); sc = hfi1_9B_get_sc5(hdr, packet->rhf); + } else if (etype == RHF_RCV_TYPE_BYPASS) { + struct hfi1_16b_header *hdr = hfi1_get_16B_header( + packet->rcd->dd, + packet->rhf_addr); + sc = hfi1_16B_get_sc(hdr); } if (sc != SC15_PACKET) { int hwstate = driver_lstate(rcd->ppd); @@ -1386,9 +1398,14 @@ static int hfi1_setup_9B_packet(struct hfi1_packet *packet) } /* Query commonly used fields from packet header */ + packet->payload = packet->ebuf; packet->opcode = ib_bth_get_opcode(packet->ohdr); packet->slid = ib_get_slid(hdr); packet->dlid = ib_get_dlid(hdr); + if (unlikely((packet->dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (packet->dlid != be16_to_cpu(IB_LID_PERMISSIVE)))) + packet->dlid += opa_get_mcast_base(OPA_MCAST_NR) - + be16_to_cpu(IB_MULTICAST_LID_BASE); packet->sl = ib_get_sl(hdr); packet->sc = hfi1_9B_get_sc5(hdr, packet->rhf); packet->pad = ib_bth_get_pad(packet->ohdr); @@ -1402,6 +1419,73 @@ drop: return -EINVAL; } +static int hfi1_setup_bypass_packet(struct hfi1_packet *packet) +{ + /* + * Bypass packets have a different header/payload split + * compared to an IB packet. + * Current split is set such that 16 bytes of the actual + * header is in the header buffer and the remaining is in + * the eager buffer. We chose 16 since hfi1 driver only + * supports 16B bypass packets and we will be able to + * receive the entire LRH with such a split.
+ */ + + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + u8 l4; + u8 grh_len; + + packet->hdr = (struct hfi1_16b_header *) + hfi1_get_16B_header(packet->rcd->dd, + packet->rhf_addr); + packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; + + l4 = hfi1_16B_get_l4(packet->hdr); + if (l4 == OPA_16B_L4_IB_LOCAL) { + grh_len = 0; + packet->ohdr = packet->ebuf; + packet->grh = NULL; + } else if (l4 == OPA_16B_L4_IB_GLOBAL) { + u32 vtf; + + grh_len = sizeof(struct ib_grh); + packet->ohdr = packet->ebuf + grh_len; + packet->grh = packet->ebuf; + if (packet->grh->next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(packet->grh->version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + } else { + goto drop; + } + + /* Query commonly used fields from packet header */ + packet->opcode = ib_bth_get_opcode(packet->ohdr); + packet->hlen = hdr_len_by_opcode[packet->opcode] + 8 + grh_len; + packet->payload = packet->ebuf + packet->hlen - (4 * sizeof(u32)); + packet->slid = hfi1_16B_get_slid(packet->hdr); + packet->dlid = hfi1_16B_get_dlid(packet->hdr); + if (unlikely(hfi1_is_16B_mcast(packet->dlid))) + packet->dlid += opa_get_mcast_base(OPA_MCAST_NR) - + opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), + 16B); + packet->sc = hfi1_16B_get_sc(packet->hdr); + packet->sl = ibp->sc_to_sl[packet->sc]; + packet->pad = hfi1_16B_bth_get_pad(packet->ohdr); + packet->extra_byte = SIZE_OF_LT; + packet->fecn = hfi1_16B_get_fecn(packet->hdr); + packet->becn = hfi1_16B_get_becn(packet->hdr); + + return 0; +drop: + hfi1_cdbg(PKT, "%s: packet dropped\n", __func__); + ibp->rvp.n_pkt_drops++; + return -EINVAL; +} + void handle_eflags(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; @@ -1464,8 +1548,8 @@ static inline bool hfi1_is_vnic_packet(struct hfi1_packet *packet) if (packet->rcd->is_vnic) return true; - if ((HFI1_GET_L2_TYPE(packet->ebuf) == 
OPA_VNIC_L2_TYPE) && - (HFI1_GET_L4_TYPE(packet->ebuf) == OPA_VNIC_L4_ETHR)) + if ((hfi1_16B_get_l2(packet->ebuf) == OPA_16B_L2_TYPE) && + (hfi1_16B_get_l4(packet->ebuf) == OPA_16B_L4_ETHR)) return true; return false; @@ -1475,25 +1559,38 @@ int process_receive_bypass(struct hfi1_packet *packet) { struct hfi1_devdata *dd = packet->rcd->dd; - if (unlikely(rhf_err_flags(packet->rhf))) { - handle_eflags(packet); - } else if (hfi1_is_vnic_packet(packet)) { + if (hfi1_is_vnic_packet(packet)) { hfi1_vnic_bypass_rcv(packet); return RHF_RCV_CONTINUE; } - dd_dev_err(dd, "Unsupported bypass packet. Dropping\n"); - incr_cntr64(&dd->sw_rcv_bypass_packet_errors); - if (!(dd->err_info_rcvport.status_and_code & OPA_EI_STATUS_SMASK)) { - u64 *flits = packet->ebuf; + if (hfi1_setup_bypass_packet(packet)) + return RHF_RCV_CONTINUE; + + if (unlikely(rhf_err_flags(packet->rhf))) { + handle_eflags(packet); + return RHF_RCV_CONTINUE; + } - if (flits && !(packet->rhf & RHF_LEN_ERR)) { - dd->err_info_rcvport.packet_flit1 = flits[0]; - dd->err_info_rcvport.packet_flit2 = - packet->tlen > sizeof(flits[0]) ? flits[1] : 0; + if (hfi1_16B_get_l2(packet->hdr) == 0x2) { + hfi1_16B_rcv(packet); + } else { + dd_dev_err(dd, + "Bypass packets other than 16B are not supported in normal operation. Dropping\n"); + incr_cntr64(&dd->sw_rcv_bypass_packet_errors); + if (!(dd->err_info_rcvport.status_and_code & + OPA_EI_STATUS_SMASK)) { + u64 *flits = packet->ebuf; + + if (flits && !(packet->rhf & RHF_LEN_ERR)) { + dd->err_info_rcvport.packet_flit1 = flits[0]; + dd->err_info_rcvport.packet_flit2 = + packet->tlen > sizeof(flits[0]) ? 
+ flits[1] : 0; + } + dd->err_info_rcvport.status_and_code |= + (OPA_EI_STATUS_SMASK | BAD_L2_ERR); } - dd->err_info_rcvport.status_and_code |= - (OPA_EI_STATUS_SMASK | BAD_L2_ERR); } return RHF_RCV_CONTINUE; } diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index fa9160f68bb7..dbbad760cad4 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -325,6 +326,7 @@ struct hfi1_ctxtdata { struct hfi1_packet { void *ebuf; void *hdr; + void *payload; struct hfi1_ctxtdata *rcd; __le32 *rhf_addr; struct rvt_qp *qp; @@ -351,6 +353,83 @@ struct hfi1_packet { bool fecn; }; +/* + * OPA 16B Header + */ +#define OPA_16B_L4_MASK 0xFFull +#define OPA_16B_SC_MASK 0x1F00000ull +#define OPA_16B_SC_SHIFT 20 +#define OPA_16B_LID_MASK 0xFFFFFull +#define OPA_16B_DLID_MASK 0xF000ull +#define OPA_16B_DLID_SHIFT 20 +#define OPA_16B_DLID_HIGH_SHIFT 12 +#define OPA_16B_SLID_MASK 0xF00ull +#define OPA_16B_SLID_SHIFT 20 +#define OPA_16B_SLID_HIGH_SHIFT 8 +#define OPA_16B_BECN_MASK 0x80000000ull +#define OPA_16B_BECN_SHIFT 31 +#define OPA_16B_FECN_MASK 0x10000000ull +#define OPA_16B_FECN_SHIFT 28 +#define OPA_16B_L2_MASK 0x60000000ull +#define OPA_16B_L2_SHIFT 29 + +/* + * OPA 16B L2/L4 Encodings + */ +#define OPA_16B_L2_TYPE 0x02 +#define OPA_16B_L4_IB_LOCAL 0x09 +#define OPA_16B_L4_IB_GLOBAL 0x0A +#define OPA_16B_L4_ETHR OPA_VNIC_L4_ETHR + +static inline u8 hfi1_16B_get_l4(struct hfi1_16b_header *hdr) +{ + return (u8)(hdr->lrh[2] & OPA_16B_L4_MASK); +} + +static inline u8 hfi1_16B_get_sc(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_SC_MASK) >> OPA_16B_SC_SHIFT); +} + +static inline u32 hfi1_16B_get_dlid(struct hfi1_16b_header *hdr) +{ + return (u32)((hdr->lrh[1] & OPA_16B_LID_MASK) | + (((hdr->lrh[2] & OPA_16B_DLID_MASK) >> + OPA_16B_DLID_HIGH_SHIFT) << OPA_16B_DLID_SHIFT)); +} + +static inline u32 hfi1_16B_get_slid(struct 
hfi1_16b_header *hdr) +{ + return (u32)((hdr->lrh[0] & OPA_16B_LID_MASK) | + (((hdr->lrh[2] & OPA_16B_SLID_MASK) >> + OPA_16B_SLID_HIGH_SHIFT) << OPA_16B_SLID_SHIFT)); +} + +static inline u8 hfi1_16B_get_becn(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[0] & OPA_16B_BECN_MASK) >> OPA_16B_BECN_SHIFT); +} + +static inline u8 hfi1_16B_get_fecn(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_FECN_MASK) >> OPA_16B_FECN_SHIFT); +} + +static inline u8 hfi1_16B_get_l2(struct hfi1_16b_header *hdr) +{ + return (u8)((hdr->lrh[1] & OPA_16B_L2_MASK) >> OPA_16B_L2_SHIFT); +} + +/* + * BTH + */ +#define OPA_16B_BTH_PAD_MASK 7 +static inline u8 hfi1_16B_bth_get_pad(struct ib_other_headers *ohdr) +{ + return (u8)((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_PAD_SHIFT) & + OPA_16B_BTH_PAD_MASK); +} + struct rvt_sge_state; /* @@ -2084,11 +2163,55 @@ int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); /* * hfi1_check_mcast- Check if the given lid is - * in the IB multicast range. + * in the OPA multicast range. + * + * The LID might either reside in ah.dlid or might be + * in the GRH of the address handle as DGID if extended + * addresses are in use. 
*/ -static inline bool hfi1_check_mcast(u16 lid) +static inline bool hfi1_check_mcast(u32 lid) +{ + return ((lid >= opa_get_mcast_base(OPA_MCAST_NR)) && + (lid != be32_to_cpu(OPA_LID_PERMISSIVE))); +} + +#define opa_get_lid(lid, format) \ + __opa_get_lid(lid, OPA_PORT_PACKET_FORMAT_##format) + +/* Convert a lid to a specific lid space */ +static inline u32 __opa_get_lid(u32 lid, u8 format) +{ + bool is_mcast = hfi1_check_mcast(lid); + + switch (format) { + case OPA_PORT_PACKET_FORMAT_8B: + case OPA_PORT_PACKET_FORMAT_10B: + if (is_mcast) + return (lid - opa_get_mcast_base(OPA_MCAST_NR) + + 0xF0000); + return lid & 0xFFFFF; + case OPA_PORT_PACKET_FORMAT_16B: + if (is_mcast) + return (lid - opa_get_mcast_base(OPA_MCAST_NR) + + 0xF00000); + return lid & 0xFFFFFF; + case OPA_PORT_PACKET_FORMAT_9B: + if (is_mcast) + return (lid - + opa_get_mcast_base(OPA_MCAST_NR) + + be16_to_cpu(IB_MULTICAST_LID_BASE)); + else + return lid & 0xFFFF; + default: + return lid; + } +} + +/* Return true if the given lid is the OPA 16B multicast range */ +static inline bool hfi1_is_16B_mcast(u32 lid) { - return ((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && - (lid != be16_to_cpu(IB_LID_PERMISSIVE))); + return ((lid >= + opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), 16B)) && + (lid != opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))); } #endif /* _HFI1_KERNEL_H */ diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index baa67bf0772b..cf74a56e20e5 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -1916,7 +1916,7 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, void hfi1_rc_rcv(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; - void *data = packet->ebuf; + void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; struct hfi1_ibport *ibp = rcd_to_iport(rcd); diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 
76c2451a53d7..366f7b9517fe 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -297,7 +297,7 @@ bail_no_tx: void hfi1_uc_rcv(struct hfi1_packet *packet) { struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); - void *data = packet->ebuf; + void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; struct ib_other_headers *ohdr = packet->ohdr; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 6bf7a1b08491..dcf8c14c6d0e 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -667,11 +667,10 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd); struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct ib_header *hdr = packet->hdr; - void *data = packet->ebuf; + void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; u8 sc5 = hfi1_9B_get_sc5(hdr, packet->rhf); - u32 bth1; u8 sl_from_sc; u8 extra_bytes = packet->pad; u8 opcode = packet->opcode; @@ -679,7 +678,6 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) u32 dlid = packet->dlid; u32 slid = packet->slid; - bth1 = be32_to_cpu(ohdr->bth[1]); qkey = ib_get_qkey(ohdr); src_qp = ib_get_sqpn(ohdr); pkey = ib_bth_get_pkey(ohdr); diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 97ca42beb023..ebddab1a06f4 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -571,7 +571,7 @@ static inline void hfi1_handle_packet(struct hfi1_packet *packet, goto drop; mcast = rvt_mcast_find(&ibp->rvp, &packet->grh->dgid, - packet->dlid); + opa_get_lid(packet->dlid, 9B)); if (!mcast) goto drop; list_for_each_entry_rcu(p, &mcast->qp_list, list) { @@ -627,14 +627,17 @@ drop: void hfi1_ib_rcv(struct hfi1_packet *packet) { struct hfi1_ctxtdata *rcd = packet->rcd; - bool is_mcast = false; - if (unlikely(hfi1_check_mcast(packet->dlid))) - is_mcast = true; + 
trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf))); + hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid)); +} + +void hfi1_16B_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; - trace_input_ibhdr(rcd->dd, packet, - !!(packet->rhf & RHF_DC_INFO_SMASK)); - hfi1_handle_packet(packet, is_mcast); + trace_input_ibhdr(rcd->dd, packet, false); + hfi1_handle_packet(packet, hfi1_check_mcast(packet->dlid)); } /* diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 34267c7ef85a..590aab270f98 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -104,6 +104,17 @@ enum { HFI1_HAS_GRH = (1 << 0), }; +struct hfi1_16b_header { + u32 lrh[4]; + union { + struct { + struct ib_grh grh; + struct ib_other_headers oth; + } l; + struct ib_other_headers oth; + } u; +} __packed; + struct hfi1_ahg_info { u32 ahgdesc[2]; u16 tx_flags; @@ -378,6 +389,8 @@ void hfi1_unregister_ib_device(struct hfi1_devdata *); void hfi1_ib_rcv(struct hfi1_packet *packet); +void hfi1_16B_rcv(struct hfi1_packet *packet); + unsigned hfi1_get_npkeys(struct hfi1_devdata *); int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, diff --git a/drivers/infiniband/hw/hfi1/vnic.h b/drivers/infiniband/hw/hfi1/vnic.h index eec7c1424991..5ae781514e32 100644 --- a/drivers/infiniband/hw/hfi1/vnic.h +++ b/drivers/infiniband/hw/hfi1/vnic.h @@ -54,21 +54,6 @@ #define HFI1_VNIC_MAX_TXQ 16 #define HFI1_VNIC_MAX_PAD 12 -/* L2 header definitions */ -#define HFI1_L2_TYPE_OFFSET 0x7 -#define HFI1_L2_TYPE_SHFT 0x5 -#define HFI1_L2_TYPE_MASK 0x3 - -#define HFI1_GET_L2_TYPE(hdr) \ - ((*((u8 *)(hdr) + HFI1_L2_TYPE_OFFSET) >> HFI1_L2_TYPE_SHFT) & \ - HFI1_L2_TYPE_MASK) - -/* L4 type definitions */ -#define HFI1_L4_TYPE_OFFSET 8 - -#define HFI1_GET_L4_TYPE(data) \ - (*((u8 *)(data) + HFI1_L4_TYPE_OFFSET)) - /* L4 header definitions */ #define HFI1_VNIC_L4_HDR_OFFSET OPA_VNIC_L2_HDR_LEN diff --git 
a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c index 2917a238a343..f419cbb05928 100644 --- a/drivers/infiniband/hw/hfi1/vnic_main.c +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -564,8 +564,8 @@ void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet) int l4_type, vesw_id = -1; u8 q_idx; - l4_type = HFI1_GET_L4_TYPE(packet->ebuf); - if (likely(l4_type == OPA_VNIC_L4_ETHR)) { + l4_type = hfi1_16B_get_l4(packet->ebuf); + if (likely(l4_type == OPA_16B_L4_ETHR)) { vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf); vinfo = idr_find(&dd->vnic.vesw_idr, vesw_id); diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h index 39d6890616a6..0c07a70bd7f6 100644 --- a/include/rdma/opa_vnic.h +++ b/include/rdma/opa_vnic.h @@ -54,9 +54,6 @@ #include -/* VNIC uses 16B header format */ -#define OPA_VNIC_L2_TYPE 0x2 - /* 16 header bytes + 2 reserved bytes */ #define OPA_VNIC_L2_HDR_LEN (16 + 2) -- cgit v1.2.3 From d98bb7f7e6fa29d45008370084d5cabac7ac69ed Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Fri, 4 Aug 2017 13:54:16 -0700 Subject: IB/hfi1: Determine 9B/16B L2 header type based on Address handle When address handle attributes are initialized, the LIDs are transformed to be in the 32 bit LID space. When constructing the header, hfi1 driver will look at the LID to determine the packet header to be created. 
Reviewed-by: Dennis Dalessandro Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/sa_query.c | 21 +++++--- drivers/infiniband/core/uverbs_cmd.c | 3 ++ drivers/infiniband/hw/hfi1/hfi.h | 92 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/qp.c | 28 +++++++++++ drivers/infiniband/hw/hfi1/verbs.c | 12 +++++ drivers/infiniband/hw/hfi1/verbs.h | 1 + include/rdma/ib_verbs.h | 15 ++++++ include/rdma/opa_addr.h | 4 +- 8 files changed, 168 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index da29e2863c84..0179b21bad34 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "sa.h" #include "core_priv.h" @@ -1239,6 +1240,11 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->type = rdma_ah_find_type(device, port_num); rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec))); + + if ((ah_attr->type == RDMA_AH_ATTR_TYPE_OPA) && + (rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE))) + rdma_ah_set_make_grd(ah_attr, true); + rdma_ah_set_sl(ah_attr, rec->sl); rdma_ah_set_path_bits(ah_attr, be32_to_cpu(sa_path_get_slid(rec)) & get_src_path_mask(device, port_num)); @@ -2288,12 +2294,15 @@ static void update_sm_ah(struct work_struct *work) rdma_ah_set_sl(&ah_attr, port_attr.sm_sl); rdma_ah_set_port_num(&ah_attr, port->port_num); if (port_attr.grh_required) { - rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); - - rdma_ah_set_subnet_prefix(&ah_attr, - cpu_to_be64(port_attr.subnet_prefix)); - rdma_ah_set_interface_id(&ah_attr, - cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + if (ah_attr.type == RDMA_AH_ATTR_TYPE_OPA) { + rdma_ah_set_make_grd(&ah_attr, true); + } else { + rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); + rdma_ah_set_subnet_prefix(&ah_attr, + 
cpu_to_be64(port_attr.subnet_prefix)); + rdma_ah_set_interface_id(&ah_attr, + cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); + } } new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 7ea5a3bb5a04..dc7d773a96ec 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2009,6 +2009,7 @@ static int modify_qp(struct ib_uverbs_file *file, rdma_ah_set_static_rate(&attr->ah_attr, cmd->base.dest.static_rate); rdma_ah_set_port_num(&attr->ah_attr, cmd->base.dest.port_num); + rdma_ah_set_make_grd(&attr->ah_attr, false); attr->alt_ah_attr.type = rdma_ah_find_type(qp->device, cmd->base.dest.port_num); @@ -2032,6 +2033,7 @@ static int modify_qp(struct ib_uverbs_file *file, cmd->base.alt_dest.static_rate); rdma_ah_set_port_num(&attr->alt_ah_attr, cmd->base.alt_dest.port_num); + rdma_ah_set_make_grd(&attr->alt_ah_attr, false); ret = ib_modify_qp_with_udata(qp, attr, modify_qp_mask(qp->qp_type, @@ -2584,6 +2586,7 @@ ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file, } attr.type = rdma_ah_find_type(ib_dev, cmd.attr.port_num); + rdma_ah_set_make_grd(&attr, false); rdma_ah_set_dlid(&attr, cmd.attr.dlid); rdma_ah_set_sl(&attr, cmd.attr.sl); rdma_ah_set_path_bits(&attr, cmd.attr.src_path_bits); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index ee19660ca2fa..cec9590870ba 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -70,6 +70,7 @@ #include #include #include +#include #include "chip_registers.h" #include "common.h" @@ -353,6 +354,10 @@ struct hfi1_packet { bool fecn; }; +/* Packet types */ +#define HFI1_PKT_TYPE_9B 0 +#define HFI1_PKT_TYPE_16B 1 + /* * OPA 16B Header */ @@ -2170,6 +2175,31 @@ int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); #define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) #define DD_DEV_ASSIGN(dd) __assign_str(dev, 
dev_name(&(dd)->pcidev->dev)) +static inline void hfi1_update_ah_attr(struct ib_device *ibdev, + struct rdma_ah_attr *attr) +{ + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u32 dlid = rdma_ah_get_dlid(attr); + + /* + * Kernel clients may not have setup GRH information + * Set that here. + */ + ibp = to_iport(ibdev, rdma_ah_get_port_num(attr)); + ppd = ppd_from_ibp(ibp); + if ((((dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) || + (ppd->lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))) && + (dlid != be32_to_cpu(OPA_LID_PERMISSIVE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)) && + (!(rdma_ah_get_ah_flags(attr) & IB_AH_GRH))) || + (rdma_ah_get_make_grd(attr))) { + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + rdma_ah_set_interface_id(attr, OPA_MAKE_ID(dlid)); + rdma_ah_set_subnet_prefix(attr, ibp->rvp.gid_prefix); + } +} + /* * hfi1_check_mcast- Check if the given lid is * in the OPA multicast range. @@ -2223,4 +2253,66 @@ static inline bool hfi1_is_16B_mcast(u32 lid) opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), 16B)) && (lid != opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))); } + +static inline void hfi1_make_opa_lid(struct rdma_ah_attr *attr) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(attr); + u32 dlid = rdma_ah_get_dlid(attr); + + /* Modify ah_attr.dlid to be in the 32 bit LID space. 
+ * This is how the address will be laid out: + * Assuming MCAST_NR to be 4, + * 32 bit permissive LID = 0xFFFFFFFF + * Multicast LID range = 0xFFFFFFFE to 0xF0000000 + * Unicast LID range = 0xEFFFFFFF to 1 + * Invalid LID = 0 + */ + if (ib_is_opa_gid(&grh->dgid)) + dlid = opa_get_lid_from_gid(&grh->dgid); + else if ((dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)) && + (dlid != be32_to_cpu(OPA_LID_PERMISSIVE))) + dlid = dlid - be16_to_cpu(IB_MULTICAST_LID_BASE) + + opa_get_mcast_base(OPA_MCAST_NR); + else if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) + dlid = be32_to_cpu(OPA_LID_PERMISSIVE); + + rdma_ah_set_dlid(attr, dlid); +} + +static inline u8 hfi1_get_packet_type(u32 lid) +{ + /* 9B if lid >= 0xF0000000 */ + if (lid >= opa_get_mcast_base(OPA_MCAST_NR)) + return HFI1_PKT_TYPE_9B; + + /* 16B if lid >= 0xC000 */ + if (lid >= opa_get_lid(opa_get_mcast_base(OPA_MCAST_NR), 9B)) + return HFI1_PKT_TYPE_16B; + + return HFI1_PKT_TYPE_9B; +} + +static inline bool hfi1_get_hdr_type(u32 lid, struct rdma_ah_attr *attr) +{ + /* + * If there was an incoming 16B packet with permissive + * LIDs, OPA GIDs would have been programmed when those + * packets were received. A 16B packet will have to + * be sent in response to that packet. Return a 16B + * header type if that's the case. + */ + if (rdma_ah_get_dlid(attr) == be32_to_cpu(OPA_LID_PERMISSIVE)) + return (ib_is_opa_gid(&rdma_ah_read_grh(attr)->dgid)) ? + HFI1_PKT_TYPE_16B : HFI1_PKT_TYPE_9B; + + /* + * Return a 16B header type if either the destination
+ */ + if (hfi1_get_packet_type(rdma_ah_get_dlid(attr)) == HFI1_PKT_TYPE_16B) + return HFI1_PKT_TYPE_16B; + + return hfi1_get_packet_type(lid); +} #endif /* _HFI1_KERNEL_H */ diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index b801d8469956..0fca6dfe8d9f 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -232,6 +232,31 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, return 0; } +/* + * qp_set_16b - Set the hdr_type based on whether the slid or the + * dlid in the connection is extended. Only applicable for RC and UC + * QPs. UD QPs determine this on the fly from the ah in the wqe + */ +static inline void qp_set_16b(struct rvt_qp *qp) +{ + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct hfi1_qp_priv *priv = qp->priv; + + /* Update ah_attr to account for extended LIDs */ + hfi1_update_ah_attr(qp->ibqp.device, &qp->remote_ah_attr); + + /* Create 32 bit LIDs */ + hfi1_make_opa_lid(&qp->remote_ah_attr); + + if (!(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) + return; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + priv->hdr_type = hfi1_get_hdr_type(ppd->lid, &qp->remote_ah_attr); +} + void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { @@ -242,6 +267,7 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + qp_set_16b(qp); } if (attr_mask & IB_QP_PATH_MIG_STATE && @@ -251,6 +277,7 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + qp_set_16b(qp); } } @@ -751,6 +778,7 @@ void hfi1_migrate_qp(struct rvt_qp *qp) 
qp->s_flags |= RVT_S_AHG_CLEAR; priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + qp_set_16b(qp); ev.device = qp->ibqp.device; ev.element.qp = &qp->ibqp; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 0b1556fed47e..18b27276f202 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1421,6 +1421,15 @@ static int query_port(struct rvt_dev_info *rdi, u8 port_num, props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : mtu_to_enum(ppd->ibmtu, IB_MTU_2048); + /* + * sm_lid of 0xFFFF needs special handling so that it can + * be differentiated from a permissive LID of 0xFFFF. + * We set the grh_required flag here so the SA can program + * the DGID in the address handle appropriately + */ + if (props->sm_lid == be16_to_cpu(IB_LID_PERMISSIVE)) + props->grh_required = true; + return 0; } @@ -1528,6 +1537,7 @@ static void hfi1_notify_new_ah(struct ib_device *ibdev, struct hfi1_pportdata *ppd; struct hfi1_devdata *dd; u8 sc5; + struct rdma_ah_attr *attr = &ah->attr; /* * Do not trust reading anything from rvt_ah at this point as it is not @@ -1537,6 +1547,8 @@ static void hfi1_notify_new_ah(struct ib_device *ibdev, ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); ppd = ppd_from_ibp(ibp); sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)]; + hfi1_update_ah_attr(ibdev, attr); + hfi1_make_opa_lid(attr); dd = dd_from_ppd(ppd); ah->vl = sc_to_vlt(dd, sc5); if (ah->vl < num_vls || ah->vl == 15) diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 68577a0c922b..d3dd0c01b8f6 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -147,6 +147,7 @@ struct hfi1_qp_priv { u8 s_sc; /* SC[0..4] for next packet */ struct iowait s_iowait; struct rvt_qp *owner; + u8 hdr_type; /* 9B or 16B */ }; /* diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
70a183179224..8f263930c56f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -864,6 +864,7 @@ struct roce_ah_attr { struct opa_ah_attr { u32 dlid; u8 src_path_bits; + bool make_grd; }; struct rdma_ah_attr { @@ -3625,6 +3626,20 @@ static inline u8 rdma_ah_get_path_bits(const struct rdma_ah_attr *attr) return 0; } +static inline void rdma_ah_set_make_grd(struct rdma_ah_attr *attr, + bool make_grd) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + attr->opa.make_grd = make_grd; +} + +static inline bool rdma_ah_get_make_grd(const struct rdma_ah_attr *attr) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + return attr->opa.make_grd; + return false; +} + static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u8 port_num) { attr->port_num = port_num; diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index 8d3ad4ecbea1..9ae126fb8648 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -71,7 +71,7 @@ * * @gid: The Global identifier */ -static inline bool ib_is_opa_gid(union ib_gid *gid) +static inline bool ib_is_opa_gid(const union ib_gid *gid) { return ((be64_to_cpu(gid->global.interface_id) >> 40) == OPA_SPECIAL_OUI); @@ -84,7 +84,7 @@ static inline bool ib_is_opa_gid(union ib_gid *gid) * * @gid: The Global identifier */ -static inline u32 opa_get_lid_from_gid(union ib_gid *gid) +static inline u32 opa_get_lid_from_gid(const union ib_gid *gid) { return be64_to_cpu(gid->global.interface_id) & 0xFFFFFFFF; } -- cgit v1.2.3 From 88733e3b845024cb2324a68469a4a25fdd9c0a6c Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Fri, 4 Aug 2017 13:54:23 -0700 Subject: IB/hfi1: Add 16B UD support Add 16B bypass packet support for UD traffic types. 
Reviewed-by: Dennis Dalessandro Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/driver.c | 35 +-- drivers/infiniband/hw/hfi1/hfi.h | 117 +++++++++- drivers/infiniband/hw/hfi1/mad.c | 8 +- drivers/infiniband/hw/hfi1/ruc.c | 4 +- drivers/infiniband/hw/hfi1/ud.c | 421 +++++++++++++++++++++++++++--------- drivers/infiniband/hw/hfi1/verbs.h | 2 +- include/rdma/opa_addr.h | 1 + 7 files changed, 457 insertions(+), 131 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index ae6a90d2a31c..fc7085d6cf3f 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -437,23 +437,33 @@ void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, bool do_cnp) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); - struct ib_header *hdr = pkt->hdr; struct ib_other_headers *ohdr = pkt->ohdr; struct ib_grh *grh = pkt->grh; u32 rqpn = 0, bth1; - u16 rlid, dlid = ib_get_dlid(hdr); - u8 sc, svc_type; + u16 pkey, rlid, dlid = ib_get_dlid(pkt->hdr); + u8 hdr_type, sc, svc_type; bool is_mcast = false; + if (pkt->etype == RHF_RCV_TYPE_BYPASS) { + is_mcast = hfi1_is_16B_mcast(dlid); + pkey = hfi1_16B_get_pkey(pkt->hdr); + sc = hfi1_16B_get_sc(pkt->hdr); + hdr_type = HFI1_PKT_TYPE_16B; + } else { + is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); + pkey = ib_bth_get_pkey(ohdr); + sc = hfi1_9B_get_sc5(pkt->hdr, pkt->rhf); + hdr_type = HFI1_PKT_TYPE_9B; + } + switch (qp->ibqp.qp_type) { case IB_QPT_SMI: case IB_QPT_GSI: case IB_QPT_UD: - rlid = ib_get_slid(hdr); - rqpn = ib_get_sqpn(ohdr); + rlid = ib_get_slid(pkt->hdr); + rqpn = ib_get_sqpn(pkt->ohdr); svc_type = IB_CC_SVCTYPE_UD; - is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && - (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); break; case 
IB_QPT_UC: rlid = rdma_ah_get_dlid(&qp->remote_ah_attr); @@ -469,14 +479,11 @@ void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt, return; } - sc = hfi1_9B_get_sc5(hdr, pkt->rhf); - bth1 = be32_to_cpu(ohdr->bth[1]); - if (do_cnp && (bth1 & IB_FECN_SMASK)) { - u16 pkey = ib_bth_get_pkey(ohdr); - - return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc, grh); - } + /* Call appropriate CNP handler */ + if (do_cnp && (bth1 & IB_FECN_SMASK)) + hfi1_handle_cnp_tbl[hdr_type](ibp, qp, rqpn, pkey, + dlid, rlid, sc, grh); if (!is_mcast && (bth1 & IB_BECN_SMASK)) { struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index cec9590870ba..7e21192da8e1 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -831,6 +831,10 @@ struct hfi1_pportdata { typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); typedef void (*opcode_handler)(struct hfi1_packet *packet); +typedef void (*hfi1_make_req)(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); + /* return values for the RHF receive functions */ #define RHF_RCV_CONTINUE 0 /* keep going */ @@ -1373,6 +1377,13 @@ void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd); void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd); extern const struct pci_device_id hfi1_pci_tbl[]; +void hfi1_make_ud_req_9B(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); + +void hfi1_make_ud_req_16B(struct rvt_qp *qp, + struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe); /* receive packet handler dispositions */ #define RCV_PKT_OK 0x0 /* keep going */ @@ -1507,6 +1518,18 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, u8 sc5, const struct ib_grh *old_grh); +void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u32 pkey, 
u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh); +typedef void (*hfi1_handle_cnp)(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh); + +/* We support only two types - 9B and 16B for now */ +static const hfi1_handle_cnp hfi1_handle_cnp_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &return_cnp, + [HFI1_PKT_TYPE_16B] = &return_cnp_16B +}; #define PKEY_CHECK_INVALID -1 int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth, u8 sc5, int8_t s_pkey_index); @@ -1747,12 +1770,22 @@ static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt, bool do_cnp) { struct ib_other_headers *ohdr = pkt->ohdr; - u32 bth1; - bth1 = be32_to_cpu(ohdr->bth[1]); - if (unlikely(bth1 & (IB_BECN_SMASK | IB_FECN_SMASK))) { + u32 bth1; + bool becn = false; + bool fecn = false; + + if (pkt->etype == RHF_RCV_TYPE_BYPASS) { + fecn = hfi1_16B_get_fecn(pkt->hdr); + becn = hfi1_16B_get_becn(pkt->hdr); + } else { + bth1 = be32_to_cpu(ohdr->bth[1]); + fecn = bth1 & IB_FECN_SMASK; + becn = bth1 & IB_BECN_SMASK; + } + if (unlikely(fecn || becn)) { hfi1_process_ecn_slowpath(qp, pkt, do_cnp); - return !!(bth1 & IB_FECN_SMASK); + return fecn; } return false; } @@ -2315,4 +2348,80 @@ static inline bool hfi1_get_hdr_type(u32 lid, struct rdma_ah_attr *attr) return hfi1_get_packet_type(lid); } + +static inline void hfi1_make_ext_grh(struct hfi1_packet *packet, + struct ib_grh *grh, u32 slid, + u32 dlid) +{ + struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (!ibp) + return; + + grh->hop_limit = 1; + grh->sgid.global.subnet_prefix = ibp->rvp.gid_prefix; + if (slid == opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B)) + grh->sgid.global.interface_id = + OPA_MAKE_ID(be32_to_cpu(OPA_LID_PERMISSIVE)); + else + grh->sgid.global.interface_id = OPA_MAKE_ID(slid); + + /* + * Upper layers (like mad) may compare the dgid in the + * wc that is 
obtained here with the sgid_index in + * the wr. Since sgid_index in wr is always 0 for + * extended lids, set the dgid here to the default + * IB gid. + */ + grh->dgid.global.subnet_prefix = ibp->rvp.gid_prefix; + grh->dgid.global.interface_id = + cpu_to_be64(ppd->guids[HFI1_PORT_GUID_INDEX]); +} + +static inline int hfi1_get_16b_padding(u32 hdr_size, u32 payload) +{ + return -(hdr_size + payload + (SIZE_OF_CRC << 2) + + SIZE_OF_LT) & 0x7; +} + +static inline void hfi1_make_ib_hdr(struct ib_header *hdr, + u16 lrh0, u16 len, + u16 dlid, u16 slid) +{ + hdr->lrh[0] = cpu_to_be16(lrh0); + hdr->lrh[1] = cpu_to_be16(dlid); + hdr->lrh[2] = cpu_to_be16(len); + hdr->lrh[3] = cpu_to_be16(slid); +} + +static inline void hfi1_make_16b_hdr(struct hfi1_16b_header *hdr, + u32 slid, u32 dlid, + u16 len, u16 pkey, + u8 becn, u8 fecn, u8 l4, + u8 sc) +{ + u32 lrh0 = 0; + u32 lrh1 = 0x40000000; + u32 lrh2 = 0; + u32 lrh3 = 0; + + lrh0 = (lrh0 & ~OPA_16B_BECN_MASK) | (becn << OPA_16B_BECN_SHIFT); + lrh0 = (lrh0 & ~OPA_16B_LEN_MASK) | (len << OPA_16B_LEN_SHIFT); + lrh0 = (lrh0 & ~OPA_16B_LID_MASK) | (slid & OPA_16B_LID_MASK); + lrh1 = (lrh1 & ~OPA_16B_FECN_MASK) | (fecn << OPA_16B_FECN_SHIFT); + lrh1 = (lrh1 & ~OPA_16B_SC_MASK) | (sc << OPA_16B_SC_SHIFT); + lrh1 = (lrh1 & ~OPA_16B_LID_MASK) | (dlid & OPA_16B_LID_MASK); + lrh2 = (lrh2 & ~OPA_16B_SLID_MASK) | + ((slid >> OPA_16B_SLID_SHIFT) << OPA_16B_SLID_HIGH_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_DLID_MASK) | + ((dlid >> OPA_16B_DLID_SHIFT) << OPA_16B_DLID_HIGH_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_PKEY_MASK) | (pkey << OPA_16B_PKEY_SHIFT); + lrh2 = (lrh2 & ~OPA_16B_L4_MASK) | l4; + + hdr->lrh[0] = lrh0; + hdr->lrh[1] = lrh1; + hdr->lrh[2] = lrh2; + hdr->lrh[3] = lrh3; +} #endif /* _HFI1_KERNEL_H */ diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 21fadb4b510c..1509bc6b76d8 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -373,12 +373,10 @@ static struct trap_node 
*create_trap_node(u8 type, __be16 trap_num, u32 lid) * Send a bad P_Key trap (ch. 14.3.8). */ void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, - u32 qp1, u32 qp2, u16 lid1, u16 lid2) + u32 qp1, u32 qp2, u32 lid1, u32 lid2) { struct trap_node *trap; u32 lid = ppd_from_ibp(ibp)->lid; - u32 _lid1 = lid1; - u32 _lid2 = lid2; ibp->rvp.n_pkt_drops++; ibp->rvp.pkey_violations++; @@ -389,8 +387,8 @@ void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, return; /* Send violation trap */ - trap->data.ntc_257_258.lid1 = cpu_to_be32(_lid1); - trap->data.ntc_257_258.lid2 = cpu_to_be32(_lid2); + trap->data.ntc_257_258.lid1 = cpu_to_be32(lid1); + trap->data.ntc_257_258.lid2 = cpu_to_be32(lid2); trap->data.ntc_257_258.key = cpu_to_be32(key); trap->data.ntc_257_258.sl = sl << 3; trap->data.ntc_257_258.qp1 = cpu_to_be32(qp1); diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index d252f8f2207a..6839bfae933c 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -649,7 +649,7 @@ done: * @ibp: a pointer to the IB port * @hdr: a pointer to the GRH header being constructed * @grh: the global route address to send to - * @hwords: the number of 32 bit words of header being sent + * @hwords: size of header after grh being sent in dwords * @nwords: the number of 32 bit words of data being sent * * Return the size of the header in 32 bit words. @@ -661,7 +661,7 @@ u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) | (grh->traffic_class << IB_GRH_TCLASS_SHIFT) | (grh->flow_label << IB_GRH_FLOW_SHIFT)); - hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2); + hdr->paylen = cpu_to_be16((hwords + nwords) << 2); /* next_hdr is defined by C8-7 in ch. 
8.4.1 */ hdr->next_hdr = IB_GRH_NEXT_HDR; hdr->hop_limit = grh->hop_limit; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index b708376b67da..2ba74fdd6f15 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -53,6 +53,12 @@ #include "verbs_txreq.h" #include "qp.h" +/* We support only two types - 9B and 16B for now */ +static const hfi1_make_req hfi1_make_ud_req_tbl[2] = { + [HFI1_PKT_TYPE_9B] = &hfi1_make_ud_req_9B, + [HFI1_PKT_TYPE_16B] = &hfi1_make_ud_req_16B +}; + /** * ud_loopback - handle send on loopback QPs * @sqp: the sending QP @@ -67,6 +73,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) { struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); struct hfi1_pportdata *ppd; + struct hfi1_qp_priv *priv = sqp->priv; struct rvt_qp *qp; struct rdma_ah_attr *ah_attr; unsigned long flags; @@ -102,7 +109,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (qp->ibqp.qp_num > 1) { u16 pkey; - u16 slid; + u32 slid; u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index); @@ -176,9 +183,33 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { struct ib_grh grh; - const struct ib_global_route *grd = rdma_ah_read_grh(ah_attr); + struct ib_global_route grd = *(rdma_ah_read_grh(ah_attr)); + + /* + * For loopback packets with extended LIDs, the + * sgid_index in the GRH is 0 and the dgid is + * OPA GID of the sender. While creating a response + * to the loopback packet, IB core creates the new + * sgid_index from the DGID and that will be the + * OPA_GID_INDEX. The new dgid is from the sgid + * index and that will be in the IB GID format. + * + * We now have a case where the sent packet had a + * different sgid_index and dgid compared to the + * one that was received in response. + * + * Fix this inconsistency. 
+ */ + if (priv->hdr_type == HFI1_PKT_TYPE_16B) { + if (grd.sgid_index == 0) + grd.sgid_index = OPA_GID_INDEX; - hfi1_make_grh(ibp, &grh, grd, 0, 0); + if (ib_is_opa_gid(&grd.dgid)) + grd.dgid.global.interface_id = + cpu_to_be64(ppd->guids[HFI1_PORT_GUID_INDEX]); + } + + hfi1_make_grh(ibp, &grh, &grd, 0, 0); hfi1_copy_sge(&qp->r_sge, &grh, sizeof(grh), true, false); wc.wc_flags |= IB_WC_GRH; @@ -235,7 +266,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) wc.pkey_index = 0; } wc.slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & - ((1 << ppd->lmc) - 1)); + ((1 << ppd->lmc) - 1)); /* Check for loopback when the port lid is not set */ if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI) wc.slid = be16_to_cpu(IB_LID_PERMISSIVE); @@ -252,6 +283,183 @@ drop: rcu_read_unlock(); } +static void hfi1_make_bth_deth(struct rvt_qp *qp, struct rvt_swqe *wqe, + struct ib_other_headers *ohdr, + u16 *pkey, u32 extra_bytes, bool bypass) +{ + u32 bth0; + struct hfi1_ibport *ibp; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else { + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + } + + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth0 |= extra_bytes << 20; + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) + *pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); + else + *pkey = hfi1_get_pkey(ibp, qp->s_pkey_index); + if (!bypass) + bth0 |= *pkey; + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn); + ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? 
+ qp->qkey : wqe->ud_wr.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); +} + +void hfi1_make_ud_req_9B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe) +{ + u32 nwords, extra_bytes; + u16 len, slid, dlid, pkey; + u16 lrh0 = 0; + u8 sc5; + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rdma_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct ib_grh *grh; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + + extra_bytes = -wqe->length & 3; + nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC; + /* header size in dwords LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + qp->s_hdrwords++; + + if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { + grh = &ps->s_txreq->phdr.hdr.ibh.u.l.grh; + qp->s_hdrwords += hfi1_make_grh(ibp, grh, + rdma_ah_read_grh(ah_attr), + qp->s_hdrwords - 2, nwords); + lrh0 = HFI1_LRH_GRH; + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; + } else { + lrh0 = HFI1_LRH_BTH; + ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; + } + + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + lrh0 |= (rdma_ah_get_sl(ah_attr) & 0xf) << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) { + lrh0 |= 0xF000; /* Set VL (see ch. 
13.5.3.1) */ + priv->s_sc = 0xf; + } else { + lrh0 |= (sc5 & 0xf) << 12; + priv->s_sc = sc5; + } + + dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 9B); + if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } else { + u16 lid = (u16)ppd->lid; + + if (lid) { + lid |= rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1); + slid = lid; + } else { + slid = be16_to_cpu(IB_LID_PERMISSIVE); + } + } + hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, false); + len = qp->s_hdrwords + nwords; + + /* Setup the packet */ + ps->s_txreq->phdr.hdr.hdr_type = HFI1_PKT_TYPE_9B; + hfi1_make_ib_hdr(&ps->s_txreq->phdr.hdr.ibh, + lrh0, len, dlid, slid); +} + +void hfi1_make_ud_req_16B(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + struct rvt_swqe *wqe) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_other_headers *ohdr; + struct rdma_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u32 dlid, slid, nwords, extra_bytes; + u16 len, pkey; + u8 l4, sc5; + + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + /* header size in dwords 16B LRH+BTH+DETH = (16+12+8)/4. */ + qp->s_hdrwords = 9; + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) + qp->s_hdrwords++; + + /* SW provides space for CRC and LT for bypass packets. */ + extra_bytes = hfi1_get_16b_padding((qp->s_hdrwords << 2), + wqe->length); + nwords = ((wqe->length + extra_bytes + SIZE_OF_LT) >> 2) + SIZE_OF_CRC; + + if ((rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) && + hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) { + struct ib_grh *grh; + struct ib_global_route *grd = rdma_ah_retrieve_grh(ah_attr); + /* + * Ensure OPA GIDs are transformed to IB gids + * before creating the GRH. + */ + if (grd->sgid_index == OPA_GID_INDEX) { + dd_dev_warn(ppd->dd, "Bad sgid_index. 
sgid_index: %d\n", + grd->sgid_index); + grd->sgid_index = 0; + } + grh = &ps->s_txreq->phdr.hdr.opah.u.l.grh; + qp->s_hdrwords += hfi1_make_grh(ibp, grh, grd, + qp->s_hdrwords - 4, nwords); + ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + } else { + ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth; + l4 = OPA_16B_L4_IB_LOCAL; + } + + sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; + if (qp->ibqp.qp_type == IB_QPT_SMI) + priv->s_sc = 0xf; + else + priv->s_sc = sc5; + + dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 16B); + if (!ppd->lid) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); + else + slid = ppd->lid | (rdma_ah_get_path_bits(ah_attr) & + ((1 << ppd->lmc) - 1)); + + hfi1_make_bth_deth(qp, wqe, ohdr, &pkey, extra_bytes, true); + /* Convert dwords to flits */ + len = (qp->s_hdrwords + nwords) >> 1; + + /* Setup the packet */ + ps->s_txreq->phdr.hdr.hdr_type = HFI1_PKT_TYPE_16B; + hfi1_make_16b_hdr(&ps->s_txreq->phdr.hdr.opah, + slid, dlid, len, pkey, 0, 0, l4, priv->s_sc); +} + /** * hfi1_make_ud_req - construct a UD request packet * @qp: the QP @@ -263,18 +471,12 @@ drop: int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) { struct hfi1_qp_priv *priv = qp->priv; - struct ib_other_headers *ohdr; struct rdma_ah_attr *ah_attr; struct hfi1_pportdata *ppd; struct hfi1_ibport *ibp; struct rvt_swqe *wqe; - u32 nwords; - u32 extra_bytes; - u32 bth0; - u16 lrh0; - u16 lid; int next_cur; - u8 sc5; + u32 lid; ps->s_txreq = get_txreq(ps->dev, qp); if (IS_ERR(ps->s_txreq)) @@ -311,13 +513,14 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ibp = to_iport(qp->ibqp.device, qp->port_num); ppd = ppd_from_ibp(ibp); ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; - if (rdma_ah_get_dlid(ah_attr) < be16_to_cpu(IB_MULTICAST_LID_BASE) || - rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE)) { + priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr); + if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) || + 
(rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) { lid = rdma_ah_get_dlid(ah_attr) & ~((1 << ppd->lmc) - 1); if (unlikely(!loopback && - (lid == ppd->lid || - (lid == be16_to_cpu(IB_LID_PERMISSIVE) && - qp->ibqp.qp_type == IB_QPT_GSI)))) { + ((lid == ppd->lid) || + ((lid == be32_to_cpu(OPA_LID_PERMISSIVE)) && + (qp->ibqp.qp_type == IB_QPT_GSI))))) { unsigned long tflags = ps->flags; /* * If DMAs are in progress, we can't generate @@ -341,11 +544,6 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } qp->s_cur = next_cur; - extra_bytes = -wqe->length & 3; - nwords = (wqe->length + extra_bytes) >> 2; - - /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ - qp->s_hdrwords = 7; ps->s_txreq->s_cur_size = wqe->length; ps->s_txreq->ss = &qp->s_sge; qp->s_srate = rdma_ah_get_static_rate(ah_attr); @@ -356,78 +554,12 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp->s_sge.num_sge = wqe->wr.num_sge; qp->s_sge.total_len = wqe->length; - if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) { - /* Header size in 32-bit words. */ - qp->s_hdrwords += - hfi1_make_grh(ibp, - &ps->s_txreq->phdr.hdr.ibh.u.l.grh, - rdma_ah_read_grh(ah_attr), - qp->s_hdrwords, nwords); - lrh0 = HFI1_LRH_GRH; - ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth; - /* - * Don't worry about sending to locally attached multicast - * QPs. It is unspecified by the spec. what happens. - */ - } else { - /* Header size in 32-bit words. */ - lrh0 = HFI1_LRH_BTH; - ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth; - } - if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { - qp->s_hdrwords++; - ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; - bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; - } else { - bth0 = IB_OPCODE_UD_SEND_ONLY << 24; - } - sc5 = ibp->sl_to_sc[rdma_ah_get_sl(ah_attr)]; - lrh0 |= (rdma_ah_get_sl(ah_attr) & 0xf) << 4; - if (qp->ibqp.qp_type == IB_QPT_SMI) { - lrh0 |= 0xF000; /* Set VL (see ch. 
13.5.3.1) */ - priv->s_sc = 0xf; - } else { - lrh0 |= (sc5 & 0xf) << 12; - priv->s_sc = sc5; - } + /* Make the appropriate header */ + hfi1_make_ud_req_tbl[priv->hdr_type](qp, ps, qp->s_wqe); priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); ps->s_txreq->sde = priv->s_sde; priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); ps->s_txreq->psc = priv->s_sendcontext; - ps->s_txreq->phdr.hdr.ibh.lrh[0] = cpu_to_be16(lrh0); - ps->s_txreq->phdr.hdr.ibh.lrh[1] = - cpu_to_be16(rdma_ah_get_dlid(ah_attr)); - ps->s_txreq->phdr.hdr.ibh.lrh[2] = - cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); - if (rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE)) { - ps->s_txreq->phdr.hdr.ibh.lrh[3] = IB_LID_PERMISSIVE; - } else { - lid = ppd->lid; - if (lid) { - lid |= rdma_ah_get_path_bits(ah_attr) & - ((1 << ppd->lmc) - 1); - ps->s_txreq->phdr.hdr.ibh.lrh[3] = cpu_to_be16(lid); - } else { - ps->s_txreq->phdr.hdr.ibh.lrh[3] = IB_LID_PERMISSIVE; - } - } - if (wqe->wr.send_flags & IB_SEND_SOLICITED) - bth0 |= IB_BTH_SOLICITED; - bth0 |= extra_bytes << 20; - if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) - bth0 |= hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); - else - bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index); - ohdr->bth[0] = cpu_to_be32(bth0); - ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn); - ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); - /* - * Qkeys with the high order bit set mean use the - * qkey from the QP context instead of the WR (see 10.2.5). - */ - ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? 
- qp->qkey : wqe->ud_wr.remote_qkey); - ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); /* disarm any ahg */ priv->s_ahg->ahgcount = 0; priv->s_ahg->ahgidx = 0; @@ -497,6 +629,64 @@ int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey) return -1; } +void return_cnp_16B(struct hfi1_ibport *ibp, struct rvt_qp *qp, + u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, + u8 sc5, const struct ib_grh *old_grh) +{ + u64 pbc, pbc_flags = 0; + u32 bth0, plen, vl, hwords = 7; + u16 len; + u8 l4; + struct hfi1_16b_header hdr; + struct ib_other_headers *ohdr; + struct pio_buf *pbuf; + struct send_context *ctxt = qp_to_send_context(qp, sc5); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 nwords; + + /* Populate length */ + nwords = ((hfi1_get_16b_padding(hwords << 2, 0) + + SIZE_OF_LT) >> 2) + SIZE_OF_CRC; + if (old_grh) { + struct ib_grh *grh = &hdr.u.l.grh; + + grh->version_tclass_flow = old_grh->version_tclass_flow; + grh->paylen = cpu_to_be16((hwords - 4 + nwords) << 2); + grh->hop_limit = 0xff; + grh->sgid = old_grh->dgid; + grh->dgid = old_grh->sgid; + ohdr = &hdr.u.l.oth; + l4 = OPA_16B_L4_IB_GLOBAL; + hwords += sizeof(struct ib_grh) / sizeof(u32); + } else { + ohdr = &hdr.u.oth; + l4 = OPA_16B_L4_IB_LOCAL; + } + + /* BIT 16 to 19 is TVER. 
Bit 20 to 22 is pad cnt */ + bth0 = (IB_OPCODE_CNP << 24) | (1 << 16) | + (hfi1_get_16b_padding(hwords << 2, 0) << 20); + ohdr->bth[0] = cpu_to_be32(bth0); + + ohdr->bth[1] = cpu_to_be32(remote_qpn); + ohdr->bth[2] = 0; /* PSN 0 */ + + /* Convert dwords to flits */ + len = (hwords + nwords) >> 1; + hfi1_make_16b_hdr(&hdr, slid, dlid, len, pkey, 1, 0, l4, sc5); + + plen = 2 /* PBC */ + hwords + nwords; + pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC; + vl = sc_to_vlt(ppd->dd, sc5); + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + if (ctxt) { + pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL); + if (pbuf) + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, + &hdr, hwords); + } +} + void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, u8 sc5, const struct ib_grh *old_grh) @@ -535,11 +725,7 @@ void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << IB_BECN_SHIFT)); ohdr->bth[2] = 0; /* PSN 0 */ - hdr.lrh[0] = cpu_to_be16(lrh0); - hdr.lrh[1] = cpu_to_be16(dlid); - hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); - hdr.lrh[3] = cpu_to_be16(slid); - + hfi1_make_ib_hdr(&hdr, lrh0, hwords + SIZE_OF_CRC, dlid, slid); plen = 2 /* PBC */ + hwords; pbc_flags |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT); vl = sc_to_vlt(ppd->dd, sc5); @@ -672,18 +858,33 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) void *data = packet->payload; u32 tlen = packet->tlen; struct rvt_qp *qp = packet->qp; - u8 sc5 = hfi1_9B_get_sc5(hdr, packet->rhf); + u8 sc5 = packet->sc; u8 sl_from_sc; - u8 extra_bytes = packet->pad; u8 opcode = packet->opcode; u8 sl = packet->sl; u32 dlid = packet->dlid; u32 slid = packet->slid; + u8 extra_bytes; + bool dlid_is_permissive; + bool slid_is_permissive; + extra_bytes = packet->pad + packet->extra_byte + (SIZE_OF_CRC << 2); qkey = ib_get_qkey(ohdr); src_qp = ib_get_sqpn(ohdr); - pkey = ib_bth_get_pkey(ohdr); - extra_bytes += (SIZE_OF_CRC 
<< 2); + + if (packet->etype == RHF_RCV_TYPE_BYPASS) { + u32 permissive_lid = + opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B); + + pkey = hfi1_16B_get_pkey(packet->hdr); + dlid_is_permissive = (dlid == permissive_lid); + slid_is_permissive = (slid == permissive_lid); + } else { + hdr = packet->hdr; + pkey = ib_bth_get_pkey(ohdr); + dlid_is_permissive = (dlid == be16_to_cpu(IB_LID_PERMISSIVE)); + slid_is_permissive = (slid == be16_to_cpu(IB_LID_PERMISSIVE)); + } sl_from_sc = ibp->sc_to_sl[sc5]; process_ecn(qp, packet, (opcode != IB_OPCODE_CNP)); @@ -701,8 +902,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). */ if (qp->ibqp.qp_num) { - if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || - hdr->lrh[3] == IB_LID_PERMISSIVE)) + if (unlikely(dlid_is_permissive || slid_is_permissive)) goto drop; if (qp->ibqp.qp_num > 1) { if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) { @@ -740,8 +940,7 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) if (tlen > 2048) goto drop; - if ((hdr->lrh[1] == IB_LID_PERMISSIVE || - hdr->lrh[3] == IB_LID_PERMISSIVE) && + if ((dlid_is_permissive || slid_is_permissive) && smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) goto drop; @@ -794,7 +993,18 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) goto drop; } if (packet->grh) { - hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh, + hfi1_copy_sge(&qp->r_sge, packet->grh, + sizeof(struct ib_grh), true, false); + wc.wc_flags |= IB_WC_GRH; + } else if (packet->etype == RHF_RCV_TYPE_BYPASS) { + struct ib_grh grh; + /* + * Assuming we only created 16B on the send side + * if we want to use large LIDs, since GRH was stripped + * out when creating 16B, add back the GRH here. 
+ */ + hfi1_make_ext_grh(packet, &grh, slid, dlid); + hfi1_copy_sge(&qp->r_sge, &grh, sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { @@ -827,14 +1037,15 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) } else { wc.pkey_index = 0; } - + if (slid_is_permissive) + slid = be32_to_cpu(OPA_LID_PERMISSIVE); wc.slid = slid; wc.sl = sl_from_sc; /* * Save the LMC lower bits if the destination LID is a unicast LID. */ - wc.dlid_path_bits = dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) ? 0 : + wc.dlid_path_bits = hfi1_check_mcast(dlid) ? 0 : dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); wc.port_num = qp->port_num; /* Signal completion event if the solicited bit is set. */ diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index d3dd0c01b8f6..4928ee4f92c1 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -259,7 +259,7 @@ static inline int hfi1_send_ok(struct rvt_qp *qp) * This must be called with s_lock held. */ void hfi1_bad_pkey(struct hfi1_ibport *ibp, u32 key, u32 sl, - u32 qp1, u32 qp2, u16 lid1, u16 lid2); + u32 qp1, u32 qp2, u32 lid1, u32 lid2); void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num); void hfi1_sys_guid_chg(struct hfi1_ibport *ibp); void hfi1_node_desc_chg(struct hfi1_ibport *ibp); diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index 9ae126fb8648..e6e90f18e6d5 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -54,6 +54,7 @@ #define OPA_MAKE_ID(x) (cpu_to_be64(OPA_SPECIAL_OUI << 40 | (x))) #define OPA_TO_IB_UCAST_LID(x) (((x) >= be16_to_cpu(IB_MULTICAST_LID_BASE)) \ ? 
0 : x) +#define OPA_GID_INDEX 0x1 /** * 0xF8 - 4 bits of multicast range and 1 bit for collective range * Example: For 24 bit LID space, -- cgit v1.2.3 From 51e658f5dd362cc8666f3f5ec1986660e3e51047 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Fri, 4 Aug 2017 13:54:35 -0700 Subject: IB/rdmavt, hfi1, qib: Enhance rdmavt and hfi1 to use 32 bit lids Increase lid used in hfi1 driver to 32 bits. qib continues to use 16 bit lids. Reviewed-by: Dennis Dalessandro Signed-off-by: Dasaratharaman Chandramouli Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 12 +++-- drivers/infiniband/hw/hfi1/hfi.h | 2 +- drivers/infiniband/hw/hfi1/mad.c | 91 ++++++++++++++++++++++++++++++++----- drivers/infiniband/hw/hfi1/verbs.c | 23 +--------- drivers/infiniband/hw/hfi1/verbs.h | 2 - drivers/infiniband/hw/qib/qib_mad.c | 4 +- include/rdma/rdma_vt.h | 2 +- 7 files changed, 93 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index ee1324cce25a..cbda37386982 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -10067,10 +10067,16 @@ static void set_lidlmc(struct hfi1_pportdata *ppd) struct hfi1_devdata *dd = ppd->dd; u32 mask = ~((1U << ppd->lmc) - 1); u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1); + u32 lid; + /* + * Program 0 in CSR if port lid is extended. This prevents + * 9B packets being sent out for large lids. + */ + lid = (ppd->lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) ? 
0 : ppd->lid; c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK); - c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK) + c1 |= ((lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK) << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) | ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK) << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT); @@ -10081,7 +10087,7 @@ static void set_lidlmc(struct hfi1_pportdata *ppd) */ sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) << SEND_CTXT_CHECK_SLID_MASK_SHIFT) | - (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) << + (((lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) << SEND_CTXT_CHECK_SLID_VALUE_SHIFT); for (i = 0; i < dd->chip_send_contexts; i++) { @@ -10091,7 +10097,7 @@ static void set_lidlmc(struct hfi1_pportdata *ppd) } /* Now we have to do the same thing for the sdma engines */ - sdma_update_lmc(dd, mask, ppd->lid); + sdma_update_lmc(dd, mask, lid); } static const char *state_completed_string(u32 completed) diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index b07f42cfa5bf..52cae1146b80 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -718,7 +718,7 @@ struct hfi1_pportdata { u32 ibmaxlen; u32 current_egress_rate; /* units [10^6 bits/sec] */ /* LID programmed for this instance */ - u16 lid; + u32 lid; /* list of pkeys programmed; 0 if not set */ u16 pkeys[MAX_PKEY_VALUES]; u16 link_width_supported; diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 1509bc6b76d8..cdcb4d021480 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -234,6 +234,61 @@ static void subn_handle_opa_trap_repress(struct hfi1_ibport *ibp, spin_unlock_irqrestore(&ibp->rvp.lock, flags); } +static void hfi1_update_sm_ah_attr(struct hfi1_ibport *ibp, + struct rdma_ah_attr *attr, u32 dlid) +{ + rdma_ah_set_dlid(attr, dlid); + rdma_ah_set_port_num(attr, ppd_from_ibp(ibp)->port); + if (dlid >= 
be16_to_cpu(IB_MULTICAST_LID_BASE)) { + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + rdma_ah_set_ah_flags(attr, IB_AH_GRH); + grh->sgid_index = 0; + grh->hop_limit = 1; + grh->dgid.global.subnet_prefix = + ibp->rvp.gid_prefix; + grh->dgid.global.interface_id = OPA_MAKE_ID(dlid); + } +} + +static int hfi1_modify_qp0_ah(struct hfi1_ibport *ibp, + struct rvt_ah *ah, u32 dlid) +{ + struct rdma_ah_attr attr; + struct rvt_qp *qp0; + int ret = -EINVAL; + + memset(&attr, 0, sizeof(attr)); + attr.type = ah->ibah.type; + hfi1_update_sm_ah_attr(ibp, &attr, dlid); + rcu_read_lock(); + qp0 = rcu_dereference(ibp->rvp.qp[0]); + if (qp0) + ret = rdma_modify_ah(&ah->ibah, &attr); + rcu_read_unlock(); + return ret; +} + +static struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u32 dlid) +{ + struct rdma_ah_attr attr; + struct ib_ah *ah = ERR_PTR(-EINVAL); + struct rvt_qp *qp0; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = dd_from_ppd(ppd); + u8 port_num = ppd->port; + + memset(&attr, 0, sizeof(attr)); + attr.type = rdma_ah_find_type(&dd->verbs_dev.rdi.ibdev, port_num); + hfi1_update_sm_ah_attr(ibp, &attr, dlid); + rcu_read_lock(); + qp0 = rcu_dereference(ibp->rvp.qp[0]); + if (qp0) + ah = rdma_create_ah(qp0->ibqp.pd, &attr); + rcu_read_unlock(); + return ah; +} + static void send_trap(struct hfi1_ibport *ibp, struct trap_node *trap) { struct ib_mad_send_buf *send_buf; @@ -1283,8 +1338,8 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, struct hfi1_ibport *ibp; u8 clientrereg; unsigned long flags; - u32 smlid, opa_lid; /* tmp vars to hold LID values */ - u16 lid; + u32 smlid; + u32 lid; u8 ls_old, ls_new, ps_new; u8 vls; u8 msl; @@ -1301,22 +1356,20 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, return reply((struct ib_mad_hdr *)smp); } - opa_lid = be32_to_cpu(pi->lid); - if (opa_lid & 0xFFFF0000) { - pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid); + lid = 
be32_to_cpu(pi->lid); + if (lid & 0xFF000000) { + pr_warn("OPA_PortInfo lid out of range: %X\n", lid); smp->status |= IB_SMP_INVALID_FIELD; goto get_only; } - lid = (u16)(opa_lid & 0x0000FFFF); smlid = be32_to_cpu(pi->sm_lid); - if (smlid & 0xFFFF0000) { + if (smlid & 0xFF000000) { pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid); smp->status |= IB_SMP_INVALID_FIELD; goto get_only; } - smlid &= 0x0000FFFF; clientrereg = (pi->clientrereg_subnettimeout & OPA_PI_MASK_CLIENT_REREGISTER); @@ -1331,12 +1384,16 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, ls_old = driver_lstate(ppd); ibp->rvp.mkey = pi->mkey; - ibp->rvp.gid_prefix = pi->subnet_prefix; + if (ibp->rvp.gid_prefix != pi->subnet_prefix) { + ibp->rvp.gid_prefix = pi->subnet_prefix; + event.event = IB_EVENT_GID_CHANGE; + ib_dispatch_event(&event); + } ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period); /* Must be a valid unicast LID address. */ if ((lid == 0 && ls_old > IB_PORT_INIT) || - lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + (hfi1_is_16B_mcast(lid))) { smp->status |= IB_SMP_INVALID_FIELD; pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n", lid); @@ -1349,6 +1406,16 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC); event.event = IB_EVENT_LID_CHANGE; ib_dispatch_event(&event); + + if (HFI1_PORT_GUID_INDEX + 1 < HFI1_GUIDS_PER_PORT) { + /* Manufacture GID from LID to support extended + * addresses + */ + ppd->guids[HFI1_PORT_GUID_INDEX + 1] = + be64_to_cpu(OPA_MAKE_ID(lid)); + event.event = IB_EVENT_GID_CHANGE; + ib_dispatch_event(&event); + } } msl = pi->smsl & OPA_PI_MASK_SMSL; @@ -1359,7 +1426,7 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, /* Must be a valid unicast LID address. 
*/ if ((smlid == 0 && ls_old > IB_PORT_INIT) || - smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + (hfi1_is_16B_mcast(smlid))) { smp->status |= IB_SMP_INVALID_FIELD; pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid); } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) { @@ -1367,7 +1434,7 @@ static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, spin_lock_irqsave(&ibp->rvp.lock, flags); if (ibp->rvp.sm_ah) { if (smlid != ibp->rvp.sm_lid) - rdma_ah_set_dlid(&ibp->rvp.sm_ah->attr, smlid); + hfi1_modify_qp0_ah(ibp, ibp->rvp.sm_ah, smlid); if (msl != ibp->rvp.sm_sl) rdma_ah_set_sl(&ibp->rvp.sm_ah->attr, msl); } diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 18b27276f202..83565e5f46d0 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1394,7 +1394,7 @@ static int query_port(struct rvt_dev_info *rdi, u8 port_num, struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); struct hfi1_devdata *dd = dd_from_dev(verbs_dev); struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; - u16 lid = ppd->lid; + u32 lid = ppd->lid; /* props being zeroed by the caller, avoid zeroing it here */ props->lid = lid ? 
lid : 0; @@ -1555,27 +1555,6 @@ static void hfi1_notify_new_ah(struct ib_device *ibdev, ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu); } -struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid) -{ - struct rdma_ah_attr attr; - struct ib_ah *ah = ERR_PTR(-EINVAL); - struct rvt_qp *qp0; - struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); - struct hfi1_devdata *dd = dd_from_ppd(ppd); - u8 port_num = ppd->port; - - memset(&attr, 0, sizeof(attr)); - attr.type = rdma_ah_find_type(&dd->verbs_dev.rdi.ibdev, port_num); - rdma_ah_set_dlid(&attr, dlid); - rdma_ah_set_port_num(&attr, ppd_from_ibp(ibp)->port); - rcu_read_lock(); - qp0 = rcu_dereference(ibp->rvp.qp[0]); - if (qp0) - ah = rdma_create_ah(qp0->ibqp.pd, &attr); - rcu_read_unlock(); - return ah; -} - /** * hfi1_get_npkeys - return the size of the PKEY table for context 0 * @dd: the hfi1_ib device diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 4928ee4f92c1..ab1618e32d9c 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -334,8 +334,6 @@ void hfi1_rc_hdrerr( u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); -struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid); - void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah); void hfi1_ud_rcv(struct hfi1_packet *packet); diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index 6b9b43b944e3..549c71971a1f 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -105,7 +105,7 @@ static void qib_send_trap(struct qib_ibport *ibp, void *data, unsigned len) if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { struct ib_ah *ah; - ah = qib_create_qp0_ah(ibp, ibp->rvp.sm_lid); + ah = qib_create_qp0_ah(ibp, (u16)ibp->rvp.sm_lid); if (IS_ERR(ah)) ret = PTR_ERR(ah); else { @@ -496,7 +496,7 @@ static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev, 
pip->mkey = ibp->rvp.mkey; pip->gid_prefix = ibp->rvp.gid_prefix; pip->lid = cpu_to_be16(ppd->lid); - pip->sm_lid = cpu_to_be16(ibp->rvp.sm_lid); + pip->sm_lid = cpu_to_be16((u16)ibp->rvp.sm_lid); pip->cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); /* pip->diag_code; */ pip->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index fdfac0fd2f82..1d94f3c264ba 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -91,7 +91,7 @@ struct rvt_ibport { __be16 pma_counter_select[5]; u16 pma_tag; u16 mkey_lease_period; - u16 sm_lid; + u32 sm_lid; u8 sm_sl; u8 mkeyprot; u8 subnet_timeout; -- cgit v1.2.3 From ec0d8b8a63ee760bca1bccc6769d6210e05ded29 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugame Date: Sun, 13 Aug 2017 08:08:46 -0700 Subject: IB/hfi1: Stricter bounds checking of MAD trap index The macro size is valid. This change makes it less ambiguous. Bounds check trap type for better security. Reviewed-by: Michael J. Ruhl Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/mad.c | 13 ++++++++++++- include/rdma/rdma_vt.h | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 37b19bfae02a..661ba707fc60 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -151,13 +151,24 @@ static struct trap_node *check_and_add_trap(struct hfi1_ibport *ibp, unsigned long flags; unsigned long timeout; int found = 0; + unsigned int queue_id; + static int trap_count; + + queue_id = trap->data.generic_type & 0x0F; + if (queue_id >= RVT_MAX_TRAP_LISTS) { + trap_count++; + pr_err_ratelimited("hfi1: Invalid trap 0x%0x dropped. 
Total dropped: %d\n", + trap->data.generic_type, trap_count); + kfree(trap); + return NULL; + } /* * Since the retry (handle timeout) does not remove a trap request * from the list, all we have to do is compare the node. */ spin_lock_irqsave(&ibp->rvp.lock, flags); - trap_list = &ibp->rvp.trap_lists[trap->data.generic_type & 0x0F]; + trap_list = &ibp->rvp.trap_lists[queue_id]; list_for_each_entry(node, &trap_list->list, list) { if (node == trap) { diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 1d94f3c264ba..1ba84a78f1c5 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -64,7 +64,7 @@ #define RVT_MAX_PKEY_VALUES 16 #define RVT_MAX_TRAP_LEN 100 /* Limit pending trap list */ -#define RVT_MAX_TRAP_LISTS ((IB_NOTICE_TYPE_INFO & 0x0F) + 1) +#define RVT_MAX_TRAP_LISTS 5 /*((IB_NOTICE_TYPE_INFO & 0x0F) + 1)*/ #define RVT_TRAP_TIMEOUT 4096 /* 4.096 usec */ struct trap_list { -- cgit v1.2.3 From e3bf14bdc17a8e917f337760cc7cacf3232d7dbc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Aug 2017 14:57:39 -0600 Subject: rdma: Autoload netlink client modules If a message comes in and we do not have the client in the table, then try to load the module supplying that client using MODULE_ALIAS to find it. This duplicates the scheme seen in other netlink muxes (eg nfnetlink). 
Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/cma.c | 2 ++ drivers/infiniband/core/device.c | 2 ++ drivers/infiniband/core/iwcm.c | 2 ++ drivers/infiniband/core/netlink.c | 9 +++++++++ drivers/infiniband/core/nldev.c | 3 +++ include/rdma/rdma_netlink.h | 12 ++++++++++++ 6 files changed, 30 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index d8edd8b11561..b76de2e2b209 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4537,5 +4537,7 @@ static void __exit cma_cleanup(void) destroy_workqueue(cma_wq); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_RDMA_CM, 1); + module_init(cma_init); module_exit(cma_cleanup); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 91d7cea1a0b9..fc6be1175183 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1252,5 +1252,7 @@ static void __exit ib_core_cleanup(void) destroy_workqueue(ib_wq); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); + module_init(ib_core_init); module_exit(ib_core_cleanup); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index e33528e102f8..fcf42f6bb82a 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -1200,5 +1200,7 @@ static void __exit iw_cm_cleanup(void) iwpm_exit(RDMA_NL_IWCM); } +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_IWCM, 2); + module_init(iw_cm_init); module_exit(iw_cm_cleanup); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index f782697cf4d8..e685148dd3e6 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -84,6 +84,15 @@ static bool is_nl_valid(unsigned int type, unsigned int op) return false; cb_table = rdma_nl_types[type].cb_table; +#ifdef CONFIG_MODULES + if (!cb_table) { + mutex_unlock(&rdma_nl_mutex); + request_module("rdma-netlink-subsys-%d", 
type); + mutex_lock(&rdma_nl_mutex); + cb_table = rdma_nl_types[type].cb_table; + } +#endif + if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) return false; return true; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 474022274e09..3ba24c428c3b 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -30,6 +30,7 @@ * POSSIBILITY OF SUCH DAMAGE. */ +#include #include #include @@ -320,3 +321,5 @@ void __exit nldev_exit(void) { rdma_nl_unregister(RDMA_NL_NLDEV); } + +MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_NLDEV, 5); diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index e25bf1988846..2d878596b1e0 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -17,6 +17,18 @@ enum rdma_nl_flags { RDMA_NL_ADMIN_PERM = 1 << 0, }; +/* Define this module as providing netlink services for NETLINK_RDMA, with + * index _index. Since the client indexes were setup in a uapi header as an + * enum and we do no want to change that, the user must supply the expanded + * constant as well and the compiler checks they are the same. + */ +#define MODULE_ALIAS_RDMA_NETLINK(_index, _val) \ + static inline void __chk_##_index(void) \ + { \ + BUILD_BUG_ON(_index != _val); \ + } \ + MODULE_ALIAS("rdma-netlink-subsys-" __stringify(_val)) + /** * Register client in RDMA netlink. * @index: Index of the added client -- cgit v1.2.3 From 55f2467cd717241dd941153d4db1e23c91aecf98 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 17 Aug 2017 15:50:35 +0300 Subject: RDMA/mlx4: Fix create qp command alignment Avoid extra padding by replacing the order of inl_recv_sz and reserved, otherwise 'mlx4_ib_create_qp' structure might be larger than legacy user input leading to copy of some garbage data from the user space buffer. 
Fixes: ea30b966f7dd ('IB/mlx4: Add inline-receive support') Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/mlx4-abi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index d915cab37ec3..21cce1a4c4dd 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -111,8 +111,8 @@ struct mlx4_ib_create_qp { __u8 log_sq_bb_count; __u8 log_sq_stride; __u8 sq_no_prefetch; - __u32 inl_recv_sz; __u8 reserved; + __u32 inl_recv_sz; }; struct mlx4_ib_create_wq { -- cgit v1.2.3 From dcc9881e6767559c04faf15804ac145a2ea026cb Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 17 Aug 2017 15:50:36 +0300 Subject: RDMA/(core, ulp): Convert register/unregister event handler to be void The functions ib_register_event_handler() and ib_unregister_event_handler() always returned success and they can't fail. Let's convert those functions to be void, remove redundant checks and cleanup tons of goto statements. 
Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/cache.c | 23 ++++++++--------------- drivers/infiniband/core/device.c | 8 ++------ drivers/infiniband/core/sa_query.c | 3 +-- drivers/infiniband/core/uverbs_main.c | 13 +------------ drivers/infiniband/ulp/ipoib/ipoib_main.c | 10 +--------- drivers/infiniband/ulp/iser/iser_verbs.c | 6 ++---- drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c | 7 +------ drivers/infiniband/ulp/srpt/ib_srpt.c | 5 ++--- include/rdma/ib_verbs.h | 4 ++-- 9 files changed, 20 insertions(+), 59 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index efc94304dee3..77515638c55c 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1199,30 +1199,23 @@ int ib_cache_setup_one(struct ib_device *device) device->cache.ports = kzalloc(sizeof(*device->cache.ports) * (rdma_end_port(device) - rdma_start_port(device) + 1), GFP_KERNEL); - if (!device->cache.ports) { - err = -ENOMEM; - goto out; - } + if (!device->cache.ports) + return -ENOMEM; err = gid_table_setup_one(device); - if (err) - goto out; + if (err) { + kfree(device->cache.ports); + device->cache.ports = NULL; + return err; + } for (p = 0; p <= rdma_end_port(device) - rdma_start_port(device); ++p) ib_cache_update(device, p + rdma_start_port(device), true); INIT_IB_EVENT_HANDLER(&device->cache.event_handler, device, ib_cache_event); - err = ib_register_event_handler(&device->cache.event_handler); - if (err) - goto err; - + ib_register_event_handler(&device->cache.event_handler); return 0; - -err: - gid_table_cleanup_one(device); -out: - return err; } void ib_cache_release_one(struct ib_device *device) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index fc6be1175183..ec4786777447 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -747,7 +747,7 @@ 
EXPORT_SYMBOL(ib_set_client_data); * chapter 11 of the InfiniBand Architecture Specification). This * callback may occur in interrupt context. */ -int ib_register_event_handler (struct ib_event_handler *event_handler) +void ib_register_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; @@ -755,8 +755,6 @@ int ib_register_event_handler (struct ib_event_handler *event_handler) list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); - - return 0; } EXPORT_SYMBOL(ib_register_event_handler); @@ -767,15 +765,13 @@ EXPORT_SYMBOL(ib_register_event_handler); * Unregister an event handler registered with * ib_register_event_handler(). */ -int ib_unregister_event_handler(struct ib_event_handler *event_handler) +void ib_unregister_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_del(&event_handler->list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); - - return 0; } EXPORT_SYMBOL(ib_unregister_event_handler); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 0179b21bad34..ab5e1024fea9 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -2417,8 +2417,7 @@ static void ib_sa_add_one(struct ib_device *device) */ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); - if (ib_register_event_handler(&sa_dev->event_handler)) - goto err; + ib_register_event_handler(&sa_dev->event_handler); for (i = 0; i <= e - s; ++i) { if (rdma_cap_ib_sa(device, i + 1)) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 5e530d2bee44..defeda33e27f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -595,7 +595,6 @@ struct file 
*ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file { struct ib_uverbs_async_event_file *ev_file; struct file *filp; - int ret; ev_file = kzalloc(sizeof(*ev_file), GFP_KERNEL); if (!ev_file) @@ -621,21 +620,11 @@ struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file INIT_IB_EVENT_HANDLER(&uverbs_file->event_handler, ib_dev, ib_uverbs_event_handler); - ret = ib_register_event_handler(&uverbs_file->event_handler); - if (ret) - goto err_put_file; - + ib_register_event_handler(&uverbs_file->event_handler); /* At that point async file stuff was fully set */ return filp; -err_put_file: - fput(filp); - kref_put(&uverbs_file->async_file->ref, - ib_uverbs_release_async_event_file); - uverbs_file->async_file = NULL; - return ERR_PTR(ret); - err_put_refs: kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file); kref_put(&ev_file->ref, ib_uverbs_release_async_event_file); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index ee9f5d281b37..344e8d3d47bd 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2227,13 +2227,7 @@ static struct net_device *ipoib_add_port(const char *format, INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); - result = ib_register_event_handler(&priv->event_handler); - if (result < 0) { - printk(KERN_WARNING "%s: ib_register_event_handler failed for " - "port %d (ret = %d)\n", - hca->name, port, result); - goto event_failed; - } + ib_register_event_handler(&priv->event_handler); result = register_netdev(priv->dev); if (result) { @@ -2266,8 +2260,6 @@ register_failed: set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); cancel_delayed_work(&priv->neigh_reap_task); flush_workqueue(priv->wq); - -event_failed: ipoib_dev_cleanup(priv->dev); device_init_failed: diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 26a004e97ae0..55a73b0ed4c6 100644 --- 
a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -106,9 +106,7 @@ static int iser_create_device_ib_res(struct iser_device *device) INIT_IB_EVENT_HANDLER(&device->event_handler, ib_dev, iser_event_handler); - if (ib_register_event_handler(&device->event_handler)) - goto cq_err; - + ib_register_event_handler(&device->event_handler); return 0; cq_err: @@ -141,7 +139,7 @@ static void iser_free_device_ib_res(struct iser_device *device) comp->cq = NULL; } - (void)ib_unregister_event_handler(&device->event_handler); + ib_unregister_event_handler(&device->event_handler); ib_dealloc_pd(device->pd); kfree(device->comps); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c index 57b862b94dca..21f0b481edcc 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -954,12 +954,7 @@ static int vema_register(struct opa_vnic_ctrl_port *cport) INIT_IB_EVENT_HANDLER(&port->event_handler, cport->ibdev, opa_vnic_event); - ret = ib_register_event_handler(&port->event_handler); - if (ret) { - c_err("port %d: event handler register failed\n", i); - vema_unregister(cport); - return ret; - } + ib_register_event_handler(&port->event_handler); idr_init(&port->vport_idr); mutex_init(&port->lock); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 402275be0931..9e8e9220f816 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -2238,7 +2238,7 @@ static int srpt_write_pending(struct se_cmd *se_cmd) cqe, first_wr); cqe = NULL; } - + ret = ib_post_send(ch->qp, first_wr, &bad_wr); if (ret) { pr_err("%s: ib_post_send() returned %d for %d (avail: %d)\n", @@ -2530,8 +2530,7 @@ static void srpt_add_one(struct ib_device *device) INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device, srpt_event_handler); - if 
(ib_register_event_handler(&sdev->event_handler)) - goto err_cm; + ib_register_event_handler(&sdev->event_handler); sdev->ioctx_ring = (struct srpt_recv_ioctx **) srpt_alloc_ioctx_ring(sdev, sdev->srq_size, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c155c105589d..e536a052e5dd 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2413,8 +2413,8 @@ int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask, enum rdma_link_layer ll); -int ib_register_event_handler (struct ib_event_handler *event_handler); -int ib_unregister_event_handler(struct ib_event_handler *event_handler); +void ib_register_event_handler(struct ib_event_handler *event_handler); +void ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(struct ib_event *event); int ib_query_port(struct ib_device *device, -- cgit v1.2.3 From 78b57f9529225111b440e6e5150f52f5d44e3c60 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 17 Aug 2017 15:50:37 +0300 Subject: RDMA/core: Cleanup device capability enum Cleanup patch prior to exporting the ib_device_cap_flags to the user space. In this patch, we are aligning the indentation, removing IB_DEVICE_INIT_TYPE and IB_DEVICE_RESERVED fields, because they are not used in the kernel. 
Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e536a052e5dd..355c7a328e0b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -170,7 +170,7 @@ enum ib_device_cap_flags { IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6), IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7), IB_DEVICE_SHUTDOWN_PORT = (1 << 8), - IB_DEVICE_INIT_TYPE = (1 << 9), + /* Not in use, former INIT_TYPE = (1 << 9),*/ IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10), IB_DEVICE_SYS_IMAGE_GUID = (1 << 11), IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12), @@ -185,7 +185,7 @@ enum ib_device_cap_flags { * which will always contain a usable lkey. */ IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15), - IB_DEVICE_RESERVED /* old SEND_W_INV */ = (1 << 16), + /* Reserved, old SEND_W_INV = (1 << 16),*/ IB_DEVICE_MEM_WINDOW = (1 << 17), /* * Devices should set IB_DEVICE_UD_IP_SUM if they support @@ -220,7 +220,7 @@ enum ib_device_cap_flags { * of I/O operations with single completion queue managed * by hardware. */ - IB_DEVICE_CROSS_CHANNEL = (1 << 27), + IB_DEVICE_CROSS_CHANNEL = (1 << 27), IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), IB_DEVICE_ON_DEMAND_PAGING = (1ULL << 31), -- cgit v1.2.3 From 078b3573030346df0cdc46d798c0f434dc53c2cc Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Thu, 17 Aug 2017 15:50:47 +0300 Subject: IB/mlx4: Fix struct mlx4_ib_create_wq alignment The mlx4 ABI defines to have structures with alignment of 64B. 
Fixes: 400b1ebcfe31 ("IB/mlx4: Add support for WQ related verbs") Signed-off-by: Guy Levi Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/qp.c | 9 ++++----- include/uapi/rdma/mlx4-abi.h | 1 - 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 0d2923c2225f..c3958fcfed75 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1046,9 +1046,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, } if (src == MLX4_IB_RWQ_SRC) { - if (ucmd.wq.comp_mask || ucmd.wq.reserved1 || - ucmd.wq.reserved[0] || ucmd.wq.reserved[1] || - ucmd.wq.reserved[2]) { + if (ucmd.wq.comp_mask || ucmd.wq.reserved[0] || + ucmd.wq.reserved[1] || ucmd.wq.reserved[2]) { pr_debug("user command isn't supported\n"); err = -EOPNOTSUPP; goto err; @@ -4146,8 +4145,8 @@ struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd, if (!(udata && pd->uobject)) return ERR_PTR(-EINVAL); - required_cmd_sz = offsetof(typeof(ucmd), reserved) + - sizeof(ucmd.reserved); + required_cmd_sz = offsetof(typeof(ucmd), comp_mask) + + sizeof(ucmd.comp_mask); if (udata->inlen < required_cmd_sz) { pr_debug("invalid inlen\n"); return ERR_PTR(-EINVAL); diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index 21cce1a4c4dd..0e10102861b5 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -121,7 +121,6 @@ struct mlx4_ib_create_wq { __u8 log_range_size; __u8 reserved[3]; __u32 comp_mask; - __u32 reserved1; }; struct mlx4_ib_modify_wq { -- cgit v1.2.3 From b23673f86fd0d9ccbc088e88e29899b4d3f0f055 Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Thu, 17 Aug 2017 15:50:48 +0300 Subject: IB/mlx4: Remove redundant attribute in mlx4_ib_create_qp_rss struct rx_key_len is not in use and needs to be removed. 
Fixes: 3078f5f1bd8b ("IB/mlx4: Add support for RSS QP") Signed-off-by: Guy Levi Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/mlx4-abi.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include') diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index 0e10102861b5..c55f60e05f86 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -98,8 +98,7 @@ struct mlx4_ib_create_srq_resp { struct mlx4_ib_create_qp_rss { __u64 rx_hash_fields_mask; __u8 rx_hash_function; - __u8 rx_key_len; - __u8 reserved[6]; + __u8 reserved[7]; __u8 rx_hash_key[40]; __u32 comp_mask; __u32 reserved1; -- cgit v1.2.3 From 96dc3fc5f1d66b20cdf839d571c7b907e08d5d00 Mon Sep 17 00:00:00 2001 From: Noa Osherovich Date: Thu, 17 Aug 2017 15:52:28 +0300 Subject: IB/mlx5: Expose software parsing for Raw Ethernet QP Software parsing (SWP) is a feature that can be used to instruct the device to stop using its internal parser and to parse packets on the transmit path according to offsets set for each packet. Through this feature, the device allows the handling of checksum and LSO by the hardware according to the location of IP and TCP/UDP headers. Enable SW parsing on Raw Ethernet send queue by default if firmware supports it and report these capabilities to user space. 
Signed-off-by: Noa Osherovich Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 21 +++++++++++++++++++++ drivers/infiniband/hw/mlx5/qp.c | 3 +++ include/uapi/rdma/mlx5-abi.h | 17 +++++++++++++++++ 3 files changed, 41 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 762ef6bf219e..07789450ec42 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -811,6 +811,27 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (field_avail(typeof(resp), reserved, uhw->outlen)) resp.response_length += sizeof(resp.reserved); + if (field_avail(typeof(resp), sw_parsing_caps, + uhw->outlen)) { + resp.response_length += sizeof(resp.sw_parsing_caps); + if (MLX5_CAP_ETH(mdev, swp)) { + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING; + + if (MLX5_CAP_ETH(mdev, swp_csum)) + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING_CSUM; + + if (MLX5_CAP_ETH(mdev, swp_lso)) + resp.sw_parsing_caps.sw_parsing_offloads |= + MLX5_IB_SW_PARSING_LSO; + + if (resp.sw_parsing_caps.sw_parsing_offloads) + resp.sw_parsing_caps.supported_qpts = + BIT(IB_QPT_RAW_PACKET); + } + } + if (uhw->outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index bc49d14e0a00..656773196f27 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1088,6 +1088,9 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd)); MLX5_SET(sqc, sqc, tis_lst_sz, 1); MLX5_SET(sqc, sqc, tis_num_0, sq->tisn); + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, swp)) + MLX5_SET(sqc, sqc, allow_swp, 1); wq = MLX5_ADDR_OF(sqc, sqc, wq); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); diff --git 
a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 0b3d30837a9f..64d398e662cd 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -168,6 +168,22 @@ struct mlx5_packet_pacing_caps { __u32 reserved; }; +enum mlx5_ib_sw_parsing_offloads { + MLX5_IB_SW_PARSING = 1 << 0, + MLX5_IB_SW_PARSING_CSUM = 1 << 1, + MLX5_IB_SW_PARSING_LSO = 1 << 2, +}; + +struct mlx5_ib_sw_parsing_caps { + __u32 sw_parsing_offloads; /* enum mlx5_ib_sw_parsing_offloads */ + + /* Corresponding bit will be set if qp type from + * 'enum ib_qp_type' is supported, e.g. + * supported_qpts |= 1 << IB_QPT_RAW_PACKET + */ + __u32 supported_qpts; +}; + struct mlx5_ib_query_device_resp { __u32 comp_mask; __u32 response_length; @@ -177,6 +193,7 @@ struct mlx5_ib_query_device_resp { struct mlx5_packet_pacing_caps packet_pacing_caps; __u32 mlx5_ib_support_multi_pkt_send_wqes; __u32 reserved; + struct mlx5_ib_sw_parsing_caps sw_parsing_caps; }; struct mlx5_ib_create_cq { -- cgit v1.2.3 From 8b7ff7f3b301de52924cb2cf3fed47b181893116 Mon Sep 17 00:00:00 2001 From: Ilya Lesokhin Date: Thu, 17 Aug 2017 15:52:29 +0300 Subject: IB/mlx5: Enable UMR for MRs created with reg_create This patch is the first step in decoupling UMR usage and allocation from the MR cache. The only functional change in this patch is to enable UMR for MRs created with reg_create. This change fixes a bug where ODP memory regions that were not allocated from the MR cache did not have UMR enabled. 
Signed-off-by: Ilya Lesokhin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/mr.c | 33 ++++++++++++++------------------- include/linux/mlx5/driver.h | 2 +- 3 files changed, 16 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 7ac991070020..b3380e8beacf 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -503,7 +503,7 @@ struct mlx5_ib_mr { struct mlx5_shared_mr_info *smr_info; struct list_head list; int order; - int umred; + bool allocated_from_cache; int npages; struct mlx5_ib_dev *dev; u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index a0eb2f96179a..bc87016021e3 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -48,8 +48,7 @@ enum { #define MLX5_UMR_ALIGN 2048 static int clean_mr(struct mlx5_ib_mr *mr); -static int max_umr_order(struct mlx5_ib_dev *dev); -static int use_umr(struct mlx5_ib_dev *dev, int order); +static int mr_cache_max_order(struct mlx5_ib_dev *dev); static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) @@ -184,7 +183,7 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) break; } mr->order = ent->order; - mr->umred = 1; + mr->allocated_from_cache = 1; mr->dev = dev; MLX5_SET(mkc, mkc, free, 1); @@ -497,7 +496,7 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) int i; c = order2idx(dev, order); - last_umr_cache_entry = order2idx(dev, max_umr_order(dev)); + last_umr_cache_entry = order2idx(dev, mr_cache_max_order(dev)); if (c < 0 || c > last_umr_cache_entry) { mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); return NULL; @@ -677,12 +676,12 @@ int mlx5_mr_cache_init(struct 
mlx5_ib_dev *dev) INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); queue_work(cache->wq, &ent->work); - if (i > MAX_UMR_CACHE_ENTRY) { + if (i > MR_CACHE_LAST_STD_ENTRY) { mlx5_odp_init_mr_cache_entry(ent); continue; } - if (!use_umr(dev, ent->order)) + if (ent->order > mr_cache_max_order(dev)) continue; ent->page = PAGE_SHIFT; @@ -819,18 +818,13 @@ static int get_octo_len(u64 addr, u64 len, int page_size) return (npages + 1) / 2; } -static int max_umr_order(struct mlx5_ib_dev *dev) +static int mr_cache_max_order(struct mlx5_ib_dev *dev) { if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) - return MAX_UMR_CACHE_ENTRY + 2; + return MR_CACHE_LAST_STD_ENTRY + 2; return MLX5_MAX_UMR_SHIFT; } -static int use_umr(struct mlx5_ib_dev *dev, int order) -{ - return order <= max_umr_order(dev); -} - static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length, int access_flags, struct ib_umem **umem, int *npages, int *page_shift, int *ncont, @@ -1149,6 +1143,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ)); MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE)); MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET64(mkc, mkc, start_addr, virt_addr); MLX5_SET64(mkc, mkc, len, length); @@ -1231,7 +1226,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (err < 0) return ERR_PTR(err); - if (use_umr(dev, order)) { + if (order <= mr_cache_max_order(dev)) { mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, order, access_flags); if (PTR_ERR(mr) == -EAGAIN) { @@ -1355,7 +1350,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, /* * UMR can't be used - MKey needs to be replaced. 
*/ - if (mr->umred) { + if (mr->allocated_from_cache) { err = unreg_umr(dev, mr); if (err) mlx5_ib_warn(dev, "Failed to unregister MR\n"); @@ -1373,7 +1368,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, if (IS_ERR(mr)) return PTR_ERR(mr); - mr->umred = 0; + mr->allocated_from_cache = 0; } else { /* * Send a UMR WQE @@ -1461,7 +1456,7 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) static int clean_mr(struct mlx5_ib_mr *mr) { struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); - int umred = mr->umred; + int allocated_from_cache = mr->allocated_from_cache; int err; if (mr->sig) { @@ -1479,7 +1474,7 @@ static int clean_mr(struct mlx5_ib_mr *mr) mlx5_free_priv_descs(mr); - if (!umred) { + if (!allocated_from_cache) { err = destroy_mkey(dev, mr); if (err) { mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n", @@ -1490,7 +1485,7 @@ static int clean_mr(struct mlx5_ib_mr *mr) mlx5_mr_cache_free(dev, mr); } - if (!umred) + if (!allocated_from_cache) kfree(mr); return 0; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index db40bc4055c7..99d88624ad07 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1093,7 +1093,7 @@ enum { }; enum { - MAX_UMR_CACHE_ENTRY = 20, + MR_CACHE_LAST_STD_ENTRY = 20, MLX5_IMR_MTT_CACHE_ENTRY, MLX5_IMR_KSM_CACHE_ENTRY, MAX_MR_CACHE_ENTRIES -- cgit v1.2.3 From a550ddfc543e250798048cf4eabe721cd85ac724 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 17 Aug 2017 15:52:33 +0300 Subject: IB/mlx5: Add support for multi underlay QP Set underlay QPN as part of flow rule when it's applicable. There is one root flow table in the NIC RX namespace and all the underlay QPs steer the traffic to this flow table. In order to prevent a QP from getting traffic which is not targeted to its underlay QP, we need to set the underlay QP number as part of the steering matching. Note: When multicast traffic is sent the QPN filtering is done by the firmware as an early step. 
Adding the QPN match on the flow table entry is wrong as by that time the target QPN holds the multicast address (e.g. FF(s)) and it won't match. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 49 +++++++++++++++++++++++++++++++++------ include/linux/mlx5/mlx5_ifc.h | 8 +++++-- 2 files changed, 48 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 07789450ec42..8f7e46090f9a 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2125,7 +2125,7 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, * it won't fall into the multicast flow steering table and this rule * could steal other multicast packets. */ -static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr) +static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr) { union ib_flow_spec *flow_spec; @@ -2337,10 +2337,31 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, return err ? 
ERR_PTR(err) : prio; } -static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_prio *ft_prio, - const struct ib_flow_attr *flow_attr, - struct mlx5_flow_destination *dst) +static void set_underlay_qp(struct mlx5_ib_dev *dev, + struct mlx5_flow_spec *spec, + u32 underlay_qpn) +{ + void *misc_params_c = MLX5_ADDR_OF(fte_match_param, + spec->match_criteria, + misc_parameters); + void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, + misc_parameters); + + if (underlay_qpn && + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + ft_field_support.bth_dst_qp)) { + MLX5_SET(fte_match_set_misc, + misc_params_v, bth_dst_qp, underlay_qpn); + MLX5_SET(fte_match_set_misc, + misc_params_c, bth_dst_qp, 0xffffff); + } +} + +static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst, + u32 underlay_qpn) { struct mlx5_flow_table *ft = ft_prio->flow_table; struct mlx5_ib_flow_handler *handler; @@ -2376,6 +2397,9 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, ib_flow += ((union ib_flow_spec *)ib_flow)->size; } + if (!flow_is_multicast_only(flow_attr)) + set_underlay_qp(dev, spec, underlay_qpn); + spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); if (is_drop) { flow_act.action = MLX5_FLOW_CONTEXT_ACTION_DROP; @@ -2415,6 +2439,14 @@ free: return err ? 
ERR_PTR(err) : handler; } +static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + const struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0); +} + static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct ib_flow_attr *flow_attr, @@ -2551,6 +2583,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, struct mlx5_ib_flow_prio *ft_prio_tx = NULL; struct mlx5_ib_flow_prio *ft_prio; int err; + int underlay_qpn; if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) return ERR_PTR(-ENOMEM); @@ -2591,8 +2624,10 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, handler = create_dont_trap_rule(dev, ft_prio, flow_attr, dst); } else { - handler = create_flow_rule(dev, ft_prio, flow_attr, - dst); + underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ? + mqp->underlay_qpn : 0; + handler = _create_flow_rule(dev, ft_prio, flow_attr, + dst, underlay_qpn); } } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5bae70eb25af..6563500c85de 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -295,8 +295,10 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 inner_tcp_dport[0x1]; u8 inner_tcp_flags[0x1]; u8 reserved_at_37[0x9]; + u8 reserved_at_40[0x1a]; + u8 bth_dst_qp[0x1]; - u8 reserved_at_40[0x40]; + u8 reserved_at_5b[0x25]; }; struct mlx5_ifc_flow_table_prop_layout_bits { @@ -432,7 +434,9 @@ struct mlx5_ifc_fte_match_set_misc_bits { u8 reserved_at_100[0xc]; u8 inner_ipv6_flow_label[0x14]; - u8 reserved_at_120[0xe0]; + u8 reserved_at_120[0x28]; + u8 bth_dst_qp[0x18]; + u8 reserved_at_160[0xa0]; }; struct mlx5_ifc_cmd_pas_bits { -- cgit v1.2.3 From 795b609c8b59f8f20fa9d72bf8b4ae3b8aa5582c Mon Sep 
17 00:00:00 2001 From: Bodong Wang Date: Thu, 17 Aug 2017 15:52:34 +0300 Subject: IB/mlx5: Allow posting multi packet send WQEs if hardware supports Set the field to allow posting multi packet send WQEs if hardware supports this feature. This doesn't mean the send WQEs will be for multi packet unless the send WQE was prepared according to multi packet send WQE format. User space shall use flag MLX5_IB_ALLOW_MPW to check if hardware supports MPW and allows MPW in SQ context. Signed-off-by: Bodong Wang Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 5 +++-- drivers/infiniband/hw/mlx5/qp.c | 2 ++ include/linux/mlx5/mlx5_ifc.h | 2 +- include/uapi/rdma/mlx5-abi.h | 5 +++++ 4 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8f7e46090f9a..ba0a97d6f677 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -802,8 +802,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, uhw->outlen)) { - resp.mlx5_ib_support_multi_pkt_send_wqes = - MLX5_CAP_ETH(mdev, multi_pkt_send_wqe); + if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) + resp.mlx5_ib_support_multi_pkt_send_wqes = + MLX5_IB_ALLOW_MPW; resp.response_length += sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); } diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 656773196f27..d6df88a78d5e 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1083,6 +1083,8 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); MLX5_SET(sqc, sqc, flush_in_error_en, 1); + if (MLX5_CAP_ETH(dev->mdev, multi_pkt_send_wqe)) + MLX5_SET(sqc, sqc, allow_multi_pkt_send_wqe, 1); MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); MLX5_SET(sqc, sqc, 
user_index, MLX5_GET(qpc, qpc, user_index)); MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd)); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6563500c85de..6865e60ba473 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2445,7 +2445,7 @@ struct mlx5_ifc_sqc_bits { u8 cd_master[0x1]; u8 fre[0x1]; u8 flush_in_error_en[0x1]; - u8 reserved_at_4[0x1]; + u8 allow_multi_pkt_send_wqe[0x1]; u8 min_wqe_inline_mode[0x3]; u8 state[0x4]; u8 reg_umr[0x1]; diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 64d398e662cd..a61a512e5747 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -168,6 +168,11 @@ struct mlx5_packet_pacing_caps { __u32 reserved; }; +enum mlx5_ib_mpw_caps { + MPW_RESERVED = 1 << 0, + MLX5_IB_ALLOW_MPW = 1 << 1, +}; + enum mlx5_ib_sw_parsing_offloads { MLX5_IB_SW_PARSING = 1 << 0, MLX5_IB_SW_PARSING_CSUM = 1 << 1, -- cgit v1.2.3 From 050da902adde8faf6b1bef15ac4876ae145358f4 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Thu, 17 Aug 2017 15:52:35 +0300 Subject: IB/mlx5: Report mlx5 enhanced multi packet WQE capability Expose enhanced multi packet WQE capability to user space through query_device by uhw. 
Signed-off-by: Bodong Wang Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 5 +++++ include/linux/mlx5/mlx5_ifc.h | 2 +- include/uapi/rdma/mlx5-abi.h | 1 + 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index ba0a97d6f677..62e6298810e7 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -805,6 +805,11 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) resp.mlx5_ib_support_multi_pkt_send_wqes = MLX5_IB_ALLOW_MPW; + + if (MLX5_CAP_ETH(mdev, enhanced_multi_pkt_send_wqe)) + resp.mlx5_ib_support_multi_pkt_send_wqes |= + MLX5_IB_SUPPORT_EMPW; + resp.response_length += sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); } diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 6865e60ba473..4eff0b8a1482 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -604,7 +604,7 @@ struct mlx5_ifc_per_protocol_networking_offload_caps_bits { u8 rss_ind_tbl_cap[0x4]; u8 reg_umr_sq[0x1]; u8 scatter_fcs[0x1]; - u8 reserved_at_1a[0x1]; + u8 enhanced_multi_pkt_send_wqe[0x1]; u8 tunnel_lso_const_out_ip_id[0x1]; u8 reserved_at_1c[0x2]; u8 tunnel_statless_gre[0x1]; diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index a61a512e5747..1791bf123ba9 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -171,6 +171,7 @@ struct mlx5_packet_pacing_caps { enum mlx5_ib_mpw_caps { MPW_RESERVED = 1 << 0, MLX5_IB_ALLOW_MPW = 1 << 1, + MLX5_IB_SUPPORT_EMPW = 1 << 2, }; enum mlx5_ib_sw_parsing_offloads { -- cgit v1.2.3 From 4734b4f417126e8773b3983122ca935d02af80de Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 28 Aug 2017 11:23:45 -0700 Subject: IB/rdmavt: Add QP iterator API for QPs There are currently 3 spots in the qib 
and hfi1 driver that have knowledge of the internal QP hash list that should only be in scope to rdmavt QP code. Add an iterator API for processing all QPs to hide the nature of the RCU hashlist. The API consists of: - rvt_qp_iter_init() * For iterating QPs one at a time for seq_file semantics - rvt_qp_iter_next() * For iterating QPs one at a time for seq_file semantics - rvt_qp_iter() * For iterating all QPs The first two are used for things like seq_file prints. The last is for code that just needs to iterate all QPs in the system. Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 144 ++++++++++++++++++++++++++++++++++++++ include/rdma/rdmavt_qp.h | 29 ++++++++ 2 files changed, 173 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 3a238b00885e..9f70fd8665ab 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2069,3 +2069,147 @@ enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t) return HRTIMER_NORESTART; } EXPORT_SYMBOL(rvt_rc_rnr_retry); + +/** + * rvt_qp_iter_init - initial for QP iteration + * @rdi - rvt devinfo + * @v - u64 value + * + * This returns an iterator suitable for iterating QPs + * in the system. + * + * The @cb is a user defined callback and @v is a 64 + * bit value passed to and relevant for processing in the + * @cb. An example use case would be to alter QP processing + * based on criteria not part of the rvt_qp. + * + * Use cases that require memory allocation to succeed + * must preallocate appropriately. 
+ * + * Return: a pointer to an rvt_qp_iter or NULL + */ +struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)) +{ + struct rvt_qp_iter *i; + + i = kzalloc(sizeof(*i), GFP_KERNEL); + if (!i) + return NULL; + + i->rdi = rdi; + /* number of special QPs (SMI/GSI) for device */ + i->specials = rdi->ibdev.phys_port_cnt * 2; + i->v = v; + i->cb = cb; + + return i; +} +EXPORT_SYMBOL(rvt_qp_iter_init); + +/** + * rvt_qp_iter_next - return the next QP in iter + * @iter - the iterator + * + * Fine grained QP iterator suitable for use + * with debugfs seq_file mechanisms. + * + * Updates iter->qp with the current QP when the return + * value is 0. + * + * Return: 0 - iter->qp is valid 1 - no more QPs + */ +int rvt_qp_iter_next(struct rvt_qp_iter *iter) + __must_hold(RCU) +{ + int n = iter->n; + int ret = 1; + struct rvt_qp *pqp = iter->qp; + struct rvt_qp *qp; + struct rvt_dev_info *rdi = iter->rdi; + + /* + * The approach is to consider the special qps + * as additional table entries before the + * real hash table. Since the qp code sets + * the qp->next hash link to NULL, this works just fine. 
+ * + * iter->specials is 2 * # ports + * + * n = 0..iter->specials is the special qp indices + * + * n = iter->specials..rdi->qp_dev->qp_table_size+iter->specials are + * the potential hash bucket entries + * + */ + for (; n < rdi->qp_dev->qp_table_size + iter->specials; n++) { + if (pqp) { + qp = rcu_dereference(pqp->next); + } else { + if (n < iter->specials) { + struct rvt_ibport *rvp; + int pidx; + + pidx = n % rdi->ibdev.phys_port_cnt; + rvp = rdi->ports[pidx]; + qp = rcu_dereference(rvp->qp[n & 1]); + } else { + qp = rcu_dereference( + rdi->qp_dev->qp_table[ + (n - iter->specials)]); + } + } + pqp = qp; + if (qp) { + iter->qp = qp; + iter->n = n; + return 0; + } + } + return ret; +} +EXPORT_SYMBOL(rvt_qp_iter_next); + +/** + * rvt_qp_iter - iterate all QPs + * @rdi - rvt devinfo + * @v - a 64 bit value + * @cb - a callback + * + * This provides a way for iterating all QPs. + * + * The @cb is a user defined callback and @v is a 64 + * bit value passed to and relevant for processing in the + * cb. An example use case would be to alter QP processing + * based on criteria not part of the rvt_qp. + * + * The code has an internal iterator to simplify + * non seq_file use cases. 
+ */ +void rvt_qp_iter(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)) +{ + int ret; + struct rvt_qp_iter i = { + .rdi = rdi, + .specials = rdi->ibdev.phys_port_cnt * 2, + .v = v, + .cb = cb + }; + + rcu_read_lock(); + do { + ret = rvt_qp_iter_next(&i); + if (!ret) { + rvt_get_qp(i.qp); + rcu_read_unlock(); + i.cb(i.qp, i.v); + rcu_read_lock(); + rvt_put_qp(i.qp); + } + } while (!ret); + rcu_read_unlock(); +} +EXPORT_SYMBOL(rvt_qp_iter); diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 8fbafb0ce674..dfeb311c30a1 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -673,4 +673,33 @@ void rvt_del_timers_sync(struct rvt_qp *qp); void rvt_stop_rc_timers(struct rvt_qp *qp); void rvt_add_retry_timer(struct rvt_qp *qp); +/** + * struct rvt_qp_iter - the iterator for QPs + * @qp - the current QP + * + * This structure defines the current iterator + * state for sequenced access to all QPs relative + * to an rvt_dev_info. + */ +struct rvt_qp_iter { + struct rvt_qp *qp; + /* private: backpointer */ + struct rvt_dev_info *rdi; + /* private: callback routine */ + void (*cb)(struct rvt_qp *qp, u64 v); + /* private: for arg to callback routine */ + u64 v; + /* private: number of SMI,GSI QPs for device */ + int specials; + /* private: current iterator index */ + int n; +}; + +struct rvt_qp_iter *rvt_qp_iter_init(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)); +int rvt_qp_iter_next(struct rvt_qp_iter *iter); +void rvt_qp_iter(struct rvt_dev_info *rdi, + u64 v, + void (*cb)(struct rvt_qp *qp, u64 v)); #endif /* DEF_RDMAVT_INCQP_H */ -- cgit v1.2.3 From 0208da90def5776cef940f9de4ffe6ecef346207 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 28 Aug 2017 11:24:10 -0700 Subject: IB/rdmavt: Handle dereg of inuse MRs properly A destroy of an MR prior to destroying the QP can cause the following diagnostic if the QP is referencing the MR being de-registered: hfi1 0000:05:00.0: hfi1_0: 
rvt_dereg_mr timeout mr ffff880856210800 pd ffff880859b20b00 The solution is that when a non-zero refcount is encountered as the MR is destroyed, the QPs need to be iterated, looking for QPs in the same PD as the MR. If rvt_qp_mr_clean() detects that any such QP references the rkey/lkey, the QP needs to be put into an error state via a call to rvt_error_qp(), which will trigger the clean up of any stuck references. This solution is as specified in IBTA 1.3 Volume 1 11.2.10.5. [This is reproduced with the 0.4.9 version of qperf and the rc_bw test] Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/mr.c | 121 ++++++++++++++++++++++++++++++++------ drivers/infiniband/sw/rdmavt/qp.c | 112 +++++++++++++++++++++++++++++++++-- include/rdma/rdmavt_mr.h | 3 + include/rdma/rdmavt_qp.h | 1 + 4 files changed, 216 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 1b3801f78e78..42713511b53b 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -440,6 +440,105 @@ bail_umem: return ret; } +/** + * rvt_dereg_clean_qp_cb - callback from iterator + * @qp - the qp + * @v - the mregion (as u64) + * + * This routine fields the callback for all QPs and + * for QPs in the same PD as the MR will call the + * rvt_qp_mr_clean() to potentially cleanup references. + */ +static void rvt_dereg_clean_qp_cb(struct rvt_qp *qp, u64 v) +{ + struct rvt_mregion *mr = (struct rvt_mregion *)v; + + /* skip PDs that are not ours */ + if (mr->pd != qp->ibqp.pd) + return; + rvt_qp_mr_clean(qp, mr->lkey); +} + +/** + * rvt_dereg_clean_qps - find QPs for reference cleanup + * @mr - the MR that is being deregistered + * + * This routine iterates RC QPs looking for references
+ */ +static void rvt_dereg_clean_qps(struct rvt_mregion *mr) +{ + struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device); + + rvt_qp_iter(rdi, (u64)mr, rvt_dereg_clean_qp_cb); +} + +/** + * rvt_check_refs - check references + * @mr - the megion + * @t - the caller identification + * + * This routine checks MRs holding a reference during + * when being de-registered. + * + * If the count is non-zero, the code calls a clean routine then + * waits for the timeout for the count to zero. + */ +static int rvt_check_refs(struct rvt_mregion *mr, const char *t) +{ + unsigned long timeout; + struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device); + + if (percpu_ref_is_zero(&mr->refcount)) + return 0; + /* avoid dma mr */ + if (mr->lkey) + rvt_dereg_clean_qps(mr); + timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ); + if (!timeout) { + rvt_pr_err(rdi, + "%s timeout mr %p pd %p lkey %x refcount %ld\n", + t, mr, mr->pd, mr->lkey, + atomic_long_read(&mr->refcount.count)); + rvt_get_mr(mr); + return -EBUSY; + } + return 0; +} + +/** + * rvt_mr_has_lkey - is MR + * @mr - the mregion + * @lkey - the lkey + */ +bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey) +{ + return mr && lkey == mr->lkey; +} + +/** + * rvt_ss_has_lkey - is mr in sge tests + * @ss - the sge state + * @lkey + * + * This code tests for an MR in the indicated + * sge state. 
+ */ +bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey) +{ + int i; + bool rval = false; + + if (!ss->num_sge) + return rval; + /* first one */ + rval = rvt_mr_has_lkey(ss->sge.mr, lkey); + /* any others */ + for (i = 0; !rval && i < ss->num_sge - 1; i++) + rval = rvt_mr_has_lkey(ss->sg_list[i].mr, lkey); + return rval; +} + /** * rvt_dereg_mr - unregister and free a memory region * @ibmr: the memory region to free @@ -453,22 +552,14 @@ bail_umem: int rvt_dereg_mr(struct ib_mr *ibmr) { struct rvt_mr *mr = to_imr(ibmr); - struct rvt_dev_info *rdi = ib_to_rvt(ibmr->pd->device); - int ret = 0; - unsigned long timeout; + int ret; rvt_free_lkey(&mr->mr); rvt_put_mr(&mr->mr); /* will set completion if last */ - timeout = wait_for_completion_timeout(&mr->mr.comp, 5 * HZ); - if (!timeout) { - rvt_pr_err(rdi, - "rvt_dereg_mr timeout mr %p pd %p\n", - mr, mr->mr.pd); - rvt_get_mr(&mr->mr); - ret = -EBUSY; + ret = rvt_check_refs(&mr->mr, __func__); + if (ret) goto out; - } rvt_deinit_mregion(&mr->mr); if (mr->umem) ib_umem_release(mr->umem); @@ -761,16 +852,12 @@ int rvt_dealloc_fmr(struct ib_fmr *ibfmr) { struct rvt_fmr *fmr = to_ifmr(ibfmr); int ret = 0; - unsigned long timeout; rvt_free_lkey(&fmr->mr); rvt_put_mr(&fmr->mr); /* will set completion if last */ - timeout = wait_for_completion_timeout(&fmr->mr.comp, 5 * HZ); - if (!timeout) { - rvt_get_mr(&fmr->mr); - ret = -EBUSY; + ret = rvt_check_refs(&fmr->mr, __func__); + if (ret) goto out; - } rvt_deinit_mregion(&fmr->mr); kfree(fmr); out: diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 9f70fd8665ab..22df09ae809e 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -458,10 +458,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) } } - if (qp->ibqp.qp_type != IB_QPT_RC) - return; - - for (n = 0; n < rvt_max_atomic(rdi); n++) { + for (n = 0; qp->s_ack_queue && n < rvt_max_atomic(rdi); n++) { struct rvt_ack_entry *e = 
&qp->s_ack_queue[n]; if (e->rdma_sge.mr) { @@ -471,6 +468,113 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) } } +/** + * rvt_swqe_has_lkey - return true if lkey is used by swqe + * @wqe - the send wqe + * @lkey - the lkey + * + * Test the swqe for using lkey + */ +static bool rvt_swqe_has_lkey(struct rvt_swqe *wqe, u32 lkey) +{ + int i; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct rvt_sge *sge = &wqe->sg_list[i]; + + if (rvt_mr_has_lkey(sge->mr, lkey)) + return true; + } + return false; +} + +/** + * rvt_qp_sends_has_lkey - return true is qp sends use lkey + * @qp - the rvt_qp + * @lkey - the lkey + */ +static bool rvt_qp_sends_has_lkey(struct rvt_qp *qp, u32 lkey) +{ + u32 s_last = qp->s_last; + + while (s_last != qp->s_head) { + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, s_last); + + if (rvt_swqe_has_lkey(wqe, lkey)) + return true; + + if (++s_last >= qp->s_size) + s_last = 0; + } + if (qp->s_rdma_mr) + if (rvt_mr_has_lkey(qp->s_rdma_mr, lkey)) + return true; + return false; +} + +/** + * rvt_qp_acks_has_lkey - return true if acks have lkey + * @qp - the qp + * @lkey - the lkey + */ +static bool rvt_qp_acks_has_lkey(struct rvt_qp *qp, u32 lkey) +{ + int i; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + for (i = 0; qp->s_ack_queue && i < rvt_max_atomic(rdi); i++) { + struct rvt_ack_entry *e = &qp->s_ack_queue[i]; + + if (rvt_mr_has_lkey(e->rdma_sge.mr, lkey)) + return true; + } + return false; +} + +/* + * rvt_qp_mr_clean - clean up remote ops for lkey + * @qp - the qp + * @lkey - the lkey that is being de-registered + * + * This routine checks if the lkey is being used by + * the qp. + * + * If so, the qp is put into an error state to elminate + * any references from the qp. 
+ */ +void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey) +{ + bool lastwqe = false; + + if (qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + /* avoid special QPs */ + return; + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + + if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET) + goto check_lwqe; + + if (rvt_ss_has_lkey(&qp->r_sge, lkey) || + rvt_qp_sends_has_lkey(qp, lkey) || + rvt_qp_acks_has_lkey(qp, lkey)) + lastwqe = rvt_error_qp(qp, IB_WC_LOC_PROT_ERR); +check_lwqe: + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + /** * rvt_remove_qp - remove qp form table * @rdi: rvt dev struct diff --git a/include/rdma/rdmavt_mr.h b/include/rdma/rdmavt_mr.h index f418bd5571a5..72a3856d4057 100644 --- a/include/rdma/rdmavt_mr.h +++ b/include/rdma/rdmavt_mr.h @@ -191,4 +191,7 @@ static inline void rvt_skip_sge(struct rvt_sge_state *ss, u32 length, } } +bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey); +bool rvt_mr_has_lkey(struct rvt_mregion *mr, u32 lkey); + #endif /* DEF_RDMAVT_INCMRH */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index dfeb311c30a1..0eed3d8752fa 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -702,4 +702,5 @@ int rvt_qp_iter_next(struct rvt_qp_iter *iter); void rvt_qp_iter(struct rvt_dev_info *rdi, u64 v, void (*cb)(struct rvt_qp *qp, u64 v)); +void rvt_qp_mr_clean(struct rvt_qp *qp, u32 lkey); #endif /* DEF_RDMAVT_INCQP_H */ -- cgit v1.2.3 From 6e44636aeab19259f804c8abca57a95ddc01df66 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Tue, 15 Aug 2017 11:59:02 +0300 Subject: net/mlx5: Update HW layout definitions * add offload_type field to mlx5_ifc_qpc_bits * update mlx5_ifc_xrqc_bits layout 
Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/linux/mlx5/mlx5_ifc.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 4eff0b8a1482..e27283ab3667 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2022,6 +2022,10 @@ enum { MLX5_QPC_PM_STATE_MIGRATED = 0x3, }; +enum { + MLX5_QPC_OFFLOAD_TYPE_RNDV = 0x1, +}; + enum { MLX5_QPC_END_PADDING_MODE_SCATTER_AS_IS = 0x0, MLX5_QPC_END_PADDING_MODE_PAD_TO_CACHE_LINE_ALIGNMENT = 0x1, @@ -2065,7 +2069,8 @@ struct mlx5_ifc_qpc_bits { u8 st[0x8]; u8 reserved_at_10[0x3]; u8 pm_state[0x2]; - u8 reserved_at_15[0x7]; + u8 reserved_at_15[0x3]; + u8 offload_type[0x4]; u8 end_padding_mode[0x2]; u8 reserved_at_1e[0x2]; @@ -3010,7 +3015,7 @@ struct mlx5_ifc_xrqc_bits { struct mlx5_ifc_tag_matching_topology_context_bits tag_matching_topology_context; - u8 reserved_at_180[0x880]; + u8 reserved_at_180[0x280]; struct mlx5_ifc_wq_bits wq; }; -- cgit v1.2.3 From 6938fc1ee07e54c057430005f8dcaccabce027c3 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Thu, 17 Aug 2017 15:52:03 +0300 Subject: IB/core: Add XRQ capabilities This patch adds following TM XRQ capabilities: * max_rndv_hdr_size - Max size of rendezvous request message * max_num_tags - Max number of entries in tag matching list * max_ops - Max number of outstanding list operations * max_sge - Max number of SGE in tag matching entry * flags - the following flags are currently defined: - IB_TM_CAP_RC - Support tag matching on RC transport Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 355c7a328e0b..cab0bdcfad51 100644 
--- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -280,6 +280,24 @@ struct ib_rss_caps { u32 max_rwq_indirection_table_size; }; +enum ib_tm_cap_flags { + /* Support tag matching on RC transport */ + IB_TM_CAP_RC = 1 << 0, +}; + +struct ib_xrq_caps { + /* Max size of RNDV header */ + u32 max_rndv_hdr_size; + /* Max number of entries in tag matching list */ + u32 max_num_tags; + /* From enum ib_tm_cap_flags */ + u32 flags; + /* Max number of outstanding list operations */ + u32 max_ops; + /* Max number of SGE in tag matching entry */ + u32 max_sge; +}; + enum ib_cq_creation_flags { IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, IB_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, @@ -340,6 +358,7 @@ struct ib_device_attr { struct ib_rss_caps rss_caps; u32 max_wq_type_rq; u32 raw_packet_caps; /* Use ib_raw_packet_caps enum */ + struct ib_xrq_caps xrq_caps; }; enum ib_mtu { -- cgit v1.2.3 From 1a56ff6daab1e062aadec582eb10e7090f0b370a Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Thu, 17 Aug 2017 15:52:04 +0300 Subject: IB/core: Separate CQ handle in SRQ context Before this change CQ attached to SRQ was part of XRC specific extension. Moving CQ handle out makes it available to other types extending SRQ functionality. 
Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 27 +++++++++++++++++---------- drivers/infiniband/core/verbs.c | 16 +++++++++------- drivers/infiniband/hw/mlx4/srq.c | 4 ++-- drivers/infiniband/hw/mlx5/main.c | 10 +++++----- drivers/infiniband/hw/mlx5/srq.c | 11 +++++++---- include/rdma/ib_verbs.h | 31 ++++++++++++++++++++----------- 6 files changed, 60 insertions(+), 39 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 8e9fea03dec4..9f690af46a7e 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3497,10 +3497,12 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject); atomic_inc(&obj->uxrcd->refcnt); + } - attr.ext.xrc.cq = uobj_get_obj_read(cq, cmd->cq_handle, - file->ucontext); - if (!attr.ext.xrc.cq) { + if (ib_srq_has_cq(cmd->srq_type)) { + attr.ext.cq = uobj_get_obj_read(cq, cmd->cq_handle, + file->ucontext); + if (!attr.ext.cq) { ret = -EINVAL; goto err_put_xrcd; } @@ -3535,10 +3537,13 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, srq->event_handler = attr.event_handler; srq->srq_context = attr.srq_context; + if (ib_srq_has_cq(cmd->srq_type)) { + srq->ext.cq = attr.ext.cq; + atomic_inc(&attr.ext.cq->usecnt); + } + if (cmd->srq_type == IB_SRQT_XRC) { - srq->ext.xrc.cq = attr.ext.xrc.cq; srq->ext.xrc.xrcd = attr.ext.xrc.xrcd; - atomic_inc(&attr.ext.xrc.cq->usecnt); atomic_inc(&attr.ext.xrc.xrcd->usecnt); } @@ -3561,10 +3566,12 @@ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, goto err_copy; } - if (cmd->srq_type == IB_SRQT_XRC) { + if (cmd->srq_type == IB_SRQT_XRC) uobj_put_read(xrcd_uobj); - uobj_put_obj_read(attr.ext.xrc.cq); - } + + if (ib_srq_has_cq(cmd->srq_type)) + uobj_put_obj_read(attr.ext.cq); + 
uobj_put_obj_read(pd); uobj_alloc_commit(&obj->uevent.uobject); @@ -3577,8 +3584,8 @@ err_put: uobj_put_obj_read(pd); err_put_cq: - if (cmd->srq_type == IB_SRQT_XRC) - uobj_put_obj_read(attr.ext.xrc.cq); + if (ib_srq_has_cq(cmd->srq_type)) + uobj_put_obj_read(attr.ext.cq); err_put_xrcd: if (cmd->srq_type == IB_SRQT_XRC) { diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index b29d0ff94463..ecb6c395f19b 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -622,11 +622,13 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd, srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; srq->srq_type = srq_init_attr->srq_type; + if (ib_srq_has_cq(srq->srq_type)) { + srq->ext.cq = srq_init_attr->ext.cq; + atomic_inc(&srq->ext.cq->usecnt); + } if (srq->srq_type == IB_SRQT_XRC) { srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; - srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq; atomic_inc(&srq->ext.xrc.xrcd->usecnt); - atomic_inc(&srq->ext.xrc.cq->usecnt); } atomic_inc(&pd->usecnt); atomic_set(&srq->usecnt, 0); @@ -667,18 +669,18 @@ int ib_destroy_srq(struct ib_srq *srq) pd = srq->pd; srq_type = srq->srq_type; - if (srq_type == IB_SRQT_XRC) { + if (ib_srq_has_cq(srq_type)) + cq = srq->ext.cq; + if (srq_type == IB_SRQT_XRC) xrcd = srq->ext.xrc.xrcd; - cq = srq->ext.xrc.cq; - } ret = srq->device->destroy_srq(srq); if (!ret) { atomic_dec(&pd->usecnt); - if (srq_type == IB_SRQT_XRC) { + if (srq_type == IB_SRQT_XRC) atomic_dec(&xrcd->usecnt); + if (ib_srq_has_cq(srq_type)) atomic_dec(&cq->usecnt); - } } return ret; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index dd7a2fce9df4..ebee56cbc0e2 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -178,8 +178,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, } } - cqn = (init_attr->srq_type == IB_SRQT_XRC) ? 
- to_mcq(init_attr->ext.xrc.cq)->mcq.cqn : 0; + cqn = ib_srq_has_cq(init_attr->srq_type) ? + to_mcq(init_attr->ext.cq)->mcq.cqn : 0; xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ? to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn : (u16) dev->dev->caps.reserved_xrcds; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 62e6298810e7..7ad585257fd3 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3217,7 +3217,7 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) attr.attr.max_sge = 1; attr.attr.max_wr = 1; attr.srq_type = IB_SRQT_XRC; - attr.ext.xrc.cq = devr->c0; + attr.ext.cq = devr->c0; attr.ext.xrc.xrcd = devr->x0; devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL); @@ -3232,9 +3232,9 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) devr->s0->srq_context = NULL; devr->s0->srq_type = IB_SRQT_XRC; devr->s0->ext.xrc.xrcd = devr->x0; - devr->s0->ext.xrc.cq = devr->c0; + devr->s0->ext.cq = devr->c0; atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt); - atomic_inc(&devr->s0->ext.xrc.cq->usecnt); + atomic_inc(&devr->s0->ext.cq->usecnt); atomic_inc(&devr->p0->usecnt); atomic_set(&devr->s0->usecnt, 0); @@ -3253,9 +3253,9 @@ static int create_dev_resources(struct mlx5_ib_resources *devr) devr->s1->event_handler = NULL; devr->s1->srq_context = NULL; devr->s1->srq_type = IB_SRQT_BASIC; - devr->s1->ext.xrc.cq = devr->c0; + devr->s1->ext.cq = devr->c0; atomic_inc(&devr->p0->usecnt); - atomic_set(&devr->s0->usecnt, 0); + atomic_set(&devr->s1->usecnt, 0); for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) { INIT_WORK(&devr->ports[port].pkey_change_work, diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 30b3ddd8e1ab..e6be4f2927a7 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -292,13 +292,16 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, in.wqe_shift = srq->msrq.wqe_shift - 4; if (srq->wq_sig) 
in.flags |= MLX5_SRQ_FLAG_WQ_SIG; - if (init_attr->srq_type == IB_SRQT_XRC) { + + if (init_attr->srq_type == IB_SRQT_XRC) in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn; - in.cqn = to_mcq(init_attr->ext.xrc.cq)->mcq.cqn; - } else if (init_attr->srq_type == IB_SRQT_BASIC) { + else in.xrcd = to_mxrcd(dev->devr.x0)->xrcdn; + + if (ib_srq_has_cq(init_attr->srq_type)) + in.cqn = to_mcq(init_attr->ext.cq)->mcq.cqn; + else in.cqn = to_mcq(dev->devr.c0)->mcq.cqn; - } in.pd = to_mpd(pd)->pdn; in.db_record = srq->db.dma; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index cab0bdcfad51..f0e46757185b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -994,6 +994,11 @@ enum ib_srq_type { IB_SRQT_XRC }; +static inline bool ib_srq_has_cq(enum ib_srq_type srq_type) +{ + return srq_type == IB_SRQT_XRC; +} + enum ib_srq_attr_mask { IB_SRQ_MAX_WR = 1 << 0, IB_SRQ_LIMIT = 1 << 1, @@ -1011,11 +1016,13 @@ struct ib_srq_init_attr { struct ib_srq_attr attr; enum ib_srq_type srq_type; - union { - struct { - struct ib_xrcd *xrcd; - struct ib_cq *cq; - } xrc; + struct { + struct ib_cq *cq; + union { + struct { + struct ib_xrcd *xrcd; + } xrc; + }; } ext; }; @@ -1554,12 +1561,14 @@ struct ib_srq { enum ib_srq_type srq_type; atomic_t usecnt; - union { - struct { - struct ib_xrcd *xrcd; - struct ib_cq *cq; - u32 srq_num; - } xrc; + struct { + struct ib_cq *cq; + union { + struct { + struct ib_xrcd *xrcd; + u32 srq_num; + } xrc; + }; } ext; }; -- cgit v1.2.3 From 9c2c849625cf779e0fac41c8be3c163df4b80c14 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Thu, 17 Aug 2017 15:52:05 +0300 Subject: IB/core: Add new SRQ type IB_SRQT_TM This patch adds new SRQ type - IB_SRQT_TM. The new SRQ type supports tag matching and rendezvous offloads for MPI applications. When SRQ receives a message it will search through the matching list for the corresponding posted receive buffer. The process of searching the matching list is called tag matching. 
In case the tag matching results in a match, the received message will be placed in the address specified by the receive buffer. In case no match was found the message will be placed in a generic buffer until the corresponding receive buffer will be posted. These messages are called unexpected and their set is called an unexpected list. Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/rdma/ib_verbs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index f0e46757185b..1b4bb8743969 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -991,12 +991,14 @@ enum ib_cq_notify_flags { enum ib_srq_type { IB_SRQT_BASIC, - IB_SRQT_XRC + IB_SRQT_XRC, + IB_SRQT_TM, }; static inline bool ib_srq_has_cq(enum ib_srq_type srq_type) { - return srq_type == IB_SRQT_XRC; + return srq_type == IB_SRQT_XRC || + srq_type == IB_SRQT_TM; } enum ib_srq_attr_mask { @@ -1022,6 +1024,10 @@ struct ib_srq_init_attr { struct { struct ib_xrcd *xrcd; } xrc; + + struct { + u32 max_num_tags; + } tag_matching; }; } ext; }; -- cgit v1.2.3 From 9382d4e1d3c09fe20fa53eb12b51ef01ad40774f Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Thu, 17 Aug 2017 15:52:06 +0300 Subject: IB/uverbs: Add XRQ creation parameter to UAPI Add tm_list_size parameter to struct ib_uverbs_create_xsrq. If SRQ type is tag-matching this field defines maximum size of tag matching list. Otherwise, it is expected to be zero. 
Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- include/uapi/rdma/ib_user_verbs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 63656d2e8705..d5434bbf40c8 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -1024,7 +1024,7 @@ struct ib_uverbs_create_xsrq { __u32 max_wr; __u32 max_sge; __u32 srq_limit; - __u32 reserved; + __u32 max_num_tags; __u32 xrcd_handle; __u32 cq_handle; __u64 driver_data[0]; -- cgit v1.2.3 From 8d50505ada728258fcdce99120b937ce68298c4e Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Thu, 17 Aug 2017 15:52:08 +0300 Subject: IB/uverbs: Expose XRQ capabilities Make XRQ capabilities available via ibv_query_device() verb. Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 10 ++++++++++ include/uapi/rdma/ib_user_verbs.h | 15 +++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index e69038a07fa0..e0cb99860934 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3868,6 +3868,16 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.raw_packet_caps = attr.raw_packet_caps; resp.response_length += sizeof(resp.raw_packet_caps); + + if (ucore->outlen < resp.response_length + sizeof(resp.xrq_caps)) + goto end; + + resp.xrq_caps.max_rndv_hdr_size = attr.xrq_caps.max_rndv_hdr_size; + resp.xrq_caps.max_num_tags = attr.xrq_caps.max_num_tags; + resp.xrq_caps.max_ops = attr.xrq_caps.max_ops; + resp.xrq_caps.max_sge = attr.xrq_caps.max_sge; + resp.xrq_caps.flags = attr.xrq_caps.flags; + resp.response_length += sizeof(resp.xrq_caps); end: err = 
ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index d5434bbf40c8..9a0b6479fe0c 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -236,6 +236,20 @@ struct ib_uverbs_rss_caps { __u32 reserved; }; +struct ib_uverbs_tm_caps { + /* Max size of rendezvous request message */ + __u32 max_rndv_hdr_size; + /* Max number of entries in tag matching list */ + __u32 max_num_tags; + /* TM flags */ + __u32 flags; + /* Max number of outstanding list operations */ + __u32 max_ops; + /* Max number of SGE in tag matching entry */ + __u32 max_sge; + __u32 reserved; +}; + struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_query_device_resp base; __u32 comp_mask; @@ -247,6 +261,7 @@ struct ib_uverbs_ex_query_device_resp { struct ib_uverbs_rss_caps rss_caps; __u32 max_wq_type_rq; __u32 raw_packet_caps; + struct ib_uverbs_tm_caps xrq_caps; }; struct ib_uverbs_query_port { -- cgit v1.2.3 From 5b3ec3fcb6bbe081279c73fb574af8c72f14cea0 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Thu, 17 Aug 2017 15:52:10 +0300 Subject: net/mlx5: Add XRQ support Add support for the new XRQ (eXtended shared Receive Queue) hardware object. It supports SRQ semantics with the addition of extended receive buffers topologies and offloads. Currently supports tag matching topology and rendezvous offload.
Signed-off-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/net/ethernet/mellanox/mlx5/core/srq.c | 150 ++++++++++++++++++++++++-- include/linux/mlx5/driver.h | 1 + include/linux/mlx5/srq.h | 5 + 3 files changed, 146 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c index f774de6f5fcb..7673da04efa4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c @@ -435,16 +435,128 @@ out: return err; } +static int create_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *in) +{ + u32 create_out[MLX5_ST_SZ_DW(create_xrq_out)] = {0}; + void *create_in; + void *xrqc; + void *wq; + int pas_size; + int inlen; + int err; + + pas_size = get_pas_size(in); + inlen = MLX5_ST_SZ_BYTES(create_xrq_in) + pas_size; + create_in = kvzalloc(inlen, GFP_KERNEL); + if (!create_in) + return -ENOMEM; + + xrqc = MLX5_ADDR_OF(create_xrq_in, create_in, xrq_context); + wq = MLX5_ADDR_OF(xrqc, xrqc, wq); + + set_wq(wq, in); + memcpy(MLX5_ADDR_OF(xrqc, xrqc, wq.pas), in->pas, pas_size); + + if (in->type == IB_SRQT_TM) { + MLX5_SET(xrqc, xrqc, topology, MLX5_XRQC_TOPOLOGY_TAG_MATCHING); + if (in->flags & MLX5_SRQ_FLAG_RNDV) + MLX5_SET(xrqc, xrqc, offload, MLX5_XRQC_OFFLOAD_RNDV); + MLX5_SET(xrqc, xrqc, + tag_matching_topology_context.log_matching_list_sz, + in->tm_log_list_size); + } + MLX5_SET(xrqc, xrqc, user_index, in->user_index); + MLX5_SET(xrqc, xrqc, cqn, in->cqn); + MLX5_SET(create_xrq_in, create_in, opcode, MLX5_CMD_OP_CREATE_XRQ); + err = mlx5_cmd_exec(dev, create_in, inlen, create_out, + sizeof(create_out)); + kvfree(create_in); + if (!err) + srq->srqn = MLX5_GET(create_xrq_out, create_out, xrqn); + + return err; +} + +static int destroy_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq) +{ + u32 
in[MLX5_ST_SZ_DW(destroy_xrq_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_xrq_out)] = {0}; + + MLX5_SET(destroy_xrq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRQ); + MLX5_SET(destroy_xrq_in, in, xrqn, srq->srqn); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int arm_xrq_cmd(struct mlx5_core_dev *dev, + struct mlx5_core_srq *srq, + u16 lwm) +{ + u32 out[MLX5_ST_SZ_DW(arm_rq_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {0}; + + MLX5_SET(arm_rq_in, in, opcode, MLX5_CMD_OP_ARM_RQ); + MLX5_SET(arm_rq_in, in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_XRQ); + MLX5_SET(arm_rq_in, in, srq_number, srq->srqn); + MLX5_SET(arm_rq_in, in, lwm, lwm); + + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +static int query_xrq_cmd(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, + struct mlx5_srq_attr *out) +{ + u32 in[MLX5_ST_SZ_DW(query_xrq_in)] = {0}; + u32 *xrq_out; + int outlen = MLX5_ST_SZ_BYTES(query_xrq_out); + void *xrqc; + int err; + + xrq_out = kvzalloc(outlen, GFP_KERNEL); + if (!xrq_out) + return -ENOMEM; + + MLX5_SET(query_xrq_in, in, opcode, MLX5_CMD_OP_QUERY_XRQ); + MLX5_SET(query_xrq_in, in, xrqn, srq->srqn); + + err = mlx5_cmd_exec(dev, in, sizeof(in), xrq_out, outlen); + if (err) + goto out; + + xrqc = MLX5_ADDR_OF(query_xrq_out, xrq_out, xrq_context); + get_wq(MLX5_ADDR_OF(xrqc, xrqc, wq), out); + if (MLX5_GET(xrqc, xrqc, state) != MLX5_XRQC_STATE_GOOD) + out->flags |= MLX5_SRQ_FLAG_ERR; + out->tm_next_tag = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.append_next_index); + out->tm_hw_phase_cnt = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.hw_phase_cnt); + out->tm_sw_phase_cnt = + MLX5_GET(xrqc, xrqc, + tag_matching_topology_context.sw_phase_cnt); + +out: + kvfree(xrq_out); + return err; +} + static int create_srq_split(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, struct mlx5_srq_attr *in) { if (!dev->issi) return create_srq_cmd(dev, srq, in); - else if (srq->common.res == MLX5_RES_XSRQ) + 
switch (srq->common.res) { + case MLX5_RES_XSRQ: return create_xrc_srq_cmd(dev, srq, in); - else + case MLX5_RES_XRQ: + return create_xrq_cmd(dev, srq, in); + default: return create_rmp_cmd(dev, srq, in); + } } static int destroy_srq_split(struct mlx5_core_dev *dev, @@ -452,10 +564,14 @@ static int destroy_srq_split(struct mlx5_core_dev *dev, { if (!dev->issi) return destroy_srq_cmd(dev, srq); - else if (srq->common.res == MLX5_RES_XSRQ) + switch (srq->common.res) { + case MLX5_RES_XSRQ: return destroy_xrc_srq_cmd(dev, srq); - else + case MLX5_RES_XRQ: + return destroy_xrq_cmd(dev, srq); + default: return destroy_rmp_cmd(dev, srq); + } } int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, @@ -464,10 +580,16 @@ int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, int err; struct mlx5_srq_table *table = &dev->priv.srq_table; - if (in->type == IB_SRQT_XRC) + switch (in->type) { + case IB_SRQT_XRC: srq->common.res = MLX5_RES_XSRQ; - else + break; + case IB_SRQT_TM: + srq->common.res = MLX5_RES_XRQ; + break; + default: srq->common.res = MLX5_RES_SRQ; + } err = create_srq_split(dev, srq, in); if (err) @@ -528,10 +650,14 @@ int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, { if (!dev->issi) return query_srq_cmd(dev, srq, out); - else if (srq->common.res == MLX5_RES_XSRQ) + switch (srq->common.res) { + case MLX5_RES_XSRQ: return query_xrc_srq_cmd(dev, srq, out); - else + case MLX5_RES_XRQ: + return query_xrq_cmd(dev, srq, out); + default: return query_rmp_cmd(dev, srq, out); + } } EXPORT_SYMBOL(mlx5_core_query_srq); @@ -540,10 +666,14 @@ int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, { if (!dev->issi) return arm_srq_cmd(dev, srq, lwm, is_srq); - else if (srq->common.res == MLX5_RES_XSRQ) + switch (srq->common.res) { + case MLX5_RES_XSRQ: return arm_xrc_srq_cmd(dev, srq, lwm); - else + case MLX5_RES_XRQ: + return arm_xrq_cmd(dev, srq, lwm); + default: return 
arm_rmp_cmd(dev, srq, lwm); + } } EXPORT_SYMBOL(mlx5_core_arm_srq); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 99d88624ad07..c33e6f7a1afb 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -418,6 +418,7 @@ enum mlx5_res_type { MLX5_RES_SQ = MLX5_EVENT_QUEUE_TYPE_SQ, MLX5_RES_SRQ = 3, MLX5_RES_XSRQ = 4, + MLX5_RES_XRQ = 5, }; struct mlx5_core_rsc_common { diff --git a/include/linux/mlx5/srq.h b/include/linux/mlx5/srq.h index 1cde0fd53f90..24ff23e27c8a 100644 --- a/include/linux/mlx5/srq.h +++ b/include/linux/mlx5/srq.h @@ -38,6 +38,7 @@ enum { MLX5_SRQ_FLAG_ERR = (1 << 0), MLX5_SRQ_FLAG_WQ_SIG = (1 << 1), + MLX5_SRQ_FLAG_RNDV = (1 << 2), }; struct mlx5_srq_attr { @@ -56,6 +57,10 @@ struct mlx5_srq_attr { u32 user_index; u64 db_record; __be64 *pas; + u32 tm_log_list_size; + u32 tm_next_tag; + u32 tm_hw_phase_cnt; + u32 tm_sw_phase_cnt; }; struct mlx5_core_dev; -- cgit v1.2.3 From a0aa309c39de58b86b704654434431aeb5a8bdf1 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:06:55 +0300 Subject: IB/core: Add a generic way to execute an operation on a uobject The ioctl infrastructure treats all user-objects in the same manner. It gets objects ids from the user-space and by using the object type and type attributes mentioned in the object specification, it executes this required method. Passing an object id from the user-space as an attribute is carried out in three stages. The first is carried out before the actual handler and the last is carried out afterwards. The different supported operations are read, write, destroy and create. In the first stage, the former three actions just fetches the object from the repository (by using its id) and locks it. The last action allocates a new uobject. Afterwards, the second stage is carried out when the handler itself carries out the required modification of the object. The last stage is carried out after the handler finishes and commits the result. 
The former two operations just unlock the object. Destroy calls the "free object" operation, taking into account the object's type and releases the uobject as well. Creation just adds the new uobject to the repository, making the object visible to the application. In order to abstract these details from the ioctl infrastructure layer, we add uverbs_get_uobject_from_context and uverbs_finalize_object functions which corresponds to the first and last stages respectively. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 58 +++++++++++++++++++++++++++++++++++++ drivers/infiniband/core/rdma_core.h | 17 +++++++++++ include/rdma/uverbs_ioctl.h | 52 +++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+) create mode 100644 include/rdma/uverbs_ioctl.h (limited to 'include') diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 41c31a2bf093..2bd58ff17bb8 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "uverbs.h" #include "core_priv.h" #include "rdma_core.h" @@ -625,3 +626,60 @@ const struct uverbs_obj_type_class uverbs_fd_class = { .needs_kfree_rcu = false, }; +struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, + struct ib_ucontext *ucontext, + enum uverbs_obj_access access, + int id) +{ + switch (access) { + case UVERBS_ACCESS_READ: + return rdma_lookup_get_uobject(type_attrs, ucontext, id, false); + case UVERBS_ACCESS_DESTROY: + case UVERBS_ACCESS_WRITE: + return rdma_lookup_get_uobject(type_attrs, ucontext, id, true); + case UVERBS_ACCESS_NEW: + return rdma_alloc_begin_uobject(type_attrs, ucontext); + default: + WARN_ON(true); + return ERR_PTR(-EOPNOTSUPP); + } +} + +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit) +{ + int ret = 0; + + /* + * 
refcounts should be handled at the object level and not at the + * uobject level. Refcounts of the objects themselves are done in + * handlers. + */ + + switch (access) { + case UVERBS_ACCESS_READ: + rdma_lookup_put_uobject(uobj, false); + break; + case UVERBS_ACCESS_WRITE: + rdma_lookup_put_uobject(uobj, true); + break; + case UVERBS_ACCESS_DESTROY: + if (commit) + ret = rdma_remove_commit_uobject(uobj); + else + rdma_lookup_put_uobject(uobj, true); + break; + case UVERBS_ACCESS_NEW: + if (commit) + ret = rdma_alloc_commit_uobject(uobj); + else + rdma_alloc_abort_uobject(uobj); + break; + default: + WARN_ON(true); + ret = -EOPNOTSUPP; + } + + return ret; +} diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 1b82e7ff7fe8..97483d1a7336 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -39,6 +39,7 @@ #include #include +#include #include #include @@ -75,4 +76,20 @@ void uverbs_uobject_put(struct ib_uobject *uobject); */ void uverbs_close_fd(struct file *f); +/* + * Get an ib_uobject that corresponds to the given id from ucontext, assuming + * the object is from the given type. Lock it to the required access when + * applicable. + * This function could create (access == NEW), destroy (access == DESTROY) + * or unlock (access == READ || access == WRITE) objects if required. + * The action will be finalized only when uverbs_finalize_object is called. 
+ */ +struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, + struct ib_ucontext *ucontext, + enum uverbs_obj_access access, + int id); +int uverbs_finalize_object(struct ib_uobject *uobj, + enum uverbs_obj_access access, + bool commit); + #endif /* RDMA_CORE_H */ diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h new file mode 100644 index 000000000000..6885b92db4a8 --- /dev/null +++ b/include/rdma/uverbs_ioctl.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _UVERBS_IOCTL_ +#define _UVERBS_IOCTL_ + +#include + +/* + * ======================================= + * Verbs action specifications + * ======================================= + */ + +enum uverbs_obj_access { + UVERBS_ACCESS_READ, + UVERBS_ACCESS_WRITE, + UVERBS_ACCESS_NEW, + UVERBS_ACCESS_DESTROY +}; + +#endif + -- cgit v1.2.3 From f43dbebfa32041826299bdccae0352887fa007ea Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:06:56 +0300 Subject: IB/core: Add support to finalize objects in one transaction The new ioctl based infrastructure either commits or rolls back all objects of the method as one transaction. In order to do that, we introduce a notion of dealing with a collection of objects that are related to a specific method. This also requires adding a notion of a method and attribute. A method contains a hash of attributes, where each bucket contains several attributes. The attributes are hashed according to their namespace which resides in the four upper bits of the id. For example, an object could be a CQ, which has an action of CREATE_CQ. This action has multiple attributes. For example, the CQ's new handle and the comp_channel. Each layer in this hierarchy - objects, methods and attributes - is split into namespaces. The basic example for that is one namespace representing the default entities and another one representing the driver specific entities. When declaring these methods and attributes, we actually declare their specifications. When a method is executed, we actually allocate some space to hold auxiliary information. This auxiliary information contains meta-data about the required objects, such as pointers to their type information, pointers to the uobjects themselves (if they exist), etc. The specification, along with the auxiliary information we allocated and filled, is given to the finalize_objects function.
Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 40 ++++++++++++++++++++++++++++ drivers/infiniband/core/rdma_core.h | 22 ++++++++++++++- include/rdma/uverbs_ioctl.h | 53 +++++++++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 2bd58ff17bb8..0fe8ef913387 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -683,3 +683,43 @@ int uverbs_finalize_object(struct ib_uobject *uobj, return ret; } + +int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, + struct uverbs_attr_spec_hash * const *spec_hash, + size_t num, + bool commit) +{ + unsigned int i; + int ret = 0; + + for (i = 0; i < num; i++) { + struct uverbs_attr_bundle_hash *curr_bundle = + &attrs_bundle->hash[i]; + const struct uverbs_attr_spec_hash *curr_spec_bucket = + spec_hash[i]; + unsigned int j; + + for (j = 0; j < curr_bundle->num_attrs; j++) { + struct uverbs_attr *attr; + const struct uverbs_attr_spec *spec; + + if (!uverbs_attr_is_valid_in_hash(curr_bundle, j)) + continue; + + attr = &curr_bundle->attrs[j]; + spec = &curr_spec_bucket->attrs[j]; + + if (spec->type == UVERBS_ATTR_TYPE_IDR || + spec->type == UVERBS_ATTR_TYPE_FD) { + int current_ret; + + current_ret = uverbs_finalize_object(attr->obj_attr.uobject, + spec->obj.access, + commit); + if (!ret) + ret = current_ret; + } + } + } + return ret; +} diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 97483d1a7336..9ed6ad0324c7 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -82,7 +82,8 @@ void uverbs_close_fd(struct file *f); * applicable. * This function could create (access == NEW), destroy (access == DESTROY) * or unlock (access == READ || access == WRITE) objects if required. 
- * The action will be finalized only when uverbs_finalize_object is called. + * The action will be finalized only when uverbs_finalize_object or + * uverbs_finalize_objects are called. */ struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type *type_attrs, struct ib_ucontext *ucontext, @@ -91,5 +92,24 @@ struct ib_uobject *uverbs_get_uobject_from_context(const struct uverbs_obj_type int uverbs_finalize_object(struct ib_uobject *uobj, enum uverbs_obj_access access, bool commit); +/* + * Note that certain finalize stages could return a status: + * (a) alloc_commit could return a failure if the object is committed at the + * same time when the context is destroyed. + * (b) remove_commit could fail if the object wasn't destroyed successfully. + * Since multiple objects could be finalized in one transaction, it is very NOT + * recommended to have several finalize actions which have side effects. + * For example, it's NOT recommended to have a certain action which has both + * a commit action and a destroy action or two destroy objects in the same + * action. The rule of thumb is to have one destroy or commit action with + * multiple lookups. + * The first non zero return value of finalize_object is returned from this + * function. For example, this could happen when we couldn't destroy an + * object. 
+ */ +int uverbs_finalize_objects(struct uverbs_attr_bundle *attrs_bundle, + struct uverbs_attr_spec_hash * const *spec_hash, + size_t num, + bool commit); #endif /* RDMA_CORE_H */ diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 6885b92db4a8..d3ec02b7d937 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -41,6 +41,12 @@ * ======================================= */ +enum uverbs_attr_type { + UVERBS_ATTR_TYPE_NA, + UVERBS_ATTR_TYPE_IDR, + UVERBS_ATTR_TYPE_FD, +}; + enum uverbs_obj_access { UVERBS_ACCESS_READ, UVERBS_ACCESS_WRITE, @@ -48,5 +54,52 @@ enum uverbs_obj_access { UVERBS_ACCESS_DESTROY }; +struct uverbs_attr_spec { + enum uverbs_attr_type type; + struct { + /* + * higher bits mean the namespace and lower bits mean + * the type id within the namespace. + */ + u16 obj_type; + u8 access; + } obj; +}; + +struct uverbs_attr_spec_hash { + size_t num_attrs; + struct uverbs_attr_spec attrs[0]; +}; + +struct uverbs_obj_attr { + struct ib_uobject *uobject; +}; + +struct uverbs_attr { + struct uverbs_obj_attr obj_attr; +}; + +struct uverbs_attr_bundle_hash { + /* if bit i is set, it means attrs[i] contains valid information */ + unsigned long *valid_bitmap; + size_t num_attrs; + /* + * arrays of attributes, each element corresponds to the specification + * of the attribute in the same index. 
+ */ + struct uverbs_attr *attrs; +}; + +struct uverbs_attr_bundle { + size_t num_buckets; + struct uverbs_attr_bundle_hash hash[]; +}; + +static inline bool uverbs_attr_is_valid_in_hash(const struct uverbs_attr_bundle_hash *attrs_hash, + unsigned int idx) +{ + return test_bit(idx, attrs_hash->valid_bitmap); +} + #endif -- cgit v1.2.3 From 72f9b089ecd2cc2194d27cbb14fd80a0b1472e89 Mon Sep 17 00:00:00 2001 From: Aditya Sarwade Date: Tue, 29 Aug 2017 15:51:29 -0700 Subject: RDMA/vmw_pvrdma: Report network header type in WC We should report the network header type in the work completion so that the kernel can infer the right RoCE type headers. Reviewed-by: Bryan Tan Signed-off-by: Aditya Sarwade Signed-off-by: Adit Ranadive Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 1 + include/uapi/rdma/vmw_pvrdma-abi.h | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index 90aa326fd7c0..8a12dc73b68e 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -389,6 +389,7 @@ retry: wc->dlid_path_bits = cqe->dlid_path_bits; wc->port_num = cqe->port_num; wc->vendor_err = cqe->vendor_err; + wc->network_hdr_type = cqe->network_hdr_type; /* Update shared ring state */ pvrdma_idx_ring_inc(&cq->ring_state->rx.cons_head, cq->ibcq.cqe); diff --git a/include/uapi/rdma/vmw_pvrdma-abi.h b/include/uapi/rdma/vmw_pvrdma-abi.h index c8c1d2d6df4d..c6569b0032ec 100644 --- a/include/uapi/rdma/vmw_pvrdma-abi.h +++ b/include/uapi/rdma/vmw_pvrdma-abi.h @@ -125,7 +125,8 @@ enum pvrdma_wc_flags { PVRDMA_WC_IP_CSUM_OK = 1 << 3, PVRDMA_WC_WITH_SMAC = 1 << 4, PVRDMA_WC_WITH_VLAN = 1 << 5, - PVRDMA_WC_FLAGS_MAX = PVRDMA_WC_WITH_VLAN, + PVRDMA_WC_WITH_NETWORK_HDR_TYPE = 1 << 6, + PVRDMA_WC_FLAGS_MAX = PVRDMA_WC_WITH_NETWORK_HDR_TYPE, }; struct 
pvrdma_alloc_ucontext_resp { @@ -283,7 +284,8 @@ struct pvrdma_cqe { __u8 dlid_path_bits; __u8 port_num; __u8 smac[6]; - __u8 reserved2[7]; /* Pad to next power of 2 (64). */ + __u8 network_hdr_type; + __u8 reserved2[6]; /* Pad to next power of 2 (64). */ }; #endif /* __VMW_PVRDMA_ABI_H__ */ -- cgit v1.2.3 From fac9658cabb98afb68ef1630c558864e6f559c07 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:06:57 +0300 Subject: IB/core: Add new ioctl interface In this ioctl interface, processing the command starts from properties of the command and fetching the appropriate user objects before calling the handler. Parsing and validation is done according to a specifier declared by the driver's code. In the driver, all supported objects are declared. These objects are separated into different object namespaces. Dividing objects into namespaces is done at initialization by using the higher bits of the object ids. This initialization can mix objects declared in different places into one parsing tree used in this ioctl interface. For each object we list all supported methods. Similarly to objects, methods are separated into method namespaces too. Namespacing is done similarly to the objects case. This could be used in order to add methods to an existing object. Each method has a specific handler, which could be either a default handler or a driver specific handler. Along with the handler, a bunch of attributes are specified as well. Similarly to objects and methods, attributes are namespaced and hashed by their ids at initialization too. All supported attributes are subject to automatic fetching and validation. These attributes include the command, response and the method's related objects' ids. When these entities (objects, methods and attributes) are used, the high bits of the entities' ids are used in order to calculate the hash bucket index. Then, these high bits are masked out in order to have a zero based index.
Since we use these high bits for both bucketing and namespacing, we get a compact representation and O(1) array access. This is mandatory for efficient dispatching. Each attribute has a type (PTR_IN, PTR_OUT, IDR and FD) and a length. Attributes could be validated through some attributes, like: (*) Minimum size / Exact size (*) Fops for FD (*) Object type for IDR If an IDR/fd attribute is specified, the kernel also states the object type and the required access (NEW, WRITE, READ or DESTROY). All uobject/fd management is done automatically by the infrastructure, meaning - the infrastructure will fail concurrent commands that at least one of them requires concurrent access (WRITE/DESTROY), synchronize actions with device removals (dissociate context events) and take care of reference counting (increase/decrease) for concurrent actions invocation. The reference counts on the actual kernel objects shall be handled by the handlers. objects +--------+ | | | | methods +--------+ | | ns method method_spec +-----+ |len | +--------+ +------+[d]+-------+ +----------------+[d]+------------+ |attr1+-> |type | | object +> |method+-> | spec +-> + attr_buckets +-> |default_chain+--> +-----+ |idr_type| +--------+ +------+ |handler| | | +------------+ |attr2| |access | | | | | +-------+ +----------------+ |driver chain| +-----+ +--------+ | | | | +------------+ | | +------+ | | | | | | | | | | | | | | | | | | | | +--------+ [d] = Hash ids to groups using the high order bits The right types table is also chosen by using the high bits from the ids. Currently we have either default or driver specific groups. Once validation and object fetching (or creation) completed, we call the handler: int (*handler)(struct ib_device *ib_dev, struct ib_uverbs_file *ufile, struct uverbs_attr_bundle *ctx); ctx bundles attributes of different namespaces. Each element there is an array of attributes which corresponds to one namespaces of attributes. 
For example, in the usually used case: ctx core +----------------------------+ +------------+ | core: +---> | valid | +----------------------------+ | cmd_attr | | driver: | +------------+ |----------------------------+--+ | valid | | | cmd_attr | | +------------+ | | valid | | | obj_attr | | +------------+ | | drivers | +------------+ +> | valid | | cmd_attr | +------------+ | valid | | cmd_attr | +------------+ | valid | | obj_attr | +------------+ Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/rdma_core.c | 46 +++++ drivers/infiniband/core/rdma_core.h | 5 + drivers/infiniband/core/uverbs_ioctl.c | 364 +++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 2 + include/rdma/uverbs_ioctl.h | 101 ++++++++- include/uapi/rdma/rdma_user_ioctl.h | 33 +++ 7 files changed, 543 insertions(+), 10 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_ioctl.c (limited to 'include') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 920609a0872e..746756dc9877 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -32,4 +32,4 @@ ib_umad-y := user_mad.o ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ - rdma_core.o uverbs_std_types.o + rdma_core.o uverbs_std_types.o uverbs_ioctl.o diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 0fe8ef913387..2a2f002ac7cb 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -36,10 +36,56 @@ #include #include #include +#include #include "uverbs.h" #include "core_priv.h" #include "rdma_core.h" +int uverbs_ns_idx(u16 *id, unsigned int ns_count) +{ + int ret = (*id & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT; + + if (ret >= ns_count) + return -EINVAL; + + *id &= ~UVERBS_ID_NS_MASK; + return ret; +} + +const struct uverbs_object_spec 
*uverbs_get_object(const struct ib_device *ibdev, + uint16_t object) +{ + const struct uverbs_root_spec *object_hash = ibdev->specs_root; + const struct uverbs_object_spec_hash *objects; + int ret = uverbs_ns_idx(&object, object_hash->num_buckets); + + if (ret < 0) + return NULL; + + objects = object_hash->object_buckets[ret]; + + if (object >= objects->num_objects) + return NULL; + + return objects->objects[object]; +} + +const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, + uint16_t method) +{ + const struct uverbs_method_spec_hash *methods; + int ret = uverbs_ns_idx(&method, object->num_buckets); + + if (ret < 0) + return NULL; + + methods = object->method_buckets[ret]; + if (method >= methods->num_methods) + return NULL; + + return methods->methods[method]; +} + void uverbs_uobject_get(struct ib_uobject *uobject) { kref_get(&uobject->ref); diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 9ed6ad0324c7..1efcf93238dd 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -43,6 +43,11 @@ #include #include +int uverbs_ns_idx(u16 *id, unsigned int ns_count); +const struct uverbs_object_spec *uverbs_get_object(const struct ib_device *ibdev, + uint16_t object); +const struct uverbs_method_spec *uverbs_get_method(const struct uverbs_object_spec *object, + uint16_t method); /* * These functions initialize the context and cleanups its uobjects. * The context has a list of objects which is protected by a mutex diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c new file mode 100644 index 000000000000..5286ad57d903 --- /dev/null +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include "rdma_core.h" +#include "uverbs.h" + +static int uverbs_process_attr(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + const struct ib_uverbs_attr *uattr, + u16 attr_id, + const struct uverbs_attr_spec_hash *attr_spec_bucket, + struct uverbs_attr_bundle_hash *attr_bundle_h, + struct ib_uverbs_attr __user *uattr_ptr) +{ + const struct uverbs_attr_spec *spec; + struct uverbs_attr *e; + const struct uverbs_object_spec *object; + struct uverbs_obj_attr *o_attr; + struct uverbs_attr *elements = attr_bundle_h->attrs; + + if (uattr->reserved) + return -EINVAL; + + if (attr_id >= attr_spec_bucket->num_attrs) { + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) + return -EINVAL; + else + return 0; + } + + spec = &attr_spec_bucket->attrs[attr_id]; + e = &elements[attr_id]; + e->uattr = uattr_ptr; + + switch (spec->type) { + case UVERBS_ATTR_TYPE_PTR_IN: + case UVERBS_ATTR_TYPE_PTR_OUT: + if (uattr->len < spec->len || + (!(spec->flags & UVERBS_ATTR_SPEC_F_MIN_SZ) && + uattr->len > spec->len)) + return -EINVAL; + + e->ptr_attr.data = uattr->data; + e->ptr_attr.len = uattr->len; + e->ptr_attr.flags = uattr->flags; + break; + + case UVERBS_ATTR_TYPE_IDR: + if (uattr->data >> 32) + return -EINVAL; + /* fall through */ + case UVERBS_ATTR_TYPE_FD: + if (uattr->len != 0 || !ucontext || uattr->data > INT_MAX) + return -EINVAL; + + o_attr = &e->obj_attr; + object = uverbs_get_object(ibdev, spec->obj.obj_type); + if (!object) + return -EINVAL; + o_attr->type = object->type_attrs; + + o_attr->id = (int)uattr->data; + o_attr->uobject = uverbs_get_uobject_from_context( + o_attr->type, + ucontext, + spec->obj.access, + o_attr->id); + + if (IS_ERR(o_attr->uobject)) + return PTR_ERR(o_attr->uobject); + + if (spec->obj.access == UVERBS_ACCESS_NEW) { + u64 id = o_attr->uobject->id; + + /* Copy the allocated id to the user-space */ + if (put_user(id, &e->uattr->data)) { + uverbs_finalize_object(o_attr->uobject, + UVERBS_ACCESS_NEW, + false); + return 
-EFAULT; + } + } + + break; + default: + return -EOPNOTSUPP; + } + + set_bit(attr_id, attr_bundle_h->valid_bitmap); + return 0; +} + +static int uverbs_uattrs_process(struct ib_device *ibdev, + struct ib_ucontext *ucontext, + const struct ib_uverbs_attr *uattrs, + size_t num_uattrs, + const struct uverbs_method_spec *method, + struct uverbs_attr_bundle *attr_bundle, + struct ib_uverbs_attr __user *uattr_ptr) +{ + size_t i; + int ret = 0; + int num_given_buckets = 0; + + for (i = 0; i < num_uattrs; i++) { + const struct ib_uverbs_attr *uattr = &uattrs[i]; + u16 attr_id = uattr->attr_id; + struct uverbs_attr_spec_hash *attr_spec_bucket; + + ret = uverbs_ns_idx(&attr_id, method->num_buckets); + if (ret < 0) { + if (uattr->flags & UVERBS_ATTR_F_MANDATORY) { + uverbs_finalize_objects(attr_bundle, + method->attr_buckets, + num_given_buckets, + false); + return ret; + } + continue; + } + + /* + * ret is the found ns, so increase num_given_buckets if + * necessary. + */ + if (ret >= num_given_buckets) + num_given_buckets = ret + 1; + + attr_spec_bucket = method->attr_buckets[ret]; + ret = uverbs_process_attr(ibdev, ucontext, uattr, attr_id, + attr_spec_bucket, &attr_bundle->hash[ret], + uattr_ptr++); + if (ret) { + uverbs_finalize_objects(attr_bundle, + method->attr_buckets, + num_given_buckets, + false); + return ret; + } + } + + return num_given_buckets; +} + +static int uverbs_validate_kernel_mandatory(const struct uverbs_method_spec *method_spec, + struct uverbs_attr_bundle *attr_bundle) +{ + unsigned int i; + + for (i = 0; i < attr_bundle->num_buckets; i++) { + struct uverbs_attr_spec_hash *attr_spec_bucket = + method_spec->attr_buckets[i]; + + if (!bitmap_subset(attr_spec_bucket->mandatory_attrs_bitmask, + attr_bundle->hash[i].valid_bitmap, + attr_spec_bucket->num_attrs)) + return -EINVAL; + } + + return 0; +} + +static int uverbs_handle_method(struct ib_uverbs_attr __user *uattr_ptr, + const struct ib_uverbs_attr *uattrs, + size_t num_uattrs, + struct ib_device 
*ibdev, + struct ib_uverbs_file *ufile, + const struct uverbs_method_spec *method_spec, + struct uverbs_attr_bundle *attr_bundle) +{ + int ret; + int finalize_ret; + int num_given_buckets; + + num_given_buckets = uverbs_uattrs_process(ibdev, ufile->ucontext, uattrs, + num_uattrs, method_spec, + attr_bundle, uattr_ptr); + if (num_given_buckets <= 0) + return -EINVAL; + + attr_bundle->num_buckets = num_given_buckets; + ret = uverbs_validate_kernel_mandatory(method_spec, attr_bundle); + if (ret) + goto cleanup; + + ret = method_spec->handler(ibdev, ufile, attr_bundle); +cleanup: + finalize_ret = uverbs_finalize_objects(attr_bundle, + method_spec->attr_buckets, + attr_bundle->num_buckets, + !ret); + + return ret ? ret : finalize_ret; +} + +#define UVERBS_OPTIMIZE_USING_STACK_SZ 256 +static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct ib_uverbs_ioctl_hdr *hdr, + void __user *buf) +{ + const struct uverbs_object_spec *object_spec; + const struct uverbs_method_spec *method_spec; + long err = 0; + unsigned int i; + struct { + struct ib_uverbs_attr *uattrs; + struct uverbs_attr_bundle *uverbs_attr_bundle; + } *ctx = NULL; + struct uverbs_attr *curr_attr; + unsigned long *curr_bitmap; + size_t ctx_size; +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + uintptr_t data[UVERBS_OPTIMIZE_USING_STACK_SZ / sizeof(uintptr_t)]; +#endif + + if (hdr->reserved) + return -EINVAL; + + object_spec = uverbs_get_object(ib_dev, hdr->object_id); + if (!object_spec) + return -EOPNOTSUPP; + + method_spec = uverbs_get_method(object_spec, hdr->method_id); + if (!method_spec) + return -EOPNOTSUPP; + + if ((method_spec->flags & UVERBS_ACTION_FLAG_CREATE_ROOT) ^ !file->ucontext) + return -EINVAL; + + ctx_size = sizeof(*ctx) + + sizeof(struct uverbs_attr_bundle) + + sizeof(struct uverbs_attr_bundle_hash) * method_spec->num_buckets + + sizeof(*ctx->uattrs) * hdr->num_attrs + + sizeof(*ctx->uverbs_attr_bundle->hash[0].attrs) * + method_spec->num_child_attrs + + 
sizeof(*ctx->uverbs_attr_bundle->hash[0].valid_bitmap) * + (method_spec->num_child_attrs / BITS_PER_LONG + + method_spec->num_buckets); + +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + if (ctx_size <= UVERBS_OPTIMIZE_USING_STACK_SZ) + ctx = (void *)data; + + if (!ctx) +#endif + ctx = kmalloc(ctx_size, GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->uverbs_attr_bundle = (void *)ctx + sizeof(*ctx); + ctx->uattrs = (void *)(ctx->uverbs_attr_bundle + 1) + + (sizeof(ctx->uverbs_attr_bundle->hash[0]) * + method_spec->num_buckets); + curr_attr = (void *)(ctx->uattrs + hdr->num_attrs); + curr_bitmap = (void *)(curr_attr + method_spec->num_child_attrs); + + /* + * We just fill the pointers and num_attrs here. The data itself will be + * filled at a later stage (uverbs_process_attr) + */ + for (i = 0; i < method_spec->num_buckets; i++) { + unsigned int curr_num_attrs = method_spec->attr_buckets[i]->num_attrs; + + ctx->uverbs_attr_bundle->hash[i].attrs = curr_attr; + curr_attr += curr_num_attrs; + ctx->uverbs_attr_bundle->hash[i].num_attrs = curr_num_attrs; + ctx->uverbs_attr_bundle->hash[i].valid_bitmap = curr_bitmap; + bitmap_zero(curr_bitmap, curr_num_attrs); + curr_bitmap += BITS_TO_LONGS(curr_num_attrs); + } + + err = copy_from_user(ctx->uattrs, buf, + sizeof(*ctx->uattrs) * hdr->num_attrs); + if (err) { + err = -EFAULT; + goto out; + } + + err = uverbs_handle_method(buf, ctx->uattrs, hdr->num_attrs, ib_dev, + file, method_spec, ctx->uverbs_attr_bundle); +out: +#ifdef UVERBS_OPTIMIZE_USING_STACK_SZ + if (ctx_size > UVERBS_OPTIMIZE_USING_STACK_SZ) +#endif + kfree(ctx); + return err; +} + +#define IB_UVERBS_MAX_CMD_SZ 4096 + +long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct ib_uverbs_file *file = filp->private_data; + struct ib_uverbs_ioctl_hdr __user *user_hdr = + (struct ib_uverbs_ioctl_hdr __user *)arg; + struct ib_uverbs_ioctl_hdr hdr; + struct ib_device *ib_dev; + int srcu_key; + long err; + + srcu_key = 
srcu_read_lock(&file->device->disassociate_srcu); + ib_dev = srcu_dereference(file->device->ib_dev, + &file->device->disassociate_srcu); + if (!ib_dev) { + err = -EIO; + goto out; + } + + if (cmd == RDMA_VERBS_IOCTL) { + err = copy_from_user(&hdr, user_hdr, sizeof(hdr)); + + if (err || hdr.length > IB_UVERBS_MAX_CMD_SZ || + hdr.length != sizeof(hdr) + hdr.num_attrs * sizeof(struct ib_uverbs_attr)) { + err = -EINVAL; + goto out; + } + + if (hdr.reserved) { + err = -EOPNOTSUPP; + goto out; + } + + err = ib_uverbs_cmd_verbs(ib_dev, file, &hdr, + (__user void *)arg + sizeof(hdr)); + } else { + err = -ENOIOCTLCMD; + } +out: + srcu_read_unlock(&file->device->disassociate_srcu, srcu_key); + + return err; +} diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 1b4bb8743969..e6df68048517 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2348,6 +2348,8 @@ struct ib_device { void (*get_dev_fw_str)(struct ib_device *, char *str); const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev, int comp_vector); + + struct uverbs_root_spec *specs_root; }; struct ib_client { diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index d3ec02b7d937..f83f56329761 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -43,6 +43,8 @@ enum uverbs_attr_type { UVERBS_ATTR_TYPE_NA, + UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_TYPE_PTR_OUT, UVERBS_ATTR_TYPE_IDR, UVERBS_ATTR_TYPE_FD, }; @@ -54,29 +56,110 @@ enum uverbs_obj_access { UVERBS_ACCESS_DESTROY }; +enum { + UVERBS_ATTR_SPEC_F_MANDATORY = 1U << 0, + /* Support extending attributes by length */ + UVERBS_ATTR_SPEC_F_MIN_SZ = 1U << 1, +}; + struct uverbs_attr_spec { enum uverbs_attr_type type; - struct { - /* - * higher bits mean the namespace and lower bits mean - * the type id within the namespace. 
- */ - u16 obj_type; - u8 access; - } obj; + union { + u16 len; + struct { + /* + * higher bits mean the namespace and lower bits mean + * the type id within the namespace. + */ + u16 obj_type; + u8 access; + } obj; + }; + /* Combination of bits from enum UVERBS_ATTR_SPEC_F_XXXX */ + u8 flags; }; struct uverbs_attr_spec_hash { size_t num_attrs; + unsigned long *mandatory_attrs_bitmask; struct uverbs_attr_spec attrs[0]; }; +struct uverbs_attr_bundle; +struct ib_uverbs_file; + +enum { + /* + * Action marked with this flag creates a context (or root for all + * objects). + */ + UVERBS_ACTION_FLAG_CREATE_ROOT = 1U << 0, +}; + +struct uverbs_method_spec { + /* Combination of bits from enum UVERBS_ACTION_FLAG_XXXX */ + u32 flags; + size_t num_buckets; + size_t num_child_attrs; + int (*handler)(struct ib_device *ib_dev, struct ib_uverbs_file *ufile, + struct uverbs_attr_bundle *ctx); + struct uverbs_attr_spec_hash *attr_buckets[0]; +}; + +struct uverbs_method_spec_hash { + size_t num_methods; + struct uverbs_method_spec *methods[0]; +}; + +struct uverbs_object_spec { + const struct uverbs_obj_type *type_attrs; + size_t num_buckets; + struct uverbs_method_spec_hash *method_buckets[0]; +}; + +struct uverbs_object_spec_hash { + size_t num_objects; + struct uverbs_object_spec *objects[0]; +}; + +struct uverbs_root_spec { + size_t num_buckets; + struct uverbs_object_spec_hash *object_buckets[0]; +}; + +/* ================================================= + * Parsing infrastructure + * ================================================= + */ + +struct uverbs_ptr_attr { + union { + u64 data; + void __user *ptr; + }; + u16 len; + /* Combination of bits from enum UVERBS_ATTR_F_XXXX */ + u16 flags; +}; + struct uverbs_obj_attr { + /* pointer to the kernel descriptor -> type, access, etc */ + const struct uverbs_obj_type *type; struct ib_uobject *uobject; + /* fd or id in idr of this object */ + int id; }; struct uverbs_attr { - struct uverbs_obj_attr obj_attr; + /* + * pointer to the 
user-space given attribute, in order to write the + * new uobject's id or update flags. + */ + struct ib_uverbs_attr __user *uattr; + union { + struct uverbs_ptr_attr ptr_attr; + struct uverbs_obj_attr obj_attr; + }; }; struct uverbs_attr_bundle_hash { diff --git a/include/uapi/rdma/rdma_user_ioctl.h b/include/uapi/rdma/rdma_user_ioctl.h index 9388125ad51b..165a27e969d5 100644 --- a/include/uapi/rdma/rdma_user_ioctl.h +++ b/include/uapi/rdma/rdma_user_ioctl.h @@ -43,6 +43,39 @@ /* Legacy name, for user space application which already use it */ #define IB_IOCTL_MAGIC RDMA_IOCTL_MAGIC +#define RDMA_VERBS_IOCTL \ + _IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr) + +#define UVERBS_ID_NS_MASK 0xF000 +#define UVERBS_ID_NS_SHIFT 12 + +enum { + /* User input */ + UVERBS_ATTR_F_MANDATORY = 1U << 0, + /* + * Valid output bit should be ignored and considered set in + * mandatory fields. This bit is kernel output. + */ + UVERBS_ATTR_F_VALID_OUTPUT = 1U << 1, +}; + +struct ib_uverbs_attr { + __u16 attr_id; /* command specific type attribute */ + __u16 len; /* only for pointers */ + __u16 flags; /* combination of UVERBS_ATTR_F_XXXX */ + __u16 reserved; + __u64 data; /* ptr to command, inline data or idr/fd */ +}; + +struct ib_uverbs_ioctl_hdr { + __u16 length; + __u16 object_id; + __u16 method_id; + __u16 num_attrs; + __u64 reserved; + struct ib_uverbs_attr attrs[0]; +}; + /* * General blocks assignments * It is closed on purpose do not expose it it user space -- cgit v1.2.3 From 5009010fbf54bdc27e57baca490e1f9d6a4609e0 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:06:58 +0300 Subject: IB/core: Declare an object instead of declaring only type attributes Switch all uverbs_type_attrs_xxxx with DECLARE_UVERBS_OBJECT macros. This will be later used in order to embed the object specific methods in the objects as well. 
Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_std_types.c | 112 +++++++++++++---------------- include/rdma/uverbs_ioctl.h | 16 +++++ include/rdma/uverbs_std_types.h | 40 +++++------ include/rdma/uverbs_types.h | 38 ++++++---- 4 files changed, 107 insertions(+), 99 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index ef293379f37a..b75c7da0d0a4 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -209,67 +209,51 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_ return 0; }; -const struct uverbs_obj_fd_type uverbs_type_attrs_comp_channel = { - .type = UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file), 0), - .context_closed = uverbs_hot_unplug_completion_event_file, - .fops = &uverbs_event_fops, - .name = "[infinibandevent]", - .flags = O_RDONLY, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_cq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0), - .destroy_object = uverbs_free_cq, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_qp = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0), - .destroy_object = uverbs_free_qp, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_mw = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_mw, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_mr = { - /* 1 is used in order to free the MR after all the MWs */ - .type = UVERBS_TYPE_ALLOC_IDR(1), - .destroy_object = uverbs_free_mr, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_srq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0), - .destroy_object = uverbs_free_srq, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_ah = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = 
uverbs_free_ah, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_flow = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_flow, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_wq = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0), - .destroy_object = uverbs_free_wq, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_rwq_ind_table = { - .type = UVERBS_TYPE_ALLOC_IDR(0), - .destroy_object = uverbs_free_rwq_ind_tbl, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_xrcd = { - .type = UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0), - .destroy_object = uverbs_free_xrcd, -}; - -const struct uverbs_obj_idr_type uverbs_type_attrs_pd = { - /* 2 is used in order to free the PD after MRs */ - .type = UVERBS_TYPE_ALLOC_IDR(2), - .destroy_object = uverbs_free_pd, -}; +DECLARE_UVERBS_OBJECT(uverbs_object_comp_channel, + UVERBS_OBJECT_COMP_CHANNEL, + &UVERBS_TYPE_ALLOC_FD(0, + sizeof(struct ib_uverbs_completion_event_file), + uverbs_hot_unplug_completion_event_file, + &uverbs_event_fops, + "[infinibandevent]", O_RDONLY)); + +DECLARE_UVERBS_OBJECT(uverbs_object_cq, UVERBS_OBJECT_CQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, + uverbs_free_cq)); + +DECLARE_UVERBS_OBJECT(uverbs_object_qp, UVERBS_OBJECT_QP, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0, + uverbs_free_qp)); + +DECLARE_UVERBS_OBJECT(uverbs_object_mw, UVERBS_OBJECT_MW, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_mw)); + +DECLARE_UVERBS_OBJECT(uverbs_object_mr, UVERBS_OBJECT_MR, + /* 1 is used in order to free the MR after all the MWs */ + &UVERBS_TYPE_ALLOC_IDR(1, uverbs_free_mr)); + +DECLARE_UVERBS_OBJECT(uverbs_object_srq, UVERBS_OBJECT_SRQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_usrq_object), 0, + uverbs_free_srq)); + +DECLARE_UVERBS_OBJECT(uverbs_object_ah, UVERBS_OBJECT_AH, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_ah)); + +DECLARE_UVERBS_OBJECT(uverbs_object_flow, UVERBS_OBJECT_FLOW, + 
&UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_flow)); + +DECLARE_UVERBS_OBJECT(uverbs_object_wq, UVERBS_OBJECT_WQ, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uwq_object), 0, + uverbs_free_wq)); + +DECLARE_UVERBS_OBJECT(uverbs_object_rwq_ind_table, + UVERBS_OBJECT_RWQ_IND_TBL, + &UVERBS_TYPE_ALLOC_IDR(0, uverbs_free_rwq_ind_tbl)); + +DECLARE_UVERBS_OBJECT(uverbs_object_xrcd, UVERBS_OBJECT_XRCD, + &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uxrcd_object), 0, + uverbs_free_xrcd)); + +DECLARE_UVERBS_OBJECT(uverbs_object_pd, UVERBS_OBJECT_PD, + /* 2 is used in order to free the PD after MRs */ + &UVERBS_TYPE_ALLOC_IDR(2, uverbs_free_pd)); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index f83f56329761..99130083615e 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -127,6 +127,22 @@ struct uverbs_root_spec { struct uverbs_object_spec_hash *object_buckets[0]; }; +/* + * ======================================= + * Verbs definitions + * ======================================= + */ + +struct uverbs_object_def { + const struct uverbs_obj_type *type_attrs; +}; + +#define _UVERBS_OBJECT(_id, _type_attrs, ...) \ + ((const struct uverbs_object_def) { \ + .type_attrs = _type_attrs}) +#define DECLARE_UVERBS_OBJECT(_name, _id, _type_attrs, ...) 
\ + const struct uverbs_object_def _name = \ + _UVERBS_OBJECT(_id, _type_attrs, ##__VA_ARGS__) /* ================================================= * Parsing infrastructure * ================================================= diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 7771ce966952..eda271b4aa6c 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -35,18 +35,18 @@ #include -extern const struct uverbs_obj_fd_type uverbs_type_attrs_comp_channel; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_cq; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_qp; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_rwq_ind_table; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_wq; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_srq; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_ah; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_flow; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_mr; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_mw; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_pd; -extern const struct uverbs_obj_idr_type uverbs_type_attrs_xrcd; +extern const struct uverbs_object_def uverbs_object_comp_channel; +extern const struct uverbs_object_def uverbs_object_cq; +extern const struct uverbs_object_def uverbs_object_qp; +extern const struct uverbs_object_def uverbs_object_rwq_ind_table; +extern const struct uverbs_object_def uverbs_object_wq; +extern const struct uverbs_object_def uverbs_object_srq; +extern const struct uverbs_object_def uverbs_object_ah; +extern const struct uverbs_object_def uverbs_object_flow; +extern const struct uverbs_object_def uverbs_object_mr; +extern const struct uverbs_object_def uverbs_object_mw; +extern const struct uverbs_object_def uverbs_object_pd; +extern const struct uverbs_object_def uverbs_object_xrcd; static inline struct ib_uobject *__uobj_get(const struct 
uverbs_obj_type *type, bool write, @@ -56,22 +56,22 @@ static inline struct ib_uobject *__uobj_get(const struct uverbs_obj_type *type, return rdma_lookup_get_uobject(type, ucontext, id, write); } -#define uobj_get_type(_type) uverbs_type_attrs_##_type.type +#define uobj_get_type(_object) uverbs_object_##_object.type_attrs #define uobj_get_read(_type, _id, _ucontext) \ - __uobj_get(&(_type), false, _ucontext, _id) + __uobj_get(_type, false, _ucontext, _id) -#define uobj_get_obj_read(_type, _id, _ucontext) \ +#define uobj_get_obj_read(_object, _id, _ucontext) \ ({ \ - struct ib_uobject *uobj = \ - __uobj_get(&uobj_get_type(_type), \ + struct ib_uobject *__uobj = \ + __uobj_get(uverbs_object_##_object.type_attrs, \ false, _ucontext, _id); \ \ - (struct ib_##_type *)(IS_ERR(uobj) ? NULL : uobj->object); \ + (struct ib_##_object *)(IS_ERR(__uobj) ? NULL : __uobj->object);\ }) #define uobj_get_write(_type, _id, _ucontext) \ - __uobj_get(&(_type), true, _ucontext, _id) + __uobj_get(_type, true, _ucontext, _id) static inline void uobj_put_read(struct ib_uobject *uobj) { @@ -108,7 +108,7 @@ static inline struct ib_uobject *__uobj_alloc(const struct uverbs_obj_type *type } #define uobj_alloc(_type, ucontext) \ - __uobj_alloc(&(_type), ucontext) + __uobj_alloc(_type, ucontext) #endif diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h index 351ea185df44..9760b6d70744 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -151,22 +151,30 @@ extern const struct uverbs_obj_type_class uverbs_fd_class; #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ sizeof(char)) -#define UVERBS_TYPE_ALLOC_FD(_size, _order) \ - { \ - .destroy_order = _order, \ - .type_class = &uverbs_fd_class, \ - .obj_size = (_size) + \ - UVERBS_BUILD_BUG_ON((_size) < \ - sizeof(struct ib_uobject_file)),\ - } -#define UVERBS_TYPE_ALLOC_IDR_SZ(_size, _order) \ - { \ +#define UVERBS_TYPE_ALLOC_FD(_order, _obj_size, _context_closed, _fops, _name, 
_flags)\ + ((&((const struct uverbs_obj_fd_type) \ + {.type = { \ + .destroy_order = _order, \ + .type_class = &uverbs_fd_class, \ + .obj_size = (_obj_size) + \ + UVERBS_BUILD_BUG_ON((_obj_size) < sizeof(struct ib_uobject_file)), \ + }, \ + .context_closed = _context_closed, \ + .fops = _fops, \ + .name = _name, \ + .flags = _flags}))->type) +#define UVERBS_TYPE_ALLOC_IDR_SZ(_size, _order, _destroy_object) \ + ((&((const struct uverbs_obj_idr_type) \ + {.type = { \ .destroy_order = _order, \ .type_class = &uverbs_idr_class, \ .obj_size = (_size) + \ - UVERBS_BUILD_BUG_ON((_size) < \ - sizeof(struct ib_uobject)), \ - } -#define UVERBS_TYPE_ALLOC_IDR(_order) \ - UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uobject), _order) + UVERBS_BUILD_BUG_ON((_size) < \ + sizeof(struct ib_uobject)) \ + }, \ + .destroy_object = _destroy_object,}))->type) +#define UVERBS_TYPE_ALLOC_IDR(_order, _destroy_object) \ + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uobject), _order, \ + _destroy_object) + #endif -- cgit v1.2.3 From 09e3ebf8c193d3f154c4ffb7cb18995df0243bc6 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:06:59 +0300 Subject: IB/core: Add DEVICE object and root tree structure This adds the DEVICE object. This object supports creating the context that all objects are created from. Moreover, it supports executing methods which are related to the device itself, such as QUERY_DEVICE. This is a singleton object (per file instance). All standard objects are put in the root structure. This root will later on be used in drivers as the source for their whole parsing tree. Later on, when new features are added, these drivers could mix this root with other customized objects. 
Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_std_types.c | 17 +++++++++++++++ include/rdma/uverbs_ioctl.h | 35 ++++++++++++++++++++++++++++++ include/rdma/uverbs_std_types.h | 18 +++++++++++++++ 3 files changed, 70 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index b75c7da0d0a4..5f90978bda8d 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -257,3 +257,20 @@ DECLARE_UVERBS_OBJECT(uverbs_object_xrcd, UVERBS_OBJECT_XRCD, DECLARE_UVERBS_OBJECT(uverbs_object_pd, UVERBS_OBJECT_PD, /* 2 is used in order to free the PD after MRs */ &UVERBS_TYPE_ALLOC_IDR(2, uverbs_free_pd)); + +DECLARE_UVERBS_OBJECT(uverbs_object_device, UVERBS_OBJECT_DEVICE, NULL); + +DECLARE_UVERBS_OBJECT_TREE(uverbs_default_objects, + &uverbs_object_device, + &uverbs_object_pd, + &uverbs_object_mr, + &uverbs_object_comp_channel, + &uverbs_object_cq, + &uverbs_object_qp, + &uverbs_object_ah, + &uverbs_object_mw, + &uverbs_object_srq, + &uverbs_object_flow, + &uverbs_object_wq, + &uverbs_object_rwq_ind_table, + &uverbs_object_xrcd); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 99130083615e..2e8925434d74 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -133,16 +133,51 @@ struct uverbs_root_spec { * ======================================= */ +struct uverbs_attr_def { + u16 id; + struct uverbs_attr_spec attr; +}; + +struct uverbs_method_def { + u16 id; + /* Combination of bits from enum UVERBS_ACTION_FLAG_XXXX */ + u32 flags; + size_t num_attrs; + const struct uverbs_attr_def * const (*attrs)[]; + int (*handler)(struct ib_device *ib_dev, struct ib_uverbs_file *ufile, + struct uverbs_attr_bundle *ctx); +}; + struct uverbs_object_def { + u16 id; const struct uverbs_obj_type *type_attrs; + size_t num_methods; + const struct 
uverbs_method_def * const (*methods)[]; +}; + +struct uverbs_object_tree_def { + size_t num_objects; + const struct uverbs_object_def * const (*objects)[]; }; #define _UVERBS_OBJECT(_id, _type_attrs, ...) \ ((const struct uverbs_object_def) { \ + .id = _id, \ .type_attrs = _type_attrs}) #define DECLARE_UVERBS_OBJECT(_name, _id, _type_attrs, ...) \ const struct uverbs_object_def _name = \ _UVERBS_OBJECT(_id, _type_attrs, ##__VA_ARGS__) +#define _UVERBS_TREE_OBJECTS_SZ(...) \ + (sizeof((const struct uverbs_object_def * const []){__VA_ARGS__}) / \ + sizeof(const struct uverbs_object_def *)) +#define _UVERBS_OBJECT_TREE(...) \ + ((const struct uverbs_object_tree_def) { \ + .num_objects = _UVERBS_TREE_OBJECTS_SZ(__VA_ARGS__), \ + .objects = &(const struct uverbs_object_def * const []){__VA_ARGS__} }) +#define DECLARE_UVERBS_OBJECT_TREE(_name, ...) \ + const struct uverbs_object_tree_def _name = \ + _UVERBS_OBJECT_TREE(__VA_ARGS__) + /* ================================================= * Parsing infrastructure * ================================================= diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index eda271b4aa6c..bef74099b7c5 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -35,6 +35,23 @@ #include +enum uverbs_default_objects { + UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ + UVERBS_OBJECT_PD, + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_OBJECT_CQ, + UVERBS_OBJECT_QP, + UVERBS_OBJECT_SRQ, + UVERBS_OBJECT_AH, + UVERBS_OBJECT_MR, + UVERBS_OBJECT_MW, + UVERBS_OBJECT_FLOW, + UVERBS_OBJECT_XRCD, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_OBJECT_WQ, + UVERBS_OBJECT_LAST, +}; + extern const struct uverbs_object_def uverbs_object_comp_channel; extern const struct uverbs_object_def uverbs_object_cq; extern const struct uverbs_object_def uverbs_object_qp; @@ -47,6 +64,7 @@ extern const struct uverbs_object_def uverbs_object_mr; extern const struct uverbs_object_def uverbs_object_mw; extern 
const struct uverbs_object_def uverbs_object_pd; extern const struct uverbs_object_def uverbs_object_xrcd; +extern const struct uverbs_object_def uverbs_object_device; static inline struct ib_uobject *__uobj_get(const struct uverbs_obj_type *type, bool write, -- cgit v1.2.3 From 118620d3686b2d624f9a5019f2f14c64cf50d21a Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:07:00 +0300 Subject: IB/core: Add uverbs merge trees functionality Different drivers support different features and even subset of the common uverbs implementation. Currently, this is handled as bitmask in every driver that represents which kind of methods it supports, but doesn't go down to attributes granularity. Moreover, drivers might want to add their specific types, methods and attributes to let their user-space counter-parts be exposed to some more efficient abstractions. It means that existence of different features is validated syntactically via the parsing infrastructure rather than using a complex in-handler logic. In order to do that, we allow defining features and abstractions as parsing trees. These per-feature parsing tree could be merged to an efficient (perfect-hash based) parsing tree, which is later used by the parsing infrastructure. To sum it up, this makes a parse tree unique for a device and represents only the features this particular device supports. This is done by having a root specification tree per feature. Before a device registers itself as an IB device, it merges all these trees into one parsing tree. This parsing tree is used to parse all user-space commands. A future user-space application could read this parse tree. This tree represents which objects, methods and attributes are supported by this device. 
This is based on the idea of Jason Gunthorpe Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/Makefile | 3 +- drivers/infiniband/core/uverbs_ioctl_merge.c | 665 +++++++++++++++++++++++++++ include/rdma/uverbs_ioctl.h | 40 +- 3 files changed, 706 insertions(+), 2 deletions(-) create mode 100644 drivers/infiniband/core/uverbs_ioctl_merge.c (limited to 'include') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 746756dc9877..b4df164f71a6 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -32,4 +32,5 @@ ib_umad-y := user_mad.o ib_ucm-y := ucm.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ - rdma_core.o uverbs_std_types.o uverbs_ioctl.o + rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ + uverbs_ioctl_merge.o diff --git a/drivers/infiniband/core/uverbs_ioctl_merge.c b/drivers/infiniband/core/uverbs_ioctl_merge.c new file mode 100644 index 000000000000..76ddb6564578 --- /dev/null +++ b/drivers/infiniband/core/uverbs_ioctl_merge.c @@ -0,0 +1,665 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include "uverbs.h" + +#define UVERBS_NUM_NS (UVERBS_ID_NS_MASK >> UVERBS_ID_NS_SHIFT) +#define GET_NS_ID(idx) (((idx) & UVERBS_ID_NS_MASK) >> UVERBS_ID_NS_SHIFT) +#define GET_ID(idx) ((idx) & ~UVERBS_ID_NS_MASK) + +#define _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset, \ + buckets_offset) \ + for (tmpj = 0, \ + elem = (*(const void ***)((hashes)[tmpi] + \ + (buckets_offset)))[0]; \ + tmpj < *(size_t *)((hashes)[tmpi] + (num_buckets_offset)); \ + tmpj++) \ + if ((elem = ((*(const void ***)(hashes[tmpi] + \ + (buckets_offset)))[tmpj]))) + +/* + * Iterate all elements of a few @hashes. The number of given hashes is + * indicated by @num_hashes. The offset of the number of buckets in the hash is + * represented by @num_buckets_offset, while the offset of the buckets array in + * the hash structure is represented by @buckets_offset. tmpi and tmpj are two + * short (or int) based indices that are given by the user. tmpi iterates over + * the different hashes. @elem points the current element in the hashes[tmpi] + * bucket we are looping on. To be honest, @hashes representation isn't exactly + * a hash, but more a collection of elements. 
These elements' ids are treated + * in a hash like manner, where the first upper bits are the bucket number. + * These elements are later mapped into a perfect-hash. + */ +#define for_each_element(elem, tmpi, tmpj, hashes, num_hashes, \ + num_buckets_offset, buckets_offset) \ + for (tmpi = 0; tmpi < (num_hashes); tmpi++) \ + _for_each_element(elem, tmpi, tmpj, hashes, num_buckets_offset,\ + buckets_offset) + +#define get_elements_iterators_entry_above(iters, num_elements, elements, \ + num_objects_fld, objects_fld, bucket,\ + min_id) \ + get_elements_above_id((const void **)iters, num_elements, \ + (const void **)(elements), \ + offsetof(typeof(**elements), \ + num_objects_fld), \ + offsetof(typeof(**elements), objects_fld),\ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket, min_id) + +#define get_objects_above_id(iters, num_trees, trees, bucket, min_id) \ + get_elements_iterators_entry_above(iters, num_trees, trees, \ + num_objects, objects, bucket, min_id) + +#define get_methods_above_id(method_iters, num_iters, iters, bucket, min_id)\ + get_elements_iterators_entry_above(method_iters, num_iters, iters, \ + num_methods, methods, bucket, min_id) + +#define get_attrs_above_id(attrs_iters, num_iters, iters, bucket, min_id)\ + get_elements_iterators_entry_above(attrs_iters, num_iters, iters, \ + num_attrs, attrs, bucket, min_id) + +/* + * get_elements_above_id get a few hashes represented by @elements and + * @num_elements. The hashes fields are described by @num_offset, @data_offset + * and @id_offset in the same way as required by for_each_element. The function + * returns an array of @iters, represents an array of elements in the hashes + * buckets, which their ids are the smallest ids in all hashes but are all + * larger than the id given by min_id. Elements are only added to the iters + * array if their id belongs to the bucket @bucket. The number of elements in + * the returned array is returned by the function. 
@min_id is also updated to + * reflect the new min_id of all elements in iters. + */ +static size_t get_elements_above_id(const void **iters, + unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset, + u16 bucket, + short *min_id) +{ + size_t num_iters = 0; + short min = SHRT_MAX; + const void *elem; + int i, j, last_stored = -1; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) != bucket) + continue; + + if (GET_ID(id) < *min_id || + (min != SHRT_MAX && GET_ID(id) > min)) + continue; + + /* + * We first iterate all hashes represented by @elements. When + * we do, we try to find an element @elem in the bucket @bucket + * which its id is min. Since we can't ensure the user sorted + * the elements in increasing order, we override this hash's + * minimal id element we found, if a new element with a smaller + * id was just found. + */ + iters[last_stored == i ? num_iters - 1 : num_iters++] = elem; + last_stored = i; + min = GET_ID(id); + } + + /* + * We only insert to our iters array an element, if its id is smaller + * than all previous ids. Therefore, the final iters array is sorted so + * that smaller ids are in the end of the array. + * Therefore, we need to clean the beginning of the array to make sure + * all ids of final elements are equal to min. 
+ */ + for (i = num_iters - 1; i >= 0 && + GET_ID(*(u16 *)(iters[i] + id_offset)) == min; i--) + ; + + num_iters -= i + 1; + memmove(iters, iters + i + 1, sizeof(*iters) * num_iters); + + *min_id = min; + return num_iters; +} + +#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ + objects_fld, bucket) \ + find_max_element_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld), \ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket) + +static short find_max_element_ns_id(unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset) +{ + short max_ns = SHRT_MIN; + const void *elem; + int i, j; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) > max_ns) + max_ns = GET_NS_ID(id); + } + + return max_ns; +} + +static short find_max_element_id(unsigned int num_elements, + const void **elements, + size_t num_offset, + size_t data_offset, + size_t id_offset, + u16 bucket) +{ + short max_id = SHRT_MIN; + const void *elem; + int i, j; + + for_each_element(elem, i, j, elements, num_elements, num_offset, + data_offset) { + u16 id = *(u16 *)(elem + id_offset); + + if (GET_NS_ID(id) == bucket && + GET_ID(id) > max_id) + max_id = GET_ID(id); + } + return max_id; +} + +#define find_max_element_entry_id(num_elements, elements, num_objects_fld, \ + objects_fld, bucket) \ + find_max_element_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld), \ + offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id),\ + bucket) + +#define find_max_element_ns_entry_id(num_elements, elements, \ + num_objects_fld, objects_fld) \ + find_max_element_ns_id(num_elements, (const void **)(elements), \ + offsetof(typeof(**elements), num_objects_fld),\ + 
offsetof(typeof(**elements), objects_fld), \ + offsetof(typeof(***(*elements)->objects_fld), id)) + +/* + * find_max_xxxx_ns_id gets a few elements. Each element is described by an id + * which its upper bits represents a namespace. It finds the max namespace. This + * could be used in order to know how many buckets do we need to allocate. If no + * elements exist, SHRT_MIN is returned. Namespace represents here different + * buckets. The common example is "common bucket" and "driver bucket". + * + * find_max_xxxx_id gets a few elements and a bucket. Each element is described + * by an id which its upper bits represent a namespace. It returns the max id + * which is contained in the same namespace defined in @bucket. This could be + * used in order to know how many elements do we need to allocate in the bucket. + * If no elements exist, SHRT_MIN is returned. + */ + +#define find_max_object_id(num_trees, trees, bucket) \ + find_max_element_entry_id(num_trees, trees, num_objects,\ + objects, bucket) +#define find_max_object_ns_id(num_trees, trees) \ + find_max_element_ns_entry_id(num_trees, trees, \ + num_objects, objects) + +#define find_max_method_id(num_iters, iters, bucket) \ + find_max_element_entry_id(num_iters, iters, num_methods,\ + methods, bucket) +#define find_max_method_ns_id(num_iters, iters) \ + find_max_element_ns_entry_id(num_iters, iters, \ + num_methods, methods) + +#define find_max_attr_id(num_iters, iters, bucket) \ + find_max_element_entry_id(num_iters, iters, num_attrs, \ + attrs, bucket) +#define find_max_attr_ns_id(num_iters, iters) \ + find_max_element_ns_entry_id(num_iters, iters, \ + num_attrs, attrs) + +static void free_method(struct uverbs_method_spec *method) +{ + unsigned int i; + + if (!method) + return; + + for (i = 0; i < method->num_buckets; i++) + kfree(method->attr_buckets[i]); + + kfree(method); +} + +#define IS_ATTR_OBJECT(attr) ((attr)->type == UVERBS_ATTR_TYPE_IDR || \ + (attr)->type == UVERBS_ATTR_TYPE_FD) + +/* + * This 
function gets array of size @num_method_defs which contains pointers to + * method definitions @method_defs. The function allocates an + * uverbs_method_spec structure and initializes its number of buckets and the + * elements in buckets to the correct attributes. While doing that, it + * validates that there aren't conflicts between attributes of different + * method_defs. + */ +static struct uverbs_method_spec *build_method_with_attrs(const struct uverbs_method_def **method_defs, + size_t num_method_defs) +{ + int bucket_idx; + int max_attr_buckets = 0; + size_t num_attr_buckets = 0; + int res = 0; + struct uverbs_method_spec *method = NULL; + const struct uverbs_attr_def **attr_defs; + unsigned int num_of_singularities = 0; + + max_attr_buckets = find_max_attr_ns_id(num_method_defs, method_defs); + if (max_attr_buckets >= 0) + num_attr_buckets = max_attr_buckets + 1; + + method = kzalloc(sizeof(*method) + + num_attr_buckets * sizeof(*method->attr_buckets), + GFP_KERNEL); + if (!method) + return ERR_PTR(-ENOMEM); + + method->num_buckets = num_attr_buckets; + attr_defs = kcalloc(num_method_defs, sizeof(*attr_defs), GFP_KERNEL); + if (!attr_defs) { + res = -ENOMEM; + goto free_method; + } + for (bucket_idx = 0; bucket_idx < method->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + int attr_max_bucket = 0; + struct uverbs_attr_spec_hash *hash = NULL; + + attr_max_bucket = find_max_attr_id(num_method_defs, method_defs, + bucket_idx); + if (attr_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + ALIGN(sizeof(*hash->attrs) * (attr_max_bucket + 1), + sizeof(long)) + + BITS_TO_LONGS(attr_max_bucket + 1) * sizeof(long), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + hash->num_attrs = attr_max_bucket + 1; + method->num_child_attrs += hash->num_attrs; + hash->mandatory_attrs_bitmask = (void *)(hash + 1) + + ALIGN(sizeof(*hash->attrs) * + (attr_max_bucket + 1), + sizeof(long)); + + method->attr_buckets[bucket_idx] = hash; + + do { +
size_t num_attr_defs; + struct uverbs_attr_spec *attr; + bool attr_obj_with_special_access; + + num_attr_defs = + get_attrs_above_id(attr_defs, + num_method_defs, + method_defs, + bucket_idx, + &min_id); + /* Last attr in bucket */ + if (!num_attr_defs) + break; + + if (num_attr_defs > 1) { + /* + * We don't allow two attribute definitions for + * the same attribute. This is usually a + * programmer error. If required, it's better to + * just add a new attribute to capture the new + * semantics. + */ + res = -EEXIST; + goto free; + } + + attr = &hash->attrs[min_id]; + memcpy(attr, &attr_defs[0]->attr, sizeof(*attr)); + + attr_obj_with_special_access = IS_ATTR_OBJECT(attr) && + (attr->obj.access == UVERBS_ACCESS_NEW || + attr->obj.access == UVERBS_ACCESS_DESTROY); + num_of_singularities += !!attr_obj_with_special_access; + if (WARN(num_of_singularities > 1, + "ib_uverbs: Method contains more than one object attr (%d) with new/destroy access\n", + min_id) || + WARN(attr_obj_with_special_access && + !(attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY), + "ib_uverbs: Tried to merge attr (%d) but it's an object with new/destroy aceess but isn't mandatory\n", + min_id) || + WARN(IS_ATTR_OBJECT(attr) && + attr->flags & UVERBS_ATTR_SPEC_F_MIN_SZ, + "ib_uverbs: Tried to merge attr (%d) but it's an object with min_sz flag\n", + min_id)) { + res = -EINVAL; + goto free; + } + + if (attr->flags & UVERBS_ATTR_SPEC_F_MANDATORY) + set_bit(min_id, hash->mandatory_attrs_bitmask); + min_id++; + + } while (1); + } + kfree(attr_defs); + return method; + +free: + kfree(attr_defs); +free_method: + free_method(method); + return ERR_PTR(res); +} + +static void free_object(struct uverbs_object_spec *object) +{ + unsigned int i, j; + + if (!object) + return; + + for (i = 0; i < object->num_buckets; i++) { + struct uverbs_method_spec_hash *method_buckets = + object->method_buckets[i]; + + if (!method_buckets) + continue; + + for (j = 0; j < method_buckets->num_methods; j++) + 
free_method(method_buckets->methods[j]); + + kfree(method_buckets); + } + + kfree(object); +} + +/* + * This function gets array of size @num_object_defs which contains pointers to + * object definitions @object_defs. The function allocated an + * uverbs_object_spec structure and initialize its number of buckets and the + * elements in buckets to the correct methods. While doing that, it + * sorts out the correct relationship between conflicts in the same method. + */ +static struct uverbs_object_spec *build_object_with_methods(const struct uverbs_object_def **object_defs, + size_t num_object_defs) +{ + u16 bucket_idx; + int max_method_buckets = 0; + u16 num_method_buckets = 0; + int res = 0; + struct uverbs_object_spec *object = NULL; + const struct uverbs_method_def **method_defs; + + max_method_buckets = find_max_method_ns_id(num_object_defs, object_defs); + if (max_method_buckets >= 0) + num_method_buckets = max_method_buckets + 1; + + object = kzalloc(sizeof(*object) + + num_method_buckets * + sizeof(*object->method_buckets), GFP_KERNEL); + if (!object) + return ERR_PTR(-ENOMEM); + + object->num_buckets = num_method_buckets; + method_defs = kcalloc(num_object_defs, sizeof(*method_defs), GFP_KERNEL); + if (!method_defs) { + res = -ENOMEM; + goto free_object; + } + + for (bucket_idx = 0; bucket_idx < object->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + int methods_max_bucket = 0; + struct uverbs_method_spec_hash *hash = NULL; + + methods_max_bucket = find_max_method_id(num_object_defs, object_defs, + bucket_idx); + if (methods_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + sizeof(*hash->methods) * (methods_max_bucket + 1), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + + hash->num_methods = methods_max_bucket + 1; + object->method_buckets[bucket_idx] = hash; + + do { + size_t num_method_defs; + struct uverbs_method_spec *method; + int i; + + num_method_defs = + get_methods_above_id(method_defs, + num_object_defs, 
+ object_defs, + bucket_idx, + &min_id); + /* Last method in bucket */ + if (!num_method_defs) + break; + + method = build_method_with_attrs(method_defs, + num_method_defs); + if (IS_ERR(method)) { + res = PTR_ERR(method); + goto free; + } + + /* + * The last tree which is given as an argument to the + * merge overrides previous method handler. + * Therefore, we iterate backwards and search for the + * first handler which != NULL. This also defines the + * set of flags used for this handler. + */ + for (i = num_method_defs - 1; + i >= 0 && !method_defs[i]->handler; i--) + ; + hash->methods[min_id++] = method; + /* NULL handler isn't allowed */ + if (WARN(i < 0, + "ib_uverbs: tried to merge function id %d, but all handlers are NULL\n", + min_id)) { + res = -EINVAL; + goto free; + } + method->handler = method_defs[i]->handler; + method->flags = method_defs[i]->flags; + + } while (1); + } + kfree(method_defs); + return object; + +free: + kfree(method_defs); +free_object: + free_object(object); + return ERR_PTR(res); +} + +void uverbs_free_spec_tree(struct uverbs_root_spec *root) +{ + unsigned int i, j; + + if (!root) + return; + + for (i = 0; i < root->num_buckets; i++) { + struct uverbs_object_spec_hash *object_hash = + root->object_buckets[i]; + + if (!object_hash) + continue; + + for (j = 0; j < object_hash->num_objects; j++) + free_object(object_hash->objects[j]); + + kfree(object_hash); + } + + kfree(root); +} +EXPORT_SYMBOL(uverbs_free_spec_tree); + +struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees) +{ + u16 bucket_idx; + short max_object_buckets = 0; + size_t num_objects_buckets = 0; + struct uverbs_root_spec *root_spec = NULL; + const struct uverbs_object_def **object_defs; + int i; + int res = 0; + + max_object_buckets = find_max_object_ns_id(num_trees, trees); + /* + * Devices which don't want to support ib_uverbs, should just allocate + * an empty parsing tree.
Every user-space command won't hit any valid + * entry in the parsing tree and thus will fail. + */ + if (max_object_buckets >= 0) + num_objects_buckets = max_object_buckets + 1; + + root_spec = kzalloc(sizeof(*root_spec) + + num_objects_buckets * sizeof(*root_spec->object_buckets), + GFP_KERNEL); + if (!root_spec) + return ERR_PTR(-ENOMEM); + root_spec->num_buckets = num_objects_buckets; + + object_defs = kcalloc(num_trees, sizeof(*object_defs), + GFP_KERNEL); + if (!object_defs) { + res = -ENOMEM; + goto free_root; + } + + for (bucket_idx = 0; bucket_idx < root_spec->num_buckets; bucket_idx++) { + short min_id = SHRT_MIN; + short objects_max_bucket; + struct uverbs_object_spec_hash *hash = NULL; + + objects_max_bucket = find_max_object_id(num_trees, trees, + bucket_idx); + if (objects_max_bucket < 0) + continue; + + hash = kzalloc(sizeof(*hash) + + sizeof(*hash->objects) * (objects_max_bucket + 1), + GFP_KERNEL); + if (!hash) { + res = -ENOMEM; + goto free; + } + hash->num_objects = objects_max_bucket + 1; + root_spec->object_buckets[bucket_idx] = hash; + + do { + size_t num_object_defs; + struct uverbs_object_spec *object; + + num_object_defs = get_objects_above_id(object_defs, + num_trees, + trees, + bucket_idx, + &min_id); + /* Last object in bucket */ + if (!num_object_defs) + break; + + object = build_object_with_methods(object_defs, + num_object_defs); + if (IS_ERR(object)) { + res = PTR_ERR(object); + goto free; + } + + /* + * The last tree which is given as an argument to the + * merge overrides previous object's type_attrs. + * Therefore, we iterate backwards and search for the + * first type_attrs which != NULL. + */ + for (i = num_object_defs - 1; + i >= 0 && !object_defs[i]->type_attrs; i--) + ; + /* + * NULL is a valid type_attrs. It means an object we + * can't instantiate (like DEVICE). + */ + object->type_attrs = i < 0 ? 
NULL : + object_defs[i]->type_attrs; + + hash->objects[min_id++] = object; + } while (1); + } + + kfree(object_defs); + return root_spec; + +free: + kfree(object_defs); +free_root: + uverbs_free_spec_tree(root_spec); + return ERR_PTR(res); +} +EXPORT_SYMBOL(uverbs_alloc_spec_tree); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 2e8925434d74..cf5b238d2d81 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -235,5 +235,43 @@ static inline bool uverbs_attr_is_valid_in_hash(const struct uverbs_attr_bundle_ return test_bit(idx, attrs_hash->valid_bitmap); } -#endif +/* ================================================= + * Definitions -> Specs infrastructure + * ================================================= + */ + +/* + * uverbs_alloc_spec_tree - Merges different common and driver specific feature + * into one parsing tree that every uverbs command will be parsed upon. + * + * @num_trees: Number of trees in the array @trees. + * @trees: Array of pointers to tree root definitions to merge. Each such tree + * possibly contains objects, methods and attributes definitions. + * + * Returns: + * uverbs_root_spec *: The root of the merged parsing tree. + * On error, we return an error code. Error is checked via IS_ERR. + * + * The following merges could take place: + * a. Two trees representing the same method with different handler + * -> We take the handler of the tree that its handler != NULL + * and its index in the trees array is greater. The incentive for that + * is that developers are expected to first merge common trees and then + * merge trees that gives specialized the behaviour. + * b. Two trees representing the same object with different + * type_attrs (struct uverbs_obj_type): + * -> We take the type_attrs of the tree that its type_attr != NULL + * and its index in the trees array is greater. This could be used + * in order to override the free function, allocation size, etc. + * c. 
Two trees representing the same method attribute (same id but possibly + * different attributes): + * -> ERROR (-EEXIST), we believe that's not the programmer's intent. + * + * An object without any methods is considered invalid and will abort the + * function with -ENOENT error. + */ +struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees); +void uverbs_free_spec_tree(struct uverbs_root_spec *root); +#endif -- cgit v1.2.3 From 3541030650c0ddb5d52163082fee427b2a453799 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:07:01 +0300 Subject: IB/core: Add macros for declaring methods and attributes This patch adds macros for declaring objects, methods and attributes. These definitions are later used by downstream patches to declare some of the default types. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- include/rdma/uverbs_ioctl.h | 105 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index cf5b238d2d81..9a8d217cdc1d 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -34,6 +34,8 @@ #define _UVERBS_IOCTL_ #include +#include +#include /* * ======================================= @@ -160,10 +162,99 @@ struct uverbs_object_tree_def { const struct uverbs_object_def * const (*objects)[]; }; +#define UA_FLAGS(_flags) .flags = _flags +#define __UVERBS_ATTR0(_id, _len, _type, ...) \ + ((const struct uverbs_attr_def) \ + {.id = _id, .attr = {.type = _type, {.len = _len}, .flags = 0, } }) +#define __UVERBS_ATTR1(_id, _len, _type, _flags) \ + ((const struct uverbs_attr_def) \ + {.id = _id, .attr = {.type = _type, {.len = _len}, _flags, } }) +#define __UVERBS_ATTR(_id, _len, _type, _flags, _n, ...)
\ + __UVERBS_ATTR##_n(_id, _len, _type, _flags) +/* + * In new compiler, UVERBS_ATTR could be simplified by declaring it as + * [_id] = {.type = _type, .len = _len, ##__VA_ARGS__} + * But since we support older compilers too, we need the more complex code. + */ +#define UVERBS_ATTR(_id, _len, _type, ...) \ + __UVERBS_ATTR(_id, _len, _type, ##__VA_ARGS__, 1, 0) +#define UVERBS_ATTR_PTR_IN_SZ(_id, _len, ...) \ + UVERBS_ATTR(_id, _len, UVERBS_ATTR_TYPE_PTR_IN, ##__VA_ARGS__) +/* If sizeof(_type) <= sizeof(u64), this will be inlined rather than a pointer */ +#define UVERBS_ATTR_PTR_IN(_id, _type, ...) \ + UVERBS_ATTR_PTR_IN_SZ(_id, sizeof(_type), ##__VA_ARGS__) +#define UVERBS_ATTR_PTR_OUT_SZ(_id, _len, ...) \ + UVERBS_ATTR(_id, _len, UVERBS_ATTR_TYPE_PTR_OUT, ##__VA_ARGS__) +#define UVERBS_ATTR_PTR_OUT(_id, _type, ...) \ + UVERBS_ATTR_PTR_OUT_SZ(_id, sizeof(_type), ##__VA_ARGS__) + +/* + * In new compiler, UVERBS_ATTR_IDR (and FD) could be simplified by declaring + * it as + * {.id = _id, \ + * .attr {.type = __obj_class, \ + * .obj = {.obj_type = _idr_type, \ + * .access = _access \ + * }, ##__VA_ARGS__ } } + * But since we support older compilers too, we need the more complex code. + */ +#define ___UVERBS_ATTR_OBJ0(_id, _obj_class, _obj_type, _access, ...)\ + ((const struct uverbs_attr_def) \ + {.id = _id, \ + .attr = {.type = _obj_class, \ + {.obj = {.obj_type = _obj_type, .access = _access } },\ + .flags = 0} }) +#define ___UVERBS_ATTR_OBJ1(_id, _obj_class, _obj_type, _access, _flags)\ + ((const struct uverbs_attr_def) \ + {.id = _id, \ + .attr = {.type = _obj_class, \ + {.obj = {.obj_type = _obj_type, .access = _access} }, \ + _flags} }) +#define ___UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, _flags, \ + _n, ...) \ + ___UVERBS_ATTR_OBJ##_n(_id, _obj_class, _obj_type, _access, _flags) +#define __UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, ...) 
\ + ___UVERBS_ATTR_OBJ(_id, _obj_class, _obj_type, _access, \ + ##__VA_ARGS__, 1, 0) +#define UVERBS_ATTR_IDR(_id, _idr_type, _access, ...) \ + __UVERBS_ATTR_OBJ(_id, UVERBS_ATTR_TYPE_IDR, _idr_type, _access,\ + ##__VA_ARGS__) +#define UVERBS_ATTR_FD(_id, _fd_type, _access, ...) \ + __UVERBS_ATTR_OBJ(_id, UVERBS_ATTR_TYPE_FD, _fd_type, \ + (_access) + BUILD_BUG_ON_ZERO( \ + (_access) != UVERBS_ACCESS_NEW && \ + (_access) != UVERBS_ACCESS_READ), \ + ##__VA_ARGS__) +#define DECLARE_UVERBS_ATTR_SPEC(_name, ...) \ + const struct uverbs_attr_def _name = __VA_ARGS__ + +#define _UVERBS_METHOD_ATTRS_SZ(...) \ + (sizeof((const struct uverbs_attr_def * const []){__VA_ARGS__}) /\ + sizeof(const struct uverbs_attr_def *)) +#define _UVERBS_METHOD(_id, _handler, _flags, ...) \ + ((const struct uverbs_method_def) { \ + .id = _id, \ + .flags = _flags, \ + .handler = _handler, \ + .num_attrs = _UVERBS_METHOD_ATTRS_SZ(__VA_ARGS__), \ + .attrs = &(const struct uverbs_attr_def * const []){__VA_ARGS__} }) +#define DECLARE_UVERBS_METHOD(_name, _id, _handler, ...) \ + const struct uverbs_method_def _name = \ + _UVERBS_METHOD(_id, _handler, 0, ##__VA_ARGS__) +#define DECLARE_UVERBS_CTX_METHOD(_name, _id, _handler, _flags, ...) \ + const struct uverbs_method_def _name = \ + _UVERBS_METHOD(_id, _handler, \ + UVERBS_ACTION_FLAG_CREATE_ROOT, \ + ##__VA_ARGS__) +#define _UVERBS_OBJECT_METHODS_SZ(...) \ + (sizeof((const struct uverbs_method_def * const []){__VA_ARGS__}) / \ + sizeof(const struct uverbs_method_def *)) #define _UVERBS_OBJECT(_id, _type_attrs, ...) \ ((const struct uverbs_object_def) { \ .id = _id, \ - .type_attrs = _type_attrs}) + .type_attrs = _type_attrs, \ + .num_methods = _UVERBS_OBJECT_METHODS_SZ(__VA_ARGS__), \ + .methods = &(const struct uverbs_method_def * const []){__VA_ARGS__} }) #define DECLARE_UVERBS_OBJECT(_name, _id, _type_attrs, ...) 
\ const struct uverbs_object_def _name = \ _UVERBS_OBJECT(_id, _type_attrs, ##__VA_ARGS__) @@ -235,6 +326,18 @@ static inline bool uverbs_attr_is_valid_in_hash(const struct uverbs_attr_bundle_ return test_bit(idx, attrs_hash->valid_bitmap); } +static inline bool uverbs_attr_is_valid(const struct uverbs_attr_bundle *attrs_bundle, + unsigned int idx) +{ + u16 idx_bucket = idx >> UVERBS_ID_NS_SHIFT; + + if (attrs_bundle->num_buckets <= idx_bucket) + return false; + + return uverbs_attr_is_valid_in_hash(&attrs_bundle->hash[idx_bucket], + idx & ~UVERBS_ID_NS_MASK); +} + /* ================================================= * Definitions -> Specs infrastructure * ================================================= -- cgit v1.2.3 From 4da70da23e9ba03f7f9e067fbe0eec6ebbfee401 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:07:02 +0300 Subject: IB/core: Explicitly destroy an object while keeping uobject When some objects are destroyed, we need to extract their status at destruction. After object's destruction, this status (e.g. events_reported) relies in the uobject. In order to have the latest and correct status, the underlying object should be destroyed, but we should keep the uobject alive and read this information off the uobject. We introduce a rdma_explicit_destroy function. This function destroys the class type object (for example, the IDR class type which destroys the underlying object as well) and then convert the uobject to be of a null class type. This uobject will then be destroyed as any other uobject once uverbs_finalize_object[s] is called. 
Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 35 +++++++++++++++++++++++++++++++++++ include/rdma/uverbs_types.h | 1 + 2 files changed, 36 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 2a2f002ac7cb..85b5ee4defa4 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -451,6 +451,41 @@ int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj) return ret; } +static int null_obj_type_class_remove_commit(struct ib_uobject *uobj, + enum rdma_remove_reason why) +{ + return 0; +} + +static const struct uverbs_obj_type null_obj_type = { + .type_class = &((const struct uverbs_obj_type_class){ + .remove_commit = null_obj_type_class_remove_commit, + /* be cautious */ + .needs_kfree_rcu = true}), +}; + +int rdma_explicit_destroy(struct ib_uobject *uobject) +{ + int ret; + struct ib_ucontext *ucontext = uobject->context; + + /* Cleanup is running. 
Calling this should have been impossible */ + if (!down_read_trylock(&ucontext->cleanup_rwsem)) { + WARN(true, "ib_uverbs: Cleanup is running while removing an uobject\n"); + return 0; + } + lockdep_check(uobject, true); + ret = uobject->type->type_class->remove_commit(uobject, + RDMA_REMOVE_DESTROY); + if (ret) + return ret; + + uobject->type = &null_obj_type; + + up_read(&ucontext->cleanup_rwsem); + return 0; +} + static void alloc_commit_idr_uobject(struct ib_uobject *uobj) { uverbs_uobject_add(uobj); diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h index 9760b6d70744..cc04ec65588d 100644 --- a/include/rdma/uverbs_types.h +++ b/include/rdma/uverbs_types.h @@ -129,6 +129,7 @@ struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_obj_type *type, void rdma_alloc_abort_uobject(struct ib_uobject *uobj); int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj); int rdma_alloc_commit_uobject(struct ib_uobject *uobj); +int rdma_explicit_destroy(struct ib_uobject *uobject); struct uverbs_obj_fd_type { /* -- cgit v1.2.3 From 64b19e1323e96c34af7ca90d1954e70890c7a98e Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:07:03 +0300 Subject: IB/core: Export ioctl enum types to user-space Add a new ib_user_ioctl_verbs.h which exports all required ABI enums and structs to the user-space. Export the default types to user-space through this file. 
Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- include/rdma/uverbs_std_types.h | 18 +---------- include/uapi/rdma/ib_user_ioctl_verbs.h | 54 +++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 17 deletions(-) create mode 100644 include/uapi/rdma/ib_user_ioctl_verbs.h (limited to 'include') diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index bef74099b7c5..400efe2a4d3c 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -34,23 +34,7 @@ #define _UVERBS_STD_TYPES__ #include - -enum uverbs_default_objects { - UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ - UVERBS_OBJECT_PD, - UVERBS_OBJECT_COMP_CHANNEL, - UVERBS_OBJECT_CQ, - UVERBS_OBJECT_QP, - UVERBS_OBJECT_SRQ, - UVERBS_OBJECT_AH, - UVERBS_OBJECT_MR, - UVERBS_OBJECT_MW, - UVERBS_OBJECT_FLOW, - UVERBS_OBJECT_XRCD, - UVERBS_OBJECT_RWQ_IND_TBL, - UVERBS_OBJECT_WQ, - UVERBS_OBJECT_LAST, -}; +#include extern const struct uverbs_object_def uverbs_object_comp_channel; extern const struct uverbs_object_def uverbs_object_cq; diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h new file mode 100644 index 000000000000..78a2e5be4d6e --- /dev/null +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef IB_USER_IOCTL_VERBS_H +#define IB_USER_IOCTL_VERBS_H + +enum uverbs_default_objects { + UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ + UVERBS_OBJECT_PD, + UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_OBJECT_CQ, + UVERBS_OBJECT_QP, + UVERBS_OBJECT_SRQ, + UVERBS_OBJECT_AH, + UVERBS_OBJECT_MR, + UVERBS_OBJECT_MW, + UVERBS_OBJECT_FLOW, + UVERBS_OBJECT_XRCD, + UVERBS_OBJECT_RWQ_IND_TBL, + UVERBS_OBJECT_WQ, + UVERBS_OBJECT_LAST, +}; + +#endif + -- cgit v1.2.3 From d70724f149b107f8e4062320270d3d8b6713a1bb Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:07:04 +0300 Subject: IB/core: Add legacy driver's user-data In this phase, we don't want to change all the drivers to use flexible driver's specific attributes. Therefore, we add two default attributes: UHW_IN and UHW_OUT. These attributes are optional in some methods and they encode the driver specific command data. We add a function that extract this data and creates the legacy udata over it. Driver's data should start from UVERBS_UDATA_DRIVER_DATA_FLAG. This turns on the first bit of the namespace, indicating this attribute belongs to the driver's namespace. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_std_types.c | 40 ++++++++++++++++++++++++++ include/rdma/uverbs_ioctl.h | 46 ++++++++++++++++++++++++++++++ include/uapi/rdma/ib_user_ioctl_verbs.h | 10 +++++++ 3 files changed, 96 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index 5f90978bda8d..db66c18857e4 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -209,6 +209,46 @@ static int uverbs_hot_unplug_completion_event_file(struct ib_uobject_file *uobj_ return 0; }; +/* + * This spec is used in order to pass information to the hardware driver in a + * legacy way. 
Every verb that could get driver specific data should get this + * spec. + */ +static const struct uverbs_attr_def uverbs_uhw_compat_in = + UVERBS_ATTR_PTR_IN_SZ(UVERBS_UHW_IN, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); +static const struct uverbs_attr_def uverbs_uhw_compat_out = + UVERBS_ATTR_PTR_OUT_SZ(UVERBS_UHW_OUT, 0, UA_FLAGS(UVERBS_ATTR_SPEC_F_MIN_SZ)); + +static void create_udata(struct uverbs_attr_bundle *ctx, + struct ib_udata *udata) +{ + /* + * This is for ease of conversion. The purpose is to convert all drivers + * to use uverbs_attr_bundle instead of ib_udata. + * Assume attr == 0 is input and attr == 1 is output. + */ + void __user *inbuf; + size_t inbuf_len = 0; + void __user *outbuf; + size_t outbuf_len = 0; + const struct uverbs_attr *uhw_in = + uverbs_attr_get(ctx, UVERBS_UHW_IN); + const struct uverbs_attr *uhw_out = + uverbs_attr_get(ctx, UVERBS_UHW_OUT); + + if (!IS_ERR(uhw_in)) { + inbuf = uhw_in->ptr_attr.ptr; + inbuf_len = uhw_in->ptr_attr.len; + } + + if (!IS_ERR(uhw_out)) { + outbuf = uhw_out->ptr_attr.ptr; + outbuf_len = uhw_out->ptr_attr.len; + } + + INIT_UDATA_BUF_OR_NULL(udata, inbuf, outbuf, inbuf_len, outbuf_len); +} + DECLARE_UVERBS_OBJECT(uverbs_object_comp_channel, UVERBS_OBJECT_COMP_CHANNEL, &UVERBS_TYPE_ALLOC_FD(0, diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 9a8d217cdc1d..759afa0621ea 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -36,6 +36,7 @@ #include #include #include +#include /* * ======================================= @@ -338,6 +339,51 @@ static inline bool uverbs_attr_is_valid(const struct uverbs_attr_bundle *attrs_b idx & ~UVERBS_ID_NS_MASK); } +static inline const struct uverbs_attr *uverbs_attr_get(const struct uverbs_attr_bundle *attrs_bundle, + u16 idx) +{ + u16 idx_bucket = idx >> UVERBS_ID_NS_SHIFT; + + if (!uverbs_attr_is_valid(attrs_bundle, idx)) + return ERR_PTR(-ENOENT); + + return &attrs_bundle->hash[idx_bucket].attrs[idx & ~UVERBS_ID_NS_MASK]; 
+} + +static inline int uverbs_copy_to(const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, const void *from) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + u16 flags; + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + flags = attr->ptr_attr.flags | UVERBS_ATTR_F_VALID_OUTPUT; + return (!copy_to_user(attr->ptr_attr.ptr, from, attr->ptr_attr.len) && + !put_user(flags, &attr->uattr->flags)) ? 0 : -EFAULT; +} + +static inline int _uverbs_copy_from(void *to, size_t to_size, + const struct uverbs_attr_bundle *attrs_bundle, + size_t idx) +{ + const struct uverbs_attr *attr = uverbs_attr_get(attrs_bundle, idx); + + if (IS_ERR(attr)) + return PTR_ERR(attr); + + if (to_size <= sizeof(((struct ib_uverbs_attr *)0)->data)) + memcpy(to, &attr->ptr_attr.data, attr->ptr_attr.len); + else if (copy_from_user(to, attr->ptr_attr.ptr, attr->ptr_attr.len)) + return -EFAULT; + + return 0; +} + +#define uverbs_copy_from(to, attrs_bundle, idx) \ + _uverbs_copy_from(to, sizeof(*(to)), attrs_bundle, idx) + /* ================================================= * Definitions -> Specs infrastructure * ================================================= diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 78a2e5be4d6e..90f81eeca35b 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -33,6 +33,11 @@ #ifndef IB_USER_IOCTL_VERBS_H #define IB_USER_IOCTL_VERBS_H +#include + +#define UVERBS_UDATA_DRIVER_DATA_NS 1 +#define UVERBS_UDATA_DRIVER_DATA_FLAG (1UL << UVERBS_ID_NS_SHIFT) + enum uverbs_default_objects { UVERBS_OBJECT_DEVICE, /* No instances of DEVICE are allowed */ UVERBS_OBJECT_PD, @@ -50,5 +55,10 @@ enum uverbs_default_objects { UVERBS_OBJECT_LAST, }; +enum { + UVERBS_UHW_IN = UVERBS_UDATA_DRIVER_DATA_FLAG, + UVERBS_UHW_OUT, +}; + #endif -- cgit v1.2.3 From 9ee79fce364216df35ec46e26d20780c3c1644cc Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 
16:07:05 +0300 Subject: IB/core: Add completion queue (cq) object actions Adding CQ ioctl actions: 1. create_cq 2. destroy_cq This requires adding the following: 1. A specification describing the method a. Handler b. Attributes specification Each attribute is one of the following: a. PTR_IN - input data Note: This could be encoded inline for data < 64bit b. PTR_OUT - response data c. IDR - idr based object d. FD - fd based object Blob attributes (clauses a and b) contain their type, while object specifications (clauses c and d) contain the expected object type (for example, the given id should be UVERBS_TYPE_PD) and the required access (READ, WRITE, NEW or DESTROY). If a NEW is required, the new object's id will be assigned to this attribute. All attributes can take a UA_FLAGS attribute. Currently we support stating that an attribute is mandatory or that the specification size corresponds to a lower bound (and that this attribute could be extended). We currently add both default attributes and the two generic UHW_IN and UHW_OUT driver specific attributes. 2. Handler A handler gets a uverbs_attr_bundle. The handler developer uses uverbs_attr_get to fetch an attribute of a given id. Each of these attribute groups corresponds to the specification group defined in the action (clauses 1.b and 1.c respectively). The indices of these arrays correspond to the attribute ids declared in the specifications (clause 2). The handler is quite simple. It assumes the infrastructure fetched all objects and locked, created or destroyed them as required by the specification. Pointer (or blob) attributes were validated to match their required sizes. After the handler finishes, the infrastructure commits or rolls back the objects.
Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_std_types.c | 138 ++++++++++++++++++++++++++++- include/uapi/rdma/ib_user_ioctl_verbs.h | 20 +++++ 2 files changed, 157 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index db66c18857e4..0a98579700ec 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -249,6 +249,140 @@ static void create_udata(struct uverbs_attr_bundle *ctx, INIT_UDATA_BUF_OR_NULL(udata, inbuf, outbuf, inbuf_len, outbuf_len); } +static int uverbs_create_cq_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_ucontext *ucontext = file->ucontext; + struct ib_ucq_object *obj; + struct ib_udata uhw; + int ret; + u64 user_handle; + struct ib_cq_init_attr attr = {}; + struct ib_cq *cq; + struct ib_uverbs_completion_event_file *ev_file = NULL; + const struct uverbs_attr *ev_file_attr; + struct ib_uobject *ev_file_uobj; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_CREATE_CQ)) + return -EOPNOTSUPP; + + ret = uverbs_copy_from(&attr.comp_vector, attrs, CREATE_CQ_COMP_VECTOR); + if (!ret) + ret = uverbs_copy_from(&attr.cqe, attrs, CREATE_CQ_CQE); + if (!ret) + ret = uverbs_copy_from(&user_handle, attrs, CREATE_CQ_USER_HANDLE); + if (ret) + return ret; + + /* Optional param, if it doesn't exist, we get -ENOENT and skip it */ + if (uverbs_copy_from(&attr.flags, attrs, CREATE_CQ_FLAGS) == -EFAULT) + return -EFAULT; + + ev_file_attr = uverbs_attr_get(attrs, CREATE_CQ_COMP_CHANNEL); + if (!IS_ERR(ev_file_attr)) { + ev_file_uobj = ev_file_attr->obj_attr.uobject; + + ev_file = container_of(ev_file_uobj, + struct ib_uverbs_completion_event_file, + uobj_file.uobj); + uverbs_uobject_get(ev_file_uobj); + } + + if (attr.comp_vector >= 
ucontext->ufile->device->num_comp_vectors) { + ret = -EINVAL; + goto err_event_file; + } + + obj = container_of(uverbs_attr_get(attrs, CREATE_CQ_HANDLE)->obj_attr.uobject, + typeof(*obj), uobject); + obj->uverbs_file = ucontext->ufile; + obj->comp_events_reported = 0; + obj->async_events_reported = 0; + INIT_LIST_HEAD(&obj->comp_list); + INIT_LIST_HEAD(&obj->async_list); + + /* Temporary, only until drivers get the new uverbs_attr_bundle */ + create_udata(attrs, &uhw); + + cq = ib_dev->create_cq(ib_dev, &attr, ucontext, &uhw); + if (IS_ERR(cq)) { + ret = PTR_ERR(cq); + goto err_event_file; + } + + cq->device = ib_dev; + cq->uobject = &obj->uobject; + cq->comp_handler = ib_uverbs_comp_handler; + cq->event_handler = ib_uverbs_cq_event_handler; + cq->cq_context = &ev_file->ev_queue; + obj->uobject.object = cq; + obj->uobject.user_handle = user_handle; + atomic_set(&cq->usecnt, 0); + + ret = uverbs_copy_to(attrs, CREATE_CQ_RESP_CQE, &cq->cqe); + if (ret) + goto err_cq; + + return 0; +err_cq: + ib_destroy_cq(cq); + +err_event_file: + if (ev_file) + uverbs_uobject_put(ev_file_uobj); + return ret; +}; + +static DECLARE_UVERBS_METHOD( + uverbs_method_cq_create, UVERBS_CQ_CREATE, uverbs_create_cq_handler, + &UVERBS_ATTR_IDR(CREATE_CQ_HANDLE, UVERBS_OBJECT_CQ, UVERBS_ACCESS_NEW, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_CQE, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_USER_HANDLE, u64, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_FD(CREATE_CQ_COMP_CHANNEL, UVERBS_OBJECT_COMP_CHANNEL, + UVERBS_ACCESS_READ), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_COMP_VECTOR, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_IN(CREATE_CQ_FLAGS, u32), + &UVERBS_ATTR_PTR_OUT(CREATE_CQ_RESP_CQE, u32, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &uverbs_uhw_compat_in, &uverbs_uhw_compat_out); + +static int uverbs_destroy_cq_handler(struct ib_device *ib_dev, + struct ib_uverbs_file *file, + struct 
uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_destroy_cq_resp resp; + struct ib_uobject *uobj = + uverbs_attr_get(attrs, DESTROY_CQ_HANDLE)->obj_attr.uobject; + struct ib_ucq_object *obj = container_of(uobj, struct ib_ucq_object, + uobject); + int ret; + + if (!(ib_dev->uverbs_cmd_mask & 1ULL << IB_USER_VERBS_CMD_DESTROY_CQ)) + return -EOPNOTSUPP; + + ret = rdma_explicit_destroy(uobj); + if (ret) + return ret; + + resp.comp_events_reported = obj->comp_events_reported; + resp.async_events_reported = obj->async_events_reported; + + return uverbs_copy_to(attrs, DESTROY_CQ_RESP, &resp); +} + +static DECLARE_UVERBS_METHOD( + uverbs_method_cq_destroy, UVERBS_CQ_DESTROY, uverbs_destroy_cq_handler, + &UVERBS_ATTR_IDR(DESTROY_CQ_HANDLE, UVERBS_OBJECT_CQ, + UVERBS_ACCESS_DESTROY, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY)), + &UVERBS_ATTR_PTR_OUT(DESTROY_CQ_RESP, struct ib_uverbs_destroy_cq_resp, + UA_FLAGS(UVERBS_ATTR_SPEC_F_MANDATORY))); + DECLARE_UVERBS_OBJECT(uverbs_object_comp_channel, UVERBS_OBJECT_COMP_CHANNEL, &UVERBS_TYPE_ALLOC_FD(0, @@ -259,7 +393,9 @@ DECLARE_UVERBS_OBJECT(uverbs_object_comp_channel, DECLARE_UVERBS_OBJECT(uverbs_object_cq, UVERBS_OBJECT_CQ, &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), 0, - uverbs_free_cq)); + uverbs_free_cq), + &uverbs_method_cq_create, + &uverbs_method_cq_destroy); DECLARE_UVERBS_OBJECT(uverbs_object_qp, UVERBS_OBJECT_QP, &UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uqp_object), 0, diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 90f81eeca35b..842792eae383 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -60,5 +60,25 @@ enum { UVERBS_UHW_OUT, }; +enum uverbs_create_cq_cmd_attr_ids { + CREATE_CQ_HANDLE, + CREATE_CQ_CQE, + CREATE_CQ_USER_HANDLE, + CREATE_CQ_COMP_CHANNEL, + CREATE_CQ_COMP_VECTOR, + CREATE_CQ_FLAGS, + CREATE_CQ_RESP_CQE, +}; + +enum uverbs_destroy_cq_cmd_attr_ids { + DESTROY_CQ_HANDLE, + DESTROY_CQ_RESP, 
+}; + +enum uverbs_actions_cq_ops { + UVERBS_CQ_CREATE, + UVERBS_CQ_DESTROY, +}; + #endif -- cgit v1.2.3 From 524271129401ed896dc76e49acdbafc506cb41ac Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 3 Aug 2017 16:07:06 +0300 Subject: IB/core: Assign root to all drivers In order to use the parsing tree, we need to assign the root to all drivers. Currently, we just assign the default parsing tree via ib_uverbs_add_one. The driver could override this by assigning a parsing tree prior to registering the device. Signed-off-by: Matan Barak Reviewed-by: Yishai Hadas Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_main.c | 18 ++++++++++++++++++ include/rdma/uverbs_ioctl.h | 12 ++++++++++++ include/rdma/uverbs_std_types.h | 14 ++++++++++++++ 4 files changed, 45 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 64d494a64daf..0f6f768f687e 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -100,6 +100,7 @@ struct ib_uverbs_device { struct mutex lists_mutex; /* protect lists */ struct list_head uverbs_file_list; struct list_head uverbs_events_file_list; + struct uverbs_root_spec *specs_root; }; struct ib_uverbs_event_queue { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index defeda33e27f..872fec910c16 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -49,6 +49,7 @@ #include #include +#include #include "uverbs.h" #include "core_priv.h" @@ -1097,6 +1098,18 @@ static void ib_uverbs_add_one(struct ib_device *device) if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) goto err_class; + if (!device->specs_root) { + const struct uverbs_object_tree_def *default_root[] = { + uverbs_default_get_objects()}; + + uverbs_dev->specs_root = uverbs_alloc_spec_tree(1, + default_root); + if (IS_ERR(uverbs_dev->specs_root)) + 
goto err_class; + + device->specs_root = uverbs_dev->specs_root; + } + ib_set_client_data(device, &uverbs_client, uverbs_dev); return; @@ -1228,6 +1241,11 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) ib_uverbs_comp_dev(uverbs_dev); if (wait_clients) wait_for_completion(&uverbs_dev->comp); + if (uverbs_dev->specs_root) { + uverbs_free_spec_tree(uverbs_dev->specs_root); + device->specs_root = NULL; + } + kobject_put(&uverbs_dev->kobj); } diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 759afa0621ea..6da44079aa58 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -419,8 +419,20 @@ static inline int _uverbs_copy_from(void *to, size_t to_size, * An object without any methods is considered invalid and will abort the * function with -ENOENT error. */ +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, const struct uverbs_object_tree_def **trees); void uverbs_free_spec_tree(struct uverbs_root_spec *root); +#else +static inline struct uverbs_root_spec *uverbs_alloc_spec_tree(unsigned int num_trees, + const struct uverbs_object_tree_def **trees) +{ + return NULL; +} + +static inline void uverbs_free_spec_tree(struct uverbs_root_spec *root) +{ +} +#endif #endif diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 400efe2a4d3c..5f8e20bbd67c 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -34,8 +34,10 @@ #define _UVERBS_STD_TYPES__ #include +#include #include +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) extern const struct uverbs_object_def uverbs_object_comp_channel; extern const struct uverbs_object_def uverbs_object_cq; extern const struct uverbs_object_def uverbs_object_qp; @@ -50,6 +52,18 @@ extern const struct uverbs_object_def uverbs_object_pd; extern const struct uverbs_object_def uverbs_object_xrcd; extern const struct uverbs_object_def 
uverbs_object_device; +extern const struct uverbs_object_tree_def uverbs_default_objects; +static inline const struct uverbs_object_tree_def *uverbs_default_get_objects(void) +{ + return &uverbs_default_objects; +} +#else +static inline const struct uverbs_object_tree_def *uverbs_default_get_objects(void) +{ + return NULL; +} +#endif + static inline struct ib_uobject *__uobj_get(const struct uverbs_obj_type *type, bool write, struct ib_ucontext *ucontext, -- cgit v1.2.3