author	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-20 14:35:07 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-20 14:35:07 -0700
commit	76b584d3125a1f7d8b64e9c522a4555bc2844bde
tree	c75dc6b134eeae650372df7c6179f1e43a95953b /drivers/staging
parent	7992893c5a9fdffa42117f6f749359466e06bdf6
parent	c16d2750a08c8ccaf98d65f287a8aec91bb9610d
download	linux-76b584d3125a1f7d8b64e9c522a4555bc2844bde.tar.bz2
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma
Pull rdma updates from Doug Ledford:
"Primary 4.7 merge window changes
- Updates to the new Intel X722 iWARP driver
- Updates to the hfi1 driver
- Fixes for the iw_cxgb4 driver
- Misc core fixes
- Generic RDMA READ/WRITE API addition
- SRP updates
- Misc ipoib updates
- Minor mlx5 updates"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma: (148 commits)
IB/mlx5: Fire the CQ completion handler from tasklet
net/mlx5_core: Use tasklet for user-space CQ completion events
IB/core: Do not require CAP_NET_ADMIN for packet sniffing
IB/mlx4: Fix unaligned access in send_reply_to_slave
IB/mlx5: Report Scatter FCS device capability when supported
IB/mlx5: Add Scatter FCS support for Raw Packet QP
IB/core: Add Scatter FCS create flag
IB/core: Add Raw Scatter FCS device capability
IB/core: Add extended device capability flags
i40iw: pass hw_stats by reference rather than by value
i40iw: Remove unnecessary synchronize_irq() before free_irq()
i40iw: constify i40iw_vf_cqp_ops structure
IB/mlx5: Add UARs write-combining and non-cached mapping
IB/mlx5: Allow mapping the free running counter on PROT_EXEC
IB/mlx4: Use list_for_each_entry_safe
IB/SA: Use correct free function
IB/core: Fix a potential array overrun in CMA and SA agent
IB/core: Remove unnecessary check in ibnl_rcv_msg
IB/IWPM: Fix a potential skb leak
RDMA/nes: replace custom print_hex_dump()
...
Diffstat (limited to 'drivers/staging')
27 files changed, 919 insertions, 443 deletions
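The diff below is filtered to drivers/staging. Assuming the view is taken against the merge's first parent (as listed in the header above), the same filtered diffstat and patch can be reproduced from a local kernel tree with stock git commands, for example:

	git diff --stat 7992893c5a9f..76b584d3125a -- drivers/staging
	git diff 7992893c5a9f..76b584d3125a -- drivers/staging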
diff --git a/drivers/staging/rdma/hfi1/affinity.c b/drivers/staging/rdma/hfi1/affinity.c index 2cb8ca77f876..6e7050ab9e16 100644 --- a/drivers/staging/rdma/hfi1/affinity.c +++ b/drivers/staging/rdma/hfi1/affinity.c @@ -53,20 +53,6 @@ #include "sdma.h" #include "trace.h" -struct cpu_mask_set { - struct cpumask mask; - struct cpumask used; - uint gen; -}; - -struct hfi1_affinity { - struct cpu_mask_set def_intr; - struct cpu_mask_set rcv_intr; - struct cpu_mask_set proc; - /* spin lock to protect affinity struct */ - spinlock_t lock; -}; - /* Name of IRQ types, indexed by enum irq_type */ static const char * const irq_type_names[] = { "SDMA", @@ -82,6 +68,48 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set) set->gen = 0; } +/* Initialize non-HT cpu cores mask */ +int init_real_cpu_mask(struct hfi1_devdata *dd) +{ + struct hfi1_affinity *info; + int possible, curr_cpu, i, ht; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + cpumask_clear(&info->real_cpu_mask); + + /* Start with cpu online mask as the real cpu mask */ + cpumask_copy(&info->real_cpu_mask, cpu_online_mask); + + /* + * Remove HT cores from the real cpu mask. Do this in two steps below. + */ + possible = cpumask_weight(&info->real_cpu_mask); + ht = cpumask_weight(topology_sibling_cpumask( + cpumask_first(&info->real_cpu_mask))); + /* + * Step 1. Skip over the first N HT siblings and use them as the + * "real" cores. Assumes that HT cores are not enumerated in + * succession (except in the single core case). + */ + curr_cpu = cpumask_first(&info->real_cpu_mask); + for (i = 0; i < possible / ht; i++) + curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + /* + * Step 2. Remove the remaining HT siblings. Use cpumask_next() to + * skip any gaps. + */ + for (; i < possible; i++) { + cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask); + curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + } + + dd->affinity = info; + return 0; +} + /* * Interrupt affinity. * @@ -93,20 +121,17 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set) * to the node relative 1 as necessary. * */ -int hfi1_dev_affinity_init(struct hfi1_devdata *dd) +void hfi1_dev_affinity_init(struct hfi1_devdata *dd) { int node = pcibus_to_node(dd->pcidev->bus); - struct hfi1_affinity *info; + struct hfi1_affinity *info = dd->affinity; const struct cpumask *local_mask; - int curr_cpu, possible, i, ht; + int curr_cpu, possible, i; if (node < 0) node = numa_node_id(); dd->node = node; - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; spin_lock_init(&info->lock); init_cpu_mask_set(&info->def_intr); @@ -116,30 +141,8 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) local_mask = cpumask_of_node(dd->node); if (cpumask_first(local_mask) >= nr_cpu_ids) local_mask = topology_core_cpumask(0); - /* use local mask as default */ - cpumask_copy(&info->def_intr.mask, local_mask); - /* - * Remove HT cores from the default mask. Do this in two steps below. - */ - possible = cpumask_weight(&info->def_intr.mask); - ht = cpumask_weight(topology_sibling_cpumask( - cpumask_first(&info->def_intr.mask))); - /* - * Step 1. Skip over the first N HT siblings and use them as the - * "real" cores. Assumes that HT cores are not enumerated in - * succession (except in the single core case). - */ - curr_cpu = cpumask_first(&info->def_intr.mask); - for (i = 0; i < possible / ht; i++) - curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); - /* - * Step 2. Remove the remaining HT siblings. 
Use cpumask_next() to - * skip any gaps. - */ - for (; i < possible; i++) { - cpumask_clear_cpu(curr_cpu, &info->def_intr.mask); - curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); - } + /* Use the "real" cpu mask of this node as the default */ + cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask); /* fill in the receive list */ possible = cpumask_weight(&info->def_intr.mask); @@ -167,8 +170,6 @@ int hfi1_dev_affinity_init(struct hfi1_devdata *dd) } cpumask_copy(&info->proc.mask, cpu_online_mask); - dd->affinity = info; - return 0; } void hfi1_dev_affinity_free(struct hfi1_devdata *dd) diff --git a/drivers/staging/rdma/hfi1/affinity.h b/drivers/staging/rdma/hfi1/affinity.h index b287e4963024..20f52fe74091 100644 --- a/drivers/staging/rdma/hfi1/affinity.h +++ b/drivers/staging/rdma/hfi1/affinity.h @@ -64,10 +64,27 @@ enum affinity_flags { AFF_IRQ_LOCAL }; +struct cpu_mask_set { + struct cpumask mask; + struct cpumask used; + uint gen; +}; + +struct hfi1_affinity { + struct cpu_mask_set def_intr; + struct cpu_mask_set rcv_intr; + struct cpu_mask_set proc; + struct cpumask real_cpu_mask; + /* spin lock to protect affinity struct */ + spinlock_t lock; +}; + struct hfi1_msix_entry; +/* Initialize non-HT cpu cores mask */ +int init_real_cpu_mask(struct hfi1_devdata *); /* Initialize driver affinity data */ -int hfi1_dev_affinity_init(struct hfi1_devdata *); +void hfi1_dev_affinity_init(struct hfi1_devdata *); /* Free driver affinity data */ void hfi1_dev_affinity_free(struct hfi1_devdata *); /* diff --git a/drivers/staging/rdma/hfi1/chip.c b/drivers/staging/rdma/hfi1/chip.c index 16eb653903e0..dcae8e723f98 100644 --- a/drivers/staging/rdma/hfi1/chip.c +++ b/drivers/staging/rdma/hfi1/chip.c @@ -123,6 +123,8 @@ struct flag_table { #define MIN_KERNEL_KCTXTS 2 #define FIRST_KERNEL_KCTXT 1 +/* sizes for both the QP and RSM map tables */ +#define NUM_MAP_ENTRIES 256 #define NUM_MAP_REGS 32 /* Bit offset into the GUID which carries HFI id information */ @@ -1029,9 +1031,12 @@ static int thermal_init(struct hfi1_devdata *dd); static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, int msecs); static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc); +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr); static void handle_temp_err(struct hfi1_devdata *); static void dc_shutdown(struct hfi1_devdata *); static void dc_start(struct hfi1_devdata *); +static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, + unsigned int *np); /* * Error interrupt table entry. This is used as input to the interrupt @@ -5661,7 +5666,7 @@ static int sc_to_vl(struct hfi1_devdata *dd, int sw_index) sci = &dd->send_contexts[sw_index]; /* there is no information for user (PSM) and ack contexts */ - if (sci->type != SC_KERNEL) + if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15)) return -1; sc = sci->sc; @@ -6199,18 +6204,13 @@ static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data) /* * Handle host requests from the 8051. - * - * This is a work-queue function outside of the interrupt. 
*/ -void handle_8051_request(struct work_struct *work) +static void handle_8051_request(struct hfi1_pportdata *ppd) { - struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, - dc_host_req_work); struct hfi1_devdata *dd = ppd->dd; u64 reg; u16 data = 0; - u8 type, i, lanes, *cache = ppd->qsfp_info.cache; - u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS]; + u8 type; reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1); if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0) @@ -6231,46 +6231,11 @@ void handle_8051_request(struct work_struct *work) case HREQ_READ_CONFIG: case HREQ_SET_TX_EQ_ABS: case HREQ_SET_TX_EQ_REL: + case HREQ_ENABLE: dd_dev_info(dd, "8051 request: request 0x%x not supported\n", type); hreq_response(dd, HREQ_NOT_SUPPORTED, 0); break; - - case HREQ_ENABLE: - lanes = data & 0xF; - for (i = 0; lanes; lanes >>= 1, i++) { - if (!(lanes & 1)) - continue; - if (data & 0x200) { - /* enable TX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x8 && - cache[QSFP_CDR_INFO_OFFS] & 0x80) - cdr_ctrl_byte |= (1 << (i + 4)); - } else { - /* disable TX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x8 && - cache[QSFP_CDR_INFO_OFFS] & 0x80) - cdr_ctrl_byte &= ~(1 << (i + 4)); - } - - if (data & 0x800) { - /* enable RX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x4 && - cache[QSFP_CDR_INFO_OFFS] & 0x40) - cdr_ctrl_byte |= (1 << i); - } else { - /* disable RX CDR */ - if (cache[QSFP_MOD_PWR_OFFS] & 0x4 && - cache[QSFP_CDR_INFO_OFFS] & 0x40) - cdr_ctrl_byte &= ~(1 << i); - } - } - one_qsfp_write(ppd, dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS, - &cdr_ctrl_byte, 1); - hreq_response(dd, HREQ_SUCCESS, data); - refresh_qsfp_cache(ppd, &ppd->qsfp_info); - break; - case HREQ_CONFIG_DONE: hreq_response(dd, HREQ_SUCCESS, 0); break; @@ -6278,7 +6243,6 @@ void handle_8051_request(struct work_struct *work) case HREQ_INTERFACE_TEST: hreq_response(dd, HREQ_SUCCESS, data); break; - default: dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type); hreq_response(dd, HREQ_NOT_SUPPORTED, 0); @@ -6849,6 +6813,75 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd) ppd->neighbor_fm_security = 0; } +static const char * const link_down_reason_strs[] = { + [OPA_LINKDOWN_REASON_NONE] = "None", + [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Recive error 0", + [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length", + [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long", + [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short", + [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID", + [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID", + [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2", + [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC", + [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8", + [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail", + [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10", + [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error", + [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15", + [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker", + [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14", + [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15", + [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance", + [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance", + [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance", + [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack", + [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker", + [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt", + [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit", + 
[OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit", + [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24", + [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25", + [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26", + [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27", + [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28", + [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29", + [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30", + [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] = + "Excessive buffer overrun", + [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown", + [OPA_LINKDOWN_REASON_REBOOT] = "Reboot", + [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown", + [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce", + [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy", + [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy", + [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected", + [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] = + "Local media not installed", + [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed", + [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config", + [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] = + "End to end not installed", + [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy", + [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy", + [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy", + [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management", + [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled", + [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient" +}; + +/* return the neighbor link down reason string */ +static const char *link_down_reason_str(u8 reason) +{ + const char *str = NULL; + + if (reason < ARRAY_SIZE(link_down_reason_strs)) + str = link_down_reason_strs[reason]; + if (!str) + str = "(invalid)"; + + return str; +} + /* * Handle a link down interrupt from the 8051. * @@ -6857,8 +6890,11 @@ static void reset_neighbor_info(struct hfi1_pportdata *ppd) void handle_link_down(struct work_struct *work) { u8 lcl_reason, neigh_reason = 0; + u8 link_down_reason; struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, - link_down_work); + link_down_work); + int was_up; + static const char ldr_str[] = "Link down reason: "; if ((ppd->host_link_state & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) && @@ -6867,20 +6903,63 @@ void handle_link_down(struct work_struct *work) HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED); /* Go offline first, then deal with reading/writing through 8051 */ + was_up = !!(ppd->host_link_state & HLS_UP); set_link_state(ppd, HLS_DN_OFFLINE); - lcl_reason = 0; - read_planned_down_reason_code(ppd->dd, &neigh_reason); + if (was_up) { + lcl_reason = 0; + /* link down reason is only valid if the link was up */ + read_link_down_reason(ppd->dd, &link_down_reason); + switch (link_down_reason) { + case LDR_LINK_TRANSFER_ACTIVE_LOW: + /* the link went down, no idle message reason */ + dd_dev_info(ppd->dd, "%sUnexpected link down\n", + ldr_str); + break; + case LDR_RECEIVED_LINKDOWN_IDLE_MSG: + /* + * The neighbor reason is only valid if an idle message + * was received for it. 
+ */ + read_planned_down_reason_code(ppd->dd, &neigh_reason); + dd_dev_info(ppd->dd, + "%sNeighbor link down message %d, %s\n", + ldr_str, neigh_reason, + link_down_reason_str(neigh_reason)); + break; + case LDR_RECEIVED_HOST_OFFLINE_REQ: + dd_dev_info(ppd->dd, + "%sHost requested link to go offline\n", + ldr_str); + break; + default: + dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n", + ldr_str, link_down_reason); + break; + } - /* - * If no reason, assume peer-initiated but missed - * LinkGoingDown idle flits. - */ - if (neigh_reason == 0) - lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN; + /* + * If no reason, assume peer-initiated but missed + * LinkGoingDown idle flits. + */ + if (neigh_reason == 0) + lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN; + } else { + /* went down while polling or going up */ + lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT; + } set_link_down_reason(ppd, lcl_reason, neigh_reason, 0); + /* inform the SMA when the link transitions from up to down */ + if (was_up && ppd->local_link_down_reason.sma == 0 && + ppd->neigh_link_down_reason.sma == 0) { + ppd->local_link_down_reason.sma = + ppd->local_link_down_reason.latest; + ppd->neigh_link_down_reason.sma = + ppd->neigh_link_down_reason.latest; + } + reset_neighbor_info(ppd); /* disable the port */ @@ -6890,7 +6969,7 @@ void handle_link_down(struct work_struct *work) * If there is no cable attached, turn the DC off. Otherwise, * start the link bring up. */ - if (!qsfp_mod_present(ppd)) { + if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) { dc_shutdown(ppd->dd); } else { tune_serdes(ppd); @@ -7373,7 +7452,11 @@ retry: ppd->link_width_downgrade_rx_active = rx; } - if (lwde == 0) { + if (ppd->link_width_downgrade_tx_active == 0 || + ppd->link_width_downgrade_rx_active == 0) { + /* the 8051 reported a dead link as a downgrade */ + dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n"); + } else if (lwde == 0) { /* downgrade is disabled */ /* bounce if not at starting active width */ @@ -7534,7 +7617,7 @@ static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) host_msg &= ~(u64)LINKUP_ACHIEVED; } if (host_msg & EXT_DEVICE_CFG_REQ) { - queue_work(ppd->hfi1_wq, &ppd->dc_host_req_work); + handle_8051_request(ppd); host_msg &= ~(u64)EXT_DEVICE_CFG_REQ; } if (host_msg & VERIFY_CAP_FRAME) { @@ -8660,6 +8743,14 @@ static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc) *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK; } +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr) +{ + u32 frame; + + read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame); + *ldr = (frame & 0xff); +} + static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx, u8 *tx_polarity_inversion, @@ -9049,9 +9140,9 @@ set_local_link_attributes_fail: } /* - * Call this to start the link. Schedule a retry if the cable is not - * present or if unable to start polling. Do not do anything if the - * link is disabled. Returns 0 if link is disabled or moved to polling + * Call this to start the link. + * Do not do anything if the link is disabled. + * Returns 0 if link is disabled, moved to polling, or the driver is not ready. 
*/ int start_link(struct hfi1_pportdata *ppd) { @@ -9068,15 +9159,7 @@ int start_link(struct hfi1_pportdata *ppd) return 0; } - if (qsfp_mod_present(ppd) || loopback == LOOPBACK_SERDES || - loopback == LOOPBACK_LCB || - ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) - return set_link_state(ppd, HLS_DN_POLL); - - dd_dev_info(ppd->dd, - "%s: stopping link start because no cable is present\n", - __func__); - return -EAGAIN; + return set_link_state(ppd, HLS_DN_POLL); } static void wait_for_qsfp_init(struct hfi1_pportdata *ppd) @@ -9247,7 +9330,7 @@ static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, return 0; } -/* This routine will only be scheduled if the QSFP module is present */ +/* This routine will only be scheduled if the QSFP module present is asserted */ void qsfp_event(struct work_struct *work) { struct qsfp_data *qd; @@ -9676,6 +9759,7 @@ static void set_send_length(struct hfi1_pportdata *ppd) & SEND_LEN_CHECK1_LEN_VL15_MASK) << SEND_LEN_CHECK1_LEN_VL15_SHIFT; int i; + u32 thres; for (i = 0; i < ppd->vls_supported; i++) { if (dd->vld[i].mtu > maxvlmtu) @@ -9694,16 +9778,17 @@ static void set_send_length(struct hfi1_pportdata *ppd) /* adjust kernel credit return thresholds based on new MTUs */ /* all kernel receive contexts have the same hdrqentsize */ for (i = 0; i < ppd->vls_supported; i++) { - sc_set_cr_threshold(dd->vld[i].sc, - sc_mtu_to_threshold(dd->vld[i].sc, - dd->vld[i].mtu, - dd->rcd[0]-> - rcvhdrqentsize)); - } - sc_set_cr_threshold(dd->vld[15].sc, - sc_mtu_to_threshold(dd->vld[15].sc, - dd->vld[15].mtu, + thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50), + sc_mtu_to_threshold(dd->vld[i].sc, + dd->vld[i].mtu, dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->vld[i].sc, thres); + } + thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50), + sc_mtu_to_threshold(dd->vld[15].sc, + dd->vld[15].mtu, + dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->vld[15].sc, thres); /* Adjust maximum MTU for the port in DC */ dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 : @@ -10030,7 +10115,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) struct hfi1_devdata *dd = ppd->dd; struct ib_event event = {.device = NULL}; int ret1, ret = 0; - int was_up, is_down; int orig_new_state, poll_bounce; mutex_lock(&ppd->hls_lock); @@ -10049,8 +10133,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) poll_bounce ? 
"(bounce) " : "", link_state_reason_name(ppd, state)); - was_up = !!(ppd->host_link_state & HLS_UP); - /* * If we're going to a (HLS_*) link state that implies the logical * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then @@ -10261,17 +10343,6 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) break; } - is_down = !!(ppd->host_link_state & (HLS_DN_POLL | - HLS_DN_DISABLE | HLS_DN_OFFLINE)); - - if (was_up && is_down && ppd->local_link_down_reason.sma == 0 && - ppd->neigh_link_down_reason.sma == 0) { - ppd->local_link_down_reason.sma = - ppd->local_link_down_reason.latest; - ppd->neigh_link_down_reason.sma = - ppd->neigh_link_down_reason.latest; - } - goto done; unexpected: @@ -12673,22 +12744,24 @@ static int set_up_context_variables(struct hfi1_devdata *dd) int total_contexts; int ret; unsigned ngroups; + int qos_rmt_count; + int user_rmt_reduced; /* - * Kernel contexts: (to be fixed later): - * - min or 2 or 1 context/numa + * Kernel receive contexts: + * - min of 2 or 1 context/numa (excluding control context) * - Context 0 - control context (VL15/multicast/error) - * - Context 1 - default context + * - Context 1 - first kernel context + * - Context 2 - second kernel context + * ... */ if (n_krcvqs) /* - * Don't count context 0 in n_krcvqs since - * is isn't used for normal verbs traffic. - * - * krcvqs will reflect number of kernel - * receive contexts above 0. + * n_krcvqs is the sum of module parameter kernel receive + * contexts, krcvqs[]. It does not include the control + * context, so add that. */ - num_kernel_contexts = n_krcvqs + MIN_KERNEL_KCTXTS - 1; + num_kernel_contexts = n_krcvqs + 1; else num_kernel_contexts = num_online_nodes() + 1; num_kernel_contexts = @@ -12705,12 +12778,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd) num_kernel_contexts = dd->chip_send_contexts - num_vls - 1; } /* - * User contexts: (to be fixed later) - * - default to 1 user context per CPU if num_user_contexts is - * negative + * User contexts: + * - default to 1 user context per real (non-HT) CPU core if + * num_user_contexts is negative */ if (num_user_contexts < 0) - num_user_contexts = num_online_cpus(); + num_user_contexts = + cpumask_weight(&dd->affinity->real_cpu_mask); total_contexts = num_kernel_contexts + num_user_contexts; @@ -12727,6 +12801,19 @@ static int set_up_context_variables(struct hfi1_devdata *dd) total_contexts = num_kernel_contexts + num_user_contexts; } + /* each user context requires an entry in the RMT */ + qos_rmt_count = qos_rmt_entries(dd, NULL, NULL); + if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) { + user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count; + dd_dev_err(dd, + "RMT size is reducing the number of user receive contexts from %d to %d\n", + (int)num_user_contexts, + user_rmt_reduced); + /* recalculate */ + num_user_contexts = user_rmt_reduced; + total_contexts = num_kernel_contexts + num_user_contexts; + } + /* the first N are kernel contexts, the rest are user contexts */ dd->num_rcv_contexts = total_contexts; dd->n_krcv_queues = num_kernel_contexts; @@ -12776,12 +12863,13 @@ static int set_up_context_variables(struct hfi1_devdata *dd) dd->num_send_contexts = ret; dd_dev_info( dd, - "send contexts: chip %d, used %d (kernel %d, ack %d, user %d)\n", + "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n", dd->chip_send_contexts, dd->num_send_contexts, dd->sc_sizes[SC_KERNEL].count, dd->sc_sizes[SC_ACK].count, - dd->sc_sizes[SC_USER].count); + dd->sc_sizes[SC_USER].count, + 
dd->sc_sizes[SC_VL15].count); ret = 0; /* success */ } @@ -13451,122 +13539,224 @@ static void init_qpmap_table(struct hfi1_devdata *dd, int i; u64 ctxt = first_ctxt; - for (i = 0; i < 256;) { + for (i = 0; i < 256; i++) { reg |= ctxt << (8 * (i % 8)); - i++; ctxt++; if (ctxt > last_ctxt) ctxt = first_ctxt; - if (i % 8 == 0) { + if (i % 8 == 7) { write_csr(dd, regno, reg); reg = 0; regno += 8; } } - if (i % 8) - write_csr(dd, regno, reg); add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK); } -/** - * init_qos - init RX qos - * @dd - device data - * @first_context - * - * This routine initializes Rule 0 and the - * RSM map table to implement qos. - * - * If all of the limit tests succeed, - * qos is applied based on the array - * interpretation of krcvqs where - * entry 0 is VL0. - * - * The number of vl bits (n) and the number of qpn - * bits (m) are computed to feed both the RSM map table - * and the single rule. - * +struct rsm_map_table { + u64 map[NUM_MAP_REGS]; + unsigned int used; +}; + +struct rsm_rule_data { + u8 offset; + u8 pkt_type; + u32 field1_off; + u32 field2_off; + u32 index1_off; + u32 index1_width; + u32 index2_off; + u32 index2_width; + u32 mask1; + u32 value1; + u32 mask2; + u32 value2; +}; + +/* + * Return an initialized RMT map table for users to fill in. OK if it + * returns NULL, indicating no table. */ -static void init_qos(struct hfi1_devdata *dd, u32 first_ctxt) +static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd) { + struct rsm_map_table *rmt; + u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */ + + rmt = kmalloc(sizeof(*rmt), GFP_KERNEL); + if (rmt) { + memset(rmt->map, rxcontext, sizeof(rmt->map)); + rmt->used = 0; + } + + return rmt; +} + +/* + * Write the final RMT map table to the chip and free the table. OK if + * table is NULL. + */ +static void complete_rsm_map_table(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + int i; + + if (rmt) { + /* write table to chip */ + for (i = 0; i < NUM_MAP_REGS; i++) + write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]); + + /* enable RSM */ + add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); + } +} + +/* + * Add a receive side mapping rule. + */ +static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index, + struct rsm_rule_data *rrd) +{ + write_csr(dd, RCV_RSM_CFG + (8 * rule_index), + (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT | + 1ull << rule_index | /* enable bit */ + (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT); + write_csr(dd, RCV_RSM_SELECT + (8 * rule_index), + (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | + (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | + (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | + (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | + (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | + (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); + write_csr(dd, RCV_RSM_MATCH + (8 * rule_index), + (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT | + (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT | + (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT | + (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT); +} + +/* return the number of RSM map table entries that will be used for QOS */ +static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, + unsigned int *np) +{ + int i; + unsigned int m, n; u8 max_by_vl = 0; - unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; - u64 *rsmmap; - u64 reg; - u8 rxcontext = is_ax(dd) ? 
0 : 0xff; /* 0 is default if a0 ver. */ - /* validate */ + /* is QOS active at all? */ if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS || num_vls == 1 || krcvqsset <= 1) - goto bail; - for (i = 0; i < min_t(unsigned, num_vls, krcvqsset); i++) + goto no_qos; + + /* determine bits for qpn */ + for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++) if (krcvqs[i] > max_by_vl) max_by_vl = krcvqs[i]; if (max_by_vl > 32) - goto bail; - qpns_per_vl = __roundup_pow_of_two(max_by_vl); - /* determine bits vl */ - n = ilog2(num_vls); - /* determine bits for qpn */ - m = ilog2(qpns_per_vl); + goto no_qos; + m = ilog2(__roundup_pow_of_two(max_by_vl)); + + /* determine bits for vl */ + n = ilog2(__roundup_pow_of_two(num_vls)); + + /* reject if too much is used */ if ((m + n) > 7) + goto no_qos; + + if (mp) + *mp = m; + if (np) + *np = n; + + return 1 << (m + n); + +no_qos: + if (mp) + *mp = 0; + if (np) + *np = 0; + return 0; +} + +/** + * init_qos - init RX qos + * @dd - device data + * @rmt - RSM map table + * + * This routine initializes Rule 0 and the RSM map table to implement + * quality of service (qos). + * + * If all of the limit tests succeed, qos is applied based on the array + * interpretation of krcvqs where entry 0 is VL0. + * + * The number of vl bits (n) and the number of qpn bits (m) are computed to + * feed both the RSM map table and the single rule. + */ +static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) +{ + struct rsm_rule_data rrd; + unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; + unsigned int rmt_entries; + u64 reg; + + if (!rmt) goto bail; - if (num_vls * qpns_per_vl > dd->chip_rcv_contexts) + rmt_entries = qos_rmt_entries(dd, &m, &n); + if (rmt_entries == 0) goto bail; - rsmmap = kmalloc_array(NUM_MAP_REGS, sizeof(u64), GFP_KERNEL); - if (!rsmmap) + qpns_per_vl = 1 << m; + + /* enough room in the map table? 
*/ + rmt_entries = 1 << (m + n); + if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES) goto bail; - memset(rsmmap, rxcontext, NUM_MAP_REGS * sizeof(u64)); - /* init the local copy of the table */ - for (i = 0, ctxt = first_ctxt; i < num_vls; i++) { + + /* add qos entries to the the RSM map table */ + for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) { unsigned tctxt; for (qpn = 0, tctxt = ctxt; krcvqs[i] && qpn < qpns_per_vl; qpn++) { unsigned idx, regoff, regidx; - /* generate index <= 128 */ - idx = (qpn << n) ^ i; + /* generate the index the hardware will produce */ + idx = rmt->used + ((qpn << n) ^ i); regoff = (idx % 8) * 8; regidx = idx / 8; - reg = rsmmap[regidx]; - /* replace 0xff with context number */ + /* replace default with context number */ + reg = rmt->map[regidx]; reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff); reg |= (u64)(tctxt++) << regoff; - rsmmap[regidx] = reg; + rmt->map[regidx] = reg; if (tctxt == ctxt + krcvqs[i]) tctxt = ctxt; } ctxt += krcvqs[i]; } - /* flush cached copies to chip */ - for (i = 0; i < NUM_MAP_REGS; i++) - write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rsmmap[i]); - /* add rule0 */ - write_csr(dd, RCV_RSM_CFG /* + (8 * 0) */, - RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK << - RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT | - 2ull << RCV_RSM_CFG_PACKET_TYPE_SHIFT); - write_csr(dd, RCV_RSM_SELECT /* + (8 * 0) */, - LRH_BTH_MATCH_OFFSET << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | - LRH_SC_MATCH_OFFSET << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | - LRH_SC_SELECT_OFFSET << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | - ((u64)n) << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | - QPN_SELECT_OFFSET << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | - ((u64)m + (u64)n) << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); - write_csr(dd, RCV_RSM_MATCH /* + (8 * 0) */, - LRH_BTH_MASK << RCV_RSM_MATCH_MASK1_SHIFT | - LRH_BTH_VALUE << RCV_RSM_MATCH_VALUE1_SHIFT | - LRH_SC_MASK << RCV_RSM_MATCH_MASK2_SHIFT | - LRH_SC_VALUE << RCV_RSM_MATCH_VALUE2_SHIFT); - /* Enable RSM */ - add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); - kfree(rsmmap); - /* map everything else to first context */ - init_qpmap_table(dd, FIRST_KERNEL_KCTXT, MIN_KERNEL_KCTXTS - 1); + + rrd.offset = rmt->used; + rrd.pkt_type = 2; + rrd.field1_off = LRH_BTH_MATCH_OFFSET; + rrd.field2_off = LRH_SC_MATCH_OFFSET; + rrd.index1_off = LRH_SC_SELECT_OFFSET; + rrd.index1_width = n; + rrd.index2_off = QPN_SELECT_OFFSET; + rrd.index2_width = m + n; + rrd.mask1 = LRH_BTH_MASK; + rrd.value1 = LRH_BTH_VALUE; + rrd.mask2 = LRH_SC_MASK; + rrd.value2 = LRH_SC_VALUE; + + /* add rule 0 */ + add_rsm_rule(dd, 0, &rrd); + + /* mark RSM map entries as used */ + rmt->used += rmt_entries; + /* map everything else to the mcast/err/vl15 context */ + init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT); dd->qos_shift = n + 1; return; bail: @@ -13574,13 +13764,86 @@ bail: init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1); } +static void init_user_fecn_handling(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + struct rsm_rule_data rrd; + u64 reg; + int i, idx, regoff, regidx; + u8 offset; + + /* there needs to be enough room in the map table */ + if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) { + dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n"); + return; + } + + /* + * RSM will extract the destination context as an index into the + * map table. The destination contexts are a sequential block + * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive). 
+ * Map entries are accessed as offset + extracted value. Adjust + * the added offset so this sequence can be placed anywhere in + * the table - as long as the entries themselves do not wrap. + * There are only enough bits in offset for the table size, so + * start with that to allow for a "negative" offset. + */ + offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used - + (int)dd->first_user_ctxt); + + for (i = dd->first_user_ctxt, idx = rmt->used; + i < dd->num_rcv_contexts; i++, idx++) { + /* replace with identity mapping */ + regoff = (idx % 8) * 8; + regidx = idx / 8; + reg = rmt->map[regidx]; + reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff); + reg |= (u64)i << regoff; + rmt->map[regidx] = reg; + } + + /* + * For RSM intercept of Expected FECN packets: + * o packet type 0 - expected + * o match on F (bit 95), using select/match 1, and + * o match on SH (bit 133), using select/match 2. + * + * Use index 1 to extract the 8-bit receive context from DestQP + * (start at bit 64). Use that as the RSM map table index. + */ + rrd.offset = offset; + rrd.pkt_type = 0; + rrd.field1_off = 95; + rrd.field2_off = 133; + rrd.index1_off = 64; + rrd.index1_width = 8; + rrd.index2_off = 0; + rrd.index2_width = 0; + rrd.mask1 = 1; + rrd.value1 = 1; + rrd.mask2 = 1; + rrd.value2 = 1; + + /* add rule 1 */ + add_rsm_rule(dd, 1, &rrd); + + rmt->used += dd->num_user_contexts; +} + static void init_rxe(struct hfi1_devdata *dd) { + struct rsm_map_table *rmt; + /* enable all receive errors */ write_csr(dd, RCV_ERR_MASK, ~0ull); - /* setup QPN map table - start where VL15 context leaves off */ - init_qos(dd, dd->n_krcv_queues > MIN_KERNEL_KCTXTS ? - MIN_KERNEL_KCTXTS : 0); + + rmt = alloc_rsm_map_table(dd); + /* set up QOS, including the QPN map table */ + init_qos(dd, rmt); + init_user_fecn_handling(dd, rmt); + complete_rsm_map_table(dd, rmt); + kfree(rmt); + /* * make sure RcvCtrl.RcvWcb <= PCIe Device Control * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config @@ -13762,6 +14025,7 @@ int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey) write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg); reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; + reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK; write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); done: return ret; @@ -14148,6 +14412,19 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, (dd->revision >> CCE_REVISION_SW_SHIFT) & CCE_REVISION_SW_MASK); + /* + * The real cpu mask is part of the affinity struct but has to be + * initialized earlier than the rest of the affinity struct because it + * is needed to calculate the number of user contexts in + * set_up_context_variables(). However, hfi1_dev_affinity_init(), + * which initializes the rest of the affinity struct members, + * depends on set_up_context_variables() for the number of kernel + * contexts, so it cannot be called before set_up_context_variables(). 
+ */ + ret = init_real_cpu_mask(dd); + if (ret) + goto bail_cleanup; + ret = set_up_context_variables(dd); if (ret) goto bail_cleanup; @@ -14161,9 +14438,7 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, /* set up KDETH QP prefix in both RX and TX CSRs */ init_kdeth_qp(dd); - ret = hfi1_dev_affinity_init(dd); - if (ret) - goto bail_cleanup; + hfi1_dev_affinity_init(dd); /* send contexts must be set up before receive contexts */ ret = init_send_contexts(dd); diff --git a/drivers/staging/rdma/hfi1/chip.h b/drivers/staging/rdma/hfi1/chip.h index 4f3b878e43eb..1948706fff1a 100644 --- a/drivers/staging/rdma/hfi1/chip.h +++ b/drivers/staging/rdma/hfi1/chip.h @@ -389,6 +389,7 @@ #define LAST_REMOTE_STATE_COMPLETE 0x13 #define LINK_QUALITY_INFO 0x14 #define REMOTE_DEVICE_ID 0x15 +#define LINK_DOWN_REASON 0x16 /* 8051 lane specific register field IDs */ #define TX_EQ_SETTINGS 0x00 @@ -497,6 +498,11 @@ #define PWRM_BER_CONTROL 0x1 #define PWRM_BANDWIDTH_CONTROL 0x2 +/* 8051 link down reasons */ +#define LDR_LINK_TRANSFER_ACTIVE_LOW 0xa +#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb +#define LDR_RECEIVED_HOST_OFFLINE_REQ 0xc + /* verify capability fabric CRC size bits */ enum { CAP_CRC_14B = (1 << 0), /* 14b CRC */ @@ -691,7 +697,6 @@ void handle_verify_cap(struct work_struct *work); void handle_freeze(struct work_struct *work); void handle_link_up(struct work_struct *work); void handle_link_down(struct work_struct *work); -void handle_8051_request(struct work_struct *work); void handle_link_downgrade(struct work_struct *work); void handle_link_bounce(struct work_struct *work); void handle_sma_message(struct work_struct *work); diff --git a/drivers/staging/rdma/hfi1/chip_registers.h b/drivers/staging/rdma/hfi1/chip_registers.h index 770f05c9b8de..8744de6667c2 100644 --- a/drivers/staging/rdma/hfi1/chip_registers.h +++ b/drivers/staging/rdma/hfi1/chip_registers.h @@ -771,6 +771,7 @@ #define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull #define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0 #define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60 +#define RCV_RSM_CFG_OFFSET_SHIFT 32 #define RCV_RSM_MAP_TABLE (RXE + 0x000000000900) #define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull #define RCV_RSM_MATCH (RXE + 0x000000000800) diff --git a/drivers/staging/rdma/hfi1/diag.c b/drivers/staging/rdma/hfi1/diag.c index c5b520bf610e..bb2409ad891a 100644 --- a/drivers/staging/rdma/hfi1/diag.c +++ b/drivers/staging/rdma/hfi1/diag.c @@ -413,7 +413,8 @@ static ssize_t diagpkt_send(struct diag_pkt *dp) goto bail; } /* can only use kernel contexts */ - if (dd->send_contexts[dp->sw_index].type != SC_KERNEL) { + if (dd->send_contexts[dp->sw_index].type != SC_KERNEL && + dd->send_contexts[dp->sw_index].type != SC_VL15) { ret = -EINVAL; goto bail; } diff --git a/drivers/staging/rdma/hfi1/driver.c b/drivers/staging/rdma/hfi1/driver.c index 34511e5df1d5..700c6fa3a633 100644 --- a/drivers/staging/rdma/hfi1/driver.c +++ b/drivers/staging/rdma/hfi1/driver.c @@ -75,7 +75,8 @@ DEFINE_MUTEX(hfi1_mutex); /* general driver use */ unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO); -MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is 8192"); +MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify( + HFI1_DEFAULT_MAX_MTU)); unsigned int hfi1_cu = 1; module_param_named(cu, hfi1_cu, uint, S_IRUGO); diff --git a/drivers/staging/rdma/hfi1/firmware.c b/drivers/staging/rdma/hfi1/firmware.c index 3040162cb326..ed680fda611d 100644 --- a/drivers/staging/rdma/hfi1/firmware.c +++ 
b/drivers/staging/rdma/hfi1/firmware.c @@ -1413,8 +1413,15 @@ static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource) if (resource & CR_DYN_MASK) { /* a dynamic resource is in use if either HFI has set the bit */ - all_bits = resource_mask(0, resource) | + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 && + (resource & (CR_I2C1 | CR_I2C2))) { + /* discrete devices must serialize across both chains */ + all_bits = resource_mask(0, CR_I2C1 | CR_I2C2) | + resource_mask(1, CR_I2C1 | CR_I2C2); + } else { + all_bits = resource_mask(0, resource) | resource_mask(1, resource); + } my_bit = resource_mask(dd->hfi1_id, resource); } else { /* non-dynamic resources are not split between HFIs */ diff --git a/drivers/staging/rdma/hfi1/hfi.h b/drivers/staging/rdma/hfi1/hfi.h index 16cbdc4073e0..7b78d56de7f5 100644 --- a/drivers/staging/rdma/hfi1/hfi.h +++ b/drivers/staging/rdma/hfi1/hfi.h @@ -455,9 +455,9 @@ struct rvt_sge_state; #define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE) /* use this MTU size if none other is given */ -#define HFI1_DEFAULT_ACTIVE_MTU 8192 +#define HFI1_DEFAULT_ACTIVE_MTU 10240 /* use this MTU size as the default maximum */ -#define HFI1_DEFAULT_MAX_MTU 8192 +#define HFI1_DEFAULT_MAX_MTU 10240 /* default partition key */ #define DEFAULT_PKEY 0xffff @@ -606,7 +606,6 @@ struct hfi1_pportdata { struct work_struct link_vc_work; struct work_struct link_up_work; struct work_struct link_down_work; - struct work_struct dc_host_req_work; struct work_struct sma_message_work; struct work_struct freeze_work; struct work_struct link_downgrade_work; @@ -1258,7 +1257,7 @@ void receive_interrupt_work(struct work_struct *work); static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf) { return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) | - ((!!(rhf & RHF_DC_INFO_MASK)) << 4); + ((!!(rhf & RHF_DC_INFO_SMASK)) << 4); } static inline u16 generate_jkey(kuid_t uid) @@ -1333,6 +1332,9 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, u32 pkey, u32 slid, u32 dlid, u8 sc5, const struct ib_grh *old_grh); +#define PKEY_CHECK_INVALID -1 +int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth, + u8 sc5, int8_t s_pkey_index); #define PACKET_EGRESS_TIMEOUT 350 static inline void pause_for_credit_return(struct hfi1_devdata *dd) @@ -1776,6 +1778,7 @@ extern struct mutex hfi1_mutex; #define HFI1_PKT_USER_SC_INTEGRITY \ (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK \ | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK \ | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK) diff --git a/drivers/staging/rdma/hfi1/init.c b/drivers/staging/rdma/hfi1/init.c index cfcdc16b41c3..502b7cf4647d 100644 --- a/drivers/staging/rdma/hfi1/init.c +++ b/drivers/staging/rdma/hfi1/init.c @@ -422,9 +422,10 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) struct cca_timer *cca_timer; struct hfi1_pportdata *ppd; int sl; - u16 ccti, ccti_timer, ccti_min; + u16 ccti_timer, ccti_min; struct cc_state *cc_state; unsigned long flags; + enum hrtimer_restart ret = HRTIMER_NORESTART; cca_timer = container_of(t, struct cca_timer, hrtimer); ppd = cca_timer->ppd; @@ -450,24 +451,21 @@ static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) spin_lock_irqsave(&ppd->cca_timer_lock, flags); - ccti = cca_timer->ccti; - - if (ccti > ccti_min) { + if (cca_timer->ccti > ccti_min) { cca_timer->ccti--; set_link_ipg(ppd); } - 
spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); - - rcu_read_unlock(); - - if (ccti > ccti_min) { + if (cca_timer->ccti > ccti_min) { unsigned long nsec = 1024 * ccti_timer; /* ccti_timer is in units of 1.024 usec */ hrtimer_forward_now(t, ns_to_ktime(nsec)); - return HRTIMER_RESTART; + ret = HRTIMER_RESTART; } - return HRTIMER_NORESTART; + + spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); + rcu_read_unlock(); + return ret; } /* @@ -496,7 +494,6 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, INIT_WORK(&ppd->link_vc_work, handle_verify_cap); INIT_WORK(&ppd->link_up_work, handle_link_up); INIT_WORK(&ppd->link_down_work, handle_link_down); - INIT_WORK(&ppd->dc_host_req_work, handle_8051_request); INIT_WORK(&ppd->freeze_work, handle_freeze); INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade); INIT_WORK(&ppd->sma_message_work, handle_sma_message); @@ -1007,7 +1004,7 @@ void hfi1_free_devdata(struct hfi1_devdata *dd) free_percpu(dd->rcv_limit); hfi1_dev_affinity_free(dd); free_percpu(dd->send_schedule); - ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); + rvt_dealloc_device(&dd->verbs_dev.rdi); } /* @@ -1110,7 +1107,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) bail: if (!list_empty(&dd->list)) list_del_init(&dd->list); - ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); + rvt_dealloc_device(&dd->verbs_dev.rdi); return ERR_PTR(ret); } diff --git a/drivers/staging/rdma/hfi1/mad.c b/drivers/staging/rdma/hfi1/mad.c index d1e7f4d7cf6f..ed58cf21e790 100644 --- a/drivers/staging/rdma/hfi1/mad.c +++ b/drivers/staging/rdma/hfi1/mad.c @@ -999,7 +999,21 @@ static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, break; } - set_link_state(ppd, link_state); + if ((link_state == HLS_DN_POLL || + link_state == HLS_DN_DOWNDEF)) { + /* + * Going to poll. No matter what the current state, + * always move offline first, then tune and start the + * link. This correctly handles a FM link bounce and + * a link enable. Going offline is a no-op if already + * offline. + */ + set_link_state(ppd, HLS_DN_OFFLINE); + tune_serdes(ppd); + start_link(ppd); + } else { + set_link_state(ppd, link_state); + } if (link_state == HLS_DN_DISABLE && (ppd->offline_disabled_reason > HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) || diff --git a/drivers/staging/rdma/hfi1/mmu_rb.c b/drivers/staging/rdma/hfi1/mmu_rb.c index b3f0682a36c9..2b0e91d3093d 100644 --- a/drivers/staging/rdma/hfi1/mmu_rb.c +++ b/drivers/staging/rdma/hfi1/mmu_rb.c @@ -91,7 +91,7 @@ static unsigned long mmu_node_start(struct mmu_rb_node *node) static unsigned long mmu_node_last(struct mmu_rb_node *node) { - return PAGE_ALIGN((node->addr & PAGE_MASK) + node->len) - 1; + return PAGE_ALIGN(node->addr + node->len) - 1; } int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops) @@ -126,10 +126,15 @@ void hfi1_mmu_rb_unregister(struct rb_root *root) if (!handler) return; + /* Unregister first so we don't get any more notifications. 
*/ + if (current->mm) + mmu_notifier_unregister(&handler->mn, current->mm); + spin_lock_irqsave(&mmu_rb_lock, flags); list_del(&handler->list); spin_unlock_irqrestore(&mmu_rb_lock, flags); + spin_lock_irqsave(&handler->lock, flags); if (!RB_EMPTY_ROOT(root)) { struct rb_node *node; struct mmu_rb_node *rbnode; @@ -141,9 +146,8 @@ void hfi1_mmu_rb_unregister(struct rb_root *root) handler->ops->remove(root, rbnode, NULL); } } + spin_unlock_irqrestore(&handler->lock, flags); - if (current->mm) - mmu_notifier_unregister(&handler->mn, current->mm); kfree(handler); } @@ -235,6 +239,25 @@ struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr, return node; } +struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root, + unsigned long addr, unsigned long len) +{ + struct mmu_rb_handler *handler = find_mmu_handler(root); + struct mmu_rb_node *node; + unsigned long flags; + + if (!handler) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&handler->lock, flags); + node = __mmu_rb_search(handler, addr, len); + if (node) + __mmu_int_rb_remove(node, handler->root); + spin_unlock_irqrestore(&handler->lock, flags); + + return node; +} + void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node) { struct mmu_rb_handler *handler = find_mmu_handler(root); @@ -293,9 +316,9 @@ static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u", node->addr, node->len); if (handler->ops->invalidate(root, node)) { - spin_unlock_irqrestore(&handler->lock, flags); - __mmu_rb_remove(handler, node, mm); - spin_lock_irqsave(&handler->lock, flags); + __mmu_int_rb_remove(node, root); + if (handler->ops->remove) + handler->ops->remove(root, node, mm); } } spin_unlock_irqrestore(&handler->lock, flags); diff --git a/drivers/staging/rdma/hfi1/mmu_rb.h b/drivers/staging/rdma/hfi1/mmu_rb.h index 19a306e83c7d..7a57b9c49d27 100644 --- a/drivers/staging/rdma/hfi1/mmu_rb.h +++ b/drivers/staging/rdma/hfi1/mmu_rb.h @@ -70,5 +70,7 @@ int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *); struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long, unsigned long); +struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long, + unsigned long); #endif /* _HFI1_MMU_RB_H */ diff --git a/drivers/staging/rdma/hfi1/pio.c b/drivers/staging/rdma/hfi1/pio.c index c6849ce9e5eb..c67b9ad3fcf4 100644 --- a/drivers/staging/rdma/hfi1/pio.c +++ b/drivers/staging/rdma/hfi1/pio.c @@ -139,23 +139,30 @@ void pio_send_control(struct hfi1_devdata *dd, int op) /* Send Context Size (SCS) wildcards */ #define SCS_POOL_0 -1 #define SCS_POOL_1 -2 + /* Send Context Count (SCC) wildcards */ #define SCC_PER_VL -1 #define SCC_PER_CPU -2 - #define SCC_PER_KRCVQ -3 -#define SCC_ACK_CREDITS 32 + +/* Send Context Size (SCS) constants */ +#define SCS_ACK_CREDITS 32 +#define SCS_VL15_CREDITS 102 /* 3 pkts of 2048B data + 128B header */ + +#define PIO_THRESHOLD_CEILING 4096 #define PIO_WAIT_BATCH_SIZE 5 /* default send context sizes */ static struct sc_config_sizes sc_config_sizes[SC_MAX] = { [SC_KERNEL] = { .size = SCS_POOL_0, /* even divide, pool 0 */ - .count = SCC_PER_VL },/* one per NUMA */ - [SC_ACK] = { .size = SCC_ACK_CREDITS, + .count = SCC_PER_VL }, /* one per NUMA */ + [SC_ACK] = { .size = SCS_ACK_CREDITS, .count = SCC_PER_KRCVQ }, [SC_USER] = { .size = SCS_POOL_0, /* even divide, pool 0 */ .count = SCC_PER_CPU }, /* one per CPU */ + [SC_VL15] = { .size = SCS_VL15_CREDITS, + 
.count = 1 }, }; @@ -202,7 +209,8 @@ static int wildcard_to_pool(int wc) static const char *sc_type_names[SC_MAX] = { "kernel", "ack", - "user" + "user", + "vl15" }; static const char *sc_type_name(int index) @@ -231,6 +239,22 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd) int i; /* + * When SDMA is enabled, kernel context pio packet size is capped by + * "piothreshold". Reduce pio buffer allocation for kernel context by + * setting it to a fixed size. The allocation allows 3-deep buffering + * of the largest pio packets plus up to 128 bytes header, sufficient + * to maintain verbs performance. + * + * When SDMA is disabled, keep the default pooling allocation. + */ + if (HFI1_CAP_IS_KSET(SDMA)) { + u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ? + piothreshold : PIO_THRESHOLD_CEILING; + sc_config_sizes[SC_KERNEL].size = + 3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE; + } + + /* * Step 0: * - copy the centipercents/absolute sizes from the pool config * - sanity check these values @@ -311,7 +335,7 @@ int init_sc_pools_and_sizes(struct hfi1_devdata *dd) if (i == SC_ACK) { count = dd->n_krcv_queues; } else if (i == SC_KERNEL) { - count = (INIT_SC_PER_VL * num_vls) + 1 /* VL15 */; + count = INIT_SC_PER_VL * num_vls; } else if (count == SCC_PER_CPU) { count = dd->num_rcv_contexts - dd->n_krcv_queues; } else if (count < 0) { @@ -596,7 +620,7 @@ u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize) * Return value is what to write into the CSR: trigger return when * unreturned credits pass this count. */ -static u32 sc_percent_to_threshold(struct send_context *sc, u32 percent) +u32 sc_percent_to_threshold(struct send_context *sc, u32 percent) { return (sc->credits * percent) / 100; } @@ -790,7 +814,10 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, * For Ack contexts, set a threshold for half the credits. * For User contexts use the given percentage. This has been * sanitized on driver start-up. - * For Kernel contexts, use the default MTU plus a header. + * For Kernel contexts, use the default MTU plus a header + * or half the credits, whichever is smaller. This should + * work for both the 3-deep buffering allocation and the + * pooling allocation. 
*/ if (type == SC_ACK) { thresh = sc_percent_to_threshold(sc, 50); @@ -798,7 +825,9 @@ struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, thresh = sc_percent_to_threshold(sc, user_credit_return_threshold); } else { /* kernel */ - thresh = sc_mtu_to_threshold(sc, hfi1_max_mtu, hdrqentsize); + thresh = min(sc_percent_to_threshold(sc, 50), + sc_mtu_to_threshold(sc, hfi1_max_mtu, + hdrqentsize)); } reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT); /* add in early return */ @@ -1531,7 +1560,8 @@ static void sc_piobufavail(struct send_context *sc) unsigned long flags; unsigned i, n = 0; - if (dd->send_contexts[sc->sw_index].type != SC_KERNEL) + if (dd->send_contexts[sc->sw_index].type != SC_KERNEL && + dd->send_contexts[sc->sw_index].type != SC_VL15) return; list = &sc->piowait; /* @@ -1900,7 +1930,7 @@ int init_pervl_scs(struct hfi1_devdata *dd) u32 ctxt; struct hfi1_pportdata *ppd = dd->pport; - dd->vld[15].sc = sc_alloc(dd, SC_KERNEL, + dd->vld[15].sc = sc_alloc(dd, SC_VL15, dd->rcd[0]->rcvhdrqentsize, dd->node); if (!dd->vld[15].sc) goto nomem; diff --git a/drivers/staging/rdma/hfi1/pio.h b/drivers/staging/rdma/hfi1/pio.h index 0026976ce4f6..53a08edb7f64 100644 --- a/drivers/staging/rdma/hfi1/pio.h +++ b/drivers/staging/rdma/hfi1/pio.h @@ -51,7 +51,8 @@ #define SC_KERNEL 0 #define SC_ACK 1 #define SC_USER 2 -#define SC_MAX 3 +#define SC_VL15 3 +#define SC_MAX 4 /* invalid send context index */ #define INVALID_SCI 0xff @@ -293,6 +294,7 @@ void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context); void sc_add_credit_return_intr(struct send_context *sc); void sc_del_credit_return_intr(struct send_context *sc); void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold); +u32 sc_percent_to_threshold(struct send_context *sc, u32 percent); u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize); void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint); void sc_wait(struct hfi1_devdata *dd); diff --git a/drivers/staging/rdma/hfi1/platform.c b/drivers/staging/rdma/hfi1/platform.c index 0a1d074583e4..8fe8a205b5bb 100644 --- a/drivers/staging/rdma/hfi1/platform.c +++ b/drivers/staging/rdma/hfi1/platform.c @@ -114,21 +114,11 @@ static int qual_power(struct hfi1_pportdata *ppd) if (ret) return ret; - if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4) - cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]); - else - cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]); + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); - if (cable_power_class <= 3 && cable_power_class > (power_class_max - 1)) - ppd->offline_disabled_reason = - HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY); - else if (cable_power_class > 4 && cable_power_class > (power_class_max)) + if (cable_power_class > power_class_max) ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY); - /* - * cable_power_class will never have value 4 as this simply - * means the high power settings are unused - */ if (ppd->offline_disabled_reason == HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) { @@ -173,12 +163,9 @@ static int set_qsfp_high_power(struct hfi1_pportdata *ppd) u8 *cache = ppd->qsfp_info.cache; int ret; - if (QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]) != 4) - cable_power_class = QSFP_HIGH_PWR(cache[QSFP_MOD_PWR_OFFS]); - else - cable_power_class = QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]); + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); - if (cable_power_class) { + if (cable_power_class > QSFP_POWER_CLASS_1) { 
 		power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS];
 		power_ctrl_byte |= 1;

@@ -190,8 +177,7 @@ static int set_qsfp_high_power(struct hfi1_pportdata *ppd)
 		if (ret != 1)
 			return -EIO;

-		if (cable_power_class > 3) {
-			/* > power class 4*/
+		if (cable_power_class > QSFP_POWER_CLASS_4) {
 			power_ctrl_byte |= (1 << 2);
 			ret = qsfp_write(ppd, ppd->dd->hfi1_id,
 					 QSFP_PWR_CTRL_BYTE_OFFS,
@@ -212,12 +198,21 @@ static void apply_rx_cdr(struct hfi1_pportdata *ppd,
 {
 	u32 rx_preset;
 	u8 *cache = ppd->qsfp_info.cache;
+	int cable_power_class;

 	if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) &&
 	      (cache[QSFP_CDR_INFO_OFFS] & 0x40)))
 		return;

-	/* rx_preset preset to zero to catch error */
+	/* RX CDR present, bypass supported */
+	cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+	if (cable_power_class <= QSFP_POWER_CLASS_3) {
+		/* Power class <= 3, ignore config & turn RX CDR on */
+		*cdr_ctrl_byte |= 0xF;
+		return;
+	}
+
 	get_platform_config_field(
 		ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE,
 		rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY,
@@ -250,15 +245,25 @@ static void apply_rx_cdr(struct hfi1_pportdata *ppd,

 static void apply_tx_cdr(struct hfi1_pportdata *ppd,
 			 u32 tx_preset_index,
-			 u8 *ctr_ctrl_byte)
+			 u8 *cdr_ctrl_byte)
 {
 	u32 tx_preset;
 	u8 *cache = ppd->qsfp_info.cache;
+	int cable_power_class;

 	if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) &&
 	      (cache[QSFP_CDR_INFO_OFFS] & 0x80)))
 		return;

+	/* TX CDR present, bypass supported */
+	cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
+
+	if (cable_power_class <= QSFP_POWER_CLASS_3) {
+		/* Power class <= 3, ignore config & turn TX CDR on */
+		*cdr_ctrl_byte |= 0xF0;
+		return;
+	}
+
 	get_platform_config_field(
 		ppd->dd,
 		PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index,
@@ -282,10 +287,10 @@ static void apply_tx_cdr(struct hfi1_pportdata *ppd,
 			(tx_preset << 2) | (tx_preset << 3));

 	if (tx_preset)
-		*ctr_ctrl_byte |= (tx_preset << 4);
+		*cdr_ctrl_byte |= (tx_preset << 4);
 	else /* Preserve current/determined RX CDR status */
-		*ctr_ctrl_byte &= ((tx_preset << 4) | 0xF);
+		*cdr_ctrl_byte &= ((tx_preset << 4) | 0xF);
 }

 static void apply_cdr_settings(
@@ -598,6 +603,7 @@ static void apply_tunings(
 			    "Applying TX settings");
 }

+/* Must be holding the QSFP i2c resource */
 static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
 			    u32 *ptr_rx_preset, u32 *ptr_total_atten)
 {
@@ -605,26 +611,19 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
 	u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled;
 	u8 *cache = ppd->qsfp_info.cache;

-	ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT);
-	if (ret) {
-		dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n",
-			   __func__, (int)ppd->dd->hfi1_id);
-		return ret;
-	}
-
 	ppd->qsfp_info.limiting_active = 1;

 	ret = set_qsfp_tx(ppd, 0);
 	if (ret)
-		goto bail_unlock;
+		return ret;

 	ret = qual_power(ppd);
 	if (ret)
-		goto bail_unlock;
+		return ret;

 	ret = qual_bitrate(ppd);
 	if (ret)
-		goto bail_unlock;
+		return ret;

 	if (ppd->qsfp_info.reset_needed) {
 		reset_qsfp(ppd);
@@ -636,7 +635,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,

 	ret = set_qsfp_high_power(ppd);
 	if (ret)
-		goto bail_unlock;
+		return ret;

 	if (cache[QSFP_EQ_INFO_OFFS] & 0x4) {
 		ret = get_platform_config_field(
@@ -646,7 +645,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
 			ptr_tx_preset, 4);
 		if (ret) {
 			*ptr_tx_preset = OPA_INVALID_INDEX;
-			goto bail_unlock;
+			return ret;
 		}
 	} else {
 		ret = get_platform_config_field(
@@ -656,7 +655,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
 			ptr_tx_preset, 4);
 		if (ret) {
 			*ptr_tx_preset = OPA_INVALID_INDEX;
-			goto bail_unlock;
+			return ret;
 		}
 	}

@@ -665,7 +664,7 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,
 		PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4);
 	if (ret) {
 		*ptr_rx_preset = OPA_INVALID_INDEX;
-		goto bail_unlock;
+		return ret;
 	}

 	if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G))
@@ -685,8 +684,6 @@ static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset,

 	ret = set_qsfp_tx(ppd, 1);

-bail_unlock:
-	release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
 	return ret;
 }

@@ -833,12 +830,22 @@ void tune_serdes(struct hfi1_pportdata *ppd)
 			total_atten = platform_atten + remote_atten;

 			tuning_method = OPA_PASSIVE_TUNING;
-		} else
+		} else {
 			ppd->offline_disabled_reason =
 			   HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG);
+			goto bail;
+		}

 		break;
 	case PORT_TYPE_QSFP:
 		if (qsfp_mod_present(ppd)) {
+			ret = acquire_chip_resource(ppd->dd,
+						    qsfp_resource(ppd->dd),
+						    QSFP_WAIT);
+			if (ret) {
+				dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n",
+					   __func__, (int)ppd->dd->hfi1_id);
+				goto bail;
+			}
 			refresh_qsfp_cache(ppd, &ppd->qsfp_info);

 			if (ppd->qsfp_info.cache_valid) {
@@ -853,21 +860,23 @@ void tune_serdes(struct hfi1_pportdata *ppd)
 				 * update the cache to reflect the changes
 				 */
 				refresh_qsfp_cache(ppd, &ppd->qsfp_info);
-				if (ret)
-					goto bail;
-
 				limiting_active = ppd->qsfp_info.limiting_active;
 			} else {
 				dd_dev_err(dd,
 					   "%s: Reading QSFP memory failed\n",
 					   __func__);
-				goto bail;
+				ret = -EINVAL; /* a fail indication */
 			}
-		} else
+			release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
+			if (ret)
+				goto bail;
+		} else {
 			ppd->offline_disabled_reason =
 			   HFI1_ODR_MASK(
 			   OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED);
+			goto bail;
+		}
 		break;
 	default:
 		dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__);
diff --git a/drivers/staging/rdma/hfi1/qp.c b/drivers/staging/rdma/hfi1/qp.c
index dc9119e1b458..91eb42316df9 100644
--- a/drivers/staging/rdma/hfi1/qp.c
+++ b/drivers/staging/rdma/hfi1/qp.c
@@ -167,8 +167,12 @@ static inline int opa_mtu_enum_to_int(int mtu)
  */
 static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
 {
-	int val = opa_mtu_enum_to_int((int)mtu);
+	int val;

+	/* Constraining 10KB packets to 8KB packets */
+	if (mtu == (enum ib_mtu)OPA_MTU_10240)
+		mtu = OPA_MTU_8192;
+	val = opa_mtu_enum_to_int((int)mtu);
 	if (val > 0)
 		return val;
 	return ib_mtu_enum_to_int(mtu);
diff --git a/drivers/staging/rdma/hfi1/qsfp.c b/drivers/staging/rdma/hfi1/qsfp.c
index 9ed1963010fe..2441669f0817 100644
--- a/drivers/staging/rdma/hfi1/qsfp.c
+++ b/drivers/staging/rdma/hfi1/qsfp.c
@@ -96,7 +96,7 @@ int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
 {
 	int ret;

-	if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+	if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
 		return -EACCES;

 	/* make sure the TWSI bus is in a sane state */
@@ -162,7 +162,7 @@ int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset,
 {
 	int ret;

-	if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+	if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
 		return -EACCES;

 	/* make sure the TWSI bus is in a sane state */
@@ -192,7 +192,7 @@ int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
 	int ret;
 	u8 page;

-	if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+	if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
 		return -EACCES;

 	/* make sure the TWSI bus is in a sane state */
@@ -276,7 +276,7 @@ int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
 	int ret;
 	u8 page;

-	if (!check_chip_resource(ppd->dd, qsfp_resource(ppd->dd), __func__))
+	if (!check_chip_resource(ppd->dd, i2c_target(target), __func__))
 		return -EACCES;

 	/* make sure the TWSI bus is in a sane state */
@@ -355,6 +355,8 @@ int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp,
  * The calls to qsfp_{read,write} in this function correctly handle the
  * address map difference between this mapping and the mapping implemented
  * by those functions
+ *
+ * The caller must be holding the QSFP i2c chain resource.
  */
 int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
 {
@@ -371,13 +373,9 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)

 	if (!qsfp_mod_present(ppd)) {
 		ret = -ENODEV;
-		goto bail_no_release;
+		goto bail;
 	}

-	ret = acquire_chip_resource(ppd->dd, qsfp_resource(ppd->dd), QSFP_WAIT);
-	if (ret)
-		goto bail_no_release;
-
 	ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE);
 	if (ret != QSFP_PAGESIZE) {
 		dd_dev_info(ppd->dd,
@@ -440,8 +438,6 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)
 		}
 	}

-	release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
-
 	spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags);
 	ppd->qsfp_info.cache_valid = 1;
 	ppd->qsfp_info.cache_refresh_required = 0;
@@ -450,8 +446,6 @@ int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp)

 	return 0;

 bail:
-	release_chip_resource(ppd->dd, qsfp_resource(ppd->dd));
-bail_no_release:
 	memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128));
 	return ret;
 }
@@ -466,7 +460,28 @@ const char * const hfi1_qsfp_devtech[16] = {
 #define QSFP_DUMP_CHUNK 16 /* Holds longest string */
 #define QSFP_DEFAULT_HDR_CNT 224

-static const char *pwr_codes = "1.5W2.0W2.5W3.5W";
+#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
+#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3)
+/* For use with QSFP_HIGH_PWR macro */
+#define QSFP_HIGH_PWR_UNUSED 0 /* Bits [1:0] = 00 implies low power module */
+
+/*
+ * Takes power class byte [Page 00 Byte 129] in SFF 8636
+ * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4)
+ */
+int get_qsfp_power_class(u8 power_byte)
+{
+	if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED)
+		/* power classes count from 1, their bit encodings from 0 */
+		return (QSFP_PWR(power_byte) + 1);
+	/*
+	 * 00 in the high power classes stands for unused, bringing
+	 * balance to the off-by-1 offset above, we add 4 here to
+	 * account for the difference between the low and high power
+	 * groups
+	 */
+	return (QSFP_HIGH_PWR(power_byte) + 4);
+}

 int qsfp_mod_present(struct hfi1_pportdata *ppd)
 {
@@ -537,6 +552,16 @@ set_zeroes:
 	return ret;
 }

+static const char *pwr_codes[8] = {"N/AW",
+				  "1.5W",
+				  "2.0W",
+				  "2.5W",
+				  "3.5W",
+				  "4.0W",
+				  "4.5W",
+				  "5.0W"
+				 };
+
 int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
 {
 	u8 *cache = &ppd->qsfp_info.cache[0];
@@ -546,6 +571,7 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
 	int bidx = 0;
 	u8 *atten = &cache[QSFP_ATTEN_OFFS];
 	u8 *vendor_oui = &cache[QSFP_VOUI_OFFS];
+	u8 power_byte = 0;

 	sofar = 0;
 	lenstr[0] = ' ';
@@ -555,9 +581,9 @@ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len)
 		if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS]))
 			sprintf(lenstr, "%dM ", cache[QSFP_MOD_LEN_OFFS]);

+		power_byte = cache[QSFP_MOD_PWR_OFFS];
 		sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n",
-				pwr_codes +
-				(QSFP_PWR(cache[QSFP_MOD_PWR_OFFS]) * 4));
+				pwr_codes[get_qsfp_power_class(power_byte)]);

 		sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n",
 				   lenstr,
diff --git a/drivers/staging/rdma/hfi1/qsfp.h b/drivers/staging/rdma/hfi1/qsfp.h
index 831fe4cf1345..dadc66c442b9 100644
--- a/drivers/staging/rdma/hfi1/qsfp.h
+++ b/drivers/staging/rdma/hfi1/qsfp.h
@@ -82,8 +82,9 @@
 /* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
 #define QSFP_MOD_ID_OFFS 128
 /*
- * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class
- * 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ * Byte 129 is "Extended Identifier".
+ * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W
  */
 #define QSFP_MOD_PWR_OFFS 129
 /* Byte 130 is Connector type. Not Intel req'd */
@@ -190,6 +191,9 @@ extern const char *const hfi1_qsfp_devtech[16];
 #define QSFP_HIGH_BIAS_WARNING 0x22
 #define QSFP_LOW_BIAS_WARNING 0x11

+#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
+#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
+
 /*
  * struct qsfp_data encapsulates state of QSFP device for one port.
  * it will be part of port-specific data if a board supports QSFP.
@@ -201,12 +205,6 @@ extern const char *const hfi1_qsfp_devtech[16];
  * and let the qsfp_lock arbitrate access to common resources.
  *
 */
-
-#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
-#define QSFP_HIGH_PWR(pbyte) (((pbyte) & 3) | 4)
-#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
-#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
-
 struct qsfp_data {
 	/* Helps to find our way */
 	struct hfi1_pportdata *ppd;
@@ -223,6 +221,7 @@ struct qsfp_data {

 int refresh_qsfp_cache(struct hfi1_pportdata *ppd,
 		       struct qsfp_data *cp);
+int get_qsfp_power_class(u8 power_byte);
 int qsfp_mod_present(struct hfi1_pportdata *ppd);
 int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr,
 		   u32 len, u8 *data);
diff --git a/drivers/staging/rdma/hfi1/rc.c b/drivers/staging/rdma/hfi1/rc.c
index 0d7e1017f3cb..792f15eb8efe 100644
--- a/drivers/staging/rdma/hfi1/rc.c
+++ b/drivers/staging/rdma/hfi1/rc.c
@@ -1497,7 +1497,7 @@ reserved:
 		/* Ignore reserved NAK codes. */
 		goto bail_stop;
 	}
-	return ret;
+	/* cannot be reached */
 bail_stop:
 	hfi1_stop_rc_timers(qp);
 	return ret;
@@ -2021,8 +2021,6 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
 	if (sl >= OPA_MAX_SLS)
 		return;

-	cca_timer = &ppd->cca_timer[sl];
-
 	cc_state = get_cc_state(ppd);

 	if (!cc_state)
@@ -2041,6 +2039,7 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,

 	spin_lock_irqsave(&ppd->cca_timer_lock, flags);

+	cca_timer = &ppd->cca_timer[sl];
 	if (cca_timer->ccti < ccti_limit) {
 		if (cca_timer->ccti + ccti_incr <= ccti_limit)
 			cca_timer->ccti += ccti_incr;
@@ -2049,8 +2048,6 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
 		set_link_ipg(ppd);
 	}

-	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
-
 	ccti = cca_timer->ccti;

 	if (!hrtimer_active(&cca_timer->hrtimer)) {
@@ -2061,6 +2058,8 @@ void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
 			      HRTIMER_MODE_REL);
 	}

+	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);
+
 	if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
 		log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
 }
diff --git a/drivers/staging/rdma/hfi1/ruc.c b/drivers/staging/rdma/hfi1/ruc.c
index 08813cdbd475..a659aec3c3c6 100644
--- a/drivers/staging/rdma/hfi1/ruc.c
+++ b/drivers/staging/rdma/hfi1/ruc.c
@@ -831,7 +831,6 @@ void hfi1_do_send(struct rvt_qp *qp)
 	struct hfi1_pkt_state ps;
 	struct hfi1_qp_priv *priv = qp->priv;
 	int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-	unsigned long flags;
 	unsigned long timeout;
 	unsigned long timeout_int;
 	int cpu;
@@ -866,11 +865,11 @@ void hfi1_do_send(struct rvt_qp *qp)
 		timeout_int = SEND_RESCHED_TIMEOUT;
 	}

-	spin_lock_irqsave(&qp->s_lock, flags);
+	spin_lock_irqsave(&qp->s_lock, ps.flags);

 	/* Return if we are already busy processing a work request. */
 	if (!hfi1_send_ok(qp)) {
-		spin_unlock_irqrestore(&qp->s_lock, flags);
+		spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 		return;
 	}

@@ -884,7 +883,7 @@ void hfi1_do_send(struct rvt_qp *qp)
 	do {
 		/* Check for a constructed packet to be sent. */
 		if (qp->s_hdrwords != 0) {
-			spin_unlock_irqrestore(&qp->s_lock, flags);
+			spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 			/*
 			 * If the packet cannot be sent now, return and
 			 * the send tasklet will be woken up later.
@@ -897,11 +896,14 @@ void hfi1_do_send(struct rvt_qp *qp)
 			if (unlikely(time_after(jiffies, timeout))) {
 				if (workqueue_congested(cpu, ps.ppd->hfi1_wq)) {
-					spin_lock_irqsave(&qp->s_lock, flags);
+					spin_lock_irqsave(
+						&qp->s_lock,
+						ps.flags);
 					qp->s_flags &= ~RVT_S_BUSY;
 					hfi1_schedule_send(qp);
-					spin_unlock_irqrestore(&qp->s_lock,
-							       flags);
+					spin_unlock_irqrestore(
+						&qp->s_lock,
+						ps.flags);
 					this_cpu_inc(
 						*ps.ppd->dd->send_schedule);
 					return;
@@ -913,11 +915,11 @@ void hfi1_do_send(struct rvt_qp *qp)
 				}
 				timeout = jiffies + (timeout_int) / 8;
 			}
-			spin_lock_irqsave(&qp->s_lock, flags);
+			spin_lock_irqsave(&qp->s_lock, ps.flags);
 		}
 	} while (make_req(qp, &ps));

-	spin_unlock_irqrestore(&qp->s_lock, flags);
+	spin_unlock_irqrestore(&qp->s_lock, ps.flags);
 }

 /*
diff --git a/drivers/staging/rdma/hfi1/sysfs.c b/drivers/staging/rdma/hfi1/sysfs.c
index c7f1271190af..8cd6df8634ad 100644
--- a/drivers/staging/rdma/hfi1/sysfs.c
+++ b/drivers/staging/rdma/hfi1/sysfs.c
@@ -84,7 +84,7 @@ static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	memcpy(buf, &cc_state->cct, count);
+	memcpy(buf, (void *)&cc_state->cct + pos, count);
 	rcu_read_unlock();

 	return count;
@@ -131,7 +131,7 @@ static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
 		rcu_read_unlock();
 		return -EINVAL;
 	}
-	memcpy(buf, &cc_state->cong_setting, count);
+	memcpy(buf, (void *)&cc_state->cong_setting + pos, count);
 	rcu_read_unlock();

 	return count;
diff --git a/drivers/staging/rdma/hfi1/ud.c b/drivers/staging/rdma/hfi1/ud.c
index ae8a70f703eb..1e503ad0bebb 100644
--- a/drivers/staging/rdma/hfi1/ud.c
+++ b/drivers/staging/rdma/hfi1/ud.c
@@ -322,7 +322,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 		    (lid == ppd->lid ||
 		     (lid == be16_to_cpu(IB_LID_PERMISSIVE) &&
 		     qp->ibqp.qp_type == IB_QPT_GSI)))) {
-			unsigned long flags;
+			unsigned long tflags = ps->flags;
 			/*
 			 * If DMAs are in progress, we can't generate
 			 * a completion for the loopback packet since
@@ -335,10 +335,10 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 				goto bail;
 			}
 			qp->s_cur = next_cur;
-			local_irq_save(flags);
-			spin_unlock_irqrestore(&qp->s_lock, flags);
+			spin_unlock_irqrestore(&qp->s_lock, tflags);
 			ud_loopback(qp, wqe);
-			spin_lock_irqsave(&qp->s_lock, flags);
+			spin_lock_irqsave(&qp->s_lock, tflags);
+			ps->flags = tflags;
 			hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
 			goto done_free_tx;
 		}
diff --git a/drivers/staging/rdma/hfi1/user_exp_rcv.c b/drivers/staging/rdma/hfi1/user_exp_rcv.c
index 8bd56d5c783d..1b640a35b3fe 100644
--- a/drivers/staging/rdma/hfi1/user_exp_rcv.c
+++ b/drivers/staging/rdma/hfi1/user_exp_rcv.c
@@ -399,8 +399,11 @@ int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
 	 * pages, accept the amount pinned so far and program only that.
 	 * User space knows how to deal with partially programmed buffers.
 	 */
-	if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages))
-		return -ENOMEM;
+	if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
 	pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
 	if (pinned <= 0) {
 		ret = pinned;
diff --git a/drivers/staging/rdma/hfi1/user_sdma.c b/drivers/staging/rdma/hfi1/user_sdma.c
index d53a659548e0..0014c9c0e967 100644
--- a/drivers/staging/rdma/hfi1/user_sdma.c
+++ b/drivers/staging/rdma/hfi1/user_sdma.c
@@ -180,6 +180,8 @@ struct user_sdma_iovec {
 	u64 offset;
 };

+#define SDMA_CACHE_NODE_EVICT BIT(0)
+
 struct sdma_mmu_node {
 	struct mmu_rb_node rb;
 	struct list_head list;
@@ -187,6 +189,7 @@ struct sdma_mmu_node {
 	atomic_t refcount;
 	struct page **pages;
 	unsigned npages;
+	unsigned long flags;
 };

 struct user_sdma_request {
@@ -597,6 +600,13 @@ int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
 		goto free_req;
 	}

+	/* Checking P_KEY for requests from user-space */
+	if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
+			      PKEY_CHECK_INVALID)) {
+		ret = -EINVAL;
+		goto free_req;
+	}
+
 	/*
 	 * Also should check the BTH.lnh. If it says the next header is GRH then
 	 * the RXE parsing will be off and will land in the middle of the KDETH
@@ -1030,27 +1040,29 @@ static inline int num_user_pages(const struct iovec *iov)
 	return 1 + ((epage - spage) >> PAGE_SHIFT);
 }

-/* Caller must hold pq->evict_lock */
 static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
 {
 	u32 cleared = 0;
 	struct sdma_mmu_node *node, *ptr;
+	struct list_head to_evict = LIST_HEAD_INIT(to_evict);

+	spin_lock(&pq->evict_lock);
 	list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
 		/* Make sure that no one is still using the node. */
 		if (!atomic_read(&node->refcount)) {
-			/*
-			 * Need to use the page count now as the remove callback
-			 * will free the node.
-			 */
+			set_bit(SDMA_CACHE_NODE_EVICT, &node->flags);
+			list_del_init(&node->list);
+			list_add(&node->list, &to_evict);
 			cleared += node->npages;
-			spin_unlock(&pq->evict_lock);
-			hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
-			spin_lock(&pq->evict_lock);
 			if (cleared >= npages)
 				break;
 		}
 	}
+	spin_unlock(&pq->evict_lock);
+
+	list_for_each_entry_safe(node, ptr, &to_evict, list)
+		hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
+
 	return cleared;
 }

@@ -1062,9 +1074,9 @@ static int pin_vector_pages(struct user_sdma_request *req,
 	struct sdma_mmu_node *node = NULL;
 	struct mmu_rb_node *rb_node;

-	rb_node = hfi1_mmu_rb_search(&pq->sdma_rb_root,
-				     (unsigned long)iovec->iov.iov_base,
-				     iovec->iov.iov_len);
+	rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
+				      (unsigned long)iovec->iov.iov_base,
+				      iovec->iov.iov_len);
 	if (rb_node && !IS_ERR(rb_node))
 		node = container_of(rb_node, struct sdma_mmu_node, rb);
 	else
@@ -1076,7 +1088,6 @@ static int pin_vector_pages(struct user_sdma_request *req,
 		return -ENOMEM;

 	node->rb.addr = (unsigned long)iovec->iov.iov_base;
-	node->rb.len = iovec->iov.iov_len;
 	node->pq = pq;
 	atomic_set(&node->refcount, 0);
 	INIT_LIST_HEAD(&node->list);
@@ -1093,11 +1104,25 @@ static int pin_vector_pages(struct user_sdma_request *req,
 		memcpy(pages, node->pages, node->npages * sizeof(*pages));

 		npages -= node->npages;
+
+		/*
+		 * If rb_node is NULL, it means that this is brand new node
+		 * and, therefore not on the eviction list.
+		 * If, however, the rb_node is non-NULL, it means that the
+		 * node is already in RB tree and, therefore on the eviction
+		 * list (nodes are unconditionally inserted in the eviction
+		 * list). In that case, we have to remove the node prior to
+		 * calling the eviction function in order to prevent it from
+		 * freeing this node.
+		 */
+		if (rb_node) {
+			spin_lock(&pq->evict_lock);
+			list_del_init(&node->list);
+			spin_unlock(&pq->evict_lock);
+		}
 retry:
 		if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
-			spin_lock(&pq->evict_lock);
 			cleared = sdma_cache_evict(pq, npages);
-			spin_unlock(&pq->evict_lock);
 			if (cleared >= npages)
 				goto retry;
 		}
@@ -1117,37 +1142,32 @@ retry:
 			goto bail;
 		}
 		kfree(node->pages);
+		node->rb.len = iovec->iov.iov_len;
 		node->pages = pages;
 		node->npages += pinned;
 		npages = node->npages;
 		spin_lock(&pq->evict_lock);
-		if (!rb_node)
-			list_add(&node->list, &pq->evict);
-		else
-			list_move(&node->list, &pq->evict);
+		list_add(&node->list, &pq->evict);
 		pq->n_locked += pinned;
 		spin_unlock(&pq->evict_lock);
 	}
 	iovec->pages = node->pages;
 	iovec->npages = npages;

-	if (!rb_node) {
-		ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
-		if (ret) {
-			spin_lock(&pq->evict_lock);
+	ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
+	if (ret) {
+		spin_lock(&pq->evict_lock);
+		if (!list_empty(&node->list))
 			list_del(&node->list);
-			pq->n_locked -= node->npages;
-			spin_unlock(&pq->evict_lock);
-			ret = 0;
-			goto bail;
-		}
-	} else {
-		atomic_inc(&node->refcount);
+		pq->n_locked -= node->npages;
+		spin_unlock(&pq->evict_lock);
+		goto bail;
 	}
 	return 0;
 bail:
-	if (!rb_node)
-		kfree(node);
+	if (rb_node)
+		unpin_vector_pages(current->mm, node->pages, 0, node->npages);
+	kfree(node);
 	return ret;
 }

@@ -1558,7 +1578,20 @@ static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
 		container_of(mnode, struct sdma_mmu_node, rb);

 	spin_lock(&node->pq->evict_lock);
-	list_del(&node->list);
+	/*
+	 * We've been called by the MMU notifier but this node has been
+	 * scheduled for eviction. The eviction function will take care
+	 * of freeing this node.
+	 * We have to take the above lock first because we are racing
+	 * against the setting of the bit in the eviction function.
+	 */
+	if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
+		spin_unlock(&node->pq->evict_lock);
+		return;
+	}
+
+	if (!list_empty(&node->list))
+		list_del(&node->list);
 	node->pq->n_locked -= node->npages;
 	spin_unlock(&node->pq->evict_lock);
diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c
index 89f2aad45c1b..9cdc85fa366f 100644
--- a/drivers/staging/rdma/hfi1/verbs.c
+++ b/drivers/staging/rdma/hfi1/verbs.c
@@ -545,7 +545,7 @@ static inline int qp_ok(int opcode, struct hfi1_packet *packet)

 	if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
 		goto dropit;
-	if (((opcode & OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
+	if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) ||
 	    (opcode == IB_OPCODE_CNP))
 		return 1;
 dropit:
@@ -1089,16 +1089,16 @@ bail:

 /*
  * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent
- * being an entry from the ingress partition key table), return 0
+ * being an entry from the partition key table), return 0
  * otherwise. Use the matching criteria for egress partition keys
  * specified in the OPAv1 spec., section 9.1l.7.
  */
 static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
 {
 	u16 mkey = pkey & PKEY_LOW_15_MASK;
-	u16 ment = ent & PKEY_LOW_15_MASK;
+	u16 mentry = ent & PKEY_LOW_15_MASK;

-	if (mkey == ment) {
+	if (mkey == mentry) {
 		/*
 		 * If pkey[15] is set (full partition member),
 		 * is bit 15 in the corresponding table element
@@ -1111,32 +1111,32 @@ static inline int egress_pkey_matches_entry(u16 pkey, u16 ent)
 	return 0;
 }

-/*
- * egress_pkey_check - return 0 if hdr's pkey matches according to the
- * criteria in the OPAv1 spec., section 9.11.7.
+/**
+ * egress_pkey_check - check P_KEY of a packet
+ * @ppd: Physical IB port data
+ * @lrh: Local route header
+ * @bth: Base transport header
+ * @sc5: SC for packet
+ * @s_pkey_index: It will be used for look up optimization for kernel contexts
+ * only. If it is negative value, then it means user contexts is calling this
+ * function.
+ *
+ * It checks if hdr's pkey is valid.
+ *
+ * Return: 0 on success, otherwise, 1
  */
-static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
-				    struct hfi1_ib_header *hdr,
-				    struct rvt_qp *qp)
+int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth,
+		      u8 sc5, int8_t s_pkey_index)
 {
-	struct hfi1_qp_priv *priv = qp->priv;
-	struct hfi1_other_headers *ohdr;
 	struct hfi1_devdata *dd;
-	int i = 0;
+	int i;
 	u16 pkey;
-	u8 lnh, sc5 = priv->s_sc;
+	int is_user_ctxt_mechanism = (s_pkey_index < 0);

 	if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT))
 		return 0;

-	/* locate the pkey within the headers */
-	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
-	if (lnh == HFI1_LRH_GRH)
-		ohdr = &hdr->u.l.oth;
-	else
-		ohdr = &hdr->u.oth;
-
-	pkey = (u16)be32_to_cpu(ohdr->bth[0]);
+	pkey = (u16)be32_to_cpu(bth[0]);

 	/* If SC15, pkey[0:14] must be 0x7fff */
 	if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK))
@@ -1146,28 +1146,37 @@ static inline int egress_pkey_check(struct hfi1_pportdata *ppd,
 	if ((pkey & PKEY_LOW_15_MASK) == 0)
 		goto bad;

-	/* The most likely matching pkey has index qp->s_pkey_index */
-	if (unlikely(!egress_pkey_matches_entry(pkey,
-						ppd->pkeys
-						[qp->s_pkey_index]))) {
-		/* no match - try the entire table */
-		for (; i < MAX_PKEY_VALUES; i++) {
-			if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
-				break;
-		}
+	/*
+	 * For the kernel contexts only, if a qp is passed into the function,
+	 * the most likely matching pkey has index qp->s_pkey_index
+	 */
+	if (!is_user_ctxt_mechanism &&
+	    egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) {
+		return 0;
 	}

-	if (i < MAX_PKEY_VALUES)
-		return 0;
+	for (i = 0; i < MAX_PKEY_VALUES; i++) {
+		if (egress_pkey_matches_entry(pkey, ppd->pkeys[i]))
+			return 0;
+	}
 bad:
-	incr_cntr64(&ppd->port_xmit_constraint_errors);
-	dd = ppd->dd;
-	if (!(dd->err_info_xmit_constraint.status & OPA_EI_STATUS_SMASK)) {
-		u16 slid = be16_to_cpu(hdr->lrh[3]);
-
-		dd->err_info_xmit_constraint.status |= OPA_EI_STATUS_SMASK;
-		dd->err_info_xmit_constraint.slid = slid;
-		dd->err_info_xmit_constraint.pkey = pkey;
+	/*
+	 * For the user-context mechanism, the P_KEY check would only happen
+	 * once per SDMA request, not once per packet. Therefore, there's no
+	 * need to increment the counter for the user-context mechanism.
+	 */
+	if (!is_user_ctxt_mechanism) {
+		incr_cntr64(&ppd->port_xmit_constraint_errors);
+		dd = ppd->dd;
+		if (!(dd->err_info_xmit_constraint.status &
+		      OPA_EI_STATUS_SMASK)) {
+			u16 slid = be16_to_cpu(lrh[3]);
+
+			dd->err_info_xmit_constraint.status |=
+				OPA_EI_STATUS_SMASK;
+			dd->err_info_xmit_constraint.slid = slid;
+			dd->err_info_xmit_constraint.pkey = pkey;
+		}
 	}
 	return 1;
 }
@@ -1227,11 +1236,26 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
 {
 	struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
 	struct hfi1_qp_priv *priv = qp->priv;
+	struct hfi1_other_headers *ohdr;
+	struct hfi1_ib_header *hdr;
 	send_routine sr;
 	int ret;
+	u8 lnh;
+
+	hdr = &ps->s_txreq->phdr.hdr;
+	/* locate the pkey within the headers */
+	lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+	if (lnh == HFI1_LRH_GRH)
+		ohdr = &hdr->u.l.oth;
+	else
+		ohdr = &hdr->u.oth;

 	sr = get_send_routine(qp, ps->s_txreq);
-	ret = egress_pkey_check(dd->pport, &ps->s_txreq->phdr.hdr, qp);
+	ret = egress_pkey_check(dd->pport,
+				hdr->lrh,
+				ohdr->bth,
+				priv->s_sc,
+				qp->s_pkey_index);
 	if (unlikely(ret)) {
 		/*
 		 * The value we are returning here does not get propagated to
diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h
index 6c4670fffdbb..3ee223983b20 100644
--- a/drivers/staging/rdma/hfi1/verbs.h
+++ b/drivers/staging/rdma/hfi1/verbs.h
@@ -215,6 +215,7 @@ struct hfi1_pkt_state {
 	struct hfi1_ibport *ibp;
 	struct hfi1_pportdata *ppd;
 	struct verbs_txreq *s_txreq;
+	unsigned long flags;
 };

 #define HFI1_PSN_CREDIT 16
@@ -334,9 +335,6 @@ int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
 #endif
 #define PSN_MODIFY_MASK 0xFFFFFF

-/* Number of bits to pay attention to in the opcode for checking qp type */
-#define OPCODE_QP_MASK 0xE0
-
 /*
  * Compare the lower 24 bits of the msn values.
  * Returns an integer <, ==, or > than zero.
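For readers following the QSFP changes above: the new get_qsfp_power_class() helper in qsfp.c replaces the old QSFP_PWR/QSFP_HIGH_PWR juggling with a single decode of SFF-8636 byte 129. The fragment below is a minimal, stand-alone user-space rendition of that same decode, handy for sanity-checking module EEPROM dumps; it mirrors the macros from the patch, and the sample byte values are illustrative only, not taken from real modules.

#include <stdio.h>
#include <stdint.h>

/* Mirrors the macros added to qsfp.c: bits [7:6] carry the legacy power
 * class, bits [1:0] the extended (high) power class, 0 meaning "unused". */
#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3)
#define QSFP_HIGH_PWR_UNUSED 0

/* Same decode as get_qsfp_power_class(): classes 1-4 come from bits [7:6],
 * classes 5-7 from bits [1:0] when the extended field is in use. */
static int get_qsfp_power_class(uint8_t power_byte)
{
	if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED)
		return QSFP_PWR(power_byte) + 1;
	return QSFP_HIGH_PWR(power_byte) + 4;
}

int main(void)
{
	/* Illustrative byte-129 values only. */
	const uint8_t samples[] = { 0x00, 0xc0, 0x01, 0x03 };
	unsigned int i;

	for (i = 0; i < sizeof(samples); i++)
		printf("byte 0x%02x -> power class %d\n",
		       samples[i], get_qsfp_power_class(samples[i]));
	return 0;
}

Built with any C compiler (e.g. cc -Wall), this prints classes 1, 4, 5 and 7 for the four sample bytes, matching the byte-129 table added to qsfp.h above.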