1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
|
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Shared Memory Communications over RDMA (SMC-R) and RoCE
*
* Definitions for SMC Connections, Link Groups and Links
*
* Copyright IBM Corp. 2016
*
* Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
*/
#ifndef _SMC_CORE_H
#define _SMC_CORE_H
#include <linux/atomic.h>
#include <rdma/ib_verbs.h>
#include "smc.h"
#include "smc_ib.h"
#define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */
struct smc_lgr_list { /* list of link group definition */
struct list_head list;
spinlock_t lock; /* protects list of link groups */
u32 num; /* unique link group number */
};
enum smc_lgr_role { /* possible roles of a link group */
SMC_CLNT, /* client */
SMC_SERV /* server */
};
enum smc_link_state { /* possible states of a link */
SMC_LNK_UNUSED, /* link is unused */
SMC_LNK_INACTIVE, /* link is inactive */
SMC_LNK_ACTIVATING, /* link is being activated */
SMC_LNK_ACTIVE, /* link is active */
};
#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
struct smc_wr_buf {
u8 raw[SMC_WR_BUF_SIZE];
};
#define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */
enum smc_wr_reg_state {
POSTED, /* ib_wr_reg_mr request posted */
CONFIRMED, /* ib_wr_reg_mr response: successful */
FAILED /* ib_wr_reg_mr response: failure */
};
struct smc_rdma_sge { /* sges for RDMA writes */
struct ib_sge wr_tx_rdma_sge[SMC_IB_MAX_SEND_SGE];
};
#define SMC_MAX_RDMA_WRITES 2 /* max. # of RDMA writes per
* message send
*/
struct smc_rdma_sges { /* sges per message send */
struct smc_rdma_sge tx_rdma_sge[SMC_MAX_RDMA_WRITES];
};
struct smc_rdma_wr { /* work requests per message
* send
*/
struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES];
};
#define SMC_LGR_ID_SIZE 4
struct smc_link {
struct smc_ib_device *smcibdev; /* ib-device */
u8 ibport; /* port - values 1 | 2 */
struct ib_pd *roce_pd; /* IB protection domain,
* unique for every RoCE QP
*/
struct ib_qp *roce_qp; /* IB queue pair */
struct ib_qp_attr qp_attr; /* IB queue pair attributes */
struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */
struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
struct ib_sge *wr_tx_sges; /* WR send gather meta data */
struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/
struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */
struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
struct completion *wr_tx_compl; /* WR send CQE completion */
/* above four vectors have wr_tx_cnt elements and use the same index */
dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
atomic_long_t wr_tx_id; /* seq # of last sent WR */
unsigned long *wr_tx_mask; /* bit mask of used indexes */
u32 wr_tx_cnt; /* number of WR send buffers */
wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
/* above three vectors have wr_rx_cnt elements and use the same index */
dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
u64 wr_rx_id; /* seq # of last recv WR */
u32 wr_rx_cnt; /* number of WR recv buffers */
unsigned long wr_rx_tstamp; /* jiffies when last buf rx */
struct ib_reg_wr wr_reg; /* WR register memory region */
wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */
u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/
u8 sgid_index; /* gid index for vlan id */
u32 peer_qpn; /* QP number of peer */
enum ib_mtu path_mtu; /* used mtu */
enum ib_mtu peer_mtu; /* mtu size of peer */
u32 psn_initial; /* QP tx initial packet seqno */
u32 peer_psn; /* QP rx initial packet seqno */
u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/
u8 link_id; /* unique # within link group */
u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */
u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */
u8 link_idx; /* index in lgr link array */
u8 link_is_asym; /* is link asymmetric? */
struct smc_link_group *lgr; /* parent link group */
struct work_struct link_down_wrk; /* wrk to bring link down */
enum smc_link_state state; /* state of link */
struct delayed_work llc_testlink_wrk; /* testlink worker */
struct completion llc_testlink_resp; /* wait for rx of testlink */
int llc_testlink_time; /* testlink interval */
};
/* For now we just allow one parallel link per link group. The SMC protocol
* allows more (up to 8).
*/
#define SMC_LINKS_PER_LGR_MAX 3
#define SMC_SINGLE_LINK 0
#define SMC_FIRST_CONTACT 1 /* first contact to a peer */
#define SMC_REUSE_CONTACT 0 /* follow-on contact to a peer*/
/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
struct smc_buf_desc {
struct list_head list;
void *cpu_addr; /* virtual address of buffer */
struct page *pages;
int len; /* length of buffer */
u32 used; /* currently used / unused */
union {
struct { /* SMC-R */
struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];
/* virtual buffer */
struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX];
/* for rmb only: memory region
* incl. rkey provided to peer
*/
u32 order; /* allocation order */
u8 is_conf_rkey;
/* confirm_rkey done */
u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX];
/* mem region registered */
u8 is_map_ib[SMC_LINKS_PER_LGR_MAX];
/* mem region mapped to lnk */
u8 is_reg_err;
/* buffer registration err */
};
struct { /* SMC-D */
unsigned short sba_idx;
/* SBA index number */
u64 token;
/* DMB token number */
dma_addr_t dma_addr;
/* DMA address */
};
};
};
struct smc_rtoken { /* address/key of remote RMB */
u64 dma_addr;
u32 rkey;
};
#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */
#define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */
/* theoretically, the RFC states that largest size would be 512K,
* i.e. compressed 5 and thus 6 sizes (0..5), despite
* struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15)
*/
struct smcd_dev;
enum smc_lgr_type { /* redundancy state of lgr */
SMC_LGR_NONE, /* no active links, lgr to be deleted */
SMC_LGR_SINGLE, /* 1 active RNIC on each peer */
SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */
SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */
SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */
};
enum smc_llc_flowtype {
SMC_LLC_FLOW_NONE = 0,
SMC_LLC_FLOW_ADD_LINK = 2,
SMC_LLC_FLOW_DEL_LINK = 4,
SMC_LLC_FLOW_RKEY = 6,
};
struct smc_llc_qentry;
struct smc_llc_flow {
enum smc_llc_flowtype type;
struct smc_llc_qentry *qentry;
};
struct smc_link_group {
struct list_head list;
struct rb_root conns_all; /* connection tree */
rwlock_t conns_lock; /* protects conns_all */
unsigned int conns_num; /* current # of connections */
unsigned short vlan_id; /* vlan id of link group */
struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
struct mutex sndbufs_lock; /* protects tx buffers */
struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
struct mutex rmbs_lock; /* protects rx buffers */
u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
struct delayed_work free_work; /* delayed freeing of an lgr */
struct work_struct terminate_work; /* abnormal lgr termination */
u8 sync_err : 1; /* lgr no longer fits to peer */
u8 terminating : 1;/* lgr is terminating */
u8 freefast : 1; /* free worker scheduled fast */
u8 freeing : 1; /* lgr is being freed */
bool is_smcd; /* SMC-R or SMC-D */
union {
struct { /* SMC-R */
enum smc_lgr_role role;
/* client or server */
struct smc_link lnk[SMC_LINKS_PER_LGR_MAX];
/* smc link */
char peer_systemid[SMC_SYSTEMID_LEN];
/* unique system_id of peer */
struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
[SMC_LINKS_PER_LGR_MAX];
/* remote addr/key pairs */
DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX);
/* used rtoken elements */
u8 next_link_id;
enum smc_lgr_type type;
/* redundancy state */
u8 pnet_id[SMC_MAX_PNETID_LEN + 1];
/* pnet id of this lgr */
struct list_head llc_event_q;
/* queue for llc events */
spinlock_t llc_event_q_lock;
/* protects llc_event_q */
struct mutex llc_conf_mutex;
/* protects lgr reconfig. */
struct work_struct llc_add_link_work;
struct work_struct llc_del_link_work;
struct work_struct llc_event_work;
/* llc event worker */
wait_queue_head_t llc_flow_waiter;
/* w4 next llc event */
wait_queue_head_t llc_msg_waiter;
/* w4 next llc msg */
struct smc_llc_flow llc_flow_lcl;
/* llc local control field */
struct smc_llc_flow llc_flow_rmt;
/* llc remote control field */
struct smc_llc_qentry *delayed_event;
/* arrived when flow active */
spinlock_t llc_flow_lock;
/* protects llc flow */
int llc_testlink_time;
/* link keep alive time */
u32 llc_termination_rsn;
/* rsn code for termination */
};
struct { /* SMC-D */
u64 peer_gid;
/* Peer GID (remote) */
struct smcd_dev *smcd;
/* ISM device for VLAN reg. */
u8 peer_shutdown : 1;
/* peer triggered shutdownn */
};
};
};
struct smc_clc_msg_local;
struct smc_init_info {
u8 is_smcd;
unsigned short vlan_id;
int srv_first_contact;
int cln_first_contact;
/* SMC-R */
struct smc_clc_msg_local *ib_lcl;
struct smc_ib_device *ib_dev;
u8 ib_gid[SMC_GID_SIZE];
u8 ib_port;
u32 ib_clcqpn;
/* SMC-D */
u64 ism_gid;
struct smcd_dev *ism_dev;
};
/* Find the connection associated with the given alert token in the link group.
* To use rbtrees we have to implement our own search core.
* Requires @conns_lock
* @token alert token to search for
* @lgr link group to search in
* Returns connection associated with token if found, NULL otherwise.
*/
static inline struct smc_connection *smc_lgr_find_conn(
u32 token, struct smc_link_group *lgr)
{
struct smc_connection *res = NULL;
struct rb_node *node;
node = lgr->conns_all.rb_node;
while (node) {
struct smc_connection *cur = rb_entry(node,
struct smc_connection, alert_node);
if (cur->alert_token_local > token) {
node = node->rb_left;
} else {
if (cur->alert_token_local < token) {
node = node->rb_right;
} else {
res = cur;
break;
}
}
}
return res;
}
/* returns true if the specified link is usable */
static inline bool smc_link_usable(struct smc_link *lnk)
{
if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE)
return false;
return true;
}
struct smc_sock;
struct smc_clc_msg_accept_confirm;
struct smc_clc_msg_local;
void smc_lgr_cleanup_early(struct smc_connection *conn);
void smc_lgr_terminate_sched(struct smc_link_group *lgr);
void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport);
void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport);
void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid,
unsigned short vlan);
void smc_smcd_terminate_all(struct smcd_dev *dev);
void smc_smcr_terminate_all(struct smc_ib_device *smcibdev);
int smc_buf_create(struct smc_sock *smc, bool is_smcd);
int smc_uncompress_bufsize(u8 compressed);
int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link,
struct smc_clc_msg_accept_confirm *clc);
int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey);
int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey);
void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new,
__be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey);
void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id,
__be64 nw_vaddr, __be32 nw_rkey);
void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn);
void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn);
void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn);
void smc_rmb_sync_sg_for_device(struct smc_connection *conn);
int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini);
void smc_conn_free(struct smc_connection *conn);
int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini);
void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
int smc_core_init(void);
void smc_core_exit(void);
int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
u8 link_idx, struct smc_init_info *ini);
void smcr_link_clear(struct smc_link *lnk, bool log);
int smcr_buf_map_lgr(struct smc_link *lnk);
int smcr_buf_reg_lgr(struct smc_link *lnk);
void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type);
void smcr_lgr_set_type_asym(struct smc_link_group *lgr,
enum smc_lgr_type new_type, int asym_lnk_idx);
int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc);
struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
struct smc_link *from_lnk, bool is_dev_err);
void smcr_link_down_cond(struct smc_link *lnk);
void smcr_link_down_cond_sched(struct smc_link *lnk);
static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
{
return link->lgr;
}
#endif
|