|
1 # This patch was developed both in-house and from outside. We plan to submit it |
|
2 # upstream, but do not yet have a target date for doing so. |
|
3 # |
|
4 # HG changeset patch |
|
5 # Parent 90d898abcac39d3fc4a631a678f0bb7bbe28d877 |
|
6 25759055 OFUV (Userland) support for XRC APIs |
|
7 |
|
8 diff -r 90d898abcac3 src/mlx4.c |
|
9 --- a/src/mlx4.c Mon Nov 21 11:48:10 2016 -0800 |
|
10 +++ b/src/mlx4.c Mon Mar 20 14:22:58 2017 -0700 |
|
11 @@ -274,6 +274,8 @@ |
|
12 verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp); |
|
13 verbs_set_ctx_op(verbs_ctx, drv_ibv_create_flow, ibv_cmd_create_flow); |
|
14 verbs_set_ctx_op(verbs_ctx, drv_ibv_destroy_flow, ibv_cmd_destroy_flow); |
|
15 + verbs_set_ctx_op(verbs_ctx, drv_set_legacy_xrc, mlx4_set_legacy_xrc); |
|
16 + verbs_set_ctx_op(verbs_ctx, drv_get_legacy_xrc, mlx4_get_legacy_xrc); |
|
17 |
|
18 return 0; |
|
19 |
|
20 diff -r 90d898abcac3 src/mlx4.h |
|
21 --- a/src/mlx4.h Mon Nov 21 11:48:10 2016 -0800 |
|
22 +++ b/src/mlx4.h Mon Mar 20 14:22:58 2017 -0700 |
|
23 @@ -233,6 +233,7 @@ |
|
24 uint32_t *db; |
|
25 uint16_t counter; |
|
26 uint8_t ext_srq; |
|
27 + struct ibv_srq_legacy *ibv_srq_legacy; |
|
28 }; |
|
29 |
|
30 struct mlx4_wq { |
|
31 @@ -464,4 +465,7 @@ |
|
32 struct mlx4_ah *ah); |
|
33 void mlx4_free_av(struct mlx4_ah *ah); |
|
34 |
|
35 +void *mlx4_get_legacy_xrc(struct ibv_srq *srq); |
|
36 +void mlx4_set_legacy_xrc(struct ibv_srq *srq, void *legacy_xrc_srq); |
|
37 + |
|
38 #endif /* MLX4_H */ |
|
39 diff -r 90d898abcac3 src/qp.c |
|
40 --- a/src/qp.c Mon Nov 21 11:48:10 2016 -0800 |
|
41 +++ b/src/qp.c Mon Mar 20 14:22:58 2017 -0700 |
|
42 @@ -247,6 +247,7 @@ |
|
43 |
|
44 switch (ibqp->qp_type) { |
|
45 case IBV_QPT_XRC_SEND: |
|
46 + case IBV_QPT_XRC: |
|
47 ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr); |
|
48 /* fall through */ |
|
49 case IBV_QPT_RC: |
|
50 @@ -559,6 +560,7 @@ |
|
51 break; |
|
52 |
|
53 case IBV_QPT_XRC_SEND: |
|
54 + case IBV_QPT_XRC: |
|
55 case IBV_QPT_RC: |
|
56 size += sizeof (struct mlx4_wqe_raddr_seg); |
|
57 /* |
|
58 @@ -596,9 +598,11 @@ |
|
59 qp->buf.buf = qpbuf; |
|
60 qp->buf.length = buflen; |
|
61 |
|
62 - qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t)); |
|
63 - if (!qp->sq.wrid) |
|
64 - return -1; |
|
65 + if (qp->sq.wqe_cnt) { |
|
66 + qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t)); |
|
67 + if (!qp->sq.wrid) |
|
68 + return -1; |
|
69 + } |
|
70 |
|
71 if (qp->rq.wqe_cnt) { |
|
72 qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t)); |
|
73 @@ -628,16 +632,20 @@ |
|
74 qp->sq.offset = 0; |
|
75 } |
|
76 |
|
77 - if ((long int)qp->buf.length < (long int)qp->buf_size) { |
|
78 - fprintf(stderr, PFX "QP kernel buffer size %lu < user buf " |
|
79 - "size %d\n", (unsigned long)qp->buf.length, qp->buf_size); |
|
80 - } |
|
81 - if ((!rq_off && qp->rq.offset) || (!sq_off && qp->sq.offset)) { |
|
82 - fprintf(stderr, PFX "QP kernel and user out of sync on " |
|
83 - "buffer order\n"); |
|
84 - } |
|
85 + if (qp->buf_size) { |
|
86 + if ((long int)qp->buf.length < (long int)qp->buf_size) { |
|
87 + fprintf(stderr, PFX "QP kernel buffer size %lu < user " |
|
88 + "buf size %d\n", (unsigned long)qp->buf.length, |
|
89 + qp->buf_size); |
|
90 + } |
|
91 + if ((!rq_off && qp->rq.offset) || (!sq_off && qp->sq.offset)) { |
|
92 + fprintf(stderr, PFX "QP kernel and user out of sync on " |
|
93 + "buffer order\n"); |
|
94 + } |
|
95 |
|
96 - memset(qp->buf.buf, 0, qp->buf_size); |
|
97 + memset(qp->buf.buf, 0, qp->buf_size); |
|
98 + } else |
|
99 + qp->buf.buf = NULL; |
|
100 return 0; |
|
101 } |
|
102 #endif |
|
103 @@ -705,6 +713,7 @@ |
|
104 break; |
|
105 |
|
106 case IBV_QPT_XRC_SEND: |
|
107 + case IBV_QPT_XRC: |
|
108 case IBV_QPT_UC: |
|
109 case IBV_QPT_RC: |
|
110 wqe_size -= sizeof (struct mlx4_wqe_raddr_seg); |
|
111 diff -r 90d898abcac3 src/srq.c |
|
112 --- a/src/srq.c Mon Nov 21 11:48:10 2016 -0800 |
|
113 +++ b/src/srq.c Mon Mar 20 14:22:58 2017 -0700 |
|
114 @@ -66,13 +66,17 @@ |
|
115 struct ibv_recv_wr *wr, |
|
116 struct ibv_recv_wr **bad_wr) |
|
117 { |
|
118 - struct mlx4_srq *srq = to_msrq(ibsrq); |
|
119 + struct mlx4_srq *srq; |
|
120 struct mlx4_wqe_srq_next_seg *next; |
|
121 struct mlx4_wqe_data_seg *scat; |
|
122 int err = 0; |
|
123 int nreq; |
|
124 int i; |
|
125 |
|
126 + if (ibsrq->handle == LEGACY_XRC_SRQ_HANDLE) |
|
127 + ibsrq = (struct ibv_srq *)(((struct ibv_srq_legacy *) ibsrq)->ibv_srq); |
|
128 + |
|
129 + srq = to_msrq(ibsrq); |
|
130 pthread_spin_lock(&srq->lock); |
|
131 |
|
132 for (nreq = 0; wr; ++nreq, wr = wr->next) { |
|
133 @@ -290,6 +294,9 @@ |
|
134 struct mlx4_create_srq_resp resp; |
|
135 struct mlx4_srq *srq; |
|
136 int ret; |
|
137 +#if defined(__SVR4) && defined(__sun) |
|
138 + void *srqbuf; |
|
139 +#endif |
|
140 |
|
141 /* Sanity check SRQ size before proceeding */ |
|
142 if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 64) |
|
143 @@ -342,9 +349,67 @@ |
|
144 attr_ex, |
|
145 &cmd.ibv_cmd, sizeof cmd, |
|
146 &resp.ibv_resp, sizeof resp); |
|
147 + |
|
148 +#if defined(__SVR4) && defined(__sun) |
|
149 + if (ret) { |
|
150 + goto err; |
|
151 + } |
|
152 + |
|
153 + /* |
|
154 + * The kernel driver passes back mmap information for mapping the |
|
155 + * SRQ work queue memory it allocated and the doorbell for |
|
156 + * for posting. |
|
157 +  * posting. |
|
158 + if (resp.mdd.msrq_rev < 1) { |
|
159 + fprintf(stderr, PFX "libmlx4_create_xrc_srq libmlx4/hermon umap " |
|
160 + "rev mismatch (kernel rev=%d)\n", resp.mdd.msrq_rev); |
|
161 + goto err_destroy; |
|
162 + } |
|
163 + |
|
164 + srqbuf = mmap64((void *)0, resp.mdd.msrq_maplen, (PROT_READ | PROT_WRITE), |
|
165 + MAP_SHARED, attr_ex->pd->context->mmap_fd, resp.mdd.msrq_mapoffset); |
|
166 + |
|
167 + if (srqbuf == MAP_FAILED) { |
|
168 + goto err_destroy; |
|
169 + } |
|
170 + |
|
171 + srq->buf.buf = srqbuf; |
|
172 + srq->buf.length = resp.mdd.msrq_maplen; |
|
173 + srq->max = resp.ibv_resp.max_wr; |
|
174 + srq->max_gs = resp.ibv_resp.max_sge; |
|
175 + srq->verbs_srq.srq_num = srq->srqn = resp.mdd.msrq_srqnum; |
|
176 + srq->counter = 0; |
|
177 + |
|
178 + srq->db = mlx4_alloc_db(to_mctx(attr_ex->pd->context), |
|
179 + resp.mdd.msrq_rdbr_mapoffset, |
|
180 + resp.mdd.msrq_rdbr_maplen, |
|
181 + resp.mdd.msrq_rdbr_offset); |
|
182 + if (srq->db == NULL) { |
|
183 + goto err_unmap; |
|
184 + } |
|
185 + |
|
186 + /* |
|
187 + * The following call only initializes memory and control structures, |
|
188 + * it utilizes the memory allocated by the kernel. |
|
189 + * It also allocates the srq->wrid memory. |
|
190 + */ |
|
191 + if (mlx4_set_srq_buf(attr_ex->pd, srq, resp.mdd.msrq_wqesz, |
|
192 + resp.mdd.msrq_numwqe)) { |
|
193 + goto err_db; |
|
194 + } |
|
195 + |
|
196 + /* |
|
197 + * The returned max wr will have been rounded up to the nearest |
|
198 + * power of 2, subtracting 1 from that and reporting that value |
|
199 + * as the max will give us the required free WR in the queue, as |
|
200 + * in OFED. |
|
201 + */ |
|
202 + attr_ex->attr.max_wr -= 1; |
|
203 +#else |
|
204 if (ret) |
|
205 goto err_db; |
|
206 |
|
207 +#endif |
|
208 ret = mlx4_store_xsrq(&to_mctx(context)->xsrq_table, |
|
209 srq->verbs_srq.srq_num, srq); |
|
210 if (ret) |
|
211 @@ -352,13 +417,35 @@ |
|
212 |
|
213 return &srq->verbs_srq.srq; |
|
214 |
|
215 -err_destroy: |
|
216 - ibv_cmd_destroy_srq(&srq->verbs_srq.srq); |
|
217 err_db: |
|
218 mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, srq->db); |
|
219 + |
|
220 +#if defined(__SVR4) && defined(__sun) |
|
221 + if (srq->wrid) |
|
222 + free(srq->wrid); |
|
223 +err_unmap: |
|
224 + mlx4_free_buf(&srq->buf); |
|
225 + |
|
226 +err_destroy: |
|
227 + /* |
|
228 + * Calling ibv_cmd_destroy_srq() will try and take the ibv_srq |
|
229 + * mutex that is initialised by the ibv_create_srq() entry point |
|
230 +  * that called us AFTER we return, so it's not initialised yet. |
|
231 +  * So initialise it here so the destroy call doesn't hang. |
|
232 + */ |
|
233 + pthread_mutex_init(&(srq->verbs_srq.srq.mutex), NULL); |
|
234 + pthread_cond_init(&(srq->verbs_srq.srq.cond), NULL); |
|
235 + srq->verbs_srq.srq.events_completed = 0; |
|
236 + |
|
237 + ibv_cmd_destroy_srq(&srq->verbs_srq.srq); |
|
238 +#else |
|
239 +err_destroy: |
|
240 + ibv_cmd_destroy_srq(&srq->verbs_srq.srq); |
|
241 err_free: |
|
242 free(srq->wrid); |
|
243 mlx4_free_buf(&srq->buf); |
|
244 +#endif |
|
245 + |
|
246 err: |
|
247 free(srq); |
|
248 return NULL; |
|
249 diff -r 90d898abcac3 src/verbs.c |
|
250 --- a/src/verbs.c Mon Nov 21 11:48:10 2016 -0800 |
|
251 +++ b/src/verbs.c Mon Mar 20 14:22:58 2017 -0700 |
|
252 @@ -549,6 +549,21 @@ |
|
253 return 0; |
|
254 } |
|
255 |
|
256 +void *mlx4_get_legacy_xrc(struct ibv_srq *srq) |
|
257 +{ |
|
258 + struct mlx4_srq *msrq = to_msrq(srq); |
|
259 + |
|
260 + return msrq->ibv_srq_legacy; |
|
261 +} |
|
262 + |
|
263 +void mlx4_set_legacy_xrc(struct ibv_srq *srq, void *legacy_xrc_srq) |
|
264 +{ |
|
265 + struct mlx4_srq *msrq = to_msrq(srq); |
|
266 + |
|
267 + msrq->ibv_srq_legacy = legacy_xrc_srq; |
|
268 + return; |
|
269 +} |
|
270 + |
|
271 struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd, |
|
272 struct ibv_srq_init_attr *attr) |
|
273 { |
|
274 @@ -564,7 +579,7 @@ |
|
275 if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64) |
|
276 return NULL; |
|
277 |
|
278 - srq = malloc(sizeof *srq); |
|
279 + srq = calloc(1, sizeof *srq); |
|
280 if (!srq) |
|
281 return NULL; |
|
282 |
|
283 @@ -724,6 +739,9 @@ |
|
284 { |
|
285 struct ibv_modify_srq cmd; |
|
286 |
|
287 + if (srq->handle == LEGACY_XRC_SRQ_HANDLE) |
|
288 + srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq); |
|
289 + |
|
290 #if !(defined(__SVR4) && defined(__sun)) |
|
291 return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd); |
|
292 #else |
|
293 @@ -741,6 +759,9 @@ |
|
294 { |
|
295 struct ibv_query_srq cmd; |
|
296 |
|
297 + if (srq->handle == LEGACY_XRC_SRQ_HANDLE) |
|
298 + srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq); |
|
299 + |
|
300 #if !(defined(__SVR4) && defined(__sun)) |
|
301 return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd); |
|
302 #else |
|
303 @@ -757,9 +778,23 @@ |
|
304 int mlx4_destroy_srq(struct ibv_srq *srq) |
|
305 { |
|
306 int ret; |
|
307 + struct ibv_srq *legacy_srq = NULL; |
|
308 |
|
309 - if (to_msrq(srq)->ext_srq) |
|
310 - return mlx4_destroy_xrc_srq(srq); |
|
311 + if (srq->handle == LEGACY_XRC_SRQ_HANDLE) { |
|
312 + legacy_srq = srq; |
|
313 + srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq); |
|
314 + } |
|
315 + |
|
316 + if (to_msrq(srq)->ext_srq) { |
|
317 + ret = mlx4_destroy_xrc_srq(srq); |
|
318 + if (ret) |
|
319 + return ret; |
|
320 + |
|
321 + if (legacy_srq) |
|
322 + free(legacy_srq); |
|
323 + |
|
324 + return 0; |
|
325 + } |
|
326 |
|
327 ret = ibv_cmd_destroy_srq(srq); |
|
328 if (ret) |
|
329 @@ -783,7 +818,7 @@ |
|
330 struct ibv_create_qp_resp resp; |
|
331 #else |
|
332 struct mlx4_create_qp_resp resp; |
|
333 - void *qpbuf; |
|
334 + void *qpbuf = NULL; |
|
335 #endif |
|
336 |
|
337 /* Sanity check QP size before proceeding */ |
|
338 @@ -813,7 +848,8 @@ |
|
339 } |
|
340 |
|
341 if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND || |
|
342 - attr->qp_type == IBV_QPT_XRC_RECV) { |
|
343 + attr->qp_type == IBV_QPT_XRC_RECV || |
|
344 + attr->qp_type == IBV_QPT_XRC) { |
|
345 attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0; |
|
346 } else { |
|
347 #if !(defined(__SVR4) && defined(__sun)) |
|
348 @@ -900,18 +936,22 @@ |
|
349 "rev mismatch (kernel rev=%d)\n", resp.mdd.mqp_rev); |
|
350 goto err_destroy; |
|
351 } |
|
352 - qpbuf = mmap64((void *)0, resp.mdd.mqp_maplen, (PROT_READ | PROT_WRITE), |
|
353 - MAP_SHARED, context->mmap_fd, resp.mdd.mqp_mapoffset); |
|
354 - |
|
355 - if (qpbuf == MAP_FAILED) |
|
356 - goto err_destroy; |
|
357 |
|
358 - /* |
|
359 - * Need to set qp->buf here in case alloc_db fails then |
|
360 - * we'll call mlx4_free_buf() to umap. |
|
361 - */ |
|
362 - qp->buf.buf = qpbuf; |
|
363 - qp->buf.length = resp.mdd.mqp_maplen; |
|
364 + if (resp.mdd.mqp_maplen != 0) { |
|
365 + qpbuf = mmap64((void *)0, resp.mdd.mqp_maplen, |
|
366 + (PROT_READ | PROT_WRITE), MAP_SHARED, context->mmap_fd, |
|
367 + resp.mdd.mqp_mapoffset); |
|
368 + |
|
369 + if (qpbuf == MAP_FAILED) |
|
370 + goto err_destroy; |
|
371 + |
|
372 + /* |
|
373 + * Need to set qp->buf here in case alloc_db fails then |
|
374 + * we'll call mlx4_free_buf() to umap. |
|
375 + */ |
|
376 + qp->buf.buf = qpbuf; |
|
377 + qp->buf.length = resp.mdd.mqp_maplen; |
|
378 + } |
|
379 |
|
380 if (attr->cap.max_recv_sge) { |
|
381 qp->db = mlx4_alloc_db(to_mctx(context), |
|
382 @@ -934,10 +974,12 @@ |
|
383 qp->sq_spare_wqes = resp.mdd.mqp_sq_headroomwqes; |
|
384 qp->sq.wqe_cnt = resp.mdd.mqp_sq_numwqe; |
|
385 |
|
386 - if (attr->srq) |
|
387 - qp->rq.wqe_cnt = 0; |
|
388 + if (attr->srq || attr->qp_type == IBV_QPT_XRC || |
|
389 + attr->qp_type == IBV_QPT_XRC_SEND || |
|
390 + attr->qp_type == IBV_QPT_XRC_RECV) |
|
391 + qp->rq.wqe_cnt = 0; |
|
392 else |
|
393 - qp->rq.wqe_cnt = resp.mdd.mqp_rq_numwqe; |
|
394 + qp->rq.wqe_cnt = resp.mdd.mqp_rq_numwqe; |
|
395 |
|
396 if (mlx4_set_qp_buf(attr->pd, qp, qpbuf, resp.mdd.mqp_maplen, |
|
397 resp.mdd.mqp_rq_wqesz, resp.mdd.mqp_rq_off, |
|
398 @@ -1020,12 +1062,23 @@ |
|
399 struct ibv_qp_init_attr_ex attr_ex; |
|
400 struct ibv_qp *qp; |
|
401 |
|
402 - memcpy(&attr_ex, attr, sizeof *attr); |
|
403 + /* We should copy below only the shared fields excluding the xrc_domain field. |
|
404 + * Otherwise we may have an ABI issue with applications that were compiled |
|
405 + * without the xrc_domain field. The xrc_domain anyway has no effect on |
|
406 + * the sender side, no need to copy in/out. |
|
407 + */ |
|
408 + int init_attr_base_size = offsetof(struct ibv_qp_init_attr, |
|
409 + xrc_domain); |
|
410 + |
|
411 + memset(&attr_ex, 0, sizeof(attr_ex)); /* pre-set all fields to zero */ |
|
412 + /* copying only shared fields */ |
|
413 + memcpy(&attr_ex, attr, init_attr_base_size); |
|
414 attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD; |
|
415 attr_ex.pd = pd; |
|
416 + |
|
417 qp = mlx4_create_qp_ex(pd->context, &attr_ex); |
|
418 if (qp) |
|
419 - memcpy(attr, &attr_ex, sizeof *attr); |
|
420 + memcpy(attr, &attr_ex, init_attr_base_size); |
|
421 return qp; |
|
422 } |
|
423 |