components/open-fabrics/libmlx4/patches/005-libmlx4-xrc.patch
changeset 7865 22ec3267b2a3
equal deleted inserted replaced
7864:f11e8d81786a 7865:22ec3267b2a3
       
     1 # This patch was developed both in-house and from outside. We plan to submit it
       
     2 #upstream, but do not yet have a target date for doing so
       
     3 #
       
     4 # HG changeset patch
       
     5 # Parent  90d898abcac39d3fc4a631a678f0bb7bbe28d877
       
     6 25759055 OFUV (Userland) support for XRC APIs
       
     7 
       
     8 diff -r 90d898abcac3 src/mlx4.c
       
     9 --- a/src/mlx4.c	Mon Nov 21 11:48:10 2016 -0800
       
    10 +++ b/src/mlx4.c	Mon Mar 20 14:22:58 2017 -0700
       
    11 @@ -274,6 +274,8 @@
       
    12  	verbs_set_ctx_op(verbs_ctx, open_qp, mlx4_open_qp);
       
    13  	verbs_set_ctx_op(verbs_ctx, drv_ibv_create_flow, ibv_cmd_create_flow);
       
    14  	verbs_set_ctx_op(verbs_ctx, drv_ibv_destroy_flow, ibv_cmd_destroy_flow);
       
    15 +	verbs_set_ctx_op(verbs_ctx, drv_set_legacy_xrc, mlx4_set_legacy_xrc);
       
    16 +	verbs_set_ctx_op(verbs_ctx, drv_get_legacy_xrc, mlx4_get_legacy_xrc);
       
    17  
       
    18  	return 0;
       
    19  
       
    20 diff -r 90d898abcac3 src/mlx4.h
       
    21 --- a/src/mlx4.h	Mon Nov 21 11:48:10 2016 -0800
       
    22 +++ b/src/mlx4.h	Mon Mar 20 14:22:58 2017 -0700
       
    23 @@ -233,6 +233,7 @@
       
    24  	uint32_t		       *db;
       
    25  	uint16_t			counter;
       
    26  	uint8_t				ext_srq;
       
    27 +        struct ibv_srq_legacy	       *ibv_srq_legacy;
       
    28  };
       
    29  
       
    30  struct mlx4_wq {
       
    31 @@ -464,4 +465,7 @@
       
    32  		   struct mlx4_ah *ah);
       
    33  void mlx4_free_av(struct mlx4_ah *ah);
       
    34  
       
    35 +void *mlx4_get_legacy_xrc(struct ibv_srq *srq);
       
    36 +void mlx4_set_legacy_xrc(struct ibv_srq *srq, void *legacy_xrc_srq);
       
    37 +
       
    38  #endif /* MLX4_H */
       
    39 diff -r 90d898abcac3 src/qp.c
       
    40 --- a/src/qp.c	Mon Nov 21 11:48:10 2016 -0800
       
    41 +++ b/src/qp.c	Mon Mar 20 14:22:58 2017 -0700
       
    42 @@ -247,6 +247,7 @@
       
    43  
       
    44  		switch (ibqp->qp_type) {
       
    45  		case IBV_QPT_XRC_SEND:
       
    46 +		case IBV_QPT_XRC:
       
    47  			ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
       
    48  			/* fall through */
       
    49  		case IBV_QPT_RC:
       
    50 @@ -559,6 +560,7 @@
       
    51  		break;
       
    52  
       
    53  	case IBV_QPT_XRC_SEND:
       
    54 +	case IBV_QPT_XRC:
       
    55  	case IBV_QPT_RC:
       
    56  		size += sizeof (struct mlx4_wqe_raddr_seg);
       
    57  		/*
       
    58 @@ -596,9 +598,11 @@
       
    59  	qp->buf.buf      = qpbuf;
       
    60  	qp->buf.length   = buflen;
       
    61  
       
    62 -	qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
       
    63 -	if (!qp->sq.wrid)
       
    64 -		return -1;
       
    65 +	if (qp->sq.wqe_cnt) {
       
    66 +		qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
       
    67 +		if (!qp->sq.wrid)
       
    68 +			return -1;
       
    69 +	}
       
    70  
       
    71  	if (qp->rq.wqe_cnt) {
       
    72  		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
       
    73 @@ -628,16 +632,20 @@
       
    74  		qp->sq.offset = 0;
       
    75  	}
       
    76  
       
    77 -	if ((long int)qp->buf.length < (long int)qp->buf_size) {
       
    78 -		fprintf(stderr, PFX "QP kernel buffer size %lu < user buf "
       
    79 -		    "size %d\n", (unsigned long)qp->buf.length, qp->buf_size);
       
    80 -	}
       
    81 -	if ((!rq_off && qp->rq.offset) || (!sq_off && qp->sq.offset)) {
       
    82 -		fprintf(stderr, PFX "QP kernel and user out of sync on "
       
    83 -		    "buffer order\n");
       
    84 -	}
       
    85 +	if (qp->buf_size) {
       
    86 +		if ((long int)qp->buf.length < (long int)qp->buf_size) {
       
    87 +			fprintf(stderr, PFX "QP kernel buffer size %lu < user "
       
    88 +			    "buf size %d\n", (unsigned long)qp->buf.length,
       
    89 +			    qp->buf_size);
       
    90 +		}
       
    91 +		if ((!rq_off && qp->rq.offset) || (!sq_off && qp->sq.offset)) {
       
    92 +			fprintf(stderr, PFX "QP kernel and user out of sync on "
       
    93 +			    "buffer order\n");
       
    94 +		}
       
    95  
       
    96 -	memset(qp->buf.buf, 0, qp->buf_size);
       
    97 +		memset(qp->buf.buf, 0, qp->buf_size);
       
    98 +	} else
       
    99 +		qp->buf.buf = NULL;
       
   100  	return 0;
       
   101  }
       
   102  #endif
       
   103 @@ -705,6 +713,7 @@
       
   104  		break;
       
   105  
       
   106  	case IBV_QPT_XRC_SEND:
       
   107 +	case IBV_QPT_XRC:
       
   108  	case IBV_QPT_UC:
       
   109  	case IBV_QPT_RC:
       
   110  		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
       
   111 diff -r 90d898abcac3 src/srq.c
       
   112 --- a/src/srq.c	Mon Nov 21 11:48:10 2016 -0800
       
   113 +++ b/src/srq.c	Mon Mar 20 14:22:58 2017 -0700
       
   114 @@ -66,13 +66,17 @@
       
   115  		       struct ibv_recv_wr *wr,
       
   116  		       struct ibv_recv_wr **bad_wr)
       
   117  {
       
   118 -	struct mlx4_srq *srq = to_msrq(ibsrq);
       
   119 +	struct mlx4_srq *srq;
       
   120  	struct mlx4_wqe_srq_next_seg *next;
       
   121  	struct mlx4_wqe_data_seg *scat;
       
   122  	int err = 0;
       
   123  	int nreq;
       
   124  	int i;
       
   125  
       
   126 +	if (ibsrq->handle == LEGACY_XRC_SRQ_HANDLE)
       
   127 +	       ibsrq = (struct ibv_srq *)(((struct ibv_srq_legacy *) ibsrq)->ibv_srq);
       
   128 + 
       
   129 +	srq = to_msrq(ibsrq);
       
   130  	pthread_spin_lock(&srq->lock);
       
   131  
       
   132  	for (nreq = 0; wr; ++nreq, wr = wr->next) {
       
   133 @@ -290,6 +294,9 @@
       
   134  	struct mlx4_create_srq_resp resp;
       
   135  	struct mlx4_srq *srq;
       
   136  	int ret;
       
   137 +#if defined(__SVR4) && defined(__sun)
       
   138 +	void		*srqbuf;
       
   139 +#endif
       
   140  
       
   141  	/* Sanity check SRQ size before proceeding */
       
   142  	if (attr_ex->attr.max_wr > 1 << 16 || attr_ex->attr.max_sge > 64)
       
   143 @@ -342,9 +349,67 @@
       
   144  				    attr_ex,
       
   145  				    &cmd.ibv_cmd, sizeof cmd,
       
   146  				    &resp.ibv_resp, sizeof resp);
       
   147 +
       
   148 +#if defined(__SVR4) && defined(__sun)
       
   149 +	if (ret) {
       
   150 +		goto err;
       
   151 +	}
       
   152 +
       
   153 +	/*
       
   154 +	 * The kernel driver passes back mmap information for mapping the
       
   155 +	 * SRQ work queue memory it allocated and the doorbell for
       
    156 +	 * posting.
       
   157 +	 */
       
   158 +	if (resp.mdd.msrq_rev < 1) {
       
   159 +		fprintf(stderr, PFX "libmlx4_create_xrc_srq libmlx4/hermon umap "
       
   160 +			"rev mismatch (kernel rev=%d)\n", resp.mdd.msrq_rev);
       
   161 +		goto err_destroy;
       
   162 +	}
       
   163 +
       
   164 +	srqbuf = mmap64((void *)0, resp.mdd.msrq_maplen, (PROT_READ | PROT_WRITE),
       
   165 +		MAP_SHARED, attr_ex->pd->context->mmap_fd, resp.mdd.msrq_mapoffset);
       
   166 +
       
   167 +	if (srqbuf == MAP_FAILED) {
       
   168 +		goto err_destroy;
       
   169 +	}
       
   170 +
       
   171 +	srq->buf.buf	= srqbuf;
       
   172 +	srq->buf.length	= resp.mdd.msrq_maplen;
       
   173 +	srq->max	= resp.ibv_resp.max_wr;
       
   174 +	srq->max_gs	= resp.ibv_resp.max_sge;
       
   175 +	srq->verbs_srq.srq_num = srq->srqn = resp.mdd.msrq_srqnum;
       
   176 +	srq->counter	= 0;
       
   177 +
       
   178 +	srq->db = mlx4_alloc_db(to_mctx(attr_ex->pd->context),
       
   179 +			resp.mdd.msrq_rdbr_mapoffset,
       
   180 +			resp.mdd.msrq_rdbr_maplen,
       
   181 +			resp.mdd.msrq_rdbr_offset);
       
   182 +	if (srq->db == NULL) {
       
   183 +		goto err_unmap;
       
   184 +	}
       
   185 +
       
   186 +	/*
       
   187 +	 * The following call only initializes memory and control structures,
       
   188 +	 * it utilizes the memory allocated by the kernel.
       
   189 +	 * It also allocates the srq->wrid memory.
       
   190 +	 */
       
   191 +	if (mlx4_set_srq_buf(attr_ex->pd, srq, resp.mdd.msrq_wqesz,
       
   192 +			resp.mdd.msrq_numwqe)) {
       
   193 +		goto err_db;
       
   194 +	}
       
   195 +
       
   196 +	/*
       
   197 +	 * The returned max wr will have been rounded up to the nearest
       
   198 +	 * power of 2, subtracting 1 from that and reporting that value
       
   199 +	 * as the max will give us the required free WR in the queue, as
       
   200 +	 * in OFED.
       
   201 +	 */
       
   202 +	attr_ex->attr.max_wr -= 1;
       
   203 +#else
       
   204  	if (ret)
       
   205  		goto err_db;
       
   206  
       
   207 +#endif
       
   208  	ret = mlx4_store_xsrq(&to_mctx(context)->xsrq_table,
       
   209  			      srq->verbs_srq.srq_num, srq);
       
   210  	if (ret)
       
   211 @@ -352,13 +417,35 @@
       
   212  
       
   213  	return &srq->verbs_srq.srq;
       
   214  
       
   215 -err_destroy:
       
   216 -	ibv_cmd_destroy_srq(&srq->verbs_srq.srq);
       
   217  err_db:
       
   218  	mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_RQ, srq->db);
       
   219 +
       
   220 +#if defined(__SVR4) && defined(__sun)
       
   221 +	if (srq->wrid)
       
   222 +		free(srq->wrid);
       
   223 +err_unmap:
       
   224 +	mlx4_free_buf(&srq->buf);
       
   225 +
       
   226 +err_destroy:
       
   227 +	/*
       
   228 +	 * Calling ibv_cmd_destroy_srq() will try and take the ibv_srq
       
   229 +	 * mutex that is initialised by the ibv_create_srq() entry point
       
    230 +	 * that called us AFTER we return, so it's not initialised yet.
       
    231 +	 * So initialise it here so the destroy call doesn't hang.
       
   232 +	 */
       
   233 +	pthread_mutex_init(&(srq->verbs_srq.srq.mutex), NULL);
       
   234 +	pthread_cond_init(&(srq->verbs_srq.srq.cond), NULL);
       
   235 +	srq->verbs_srq.srq.events_completed = 0;
       
   236 +
       
   237 +	ibv_cmd_destroy_srq(&srq->verbs_srq.srq);
       
   238 +#else
       
   239 +err_destroy:
       
   240 +        ibv_cmd_destroy_srq(&srq->verbs_srq.srq);
       
   241  err_free:
       
   242  	free(srq->wrid);
       
   243  	mlx4_free_buf(&srq->buf);
       
   244 +#endif
       
   245 +
       
   246  err:
       
   247  	free(srq);
       
   248  	return NULL;
       
   249 diff -r 90d898abcac3 src/verbs.c
       
   250 --- a/src/verbs.c	Mon Nov 21 11:48:10 2016 -0800
       
   251 +++ b/src/verbs.c	Mon Mar 20 14:22:58 2017 -0700
       
   252 @@ -549,6 +549,21 @@
       
   253  	return 0;
       
   254  }
       
   255  
       
   256 +void *mlx4_get_legacy_xrc(struct ibv_srq *srq)
       
   257 +{
       
   258 +       struct mlx4_srq *msrq = to_msrq(srq);
       
   259 +
       
   260 +       return msrq->ibv_srq_legacy;
       
   261 +}
       
   262 +
       
   263 +void mlx4_set_legacy_xrc(struct ibv_srq *srq, void *legacy_xrc_srq)
       
   264 +{
       
   265 +       struct mlx4_srq *msrq = to_msrq(srq);
       
   266 +
       
   267 +       msrq->ibv_srq_legacy = legacy_xrc_srq;
       
   268 +       return;
       
   269 +}
       
   270 +
       
   271  struct ibv_srq *mlx4_create_srq(struct ibv_pd *pd,
       
   272  				struct ibv_srq_init_attr *attr)
       
   273  {
       
   274 @@ -564,7 +579,7 @@
       
   275  	if (attr->attr.max_wr > 1 << 16 || attr->attr.max_sge > 64)
       
   276  		return NULL;
       
   277  
       
   278 -	srq = malloc(sizeof *srq);
       
   279 +	srq = calloc(1, sizeof *srq);
       
   280  	if (!srq)
       
   281  		return NULL;
       
   282  
       
   283 @@ -724,6 +739,9 @@
       
   284  {
       
   285  	struct ibv_modify_srq cmd;
       
   286  
       
   287 +	if (srq->handle == LEGACY_XRC_SRQ_HANDLE)
       
   288 +		srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq);
       
   289 +
       
   290  #if !(defined(__SVR4) && defined(__sun))
       
   291  	return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd);
       
   292  #else
       
   293 @@ -741,6 +759,9 @@
       
   294  {
       
   295  	struct ibv_query_srq cmd;
       
   296  
       
   297 +	if (srq->handle == LEGACY_XRC_SRQ_HANDLE)
       
   298 +		srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq);
       
   299 +
       
   300  #if !(defined(__SVR4) && defined(__sun))
       
   301  	return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
       
   302  #else
       
   303 @@ -757,9 +778,23 @@
       
   304  int mlx4_destroy_srq(struct ibv_srq *srq)
       
   305  {
       
   306  	int ret;
       
   307 +	struct ibv_srq *legacy_srq = NULL;
       
   308  
       
   309 -	if (to_msrq(srq)->ext_srq)
       
   310 -		return mlx4_destroy_xrc_srq(srq);
       
   311 +	if (srq->handle == LEGACY_XRC_SRQ_HANDLE) {
       
   312 +		legacy_srq = srq;
       
   313 +		srq = (struct ibv_srq *)(((struct ibv_srq_legacy *) srq)->ibv_srq);
       
   314 +	}
       
   315 +
       
   316 +	if (to_msrq(srq)->ext_srq) {
       
   317 +		ret =  mlx4_destroy_xrc_srq(srq);
       
   318 +		if (ret)
       
   319 +			return ret;
       
   320 + 
       
   321 +		if (legacy_srq)
       
   322 +			free(legacy_srq);
       
   323 +
       
   324 +		return 0;
       
   325 +	}
       
   326  
       
   327  	ret = ibv_cmd_destroy_srq(srq);
       
   328  	if (ret)
       
   329 @@ -783,7 +818,7 @@
       
   330  	struct ibv_create_qp_resp resp;
       
   331  #else
       
   332  	struct mlx4_create_qp_resp	resp;
       
   333 -	void				*qpbuf;
       
   334 +	void				*qpbuf = NULL;
       
   335  #endif
       
   336  
       
   337  	/* Sanity check QP size before proceeding */
       
   338 @@ -813,7 +848,8 @@
       
   339  	}
       
   340  
       
   341  	if (attr->srq || attr->qp_type == IBV_QPT_XRC_SEND ||
       
   342 -	    attr->qp_type == IBV_QPT_XRC_RECV) {
       
   343 +	    attr->qp_type == IBV_QPT_XRC_RECV ||
       
   344 +	    attr->qp_type == IBV_QPT_XRC) {
       
   345  		attr->cap.max_recv_wr = qp->rq.wqe_cnt = attr->cap.max_recv_sge = 0;
       
   346  	} else {
       
   347  #if !(defined(__SVR4) && defined(__sun))
       
   348 @@ -900,18 +936,22 @@
       
   349  		    "rev mismatch (kernel rev=%d)\n", resp.mdd.mqp_rev);
       
   350  		goto err_destroy;
       
   351  	}
       
   352 -	qpbuf = mmap64((void *)0, resp.mdd.mqp_maplen, (PROT_READ | PROT_WRITE),
       
   353 -	    MAP_SHARED, context->mmap_fd, resp.mdd.mqp_mapoffset);
       
   354 -
       
   355 -	if (qpbuf == MAP_FAILED)
       
   356 -		goto err_destroy;
       
   357  
       
   358 -	/*
       
   359 -	 * Need to set qp->buf here in case alloc_db fails then
       
   360 -	 * we'll call mlx4_free_buf() to umap.
       
   361 -	 */
       
   362 -	qp->buf.buf	= qpbuf;
       
   363 -	qp->buf.length	= resp.mdd.mqp_maplen;
       
   364 +	if (resp.mdd.mqp_maplen != 0) {
       
   365 +		qpbuf = mmap64((void *)0, resp.mdd.mqp_maplen,
       
   366 +		    (PROT_READ | PROT_WRITE), MAP_SHARED, context->mmap_fd,
       
   367 +		    resp.mdd.mqp_mapoffset);
       
   368 +
       
   369 +		if (qpbuf == MAP_FAILED)
       
   370 +			goto err_destroy;
       
   371 +
       
   372 +		/*
       
   373 +		 * Need to set qp->buf here in case alloc_db fails then
       
   374 +		 * we'll call mlx4_free_buf() to umap.
       
   375 +		 */
       
   376 +		qp->buf.buf	= qpbuf;
       
   377 +		qp->buf.length	= resp.mdd.mqp_maplen;
       
   378 +	}
       
   379  
       
   380  	if (attr->cap.max_recv_sge) {
       
   381  		qp->db = mlx4_alloc_db(to_mctx(context),
       
   382 @@ -934,10 +974,12 @@
       
   383  	qp->sq_spare_wqes = resp.mdd.mqp_sq_headroomwqes;
       
   384  	qp->sq.wqe_cnt    = resp.mdd.mqp_sq_numwqe;
       
   385  
       
   386 -	if (attr->srq)
       
   387 -		qp->rq.wqe_cnt  = 0;
       
   388 +	if (attr->srq || attr->qp_type == IBV_QPT_XRC ||
       
   389 +	    attr->qp_type == IBV_QPT_XRC_SEND ||
       
   390 +	    attr->qp_type == IBV_QPT_XRC_RECV)
       
   391 +		qp->rq.wqe_cnt	= 0;
       
   392  	else
       
   393 -		qp->rq.wqe_cnt  = resp.mdd.mqp_rq_numwqe;
       
   394 +		qp->rq.wqe_cnt	= resp.mdd.mqp_rq_numwqe;
       
   395  
       
   396  	if (mlx4_set_qp_buf(attr->pd, qp, qpbuf, resp.mdd.mqp_maplen,
       
   397  	    resp.mdd.mqp_rq_wqesz, resp.mdd.mqp_rq_off,
       
   398 @@ -1020,12 +1062,23 @@
       
   399  	struct ibv_qp_init_attr_ex attr_ex;
       
   400  	struct ibv_qp *qp;
       
   401  
       
   402 -	memcpy(&attr_ex, attr, sizeof *attr);
       
   403 +	/* We should copy below only the shared fields excluding the xrc_domain field.
       
   404 +	 * Otherwise we may have an ABI issue with applications that were compiled
       
    405 +	 * without the xrc_domain field. The xrc_domain anyway has no effect on
       
   406 +	 * the sender side, no need to copy in/out.
       
   407 +	 */
       
   408 +	int init_attr_base_size = offsetof(struct ibv_qp_init_attr,
       
   409 +		xrc_domain);
       
   410 +
       
   411 +	memset(&attr_ex, 0, sizeof(attr_ex)); /* pre-set all fields to zero */
       
   412 +	/* copying only shared fields */
       
   413 +	memcpy(&attr_ex, attr, init_attr_base_size);
       
   414  	attr_ex.comp_mask = IBV_QP_INIT_ATTR_PD;
       
   415  	attr_ex.pd = pd;
       
   416 +
       
   417  	qp = mlx4_create_qp_ex(pd->context, &attr_ex);
       
   418  	if (qp)
       
   419 -		memcpy(attr, &attr_ex, sizeof *attr);
       
   420 +		memcpy(attr, &attr_ex, init_attr_base_size);
       
   421  	return qp;
       
   422  }
       
   423