usr/src/uts/common/io/ib/clients/iser/iser_cq.c
author Peter Dunlap <Peter.Dunlap@Sun.COM>
Tue, 24 Mar 2009 17:50:49 -0600
changeset 9162 b011b0287065
child 9247 331d3de1fc0f
permissions -rw-r--r--
PSARC 2008/395 iSER: iSCSI Extensions for RDMA 6702590 iSCSI initiator needs to support iSER transport 6702591 COMSTAR iSCSI port provider needs to support iSER transport 6797024 COMSTAR iscsit asserted at iscsit_login.c line: 679 6776635 panic[cpu0]/thread=ffffff000f874c60,assertion failed: 0, file: ../../common/io/idm/idm.c, line: 1465 6802232 Bad kernel fault at addr=0x0 from idm_crc32c call

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <sys/sunddi.h>
#include <sys/sdt.h>
#include <sys/ib/ibtl/ibti.h>
#include <sys/ib/ibtl/ibtl_types.h>

#include <sys/ib/clients/iser/iser.h>

/*
 * iser_cq.c
 *    Routines for completion queue handlers for iSER.
 */
/* Dispatches a received iSER message based on its control header opcode */
static void iser_msg_handle(iser_chan_t *chan, iser_msg_t *msg);
/* Wraps a received iSCSI control-type PDU in an idm_pdu_t and passes it to IDM */
int iser_iscsihdr_handle(iser_chan_t *chan, iser_msg_t *msg);
/* One polling pass over the send CQ; used by iser_ib_sendcq_handler() */
static int iser_ib_poll_send_completions(ibt_cq_hdl_t cq_hdl,
    iser_chan_t *iser_chan);
/* One polling pass over the receive CQ; used by iser_ib_recvcq_handler() */
static int iser_ib_poll_recv_completions(ibt_cq_hdl_t cq_hdl,
    iser_chan_t *iser_chan);

/*
 * iser_ib_sendcq_handler()
 * Completion handler for a channel's send CQ.
 *
 *	cq_hdl	- handle of the CQ to be drained
 *	arg	- the iser_chan_t that owns the CQ
 *
 * The CQ is polled until it reports empty, then rearmed for the next
 * completion, then polled once more to pick up anything that arrived
 * between the last poll and the rearm.
 *
 * Polling continues only while it succeeds. If the poll routine reports
 * an error other than IBT_CQ_EMPTY, we stop instead of retrying: looping
 * until "empty" would never terminate if the error is persistent. In
 * that case the CQ is not rearmed either.
 */
void
iser_ib_sendcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	iser_chan_t	*iser_chan = (iser_chan_t *)arg;
	ibt_status_t	status;

	/* Poll completions for as long as the poll is successful */
	do {
		status = iser_ib_poll_send_completions(cq_hdl, iser_chan);
	} while (status == IBT_SUCCESS);

	if (status != IBT_CQ_EMPTY) {
		/* Unexpected poll error, reported by the poll routine */
		return;
	}

	/* We've emptied the CQ, rearm it before we're done here */
	status = ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
	if (status != IBT_SUCCESS) {
		/* Unexpected error */
		ISER_LOG(CE_NOTE, "iser_ib_sendcq_handler: "
		    "ibt_enable_cq_notify error (%d)", status);
		return;
	}

	/* Now, check for more completions after the rearm */
	do {
		status = iser_ib_poll_send_completions(cq_hdl, iser_chan);
	} while (status == IBT_SUCCESS);
}

/*
 * iser_ib_poll_send_completions()
 * Poll up to ISER_IB_SCQ_POLL_MAX work completions from the send CQ and
 * process each one.
 *
 *	cq_hdl		- handle of the send CQ to poll
 *	iser_chan	- channel that owns the CQ
 *
 * Returns the ibt_poll_cq() status: IBT_SUCCESS when a batch of
 * completions was polled and processed, IBT_CQ_EMPTY when nothing was
 * queued, or another IBT error (which is logged here).
 *
 * Each completion's work request ID is the iser_wr_t that was posted.
 * For every completion the channel's SQ posted count is decremented and
 * the WR handle is freed, except on the ASSERT(0) path for an unknown
 * iw_type. Failed completions additionally raise CE_TRANSPORT_FAIL to
 * IDM unless the connection is already disconnected, freed, closing or
 * closed.
 */
static int
iser_ib_poll_send_completions(ibt_cq_hdl_t cq_hdl, iser_chan_t *iser_chan)
{
	ibt_wc_t	wc[ISER_IB_SCQ_POLL_MAX];
	ibt_wrid_t	wrid;
	idm_buf_t	*idb = NULL;
	idm_task_t	*idt = NULL;
	iser_wr_t	*wr = NULL;
	int		i;
	uint_t		npoll = 0;
	ibt_status_t	status;
	iser_conn_t	*iser_conn;
	idm_status_t	idm_status;

	iser_conn = iser_chan->ic_conn;

	/*
	 * Poll ISER_IB_SCQ_POLL_MAX completions from the CQ.
	 */
	status = ibt_poll_cq(cq_hdl, wc, ISER_IB_SCQ_POLL_MAX, &npoll);

	if (status != IBT_SUCCESS) {
		if (status != IBT_CQ_EMPTY) {
			/*
			 * Unexpected error. Note that the message is tagged
			 * with the CQ handler's name although it is issued
			 * from this routine.
			 */
			ISER_LOG(CE_NOTE, "iser_ib_sendcq_handler: ibt_poll_cq "
			    "error (%d)", status);
		}
		/* CQ is empty. Either way, move along... */
		return (status);
	}

	/*
	 * Handle each of the completions we've polled
	 */
	for (i = 0; i < npoll; i++) {

		DTRACE_PROBE3(iser__send__cqe, iser_chan_t *, iser_chan,
		    ibt_wc_t *, &wc[i], ibt_wc_status_t, wc[i].wc_status);

		/* Grab the wrid of the completion */
		wrid = wc[i].wc_id;

		/* Decrement this channel's SQ posted count */
		mutex_enter(&iser_chan->ic_sq_post_lock);
		iser_chan->ic_sq_post_count--;
		mutex_exit(&iser_chan->ic_sq_post_lock);

		/* The wrid carries the pointer to our WR handle */
		wr = (iser_wr_t *)(uintptr_t)wrid;
		ASSERT(wr != NULL);

		/* Set an idm_status for return to IDM */
		idm_status = (wc[i].wc_status == IBT_WC_SUCCESS) ?
		    IDM_STATUS_SUCCESS : IDM_STATUS_FAIL;

		/*
		 * A non-success status here indicates the QP went
		 * into an error state while this WR was being
		 * processed. This can also happen when the
		 * channel is closed on the remote end. Clean up
		 * the resources, then push CE_TRANSPORT_FAIL
		 * into IDM.
		 */
		if (wc[i].wc_status != IBT_WC_SUCCESS) {
			/*
			 * Free the resources attached to this
			 * completion. Any of msg, pdu and buf may be
			 * set on the WR, so each is checked on its own.
			 */
			if (wr->iw_msg != NULL) {
				/* Free iser_msg handle */
				iser_msg_free(wr->iw_msg);
			}

			if (wr->iw_pdu != NULL) {
				/* Complete the PDU (with IDM_STATUS_FAIL) */
				idm_pdu_complete(wr->iw_pdu, idm_status);
			}

			if (wr->iw_buf != NULL) {
				/* Invoke buffer callback */
				idb = wr->iw_buf;
#ifdef DEBUG
				/* Keep a copy of the CQE with the buffer */
				bcopy(&wc[i],
				    &((iser_buf_t *)idb->idb_buf_private)->
				    buf_wc, sizeof (ibt_wc_t));
#endif
				idt = idb->idb_task_binding;
				/*
				 * idt_mutex is entered here and not exited
				 * in this function.
				 * NOTE(review): presumably the idm_buf_*_done
				 * routines expect the mutex held and release
				 * it -- confirm against IDM.
				 */
				mutex_enter(&idt->idt_mutex);
				if (wr->iw_type == ISER_WR_RDMAW) {
					idm_buf_tx_to_ini_done(idt, idb,
					    IDM_STATUS_FAIL);
				} else { /* ISER_WR_RDMAR */
					idm_buf_rx_from_ini_done(idt, idb,
					    IDM_STATUS_FAIL);
				}
			}

			/* Free the iser wr handle */
			iser_wr_free(wr);

			/*
			 * Tell IDM that the channel has gone down,
			 * unless it already knows. The event is raised
			 * with ic_lock held so that the stage check and
			 * the move to CLOSING happen together.
			 */
			mutex_enter(&iser_conn->ic_lock);
			switch (iser_conn->ic_stage) {
			case ISER_CONN_STAGE_IC_DISCONNECTED:
			case ISER_CONN_STAGE_IC_FREED:
			case ISER_CONN_STAGE_CLOSING:
			case ISER_CONN_STAGE_CLOSED:
				break;

			default:
				idm_conn_event(iser_conn->ic_idmc,
				    CE_TRANSPORT_FAIL, idm_status);
				iser_conn->ic_stage = ISER_CONN_STAGE_CLOSING;
			}
			mutex_exit(&iser_conn->ic_lock);

			/* Move onto the next completion */
			continue;
		}

		/*
		 * For a success status, just invoke the PDU or
		 * buffer completion. We dispatch on our WR handle's
		 * "iw_type" rather than on the CQE's opcode, as is
		 * also done on the failure path above, where the
		 * CQE's opcode is invalid.
		 */
		switch (wr->iw_type) {
		case ISER_WR_SEND:
			/* Free the msg handle */
			ASSERT(wr->iw_msg != NULL);
			iser_msg_free(wr->iw_msg);

			if (wr->iw_pdu == NULL) {
				/* This is a hello exchange message */
				mutex_enter(&iser_conn->ic_lock);
				if (iser_conn->ic_stage ==
				    ISER_CONN_STAGE_HELLOREPLY_SENT) {
					/*
					 * We're on the target side,
					 * and have just successfully
					 * sent the HelloReply msg.
					 */
					iser_conn->ic_stage =
					    ISER_CONN_STAGE_LOGGED_IN;
				}
				mutex_exit(&iser_conn->ic_lock);
			} else {
				/* This is a normal control message */
				idm_pdu_complete(wr->iw_pdu, idm_status);
			}

			/* Free the wr handle */
			iser_wr_free(wr);

			break;

		case ISER_WR_RDMAW:
		case ISER_WR_RDMAR:
			/*
			 * Invoke the appropriate callback;
			 * the buffer will be freed there.
			 */
			idb = wr->iw_buf;
#ifdef DEBUG
			/* Keep a copy of the CQE with the buffer */
			bcopy(&wc[i],
			    &((iser_buf_t *)idb->idb_buf_private)->buf_wc,
			    sizeof (ibt_wc_t));
#endif
			idt = idb->idb_task_binding;

			/*
			 * As on the failure path, idt_mutex is entered
			 * here and not exited in this function.
			 * NOTE(review): presumably released by the
			 * idm_buf_*_done routines -- confirm.
			 */
			mutex_enter(&idt->idt_mutex);
			if (wr->iw_type == ISER_WR_RDMAW) {
				idm_buf_tx_to_ini_done(idt, idb, idm_status);
			} else {
				idm_buf_rx_from_ini_done(idt, idb, idm_status);
			}

			/* Free the wr handle */
			iser_wr_free(wr);

			break;

		default:
			/*
			 * Unknown WR type. Note that the WR handle is
			 * not freed on this path.
			 */
			ASSERT(0);
			break;
		}
	}

	/* status is IBT_SUCCESS here: a batch was polled and processed */
	return (status);
}

/*
 * iser_ib_recvcq_handler()
 * Completion handler for a channel's receive CQ.
 *
 *	cq_hdl	- handle of the CQ to be drained
 *	arg	- the iser_chan_t that owns the CQ
 *
 * The CQ is polled until it reports empty, then rearmed for the next
 * completion, then polled once more to pick up anything that arrived
 * between the last poll and the rearm.
 *
 * Polling continues only while it succeeds. If the poll routine reports
 * an error other than IBT_CQ_EMPTY, we stop instead of retrying: looping
 * until "empty" would never terminate if the error is persistent. In
 * that case the CQ is not rearmed either.
 *
 * NOTE(review): iser_ib_poll_recv_completions() returns DDI_SUCCESS after
 * handling a completion; the comparisons below rely on DDI_SUCCESS and
 * IBT_SUCCESS sharing the same value (both are expected to be 0).
 */
void
iser_ib_recvcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	iser_chan_t	*iser_chan = (iser_chan_t *)arg;
	ibt_status_t	status;

	/* Poll completions for as long as the poll is successful */
	do {
		status = iser_ib_poll_recv_completions(cq_hdl, iser_chan);
	} while (status == IBT_SUCCESS);

	if (status != IBT_CQ_EMPTY) {
		/* Unexpected poll error, reported by the poll routine */
		return;
	}

	/* We've emptied the CQ, rearm it before we're done here */
	status = ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
	if (status != IBT_SUCCESS) {
		/* Unexpected error */
		ISER_LOG(CE_NOTE, "iser_ib_recvcq_handler: "
		    "ibt_enable_cq_notify error (%d)", status);
		return;
	}

	/* Now, check for more completions after the rearm */
	do {
		status = iser_ib_poll_recv_completions(cq_hdl, iser_chan);
	} while (status == IBT_SUCCESS);
}

/*
 * iser_ib_poll_recv_completions()
 * Poll a single work completion from the receive CQ and process it.
 *
 *	cq_hdl		- handle of the receive CQ to poll
 *	iser_chan	- channel that owns the CQ
 *
 * Returns IBT_CQ_EMPTY when nothing was queued, the ibt_poll_cq() error
 * when the poll itself failed, and DDI_SUCCESS once a completion has
 * been consumed (whether the completion succeeded or not).
 *
 * Every consumed completion lowers the QP's posted receive level; when
 * the level reaches the low-water mark and no refill is already pending,
 * iser_ib_post_recv is dispatched on iser_taskq to replenish the RQ.
 */
static int
iser_ib_poll_recv_completions(ibt_cq_hdl_t cq_hdl, iser_chan_t *iser_chan)
{
	iser_qp_t	*qp = &(iser_chan->ic_qp);
	iser_conn_t	*conn;
	iser_msg_t	*rx_msg;
	ibt_wc_t	cqe;
	boolean_t	refill = B_FALSE;
	int		rval;

	bzero(&cqe, sizeof (ibt_wc_t));
	rval = ibt_poll_cq(cq_hdl, &cqe, 1, NULL);

	/* Nothing queued: hand the "empty" status straight back */
	if (rval == IBT_CQ_EMPTY) {
		return (rval);
	}

	/* The poll itself failed: log, account for it and bail out */
	if (rval != IBT_SUCCESS) {
		ISER_LOG(CE_NOTE, "iser_ib_poll_recv_completions: "
		    "ibt_poll_cq error (%d)", rval);

		mutex_enter(&qp->qp_lock);
		qp->rq_level--;
		mutex_exit(&qp->qp_lock);

		/* The CQE was zeroed above, so a non-NULL id was filled in */
		rx_msg = (iser_msg_t *)(uintptr_t)cqe.wc_id;
		if (rx_msg != NULL) {
			iser_msg_free(rx_msg);
		}
		return (rval);
	}

	/* The work request ID is the iSER msg handle that was posted */
	rx_msg = (iser_msg_t *)(uintptr_t)cqe.wc_id;
	ASSERT(rx_msg != NULL);

	/*
	 * One fewer receive is posted now. Decide, under the QP lock,
	 * whether the RQ needs topping up; claim the pending flag here
	 * so that only one refill is in flight at a time.
	 */
	mutex_enter(&qp->qp_lock);
	qp->rq_level--;
	if ((qp->rq_taskqpending == B_FALSE) &&
	    (qp->rq_level <= qp->rq_lwm)) {
		qp->rq_taskqpending = B_TRUE;
		refill = B_TRUE;
	}
	mutex_exit(&qp->qp_lock);

	if (refill) {
		rval = ddi_taskq_dispatch(iser_taskq, iser_ib_post_recv,
		    (void *)iser_chan->ic_chanhdl, DDI_NOSLEEP);

		if (rval != DDI_SUCCESS) {
			ISER_LOG(CE_NOTE, "iser_ib_poll_recv_completions: "
			    "task dispatch failed");
			/* Nothing was queued, so give the flag back */
			mutex_enter(&qp->qp_lock);
			qp->rq_taskqpending = B_FALSE;
			mutex_exit(&qp->qp_lock);
		}
	}

	DTRACE_PROBE3(iser__recv__cqe, iser_chan_t *, iser_chan,
	    ibt_wc_t *, &cqe, ibt_wc_status_t, cqe.wc_status);

	if (cqe.wc_status == IBT_WC_SUCCESS) {
		/*
		 * A good iSER message has arrived. The handler takes
		 * over the msg handle and frees it as appropriate.
		 */
		iser_msg_handle(iser_chan, rx_msg);
		return (DDI_SUCCESS);
	}

	/*
	 * The completion failed. Let IDM know the transport is down,
	 * unless the connection is already on its way out.
	 */
	conn = iser_chan->ic_conn;
	mutex_enter(&conn->ic_lock);
	switch (conn->ic_stage) {
	case ISER_CONN_STAGE_IC_DISCONNECTED:
	case ISER_CONN_STAGE_IC_FREED:
	case ISER_CONN_STAGE_CLOSING:
	case ISER_CONN_STAGE_CLOSED:
		break;

	default:
		idm_conn_event(conn->ic_idmc, CE_TRANSPORT_FAIL,
		    IDM_STATUS_FAIL);
		conn->ic_stage = ISER_CONN_STAGE_CLOSING;
	}
	mutex_exit(&conn->ic_lock);

	iser_msg_free(rx_msg);
	return (DDI_SUCCESS);
}

/*
 * iser_msg_handle()
 * Dispatch a received iSER message according to the opcode found in its
 * iSER control header (located at the start of the message buffer).
 *
 *	chan	- channel the message arrived on
 *	msg	- the received iSER message handle
 *
 * Ownership of "msg": for an iSCSI control-type PDU that is handed off
 * successfully the handle is freed later by the PDU callback; in every
 * other case it is freed here.
 *
 * Hello/HelloReply handling is present but guarded by ASSERTs, since the
 * Hello exchange is currently not supported.
 */
static void
iser_msg_handle(iser_chan_t *chan, iser_msg_t *msg)
{
	int		opcode;
	iser_ctrl_hdr_t	*hdr = NULL;
	iser_conn_t	*iser_conn = chan->ic_conn;
	int		status;

	hdr = (iser_ctrl_hdr_t *)(uintptr_t)msg->msg_ds.ds_va;
	ASSERT(hdr != NULL);

	opcode = hdr->opcode;
	if (opcode == ISER_OPCODE_CTRL_TYPE_PDU) {
		/*
		 * Handle an iSCSI Control PDU iSER message.
		 * Note we'll free the msg handle in the PDU callback.
		 */
		status = iser_iscsihdr_handle(chan, msg);
		if (status != DDI_SUCCESS) {
			/*
			 * We are unable to handle this message, and
			 * have no way to recover from this.  Fail the
			 * transport.
			 */
			ISER_LOG(CE_NOTE, "iser_msg_handle: failed "
			    "iser_iscsihdr_handle");
			iser_msg_free(msg);
			idm_conn_event(iser_conn->ic_idmc,
			    CE_TRANSPORT_FAIL, IDM_STATUS_FAIL);
		}
	} else if (opcode == ISER_OPCODE_HELLO_MSG) { /* at the target */
		/*
		 * We are currently not supporting Hello Exchange,
		 * since OFED iSER does not. May be revisited.
		 */
		ASSERT(opcode != ISER_OPCODE_HELLO_MSG);

		/*
		 * NOTE(review): processing continues after the failure
		 * events raised in this branch (wrong connection type,
		 * unexpected stage); revisit if the Hello exchange is
		 * ever enabled.
		 */
		if (iser_conn->ic_type != ISER_CONN_TYPE_TGT) {
			idm_conn_event(iser_conn->ic_idmc,
			    CE_TRANSPORT_FAIL, IDM_STATUS_FAIL);
		}

		iser_hello_hdr_t *hello_hdr = (iser_hello_hdr_t *)hdr;

		ISER_LOG(CE_NOTE, "received Hello message: opcode[%d], "
		    "maxver[%d], minver[%d], iser_ird[%d], msg (0x%p)",
		    hello_hdr->opcode, hello_hdr->maxver, hello_hdr->minver,
		    ntohs(hello_hdr->iser_ird), (void *)msg);

		mutex_enter(&iser_conn->ic_lock);

		if (iser_conn->ic_stage != ISER_CONN_STAGE_HELLO_WAIT) {
			/* target is not expected to receive a Hello */
			idm_conn_event(iser_conn->ic_idmc,
			    CE_TRANSPORT_FAIL, IDM_STATUS_FAIL);
		}

		iser_conn->ic_stage = ISER_CONN_STAGE_HELLOREPLY_SENT;
		mutex_exit(&iser_conn->ic_lock);

		/* Prepare and send a HelloReply message */
		status = iser_xfer_helloreply_msg(chan);
		if (status != ISER_STATUS_SUCCESS) {

			mutex_enter(&iser_conn->ic_lock);
			iser_conn->ic_stage =
			    ISER_CONN_STAGE_HELLOREPLY_SENT_FAIL;
			mutex_exit(&iser_conn->ic_lock);

			idm_conn_event(iser_conn->ic_idmc,
			    CE_TRANSPORT_FAIL, status);
		}

		/* Free this msg handle */
		iser_msg_free(msg);

	} else if (opcode == ISER_OPCODE_HELLOREPLY_MSG) { /* at initiator */

		/*
		 * We are currently not supporting Hello Exchange,
		 * since OFED iSER does not. May be revisited.
		 */
		ASSERT(opcode != ISER_OPCODE_HELLOREPLY_MSG);

		if (iser_conn->ic_type != ISER_CONN_TYPE_INI) {
			/*
			 * "status" has not been assigned on this path, so
			 * report an explicit failure code, as the Hello
			 * branch does for the same condition.
			 */
			idm_conn_event(iser_conn->ic_idmc,
			    CE_TRANSPORT_FAIL, IDM_STATUS_FAIL);
		}

		iser_helloreply_hdr_t *hello_hdr = (iser_helloreply_hdr_t *)hdr;

		ISER_LOG(CE_NOTE, "received Hello Reply message: opcode[%d], "
		    "maxver[%d], curver[%d], iser_ord[%d], msg (0x%p)",
		    hello_hdr->opcode, hello_hdr->maxver, hello_hdr->curver,
		    ntohs(hello_hdr->iser_ord), (void *)msg);

		/* Free this msg handle */
		iser_msg_free(msg);

		/*
		 * Signal the receipt of HelloReply to the waiting thread
		 * so that the initiator can proceed to the Full Feature
		 * Phase.
		 */
		mutex_enter(&iser_conn->ic_lock);
		iser_conn->ic_stage = ISER_CONN_STAGE_HELLOREPLY_RCV;
		cv_signal(&iser_conn->ic_stage_cv);
		mutex_exit(&iser_conn->ic_lock);
	} else {
		/* Protocol error: free the msg handle and fail the session */
		ISER_LOG(CE_NOTE, "iser_msg_handle: unsupported opcode (0x%x): "
		    "terminating session on IDM handle (0x%p)", opcode,
		    (void *) iser_conn->ic_idmc);

		iser_msg_free(msg);
		idm_conn_event(iser_conn->ic_idmc, CE_TRANSPORT_FAIL,
		    IDM_STATUS_FAIL);
	}
}

/*
 * Extract the iSCSI opcode of a PDU: the opcode byte of the header that
 * isp_hdr points to, masked with ISCSI_OPCODE_MASK.
 */
#define	IDM_PDU_OPCODE(PDU) \
	((PDU)->isp_hdr->opcode & ISCSI_OPCODE_MASK)

/*
 * n2h24()
 * Convert a 24-bit big-endian (network order) integer to host order.
 * "ptr" addresses three bytes, most significant first; they are folded
 * into the low 24 bits of the returned value.
 */
static uint32_t
n2h24(uchar_t *ptr)
{
	uint32_t	val = 0;
	int		idx;

	/* Shift in one byte at a time, most significant byte first */
	for (idx = 0; idx < 3; idx++) {
		val = (val << 8) | ptr[idx];
	}

	return (val);
}

/*
 * iser_rx_pdu_cb()
 * Completion callback installed on received PDUs. Releases the iSER msg
 * handle stashed in the PDU's transport-private field, then the PDU
 * handle itself. The status argument is not used.
 */
/* ARGSUSED */
static void
iser_rx_pdu_cb(idm_pdu_t *pdu, idm_status_t status)
{
	iser_msg_t	*rx_msg = (iser_msg_t *)pdu->isp_transport_private;

	/* The msg handle goes first; it is reached through the PDU */
	iser_msg_free(rx_msg);
	idm_pdu_free(pdu);
}

/*
 * iser_iscsihdr_handle()
 * Wrap a received iSCSI control-type PDU in an idm_pdu_t and pass it to
 * IDM for processing.
 *
 *	chan	- channel the message arrived on
 *	msg	- received iSER message; its buffer holds the iSER header
 *		  immediately followed by the iSCSI header (and any data)
 *
 * Returns DDI_SUCCESS once the PDU has been handed to idm_pdu_rx(), or
 * ISER_STATUS_FAIL if the message buffer is missing or no PDU handle
 * could be allocated.
 *
 * On success the msg handle travels with the PDU (in its
 * transport-private field) and is released by iser_rx_pdu_cb(). On
 * failure the msg handle is left untouched and stays with the caller.
 *
 * The PDU's header and data pointers refer directly into the message
 * buffer; nothing is copied.
 */
int
iser_iscsihdr_handle(iser_chan_t *chan, iser_msg_t *msg)
{
	idm_pdu_t	*pdu;
	uint8_t		*iser_hdrp;
	iscsi_hdr_t	*bhs;

	/* Validate the message buffer before allocating anything */
	iser_hdrp = (uint8_t *)(uintptr_t)msg->msg_ds.ds_va;
	if (iser_hdrp == NULL) {
		ISER_LOG(CE_NOTE, "iser_iscsihdr_handle: iser_hdrp is NULL");
		return (ISER_STATUS_FAIL);
	}

	/* This is a no-sleep allocation, so it may fail */
	pdu = idm_pdu_alloc_nosleep(sizeof (iscsi_hdr_t), 0);
	if (pdu == NULL) {
		ISER_LOG(CE_NOTE, "iser_iscsihdr_handle: "
		    "failed to allocate PDU handle");
		return (ISER_STATUS_FAIL);
	}
	pdu->isp_ic = chan->ic_conn->ic_idmc;
	ASSERT(pdu->isp_ic != NULL);

	/* Set the iser_msg handle into the transport-private field */
	pdu->isp_transport_private = (void *)msg;

	/* Set up a pointer in the pdu handle to the iSER header */
	pdu->isp_transport_hdr = (void *)iser_hdrp;
	pdu->isp_transport_hdrlen = ISER_HEADER_LENGTH;

	/*
	 * Set up a pointer to the iSCSI header, which is directly
	 * after the iSER header in the message. As iser_hdrp is known
	 * to be non-NULL, no further NULL check is needed here.
	 */
	bhs = (iscsi_hdr_t *)(uintptr_t)(iser_hdrp + ISER_HEADER_LENGTH);
	pdu->isp_hdr = bhs;

	/*
	 * Fill in the lengths from the BHS: hlength is counted in
	 * 32-bit words, dlength is a 24-bit network-order byte count.
	 * NOTE(review): these lengths come from the wire and are not
	 * checked against the size of the received message here.
	 */
	pdu->isp_hdrlen	= sizeof (iscsi_hdr_t) +
	    (bhs->hlength * sizeof (uint32_t));
	pdu->isp_datalen = n2h24(bhs->dlength);
	pdu->isp_callback = iser_rx_pdu_cb;

	/*
	 * If datalen > 0, then non-scsi data may be present. Point the
	 * PDU's data pointer just past the iSCSI header in the message.
	 */
	if (pdu->isp_datalen) {
		pdu->isp_data = ((uint8_t *)(uintptr_t)pdu->isp_hdr) +
		    pdu->isp_hdrlen;
	}

	/* Process RX PDU */
	idm_pdu_rx(pdu->isp_ic, pdu);

	return (DDI_SUCCESS);
}