usr/src/uts/common/io/ib/clients/ibd/ibd_cm.c
6831623 Add Connected Mode to IPoIB

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*
 * An implementation of the IPoIB-CM standard based on PSARC 2009/593.
 */
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>		/* for HCK_FULLCKSUM */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IP */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip_if.h>		/* for ETHERTYPE_IPV6 */
#include <inet/ip6.h>		/* for ip6_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/ib/ibtl/ibvti.h>	/* for ace->ac_dest->ud_dst_qpn */

#include <sys/ib/clients/ibd/ibd.h>


/* Per-interface tunables (for developers) */
extern uint_t ibd_rc_tx_copy_thresh;
/*
 * ibd_rc_rx_copy_thresh
 *     If (the size of incoming buffer <= ibd_rc_rx_copy_thresh), ibd will
 * attempt to allocate a buffer and do a bcopy of the incoming data into
 * the allocated buffer.
 *
 * ibd_rc_rx_rwqe_thresh
 *     If (the number of available rwqe < ibd_rc_rx_rwqe_thresh), ibd will
 * attempt to allocate a buffer and do a bcopy of the incoming data into
 * the allocated buffer.
 */
uint_t ibd_rc_rx_copy_thresh = 0x1000;
uint_t ibd_rc_rx_rwqe_thresh = 0x200;	/* previously 32 */

/*
 * ibd_rc_num_swqe
 *	1) Send CQ size = ibd_rc_num_swqe
 *	2) The send queue size = ibd_rc_num_swqe - 1
 *	3) Number of pre-allocated Tx buffers for ibt_post_send() =
 * ibd_rc_num_swqe - 1.
 */
uint_t ibd_rc_num_swqe = 0x1ff;

/*
 * ibd_rc_num_rwqe
 *	1) For non-SRQ, we pre-post ibd_rc_num_rwqe number of WRs
 * via ibt_post_receive() for receive queue of each RC channel.
 *	2) For SRQ and non-SRQ, receive CQ size = ibd_rc_num_rwqe
 */
uint_t ibd_rc_num_rwqe = 0x7ff;

/*
 * For SRQ
 *	If using SRQ, we allocate ibd_rc_num_srq buffers (each equal in size
 * to the RC mtu) and post them via ibt_post_srq().
 *
 *	ibd_rc_num_srq should not be larger than ibd_rc_num_rwqe; otherwise
 * it will cause failures with warnings such as the following:
 * NOTICE: hermon0: Device Error: EQE cq overrun or protection error
 * NOTICE: hermon0: Device Error: EQE local work queue catastrophic error
 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff catastrophic
 * channel error
 * NOTICE: ibd0: HCA GUID 0003ba0001008984 port 1 PKEY ffff completion queue
 * error
 */
uint_t ibd_rc_num_srq = 0x7fe;

boolean_t ibd_rc_enable_cq_moderation = B_TRUE;

/*
 * Send CQ moderation parameters
 */
uint_t ibd_rc_txcomp_count = 10;
uint_t ibd_rc_txcomp_usec = 300;

/*
 * Receive CQ moderation parameters
 */
uint_t ibd_rc_rxcomp_count = 4;
uint_t ibd_rc_rxcomp_usec = 10;

uint_t ibd_rc_tx_softintr = 1;
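
/*
 * Illustrative note (not part of the original tunables above): these are
 * compile-time defaults.  For experimentation they could, for example, be
 * overridden as module globals via /etc/system followed by a reboot, e.g.
 *
 *	set ibd:ibd_rc_num_swqe = 0x3ff
 *	set ibd:ibd_rc_rx_copy_thresh = 0x800
 */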

/*
 * If the number of WRs in the receive queue of an RC connection drops below
 * IBD_RC_RX_WR_THRESHOLD, we post more receive WRs into it.
 */
#define	IBD_RC_RX_WR_THRESHOLD		0x20

/*
 * If the number of free SWQEs (or large Tx buffers) is greater than or equal
 * to IBD_RC_TX_FREE_THRESH, we call mac_tx_update() to notify GLD that it may
 * resume transmitting packets.
 */
#define	IBD_RC_TX_FREE_THRESH		8

#define	IBD_RC_QPN_TO_SID(qpn) \
	((uint64_t)(IBD_RC_SERVICE_ID | ((qpn) & 0xffffff)))

/* For interop with legacy OFED */
#define	IBD_RC_QPN_TO_SID_OFED_INTEROP(qpn) \
	((uint64_t)(IBD_RC_SERVICE_ID_OFED_INTEROP | ((qpn) & 0xffffff)))
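
/*
 * For example (illustrative), a QPN argument of 0x00abcd yields the service
 * ID (IBD_RC_SERVICE_ID | 0x00abcd); only the low 24 bits of the QPN are
 * encoded in the service ID.
 */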

/* Internet Header + 64 bits of Data Datagram. Refer to RFC 792 */
#define	IBD_RC_IP_ICMP_RETURN_DATA_BYTES	64


/* Functions for Reliable Connected Mode */
/* Connection Setup/Close Functions */
static ibt_cm_status_t ibd_rc_dispatch_pass_mad(void *,
    ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
static ibt_cm_status_t ibd_rc_dispatch_actv_mad(void *,
    ibt_cm_event_t *, ibt_cm_return_args_t *, void *, ibt_priv_data_len_t);
static int ibd_rc_pas_close(ibd_rc_chan_t *);
static void ibd_rc_act_close(ibd_rc_chan_t *);

static inline void ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *,
    ibd_rc_chan_t *);
static inline ibd_rc_chan_t *ibd_rc_rm_header_chan_list(
    ibd_rc_chan_list_t *);
static inline void ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *,
    ibd_rc_chan_t *);

/* CQ handlers */
static void ibd_rc_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_rc_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_rc_poll_rcq(ibd_rc_chan_t *, ibt_cq_hdl_t);

/* Receive Functions */
static int ibd_rc_post_srq(ibd_state_t *, ibd_rwqe_t *);
static void ibd_rc_srq_freemsg_cb(char *);

static int ibd_rc_post_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
static void ibd_rc_freemsg_cb(char *);
static void ibd_rc_process_rx(ibd_rc_chan_t *, ibd_rwqe_t *, ibt_wc_t *);
static void ibd_rc_free_rwqe(ibd_rc_chan_t *, ibd_rwqe_t *);
static void ibd_rc_fini_rxlist(ibd_rc_chan_t *);


/* Send Functions */
static void ibd_rc_release_swqe(ibd_rc_chan_t *, ibd_swqe_t *);
static int ibd_rc_init_txlist(ibd_rc_chan_t *);
static void ibd_rc_fini_txlist(ibd_rc_chan_t *);
static uint_t ibd_rc_tx_recycle(caddr_t);


void
ibd_async_rc_close_act_chan(ibd_state_t *state, ibd_req_t *req)
{
	ibd_rc_chan_t *rc_chan = req->rq_ptr;
	ibd_ace_t *ace;

	while (rc_chan != NULL) {
		ace = rc_chan->ace;
		ASSERT(ace != NULL);
		/* Close old RC channel */
		ibd_rc_act_close(rc_chan);
		mutex_enter(&state->id_ac_mutex);
		ASSERT(ace->ac_ref != 0);
		atomic_dec_32(&ace->ac_ref);
		ace->ac_chan = NULL;
		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
			IBD_ACACHE_INSERT_FREE(state, ace);
			ace->ac_ref = 0;
		} else {
			ace->ac_ref |= CYCLEVAL;
			state->rc_delay_ace_recycle++;
		}
		mutex_exit(&state->id_ac_mutex);
		rc_chan = ibd_rc_rm_header_chan_list(
		    &state->rc_obs_act_chan_list);
	}
}

void
ibd_async_rc_recycle_ace(ibd_state_t *state, ibd_req_t *req)
{
	ibd_ace_t *ace = req->rq_ptr;
	ibd_rc_chan_t *rc_chan;

	ASSERT(ace != NULL);
	rc_chan = ace->ac_chan;
	ASSERT(rc_chan != NULL);
	/* Close old RC channel */
	ibd_rc_act_close(rc_chan);
	mutex_enter(&state->id_ac_mutex);
	ASSERT(ace->ac_ref != 0);
	atomic_dec_32(&ace->ac_ref);
	ace->ac_chan = NULL;
	if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
		IBD_ACACHE_INSERT_FREE(state, ace);
		ace->ac_ref = 0;
	} else {
		ace->ac_ref |= CYCLEVAL;
		state->rc_delay_ace_recycle++;
	}
	mutex_exit(&state->id_ac_mutex);
	mutex_enter(&state->rc_ace_recycle_lock);
	state->rc_ace_recycle = NULL;
	mutex_exit(&state->rc_ace_recycle_lock);
}

/* Simple ICMP IP Header Template */
static const ipha_t icmp_ipha = {
	IP_SIMPLE_HDR_VERSION, 0, 0, 0, 0, 0, IPPROTO_ICMP
};

/* Packet is too big. Send ICMP packet to GLD to request a smaller MTU */
void
ibd_async_rc_process_too_big(ibd_state_t *state, ibd_req_t *req)
{
	mblk_t *mp = req->rq_ptr;
	ibd_ace_t *ace = req->rq_ptr2;
	uint16_t mtu = state->id_mtu - IPOIB_HDRSIZE;
	uint_t	len_needed;
	size_t	msg_len;
	mblk_t	*pmtu_mp;
	ushort_t	sap;
	ib_header_info_t *ibha;	/* ib header for pmtu_pkt */
	/*
	 * ipha: IP header for pmtu_pkt
	 * old_ipha: IP header for old packet
	 */
	ipha_t *ipha, *old_ipha;
	icmph_t	*icmph;

	sap = ntohs(((ipoib_hdr_t *)mp->b_rptr)->ipoib_type);

	if (!pullupmsg(mp, -1)) {
		DPRINT(40, "ibd_async_rc_process_too_big: pullupmsg fail");
		goto too_big_fail;
	}
	/* move to IP header. */
	mp->b_rptr += IPOIB_HDRSIZE;
	old_ipha = (ipha_t *)mp->b_rptr;

	len_needed = IPH_HDR_LENGTH(old_ipha);
	if (old_ipha->ipha_protocol == IPPROTO_ENCAP) {
		len_needed += IPH_HDR_LENGTH(((uchar_t *)old_ipha +
		    len_needed));
	} else if (old_ipha->ipha_protocol == IPPROTO_IPV6) {
		ip6_t *ip6h = (ip6_t *)((uchar_t *)old_ipha
		    + len_needed);
		len_needed += ip_hdr_length_v6(mp, ip6h);
	}
	len_needed += IBD_RC_IP_ICMP_RETURN_DATA_BYTES;
	msg_len = msgdsize(mp);
	if (msg_len > len_needed) {
		(void) adjmsg(mp, len_needed - msg_len);
		msg_len = len_needed;
	}

	if ((pmtu_mp = allocb(sizeof (ib_header_info_t) + sizeof (ipha_t)
	    + sizeof (icmph_t), BPRI_MED)) == NULL) {
		DPRINT(40, "ibd_async_rc_process_too_big: allocb fail");
		goto too_big_fail;
	}
	pmtu_mp->b_cont = mp;
	pmtu_mp->b_wptr = pmtu_mp->b_rptr + sizeof (ib_header_info_t)
	    + sizeof (ipha_t) + sizeof (icmph_t);

	ibha = (ib_header_info_t *)pmtu_mp->b_rptr;

	/* Fill IB header */
	bcopy(&state->id_macaddr, &ibha->ib_dst, IPOIB_ADDRL);
	/*
	 * If the GRH is not valid, indicate to GLDv3 by setting
	 * the VerTcFlow field to 0.
	 */
	ibha->ib_grh.ipoib_vertcflow = 0;
	ibha->ipib_rhdr.ipoib_type = htons(sap);
	ibha->ipib_rhdr.ipoib_mbz = 0;

	/* Fill IP header */
	ipha = (ipha_t *)&ibha[1];
	*ipha = icmp_ipha;
	ipha->ipha_src = old_ipha->ipha_dst;
	ipha->ipha_dst = old_ipha->ipha_src;
	ipha->ipha_ttl = old_ipha->ipha_ttl;
	msg_len += sizeof (icmp_ipha) + sizeof (icmph_t);
	if (msg_len > IP_MAXPACKET) {
		ibd_print_warn(state, "ibd_rc_process_too_big_pkt: msg_len(%d) "
		    "> IP_MAXPACKET", (uint32_t)msg_len);
		(void) adjmsg(mp, IP_MAXPACKET - msg_len);
		msg_len = IP_MAXPACKET;
	}
	ipha->ipha_length = htons((uint16_t)msg_len);
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);

	/* Fill ICMP body */
	icmph = (icmph_t *)&ipha[1];
	bzero(icmph, sizeof (icmph_t));
	icmph->icmph_type = ICMP_DEST_UNREACHABLE;
	icmph->icmph_code = ICMP_FRAGMENTATION_NEEDED;
	icmph->icmph_du_mtu = htons(mtu);
	icmph->icmph_checksum = 0;
	icmph->icmph_checksum = IP_CSUM(pmtu_mp,
	    (int32_t)sizeof (ib_header_info_t) + (int32_t)sizeof (ipha_t), 0);

	(void) hcksum_assoc(pmtu_mp, NULL, NULL, 0, 0, 0, 0,
	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);

	DPRINT(30, "ibd_async_rc_process_too_big: sap=0x%x, ip_src=0x%x, "
	    "ip_dst=0x%x, ttl=%d, len_needed=%d, msg_len=%d",
	    sap, ipha->ipha_src, ipha->ipha_dst, ipha->ipha_ttl,
	    len_needed, (uint32_t)msg_len);

	mac_rx(state->id_mh, state->id_rh, pmtu_mp);

	mutex_enter(&ace->tx_too_big_mutex);
	ace->tx_too_big_ongoing = B_FALSE;
	mutex_exit(&ace->tx_too_big_mutex);
	return;

too_big_fail:
	/* Drop packet */
	freemsg(mp);
	mutex_enter(&ace->tx_too_big_mutex);
	ace->tx_too_big_ongoing = B_FALSE;
	mutex_exit(&ace->tx_too_big_mutex);
}
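
/*
 * For reference, the layout of the pmtu_mp message constructed above
 * (derived from the code, shown here as an aid):
 *
 *	pmtu_mp: [ ib_header_info_t | ipha_t | icmph_t ]
 *	b_cont:  [ original IP header(s) + up to
 *		   IBD_RC_IP_ICMP_RETURN_DATA_BYTES of payload ]
 *
 * i.e. an ICMP "fragmentation needed" message carrying the leading part of
 * the oversized packet, looped back up to the stack through mac_rx().
 */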

void
ibd_rc_get_conf(ibd_state_t *state)
{
	int *props;
	uint_t num_props;
	int instance;

	instance = ddi_get_instance(state->id_dip);

	/*
	 * Get the array of "enable_rc" properties from "ibd.conf" file
	 */
	if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, state->id_dip,
	    DDI_PROP_DONTPASS, "enable_rc", &props, &num_props)
	    == DDI_PROP_SUCCESS) {
		if (instance < num_props) {
			if (props[instance] == 1) {
				state->id_enable_rc = B_TRUE;
			} else {
				state->id_enable_rc = B_FALSE;
			}
		} else {
			/* not enough properties configured */
			state->id_enable_rc = B_FALSE;
			DPRINT(40, "ibd_rc_get_conf: Not enough "
			    "enable_rc values in ibd.conf,"
			    " disable RC mode, instance=%d", instance);
		}

		/* free memory allocated for properties */
		ddi_prop_free(props);
	} else {
		state->id_enable_rc = B_FALSE;
		DPRINT(30, "ibd_rc_get_conf: fail to find "
		    "enable_rc in ibd.conf, disable RC mode");
	}

	state->rc_mtu = 65524;
	state->rc_enable_srq = B_TRUE;
}
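
/*
 * Example (illustrative only) of the "enable_rc" property in ibd.conf:
 * enable RC mode on instances 0 and 1 but not on instance 2:
 *
 *	enable_rc=1,1,0;
 *
 * Instances beyond the length of the array default to RC mode disabled,
 * as handled above.
 */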

#ifdef DEBUG
/*
 * ibd_rc_update_stats - update driver private kstat counters
 *
 * This routine copies ibd's internal Reliable Connected mode statistics
 * counters into the kstat data area so that they can be read on demand.
 */
static int
ibd_rc_update_stats(kstat_t *ksp, int rw)
{
	ibd_state_t *state;
	ibd_rc_stat_t *ibd_rc_ksp;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	state = (ibd_state_t *)ksp->ks_private;
	ASSERT(state != NULL);
	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;

	ibd_rc_ksp->rc_rcv_trans_byte.value.ul = state->rc_rcv_trans_byte;
	ibd_rc_ksp->rc_rcv_trans_pkt.value.ul = state->rc_rcv_trans_pkt;
	ibd_rc_ksp->rc_rcv_copy_byte.value.ul = state->rc_rcv_copy_byte;
	ibd_rc_ksp->rc_rcv_copy_pkt.value.ul = state->rc_rcv_copy_pkt;
	ibd_rc_ksp->rc_rcv_alloc_fail.value.ul = state->rc_rcv_alloc_fail;

	ibd_rc_ksp->rc_rcq_invoke.value.ul = state->rc_rcq_invoke;
	ibd_rc_ksp->rc_rcq_err.value.ul = state->rc_rcq_err;
	ibd_rc_ksp->rc_scq_invoke.value.ul = state->rc_scq_invoke;

	ibd_rc_ksp->rc_rwqe_short.value.ul = state->rc_rwqe_short;

	ibd_rc_ksp->rc_xmt_bytes.value.ul = state->rc_xmt_bytes;
	ibd_rc_ksp->rc_xmt_small_pkt.value.ul = state->rc_xmt_small_pkt;
	ibd_rc_ksp->rc_xmt_fragmented_pkt.value.ul =
	    state->rc_xmt_fragmented_pkt;
	ibd_rc_ksp->rc_xmt_map_fail_pkt.value.ul = state->rc_xmt_map_fail_pkt;
	ibd_rc_ksp->rc_xmt_map_succ_pkt.value.ul = state->rc_xmt_map_succ_pkt;
	ibd_rc_ksp->rc_ace_not_found.value.ul = state->rc_ace_not_found;

	ibd_rc_ksp->rc_scq_no_swqe.value.ul = state->rc_scq_no_swqe;
	ibd_rc_ksp->rc_scq_no_largebuf.value.ul = state->rc_scq_no_largebuf;
	ibd_rc_ksp->rc_swqe_short.value.ul = state->rc_swqe_short;
	ibd_rc_ksp->rc_swqe_mac_update.value.ul = state->rc_swqe_mac_update;
	ibd_rc_ksp->rc_xmt_buf_short.value.ul = state->rc_xmt_buf_short;
	ibd_rc_ksp->rc_xmt_buf_mac_update.value.ul =
	    state->rc_xmt_buf_mac_update;

	ibd_rc_ksp->rc_conn_succ.value.ul = state->rc_conn_succ;
	ibd_rc_ksp->rc_conn_fail.value.ul = state->rc_conn_fail;
	ibd_rc_ksp->rc_null_conn.value.ul = state->rc_null_conn;
	ibd_rc_ksp->rc_no_estab_conn.value.ul = state->rc_no_estab_conn;

	ibd_rc_ksp->rc_act_close.value.ul = state->rc_act_close;
	ibd_rc_ksp->rc_pas_close.value.ul = state->rc_pas_close;
	ibd_rc_ksp->rc_delay_ace_recycle.value.ul = state->rc_delay_ace_recycle;
	ibd_rc_ksp->rc_act_close_simultaneous.value.ul =
	    state->rc_act_close_simultaneous;
	ibd_rc_ksp->rc_reset_cnt.value.ul = state->rc_reset_cnt;

	return (0);
}


/*
 * ibd_rc_init_stats - initialize kstat data structures
 *
 * This routine will create and initialize the driver private
 * statistics counters.
 */
int
ibd_rc_init_stats(ibd_state_t *state)
{
	kstat_t *ksp;
	ibd_rc_stat_t *ibd_rc_ksp;

	/*
	 * Create and init kstat
	 */
	ksp = kstat_create("ibd", ddi_get_instance(state->id_dip),
	    "statistics", "net", KSTAT_TYPE_NAMED,
	    sizeof (ibd_rc_stat_t) / sizeof (kstat_named_t), 0);

	if (ksp == NULL) {
		ibd_print_warn(state, "ibd_rc_init_stats: Could not create "
		    "kernel statistics");
		return (DDI_FAILURE);
	}

	state->rc_ksp = ksp;	/* Fill in the ksp of ibd over RC mode */

	ibd_rc_ksp = (ibd_rc_stat_t *)ksp->ks_data;

	/*
	 * Initialize all the statistics
	 */
	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_byte, "RC: Rx Bytes, "
	    "transfer mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_trans_pkt, "RC: Rx Pkts, "
	    "transfer mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_byte, "RC: Rx Bytes, "
	    "copy mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_copy_pkt, "RC: Rx Pkts, "
	    "copy mode", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcv_alloc_fail, "RC: Rx alloc fail",
	    KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_rcq_invoke, "RC: invoke of Recv CQ "
	    "handler", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_rcq_err, "RC: fail in Recv CQ handler",
	    KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_scq_invoke, "RC: invoke of Send CQ "
	    "handler", KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_rwqe_short, "RC: Short rwqe",
	    KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_xmt_bytes, "RC: Sent Bytes",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_small_pkt,
	    "RC: Tx pkt small size", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_fragmented_pkt,
	    "RC: Tx pkt fragmentary", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_fail_pkt,
	    "RC: Tx pkt fail ibt_map_mem_iov()", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_map_succ_pkt,
	    "RC: Tx pkt succ ibt_map_mem_iov()", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_ace_not_found, "RC: ace not found",
	    KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_scq_no_swqe, "RC: No swqe after "
	    "recycle", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_scq_no_largebuf, "RC: No large tx buf "
	    "after recycle", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_swqe_short, "RC: No swqe in ibd_send",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_swqe_mac_update, "RC: mac_tx_update "
	    "#, swqe available", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_short, "RC: No buf in "
	    "ibd_send", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_xmt_buf_mac_update, "RC: "
	    "mac_tx_update #, buf available", KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_conn_succ, "RC: succ connected",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_conn_fail, "RC: fail connect",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_null_conn, "RC: null conn for unicast "
	    "pkt", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_no_estab_conn, "RC: not in act estab "
	    "state", KSTAT_DATA_ULONG);

	kstat_named_init(&ibd_rc_ksp->rc_act_close, "RC: call ibd_rc_act_close",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_pas_close, "RC: call ibd_rc_pas_close",
	    KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_delay_ace_recycle, "RC: delay ace "
	    "recycle", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_act_close_simultaneous, "RC: "
	    "simultaneous ibd_rc_act_close", KSTAT_DATA_ULONG);
	kstat_named_init(&ibd_rc_ksp->rc_reset_cnt, "RC: Reset RC channel",
	    KSTAT_DATA_ULONG);

	/*
	 * Function to provide kernel stat update on demand
	 */
	ksp->ks_update = ibd_rc_update_stats;

	/*
	 * Pointer into provider's raw statistics
	 */
	ksp->ks_private = (void *)state;

	/*
	 * Add kstat to systems kstat chain
	 */
	kstat_install(ksp);

	return (DDI_SUCCESS);
}
#endif
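
/*
 * When built with DEBUG, the counters above can be read from userland with
 * kstat(1M); for example (illustrative):
 *
 *	# kstat -m ibd -i 0 -n statistics
 */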

static ibt_status_t
ibd_rc_alloc_chan(ibd_rc_chan_t **ret_chan, ibd_state_t *state,
    boolean_t is_tx_chan)
{
	ibt_status_t result;
	ibd_rc_chan_t *chan;
	ibt_rc_chan_alloc_args_t alloc_args;
	ibt_chan_alloc_flags_t alloc_flags;
	ibt_chan_sizes_t sizes;
	ibt_cq_attr_t cq_atts;
	int rv;

	chan = kmem_zalloc(sizeof (ibd_rc_chan_t), KM_SLEEP);

	chan->state = state;
	mutex_init(&chan->rx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_wqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_post_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&chan->tx_poll_lock, NULL, MUTEX_DRIVER, NULL);

	/* Allocate IB structures for a new RC channel. */
	if (is_tx_chan) {
		chan->scq_size = ibd_rc_num_swqe;
		chan->rcq_size = IBD_RC_MIN_CQ_SIZE;
	} else {
		chan->scq_size = IBD_RC_MIN_CQ_SIZE;
		chan->rcq_size = ibd_rc_num_rwqe;
	}
	cq_atts.cq_size = chan->scq_size;
	cq_atts.cq_sched = NULL;
	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->scq_hdl,
	    &chan->scq_size);
	if (result != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_alloc_chan: error <%d>"
		    "create scq completion queue (size <%d>)",
		    result, chan->scq_size);
		goto alloc_scq_err;
	}	/* if failure to alloc cq */

	if (ibd_rc_enable_cq_moderation) {
		if (ibt_modify_cq(chan->scq_hdl, ibd_rc_txcomp_count,
		    ibd_rc_txcomp_usec, 0) != IBT_SUCCESS) {
			ibd_print_warn(state, "ibd_rc_alloc_chan: Send CQ "
			    "interrupt moderation failed");
		}
	}

	ibt_set_cq_private(chan->scq_hdl, (void *) (uintptr_t)chan);
	ibt_set_cq_handler(chan->scq_hdl, ibd_rc_scq_handler,
	    (void *) (uintptr_t)chan);

	cq_atts.cq_size = chan->rcq_size;
	cq_atts.cq_sched = NULL;
	cq_atts.cq_flags = IBT_CQ_NO_FLAGS;
	result = ibt_alloc_cq(state->id_hca_hdl, &cq_atts, &chan->rcq_hdl,
	    &chan->rcq_size);
	if (result != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: error <%d> creating "
		    "rx completion queue (size <%d>)", result, chan->rcq_size);
		goto alloc_rcq_err;
	}	/* if failure to alloc cq */

	if (ibd_rc_enable_cq_moderation) {
		if (ibt_modify_cq(chan->rcq_hdl, ibd_rc_rxcomp_count,
		    ibd_rc_rxcomp_usec, 0) != IBT_SUCCESS) {
			ibd_print_warn(state, "ibd_rc_alloc_chan: Receive CQ "
			    "interrupt moderation failed");
		}
	}
	ibt_set_cq_private(chan->rcq_hdl, (void *) (uintptr_t)chan);
	ibt_set_cq_handler(chan->rcq_hdl, ibd_rc_rcq_handler,
	    (void *)(uintptr_t)chan);

	if (is_tx_chan) {
		chan->is_tx_chan = B_TRUE;
		if (ibd_rc_init_txlist(chan) != DDI_SUCCESS) {
			ibd_print_warn(state, "ibd_rc_alloc_chan: "
			    "ibd_rc_init_txlist failed");
			goto init_txlist_err;
		}
		if (ibd_rc_tx_softintr == 1) {
			if ((rv = ddi_add_softintr(state->id_dip,
			    DDI_SOFTINT_LOW, &chan->scq_softintr, NULL, NULL,
			    ibd_rc_tx_recycle, (caddr_t)chan)) !=
			    DDI_SUCCESS) {
				DPRINT(10, "ibd_rc_alloc_chan: failed in "
				    "ddi_add_softintr(scq_softintr), ret=%d",
				    rv);
				goto alloc_softintr_err;
			}
		}
	} else {
		chan->is_tx_chan = B_FALSE;
	}

	/*
	 * enable completions
	 */
	result = ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION);
	if (result != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: ibt_enable_cq_notify"
		    "(scq) failed: status %d\n", result);
		goto alloc_scq_enable_err;
	}

	/* We will enable chan->rcq_hdl later. */

	/* alloc a RC channel */
	bzero(&alloc_args, sizeof (ibt_rc_chan_alloc_args_t));
	bzero(&sizes, sizeof (ibt_chan_sizes_t));

	alloc_args.rc_flags = IBT_WR_SIGNALED;
	alloc_args.rc_control = IBT_CEP_NO_FLAGS;

	alloc_args.rc_scq = chan->scq_hdl;
	alloc_args.rc_rcq = chan->rcq_hdl;
	alloc_args.rc_pd = state->id_pd_hdl;

	alloc_args.rc_hca_port_num = state->id_port;
	alloc_args.rc_clone_chan = NULL;

	/* scatter/gather */
	alloc_args.rc_sizes.cs_sq_sgl = state->rc_tx_max_sqseg;

	/*
	 * Use a single SGL element on the receive side, because the ibd
	 * driver allocates one whole buffer for each ibt_post_recv().
	 */
	alloc_args.rc_sizes.cs_rq_sgl = 1;

	/* The send queue size and the receive queue size */
	alloc_args.rc_sizes.cs_sq = chan->scq_size;
	alloc_args.rc_sizes.cs_rq = chan->rcq_size;

	if (state->id_hca_res_lkey_capab) {
		alloc_args.rc_flags = IBT_FAST_REG_RES_LKEY;
	} else {
		DPRINT(40, "ibd_rc_alloc_chan: not support reserved lkey");
	}

	if (state->rc_enable_srq) {
		alloc_flags = IBT_ACHAN_USES_SRQ;
		alloc_args.rc_srq = state->rc_srq_hdl;
	} else {
		alloc_flags = IBT_ACHAN_NO_FLAGS;
	}

	result = ibt_alloc_rc_channel(state->id_hca_hdl,
	    alloc_flags, &alloc_args, &chan->chan_hdl, &sizes);
	if (result != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_alloc_chan: ibd_rc_open_channel"
		    " fail:<%d>", result);
		goto alloc_scq_enable_err;
	}

	*ret_chan = chan;
	return (IBT_SUCCESS);

alloc_scq_enable_err:
	if (is_tx_chan) {
		if (ibd_rc_tx_softintr == 1) {
			ddi_remove_softintr(chan->scq_softintr);
		}
	}
alloc_softintr_err:
	if (is_tx_chan) {
		ibd_rc_fini_txlist(chan);
	}
init_txlist_err:
	(void) ibt_free_cq(chan->rcq_hdl);
alloc_rcq_err:
	(void) ibt_free_cq(chan->scq_hdl);
alloc_scq_err:
	mutex_destroy(&chan->tx_poll_lock);
	mutex_destroy(&chan->tx_post_lock);
	mutex_destroy(&chan->tx_rel_list.dl_mutex);
	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
	mutex_destroy(&chan->rx_free_list.dl_mutex);
	mutex_destroy(&chan->rx_wqe_list.dl_mutex);
	kmem_free(chan, sizeof (ibd_rc_chan_t));
	return (result);
}

static void
ibd_rc_free_chan(ibd_rc_chan_t *chan)
{
	ibt_status_t ret;

	/* DPRINT(30, "ibd_rc_free_chan: chan=%p", chan); */

	if (chan->chan_hdl != NULL) {
		ret = ibt_free_channel(chan->chan_hdl);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ib_rc_free_chan: ibt_free_channel failed, "
			    "chan=%p, returned: %d", chan, ret);
			return;
		}
		chan->chan_hdl = NULL;
	}

	if (chan->rcq_hdl != NULL) {
		ret = ibt_free_cq(chan->rcq_hdl);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ib_rc_free_chan: ibt_free_cq(rcq) failed, "
			    "chan=%p, returned: %d", chan, ret);
			return;
		}
		chan->rcq_hdl = NULL;
	}

	if (chan->scq_hdl != NULL) {
		ret = ibt_free_cq(chan->scq_hdl);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ib_rc_free_chan: ibt_free_cq(scq) failed, "
			    "chan=%p, returned: %d", chan, ret);
			return;
		}
		chan->scq_hdl = NULL;
	}

	/* Free buffers */
	if (chan->is_tx_chan) {
		ibd_rc_fini_txlist(chan);
		if (ibd_rc_tx_softintr == 1) {
			ddi_remove_softintr(chan->scq_softintr);
		}
	} else {
		if (!chan->state->rc_enable_srq) {
			ibd_rc_fini_rxlist(chan);
		}
	}

	mutex_destroy(&chan->tx_poll_lock);
	mutex_destroy(&chan->tx_post_lock);
	mutex_destroy(&chan->tx_rel_list.dl_mutex);
	mutex_destroy(&chan->tx_wqe_list.dl_mutex);
	mutex_destroy(&chan->rx_free_list.dl_mutex);
	mutex_destroy(&chan->rx_wqe_list.dl_mutex);

	/*
	 * If this is a passive channel, the caller must make sure it has
	 * already been removed from chan->state->rc_pass_chan_list.
	 */
	kmem_free(chan, sizeof (ibd_rc_chan_t));
}

/* Add a RC channel */
static inline void
ibd_rc_add_to_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
{
	mutex_enter(&list->chan_list_mutex);
	if (list->chan_list == NULL) {
		list->chan_list = chan;
	} else {
		chan->next = list->chan_list;
		list->chan_list = chan;
	}
	mutex_exit(&list->chan_list_mutex);
}

/* Remove a RC channel */
static inline void
ibd_rc_rm_from_chan_list(ibd_rc_chan_list_t *list, ibd_rc_chan_t *chan)
{
	ibd_rc_chan_t *pre_chan;

	mutex_enter(&list->chan_list_mutex);
	if (list->chan_list == chan) {
		DPRINT(30, "ibd_rc_rm_from_chan_list(first): found chan(%p)"
		    " in chan_list", chan);
		list->chan_list = chan->next;
	} else {
		pre_chan = list->chan_list;
		while (pre_chan != NULL) {
			if (pre_chan->next == chan) {
				DPRINT(30, "ibd_rc_rm_from_chan_list"
				    "(middle): found chan(%p) in "
				    "rc_pass_chan_list", chan);
				pre_chan->next = chan->next;
				break;
			}
			pre_chan = pre_chan->next;
		}
	}
	mutex_exit(&list->chan_list_mutex);
}

static inline ibd_rc_chan_t *
ibd_rc_rm_header_chan_list(ibd_rc_chan_list_t *list)
{
	ibd_rc_chan_t *rc_chan;

	mutex_enter(&list->chan_list_mutex);
	rc_chan = list->chan_list;
	if (rc_chan != NULL) {
		list->chan_list = rc_chan->next;
	}
	mutex_exit(&list->chan_list_mutex);
	return (rc_chan);
}

static int
ibd_rc_alloc_srq_copybufs(ibd_state_t *state)
{
	ibt_mr_attr_t mem_attr;
	uint_t rc_rx_bufs_sz;

	/*
	 * Allocate one big chunk for all regular rx copy bufs
	 */
	rc_rx_bufs_sz =  (state->rc_mtu + IPOIB_GRH_SIZE) * state->rc_srq_size;

	state->rc_srq_rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);

	state->rc_srq_rwqes = kmem_zalloc(state->rc_srq_size *
	    sizeof (ibd_rwqe_t), KM_SLEEP);

	/*
	 * Do one memory registration on the entire rxbuf area
	 */
	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_srq_rx_bufs;
	mem_attr.mr_len = rc_rx_bufs_sz;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
	    &state->rc_srq_rx_mr_hdl, &state->rc_srq_rx_mr_desc)
	    != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr() "
		    "failed");
		kmem_free(state->rc_srq_rwqes,
		    state->rc_srq_size * sizeof (ibd_rwqe_t));
		kmem_free(state->rc_srq_rx_bufs, rc_rx_bufs_sz);
		state->rc_srq_rx_bufs = NULL;
		state->rc_srq_rwqes = NULL;
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

static void
ibd_rc_free_srq_copybufs(ibd_state_t *state)
{
	uint_t rc_rx_buf_sz;

	/*
	 * state->rc_mtu must not change between the call to
	 * ibd_rc_alloc_srq_copybufs() and the call to
	 * ibd_rc_free_srq_copybufs().
	 */
	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;

	/*
	 * Unregister rxbuf mr
	 */
	if (ibt_deregister_mr(state->id_hca_hdl,
	    state->rc_srq_rx_mr_hdl) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_free_srq_copybufs: ibt_deregister_mr()"
		    " failed");
	}
	state->rc_srq_rx_mr_hdl = NULL;

	/*
	 * Free rxbuf memory
	 */
	kmem_free(state->rc_srq_rwqes,
	    state->rc_srq_size * sizeof (ibd_rwqe_t));
	kmem_free(state->rc_srq_rx_bufs, state->rc_srq_size * rc_rx_buf_sz);
	state->rc_srq_rwqes = NULL;
	state->rc_srq_rx_bufs = NULL;
}

/*
 * Allocate and post a certain number of SRQ receive buffers and WRs.
 */
int
ibd_rc_init_srq_list(ibd_state_t *state)
{
	ibd_rwqe_t *rwqe;
	ibt_lkey_t lkey;
	int i;
	uint_t len;
	uint8_t *bufaddr;
	ibt_srq_sizes_t srq_sizes;
	ibt_srq_sizes_t	 srq_real_sizes;
	ibt_status_t ret;

	srq_sizes.srq_sgl_sz = 1;
	srq_sizes.srq_wr_sz = ibd_rc_num_srq;
	ret = ibt_alloc_srq(state->id_hca_hdl, IBT_SRQ_NO_FLAGS,
	    state->id_pd_hdl, &srq_sizes, &state->rc_srq_hdl, &srq_real_sizes);
	if (ret != IBT_SUCCESS) {
		DPRINT(10, "ibd_rc_init_srq_list: ibt_alloc_srq failed."
		    "req_sgl_sz=%d, req_wr_sz=0x%x, ret=%d",
		    srq_sizes.srq_sgl_sz, srq_sizes.srq_wr_sz, ret);
		return (DDI_FAILURE);
	}

	state->rc_srq_size = srq_real_sizes.srq_wr_sz;
	if (ibd_rc_alloc_srq_copybufs(state) != DDI_SUCCESS) {
		ret = ibt_free_srq(state->rc_srq_hdl);
		if (ret != IBT_SUCCESS) {
			ibd_print_warn(state, "ibd_rc_init_srq_list: "
			    "ibt_free_srq fail, ret=%d", ret);
		}
		return (DDI_FAILURE);
	}

	/*
	 * Allocate and setup the rwqe list
	 */
	lkey = state->rc_srq_rx_mr_desc.md_lkey;
	rwqe = state->rc_srq_rwqes;
	bufaddr = state->rc_srq_rx_bufs;
	len = state->rc_mtu + IPOIB_GRH_SIZE;
	state->rc_srq_rwqe_list.dl_cnt = 0;
	state->rc_srq_rwqe_list.dl_bufs_outstanding = 0;
	for (i = 0; i < state->rc_srq_size; i++, rwqe++, bufaddr += len) {
		rwqe->w_state = state;
		rwqe->w_freeing_wqe = B_FALSE;
		rwqe->w_freemsg_cb.free_func = ibd_rc_srq_freemsg_cb;
		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;

		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
		    &rwqe->w_freemsg_cb)) == NULL) {
			DPRINT(40, "ibd_rc_init_srq_list : desballoc() failed");
			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
			ibd_rc_fini_srq_list(state);
			return (DDI_FAILURE);
		}

		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
		/* Leave IPOIB_GRH_SIZE space */
		rwqe->rwqe_copybuf.ic_sgl.ds_va =
		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
		rwqe->w_rwr.wr_nds = 1;
		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
		(void) ibd_rc_post_srq(state, rwqe);
	}

	return (DDI_SUCCESS);
}
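
/*
 * For reference, the SRQ receive resources set up above look like this:
 *
 *	rc_srq_rx_bufs:	one contiguous, singly registered area of
 *			rc_srq_size * (rc_mtu + IPOIB_GRH_SIZE) bytes
 *	rc_srq_rwqes:	array of rc_srq_size ibd_rwqe_t entries
 *
 * Each rwqe's single SGL element points IPOIB_GRH_SIZE bytes into its
 * buffer slot, leaving room at the front of each buffer for the header
 * information handed to GLDv3 on receive.
 */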

/*
 * Free the statically allocated Rx buffer list for SRQ.
 */
void
ibd_rc_fini_srq_list(ibd_state_t *state)
{
	ibd_rwqe_t *rwqe;
	int i;
	ibt_status_t ret;

	ret = ibt_free_srq(state->rc_srq_hdl);
	if (ret != IBT_SUCCESS) {
		ibd_print_warn(state, "ibd_rc_fini_srq_list: "
		    "ibt_free_srq fail, ret=%d", ret);
	}

	mutex_enter(&state->rc_srq_rwqe_list.dl_mutex);
	rwqe = state->rc_srq_rwqes;
	for (i = 0; i < state->rc_srq_size; i++, rwqe++) {
		if (rwqe->rwqe_im_mblk != NULL) {
			rwqe->w_freeing_wqe = B_TRUE;
			freemsg(rwqe->rwqe_im_mblk);
		}
	}
	mutex_exit(&state->rc_srq_rwqe_list.dl_mutex);

	ibd_rc_free_srq_copybufs(state);
}

/*
 * Free an allocated recv wqe.
 */
void
ibd_rc_srq_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
	/*
	 * desballoc() failed (no memory) or the posting of rwqe failed.
	 *
	 * This rwqe is placed on a free list so that it
	 * can be reinstated in future.
	 *
	 * NOTE: no code currently exists to reinstate
	 * these "lost" rwqes.
	 */
	mutex_enter(&state->rc_srq_free_list.dl_mutex);
	state->rc_srq_free_list.dl_cnt++;
	rwqe->rwqe_next = state->rc_srq_free_list.dl_head;
	state->rc_srq_free_list.dl_head = RWQE_TO_WQE(rwqe);
	mutex_exit(&state->rc_srq_free_list.dl_mutex);
}

static void
ibd_rc_srq_freemsg_cb(char *arg)
{
	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
	ibd_state_t *state = rwqe->w_state;

	ASSERT(state->rc_enable_srq);

	/*
	 * If the wqe is being destructed, do not attempt recycling.
	 */
	if (rwqe->w_freeing_wqe == B_TRUE) {
		return;
	}

	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);

	/*
	 * The upper layer has released the mblk it held, so the old
	 * pointer in our rwqe is no longer needed.
	 */
	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
	if (rwqe->rwqe_im_mblk == NULL) {
		DPRINT(40, "ibd_rc_srq_freemsg_cb: desballoc failed");
		ibd_rc_srq_free_rwqe(state, rwqe);
		return;
	}

	if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
		ibd_rc_srq_free_rwqe(state, rwqe);
		return;
	}

	atomic_add_32(&state->rc_srq_rwqe_list.dl_bufs_outstanding, -1);
}

/*
 * Post a rwqe to the hardware and add it to the Rx list.
 */
static int
ibd_rc_post_srq(ibd_state_t *state, ibd_rwqe_t *rwqe)
{
	/*
	 * Increment dl_cnt before posting the receive WR so that dl_cnt
	 * is guaranteed to be up to date before the corresponding
	 * ibd_rc_process_rx() is called.
	 */
	ASSERT(state->rc_srq_rwqe_list.dl_cnt < state->rc_srq_size);
	atomic_add_32(&state->rc_srq_rwqe_list.dl_cnt, 1);
	if (ibt_post_srq(state->rc_srq_hdl, &rwqe->w_rwr, 1, NULL) !=
	    IBT_SUCCESS) {
		atomic_dec_32(&state->rc_srq_rwqe_list.dl_cnt);
		DPRINT(40, "ibd_rc_post_srq : ibt_post_srq() failed");
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*
 * Post a rwqe to the hardware and add it to the Rx list.
 */
static int
ibd_rc_post_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
{
	/*
	 * Increment dl_cnt before posting the receive WR so that dl_cnt
	 * is guaranteed to be up to date before the corresponding
	 * ibd_rc_process_rx() is called.
	 */
	atomic_add_32(&chan->rx_wqe_list.dl_cnt, 1);
	if (ibt_post_recv(chan->chan_hdl, &rwqe->w_rwr, 1, NULL) !=
	    IBT_SUCCESS) {
		atomic_dec_32(&chan->rx_wqe_list.dl_cnt);
		DPRINT(40, "ibd_rc_post_rwqe : failed in ibt_post_recv()");
		return (DDI_FAILURE);
	}
	return (DDI_SUCCESS);
}

static int
ibd_rc_alloc_rx_copybufs(ibd_rc_chan_t *chan)
{
	ibd_state_t *state = chan->state;
	ibt_mr_attr_t mem_attr;
	uint_t rc_rx_bufs_sz;

	/*
	 * Allocate one big chunk for all regular rx copy bufs
	 */
	rc_rx_bufs_sz = (state->rc_mtu + IPOIB_GRH_SIZE) * chan->rcq_size;

	chan->rx_bufs = kmem_zalloc(rc_rx_bufs_sz, KM_SLEEP);

	chan->rx_rwqes = kmem_zalloc(chan->rcq_size *
	    sizeof (ibd_rwqe_t), KM_SLEEP);

	/*
	 * Do one memory registration on the entire rxbuf area
	 */
	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->rx_bufs;
	mem_attr.mr_len = rc_rx_bufs_sz;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
	    &chan->rx_mr_hdl, &chan->rx_mr_desc) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_alloc_srq_copybufs: ibt_register_mr failed");
		kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
		kmem_free(chan->rx_bufs, rc_rx_bufs_sz);
		chan->rx_bufs = NULL;
		chan->rx_rwqes = NULL;
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

static void
ibd_rc_free_rx_copybufs(ibd_rc_chan_t *chan)
{
	ibd_state_t *state = chan->state;
	uint_t rc_rx_buf_sz;

	ASSERT(!state->rc_enable_srq);
	ASSERT(chan->rx_rwqes != NULL);
	ASSERT(chan->rx_bufs != NULL);

	/*
	 * state->rc_mtu must not change between the call to
	 * ibd_rc_alloc_rx_copybufs() and the call to
	 * ibd_rc_free_rx_copybufs().
	 */
	rc_rx_buf_sz = state->rc_mtu + IPOIB_GRH_SIZE;

	/*
	 * Unregister rxbuf mr
	 */
	if (ibt_deregister_mr(state->id_hca_hdl,
	    chan->rx_mr_hdl) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_free_rx_copybufs: ibt_deregister_mr failed");
	}
	chan->rx_mr_hdl = NULL;

	/*
	 * Free rxbuf memory
	 */
	kmem_free(chan->rx_rwqes, chan->rcq_size * sizeof (ibd_rwqe_t));
	chan->rx_rwqes = NULL;

	kmem_free(chan->rx_bufs, chan->rcq_size * rc_rx_buf_sz);
	chan->rx_bufs = NULL;
}

/*
 * Post a certain number of receive buffers and WRs on a RC channel.
 */
static int
ibd_rc_init_rxlist(ibd_rc_chan_t *chan)
{
	ibd_state_t *state = chan->state;
	ibd_rwqe_t *rwqe;
	ibt_lkey_t lkey;
	int i;
	uint_t len;
	uint8_t *bufaddr;

	ASSERT(!state->rc_enable_srq);
	if (ibd_rc_alloc_rx_copybufs(chan) != DDI_SUCCESS)
		return (DDI_FAILURE);

	/*
	 * Allocate and setup the rwqe list
	 */
	lkey = chan->rx_mr_desc.md_lkey;
	rwqe = chan->rx_rwqes;
	bufaddr = chan->rx_bufs;
	len = state->rc_mtu + IPOIB_GRH_SIZE;
	for (i = 0; i < chan->rcq_size; i++, rwqe++, bufaddr += len) {
		rwqe->w_state = state;
		rwqe->w_chan = chan;
		rwqe->w_freeing_wqe = B_FALSE;
		rwqe->w_freemsg_cb.free_func = ibd_rc_freemsg_cb;
		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;

		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
		    &rwqe->w_freemsg_cb)) == NULL) {
			DPRINT(40, "ibd_rc_init_srq_list: desballoc() failed");
			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
			ibd_rc_fini_rxlist(chan);
			return (DDI_FAILURE);
		}

		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
		rwqe->rwqe_copybuf.ic_sgl.ds_va =
		    (ib_vaddr_t)(uintptr_t)(bufaddr + IPOIB_GRH_SIZE);
		rwqe->rwqe_copybuf.ic_sgl.ds_len = state->rc_mtu;
		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
		rwqe->w_rwr.wr_nds = 1;
		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
		(void) ibd_rc_post_rwqe(chan, rwqe);
	}

	return (DDI_SUCCESS);
}

/*
 * Free the statically allocated Rx buffer list of a non-SRQ RC channel.
 */
static void
ibd_rc_fini_rxlist(ibd_rc_chan_t *chan)
{
	ibd_rwqe_t *rwqe;
	int i;

	if (chan->rx_bufs == NULL) {
		DPRINT(40, "ibd_rc_fini_rxlist: empty chan->rx_bufs, quit");
		return;
	}

	/* bufs_outstanding must be 0 */
	ASSERT((chan->rx_wqe_list.dl_head == NULL) ||
	    (chan->rx_wqe_list.dl_bufs_outstanding == 0));

	mutex_enter(&chan->rx_wqe_list.dl_mutex);
	rwqe = chan->rx_rwqes;
	for (i = 0; i < chan->rcq_size; i++, rwqe++) {
		if (rwqe->rwqe_im_mblk != NULL) {
			rwqe->w_freeing_wqe = B_TRUE;
			freemsg(rwqe->rwqe_im_mblk);
		}
	}
	mutex_exit(&chan->rx_wqe_list.dl_mutex);

	ibd_rc_free_rx_copybufs(chan);
}

/*
 * Free an allocated recv wqe.
 */
static void
ibd_rc_free_rwqe(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe)
{
	/*
	 * desballoc() failed (no memory) or the posting of rwqe failed.
	 *
	 * This rwqe is placed on a free list so that it
	 * can be reinstated in future.
	 *
	 * NOTE: no code currently exists to reinstate
	 * these "lost" rwqes.
	 */
	mutex_enter(&chan->rx_free_list.dl_mutex);
	chan->rx_free_list.dl_cnt++;
	rwqe->rwqe_next = chan->rx_free_list.dl_head;
	chan->rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
	mutex_exit(&chan->rx_free_list.dl_mutex);
}

/*
 * Processing to be done after receipt of a packet; hand the packet off
 * to GLD in the format it expects.
 */
static void
ibd_rc_process_rx(ibd_rc_chan_t *chan, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
{
	ibd_state_t *state = chan->state;
	ib_header_info_t *phdr;
	ipoib_hdr_t *ipibp;
	mblk_t *mp;
	mblk_t *mpc;
	int rxcnt;
	ip6_t *ip6h;
	int len;

	/*
	 * Track number handed to upper layer, and number still
	 * available to receive packets.
	 */
	if (state->rc_enable_srq) {
		rxcnt = atomic_dec_32_nv(&state->rc_srq_rwqe_list.dl_cnt);
	} else {
		rxcnt = atomic_dec_32_nv(&chan->rx_wqe_list.dl_cnt);
	}

	/*
	 * It cannot be an IBA multicast packet.
	 */
	ASSERT(!(wc->wc_flags & IBT_WC_GRH_PRESENT));


#ifdef DEBUG
	if (rxcnt < ibd_rc_rx_rwqe_thresh) {
		state->rc_rwqe_short++;
	}
#endif

	/*
	 * If we have enough spare rwqes and the packet is large, hand the
	 * receive buffer directly to the network layer (transfer mode);
	 * otherwise copy the data into a new mblk and repost the rwqe.
	 */
	if ((rxcnt >= ibd_rc_rx_rwqe_thresh) &&
	    (wc->wc_bytes_xfer > ibd_rc_rx_copy_thresh)) {
		atomic_add_64(&state->rc_rcv_trans_byte, wc->wc_bytes_xfer);
		atomic_inc_64(&state->rc_rcv_trans_pkt);

		/*
		 * Record how many rwqes are held by the upper
		 * network layer.
		 */
		if (state->rc_enable_srq) {
			atomic_add_32(&state->rc_srq_rwqe_list.
			    dl_bufs_outstanding, 1);
		} else {
			atomic_add_32(&chan->rx_wqe_list.
			    dl_bufs_outstanding, 1);
		}
		mp = rwqe->rwqe_im_mblk;
	} else {
		atomic_add_64(&state->rc_rcv_copy_byte, wc->wc_bytes_xfer);
		atomic_inc_64(&state->rc_rcv_copy_pkt);

		if ((mp = allocb(wc->wc_bytes_xfer + IPOIB_GRH_SIZE,
		    BPRI_HI)) == NULL) {	/* no memory */
			DPRINT(40, "ibd_rc_process_rx: allocb() failed");
			state->rc_rcv_alloc_fail++;
			if (state->rc_enable_srq) {
				if (ibd_rc_post_srq(state, rwqe) ==
				    DDI_FAILURE) {
					ibd_rc_srq_free_rwqe(state, rwqe);
				}
			} else {
				if (ibd_rc_post_rwqe(chan, rwqe) ==
				    DDI_FAILURE) {
					ibd_rc_free_rwqe(chan, rwqe);
				}
			}
			return;
		}

		bcopy(rwqe->rwqe_im_mblk->b_rptr + IPOIB_GRH_SIZE,
		    mp->b_wptr + IPOIB_GRH_SIZE, wc->wc_bytes_xfer);

		if (state->rc_enable_srq) {
			if (ibd_rc_post_srq(state, rwqe) == DDI_FAILURE) {
				ibd_rc_srq_free_rwqe(state, rwqe);
			}
		} else {
			if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
				ibd_rc_free_rwqe(chan, rwqe);
			}
		}
	}

	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + IPOIB_GRH_SIZE);
	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
		len = ntohs(ip6h->ip6_plen);
		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
			/* LINTED: E_CONSTANT_CONDITION */
			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
		}
	}

	phdr = (ib_header_info_t *)mp->b_rptr;
	phdr->ib_grh.ipoib_vertcflow = 0;
	ovbcopy(&state->id_macaddr, &phdr->ib_dst,
	    sizeof (ipoib_mac_t));
	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer + IPOIB_GRH_SIZE;

	/*
	 * Can RC mode in IB guarantee its checksum correctness?
	 *
	 *	(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
	 *	    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
	 */

	/*
	 * Make sure this is NULL or we're in trouble.
	 */
	if (mp->b_next != NULL) {
		ibd_print_warn(state,
		    "ibd_rc_process_rx: got duplicate mp from rcq?");
		mp->b_next = NULL;
	}

	/*
	 * Add this mp to the list of processed mp's to send to
	 * the nw layer
	 */
	if (state->rc_enable_srq) {
		mutex_enter(&state->rc_rx_lock);
		if (state->rc_rx_mp) {
			ASSERT(state->rc_rx_mp_tail != NULL);
			state->rc_rx_mp_tail->b_next = mp;
		} else {
			ASSERT(state->rc_rx_mp_tail == NULL);
			state->rc_rx_mp = mp;
		}

		state->rc_rx_mp_tail = mp;
		state->rc_rx_mp_len++;

		if (state->rc_rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
			mpc = state->rc_rx_mp;

			state->rc_rx_mp = NULL;
			state->rc_rx_mp_tail = NULL;
			state->rc_rx_mp_len = 0;
			mutex_exit(&state->rc_rx_lock);
			mac_rx(state->id_mh, NULL, mpc);
		} else {
			mutex_exit(&state->rc_rx_lock);
		}
	} else {
		mutex_enter(&chan->rx_lock);
		if (chan->rx_mp) {
			ASSERT(chan->rx_mp_tail != NULL);
			chan->rx_mp_tail->b_next = mp;
		} else {
			ASSERT(chan->rx_mp_tail == NULL);
			chan->rx_mp = mp;
		}

		chan->rx_mp_tail = mp;
		chan->rx_mp_len++;

		if (chan->rx_mp_len  >= IBD_MAX_RX_MP_LEN) {
			mpc = chan->rx_mp;

			chan->rx_mp = NULL;
			chan->rx_mp_tail = NULL;
			chan->rx_mp_len = 0;
			mutex_exit(&chan->rx_lock);
			mac_rx(state->id_mh, NULL, mpc);
		} else {
			mutex_exit(&chan->rx_lock);
		}
	}
}
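
/*
 * Design note (derived from the code above): received mblks are chained
 * through b_next and handed to mac_rx() in batches of up to
 * IBD_MAX_RX_MP_LEN rather than one call per packet; any partial chain
 * left over is flushed by the receive CQ handler.  This amortizes the
 * per-call cost of mac_rx() over many packets.
 */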

/*
 * Callback code invoked from STREAMS when the recv data buffer is free
 * for recycling.
 */
static void
ibd_rc_freemsg_cb(char *arg)
{
	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
	ibd_rc_chan_t *chan = rwqe->w_chan;
	ibd_state_t *state = rwqe->w_state;

	/*
	 * If the wqe is being destructed, do not attempt recycling.
	 */
	if (rwqe->w_freeing_wqe == B_TRUE) {
		return;
	}

	ASSERT(!state->rc_enable_srq);
	ASSERT(chan->rx_wqe_list.dl_cnt < chan->rcq_size);

	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->rc_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
	if (rwqe->rwqe_im_mblk == NULL) {
		DPRINT(40, "ibd_rc_freemsg_cb: desballoc() failed");
		ibd_rc_free_rwqe(chan, rwqe);
		return;
	}

	/*
	 * Post back to h/w. We could actually have more than
	 * rcq_size WQEs on the list if there were multiple
	 * ibd_rc_freemsg_cb() calls outstanding (since the lock is
	 * not held the entire time). This will start getting
	 * corrected over subsequent ibd_rc_freemsg_cb() calls.
	 */
	if (ibd_rc_post_rwqe(chan, rwqe) == DDI_FAILURE) {
		ibd_rc_free_rwqe(chan, rwqe);
		return;
	}
	atomic_add_32(&chan->rx_wqe_list.dl_bufs_outstanding, -1);
}

/*
 * Common code for interrupt handling as well as for polling
 * for all completed wqe's while detaching.
 */
static void
ibd_rc_poll_rcq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
{
	ibd_wqe_t *wqe;
	ibt_wc_t *wc, *wcs;
	uint_t numwcs, real_numwcs;
	int i;

	wcs = chan->rx_wc;
	numwcs = IBD_RC_MAX_CQ_WC;

	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				chan->state->rc_rcq_err++;
				DPRINT(40, "ibd_rc_poll_rcq: wc_status(%d) != "
				    "SUCC, chan=%p", wc->wc_status, chan);
				/*
				 * A flushed completion indicates the channel
				 * is being torn down.
				 */
				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
					/*
					 * Do not invoke Rx handler because
					 * it might add buffers to the Rx pool
					 * when we are trying to deinitialize.
					 */
					continue;
				}
			}
			ibd_rc_process_rx(chan, WQE_TO_RWQE(wqe), wc);
		}
	}
}

/* Receive CQ handler */
/* ARGSUSED */
static void
ibd_rc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
	ibd_state_t *state = chan->state;

	ASSERT(chan->chan_state == IBD_RC_STATE_PAS_ESTAB);

	/*
	 * Poll for completed entries; the CQ will not interrupt any
	 * more for incoming (or transmitted) packets.
	 */
	state->rc_rcq_invoke++;
	ibd_rc_poll_rcq(chan, chan->rcq_hdl);

	/*
	 * Now enable CQ notifications; all packets that arrive now
	 * (or complete transmission) will cause new interrupts.
	 */
	if (ibt_enable_cq_notify(chan->rcq_hdl, IBT_NEXT_COMPLETION) !=
	    IBT_SUCCESS) {
		/*
		 * We do not expect a failure here.
		 */
		DPRINT(40, "ibd_rc_rcq_handler: ibt_enable_cq_notify() failed");
	}

	/*
	 * Repoll to catch all packets that might have arrived after
	 * we finished the first poll loop and before interrupts got
	 * armed.
	 */
	ibd_rc_poll_rcq(chan, chan->rcq_hdl);

	if (state->rc_enable_srq) {
		mutex_enter(&state->rc_rx_lock);

		if (state->rc_rx_mp != NULL) {
			mblk_t *mpc;
			mpc = state->rc_rx_mp;

			state->rc_rx_mp = NULL;
			state->rc_rx_mp_tail = NULL;
			state->rc_rx_mp_len = 0;

			mutex_exit(&state->rc_rx_lock);
			mac_rx(state->id_mh, NULL, mpc);
		} else {
			mutex_exit(&state->rc_rx_lock);
		}
	} else {
		mutex_enter(&chan->rx_lock);

		if (chan->rx_mp != NULL) {
			mblk_t *mpc;
			mpc = chan->rx_mp;

			chan->rx_mp = NULL;
			chan->rx_mp_tail = NULL;
			chan->rx_mp_len = 0;

			mutex_exit(&chan->rx_lock);
			mac_rx(state->id_mh, NULL, mpc);
		} else {
			mutex_exit(&chan->rx_lock);
		}
	}
}

/*
 * Set up the statically allocated Tx large-buffer list.
 */
int
ibd_rc_init_tx_largebuf_list(ibd_state_t *state)
{
	ibd_rc_tx_largebuf_t *lbufp;
	ibd_rc_tx_largebuf_t *tail;
	uint8_t *memp;
	ibt_mr_attr_t mem_attr;
	uint32_t num_swqe;
	size_t  mem_size;
	int i;

	num_swqe = ibd_rc_num_swqe - 1;

	/*
	 * Allocate one big chunk for all Tx large copy bufs
	 */
	/* Don't transfer IPOIB_GRH_SIZE bytes (40 bytes) */
	mem_size = num_swqe * state->rc_mtu;
	state->rc_tx_mr_bufs = kmem_zalloc(mem_size, KM_SLEEP);

	mem_attr.mr_len = mem_size;
	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->rc_tx_mr_bufs;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_SLEEP;
	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
	    &state->rc_tx_mr_hdl, &state->rc_tx_mr_desc) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_init_tx_largebuf_list: ibt_register_mr "
		    "failed");
		kmem_free(state->rc_tx_mr_bufs, mem_size);
		state->rc_tx_mr_bufs = NULL;
		return (DDI_FAILURE);
	}

	state->rc_tx_largebuf_desc_base = kmem_zalloc(num_swqe *
	    sizeof (ibd_rc_tx_largebuf_t), KM_SLEEP);

	/*
	 * Set up the buf chain
	 */
	memp = state->rc_tx_mr_bufs;
	mutex_enter(&state->rc_tx_large_bufs_lock);
	lbufp = state->rc_tx_largebuf_desc_base;
	for (i = 0; i < num_swqe; i++) {
		lbufp->lb_buf = memp;
		lbufp->lb_next = lbufp + 1;

		tail = lbufp;

		memp += state->rc_mtu;
		lbufp++;
	}
	tail->lb_next = NULL;

	/*
	 * Set up the buffer information in ibd state
	 */
	state->rc_tx_largebuf_free_head = state->rc_tx_largebuf_desc_base;
	state->rc_tx_largebuf_nfree = num_swqe;
	mutex_exit(&state->rc_tx_large_bufs_lock);
	return (DDI_SUCCESS);
}
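
/*
 * For reference, the Tx large-buffer arrangement set up above:
 *
 *	rc_tx_mr_bufs:			(ibd_rc_num_swqe - 1) slots of
 *					rc_mtu bytes, one registration
 *	rc_tx_largebuf_desc_base:	matching descriptor array, chained
 *					into a free list headed by
 *					rc_tx_largebuf_free_head
 *
 * The send path is expected to take a descriptor from this free list for
 * packets that need the large copy buffer; ibd_rc_tx_cleanup() returns it.
 */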

void
ibd_rc_fini_tx_largebuf_list(ibd_state_t *state)
{
	uint32_t num_swqe;

	num_swqe = ibd_rc_num_swqe - 1;

	if (ibt_deregister_mr(state->id_hca_hdl,
	    state->rc_tx_mr_hdl) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_fini_tx_largebuf_list: ibt_deregister_mr() "
		    "failed");
	}
	state->rc_tx_mr_hdl = NULL;

	kmem_free(state->rc_tx_mr_bufs, num_swqe * state->rc_mtu);
	state->rc_tx_mr_bufs = NULL;

	kmem_free(state->rc_tx_largebuf_desc_base,
	    num_swqe * sizeof (ibd_rc_tx_largebuf_t));
	state->rc_tx_largebuf_desc_base = NULL;
}

static int
ibd_rc_alloc_tx_copybufs(ibd_rc_chan_t *chan)
{
	ibt_mr_attr_t mem_attr;
	ibd_state_t *state;

	state = chan->state;
	ASSERT(state != NULL);

	/*
	 * Allocate one big chunk for all regular tx copy bufs
	 */
	mem_attr.mr_len = chan->scq_size * ibd_rc_tx_copy_thresh;

	chan->tx_mr_bufs = kmem_zalloc(mem_attr.mr_len, KM_SLEEP);

	/*
	 * Do one memory registration on the entire txbuf area
	 */
	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)chan->tx_mr_bufs;
	mem_attr.mr_as = NULL;
	mem_attr.mr_flags = IBT_MR_SLEEP;
	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
	    &chan->tx_mr_hdl, &chan->tx_mr_desc) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_alloc_tx_copybufs: ibt_register_mr failed");
		ASSERT(mem_attr.mr_len ==
		    chan->scq_size * ibd_rc_tx_copy_thresh);
		kmem_free(chan->tx_mr_bufs, mem_attr.mr_len);
		chan->tx_mr_bufs = NULL;
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

/*
 * Allocate the statically allocated Tx buffer list.
 */
static int
ibd_rc_init_txlist(ibd_rc_chan_t *chan)
{
	ibd_swqe_t *swqe;
	int i;
	ibt_lkey_t lkey;

	if (ibd_rc_alloc_tx_copybufs(chan) != DDI_SUCCESS)
		return (DDI_FAILURE);

	/*
	 * Allocate and setup the swqe list
	 */
	lkey = chan->tx_mr_desc.md_lkey;
	chan->tx_wqes = kmem_zalloc(chan->scq_size *
	    sizeof (ibd_swqe_t), KM_SLEEP);
	swqe = chan->tx_wqes;
	for (i = 0; i < chan->scq_size; i++, swqe++) {
		swqe->swqe_next = NULL;
		swqe->swqe_im_mblk = NULL;

		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */

		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
		swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL;
		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
		    (chan->tx_mr_bufs + i * ibd_rc_tx_copy_thresh);
		swqe->w_swr.wr_trans = IBT_RC_SRV;

		/* Add to list */
		mutex_enter(&chan->tx_wqe_list.dl_mutex);
		chan->tx_wqe_list.dl_cnt++;
		swqe->swqe_next = chan->tx_wqe_list.dl_head;
		chan->tx_wqe_list.dl_head = SWQE_TO_WQE(swqe);
		mutex_exit(&chan->tx_wqe_list.dl_mutex);
	}

	return (DDI_SUCCESS);
}
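
/*
 * Note (derived from the code above): each swqe i owns a fixed copy-buffer
 * slot of ibd_rc_tx_copy_thresh bytes at
 * tx_mr_bufs + i * ibd_rc_tx_copy_thresh.  Packets no larger than the copy
 * threshold are expected to be bcopy'd into this slot; larger packets use
 * the shared large-buffer pool or ibt_map_mem_iov() instead.
 */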

/*
 * Free the statically allocated Tx buffer list.
 */
static void
ibd_rc_fini_txlist(ibd_rc_chan_t *chan)
{
	if (chan->tx_mr_hdl != NULL) {
		if (ibt_deregister_mr(chan->state->id_hca_hdl,
		    chan->tx_mr_hdl) != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_fini_txlist: ibt_deregister_mr "
			    "failed");
		}
		chan->tx_mr_hdl = NULL;
	}

	if (chan->tx_mr_bufs != NULL) {
		kmem_free(chan->tx_mr_bufs, chan->scq_size *
		    ibd_rc_tx_copy_thresh);
		chan->tx_mr_bufs = NULL;
	}

	if (chan->tx_wqes != NULL) {
		kmem_free(chan->tx_wqes, chan->scq_size *
		    sizeof (ibd_swqe_t));
		chan->tx_wqes = NULL;
	}
}

/*
 * Acquire a send wqe from the free list.
 * Returns the swqe pointer, or NULL if none is available.
 */
ibd_swqe_t *
ibd_rc_acquire_swqes(ibd_rc_chan_t *chan)
{
	ibd_swqe_t *wqe;

	mutex_enter(&chan->tx_rel_list.dl_mutex);
	if (chan->tx_rel_list.dl_head != NULL) {
		/* transfer tx_rel_list to tx_wqe_list */
		chan->tx_wqe_list.dl_head =
		    chan->tx_rel_list.dl_head;
		chan->tx_wqe_list.dl_cnt =
		    chan->tx_rel_list.dl_cnt;
		chan->tx_wqe_list.dl_pending_sends = B_FALSE;

		/* clear tx_rel_list */
		chan->tx_rel_list.dl_head = NULL;
		chan->tx_rel_list.dl_cnt = 0;
		mutex_exit(&chan->tx_rel_list.dl_mutex);

		wqe = WQE_TO_SWQE(chan->tx_wqe_list.dl_head);
		chan->tx_wqe_list.dl_cnt -= 1;
		chan->tx_wqe_list.dl_head = wqe->swqe_next;
	} else {	/* no free swqe */
		mutex_exit(&chan->tx_rel_list.dl_mutex);
		chan->tx_wqe_list.dl_pending_sends = B_TRUE;
		wqe = NULL;
	}
	return (wqe);
}

/*
 * Release send wqe back into free list.
 */
static void
ibd_rc_release_swqe(ibd_rc_chan_t *chan, ibd_swqe_t *swqe)
{
	/*
	 * Add back on Tx list for reuse.
	 */
	swqe->swqe_next = NULL;
	mutex_enter(&chan->tx_rel_list.dl_mutex);
	chan->tx_rel_list.dl_pending_sends = B_FALSE;
	swqe->swqe_next = chan->tx_rel_list.dl_head;
	chan->tx_rel_list.dl_head = SWQE_TO_WQE(swqe);
	chan->tx_rel_list.dl_cnt++;
	mutex_exit(&chan->tx_rel_list.dl_mutex);
}

void
ibd_rc_post_send(ibd_rc_chan_t *chan, ibd_swqe_t *node)
{
	uint_t		i;
	uint_t		num_posted;
	uint_t		n_wrs;
	ibt_status_t	ibt_status;
	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
	ibd_swqe_t	*tx_head, *elem;
	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];

	/* post the one request, then check for more */
	ibt_status = ibt_post_send(chan->chan_hdl,
	    &node->w_swr, 1, NULL);
	if (ibt_status != IBT_SUCCESS) {
		ibd_print_warn(chan->state, "ibd_post_send: "
		    "posting one wr failed: ret=%d", ibt_status);
		ibd_rc_tx_cleanup(node);
	}

	tx_head = NULL;
	for (;;) {
		if (tx_head == NULL) {
			mutex_enter(&chan->tx_post_lock);
			tx_head = chan->tx_head;
			if (tx_head == NULL) {
				chan->tx_busy = 0;
				mutex_exit(&chan->tx_post_lock);
				return;
			}
			chan->tx_head = NULL;
			mutex_exit(&chan->tx_post_lock);
		}

		/*
		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
		 * at a time if possible, and keep posting them.
		 */
		for (n_wrs = 0, elem = tx_head;
		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
			nodes[n_wrs] = elem;
			wrs[n_wrs] = elem->w_swr;
		}
		tx_head = elem;

		ASSERT(n_wrs != 0);

		/*
		 * If posting fails for some reason, we'll never receive
		 * completion intimation, so we'll need to cleanup. But
		 * we need to make sure we don't clean up nodes whose
		 * wrs have been successfully posted. We assume that the
		 * hca driver returns on the first failure to post and
		 * therefore the first 'num_posted' entries don't need
		 * cleanup here.
		 */
		num_posted = 0;
		ibt_status = ibt_post_send(chan->chan_hdl,
		    wrs, n_wrs, &num_posted);
		if (ibt_status != IBT_SUCCESS) {
			ibd_print_warn(chan->state, "ibd_post_send: "
			    "posting multiple wrs failed: "
			    "requested=%d, done=%d, ret=%d",
			    n_wrs, num_posted, ibt_status);

			for (i = num_posted; i < n_wrs; i++)
				ibd_rc_tx_cleanup(nodes[i]);
		}
	}
}
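
/*
 * Design note (derived from the code above): the first WR is posted on its
 * own, after which any requests that were queued on chan->tx_head while
 * this thread held tx_busy are drained in batches of up to
 * IBD_MAX_TX_POST_MULTIPLE per ibt_post_send() call.  On a partial failure
 * only the entries from num_posted onward are cleaned up, matching the
 * assumption that the HCA driver stops at the first failing WR.
 */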

/*
 * Common code that deals with cleanup after a successful or
 * erroneous transmission attempt.
 */
void
ibd_rc_tx_cleanup(ibd_swqe_t *swqe)
{
	ibd_ace_t *ace = swqe->w_ahandle;
	ibd_state_t *state;

	ASSERT(ace != NULL);
	ASSERT(ace->ac_chan != NULL);

	state = ace->ac_chan->state;

	/*
	 * If this was a dynamic registration in ibd_send(),
	 * deregister now.
	 */
	if (swqe->swqe_im_mblk != NULL) {
		ASSERT(swqe->w_buftype == IBD_WQE_MAPPED);
		if (swqe->w_buftype == IBD_WQE_MAPPED) {
			ibd_unmap_mem(state, swqe);
		}
		freemsg(swqe->swqe_im_mblk);
		swqe->swqe_im_mblk = NULL;
	} else {
		ASSERT(swqe->w_buftype != IBD_WQE_MAPPED);
	}

	if (swqe->w_buftype == IBD_WQE_RC_COPYBUF) {
		ibd_rc_tx_largebuf_t *lbufp;

		lbufp = swqe->w_rc_tx_largebuf;
		ASSERT(lbufp != NULL);

		mutex_enter(&state->rc_tx_large_bufs_lock);
		lbufp->lb_next = state->rc_tx_largebuf_free_head;
		state->rc_tx_largebuf_free_head = lbufp;
		state->rc_tx_largebuf_nfree++;
		mutex_exit(&state->rc_tx_large_bufs_lock);
		swqe->w_rc_tx_largebuf = NULL;
	}


	/*
	 * Release the send wqe for reuse.
	 */
	ibd_rc_release_swqe(ace->ac_chan, swqe);

	/*
	 * Drop the reference count on the AH; it can be reused
	 * now for a different destination if there are no more
	 * posted sends that will use it. This can be eliminated
	 * if we can always associate each Tx buffer with an AH.
	 * In the RC path the ace is guaranteed to be non-NULL
	 * (asserted at the top of this function).
	 */
	ibd_dec_ref_ace(state, ace);
}

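/*
 * Drain the send completion queue of an RC channel: poll for completed
 * work requests, recycle the associated swqes via ibd_rc_tx_cleanup(),
 * and, if the Tx path was stalled for lack of swqes or large Tx buffers,
 * call mac_tx_update() once enough resources are free again.
 */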
void
ibd_rc_drain_scq(ibd_rc_chan_t *chan, ibt_cq_hdl_t cq_hdl)
{
	ibd_state_t *state = chan->state;
	ibd_wqe_t *wqe;
	ibt_wc_t *wc, *wcs;
	uint_t numwcs, real_numwcs;
	int i;

	wcs = chan->tx_wc;
	numwcs = IBD_RC_MAX_CQ_WC;

	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &real_numwcs) == IBT_SUCCESS) {
		for (i = 0, wc = wcs; i < real_numwcs; i++, wc++) {
			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
			if (wc->wc_status != IBT_WC_SUCCESS) {
				chan->tx_trans_error_cnt++;
				DPRINT(30, "ibd_rc_drain_scq: "
				    "wc_status(%d) != SUCC, "
				    "chan=%p, ace=%p, link_state=%d",
				    wc->wc_status, chan, chan->ace,
				    chan->state->id_link_state);
			} else {
				chan->tx_trans_error_cnt = 0;
			}
			ibd_rc_tx_cleanup(WQE_TO_SWQE(wqe));
		}

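		/*
		 * If the Tx path was stalled waiting for swqes or large
		 * Tx buffers, check whether enough resources have been
		 * freed to resume transmission and notify the MAC layer.
		 */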
		mutex_enter(&state->id_sched_lock);
		if (state->id_sched_needed == 0) {
			mutex_exit(&state->id_sched_lock);
		} else if (state->id_sched_needed & IBD_RSRC_RC_SWQE) {
			mutex_enter(&chan->tx_wqe_list.dl_mutex);
			mutex_enter(&chan->tx_rel_list.dl_mutex);
			if ((chan->tx_rel_list.dl_cnt +
			    chan->tx_wqe_list.dl_cnt) > IBD_RC_TX_FREE_THRESH) {
				state->id_sched_needed &= ~IBD_RSRC_RC_SWQE;
				mutex_exit(&chan->tx_rel_list.dl_mutex);
				mutex_exit(&chan->tx_wqe_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
				state->rc_swqe_mac_update++;
				mac_tx_update(state->id_mh);
			} else {
				state->rc_scq_no_swqe++;
				mutex_exit(&chan->tx_rel_list.dl_mutex);
				mutex_exit(&chan->tx_wqe_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
			}
		} else if (state->id_sched_needed & IBD_RSRC_RC_TX_LARGEBUF) {
			mutex_enter(&state->rc_tx_large_bufs_lock);
			if (state->rc_tx_largebuf_nfree >
			    IBD_RC_TX_FREE_THRESH) {
				ASSERT(state->rc_tx_largebuf_free_head != NULL);
				state->id_sched_needed &=
				    ~IBD_RSRC_RC_TX_LARGEBUF;
				mutex_exit(&state->rc_tx_large_bufs_lock);
				mutex_exit(&state->id_sched_lock);
				state->rc_xmt_buf_mac_update++;
				mac_tx_update(state->id_mh);
			} else {
				state->rc_scq_no_largebuf++;
				mutex_exit(&state->rc_tx_large_bufs_lock);
				mutex_exit(&state->id_sched_lock);
			}
		} else if (state->id_sched_needed & IBD_RSRC_SWQE) {
			mutex_enter(&state->id_tx_list.dl_mutex);
			mutex_enter(&state->id_tx_rel_list.dl_mutex);
			if ((state->id_tx_list.dl_cnt +
			    state->id_tx_rel_list.dl_cnt)
			    > IBD_FREE_SWQES_THRESH) {
				state->id_sched_needed &= ~IBD_RSRC_SWQE;
				state->id_sched_cnt++;
				mutex_exit(&state->id_tx_rel_list.dl_mutex);
				mutex_exit(&state->id_tx_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
				mac_tx_update(state->id_mh);
			} else {
				mutex_exit(&state->id_tx_rel_list.dl_mutex);
				mutex_exit(&state->id_tx_list.dl_mutex);
				mutex_exit(&state->id_sched_lock);
			}
		} else {
			mutex_exit(&state->id_sched_lock);
		}
	}
}

/* Send CQ handler: call ibd_rc_tx_recycle() to recycle Tx buffers */
/* ARGSUSED */
static void
ibd_rc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;

	chan->state->rc_scq_invoke++;

	if (ibd_rc_tx_softintr == 1) {
		mutex_enter(&chan->tx_poll_lock);
		if (chan->tx_poll_busy & IBD_CQ_POLLING) {
			chan->tx_poll_busy |= IBD_REDO_CQ_POLLING;
			mutex_exit(&chan->tx_poll_lock);
			return;
		} else {
			mutex_exit(&chan->tx_poll_lock);
			ddi_trigger_softintr(chan->scq_softintr);
		}
	} else
		(void) ibd_rc_tx_recycle(arg);
}

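/*
 * Reap Tx completions for an RC channel. This runs either directly from
 * the send CQ handler or from a soft interrupt (when ibd_rc_tx_softintr
 * is set). It drains the send CQ, re-arms CQ notification, and, if too
 * many transport errors have been seen, initiates a reset of the RC
 * connection.
 */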
static uint_t
ibd_rc_tx_recycle(caddr_t arg)
{
	ibd_rc_chan_t *chan = (ibd_rc_chan_t *)arg;
	ibd_ace_t *ace;
	ibd_state_t *state = chan->state;
	int flag, redo_flag;
	int redo = 1;

	flag = IBD_CQ_POLLING;
	redo_flag = IBD_REDO_CQ_POLLING;

	mutex_enter(&chan->tx_poll_lock);
	if (chan->tx_poll_busy & flag) {
		ibd_print_warn(state, "ibd_rc_tx_recycle: multiple polling "
		    "threads");
		chan->tx_poll_busy |= redo_flag;
		mutex_exit(&chan->tx_poll_lock);
		return (DDI_INTR_CLAIMED);
	}
	chan->tx_poll_busy |= flag;
	mutex_exit(&chan->tx_poll_lock);

	/*
	 * Poll for completed entries; the CQ will not interrupt any
	 * more for completed packets.
	 */
	ibd_rc_drain_scq(chan, chan->scq_hdl);

	/*
	 * Now enable CQ notifications; all completions originating now
	 * will cause new interrupts.
	 */
	do {
		if (ibt_enable_cq_notify(chan->scq_hdl, IBT_NEXT_COMPLETION) !=
		    IBT_SUCCESS) {
			/*
			 * We do not expect a failure here.
			 */
			DPRINT(40, "ibd_rc_scq_handler: ibt_enable_cq_notify()"
			    " failed");
		}

		ibd_rc_drain_scq(chan, chan->scq_hdl);

		if (chan->tx_trans_error_cnt > 3) {
			mutex_enter(&chan->tx_poll_lock);
			chan->tx_poll_busy = 0;
			mutex_exit(&chan->tx_poll_lock);
			goto error_reset_chan;
		}
		mutex_enter(&chan->tx_poll_lock);
		if (chan->tx_poll_busy & redo_flag)
			chan->tx_poll_busy &= ~redo_flag;
		else {
			chan->tx_poll_busy &= ~flag;
			redo = 0;
		}
		mutex_exit(&chan->tx_poll_lock);

	} while (redo);

	return (DDI_INTR_CLAIMED);

error_reset_chan:
	/*
	 * Too many transport errors were seen on this channel; reset
	 * the RC connection if no other thread is already closing it.
	 */
	mutex_enter(&state->id_ac_mutex);
	if ((chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
	    (chan->state->id_link_state == LINK_STATE_UP) &&
	    ((ace = ibd_acache_find(state, &chan->ace->ac_mac, B_FALSE, 0))
	    != NULL) && (ace == chan->ace)) {
		ASSERT(ace->ac_mce == NULL);
		INC_REF(ace, 1);
		IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
		chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
		mutex_exit(&state->id_ac_mutex);
		state->rc_reset_cnt++;
		DPRINT(30, "ibd_rc_tx_recycle(chan=%p, ace=%p): "
		    " reset RC channel", chan, chan->ace);
		ibd_rc_signal_act_close(state, ace);
	} else {
		mutex_exit(&state->id_ac_mutex);
		state->rc_act_close_simultaneous++;
		DPRINT(40, "ibd_rc_tx_recycle: other thread is closing"
		    " it. chan=%p, act_state=%d, link_state=%d, ace=%p",
		    chan, chan->chan_state, state->id_link_state, ace);
	}
	return (DDI_INTR_CLAIMED);
}

/* Listen with corresponding service ID */
ibt_status_t
ibd_rc_listen(ibd_state_t *state)
{
	ibt_srv_desc_t srvdesc;
	ib_svc_id_t ret_sid;
	ibt_status_t status;
	ib_gid_t gid;

	if (state->rc_listen_hdl != NULL) {
		DPRINT(40, "ibd_rc_listen: rc_listen_hdl should be NULL");
		return (IBT_FAILURE);
	}

	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;

	/*
	 * Register the service with the service ID. Incoming connection
	 * requests should arrive on this service ID.
	 */
	status = ibt_register_service(state->id_ibt_hdl, &srvdesc,
	    IBD_RC_QPN_TO_SID(state->id_qpnum),
	    1, &state->rc_listen_hdl, &ret_sid);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen: Service Registration Failed, "
		    "ret=%d", status);
		return (status);
	}

	gid = state->id_sgid;

	/* pass state as cm_private */
	status = ibt_bind_service(state->rc_listen_hdl,
	    gid, NULL, state, &state->rc_listen_bind);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen:"
		    " fail to bind port: <%d>", status);
		(void) ibt_deregister_service(state->id_ibt_hdl,
		    state->rc_listen_hdl);
		return (status);
	}

	/*
	 * Legacy OFED used an incorrect service ID (with one additional
	 * zero digit) for many years. To interoperate with legacy OFED,
	 * we also support this incorrect service ID here.
	 */
	ASSERT(state->rc_listen_hdl_OFED_interop == NULL);

	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
	srvdesc.sd_handler = ibd_rc_dispatch_pass_mad;
	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;

	/*
	 * Register the service with the legacy OFED interop service ID.
	 * Incoming connection requests from legacy OFED peers should
	 * arrive on this service ID.
	 */
	status = ibt_register_service(state->id_ibt_hdl, &srvdesc,
	    IBD_RC_QPN_TO_SID_OFED_INTEROP(state->id_qpnum),
	    1, &state->rc_listen_hdl_OFED_interop, &ret_sid);
	if (status != IBT_SUCCESS) {
		DPRINT(40,
		    "ibd_rc_listen: Service Registration for Legacy OFED "
		    "Failed %d", status);
		(void) ibt_unbind_service(state->rc_listen_hdl,
		    state->rc_listen_bind);
		(void) ibt_deregister_service(state->id_ibt_hdl,
		    state->rc_listen_hdl);
		return (status);
	}

	gid = state->id_sgid;

	/* pass state as cm_private */
	status = ibt_bind_service(state->rc_listen_hdl_OFED_interop,
	    gid, NULL, state, &state->rc_listen_bind_OFED_interop);
	if (status != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_listen: fail to bind port: <%d> for "
		    "Legacy OFED listener", status);
		(void) ibt_deregister_service(state->id_ibt_hdl,
		    state->rc_listen_hdl_OFED_interop);
		(void) ibt_unbind_service(state->rc_listen_hdl,
		    state->rc_listen_bind);
		(void) ibt_deregister_service(state->id_ibt_hdl,
		    state->rc_listen_hdl);
		return (status);
	}

	return (IBT_SUCCESS);
}

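/*
 * Stop listening for incoming connection requests: unbind and deregister
 * both the standard service and the legacy OFED interop service.
 */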
void
ibd_rc_stop_listen(ibd_state_t *state)
{
	int ret;

	/* Disable incoming connection requests */
	if (state->rc_listen_hdl != NULL) {
		ret = ibt_unbind_all_services(state->rc_listen_hdl);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen:"
			    "ibt_unbind_all_services() failed, ret=%d", ret);
		}
		ret = ibt_deregister_service(state->id_ibt_hdl,
		    state->rc_listen_hdl);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen:"
			    "ibt_deregister_service() failed, ret=%d", ret);
		} else {
			state->rc_listen_hdl = NULL;
		}
	}

	/* Disable incoming connection requests (legacy OFED interop) */
	if (state->rc_listen_hdl_OFED_interop != NULL) {
		ret = ibt_unbind_all_services(
		    state->rc_listen_hdl_OFED_interop);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen:"
			    "ibt_unbind_all_services() failed: %d", ret);
		}
		ret = ibt_deregister_service(state->id_ibt_hdl,
		    state->rc_listen_hdl_OFED_interop);
		if (ret != 0) {
			DPRINT(40, "ibd_rc_stop_listen:"
			    "ibt_deregister_service() failed: %d", ret);
		} else {
			state->rc_listen_hdl_OFED_interop = NULL;
		}
	}
}

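/*
 * Close all RC channels (passive and active) associated with this
 * interface. Rx CQ handlers are disabled first; if receive buffers are
 * still outstanding with the network layer after waiting, the handlers
 * are re-armed and DDI_FAILURE is returned.
 */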
int
ibd_rc_close_all_chan(ibd_state_t *state)
{
	ibd_rc_chan_t *rc_chan, *rc_chan1;
	ibd_ace_t *ace;
	uint_t attempts;

	/* Disable all Rx routines */
	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
	rc_chan = state->rc_pass_chan_list.chan_list;
	while (rc_chan != NULL) {
		ibt_set_cq_handler(rc_chan->rcq_hdl, 0, 0);
		rc_chan = rc_chan->next;
	}
	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);

	if (state->rc_enable_srq) {
		attempts = 50;
		while (state->rc_srq_rwqe_list.dl_bufs_outstanding > 0) {
			DPRINT(30, "ibd_rc_close_all_chan: outstanding > 0");
			delay(drv_usectohz(100000));
			if (--attempts == 0) {
				/*
				 * Receive buffers are still held by the
				 * network layer and we cannot wait any
				 * longer. Reap the Rx completions that
				 * were posted since we turned off
				 * notification, re-arm the Rx CQ handlers
				 * and return failure.
				 */
				mutex_enter(
				    &state->rc_pass_chan_list.chan_list_mutex);
				rc_chan = state->rc_pass_chan_list.chan_list;
				while (rc_chan != NULL) {
					ibd_rc_poll_rcq
					    (rc_chan, rc_chan->rcq_hdl);
					ibt_set_cq_handler(rc_chan->rcq_hdl,
					    ibd_rc_rcq_handler, rc_chan);
					rc_chan = rc_chan->next;
				}
				mutex_exit(
				    &state->rc_pass_chan_list.chan_list_mutex);
				return (DDI_FAILURE);
			}
		}
	}

	/* Close all passive RC channels */
	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
	while (rc_chan != NULL) {
		if (ibd_rc_pas_close(rc_chan) != DDI_SUCCESS) {
			mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
			rc_chan1 = state->rc_pass_chan_list.chan_list;
			while (rc_chan1 != NULL) {
				ibd_rc_poll_rcq(rc_chan1, rc_chan1->rcq_hdl);
				ibt_set_cq_handler(rc_chan1->rcq_hdl,
				    ibd_rc_rcq_handler, rc_chan1);
				rc_chan1 = rc_chan1->next;
			}
			mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
			ibd_rc_add_to_chan_list(&state->rc_pass_chan_list,
			    rc_chan);
			DPRINT(40, "ibd_rc_close_all_chan: ibd_rc_pas_close() "
			    "failed");
			return (DDI_FAILURE);
		}
		rc_chan = ibd_rc_rm_header_chan_list(&state->rc_pass_chan_list);
	}

	/* Close all active RC channels */
	mutex_enter(&state->id_ac_mutex);
	ace = list_head(&state->id_ah_active);
	while (ace != NULL) {
		if (ace->ac_chan != NULL) {
			ibd_rc_add_to_chan_list(&state->rc_obs_act_chan_list,
			    ace->ac_chan);
		}
		ace = list_next(&state->id_ah_active, ace);
	}
	mutex_exit(&state->id_ac_mutex);

	rc_chan = ibd_rc_rm_header_chan_list(&state->rc_obs_act_chan_list);
	while (rc_chan != NULL) {
		ace = rc_chan->ace;
		ibd_rc_act_close(rc_chan);
		if (ace != NULL)
			ace->ac_chan = NULL;
		rc_chan = ibd_rc_rm_header_chan_list(
		    &state->rc_obs_act_chan_list);
	}
	return (DDI_SUCCESS);
}

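/*
 * Try to establish an RC connection to the destination in "ace". The
 * legacy OFED interop service ID is tried first (with one retry); if
 * that fails, fall back to the standard IBD_RC_SERVICE_ID.
 */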
void
ibd_rc_try_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path)
{
	ibt_status_t status;

	status = ibd_rc_connect(state, ace, path,
	    IBD_RC_SERVICE_ID_OFED_INTEROP);

	if (status != IBT_SUCCESS) {
		/* wait for the peer side to remove the stale channel */
		delay(drv_usectohz(10000));
		status = ibd_rc_connect(state, ace, path,
		    IBD_RC_SERVICE_ID_OFED_INTEROP);
	}

	if (status != IBT_SUCCESS) {
		/* wait for the peer side to remove the stale channel */
		delay(drv_usectohz(10000));
		(void) ibd_rc_connect(state, ace, path,
		    IBD_RC_SERVICE_ID);
	}
}

/*
 * Allocate an RC channel, assign it to ace->ac_chan, and open the
 * channel.
 */
ibt_status_t
ibd_rc_connect(ibd_state_t *state, ibd_ace_t *ace,  ibt_path_info_t *path,
    uint64_t ietf_cm_service_id)
{
	ibt_status_t status = 0;
	ibt_rc_returns_t open_returns;
	ibt_chan_open_args_t open_args;
	ibd_rc_msg_hello_t hello_req_msg;
	ibd_rc_msg_hello_t *hello_ack_msg;
	ibd_rc_chan_t *chan;

	ASSERT(ace != NULL);
	ASSERT(ace->ac_mce == NULL);
	ASSERT(ace->ac_chan == NULL);

	if ((status = ibd_rc_alloc_chan(&chan, state, B_TRUE)) != IBT_SUCCESS) {
		DPRINT(10, "ibd_rc_connect: ibd_rc_alloc_chan() failed");
		return (status);
	}

	ace->ac_chan = chan;
	chan->state = state;
	chan->ace = ace;

	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)ace);

	hello_ack_msg = kmem_zalloc(sizeof (ibd_rc_msg_hello_t), KM_SLEEP);

	/*
	 * Open the channel.
	 */
	bzero(&open_args, sizeof (ibt_chan_open_args_t));
	bzero(&open_returns, sizeof (ibt_rc_returns_t));

	open_args.oc_cm_handler = ibd_rc_dispatch_actv_mad;
	open_args.oc_cm_clnt_private = (void *)(uintptr_t)ace;

	/*
	 * update path record with the SID
	 */
	path->pi_sid =
	    ietf_cm_service_id | ((ace->ac_dest->ud_dst_qpn) & 0xffffff);


	/* pre-allocate memory for hello ack message */
	open_returns.rc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
	open_returns.rc_priv_data = hello_ack_msg;

	open_args.oc_path = path;

	open_args.oc_path_rnr_retry_cnt	= 7;
	open_args.oc_path_retry_cnt = 7;

	/* We don't do RDMA */
	open_args.oc_rdma_ra_out = 0;
	open_args.oc_rdma_ra_in	= 0;

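	/*
	 * Advertise our QP number and RC MTU to the peer in the private
	 * data of the CM REQ (the "hello" message).
	 */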
	hello_req_msg.reserved_qpn = htonl(state->id_qpnum);
	hello_req_msg.rx_mtu = htonl(state->rc_mtu);
	open_args.oc_priv_data_len = sizeof (ibd_rc_msg_hello_t);
	open_args.oc_priv_data = (void *)(&hello_req_msg);

	ASSERT(open_args.oc_priv_data_len <= IBT_REQ_PRIV_DATA_SZ);
	ASSERT(open_returns.rc_priv_data_len <= IBT_REP_PRIV_DATA_SZ);
	ASSERT(open_args.oc_cm_handler != NULL);

	status = ibt_open_rc_channel(chan->chan_hdl, IBT_OCHAN_NO_FLAGS,
	    IBT_BLOCKING, &open_args, &open_returns);

	if (status == IBT_SUCCESS) {
		/* Success! */
		DPRINT(2, "ibd_rc_connect: call ibt_open_rc_channel succ!");
		state->rc_conn_succ++;
		kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
		return (IBT_SUCCESS);
	}

	/* failure */
	(void) ibt_flush_channel(chan->chan_hdl);
	ibd_rc_free_chan(chan);
	ace->ac_chan = NULL;

	/* check open_returns, report the error and exit */
	DPRINT(30, "ibd_rc_connect: ibt_open_rc_channel() failed."
	    " ret status = %d, reason=%d, ace=%p, mtu=0x%x, qpn=0x%x,"
	    " peer qpn=0x%x", status, (int)open_returns.rc_status, ace,
	    hello_req_msg.rx_mtu, hello_req_msg.reserved_qpn,
	    ace->ac_dest->ud_dst_qpn);
	kmem_free(hello_ack_msg, sizeof (ibd_rc_msg_hello_t));
	return (status);
}

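/*
 * Queue an asynchronous request to close an active-side RC channel. If
 * no ibd_req_t can be allocated, park the channel on the obsolete
 * active channel list instead.
 */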
void
ibd_rc_signal_act_close(ibd_state_t *state, ibd_ace_t *ace)
{
	ibd_req_t *req;

	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
	if (req == NULL) {
		ibd_print_warn(state, "ibd_rc_signal_act_close: failed to "
		    "allocate ibd_req_t");
		mutex_enter(&state->rc_obs_act_chan_list.chan_list_mutex);
		ace->ac_chan->next = state->rc_obs_act_chan_list.chan_list;
		state->rc_obs_act_chan_list.chan_list = ace->ac_chan;
		mutex_exit(&state->rc_obs_act_chan_list.chan_list_mutex);
	} else {
		req->rq_ptr = ace->ac_chan;
		ibd_queue_work_slot(state, req, IBD_ASYNC_RC_CLOSE_ACT_CHAN);
	}
}

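/*
 * Queue an asynchronous request to recycle an active ace (and its RC
 * channel). Only one recycle request is outstanding at a time; if one
 * is already pending, or no ibd_req_t can be allocated, return without
 * doing anything.
 */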
void
ibd_rc_signal_ace_recycle(ibd_state_t *state, ibd_ace_t *ace)
{
	ibd_req_t *req;

	mutex_enter(&state->rc_ace_recycle_lock);
	if (state->rc_ace_recycle != NULL) {
		mutex_exit(&state->rc_ace_recycle_lock);
		return;
	}

	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
	if (req == NULL) {
		mutex_exit(&state->rc_ace_recycle_lock);
		return;
	}

	state->rc_ace_recycle = ace;
	mutex_exit(&state->rc_ace_recycle_lock);
	ASSERT(ace->ac_mce == NULL);
	INC_REF(ace, 1);
	IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
	req->rq_ptr = ace;
	ibd_queue_work_slot(state, req, IBD_ASYNC_RC_RECYCLE_ACE);
}

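/*
 * Close an active-side RC channel. For an established channel, wait
 * (up to 5 seconds) for the send queue to drain before closing the
 * channel and freeing its resources.
 */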
static void
ibd_rc_act_close(ibd_rc_chan_t *chan)
{
	uint_t times;
	ibt_status_t ret;

	ASSERT(chan != NULL);

	chan->state->rc_act_close++;
	switch (chan->chan_state) {
	case IBD_RC_STATE_ACT_CLOSING:	/* stale, close it */
	case IBD_RC_STATE_ACT_ESTAB:
		DPRINT(30, "ibd_rc_act_close-1: close and free chan, "
		    "act_state=%d, chan=%p", chan->chan_state, chan);
		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
		/* Wait for the send queue to drain */
		times = 0;
		mutex_enter(&chan->tx_wqe_list.dl_mutex);
		mutex_enter(&chan->tx_rel_list.dl_mutex);
		while (((chan->tx_wqe_list.dl_cnt + chan->tx_rel_list.dl_cnt)
		    != chan->scq_size) && (times < 50)) {
			DPRINT(30, "ibd_rc_act_close: dl_cnt(tx_wqe_list=%d,"
			    " tx_rel_list=%d) != chan->scq_size=%d",
			    chan->tx_wqe_list.dl_cnt, chan->tx_rel_list.dl_cnt,
			    chan->scq_size);
			mutex_exit(&chan->tx_rel_list.dl_mutex);
			mutex_exit(&chan->tx_wqe_list.dl_mutex);
			mutex_enter(&chan->tx_poll_lock);
			if (chan->tx_poll_busy & IBD_CQ_POLLING) {
				DPRINT(40, "ibd_rc_act_close: multiple "
				    "polling threads");
				mutex_exit(&chan->tx_poll_lock);
			} else {
				chan->tx_poll_busy = IBD_CQ_POLLING;
				mutex_exit(&chan->tx_poll_lock);
				ibd_rc_drain_scq(chan, chan->scq_hdl);
				mutex_enter(&chan->tx_poll_lock);
				chan->tx_poll_busy = 0;
				mutex_exit(&chan->tx_poll_lock);
			}
			delay(drv_usectohz(100000));
			times++;
			mutex_enter(&chan->tx_wqe_list.dl_mutex);
			mutex_enter(&chan->tx_rel_list.dl_mutex);
		}
		mutex_exit(&chan->tx_rel_list.dl_mutex);
		mutex_exit(&chan->tx_wqe_list.dl_mutex);
		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
		ret = ibt_close_rc_channel(chan->chan_hdl,
		    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_act_close-2: ibt_close_rc_channel "
			    "fail, chan=%p, returned=%d", chan, ret);
		} else {
			DPRINT(30, "ibd_rc_act_close-2: ibt_close_rc_channel "
			    "succ, chan=%p", chan);
		}

		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_ACT_REP_RECV:
		chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
		(void) ibt_flush_channel(chan->chan_hdl);
		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_ACT_ERROR:
		DPRINT(40, "ibd_rc_act_close: IBD_RC_STATE_ERROR branch");
		break;
	default:
		DPRINT(40, "ibd_rc_act_close: default branch, act_state=%d, "
		    "chan=%p", chan->chan_state, chan);
	}
}

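/*
 * Close a passive-side RC channel. For an established channel without
 * SRQ, wait (up to 5 seconds) for outstanding receive buffers to be
 * returned by the network layer; if they are not, re-arm the Rx CQ
 * handler and return DDI_FAILURE.
 */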
static int
ibd_rc_pas_close(ibd_rc_chan_t *chan)
{
	uint_t times;
	ibt_status_t ret;

	ASSERT(chan != NULL);
	chan->state->rc_pas_close++;

	switch (chan->chan_state) {
	case IBD_RC_STATE_PAS_ESTAB:
		/*
		 * First, stop receive interrupts; this stops the
		 * connection from handing up buffers to higher layers.
		 * Wait for receive buffers to be returned; give up
		 * after 5 seconds.
		 */
		ibt_set_cq_handler(chan->rcq_hdl, 0, 0);
		if (!chan->state->rc_enable_srq) {
			times = 50;
			while (chan->rx_wqe_list.dl_bufs_outstanding > 0) {
				delay(drv_usectohz(100000));
				if (--times == 0) {
					DPRINT(40, "ibd_rc_pas_close : "
					    "reclaiming failed");
					ibd_rc_poll_rcq(chan, chan->rcq_hdl);
					ibt_set_cq_handler(chan->rcq_hdl,
					    ibd_rc_rcq_handler,
					    (void *)(uintptr_t)chan);
					return (DDI_FAILURE);
				}
			}
		}
		ibt_set_cq_handler(chan->scq_hdl, 0, 0);
		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
		DPRINT(30, "ibd_rc_pas_close-1: close and free chan, "
		    "chan_state=%d, chan=%p", chan->chan_state, chan);
		ret = ibt_close_rc_channel(chan->chan_hdl,
		    IBT_BLOCKING|IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
		if (ret != IBT_SUCCESS) {
			DPRINT(40, "ibd_rc_pas_close-2: ibt_close_rc_channel()"
			    " fail, chan=%p, returned=%d", chan, ret);
		} else {
			DPRINT(30, "ibd_rc_pas_close-2: ibt_close_rc_channel()"
			    " succ, chan=%p", chan);
		}

		ibd_rc_free_chan(chan);
		break;
	case IBD_RC_STATE_PAS_REQ_RECV:
		chan->chan_state = IBD_RC_STATE_PAS_CLOSED;
		(void) ibt_flush_channel(chan->chan_hdl);
		ibd_rc_free_chan(chan);
		break;
	default:
		DPRINT(40, "ibd_rc_pas_close: default, chan_state=%d, chan=%p",
		    chan->chan_state, chan);
	}
	return (DDI_SUCCESS);
}

/*
 * Remove a duplicate RC channel that comes from the same MAC.
 *
 * From the IP point of view, we could check for the same MAC using the
 * GID and P_Key (or the QPN, though across a reboot the QPN is likely
 * to change, so the P_Key is better). The GID usually equates to the
 * port (since it typically uses the port GUID in the low 64 bits).
 * These fields exist in the REQ messages.
 */
void
ibd_rc_handle_req_rm_dup(ibd_state_t *state, ibt_cm_event_t *ibt_cm_event)
{
	ibd_rc_chan_t *chan, *pre_chan;

	pre_chan = NULL;
	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
	chan = state->rc_pass_chan_list.chan_list;
	while (chan != NULL) {
		if ((bcmp(&chan->requester_gid,
		    &ibt_cm_event->cm_event.req.req_prim_addr.av_dgid,
		    sizeof (ib_gid_t)) == 0) && (chan->requester_pkey ==
		    ibt_cm_event->cm_event.req.req_pkey)) {
			if (pre_chan == NULL) {
				state->rc_pass_chan_list.chan_list = chan->next;
			} else {
				pre_chan->next = chan->next;
			}
			break;
		}
		pre_chan = chan;
		chan = chan->next;
	}
	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);
	if (chan) {
		DPRINT(30, "ibd_rc_handle_req_rm_dup: same gid and pkey, "
		    "remove duplicate channal, chan=%p", chan);
		if (ibd_rc_pas_close(chan) != DDI_SUCCESS) {
			ibd_rc_add_to_chan_list(&state->rc_pass_chan_list,
			    chan);
		}
	}
}

/*
 * Passive side:
 *	Handle an incoming CM REQ from the active side.
 *
 *	On success, this function allocates an ibd_rc_chan_t and assigns
 * it to "*ret_conn".
 */
static ibt_cm_status_t
ibd_rc_handle_req(void *arg, ibd_rc_chan_t **ret_conn,
    ibt_cm_event_t *ibt_cm_event, ibt_cm_return_args_t *ret_args,
    void *ret_priv_data)
{
	ibd_rc_msg_hello_t *hello_msg;
	ibd_state_t *state = (ibd_state_t *)arg;
	ibd_rc_chan_t *chan;

	ibd_rc_handle_req_rm_dup(state, ibt_cm_event);

	if (ibd_rc_alloc_chan(&chan, state, B_FALSE) != IBT_SUCCESS) {
		DPRINT(40, "ibd_rc_handle_req: ibd_rc_alloc_chan() failed");
		return (IBT_CM_REJECT);
	}

	ibd_rc_add_to_chan_list(&state->rc_pass_chan_list, chan);

	ibt_set_chan_private(chan->chan_hdl, (void *)(uintptr_t)chan);

	if (!state->rc_enable_srq) {
		if (ibd_rc_init_rxlist(chan) != DDI_SUCCESS) {
			ibd_rc_free_chan(chan);
			DPRINT(40, "ibd_rc_handle_req: ibd_rc_init_rxlist() "
			    "failed");
			return (IBT_CM_REJECT);
		}
	}

	ret_args->cm_ret.rep.cm_channel = chan->chan_hdl;

	/* We don't do RDMA */
	ret_args->cm_ret.rep.cm_rdma_ra_out = 0;
	ret_args->cm_ret.rep.cm_rdma_ra_in = 0;

	ret_args->cm_ret.rep.cm_rnr_retry_cnt = 7;
	ret_args->cm_ret_len = sizeof (ibd_rc_msg_hello_t);

	hello_msg = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
	DPRINT(30, "ibd_rc_handle_req(): peer qpn=0x%x, peer mtu=0x%x",
	    ntohl(hello_msg->reserved_qpn), ntohl(hello_msg->rx_mtu));

	hello_msg = (ibd_rc_msg_hello_t *)ret_priv_data;
	hello_msg->reserved_qpn = htonl(state->id_qpnum);
	hello_msg->rx_mtu = htonl(state->rc_mtu);

	chan->requester_gid = ibt_cm_event->cm_event.req.req_prim_addr.av_dgid;
	chan->requester_pkey = ibt_cm_event->cm_event.req.req_pkey;
	chan->chan_state = IBD_RC_STATE_PAS_REQ_RECV;	/* ready to receive */
	*ret_conn = chan;

	return (IBT_CM_ACCEPT);
}

/*
 * ibd_rc_handle_act_estab -- handler for the connection established
 * event on the active side.
 */
static ibt_cm_status_t
ibd_rc_handle_act_estab(ibd_ace_t *ace)
{
	ibt_status_t result;

	switch (ace->ac_chan->chan_state) {
		case IBD_RC_STATE_ACT_REP_RECV:
			ace->ac_chan->chan_state = IBD_RC_STATE_ACT_ESTAB;
			result = ibt_enable_cq_notify(ace->ac_chan->rcq_hdl,
			    IBT_NEXT_COMPLETION);
			if (result != IBT_SUCCESS) {
				DPRINT(40, "ibd_rc_handle_act_estab: "
				    "ibt_enable_cq_notify(rcq) "
				    "failed: status %d", result);
				return (IBT_CM_REJECT);
			}
			break;
		default:
			DPRINT(40, "ibd_rc_handle_act_estab: default "
			    "branch, act_state=%d", ace->ac_chan->chan_state);
			return (IBT_CM_REJECT);
	}
	return (IBT_CM_ACCEPT);
}

/*
 * ibd_rc_handle_pas_estab -- handler for the connection established
 * event on the passive side.
 */
static ibt_cm_status_t
ibd_rc_handle_pas_estab(ibd_rc_chan_t *chan)
{
	ibt_status_t result;

	switch (chan->chan_state) {
		case IBD_RC_STATE_PAS_REQ_RECV:
			chan->chan_state = IBD_RC_STATE_PAS_ESTAB;

			result = ibt_enable_cq_notify(chan->rcq_hdl,
			    IBT_NEXT_COMPLETION);
			if (result != IBT_SUCCESS) {
				DPRINT(40, "ibd_rc_handle_pas_estab: "
				    "ibt_enable_cq_notify(rcq) "
				    "failed: status %d", result);
				return (IBT_CM_REJECT);
			}
			break;
		default:
			DPRINT(40, "ibd_rc_handle_pas_estab: default "
			    "branch, chan_state=%d", chan->chan_state);
			return (IBT_CM_REJECT);
	}
	return (IBT_CM_ACCEPT);
}

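/*
 * CM handler for the active side of an RC connection. Dispatches the
 * REP received, connection established, connection closed and failure
 * events for the channel associated with "arg" (the ace).
 */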
/* ARGSUSED */
static ibt_cm_status_t
ibd_rc_dispatch_actv_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
    ibt_cm_return_args_t *ret_args, void *ret_priv_data,
    ibt_priv_data_len_t ret_len_max)
{
	ibt_cm_status_t result = IBT_CM_ACCEPT;
	ibd_ace_t *ace = (ibd_ace_t *)(uintptr_t)arg;
	ibd_rc_chan_t *rc_chan;
	ibd_state_t *state;
	ibd_rc_msg_hello_t *hello_ack;
	uint_t times;

	switch (ibt_cm_event->cm_type) {
	case IBT_CM_EVENT_REP_RCV:
		ASSERT(ace->ac_chan != NULL);
		ASSERT(ace->ac_chan->chan_state == IBD_RC_STATE_INIT);
		hello_ack = (ibd_rc_msg_hello_t *)ibt_cm_event->cm_priv_data;
		DPRINT(30, "ibd_rc_handle_rep: hello_ack->mtu=0x%x, "
		    "hello_ack->qpn=0x%x", ntohl(hello_ack->rx_mtu),
		    ntohl(hello_ack->reserved_qpn));
		ace->ac_chan->chan_state = IBD_RC_STATE_ACT_REP_RECV;
		break;

	case IBT_CM_EVENT_CONN_EST:
		ASSERT(ace->ac_chan != NULL);
		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_CONN_EST, "
		    "ace=%p, act_state=%d, chan=%p",
		    ace, ace->ac_chan->chan_state, ace->ac_chan);
		result = ibd_rc_handle_act_estab(ace);
		break;

	case IBT_CM_EVENT_CONN_CLOSED:
		rc_chan = ace->ac_chan;
		if (rc_chan == NULL) {
			DPRINT(40, "ibd_rc_dispatch_actv_mad: "
			    "rc_chan==NULL, IBT_CM_EVENT_CONN_CLOSED");
			return (IBT_CM_ACCEPT);
		}
		state = rc_chan->state;
		mutex_enter(&state->id_ac_mutex);
		if ((rc_chan->chan_state == IBD_RC_STATE_ACT_ESTAB) &&
		    ((ace = ibd_acache_find(state, &ace->ac_mac, B_FALSE, 0))
		    != NULL) && (ace == rc_chan->ace)) {
			rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSING;
			ASSERT(ace->ac_mce == NULL);
			INC_REF(ace, 1);
			IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
			mutex_exit(&state->id_ac_mutex);
			DPRINT(30, "ibd_rc_dispatch_actv_mad: "
			    "IBT_CM_EVENT_CONN_CLOSED, ace=%p, chan=%p, "
			    "reason=%d", ace, rc_chan,
			    ibt_cm_event->cm_event.closed);
		} else {
			mutex_exit(&state->id_ac_mutex);
			state->rc_act_close_simultaneous++;
			DPRINT(40, "ibd_rc_dispatch_actv_mad: other thread "
			    "is closing it, IBT_CM_EVENT_CONN_CLOSED, "
			    "chan_state=%d", rc_chan->chan_state);
			return (IBT_CM_ACCEPT);
		}
		/* wait until the send queue is drained */
		times = 0;
		mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
		mutex_enter(&rc_chan->tx_rel_list.dl_mutex);
		while (((rc_chan->tx_wqe_list.dl_cnt +
		    rc_chan->tx_rel_list.dl_cnt)
		    != rc_chan->scq_size) && (times < 50)) {
			DPRINT(40, "ibd_rc_dispatch_act_mad: dl_cnt"
			    "(tx_wqe_list=%d, tx_rel_list=%d) != "
			    "chan->scq_size=%d",
			    rc_chan->tx_wqe_list.dl_cnt,
			    rc_chan->tx_rel_list.dl_cnt,
			    rc_chan->scq_size);
			mutex_exit(&rc_chan->tx_rel_list.dl_mutex);
			mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
			mutex_enter(&rc_chan->tx_poll_lock);
			if (rc_chan->tx_poll_busy & IBD_CQ_POLLING) {
				DPRINT(40, "ibd_rc_dispatch_actv_mad: "
				    "multiple polling threads");
				mutex_exit(&rc_chan->tx_poll_lock);
			} else {
				rc_chan->tx_poll_busy = IBD_CQ_POLLING;
				mutex_exit(&rc_chan->tx_poll_lock);
				ibd_rc_drain_scq(rc_chan, rc_chan->scq_hdl);
				mutex_enter(&rc_chan->tx_poll_lock);
				rc_chan->tx_poll_busy = 0;
				mutex_exit(&rc_chan->tx_poll_lock);
			}
			delay(drv_usectohz(100000));
			times++;
			mutex_enter(&rc_chan->tx_wqe_list.dl_mutex);
			mutex_enter(&rc_chan->tx_rel_list.dl_mutex);
		}
		mutex_exit(&rc_chan->tx_rel_list.dl_mutex);
		mutex_exit(&rc_chan->tx_wqe_list.dl_mutex);
		rc_chan->chan_state = IBD_RC_STATE_ACT_CLOSED;
		ibd_rc_free_chan(rc_chan);
		DPRINT(30, "ibd_rc_dispatch_actv_mad: "
		    "IBT_CM_EVENT_CONN_CLOSED, ref=%x", ace->ac_ref);
		mutex_enter(&state->id_ac_mutex);
		ace->ac_chan = NULL;
		ASSERT(ace->ac_ref != 0);
		atomic_dec_32(&ace->ac_ref);
		if ((ace->ac_ref == 0) || (ace->ac_ref == CYCLEVAL)) {
			IBD_ACACHE_INSERT_FREE(state, ace);
			ace->ac_ref = 0;
		} else {
			ace->ac_ref |= CYCLEVAL;
			state->rc_delay_ace_recycle++;
		}
		mutex_exit(&state->id_ac_mutex);
		break;

	case IBT_CM_EVENT_FAILURE:
		DPRINT(30, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_FAILURE,"
		    "ace=%p, chan=%p, code: %d, msg: %d, reason=%d",
		    ace, ace->ac_chan,
		    ibt_cm_event->cm_event.failed.cf_code,
		    ibt_cm_event->cm_event.failed.cf_msg,
		    ibt_cm_event->cm_event.failed.cf_reason);
		/*
		 * No need to free resources here; they are freed in
		 * ibd_rc_connect().
		 */
		break;

	case IBT_CM_EVENT_MRA_RCV:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: IBT_CM_EVENT_MRA_RCV");
		break;
	case IBT_CM_EVENT_LAP_RCV:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: LAP message received");
		break;
	case IBT_CM_EVENT_APR_RCV:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: APR message received");
		break;
	default:
		DPRINT(40, "ibd_rc_dispatch_actv_mad: default branch, "
		    "ibt_cm_event->cm_type=%d", ibt_cm_event->cm_type);
		break;
	}

	return (result);
}

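/*
 * CM handler for the passive side of an RC connection. Handles the
 * incoming REQ (via ibd_rc_handle_req()), connection established,
 * connection closed and failure events.
 */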
/* ARGSUSED */
static ibt_cm_status_t
ibd_rc_dispatch_pass_mad(void *arg, ibt_cm_event_t *ibt_cm_event,
    ibt_cm_return_args_t *ret_args, void *ret_priv_data,
    ibt_priv_data_len_t ret_len_max)
{
	ibt_cm_status_t result = IBT_CM_ACCEPT;
	ibd_rc_chan_t *chan;

	if (ibt_cm_event->cm_type == IBT_CM_EVENT_REQ_RCV) {
		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_REQ_RCV,"
		    "req_pkey=%x", ibt_cm_event->cm_event.req.req_pkey);
		/* Receive an incoming CM REQ from active side */
		result = ibd_rc_handle_req(arg, &chan, ibt_cm_event, ret_args,
		    ret_priv_data);
		return (result);
	}

	if (ibt_cm_event->cm_channel == 0) {
		DPRINT(30, "ibd_rc_dispatch_pass_mad: "
		    "ERROR ibt_cm_event->cm_channel == 0");
		return (IBT_CM_REJECT);
	}

	chan =
	    (ibd_rc_chan_t *)ibt_get_chan_private(ibt_cm_event->cm_channel);
	if (chan == NULL) {
		DPRINT(40, "ibd_rc_dispatch_pass_mad: conn == 0");
		return (IBT_CM_REJECT);
	}

	switch (ibt_cm_event->cm_type) {
	case IBT_CM_EVENT_CONN_EST:
		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_EST, "
		    "chan=%p", chan);
		result = ibd_rc_handle_pas_estab(chan);
		break;
	case IBT_CM_EVENT_CONN_CLOSED:
		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_CONN_CLOSED,"
		    " chan=%p, reason=%d", chan, ibt_cm_event->cm_event.closed);
		ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list, chan);
		ibd_rc_free_chan(chan);
		break;
	case IBT_CM_EVENT_FAILURE:
		DPRINT(30, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_FAILURE,"
		    " chan=%p, code: %d, msg: %d, reason=%d", chan,
		    ibt_cm_event->cm_event.failed.cf_code,
		    ibt_cm_event->cm_event.failed.cf_msg,
		    ibt_cm_event->cm_event.failed.cf_reason);

		ibd_rc_rm_from_chan_list(&chan->state->rc_pass_chan_list, chan);
		ibd_rc_free_chan(chan);
		return (IBT_CM_ACCEPT);
	case IBT_CM_EVENT_MRA_RCV:
		DPRINT(40, "ibd_rc_dispatch_pass_mad: IBT_CM_EVENT_MRA_RCV");
		break;
	case IBT_CM_EVENT_LAP_RCV:
		DPRINT(40, "ibd_rc_dispatch_pass_mad: LAP message received");
		break;
	case IBT_CM_EVENT_APR_RCV:
		DPRINT(40, "ibd_rc_dispatch_pass_mad: APR message received");
		break;
	default:
		DPRINT(40, "ibd_rc_dispatch_pass_mad: default, type=%d, "
		    "chan=%p", ibt_cm_event->cm_type, chan);
		break;
	}

	return (result);
}