usr/src/uts/common/io/ib/mgt/ibcm/ibcm_impl.c

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * ibcm_impl.c
 *
 * Contains internal functions of the IB CM module.
 *
 * TBD:
 * 1. HCA CATASTROPHIC/RECOVERED not handled yet
 */

#include <sys/ib/mgt/ibcm/ibcm_impl.h>
#include <sys/disp.h>


/* function prototypes */
static ibcm_status_t	ibcm_init(void);
static ibcm_status_t	ibcm_fini(void);

/* Routines to initialize and destroy CM global locks and CVs */
static void		ibcm_init_locks(void);
static void		ibcm_fini_locks(void);

/* Routines that initialize/tear down CM's global hca structures */
static void		ibcm_init_hcas();
static ibcm_status_t	ibcm_fini_hcas();

static void		ibcm_init_classportinfo();
static void		ibcm_stop_timeout_thread();

/* Routines that handle HCA attach/detach asyncs */
static void		ibcm_hca_attach(ib_guid_t);
static ibcm_status_t	ibcm_hca_detach(ibcm_hca_info_t *);

/* Routines that initialize the HCA's port related fields */
static ibt_status_t	ibcm_hca_init_port(ibcm_hca_info_t *hcap,
			    uint8_t port_index);
static ibcm_status_t	ibcm_hca_fini_port(ibcm_hca_info_t *hcap,
			    uint8_t port_index);

static void ibcm_rc_flow_control_init(void);
static void ibcm_rc_flow_control_fini(void);

/*
 * Routines that check if hca's avl trees and sidr lists are free of any
 * active client resources, i.e., RC or UD state structures in certain states
 */
static ibcm_status_t	ibcm_check_avl_clean(ibcm_hca_info_t *hcap);
static ibcm_status_t	ibcm_check_sidr_clean(ibcm_hca_info_t *hcap);

/* Add a new hca structure to CM's global hca list */
static ibcm_hca_info_t	*ibcm_add_hca_entry(ib_guid_t hcaguid, uint_t nports);

static void		ibcm_comm_est_handler(ibt_async_event_t *);
void			ibcm_async_handler(void *, ibt_hca_hdl_t,
			    ibt_async_code_t, ibt_async_event_t *);

/* Global variables */
char			cmlog[] = "ibcm";	/* for debug log messages */
ibt_clnt_hdl_t		ibcm_ibt_handle;	/* IBT handle */
kmutex_t		ibcm_svc_info_lock;	/* list lock */
kcondvar_t		ibcm_svc_info_cv;	/* cv for deregister */
kmutex_t		ibcm_recv_mutex;
avl_tree_t		ibcm_svc_avl_tree;
taskq_t			*ibcm_taskq = NULL;
int			taskq_dispatch_fail_cnt;

kmutex_t		ibcm_trace_mutex;	/* Trace mutex */
kmutex_t		ibcm_trace_print_mutex;	/* Trace print mutex */
int			ibcm_conn_max_trcnt = IBCM_MAX_CONN_TRCNT;

int			ibcm_enable_trace = 2;	/* Trace level 2 by default */
int			ibcm_dtrace = 0; /* conditionally enable more dtrace */

_NOTE(MUTEX_PROTECTS_DATA(ibcm_svc_info_lock, ibcm_svc_info_s::{svc_bind_list
    svc_ref_cnt svc_to_delete}))

_NOTE(MUTEX_PROTECTS_DATA(ibcm_svc_info_lock, ibcm_svc_bind_s::{sbind_link}))

_NOTE(MUTEX_PROTECTS_DATA(ibcm_trace_mutex, ibcm_conn_trace_s))

_NOTE(DATA_READABLE_WITHOUT_LOCK(ibcm_conn_trace_s))

_NOTE(MUTEX_PROTECTS_DATA(ibcm_trace_print_mutex, ibcm_debug_buf))

_NOTE(DATA_READABLE_WITHOUT_LOCK(ibcm_debug_buf))

/*
 * Initial state is INIT. All HCA DR events return success immediately in
 * this state, without adding or deleting any HCAs in CM.
 */
ibcm_finit_state_t	ibcm_finit_state = IBCM_FINIT_INIT;

/* mutex and cv to manage hca's reference and resource count(s) */
kmutex_t		ibcm_global_hca_lock;
kcondvar_t		ibcm_global_hca_cv;

/* mutex and cv to manage SA session open */
kmutex_t		ibcm_sa_open_lock;
kcondvar_t		ibcm_sa_open_cv;
int			ibcm_sa_timeout_delay = 1;		/* in ticks */
_NOTE(MUTEX_PROTECTS_DATA(ibcm_sa_open_lock,
    ibcm_port_info_s::{port_ibmf_saa_hdl port_saa_open_in_progress}))

_NOTE(DATA_READABLE_WITHOUT_LOCK(ibcm_port_info_s::{port_ibmf_saa_hdl}))

/* serialize sm notice callbacks */
kmutex_t		ibcm_sm_notice_serialize_lock;

_NOTE(LOCK_ORDER(ibcm_sm_notice_serialize_lock ibcm_global_hca_lock))

_NOTE(MUTEX_PROTECTS_DATA(ibcm_global_hca_lock, ibcm_hca_info_s::{hca_state
    hca_svc_cnt hca_acc_cnt hca_res_cnt hca_next}))

_NOTE(MUTEX_PROTECTS_DATA(ibcm_global_hca_lock,
    ibcm_port_info_s::{port_ibmf_hdl}))

_NOTE(MUTEX_PROTECTS_DATA(ibcm_sm_notice_serialize_lock,
    ibcm_port_info_s::{port_event_status}))

_NOTE(DATA_READABLE_WITHOUT_LOCK(ibcm_hca_info_s::{hca_state}))
_NOTE(DATA_READABLE_WITHOUT_LOCK(
    ibcm_hca_info_s::{hca_port_info.port_ibmf_hdl}))

/* mutex for CM's qp list management */
kmutex_t		ibcm_qp_list_lock;

_NOTE(MUTEX_PROTECTS_DATA(ibcm_qp_list_lock, ibcm_port_info_s::{port_qplist}))
_NOTE(MUTEX_PROTECTS_DATA(ibcm_qp_list_lock, ibcm_qp_list_s))

kcondvar_t		ibcm_timeout_list_cv;
kcondvar_t		ibcm_timeout_thread_done_cv;
kt_did_t		ibcm_timeout_thread_did;
ibcm_state_data_t	*ibcm_timeout_list_hdr, *ibcm_timeout_list_tail;
ibcm_ud_state_data_t	*ibcm_ud_timeout_list_hdr, *ibcm_ud_timeout_list_tail;
kmutex_t		ibcm_timeout_list_lock;
uint8_t			ibcm_timeout_list_flags = 0;
pri_t			ibcm_timeout_thread_pri = MINCLSYSPRI;

_NOTE(MUTEX_PROTECTS_DATA(ibcm_timeout_list_lock,
    ibcm_state_data_s::timeout_next))

_NOTE(MUTEX_PROTECTS_DATA(ibcm_timeout_list_lock,
    ibcm_ud_state_data_s::ud_timeout_next))

/*
 * Flow control logic for open_rc_channel uses the following.
 */

struct ibcm_open_s {
	kmutex_t		mutex;
	kcondvar_t		cv;
	uint8_t			task_running;
	uint_t			queued;
	uint_t			exit_deferred;
	uint_t			in_progress;
	uint_t			in_progress_max;
	uint_t			sends;
	uint_t			sends_max;
	uint_t			sends_lowat;
	uint_t			sends_hiwat;
	ibcm_state_data_t	*tail;
	ibcm_state_data_t	head;
} ibcm_open;
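
/*
 * ibcm_open.head is a list sentinel: the open_link field of the last
 * queued statep points back to &ibcm_open.head, so a non-NULL open_link
 * marks a statep as queued, and ibcm_open.tail points at the sentinel
 * itself when the queue is empty (tail is never NULL).
 */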

static void ibcm_open_task(void *);

/*
 * Flow control logic for SA access and close_rc_channel calls follows.
 */

int ibcm_close_simul_max	= 12;
int ibcm_lapr_simul_max		= 12;
int ibcm_saa_simul_max		= 8;

typedef struct ibcm_flow1_s {
	struct ibcm_flow1_s	*link;
	kcondvar_t		cv;
	uint8_t			waiters;	/* 1 to waiters_per_chunk */
} ibcm_flow1_t;

typedef struct ibcm_flow_s {
	ibcm_flow1_t		*list;
	uint_t			simul;	/* #requests currently outstanding */
	uint_t			simul_max;
	uint_t			waiters_per_chunk;
	uint_t			lowat;
	uint_t			lowat_default;
	/* statistics */
	uint_t			total;
} ibcm_flow_t;

ibcm_flow_t ibcm_saa_flow;
ibcm_flow_t ibcm_close_flow;
ibcm_flow_t ibcm_lapr_flow;

/* NONBLOCKING close requests are queued */
struct ibcm_close_s {
	kmutex_t		mutex;
	ibcm_state_data_t	*tail;
	ibcm_state_data_t	head;
} ibcm_close;
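
/*
 * ibcm_close uses the same head-sentinel/tail convention as ibcm_open
 * above, except that the close_link field of the last queued statep is
 * NULL (see ibcm_close_enqueue() and ibcm_check_for_async_close()).
 */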

static ibt_clnt_modinfo_t ibcm_ibt_modinfo = {	/* Client's modinfop */
	IBTI_V_CURR,
	IBT_CM,
	ibcm_async_handler,
	NULL,
	"IBCM"
};

/* IBCM's list of HCAs registered with it */
static ibcm_hca_info_t	*ibcm_hca_listp = NULL;	/* CM's HCA list */

/* Array of CM state call table functions */
ibcm_state_handler_t	ibcm_sm_funcs_tbl[] = {
	ibcm_process_req_msg,
	ibcm_process_mra_msg,
	ibcm_process_rej_msg,
	ibcm_process_rep_msg,
	ibcm_process_rtu_msg,
	ibcm_process_dreq_msg,
	ibcm_process_drep_msg,
	ibcm_process_sidr_req_msg,
	ibcm_process_sidr_rep_msg,
	ibcm_process_lap_msg,
	ibcm_process_apr_msg
};

/* the following globals are CM tunables */
ibt_rnr_nak_time_t	ibcm_default_rnr_nak_time = IBT_RNR_NAK_655ms;

uint32_t	ibcm_max_retries = IBCM_MAX_RETRIES;
clock_t		ibcm_local_processing_time = IBCM_LOCAL_RESPONSE_TIME;
clock_t		ibcm_remote_response_time = IBCM_REMOTE_RESPONSE_TIME;
ib_time_t	ibcm_max_sidr_rep_proctime = IBCM_MAX_SIDR_PROCESS_TIME;
ib_time_t	ibcm_max_sidr_pktlife_time = IBCM_MAX_SIDR_PKT_LIFE_TIME;

ib_time_t	ibcm_max_sidr_rep_store_time = 18;
uint32_t	ibcm_wait_for_acc_cnt_timeout = 2000000;	/* 2 sec */
uint32_t	ibcm_wait_for_res_cnt_timeout = 2000000;	/* 2 sec */

ib_time_t	ibcm_max_ib_pkt_lt = IBCM_MAX_IB_PKT_LT;
ib_time_t	ibcm_max_ib_mad_pkt_lt = IBCM_MAX_IB_MAD_PKT_LT;

/*
 * This delay accounts for the time involved in various activities:
 *
 * IBMF delays for posting the MADs in non-blocking mode
 * IBMF delays for receiving the MADs and delivering them to CM
 * CM delays in processing the MADs before invoking client handlers
 * Any other delays associated with the HCA driver in processing the MADs
 *	and with other subsystems that CM may invoke (e.g., SA, HCA driver)
 */
uint32_t	ibcm_sw_delay	= 1000;	/* 1000us / 1ms */
uint32_t	ibcm_max_sa_retries = IBCM_MAX_SA_RETRIES + 1;

/*	approximate boot time adjustment */
uint32_t	ibcm_adj_btime = 4;	/* 4 seconds */

/*
 * The information in ibcm_clpinfo is kept in wire format; it is set up at
 * init time and used read-only after that
 */
ibcm_classportinfo_msg_t	ibcm_clpinfo;

char	*event_str[] = {
	"NEVER SEE THIS             ",
	"SESSION_ID                 ",
	"CHAN_HDL                   ",
	"LOCAL_COMID/HCA/PORT       ",
	"LOCAL_QPN                  ",
	"REMOTE_COMID/HCA           ",
	"REMOTE_QPN                 ",
	"BASE_TIME                  ",
	"INCOMING_REQ               ",
	"INCOMING_REP               ",
	"INCOMING_RTU               ",
	"INCOMING_COMEST            ",
	"INCOMING_MRA               ",
	"INCOMING_REJ               ",
	"INCOMING_LAP               ",
	"INCOMING_APR               ",
	"INCOMING_DREQ              ",
	"INCOMING_DREP              ",
	"OUTGOING_REQ               ",
	"OUTGOING_REP               ",
	"OUTGOING_RTU               ",
	"OUTGOING_LAP               ",
	"OUTGOING_APR               ",
	"OUTGOING_MRA               ",
	"OUTGOING_REJ               ",
	"OUTGOING_DREQ              ",
	"OUTGOING_DREP              ",
	"REQ_POST_COMPLETE          ",
	"REP_POST_COMPLETE          ",
	"RTU_POST_COMPLETE          ",
	"MRA_POST_COMPLETE          ",
	"REJ_POST_COMPLETE          ",
	"LAP_POST_COMPLETE          ",
	"APR_POST_COMPLETE          ",
	"DREQ_POST_COMPLETE         ",
	"DREP_POST_COMPLETE         ",
	"TIMEOUT_REP                ",
	"CALLED_REQ_RCVD_EVENT      ",
	"RET_REQ_RCVD_EVENT         ",
	"CALLED_REP_RCVD_EVENT      ",
	"RET_REP_RCVD_EVENT         ",
	"CALLED_CONN_EST_EVENT      ",
	"RET_CONN_EST_EVENT         ",
	"CALLED_CONN_FAIL_EVENT     ",
	"RET_CONN_FAIL_EVENT        ",
	"CALLED_CONN_CLOSE_EVENT    ",
	"RET_CONN_CLOSE_EVENT       ",
	"INIT_INIT                  ",
	"INIT_INIT_FAIL             ",
	"INIT_RTR                   ",
	"INIT_RTR_FAIL              ",
	"RTR_RTS                    ",
	"RTR_RTS_FAIL               ",
	"RTS_RTS                    ",
	"RTS_RTS_FAIL               ",
	"TO_ERROR                   ",
	"ERROR_FAIL                 ",
	"SET_ALT                    ",
	"SET_ALT_FAIL               ",
	"STALE_DETECT               ",
	"OUTGOING_REQ_RETRY         ",
	"OUTGOING_REP_RETRY         ",
	"OUTGOING_LAP_RETRY         ",
	"OUTGOING_MRA_RETRY         ",
	"OUTGOING_DREQ_RETRY        ",
	"NEVER SEE THIS             "
};

char	ibcm_debug_buf[IBCM_DEBUG_BUF_SIZE];

_NOTE(SCHEME_PROTECTS_DATA("used in a localized function consistently",
    ibcm_debug_buf))
_NOTE(READ_ONLY_DATA(ibcm_taskq))

_NOTE(MUTEX_PROTECTS_DATA(ibcm_timeout_list_lock, ibcm_timeout_list_flags))
_NOTE(MUTEX_PROTECTS_DATA(ibcm_timeout_list_lock, ibcm_timeout_list_hdr))
_NOTE(MUTEX_PROTECTS_DATA(ibcm_timeout_list_lock, ibcm_ud_timeout_list_hdr))

#ifdef DEBUG
int		ibcm_test_mode = 0;	/* set to 1, if running tests */
#endif


/* Module Driver Info */
static struct modlmisc ibcm_modlmisc = {
	&mod_miscops,
	"IB Communication Manager"
};

/* Module Linkage */
static struct modlinkage ibcm_modlinkage = {
	MODREV_1,
	&ibcm_modlmisc,
	NULL
};


int
_init(void)
{
	int		rval;
	ibcm_status_t	status;

	status = ibcm_init();
	if (status != IBCM_SUCCESS) {
		IBTF_DPRINTF_L2(cmlog, "_init: ibcm failed %d", status);
		return (EINVAL);
	}

	rval = mod_install(&ibcm_modlinkage);
	if (rval != 0) {
		IBTF_DPRINTF_L2(cmlog, "_init: ibcm mod_install failed %d",
		    rval);
		(void) ibcm_fini();
		return (rval);
	}

	IBTF_DPRINTF_L5(cmlog, "_init: ibcm successful");
	return (rval);
}


int
_info(struct modinfo *modinfop)
{
	return (mod_info(&ibcm_modlinkage, modinfop));
}


int
_fini(void)
{
	int status;

	if (ibcm_fini() != IBCM_SUCCESS)
		return (EBUSY);

	if ((status = mod_remove(&ibcm_modlinkage)) != 0) {
		IBTF_DPRINTF_L2(cmlog, "_fini: ibcm mod_remove failed %d",
		    status);
		return (status);
	}

	IBTF_DPRINTF_L5(cmlog, "_fini: ibcm successful");

	return (status);
}

/* Initializes all global mutexes and CVs in the cm module */
static void
ibcm_init_locks()
{

	/* Verify CM MAD sizes */
#ifdef DEBUG

	if (ibcm_test_mode > 1) {

		IBTF_DPRINTF_L1(cmlog, "REQ MAD SIZE %d",
		    sizeof (ibcm_req_msg_t));
		IBTF_DPRINTF_L1(cmlog, "REP MAD SIZE %d",
		    sizeof (ibcm_rep_msg_t));
		IBTF_DPRINTF_L1(cmlog, "RTU MAD SIZE %d",
		    sizeof (ibcm_rtu_msg_t));
		IBTF_DPRINTF_L1(cmlog, "MRA MAD SIZE %d",
		    sizeof (ibcm_mra_msg_t));
		IBTF_DPRINTF_L1(cmlog, "REJ MAD SIZE %d",
		    sizeof (ibcm_rej_msg_t));
		IBTF_DPRINTF_L1(cmlog, "LAP MAD SIZE %d",
		    sizeof (ibcm_lap_msg_t));
		IBTF_DPRINTF_L1(cmlog, "APR MAD SIZE %d",
		    sizeof (ibcm_apr_msg_t));
		IBTF_DPRINTF_L1(cmlog, "DREQ MAD SIZE %d",
		    sizeof (ibcm_dreq_msg_t));
		IBTF_DPRINTF_L1(cmlog, "DREP MAD SIZE %d",
		    sizeof (ibcm_drep_msg_t));
		IBTF_DPRINTF_L1(cmlog, "SIDR REQ MAD SIZE %d",
		    sizeof (ibcm_sidr_req_msg_t));
		IBTF_DPRINTF_L1(cmlog, "SIDR REP MAD SIZE %d",
		    sizeof (ibcm_sidr_rep_msg_t));
	}

#endif

	/* Create all global locks within cm module */
	mutex_init(&ibcm_svc_info_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ibcm_timeout_list_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ibcm_global_hca_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ibcm_sa_open_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ibcm_recv_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ibcm_sm_notice_serialize_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ibcm_qp_list_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ibcm_trace_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ibcm_trace_print_mutex, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ibcm_svc_info_cv, NULL, CV_DRIVER, NULL);
	cv_init(&ibcm_timeout_list_cv, NULL, CV_DRIVER, NULL);
	cv_init(&ibcm_timeout_thread_done_cv, NULL, CV_DRIVER, NULL);
	cv_init(&ibcm_global_hca_cv, NULL, CV_DRIVER, NULL);
	cv_init(&ibcm_sa_open_cv, NULL, CV_DRIVER, NULL);
	avl_create(&ibcm_svc_avl_tree, ibcm_svc_compare,
	    sizeof (ibcm_svc_info_t),
	    offsetof(struct ibcm_svc_info_s, svc_link));

	IBTF_DPRINTF_L5(cmlog, "ibcm_init_locks: done");
}

/* Destroys all global mutexes and CVs in the cm module */
static void
ibcm_fini_locks()
{
	/* Destroy all global locks within cm module */
	mutex_destroy(&ibcm_svc_info_lock);
	mutex_destroy(&ibcm_timeout_list_lock);
	mutex_destroy(&ibcm_global_hca_lock);
	mutex_destroy(&ibcm_sa_open_lock);
	mutex_destroy(&ibcm_recv_mutex);
	mutex_destroy(&ibcm_sm_notice_serialize_lock);
	mutex_destroy(&ibcm_qp_list_lock);
	mutex_destroy(&ibcm_trace_mutex);
	mutex_destroy(&ibcm_trace_print_mutex);
	cv_destroy(&ibcm_svc_info_cv);
	cv_destroy(&ibcm_timeout_list_cv);
	cv_destroy(&ibcm_timeout_thread_done_cv);
	cv_destroy(&ibcm_global_hca_cv);
	cv_destroy(&ibcm_sa_open_cv);
	avl_destroy(&ibcm_svc_avl_tree);

	IBTF_DPRINTF_L5(cmlog, "ibcm_fini_locks: done");
}


/* Initialize CM's classport info */
static void
ibcm_init_classportinfo()
{
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(ibcm_clpinfo));

	ibcm_clpinfo.BaseVersion = IBCM_MAD_BASE_VERSION;
	ibcm_clpinfo.ClassVersion = IBCM_MAD_CLASS_VERSION;

	/* For now, CM supports the same capabilities at all ports */
	ibcm_clpinfo.CapabilityMask =
	    h2b16(IBCM_CPINFO_CAP_RC | IBCM_CPINFO_CAP_SIDR);

	/* Bits 0-7 are all 0 for Communication Mgmt Class */

	/* For now, CM has the same respvalue at all ports */
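	/*
	 * RespTimeValue is the 5-bit IB-encoded time t, where the actual
	 * time is 4.096 usec * 2^t; ibt_usec2ib() performs that log2
	 * conversion from microseconds.
	 */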
	ibcm_clpinfo.RespTimeValue_plus =
	    h2b32(ibt_usec2ib(ibcm_local_processing_time) & 0x1f);

	/* For now, redirect fields are set to 0 */
	/* Trap fields are not applicable to CM, hence set to 0 */

	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(ibcm_clpinfo));
	IBTF_DPRINTF_L5(cmlog, "ibcm_init_classportinfo: done");
}

/*
 * ibcm_init():
 * 	- call ibt_attach()
 * 	- create AVL trees
 *	- Attach to HCAs that are already present before
 *	CM got loaded.
 *
 * Arguments:	NONE
 *
 * Return values:
 *	IBCM_SUCCESS - success
 */
static ibcm_status_t
ibcm_init(void)
{
	ibt_status_t	status;
	kthread_t	*t;

	IBTF_DPRINTF_L3(cmlog, "ibcm_init:");

	ibcm_init_classportinfo();

	if (ibcm_init_ids() != IBCM_SUCCESS) {
		IBTF_DPRINTF_L1(cmlog, "ibcm_init: "
		    "fatal error: vmem_create() failed");
		return (IBCM_FAILURE);
	}
	ibcm_init_locks();

	if (ibcm_ar_init() != IBCM_SUCCESS) {
		IBTF_DPRINTF_L1(cmlog, "ibcm_init: "
		    "fatal error: ibcm_ar_init() failed");
		ibcm_fini_ids();
		ibcm_fini_locks();
		return (IBCM_FAILURE);
	}
	ibcm_rc_flow_control_init();

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(ibcm_taskq))
	ibcm_taskq = system_taskq;
	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(ibcm_taskq))

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(ibcm_timeout_list_flags))
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(ibcm_timeout_thread_did))

	/* Start the timeout list processing thread */
	ibcm_timeout_list_flags = 0;
	t = thread_create(NULL, 0, ibcm_process_tlist, 0, 0, &p0, TS_RUN,
	    ibcm_timeout_thread_pri);
	ibcm_timeout_thread_did = t->t_did;

	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(ibcm_timeout_list_flags))
	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(ibcm_timeout_thread_did))

	/*
	 * NOTE: if ibt_attach were done after ibcm_init_hcas, some HCA DR
	 * events could be lost. CM could re-initialize the hca list, but
	 * that is more complicated: some of the lost DR events may be HCA
	 * detaches, which makes hca list re-syncing and locking more
	 * complex.
	 */
	status = ibt_attach(&ibcm_ibt_modinfo, NULL, NULL, &ibcm_ibt_handle);
	if (status != IBT_SUCCESS) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_init(): ibt_attach failed %d",
		    status);
		(void) ibcm_ar_fini();
		ibcm_stop_timeout_thread();
		ibcm_fini_ids();
		ibcm_fini_locks();
		ibcm_rc_flow_control_fini();
		return (IBCM_FAILURE);
	}

	/* Block all HCA attach/detach asyncs */
	mutex_enter(&ibcm_global_hca_lock);

	ibcm_init_hcas();
	ibcm_finit_state = IBCM_FINIT_IDLE;

	ibcm_path_cache_init();

	/* Unblock any waiting HCA DR asyncs in CM */
	mutex_exit(&ibcm_global_hca_lock);

	IBTF_DPRINTF_L4(cmlog, "ibcm_init: done");
	return (IBCM_SUCCESS);
}

/* Allocates and initializes the "per hca" global data in CM */
static void
ibcm_init_hcas()
{
	uint_t	num_hcas = 0;
	ib_guid_t *guid_array;
	int i;

	IBTF_DPRINTF_L5(cmlog, "ibcm_init_hcas:");

	/* Get the number of HCAs */
	num_hcas = ibt_get_hca_list(&guid_array);
	IBTF_DPRINTF_L4(cmlog, "ibcm_init_hcas: ibt_get_hca_list() "
	    "returned %d hcas", num_hcas);

	ASSERT(MUTEX_HELD(&ibcm_global_hca_lock));

	for (i = 0; i < num_hcas; i++)
		ibcm_hca_attach(guid_array[i]);

	if (num_hcas)
		ibt_free_hca_list(guid_array, num_hcas);

	IBTF_DPRINTF_L5(cmlog, "ibcm_init_hcas: done");
}


/*
 * ibcm_fini():
 * 	- Deregister w/ ibt
 * 	- Cleanup IBCM HCA listp
 * 	- Destroy mutexes
 *
 * Arguments:	NONE
 *
 * Return values:
 *	IBCM_SUCCESS - success
 */
static ibcm_status_t
ibcm_fini(void)
{
	ibt_status_t	status;

	IBTF_DPRINTF_L3(cmlog, "ibcm_fini:");

	/*
	 * CM assumes that all general clients have torn down all their
	 * established connections and service registrations, and completed
	 * all pending SIDR operations, before a call to ibcm_fini()
	 */

	if (ibcm_ar_fini() != IBCM_SUCCESS) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_fini: ibcm_ar_fini failed");
		return (IBCM_FAILURE);
	}

	/* cleanup the svcinfo list */
	mutex_enter(&ibcm_svc_info_lock);
	if (avl_first(&ibcm_svc_avl_tree) != NULL) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_fini: "
		    "ibcm_svc_avl_tree is not empty");
		mutex_exit(&ibcm_svc_info_lock);
		return (IBCM_FAILURE);
	}
	mutex_exit(&ibcm_svc_info_lock);

	/* disables any new hca attach/detaches */
	mutex_enter(&ibcm_global_hca_lock);

	ibcm_finit_state = IBCM_FINIT_BUSY;

	if (ibcm_fini_hcas() != IBCM_SUCCESS) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_fini: "
		    "some hca's still have client resources");

		/* First, re-initialize the hcas */
		ibcm_init_hcas();
		/* and then enable the HCA asyncs */
		ibcm_finit_state = IBCM_FINIT_IDLE;
		mutex_exit(&ibcm_global_hca_lock);
		if (ibcm_ar_init() != IBCM_SUCCESS) {
			IBTF_DPRINTF_L1(cmlog, "ibcm_fini:ibcm_ar_init failed");
		}
		return (IBCM_FAILURE);
	}

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(ibcm_timeout_list_hdr))
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(ibcm_ud_timeout_list_hdr))

	ASSERT(ibcm_timeout_list_hdr == NULL);
	ASSERT(ibcm_ud_timeout_list_hdr == NULL);

	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(ibcm_timeout_list_hdr))
	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(ibcm_ud_timeout_list_hdr))

	/* Release any pending asyncs on ibcm_global_hca_lock */
	ibcm_finit_state = IBCM_FINIT_SUCCESS;
	mutex_exit(&ibcm_global_hca_lock);

	ibcm_stop_timeout_thread();

	/*
	 * Detach from IBTL. Waits until all pending asyncs are complete.
	 * The finit state change above releases any waiting hca attach/detach
	 * asyncs
	 */
	status = ibt_detach(ibcm_ibt_handle);

	/* if detach fails, CM didn't free up some resources; just log it */
	if (status != IBT_SUCCESS)
		IBTF_DPRINTF_L1(cmlog, "ibcm_fini: ibt_detach failed %d",
		    status);

	ibcm_rc_flow_control_fini();

	ibcm_path_cache_fini();

	ibcm_fini_ids();
	ibcm_fini_locks();
	IBTF_DPRINTF_L3(cmlog, "ibcm_fini: done");
	return (IBCM_SUCCESS);
}

/* This routine exits the ibcm timeout thread */
static void
ibcm_stop_timeout_thread()
{
	mutex_enter(&ibcm_timeout_list_lock);

	/* Stop the timeout list processing thread */
	ibcm_timeout_list_flags =
	    ibcm_timeout_list_flags | IBCM_TIMEOUT_THREAD_EXIT;

	/* Wake up, if the timeout thread is on a cv_wait */
	cv_signal(&ibcm_timeout_list_cv);

	mutex_exit(&ibcm_timeout_list_lock);
	thread_join(ibcm_timeout_thread_did);

	IBTF_DPRINTF_L5(cmlog, "ibcm_stop_timeout_thread: done");
}


/* Attempts to release all the hca's associated with CM */
static ibcm_status_t
ibcm_fini_hcas()
{
	ibcm_hca_info_t *hcap, *next;

	IBTF_DPRINTF_L4(cmlog, "ibcm_fini_hcas:");

	ASSERT(MUTEX_HELD(&ibcm_global_hca_lock));

	hcap = ibcm_hca_listp;
	while (hcap != NULL) {
		next = hcap->hca_next;
		if (ibcm_hca_detach(hcap) != IBCM_SUCCESS) {
			ibcm_hca_listp = hcap;
			return (IBCM_FAILURE);
		}
		hcap = next;
	}

	IBTF_DPRINTF_L4(cmlog, "ibcm_fini_hcas: SUCCEEDED");
	return (IBCM_SUCCESS);
}


/*
 * ibcm_hca_attach():
 *	Called as an asynchronous event to notify CM of an attach of HCA.
 *	Here ibcm_hca_info_t is initialized and all fields are
 *	filled in along with SA Access handles and IBMA handles.
 *	Also called from ibcm_init to initialize an ibcm_hca_info_t for
 *	each HCA
 *
 * Arguments: (WILL CHANGE BASED ON ASYNC EVENT CODE)
 *	hca_guid	- HCA's guid
 *
 * Return values: NONE
 */
static void
ibcm_hca_attach(ib_guid_t hcaguid)
{
	int			i;
	ibt_status_t		status;
	uint_t			nports = 0;
	ibcm_hca_info_t		*hcap;
	ibt_hca_attr_t		hca_attrs;

	IBTF_DPRINTF_L3(cmlog, "ibcm_hca_attach: guid = 0x%llX", hcaguid);

	ASSERT(MUTEX_HELD(&ibcm_global_hca_lock));

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*hcap))

	status = ibt_query_hca_byguid(hcaguid, &hca_attrs);
	if (status != IBT_SUCCESS) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_hca_attach: "
		    "ibt_query_hca_byguid failed = %d", status);
		return;
	}
	nports = hca_attrs.hca_nports;

	IBTF_DPRINTF_L4(cmlog, "ibcm_hca_attach: num ports = %x", nports);

	if ((hcap = ibcm_add_hca_entry(hcaguid, nports)) == NULL)
		return;

	hcap->hca_guid = hcaguid;	/* Set GUID */
	hcap->hca_num_ports = nports;	/* Set number of ports */

	if (ibcm_init_hca_ids(hcap) != IBCM_SUCCESS) {
		ibcm_delete_hca_entry(hcap);
		return;
	}

	/* Store the static hca attribute data */
	hcap->hca_caps = hca_attrs.hca_flags;
	hcap->hca_vendor_id = hca_attrs.hca_vendor_id;
	hcap->hca_device_id = hca_attrs.hca_device_id;
	hcap->hca_ack_delay = hca_attrs.hca_local_ack_delay;
	hcap->hca_max_rdma_in_qp = hca_attrs.hca_max_rdma_in_qp;
	hcap->hca_max_rdma_out_qp = hca_attrs.hca_max_rdma_out_qp;

	/* loop thru nports and initialize IBMF handles */
	for (i = 0; i < hcap->hca_num_ports; i++) {
		status = ibt_get_port_state_byguid(hcaguid, i + 1, NULL, NULL);
		if (status != IBT_SUCCESS) {
			IBTF_DPRINTF_L2(cmlog, "ibcm_hca_attach: "
			    "port_num %d state DOWN", i + 1);
		}

		hcap->hca_port_info[i].port_hcap = hcap;
		hcap->hca_port_info[i].port_num = i+1;

		if ((status = ibcm_hca_init_port(hcap, i)) != IBT_SUCCESS)
			IBTF_DPRINTF_L2(cmlog, "ibcm_hca_attach: "
			    "ibcm_hca_init_port failed %d port_num %d",
			    status, i+1);
	}

	/* create the "active" CM AVL tree */
	avl_create(&hcap->hca_active_tree, ibcm_active_node_compare,
	    sizeof (ibcm_state_data_t),
	    offsetof(struct ibcm_state_data_s, avl_active_link));

	/* create the "passive" CM AVL tree */
	avl_create(&hcap->hca_passive_tree, ibcm_passive_node_compare,
	    sizeof (ibcm_state_data_t),
	    offsetof(struct ibcm_state_data_s, avl_passive_link));

	/* create the "passive comid" CM AVL tree */
	avl_create(&hcap->hca_passive_comid_tree,
	    ibcm_passive_comid_node_compare,
	    sizeof (ibcm_state_data_t),
	    offsetof(struct ibcm_state_data_s, avl_passive_comid_link));

	/*
	 * Mark the state of the HCA to "attach" only at the end
	 * Now CM starts accepting incoming MADs and client API calls
	 */
	hcap->hca_state = IBCM_HCA_ACTIVE;

	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*hcap))

	IBTF_DPRINTF_L3(cmlog, "ibcm_hca_attach: ATTACH Done");
}

/*
 * ibcm_hca_detach():
 *	Called as an asynchronous event to notify CM of a detach of HCA.
 *	Here ibcm_hca_info_t is freed up and all fields that
 *	were initialized earlier are cleaned up
 *
 * Arguments: (WILL CHANGE BASED ON ASYNC EVENT CODE)
 *	hca_guid    - HCA's guid
 *
 * Return values:
 *	IBCM_SUCCESS	- able to detach HCA
 *	IBCM_FAILURE	- failed to detach HCA
 */
static ibcm_status_t
ibcm_hca_detach(ibcm_hca_info_t *hcap)
{
	int		port_index, i;
	ibcm_status_t	status = IBCM_SUCCESS;
	clock_t		absolute_time;

	IBTF_DPRINTF_L3(cmlog, "ibcm_hca_detach: hcap = 0x%p guid = 0x%llX",
	    hcap, hcap->hca_guid);

	ASSERT(MUTEX_HELD(&ibcm_global_hca_lock));

	/*
	 * Declare to all CM clients that this hca is going away. Wait
	 * until the access count becomes zero.
	 */
	hcap->hca_state = IBCM_HCA_NOT_ACTIVE;

	/* wait on response CV */
	absolute_time = ddi_get_lbolt() +
	    drv_usectohz(ibcm_wait_for_acc_cnt_timeout);
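	/*
	 * absolute_time is an absolute deadline in lbolt ticks, so spurious
	 * wakeups re-enter cv_timedwait() with the same deadline;
	 * cv_timedwait() returns -1 once that deadline has passed.
	 */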

	while (hcap->hca_acc_cnt > 0)
		if (cv_timedwait(&ibcm_global_hca_cv, &ibcm_global_hca_lock,
		    absolute_time) == -1)
			break;

	if (hcap->hca_acc_cnt != 0) {
		/* We got a timeout */
		IBTF_DPRINTF_L2(cmlog, "ibcm_hca_detach: Aborting due"
		    " to timeout on hca_acc_cnt %u, \n Some CM Clients are "
		    "still active, looks like we need to wait some more time "
		    "(ibcm_wait_for_acc_cnt_timeout).", hcap->hca_acc_cnt);
		hcap->hca_state = IBCM_HCA_ACTIVE;
		return (IBCM_FAILURE);
	}

	/*
	 * First make sure there are no active users of ibma handles,
	 * and then de-register the handles.
	 */

	/* make sure that there are no "Service"s registered w/ this HCA. */
	if (hcap->hca_svc_cnt != 0) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_hca_detach: "
		    "Active services still there %d", hcap->hca_svc_cnt);
		hcap->hca_state = IBCM_HCA_ACTIVE;
		return (IBCM_FAILURE);
	}

	if (ibcm_check_sidr_clean(hcap) != IBCM_SUCCESS) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_hca_detach:"
		    "There are active SIDR operations");
		hcap->hca_state = IBCM_HCA_ACTIVE;
		return (IBCM_FAILURE);
	}

	if (ibcm_check_avl_clean(hcap) != IBCM_SUCCESS) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_hca_detach: "
		    "There are active RC connections");
		hcap->hca_state = IBCM_HCA_ACTIVE;
		return (IBCM_FAILURE);
	}

	/*
	 * Now, wait until all rc and sidr stateps go away
	 * All these stateps must be short lived ones, waiting to be cleaned
	 * up after some timeout value, based on the current state.
	 */
	IBTF_DPRINTF_L3(cmlog, "ibcm_hca_detach:hca_guid = 0x%llX res_cnt = %d",
	    hcap->hca_guid, hcap->hca_res_cnt);

	/* wait on response CV */
	absolute_time = ddi_get_lbolt() +
	    drv_usectohz(ibcm_wait_for_res_cnt_timeout);

	while (hcap->hca_res_cnt > 0)
		if (cv_timedwait(&ibcm_global_hca_cv, &ibcm_global_hca_lock,
		    absolute_time) == -1)
			break;

	if (hcap->hca_res_cnt != 0) {
		/* We got a timeout waiting for hca_res_cnt to become 0 */
		IBTF_DPRINTF_L2(cmlog, "ibcm_hca_detach: Aborting due"
		    " to timeout on res_cnt %d, \n Some CM connections are "
		    "still in transient state, looks like we need to wait "
		    "some more time (ibcm_wait_for_res_cnt_timeout).",
		    hcap->hca_res_cnt);
		hcap->hca_state = IBCM_HCA_ACTIVE;
		return (IBCM_FAILURE);
	}

	/* The waits above ensure that all stateps are gone by now */
	ASSERT(hcap->hca_sidr_list == NULL);
	avl_destroy(&hcap->hca_active_tree);
	avl_destroy(&hcap->hca_passive_tree);
	avl_destroy(&hcap->hca_passive_comid_tree);

	/*
	 * Unregister all ports from IBMA
	 * If there is a failure, re-initialize any freed ibma handles. This
	 * is required to receive the incoming mads
	 */
	status = IBCM_SUCCESS;
	for (port_index = 0; port_index < hcap->hca_num_ports; port_index++) {
		if ((status = ibcm_hca_fini_port(hcap, port_index)) !=
		    IBCM_SUCCESS) {
			IBTF_DPRINTF_L2(cmlog, "ibcm_hca_detach: "
			    "Failed to free IBMA Handle for port_num %d",
			    port_index + 1);
			break;
		}
	}

	/* If detach fails, re-initialize ibma handles for incoming mads */
	if (status != IBCM_SUCCESS)  {
		for (i = 0; i < port_index; i++) {
			if (ibcm_hca_init_port(hcap, i) != IBT_SUCCESS)
				IBTF_DPRINTF_L2(cmlog, "ibcm_hca_detach: "
				    "Failed to re-allocate IBMA Handles for"
				    " port_num %d", port_index + 1);
		}
		hcap->hca_state = IBCM_HCA_ACTIVE;
		return (IBCM_FAILURE);
	}

	ibcm_fini_hca_ids(hcap);
	ibcm_delete_hca_entry(hcap);

	IBTF_DPRINTF_L3(cmlog, "ibcm_hca_detach: DETACH succeeded");
	return (IBCM_SUCCESS);
}

/* Checks if there are any active sidr state entries in the specified hca */
static ibcm_status_t
ibcm_check_sidr_clean(ibcm_hca_info_t *hcap)
{
	ibcm_ud_state_data_t	*usp;
	uint32_t		transient_cnt = 0;

	IBTF_DPRINTF_L5(cmlog, "ibcm_check_sidr_clean:");

	rw_enter(&hcap->hca_sidr_list_lock, RW_WRITER);
	usp = hcap->hca_sidr_list;	/* Point to the list */
	while (usp != NULL) {
		mutex_enter(&usp->ud_state_mutex);
		if ((usp->ud_state != IBCM_STATE_SIDR_REP_SENT) &&
		    (usp->ud_state != IBCM_STATE_TIMED_OUT) &&
		    (usp->ud_state != IBCM_STATE_DELETE)) {

			IBTF_DPRINTF_L3(cmlog, "ibcm_check_sidr_clean:"
			    "usp = %p not in transient state = %d", usp,
			    usp->ud_state);

			mutex_exit(&usp->ud_state_mutex);
			rw_exit(&hcap->hca_sidr_list_lock);
			return (IBCM_FAILURE);
		} else {
			mutex_exit(&usp->ud_state_mutex);
			++transient_cnt;
		}

		usp = usp->ud_nextp;
	}
	rw_exit(&hcap->hca_sidr_list_lock);

	IBTF_DPRINTF_L4(cmlog, "ibcm_check_sidr_clean: transient_cnt %d",
	    transient_cnt);

	return (IBCM_SUCCESS);
}

/* Checks if there are any active rc state entries in the specified hca */
static ibcm_status_t
ibcm_check_avl_clean(ibcm_hca_info_t *hcap)
{
	ibcm_state_data_t	*sp;
	avl_tree_t		*avl_tree;
	uint32_t		transient_cnt = 0;

	IBTF_DPRINTF_L5(cmlog, "ibcm_check_avl_clean:");
	/*
	 * Both trees, i.e., active and passive, reference all stateps,
	 * so checking either one suffices
	 */
	avl_tree = &hcap->hca_active_tree;

	rw_enter(&hcap->hca_state_rwlock, RW_WRITER);

	for (sp = avl_first(avl_tree); sp != NULL;
	    sp = avl_walk(avl_tree, sp, AVL_AFTER)) {
		mutex_enter(&sp->state_mutex);
		if ((sp->state != IBCM_STATE_TIMEWAIT) &&
		    (sp->state != IBCM_STATE_REJ_SENT) &&
		    (sp->state != IBCM_STATE_DELETE)) {
			IBTF_DPRINTF_L3(cmlog, "ibcm_check_avl_clean: "
			    "sp = %p not in transient state = %d", sp,
			    sp->state);
			mutex_exit(&sp->state_mutex);
			rw_exit(&hcap->hca_state_rwlock);
			return (IBCM_FAILURE);
		} else {
			mutex_exit(&sp->state_mutex);
			++transient_cnt;
		}
	}

	rw_exit(&hcap->hca_state_rwlock);

	IBTF_DPRINTF_L4(cmlog, "ibcm_check_avl_clean: transient_cnt %d",
	    transient_cnt);

	return (IBCM_SUCCESS);
}

/* Adds a new entry into CM's global hca list, if hca_guid is not there yet */
static ibcm_hca_info_t *
ibcm_add_hca_entry(ib_guid_t hcaguid, uint_t nports)
{
	ibcm_hca_info_t	*hcap;

	IBTF_DPRINTF_L5(cmlog, "ibcm_add_hca_entry: guid = 0x%llX",
	    hcaguid);

	ASSERT(MUTEX_HELD(&ibcm_global_hca_lock));

	/*
	 * Check if this hca_guid is already in the list.
	 * If yes, then ignore this and return NULL
	 */

	hcap = ibcm_hca_listp;

	/* search for this HCA */
	while (hcap != NULL) {
		if (hcap->hca_guid == hcaguid) {
			/* already exists */
			IBTF_DPRINTF_L2(cmlog, "ibcm_add_hca_entry: "
			    "hcap %p guid 0x%llX, entry already exists !!",
			    hcap, hcap->hca_guid);
			return (NULL);
		}
		hcap = hcap->hca_next;
	}

	/* Allocate storage for the new HCA entry */
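	/*
	 * ibcm_hca_info_t already embeds one ibcm_port_info_t, so only
	 * (nports - 1) additional port entries need to be appended.
	 */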
	hcap = kmem_zalloc(sizeof (ibcm_hca_info_t) +
	    (nports - 1) * sizeof (ibcm_port_info_t), KM_SLEEP);

	/* initialize RW lock */
	rw_init(&hcap->hca_state_rwlock, NULL, RW_DRIVER, NULL);
	/* initialize SIDR list lock */
	rw_init(&hcap->hca_sidr_list_lock, NULL, RW_DRIVER, NULL);
	/* Insert "hcap" into the global HCA list maintained by CM */
	hcap->hca_next = ibcm_hca_listp;
	ibcm_hca_listp = hcap;

	IBTF_DPRINTF_L5(cmlog, "ibcm_add_hca_entry: done hcap = 0x%p", hcap);

	return (hcap);
}

/* deletes the given ibcm_hca_info_t from CM's global hca list */
void
ibcm_delete_hca_entry(ibcm_hca_info_t *hcap)
{
	ibcm_hca_info_t	*headp, *prevp = NULL;

	/* ibcm_hca_global_lock is held */
	IBTF_DPRINTF_L5(cmlog, "ibcm_delete_hca_entry: guid = 0x%llX "
	    "hcap = 0x%p", hcap->hca_guid, hcap);

	ASSERT(MUTEX_HELD(&ibcm_global_hca_lock));

	headp = ibcm_hca_listp;
	while (headp != NULL) {
		if (headp == hcap) {
			IBTF_DPRINTF_L3(cmlog, "ibcm_delete_hca_entry: "
			    "deleting hcap %p hcaguid %llX", hcap,
			    hcap->hca_guid);
			if (prevp) {
				prevp->hca_next = headp->hca_next;
			} else {
				ibcm_hca_listp = headp->hca_next;
			}
			rw_destroy(&hcap->hca_state_rwlock);
			rw_destroy(&hcap->hca_sidr_list_lock);
			kmem_free(hcap, sizeof (ibcm_hca_info_t) +
			    (hcap->hca_num_ports - 1) *
			    sizeof (ibcm_port_info_t));
			return;
		}

		prevp = headp;
		headp = headp->hca_next;
	}
}

/*
 * ibcm_find_hca_entry:
 *	Given a HCA's GUID find out ibcm_hca_info_t entry for that HCA
 *	This entry can be then used to access AVL tree/SIDR list etc.
 *	If the entry exists and is in HCA ATTACH state, then the hca's ref
 *	cnt is incremented and the entry is returned; else NULL is returned.
 *
 *	All functions that use ibcm_find_hca_entry and get a non-NULL
 *	return value must call ibcm_dec_hca_acc_cnt to decrement the
 *	respective hca ref cnt. There shouldn't be any usage of the
 *	ibcm_hca_info_t * returned from ibcm_find_hca_entry
 *	after decrementing the hca_acc_cnt
 *
 * INPUTS:
 *	hca_guid	- HCA's guid
 *
 * RETURN VALUE:
 *	hcap		- if a match is found, else NULL
 */
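
/*
 * A minimal usage sketch of the acc_cnt protocol (the work done while
 * holding the reference is a hypothetical placeholder):
 *
 *	hcap = ibcm_find_hca_entry(hca_guid);
 *	if (hcap != NULL) {
 *		... use hcap's AVL trees, SIDR list, etc. ...
 *		ibcm_dec_hca_acc_cnt(hcap);
 *	}
 */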
ibcm_hca_info_t *
ibcm_find_hca_entry(ib_guid_t hca_guid)
{
	ibcm_hca_info_t *hcap;

	IBTF_DPRINTF_L5(cmlog, "ibcm_find_hca_entry: guid = 0x%llX", hca_guid);

	mutex_enter(&ibcm_global_hca_lock);

	hcap = ibcm_hca_listp;
	/* search for this HCA */
	while (hcap != NULL) {
		if (hcap->hca_guid == hca_guid)
			break;
		hcap = hcap->hca_next;
	}

	/* if no hcap for the hca_guid, return NULL */
	if (hcap == NULL) {
		mutex_exit(&ibcm_global_hca_lock);
		return (NULL);
	}

	/* return hcap, only if it is valid to use */
	if (hcap->hca_state == IBCM_HCA_ACTIVE) {
		++(hcap->hca_acc_cnt);

		IBTF_DPRINTF_L5(cmlog, "ibcm_find_hca_entry: "
		    "found hcap = 0x%p hca_acc_cnt %u", hcap,
		    hcap->hca_acc_cnt);

		mutex_exit(&ibcm_global_hca_lock);
		return (hcap);
	} else {
		mutex_exit(&ibcm_global_hca_lock);

		IBTF_DPRINTF_L2(cmlog, "ibcm_find_hca_entry: "
		    "found hcap = 0x%p not in active state", hcap);
		return (NULL);
	}
}

/*
 * Searches for ibcm_hca_info_t entry based on hca_guid, but doesn't increment
 * the hca's reference count. This function is used where the calling context
 * is attempting to delete hcap itself and hence acc_cnt cannot be incremented,
 * OR where it assumes that a valid hcap must be available in ibcm's global
 * hca list.
 */
ibcm_hca_info_t *
ibcm_find_hcap_entry(ib_guid_t hca_guid)
{
	ibcm_hca_info_t *hcap;

	IBTF_DPRINTF_L5(cmlog, "ibcm_find_hcap_entry: guid = 0x%llX", hca_guid);

	ASSERT(MUTEX_HELD(&ibcm_global_hca_lock));

	hcap = ibcm_hca_listp;
	/* search for this HCA */
	while (hcap != NULL) {
		if (hcap->hca_guid == hca_guid)
			break;
		hcap = hcap->hca_next;
	}

	if (hcap == NULL)
		IBTF_DPRINTF_L2(cmlog, "ibcm_find_hcap_entry: No hcap found for"
		    " hca_guid 0x%llX", hca_guid);
	else
		IBTF_DPRINTF_L5(cmlog, "ibcm_find_hcap_entry: hcap found for"
		    " hca_guid 0x%llX", hca_guid);

	return (hcap);
}

/* increment the hca's temporary reference count */
ibcm_status_t
ibcm_inc_hca_acc_cnt(ibcm_hca_info_t *hcap)
{
	mutex_enter(&ibcm_global_hca_lock);
	if (hcap->hca_state == IBCM_HCA_ACTIVE) {
		++(hcap->hca_acc_cnt);
		IBTF_DPRINTF_L5(cmlog, "ibcm_inc_hca_acc_cnt: "
		    "hcap = 0x%p  acc_cnt = %d ", hcap, hcap->hca_acc_cnt);
		mutex_exit(&ibcm_global_hca_lock);
		return (IBCM_SUCCESS);
	} else {
		IBTF_DPRINTF_L2(cmlog, "ibcm_inc_hca_acc_cnt: "
		    "hcap INACTIVE 0x%p  acc_cnt = %d ", hcap,
		    hcap->hca_acc_cnt);
		mutex_exit(&ibcm_global_hca_lock);
		return (IBCM_FAILURE);
	}
}

/* decrement the hca's ref count, and wake up any waiting threads */
void
ibcm_dec_hca_acc_cnt(ibcm_hca_info_t *hcap)
{
	mutex_enter(&ibcm_global_hca_lock);
	ASSERT(hcap->hca_acc_cnt > 0);
	--(hcap->hca_acc_cnt);
	IBTF_DPRINTF_L5(cmlog, "ibcm_dec_hca_acc_cnt: hcap = 0x%p "
	    "acc_cnt = %d", hcap, hcap->hca_acc_cnt);
	if ((hcap->hca_state == IBCM_HCA_NOT_ACTIVE) &&
	    (hcap->hca_acc_cnt == 0)) {
		IBTF_DPRINTF_L3(cmlog, "ibcm_dec_hca_acc_cnt: "
		    "cv_broadcast for hcap = 0x%p", hcap);
		cv_broadcast(&ibcm_global_hca_cv);
	}
	mutex_exit(&ibcm_global_hca_lock);
}

/* increment the hca's resource count */
void
ibcm_inc_hca_res_cnt(ibcm_hca_info_t *hcap)
{
	mutex_enter(&ibcm_global_hca_lock);
	++(hcap->hca_res_cnt);
	IBTF_DPRINTF_L5(cmlog, "ibcm_inc_hca_res_cnt: hcap = 0x%p "
	    "ref_cnt = %d", hcap, hcap->hca_res_cnt);
	mutex_exit(&ibcm_global_hca_lock);
}

/* decrement the hca's resource count, and wake up any waiting threads */
void
ibcm_dec_hca_res_cnt(ibcm_hca_info_t *hcap)
{
	mutex_enter(&ibcm_global_hca_lock);
	ASSERT(hcap->hca_res_cnt > 0);
	--(hcap->hca_res_cnt);
	IBTF_DPRINTF_L5(cmlog, "ibcm_dec_hca_res_cnt: hcap = 0x%p "
	    "ref_cnt = %d", hcap, hcap->hca_res_cnt);
	if ((hcap->hca_state == IBCM_HCA_NOT_ACTIVE) &&
	    (hcap->hca_res_cnt == 0)) {
		IBTF_DPRINTF_L3(cmlog, "ibcm_dec_hca_res_cnt: "
		    "cv_broadcast for hcap = 0x%p", hcap);
		cv_broadcast(&ibcm_global_hca_cv);
	}
	mutex_exit(&ibcm_global_hca_lock);
}

/* increment the hca's service count */
void
ibcm_inc_hca_svc_cnt(ibcm_hca_info_t *hcap)
{
	mutex_enter(&ibcm_global_hca_lock);
	++(hcap->hca_svc_cnt);
	IBTF_DPRINTF_L5(cmlog, "ibcm_inc_hca_svc_cnt: hcap = 0x%p "
	    "svc_cnt = %d", hcap, hcap->hca_svc_cnt);
	mutex_exit(&ibcm_global_hca_lock);
}

/* decrement the hca's service count */
void
ibcm_dec_hca_svc_cnt(ibcm_hca_info_t *hcap)
{
	mutex_enter(&ibcm_global_hca_lock);
	ASSERT(hcap->hca_svc_cnt > 0);
	--(hcap->hca_svc_cnt);
	IBTF_DPRINTF_L5(cmlog, "ibcm_dec_hca_svc_cnt: hcap = 0x%p "
	    "svc_cnt = %d", hcap, hcap->hca_svc_cnt);
	mutex_exit(&ibcm_global_hca_lock);
}

/*
 * The following code manages three classes of requests that CM makes to
 * the fabric.  Those three classes are SA_ACCESS, REQ/REP/RTU, and DREQ/DREP.
 * The main issue is that the fabric can become very busy, and the CM
 * protocols rely on responses being made based on a predefined timeout
 * value.  By managing how many simultaneous sessions are allowed, the
 * CM protocol is observed to succeed with extremely high reliability
 * when it should.
 *
 * SA_ACCESS and DREQ/DREP are managed at the thread level, whereby a
 * thread blocks until there are fewer than some number of threads doing
 * similar requests.
 *
 * REQ/REP/RTU requests beyond a given limit are added to a list,
 * allowing the thread to return immediately to its caller in the
 * case where the "mode" is IBT_NONBLOCKING.  This is the mode used
 * by uDAPL and seems to be an important feature/behavior.
 */
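
/*
 * A minimal sketch of the SA_ACCESS flow-control bracketing (the query
 * in the middle is a hypothetical placeholder; only the enter/exit
 * pairing is prescribed here):
 *
 *	ibcm_sa_access_enter();
 *	... issue the SA query through the port's saa handle ...
 *	ibcm_sa_access_exit();
 *
 * ibcm_close_enter()/ibcm_close_exit() and ibcm_lapr_enter()/
 * ibcm_lapr_exit() bracket DREQ/DREP and LAP/APR traffic the same way.
 */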

static int
ibcm_ok_to_start(struct ibcm_open_s *openp)
{
	return (openp->sends < openp->sends_hiwat &&
	    openp->in_progress < openp->in_progress_max);
}

void
ibcm_open_done(ibcm_state_data_t *statep)
{
	int run;
	ibcm_state_data_t **linkp, *tmp;

	ASSERT(MUTEX_HELD(&statep->state_mutex));
	if (statep->open_flow == 1) {
		statep->open_flow = 0;
		mutex_enter(&ibcm_open.mutex);
		if (statep->open_link == NULL) {
			ibcm_open.in_progress--;
			run = ibcm_ok_to_start(&ibcm_open);
		} else {
			ibcm_open.queued--;
			linkp = &ibcm_open.head.open_link;
			while (*linkp != statep)
				linkp = &((*linkp)->open_link);
			*linkp = statep->open_link;
			statep->open_link = NULL;
			/*
			 * If we remove what tail pointed to, we need
			 * to reassign tail (it is never NULL).
			 * tail points to head for the empty list.
			 */
			if (ibcm_open.tail == statep) {
				tmp = &ibcm_open.head;
				while (tmp->open_link != &ibcm_open.head)
					tmp = tmp->open_link;
				ibcm_open.tail = tmp;
			}
			run = 0;
		}
		mutex_exit(&ibcm_open.mutex);
		if (run)
			ibcm_run_tlist_thread();
	}
}

/* report flow-control waits longer than 1ms (an aid for dtrace) */
void
ibcm_open_wait(hrtime_t delta)
{
	if (delta > 1000000)
		IBTF_DPRINTF_L2(cmlog, "ibcm_open_wait: flow more %lld", delta);
}

void
ibcm_open_start(ibcm_state_data_t *statep)
{
	ibcm_insert_trace(statep, IBCM_TRACE_OUTGOING_REQ);

	mutex_enter(&statep->state_mutex);
	ibcm_open_wait(gethrtime() - statep->post_time);
	mutex_exit(&statep->state_mutex);

	ibcm_post_rc_mad(statep, statep->stored_msg, ibcm_post_req_complete,
	    statep);

	mutex_enter(&statep->state_mutex);
	IBCM_REF_CNT_DECR(statep);
	mutex_exit(&statep->state_mutex);
}

void
ibcm_open_enqueue(ibcm_state_data_t *statep)
{
	int run;

	mutex_enter(&statep->state_mutex);
	statep->post_time = gethrtime();
	mutex_exit(&statep->state_mutex);
	mutex_enter(&ibcm_open.mutex);
	if (ibcm_open.queued == 0 && ibcm_ok_to_start(&ibcm_open)) {
		ibcm_open.in_progress++;
		mutex_exit(&ibcm_open.mutex);
		ibcm_open_start(statep);
	} else {
		ibcm_open.queued++;
		statep->open_link = &ibcm_open.head;
		ibcm_open.tail->open_link = statep;
		ibcm_open.tail = statep;
		run = ibcm_ok_to_start(&ibcm_open);
		mutex_exit(&ibcm_open.mutex);
		if (run)
			ibcm_run_tlist_thread();
	}
}

ibcm_state_data_t *
ibcm_open_dequeue(void)
{
	ibcm_state_data_t *statep;

	ASSERT(MUTEX_HELD(&ibcm_open.mutex));
	ibcm_open.queued--;
	ibcm_open.in_progress++;
	statep = ibcm_open.head.open_link;
	ibcm_open.head.open_link = statep->open_link;
	statep->open_link = NULL;
	/*
	 * If we remove what tail pointed to, we need
	 * to reassign tail (it is never NULL).
	 * tail points to head for the empty list.
	 */
	if (ibcm_open.tail == statep)
		ibcm_open.tail = &ibcm_open.head;
	return (statep);
}

void
ibcm_check_for_opens(void)
{
	ibcm_state_data_t 	*statep;

	mutex_enter(&ibcm_open.mutex);

	while (ibcm_open.queued > 0) {
		if (ibcm_ok_to_start(&ibcm_open)) {
			statep = ibcm_open_dequeue();
			mutex_exit(&ibcm_open.mutex);

			ibcm_open_start(statep);

			mutex_enter(&ibcm_open.mutex);
		} else {
			break;
		}
	}
	mutex_exit(&ibcm_open.mutex);
}


static void
ibcm_flow_init(ibcm_flow_t *flow, uint_t simul_max)
{
	flow->list			= NULL;
	flow->simul			= 0;
	flow->waiters_per_chunk		= 4;
	flow->simul_max			= simul_max;
	flow->lowat			= simul_max - flow->waiters_per_chunk;
	flow->lowat_default		= flow->lowat;
	/* stats */
	flow->total			= 0;
}

static void
ibcm_rc_flow_control_init(void)
{
	mutex_init(&ibcm_open.mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&ibcm_open.mutex);
	ibcm_flow_init(&ibcm_close_flow, ibcm_close_simul_max);
	ibcm_flow_init(&ibcm_lapr_flow, ibcm_lapr_simul_max);
	ibcm_flow_init(&ibcm_saa_flow, ibcm_saa_simul_max);

	ibcm_open.queued 		= 0;
	ibcm_open.exit_deferred 	= 0;
	ibcm_open.in_progress 		= 0;
	ibcm_open.in_progress_max 	= 16;
	ibcm_open.sends 		= 0;
	ibcm_open.sends_max 		= 0;
	ibcm_open.sends_lowat 		= 8;
	ibcm_open.sends_hiwat 		= 16;
	ibcm_open.tail 			= &ibcm_open.head;
	ibcm_open.head.open_link 	= NULL;
	mutex_exit(&ibcm_open.mutex);

	mutex_init(&ibcm_close.mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_enter(&ibcm_close.mutex);
	ibcm_close.tail			= &ibcm_close.head;
	ibcm_close.head.close_link 	= NULL;
	mutex_exit(&ibcm_close.mutex);
}

static void
ibcm_rc_flow_control_fini(void)
{
	mutex_destroy(&ibcm_open.mutex);
	mutex_destroy(&ibcm_close.mutex);
}

static ibcm_flow1_t *
ibcm_flow_find(ibcm_flow_t *flow)
{
	ibcm_flow1_t *flow1;
	ibcm_flow1_t *f;

	f = flow->list;
	if (f) {	/* most likely code path */
		while (f->link != NULL)
			f = f->link;
		if (f->waiters < flow->waiters_per_chunk)
			return (f);
	}

	/* There was no flow1 list element ready for another waiter */
	mutex_exit(&ibcm_open.mutex);
	flow1 = kmem_alloc(sizeof (*flow1), KM_SLEEP);
	mutex_enter(&ibcm_open.mutex);

	f = flow->list;
	if (f) {
		while (f->link != NULL)
			f = f->link;
		if (f->waiters < flow->waiters_per_chunk) {
			kmem_free(flow1, sizeof (*flow1));
			return (f);
		}
		f->link = flow1;
	} else {
		flow->list = flow1;
	}
	cv_init(&flow1->cv, NULL, CV_DRIVER, NULL);
	flow1->waiters = 0;
	flow1->link = NULL;
	return (flow1);
}
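
/*
 * ibcm_flow_enter() admits a caller immediately when fewer than
 * simul_max requests are outstanding and no one is queued; otherwise
 * the caller sleeps on a flow1 chunk (at most waiters_per_chunk waiters
 * per chunk).  ibcm_flow_exit() wakes a whole chunk at once via
 * cv_broadcast() when the outstanding count drops below the low
 * watermark.
 */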

static void
ibcm_flow_enter(ibcm_flow_t *flow)
{
	mutex_enter(&ibcm_open.mutex);
	if (flow->list == NULL && flow->simul < flow->simul_max) {
		flow->simul++;
		flow->total++;
		mutex_exit(&ibcm_open.mutex);
	} else {
		ibcm_flow1_t *flow1;

		flow1 = ibcm_flow_find(flow);
		flow1->waiters++;
		cv_wait(&flow1->cv, &ibcm_open.mutex);
		if (--flow1->waiters == 0) {
			cv_destroy(&flow1->cv);
			mutex_exit(&ibcm_open.mutex);
			kmem_free(flow1, sizeof (*flow1));
		} else
			mutex_exit(&ibcm_open.mutex);
	}
}

static void
ibcm_flow_exit(ibcm_flow_t *flow)
{
	mutex_enter(&ibcm_open.mutex);
	if (--flow->simul < flow->lowat) {
		if (flow->lowat < flow->lowat_default)
			flow->lowat++;
		if (flow->list) {
			ibcm_flow1_t *flow1;

			flow1 = flow->list;
			flow->list = flow1->link;	/* unlink */
			flow1->link = NULL;		/* be clean */
			flow->total += flow1->waiters;
			flow->simul += flow1->waiters;
			cv_broadcast(&flow1->cv);
		}
	}
	mutex_exit(&ibcm_open.mutex);
}

void
ibcm_flow_inc(void)
{
	mutex_enter(&ibcm_open.mutex);
	if (++ibcm_open.sends > ibcm_open.sends_max) {
		ibcm_open.sends_max = ibcm_open.sends;
		IBTF_DPRINTF_L2(cmlog, "ibcm_flow_inc: sends max = %d",
		    ibcm_open.sends_max);
	}
	mutex_exit(&ibcm_open.mutex);
}

static void
ibcm_check_send_cmpltn_time(hrtime_t delta, char *event_msg)
{
	if (delta > 4000000LL) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_check_send_cmpltn_time: "
		    "%s: %lldns", event_msg, delta);
	}
}

void
ibcm_flow_dec(hrtime_t time, char *mad_type)
{
	int flow_exit = 0;
	int run = 0;

	if (ibcm_dtrace)
		ibcm_check_send_cmpltn_time(gethrtime() - time, mad_type);
	mutex_enter(&ibcm_open.mutex);
	ibcm_open.sends--;
	if (ibcm_open.sends < ibcm_open.sends_lowat) {
		run = ibcm_ok_to_start(&ibcm_open);
		if (ibcm_open.exit_deferred) {
			ibcm_open.exit_deferred--;
			flow_exit = 1;
		}
	}
	mutex_exit(&ibcm_open.mutex);
	if (flow_exit)
		ibcm_flow_exit(&ibcm_close_flow);
	if (run)
		ibcm_run_tlist_thread();
}

void
ibcm_close_enqueue(ibcm_state_data_t *statep)
{
	mutex_enter(&ibcm_close.mutex);
	statep->close_link = NULL;
	ibcm_close.tail->close_link = statep;
	ibcm_close.tail = statep;
	mutex_exit(&ibcm_close.mutex);
	ibcm_run_tlist_thread();
}

void
ibcm_check_for_async_close()
{
	ibcm_state_data_t 	*statep;

	mutex_enter(&ibcm_close.mutex);

	while (ibcm_close.head.close_link) {
		statep = ibcm_close.head.close_link;
		ibcm_close.head.close_link = statep->close_link;
		statep->close_link = NULL;
		if (ibcm_close.tail == statep)
			ibcm_close.tail = &ibcm_close.head;
		mutex_exit(&ibcm_close.mutex);
		ibcm_close_start(statep);
		mutex_enter(&ibcm_close.mutex);
	}
	mutex_exit(&ibcm_close.mutex);
}

void
ibcm_close_enter(void)
{
	ibcm_flow_enter(&ibcm_close_flow);
}

void
ibcm_close_exit(void)
{
	int flow_exit;

	mutex_enter(&ibcm_open.mutex);
	if (ibcm_open.sends < ibcm_open.sends_lowat ||
	    ibcm_open.exit_deferred >= 4)
		flow_exit = 1;
	else {
		flow_exit = 0;
		ibcm_open.exit_deferred++;
	}
	mutex_exit(&ibcm_open.mutex);
	if (flow_exit)
		ibcm_flow_exit(&ibcm_close_flow);
}

/*
 * This function needs to be called twice to finish our flow
 * control accounting when closing down a connection.  One
 * call has send_done set to 1, while the other has it set to 0.
 * Because of retries, this could get called more than once
 * with either 0 or 1, but additional calls have no effect.
 */
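/*
 * close_flow encoding, as implemented below: 1 means the flow was
 * entered; the first completion call moves it to 3 (send completion
 * seen first) or 2 (the other completion seen first); the matching
 * second call resets it to 0 and performs the deferred flow-exit
 * accounting.
 */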
void
ibcm_close_done(ibcm_state_data_t *statep, int send_done)
{
	int flow_exit;

	ASSERT(MUTEX_HELD(&statep->state_mutex));
	if (statep->close_flow == 1) {
		if (send_done)
			statep->close_flow = 3;
		else
			statep->close_flow = 2;
	} else if ((send_done && statep->close_flow == 2) ||
	    (!send_done && statep->close_flow == 3)) {
		statep->close_flow = 0;
		mutex_enter(&ibcm_open.mutex);
		if (ibcm_open.sends < ibcm_open.sends_lowat ||
		    ibcm_open.exit_deferred >= 4)
			flow_exit = 1;
		else {
			flow_exit = 0;
			ibcm_open.exit_deferred++;
		}
		mutex_exit(&ibcm_open.mutex);
		if (flow_exit)
			ibcm_flow_exit(&ibcm_close_flow);
	}
}

void
ibcm_lapr_enter(void)
{
	ibcm_flow_enter(&ibcm_lapr_flow);
}

void
ibcm_lapr_exit(void)
{
	ibcm_flow_exit(&ibcm_lapr_flow);
}

void
ibcm_sa_access_enter()
{
	ibcm_flow_enter(&ibcm_saa_flow);
}

void
ibcm_sa_access_exit()
{
	ibcm_flow_exit(&ibcm_saa_flow);
}

static void
ibcm_sm_notice_handler(ibmf_saa_handle_t saa_handle,
    ibmf_saa_subnet_event_t saa_event_code,
    ibmf_saa_event_details_t *saa_event_details,
    void *callback_arg)
{
	ibcm_port_info_t	*portp = (ibcm_port_info_t *)callback_arg;
	ibt_subnet_event_code_t code;
	ibt_subnet_event_t	event;
	uint8_t			event_status;

	IBTF_DPRINTF_L3(cmlog, "ibcm_sm_notice_handler: saa_hdl %p, code = %d",
	    saa_handle, saa_event_code);

	mutex_enter(&ibcm_sm_notice_serialize_lock);

	switch (saa_event_code) {
	case IBMF_SAA_EVENT_MCG_CREATED:
		code = IBT_SM_EVENT_MCG_CREATED;
		break;
	case IBMF_SAA_EVENT_MCG_DELETED:
		code = IBT_SM_EVENT_MCG_DELETED;
		break;
	case IBMF_SAA_EVENT_GID_AVAILABLE:
		code = IBT_SM_EVENT_GID_AVAIL;
		ibcm_path_cache_purge();
		break;
	case IBMF_SAA_EVENT_GID_UNAVAILABLE:
		code = IBT_SM_EVENT_GID_UNAVAIL;
		ibcm_path_cache_purge();
		break;
	case IBMF_SAA_EVENT_SUBSCRIBER_STATUS_CHG:
		event_status =
		    saa_event_details->ie_producer_event_status_mask &
		    IBMF_SAA_EVENT_STATUS_MASK_PRODUCER_SM;
		if (event_status == (portp->port_event_status &
		    IBMF_SAA_EVENT_STATUS_MASK_PRODUCER_SM)) {
			mutex_exit(&ibcm_sm_notice_serialize_lock);
			return;	/* no change */
		}
		portp->port_event_status = event_status;
		if (event_status == IBMF_SAA_EVENT_STATUS_MASK_PRODUCER_SM)
			code = IBT_SM_EVENT_AVAILABLE;
		else
			code = IBT_SM_EVENT_UNAVAILABLE;
		break;
	default:
		mutex_exit(&ibcm_sm_notice_serialize_lock);
		return;
	}

	mutex_enter(&ibcm_global_hca_lock);

	/* don't send the event if we're tearing down */
	if (!IBCM_ACCESS_HCA_OK(portp->port_hcap)) {
		mutex_exit(&ibcm_global_hca_lock);
		mutex_exit(&ibcm_sm_notice_serialize_lock);
		return;
	}

	++(portp->port_hcap->hca_acc_cnt);
	mutex_exit(&ibcm_global_hca_lock);

	event.sm_notice_gid = saa_event_details->ie_gid;
	ibtl_cm_sm_notice_handler(portp->port_sgid0, code, &event);

	mutex_exit(&ibcm_sm_notice_serialize_lock);

	ibcm_dec_hca_acc_cnt(portp->port_hcap);
}

void
ibt_register_subnet_notices(ibt_clnt_hdl_t ibt_hdl,
    ibt_sm_notice_handler_t sm_notice_handler, void *private)
{
	ibcm_port_info_t	*portp;
	ibcm_hca_info_t		*hcap;
	uint8_t			port;
	int			num_failed_sgids;
	ibtl_cm_sm_init_fail_t	*ifail;
	ib_gid_t		*sgidp;

	IBTF_DPRINTF_L3(cmlog, "ibt_register_subnet_notices: ibt_hdl = %p",
	    ibt_hdl);

	mutex_enter(&ibcm_sm_notice_serialize_lock);

	ibtl_cm_set_sm_notice_handler(ibt_hdl, sm_notice_handler, private);
	if (sm_notice_handler == NULL) {
		mutex_exit(&ibcm_sm_notice_serialize_lock);
		return;
	}

	/* for each port, if service is not available, make a call */
	mutex_enter(&ibcm_global_hca_lock);
	num_failed_sgids = 0;
	hcap = ibcm_hca_listp;
	while (hcap != NULL) {
		portp = hcap->hca_port_info;
		for (port = 0; port < hcap->hca_num_ports; port++) {
			if (!(portp->port_event_status &
			    IBMF_SAA_EVENT_STATUS_MASK_PRODUCER_SM))
				num_failed_sgids++;
			portp++;
		}
		hcap = hcap->hca_next;
	}
	if (num_failed_sgids != 0) {
		ifail = kmem_alloc(sizeof (*ifail) +
		    (num_failed_sgids - 1) * sizeof (ib_gid_t), KM_SLEEP);
		ifail->smf_num_sgids = num_failed_sgids;
		ifail->smf_ibt_hdl = ibt_hdl;
		sgidp = &ifail->smf_sgid[0];
		hcap = ibcm_hca_listp;
		while (hcap != NULL) {
			portp = hcap->hca_port_info;
			for (port = 0; port < hcap->hca_num_ports; port++) {
				if (!(portp->port_event_status &
				    IBMF_SAA_EVENT_STATUS_MASK_PRODUCER_SM))
					*sgidp++ = portp->port_sgid0;
				portp++;
			}
			hcap = hcap->hca_next;
		}
	}
	mutex_exit(&ibcm_global_hca_lock);

	if (num_failed_sgids != 0) {
		ibtl_cm_sm_notice_init_failure(ifail);
		kmem_free(ifail, sizeof (*ifail) +
		    (num_failed_sgids - 1) * sizeof (ib_gid_t));
	}
	mutex_exit(&ibcm_sm_notice_serialize_lock);
}

/* The following is run from a taskq because we've seen stack overflows. */
static void
ibcm_init_saa(void *arg)
{
	ibcm_port_info_t		*portp = (ibcm_port_info_t *)arg;
	int				status;
	ib_guid_t			port_guid;
	ibmf_saa_subnet_event_args_t	event_args;

	port_guid = portp->port_sgid0.gid_guid;

	IBTF_DPRINTF_L3(cmlog, "ibcm_init_saa: port guid %llX", port_guid);

	event_args.is_event_callback_arg = portp;
	event_args.is_event_callback = ibcm_sm_notice_handler;

	if ((status = ibmf_sa_session_open(port_guid, 0, &event_args,
	    IBMF_VERSION, 0, &portp->port_ibmf_saa_hdl)) != IBMF_SUCCESS) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_init_saa: "
		    "ibmf_sa_session_open failed for port guid %llX "
		    "status = %d", port_guid, status);
	} else {
		IBTF_DPRINTF_L2(cmlog, "ibcm_init_saa: "
		    "registered sa_hdl 0x%p for port guid %llX",
		    portp->port_ibmf_saa_hdl, port_guid);
	}

	mutex_enter(&ibcm_sa_open_lock);
	portp->port_saa_open_in_progress = 0;
	cv_broadcast(&ibcm_sa_open_cv);
	mutex_exit(&ibcm_sa_open_lock);
}

void
ibcm_init_saa_handle(ibcm_hca_info_t *hcap, uint8_t port)
{
	ibmf_saa_handle_t	saa_handle;
	uint8_t			port_index = port - 1;
	ibcm_port_info_t	*portp = &hcap->hca_port_info[port_index];
	ibt_status_t		ibt_status;

	if (port_index >= hcap->hca_num_ports)
		return;

	mutex_enter(&ibcm_sa_open_lock);
	if (portp->port_saa_open_in_progress) {
		mutex_exit(&ibcm_sa_open_lock);
		return;
	}

	saa_handle = portp->port_ibmf_saa_hdl;
	if (saa_handle != NULL) {
		mutex_exit(&ibcm_sa_open_lock);
		return;
	}

	portp->port_saa_open_in_progress = 1;
	mutex_exit(&ibcm_sa_open_lock);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(portp->port_event_status))

	/* The assumption is that we're getting event notifications */
	portp->port_event_status = IBMF_SAA_EVENT_STATUS_MASK_PRODUCER_SM;

	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(portp->port_event_status))

	ibt_status = ibt_get_port_state_byguid(portp->port_hcap->hca_guid,
	    portp->port_num, &portp->port_sgid0, NULL);
	if (ibt_status != IBT_SUCCESS) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_init_saa_handle: "
		    "ibt_get_port_state_byguid failed for guid %llX "
		    "with status %d", portp->port_hcap->hca_guid, ibt_status);
		mutex_enter(&ibcm_sa_open_lock);
		portp->port_saa_open_in_progress = 0;
		cv_broadcast(&ibcm_sa_open_cv);
		mutex_exit(&ibcm_sa_open_lock);
		return;
	}
	/* if the port is UP, try sa_session_open */
	(void) taskq_dispatch(ibcm_taskq, ibcm_init_saa, portp, TQ_SLEEP);
}


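/*
 * ibcm_get_saa_handle():
 *	Return the port's SA access handle, opening a new SA session
 *	via the ibcm_init_saa() taskq if required.  Unlike
 *	ibcm_init_saa_handle(), this routine blocks on ibcm_sa_open_cv
 *	until any open in progress completes, and returns NULL on
 *	failure.
 */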
ibmf_saa_handle_t
ibcm_get_saa_handle(ibcm_hca_info_t *hcap, uint8_t port)
{
	ibmf_saa_handle_t	saa_handle;
	uint8_t			port_index = port - 1;
	ibcm_port_info_t	*portp = &hcap->hca_port_info[port_index];
	ibt_status_t		ibt_status;

	if (port_index >= hcap->hca_num_ports)
		return (NULL);

	mutex_enter(&ibcm_sa_open_lock);
	while (portp->port_saa_open_in_progress) {
		cv_wait(&ibcm_sa_open_cv, &ibcm_sa_open_lock);
	}

	saa_handle = portp->port_ibmf_saa_hdl;
	if (saa_handle != NULL) {
		mutex_exit(&ibcm_sa_open_lock);
		return (saa_handle);
	}

	portp->port_saa_open_in_progress = 1;
	mutex_exit(&ibcm_sa_open_lock);

	ibt_status = ibt_get_port_state_byguid(portp->port_hcap->hca_guid,
	    portp->port_num, &portp->port_sgid0, NULL);
	if (ibt_status != IBT_SUCCESS) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_get_saa_handle: "
		    "ibt_get_port_state_byguid failed for guid %llX "
		    "with status %d", portp->port_hcap->hca_guid, ibt_status);
		mutex_enter(&ibcm_sa_open_lock);
		portp->port_saa_open_in_progress = 0;
		cv_broadcast(&ibcm_sa_open_cv);
		mutex_exit(&ibcm_sa_open_lock);
		return (NULL);
	}
	/* if the port is UP, try sa_session_open */
	(void) taskq_dispatch(ibcm_taskq, ibcm_init_saa, portp, TQ_SLEEP);

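	/*
	 * Wait for ibcm_init_saa() to complete; the handle remains
	 * NULL if the session open failed.
	 */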
	mutex_enter(&ibcm_sa_open_lock);
	while (portp->port_saa_open_in_progress) {
		cv_wait(&ibcm_sa_open_cv, &ibcm_sa_open_lock);
	}
	saa_handle = portp->port_ibmf_saa_hdl;
	mutex_exit(&ibcm_sa_open_lock);
	return (saa_handle);
}


/*
 * ibcm_hca_init_port():
 * 	- Register the port with IBMF
 *
 * Arguments:
 *	hcap		- pointer to CM's per-HCA info structure
 *	port_index	- port number minus 1
 *
 * Return values:
 *	IBT_SUCCESS - success
 */
ibt_status_t
ibcm_hca_init_port(ibcm_hca_info_t *hcap, uint8_t port_index)
{
	int			status;
	ibmf_register_info_t	*ibmf_reg;

	IBTF_DPRINTF_L4(cmlog, "ibcm_hca_init_port: hcap = 0x%p port_num %d",
	    hcap, port_index + 1);

	ASSERT(MUTEX_HELD(&ibcm_global_hca_lock));

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(hcap->hca_port_info))

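	/*
	 * Initialization is idempotent: skip the port if it already
	 * has an IBMF handle (e.g., on a reinit attempt).
	 */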
	if (hcap->hca_port_info[port_index].port_ibmf_hdl == NULL) {
		/* Register with IBMF */
		ibmf_reg = &hcap->hca_port_info[port_index].port_ibmf_reg;
		ibmf_reg->ir_ci_guid = hcap->hca_guid;
		ibmf_reg->ir_port_num = port_index + 1;
		ibmf_reg->ir_client_class = COMM_MGT_MANAGER_AGENT;

		/*
		 * register with management framework
		 */
		status = ibmf_register(ibmf_reg, IBMF_VERSION,
		    IBMF_REG_FLAG_NO_OFFLOAD, NULL, NULL,
		    &(hcap->hca_port_info[port_index].port_ibmf_hdl),
		    &(hcap->hca_port_info[port_index].port_ibmf_caps));

		if (status != IBMF_SUCCESS) {
			IBTF_DPRINTF_L2(cmlog, "ibcm_hca_init_port: "
			    "ibmf_register failed for port_num %x, "
			    "status = %x", port_index + 1, status);
			return (ibcm_ibmf_analyze_error(status));
		}

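		/* CM uses the default MAD QP (QP1) on this port */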
		hcap->hca_port_info[port_index].port_qp1.qp_cm =
		    IBMF_QP_HANDLE_DEFAULT;
		hcap->hca_port_info[port_index].port_qp1.qp_port =
		    &(hcap->hca_port_info[port_index]);

		/*
		 * Register the receive callback with IBMF.
		 * Since we just did an ibmf_register, the handle is
		 * valid and ibcm_recv_cb() is valid, so we can
		 * safely assert for success of ibmf_setup_async_cb().
		 *
		 * Depending on the "state" of the HCA,
		 * CM may drop incoming packets.
		 */
		status = ibmf_setup_async_cb(
		    hcap->hca_port_info[port_index].port_ibmf_hdl,
		    IBMF_QP_HANDLE_DEFAULT, ibcm_recv_cb,
		    &(hcap->hca_port_info[port_index].port_qp1), 0);
		ASSERT(status == IBMF_SUCCESS);

		IBTF_DPRINTF_L5(cmlog, "ibcm_hca_init_port: "
		    "IBMF hdl[%x] = 0x%p", port_index,
		    hcap->hca_port_info[port_index].port_ibmf_hdl);

		/* Attempt to get the saa_handle for this port */
		ibcm_init_saa_handle(hcap, port_index + 1);
	}

	return (IBT_SUCCESS);
}

/*
 * Helper to reattempt initialization of a port's IBMF handles from
 * elsewhere in the CM code.
 */
ibt_status_t
ibcm_hca_reinit_port(ibcm_hca_info_t *hcap, uint8_t port_index)
{
	ibt_status_t	status;

	IBTF_DPRINTF_L5(cmlog, "ibcm_hca_reinit_port: hcap 0x%p port_num %d",
	    hcap, port_index + 1);

	mutex_enter(&ibcm_global_hca_lock);
	status = ibcm_hca_init_port(hcap, port_index);
	mutex_exit(&ibcm_global_hca_lock);
	return (status);
}


/*
 * ibcm_hca_fini_port():
 * 	- Deregister the port with IBMF
 *
 * Arguments:
 *	hcap		- pointer to CM's per-HCA info structure
 *	port_index	- port number minus 1
 *
 * Return values:
 *	IBCM_SUCCESS - success
 */
static ibcm_status_t
ibcm_hca_fini_port(ibcm_hca_info_t *hcap, uint8_t port_index)
{
	int			ibmf_status;
	ibcm_status_t		ibcm_status;

	IBTF_DPRINTF_L4(cmlog, "ibcm_hca_fini_port: hcap = 0x%p port_num %d ",
	    hcap, port_index + 1);

	ASSERT(MUTEX_HELD(&ibcm_global_hca_lock));

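	/*
	 * Teardown is the reverse of init: close the SA session, free
	 * CM's IBMF QPs, tear down the receive callback, and finally
	 * unregister from IBMF.  Any failure leaves the port's handles
	 * intact so the caller may retry the fini later.
	 */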
	if (hcap->hca_port_info[port_index].port_ibmf_saa_hdl != NULL) {
		IBTF_DPRINTF_L5(cmlog, "ibcm_hca_fini_port: "
		    "ibmf_sa_session_close IBMF SAA hdl %p",
		    hcap->hca_port_info[port_index].port_ibmf_saa_hdl);

		ibmf_status = ibmf_sa_session_close(
		    &hcap->hca_port_info[port_index].port_ibmf_saa_hdl, 0);
		if (ibmf_status != IBMF_SUCCESS) {
			IBTF_DPRINTF_L2(cmlog, "ibcm_hca_fini_port: "
			    "ibmf_sa_session_close of port %d returned %x",
			    port_index + 1, ibmf_status);
			return (IBCM_FAILURE);
		}
	}

	if (hcap->hca_port_info[port_index].port_ibmf_hdl != NULL) {
		IBTF_DPRINTF_L5(cmlog, "ibcm_hca_fini_port: "
		    "ibmf_unregister IBMF Hdl %p",
		    hcap->hca_port_info[port_index].port_ibmf_hdl);

		/* clean-up all the ibmf qp's allocated on this port */
		ibcm_status = ibcm_free_allqps(hcap, port_index + 1);

		if (ibcm_status != IBCM_SUCCESS) {

			IBTF_DPRINTF_L2(cmlog, "ibcm_hca_fini_port: "
			    "ibcm_free_allqps failed for port_num %d",
			    port_index + 1);
			return (IBCM_FAILURE);
		}

		/* Tear down the receive callback */
		ibmf_status = ibmf_tear_down_async_cb(
		    hcap->hca_port_info[port_index].port_ibmf_hdl,
		    IBMF_QP_HANDLE_DEFAULT, 0);

		if (ibmf_status != IBMF_SUCCESS) {
			IBTF_DPRINTF_L2(cmlog, "ibcm_hca_fini_port: "
			    "ibmf_tear_down_async_cb failed %d port_num %d",
			    ibmf_status, port_index + 1);
			return (IBCM_FAILURE);
		}

		/* Now, unregister with IBMF */
		ibmf_status = ibmf_unregister(
		    &hcap->hca_port_info[port_index].port_ibmf_hdl, 0);
		IBTF_DPRINTF_L4(cmlog, "ibcm_hca_fini_port: "
		    "ibmf_unregister of port_num %x returned %x",
		    port_index + 1, ibmf_status);

		if (ibmf_status == IBMF_SUCCESS)
			hcap->hca_port_info[port_index].port_ibmf_hdl = NULL;
		else {
			IBTF_DPRINTF_L2(cmlog, "ibcm_hca_fini_port: "
			    "ibmf_unregister failed %d port_num %d",
			    ibmf_status, port_index + 1);
			return (IBCM_FAILURE);
		}
	}
	return (IBCM_SUCCESS);
}

/*
 * ibcm_comm_est_handler():
 *	Handles the COM_EST async: if the channel is still awaiting an
 *	RTU, completes its transition to the ESTABLISHED state
 *
 * Arguments:
 *	eventp	- A pointer to an ibt_async_event_t struct
 *
 * Return values: NONE
 */
static void
ibcm_comm_est_handler(ibt_async_event_t *eventp)
{
	ibcm_state_data_t	*statep;

	IBTF_DPRINTF_L4(cmlog, "ibcm_comm_est_handler:");

	/* At least one of the QP and EEC handles must be non-NULL */
	if (eventp->ev_chan_hdl == NULL) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_comm_est_handler: "
		    "both QP and EEC handles are NULL");
		return;
	}

	/* get the "statep" from qp/eec handles */
	IBCM_GET_CHAN_PRIVATE(eventp->ev_chan_hdl, statep);
	if (statep == NULL) {
		IBTF_DPRINTF_L2(cmlog, "ibcm_comm_est_handler: statep is NULL");
		return;
	}

	mutex_enter(&statep->state_mutex);

	IBCM_RELEASE_CHAN_PRIVATE(eventp->ev_chan_hdl);

	IBTF_DPRINTF_L4(cmlog, "ibcm_comm_est_handler: statep = %p", statep);

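	/*
	 * Hold a reference on statep so that it cannot be freed while
	 * state_mutex is dropped below.
	 */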
	IBCM_REF_CNT_INCR(statep);

	if ((statep->state == IBCM_STATE_REP_SENT) ||
	    (statep->state == IBCM_STATE_MRA_REP_RCVD)) {
		timeout_id_t	timer_val = statep->timerid;

		statep->state = IBCM_STATE_TRANSIENT_ESTABLISHED;

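		/*
		 * Cancel any pending timeout with state_mutex dropped,
		 * since untimeout() may wait for a running timeout
		 * handler that needs the same mutex.
		 */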
		if (timer_val) {
			statep->timerid = 0;
			mutex_exit(&statep->state_mutex);
			(void) untimeout(timer_val);
		} else
			mutex_exit(&statep->state_mutex);

		/* There is no RTU message to pass along in this path */
		ibcm_cep_state_rtu(statep, NULL);

	} else {
		if (statep->state == IBCM_STATE_ESTABLISHED ||
		    statep->state == IBCM_STATE_TRANSIENT_ESTABLISHED) {
			IBTF_DPRINTF_L4(cmlog, "ibcm_comm_est_handler: "
			    "Channel already in ESTABLISHED state");
		} else {
			/* An unexpected behavior from remote */
			IBTF_DPRINTF_L2(cmlog, "ibcm_comm_est_handler: "
			    "Unexpected in state = %d", statep->state);
		}
		mutex_exit(&statep->state_mutex);

		ibcm_insert_trace(statep, IBCM_TRACE_INCOMING_COMEST);
	}

	mutex_enter(&statep->state_mutex);
	IBCM_REF_CNT_DECR(statep);
	mutex_exit(&statep->state_mutex);
}


/*
 * ibcm_async_handler():
 *	CM's Async Handler
 *	(Handles ATTACH, DETACH, COM_EST events)
 *
 * Arguments:
 *	eventp	- A pointer to an ibt_async_event_t struct
 *
 * Return values: None
 *
 * NOTE: CM assumes that all HCA DR events are delivered sequentially,
 * i.e., until ibcm_async_handler completes for a given HCA DR event,
 * the framework shall not invoke ibcm_async_handler with another DR
 * event for the same HCA
 */
/* ARGSUSED */
void
ibcm_async_handler(void *clnt_hdl, ibt_hca_hdl_t hca_hdl,
    ibt_async_code_t code, ibt_async_event_t *eventp)
{
	ibcm_hca_info_t		*hcap;
	ibcm_port_up_t		*pup;

	IBTF_DPRINTF_L3(cmlog, "ibcm_async_handler: "
	    "clnt_hdl = %p, code = 0x%x, eventp = 0x%p",
	    clnt_hdl, code, eventp);

	mutex_enter(&ibcm_global_hca_lock);

	/* If fini is going to complete successfully, then return */
	if (ibcm_finit_state != IBCM_FINIT_IDLE) {

		/*
		 * This finit state implies one of the following:
		 * Init either didn't start or didn't complete OR
		 * Fini is about to return SUCCESS and release the global lock.
		 * In all these cases, it is safe to ignore the async.
		 */

		IBTF_DPRINTF_L2(cmlog, "ibcm_async_handler: ignoring event %x, "
		    "as either init didn't complete or fini about to succeed",
		    code);
		mutex_exit(&ibcm_global_hca_lock);
		return;
	}

	switch (code) {
	case IBT_EVENT_PORT_UP:
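		/*
		 * Drop the global lock before the KM_SLEEP allocation;
		 * the port-up work itself runs from a taskq.
		 */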
		mutex_exit(&ibcm_global_hca_lock);
		pup = kmem_alloc(sizeof (ibcm_port_up_t), KM_SLEEP);
		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*pup))
		pup->pup_hca_guid = eventp->ev_hca_guid;
		pup->pup_port = eventp->ev_port;
		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*pup))
		(void) taskq_dispatch(ibcm_taskq,
		    ibcm_service_record_rewrite_task, pup, TQ_SLEEP);
		ibcm_path_cache_purge();
		return;

	case IBT_HCA_ATTACH_EVENT:

		/* eventp->ev_hca_guid is the HCA GUID of interest */
		ibcm_hca_attach(eventp->ev_hca_guid);
		break;

	case IBT_HCA_DETACH_EVENT:

		/* eventp->ev_hca_guid is the HCA GUID of interest */
		if ((hcap = ibcm_find_hcap_entry(eventp->ev_hca_guid)) ==
		    NULL) {
			IBTF_DPRINTF_L2(cmlog, "ibcm_async_handler:"
			    " hca %llX doesn't exist", eventp->ev_hca_guid);
			break;
		}

		(void) ibcm_hca_detach(hcap);
		break;

	case IBT_EVENT_COM_EST_QP:
		/* eventp->ev_qp_hdl is the ibt_qp_hdl_t of interest */
	case IBT_EVENT_COM_EST_EEC:
		/* eventp->ev_eec_hdl is the ibt_eec_hdl_t of interest */
		ibcm_comm_est_handler(eventp);
		break;
	default:
		break;
	}

	/* Unblock any blocked fini/init operations */
	mutex_exit(&ibcm_global_hca_lock);
}