6890586 RFE: Support iSCSI Multiple Connections per Session (MC/S) with COMSTAR iSCSI
author Priya Krishnan <Priya.Krishnan@Sun.COM>
Mon, 10 May 2010 17:03:07 -0400
changeset 12372 1d6e0c9ff22e
parent 12371 dd3a3f0b2318
child 12373 a6d4ab1b6cf3
6890586 RFE: Support iSCSI Multiple Connections per Session (MC/S) with COMSTAR iSCSI
usr/src/uts/common/io/comstar/port/iscsit/iscsit.c
usr/src/uts/common/io/comstar/port/iscsit/iscsit.h
usr/src/uts/common/io/comstar/port/iscsit/iscsit_sess.c
usr/src/uts/common/io/idm/idm.c
usr/src/uts/common/io/idm/idm_conn_sm.c
usr/src/uts/common/sys/idm/idm_impl.h
--- a/usr/src/uts/common/io/comstar/port/iscsit/iscsit.c	Mon May 10 15:01:49 2010 -0400
+++ b/usr/src/uts/common/io/comstar/port/iscsit/iscsit.c	Mon May 10 17:03:07 2010 -0400
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/cpuvar.h>
@@ -120,6 +119,30 @@
 static idm_status_t iscsit_enable_svc(iscsit_hostinfo_t *hostinfo);
 static void iscsit_disable_svc(void);
 
+static int
+iscsit_check_cmdsn_and_queue(idm_pdu_t *rx_pdu);
+
+static void
+iscsit_add_pdu_to_queue(iscsit_sess_t *ist, idm_pdu_t *rx_pdu);
+
+static idm_pdu_t *
+iscsit_remove_pdu_from_queue(iscsit_sess_t *ist, uint32_t cmdsn);
+
+static void
+iscsit_process_pdu_in_queue(iscsit_sess_t *ist);
+
+static void
+iscsit_rxpdu_queue_monitor_session(iscsit_sess_t *ist);
+
+static void
+iscsit_rxpdu_queue_monitor(void *arg);
+
+static void
+iscsit_post_staged_pdu(idm_pdu_t *rx_pdu);
+
+static void
+iscsit_post_scsi_cmd(idm_conn_t *ic, idm_pdu_t *rx_pdu);
+
 static void
 iscsit_op_scsi_task_mgmt(iscsit_conn_t *ict, idm_pdu_t *rx_pdu);
 
@@ -137,6 +160,9 @@
 
 int iscsit_cmd_window();
 
+static  int
+iscsit_sna_lt(uint32_t sn1, uint32_t sn2);
+
 void
 iscsit_set_cmdsn(iscsit_conn_t *ict, idm_pdu_t *rx_pdu);
 
@@ -216,6 +242,22 @@
 static void iscsit_send_task_mgmt_resp(idm_pdu_t *tm_resp_pdu,
     uint8_t tm_status);
 
+/*
+ * MC/S: Out-of-order commands are staged on a session-wide wait
+ * queue until a system-tunable threshold is reached. A separate
+ * thread is used to scan the staging queue on all the sessions.
+ * If a delayed PDU does not arrive within a timeout, the target
+ * will advance to the staged PDU that is next in sequence, skipping
+ * over the missing PDU(s) to go past a hole in the sequence.
+ */
+volatile int rxpdu_queue_threshold = ISCSIT_RXPDU_QUEUE_THRESHOLD;
+
+static kmutex_t		iscsit_rxpdu_queue_monitor_mutex;
+kthread_t		*iscsit_rxpdu_queue_monitor_thr_id;
+static kt_did_t		iscsit_rxpdu_queue_monitor_thr_did;
+static boolean_t	iscsit_rxpdu_queue_monitor_thr_running;
+static kcondvar_t	iscsit_rxpdu_queue_monitor_cv;
+
 int
 _init(void)
 {
@@ -226,6 +268,12 @@
 	    MUTEX_DRIVER, NULL);
 	iscsit_global.global_svc_state = ISE_DETACHED;
 
+	mutex_init(&iscsit_rxpdu_queue_monitor_mutex, NULL,
+	    MUTEX_DRIVER, NULL);
+	iscsit_rxpdu_queue_monitor_thr_id = NULL;
+	iscsit_rxpdu_queue_monitor_thr_running = B_FALSE;
+	cv_init(&iscsit_rxpdu_queue_monitor_cv, NULL, CV_DEFAULT, NULL);
+
 	if ((rc = mod_install(&modlinkage)) != 0) {
 		mutex_destroy(&iscsit_global.global_state_mutex);
 		rw_destroy(&iscsit_global.global_rwlock);
@@ -249,6 +297,8 @@
 	rc = mod_remove(&modlinkage);
 
 	if (rc == 0) {
+		mutex_destroy(&iscsit_rxpdu_queue_monitor_mutex);
+		cv_destroy(&iscsit_rxpdu_queue_monitor_cv);
 		mutex_destroy(&iscsit_global.global_state_mutex);
 		rw_destroy(&iscsit_global.global_rwlock);
 	}
@@ -692,6 +742,9 @@
 	iscsit_global.global_dispatch_taskq = taskq_create("iscsit_dispatch",
 	    1, minclsyspri, 16, 16, TASKQ_PREPOPULATE);
 
+	/* Scan staged PDUs, meaningful in MC/S situations */
+	iscsit_rxpdu_queue_monitor_start();
+
 	return (IDM_STATUS_SUCCESS);
 
 tear_down_and_return:
@@ -753,6 +806,8 @@
 
 	ASSERT(iscsit_global.global_svc_state == ISE_DISABLING);
 
+	iscsit_rxpdu_queue_monitor_stop();
+
 	/* tear down discovery sessions */
 	for (sess = avl_first(&iscsit_global.global_discovery_sessions);
 	    sess != NULL;
@@ -853,8 +908,10 @@
 		idm_conn_event(ic, CE_TRANSPORT_FAIL, NULL);
 		break;
 	case ISCSI_OP_SCSI_TASK_MGT_MSG:
-		iscsit_set_cmdsn(ict, rx_pdu);
-		iscsit_op_scsi_task_mgmt(ict, rx_pdu);
+		if (iscsit_check_cmdsn_and_queue(rx_pdu)) {
+			iscsit_set_cmdsn(ict, rx_pdu);
+			iscsit_op_scsi_task_mgmt(ict, rx_pdu);
+		}
 		break;
 	case ISCSI_OP_NOOP_OUT:
 	case ISCSI_OP_LOGIN_CMD:
@@ -1034,10 +1091,9 @@
 	iscsi_data_rsp_hdr_t *dh = (iscsi_data_rsp_hdr_t *)pdu->isp_hdr;
 
 	/*
-	 * We acquired iscsit_sess_t.ist_sn_rwlock in iscsit_xfer_scsi_data
-	 * in reader mode so we expect to be locked here
+	 * We acquired iscsit_sess_t.ist_sn_mutex in iscsit_xfer_scsi_data
 	 */
-
+	ASSERT(MUTEX_HELD(&itask->it_ict->ict_sess->ist_sn_mutex));
 	/*
 	 * Lun is only required if the opcode == ISCSI_OP_SCSI_DATA_RSP
 	 * and the 'A' bit is to be set
@@ -1252,12 +1308,37 @@
 static idm_status_t
 iscsit_conn_lost(idm_conn_t *ic)
 {
-	iscsit_conn_t *ict = ic->ic_handle;
+	iscsit_conn_t	*ict	= ic->ic_handle;
+	iscsit_sess_t	*ist	= ict->ict_sess;
+	iscsit_cbuf_t	*cbuf;
+	idm_pdu_t	*rx_pdu;
+	int i;
 
 	mutex_enter(&ict->ict_mutex);
 	ict->ict_lost = B_TRUE;
 	mutex_exit(&ict->ict_mutex);
-
+	/*
+	 * scrub the staging queue for all PDUs on this connection
+	 */
+	if (ist != NULL) {
+		mutex_enter(&ist->ist_sn_mutex);
+		for (cbuf = ist->ist_rxpdu_queue, i = 0;
+		    ((cbuf->cb_num_elems > 0) && (i < ISCSIT_RXPDU_QUEUE_LEN));
+		    i++) {
+			if (((rx_pdu = cbuf->cb_buffer[i]) != NULL) &&
+			    (rx_pdu->isp_ic == ic)) {
+				/* conn is lost, drop the pdu */
+				DTRACE_PROBE3(scrubbing__staging__queue,
+				    iscsit_sess_t *, ist, idm_conn_t *, ic,
+				    idm_pdu_t *, rx_pdu);
+				idm_pdu_complete(rx_pdu, IDM_STATUS_FAIL);
+				cbuf->cb_buffer[i] = NULL;
+				cbuf->cb_num_elems--;
+				iscsit_conn_dispatch_rele(ict);
+			}
+		}
+		mutex_exit(&ist->ist_sn_mutex);
+	}
 	/*
 	 * Make sure there aren't any PDU's transitioning from the receive
 	 * handler to the dispatch taskq.
@@ -1431,20 +1512,20 @@
 		 * access to the SN values.  We need to lock here to enforce
 		 * lock ordering
 		 */
-		rw_enter(&ict_sess->ist_sn_rwlock, RW_READER);
+		mutex_enter(&ict_sess->ist_sn_mutex);
 		idm_rc = idm_buf_tx_to_ini(iscsit_task->it_idm_task,
 		    ibuf->ibuf_idm_buf, dbuf->db_relative_offset,
 		    dbuf->db_data_size, &iscsit_buf_xfer_cb, dbuf);
-		rw_exit(&ict_sess->ist_sn_rwlock);
+		mutex_exit(&ict_sess->ist_sn_mutex);
 
 		return (iscsit_idm_to_stmf(idm_rc));
 	} else if (dbuf->db_flags & DB_DIRECTION_FROM_RPORT) {
 		/* Grab the SN lock (see comment above) */
-		rw_enter(&ict_sess->ist_sn_rwlock, RW_READER);
+		mutex_enter(&ict_sess->ist_sn_mutex);
 		idm_rc = idm_buf_rx_from_ini(iscsit_task->it_idm_task,
 		    ibuf->ibuf_idm_buf, dbuf->db_relative_offset,
 		    dbuf->db_data_size, &iscsit_buf_xfer_cb, dbuf);
-		rw_exit(&ict_sess->ist_sn_rwlock);
+		mutex_exit(&ict_sess->ist_sn_mutex);
 
 		return (iscsit_idm_to_stmf(idm_rc));
 	}
@@ -1821,13 +1902,23 @@
 	/*NOTREACHED*/
 }
 
+void
+iscsit_op_scsi_cmd(idm_conn_t *ic, idm_pdu_t *rx_pdu)
+{
+	iscsit_conn_t		*ict = ic->ic_handle;
+
+	if (iscsit_check_cmdsn_and_queue(rx_pdu)) {
+		iscsit_post_scsi_cmd(ic, rx_pdu);
+	}
+	iscsit_process_pdu_in_queue(ict->ict_sess);
+}
 
 /*
  * ISCSI protocol
  */
 
 void
-iscsit_op_scsi_cmd(idm_conn_t *ic, idm_pdu_t *rx_pdu)
+iscsit_post_scsi_cmd(idm_conn_t *ic, idm_pdu_t *rx_pdu)
 {
 	iscsit_conn_t		*ict;
 	iscsit_task_t		*itask;
@@ -1851,7 +1942,6 @@
 		return;
 	}
 
-
 	/*
 	 * Note CmdSN and ITT in task.  IDM will have already validated this
 	 * request against the connection state so we don't need to check
@@ -2038,7 +2128,6 @@
 		    uint32_t, ibuf->ibuf_stmf_buf->db_relative_offset,
 		    uint64_t, 0, uint32_t, 0, uint32_t, 0, /* no raddr */
 		    uint32_t, rx_pdu->isp_datalen, int, XFER_BUF_TX_TO_INI);
-
 		stmf_post_task(task, ibuf->ibuf_stmf_buf);
 	} else {
 
@@ -2085,25 +2174,39 @@
 static void
 iscsit_deferred(void *rx_pdu_void)
 {
-	idm_pdu_t *rx_pdu = rx_pdu_void;
-	idm_conn_t *ic = rx_pdu->isp_ic;
-	iscsit_conn_t *ict = ic->ic_handle;
-
+	idm_pdu_t		*rx_pdu = rx_pdu_void;
+	idm_conn_t		*ic = rx_pdu->isp_ic;
+	iscsit_conn_t		*ict = ic->ic_handle;
+
+	/*
+	 * NOP and Task Management Commands can be marked for immediate
+	 * delivery. Commands marked as 'Immediate' are to be considered
+	 * for execution as soon as they arrive on the target. So these
+	 * should not be checked for sequence order and put in a queue.
+	 * The CmdSN is not advanced for Immediate Commands.
+	 */
 	switch (IDM_PDU_OPCODE(rx_pdu)) {
 	case ISCSI_OP_NOOP_OUT:
-		iscsit_set_cmdsn(ict, rx_pdu);
-		iscsit_pdu_op_noop(ict, rx_pdu);
+		if (iscsit_check_cmdsn_and_queue(rx_pdu)) {
+			iscsit_set_cmdsn(ict, rx_pdu);
+			iscsit_pdu_op_noop(ict, rx_pdu);
+		}
 		break;
 	case ISCSI_OP_LOGIN_CMD:
 		iscsit_pdu_op_login_cmd(ict, rx_pdu);
-		break;
+		iscsit_conn_dispatch_rele(ict);
+		return;
 	case ISCSI_OP_TEXT_CMD:
-		iscsit_set_cmdsn(ict, rx_pdu);
-		iscsit_pdu_op_text_cmd(ict, rx_pdu);
+		if (iscsit_check_cmdsn_and_queue(rx_pdu)) {
+			iscsit_set_cmdsn(ict, rx_pdu);
+			iscsit_pdu_op_text_cmd(ict, rx_pdu);
+		}
 		break;
 	case ISCSI_OP_LOGOUT_CMD:
-		iscsit_set_cmdsn(ict, rx_pdu);
-		iscsit_pdu_op_logout_cmd(ict, rx_pdu);
+		if (iscsit_check_cmdsn_and_queue(rx_pdu)) {
+			iscsit_set_cmdsn(ict, rx_pdu);
+			iscsit_pdu_op_logout_cmd(ict, rx_pdu);
+		}
 		break;
 	default:
 		/* Protocol error.  IDM should have caught this */
@@ -2111,6 +2214,11 @@
 		ASSERT(0);
 		break;
 	}
+	/*
+	 * Check if there are other PDUs in the session staging queue
+	 * waiting to be posted to SCSI layer.
+	 */
+	iscsit_process_pdu_in_queue(ict->ict_sess);
 
 	iscsit_conn_dispatch_rele(ict);
 }
@@ -2240,14 +2348,20 @@
 			refcmdsn = ntohl(iscsi_tm->refcmdsn);
 
 			/*
-			 * Task was not found.  If RefCmdSN is within the CmdSN
-			 * window and less than CmdSN of the TM function, return
-			 * "Function Complete".  Otherwise, return
-			 * "Task Does Not Exist".
+			 * Task was not found. But the SCSI command could be
+			 * on the rxpdu wait queue. If RefCmdSN is within
+			 * the CmdSN window and less than CmdSN of the TM
+			 * function, return "Function Complete". Otherwise,
+			 * return "Task Does Not Exist".
 			 */
 
 			if (iscsit_cmdsn_in_window(ict, refcmdsn) &&
-			    (refcmdsn < cmdsn)) {
+			    iscsit_sna_lt(refcmdsn, cmdsn)) {
+				mutex_enter(&ict->ict_sess->ist_sn_mutex);
+				(void) iscsit_remove_pdu_from_queue(
+				    ict->ict_sess, refcmdsn);
+				iscsit_conn_dispatch_rele(ict);
+				mutex_exit(&ict->ict_sess->ist_sn_mutex);
 				iscsit_send_task_mgmt_resp(tm_resp_pdu,
 				    SCSI_TCP_TM_RESP_COMPLETE);
 			} else {
@@ -2473,8 +2587,12 @@
 int
 iscsit_cmd_window()
 {
-	/* Will be better later */
-	return	(1024);
+	/*
+	 * Instead of using a pre-defined constant for the command window,
+	 * it should be made configurable and dynamic. With MC/S, sequence
+	 * numbers will be used up at a much faster rate than with SC/S.
+	 */
+	return	(ISCSIT_MAX_WINDOW);
 }
 
 /*
@@ -2489,11 +2607,16 @@
 	ist = ict->ict_sess;
 
 	req = (iscsi_scsi_cmd_hdr_t *)rx_pdu->isp_hdr;
-
-	rw_enter(&ist->ist_sn_rwlock, RW_WRITER);
+	if (req->opcode & ISCSI_OP_IMMEDIATE) {
+		/* no cmdsn increment for immediate PDUs */
+		return;
+	}
+
+	/* Ensure that the ExpCmdSN advances in an orderly manner */
+	mutex_enter(&ist->ist_sn_mutex);
 	ist->ist_expcmdsn = ntohl(req->cmdsn) + 1;
 	ist->ist_maxcmdsn = ntohl(req->cmdsn) + iscsit_cmd_window();
-	rw_exit(&ist->ist_sn_rwlock);
+	mutex_exit(&ist->ist_sn_mutex);
 }
 
 /*
@@ -2509,16 +2632,16 @@
 	/*
 	 * The command sequence numbers are session-wide and must stay
 	 * consistent across the transfer, so protect the cmdsn with a
-	 * reader lock on the session. The status sequence number will
+	 * mutex lock on the session. The status sequence number will
 	 * be updated just before the transport layer transmits the PDU.
 	 */
 
-	rw_enter(&ict->ict_sess->ist_sn_rwlock, RW_READER);
+	mutex_enter(&ict->ict_sess->ist_sn_mutex);
 	/* Set ExpCmdSN and MaxCmdSN */
 	rsp->maxcmdsn = htonl(ist->ist_maxcmdsn);
 	rsp->expcmdsn = htonl(ist->ist_expcmdsn);
 	idm_pdu_tx(pdu);
-	rw_exit(&ict->ict_sess->ist_sn_rwlock);
+	mutex_exit(&ict->ict_sess->ist_sn_mutex);
 }
 
 /*
@@ -2916,7 +3039,7 @@
 
 	ist = ict->ict_sess;
 
-	rw_enter(&ist->ist_sn_rwlock, RW_READER);
+	mutex_enter(&ist->ist_sn_mutex);
 
 	/*
 	 * If cmdsn is less than ist_expcmdsn - iscsit_cmd_window() or
@@ -2928,7 +3051,335 @@
 		rval = B_FALSE;
 	}
 
-	rw_exit(&ist->ist_sn_rwlock);
+	mutex_exit(&ist->ist_sn_mutex);
 
 	return (rval);
 }
+
+/*
+ * iscsit_check_cmdsn_and_queue
+ *
+ * Independent of the order in which the iSCSI target receives non-immediate
+ * command PDUs across the entire session and any multiple connections within
+ * the session, the target must deliver the commands to the SCSI layer in
+ * CmdSN order. So out-of-order non-immediate commands are queued up on a
+ * session-wide wait queue. Duplicate commands are ignored.
+ *
+ */
+static int
+iscsit_check_cmdsn_and_queue(idm_pdu_t *rx_pdu)
+{
+	idm_conn_t		*ic = rx_pdu->isp_ic;
+	iscsit_conn_t		*ict = ic->ic_handle;
+	iscsit_sess_t		*ist = ict->ict_sess;
+	iscsi_scsi_cmd_hdr_t	*hdr = (iscsi_scsi_cmd_hdr_t *)rx_pdu->isp_hdr;
+
+	mutex_enter(&ist->ist_sn_mutex);
+	if (hdr->opcode & ISCSI_OP_IMMEDIATE) {
+		/* do not queue, handle it immediately */
+		DTRACE_PROBE2(immediate__cmd, iscsit_sess_t *, ist,
+		    idm_pdu_t *, rx_pdu);
+		mutex_exit(&ist->ist_sn_mutex);
+		return (ISCSIT_CMDSN_EQ_EXPCMDSN);
+	}
+	if (iscsit_sna_lt(ist->ist_expcmdsn, ntohl(hdr->cmdsn))) {
+		/*
+		 * Out-of-order commands (cmdSN higher than ExpCmdSN)
+		 * are staged on a fixed-size circular buffer until
+		 * the missing command is delivered to the SCSI layer.
+		 * Irrespective of the order of insertion into the
+		 * staging queue, the commands are processed out of the
+		 * queue in cmdSN order only.
+		 */
+		rx_pdu->isp_queue_time = ddi_get_time();
+		iscsit_add_pdu_to_queue(ist, rx_pdu);
+		mutex_exit(&ist->ist_sn_mutex);
+		return (ISCSIT_CMDSN_GT_EXPCMDSN);
+	} else if (iscsit_sna_lt(ntohl(hdr->cmdsn), ist->ist_expcmdsn)) {
+		DTRACE_PROBE3(cmdsn__lt__expcmdsn, iscsit_sess_t *, ist,
+		    iscsit_conn_t *, ict, idm_pdu_t *, rx_pdu);
+		mutex_exit(&ist->ist_sn_mutex);
+		return (ISCSIT_CMDSN_LT_EXPCMDSN);
+	} else {
+		mutex_exit(&ist->ist_sn_mutex);
+		return (ISCSIT_CMDSN_EQ_EXPCMDSN);
+	}
+}
+
+/*
+ * iscsit_add_pdu_to_queue() adds PDUs into the array indexed by
+ * their cmdsn value. The length of the array is kept above the
+ * maximum window size. The window keeps the cmdsn within a range
+ * such that there are no collisions, i.e. the assumption is that
+ * the windowing checks make it impossible to receive PDUs that
+ * index into the same location in the array.
+ */
+static void
+iscsit_add_pdu_to_queue(iscsit_sess_t *ist, idm_pdu_t *rx_pdu)
+{
+	iscsit_cbuf_t	*cbuf	= ist->ist_rxpdu_queue;
+	iscsit_conn_t	*ict 	= rx_pdu->isp_ic->ic_handle;
+	uint32_t	cmdsn	=
+	    ((iscsi_scsi_cmd_hdr_t *)rx_pdu->isp_hdr)->cmdsn;
+	uint32_t	index;
+
+	ASSERT(MUTEX_HELD(&ist->ist_sn_mutex));
+	/*
+	 * If the connection is being torn down, then
+	 * don't add the PDU to the staging queue
+	 */
+	mutex_enter(&ict->ict_mutex);
+	if (ict->ict_lost) {
+		mutex_exit(&ict->ict_mutex);
+		idm_pdu_complete(rx_pdu, IDM_STATUS_FAIL);
+		return;
+	}
+	iscsit_conn_dispatch_hold(ict);
+	mutex_exit(&ict->ict_mutex);
+
+	index = ntohl(cmdsn) % ISCSIT_RXPDU_QUEUE_LEN;
+	ASSERT(cbuf->cb_buffer[index] == NULL);
+	cbuf->cb_buffer[index] = rx_pdu;
+	cbuf->cb_num_elems++;
+}
+
+static idm_pdu_t *
+iscsit_remove_pdu_from_queue(iscsit_sess_t *ist, uint32_t cmdsn)
+{
+	iscsit_cbuf_t	*cbuf	= ist->ist_rxpdu_queue;
+	idm_pdu_t	*pdu	= NULL;
+	uint32_t	index;
+
+	ASSERT(MUTEX_HELD(&ist->ist_sn_mutex));
+	index = cmdsn % ISCSIT_RXPDU_QUEUE_LEN;
+	if ((pdu = cbuf->cb_buffer[index]) != NULL) {
+		ASSERT(cmdsn ==
+		    ntohl(((iscsi_scsi_cmd_hdr_t *)pdu->isp_hdr)->cmdsn));
+		cbuf->cb_buffer[index] = NULL;
+		cbuf->cb_num_elems--;
+		return (pdu);
+	}
+	return (NULL);
+}
+
+/*
+ * iscsit_process_pdu_in_queue() finds the next pdu in sequence
+ * and posts it to the SCSI layer
+ */
+static void
+iscsit_process_pdu_in_queue(iscsit_sess_t *ist)
+{
+	iscsit_cbuf_t	*cbuf	= ist->ist_rxpdu_queue;
+	idm_pdu_t	*pdu = NULL;
+	uint32_t	expcmdsn;
+
+	for (;;) {
+		mutex_enter(&ist->ist_sn_mutex);
+		if (cbuf->cb_num_elems == 0) {
+			mutex_exit(&ist->ist_sn_mutex);
+			break;
+		}
+		expcmdsn = ist->ist_expcmdsn;
+		if ((pdu = iscsit_remove_pdu_from_queue(ist, expcmdsn))
+		    == NULL) {
+			mutex_exit(&ist->ist_sn_mutex);
+			break;
+		}
+		mutex_exit(&ist->ist_sn_mutex);
+		iscsit_post_staged_pdu(pdu);
+	}
+}
+
+static void
+iscsit_post_staged_pdu(idm_pdu_t *rx_pdu)
+{
+	iscsit_conn_t	*ict	= rx_pdu->isp_ic->ic_handle;
+
+	/* Post the PDU to the SCSI layer */
+	switch (IDM_PDU_OPCODE(rx_pdu)) {
+	case ISCSI_OP_NOOP_OUT:
+		iscsit_set_cmdsn(ict, rx_pdu);
+		iscsit_pdu_op_noop(ict, rx_pdu);
+		break;
+	case ISCSI_OP_TEXT_CMD:
+		iscsit_set_cmdsn(ict, rx_pdu);
+		iscsit_pdu_op_text_cmd(ict, rx_pdu);
+		break;
+	case ISCSI_OP_SCSI_TASK_MGT_MSG:
+		iscsit_set_cmdsn(ict, rx_pdu);
+		iscsit_op_scsi_task_mgmt(ict, rx_pdu);
+		break;
+	case ISCSI_OP_SCSI_CMD:
+		/* cmdSN will be incremented after creating itask */
+		iscsit_post_scsi_cmd(rx_pdu->isp_ic, rx_pdu);
+		break;
+	case ISCSI_OP_LOGOUT_CMD:
+		iscsit_set_cmdsn(ict, rx_pdu);
+		iscsit_pdu_op_logout_cmd(ict, rx_pdu);
+		break;
+	default:
+		/* No other PDUs should be placed on the queue */
+		ASSERT(0);
+	}
+	iscsit_conn_dispatch_rele(ict); /* release hold on the conn */
+}
+
+/* ARGSUSED */
+void
+iscsit_rxpdu_queue_monitor_start(void)
+{
+	mutex_enter(&iscsit_rxpdu_queue_monitor_mutex);
+	if (iscsit_rxpdu_queue_monitor_thr_running) {
+		mutex_exit(&iscsit_rxpdu_queue_monitor_mutex);
+		return;
+	}
+	iscsit_rxpdu_queue_monitor_thr_id =
+	    thread_create(NULL, 0, iscsit_rxpdu_queue_monitor, NULL,
+	    0, &p0, TS_RUN, minclsyspri);
+	while (!iscsit_rxpdu_queue_monitor_thr_running) {
+		cv_wait(&iscsit_rxpdu_queue_monitor_cv,
+		    &iscsit_rxpdu_queue_monitor_mutex);
+	}
+	mutex_exit(&iscsit_rxpdu_queue_monitor_mutex);
+
+}
+
+/* ARGSUSED */
+void
+iscsit_rxpdu_queue_monitor_stop(void)
+{
+	mutex_enter(&iscsit_rxpdu_queue_monitor_mutex);
+	if (iscsit_rxpdu_queue_monitor_thr_running) {
+		iscsit_rxpdu_queue_monitor_thr_running = B_FALSE;
+		cv_signal(&iscsit_rxpdu_queue_monitor_cv);
+		mutex_exit(&iscsit_rxpdu_queue_monitor_mutex);
+
+		thread_join(iscsit_rxpdu_queue_monitor_thr_did);
+		return;
+	}
+	mutex_exit(&iscsit_rxpdu_queue_monitor_mutex);
+}
+
+/*
+ * A separate thread is used to scan the staging queue on all the
+ * sessions. If a delayed PDU does not arrive within a timeout, the
+ * target will advance to the staged PDU that is next in sequence
+ * and exceeded the threshold wait time. It is up to the initiator
+ * to note that the target has not acknowledged a particular cmdsn
+ * and take appropriate action.
+ */
+/* ARGSUSED */
+static void
+iscsit_rxpdu_queue_monitor(void *arg)
+{
+	iscsit_tgt_t	*tgt;
+	iscsit_sess_t	*ist;
+
+	mutex_enter(&iscsit_rxpdu_queue_monitor_mutex);
+	iscsit_rxpdu_queue_monitor_thr_did = curthread->t_did;
+	iscsit_rxpdu_queue_monitor_thr_running = B_TRUE;
+	cv_signal(&iscsit_rxpdu_queue_monitor_cv);
+
+	while (iscsit_rxpdu_queue_monitor_thr_running) {
+		ISCSIT_GLOBAL_LOCK(RW_READER);
+		for (tgt = avl_first(&iscsit_global.global_target_list);
+		    tgt != NULL;
+		    tgt = AVL_NEXT(&iscsit_global.global_target_list, tgt)) {
+			mutex_enter(&tgt->target_mutex);
+			for (ist = avl_first(&tgt->target_sess_list);
+			    ist != NULL;
+			    ist = AVL_NEXT(&tgt->target_sess_list, ist)) {
+
+				iscsit_rxpdu_queue_monitor_session(ist);
+			}
+			mutex_exit(&tgt->target_mutex);
+		}
+		ISCSIT_GLOBAL_UNLOCK();
+		if (iscsit_rxpdu_queue_monitor_thr_running == B_FALSE) {
+			break;
+		}
+		(void) cv_reltimedwait(&iscsit_rxpdu_queue_monitor_cv,
+		    &iscsit_rxpdu_queue_monitor_mutex,
+		    ISCSIT_RXPDU_QUEUE_MONITOR_INTERVAL * drv_usectohz(1000000),
+		    TR_CLOCK_TICK);
+	}
+	mutex_exit(&iscsit_rxpdu_queue_monitor_mutex);
+	thread_exit();
+}
+
+static void
+iscsit_rxpdu_queue_monitor_session(iscsit_sess_t *ist)
+{
+	iscsit_cbuf_t	*cbuf	= ist->ist_rxpdu_queue;
+	idm_pdu_t	*next_pdu = NULL;
+	uint32_t	index, next_cmdsn, i;
+
+	/*
+	 * Assume that all PDUs in the staging queue have a cmdsn >= expcmdsn.
+	 * Starting with the expcmdsn, iterate over the staged PDUs to find
+	 * the next PDU with a wait time greater than the threshold. If found
+	 * advance the staged PDU to the SCSI layer, skipping over the missing
+	 * PDU(s) to get past the hole in the command sequence. It is up to
+	 * the initiator to note that the target has not acknowledged a cmdsn
+	 * and take appropriate action.
+	 *
+	 * Since the PDU(s) arrive in any random order, it is possible that
+	 * the actual wait time for a particular PDU is much longer than
+	 * the defined threshold. e.g. Consider a case where commands are sent
+	 * over 4 different connections, and cmdsn = 1004 arrives first, then
+	 * 1003, and 1002 and 1001 are lost due to a connection failure.
+	 * So now 1003 is waiting for 1002 to be delivered, and although the
+	 * wait time of 1004 > wait time of 1003, only 1003 will be considered
+	 * by the monitor thread. 1004 will be automatically processed by
+	 * iscsit_process_pdu_in_queue() once the scan is complete and the
+	 * expcmdsn becomes current.
+	 */
+	mutex_enter(&ist->ist_sn_mutex);
+	cbuf = ist->ist_rxpdu_queue;
+	if (cbuf->cb_num_elems == 0) {
+		mutex_exit(&ist->ist_sn_mutex);
+		return;
+	}
+	for (next_pdu = NULL, i = 0; ; i++) {
+		next_cmdsn = ist->ist_expcmdsn + i; /* start at expcmdsn */
+		index = next_cmdsn % ISCSIT_RXPDU_QUEUE_LEN;
+		if ((next_pdu = cbuf->cb_buffer[index]) != NULL) {
+			/*
+			 * If the PDU wait time has not exceeded threshold
+			 * stop scanning the staging queue until the timer
+			 * fires again
+			 */
+			if ((ddi_get_time() - next_pdu->isp_queue_time)
+			    < rxpdu_queue_threshold) {
+				mutex_exit(&ist->ist_sn_mutex);
+				return;
+			}
+			/*
+			 * Remove the next PDU from the queue and post it
+			 * to the SCSI layer, skipping over the missing
+			 * PDU. Stop scanning the staging queue until
+			 * the monitor timer fires again
+			 */
+			(void) iscsit_remove_pdu_from_queue(ist, next_cmdsn);
+			mutex_exit(&ist->ist_sn_mutex);
+			DTRACE_PROBE3(advanced__to__blocked__cmdsn,
+			    iscsit_sess_t *, ist, idm_pdu_t *, next_pdu,
+			    uint32_t, next_cmdsn);
+			iscsit_post_staged_pdu(next_pdu);
+			/* Deliver any subsequent PDUs immediately */
+			iscsit_process_pdu_in_queue(ist);
+			return;
+		}
+		/*
+		 * Skipping over i PDUs, e.g. a case where commands 1001 and
+		 * 1002 are lost in the network, skip over both and post 1003;
+		 * expcmdsn then becomes 1004 at the end of the scan.
+		 */
+		DTRACE_PROBE2(skipping__over__cmdsn, iscsit_sess_t *, ist,
+		    uint32_t, next_cmdsn);
+	}
+	/*
+	 * following the assumption, staged cmdsn >= expcmdsn, this statement
+	 * is never reached.
+	 */
+}
--- a/usr/src/uts/common/io/comstar/port/iscsit/iscsit.h	Mon May 10 15:01:49 2010 -0400
+++ b/usr/src/uts/common/io/comstar/port/iscsit/iscsit.h	Mon May 10 17:03:07 2010 -0400
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 #ifndef _ISCSIT_H_
 #define	_ISCSIT_H_
@@ -36,7 +35,7 @@
  */
 #define	ISCSIT_MIN_VERSION			0x00
 #define	ISCSIT_MAX_VERSION			0x00
-#define	ISCSIT_MAX_CONNECTIONS			1 /* No MC/S support */
+#define	ISCSIT_MAX_CONNECTIONS			32 /* MC/S support  */
 #define	ISCSIT_MAX_RECV_DATA_SEGMENT_LENGTH	(32*1024)
 #define	ISCSIT_MAX_BURST_LENGTH			(512*1024)
 #define	ISCSIT_MAX_FIRST_BURST_LENGTH		ISCSI_DEFAULT_FIRST_BURST_LENGTH
@@ -55,12 +54,34 @@
 /* Max targets per system */
 #define	ISCSIT_MAX_TARGETS	1024
 
+#define	ISCSIT_MAX_WINDOW	1024
+#define	ISCSIT_RXPDU_QUEUE_LEN	2048
+
+#define	ISCSIT_CMDSN_LT_EXPCMDSN	-1
+#define	ISCSIT_CMDSN_EQ_EXPCMDSN	1
+#define	ISCSIT_CMDSN_GT_EXPCMDSN	0
+/*
+ * MC/S: A timeout is maintained to recover from lost CmdSN (holes in the
+ * CmdSN ordering). When the timeout is reached, the ExpCmdSN is advanced
+ * past the hole to continue processing the queued commands. This value is
+ * system-tunable (volatile rxpdu_queue_threshold) and should be in the
+ * range from 5 to 30 seconds.
+ */
+#define	ISCSIT_RXPDU_QUEUE_THRESHOLD		5	/* 5 seconds */
+#define	ISCSIT_RXPDU_QUEUE_MONITOR_INTERVAL	5	/* 5 seconds */
+
 /* Time in seconds to wait between calls to stmf_deregister_local_port */
 #define	TGT_DEREG_RETRY_SECONDS	1
 
 #define	ISCSIT_GLOBAL_LOCK(rw) rw_enter(&iscsit_global.global_rwlock, (rw))
 #define	ISCSIT_GLOBAL_UNLOCK() rw_exit(&iscsit_global.global_rwlock)
 
+/* Circular buffer to hold the out-of-order PDUs in MC/S */
+typedef struct {
+	idm_pdu_t	*cb_buffer[ISCSIT_RXPDU_QUEUE_LEN];
+	int		cb_num_elems;
+} iscsit_cbuf_t;
+
 /*
  * Used for serial number arithmetic (RFC 1982)
  */
@@ -337,7 +358,7 @@
 	iscsit_tgt_t		*ist_tgt;
 	idm_refcnt_t		ist_refcnt;
 	kmem_cache_t		*ist_task_cache;
-	krwlock_t		ist_sn_rwlock;
+	kmutex_t		ist_sn_mutex;
 	kmutex_t		ist_mutex;
 	kcondvar_t		ist_cv;
 	iscsit_session_state_t	ist_state;
@@ -363,6 +384,7 @@
 	uint32_t		ist_expcmdsn;
 	uint32_t		ist_maxcmdsn;
 	avl_tree_t		ist_task_list;
+	iscsit_cbuf_t		*ist_rxpdu_queue;
 } iscsit_sess_t;
 
 /* Update iscsit_ils_name table whenever login states are modified */
@@ -822,4 +844,10 @@
     unsigned int chap_i, uchar_t *chap_c, unsigned int challenge_len,
     uchar_t *chap_r, unsigned int resp_len);
 
+void
+iscsit_rxpdu_queue_monitor_start(void);
+
+void
+iscsit_rxpdu_queue_monitor_stop(void);
+
 #endif /* _ISCSIT_H_ */
--- a/usr/src/uts/common/io/comstar/port/iscsit/iscsit_sess.c	Mon May 10 15:01:49 2010 -0400
+++ b/usr/src/uts/common/io/comstar/port/iscsit/iscsit_sess.c	Mon May 10 17:03:07 2010 -0400
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/cpuvar.h>
@@ -139,7 +138,7 @@
 	}
 
 	idm_sm_audit_init(&result->ist_state_audit);
-	rw_init(&result->ist_sn_rwlock, NULL, RW_DRIVER, NULL);
+	mutex_init(&result->ist_sn_mutex, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&result->ist_mutex, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&result->ist_cv, NULL, CV_DEFAULT, NULL);
 	list_create(&result->ist_events, sizeof (sess_event_ctx_t),
@@ -148,7 +147,7 @@
 	    offsetof(iscsit_conn_t, ict_sess_ln));
 	avl_create(&result->ist_task_list, iscsit_task_itt_compare,
 	    sizeof (iscsit_task_t), offsetof(iscsit_task_t, it_sess_ln));
-
+	result->ist_rxpdu_queue = kmem_zalloc(sizeof (iscsit_cbuf_t), KM_SLEEP);
 	result->ist_state = SS_Q1_FREE;
 	result->ist_last_state = SS_Q1_FREE;
 	bcopy(isid, result->ist_isid, ISCSI_ISID_LEN);
@@ -252,11 +251,12 @@
 		kmem_free(ist->ist_target_alias,
 		    strlen(ist->ist_target_alias) + 1);
 	avl_destroy(&ist->ist_task_list);
+	kmem_free(ist->ist_rxpdu_queue, sizeof (iscsit_cbuf_t));
 	list_destroy(&ist->ist_conn_list);
 	list_destroy(&ist->ist_events);
 	cv_destroy(&ist->ist_cv);
 	mutex_destroy(&ist->ist_mutex);
-	rw_destroy(&ist->ist_sn_rwlock);
+	mutex_destroy(&ist->ist_sn_mutex);
 	kmem_free(ist, sizeof (*ist));
 }
 
--- a/usr/src/uts/common/io/idm/idm.c	Mon May 10 15:01:49 2010 -0400
+++ b/usr/src/uts/common/io/idm/idm.c	Mon May 10 17:03:07 2010 -0400
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/cpuvar.h>
@@ -54,7 +53,6 @@
 	MODREV_1, (void *)&modlmisc, NULL
 };
 
-extern int idm_task_compare(const void *t1, const void *t2);
 extern void idm_wd_thread(void *arg);
 
 static int _idm_init(void);
--- a/usr/src/uts/common/io/idm/idm_conn_sm.c	Mon May 10 15:01:49 2010 -0400
+++ b/usr/src/uts/common/io/idm/idm_conn_sm.c	Mon May 10 17:03:07 2010 -0400
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/cpuvar.h>
@@ -633,6 +632,8 @@
 idm_state_s5_logged_in(idm_conn_t *ic, idm_conn_event_ctx_t *event_ctx)
 {
 	switch (event_ctx->iec_event) {
+	case CE_MISC_RX:
+		/* MC/S: when removing the non-leading connection */
 	case CE_LOGOUT_THIS_CONN_RCV:
 	case CE_LOGOUT_THIS_CONN_SND:
 	case CE_LOGOUT_OTHER_CONN_RCV:
@@ -675,7 +676,6 @@
 		idm_update_state(ic, CS_S8_CLEANUP, event_ctx);
 		break;
 	case CE_MISC_TX:
-	case CE_MISC_RX:
 	case CE_TX_PROTOCOL_ERROR:
 	case CE_RX_PROTOCOL_ERROR:
 	case CE_LOGIN_TIMEOUT:
--- a/usr/src/uts/common/sys/idm/idm_impl.h	Mon May 10 15:01:49 2010 -0400
+++ b/usr/src/uts/common/sys/idm/idm_impl.h	Mon May 10 17:03:07 2010 -0400
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 #ifndef	_IDM_IMPL_H_
 #define	_IDM_IMPL_H_
@@ -373,6 +372,7 @@
 	uint32_t	isp_flags;
 	uint_t		isp_hdrbuflen;
 	uint_t		isp_databuflen;
+	time_t		isp_queue_time;
 } idm_pdu_t;
 
 /*
@@ -522,8 +522,6 @@
 
 void idm_listbuf_insert(list_t *lst, idm_buf_t *buf);
 
-int idm_task_compare(const void *v1, const void *v2);
-
 idm_conn_t *idm_lookup_conn(uint8_t *isid, uint16_t tsih, uint16_t cid);
 
 #ifdef	__cplusplus