6909809 COMSTAR should avoid extra data copy to zvol-based backing store
author James Moore <James.Moore@Sun.COM>
Wed, 05 May 2010 10:23:23 -0700
changeset 12314 0ed71edeac88
parent 12313 c894fe63f90f
child 12315 e8f2d0caeb31
6909809 COMSTAR should avoid extra data copy to zvol-based backing store
6931076 COMSTAR qlt driver should be able to dynamically map passed-in buffers
6933737 When QLT is loaded, fcinfo reports supported speeds returns 1g for the 8Gb HBA's
6912734 qlt: qlt emits spurious warnings during init on debug kernels
usr/src/uts/common/Makefile.files
usr/src/uts/common/fs/zfs/sys/zvol.h
usr/src/uts/common/fs/zfs/zvol.c
usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd.c
usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_impl.h
usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_scsi.c
usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c
usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h
usr/src/uts/common/io/comstar/port/fct/fct.c
usr/src/uts/common/io/comstar/port/qlt/qlt.c
usr/src/uts/common/io/comstar/port/qlt/qlt.h
usr/src/uts/common/io/comstar/port/qlt/qlt_dma.c
usr/src/uts/common/io/comstar/port/qlt/qlt_dma.h
usr/src/uts/common/io/comstar/port/qlt/qlt_open.h
usr/src/uts/common/io/comstar/stmf/stmf.c
usr/src/uts/common/sys/fct.h
usr/src/uts/common/sys/lpif.h
usr/src/uts/common/sys/portif.h
usr/src/uts/common/sys/stmf.h
usr/src/uts/intel/stmf_sbd/Makefile
usr/src/uts/sparc/stmf_sbd/Makefile
--- a/usr/src/uts/common/Makefile.files	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/Makefile.files	Wed May 05 10:23:23 2010 -0700
@@ -969,7 +969,7 @@
 
 STMF_OBJS += lun_map.o stmf.o
 
-STMF_SBD_OBJS += sbd.o sbd_scsi.o sbd_pgr.o
+STMF_SBD_OBJS += sbd.o sbd_scsi.o sbd_pgr.o sbd_zvol.o
 
 SYSMSG_OBJS +=	sysmsg.o
 
--- a/usr/src/uts/common/fs/zfs/sys/zvol.h	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/fs/zfs/sys/zvol.h	Wed May 05 10:23:23 2010 -0700
@@ -20,8 +20,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_ZVOL_H
@@ -59,6 +58,15 @@
 extern int zvol_busy(void);
 extern void zvol_init(void);
 extern void zvol_fini(void);
+
+extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize,
+    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+    void **rl_hdl, void **bonus_hdl);
+extern uint64_t zvol_get_volume_size(void *minor_hdl);
+extern int zvol_get_volume_wce(void *minor_hdl);
+extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off,
+    ssize_t resid, boolean_t sync);
+
 #endif
 
 #ifdef	__cplusplus
--- a/usr/src/uts/common/fs/zfs/zvol.c	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/fs/zfs/zvol.c	Wed May 05 10:23:23 2010 -0700
@@ -1458,6 +1458,79 @@
 }
 
 /*
+ * BEGIN entry points to allow external callers access to the volume.
+ */
+/*
+ * Return the volume parameters needed for access from an external caller.
+ * These values are invariant as long as the volume is held open.
+ */
+int
+zvol_get_volume_params(minor_t minor, uint64_t *blksize,
+    uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
+    void **rl_hdl, void **bonus_hdl)
+{
+	zvol_state_t *zv;
+
+	if (minor == 0)
+		return (ENXIO);
+	if ((zv = ddi_get_soft_state(zvol_state, minor)) == NULL)
+		return (ENXIO);
+	if (zv->zv_flags & ZVOL_DUMPIFIED)
+		return (ENXIO);
+
+	ASSERT(blksize && max_xfer_len && minor_hdl &&
+	    objset_hdl && zil_hdl && rl_hdl && bonus_hdl);
+
+	*blksize = zv->zv_volblocksize;
+	*max_xfer_len = (uint64_t)zvol_maxphys;
+	*minor_hdl = zv;
+	*objset_hdl = zv->zv_objset;
+	*zil_hdl = zv->zv_zilog;
+	*rl_hdl = &zv->zv_znode;
+	*bonus_hdl = zv->zv_dbuf;
+	return (0);
+}
+
+/*
+ * Return the current volume size to an external caller.
+ * The size can change while the volume is open.
+ */
+uint64_t
+zvol_get_volume_size(void *minor_hdl)
+{
+	zvol_state_t *zv = minor_hdl;
+
+	return (zv->zv_volsize);
+}
+
+/*
+ * Return the current WCE setting to an external caller.
+ * The WCE setting can change while the volume is open.
+ */
+int
+zvol_get_volume_wce(void *minor_hdl)
+{
+	zvol_state_t *zv = minor_hdl;
+
+	return ((zv->zv_flags & ZVOL_WCE) ? 1 : 0);
+}
+
+/*
+ * Entry point for external callers to zvol_log_write
+ */
+void
+zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off, ssize_t resid,
+    boolean_t sync)
+{
+	zvol_state_t *zv = minor_hdl;
+
+	zvol_log_write(zv, tx, off, resid, sync);
+}
+/*
+ * END entry points to allow external callers access to the volume.
+ */
+
+/*
  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
  */
 /*ARGSUSED*/
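
The consumer of these new entry points is sbd_zvol.c, added later in this
changeset. A minimal sketch of how an external kernel caller uses them
(hypothetical function and variable names, not part of the changeset):

/* Sketch only: query a held-open zvol through the new entry points. */
static int
example_query_zvol(minor_t minor, uint64_t *volsize, int *wce)
{
	uint64_t blksize, max_xfer_len;
	void *minor_hdl, *objset_hdl, *zil_hdl, *rl_hdl, *bonus_hdl;
	int ret;

	/* Invariant parameters and opaque handles, valid while the zvol stays open */
	ret = zvol_get_volume_params(minor, &blksize, &max_xfer_len,
	    &minor_hdl, &objset_hdl, &zil_hdl, &rl_hdl, &bonus_hdl);
	if (ret != 0)
		return (ret);

	/* Volume size and WCE can change while open, so fetch them per operation */
	*volsize = zvol_get_volume_size(minor_hdl);
	*wce = zvol_get_volume_wce(minor_hdl);
	return (0);
}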
--- a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd.c	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd.c	Wed May 05 10:23:23 2010 -0700
@@ -1405,6 +1405,7 @@
 	lu->lu_send_status_done = sbd_send_status_done;
 	lu->lu_task_free = sbd_task_free;
 	lu->lu_abort = sbd_abort;
+	lu->lu_dbuf_free = sbd_dbuf_free;
 	lu->lu_ctl = sbd_ctl;
 	lu->lu_info = sbd_info;
 	sl->sl_state = STMF_STATE_OFFLINE;
@@ -1432,6 +1433,8 @@
 	uint64_t supported_size;
 	vattr_t vattr;
 	enum vtype vt;
+	struct dk_cinfo dki;
+	int unused;
 
 	mutex_enter(&sl->sl_lock);
 	if (vp_valid) {
@@ -1483,6 +1486,7 @@
 	}
 	/* sl_data_readable size includes any metadata. */
 	sl->sl_data_readable_size = vattr.va_size;
+
 	if (VOP_PATHCONF(sl->sl_data_vp, _PC_FILESIZEBITS, &nbits,
 	    CRED(), NULL) != 0) {
 		nbits = 0;
@@ -1532,6 +1536,21 @@
 		ret = EINVAL;
 		goto odf_close_data_and_exit;
 	}
+	/*
+	 * Get the minor device for direct zvol access
+	 */
+	if (sl->sl_flags & SL_ZFS_META) {
+		if ((ret = VOP_IOCTL(sl->sl_data_vp, DKIOCINFO, (intptr_t)&dki,
+		    FKIOCTL, kcred, &unused, NULL)) != 0) {
+			cmn_err(CE_WARN, "ioctl(DKIOCINFO) failed %d", ret);
+			/* zvol reserves 0, so this would fail later */
+			sl->sl_zvol_minor = 0;
+		} else {
+			sl->sl_zvol_minor = dki.dki_unit;
+			if (sbd_zvol_get_volume_params(sl) == 0)
+				sl->sl_flags |= SL_CALL_ZVOL;
+		}
+	}
 	sl->sl_flags |= SL_MEDIA_LOADED;
 	mutex_exit(&sl->sl_lock);
 	return (0);
--- a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_impl.h	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_impl.h	Wed May 05 10:23:23 2010 -0700
@@ -254,6 +254,40 @@
  */
 #define	SBD_IT_HAS_SCSI2_RESERVATION	0x0001
 
+/*
+ * dbuf private data needed for direct zvol data transfers
+ *
+ * To further isolate the zvol knowledge, the object handles
+ * needed to call into zfs are declared void * here.
+ */
+
+typedef struct sbd_zvol_io {
+	uint64_t	zvio_offset;	/* offset into volume */
+	int		zvio_flags;	/* flags */
+	void 		*zvio_dbp;	/* array of dmu buffers */
+	void		*zvio_abp;	/* array of arc buffers */
+	uio_t		*zvio_uio;	/* for copy operations */
+} sbd_zvol_io_t;
+
+#define	ZVIO_DEFAULT	0
+#define	ZVIO_COMMIT	1
+#define	ZVIO_ABORT	2
+#define	ZVIO_SYNC	4
+#define	ZVIO_ASYNC	8
+
+/*
+ * zvol data path functions
+ */
+int sbd_zvol_get_volume_params(sbd_lu_t *sl);
+uint32_t sbd_zvol_numsegs(sbd_lu_t *sl, uint64_t off, uint32_t len);
+int sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf);
+void sbd_zvol_rele_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf);
+int sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf);
+void sbd_zvol_rele_write_bufs_abort(sbd_lu_t *sl, stmf_data_buf_t *dbuf);
+int sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf);
+int sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio);
+int sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags);
+
 stmf_status_t sbd_task_alloc(struct scsi_task *task);
 void sbd_new_task(struct scsi_task *task, struct stmf_data_buf *initial_dbuf);
 void sbd_dbuf_xfer_done(struct scsi_task *task, struct stmf_data_buf *dbuf);
@@ -261,6 +295,7 @@
 void sbd_task_free(struct scsi_task *task);
 stmf_status_t sbd_abort(struct stmf_lu *lu, int abort_cmd, void *arg,
 							uint32_t flags);
+void sbd_dbuf_free(struct scsi_task *task, struct stmf_data_buf *dbuf);
 void sbd_ctl(struct stmf_lu *lu, int cmd, void *arg);
 stmf_status_t sbd_info(uint32_t cmd, stmf_lu_t *lu, void *arg,
 				uint8_t *buf, uint32_t *bufsizep);
--- a/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_scsi.c	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_scsi.c	Wed May 05 10:23:23 2010 -0700
@@ -92,8 +92,6 @@
     stmf_data_buf_t *dbuf);
 void sbd_handle_short_write_transfers(scsi_task_t *task,
     stmf_data_buf_t *dbuf, uint32_t cdb_xfer_size);
-static void sbd_handle_sync_cache(struct scsi_task *task,
-    struct stmf_data_buf *initial_dbuf);
 void sbd_handle_mode_select_xfer(scsi_task_t *task, uint8_t *buf,
     uint32_t buflen);
 void sbd_handle_mode_select(scsi_task_t *task, stmf_data_buf_t *dbuf);
@@ -105,6 +103,8 @@
 extern void sbd_handle_pgr_in_cmd(scsi_task_t *, stmf_data_buf_t *);
 extern void sbd_handle_pgr_out_cmd(scsi_task_t *, stmf_data_buf_t *);
 extern void sbd_handle_pgr_out_data(scsi_task_t *, stmf_data_buf_t *);
+void sbd_do_sgl_write_xfer(struct scsi_task *task, sbd_cmd_t *scmd,
+    int first_xfer);
 /*
  * IMPORTANT NOTE:
  * =================
@@ -178,6 +178,269 @@
 	}
 }
 
+/*
+ * sbd_zcopy: Bail-out switch for reduced copy path.
+ *
+ * 0 - read & write off
+ * 1 - read & write on
+ * 2 - only read on
+ * 4 - only write on
+ */
+int sbd_zcopy = 1;	/* enable zcopy read & write path */
+uint32_t sbd_max_xfer_len = 0;		/* Valid if non-zero */
+uint32_t sbd_1st_xfer_len = 0;		/* Valid if non-zero */
+uint32_t sbd_copy_threshold = 0;		/* Valid if non-zero */
+
+static void
+sbd_do_sgl_read_xfer(struct scsi_task *task, sbd_cmd_t *scmd, int first_xfer)
+{
+	sbd_lu_t *sl = (sbd_lu_t *)task->task_lu->lu_provider_private;
+	sbd_zvol_io_t *zvio;
+	int ret, final_xfer;
+	uint64_t offset;
+	uint32_t xfer_len, max_len, first_len;
+	stmf_status_t xstat;
+	stmf_data_buf_t *dbuf;
+	uint_t nblks;
+	uint64_t blksize = sl->sl_blksize;
+	size_t db_private_sz;
+	hrtime_t xfer_start, xfer_elapsed;
+	uintptr_t pad;
+
+	ASSERT(rw_read_held(&sl->sl_access_state_lock));
+	ASSERT((sl->sl_flags & SL_MEDIA_LOADED) != 0);
+
+	/*
+	 * Calculate the limits on xfer_len to the minimum of :
+	 *    - task limit
+	 *    - lun limit
+	 *    - sbd global limit if set
+	 *    - first xfer limit if set
+	 *
+	 * First, protect against silly over-ride value
+	 */
+	if (sbd_max_xfer_len && ((sbd_max_xfer_len % DEV_BSIZE) != 0)) {
+		cmn_err(CE_WARN, "sbd_max_xfer_len invalid %d, resetting\n",
+		    sbd_max_xfer_len);
+		sbd_max_xfer_len = 0;
+	}
+	if (sbd_1st_xfer_len && ((sbd_1st_xfer_len % DEV_BSIZE) != 0)) {
+		cmn_err(CE_WARN, "sbd_1st_xfer_len invalid %d, resetting\n",
+		    sbd_1st_xfer_len);
+		sbd_1st_xfer_len = 0;
+	}
+
+	max_len = MIN(task->task_max_xfer_len, sl->sl_max_xfer_len);
+	if (sbd_max_xfer_len)
+		max_len = MIN(max_len, sbd_max_xfer_len);
+	/*
+	 * Special case the first xfer if hints are set.
+	 */
+	if (first_xfer && (sbd_1st_xfer_len || task->task_1st_xfer_len)) {
+		/* global over-ride has precedence */
+		if (sbd_1st_xfer_len)
+			first_len = sbd_1st_xfer_len;
+		else
+			first_len = task->task_1st_xfer_len;
+	} else {
+		first_len = 0;
+	}
+
+	while (scmd->len && scmd->nbufs < task->task_max_nbufs) {
+
+		xfer_len = MIN(max_len, scmd->len);
+		if (first_len) {
+			xfer_len = MIN(xfer_len, first_len);
+			first_len = 0;
+		}
+		if (scmd->len == xfer_len) {
+			final_xfer = 1;
+		} else {
+			/*
+			 * Attempt to end xfer on a block boundary.
+			 * The only way this does not happen is if the
+			 * xfer_len is small enough to stay contained
+			 * within the same block.
+			 */
+			uint64_t xfer_offset, xfer_aligned_end;
+
+			final_xfer = 0;
+			xfer_offset = scmd->addr + scmd->current_ro;
+			xfer_aligned_end =
+			    P2ALIGN(xfer_offset+xfer_len, blksize);
+			if (xfer_aligned_end > xfer_offset)
+				xfer_len = xfer_aligned_end - xfer_offset;
+		}
+		/*
+		 * Allocate object to track the read and reserve
+		 * enough space for scatter/gather list.
+		 */
+		offset = scmd->addr + scmd->current_ro;
+		nblks = sbd_zvol_numsegs(sl, offset, xfer_len);
+
+		db_private_sz = sizeof (*zvio) + sizeof (uintptr_t) /* PAD */ +
+		    (nblks * sizeof (stmf_sglist_ent_t));
+		dbuf = stmf_alloc(STMF_STRUCT_DATA_BUF, db_private_sz,
+		    AF_DONTZERO);
+		/*
+		 * Setup the dbuf
+		 *
+		 * XXX Framework does not handle variable length sglists
+		 * properly, so setup db_lu_private and db_port_private
+		 * fields here. db_stmf_private is properly set for
+		 * calls to stmf_free.
+		 */
+		if (dbuf->db_port_private == NULL) {
+			/*
+			 * XXX Framework assigns space to PP after db_sglist[0]
+			 */
+			cmn_err(CE_PANIC, "db_port_private == NULL");
+		}
+		pad = (uintptr_t)&dbuf->db_sglist[nblks];
+		dbuf->db_lu_private = (void *)P2ROUNDUP(pad, sizeof (pad));
+		dbuf->db_port_private = NULL;
+		dbuf->db_buf_size = xfer_len;
+		dbuf->db_data_size = xfer_len;
+		dbuf->db_relative_offset = scmd->current_ro;
+		dbuf->db_sglist_length = (uint16_t)nblks;
+		dbuf->db_xfer_status = 0;
+		dbuf->db_handle = 0;
+
+		dbuf->db_flags = (DB_DONT_CACHE | DB_DONT_REUSE |
+		    DB_DIRECTION_TO_RPORT | DB_LU_DATA_BUF);
+		if (final_xfer)
+			dbuf->db_flags |= DB_SEND_STATUS_GOOD;
+
+		zvio = dbuf->db_lu_private;
+		/* Need absolute offset for zvol access */
+		zvio->zvio_offset = offset;
+		zvio->zvio_flags = ZVIO_SYNC;
+
+		/*
+		 * Accounting for start of read.
+		 * Note there is no buffer address for the probe yet.
+		 */
+		stmf_lu_xfer_start(task);
+		DTRACE_PROBE5(backing__store__read__start, sbd_lu_t *, sl,
+		    uint8_t *, NULL, uint64_t, xfer_len,
+		    uint64_t, offset, scsi_task_t *, task);
+		xfer_start = gethrtime();
+
+		ret = sbd_zvol_alloc_read_bufs(sl, dbuf);
+
+		xfer_elapsed = gethrtime() - xfer_start;
+
+		stmf_lu_xfer_done(task, B_TRUE /* read */, (uint64_t)xfer_len,
+		    xfer_elapsed);
+		DTRACE_PROBE6(backing__store__read__end, sbd_lu_t *, sl,
+		    uint8_t *, NULL, uint64_t, xfer_len,
+		    uint64_t, offset, int, ret, scsi_task_t *, task);
+
+		if (ret != 0) {
+			/*
+			 * Read failure from the backend.
+			 */
+			stmf_free(dbuf);
+			if (scmd->nbufs == 0) {
+				/* nothing queued, just finish */
+				scmd->flags &= ~SBD_SCSI_CMD_ACTIVE;
+				stmf_scsilib_send_status(task, STATUS_CHECK,
+				    STMF_SAA_READ_ERROR);
+				rw_exit(&sl->sl_access_state_lock);
+			} else {
+				/* process failure when other dbufs finish */
+				scmd->flags |= SBD_SCSI_CMD_XFER_FAIL;
+			}
+			return;
+		}
+
+
+		/*
+		 * Allow PP to do setup
+		 */
+		xstat = stmf_setup_dbuf(task, dbuf, 0);
+		if (xstat != STMF_SUCCESS) {
+			/*
+			 * This could happen if the driver cannot get the
+			 * DDI resources it needs for this request.
+			 * If other dbufs are queued, try again when the next
+			 * one completes, otherwise give up.
+			 */
+			sbd_zvol_rele_read_bufs(sl, dbuf);
+			stmf_free(dbuf);
+			if (scmd->nbufs > 0) {
+				/* completion of previous dbuf will retry */
+				return;
+			}
+			/*
+			 * Done with this command.
+			 */
+			scmd->flags &= ~SBD_SCSI_CMD_ACTIVE;
+			if (first_xfer)
+				stmf_scsilib_send_status(task, STATUS_QFULL, 0);
+			else
+				stmf_scsilib_send_status(task, STATUS_CHECK,
+				    STMF_SAA_READ_ERROR);
+			rw_exit(&sl->sl_access_state_lock);
+			return;
+		}
+		/*
+		 * dbuf is now queued on task
+		 */
+		scmd->nbufs++;
+
+		/* XXX leave this in for FW? */
+		DTRACE_PROBE4(sbd__xfer, struct scsi_task *, task,
+		    struct stmf_data_buf *, dbuf, uint64_t, offset,
+		    uint32_t, xfer_len);
+		/*
+		 * Do not pass STMF_IOF_LU_DONE so that the zvol
+		 * state can be released in the completion callback.
+		 */
+		xstat = stmf_xfer_data(task, dbuf, 0);
+		switch (xstat) {
+		case STMF_SUCCESS:
+			break;
+		case STMF_BUSY:
+			/*
+			 * The dbuf is queued on the task, but unknown
+			 * to the PP, thus no completion will occur.
+			 */
+			sbd_zvol_rele_read_bufs(sl, dbuf);
+			stmf_teardown_dbuf(task, dbuf);
+			stmf_free(dbuf);
+			scmd->nbufs--;
+			if (scmd->nbufs > 0) {
+				/* completion of previous dbuf will retry */
+				return;
+			}
+			/*
+			 * Done with this command.
+			 */
+			rw_exit(&sl->sl_access_state_lock);
+			scmd->flags &= ~SBD_SCSI_CMD_ACTIVE;
+			if (first_xfer)
+				stmf_scsilib_send_status(task, STATUS_QFULL, 0);
+			else
+				stmf_scsilib_send_status(task, STATUS_CHECK,
+				    STMF_SAA_READ_ERROR);
+			return;
+		case STMF_ABORTED:
+			/*
+			 * Completion from task_done will cleanup
+			 */
+			scmd->flags &= ~SBD_SCSI_CMD_ACTIVE;
+			return;
+		}
+		/*
+		 * Update the xfer progress.
+		 */
+		ASSERT(scmd->len >= xfer_len);
+		scmd->len -= xfer_len;
+		scmd->current_ro += xfer_len;
+	}
+}
+
 void
 sbd_handle_read_xfer_completion(struct scsi_task *task, sbd_cmd_t *scmd,
 				struct stmf_data_buf *dbuf)
@@ -225,6 +488,325 @@
 	sbd_do_read_xfer(task, scmd, dbuf);
 }
 
+/*
+ * This routine must release the DMU resources and free the dbuf
+ * in all cases.  If this is the final dbuf of the task, then drop
+ * the reader lock on the LU state. If there are no errors and more
+ * work to do, then queue more xfer operations.
+ */
+void
+sbd_handle_sgl_read_xfer_completion(struct scsi_task *task, sbd_cmd_t *scmd,
+				struct stmf_data_buf *dbuf)
+{
+	sbd_lu_t *sl = (sbd_lu_t *)task->task_lu->lu_provider_private;
+	stmf_status_t xfer_status;
+	uint32_t data_size;
+	int scmd_err;
+
+	ASSERT(dbuf->db_lu_private);
+	ASSERT(scmd->cmd_type == SBD_CMD_SCSI_READ);
+
+	scmd->nbufs--;	/* account for this dbuf */
+	/*
+	 * Release the DMU resources.
+	 */
+	sbd_zvol_rele_read_bufs(sl, dbuf);
+	/*
+	 * Release the dbuf after retrieving needed fields.
+	 */
+	xfer_status = dbuf->db_xfer_status;
+	data_size = dbuf->db_data_size;
+	stmf_teardown_dbuf(task, dbuf);
+	stmf_free(dbuf);
+	/*
+	 * Release the state lock if this is the last completion.
+	 * If this is the last dbuf on task and all data has been
+	 * transferred or an error encountered, then no more dbufs
+	 * will be queued.
+	 */
+	scmd_err = (((scmd->flags & SBD_SCSI_CMD_ACTIVE) == 0) ||
+	    (scmd->flags & SBD_SCSI_CMD_XFER_FAIL) ||
+	    (xfer_status != STMF_SUCCESS));
+	if (scmd->nbufs == 0 && (scmd->len == 0 || scmd_err)) {
+		/* all DMU state has been released */
+		rw_exit(&sl->sl_access_state_lock);
+	}
+
+	/*
+	 * If there have been no errors, either complete the task
+	 * or issue more data xfer operations.
+	 */
+	if (!scmd_err) {
+		/*
+		 * This chunk completed successfully
+		 */
+		task->task_nbytes_transferred += data_size;
+		if (scmd->nbufs == 0 && scmd->len == 0) {
+			/*
+			 * This command completed successfully
+			 *
+			 * Status was sent along with data, so no status
+			 * completion will occur. Tell stmf we are done.
+			 */
+			scmd->flags &= ~SBD_SCSI_CMD_ACTIVE;
+			stmf_task_lu_done(task);
+			return;
+		}
+		/*
+		 * Start more xfers
+		 */
+		sbd_do_sgl_read_xfer(task, scmd, 0);
+		return;
+	}
+	/*
+	 * Sort out the failure
+	 */
+	if (scmd->flags & SBD_SCSI_CMD_ACTIVE) {
+		/*
+		 * If a previous error occurred, leave the command active
+		 * and wait for the last completion to send the status check.
+		 */
+		if (scmd->flags & SBD_SCSI_CMD_XFER_FAIL) {
+			if (scmd->nbufs == 0) {
+				scmd->flags &= ~SBD_SCSI_CMD_ACTIVE;
+				stmf_scsilib_send_status(task, STATUS_CHECK,
+				    STMF_SAA_READ_ERROR);
+			}
+			return;
+		}
+		/*
+		 * Must have been a failure on current dbuf
+		 */
+		ASSERT(xfer_status != STMF_SUCCESS);
+		scmd->flags &= ~SBD_SCSI_CMD_ACTIVE;
+		stmf_abort(STMF_QUEUE_TASK_ABORT, task, xfer_status, NULL);
+	}
+}
+
+void
+sbd_handle_sgl_write_xfer_completion(struct scsi_task *task, sbd_cmd_t *scmd,
+				struct stmf_data_buf *dbuf)
+{
+	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
+	sbd_lu_t *sl = (sbd_lu_t *)task->task_lu->lu_provider_private;
+	int ret;
+	int scmd_err, scmd_xfer_done;
+	stmf_status_t xfer_status = dbuf->db_xfer_status;
+	uint32_t data_size = dbuf->db_data_size;
+	hrtime_t xfer_start;
+
+	ASSERT(zvio);
+
+	/*
+	 * Allow PP to free up resources before releasing the write bufs
+	 * as writing to the backend could take some time.
+	 */
+	stmf_teardown_dbuf(task, dbuf);
+
+	scmd->nbufs--;	/* account for this dbuf */
+	/*
+	 * All data was queued and this is the last completion,
+	 * but there could still be an error.
+	 */
+	scmd_xfer_done = (scmd->len == 0 && scmd->nbufs == 0);
+	scmd_err = (((scmd->flags & SBD_SCSI_CMD_ACTIVE) == 0) ||
+	    (scmd->flags & SBD_SCSI_CMD_XFER_FAIL) ||
+	    (xfer_status != STMF_SUCCESS));
+
+	/* start the accounting clock */
+	stmf_lu_xfer_start(task);
+	DTRACE_PROBE5(backing__store__write__start, sbd_lu_t *, sl,
+	    uint8_t *, NULL, uint64_t, data_size,
+	    uint64_t, zvio->zvio_offset, scsi_task_t *, task);
+	xfer_start = gethrtime();
+
+	if (scmd_err) {
+		/* just return the write buffers */
+		sbd_zvol_rele_write_bufs_abort(sl, dbuf);
+		ret = 0;
+	} else {
+		if (scmd_xfer_done)
+			zvio->zvio_flags = ZVIO_COMMIT;
+		else
+			zvio->zvio_flags = 0;
+		/* write the data */
+		ret = sbd_zvol_rele_write_bufs(sl, dbuf);
+	}
+
+	/* finalize accounting */
+	stmf_lu_xfer_done(task, B_FALSE /* not read */, data_size,
+	    (gethrtime() - xfer_start));
+	DTRACE_PROBE6(backing__store__write__end, sbd_lu_t *, sl,
+	    uint8_t *, NULL, uint64_t, data_size,
+	    uint64_t, zvio->zvio_offset, int, ret,  scsi_task_t *, task);
+
+	if (ret != 0) {
+		/* update the error flag */
+		scmd->flags |= SBD_SCSI_CMD_XFER_FAIL;
+		scmd_err = 1;
+	}
+
+	/* Release the dbuf */
+	stmf_free(dbuf);
+
+	/*
+	 * Release the state lock if this is the last completion.
+	 * If this is the last dbuf on task and all data has been
+	 * transferred or an error encountered, then no more dbufs
+	 * will be queued.
+	 */
+	if (scmd->nbufs == 0 && (scmd->len == 0 || scmd_err)) {
+		/* all DMU state has been released */
+		rw_exit(&sl->sl_access_state_lock);
+	}
+	/*
+	 * If there have been no errors, either complete the task
+	 * or issue more data xfer operations.
+	 */
+	if (!scmd_err) {
+		/* This chunk completed successfully */
+		task->task_nbytes_transferred += data_size;
+		if (scmd_xfer_done) {
+			/* This command completed successfully */
+			scmd->flags &= ~SBD_SCSI_CMD_ACTIVE;
+			if ((scmd->flags & SBD_SCSI_CMD_SYNC_WRITE) &&
+			    (sbd_flush_data_cache(sl, 0) != SBD_SUCCESS)) {
+				stmf_scsilib_send_status(task, STATUS_CHECK,
+				    STMF_SAA_WRITE_ERROR);
+			} else {
+				stmf_scsilib_send_status(task, STATUS_GOOD, 0);
+			}
+			return;
+		}
+		/*
+		 * Start more xfers
+		 */
+		sbd_do_sgl_write_xfer(task, scmd, 0);
+		return;
+	}
+	/*
+	 * Sort out the failure
+	 */
+	if (scmd->flags & SBD_SCSI_CMD_ACTIVE) {
+		if (scmd->flags & SBD_SCSI_CMD_XFER_FAIL) {
+			if (scmd->nbufs == 0) {
+				scmd->flags &= ~SBD_SCSI_CMD_ACTIVE;
+				stmf_scsilib_send_status(task, STATUS_CHECK,
+				    STMF_SAA_WRITE_ERROR);
+			}
+			/*
+			 * Leave the command active until last dbuf completes.
+			 */
+			return;
+		}
+		scmd->flags &= ~SBD_SCSI_CMD_ACTIVE;
+		ASSERT(xfer_status != STMF_SUCCESS);
+		stmf_abort(STMF_QUEUE_TASK_ABORT, task, xfer_status, NULL);
+	}
+}
+
+/*
+ * Handle a copy operation using the zvol interface.
+ *
+ * Similar to the sbd_data_read/write path, except it goes directly through
+ * the zvol interfaces. It can pass a port provider sglist in the
+ * form of a uio, which is lost through the vn_rdwr path.
+ *
+ * Returns:
+ *	STMF_SUCCESS - request handled
+ *	STMF_FAILURE - request not handled, caller must deal with error
+ */
+static stmf_status_t
+sbd_copy_rdwr(scsi_task_t *task, uint64_t laddr, stmf_data_buf_t *dbuf,
+    int cmd, int commit)
+{
+	sbd_lu_t		*sl = task->task_lu->lu_provider_private;
+	struct uio		uio;
+	struct iovec		*iov, *tiov, iov1[8];
+	uint32_t		len, resid;
+	int			ret, i, iovcnt, flags;
+	hrtime_t		xfer_start;
+	boolean_t		is_read;
+
+	ASSERT(cmd == SBD_CMD_SCSI_READ || cmd == SBD_CMD_SCSI_WRITE);
+
+	is_read = (cmd == SBD_CMD_SCSI_READ) ? B_TRUE : B_FALSE;
+	iovcnt = dbuf->db_sglist_length;
+	/* use the stack for small iovecs */
+	if (iovcnt > 8) {
+		iov = kmem_alloc(iovcnt * sizeof (*iov), KM_SLEEP);
+	} else {
+		iov = &iov1[0];
+	}
+
+	/* Convert dbuf sglist to iovec format */
+	len = dbuf->db_data_size;
+	resid = len;
+	tiov = iov;
+	for (i = 0; i < iovcnt; i++) {
+		tiov->iov_base = (caddr_t)dbuf->db_sglist[i].seg_addr;
+		tiov->iov_len = MIN(resid, dbuf->db_sglist[i].seg_length);
+		resid -= tiov->iov_len;
+		tiov++;
+	}
+	if (resid != 0) {
+		cmn_err(CE_WARN, "inconsistent sglist rem %d", resid);
+		if (iov != &iov1[0])
+			kmem_free(iov, iovcnt * sizeof (*iov));
+		return (STMF_FAILURE);
+	}
+	/* Setup the uio struct */
+	uio.uio_iov = iov;
+	uio.uio_iovcnt = iovcnt;
+	uio.uio_loffset = laddr;
+	uio.uio_segflg = (short)UIO_SYSSPACE;
+	uio.uio_resid = (uint64_t)len;
+	uio.uio_llimit = RLIM64_INFINITY;
+
+	/* start the accounting clock */
+	stmf_lu_xfer_start(task);
+	xfer_start = gethrtime();
+	if (is_read == B_TRUE) {
+		uio.uio_fmode = FREAD;
+		uio.uio_extflg = UIO_COPY_CACHED;
+		DTRACE_PROBE5(backing__store__read__start, sbd_lu_t *, sl,
+		    uint8_t *, NULL, uint64_t, len, uint64_t, laddr,
+		    scsi_task_t *, task);
+
+		/* Fetch the data */
+		ret = sbd_zvol_copy_read(sl, &uio);
+
+		DTRACE_PROBE6(backing__store__read__end, sbd_lu_t *, sl,
+		    uint8_t *, NULL, uint64_t, len, uint64_t, laddr, int, ret,
+		    scsi_task_t *, task);
+	} else {
+		uio.uio_fmode = FWRITE;
+		uio.uio_extflg = UIO_COPY_DEFAULT;
+		DTRACE_PROBE5(backing__store__write__start, sbd_lu_t *, sl,
+		    uint8_t *, NULL, uint64_t, len, uint64_t, laddr,
+		    scsi_task_t *, task);
+
+		flags = (commit) ? ZVIO_COMMIT : 0;
+		/* Write the data */
+		ret = sbd_zvol_copy_write(sl, &uio, flags);
+
+		DTRACE_PROBE6(backing__store__write__end, sbd_lu_t *, sl,
+		    uint8_t *, NULL, uint64_t, len, uint64_t, laddr, int, ret,
+		    scsi_task_t *, task);
+	}
+	/* finalize accounting */
+	stmf_lu_xfer_done(task, is_read, (uint64_t)len,
+	    (gethrtime() - xfer_start));
+
+	if (iov != &iov1[0])
+		kmem_free(iov, iovcnt * sizeof (*iov));
+	if (ret != 0) {
+		/* Backend I/O error */
+		return (STMF_FAILURE);
+	}
+	return (STMF_SUCCESS);
+}
+
 void
 sbd_handle_read(struct scsi_task *task, struct stmf_data_buf *initial_dbuf)
 {
@@ -285,6 +867,95 @@
 		return;
 	}
 
+	/*
+	 * Determine if this read can directly use DMU buffers.
+	 */
+	if (sbd_zcopy & (2|1) &&		/* Debug switch */
+	    initial_dbuf == NULL &&		/* No PP buffer passed in */
+	    sl->sl_flags & SL_CALL_ZVOL &&	/* zvol backing store */
+	    (task->task_additional_flags &
+	    TASK_AF_ACCEPT_LU_DBUF))		/* PP allows it */
+	{
+		/*
+		 * Reduced copy path
+		 */
+		uint32_t copy_threshold, minsize;
+		int ret;
+
+		/*
+		 * The sl_access_state_lock will be held shared
+		 * for the entire request and released when all
+		 * dbufs have completed.
+		 */
+		rw_enter(&sl->sl_access_state_lock, RW_READER);
+		if ((sl->sl_flags & SL_MEDIA_LOADED) == 0) {
+			rw_exit(&sl->sl_access_state_lock);
+			stmf_scsilib_send_status(task, STATUS_CHECK,
+			    STMF_SAA_READ_ERROR);
+			return;
+		}
+
+		/*
+		 * Check if setup is more expensive than copying the data.
+		 *
+		 * Use the global over-ride sbd_copy_threshold if set.
+		 */
+		copy_threshold = (sbd_copy_threshold > 0) ?
+		    sbd_copy_threshold : task->task_copy_threshold;
+		minsize = len;
+		if (len < copy_threshold &&
+		    (dbuf = stmf_alloc_dbuf(task, len, &minsize, 0)) != 0) {
+
+			ret = sbd_copy_rdwr(task, laddr, dbuf,
+			    SBD_CMD_SCSI_READ, 0);
+			/* done with the backend */
+			rw_exit(&sl->sl_access_state_lock);
+			if (ret != 0) {
+				/* backend error */
+				stmf_scsilib_send_status(task, STATUS_CHECK,
+				    STMF_SAA_READ_ERROR);
+			} else {
+				/* send along good data */
+				dbuf->db_relative_offset = 0;
+				dbuf->db_data_size = len;
+				dbuf->db_flags = DB_SEND_STATUS_GOOD |
+				    DB_DIRECTION_TO_RPORT;
+				/* XXX keep for FW? */
+				DTRACE_PROBE4(sbd__xfer,
+				    struct scsi_task *, task,
+				    struct stmf_data_buf *, dbuf,
+				    uint64_t, laddr, uint32_t, len);
+				(void) stmf_xfer_data(task, dbuf,
+				    STMF_IOF_LU_DONE);
+			}
+			return;
+		}
+
+		/* committed to reduced copy */
+		if (task->task_lu_private) {
+			scmd = (sbd_cmd_t *)task->task_lu_private;
+		} else {
+			scmd = (sbd_cmd_t *)kmem_alloc(sizeof (sbd_cmd_t),
+			    KM_SLEEP);
+			task->task_lu_private = scmd;
+		}
+		/*
+		 * Setup scmd to track read progress.
+		 */
+		scmd->flags = SBD_SCSI_CMD_ACTIVE;
+		scmd->cmd_type = SBD_CMD_SCSI_READ;
+		scmd->nbufs = 0;
+		scmd->addr = laddr;
+		scmd->len = len;
+		scmd->current_ro = 0;
+
+		/*
+		 * Kick-off the read.
+		 */
+		sbd_do_sgl_read_xfer(task, scmd, 1);
+		return;
+	}
+
 	if (initial_dbuf == NULL) {
 		uint32_t maxsize, minsize, old_minsize;
 
@@ -311,6 +982,10 @@
 			dbuf->db_data_size = len;
 			dbuf->db_flags = DB_SEND_STATUS_GOOD |
 			    DB_DIRECTION_TO_RPORT;
+			/* XXX keep for FW? */
+			DTRACE_PROBE4(sbd__xfer, struct scsi_task *, task,
+			    struct stmf_data_buf *, dbuf,
+			    uint64_t, laddr, uint32_t, len);
 			(void) stmf_xfer_data(task, dbuf, STMF_IOF_LU_DONE);
 		} else {
 			stmf_scsilib_send_status(task, STATUS_CHECK,
@@ -403,6 +1078,224 @@
 }
 
 void
+sbd_do_sgl_write_xfer(struct scsi_task *task, sbd_cmd_t *scmd, int first_xfer)
+{
+	sbd_lu_t *sl = (sbd_lu_t *)task->task_lu->lu_provider_private;
+	sbd_zvol_io_t *zvio;
+	int ret;
+	uint32_t xfer_len, max_len, first_len;
+	stmf_status_t xstat;
+	stmf_data_buf_t *dbuf;
+	uint_t nblks;
+	uint64_t blksize = sl->sl_blksize;
+	uint64_t offset;
+	size_t db_private_sz;
+	uintptr_t pad;
+
+	ASSERT(rw_read_held(&sl->sl_access_state_lock));
+	ASSERT((sl->sl_flags & SL_MEDIA_LOADED) != 0);
+
+	/*
+	 * Calculate the limits on xfer_len to the minimum of :
+	 *    - task limit
+	 *    - lun limit
+	 *    - sbd global limit if set
+	 *    - first xfer limit if set
+	 *
+	 * First, protect against silly over-ride value
+	 */
+	if (sbd_max_xfer_len && ((sbd_max_xfer_len % DEV_BSIZE) != 0)) {
+		cmn_err(CE_WARN, "sbd_max_xfer_len invalid %d, resetting\n",
+		    sbd_max_xfer_len);
+		sbd_max_xfer_len = 0;
+	}
+	if (sbd_1st_xfer_len && ((sbd_1st_xfer_len % DEV_BSIZE) != 0)) {
+		cmn_err(CE_WARN, "sbd_1st_xfer_len invalid %d, resetting\n",
+		    sbd_1st_xfer_len);
+		sbd_1st_xfer_len = 0;
+	}
+
+	max_len = MIN(task->task_max_xfer_len, sl->sl_max_xfer_len);
+	if (sbd_max_xfer_len)
+		max_len = MIN(max_len, sbd_max_xfer_len);
+	/*
+	 * Special case the first xfer if hints are set.
+	 */
+	if (first_xfer && (sbd_1st_xfer_len || task->task_1st_xfer_len)) {
+		/* global over-ride has precedence */
+		if (sbd_1st_xfer_len)
+			first_len = sbd_1st_xfer_len;
+		else
+			first_len = task->task_1st_xfer_len;
+	} else {
+		first_len = 0;
+	}
+
+
+	while (scmd->len && scmd->nbufs < task->task_max_nbufs) {
+
+		xfer_len = MIN(max_len, scmd->len);
+		if (first_len) {
+			xfer_len = MIN(xfer_len, first_len);
+			first_len = 0;
+		}
+		if (xfer_len < scmd->len) {
+			/*
+			 * Attempt to end xfer on a block boundary.
+			 * The only way this does not happen is if the
+			 * xfer_len is small enough to stay contained
+			 * within the same block.
+			 */
+			uint64_t xfer_offset, xfer_aligned_end;
+
+			xfer_offset = scmd->addr + scmd->current_ro;
+			xfer_aligned_end =
+			    P2ALIGN(xfer_offset+xfer_len, blksize);
+			if (xfer_aligned_end > xfer_offset)
+				xfer_len = xfer_aligned_end - xfer_offset;
+		}
+		/*
+		 * Allocate object to track the write and reserve
+		 * enough space for scatter/gather list.
+		 */
+		offset = scmd->addr + scmd->current_ro;
+		nblks = sbd_zvol_numsegs(sl, offset, xfer_len);
+		db_private_sz = sizeof (*zvio) + sizeof (uintptr_t) /* PAD */ +
+		    (nblks * sizeof (stmf_sglist_ent_t));
+		dbuf = stmf_alloc(STMF_STRUCT_DATA_BUF, db_private_sz,
+		    AF_DONTZERO);
+
+		/*
+		 * Setup the dbuf
+		 *
+		 * XXX Framework does not handle variable length sglists
+		 * properly, so setup db_lu_private and db_port_private
+		 * fields here. db_stmf_private is properly set for
+		 * calls to stmf_free.
+		 */
+		if (dbuf->db_port_private == NULL) {
+			/*
+			 * XXX Framework assigns space to PP after db_sglist[0]
+			 */
+			cmn_err(CE_PANIC, "db_port_private == NULL");
+		}
+		pad = (uintptr_t)&dbuf->db_sglist[nblks];
+		dbuf->db_lu_private = (void *)P2ROUNDUP(pad, sizeof (pad));
+		dbuf->db_port_private = NULL;
+		dbuf->db_buf_size = xfer_len;
+		dbuf->db_data_size = xfer_len;
+		dbuf->db_relative_offset = scmd->current_ro;
+		dbuf->db_sglist_length = (uint16_t)nblks;
+		dbuf->db_xfer_status = 0;
+		dbuf->db_handle = 0;
+		dbuf->db_flags = (DB_DONT_CACHE | DB_DONT_REUSE |
+		    DB_DIRECTION_FROM_RPORT | DB_LU_DATA_BUF);
+
+		zvio = dbuf->db_lu_private;
+		zvio->zvio_offset = offset;
+
+		/* get the buffers */
+		ret = sbd_zvol_alloc_write_bufs(sl, dbuf);
+		if (ret != 0) {
+			/*
+			 * Could not allocate buffers from the backend;
+			 * treat it like an IO error.
+			 */
+			stmf_free(dbuf);
+			scmd->flags |= SBD_SCSI_CMD_XFER_FAIL;
+			if (scmd->nbufs == 0) {
+				/*
+				 * Nothing queued, so no completions coming
+				 */
+				stmf_scsilib_send_status(task, STATUS_CHECK,
+				    STMF_SAA_WRITE_ERROR);
+				rw_exit(&sl->sl_access_state_lock);
+			}
+			/*
+			 * Completions of previous buffers will cleanup.
+			 */
+			return;
+		}
+
+		/*
+		 * Allow PP to do setup
+		 */
+		xstat = stmf_setup_dbuf(task, dbuf, 0);
+		if (xstat != STMF_SUCCESS) {
+			/*
+			 * This could happen if the driver cannot get the
+			 * DDI resources it needs for this request.
+			 * If other dbufs are queued, try again when the next
+			 * one completes, otherwise give up.
+			 */
+			sbd_zvol_rele_write_bufs_abort(sl, dbuf);
+			stmf_free(dbuf);
+			if (scmd->nbufs > 0) {
+				/* completion of previous dbuf will retry */
+				return;
+			}
+			/*
+			 * Done with this command.
+			 */
+			scmd->flags &= ~SBD_SCSI_CMD_ACTIVE;
+			if (first_xfer)
+				stmf_scsilib_send_status(task, STATUS_QFULL, 0);
+			else
+				stmf_scsilib_send_status(task, STATUS_CHECK,
+				    STMF_SAA_WRITE_ERROR);
+			rw_exit(&sl->sl_access_state_lock);
+			return;
+		}
+
+		/*
+		 * dbuf is now queued on task
+		 */
+		scmd->nbufs++;
+
+		xstat = stmf_xfer_data(task, dbuf, 0);
+		switch (xstat) {
+		case STMF_SUCCESS:
+			break;
+		case STMF_BUSY:
+			/*
+			 * The dbuf is queued on the task, but unknown
+			 * to the PP, thus no completion will occur.
+			 */
+			sbd_zvol_rele_write_bufs_abort(sl, dbuf);
+			stmf_teardown_dbuf(task, dbuf);
+			stmf_free(dbuf);
+			scmd->nbufs--;
+			if (scmd->nbufs > 0) {
+				/* completion of previous dbuf will retry */
+				return;
+			}
+			/*
+			 * Done with this command.
+			 */
+			scmd->flags &= ~SBD_SCSI_CMD_ACTIVE;
+			if (first_xfer)
+				stmf_scsilib_send_status(task, STATUS_QFULL, 0);
+			else
+				stmf_scsilib_send_status(task, STATUS_CHECK,
+				    STMF_SAA_WRITE_ERROR);
+			rw_exit(&sl->sl_access_state_lock);
+			return;
+		case STMF_ABORTED:
+			/*
+			 * Completion code will cleanup.
+			 */
+			scmd->flags |= SBD_SCSI_CMD_XFER_FAIL;
+			return;
+		}
+		/*
+		 * Update the xfer progress.
+		 */
+		scmd->len -= xfer_len;
+		scmd->current_ro += xfer_len;
+	}
+}
+
+void
 sbd_handle_write_xfer_completion(struct scsi_task *task, sbd_cmd_t *scmd,
     struct stmf_data_buf *dbuf, uint8_t dbuf_reusable)
 {
@@ -440,19 +1333,36 @@
 
 	laddr = scmd->addr + dbuf->db_relative_offset;
 
-	for (buflen = 0, ndx = 0; (buflen < dbuf->db_data_size) &&
-	    (ndx < dbuf->db_sglist_length); ndx++) {
-		iolen = min(dbuf->db_data_size - buflen,
-		    dbuf->db_sglist[ndx].seg_length);
-		if (iolen == 0)
-			break;
-		if (sbd_data_write(sl, task, laddr, (uint64_t)iolen,
-		    dbuf->db_sglist[ndx].seg_addr) != STMF_SUCCESS) {
+	/*
+	 * If this is going to a zvol, use the direct call to
+	 * sbd_zvol_copy_{read,write}. The direct call interface is
+	 * restricted to PPs that accept sglists, but that is not required.
+	 */
+	if (sl->sl_flags & SL_CALL_ZVOL &&
+	    (task->task_additional_flags & TASK_AF_ACCEPT_LU_DBUF) &&
+	    (sbd_zcopy & (4|1))) {
+		int commit;
+
+		commit = (scmd->len == 0 && scmd->nbufs == 0);
+		if (sbd_copy_rdwr(task, laddr, dbuf, SBD_CMD_SCSI_WRITE,
+		    commit) != STMF_SUCCESS)
 			scmd->flags |= SBD_SCSI_CMD_XFER_FAIL;
-			break;
+		buflen = dbuf->db_data_size;
+	} else {
+		for (buflen = 0, ndx = 0; (buflen < dbuf->db_data_size) &&
+		    (ndx < dbuf->db_sglist_length); ndx++) {
+			iolen = min(dbuf->db_data_size - buflen,
+			    dbuf->db_sglist[ndx].seg_length);
+			if (iolen == 0)
+				break;
+			if (sbd_data_write(sl, task, laddr, (uint64_t)iolen,
+			    dbuf->db_sglist[ndx].seg_addr) != STMF_SUCCESS) {
+				scmd->flags |= SBD_SCSI_CMD_XFER_FAIL;
+				break;
+			}
+			buflen += iolen;
+			laddr += (uint64_t)iolen;
 		}
-		buflen += iolen;
-		laddr += (uint64_t)iolen;
 	}
 	task->task_nbytes_transferred += buflen;
 WRITE_XFER_DONE:
@@ -487,6 +1397,36 @@
 	sbd_do_write_xfer(task, scmd, dbuf, dbuf_reusable);
 }
 
+/*
+ * Return true if copy avoidance is beneficial.
+ */
+static int
+sbd_zcopy_write_useful(scsi_task_t *task, uint64_t laddr, uint32_t len,
+    uint64_t blksize)
+{
+	/*
+	 * If there is a global copy threshold over-ride, use it.
+	 * Otherwise use the PP value with the caveat that at least
+	 * 1/2 the data must avoid being copied to be useful.
+	 */
+	if (sbd_copy_threshold > 0) {
+		return (len >= sbd_copy_threshold);
+	} else {
+		uint64_t no_copy_span;
+
+		/* sub-blocksize writes always copy */
+		if (len < task->task_copy_threshold || len < blksize)
+			return (0);
+		/*
+		 * Calculate amount of data that will avoid the copy path.
+		 * The calculation is only valid if len >= blksize.
+		 */
+		no_copy_span = P2ALIGN(laddr+len, blksize) -
+		    P2ROUNDUP(laddr, blksize);
+		return (no_copy_span >= len/2);
+	}
+}
+
 void
 sbd_handle_write(struct scsi_task *task, struct stmf_data_buf *initial_dbuf)
 {
@@ -559,6 +1499,47 @@
 		return;
 	}
 
+	if (sbd_zcopy & (4|1) &&		/* Debug switch */
+	    initial_dbuf == NULL &&		/* No PP buf passed in */
+	    sl->sl_flags & SL_CALL_ZVOL &&	/* zvol backing store */
+	    (task->task_additional_flags &
+	    TASK_AF_ACCEPT_LU_DBUF) &&		/* PP allows it */
+	    sbd_zcopy_write_useful(task, laddr, len, sl->sl_blksize)) {
+
+		/*
+		 * XXX Note that disallowing initial_dbuf will eliminate
+		 * iSCSI from participating. For small writes, that is
+		 * probably ok. For large writes, it may be best to just
+		 * copy the data from the initial dbuf and use zcopy for
+		 * the rest.
+		 */
+		rw_enter(&sl->sl_access_state_lock, RW_READER);
+		if ((sl->sl_flags & SL_MEDIA_LOADED) == 0) {
+			rw_exit(&sl->sl_access_state_lock);
+			stmf_scsilib_send_status(task, STATUS_CHECK,
+			    STMF_SAA_READ_ERROR);
+			return;
+		}
+		/*
+		 * Setup scmd to track the write progress.
+		 */
+		if (task->task_lu_private) {
+			scmd = (sbd_cmd_t *)task->task_lu_private;
+		} else {
+			scmd = (sbd_cmd_t *)kmem_alloc(sizeof (sbd_cmd_t),
+			    KM_SLEEP);
+			task->task_lu_private = scmd;
+		}
+		scmd->flags = SBD_SCSI_CMD_ACTIVE | sync_wr_flag;
+		scmd->cmd_type = SBD_CMD_SCSI_WRITE;
+		scmd->nbufs = 0;
+		scmd->addr = laddr;
+		scmd->len = len;
+		scmd->current_ro = 0;
+		sbd_do_sgl_write_xfer(task, scmd, 1);
+		return;
+	}
+
 	if ((initial_dbuf != NULL) && (task->task_flags & TF_INITIAL_BURST)) {
 		if (initial_dbuf->db_data_size > len) {
 			if (initial_dbuf->db_data_size >
@@ -589,7 +1570,7 @@
 
 	if (do_immediate_data) {
 		/*
-		 * Accout for data passed in this write command
+		 * Account for data passed in this write command
 		 */
 		(void) stmf_xfer_data(task, dbuf, STMF_IOF_STATS_ONLY);
 		scmd->len -= dbuf->db_data_size;
@@ -1916,9 +2897,30 @@
 void
 sbd_dbuf_xfer_done(struct scsi_task *task, struct stmf_data_buf *dbuf)
 {
-	sbd_cmd_t *scmd = NULL;
-
-	scmd = (sbd_cmd_t *)task->task_lu_private;
+	sbd_cmd_t *scmd = (sbd_cmd_t *)task->task_lu_private;
+
+	if (dbuf->db_flags & DB_LU_DATA_BUF) {
+		/*
+		 * Buffers passed in from the LU always complete
+		 * even if the task is no longer active.
+		 */
+		ASSERT(task->task_additional_flags & TASK_AF_ACCEPT_LU_DBUF);
+		ASSERT(scmd);
+		switch (scmd->cmd_type) {
+		case (SBD_CMD_SCSI_READ):
+			sbd_handle_sgl_read_xfer_completion(task, scmd, dbuf);
+			break;
+		case (SBD_CMD_SCSI_WRITE):
+			sbd_handle_sgl_write_xfer_completion(task, scmd, dbuf);
+			break;
+		default:
+			cmn_err(CE_PANIC, "Unknown cmd type, task = %p",
+			    (void *)task);
+			break;
+		}
+		return;
+	}
+
 	if ((scmd == NULL) || ((scmd->flags & SBD_SCSI_CMD_ACTIVE) == 0))
 		return;
 
@@ -2005,6 +3007,38 @@
 	return (STMF_NOT_FOUND);
 }
 
+/*
+ * This function is called during task clean-up if the
+ * DB_LU_DATA_BUF flag is set on the dbuf. This should only be called for
+ * abort processing after sbd_abort has been called for the task.
+ */
+void
+sbd_dbuf_free(struct scsi_task *task, struct stmf_data_buf *dbuf)
+{
+	sbd_cmd_t *scmd = (sbd_cmd_t *)task->task_lu_private;
+	sbd_lu_t *sl = (sbd_lu_t *)task->task_lu->lu_provider_private;
+
+	ASSERT(dbuf->db_lu_private);
+	ASSERT(scmd && scmd->nbufs > 0);
+	ASSERT((scmd->flags & SBD_SCSI_CMD_ACTIVE) == 0);
+	ASSERT(dbuf->db_flags & DB_LU_DATA_BUF);
+	ASSERT(task->task_additional_flags & TASK_AF_ACCEPT_LU_DBUF);
+	ASSERT((curthread->t_flag & T_INTR_THREAD) == 0);
+
+	if (scmd->cmd_type == SBD_CMD_SCSI_READ) {
+		sbd_zvol_rele_read_bufs(sl, dbuf);
+	} else if (scmd->cmd_type == SBD_CMD_SCSI_WRITE) {
+		sbd_zvol_rele_write_bufs_abort(sl, dbuf);
+	} else {
+		cmn_err(CE_PANIC, "Unknown cmd type %d, task = %p",
+		    scmd->cmd_type, (void *)task);
+	}
+	if (--scmd->nbufs == 0)
+		rw_exit(&sl->sl_access_state_lock);
+	stmf_teardown_dbuf(task, dbuf);
+	stmf_free(dbuf);
+}
+
 /* ARGSUSED */
 void
 sbd_ctl(struct stmf_lu *lu, int cmd, void *arg)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/sbd_zvol.c	Wed May 05 10:23:23 2010 -0700
@@ -0,0 +1,460 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/conf.h>
+#include <sys/file.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/scsi/scsi.h>
+#include <sys/scsi/impl/scsi_reset_notify.h>
+#include <sys/scsi/generic/mode.h>
+#include <sys/disp.h>
+#include <sys/byteorder.h>
+#include <sys/atomic.h>
+#include <sys/sdt.h>
+#include <sys/dkio.h>
+#include <sys/dmu.h>
+#include <sys/arc.h>
+#include <sys/zvol.h>
+#include <sys/zfs_rlock.h>
+
+#include <stmf.h>
+#include <lpif.h>
+#include <portif.h>
+#include <stmf_ioctl.h>
+#include <stmf_sbd.h>
+#include <stmf_sbd_ioctl.h>
+#include <sbd_impl.h>
+
+
+/*
+ * This file contains direct calls into the zfs module.
+ * These functions mimic zvol_read and zvol_write except pointers
+ * to the data buffers are passed instead of copying the data itself.
+ *
+ * zfs internal interfaces referenced here:
+ *
+ * FUNCTIONS
+ *    dmu_buf_hold_array_by_bonus()
+ *    dmu_buf_rele_array()
+ *
+ *    dmu_request_arcbuf()
+ *    dmu_assign_arcbuf()
+ *    dmu_return_arcbuf()
+ *    arc_buf_size()
+ *
+ *    dmu_tx_create()
+ *    dmu_tx_hold_write()
+ *    dmu_tx_assign()
+ *    dmu_tx_commit(tx)
+ *    dmu_tx_abort(tx)
+ *    zil_commit()
+ *
+ *    zfs_range_lock()
+ *    zfs_range_unlock()
+ *
+ *    zvol_log_write()
+ *
+ *    dmu_read_uio()
+ *    dmu_write_uio()
+ * MINOR DATA
+ *    zv_volsize
+ *    zv_volblocksize
+ *    zv_flags		- for WCE
+ *    zv_objset		- dmu_tx_create
+ *    zv_zilog		- zil_commit
+ *    zv_znode		- zfs_range_lock
+ *    zv_dbuf		- dmu_buf_hold_array_by_bonus, dmu_request_arcbuf
+ * GLOBAL DATA
+ *    zvol_maxphys
+ */
+
+/*
+ * Take direct control of the volume instead of using the driver
+ * interfaces provided by zvol.c. Gather parameters and handles
+ * needed to make direct calls into zfs/dmu/zvol. The driver is
+ * opened exclusively at this point, so these parameters cannot change.
+ *
+ * NOTE: the object size and WCE can change while the device
+ * is open, so they must be fetched for every operation.
+ */
+int
+sbd_zvol_get_volume_params(sbd_lu_t *sl)
+{
+	int ret;
+
+	ret = zvol_get_volume_params(sl->sl_zvol_minor,
+	    &sl->sl_blksize,		/* volume block size */
+	    &sl->sl_max_xfer_len,	/* max data chunk size */
+	    &sl->sl_zvol_minor_hdl,	/* minor soft state */
+	    &sl->sl_zvol_objset_hdl,	/* dmu_tx_create */
+	    &sl->sl_zvol_zil_hdl,	/* zil_commit */
+	    &sl->sl_zvol_rl_hdl,	/* zfs_range_lock */
+	    &sl->sl_zvol_bonus_hdl);	/* dmu_buf_hold_array_by_bonus, */
+					/* dmu_request_arcbuf, */
+					/* dmu_assign_arcbuf */
+
+	if (ret == 0 && sl->sl_blksize < MMU_PAGESIZE) {
+		cmn_err(CE_NOTE, "COMSTAR reduced copy disabled due to "
+		    "small zvol blocksize (%d)\n", (int)sl->sl_blksize);
+		ret = ENOTSUP;
+	}
+
+	return (ret);
+}
+
+/*
+ * Return the number of elements in a scatter/gather list required for
+ * the given span in the zvol. Elements are 1:1 with zvol blocks.
+ */
+uint32_t
+sbd_zvol_numsegs(sbd_lu_t *sl, uint64_t off, uint32_t len)
+{
+	uint64_t blksz = sl->sl_blksize;
+	uint64_t endoff = off + len;
+	uint64_t numsegs;
+
+	numsegs = (P2ROUNDUP(endoff, blksz) - P2ALIGN(off, blksz)) / blksz;
+	return ((uint32_t)numsegs);
+}
+
+/*
+ * Return an array of dmu_buf_t pointers for the requested range.
+ * The dmu buffers are either in cache or read in synchronously.
+ * Fill in the dbuf sglist from the dmu_buf_t array.
+ */
+static void *RDTAG = "sbd_zvol_read";
+
+int
+sbd_zvol_alloc_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
+{
+	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
+	rl_t 		*rl;
+	int 		numbufs, error;
+	uint64_t 	len = dbuf->db_data_size;
+	uint64_t 	offset = zvio->zvio_offset;
+	dmu_buf_t	**dbpp, *dbp;
+
+	/* Make sure request is reasonable */
+	if (len > sl->sl_max_xfer_len)
+		return (E2BIG);
+	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
+		return (EIO);
+
+	/*
+	 * The range lock is only held until the dmu buffers are read in and
+	 * held, not during the caller's use of the data.
+	 */
+	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
+
+	error = dmu_buf_hold_array_by_bonus(sl->sl_zvol_bonus_hdl, offset,
+	    len, TRUE, RDTAG, &numbufs, &dbpp);
+
+	zfs_range_unlock(rl);
+
+	if (error == ECKSUM)
+		error = EIO;
+
+	if (error == 0) {
+		/*
+		 * Fill in db_sglist from the dmu_buf_t array.
+		 */
+		int		i;
+		stmf_sglist_ent_t *sgl;
+		uint64_t	odiff, seglen;
+
+		zvio->zvio_dbp = dbpp;
+		/* make sure db_sglist is large enough */
+		if (dbuf->db_sglist_length != numbufs) {
+			cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
+			    dbuf->db_sglist_length, numbufs);
+		}
+
+		sgl = &dbuf->db_sglist[0];
+		for (i = 0; i < numbufs; i++) {
+			dbp = dbpp[i];
+			odiff =  offset - dbp->db_offset;
+			ASSERT(odiff == 0 || i == 0);
+			sgl->seg_addr = (uint8_t *)dbp->db_data + odiff;
+			seglen = MIN(len, dbp->db_size - odiff);
+			sgl->seg_length = (uint32_t)seglen;
+			offset += seglen;
+			len -= seglen;
+			sgl++;
+		}
+		ASSERT(len == 0);
+
+	}
+	return (error);
+}
+
+/*
+ * Release a dmu_buf_t array.
+ */
+/*ARGSUSED*/
+void
+sbd_zvol_rele_read_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
+{
+	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
+
+	ASSERT(zvio->zvio_dbp);
+	ASSERT(dbuf->db_sglist_length);
+
+	dmu_buf_rele_array(zvio->zvio_dbp, (int)dbuf->db_sglist_length, RDTAG);
+}
+
+/*
+ * Allocate enough loaned arc buffers for the requested region.
+ * Mimic the handling of the dmu_buf_t array used for reads as closely
+ * as possible even though the arc_buf_t's are anonymous until released.
+ * The buffers will match the zvol object's block sizes and alignments
+ * such that a data copy may be avoided when the buffers are assigned.
+ */
+int
+sbd_zvol_alloc_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
+{
+	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
+	int		blkshift, numbufs, i;
+	uint64_t	blksize;
+	arc_buf_t	**abp;
+	stmf_sglist_ent_t *sgl;
+	uint64_t 	len = dbuf->db_data_size;
+	uint64_t 	offset = zvio->zvio_offset;
+
+	/* Make sure request is reasonable */
+	if (len > sl->sl_max_xfer_len)
+		return (E2BIG);
+	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
+		return (EIO);
+
+	/*
+	 * Break up the request into chunks to match
+	 * the volume block size. Only full, and aligned
+	 * buffers will avoid the data copy in the dmu.
+	 */
+	/*
+	 * calculate how many dbufs are needed
+	 */
+	blksize = sl->sl_blksize;
+	ASSERT(ISP2(blksize));
+	blkshift = highbit(blksize - 1);
+	/*
+	 * taken from dmu_buf_hold_array_by_dnode()
+	 */
+	numbufs = (P2ROUNDUP(offset+len, 1ULL<<blkshift) -
+	    P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+	if (dbuf->db_sglist_length != numbufs) {
+		cmn_err(CE_PANIC, "wrong size sglist: dbuf %d != %d\n",
+		    dbuf->db_sglist_length, numbufs);
+	}
+	/*
+	 * allocate a holder for the needed arc_buf pointers
+	 */
+	abp = kmem_alloc(sizeof (arc_buf_t *) * numbufs, KM_SLEEP);
+	/*
+	 * The write operation uses loaned arc buffers so that
+	 * the xfer_data is done outside of a dmu transaction.
+	 * These buffers will exactly match the request unlike
+	 * the dmu buffers obtained from the read operation.
+	 */
+	/*
+	 * allocate the arc buffers and fill in the stmf sglist
+	 */
+	sgl = &dbuf->db_sglist[0];
+	for (i = 0; i < numbufs; i++) {
+		uint64_t seglen;
+
+		/* first block may not be aligned */
+		seglen = P2NPHASE(offset, blksize);
+		if (seglen == 0)
+			seglen = blksize;
+		seglen = MIN(seglen, len);
+		abp[i] = dmu_request_arcbuf(sl->sl_zvol_bonus_hdl, (int)seglen);
+		ASSERT(arc_buf_size(abp[i]) == (int)seglen);
+		sgl->seg_addr = abp[i]->b_data;
+		sgl->seg_length = (uint32_t)seglen;
+		sgl++;
+		offset += seglen;
+		len -= seglen;
+	}
+	ASSERT(len == 0);
+
+	zvio->zvio_abp = abp;
+	return (0);
+}
+
+/*ARGSUSED*/
+void
+sbd_zvol_rele_write_bufs_abort(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
+{
+	sbd_zvol_io_t *zvio = dbuf->db_lu_private;
+	int i;
+	arc_buf_t **abp = zvio->zvio_abp;
+
+	/* free arcbufs */
+	for (i = 0; i < dbuf->db_sglist_length; i++)
+		dmu_return_arcbuf(*abp++);
+	kmem_free(zvio->zvio_abp,
+	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
+	zvio->zvio_abp = NULL;
+}
+
+/*
+ * Release the arc_buf_t array allocated above and handle these cases :
+ *
+ * flags == 0 - create transaction and assign all arc bufs to offsets
+ * flags == ZVIO_COMMIT - same as above and commit to zil on sync devices
+ */
+int
+sbd_zvol_rele_write_bufs(sbd_lu_t *sl, stmf_data_buf_t *dbuf)
+{
+	sbd_zvol_io_t	*zvio = dbuf->db_lu_private;
+	dmu_tx_t	*tx;
+	int		sync, i, error;
+	rl_t 		*rl;
+	arc_buf_t	**abp = zvio->zvio_abp;
+	int		flags = zvio->zvio_flags;
+	uint64_t	toffset, offset = zvio->zvio_offset;
+	uint64_t	resid, len = dbuf->db_data_size;
+
+	ASSERT(flags == 0 || flags == ZVIO_COMMIT || flags == ZVIO_ABORT);
+
+	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);
+
+	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
+	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)len);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+
+	if (error) {
+		dmu_tx_abort(tx);
+		zfs_range_unlock(rl);
+		sbd_zvol_rele_write_bufs_abort(sl, dbuf);
+		return (error);
+	}
+
+	toffset = offset;
+	resid = len;
+	for (i = 0; i < dbuf->db_sglist_length; i++) {
+		arc_buf_t *abuf;
+		int size;
+
+		abuf = abp[i];
+		size = arc_buf_size(abuf);
+		dmu_assign_arcbuf(sl->sl_zvol_bonus_hdl, toffset, abuf, tx);
+		toffset += size;
+		resid -= size;
+	}
+	ASSERT(resid == 0);
+
+	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
+	zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
+	    (ssize_t)len, sync);
+	dmu_tx_commit(tx);
+	zfs_range_unlock(rl);
+	kmem_free(zvio->zvio_abp,
+	    sizeof (arc_buf_t *) * dbuf->db_sglist_length);
+	zvio->zvio_abp = NULL;
+	if (sync && (flags & ZVIO_COMMIT))
+		zil_commit(sl->sl_zvol_zil_hdl, UINT64_MAX, ZVOL_OBJ);
+	return (0);
+}
+
+/*
+ * Copy interface for callers using direct zvol access.
+ * Very similar to zvol_read but the uio may have multiple iovec entries.
+ */
+int
+sbd_zvol_copy_read(sbd_lu_t *sl, uio_t *uio)
+{
+	int		error;
+	rl_t 		*rl;
+	uint64_t	len = (uint64_t)uio->uio_resid;
+	uint64_t	offset = (uint64_t)uio->uio_loffset;
+
+	/* Make sure request is reasonable */
+	if (len > sl->sl_max_xfer_len)
+		return (E2BIG);
+	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
+		return (EIO);
+
+	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_READER);
+
+	error =  dmu_read_uio(sl->sl_zvol_objset_hdl, ZVOL_OBJ, uio, len);
+
+	zfs_range_unlock(rl);
+	if (error == ECKSUM)
+		error = EIO;
+	return (error);
+}
+
+/*
+ * Copy interface for callers using direct zvol access.
+ * Very similar to zvol_write but the uio may have multiple iovec entries.
+ */
+int
+sbd_zvol_copy_write(sbd_lu_t *sl, uio_t *uio, int flags)
+{
+	rl_t 		*rl;
+	dmu_tx_t 	*tx;
+	int		error, sync;
+	uint64_t	len = (uint64_t)uio->uio_resid;
+	uint64_t	offset = (uint64_t)uio->uio_loffset;
+
+	ASSERT(flags == 0 || flags == ZVIO_COMMIT);
+
+	/* Make sure request is reasonable */
+	if (len > sl->sl_max_xfer_len)
+		return (E2BIG);
+	if (offset + len  > zvol_get_volume_size(sl->sl_zvol_minor_hdl))
+		return (EIO);
+
+	rl = zfs_range_lock(sl->sl_zvol_rl_hdl, offset, len, RL_WRITER);
+
+	sync = !zvol_get_volume_wce(sl->sl_zvol_minor_hdl);
+
+	tx = dmu_tx_create(sl->sl_zvol_objset_hdl);
+	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, (int)uio->uio_resid);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+	} else {
+		/*
+		 * XXX use the new bonus handle entry.
+		 */
+		error = dmu_write_uio(sl->sl_zvol_objset_hdl, ZVOL_OBJ,
+		    uio, len, tx);
+		if (error == 0) {
+			zvol_log_write_minor(sl->sl_zvol_minor_hdl, tx, offset,
+			    (ssize_t)len, sync);
+		}
+		dmu_tx_commit(tx);
+	}
+	zfs_range_unlock(rl);
+	if (sync && (flags & ZVIO_COMMIT))
+		zil_commit(sl->sl_zvol_zil_hdl, UINT64_MAX, ZVOL_OBJ);
+	if (error == ECKSUM)
+		error = EIO;
+	return (error);
+}
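
As a concrete illustration of the sbd_zvol_numsegs() calculation above
(a sketch only; the 8K volblocksize, offsets, and the in-scope sl pointer
are assumed example values, not part of the changeset):

/*
 * Example: sl->sl_blksize = 8K, off = 6K, len = 12K. The span [6K, 18K)
 * touches three volume blocks, [0K,8K), [8K,16K) and [16K,24K):
 *
 *	numsegs = (P2ROUNDUP(18K, 8K) - P2ALIGN(6K, 8K)) / 8K
 *	        = (24K - 0) / 8K
 *	        = 3
 */
uint32_t nblks = sbd_zvol_numsegs(sl, 6 * 1024, 12 * 1024);	/* nblks == 3 */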
--- a/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/io/comstar/lu/stmf_sbd/stmf_sbd.h	Wed May 05 10:23:23 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_STMF_SBD_H
@@ -221,6 +220,13 @@
 	/* zfs metadata */
 	krwlock_t	sl_zfs_meta_lock;
 	char		*sl_zfs_meta;
+	minor_t		sl_zvol_minor;		/* for direct zvol calls */
+	/* opaque handles for zvol direct calls */
+	void		*sl_zvol_minor_hdl;
+	void		*sl_zvol_objset_hdl;
+	void		*sl_zvol_zil_hdl;
+	void		*sl_zvol_rl_hdl;
+	void		*sl_zvol_bonus_hdl;
 
 	/* Backing store */
 	char		*sl_data_filename;
@@ -230,6 +236,8 @@
 	uint64_t	sl_data_readable_size;	/* read() fails after this */
 	uint64_t	sl_data_offset;		/* After the metadata,if any */
 	uint64_t	sl_lu_size;		/* READ CAPACITY size */
+	uint64_t	sl_blksize;		/* used for zvols */
+	uint64_t	sl_max_xfer_len;	/* used for zvols */
 
 	struct sbd_it_data	*sl_it_list;
 	struct sbd_pgr		*sl_pgr;
@@ -258,6 +266,7 @@
 #define	SL_ZFS_META			    0x10000
 #define	SL_WRITEBACK_CACHE_SET_UNSUPPORTED  0x20000
 #define	SL_FLUSH_ON_DISABLED_WRITECACHE	    0x40000
+#define	SL_CALL_ZVOL			    0x80000
 
 /*
  * sl_trans_op. LU is undergoing some transition and this field
--- a/usr/src/uts/common/io/comstar/port/fct/fct.c	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/io/comstar/port/fct/fct.c	Wed May 05 10:23:23 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/conf.h>
@@ -1096,6 +1095,27 @@
 	    pminsize, flags));
 }
 
+stmf_status_t
+fct_setup_dbuf(scsi_task_t *task, stmf_data_buf_t *dbuf, uint32_t flags)
+{
+	fct_local_port_t *port = (fct_local_port_t *)
+	    task->task_lport->lport_port_private;
+
+	ASSERT(port->port_fds->fds_setup_dbuf != NULL);
+	if (port->port_fds->fds_setup_dbuf == NULL)
+		return (STMF_FAILURE);
+
+	return (port->port_fds->fds_setup_dbuf(port, dbuf, flags));
+}
+
+void
+fct_teardown_dbuf(stmf_dbuf_store_t *ds, stmf_data_buf_t *dbuf)
+{
+	fct_dbuf_store_t *fds = ds->ds_port_private;
+
+	fds->fds_teardown_dbuf(fds, dbuf);
+}
+
 void
 fct_free_dbuf(stmf_dbuf_store_t *ds, stmf_data_buf_t *dbuf)
 {
@@ -1199,6 +1219,8 @@
 	lport->lport_pp = port->port_pp;
 	port->port_fds->fds_ds->ds_alloc_data_buf = fct_alloc_dbuf;
 	port->port_fds->fds_ds->ds_free_data_buf = fct_free_dbuf;
+	port->port_fds->fds_ds->ds_setup_dbuf = fct_setup_dbuf;
+	port->port_fds->fds_ds->ds_teardown_dbuf = fct_teardown_dbuf;
 	lport->lport_ds = port->port_fds->fds_ds;
 	lport->lport_xfer_data = fct_xfer_scsi_data;
 	lport->lport_send_status = fct_send_scsi_status;
@@ -1711,6 +1733,8 @@
 void
 fct_post_rcvd_cmd(fct_cmd_t *cmd, stmf_data_buf_t *dbuf)
 {
+	fct_dbuf_store_t *fds;
+
 	if (cmd->cmd_type == FCT_CMD_FCP_XCHG) {
 		fct_i_cmd_t *icmd = (fct_i_cmd_t *)cmd->cmd_fct_private;
 		fct_i_local_port_t *iport =
@@ -1739,6 +1763,30 @@
 				task->task_additional_flags |=
 				    TASK_AF_PORT_LOAD_HIGH;
 		}
+		/*
+		 * If the target driver accepts sglists, fill in task fields.
+		 */
+		fds = cmd->cmd_port->port_fds;
+		if (fds->fds_setup_dbuf != NULL) {
+			task->task_additional_flags |= TASK_AF_ACCEPT_LU_DBUF;
+			task->task_copy_threshold = fds->fds_copy_threshold;
+			task->task_max_xfer_len = fds->fds_max_sgl_xfer_len;
+			/*
+			 * A single stream load encounters a little extra
+			 * latency if large xfers are done in 1 chunk.
+			 * Give a hint to the LU that starting the xfer
+			 * with a smaller chunk would be better in this case.
+			 * For any other load, use maximum chunk size.
+			 */
+			if (load == 1) {
+				/* estimate */
+				task->task_1st_xfer_len = 128*1024;
+			} else {
+				/* zero means no hint */
+				task->task_1st_xfer_len = 0;
+			}
+		}
+
 		stmf_post_task((scsi_task_t *)cmd->cmd_specific, dbuf);
 		atomic_and_32(&icmd->icmd_flags, ~ICMD_IN_TRANSITION);
 		return;
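
The task fields set here are only hints; the LU remains free to pick its own transfer sizes. A minimal sketch, using a hypothetical LU helper, of how task_max_xfer_len and task_1st_xfer_len (both added to scsi_task_t by this changeset) might be consulted:

/*
 * Illustrative only: clamp a transfer to the port's advertised limit,
 * honoring the optional first-transfer hint when it is nonzero.
 */
static uint32_t
lu_first_xfer_size(scsi_task_t *task, uint32_t remaining)
{
	uint32_t len = remaining;

	if (task->task_max_xfer_len != 0 && len > task->task_max_xfer_len)
		len = task->task_max_xfer_len;
	/* zero means the port gave no hint for the first chunk */
	if (task->task_1st_xfer_len != 0 && len > task->task_1st_xfer_len)
		len = task->task_1st_xfer_len;
	return (len);
}
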
--- a/usr/src/uts/common/io/comstar/port/qlt/qlt.c	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/io/comstar/port/qlt/qlt.c	Wed May 05 10:23:23 2010 -0700
@@ -25,8 +25,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/conf.h>
@@ -1006,7 +1005,8 @@
 	port_attrs->supported_speed = PORT_SPEED_1G |
 	    PORT_SPEED_2G | PORT_SPEED_4G;
 	if (qlt->qlt_25xx_chip)
-		port_attrs->supported_speed |= PORT_SPEED_8G;
+		port_attrs->supported_speed = PORT_SPEED_2G | PORT_SPEED_4G |
+		    PORT_SPEED_8G;
 	if (qlt->qlt_81xx_chip)
 		port_attrs->supported_speed = PORT_SPEED_10G;
 
@@ -1097,6 +1097,9 @@
 	if (qlt_dmem_init(qlt) != QLT_SUCCESS) {
 		return (FCT_FAILURE);
 	}
+	/* Initialize the ddi_dma_handle free pool */
+	qlt_dma_handle_pool_init(qlt);
+
 	port = (fct_local_port_t *)fct_alloc(FCT_STRUCT_LOCAL_PORT, 0, 0);
 	if (port == NULL) {
 		goto qlt_pstart_fail_1;
@@ -1108,6 +1111,10 @@
 	qlt->qlt_port = port;
 	fds->fds_alloc_data_buf = qlt_dmem_alloc;
 	fds->fds_free_data_buf = qlt_dmem_free;
+	fds->fds_setup_dbuf = qlt_dma_setup_dbuf;
+	fds->fds_teardown_dbuf = qlt_dma_teardown_dbuf;
+	fds->fds_max_sgl_xfer_len = QLT_DMA_SG_LIST_LENGTH * MMU_PAGESIZE;
+	fds->fds_copy_threshold = MMU_PAGESIZE;
 	fds->fds_fca_private = (void *)qlt;
 	/*
 	 * Since we keep everything in the state struct and dont allocate any
@@ -1158,6 +1165,7 @@
 	fct_free(port);
 	qlt->qlt_port = NULL;
 qlt_pstart_fail_1:
+	qlt_dma_handle_pool_fini(qlt);
 	qlt_dmem_fini(qlt);
 	return (QLT_FAILURE);
 }
@@ -1175,6 +1183,7 @@
 	fct_free(qlt->qlt_port->port_fds);
 	fct_free(qlt->qlt_port);
 	qlt->qlt_port = NULL;
+	qlt_dma_handle_pool_fini(qlt);
 	qlt_dmem_fini(qlt);
 	return (QLT_SUCCESS);
 }
@@ -1333,13 +1342,13 @@
 		DMEM_WR16(qlt, icb+0x74,
 		    qlt81nvr->enode_mac[4] |
 		    (qlt81nvr->enode_mac[5] << 8));
-	} else {
-		DMEM_WR32(qlt, icb+0x5c, BIT_11 | BIT_5 | BIT_4 |
-		    BIT_2 | BIT_1 | BIT_0);
-		DMEM_WR32(qlt, icb+0x60, BIT_5);
-		DMEM_WR32(qlt, icb+0x64, BIT_14 | BIT_8 | BIT_7 |
-		    BIT_4);
-	}
+		} else {
+			DMEM_WR32(qlt, icb+0x5c, BIT_11 | BIT_5 | BIT_4 |
+			    BIT_2 | BIT_1 | BIT_0);
+			DMEM_WR32(qlt, icb+0x60, BIT_5);
+			DMEM_WR32(qlt, icb+0x64, BIT_14 | BIT_8 | BIT_7 |
+			    BIT_4);
+		}
 
 	if (qlt->qlt_81xx_chip) {
 		qlt_dmem_bctl_t		*bctl;
@@ -2995,7 +3004,7 @@
 		caddr_t resp = &qlt->resp_ptr[qlt->resp_ndx_to_fw << 6];
 		uint32_t ent_cnt;
 
-		ent_cnt = (uint32_t)(resp[1]);
+		ent_cnt = (uint32_t)(resp[0] == 0x51 ? resp[1] : 1);
 		if (ent_cnt > total_ent) {
 			break;
 		}
@@ -3375,11 +3384,12 @@
 fct_status_t
 qlt_xfer_scsi_data(fct_cmd_t *cmd, stmf_data_buf_t *dbuf, uint32_t ioflags)
 {
-	qlt_dmem_bctl_t *bctl = (qlt_dmem_bctl_t *)dbuf->db_port_private;
-	qlt_state_t *qlt = (qlt_state_t *)cmd->cmd_port->port_fca_private;
-	qlt_cmd_t *qcmd = (qlt_cmd_t *)cmd->cmd_fca_private;
-	uint8_t *req;
-	uint16_t flags;
+	qlt_dmem_bctl_t	*bctl = (qlt_dmem_bctl_t *)dbuf->db_port_private;
+	qlt_state_t	*qlt = (qlt_state_t *)cmd->cmd_port->port_fca_private;
+	qlt_cmd_t	*qcmd = (qlt_cmd_t *)cmd->cmd_fca_private;
+	uint8_t		*req, rcnt;
+	uint16_t	flags;
+	uint16_t	cookie_count;
 
 	if (dbuf->db_handle == 0)
 		qcmd->dbuf = dbuf;
@@ -3394,28 +3404,114 @@
 	if (dbuf->db_flags & DB_SEND_STATUS_GOOD)
 		flags = (uint16_t)(flags | BIT_15);
 
+	if (dbuf->db_flags & DB_LU_DATA_BUF) {
+		/*
+		 * Data bufs from LU are in scatter/gather list format.
+		 */
+		cookie_count = qlt_get_cookie_count(dbuf);
+		rcnt = qlt_get_iocb_count(cookie_count);
+	} else {
+		cookie_count = 1;
+		rcnt = 1;
+	}
 	mutex_enter(&qlt->req_lock);
-	req = (uint8_t *)qlt_get_req_entries(qlt, 1);
+	req = (uint8_t *)qlt_get_req_entries(qlt, rcnt);
 	if (req == NULL) {
 		mutex_exit(&qlt->req_lock);
 		return (FCT_BUSY);
 	}
-	bzero(req, IOCB_SIZE);
-	req[0] = 0x12; req[1] = 0x1;
+	bzero(req, IOCB_SIZE);	/* XXX needed ? */
+	req[0] = 0x12;
+	req[1] = rcnt;
 	req[2] = dbuf->db_handle;
 	QMEM_WR32(qlt, req+4, cmd->cmd_handle);
 	QMEM_WR16(qlt, req+8, cmd->cmd_rp->rp_handle);
 	QMEM_WR16(qlt, req+10, 60);	/* 60 seconds timeout */
-	req[12] = 1;
+	QMEM_WR16(qlt, req+12, cookie_count);
 	QMEM_WR32(qlt, req+0x10, cmd->cmd_rportid);
 	QMEM_WR32(qlt, req+0x14, qcmd->fw_xchg_addr);
 	QMEM_WR16(qlt, req+0x1A, flags);
 	QMEM_WR16(qlt, req+0x20, cmd->cmd_oxid);
 	QMEM_WR32(qlt, req+0x24, dbuf->db_relative_offset);
 	QMEM_WR32(qlt, req+0x2C, dbuf->db_data_size);
-	QMEM_WR64(qlt, req+0x34, bctl->bctl_dev_addr);
-	QMEM_WR32(qlt, req+0x34+8, dbuf->db_data_size);
-	qlt_submit_req_entries(qlt, 1);
+	if (dbuf->db_flags & DB_LU_DATA_BUF) {
+		uint8_t			*qptr;	/* qlt continuation segs */
+		uint16_t		cookie_resid;
+		uint16_t		cont_segs;
+		ddi_dma_cookie_t	cookie, *ckp;
+
+		/*
+		 * See if the dma cookies are in simple array format.
+		 */
+		ckp = qlt_get_cookie_array(dbuf);
+
+		/*
+		 * Program the first segment into main record.
+		 */
+		if (ckp) {
+			ASSERT(ckp->dmac_size);
+			QMEM_WR64(qlt, req+0x34, ckp->dmac_laddress);
+			QMEM_WR32(qlt, req+0x3c, ckp->dmac_size);
+		} else {
+			qlt_ddi_dma_nextcookie(dbuf, &cookie);
+			ASSERT(cookie.dmac_size);
+			QMEM_WR64(qlt, req+0x34, cookie.dmac_laddress);
+			QMEM_WR32(qlt, req+0x3c, cookie.dmac_size);
+		}
+		cookie_resid = cookie_count-1;
+
+		/*
+		 * Program remaining segments into continuation records.
+		 */
+		while (cookie_resid) {
+			req += IOCB_SIZE;
+			if (req >= (uint8_t *)qlt->resp_ptr) {
+				req = (uint8_t *)qlt->req_ptr;
+			}
+			req[0] = 0x0a;
+			req[1] = 1;
+			req[2] = req[3] = 0;	/* tidy */
+			qptr = &req[4];
+			for (cont_segs = CONT_A64_DATA_SEGMENTS;
+			    cont_segs && cookie_resid; cont_segs--) {
+
+				if (ckp) {
+					++ckp;		/* next cookie */
+					ASSERT(ckp->dmac_size != 0);
+					QMEM_WR64(qlt, qptr,
+					    ckp->dmac_laddress);
+					qptr += 8;	/* skip over laddress */
+					QMEM_WR32(qlt, qptr, ckp->dmac_size);
+					qptr += 4;	/* skip over size */
+				} else {
+					qlt_ddi_dma_nextcookie(dbuf, &cookie);
+					ASSERT(cookie.dmac_size != 0);
+					QMEM_WR64(qlt, qptr,
+					    cookie.dmac_laddress);
+					qptr += 8;	/* skip over laddress */
+					QMEM_WR32(qlt, qptr, cookie.dmac_size);
+					qptr += 4;	/* skip over size */
+				}
+				cookie_resid--;
+			}
+			/*
+			 * zero unused remainder of IOCB
+			 */
+			if (cont_segs) {
+				size_t resid;
+				resid = (size_t)((uintptr_t)(req+IOCB_SIZE) -
+				    (uintptr_t)qptr);
+				ASSERT(resid < IOCB_SIZE);
+				bzero(qptr, resid);
+			}
+		}
+	} else {
+		/* Single, contiguous buffer */
+		QMEM_WR64(qlt, req+0x34, bctl->bctl_dev_addr);
+		QMEM_WR32(qlt, req+0x34+8, dbuf->db_data_size);
+	}
+
+	qlt_submit_req_entries(qlt, rcnt);
 	mutex_exit(&qlt->req_lock);
 
 	return (STMF_SUCCESS);
@@ -6074,7 +6170,7 @@
 qlt_read_string_prop(qlt_state_t *qlt, char *prop, char **prop_val)
 {
 	return (ddi_prop_lookup_string(DDI_DEV_T_ANY, qlt->dip,
-	    DDI_PROP_DONTPASS | DDI_PROP_CANSLEEP, prop, prop_val));
+	    DDI_PROP_DONTPASS, prop, prop_val));
 }
 
 static int
--- a/usr/src/uts/common/io/comstar/port/qlt/qlt.h	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/io/comstar/port/qlt/qlt.h	Wed May 05 10:23:23 2010 -0700
@@ -25,8 +25,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_QLT_H
@@ -240,6 +239,7 @@
 } qlt_abts_cmd_t;
 
 struct qlt_dmem_bucket;
+struct qlt_dma_handle_pool;
 
 #define	QLT_INTR_FIXED	0x1
 #define	QLT_INTR_MSI	0x2
@@ -259,6 +259,9 @@
 	fct_local_port_t	*qlt_port;
 	struct qlt_dmem_bucket	**dmem_buckets;
 
+	struct qlt_dma_handle_pool
+				*qlt_dma_handle_pool;
+
 	int			instance;
 	uint8_t			qlt_state:7,
 				qlt_state_not_acked:1;
@@ -351,7 +354,6 @@
 	uint64_t	qlt_bumpbucket;		/* bigger buffer supplied */
 	uint64_t	qlt_pmintry;
 	uint64_t	qlt_pmin_ok;
-
 } qlt_state_t;
 
 /*
--- a/usr/src/uts/common/io/comstar/port/qlt/qlt_dma.c	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/io/comstar/port/qlt/qlt_dma.c	Wed May 05 10:23:23 2010 -0700
@@ -25,8 +25,7 @@
  */
 
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/conf.h>
@@ -42,6 +41,12 @@
 #include <qlt.h>
 #include <qlt_dma.h>
 
+/*
+ *  Local Function Prototypes.
+ */
+static void
+qlt_dma_free_handles(qlt_state_t *qlt, qlt_dma_handle_t *first_handle);
+
 #define	BUF_COUNT_2K		2048
 #define	BUF_COUNT_8K		512
 #define	BUF_COUNT_64K		256
@@ -77,18 +82,18 @@
 fct_status_t
 qlt_dmem_init(qlt_state_t *qlt)
 {
-	qlt_dmem_bucket_t *p;
-	qlt_dmem_bctl_t *bctl, *bc;
-	qlt_dmem_bctl_t *prev;
-	int ndx, i;
-	uint32_t total_mem;
-	uint8_t *addr;
-	uint8_t *host_addr;
-	uint64_t dev_addr;
-	ddi_dma_cookie_t cookie;
-	uint32_t ncookie;
-	uint32_t bsize;
-	size_t len;
+	qlt_dmem_bucket_t	*p;
+	qlt_dmem_bctl_t		*bctl, *bc;
+	qlt_dmem_bctl_t		*prev;
+	int			ndx, i;
+	uint32_t		total_mem;
+	uint8_t			*addr;
+	uint8_t			*host_addr;
+	uint64_t		dev_addr;
+	ddi_dma_cookie_t	cookie;
+	uint32_t		ncookie;
+	uint32_t		bsize;
+	size_t			len;
 
 	if (qlt->qlt_bucketcnt[0] != 0) {
 		bucket2K.dmem_nbufs = qlt->qlt_bucketcnt[0];
@@ -166,7 +171,7 @@
 		p->dmem_bctl_free_list = bctl;
 		p->dmem_nbufs_free = p->dmem_nbufs;
 		for (i = 0; i < p->dmem_nbufs; i++) {
-			stmf_data_buf_t *db;
+			stmf_data_buf_t	*db;
 			prev = bctl;
 			bctl->bctl_bucket = p;
 			bctl->bctl_buf = db = stmf_alloc(STMF_STRUCT_DATA_BUF,
@@ -216,6 +221,44 @@
 }
 
 void
+qlt_dma_handle_pool_init(qlt_state_t *qlt)
+{
+	qlt_dma_handle_pool_t *pool;
+
+	pool = kmem_zalloc(sizeof (*pool), KM_SLEEP);
+	mutex_init(&pool->pool_lock, NULL, MUTEX_DRIVER, NULL);
+	qlt->qlt_dma_handle_pool = pool;
+}
+
+void
+qlt_dma_handle_pool_fini(qlt_state_t *qlt)
+{
+	qlt_dma_handle_pool_t	*pool;
+	qlt_dma_handle_t	*handle, *next_handle;
+
+	pool = qlt->qlt_dma_handle_pool;
+	mutex_enter(&pool->pool_lock);
+	/*
+	 * XXX Need to wait for free == total elements
+	 * XXX Not sure how other driver shutdown stuff is done.
+	 */
+	ASSERT(pool->num_free == pool->num_total);
+	if (pool->num_free != pool->num_total)
+		cmn_err(CE_WARN,
+		    "num_free %d != num_total %d\n",
+		    pool->num_free, pool->num_total);
+	handle = pool->free_list;
+	while (handle) {
+		next_handle = handle->next;
+		kmem_free(handle, sizeof (*handle));
+		handle = next_handle;
+	}
+	qlt->qlt_dma_handle_pool = NULL;
+	mutex_destroy(&pool->pool_lock);
+	kmem_free(pool, sizeof (*pool));
+}
+
+void
 qlt_dmem_fini(qlt_state_t *qlt)
 {
 	qlt_dmem_bucket_t *p;
@@ -254,7 +297,7 @@
 /* ARGSUSED */
 stmf_data_buf_t *
 qlt_i_dmem_alloc(qlt_state_t *qlt, uint32_t size, uint32_t *pminsize,
-					uint32_t flags)
+    uint32_t flags)
 {
 	qlt_dmem_bucket_t	*p;
 	qlt_dmem_bctl_t 	*bctl;
@@ -350,9 +393,13 @@
 void
 qlt_dmem_free(fct_dbuf_store_t *fds, stmf_data_buf_t *dbuf)
 {
-	qlt_dmem_bctl_t *bctl = (qlt_dmem_bctl_t *)dbuf->db_port_private;
-	qlt_dmem_bucket_t *p = bctl->bctl_bucket;
+	qlt_dmem_bctl_t		*bctl;
+	qlt_dmem_bucket_t	*p;
 
+	ASSERT((dbuf->db_flags & DB_LU_DATA_BUF) == 0);
+
+	bctl = (qlt_dmem_bctl_t *)dbuf->db_port_private;
+	p = bctl->bctl_bucket;
 	mutex_enter(&p->dmem_lock);
 	bctl->bctl_next = p->dmem_bctl_free_list;
 	p->dmem_bctl_free_list = bctl;
@@ -363,10 +410,394 @@
 void
 qlt_dmem_dma_sync(stmf_data_buf_t *dbuf, uint_t sync_type)
 {
-	qlt_dmem_bctl_t *bctl = (qlt_dmem_bctl_t *)dbuf->db_port_private;
-	qlt_dmem_bucket_t *p = bctl->bctl_bucket;
+	qlt_dmem_bctl_t		*bctl;
+	qlt_dma_sgl_t		*qsgl;
+	qlt_dmem_bucket_t	*p;
+	qlt_dma_handle_t	*th;
+	int			rv;
+
+	if (dbuf->db_flags & DB_LU_DATA_BUF) {
+		/*
+		 * go through ddi handle list
+		 */
+		qsgl = (qlt_dma_sgl_t *)dbuf->db_port_private;
+		th = qsgl->handle_list;
+		while (th) {
+			rv = ddi_dma_sync(th->dma_handle,
+			    0, 0, sync_type);
+			if (rv != DDI_SUCCESS) {
+				cmn_err(CE_WARN, "ddi_dma_sync FAILED\n");
+			}
+			th = th->next;
+		}
+	} else {
+		bctl = (qlt_dmem_bctl_t *)dbuf->db_port_private;
+		p = bctl->bctl_bucket;
+		(void) ddi_dma_sync(p->dmem_dma_handle, (off_t)
+		    (bctl->bctl_dev_addr - p->dmem_dev_addr),
+		    dbuf->db_data_size, sync_type);
+	}
+}
+
+/*
+ * A very lightweight version of ddi_dma_addr_bind_handle().
+ */
+uint64_t
+qlt_ddi_vtop(caddr_t vaddr)
+{
+	uint64_t offset, paddr;
+	pfn_t pfn;
+
+	pfn = hat_getpfnum(kas.a_hat, vaddr);
+	ASSERT(pfn != PFN_INVALID && pfn != PFN_SUSPENDED);
+	offset = ((uintptr_t)vaddr) & MMU_PAGEOFFSET;
+	paddr = mmu_ptob(pfn);
+	return (paddr+offset);
+}
+
+static ddi_dma_attr_t 	qlt_sgl_dma_attr = {
+	DMA_ATTR_V0,		/* dma_attr_version */
+	0,				/* low DMA address range */
+	0xffffffffffffffff,		/* high DMA address range */
+	0xffffffff,			/* DMA counter register */
+	64,				/* DMA address alignment */
+	0xff,			/* DMA burstsizes */
+	1,				/* min effective DMA size */
+	0xffffffff,			/* max DMA xfer size */
+	0xffffffff,			/* segment boundary */
+	QLT_DMA_SG_LIST_LENGTH,	/* s/g list length */
+	1,				/* granularity of device */
+	0				/* DMA transfer flags */
+};
+
+/*
+ * Allocate a qlt_dma_handle container and fill it with a ddi_dma_handle
+ */
+static qlt_dma_handle_t *
+qlt_dma_alloc_handle(qlt_state_t *qlt)
+{
+	ddi_dma_handle_t ddi_handle;
+	qlt_dma_handle_t *qlt_handle;
+	int rv;
+
+	rv = ddi_dma_alloc_handle(qlt->dip, &qlt_sgl_dma_attr,
+	    DDI_DMA_SLEEP, 0, &ddi_handle);
+	if (rv != DDI_SUCCESS) {
+		EL(qlt, "ddi_dma_alloc_handle status=%xh\n", rv);
+		return (NULL);
+	}
+	qlt_handle = kmem_zalloc(sizeof (qlt_dma_handle_t), KM_SLEEP);
+	qlt_handle->dma_handle = ddi_handle;
+	return (qlt_handle);
+}
+
+/*
+ * Allocate a list of qlt_dma_handle containers from the free list
+ */
+static qlt_dma_handle_t *
+qlt_dma_alloc_handle_list(qlt_state_t *qlt, int handle_count)
+{
+	qlt_dma_handle_pool_t	*pool;
+	qlt_dma_handle_t	*tmp_handle, *first_handle, *last_handle;
+	int i;
+
+	/*
+	 * Make sure the free list can satisfy the request.
+	 * Once the free list is primed, it should satisfy most requests.
+	 * XXX Should there be a limit on pool size?
+	 */
+	pool = qlt->qlt_dma_handle_pool;
+	mutex_enter(&pool->pool_lock);
+	while (handle_count > pool->num_free) {
+		mutex_exit(&pool->pool_lock);
+		if ((tmp_handle = qlt_dma_alloc_handle(qlt)) == NULL)
+			return (NULL);
+		mutex_enter(&pool->pool_lock);
+		tmp_handle->next = pool->free_list;
+		pool->free_list = tmp_handle;
+		pool->num_free++;
+		pool->num_total++;
+	}
+
+	/*
+	 * The free list lock is held and the list is large enough to
+	 * satisfy this request. Run down the freelist and snip off
+	 * the number of elements needed for this request.
+	 */
+	first_handle = pool->free_list;
+	tmp_handle = first_handle;
+	for (i = 0; i < handle_count; i++) {
+		last_handle = tmp_handle;
+		tmp_handle = tmp_handle->next;
+	}
+	pool->free_list = tmp_handle;
+	pool->num_free -= handle_count;
+	mutex_exit(&pool->pool_lock);
+	last_handle->next = NULL;	/* sanity */
+	return (first_handle);
+}
+
+/*
+ * Return a list of qlt_dma_handle containers to the free list.
+ */
+static void
+qlt_dma_free_handles(qlt_state_t *qlt, qlt_dma_handle_t *first_handle)
+{
+	qlt_dma_handle_pool_t *pool;
+	qlt_dma_handle_t *tmp_handle, *last_handle;
+	int rv, handle_count;
+
+	/*
+	 * Traverse the list and unbind the handles
+	 */
+	ASSERT(first_handle);
+	tmp_handle = first_handle;
+	handle_count = 0;
+	while (tmp_handle != NULL) {
+		last_handle = tmp_handle;
+		/*
+		 * If the handle is bound, unbind the handle so it can be
+		 * reused. It may not be bound if there was a bind failure.
+		 */
+		if (tmp_handle->num_cookies != 0) {
+			rv = ddi_dma_unbind_handle(tmp_handle->dma_handle);
+			ASSERT(rv == DDI_SUCCESS);
+			tmp_handle->num_cookies = 0;
+			tmp_handle->num_cookies_fetched = 0;
+		}
+		tmp_handle = tmp_handle->next;
+		handle_count++;
+	}
+	/*
+	 * Insert this list into the free list
+	 */
+	pool = qlt->qlt_dma_handle_pool;
+	mutex_enter(&pool->pool_lock);
+	last_handle->next = pool->free_list;
+	pool->free_list = first_handle;
+	pool->num_free += handle_count;
+	mutex_exit(&pool->pool_lock);
+}
+
+/*
+ * Return the number of cookies produced by mapping this dbuf.
+ */
+uint16_t
+qlt_get_cookie_count(stmf_data_buf_t *dbuf)
+{
+	qlt_dma_sgl_t *qsgl = dbuf->db_port_private;
+
+	ASSERT(dbuf->db_flags & DB_LU_DATA_BUF);
+	return (qsgl->cookie_count);
+}
+
+ddi_dma_cookie_t *
+qlt_get_cookie_array(stmf_data_buf_t *dbuf)
+{
+	qlt_dma_sgl_t *qsgl = dbuf->db_port_private;
+
+	ASSERT(dbuf->db_flags & DB_LU_DATA_BUF);
+
+	if (qsgl->cookie_prefetched)
+		return (&qsgl->cookie[0]);
+	else
+		return (NULL);
+}
 
-	(void) ddi_dma_sync(p->dmem_dma_handle, (off_t)
-	    (bctl->bctl_dev_addr - p->dmem_dev_addr),
-	    dbuf->db_data_size, sync_type);
+/*
+ * Wrapper around ddi_dma_nextcookie that hides the ddi_dma_handle usage.
+ */
+void
+qlt_ddi_dma_nextcookie(stmf_data_buf_t *dbuf, ddi_dma_cookie_t *cookiep)
+{
+	qlt_dma_sgl_t *qsgl = dbuf->db_port_private;
+
+	ASSERT(dbuf->db_flags & DB_LU_DATA_BUF);
+
+	if (qsgl->cookie_prefetched) {
+		ASSERT(qsgl->cookie_next_fetch < qsgl->cookie_count);
+		*cookiep = qsgl->cookie[qsgl->cookie_next_fetch++];
+	} else {
+		qlt_dma_handle_t *fetch;
+		qlt_dma_handle_t *FETCH_DONE = (qlt_dma_handle_t *)0xbad;
+
+		ASSERT(qsgl->handle_list != NULL);
+		ASSERT(qsgl->handle_next_fetch != FETCH_DONE);
+
+		fetch = qsgl->handle_next_fetch;
+		if (fetch->num_cookies_fetched == 0) {
+			*cookiep = fetch->first_cookie;
+		} else {
+			ddi_dma_nextcookie(fetch->dma_handle, cookiep);
+		}
+		if (++fetch->num_cookies_fetched == fetch->num_cookies) {
+			if (fetch->next == NULL)
+				qsgl->handle_next_fetch = FETCH_DONE;
+			else
+				qsgl->handle_next_fetch = fetch->next;
+		}
+	}
 }
+
+/*
+ * Set this flag to fetch the DDI dma cookies from the handles here and
+ * store them in the port private area of the dbuf. This will allow
+ * faster access to the cookies in qlt_xfer_scsi_data() at the expense of
+ * an extra copy. If the qlt->req_lock is hot, this may help.
+ */
+int qlt_sgl_prefetch = 0;
+
+/*ARGSUSED*/
+stmf_status_t
+qlt_dma_setup_dbuf(fct_local_port_t *port, stmf_data_buf_t *dbuf,
+    uint32_t flags)
+{
+	qlt_state_t		*qlt = port->port_fca_private;
+	qlt_dma_sgl_t		*qsgl;
+	struct stmf_sglist_ent	*sglp;
+	qlt_dma_handle_t	*handle_list, *th;
+	int			i, rv;
+	ddi_dma_cookie_t	*cookie_p;
+	int			cookie_count, numbufs;
+	int			prefetch;
+	size_t			qsize;
+
+	/*
+	 * pseudo code:
+	 * get dma handle list from cache - one per sglist entry
+	 * foreach sglist entry
+	 *	bind dma handle to sglist vaddr
+	 * allocate space for DMA state to store in db_port_private
+	 * fill in port private object
+	 * if prefetching
+	 *	move all dma cookies into db_port_private
+	 */
+	dbuf->db_port_private = NULL;
+	numbufs = dbuf->db_sglist_length;
+	handle_list = qlt_dma_alloc_handle_list(qlt, numbufs);
+	if (handle_list == NULL) {
+		EL(qlt, "handle_list==NULL\n");
+		return (STMF_FAILURE);
+	}
+	/*
+	 * Loop through sglist and bind each entry to a handle
+	 */
+	th = handle_list;
+	sglp = &dbuf->db_sglist[0];
+	cookie_count = 0;
+	for (i = 0; i < numbufs; i++, sglp++) {
+
+		/*
+		 * Bind this sgl entry to a DDI dma handle
+		 */
+		if ((rv = ddi_dma_addr_bind_handle(
+		    th->dma_handle,
+		    NULL,
+		    (caddr_t)(sglp->seg_addr),
+		    (size_t)sglp->seg_length,
+		    DDI_DMA_RDWR | DDI_DMA_STREAMING,
+		    DDI_DMA_DONTWAIT,
+		    NULL,
+		    &th->first_cookie,
+		    &th->num_cookies)) != DDI_DMA_MAPPED) {
+			cmn_err(CE_NOTE, "ddi_dma_addr_bind_handle %d", rv);
+			qlt_dma_free_handles(qlt, handle_list);
+			return (STMF_FAILURE);
+		}
+
+		/*
+		 * Add to total cookie count
+		 */
+		cookie_count += th->num_cookies;
+		if (cookie_count > QLT_DMA_SG_LIST_LENGTH) {
+			/*
+			 * Request exceeds HBA limit
+			 */
+			qlt_dma_free_handles(qlt, handle_list);
+			return (STMF_FAILURE);
+		}
+		/* move to next ddi_dma_handle */
+		th = th->next;
+	}
+
+	/*
+	 * Allocate our port private object for DMA mapping state.
+	 */
+	prefetch = qlt_sgl_prefetch;
+	qsize = sizeof (qlt_dma_sgl_t);
+	if (prefetch) {
+		/* one extra ddi_dma_cookie allocated for alignment padding */
+		qsize += cookie_count * sizeof (ddi_dma_cookie_t);
+	}
+	qsgl = kmem_alloc(qsize, KM_SLEEP);
+	/*
+	 * Fill in the sgl
+	 */
+	dbuf->db_port_private = qsgl;
+	qsgl->qsize = qsize;
+	qsgl->handle_count = dbuf->db_sglist_length;
+	qsgl->cookie_prefetched = prefetch;
+	qsgl->cookie_count = cookie_count;
+	qsgl->cookie_next_fetch = 0;
+	qsgl->handle_list = handle_list;
+	qsgl->handle_next_fetch = handle_list;
+	if (prefetch) {
+		/*
+		 * traverse handle list and move cookies to db_port_private
+		 */
+		th = handle_list;
+		cookie_p = &qsgl->cookie[0];
+		for (i = 0; i < numbufs; i++) {
+			uint_t cc = th->num_cookies;
+
+			*cookie_p++ = th->first_cookie;
+			while (--cc > 0) {
+				ddi_dma_nextcookie(th->dma_handle, cookie_p++);
+			}
+			th->num_cookies_fetched = th->num_cookies;
+			th = th->next;
+		}
+	}
+
+	return (STMF_SUCCESS);
+}
+
+void
+qlt_dma_teardown_dbuf(fct_dbuf_store_t *fds, stmf_data_buf_t *dbuf)
+{
+	qlt_state_t		*qlt = fds->fds_fca_private;
+	qlt_dma_sgl_t		*qsgl = dbuf->db_port_private;
+
+	ASSERT(qlt);
+	ASSERT(qsgl);
+	ASSERT(dbuf->db_flags & DB_LU_DATA_BUF);
+
+	/*
+	 * unbind and free the dma handles
+	 */
+	if (qsgl->handle_list) {
+		/* go through ddi handle list */
+		qlt_dma_free_handles(qlt, qsgl->handle_list);
+	}
+	kmem_free(qsgl, qsgl->qsize);
+}
+
+uint8_t
+qlt_get_iocb_count(uint32_t cookie_count)
+{
+	uint32_t	cnt, cont_segs;
+	uint8_t		iocb_count;
+
+	iocb_count = 1;
+	cnt = CMD7_2400_DATA_SEGMENTS;
+	cont_segs = CONT_A64_DATA_SEGMENTS;
+
+	if (cookie_count > cnt) {
+		cnt = cookie_count - cnt;
+		iocb_count = (uint8_t)(iocb_count + cnt / cont_segs);
+		if (cnt % cont_segs) {
+			iocb_count++;
+		}
+	}
+	return (iocb_count);
+}
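
Worked example of the arithmetic above, using the limits from qlt_dma.h (CMD7_2400_DATA_SEGMENTS = 1, CONT_A64_DATA_SEGMENTS = 5): a dbuf that maps to 12 DMA cookies needs one CTIO7 entry for the first cookie plus ceil(11 / 5) = 3 continuation entries, so qlt_get_iocb_count(12) returns 4. That is the same rcnt that qlt_xfer_scsi_data() reserves from the request ring before it starts filling in continuation records.
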
--- a/usr/src/uts/common/io/comstar/port/qlt/qlt_dma.h	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/io/comstar/port/qlt/qlt_dma.h	Wed May 05 10:23:23 2010 -0700
@@ -25,8 +25,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_QLT_DMA_H
@@ -38,13 +37,64 @@
 extern "C" {
 #endif
 
+/*
+ * DMA memory object.
+ */
+#define	QLT_DMA_SG_LIST_LENGTH	1270
+#define	CMD7_2400_DATA_SEGMENTS	1
+#define	CONT_A64_DATA_SEGMENTS	5
+
+
+/*
+ * Container for ddi_dma_handle
+ *
+ * These elements are either linked to an active dbuf or in the free list.
+ */
+struct qlt_dma_handle {
+	struct qlt_dma_handle	*next;
+	ddi_dma_handle_t	dma_handle;
+	ddi_dma_cookie_t	first_cookie;
+	uint_t			num_cookies;
+	uint_t			num_cookies_fetched;
+};
+
+typedef struct qlt_dma_handle qlt_dma_handle_t;
+
+/*
+ * The dbuf private data when using a scatter/gather list.
+ */
+struct qlt_dma_sgl {
+	uint16_t		handle_count;
+	uint16_t		cookie_count;
+	uint16_t		cookie_next_fetch;
+	uint16_t		cookie_prefetched;
+	qlt_dma_handle_t	*handle_list;
+	qlt_dma_handle_t	*handle_next_fetch;
+	size_t			qsize;
+	ddi_dma_cookie_t	cookie[1];
+};
+
+typedef struct qlt_dma_sgl qlt_dma_sgl_t;
+
+/*
+ * Structure to maintain ddi_dma_handle free pool.
+ */
+struct qlt_dma_handle_pool {
+	kmutex_t		pool_lock;	/* protects all fields */
+	qlt_dma_handle_t	*free_list;
+	int			num_free;
+	int			num_total;
+};
+
+typedef struct qlt_dma_handle_pool qlt_dma_handle_pool_t;
+
 struct qlt_dmem_bucket;
 
 typedef struct qlt_dmem_bctl {
 	struct qlt_dmem_bucket	*bctl_bucket;
 	struct qlt_dmem_bctl	*bctl_next;
 	uint64_t		bctl_dev_addr;
-	uint8_t			bctl_task_ndx;
+	uint8_t			bctl_task_ndx;	/* not used */
 	stmf_data_buf_t		*bctl_buf;
 } qlt_dmem_bctl_t;
 
@@ -63,13 +113,27 @@
 
 fct_status_t qlt_dmem_init(qlt_state_t *qlt);
 void qlt_dmem_fini(qlt_state_t *qlt);
+void qlt_dma_handle_pool_init(qlt_state_t *qlt);
+void qlt_dma_handle_pool_fini(qlt_state_t *qlt);
 stmf_data_buf_t *qlt_dmem_alloc(fct_local_port_t *port, uint32_t size,
     uint32_t *pminsize, uint32_t flags);
 stmf_data_buf_t *qlt_i_dmem_alloc(qlt_state_t *qlt, uint32_t size,
     uint32_t *pminsize, uint32_t flags);
 void qlt_dmem_free(fct_dbuf_store_t *fds, stmf_data_buf_t *dbuf);
 void qlt_i_dmem_free(qlt_state_t *qlt, stmf_data_buf_t *dbuf);
+stmf_status_t qlt_dma_setup_dbuf(fct_local_port_t *port,
+    stmf_data_buf_t *dbuf, uint32_t flags);
+void qlt_dma_teardown_dbuf(fct_dbuf_store_t *fds, stmf_data_buf_t *dbuf);
 void qlt_dmem_dma_sync(stmf_data_buf_t *dbuf, uint_t sync_type);
+uint8_t qlt_get_iocb_count(uint32_t cookie_cnt);
+uint64_t qlt_ddi_vtop(caddr_t vaddr);
+/*
+ * XXX move the following into the fct layer
+ */
+uint16_t qlt_get_cookie_count(stmf_data_buf_t *dbuf);
+void qlt_ddi_dma_nextcookie(stmf_data_buf_t *dbuf, ddi_dma_cookie_t *cookie_p);
+ddi_dma_cookie_t *qlt_get_cookie_array(stmf_data_buf_t *dbuf);
+
 
 #ifdef	__cplusplus
 }
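
A note on qlt_dma_sgl_t: the trailing cookie[1] member is sized at allocation time (qlt_dma_setup_dbuf() adds cookie_count * sizeof (ddi_dma_cookie_t) when cookie prefetching is enabled), and the final size is remembered in qsize because qlt_dma_teardown_dbuf() must pass the exact allocation length back to kmem_free().
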
--- a/usr/src/uts/common/io/comstar/port/qlt/qlt_open.h	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/io/comstar/port/qlt/qlt_open.h	Wed May 05 10:23:23 2010 -0700
@@ -45,7 +45,7 @@
 #endif
 
 #ifndef QLT_VERSION
-#define	QLT_VERSION	"20091202-1.04"
+#define	QLT_VERSION	"20100505-1.05"
 #endif
 
 #ifndef	QLT_NAME
--- a/usr/src/uts/common/io/comstar/stmf/stmf.c	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/io/comstar/stmf/stmf.c	Wed May 05 10:23:23 2010 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/conf.h>
@@ -2157,7 +2157,10 @@
 	stmf_size = stmf_sizes[struct_id].shared +
 	    stmf_sizes[struct_id].fw_private + additional_size;
 
-	sh = (__stmf_t *)kmem_zalloc(stmf_size, kmem_flag);
+	if (flags & AF_DONTZERO)
+		sh = (__stmf_t *)kmem_alloc(stmf_size, kmem_flag);
+	else
+		sh = (__stmf_t *)kmem_zalloc(stmf_size, kmem_flag);
 
 	if (sh == NULL)
 		return (NULL);
@@ -4242,6 +4245,54 @@
 	return (NULL);
 }
 
+stmf_status_t
+stmf_setup_dbuf(scsi_task_t *task, stmf_data_buf_t *dbuf, uint32_t flags)
+{
+	stmf_i_scsi_task_t *itask =
+	    (stmf_i_scsi_task_t *)task->task_stmf_private;
+	stmf_local_port_t *lport = task->task_lport;
+	uint8_t ndx;
+	stmf_status_t ret;
+
+	ASSERT(task->task_additional_flags & TASK_AF_ACCEPT_LU_DBUF);
+	ASSERT(lport->lport_ds->ds_setup_dbuf != NULL);
+	ASSERT(dbuf->db_flags & DB_LU_DATA_BUF);
+
+	if ((task->task_additional_flags & TASK_AF_ACCEPT_LU_DBUF) == 0)
+		return (STMF_FAILURE);
+	if (lport->lport_ds->ds_setup_dbuf == NULL)
+		return (STMF_FAILURE);
+
+	ndx = stmf_first_zero[itask->itask_allocated_buf_map];
+	if (ndx == 0xff)
+		return (STMF_FAILURE);
+	ret = lport->lport_ds->ds_setup_dbuf(task, dbuf, flags);
+	if (ret == STMF_FAILURE)
+		return (STMF_FAILURE);
+	itask->itask_dbufs[ndx] = dbuf;
+	task->task_cur_nbufs++;
+	itask->itask_allocated_buf_map |= (1 << ndx);
+	dbuf->db_handle = ndx;
+
+	return (STMF_SUCCESS);
+}
+
+void
+stmf_teardown_dbuf(scsi_task_t *task, stmf_data_buf_t *dbuf)
+{
+	stmf_i_scsi_task_t *itask =
+	    (stmf_i_scsi_task_t *)task->task_stmf_private;
+	stmf_local_port_t *lport = task->task_lport;
+
+	ASSERT(task->task_additional_flags & TASK_AF_ACCEPT_LU_DBUF);
+	ASSERT(lport->lport_ds->ds_teardown_dbuf != NULL);
+	ASSERT(dbuf->db_flags & DB_LU_DATA_BUF);
+
+	itask->itask_allocated_buf_map &= ~(1 << dbuf->db_handle);
+	task->task_cur_nbufs--;
+	lport->lport_ds->ds_teardown_dbuf(lport->lport_ds, dbuf);
+}
+
 void
 stmf_free_dbuf(scsi_task_t *task, stmf_data_buf_t *dbuf)
 {
@@ -4276,7 +4327,6 @@
 	stmf_i_scsi_task_t *itask;
 	stmf_i_scsi_task_t **ppitask;
 	scsi_task_t *task;
-	uint64_t *p;
 	uint8_t	*l;
 	stmf_lun_map_ent_t *lun_map_ent;
 	uint16_t cdb_length;
@@ -4331,10 +4381,20 @@
 	} while (0);
 
 	if (!new_task) {
+		/*
+		 * Save the task_cdb pointer and zero the per-command fields.
+		 * We know task_cdb_length is large enough because of the
+		 * task selection process above.
+		 */
+		uint8_t *save_cdb;
+		uintptr_t t_start, t_end;
+
 		task = itask->itask_task;
-		task->task_timeout = 0;
-		p = (uint64_t *)&task->task_flags;
-		*p++ = 0; *p++ = 0; p++; p++; *p++ = 0; *p++ = 0; *p = 0;
+		save_cdb = task->task_cdb;	/* save */
+		t_start = (uintptr_t)&task->task_flags;
+		t_end = (uintptr_t)&task->task_extended_cmd;
+		bzero((void *)t_start, (size_t)(t_end - t_start));
+		task->task_cdb = save_cdb;	/* restore */
 		itask->itask_ncmds = 0;
 	} else {
 		task = (scsi_task_t *)stmf_alloc(STMF_STRUCT_SCSI_TASK,
@@ -4596,25 +4656,38 @@
 	int i;
 	uint8_t map;
 
-	if ((map = itask->itask_allocated_buf_map) != 0) {
-		for (i = 0; i < 4; i++) {
-			if (map & 1) {
-				stmf_data_buf_t *dbuf;
-
-				dbuf = itask->itask_dbufs[i];
-				if (dbuf->db_lu_private) {
-					dbuf->db_lu_private = NULL;
-				}
-				if (dbuf->db_xfer_start_timestamp != NULL) {
-					stmf_lport_xfer_done(itask, dbuf);
-				}
+	if ((map = itask->itask_allocated_buf_map) == 0)
+		return;
+	for (i = 0; i < 4; i++) {
+		if (map & 1) {
+			stmf_data_buf_t *dbuf;
+
+			dbuf = itask->itask_dbufs[i];
+			if (dbuf->db_xfer_start_timestamp) {
+				stmf_lport_xfer_done(itask, dbuf);
+			}
+			if (dbuf->db_flags & DB_LU_DATA_BUF) {
+				/*
+				 * LU needs to clean up buffer.
+				 * LU is required to free the buffer
+				 * in the xfer_done handler.
+				 */
+				scsi_task_t *task = itask->itask_task;
+				stmf_lu_t *lu = task->task_lu;
+
+				lu->lu_dbuf_free(task, dbuf);
+				ASSERT(((itask->itask_allocated_buf_map>>i)
+				    & 1) == 0); /* must be gone */
+			} else {
+				ASSERT(dbuf->db_lu_private == NULL);
+				dbuf->db_lu_private = NULL;
 				lport->lport_ds->ds_free_data_buf(
 				    lport->lport_ds, dbuf);
 			}
-			map >>= 1;
-		}
-		itask->itask_allocated_buf_map = 0;
-	}
+		}
+		map >>= 1;
+	}
+	itask->itask_allocated_buf_map = 0;
 }
 
 void
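
Taken together, stmf_setup_dbuf() and stmf_teardown_dbuf() let an LU provider supply its own scatter/gather buffers instead of copying into port-allocated memory. A minimal sketch of the setup side, assuming hypothetical helpers my_get_segment_count() and my_fill_segments(); the STMF calls and flags come from this changeset and the existing STMF API:

/*
 * Hypothetical LU-side sketch of the zero-copy dbuf flow.
 */
static stmf_status_t
lu_start_zero_copy_xfer(scsi_task_t *task, uint32_t xfer_len)
{
	stmf_data_buf_t	*dbuf;
	int		nsegs = my_get_segment_count(xfer_len); /* hypothetical */

	dbuf = stmf_alloc(STMF_STRUCT_DATA_BUF,
	    (nsegs - 1) * sizeof (stmf_sglist_ent_t), 0);
	if (dbuf == NULL)
		return (STMF_FAILURE);

	dbuf->db_flags = DB_LU_DATA_BUF;
	dbuf->db_data_size = xfer_len;
	dbuf->db_relative_offset = 0;
	dbuf->db_sglist_length = (uint16_t)nsegs;
	my_fill_segments(dbuf);	/* hypothetical: set seg_addr/seg_length */

	/* Ask the port provider to bind its DMA resources to these buffers. */
	if (stmf_setup_dbuf(task, dbuf, 0) != STMF_SUCCESS) {
		stmf_free(dbuf);
		return (STMF_FAILURE);
	}
	return (stmf_xfer_data(task, dbuf, 0));
}

The new AF_DONTZERO flag could be passed to stmf_alloc() instead of 0 to skip zeroing, provided the LU then initializes every dbuf field it relies on.
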
--- a/usr/src/uts/common/sys/fct.h	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/sys/fct.h	Wed May 05 10:23:23 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 #ifndef	_FCT_H
 #define	_FCT_H
@@ -195,6 +194,13 @@
 			    uint32_t size, uint32_t *pminsize, uint32_t flags);
 	void		(*fds_free_data_buf)(struct fct_dbuf_store *fds,
 			    stmf_data_buf_t *dbuf);
+	stmf_status_t	(*fds_setup_dbuf)(struct fct_local_port *port,
+			    stmf_data_buf_t *dbuf, uint32_t flags);
+	void		(*fds_teardown_dbuf)(struct fct_dbuf_store *fds,
+			    stmf_data_buf_t *dbuf);
+
+	uint32_t		fds_max_sgl_xfer_len;
+	uint32_t		fds_copy_threshold;
 } fct_dbuf_store_t;
 
 #define	FCT_FCA_MODREV_1	1
--- a/usr/src/uts/common/sys/lpif.h	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/sys/lpif.h	Wed May 05 10:23:23 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 #ifndef	_LPIF_H
 #define	_LPIF_H
@@ -74,6 +73,8 @@
 		int eventid, void *arg, uint32_t flags);
 	void			*lu_proxy_reg_arg;
 	uint32_t		lu_proxy_reg_arg_len;
+	void			(*lu_dbuf_free)(struct scsi_task *task,
+		struct stmf_data_buf *dbuf);
 } stmf_lu_t;
 
 /*
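
An LU that hands its own dbufs to the port (DB_LU_DATA_BUF) must also supply lu_dbuf_free so the framework can release those buffers, as seen in the stmf.c task-teardown change above. A minimal sketch, where my_free_sgl_backing() stands in for however the LU obtained the segment memory:

/*
 * Hypothetical lu_dbuf_free entry point: undo the DMA setup, then
 * release the LU's backing memory and the dbuf itself.
 */
static void
my_lu_dbuf_free(struct scsi_task *task, struct stmf_data_buf *dbuf)
{
	ASSERT(dbuf->db_flags & DB_LU_DATA_BUF);
	stmf_teardown_dbuf(task, dbuf);	/* unbind the port's DMA handles */
	my_free_sgl_backing(dbuf);	/* hypothetical */
	stmf_free(dbuf);
}
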
--- a/usr/src/uts/common/sys/portif.h	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/sys/portif.h	Wed May 05 10:23:23 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 #ifndef	_PORTIF_H
 #define	_PORTIF_H
@@ -41,8 +40,15 @@
 
 	stmf_data_buf_t		*(*ds_alloc_data_buf)(struct scsi_task *task,
 	    uint32_t size, uint32_t *pminsize, uint32_t flags);
-	void			 (*ds_free_data_buf)(
-		struct stmf_dbuf_store *ds, stmf_data_buf_t *dbuf);
+
+	void			(*ds_free_data_buf)(
+	    struct stmf_dbuf_store *ds, stmf_data_buf_t *dbuf);
+
+	stmf_status_t		(*ds_setup_dbuf)(struct scsi_task *task,
+	    stmf_data_buf_t *dbuf, uint32_t flags);
+
+	void			(*ds_teardown_dbuf)(
+	    struct stmf_dbuf_store *ds, stmf_data_buf_t *dbuf);
 } stmf_dbuf_store_t;
 
 #define	PORTIF_REV_1	0x00010000
--- a/usr/src/uts/common/sys/stmf.h	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/common/sys/stmf.h	Wed May 05 10:23:23 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 #ifndef	_STMF_H
 #define	_STMF_H
@@ -62,6 +61,22 @@
 #define	COMPANY_ID_NONE			0xFFFFFFFF
 #define	COMPANY_ID_SUN			0x00144F
 
+/*
+ * The scatter/gather list buffer format is used in two different
+ * contexts within stmf:
+ * 1) supplied by the port provider, which the LU provider uses to
+ *    exchange data with the backing store.
+ * 2) supplied by the LU provider, which the port provider uses to
+ *    exchange data with the host initiator.
+ * The second format is optionally supported by the port provider, as
+ * indicated by the task flags.
+ */
+
+typedef struct stmf_sglist_ent {
+	uint32_t	seg_length;
+	uint8_t		*seg_addr;
+} stmf_sglist_ent_t;
+
 typedef struct stmf_data_buf {
 	void		*db_stmf_private;
 	void		*db_port_private;
@@ -74,10 +89,7 @@
 	stmf_status_t	db_xfer_status;
 	uint8_t		db_handle;	/* To track parallel buffers */
 	hrtime_t	db_xfer_start_timestamp;
-	struct stmf_sglist_ent {
-		uint32_t	seg_length;
-		uint8_t		*seg_addr;
-	}		db_sglist[1];
+	stmf_sglist_ent_t db_sglist[1];	/* PP scatter/gather list */
 } stmf_data_buf_t;
 
 /*
@@ -89,6 +101,7 @@
 #define	DB_STATUS_GOOD_SENT		0x0008
 #define	DB_DONT_CACHE			0x0010
 #define	DB_DONT_REUSE			0x0020
+#define	DB_LU_DATA_BUF			0x0040
 
 typedef struct scsi_task {
 	void		*task_stmf_private;
@@ -119,6 +132,10 @@
 	/* Fields to manage data phase */
 	uint32_t	task_cmd_xfer_length;	/* xfer len based on CDB */
 	uint32_t	task_nbytes_transferred;
+	uint32_t	task_max_xfer_len;	/* largest xfer allowed */
+	uint32_t	task_1st_xfer_len;	/* 1st xfer hint */
+	uint32_t	task_copy_threshold;	/* copy reduction threshold */
+
 
 	/* Status Phase */
 	stmf_status_t	task_completion_status;
@@ -180,6 +197,10 @@
 #define	TASK_AF_ENABLE_COMP_CONF	0x01
 #define	TASK_AF_PORT_LOAD_HIGH		0x02
 #define	TASK_AF_NO_EXPECTED_XFER_LENGTH	0x04
+/*
+ * PP sets this flag if it can process dbufs created by the LU.
+ */
+#define	TASK_AF_ACCEPT_LU_DBUF		0x08
 
 /*
  * scsi_task_t extension identifiers
@@ -208,9 +229,10 @@
  * struct allocation flags
  */
 #define	AF_FORCE_NOSLEEP	0x0001
+#define	AF_DONTZERO		0x0002
 
 typedef struct stmf_state_change_info {
-	uint64_t	st_rflags;	/* Reason behin this change */
+	uint64_t	st_rflags;	/* Reason behind this change */
 	char		*st_additional_info;
 } stmf_state_change_info_t;
 
@@ -343,6 +365,9 @@
 stmf_data_buf_t *stmf_alloc_dbuf(scsi_task_t *task, uint32_t size,
     uint32_t *pminsize, uint32_t flags);
 void stmf_free_dbuf(scsi_task_t *task, stmf_data_buf_t *dbuf);
+stmf_status_t stmf_setup_dbuf(scsi_task_t *task, stmf_data_buf_t *dbuf,
+    uint32_t flags);
+void stmf_teardown_dbuf(scsi_task_t *task, stmf_data_buf_t *dbuf);
 stmf_status_t stmf_xfer_data(scsi_task_t *task, stmf_data_buf_t *dbuf,
     uint32_t ioflags);
 stmf_status_t stmf_send_scsi_status(scsi_task_t *task, uint32_t ioflags);
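
task_copy_threshold and TASK_AF_ACCEPT_LU_DBUF together let an LU decide per command whether the zero-copy path is worthwhile. A sketch of one plausible policy; the decision itself is up to the LU provider, and only the field and flag names come from this changeset:

/*
 * Illustrative policy only: fall back to the copy path for ports that
 * cannot take LU dbufs, or when the transfer is below the port's
 * copy-reduction threshold.
 */
static boolean_t
lu_use_zero_copy(scsi_task_t *task, uint32_t xfer_len)
{
	if ((task->task_additional_flags & TASK_AF_ACCEPT_LU_DBUF) == 0)
		return (B_FALSE);
	if (task->task_copy_threshold != 0 &&
	    xfer_len < task->task_copy_threshold)
		return (B_FALSE);
	return (B_TRUE);
}
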
--- a/usr/src/uts/intel/stmf_sbd/Makefile	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/intel/stmf_sbd/Makefile	Wed May 05 10:23:23 2010 -0700
@@ -19,8 +19,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 #
 #	This makefile drives the production of the stmf_sbd driver for 
 #	COMSTAR.
@@ -57,7 +56,7 @@
 #	Overrides and depends_on
 #
 MODSTUBS_DIR	 = $(OBJS_DIR)
-LDFLAGS		+= -dy -Ndrv/stmf
+LDFLAGS		+= -dy -Ndrv/stmf -Nfs/zfs
 
 INC_PATH	+= -I$(UTSBASE)/common/fs/zfs
 INC_PATH	+= -I$(UTSBASE)/common/io/comstar/lu/stmf_sbd
--- a/usr/src/uts/sparc/stmf_sbd/Makefile	Wed May 05 10:34:37 2010 -0400
+++ b/usr/src/uts/sparc/stmf_sbd/Makefile	Wed May 05 10:23:23 2010 -0700
@@ -19,8 +19,7 @@
 # CDDL HEADER END
 #
 #
-# Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
+# Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 #
 #	This makefile drives the production of the stmf_sbd driver for 
 #	COMSTAR.
@@ -57,7 +56,7 @@
 #	Overrides and depends_on
 #
 MODSTUBS_DIR	 = $(OBJS_DIR)
-LDFLAGS		+= -dy -Ndrv/stmf
+LDFLAGS		+= -dy -Ndrv/stmf -Nfs/zfs
 
 INC_PATH	+= -I$(UTSBASE)/common/fs/zfs
 INC_PATH	+= -I$(UTSBASE)/common/io/comstar/lu/stmf_sbd