6416482 filebench oltp workload hangs in zfs
author maybee
date Mon, 19 Jun 2006 19:31:35 -0700
changeset 2237 45affe88ed99
parent 2236 31cd1215427e
child 2238 92401fcee910
6416482 filebench oltp workload hangs in zfs
6440499 zil should avoid txg_wait_synced() and use dmu_sync() to issue parallel IOs when fsyncing
usr/src/cmd/ztest/ztest.c
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/sys/dmu.h
usr/src/uts/common/fs/zfs/sys/zfs_rlock.h
usr/src/uts/common/fs/zfs/sys/zil.h
usr/src/uts/common/fs/zfs/sys/zil_impl.h
usr/src/uts/common/fs/zfs/vdev.c
usr/src/uts/common/fs/zfs/zfs_log.c
usr/src/uts/common/fs/zfs/zfs_rlock.c
usr/src/uts/common/fs/zfs/zfs_vnops.c
usr/src/uts/common/fs/zfs/zfs_znode.c
usr/src/uts/common/fs/zfs/zil.c
usr/src/uts/common/fs/zfs/zvol.c
--- a/usr/src/cmd/ztest/ztest.c	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/cmd/ztest/ztest.c	Mon Jun 19 19:31:35 2006 -0700
@@ -2031,22 +2031,30 @@
 			uint64_t blkoff;
 			zbookmark_t zb;
 
-			txg_suspend(dmu_objset_pool(os));
 			(void) mutex_lock(lp);
-			error = dmu_sync(os, ZTEST_DIROBJ, off, &blkoff, &blk,
-			    txg);
+			blkoff = P2ALIGN_TYPED(off, bs, uint64_t);
+			error = dmu_buf_hold(os,
+			    ZTEST_DIROBJ, blkoff, FTAG, &db);
+			if (error) {
+				dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n",
+				    osname, ZTEST_DIROBJ, blkoff, error);
+				(void) mutex_unlock(lp);
+				continue;
+			}
+			blkoff = off - blkoff;
+			error = dmu_sync(NULL, db, &blk, txg, NULL, NULL);
+			dmu_buf_rele(db, FTAG);
 			(void) mutex_unlock(lp);
 			if (error) {
-				txg_resume(dmu_objset_pool(os));
 				dprintf("dmu_sync(%s, %d, %llx) = %d\n",
 				    osname, ZTEST_DIROBJ, off, error);
 				continue;
 			}
 
 			if (blk.blk_birth == 0)	{	/* concurrent free */
-				txg_resume(dmu_objset_pool(os));
 				continue;
 			}
+			txg_suspend(dmu_objset_pool(os));
 
 			ASSERT(blk.blk_fill == 1);
 			ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER);
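
The two assignments to blkoff in the new ztest code first compute the block-aligned file offset for dmu_buf_hold() and then reuse the variable for the offset within that block. A small worked example with hypothetical values:

	/* hypothetical values: 128K block size, write at offset 0x21a00 */
	uint64_t bs  = 0x20000;
	uint64_t off = 0x21a00;
	uint64_t blkoff;

	blkoff = P2ALIGN_TYPED(off, bs, uint64_t);	/* 0x20000: block start */
	blkoff = off - blkoff;				/* 0x01a00: offset within the block */
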
--- a/usr/src/uts/common/fs/zfs/arc.c	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/arc.c	Mon Jun 19 19:31:35 2006 -0700
@@ -2217,6 +2217,8 @@
 	ASSERT3P(hdr->b_state, ==, arc.anon);
 	ASSERT(BUF_EMPTY(hdr));
 	ASSERT(!HDR_IO_ERROR(hdr));
+	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
+	ASSERT(hdr->b_acb == 0);
 	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
 	acb->acb_done = done;
 	acb->acb_private = private;
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Mon Jun 19 19:31:35 2006 -0700
@@ -712,9 +712,9 @@
 {
 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
-		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
-	} else if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
+	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC);
+
+	if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
 		/* free this block */
 		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) ||
 		    db->db_dnode->dn_free_txg == txg);
@@ -1783,6 +1783,16 @@
 	if (db->db_level == 0) {
 		data = (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
 		blksz = arc_buf_size(*data);
+
+		/*
+		 * This buffer is in the middle of an immediate write.
+		 * Wait for the synchronous IO to complete.
+		 */
+		while (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
+			ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+			cv_wait(&db->db_changed, &db->db_mtx);
+			ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK]);
+		}
 		/*
 		 * If this buffer is currently "in use" (i.e., there are
 		 * active holds and db_data still references it), then make
@@ -2085,6 +2095,8 @@
 
 	mutex_enter(&db->db_mtx);
 
+	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL);
+
 	if (db->db_dirtied == txg)
 		db->db_dirtied = 0;
 
--- a/usr/src/uts/common/fs/zfs/dmu.c	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Mon Jun 19 19:31:35 2006 -0700
@@ -1376,185 +1376,195 @@
 	return (ra.err);
 }
 
+typedef struct {
+	uint64_t	txg;
+	dmu_buf_impl_t	*db;
+	dmu_sync_cb_t	*done;
+	void		*arg;
+} dmu_sync_cbin_t;
+
+typedef union {
+	dmu_sync_cbin_t	data;
+	blkptr_t	blk;
+} dmu_sync_cbarg_t;
+
+/* ARGSUSED */
+static void
+dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+	dmu_sync_cbin_t *in = (dmu_sync_cbin_t *)varg;
+	dmu_buf_impl_t *db = in->db;
+	uint64_t txg = in->txg;
+	dmu_sync_cb_t *done = in->done;
+	void *arg = in->arg;
+	blkptr_t *blk = (blkptr_t *)varg;
+
+	if (!BP_IS_HOLE(zio->io_bp)) {
+		zio->io_bp->blk_fill = 1;
+		BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
+		BP_SET_LEVEL(zio->io_bp, 0);
+	}
+
+	*blk = *zio->io_bp; /* structure assignment */
+
+	mutex_enter(&db->db_mtx);
+	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC);
+	db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
+	cv_broadcast(&db->db_changed);
+	mutex_exit(&db->db_mtx);
+
+	if (done)
+		done(&(db->db), arg);
+}
+
 /*
- * Intent log support: sync the block at <os, object, offset> to disk.
- * N.B. and XXX: the caller is responsible for serializing dmu_sync()s
- * of the same block, and for making sure that the data isn't changing
- * while dmu_sync() is writing it.
+ * Intent log support: sync the block associated with db to disk.
+ * N.B. and XXX: the caller is responsible for making sure that the
+ * data isn't changing while dmu_sync() is writing it.
  *
  * Return values:
  *
- *	EALREADY: this txg has already been synced, so there's nothing to to.
+ *	EEXIST: this txg has already been synced, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
  *		The caller should not log the write.
  *
- *	EINPROGRESS: the block is in the process of being synced by the
- *		usual mechanism (spa_sync()), so we can't sync it here.
- *		The caller should txg_wait_synced() and not log the write.
- *
- *	EBUSY: another thread is trying to dmu_sync() the same dbuf.
- *		(This case cannot arise under the current locking rules.)
- *		The caller should txg_wait_synced() and not log the write.
+ *	EALREADY: this block is already in the process of being synced.
+ *		The caller should track its progress (somehow).
  *
- *	ESTALE: the block was dirtied or freed while we were writing it,
- *		so the data is no longer valid.
- *		The caller should txg_wait_synced() and not log the write.
+ *	EINPROGRESS: the IO has been initiated.
+ *		The caller should log this blkptr in the callback.
  *
- *	0: success.  Sets *bp to the blkptr just written, and sets
- *		*blkoff to the data's offset within that block.
- *		The caller should log this blkptr/blkoff in its lr_write_t.
+ *	0: completed.  Sets *bp to the blkptr just written.
+ *		The caller should log this blkptr immediately.
  */
 int
-dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
-    blkptr_t *bp, uint64_t txg)
+dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
+    blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg)
 {
-	objset_impl_t *osi = os->os;
-	dsl_pool_t *dp = osi->os_dsl_dataset->ds_dir->dd_pool;
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+	objset_impl_t *os = db->db_objset;
+	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
 	tx_state_t *tx = &dp->dp_tx;
-	dmu_buf_impl_t *db;
+	dmu_sync_cbin_t *in;
 	blkptr_t *blk;
+	zbookmark_t zb;
+	uint32_t arc_flag;
 	int err;
-	zbookmark_t zb;
 
-	ASSERT(RW_LOCK_HELD(&tx->tx_suspend));
 	ASSERT(BP_IS_HOLE(bp));
 	ASSERT(txg != 0);
 
+
 	dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
 	    txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
 
 	/*
-	 * XXX why is this routine using dmu_buf_*() and casting between
-	 * dmu_buf_impl_t and dmu_buf_t?
+	 * XXX - would be nice if we could do this without suspending...
 	 */
+	txg_suspend(dp);
 
 	/*
 	 * If this txg already synced, there's nothing to do.
 	 */
 	if (txg <= tx->tx_synced_txg) {
+		txg_resume(dp);
 		/*
 		 * If we're running ziltest, we need the blkptr regardless.
 		 */
 		if (txg > spa_freeze_txg(dp->dp_spa)) {
-			err = dmu_buf_hold(os, object, offset,
-			    FTAG, (dmu_buf_t **)&db);
-			if (err)
-				return (err);
 			/* if db_blkptr == NULL, this was an empty write */
 			if (db->db_blkptr)
 				*bp = *db->db_blkptr; /* structure assignment */
-			else
-				bzero(bp, sizeof (blkptr_t));
-			*blkoff = offset - db->db.db_offset;
-			ASSERT3U(*blkoff, <, db->db.db_size);
-			dmu_buf_rele((dmu_buf_t *)db, FTAG);
 			return (0);
 		}
-		return (EALREADY);
+		return (EEXIST);
 	}
 
-	/*
-	 * If this txg is in the middle of syncing, just wait for it.
-	 */
-	if (txg == tx->tx_syncing_txg) {
-		ASSERT(txg != tx->tx_open_txg);
-		return (EINPROGRESS);
-	}
-
-	err = dmu_buf_hold(os, object, offset, FTAG, (dmu_buf_t **)&db);
-	if (err)
-		return (err);
-
 	mutex_enter(&db->db_mtx);
 
-	/*
-	 * If this dbuf isn't dirty, must have been free_range'd.
-	 * There's no need to log writes to freed blocks, so we're done.
-	 */
-	if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) {
+	blk = db->db_d.db_overridden_by[txg&TXG_MASK];
+	if (blk == IN_DMU_SYNC) {
+		/*
+		 * We have already issued a sync write for this buffer.
+		 */
+		mutex_exit(&db->db_mtx);
+		txg_resume(dp);
+		return (EALREADY);
+	} else if (blk != NULL) {
+		/*
+		 * This buffer had already been synced.  It could not
+		 * have been dirtied since, or we would have cleared blk.
+		 */
+		*bp = *blk; /* structure assignment */
 		mutex_exit(&db->db_mtx);
-		dmu_buf_rele((dmu_buf_t *)db, FTAG);
+		txg_resume(dp);
+		return (0);
+	}
+
+	if (txg == tx->tx_syncing_txg) {
+		while (db->db_data_pending) {
+			/*
+			 * IO is in-progress.  Wait for it to finish.
+			 * XXX - would be nice to be able to somehow "attach"
+			 * this zio to the parent zio passed in.
+			 */
+			cv_wait(&db->db_changed, &db->db_mtx);
+			ASSERT(db->db_data_pending ||
+			    (db->db_blkptr && db->db_blkptr->blk_birth == txg));
+		}
+
+		if (db->db_blkptr && db->db_blkptr->blk_birth == txg) {
+			/*
+			 * IO is already completed.
+			 */
+			*bp = *db->db_blkptr; /* structure assignment */
+			mutex_exit(&db->db_mtx);
+			txg_resume(dp);
+			return (0);
+		}
+	}
+
+	if (db->db_d.db_data_old[txg&TXG_MASK] == NULL) {
+		/*
+		 * This dbuf isn't dirty, must have been free_range'd.
+		 * There's no need to log writes to freed blocks, so we're done.
+		 */
+		mutex_exit(&db->db_mtx);
+		txg_resume(dp);
 		return (ENOENT);
 	}
 
-	blk = db->db_d.db_overridden_by[txg&TXG_MASK];
-
+	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL);
+	db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
 	/*
-	 * If we already did a dmu_sync() of this dbuf in this txg,
-	 * free the old block before writing the new one.
+	 * XXX - a little ugly to stash the blkptr in the callback
+	 * buffer.  We always need to make sure the following is true:
+	 * ASSERT(sizeof(blkptr_t) >= sizeof(dmu_sync_cbin_t));
 	 */
-	if (blk != NULL) {
-		ASSERT(blk != IN_DMU_SYNC);
-		if (blk == IN_DMU_SYNC) {
-			mutex_exit(&db->db_mtx);
-			dmu_buf_rele((dmu_buf_t *)db, FTAG);
-			return (EBUSY);
-		}
-		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
-		if (!BP_IS_HOLE(blk)) {
-			(void) arc_free(NULL, osi->os_spa, txg, blk,
-			    NULL, NULL, ARC_WAIT);
-		}
-		kmem_free(blk, sizeof (blkptr_t));
-	}
+	in = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+	in->db = db;
+	in->txg = txg;
+	in->done = done;
+	in->arg = arg;
+	mutex_exit(&db->db_mtx);
+	txg_resume(dp);
 
-	db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
-	mutex_exit(&db->db_mtx);
-
-	blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
-	blk->blk_birth = 0; /* mark as invalid */
-
-	zb.zb_objset = osi->os_dsl_dataset->ds_object;
+	arc_flag = pio == NULL ? ARC_WAIT : ARC_NOWAIT;
+	zb.zb_objset = os->os_dsl_dataset->ds_object;
 	zb.zb_object = db->db.db_object;
 	zb.zb_level = db->db_level;
 	zb.zb_blkid = db->db_blkid;
-	err = arc_write(NULL, osi->os_spa,
-	    zio_checksum_select(db->db_dnode->dn_checksum, osi->os_checksum),
-	    zio_compress_select(db->db_dnode->dn_compress, osi->os_compress),
-	    dmu_get_replication_level(osi->os_spa, &zb, db->db_dnode->dn_type),
-	    txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
-	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
+	err = arc_write(pio, os->os_spa,
+	    zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
+	    zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
+	    dmu_get_replication_level(os->os_spa, &zb, db->db_dnode->dn_type),
+	    txg, bp, db->db_d.db_data_old[txg&TXG_MASK], dmu_sync_done, in,
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, arc_flag, &zb);
 	ASSERT(err == 0);
 
-	if (!BP_IS_HOLE(blk)) {
-		blk->blk_fill = 1;
-		BP_SET_TYPE(blk, db->db_dnode->dn_type);
-		BP_SET_LEVEL(blk, 0);
-	}
-
-	/* copy the block pointer back to caller */
-	*bp = *blk; /* structure assignment */
-	*blkoff = offset - db->db.db_offset;
-	ASSERT3U(*blkoff, <, db->db.db_size);
-
-	mutex_enter(&db->db_mtx);
-	if (db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC) {
-		/* we were dirtied/freed during the sync */
-		ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL);
-		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
-		mutex_exit(&db->db_mtx);
-		dmu_buf_rele((dmu_buf_t *)db, FTAG);
-		/* Note that this block does not free on disk until txg syncs */
-
-		/*
-		 * XXX can we use ARC_NOWAIT here?
-		 * XXX should we be ignoring the return code?
-		 */
-		if (!BP_IS_HOLE(blk)) {
-			(void) arc_free(NULL, osi->os_spa, txg, blk,
-			    NULL, NULL, ARC_WAIT);
-		}
-		kmem_free(blk, sizeof (blkptr_t));
-		return (ESTALE);
-	}
-
-	db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
-	mutex_exit(&db->db_mtx);
-	dmu_buf_rele((dmu_buf_t *)db, FTAG);
-	ASSERT3U(txg, >, tx->tx_syncing_txg);
-	return (0);
+	return (arc_flag == ARC_NOWAIT ? EINPROGRESS : 0);
 }
 
 uint64_t
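
The XXX comment in the new dmu_sync() notes that the callback cookie is stashed in a blkptr_t-sized allocation, which is why dmu_sync_cbarg_t is declared as a union of the two. A hedged sketch of the compile-time check that comment asks for, assuming a CTASSERT()-style compile-time assertion macro is available in this context:

	/*
	 * dmu_sync_done() overwrites the dmu_sync_cbin_t cookie in place with
	 * the written blkptr_t, so the cookie must fit in a block pointer.
	 */
	CTASSERT(sizeof (dmu_sync_cbin_t) <= sizeof (blkptr_t));
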
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Mon Jun 19 19:31:35 2006 -0700
@@ -572,13 +572,17 @@
 
 /*
  * Synchronous write.
- * On success returns 0 and fills in the blk pointed at by bp.
+ * If a parent zio is provided this function initiates a write on the
+ * provided buffer as a child of the parent zio.
+ * In the absence of a parent zio, the write is completed synchronously.
+ * At write completion, blk is filled with the bp of the written block.
  * Note that while the data covered by this function will be on stable
- * storage when the function returns this new data does not become a
+ * storage when the write completes this new data does not become a
  * permanent part of the file until the associated transaction commits.
  */
-int dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
-    struct blkptr *bp, uint64_t txg);
+typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg);
+int dmu_sync(struct zio *zio, dmu_buf_t *db,
+    struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg);
 
 /*
  * Find the next hole or data block in file starting at *off
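
As a hedged illustration of the two calling modes documented above — the helper function is hypothetical, but zfs_get_done() and the rl_t argument are the callback and cookie this changeset adds to zfs_vnops.c:

	/*
	 * Hypothetical helper showing both modes of the new dmu_sync().
	 * With no parent zio the call completes synchronously; with a
	 * parent it returns EINPROGRESS and finishes in the callback.
	 */
	static int
	log_block_sync(zio_t *parent, dmu_buf_t *db, blkptr_t *bp,
	    uint64_t txg, rl_t *rl)
	{
		int error;

		if (parent == NULL)
			return (dmu_sync(NULL, db, bp, txg, NULL, NULL));

		error = dmu_sync(parent, db, bp, txg, zfs_get_done, rl);
		if (error == EINPROGRESS)
			return (0);	/* zfs_get_done() releases db and rl */
		return (error);
	}
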
--- a/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h	Mon Jun 19 19:31:35 2006 -0700
@@ -43,6 +43,7 @@
 } rl_type_t;
 
 typedef struct rl {
+	znode_t *r_zp;		/* znode this lock applies to */
 	avl_node_t r_node;	/* avl node link */
 	uint64_t r_off;		/* file range offset */
 	uint64_t r_len;		/* file range length */
@@ -66,13 +67,13 @@
 /*
  * Unlock range and destroy range lock structure.
  */
-void zfs_range_unlock(znode_t *zp, rl_t *rl);
+void zfs_range_unlock(rl_t *rl);
 
 /*
  * Reduce range locked as RW_WRITER from whole file to specified range.
  * Asserts the whole file was previously locked.
  */
-void zfs_range_reduce(znode_t *zp, rl_t *rl, uint64_t off, uint64_t len);
+void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
 
 /*
  * AVL comparison function used to compare range locks
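
The new r_zp field is what lets the unlock and reduce entry points drop their znode_t argument; a hedged before-and-after sketch of a caller:

	rl_t *rl = zfs_range_lock(zp, off, len, RL_READER);
	/* ... do the range-locked I/O ... */

	/* old interface: the caller had to supply the znode again */
	/* zfs_range_unlock(zp, rl); */

	/* new interface: the lock remembers its znode via rl->r_zp */
	zfs_range_unlock(rl);
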
--- a/usr/src/uts/common/fs/zfs/sys/zil.h	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h	Mon Jun 19 19:31:35 2006 -0700
@@ -215,7 +215,7 @@
 typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
     uint64_t txg);
 typedef int zil_replay_func_t();
-typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf);
+typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio);
 
 extern uint64_t	zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg);
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h	Mon Jun 19 19:31:35 2006 -0700
@@ -51,6 +51,7 @@
 	int		lwb_nused;	/* # used bytes in buffer */
 	int		lwb_sz;		/* size of block and buffer */
 	char		*lwb_buf;	/* log write buffer */
+	zio_t		*lwb_zio;	/* zio for this buffer */
 	uint64_t	lwb_max_txg;	/* highest txg in this lwb */
 	uint64_t	lwb_seq;	/* highest log record seq number */
 	txg_handle_t	lwb_txgh;	/* txg handle for txg_exit() */
@@ -78,6 +79,7 @@
 	objset_t	*zl_os;		/* object set we're logging */
 	zil_get_data_t	*zl_get_data;	/* callback to get object content */
 	uint64_t	zl_itx_seq;	/* itx sequence number */
+	uint64_t	zl_wait_seq;	/* last tx write initiated */
 	uint64_t	zl_ss_seq;	/* last tx on stable storage */
 	uint64_t	zl_destroy_txg;	/* txg of last zil_destroy() */
 	uint64_t	zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
--- a/usr/src/uts/common/fs/zfs/vdev.c	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Mon Jun 19 19:31:35 2006 -0700
@@ -1792,7 +1792,7 @@
 		"deadline = pri + (lbolt >> time_shift)",
 		0,
 		63,
-		4,
+		8,
 		offsetof(struct vdev, vdev_queue.vq_time_shift)
 	},
 	{
--- a/usr/src/uts/common/fs/zfs/zfs_log.c	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c	Mon Jun 19 19:31:35 2006 -0700
@@ -209,7 +209,7 @@
 /*
  * zfs_log_write() handles TX_WRITE transactions.
  */
-ssize_t zfs_immediate_write_sz = 65536;
+ssize_t zfs_immediate_write_sz = 32768;
 
 uint64_t
 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
--- a/usr/src/uts/common/fs/zfs/zfs_rlock.c	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_rlock.c	Mon Jun 19 19:31:35 2006 -0700
@@ -34,8 +34,8 @@
  * ---------
  * Defined in zfs_rlock.h but essentially:
  *	rl = zfs_range_lock(zp, off, len, lock_type);
- *	zfs_range_unlock(zp, rl);
- *	zfs_range_reduce(zp, rl, off, len);
+ *	zfs_range_unlock(rl);
+ *	zfs_range_reduce(rl, off, len);
  *
  * AVL tree
  * --------
@@ -417,6 +417,7 @@
 	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
 
 	new = kmem_alloc(sizeof (rl_t), KM_SLEEP);
+	new->r_zp = zp;
 	new->r_off = off;
 	new->r_len = len;
 	new->r_cnt = 1; /* assume it's going to be in the tree */
@@ -503,8 +504,10 @@
  * Unlock range and destroy range lock structure.
  */
 void
-zfs_range_unlock(znode_t *zp, rl_t *rl)
+zfs_range_unlock(rl_t *rl)
 {
+	znode_t *zp = rl->r_zp;
+
 	ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER);
 	ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0);
 	ASSERT(!rl->r_proxy);
@@ -535,8 +538,10 @@
  * entry in the tree.
  */
 void
-zfs_range_reduce(znode_t *zp, rl_t *rl, uint64_t off, uint64_t len)
+zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len)
 {
+	znode_t *zp = rl->r_zp;
+
 	/* Ensure there are no other locks */
 	ASSERT(avl_numnodes(&zp->z_range_avl) == 1);
 	ASSERT(rl->r_off == 0);
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c	Mon Jun 19 19:31:35 2006 -0700
@@ -487,7 +487,7 @@
 		dmu_buf_rele_array(dbpp, numbufs, FTAG);
 	}
 out:
-	zfs_range_unlock(zp, rl);
+	zfs_range_unlock(rl);
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 	ZFS_EXIT(zfsvfs);
@@ -606,10 +606,10 @@
 	ZFS_ENTER(zfsvfs);
 
 	/*
-	 * Pre-fault the initial pages to ensure slow (eg NFS) pages
+	 * Pre-fault the pages to ensure slow (eg NFS) pages
 	 * don't hold up txg.
 	 */
-	zfs_prefault_write(MIN(start_resid, SPA_MAXBLOCKSIZE), uio);
+	zfs_prefault_write(n, uio);
 
 	/*
 	 * If in append mode, set the io offset pointer to eof.
@@ -692,7 +692,7 @@
 			new_blksz = MIN(end_size, max_blksz);
 		}
 		zfs_grow_blocksize(zp, new_blksz, tx);
-		zfs_range_reduce(zp, rl, woff, n);
+		zfs_range_reduce(rl, woff, n);
 	}
 
 	/*
@@ -766,9 +766,6 @@
 		    ioflag, uio);
 		dmu_tx_commit(tx);
 
-		/* Pre-fault the next set of pages */
-		zfs_prefault_write(MIN(n, SPA_MAXBLOCKSIZE), uio);
-
 		/*
 		 * Start another transaction.
 		 */
@@ -810,7 +807,7 @@
 
 no_tx_done:
 
-	zfs_range_unlock(zp, rl);
+	zfs_range_unlock(rl);
 
 	/*
 	 * If we're in replay mode, or we made no progress, return error.
@@ -827,16 +824,28 @@
 	return (0);
 }
 
+void
+zfs_get_done(dmu_buf_t *db, void *vrl)
+{
+	rl_t *rl = (rl_t *)vrl;
+	vnode_t *vp = ZTOV(rl->r_zp);
+
+	dmu_buf_rele(db, rl);
+	zfs_range_unlock(rl);
+	VN_RELE(vp);
+}
+
 /*
  * Get data to generate a TX_WRITE intent log record.
  */
 int
-zfs_get_data(void *arg, lr_write_t *lr, char *buf)
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
 {
 	zfsvfs_t *zfsvfs = arg;
 	objset_t *os = zfsvfs->z_os;
 	znode_t *zp;
 	uint64_t off = lr->lr_offset;
+	dmu_buf_t *db;
 	rl_t *rl;
 	int dlen = lr->lr_length;  		/* length of user data */
 	int error = 0;
@@ -861,8 +870,6 @@
 	 * we don't have to write the data twice.
 	 */
 	if (buf != NULL) { /* immediate write */
-		dmu_buf_t *db;
-
 		rl = zfs_range_lock(zp, off, dlen, RL_READER);
 		/* test for truncation needs to be done while range locked */
 		if (off >= zp->z_phys->zp_size) {
@@ -892,20 +899,30 @@
 			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
 			if (zp->z_blksz == dlen)
 				break;
-			zfs_range_unlock(zp, rl);
+			zfs_range_unlock(rl);
 		}
 		/* test for truncation needs to be done while range locked */
 		if (off >= zp->z_phys->zp_size) {
 			error = ENOENT;
 			goto out;
 		}
-		txg_suspend(dmu_objset_pool(os));
-		error = dmu_sync(os, lr->lr_foid, off, &lr->lr_blkoff,
-		    &lr->lr_blkptr, lr->lr_common.lrc_txg);
-		txg_resume(dmu_objset_pool(os));
+		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, rl, &db));
+		ASSERT(boff == db->db_offset);
+		lr->lr_blkoff = off - boff;
+		error = dmu_sync(zio, db, &lr->lr_blkptr,
+		    lr->lr_common.lrc_txg, zio ? zfs_get_done : NULL, rl);
+		/*
+		 * If we get EINPROGRESS, then we need to wait for a
+		 * write IO initiated by dmu_sync() to complete before
+			 * we can release this dbuf.  We will finish everything
+		 * up in the zfs_get_done() callback.
+		 */
+		if (error == EINPROGRESS)
+			return (0);
+		dmu_buf_rele(db, rl);
 	}
 out:
-	zfs_range_unlock(zp, rl);
+	zfs_range_unlock(rl);
 	VN_RELE(ZTOV(zp));
 	return (error);
 }
@@ -2785,7 +2802,7 @@
 	 * Can't push pages past end-of-file.
 	 */
 	if (off >= zp->z_phys->zp_size) {
-		zfs_range_unlock(zp, rl);
+		zfs_range_unlock(rl);
 		return (EIO);
 	}
 	len = MIN(PAGESIZE, zp->z_phys->zp_size - off);
@@ -2795,7 +2812,7 @@
 	dmu_tx_hold_bonus(tx, zp->z_id);
 	err = dmu_tx_assign(tx, zfsvfs->z_assign);
 	if (err != 0) {
-		zfs_range_unlock(zp, rl);
+		zfs_range_unlock(rl);
 		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
@@ -2815,7 +2832,7 @@
 	(void) zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0, NULL);
 	dmu_tx_commit(tx);
 
-	zfs_range_unlock(zp, rl);
+	zfs_range_unlock(rl);
 
 	pvn_write_done(pp, B_WRITE | flags);
 	if (offp)
@@ -3155,7 +3172,7 @@
 
 	/* can't fault past EOF */
 	if (off >= zp->z_phys->zp_size) {
-		zfs_range_unlock(zp, rl);
+		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (EFAULT);
 	}
@@ -3236,7 +3253,7 @@
 
 	if (need_unlock)
 		rw_exit(&zp->z_map_lock);
-	zfs_range_unlock(zp, rl);
+	zfs_range_unlock(rl);
 
 	ZFS_EXIT(zfsvfs);
 	return (err);
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c	Mon Jun 19 19:31:35 2006 -0700
@@ -936,7 +936,7 @@
 		/* recheck, in case zp_size changed */
 		if (off + len > zp->z_phys->zp_size) {
 			/* lost race: file size changed, lock whole file */
-			zfs_range_unlock(zp, rl);
+			zfs_range_unlock(rl);
 			rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
 		}
 	}
@@ -946,7 +946,7 @@
 	 */
 	size = zp->z_phys->zp_size;
 	if (len == 0 && size == off) {
-		zfs_range_unlock(zp, rl);
+		zfs_range_unlock(rl);
 		return (0);
 	}
 
@@ -964,7 +964,7 @@
 			extent = size - off;
 		}
 		if (error = chklock(vp, FWRITE, start, extent, flag, NULL)) {
-			zfs_range_unlock(zp, rl);
+			zfs_range_unlock(rl);
 			return (error);
 		}
 	}
@@ -996,7 +996,7 @@
 		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT)
 			dmu_tx_wait(tx);
 		dmu_tx_abort(tx);
-		zfs_range_unlock(zp, rl);
+		zfs_range_unlock(rl);
 		return (error);
 	}
 
@@ -1022,7 +1022,7 @@
 		seq = zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
 	}
 
-	zfs_range_unlock(zp, rl);
+	zfs_range_unlock(rl);
 
 	dmu_tx_commit(tx);
 
--- a/usr/src/uts/common/fs/zfs/zil.c	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zil.c	Mon Jun 19 19:31:35 2006 -0700
@@ -366,6 +366,8 @@
 		lwb->lwb_max_txg = txg;
 		lwb->lwb_seq = 0;
 		lwb->lwb_state = UNWRITTEN;
+		lwb->lwb_zio = NULL;
+
 		mutex_enter(&zilog->zl_lock);
 		list_insert_tail(&zilog->zl_lwb_list, lwb);
 		mutex_exit(&zilog->zl_lock);
@@ -619,6 +621,29 @@
 }
 
 /*
+ * Initialize the io for a log block.
+ *
+ * Note, we should not initialize the IO until we are about
+ * to use it, since zio_rewrite() does a spa_config_enter().
+ */
+static void
+zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
+{
+	zbookmark_t zb;
+
+	zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
+	zb.zb_object = 0;
+	zb.zb_level = -1;
+	zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+
+	ASSERT(lwb->lwb_zio == NULL);
+	lwb->lwb_zio = zio_rewrite(NULL, zilog->zl_spa,
+	    ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf,
+	    lwb->lwb_sz, zil_lwb_write_done, lwb,
+	    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+}
+
+/*
  * Start a log block write and advance to the next log block.
  * Calls are serialized.
  */
@@ -631,7 +656,6 @@
 	blkptr_t *bp = &ztp->zit_next_blk;
 	uint64_t txg;
 	uint64_t zil_blksz;
-	zbookmark_t zb;
 	int error;
 
 	ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
@@ -651,7 +675,8 @@
 	 * maximum of the previous used size, the current used size and
 	 * the amount waiting in the queue.
 	 */
-	zil_blksz = MAX(zilog->zl_cur_used, zilog->zl_prev_used);
+	zil_blksz = MAX(zilog->zl_prev_used,
+	    zilog->zl_cur_used + sizeof (*ztp));
 	zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp));
 	zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t);
 	if (zil_blksz > ZIL_MAX_BLKSZ)
@@ -692,6 +717,7 @@
 	nlwb->lwb_max_txg = txg;
 	nlwb->lwb_seq = 0;
 	nlwb->lwb_state = UNWRITTEN;
+	nlwb->lwb_zio = NULL;
 
 	/*
 	 * Put new lwb at the end of the log chain,
@@ -704,16 +730,19 @@
 	mutex_exit(&zilog->zl_lock);
 
 	/*
-	 * write the old log block
+	 * kick off the write for the old log block
 	 */
-	zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET];
-	zb.zb_object = 0;
-	zb.zb_level = -1;
-	zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
-
-	zio_nowait(zio_rewrite(NULL, spa, ZIO_CHECKSUM_ZILOG, 0,
-	    &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,
-	    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb));
+	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
+	if (lwb->lwb_zio == NULL) {
+		/*
+		 * This can only happen if there are no log records in this
+		 * block (i.e. the first record to go in was too big to fit).
+		 * XXX - would be nice if we could avoid this IO
+		 */
+		ASSERT(lwb->lwb_nused == 0);
+		zil_lwb_write_init(zilog, lwb);
+	}
+	zio_nowait(lwb->lwb_zio);
 
 	return (nlwb);
 }
@@ -722,61 +751,21 @@
 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 {
 	lr_t *lrc = &itx->itx_lr; /* common log record */
-	lr_write_t *lr;
-	char *dbuf;
+	lr_write_t *lr = (lr_write_t *)lrc;
 	uint64_t seq = lrc->lrc_seq;
 	uint64_t txg = lrc->lrc_txg;
 	uint64_t reclen = lrc->lrc_reclen;
-	uint64_t dlen = 0;
-	int error;
+	uint64_t dlen;
 
 	if (lwb == NULL)
 		return (NULL);
 	ASSERT(lwb->lwb_buf != NULL);
 
-	/*
-	 * If it's a write, fetch the data or get its blkptr as appropriate.
-	 */
-	if (lrc->lrc_txtype == TX_WRITE) {
-		lr = (lr_write_t *)lrc;
-		if (txg > spa_freeze_txg(zilog->zl_spa))
-			txg_wait_synced(zilog->zl_dmu_pool, txg);
-		if (itx->itx_wr_state != WR_COPIED) {
-			if (itx->itx_wr_state == WR_NEED_COPY) {
-				dlen = P2ROUNDUP_TYPED(lr->lr_length,
-				    sizeof (uint64_t), uint64_t);
-				ASSERT(dlen);
-				dbuf = kmem_alloc(dlen, KM_NOSLEEP);
-				/* on memory shortage use dmu_sync */
-				if (dbuf == NULL) {
-					itx->itx_wr_state = WR_INDIRECT;
-					dlen = 0;
-				}
-			} else {
-				ASSERT(itx->itx_wr_state == WR_INDIRECT);
-				dbuf = NULL;
-			}
-			error = zilog->zl_get_data(itx->itx_private, lr, dbuf);
-			if (error) {
-				if (dlen)
-					kmem_free(dbuf, dlen);
-				if (error != ENOENT && error != EALREADY) {
-					txg_wait_synced(zilog->zl_dmu_pool,
-					    txg);
-					mutex_enter(&zilog->zl_lock);
-					zilog->zl_ss_seq =
-					    MAX(seq, zilog->zl_ss_seq);
-					mutex_exit(&zilog->zl_lock);
-					return (lwb);
-				}
-				mutex_enter(&zilog->zl_lock);
-				zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(
-				    &(lr->lr_blkptr))), seq);
-				mutex_exit(&zilog->zl_lock);
-				return (lwb);
-			}
-		}
-	}
+	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
+		dlen = P2ROUNDUP_TYPED(
+		    lr->lr_length, sizeof (uint64_t), uint64_t);
+	else
+		dlen = 0;
 
 	zilog->zl_cur_used += (reclen + dlen);
 
@@ -785,32 +774,58 @@
 	 */
 	if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
 		lwb = zil_lwb_write_start(zilog, lwb);
-		if (lwb == NULL) {
-			if (dlen)
-				kmem_free(dbuf, dlen);
+		if (lwb == NULL)
 			return (NULL);
-		}
 		ASSERT(lwb->lwb_nused == 0);
 		if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
 			txg_wait_synced(zilog->zl_dmu_pool, txg);
 			mutex_enter(&zilog->zl_lock);
 			zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
 			mutex_exit(&zilog->zl_lock);
-			if (dlen)
-				kmem_free(dbuf, dlen);
 			return (lwb);
 		}
 	}
 
-	lrc->lrc_reclen += dlen;
+	if (lwb->lwb_zio == NULL)
+		zil_lwb_write_init(zilog, lwb);
+
 	bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen);
-	lwb->lwb_nused += reclen;
-	if (dlen) {
-		bcopy(dbuf, lwb->lwb_buf + lwb->lwb_nused, dlen);
-		lwb->lwb_nused += dlen;
-		kmem_free(dbuf, dlen);
-		lrc->lrc_reclen -= dlen; /* for kmem_free of itx */
+
+	/*
+	 * If it's a write, fetch the data or get its blkptr as appropriate.
+	 */
+	if (lrc->lrc_txtype == TX_WRITE) {
+		if (txg > spa_freeze_txg(zilog->zl_spa))
+			txg_wait_synced(zilog->zl_dmu_pool, txg);
+		if (itx->itx_wr_state != WR_COPIED) {
+			char *dbuf;
+			int error;
+
+			/* alignment is guaranteed */
+			lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused);
+			if (dlen) {
+				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+				dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen;
+				lr->lr_common.lrc_reclen += dlen;
+			} else {
+				ASSERT(itx->itx_wr_state == WR_INDIRECT);
+				dbuf = NULL;
+			}
+			error = zilog->zl_get_data(
+			    itx->itx_private, lr, dbuf, lwb->lwb_zio);
+			if (error) {
+				ASSERT(error == ENOENT || error == EEXIST ||
+				    error == EALREADY);
+				return (lwb);
+			}
+		}
 	}
+
+	mutex_enter(&zilog->zl_lock);
+	ASSERT(seq > zilog->zl_wait_seq);
+	zilog->zl_wait_seq = seq;
+	mutex_exit(&zilog->zl_lock);
+	lwb->lwb_nused += reclen + dlen;
 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
 	ASSERT3U(lwb->lwb_seq, <, seq);
 	lwb->lwb_seq = seq;
@@ -993,8 +1008,9 @@
 	/*
 	 * Wait if necessary for our seq to be committed.
 	 */
-	if (lwb) {
-		while (zilog->zl_ss_seq < seq && zilog->zl_log_error == 0)
+	if (lwb && zilog->zl_wait_seq) {
+		while (zilog->zl_ss_seq < zilog->zl_wait_seq &&
+		    zilog->zl_log_error == 0)
 			cv_wait(&zilog->zl_cv_seq, &zilog->zl_lock);
 		zil_flush_vdevs(zilog, seq);
 	}
@@ -1009,6 +1025,7 @@
 		cv_broadcast(&zilog->zl_cv_seq);
 	}
 	/* wake up others waiting to start a write */
+	zilog->zl_wait_seq = 0;
 	zilog->zl_writer = B_FALSE;
 	mutex_exit(&zilog->zl_lock);
 	cv_broadcast(&zilog->zl_cv_write);
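
Taken together, the zil.c changes give each log buffer a lazily created rewrite zio (lwb_zio) and pass it through zl_get_data() to dmu_sync(), so large-write data blocks are issued as children of the log write instead of forcing txg_wait_synced(). A hedged outline of that path, with the wrapper name hypothetical:

	static void
	issue_indirect_write(zilog_t *zilog, lwb_t *lwb, itx_t *itx,
	    lr_write_t *lr)
	{
		/* create the log block zio on first use */
		if (lwb->lwb_zio == NULL)
			zil_lwb_write_init(zilog, lwb);

		/*
		 * zl_get_data() (e.g. zfs_get_data()) hands lwb_zio to
		 * dmu_sync() as the parent; EINPROGRESS from dmu_sync()
		 * means the data write is in flight alongside the log write.
		 */
		(void) zilog->zl_get_data(itx->itx_private, lr, NULL,
		    lwb->lwb_zio);
	}
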
--- a/usr/src/uts/common/fs/zfs/zvol.c	Mon Jun 19 18:00:33 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zvol.c	Mon Jun 19 19:31:35 2006 -0700
@@ -68,6 +68,7 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/mkdev.h>
 #include <sys/zil.h>
+#include <sys/refcount.h>
 
 #include "zfs_namecheck.h"
 
@@ -683,7 +684,9 @@
 	itx_t *itx;
 	lr_write_t *lr;
 	objset_t *os;
+	dmu_buf_t *db;
 	uint64_t txg;
+	uint64_t boff;
 	int error;
 	uint32_t blocksize;
 
@@ -714,18 +717,22 @@
 		if (nbytes <= zvol_immediate_write_sz) {
 			itx = zvol_immediate_itx(off, nbytes, addr);
 		} else {
+			boff =  P2ALIGN_TYPED(off, blocksize, uint64_t);
 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
 			lr->lr_foid = ZVOL_OBJ;
 			lr->lr_offset = off;
 			lr->lr_length = nbytes;
-			lr->lr_blkoff = 0;
+			lr->lr_blkoff = off - boff;
 			BP_ZERO(&lr->lr_blkptr);
 
-			txg_suspend(dmu_objset_pool(os));
-			error = dmu_sync(os, ZVOL_OBJ, off, &lr->lr_blkoff,
-			    &lr->lr_blkptr, txg);
-			txg_resume(dmu_objset_pool(os));
+			/* XXX - we should do these IOs in parallel */
+			VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, boff,
+			    FTAG, &db));
+			ASSERT(boff == db->db_offset);
+			error = dmu_sync(NULL, db, &lr->lr_blkptr,
+			    txg, NULL, NULL);
+			dmu_buf_rele(db, FTAG);
 			if (error) {
 				kmem_free(itx, offsetof(itx_t, itx_lr));
 				return (error);