6341569 zio_alloc_blk() vdev distribution performs badly
author	perrin
date	Sat, 04 Nov 2006 07:59:19 -0800
changeset 3063 b252896b372b
parent 3062 46d280f5351d
child 3064 fdd6499bdd98
6341569 zio_alloc_blk() vdev distribution performs badly
6428639 large writes to zvol synchs too much, better cut down a little
6444692 Need to flush disk write cache for dmu_sync buffers
6465634 zvol: dmu_sync() should be issued in parallel
6468731 lwb_state_t can be nuked
6470042 parallel dmu_sync() isn't being used
6471679 stash blocksize in zvol_state_t rather than reading in every zvol_log_write
6472230 ZIL vdev management is inefficient
6473775 zil_commit changes in snv_48 make it hot for O_DSYNC workloads
6478388 ZIL replay takes too long causing issues while booting
6486390 zil_commit could push more transactions
6486496 zil_replay() useful debug
usr/src/uts/common/fs/zfs/metaslab.c
usr/src/uts/common/fs/zfs/sys/metaslab.h
usr/src/uts/common/fs/zfs/sys/zfs_znode.h
usr/src/uts/common/fs/zfs/sys/zil.h
usr/src/uts/common/fs/zfs/sys/zil_impl.h
usr/src/uts/common/fs/zfs/sys/zio.h
usr/src/uts/common/fs/zfs/zfs_log.c
usr/src/uts/common/fs/zfs/zfs_vnops.c
usr/src/uts/common/fs/zfs/zfs_znode.c
usr/src/uts/common/fs/zfs/zil.c
usr/src/uts/common/fs/zfs/zio.c
usr/src/uts/common/fs/zfs/zvol.c
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Sat Nov 04 07:59:19 2006 -0800
@@ -704,7 +704,7 @@
  */
 static int
 metaslab_alloc_dva(spa_t *spa, uint64_t psize, dva_t *dva, int d,
-    dva_t *hintdva, uint64_t txg)
+    dva_t *hintdva, uint64_t txg, boolean_t hintdva_avoid)
 {
 	metaslab_group_t *mg, *rotor;
 	metaslab_class_t *mc;
@@ -725,10 +725,10 @@
 	 * nothing actually breaks if we miss a few updates -- we just won't
 	 * allocate quite as evenly.  It all balances out over time.
 	 *
-	 * If we are doing ditto blocks, try to spread them across consecutive
-	 * vdevs.  If we're forced to reuse a vdev before we've allocated
-	 * all of our ditto blocks, then try and spread them out on that
-	 * vdev as much as possible.  If it turns out to not be possible,
+	 * If we are doing ditto or log blocks, try to spread them across
+	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
+	 * allocated all of our ditto blocks, then try and spread them out on
+	 * that vdev as much as possible.  If it turns out to not be possible,
 	 * gradually lower our standards until anything becomes acceptable.
 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
 	 * gives us hope of containing our fault domains to something we're
@@ -743,7 +743,10 @@
 	 */
 	if (hintdva) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
-		mg = vd->vdev_mg;
+		if (hintdva_avoid)
+			mg = vd->vdev_mg->mg_next;
+		else
+			mg = vd->vdev_mg;
 	} else if (d != 0) {
 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
 		mg = vd->vdev_mg->mg_next;
@@ -918,7 +921,7 @@
 
 int
 metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ndvas,
-    uint64_t txg, blkptr_t *hintbp)
+    uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid)
 {
 	dva_t *dva = bp->blk_dva;
 	dva_t *hintdva = hintbp->blk_dva;
@@ -930,7 +933,8 @@
 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
 
 	for (d = 0; d < ndvas; d++) {
-		error = metaslab_alloc_dva(spa, psize, dva, d, hintdva, txg);
+		error = metaslab_alloc_dva(spa, psize, dva, d, hintdva,
+		    txg, hintbp_avoid);
 		if (error) {
 			for (d--; d >= 0; d--) {
 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
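
The hintdva_avoid/hintbp_avoid flag above makes the allocator start its rotor at the metaslab group after the hinted vdev, so successive intent-log blocks are spread across devices instead of piling onto one. Below is a minimal userland sketch of that rotor idea only; it is not the ZFS metaslab code, and the device table, sizes, and function names are invented for illustration.

/*
 * Toy rotor allocator: pick the next device in a ring, optionally
 * skipping the hinted one.  Illustrative only; not the ZFS metaslab code.
 */
#include <stdio.h>

#define	NDEVS	4

static unsigned long dev_free[NDEVS] = { 100, 100, 100, 100 };

/* Return the device to allocate from, or -1 if nothing fits. */
static int
rotor_alloc(int hint, int hint_avoid, unsigned long size)
{
	/* Start at the hint, or just past it when asked to avoid it. */
	int start = hint_avoid ? (hint + 1) % NDEVS : hint;
	int i;

	for (i = 0; i < NDEVS; i++) {
		int dev = (start + i) % NDEVS;
		if (dev_free[dev] >= size) {
			dev_free[dev] -= size;
			return (dev);
		}
	}
	return (-1);
}

int
main(void)
{
	int dev = 0;
	int i;

	/* Successive "log blocks" walk across the devices. */
	for (i = 0; i < 6; i++) {
		dev = rotor_alloc(dev, 1, 10);
		printf("block %d -> dev %d\n", i, dev);
	}
	return (0);
}
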
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h	Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h	Sat Nov 04 07:59:19 2006 -0800
@@ -48,7 +48,7 @@
 extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
 
 extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp,
-    int ncopies, uint64_t txg, blkptr_t *hintbp);
+    int ncopies, uint64_t txg, blkptr_t *hintbp, boolean_t hintbp_avoid);
 extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
     boolean_t now);
 extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h	Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h	Sat Nov 04 07:59:19 2006 -0800
@@ -151,6 +151,7 @@
 	uint_t		z_seq;		/* modification sequence number */
 	uint64_t	z_mapcnt;	/* number of pages mapped to file */
 	uint64_t	z_last_itx;	/* last ZIL itx on this znode */
+	uint32_t	z_sync_cnt;	/* synchronous open count */
 	kmutex_t	z_acl_lock;	/* acl data lock */
 	list_node_t	z_link_node;	/* all znodes in fs link */
 	/*
--- a/usr/src/uts/common/fs/zfs/sys/zil.h	Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h	Sat Nov 04 07:59:19 2006 -0800
@@ -212,10 +212,23 @@
 	list_node_t	itx_node;	/* linkage on zl_itx_list */
 	void		*itx_private;	/* type-specific opaque data */
 	itx_wr_state_t	itx_wr_state;	/* write state */
+	uint8_t		itx_sync;	/* synchronous transaction */
 	lr_t		itx_lr;		/* common part of log record */
 	/* followed by type-specific part of lr_xx_t and its immediate data */
 } itx_t;
 
+
+/*
+ * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done()
+ * to handle the cleanup of the dmu_sync() buffer write
+ */
+typedef struct {
+	zilog_t		*zgd_zilog;	/* zilog */
+	blkptr_t	*zgd_bp;	/* block pointer */
+	struct rl	*zgd_rl;	/* range lock */
+} zgd_t;
+
+
 typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg,
     uint64_t txg);
 typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg,
@@ -252,6 +265,8 @@
 extern int	zil_suspend(zilog_t *zilog);
 extern void	zil_resume(zilog_t *zilog);
 
+extern void	zil_add_vdev(zilog_t *zilog, uint64_t vdev);
+
 extern int zil_disable;
 
 #ifdef	__cplusplus
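
The new zgd_t bundles everything the dmu_sync() completion callback needs (the zilog, the log record's block pointer, and, for znodes, the range lock) so that cleanup and the vdev-flush bookkeeping happen in one place once the write lands. Here is a compilable sketch of that hand-off pattern; all names are invented and a trivial async_write() stands in for dmu_sync().

/*
 * Sketch of the zgd_t hand-off pattern: bundle everything the
 * completion callback needs, pass it through the asynchronous write,
 * and free it in exactly one place.  Names invented; this is not the
 * ZFS dmu_sync() interface.
 */
#include <stdlib.h>
#include <stdio.h>

typedef void (write_done_cb_t)(void *arg);

typedef struct write_ctx {
	int	wc_vdev;	/* device to flush once the write lands */
	void	*wc_lock;	/* lock to drop in the callback */
} write_ctx_t;

/* Stand-in for an asynchronous write that completes immediately. */
static void
async_write(write_done_cb_t *done, void *arg)
{
	done(arg);
}

static void
write_done(void *arg)
{
	write_ctx_t *wc = arg;

	printf("flush needed on vdev %d\n", wc->wc_vdev);
	/* the real callback would drop wc->wc_lock here */
	free(wc);		/* single point of cleanup */
}

int
main(void)
{
	write_ctx_t *wc = malloc(sizeof (*wc));

	wc->wc_vdev = 3;
	wc->wc_lock = NULL;
	async_write(write_done, wc);
	return (0);
}
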
--- a/usr/src/uts/common/fs/zfs/sys/zil_impl.h	Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zil_impl.h	Sat Nov 04 07:59:19 2006 -0800
@@ -35,13 +35,6 @@
 extern "C" {
 #endif
 
-typedef enum lwb_state_type {
-	UNWRITTEN,	/* buffer yet to be written */
-	SEQ_INCOMPLETE,	/* buffer written, but there's an unwritten buffer in */
-			/* the sequence before this */
-	SEQ_COMPLETE,	/* no unwritten buffers before this */
-} lwb_state_t;
-
 /*
  * Log write buffer.
  */
@@ -58,8 +51,11 @@
 } lwb_t;
 
 /*
- * vdev element for use in flushing device write caches
+ * Vdev flushing: We use a bitmap of ZIL_VDEV_BMSZ bytes for low vdev numbers;
+ * any vdev numbers beyond that use a linked list of zil_vdev_t structures.
  */
+
+#define	ZIL_VDEV_BMSZ 16 /* 16 * 8 = 128 vdevs */
 typedef struct zil_vdev {
 	uint64_t	vdev;		/* device written */
 	list_node_t	vdev_seq_node;	/* zilog->zl_vdev_list linkage */
@@ -76,8 +72,8 @@
 	objset_t	*zl_os;		/* object set we're logging */
 	zil_get_data_t	*zl_get_data;	/* callback to get object content */
 	zio_t		*zl_root_zio;	/* log writer root zio */
-	uint64_t	zl_itx_seq;	/* itx sequence number */
-	uint64_t	zl_ss_seq;	/* last tx on stable storage */
+	uint64_t	zl_itx_seq;	/* next itx sequence number */
+	uint64_t	zl_commit_seq;	/* committed up to this number */
 	uint64_t	zl_lr_seq;	/* log record sequence number */
 	uint64_t	zl_destroy_txg;	/* txg of last zil_destroy() */
 	uint64_t	zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */
@@ -96,8 +92,11 @@
 	uint64_t	zl_prev_used;	/* previous commit log size used */
 	list_t		zl_lwb_list;	/* in-flight log write list */
 	list_t		zl_vdev_list;	/* list of [vdev, seq] pairs */
+	uint8_t		zl_vdev_bmap[ZIL_VDEV_BMSZ]; /* bitmap of vdevs */
 	taskq_t		*zl_clean_taskq; /* runs lwb and itx clean tasks */
 	avl_tree_t	zl_dva_tree;	/* track DVAs during log parse */
+	clock_t		zl_replay_time;	/* lbolt of when replay started */
+	uint64_t	zl_replay_blks;	/* number of log blocks replayed */
 };
 
 typedef struct zil_dva_node {
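
The per-zilog vdev tracking now uses a small bitmap for the common case (vdev ids below ZIL_VDEV_BMSZ * 8) and falls back to the ordered zl_vdev_list only for larger ids, which avoids taking zl_lock and allocating a list node on every log write. A compilable sketch of the bitmap half follows, assuming GCC/Clang __atomic builtins in place of the kernel's atomic_or_8(); it is a simplified illustration, not the zil.c code.

/*
 * Byte-array bitmap of "dirty" vdevs, set locklessly by writers and
 * drained by a single flusher.  Sketch only.
 */
#include <stdint.h>
#include <stdio.h>

#define	VDEV_BMSZ	16			/* 16 * 8 = 128 vdevs */

static uint8_t vdev_bmap[VDEV_BMSZ];

static void
vdev_mark(uint64_t vdev)
{
	if (vdev < VDEV_BMSZ * 8) {
		uint8_t bit = (uint8_t)(1u << (vdev % 8));
		__atomic_fetch_or(&vdev_bmap[vdev / 8], bit, __ATOMIC_RELAXED);
	}
	/* larger vdev ids would go to an overflow list instead */
}

static void
vdev_flush_all(void)
{
	int i, j;

	for (i = 0; i < VDEV_BMSZ; i++) {
		uint8_t b = vdev_bmap[i];
		if (b == 0)
			continue;
		for (j = 0; j < 8; j++) {
			if (b & (1u << j))
				printf("flush write cache of vdev %d\n",
				    i * 8 + j);
		}
		vdev_bmap[i] = 0;	/* only the flusher clears bits */
	}
}

int
main(void)
{
	vdev_mark(0);
	vdev_mark(5);
	vdev_mark(42);
	vdev_flush_all();
	return (0);
}
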
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Sat Nov 04 07:59:19 2006 -0800
@@ -286,7 +286,8 @@
     uint64_t size, void *data, int checksum,
     zio_done_func_t *done, void *private, int priority, int flags);
 
-extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg);
+extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp,
+    blkptr_t *old_bp, uint64_t txg);
 extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg);
 
 extern int zio_wait(zio_t *zio);
--- a/usr/src/uts/common/fs/zfs/zfs_log.c	Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c	Sat Nov 04 07:59:19 2006 -0800
@@ -273,6 +273,7 @@
 
 	itx->itx_private = zp->z_zfsvfs;
 
+	itx->itx_sync = (zp->z_sync_cnt != 0);
 	seq = zil_itx_assign(zilog, itx, tx);
 	zp->z_last_itx = seq;
 }
@@ -297,6 +298,7 @@
 	lr->lr_offset = off;
 	lr->lr_length = len;
 
+	itx->itx_sync = (zp->z_sync_cnt != 0);
 	seq = zil_itx_assign(zilog, itx, tx);
 	zp->z_last_itx = seq;
 }
@@ -326,6 +328,7 @@
 	ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
 	ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
 
+	itx->itx_sync = (zp->z_sync_cnt != 0);
 	seq = zil_itx_assign(zilog, itx, tx);
 	zp->z_last_itx = seq;
 }
@@ -350,6 +353,7 @@
 	lr->lr_aclcnt = (uint64_t)aclcnt;
 	bcopy(z_ace, (ace_t *)(lr + 1), aclcnt * sizeof (ace_t));
 
+	itx->itx_sync = (zp->z_sync_cnt != 0);
 	seq = zil_itx_assign(zilog, itx, tx);
 	zp->z_last_itx = seq;
 }
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c	Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c	Sat Nov 04 07:59:19 2006 -0800
@@ -154,11 +154,15 @@
  *	ZFS_EXIT(zfsvfs);		// finished in zfs
  *	return (error);			// done, report error
  */
-
 /* ARGSUSED */
 static int
 zfs_open(vnode_t **vpp, int flag, cred_t *cr)
 {
+	znode_t	*zp = VTOZ(*vpp);
+
+	/* Keep a count of the synchronous opens in the znode */
+	if (flag & (FSYNC | FDSYNC))
+		atomic_inc_32(&zp->z_sync_cnt);
 	return (0);
 }
 
@@ -166,6 +170,12 @@
 static int
 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
 {
+	znode_t	*zp = VTOZ(vp);
+
+	/* Decrement the synchronous opens in the znode */
+	if (flag & (FSYNC | FDSYNC))
+		atomic_dec_32(&zp->z_sync_cnt);
+
 	/*
 	 * Clean up any locks held by this process on the vp.
 	 */
@@ -827,14 +837,17 @@
 }
 
 void
-zfs_get_done(dmu_buf_t *db, void *vrl)
+zfs_get_done(dmu_buf_t *db, void *vzgd)
 {
-	rl_t *rl = (rl_t *)vrl;
+	zgd_t *zgd = (zgd_t *)vzgd;
+	rl_t *rl = zgd->zgd_rl;
 	vnode_t *vp = ZTOV(rl->r_zp);
 
-	dmu_buf_rele(db, rl);
+	dmu_buf_rele(db, vzgd);
 	zfs_range_unlock(rl);
 	VN_RELE(vp);
+	zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
+	kmem_free(zgd, sizeof (zgd_t));
 }
 
 /*
@@ -849,9 +862,11 @@
 	uint64_t off = lr->lr_offset;
 	dmu_buf_t *db;
 	rl_t *rl;
+	zgd_t *zgd;
 	int dlen = lr->lr_length;  		/* length of user data */
 	int error = 0;
 
+	ASSERT(zio);
 	ASSERT(dlen != 0);
 
 	/*
@@ -907,11 +922,19 @@
 			error = ENOENT;
 			goto out;
 		}
-		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, rl, &db));
+		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
+		zgd->zgd_rl = rl;
+		zgd->zgd_zilog = zfsvfs->z_log;
+		zgd->zgd_bp = &lr->lr_blkptr;
+		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
 		ASSERT(boff == db->db_offset);
 		lr->lr_blkoff = off - boff;
 		error = dmu_sync(zio, db, &lr->lr_blkptr,
-		    lr->lr_common.lrc_txg, zio ? zfs_get_done : NULL, rl);
+		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
+		if (error == 0) {
+			zil_add_vdev(zfsvfs->z_log,
+			    DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
+		}
 		/*
 		 * If we get EINPROGRESS, then we need to wait for a
 		 * write IO initiated by dmu_sync() to complete before
@@ -920,7 +943,8 @@
 		 */
 		if (error == EINPROGRESS)
 			return (0);
-		dmu_buf_rele(db, rl);
+		dmu_buf_rele(db, zgd);
+		kmem_free(zgd, sizeof (zgd_t));
 	}
 out:
 	zfs_range_unlock(rl);
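
z_sync_cnt gives the ZIL a cheap answer to "does anyone have this file open O_SYNC or O_DSYNC right now?": each synchronous open bumps the counter, each close drops it, and zfs_log.c tags every itx with itx_sync = (z_sync_cnt != 0) so the commit path knows which records must be pushed. A small standalone sketch of that counting pattern follows; the flag values and helper names are invented, and __atomic builtins stand in for atomic_inc_32()/atomic_dec_32().

/*
 * Per-file count of synchronous openers.  Sketch only.
 */
#include <stdint.h>
#include <stdio.h>

#define	MY_FSYNC	0x1
#define	MY_FDSYNC	0x2

typedef struct file_node {
	uint32_t	sync_cnt;	/* synchronous open count */
} file_node_t;

static void
file_open(file_node_t *fn, int flag)
{
	if (flag & (MY_FSYNC | MY_FDSYNC))
		__atomic_fetch_add(&fn->sync_cnt, 1, __ATOMIC_RELAXED);
}

static void
file_close(file_node_t *fn, int flag)
{
	if (flag & (MY_FSYNC | MY_FDSYNC))
		__atomic_fetch_sub(&fn->sync_cnt, 1, __ATOMIC_RELAXED);
}

/* Must a log record for this file be pushed synchronously? */
static int
record_is_sync(const file_node_t *fn)
{
	return (fn->sync_cnt != 0);
}

int
main(void)
{
	file_node_t fn = { 0 };

	file_open(&fn, MY_FDSYNC);
	printf("sync=%d\n", record_is_sync(&fn));	/* 1 */
	file_close(&fn, MY_FDSYNC);
	printf("sync=%d\n", record_is_sync(&fn));	/* 0 */
	return (0);
}
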
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c	Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c	Sat Nov 04 07:59:19 2006 -0800
@@ -411,6 +411,7 @@
 	zp->z_id = obj_num;
 	zp->z_blksz = blksz;
 	zp->z_seq = 0x7A4653;
+	zp->z_sync_cnt = 0;
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
--- a/usr/src/uts/common/fs/zfs/zil.c	Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zil.c	Sat Nov 04 07:59:19 2006 -0800
@@ -352,7 +352,8 @@
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		txg = dmu_tx_get_txg(tx);
 
-		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk, txg);
+		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
+		    NULL, txg);
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
@@ -494,73 +495,101 @@
 void
 zil_add_vdev(zilog_t *zilog, uint64_t vdev)
 {
-	zil_vdev_t *zv;
+	zil_vdev_t *zv, *new;
+	uint64_t bmap_sz = sizeof (zilog->zl_vdev_bmap) << 3;
+	uchar_t *cp;
 
 	if (zfs_nocacheflush)
 		return;
 
-	ASSERT(MUTEX_HELD(&zilog->zl_lock));
-	zv = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
-	zv->vdev = vdev;
-	list_insert_tail(&zilog->zl_vdev_list, zv);
+	if (vdev < bmap_sz) {
+		cp = zilog->zl_vdev_bmap + (vdev / 8);
+		atomic_or_8(cp, 1 << (vdev % 8));
+	} else {
+		/*
+		 * insert into ordered list
+		 */
+		mutex_enter(&zilog->zl_lock);
+		for (zv = list_head(&zilog->zl_vdev_list); zv != NULL;
+		    zv = list_next(&zilog->zl_vdev_list, zv)) {
+			if (zv->vdev == vdev) {
+				/* duplicate found - just return */
+				mutex_exit(&zilog->zl_lock);
+				return;
+			}
+			if (zv->vdev > vdev) {
+				/* insert before this entry */
+				new = kmem_alloc(sizeof (zil_vdev_t),
+				    KM_SLEEP);
+				new->vdev = vdev;
+				list_insert_before(&zilog->zl_vdev_list,
+				    zv, new);
+				mutex_exit(&zilog->zl_lock);
+				return;
+			}
+		}
+		/* ran off end of list, insert at the end */
+		ASSERT(zv == NULL);
+		new = kmem_alloc(sizeof (zil_vdev_t), KM_SLEEP);
+		new->vdev = vdev;
+		list_insert_tail(&zilog->zl_vdev_list, new);
+		mutex_exit(&zilog->zl_lock);
+	}
+}
+
+/* start an async flush of the write cache for this vdev */
+void
+zil_flush_vdev(spa_t *spa, uint64_t vdev, zio_t **zio)
+{
+	vdev_t *vd;
+
+	if (*zio == NULL)
+		*zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+
+	vd = vdev_lookup_top(spa, vdev);
+	ASSERT(vd);
+
+	(void) zio_nowait(zio_ioctl(*zio, spa, vd, DKIOCFLUSHWRITECACHE,
+	    NULL, NULL, ZIO_PRIORITY_NOW,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
 }
 
 void
 zil_flush_vdevs(zilog_t *zilog)
 {
-	vdev_t *vd;
-	zil_vdev_t *zv, *zv2;
-	zio_t *zio;
-	spa_t *spa;
+	zil_vdev_t *zv;
+	zio_t *zio = NULL;
+	spa_t *spa = zilog->zl_spa;
 	uint64_t vdev;
+	uint8_t b;
+	int i, j;
+
+	ASSERT(zilog->zl_writer);
 
-	if (zfs_nocacheflush)
-		return;
-
-	ASSERT(MUTEX_HELD(&zilog->zl_lock));
-
-	spa = zilog->zl_spa;
-	zio = NULL;
+	for (i = 0; i < sizeof (zilog->zl_vdev_bmap); i++) {
+		b = zilog->zl_vdev_bmap[i];
+		if (b == 0)
+			continue;
+		for (j = 0; j < 8; j++) {
+			if (b & (1 << j)) {
+				vdev = (i << 3) + j;
+				zil_flush_vdev(spa, vdev, &zio);
+			}
+		}
+		zilog->zl_vdev_bmap[i] = 0;
+	}
 
 	while ((zv = list_head(&zilog->zl_vdev_list)) != NULL) {
-		vdev = zv->vdev;
+		zil_flush_vdev(spa, zv->vdev, &zio);
 		list_remove(&zilog->zl_vdev_list, zv);
 		kmem_free(zv, sizeof (zil_vdev_t));
-
-		/*
-		 * remove all chained entries with same vdev
-		 */
-		zv = list_head(&zilog->zl_vdev_list);
-		while (zv) {
-			zv2 = list_next(&zilog->zl_vdev_list, zv);
-			if (zv->vdev == vdev) {
-				list_remove(&zilog->zl_vdev_list, zv);
-				kmem_free(zv, sizeof (zil_vdev_t));
-			}
-			zv = zv2;
-		}
-
-		/* flush the write cache for this vdev */
-		mutex_exit(&zilog->zl_lock);
-		if (zio == NULL)
-			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-		vd = vdev_lookup_top(spa, vdev);
-		ASSERT(vd);
-		(void) zio_nowait(zio_ioctl(zio, spa, vd, DKIOCFLUSHWRITECACHE,
-		    NULL, NULL, ZIO_PRIORITY_NOW,
-		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY));
-		mutex_enter(&zilog->zl_lock);
 	}
-
 	/*
 	 * Wait for all the flushes to complete.  Not all devices actually
 	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
 	 */
-	if (zio != NULL) {
-		mutex_exit(&zilog->zl_lock);
+	if (zio)
 		(void) zio_wait(zio);
-		mutex_enter(&zilog->zl_lock);
-	}
 }
 
 /*
@@ -610,10 +639,12 @@
 		zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	}
-	lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
-	    ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf,
-	    lwb->lwb_sz, zil_lwb_write_done, lwb,
-	    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+	if (lwb->lwb_zio == NULL) {
+		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
+		    ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf,
+		    lwb->lwb_sz, zil_lwb_write_done, lwb,
+		    ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+	}
 }
 
 /*
@@ -655,7 +686,9 @@
 	if (zil_blksz > ZIL_MAX_BLKSZ)
 		zil_blksz = ZIL_MAX_BLKSZ;
 
-	error = zio_alloc_blk(spa, zil_blksz, bp, txg);
+	BP_ZERO(bp);
+	/* pass the old blkptr in order to spread log blocks across devs */
+	error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg);
 	if (error) {
 		/*
 		 * Reinitialise the lwb.
@@ -689,20 +722,20 @@
 	nlwb->lwb_zio = NULL;
 
 	/*
-	 * Put new lwb at the end of the log chain,
-	 * and record the vdev for later flushing
+	 * Put new lwb at the end of the log chain
 	 */
 	mutex_enter(&zilog->zl_lock);
 	list_insert_tail(&zilog->zl_lwb_list, nlwb);
+	mutex_exit(&zilog->zl_lock);
+
+	/* Record the vdev for later flushing */
 	zil_add_vdev(zilog, DVA_GET_VDEV(BP_IDENTITY(&(lwb->lwb_blk))));
-	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * kick off the write for the old log block
 	 */
 	dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
-	if (lwb->lwb_zio == NULL)
-		zil_lwb_write_init(zilog, lwb);
+	ASSERT(lwb->lwb_zio);
 	zio_nowait(lwb->lwb_zio);
 
 	return (nlwb);
@@ -729,6 +762,8 @@
 
 	zilog->zl_cur_used += (reclen + dlen);
 
+	zil_lwb_write_init(zilog, lwb);
+
 	/*
 	 * If this record won't fit in the current log block, start a new one.
 	 */
@@ -736,6 +771,7 @@
 		lwb = zil_lwb_write_start(zilog, lwb);
 		if (lwb == NULL)
 			return (NULL);
+		zil_lwb_write_init(zilog, lwb);
 		ASSERT(lwb->lwb_nused == 0);
 		if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) {
 			txg_wait_synced(zilog->zl_dmu_pool, txg);
@@ -843,20 +879,26 @@
 		kmem_free(itx, offsetof(itx_t, itx_lr)
 		    + itx->itx_lr.lrc_reclen);
 	}
+	cv_broadcast(&zilog->zl_cv_writer);
 	mutex_exit(&zilog->zl_lock);
 }
 
 /*
- * If there are in-memory intent log transactions then
- * start up a taskq to free up any that have now been synced.
+ * If there are any in-memory intent log transactions which have now been
+ * synced then start up a taskq to free them.
  */
 void
 zil_clean(zilog_t *zilog)
 {
+	itx_t *itx;
+
 	mutex_enter(&zilog->zl_lock);
-	if (list_head(&zilog->zl_itx_list) != NULL)
+	itx = list_head(&zilog->zl_itx_list);
+	if ((itx != NULL) &&
+	    (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) {
 		(void) taskq_dispatch(zilog->zl_clean_taskq,
 		    (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP);
+	}
 	mutex_exit(&zilog->zl_lock);
 }
 
@@ -865,6 +907,7 @@
 {
 	uint64_t txg;
 	uint64_t reclen;
+	uint64_t commit_seq = 0;
 	itx_t *itx, *itx_next = (itx_t *)-1;
 	lwb_t *lwb;
 	spa_t *spa;
@@ -883,13 +926,9 @@
 			 * dirty the fs by calling zil_create()
 			 */
 			if (list_is_empty(&zilog->zl_itx_list)) {
-				/* wake up others waiting to start a write */
 				zilog->zl_writer = B_FALSE;
-				cv_broadcast(&zilog->zl_cv_writer);
-				mutex_exit(&zilog->zl_lock);
 				return;
 			}
-
 			mutex_exit(&zilog->zl_lock);
 			zil_create(zilog);
 			mutex_enter(&zilog->zl_lock);
@@ -897,11 +936,7 @@
 		}
 	}
 
-	/*
-	 * Loop through in-memory log transactions filling log blocks,
-	 * until we reach the given sequence number and there's no more
-	 * room in the write buffer.
-	 */
+	/* Loop through in-memory log transactions filling log blocks. */
 	DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
 	for (;;) {
 		/*
@@ -917,6 +952,8 @@
 		for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) {
 			if (foid == 0) /* push all foids? */
 				break;
+			if (itx->itx_sync) /* push all O_[D]SYNC */
+				break;
 			switch (itx->itx_lr.lrc_txtype) {
 			case TX_SETATTR:
 			case TX_WRITE:
@@ -936,8 +973,9 @@
 		reclen = itx->itx_lr.lrc_reclen;
 		if ((itx->itx_lr.lrc_seq > seq) &&
 		    ((lwb == NULL) || (lwb->lwb_nused == 0) ||
-		    (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb))))
+		    (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)))) {
 			break;
+		}
 
 		/*
 		 * Save the next pointer.  Even though we soon drop
@@ -947,10 +985,10 @@
 		 */
 		itx_next = list_next(&zilog->zl_itx_list, itx);
 		list_remove(&zilog->zl_itx_list, itx);
+		mutex_exit(&zilog->zl_lock);
 		txg = itx->itx_lr.lrc_txg;
 		ASSERT(txg);
 
-		mutex_exit(&zilog->zl_lock);
 		if (txg > spa_last_synced_txg(spa) ||
 		    txg > spa_freeze_txg(spa))
 			lwb = zil_lwb_commit(zilog, itx, lwb);
@@ -960,10 +998,16 @@
 		zilog->zl_itx_list_sz -= reclen;
 	}
 	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
+	/* determine commit sequence number */
+	itx = list_head(&zilog->zl_itx_list);
+	if (itx)
+		commit_seq = itx->itx_lr.lrc_seq;
+	else
+		commit_seq = zilog->zl_itx_seq;
 	mutex_exit(&zilog->zl_lock);
 
 	/* write the last block out */
-	if (lwb != NULL && lwb->lwb_nused != 0)
+	if (lwb != NULL && lwb->lwb_zio != NULL)
 		lwb = zil_lwb_write_start(zilog, lwb);
 
 	zilog->zl_prev_used = zilog->zl_cur_used;
@@ -972,26 +1016,24 @@
 	/*
 	 * Wait if necessary for the log blocks to be on stable storage.
 	 */
-	mutex_enter(&zilog->zl_lock);
 	if (zilog->zl_root_zio) {
-		mutex_exit(&zilog->zl_lock);
 		DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
 		(void) zio_wait(zilog->zl_root_zio);
 		DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
-		mutex_enter(&zilog->zl_lock);
-		zil_flush_vdevs(zilog);
+		if (!zfs_nocacheflush)
+			zil_flush_vdevs(zilog);
 	}
 
 	if (zilog->zl_log_error || lwb == NULL) {
 		zilog->zl_log_error = 0;
-		mutex_exit(&zilog->zl_lock);
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
-		mutex_enter(&zilog->zl_lock);
 	}
-	/* wake up others waiting to start a write */
+
+	mutex_enter(&zilog->zl_lock);
 	zilog->zl_writer = B_FALSE;
-	cv_broadcast(&zilog->zl_cv_writer);
-	mutex_exit(&zilog->zl_lock);
+
+	ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
+	zilog->zl_commit_seq = commit_seq;
 }
 
 /*
@@ -1009,9 +1051,17 @@
 
 	seq = MIN(seq, zilog->zl_itx_seq);	/* cap seq at largest itx seq */
 
-	while (zilog->zl_writer)
+	while (zilog->zl_writer) {
 		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
+		if (seq < zilog->zl_commit_seq) {
+			mutex_exit(&zilog->zl_lock);
+			return;
+		}
+	}
 	zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
+	/* wake up others waiting on the commit */
+	cv_broadcast(&zilog->zl_cv_writer);
+	mutex_exit(&zilog->zl_lock);
 }
 
 /*
@@ -1278,7 +1328,8 @@
 	const zil_header_t *zh = zilog->zl_header;
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
-	int pass, error;
+	char *name;
+	int pass, error, sunk;
 
 	if (zilog->zl_stop_replay)
 		return;
@@ -1343,7 +1394,7 @@
 	 * and update the log header to reflect the fact that we did so.
 	 * We use the DMU's ability to assign into a specific txg to do this.
 	 */
-	for (pass = 1; /* CONSTANTCONDITION */; pass++) {
+	for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) {
 		uint64_t replay_txg;
 		dmu_tx_t *replay_tx;
 
@@ -1378,6 +1429,24 @@
 
 		dmu_tx_commit(replay_tx);
 
+		if (!error)
+			return;
+
+		/*
+		 * The DMU's dnode layer doesn't see removes until the txg
+		 * commits, so a subsequent claim can spuriously fail with
+		 * EEXIST. So if we receive any error other than ERESTART
+		 * we try syncing out any removes then retrying the
+		 * transaction.
+		 */
+		if (error != ERESTART && !sunk) {
+			if (zr->zr_rm_sync != NULL)
+				zr->zr_rm_sync(zr->zr_arg);
+			txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+			sunk = B_TRUE;
+			continue; /* retry */
+		}
+
 		if (error != ERESTART)
 			break;
 
@@ -1388,29 +1457,21 @@
 		dprintf("pass %d, retrying\n", pass);
 	}
 
-	if (error) {
-		char *name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
-		dmu_objset_name(zr->zr_os, name);
-		cmn_err(CE_WARN, "ZFS replay transaction error %d, "
-		    "dataset %s, seq 0x%llx, txtype %llu\n",
-		    error, name,
-		    (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
-		zilog->zl_stop_replay = 1;
-		kmem_free(name, MAXNAMELEN);
-	}
+	ASSERT(error && error != ERESTART);
+	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+	dmu_objset_name(zr->zr_os, name);
+	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+	    "dataset %s, seq 0x%llx, txtype %llu\n",
+	    error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype);
+	zilog->zl_stop_replay = 1;
+	kmem_free(name, MAXNAMELEN);
+}
 
-	/*
-	 * The DMU's dnode layer doesn't see removes until the txg commits,
-	 * so a subsequent claim can spuriously fail with EEXIST.
-	 * To prevent this, if we might have removed an object,
-	 * wait for the delete thread to delete it, and then
-	 * wait for the transaction group to sync.
-	 */
-	if (txtype == TX_REMOVE || txtype == TX_RMDIR || txtype == TX_RENAME) {
-		if (zr->zr_rm_sync != NULL)
-			zr->zr_rm_sync(zr->zr_arg);
-		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
-	}
+/* ARGSUSED */
+static void
+zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	zilog->zl_replay_blks++;
 }
 
 /*
@@ -1445,7 +1506,9 @@
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	zilog->zl_stop_replay = 0;
-	(void) zil_parse(zilog, NULL, zil_replay_log_record, &zr,
+	zilog->zl_replay_time = lbolt;
+	ASSERT(zilog->zl_replay_blks == 0);
+	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
 	    zh->zh_claim_txg);
 	kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE);
 
--- a/usr/src/uts/common/fs/zfs/zio.c	Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c	Sat Nov 04 07:59:19 2006 -0800
@@ -1147,7 +1147,7 @@
 	gsize = SPA_GANGBLOCKSIZE;
 	gbps_left = SPA_GBH_NBLKPTRS;
 
-	error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL);
+	error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL, B_FALSE);
 	if (error == ENOSPC)
 		panic("can't allocate gang block header");
 	ASSERT(error == 0);
@@ -1174,7 +1174,7 @@
 
 		while (resid <= maxalloc * gbps_left) {
 			error = metaslab_alloc(spa, maxalloc, gbp, ndvas,
-			    txg, bp);
+			    txg, bp, B_FALSE);
 			if (error == 0)
 				break;
 			ASSERT3U(error, ==, ENOSPC);
@@ -1245,7 +1245,7 @@
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
 	error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas,
-	    zio->io_txg, NULL);
+	    zio->io_txg, NULL, B_FALSE);
 
 	if (error == 0) {
 		bp->blk_birth = zio->io_txg;
@@ -1653,25 +1653,27 @@
  * Try to allocate an intent log block.  Return 0 on success, errno on failure.
  */
 int
-zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *bp, uint64_t txg)
+zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp,
+    uint64_t txg)
 {
 	int error;
 
 	spa_config_enter(spa, RW_READER, FTAG);
 
-	BP_ZERO(bp);
-
-	error = metaslab_alloc(spa, size, bp, 1, txg, NULL);
+	/*
+	 * We were passed the previous log block's dva_t in old_bp->blk_dva[0].
+	 */
+	error = metaslab_alloc(spa, size, new_bp, 1, txg, old_bp, B_TRUE);
 
 	if (error == 0) {
-		BP_SET_LSIZE(bp, size);
-		BP_SET_PSIZE(bp, size);
-		BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
-		BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_ZILOG);
-		BP_SET_TYPE(bp, DMU_OT_INTENT_LOG);
-		BP_SET_LEVEL(bp, 0);
-		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
-		bp->blk_birth = txg;
+		BP_SET_LSIZE(new_bp, size);
+		BP_SET_PSIZE(new_bp, size);
+		BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
+		BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG);
+		BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+		BP_SET_LEVEL(new_bp, 0);
+		BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
+		new_bp->blk_birth = txg;
 	}
 
 	spa_config_exit(spa, FTAG);
--- a/usr/src/uts/common/fs/zfs/zvol.c	Sat Nov 04 01:18:55 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/zvol.c	Sat Nov 04 07:59:19 2006 -0800
@@ -92,6 +92,7 @@
 typedef struct zvol_state {
 	char		zv_name[MAXPATHLEN]; /* pool/dd name */
 	uint64_t	zv_volsize;	/* amount of space we advertise */
+	uint64_t	zv_volblocksize; /* volume block size */
 	minor_t		zv_minor;	/* minor number */
 	uint8_t		zv_min_bs;	/* minimum addressable block shift */
 	uint8_t		zv_readonly;	/* hard readonly; like write-protect */
@@ -104,6 +105,13 @@
 	krwlock_t	zv_dslock;	/* dmu_sync() rwlock */
 } zvol_state_t;
 
+/*
+ * zvol maximum transfer in one DMU tx.
+ */
+int zvol_maxphys = DMU_MAX_ACCESS/2;
+
+int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
+
 static void
 zvol_size_changed(zvol_state_t *zv, dev_t dev)
 {
@@ -309,6 +317,7 @@
 {
 	zvol_state_t *zv;
 	objset_t *os;
+	dmu_object_info_t doi;
 	uint64_t volsize;
 	minor_t minor = 0;
 	struct pathname linkpath;
@@ -428,7 +437,12 @@
 	zv->zv_volsize = volsize;
 	zv->zv_objset = os;
 	zv->zv_mode = ds_mode;
-	zv->zv_zilog = zil_open(os, NULL);
+	zv->zv_zilog = zil_open(os, zvol_get_data);
+
+	/* get and cache the blocksize */
+	error = dmu_object_info(os, ZVOL_OBJ, &doi);
+	ASSERT(error == 0);
+	zv->zv_volblocksize = doi.doi_data_block_size;
 
 	rw_init(&zv->zv_dslock, NULL, RW_DEFAULT, NULL);
 
@@ -687,83 +701,111 @@
 	return (itx);
 }
 
+void
+zvol_get_done(dmu_buf_t *db, void *vzgd)
+{
+	zgd_t *zgd = (zgd_t *)vzgd;
+
+	dmu_buf_rele(db, vzgd);
+	zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
+	kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
+{
+	zvol_state_t *zv = arg;
+	objset_t *os = zv->zv_objset;
+	dmu_buf_t *db;
+	zgd_t *zgd;
+	int dlen = lr->lr_length;  		/* length of user data */
+	int error;
+
+	ASSERT(zio);
+	ASSERT(dlen != 0);
+	ASSERT(buf == NULL);
+
+	zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
+	zgd->zgd_zilog = zv->zv_zilog;
+	zgd->zgd_bp = &lr->lr_blkptr;
+
+	VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db));
+	/*
+	 * Have to lock to ensure that when the data is
+	 * written out and its checksum is being calculated
+	 * no one can change the data.
+	 */
+	rw_enter(&zv->zv_dslock, RW_READER);
+	error = dmu_sync(zio, db, &lr->lr_blkptr,
+	    lr->lr_common.lrc_txg, zvol_get_done, zgd);
+	rw_exit(&zv->zv_dslock);
+	if (error == 0) {
+		zil_add_vdev(zv->zv_zilog,
+		    DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
+	}
+	/*
+	 * If we get EINPROGRESS, then we need to wait for a
+	 * write IO initiated by dmu_sync() to complete before
+	 * we can release this dbuf.  We will finish everything
+	 * up in the zvol_get_done() callback.
+	 */
+	if (error == EINPROGRESS)
+		return (0);
+	dmu_buf_rele(db, zgd);
+	kmem_free(zgd, sizeof (zgd_t));
+	return (error);
+}
+
 /*
  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
  *
  * We store data in the log buffers if it's small enough.
- * Otherwise we flush the data out via dmu_sync().
+ * Otherwise we will later flush the data out via dmu_sync().
  */
-ssize_t zvol_immediate_write_sz = 65536;
+ssize_t zvol_immediate_write_sz = 32768;
 
-int
+void
 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len,
     char *addr)
 {
-	dmu_object_info_t doi;
 	ssize_t nbytes;
 	itx_t *itx;
 	lr_write_t *lr;
-	objset_t *os;
-	dmu_buf_t *db;
-	uint64_t txg;
+	zilog_t *zilog = zv->zv_zilog;
 	uint64_t boff;
-	int error;
 	uint32_t blocksize;
 
 	/* handle common case */
 	if (len <= zvol_immediate_write_sz) {
 		itx = zvol_immediate_itx(off, len, addr);
-		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
-		return (0);
+		(void) zil_itx_assign(zilog, itx, tx);
+		return;
 	}
 
-	txg = dmu_tx_get_txg(tx);
-	os = zv->zv_objset;
+	blocksize = zv->zv_volblocksize;
 
-	/*
-	 * We need to dmu_sync() each block in the range.
-	 * For this we need the blocksize.
-	 */
-	error = dmu_object_info(os, ZVOL_OBJ, &doi);
-	if (error)
-		return (error);
-	blocksize = doi.doi_data_block_size;
-
-	/*
-	 * We need to immediate write or dmu_sync() each block in the range.
-	 */
 	while (len) {
 		nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
 		if (nbytes <= zvol_immediate_write_sz) {
 			itx = zvol_immediate_itx(off, nbytes, addr);
 		} else {
-			boff =  P2ALIGN_TYPED(off, blocksize, uint64_t);
+			boff = P2ALIGN_TYPED(off, blocksize, uint64_t);
 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+			itx->itx_wr_state = WR_INDIRECT;
+			itx->itx_private = zv;
 			lr = (lr_write_t *)&itx->itx_lr;
 			lr->lr_foid = ZVOL_OBJ;
 			lr->lr_offset = off;
 			lr->lr_length = nbytes;
 			lr->lr_blkoff = off - boff;
 			BP_ZERO(&lr->lr_blkptr);
-
-			/* XXX - we should do these IOs in parallel */
-			VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, boff,
-			    FTAG, &db));
-			ASSERT(boff == db->db_offset);
-			error = dmu_sync(NULL, db, &lr->lr_blkptr,
-			    txg, NULL, NULL);
-			dmu_buf_rele(db, FTAG);
-			if (error) {
-				kmem_free(itx, offsetof(itx_t, itx_lr));
-				return (error);
-			}
-			itx->itx_wr_state = WR_COPIED;
 		}
-		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
+		(void) zil_itx_assign(zilog, itx, tx);
 		len -= nbytes;
 		off += nbytes;
 	}
-	return (0);
 }
 
 int
@@ -777,7 +819,6 @@
 	int error = 0;
 	int sync;
 	int reading;
-	int txg_sync_needed = B_FALSE;
 
 	if (zv == NULL) {
 		bioerror(bp, ENXIO);
@@ -822,7 +863,7 @@
 
 	while (resid != 0 && off < volsize) {
 
-		size = MIN(resid, 1UL << 20);	/* cap at 1MB per tx */
+		size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */
 
 		if (size > volsize - off)	/* don't write past the end */
 			size = volsize - off;
@@ -837,13 +878,9 @@
 				dmu_tx_abort(tx);
 			} else {
 				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
-				if (sync) {
-					/* use the ZIL to commit this write */
-					if (zvol_log_write(zv, tx, off, size,
-					    addr) != 0) {
-						txg_sync_needed = B_TRUE;
-					}
-				}
+				/* add a log write transaction */
+				if (sync)
+					zvol_log_write(zv, tx, off, size, addr);
 				dmu_tx_commit(tx);
 			}
 		}
@@ -860,42 +897,55 @@
 
 	biodone(bp);
 
-	if (sync) {
-		if (txg_sync_needed)
-			txg_wait_synced(dmu_objset_pool(os), 0);
-		else
-			zil_commit(zv->zv_zilog, UINT64_MAX, 0);
-	}
+	if (sync)
+		zil_commit(zv->zv_zilog, UINT64_MAX, 0);
 
 	return (0);
 }
 
+/*
+ * Set the buffer count to the zvol maximum transfer.
+ * Using our own routine instead of the default minphys()
+ * means that for larger writes we write bigger buffers on X86
+ * (128K instead of 56K) and flush the disk write cache less often
+ * (every zvol_maxphys - currently 1MB) instead of minphys (currently
+ * 56K on X86 and 128K on sparc).
+ */
+void
+zvol_minphys(struct buf *bp)
+{
+	if (bp->b_bcount > zvol_maxphys)
+		bp->b_bcount = zvol_maxphys;
+}
+
 /*ARGSUSED*/
 int
 zvol_read(dev_t dev, uio_t *uiop, cred_t *cr)
 {
-	return (physio(zvol_strategy, NULL, dev, B_READ, minphys, uiop));
+	return (physio(zvol_strategy, NULL, dev, B_READ, zvol_minphys, uiop));
 }
 
 /*ARGSUSED*/
 int
 zvol_write(dev_t dev, uio_t *uiop, cred_t *cr)
 {
-	return (physio(zvol_strategy, NULL, dev, B_WRITE, minphys, uiop));
+	return (physio(zvol_strategy, NULL, dev, B_WRITE, zvol_minphys, uiop));
 }
 
 /*ARGSUSED*/
 int
 zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr)
 {
-	return (aphysio(zvol_strategy, anocancel, dev, B_READ, minphys, aio));
+	return (aphysio(zvol_strategy, anocancel, dev, B_READ, zvol_minphys,
+	    aio));
 }
 
 /*ARGSUSED*/
 int
 zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr)
 {
-	return (aphysio(zvol_strategy, anocancel, dev, B_WRITE, minphys, aio));
+	return (aphysio(zvol_strategy, anocancel, dev, B_WRITE, zvol_minphys,
+	    aio));
 }
 
 /*
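
For completeness, the point of zvol_minphys() above: physio() splits a request into chunks no larger than what the minphys routine allows, and for O_DSYNC zvols each chunk costs a zil_commit() and a write-cache flush, so raising the cap from the system default to zvol_maxphys cuts the number of flushes dramatically. A toy calculation, using the chunk sizes quoted in the zvol_minphys() comment:

/*
 * Why a bigger minphys cap matters for synchronous zvol writes:
 * one strategy call (and, for O_DSYNC, one log commit plus cache
 * flush) is issued per chunk.  Toy arithmetic only.
 */
#include <stdio.h>

static unsigned long
nchunks(unsigned long bytes, unsigned long cap)
{
	return ((bytes + cap - 1) / cap);	/* round up */
}

int
main(void)
{
	unsigned long write = 8UL << 20;	/* an 8MB O_DSYNC write */

	printf("56K cap : %lu strategy calls\n", nchunks(write, 56UL << 10));
	printf("1MB cap : %lu strategy calls\n", nchunks(write, 1UL << 20));
	return (0);
}
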