usr/src/uts/common/fs/zfs/zio.c
changeset 5530 4ed96167d864
parent 5450 b25030891c44
child 5688 c0b02c8fd2c0
child 13152 b2ac0409282f
--- a/usr/src/uts/common/fs/zfs/zio.c	Tue Nov 27 17:41:22 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c	Tue Nov 27 22:58:05 2007 -0800
@@ -61,9 +61,6 @@
 char *zio_type_name[ZIO_TYPES] = {
 	"null", "read", "write", "free", "claim", "ioctl" };
 
-/* At or above this size, force gang blocking - for testing */
-uint64_t zio_gang_bang = SPA_MAXBLOCKSIZE + 1;
-
 /* Force an allocation failure when non-zero */
 uint16_t zio_zil_fail_shift = 0;
 uint16_t zio_io_fail_shift = 0;
@@ -170,8 +167,6 @@
 			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
 			    KMC_NODEBUG);
 
-			dprintf("creating cache for size %5lx align %5lx\n",
-			    size, align);
 		}
 	}
 
@@ -356,9 +351,6 @@
 		zio->io_bp = bp;
 		zio->io_bp_copy = *bp;
 		zio->io_bp_orig = *bp;
-		if (dmu_ot[BP_GET_TYPE(bp)].ot_metadata ||
-		    BP_GET_LEVEL(bp) != 0)
-			zio->io_flags |= ZIO_FLAG_METADATA;
 	}
 	zio->io_done = done;
 	zio->io_private = private;
@@ -366,10 +358,7 @@
 	zio->io_priority = priority;
 	zio->io_stage = stage;
 	zio->io_pipeline = pipeline;
-	zio->io_async_stages = ZIO_ASYNC_PIPELINE_STAGES;
 	zio->io_timestamp = lbolt64;
-	if (pio != NULL)
-		zio->io_flags |= (pio->io_flags & ZIO_FLAG_METADATA);
 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 	zio_push_transform(zio, data, size, size);
@@ -395,7 +384,7 @@
 	if (pio == NULL) {
 		if (type != ZIO_TYPE_NULL &&
 		    !(flags & ZIO_FLAG_CONFIG_HELD)) {
-			spa_config_enter(zio->io_spa, RW_READER, zio);
+			spa_config_enter(spa, RW_READER, zio);
 			zio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
 		}
 		zio->io_root = zio;
@@ -409,7 +398,7 @@
 		    !(pio->io_flags & ZIO_FLAG_CONFIG_GRABBED) &&
 		    !(pio->io_flags & ZIO_FLAG_CONFIG_HELD)) {
 			pio->io_flags |= ZIO_FLAG_CONFIG_GRABBED;
-			spa_config_enter(zio->io_spa, RW_READER, pio);
+			spa_config_enter(spa, RW_READER, pio);
 		}
 		if (stage < ZIO_STAGE_READY)
 			pio->io_children_notready++;
@@ -524,9 +513,6 @@
 	zio->io_compress = compress;
 	zio->io_ndvas = ncopies;
 
-	if (compress != ZIO_COMPRESS_OFF)
-		zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
-
 	if (bp->blk_birth != txg) {
 		/* XXX the bp usually (always?) gets re-zeroed later */
 		BP_ZERO(bp);
@@ -551,7 +537,7 @@
 
 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
-	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE(bp));
 
 	zio->io_bookmark = *zb;
 	zio->io_checksum = checksum;
@@ -612,7 +598,7 @@
 
 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
 	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, ZIO_FLAG_USER,
-	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
+	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE(bp));
 
 	zio->io_bp = &zio->io_bp_copy;
 
@@ -641,7 +627,7 @@
 
 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
 	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
-	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE(bp));
 
 	zio->io_bp = &zio->io_bp_copy;
 
@@ -820,7 +806,7 @@
 
 	zio->io_waiter = curthread;
 
-	zio_next_stage_async(zio);
+	zio_execute(zio);
 
 	mutex_enter(&zio->io_lock);
 	while (zio->io_stalled != ZIO_STAGE_DONE)
@@ -838,7 +824,23 @@
 void
 zio_nowait(zio_t *zio)
 {
-	zio_next_stage_async(zio);
+	zio_execute(zio);
+}
+
+void
+zio_interrupt(zio_t *zio)
+{
+	(void) taskq_dispatch(zio->io_spa->spa_zio_intr_taskq[zio->io_type],
+	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
+}
+
+static int
+zio_issue_async(zio_t *zio)
+{
+	(void) taskq_dispatch(zio->io_spa->spa_zio_issue_taskq[zio->io_type],
+	    (task_func_t *)zio_execute, zio, TQ_SLEEP);
+
+	return (ZIO_PIPELINE_STOP);
 }
 
 /*
@@ -846,18 +848,20 @@
  * I/O pipeline interlocks: parent/child dependency scoreboarding
  * ==========================================================================
  */
-static void
+static int
 zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
 {
+	int rv = ZIO_PIPELINE_CONTINUE;
+
 	mutex_enter(&zio->io_lock);
-	if (*countp == 0) {
-		ASSERT(zio->io_stalled == 0);
-		mutex_exit(&zio->io_lock);
-		zio_next_stage(zio);
-	} else {
+	ASSERT(zio->io_stalled == 0);
+	if (*countp != 0) {
 		zio->io_stalled = stage;
-		mutex_exit(&zio->io_lock);
+		rv = ZIO_PIPELINE_STOP;
 	}
+	mutex_exit(&zio->io_lock);
+
+	return (rv);
 }
 
 static void
@@ -872,48 +876,54 @@
 	if (--*countp == 0 && pio->io_stalled == stage) {
 		pio->io_stalled = 0;
 		mutex_exit(&pio->io_lock);
-		zio_next_stage_async(pio);
+		zio_execute(pio);
 	} else {
 		mutex_exit(&pio->io_lock);
 	}
 }
 
-static void
-zio_wait_children_ready(zio_t *zio)
+int
+zio_wait_for_children_ready(zio_t *zio)
 {
-	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
-	    &zio->io_children_notready);
+	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
+	    &zio->io_children_notready));
 }
 
-void
-zio_wait_children_done(zio_t *zio)
+int
+zio_wait_for_children_done(zio_t *zio)
 {
-	zio_wait_for_children(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
-	    &zio->io_children_notdone);
+	return (zio_wait_for_children(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
+	    &zio->io_children_notdone));
 }
 
-static void
+static int
 zio_read_init(zio_t *zio)
 {
-	if (BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF) {
-		uint64_t csize = BP_GET_PSIZE(zio->io_bp);
+	blkptr_t *bp = zio->io_bp;
+
+	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+		uint64_t csize = BP_GET_PSIZE(bp);
 		void *cbuf = zio_buf_alloc(csize);
 
 		zio_push_transform(zio, cbuf, csize, csize);
 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
 	}
 
-	if (BP_IS_GANG(zio->io_bp)) {
+	if (BP_IS_GANG(bp)) {
 		uint64_t gsize = SPA_GANGBLOCKSIZE;
 		void *gbuf = zio_buf_alloc(gsize);
 
 		zio_push_transform(zio, gbuf, gsize, gsize);
 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_GANG_MEMBERS;
 	}
-	zio_next_stage(zio);
+
+	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
+		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_ready(zio_t *zio)
 {
 	zio_t *pio = zio->io_parent;
@@ -922,16 +932,16 @@
 		zio->io_ready(zio);
 
 	if (pio != NULL)
-		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
+		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_READY,
 		    &pio->io_children_notready);
 
 	if (zio->io_bp)
 		zio->io_bp_copy = *zio->io_bp;
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_vdev_retry_io(zio_t *zio)
 {
 	zio_t *pio = zio->io_parent;
@@ -967,7 +977,7 @@
 		if (pio->io_stage > ZIO_STAGE_OPEN && IO_IS_ALLOCATING(pio))
 			pio->io_flags |= ZIO_FLAG_WRITE_RETRY;
 
-		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_CHILDREN_DONE);
+		ASSERT(pio->io_stage <= ZIO_STAGE_WAIT_FOR_CHILDREN_DONE);
 		mutex_exit(&pio->io_lock);
 	}
 
@@ -977,7 +987,8 @@
 	 */
 	zio->io_flags &= ~ZIO_FLAG_WRITE_RETRY;
 	zio->io_error = 0;
-	zio_next_stage_async(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 int
@@ -1029,7 +1040,7 @@
 			zio->io_stage = ZIO_STAGE_READY;
 		}
 
-		(void) taskq_dispatch(zio_taskq, zio_resubmit_stage_async,
+		(void) taskq_dispatch(zio_taskq, (task_func_t *)zio_execute,
 		    zio, TQ_SLEEP);
 	}
 	mutex_exit(&spa->spa_zio_lock);
@@ -1049,7 +1060,7 @@
 	return (0);
 }
 
-static void
+static int
 zio_vdev_suspend_io(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
@@ -1069,9 +1080,11 @@
 	cv_broadcast(&spa->spa_zio_cv);
 #endif
 	mutex_exit(&spa->spa_zio_lock);
+
+	return (ZIO_PIPELINE_STOP);
 }
 
-static void
+static int
 zio_assess(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
@@ -1138,10 +1151,9 @@
 		 * property.
 		 */
 		if (zio_write_retry && zio->io_error != ENOSPC &&
-		    IO_IS_ALLOCATING(zio)) {
-			zio_vdev_retry_io(zio);
-			return;
-		}
+		    IO_IS_ALLOCATING(zio))
+			return (zio_vdev_retry_io(zio));
+
 		ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
 
 		/*
@@ -1175,22 +1187,20 @@
 				    "uncorrectable I/O failure and the "
 				    "failure mode property for this pool "
 				    "is set to panic.", spa_name(spa));
-			} else {
-				cmn_err(CE_WARN, "Pool '%s' has encountered "
-				    "an uncorrectable I/O error. Manual "
-				    "intervention is required.",
-				    spa_name(spa));
-				zio_vdev_suspend_io(zio);
 			}
-			return;
+			cmn_err(CE_WARN, "Pool '%s' has encountered "
+			    "an uncorrectable I/O error. "
+			    "Manual intervention is required.", spa_name(spa));
+			return (zio_vdev_suspend_io(zio));
 		}
 	}
 	ASSERT(!(zio->io_flags & ZIO_FLAG_WRITE_RETRY));
 	ASSERT(zio->io_children_notready == 0);
-	zio_next_stage(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_done(zio_t *zio)
 {
 	zio_t *pio = zio->io_parent;
@@ -1221,7 +1231,7 @@
 			pio->io_child = next;
 		mutex_exit(&pio->io_lock);
 
-		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_DONE,
+		zio_notify_parent(zio, ZIO_STAGE_WAIT_FOR_CHILDREN_DONE,
 		    &pio->io_children_notdone);
 	}
 
@@ -1243,6 +1253,8 @@
 		cv_destroy(&zio->io_cv);
 		kmem_cache_free(zio_cache, zio);
 	}
+
+	return (ZIO_PIPELINE_STOP);
 }
 
 /*
@@ -1250,7 +1262,7 @@
  * Compression support
  * ==========================================================================
  */
-static void
+static int
 zio_write_compress(zio_t *zio)
 {
 	int compress = zio->io_compress;
@@ -1300,7 +1312,7 @@
 		ASSERT(csize != 0);
 		BP_SET_LSIZE(bp, lsize);
 		BP_SET_COMPRESS(bp, compress);
-		zio->io_pipeline = ZIO_REWRITE_PIPELINE;
+		zio->io_pipeline = ZIO_REWRITE_PIPELINE(bp);
 	} else {
 		if (bp->blk_birth == zio->io_txg)
 			BP_ZERO(bp);
@@ -1316,10 +1328,10 @@
 		}
 	}
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_read_decompress(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
@@ -1338,7 +1350,7 @@
 
 	zio_buf_free(data, bufsize);
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
@@ -1347,19 +1359,6 @@
  * ==========================================================================
  */
 static void
-zio_gang_pipeline(zio_t *zio)
-{
-	/*
-	 * By default, the pipeline assumes that we're dealing with a gang
-	 * block.  If we're not, strip out any gang-specific stages.
-	 */
-	if (!BP_IS_GANG(zio->io_bp))
-		zio->io_pipeline &= ~ZIO_GANG_STAGES;
-
-	zio_next_stage(zio);
-}
-
-static void
 zio_gang_byteswap(zio_t *zio)
 {
 	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
@@ -1368,7 +1367,7 @@
 		byteswap_uint64_array(zio->io_data, zio->io_size);
 }
 
-static void
+static int
 zio_get_gang_header(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
@@ -1384,10 +1383,10 @@
 	    zio->io_flags & ZIO_FLAG_GANG_INHERIT,
 	    ZIO_STAGE_OPEN, ZIO_READ_GANG_PIPELINE));
 
-	zio_wait_children_done(zio);
+	return (zio_wait_for_children_done(zio));
 }
 
-static void
+static int
 zio_read_gang_members(zio_t *zio)
 {
 	zio_gbh_phys_t *gbh;
@@ -1410,16 +1409,17 @@
 		ASSERT(!BP_IS_HOLE(gbp));
 
 		zio_nowait(zio_read(zio, zio->io_spa, gbp,
-		    (char *)zio->io_data + loff, lsize, NULL, NULL,
-		    zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
-		    &zio->io_bookmark));
+		    (char *)zio->io_data + loff, lsize,
+		    NULL, NULL, zio->io_priority,
+		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
 	}
 
 	zio_buf_free(gbh, gbufsize);
-	zio_wait_children_done(zio);
+
+	return (zio_wait_for_children_done(zio));
 }
 
-static void
+static int
 zio_rewrite_gang_members(zio_t *zio)
 {
 	zio_gbh_phys_t *gbh;
@@ -1446,15 +1446,16 @@
 
 		zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
 		    zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
-		    NULL, NULL, zio->io_priority, zio->io_flags,
-		    &zio->io_bookmark));
+		    NULL, NULL, zio->io_priority,
+		    zio->io_flags & ZIO_FLAG_GANG_INHERIT, &zio->io_bookmark));
 	}
 
 	zio_push_transform(zio, gbh, gsize, gbufsize);
-	zio_wait_children_ready(zio);
+
+	return (zio_wait_for_children_ready(zio));
 }
 
-static void
+static int
 zio_free_gang_members(zio_t *zio)
 {
 	zio_gbh_phys_t *gbh;
@@ -1476,10 +1477,11 @@
 	}
 
 	zio_buf_free(gbh, gbufsize);
-	zio_next_stage(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_claim_gang_members(zio_t *zio)
 {
 	zio_gbh_phys_t *gbh;
@@ -1500,7 +1502,8 @@
 	}
 
 	zio_buf_free(gbh, gbufsize);
-	zio_next_stage(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 static void
@@ -1549,8 +1552,10 @@
 
 	error = metaslab_alloc(spa, mc, gsize, bp, gbh_ndvas, txg, NULL,
 	    B_FALSE);
-	if (error)
-		return (error);
+	if (error) {
+		zio->io_error = error;
+		return (ZIO_PIPELINE_CONTINUE);
+	}
 
 	for (d = 0; d < gbh_ndvas; d++)
 		DVA_SET_GANG(&dva[d], 1);
@@ -1560,10 +1565,6 @@
 	gbh = zio_buf_alloc(gsize);
 	bzero(gbh, gsize);
 
-	/* We need to test multi-level gang blocks */
-	if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
-		maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
-
 	for (loff = 0, i = 0; loff != zio->io_size;
 	    loff += lsize, resid -= lsize, gbps_left--, i++) {
 		blkptr_t *gbp = &gbh->zg_blkptr[i];
@@ -1579,8 +1580,10 @@
 				break;
 			ASSERT3U(error, ==, ENOSPC);
 			/* XXX - free up previous allocations? */
-			if (maxalloc == SPA_MINBLOCKSIZE)
-				return (error);
+			if (maxalloc == SPA_MINBLOCKSIZE) {
+				zio->io_error = error;
+				return (ZIO_PIPELINE_CONTINUE);
+			}
 			maxalloc = P2ROUNDUP(maxalloc >> 1, SPA_MINBLOCKSIZE);
 		}
 
@@ -1614,14 +1617,14 @@
 	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
 
 	zio_push_transform(zio, gbh, gsize, gsize);
+
 	/*
-	 * As much as we'd like this to be zio_wait_children_ready(),
+	 * As much as we'd like this to be 'ready' instead of 'done',
 	 * updating our ASIZE doesn't happen until the io_done callback,
 	 * so we have to wait for that to finish in order for our BP
 	 * to be stable.
 	 */
-	zio_wait_children_done(zio);
-	return (0);
+	return (zio_wait_for_children_done(zio));
 }
 
 /*
@@ -1629,7 +1632,7 @@
  * Allocate and free blocks
  * ==========================================================================
  */
-static void
+static int
 zio_dva_allocate(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
@@ -1642,14 +1645,6 @@
 	ASSERT3U(zio->io_ndvas, >, 0);
 	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(spa));
 
-	/* For testing, make some blocks above a certain size be gang blocks */
-	if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
-		error = zio_write_allocate_gang_members(zio, mc);
-		if (error)
-			zio->io_error = error;
-		return;
-	}
-
 	/*
 	 * For testing purposes, we force I/Os to retry. We don't allow
 	 * retries beyond the first pass since those I/Os are non-allocating
@@ -1668,17 +1663,15 @@
 	if (error == 0) {
 		bp->blk_birth = zio->io_txg;
 	} else if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
-		error = zio_write_allocate_gang_members(zio, mc);
-		if (error == 0)
-			return;
-		zio->io_error = error;
+		return (zio_write_allocate_gang_members(zio, mc));
 	} else {
 		zio->io_error = error;
 	}
-	zio_next_stage(zio);
+
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_dva_free(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
@@ -1687,15 +1680,15 @@
 
 	BP_ZERO(bp);
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_dva_claim(zio_t *zio)
 {
 	zio->io_error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
@@ -1704,7 +1697,7 @@
  * ==========================================================================
  */
 
-static void
+static int
 zio_vdev_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
@@ -1719,24 +1712,21 @@
 	 * at that time.
 	 */
 	if (spa_state(spa) == POOL_STATE_IO_FAILURE &&
-	    zio->io_type == ZIO_TYPE_WRITE) {
-		zio_vdev_suspend_io(zio);
-		return;
-	}
+	    zio->io_type == ZIO_TYPE_WRITE)
+		return (zio_vdev_suspend_io(zio));
 
-	if (vd == NULL) {
-		/* The mirror_ops handle multiple DVAs in a single BP */
-		vdev_mirror_ops.vdev_op_io_start(zio);
-		return;
-	}
+	/*
+	 * The mirror_ops handle multiple DVAs in a single BP
+	 */
+	if (vd == NULL)
+		return (vdev_mirror_ops.vdev_op_io_start(zio));
 
 	align = 1ULL << tvd->vdev_ashift;
 
 	if (zio->io_retries == 0 && vd == tvd)
 		zio->io_flags |= ZIO_FLAG_FAILFAST;
 
-	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
-	    vd->vdev_children == 0) {
+	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
 		zio->io_flags |= ZIO_FLAG_PHYSICAL;
 		zio->io_offset += VDEV_LABEL_START_SIZE;
 	}
@@ -1760,19 +1750,16 @@
 	    P2ROUNDUP(ZIO_GET_IOSIZE(zio), align) == zio->io_size);
 	ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE));
 
-	vdev_io_start(zio);
-
-	/* zio_next_stage_async() gets called from io completion interrupt */
+	return (vd->vdev_ops->vdev_op_io_start(zio));
 }
 
-static void
+static int
 zio_vdev_io_done(zio_t *zio)
 {
 	if (zio->io_vd == NULL)
-		/* The mirror_ops handle multiple DVAs in a single BP */
-		vdev_mirror_ops.vdev_op_io_done(zio);
-	else
-		vdev_io_done(zio);
+		return (vdev_mirror_ops.vdev_op_io_done(zio));
+
+	return (zio->io_vd->vdev_ops->vdev_op_io_done(zio));
 }
 
 /* XXPOLICY */
@@ -1795,7 +1782,7 @@
 	return (B_TRUE);
 }
 
-static void
+static int
 zio_vdev_io_assess(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
@@ -1833,15 +1820,10 @@
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
 
-		dprintf("retry #%d for %s to %s offset %llx\n",
-		    zio->io_retries, zio_type_name[zio->io_type],
-		    vdev_description(vd), zio->io_offset);
-
-		zio_next_stage_async(zio);
-		return;
+		return (ZIO_PIPELINE_CONTINUE);
 	}
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 void
@@ -1876,7 +1858,7 @@
  * Generate and verify checksums
  * ==========================================================================
  */
-static void
+static int
 zio_checksum_generate(zio_t *zio)
 {
 	int checksum = zio->io_checksum;
@@ -1889,10 +1871,10 @@
 
 	zio_checksum(checksum, &bp->blk_cksum, zio->io_data, zio->io_size);
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_gang_checksum_generate(zio_t *zio)
 {
 	zio_cksum_t zc;
@@ -1905,10 +1887,10 @@
 
 	zio_checksum(ZIO_CHECKSUM_GANG_HEADER, &zc, zio->io_data, zio->io_size);
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
-static void
+static int
 zio_checksum_verify(zio_t *zio)
 {
 	if (zio->io_bp != NULL) {
@@ -1918,7 +1900,7 @@
 			    zio->io_spa, zio->io_vd, zio, 0, 0);
 	}
 
-	zio_next_stage(zio);
+	return (ZIO_PIPELINE_CONTINUE);
 }
 
 /*
@@ -1949,20 +1931,15 @@
  * Define the pipeline
  * ==========================================================================
  */
-typedef void zio_pipe_stage_t(zio_t *zio);
-
-static void
-zio_badop(zio_t *zio)
-{
-	panic("Invalid I/O pipeline stage %u for zio %p", zio->io_stage, zio);
-}
+typedef int zio_pipe_stage_t(zio_t *zio);
 
 zio_pipe_stage_t *zio_pipeline[ZIO_STAGE_DONE + 2] = {
-	zio_badop,
-	zio_wait_children_ready,
+	NULL,
+	zio_wait_for_children_ready,
+	zio_read_init,
+	zio_issue_async,
 	zio_write_compress,
 	zio_checksum_generate,
-	zio_gang_pipeline,
 	zio_get_gang_header,
 	zio_rewrite_gang_members,
 	zio_free_gang_members,
@@ -1972,116 +1949,63 @@
 	zio_dva_claim,
 	zio_gang_checksum_generate,
 	zio_ready,
-	zio_read_init,
 	zio_vdev_io_start,
 	zio_vdev_io_done,
 	zio_vdev_io_assess,
-	zio_wait_children_done,
+	zio_wait_for_children_done,
 	zio_checksum_verify,
 	zio_read_gang_members,
 	zio_read_decompress,
 	zio_assess,
 	zio_done,
-	zio_badop
+	NULL
 };
 
 /*
- * Move an I/O to the next stage of the pipeline and execute that stage.
- * There's no locking on io_stage because there's no legitimate way for
- * multiple threads to be attempting to process the same I/O.
+ * Execute the I/O pipeline until one of the following occurs:
+ * (1) the I/O completes; (2) the pipeline stalls waiting for
+ * dependent child I/Os; (3) the I/O issues, so we're waiting
+ * for an I/O completion interrupt; (4) the I/O is delegated by
+ * vdev-level caching or aggregation; (5) the I/O is deferred
+ * due to vdev-level queueing; (6) the I/O is handed off to
+ * another thread.  In all cases, the pipeline stops whenever
+ * there's no CPU work; it never burns a thread in cv_wait().
+ *
+ * There's no locking on io_stage because there's no legitimate way
+ * for multiple threads to be attempting to process the same I/O.
  */
 void
-zio_next_stage(zio_t *zio)
+zio_execute(zio_t *zio)
 {
-	uint32_t pipeline = zio->io_pipeline;
-
-	ASSERT(!MUTEX_HELD(&zio->io_lock));
-
-	if (zio->io_error) {
-		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
-		    zio, vdev_description(zio->io_vd),
-		    zio->io_offset, zio->io_stage, zio->io_error);
-		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
-			pipeline &= ZIO_ERROR_PIPELINE_MASK;
-	}
-
-	while (((1U << ++zio->io_stage) & pipeline) == 0)
-		continue;
-
-	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
-	ASSERT(zio->io_stalled == 0);
+	while (zio->io_stage < ZIO_STAGE_DONE) {
+		uint32_t pipeline = zio->io_pipeline;
+		int rv;
 
-	/*
-	 * See the comment in zio_next_stage_async() about per-CPU taskqs.
-	 */
-	if (((1U << zio->io_stage) & zio->io_async_stages) &&
-	    (zio->io_stage == ZIO_STAGE_WRITE_COMPRESS) &&
-	    !(zio->io_flags & ZIO_FLAG_METADATA)) {
-		taskq_t *tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
-		(void) taskq_dispatch(tq,
-		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
-	} else {
-		zio_pipeline[zio->io_stage](zio);
-	}
-}
-
-void
-zio_next_stage_async(zio_t *zio)
-{
-	taskq_t *tq;
-	uint32_t pipeline = zio->io_pipeline;
-
-	ASSERT(!MUTEX_HELD(&zio->io_lock));
+		ASSERT(!MUTEX_HELD(&zio->io_lock));
 
-	if (zio->io_error) {
-		dprintf("zio %p vdev %s offset %llx stage %d error %d\n",
-		    zio, vdev_description(zio->io_vd),
-		    zio->io_offset, zio->io_stage, zio->io_error);
-		if (((1U << zio->io_stage) & ZIO_VDEV_IO_PIPELINE) == 0)
+		/*
+		 * If an error occurred outside the vdev stack,
+		 * just execute the interlock stages to clean up.
+		 */
+		if (zio->io_error &&
+		    ((1U << zio->io_stage) & ZIO_VDEV_IO_STAGES) == 0)
 			pipeline &= ZIO_ERROR_PIPELINE_MASK;
-	}
 
-	while (((1U << ++zio->io_stage) & pipeline) == 0)
-		continue;
-
-	ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
-	ASSERT(zio->io_stalled == 0);
+		while (((1U << ++zio->io_stage) & pipeline) == 0)
+			continue;
 
-	/*
-	 * For performance, we'll probably want two sets of task queues:
-	 * per-CPU issue taskqs and per-CPU completion taskqs.  The per-CPU
-	 * part is for read performance: since we have to make a pass over
-	 * the data to checksum it anyway, we want to do this on the same CPU
-	 * that issued the read, because (assuming CPU scheduling affinity)
-	 * that thread is probably still there.  Getting this optimization
-	 * right avoids performance-hostile cache-to-cache transfers.
-	 *
-	 * Note that having two sets of task queues is also necessary for
-	 * correctness: if all of the issue threads get bogged down waiting
-	 * for dependent reads (e.g. metaslab freelist) to complete, then
-	 * there won't be any threads available to service I/O completion
-	 * interrupts.
-	 */
-	if ((1U << zio->io_stage) & zio->io_async_stages) {
-		if (zio->io_stage < ZIO_STAGE_VDEV_IO_DONE)
-			tq = zio->io_spa->spa_zio_issue_taskq[zio->io_type];
-		else
-			tq = zio->io_spa->spa_zio_intr_taskq[zio->io_type];
-		(void) taskq_dispatch(tq,
-		    (task_func_t *)zio_pipeline[zio->io_stage], zio, TQ_SLEEP);
-	} else {
-		zio_pipeline[zio->io_stage](zio);
+		ASSERT(zio->io_stage <= ZIO_STAGE_DONE);
+		ASSERT(zio->io_stalled == 0);
+
+		rv = zio_pipeline[zio->io_stage](zio);
+
+		if (rv == ZIO_PIPELINE_STOP)
+			return;
+
+		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
 	}
 }
 
-void
-zio_resubmit_stage_async(void *arg)
-{
-	zio_t *zio = (zio_t *)(uintptr_t)arg;
-
-	zio_next_stage_async(zio);
-}
-
 static boolean_t
 zio_io_should_fail(uint16_t range)
 {