6798384 It can take a village to raise a zio
author      Bill Moore <Bill.Moore@Sun.COM>
date        Tue, 27 Jan 2009 16:26:14 -0800
changeset 8632 36ef517870a3
parent 8631 f693eaa7ed4a
child 8633 9d6ad45f7cc6
files:
usr/src/cmd/mdb/common/modules/zfs/zfs.c
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/sys/zio.h
usr/src/uts/common/fs/zfs/vdev.c
usr/src/uts/common/fs/zfs/vdev_cache.c
usr/src/uts/common/fs/zfs/vdev_label.c
usr/src/uts/common/fs/zfs/vdev_mirror.c
usr/src/uts/common/fs/zfs/vdev_queue.c
usr/src/uts/common/fs/zfs/zio.c
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Tue Jan 27 15:52:26 2009 -0800
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Tue Jan 27 16:26:14 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -1607,20 +1607,34 @@
  * descend down the tree.
  */
 
-#define	ZIO_MAXDEPTH	16
+#define	ZIO_MAXINDENT	24
+#define	ZIO_MAXWIDTH	(sizeof (uintptr_t) * 2 + ZIO_MAXINDENT)
+#define	ZIO_WALK_SELF	0
+#define	ZIO_WALK_CHILD	1
+#define	ZIO_WALK_PARENT	2
+
+typedef struct zio_print_args {
+	int	zpa_current_depth;
+	int	zpa_min_depth;
+	int	zpa_max_depth;
+	int	zpa_type;
+	uint_t	zpa_flags;
+} zio_print_args_t;
+
+static int zio_child_cb(uintptr_t addr, const void *unknown, void *arg);
 
 static int
 zio_print_cb(uintptr_t addr, const void *data, void *priv)
 {
 	const zio_t *zio = data;
-	uintptr_t depth = (uintptr_t)priv;
+	zio_print_args_t *zpa = priv;
 	mdb_ctf_id_t type_enum, stage_enum;
+	int indent = zpa->zpa_current_depth;
 	const char *type, *stage;
-	int maxdepth;
+	uintptr_t laddr;
 
-	maxdepth = sizeof (uintptr_t) * 2 + ZIO_MAXDEPTH;
-	if (depth > ZIO_MAXDEPTH)
-		depth = ZIO_MAXDEPTH;
+	if (indent > ZIO_MAXINDENT)
+		indent = ZIO_MAXINDENT;
 
 	if (mdb_ctf_lookup_by_name("enum zio_type", &type_enum) == -1 ||
 	    mdb_ctf_lookup_by_name("enum zio_stage", &stage_enum) == -1) {
@@ -1638,45 +1652,100 @@
 	else
 		stage = "?";
 
+	if (zpa->zpa_current_depth >= zpa->zpa_min_depth) {
+		if (zpa->zpa_flags & DCMD_PIPE_OUT) {
+			mdb_printf("%?p\n", addr);
+		} else {
+			mdb_printf("%*s%-*p %-5s %-16s ", indent, "",
+			    ZIO_MAXWIDTH - indent, addr, type, stage);
+			if (zio->io_waiter)
+				mdb_printf("%?p\n", zio->io_waiter);
+			else
+				mdb_printf("-\n");
+		}
+	}
 
-	mdb_printf("%*s%-*p %-5s %-22s ",
-	    depth, "", maxdepth - depth, addr, type, stage);
-	if (zio->io_waiter)
-		mdb_printf("%?p\n", zio->io_waiter);
+	if (zpa->zpa_current_depth >= zpa->zpa_max_depth)
+		return (WALK_NEXT);
+
+	if (zpa->zpa_type == ZIO_WALK_PARENT)
+		laddr = addr + OFFSETOF(zio_t, io_parent_list);
 	else
-		mdb_printf("-\n");
+		laddr = addr + OFFSETOF(zio_t, io_child_list);
 
-	if (mdb_pwalk("zio_child", zio_print_cb, (void *)(depth + 1),
-	    addr) !=  0) {
-		mdb_warn("failed to walk zio_t children at %p\n", addr);
+	zpa->zpa_current_depth++;
+	if (mdb_pwalk("list", zio_child_cb, zpa, laddr) != 0) {
+		mdb_warn("failed to walk zio_t children at %p\n", laddr);
 		return (WALK_ERR);
 	}
+	zpa->zpa_current_depth--;
 
 	return (WALK_NEXT);
 }
 
-/*ARGSUSED*/
+/* ARGSUSED */
+static int
+zio_child_cb(uintptr_t addr, const void *unknown, void *arg)
+{
+	zio_link_t zl;
+	zio_t zio;
+	uintptr_t ziop;
+	zio_print_args_t *zpa = arg;
+
+	if (mdb_vread(&zl, sizeof (zl), addr) == -1) {
+		mdb_warn("failed to read zio_link_t at %p", addr);
+		return (WALK_ERR);
+	}
+
+	if (zpa->zpa_type == ZIO_WALK_PARENT)
+		ziop = (uintptr_t)zl.zl_parent;
+	else
+		ziop = (uintptr_t)zl.zl_child;
+
+	if (mdb_vread(&zio, sizeof (zio_t), ziop) == -1) {
+		mdb_warn("failed to read zio_t at %p", ziop);
+		return (WALK_ERR);
+	}
+
+	return (zio_print_cb(ziop, &zio, arg));
+}
+
+/* ARGSUSED */
 static int
 zio_print(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 {
 	zio_t zio;
-	int maxdepth;
-
-	maxdepth = sizeof (uintptr_t) * 2 + ZIO_MAXDEPTH;
+	zio_print_args_t zpa = { 0 };
 
 	if (!(flags & DCMD_ADDRSPEC))
 		return (DCMD_USAGE);
 
+	if (mdb_getopts(argc, argv,
+	    'r', MDB_OPT_SETBITS, INT_MAX, &zpa.zpa_max_depth,
+	    'c', MDB_OPT_SETBITS, ZIO_WALK_CHILD, &zpa.zpa_type,
+	    'p', MDB_OPT_SETBITS, ZIO_WALK_PARENT, &zpa.zpa_type,
+	    NULL) != argc)
+		return (DCMD_USAGE);
+
+	zpa.zpa_flags = flags;
+	if (zpa.zpa_max_depth != 0) {
+		if (zpa.zpa_type == ZIO_WALK_SELF)
+			zpa.zpa_type = ZIO_WALK_CHILD;
+	} else if (zpa.zpa_type != ZIO_WALK_SELF) {
+		zpa.zpa_min_depth = 1;
+		zpa.zpa_max_depth = 1;
+	}
+
 	if (mdb_vread(&zio, sizeof (zio_t), addr) == -1) {
 		mdb_warn("failed to read zio_t at %p", addr);
 		return (DCMD_ERR);
 	}
 
-	if (DCMD_HDRSPEC(flags))
-		mdb_printf("%<u>%-*s %-5s %-22s %-?s%</u>\n", maxdepth,
+	if (!(flags & DCMD_PIPE_OUT) && DCMD_HDRSPEC(flags))
+		mdb_printf("%<u>%-*s %-5s %-16s %-?s%</u>\n", ZIO_MAXWIDTH,
 		    "ADDRESS", "TYPE", "STAGE", "WAITER");
 
-	if (zio_print_cb(addr, &zio, NULL) != WALK_NEXT)
+	if (zio_print_cb(addr, &zio, &zpa) != WALK_NEXT)
 		return (DCMD_ERR);
 
 	return (DCMD_OK);
@@ -1880,50 +1949,6 @@
 }
 
 /*
- * ::walk zio_child
- *
- * Walk the children of a zio_t structure.
- */
-static int
-zio_child_walk_init(mdb_walk_state_t *wsp)
-{
-	zio_t zio;
-
-	if (wsp->walk_addr == 0) {
-		mdb_warn("::walk zio_child doesn't support global walks\n");
-		return (WALK_ERR);
-	}
-
-	if (mdb_vread(&zio, sizeof (zio), wsp->walk_addr) == -1) {
-		mdb_warn("failed to read zio_t at %p", wsp->walk_addr);
-		return (WALK_ERR);
-	}
-
-	wsp->walk_addr = (uintptr_t)zio.io_child;
-	return (WALK_NEXT);
-}
-
-static int
-zio_sibling_walk_step(mdb_walk_state_t *wsp)
-{
-	zio_t zio;
-	int status;
-
-	if (wsp->walk_addr == NULL)
-		return (WALK_DONE);
-
-	if (mdb_vread(&zio, sizeof (zio), wsp->walk_addr) == -1) {
-		mdb_warn("failed to read zio_t at %p", wsp->walk_addr);
-		return (WALK_ERR);
-	}
-
-	status = wsp->walk_callback(wsp->walk_addr, &zio, wsp->walk_cbdata);
-
-	wsp->walk_addr = (uintptr_t)zio.io_sibling_next;
-	return (status);
-}
-
-/*
  * [addr]::walk zio_root
  *
  * Walk only root zio_t structures, optionally for a particular spa_t.
@@ -1941,7 +1966,9 @@
 	if (wsp->walk_data != NULL && wsp->walk_data != zio.io_spa)
 		return (WALK_NEXT);
 
-	if ((uintptr_t)zio.io_parent != NULL)
+	/* If the parent list is not empty, ignore */
+	if (zio.io_parent_list.list_head.list_next !=
+	    &((zio_t *)wsp->walk_addr)->io_parent_list.list_head)
 		return (WALK_NEXT);
 
 	return (wsp->walk_callback(wsp->walk_addr, &zio, wsp->walk_cbdata));
@@ -2129,23 +2156,27 @@
 	{ "dbuf", ":", "print dmu_buf_impl_t", dbuf },
 	{ "dbuf_stats", ":", "dbuf stats", dbuf_stats },
 	{ "dbufs",
-	"\t[-O objset_impl_t*] [-n objset_name | \"mos\"] "
-	"[-o object | \"mdn\"] \n"
-	"\t[-l level] [-b blkid | \"bonus\"]",
-	"find dmu_buf_impl_t's that match specified criteria", dbufs },
+	    "\t[-O objset_impl_t*] [-n objset_name | \"mos\"] "
+	    "[-o object | \"mdn\"] \n"
+	    "\t[-l level] [-b blkid | \"bonus\"]",
+	    "find dmu_buf_impl_t's that match specified criteria", dbufs },
 	{ "abuf_find", "dva_word[0] dva_word[1]",
-	"find arc_buf_hdr_t of a specified DVA",
-	abuf_find },
+	    "find arc_buf_hdr_t of a specified DVA",
+	    abuf_find },
 	{ "spa", "?[-cv]", "spa_t summary", spa_print },
 	{ "spa_config", ":", "print spa_t configuration", spa_print_config },
 	{ "spa_verify", ":", "verify spa_t consistency", spa_verify },
 	{ "spa_space", ":[-b]", "print spa_t on-disk space usage", spa_space },
 	{ "spa_vdevs", ":", "given a spa_t, print vdev summary", spa_vdevs },
 	{ "vdev", ":[-re]\n"
-	"\t-r display recursively\n"
-	"\t-e print statistics\n",
-	"vdev_t summary", vdev_print },
-	{ "zio", ":", "zio_t summary", zio_print },
+	    "\t-r display recursively\n"
+	    "\t-e print statistics",
+	    "vdev_t summary", vdev_print },
+	{ "zio", ":[cpr]\n"
+	    "\t-c display children\n"
+	    "\t-p display parents\n"
+	    "\t-r display recursively",
+	    "zio_t summary", zio_print },
 	{ "zio_state", "?", "print out all zio_t structures on system or "
 	    "for a particular pool", zio_state },
 	{ "zio_pipeline", ":", "decode a zio pipeline", zio_pipeline },
@@ -2179,8 +2210,6 @@
 		txg_list3_walk_init, txg_list_walk_step, NULL },
 	{ "zio", "walk all zio structures, optionally for a particular spa_t",
 		zio_walk_init, zio_walk_step, NULL },
-	{ "zio_child", "walk children of a zio_t structure",
-		zio_child_walk_init, zio_sibling_walk_step, NULL },
 	{ "zio_root", "walk all root zio_t structures, optionally for a "
 	    "particular spa_t",
 		zio_walk_init, zio_walk_root_step, NULL },
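
The reworked ::zio dcmd above no longer follows a singly-linked child list; it recurses over the new parent/child link lists and bounds the recursion with zpa_min_depth/zpa_max_depth (-c or -p alone shows only immediate relatives, i.e. min = max = 1, while -r removes the bound).  The following stand-alone C sketch models only that depth-bounding logic in user space; the node type and the little tree here are hypothetical stand-ins, not the mdb module's data structures.

#include <limits.h>
#include <stdio.h>

typedef struct node {
        const char *name;
        struct node *kids[4];   /* NULL-terminated, stand-in for io_child_list */
} node_t;

/* Mirrors the zpa_min_depth/zpa_max_depth checks in zio_print_cb(). */
static void
print_tree(const node_t *n, int depth, int min_depth, int max_depth)
{
        if (depth >= min_depth)
                printf("%*s%s\n", depth * 2, "", n->name);
        if (depth >= max_depth)
                return;
        for (int i = 0; n->kids[i] != NULL; i++)
                print_tree(n->kids[i], depth + 1, min_depth, max_depth);
}

int
main(void)
{
        node_t r = { "read", { NULL } };
        node_t w = { "write", { NULL } };
        node_t gang = { "gang", { &r, &w, NULL } };
        node_t root = { "logical", { &gang, NULL } };

        print_tree(&root, 0, 1, 1);             /* like ::zio -c: children only */
        print_tree(&root, 0, 0, INT_MAX);       /* like ::zio -r: whole tree */
        return (0);
}

Within mdb itself the corresponding invocations would be addr::zio -c, addr::zio -p, and addr::zio -r; this sketch deliberately ignores pipe output, CTF lookups, and target reads.
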
--- a/usr/src/uts/common/fs/zfs/arc.c	Tue Jan 27 15:52:26 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/arc.c	Tue Jan 27 16:26:14 2009 -0800
@@ -2548,7 +2548,7 @@
 				acb->acb_private = private;
 				if (pio != NULL)
 					acb->acb_zio_dummy = zio_null(pio,
-					    spa, NULL, NULL, zio_flags);
+					    spa, NULL, NULL, NULL, zio_flags);
 
 				ASSERT(acb->acb_done != NULL);
 				acb->acb_next = hdr->b_acb;
@@ -4008,11 +4008,15 @@
 		 * storage now.  If there *is* a waiter, the caller must
 		 * issue the i/o in a context where it's OK to block.
 		 */
-		if (zio->io_waiter == NULL)
-			zio_nowait(zio_read(zio->io_parent,
-			    cb->l2rcb_spa, &cb->l2rcb_bp,
+		if (zio->io_waiter == NULL) {
+			zio_t *pio = zio_unique_parent(zio);
+
+			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
+
+			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
 			    buf->b_data, zio->io_size, arc_read_done, buf,
 			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
+		}
 	}
 
 	kmem_free(cb, sizeof (l2arc_read_callback_t));
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Tue Jan 27 15:52:26 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Tue Jan 27 16:26:14 2009 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -265,6 +265,13 @@
 #define	ZIO_REEXECUTE_NOW	0x01
 #define	ZIO_REEXECUTE_SUSPEND	0x02
 
+typedef struct zio_link {
+	zio_t		*zl_parent;
+	zio_t		*zl_child;
+	list_node_t	zl_parent_node;
+	list_node_t	zl_child_node;
+} zio_link_t;
+
 struct zio {
 	/* Core information about this I/O */
 	zbookmark_t	io_bookmark;
@@ -275,14 +282,14 @@
 	uint8_t		io_priority;
 	uint8_t		io_reexecute;
 	uint8_t		io_async_root;
+	uint8_t		io_state[ZIO_WAIT_TYPES];
 	uint64_t	io_txg;
 	spa_t		*io_spa;
 	blkptr_t	*io_bp;
 	blkptr_t	io_bp_copy;
-	zio_t		*io_parent;
-	zio_t		*io_child;
-	zio_t		*io_sibling_prev;
-	zio_t		*io_sibling_next;
+	list_t		io_parent_list;
+	list_t		io_child_list;
+	zio_link_t	*io_walk_link;
 	zio_t		*io_logical;
 	zio_transform_t *io_transform_stack;
 
@@ -305,8 +312,6 @@
 	avl_node_t	io_offset_node;
 	avl_node_t	io_deadline_node;
 	avl_tree_t	*io_vdev_tree;
-	zio_t		*io_delegate_list;
-	zio_t		*io_delegate_next;
 
 	/* Internal pipeline state */
 	int		io_flags;
@@ -329,7 +334,7 @@
 	uint64_t	io_ena;
 };
 
-extern zio_t *zio_null(zio_t *pio, spa_t *spa,
+extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
     zio_done_func_t *done, void *private, int flags);
 
 extern zio_t *zio_root(spa_t *spa,
@@ -379,6 +384,11 @@
 extern void zio_execute(zio_t *zio);
 extern void zio_interrupt(zio_t *zio);
 
+extern zio_t *zio_walk_parents(zio_t *cio);
+extern zio_t *zio_walk_children(zio_t *pio);
+extern zio_t *zio_unique_parent(zio_t *cio);
+extern void zio_add_child(zio_t *pio, zio_t *cio);
+
 extern void *zio_buf_alloc(size_t size);
 extern void zio_buf_free(void *buf, size_t size);
 extern void *zio_data_buf_alloc(size_t size);
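
The zio_link_t added to zio.h above is a small object that lives on two lists at once: zl_child_node threads it onto the parent's io_child_list and zl_parent_node onto the child's io_parent_list, which is what lets a zio have any number of parents and children.  Below is a minimal user-space sketch of that dual-membership idea, assuming nothing beyond libc; the singly-linked next pointers are a simplification of the kernel's list_t, and the names are hypothetical.

#include <stdio.h>
#include <stdlib.h>

typedef struct link link_t;

typedef struct znode {
        const char *name;
        link_t *parents;        /* like io_parent_list */
        link_t *children;       /* like io_child_list */
} znode_t;

struct link {                   /* like zio_link_t */
        znode_t *parent;
        znode_t *child;
        link_t *next_on_parents;        /* threads the child's parent list */
        link_t *next_on_children;       /* threads the parent's child list */
};

static void
add_child(znode_t *pio, znode_t *cio)
{
        link_t *zl = malloc(sizeof (*zl));

        zl->parent = pio;
        zl->child = cio;
        zl->next_on_children = pio->children;
        pio->children = zl;
        zl->next_on_parents = cio->parents;
        cio->parents = zl;
}

int
main(void)
{
        znode_t a = { "io-a", NULL, NULL };
        znode_t b = { "io-b", NULL, NULL };
        znode_t probe = { "probe", NULL, NULL };

        /* Two independent I/Os adopt the same child, as in vdev_probe(). */
        add_child(&a, &probe);
        add_child(&b, &probe);

        for (link_t *zl = probe.parents; zl != NULL; zl = zl->next_on_parents)
                printf("%s is a parent of %s\n", zl->parent->name, probe.name);
        for (link_t *zl = a.children; zl != NULL; zl = zl->next_on_children)
                printf("%s is a child of %s\n", zl->child->name, a.name);
        return (0);
}
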
--- a/usr/src/uts/common/fs/zfs/vdev.c	Tue Jan 27 15:52:26 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Tue Jan 27 16:26:14 2009 -0800
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -816,23 +816,22 @@
 	boolean_t	vps_readable;
 	boolean_t	vps_writeable;
 	int		vps_flags;
-	zio_t		*vps_root;
-	vdev_t		*vps_vd;
 } vdev_probe_stats_t;
 
 static void
 vdev_probe_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
+	vdev_t *vd = zio->io_vd;
 	vdev_probe_stats_t *vps = zio->io_private;
-	vdev_t *vd = vps->vps_vd;
+
+	ASSERT(vd->vdev_probe_zio != NULL);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
-		ASSERT(zio->io_vd == vd);
 		if (zio->io_error == 0)
 			vps->vps_readable = 1;
 		if (zio->io_error == 0 && spa_writeable(spa)) {
-			zio_nowait(zio_write_phys(vps->vps_root, vd,
+			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
 			    zio->io_offset, zio->io_size, zio->io_data,
 			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
 			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
@@ -840,13 +839,11 @@
 			zio_buf_free(zio->io_data, zio->io_size);
 		}
 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
-		ASSERT(zio->io_vd == vd);
 		if (zio->io_error == 0)
 			vps->vps_writeable = 1;
 		zio_buf_free(zio->io_data, zio->io_size);
 	} else if (zio->io_type == ZIO_TYPE_NULL) {
-		ASSERT(zio->io_vd == NULL);
-		ASSERT(zio == vps->vps_root);
+		zio_t *pio;
 
 		vd->vdev_cant_read |= !vps->vps_readable;
 		vd->vdev_cant_write |= !vps->vps_writeable;
@@ -860,6 +857,16 @@
 			    spa, vd, NULL, 0, 0);
 			zio->io_error = ENXIO;
 		}
+
+		mutex_enter(&vd->vdev_probe_lock);
+		ASSERT(vd->vdev_probe_zio == zio);
+		vd->vdev_probe_zio = NULL;
+		mutex_exit(&vd->vdev_probe_lock);
+
+		while ((pio = zio_walk_parents(zio)) != NULL)
+			if (!vdev_accessible(vd, pio))
+				pio->io_error = ENXIO;
+
 		kmem_free(vps, sizeof (*vps));
 	}
 }
@@ -870,45 +877,78 @@
  * but the first (which we leave alone in case it contains a VTOC).
  */
 zio_t *
-vdev_probe(vdev_t *vd, zio_t *pio)
+vdev_probe(vdev_t *vd, zio_t *zio)
 {
 	spa_t *spa = vd->vdev_spa;
-	vdev_probe_stats_t *vps;
-	zio_t *zio;
-
-	vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
-
-	vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
-	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY;
-
-	if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
-		/*
-		 * vdev_cant_read and vdev_cant_write can only transition
-		 * from TRUE to FALSE when we have the SCL_ZIO lock as writer;
-		 * otherwise they can only transition from FALSE to TRUE.
-		 * This ensures that any zio looking at these values can
-		 * assume that failures persist for the life of the I/O.
-		 * That's important because when a device has intermittent
-		 * connectivity problems, we want to ensure that they're
-		 * ascribed to the device (ENXIO) and not the zio (EIO).
-		 *
-		 * Since we hold SCL_ZIO as writer here, clear both values
-		 * so the probe can reevaluate from first principles.
-		 */
-		vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
-		vd->vdev_cant_read = B_FALSE;
-		vd->vdev_cant_write = B_FALSE;
-	}
+	vdev_probe_stats_t *vps = NULL;
+	zio_t *pio;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
 
-	zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags);
-
-	vps->vps_root = zio;
-	vps->vps_vd = vd;
+	/*
+	 * Don't probe the probe.
+	 */
+	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
+		return (NULL);
+
+	/*
+	 * To prevent 'probe storms' when a device fails, we create
+	 * just one probe i/o at a time.  All zios that want to probe
+	 * this vdev will become parents of the probe io.
+	 */
+	mutex_enter(&vd->vdev_probe_lock);
+
+	if ((pio = vd->vdev_probe_zio) == NULL) {
+		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
+
+		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
+		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
+		    ZIO_FLAG_DONT_RETRY;
+
+		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
+			/*
+			 * vdev_cant_read and vdev_cant_write can only
+			 * transition from TRUE to FALSE when we have the
+			 * SCL_ZIO lock as writer; otherwise they can only
+			 * transition from FALSE to TRUE.  This ensures that
+			 * any zio looking at these values can assume that
+			 * failures persist for the life of the I/O.  That's
+			 * important because when a device has intermittent
+			 * connectivity problems, we want to ensure that
+			 * they're ascribed to the device (ENXIO) and not
+			 * the zio (EIO).
+			 *
+			 * Since we hold SCL_ZIO as writer here, clear both
+			 * values so the probe can reevaluate from first
+			 * principles.
+			 */
+			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
+			vd->vdev_cant_read = B_FALSE;
+			vd->vdev_cant_write = B_FALSE;
+		}
+
+		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
+		    vdev_probe_done, vps,
+		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
+
+		if (zio != NULL) {
+			vd->vdev_probe_wanted = B_TRUE;
+			spa_async_request(spa, SPA_ASYNC_PROBE);
+		}
+	}
+
+	if (zio != NULL)
+		zio_add_child(zio, pio);
+
+	mutex_exit(&vd->vdev_probe_lock);
+
+	if (vps == NULL) {
+		ASSERT(zio != NULL);
+		return (NULL);
+	}
 
 	for (int l = 1; l < VDEV_LABELS; l++) {
-		zio_nowait(zio_read_phys(zio, vd,
+		zio_nowait(zio_read_phys(pio, vd,
 		    vdev_label_offset(vd->vdev_psize, l,
 		    offsetof(vdev_label_t, vl_pad)),
 		    VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE),
@@ -916,7 +956,11 @@
 		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
 	}
 
-	return (zio);
+	if (zio == NULL)
+		return (pio);
+
+	zio_nowait(pio);
+	return (NULL);
 }
 
 /*
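
The rewritten vdev_probe() above creates at most one probe zio per vdev: the first failing I/O allocates it and publishes it in vd->vdev_probe_zio under vdev_probe_lock, and every later caller simply becomes another parent of that same probe.  A hedged user-space sketch of that create-once, attach-many pattern follows; probe_t, probe_attach(), and the pthread lock are hypothetical stand-ins, not ZFS interfaces.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct probe {
        int nparents;           /* how many I/Os are waiting on this probe */
} probe_t;

static pthread_mutex_t probe_lock = PTHREAD_MUTEX_INITIALIZER;
static probe_t *active_probe;   /* like vd->vdev_probe_zio */

/* First caller creates the probe; later callers just attach to it. */
static probe_t *
probe_attach(const char *who)
{
        probe_t *p;

        pthread_mutex_lock(&probe_lock);
        if ((p = active_probe) == NULL) {
                p = calloc(1, sizeof (*p));
                active_probe = p;
                printf("%s: issued new probe\n", who);
        } else {
                printf("%s: joined existing probe\n", who);
        }
        p->nparents++;
        pthread_mutex_unlock(&probe_lock);
        return (p);
}

/* Completion clears the published pointer so a future failure reprobes. */
static void
probe_done(probe_t *p)
{
        pthread_mutex_lock(&probe_lock);
        active_probe = NULL;
        pthread_mutex_unlock(&probe_lock);
        printf("probe done, notified %d parents\n", p->nparents);
        free(p);
}

int
main(void)
{
        probe_t *p = probe_attach("io-a");
        (void) probe_attach("io-b");    /* second failure reuses the probe */
        probe_done(p);
        return (0);
}
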
--- a/usr/src/uts/common/fs/zfs/vdev_cache.c	Tue Jan 27 15:52:26 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c	Tue Jan 27 16:26:14 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -203,23 +203,23 @@
  * Fill a previously allocated cache entry with data.
  */
 static void
-vdev_cache_fill(zio_t *zio)
+vdev_cache_fill(zio_t *fio)
 {
-	vdev_t *vd = zio->io_vd;
+	vdev_t *vd = fio->io_vd;
 	vdev_cache_t *vc = &vd->vdev_cache;
-	vdev_cache_entry_t *ve = zio->io_private;
-	zio_t *dio;
+	vdev_cache_entry_t *ve = fio->io_private;
+	zio_t *pio;
 
-	ASSERT(zio->io_size == VCBS);
+	ASSERT(fio->io_size == VCBS);
 
 	/*
 	 * Add data to the cache.
 	 */
 	mutex_enter(&vc->vc_lock);
 
-	ASSERT(ve->ve_fill_io == zio);
-	ASSERT(ve->ve_offset == zio->io_offset);
-	ASSERT(ve->ve_data == zio->io_data);
+	ASSERT(ve->ve_fill_io == fio);
+	ASSERT(ve->ve_offset == fio->io_offset);
+	ASSERT(ve->ve_data == fio->io_data);
 
 	ve->ve_fill_io = NULL;
 
@@ -228,20 +228,13 @@
 	 * any reads that were queued up before the missed update are still
 	 * valid, so we can satisfy them from this line before we evict it.
 	 */
-	for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next)
-		vdev_cache_hit(vc, ve, dio);
+	while ((pio = zio_walk_parents(fio)) != NULL)
+		vdev_cache_hit(vc, ve, pio);
 
-	if (zio->io_error || ve->ve_missed_update)
+	if (fio->io_error || ve->ve_missed_update)
 		vdev_cache_evict(vc, ve);
 
 	mutex_exit(&vc->vc_lock);
-
-	while ((dio = zio->io_delegate_list) != NULL) {
-		zio->io_delegate_list = dio->io_delegate_next;
-		dio->io_delegate_next = NULL;
-		dio->io_error = zio->io_error;
-		zio_execute(dio);
-	}
 }
 
 /*
@@ -284,9 +277,8 @@
 		}
 
 		if ((fio = ve->ve_fill_io) != NULL) {
-			zio->io_delegate_next = fio->io_delegate_list;
-			fio->io_delegate_list = zio;
 			zio_vdev_io_bypass(zio);
+			zio_add_child(zio, fio);
 			mutex_exit(&vc->vc_lock);
 			VDCSTAT_BUMP(vdc_stat_delegations);
 			return (0);
@@ -296,7 +288,6 @@
 		zio_vdev_io_bypass(zio);
 
 		mutex_exit(&vc->vc_lock);
-		zio_execute(zio);
 		VDCSTAT_BUMP(vdc_stat_hits);
 		return (0);
 	}
@@ -313,8 +304,8 @@
 	    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
 
 	ve->ve_fill_io = fio;
-	fio->io_delegate_list = zio;
 	zio_vdev_io_bypass(zio);
+	zio_add_child(zio, fio);
 
 	mutex_exit(&vc->vc_lock);
 	zio_nowait(fio);
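
The cache code above no longer re-executes delegated readers by hand: each reader that joins an in-flight fill becomes a parent of the fill zio, and the generic parent/child accounting (the io_children counters plus zio_notify_parent()) releases it when its child completes.  The toy below models only that count-to-zero interlock; the types and functions are hypothetical, not the real zio machinery.

#include <stdio.h>

typedef struct parent {
        const char *name;
        int children_outstanding;       /* like io_children[...][ZIO_WAIT_DONE] */
} parent_t;

static void
add_child(parent_t *pio)
{
        pio->children_outstanding++;
}

/* Like zio_notify_parent(): the last child to finish resumes the parent. */
static void
notify_parent(parent_t *pio)
{
        if (--pio->children_outstanding == 0)
                printf("%s: all children done, resuming pipeline\n", pio->name);
}

int
main(void)
{
        parent_t reader = { "cached-read", 0 };

        add_child(&reader);     /* the shared cache-fill I/O */
        notify_parent(&reader); /* fill completes, reader resumes */
        return (0);
}
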
--- a/usr/src/uts/common/fs/zfs/vdev_label.c	Tue Jan 27 15:52:26 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c	Tue Jan 27 16:26:14 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -961,7 +961,7 @@
 	for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
 		uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
 		    KM_SLEEP);
-		zio_t *vio = zio_null(zio, spa,
+		zio_t *vio = zio_null(zio, spa, NULL,
 		    (vd->vdev_islog || vd->vdev_aux != NULL) ?
 		    vdev_label_sync_ignore_done : vdev_label_sync_top_done,
 		    good_writes, flags);
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c	Tue Jan 27 15:52:26 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c	Tue Jan 27 16:26:14 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -180,11 +180,16 @@
 	mirror_child_t *mc = zio->io_private;
 
 	if (zio->io_error == 0) {
-		zio_t *pio = zio->io_parent;
-		mutex_enter(&pio->io_lock);
-		ASSERT3U(zio->io_size, >=, pio->io_size);
-		bcopy(zio->io_data, pio->io_data, pio->io_size);
-		mutex_exit(&pio->io_lock);
+		zio_t *pio;
+
+		mutex_enter(&zio->io_lock);
+		while ((pio = zio_walk_parents(zio)) != NULL) {
+			mutex_enter(&pio->io_lock);
+			ASSERT3U(zio->io_size, >=, pio->io_size);
+			bcopy(zio->io_data, pio->io_data, pio->io_size);
+			mutex_exit(&pio->io_lock);
+		}
+		mutex_exit(&zio->io_lock);
 	}
 
 	zio_buf_free(zio->io_data, zio->io_size);
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c	Tue Jan 27 15:52:26 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c	Tue Jan 27 16:26:14 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -149,20 +149,12 @@
 static void
 vdev_queue_agg_io_done(zio_t *aio)
 {
-	zio_t *dio;
-	uint64_t offset = 0;
+	zio_t *pio;
 
-	while ((dio = aio->io_delegate_list) != NULL) {
+	while ((pio = zio_walk_parents(aio)) != NULL)
 		if (aio->io_type == ZIO_TYPE_READ)
-			bcopy((char *)aio->io_data + offset, dio->io_data,
-			    dio->io_size);
-		offset += dio->io_size;
-		aio->io_delegate_list = dio->io_delegate_next;
-		dio->io_delegate_next = NULL;
-		dio->io_error = aio->io_error;
-		zio_execute(dio);
-	}
-	ASSERT3U(offset, ==, aio->io_size);
+			bcopy((char *)aio->io_data + (pio->io_offset -
+			    aio->io_offset), pio->io_data, pio->io_size);
 
 	zio_buf_free(aio->io_data, aio->io_size);
 }
@@ -173,8 +165,8 @@
 static zio_t *
 vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 {
-	zio_t *fio, *lio, *aio, *dio;
-	avl_tree_t *tree;
+	zio_t *fio, *lio, *aio, *dio, *nio;
+	avl_tree_t *t;
 	uint64_t size;
 	int flags;
 
@@ -186,7 +178,7 @@
 
 	fio = lio = avl_first(&vq->vq_deadline_tree);
 
-	tree = fio->io_vdev_tree;
+	t = fio->io_vdev_tree;
 	size = fio->io_size;
 	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
 
@@ -198,55 +190,54 @@
 		 * of the I/O, such as whether it's a normal I/O or a
 		 * scrub/resilver, can be preserved in the aggregate.
 		 */
-		while ((dio = AVL_PREV(tree, fio)) != NULL &&
+		while ((dio = AVL_PREV(t, fio)) != NULL &&
 		    IS_ADJACENT(dio, fio) &&
 		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 		    size + dio->io_size <= zfs_vdev_aggregation_limit) {
-			dio->io_delegate_next = fio;
 			fio = dio;
 			size += dio->io_size;
 		}
-		while ((dio = AVL_NEXT(tree, lio)) != NULL &&
+		while ((dio = AVL_NEXT(t, lio)) != NULL &&
 		    IS_ADJACENT(lio, dio) &&
 		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 		    size + dio->io_size <= zfs_vdev_aggregation_limit) {
-			lio->io_delegate_next = dio;
 			lio = dio;
 			size += dio->io_size;
 		}
 	}
 
 	if (fio != lio) {
-		char *buf = zio_buf_alloc(size);
-		uint64_t offset = 0;
-
 		ASSERT(size <= zfs_vdev_aggregation_limit);
 
 		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
-		    buf, size, fio->io_type, ZIO_PRIORITY_NOW,
+		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
 		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 		    vdev_queue_agg_io_done, NULL);
 
-		aio->io_delegate_list = fio;
-
-		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
+		/* We want to process lio, then stop */
+		lio = AVL_NEXT(t, lio);
+		for (dio = fio; dio != lio; dio = nio) {
 			ASSERT(dio->io_type == aio->io_type);
-			ASSERT(dio->io_vdev_tree == tree);
+			ASSERT(dio->io_vdev_tree == t);
+
 			if (dio->io_type == ZIO_TYPE_WRITE)
-				bcopy(dio->io_data, buf + offset, dio->io_size);
-			offset += dio->io_size;
+				bcopy(dio->io_data, (char *)aio->io_data +
+				    (dio->io_offset - aio->io_offset),
+				    dio->io_size);
+			nio = AVL_NEXT(t, dio);
+
+			zio_add_child(dio, aio);
 			vdev_queue_io_remove(vq, dio);
 			zio_vdev_io_bypass(dio);
+			zio_execute(dio);
 		}
 
-		ASSERT(offset == size);
-
 		avl_add(&vq->vq_pending_tree, aio);
 
 		return (aio);
 	}
 
-	ASSERT(fio->io_vdev_tree == tree);
+	ASSERT(fio->io_vdev_tree == t);
 	vdev_queue_io_remove(vq, fio);
 
 	avl_add(&vq->vq_pending_tree, fio);
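
In the new vdev_queue code above, an aggregated read is scattered back to its delegated parents using each parent's own offset relative to the aggregate (pio->io_offset - aio->io_offset) rather than a running counter.  A tiny user-space illustration of just that arithmetic, with hypothetical struct and variable names and memcpy standing in for bcopy:

#include <stdio.h>
#include <string.h>

typedef struct dio {
        size_t io_offset;       /* byte offset on the vdev */
        size_t io_size;
        char io_data[8];
} dio_t;

int
main(void)
{
        /* One aggregate read covering bytes 1000..1007 of the device. */
        const char aio_data[] = "ABCDEFGH";
        size_t aio_offset = 1000;

        /* Two adjacent delegated reads that were folded into it. */
        dio_t dios[] = {
                { 1000, 3, "" },
                { 1003, 5, "" },
        };

        for (size_t i = 0; i < sizeof (dios) / sizeof (dios[0]); i++) {
                dio_t *d = &dios[i];

                /* Same arithmetic as vdev_queue_agg_io_done(). */
                memcpy(d->io_data, aio_data + (d->io_offset - aio_offset),
                    d->io_size);
                printf("child at %zu: %.*s\n", d->io_offset,
                    (int)d->io_size, d->io_data);
        }
        return (0);
}
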
--- a/usr/src/uts/common/fs/zfs/zio.c	Tue Jan 27 15:52:26 2009 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c	Tue Jan 27 16:26:14 2009 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -69,6 +69,7 @@
  * ==========================================================================
  */
 kmem_cache_t *zio_cache;
+kmem_cache_t *zio_link_cache;
 kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
 
@@ -92,8 +93,10 @@
 #ifdef _KERNEL
 	data_alloc_arena = zio_alloc_arena;
 #endif
-	zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0,
-	    NULL, NULL, NULL, NULL, NULL, 0);
+	zio_cache = kmem_cache_create("zio_cache",
+	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+	zio_link_cache = kmem_cache_create("zio_link_cache",
+	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	/*
 	 * For small buffers, we want a cache for each multiple of
@@ -164,6 +167,7 @@
 		zio_data_buf_cache[c] = NULL;
 	}
 
+	kmem_cache_destroy(zio_link_cache);
 	kmem_cache_destroy(zio_cache);
 
 	zio_inject_fini();
@@ -298,41 +302,102 @@
  * I/O parent/child relationships and pipeline interlocks
  * ==========================================================================
  */
+/*
+ * NOTE - Callers to zio_walk_parents() and zio_walk_children() must
+ *        continue calling these functions until they return NULL.
+ *        Otherwise, the next caller will pick up the list walk in
+ *        some indeterminate state.  (The alternative would be for
+ *        every caller to pass in a cookie to keep the state
+ *        represented by io_walk_link, which gets annoying.)
+ */
+zio_t *
+zio_walk_parents(zio_t *cio)
+{
+	zio_link_t *zl = cio->io_walk_link;
+	list_t *pl = &cio->io_parent_list;
 
-static void
-zio_add_child(zio_t *pio, zio_t *zio)
+	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
+	cio->io_walk_link = zl;
+
+	if (zl == NULL)
+		return (NULL);
+
+	ASSERT(zl->zl_child == cio);
+	return (zl->zl_parent);
+}
+
+zio_t *
+zio_walk_children(zio_t *pio)
 {
+	zio_link_t *zl = pio->io_walk_link;
+	list_t *cl = &pio->io_child_list;
+
+	zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
+	pio->io_walk_link = zl;
+
+	if (zl == NULL)
+		return (NULL);
+
+	ASSERT(zl->zl_parent == pio);
+	return (zl->zl_child);
+}
+
+zio_t *
+zio_unique_parent(zio_t *cio)
+{
+	zio_t *pio = zio_walk_parents(cio);
+
+	VERIFY(zio_walk_parents(cio) == NULL);
+	return (pio);
+}
+
+void
+zio_add_child(zio_t *pio, zio_t *cio)
+{
+	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
+
+	/*
+	 * Logical I/Os can have logical, gang, or vdev children.
+	 * Gang I/Os can have gang or vdev children.
+	 * Vdev I/Os can only have vdev children.
+	 * The following ASSERT captures all of these constraints.
+	 */
+	ASSERT(cio->io_child_type <= pio->io_child_type);
+
+	zl->zl_parent = pio;
+	zl->zl_child = cio;
+
+	mutex_enter(&cio->io_lock);
 	mutex_enter(&pio->io_lock);
-	if (zio->io_stage < ZIO_STAGE_READY)
-		pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
-	if (zio->io_stage < ZIO_STAGE_DONE)
-		pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
-	zio->io_sibling_prev = NULL;
-	zio->io_sibling_next = pio->io_child;
-	if (pio->io_child != NULL)
-		pio->io_child->io_sibling_prev = zio;
-	pio->io_child = zio;
-	zio->io_parent = pio;
+
+	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
+
+	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
+
+	list_insert_head(&pio->io_child_list, zl);
+	list_insert_head(&cio->io_parent_list, zl);
+
 	mutex_exit(&pio->io_lock);
+	mutex_exit(&cio->io_lock);
 }
 
 static void
-zio_remove_child(zio_t *pio, zio_t *zio)
+zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 {
-	zio_t *next, *prev;
+	ASSERT(zl->zl_parent == pio);
+	ASSERT(zl->zl_child == cio);
 
-	ASSERT(zio->io_parent == pio);
-
+	mutex_enter(&cio->io_lock);
 	mutex_enter(&pio->io_lock);
-	next = zio->io_sibling_next;
-	prev = zio->io_sibling_prev;
-	if (next != NULL)
-		next->io_sibling_prev = prev;
-	if (prev != NULL)
-		prev->io_sibling_next = next;
-	if (pio->io_child == zio)
-		pio->io_child = next;
+
+	list_remove(&pio->io_child_list, zl);
+	list_remove(&cio->io_parent_list, zl);
+
 	mutex_exit(&pio->io_lock);
+	mutex_exit(&cio->io_lock);
+
+	kmem_cache_free(zio_link_cache, zl);
 }
 
 static boolean_t
@@ -407,6 +472,11 @@
 	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 
+	list_create(&zio->io_parent_list, sizeof (zio_link_t),
+	    offsetof(zio_link_t, zl_parent_node));
+	list_create(&zio->io_child_list, sizeof (zio_link_t),
+	    offsetof(zio_link_t, zl_child_node));
+
 	if (vd != NULL)
 		zio->io_child_type = ZIO_CHILD_VDEV;
 	else if (flags & ZIO_FLAG_GANG_CHILD)
@@ -441,17 +511,13 @@
 	zio->io_orig_stage = zio->io_stage = stage;
 	zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 
+	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
+	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
+
 	if (zb != NULL)
 		zio->io_bookmark = *zb;
 
 	if (pio != NULL) {
-		/*
-		 * Logical I/Os can have logical, gang, or vdev children.
-		 * Gang I/Os can have gang or vdev children.
-		 * Vdev I/Os can only have vdev children.
-		 * The following ASSERT captures all of these constraints.
-		 */
-		ASSERT(zio->io_child_type <= pio->io_child_type);
 		if (zio->io_logical == NULL)
 			zio->io_logical = pio->io_logical;
 		zio_add_child(pio, zio);
@@ -466,6 +532,8 @@
 	spa_t *spa = zio->io_spa;
 	uint8_t async_root = zio->io_async_root;
 
+	list_destroy(&zio->io_parent_list);
+	list_destroy(&zio->io_child_list);
 	mutex_destroy(&zio->io_lock);
 	cv_destroy(&zio->io_cv);
 	kmem_cache_free(zio_cache, zio);
@@ -479,13 +547,13 @@
 }
 
 zio_t *
-zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private,
-	int flags)
+zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
+    void *private, int flags)
 {
 	zio_t *zio;
 
 	zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
-	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL,
+	    ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 	    ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 
 	return (zio);
@@ -494,7 +562,7 @@
 zio_t *
 zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
 {
-	return (zio_null(NULL, spa, done, private, flags));
+	return (zio_null(NULL, spa, NULL, done, private, flags));
 }
 
 zio_t *
@@ -573,12 +641,12 @@
 	ASSERT(!BP_IS_HOLE(bp));
 
 	if (bp->blk_fill == BLK_FILL_ALREADY_FREED)
-		return (zio_null(pio, spa, NULL, NULL, flags));
+		return (zio_null(pio, spa, NULL, NULL, NULL, flags));
 
 	if (txg == spa->spa_syncing_txg &&
 	    spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) {
 		bplist_enqueue_deferred(&spa->spa_sync_bplist, bp);
-		return (zio_null(pio, spa, NULL, NULL, flags));
+		return (zio_null(pio, spa, NULL, NULL, NULL, flags));
 	}
 
 	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
@@ -629,7 +697,7 @@
 
 		zio->io_cmd = cmd;
 	} else {
-		zio = zio_null(pio, spa, NULL, NULL, flags);
+		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 
 		for (c = 0; c < vd->vdev_children; c++)
 			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
@@ -1020,11 +1088,12 @@
 {
 	ASSERT(zio->io_executor == NULL);
 
-	if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) {
+	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
+	    zio_unique_parent(zio) == NULL) {
 		/*
 		 * This is a logical async I/O with no parent to wait for it.
-		 * Attach it to the pool's global async root zio so that
-		 * spa_unload() has a way of waiting for async I/O to finish.
+		 * Track how many outstanding I/Os of this type exist so
+		 * that spa_unload() knows when they are all done.
 		 */
 		spa_t *spa = zio->io_spa;
 		zio->io_async_root = B_TRUE;
@@ -1045,13 +1114,18 @@
 static void
 zio_reexecute(zio_t *pio)
 {
-	zio_t *zio, *zio_next;
+	zio_t *cio, *cio_next;
+
+	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
+	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
 
 	pio->io_flags = pio->io_orig_flags;
 	pio->io_stage = pio->io_orig_stage;
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
 	pio->io_error = 0;
+	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+		pio->io_state[w] = 0;
 	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
 		pio->io_child_error[c] = 0;
 
@@ -1071,18 +1145,18 @@
 
 	/*
 	 * As we reexecute pio's children, new children could be created.
-	 * New children go to the head of the io_child list, however,
+	 * New children go to the head of pio's io_child_list, however,
 	 * so we will (correctly) not reexecute them.  The key is that
-	 * the remainder of the io_child list, from 'zio_next' onward,
-	 * cannot be affected by any side effects of reexecuting 'zio'.
+	 * the remainder of pio's io_child_list, from 'cio_next' onward,
+	 * cannot be affected by any side effects of reexecuting 'cio'.
 	 */
-	for (zio = pio->io_child; zio != NULL; zio = zio_next) {
-		zio_next = zio->io_sibling_next;
+	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
+		cio_next = zio_walk_children(pio);
 		mutex_enter(&pio->io_lock);
-		pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++;
-		pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++;
+		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+			pio->io_children[cio->io_child_type][w]++;
 		mutex_exit(&pio->io_lock);
-		zio_reexecute(zio);
+		zio_reexecute(cio);
 	}
 
 	/*
@@ -1111,7 +1185,7 @@
 	if (zio != NULL) {
 		ASSERT(zio != spa->spa_suspend_zio_root);
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
-		ASSERT(zio->io_parent == NULL);
+		ASSERT(zio_unique_parent(zio) == NULL);
 		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
 		zio_add_child(spa->spa_suspend_zio_root, zio);
 	}
@@ -1122,7 +1196,7 @@
 void
 zio_resume(spa_t *spa)
 {
-	zio_t *pio, *zio;
+	zio_t *pio, *cio, *cio_next;
 
 	/*
 	 * Reexecute all previously suspended i/o.
@@ -1137,10 +1211,11 @@
 	if (pio == NULL)
 		return;
 
-	while ((zio = pio->io_child) != NULL) {
-		zio_remove_child(pio, zio);
-		zio->io_parent = NULL;
-		zio_reexecute(zio);
+	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
+		zio_link_t *zl = pio->io_walk_link;
+		cio_next = zio_walk_children(pio);
+		zio_remove_child(pio, cio, zl);
+		zio_reexecute(cio);
 	}
 
 	ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0);
@@ -1352,9 +1427,10 @@
 	zio_t *lio = zio->io_logical;
 	zio_gang_node_t *gn = zio->io_private;
 	blkptr_t *bp = zio->io_bp;
+	zio_t *pio = zio_unique_parent(zio);
 
-	ASSERT(zio->io_parent == lio);
-	ASSERT(zio->io_child == NULL);
+	ASSERT(pio == lio);
+	ASSERT(zio_walk_children(zio) == NULL);
 
 	if (zio->io_error)
 		return;
@@ -1445,7 +1521,7 @@
 static void
 zio_write_gang_member_ready(zio_t *zio)
 {
-	zio_t *pio = zio->io_parent;
+	zio_t *pio = zio_unique_parent(zio);
 	zio_t *lio = zio->io_logical;
 	dva_t *cdva = zio->io_bp->blk_dva;
 	dva_t *pdva = pio->io_bp->blk_dva;
@@ -1690,72 +1766,6 @@
  * Read and write to physical devices
  * ==========================================================================
  */
-
-static void
-zio_vdev_io_probe_done(zio_t *zio)
-{
-	zio_t *dio;
-	vdev_t *vd = zio->io_private;
-
-	mutex_enter(&vd->vdev_probe_lock);
-	ASSERT(vd->vdev_probe_zio == zio);
-	vd->vdev_probe_zio = NULL;
-	mutex_exit(&vd->vdev_probe_lock);
-
-	while ((dio = zio->io_delegate_list) != NULL) {
-		zio->io_delegate_list = dio->io_delegate_next;
-		dio->io_delegate_next = NULL;
-		if (!vdev_accessible(vd, dio))
-			dio->io_error = ENXIO;
-		zio_execute(dio);
-	}
-}
-
-/*
- * Probe the device to determine whether I/O failure is specific to this
- * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged).
- */
-static int
-zio_vdev_io_probe(zio_t *zio)
-{
-	vdev_t *vd = zio->io_vd;
-	zio_t *pio = NULL;
-	boolean_t created_pio = B_FALSE;
-
-	/*
-	 * Don't probe the probe.
-	 */
-	if (zio->io_flags & ZIO_FLAG_PROBE)
-		return (ZIO_PIPELINE_CONTINUE);
-
-	/*
-	 * To prevent 'probe storms' when a device fails, we create
-	 * just one probe i/o at a time.  All zios that want to probe
-	 * this vdev will join the probe zio's io_delegate_list.
-	 */
-	mutex_enter(&vd->vdev_probe_lock);
-
-	if ((pio = vd->vdev_probe_zio) == NULL) {
-		vd->vdev_probe_zio = pio = zio_root(zio->io_spa,
-		    zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL);
-		created_pio = B_TRUE;
-		vd->vdev_probe_wanted = B_TRUE;
-		spa_async_request(zio->io_spa, SPA_ASYNC_PROBE);
-	}
-
-	zio->io_delegate_next = pio->io_delegate_list;
-	pio->io_delegate_list = zio;
-
-	mutex_exit(&vd->vdev_probe_lock);
-
-	if (created_pio) {
-		zio_nowait(vdev_probe(vd, pio));
-		zio_nowait(pio);
-	}
-
-	return (ZIO_PIPELINE_STOP);
-}
-
 static int
 zio_vdev_io_start(zio_t *zio)
 {
@@ -1811,7 +1821,6 @@
 	    zio->io_txg != 0 &&	/* not a delegated i/o */
 	    !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
-		ASSERT(zio->io_delegate_list == NULL);
 		zio_vdev_io_bypass(zio);
 		return (ZIO_PIPELINE_CONTINUE);
 	}
@@ -1820,7 +1829,7 @@
 	    (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
 
 		if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
-			return (ZIO_PIPELINE_STOP);
+			return (ZIO_PIPELINE_CONTINUE);
 
 		if ((zio = vdev_queue_io(zio)) == NULL)
 			return (ZIO_PIPELINE_STOP);
@@ -1872,7 +1881,7 @@
 	ops->vdev_op_io_done(zio);
 
 	if (unexpected_error)
-		return (zio_vdev_io_probe(zio));
+		VERIFY(vdev_probe(vd, zio) == NULL);
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
@@ -2068,7 +2077,7 @@
 zio_ready(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	zio_t *pio = zio->io_parent;
+	zio_t *pio, *pio_next;
 
 	if (zio->io_ready) {
 		if (BP_IS_GANG(bp) &&
@@ -2088,8 +2097,22 @@
 	if (zio->io_error)
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
-	if (pio != NULL)
+	mutex_enter(&zio->io_lock);
+	zio->io_state[ZIO_WAIT_READY] = 1;
+	pio = zio_walk_parents(zio);
+	mutex_exit(&zio->io_lock);
+
+	/*
+	 * As we notify zio's parents, new parents could be added.
+	 * New parents go to the head of zio's io_parent_list, however,
+	 * so we will (correctly) not notify them.  The remainder of zio's
+	 * io_parent_list, from 'pio_next' onward, cannot change because
+	 * all parents must wait for us to be done before they can be done.
+	 */
+	for (; pio != NULL; pio = pio_next) {
+		pio_next = zio_walk_parents(zio);
 		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
+	}
 
 	return (ZIO_PIPELINE_CONTINUE);
 }
@@ -2098,11 +2121,11 @@
 zio_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
-	zio_t *pio = zio->io_parent;
 	zio_t *lio = zio->io_logical;
 	blkptr_t *bp = zio->io_bp;
 	vdev_t *vd = zio->io_vd;
 	uint64_t psize = zio->io_size;
+	zio_t *pio, *pio_next;
 
 	/*
 	 * If our children haven't all completed,
@@ -2122,7 +2145,7 @@
 		ASSERT(bp->blk_pad[1] == 0);
 		ASSERT(bp->blk_pad[2] == 0);
 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
-		    (pio != NULL && bp == pio->io_bp));
+		    (bp == zio_unique_parent(zio)->io_bp));
 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
 		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
@@ -2217,7 +2240,11 @@
 
 		zio_gang_tree_free(&zio->io_gang_tree);
 
-		if (pio != NULL) {
+		mutex_enter(&zio->io_lock);
+		zio->io_state[ZIO_WAIT_DONE] = 1;
+		mutex_exit(&zio->io_lock);
+
+		if ((pio = zio_unique_parent(zio)) != NULL) {
 			/*
 			 * We're not a root i/o, so there's nothing to do
 			 * but notify our parent.  Don't propagate errors
@@ -2243,20 +2270,28 @@
 		return (ZIO_PIPELINE_STOP);
 	}
 
-	ASSERT(zio->io_child == NULL);
+	ASSERT(zio_walk_children(zio) == NULL);
 	ASSERT(zio->io_reexecute == 0);
 	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
 
+	/*
+	 * It is the responsibility of the done callback to ensure that this
+	 * particular zio is no longer discoverable for adoption, and as
+	 * such, cannot acquire any new parents.
+	 */
 	if (zio->io_done)
 		zio->io_done(zio);
 
 	zio_gang_tree_free(&zio->io_gang_tree);
 
-	ASSERT(zio->io_delegate_list == NULL);
-	ASSERT(zio->io_delegate_next == NULL);
+	mutex_enter(&zio->io_lock);
+	zio->io_state[ZIO_WAIT_DONE] = 1;
+	mutex_exit(&zio->io_lock);
 
-	if (pio != NULL) {
-		zio_remove_child(pio, zio);
+	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
+		zio_link_t *zl = zio->io_walk_link;
+		pio_next = zio_walk_parents(zio);
+		zio_remove_child(pio, zio, zl);
 		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
 	}
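
Per the NOTE added to zio.c above, the walk state for zio_walk_parents()/zio_walk_children() lives in the zio itself (io_walk_link), so a caller must always iterate until NULL or it leaves the cursor mid-list for the next caller.  The self-contained model below illustrates that convention; the array and integer cursor are hypothetical stand-ins for the real list and io_walk_link.

#include <stdio.h>

typedef struct node {
        const char *parents[4]; /* NULL-terminated, like io_parent_list */
        int walk_link;          /* like io_walk_link: cursor kept in the node */
} node_t;

/* Returns the next parent, or NULL (and resets the cursor) at the end. */
static const char *
walk_parents(node_t *n)
{
        const char *p = n->parents[n->walk_link];

        if (p == NULL) {
                n->walk_link = 0;
                return (NULL);
        }
        n->walk_link++;
        return (p);
}

int
main(void)
{
        node_t n = { { "logical", "gang", NULL }, 0 };
        const char *p;

        /* Correct usage: always run the walk to completion. */
        while ((p = walk_parents(&n)) != NULL)
                printf("parent: %s\n", p);

        /*
         * Because the first walk reached NULL, this second walk starts
         * from the head again; stopping early would have broken it.
         */
        while ((p = walk_parents(&n)) != NULL)
                printf("parent (again): %s\n", p);
        return (0);
}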