6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
author billm
Mon, 10 Apr 2006 05:03:38 -0700
changeset 1775 e51e26b432c0
parent 1774 274a4306dfe0
child 1776 779af7da6661
6410698 ZFS metadata needs to be more highly replicated (ditto blocks)
6410700 zdb should support reading raw blocks out of storage pool
6410709 ztest: spa config can change before pool export
usr/src/cmd/mdb/common/modules/zfs/zfs.c
usr/src/cmd/zdb/zdb.c
usr/src/cmd/zpool/zpool_main.c
usr/src/cmd/ztest/ztest.c
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/dmu_objset.c
usr/src/uts/common/fs/zfs/dsl_pool.c
usr/src/uts/common/fs/zfs/metaslab.c
usr/src/uts/common/fs/zfs/spa.c
usr/src/uts/common/fs/zfs/spa_misc.c
usr/src/uts/common/fs/zfs/sys/arc.h
usr/src/uts/common/fs/zfs/sys/dmu.h
usr/src/uts/common/fs/zfs/sys/metaslab.h
usr/src/uts/common/fs/zfs/sys/spa.h
usr/src/uts/common/fs/zfs/sys/vdev.h
usr/src/uts/common/fs/zfs/sys/zio.h
usr/src/uts/common/fs/zfs/sys/zio_impl.h
usr/src/uts/common/fs/zfs/vdev.c
usr/src/uts/common/fs/zfs/vdev_mirror.c
usr/src/uts/common/fs/zfs/vdev_raidz.c
usr/src/uts/common/fs/zfs/vdev_root.c
usr/src/uts/common/fs/zfs/zfs_ioctl.c
usr/src/uts/common/fs/zfs/zio.c
usr/src/uts/common/fs/zfs/zio_checksum.c
usr/src/uts/common/sys/fs/zfs.h
--- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c	Mon Apr 10 05:03:38 2006 -0700
@@ -437,20 +437,28 @@
 		zct[i].ci_name = local_strdup(buf);
 	}
 
-	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
+	/*
+	 * Super-ick warning:  This code is also duplicated in
+	 * cmd/zdb/zdb.c.  Yeah, I hate code replication, too.
+	 */
+	for (i = 0; i < BP_GET_NDVAS(&bp); i++) {
 		dva_t *dva = &bp.blk_dva[i];
-		mdb_printf("DVA[%d]: GANG: %-5s  GRID: %2x  ASIZE: %5x  "
-		    "vdev %llu  offset %llx\n",
-		    i,
-		    DVA_GET_GANG(dva) ? "TRUE" : "FALSE",
-		    DVA_GET_GRID(dva),
-		    DVA_GET_ASIZE(dva),
-		    DVA_GET_VDEV(dva),
-		    DVA_GET_OFFSET(dva));
+
+		mdb_printf("DVA[%d]: vdev_id %lld / %llx\n", i,
+		    DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva));
+		mdb_printf("DVA[%d]:       GANG: %-5s  GRID:  %04x\t"
+		    "ASIZE: %llx\n", i, DVA_GET_GANG(dva) ? "TRUE" : "FALSE",
+		    DVA_GET_GRID(dva), DVA_GET_ASIZE(dva));
+		mdb_printf("DVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", i,
+		    DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), BP_GET_PSIZE(&bp),
+		    BP_SHOULD_BYTESWAP(&bp) ? "e" : "",
+		    !DVA_GET_GANG(dva) && BP_GET_LEVEL(&bp) != 0 ? "i" : "",
+		    DVA_GET_GANG(dva) ? "g" : "",
+		    BP_GET_COMPRESS(&bp) != 0 ? "d" : "");
 	}
 	mdb_printf("LSIZE:  %-16llx\t\tPSIZE: %llx\n",
 	    BP_GET_LSIZE(&bp), BP_GET_PSIZE(&bp));
-	mdb_printf("ENDIAN: %-6s  TYPE: %s\n",
+	mdb_printf("ENDIAN: %6s\t\t\t\t\tTYPE:  %s\n",
 	    BP_GET_BYTEORDER(&bp) ? "LITTLE" : "BIG",
 	    doti[BP_GET_TYPE(&bp)].ot_name);
 	mdb_printf("BIRTH:  %-16llx   LEVEL: %-2d\tFILL:  %llx\n",
--- a/usr/src/cmd/zdb/zdb.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/cmd/zdb/zdb.c	Mon Apr 10 05:03:38 2006 -0700
@@ -27,6 +27,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <ctype.h>
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
@@ -84,8 +85,9 @@
 	    "Usage: %s [-udibcsvLU] [-O order] [-B os:obj:level:blkid] "
 	    "dataset [object...]\n"
 	    "       %s -C [pool]\n"
-	    "       %s -l dev\n",
-	    cmdname, cmdname, cmdname);
+	    "       %s -l dev\n"
+	    "       %s -R vdev:offset:size:flags\n",
+	    cmdname, cmdname, cmdname, cmdname);
 
 	(void) fprintf(stderr, "	-u uberblock\n");
 	(void) fprintf(stderr, "	-d datasets\n");
@@ -102,6 +104,8 @@
 	(void) fprintf(stderr, "	-U use zpool.cache in /tmp\n");
 	(void) fprintf(stderr, "	-B objset:object:level:blkid -- "
 	    "simulate bad block\n");
+	(void) fprintf(stderr, "        -R read and display block from a "
+	    "device\n");
 	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
 	    "to make only that option verbose\n");
 	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
@@ -523,20 +527,41 @@
 	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 }
 
+static void
+sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas)
+{
+	dva_t *dva = bp->blk_dva;
+	int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1;
+	int i;
+
+	blkbuf[0] = '\0';
+
+	for (i = 0; i < ndvas; i++)
+		(void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ",
+		    (u_longlong_t)DVA_GET_VDEV(&dva[i]),
+		    (u_longlong_t)DVA_GET_OFFSET(&dva[i]),
+		    (u_longlong_t)DVA_GET_ASIZE(&dva[i]));
+
+	(void) sprintf(blkbuf + strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu",
+	    (u_longlong_t)BP_GET_LSIZE(bp),
+	    (u_longlong_t)BP_GET_PSIZE(bp),
+	    (u_longlong_t)bp->blk_fill,
+	    (u_longlong_t)bp->blk_birth);
+}
+
 /* ARGSUSED */
 static int
 zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
 {
 	zbookmark_t *zb = &bc->bc_bookmark;
 	blkptr_t *bp = &bc->bc_blkptr;
-	dva_t *dva = &bp->blk_dva[0];
 	void *data = bc->bc_data;
 	dnode_phys_t *dnp = bc->bc_dnode;
-	char buffer[300];
+	char blkbuf[BP_SPRINTF_LEN + 80];
 	int l;
 
 	if (bc->bc_errno) {
-		(void) sprintf(buffer,
+		(void) sprintf(blkbuf,
 		    "Error %d reading <%llu, %llu, %lld, %llu>: ",
 		    bc->bc_errno,
 		    (u_longlong_t)zb->zb_objset,
@@ -581,37 +606,28 @@
 		ASSERT3U(fill, ==, bp->blk_fill);
 	}
 
-	(void) sprintf(buffer, "%16llx ",
+	(void) sprintf(blkbuf, "%16llx ",
 	    (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid));
 
 	ASSERT(zb->zb_level >= 0);
 
 	for (l = dnp->dn_nlevels - 1; l >= -1; l--) {
 		if (l == zb->zb_level) {
-			(void) sprintf(buffer + strlen(buffer), "L%llx",
+			(void) sprintf(blkbuf + strlen(blkbuf), "L%llx",
 			    (u_longlong_t)zb->zb_level);
 		} else {
-			(void) sprintf(buffer + strlen(buffer), " ");
+			(void) sprintf(blkbuf + strlen(blkbuf), " ");
 		}
 	}
 
 out:
 	if (bp->blk_birth == 0) {
-		(void) sprintf(buffer + strlen(buffer), "<hole>");
-		(void) printf("%s\n", buffer);
+		(void) sprintf(blkbuf + strlen(blkbuf), "<hole>");
+		(void) printf("%s\n", blkbuf);
 	} else {
-		// XXBP - Need to print number of active BPs here
-		(void) sprintf(buffer + strlen(buffer),
-		    "vdev=%llu off=%llx %llxL/%llxP/%llxA F=%llu B=%llu",
-		    (u_longlong_t)DVA_GET_VDEV(dva),
-		    (u_longlong_t)DVA_GET_OFFSET(dva),
-		    (u_longlong_t)BP_GET_LSIZE(bp),
-		    (u_longlong_t)BP_GET_PSIZE(bp),
-		    (u_longlong_t)DVA_GET_ASIZE(dva),
-		    (u_longlong_t)bp->blk_fill,
-		    (u_longlong_t)bp->blk_birth);
-
-		(void) printf("%s\n", buffer);
+		sprintf_blkptr_compact(blkbuf + strlen(blkbuf), bp,
+		    dump_opt['d'] > 5 ? 1 : 0);
+		(void) printf("%s\n", blkbuf);
 	}
 
 	return (bc->bc_errno ? ERESTART : 0);
@@ -762,18 +778,12 @@
 	(void) printf("\n");
 
 	while (bplist_iterate(&bpl, &itor, bp) == 0) {
+		char blkbuf[BP_SPRINTF_LEN];
+
 		ASSERT(bp->blk_birth != 0);
-		// XXBP - Do we want to see all DVAs, or just one?
-		(void) printf("\tItem %3llu: vdev=%llu off=%llx "
-		    "%llxL/%llxP/%llxA F=%llu B=%llu\n",
-		    (u_longlong_t)itor - 1,
-		    (u_longlong_t)DVA_GET_VDEV(&bp->blk_dva[0]),
-		    (u_longlong_t)DVA_GET_OFFSET(&bp->blk_dva[0]),
-		    (u_longlong_t)BP_GET_LSIZE(bp),
-		    (u_longlong_t)BP_GET_PSIZE(bp),
-		    (u_longlong_t)DVA_GET_ASIZE(&bp->blk_dva[0]),
-		    (u_longlong_t)bp->blk_fill,
-		    (u_longlong_t)bp->blk_birth);
+		sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0);
+		(void) printf("\tItem %3llu: %s\n",
+		    (u_longlong_t)itor - 1, blkbuf);
 	}
 
 	bplist_close(&bpl);
@@ -1228,45 +1238,73 @@
 static int
 zdb_space_map_claim(spa_t *spa, blkptr_t *bp, zbookmark_t *zb)
 {
-	dva_t *dva = &bp->blk_dva[0];
-	uint64_t vdev = DVA_GET_VDEV(dva);
-	uint64_t offset = DVA_GET_OFFSET(dva);
-	uint64_t size = DVA_GET_ASIZE(dva);
+	dva_t *dva = bp->blk_dva;
 	vdev_t *vd;
 	metaslab_t *msp;
 	space_map_t *allocmap, *freemap;
 	int error;
+	int d;
+	blkptr_t blk = *bp;
 
-	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
-		return (ENXIO);
+	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+		uint64_t vdev = DVA_GET_VDEV(&dva[d]);
+		uint64_t offset = DVA_GET_OFFSET(&dva[d]);
+		uint64_t size = DVA_GET_ASIZE(&dva[d]);
+
+		if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
+			return (ENXIO);
+
+		if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
+			return (ENXIO);
+
+		msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+		allocmap = &msp->ms_allocmap[0];
+		freemap = &msp->ms_freemap[0];
 
-	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
-		return (ENXIO);
+		/* Prepare our copy of the bp in case we need to read GBHs */
+		if (DVA_GET_GANG(&dva[d])) {
+			size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+			DVA_SET_ASIZE(&blk.blk_dva[d], size);
+			DVA_SET_GANG(&blk.blk_dva[d], 0);
+		}
+
+		mutex_enter(&msp->ms_lock);
+		if (space_map_contains(freemap, offset, size)) {
+			mutex_exit(&msp->ms_lock);
+			return (EAGAIN);	/* allocated more than once */
+		}
 
-	if (DVA_GET_GANG(dva)) {
+		if (!space_map_contains(allocmap, offset, size)) {
+			mutex_exit(&msp->ms_lock);
+			return (ESTALE);	/* not allocated at all */
+		}
+
+		space_map_remove(allocmap, offset, size);
+		space_map_add(freemap, offset, size);
+
+		mutex_exit(&msp->ms_lock);
+	}
+
+	if (BP_IS_GANG(bp)) {
 		zio_gbh_phys_t gbh;
-		blkptr_t blk = *bp;
 		int g;
 
 		/* LINTED - compile time assert */
 		ASSERT(sizeof (zio_gbh_phys_t) == SPA_GANGBLOCKSIZE);
-		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
-		DVA_SET_GANG(&blk.blk_dva[0], 0);
-		DVA_SET_ASIZE(&blk.blk_dva[0], size);
+
 		BP_SET_CHECKSUM(&blk, ZIO_CHECKSUM_GANG_HEADER);
 		BP_SET_PSIZE(&blk, SPA_GANGBLOCKSIZE);
 		BP_SET_LSIZE(&blk, SPA_GANGBLOCKSIZE);
 		BP_SET_COMPRESS(&blk, ZIO_COMPRESS_OFF);
-		error = zio_wait(zio_read(NULL, spa, &blk,
-		    &gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
-		    ZIO_PRIORITY_SYNC_READ,
+		error = zio_wait(zio_read(NULL, spa, &blk, &gbh,
+		    SPA_GANGBLOCKSIZE, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD, zb));
 		if (error)
 			return (error);
 		if (BP_SHOULD_BYTESWAP(&blk))
 			byteswap_uint64_array(&gbh, SPA_GANGBLOCKSIZE);
 		for (g = 0; g < SPA_GBH_NBLKPTRS; g++) {
-			if (gbh.zg_blkptr[g].blk_birth == 0)
+			if (BP_IS_HOLE(&gbh.zg_blkptr[g]))
 				break;
 			error = zdb_space_map_claim(spa, &gbh.zg_blkptr[g], zb);
 			if (error)
@@ -1274,26 +1312,6 @@
 		}
 	}
 
-	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-	allocmap = &msp->ms_allocmap[0];
-	freemap = &msp->ms_freemap[0];
-
-	mutex_enter(&msp->ms_lock);
-	if (space_map_contains(freemap, offset, size)) {
-		mutex_exit(&msp->ms_lock);
-		return (EAGAIN);	/* allocated more than once */
-	}
-
-	if (!space_map_contains(allocmap, offset, size)) {
-		mutex_exit(&msp->ms_lock);
-		return (ESTALE);	/* not allocated at all */
-	}
-
-	space_map_remove(allocmap, offset, size);
-	space_map_add(freemap, offset, size);
-
-	mutex_exit(&msp->ms_lock);
-
 	return (0);
 }
 
@@ -1448,7 +1466,7 @@
 
 	zcb->zcb_readfails = 0;
 
-	ASSERT(bp->blk_birth != 0);
+	ASSERT(!BP_IS_HOLE(bp));
 
 	zdb_count_block(spa, zcb, bp, type);
 
@@ -1511,13 +1529,13 @@
 		    spa->spa_sync_bplist_obj));
 
 		while (bplist_iterate(bpl, &itor, &blk) == 0) {
-			zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
 			if (dump_opt['b'] >= 4) {
 				char blkbuf[BP_SPRINTF_LEN];
 				sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk);
 				(void) printf("[%s] %s\n",
 				    "deferred free", blkbuf);
 			}
+			zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED);
 		}
 
 		bplist_close(bpl);
@@ -1703,6 +1721,321 @@
 		exit(rc);
 }
 
+#define	ZDB_FLAG_CHECKSUM	0x0001
+#define	ZDB_FLAG_DECOMPRESS	0x0002
+#define	ZDB_FLAG_BSWAP		0x0004
+#define	ZDB_FLAG_GBH		0x0008
+#define	ZDB_FLAG_INDIRECT	0x0010
+#define	ZDB_FLAG_PHYS		0x0020
+#define	ZDB_FLAG_RAW		0x0040
+#define	ZDB_FLAG_PRINT_BLKPTR	0x0080
+
+int flagbits[256];
+
+static void
+zdb_print_blkptr(blkptr_t *bp, int flags)
+{
+	dva_t *dva = bp->blk_dva;
+	int d;
+
+	if (flags & ZDB_FLAG_BSWAP)
+		byteswap_uint64_array((void *)bp, sizeof (blkptr_t));
+	/*
+	 * Super-ick warning:  This code is also duplicated in
+ * cmd/mdb/common/modules/zfs/zfs.c.  Yeah, I hate code
+	 * replication, too.
+	 */
+	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+		(void) printf("\tDVA[%d]: vdev_id %lld / %llx\n", d,
+		    DVA_GET_VDEV(&dva[d]), DVA_GET_OFFSET(&dva[d]));
+		(void) printf("\tDVA[%d]:       GANG: %-5s  GRID:  %04llx\t"
+		    "ASIZE: %llx\n", d,
+		    DVA_GET_GANG(&dva[d]) ? "TRUE" : "FALSE",
+		    DVA_GET_GRID(&dva[d]), DVA_GET_ASIZE(&dva[d]));
+		(void) printf("\tDVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", d,
+		    DVA_GET_VDEV(&dva[d]), DVA_GET_OFFSET(&dva[d]),
+		    BP_GET_PSIZE(bp),
+		    BP_SHOULD_BYTESWAP(bp) ? "e" : "",
+		    !DVA_GET_GANG(&dva[d]) && BP_GET_LEVEL(bp) != 0 ?
+		    "i" : "",
+		    DVA_GET_GANG(&dva[d]) ? "g" : "",
+		    BP_GET_COMPRESS(bp) != 0 ? "d" : "");
+	}
+	(void) printf("\tLSIZE:  %-16llx\t\tPSIZE: %llx\n",
+	    BP_GET_LSIZE(bp), BP_GET_PSIZE(bp));
+	(void) printf("\tENDIAN: %6s\t\t\t\t\tTYPE:  %s\n",
+	    BP_GET_BYTEORDER(bp) ? "LITTLE" : "BIG",
+	    dmu_ot[BP_GET_TYPE(bp)].ot_name);
+	(void) printf("\tBIRTH:  %-16llx   LEVEL: %-2llu\tFILL:  %llx\n",
+	    (u_longlong_t)bp->blk_birth, BP_GET_LEVEL(bp),
+	    (u_longlong_t)bp->blk_fill);
+	(void) printf("\tCKFUNC: %-16s\t\tCOMP:  %s\n",
+	    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
+	    zio_compress_table[BP_GET_COMPRESS(bp)].ci_name);
+	(void) printf("\tCKSUM:  %llx:%llx:%llx:%llx\n",
+	    (u_longlong_t)bp->blk_cksum.zc_word[0],
+	    (u_longlong_t)bp->blk_cksum.zc_word[1],
+	    (u_longlong_t)bp->blk_cksum.zc_word[2],
+	    (u_longlong_t)bp->blk_cksum.zc_word[3]);
+}
+
+static void
+zdb_dump_indirect(blkptr_t *bp, int nbps, int flags)
+{
+	int i;
+
+	for (i = 0; i < nbps; i++)
+		zdb_print_blkptr(&bp[i], flags);
+}
+
+static void
+zdb_dump_gbh(void *buf, int flags)
+{
+	zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags);
+}
+
+static void
+zdb_dump_block_raw(void *buf, uint64_t size, int flags)
+{
+	if (flags & ZDB_FLAG_BSWAP)
+		byteswap_uint64_array(buf, size);
+	(void) write(2, buf, size);
+}
+
+static void
+zdb_dump_block(char *label, void *buf, uint64_t size, int flags)
+{
+	uint64_t *d = (uint64_t *)buf;
+	int nwords = size / sizeof (uint64_t);
+	int do_bswap = !!(flags & ZDB_FLAG_BSWAP);
+	int i, j;
+	char *hdr, *c;
+
+	if (do_bswap)
+		hdr = " 7 6 5 4 3 2 1 0   f e d c b a 9 8";
+	else
+		hdr = " 0 1 2 3 4 5 6 7   8 9 a b c d e f";
+
+	(void) printf("\n%s\n%6s   %s  0123456789abcdef\n", label, "", hdr);
+
+	for (i = 0; i < nwords; i += 2) {
+		(void) printf("%06llx:  %016llx  %016llx  ",
+		    (u_longlong_t)(i * sizeof (uint64_t)),
+		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]),
+		    (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1]));
+
+		c = (char *)&d[i];
+		for (j = 0; j < 2 * sizeof (uint64_t); j++)
+			(void) printf("%c", isprint(c[j]) ? c[j] : '.');
+		(void) printf("\n");
+	}
+}
+
+/*
+ * There are two acceptable formats:
+ *	leaf_name	  - For example: c1t0d0 or /tmp/ztest.0a
+ *	child[.child]*    - For example: 0.1.1
+ *
+ * The second form can be used to specify arbitrary vdevs anywhere
+ * in the hierarchy.  For example, in a pool with a mirror of
+ * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1.
+ */
+static vdev_t *
+zdb_vdev_lookup(vdev_t *vdev, char *path)
+{
+	char *s, *p, *q;
+	int i;
+
+	if (vdev == NULL)
+		return (NULL);
+
+	/* First, assume the x.x.x.x format */
+	i = (int)strtoul(path, &s, 10);
+	if (s == path || (s && *s != '.' && *s != '\0'))
+		goto name;
+	if (i < 0 || i >= vdev->vdev_children)
+		return (NULL);
+
+	vdev = vdev->vdev_child[i];
+	if (*s == '\0')
+		return (vdev);
+	return (zdb_vdev_lookup(vdev, s+1));
+
+name:
+	for (i = 0; i < vdev->vdev_children; i++) {
+		vdev_t *vc = vdev->vdev_child[i];
+
+		if (vc->vdev_path == NULL) {
+			vc = zdb_vdev_lookup(vc, path);
+			if (vc == NULL)
+				continue;
+			else
+				return (vc);
+		}
+
+		p = strrchr(vc->vdev_path, '/');
+		p = p ? p + 1 : vc->vdev_path;
+		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
+
+		if (strcmp(vc->vdev_path, path) == 0)
+			return (vc);
+		if (strcmp(p, path) == 0)
+			return (vc);
+		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
+			return (vc);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Read a block from a pool and print it out.  The syntax of the
+ * block descriptor is:
+ *
+ *	pool:vdev_specifier:offset:size[:flags]
+ *
+ *	pool           - The name of the pool you wish to read from
+ *	vdev_specifier - Which vdev (see comment for zdb_vdev_lookup)
+ *	offset         - offset, in hex, in bytes
+ *	size           - Amount of data to read, in hex, in bytes
+ *	flags          - A string of characters specifying options
+ *		 b: Decode a blkptr at given offset within block
+ *		*c: Calculate and display checksums
+ *		*d: Decompress data before dumping
+ *		 e: Byteswap data before dumping
+ *		*g: Display data as a gang block header
+ *		*i: Display as an indirect block
+ *		 p: Do I/O to physical offset
+ *		 r: Dump raw data to stdout
+ *
+ *              * = not yet implemented
+ */
+static void
+zdb_read_block(char *thing, spa_t **spap)
+{
+	spa_t *spa = *spap;
+	int flags = 0;
+	uint64_t offset = 0, size = 0, blkptr_offset = 0;
+	zio_t *zio;
+	vdev_t *vd;
+	void *buf;
+	char *s, *p, *dup, *spa_name, *vdev, *flagstr;
+	int i, error, zio_flags;
+
+	dup = strdup(thing);
+	s = strtok(dup, ":");
+	spa_name = s ? s : "";
+	s = strtok(NULL, ":");
+	vdev = s ? s : "";
+	s = strtok(NULL, ":");
+	offset = strtoull(s ? s : "", NULL, 16);
+	s = strtok(NULL, ":");
+	size = strtoull(s ? s : "", NULL, 16);
+	s = strtok(NULL, ":");
+	flagstr = s ? s : "";
+
+	s = NULL;
+	if (size == 0)
+		s = "size must not be zero";
+	if (!IS_P2ALIGNED(size, DEV_BSIZE))
+		s = "size must be a multiple of sector size";
+	if (!IS_P2ALIGNED(offset, DEV_BSIZE))
+		s = "offset must be a multiple of sector size";
+	if (s) {
+		(void) printf("Invalid block specifier: %s  - %s\n", thing, s);
+		free(dup);
+		return;
+	}
+
+	for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) {
+		for (i = 0; flagstr[i]; i++) {
+			int bit = flagbits[flagstr[i]];
+
+			if (bit == 0) {
+				(void) printf("***Invalid flag: %c\n",
+				    flagstr[i]);
+				continue;
+			}
+			flags |= bit;
+
+			/* If it's not something with an argument, keep going */
+			if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_DECOMPRESS |
+			    ZDB_FLAG_PRINT_BLKPTR)) == 0)
+				continue;
+
+			p = &flagstr[i + 1];
+			if (bit == ZDB_FLAG_PRINT_BLKPTR)
+				blkptr_offset = strtoull(p, &p, 16);
+			if (*p != ':' && *p != '\0') {
+				(void) printf("***Invalid flag arg: '%s'\n", s);
+				free(dup);
+				return;
+			}
+		}
+	}
+
+	if (spa == NULL || spa->spa_name == NULL ||
+	    strcmp(spa->spa_name, spa_name)) {
+		if (spa && spa->spa_name)
+			spa_close(spa, (void *)zdb_read_block);
+		error = spa_open(spa_name, spap, (void *)zdb_read_block);
+		if (error)
+			fatal("Failed to open pool '%s': errno = %d\n",
+			    spa_name, error);
+		spa = *spap;
+	}
+
+	vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev);
+	if (vd == NULL) {
+		(void) printf("***Invalid vdev: %s\n", vdev);
+		free(dup);
+		return;
+	} else {
+		if (vd->vdev_path)
+			(void) printf("Found vdev: %s\n", vd->vdev_path);
+		else
+			(void) printf("Found vdev type: %s\n",
+			    vd->vdev_ops->vdev_op_type);
+	}
+
+	buf = umem_alloc(size, UMEM_NOFAIL);
+
+	zio_flags = ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE |
+	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK;
+
+	if (flags & ZDB_FLAG_PHYS)
+		zio_flags |= ZIO_FLAG_PHYSICAL;
+
+	zio = zio_root(spa, NULL, NULL, 0);
+	/* XXX todo - cons up a BP so RAID-Z will be happy */
+	zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, buf, size,
+	    ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, zio_flags, NULL, NULL));
+	error = zio_wait(zio);
+
+	if (error) {
+		(void) printf("Read of %s failed, error: %d\n", thing, error);
+		goto out;
+	}
+
+	if (flags & ZDB_FLAG_PRINT_BLKPTR)
+		zdb_print_blkptr((blkptr_t *)(void *)
+		    ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags);
+	else if (flags & ZDB_FLAG_RAW)
+		zdb_dump_block_raw(buf, size, flags);
+	else if (flags & ZDB_FLAG_INDIRECT)
+		zdb_dump_indirect((blkptr_t *)buf, size / sizeof (blkptr_t),
+		    flags);
+	else if (flags & ZDB_FLAG_GBH)
+		zdb_dump_gbh(buf, flags);
+	else
+		zdb_dump_block(thing, buf, size, flags);
+
+out:
+	umem_free(buf, size);
+	free(dup);
+}
+
 int
 main(int argc, char **argv)
 {
@@ -1721,7 +2054,7 @@
 
 	dprintf_setup(&argc, argv);
 
-	while ((c = getopt(argc, argv, "udibcsvCLO:B:Ul")) != -1) {
+	while ((c = getopt(argc, argv, "udibcsvCLO:B:UlR")) != -1) {
 		switch (c) {
 		case 'u':
 		case 'd':
@@ -1731,6 +2064,7 @@
 		case 's':
 		case 'C':
 		case 'l':
+		case 'R':
 			dump_opt[c]++;
 			dump_all = 0;
 			break;
@@ -1801,7 +2135,7 @@
 	}
 
 	for (c = 0; c < 256; c++) {
-		if (dump_all && c != 'L' && c != 'l')
+		if (dump_all && c != 'L' && c != 'l' && c != 'R')
 			dump_opt[c] = 1;
 		if (dump_opt[c])
 			dump_opt[c] += verbose;
@@ -1823,6 +2157,27 @@
 		return (0);
 	}
 
+	if (dump_opt['R']) {
+		flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR;
+		flagbits['c'] = ZDB_FLAG_CHECKSUM;
+		flagbits['d'] = ZDB_FLAG_DECOMPRESS;
+		flagbits['e'] = ZDB_FLAG_BSWAP;
+		flagbits['g'] = ZDB_FLAG_GBH;
+		flagbits['i'] = ZDB_FLAG_INDIRECT;
+		flagbits['p'] = ZDB_FLAG_PHYS;
+		flagbits['r'] = ZDB_FLAG_RAW;
+
+		spa = NULL;
+		while (argv[0]) {
+			zdb_read_block(argv[0], &spa);
+			argv++;
+			argc--;
+		}
+		if (spa)
+			spa_close(spa, (void *)zdb_read_block);
+		return (0);
+	}
+
 	if (dump_opt['C'])
 		dump_config(argv[0]);
 
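The -R plumbing above takes block descriptors of the form pool:vdev_specifier:offset:size[:flags], as documented in the zdb_read_block() comment.  A hypothetical invocation, assuming a pool named tank whose top-level vdev 0 is a mirror (the pool name, offset, and size here are illustrative, not from this changeset):

	# zdb -R tank:0.0:400000:200:r

This reads 0x200 bytes at byte offset 0x400000 from child 0 of top-level vdev 0 and, because of the r flag, dumps the raw (optionally byteswapped) buffer instead of the formatted hex display.  Note that zdb_dump_block_raw() writes to file descriptor 2, so redirect stderr to capture the bytes.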
--- a/usr/src/cmd/zpool/zpool_main.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/cmd/zpool/zpool_main.c	Mon Apr 10 05:03:38 2006 -0700
@@ -2783,8 +2783,9 @@
 
 	ret = zpool_upgrade(zhp);
 	if (ret == 0)
-		(void) printf(gettext("Successfully upgraded '%s'\n"),
-		    zpool_get_name(zhp));
+		(void) printf(gettext("Successfully upgraded '%s' "
+		    "from version %llu to version %llu\n"), zpool_get_name(zhp),
+		    (u_longlong_t)version, (u_longlong_t)ZFS_VERSION);
 
 	return (ret != 0);
 }
@@ -2848,8 +2849,10 @@
 		(void) printf(gettext("VER  DESCRIPTION\n"));
 		(void) printf("---  -----------------------------------------"
 		    "---------------\n");
-		(void) printf(gettext(" 1   Initial ZFS version.\n\n"));
-		(void) printf(gettext("For more information on a particular "
+		(void) printf(gettext(" 1   Initial ZFS version.\n"));
+		(void) printf(gettext(" 2   Ditto blocks "
+		    "(replicated metadata)\n"));
+		(void) printf(gettext("\nFor more information on a particular "
 		    "version, including supported releases, see:\n\n"));
 		(void) printf("http://www.opensolaris.org/os/community/zfs/"
 		    "version/N\n\n");
--- a/usr/src/cmd/ztest/ztest.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/cmd/ztest/ztest.c	Mon Apr 10 05:03:38 2006 -0700
@@ -2825,9 +2825,6 @@
 	if (error)
 		fatal(0, "spa_open('%s') = %d", oldname, error);
 
-	ASSERT(spa->spa_config != NULL);
-
-	VERIFY(nvlist_dup(spa->spa_config, &config, 0) == 0);
 	pool_guid = spa_guid(spa);
 	spa_close(spa, FTAG);
 
@@ -2836,7 +2833,7 @@
 	/*
 	 * Export it.
 	 */
-	error = spa_export(oldname);
+	error = spa_export(oldname, &config);
 	if (error)
 		fatal(0, "spa_export('%s') = %d", oldname, error);
 
--- a/usr/src/uts/common/fs/zfs/arc.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/arc.c	Mon Apr 10 05:03:38 2006 -0700
@@ -2186,7 +2186,7 @@
 }
 
 int
-arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
     arc_done_func_t *done, void *private, int priority, int flags,
     uint32_t arc_flags, zbookmark_t *zb)
@@ -2205,7 +2205,7 @@
 	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
 	hdr->b_acb = acb;
 	hdr->b_flags |= ARC_IO_IN_PROGRESS;
-	rzio = zio_write(pio, spa, checksum, compress, txg, bp,
+	rzio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
 	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);
 
 	if (arc_flags & ARC_WAIT)
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Mon Apr 10 05:03:38 2006 -0700
@@ -2029,7 +2029,9 @@
 	zb.zb_object = db->db.db_object;
 	zb.zb_level = db->db_level;
 	zb.zb_blkid = db->db_blkid;
-	(void) arc_write(zio, os->os_spa, checksum, compress, txg,
+
+	(void) arc_write(zio, os->os_spa, checksum, compress,
+	    dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg,
 	    db->db_blkptr, *data, dbuf_write_done, db,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb);
 	/*
--- a/usr/src/uts/common/fs/zfs/dmu.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Mon Apr 10 05:03:38 2006 -0700
@@ -82,8 +82,6 @@
 	dmu_buf_impl_t *db;
 	int err;
 
-	/* dataset_verify(dd); */
-
 	err = dnode_hold(os->os, object, FTAG, &dn);
 	if (err)
 		return (err);
@@ -1425,7 +1423,8 @@
 dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
     blkptr_t *bp, uint64_t txg)
 {
-	dsl_pool_t *dp = os->os->os_dsl_dataset->ds_dir->dd_pool;
+	objset_impl_t *osi = os->os;
+	dsl_pool_t *dp = osi->os_dsl_dataset->ds_dir->dd_pool;
 	tx_state_t *tx = &dp->dp_tx;
 	dmu_buf_impl_t *db;
 	blkptr_t *blk;
@@ -1508,7 +1507,7 @@
 		}
 		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
 		if (!BP_IS_HOLE(blk)) {
-			(void) arc_free(NULL, os->os->os_spa, txg, blk,
+			(void) arc_free(NULL, osi->os_spa, txg, blk,
 			    NULL, NULL, ARC_WAIT);
 		}
 		kmem_free(blk, sizeof (blkptr_t));
@@ -1520,13 +1519,14 @@
 	blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
 	blk->blk_birth = 0; /* mark as invalid */
 
-	zb.zb_objset = os->os->os_dsl_dataset->ds_object;
+	zb.zb_objset = osi->os_dsl_dataset->ds_object;
 	zb.zb_object = db->db.db_object;
 	zb.zb_level = db->db_level;
 	zb.zb_blkid = db->db_blkid;
-	err = arc_write(NULL, os->os->os_spa,
-	    zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum),
-	    zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress),
+	err = arc_write(NULL, osi->os_spa,
+	    zio_checksum_select(db->db_dnode->dn_checksum, osi->os_checksum),
+	    zio_compress_select(db->db_dnode->dn_compress, osi->os_compress),
+	    dmu_get_replication_level(osi->os_spa, &zb, db->db_dnode->dn_type),
 	    txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
 	ASSERT(err == 0);
@@ -1556,7 +1556,7 @@
 		 * XXX should we be ignoring the return code?
 		 */
 		if (!BP_IS_HOLE(blk)) {
-			(void) arc_free(NULL, os->os->os_spa, txg, blk,
+			(void) arc_free(NULL, osi->os_spa, txg, blk,
 			    NULL, NULL, ARC_WAIT);
 		}
 		kmem_free(blk, sizeof (blkptr_t));
@@ -1625,6 +1625,24 @@
 	dnode_rele(dn, FTAG);
 }
 
+/*
+ * XXX - eventually, this should take into account per-dataset (or
+ *       even per-object?) user requests for higher levels of replication.
+ */
+int
+dmu_get_replication_level(spa_t *spa, zbookmark_t *zb, dmu_object_type_t ot)
+{
+	int ncopies = 1;
+
+	if (dmu_ot[ot].ot_metadata)
+		ncopies++;
+	if (zb->zb_level != 0)
+		ncopies++;
+	if (zb->zb_objset == 0 && zb->zb_object == 0)
+		ncopies++;
+	return (MIN(ncopies, spa_max_replication(spa)));
+}
+
 int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
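To make the ditto policy in dmu_get_replication_level() concrete: ncopies starts at 1, gains one bump if the object type is metadata (dmu_ot[ot].ot_metadata), another if the block is indirect (zb_level != 0), and another if it lives in the meta-objset (objset 0, object 0), then is clamped by spa_max_replication().  Assuming a pool at ZFS_VERSION_DITTO_BLOCKS or later with the default cap of SPA_DVAS_PER_BP (3), the cases work out as:

	level-0 block of a plain file           1 copy
	level-0 metadata block                  2 copies
	indirect block of a metadata object     3 copies
	indirect block in the MOS               min(4, 3) = 3 copies

On a pool still at version 1, spa_max_replication() returns 1 and every case collapses back to a single DVA.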
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Mon Apr 10 05:03:38 2006 -0700
@@ -679,7 +679,9 @@
 	zb.zb_level = -1;
 	zb.zb_blkid = 0;
 	err = arc_write(NULL, os->os_spa, os->os_md_checksum,
-	    os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os,
+	    os->os_md_compress,
+	    dmu_get_replication_level(os->os_spa, &zb, DMU_OT_OBJSET),
+	    tx->tx_txg, &os->os_rootbp, abuf, killer, os,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
 	ASSERT(err == 0);
 	VERIFY(arc_buf_remove_ref(abuf, FTAG) == 1);
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c	Mon Apr 10 05:03:38 2006 -0700
@@ -232,7 +232,7 @@
 	uint64_t space, resv;
 
 	/*
-	 * Reserve about 1% (1/128), or at least 16MB, for allocation
+	 * Reserve about 1.6% (1/64), or at least 32MB, for allocation
 	 * efficiency.
 	 * XXX The intent log is not accounted for, so it must fit
 	 * within this slop.
@@ -242,7 +242,7 @@
 	 * (e.g. make it possible to rm(1) files from a full pool).
 	 */
 	space = spa_get_space(dp->dp_spa);
-	resv = MAX(space >> 7, SPA_MINDEVSIZE >> 2);
+	resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
 	if (netfree)
 		resv >>= 1;
 
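The new slop arithmetic doubles both knobs: on a 1TB pool, for example, the reservation becomes 1TB >> 6 = 16GB (8GB when netfree is set), and the SPA_MINDEVSIZE >> 1 floor keeps tiny pools at a minimum of 32MB given the 64MB minimum device size.  The extra headroom presumably absorbs the additional metadata copies that ditto blocks introduce, which the old 1/128 slop did not anticipate.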
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Mon Apr 10 05:03:38 2006 -0700
@@ -352,14 +352,19 @@
 	kmem_free(msp, sizeof (metaslab_t));
 }
 
-#define	METASLAB_ACTIVE_WEIGHT	(1ULL << 63)
+#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
+#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
+#define	METASLAB_ACTIVE_MASK		\
+	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+#define	METASLAB_SMO_BONUS_MULTIPLIER	2
 
 static uint64_t
 metaslab_weight(metaslab_t *msp)
 {
+	metaslab_group_t *mg = msp->ms_group;
 	space_map_t *sm = &msp->ms_map;
 	space_map_obj_t *smo = &msp->ms_smo;
-	vdev_t *vd = msp->ms_group->mg_vd;
+	vdev_t *vd = mg->mg_vd;
 	uint64_t weight, space;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -387,26 +392,27 @@
 	 * For locality, assign higher weight to metaslabs we've used before.
 	 */
 	if (smo->smo_object != 0)
-		weight *= 2;
-	ASSERT(weight >= space && weight <= 4 * space);
+		weight *= METASLAB_SMO_BONUS_MULTIPLIER;
+	ASSERT(weight >= space &&
+	    weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space);
 
 	/*
 	 * If this metaslab is one we're actively using, adjust its weight to
 	 * make it preferable to any inactive metaslab so we'll polish it off.
 	 */
-	weight |= (msp->ms_weight & METASLAB_ACTIVE_WEIGHT);
+	weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	return (weight);
 }
 
 static int
-metaslab_activate(metaslab_t *msp)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
 	space_map_t *sm = &msp->ms_map;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
-	if (msp->ms_weight < METASLAB_ACTIVE_WEIGHT) {
+	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 		int error = space_map_load(sm, &metaslab_ff_ops,
 		    SM_FREE, &msp->ms_smo,
 		    msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
@@ -415,10 +421,10 @@
 			return (error);
 		}
 		metaslab_group_sort(msp->ms_group, msp,
-		    msp->ms_weight | METASLAB_ACTIVE_WEIGHT);
+		    msp->ms_weight | activation_weight);
 	}
 	ASSERT(sm->sm_loaded);
-	ASSERT(msp->ms_weight >= METASLAB_ACTIVE_WEIGHT);
+	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
 
 	return (0);
 }
@@ -426,8 +432,8 @@
 static void
 metaslab_passivate(metaslab_t *msp, uint64_t size)
 {
-	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size - 1));
-	ASSERT(msp->ms_weight < METASLAB_ACTIVE_WEIGHT);
+	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
+	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
 }
 
 /*
@@ -571,7 +577,7 @@
 	 * future allocations have synced.  (If we unloaded it now and then
 	 * loaded a moment later, the map wouldn't reflect those allocations.)
 	 */
-	if (sm->sm_loaded && msp->ms_weight < METASLAB_ACTIVE_WEIGHT) {
+	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 		int evictable = 1;
 
 		for (t = 1; t < TXG_CONCURRENT_STATES; t++)
@@ -616,7 +622,7 @@
 
 	mutex_enter(&msp->ms_lock);
 
-	error = metaslab_activate(msp);
+	error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 	if (error) {
 		mutex_exit(&msp->ms_lock);
 		return (error);
@@ -633,25 +639,76 @@
 	return (0);
 }
 
-static metaslab_t *
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t *offp,
-	uint64_t txg)
+static uint64_t
+metaslab_distance(metaslab_t *msp, dva_t *dva)
+{
+	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
+	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
+	uint64_t start = msp->ms_map.sm_start >> ms_shift;
+
+	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
+		return (1ULL << 63);
+
+	if (offset < start)
+		return ((start - offset) << ms_shift);
+	if (offset > start)
+		return ((offset - start) << ms_shift);
+	return (0);
+}
+
+static uint64_t
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
+    uint64_t min_distance, dva_t *dva, int d)
 {
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
+	avl_tree_t *t = &mg->mg_metaslab_tree;
+	uint64_t activation_weight;
+	uint64_t target_distance;
+	int i;
+
+	activation_weight = METASLAB_WEIGHT_PRIMARY;
+	for (i = 0; i < d; i++)
+		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
+			activation_weight = METASLAB_WEIGHT_SECONDARY;
 
 	for (;;) {
 		mutex_enter(&mg->mg_lock);
-		msp = avl_first(&mg->mg_metaslab_tree);
-		if (msp == NULL || msp->ms_weight < size) {
-			mutex_exit(&mg->mg_lock);
-			return (NULL);
+		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
+			if (msp->ms_weight < size) {
+				mutex_exit(&mg->mg_lock);
+				return (-1ULL);
+			}
+
+			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
+				break;
+
+			target_distance = min_distance +
+			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
+
+			for (i = 0; i < d; i++)
+				if (metaslab_distance(msp, &dva[i]) <
+				    target_distance)
+					break;
+			if (i == d)
+				break;
 		}
 		mutex_exit(&mg->mg_lock);
+		if (msp == NULL)
+			return (-1ULL);
 
 		mutex_enter(&msp->ms_lock);
 
-		if (metaslab_activate(msp) != 0) {
+		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
+		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
+			metaslab_passivate(msp,
+			    (msp->ms_weight & ~METASLAB_ACTIVE_MASK) /
+			    METASLAB_SMO_BONUS_MULTIPLIER);
+			mutex_exit(&msp->ms_lock);
+			continue;
+		}
+
+		if (metaslab_activate(msp, activation_weight) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
@@ -659,7 +716,7 @@
 		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
 			break;
 
-		metaslab_passivate(msp, size);
+		metaslab_passivate(msp, size - 1);
 
 		mutex_exit(&msp->ms_lock);
 	}
@@ -671,22 +728,24 @@
 
 	mutex_exit(&msp->ms_lock);
 
-	*offp = offset;
-	return (msp);
+	return (offset);
 }
 
 /*
  * Allocate a block for the specified i/o.
  */
-int
-metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
+static int
+metaslab_alloc_one(spa_t *spa, uint64_t psize, dva_t *dva, int d,
+    dva_t *hintdva, uint64_t txg)
 {
-	metaslab_t *msp;
 	metaslab_group_t *mg, *rotor;
 	metaslab_class_t *mc;
 	vdev_t *vd;
+	int dshift = 3;
+	int all_zero;
 	uint64_t offset = -1ULL;
 	uint64_t asize;
+	uint64_t distance;
 
 	mc = spa_metaslab_class_select(spa);
 
@@ -695,17 +754,50 @@
 	 * Note that there's no locking on mc_rotor or mc_allocated because
 	 * nothing actually breaks if we miss a few updates -- we just won't
 	 * allocate quite as evenly.  It all balances out over time.
+	 *
+	 * If we are doing ditto blocks, try to spread them across consecutive
+	 * vdevs.  If we're forced to reuse a vdev before we've allocated
+	 * all of our ditto blocks, then try and spread them out on that
+	 * vdev as much as possible.  If it turns out to not be possible,
+	 * gradually lower our standards until anything becomes acceptable.
+	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
+	 * gives us hope of containing our fault domains to something we're
+	 * able to reason about.  Otherwise, any two top-level vdev failures
+	 * will guarantee the loss of data.  With consecutive allocation,
+	 * only two adjacent top-level vdev failures will result in data loss.
+	 *
+	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
+	 * ourselves on the same vdev as our gang block header.  That
+	 * way, we can hope for locality in vdev_cache, plus it makes our
+	 * fault domains something tractable.
 	 */
-	mg = rotor = mc->mc_rotor;
+	if (hintdva) {
+		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
+		mg = vd->vdev_mg;
+	} else if (d != 0) {
+		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
+		mg = vd->vdev_mg->mg_next;
+	} else {
+		mg = mc->mc_rotor;
+	}
+	rotor = mg;
+
+top:
+	all_zero = B_TRUE;
 	do {
 		vd = mg->mg_vd;
+
+		distance = vd->vdev_asize >> dshift;
+		if (distance <= (1ULL << vd->vdev_ms_shift))
+			distance = 0;
+		else
+			all_zero = B_FALSE;
+
 		asize = vdev_psize_to_asize(vd, psize);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
-		msp = metaslab_group_alloc(mg, asize, &offset, txg);
-		if (msp != NULL) {
-			ASSERT(offset != -1ULL);
-
+		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
 			 * figure out whether the corresponding vdev is
@@ -740,10 +832,10 @@
 				mc->mc_allocated = 0;
 			}
 
-			DVA_SET_VDEV(dva, vd->vdev_id);
-			DVA_SET_OFFSET(dva, offset);
-			DVA_SET_GANG(dva, 0);
-			DVA_SET_ASIZE(dva, asize);
+			DVA_SET_VDEV(&dva[d], vd->vdev_id);
+			DVA_SET_OFFSET(&dva[d], offset);
+			DVA_SET_GANG(&dva[d], 0);
+			DVA_SET_ASIZE(&dva[d], asize);
 
 			return (0);
 		}
@@ -751,13 +843,46 @@
 		mc->mc_allocated = 0;
 	} while ((mg = mg->mg_next) != rotor);
 
-	DVA_SET_VDEV(dva, 0);
-	DVA_SET_OFFSET(dva, 0);
-	DVA_SET_GANG(dva, 0);
+	if (!all_zero) {
+		dshift++;
+		ASSERT(dshift < 64);
+		goto top;
+	}
+
+	bzero(&dva[d], sizeof (dva_t));
 
 	return (ENOSPC);
 }
 
+int
+metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp, int ncopies,
+    uint64_t txg, blkptr_t *hintbp)
+{
+	int d, error;
+	dva_t *dva = bp->blk_dva;
+	dva_t *hintdva = hintbp->blk_dva;
+
+	ASSERT(ncopies > 0 && ncopies <= spa_max_replication(spa));
+	ASSERT(BP_GET_NDVAS(bp) == 0);
+	ASSERT(hintbp == NULL || ncopies <= BP_GET_NDVAS(hintbp));
+
+	for (d = 0; d < ncopies; d++) {
+		error = metaslab_alloc_one(spa, psize, dva, d, hintdva, txg);
+		if (error) {
+			for (d--; d >= 0; d--) {
+				ASSERT(DVA_IS_VALID(&dva[d]));
+				metaslab_free(spa, &dva[d], txg, B_TRUE);
+				bzero(&dva[d], sizeof (dva_t));
+			}
+			return (ENOSPC);
+		}
+	}
+	ASSERT(error == 0);
+	ASSERT(BP_GET_NDVAS(bp) == ncopies);
+
+	return (0);
+}
+
 /*
  * Free the block represented by DVA in the context of the specified
  * transaction group.
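The spacing policy in metaslab_alloc_one()/metaslab_group_alloc() rewards a worked example.  Each pass computes distance = vdev_asize >> dshift (dshift starts at 3, i.e. one eighth of the vdev), and a metaslab is rejected for a secondary ditto copy if metaslab_distance() to any already-allocated DVA falls short of target_distance (empty metaslabs, those with smo_alloc == 0, must clear a bar 1.5x higher).  With hypothetical numbers, not from this changeset:

	/* vdev_asize = 64GB, dshift = 3  =>  min_distance = 8GB      */
	/* DVA[0] already sits at offset 10GB on vdev 2               */
	/* metaslab on vdev 2 starting at 12GB: distance = 2GB < 8GB, */
	/*   so it is skipped for the second copy                     */
	/* metaslab on any other vdev: distance = 1ULL << 63,         */
	/*   so it always qualifies                                   */

If a full trip around the rotor finds nothing, dshift is incremented, halving the required spacing, and the search retries; only once the requirement has degenerated to zero on every vdev (all_zero) does metaslab_alloc_one() give up with ENOSPC.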
--- a/usr/src/uts/common/fs/zfs/spa.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c	Mon Apr 10 05:03:38 2006 -0700
@@ -940,10 +940,13 @@
  * configuration from the cache afterwards.
  */
 static int
-spa_export_common(char *pool, int new_state)
+spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
 {
 	spa_t *spa;
 
+	if (oldconfig)
+		*oldconfig = NULL;
+
 	if (!(spa_mode & FWRITE))
 		return (EROFS);
 
@@ -1011,6 +1014,9 @@
 		spa_deactivate(spa);
 	}
 
+	if (oldconfig && spa->spa_config)
+		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
+
 	if (new_state != POOL_STATE_UNINITIALIZED) {
 		spa_remove(spa);
 		spa_config_sync();
@@ -1026,16 +1032,16 @@
 int
 spa_destroy(char *pool)
 {
-	return (spa_export_common(pool, POOL_STATE_DESTROYED));
+	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
 }
 
 /*
  * Export a storage pool.
  */
 int
-spa_export(char *pool)
+spa_export(char *pool, nvlist_t **oldconfig)
 {
-	return (spa_export_common(pool, POOL_STATE_EXPORTED));
+	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
 }
 
 /*
@@ -1045,7 +1051,7 @@
 int
 spa_reset(char *pool)
 {
-	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED));
+	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
 }
 
 
@@ -1497,7 +1503,7 @@
 
 	mutex_enter(&spa->spa_scrub_lock);
 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
-		vdev_t *vd = zio->io_vd;
+		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
 		spa->spa_scrub_errors++;
 		mutex_enter(&vd->vdev_stat_lock);
 		vd->vdev_stat.vs_scrub_errors++;
@@ -1535,9 +1541,12 @@
 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
 {
 	blkptr_t *bp = &bc->bc_blkptr;
-	vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+	vdev_t *vd = spa->spa_root_vdev;
+	dva_t *dva = bp->blk_dva;
+	int needs_resilver = B_FALSE;
+	int d;
 
-	if (bc->bc_errno || vd == NULL) {
+	if (bc->bc_errno) {
 		/*
 		 * We can't scrub this block, but we can continue to scrub
 		 * the rest of the pool.  Note the error and move along.
@@ -1546,43 +1555,52 @@
 		spa->spa_scrub_errors++;
 		mutex_exit(&spa->spa_scrub_lock);
 
-		if (vd != NULL) {
-			mutex_enter(&vd->vdev_stat_lock);
-			vd->vdev_stat.vs_scrub_errors++;
-			mutex_exit(&vd->vdev_stat_lock);
-		}
+		mutex_enter(&vd->vdev_stat_lock);
+		vd->vdev_stat.vs_scrub_errors++;
+		mutex_exit(&vd->vdev_stat_lock);
 
 		return (ERESTART);
 	}
 
 	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
 
-	/*
-	 * Keep track of how much data we've examined so that
-	 * zpool(1M) status can make useful progress reports.
-	 */
-	mutex_enter(&vd->vdev_stat_lock);
-	vd->vdev_stat.vs_scrub_examined += BP_GET_ASIZE(bp);
-	mutex_exit(&vd->vdev_stat_lock);
+	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
+
+		ASSERT(vd != NULL);
+
+		/*
+		 * Keep track of how much data we've examined so that
+		 * zpool(1M) status can make useful progress reports.
+		 */
+		mutex_enter(&vd->vdev_stat_lock);
+		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
+		mutex_exit(&vd->vdev_stat_lock);
 
-	if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
-		if (DVA_GET_GANG(&bp->blk_dva[0])) {
-			/*
-			 * Gang members may be spread across multiple vdevs,
-			 * so the best we can do is look at the pool-wide DTL.
-			 * XXX -- it would be better to change our allocation
-			 * policy to ensure that this can't happen.
-			 */
-			vd = spa->spa_root_vdev;
+		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
+			if (DVA_GET_GANG(&dva[d])) {
+				/*
+				 * Gang members may be spread across multiple
+				 * vdevs, so the best we can do is look at the
+				 * pool-wide DTL.
+				 * XXX -- it would be better to change our
+				 * allocation policy to ensure that this can't
+				 * happen.
+				 */
+				vd = spa->spa_root_vdev;
+			}
+			if (vdev_dtl_contains(&vd->vdev_dtl_map,
+			    bp->blk_birth, 1))
+				needs_resilver = B_TRUE;
 		}
-		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
-			spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
-			    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
-		}
-	} else {
+	}
+
+	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
 		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
-	}
+	else if (needs_resilver)
+		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
+		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
 
 	return (0);
 }
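A condensed restatement of the resilver decision above (a sketch, not the literal code, which also charges per-DVA scrub statistics):

	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[d]));
		if (DVA_GET_GANG(&bp->blk_dva[d]))
			vd = spa->spa_root_vdev;	/* pool-wide DTL */
		if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1))
			needs_resilver = B_TRUE;
	}

A single scrub or resilver i/o is then started per blkptr via spa_scrub_io_start(), rather than one per DVA.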
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Mon Apr 10 05:03:38 2006 -0700
@@ -52,60 +52,60 @@
  *
  * spa_namespace_lock (global mutex)
  *
- * 	This lock must be acquired to do any of the following:
+ *	This lock must be acquired to do any of the following:
  *
- * 		- Lookup a spa_t by name
- * 		- Add or remove a spa_t from the namespace
- * 		- Increase spa_refcount from non-zero
- * 		- Check if spa_refcount is zero
- * 		- Rename a spa_t
+ *		- Lookup a spa_t by name
+ *		- Add or remove a spa_t from the namespace
+ *		- Increase spa_refcount from non-zero
+ *		- Check if spa_refcount is zero
+ *		- Rename a spa_t
  *		- add/remove/attach/detach devices
- * 		- Held for the duration of create/destroy/import/export
+ *		- Held for the duration of create/destroy/import/export
  *
- * 	It does not need to handle recursion.  A create or destroy may
- * 	reference objects (files or zvols) in other pools, but by
- * 	definition they must have an existing reference, and will never need
- * 	to lookup a spa_t by name.
+ *	It does not need to handle recursion.  A create or destroy may
+ *	reference objects (files or zvols) in other pools, but by
+ *	definition they must have an existing reference, and will never need
+ *	to lookup a spa_t by name.
  *
  * spa_refcount (per-spa refcount_t protected by mutex)
  *
- * 	This reference count keep track of any active users of the spa_t.  The
- * 	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
- * 	the refcount is never really 'zero' - opening a pool implicitly keeps
- * 	some references in the DMU.  Internally we check against SPA_MINREF, but
- * 	present the image of a zero/non-zero value to consumers.
+ *	This reference count keeps track of any active users of the spa_t.  The
+ *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
+ *	the refcount is never really 'zero' - opening a pool implicitly keeps
+ *	some references in the DMU.  Internally we check against SPA_MINREF, but
+ *	present the image of a zero/non-zero value to consumers.
  *
  * spa_config_lock (per-spa crazy rwlock)
  *
- * 	This SPA special is a recursive rwlock, capable of being acquired from
- * 	asynchronous threads.  It has protects the spa_t from config changes,
- * 	and must be held in the following circumstances:
+ *	This SPA special is a recursive rwlock, capable of being acquired from
+ *	asynchronous threads.  It protects the spa_t from config changes,
+ *	and must be held in the following circumstances:
  *
- * 		- RW_READER to perform I/O to the spa
- * 		- RW_WRITER to change the vdev config
+ *		- RW_READER to perform I/O to the spa
+ *		- RW_WRITER to change the vdev config
  *
  * spa_config_cache_lock (per-spa mutex)
  *
- * 	This mutex prevents the spa_config nvlist from being updated.  No
+ *	This mutex prevents the spa_config nvlist from being updated.  No
  *      other locks are required to obtain this lock, although implicitly you
  *      must have the namespace lock or non-zero refcount to have any kind
  *      of spa_t pointer at all.
  *
  * The locking order is fairly straightforward:
  *
- * 		spa_namespace_lock	->	spa_refcount
+ *		spa_namespace_lock	->	spa_refcount
  *
- * 	The namespace lock must be acquired to increase the refcount from 0
- * 	or to check if it is zero.
+ *	The namespace lock must be acquired to increase the refcount from 0
+ *	or to check if it is zero.
  *
- * 		spa_refcount 		->	spa_config_lock
+ *		spa_refcount		->	spa_config_lock
  *
- * 	There must be at least one valid reference on the spa_t to acquire
- * 	the config lock.
+ *	There must be at least one valid reference on the spa_t to acquire
+ *	the config lock.
  *
- * 		spa_namespace_lock	->	spa_config_lock
+ *		spa_namespace_lock	->	spa_config_lock
  *
- * 	The namespace lock must always be taken before the config lock.
+ *	The namespace lock must always be taken before the config lock.
  *
  *
  * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
@@ -114,53 +114,53 @@
  * The namespace is manipulated using the following functions, all which require
  * the spa_namespace_lock to be held.
  *
- * 	spa_lookup()		Lookup a spa_t by name.
+ *	spa_lookup()		Lookup a spa_t by name.
  *
- * 	spa_add()		Create a new spa_t in the namespace.
+ *	spa_add()		Create a new spa_t in the namespace.
  *
- * 	spa_remove()		Remove a spa_t from the namespace.  This also
- * 				frees up any memory associated with the spa_t.
+ *	spa_remove()		Remove a spa_t from the namespace.  This also
+ *				frees up any memory associated with the spa_t.
  *
- * 	spa_next()		Returns the next spa_t in the system, or the
- * 				first if NULL is passed.
+ *	spa_next()		Returns the next spa_t in the system, or the
+ *				first if NULL is passed.
  *
- * 	spa_evict_all()		Shutdown and remove all spa_t structures in
- * 				the system.
+ *	spa_evict_all()		Shutdown and remove all spa_t structures in
+ *				the system.
  *
  *	spa_guid_exists()	Determine whether a pool/device guid exists.
  *
  * The spa_refcount is manipulated using the following functions:
  *
- * 	spa_open_ref()		Adds a reference to the given spa_t.  Must be
- * 				called with spa_namespace_lock held if the
- * 				refcount is currently zero.
+ *	spa_open_ref()		Adds a reference to the given spa_t.  Must be
+ *				called with spa_namespace_lock held if the
+ *				refcount is currently zero.
  *
- * 	spa_close()		Remove a reference from the spa_t.  This will
- * 				not free the spa_t or remove it from the
- * 				namespace.  No locking is required.
+ *	spa_close()		Remove a reference from the spa_t.  This will
+ *				not free the spa_t or remove it from the
+ *				namespace.  No locking is required.
  *
- * 	spa_refcount_zero()	Returns true if the refcount is currently
- * 				zero.  Must be called with spa_namespace_lock
- * 				held.
+ *	spa_refcount_zero()	Returns true if the refcount is currently
+ *				zero.  Must be called with spa_namespace_lock
+ *				held.
  *
  * The spa_config_lock is manipulated using the following functions:
  *
- * 	spa_config_enter()	Acquire the config lock as RW_READER or
- * 				RW_WRITER.  At least one reference on the spa_t
- * 				must exist.
+ *	spa_config_enter()	Acquire the config lock as RW_READER or
+ *				RW_WRITER.  At least one reference on the spa_t
+ *				must exist.
  *
- * 	spa_config_exit()	Release the config lock.
+ *	spa_config_exit()	Release the config lock.
  *
- * 	spa_config_held()	Returns true if the config lock is currently
- * 				held in the given state.
+ *	spa_config_held()	Returns true if the config lock is currently
+ *				held in the given state.
  *
  * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
  *
- * 	spa_vdev_enter()	Acquire the namespace lock and the config lock
+ *	spa_vdev_enter()	Acquire the namespace lock and the config lock
  *				for writing.
  *
- * 	spa_vdev_exit()		Release the config lock, wait for all I/O
- * 				to complete, sync the updated configs to the
+ *	spa_vdev_exit()		Release the config lock, wait for all I/O
+ *				to complete, sync the updated configs to the
  *				cache, and release the namespace lock.
  *
  * The spa_name() function also requires either the spa_namespace_lock
@@ -173,6 +173,7 @@
 kmutex_t spa_namespace_lock;
 static kcondvar_t spa_namespace_cv;
 static int spa_active_count;
+static int spa_max_replication_override = SPA_DVAS_PER_BP;
 
 kmem_cache_t *spa_buffer_pool;
 int spa_mode;
@@ -617,8 +618,7 @@
 void
 sprintf_blkptr(char *buf, int len, blkptr_t *bp)
 {
-	/* XXBP - Need to see if we want all DVAs or not */
-	dva_t *dva = BP_IDENTITY(bp);
+	int d;
 
 	if (bp == NULL) {
 		(void) snprintf(buf, len, "<NULL>");
@@ -630,20 +630,27 @@
 		return;
 	}
 
-	(void) snprintf(buf, len, "[L%llu %s] vdev=%llu offset=%llx "
-	    "size=%llxL/%llxP/%llxA %s %s %s %s "
-	    "birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
+	(void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ",
 	    (u_longlong_t)BP_GET_LEVEL(bp),
 	    dmu_ot[BP_GET_TYPE(bp)].ot_name,
-	    (u_longlong_t)DVA_GET_VDEV(dva),
-	    (u_longlong_t)DVA_GET_OFFSET(dva),
 	    (u_longlong_t)BP_GET_LSIZE(bp),
-	    (u_longlong_t)BP_GET_PSIZE(bp),
-	    (u_longlong_t)DVA_GET_ASIZE(dva),
+	    (u_longlong_t)BP_GET_PSIZE(bp));
+
+	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+		dva_t *dva = &bp->blk_dva[d];
+		(void) snprintf(buf + strlen(buf), len - strlen(buf),
+		    "DVA[%d]=<%llu:%llx:%llx> ", d,
+		    (u_longlong_t)DVA_GET_VDEV(dva),
+		    (u_longlong_t)DVA_GET_OFFSET(dva),
+		    (u_longlong_t)DVA_GET_ASIZE(dva));
+	}
+
+	(void) snprintf(buf + strlen(buf), len - strlen(buf),
+	    "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx",
 	    zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name,
 	    zio_compress_table[BP_GET_COMPRESS(bp)].ci_name,
 	    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",
-	    DVA_GET_GANG(dva) == 0 ? "contiguous" : "gang",
+	    BP_IS_GANG(bp) ? "gang" : "contiguous",
 	    (u_longlong_t)bp->blk_birth,
 	    (u_longlong_t)bp->blk_fill,
 	    (u_longlong_t)bp->blk_cksum.zc_word[0],
@@ -796,8 +803,29 @@
 	/*
 	 * For now, the worst case is 512-byte RAID-Z blocks, in which
 	 * case the space requirement is exactly 2x; so just assume that.
+	 * Add to this the fact that we can have up to 3 DVAs per bp, and
+	 * we have to multiply by a total of 6x.
 	 */
-	return (lsize << 1);
+	return (lsize * 6);
+}
+
+uint64_t
+spa_version(spa_t *spa)
+{
+	return (spa->spa_ubsync.ub_version);
+}
+
+int
+spa_max_replication(spa_t *spa)
+{
+	/*
+	 * As of ZFS_VERSION == ZFS_VERSION_DITTO_BLOCKS, we are able to
+	 * handle BPs with more than one DVA allocated.  Set our max
+	 * replication level accordingly.
+	 */
+	if (spa_version(spa) < ZFS_VERSION_DITTO_BLOCKS)
+		return (1);
+	return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
 }
 
 /*
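The spa_get_asize() change is straight arithmetic: the existing worst case for one copy is 2x (512-byte RAID-Z blocks), and a bp can now carry up to SPA_DVAS_PER_BP == 3 copies, so the pessimistic inflation becomes 2 * 3 = 6x.  A 128K logical block, for example, must be assumed to consume as much as 768K of allocated space for accounting purposes.  spa_max_replication() gates the feature at runtime: a pool whose on-disk version predates ZFS_VERSION_DITTO_BLOCKS reports a maximum of 1 copy, and spa_max_replication_override supplies a global ceiling (default SPA_DVAS_PER_BP).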
--- a/usr/src/uts/common/fs/zfs/sys/arc.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h	Mon Apr 10 05:03:38 2006 -0700
@@ -75,7 +75,7 @@
 int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
     arc_done_func_t *done, void *private, int priority, int flags,
     uint32_t arc_flags, zbookmark_t *zb);
-int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
     arc_done_func_t *done, void *private, int priority, int flags,
     uint32_t arc_flags, zbookmark_t *zb);
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Mon Apr 10 05:03:38 2006 -0700
@@ -56,6 +56,8 @@
 struct dnode;
 struct drr_begin;
 struct drr_end;
+struct zbookmark;
+struct spa;
 
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
@@ -263,6 +265,12 @@
     dmu_tx_t *tx);
 
 /*
+ * Decide how many copies of a given block we should make.  Can be from
+ * 1 to SPA_DVAS_PER_BP.
+ */
+int dmu_get_replication_level(struct spa *spa, struct zbookmark *zb,
+    dmu_object_type_t ot);
+/*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a
  * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
--- a/usr/src/uts/common/fs/zfs/sys/metaslab.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/metaslab.h	Mon Apr 10 05:03:38 2006 -0700
@@ -47,7 +47,8 @@
 extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
 extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
 
-extern int metaslab_alloc(spa_t *spa, uint64_t size, dva_t *dva, uint64_t txg);
+extern int metaslab_alloc(spa_t *spa, uint64_t psize, blkptr_t *bp,
+    int ncopies, uint64_t txg, blkptr_t *hintbp);
 extern void metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg, boolean_t now);
 extern int metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg);
 
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Mon Apr 10 05:03:38 2006 -0700
@@ -234,6 +234,16 @@
 	(DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	DVA_GET_ASIZE(&(bp)->blk_dva[2]))
 
+#define	BP_GET_NDVAS(bp)	\
+	(!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+	!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+
+#define	BP_COUNT_GANG(bp)	\
+	(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
+	DVA_GET_GANG(&(bp)->blk_dva[1]) + \
+	DVA_GET_GANG(&(bp)->blk_dva[2]))
+
 #define	DVA_EQUAL(dva1, dva2)	\
 	((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
 	(dva1)->dva_word[0] == (dva2)->dva_word[0])
@@ -248,10 +258,10 @@
 	(zcp)->zc_word[3] = w3;			\
 }
 
+#define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
+#define	BP_IS_GANG(bp)		DVA_GET_GANG(BP_IDENTITY(bp))
 #define	BP_IS_HOLE(bp)		((bp)->blk_birth == 0)
 
-#define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
-
 #define	BP_ZERO(bp)				\
 {						\
 	(bp)->blk_dva[0].dva_word[0] = 0;	\
@@ -281,7 +291,7 @@
 
 #define	BP_SHOULD_BYTESWAP(bp)	(BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
 
-#define	BP_SPRINTF_LEN	256
+#define	BP_SPRINTF_LEN	320
 
 #include <sys/dmu.h>
 
@@ -297,7 +307,7 @@
 extern int spa_import(const char *pool, nvlist_t *config, const char *altroot);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(char *pool);
-extern int spa_export(char *pool);
+extern int spa_export(char *pool, nvlist_t **oldconfig);
 extern int spa_reset(char *pool);
 extern void spa_async_request(spa_t *spa, int flag);
 extern void spa_async_suspend(spa_t *spa);
@@ -387,6 +397,8 @@
 extern uint64_t spa_get_alloc(spa_t *spa);
 extern uint64_t spa_get_space(spa_t *spa);
 extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
+extern uint64_t spa_version(spa_t *spa);
+extern int spa_max_replication(spa_t *spa);
 extern int spa_busy(void);
 
 /* Miscellaneous support routines */
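
A note on BP_GET_NDVAS: an unused DVA slot has zero ASIZE, and !! collapses
any nonzero value to 1, so summing the three slots counts the copies that
were actually allocated.  A stand-alone demonstration of the idiom, with
plain integers in place of the DVA words:

	#include <stdio.h>

	/* !!x is 1 for any nonzero x, so the sum counts non-empty slots. */
	static int
	count_ndvas(unsigned long long a0, unsigned long long a1,
	    unsigned long long a2)
	{
		return (!!a0 + !!a1 + !!a2);
	}

	int
	main(void)
	{
		/* Two copies allocated, third slot empty: prints 2. */
		printf("%d\n", count_ndvas(0x4000, 0x4000, 0));
		return (0);
	}

BP_COUNT_GANG works the same way because DVA_GET_GANG already yields 0 or 1;
that is what lets zio.c assert later in this patch that a block has either
no gang DVAs or nothing but gang DVAs.
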
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Mon Apr 10 05:03:38 2006 -0700
@@ -80,6 +80,7 @@
 extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
     boolean_t complete);
 extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
+extern void vdev_propagate_state(vdev_t *vd);
 extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
     vdev_aux_t aux);
 
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Mon Apr 10 05:03:38 2006 -0700
@@ -34,6 +34,7 @@
 #include <sys/avl.h>
 #include <sys/dkio.h>
 #include <sys/fs/zfs.h>
+#include <sys/zio_impl.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -58,9 +59,8 @@
 	(SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\
 	sizeof (uint64_t))
 
-#define	ZIO_GET_DVA(zio)	(&(zio)->io_bp->blk_dva[(zio)->io_dva_index])
 #define	ZIO_GET_IOSIZE(zio)	\
-	(DVA_GET_GANG(ZIO_GET_DVA(zio)) ? \
+	(BP_IS_GANG((zio)->io_bp) ? \
 	SPA_GANGBLOCKSIZE : BP_GET_PSIZE((zio)->io_bp))
 
 typedef struct zio_gbh {
@@ -152,7 +152,6 @@
 
 typedef struct zio zio_t;
 typedef void zio_done_func_t(zio_t *zio);
-typedef struct zio_transform zio_transform_t;
 
 extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
 extern char *zio_type_name[ZIO_TYPES];
@@ -190,9 +189,9 @@
 	zio_t		*io_root;
 	spa_t		*io_spa;
 	zbookmark_t	io_bookmark;
-	int		io_checksum;
-	int		io_compress;
-	int		io_dva_index;
+	enum zio_checksum io_checksum;
+	enum zio_compress io_compress;
+	int		io_ndvas;
 	uint64_t	io_txg;
 	blkptr_t	*io_bp;
 	blkptr_t	io_bp_copy;
@@ -225,8 +224,8 @@
 
 	/* Internal pipeline state */
 	int		io_flags;
-	uint8_t		io_type;
-	uint8_t		io_stage;
+	enum zio_type	io_type;
+	enum zio_stage	io_stage;
 	uint8_t		io_stalled;
 	uint8_t		io_priority;
 	struct dk_callback io_dk_callback;
@@ -257,7 +256,7 @@
     int priority, int flags, zbookmark_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
-    uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
+    int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
     zio_done_func_t *done, void *private, int priority, int flags,
     zbookmark_t *zb);
 
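
Together with the dmu.h change, the intended flow is that the DMU chooses a
copy count once per block and threads it through zio_write() into io_ndvas,
where the DVA allocator picks it up.  A hedged caller fragment built only
from the signatures this patch introduces; the checksum, compression,
priority, flag, callback, and bookmark values are placeholders:

	int ncopies = dmu_get_replication_level(spa, &zb, ot);

	zio_nowait(zio_write(pio, spa, ZIO_CHECKSUM_FLETCHER_4,
	    ZIO_COMPRESS_LZJB, ncopies, txg, bp, data, size,
	    write_done_cb, cb_arg, ZIO_PRIORITY_ASYNC_WRITE,
	    ZIO_FLAG_CANFAIL, &zb));
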
--- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h	Mon Apr 10 05:03:38 2006 -0700
@@ -61,9 +61,6 @@
 
 	ZIO_STAGE_READY,			/* RWFCI */
 
-	ZIO_STAGE_DVA_TRANSLATE,		/* RW--- */
-
-	ZIO_STAGE_VDEV_IO_SETUP,		/* RW--I */
 	ZIO_STAGE_VDEV_IO_START,		/* RW--I */
 	ZIO_STAGE_VDEV_IO_DONE,			/* RW--I */
 	ZIO_STAGE_VDEV_IO_ASSESS,		/* RW--I */
@@ -88,8 +85,7 @@
 	(1U << ZIO_STAGE_READ_DECOMPRESS))
 
 #define	ZIO_VDEV_IO_PIPELINE					\
-	((1U << ZIO_STAGE_VDEV_IO_SETUP) |			\
-	(1U << ZIO_STAGE_VDEV_IO_START) |			\
+	((1U << ZIO_STAGE_VDEV_IO_START) |			\
 	(1U << ZIO_STAGE_VDEV_IO_DONE) |			\
 	(1U << ZIO_STAGE_VDEV_IO_ASSESS))
 
@@ -103,8 +99,7 @@
 	(1U << ZIO_STAGE_DONE))
 
 #define	ZIO_READ_PIPELINE					\
-	((1U << ZIO_STAGE_DVA_TRANSLATE) |			\
-	ZIO_READ_PHYS_PIPELINE)
+	ZIO_READ_PHYS_PIPELINE
 
 #define	ZIO_WRITE_PHYS_PIPELINE					\
 	((1U << ZIO_STAGE_OPEN) |				\
@@ -116,8 +111,7 @@
 	(1U << ZIO_STAGE_DONE))
 
 #define	ZIO_WRITE_COMMON_PIPELINE				\
-	((1U << ZIO_STAGE_DVA_TRANSLATE) |			\
-	ZIO_WRITE_PHYS_PIPELINE)
+	ZIO_WRITE_PHYS_PIPELINE
 
 #define	ZIO_WRITE_PIPELINE					\
 	((1U << ZIO_STAGE_WRITE_COMPRESS) |			\
@@ -193,6 +187,7 @@
 #define	ZIO_ERROR_PIPELINE_MASK					\
 	ZIO_WAIT_FOR_CHILDREN_PIPELINE
 
+typedef struct zio_transform zio_transform_t;
 struct zio_transform {
 	void		*zt_data;
 	uint64_t	zt_size;
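
Deleting DVA_TRANSLATE and VDEV_IO_SETUP is this cheap because a zio
pipeline is only a bitmask with one bit per stage; a stage is skipped by
leaving its bit clear.  A self-contained model of the dispatch (the stage
names are invented for the example):

	#include <stdio.h>

	enum stage { S_OPEN, S_IO_START, S_IO_DONE, S_IO_ASSESS, S_DONE };

	#define	STG(s)	(1U << (s))

	/* Advance to the next stage whose bit is set in the pipeline. */
	static enum stage
	next_stage(unsigned int pipeline, enum stage cur)
	{
		do {
			cur++;
		} while (cur < S_DONE && (pipeline & STG(cur)) == 0);
		return (cur);
	}

	int
	main(void)
	{
		/* With S_IO_DONE left out, dispatch skips past it. */
		unsigned int p = STG(S_IO_START) | STG(S_IO_ASSESS) |
		    STG(S_DONE);

		printf("%d\n", next_stage(p, S_IO_START));	/* prints 3 */
		return (0);
	}
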
--- a/usr/src/uts/common/fs/zfs/vdev.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Mon Apr 10 05:03:38 2006 -0700
@@ -847,31 +847,16 @@
 vdev_reopen(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
-	vdev_t *rvd = spa->spa_root_vdev;
-	int c;
 
 	ASSERT(spa_config_held(spa, RW_WRITER));
 
-	if (vd == rvd) {
-		for (c = 0; c < rvd->vdev_children; c++)
-			vdev_reopen(rvd->vdev_child[c]);
-		return;
-	}
-
-	/* only valid for top-level vdevs */
-	ASSERT3P(vd, ==, vd->vdev_top);
-
 	vdev_close(vd);
 	(void) vdev_open(vd);
 
 	/*
 	 * Reassess root vdev's health.
 	 */
-	rvd->vdev_state = VDEV_STATE_HEALTHY;
-	for (c = 0; c < rvd->vdev_children; c++) {
-		uint64_t state = rvd->vdev_child[c]->vdev_state;
-		rvd->vdev_state = MIN(rvd->vdev_state, state);
-	}
+	vdev_propagate_state(spa->spa_root_vdev);
 }
 
 int
@@ -1741,6 +1726,39 @@
 	list_remove(&spa->spa_dirty_list, vd);
 }
 
+void
+vdev_propagate_state(vdev_t *vd)
+{
+	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+	int degraded = 0, faulted = 0;
+	int corrupted = 0;
+	int c;
+	vdev_t *child;
+
+	for (c = 0; c < vd->vdev_children; c++) {
+		child = vd->vdev_child[c];
+		if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
+			faulted++;
+		else if (child->vdev_state == VDEV_STATE_DEGRADED)
+			degraded++;
+
+		if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+			corrupted++;
+	}
+
+	vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+
+	/*
+	 * Root special: if there is a toplevel vdev that cannot be
+	 * opened due to corrupted metadata, then propagate the root
+	 * vdev's aux state as 'corrupt' rather than 'insufficient
+	 * replicas'.
+	 */
+	if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN)
+		vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+}
+
 /*
  * Set a vdev's state.  If this is during an open, we don't update the parent
  * state, because we're in the process of opening children depth-first.
@@ -1810,36 +1828,6 @@
 	if (isopen)
 		return;
 
-	if (vd->vdev_parent != NULL) {
-		int c;
-		int degraded = 0, faulted = 0;
-		int corrupted = 0;
-		vdev_t *parent, *child;
-
-		parent = vd->vdev_parent;
-		for (c = 0; c < parent->vdev_children; c++) {
-			child = parent->vdev_child[c];
-			if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
-				faulted++;
-			else if (child->vdev_state == VDEV_STATE_DEGRADED)
-				degraded++;
-
-			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
-				corrupted++;
-		}
-
-		vd->vdev_parent->vdev_ops->vdev_op_state_change(
-		    vd->vdev_parent, faulted, degraded);
-
-		/*
-		 * Root special: if this is a toplevel vdev that cannot be
-		 * opened due to corrupted metadata, then propagate the root
-		 * vdev's aux state as 'corrupt' rather than 'insufficient
-		 * replicas'.
-		 */
-		if (corrupted && vd == vd->vdev_top)
-			vdev_set_state(vd->vdev_spa->spa_root_vdev,
-			    B_FALSE, VDEV_STATE_CANT_OPEN,
-			    VDEV_AUX_CORRUPT_DATA);
-	}
+	if (vd->vdev_parent != NULL)
+		vdev_propagate_state(vd->vdev_parent);
 }
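
vdev_propagate_state() centralizes the child-state roll-up that previously
lived inline in vdev_set_state().  A stand-alone model of the aggregation it
feeds to vdev_op_state_change(), using a mirror-like rule; the states here
are simplified stand-ins:

	#include <stdio.h>

	enum model_state { M_CANT_OPEN, M_DEGRADED, M_HEALTHY };

	/* All children faulted: can't open; any fault/degrade: degraded. */
	static enum model_state
	mirror_like_state(int children, const enum model_state *child)
	{
		int faulted = 0, degraded = 0, c;

		for (c = 0; c < children; c++) {
			if (child[c] == M_CANT_OPEN)
				faulted++;
			else if (child[c] == M_DEGRADED)
				degraded++;
		}
		if (faulted == children)
			return (M_CANT_OPEN);
		return (faulted + degraded > 0 ? M_DEGRADED : M_HEALTHY);
	}

	int
	main(void)
	{
		enum model_state kids[3] =
		    { M_HEALTHY, M_CANT_OPEN, M_HEALTHY };

		/* One dead child of three: parent degraded (prints 1). */
		printf("%d\n", mirror_like_state(3, kids));
		return (0);
	}
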
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c	Mon Apr 10 05:03:38 2006 -0700
@@ -35,25 +35,85 @@
  * Virtual device vector for mirroring.
  */
 
+typedef struct mirror_child {
+	vdev_t		*mc_vd;
+	uint64_t	mc_offset;
+	int		mc_error;
+	short		mc_tried;
+	short		mc_skipped;
+} mirror_child_t;
+
 typedef struct mirror_map {
-	int	mm_error;
-	short	mm_tried;
-	short	mm_skipped;
+	int		mm_children;
+	int		mm_replacing;
+	int		mm_preferred;
+	int		mm_root;
+	mirror_child_t	mm_child[1];
 } mirror_map_t;
 
 static mirror_map_t *
 vdev_mirror_map_alloc(zio_t *zio)
 {
-	zio->io_vsd = kmem_zalloc(zio->io_vd->vdev_children *
-	    sizeof (mirror_map_t), KM_SLEEP);
-	return (zio->io_vsd);
+	mirror_map_t *mm = NULL;
+	mirror_child_t *mc;
+	vdev_t *vd = zio->io_vd;
+	int c, d;
+
+	if (vd == NULL) {
+		dva_t *dva = zio->io_bp->blk_dva;
+		spa_t *spa = zio->io_spa;
+
+		c = BP_GET_NDVAS(zio->io_bp);
+
+		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+		mm->mm_children = c;
+		mm->mm_replacing = B_FALSE;
+		mm->mm_preferred = spa_get_random(c);
+		mm->mm_root = B_TRUE;
+
+		/*
+		 * Check the other, lower-index DVAs to see if any of them
+		 * is on the same vdev as the child we picked.  If so, prefer
+		 * the lowest such DVA: it was likely allocated from the
+		 * primary metaslab in use at the time, and hence is more
+		 * likely to have locality with single-copy data.
+		 */
+		for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
+			if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
+				mm->mm_preferred = d;
+		}
+
+		for (c = 0; c < mm->mm_children; c++) {
+			mc = &mm->mm_child[c];
+			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
+			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
+		}
+	} else {
+		c = vd->vdev_children;
+
+		mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
+		mm->mm_children = c;
+		mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops);
+		mm->mm_preferred = mm->mm_replacing ? 0 : spa_get_random(c);
+		mm->mm_root = B_FALSE;
+
+		for (c = 0; c < mm->mm_children; c++) {
+			mc = &mm->mm_child[c];
+			mc->mc_vd = vd->vdev_child[c];
+			mc->mc_offset = zio->io_offset;
+		}
+	}
+
+	zio->io_vsd = mm;
+	return (mm);
 }
 
 static void
 vdev_mirror_map_free(zio_t *zio)
 {
-	kmem_free(zio->io_vsd,
-	    zio->io_vd->vdev_children * sizeof (mirror_map_t));
+	mirror_map_t *mm = zio->io_vsd;
+
+	kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
 	zio->io_vsd = NULL;
 }
 
@@ -103,30 +163,31 @@
 static void
 vdev_mirror_child_done(zio_t *zio)
 {
-	mirror_map_t *mm = zio->io_private;
+	mirror_child_t *mc = zio->io_private;
 
-	mm->mm_error = zio->io_error;
-	mm->mm_tried = 1;
-	mm->mm_skipped = 0;
+	mc->mc_error = zio->io_error;
+	mc->mc_tried = 1;
+	mc->mc_skipped = 0;
 }
 
 static void
 vdev_mirror_scrub_done(zio_t *zio)
 {
-	mirror_map_t *mm = zio->io_private;
+	mirror_child_t *mc = zio->io_private;
 
 	if (zio->io_error == 0) {
 		zio_t *pio = zio->io_parent;
 		mutex_enter(&pio->io_lock);
+		ASSERT3U(zio->io_size, >=, pio->io_size);
 		bcopy(zio->io_data, pio->io_data, pio->io_size);
 		mutex_exit(&pio->io_lock);
 	}
 
 	zio_buf_free(zio->io_data, zio->io_size);
 
-	mm->mm_error = zio->io_error;
-	mm->mm_tried = 1;
-	mm->mm_skipped = 0;
+	mc->mc_error = zio->io_error;
+	mc->mc_tried = 1;
+	mc->mc_skipped = 0;
 }
 
 static void
@@ -144,60 +205,42 @@
 vdev_mirror_child_select(zio_t *zio)
 {
 	mirror_map_t *mm = zio->io_vsd;
-	vdev_t *vd = zio->io_vd;
-	vdev_t *cvd;
+	mirror_child_t *mc;
 	uint64_t txg = zio->io_txg;
 	int i, c;
 
 	ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
 
 	/*
-	 * Select the child we'd like to read from absent any errors.
-	 * The current policy is to alternate sides at 8M granularity.
-	 * XXX -- investigate other policies for read distribution.
-	 */
-	c = (zio->io_offset >> (SPA_MAXBLOCKSHIFT + 6)) % vd->vdev_children;
-
-	/*
-	 * If this is a replacing vdev, always try child 0 (the source) first.
-	 */
-	if (vd->vdev_ops == &vdev_replacing_ops)
-		c = 0;
-
-	/*
 	 * Try to find a child whose DTL doesn't contain the block to read.
 	 * If a child is known to be completely inaccessible (indicated by
 	 * vdev_is_dead() returning B_TRUE), don't even try.
 	 */
-	for (i = 0; i < vd->vdev_children; i++, c++) {
-		if (c >= vd->vdev_children)
+	for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
+		if (c >= mm->mm_children)
 			c = 0;
-		if (mm[c].mm_tried || mm[c].mm_skipped)
+		mc = &mm->mm_child[c];
+		if (mc->mc_tried || mc->mc_skipped)
 			continue;
-		cvd = vd->vdev_child[c];
-		if (vdev_is_dead(cvd)) {
-			mm[c].mm_error = ENXIO;
-			mm[c].mm_tried = 1;	/* don't even try */
-			mm[c].mm_skipped = 1;
+		if (vdev_is_dead(mc->mc_vd)) {
+			mc->mc_error = ENXIO;
+			mc->mc_tried = 1;	/* don't even try */
+			mc->mc_skipped = 1;
 			continue;
 		}
-		if (!vdev_dtl_contains(&cvd->vdev_dtl_map, txg, 1))
+		if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1))
 			return (c);
-		mm[c].mm_error = ESTALE;
-		mm[c].mm_skipped = 1;
+		mc->mc_error = ESTALE;
+		mc->mc_skipped = 1;
 	}
 
 	/*
 	 * Every device is either missing or has this txg in its DTL.
-	 * If we don't have any sibling replicas to consult, look for
-	 * any child we haven't already tried before giving up.
+	 * Look for any child we haven't already tried before giving up.
 	 */
-	if (vd == vd->vdev_top || vd->vdev_parent->vdev_children <= 1) {
-		for (c = 0; c < vd->vdev_children; c++) {
-			if (!mm[c].mm_tried)
-				return (c);
-		}
-	}
+	for (c = 0; c < mm->mm_children; c++)
+		if (!mm->mm_child[c].mc_tried)
+			return (c);
 
 	/*
 	 * Every child failed.  There's no place left to look.
@@ -208,28 +251,28 @@
 static void
 vdev_mirror_io_start(zio_t *zio)
 {
-	vdev_t *vd = zio->io_vd;
 	mirror_map_t *mm;
+	mirror_child_t *mc;
 	int c, children;
 
 	mm = vdev_mirror_map_alloc(zio);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
-		if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
-		    vd->vdev_ops != &vdev_replacing_ops) {
+		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
 			/*
 			 * For scrubbing reads we need to allocate a read
 			 * buffer for each child and issue reads to all
 			 * children.  If any child succeeds, it will copy its
 			 * data into zio->io_data in vdev_mirror_scrub_done.
 			 */
-			for (c = 0; c < vd->vdev_children; c++) {
+			for (c = 0; c < mm->mm_children; c++) {
+				mc = &mm->mm_child[c];
 				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
-				    vd->vdev_child[c], zio->io_offset,
+				    mc->mc_vd, mc->mc_offset,
 				    zio_buf_alloc(zio->io_size), zio->io_size,
 				    zio->io_type, zio->io_priority,
-				    ZIO_FLAG_CANFAIL, vdev_mirror_scrub_done,
-				    &mm[c]));
+				    ZIO_FLAG_CANFAIL,
+				    vdev_mirror_scrub_done, mc));
 			}
 			zio_wait_children_done(zio);
 			return;
@@ -248,23 +291,23 @@
 		 * first child happens to have a DTL entry here as well.
 		 * All other writes go to all children.
 		 */
-		if ((zio->io_flags & ZIO_FLAG_RESILVER) &&
-		    vd->vdev_ops == &vdev_replacing_ops &&
-		    !vdev_dtl_contains(&vd->vdev_child[0]->vdev_dtl_map,
+		if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing &&
+		    !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map,
 		    zio->io_txg, 1)) {
-			c = vd->vdev_children - 1;
+			c = mm->mm_children - 1;
 			children = 1;
 		} else {
 			c = 0;
-			children = vd->vdev_children;
+			children = mm->mm_children;
 		}
 	}
 
 	while (children--) {
+		mc = &mm->mm_child[c];
 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
-		    vd->vdev_child[c], zio->io_offset, zio->io_data,
-		    zio->io_size, zio->io_type, zio->io_priority,
-		    ZIO_FLAG_CANFAIL, vdev_mirror_child_done, &mm[c]));
+		    mc->mc_vd, mc->mc_offset,
+		    zio->io_data, zio->io_size, zio->io_type, zio->io_priority,
+		    ZIO_FLAG_CANFAIL, vdev_mirror_child_done, mc));
 		c++;
 	}
 
@@ -274,20 +317,19 @@
 static void
 vdev_mirror_io_done(zio_t *zio)
 {
-	vdev_t *vd = zio->io_vd;
-	vdev_t *cvd;
 	mirror_map_t *mm = zio->io_vsd;
+	mirror_child_t *mc;
 	int c;
 	int good_copies = 0;
 	int unexpected_errors = 0;
 
-	ASSERT(mm != NULL);
-
 	zio->io_error = 0;
 	zio->io_numerrors = 0;
 
-	for (c = 0; c < vd->vdev_children; c++) {
-		if (mm[c].mm_tried && mm[c].mm_error == 0) {
+	for (c = 0; c < mm->mm_children; c++) {
+		mc = &mm->mm_child[c];
+
+		if (mc->mc_tried && mc->mc_error == 0) {
 			good_copies++;
 			continue;
 		}
@@ -296,10 +338,10 @@
 		 * We preserve any EIOs because those may be worth retrying;
 		 * whereas ECKSUM and ENXIO are more likely to be persistent.
 		 */
-		if (mm[c].mm_error) {
+		if (mc->mc_error) {
 			if (zio->io_error != EIO)
-				zio->io_error = mm[c].mm_error;
-			if (!mm[c].mm_skipped)
+				zio->io_error = mc->mc_error;
+			if (!mc->mc_skipped)
 				unexpected_errors++;
 			zio->io_numerrors++;
 		}
@@ -308,11 +350,12 @@
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 		/*
 		 * XXX -- for now, treat partial writes as success.
+		 * XXX -- For a replacing vdev, we need to make sure the
+		 *	  new child succeeds.
 		 */
 		/* XXPOLICY */
 		if (good_copies != 0)
 			zio->io_error = 0;
-		ASSERT(mm != NULL);
 		vdev_mirror_map_free(zio);
 		zio_next_stage(zio);
 		return;
@@ -325,17 +368,16 @@
 	 */
 	/* XXPOLICY */
 	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
-		ASSERT(c >= 0 && c < vd->vdev_children);
-		cvd = vd->vdev_child[c];
-		dprintf("%s: retrying i/o (err=%d) on child %s\n",
-		    vdev_description(zio->io_vd), zio->io_error,
-		    vdev_description(cvd));
+		ASSERT(c >= 0 && c < mm->mm_children);
+		mc = &mm->mm_child[c];
+		dprintf("retrying i/o (err=%d) on child %s\n",
+		    zio->io_error, vdev_description(mc->mc_vd));
 		zio->io_error = 0;
 		zio_vdev_io_redone(zio);
-		zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd,
-		    zio->io_offset, zio->io_data, zio->io_size,
+		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+		    mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
 		    ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
-		    vdev_mirror_child_done, &mm[c]));
+		    vdev_mirror_child_done, mc));
 		zio_wait_children_done(zio);
 		return;
 	}
@@ -360,7 +402,7 @@
 		rio = zio_null(zio, zio->io_spa,
 		    vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL);
 
-		for (c = 0; c < vd->vdev_children; c++) {
+		for (c = 0; c < mm->mm_children; c++) {
 			/*
 			 * Don't rewrite known good children.
 			 * Not only is it unnecessary, it could
@@ -368,24 +410,23 @@
 			 * power while rewriting the only good copy,
 			 * there would be no good copies left!
 			 */
-			cvd = vd->vdev_child[c];
+			mc = &mm->mm_child[c];
 
-			if (mm[c].mm_error == 0) {
-				if (mm[c].mm_tried)
+			if (mc->mc_error == 0) {
+				if (mc->mc_tried)
 					continue;
-				if (!vdev_dtl_contains(&cvd->vdev_dtl_map,
+				if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map,
 				    zio->io_txg, 1))
 					continue;
-				mm[c].mm_error = ESTALE;
+				mc->mc_error = ESTALE;
 			}
 
-			dprintf("%s resilvered %s @ 0x%llx error %d\n",
-			    vdev_description(vd),
-			    vdev_description(cvd),
-			    zio->io_offset, mm[c].mm_error);
+			dprintf("resilvered %s @ 0x%llx error %d\n",
+			    vdev_description(mc->mc_vd), mc->mc_offset,
+			    mc->mc_error);
 
-			zio_nowait(zio_vdev_child_io(rio, zio->io_bp, cvd,
-			    zio->io_offset, zio->io_data, zio->io_size,
+			zio_nowait(zio_vdev_child_io(rio, zio->io_bp, mc->mc_vd,
+			    mc->mc_offset, zio->io_data, zio->io_size,
 			    ZIO_TYPE_WRITE, zio->io_priority,
 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
 			    ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
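
When vd is NULL, the map above turns a multi-DVA block pointer into a mirror
over its ditto copies, and mm_preferred implements the locality rule from
the comment.  A stand-alone model of that choice, with plain ints standing
in for vdev ids:

	#include <stdio.h>

	/*
	 * Start from the randomly picked DVA, then take the lowest-index
	 * DVA that lives on the same vdev: low-index copies are the ones
	 * most likely to share locality with single-copy data.
	 */
	static int
	pick_preferred(const int *dva_vdev, int random_pick)
	{
		int preferred = random_pick;
		int d;

		for (d = random_pick - 1; d >= 0; d--) {
			if (dva_vdev[d] == dva_vdev[random_pick])
				preferred = d;
		}
		return (preferred);
	}

	int
	main(void)
	{
		int vdev_of_dva[3] = { 0, 1, 0 };

		/* Random pick was DVA 2; DVA 0 shares its vdev: prints 0. */
		printf("%d\n", pick_preferred(vdev_of_dva, 2));
		return (0);
	}
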
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c	Mon Apr 10 05:03:38 2006 -0700
@@ -272,12 +272,7 @@
 
 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children);
 
-	if (DVA_GET_GANG(ZIO_GET_DVA(zio))) {
-		ASSERT3U(rm->rm_asize, ==,
-		    vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
-	} else {
-		ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio)));
-	}
+	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
 
 	if (zio->io_type == ZIO_TYPE_WRITE) {
 
@@ -357,11 +352,10 @@
 	vdev_t *cvd;
 	raidz_map_t *rm = zio->io_vsd;
 	raidz_col_t *rc;
-	blkptr_t *bp = zio->io_bp;
 	int unexpected_errors = 0;
 	int c;
 
-	ASSERT(bp != NULL);	/* XXX need to add code to enforce this */
+	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
 
 	zio->io_error = 0;
 	zio->io_numerrors = 0;
--- a/usr/src/uts/common/fs/zfs/vdev_root.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev_root.c	Mon Apr 10 05:03:38 2006 -0700
@@ -35,12 +35,29 @@
  * Virtual device vector for the pool's root vdev.
  */
 
+/*
+ * We should be able to tolerate one failure with absolutely no damage
+ * to our metadata.  Two failures will take out space maps, a bunch of
+ * indirect block trees, meta dnodes, dnodes, etc.  Probably not a happy
+ * place to live.  When we get smarter, we can liberalize this policy:
+ * e.g., if we haven't lost two consecutive top-level vdevs, then we are
+ * probably fine.  Adding bean counters during alloc/free can make this
+ * future guesswork more accurate.
+ */
+/*ARGSUSED*/
+static int
+too_many_errors(vdev_t *vd, int numerrors)
+{
+	return (numerrors > 0);
+}
+
 static int
 vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
 {
 	vdev_t *cvd;
 	int c, error;
 	int lasterror = 0;
+	int numerrors = 0;
 
 	if (vd->vdev_children == 0) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
@@ -52,17 +69,20 @@
 
 		if ((error = vdev_open(cvd)) != 0) {
 			lasterror = error;
+			numerrors++;
 			continue;
 		}
 	}
 
-	if (lasterror)
+	if (too_many_errors(vd, numerrors)) {
 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+		return (lasterror);
+	}
 
 	*asize = 0;
 	*ashift = 0;
 
-	return (lasterror);
+	return (0);
 }
 
 static void
@@ -77,7 +97,7 @@
 static void
 vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
 {
-	if (faulted > 0)
+	if (too_many_errors(vd, faulted))
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
 		    VDEV_AUX_NO_REPLICAS);
 	else if (degraded != 0)
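
too_many_errors() is deliberately the most conservative policy possible: any
failed child is fatal.  To make the comment's guesswork concrete, here is a
purely hypothetical sketch (not committed code) of the liberalized rule it
gestures at, on the theory that ditto copies land on neighboring top-level
vdevs:

	#include <stdio.h>

	/* Hypothetical: only two consecutive child losses are fatal. */
	static int
	too_many_errors_liberal(const int *child_failed, int children)
	{
		int c;

		for (c = 0; c < children - 1; c++) {
			if (child_failed[c] && child_failed[c + 1])
				return (1);
		}
		return (0);
	}

	int
	main(void)
	{
		int failed[4] = { 1, 0, 1, 0 };	/* two non-adjacent losses */

		/* Survivable under the liberal rule: prints 0. */
		printf("%d\n", too_many_errors_liberal(failed, 4));
		return (0);
	}
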
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Mon Apr 10 05:03:38 2006 -0700
@@ -392,7 +392,7 @@
 static int
 zfs_ioc_pool_export(zfs_cmd_t *zc)
 {
-	return (spa_export(zc->zc_name));
+	return (spa_export(zc->zc_name, NULL));
 }
 
 static int
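
The NULL here is the new oldconfig argument: spa_export() can now hand the
pool's final configuration back to the caller.  That matters to a consumer
like ztest, where the config can keep changing right up until export and a
stale copy would break the subsequent import.  A hedged fragment of the
pattern, with poolname and the error handling as placeholders:

	nvlist_t *oldconfig = NULL;
	int error = spa_export(poolname, &oldconfig);

	if (error == 0 && oldconfig != NULL)
		error = spa_import(poolname, oldconfig, NULL);
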
--- a/usr/src/uts/common/fs/zfs/zio.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zio.c	Mon Apr 10 05:03:38 2006 -0700
@@ -248,8 +248,6 @@
 		zio->io_bp = bp;
 		zio->io_bp_copy = *bp;
 		zio->io_bp_orig = *bp;
-		/* XXBP - Need to inherit this when it matters */
-		zio->io_dva_index = 0;
 	}
 	zio->io_done = done;
 	zio->io_private = private;
@@ -279,6 +277,7 @@
 		if (pio->io_child != NULL)
 			pio->io_child->io_sibling_prev = zio;
 		pio->io_child = zio;
+		zio->io_ndvas = pio->io_ndvas;
 		mutex_exit(&pio->io_lock);
 	}
 
@@ -310,7 +309,6 @@
     int priority, int flags, zbookmark_t *zb)
 {
 	zio_t *zio;
-	dva_t *dva;
 
 	ASSERT3U(size, ==, BP_GET_LSIZE(bp));
 
@@ -325,9 +323,6 @@
 	 */
 	zio->io_bp = &zio->io_bp_copy;
 
-	bp = zio->io_bp;
-	dva = ZIO_GET_DVA(zio);
-
 	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
 		uint64_t csize = BP_GET_PSIZE(bp);
 		void *cbuf = zio_buf_alloc(csize);
@@ -336,7 +331,7 @@
 		zio->io_pipeline |= 1U << ZIO_STAGE_READ_DECOMPRESS;
 	}
 
-	if (DVA_GET_GANG(dva)) {
+	if (BP_IS_GANG(bp)) {
 		uint64_t gsize = SPA_GANGBLOCKSIZE;
 		void *gbuf = zio_buf_alloc(gsize);
 
@@ -348,7 +343,7 @@
 }
 
 zio_t *
-zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
     zio_done_func_t *done, void *private, int priority, int flags,
     zbookmark_t *zb)
@@ -371,6 +366,7 @@
 
 	zio->io_checksum = checksum;
 	zio->io_compress = compress;
+	zio->io_ndvas = ncopies;
 
 	if (compress != ZIO_COMPRESS_OFF)
 		zio->io_async_stages |= 1U << ZIO_STAGE_WRITE_COMPRESS;
@@ -380,6 +376,10 @@
 		BP_ZERO(bp);
 		BP_SET_LSIZE(bp, size);
 		BP_SET_PSIZE(bp, size);
+	} else {
+		/* Make sure someone doesn't change their mind on overwrites */
+		ASSERT(MIN(zio->io_ndvas + BP_IS_GANG(bp),
+		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
 	}
 
 	return (zio);
@@ -393,7 +393,6 @@
 {
 	zio_t *zio;
 
-	/* XXBP - We need to re-evaluate when to insert pipeline stages */
 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags,
 	    ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
@@ -402,6 +401,9 @@
 	zio->io_checksum = checksum;
 	zio->io_compress = ZIO_COMPRESS_OFF;
 
+	if (pio != NULL)
+		ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
+
 	return (zio);
 }
 
@@ -441,7 +443,6 @@
 		return (zio_null(pio, spa, NULL, NULL, 0));
 	}
 
-	/* XXBP - We need to re-evaluate when to insert pipeline stages */
 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
 	    ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, 0,
 	    ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
@@ -471,7 +472,6 @@
 	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 	ASSERT3U(spa_first_txg(spa), <=, txg);
 
-	/* XXBP - We need to re-evaluate when to insert pipeline stages */
 	zio = zio_create(pio, spa, txg, bp, NULL, 0, done, private,
 	    ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, 0,
 	    ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
@@ -623,7 +623,7 @@
 	cio = zio_create(zio, zio->io_spa, zio->io_txg, bp, data, size,
 	    done, private, type, priority,
 	    (zio->io_flags & ZIO_FLAG_VDEV_INHERIT) | ZIO_FLAG_CANFAIL | flags,
-	    ZIO_STAGE_VDEV_IO_SETUP - 1, pipeline);
+	    ZIO_STAGE_VDEV_IO_START - 1, pipeline);
 
 	cio->io_vd = vd;
 	cio->io_offset = offset;
@@ -748,8 +748,13 @@
 		ASSERT(bp->blk_pad[2] == 0);
 		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0);
 		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
-		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR))
+		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
 			ASSERT(!BP_SHOULD_BYTESWAP(bp));
+			if (zio->io_ndvas != 0)
+				ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(bp));
+			ASSERT(BP_COUNT_GANG(bp) == 0 ||
+			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
+		}
 	}
 
 	if (vd != NULL)
@@ -902,6 +907,7 @@
 			BP_ZERO(bp);
 			zio->io_pipeline = ZIO_WAIT_FOR_CHILDREN_PIPELINE;
 		} else {
+			ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
 			BP_SET_LSIZE(bp, lsize);
 			BP_SET_PSIZE(bp, csize);
 			BP_SET_COMPRESS(bp, compress);
@@ -946,7 +952,7 @@
 	 * By default, the pipeline assumes that we're dealing with a gang
 	 * block.  If we're not, strip out any gang-specific stages.
 	 */
-	if (!DVA_GET_GANG(ZIO_GET_DVA(zio)))
+	if (!BP_IS_GANG(zio->io_bp))
 		zio->io_pipeline &= ~ZIO_GANG_STAGES;
 
 	zio_next_stage(zio);
@@ -968,7 +974,7 @@
 	uint64_t gsize = SPA_GANGBLOCKSIZE;
 	void *gbuf = zio_buf_alloc(gsize);
 
-	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	ASSERT(BP_IS_GANG(bp));
 
 	zio_push_transform(zio, gbuf, gsize, gsize);
 
@@ -987,7 +993,7 @@
 	uint64_t gsize, gbufsize, loff, lsize;
 	int i;
 
-	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	ASSERT(BP_IS_GANG(zio->io_bp));
 
 	zio_gang_byteswap(zio);
 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
@@ -1019,7 +1025,7 @@
 	uint64_t gsize, gbufsize, loff, lsize;
 	int i;
 
-	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	ASSERT(BP_IS_GANG(zio->io_bp));
 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
 
 	zio_gang_byteswap(zio);
@@ -1054,7 +1060,7 @@
 	uint64_t gsize, gbufsize;
 	int i;
 
-	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	ASSERT(BP_IS_GANG(zio->io_bp));
 
 	zio_gang_byteswap(zio);
 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
@@ -1079,7 +1085,7 @@
 	uint64_t gsize, gbufsize;
 	int i;
 
-	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
+	ASSERT(BP_IS_GANG(zio->io_bp));
 
 	zio_gang_byteswap(zio);
 	zio_pop_transform(zio, (void **)&gbh, &gsize, &gbufsize);
@@ -1100,17 +1106,23 @@
 zio_write_allocate_gang_member_done(zio_t *zio)
 {
 	zio_t *pio = zio->io_parent;
-	dva_t *cdva = ZIO_GET_DVA(zio);
-	dva_t *pdva = ZIO_GET_DVA(pio);
+	dva_t *cdva = zio->io_bp->blk_dva;
+	dva_t *pdva = pio->io_bp->blk_dva;
 	uint64_t asize;
-
-	ASSERT(DVA_GET_GANG(pdva));
+	int d;
 
-	/* XXBP - Need to be careful here with multiple DVAs */
+	ASSERT3U(pio->io_ndvas, ==, zio->io_ndvas);
+	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+	ASSERT3U(zio->io_ndvas, <=, BP_GET_NDVAS(zio->io_bp));
+	ASSERT3U(pio->io_ndvas, <=, BP_GET_NDVAS(pio->io_bp));
+
 	mutex_enter(&pio->io_lock);
-	asize = DVA_GET_ASIZE(pdva);
-	asize += DVA_GET_ASIZE(cdva);
-	DVA_SET_ASIZE(pdva, asize);
+	for (d = 0; d < BP_GET_NDVAS(pio->io_bp); d++) {
+		ASSERT(DVA_GET_GANG(&pdva[d]));
+		asize = DVA_GET_ASIZE(&pdva[d]);
+		asize += DVA_GET_ASIZE(&cdva[d]);
+		DVA_SET_ASIZE(&pdva[d], asize);
+	}
 	mutex_exit(&pio->io_lock);
 }
 
@@ -1118,41 +1130,50 @@
 zio_write_allocate_gang_members(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = ZIO_GET_DVA(zio);
+	dva_t *dva = bp->blk_dva;
+	spa_t *spa = zio->io_spa;
 	zio_gbh_phys_t *gbh;
+	uint64_t txg = zio->io_txg;
 	uint64_t resid = zio->io_size;
 	uint64_t maxalloc = P2ROUNDUP(zio->io_size >> 1, SPA_MINBLOCKSIZE);
 	uint64_t gsize, loff, lsize;
 	uint32_t gbps_left;
+	int ndvas = zio->io_ndvas;
+	int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa));
 	int error;
-	int i;
+	int i, d;
 
 	gsize = SPA_GANGBLOCKSIZE;
 	gbps_left = SPA_GBH_NBLKPTRS;
 
-	error = metaslab_alloc(zio->io_spa, gsize, dva, zio->io_txg);
+	error = metaslab_alloc(spa, gsize, bp, gbh_ndvas, txg, NULL);
 	if (error == ENOSPC)
 		panic("can't allocate gang block header");
 	ASSERT(error == 0);
 
-	DVA_SET_GANG(dva, 1);
+	for (d = 0; d < gbh_ndvas; d++)
+		DVA_SET_GANG(&dva[d], 1);
 
-	bp->blk_birth = zio->io_txg;
+	bp->blk_birth = txg;
 
 	gbh = zio_buf_alloc(gsize);
 	bzero(gbh, gsize);
 
+	/* For testing: shrink maxalloc to force multi-level gang blocks */
+	if (maxalloc >= zio_gang_bang && (lbolt & 0x1) == 0)
+		maxalloc = MAX(maxalloc >> 2, SPA_MINBLOCKSIZE);
+
 	for (loff = 0, i = 0; loff != zio->io_size;
 	    loff += lsize, resid -= lsize, gbps_left--, i++) {
 		blkptr_t *gbp = &gbh->zg_blkptr[i];
-		dva = &gbp->blk_dva[0];
+		dva = gbp->blk_dva;
 
 		ASSERT(gbps_left != 0);
 		maxalloc = MIN(maxalloc, resid);
 
 		while (resid <= maxalloc * gbps_left) {
-			error = metaslab_alloc(zio->io_spa, maxalloc, dva,
-			    zio->io_txg);
+			error = metaslab_alloc(spa, maxalloc, gbp, ndvas,
+			    txg, bp);
 			if (error == 0)
 				break;
 			ASSERT3U(error, ==, ENOSPC);
@@ -1166,9 +1187,9 @@
 			BP_SET_LSIZE(gbp, lsize);
 			BP_SET_PSIZE(gbp, lsize);
 			BP_SET_COMPRESS(gbp, ZIO_COMPRESS_OFF);
-			gbp->blk_birth = zio->io_txg;
-			zio_nowait(zio_rewrite(zio, zio->io_spa,
-			    zio->io_checksum, zio->io_txg, gbp,
+			gbp->blk_birth = txg;
+			zio_nowait(zio_rewrite(zio, spa,
+			    zio->io_checksum, txg, gbp,
 			    (char *)zio->io_data + loff, lsize,
 			    zio_write_allocate_gang_member_done, NULL,
 			    zio->io_priority, zio->io_flags,
@@ -1176,8 +1197,8 @@
 		} else {
 			lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
 			ASSERT(lsize != SPA_MINBLOCKSIZE);
-			zio_nowait(zio_write_allocate(zio, zio->io_spa,
-			    zio->io_checksum, zio->io_txg, gbp,
+			zio_nowait(zio_write_allocate(zio, spa,
+			    zio->io_checksum, txg, gbp,
 			    (char *)zio->io_data + loff, lsize,
 			    zio_write_allocate_gang_member_done, NULL,
 			    zio->io_priority, zio->io_flags));
@@ -1189,6 +1210,12 @@
 	zio->io_pipeline |= 1U << ZIO_STAGE_GANG_CHECKSUM_GENERATE;
 
 	zio_push_transform(zio, gbh, gsize, gsize);
+	/*
+	 * As much as we'd like this to be zio_wait_children_ready(),
+	 * our ASIZE isn't updated until the children's io_done callbacks
+	 * run, so we must wait for them to finish before our BP is
+	 * stable.
+	 */
 	zio_wait_children_done(zio);
 }
 
@@ -1201,10 +1228,12 @@
 zio_dva_allocate(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = ZIO_GET_DVA(zio);
 	int error;
 
 	ASSERT(BP_IS_HOLE(bp));
+	ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
+	ASSERT3U(zio->io_ndvas, >, 0);
+	ASSERT3U(zio->io_ndvas, <=, spa_max_replication(zio->io_spa));
 
 	/* For testing, make some blocks above a certain size be gang blocks */
 	if (zio->io_size >= zio_gang_bang && (lbolt & 0x3) == 0) {
@@ -1214,7 +1243,8 @@
 
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
-	error = metaslab_alloc(zio->io_spa, zio->io_size, dva, zio->io_txg);
+	error = metaslab_alloc(zio->io_spa, zio->io_size, bp, zio->io_ndvas,
+	    zio->io_txg, NULL);
 
 	if (error == 0) {
 		bp->blk_birth = zio->io_txg;
@@ -1233,11 +1263,13 @@
 zio_dva_free(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = ZIO_GET_DVA(zio);
+	dva_t *dva = bp->blk_dva;
+	int d;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
-	metaslab_free(zio->io_spa, dva, zio->io_txg, B_FALSE);
+	for (d = 0; d < BP_GET_NDVAS(bp); d++)
+		metaslab_free(zio->io_spa, &dva[d], zio->io_txg, B_FALSE);
 
 	BP_ZERO(bp);
 
@@ -1248,31 +1280,17 @@
 zio_dva_claim(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = ZIO_GET_DVA(zio);
+	dva_t *dva = bp->blk_dva;
+	int error = 0;
+	int d;
 
 	ASSERT(!BP_IS_HOLE(bp));
 
-	zio->io_error = metaslab_claim(zio->io_spa, dva, zio->io_txg);
-
-	zio_next_stage(zio);
-}
-
-static void
-zio_dva_translate(zio_t *zio)
-{
-	spa_t *spa = zio->io_spa;
-	dva_t *dva = ZIO_GET_DVA(zio);
-	uint64_t vdev = DVA_GET_VDEV(dva);
-	uint64_t offset = DVA_GET_OFFSET(dva);
-
-	ASSERT3U(zio->io_size, ==, ZIO_GET_IOSIZE(zio));
-
-	zio->io_offset = offset;
-
-	if ((zio->io_vd = vdev_lookup_top(spa, vdev)) == NULL)
-		zio->io_error = ENXIO;
-	else if (offset + zio->io_size > zio->io_vd->vdev_asize)
-		zio->io_error = EOVERFLOW;
+	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+		error = metaslab_claim(zio->io_spa, &dva[d], zio->io_txg);
+		if (error)
+			zio->io_error = error;
+	}
 
 	zio_next_stage(zio);
 }
@@ -1284,17 +1302,26 @@
  */
 
 static void
-zio_vdev_io_setup(zio_t *zio)
+zio_vdev_io_start(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
-	vdev_t *tvd = vd->vdev_top;
-	uint64_t align = 1ULL << tvd->vdev_ashift;
+	vdev_t *tvd = vd ? vd->vdev_top : NULL;
+	blkptr_t *bp = zio->io_bp;
+	uint64_t align;
 
-	/* XXPOLICY */
+	if (vd == NULL) {
+		/* The mirror_ops handle multiple DVAs in a single BP */
+		vdev_mirror_ops.vdev_op_io_start(zio);
+		return;
+	}
+
+	align = 1ULL << tvd->vdev_ashift;
+
 	if (zio->io_retries == 0 && vd == tvd)
 		zio->io_flags |= ZIO_FLAG_FAILFAST;
 
-	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && vd->vdev_children == 0) {
+	if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+	    vd->vdev_children == 0) {
 		zio->io_flags |= ZIO_FLAG_PHYSICAL;
 		zio->io_offset += VDEV_LABEL_START_SIZE;
 	}
@@ -1312,15 +1339,6 @@
 		zio->io_flags |= ZIO_FLAG_SUBBLOCK;
 	}
 
-	zio_next_stage(zio);
-}
-
-static void
-zio_vdev_io_start(zio_t *zio)
-{
-	blkptr_t *bp = zio->io_bp;
-	uint64_t align = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
-
 	ASSERT(P2PHASE(zio->io_offset, align) == 0);
 	ASSERT(P2PHASE(zio->io_size, align) == 0);
 	ASSERT(bp == NULL ||
@@ -1335,7 +1353,11 @@
 static void
 zio_vdev_io_done(zio_t *zio)
 {
-	vdev_io_done(zio);
+	if (zio->io_vd == NULL)
+		/* The mirror_ops handle multiple DVAs in a single BP */
+		vdev_mirror_ops.vdev_op_io_done(zio);
+	else
+		vdev_io_done(zio);
 }
 
 /* XXPOLICY */
@@ -1348,7 +1370,7 @@
 		return (B_FALSE);
 	if (zio->io_delegate_list != NULL)
 		return (B_FALSE);
-	if (vd != vd->vdev_top)
+	if (vd && vd != vd->vdev_top)
 		return (B_FALSE);
 	if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
 		return (B_FALSE);
@@ -1362,7 +1384,7 @@
 zio_vdev_io_assess(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
-	vdev_t *tvd = vd->vdev_top;
+	vdev_t *tvd = vd ? vd->vdev_top : NULL;
 
 	ASSERT(zio->io_vsd == NULL);
 
@@ -1394,7 +1416,7 @@
 		/* XXPOLICY */
 		zio->io_flags &= ~ZIO_FLAG_FAILFAST;
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
-		zio->io_stage = ZIO_STAGE_VDEV_IO_SETUP - 1;
+		zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1;
 
 		dprintf("retry #%d for %s to %s offset %llx\n",
 		    zio->io_retries, zio_type_name[zio->io_type],
@@ -1404,8 +1426,8 @@
 		return;
 	}
 
-	if (zio->io_error != 0 && !(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
-	    zio->io_error != ECKSUM) {
+	if (zio->io_error != 0 && zio->io_error != ECKSUM &&
+	    !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && vd) {
 		/*
 		 * Poor man's hotplug support.  Even if we're done retrying this
 		 * I/O, try to reopen the vdev to see if it's still attached.
@@ -1480,8 +1502,8 @@
 	zio_cksum_t zc;
 	zio_gbh_phys_t *gbh = zio->io_data;
 
+	ASSERT(BP_IS_GANG(zio->io_bp));
 	ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
-	ASSERT(DVA_GET_GANG(ZIO_GET_DVA(zio)));
 
 	zio_set_gang_verifier(zio, &gbh->zg_tail.zbt_cksum);
 
@@ -1518,9 +1540,11 @@
 void
 zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp)
 {
-	zcp->zc_word[0] = DVA_GET_VDEV(ZIO_GET_DVA(zio));
-	zcp->zc_word[1] = DVA_GET_OFFSET(ZIO_GET_DVA(zio));
-	zcp->zc_word[2] = zio->io_bp->blk_birth;
+	blkptr_t *bp = zio->io_bp;
+
+	zcp->zc_word[0] = DVA_GET_VDEV(BP_IDENTITY(bp));
+	zcp->zc_word[1] = DVA_GET_OFFSET(BP_IDENTITY(bp));
+	zcp->zc_word[2] = bp->blk_birth;
 	zcp->zc_word[3] = 0;
 }
 
@@ -1552,8 +1576,6 @@
 	zio_dva_claim,
 	zio_gang_checksum_generate,
 	zio_ready,
-	zio_dva_translate,
-	zio_vdev_io_setup,
 	zio_vdev_io_start,
 	zio_vdev_io_done,
 	zio_vdev_io_assess,
@@ -1656,7 +1678,7 @@
 
 	BP_ZERO(bp);
 
-	error = metaslab_alloc(spa, size, BP_IDENTITY(bp), txg);
+	error = metaslab_alloc(spa, size, bp, 1, txg, NULL);
 
 	if (error == 0) {
 		BP_SET_CHECKSUM(bp, checksum);
@@ -1681,7 +1703,7 @@
 void
 zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
 {
-	ASSERT(DVA_GET_GANG(BP_IDENTITY(bp)) == 0);
+	ASSERT(!BP_IS_GANG(bp));
 
 	dprintf_bp(bp, "txg %llu: ", txg);
 
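
With ditto blocks, a gang header carries one gang DVA per copy and each gang
member allocates the same number of DVAs, so the member-done callback above
folds allocated size into every parent DVA instead of just the first.  A
stand-alone model of that fold, with integer arrays standing in for the DVA
ASIZE fields:

	#include <stdio.h>

	/* Fold each child DVA's allocated size into its parent DVA. */
	static void
	fold_member_asize(unsigned long long *parent,
	    const unsigned long long *child, int ndvas)
	{
		int d;

		for (d = 0; d < ndvas; d++)
			parent[d] += child[d];
	}

	int
	main(void)
	{
		unsigned long long p[2] = { 0x2000, 0x2000 };
		unsigned long long c[2] = { 0x1000, 0x1000 };

		fold_member_asize(p, c, 2);
		printf("%llx %llx\n", p[0], p[1]);	/* 3000 3000 */
		return (0);
	}
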
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c	Mon Apr 10 05:03:38 2006 -0700
@@ -122,9 +122,8 @@
 zio_checksum_error(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
-	dva_t *dva = ZIO_GET_DVA(zio);
 	zio_cksum_t zc = bp->blk_cksum;
-	uint_t checksum = DVA_GET_GANG(dva) ? ZIO_CHECKSUM_GANG_HEADER :
+	uint_t checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER :
 	    BP_GET_CHECKSUM(bp);
 	int byteswap = BP_SHOULD_BYTESWAP(bp);
 	void *data = zio->io_data;
@@ -159,7 +158,7 @@
 		}
 		zc = expected_cksum;
 	} else {
-		ASSERT(!DVA_GET_GANG(dva));
+		ASSERT(!BP_IS_GANG(bp));
 		ci->ci_func[byteswap](data, size, &actual_cksum);
 	}
 
--- a/usr/src/uts/common/sys/fs/zfs.h	Sun Apr 09 20:56:25 2006 -0700
+++ b/usr/src/uts/common/sys/fs/zfs.h	Mon Apr 10 05:03:38 2006 -0700
@@ -109,7 +109,23 @@
 /*
  * On-disk format version.
  */
-#define	ZFS_VERSION			1ULL
+#define	ZFS_VERSION_1			1ULL
+#define	ZFS_VERSION_2			2ULL
+#define	ZFS_VERSION			ZFS_VERSION_2
+
+/*
+ * Symbolic names for the changes that caused a ZFS_VERSION switch.
+ * Used in the code when checking for presence or absence of a feature.
+ * Feel free to define multiple symbolic names for each version if there
+ * were multiple changes to on-disk structures during that version.
+ *
+ * NOTE: When checking the current ZFS_VERSION in your code, be sure
+ *       to use spa_version() since it reports the version of the
+ *       last synced uberblock.  Checking the in-flight version can
+ *       be dangerous in some cases.
+ */
+#define	ZFS_VERSION_INITIAL		ZFS_VERSION_1
+#define	ZFS_VERSION_DITTO_BLOCKS	ZFS_VERSION_2
 
 /*
  * The following are configuration names used in the nvlist describing a pool's