6343667 scrub/resilver has to start over when a snapshot is taken
author ahrens
Mon, 07 Jul 2008 13:39:21 -0700
changeset 7046 361307ae060d
parent 7045 d4292813278d
child 7047 8e4f1d0e9dd8
6343667 scrub/resilver has to start over when a snapshot is taken
6343693 'zpool status' gives delayed start for 'zpool scrub'
6670746 scrub on degraded pool return the status of 'resilver completed'?
6675685 DTL entries are lost resulting in checksum errors
6706404 get_history_one() can dereference off end of hist_event_table[]
6715414 assertion failed: ds->ds_owner != tag in dsl_dataset_rele()
6716437 ztest gets SEGV in arc_released()
6722838 bfu does not update grub
usr/src/cmd/zdb/zdb.c
usr/src/cmd/zpool/zpool_main.c
usr/src/cmd/ztest/ztest.c
usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h
usr/src/tools/scripts/bfu.sh
usr/src/uts/common/Makefile.files
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/bplist.c
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/dmu_objset.c
usr/src/uts/common/fs/zfs/dmu_send.c
usr/src/uts/common/fs/zfs/dmu_traverse.c
usr/src/uts/common/fs/zfs/dnode_sync.c
usr/src/uts/common/fs/zfs/dsl_dataset.c
usr/src/uts/common/fs/zfs/dsl_dir.c
usr/src/uts/common/fs/zfs/dsl_pool.c
usr/src/uts/common/fs/zfs/dsl_prop.c
usr/src/uts/common/fs/zfs/dsl_scrub.c
usr/src/uts/common/fs/zfs/spa.c
usr/src/uts/common/fs/zfs/spa_errlog.c
usr/src/uts/common/fs/zfs/spa_history.c
usr/src/uts/common/fs/zfs/spa_misc.c
usr/src/uts/common/fs/zfs/sys/arc.h
usr/src/uts/common/fs/zfs/sys/bplist.h
usr/src/uts/common/fs/zfs/sys/dmu.h
usr/src/uts/common/fs/zfs/sys/dmu_objset.h
usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
usr/src/uts/common/fs/zfs/sys/dsl_dir.h
usr/src/uts/common/fs/zfs/sys/dsl_pool.h
usr/src/uts/common/fs/zfs/sys/dsl_prop.h
usr/src/uts/common/fs/zfs/sys/spa.h
usr/src/uts/common/fs/zfs/sys/spa_impl.h
usr/src/uts/common/fs/zfs/sys/txg.h
usr/src/uts/common/fs/zfs/sys/vdev.h
usr/src/uts/common/fs/zfs/sys/zap.h
usr/src/uts/common/fs/zfs/sys/zap_leaf.h
usr/src/uts/common/fs/zfs/sys/zfs_znode.h
usr/src/uts/common/fs/zfs/sys/zio.h
usr/src/uts/common/fs/zfs/txg.c
usr/src/uts/common/fs/zfs/vdev.c
usr/src/uts/common/fs/zfs/zap.c
usr/src/uts/common/fs/zfs/zfs_dir.c
usr/src/uts/common/fs/zfs/zfs_ioctl.c
usr/src/uts/common/fs/zfs/zfs_vfsops.c
usr/src/uts/common/fs/zfs/zfs_znode.c
usr/src/uts/common/fs/zfs/zil.c
usr/src/uts/common/fs/zfs/zio.c
usr/src/uts/common/sys/fs/zfs.h
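
At a high level, this change introduces a DSL-level scrubber (dsl_scrub.c, added to Makefile.files below) that records its progress in the MOS pool directory, so a scrub or resilver can resume from a persisted bookmark instead of restarting when a snapshot is taken or destroyed. The following is a minimal sketch of that persisted state, inferred from the zap_lookup() calls in the dsl_pool.c hunk near the end of this patch; it is illustrative only and is not the actual dsl_pool_t declaration.

#include <sys/types.h>

typedef struct scrub_state_sketch {
	uint64_t	ss_func;	/* DMU_POOL_SCRUB_FUNC */
	uint64_t	ss_queue_obj;	/* DMU_POOL_SCRUB_QUEUE: ZAP of datasets left to visit */
	uint64_t	ss_min_txg;	/* DMU_POOL_SCRUB_MIN_TXG */
	uint64_t	ss_max_txg;	/* DMU_POOL_SCRUB_MAX_TXG */
	uint64_t	ss_bookmark[4];	/* DMU_POOL_SCRUB_BOOKMARK: resume point (a zbookmark) */
	uint64_t	ss_errors;	/* DMU_POOL_SCRUB_ERRORS */
} scrub_state_sketch_t;
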
--- a/usr/src/cmd/zdb/zdb.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/cmd/zdb/zdb.c	Mon Jul 07 13:39:21 2008 -0700
@@ -773,7 +773,7 @@
 	nicenum(ds->ds_unique_bytes, unique);
 	sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ds->ds_bp);
 
-	(void) printf("\t\tdataset_obj = %llu\n",
+	(void) printf("\t\tdir_obj = %llu\n",
 	    (u_longlong_t)ds->ds_dir_obj);
 	(void) printf("\t\tprev_snap_obj = %llu\n",
 	    (u_longlong_t)ds->ds_prev_snap_obj);
@@ -800,6 +800,8 @@
 	    (u_longlong_t)ds->ds_guid);
 	(void) printf("\t\tflags = %llx\n",
 	    (u_longlong_t)ds->ds_flags);
+	(void) printf("\t\tnext_clones_obj = %llu\n",
+	    (u_longlong_t)ds->ds_next_clones_obj);
 	(void) printf("\t\tbp = %s\n", blkbuf);
 }
 
@@ -1007,6 +1009,8 @@
 	dump_uint8,		/* ZFS SYSACL			*/
 	dump_none,		/* FUID nvlist			*/
 	dump_packed_nvlist,	/* FUID nvlist size		*/
+	dump_zap,		/* DSL dataset next clones	*/
+	dump_zap,		/* DSL scrub queue		*/
 };
 
 static void
@@ -1174,6 +1178,9 @@
 	if (verbosity < 2)
 		return;
 
+	if (os->os->os_rootbp->blk_birth == 0)
+		return;
+
 	if (zopt_objects != 0) {
 		for (i = 0; i < zopt_objects; i++)
 			dump_object(os, zopt_object[i], verbosity,
--- a/usr/src/cmd/zpool/zpool_main.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/cmd/zpool/zpool_main.c	Mon Jul 07 13:39:21 2008 -0700
@@ -3430,6 +3430,7 @@
 		(void) printf(gettext(" 9   refquota and refreservation "
 		    "properties\n"));
 		(void) printf(gettext(" 10  Cache devices\n"));
+		(void) printf(gettext(" 11  Improved scrub performance\n"));
 		(void) printf(gettext("For more information on a particular "
 		    "version, including supported releases, see:\n\n"));
 		(void) printf("http://www.opensolaris.org/os/community/zfs/"
@@ -3514,6 +3515,7 @@
 	"filesystem version upgrade",
 	"refquota set",
 	"refreservation set",
+	"pool scrub done",
 };
 
 /*
@@ -3568,7 +3570,7 @@
 			    ZPOOL_HIST_TXG, &txg) == 0);
 			verify(nvlist_lookup_string(records[i],
 			    ZPOOL_HIST_INT_STR, &pathstr) == 0);
-			if (ievent > LOG_END)
+			if (ievent >= LOG_END)
 				continue;
 			(void) snprintf(internalstr,
 			    sizeof (internalstr),
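
The one-character change above is the fix for 6706404: hist_event_table[] is indexed by the internal event code, and if LOG_END is the enum's end-of-table sentinel (i.e. the table has LOG_END entries), then ievent == LOG_END already points one slot past the last entry, so the bound must be ">=". A self-contained sketch of the pattern; the names below are stand-ins that mirror, but are not, the real zpool_main.c definitions.

/* stand-ins for the real definitions */
enum { LOG_EVENT_A = 0, LOG_EVENT_B, LOG_END };	/* LOG_END == table size */
static const char *hist_event_table[LOG_END] = { "event a", "event b" };

static const char *
event_name(unsigned long long ievent)
{
	if (ievent >= LOG_END)	/* ">" would let ievent == LOG_END through ... */
		return (NULL);	/* ... and read one past the end of the table */
	return (hist_event_table[ievent]);
}
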
--- a/usr/src/cmd/ztest/ztest.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/cmd/ztest/ztest.c	Mon Jul 07 13:39:21 2008 -0700
@@ -205,14 +205,14 @@
 	{ ztest_zap_parallel,			100,	&zopt_always	},
 	{ ztest_traverse,			1,	&zopt_often	},
 	{ ztest_dsl_prop_get_set,		1,	&zopt_sometimes	},
-	{ ztest_dmu_objset_create_destroy,	1,	&zopt_sometimes	},
-	{ ztest_dmu_snapshot_create_destroy,	1,	&zopt_rarely	},
-	{ ztest_spa_create_destroy,		1,	&zopt_sometimes	},
+	{ ztest_dmu_objset_create_destroy,	1,	&zopt_sometimes },
+	{ ztest_dmu_snapshot_create_destroy,	1,	&zopt_rarely },
+	{ ztest_spa_create_destroy,		1,	&zopt_sometimes },
 	{ ztest_fault_inject,			1,	&zopt_sometimes	},
 	{ ztest_spa_rename,			1,	&zopt_rarely	},
-	{ ztest_vdev_attach_detach,		1,	&zopt_rarely	},
-	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely	},
-	{ ztest_vdev_add_remove,		1,	&zopt_vdevtime	},
+	{ ztest_vdev_attach_detach,		1,	&zopt_rarely },
+	{ ztest_vdev_LUN_growth,		1,	&zopt_rarely },
+	{ ztest_vdev_add_remove,		1,	&zopt_vdevtime },
 	{ ztest_scrub,				1,	&zopt_vdevtime	},
 };
 
@@ -1046,12 +1046,15 @@
 	/*
 	 * If someone grew the LUN, the replacement may be too small.
 	 */
-	if (error == EOVERFLOW)
+	if (error == EOVERFLOW || error == EBUSY)
 		expected_error = error;
 
-	if (error != expected_error) {
-		fatal(0, "attach (%s, %s, %d) returned %d, expected %d",
-		    oldpath, newpath, replacing, error, expected_error);
+	/* XXX workaround 6690467 */
+	if (error != expected_error && expected_error != EBUSY) {
+		fatal(0, "attach (%s %llu, %s %llu, %d) "
+		    "returned %d, expected %d",
+		    oldpath, (longlong_t)oldsize, newpath,
+		    (longlong_t)newsize, replacing, error, expected_error);
 	}
 
 	(void) mutex_unlock(&ztest_shared->zs_vdev_lock);
@@ -1551,7 +1554,7 @@
 	 * Destroy the previous batch of objects.
 	 */
 	for (b = 0; b < batchsize; b++) {
-		VERIFY(0 == dmu_read(os, batchobj, b * sizeof (uint64_t),
+		VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t),
 		    sizeof (uint64_t), &object));
 		if (object == 0)
 			continue;
@@ -2681,13 +2684,9 @@
 {
 	spa_t *spa = za->za_spa;
 
-	mutex_enter(&spa_namespace_lock);
-	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_FALSE);
-	mutex_exit(&spa_namespace_lock);
+	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
 	(void) poll(NULL, 0, 1000); /* wait a second, then force a restart */
-	mutex_enter(&spa_namespace_lock);
-	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_FALSE);
-	mutex_exit(&spa_namespace_lock);
+	(void) spa_scrub(spa, POOL_SCRUB_EVERYTHING);
 }
 
 /*
--- a/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h	Mon Jul 07 13:39:21 2008 -0700
@@ -39,7 +39,8 @@
 #define	SPA_VERSION_8			8ULL
 #define	SPA_VERSION_9			9ULL
 #define	SPA_VERSION_10			10ULL
-#define	SPA_VERSION			SPA_VERSION_10
+#define	SPA_VERSION_11			11ULL
+#define	SPA_VERSION			SPA_VERSION_11
 
 /*
  * The following are configuration names used in the nvlist describing a pool's
--- a/usr/src/tools/scripts/bfu.sh	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/tools/scripts/bfu.sh	Mon Jul 07 13:39:21 2008 -0700
@@ -7463,6 +7463,10 @@
 			fi
 			;;
 		    i386)
+			print "Extracting grub for boot " \
+			    "block ... \c" | tee -a $EXTRACT_LOG
+			do_extraction $cpiodir/$karch.boot$ZFIX  | \
+				tee -a $EXTRACT_LOG
 			$rootprefix/boot/solaris/bin/update_grub -R $root
 			;;
 		    *)
--- a/usr/src/uts/common/Makefile.files	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/Makefile.files	Mon Jul 07 13:39:21 2008 -0700
@@ -1096,6 +1096,7 @@
 	dmu_zfetch.o		\
 	dsl_deleg.o		\
 	dsl_prop.o		\
+	dsl_scrub.o		\
 	fletcher.o		\
 	gzip.o			\
 	lzjb.o			\
--- a/usr/src/uts/common/fs/zfs/arc.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/arc.c	Mon Jul 07 13:39:21 2008 -0700
@@ -173,6 +173,7 @@
 uint64_t zfs_arc_max;
 uint64_t zfs_arc_min;
 uint64_t zfs_arc_meta_limit = 0;
+int zfs_mdcomp_disable = 0;
 
 /*
  * Note that buffers can be in one of 6 states:
@@ -392,7 +393,6 @@
 struct arc_callback {
 	void			*acb_private;
 	arc_done_func_t		*acb_done;
-	arc_byteswap_func_t	*acb_byteswap;
 	arc_buf_t		*acb_buf;
 	zio_t			*acb_zio_dummy;
 	arc_callback_t		*acb_next;
@@ -441,6 +441,11 @@
 
 	l2arc_buf_hdr_t		*b_l2hdr;
 	list_node_t		b_l2node;
+	/*
+	 * scrub code can lockout access to the buf while it changes
+	 * bp's contained within it.
+	 */
+	krwlock_t		b_datalock;
 };
 
 static arc_buf_t *arc_eviction_list;
@@ -474,6 +479,7 @@
 #define	ARC_L2_WRITING		(1 << 17)	/* L2ARC write in progress */
 #define	ARC_L2_EVICTED		(1 << 18)	/* evicted during I/O */
 #define	ARC_L2_WRITE_HEAD	(1 << 19)	/* head of write list */
+#define	ARC_STORED		(1 << 20)	/* has been store()d to */
 
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
@@ -609,7 +615,7 @@
 static void l2arc_hdr_stat_remove(void);
 
 static uint64_t
-buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
+buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth)
 {
 	uintptr_t spav = (uintptr_t)spa;
 	uint8_t *vdva = (uint8_t *)dva;
@@ -637,7 +643,7 @@
 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
 
 static arc_buf_hdr_t *
-buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
 {
 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
@@ -757,6 +763,7 @@
 	refcount_create(&buf->b_refcnt);
 	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+	rw_init(&buf->b_datalock, NULL, RW_DEFAULT, NULL);
 
 	ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
 	return (0);
@@ -775,6 +782,7 @@
 	refcount_destroy(&buf->b_refcnt);
 	cv_destroy(&buf->b_cv);
 	mutex_destroy(&buf->b_freeze_lock);
+	rw_destroy(&buf->b_datalock);
 
 	ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
 }
@@ -941,7 +949,7 @@
 		ASSERT3U(*size, >=, delta);
 		atomic_add_64(size, -delta);
 		mutex_exit(&ab->b_state->arcs_mtx);
-		/* remove the prefetch flag is we get a reference */
+		/* remove the prefetch flag if we get a reference */
 		if (ab->b_flags & ARC_PREFETCH)
 			ab->b_flags &= ~ARC_PREFETCH;
 	}
@@ -1271,6 +1279,7 @@
 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
 	ASSERT3P(hdr->b_state, ==, arc_anon);
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+	ASSERT(!(hdr->b_flags & ARC_STORED));
 
 	if (hdr->b_l2hdr != NULL) {
 		if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
@@ -2296,8 +2305,12 @@
 	/* byteswap if necessary */
 	callback_list = hdr->b_acb;
 	ASSERT(callback_list != NULL);
-	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
-		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
+	if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
+		    byteswap_uint64_array :
+		    dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap;
+		func(buf->b_data, hdr->b_size);
+	}
 
 	arc_cksum_compute(buf, B_FALSE);
 
@@ -2394,11 +2407,33 @@
  *
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
+ *
+ * Normal callers should use arc_read and pass the arc buffer and offset
+ * for the bp.  But if you know you don't need locking, you can use
+ * arc_read_nolock.
  */
 int
-arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
     arc_done_func_t *done, void *private, int priority, int flags,
-    uint32_t *arc_flags, zbookmark_t *zb)
+    uint32_t *arc_flags, const zbookmark_t *zb)
+{
+	int err;
+
+	ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt));
+	ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size);
+	rw_enter(&pbuf->b_hdr->b_datalock, RW_READER);
+
+	err = arc_read_nolock(pio, spa, bp, done, private, priority,
+	    flags, arc_flags, zb);
+
+	rw_exit(&pbuf->b_hdr->b_datalock);
+	return (err);
+}
+
+int
+arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+    arc_done_func_t *done, void *private, int priority, int flags,
+    uint32_t *arc_flags, const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr;
 	arc_buf_t *buf;
@@ -2427,7 +2462,6 @@
 				    KM_SLEEP);
 				acb->acb_done = done;
 				acb->acb_private = private;
-				acb->acb_byteswap = swap;
 				if (pio != NULL)
 					acb->acb_zio_dummy = zio_null(pio,
 					    spa, NULL, NULL, flags);
@@ -2536,7 +2570,6 @@
 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
 		acb->acb_done = done;
 		acb->acb_private = private;
-		acb->acb_byteswap = swap;
 
 		ASSERT(hdr->b_acb == NULL);
 		hdr->b_acb = acb;
@@ -2790,7 +2823,7 @@
  * Release this buffer from the cache.  This must be done
  * after a read and prior to modifying the buffer contents.
  * If the buffer has more than one reference, we must make
- * make a new hdr for the buffer.
+ * a new hdr for the buffer.
  */
 void
 arc_release(arc_buf_t *buf, void *tag)
@@ -2802,6 +2835,7 @@
 
 	/* this buffer is not on any list */
 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+	ASSERT(!(hdr->b_flags & ARC_STORED));
 
 	if (hdr->b_state == arc_anon) {
 		/* this buffer is already released */
@@ -2964,9 +2998,6 @@
 
 	hdr->b_acb = NULL;
 
-	/* this buffer is on no lists and is not in the hash table */
-	ASSERT3P(hdr->b_state, ==, arc_anon);
-
 	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
 	hdr->b_birth = zio->io_bp->blk_birth;
 	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
@@ -3002,7 +3033,9 @@
 			ASSERT3P(exists, ==, NULL);
 		}
 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
-		arc_access(hdr, hash_lock);
+		/* if it's not anon, we are doing a scrub */
+		if (hdr->b_state == arc_anon)
+			arc_access(hdr, hash_lock);
 		mutex_exit(hash_lock);
 	} else if (callback->awcb_done == NULL) {
 		int destroy_hdr;
@@ -3019,6 +3052,7 @@
 	} else {
 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 	}
+	hdr->b_flags &= ~ARC_STORED;
 
 	if (callback->awcb_done) {
 		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
@@ -3028,19 +3062,62 @@
 	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
+static void
+write_policy(spa_t *spa, const writeprops_t *wp,
+    int *cksump, int *compp, int *copiesp)
+{
+	int copies = wp->wp_copies;
+	boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata);
+
+	/* Determine copies setting */
+	if (ismd)
+		copies++;
+	*copiesp = MIN(copies, spa_max_replication(spa));
+
+	/* Determine checksum setting */
+	if (ismd) {
+		/*
+		 * Metadata always gets checksummed.  If the data
+		 * checksum is multi-bit correctable, and it's not a
+		 * ZBT-style checksum, then it's suitable for metadata
+		 * as well.  Otherwise, the metadata checksum defaults
+		 * to fletcher4.
+		 */
+		if (zio_checksum_table[wp->wp_oschecksum].ci_correctable &&
+		    !zio_checksum_table[wp->wp_oschecksum].ci_zbt)
+			*cksump = wp->wp_oschecksum;
+		else
+			*cksump = ZIO_CHECKSUM_FLETCHER_4;
+	} else {
+		*cksump = zio_checksum_select(wp->wp_dnchecksum,
+		    wp->wp_oschecksum);
+	}
+
+	/* Determine compression setting */
+	if (ismd) {
+		/*
+		 * XXX -- we should design a compression algorithm
+		 * that specializes in arrays of bps.
+		 */
+		*compp = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+		    ZIO_COMPRESS_LZJB;
+	} else {
+		*compp = zio_compress_select(wp->wp_dncompress,
+		    wp->wp_oscompress);
+	}
+}
+
 zio_t *
-arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
+arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
     arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
-    int flags, zbookmark_t *zb)
+    int flags, const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
 	zio_t	*zio;
-
-	/* this is a private buffer - no locking required */
-	ASSERT3P(hdr->b_state, ==, arc_anon);
-	ASSERT(BUF_EMPTY(hdr));
+	int cksum, comp, copies;
+
 	ASSERT(!HDR_IO_ERROR(hdr));
 	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
 	ASSERT(hdr->b_acb == 0);
@@ -3049,9 +3126,11 @@
 	callback->awcb_done = done;
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
-	zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
-	    buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
-	    priority, flags, zb);
+
+	write_policy(spa, wp, &cksum, &comp, &copies);
+	zio = zio_write(pio, spa, cksum, comp, copies, txg, bp,
+	    buf->b_data, hdr->b_size, arc_write_ready, arc_write_done,
+	    callback, priority, flags, zb);
 
 	return (zio);
 }
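
With write_policy() in place, arc_write() callers no longer compute checksum, compression, and copies themselves; they describe the write in a writeprops_t and the ARC derives the effective policy (an extra copy and fletcher4/lzjb for metadata, the dnode/objset settings otherwise). Below is a sketch of the new calling convention, condensed from the dbuf_write() hunk later in this patch; the wrapper function itself is illustrative and not part of the change.

static zio_t *
example_dbuf_arc_write(zio_t *pio, objset_impl_t *os, dnode_t *dn,
    dmu_buf_impl_t *db, arc_buf_t *data, uint64_t txg,
    const zbookmark_t *zb, int zio_flags)
{
	writeprops_t wp = { 0 };

	wp.wp_type = dn->dn_type;		/* metadata types get metadata policy */
	wp.wp_level = db->db_level;		/* any level > 0 is metadata */
	wp.wp_copies = os->os_copies;
	wp.wp_dnchecksum = dn->dn_checksum;	/* dnode setting overrides ... */
	wp.wp_oschecksum = os->os_checksum;	/* ... the objset default */
	wp.wp_dncompress = dn->dn_compress;
	wp.wp_oscompress = os->os_compress;

	return (arc_write(pio, os->os_spa, &wp, txg, db->db_blkptr, data,
	    dbuf_write_ready, dbuf_write_done, db,
	    ZIO_PRIORITY_ASYNC_WRITE, zio_flags, zb));
}
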
--- a/usr/src/uts/common/fs/zfs/bplist.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/bplist.c	Mon Jul 07 13:39:21 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -181,7 +181,7 @@
 }
 
 int
-bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
+bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	uint64_t blk, off;
 	blkptr_t *bparray;
@@ -229,7 +229,7 @@
  * Deferred entry; will be written later by bplist_sync().
  */
 void
-bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp)
+bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
 {
 	bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
 
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Mon Jul 07 13:39:21 2008 -0700
@@ -39,13 +39,10 @@
 
 static void dbuf_destroy(dmu_buf_impl_t *db);
 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
-    int compress, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 static arc_done_func_t dbuf_write_ready;
 static arc_done_func_t dbuf_write_done;
 
-int zfs_mdcomp_disable = 0;
-
 /*
  * Global data structures and functions for the dbuf cache.
  */
@@ -456,26 +453,27 @@
 static void
 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 {
-	blkptr_t *bp;
+	dnode_t *dn = db->db_dnode;
 	zbookmark_t zb;
 	uint32_t aflags = ARC_NOWAIT;
+	arc_buf_t *pbuf;
 
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
-	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_state == DB_UNCACHED);
 	ASSERT(db->db_buf == NULL);
 
 	if (db->db_blkid == DB_BONUS_BLKID) {
-		int bonuslen = db->db_dnode->dn_bonuslen;
+		int bonuslen = dn->dn_bonuslen;
 
 		ASSERT3U(bonuslen, <=, db->db.db_size);
 		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		arc_space_consume(DN_MAX_BONUSLEN);
 		if (bonuslen < DN_MAX_BONUSLEN)
 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
-		bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
+		bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
 		    bonuslen);
 		dbuf_update_data(db);
 		db->db_state = DB_CACHED;
@@ -483,21 +481,11 @@
 		return;
 	}
 
-	if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
-		bp = NULL;
-	else
-		bp = db->db_blkptr;
-
-	if (bp == NULL)
-		dprintf_dbuf(db, "blkptr: %s\n", "NULL");
-	else
-		dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
-
-	if (bp == NULL || BP_IS_HOLE(bp)) {
+	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
+	    (db->db_level == 0 && dnode_block_freed(dn, db->db_blkid))) {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
-		ASSERT(bp == NULL || BP_IS_HOLE(bp));
-		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+		dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
 		    db->db.db_size, db, type));
 		bzero(db->db.db_data, db->db.db_size);
 		db->db_state = DB_CACHED;
@@ -517,10 +505,13 @@
 
 	dbuf_add_ref(db, NULL);
 	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
-	ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
-	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
-	    db->db_level > 0 ? byteswap_uint64_array :
-	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
+
+	if (db->db_parent)
+		pbuf = db->db_parent->db_buf;
+	else
+		pbuf = db->db_objset->os_phys_buf;
+
+	(void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
 	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
 	    &aflags, &zb);
@@ -690,7 +681,8 @@
 	/* free this block */
 	if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
 		/* XXX can get silent EIO here */
-		(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+		(void) dsl_free(NULL,
+		    spa_get_dsl(db->db_dnode->dn_objset->os_spa),
 		    txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
 	}
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -1561,6 +1553,7 @@
 
 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
 		if (bp && !BP_IS_HOLE(bp)) {
+			arc_buf_t *pbuf;
 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 			zbookmark_t zb;
 			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
@@ -1569,9 +1562,13 @@
 			zb.zb_level = 0;
 			zb.zb_blkid = blkid;
 
-			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
-			    dmu_ot[dn->dn_type].ot_byteswap,
-			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+			if (db)
+				pbuf = db->db_buf;
+			else
+				pbuf = dn->dn_objset->os_phys_buf;
+
+			(void) arc_read(NULL, dn->dn_objset->os_spa,
+			    bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zb);
 		}
@@ -1885,15 +1882,8 @@
 
 	db->db_data_pending = dr;
 
-	arc_release(db->db_buf, db);
 	mutex_exit(&db->db_mtx);
-
-	/*
-	 * XXX -- we should design a compression algorithm
-	 * that specializes in arrays of bps.
-	 */
-	dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
-	    zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);
+	dbuf_write(dr, db->db_buf, tx);
 
 	zio = dr->dr_zio;
 	mutex_enter(&dr->dt.di.dr_mtx);
@@ -1911,7 +1901,6 @@
 	dnode_t *dn = db->db_dnode;
 	objset_impl_t *os = dn->dn_objset;
 	uint64_t txg = tx->tx_txg;
-	int checksum, compress;
 	int blksz;
 
 	ASSERT(dmu_tx_is_syncing(tx));
@@ -2030,14 +2019,6 @@
 			*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
 			bcopy(db->db.db_data, (*datap)->b_data, blksz);
 		}
-	} else {
-		/*
-		 * Private object buffers are released here rather
-		 * than in dbuf_dirty() since they are only modified
-		 * in the syncing context and we don't want the
-		 * overhead of making multiple copies of the data.
-		 */
-		arc_release(db->db_buf, db);
 	}
 
 	ASSERT(*datap != NULL);
@@ -2045,22 +2026,7 @@
 
 	mutex_exit(&db->db_mtx);
 
-	/*
-	 * Allow dnode settings to override objset settings,
-	 * except for metadata checksums.
-	 */
-	if (dmu_ot[dn->dn_type].ot_metadata) {
-		checksum = os->os_md_checksum;
-		compress = zio_compress_select(dn->dn_compress,
-		    os->os_md_compress);
-	} else {
-		checksum = zio_checksum_select(dn->dn_checksum,
-		    os->os_checksum);
-		compress = zio_compress_select(dn->dn_compress,
-		    os->os_compress);
-	}
-
-	dbuf_write(dr, *datap, checksum, compress, tx);
+	dbuf_write(dr, *datap, tx);
 
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	if (dn->dn_object == DMU_META_DNODE_OBJECT)
@@ -2096,8 +2062,7 @@
 }
 
 static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
-    int compress, dmu_tx_t *tx)
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn = db->db_dnode;
@@ -2105,9 +2070,25 @@
 	dmu_buf_impl_t *parent = db->db_parent;
 	uint64_t txg = tx->tx_txg;
 	zbookmark_t zb;
+	writeprops_t wp = { 0 };
 	zio_t *zio;
 	int zio_flags;
 
+	if (!BP_IS_HOLE(db->db_blkptr) &&
+	    (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
+		/*
+		 * Private object buffers are released here rather
+		 * than in dbuf_dirty() since they are only modified
+		 * in the syncing context and we don't want the
+		 * overhead of making multiple copies of the data.
+		 */
+		arc_release(data, db);
+	} else {
+		ASSERT(arc_released(data));
+		/* XXX why do we need to thaw here? */
+		arc_buf_thaw(data);
+	}
+
 	if (parent != dn->dn_dbuf) {
 		ASSERT(parent && parent->db_data_pending);
 		ASSERT(db->db_level == parent->db_level-1);
@@ -2132,13 +2113,20 @@
 	zio_flags = ZIO_FLAG_MUSTSUCCEED;
 	if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
 		zio_flags |= ZIO_FLAG_METADATA;
+	wp.wp_type = dn->dn_type;
+	wp.wp_level = db->db_level;
+	wp.wp_copies = os->os_copies;
+	wp.wp_dncompress = dn->dn_compress;
+	wp.wp_oscompress = os->os_compress;
+	wp.wp_dnchecksum = dn->dn_checksum;
+	wp.wp_oschecksum = os->os_checksum;
+
 	if (BP_IS_OLDER(db->db_blkptr, txg))
 		(void) dsl_dataset_block_kill(
 		    os->os_dsl_dataset, db->db_blkptr, zio, tx);
 
-	dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
-	    dmu_get_replication_level(os, &zb, dn->dn_type), txg,
-	    db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
+	dr->dr_zio = arc_write(zio, os->os_spa, &wp,
+	    txg, db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
 	    ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
 }
 
--- a/usr/src/uts/common/fs/zfs/dmu.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Mon Jul 07 13:39:21 2008 -0700
@@ -84,6 +84,8 @@
 	{	byteswap_uint8_array,	TRUE,	"ZFS SYSACL"		},
 	{	byteswap_uint8_array,	TRUE,	"FUID table"		},
 	{	byteswap_uint64_array,	TRUE,	"FUID table size"	},
+	{	zap_byteswap,		TRUE,	"DSL dataset next clones"},
+	{	zap_byteswap,		TRUE,	"scrub work queue"	},
 };
 
 int
@@ -843,6 +845,7 @@
 	dbuf_dirty_record_t *dr;
 	dmu_sync_arg_t *in;
 	zbookmark_t zb;
+	writeprops_t wp = { 0 };
 	zio_t *zio;
 	int zio_flags;
 	int err;
@@ -958,10 +961,14 @@
 	zio_flags = ZIO_FLAG_MUSTSUCCEED;
 	if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0)
 		zio_flags |= ZIO_FLAG_METADATA;
-	zio = arc_write(pio, os->os_spa,
-	    zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
-	    zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
-	    dmu_get_replication_level(os, &zb, db->db_dnode->dn_type),
+	wp.wp_type = db->db_dnode->dn_type;
+	wp.wp_copies = os->os_copies;
+	wp.wp_level = db->db_level;
+	wp.wp_dnchecksum = db->db_dnode->dn_checksum;
+	wp.wp_oschecksum = os->os_checksum;
+	wp.wp_dncompress = db->db_dnode->dn_compress;
+	wp.wp_oscompress = os->os_compress;
+	zio = arc_write(pio, os->os_spa, &wp,
 	    txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
 	    ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb);
 
@@ -1019,21 +1026,6 @@
 }
 
 int
-dmu_get_replication_level(objset_impl_t *os,
-    zbookmark_t *zb, dmu_object_type_t ot)
-{
-	int ncopies = os->os_copies;
-
-	/* If it's the mos, it should have max copies set. */
-	ASSERT(zb->zb_objset != 0 ||
-	    ncopies == spa_max_replication(os->os_spa));
-
-	if (dmu_ot[ot].ot_metadata || zb->zb_level != 0)
-		ncopies++;
-	return (MIN(ncopies, spa_max_replication(os->os_spa)));
-}
-
-int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
 	dnode_t *dn;
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Mon Jul 07 13:39:21 2008 -0700
@@ -149,7 +149,7 @@
     objset_impl_t **osip)
 {
 	objset_impl_t *osi;
-	int i, err, checksum;
+	int i, err;
 
 	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
 
@@ -167,8 +167,12 @@
 		zb.zb_blkid = 0;
 
 		dprintf_bp(osi->os_rootbp, "reading %s", "");
-		err = arc_read(NULL, spa, osi->os_rootbp,
-		    dmu_ot[DMU_OT_OBJSET].ot_byteswap,
+		/*
+		 * NB: when bprewrite scrub can change the bp,
+		 * and this is called from dmu_objset_open_ds_os, the bp
+		 * could change, and we'll need a lock.
+		 */
+		err = arc_read_nolock(NULL, spa, osi->os_rootbp,
 		    arc_getbuf_func, &osi->os_phys_buf,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 		if (err) {
@@ -176,8 +180,6 @@
 			return (err);
 		}
 		osi->os_phys = osi->os_phys_buf->b_data;
-		if (ds == NULL || dsl_dataset_is_snapshot(ds) == 0)
-			arc_release(osi->os_phys_buf, &osi->os_phys_buf);
 	} else {
 		osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
 		    &osi->os_phys_buf, ARC_BUFC_METADATA);
@@ -213,22 +215,8 @@
 		osi->os_copies = spa_max_replication(spa);
 	}
 
-	osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
-
-	/*
-	 * Metadata always gets compressed and checksummed.
-	 * If the data checksum is multi-bit correctable, and it's not
-	 * a ZBT-style checksum, then it's suitable for metadata as well.
-	 * Otherwise, the metadata checksum defaults to fletcher4.
-	 */
-	checksum = osi->os_checksum;
-
-	if (zio_checksum_table[checksum].ci_correctable &&
-	    !zio_checksum_table[checksum].ci_zbt)
-		osi->os_md_checksum = checksum;
-	else
-		osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
-	osi->os_md_compress = ZIO_COMPRESS_LZJB;
+	osi->os_zil_header = osi->os_phys->os_zil_header;
+	osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header);
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
@@ -835,22 +823,13 @@
 	}
 }
 
-/* ARGSUSED */
-static void
-killer(zio_t *zio, arc_buf_t *abuf, void *arg)
-{
-	objset_impl_t *os = arg;
-
-	ASSERT3U(zio->io_error, ==, 0);
-	arc_release(os->os_phys_buf, &os->os_phys_buf);
-}
-
 /* called from dsl */
 void
 dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
 {
 	int txgoff;
 	zbookmark_t zb;
+	writeprops_t wp = { 0 };
 	zio_t *zio;
 	list_t *list;
 	dbuf_dirty_record_t *dr;
@@ -881,10 +860,14 @@
 		(void) dsl_dataset_block_kill(os->os_dsl_dataset,
 		    os->os_rootbp, pio, tx);
 	}
-	zio = arc_write(pio, os->os_spa, os->os_md_checksum,
-	    os->os_md_compress,
-	    dmu_get_replication_level(os, &zb, DMU_OT_OBJSET),
-	    tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os,
+	wp.wp_type = DMU_OT_OBJSET;
+	wp.wp_copies = os->os_copies;
+	wp.wp_level = (uint8_t)-1;
+	wp.wp_oschecksum = os->os_checksum;
+	wp.wp_oscompress = os->os_compress;
+	arc_release(os->os_phys_buf, &os->os_phys_buf);
+	zio = arc_write(pio, os->os_spa, &wp,
+	    tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_METADATA,
 	    &zb);
 
@@ -910,6 +893,7 @@
 	 * Free intent log blocks up to this tx.
 	 */
 	zil_sync(os->os_zil, tx);
+	os->os_phys->os_zil_header = os->os_zil_header;
 	zio_nowait(zio);
 }
 
@@ -1046,48 +1030,80 @@
 	return (0);
 }
 
+struct findarg {
+	int (*func)(char *, void *);
+	void *arg;
+};
+
+/* ARGSUSED */
+static int
+findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+	struct findarg *fa = arg;
+	return (fa->func((char *)dsname, fa->arg));
+}
+
 /*
  * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ * Perhaps change all callers to use dmu_objset_find_spa()?
  */
 int
 dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags)
 {
+	struct findarg fa;
+	fa.func = func;
+	fa.arg = arg;
+	return (dmu_objset_find_spa(NULL, name, findfunc, &fa, flags));
+}
+
+/*
+ * Find all objsets under name, call func on each
+ */
+int
+dmu_objset_find_spa(spa_t *spa, const char *name,
+    int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags)
+{
 	dsl_dir_t *dd;
-	objset_t *os;
-	uint64_t snapobj;
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	char *child;
-	int do_self, err;
+	uint64_t thisobj;
+	int err;
 
-	err = dsl_dir_open(name, FTAG, &dd, NULL);
+	if (name == NULL)
+		name = spa_name(spa);
+	err = dsl_dir_open_spa(spa, name, FTAG, &dd, NULL);
 	if (err)
 		return (err);
 
-	/* NB: the $MOS dir doesn't have a head dataset */
-	do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
+	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+	if (dd->dd_myname[0] == '$') {
+		dsl_dir_close(dd, FTAG);
+		return (0);
+	}
+
+	thisobj = dd->dd_phys->dd_head_dataset_obj;
 	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+	dp = dd->dd_pool;
 
 	/*
 	 * Iterate over all children.
 	 */
 	if (flags & DS_FIND_CHILDREN) {
-		for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset,
+		for (zap_cursor_init(&zc, dp->dp_meta_objset,
 		    dd->dd_phys->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			ASSERT(attr->za_integer_length == sizeof (uint64_t));
 			ASSERT(attr->za_num_integers == 1);
 
-			/*
-			 * No separating '/' because parent's name ends in /.
-			 */
 			child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-			/* XXX could probably just use name here */
-			dsl_dir_name(dd, child);
+			(void) strcpy(child, name);
 			(void) strcat(child, "/");
 			(void) strcat(child, attr->za_name);
-			err = dmu_objset_find(child, func, arg, flags);
+			err = dmu_objset_find_spa(spa, child, func, arg, flags);
 			kmem_free(child, MAXPATHLEN);
 			if (err)
 				break;
@@ -1104,30 +1120,36 @@
 	/*
 	 * Iterate over all snapshots.
 	 */
-	if ((flags & DS_FIND_SNAPSHOTS) &&
-	    dmu_objset_open(name, DMU_OST_ANY,
-	    DS_MODE_USER | DS_MODE_READONLY, &os) == 0) {
+	if (flags & DS_FIND_SNAPSHOTS) {
+		if (!dsl_pool_sync_context(dp))
+			rw_enter(&dp->dp_config_rwlock, RW_READER);
+		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+		if (!dsl_pool_sync_context(dp))
+			rw_exit(&dp->dp_config_rwlock);
 
-		snapobj = os->os->os_dsl_dataset->ds_phys->ds_snapnames_zapobj;
-		dmu_objset_close(os);
+		if (err == 0) {
+			uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+			dsl_dataset_rele(ds, FTAG);
 
-		for (zap_cursor_init(&zc, dd->dd_pool->dp_meta_objset, snapobj);
-		    zap_cursor_retrieve(&zc, attr) == 0;
-		    (void) zap_cursor_advance(&zc)) {
-			ASSERT(attr->za_integer_length == sizeof (uint64_t));
-			ASSERT(attr->za_num_integers == 1);
+			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+			    zap_cursor_retrieve(&zc, attr) == 0;
+			    (void) zap_cursor_advance(&zc)) {
+				ASSERT(attr->za_integer_length ==
+				    sizeof (uint64_t));
+				ASSERT(attr->za_num_integers == 1);
 
-			child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
-			/* XXX could probably just use name here */
-			dsl_dir_name(dd, child);
-			(void) strcat(child, "@");
-			(void) strcat(child, attr->za_name);
-			err = func(child, arg);
-			kmem_free(child, MAXPATHLEN);
-			if (err)
-				break;
+				child = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+				(void) strcpy(child, name);
+				(void) strcat(child, "@");
+				(void) strcat(child, attr->za_name);
+				err = func(spa, attr->za_first_integer,
+				    child, arg);
+				kmem_free(child, MAXPATHLEN);
+				if (err)
+					break;
+			}
+			zap_cursor_fini(&zc);
 		}
-		zap_cursor_fini(&zc);
 	}
 
 	dsl_dir_close(dd, FTAG);
@@ -1139,8 +1161,7 @@
 	/*
 	 * Apply to self if appropriate.
 	 */
-	if (do_self)
-		err = func(name, arg);
+	err = func(spa, thisobj, name, arg);
 	return (err);
 }
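
dmu_objset_find() is now a thin wrapper around dmu_objset_find_spa(), whose callback receives the spa, the dataset's object number, and its name, so a caller can visit every dataset and snapshot without opening each objset. A hypothetical caller is sketched below; count_cb and count_datasets are illustrative names, not part of this patch.

/* ARGSUSED */
static int
count_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	(*(uint64_t *)arg)++;	/* tally each dataset and snapshot visited */
	return (0);		/* nonzero would abort the walk */
}

static uint64_t
count_datasets(spa_t *spa)
{
	uint64_t count = 0;

	/* a NULL name means "start from spa_name(spa)", i.e. the whole pool */
	(void) dmu_objset_find_spa(spa, NULL, count_cb, &count,
	    DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
	return (count);
}
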
 
--- a/usr/src/uts/common/fs/zfs/dmu_send.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c	Mon Jul 07 13:39:21 2008 -0700
@@ -200,10 +200,9 @@
 			zb.zb_object = object;
 			zb.zb_level = level;
 			zb.zb_blkid = blkid;
-			(void) arc_read(NULL, spa, bp,
-			    dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
-			    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
-			    &aflags, &zb);
+			(void) arc_read_nolock(NULL, spa, bp,
+			    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+			    ZIO_FLAG_MUSTSUCCEED, &aflags, &zb);
 
 			if (abuf) {
 				err = dump_data(ba, type, object, blkid * blksz,
@@ -241,11 +240,12 @@
 		return (EXDEV);
 
 	if (fromorigin) {
+		dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
 		if (fromsnap)
 			return (EINVAL);
 
-		if (ds->ds_dir->dd_phys->dd_origin_obj != NULL) {
-			dsl_pool_t *dp = ds->ds_dir->dd_pool;
+		if (dsl_dir_is_clone(ds->ds_dir)) {
 			rw_enter(&dp->dp_config_rwlock, RW_READER);
 			err = dsl_dataset_hold_obj(dp,
 			    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
@@ -407,7 +407,7 @@
 		return (EINVAL);
 
 	/* must not be a clone ds */
-	if (ds->ds_prev != NULL)
+	if (dsl_dir_is_clone(ds->ds_dir))
 		return (EINVAL);
 
 	err = dsl_dataset_destroy_check(ds, rbsa->tag, tx);
@@ -443,7 +443,7 @@
 	 */
 	dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx);
 
-	dsobj = dsl_dataset_create_sync_impl(dd, rbsa->origin, flags, tx);
+	dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx);
 
 	rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj,
 	    rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx);
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c	Mon Jul 07 13:39:21 2008 -0700
@@ -614,7 +614,10 @@
 			th->th_locked = 0;
 		}
 
-		rc = traverse_read(th, bc, &dsp->ds_bp, dn);
+		if (BP_IS_HOLE(&dsp->ds_bp))
+			rc = ERESTART;
+		else
+			rc = traverse_read(th, bc, &dsp->ds_bp, dn);
 
 		if (rc != 0) {
 			if (rc == ERESTART)
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c	Mon Jul 07 13:39:21 2008 -0700
@@ -387,7 +387,6 @@
 				mutex_exit(&db->db_mtx);
 			} else if (refcount_is_zero(&db->db_holds)) {
 				progress = TRUE;
-				ASSERT(!arc_released(db->db_buf));
 				dbuf_clear(db); /* exits db_mtx for us */
 			} else {
 				mutex_exit(&db->db_mtx);
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c	Mon Jul 07 13:39:21 2008 -0700
@@ -39,6 +39,7 @@
 #include <sys/zfs_context.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/spa.h>
+#include <sys/zfs_znode.h>
 #include <sys/sunddi.h>
 
 static char *dsl_reaper = "the grim reaper";
@@ -55,7 +56,6 @@
 
 #define	DSL_DATASET_IS_DESTROYED(ds)	((ds)->ds_owner == dsl_reaper)
 
-static void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag);
 
 /*
 * Figure out how much of this delta should be propagated to the dsl_dir
@@ -135,7 +135,7 @@
 		 * Account for the meta-objset space in its placeholder
 		 * dataset.
 		 */
-		err = arc_free(pio, tx->tx_pool->dp_spa,
+		err = dsl_free(pio, tx->tx_pool,
 		    tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
 		ASSERT(err == 0);
 
@@ -153,7 +153,7 @@
 		int64_t delta;
 
 		dprintf_bp(bp, "freeing: %s", "");
-		err = arc_free(pio, tx->tx_pool->dp_spa,
+		err = dsl_free(pio, tx->tx_pool,
 		    tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT);
 		ASSERT(err == 0);
 
@@ -439,6 +439,8 @@
 	}
 	ASSERT3P(ds->ds_dbuf, ==, dbuf);
 	ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
+	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
+	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
 	mutex_enter(&ds->ds_lock);
 	if (!dsl_pool_sync_context(dp) && DSL_DATASET_IS_DESTROYED(ds)) {
 		mutex_exit(&ds->ds_lock);
@@ -651,7 +653,7 @@
 	return (result);
 }
 
-static void
+void
 dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag)
 {
 	dmu_buf_rele(ds->ds_dbuf, tag);
@@ -660,7 +662,6 @@
 void
 dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
 {
-	ASSERT(ds->ds_owner != tag);
 	if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) {
 		rw_exit(&ds->ds_rwlock);
 	}
@@ -711,51 +712,8 @@
 		rw_enter(&ds->ds_rwlock, RW_WRITER);
 }
 
-void
-dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
-{
-	objset_t *mos = dp->dp_meta_objset;
-	dmu_buf_t *dbuf;
-	dsl_dataset_phys_t *dsphys;
-	dsl_dataset_t *ds;
-	uint64_t dsobj;
-	dsl_dir_t *dd;
-
-	dsl_dir_create_root(mos, ddobjp, tx);
-	VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
-
-	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
-	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
-	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
-	dmu_buf_will_dirty(dbuf, tx);
-	dsphys = dbuf->db_data;
-	dsphys->ds_dir_obj = dd->dd_object;
-	dsphys->ds_fsid_guid = unique_create();
-	(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
-	    sizeof (dsphys->ds_guid));
-	dsphys->ds_snapnames_zapobj =
-	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
-	    DMU_OT_NONE, 0, tx);
-	dsphys->ds_creation_time = gethrestime_sec();
-	dsphys->ds_creation_txg = tx->tx_txg;
-	dsphys->ds_deadlist_obj =
-	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
-	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
-		dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-	dmu_buf_rele(dbuf, FTAG);
-
-	dmu_buf_will_dirty(dd->dd_dbuf, tx);
-	dd->dd_phys->dd_head_dataset_obj = dsobj;
-	dsl_dir_close(dd, FTAG);
-
-	VERIFY(0 == dsl_dataset_get_ref(dp, dsobj, FTAG, &ds));
-	(void) dmu_objset_create_impl(dp->dp_spa, ds,
-	    &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
-	dsl_dataset_drop_ref(ds, FTAG);
-}
-
 uint64_t
-dsl_dataset_create_sync_impl(dsl_dir_t *dd, dsl_dataset_t *origin,
+dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
     uint64_t flags, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = dd->dd_pool;
@@ -764,6 +722,9 @@
 	uint64_t dsobj;
 	objset_t *mos = dp->dp_meta_objset;
 
+	if (origin == NULL)
+		origin = dp->dp_origin_snap;
+
 	ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
 	ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
 	ASSERT(dmu_tx_is_syncing(tx));
@@ -784,7 +745,7 @@
 	    zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
 	    DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
-	dsphys->ds_creation_txg = tx->tx_txg;
+	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
 	dsphys->ds_deadlist_obj =
 	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
 
@@ -804,6 +765,17 @@
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
 		origin->ds_phys->ds_num_children++;
 
+		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
+			if (origin->ds_phys->ds_next_clones_obj == 0) {
+				origin->ds_phys->ds_next_clones_obj =
+				    zap_create(mos,
+				    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+			}
+			VERIFY(0 == zap_add_int(mos,
+			    origin->ds_phys->ds_next_clones_obj,
+			    dsobj, tx));
+		}
+
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
 		dd->dd_phys->dd_origin_obj = origin->ds_object;
 	}
@@ -829,10 +801,10 @@
 
 	ASSERT(lastname[0] != '@');
 
-	ddobj = dsl_dir_create_sync(pdd, lastname, tx);
+	ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
 	VERIFY(0 == dsl_dir_open_obj(dp, ddobj, lastname, FTAG, &dd));
 
-	dsobj = dsl_dataset_create_sync_impl(dd, origin, flags, tx);
+	dsobj = dsl_dataset_create_sync_dd(dd, origin, flags, tx);
 
 	dsl_deleg_set_create_perms(dd, tx, cr);
 
@@ -1155,8 +1127,8 @@
 	*ka->compressedp += BP_GET_PSIZE(bp);
 	*ka->uncompressedp += BP_GET_UCSIZE(bp);
 	/* XXX check for EIO? */
-	(void) arc_free(ka->zio, spa, ka->tx->tx_txg, bp, NULL, NULL,
-	    ARC_NOWAIT);
+	(void) dsl_free(ka->zio, spa_get_dsl(spa), ka->tx->tx_txg,
+	    bp, NULL, NULL, ARC_NOWAIT);
 	return (0);
 }
 
@@ -1251,7 +1223,7 @@
 		    delta, -compressed, -uncompressed, tx);
 	}
 
-	if (ds->ds_prev) {
+	if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) {
 		/* Change our contents to that of the prev snapshot */
 		ASSERT3U(ds->ds_prev->ds_object, ==,
 		    ds->ds_phys->ds_prev_snap_obj);
@@ -1270,6 +1242,8 @@
 			ds->ds_prev->ds_phys->ds_unique_bytes = 0;
 		}
 	} else {
+		objset_impl_t *osi;
+
 		/* Zero out our contents, recreate objset */
 		bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t));
 		ds->ds_phys->ds_used_bytes = 0;
@@ -1277,8 +1251,11 @@
 		ds->ds_phys->ds_uncompressed_bytes = 0;
 		ds->ds_phys->ds_flags = 0;
 		ds->ds_phys->ds_unique_bytes = 0;
-		(void) dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
+		osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds,
 		    &ds->ds_phys->ds_bp, *ost, tx);
+#ifdef _KERNEL
+		zfs_create_fs(&osi->os, kcred, NULL, tx);
+#endif
 	}
 
 	spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa,
@@ -1439,6 +1416,8 @@
 
 	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
 
+	dsl_pool_ds_destroyed(ds, tx);
+
 	obj = ds->ds_object;
 
 	if (ds->ds_phys->ds_prev_snap_obj != 0) {
@@ -1453,6 +1432,16 @@
 
 		dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
 		if (after_branch_point &&
+		    ds_prev->ds_phys->ds_next_clones_obj != 0) {
+			VERIFY(0 == zap_remove_int(mos,
+			    ds_prev->ds_phys->ds_next_clones_obj, obj, tx));
+			if (ds->ds_phys->ds_next_snap_obj != 0) {
+				VERIFY(0 == zap_add_int(mos,
+				    ds_prev->ds_phys->ds_next_clones_obj,
+				    ds->ds_phys->ds_next_snap_obj, tx));
+			}
+		}
+		if (after_branch_point &&
 		    ds->ds_phys->ds_next_snap_obj == 0) {
 			/* This clone is toast. */
 			ASSERT(ds_prev->ds_phys->ds_num_children > 1);
@@ -1471,8 +1460,6 @@
 		uint64_t itor = 0;
 		uint64_t old_unique;
 
-		spa_scrub_restart(dp->dp_spa, tx->tx_txg);
-
 		VERIFY(0 == dsl_dataset_hold_obj(dp,
 		    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
 		ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
@@ -1510,7 +1497,7 @@
 				compressed += BP_GET_PSIZE(&bp);
 				uncompressed += BP_GET_UCSIZE(&bp);
 				/* XXX check return value? */
-				(void) arc_free(zio, dp->dp_spa, tx->tx_txg,
+				(void) dsl_free(zio, dp, tx->tx_txg,
 				    &bp, NULL, NULL, ARC_NOWAIT);
 			}
 		}
@@ -1670,6 +1657,13 @@
 	spa_history_internal_log(LOG_DS_DESTROY, dp->dp_spa, tx,
 	    cr, "dataset = %llu", ds->ds_object);
 
+	if (ds->ds_phys->ds_next_clones_obj != 0) {
+		uint64_t count;
+		ASSERT(0 == zap_count(mos,
+		    ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
+		VERIFY(0 == dmu_object_free(mos,
+		    ds->ds_phys->ds_next_clones_obj, tx));
+	}
 	dsl_dir_close(ds->ds_dir, ds);
 	ds->ds_dir = NULL;
 	dsl_dataset_drain_refs(ds, tag);
@@ -1751,13 +1745,20 @@
 	dsl_pool_t *dp = ds->ds_dir->dd_pool;
 	dmu_buf_t *dbuf;
 	dsl_dataset_phys_t *dsphys;
-	uint64_t dsobj;
+	uint64_t dsobj, crtxg;
 	objset_t *mos = dp->dp_meta_objset;
 	int err;
 
-	spa_scrub_restart(dp->dp_spa, tx->tx_txg);
 	ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
 
+	/*
+	 * The origin's ds_creation_txg has to be < TXG_INITIAL
+	 */
+	if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
+		crtxg = 1;
+	else
+		crtxg = tx->tx_txg;
+
 	dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
 	    DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
 	VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
@@ -1773,7 +1774,7 @@
 	dsphys->ds_next_snap_obj = ds->ds_object;
 	dsphys->ds_num_children = 1;
 	dsphys->ds_creation_time = gethrestime_sec();
-	dsphys->ds_creation_txg = tx->tx_txg;
+	dsphys->ds_creation_txg = crtxg;
 	dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
 	dsphys->ds_used_bytes = ds->ds_phys->ds_used_bytes;
 	dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
@@ -1784,6 +1785,8 @@
 
 	ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
 	if (ds->ds_prev) {
+		uint64_t next_clones_obj =
+		    ds->ds_prev->ds_phys->ds_next_clones_obj;
 		ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
 		    ds->ds_object ||
 		    ds->ds_prev->ds_phys->ds_num_children > 1);
@@ -1792,6 +1795,11 @@
 			ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
 			    ds->ds_prev->ds_phys->ds_creation_txg);
 			ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
+		} else if (next_clones_obj != 0) {
+			VERIFY3U(0, ==, zap_remove_int(mos,
+			    next_clones_obj, dsphys->ds_next_snap_obj, tx));
+			VERIFY3U(0, ==, zap_add_int(mos,
+			    next_clones_obj, dsobj, tx));
 		}
 	}
 
@@ -1809,7 +1817,7 @@
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
 	ds->ds_phys->ds_prev_snap_obj = dsobj;
-	ds->ds_phys->ds_prev_snap_txg = tx->tx_txg;
+	ds->ds_phys->ds_prev_snap_txg = crtxg;
 	ds->ds_phys->ds_unique_bytes = 0;
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
@@ -1828,6 +1836,8 @@
 	VERIFY(0 == dsl_dataset_get_ref(dp,
 	    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
 
+	dsl_pool_ds_snapshotted(ds, tx);
+
 	spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr,
 	    "dataset = %llu", dsobj);
 }
@@ -1899,7 +1909,7 @@
 
 	/* clone origin is really a dsl_dir thing... */
 	rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER);
-	if (ds->ds_dir->dd_phys->dd_origin_obj) {
+	if (dsl_dir_is_clone(ds->ds_dir)) {
 		dsl_dataset_t *ods;
 
 		VERIFY(0 == dsl_dataset_get_ref(ds->ds_dir->dd_pool,
@@ -2146,7 +2156,7 @@
 	if (tail == NULL) {
 		int delta = strlen(newname) - strlen(oldname);
 
-		/* if we're growing, validate child size lengths */
+		/* if we're growing, validate child name lengths */
 		if (delta > 0)
 			err = dmu_objset_find(oldname, dsl_valid_rename,
 			    &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
@@ -2189,7 +2199,7 @@
 	return (err);
 }
 
-struct promotedsarg {
+struct promotenode {
 	list_node_t link;
 	dsl_dataset_t *ds;
 };
@@ -2207,7 +2217,7 @@
 {
 	dsl_dataset_t *hds = arg1;
 	struct promotearg *pa = arg2;
-	struct promotedsarg *snap = list_head(&pa->snap_list);
+	struct promotenode *snap = list_head(&pa->snap_list);
 	dsl_pool_t *dp = hds->ds_dir->dd_pool;
 	dsl_dataset_t *origin_ds = snap->ds;
 	dsl_dataset_t *newnext_ds;
@@ -2216,8 +2226,8 @@
 	blkptr_t bp;
 	int err;
 
-	/* Check that it is a clone */
-	if (hds->ds_dir->dd_phys->dd_origin_obj == 0)
+	/* Check that it is a real clone */
+	if (!dsl_dir_is_clone(hds->ds_dir))
 		return (EINVAL);
 
 	/* Since this is so expensive, don't do the preliminary check */
@@ -2321,12 +2331,13 @@
 {
 	dsl_dataset_t *hds = arg1;
 	struct promotearg *pa = arg2;
-	struct promotedsarg *snap = list_head(&pa->snap_list);
+	struct promotenode *snap = list_head(&pa->snap_list);
 	dsl_dataset_t *origin_ds = snap->ds;
 	dsl_dir_t *dd = hds->ds_dir;
 	dsl_pool_t *dp = hds->ds_dir->dd_pool;
 	dsl_dir_t *odd = NULL;
 	char *name;
+	uint64_t oldnext_obj;
 
 	ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE));
 
@@ -2339,8 +2350,19 @@
 
 	/* change origin's next snap */
 	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
+	oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
 	origin_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj;
 
+	/* change the origin's next clone */
+	if (origin_ds->ds_phys->ds_next_clones_obj) {
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    origin_ds->ds_phys->ds_next_clones_obj,
+		    pa->newnext_obj, tx));
+		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+		    origin_ds->ds_phys->ds_next_clones_obj,
+		    oldnext_obj, tx));
+	}
+
 	/* change origin */
 	dmu_buf_will_dirty(dd->dd_dbuf, tx);
 	ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
@@ -2394,7 +2416,7 @@
 	dsl_pool_t *dp;
 	dmu_object_info_t doi;
 	struct promotearg pa;
-	struct promotedsarg *snap;
+	struct promotenode *snap;
 	uint64_t snap_obj;
 	uint64_t last_snap = 0;
 	int err;
@@ -2420,39 +2442,51 @@
 	 */
 	pa.clone_origin = NULL;
 	list_create(&pa.snap_list,
-	    sizeof (struct promotedsarg), offsetof(struct promotedsarg, link));
+	    sizeof (struct promotenode), offsetof(struct promotenode, link));
 	rw_enter(&dp->dp_config_rwlock, RW_READER);
 	ASSERT(dd->dd_phys->dd_origin_obj != 0);
 	snap_obj = dd->dd_phys->dd_origin_obj;
 	while (snap_obj) {
-		snap = kmem_alloc(sizeof (struct promotedsarg), KM_SLEEP);
-		err = dsl_dataset_own_obj(dp, snap_obj, 0, FTAG, &snap->ds);
+		dsl_dataset_t *snapds;
+
+		/*
+		 * NB: this would be handled by the below check for
+		 * clone of a clone, but then we'd always own_obj() the
+		 * $ORIGIN, thus causing unnecessary EBUSYs.  We don't
+		 * need to set pa.clone_origin because the $ORIGIN has
+		 * no data to account for.
+		 */
+		if (dp->dp_origin_snap &&
+		    snap_obj == dp->dp_origin_snap->ds_object)
+			break;
+
+		err = dsl_dataset_own_obj(dp, snap_obj, 0, FTAG, &snapds);
 		if (err == ENOENT) {
 			/* lost race with snapshot destroy */
-			struct promotedsarg *last = list_tail(&pa.snap_list);
+			struct promotenode *last = list_tail(&pa.snap_list);
 			ASSERT(snap_obj != last->ds->ds_phys->ds_prev_snap_obj);
 			snap_obj = last->ds->ds_phys->ds_prev_snap_obj;
-			kmem_free(snap, sizeof (struct promotedsarg));
 			continue;
 		} else if (err) {
-			kmem_free(snap, sizeof (struct promotedsarg));
 			rw_exit(&dp->dp_config_rwlock);
 			goto out;
 		}
+
 		/*
 		 * We could be a clone of a clone.  If we reach our
 		 * parent's branch point, we're done.
 		 */
 		if (last_snap &&
-		    snap->ds->ds_phys->ds_next_snap_obj != last_snap) {
-			pa.clone_origin = snap->ds;
-			kmem_free(snap, sizeof (struct promotedsarg));
-			snap_obj = 0;
-		} else {
-			list_insert_tail(&pa.snap_list, snap);
-			last_snap = snap_obj;
-			snap_obj = snap->ds->ds_phys->ds_prev_snap_obj;
+		    snapds->ds_phys->ds_next_snap_obj != last_snap) {
+			pa.clone_origin = snapds;
+			break;
 		}
+
+		snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP);
+		snap->ds = snapds;
+		list_insert_tail(&pa.snap_list, snap);
+		last_snap = snap_obj;
+		snap_obj = snap->ds->ds_phys->ds_prev_snap_obj;
 	}
 	snap = list_head(&pa.snap_list);
 	ASSERT(snap != NULL);
@@ -2476,7 +2510,7 @@
 	while ((snap = list_tail(&pa.snap_list)) != NULL) {
 		list_remove(&pa.snap_list, snap);
 		dsl_dataset_disown(snap->ds, FTAG);
-		kmem_free(snap, sizeof (struct promotedsarg));
+		kmem_free(snap, sizeof (struct promotenode));
 	}
 	list_destroy(&pa.snap_list);
 	if (pa.clone_origin)
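
The new per-snapshot ds_next_clones_obj is a ZAP (type DMU_OT_NEXT_CLONES, "DSL dataset next clones") recording, for every clone branched off a snapshot, that clone's next dataset; clone creation adds an entry, and snapshot, destroy, and promote move entries, presumably so the scrub and destroy paths can find the datasets that still reference a snapshot's blocks without walking the whole pool. A minimal consumer sketch using only interfaces that appear in this patch follows; clones_of is a hypothetical helper.

static uint64_t
clones_of(objset_t *mos, dsl_dataset_t *snap)
{
	uint64_t count = 0;

	if (snap->ds_phys->ds_next_clones_obj != 0) {
		/* the same zap_count() call the destroy path asserts on above */
		VERIFY(0 == zap_count(mos,
		    snap->ds_phys->ds_next_clones_obj, &count));
	}
	return (count);
}
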
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c	Mon Jul 07 13:39:21 2008 -0700
@@ -406,23 +406,31 @@
 }
 
 uint64_t
-dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
+dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
+    dmu_tx_t *tx)
 {
-	objset_t *mos = pds->dd_pool->dp_meta_objset;
+	objset_t *mos = dp->dp_meta_objset;
 	uint64_t ddobj;
 	dsl_dir_phys_t *dsphys;
 	dmu_buf_t *dbuf;
 
 	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
 	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
-	VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
-	    name, sizeof (uint64_t), 1, &ddobj, tx));
+	if (pds) {
+		VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+		    name, sizeof (uint64_t), 1, &ddobj, tx));
+	} else {
+		/* it's the root dir */
+		VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
+	}
 	VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
 	dsphys = dbuf->db_data;
 
 	dsphys->dd_creation_time = gethrestime_sec();
-	dsphys->dd_parent_obj = pds->dd_object;
+	if (pds)
+		dsphys->dd_parent_obj = pds->dd_object;
 	dsphys->dd_props_zapobj = zap_create(mos,
 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 	dsphys->dd_child_dir_zapobj = zap_create(mos,
@@ -489,31 +497,13 @@
 	VERIFY(0 == dmu_object_free(mos, obj, tx));
 }
 
-void
-dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
+boolean_t
+dsl_dir_is_clone(dsl_dir_t *dd)
 {
-	dsl_dir_phys_t *dsp;
-	dmu_buf_t *dbuf;
-	int error;
-
-	*ddobjp = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
-	    DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
-
-	error = zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET,
-	    sizeof (uint64_t), 1, ddobjp, tx);
-	ASSERT3U(error, ==, 0);
-
-	VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf));
-	dmu_buf_will_dirty(dbuf, tx);
-	dsp = dbuf->db_data;
-
-	dsp->dd_creation_time = gethrestime_sec();
-	dsp->dd_props_zapobj = zap_create(mos,
-	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
-	dsp->dd_child_dir_zapobj = zap_create(mos,
-	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
-
-	dmu_buf_rele(dbuf, FTAG);
+	return (dd->dd_phys->dd_origin_obj &&
+	    (dd->dd_pool->dp_origin_snap == NULL ||
+	    dd->dd_phys->dd_origin_obj !=
+	    dd->dd_pool->dp_origin_snap->ds_object));
 }
 
 void
@@ -531,7 +521,7 @@
 	mutex_exit(&dd->dd_lock);
 
 	rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
-	if (dd->dd_phys->dd_origin_obj) {
+	if (dsl_dir_is_clone(dd)) {
 		dsl_dataset_t *ds;
 		char buf[MAXNAMELEN];
 
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c	Mon Jul 07 13:39:21 2008 -0700
@@ -36,23 +36,26 @@
 #include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
 
 int zfs_no_write_throttle = 0;
 uint64_t zfs_write_limit_override = 0;
 
+
 static int
-dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp)
+dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
 {
 	uint64_t obj;
 	int err;
 
 	err = zap_lookup(dp->dp_meta_objset,
 	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
-	    MOS_DIR_NAME, sizeof (obj), 1, &obj);
+	    name, sizeof (obj), 1, &obj);
 	if (err)
 		return (err);
 
-	return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp));
+	return (dsl_dir_open_obj(dp, obj, name, dp, ddp));
 }
 
 static dsl_pool_t *
@@ -79,6 +82,7 @@
 	    offsetof(dsl_dataset_t, ds_synced_link));
 
 	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	return (dp);
 }
@@ -88,9 +92,11 @@
 {
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+	dsl_dir_t *dd;
+	dsl_dataset_t *ds;
 	objset_impl_t *osi;
 
-	rw_enter(&dp->dp_config_rwlock, RW_READER);
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
 	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
 	if (err)
 		goto out;
@@ -107,10 +113,73 @@
 	if (err)
 		goto out;
 
-	err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir);
+	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
 	if (err)
 		goto out;
 
+	if (spa_version(spa) >= SPA_VERSION_ORIGIN) {
+		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
+		if (err)
+			goto out;
+		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
+		    FTAG, &ds);
+		if (err)
+			goto out;
+		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+		    dp, &dp->dp_origin_snap);
+		if (err)
+			goto out;
+		dsl_dataset_rele(ds, FTAG);
+		dsl_dir_close(dd, dp);
+	}
+
+	/* get scrub status */
+	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
+	    &dp->dp_scrub_func);
+	if (err == 0) {
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
+		    &dp->dp_scrub_queue_obj);
+		if (err)
+			goto out;
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
+		    &dp->dp_scrub_min_txg);
+		if (err)
+			goto out;
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
+		    &dp->dp_scrub_max_txg);
+		if (err)
+			goto out;
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+		    &dp->dp_scrub_bookmark);
+		if (err)
+			goto out;
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
+		    &spa->spa_scrub_errors);
+		if (err)
+			goto out;
+		if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) {
+			/*
+			 * A new-type scrub was in progress on an old
+			 * pool.  Restart from the beginning, since the
+			 * old software may have changed the pool in the
+			 * meantime.
+			 */
+			dsl_pool_scrub_restart(dp);
+		}
+	} else {
+		/*
+		 * It's OK if there is no scrub in progress (and if
+		 * there was an I/O error, ignore it).
+		 */
+		err = 0;
+	}
+
 out:
 	rw_exit(&dp->dp_config_rwlock);
 	if (err)
@@ -124,7 +193,15 @@
 void
 dsl_pool_close(dsl_pool_t *dp)
 {
-	/* drop our reference from dsl_pool_open() */
+	/* drop our references from dsl_pool_open() */
+
+	/*
+	 * Since we held the origin_snap from "syncing" context (which
+	 * includes pool-opening context), it actually only got a "ref"
+	 * and not a hold, so just drop that here.
+	 */
+	if (dp->dp_origin_snap)
+		dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
 	if (dp->dp_mos_dir)
 		dsl_dir_close(dp->dp_mos_dir, dp);
 	if (dp->dp_root_dir)
@@ -142,6 +219,7 @@
 	txg_fini(dp);
 	rw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
+	mutex_destroy(&dp->dp_scrub_cancel_lock);
 	kmem_free(dp, sizeof (dsl_pool_t));
 }
 
@@ -151,6 +229,11 @@
 	int err;
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+	objset_impl_t *osip;
+	dsl_dataset_t *ds;
+	uint64_t dsobj;
+
+	/* create and open the MOS (meta-objset) */
 	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
 	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
 
@@ -160,13 +243,29 @@
 	ASSERT3U(err, ==, 0);
 
 	/* create and open the root dir */
-	dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
+	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
 	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
 	    NULL, dp, &dp->dp_root_dir));
 
 	/* create and open the meta-objset dir */
-	(void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx);
-	VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir));
+	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
+	VERIFY(0 == dsl_pool_open_special_dir(dp,
+	    MOS_DIR_NAME, &dp->dp_mos_dir));
+
+	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
+		dsl_pool_create_origin(dp, tx);
+
+	/* create the root dataset */
+	dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+
+	/* create the root objset */
+	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+	osip = dmu_objset_create_impl(dp->dp_spa, ds,
+	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
+#ifdef _KERNEL
+	zfs_create_fs(&osip->os, kcred, NULL, tx);
+#endif
+	dsl_dataset_rele(ds, FTAG);
 
 	dmu_tx_commit(tx);
 
@@ -202,6 +301,9 @@
 	while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
 		dsl_dir_sync(dd, tx);
 
+	if (spa_sync_pass(dp->dp_spa) == 1)
+		dsl_pool_scrub_sync(dp, tx);
+
 	if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 	    list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
 		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
@@ -338,3 +440,111 @@
 		mutex_exit(&dp->dp_lock);
 	}
 }
+
+/* ARGSUSED */
+static int
+upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+	dmu_tx_t *tx = arg;
+	dsl_dataset_t *ds, *prev = NULL;
+	int err;
+	dsl_pool_t *dp = spa_get_dsl(spa);
+
+	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+	if (err)
+		return (err);
+
+	while (ds->ds_phys->ds_prev_snap_obj != 0) {
+		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+		    FTAG, &prev);
+		if (err) {
+			dsl_dataset_rele(ds, FTAG);
+			return (err);
+		}
+
+		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
+			break;
+		dsl_dataset_rele(ds, FTAG);
+		ds = prev;
+		prev = NULL;
+	}
+
+	if (prev == NULL) {
+		prev = dp->dp_origin_snap;
+
+		/*
+		 * The $ORIGIN can't have any data, or the accounting
+		 * will be wrong.
+		 */
+		ASSERT(prev->ds_phys->ds_bp.blk_birth == 0);
+
+		/* The origin doesn't get attached to itself */
+		if (ds->ds_object == prev->ds_object) {
+			dsl_dataset_rele(ds, FTAG);
+			return (0);
+		}
+
+		dmu_buf_will_dirty(ds->ds_dbuf, tx);
+		ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
+		ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
+
+		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+		ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
+
+		dmu_buf_will_dirty(prev->ds_dbuf, tx);
+		prev->ds_phys->ds_num_children++;
+
+		if (ds->ds_phys->ds_next_snap_obj == 0) {
+			ASSERT(ds->ds_prev == NULL);
+			VERIFY(0 == dsl_dataset_hold_obj(dp,
+			    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+		}
+	}
+
+	ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
+	ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);
+
+	if (prev->ds_phys->ds_next_clones_obj == 0) {
+		prev->ds_phys->ds_next_clones_obj =
+		    zap_create(dp->dp_meta_objset,
+		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+	}
+	VERIFY(0 == zap_add_int(dp->dp_meta_objset,
+	    prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
+
+	dsl_dataset_rele(ds, FTAG);
+	if (prev != dp->dp_origin_snap)
+		dsl_dataset_rele(prev, FTAG);
+	return (0);
+}
+
+void
+dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(dp->dp_origin_snap != NULL);
+
+	(void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
+	    tx, DS_FIND_CHILDREN);
+}
+
+void
+dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	uint64_t dsobj;
+	dsl_dataset_t *ds;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+	ASSERT(dp->dp_origin_snap == NULL);
+
+	/* create the origin dir, ds, & snap-ds */
+	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
+	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
+	    NULL, 0, kcred, tx);
+	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+	dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, kcred, tx);
+	VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+	    dp, &dp->dp_origin_snap));
+	dsl_dataset_rele(ds, FTAG);
+	rw_exit(&dp->dp_config_rwlock);
+}
--- a/usr/src/uts/common/fs/zfs/dsl_prop.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c	Mon Jul 07 13:39:21 2008 -0700
@@ -156,7 +156,7 @@
 }
 
 int
-dsl_prop_get_ds(dsl_dir_t *dd, const char *propname,
+dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
     int intsz, int numints, void *buf, char *setpoint)
 {
 	int err;
@@ -194,7 +194,7 @@
 		return (ENOENT);
 	}
 
-	err = dsl_prop_get_ds(dd, propname, intsz, numints, buf, setpoint);
+	err = dsl_prop_get_dd(dd, propname, intsz, numints, buf, setpoint);
 
 	dsl_dir_close(dd, FTAG);
 	return (err);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c	Mon Jul 07 13:39:21 2008 -0700
@@ -0,0 +1,915 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+
+/* XXX */
+#ifndef _KERNEL
+#include <ucontext.h>
+#include <stdio.h>
+#endif
+
+typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+
+static scrub_cb_t dsl_pool_scrub_clean_cb;
+static dsl_syncfunc_t dsl_pool_scrub_cancel_sync;
+
+int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
+int zfs_scrub_max_time = 2; /* scrub for at most 2 sec each txg */
+boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+
+extern int zfs_txg_timeout;
+
+static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = {
+	NULL,
+	dsl_pool_scrub_clean_cb
+};
+
+#define	SET_BOOKMARK(zb, objset, object, level, blkid)  \
+{                                                       \
+	(zb)->zb_objset = objset;                       \
+	(zb)->zb_object = object;                       \
+	(zb)->zb_level = level;                         \
+	(zb)->zb_blkid = blkid;                         \
+}
+
+/* ARGSUSED */
+static void
+dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = arg1;
+	enum scrub_func *funcp = arg2;
+	dmu_object_type_t ot = 0;
+	boolean_t complete = B_FALSE;
+
+	dsl_pool_scrub_cancel_sync(dp, &complete, cr, tx);
+
+	ASSERT(dp->dp_scrub_func == SCRUB_FUNC_NONE);
+	ASSERT(*funcp > SCRUB_FUNC_NONE);
+	ASSERT(*funcp < SCRUB_FUNC_NUMFUNCS);
+
+	dp->dp_scrub_min_txg = 0;
+	dp->dp_scrub_max_txg = tx->tx_txg;
+
+	if (*funcp == SCRUB_FUNC_CLEAN) {
+		vdev_t *rvd = dp->dp_spa->spa_root_vdev;
+
+		/* rewrite all disk labels */
+		vdev_config_dirty(rvd);
+
+		if (vdev_resilver_needed(rvd,
+		    &dp->dp_scrub_min_txg, &dp->dp_scrub_max_txg)) {
+			spa_event_notify(dp->dp_spa, NULL,
+			    ESC_ZFS_RESILVER_START);
+			dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg,
+			    tx->tx_txg);
+		}
+
+		/* zero out the scrub stats in all vdev_stat_t's */
+		vdev_scrub_stat_update(rvd,
+		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
+		    POOL_SCRUB_EVERYTHING, B_FALSE);
+
+		dp->dp_spa->spa_scrub_started = B_TRUE;
+	}
+
+	/* back to the generic stuff */
+
+	if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
+		ot = DMU_OT_ZAP_OTHER;
+
+	dp->dp_scrub_func = *funcp;
+	dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
+	    ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
+	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+	dp->dp_scrub_restart = B_FALSE;
+
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1,
+	    &dp->dp_scrub_func, tx));
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_QUEUE, sizeof (uint64_t), 1,
+	    &dp->dp_scrub_queue_obj, tx));
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_MIN_TXG, sizeof (uint64_t), 1,
+	    &dp->dp_scrub_min_txg, tx));
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
+	    &dp->dp_scrub_max_txg, tx));
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+	    &dp->dp_scrub_bookmark, tx));
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
+	    &dp->dp_spa->spa_scrub_errors, tx));
+
+	spa_history_internal_log(LOG_POOL_SCRUB, dp->dp_spa, tx, cr,
+	    "func=%u mintxg=%llu maxtxg=%llu",
+	    *funcp, dp->dp_scrub_min_txg, dp->dp_scrub_max_txg);
+}
+
+int
+dsl_pool_scrub_setup(dsl_pool_t *dp, enum scrub_func func)
+{
+	return (dsl_sync_task_do(dp, NULL,
+	    dsl_pool_scrub_setup_sync, dp, &func, 0));
+}
+
+/* ARGSUSED */
+static void
+dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = arg1;
+	boolean_t *completep = arg2;
+
+	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
+		return;
+
+	mutex_enter(&dp->dp_scrub_cancel_lock);
+
+	if (dp->dp_scrub_restart) {
+		dp->dp_scrub_restart = B_FALSE;
+		*completep = B_FALSE;
+	}
+
+	/* XXX this is scrub-clean specific */
+	mutex_enter(&dp->dp_spa->spa_scrub_lock);
+	while (dp->dp_spa->spa_scrub_inflight > 0) {
+		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
+		    &dp->dp_spa->spa_scrub_lock);
+	}
+	mutex_exit(&dp->dp_spa->spa_scrub_lock);
+	dp->dp_spa->spa_scrub_started = B_FALSE;
+
+	dp->dp_scrub_func = SCRUB_FUNC_NONE;
+	VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+	    dp->dp_scrub_queue_obj, tx));
+	dp->dp_scrub_queue_obj = 0;
+	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+
+	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_QUEUE, tx));
+	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_MIN_TXG, tx));
+	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_MAX_TXG, tx));
+	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_BOOKMARK, tx));
+	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_FUNC, tx));
+	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_ERRORS, tx));
+
+	spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
+	    "complete=%u", *completep);
+
+	/* below is scrub-clean specific */
+	vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, POOL_SCRUB_NONE,
+	    *completep);
+	/*
+	 * If the scrub/resilver completed, update all DTLs to reflect this.
+	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
+	 */
+	vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg,
+	    *completep ? dp->dp_scrub_max_txg : 0, B_TRUE);
+	if (dp->dp_scrub_min_txg && *completep)
+		spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH);
+	spa_errlog_rotate(dp->dp_spa);
+
+	/*
+	 * We may have finished replacing a device.
+	 * Let the async thread assess this and handle the detach.
+	 */
+	spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER_DONE);
+
+	dp->dp_scrub_min_txg = dp->dp_scrub_max_txg = 0;
+	mutex_exit(&dp->dp_scrub_cancel_lock);
+}
+
+int
+dsl_pool_scrub_cancel(dsl_pool_t *dp)
+{
+	boolean_t complete = B_FALSE;
+
+	return (dsl_sync_task_do(dp, NULL,
+	    dsl_pool_scrub_cancel_sync, dp, &complete, 3));
+}
+
+int
+dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
+    zio_done_func_t *done, void *private, uint32_t arc_flags)
+{
+	/*
+	 * This function will be used by bp-rewrite wad to intercept frees.
+	 */
+	return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp,
+	    done, private, arc_flags));
+}
+
+static boolean_t
+bookmark_is_zero(const zbookmark_t *zb)
+{
+	return (zb->zb_objset == 0 && zb->zb_object == 0 &&
+	    zb->zb_level == 0 && zb->zb_blkid == 0);
+}
+
+/* dnp is the dnode for zb1->zb_object */
+static boolean_t
+bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1,
+    const zbookmark_t *zb2)
+{
+	uint64_t nextL0;
+
+	ASSERT(zb1->zb_objset == zb2->zb_objset);
+	ASSERT(zb1->zb_object != -1ULL);
+	ASSERT(zb2->zb_object != 0);
+	ASSERT(zb2->zb_level == 0);
+
+	/*
+	 * A bookmark in the deadlist is considered to be after
+	 * everything else.
+	 */
+	if (zb2->zb_object == -1ULL)
+		return (B_TRUE);
+
+	/* The objset_phys_t isn't before anything. */
+	if (dnp == NULL)
+		return (B_FALSE);
+
+	nextL0 = (zb1->zb_blkid + 1) <<
+	    ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+
+	if (zb1->zb_object == 0) {
+		uint64_t nextobj = nextL0 *
+		    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
+		return (nextobj <= zb2->zb_object);
+	}
+
+	if (zb1->zb_object < zb2->zb_object)
+		return (B_TRUE);
+	if (zb1->zb_object > zb2->zb_object)
+		return (B_FALSE);
+
+	return (nextL0 <= zb2->zb_blkid);
+}
+
+static boolean_t
+scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
+{
+	int elapsed_ticks;
+
+	if (dp->dp_scrub_pausing)
+		return (B_TRUE); /* we're already pausing */
+
+	if (!bookmark_is_zero(&dp->dp_scrub_bookmark))
+		return (B_FALSE); /* we're resuming */
+
+	/* we don't yet know how to resume from anything but leaf blocks */
+	if (zb->zb_object == 0 || zb->zb_level != 0)
+		return (B_FALSE);
+
+	elapsed_ticks = lbolt64 - dp->dp_scrub_start_time;
+	if (elapsed_ticks > hz * zfs_txg_timeout ||
+	    (elapsed_ticks > hz * zfs_scrub_min_time && txg_sync_waiting(dp))) {
+		dprintf("pausing at %llx/%llx/%llx/%llx\n",
+		    (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object,
+		    (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid);
+		dp->dp_scrub_pausing = B_TRUE;
+		dp->dp_scrub_bookmark = *zb;
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+/* ARGSUSED */
+static void
+traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	dsl_pool_t *dp = arg;
+
+	if (bp->blk_birth <= dp->dp_scrub_min_txg)
+		return;
+
+	if (claim_txg != 0 || bp->blk_birth < spa_first_txg(dp->dp_spa)) {
+		zbookmark_t zb = { 0 };
+		/* XXX figure out zb.objset */
+		zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+		VERIFY(0 ==
+		    scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
+	}
+}
+
+/* ARGSUSED */
+static void
+traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+	dsl_pool_t *dp = arg;
+
+	if (lrc->lrc_txtype == TX_WRITE) {
+		lr_write_t *lr = (lr_write_t *)lrc;
+		blkptr_t *bp = &lr->lr_blkptr;
+
+		if (bp->blk_birth <= dp->dp_scrub_min_txg)
+			return;
+
+		if (claim_txg != 0 && bp->blk_birth >= claim_txg) {
+			zbookmark_t zb = { 0 };
+			/* XXX figure out zb.objset */
+			zb.zb_object = lr->lr_foid;
+			zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+			VERIFY(0 ==
+			    scrub_funcs[dp->dp_scrub_func](dp, bp, &zb));
+		}
+	}
+}
+
+static void
+traverse_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+	uint64_t claim_txg = zh->zh_claim_txg;
+	zilog_t *zilog;
+
+	/*
+	 * We only want to visit blocks that have been claimed but not yet
+	 * replayed (or, in read-only mode, blocks that *would* be claimed).
+	 */
+	if (claim_txg == 0 && (spa_mode & FWRITE))
+		return;
+
+	zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, dp,
+	    claim_txg);
+
+	zil_free(zilog);
+}
+
+static void
+scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp,
+    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
+{
+	int err;
+	arc_buf_t *buf = NULL;
+
+	if (bp->blk_birth == 0)
+		return;
+
+	dprintf_bp(bp, "scrub_visitbp bm %lld/%lld/%lld/%lld: ",
+	    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid);
+
+	if (bp->blk_birth <= dp->dp_scrub_min_txg)
+		return;
+
+	if (scrub_pause(dp, zb))
+		return;
+
+	if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
+		/*
+		 * If we already visited this bp & everything below (in
+		 * a prior txg), don't bother doing it again.
+		 */
+		if (bookmark_is_before(dnp, zb, &dp->dp_scrub_bookmark))
+			return;
+
+		/*
+		 * If we found the block we're trying to resume from, or
+		 * we went past it to a different object, zero it out to
+		 * indicate that it's OK to start checking for pausing
+		 * again.
+		 */
+		if (bcmp(zb, &dp->dp_scrub_bookmark, sizeof (*zb)) == 0 ||
+		    zb->zb_object > dp->dp_scrub_bookmark.zb_object) {
+			dprintf("resuming at %llx/%llx/%llx/%llx\n",
+			    (longlong_t)zb->zb_objset,
+			    (longlong_t)zb->zb_object,
+			    (longlong_t)zb->zb_level,
+			    (longlong_t)zb->zb_blkid);
+			bzero(&dp->dp_scrub_bookmark, sizeof (*zb));
+		}
+	}
+
+	if (BP_GET_LEVEL(bp) > 0) {
+		uint32_t flags = ARC_WAIT;
+		int i;
+		blkptr_t *cbp;
+		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+
+		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
+		    arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err) {
+			mutex_enter(&dp->dp_spa->spa_scrub_lock);
+			dp->dp_spa->spa_scrub_errors++;
+			mutex_exit(&dp->dp_spa->spa_scrub_lock);
+			return;
+		}
+		cbp = buf->b_data;
+
+		for (i = 0; i < epb; i++, cbp++) {
+			zbookmark_t czb;
+
+			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1,
+			    zb->zb_blkid * epb + i);
+			scrub_visitbp(dp, dnp, buf, cbp, &czb);
+		}
+	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+		uint32_t flags = ARC_WAIT;
+		dnode_phys_t *child_dnp;
+		int i, j;
+		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+		err = arc_read(NULL, dp->dp_spa, bp, pbuf,
+		    arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err) {
+			mutex_enter(&dp->dp_spa->spa_scrub_lock);
+			dp->dp_spa->spa_scrub_errors++;
+			mutex_exit(&dp->dp_spa->spa_scrub_lock);
+			return;
+		}
+		child_dnp = buf->b_data;
+
+		for (i = 0; i < epb; i++, child_dnp++) {
+			for (j = 0; j < child_dnp->dn_nblkptr; j++) {
+				zbookmark_t czb;
+
+				SET_BOOKMARK(&czb, zb->zb_objset,
+				    zb->zb_blkid * epb + i,
+				    child_dnp->dn_nlevels - 1, j);
+				scrub_visitbp(dp, child_dnp, buf,
+				    &child_dnp->dn_blkptr[j], &czb);
+			}
+		}
+	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+		uint32_t flags = ARC_WAIT;
+		objset_phys_t *osp;
+		int j;
+
+		err = arc_read_nolock(NULL, dp->dp_spa, bp,
+		    arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err) {
+			mutex_enter(&dp->dp_spa->spa_scrub_lock);
+			dp->dp_spa->spa_scrub_errors++;
+			mutex_exit(&dp->dp_spa->spa_scrub_lock);
+			return;
+		}
+
+		osp = buf->b_data;
+
+		traverse_zil(dp, &osp->os_zil_header);
+
+		for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
+			zbookmark_t czb;
+
+			SET_BOOKMARK(&czb, zb->zb_objset, 0,
+			    osp->os_meta_dnode.dn_nlevels - 1, j);
+			scrub_visitbp(dp, &osp->os_meta_dnode, buf,
+			    &osp->os_meta_dnode.dn_blkptr[j], &czb);
+		}
+	}
+
+	(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
+	if (buf)
+		(void) arc_buf_remove_ref(buf, &buf);
+}
+
+static void
+scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp)
+{
+	zbookmark_t zb;
+
+	SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0);
+	scrub_visitbp(dp, NULL, NULL, bp, &zb);
+}
+
+void
+dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
+		return;
+
+	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
+		SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0);
+	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+	    ds->ds_object, tx) != 0) {
+		return;
+	}
+
+	if (ds->ds_phys->ds_next_snap_obj != 0) {
+		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
+	}
+	ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+}
+
+void
+dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
+		return;
+
+	ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+
+	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
+		dp->dp_scrub_bookmark.zb_objset =
+		    ds->ds_phys->ds_prev_snap_obj;
+	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+	    ds->ds_object, tx) == 0) {
+		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+		    ds->ds_phys->ds_prev_snap_obj, tx) == 0);
+	}
+}
+
+struct enqueue_clones_arg {
+	dmu_tx_t *tx;
+	uint64_t originobj;
+};
+
+/* ARGSUSED */
+static int
+enqueue_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+	struct enqueue_clones_arg *eca = arg;
+	dsl_dataset_t *ds;
+	int err;
+	dsl_pool_t *dp;
+
+	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
+	if (err)
+		return (err);
+	dp = ds->ds_dir->dd_pool;
+
+	if (ds->ds_dir->dd_phys->dd_origin_obj == eca->originobj) {
+		while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+			dsl_dataset_t *prev;
+			err = dsl_dataset_hold_obj(dp,
+			    ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+
+			dsl_dataset_rele(ds, FTAG);
+			if (err)
+				return (err);
+			ds = prev;
+		}
+		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+		    ds->ds_object, eca->tx) == 0);
+	}
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+static void
+scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds;
+	uint64_t min_txg_save;
+
+	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+	/*
+	 * Iterate over the bps in this ds.
+	 */
+	min_txg_save = dp->dp_scrub_min_txg;
+	dp->dp_scrub_min_txg =
+	    MAX(dp->dp_scrub_min_txg, ds->ds_phys->ds_prev_snap_txg);
+	scrub_visit_rootbp(dp, ds, &ds->ds_phys->ds_bp);
+	dp->dp_scrub_min_txg = min_txg_save;
+
+	if (dp->dp_scrub_pausing)
+		goto out;
+
+	/*
+	 * Add descendent datasets to work queue.
+	 */
+	if (ds->ds_phys->ds_next_snap_obj != 0) {
+		VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+		    ds->ds_phys->ds_next_snap_obj, tx) == 0);
+	}
+	if (ds->ds_phys->ds_num_children > 1) {
+		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+			struct enqueue_clones_arg eca;
+			eca.tx = tx;
+			eca.originobj = ds->ds_object;
+
+			(void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa,
+			    NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN);
+		} else {
+			VERIFY(zap_join(dp->dp_meta_objset,
+			    ds->ds_phys->ds_next_clones_obj,
+			    dp->dp_scrub_queue_obj, tx) == 0);
+		}
+	}
+
+out:
+	dsl_dataset_rele(ds, FTAG);
+}
+
+/* ARGSUSED */
+static int
+enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+	dmu_tx_t *tx = arg;
+	dsl_dataset_t *ds;
+	int err;
+	dsl_pool_t *dp;
+
+	err = dsl_dataset_hold_obj(spa->spa_dsl_pool, dsobj, FTAG, &ds);
+	if (err)
+		return (err);
+
+	dp = ds->ds_dir->dd_pool;
+
+	while (ds->ds_phys->ds_prev_snap_obj != 0) {
+		dsl_dataset_t *prev;
+		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+		    FTAG, &prev);
+		if (err) {
+			dsl_dataset_rele(ds, FTAG);
+			return (err);
+		}
+
+		/*
+		 * If this is a clone, we don't need to worry about it for now.
+		 */
+		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+			dsl_dataset_rele(ds, FTAG);
+			dsl_dataset_rele(prev, FTAG);
+			return (0);
+		}
+		dsl_dataset_rele(ds, FTAG);
+		ds = prev;
+	}
+
+	VERIFY(zap_add_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+	    ds->ds_object, tx) == 0);
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+void
+dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	boolean_t complete = B_TRUE;
+
+	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
+		return;
+
+	/* If the spa is not fully loaded, don't bother. */
+	if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE)
+		return;
+
+	if (dp->dp_scrub_restart) {
+		enum scrub_func func = dp->dp_scrub_func;
+		dp->dp_scrub_restart = B_FALSE;
+		dsl_pool_scrub_setup_sync(dp, &func, kcred, tx);
+	}
+
+	if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) {
+		/*
+		 * We must have resumed after rebooting; reset the vdev
+		 * stats to know that we're doing a scrub (although it
+		 * will think we're just starting now).
+		 */
+		vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev,
+		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
+		    POOL_SCRUB_EVERYTHING, B_FALSE);
+	}
+
+	dp->dp_scrub_pausing = B_FALSE;
+	dp->dp_scrub_start_time = lbolt64;
+	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
+
+	if (dp->dp_scrub_bookmark.zb_objset == 0) {
+		/* First do the MOS & ORIGIN */
+		scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp);
+		if (dp->dp_scrub_pausing)
+			goto out;
+
+		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+			VERIFY(0 == dmu_objset_find_spa(dp->dp_spa,
+			    NULL, enqueue_cb, tx, DS_FIND_CHILDREN));
+		} else {
+			scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx);
+		}
+		ASSERT(!dp->dp_scrub_pausing);
+	} else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) {
+		/*
+		 * If we were paused, continue from here.  Note if the
+		 * ds we were paused on was deleted, the zb_objset will
+		 * be -1, so we will skip this and find a new objset
+		 * below.
+		 */
+		scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx);
+		if (dp->dp_scrub_pausing)
+			goto out;
+	}
+
+	/*
+	 * In case we were paused right at the end of the ds, zero the
+	 * bookmark so we don't think that we're still trying to resume.
+	 */
+	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+
+	/* keep pulling things out of the zap-object-as-queue */
+	while (zap_cursor_init(&zc, dp->dp_meta_objset, dp->dp_scrub_queue_obj),
+	    zap_cursor_retrieve(&zc, &za) == 0) {
+		VERIFY(0 == zap_remove(dp->dp_meta_objset,
+		    dp->dp_scrub_queue_obj, za.za_name, tx));
+		scrub_visitds(dp, za.za_first_integer, tx);
+		if (dp->dp_scrub_pausing)
+			break;
+		zap_cursor_fini(&zc);
+	}
+	zap_cursor_fini(&zc);
+	if (dp->dp_scrub_pausing)
+		goto out;
+
+	/* done. */
+
+	dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
+	return;
+out:
+	VERIFY(0 == zap_update(dp->dp_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+	    &dp->dp_scrub_bookmark, tx));
+	VERIFY(0 == zap_update(dp->dp_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
+	    &dp->dp_spa->spa_scrub_errors, tx));
+
+	/* XXX this is scrub-clean specific */
+	mutex_enter(&dp->dp_spa->spa_scrub_lock);
+	while (dp->dp_spa->spa_scrub_inflight > 0) {
+		cv_wait(&dp->dp_spa->spa_scrub_io_cv,
+		    &dp->dp_spa->spa_scrub_lock);
+	}
+	mutex_exit(&dp->dp_spa->spa_scrub_lock);
+}
+
+void
+dsl_pool_scrub_restart(dsl_pool_t *dp)
+{
+	mutex_enter(&dp->dp_scrub_cancel_lock);
+	dp->dp_scrub_restart = B_TRUE;
+	mutex_exit(&dp->dp_scrub_cancel_lock);
+}
+
+/*
+ * scrub consumers
+ */
+
+static void
+dsl_pool_scrub_clean_done(zio_t *zio)
+{
+	spa_t *spa = zio->io_spa;
+
+	zio_data_buf_free(zio->io_data, zio->io_size);
+
+	mutex_enter(&spa->spa_scrub_lock);
+	spa->spa_scrub_inflight--;
+	cv_broadcast(&spa->spa_scrub_io_cv);
+
+	if (zio->io_error)
+		spa->spa_scrub_errors++;
+	mutex_exit(&spa->spa_scrub_lock);
+}
+
+static int
+dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
+    const blkptr_t *bp, const zbookmark_t *zb)
+{
+	size_t size = BP_GET_LSIZE(bp);
+	int d;
+	spa_t *spa = dp->dp_spa;
+	boolean_t needs_io;
+	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
+	int zio_priority;
+
+	dprintf_bp(bp, "visiting %s", "");
+
+	if (dp->dp_scrub_isresilver == 0) {
+		/* It's a scrub */
+		zio_flags |= ZIO_FLAG_SCRUB;
+		zio_priority = ZIO_PRIORITY_SCRUB;
+		needs_io = B_TRUE;
+	} else {
+		/* It's a resilver */
+		zio_flags |= ZIO_FLAG_RESILVER;
+		zio_priority = ZIO_PRIORITY_RESILVER;
+		needs_io = B_FALSE;
+	}
+
+	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
+		vdev_t *vd = vdev_lookup_top(spa,
+		    DVA_GET_VDEV(&bp->blk_dva[d]));
+
+		/*
+		 * Keep track of how much data we've examined so that
+		 * zpool(1M) status can make useful progress reports.
+		 */
+		mutex_enter(&vd->vdev_stat_lock);
+		vd->vdev_stat.vs_scrub_examined +=
+		    DVA_GET_ASIZE(&bp->blk_dva[d]);
+		mutex_exit(&vd->vdev_stat_lock);
+
+		/* if it's a resilver, this may not be in the target range */
+		if (!needs_io) {
+			if (DVA_GET_GANG(&bp->blk_dva[d])) {
+				/*
+				 * Gang members may be spread across multiple
+				 * vdevs, so the best we can do is look at the
+				 * pool-wide DTL.
+				 * XXX -- it would be better to change our
+				 * allocation policy to ensure that this can't
+				 * happen.
+				 */
+				vd = spa->spa_root_vdev;
+			}
+			needs_io = vdev_dtl_contains(&vd->vdev_dtl_map,
+			    bp->blk_birth, 1);
+		}
+	}
+
+	if (needs_io && !zfs_no_scrub_io) {
+		void *data = zio_data_buf_alloc(size);
+
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight)
+			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+		spa->spa_scrub_inflight++;
+		mutex_exit(&spa->spa_scrub_lock);
+
+		zio_nowait(zio_read(NULL, spa, bp, data, size,
+		    dsl_pool_scrub_clean_done, NULL, zio_priority,
+		    zio_flags, zb));
+	}
+
+	/* do not relocate this block */
+	return (0);
+}
+
+int
+dsl_pool_scrub_clean(dsl_pool_t *dp)
+{
+	/*
+	 * Purge all vdev caches.  We do this here rather than in sync
+	 * context because this requires a writer lock on the spa_config
+	 * lock, which we can't do from sync context.  The
+	 * spa_scrub_reopen flag indicates that vdev_open() should not
+	 * attempt to start another scrub.
+	 */
+	spa_config_enter(dp->dp_spa, RW_WRITER, FTAG);
+	dp->dp_spa->spa_scrub_reopen = B_TRUE;
+	vdev_reopen(dp->dp_spa->spa_root_vdev);
+	dp->dp_spa->spa_scrub_reopen = B_FALSE;
+	spa_config_exit(dp->dp_spa, FTAG);
+
+	return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN));
+}
--- a/usr/src/uts/common/fs/zfs/spa.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c	Mon Jul 07 13:39:21 2008 -0700
@@ -1365,6 +1365,7 @@
 
 	error = 0;
 out:
+	spa->spa_minref = refcount_count(&spa->spa_refcount);
 	if (error && error != EBADF)
 		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
 	spa->spa_load_state = SPA_LOAD_NONE;
@@ -1390,7 +1391,6 @@
 {
 	spa_t *spa;
 	int error;
-	int loaded = B_FALSE;
 	int locked = B_FALSE;
 
 	*spapp = NULL;
@@ -1456,18 +1456,10 @@
 		} else {
 			spa->spa_last_open_failed = B_FALSE;
 		}
-
-		loaded = B_TRUE;
 	}
 
 	spa_open_ref(spa, tag);
 
-	/*
-	 * If we just loaded the pool, resilver anything that's out of date.
-	 */
-	if (loaded && (spa_mode & FWRITE))
-		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-
 	if (locked)
 		mutex_exit(&spa_namespace_lock);
 
@@ -2023,6 +2015,8 @@
 
 	mutex_exit(&spa_namespace_lock);
 
+	spa->spa_minref = refcount_count(&spa->spa_refcount);
+
 	return (0);
 }
 
@@ -2040,7 +2034,6 @@
 	nvlist_t *nvroot;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
-	int mosconfig = isroot? B_FALSE : B_TRUE;
 
 	/*
 	 * If a pool with this name exists, return failure.
@@ -2065,10 +2058,11 @@
 
 	/*
 	 * Pass off the heavy lifting to spa_load().
-	 * Pass TRUE for mosconfig because the user-supplied config
-	 * is actually the one to trust when doing an import.
+	 * Pass TRUE for mosconfig (unless this is a root pool) because
+	 * the user-supplied config is actually the one to trust when
+	 * doing an import.
 	 */
-	loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, mosconfig);
+	loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot);
 
 	spa_config_enter(spa, RW_WRITER, FTAG);
 	/*
@@ -2162,13 +2156,6 @@
 		 * Update the config cache to include the newly-imported pool.
 		 */
 		spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot);
-
-		/*
-		 * Resilver anything that's out of date.
-		 */
-		if (!isroot)
-			VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER,
-			    B_TRUE) == 0);
 	}
 
 	spa->spa_import_faulted = B_FALSE;
@@ -2471,7 +2458,6 @@
 		 * Objsets may be open only because they're dirty, so we
 		 * have to force it to sync before checking spa_refcnt.
 		 */
-		spa_scrub_suspend(spa);
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 
 		/*
@@ -2482,15 +2468,11 @@
 		if (!spa_refcount_zero(spa) ||
 		    (spa->spa_inject_ref != 0 &&
 		    new_state != POOL_STATE_UNINITIALIZED)) {
-			spa_scrub_resume(spa);
 			spa_async_resume(spa);
 			mutex_exit(&spa_namespace_lock);
 			return (EBUSY);
 		}
 
-		spa_scrub_resume(spa);
-		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
-
 		/*
 		 * We want this to be reflected on every label,
 		 * so mark them all dirty.  spa_unload() will do the
@@ -2552,7 +2534,6 @@
 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
 }
 
-
 /*
  * ==========================================================================
  * Device manipulation
@@ -2837,12 +2818,9 @@
 	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
 
 	/*
-	 * Kick off a resilver to update newvd.  We need to grab the namespace
-	 * lock because spa_scrub() needs to post a sysevent with the pool name.
+	 * Kick off a resilver to update newvd.
 	 */
-	mutex_enter(&spa_namespace_lock);
-	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-	mutex_exit(&spa_namespace_lock);
+	VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
 
 	return (0);
 }
@@ -3368,404 +3346,36 @@
  * ==========================================================================
  */
 
-static void
-spa_scrub_io_done(zio_t *zio)
-{
-	spa_t *spa = zio->io_spa;
-
-	arc_data_buf_free(zio->io_data, zio->io_size);
-
-	mutex_enter(&spa->spa_scrub_lock);
-	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
-		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
-		spa->spa_scrub_errors++;
-		mutex_enter(&vd->vdev_stat_lock);
-		vd->vdev_stat.vs_scrub_errors++;
-		mutex_exit(&vd->vdev_stat_lock);
-	}
-
-	if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
-		cv_broadcast(&spa->spa_scrub_io_cv);
-
-	ASSERT(spa->spa_scrub_inflight >= 0);
-
-	mutex_exit(&spa->spa_scrub_lock);
-}
-
-static void
-spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
-    zbookmark_t *zb)
-{
-	size_t size = BP_GET_LSIZE(bp);
-	void *data;
-
-	mutex_enter(&spa->spa_scrub_lock);
-	/*
-	 * Do not give too much work to vdev(s).
-	 */
-	while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
-		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-	}
-	spa->spa_scrub_inflight++;
-	mutex_exit(&spa->spa_scrub_lock);
-
-	data = arc_data_buf_alloc(size);
-
-	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
-		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */
-
-	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
-
-	zio_nowait(zio_read(NULL, spa, bp, data, size,
-	    spa_scrub_io_done, NULL, priority, flags, zb));
-}
-
-/* ARGSUSED */
-static int
-spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
-{
-	blkptr_t *bp = &bc->bc_blkptr;
-	vdev_t *vd = spa->spa_root_vdev;
-	dva_t *dva = bp->blk_dva;
-	int needs_resilver = B_FALSE;
-	int d;
-
-	if (bc->bc_errno) {
-		/*
-		 * We can't scrub this block, but we can continue to scrub
-		 * the rest of the pool.  Note the error and move along.
-		 */
-		mutex_enter(&spa->spa_scrub_lock);
-		spa->spa_scrub_errors++;
-		mutex_exit(&spa->spa_scrub_lock);
-
-		mutex_enter(&vd->vdev_stat_lock);
-		vd->vdev_stat.vs_scrub_errors++;
-		mutex_exit(&vd->vdev_stat_lock);
-
-		return (ERESTART);
-	}
-
-	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
-
-	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
-		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
-
-		ASSERT(vd != NULL);
-
-		/*
-		 * Keep track of how much data we've examined so that
-		 * zpool(1M) status can make useful progress reports.
-		 */
-		mutex_enter(&vd->vdev_stat_lock);
-		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
-		mutex_exit(&vd->vdev_stat_lock);
-
-		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
-			if (DVA_GET_GANG(&dva[d])) {
-				/*
-				 * Gang members may be spread across multiple
-				 * vdevs, so the best we can do is look at the
-				 * pool-wide DTL.
-				 * XXX -- it would be better to change our
-				 * allocation policy to ensure that this can't
-				 * happen.
-				 */
-				vd = spa->spa_root_vdev;
-			}
-			if (vdev_dtl_contains(&vd->vdev_dtl_map,
-			    bp->blk_birth, 1))
-				needs_resilver = B_TRUE;
-		}
-	}
-
-	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
-		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
-		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
-	else if (needs_resilver)
-		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
-		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
-
-	return (0);
-}
-
-static void
-spa_scrub_thread(spa_t *spa)
+int
+spa_scrub(spa_t *spa, pool_scrub_type_t type)
 {
-	callb_cpr_t cprinfo;
-	traverse_handle_t *th = spa->spa_scrub_th;
-	vdev_t *rvd = spa->spa_root_vdev;
-	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
-	int error = 0;
-	boolean_t complete;
-
-	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
-
-	/*
-	 * If we're restarting due to a snapshot create/delete,
-	 * wait for that to complete.
-	 */
-	txg_wait_synced(spa_get_dsl(spa), 0);
-
-	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
-	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
-	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
-
-	spa_config_enter(spa, RW_WRITER, FTAG);
-	vdev_reopen(rvd);		/* purge all vdev caches */
-	vdev_config_dirty(rvd);		/* rewrite all disk labels */
-	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
-	spa_config_exit(spa, FTAG);
-
-	mutex_enter(&spa->spa_scrub_lock);
-	spa->spa_scrub_errors = 0;
-	spa->spa_scrub_active = 1;
-	ASSERT(spa->spa_scrub_inflight == 0);
-
-	while (!spa->spa_scrub_stop) {
-		CALLB_CPR_SAFE_BEGIN(&cprinfo);
-		while (spa->spa_scrub_suspended) {
-			spa->spa_scrub_active = 0;
-			cv_broadcast(&spa->spa_scrub_cv);
-			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
-			spa->spa_scrub_active = 1;
-		}
-		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
-
-		if (spa->spa_scrub_restart_txg != 0)
-			break;
-
-		mutex_exit(&spa->spa_scrub_lock);
-		error = traverse_more(th);
-		mutex_enter(&spa->spa_scrub_lock);
-		if (error != EAGAIN)
-			break;
-	}
-
-	while (spa->spa_scrub_inflight)
-		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-
-	spa->spa_scrub_active = 0;
-	cv_broadcast(&spa->spa_scrub_cv);
-
-	mutex_exit(&spa->spa_scrub_lock);
-
-	spa_config_enter(spa, RW_WRITER, FTAG);
-
-	mutex_enter(&spa->spa_scrub_lock);
-
-	/*
-	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
-	 * AND the spa config lock to synchronize with any config changes
-	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
-	 */
-	if (spa->spa_scrub_restart_txg != 0)
-		error = ERESTART;
-
-	if (spa->spa_scrub_stop)
-		error = EINTR;
-
-	/*
-	 * Even if there were uncorrectable errors, we consider the scrub
-	 * completed.  The downside is that if there is a transient error during
-	 * a resilver, we won't resilver the data properly to the target.  But
-	 * if the damage is permanent (more likely) we will resilver forever,
-	 * which isn't really acceptable.  Since there is enough information for
-	 * the user to know what has failed and why, this seems like a more
-	 * tractable approach.
-	 */
-	complete = (error == 0);
-
-	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
-	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
-	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
-	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);
-
-	mutex_exit(&spa->spa_scrub_lock);
-
-	/*
-	 * If the scrub/resilver completed, update all DTLs to reflect this.
-	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
-	 */
-	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
-	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
-	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
-	spa_errlog_rotate(spa);
-
-	if (scrub_type == POOL_SCRUB_RESILVER && complete)
-		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH);
-
-	spa_config_exit(spa, FTAG);
-
-	mutex_enter(&spa->spa_scrub_lock);
-
-	/*
-	 * We may have finished replacing a device.
-	 * Let the async thread assess this and handle the detach.
-	 */
-	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
-
-	/*
-	 * If we were told to restart, our final act is to start a new scrub.
-	 */
-	if (error == ERESTART)
-		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
-		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
-
-	spa->spa_scrub_type = POOL_SCRUB_NONE;
-	spa->spa_scrub_active = 0;
-	spa->spa_scrub_thread = NULL;
-	cv_broadcast(&spa->spa_scrub_cv);
-	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
-	thread_exit();
-}
-
-void
-spa_scrub_suspend(spa_t *spa)
-{
-	mutex_enter(&spa->spa_scrub_lock);
-	spa->spa_scrub_suspended++;
-	while (spa->spa_scrub_active) {
-		cv_broadcast(&spa->spa_scrub_cv);
-		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
-	}
-	while (spa->spa_scrub_inflight)
-		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-	mutex_exit(&spa->spa_scrub_lock);
-}
-
-void
-spa_scrub_resume(spa_t *spa)
-{
-	mutex_enter(&spa->spa_scrub_lock);
-	ASSERT(spa->spa_scrub_suspended != 0);
-	if (--spa->spa_scrub_suspended == 0)
-		cv_broadcast(&spa->spa_scrub_cv);
-	mutex_exit(&spa->spa_scrub_lock);
-}
-
-void
-spa_scrub_restart(spa_t *spa, uint64_t txg)
-{
-	/*
-	 * Something happened (e.g. snapshot create/delete) that means
-	 * we must restart any in-progress scrubs.  The itinerary will
-	 * fix this properly.
-	 */
-	mutex_enter(&spa->spa_scrub_lock);
-	spa->spa_scrub_restart_txg = txg;
-	mutex_exit(&spa->spa_scrub_lock);
-}
-
-int
-spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
-{
-	space_seg_t *ss;
-	uint64_t mintxg, maxtxg;
-	vdev_t *rvd = spa->spa_root_vdev;
-
-	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(!spa_config_held(spa, RW_WRITER));
 
 	if ((uint_t)type >= POOL_SCRUB_TYPES)
 		return (ENOTSUP);
 
-	mutex_enter(&spa->spa_scrub_lock);
-
 	/*
-	 * If there's a scrub or resilver already in progress, stop it.
+	 * If a resilver was requested, but there is no DTL on a
+	 * writeable leaf device, we have nothing to do.
 	 */
-	while (spa->spa_scrub_thread != NULL) {
-		/*
-		 * Don't stop a resilver unless forced.
-		 */
-		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
-			mutex_exit(&spa->spa_scrub_lock);
-			return (EBUSY);
-		}
-		spa->spa_scrub_stop = 1;
-		cv_broadcast(&spa->spa_scrub_cv);
-		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
-	}
-
-	/*
-	 * Terminate the previous traverse.
-	 */
-	if (spa->spa_scrub_th != NULL) {
-		traverse_fini(spa->spa_scrub_th);
-		spa->spa_scrub_th = NULL;
-	}
-
-	if (rvd == NULL) {
-		ASSERT(spa->spa_scrub_stop == 0);
-		ASSERT(spa->spa_scrub_type == type);
-		ASSERT(spa->spa_scrub_restart_txg == 0);
-		mutex_exit(&spa->spa_scrub_lock);
+	if (type == POOL_SCRUB_RESILVER &&
+	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
+		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 		return (0);
 	}
 
-	mintxg = TXG_INITIAL - 1;
-	maxtxg = spa_last_synced_txg(spa) + 1;
-
-	mutex_enter(&rvd->vdev_dtl_lock);
-
-	if (rvd->vdev_dtl_map.sm_space == 0) {
-		/*
-		 * The pool-wide DTL is empty.
-		 * If this is a resilver, there's nothing to do except
-		 * check whether any in-progress replacements have completed.
-		 */
-		if (type == POOL_SCRUB_RESILVER) {
-			type = POOL_SCRUB_NONE;
-			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
-		}
+	if (type == POOL_SCRUB_EVERYTHING &&
+	    spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE &&
+	    spa->spa_dsl_pool->dp_scrub_isresilver)
+		return (EBUSY);
+
+	if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) {
+		return (dsl_pool_scrub_clean(spa->spa_dsl_pool));
+	} else if (type == POOL_SCRUB_NONE) {
+		return (dsl_pool_scrub_cancel(spa->spa_dsl_pool));
 	} else {
-		/*
-		 * The pool-wide DTL is non-empty.
-		 * If this is a normal scrub, upgrade to a resilver instead.
-		 */
-		if (type == POOL_SCRUB_EVERYTHING)
-			type = POOL_SCRUB_RESILVER;
+		return (EINVAL);
 	}
-
-	if (type == POOL_SCRUB_RESILVER) {
-		/*
-		 * Determine the resilvering boundaries.
-		 *
-		 * Note: (mintxg, maxtxg) is an open interval,
-		 * i.e. mintxg and maxtxg themselves are not included.
-		 *
-		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
-		 * so we don't claim to resilver a txg that's still changing.
-		 */
-		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
-		mintxg = ss->ss_start - 1;
-		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
-		maxtxg = MIN(ss->ss_end, maxtxg);
-
-		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
-	}
-
-	mutex_exit(&rvd->vdev_dtl_lock);
-
-	spa->spa_scrub_stop = 0;
-	spa->spa_scrub_type = type;
-	spa->spa_scrub_restart_txg = 0;
-
-	if (type != POOL_SCRUB_NONE) {
-		spa->spa_scrub_mintxg = mintxg;
-		spa->spa_scrub_maxtxg = maxtxg;
-		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
-		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
-		    ZIO_FLAG_CANFAIL);
-		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
-		spa->spa_scrub_thread = thread_create(NULL, 0,
-		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
-	}
-
-	mutex_exit(&spa->spa_scrub_lock);
-
-	return (0);
 }
 
 /*
@@ -3837,25 +3447,10 @@
 		spa_vdev_resilver_done(spa);
 
 	/*
-	 * Kick off a scrub.  When starting a RESILVER scrub (or an EVERYTHING
-	 * scrub which can become a resilver), we need to hold
-	 * spa_namespace_lock() because the sysevent we post via
-	 * spa_event_notify() needs to get the name of the pool.
-	 */
-	if (tasks & SPA_ASYNC_SCRUB) {
-		mutex_enter(&spa_namespace_lock);
-		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
-		mutex_exit(&spa_namespace_lock);
-	}
-
-	/*
 	 * Kick off a resilver.
 	 */
-	if (tasks & SPA_ASYNC_RESILVER) {
-		mutex_enter(&spa_namespace_lock);
-		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
-		mutex_exit(&spa_namespace_lock);
-	}
+	if (tasks & SPA_ASYNC_RESILVER)
+		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);
 
 	/*
 	 * Let the world know that we're done.
@@ -4212,6 +3807,19 @@
 		}
 	}
 
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
+		dsl_pool_create_origin(dp, tx);
+
+		/* Keeping the origin open increases spa_minref */
+		spa->spa_minref += 3;
+	}
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
+		dsl_pool_upgrade_clones(dp, tx);
+	}
+
 	/*
 	 * If anything has changed in this txg, push the deferred frees
 	 * from the previous txg.  If not, leave them alone so that we
@@ -4296,21 +3904,12 @@
 		spa->spa_config_syncing = NULL;
 	}
 
-	/*
-	 * Make a stable copy of the fully synced uberblock.
-	 * We use this as the root for pool traversals.
-	 */
-	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */
-
-	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */
-
+	spa->spa_traverse_wanted = B_TRUE;
 	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
-	spa->spa_traverse_wanted = 0;
+	spa->spa_traverse_wanted = B_FALSE;
 	spa->spa_ubsync = spa->spa_uberblock;
 	rw_exit(&spa->spa_traverse_lock);
 
-	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
-
 	/*
 	 * Clean up the ZIL records for the synced txg.
 	 */
@@ -4390,7 +3989,6 @@
 		mutex_exit(&spa_namespace_lock);
 		spa_async_suspend(spa);
 		mutex_enter(&spa_namespace_lock);
-		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
 		spa_close(spa, FTAG);
 
 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
--- a/usr/src/uts/common/fs/zfs/spa_errlog.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/spa_errlog.c	Mon Jul 07 13:39:21 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -298,10 +298,7 @@
 spa_errlog_rotate(spa_t *spa)
 {
 	mutex_enter(&spa->spa_errlist_lock);
-
-	ASSERT(!spa->spa_scrub_finished);
 	spa->spa_scrub_finished = B_TRUE;
-
 	mutex_exit(&spa->spa_errlist_lock);
 }
 
--- a/usr/src/uts/common/fs/zfs/spa_history.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/spa_history.c	Mon Jul 07 13:39:21 2008 -0700
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -399,6 +399,13 @@
 	char *str;
 	va_list adx;
 
+	/*
+	 * If this is part of creating a pool, not everything is
+	 * initialized yet, so don't bother logging the internal events.
+	 */
+	if (tx->tx_txg == TXG_INITIAL)
+		return;
+
 	hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP);
 	str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
 
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Mon Jul 07 13:39:21 2008 -0700
@@ -74,7 +74,7 @@
  *	This reference count keep track of any active users of the spa_t.  The
  *	spa_t cannot be destroyed or freed while this is non-zero.  Internally,
  *	the refcount is never really 'zero' - opening a pool implicitly keeps
- *	some references in the DMU.  Internally we check against SPA_MINREF, but
+ *	some references in the DMU.  Internally we check against spa_minref, but
  *	present the image of a zero/non-zero value to consumers.
  *
  * spa_config_lock (per-spa read-priority rwlock)
@@ -191,7 +191,6 @@
  */
 int zfs_recover = 0;
 
-#define	SPA_MINREF	5	/* spa_refcnt for an open-but-idle pool */
 
 /*
  * ==========================================================================
@@ -334,7 +333,6 @@
 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
-	cv_init(&spa->spa_scrub_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
 
 	spa->spa_name = spa_strdup(name);
@@ -382,7 +380,6 @@
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
-	ASSERT(spa->spa_scrub_thread == NULL);
 
 	avl_remove(&spa_namespace_avl, spa);
 	cv_broadcast(&spa_namespace_cv);
@@ -413,7 +410,6 @@
 	rw_destroy(&spa->spa_traverse_lock);
 
 	cv_destroy(&spa->spa_async_cv);
-	cv_destroy(&spa->spa_scrub_cv);
 	cv_destroy(&spa->spa_scrub_io_cv);
 
 	mutex_destroy(&spa->spa_uberblock_lock);
@@ -458,9 +454,8 @@
 void
 spa_open_ref(spa_t *spa, void *tag)
 {
-	ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
+	ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
 	    MUTEX_HELD(&spa_namespace_lock));
-
 	(void) refcount_add(&spa->spa_refcount, tag);
 }
 
@@ -471,15 +466,14 @@
 void
 spa_close(spa_t *spa, void *tag)
 {
-	ASSERT(refcount_count(&spa->spa_refcount) > SPA_MINREF ||
+	ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref ||
 	    MUTEX_HELD(&spa_namespace_lock));
-
 	(void) refcount_remove(&spa->spa_refcount, tag);
 }
 
 /*
  * Check to see if the spa refcount is zero.  Must be called with
- * spa_namespace_lock held.  We really compare against SPA_MINREF, which is the
+ * spa_namespace_lock held.  We really compare against spa_minref, which is the
  * number of references acquired when opening a pool.
  */
 boolean_t
@@ -487,7 +481,7 @@
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
-	return (refcount_count(&spa->spa_refcount) == SPA_MINREF);
+	return (refcount_count(&spa->spa_refcount) == spa->spa_minref);
 }
 
 /*
@@ -737,13 +731,6 @@
 {
 	mutex_enter(&spa_namespace_lock);
 
-	/*
-	 * Suspend scrub activity while we mess with the config.  We must do
-	 * this after acquiring the namespace lock to avoid a 3-way deadlock
-	 * with spa_scrub_stop() and the scrub thread.
-	 */
-	spa_scrub_suspend(spa);
-
 	spa_config_enter(spa, RW_WRITER, spa);
 
 	return (spa_last_synced_txg(spa) + 1);
@@ -771,18 +758,13 @@
 	 * If the config changed, notify the scrub thread that it must restart.
 	 */
 	if (error == 0 && !list_is_empty(&spa->spa_dirty_list)) {
+		dsl_pool_scrub_restart(spa->spa_dsl_pool);
 		config_changed = B_TRUE;
-		spa_scrub_restart(spa, txg);
 	}
 
 	spa_config_exit(spa, spa);
 
 	/*
-	 * Allow scrubbing to resume.
-	 */
-	spa_scrub_resume(spa);
-
-	/*
 	 * Note: this txg_wait_synced() is important because it ensures
 	 * that there won't be more than one config change per txg.
 	 * This allows us to use the txg as the generation number.
@@ -1017,7 +999,7 @@
 	return (&spa->spa_traverse_lock);
 }
 
-int
+boolean_t
 spa_traverse_wanted(spa_t *spa)
 {
 	return (spa->spa_traverse_wanted);
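
The spa_misc.c changes above replace the fixed SPA_MINREF constant with a per-pool spa_minref field, since the number of implicit references the DMU holds now depends on how the pool was opened.  A minimal sketch of how the baseline might be captured once the pool is up; the helper name is hypothetical and the real assignment lives in spa.c, which is not part of this section:

#include <sys/spa_impl.h>
#include <sys/refcount.h>

/*
 * Hypothetical helper: after a pool has been opened and the DMU has
 * taken its implicit references, remember the current refcount as the
 * pool's baseline.  spa_open_ref(), spa_close(), and
 * spa_refcount_zero() above compare against this value instead of the
 * old compile-time SPA_MINREF.
 */
static void
spa_record_minref(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	spa->spa_minref = refcount_count(&spa->spa_refcount);
}
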
--- a/usr/src/uts/common/fs/zfs/sys/arc.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h	Mon Jul 07 13:39:21 2008 -0700
@@ -86,13 +86,24 @@
 int arc_referenced(arc_buf_t *buf);
 #endif
 
-int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
+typedef struct writeprops {
+	dmu_object_type_t wp_type;
+	uint8_t wp_copies;
+	uint8_t wp_level;
+	uint8_t wp_dncompress, wp_oscompress;
+	uint8_t wp_dnchecksum, wp_oschecksum;
+} writeprops_t;
+
+int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf,
     arc_done_func_t *done, void *private, int priority, int flags,
-    uint32_t *arc_flags, zbookmark_t *zb);
-zio_t *arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
-    int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+    uint32_t *arc_flags, const zbookmark_t *zb);
+int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp,
+    arc_done_func_t *done, void *private, int priority, int flags,
+    uint32_t *arc_flags, const zbookmark_t *zb);
+zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp,
+    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
     arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
-    int flags, zbookmark_t *zb);
+    int flags, const zbookmark_t *zb);
 int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     zio_done_func_t *done, void *private, uint32_t arc_flags);
 int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
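
arc_write() now bundles the per-write policy into a single writeprops_t instead of taking separate checksum, compress, and copies arguments.  A hedged sketch of a caller, with illustrative property values and a made-up helper name (the real callers in dbuf.c and dmu_objset.c are not shown in this section):

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/arc.h>
#include <sys/zio.h>

/*
 * Sketch only: fill in write properties for a level-0 plain-file block
 * and start an ARC write.  The checksum/compress/copies choices here
 * are examples, not values taken from this changeset.
 */
static zio_t *
example_arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    arc_buf_t *buf, arc_done_func_t *ready, arc_done_func_t *done,
    void *private, const zbookmark_t *zb)
{
	writeprops_t wp = { 0 };

	wp.wp_type = DMU_OT_PLAIN_FILE_CONTENTS;
	wp.wp_level = 0;
	wp.wp_copies = 1;
	wp.wp_oschecksum = ZIO_CHECKSUM_FLETCHER_4;
	wp.wp_oscompress = ZIO_COMPRESS_OFF;

	return (arc_write(pio, spa, &wp, txg, bp, buf, ready, done, private,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, zb));
}
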
--- a/usr/src/uts/common/fs/zfs/sys/bplist.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/bplist.h	Mon Jul 07 13:39:21 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -75,8 +75,8 @@
 extern void bplist_close(bplist_t *bpl);
 extern boolean_t bplist_empty(bplist_t *bpl);
 extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
-extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp);
+extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx);
+extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp);
 extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
 extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
 extern int bplist_space(bplist_t *bpl,
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Mon Jul 07 13:39:21 2008 -0700
@@ -114,6 +114,8 @@
 	DMU_OT_SYSACL,			/* SYSACL */
 	DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
 	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
+	DMU_OT_NEXT_CLONES,		/* ZAP */
+	DMU_OT_SCRUB_QUEUE,		/* ZAP */
 	DMU_OT_NUMTYPES
 } dmu_object_type_t;
 
@@ -202,6 +204,19 @@
 #define	DMU_POOL_PROPS			"pool_props"
 #define	DMU_POOL_L2CACHE		"l2cache"
 
+/* 4x8 zbookmark_t */
+#define	DMU_POOL_SCRUB_BOOKMARK		"scrub_bookmark"
+/* 1x8 zap obj DMU_OT_SCRUB_QUEUE */
+#define	DMU_POOL_SCRUB_QUEUE		"scrub_queue"
+/* 1x8 txg */
+#define	DMU_POOL_SCRUB_MIN_TXG		"scrub_min_txg"
+/* 1x8 txg */
+#define	DMU_POOL_SCRUB_MAX_TXG		"scrub_max_txg"
+/* 1x4 enum scrub_func */
+#define	DMU_POOL_SCRUB_FUNC		"scrub_func"
+/* 1x8 count */
+#define	DMU_POOL_SCRUB_ERRORS		"scrub_errors"
+
 /*
  * Allocate an object from this objset.  The range of object numbers
  * available is (0, DN_MAX_OBJECT).  Object 0 is the meta-dnode.
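
The new DMU_POOL_SCRUB_* names persist the scrub's parameters and current position in the pool directory ZAP, which is what lets a scrub or resilver pick up where it left off instead of restarting.  A hedged sketch of how the bookmark might be written out each sync; the helper name is invented and the real logic lives in the new dsl_scrub.c, which is not shown in this section:

#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/dsl_pool.h>

/*
 * Sketch: save the in-core scrub bookmark into the MOS pool directory.
 * zbookmark_t is four uint64_t's, matching the "4x8" annotation on
 * DMU_POOL_SCRUB_BOOKMARK above.
 */
static void
example_save_scrub_bookmark(dsl_pool_t *dp, dmu_tx_t *tx)
{
	VERIFY(zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
	    &dp->dp_scrub_bookmark, tx) == 0);
}
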
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Mon Jul 07 13:39:21 2008 -0700
@@ -69,12 +69,11 @@
 	uint8_t os_checksum;	/* can change, under dsl_dir's locks */
 	uint8_t os_compress;	/* can change, under dsl_dir's locks */
 	uint8_t os_copies;	/* can change, under dsl_dir's locks */
-	uint8_t os_md_checksum;
-	uint8_t os_md_compress;
 
 	/* no lock needed: */
 	struct dmu_tx *os_synctx; /* XXX sketchy */
 	blkptr_t *os_rootbp;
+	zil_header_t os_zil_header;
 
 	/* Protected by os_obj_lock */
 	kmutex_t os_obj_lock;
@@ -111,6 +110,8 @@
 uint64_t dmu_objset_fsid_guid(objset_t *os);
 int dmu_objset_find(char *name, int func(char *, void *), void *arg,
     int flags);
+int dmu_objset_find_spa(spa_t *spa, const char *name,
+    int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags);
 void dmu_objset_byteswap(void *buf, size_t size);
 int dmu_objset_evict_dbufs(objset_t *os);
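
dmu_objset_find_spa() is the spa-and-object-number flavored counterpart of dmu_objset_find(): the callback is handed the spa, the dataset's object number, and (when available) its name.  A minimal hedged sketch of such a callback; only the declaration above comes from this changeset, and the assumption that a nonzero return stops the walk is illustrative:

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>

/*
 * Illustrative callback: report each dataset visited and keep going.
 */
/*ARGSUSED*/
static int
example_visit_ds(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dprintf("pool %s: dataset obj %llu (%s)\n", spa_name(spa),
	    (u_longlong_t)dsobj, dsname ? dsname : "<unknown>");
	return (0);
}
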
 
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Mon Jul 07 13:39:21 2008 -0700
@@ -71,15 +71,15 @@
 #define	DS_FLAG_CI_DATASET	(1ULL<<16)
 
 typedef struct dsl_dataset_phys {
-	uint64_t ds_dir_obj;
-	uint64_t ds_prev_snap_obj;
+	uint64_t ds_dir_obj;		/* DMU_OT_DSL_DIR */
+	uint64_t ds_prev_snap_obj;	/* DMU_OT_DSL_DATASET */
 	uint64_t ds_prev_snap_txg;
-	uint64_t ds_next_snap_obj;
-	uint64_t ds_snapnames_zapobj;	/* zap obj of snaps; ==0 for snaps */
+	uint64_t ds_next_snap_obj;	/* DMU_OT_DSL_DATASET */
+	uint64_t ds_snapnames_zapobj;	/* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */
 	uint64_t ds_num_children;	/* clone/snap children; ==0 for head */
 	uint64_t ds_creation_time;	/* seconds since 1970 */
 	uint64_t ds_creation_txg;
-	uint64_t ds_deadlist_obj;
+	uint64_t ds_deadlist_obj;	/* DMU_OT_BPLIST */
 	uint64_t ds_used_bytes;
 	uint64_t ds_compressed_bytes;
 	uint64_t ds_uncompressed_bytes;
@@ -91,9 +91,10 @@
 	 */
 	uint64_t ds_fsid_guid;
 	uint64_t ds_guid;
-	uint64_t ds_flags;
+	uint64_t ds_flags;		/* DS_FLAG_* */
 	blkptr_t ds_bp;
-	uint64_t ds_pad[8]; /* pad out to 320 bytes for good measure */
+	uint64_t ds_next_clones_obj;	/* DMU_OT_DSL_NEXT_CLONES */
+	uint64_t ds_pad[7]; /* pad out to 320 bytes for good measure */
 } dsl_dataset_phys_t;
 
 typedef struct dsl_dataset {
@@ -158,13 +159,14 @@
 void dsl_dataset_name(dsl_dataset_t *ds, char *name);
 void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
 void dsl_dataset_disown(dsl_dataset_t *ds, void *owner);
+void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag);
 boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok,
     void *owner);
 void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner);
-uint64_t dsl_dataset_create_sync_impl(dsl_dir_t *dd, dsl_dataset_t *origin,
-    uint64_t flags, dmu_tx_t *tx);
 uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
     dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
+uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
+    uint64_t flags, dmu_tx_t *tx);
 int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag);
 int dsl_snapshots_destroy(char *fsname, char *snapname);
 dsl_checkfunc_t dsl_dataset_destroy_check;
@@ -204,9 +206,6 @@
     uint64_t *usedobjsp, uint64_t *availobjsp);
 uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds);
 
-void dsl_dataset_create_root(struct dsl_pool *dp, uint64_t *ddobjp,
-    dmu_tx_t *tx);
-
 int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf);
 
 int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h	Mon Jul 07 13:39:21 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -100,8 +100,8 @@
 void dsl_dir_name(dsl_dir_t *dd, char *buf);
 int dsl_dir_namelen(dsl_dir_t *dd);
 int dsl_dir_is_private(dsl_dir_t *dd);
-uint64_t dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx);
-void dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx);
+uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds,
+    const char *name, dmu_tx_t *tx);
 dsl_checkfunc_t dsl_dir_destroy_check;
 dsl_syncfunc_t dsl_dir_destroy_sync;
 void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv);
@@ -121,9 +121,11 @@
 int dsl_dir_rename(dsl_dir_t *dd, const char *newname);
 int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
 int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx);
+boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
 
 /* internal reserved dir name */
 #define	MOS_DIR_NAME "$MOS"
+#define	ORIGIN_DIR_NAME "$ORIGIN"
 
 #ifdef ZFS_DEBUG
 #define	dprintf_dd(dd, fmt, ...) do { \
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Mon Jul 07 13:39:21 2008 -0700
@@ -32,6 +32,7 @@
 #include <sys/txg.h>
 #include <sys/txg_impl.h>
 #include <sys/zfs_context.h>
+#include <sys/zio.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -39,6 +40,16 @@
 
 struct objset;
 struct dsl_dir;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+
+enum scrub_func {
+	SCRUB_FUNC_NONE,
+	SCRUB_FUNC_CLEAN,
+	SCRUB_FUNC_NUMFUNCS
+};
+
 
 typedef struct dsl_pool {
 	/* Immutable */
@@ -46,6 +57,7 @@
 	struct objset *dp_meta_objset;
 	struct dsl_dir *dp_root_dir;
 	struct dsl_dir *dp_mos_dir;
+	struct dsl_dataset *dp_origin_snap;
 	uint64_t dp_root_dir_obj;
 
 	/* No lock needed - sync context only */
@@ -58,6 +70,17 @@
 	uint64_t dp_space_towrite[TXG_SIZE];
 	uint64_t dp_tempreserved[TXG_SIZE];
 
+	enum scrub_func dp_scrub_func;
+	uint64_t dp_scrub_queue_obj;
+	uint64_t dp_scrub_min_txg;
+	uint64_t dp_scrub_max_txg;
+	zbookmark_t dp_scrub_bookmark;
+	boolean_t dp_scrub_pausing;
+	boolean_t dp_scrub_isresilver;
+	uint64_t dp_scrub_start_time;
+	kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
+	boolean_t dp_scrub_restart;
+
 	/* Has its own locking */
 	tx_state_t dp_tx;
 	txg_list_t dp_dirty_datasets;
@@ -84,6 +107,17 @@
 void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
 void dsl_pool_memory_pressure(dsl_pool_t *dp);
 void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
+int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
+    zio_done_func_t *done, void *private, uint32_t arc_flags);
+void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
+
+int dsl_pool_scrub_cancel(dsl_pool_t *dp);
+int dsl_pool_scrub_clean(dsl_pool_t *dp);
+void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_scrub_restart(dsl_pool_t *dp);
 
 #ifdef	__cplusplus
 }
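
With the scrub machinery moved into the DSL layer, the pool-level entry points are dsl_pool_scrub_clean() to start or resume a scrub, dsl_pool_scrub_cancel() to stop one, and dsl_pool_scrub_restart() to request a restart after a config change (as spa_misc.c above now does).  A hedged sketch of a POOL_SCRUB_* dispatcher built on them; the mapping and error codes are assumptions, not the spa.c code from this changeset:

#include <sys/spa.h>
#include <sys/dsl_pool.h>
#include <sys/errno.h>

/*
 * Sketch: translate the ioctl-level scrub type into the new DSL pool
 * entry points.
 */
static int
example_pool_scrub(spa_t *spa, pool_scrub_type_t type)
{
	dsl_pool_t *dp = spa_get_dsl(spa);

	switch (type) {
	case POOL_SCRUB_EVERYTHING:
	case POOL_SCRUB_RESILVER:
		return (dsl_pool_scrub_clean(dp));
	case POOL_SCRUB_NONE:
		return (dsl_pool_scrub_cancel(dp));
	default:
		return (ENOTSUP);
	}
}
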
--- a/usr/src/uts/common/fs/zfs/sys/dsl_prop.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_prop.h	Mon Jul 07 13:39:21 2008 -0700
@@ -55,6 +55,8 @@
     dsl_prop_changed_cb_t *callback, void *cbarg);
 int dsl_prop_numcb(struct dsl_dataset *ds);
 
+int dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
+    int intsz, int numints, void *buf, char *setpoint);
 int dsl_prop_get(const char *ddname, const char *propname,
     int intsz, int numints, void *buf, char *setpoint);
 int dsl_prop_get_integer(const char *ddname, const char *propname,
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Mon Jul 07 13:39:21 2008 -0700
@@ -337,6 +337,7 @@
 extern int spa_export(char *pool, nvlist_t **oldconfig);
 extern int spa_reset(char *pool);
 extern void spa_async_request(spa_t *spa, int flag);
+extern void spa_async_unrequest(spa_t *spa, int flag);
 extern void spa_async_suspend(spa_t *spa);
 extern void spa_async_resume(spa_t *spa);
 extern spa_t *spa_inject_addref(char *pool);
@@ -344,7 +345,6 @@
 
 #define	SPA_ASYNC_REMOVE	0x01
 #define	SPA_ASYNC_RESILVER_DONE	0x02
-#define	SPA_ASYNC_SCRUB		0x04
 #define	SPA_ASYNC_RESILVER	0x08
 #define	SPA_ASYNC_CONFIG_UPDATE	0x10
 
@@ -371,10 +371,7 @@
 extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc);
 
 /* scrubbing */
-extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force);
-extern void spa_scrub_suspend(spa_t *spa);
-extern void spa_scrub_resume(spa_t *spa);
-extern void spa_scrub_restart(spa_t *spa, uint64_t txg);
+extern int spa_scrub(spa_t *spa, pool_scrub_type_t type);
 
 /* spa syncing */
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
@@ -425,7 +422,7 @@
 
 /* Accessor functions */
 extern krwlock_t *spa_traverse_rwlock(spa_t *spa);
-extern int spa_traverse_wanted(spa_t *spa);
+extern boolean_t spa_traverse_wanted(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
 extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
 extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Mon Jul 07 13:39:21 2008 -0700
@@ -119,21 +119,15 @@
 	uberblock_t	spa_ubsync;		/* last synced uberblock */
 	uberblock_t	spa_uberblock;		/* current uberblock */
 	kmutex_t	spa_scrub_lock;		/* resilver/scrub lock */
-	kthread_t	*spa_scrub_thread;	/* scrub/resilver thread */
-	traverse_handle_t *spa_scrub_th;	/* scrub traverse handle */
-	uint64_t	spa_scrub_restart_txg;	/* need to restart */
-	uint64_t	spa_scrub_mintxg;	/* min txg we'll scrub */
-	uint64_t	spa_scrub_maxtxg;	/* max txg we'll scrub */
 	uint64_t	spa_scrub_inflight;	/* in-flight scrub I/Os */
 	uint64_t	spa_scrub_maxinflight;	/* max in-flight scrub I/Os */
 	uint64_t	spa_scrub_errors;	/* scrub I/O error count */
-	int		spa_scrub_suspended;	/* tell scrubber to suspend */
-	kcondvar_t	spa_scrub_cv;		/* scrub thread state change */
 	kcondvar_t	spa_scrub_io_cv;	/* scrub I/O completion */
-	uint8_t		spa_scrub_stop;		/* tell scrubber to stop */
 	uint8_t		spa_scrub_active;	/* active or suspended? */
 	uint8_t		spa_scrub_type;		/* type of scrub we're doing */
 	uint8_t		spa_scrub_finished;	/* indicator to rotate logs */
+	uint8_t		spa_scrub_started;	/* started since last boot */
+	uint8_t		spa_scrub_reopen;	/* scrub doing vdev_reopen */
 	kmutex_t	spa_async_lock;		/* protect async state */
 	kthread_t	*spa_async_thread;	/* thread doing async task */
 	int		spa_async_suspended;	/* async tasks suspended */
@@ -164,6 +158,7 @@
 	uint8_t		spa_failmode;		/* failure mode for the pool */
 	boolean_t	spa_import_faulted;	/* allow faulted vdevs */
 	boolean_t	spa_is_root;		/* pool is root */
+	int		spa_minref;		/* num refs when first opened */
 	/*
 	 * spa_refcnt & spa_config_lock must be the last elements
 	 * because refcount_t changes size based on compilation options.
--- a/usr/src/uts/common/fs/zfs/sys/txg.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/txg.h	Mon Jul 07 13:39:21 2008 -0700
@@ -102,7 +102,10 @@
  * Returns TRUE if we are "backed up" waiting for the syncing
  * transaction to complete; otherwise returns FALSE.
  */
-extern int txg_stalled(struct dsl_pool *dp);
+extern boolean_t txg_stalled(struct dsl_pool *dp);
+
+/* returns TRUE if someone is waiting for the next txg to sync */
+extern boolean_t txg_sync_waiting(struct dsl_pool *dp);
 
 /*
  * Per-txg object lists.
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h	Mon Jul 07 13:39:21 2008 -0700
@@ -63,6 +63,8 @@
 extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
 extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
     int scrub_done);
+extern boolean_t vdev_resilver_needed(vdev_t *vd,
+    uint64_t *minp, uint64_t *maxp);
 
 extern const char *vdev_description(vdev_t *vd);
 
--- a/usr/src/uts/common/fs/zfs/sys/zap.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zap.h	Mon Jul 07 13:39:21 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -243,6 +243,21 @@
 int zap_value_search(objset_t *os, uint64_t zapobj,
     uint64_t value, uint64_t mask, char *name);
 
+/*
+ * Transfer all the entries from fromobj into intoobj.  Only works on
+ * int_size=8 num_integers=1 values.  Fails if there are any duplicated
+ * entries.
+ */
+int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx);
+
+/*
+ * Manipulate entries where the name + value are the "same" (the name is
+ * a stringified version of the value).
+ */
+int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
+int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx);
+int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value);
+
 struct zap;
 struct zap_leaf;
 typedef struct zap_cursor {
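
The *_int helpers make it easy to use a ZAP object as a set of 64-bit values (the entry name is just the value printed in hex), which is how the new scrub queue and, below, the znode unlinked set are maintained.  A short hedged sketch of the pattern; the object and function names here are illustrative:

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/zap.h>

/*
 * Sketch: treat "setobj" as a set of object numbers.  zap_lookup_int()
 * returns 0 when the value is present and ENOENT when it is not.
 */
static int
example_set_ops(objset_t *os, uint64_t setobj, uint64_t obj, dmu_tx_t *tx)
{
	int err;

	err = zap_add_int(os, setobj, obj, tx);		/* insert */
	if (err != 0)
		return (err);

	VERIFY(zap_lookup_int(os, setobj, obj) == 0);	/* membership */

	return (zap_remove_int(os, setobj, obj, tx));	/* remove */
}
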
--- a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h	Mon Jul 07 13:39:21 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -151,7 +151,7 @@
 } zap_leaf_chunk_t;
 
 typedef struct zap_leaf {
-	krwlock_t l_rwlock; 		/* only used on head of chain */
+	krwlock_t l_rwlock;
 	uint64_t l_blkid;		/* 1<<ZAP_BLOCK_SHIFT byte block off */
 	int l_bs;			/* block size shift */
 	dmu_buf_t *l_dbuf;
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h	Mon Jul 07 13:39:21 2008 -0700
@@ -290,7 +290,7 @@
 	if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \
 		zfs_time_stamper(zp, ACCESSED, NULL)
 
-extern int	zfs_init_fs(zfsvfs_t *, znode_t **, cred_t *);
+extern int	zfs_init_fs(zfsvfs_t *, znode_t **);
 extern void	zfs_set_dataprop(objset_t *);
 extern void	zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *,
     dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Mon Jul 07 13:39:21 2008 -0700
@@ -291,14 +291,14 @@
 extern zio_t *zio_root(spa_t *spa,
     zio_done_func_t *done, void *private, int flags);
 
-extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
     uint64_t size, zio_done_func_t *done, void *private,
-    int priority, int flags, zbookmark_t *zb);
+    int priority, int flags, const zbookmark_t *zb);
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
     int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
     zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
-    int flags, zbookmark_t *zb);
+    int flags, const zbookmark_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum, blkptr_t *bp,
     void *data, uint64_t size, zio_done_func_t *done, void *private,
--- a/usr/src/uts/common/fs/zfs/txg.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/txg.c	Mon Jul 07 13:39:21 2008 -0700
@@ -121,7 +121,12 @@
 	tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
 	    dp, 0, &p0, TS_RUN, minclsyspri);
 
-	tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
+	/*
+	 * The sync thread may need a larger-than-default stack size on
+	 * 32-bit x86.  This is due in part to nested pools and
+	 * scrub_visitbp() recursion.
+	 */
+	tx->tx_sync_thread = thread_create(NULL, 12<<10, txg_sync_thread,
 	    dp, 0, &p0, TS_RUN, minclsyspri);
 
 	mutex_exit(&tx->tx_sync_lock);
@@ -502,13 +507,22 @@
 	mutex_exit(&tx->tx_sync_lock);
 }
 
-int
+boolean_t
 txg_stalled(dsl_pool_t *dp)
 {
 	tx_state_t *tx = &dp->dp_tx;
 	return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
 }
 
+boolean_t
+txg_sync_waiting(dsl_pool_t *dp)
+{
+	tx_state_t *tx = &dp->dp_tx;
+
+	return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
+	    tx->tx_quiesced_txg != 0);
+}
+
 void
 txg_suspend(dsl_pool_t *dp)
 {
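
txg_sync_waiting() gives long-running sync-context work such as the scrub a cheap way to notice that another thread is blocked on the current txg, so it can pause rather than stretch the sync out.  A hedged sketch of that use; the helper name is invented, and the real pause test in dsl_scrub.c also factors in elapsed time:

#include <sys/txg.h>
#include <sys/dsl_pool.h>

/*
 * Sketch: if someone is waiting on this txg, record that the scrub is
 * pausing so the traversal stops at its current bookmark and resumes
 * in a later txg.
 */
static boolean_t
example_scrub_should_pause(dsl_pool_t *dp)
{
	if (txg_sync_waiting(dp)) {
		dp->dp_scrub_pausing = B_TRUE;
		return (B_TRUE);
	}
	return (B_FALSE);
}
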
--- a/usr/src/uts/common/fs/zfs/vdev.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Mon Jul 07 13:39:21 2008 -0700
@@ -58,8 +58,8 @@
 	NULL
 };
 
-/* maximum scrub/resilver I/O queue */
-int zfs_scrub_limit = 70;
+/* maximum scrub/resilver I/O queue per leaf vdev */
+int zfs_scrub_limit = 10;
 
 /*
  * Given a vdev type, return the appropriate ops vector.
@@ -137,11 +137,12 @@
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
-	ASSERT(spa_config_held(spa, RW_READER) ||
-	    curthread == spa->spa_scrub_thread);
+	ASSERT(spa_config_held(spa, RW_READER));
 
-	if (vdev < rvd->vdev_children)
+	if (vdev < rvd->vdev_children) {
+		ASSERT(rvd->vdev_child[vdev] != NULL);
 		return (rvd->vdev_child[vdev]);
+	}
 
 	return (NULL);
 }
@@ -970,6 +971,19 @@
 		    (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
 	}
 
+	/*
+	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
+	 * resilver.  But don't do this if we are doing a reopen for a
+	 * scrub, since this would just restart the scrub we are already
+	 * doing.
+	 */
+	if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) {
+		mutex_enter(&vd->vdev_dtl_lock);
+		if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd))
+			spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER);
+		mutex_exit(&vd->vdev_dtl_lock);
+	}
+
 	return (0);
 }
 
@@ -1212,22 +1226,27 @@
 	spa_t *spa = vd->vdev_spa;
 	int c;
 
-	ASSERT(spa_config_held(spa, RW_WRITER));
+	ASSERT(spa_config_held(spa, RW_READER));
 
 	if (vd->vdev_children == 0) {
 		mutex_enter(&vd->vdev_dtl_lock);
-		/*
-		 * We're successfully scrubbed everything up to scrub_txg.
-		 * Therefore, excise all old DTLs up to that point, then
-		 * fold in the DTLs for everything we couldn't scrub.
-		 */
-		if (scrub_txg != 0) {
+		if (scrub_txg != 0 &&
+		    (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
+			/* XXX should check scrub_done? */
+			/*
+			 * We completed a scrub up to scrub_txg.  If we
+			 * did it without rebooting, then the scrub dtl
+			 * will be valid, so excise the old region and
+			 * fold in the scrub dtl.  Otherwise, leave the
+			 * dtl as-is if there was an error.
+			 */
 			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
 			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
 		}
 		if (scrub_done)
 			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
 		mutex_exit(&vd->vdev_dtl_lock);
+
 		if (txg != 0)
 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
 		return;
@@ -1352,6 +1371,49 @@
 	dmu_tx_commit(tx);
 }
 
+/*
+ * Determine if resilver is needed, and if so the txg range.
+ */
+boolean_t
+vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
+{
+	boolean_t needed = B_FALSE;
+	uint64_t thismin = UINT64_MAX;
+	uint64_t thismax = 0;
+
+	if (vd->vdev_children == 0) {
+		mutex_enter(&vd->vdev_dtl_lock);
+		if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) {
+			space_seg_t *ss;
+
+			ss = avl_first(&vd->vdev_dtl_map.sm_root);
+			thismin = ss->ss_start - 1;
+			ss = avl_last(&vd->vdev_dtl_map.sm_root);
+			thismax = ss->ss_end;
+			needed = B_TRUE;
+		}
+		mutex_exit(&vd->vdev_dtl_lock);
+	} else {
+		int c;
+		for (c = 0; c < vd->vdev_children; c++) {
+			vdev_t *cvd = vd->vdev_child[c];
+			uint64_t cmin, cmax;
+
+			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
+				thismin = MIN(thismin, cmin);
+				thismax = MAX(thismax, cmax);
+				needed = B_TRUE;
+			}
+		}
+	}
+
+	if (needed && minp) {
+		*minp = thismin;
+		*maxp = thismax;
+	}
+	return (needed);
+}
+
 void
 vdev_load(vdev_t *vd)
 {
@@ -1656,7 +1718,7 @@
 	 * w/pool name.
 	 */
 	mutex_enter(&spa_namespace_lock);
-	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+	VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
@@ -1831,6 +1893,7 @@
 
 	mutex_enter(&vd->vdev_stat_lock);
 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+	vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
 	vs->vs_state = vd->vdev_state;
 	vs->vs_rsize = vdev_get_rsize(vd);
@@ -1854,7 +1917,6 @@
 			vs->vs_write_errors += cvs->vs_write_errors;
 			vs->vs_checksum_errors += cvs->vs_checksum_errors;
 			vs->vs_scrub_examined += cvs->vs_scrub_examined;
-			vs->vs_scrub_errors += cvs->vs_scrub_errors;
 			mutex_exit(&vd->vdev_stat_lock);
 		}
 	}
@@ -1956,7 +2018,6 @@
 		vs->vs_scrub_complete = 0;
 		vs->vs_scrub_examined = 0;
 		vs->vs_scrub_repaired = 0;
-		vs->vs_scrub_errors = 0;
 		vs->vs_scrub_start = gethrestime_sec();
 		vs->vs_scrub_end = 0;
 	}
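
vdev_resilver_needed() walks the vdev tree and reports whether any writeable leaf has a non-empty DTL, along with the union of the affected txg range, so a resilver can be limited to just the transactions that may be missing.  A hedged sketch of a caller; how the range is then handed to the scrub code is an assumption here:

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev.h>

/*
 * Sketch: given the root vdev, decide whether a resilver is needed and
 * report the txg range it would have to cover.
 */
static void
example_check_resilver(vdev_t *rvd)
{
	uint64_t mintxg, maxtxg;

	if (vdev_resilver_needed(rvd, &mintxg, &maxtxg)) {
		dprintf("resilver needed for txgs %llu-%llu\n",
		    (u_longlong_t)mintxg, (u_longlong_t)maxtxg);
		/* e.g. start a scrub covering just [mintxg, maxtxg] */
	}
}
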
--- a/usr/src/uts/common/fs/zfs/zap.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zap.c	Mon Jul 07 13:39:21 2008 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -903,6 +903,10 @@
 	return (err);
 }
 
+/*
+ * Helper functions for consumers.
+ */
+
 int
 zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
     char *name)
@@ -928,6 +932,53 @@
 	return (err);
 }
 
+int
+zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	int err;
+
+	for (zap_cursor_init(&zc, os, fromobj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    (void) zap_cursor_advance(&zc)) {
+		if (za.za_integer_length != 8 || za.za_num_integers != 1)
+			return (EINVAL);
+		err = zap_add(os, intoobj, za.za_name,
+		    8, 1, &za.za_first_integer, tx);
+		if (err)
+			return (err);
+	}
+	zap_cursor_fini(&zc);
+	return (0);
+}
+
+int
+zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
+{
+	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+	return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
+{
+	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+	return (zap_remove(os, obj, name, tx));
+}
+
+int
+zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
+{
+	char name[20];
+
+	(void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+	return (zap_lookup(os, obj, name, 8, 1, &value));
+}
 
 /*
  * Routines for iterating over the attributes.
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c	Mon Jul 07 13:39:21 2008 -0700
@@ -407,21 +407,6 @@
 	return (error);
 }
 
-static char *
-zfs_unlinked_hexname(char namebuf[17], uint64_t x)
-{
-	char *name = &namebuf[16];
-	const char digits[16] = "0123456789abcdef";
-
-	*name = '\0';
-	do {
-		*--name = digits[x & 0xf];
-		x >>= 4;
-	} while (x != 0);
-
-	return (name);
-}
-
 /*
  * unlinked Set (formerly known as the "delete queue") Error Handling
  *
@@ -440,30 +425,12 @@
 zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
 {
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	char obj_name[17];
-	int error;
 
 	ASSERT(zp->z_unlinked);
 	ASSERT3U(zp->z_phys->zp_links, ==, 0);
 
-	error = zap_add(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
-	    zfs_unlinked_hexname(obj_name, zp->z_id), 8, 1, &zp->z_id, tx);
-	ASSERT3U(error, ==, 0);
-}
-
-static void
-zfs_unlinked_remove(znode_t *zp, dmu_tx_t *tx)
-{
-	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-	char obj_name[17];
-	int error;
-
-	ASSERT(zp->z_unlinked);
-	ASSERT3U(zp->z_phys->zp_links, ==, 0);
-
-	error = zap_remove(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
-	    zfs_unlinked_hexname(obj_name, zp->z_id), tx);
-	ASSERT3U(error, ==, 0);
+	VERIFY3U(0, ==,
+	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
 }
 
 /*
@@ -670,7 +637,8 @@
 	}
 
 	/* Remove this znode from the unlinked set */
-	zfs_unlinked_remove(zp, tx);
+	VERIFY3U(0, ==,
+	    zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
 
 	zfs_znode_delete(zp, tx);
 
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c	Mon Jul 07 13:39:21 2008 -0700
@@ -900,9 +900,7 @@
 	if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
 		return (error);
 
-	mutex_enter(&spa_namespace_lock);
-	error = spa_scrub(spa, zc->zc_cookie, B_FALSE);
-	mutex_exit(&spa_namespace_lock);
+	error = spa_scrub(spa, zc->zc_cookie);
 
 	spa_close(spa, FTAG);
 
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Mon Jul 07 13:39:21 2008 -0700
@@ -638,7 +638,7 @@
 }
 
 static int
-zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr)
+zfs_domount(vfs_t *vfsp, char *osname)
 {
 	dev_t mount_dev;
 	uint64_t recordsize, readonly;
@@ -707,7 +707,7 @@
 	if (error)
 		goto out;
 
-	if (error = zfs_init_fs(zfsvfs, &zp, cr))
+	if (error = zfs_init_fs(zfsvfs, &zp))
 		goto out;
 
 	/* The call to zfs_init_fs leaves the vnode held, release it here. */
@@ -910,7 +910,7 @@
 		if (error = vfs_lock(vfsp))
 			return (error);
 
-		if (error = zfs_domount(vfsp, rootfs.bo_name, CRED())) {
+		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
 			cmn_err(CE_NOTE, "\nzfs_domount: error %d\n", error);
 			goto out;
 		}
@@ -1054,7 +1054,7 @@
 		goto out;
 	}
 
-	error = zfs_domount(vfsp, osname, cr);
+	error = zfs_domount(vfsp, osname);
 
 out:
 	pn_free(&spn);
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c	Mon Jul 07 13:39:21 2008 -0700
@@ -422,45 +422,17 @@
  *	incore "master" object.  Verify version compatibility.
  */
 int
-zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
+zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp)
 {
 	extern int zfsfstype;
 
 	objset_t	*os = zfsvfs->z_os;
 	int		i, error;
-	dmu_object_info_t doi;
 	uint64_t fsid_guid;
 	uint64_t zval;
 
 	*zpp = NULL;
 
-	/*
-	 * XXX - hack to auto-create the pool root filesystem at
-	 * the first attempted mount.
-	 */
-	if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
-		dmu_tx_t *tx = dmu_tx_create(os);
-		uint64_t zpl_version;
-		nvlist_t *zprops;
-
-		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
-		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
-		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
-		error = dmu_tx_assign(tx, TXG_WAIT);
-		ASSERT3U(error, ==, 0);
-		if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
-			zpl_version = ZPL_VERSION;
-		else
-			zpl_version = ZPL_VERSION_FUID - 1;
-
-		VERIFY(nvlist_alloc(&zprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
-		VERIFY(nvlist_add_uint64(zprops,
-		    zfs_prop_to_name(ZFS_PROP_VERSION), zpl_version) == 0);
-		zfs_create_fs(os, cr, zprops, tx);
-		nvlist_free(zprops);
-		dmu_tx_commit(tx);
-	}
-
 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 	if (error) {
 		return (error);
@@ -1505,8 +1477,7 @@
 zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
 {
 	zfsvfs_t	zfsvfs;
-	uint64_t	moid, doid;
-	uint64_t	version = 0;
+	uint64_t	moid, doid, version;
 	uint64_t	sense = ZFS_CASE_SENSITIVE;
 	uint64_t	norm = 0;
 	nvpair_t	*elem;
@@ -1531,6 +1502,12 @@
 	/*
 	 * Set starting attributes.
 	 */
+	if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID)
+		version = ZPL_VERSION;
+	else
+		version = ZPL_VERSION_FUID - 1;
+	error = zap_update(os, moid, ZPL_VERSION_STR,
+	    8, 1, &version, tx);
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
 		/* For the moment we expect all zpl props to be uint64_ts */
--- a/usr/src/uts/common/fs/zfs/zil.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zil.c	Mon Jul 07 13:39:21 2008 -0700
@@ -167,7 +167,11 @@
 
 	*abufpp = NULL;
 
-	error = arc_read(NULL, zilog->zl_spa, &blk, byteswap_uint64_array,
+	/*
+	 * We shouldn't be doing any scrubbing while we're doing log
+	 * replay, so it's OK not to lock.
+	 */
+	error = arc_read_nolock(NULL, zilog->zl_spa, &blk,
 	    arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb);
 
--- a/usr/src/uts/common/fs/zfs/zio.c	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/fs/zfs/zio.c	Mon Jul 07 13:39:21 2008 -0700
@@ -340,6 +340,9 @@
 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 
+	/* Only we should set CONFIG_GRABBED */
+	ASSERT(!(flags & ZIO_FLAG_CONFIG_GRABBED));
+
 	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 	bzero(zio, sizeof (zio_t));
 	zio->io_parent = pio;
@@ -418,6 +421,18 @@
 	zio->io_orig_pipeline = zio->io_pipeline;
 	zio->io_orig_flags = zio->io_flags;
 
+	/*
+	 * If this is not a null zio, and config is not already held,
+	 * then the root zio should have grabbed the config lock.
+	 * If this is not a root zio, it should not have grabbed the
+	 * config lock.
+	 */
+	ASSERT((zio->io_root->io_flags & ZIO_FLAG_CONFIG_HELD) ||
+	    zio->io_type == ZIO_TYPE_NULL ||
+	    (zio->io_root->io_flags & ZIO_FLAG_CONFIG_GRABBED));
+	ASSERT(zio->io_root == zio ||
+	    !(zio->io_flags & ZIO_FLAG_CONFIG_GRABBED));
+
 	return (zio);
 }
 
@@ -452,9 +467,9 @@
 }
 
 zio_t *
-zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
+zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
     uint64_t size, zio_done_func_t *done, void *private,
-    int priority, int flags, zbookmark_t *zb)
+    int priority, int flags, const zbookmark_t *zb)
 {
 	zio_t *zio;
 
@@ -467,7 +482,8 @@
 	if (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
 		ZIO_ENTER(spa);
 
-	zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
+	zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp,
+	    data, size, done, private,
 	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_USER,
 	    ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
 	zio->io_bookmark = *zb;
@@ -486,7 +502,7 @@
 zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
     zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
-    int flags, zbookmark_t *zb)
+    int flags, const zbookmark_t *zb)
 {
 	zio_t *zio;
 
--- a/usr/src/uts/common/sys/fs/zfs.h	Mon Jul 07 13:38:41 2008 -0700
+++ b/usr/src/uts/common/sys/fs/zfs.h	Mon Jul 07 13:39:21 2008 -0700
@@ -227,14 +227,14 @@
 #define	SPA_VERSION_8			8ULL
 #define	SPA_VERSION_9			9ULL
 #define	SPA_VERSION_10			10ULL
-
+#define	SPA_VERSION_11			11ULL
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
  * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*},
  * and make the appropriate changes.
  */
-#define	SPA_VERSION			SPA_VERSION_10
-#define	SPA_VERSION_STRING		"10"
+#define	SPA_VERSION			SPA_VERSION_11
+#define	SPA_VERSION_STRING		"11"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -264,6 +264,9 @@
 #define	SPA_VERSION_REFQUOTA		SPA_VERSION_9
 #define	SPA_VERSION_UNIQUE_ACCURATE	SPA_VERSION_9
 #define	SPA_VERSION_L2CACHE		SPA_VERSION_10
+#define	SPA_VERSION_NEXT_CLONES		SPA_VERSION_11
+#define	SPA_VERSION_ORIGIN		SPA_VERSION_11
+#define	SPA_VERSION_DSL_SCRUB		SPA_VERSION_11
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change
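
Each of the new on-disk features above is gated on SPA_VERSION_11, so code that reads or writes the new structures is expected to check the pool version first.  A hedged sketch of such a gate; the surrounding helper is hypothetical:

#include <sys/spa.h>
#include <sys/fs/zfs.h>

/*
 * Sketch: only consult the per-pool DSL scrub state if the pool is at
 * a version that has it; older pools fall back to whatever legacy
 * behavior the caller implements.
 */
static boolean_t
example_has_dsl_scrub(spa_t *spa)
{
	return (spa_version(spa) >= SPA_VERSION_DSL_SCRUB);
}
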
@@ -600,6 +603,11 @@
 #define	ZFS_EV_VDEV_PATH	"vdev_path"
 #define	ZFS_EV_VDEV_GUID	"vdev_guid"
 
+/*
+ * Note: This is encoded on-disk, so new events must be added to the
+ * end, and unused events cannot be removed.  Be sure to edit
+ * zpool_main.c: hist_event_table[].
+ */
 typedef enum history_internal_events {
 	LOG_NO_EVENT = 0,
 	LOG_POOL_CREATE,
@@ -638,6 +646,7 @@
 	LOG_DS_UPGRADE,
 	LOG_DS_REFQUOTA,
 	LOG_DS_REFRESERV,
+	LOG_POOL_SCRUB_DONE,
 	LOG_END
 } history_internal_events_t;