6899923 vdev_offline/vdev_add deadlock
author Jeff Bonwick <Jeff.Bonwick@Sun.COM>
Thu, 19 Nov 2009 22:24:55 -0800
changeset 11125 fca3e6d28599
parent 11124 595a67d9f11a
child 11126 fa551ad7be3b
6899923 vdev_offline/vdev_add deadlock
6900497 zdb -S could be much faster
6900898 deduped blocks should be scrubbed/resilvered exactly once
usr/src/cmd/zdb/zdb.c
usr/src/cmd/ztest/ztest.c
usr/src/common/zfs/zpool_prop.c
usr/src/uts/common/fs/zfs/ddt.c
usr/src/uts/common/fs/zfs/dmu_traverse.c
usr/src/uts/common/fs/zfs/dsl_pool.c
usr/src/uts/common/fs/zfs/dsl_scrub.c
usr/src/uts/common/fs/zfs/spa.c
usr/src/uts/common/fs/zfs/spa_misc.c
usr/src/uts/common/fs/zfs/sys/ddt.h
usr/src/uts/common/fs/zfs/sys/dmu.h
usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
usr/src/uts/common/fs/zfs/sys/dsl_pool.h
usr/src/uts/common/fs/zfs/zio.c
--- a/usr/src/cmd/zdb/zdb.c	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/cmd/zdb/zdb.c	Thu Nov 19 22:24:55 2009 -0800
@@ -94,14 +94,15 @@
 usage(void)
 {
 	(void) fprintf(stderr,
-	    "Usage: %s [-CumdibcsvhL] [-S user:cksumalg] "
+	    "Usage: %s [-CumdibcsvhL] "
 	    "poolname [object...]\n"
 	    "       %s [-div] dataset [object...]\n"
 	    "       %s -m [-L] poolname [vdev [metaslab...]]\n"
 	    "       %s -R poolname vdev:offset:size[:flags]\n"
+	    "       %s -S poolname\n"
 	    "       %s -l device\n"
 	    "       %s -C\n\n",
-	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
+	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
 
 	(void) fprintf(stderr, "    Dataset name must include at least one "
 	    "separator character '/' or '@'\n");
@@ -549,7 +550,7 @@
 	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 		if (ddp->ddp_phys_birth == 0)
 			continue;
-		ddt_bp_create(ddt, ddk, ddp, &blk);
+		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 		sprintf_blkptr(blkbuf, &blk);
 		(void) printf("index %llx refcnt %llu %s %s\n",
 		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
@@ -686,7 +687,7 @@
 
 	(void) printf("%s contents:\n\n", name);
 
-	while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0)
+	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
 		dump_dde(ddt, &dde, walk);
 
 	ASSERT(error == ENOENT);
@@ -1344,7 +1345,8 @@
 	nicenum(doi.doi_physical_blocks_512 << 9, asize);
 	nicenum(doi.doi_bonus_size, bonus_size);
 	(void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count *
-	    doi.doi_data_block_size / doi.doi_max_offset);
+	    doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) /
+	    doi.doi_max_offset);
 
 	aux[0] = '\0';
 
@@ -1865,26 +1867,28 @@
 };
 
 static void
-zdb_ddt_leak_init(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
-    zdb_cb_t *zcb)
+zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
-	uint64_t walk = 0;
+	ddt_bookmark_t ddb = { 0 };
 	ddt_entry_t dde;
 	int error;
 
-	if (class == DDT_CLASS_UNIQUE || !ddt_object_exists(ddt, type, class))
-		return;
-
-	while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) {
+	while ((error = ddt_walk(spa, &ddb, &dde)) == 0) {
 		blkptr_t blk;
 		ddt_phys_t *ddp = dde.dde_phys;
+
+		if (ddb.ddb_class == DDT_CLASS_UNIQUE)
+			return;
+
 		ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+
 		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 			if (ddp->ddp_phys_birth == 0)
 				continue;
-			ddt_bp_create(ddt, &dde.dde_key, ddp, &blk);
+			ddt_bp_create(ddb.ddb_checksum,
+			    &dde.dde_key, ddp, &blk);
 			if (p == DDT_PHYS_DITTO) {
-				zdb_count_block(ddt->ddt_spa, NULL, zcb, &blk,
+				zdb_count_block(spa, NULL, zcb, &blk,
 				    ZDB_OT_DITTO);
 			} else {
 				zcb->zcb_dedup_asize +=
@@ -1893,6 +1897,7 @@
 			}
 		}
 		if (!dump_opt['L']) {
+			ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum];
 			ddt_enter(ddt);
 			VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL);
 			ddt_exit(ddt);
@@ -1924,12 +1929,7 @@
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
-	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
-		for (enum ddt_type type = 0; type < DDT_TYPES; type++)
-			for (enum ddt_class class = 0; class < DDT_CLASSES;
-			    class++)
-				zdb_ddt_leak_init(spa->spa_ddt[c],
-				    type, class, zcb);
+	zdb_ddt_leak_init(spa, zcb);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 }
@@ -1957,6 +1957,7 @@
 	zdb_cb_t zcb = { 0 };
 	zdb_blkstats_t *zb, *tzb;
 	uint64_t norm_alloc, norm_space, total_alloc, total_found;
+	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA;
 	int leaks = 0;
 
 	(void) printf("\nTraversing all blocks %s%s%s%s%s...\n",
@@ -2000,7 +2001,10 @@
 		bplist_close(bpl);
 	}
 
-	zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb, 0);
+	if (dump_opt['c'] > 1)
+		flags |= TRAVERSE_PREFETCH_DATA;
+
+	zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
 
 	if (zcb.zcb_haderrors) {
 		(void) printf("\nError counts:\n\n");
@@ -2170,7 +2174,8 @@
 		    avl_numnodes(t));
 	}
 
-	if (BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata)
+	if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF ||
+	    BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata)
 		return (0);
 
 	ddt_key_fill(&zdde_search.zdde_key, bp);
@@ -2205,7 +2210,8 @@
 
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
-	(void) traverse_pool(spa, zdb_ddt_add_cb, &t, 0);
+	(void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+	    zdb_ddt_add_cb, &t);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
--- a/usr/src/cmd/ztest/ztest.c	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/cmd/ztest/ztest.c	Thu Nov 19 22:24:55 2009 -0800
@@ -837,7 +837,6 @@
 	return (error);
 }
 
-#if 0
 static int
 ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value)
 {
@@ -860,7 +859,6 @@
 
 	return (error);
 }
-#endif
 
 static void
 ztest_rll_init(rll_t *rll)
@@ -4134,10 +4132,8 @@
 
 	(void) rw_rdlock(&zs->zs_name_lock);
 
-#if 0
 	(void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO,
 	    ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN));
-#endif
 
 	VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0);
 
--- a/usr/src/common/zfs/zpool_prop.c	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/common/zfs/zpool_prop.c	Thu Nov 19 22:24:55 2009 -0800
@@ -90,6 +90,8 @@
 	/* default number properties */
 	register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
 	    PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
+	register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0,
+	    PROP_DEFAULT, ZFS_TYPE_POOL, "<threshold (min 100)>", "DEDUPDITTO");
 
 	/* default index (boolean) properties */
 	register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT,
@@ -109,8 +111,6 @@
 	/* hidden properties */
 	register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
 	    PROP_READONLY, ZFS_TYPE_POOL, "NAME");
-	register_hidden(ZPOOL_PROP_DEDUPDITTO, "dedupditto", PROP_TYPE_NUMBER,
-	    PROP_READONLY, ZFS_TYPE_POOL, "DEDUPDITTO");
 }
 
 /*
--- a/usr/src/uts/common/fs/zfs/ddt.c	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/ddt.c	Thu Nov 19 22:24:55 2009 -0800
@@ -32,6 +32,7 @@
 #include <sys/zap.h>
 #include <sys/dmu_tx.h>
 #include <sys/arc.h>
+#include <sys/dsl_pool.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 
@@ -158,7 +159,7 @@
 
 int
 ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
-    ddt_entry_t *dde, uint64_t *walk)
+    uint64_t *walk, ddt_entry_t *dde)
 {
 	ASSERT(ddt_object_exists(ddt, type, class));
 
@@ -212,8 +213,8 @@
 }
 
 void
-ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk, const ddt_phys_t *ddp,
-    blkptr_t *bp)
+ddt_bp_create(enum zio_checksum checksum,
+    const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
 {
 	BP_ZERO(bp);
 
@@ -225,7 +226,7 @@
 	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
 	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
 	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
-	BP_SET_CHECKSUM(bp, ddt->ddt_checksum);
+	BP_SET_CHECKSUM(bp, checksum);
 	BP_SET_TYPE(bp, DMU_OT_NONE);
 	BP_SET_LEVEL(bp, 0);
 	BP_SET_DEDUP(bp, 0);
@@ -277,7 +278,7 @@
 {
 	blkptr_t blk;
 
-	ddt_bp_create(ddt, ddk, ddp, &blk);
+	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 	ddt_phys_clear(ddp);
 	zio_free(ddt->ddt_spa, txg, &blk);
 }
@@ -750,6 +751,30 @@
 	}
 }
 
+boolean_t
+ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+{
+	ddt_t *ddt;
+	ddt_entry_t dde;
+
+	if (!BP_GET_DEDUP(bp))
+		return (B_FALSE);
+
+	if (max_class == DDT_CLASS_UNIQUE)
+		return (B_TRUE);
+
+	ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
+
+	ddt_key_fill(&dde.dde_key, bp);
+
+	for (enum ddt_type type = 0; type < DDT_TYPES; type++)
+		for (enum ddt_class class = 0; class <= max_class; class++)
+			if (ddt_object_lookup(ddt, type, class, &dde) == 0)
+				return (B_TRUE);
+
+	return (B_FALSE);
+}
+
 ddt_entry_t *
 ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
 {
@@ -820,7 +845,7 @@
 		    ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
 		    bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
 			continue;
-		ddt_bp_create(ddt, ddk, ddp, &blk);
+		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
 		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
 		    rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
 		    ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
@@ -845,7 +870,7 @@
 		rdde_next = AVL_NEXT(t, rdde);
 		avl_remove(&ddt->ddt_repair_tree, rdde);
 		ddt_exit(ddt);
-		ddt_bp_create(ddt, &rdde->dde_key, NULL, &blk);
+		ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
 		dde = ddt_repair_start(ddt, &blk);
 		ddt_repair_entry(ddt, dde, rdde, rio);
 		ddt_repair_done(ddt, dde);
@@ -857,6 +882,7 @@
 static void
 ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 {
+	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
 	ddt_phys_t *ddp = dde->dde_phys;
 	ddt_key_t *ddk = &dde->dde_key;
 	enum ddt_type otype = dde->dde_type;
@@ -905,6 +931,11 @@
 		if (!ddt_object_exists(ddt, ntype, nclass))
 			ddt_object_create(ddt, ntype, nclass, tx);
 		VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+
+		if (dp->dp_scrub_func != SCRUB_FUNC_NONE &&
+		    oclass > dp->dp_scrub_ddt_class_max &&
+		    nclass <= dp->dp_scrub_ddt_class_max)
+			dsl_pool_scrub_ddt_entry(dp, ddt->ddt_checksum, dde);
 	}
 }
 
@@ -968,3 +999,31 @@
 
 	dmu_tx_commit(tx);
 }
+
+int
+ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+{
+	do {
+		do {
+			do {
+				ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+				int error = ENOENT;
+				if (ddt_object_exists(ddt, ddb->ddb_type,
+				    ddb->ddb_class)) {
+					error = ddt_object_walk(ddt,
+					    ddb->ddb_type, ddb->ddb_class,
+					    &ddb->ddb_cursor, dde);
+				}
+				if (error == 0)
+					return (0);
+				if (error != ENOENT)
+					return (error);
+				ddb->ddb_cursor = 0;
+			} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
+			ddb->ddb_checksum = 0;
+		} while (++ddb->ddb_type < DDT_TYPES);
+		ddb->ddb_type = 0;
+	} while (++ddb->ddb_class < DDT_CLASSES);
+
+	return (ENOENT);
+}
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c	Thu Nov 19 22:24:55 2009 -0800
@@ -375,7 +375,8 @@
  * NB: pool must not be changing on-disk (eg, from zdb or sync context).
  */
 int
-traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg, uint64_t txg_start)
+traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
+    blkptr_cb_t func, void *arg)
 {
 	int err;
 	uint64_t obj;
@@ -384,7 +385,7 @@
 
 	/* visit the MOS */
 	err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
-	    txg_start, TRAVERSE_PRE | TRAVERSE_PREFETCH, func, arg);
+	    txg_start, flags, func, arg);
 	if (err)
 		return (err);
 
@@ -408,8 +409,7 @@
 				return (err);
 			if (ds->ds_phys->ds_prev_snap_txg > txg)
 				txg = ds->ds_phys->ds_prev_snap_txg;
-			err = traverse_dataset(ds, txg,
-			    TRAVERSE_PRE | TRAVERSE_PREFETCH, func, arg);
+			err = traverse_dataset(ds, txg, flags, func, arg);
 			dsl_dataset_rele(ds, FTAG);
 			if (err)
 				return (err);
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c	Thu Nov 19 22:24:55 2009 -0800
@@ -171,11 +171,23 @@
 		if (err)
 			goto out;
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-		    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+		    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
+		    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
 		    &dp->dp_scrub_bookmark);
 		if (err)
 			goto out;
 		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
+		    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
+		    &dp->dp_scrub_ddt_bookmark);
+		if (err && err != ENOENT)
+			goto out;
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
+		    &dp->dp_scrub_ddt_class_max);
+		if (err && err != ENOENT)
+			goto out;
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
 		    &spa->spa_scrub_errors);
 		if (err)
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c	Thu Nov 19 22:24:55 2009 -0800
@@ -53,6 +53,7 @@
 int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */
 int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */
 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
 
 extern int zfs_txg_timeout;
 
@@ -78,6 +79,7 @@
 
 	dp->dp_scrub_min_txg = 0;
 	dp->dp_scrub_max_txg = tx->tx_txg;
+	dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max;
 
 	if (*funcp == SCRUB_FUNC_CLEAN) {
 		vdev_t *rvd = dp->dp_spa->spa_root_vdev;
@@ -101,6 +103,14 @@
 		    dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER :
 		    POOL_SCRUB_EVERYTHING, B_FALSE);
 
+		/*
+		 * If this is an incremental scrub, limit the DDT scrub phase
+		 * to just the auto-ditto class (for correctness); the rest
+		 * of the scrub should go faster using top-down pruning.
+		 */
+		if (dp->dp_scrub_min_txg > TXG_INITIAL)
+			dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO;
+
 		dp->dp_spa->spa_scrub_started = B_TRUE;
 	}
 
@@ -119,8 +129,8 @@
 	dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset,
 	    ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx);
 	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+	bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
 	dp->dp_scrub_restart = B_FALSE;
-	dp->dp_scrub_ditto = B_FALSE;
 	dp->dp_spa->spa_scrub_errors = 0;
 
 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -136,9 +146,17 @@
 	    DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1,
 	    &dp->dp_scrub_max_txg, tx));
 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
+	    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
 	    &dp->dp_scrub_bookmark, tx));
 	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
+	    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
+	    &dp->dp_scrub_ddt_bookmark, tx));
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
+	    &dp->dp_scrub_ddt_class_max, tx));
+	VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
 	    &dp->dp_spa->spa_scrub_errors, tx));
 
@@ -186,6 +204,7 @@
 	    dp->dp_scrub_queue_obj, tx));
 	dp->dp_scrub_queue_obj = 0;
 	bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t));
+	bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t));
 
 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_SCRUB_QUEUE, tx));
@@ -200,6 +219,11 @@
 	VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_SCRUB_ERRORS, tx));
 
+	(void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_DDT_BOOKMARK, tx);
+	(void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_DDT_CLASS_MAX, tx);
+
 	spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr,
 	    "complete=%u", *completep);
 
@@ -296,7 +320,7 @@
 }
 
 static boolean_t
-scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb)
+scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb)
 {
 	int elapsed_ticks;
 	int mintime;
@@ -308,7 +332,7 @@
 		return (B_FALSE); /* we're resuming */
 
 	/* We only know how to resume from level-0 blocks. */
-	if (zb->zb_level != 0)
+	if (zb != NULL && zb->zb_level != 0)
 		return (B_FALSE);
 
 	mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time :
@@ -316,11 +340,23 @@
 	elapsed_ticks = ddi_get_lbolt64() - dp->dp_scrub_start_time;
 	if (elapsed_ticks > hz * zfs_txg_timeout ||
 	    (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) {
-		dprintf("pausing at %llx/%llx/%llx/%llx\n",
-		    (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object,
-		    (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid);
+		if (zb) {
+			dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
+			    (longlong_t)zb->zb_objset,
+			    (longlong_t)zb->zb_object,
+			    (longlong_t)zb->zb_level,
+			    (longlong_t)zb->zb_blkid);
+			dp->dp_scrub_bookmark = *zb;
+		}
+		if (ddb) {
+			dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n",
+			    (longlong_t)ddb->ddb_class,
+			    (longlong_t)ddb->ddb_type,
+			    (longlong_t)ddb->ddb_checksum,
+			    (longlong_t)ddb->ddb_cursor);
+			ASSERT(&dp->dp_scrub_ddt_bookmark == ddb);
+		}
 		dp->dp_scrub_pausing = B_TRUE;
-		dp->dp_scrub_bookmark = *zb;
 		return (B_TRUE);
 	}
 	return (B_FALSE);
@@ -423,7 +459,7 @@
 	if (bp->blk_birth <= dp->dp_scrub_min_txg)
 		return;
 
-	if (scrub_pause(dp, zb))
+	if (scrub_pause(dp, zb, NULL))
 		return;
 
 	if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) {
@@ -525,7 +561,13 @@
 		}
 	}
 
-	(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
+	/*
+	 * If dsl_pool_scrub_ddt() has already scrubbed this block,
+	 * don't scrub it again.
+	 */
+	if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp))
+		(void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb);
+
 	if (buf)
 		(void) arc_buf_remove_ref(buf, &buf);
 }
@@ -542,7 +584,6 @@
 		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
 		scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb);
 	}
-
 }
 
 static void
@@ -564,8 +605,8 @@
 		return;
 
 	if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) {
-		SET_BOOKMARK(&dp->dp_scrub_bookmark, ZB_DESTROYED_OBJSET,
-		    0, 0, 0);
+		SET_BOOKMARK(&dp->dp_scrub_bookmark,
+		    ZB_DESTROYED_OBJSET, 0, 0, 0);
 	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
 	    ds->ds_object, tx) != 0) {
 		return;
@@ -771,34 +812,65 @@
 	return (0);
 }
 
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Then the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can.  There are two cases to consider.
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference count changes; if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ *
+ * The code does not actually use the refcnt directly, but rather uses the
+ * dde's replication class (enum ddt_class), which serves the same purpose.
+ */
 static void
-dsl_pool_scrub_ddt(dsl_pool_t *dp, enum zio_checksum c, enum ddt_type type,
-    enum ddt_class class)
+dsl_pool_scrub_ddt(dsl_pool_t *dp)
 {
-	ddt_t *ddt = ddt_select_by_checksum(dp->dp_spa, c);
+	ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark;
 	ddt_entry_t dde;
-	blkptr_t blk;
-	zbookmark_t zb = { 0 };
-	uint64_t walk = 0;
 	int error;
 
-	if (!ddt_object_exists(ddt, type, class))
-		return;
-
-	while ((error = ddt_object_walk(ddt, type, class, &dde, &walk)) == 0) {
-		int p = DDT_PHYS_DITTO;
-		ddt_bp_create(ddt, &dde.dde_key, &dde.dde_phys[p], &blk);
-		scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
+	while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) {
+		if (ddb->ddb_class > dp->dp_scrub_ddt_class_max)
+			return;
+		dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde);
+		if (scrub_pause(dp, NULL, ddb))
+			return;
 	}
 	ASSERT(error == ENOENT);
+	ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max);
 }
 
-static void
-dsl_pool_scrub_ditto(dsl_pool_t *dp)
+void
+dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
+    const ddt_entry_t *dde)
 {
-	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
-		for (enum ddt_type type = 0; type < DDT_TYPES; type++)
-			dsl_pool_scrub_ddt(dp, c, type, DDT_CLASS_DITTO);
+	const ddt_key_t *ddk = &dde->dde_key;
+	const ddt_phys_t *ddp = dde->dde_phys;
+	blkptr_t blk;
+	zbookmark_t zb = { 0 };
+
+	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		if (ddp->ddp_phys_birth == 0)
+			continue;
+		ddt_bp_create(checksum, ddk, ddp, &blk);
+		scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb);
+	}
 }
 
 void
@@ -840,9 +912,10 @@
 	dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0);
 	spa->spa_scrub_active = B_TRUE;
 
-	if (!dp->dp_scrub_ditto) {
-		dsl_pool_scrub_ditto(dp);
-		dp->dp_scrub_ditto = B_TRUE;
+	if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) {
+		dsl_pool_scrub_ddt(dp);
+		if (dp->dp_scrub_pausing)
+			goto out;
 	}
 
 	if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) {
@@ -895,12 +968,18 @@
 	dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx);
 	return;
 out:
-	VERIFY(0 == zap_update(dp->dp_meta_objset,
-	    DMU_POOL_DIRECTORY_OBJECT,
-	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4,
+	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t),
+	    sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t),
 	    &dp->dp_scrub_bookmark, tx));
-	VERIFY(0 == zap_update(dp->dp_meta_objset,
-	    DMU_POOL_DIRECTORY_OBJECT,
+	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t),
+	    sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t),
+	    &dp->dp_scrub_ddt_bookmark, tx));
+	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1,
+	    &dp->dp_scrub_ddt_class_max, tx));
+	VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1,
 	    &spa->spa_scrub_errors, tx));
 
--- a/usr/src/uts/common/fs/zfs/spa.c	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/spa.c	Thu Nov 19 22:24:55 2009 -0800
@@ -1230,8 +1230,8 @@
 	rio = zio_root(spa, NULL, &sle,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 
-	error = traverse_pool(spa, spa_load_verify_cb, rio,
-	    spa->spa_verify_min_txg);
+	error = traverse_pool(spa, spa->spa_verify_min_txg,
+	    TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
 
 	(void) zio_wait(rio);
 
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Thu Nov 19 22:24:55 2009 -0800
@@ -840,8 +840,8 @@
 uint64_t
 spa_vdev_enter(spa_t *spa)
 {
+	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
-	mutex_enter(&spa->spa_vdev_top_lock);
 	return (spa_vdev_config_enter(spa));
 }
 
@@ -937,8 +937,8 @@
 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 {
 	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
+	mutex_exit(&spa_namespace_lock);
 	mutex_exit(&spa->spa_vdev_top_lock);
-	mutex_exit(&spa_namespace_lock);
 
 	return (error);
 }
--- a/usr/src/uts/common/fs/zfs/sys/ddt.h	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/sys/ddt.h	Thu Nov 19 22:24:55 2009 -0800
@@ -153,6 +153,19 @@
 	avl_node_t	ddt_node;
 };
 
+/*
+ * In-core and on-disk bookmark for DDT walks
+ */
+typedef struct ddt_bookmark {
+	uint64_t	ddb_class;
+	uint64_t	ddb_type;
+	uint64_t	ddb_checksum;
+	uint64_t	ddb_cursor;
+} ddt_bookmark_t;
+
+/*
+ * Ops vector to access a specific DDT object type.
+ */
 typedef struct ddt_ops {
 	char ddt_op_name[32];
 	int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx,
@@ -173,7 +186,7 @@
 extern void ddt_object_name(ddt_t *ddt, enum ddt_type type,
     enum ddt_class class, char *name);
 extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
-    enum ddt_class class, ddt_entry_t *dde, uint64_t *walk);
+    enum ddt_class class, uint64_t *walk, ddt_entry_t *dde);
 extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type,
     enum ddt_class class);
 extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
@@ -183,7 +196,7 @@
 
 extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp,
     uint64_t txg);
-extern void ddt_bp_create(const ddt_t *ddt, const ddt_key_t *ddk,
+extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk,
     const ddt_phys_t *ddp, blkptr_t *bp);
 
 extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp);
@@ -214,13 +227,14 @@
 extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len);
 
 extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp);
-extern ddt_t *ddt_select_by_checksum(spa_t *spa, enum zio_checksum c);
-
 extern void ddt_enter(ddt_t *ddt);
 extern void ddt_exit(ddt_t *ddt);
 extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add);
 extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde);
 
+extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class,
+    const blkptr_t *bp);
+
 extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp);
 extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde);
 
@@ -230,6 +244,7 @@
 extern int ddt_load(spa_t *spa);
 extern void ddt_unload(spa_t *spa);
 extern void ddt_sync(spa_t *spa, uint64_t txg);
+extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
 
 extern const ddt_ops_t ddt_zap_ops;
 
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Thu Nov 19 22:24:55 2009 -0800
@@ -211,6 +211,10 @@
 
 /* 4x8 zbookmark_t */
 #define	DMU_POOL_SCRUB_BOOKMARK		"scrub_bookmark"
+/* 4x8 ddt_bookmark_t */
+#define	DMU_POOL_SCRUB_DDT_BOOKMARK	"scrub_ddt_bookmark"
+/* 1x8 max_class */
+#define	DMU_POOL_SCRUB_DDT_CLASS_MAX	"scrub_ddt_class_max"
 /* 1x8 zap obj DMU_OT_SCRUB_QUEUE */
 #define	DMU_POOL_SCRUB_QUEUE		"scrub_queue"
 /* 1x8 txg */
--- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h	Thu Nov 19 22:24:55 2009 -0800
@@ -47,9 +47,10 @@
 #define	TRAVERSE_PREFETCH_DATA		(1<<3)
 #define	TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
 
-int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start,
-    int flags, blkptr_cb_t func, void *arg);
-int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg, uint64_t txg_start);
+int traverse_dataset(struct dsl_dataset *ds,
+    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_pool(spa_t *spa,
+    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
 
 #ifdef	__cplusplus
 }
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Thu Nov 19 22:24:55 2009 -0800
@@ -32,6 +32,7 @@
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
 #include <sys/dnode.h>
+#include <sys/ddt.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -96,13 +97,14 @@
 	uint64_t dp_scrub_queue_obj;
 	uint64_t dp_scrub_min_txg;
 	uint64_t dp_scrub_max_txg;
+	uint64_t dp_scrub_start_time;
+	uint64_t dp_scrub_ddt_class_max;
 	zbookmark_t dp_scrub_bookmark;
+	ddt_bookmark_t dp_scrub_ddt_bookmark;
 	boolean_t dp_scrub_pausing;
 	boolean_t dp_scrub_isresilver;
-	uint64_t dp_scrub_start_time;
+	boolean_t dp_scrub_restart;
 	kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */
-	boolean_t dp_scrub_restart;
-	boolean_t dp_scrub_ditto;
 
 	/* Has its own locking */
 	tx_state_t dp_tx;
@@ -145,6 +147,8 @@
 int dsl_pool_scrub_clean(dsl_pool_t *dp);
 void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_scrub_restart(dsl_pool_t *dp);
+void dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum,
+    const ddt_entry_t *dde);
 
 taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
 
--- a/usr/src/uts/common/fs/zfs/zio.c	Fri Nov 20 12:17:57 2009 +0800
+++ b/usr/src/uts/common/fs/zfs/zio.c	Thu Nov 19 22:24:55 2009 -0800
@@ -867,6 +867,9 @@
 	if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
 		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 
+	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
+		zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
 	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
 		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
 
@@ -1736,7 +1739,8 @@
 		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
 				continue;
-			ddt_bp_create(ddt, &dde->dde_key, ddp, &blk);
+			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
+			    &blk);
 			zio_nowait(zio_read(zio, zio->io_spa, &blk,
 			    zio_buf_alloc(zio->io_size), zio->io_size,
 			    zio_ddt_child_read_done, dde, zio->io_priority,