6948890 snapshot deletion can induce pathologically long spa_sync() times
authorMatthew Ahrens <Matthew.Ahrens@Sun.COM>
Fri, 21 May 2010 17:29:22 -0700
changeset 12470 54258108784b
parent 12469 1664b63fef32
child 12471 10160519e417
6948890 snapshot deletion can induce pathologically long spa_sync() times
usr/src/cmd/zdb/zdb.c
usr/src/cmd/zinject/translate.c
usr/src/cmd/zinject/zinject.c
usr/src/cmd/zinject/zinject.h
usr/src/cmd/zpool/zpool_main.c
usr/src/cmd/ztest/ztest.c
usr/src/grub/capability
usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h
usr/src/uts/common/Makefile.files
usr/src/uts/common/fs/zfs/bplist.c
usr/src/uts/common/fs/zfs/bpobj.c
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/dmu_send.c
usr/src/uts/common/fs/zfs/dsl_dataset.c
usr/src/uts/common/fs/zfs/dsl_deadlist.c
usr/src/uts/common/fs/zfs/dsl_dir.c
usr/src/uts/common/fs/zfs/dsl_pool.c
usr/src/uts/common/fs/zfs/dsl_scan.c
usr/src/uts/common/fs/zfs/spa.c
usr/src/uts/common/fs/zfs/spa_misc.c
usr/src/uts/common/fs/zfs/sys/bplist.h
usr/src/uts/common/fs/zfs/sys/bpobj.h
usr/src/uts/common/fs/zfs/sys/dmu.h
usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
usr/src/uts/common/fs/zfs/sys/dsl_deadlist.h
usr/src/uts/common/fs/zfs/sys/dsl_dir.h
usr/src/uts/common/fs/zfs/sys/dsl_pool.h
usr/src/uts/common/fs/zfs/sys/dsl_scan.h
usr/src/uts/common/fs/zfs/sys/spa.h
usr/src/uts/common/fs/zfs/sys/spa_impl.h
usr/src/uts/common/fs/zfs/sys/zio_checksum.h
usr/src/uts/common/fs/zfs/txg.c
usr/src/uts/common/fs/zfs/zio.c
usr/src/uts/common/fs/zfs/zio_checksum.c
usr/src/uts/common/sys/fs/zfs.h
--- a/usr/src/cmd/zdb/zdb.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/cmd/zdb/zdb.c	Fri May 21 17:29:22 2010 -0700
@@ -900,9 +900,9 @@
 }
 
 static void
-sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp)
+sprintf_blkptr_compact(char *blkbuf, const blkptr_t *bp)
 {
-	dva_t *dva = bp->blk_dva;
+	const dva_t *dva = bp->blk_dva;
 	int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1;
 
 	if (dump_opt['b'] >= 5) {
@@ -1127,12 +1127,21 @@
 	(void) printf("\t\tbp = %s\n", blkbuf);
 }
 
-static void
-dump_bplist(objset_t *mos, uint64_t object, char *name)
+/* ARGSUSED */
+static int
+dump_bpobj_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
-	bplist_t bpl = { 0 };
-	blkptr_t blk, *bp = &blk;
-	uint64_t itor = 0;
+	char blkbuf[BP_SPRINTF_LEN];
+
+	ASSERT(bp->blk_birth != 0);
+	sprintf_blkptr_compact(blkbuf, bp);
+	(void) printf("\t%s\n", blkbuf);
+	return (0);
+}
+
+static void
+dump_bpobj(bpobj_t *bpo, char *name)
+{
 	char bytes[32];
 	char comp[32];
 	char uncomp[32];
@@ -1140,45 +1149,59 @@
 	if (dump_opt['d'] < 3)
 		return;
 
-	bplist_init(&bpl);
-	VERIFY(0 == bplist_open(&bpl, mos, object));
-	if (bplist_empty(&bpl)) {
-		bplist_close(&bpl);
-		bplist_fini(&bpl);
-		return;
-	}
-
-	zdb_nicenum(bpl.bpl_phys->bpl_bytes, bytes);
-	if (bpl.bpl_dbuf->db_size == sizeof (bplist_phys_t)) {
-		zdb_nicenum(bpl.bpl_phys->bpl_comp, comp);
-		zdb_nicenum(bpl.bpl_phys->bpl_uncomp, uncomp);
-		(void) printf("\n    %s: %llu entries, %s (%s/%s comp)\n",
-		    name, (u_longlong_t)bpl.bpl_phys->bpl_entries,
+	zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes);
+	if (bpo->bpo_havesubobj) {
+		zdb_nicenum(bpo->bpo_phys->bpo_comp, comp);
+		zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp);
+		(void) printf("\n    %s: %llu local blkptrs, %llu subobjs, "
+		    "%s (%s/%s comp)\n",
+		    name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs,
+		    (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs,
 		    bytes, comp, uncomp);
 	} else {
-		(void) printf("\n    %s: %llu entries, %s\n",
-		    name, (u_longlong_t)bpl.bpl_phys->bpl_entries, bytes);
+		(void) printf("\n    %s: %llu blkptrs, %s\n",
+		    name, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes);
 	}
 
-	if (dump_opt['d'] < 5) {
-		bplist_close(&bpl);
-		bplist_fini(&bpl);
+	if (dump_opt['d'] < 5)
 		return;
-	}
 
 	(void) printf("\n");
 
-	while (bplist_iterate(&bpl, &itor, bp) == 0) {
-		char blkbuf[BP_SPRINTF_LEN];
-
-		ASSERT(bp->blk_birth != 0);
-		sprintf_blkptr_compact(blkbuf, bp);
-		(void) printf("\tItem %3llu: %s\n",
-		    (u_longlong_t)itor - 1, blkbuf);
+	(void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL);
+}
+
+static void
+dump_deadlist(dsl_deadlist_t *dl)
+{
+	dsl_deadlist_entry_t *dle;
+	char bytes[32];
+	char comp[32];
+	char uncomp[32];
+
+	if (dump_opt['d'] < 3)
+		return;
+
+	zdb_nicenum(dl->dl_phys->dl_used, bytes);
+	zdb_nicenum(dl->dl_phys->dl_comp, comp);
+	zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp);
+	(void) printf("\n    Deadlist: %s (%s/%s comp)\n",
+	    bytes, comp, uncomp);
+
+	if (dump_opt['d'] < 4)
+		return;
+
+	(void) printf("\n");
+
+	for (dle = avl_first(&dl->dl_tree); dle;
+	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		(void) printf("      mintxg %llu -> obj %llu\n",
+		    (longlong_t)dle->dle_mintxg,
+		    (longlong_t)dle->dle_bpobj.bpo_object);
+
+		if (dump_opt['d'] >= 5)
+			dump_bpobj(&dle->dle_bpobj, "");
 	}
-
-	bplist_close(&bpl);
-	bplist_fini(&bpl);
 }
 
 static avl_tree_t idx_tree;
@@ -1404,6 +1427,10 @@
 	dump_sa_layouts,	/* SA attribute layouts		*/
 	dump_zap,		/* DSL scrub translations	*/
 	dump_none,		/* fake dedup BP		*/
+	dump_zap,		/* deadlist			*/
+	dump_none,		/* deadlist hdr			*/
+	dump_zap,		/* dsl clones			*/
+	dump_none,		/* bpobj subobjs		*/
 	dump_unknown,		/* Unknown type, must be last	*/
 };
 
@@ -1590,8 +1617,7 @@
 		dump_intent_log(dmu_objset_zil(os));
 
 	if (dmu_objset_ds(os) != NULL)
-		dump_bplist(dmu_objset_pool(os)->dp_meta_objset,
-		    dmu_objset_ds(os)->ds_phys->ds_deadlist_obj, "Deadlist");
+		dump_deadlist(&dmu_objset_ds(os)->ds_deadlist);
 
 	if (verbosity < 2)
 		return;
@@ -1867,10 +1893,11 @@
 	uint64_t	zcb_errors[256];
 	int		zcb_readfails;
 	int		zcb_haderrors;
+	spa_t		*zcb_spa;
 } zdb_cb_t;
 
 static void
-zdb_count_block(spa_t *spa, zilog_t *zilog, zdb_cb_t *zcb, const blkptr_t *bp,
+zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
     dmu_object_type_t type)
 {
 	uint64_t refcnt = 0;
@@ -1898,7 +1925,7 @@
 		ddt_t *ddt;
 		ddt_entry_t *dde;
 
-		ddt = ddt_select(spa, bp);
+		ddt = ddt_select(zcb->zcb_spa, bp);
 		ddt_enter(ddt);
 		dde = ddt_lookup(ddt, bp, B_FALSE);
 
@@ -1914,8 +1941,8 @@
 		ddt_exit(ddt);
 	}
 
-	VERIFY3U(zio_wait(zio_claim(NULL, spa,
-	    refcnt ? 0 : spa_first_txg(spa),
+	VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
+	    refcnt ? 0 : spa_first_txg(zcb->zcb_spa),
 	    bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
 }
 
@@ -1934,7 +1961,7 @@
 
 	type = BP_GET_TYPE(bp);
 
-	zdb_count_block(spa, zilog, zcb, bp, type);
+	zdb_count_block(zcb, zilog, bp, type);
 
 	is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata);
 
@@ -2048,8 +2075,7 @@
 			ddt_bp_create(ddb.ddb_checksum,
 			    &dde.dde_key, ddp, &blk);
 			if (p == DDT_PHYS_DITTO) {
-				zdb_count_block(spa, NULL, zcb, &blk,
-				    ZDB_OT_DITTO);
+				zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO);
 			} else {
 				zcb->zcb_dedup_asize +=
 				    BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1);
@@ -2070,6 +2096,8 @@
 static void
 zdb_leak_init(spa_t *spa, zdb_cb_t *zcb)
 {
+	zcb->zcb_spa = spa;
+
 	if (!dump_opt['L']) {
 		vdev_t *rvd = spa->spa_root_vdev;
 		for (int c = 0; c < rvd->vdev_children; c++) {
@@ -2111,6 +2139,22 @@
 	}
 }
 
+/* ARGSUSED */
+static int
+count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	zdb_cb_t *zcb = arg;
+
+	if (dump_opt['b'] >= 4) {
+		char blkbuf[BP_SPRINTF_LEN];
+		sprintf_blkptr(blkbuf, bp);
+		(void) printf("[%s] %s\n",
+		    "deferred free", blkbuf);
+	}
+	zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED);
+	return (0);
+}
+
 static int
 dump_block_stats(spa_t *spa)
 {
@@ -2140,26 +2184,10 @@
 	/*
 	 * If there's a deferred-free bplist, process that first.
 	 */
-	if (spa->spa_deferred_bplist_obj != 0) {
-		bplist_t *bpl = &spa->spa_deferred_bplist;
-		blkptr_t blk;
-		uint64_t itor = 0;
-
-		VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset,
-		    spa->spa_deferred_bplist_obj));
-
-		while (bplist_iterate(bpl, &itor, &blk) == 0) {
-			if (dump_opt['b'] >= 4) {
-				char blkbuf[BP_SPRINTF_LEN];
-				sprintf_blkptr(blkbuf, &blk);
-				(void) printf("[%s] %s\n",
-				    "deferred free", blkbuf);
-			}
-			zdb_count_block(spa, NULL, &zcb, &blk, ZDB_OT_DEFERRED);
-		}
-
-		bplist_close(bpl);
-	}
+	(void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj,
+	    count_block_cb, &zcb, NULL);
+	(void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj,
+	    count_block_cb, &zcb, NULL);
 
 	if (dump_opt['c'] > 1)
 		flags |= TRAVERSE_PREFETCH_DATA;
@@ -2438,8 +2466,11 @@
 	if (dump_opt['d'] || dump_opt['i']) {
 		dump_dir(dp->dp_meta_objset);
 		if (dump_opt['d'] >= 3) {
-			dump_bplist(dp->dp_meta_objset,
-			    spa->spa_deferred_bplist_obj, "Deferred frees");
+			dump_bpobj(&spa->spa_deferred_bpobj, "Deferred frees");
+			if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+				dump_bpobj(&spa->spa_dsl_pool->dp_free_bpobj,
+				    "Pool frees");
+			}
 			dump_dtl(spa->spa_root_vdev, 0);
 		}
 		(void) dmu_objset_find(spa_name(spa), dump_one_dir,
--- a/usr/src/cmd/zinject/translate.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/cmd/zinject/translate.c	Fri May 21 17:29:22 2010 -0700
@@ -359,8 +359,8 @@
 		case TYPE_CONFIG:
 			record->zi_type = DMU_OT_PACKED_NVLIST;
 			break;
-		case TYPE_BPLIST:
-			record->zi_type = DMU_OT_BPLIST;
+		case TYPE_BPOBJ:
+			record->zi_type = DMU_OT_BPOBJ;
 			break;
 		case TYPE_SPACEMAP:
 			record->zi_type = DMU_OT_SPACE_MAP;
--- a/usr/src/cmd/zinject/zinject.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/cmd/zinject/zinject.c	Fri May 21 17:29:22 2010 -0700
@@ -69,7 +69,7 @@
  * 		mos		Any data in the MOS
  * 		mosdir		object directory
  * 		config		pool configuration
- * 		bplist		blkptr list
+ * 		bpobj		blkptr list
  * 		spacemap	spacemap
  * 		metaslab	metaslab
  * 		errlog		persistent error log
@@ -163,7 +163,7 @@
 	"mosdir",
 	"metaslab",
 	"config",
-	"bplist",
+	"bpobj",
 	"spacemap",
 	"errlog",
 	"uber",
@@ -193,8 +193,8 @@
 		return ("metaslab");
 	case DMU_OT_PACKED_NVLIST:
 		return ("config");
-	case DMU_OT_BPLIST:
-		return ("bplist");
+	case DMU_OT_BPOBJ:
+		return ("bpobj");
 	case DMU_OT_SPACE_MAP:
 		return ("spacemap");
 	case DMU_OT_ERROR_LOG:
@@ -285,7 +285,7 @@
 	    "\t\t\ton a ZFS filesystem.\n"
 	    "\n"
 	    "\t-t <mos>\tInject errors into the MOS for objects of the given\n"
-	    "\t\t\ttype.  Valid types are: mos, mosdir, config, bplist,\n"
+	    "\t\t\ttype.  Valid types are: mos, mosdir, config, bpobj,\n"
 	    "\t\t\tspacemap, metaslab, errlog.  The only valid <object> is\n"
 	    "\t\t\tthe poolname.\n");
 }
--- a/usr/src/cmd/zinject/zinject.h	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/cmd/zinject/zinject.h	Fri May 21 17:29:22 2010 -0700
@@ -38,7 +38,7 @@
 	TYPE_MOSDIR,		/* MOS object directory		*/
 	TYPE_METASLAB,		/* metaslab objects		*/
 	TYPE_CONFIG,		/* MOS config			*/
-	TYPE_BPLIST,		/* block pointer list		*/
+	TYPE_BPOBJ,		/* block pointer list		*/
 	TYPE_SPACEMAP,		/* space map objects		*/
 	TYPE_ERRLOG,		/* persistent error log		*/
 	TYPE_LABEL_UBERBLOCK,	/* label specific uberblock	*/
--- a/usr/src/cmd/zpool/zpool_main.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/cmd/zpool/zpool_main.c	Fri May 21 17:29:22 2010 -0700
@@ -4007,6 +4007,8 @@
 		(void) printf(gettext(" 23  Slim ZIL\n"));
 		(void) printf(gettext(" 24  System attributes\n"));
 		(void) printf(gettext(" 25  Improved scrub stats\n"));
+		(void) printf(gettext(" 26  Improved snapshot deletion "
+		    "performance\n"));
 		(void) printf(gettext("\nFor more information on a particular "
 		    "version, including supported releases,\n"));
 		(void) printf(gettext("see the ZFS Administration Guide.\n\n"));
--- a/usr/src/cmd/ztest/ztest.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/cmd/ztest/ztest.c	Fri May 21 17:29:22 2010 -0700
@@ -94,6 +94,7 @@
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_scan.h>
+#include <sys/zio_checksum.h>
 #include <sys/refcount.h>
 #include <stdio.h>
 #include <stdio_ext.h>
@@ -3136,9 +3137,9 @@
 		fatal(0, "dmu_objset_create(%s) = %d", clone2name, error);
 	}
 
-	error = dsl_dataset_own(snap1name, B_FALSE, FTAG, &ds);
+	error = dsl_dataset_own(snap2name, B_FALSE, FTAG, &ds);
 	if (error)
-		fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error);
+		fatal(0, "dsl_dataset_own(%s) = %d", snap2name, error);
 	error = dsl_dataset_promote(clone2name, NULL);
 	if (error != EBUSY)
 		fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name,
--- a/usr/src/grub/capability	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/grub/capability	Fri May 21 17:29:22 2010 -0700
@@ -39,7 +39,7 @@
 # This file and the associated version are Solaris specific and are
 # not a part of the open source distribution of GRUB.
 #
-VERSION=17
+VERSION=18
 dboot
 xVM
 zfs
--- a/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/grub/grub-0.97/stage2/zfs-include/zfs.h	Fri May 21 17:29:22 2010 -0700
@@ -26,7 +26,7 @@
 /*
  * On-disk version number.
  */
-#define	SPA_VERSION			25ULL
+#define	SPA_VERSION			26ULL
 
 /*
  * The following are configuration names used in the nvlist describing a pool's
--- a/usr/src/uts/common/Makefile.files	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/Makefile.files	Fri May 21 17:29:22 2010 -0700
@@ -1327,6 +1327,7 @@
 ZFS_COMMON_OBJS +=		\
 	arc.o			\
 	bplist.o		\
+	bpobj.o			\
 	dbuf.o			\
 	ddt.o			\
 	ddt_zap.o		\
@@ -1340,6 +1341,7 @@
 	dnode_sync.o		\
 	dsl_dir.o		\
 	dsl_dataset.o		\
+	dsl_deadlist.o		\
 	dsl_pool.o		\
 	dsl_synctask.o		\
 	dmu_zfetch.o		\
--- a/usr/src/uts/common/fs/zfs/bplist.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/bplist.c	Fri May 21 17:29:22 2010 -0700
@@ -25,353 +25,45 @@
 #include <sys/bplist.h>
 #include <sys/zfs_context.h>
 
-void
-bplist_init(bplist_t *bpl)
-{
-	bzero(bpl, sizeof (*bpl));
-	mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
-	mutex_init(&bpl->bpl_q_lock, NULL, MUTEX_DEFAULT, NULL);
-	list_create(&bpl->bpl_queue, sizeof (bplist_q_t),
-	    offsetof(bplist_q_t, bpq_node));
-}
 
 void
-bplist_fini(bplist_t *bpl)
-{
-	ASSERT(list_is_empty(&bpl->bpl_queue));
-	list_destroy(&bpl->bpl_queue);
-	mutex_destroy(&bpl->bpl_q_lock);
-	mutex_destroy(&bpl->bpl_lock);
-}
-
-static int
-bplist_hold(bplist_t *bpl)
+bplist_create(bplist_t *bpl)
 {
-	ASSERT(MUTEX_HELD(&bpl->bpl_lock));
-	if (bpl->bpl_dbuf == NULL) {
-		int err = dmu_bonus_hold(bpl->bpl_mos,
-		    bpl->bpl_object, bpl, &bpl->bpl_dbuf);
-		if (err)
-			return (err);
-		bpl->bpl_phys = bpl->bpl_dbuf->db_data;
-	}
-	return (0);
-}
-
-uint64_t
-bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx)
-{
-	int size;
-
-	size = spa_version(dmu_objset_spa(mos)) < SPA_VERSION_BPLIST_ACCOUNT ?
-	    BPLIST_SIZE_V0 : sizeof (bplist_phys_t);
-
-	return (dmu_object_alloc(mos, DMU_OT_BPLIST, blocksize,
-	    DMU_OT_BPLIST_HDR, size, tx));
+	mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
+	    offsetof(bplist_entry_t, bpe_node));
 }
 
 void
-bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
-{
-	VERIFY(dmu_object_free(mos, object, tx) == 0);
-}
-
-int
-bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
+bplist_destroy(bplist_t *bpl)
 {
-	dmu_object_info_t doi;
-	int err;
-
-	err = dmu_object_info(mos, object, &doi);
-	if (err)
-		return (err);
-
-	mutex_enter(&bpl->bpl_lock);
-
-	ASSERT(bpl->bpl_dbuf == NULL);
-	ASSERT(bpl->bpl_phys == NULL);
-	ASSERT(bpl->bpl_cached_dbuf == NULL);
-	ASSERT(list_is_empty(&bpl->bpl_queue));
-	ASSERT(object != 0);
-	ASSERT3U(doi.doi_type, ==, DMU_OT_BPLIST);
-	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPLIST_HDR);
-
-	bpl->bpl_mos = mos;
-	bpl->bpl_object = object;
-	bpl->bpl_blockshift = highbit(doi.doi_data_block_size - 1);
-	bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
-	bpl->bpl_havecomp = (doi.doi_bonus_size == sizeof (bplist_phys_t));
-
-	mutex_exit(&bpl->bpl_lock);
-	return (0);
+	list_destroy(&bpl->bpl_list);
+	mutex_destroy(&bpl->bpl_lock);
 }
 
 void
-bplist_close(bplist_t *bpl)
+bplist_append(bplist_t *bpl, const blkptr_t *bp)
 {
-	mutex_enter(&bpl->bpl_lock);
-
-	ASSERT(list_is_empty(&bpl->bpl_queue));
+	bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);
 
-	if (bpl->bpl_cached_dbuf) {
-		dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
-		bpl->bpl_cached_dbuf = NULL;
-	}
-	if (bpl->bpl_dbuf) {
-		dmu_buf_rele(bpl->bpl_dbuf, bpl);
-		bpl->bpl_dbuf = NULL;
-		bpl->bpl_phys = NULL;
-	}
-
+	mutex_enter(&bpl->bpl_lock);
+	bpe->bpe_blk = *bp;
+	list_insert_tail(&bpl->bpl_list, bpe);
 	mutex_exit(&bpl->bpl_lock);
 }
 
-boolean_t
-bplist_empty(bplist_t *bpl)
-{
-	boolean_t rv;
-
-	if (bpl->bpl_object == 0)
-		return (B_TRUE);
-
-	mutex_enter(&bpl->bpl_lock);
-	VERIFY(0 == bplist_hold(bpl)); /* XXX */
-	rv = (bpl->bpl_phys->bpl_entries == 0);
-	mutex_exit(&bpl->bpl_lock);
-
-	return (rv);
-}
-
-static int
-bplist_cache(bplist_t *bpl, uint64_t blkid)
+void
+bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
 {
-	int err = 0;
-
-	if (bpl->bpl_cached_dbuf == NULL ||
-	    bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
-		if (bpl->bpl_cached_dbuf != NULL)
-			dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
-		err = dmu_buf_hold(bpl->bpl_mos,
-		    bpl->bpl_object, blkid << bpl->bpl_blockshift,
-		    bpl, &bpl->bpl_cached_dbuf, DMU_READ_PREFETCH);
-		ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
-		    1ULL << bpl->bpl_blockshift);
-	}
-	return (err);
-}
-
-int
-bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
-{
-	uint64_t blk, off;
-	blkptr_t *bparray;
-	int err;
+	bplist_entry_t *bpe;
 
 	mutex_enter(&bpl->bpl_lock);
-
-	err = bplist_hold(bpl);
-	if (err) {
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	do {
-		if (*itorp >= bpl->bpl_phys->bpl_entries) {
-			mutex_exit(&bpl->bpl_lock);
-			return (ENOENT);
-		}
-
-		blk = *itorp >> bpl->bpl_bpshift;
-		off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
-
-		err = bplist_cache(bpl, blk);
-		if (err) {
-			mutex_exit(&bpl->bpl_lock);
-			return (err);
-		}
-
-		bparray = bpl->bpl_cached_dbuf->db_data;
-		*bp = bparray[off];
-		(*itorp)++;
-	} while (bp->blk_birth == 0);
-
-	mutex_exit(&bpl->bpl_lock);
-	return (0);
-}
-
-int
-bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx)
-{
-	uint64_t blk, off;
-	blkptr_t *bparray;
-	int err;
-
-	ASSERT(!BP_IS_HOLE(bp));
-	mutex_enter(&bpl->bpl_lock);
-	err = bplist_hold(bpl);
-	if (err) {
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
-	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
-
-	err = bplist_cache(bpl, blk);
-	if (err) {
+	while (bpe = list_head(&bpl->bpl_list)) {
+		list_remove(&bpl->bpl_list, bpe);
 		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
-	bparray = bpl->bpl_cached_dbuf->db_data;
-	bparray[off] = *bp;
-
-	/* We never need the fill count. */
-	bparray[off].blk_fill = 0;
-
-	/* The bplist will compress better if we can leave off the checksum */
-	if (!BP_GET_DEDUP(&bparray[off]))
-		bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum));
-
-	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
-	bpl->bpl_phys->bpl_entries++;
-	bpl->bpl_phys->bpl_bytes +=
-	    bp_get_dsize_sync(dmu_objset_spa(bpl->bpl_mos), bp);
-	if (bpl->bpl_havecomp) {
-		bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp);
-		bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp);
-	}
-	mutex_exit(&bpl->bpl_lock);
-
-	return (0);
-}
-
-void
-bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx)
-{
-	VERIFY(bplist_enqueue(bpl, bp, tx) == 0);
-}
-
-/*
- * Deferred entry; will be processed later by bplist_sync().
- */
-void
-bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp)
-{
-	bplist_q_t *bpq = kmem_alloc(sizeof (*bpq), KM_SLEEP);
-
-	ASSERT(!BP_IS_HOLE(bp));
-	mutex_enter(&bpl->bpl_q_lock);
-	bpq->bpq_blk = *bp;
-	list_insert_tail(&bpl->bpl_queue, bpq);
-	mutex_exit(&bpl->bpl_q_lock);
-}
-
-void
-bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, void *arg, dmu_tx_t *tx)
-{
-	bplist_q_t *bpq;
-
-	mutex_enter(&bpl->bpl_q_lock);
-	while (bpq = list_head(&bpl->bpl_queue)) {
-		list_remove(&bpl->bpl_queue, bpq);
-		mutex_exit(&bpl->bpl_q_lock);
-		func(arg, &bpq->bpq_blk, tx);
-		kmem_free(bpq, sizeof (*bpq));
-		mutex_enter(&bpl->bpl_q_lock);
-	}
-	mutex_exit(&bpl->bpl_q_lock);
-}
-
-void
-bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
-{
-	mutex_enter(&bpl->bpl_lock);
-	ASSERT(list_is_empty(&bpl->bpl_queue));
-	VERIFY(0 == bplist_hold(bpl));
-	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
-	VERIFY(0 == dmu_free_range(bpl->bpl_mos,
-	    bpl->bpl_object, 0, -1ULL, tx));
-	bpl->bpl_phys->bpl_entries = 0;
-	bpl->bpl_phys->bpl_bytes = 0;
-	if (bpl->bpl_havecomp) {
-		bpl->bpl_phys->bpl_comp = 0;
-		bpl->bpl_phys->bpl_uncomp = 0;
+		func(arg, &bpe->bpe_blk, tx);
+		kmem_free(bpe, sizeof (*bpe));
+		mutex_enter(&bpl->bpl_lock);
 	}
 	mutex_exit(&bpl->bpl_lock);
 }
-
-int
-bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
-{
-	int err;
-
-	mutex_enter(&bpl->bpl_lock);
-
-	err = bplist_hold(bpl);
-	if (err) {
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	*usedp = bpl->bpl_phys->bpl_bytes;
-	if (bpl->bpl_havecomp) {
-		*compp = bpl->bpl_phys->bpl_comp;
-		*uncompp = bpl->bpl_phys->bpl_uncomp;
-	}
-	mutex_exit(&bpl->bpl_lock);
-
-	if (!bpl->bpl_havecomp) {
-		uint64_t itor = 0, comp = 0, uncomp = 0;
-		blkptr_t bp;
-
-		while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
-			comp += BP_GET_PSIZE(&bp);
-			uncomp += BP_GET_UCSIZE(&bp);
-		}
-		if (err == ENOENT)
-			err = 0;
-		*compp = comp;
-		*uncompp = uncomp;
-	}
-
-	return (err);
-}
-
-/*
- * Return (in *dsizep) the amount of space on the deadlist which is:
- * mintxg < blk_birth <= maxtxg
- */
-int
-bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg,
-    uint64_t *dsizep)
-{
-	uint64_t size = 0;
-	uint64_t itor = 0;
-	blkptr_t bp;
-	int err;
-
-	/*
-	 * As an optimization, if they want the whole txg range, just
-	 * get bpl_bytes rather than iterating over the bps.
-	 */
-	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) {
-		mutex_enter(&bpl->bpl_lock);
-		err = bplist_hold(bpl);
-		if (err == 0)
-			*dsizep = bpl->bpl_phys->bpl_bytes;
-		mutex_exit(&bpl->bpl_lock);
-		return (err);
-	}
-
-	while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
-		if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) {
-			size += bp_get_dsize(dmu_objset_spa(bpl->bpl_mos), &bp);
-		}
-	}
-	if (err == ENOENT)
-		err = 0;
-	*dsizep = size;
-	return (err);
-}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/bpobj.c	Fri May 21 17:29:22 2010 -0700
@@ -0,0 +1,462 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+#include <sys/refcount.h>
+
+uint64_t
+bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+	int size;
+
+	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
+		size = BPOBJ_SIZE_V0;
+	else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+		size = BPOBJ_SIZE_V1;
+	else
+		size = sizeof (bpobj_phys_t);
+
+	return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
+	    DMU_OT_BPOBJ_HDR, size, tx));
+}
+
+void
+bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+	int64_t i;
+	bpobj_t bpo;
+	dmu_object_info_t doi;
+	int epb;
+	dmu_buf_t *dbuf = NULL;
+
+	VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
+
+	mutex_enter(&bpo.bpo_lock);
+
+	if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
+		goto out;
+
+	VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
+	epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+	for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+		uint64_t *objarray;
+		uint64_t offset, blkoff;
+
+		offset = i * sizeof (uint64_t);
+		blkoff = P2PHASE(i, epb);
+
+		if (dbuf == NULL || dbuf->db_offset > offset) {
+			if (dbuf)
+				dmu_buf_rele(dbuf, FTAG);
+			VERIFY3U(0, ==, dmu_buf_hold(os,
+			    bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
+		}
+
+		ASSERT3U(offset, >=, dbuf->db_offset);
+		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+		objarray = dbuf->db_data;
+		bpobj_free(os, objarray[blkoff], tx);
+	}
+	if (dbuf) {
+		dmu_buf_rele(dbuf, FTAG);
+		dbuf = NULL;
+	}
+	VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
+
+out:
+	mutex_exit(&bpo.bpo_lock);
+	bpobj_close(&bpo);
+
+	VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
+}
+
+int
+bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
+{
+	dmu_object_info_t doi;
+	int err;
+
+	err = dmu_object_info(os, object, &doi);
+	if (err)
+		return (err);
+
+	bzero(bpo, sizeof (*bpo));
+	mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	ASSERT(bpo->bpo_dbuf == NULL);
+	ASSERT(bpo->bpo_phys == NULL);
+	ASSERT(object != 0);
+	ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
+	ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
+
+	bpo->bpo_os = os;
+	bpo->bpo_object = object;
+	bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
+	bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
+	bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+
+	err = dmu_bonus_hold(bpo->bpo_os,
+	    bpo->bpo_object, bpo, &bpo->bpo_dbuf);
+	if (err)
+		return (err);
+	bpo->bpo_phys = bpo->bpo_dbuf->db_data;
+	return (0);
+}
+
+void
+bpobj_close(bpobj_t *bpo)
+{
+	/* Lame workaround for closing a bpobj that was never opened. */
+	if (bpo->bpo_object == 0)
+		return;
+
+	dmu_buf_rele(bpo->bpo_dbuf, bpo);
+	if (bpo->bpo_cached_dbuf != NULL)
+		dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+	bpo->bpo_dbuf = NULL;
+	bpo->bpo_phys = NULL;
+	bpo->bpo_cached_dbuf = NULL;
+
+	mutex_destroy(&bpo->bpo_lock);
+}
+
+static int
+bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
+    boolean_t free)
+{
+	dmu_object_info_t doi;
+	int epb;
+	int64_t i;
+	int err = 0;
+	dmu_buf_t *dbuf = NULL;
+
+	mutex_enter(&bpo->bpo_lock);
+
+	if (free)
+		dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+	for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
+		blkptr_t *bparray;
+		blkptr_t *bp;
+		uint64_t offset, blkoff;
+
+		offset = i * sizeof (blkptr_t);
+		blkoff = P2PHASE(i, bpo->bpo_epb);
+
+		if (dbuf == NULL || dbuf->db_offset > offset) {
+			if (dbuf)
+				dmu_buf_rele(dbuf, FTAG);
+			err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
+			    FTAG, &dbuf, 0);
+			if (err)
+				break;
+		}
+
+		ASSERT3U(offset, >=, dbuf->db_offset);
+		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+		bparray = dbuf->db_data;
+		bp = &bparray[blkoff];
+		err = func(arg, bp, tx);
+		if (err)
+			break;
+		if (free) {
+			bpo->bpo_phys->bpo_bytes -=
+			    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+			if (bpo->bpo_havecomp) {
+				bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
+				bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
+			}
+			bpo->bpo_phys->bpo_num_blkptrs--;
+			ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+		}
+	}
+	if (dbuf) {
+		dmu_buf_rele(dbuf, FTAG);
+		dbuf = NULL;
+	}
+	if (free) {
+		i++;
+		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
+		    i * sizeof (blkptr_t), -1ULL, tx));
+	}
+	if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
+		goto out;
+
+	ASSERT(bpo->bpo_havecomp);
+	err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
+	if (err)
+		return (err);
+	epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+	for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+		uint64_t *objarray;
+		uint64_t offset, blkoff;
+		bpobj_t sublist;
+		uint64_t used_before, comp_before, uncomp_before;
+		uint64_t used_after, comp_after, uncomp_after;
+
+		offset = i * sizeof (uint64_t);
+		blkoff = P2PHASE(i, epb);
+
+		if (dbuf == NULL || dbuf->db_offset > offset) {
+			if (dbuf)
+				dmu_buf_rele(dbuf, FTAG);
+			err = dmu_buf_hold(bpo->bpo_os,
+			    bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
+			if (err)
+				break;
+		}
+
+		ASSERT3U(offset, >=, dbuf->db_offset);
+		ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+		objarray = dbuf->db_data;
+		err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
+		if (err)
+			break;
+		if (free) {
+			err = bpobj_space(&sublist,
+			    &used_before, &comp_before, &uncomp_before);
+			if (err)
+				break;
+		}
+		err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
+		if (free) {
+			VERIFY3U(0, ==, bpobj_space(&sublist,
+			    &used_after, &comp_after, &uncomp_after));
+			bpo->bpo_phys->bpo_bytes -= used_before - used_after;
+			ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
+			bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
+			bpo->bpo_phys->bpo_uncomp -=
+			    uncomp_before - uncomp_after;
+		}
+
+		bpobj_close(&sublist);
+		if (err)
+			break;
+		if (free) {
+			err = dmu_object_free(bpo->bpo_os,
+			    objarray[blkoff], tx);
+			if (err)
+				break;
+			bpo->bpo_phys->bpo_num_subobjs--;
+			ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
+		}
+	}
+	if (dbuf) {
+		dmu_buf_rele(dbuf, FTAG);
+		dbuf = NULL;
+	}
+	if (free) {
+		VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
+		    bpo->bpo_phys->bpo_subobjs,
+		    (i + 1) * sizeof (uint64_t), -1ULL, tx));
+	}
+
+out:
+	/* If there are no entries, there should be no bytes. */
+	ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
+	    (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
+	    bpo->bpo_phys->bpo_bytes == 0);
+
+	mutex_exit(&bpo->bpo_lock);
+	return (err);
+}
+
+/*
+ * Iterate and remove the entries.  If func returns nonzero, iteration
+ * will stop and that entry will not be removed.
+ */
+int
+bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+	return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
+}
+
+/*
+ * Iterate the entries.  If func returns nonzero, iteration will stop.
+ */
+int
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+	return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
+}
+
+void
+bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
+{
+	bpobj_t subbpo;
+	uint64_t used, comp, uncomp;
+
+	ASSERT(bpo->bpo_havesubobj);
+	ASSERT(bpo->bpo_havecomp);
+
+	VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
+	VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
+	bpobj_close(&subbpo);
+
+	if (used == 0) {
+		/* No point in having an empty subobj. */
+		bpobj_free(bpo->bpo_os, subobj, tx);
+		return;
+	}
+
+	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+	if (bpo->bpo_phys->bpo_subobjs == 0) {
+		bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
+		    DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+	}
+
+	mutex_enter(&bpo->bpo_lock);
+	dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+	    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+	    sizeof (subobj), &subobj, tx);
+	bpo->bpo_phys->bpo_num_subobjs++;
+	bpo->bpo_phys->bpo_bytes += used;
+	bpo->bpo_phys->bpo_comp += comp;
+	bpo->bpo_phys->bpo_uncomp += uncomp;
+	mutex_exit(&bpo->bpo_lock);
+}
+
+void
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	blkptr_t stored_bp = *bp;
+	uint64_t offset;
+	int blkoff;
+	blkptr_t *bparray;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	/* We never need the fill count. */
+	stored_bp.blk_fill = 0;
+
+	/* The bpobj will compress better if we can leave off the checksum */
+	if (!BP_GET_DEDUP(bp))
+		bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+
+	mutex_enter(&bpo->bpo_lock);
+
+	offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
+	blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
+
+	if (bpo->bpo_cached_dbuf == NULL ||
+	    offset < bpo->bpo_cached_dbuf->db_offset ||
+	    offset >= bpo->bpo_cached_dbuf->db_offset +
+	    bpo->bpo_cached_dbuf->db_size) {
+		if (bpo->bpo_cached_dbuf)
+			dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+		VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+		    offset, bpo, &bpo->bpo_cached_dbuf, 0));
+	}
+
+	dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
+	bparray = bpo->bpo_cached_dbuf->db_data;
+	bparray[blkoff] = stored_bp;
+
+	dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+	bpo->bpo_phys->bpo_num_blkptrs++;
+	bpo->bpo_phys->bpo_bytes +=
+	    bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+	if (bpo->bpo_havecomp) {
+		bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
+		bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
+	}
+	mutex_exit(&bpo->bpo_lock);
+}
+
+struct space_range_arg {
+	spa_t *spa;
+	uint64_t mintxg;
+	uint64_t maxtxg;
+	uint64_t used;
+	uint64_t comp;
+	uint64_t uncomp;
+};
+
+/* ARGSUSED */
+static int
+space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	struct space_range_arg *sra = arg;
+
+	if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+		sra->used += bp_get_dsize_sync(sra->spa, bp);
+		sra->comp += BP_GET_PSIZE(bp);
+		sra->uncomp += BP_GET_UCSIZE(bp);
+	}
+	return (0);
+}
+
+int
+bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	mutex_enter(&bpo->bpo_lock);
+
+	*usedp = bpo->bpo_phys->bpo_bytes;
+	if (bpo->bpo_havecomp) {
+		*compp = bpo->bpo_phys->bpo_comp;
+		*uncompp = bpo->bpo_phys->bpo_uncomp;
+		mutex_exit(&bpo->bpo_lock);
+		return (0);
+	} else {
+		mutex_exit(&bpo->bpo_lock);
+		return (bpobj_space_range(bpo, 0, UINT64_MAX,
+		    usedp, compp, uncompp));
+	}
+}
+
+/*
+ * Return the amount of space in the bpobj which is:
+ * mintxg < blk_birth <= maxtxg
+ */
+int
+bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	struct space_range_arg sra = { 0 };
+	int err;
+
+	/*
+	 * As an optimization, if they want the whole txg range, just
+	 * get bpo_bytes rather than iterating over the bps.
+	 */
+	if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
+		return (bpobj_space(bpo, usedp, compp, uncompp));
+
+	sra.spa = dmu_objset_spa(bpo->bpo_os);
+	sra.mintxg = mintxg;
+	sra.maxtxg = maxtxg;
+
+	err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
+	*usedp = sra.used;
+	*compp = sra.comp;
+	*uncompp = sra.uncomp;
+	return (err);
+}
--- a/usr/src/uts/common/fs/zfs/dmu.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Fri May 21 17:29:22 2010 -0700
@@ -51,8 +51,8 @@
 	{	byteswap_uint64_array,	TRUE,	"object array"		},
 	{	byteswap_uint8_array,	TRUE,	"packed nvlist"		},
 	{	byteswap_uint64_array,	TRUE,	"packed nvlist size"	},
-	{	byteswap_uint64_array,	TRUE,	"bplist"		},
-	{	byteswap_uint64_array,	TRUE,	"bplist header"		},
+	{	byteswap_uint64_array,	TRUE,	"bpobj"			},
+	{	byteswap_uint64_array,	TRUE,	"bpobj header"		},
 	{	byteswap_uint64_array,	TRUE,	"SPA space map header"	},
 	{	byteswap_uint64_array,	TRUE,	"SPA space map"		},
 	{	byteswap_uint64_array,	TRUE,	"ZIL intent log"	},
@@ -96,6 +96,10 @@
 	{	zap_byteswap,		TRUE,	"SA attr layouts"	},
 	{	zap_byteswap,		TRUE,	"scan translations"	},
 	{	byteswap_uint8_array,	FALSE,	"deduplicated block"	},
+	{	zap_byteswap,		TRUE,	"DSL deadlist map"	},
+	{	byteswap_uint64_array,	TRUE,	"DSL deadlist map hdr"	},
+	{	zap_byteswap,		TRUE,	"DSL dir clones"	},
+	{	byteswap_uint64_array,	TRUE,	"bpobj subobj"		},
 };
 
 int
--- a/usr/src/uts/common/fs/zfs/dmu_send.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c	Fri May 21 17:29:22 2010 -0700
@@ -39,6 +39,7 @@
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_znode.h>
+#include <zfs_fletcher.h>
 #include <sys/avl.h>
 #include <sys/ddt.h>
 
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c	Fri May 21 17:29:22 2010 -0700
@@ -39,6 +39,7 @@
 #include <sys/zfs_znode.h>
 #include <sys/zvol.h>
 #include <sys/dsl_scan.h>
+#include <sys/dsl_deadlist.h>
 
 /*
  * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
@@ -51,6 +52,13 @@
 static dsl_syncfunc_t dsl_dataset_destroy_begin_sync;
 static dsl_syncfunc_t dsl_dataset_set_reservation_sync;
 
+#define	SWITCH64(x, y) \
+	{ \
+		uint64_t __tmp = (x); \
+		(x) = (y); \
+		(y) = __tmp; \
+	}
+
 #define	DS_REF_MAX	(1ULL << 62)
 
 #define	DSL_DEADLIST_BLOCKSIZE	SPA_MAXBLOCKSIZE
@@ -178,13 +186,13 @@
 			/*
 			 * We are here as part of zio's write done callback,
 			 * which means we're a zio interrupt thread.  We can't
-			 * call bplist_enqueue() now because it may block
+			 * call dsl_deadlist_insert() now because it may block
 			 * waiting for I/O.  Instead, put bp on the deferred
 			 * queue and let dsl_pool_sync() finish the job.
 			 */
-			bplist_enqueue_deferred(&ds->ds_deadlist, bp);
+			bplist_append(&ds->ds_pending_deadlist, bp);
 		} else {
-			VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
+			dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
 		}
 		ASSERT3U(ds->ds_prev->ds_object, ==,
 		    ds->ds_phys->ds_prev_snap_obj);
@@ -269,7 +277,13 @@
 		ds->ds_prev = NULL;
 	}
 
-	bplist_close(&ds->ds_deadlist);
+	bplist_destroy(&ds->ds_pending_deadlist);
+	if (db != NULL) {
+		dsl_deadlist_close(&ds->ds_deadlist);
+	} else {
+		ASSERT(ds->ds_deadlist.dl_dbuf == NULL);
+		ASSERT(!ds->ds_deadlist.dl_oldfmt);
+	}
 	if (ds->ds_dir)
 		dsl_dir_close(ds->ds_dir, ds);
 
@@ -280,7 +294,6 @@
 	mutex_destroy(&ds->ds_opening_lock);
 	rw_destroy(&ds->ds_rwlock);
 	cv_destroy(&ds->ds_exclusive_cv);
-	bplist_fini(&ds->ds_deadlist);
 
 	kmem_free(ds, sizeof (dsl_dataset_t));
 }
@@ -380,25 +393,23 @@
 		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
 		rw_init(&ds->ds_rwlock, 0, 0, 0);
 		cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL);
-		bplist_init(&ds->ds_deadlist);
-
-		err = bplist_open(&ds->ds_deadlist,
+
+		bplist_create(&ds->ds_pending_deadlist);
+		dsl_deadlist_open(&ds->ds_deadlist,
 		    mos, ds->ds_phys->ds_deadlist_obj);
+
 		if (err == 0) {
 			err = dsl_dir_open_obj(dp,
 			    ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
 		}
 		if (err) {
-			/*
-			 * we don't really need to close the blist if we
-			 * just opened it.
-			 */
 			mutex_destroy(&ds->ds_lock);
 			mutex_destroy(&ds->ds_recvlock);
 			mutex_destroy(&ds->ds_opening_lock);
 			rw_destroy(&ds->ds_rwlock);
 			cv_destroy(&ds->ds_exclusive_cv);
-			bplist_fini(&ds->ds_deadlist);
+			bplist_destroy(&ds->ds_pending_deadlist);
+			dsl_deadlist_close(&ds->ds_deadlist);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			dmu_buf_rele(dbuf, tag);
 			return (err);
@@ -455,7 +466,8 @@
 			    dsl_dataset_evict);
 		}
 		if (err || winner) {
-			bplist_close(&ds->ds_deadlist);
+			bplist_destroy(&ds->ds_pending_deadlist);
+			dsl_deadlist_close(&ds->ds_deadlist);
 			if (ds->ds_prev)
 				dsl_dataset_drop_ref(ds->ds_prev, ds);
 			dsl_dir_close(ds->ds_dir, ds);
@@ -464,7 +476,6 @@
 			mutex_destroy(&ds->ds_opening_lock);
 			rw_destroy(&ds->ds_rwlock);
 			cv_destroy(&ds->ds_exclusive_cv);
-			bplist_fini(&ds->ds_deadlist);
 			kmem_free(ds, sizeof (dsl_dataset_t));
 			if (err) {
 				dmu_buf_rele(dbuf, tag);
@@ -726,7 +737,7 @@
 	if (ds->ds_dbuf)
 		dsl_dataset_drop_ref(ds, tag);
 	else
-		dsl_dataset_evict(ds->ds_dbuf, ds);
+		dsl_dataset_evict(NULL, ds);
 }
 
 boolean_t
@@ -788,10 +799,12 @@
 	    DMU_OT_NONE, 0, tx);
 	dsphys->ds_creation_time = gethrestime_sec();
 	dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
-	dsphys->ds_deadlist_obj =
-	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
-
-	if (origin) {
+
+	if (origin == NULL) {
+		dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
+	} else {
+		dsl_dataset_t *ohds;
+
 		dsphys->ds_prev_snap_obj = origin->ds_object;
 		dsphys->ds_prev_snap_txg =
 		    origin->ds_phys->ds_creation_txg;
@@ -807,6 +820,12 @@
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
 		origin->ds_phys->ds_num_children++;
 
+		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+		    origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
+		dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
+		    dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
+		dsl_dataset_rele(ohds, FTAG);
+
 		if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
 			if (origin->ds_phys->ds_next_clones_obj == 0) {
 				origin->ds_phys->ds_next_clones_obj =
@@ -820,6 +839,16 @@
 
 		dmu_buf_will_dirty(dd->dd_dbuf, tx);
 		dd->dd_phys->dd_origin_obj = origin->ds_object;
+		if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+			if (origin->ds_dir->dd_phys->dd_clones == 0) {
+				dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+				origin->ds_dir->dd_phys->dd_clones =
+				    zap_create(mos,
+				    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+			}
+			VERIFY3U(0, ==, zap_add_int(mos,
+			    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+		}
 	}
 
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
@@ -1201,8 +1230,7 @@
 	else
 		mrs_used = 0;
 
-	VERIFY(0 == bplist_space(&ds->ds_deadlist, &dlused, &dlcomp,
-	    &dluncomp));
+	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
 
 	ASSERT3U(dlused, <=, mrs_used);
 	ds->ds_phys->ds_unique_bytes =
@@ -1462,6 +1490,103 @@
 	ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
 }
 
+static void
+dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx)
+{
+	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	/*
+	 * If it is the old version, dd_clones doesn't exist so we can't
+	 * find the clones, but deadlist_remove_key() is a no-op so it
+	 * doesn't matter.
+	 */
+	if (ds->ds_dir->dd_phys->dd_clones == 0)
+		return;
+
+	for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		dsl_dataset_t *clone;
+
+		VERIFY3U(0, ==, dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+		    za.za_first_integer, FTAG, &clone));
+		if (clone->ds_dir->dd_origin_txg > mintxg) {
+			dsl_deadlist_remove_key(&clone->ds_deadlist,
+			    mintxg, tx);
+			dsl_dataset_remove_clones_key(clone, mintxg, tx);
+		}
+		dsl_dataset_rele(clone, FTAG);
+	}
+	zap_cursor_fini(&zc);
+}
+
+struct process_old_arg {
+	dsl_dataset_t *ds;
+	dsl_dataset_t *ds_prev;
+	boolean_t after_branch_point;
+	zio_t *pio;
+	uint64_t used, comp, uncomp;
+};
+
+static int
+process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	struct process_old_arg *poa = arg;
+	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
+
+	if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
+		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
+		if (poa->ds_prev && !poa->after_branch_point &&
+		    bp->blk_birth >
+		    poa->ds_prev->ds_phys->ds_prev_snap_txg) {
+			poa->ds_prev->ds_phys->ds_unique_bytes +=
+			    bp_get_dsize_sync(dp->dp_spa, bp);
+		}
+	} else {
+		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+		poa->comp += BP_GET_PSIZE(bp);
+		poa->uncomp += BP_GET_UCSIZE(bp);
+		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+	}
+	return (0);
+}
+
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+	struct process_old_arg poa = { 0 };
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+
+	ASSERT(ds->ds_deadlist.dl_oldfmt);
+	ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+
+	poa.ds = ds;
+	poa.ds_prev = ds_prev;
+	poa.after_branch_point = after_branch_point;
+	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	VERIFY3U(0, ==, bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+	    process_old_cb, &poa, tx));
+	VERIFY3U(zio_wait(poa.pio), ==, 0);
+	ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
+
+	/* change snapused */
+	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+	    -poa.used, -poa.comp, -poa.uncomp, tx);
+
+	/* swap next's deadlist to our deadlist */
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_close(&ds_next->ds_deadlist);
+	SWITCH64(ds_next->ds_phys->ds_deadlist_obj,
+	    ds->ds_phys->ds_deadlist_obj);
+	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+	dsl_deadlist_open(&ds_next->ds_deadlist, mos,
+	    ds_next->ds_phys->ds_deadlist_obj);
+}
+
 void
 dsl_dataset_destroy_sync(void *arg1, void *tag, dmu_tx_t *tx)
 {
@@ -1566,12 +1691,9 @@
 	}
 
 	if (dsl_dataset_is_snapshot(ds)) {
-		blkptr_t bp;
-		zio_t *pio;
 		dsl_dataset_t *ds_next;
-		uint64_t itor = 0;
 		uint64_t old_unique;
-		int64_t used = 0, compressed = 0, uncompressed = 0;
+		uint64_t used = 0, comp = 0, uncomp = 0;
 
 		VERIFY(0 == dsl_dataset_hold_obj(dp,
 		    ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
@@ -1587,53 +1709,49 @@
 		ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
 		    ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
 
-		pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-
-		/*
-		 * Transfer to our deadlist (which will become next's
-		 * new deadlist) any entries from next's current
-		 * deadlist which were born before prev, and free the
-		 * other entries.
-		 *
-		 * XXX we're doing this long task with the config lock held
-		 */
-		while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) {
-			if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
-				VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
-				    &bp, tx));
-				if (ds_prev && !after_branch_point &&
-				    bp.blk_birth >
-				    ds_prev->ds_phys->ds_prev_snap_txg) {
-					ds_prev->ds_phys->ds_unique_bytes +=
-					    bp_get_dsize_sync(dp->dp_spa, &bp);
-				}
-			} else {
-				used += bp_get_dsize_sync(dp->dp_spa, &bp);
-				compressed += BP_GET_PSIZE(&bp);
-				uncompressed += BP_GET_UCSIZE(&bp);
-				dsl_free_sync(pio, dp, tx->tx_txg, &bp);
+
+		if (ds_next->ds_deadlist.dl_oldfmt) {
+			process_old_deadlist(ds, ds_prev, ds_next,
+			    after_branch_point, tx);
+		} else {
+			/* Adjust prev's unique space. */
+			if (ds_prev && !after_branch_point) {
+				dsl_deadlist_space_range(&ds_next->ds_deadlist,
+				    ds_prev->ds_phys->ds_prev_snap_txg,
+				    ds->ds_phys->ds_prev_snap_txg,
+				    &used, &comp, &uncomp);
+				ds_prev->ds_phys->ds_unique_bytes += used;
 			}
+
+			/* Adjust snapused. */
+			dsl_deadlist_space_range(&ds_next->ds_deadlist,
+			    ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+			    &used, &comp, &uncomp);
+			dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+			    -used, -comp, -uncomp, tx);
+
+			/* Move blocks to be freed to pool's free list. */
+			dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
+			    &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
+			    tx);
+			dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+			    DD_USED_HEAD, used, comp, uncomp, tx);
+			dsl_dir_dirty(tx->tx_pool->dp_free_dir, tx);
+
+			/* Merge our deadlist into next's and free it. */
+			dsl_deadlist_merge(&ds_next->ds_deadlist,
+			    ds->ds_phys->ds_deadlist_obj, tx);
 		}
-		VERIFY3U(zio_wait(pio), ==, 0);
-		ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes);
-
-		/* change snapused */
-		dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
-		    -used, -compressed, -uncompressed, tx);
-
-		/* free next's deadlist */
-		bplist_close(&ds_next->ds_deadlist);
-		bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx);
-
-		/* set next's deadlist to our deadlist */
-		bplist_close(&ds->ds_deadlist);
-		ds_next->ds_phys->ds_deadlist_obj =
-		    ds->ds_phys->ds_deadlist_obj;
-		VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
-		    ds_next->ds_phys->ds_deadlist_obj));
-		ds->ds_phys->ds_deadlist_obj = 0;
+		dsl_deadlist_close(&ds->ds_deadlist);
+		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+
+		/* Collapse range in clone heads */
+		dsl_dataset_remove_clones_key(ds,
+		    ds->ds_phys->ds_creation_txg, tx);
 
 		if (dsl_dataset_is_snapshot(ds_next)) {
+			dsl_dataset_t *ds_nextnext;
+
 			/*
 			 * Update next's unique to include blocks which
 			 * were previously shared by only this snapshot
@@ -1642,25 +1760,27 @@
 			 * died after the next snap and before the one
 			 * after that (ie. be on the snap after next's
 			 * deadlist).
-			 *
-			 * XXX we're doing this long task with the
-			 * config lock held
 			 */
-			dsl_dataset_t *ds_after_next;
-			uint64_t space;
-
 			VERIFY(0 == dsl_dataset_hold_obj(dp,
 			    ds_next->ds_phys->ds_next_snap_obj,
-			    FTAG, &ds_after_next));
-
-			VERIFY(0 ==
-			    bplist_space_birthrange(&ds_after_next->ds_deadlist,
+			    FTAG, &ds_nextnext));
+			dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
 			    ds->ds_phys->ds_prev_snap_txg,
-			    ds->ds_phys->ds_creation_txg, &space));
-			ds_next->ds_phys->ds_unique_bytes += space;
-
-			dsl_dataset_rele(ds_after_next, FTAG);
+			    ds->ds_phys->ds_creation_txg,
+			    &used, &comp, &uncomp);
+			ds_next->ds_phys->ds_unique_bytes += used;
+			dsl_dataset_rele(ds_nextnext, FTAG);
 			ASSERT3P(ds_next->ds_prev, ==, NULL);
+
+			/* Collapse range in this head. */
+			dsl_dataset_t *hds;
+			VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+			    ds->ds_dir->dd_phys->dd_head_dataset_obj,
+			    FTAG, &hds));
+			dsl_deadlist_remove_key(&hds->ds_deadlist,
+			    ds->ds_phys->ds_creation_txg, tx);
+			dsl_dataset_rele(hds, FTAG);
+
 		} else {
 			ASSERT3P(ds_next->ds_prev, ==, ds);
 			dsl_dataset_drop_ref(ds_next->ds_prev, ds_next);
@@ -1700,9 +1820,8 @@
 		 */
 		struct killarg ka;
 
-		ASSERT(after_branch_point || bplist_empty(&ds->ds_deadlist));
-		bplist_close(&ds->ds_deadlist);
-		bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
+		dsl_deadlist_close(&ds->ds_deadlist);
+		dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
 		ds->ds_phys->ds_deadlist_obj = 0;
 
 		/*
@@ -1721,6 +1840,11 @@
 		    ds->ds_phys->ds_unique_bytes == 0);
 
 		if (ds->ds_prev != NULL) {
+			if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+				VERIFY3U(0, ==, zap_remove_int(mos,
+				    ds->ds_prev->ds_dir->dd_phys->dd_clones,
+				    ds->ds_object, tx));
+			}
 			dsl_dataset_rele(ds->ds_prev, ds);
 			ds->ds_prev = ds_prev = NULL;
 		}
@@ -1935,20 +2059,24 @@
 		    delta, 0, 0, tx);
 	}
 
-	bplist_close(&ds->ds_deadlist);
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
+	zfs_dbgmsg("taking snapshot %s@%s/%llu; newkey=%llu",
+	    ds->ds_dir->dd_myname, snapname, dsobj,
+	    ds->ds_phys->ds_prev_snap_txg);
+	ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
+	    UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+	dsl_deadlist_add_key(&ds->ds_deadlist,
+	    ds->ds_phys->ds_prev_snap_txg, tx);
+
 	ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
 	ds->ds_phys->ds_prev_snap_obj = dsobj;
 	ds->ds_phys->ds_prev_snap_txg = crtxg;
 	ds->ds_phys->ds_unique_bytes = 0;
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
 		ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
-	ds->ds_phys->ds_deadlist_obj =
-	    bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
-	VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
-	    ds->ds_phys->ds_deadlist_obj));
-
-	dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
+
 	err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
 	    snapname, 8, 1, &dsobj, tx);
 	ASSERT(err == 0);
@@ -2357,6 +2485,7 @@
 	struct promotenode *snap = list_head(&pa->shared_snaps);
 	dsl_dataset_t *origin_ds = snap->ds;
 	int err;
+	uint64_t unused;
 
 	/* Check that it is a real clone */
 	if (!dsl_dir_is_clone(hds->ds_dir))
@@ -2372,10 +2501,9 @@
 	/* compute origin's new unique space */
 	snap = list_tail(&pa->clone_snaps);
 	ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
-	err = bplist_space_birthrange(&snap->ds->ds_deadlist,
-	    origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique);
-	if (err)
-		return (err);
+	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+	    origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+	    &pa->unique, &unused, &unused);
 
 	/*
 	 * Walk the snapshots that we are moving
@@ -2414,9 +2542,8 @@
 		if (ds->ds_phys->ds_prev_snap_obj == 0)
 			continue;
 
-		if (err = bplist_space(&ds->ds_deadlist,
-		    &dlused, &dlcomp, &dluncomp))
-			goto out;
+		dsl_deadlist_space(&ds->ds_deadlist,
+		    &dlused, &dlcomp, &dluncomp);
 		pa->used += dlused;
 		pa->comp += dlcomp;
 		pa->uncomp += dluncomp;
@@ -2450,7 +2577,7 @@
 		/*
 		 * Note, typically this will not be a clone of a clone,
 		 * so dd_origin_txg will be < TXG_INITIAL, so
-		 * these snaplist_space() -> bplist_space_birthrange()
+		 * these snaplist_space() -> dsl_deadlist_space_range()
 		 * calls will be fast because they do not have to
 		 * iterate over all bps.
 		 */
@@ -2530,6 +2657,26 @@
 	origin_head->ds_dir->dd_origin_txg =
 	    origin_ds->ds_phys->ds_creation_txg;
 
+	/* change dd_clone entries */
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    odd->dd_phys->dd_clones, hds->ds_object, tx));
+		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+		    pa->origin_origin->ds_dir->dd_phys->dd_clones,
+		    hds->ds_object, tx));
+
+		VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+		    pa->origin_origin->ds_dir->dd_phys->dd_clones,
+		    origin_head->ds_object, tx));
+		if (dd->dd_phys->dd_clones == 0) {
+			dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
+			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+		}
+		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+		    dd->dd_phys->dd_clones, origin_head->ds_object, tx));
+
+	}
+
 	/* move snapshots to this dir */
 	for (snap = list_head(&pa->shared_snaps); snap;
 	    snap = list_next(&pa->shared_snaps, snap)) {
@@ -2547,6 +2694,7 @@
 		VERIFY(0 == zap_add(dp->dp_meta_objset,
 		    hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
 		    8, 1, &ds->ds_object, tx));
+
 		/* change containing dsl_dir */
 		dmu_buf_will_dirty(ds->ds_dbuf, tx);
 		ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
@@ -2556,6 +2704,40 @@
 		VERIFY(0 == dsl_dir_open_obj(dp, dd->dd_object,
 		    NULL, ds, &ds->ds_dir));
 
+		/* move any clone references */
+		if (ds->ds_phys->ds_next_clones_obj &&
+		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+			zap_cursor_t zc;
+			zap_attribute_t za;
+
+			for (zap_cursor_init(&zc, dp->dp_meta_objset,
+			    ds->ds_phys->ds_next_clones_obj);
+			    zap_cursor_retrieve(&zc, &za) == 0;
+			    zap_cursor_advance(&zc)) {
+				dsl_dataset_t *cnds;
+				uint64_t o;
+
+				if (za.za_first_integer == oldnext_obj) {
+					/*
+					 * We've already moved the
+					 * origin's reference.
+					 */
+					continue;
+				}
+
+				VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+				    za.za_first_integer, FTAG, &cnds));
+				o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
+
+				VERIFY3U(zap_remove_int(dp->dp_meta_objset,
+				    odd->dd_phys->dd_clones, o, tx), ==, 0);
+				VERIFY3U(zap_add_int(dp->dp_meta_objset,
+				    dd->dd_phys->dd_clones, o, tx), ==, 0);
+				dsl_dataset_rele(cnds, FTAG);
+			}
+			zap_cursor_fini(&zc);
+		}
+
 		ASSERT3U(dsl_prop_numcb(ds), ==, 0);
 	}
 
@@ -2651,11 +2833,9 @@
 
 	*spacep = 0;
 	for (snap = list_head(l); snap; snap = list_next(l, snap)) {
-		uint64_t used;
-		int err = bplist_space_birthrange(&snap->ds->ds_deadlist,
-		    mintxg, UINT64_MAX, &used);
-		if (err)
-			return (err);
+		uint64_t used, comp, uncomp;
+		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+		    mintxg, UINT64_MAX, &used, &comp, &uncomp);
 		*spacep += used;
 	}
 	return (0);
@@ -2742,10 +2922,10 @@
 	if (err != 0)
 		goto out;
 
-	if (dsl_dir_is_clone(snap->ds->ds_dir)) {
-		err = dsl_dataset_own_obj(dp,
+	if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+		err = dsl_dataset_hold_obj(dp,
 		    snap->ds->ds_dir->dd_phys->dd_origin_obj,
-		    0, FTAG, &pa.origin_origin);
+		    FTAG, &pa.origin_origin);
 		if (err != 0)
 			goto out;
 	}
@@ -2770,7 +2950,7 @@
 	snaplist_destroy(&pa.clone_snaps, B_FALSE);
 	snaplist_destroy(&pa.origin_snaps, B_FALSE);
 	if (pa.origin_origin)
-		dsl_dataset_disown(pa.origin_origin, FTAG);
+		dsl_dataset_rele(pa.origin_origin, FTAG);
 	dsl_dataset_rele(ds, FTAG);
 	return (err);
 }
@@ -2860,10 +3040,12 @@
 	 */
 	if (csa->cds->ds_prev) {
 		dsl_dataset_t *origin = csa->cds->ds_prev;
+		uint64_t comp, uncomp;
+
 		dmu_buf_will_dirty(origin->ds_dbuf, tx);
-		VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
+		dsl_deadlist_space_range(&csa->cds->ds_deadlist,
 		    origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
-		    &origin->ds_phys->ds_unique_bytes));
+		    &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
 	}
 
 	/* swap blkptrs */
@@ -2883,10 +3065,10 @@
 		ASSERT3U(csa->cds->ds_dir->dd_phys->
 		    dd_used_breakdown[DD_USED_SNAP], ==, 0);
 
-		VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used,
-		    &cdl_comp, &cdl_uncomp));
-		VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used,
-		    &odl_comp, &odl_uncomp));
+		dsl_deadlist_space(&csa->cds->ds_deadlist,
+		    &cdl_used, &cdl_comp, &cdl_uncomp);
+		dsl_deadlist_space(&csa->ohds->ds_deadlist,
+		    &odl_used, &odl_comp, &odl_uncomp);
 
 		dused = csa->cds->ds_phys->ds_used_bytes + cdl_used -
 		    (csa->ohds->ds_phys->ds_used_bytes + odl_used);
@@ -2907,21 +3089,16 @@
 		 * deadlist (since that's the only thing that's
 		 * changing that affects the snapused).
 		 */
-		VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist,
-		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &cdl_used));
-		VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist,
-		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX, &odl_used));
+		dsl_deadlist_space_range(&csa->cds->ds_deadlist,
+		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
+		    &cdl_used, &cdl_comp, &cdl_uncomp);
+		dsl_deadlist_space_range(&csa->ohds->ds_deadlist,
+		    csa->ohds->ds_dir->dd_origin_txg, UINT64_MAX,
+		    &odl_used, &odl_comp, &odl_uncomp);
 		dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used,
 		    DD_USED_HEAD, DD_USED_SNAP, tx);
 	}
 
-#define	SWITCH64(x, y) \
-	{ \
-		uint64_t __tmp = (x); \
-		(x) = (y); \
-		(y) = __tmp; \
-	}
-
 	/* swap ds_*_bytes */
 	SWITCH64(csa->ohds->ds_phys->ds_used_bytes,
 	    csa->cds->ds_phys->ds_used_bytes);
@@ -2936,15 +3113,17 @@
 	dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV,
 	    csa->unused_refres_delta, 0, 0, tx);
 
-	/* swap deadlists */
-	bplist_close(&csa->cds->ds_deadlist);
-	bplist_close(&csa->ohds->ds_deadlist);
+	/*
+	 * Swap deadlists.
+	 */
+	dsl_deadlist_close(&csa->cds->ds_deadlist);
+	dsl_deadlist_close(&csa->ohds->ds_deadlist);
 	SWITCH64(csa->ohds->ds_phys->ds_deadlist_obj,
 	    csa->cds->ds_phys->ds_deadlist_obj);
-	VERIFY(0 == bplist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
-	    csa->cds->ds_phys->ds_deadlist_obj));
-	VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
-	    csa->ohds->ds_phys->ds_deadlist_obj));
+	dsl_deadlist_open(&csa->cds->ds_deadlist, dp->dp_meta_objset,
+	    csa->cds->ds_phys->ds_deadlist_obj);
+	dsl_deadlist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
+	    csa->ohds->ds_phys->ds_deadlist_obj);
 
 	dsl_scan_ds_clone_swapped(csa->ohds, csa->cds, tx);
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/dsl_deadlist.c	Fri May 21 17:29:22 2010 -0700
@@ -0,0 +1,474 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/dsl_dataset.h>
+#include <sys/dmu.h>
+#include <sys/refcount.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_pool.h>
+
+static int
+dsl_deadlist_compare(const void *arg1, const void *arg2)
+{
+	const dsl_deadlist_entry_t *dle1 = arg1;
+	const dsl_deadlist_entry_t *dle2 = arg2;
+
+	if (dle1->dle_mintxg < dle2->dle_mintxg)
+		return (-1);
+	else if (dle1->dle_mintxg > dle2->dle_mintxg)
+		return (+1);
+	else
+		return (0);
+}
+
+static void
+dsl_deadlist_load_tree(dsl_deadlist_t *dl)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	ASSERT(!dl->dl_oldfmt);
+	if (dl->dl_havetree)
+		return;
+
+	avl_create(&dl->dl_tree, dsl_deadlist_compare,
+	    sizeof (dsl_deadlist_entry_t),
+	    offsetof(dsl_deadlist_entry_t, dle_node));
+	for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+		dle->dle_mintxg = strtonum(za.za_name, NULL);
+		VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
+		    za.za_first_integer));
+		avl_add(&dl->dl_tree, dle);
+	}
+	zap_cursor_fini(&zc);
+	dl->dl_havetree = B_TRUE;
+}
+
+void
+dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
+{
+	dmu_object_info_t doi;
+
+	mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
+	dl->dl_os = os;
+	dl->dl_object = object;
+	VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
+	dmu_object_info_from_db(dl->dl_dbuf, &doi);
+	if (doi.doi_type == DMU_OT_BPOBJ) {
+		dmu_buf_rele(dl->dl_dbuf, dl);
+		dl->dl_dbuf = NULL;
+		dl->dl_oldfmt = B_TRUE;
+		VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
+		return;
+	}
+
+	dl->dl_oldfmt = B_FALSE;
+	dl->dl_phys = dl->dl_dbuf->db_data;
+	dl->dl_havetree = B_FALSE;
+}
+
+void
+dsl_deadlist_close(dsl_deadlist_t *dl)
+{
+	void *cookie = NULL;
+	dsl_deadlist_entry_t *dle;
+
+	if (dl->dl_oldfmt) {
+		dl->dl_oldfmt = B_FALSE;
+		bpobj_close(&dl->dl_bpobj);
+		return;
+	}
+
+	if (dl->dl_havetree) {
+		while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
+		    != NULL) {
+			bpobj_close(&dle->dle_bpobj);
+			kmem_free(dle, sizeof (*dle));
+		}
+		avl_destroy(&dl->dl_tree);
+	}
+	dmu_buf_rele(dl->dl_dbuf, dl);
+	mutex_destroy(&dl->dl_lock);
+	dl->dl_dbuf = NULL;
+	dl->dl_phys = NULL;
+}
+
+uint64_t
+dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
+{
+	if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+		return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+	return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
+	    sizeof (dsl_deadlist_phys_t), tx));
+}
+
+void
+dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
+{
+	dmu_object_info_t doi;
+	zap_cursor_t zc;
+	zap_attribute_t za;
+
+	VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
+	if (doi.doi_type == DMU_OT_BPOBJ) {
+		bpobj_free(os, dlobj, tx);
+		return;
+	}
+
+	for (zap_cursor_init(&zc, os, dlobj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc))
+		bpobj_free(os, za.za_first_integer, tx);
+	zap_cursor_fini(&zc);
+	VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
+}
+
+void
+dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+
+	if (dl->dl_oldfmt) {
+		bpobj_enqueue(&dl->dl_bpobj, bp, tx);
+		return;
+	}
+
+	dsl_deadlist_load_tree(dl);
+
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
+	mutex_enter(&dl->dl_lock);
+	dl->dl_phys->dl_used +=
+	    bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
+	dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
+	dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
+	mutex_exit(&dl->dl_lock);
+
+	dle_tofind.dle_mintxg = bp->blk_birth;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	if (dle == NULL)
+		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+	else
+		dle = AVL_PREV(&dl->dl_tree, dle);
+	bpobj_enqueue(&dle->dle_bpobj, bp, tx);
+}
+
+/*
+ * Insert new key in deadlist, which must be > all current entries.
+ * mintxg is not inclusive.
+ */
+void
+dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+	uint64_t obj;
+	dsl_deadlist_entry_t *dle;
+
+	if (dl->dl_oldfmt)
+		return;
+
+	dsl_deadlist_load_tree(dl);
+
+	dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+	dle->dle_mintxg = mintxg;
+	obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+	VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+	avl_add(&dl->dl_tree, dle);
+
+	VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
+	    mintxg, obj, tx));
+}
+
+/*
+ * Remove this key, merging its entries into the previous key.
+ */
+void
+dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle, *dle_prev;
+
+	if (dl->dl_oldfmt)
+		return;
+
+	dsl_deadlist_load_tree(dl);
+
+	dle_tofind.dle_mintxg = mintxg;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+	dle_prev = AVL_PREV(&dl->dl_tree, dle);
+
+	bpobj_enqueue_subobj(&dle_prev->dle_bpobj,
+	    dle->dle_bpobj.bpo_object, tx);
+
+	avl_remove(&dl->dl_tree, dle);
+	bpobj_close(&dle->dle_bpobj);
+	kmem_free(dle, sizeof (*dle));
+
+	VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
+}
+
+/*
+ * Walk ds's snapshots to regenerate the deadlist ZAP & AVL.
+ */
+static void
+dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
+    uint64_t mrs_obj, dmu_tx_t *tx)
+{
+	dsl_deadlist_t dl;
+	dsl_pool_t *dp = dmu_objset_pool(os);
+
+	dsl_deadlist_open(&dl, os, dlobj);
+	if (dl.dl_oldfmt) {
+		dsl_deadlist_close(&dl);
+		return;
+	}
+
+	while (mrs_obj != 0) {
+		dsl_dataset_t *ds;
+		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
+		dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx);
+		mrs_obj = ds->ds_phys->ds_prev_snap_obj;
+		dsl_dataset_rele(ds, FTAG);
+	}
+	dsl_deadlist_close(&dl);
+}
+
+uint64_t
+dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+    uint64_t mrs_obj, dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t *dle;
+	uint64_t newobj;
+
+	newobj = dsl_deadlist_alloc(dl->dl_os, tx);
+
+	if (dl->dl_oldfmt) {
+		dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
+		return (newobj);
+	}
+
+	dsl_deadlist_load_tree(dl);
+
+	for (dle = avl_first(&dl->dl_tree); dle;
+	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		uint64_t obj;
+
+		if (dle->dle_mintxg >= maxtxg)
+			break;
+
+		obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+		VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
+		    dle->dle_mintxg, obj, tx));
+	}
+	return (newobj);
+}
+
+void
+dsl_deadlist_space(dsl_deadlist_t *dl,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	if (dl->dl_oldfmt) {
+		VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
+		    usedp, compp, uncompp));
+		return;
+	}
+
+	mutex_enter(&dl->dl_lock);
+	*usedp = dl->dl_phys->dl_used;
+	*compp = dl->dl_phys->dl_comp;
+	*uncompp = dl->dl_phys->dl_uncomp;
+	mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * return space used in the range (mintxg, maxtxg].
+ * Includes maxtxg, does not include mintxg.
+ * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
+ * UINT64_MAX).
+ */
+void
+dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+
+	if (dl->dl_oldfmt) {
+		VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
+		    mintxg, maxtxg, usedp, compp, uncompp));
+		return;
+	}
+
+	dsl_deadlist_load_tree(dl);
+	*usedp = *compp = *uncompp = 0;
+
+	dle_tofind.dle_mintxg = mintxg;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	/*
+	 * If we don't find this mintxg, there shouldn't be anything
+	 * after it either.
+	 */
+	ASSERT(dle != NULL ||
+	    avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
+	for (; dle && dle->dle_mintxg < maxtxg;
+	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		uint64_t used, comp, uncomp;
+
+		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+		    &used, &comp, &uncomp));
+
+		*usedp += used;
+		*compp += comp;
+		*uncompp += uncomp;
+	}
+}
+
+static void
+dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
+    dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+	uint64_t used, comp, uncomp;
+	bpobj_t bpo;
+
+	VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+	VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
+	bpobj_close(&bpo);
+
+	dsl_deadlist_load_tree(dl);
+
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
+	mutex_enter(&dl->dl_lock);
+	dl->dl_phys->dl_used += used;
+	dl->dl_phys->dl_comp += comp;
+	dl->dl_phys->dl_uncomp += uncomp;
+	mutex_exit(&dl->dl_lock);
+
+	dle_tofind.dle_mintxg = birth;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	if (dle == NULL)
+		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+	bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+}
+
+static int
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_deadlist_t *dl = arg;
+	dsl_deadlist_insert(dl, bp, tx);
+	return (0);
+}
+
+/*
+ * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
+ * an empty deadlist.
+ */
+void
+dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	dmu_buf_t *bonus;
+	dsl_deadlist_phys_t *dlp;
+	dmu_object_info_t doi;
+
+	VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
+	if (doi.doi_type == DMU_OT_BPOBJ) {
+		bpobj_t bpo;
+		VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
+		VERIFY3U(0, ==, bpobj_iterate(&bpo,
+		    dsl_deadlist_insert_cb, dl, tx));
+		bpobj_close(&bpo);
+		return;
+	}
+
+	for (zap_cursor_init(&zc, dl->dl_os, obj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		uint64_t mintxg = strtonum(za.za_name, NULL);
+		dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
+		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
+	}
+	zap_cursor_fini(&zc);
+
+	VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
+	dlp = bonus->db_data;
+	dmu_buf_will_dirty(bonus, tx);
+	bzero(dlp, sizeof (*dlp));
+	dmu_buf_rele(bonus, FTAG);
+}
+
+/*
+ * Remove entries on dl that are >= mintxg, and put them on the bpobj.
+ */
+void
+dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+    dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+
+	ASSERT(!dl->dl_oldfmt);
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
+	dsl_deadlist_load_tree(dl);
+
+	dle_tofind.dle_mintxg = mintxg;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	if (dle == NULL)
+		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+	while (dle) {
+		uint64_t used, comp, uncomp;
+		dsl_deadlist_entry_t *dle_next;
+
+		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+
+		VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
+		    &used, &comp, &uncomp));
+		mutex_enter(&dl->dl_lock);
+		ASSERT3U(dl->dl_phys->dl_used, >=, used);
+		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
+		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
+		dl->dl_phys->dl_used -= used;
+		dl->dl_phys->dl_comp -= comp;
+		dl->dl_phys->dl_uncomp -= uncomp;
+		mutex_exit(&dl->dl_lock);
+
+		VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
+		    dle->dle_mintxg, tx));
+
+		dle_next = AVL_NEXT(&dl->dl_tree, dle);
+		avl_remove(&dl->dl_tree, dle);
+		bpobj_close(&dle->dle_bpobj);
+		kmem_free(dle, sizeof (*dle));
+		dle = dle_next;
+	}
+}
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c	Fri May 21 17:29:22 2010 -0700
@@ -412,7 +412,7 @@
 {
 	objset_t *mos = dp->dp_meta_objset;
 	uint64_t ddobj;
-	dsl_dir_phys_t *dsphys;
+	dsl_dir_phys_t *ddphys;
 	dmu_buf_t *dbuf;
 
 	ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
@@ -427,17 +427,17 @@
 	}
 	VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
 	dmu_buf_will_dirty(dbuf, tx);
-	dsphys = dbuf->db_data;
+	ddphys = dbuf->db_data;
 
-	dsphys->dd_creation_time = gethrestime_sec();
+	ddphys->dd_creation_time = gethrestime_sec();
 	if (pds)
-		dsphys->dd_parent_obj = pds->dd_object;
-	dsphys->dd_props_zapobj = zap_create(mos,
+		ddphys->dd_parent_obj = pds->dd_object;
+	ddphys->dd_props_zapobj = zap_create(mos,
 	    DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
-	dsphys->dd_child_dir_zapobj = zap_create(mos,
+	ddphys->dd_child_dir_zapobj = zap_create(mos,
 	    DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 	if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
-		dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
+		ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
 	dmu_buf_rele(dbuf, FTAG);
 
 	return (ddobj);
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c	Fri May 21 17:29:22 2010 -0700
@@ -38,6 +38,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/zfs_znode.h>
 #include <sys/spa_impl.h>
+#include <sys/dsl_deadlist.h>
 
 int zfs_no_write_throttle = 0;
 int zfs_write_limit_shift = 3;			/* 1/8th of physical memory */
@@ -104,6 +105,7 @@
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
+	uint64_t obj;
 
 	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
 	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
@@ -143,6 +145,20 @@
 			goto out;
 	}
 
+	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
+		    &dp->dp_free_dir);
+		if (err)
+			goto out;
+
+		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
+		if (err)
+			goto out;
+		VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+		    dp->dp_meta_objset, obj));
+	}
+
 	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
 	    &dp->dp_tmp_userrefs_obj);
@@ -177,9 +193,13 @@
 		dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
 	if (dp->dp_mos_dir)
 		dsl_dir_close(dp->dp_mos_dir, dp);
+	if (dp->dp_free_dir)
+		dsl_dir_close(dp->dp_free_dir, dp);
 	if (dp->dp_root_dir)
 		dsl_dir_close(dp->dp_root_dir, dp);
 
+	bpobj_close(&dp->dp_free_bpobj);
+
 	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
 	if (dp->dp_meta_objset)
 		dmu_objset_evict(dp->dp_meta_objset);
@@ -208,7 +228,7 @@
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 	objset_t *os;
 	dsl_dataset_t *ds;
-	uint64_t dsobj;
+	uint64_t obj;
 
 	/* create and open the MOS (meta-objset) */
 	dp->dp_meta_objset = dmu_objset_create_impl(spa,
@@ -232,14 +252,29 @@
 	VERIFY(0 == dsl_pool_open_special_dir(dp,
 	    MOS_DIR_NAME, &dp->dp_mos_dir));
 
+	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+		/* create and open the free dir */
+		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+		    FREE_DIR_NAME, tx);
+		VERIFY(0 == dsl_pool_open_special_dir(dp,
+		    FREE_DIR_NAME, &dp->dp_free_dir));
+
+		/* create and open the free_bpobj */
+		obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
+		VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+		    dp->dp_meta_objset, obj));
+	}
+
 	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
 		dsl_pool_create_origin(dp, tx);
 
 	/* create the root dataset */
-	dsobj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
+	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);
 
 	/* create the root objset */
-	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+	VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
 	os = dmu_objset_create_impl(dp->dp_spa, ds,
 	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
 #ifdef _KERNEL
@@ -252,6 +287,14 @@
 	return (dp);
 }
 
+static int
+deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_deadlist_t *dl = arg;
+	dsl_deadlist_insert(dl, bp, tx);
+	return (0);
+}
+
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
@@ -315,13 +358,14 @@
 	err = zio_wait(zio);
 
 	/*
-	 * If anything was added to a deadlist during a zio done callback,
-	 * it had to be put on the deferred queue.  Enqueue it for real now.
+	 * Move dead blocks from the pending deadlist to the on-disk
+	 * deadlist.
 	 */
 	for (ds = list_head(&dp->dp_synced_datasets); ds;
-	    ds = list_next(&dp->dp_synced_datasets, ds))
-		bplist_sync(&ds->ds_deadlist,
-		    bplist_enqueue_cb, &ds->ds_deadlist, tx);
+	    ds = list_next(&dp->dp_synced_datasets, ds)) {
+		bplist_iterate(&ds->ds_pending_deadlist,
+		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+	}
 
 	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) {
 		/*
@@ -612,6 +656,65 @@
 	    tx, DS_FIND_CHILDREN));
 }
 
+/* ARGSUSED */
+static int
+upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
+{
+	dmu_tx_t *tx = arg;
+	dsl_dataset_t *ds;
+	dsl_pool_t *dp = spa_get_dsl(spa);
+	objset_t *mos = dp->dp_meta_objset;
+
+	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+	if (ds->ds_dir->dd_phys->dd_origin_obj) {
+		dsl_dataset_t *origin;
+
+		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
+		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
+
+		if (origin->ds_dir->dd_phys->dd_clones == 0) {
+			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+			origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
+			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+		}
+
+		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
+		    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+
+		dsl_dataset_rele(origin, FTAG);
+	}
+
+	dsl_dataset_rele(ds, FTAG);
+	return (0);
+}
+
+void
+dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+	ASSERT(dmu_tx_is_syncing(tx));
+	uint64_t obj;
+
+	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
+	VERIFY(0 == dsl_pool_open_special_dir(dp,
+	    FREE_DIR_NAME, &dp->dp_free_dir));
+
+	/*
+	 * We can't use bpobj_alloc(), because spa_version() still
+	 * returns the old version, and we need a new-version bpobj with
+	 * subobj support.  So call dmu_object_alloc() directly.
+	 */
+	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
+	    SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+	VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+	VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
+	    dp->dp_meta_objset, obj));
+
+	VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL,
+	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
+}
+
 void
 dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
 {
--- a/usr/src/uts/common/fs/zfs/dsl_scan.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_scan.c	Fri May 21 17:29:22 2010 -0700
@@ -57,6 +57,7 @@
 static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
 
 int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
 int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
 boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
@@ -599,7 +600,7 @@
 	 * XXX need to make sure all of these arc_read() prefetches are
 	 * done before setting xlateall (similar to dsl_read())
 	 */
-	(void) arc_read(scn->scn_prefetch_zio_root, scn->scn_dp->dp_spa, bp,
+	(void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
 	    buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 	    &flags, &czb);
 }
@@ -1378,11 +1379,56 @@
 	zap_cursor_fini(&zc);
 }
 
+static int
+dsl_scan_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	dsl_scan_t *scn = arg;
+	uint64_t elapsed_nanosecs;
+
+	elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+
+	if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+	    (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+	    txg_sync_waiting(scn->scn_dp)) ||
+	    spa_shutting_down(scn->scn_dp->dp_spa))
+		return (ERESTART);
+
+	zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+	    dmu_tx_get_txg(tx), bp, 0));
+	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+	    -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+	scn->scn_visited_this_txg++;
+	return (0);
+}
+
+boolean_t
+dsl_scan_active(dsl_scan_t *scn)
+{
+	spa_t *spa = scn->scn_dp->dp_spa;
+	uint64_t used = 0, comp, uncomp;
+
+	if (spa->spa_load_state != SPA_LOAD_NONE)
+		return (B_FALSE);
+	if (spa_shutting_down(spa))
+		return (B_FALSE);
+
+	if (scn->scn_phys.scn_state == DSS_SCANNING)
+		return (B_TRUE);
+
+	if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+		(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+		    &used, &comp, &uncomp);
+	}
+	return (used != 0);
+}
+
 void
 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
 	dsl_scan_t *scn = dp->dp_scan;
 	spa_t *spa = dp->dp_spa;
+	int err;
 
 	/*
 	 * Check for scn_restart_txg before checking spa_load_state, so
@@ -1400,10 +1446,9 @@
 		dsl_scan_setup_sync(scn, &func, tx);
 	}
 
-	if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE ||
-	    spa_shutting_down(spa) ||
-	    spa_sync_pass(dp->dp_spa) > 1 ||
-	    scn->scn_phys.scn_state != DSS_SCANNING)
+
+	if (!dsl_scan_active(scn) ||
+	    spa_sync_pass(dp->dp_spa) > 1)
 		return;
 
 	scn->scn_visited_this_txg = 0;
@@ -1411,6 +1456,40 @@
 	scn->scn_sync_start_time = gethrtime();
 	spa->spa_scrub_active = B_TRUE;
 
+	/*
+	 * First process the free list.  If we pause the free, don't do
+	 * any scanning.  This ensures that there is no free list when
+	 * we are scanning, so the scan code doesn't have to worry about
+	 * traversing it.
+	 */
+	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+		    NULL, ZIO_FLAG_MUSTSUCCEED);
+		err = bpobj_iterate(&dp->dp_free_bpobj,
+		    dsl_scan_free_cb, scn, tx);
+		VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+		if (scn->scn_visited_this_txg) {
+			zfs_dbgmsg("freed %llu blocks in %llums from "
+			    "free_bpobj txg %llu",
+			    (longlong_t)scn->scn_visited_this_txg,
+			    (longlong_t)
+			    (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
+			    (longlong_t)tx->tx_txg);
+			scn->scn_visited_this_txg = 0;
+			/*
+			 * Re-sync the ddt so that we can further modify
+			 * it when doing bprewrite.
+			 */
+			ddt_sync(spa, tx->tx_txg);
+		}
+		if (err == ERESTART)
+			return;
+	}
+
+	if (scn->scn_phys.scn_state != DSS_SCANNING)
+		return;
+
+
 	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
 	    scn->scn_phys.scn_ddt_class_max) {
 		zfs_dbgmsg("doing scan sync txg %llu; "
@@ -1433,11 +1512,11 @@
 		    (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
 	}
 
-	scn->scn_prefetch_zio_root = zio_root(dp->dp_spa, NULL,
+	scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 	    NULL, ZIO_FLAG_CANFAIL);
 	dsl_scan_visit(scn, tx);
-	(void) zio_wait(scn->scn_prefetch_zio_root);
-	scn->scn_prefetch_zio_root = NULL;
+	(void) zio_wait(scn->scn_zio_root);
+	scn->scn_zio_root = NULL;
 
 	zfs_dbgmsg("visited %llu blocks in %llums",
 	    (longlong_t)scn->scn_visited_this_txg,
--- a/usr/src/uts/common/fs/zfs/spa.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/spa.c	Fri May 21 17:29:22 2010 -0700
@@ -943,6 +943,8 @@
 		spa->spa_async_zio_root = NULL;
 	}
 
+	bpobj_close(&spa->spa_deferred_bpobj);
+
 	/*
 	 * Close the dsl pool.
 	 */
@@ -1662,6 +1664,7 @@
 	uint64_t config_cache_txg = spa->spa_config_txg;
 	int orig_mode = spa->spa_mode;
 	int parse;
+	uint64_t obj;
 
 	/*
 	 * If this is an untrusted config, access the pool in read-only mode.
@@ -1840,8 +1843,10 @@
 		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
 	}
 
-	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPLIST,
-	    &spa->spa_deferred_bplist_obj) != 0)
+	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
+	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
@@ -2687,7 +2692,7 @@
 	uint64_t txg = TXG_INITIAL;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
-	uint64_t version;
+	uint64_t version, obj;
 
 	/*
 	 * If this pool already exists, return failure.
@@ -2834,20 +2839,20 @@
 	}
 
 	/*
-	 * Create the deferred-free bplist object.  Turn off compression
+	 * Create the deferred-free bpobj.  Turn off compression
 	 * because sync-to-convergence takes longer if the blocksize
 	 * keeps changing.
 	 */
-	spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset,
-	    1 << 14, tx);
-	dmu_object_set_compress(spa->spa_meta_objset,
-	    spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx);
-
+	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
+	dmu_object_set_compress(spa->spa_meta_objset, obj,
+	    ZIO_COMPRESS_OFF, tx);
 	if (zap_add(spa->spa_meta_objset,
-	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
-	    sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) {
-		cmn_err(CE_PANIC, "failed to add bplist");
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
+	    sizeof (uint64_t), 1, &obj, tx) != 0) {
+		cmn_err(CE_PANIC, "failed to add bpobj");
 	}
+	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
+	    spa->spa_meta_objset, obj));
 
 	/*
 	 * Create the pool's history object.
@@ -4946,34 +4951,23 @@
  * SPA syncing routines
  * ==========================================================================
  */
-static void
-spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg)
+
+static int
+bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
-	blkptr_t blk;
-	uint64_t itor = 0;
-	uint8_t c = 1;
-
-	while (bplist_iterate(bpl, &itor, &blk) == 0) {
-		ASSERT(blk.blk_birth < txg);
-		zio_free(spa, txg, &blk);
-	}
-
-	bplist_vacate(bpl, tx);
-
-	/*
-	 * Pre-dirty the first block so we sync to convergence faster.
-	 * (Usually only the first block is needed.)
-	 */
-	dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx);
+	bpobj_t *bpo = arg;
+	bpobj_enqueue(bpo, bp, tx);
+	return (0);
 }
 
-static void
-spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+static int
+spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	zio_t *zio = arg;
 
 	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
 	    zio->io_flags));
+	return (0);
 }
 
 static void
@@ -5204,6 +5198,42 @@
 }
 
 /*
+ * Perform one-time upgrade on-disk changes.  spa_version() does not
+ * reflect the new version this txg, so there must be no changes this
+ * txg to anything that the upgrade code depends on after it executes.
+ * Therefore this must be called after dsl_pool_sync() does the sync
+ * tasks.
+ */
+static void
+spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+
+	ASSERT(spa->spa_sync_pass == 1);
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
+		dsl_pool_create_origin(dp, tx);
+
+		/* Keeping the origin open increases spa_minref */
+		spa->spa_minref += 3;
+	}
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
+		dsl_pool_upgrade_clones(dp, tx);
+	}
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
+		dsl_pool_upgrade_dir_clones(dp, tx);
+
+		/* Keeping the freedir open increases spa_minref */
+		spa->spa_minref += 3;
+	}
+}
+
+/*
  * Sync the specified transaction group.  New blocks may be dirtied as
  * part of the process, so we iterate until it converges.
  */
@@ -5212,7 +5242,7 @@
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	objset_t *mos = spa->spa_meta_objset;
-	bplist_t *defer_bpl = &spa->spa_deferred_bplist;
+	bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
 	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
@@ -5251,8 +5281,6 @@
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
-	VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj));
-
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
@@ -5276,19 +5304,6 @@
 		}
 	}
 
-	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
-	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
-		dsl_pool_create_origin(dp, tx);
-
-		/* Keeping the origin open increases spa_minref */
-		spa->spa_minref += 3;
-	}
-
-	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
-	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
-		dsl_pool_upgrade_clones(dp, tx);
-	}
-
 	/*
 	 * If anything has changed in this txg, or if someone is waiting
 	 * for this txg to sync (eg, spa_vdev_remove()), push the
@@ -5299,9 +5314,13 @@
 	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
 	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
 	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
-	    ((dp->dp_scan->scn_phys.scn_state == DSS_SCANNING ||
-	    txg_sync_waiting(dp)) && !spa_shutting_down(spa)))
-		spa_sync_deferred_bplist(spa, defer_bpl, tx, txg);
+	    ((dsl_scan_active(dp->dp_scan) ||
+	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
+		zio_t *zio = zio_root(spa, NULL, NULL, 0);
+		VERIFY3U(bpobj_iterate(defer_bpo,
+		    spa_free_sync_cb, zio, tx), ==, 0);
+		VERIFY3U(zio_wait(zio), ==, 0);
+	}
 
 	/*
 	 * Iterate to convergence.
@@ -5319,10 +5338,12 @@
 
 		if (pass <= SYNC_PASS_DEFERRED_FREE) {
 			zio_t *zio = zio_root(spa, NULL, NULL, 0);
-			bplist_sync(free_bpl, spa_sync_free, zio, tx);
+			bplist_iterate(free_bpl, spa_free_sync_cb,
+			    zio, tx);
 			VERIFY(zio_wait(zio) == 0);
 		} else {
-			bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx);
+			bplist_iterate(free_bpl, bpobj_enqueue_cb,
+			    defer_bpo, tx);
 		}
 
 		ddt_sync(spa, txg);
@@ -5331,12 +5352,11 @@
 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
 			vdev_sync(vd, txg);
 
+		if (pass == 1)
+			spa_sync_upgrades(spa, tx);
+
 	} while (dmu_objset_is_dirty(mos, txg));
 
-	ASSERT(list_is_empty(&free_bpl->bpl_queue));
-
-	bplist_close(defer_bpl);
-
 	/*
 	 * Rewrite the vdev configuration (which includes the uberblock)
 	 * to commit the transaction group.
@@ -5423,8 +5443,6 @@
 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
-	ASSERT(list_is_empty(&defer_bpl->bpl_queue));
-	ASSERT(list_is_empty(&free_bpl->bpl_queue));
 
 	spa->spa_sync_pass = 0;
 
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Fri May 21 17:29:22 2010 -0700
@@ -445,8 +445,7 @@
 	cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
 
 	for (int t = 0; t < TXG_SIZE; t++)
-		bplist_init(&spa->spa_free_bplist[t]);
-	bplist_init(&spa->spa_deferred_bplist);
+		bplist_create(&spa->spa_free_bplist[t]);
 
 	(void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
@@ -524,8 +523,7 @@
 	spa_config_lock_destroy(spa);
 
 	for (int t = 0; t < TXG_SIZE; t++)
-		bplist_fini(&spa->spa_free_bplist[t]);
-	bplist_fini(&spa->spa_deferred_bplist);
+		bplist_destroy(&spa->spa_free_bplist[t]);
 
 	cv_destroy(&spa->spa_async_cv);
 	cv_destroy(&spa->spa_proc_cv);
--- a/usr/src/uts/common/fs/zfs/sys/bplist.h	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/bplist.h	Fri May 21 17:29:22 2010 -0700
@@ -19,76 +19,36 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef	_SYS_BPLIST_H
 #define	_SYS_BPLIST_H
 
-#include <sys/dmu.h>
+#include <sys/zfs_context.h>
 #include <sys/spa.h>
-#include <sys/txg.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
-typedef struct bplist_phys {
-	/*
-	 * This is the bonus buffer for the dead lists.  The object's
-	 * contents is an array of bpl_entries blkptr_t's, representing
-	 * a total of bpl_bytes physical space.
-	 */
-	uint64_t	bpl_entries;
-	uint64_t	bpl_bytes;
-	uint64_t	bpl_comp;
-	uint64_t	bpl_uncomp;
-} bplist_phys_t;
-
-#define	BPLIST_SIZE_V0	(2 * sizeof (uint64_t))
-
-typedef struct bplist_q {
-	blkptr_t	bpq_blk;
-	list_node_t	bpq_node;
-} bplist_q_t;
+typedef struct bplist_entry {
+	blkptr_t	bpe_blk;
+	list_node_t	bpe_node;
+} bplist_entry_t;
 
 typedef struct bplist {
 	kmutex_t	bpl_lock;
-	objset_t	*bpl_mos;
-	uint64_t	bpl_object;
-	uint8_t		bpl_blockshift;
-	uint8_t		bpl_bpshift;
-	uint8_t		bpl_havecomp;
-	kmutex_t	bpl_q_lock;
-	list_t		bpl_queue;
-	bplist_phys_t	*bpl_phys;
-	dmu_buf_t	*bpl_dbuf;
-	dmu_buf_t	*bpl_cached_dbuf;
+	list_t		bpl_list;
 } bplist_t;
 
-typedef void bplist_sync_cb_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 
-extern void bplist_init(bplist_t *bpl);
-extern void bplist_fini(bplist_t *bpl);
-extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
-extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
-extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
-extern void bplist_close(bplist_t *bpl);
-extern boolean_t bplist_empty(bplist_t *bpl);
-extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
-extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx);
-extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp);
-extern void bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func,
+void bplist_create(bplist_t *bpl);
+void bplist_destroy(bplist_t *bpl);
+void bplist_append(bplist_t *bpl, const blkptr_t *bp);
+void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
     void *arg, dmu_tx_t *tx);
-extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
-extern int bplist_space(bplist_t *bpl,
-    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
-extern int bplist_space_birthrange(bplist_t *bpl,
-    uint64_t mintxg, uint64_t maxtxg, uint64_t *dsizep);
 
 #ifdef	__cplusplus
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/bpobj.h	Fri May 21 17:29:22 2010 -0700
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SYS_BPOBJ_H
+#define	_SYS_BPOBJ_H
+
+#include <sys/dmu.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct bpobj_phys {
+	/*
+	 * This is the bonus buffer for the dead lists.  The object's
+	 * contents are an array of bpo_num_blkptrs blkptr_t's,
+	 * representing a total of bpo_bytes physical space.
+	 */
+	uint64_t	bpo_num_blkptrs;
+	uint64_t	bpo_bytes;
+	uint64_t	bpo_comp;
+	uint64_t	bpo_uncomp;
+	uint64_t	bpo_subobjs;
+	uint64_t	bpo_num_subobjs;
+} bpobj_phys_t;
+
+#define	BPOBJ_SIZE_V0	(2 * sizeof (uint64_t))
+#define	BPOBJ_SIZE_V1	(4 * sizeof (uint64_t))
+
+typedef struct bpobj {
+	kmutex_t	bpo_lock;
+	objset_t	*bpo_os;
+	uint64_t	bpo_object;
+	int		bpo_epb;
+	uint8_t		bpo_havecomp;
+	uint8_t		bpo_havesubobj;
+	bpobj_phys_t	*bpo_phys;
+	dmu_buf_t	*bpo_dbuf;
+	dmu_buf_t	*bpo_cached_dbuf;
+} bpobj_t;
+
+typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx);
+void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+
+int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object);
+void bpobj_close(bpobj_t *bpo);
+
+int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
+int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
+int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp);
+
+void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
+void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
+
+int bpobj_space(bpobj_t *bpo,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_BPOBJ_H */
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Fri May 21 17:29:22 2010 -0700
@@ -77,8 +77,8 @@
 	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
 	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
 	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
-	DMU_OT_BPLIST,			/* UINT64 */
-	DMU_OT_BPLIST_HDR,		/* UINT64 */
+	DMU_OT_BPOBJ,			/* UINT64 */
+	DMU_OT_BPOBJ_HDR,		/* UINT64 */
 	/* spa: */
 	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
 	DMU_OT_SPACE_MAP,		/* UINT64 */
@@ -130,6 +130,10 @@
 	DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
 	DMU_OT_SCAN_XLATE,		/* ZAP */
 	DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
+	DMU_OT_DEADLIST,		/* ZAP */
+	DMU_OT_DEADLIST_HDR,		/* UINT64 */
+	DMU_OT_DSL_CLONES,		/* ZAP */
+	DMU_OT_BPOBJ_SUBOBJ,		/* UINT64 */
 	DMU_OT_NUMTYPES
 } dmu_object_type_t;
 
@@ -211,7 +215,7 @@
 #define	DMU_POOL_DIRECTORY_OBJECT	1
 #define	DMU_POOL_CONFIG			"config"
 #define	DMU_POOL_ROOT_DATASET		"root_dataset"
-#define	DMU_POOL_SYNC_BPLIST		"sync_bplist"
+#define	DMU_POOL_SYNC_BPOBJ		"sync_bplist"
 #define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
 #define	DMU_POOL_ERRLOG_LAST		"errlog_last"
 #define	DMU_POOL_SPARES			"spares"
@@ -224,6 +228,7 @@
 #define	DMU_POOL_DDT_STATS		"DDT-statistics"
 #define	DMU_POOL_CREATION_VERSION	"creation_version"
 #define	DMU_POOL_SCAN			"scan"
+#define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Fri May 21 17:29:22 2010 -0700
@@ -32,6 +32,7 @@
 #include <sys/bplist.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_context.h>
+#include <sys/dsl_deadlist.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -82,7 +83,7 @@
 	uint64_t ds_num_children;	/* clone/snap children; ==0 for head */
 	uint64_t ds_creation_time;	/* seconds since 1970 */
 	uint64_t ds_creation_txg;
-	uint64_t ds_deadlist_obj;	/* DMU_OT_BPLIST */
+	uint64_t ds_deadlist_obj;	/* DMU_OT_DEADLIST */
 	uint64_t ds_used_bytes;
 	uint64_t ds_compressed_bytes;
 	uint64_t ds_uncompressed_bytes;
@@ -114,7 +115,8 @@
 	struct dsl_dataset *ds_prev;
 
 	/* has internal locking: */
-	bplist_t ds_deadlist;
+	dsl_deadlist_t ds_deadlist;
+	bplist_t ds_pending_deadlist;
 
 	/* to protect against multiple concurrent incremental recv */
 	kmutex_t ds_recvlock;
@@ -160,7 +162,7 @@
 	boolean_t need_prep;		/* do we need to retry due to EBUSY? */
 };
 
-#define	dsl_dataset_is_snapshot(ds)	\
+#define	dsl_dataset_is_snapshot(ds) \
 	((ds)->ds_phys->ds_num_children != 0)
 
 #define	DS_UNIQUE_IS_ACCURATE(ds)	\
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_deadlist.h	Fri May 21 17:29:22 2010 -0700
@@ -0,0 +1,87 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef	_SYS_DSL_DEADLIST_H
+#define	_SYS_DSL_DEADLIST_H
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct dmu_buf;
+struct dsl_dataset;
+
+typedef struct dsl_deadlist_phys {
+	uint64_t dl_used;
+	uint64_t dl_comp;
+	uint64_t dl_uncomp;
+	uint64_t dl_pad[37]; /* pad out to 320b for future expansion */
+} dsl_deadlist_phys_t;
+
+typedef struct dsl_deadlist {
+	objset_t *dl_os;
+	uint64_t dl_object;
+	avl_tree_t dl_tree;
+	boolean_t dl_havetree;
+	struct dmu_buf *dl_dbuf;
+	dsl_deadlist_phys_t *dl_phys;
+	kmutex_t dl_lock;
+
+	/* if it's the old on-disk format: */
+	bpobj_t dl_bpobj;
+	boolean_t dl_oldfmt;
+} dsl_deadlist_t;
+
+typedef struct dsl_deadlist_entry {
+	avl_node_t dle_node;
+	uint64_t dle_mintxg;
+	bpobj_t dle_bpobj;
+} dsl_deadlist_entry_t;
+
+void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object);
+void dsl_deadlist_close(dsl_deadlist_t *dl);
+uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx);
+void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx);
+void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx);
+void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx);
+uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+    uint64_t mrs_obj, dmu_tx_t *tx);
+void dsl_deadlist_space(dsl_deadlist_t *dl,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_space_range(dsl_deadlist_t *dl,
+    uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp);
+void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx);
+void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+    dmu_tx_t *tx);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DEADLIST_H */
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h	Fri May 21 17:29:22 2010 -0700
@@ -69,7 +69,8 @@
 	uint64_t dd_deleg_zapobj; /* dataset delegation permissions */
 	uint64_t dd_flags;
 	uint64_t dd_used_breakdown[DD_USED_NUM];
-	uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */
+	uint64_t dd_clones; /* dsl_dir objects */
+	uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */
 } dsl_dir_phys_t;
 
 struct dsl_dir {
@@ -143,6 +144,7 @@
 #define	MOS_DIR_NAME "$MOS"
 #define	ORIGIN_DIR_NAME "$ORIGIN"
 #define	XLATION_DIR_NAME "$XLATION"
+#define	FREE_DIR_NAME "$FREE"
 
 #ifdef ZFS_DEBUG
 #define	dprintf_dd(dd, fmt, ...) do { \
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h	Fri May 21 17:29:22 2010 -0700
@@ -33,6 +33,7 @@
 #include <sys/dnode.h>
 #include <sys/ddt.h>
 #include <sys/arc.h>
+#include <sys/bpobj.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -71,6 +72,7 @@
 	struct objset *dp_meta_objset;
 	struct dsl_dir *dp_root_dir;
 	struct dsl_dir *dp_mos_dir;
+	struct dsl_dir *dp_free_dir;
 	struct dsl_dataset *dp_origin_snap;
 	uint64_t dp_root_dir_obj;
 	struct taskq *dp_vnrele_taskq;
@@ -82,6 +84,7 @@
 	uint64_t dp_throughput; /* bytes per millisec */
 	uint64_t dp_write_limit;
 	uint64_t dp_tmp_userrefs_obj;
+	bpobj_t dp_free_bpobj;
 
 	struct dsl_scan *dp_scan;
 
@@ -130,6 +133,7 @@
     uint32_t *arc_flags, const zbookmark_t *zb);
 void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
+void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx);
 
 taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
 
--- a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h	Fri May 21 17:29:22 2010 -0700
@@ -77,7 +77,7 @@
 	boolean_t scn_pausing;
 	uint64_t scn_restart_txg;
 	uint64_t scn_sync_start_time;
-	zio_t *scn_prefetch_zio_root;
+	zio_t *scn_zio_root;
 
 	/* for debugging / information */
 	uint64_t scn_visited_this_txg;
@@ -99,6 +99,7 @@
 void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
 void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
     struct dmu_tx *tx);
+boolean_t dsl_scan_active(dsl_scan_t *scn);
 
 #ifdef	__cplusplus
 }
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Fri May 21 17:29:22 2010 -0700
@@ -599,7 +599,6 @@
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
-extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
 
 /* Miscellaneous support routines */
 extern int spa_rename(const char *oldname, const char *newname);
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h	Fri May 21 17:29:22 2010 -0700
@@ -35,6 +35,7 @@
 #include <sys/avl.h>
 #include <sys/refcount.h>
 #include <sys/bplist.h>
+#include <sys/bpobj.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -140,8 +141,7 @@
 	uint64_t	spa_config_object;	/* MOS object for pool config */
 	uint64_t	spa_config_generation;	/* config generation number */
 	uint64_t	spa_syncing_txg;	/* txg currently syncing */
-	uint64_t	spa_deferred_bplist_obj; /* object for deferred frees */
-	bplist_t	spa_deferred_bplist;	/* deferred-free bplist */
+	bpobj_t		spa_deferred_bpobj;	/* deferred-free bplist */
 	bplist_t	spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
 	uberblock_t	spa_ubsync;		/* last synced uberblock */
 	uberblock_t	spa_uberblock;		/* current uberblock */
--- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h	Fri May 21 17:29:22 2010 -0700
@@ -19,15 +19,13 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #ifndef _SYS_ZIO_CHECKSUM_H
 #define	_SYS_ZIO_CHECKSUM_H
 
 #include <sys/zio.h>
-#include <zfs_fletcher.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -68,6 +66,7 @@
 extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
     void *data, uint64_t size);
 extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
+extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
 
 #ifdef	__cplusplus
 }
--- a/usr/src/uts/common/fs/zfs/txg.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/txg.c	Fri May 21 17:29:22 2010 -0700
@@ -371,9 +371,7 @@
 		 * us, or we have reached our timeout.
 		 */
 		timer = (delta >= timeout ? 0 : timeout - delta);
-		while ((dp->dp_scan->scn_phys.scn_state != DSS_SCANNING ||
-		    spa_load_state(spa) != SPA_LOAD_NONE ||
-		    spa_shutting_down(spa)) &&
+		while (!dsl_scan_active(dp->dp_scan) &&
 		    !tx->tx_exiting && timer > 0 &&
 		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
 		    tx->tx_quiesced_txg == 0) {
--- a/usr/src/uts/common/fs/zfs/zio.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zio.c	Fri May 21 17:29:22 2010 -0700
@@ -652,7 +652,7 @@
 void
 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 {
-	bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+	bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 }
 
 zio_t *
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c	Fri May 21 17:29:22 2010 -0700
@@ -19,8 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -28,6 +27,7 @@
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zil.h>
+#include <zfs_fletcher.h>
 
 /*
  * Checksum vectors.
--- a/usr/src/uts/common/sys/fs/zfs.h	Fri May 21 15:05:41 2010 -0700
+++ b/usr/src/uts/common/sys/fs/zfs.h	Fri May 21 17:29:22 2010 -0700
@@ -334,14 +334,15 @@
 #define	SPA_VERSION_23			23ULL
 #define	SPA_VERSION_24			24ULL
 #define	SPA_VERSION_25			25ULL
+#define	SPA_VERSION_26			26ULL
 /*
  * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk
  * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*},
  * and do the appropriate changes.  Also bump the version number in
  * usr/src/grub/capability.
  */
-#define	SPA_VERSION			SPA_VERSION_25
-#define	SPA_VERSION_STRING		"25"
+#define	SPA_VERSION			SPA_VERSION_26
+#define	SPA_VERSION_STRING		"26"
 
 /*
  * Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -358,7 +359,7 @@
 #define	SPA_VERSION_DITTO_BLOCKS	SPA_VERSION_2
 #define	SPA_VERSION_SPARES		SPA_VERSION_3
 #define	SPA_VERSION_RAIDZ2		SPA_VERSION_3
-#define	SPA_VERSION_BPLIST_ACCOUNT	SPA_VERSION_3
+#define	SPA_VERSION_BPOBJ_ACCOUNT	SPA_VERSION_3
 #define	SPA_VERSION_RAIDZ_DEFLATE	SPA_VERSION_3
 #define	SPA_VERSION_DNODE_BYTES		SPA_VERSION_3
 #define	SPA_VERSION_ZPOOL_HISTORY	SPA_VERSION_4
@@ -388,6 +389,8 @@
 #define	SPA_VERSION_SLIM_ZIL		SPA_VERSION_23
 #define	SPA_VERSION_SA			SPA_VERSION_24
 #define	SPA_VERSION_SCAN		SPA_VERSION_25
+#define	SPA_VERSION_DIR_CLONES		SPA_VERSION_26
+#define	SPA_VERSION_DEADLISTS		SPA_VERSION_26
 
 /*
  * ZPL version - rev'd whenever an incompatible on-disk format change