6512391 DMU should leverage ZIO dependencies to achieve greater parallelism
author maybee
Fri, 02 Feb 2007 15:36:58 -0800
changeset 3547 e396e0a440b1
parent 3546 82a941a9c116
child 3548 b92f0790f453
6512391 DMU should leverage ZIO dependencies to achieve greater parallelism
usr/src/cmd/zdb/zdb.c
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/dmu_objset.c
usr/src/uts/common/fs/zfs/dmu_send.c
usr/src/uts/common/fs/zfs/dnode.c
usr/src/uts/common/fs/zfs/dnode_sync.c
usr/src/uts/common/fs/zfs/dsl_dataset.c
usr/src/uts/common/fs/zfs/dsl_pool.c
usr/src/uts/common/fs/zfs/sys/arc.h
usr/src/uts/common/fs/zfs/sys/dbuf.h
usr/src/uts/common/fs/zfs/sys/dmu_impl.h
usr/src/uts/common/fs/zfs/sys/dmu_objset.h
usr/src/uts/common/fs/zfs/sys/dnode.h
usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
usr/src/uts/common/fs/zfs/sys/spa.h
usr/src/uts/common/fs/zfs/sys/zio.h
usr/src/uts/common/fs/zfs/zio.c
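
In outline: rather than serializing DMU write completion through taskqs, each dirty buffer's write is now issued as a child ZIO under a parent (the dnode's dn_zio, or the parent dbuf's dr_zio), so sibling writes run concurrently and ordering falls out of the ZIO dependency graph. A minimal sketch of that pattern follows, assuming the existing zio_root()/zio_nowait()/zio_wait() interfaces; issue_dependent_writes() and make_child_write() are hypothetical helpers, not part of this changeset:

/*
 * Illustrative sketch only: children started with zio_nowait() under a
 * common parent execute in parallel; zio_wait() on the parent returns
 * only after every child (and grandchild) has completed.
 */
static int
issue_dependent_writes(spa_t *spa, int nwrites)
{
	zio_t *pio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	int i;

	for (i = 0; i < nwrites; i++) {
		/*
		 * make_child_write() is hypothetical; in this changeset
		 * the children are created by arc_write(pio, ...).
		 */
		zio_nowait(make_child_write(pio, i));
	}

	return (zio_wait(pio));		/* gates on the whole subtree */
}
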
--- a/usr/src/cmd/zdb/zdb.c	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/cmd/zdb/zdb.c	Fri Feb 02 15:36:58 2007 -0800
@@ -1017,21 +1017,21 @@
 
 	if (dds.dds_type == DMU_OST_META) {
 		dds.dds_creation_txg = TXG_INITIAL;
-		usedobjs = os->os->os_rootbp.blk_fill;
+		usedobjs = os->os->os_rootbp->blk_fill;
 		refdbytes =
 		    os->os->os_spa->spa_dsl_pool->dp_mos_dir->dd_used_bytes;
 	} else {
 		dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch);
 	}
 
-	ASSERT3U(usedobjs, ==, os->os->os_rootbp.blk_fill);
+	ASSERT3U(usedobjs, ==, os->os->os_rootbp->blk_fill);
 
 	nicenum(refdbytes, numbuf);
 
 	if (verbosity >= 4) {
 		(void) strcpy(blkbuf, ", rootbp ");
 		sprintf_blkptr(blkbuf + strlen(blkbuf),
-		    BP_SPRINTF_LEN - strlen(blkbuf), &os->os->os_rootbp);
+		    BP_SPRINTF_LEN - strlen(blkbuf), os->os->os_rootbp);
 	} else {
 		blkbuf[0] = '\0';
 	}
--- a/usr/src/uts/common/fs/zfs/arc.c	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/arc.c	Fri Feb 02 15:36:58 2007 -0800
@@ -315,14 +315,23 @@
 typedef struct arc_callback arc_callback_t;
 
 struct arc_callback {
+	void			*acb_private;
 	arc_done_func_t		*acb_done;
-	void			*acb_private;
 	arc_byteswap_func_t	*acb_byteswap;
 	arc_buf_t		*acb_buf;
 	zio_t			*acb_zio_dummy;
 	arc_callback_t		*acb_next;
 };
 
+typedef struct arc_write_callback arc_write_callback_t;
+
+struct arc_write_callback {
+	void		*awcb_private;
+	arc_done_func_t	*awcb_ready;
+	arc_done_func_t	*awcb_done;
+	arc_buf_t	*awcb_buf;
+};
+
 struct arc_buf_hdr {
 	/* protected by hash lock */
 	dva_t			b_dva;
@@ -2357,6 +2366,7 @@
 			atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size);
 		}
 		hdr->b_datacnt -= 1;
+		arc_cksum_verify(buf);
 
 		mutex_exit(hash_lock);
 
@@ -2369,11 +2379,7 @@
 		nhdr->b_arc_access = 0;
 		nhdr->b_flags = 0;
 		nhdr->b_datacnt = 1;
-		if (hdr->b_freeze_cksum != NULL) {
-			nhdr->b_freeze_cksum =
-			    kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
-			*nhdr->b_freeze_cksum = *hdr->b_freeze_cksum;
-		}
+		nhdr->b_freeze_cksum = NULL;
 		buf->b_hdr = nhdr;
 		buf->b_next = NULL;
 		(void) refcount_add(&nhdr->b_refcnt, tag);
@@ -2390,10 +2396,10 @@
 		bzero(&hdr->b_dva, sizeof (dva_t));
 		hdr->b_birth = 0;
 		hdr->b_cksum0 = 0;
+		arc_buf_thaw(buf);
 	}
 	buf->b_efunc = NULL;
 	buf->b_private = NULL;
-	arc_buf_thaw(buf);
 }
 
 int
@@ -2417,17 +2423,26 @@
 #endif
 
 static void
+arc_write_ready(zio_t *zio)
+{
+	arc_write_callback_t *callback = zio->io_private;
+	arc_buf_t *buf = callback->awcb_buf;
+
+	if (callback->awcb_ready) {
+		ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
+		callback->awcb_ready(zio, buf, callback->awcb_private);
+	}
+	arc_cksum_compute(buf);
+}
+
+static void
 arc_write_done(zio_t *zio)
 {
-	arc_buf_t *buf;
-	arc_buf_hdr_t *hdr;
-	arc_callback_t *acb;
+	arc_write_callback_t *callback = zio->io_private;
+	arc_buf_t *buf = callback->awcb_buf;
+	arc_buf_hdr_t *hdr = buf->b_hdr;
 
-	buf = zio->io_private;
-	hdr = buf->b_hdr;
-	acb = hdr->b_acb;
 	hdr->b_acb = NULL;
-	ASSERT(acb != NULL);
 
 	/* this buffer is on no lists and is not in the hash table */
 	ASSERT3P(hdr->b_state, ==, arc_anon);
@@ -2469,7 +2484,7 @@
 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 		arc_access(hdr, hash_lock);
 		mutex_exit(hash_lock);
-	} else if (acb->acb_done == NULL) {
+	} else if (callback->awcb_done == NULL) {
 		int destroy_hdr;
 		/*
 		 * This is an anonymous buffer with no user callback,
@@ -2485,23 +2500,23 @@
 		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
 	}
 
-	if (acb->acb_done) {
+	if (callback->awcb_done) {
 		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
-		acb->acb_done(zio, buf, acb->acb_private);
+		callback->awcb_done(zio, buf, callback->awcb_private);
 	}
 
-	kmem_free(acb, sizeof (arc_callback_t));
+	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
-int
+zio_t *
 arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
-    arc_done_func_t *done, void *private, int priority, int flags,
-    uint32_t arc_flags, zbookmark_t *zb)
+    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
+    int flags, zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
-	arc_callback_t	*acb;
-	zio_t	*rzio;
+	arc_write_callback_t *callback;
+	zio_t	*zio;
 
 	/* this is a private buffer - no locking required */
 	ASSERT3P(hdr->b_state, ==, arc_anon);
@@ -2509,23 +2524,17 @@
 	ASSERT(!HDR_IO_ERROR(hdr));
 	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
 	ASSERT(hdr->b_acb == 0);
-	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
-	acb->acb_done = done;
-	acb->acb_private = private;
-	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
-	hdr->b_acb = acb;
+	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+	callback->awcb_ready = ready;
+	callback->awcb_done = done;
+	callback->awcb_private = private;
+	callback->awcb_buf = buf;
 	hdr->b_flags |= ARC_IO_IN_PROGRESS;
-	arc_cksum_compute(buf);
-	rzio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
-	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);
+	zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
+	    buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
+	    priority, flags, zb);
 
-	if (arc_flags & ARC_WAIT)
-		return (zio_wait(rzio));
-
-	ASSERT(arc_flags & ARC_NOWAIT);
-	zio_nowait(rzio);
-
-	return (0);
+	return (zio);
 }
 
 int
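
A note on the new arc_write() contract before moving on: it no longer takes ARC_WAIT/ARC_NOWAIT and returns the write zio instead of issuing it, so the caller decides when to start it; the new ready callback fires once the buffer contents are final (arc_write_ready() computes the checksum at that point), before the write completes. Below is a hedged sketch of a caller against the new signature; write_pair() and its parameter values are illustrative only:

/*
 * Illustrative only: two writes hung off the same parent proceed in
 * parallel, and the parent completes only after both are done.
 */
static int
write_pair(zio_t *pio, spa_t *spa, uint64_t txg,
    blkptr_t *bp1, arc_buf_t *buf1, blkptr_t *bp2, arc_buf_t *buf2,
    zbookmark_t *zb)
{
	zio_t *z1, *z2;

	z1 = arc_write(pio, spa, ZIO_CHECKSUM_FLETCHER_4, ZIO_COMPRESS_OFF,
	    1, txg, bp1, buf1, NULL, NULL, NULL,	/* ready, done, private */
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, zb);
	z2 = arc_write(pio, spa, ZIO_CHECKSUM_FLETCHER_4, ZIO_COMPRESS_OFF,
	    1, txg, bp2, buf2, NULL, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, zb);

	zio_nowait(z1);			/* both children now in flight */
	zio_nowait(z2);

	return (zio_wait(pio));		/* parent waits for both */
}
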
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -39,6 +39,9 @@
 
 static void dbuf_destroy(dmu_buf_impl_t *db);
 static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
+    int compress, dmu_tx_t *tx);
+static arc_done_func_t dbuf_write_ready;
 static arc_done_func_t dbuf_write_done;
 
 int zfs_mdcomp_disable = 0;
@@ -46,7 +49,6 @@
 /*
  * Global data structures and functions for the dbuf cache.
  */
-taskq_t *dbuf_tq;
 static kmem_cache_t *dbuf_cache;
 
 /* ARGSUSED */
@@ -210,31 +212,24 @@
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
-	if (db->db_level != 0 || db->db_d.db_evict_func == NULL)
+	if (db->db_level != 0 || db->db_evict_func == NULL)
 		return;
 
-	if (db->db_d.db_user_data_ptr_ptr)
-		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
-	db->db_d.db_evict_func(&db->db, db->db_d.db_user_ptr);
-	db->db_d.db_user_ptr = NULL;
-	db->db_d.db_user_data_ptr_ptr = NULL;
-	db->db_d.db_evict_func = NULL;
+	if (db->db_user_data_ptr_ptr)
+		*db->db_user_data_ptr_ptr = db->db.db_data;
+	db->db_evict_func(&db->db, db->db_user_ptr);
+	db->db_user_ptr = NULL;
+	db->db_user_data_ptr_ptr = NULL;
+	db->db_evict_func = NULL;
 }
 
 void
 dbuf_evict(dmu_buf_impl_t *db)
 {
-	int i;
-
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_buf == NULL);
+	ASSERT(db->db_data_pending == NULL);
 
-#ifdef ZFS_DEBUG
-	for (i = 0; i < TXG_SIZE; i++) {
-		ASSERT(!list_link_active(&db->db_dirty_node[i]));
-		ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
-	}
-#endif
 	dbuf_clear(db);
 	dbuf_destroy(db);
 }
@@ -267,8 +262,6 @@
 	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
-	dbuf_tq = taskq_create("dbuf_tq", 8, maxclsyspri, 50, INT_MAX,
-	    TASKQ_PREPOPULATE);
 
 	for (i = 0; i < DBUF_MUTEXES; i++)
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
@@ -280,9 +273,6 @@
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	int i;
 
-	taskq_destroy(dbuf_tq);
-	dbuf_tq = NULL;
-
 	for (i = 0; i < DBUF_MUTEXES; i++)
 		mutex_destroy(&h->hash_mutexes[i]);
 	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
@@ -297,7 +287,6 @@
 static void
 dbuf_verify(dmu_buf_impl_t *db)
 {
-	int i;
 	dnode_t *dn = db->db_dnode;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -330,15 +319,13 @@
 			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
 		}
 		if (db->db.db_object == DMU_META_DNODE_OBJECT) {
-			for (i = 0; i < TXG_SIZE; i++) {
-				/*
-				 * it should only be modified in syncing
-				 * context, so make sure we only have
-				 * one copy of the data.
-				 */
-				ASSERT(db->db_d.db_data_old[i] == NULL ||
-				    db->db_d.db_data_old[i] == db->db_buf);
-			}
+			dbuf_dirty_record_t *dr = db->db_data_pending;
+			/*
+			 * it should only be modified in syncing
+			 * context, so make sure we only have
+			 * one copy of the data.
+			 */
+			ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
 		}
 	}
 
@@ -395,9 +382,9 @@
 dbuf_update_data(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	if (db->db_level == 0 && db->db_d.db_user_data_ptr_ptr) {
+	if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
 		ASSERT(!refcount_is_zero(&db->db_holds));
-		*db->db_d.db_user_data_ptr_ptr = db->db.db_data;
+		*db->db_user_data_ptr_ptr = db->db.db_data;
 	}
 }
 
@@ -444,12 +431,12 @@
 	ASSERT(refcount_count(&db->db_holds) > 0);
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db.db_data == NULL);
-	if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+	if (db->db_level == 0 && db->db_freed_in_flight) {
 		/* we were freed in flight; disregard any error */
 		arc_release(buf, db);
 		bzero(buf->b_data, db->db.db_size);
 		arc_buf_freeze(buf);
-		db->db_d.db_freed_in_flight = FALSE;
+		db->db_freed_in_flight = FALSE;
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 	} else if (zio == NULL || zio->io_error == 0) {
@@ -646,120 +633,69 @@
 static void
 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 {
-	arc_buf_t **quiescing, **syncing;
-	arc_buf_contents_t type;
+	dbuf_dirty_record_t *dr = db->db_last_dirty;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db.db_data != NULL);
-	ASSERT(db->db_blkid != DB_BONUS_BLKID);
-
-	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
-	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+	ASSERT(db->db_level == 0);
+	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
 
-	/*
-	 * If this buffer is referenced from the current quiescing
-	 * transaction group: either make a copy and reset the reference
-	 * to point to the copy, or (if there a no active holders) just
-	 * null out the current db_data pointer.
-	 */
-	if (*quiescing == db->db_buf) {
-		/*
-		 * If the quiescing txg is "dirty", then we better not
-		 * be referencing the same buffer from the syncing txg.
-		 */
-		ASSERT(*syncing != db->db_buf);
-		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
-			int size = db->db.db_size;
-			type = DBUF_GET_BUFC_TYPE(db);
-			*quiescing = arc_buf_alloc(
-			    db->db_dnode->dn_objset->os_spa, size, db, type);
-			bcopy(db->db.db_data, (*quiescing)->b_data, size);
-		} else {
-			dbuf_set_data(db, NULL);
-		}
+	if (dr == NULL || dr->dt.dl.dr_data != db->db_buf)
 		return;
-	}
 
 	/*
-	 * If this buffer is referenced from the current syncing
-	 * transaction group: either
-	 *	1 - make a copy and reset the reference, or
-	 *	2 - if there are no holders, just null the current db_data.
+	 * If the last dirty record for this dbuf has not yet synced
+	 * and it's referencing the dbuf data, either:
+	 * 	reset the reference to point to a new copy,
+	 * or (if there are no active holders)
+	 *	just null out the current db_data pointer.
 	 */
-	if (*syncing == db->db_buf) {
-		ASSERT3P(*quiescing, ==, NULL);
-		ASSERT3U(db->db_dirtycnt, ==, 1);
-		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
-			int size = db->db.db_size;
-			type = DBUF_GET_BUFC_TYPE(db);
-			/* we can't copy if we have already started a write */
-			ASSERT(*syncing != db->db_data_pending);
-			*syncing = arc_buf_alloc(
-			    db->db_dnode->dn_objset->os_spa, size, db, type);
-			bcopy(db->db.db_data, (*syncing)->b_data, size);
-		} else {
-			dbuf_set_data(db, NULL);
-		}
-	}
-}
-
-/*
- * This is the "bonus buffer" version of the above routine
- */
-static void
-dbuf_fix_old_bonus_data(dmu_buf_impl_t *db, uint64_t txg)
-{
-	arc_buf_t **quiescing, **syncing;
-
-	ASSERT(MUTEX_HELD(&db->db_mtx));
-	ASSERT(db->db.db_data != NULL);
-	ASSERT(db->db_blkid == DB_BONUS_BLKID);
-
-	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
-	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
-
-	if (*quiescing == db->db.db_data) {
-		ASSERT(*syncing != db->db.db_data);
-		*quiescing = zio_buf_alloc(DN_MAX_BONUSLEN);
-		bcopy(db->db.db_data, *quiescing, DN_MAX_BONUSLEN);
-	} else if (*syncing == db->db.db_data) {
-		ASSERT3P(*quiescing, ==, NULL);
-		ASSERT3U(db->db_dirtycnt, ==, 1);
-		*syncing = zio_buf_alloc(DN_MAX_BONUSLEN);
-		bcopy(db->db.db_data, *syncing, DN_MAX_BONUSLEN);
+	ASSERT(dr->dr_txg >= txg - 2);
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		/* Note that the data bufs here are zio_bufs */
+		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
+	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+		int size = db->db.db_size;
+		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+		dr->dt.dl.dr_data = arc_buf_alloc(
+		    db->db_dnode->dn_objset->os_spa, size, db, type);
+		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
+	} else {
+		dbuf_set_data(db, NULL);
 	}
 }
 
 void
-dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
+dbuf_unoverride(dbuf_dirty_record_t *dr)
 {
-	ASSERT(db->db_blkid != DB_BONUS_BLKID);
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	uint64_t txg = dr->dr_txg;
+
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] != IN_DMU_SYNC);
+	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
+	ASSERT(db->db_level == 0);
+
+	if (db->db_blkid == DB_BONUS_BLKID ||
+	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
+		return;
 
-	if (db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
-		/* free this block */
-		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]) ||
-		    db->db_dnode->dn_free_txg == txg);
-		if (!BP_IS_HOLE(db->db_d.db_overridden_by[txg&TXG_MASK])) {
-			/* XXX can get silent EIO here */
-			(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
-			    txg, db->db_d.db_overridden_by[txg&TXG_MASK],
-			    NULL, NULL, ARC_WAIT);
-		}
-		kmem_free(db->db_d.db_overridden_by[txg&TXG_MASK],
-		    sizeof (blkptr_t));
-		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
-		/*
-		 * Release the already-written buffer, so we leave it in
-		 * a consistent dirty state.  Note that all callers are
-		 * modifying the buffer, so they will immediately do
-		 * another (redundant) arc_release().  Therefore, leave
-		 * the buf thawed to save the effort of freezing &
-		 * immediately re-thawing it.
-		 */
-		arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
+	/* free this block */
+	if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
+		/* XXX can get silent EIO here */
+		(void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+		    txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
 	}
+	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+	/*
+	 * Release the already-written buffer, so we leave it in
+	 * a consistent dirty state.  Note that all callers are
+	 * modifying the buffer, so they will immediately do
+	 * another (redundant) arc_release().  Therefore, leave
+	 * the buf thawed to save the effort of freezing &
+	 * immediately re-thawing it.
+	 */
+	arc_release(dr->dt.dl.dr_data, db);
 }
 
 void
@@ -793,7 +729,7 @@
 		}
 		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
 			/* will be handled in dbuf_read_done or dbuf_rele */
-			db->db_d.db_freed_in_flight = TRUE;
+			db->db_freed_in_flight = TRUE;
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
@@ -802,26 +738,31 @@
 			dbuf_clear(db);
 			continue;
 		}
-		/* The dbuf is CACHED and referenced */
+		/* The dbuf is referenced */
+
+		if (db->db_last_dirty != NULL) {
+			dbuf_dirty_record_t *dr = db->db_last_dirty;
 
-		if (!list_link_active(&db->db_dirty_node[txg & TXG_MASK])) {
-			/*
-			 * This dbuf is not currently dirty.  Either
-			 * uncache it (if its not referenced in the open
-			 * context) or reset its contents to empty.
-			 */
-			dbuf_fix_old_data(db, txg);
-		} else {
-			if (db->db_d.db_overridden_by[txg & TXG_MASK] != NULL) {
+			if (dr->dr_txg == txg) {
 				/*
-				 * This dbuf is overridden.  Clear that state.
+				 * This buffer is "in-use"; re-adjust the file
+				 * size to reflect that this buffer may
+				 * contain new data when we sync.
 				 */
-				dbuf_unoverride(db, txg);
+				if (db->db_blkid > dn->dn_maxblkid)
+					dn->dn_maxblkid = db->db_blkid;
+				dbuf_unoverride(dr);
+			} else {
+				/*
+				 * This dbuf is not dirty in the open context.
+				 * Either uncache it (if it's not referenced in
+				 * the open context) or reset its contents to
+				 * empty.
+				 */
+				dbuf_fix_old_data(db, txg);
 			}
-			if (db->db_blkid > dn->dn_maxblkid)
-				dn->dn_maxblkid = db->db_blkid;
 		}
-		/* fill in with appropriate data */
+		/* clear the contents if it's cached */
 		if (db->db_state == DB_CACHED) {
 			ASSERT(db->db.db_data != NULL);
 			arc_release(db->db_buf, db);
@@ -846,13 +787,13 @@
 
 	/*
 	 * We don't need any locking to protect db_blkptr:
-	 * If it's syncing, then db_dirtied will be set so we'll
-	 * ignore db_blkptr.
+	 * If it's syncing, then db_last_dirty will be set
+	 * so we'll ignore db_blkptr.
 	 */
-	ASSERT(MUTEX_HELD(&db->db_mtx)); /* XXX strictly necessary? */
+	ASSERT(MUTEX_HELD(&db->db_mtx));
 	/* If we have been dirtied since the last snapshot, its not new */
-	if (db->db_dirtied)
-		birth_txg = db->db_dirtied;
+	if (db->db_last_dirty)
+		birth_txg = db->db_last_dirty->dr_txg;
 	else if (db->db_blkptr)
 		birth_txg = db->db_blkptr->blk_birth;
 
@@ -901,18 +842,21 @@
 	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
 	db->db.db_size = size;
 
-	if (db->db_level == 0)
-		db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf;
+	if (db->db_level == 0) {
+		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+		db->db_last_dirty->dt.dl.dr_data = buf;
+	}
 	mutex_exit(&db->db_mtx);
 
 	dnode_willuse_space(db->db_dnode, size-osize, tx);
 }
 
-void
+dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn = db->db_dnode;
 	objset_impl_t *os = dn->dn_objset;
+	dbuf_dirty_record_t **drp, *dr;
 	int drop_struct_lock = FALSE;
 	int txgoff = tx->tx_txg & TXG_MASK;
 
@@ -927,12 +871,11 @@
 	 * XXX We may want to prohibit dirtying in syncing context even
 	 * if they did pre-dirty.
 	 */
-	ASSERT(!(dmu_tx_is_syncing(tx) &&
-	    !BP_IS_HOLE(&dn->dn_objset->os_rootbp) &&
-	    dn->dn_object != DMU_META_DNODE_OBJECT &&
-	    dn->dn_objset->os_dsl_dataset != NULL &&
-	    !dsl_dir_is_private(
-	    dn->dn_objset->os_dsl_dataset->ds_dir)));
+	ASSERT(!dmu_tx_is_syncing(tx) ||
+	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
+	    dn->dn_object == DMU_META_DNODE_OBJECT ||
+	    dn->dn_objset->os_dsl_dataset == NULL ||
+	    dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir));
 
 	/*
 	 * We make this assert for private objects as well, but after we
@@ -940,23 +883,17 @@
 	 * in syncing context.
 	 */
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
-	    dn->dn_dirtyctx == DN_UNDIRTIED ||
-	    dn->dn_dirtyctx ==
+	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
 
 	mutex_enter(&db->db_mtx);
-	/* XXX make this true for indirects too? */
-	ASSERT(db->db_level != 0 || db->db_state == DB_CACHED ||
-	    db->db_state == DB_FILL);
-
 	/*
-	 * If this buffer is currently part of an "overridden" region,
-	 * we now need to remove it from that region.
+	 * XXX make this true for indirects too?  The problem is that
+	 * transactions created with dmu_tx_create_assigned() from
+	 * syncing context don't bother holding ahead.
 	 */
-	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
-	    db->db_d.db_overridden_by[txgoff] != NULL) {
-		dbuf_unoverride(db, tx->tx_txg);
-	}
+	ASSERT(db->db_level != 0 ||
+	    db->db_state == DB_CACHED || db->db_state == DB_FILL);
 
 	mutex_enter(&dn->dn_mtx);
 	/*
@@ -964,7 +901,7 @@
 	 * initialize the objset.
 	 */
 	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
-	    !BP_IS_HOLE(&dn->dn_objset->os_rootbp)) {
+	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
 		dn->dn_dirtyctx =
 		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
 		ASSERT(dn->dn_dirtyctx_firstset == NULL);
@@ -975,13 +912,23 @@
 	/*
 	 * If this buffer is already dirty, we're done.
 	 */
-	if (list_link_active(&db->db_dirty_node[txgoff])) {
-		if (db->db_blkid != DB_BONUS_BLKID && db->db_level == 0 &&
-		    db->db.db_object != DMU_META_DNODE_OBJECT)
-			arc_buf_thaw(db->db_buf);
-
+	drp = &db->db_last_dirty;
+	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
+	    db->db.db_object == DMU_META_DNODE_OBJECT);
+	while (*drp && (*drp)->dr_txg > tx->tx_txg)
+		drp = &(*drp)->dr_next;
+	if (*drp && (*drp)->dr_txg == tx->tx_txg) {
+		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+			/*
+			 * If this buffer has already been written out,
+			 * we now need to reset its state.
+			 */
+			dbuf_unoverride(*drp);
+			if (db->db.db_object != DMU_META_DNODE_OBJECT)
+				arc_buf_thaw(db->db_buf);
+		}
 		mutex_exit(&db->db_mtx);
-		return;
+		return (*drp);
 	}
 
 	/*
@@ -1007,7 +954,7 @@
 	ASSERT(!dmu_tx_is_syncing(tx) ||
 	    os->os_dsl_dataset == NULL ||
 	    !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) ||
-	    !BP_IS_HOLE(&os->os_rootbp));
+	    !BP_IS_HOLE(os->os_rootbp));
 	ASSERT(db->db.db_size != 0);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -1017,44 +964,50 @@
 	 * to make a copy of it so that the changes we make in this
 	 * transaction group won't leak out when we sync the older txg.
 	 */
-	if (db->db_blkid == DB_BONUS_BLKID) {
-		ASSERT(db->db.db_data != NULL);
-		ASSERT(db->db_d.db_data_old[txgoff] == NULL);
-		dbuf_fix_old_bonus_data(db, tx->tx_txg);
-		db->db_d.db_data_old[txgoff] = db->db.db_data;
-	} else if (db->db_level == 0) {
-		/*
-		 * Release the data buffer from the cache so that we
-		 * can modify it without impacting possible other users
-		 * of this cached data block.  Note that indirect blocks
-		 * and private objects are not released until the syncing
-		 * state (since they are only modified then).
-		 */
-		ASSERT(db->db_buf != NULL);
-		ASSERT(db->db_d.db_data_old[txgoff] == NULL);
-		if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+	if (db->db_level == 0) {
+		void *data_old = db->db_buf;
+
+		if (db->db_blkid == DB_BONUS_BLKID) {
+			dbuf_fix_old_data(db, tx->tx_txg);
+			data_old = db->db.db_data;
+		} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+			/*
+			 * Release the data buffer from the cache so that we
+			 * can modify it without impacting possible other users
+			 * of this cached data block.  Note that indirect
+			 * blocks and private objects are not released until the
+			 * syncing state (since they are only modified then).
+			 */
 			arc_release(db->db_buf, db);
 			dbuf_fix_old_data(db, tx->tx_txg);
-			ASSERT(db->db_buf != NULL);
+			data_old = db->db_buf;
 		}
-		db->db_d.db_data_old[txgoff] = db->db_buf;
+		ASSERT(data_old != NULL);
+		dr->dt.dl.dr_data = data_old;
+	} else {
+		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
+		list_create(&dr->dt.di.dr_children,
+		    sizeof (dbuf_dirty_record_t),
+		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
+	dr->dr_dbuf = db;
+	dr->dr_txg = tx->tx_txg;
+	dr->dr_next = *drp;
+	*drp = dr;
 
-	mutex_enter(&dn->dn_mtx);
 	/*
 	 * We could have been freed_in_flight between the dbuf_noread
 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
 	 * happened after the free.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
+		mutex_enter(&dn->dn_mtx);
 		dnode_clear_range(dn, db->db_blkid, 1, tx);
-		db->db_d.db_freed_in_flight = FALSE;
+		mutex_exit(&dn->dn_mtx);
+		db->db_freed_in_flight = FALSE;
 	}
 
-	db->db_dirtied = tx->tx_txg;
-	list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db);
-	mutex_exit(&dn->dn_mtx);
-
 	if (db->db_blkid != DB_BONUS_BLKID) {
 		/*
 		 * Update the accounting.
@@ -1084,8 +1037,12 @@
 	mutex_exit(&db->db_mtx);
 
 	if (db->db_blkid == DB_BONUS_BLKID) {
+		mutex_enter(&dn->dn_mtx);
+		ASSERT(!list_link_active(&dr->dr_dirty_node));
+		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+		mutex_exit(&dn->dn_mtx);
 		dnode_setdirty(dn, tx);
-		return;
+		return (dr);
 	}
 
 	if (db->db_level == 0) {
@@ -1099,30 +1056,61 @@
 	}
 
 	if (db->db_level+1 < dn->dn_nlevels) {
-		int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-		dmu_buf_impl_t *parent;
-		parent = dbuf_hold_level(dn, db->db_level+1,
-		    db->db_blkid >> epbs, FTAG);
+		dmu_buf_impl_t *parent = db->db_parent;
+		dbuf_dirty_record_t *di;
+		int parent_held = FALSE;
+
+		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
+			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+			parent = dbuf_hold_level(dn, db->db_level+1,
+			    db->db_blkid >> epbs, FTAG);
+			parent_held = TRUE;
+		}
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
-		dbuf_dirty(parent, tx);
-		dbuf_rele(parent, FTAG);
+		ASSERT3U(db->db_level+1, ==, parent->db_level);
+		di = dbuf_dirty(parent, tx);
+		if (parent_held)
+			dbuf_rele(parent, FTAG);
+
+		mutex_enter(&db->db_mtx);
+		/*  possible race with dbuf_undirty() */
+		if (db->db_last_dirty == dr ||
+		    dn->dn_object == DMU_META_DNODE_OBJECT) {
+			mutex_enter(&di->dt.di.dr_mtx);
+			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
+			ASSERT(!list_link_active(&dr->dr_dirty_node));
+			list_insert_tail(&di->dt.di.dr_children, dr);
+			mutex_exit(&di->dt.di.dr_mtx);
+			dr->dr_parent = di;
+		}
+		mutex_exit(&db->db_mtx);
 	} else {
+		ASSERT(db->db_level+1 == dn->dn_nlevels);
+		ASSERT(db->db_blkid < dn->dn_nblkptr);
+		ASSERT(db->db_parent == NULL ||
+		    db->db_parent == db->db_dnode->dn_dbuf);
+		mutex_enter(&dn->dn_mtx);
+		ASSERT(!list_link_active(&dr->dr_dirty_node));
+		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+		mutex_exit(&dn->dn_mtx);
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 	}
 
 	dnode_setdirty(dn, tx);
+	return (dr);
 }
 
 static int
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn = db->db_dnode;
-	int txgoff = tx->tx_txg & TXG_MASK;
-	int64_t holds;
+	uint64_t txg = tx->tx_txg;
+	dbuf_dirty_record_t *dr;
 
-	ASSERT(tx->tx_txg != 0);
+	ASSERT(txg != 0);
 	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 
 	mutex_enter(&db->db_mtx);
@@ -1130,10 +1118,14 @@
 	/*
 	 * If this buffer is not dirty, we're done.
 	 */
-	if (!list_link_active(&db->db_dirty_node[txgoff])) {
+	for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
+		if (dr->dr_txg <= txg)
+			break;
+	if (dr == NULL || dr->dr_txg < txg) {
 		mutex_exit(&db->db_mtx);
 		return (0);
 	}
+	ASSERT(dr->dr_txg == txg);
 
 	/*
 	 * If this buffer is currently held, we cannot undirty
@@ -1152,31 +1144,41 @@
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
-	dbuf_unoverride(db, tx->tx_txg);
+	ASSERT(db->db.db_size != 0);
+
+	/* XXX would be nice to fix up dn_towrite_space[] */
+
+	db->db_last_dirty = dr->dr_next;
 
-	ASSERT(db->db.db_size != 0);
-	if (db->db_level == 0) {
-		ASSERT(db->db_buf != NULL);
-		ASSERT(db->db_d.db_data_old[txgoff] != NULL);
-		if (db->db_d.db_data_old[txgoff] != db->db_buf)
-			VERIFY(arc_buf_remove_ref(
-			    db->db_d.db_data_old[txgoff], db) == 1);
-		db->db_d.db_data_old[txgoff] = NULL;
+	if (dr->dr_parent) {
+		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
+		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
+		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
+	} else if (db->db_level+1 == dn->dn_nlevels) {
+		ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
+		mutex_enter(&dn->dn_mtx);
+		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+		mutex_exit(&dn->dn_mtx);
 	}
 
-	/* XXX would be nice to fix up dn_towrite_space[] */
-	/* XXX undo db_dirtied? but how? */
-	/* db->db_dirtied = tx->tx_txg; */
+	if (db->db_level == 0) {
+		dbuf_unoverride(dr);
 
-	mutex_enter(&dn->dn_mtx);
-	list_remove(&dn->dn_dirty_dbufs[txgoff], db);
-	mutex_exit(&dn->dn_mtx);
+		ASSERT(db->db_buf != NULL);
+		ASSERT(dr->dt.dl.dr_data != NULL);
+		if (dr->dt.dl.dr_data != db->db_buf)
+			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+	} else {
+		ASSERT(db->db_buf != NULL);
+		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+		/* XXX - mutex and list destroy? */
+	}
+	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 
-	if ((holds = refcount_remove(&db->db_holds,
-	    (void *)(uintptr_t)tx->tx_txg)) == 0) {
+	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
 		arc_buf_t *buf = db->db_buf;
 
 		ASSERT(arc_released(buf));
@@ -1185,7 +1187,6 @@
 		dbuf_evict(db);
 		return (1);
 	}
-	ASSERT(holds > 0);
 
 	mutex_exit(&db->db_mtx);
 	return (0);
@@ -1203,7 +1204,7 @@
 	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
 		rf |= DB_RF_HAVESTRUCT;
 	(void) dbuf_read(db, NULL, rf);
-	dbuf_dirty(db, tx);
+	(void) dbuf_dirty(db, tx);
 }
 
 void
@@ -1220,7 +1221,7 @@
 	    dmu_tx_private_ok(tx));
 
 	dbuf_noread(db);
-	dbuf_dirty(db, tx);
+	(void) dbuf_dirty(db, tx);
 }
 
 #pragma weak dmu_buf_fill_done = dbuf_fill_done
@@ -1232,12 +1233,12 @@
 	DBUF_VERIFY(db);
 
 	if (db->db_state == DB_FILL) {
-		if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+		if (db->db_level == 0 && db->db_freed_in_flight) {
 			ASSERT(db->db_blkid != DB_BONUS_BLKID);
 			/* we were freed while filling */
 			/* XXX dbuf_undirty? */
 			bzero(db->db.db_data, db->db.db_size);
-			db->db_d.db_freed_in_flight = FALSE;
+			db->db_freed_in_flight = FALSE;
 		}
 		db->db_state = DB_CACHED;
 		cv_broadcast(&db->db_changed);
@@ -1374,13 +1375,17 @@
 	db->db.db_object = dn->dn_object;
 	db->db_level = level;
 	db->db_blkid = blkid;
-	db->db_dirtied = 0;
+	db->db_last_dirty = NULL;
 	db->db_dirtycnt = 0;
 	db->db_dnode = dn;
 	db->db_parent = parent;
 	db->db_blkptr = blkptr;
 
-	bzero(&db->db_d, sizeof (db->db_d));
+	db->db_user_ptr = NULL;
+	db->db_user_data_ptr_ptr = NULL;
+	db->db_evict_func = NULL;
+	db->db_immediate_evict = 0;
+	db->db_freed_in_flight = 0;
 
 	if (blkid == DB_BONUS_BLKID) {
 		ASSERT3P(parent, ==, dn->dn_dbuf);
@@ -1586,22 +1591,24 @@
 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
 
 	/*
-	 * If this buffer is currently syncing out, and we are
-	 * are still referencing it from db_data, we need to make
-	 * a copy of it in case we decide we want to dirty it
-	 * again in this txg.
+	 * If this buffer is currently syncing out, and we are
+	 * still referencing it from db_data, we need to make a copy
+	 * of it in case we decide we want to dirty it again in this txg.
 	 */
-	if (db->db_level == 0 && db->db_state == DB_CACHED &&
+	if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
-	    db->db_data_pending == db->db_buf) {
-		int size = (db->db_blkid == DB_BONUS_BLKID) ?
-		    DN_MAX_BONUSLEN : db->db.db_size;
-		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+	    db->db_state == DB_CACHED && db->db_data_pending) {
+		dbuf_dirty_record_t *dr = db->db_data_pending;
+
+		if (dr->dt.dl.dr_data == db->db_buf) {
+			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
-		dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
-		    size, db, type));
-		bcopy(db->db_data_pending->b_data, db->db.db_data,
-		    db->db.db_size);
+			dbuf_set_data(db,
+			    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+			    db->db.db_size, db, type));
+			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
+			    db->db.db_size);
+		}
 	}
 
 	(void) refcount_add(&db->db_holds, tag);
@@ -1669,11 +1676,15 @@
 	holds = refcount_remove(&db->db_holds, tag);
 	ASSERT(holds >= 0);
 
-	if (db->db_buf && holds == db->db_dirtycnt)
+	/*
+	 * We can't freeze indirects if there is a possibility that they
+	 * may be modified in the current syncing context.
+	 */
+	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
 		arc_buf_freeze(db->db_buf);
 
 	if (holds == db->db_dirtycnt &&
-	    db->db_level == 0 && db->db_d.db_immediate_evict)
+	    db->db_level == 0 && db->db_immediate_evict)
 		dbuf_evict_user(db);
 
 	if (holds == 0) {
@@ -1725,7 +1736,7 @@
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
-	db->db_d.db_immediate_evict = TRUE;
+	db->db_immediate_evict = TRUE;
 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
 	    user_data_ptr_ptr, evict_func));
 }
@@ -1741,14 +1752,14 @@
 
 	mutex_enter(&db->db_mtx);
 
-	if (db->db_d.db_user_ptr == old_user_ptr) {
-		db->db_d.db_user_ptr = user_ptr;
-		db->db_d.db_user_data_ptr_ptr = user_data_ptr_ptr;
-		db->db_d.db_evict_func = evict_func;
+	if (db->db_user_ptr == old_user_ptr) {
+		db->db_user_ptr = user_ptr;
+		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
+		db->db_evict_func = evict_func;
 
 		dbuf_update_data(db);
 	} else {
-		old_user_ptr = db->db_d.db_user_ptr;
+		old_user_ptr = db->db_user_ptr;
 	}
 
 	mutex_exit(&db->db_mtx);
@@ -1761,21 +1772,106 @@
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
-	return (db->db_d.db_user_ptr);
+	return (db->db_user_ptr);
+}
+
+static void
+dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
+{
+	/* ASSERT(dmu_tx_is_syncing(tx)) */
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+
+	if (db->db_blkptr != NULL)
+		return;
+
+	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
+		/*
+		 * This buffer was allocated at a time when there were
+		 * no available blkptrs from the dnode, or it was
+		 * inappropriate to hook it in (i.e., nlevels mismatch).
+		 */
+		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
+		ASSERT(db->db_parent == NULL);
+		db->db_parent = dn->dn_dbuf;
+		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+		DBUF_VERIFY(db);
+	} else {
+		dmu_buf_impl_t *parent = db->db_parent;
+		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+		ASSERT(dn->dn_phys->dn_nlevels > 1);
+		if (parent == NULL) {
+			mutex_exit(&db->db_mtx);
+			rw_enter(&dn->dn_struct_rwlock, RW_READER);
+			(void) dbuf_hold_impl(dn, db->db_level+1,
+			    db->db_blkid >> epbs, FALSE, db, &parent);
+			rw_exit(&dn->dn_struct_rwlock);
+			mutex_enter(&db->db_mtx);
+			db->db_parent = parent;
+		}
+		db->db_blkptr = (blkptr_t *)parent->db.db_data +
+		    (db->db_blkid & ((1ULL << epbs) - 1));
+		DBUF_VERIFY(db);
+	}
 }
 
-void
-dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
+static void
+dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
-	arc_buf_t **data;
-	uint64_t txg = tx->tx_txg;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	dnode_t *dn = db->db_dnode;
+	zio_t *zio;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+	mutex_enter(&db->db_mtx);
+
+	ASSERT(db->db_level > 0);
+	DBUF_VERIFY(db);
+
+	if (db->db_buf == NULL) {
+		mutex_exit(&db->db_mtx);
+		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+		mutex_enter(&db->db_mtx);
+	}
+	ASSERT3U(db->db_state, ==, DB_CACHED);
+	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+	ASSERT(db->db_buf != NULL);
+
+	dbuf_check_blkptr(dn, db);
+
+	db->db_data_pending = dr;
+	mutex_exit(&db->db_mtx);
+
+	arc_release(db->db_buf, db);
+
+	/*
+	 * XXX -- we should design a compression algorithm
+	 * that specializes in arrays of bps.
+	 */
+	dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
+	    zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);
+
+	zio = dr->dr_zio;
+	mutex_enter(&dr->dt.di.dr_mtx);
+	dbuf_sync_list(&dr->dt.di.dr_children, tx);
+	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+	mutex_exit(&dr->dt.di.dr_mtx);
+	zio_nowait(zio);
+}
+
+static void
+dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+	arc_buf_t **datap = &dr->dt.dl.dr_data;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn = db->db_dnode;
 	objset_impl_t *os = dn->dn_objset;
-	int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	uint64_t txg = tx->tx_txg;
 	int checksum, compress;
-	zbookmark_t zb;
 	int blksz;
-	arc_buf_contents_t type;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
@@ -1791,25 +1887,20 @@
 		ASSERT(db->db.db_data == NULL);
 	} else if (db->db_state == DB_FILL) {
 		/* This buffer was freed and is now being re-filled */
-		ASSERT(db->db.db_data != db->db_d.db_data_old[txg&TXG_MASK]);
+		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
 	} else {
 		ASSERT3U(db->db_state, ==, DB_CACHED);
 	}
 	DBUF_VERIFY(db);
 
 	/*
-	 * Don't need a lock on db_dirty (dn_mtx), because it can't
-	 * be modified yet.
+	 * If this is a bonus buffer, simply copy the bonus data into the
+	 * dnode.  It will be written out when the dnode is synced (and it
+	 * will be synced, since it must have been dirty for dbuf_sync to
+	 * be called).
 	 */
-
 	if (db->db_blkid == DB_BONUS_BLKID) {
-		arc_buf_t **datap = &db->db_d.db_data_old[txg&TXG_MASK];
-		/*
-		 * Simply copy the bonus data into the dnode.  It will
-		 * be written out when the dnode is synced (and it will
-		 * be synced, since it must have been dirty for dbuf_sync
-		 * to be called).
-		 */
+		dbuf_dirty_record_t **drp;
 		/*
 		 * Use dn_phys->dn_bonuslen since db.db_size is the length
 		 * of the bonus buffer in the open transaction rather than
@@ -1821,10 +1912,13 @@
 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
 		if (*datap != db->db.db_data)
 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
-		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
 		db->db_data_pending = NULL;
-		if (db->db_dirtied == txg)
-			db->db_dirtied = 0;
+		drp = &db->db_last_dirty;
+		while (*drp != dr)
+			drp = &(*drp)->dr_next;
+		ASSERT((*drp)->dr_next == NULL);
+		*drp = NULL;
+		kmem_free(dr, sizeof (dbuf_dirty_record_t));
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
 		mutex_exit(&db->db_mtx);
@@ -1832,20 +1926,51 @@
 		return;
 	}
 
-	if (db->db_level == 0) {
-		type = DBUF_GET_BUFC_TYPE(db);
-		data = &db->db_d.db_data_old[txg&TXG_MASK];
-		blksz = arc_buf_size(*data);
+	/*
+	 * If this buffer is in the middle of an immediate write,
+	 * wait for the synchronous IO to complete.
+	 */
+	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+		cv_wait(&db->db_changed, &db->db_mtx);
+		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
+	}
+
+	dbuf_check_blkptr(dn, db);
+
+	/*
+	 * If this dbuf has already been written out via an immediate write,
+	 * just complete the write by copying over the new block pointer and
+	 * updating the accounting via the write-completion functions.
+	 */
+	if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+		zio_t zio_fake;
 
-		/*
-		 * This buffer is in the middle of an immdiate write.
-		 * Wait for the synchronous IO to complete.
-		 */
-		while (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
-			ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
-			cv_wait(&db->db_changed, &db->db_mtx);
-			ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK]);
-		}
+		zio_fake.io_private = &db;
+		zio_fake.io_error = 0;
+		zio_fake.io_bp = db->db_blkptr;
+		zio_fake.io_bp_orig = *db->db_blkptr;
+		zio_fake.io_txg = txg;
+
+		*db->db_blkptr = dr->dt.dl.dr_overridden_by;
+		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+		db->db_data_pending = dr;
+		dr->dr_zio = &zio_fake;
+		mutex_exit(&db->db_mtx);
+
+		if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
+			dsl_dataset_block_kill(os->os_dsl_dataset,
+			    &zio_fake.io_bp_orig, dn->dn_zio, tx);
+
+		dbuf_write_ready(&zio_fake, db->db_buf, db);
+		dbuf_write_done(&zio_fake, db->db_buf, db);
+
+		return;
+	}
+
+	blksz = arc_buf_size(*datap);
+
+	if (dn->dn_object != DMU_META_DNODE_OBJECT) {
 		/*
 		 * If this buffer is currently "in use" (i.e., there are
 		 * active holds and db_data still references it), then make
@@ -1853,326 +1978,154 @@
 		 * from the open txg will not leak into this write.
 		 *
 		 * NOTE: this copy does not need to be made for objects only
-		 * modified in the syncing context (e.g. DNONE_DNODE blocks)
-		 * or if there is no actual write involved (bonus blocks).
+		 * modified in the syncing context (e.g. DNONE_DNODE blocks).
 		 */
-		if (dn->dn_object != DMU_META_DNODE_OBJECT &&
-		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) {
-			if (refcount_count(&db->db_holds) > 1 &&
-			    *data == db->db_buf) {
-				*data = arc_buf_alloc(os->os_spa, blksz, db,
-				    type);
-				bcopy(db->db.db_data, (*data)->b_data, blksz);
-			}
-			db->db_data_pending = *data;
-		} else if (dn->dn_object == DMU_META_DNODE_OBJECT) {
-			/*
-			 * Private object buffers are released here rather
-			 * than in dbuf_dirty() since they are only modified
-			 * in the syncing context and we don't want the
-			 * overhead of making multiple copies of the data.
-			 */
-			arc_release(db->db_buf, db);
+		if (refcount_count(&db->db_holds) > 1 && *datap == db->db_buf) {
+			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+			*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
+			bcopy(db->db.db_data, (*datap)->b_data, blksz);
 		}
 	} else {
-		data = &db->db_buf;
-		if (*data == NULL) {
-			/*
-			 * This can happen if we dirty and then free
-			 * the level-0 data blocks in the same txg. So
-			 * this indirect remains unchanged.
-			 */
-			if (db->db_dirtied == txg)
-				db->db_dirtied = 0;
-			ASSERT(db->db_dirtycnt > 0);
-			db->db_dirtycnt -= 1;
-			mutex_exit(&db->db_mtx);
-			dbuf_rele(db, (void *)(uintptr_t)txg);
-			return;
-		}
-		blksz = db->db.db_size;
-		ASSERT3U(blksz, ==, 1<<dn->dn_phys->dn_indblkshift);
+		/*
+		 * Private object buffers are released here rather
+		 * than in dbuf_dirty() since they are only modified
+		 * in the syncing context and we don't want the
+		 * overhead of making multiple copies of the data.
+		 */
+		arc_release(db->db_buf, db);
+	}
+
+	ASSERT(*datap != NULL);
+	db->db_data_pending = dr;
+
+	mutex_exit(&db->db_mtx);
+
+	/*
+	 * Allow dnode settings to override objset settings,
+	 * except for metadata checksums.
+	 */
+	if (dmu_ot[dn->dn_type].ot_metadata) {
+		checksum = os->os_md_checksum;
+		compress = zio_compress_select(dn->dn_compress,
+		    os->os_md_compress);
+	} else {
+		checksum = zio_checksum_select(dn->dn_checksum,
+		    os->os_checksum);
+		compress = zio_compress_select(dn->dn_compress,
+		    os->os_compress);
 	}
 
-	ASSERT(*data != NULL);
+	dbuf_write(dr, *datap, checksum, compress, tx);
 
-	if (db->db_level > 0 && !arc_released(db->db_buf)) {
-		/*
-		 * This indirect buffer was marked dirty, but
-		 * never modified (if it had been modified, then
-		 * we would have released the buffer).  There is
-		 * no reason to write anything.
-		 */
-		db->db_data_pending = NULL;
-		if (db->db_dirtied == txg)
-			db->db_dirtied = 0;
-		ASSERT(db->db_dirtycnt > 0);
-		db->db_dirtycnt -= 1;
-		mutex_exit(&db->db_mtx);
-		dbuf_rele(db, (void *)(uintptr_t)txg);
-		return;
-	} else if (db->db_blkptr == NULL &&
-	    db->db_level == dn->dn_phys->dn_nlevels-1 &&
-	    db->db_blkid < dn->dn_phys->dn_nblkptr) {
-		/*
-		 * This buffer was allocated at a time when there was
-		 * no available blkptrs from the dnode, or it was
-		 * inappropriate to hook it in (i.e., nlevels mis-match).
-		 */
-		ASSERT(db->db_blkptr == NULL);
-		ASSERT(db->db_parent == NULL);
-		db->db_parent = dn->dn_dbuf;
-		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
-		DBUF_VERIFY(db);
-		mutex_exit(&db->db_mtx);
-	} else if (db->db_blkptr == NULL) {
-		dmu_buf_impl_t *parent = db->db_parent;
+	ASSERT(!list_link_active(&dr->dr_dirty_node));
+	if (dn->dn_object == DMU_META_DNODE_OBJECT)
+		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
+	else
+		zio_nowait(dr->dr_zio);
+}
 
-		mutex_exit(&db->db_mtx);
-		ASSERT(dn->dn_phys->dn_nlevels > 1);
-		if (parent == NULL) {
-			rw_enter(&dn->dn_struct_rwlock, RW_READER);
-			(void) dbuf_hold_impl(dn, db->db_level+1,
-			    db->db_blkid >> epbs, FALSE, FTAG, &parent);
-			rw_exit(&dn->dn_struct_rwlock);
-			dbuf_add_ref(parent, db);
-			db->db_parent = parent;
-			dbuf_rele(parent, FTAG);
-		}
-		(void) dbuf_read(parent, NULL, DB_RF_MUST_SUCCEED);
-	} else {
-		mutex_exit(&db->db_mtx);
-	}
-
-	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || db->db_parent != NULL);
+void
+dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+{
+	dbuf_dirty_record_t *dr;
 
-	if (db->db_level > 0 &&
-	    db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) {
-		/*
-		 * Don't write indirect blocks past EOF.
-		 * We get these when we truncate a file *after* dirtying
-		 * blocks in the truncate range (we undirty the level 0
-		 * blocks in dbuf_free_range(), but not the indirects).
-		 */
-#ifdef ZFS_DEBUG
-		/*
-		 * Verify that this indirect block is empty.
-		 */
-		blkptr_t *bplist;
-		int i;
-
-		mutex_enter(&db->db_mtx);
-		bplist = db->db.db_data;
-		for (i = 0; i < (1 << epbs); i++) {
-			if (!BP_IS_HOLE(&bplist[i])) {
-				panic("data past EOF: "
-				    "db=%p level=%d id=%llu i=%d\n",
-				    db, db->db_level,
-				    (u_longlong_t)db->db_blkid, i);
-			}
+	while (dr = list_head(list)) {
+		if (dr->dr_zio != NULL) {
+			/*
+			 * If we find an already initialized zio then we
+			 * are processing the meta-dnode, and we have finished.
+			 * The dbufs for all dnodes are put back on the list
+			 * during processing, so that we can zio_wait()
+			 * these IOs after initiating all child IOs.
+			 */
+			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
+			    DMU_META_DNODE_OBJECT);
+			break;
 		}
-		mutex_exit(&db->db_mtx);
-#endif
-		ASSERT(db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr));
-		mutex_enter(&db->db_mtx);
-		db->db_dirtycnt -= 1;
-		mutex_exit(&db->db_mtx);
-		dbuf_rele(db, (void *)(uintptr_t)txg);
-		return;
+		list_remove(list, dr);
+		if (dr->dr_dbuf->db_level > 0)
+			dbuf_sync_indirect(dr, tx);
+		else
+			dbuf_sync_leaf(dr, tx);
 	}
+}
 
-	if (db->db_parent != dn->dn_dbuf) {
-		dmu_buf_impl_t *parent = db->db_parent;
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
+    int compress, dmu_tx_t *tx)
+{
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	dnode_t *dn = db->db_dnode;
+	objset_impl_t *os = dn->dn_objset;
+	dmu_buf_impl_t *parent = db->db_parent;
+	uint64_t txg = tx->tx_txg;
+	zbookmark_t zb;
+	zio_t *zio;
 
-		mutex_enter(&db->db_mtx);
+	if (parent != dn->dn_dbuf) {
+		ASSERT(parent && parent->db_data_pending);
 		ASSERT(db->db_level == parent->db_level-1);
-		ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
-		/*
-		 * We may have read this indirect block after we dirtied it,
-		 * so never released it from the cache.
-		 */
-		arc_release(parent->db_buf, parent);
-
-		db->db_blkptr = (blkptr_t *)parent->db.db_data +
-		    (db->db_blkid & ((1ULL << epbs) - 1));
-		DBUF_VERIFY(db);
-		mutex_exit(&db->db_mtx);
-#ifdef ZFS_DEBUG
+		ASSERT(arc_released(parent->db_buf));
+		zio = parent->db_data_pending->dr_zio;
 	} else {
-		/*
-		 * We don't need to dnode_setdirty(dn) because if we got
-		 * here then the parent is already dirty.
-		 */
 		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
 		ASSERT3P(db->db_blkptr, ==,
 		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
-#endif
-	}
-	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
-
-	if (db->db_level == 0 &&
-	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
-		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
-		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
-		int old_size = bp_get_dasize(os->os_spa, db->db_blkptr);
-		int new_size = bp_get_dasize(os->os_spa, *bpp);
-
-		ASSERT(db->db_blkid != DB_BONUS_BLKID);
-
-		dnode_diduse_space(dn, new_size-old_size);
-		mutex_enter(&dn->dn_mtx);
-		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
-			dn->dn_phys->dn_maxblkid = db->db_blkid;
-		mutex_exit(&dn->dn_mtx);
-
-		dsl_dataset_block_born(os->os_dsl_dataset, *bpp, tx);
-		if (!BP_IS_HOLE(db->db_blkptr))
-			dsl_dataset_block_kill(os->os_dsl_dataset,
-			    db->db_blkptr, os->os_synctx);
-
-		mutex_enter(&db->db_mtx);
-		*db->db_blkptr = **bpp;
-		kmem_free(*bpp, sizeof (blkptr_t));
-		*bpp = NULL;
-
-		if (*old != db->db_buf)
-			VERIFY(arc_buf_remove_ref(*old, db) == 1);
-		else if (!BP_IS_HOLE(db->db_blkptr))
-			arc_set_callback(db->db_buf, dbuf_do_evict, db);
-		else
-			ASSERT(arc_released(db->db_buf));
-		*old = NULL;
-		db->db_data_pending = NULL;
-
-		cv_broadcast(&db->db_changed);
-
-		ASSERT(db->db_dirtycnt > 0);
-		db->db_dirtycnt -= 1;
-		mutex_exit(&db->db_mtx);
-		dbuf_rele(db, (void *)(uintptr_t)txg);
-		return;
+		zio = dn->dn_zio;
 	}
 
-	if (db->db_level > 0) {
-		/*
-		 * XXX -- we should design a compression algorithm
-		 * that specializes in arrays of bps.
-		 */
-		checksum = ZIO_CHECKSUM_FLETCHER_4;
-		if (zfs_mdcomp_disable)
-			compress = ZIO_COMPRESS_EMPTY;
-		else
-			compress = ZIO_COMPRESS_LZJB;
-	} else {
-		/*
-		 * Allow dnode settings to override objset settings,
-		 * except for metadata checksums.
-		 */
-		if (dmu_ot[dn->dn_type].ot_metadata) {
-			checksum = os->os_md_checksum;
-			compress = zio_compress_select(dn->dn_compress,
-			    os->os_md_compress);
-		} else {
-			checksum = zio_checksum_select(dn->dn_checksum,
-			    os->os_checksum);
-			compress = zio_compress_select(dn->dn_compress,
-			    os->os_compress);
-		}
-	}
-#ifdef ZFS_DEBUG
-	if (db->db_parent) {
-		ASSERT(list_link_active(
-		    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
-		ASSERT(db->db_parent == dn->dn_dbuf ||
-		    db->db_parent->db_level > 0);
-		if (dn->dn_object == DMU_META_DNODE_OBJECT || db->db_level > 0)
-			ASSERT(*data == db->db_buf);
-	}
-#endif
-	ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
+	ASSERT(db->db_level == 0 || data == db->db_buf);
+	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+	ASSERT(zio);
+
 	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
 	zb.zb_object = db->db.db_object;
 	zb.zb_level = db->db_level;
 	zb.zb_blkid = db->db_blkid;
 
-	(void) arc_write(zio, os->os_spa, checksum, compress,
-	    dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg,
-	    db->db_blkptr, *data, dbuf_write_done, db,
-	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb);
-	/*
-	 * We can't access db after arc_write, since it could finish
-	 * and be freed, and we have no locks on it.
-	 */
-}
-
-struct dbuf_arg {
-	objset_impl_t *os;
-	blkptr_t bp;
-};
+	if (BP_IS_OLDER(db->db_blkptr, txg))
+		dsl_dataset_block_kill(
+		    os->os_dsl_dataset, db->db_blkptr, zio, tx);
 
-static void
-dbuf_do_born(void *arg)
-{
-	struct dbuf_arg *da = arg;
-	dsl_dataset_block_born(da->os->os_dsl_dataset,
-	    &da->bp, da->os->os_synctx);
-	kmem_free(da, sizeof (struct dbuf_arg));
-}
-
-static void
-dbuf_do_kill(void *arg)
-{
-	struct dbuf_arg *da = arg;
-	dsl_dataset_block_kill(da->os->os_dsl_dataset,
-	    &da->bp, da->os->os_synctx);
-	kmem_free(da, sizeof (struct dbuf_arg));
+	dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
+	    dmu_get_replication_level(os->os_spa, &zb, dn->dn_type), txg,
+	    db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
+	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 }
 
 /* ARGSUSED */
 static void
-dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 	dnode_t *dn = db->db_dnode;
 	objset_impl_t *os = dn->dn_objset;
-	uint64_t txg = zio->io_txg;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
 	uint64_t fill = 0;
-	int i;
-	int old_size, new_size;
+	int old_size, new_size, i;
 
-	ASSERT3U(zio->io_error, ==, 0);
+	dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
 
-	dprintf_dbuf_bp(db, &zio->io_bp_orig, "bp_orig: %s", "");
-
-	old_size = bp_get_dasize(os->os_spa, &zio->io_bp_orig);
+	old_size = bp_get_dasize(os->os_spa, bp_orig);
 	new_size = bp_get_dasize(os->os_spa, zio->io_bp);
 
 	dnode_diduse_space(dn, new_size-old_size);
 
-	mutex_enter(&db->db_mtx);
+	if (BP_IS_HOLE(zio->io_bp)) {
+		dsl_dataset_t *ds = os->os_dsl_dataset;
+		dmu_tx_t *tx = os->os_synctx;
 
-	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL);
+		if (bp_orig->blk_birth == tx->tx_txg)
+			dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
+		return;
+	}
 
-	if (db->db_dirtied == txg)
-		db->db_dirtied = 0;
+	mutex_enter(&db->db_mtx);
 
 	if (db->db_level == 0) {
-		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
-
-		ASSERT(db->db_blkid != DB_BONUS_BLKID);
-
-		if (*old != db->db_buf)
-			VERIFY(arc_buf_remove_ref(*old, db) == 1);
-		else if (!BP_IS_HOLE(db->db_blkptr))
-			arc_set_callback(db->db_buf, dbuf_do_evict, db);
-		else
-			ASSERT(arc_released(db->db_buf));
-		*old = NULL;
-		db->db_data_pending = NULL;
-
 		mutex_enter(&dn->dn_mtx);
-		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
-		    !BP_IS_HOLE(db->db_blkptr))
+		if (db->db_blkid > dn->dn_phys->dn_maxblkid)
 			dn->dn_phys->dn_maxblkid = db->db_blkid;
 		mutex_exit(&dn->dn_mtx);
 
@@ -2184,22 +2137,11 @@
 					fill++;
 			}
 		} else {
-			if (!BP_IS_HOLE(db->db_blkptr))
-				fill = 1;
+			fill = 1;
 		}
 	} else {
 		blkptr_t *bp = db->db.db_data;
 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
-		if (!BP_IS_HOLE(db->db_blkptr)) {
-			int epbs =
-			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
-			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, db->db.db_size);
-			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
-			    db->db.db_size);
-			ASSERT3U(dn->dn_phys->dn_maxblkid
-			    >> (db->db_level * epbs), >=, db->db_blkid);
-			arc_set_callback(db->db_buf, dbuf_do_evict, db);
-		}
 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
 			if (BP_IS_HOLE(bp))
 				continue;
@@ -2210,40 +2152,78 @@
 		}
 	}
 
-	if (!BP_IS_HOLE(db->db_blkptr)) {
-		db->db_blkptr->blk_fill = fill;
-		BP_SET_TYPE(db->db_blkptr, dn->dn_type);
-		BP_SET_LEVEL(db->db_blkptr, db->db_level);
+	db->db_blkptr->blk_fill = fill;
+	BP_SET_TYPE(db->db_blkptr, dn->dn_type);
+	BP_SET_LEVEL(db->db_blkptr, db->db_level);
+
+	mutex_exit(&db->db_mtx);
+
+	/* We must do this after we've set the bp's type and level */
+	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
+		dsl_dataset_t *ds = os->os_dsl_dataset;
+		dmu_tx_t *tx = os->os_synctx;
+
+		if (bp_orig->blk_birth == tx->tx_txg)
+			dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
+		dsl_dataset_block_born(ds, zio->io_bp, tx);
+	}
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+	uint64_t txg = zio->io_txg;
+	dbuf_dirty_record_t **drp, *dr;
+
+	ASSERT3U(zio->io_error, ==, 0);
+
+	mutex_enter(&db->db_mtx);
+
+	drp = &db->db_last_dirty;
+	while (*drp != db->db_data_pending)
+		drp = &(*drp)->dr_next;
+	ASSERT(!list_link_active(&(*drp)->dr_dirty_node));
+	ASSERT((*drp)->dr_txg == txg);
+	ASSERT((*drp)->dr_next == NULL);
+	dr = *drp;
+	*drp = NULL;
+
+	if (db->db_level == 0) {
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+
+		if (dr->dt.dl.dr_data != db->db_buf)
+			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1);
+		else if (!BP_IS_HOLE(db->db_blkptr))
+			arc_set_callback(db->db_buf, dbuf_do_evict, db);
+		else
+			ASSERT(arc_released(db->db_buf));
 	} else {
-		ASSERT3U(fill, ==, 0);
-		ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
-	}
+		dnode_t *dn = db->db_dnode;
 
-	dprintf_dbuf_bp(db, db->db_blkptr,
-	    "wrote %llu bytes to blkptr:", zio->io_size);
+		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+		if (!BP_IS_HOLE(db->db_blkptr)) {
+			int epbs =
+			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+			    db->db.db_size);
+			ASSERT3U(dn->dn_phys->dn_maxblkid
+			    >> (db->db_level * epbs), >=, db->db_blkid);
+			arc_set_callback(db->db_buf, dbuf_do_evict, db);
+		}
+	}
+	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
-	ASSERT(db->db_parent == NULL ||
-	    list_link_active(&db->db_parent->db_dirty_node[txg&TXG_MASK]));
 	cv_broadcast(&db->db_changed);
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
+	db->db_data_pending = NULL;
 	mutex_exit(&db->db_mtx);
 
-	/* We must do this after we've set the bp's type and level */
-	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
-	    BP_IDENTITY(&zio->io_bp_orig))) {
-		struct dbuf_arg *da;
-		da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
-		da->os = os;
-		da->bp = *zio->io_bp;
-		(void) taskq_dispatch(dbuf_tq, dbuf_do_born, da, 0);
-		if (!BP_IS_HOLE(&zio->io_bp_orig)) {
-			da = kmem_alloc(sizeof (struct dbuf_arg), KM_SLEEP);
-			da->os = os;
-			da->bp = zio->io_bp_orig;
-			(void) taskq_dispatch(dbuf_tq, dbuf_do_kill, da, 0);
-		}
-	}
+	dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", "");
 
 	dbuf_rele(db, (void *)(uintptr_t)txg);
 }
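
With the per-txg db_dirty_node[]/db_data_old[] arrays gone, a dbuf's dirty
state now lives in a singly linked list of dbuf_dirty_record_t hanging off
db_last_dirty, newest txg first.  A minimal sketch of the lookup pattern that
dmu_sync() uses (dbuf_write_done() above walks the same list); the helper
name is hypothetical, the fields are from this changeset:

	static dbuf_dirty_record_t *
	dbuf_find_dirty(dmu_buf_impl_t *db, uint64_t txg)
	{
		dbuf_dirty_record_t *dr = db->db_last_dirty;

		/* records are ordered newest-to-oldest by dr_txg */
		while (dr != NULL && dr->dr_txg > txg)
			dr = dr->dr_next;
		return ((dr != NULL && dr->dr_txg == txg) ? dr : NULL);
	}
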
--- a/usr/src/uts/common/fs/zfs/dmu.c	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Fri Feb 02 15:36:58 2007 -0800
@@ -567,27 +567,19 @@
 #endif
 
 typedef struct {
-	uint64_t	txg;
-	dmu_buf_impl_t	*db;
-	dmu_sync_cb_t	*done;
-	void		*arg;
-} dmu_sync_cbin_t;
-
-typedef union {
-	dmu_sync_cbin_t	data;
-	blkptr_t	blk;
-} dmu_sync_cbarg_t;
+	dbuf_dirty_record_t	*dr;
+	dmu_sync_cb_t		*done;
+	void			*arg;
+} dmu_sync_arg_t;
 
 /* ARGSUSED */
 static void
 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 {
-	dmu_sync_cbin_t *in = (dmu_sync_cbin_t *)varg;
-	dmu_buf_impl_t *db = in->db;
-	uint64_t txg = in->txg;
+	dmu_sync_arg_t *in = varg;
+	dbuf_dirty_record_t *dr = in->dr;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dmu_sync_cb_t *done = in->done;
-	void *arg = in->arg;
-	blkptr_t *blk = (blkptr_t *)varg;
 
 	if (!BP_IS_HOLE(zio->io_bp)) {
 		zio->io_bp->blk_fill = 1;
@@ -595,16 +587,17 @@
 		BP_SET_LEVEL(zio->io_bp, 0);
 	}
 
-	*blk = *zio->io_bp; /* structure assignment */
-
 	mutex_enter(&db->db_mtx);
-	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC);
-	db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
+	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
+	dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
+	dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
 
 	if (done)
-		done(&(db->db), arg);
+		done(&(db->db), in->arg);
+
+	kmem_free(in, sizeof (dmu_sync_arg_t));
 }
 
 /*
@@ -637,10 +630,10 @@
 	objset_impl_t *os = db->db_objset;
 	dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool;
 	tx_state_t *tx = &dp->dp_tx;
-	dmu_sync_cbin_t *in;
-	blkptr_t *blk;
+	dbuf_dirty_record_t *dr;
+	dmu_sync_arg_t *in;
 	zbookmark_t zb;
-	uint32_t arc_flag;
+	zio_t *zio;
 	int err;
 
 	ASSERT(BP_IS_HOLE(bp));
@@ -674,25 +667,6 @@
 
 	mutex_enter(&db->db_mtx);
 
-	blk = db->db_d.db_overridden_by[txg&TXG_MASK];
-	if (blk == IN_DMU_SYNC) {
-		/*
-		 * We have already issued a sync write for this buffer.
-		 */
-		mutex_exit(&db->db_mtx);
-		txg_resume(dp);
-		return (EALREADY);
-	} else if (blk != NULL) {
-		/*
-		 * This buffer had already been synced.  It could not
-		 * have been dirtied since, or we would have cleared blk.
-		 */
-		*bp = *blk; /* structure assignment */
-		mutex_exit(&db->db_mtx);
-		txg_resume(dp);
-		return (0);
-	}
-
 	if (txg == tx->tx_syncing_txg) {
 		while (db->db_data_pending) {
 			/*
@@ -726,7 +700,10 @@
 		}
 	}
 
-	if (db->db_d.db_data_old[txg&TXG_MASK] == NULL) {
+	dr = db->db_last_dirty;
+	while (dr && dr->dr_txg > txg)
+		dr = dr->dr_next;
+	if (dr == NULL || dr->dr_txg < txg) {
 		/*
 		 * This dbuf isn't dirty, must have been free_range'd.
 		 * There's no need to log writes to freed blocks, so we're done.
@@ -736,35 +713,52 @@
 		return (ENOENT);
 	}
 
-	ASSERT(db->db_d.db_overridden_by[txg&TXG_MASK] == NULL);
-	db->db_d.db_overridden_by[txg&TXG_MASK] = IN_DMU_SYNC;
-	/*
-	 * XXX - a little ugly to stash the blkptr in the callback
-	 * buffer.  We always need to make sure the following is true:
-	 * ASSERT(sizeof(blkptr_t) >= sizeof(dmu_sync_cbin_t));
-	 */
-	in = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
-	in->db = db;
-	in->txg = txg;
+	ASSERT(dr->dr_txg == txg);
+	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+		/*
+		 * We have already issued a sync write for this buffer.
+		 */
+		mutex_exit(&db->db_mtx);
+		txg_resume(dp);
+		return (EALREADY);
+	} else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+		/*
+		 * This buffer has already been synced.  It could not
+		 * have been dirtied since, or we would have cleared the state.
+		 */
+		*bp = dr->dt.dl.dr_overridden_by; /* structure assignment */
+		mutex_exit(&db->db_mtx);
+		txg_resume(dp);
+		return (0);
+	}
+
+	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+	in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+	in->dr = dr;
 	in->done = done;
 	in->arg = arg;
 	mutex_exit(&db->db_mtx);
 	txg_resume(dp);
 
-	arc_flag = pio == NULL ? ARC_WAIT : ARC_NOWAIT;
 	zb.zb_objset = os->os_dsl_dataset->ds_object;
 	zb.zb_object = db->db.db_object;
 	zb.zb_level = db->db_level;
 	zb.zb_blkid = db->db_blkid;
-	err = arc_write(pio, os->os_spa,
+	zio = arc_write(pio, os->os_spa,
 	    zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
 	    zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
 	    dmu_get_replication_level(os->os_spa, &zb, db->db_dnode->dn_type),
-	    txg, bp, db->db_d.db_data_old[txg&TXG_MASK], dmu_sync_done, in,
-	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, arc_flag, &zb);
-	ASSERT(err == 0);
+	    txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
+	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 
-	return (arc_flag == ARC_NOWAIT ? EINPROGRESS : 0);
+	if (pio) {
+		zio_nowait(zio);
+		err = EINPROGRESS;
+	} else {
+		err = zio_wait(zio);
+		ASSERT(err == 0);
+	}
+	return (err);
 }
 
 int
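
Since arc_write() now returns the zio instead of taking an ARC_WAIT/ARC_NOWAIT
flag, the caller decides whether to block.  A sketch of the resulting pattern
(fragment; ready_func, done_func and cb_arg are placeholders, the rest follows
the dmu_sync() hunk above):

	zio = arc_write(pio, spa, checksum, compress, ncopies, txg, bp,
	    buf, ready_func, done_func, cb_arg,
	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
	if (pio != NULL) {
		zio_nowait(zio);	/* the parent zio tracks completion */
		err = EINPROGRESS;
	} else {
		err = zio_wait(zio);	/* no parent; block for the result */
	}
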
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -139,10 +139,8 @@
 	osi->os.os = osi;
 	osi->os_dsl_dataset = ds;
 	osi->os_spa = spa;
-	if (bp)
-		osi->os_rootbp = *bp;
-	osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t));
-	if (!BP_IS_HOLE(&osi->os_rootbp)) {
+	osi->os_rootbp = bp;
+	if (!BP_IS_HOLE(osi->os_rootbp)) {
 		uint32_t aflags = ARC_WAIT;
 		zbookmark_t zb;
 		zb.zb_objset = ds ? ds->ds_object : 0;
@@ -150,17 +148,21 @@
 		zb.zb_level = -1;
 		zb.zb_blkid = 0;
 
-		dprintf_bp(&osi->os_rootbp, "reading %s", "");
-		err = arc_read(NULL, spa, &osi->os_rootbp,
+		dprintf_bp(osi->os_rootbp, "reading %s", "");
+		err = arc_read(NULL, spa, osi->os_rootbp,
 		    dmu_ot[DMU_OT_OBJSET].ot_byteswap,
-		    arc_bcopy_func, osi->os_phys,
+		    arc_getbuf_func, &osi->os_phys_buf,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 		if (err) {
-			zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
 			kmem_free(osi, sizeof (objset_impl_t));
 			return (err);
 		}
+		osi->os_phys = osi->os_phys_buf->b_data;
+		arc_release(osi->os_phys_buf, &osi->os_phys_buf);
 	} else {
+		osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t),
+		    &osi->os_phys_buf, ARC_BUFC_METADATA);
+		osi->os_phys = osi->os_phys_buf->b_data;
 		bzero(osi->os_phys, sizeof (objset_phys_t));
 	}
 
@@ -177,7 +179,8 @@
 			err = dsl_prop_register(ds, "compression",
 			    compression_changed_cb, osi);
 		if (err) {
-			zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
+			VERIFY(arc_buf_remove_ref(osi->os_phys_buf,
+			    &osi->os_phys_buf) == 1);
 			kmem_free(osi, sizeof (objset_impl_t));
 			return (err);
 		}
@@ -252,11 +255,8 @@
 
 	osi = dsl_dataset_get_user_ptr(ds);
 	if (osi == NULL) {
-		blkptr_t bp;
-
-		dsl_dataset_get_blkptr(ds, &bp);
 		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
-		    ds, &bp, &osi);
+		    ds, &ds->ds_phys->ds_bp, &osi);
 		if (err) {
 			dsl_dataset_close(ds, mode, os);
 			kmem_free(os, sizeof (objset_t));
@@ -364,7 +364,7 @@
 	dnode_special_close(osi->os_meta_dnode);
 	zil_free(osi->os_zil);
 
-	zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
+	VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1);
 	mutex_destroy(&osi->os_lock);
 	mutex_destroy(&osi->os_obj_lock);
 	kmem_free(osi, sizeof (objset_impl_t));
@@ -372,14 +372,14 @@
 
 /* called from dsl for meta-objset */
 objset_impl_t *
-dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type,
-    dmu_tx_t *tx)
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+    dmu_objset_type_t type, dmu_tx_t *tx)
 {
 	objset_impl_t *osi;
 	dnode_t *mdn;
 
 	ASSERT(dmu_tx_is_syncing(tx));
-	VERIFY(0 == dmu_objset_open_impl(spa, ds, NULL, &osi));
+	VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi));
 	mdn = osi->os_meta_dnode;
 
 	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
@@ -467,7 +467,7 @@
 	dsl_dir_t *dd = arg1;
 	struct oscarg *oa = arg2;
 	dsl_dataset_t *ds;
-	blkptr_t bp;
+	blkptr_t *bp;
 	uint64_t dsobj;
 
 	ASSERT(dmu_tx_is_syncing(tx));
@@ -477,13 +477,13 @@
 
 	VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, dsobj, NULL,
 	    DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds));
-	dsl_dataset_get_blkptr(ds, &bp);
-	if (BP_IS_HOLE(&bp)) {
+	bp = dsl_dataset_get_blkptr(ds);
+	if (BP_IS_HOLE(bp)) {
 		objset_impl_t *osi;
 
 		/* This is an empty dmu_objset; not a clone. */
 		osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds),
-		    ds, oa->type, tx);
+		    ds, bp, oa->type, tx);
 
 		if (oa->userfunc)
 			oa->userfunc(&osi->os, oa->userarg, tx);
@@ -660,41 +660,41 @@
 }
 
 static void
-dmu_objset_sync_dnodes(objset_impl_t *os, list_t *list, dmu_tx_t *tx)
+dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx)
 {
-	dnode_t *dn = list_head(list);
-	int level, err;
+	dnode_t *dn;
 
-	for (level = 0; dn = list_head(list); level++) {
-		zio_t *zio;
-		zio = zio_root(os->os_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
-
-		ASSERT3U(level, <=, DN_MAX_LEVELS);
-
-		while (dn) {
-			dnode_t *next = list_next(list, dn);
+	while (dn = list_head(list)) {
+		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+		ASSERT(dn->dn_dbuf->db_data_pending);
+		/*
+		 * Initialize dn_zio outside dnode_sync()
+		 * to accommodate the meta-dnode
+		 */
+		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
+		ASSERT(dn->dn_zio);
 
-			list_remove(list, dn);
-			if (dnode_sync(dn, level, zio, tx) == 0) {
-				/*
-				 * This dnode requires syncing at higher
-				 * levels; put it back onto the list.
-				 */
-				if (next)
-					list_insert_before(list, next, dn);
-				else
-					list_insert_tail(list, dn);
-			}
-			dn = next;
-		}
+		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
+		list_remove(list, dn);
+		dnode_sync(dn, tx);
+	}
+}
 
-		DTRACE_PROBE1(wait__begin, zio_t *, zio);
-		err = zio_wait(zio);
-		DTRACE_PROBE4(wait__end, zio_t *, zio,
-		    uint64_t, tx->tx_txg, objset_impl_t *, os, int, level);
+/* ARGSUSED */
+static void
+ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+	objset_impl_t *os = arg;
+	blkptr_t *bp = os->os_rootbp;
+	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
+	int i;
 
-		ASSERT(err == 0);
-	}
+	/*
+	 * Update rootbp fill count.
+	 */
+	bp->blk_fill = 1;	/* count the meta-dnode */
+	for (i = 0; i < dnp->dn_nblkptr; i++)
+		bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
 }
 
 /* ARGSUSED */
@@ -702,90 +702,81 @@
 killer(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
 	objset_impl_t *os = arg;
-	objset_phys_t *osphys = zio->io_data;
-	dnode_phys_t *dnp = &osphys->os_meta_dnode;
-	int i;
 
 	ASSERT3U(zio->io_error, ==, 0);
 
-	/*
-	 * Update rootbp fill count.
-	 */
-	os->os_rootbp.blk_fill = 1;	/* count the meta-dnode */
-	for (i = 0; i < dnp->dn_nblkptr; i++)
-		os->os_rootbp.blk_fill += dnp->dn_blkptr[i].blk_fill;
-
 	BP_SET_TYPE(zio->io_bp, DMU_OT_OBJSET);
 	BP_SET_LEVEL(zio->io_bp, 0);
 
 	if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp),
 	    BP_IDENTITY(&zio->io_bp_orig))) {
-		dsl_dataset_block_kill(os->os_dsl_dataset, &zio->io_bp_orig,
-		    os->os_synctx);
+		if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg)
+			dsl_dataset_block_kill(os->os_dsl_dataset,
+			    &zio->io_bp_orig, NULL, os->os_synctx);
 		dsl_dataset_block_born(os->os_dsl_dataset, zio->io_bp,
 		    os->os_synctx);
 	}
+	arc_release(os->os_phys_buf, &os->os_phys_buf);
+
+	if (os->os_dsl_dataset)
+		dmu_buf_rele(os->os_dsl_dataset->ds_dbuf, os->os_dsl_dataset);
 }
 
 /* called from dsl */
 void
-dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx)
+dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx)
 {
-	extern taskq_t *dbuf_tq;
 	int txgoff;
-	list_t *dirty_list;
-	int err;
 	zbookmark_t zb;
-	arc_buf_t *abuf =
-	    arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG,
-		ARC_BUFC_METADATA);
+	zio_t *zio;
+	list_t *list;
+	dbuf_dirty_record_t *dr;
+
+	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
 
 	ASSERT(dmu_tx_is_syncing(tx));
-	ASSERT(os->os_synctx == NULL);
 	/* XXX the write_done callback should really give us the tx... */
 	os->os_synctx = tx;
 
-	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
+	/*
+	 * Create the root block IO
+	 */
+	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+	zb.zb_object = 0;
+	zb.zb_level = -1;
+	zb.zb_blkid = 0;
+	if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg))
+		dsl_dataset_block_kill(os->os_dsl_dataset,
+		    os->os_rootbp, pio, tx);
+	zio = arc_write(pio, os->os_spa, os->os_md_checksum,
+	    os->os_md_compress,
+	    dmu_get_replication_level(os->os_spa, &zb, DMU_OT_OBJSET),
+	    tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, killer, os,
+	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+
+	/*
+	 * Sync the meta-dnode; the root block IO is the parent for the sync
+	 */
+	os->os_meta_dnode->dn_zio = zio;
+	dnode_sync(os->os_meta_dnode, tx);
 
 	txgoff = tx->tx_txg & TXG_MASK;
 
-	dmu_objset_sync_dnodes(os, &os->os_free_dnodes[txgoff], tx);
-	dmu_objset_sync_dnodes(os, &os->os_dirty_dnodes[txgoff], tx);
+	dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx);
+	dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx);
 
+	list = &os->os_meta_dnode->dn_dirty_records[txgoff];
+	while (dr = list_head(list)) {
+		ASSERT(dr->dr_dbuf->db_level == 0);
+		list_remove(list, dr);
+		if (dr->dr_zio)
+			zio_nowait(dr->dr_zio);
+	}
 	/*
 	 * Free intent log blocks up to this tx.
 	 */
 	zil_sync(os->os_zil, tx);
-
-	/*
-	 * Sync meta-dnode
-	 */
-	dirty_list = &os->os_dirty_dnodes[txgoff];
-	ASSERT(list_head(dirty_list) == NULL);
-	list_insert_tail(dirty_list, os->os_meta_dnode);
-	dmu_objset_sync_dnodes(os, dirty_list, tx);
-
-	/*
-	 * Sync the root block.
-	 */
-	bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t));
-	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
-	zb.zb_object = 0;
-	zb.zb_level = -1;
-	zb.zb_blkid = 0;
-	err = arc_write(NULL, os->os_spa, os->os_md_checksum,
-	    os->os_md_compress,
-	    dmu_get_replication_level(os->os_spa, &zb, DMU_OT_OBJSET),
-	    tx->tx_txg, &os->os_rootbp, abuf, killer, os,
-	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
-	ASSERT(err == 0);
-	VERIFY(arc_buf_remove_ref(abuf, FTAG) == 1);
-
-	dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx);
-
-	ASSERT3P(os->os_synctx, ==, tx);
-	taskq_wait(dbuf_tq);
-	os->os_synctx = NULL;
+	zio_nowait(zio);
 }
 
 void
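
The net effect is that dmu_objset_sync() now builds an explicit IO dependency
tree instead of draining each level with zio_wait() in
dmu_objset_sync_dnodes().  The shape, roughly (illustrative, derived from the
hunks above):

	pio (root zio passed down from the DSL)
	  arc_write() of the objset phys block (os_rootbp)
	    meta-dnode sync (os_meta_dnode->dn_zio = zio)
	      dr_zio of each dirty dbuf, issued with zio_nowait()

ready() fixes up the rootbp fill count once all children are ready; killer()
does the block kill/born accounting when the write completes.
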
--- a/usr/src/uts/common/fs/zfs/dmu_send.c	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c	Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -382,7 +382,7 @@
 	    DS_MODE_EXCLUSIVE, FTAG, &ds));
 
 	(void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
-	    ds, drrb->drr_type, tx);
+	    ds, &ds->ds_phys->ds_bp, drrb->drr_type, tx);
 
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 	ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
--- a/usr/src/uts/common/fs/zfs/dnode.c	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dnode.c	Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -65,9 +65,9 @@
 		avl_create(&dn->dn_ranges[i], free_range_compar,
 		    sizeof (free_range_t),
 		    offsetof(struct free_range, fr_node));
-		list_create(&dn->dn_dirty_dbufs[i],
-		    sizeof (dmu_buf_impl_t),
-		    offsetof(dmu_buf_impl_t, db_dirty_node[i]));
+		list_create(&dn->dn_dirty_records[i],
+		    sizeof (dbuf_dirty_record_t),
+		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
 
 	list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
@@ -91,7 +91,7 @@
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		avl_destroy(&dn->dn_ranges[i]);
-		list_destroy(&dn->dn_dirty_dbufs[i]);
+		list_destroy(&dn->dn_dirty_records[i]);
 	}
 
 	list_destroy(&dn->dn_dbufs);
@@ -296,7 +296,7 @@
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-		ASSERT(NULL == list_head(&dn->dn_dirty_dbufs[i]));
+		ASSERT(NULL == list_head(&dn->dn_dirty_records[i]));
 		ASSERT(0 == avl_numnodes(&dn->dn_ranges[i]));
 	}
 	ASSERT(NULL == list_head(&dn->dn_dbufs));
@@ -362,7 +362,7 @@
 		ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
 		ASSERT3U(dn->dn_next_blksz[i], ==, 0);
 		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-		ASSERT3P(list_head(&dn->dn_dirty_dbufs[i]), ==, NULL);
+		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
 		ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
 	}
 
@@ -461,7 +461,7 @@
 		ASSERT(db->db.db_data != NULL);
 		db->db.db_size = bonuslen;
 		mutex_exit(&db->db_mtx);
-		dbuf_dirty(db, tx);
+		(void) dbuf_dirty(db, tx);
 	}
 
 	/* change bonus size and type */
@@ -714,7 +714,7 @@
 	 */
 	dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
 
-	dbuf_dirty(dn->dn_dbuf, tx);
+	(void) dbuf_dirty(dn->dn_dbuf, tx);
 
 	dsl_dataset_dirty(os->os_dsl_dataset, tx);
 }
@@ -855,17 +855,35 @@
 	if (new_nlevels > dn->dn_nlevels) {
 		int old_nlevels = dn->dn_nlevels;
 		dmu_buf_impl_t *db;
+		list_t *list;
+		dbuf_dirty_record_t *new, *dr, *dr_next;
 
 		dn->dn_nlevels = new_nlevels;
 
 		ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
 		dn->dn_next_nlevels[txgoff] = new_nlevels;
 
-		/* Dirty the left indirects.  */
+		/* dirty the left indirects */
 		db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
-		dbuf_dirty(db, tx);
+		new = dbuf_dirty(db, tx);
 		dbuf_rele(db, FTAG);
 
+		/* transfer the dirty records to the new indirect */
+		mutex_enter(&dn->dn_mtx);
+		mutex_enter(&new->dt.di.dr_mtx);
+		list = &dn->dn_dirty_records[txgoff];
+		for (dr = list_head(list); dr; dr = dr_next) {
+			dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
+			if (dr->dr_dbuf->db_level != new_nlevels-1 &&
+			    dr->dr_dbuf->db_blkid != DB_BONUS_BLKID) {
+				ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
+				list_remove(&dn->dn_dirty_records[txgoff], dr);
+				list_insert_tail(&new->dt.di.dr_children, dr);
+				dr->dr_parent = new;
+			}
+		}
+		mutex_exit(&new->dt.di.dr_mtx);
+		mutex_exit(&dn->dn_mtx);
 	}
 
 out:
@@ -973,7 +991,7 @@
 			caddr_t data;
 
 			/* don't dirty if it isn't on disk and isn't dirty */
-			if (db->db_dirtied ||
+			if (db->db_last_dirty ||
 			    (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
 				rw_exit(&dn->dn_struct_rwlock);
 				dbuf_will_dirty(db, tx);
@@ -1023,7 +1041,7 @@
 			if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
 			    TRUE, FTAG, &db) == 0) {
 				/* don't dirty if not on disk and not dirty */
-				if (db->db_dirtied ||
+				if (db->db_last_dirty ||
 				    (db->db_blkptr &&
 				    !BP_IS_HOLE(db->db_blkptr))) {
 					rw_exit(&dn->dn_struct_rwlock);
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c	Fri Feb 02 15:36:58 2007 -0800
@@ -33,78 +33,81 @@
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/spa.h>
-#include <sys/zio.h>
 
 static void
 dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db;
+	int txgoff = tx->tx_txg & TXG_MASK;
+	int nblkptr = dn->dn_phys->dn_nblkptr;
+	int old_toplvl = dn->dn_phys->dn_nlevels - 1;
+	int new_level = dn->dn_next_nlevels[txgoff];
 	int i;
-	uint64_t txg = tx->tx_txg;
 
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+	/* this dnode can't be paged out because it's dirty */
 	ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
-	/* this dnode can't be paged out because it's dirty */
+	ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
 
 	db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
 	ASSERT(db != NULL);
-	for (i = 0; i < dn->dn_phys->dn_nblkptr; i++)
-		if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
-			break;
-	if (i != dn->dn_phys->dn_nblkptr) {
-		ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]));
 
-		(void) dbuf_read(db, NULL,
-		    DB_RF_HAVESTRUCT | DB_RF_MUST_SUCCEED);
-		arc_release(db->db_buf, db);
-		/* copy dnode's block pointers to new indirect block */
-		ASSERT3U(sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr, <=,
-		    db->db.db_size);
-		bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
-		    sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
-		arc_buf_freeze(db->db_buf);
-	}
-
-	dn->dn_phys->dn_nlevels += 1;
+	dn->dn_phys->dn_nlevels = new_level;
 	dprintf("os=%p obj=%llu, increase to %d\n",
 		dn->dn_objset, dn->dn_object,
 		dn->dn_phys->dn_nlevels);
 
+	/* check for existing blkptrs in the dnode */
+	for (i = 0; i < nblkptr; i++)
+		if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
+			break;
+	if (i != nblkptr) {
+		/* transfer dnode's block pointers to new indirect block */
+		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+		ASSERT(db->db.db_data);
+		ASSERT(arc_released(db->db_buf));
+		ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
+		bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+		    sizeof (blkptr_t) * nblkptr);
+		arc_buf_freeze(db->db_buf);
+	}
+
 	/* set dbuf's parent pointers to new indirect buf */
-	for (i = 0; i < dn->dn_phys->dn_nblkptr; i++) {
-		dmu_buf_impl_t *child =
-		    dbuf_find(dn, dn->dn_phys->dn_nlevels-2, i);
+	for (i = 0; i < nblkptr; i++) {
+		dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
+
 		if (child == NULL)
 			continue;
-		if (child->db_dnode == NULL) {
+		ASSERT3P(child->db_dnode, ==, dn);
+		if (child->db_parent && child->db_parent != dn->dn_dbuf) {
+			ASSERT(child->db_parent->db_level == db->db_level);
+			ASSERT(child->db_blkptr !=
+			    &dn->dn_phys->dn_blkptr[child->db_blkid]);
 			mutex_exit(&child->db_mtx);
 			continue;
 		}
+		ASSERT(child->db_parent == NULL ||
+		    child->db_parent == dn->dn_dbuf);
 
-		if (child->db_parent == NULL ||
-		    child->db_parent == dn->dn_dbuf) {
-			dprintf_dbuf_bp(child, child->db_blkptr,
-			    "changing db_blkptr to new indirect %s", "");
-			child->db_parent = db;
-			dbuf_add_ref(db, child);
-			if (db->db.db_data) {
-				child->db_blkptr =
-				    (blkptr_t *)db->db.db_data + i;
-			} else {
-				child->db_blkptr = NULL;
-			}
-			dprintf_dbuf_bp(child, child->db_blkptr,
-			    "changed db_blkptr to new indirect %s", "");
-		}
-		ASSERT3P(child->db_parent, ==, db);
+		child->db_parent = db;
+		dbuf_add_ref(db, child);
+		if (db->db.db_data)
+			child->db_blkptr = (blkptr_t *)db->db.db_data + i;
+		else
+			child->db_blkptr = NULL;
+		dprintf_dbuf_bp(child, child->db_blkptr,
+		    "changed db_blkptr to new indirect %s", "");
 
 		mutex_exit(&child->db_mtx);
 	}
 
-	bzero(dn->dn_phys->dn_blkptr,
-		sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
+	bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
 
 	dbuf_rele(db, FTAG);
+
+	rw_exit(&dn->dn_struct_rwlock);
 }
 
 static void
@@ -122,7 +125,8 @@
 
 		bytesfreed += bp_get_dasize(os->os_spa, bp);
 		ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
-		dsl_dataset_block_kill(os->os_dsl_dataset, bp, tx);
+		dsl_dataset_block_kill(os->os_dsl_dataset, bp, dn->dn_zio, tx);
+		bzero(bp, sizeof (blkptr_t));
 	}
 	dnode_diduse_space(dn, -bytesfreed);
 }
@@ -148,8 +152,9 @@
 
 	for (i = off; i < off+num; i++) {
 		uint64_t *buf;
+		dmu_buf_impl_t *child;
+		dbuf_dirty_record_t *dr;
 		int j;
-		dmu_buf_impl_t *child;
 
 		ASSERT(db->db_level == 1);
 
@@ -161,11 +166,14 @@
 			continue;
 		ASSERT(err == 0);
 		ASSERT(child->db_level == 0);
-		ASSERT(!list_link_active(&child->db_dirty_node[txg&TXG_MASK]));
+		dr = child->db_last_dirty;
+		while (dr && dr->dr_txg > txg)
+			dr = dr->dr_next;
+		ASSERT(dr == NULL || dr->dr_txg == txg);
 
-		/* db_data_old better be zeroed */
-		if (child->db_d.db_data_old[txg & TXG_MASK]) {
-			buf = child->db_d.db_data_old[txg & TXG_MASK]->b_data;
+		/* data_old better be zeroed */
+		if (dr) {
+			buf = dr->dt.dl.dr_data->b_data;
 			for (j = 0; j < child->db.db_size >> 3; j++) {
 				if (buf[j] != 0) {
 					panic("freed data not zero: "
@@ -182,10 +190,7 @@
 		mutex_enter(&child->db_mtx);
 		buf = child->db.db_data;
 		if (buf != NULL && child->db_state != DB_FILL &&
-		    !list_link_active(&child->db_dirty_node
-			[(txg+1) & TXG_MASK]) &&
-		    !list_link_active(&child->db_dirty_node
-			[(txg+2) & TXG_MASK])) {
+		    child->db_last_dirty == NULL) {
 			for (j = 0; j < child->db.db_size >> 3; j++) {
 				if (buf[j] != 0) {
 					panic("freed data not zero: "
@@ -210,7 +215,6 @@
 	dmu_buf_impl_t *subdb;
 	uint64_t start, end, dbstart, dbend, i;
 	int epbs, shift, err;
-	int txgoff = tx->tx_txg & TXG_MASK;
 	int all = TRUE;
 
 	(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
@@ -239,7 +243,7 @@
 		FREE_VERIFY(db, start, end, tx);
 		free_blocks(dn, bp, end-start+1, tx);
 		arc_buf_freeze(db->db_buf);
-		ASSERT(all || list_link_active(&db->db_dirty_node[txgoff]));
+		ASSERT(all || db->db_last_dirty);
 		return (all);
 	}
 
@@ -270,7 +274,7 @@
 		ASSERT3U(bp->blk_birth, ==, 0);
 	}
 #endif
-	ASSERT(all || list_link_active(&db->db_dirty_node[txgoff]));
+	ASSERT(all || db->db_last_dirty);
 	return (all);
 }
 
@@ -418,31 +422,43 @@
 	return (0);
 }
 
-static int
+static void
+dnode_undirty_dbufs(list_t *list)
+{
+	dbuf_dirty_record_t *dr;
+
+	while (dr = list_head(list)) {
+		dmu_buf_impl_t *db = dr->dr_dbuf;
+		uint64_t txg = dr->dr_txg;
+
+		mutex_enter(&db->db_mtx);
+		/* XXX - use dbuf_undirty()? */
+		list_remove(list, dr);
+		ASSERT(db->db_last_dirty == dr);
+		db->db_last_dirty = NULL;
+		db->db_dirtycnt -= 1;
+		if (db->db_level == 0) {
+			ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+			    dr->dt.dl.dr_data == db->db_buf);
+			dbuf_unoverride(dr);
+			mutex_exit(&db->db_mtx);
+		} else {
+			mutex_exit(&db->db_mtx);
+			dnode_undirty_dbufs(&dr->dt.di.dr_children);
+		}
+		kmem_free(dr, sizeof (dbuf_dirty_record_t));
+		dbuf_rele(db, (void *)(uintptr_t)txg);
+	}
+}
+
+static void
 dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
 {
-	dmu_buf_impl_t *db;
 	int txgoff = tx->tx_txg & TXG_MASK;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
-	/* Undirty all buffers */
-	while (db = list_head(&dn->dn_dirty_dbufs[txgoff])) {
-		mutex_enter(&db->db_mtx);
-		/* XXX - use dbuf_undirty()? */
-		list_remove(&dn->dn_dirty_dbufs[txgoff], db);
-		if (db->db_level == 0) {
-			ASSERT(db->db_blkid == DB_BONUS_BLKID ||
-			    db->db_d.db_data_old[txgoff] == db->db_buf);
-			if (db->db_d.db_overridden_by[txgoff])
-				dbuf_unoverride(db, tx->tx_txg);
-			db->db_d.db_data_old[txgoff] = NULL;
-		}
-		db->db_dirtycnt -= 1;
-		mutex_exit(&db->db_mtx);
-		dbuf_rele(db, (void *)(uintptr_t)tx->tx_txg);
-	}
-
+	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
 	(void) dnode_evict_dbufs(dn, 0);
 	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
 
@@ -487,32 +503,27 @@
 	 * Now that we've released our hold, the dnode may
 	 * be evicted, so we mustn't access it.
 	 */
-	return (1);
 }
 
 /*
- * Write out the dnode's dirty buffers at the specified level.
- * This may create more dirty buffers at the next level up.
+ * Write out the dnode's dirty buffers.
  *
  * NOTE: The dnode is kept in memory by being dirty.  Once the
  * dirty bit is cleared, it may be evicted.  Beware of this!
  */
-int
-dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx)
+void
+dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 {
 	free_range_t *rp;
+	dnode_phys_t *dnp = dn->dn_phys;
 	int txgoff = tx->tx_txg & TXG_MASK;
-	dnode_phys_t *dnp = dn->dn_phys;
+	list_t *list = &dn->dn_dirty_records[txgoff];
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
 	DNODE_VERIFY(dn);
 
-	/*
-	 * Make sure the dbuf for the dn_phys is released before we modify it.
-	 */
-	if (dn->dn_dbuf)
-		arc_release(dn->dn_dbuf->db_buf, dn->dn_dbuf);
+	ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
 
 	mutex_enter(&dn->dn_mtx);
 	if (dn->dn_allocated_txg == tx->tx_txg) {
@@ -536,7 +547,7 @@
 		dnp->dn_nblkptr = dn->dn_nblkptr;
 	}
 
-	ASSERT(level != 0 || dnp->dn_nlevels > 1 ||
+	ASSERT(dnp->dn_nlevels > 1 ||
 	    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
 	    BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
 	    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
@@ -545,7 +556,7 @@
 		ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
 		    SPA_MINBLOCKSIZE) == 0);
 		ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
-		    list_head(&dn->dn_dirty_dbufs[txgoff]) != NULL ||
+		    list_head(list) != NULL ||
 		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
 		    dnp->dn_datablkszsec);
 		dnp->dn_datablkszsec =
@@ -586,68 +597,25 @@
 	mutex_exit(&dn->dn_mtx);
 
 	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
-		ASSERT3U(level, ==, 0);
-		return (dnode_sync_free(dn, tx));
+		dnode_sync_free(dn, tx);
+		return;
 	}
 
 	if (dn->dn_next_nlevels[txgoff]) {
-		int new_lvl = dn->dn_next_nlevels[txgoff];
-
-		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-		while (new_lvl > dnp->dn_nlevels)
-			dnode_increase_indirection(dn, tx);
-		rw_exit(&dn->dn_struct_rwlock);
+		dnode_increase_indirection(dn, tx);
 		dn->dn_next_nlevels[txgoff] = 0;
 	}
 
-	if (level == dnp->dn_nlevels) {
-		uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
-		    (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
-
-		/* we've already synced out all data and indirect blocks */
-		/* there are no more dirty dbufs under this dnode */
-		ASSERT3P(list_head(&dn->dn_dirty_dbufs[txgoff]), ==, NULL);
-		ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= tx->tx_txg);
+	dbuf_sync_list(list, tx);
 
-		/* NB: the "off < maxblkid" is to catch overflow */
-		/*
-		 * NB: if blocksize is changing, we could get confused,
-		 * so only bother if there are multiple blocks and thus
-		 * it can't be changing.
-		 */
-		ASSERT(off < dn->dn_phys->dn_maxblkid ||
-		    dn->dn_phys->dn_maxblkid == 0 ||
-		    dnode_next_offset(dn, FALSE, &off, 1, 1, 0) != 0);
-
-		ASSERT(dnp->dn_nlevels > 1 ||
-		    BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
-		    BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
-		    dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+	if (dn->dn_object != DMU_META_DNODE_OBJECT) {
+		ASSERT3P(list_head(list), ==, NULL);
+		dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+	}
 
-		if (dn->dn_object != DMU_META_DNODE_OBJECT) {
-			dbuf_will_dirty(dn->dn_dbuf, tx);
-			dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
-		}
-
-		/*
-		 * Now that we've dropped the reference, the dnode may
-		 * be evicted, so we musn't access it.
-		 */
-		return (1);
-	} else {
-		dmu_buf_impl_t *db, *db_next;
-		list_t *list = &dn->dn_dirty_dbufs[txgoff];
-		/*
-		 * Iterate over the list, removing and sync'ing dbufs
-		 * which are on the level we want, and leaving others.
-		 */
-		for (db = list_head(list); db; db = db_next) {
-			db_next = list_next(list, db);
-			if (db->db_level == level) {
-				list_remove(list, db);
-				dbuf_sync(db, zio, tx);
-			}
-		}
-		return (0);
-	}
+	/*
+	 * Although we have dropped our reference to the dnode, it
+	 * can't be evicted until it's written, and we haven't yet
+	 * initiated the IO for the dnode's dbuf.
+	 */
 }
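
Note the new caller contract: dnode_sync() no longer takes a level or a zio;
the parent write for a dnode is supplied through the new dn_zio field before
the call.  In sketch form (lines taken from the dmu_objset_sync_dnodes() hunk
above):

	dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
	dnode_sync(dn, tx);	/* child IOs hang off dn_zio */
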
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c	Fri Feb 02 15:36:58 2007 -0800
@@ -105,26 +105,28 @@
 }
 
 void
-dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
+dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+    dmu_tx_t *tx)
 {
 	int used = bp_get_dasize(tx->tx_pool->dp_spa, bp);
 	int compressed = BP_GET_PSIZE(bp);
 	int uncompressed = BP_GET_UCSIZE(bp);
 
 	ASSERT(dmu_tx_is_syncing(tx));
+	/* No block pointer => nothing to free */
 	if (BP_IS_HOLE(bp))
 		return;
 
 	ASSERT(used > 0);
 	if (ds == NULL) {
+		int err;
 		/*
 		 * Account for the meta-objset space in its placeholder
 		 * dataset.
 		 */
-		/* XXX this can fail, what do we do when it does? */
-		(void) arc_free(NULL, tx->tx_pool->dp_spa,
-		    tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
-		bzero(bp, sizeof (blkptr_t));
+		err = arc_free(pio, tx->tx_pool->dp_spa,
+		    tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT : ARC_WAIT);
+		ASSERT(err == 0);
 
 		dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir,
 		    -used, -compressed, -uncompressed, tx);
@@ -136,10 +138,12 @@
 	dmu_buf_will_dirty(ds->ds_dbuf, tx);
 
 	if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+		int err;
+
 		dprintf_bp(bp, "freeing: %s", "");
-		/* XXX check return code? */
-		(void) arc_free(NULL, tx->tx_pool->dp_spa,
-		    tx->tx_txg, bp, NULL, NULL, ARC_WAIT);
+		err = arc_free(pio, tx->tx_pool->dp_spa,
+		    tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT : ARC_WAIT);
+		ASSERT(err == 0);
 
 		mutex_enter(&ds->ds_lock);
 		/* XXX unique_bytes is not accurate for head datasets */
@@ -167,7 +171,6 @@
 			}
 		}
 	}
-	bzero(bp, sizeof (blkptr_t));
 	mutex_enter(&ds->ds_lock);
 	ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used);
 	ds->ds_phys->ds_used_bytes -= used;
@@ -539,7 +542,8 @@
 
 	VERIFY(0 ==
 	    dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
-	(void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx);
+	(void) dmu_objset_create_impl(dp->dp_spa, ds,
+	    &ds->ds_phys->ds_bp, DMU_OST_ZFS, tx);
 	dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
 }
 
@@ -829,10 +833,10 @@
 }
 
 
-void
-dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp)
+blkptr_t *
+dsl_dataset_get_blkptr(dsl_dataset_t *ds)
 {
-	*bp = ds->ds_phys->ds_bp;
+	return (&ds->ds_phys->ds_bp);
 }
 
 void
@@ -1403,17 +1407,15 @@
 }
 
 void
-dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx)
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
 {
 	ASSERT(dmu_tx_is_syncing(tx));
 	ASSERT(ds->ds_user_ptr != NULL);
 	ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
 
-	dmu_objset_sync(ds->ds_user_ptr, tx);
 	dsl_dir_dirty(ds->ds_dir, tx);
-	bplist_close(&ds->ds_deadlist);
-
-	dmu_buf_rele(ds->ds_dbuf, ds);
+	dmu_objset_sync(ds->ds_user_ptr, zio, tx);
+	/* Unneeded? bplist_close(&ds->ds_deadlist); */
 }
 
 void
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c	Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -33,6 +33,7 @@
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
 #include <sys/zap.h>
+#include <sys/zio.h>
 #include <sys/zfs_context.h>
 #include <sys/fs/zfs.h>
 
@@ -143,7 +144,7 @@
 	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
-	    NULL, DMU_OST_META, tx)->os;
+	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;
 
 	/* create the pool directory */
 	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -167,36 +168,36 @@
 void
 dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
 {
+	zio_t *zio;
 	dmu_tx_t *tx;
+	dsl_dir_t *dd;
+	dsl_dataset_t *ds;
+	dsl_sync_task_group_t *dstg;
 	objset_impl_t *mosi = dp->dp_meta_objset->os;
+	int err;
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
-	do {
-		dsl_dir_t *dd;
-		dsl_dataset_t *ds;
-		dsl_sync_task_group_t *dstg;
+	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+		if (!list_link_active(&ds->ds_synced_link))
+			list_insert_tail(&dp->dp_synced_objsets, ds);
+		dsl_dataset_sync(ds, zio, tx);
+	}
+	err = zio_wait(zio);
+	ASSERT(err == 0);
 
-		while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
-			if (!list_link_active(&ds->ds_synced_link))
-				list_insert_tail(&dp->dp_synced_objsets, ds);
-			dsl_dataset_sync(ds, tx);
-		}
-		while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
-			dsl_sync_task_group_sync(dstg, tx);
-		while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
-			dsl_dir_sync(dd, tx);
-		/*
-		 * We need to loop since dsl_sync_task_group_sync()
-		 * could create a new (dirty) objset.
-		 * XXX - isn't this taken care of by the spa's sync to
-		 * convergence loop?
-		 */
-	} while (!txg_list_empty(&dp->dp_dirty_datasets, txg));
+	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
+		dsl_sync_task_group_sync(dstg, tx);
+	while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
+		dsl_dir_sync(dd, tx);
 
 	if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
 	    list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
-		dmu_objset_sync(mosi, tx);
+		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+		dmu_objset_sync(mosi, zio, tx);
+		err = zio_wait(zio);
+		ASSERT(err == 0);
 		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
 		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
 	}
@@ -216,18 +217,15 @@
 	}
 }
 
+/*
+ * TRUE if the current thread is the tx_sync_thread or if we
+ * are being called from SPA context during pool initialization.
+ */
 int
 dsl_pool_sync_context(dsl_pool_t *dp)
 {
-	/*
-	 * Yeah, this is cheesy.  But the SPA needs some way to let
-	 * the sync threads invoke spa_open() and spa_close() while
-	 * it holds the namespace lock.  I'm certainly open to better
-	 * ideas for how to determine whether the current thread is
-	 * operating on behalf of spa_sync().  This works for now.
-	 */
 	return (curthread == dp->dp_tx.tx_sync_thread ||
-	    BP_IS_HOLE(&dp->dp_meta_rootbp));
+	    spa_get_dsl(dp->dp_spa) == NULL);
 }
 
 uint64_t
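
The do/while convergence loop is dropped (per the old XXX comment, the
spa-level sync already loops to convergence), and dirty datasets are now
fanned out under a single root zio and collected with one zio_wait().  The
bracket pattern in miniature (fragment; from the hunk above):

	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg))
		dsl_dataset_sync(ds, zio, tx);	/* children run in parallel */
	err = zio_wait(zio);			/* one wait for the whole tree */
	ASSERT(err == 0);
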
--- a/usr/src/uts/common/fs/zfs/sys/arc.h	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h	Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -84,10 +84,10 @@
 int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
     arc_done_func_t *done, void *private, int priority, int flags,
     uint32_t *arc_flags, zbookmark_t *zb);
-int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
-    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
-    arc_done_func_t *done, void *private, int priority, int flags,
-    uint32_t arc_flags, zbookmark_t *zb);
+zio_t *arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
+    int ncopies, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
+    int flags, zbookmark_t *zb);
 int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     zio_done_func_t *done, void *private, uint32_t arc_flags);
 int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h	Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -41,7 +41,7 @@
 #endif
 
 #define	DB_BONUS_BLKID (-1ULL)
-#define	IN_DMU_SYNC ((blkptr_t *)-1)
+#define	IN_DMU_SYNC 2
 
 /*
  * define flags for dbuf_read
@@ -86,6 +86,56 @@
 #define	LIST_LINK_INACTIVE(link) \
 	((link)->list_next == NULL && (link)->list_prev == NULL)
 
+struct dmu_buf_impl;
+
+typedef enum override_states {
+	DR_NOT_OVERRIDDEN,
+	DR_IN_DMU_SYNC,
+	DR_OVERRIDDEN
+} override_states_t;
+
+typedef struct dbuf_dirty_record {
+	/* link on our parent's dirty list */
+	list_node_t dr_dirty_node;
+
+	/* transaction group this data will sync in */
+	uint64_t dr_txg;
+
+	/* zio of outstanding write IO */
+	zio_t *dr_zio;
+
+	/* pointer back to our dbuf */
+	struct dmu_buf_impl *dr_dbuf;
+
+	/* pointer to next dirty record */
+	struct dbuf_dirty_record *dr_next;
+
+	/* pointer to parent dirty record */
+	struct dbuf_dirty_record *dr_parent;
+
+	union dirty_types {
+		struct dirty_indirect {
+
+			/* protect access to list */
+			kmutex_t dr_mtx;
+
+			/* Our list of dirty children */
+			list_t dr_children;
+		} di;
+		struct dirty_leaf {
+
+			/*
+			 * dr_data is set when we dirty the buffer
+			 * so that we can retain the pointer even if it
+			 * gets COW'd in a subsequent transaction group.
+			 */
+			arc_buf_t *dr_data;
+			blkptr_t dr_overridden_by;
+			override_states_t dr_override_state;
+		} dl;
+	} dt;
+} dbuf_dirty_record_t;
+
 typedef struct dmu_buf_impl {
 	/*
 	 * The following members are immutable, with the exception of
@@ -152,53 +202,28 @@
 	arc_buf_t *db_buf;
 
 	kcondvar_t db_changed;
-	arc_buf_t *db_data_pending;
+	dbuf_dirty_record_t *db_data_pending;
+
+	/* pointer to most recent dirty record for this buffer */
+	dbuf_dirty_record_t *db_last_dirty;
 
 	/*
-	 * Last time (transaction group) this buffer was dirtied.
-	 */
-	uint64_t db_dirtied;
-
-	/*
-	 * If db_dnode != NULL, our link on the owner dnodes's dn_dbufs list.
+	 * Our link on the owner dnode's dn_dbufs list.
 	 * Protected by its dn_dbufs_mtx.
 	 */
 	list_node_t db_link;
 
-	/* Our link on dn_dirty_dbufs[txg] */
-	list_node_t db_dirty_node[TXG_SIZE];
-	uint8_t db_dirtycnt;
-
-	/*
-	 * Data which is unique to data (leaf) blocks:
-	 */
-	struct {
-		/* stuff we store for the user (see dmu_buf_set_user) */
-		void *db_user_ptr;
-		void **db_user_data_ptr_ptr;
-		dmu_buf_evict_func_t *db_evict_func;
-		uint8_t db_immediate_evict;
-		uint8_t db_freed_in_flight;
+	/* Data which is unique to data (leaf) blocks: */
 
-		/*
-		 * db_data_old[txg&TXG_MASK] is set when we
-		 * dirty the buffer, so that we can retain the
-		 * pointer even if it gets COW'd in a subsequent
-		 * transaction group.
-		 *
-		 * If the buffer is dirty in any txg, it can't
-		 * be destroyed.
-		 */
-		/*
-		 * XXX Protected by db_mtx and dn_dirty_mtx.
-		 * db_mtx must be held to read db_dirty[], and
-		 * both db_mtx and dn_dirty_mtx must be held to
-		 * modify (dirty or clean). db_mtx must be held
-		 * before dn_dirty_mtx.
-		 */
-		arc_buf_t *db_data_old[TXG_SIZE];
-		blkptr_t *db_overridden_by[TXG_SIZE];
-	} db_d;
+	/* stuff we store for the user (see dmu_buf_set_user) */
+	void *db_user_ptr;
+	void **db_user_data_ptr_ptr;
+	dmu_buf_evict_func_t *db_evict_func;
+
+	uint8_t db_immediate_evict;
+	uint8_t db_freed_in_flight;
+
+	uint8_t db_dirtycnt;
 } dmu_buf_impl_t;
 
 /* Note: the dbuf hash table is exposed only for the mdb module */
@@ -237,14 +262,14 @@
 void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
-void dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 
 void dbuf_clear(dmu_buf_impl_t *db);
 void dbuf_evict(dmu_buf_impl_t *db);
 
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx);
-void dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg);
+void dbuf_unoverride(dbuf_dirty_record_t *dr);
+void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
 
 void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
     struct dmu_tx *);
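
dbuf_sync() is replaced by dbuf_sync_list(), which walks dirty records rather
than dbufs.  A rough sketch of the expected shape (assumed, since the
corresponding dbuf.c hunk is not shown above; dbuf_sync_indirect() and
dbuf_sync_leaf() are presumed per-level helpers):

	void
	dbuf_sync_list(list_t *list, dmu_tx_t *tx)
	{
		dbuf_dirty_record_t *dr;

		while (dr = list_head(list)) {
			list_remove(list, dr);
			if (dr->dr_dbuf->db_level > 0)
				dbuf_sync_indirect(dr, tx); /* recurses into dr_children */
			else
				dbuf_sync_leaf(dr, tx);
		}
	}
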
--- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h	Fri Feb 02 15:36:58 2007 -0800
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -218,6 +217,14 @@
  *    held from:
  *    	dsl_dataset_*
  *
+ * dr_mtx (leaf)
+ *    protects:
+ *	dr_children
+ *    held from:
+ *	dbuf_dirty
+ *	dbuf_undirty
+ *	dbuf_sync_indirect
+ *	dnode_new_blkid
  */
 
 struct objset;
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -29,6 +29,7 @@
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/spa.h>
+#include <sys/arc.h>
 #include <sys/txg.h>
 #include <sys/zfs_context.h>
 #include <sys/dnode.h>
@@ -60,6 +61,7 @@
 	/* Immutable: */
 	struct dsl_dataset *os_dsl_dataset;
 	spa_t *os_spa;
+	arc_buf_t *os_phys_buf;
 	objset_phys_t *os_phys;
 	dnode_t *os_meta_dnode;
 	zilog_t *os_zil;
@@ -71,7 +73,7 @@
 
 	/* no lock needed: */
 	struct dmu_tx *os_synctx; /* XXX sketchy */
-	blkptr_t os_rootbp;
+	blkptr_t *os_rootbp;
 
 	/* Protected by os_obj_lock */
 	kmutex_t os_obj_lock;
@@ -108,9 +110,9 @@
 int dmu_objset_evict_dbufs(objset_t *os, int try);
 
 /* called from dsl */
-void dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx);
+void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
 objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
-    dmu_objset_type_t type, dmu_tx_t *tx);
+    blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
 int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
     objset_impl_t **osip);
 void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h	Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -32,6 +32,7 @@
 #include <sys/avl.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
+#include <sys/zio.h>
 #include <sys/refcount.h>
 #include <sys/dmu_zfetch.h>
 
@@ -162,7 +163,7 @@
 
 	/* protected by dn_mtx: */
 	kmutex_t dn_mtx;
-	list_t dn_dirty_dbufs[TXG_SIZE];
+	list_t dn_dirty_records[TXG_SIZE];
 	avl_tree_t dn_ranges[TXG_SIZE];
 	uint64_t dn_allocated_txg;
 	uint64_t dn_free_txg;
@@ -179,6 +180,9 @@
 	list_t dn_dbufs;		/* linked list of descendent dbuf_t's */
 	struct dmu_buf_impl *dn_bonus;	/* bonus buffer dbuf */
 
+	/* parent IO for current sync write */
+	zio_t *dn_zio;
+
 	/* holds prefetch structure */
 	struct zfetch	dn_zfetch;
 } dnode_t;
@@ -200,7 +204,7 @@
 void dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
 void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
-int dnode_sync(dnode_t *dn, int level, struct zio *zio, dmu_tx_t *tx);
+void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
 void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
 void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h	Fri Feb 02 15:36:58 2007 -0800
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -31,6 +31,7 @@
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
+#include <sys/zio.h>
 #include <sys/bplist.h>
 #include <sys/dsl_synctask.h>
 #include <sys/zfs_context.h>
@@ -138,15 +139,16 @@
     void *p, dsl_dataset_evict_func_t func);
 void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds);
 
-void dsl_dataset_get_blkptr(dsl_dataset_t *ds, blkptr_t *bp);
+blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
 void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
 
 spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
 
-void dsl_dataset_sync(dsl_dataset_t *os, dmu_tx_t *tx);
+void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
 
 void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
+void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio,
+    dmu_tx_t *tx);
 int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
 uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
 
--- a/usr/src/uts/common/fs/zfs/sys/spa.h	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h	Fri Feb 02 15:36:58 2007 -0800
@@ -272,6 +272,7 @@
 #define	BP_IDENTITY(bp)		(&(bp)->blk_dva[0])
 #define	BP_IS_GANG(bp)		DVA_GET_GANG(BP_IDENTITY(bp))
 #define	BP_IS_HOLE(bp)		((bp)->blk_birth == 0)
+#define	BP_IS_OLDER(bp, txg)	(!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg))
 
 #define	BP_ZERO(bp)				\
 {						\
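
BP_IS_OLDER() is how sync-context callers decide whether a live block must be
killed before being rewritten in the current txg.  For example (hypothetical
values): a bp with blk_birth == 7 is "older" than txg 9, so syncing txg 9
frees it before the rewrite, as in the dmu_objset_sync() hunk:

	if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg))
		dsl_dataset_block_kill(os->os_dsl_dataset,
		    os->os_rootbp, pio, tx);
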
--- a/usr/src/uts/common/fs/zfs/sys/zio.h	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h	Fri Feb 02 15:36:58 2007 -0800
@@ -207,6 +207,7 @@
 	zio_t		*io_logical;
 
 	/* Callback info */
+	zio_done_func_t	*io_ready;
 	zio_done_func_t	*io_done;
 	void		*io_private;
 	blkptr_t	io_bp_orig;
@@ -262,8 +263,8 @@
 
 extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
     int ncopies, uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
-    zio_done_func_t *done, void *private, int priority, int flags,
-    zbookmark_t *zb);
+    zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
+    int flags, zbookmark_t *zb);
 
 extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
--- a/usr/src/uts/common/fs/zfs/zio.c	Fri Feb 02 15:36:29 2007 -0800
+++ b/usr/src/uts/common/fs/zfs/zio.c	Fri Feb 02 15:36:58 2007 -0800
@@ -435,8 +435,8 @@
 zio_t *
 zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
     uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
-    zio_done_func_t *done, void *private, int priority, int flags,
-    zbookmark_t *zb)
+    zio_done_func_t *ready, zio_done_func_t *done, void *private, int priority,
+    int flags, zbookmark_t *zb)
 {
 	zio_t *zio;
 
@@ -450,6 +450,8 @@
 	    ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_USER,
 	    ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
 
+	zio->io_ready = ready;
+
 	zio->io_bookmark = *zb;
 
 	zio->io_logical = zio;
@@ -810,6 +812,9 @@
 {
 	zio_t *pio = zio->io_parent;
 
+	if (zio->io_ready)
+		zio->io_ready(zio);
+
 	if (pio != NULL)
 		zio_notify_parent(zio, ZIO_STAGE_WAIT_CHILDREN_READY,
 		    &pio->io_children_notready);
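
Taken together: zio_write() (and arc_write() above it) now accept a "ready"
callback alongside "done".  zio_ready() fires io_ready once the block pointer
has been filled in, before the write physically completes, and only then
notifies the parent; that early notification is what lets a parent (e.g. an
indirect block covering this one) start its own work without waiting for the
child's write to finish.  A hypothetical caller pairing (my_ready/my_done are
placeholders):

	static void
	my_ready(zio_t *zio)
	{
		/* zio->io_bp is valid here; update fill counts, etc. */
	}

	static void
	my_done(zio_t *zio)
	{
		/* the write has completed; do space accounting */
	}

	zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
	    data, size, my_ready, my_done, private, priority, flags, &zb);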