6395371 ASSERT in dmu_tx_count_free: blkid + i < dn->dn_phys->dn_nblkptr
author:    ahrens
date:      Fri, 10 Mar 2006 16:27:46 -0800
changeset: 1596:2e2377ccbf85
parent:    1595:a3a6f2aa4c82
child:     1597:0764c6b4accf
6395371 ASSERT in dmu_tx_count_free: blkid + i < dn->dn_phys->dn_nblkptr
6396359 infinite loop due to dangling dbufs (hang on unmount)
usr/src/lib/libzpool/common/sys/zfs_context.h
usr/src/uts/common/fs/zfs/arc.c
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/dmu_objset.c
usr/src/uts/common/fs/zfs/dmu_tx.c
usr/src/uts/common/fs/zfs/dnode.c
usr/src/uts/common/fs/zfs/dnode_sync.c
usr/src/uts/common/fs/zfs/sys/dnode.h
--- a/usr/src/lib/libzpool/common/sys/zfs_context.h	Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/lib/libzpool/common/sys/zfs_context.h	Fri Mar 10 16:27:46 2006 -0800
@@ -164,6 +164,11 @@
 #define	DTRACE_PROBE2(a, b, c, d, e)	((void)0)
 #endif	/* DTRACE_PROBE2 */
 
+#ifdef DTRACE_PROBE3
+#undef	DTRACE_PROBE3
+#define	DTRACE_PROBE3(a, b, c, d, e, f, g)	((void)0)
+#endif	/* DTRACE_PROBE3 */
+
 /*
  * Threads
  */
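
This userland stub exists because the arc.c hunk below upgrades the
arc__miss probe from DTRACE_PROBE2 to DTRACE_PROBE3; like the existing
DTRACE_PROBE/PROBE1/PROBE2 overrides above it, it compiles the probe away
in libzpool. A minimal sketch of the effect at the call site added below:

	/* kernel build: emits an SDT probe with three typed arguments */
	/* libzpool build: the stub above expands this to ((void)0)    */
	DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
	    zbookmark_t *, zb);
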
--- a/usr/src/uts/common/fs/zfs/arc.c	Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/arc.c	Fri Mar 10 16:27:46 2006 -0800
@@ -1859,7 +1859,8 @@
 			mutex_exit(hash_lock);
 
 		ASSERT3U(hdr->b_size, ==, size);
-		DTRACE_PROBE2(arc__miss, blkptr_t *, bp, uint64_t, size);
+		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
+		    zbookmark_t *, zb);
 		atomic_add_64(&arc.misses, 1);
 
 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Fri Mar 10 16:27:46 2006 -0800
@@ -761,14 +761,8 @@
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
-		if (db->db_state == DB_READ) {
-			/* this will be handled in dbuf_read_done() */
-			db->db_d.db_freed_in_flight = TRUE;
-			mutex_exit(&db->db_mtx);
-			continue;
-		}
-		if (db->db_state == DB_FILL) {
-			/* this will be handled in dbuf_rele() */
+		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+			/* will be handled in dbuf_read_done or dbuf_rele */
 			db->db_d.db_freed_in_flight = TRUE;
 			mutex_exit(&db->db_mtx);
 			continue;
@@ -844,16 +838,12 @@
 	/* XXX does *this* func really need the lock? */
 	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
 
-	if (osize == size)
-		return;
-
 	/*
 	 * This call to dbuf_will_dirty() with the dn_struct_rwlock held
 	 * is OK, because there can be no other references to the db
 	 * when we are changing its size, so no concurrent DB_FILL can
 	 * be happening.
 	 */
-	/* Make a copy of the data if necessary */
 	/*
 	 * XXX we should be doing a dbuf_read, checking the return
 	 * value and returning that up to our callers
@@ -871,12 +861,10 @@
 		bzero((uint8_t *)buf->b_data + osize, size - osize);
 
 	mutex_enter(&db->db_mtx);
-	/* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */
 	dbuf_set_data(db, buf);
 	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
 	db->db.db_size = size;
 
-	/* fix up the dirty info */
 	if (db->db_level == 0)
 		db->db_d.db_data_old[tx->tx_txg&TXG_MASK] = buf;
 	mutex_exit(&db->db_mtx);
@@ -1234,6 +1222,7 @@
 {
 	dnode_t *dn = db->db_dnode;
 	dmu_buf_impl_t *parent = db->db_parent;
+	dmu_buf_impl_t *dndb = dn->dn_dbuf;
 	int dbuf_gone = FALSE;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1270,7 +1259,7 @@
 	 * If this dbuf is referened from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
-	if (parent && parent != dn->dn_dbuf)
+	if (parent && parent != dndb)
 		dbuf_rele(parent, db);
 }
 
@@ -1305,17 +1294,23 @@
 			return (err);
 		err = dbuf_read(*parentp, NULL,
 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
-		if (err == 0) {
-			*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
-			    (blkid & ((1ULL << epbs) - 1));
+		if (err) {
+			dbuf_rele(*parentp, NULL);
+			*parentp = NULL;
+			return (err);
 		}
-		return (err);
+		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+		    (blkid & ((1ULL << epbs) - 1));
+		return (0);
 	} else {
 		/* the block is referenced from the dnode */
 		ASSERT3U(level, ==, nlevels-1);
 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
 		    blkid < dn->dn_phys->dn_nblkptr);
-		*parentp = dn->dn_dbuf;
+		if (dn->dn_dbuf) {
+			dbuf_add_ref(dn->dn_dbuf, NULL);
+			*parentp = dn->dn_dbuf;
+		}
 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
 		return (0);
 	}
@@ -1427,18 +1422,9 @@
 		 * remove it from that list.
 		 */
 		if (list_link_active(&db->db_link)) {
-			int need_mutex;
-
-			ASSERT(!MUTEX_HELD(&dn->dn_dbufs_mtx));
-			need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
-			if (need_mutex)
-				mutex_enter(&dn->dn_dbufs_mtx);
-
-			/* remove from dn_dbufs */
+			mutex_enter(&dn->dn_dbufs_mtx);
 			list_remove(&dn->dn_dbufs, db);
-
-			if (need_mutex)
-				mutex_exit(&dn->dn_dbufs_mtx);
+			mutex_exit(&dn->dn_dbufs_mtx);
 
 			dnode_rele(dn, db);
 		}
@@ -1494,7 +1480,7 @@
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    (ARC_NOWAIT | ARC_PREFETCH), &zb);
 		}
-		if (parent && parent != dn->dn_dbuf)
+		if (parent)
 			dbuf_rele(parent, NULL);
 	}
 }
@@ -1522,12 +1508,13 @@
 		blkptr_t *bp = NULL;
 		int err;
 
+		ASSERT3P(parent, ==, NULL);
 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
 		if (fail_sparse) {
 			if (err == 0 && bp && BP_IS_HOLE(bp))
 				err = ENOENT;
 			if (err) {
-				if (parent && parent != dn->dn_dbuf)
+				if (parent)
 					dbuf_rele(parent, NULL);
 				return (err);
 			}
@@ -1541,6 +1528,10 @@
 		arc_buf_add_ref(db->db_buf, db);
 		if (db->db_buf->b_data == NULL) {
 			dbuf_clear(db);
+			if (parent) {
+				dbuf_rele(parent, NULL);
+				parent = NULL;
+			}
 			goto top;
 		}
 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
@@ -1572,7 +1563,7 @@
 	mutex_exit(&db->db_mtx);
 
 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
-	if (parent && parent != dn->dn_dbuf)
+	if (parent)
 		dbuf_rele(parent, NULL);
 
 	ASSERT3P(db->db_dnode, ==, dn);
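
The common thread in the dbuf.c hunks above: dbuf_findbp() now returns
with a hold on *parentp even when the parent is the dnode's own dbuf
(dn_dbuf), and drops that hold itself on a read error, so callers release
the parent unconditionally instead of special-casing dn_dbuf. A sketch of
the resulting caller contract (illustrative, not verbatim source):

	dmu_buf_impl_t *parent = NULL;
	blkptr_t *bp = NULL;
	int err;

	err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
	/* ... use bp if err == 0 ... */
	if (parent)		/* no "parent != dn->dn_dbuf" test needed */
		dbuf_rele(parent, NULL);
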
--- a/usr/src/uts/common/fs/zfs/dmu.c	Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Fri Mar 10 16:27:46 2006 -0800
@@ -1638,7 +1638,7 @@
 	 * we go trundling through the block pointers.
 	 */
 	for (i = 0; i < TXG_SIZE; i++) {
-		if (dn->dn_dirtyblksz[i])
+		if (list_link_active(&dn->dn_dirty_link[i]))
 			break;
 	}
 	if (i != TXG_SIZE) {
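
This hunk is one piece of the changeset-wide switch from dn_dirtyblksz to
dn_dirty_link (see the dnode.c and sys/dnode.h diffs below): a dnode is
dirty in a txg exactly when it is linked onto that txg's dirty list.
Restated as a hypothetical helper (not present in the source):

	static int
	dnode_is_dirty_in_txg(dnode_t *dn, uint64_t txg)
	{
		return (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK]));
	}
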
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Fri Mar 10 16:27:46 2006 -0800
@@ -279,41 +279,43 @@
 dmu_objset_evict_dbufs(objset_t *os)
 {
 	objset_impl_t *osi = os->os;
-	dnode_t *mdn = osi->os_meta_dnode;
 	dnode_t *dn;
-	int allzero = B_TRUE;
+
+	mutex_enter(&osi->os_lock);
+
+	/* process the mdn last, since the other dnodes have holds on it */
+	list_remove(&osi->os_dnodes, osi->os_meta_dnode);
+	list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode);
 
 	/*
-	 * Each time we process an entry on the list, we first move it
-	 * to the tail so that we don't process it over and over again.
-	 * We use the meta-dnode as a marker: if we make a complete pass
-	 * over the list without finding any work to do, we're done.
-	 * This ensures that we complete in linear time rather than
-	 * quadratic time, as described in detail in bug 1182169.
+	 * Find the first dnode with holds.  We have to do this dance
+	 * because dnode_add_ref() only works if you already have a
+	 * hold.  If there are no holds then it has no dbufs so OK to
+	 * skip.
 	 */
-	mutex_enter(&osi->os_lock);
-	list_remove(&osi->os_dnodes, mdn);
-	list_insert_tail(&osi->os_dnodes, mdn);
-	while ((dn = list_head(&osi->os_dnodes)) != NULL) {
-		list_remove(&osi->os_dnodes, dn);
-		list_insert_tail(&osi->os_dnodes, dn);
-		if (dn == mdn) {
-			if (allzero)
-				break;
-			allzero = B_TRUE;
-			continue;
-		}
-		if (!refcount_is_zero(&dn->dn_holds)) {
-			allzero = B_FALSE;
-			dnode_add_ref(dn, FTAG);
-			mutex_exit(&osi->os_lock);
-			dnode_evict_dbufs(dn);
-			dnode_rele(dn, FTAG);
-			mutex_enter(&osi->os_lock);
-		}
+	for (dn = list_head(&osi->os_dnodes);
+	    dn && refcount_is_zero(&dn->dn_holds);
+	    dn = list_next(&osi->os_dnodes, dn))
+		continue;
+	if (dn)
+		dnode_add_ref(dn, FTAG);
+
+	while (dn) {
+		dnode_t *next_dn = dn;
+
+		do {
+			next_dn = list_next(&osi->os_dnodes, next_dn);
+		} while (next_dn && refcount_is_zero(&next_dn->dn_holds));
+		if (next_dn)
+			dnode_add_ref(next_dn, FTAG);
+
+		mutex_exit(&osi->os_lock);
+		dnode_evict_dbufs(dn);
+		dnode_rele(dn, FTAG);
+		mutex_enter(&osi->os_lock);
+		dn = next_dn;
 	}
 	mutex_exit(&osi->os_lock);
-	dnode_evict_dbufs(mdn);
 }
 
 void
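
The rewritten dmu_objset_evict_dbufs() uses a hand-over-hand reference
pattern: since dnode_add_ref() is only safe on a dnode that already has a
hold, the next held dnode is found and referenced before os_lock is
dropped to evict the current one, so it cannot be freed out from under
the iteration. The pattern in the abstract (a sketch; hold/release/process
and the *_held_element helpers are hypothetical):

	mutex_enter(&lock);
	cur = first_held_element(&list);
	if (cur)
		hold(cur);
	while (cur) {
		next = next_held_element(&list, cur);
		if (next)
			hold(next);	/* pin before dropping the lock */
		mutex_exit(&lock);
		process(cur);		/* may block; list may change */
		release(cur);
		mutex_enter(&lock);
		cur = next;
	}
	mutex_exit(&lock);
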
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c	Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c	Fri Mar 10 16:27:46 2006 -0800
@@ -331,26 +331,53 @@
 	uint64_t space = 0;
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 
-	if (dn->dn_datablkshift == 0)
-		return;
 	/*
-	 * not that the dnode can change, since it isn't dirty, but
-	 * dbuf_hold_impl() wants us to have the struct_rwlock.
-	 * also need it to protect dn_maxblkid.
+	 * We don't use any locking to check for dirtyness because it's
+	 * OK if we get stale data -- the dnode may become dirty
+	 * immediately after our check anyway.  This is just a means to
+	 * avoid the expensive count when we aren't sure we need it.  We
+	 * need to be able to deal with a dirty dnode.
+	 */
+	if ((uintptr_t)dn->dn_assigned_tx |
+	    list_link_active(&dn->dn_dirty_link[0]) |
+	    list_link_active(&dn->dn_dirty_link[1]) |
+	    list_link_active(&dn->dn_dirty_link[2]) |
+	    list_link_active(&dn->dn_dirty_link[3]))
+		return;
+
+	/*
+	 * the struct_rwlock protects us against dn_phys->dn_nlevels
+	 * changing, in case (against all odds) we manage to dirty &
+	 * sync out the changes after we check for being dirty.
+	 * also, dbuf_hold_impl() wants us to have the struct_rwlock.
+	 *
+	 * It's fine to use dn_datablkshift rather than the dn_phys
+	 * equivalent because if it is changing, maxblkid==0 and we will
+	 * bail.
 	 */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	blkid = off >> dn->dn_datablkshift;
-	nblks = (off + len) >> dn->dn_datablkshift;
+	if (dn->dn_phys->dn_maxblkid == 0) {
+		if (off == 0 && len >= dn->dn_datablksz) {
+			blkid = 0;
+			nblks = 1;
+		} else {
+			rw_exit(&dn->dn_struct_rwlock);
+			return;
+		}
+	} else {
+		blkid = off >> dn->dn_datablkshift;
+		nblks = (off + len) >> dn->dn_datablkshift;
 
-	if (blkid >= dn->dn_maxblkid) {
-		rw_exit(&dn->dn_struct_rwlock);
-		return;
+		if (blkid >= dn->dn_phys->dn_maxblkid) {
+			rw_exit(&dn->dn_struct_rwlock);
+			return;
+		}
+		if (blkid + nblks > dn->dn_phys->dn_maxblkid)
+			nblks = dn->dn_phys->dn_maxblkid - blkid;
+
+		/* don't bother after 128,000 blocks */
+		nblks = MIN(nblks, 128*1024);
 	}
-	if (blkid + nblks > dn->dn_maxblkid)
-		nblks = dn->dn_maxblkid - blkid;
-
-	/* don't bother after the 100,000 blocks */
-	nblks = MIN(nblks, 128*1024);
 
 	if (dn->dn_phys->dn_nlevels == 1) {
 		int i;
@@ -399,9 +426,10 @@
 				}
 			}
 			dbuf_rele(dbuf, FTAG);
-		} else {
-			/* the indirect block is sparse */
-			ASSERT(err == ENOENT);
+		}
+		if (err != 0 && err != ENOENT) {
+			tx->tx_err = err;
+			break;
 		}
 
 		blkid += tochk;
@@ -416,7 +444,7 @@
 dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
 {
 	uint64_t start, end, i;
-	int dirty, err, shift;
+	int err, shift;
 	zio_t *zio;
 
 	/* first block */
@@ -465,12 +493,7 @@
 	}
 
 	dmu_tx_count_dnode(tx, dn);
-
-	/* XXX locking */
-	dirty = dn->dn_dirtyblksz[0] | dn->dn_dirtyblksz[1] |
-	    dn->dn_dirtyblksz[2] | dn->dn_dirtyblksz[3];
-	if (dn->dn_assigned_tx != NULL && !dirty)
-		dmu_tx_count_free(tx, dn, off, len);
+	dmu_tx_count_free(tx, dn, off, len);
 }
 
 void
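
The dmu_tx_count_free() rework reads maxblkid from dn_phys (the synced,
on-disk copy) under dn_struct_rwlock, after an unlocked fast-path check
that skips the expensive count whenever the dnode might be dirty. The
clamping arithmetic, worked with assumed values (not taken from the
change): dn_datablkshift == 17 (128K blocks), dn_phys->dn_maxblkid ==
1000, off == 0, len == 1ULL << 30:

	blkid = 0 >> 17                       /* ->    0 */
	nblks = (0 + (1ULL << 30)) >> 17      /* -> 8192 */
	/* blkid + nblks > maxblkid, so: */
	nblks = 1000 - 0                      /* -> 1000 */
	nblks = MIN(1000, 128 * 1024)         /* -> 1000 */
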
--- a/usr/src/uts/common/fs/zfs/dnode.c	Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dnode.c	Fri Mar 10 16:27:46 2006 -0800
@@ -356,7 +356,8 @@
 	for (i = 0; i < TXG_SIZE; i++) {
 		ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
 		ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
-		ASSERT3U(dn->dn_dirtyblksz[i], ==, 0);
+		ASSERT3U(dn->dn_next_blksz[i], ==, 0);
+		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 		ASSERT3P(list_head(&dn->dn_dirty_dbufs[i]), ==, NULL);
 		ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
 	}
@@ -386,6 +387,8 @@
 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
+	int i;
+
 	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
 	ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
@@ -395,10 +398,9 @@
 	    (bonustype != DMU_OT_NONE && bonuslen != 0));
 	ASSERT3U(bonustype, <, DMU_OT_NUMTYPES);
 	ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
-	ASSERT(dn->dn_dirtyblksz[0] == 0);
-	ASSERT(dn->dn_dirtyblksz[1] == 0);
-	ASSERT(dn->dn_dirtyblksz[2] == 0);
-	ASSERT(dn->dn_dirtyblksz[3] == 0);
+
+	for (i = 0; i < TXG_SIZE; i++)
+		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 
 	/* clean up any unreferenced dbufs */
 	dnode_evict_dbufs(dn);
@@ -418,9 +420,7 @@
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_setdblksz(dn, blocksize);
 	dnode_setdirty(dn, tx);
-	/* don't need dd_dirty_mtx, dnode is already dirty */
-	ASSERT(dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] != 0);
-	dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = blocksize;
+	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* change type */
@@ -508,7 +508,7 @@
 		ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 
 		for (n = 0; n < TXG_SIZE; n++)
-			ASSERT(dn->dn_dirtyblksz[n] == 0);
+			ASSERT(!list_link_active(&dn->dn_dirty_link[n]));
 #endif
 		children_dnodes[i] = NULL;
 		dnode_destroy(dn);
@@ -660,14 +660,13 @@
 	/*
 	 * If we are already marked dirty, we're done.
 	 */
-	if (dn->dn_dirtyblksz[txg&TXG_MASK] > 0) {
+	if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
 		mutex_exit(&os->os_lock);
 		return;
 	}
 
 	ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
 	ASSERT(dn->dn_datablksz != 0);
-	dn->dn_dirtyblksz[txg&TXG_MASK] = dn->dn_datablksz;
 
 	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
 	    dn->dn_object, txg);
@@ -699,6 +698,8 @@
 void
 dnode_free(dnode_t *dn, dmu_tx_t *tx)
 {
+	int txgoff = tx->tx_txg & TXG_MASK;
+
 	dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
 
 	/* we should be the only holder... hopefully */
@@ -717,11 +718,9 @@
 	 * the dirty list to the free list.
 	 */
 	mutex_enter(&dn->dn_objset->os_lock);
-	if (dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] > 0) {
-		list_remove(
-		    &dn->dn_objset->os_dirty_dnodes[tx->tx_txg&TXG_MASK], dn);
-		list_insert_tail(
-		    &dn->dn_objset->os_free_dnodes[tx->tx_txg&TXG_MASK], dn);
+	if (list_link_active(&dn->dn_dirty_link[txgoff])) {
+		list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
+		list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
 		mutex_exit(&dn->dn_objset->os_lock);
 	} else {
 		mutex_exit(&dn->dn_objset->os_lock);
@@ -760,16 +759,7 @@
 	if (dn->dn_phys->dn_maxblkid != 0)
 		goto end;
 
-	/*
-	 * Any buffers allocated for blocks beyond the first
-	 * must be evictable/evicted, because they're the wrong size.
-	 */
 	mutex_enter(&dn->dn_dbufs_mtx);
-	/*
-	 * Since we have the dn_dbufs_mtx, nothing can be
-	 * removed from dn_dbufs.  Since we have dn_struct_rwlock/w,
-	 * nothing can be added to dn_dbufs.
-	 */
 	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
 		db_next = list_next(&dn->dn_dbufs, db);
 
@@ -782,29 +772,21 @@
 	}
 	mutex_exit(&dn->dn_dbufs_mtx);
 
-	/* Fast-track if there is no data in the file */
-	if (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) && !have_db0) {
-		dnode_setdblksz(dn, size);
-		dn->dn_indblkshift = ibs;
-		dnode_setdirty(dn, tx);
-		/* don't need dd_dirty_mtx, dnode is already dirty */
-		dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size;
-		dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
-		rw_exit(&dn->dn_struct_rwlock);
-		return (0);
+	db = NULL;
+	if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || have_db0) {
+		/* obtain the old block */
+		db = dbuf_hold(dn, 0, FTAG);
+		dbuf_new_size(db, size, tx);
 	}
 
-	/* obtain the old block */
-	db = dbuf_hold(dn, 0, FTAG);
-
-	dbuf_new_size(db, size, tx);
-
 	dnode_setdblksz(dn, size);
 	dn->dn_indblkshift = ibs;
-	/* don't need dd_dirty_mtx, dnode is already dirty */
-	dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size;
+	dnode_setdirty(dn, tx);
+	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
 	dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
-	dbuf_rele(db, FTAG);
+
+	if (db)
+		dbuf_rele(db, FTAG);
 
 	err = 0;
 end:
@@ -827,71 +809,45 @@
 {
 	uint64_t txgoff = tx->tx_txg & TXG_MASK;
 	int drop_struct_lock = FALSE;
-	int epbs, old_nlevels, new_nlevels;
+	int epbs, new_nlevels;
 	uint64_t sz;
 
-	if (blkid == DB_BONUS_BLKID)
-		return;
+	ASSERT(blkid != DB_BONUS_BLKID);
 
 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		drop_struct_lock = TRUE;
 	}
 
-	if (blkid > dn->dn_maxblkid)
-		dn->dn_maxblkid = blkid;
+	if (blkid <= dn->dn_maxblkid)
+		goto out;
+
+	dn->dn_maxblkid = blkid;
 
 	/*
-	 * Compute the number of levels necessary to support the
-	 * new blkid.
+	 * Compute the number of levels necessary to support the new maxblkid.
 	 */
 	new_nlevels = 1;
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-
-	for (sz = dn->dn_nblkptr; sz <= blkid && sz >= dn->dn_nblkptr;
-	    sz <<= epbs)
+	for (sz = dn->dn_nblkptr;
+	    sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
 		new_nlevels++;
-	old_nlevels = dn->dn_nlevels;
 
-	if (new_nlevels > dn->dn_next_nlevels[txgoff])
+	if (new_nlevels > dn->dn_nlevels) {
+		int old_nlevels = dn->dn_nlevels;
+		dmu_buf_impl_t *db;
+
+		dn->dn_nlevels = new_nlevels;
+
+		ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
 		dn->dn_next_nlevels[txgoff] = new_nlevels;
 
-	if (new_nlevels > old_nlevels) {
-		dprintf("dn %p increasing nlevels from %u to %u\n",
-		    dn, dn->dn_nlevels, new_nlevels);
-		dn->dn_nlevels = new_nlevels;
-	}
-
-	/*
-	 * Dirty the left indirects.
-	 * Note: the caller should have just dnode_use_space()'d one
-	 * data block's worth, so we could subtract that out of
-	 * dn_inflight_data to determine if there is any dirty data
-	 * besides this block.
-	 * We don't strictly need to dirty them unless there's
-	 * *something* in the object (eg. on disk or dirty)...
-	 */
-	if (new_nlevels > old_nlevels) {
-		dmu_buf_impl_t *db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
-		dprintf("dn %p dirtying left indirects\n", dn);
+		/* Dirty the left indirects.  */
+		db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
 		dbuf_dirty(db, tx);
 		dbuf_rele(db, FTAG);
+
 	}
-#ifdef ZFS_DEBUG
-	else if (old_nlevels > 1 && new_nlevels > old_nlevels) {
-		dmu_buf_impl_t *db;
-		int i;
-
-		for (i = 0; i < dn->dn_nblkptr; i++) {
-			db = dbuf_hold_level(dn, old_nlevels-1, i, FTAG);
-			ASSERT(!
-			    list_link_active(&db->db_dirty_node[txgoff]));
-			dbuf_rele(db, FTAG);
-		}
-	}
-#endif
-
-	dprintf("dn %p done\n", dn);
 
 out:
 	if (drop_struct_lock)
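
In dnode_new_blkid() above, the loop counts how many levels of
indirection are needed before the block-pointer span covers the new
maxblkid. Worked with assumed values (dn_indblkshift == 14 and
SPA_BLKPTRSHIFT == 7, so epbs == 7, i.e. 128 blkptrs per 16K indirect;
dn_nblkptr == 3; blkid == 500):

	/*
	 * new_nlevels = 1, sz = 3
	 * sz =     3 <= 500  ->  new_nlevels = 2, sz = 3 << 7   = 384
	 * sz =   384 <= 500  ->  new_nlevels = 3, sz = 384 << 7 = 49152
	 * sz = 49152  > 500  ->  stop: three levels cover blkid 500
	 */
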
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c	Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c	Fri Mar 10 16:27:46 2006 -0800
@@ -346,33 +346,58 @@
 void
 dnode_evict_dbufs(dnode_t *dn)
 {
-	dmu_buf_impl_t *db;
+	int progress;
+	int pass = 0;
+
+	do {
+		dmu_buf_impl_t *db, *db_next;
+		int evicting = FALSE;
 
-	mutex_enter(&dn->dn_dbufs_mtx);
-	while (db = list_head(&dn->dn_dbufs)) {
-		int progress = 0;
-		for (; db; db = list_next(&dn->dn_dbufs, db)) {
+		progress = FALSE;
+		mutex_enter(&dn->dn_dbufs_mtx);
+		for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+			/* dbuf_clear() may remove db from this list */
+			db_next = list_next(&dn->dn_dbufs, db);
+
 			mutex_enter(&db->db_mtx);
-			if (db->db_state != DB_EVICTING &&
-			    refcount_is_zero(&db->db_holds))
-				break;
-			else if (db->db_state == DB_EVICTING)
-				progress = 1;
-			mutex_exit(&db->db_mtx);
+			if (db->db_state == DB_EVICTING) {
+				progress = TRUE;
+				evicting = TRUE;
+				mutex_exit(&db->db_mtx);
+			} else if (refcount_is_zero(&db->db_holds)) {
+				progress = TRUE;
+				ASSERT(!arc_released(db->db_buf));
+				dbuf_clear(db); /* exits db_mtx for us */
+			} else {
+				mutex_exit(&db->db_mtx);
+			}
+
 		}
-		if (db) {
-			ASSERT(!arc_released(db->db_buf));
-			dbuf_clear(db);
-			mutex_exit(&dn->dn_dbufs_mtx);
-			progress = 1;
-		} else {
-			if (progress == 0)
-				break;
-			mutex_exit(&dn->dn_dbufs_mtx);
-		}
-		mutex_enter(&dn->dn_dbufs_mtx);
+		/*
+		 * NB: we need to drop dn_dbufs_mtx between passes so
+		 * that any DB_EVICTING dbufs can make progress.
+		 * Ideally, we would have some cv we could wait on, but
+		 * since we don't, just wait a bit to give the other
+		 * thread a chance to run.
+		 */
+		mutex_exit(&dn->dn_dbufs_mtx);
+		if (evicting)
+			delay(1);
+		pass++;
+		ASSERT(pass < 100); /* sanity check */
+	} while (progress);
+
+	/*
+	 * This function works fine even if it can't evict everything,
+	 * but all of our callers need this assertion, so let's put it
+	 * here (for now).  Perhaps in the future there will be a try vs
+	 * doall flag.
+	 */
+	if (list_head(&dn->dn_dbufs) != NULL) {
+		panic("dangling dbufs (dn=%p, dbuf=%p)\n",
+		    dn, list_head(&dn->dn_dbufs));
 	}
-	mutex_exit(&dn->dn_dbufs_mtx);
+
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
 		mutex_enter(&dn->dn_bonus->db_mtx);
@@ -380,6 +405,7 @@
 		dn->dn_bonus = NULL;
 	}
 	rw_exit(&dn->dn_struct_rwlock);
+
 }
 
 static int
@@ -420,6 +446,7 @@
 	/* Undirty next bits */
 	dn->dn_next_nlevels[txgoff] = 0;
 	dn->dn_next_indblkshift[txgoff] = 0;
+	dn->dn_next_blksz[txgoff] = 0;
 
 	/* free up all the blocks in the file. */
 	dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
@@ -436,7 +463,6 @@
 
 	mutex_enter(&dn->dn_mtx);
 	dn->dn_type = DMU_OT_NONE;
-	dn->dn_dirtyblksz[txgoff] = 0;
 	dn->dn_maxblkid = 0;
 	dn->dn_allocated_txg = 0;
 	mutex_exit(&dn->dn_mtx);
@@ -465,13 +491,10 @@
 	int txgoff = tx->tx_txg & TXG_MASK;
 	dnode_phys_t *dnp = dn->dn_phys;
 
-	/* ASSERT(dn->dn_objset->dd_snapshot == NULL); */
 	ASSERT(dmu_tx_is_syncing(tx));
-	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
-	    dn->dn_dirtyblksz[txgoff] > 0);
-
 	ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
 	DNODE_VERIFY(dn);
+
 	/*
 	 * Make sure the dbuf for the dn_phys is released before we modify it.
 	 */
@@ -502,11 +525,12 @@
 		dnp->dn_nblkptr = dn->dn_nblkptr;
 	}
 
-	if (dn->dn_dirtyblksz[txgoff]) {
-		ASSERT(P2PHASE(dn->dn_dirtyblksz[txgoff],
+	if (dn->dn_next_blksz[txgoff]) {
+		ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
 		    SPA_MINBLOCKSIZE) == 0);
 		dnp->dn_datablkszsec =
-		    dn->dn_dirtyblksz[txgoff] >> SPA_MINBLOCKSHIFT;
+		    dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
+		dn->dn_next_blksz[txgoff] = 0;
 	}
 
 	if (dn->dn_next_indblkshift[txgoff]) {
@@ -577,9 +601,6 @@
 		    dnode_next_offset(dn, FALSE, &off, 1, 1) == ESRCH))
 			panic("data after EOF: off=%llu\n", (u_longlong_t)off);
 
-		dn->dn_dirtyblksz[txgoff] = 0;
-
-
 		if (dn->dn_object != DMU_META_DNODE_OBJECT) {
 			dbuf_will_dirty(dn->dn_dbuf, tx);
 			dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
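
The dnode_evict_dbufs() rewrite above is the fix for 6396359: the old
loop busy-waited whenever a dbuf sat in DB_EVICTING, and dangling dbufs
could keep it spinning forever (the hang on unmount). The new shape is a
bounded retry loop: a full pass either makes progress or the loop exits,
delay(1) yields to a concurrent evictor, and the pass counter is a
livelock sanity check. The idiom in the abstract (a sketch; sweep_once
and saw_concurrent_eviction are hypothetical):

	int pass = 0, progress;

	do {
		progress = sweep_once();	/* one full pass over the list */
		if (saw_concurrent_eviction())
			delay(1);		/* let the other thread finish */
		ASSERT(++pass < 100);
	} while (progress);
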
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h	Fri Mar 10 16:08:51 2006 -0800
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h	Fri Mar 10 16:27:46 2006 -0800
@@ -112,36 +112,9 @@
 
 typedef struct dnode {
 	/*
-	 * lock ordering:
-	 *
-	 * db_mtx > dn_dirty_mtx
-	 * 	dbuf_syncdone
-	 *
-	 * dn_struct_rwlock/r > dn_dirty_mtx
-	 * 	dmu_object_info
-	 *
-	 * dn_struct_rwlock/r > db_mtx > dn_dirty_mtx
-	 * 	dbuf_dirty
-	 * 	dbuf_setdirty
-	 *
-	 * dn_struct_rwlock/w > db_mtx > dn_mtx
-	 * 	dnode_increase_indirection -> dbuf_find
-	 * 	dbuf_hold_impl
-	 * 	dnode_set_bonus
-	 *
-	 * dn_struct_rwlock/w > dn_mtx
-	 * 	dnode_increase_indirection
-	 *
-	 * dn_dirty_mtx > dn_mtx
-	 * 	dnode_buf_pageout
-	 *
-	 * db_mtx > dn_mtx
-	 * 	dbuf_create
-	 */
-
-	/*
-	 * dn_struct_rwlock protects the structure of the dnode.
-	 * In particular, it protects the number of levels of indirection.
+	 * dn_struct_rwlock protects the structure of the dnode,
+	 * including the number of levels of indirection (dn_nlevels),
+	 * dn_maxblkid, and dn_next_*
 	 */
 	krwlock_t dn_struct_rwlock;
 
@@ -158,38 +131,32 @@
 	dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */
 
 	/*
-	 * Copies of stuff in dn_phys.  They're valid here even before
-	 * the dnode is first synced.
+	 * Copies of stuff in dn_phys.  They're valid in the open
+	 * context (eg. even before the dnode is first synced).
+	 * Where necessary, these are protected by dn_struct_rwlock.
 	 */
-	dmu_object_type_t dn_type;	/* object type (immutable) */
-	uint8_t dn_bonustype;		/* bonus type (immutable) */
-	uint16_t dn_bonuslen;		/* bonus length (immutable) */
+	dmu_object_type_t dn_type;	/* object type */
+	uint16_t dn_bonuslen;		/* bonus length */
+	uint8_t dn_bonustype;		/* bonus type */
 	uint8_t dn_nblkptr;		/* number of blkptrs (immutable) */
-	uint8_t dn_datablkshift;	/* zero if blksz not power of 2! */
-	uint32_t dn_datablksz;		/* in bytes */
-	uint16_t dn_datablkszsec;	/* in 512b sectors */
-
 	uint8_t dn_checksum;		/* ZIO_CHECKSUM type */
 	uint8_t dn_compress;		/* ZIO_COMPRESS type */
-
-	/*
-	 * The following are kept up-to-date in the *open* context, the syncing
-	 * context should only pay attention to the dn_next_* values.
-	 */
 	uint8_t dn_nlevels;
 	uint8_t dn_indblkshift;
-
+	uint8_t dn_datablkshift;	/* zero if blksz not power of 2! */
+	uint16_t dn_datablkszsec;	/* in 512b sectors */
+	uint32_t dn_datablksz;		/* in bytes */
+	uint64_t dn_maxblkid;
 	uint8_t dn_next_nlevels[TXG_SIZE];
 	uint8_t dn_next_indblkshift[TXG_SIZE];
+	uint32_t dn_next_blksz[TXG_SIZE];	/* next block size in bytes */
 
 	/* protected by os_lock: */
-	uint32_t dn_dirtyblksz[TXG_SIZE];	/* dirty block size in bytes */
 	list_node_t dn_dirty_link[TXG_SIZE];	/* next on dataset's dirty */
 
 	/* protected by dn_mtx: */
 	kmutex_t dn_mtx;
 	list_t dn_dirty_dbufs[TXG_SIZE];
-	uint64_t dn_maxblkid;
 	avl_tree_t dn_ranges[TXG_SIZE];
 	uint64_t dn_allocated_txg;
 	uint64_t dn_free_txg;