6569719 panic dangling dbufs (dn=ffffffff28814d30, dbuf=ffffffff20756008)
author: maybee
Sun, 26 Aug 2007 11:19:04 -0700
changeset 4944 96d96f8de974
parent 4943 f73f303e6a06
child 4945 7571ee3e5ef4
6569719 panic dangling dbufs (dn=ffffffff28814d30, dbuf=ffffffff20756008) 6573361 panic turnstile_block, unowned mutex 6584864 $MOS is not properly bounded by pool size 6585265 need bonus resize interface 6587723 BAD TRAP: type=e (#pf Page fault) occurred in module "zfs" due to a NULL pointer dereference 6589799 dangling dbuf after zinject 6594025 panic: dangling dbufs during shutdown
usr/src/cmd/ztest/ztest.c
usr/src/uts/common/fs/zfs/dbuf.c
usr/src/uts/common/fs/zfs/dmu.c
usr/src/uts/common/fs/zfs/dmu_objset.c
usr/src/uts/common/fs/zfs/dmu_send.c
usr/src/uts/common/fs/zfs/dnode.c
usr/src/uts/common/fs/zfs/dnode_sync.c
usr/src/uts/common/fs/zfs/dsl_dir.c
usr/src/uts/common/fs/zfs/metaslab.c
usr/src/uts/common/fs/zfs/spa_misc.c
usr/src/uts/common/fs/zfs/sys/dbuf.h
usr/src/uts/common/fs/zfs/sys/dmu.h
usr/src/uts/common/fs/zfs/sys/dmu_objset.h
usr/src/uts/common/fs/zfs/sys/dnode.h
usr/src/uts/common/fs/zfs/vdev.c
usr/src/uts/common/fs/zfs/zfs_vfsops.c
usr/src/uts/common/os/list.c
--- a/usr/src/cmd/ztest/ztest.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/cmd/ztest/ztest.c	Sun Aug 26 11:19:04 2007 -0700
@@ -1541,7 +1541,7 @@
 		ASSERT(doi.doi_bonus_type == DMU_OT_PLAIN_OTHER);
 		ASSERT3S(doi.doi_physical_blks, >=, 0);
 
-		bonuslen = db->db_size;
+		bonuslen = doi.doi_bonus_size;
 
 		for (c = 0; c < bonuslen; c++) {
 			if (((uint8_t *)db->db_data)[c] !=
@@ -1660,7 +1660,7 @@
 		 * Write to both the bonus buffer and the regular data.
 		 */
 		VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
-		ASSERT3U(bonuslen, ==, db->db_size);
+		ASSERT3U(bonuslen, <=, db->db_size);
 
 		dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
 		ASSERT3S(va_nblocks, >=, 0);
@@ -1671,7 +1671,7 @@
 		 * See comments above regarding the contents of
 		 * the bonus buffer and the word at endoff.
 		 */
-		for (c = 0; c < db->db_size; c++)
+		for (c = 0; c < bonuslen; c++)
 			((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
 
 		dmu_buf_rele(db, FTAG);
@@ -1948,8 +1948,8 @@
 	 */
 
 	VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db));
-	ASSERT3U(db->db_size, ==, sizeof (rbt));
-	bcopy(db->db_data, &rbt, db->db_size);
+	ASSERT3U(db->db_size, >=, sizeof (rbt));
+	bcopy(db->db_data, &rbt, sizeof (rbt));
 	if (rbt.bt_objset != 0) {
 		ASSERT3U(rbt.bt_objset, ==, dmu_objset_id(os));
 		ASSERT3U(rbt.bt_object, ==, ZTEST_DIROBJ);
@@ -2041,19 +2041,37 @@
 		wbt.bt_thread = za->za_instance;
 
 		if (off == -1ULL) {
+			dmu_object_info_t doi;
+			char *off;
+
 			wbt.bt_seq = 0;
 			VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ,
 			    FTAG, &db));
-			ASSERT3U(db->db_size, ==, sizeof (wbt));
-			bcopy(db->db_data, &rbt, db->db_size);
+			dmu_object_info_from_db(db, &doi);
+			ASSERT3U(doi.doi_bonus_size, >=, sizeof (wbt));
+			off = (char *)db->db_data +
+			    doi.doi_bonus_size - sizeof (wbt);
+			bcopy(off, &rbt, sizeof (wbt));
 			if (rbt.bt_objset != 0) {
 				ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
 				ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
 				ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
 				ASSERT3U(rbt.bt_txg, <=, wbt.bt_txg);
 			}
+			if (ztest_random(10) == 0) {
+				int newsize = (ztest_random(
+				    db->db_size / sizeof (wbt)) + 1) *
+				    sizeof (wbt);
+
+				ASSERT3U(newsize, >=, sizeof (wbt));
+				ASSERT3U(newsize, <=, db->db_size);
+				error = dmu_set_bonus(db, newsize, tx);
+				ASSERT3U(error, ==, 0);
+				off = (char *)db->db_data + newsize -
+				    sizeof (wbt);
+			}
 			dmu_buf_will_dirty(db, tx);
-			bcopy(&wbt, db->db_data, db->db_size);
+			bcopy(&wbt, off, db->db_size);
 			dmu_buf_rele(db, FTAG);
 			dmu_tx_commit(tx);
 			continue;
--- a/usr/src/uts/common/fs/zfs/dbuf.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c	Sun Aug 26 11:19:04 2007 -0700
@@ -307,7 +307,7 @@
 	}
 	if (db->db_blkid == DB_BONUS_BLKID) {
 		ASSERT(dn != NULL);
-		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
 	} else {
 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
@@ -468,13 +468,15 @@
 	ASSERT(db->db_buf == NULL);
 
 	if (db->db_blkid == DB_BONUS_BLKID) {
-		ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
+		int bonuslen = db->db_dnode->dn_bonuslen;
+
+		ASSERT3U(bonuslen, <=, db->db.db_size);
 		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		arc_space_consume(DN_MAX_BONUSLEN);
-		if (db->db.db_size < DN_MAX_BONUSLEN)
+		if (bonuslen < DN_MAX_BONUSLEN)
 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
 		bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
-		    db->db.db_size);
+		    bonuslen);
 		dbuf_update_data(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
@@ -781,31 +783,28 @@
 }
 
 static int
-dbuf_new_block(dmu_buf_impl_t *db)
+dbuf_block_freeable(dmu_buf_impl_t *db)
 {
 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
 	uint64_t birth_txg = 0;
 
-	/* Don't count meta-objects */
-	if (ds == NULL)
-		return (FALSE);
-
 	/*
 	 * We don't need any locking to protect db_blkptr:
 	 * If it's syncing, then db_last_dirty will be set
 	 * so we'll ignore db_blkptr.
 	 */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	/* If we have been dirtied since the last snapshot, its not new */
 	if (db->db_last_dirty)
 		birth_txg = db->db_last_dirty->dr_txg;
 	else if (db->db_blkptr)
 		birth_txg = db->db_blkptr->blk_birth;
 
+	/* If we don't exist or are in a snapshot, we can't be freed */
 	if (birth_txg)
-		return (!dsl_dataset_block_freeable(ds, birth_txg));
+		return (ds == NULL ||
+		    dsl_dataset_block_freeable(ds, birth_txg));
 	else
-		return (TRUE);
+		return (FALSE);
 }
 
 void
@@ -964,6 +963,27 @@
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
+	if (db->db_blkid != DB_BONUS_BLKID) {
+		/*
+		 * Update the accounting.
+		 */
+		if (dbuf_block_freeable(db)) {
+			blkptr_t *bp = db->db_blkptr;
+			int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
+			    bp_get_dasize(os->os_spa, bp) : db->db.db_size;
+			/*
+			 * This is only a guess -- if the dbuf is dirty
+			 * in a previous txg, we don't know how much
+			 * space it will use on disk yet.  We should
+			 * really have the struct_rwlock to access
+			 * db_blkptr, but since this is just a guess,
+			 * it's OK if we get an odd answer.
+			 */
+			dnode_willuse_space(dn, -willfree, tx);
+		}
+		dnode_willuse_space(dn, db->db.db_size, tx);
+	}
+
 	/*
 	 * If this buffer is dirty in an old transaction group we need
 	 * to make a copy of it so that the changes we make in this
@@ -1013,25 +1033,6 @@
 		db->db_freed_in_flight = FALSE;
 	}
 
-	if (db->db_blkid != DB_BONUS_BLKID) {
-		/*
-		 * Update the accounting.
-		 */
-		if (!dbuf_new_block(db) && db->db_blkptr) {
-			/*
-			 * This is only a guess -- if the dbuf is dirty
-			 * in a previous txg, we don't know how much
-			 * space it will use on disk yet.  We should
-			 * really have the struct_rwlock to access
-			 * db_blkptr, but since this is just a guess,
-			 * it's OK if we get an odd answer.
-			 */
-			dnode_willuse_space(dn,
-			    -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
-		}
-		dnode_willuse_space(dn, db->db.db_size, tx);
-	}
-
 	/*
 	 * This buffer is now part of this txg
 	 */
@@ -1297,6 +1298,7 @@
 	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
 		list_remove(&dn->dn_dbufs, db);
 		dnode_rele(dn, db);
+		db->db_dnode = NULL;
 	}
 
 	if (db->db_buf)
@@ -1397,7 +1399,9 @@
 
 	if (blkid == DB_BONUS_BLKID) {
 		ASSERT3P(parent, ==, dn->dn_dbuf);
-		db->db.db_size = dn->dn_bonuslen;
+		db->db.db_size = DN_MAX_BONUSLEN -
+		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		db->db.db_offset = DB_BONUS_BLKID;
 		db->db_state = DB_UNCACHED;
 		/* the bonus dbuf is not placed in the hash table */
@@ -1471,29 +1475,23 @@
 	ASSERT(refcount_is_zero(&db->db_holds));
 
 	if (db->db_blkid != DB_BONUS_BLKID) {
-		dnode_t *dn = db->db_dnode;
-		boolean_t need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
-
-		if (need_mutex)
-			mutex_enter(&dn->dn_dbufs_mtx);
-
 		/*
 		 * If this dbuf is still on the dn_dbufs list,
 		 * remove it from that list.
 		 */
-		if (list_link_active(&db->db_link)) {
-			ASSERT(need_mutex);
+		if (db->db_dnode) {
+			dnode_t *dn = db->db_dnode;
+
+			mutex_enter(&dn->dn_dbufs_mtx);
 			list_remove(&dn->dn_dbufs, db);
 			mutex_exit(&dn->dn_dbufs_mtx);
 
 			dnode_rele(dn, db);
-		} else if (need_mutex) {
-			mutex_exit(&dn->dn_dbufs_mtx);
+			db->db_dnode = NULL;
 		}
 		dbuf_hash_remove(db);
 	}
 	db->db_parent = NULL;
-	db->db_dnode = NULL;
 	db->db_buf = NULL;
 
 	ASSERT(!list_link_active(&db->db_link));
@@ -1662,16 +1660,13 @@
 	return (err ? NULL : db);
 }
 
-dmu_buf_impl_t *
+void
 dbuf_create_bonus(dnode_t *dn)
 {
-	dmu_buf_impl_t *db = dn->dn_bonus;
-
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	ASSERT(dn->dn_bonus == NULL);
-	db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
-	return (db);
+	dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
 }
 
 #pragma weak dmu_buf_add_ref = dbuf_add_ref
@@ -1919,11 +1914,7 @@
 	 */
 	if (db->db_blkid == DB_BONUS_BLKID) {
 		dbuf_dirty_record_t **drp;
-		/*
-		 * Use dn_phys->dn_bonuslen since db.db_size is the length
-		 * of the bonus buffer in the open transaction rather than
-		 * the syncing transaction.
-		 */
+
 		ASSERT(*datap != NULL);
 		ASSERT3U(db->db_level, ==, 0);
 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
--- a/usr/src/uts/common/fs/zfs/dmu.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c	Sun Aug 26 11:19:04 2007 -0700
@@ -119,6 +119,19 @@
 	return (DN_MAX_BONUSLEN);
 }
 
+int
+dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+{
+	dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+
+	if (dn->dn_bonus != (dmu_buf_impl_t *)db)
+		return (EINVAL);
+	if (newsize < 0 || newsize > db->db_size)
+		return (EINVAL);
+	dnode_setbonuslen(dn, newsize, tx);
+	return (0);
+}
+
 /*
  * returns ENOENT, EIO, or 0.
  */
@@ -126,27 +139,27 @@
 dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
-	int err, count;
 	dmu_buf_impl_t *db;
+	int error;
 
-	err = dnode_hold(os->os, object, FTAG, &dn);
-	if (err)
-		return (err);
+	error = dnode_hold(os->os, object, FTAG, &dn);
+	if (error)
+		return (error);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_bonus == NULL) {
 		rw_exit(&dn->dn_struct_rwlock);
 		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		if (dn->dn_bonus == NULL)
-			dn->dn_bonus = dbuf_create_bonus(dn);
+			dbuf_create_bonus(dn);
 	}
 	db = dn->dn_bonus;
 	rw_exit(&dn->dn_struct_rwlock);
-	mutex_enter(&db->db_mtx);
-	count = refcount_add(&db->db_holds, tag);
-	mutex_exit(&db->db_mtx);
-	if (count == 1)
-		dnode_add_ref(dn, db);
+
+	/* as long as the bonus buf is held, the dnode will be held */
+	if (refcount_add(&db->db_holds, tag) == 1)
+		VERIFY(dnode_add_ref(dn, db));
+
 	dnode_rele(dn, FTAG);
 
 	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
@@ -388,7 +401,6 @@
 
 	while (size > 0) {
 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
-		int err;
 
 		/*
 		 * NB: we could do this block-at-a-time, but it's nice
@@ -397,7 +409,7 @@
 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
 		    TRUE, FTAG, &numbufs, &dbp);
 		if (err)
-			return (err);
+			break;
 
 		for (i = 0; i < numbufs; i++) {
 			int tocpy;
@@ -418,7 +430,7 @@
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 	dnode_rele(dn, FTAG);
-	return (0);
+	return (err);
 }
 
 void
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c	Sun Aug 26 11:19:04 2007 -0700
@@ -309,7 +309,7 @@
 }
 
 int
-dmu_objset_evict_dbufs(objset_t *os, boolean_t try)
+dmu_objset_evict_dbufs(objset_t *os)
 {
 	objset_impl_t *osi = os->os;
 	dnode_t *dn;
@@ -327,34 +327,25 @@
 	 * skip.
 	 */
 	for (dn = list_head(&osi->os_dnodes);
-	    dn && refcount_is_zero(&dn->dn_holds);
+	    dn && !dnode_add_ref(dn, FTAG);
 	    dn = list_next(&osi->os_dnodes, dn))
 		continue;
-	if (dn)
-		dnode_add_ref(dn, FTAG);
 
 	while (dn) {
 		dnode_t *next_dn = dn;
 
 		do {
 			next_dn = list_next(&osi->os_dnodes, next_dn);
-		} while (next_dn && refcount_is_zero(&next_dn->dn_holds));
-		if (next_dn)
-			dnode_add_ref(next_dn, FTAG);
+		} while (next_dn && !dnode_add_ref(next_dn, FTAG));
 
 		mutex_exit(&osi->os_lock);
-		if (dnode_evict_dbufs(dn, try)) {
-			dnode_rele(dn, FTAG);
-			if (next_dn)
-				dnode_rele(next_dn, FTAG);
-			return (1);
-		}
+		dnode_evict_dbufs(dn);
 		dnode_rele(dn, FTAG);
 		mutex_enter(&osi->os_lock);
 		dn = next_dn;
 	}
 	mutex_exit(&osi->os_lock);
-	return (0);
+	return (list_head(&osi->os_dnodes) != osi->os_meta_dnode);
 }
 
 void
@@ -383,7 +374,7 @@
 	 * nothing can be added to the list at this point.
 	 */
 	os.os = osi;
-	(void) dmu_objset_evict_dbufs(&os, 0);
+	(void) dmu_objset_evict_dbufs(&os);
 
 	ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
 	ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
--- a/usr/src/uts/common/fs/zfs/dmu_send.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c	Sun Aug 26 11:19:04 2007 -0700
@@ -610,13 +610,13 @@
 		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
 		dmu_buf_will_dirty(db, tx);
 
-		ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
-		data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
+		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
+		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
 		if (data == NULL) {
 			dmu_tx_commit(tx);
 			return (ra->err);
 		}
-		bcopy(data, db->db_data, db->db_size);
+		bcopy(data, db->db_data, drro->drr_bonuslen);
 		if (ra->byteswap) {
 			dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
 			    drro->drr_bonuslen);
--- a/usr/src/uts/common/fs/zfs/dnode.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode.c	Sun Aug 26 11:19:04 2007 -0700
@@ -240,6 +240,23 @@
 	else return (0);
 }
 
+void
+dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
+{
+	ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+
+	dnode_setdirty(dn, tx);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
+	dn->dn_bonuslen = newsize;
+	if (newsize == 0)
+		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
+	else
+		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
 static void
 dnode_setdblksz(dnode_t *dn, int size)
 {
@@ -363,6 +380,7 @@
 	for (i = 0; i < TXG_SIZE; i++) {
 		ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
 		ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+		ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
 		ASSERT3U(dn->dn_next_blksz[i], ==, 0);
 		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 		ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
@@ -390,6 +408,7 @@
 
 	dnode_setdirty(dn, tx);
 	dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
+	dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
 	dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
 }
 
@@ -397,7 +416,7 @@
 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
-	int i;
+	int i, old_nblkptr;
 	dmu_buf_impl_t *db = NULL;
 
 	ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
@@ -414,7 +433,7 @@
 		ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
 
 	/* clean up any unreferenced dbufs */
-	(void) dnode_evict_dbufs(dn, 0);
+	dnode_evict_dbufs(dn);
 	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
 
 	/*
@@ -437,38 +456,18 @@
 	}
 	dnode_setdblksz(dn, blocksize);
 	dnode_setdirty(dn, tx);
+	dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
 	dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
 	rw_exit(&dn->dn_struct_rwlock);
-	if (db) {
+	if (db)
 		dbuf_rele(db, FTAG);
-		db = NULL;
-	}
 
 	/* change type */
 	dn->dn_type = ot;
 
-	if (dn->dn_bonuslen != bonuslen) {
-		/* change bonus size */
-		if (bonuslen == 0)
-			bonuslen = 1; /* XXX */
-		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-		if (dn->dn_bonus == NULL)
-			dn->dn_bonus = dbuf_create_bonus(dn);
-		db = dn->dn_bonus;
-		rw_exit(&dn->dn_struct_rwlock);
-		if (refcount_add(&db->db_holds, FTAG) == 1)
-			dnode_add_ref(dn, db);
-		VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
-		mutex_enter(&db->db_mtx);
-		ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
-		ASSERT(db->db.db_data != NULL);
-		db->db.db_size = bonuslen;
-		mutex_exit(&db->db_mtx);
-		(void) dbuf_dirty(db, tx);
-	}
-
 	/* change bonus size and type */
 	mutex_enter(&dn->dn_mtx);
+	old_nblkptr = dn->dn_nblkptr;
 	dn->dn_bonustype = bonustype;
 	dn->dn_bonuslen = bonuslen;
 	dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
@@ -476,12 +475,15 @@
 	dn->dn_compress = ZIO_COMPRESS_INHERIT;
 	ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
 
-	/*
-	 * NB: we have to do the dbuf_rele after we've changed the
-	 * dn_bonuslen, for the sake of dbuf_verify().
-	 */
-	if (db)
-		dbuf_rele(db, FTAG);
+	/* XXX - for now, we can't make nblkptr smaller */
+	ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr);
+
+	/* fix up the bonus db_size if dn_nblkptr has changed */
+	if (dn->dn_bonus && dn->dn_bonuslen != old_nblkptr) {
+		dn->dn_bonus->db.db_size =
+		    DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+		ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
+	}
 
 	dn->dn_allocated_txg = tx->tx_txg;
 	mutex_exit(&dn->dn_mtx);
@@ -646,11 +648,22 @@
 	return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
 }
 
-void
+/*
+ * Can only add a reference if there is already at least one
+ * reference on the dnode.  Returns FALSE if unable to add a
+ * new reference.
+ */
+boolean_t
 dnode_add_ref(dnode_t *dn, void *tag)
 {
-	ASSERT(refcount_count(&dn->dn_holds) > 0);
-	(void) refcount_add(&dn->dn_holds, tag);
+	mutex_enter(&dn->dn_mtx);
+	if (refcount_is_zero(&dn->dn_holds)) {
+		mutex_exit(&dn->dn_mtx);
+		return (FALSE);
+	}
+	VERIFY(1 < refcount_add(&dn->dn_holds, tag));
+	mutex_exit(&dn->dn_mtx);
+	return (TRUE);
 }
 
 void
@@ -658,7 +671,9 @@
 {
 	uint64_t refs;
 
+	mutex_enter(&dn->dn_mtx);
 	refs = refcount_remove(&dn->dn_holds, tag);
+	mutex_exit(&dn->dn_mtx);
 	/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
 	if (refs == 0 && dn->dn_dbuf)
 		dbuf_rele(dn->dn_dbuf, dn);
@@ -694,6 +709,7 @@
 
 	ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
 	ASSERT(dn->dn_datablksz != 0);
+	ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
 	ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
 
 	dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
@@ -716,7 +732,7 @@
 	 * dnode will hang around after we finish processing its
 	 * children.
 	 */
-	dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
+	VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
 
 	(void) dbuf_dirty(dn->dn_dbuf, tx);
 
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c	Sun Aug 26 11:19:04 2007 -0700
@@ -349,8 +349,8 @@
 /*
  * Try to kick all the dnodes dbufs out of the cache...
  */
-int
-dnode_evict_dbufs(dnode_t *dn, boolean_t try)
+void
+dnode_evict_dbufs(dnode_t *dn)
 {
 	int progress;
 	int pass = 0;
@@ -397,21 +397,6 @@
 		ASSERT(pass < 100); /* sanity check */
 	} while (progress);
 
-	/*
-	 * This function works fine even if it can't evict everything.
-	 * If were only asked to try to evict everything then
-	 * return an error if we can't. Otherwise panic as the caller
-	 * expects total eviction.
-	 */
-	if (list_head(&dn->dn_dbufs) != NULL) {
-		if (try) {
-			return (1);
-		} else {
-			panic("dangling dbufs (dn=%p, dbuf=%p)\n",
-			    dn, list_head(&dn->dn_dbufs));
-		}
-	}
-
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
 		mutex_enter(&dn->dn_bonus->db_mtx);
@@ -419,7 +404,6 @@
 		dn->dn_bonus = NULL;
 	}
 	rw_exit(&dn->dn_struct_rwlock);
-	return (0);
 }
 
 static void
@@ -459,7 +443,7 @@
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
-	(void) dnode_evict_dbufs(dn, 0);
+	dnode_evict_dbufs(dn);
 	ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
 
 	/*
@@ -565,6 +549,15 @@
 		dn->dn_next_blksz[txgoff] = 0;
 	}
 
+	if (dn->dn_next_bonuslen[txgoff]) {
+		if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
+			dnp->dn_bonuslen = 0;
+		else
+			dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
+		ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
+		dn->dn_next_bonuslen[txgoff] = 0;
+	}
+
 	if (dn->dn_next_indblkshift[txgoff]) {
 		ASSERT(dnp->dn_nlevels == 1);
 		dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
@@ -588,15 +581,16 @@
 			dnode_sync_free_range(dn,
 			    rp->fr_blkid, rp->fr_nblks, tx);
 	}
+	/* grab the mutex so we don't race with dnode_block_freed() */
 	mutex_enter(&dn->dn_mtx);
 	for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
+
 		free_range_t *last = rp;
 		rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
 		avl_remove(&dn->dn_ranges[txgoff], last);
 		kmem_free(last, sizeof (free_range_t));
 	}
 	mutex_exit(&dn->dn_mtx);
-
 	if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
 		dnode_sync_free(dn, tx);
 		return;
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c	Sun Aug 26 11:19:04 2007 -0700
@@ -691,14 +691,13 @@
  * be canceled, using dsl_dir_tempreserve_clear().
  */
 static int
-dsl_dir_tempreserve_impl(dsl_dir_t *dd,
-    uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx)
+dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize,
+    boolean_t netfree, boolean_t noquota, list_t *tr_list, dmu_tx_t *tx)
 {
 	uint64_t txg = tx->tx_txg;
 	uint64_t est_used, quota, parent_rsrv;
 	int edquot = EDQUOT;
 	int txgidx = txg & TXG_MASK;
-	boolean_t ismos;
 	int i;
 	struct tempreserve *tr;
 
@@ -718,7 +717,7 @@
 	 * If this transaction will result in a net free of space, we want
 	 * to let it through.
 	 */
-	if (netfree || dd->dd_phys->dd_quota == 0)
+	if (netfree || noquota || dd->dd_phys->dd_quota == 0)
 		quota = UINT64_MAX;
 	else
 		quota = dd->dd_phys->dd_quota;
@@ -732,8 +731,7 @@
 	 * we're very close to full, this will allow a steady trickle of
 	 * removes to get through.
 	 */
-	ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
-	if (dd->dd_parent == NULL || ismos) {
+	if (dd->dd_parent == NULL) {
 		uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
 		if (poolsize < quota) {
 			quota = poolsize;
@@ -773,9 +771,11 @@
 	list_insert_tail(tr_list, tr);
 
 	/* see if it's OK with our parent */
-	if (dd->dd_parent && parent_rsrv && !ismos) {
+	if (dd->dd_parent && parent_rsrv) {
+		boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
+
 		return (dsl_dir_tempreserve_impl(dd->dd_parent,
-		    parent_rsrv, netfree, tr_list, tx));
+		    parent_rsrv, netfree, ismos, tr_list, tx));
 	} else {
 		return (0);
 	}
@@ -800,7 +800,7 @@
 	ASSERT3S(asize, >=, 0);
 	ASSERT3S(fsize, >=, 0);
 
-	err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
+	err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, FALSE,
 	    tr_list, tx);
 
 	if (err == 0) {
--- a/usr/src/uts/common/fs/zfs/metaslab.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/metaslab.c	Sun Aug 26 11:19:04 2007 -0700
@@ -534,8 +534,8 @@
 
 	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
-	ASSERT3U(db->db_size, ==, sizeof (*smo));
-	bcopy(smo, db->db_data, db->db_size);
+	ASSERT3U(db->db_size, >=, sizeof (*smo));
+	bcopy(smo, db->db_data, sizeof (*smo));
 	dmu_buf_rele(db, FTAG);
 
 	dmu_tx_commit(tx);
--- a/usr/src/uts/common/fs/zfs/spa_misc.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c	Sun Aug 26 11:19:04 2007 -0700
@@ -1016,12 +1016,15 @@
 	if (!spa->spa_deflate)
 		return (BP_GET_ASIZE(bp));
 
+	spa_config_enter(spa, RW_READER, FTAG);
 	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
 		vdev_t *vd =
 		    vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
-		sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> SPA_MINBLOCKSHIFT) *
-		    vd->vdev_deflate_ratio;
+		if (vd)
+			sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
+			    SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
 	}
+	spa_config_exit(spa, FTAG);
 	return (sz);
 }
 
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h	Sun Aug 26 11:19:04 2007 -0700
@@ -239,7 +239,7 @@
 uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
 
 dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
-dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn);
+void dbuf_create_bonus(struct dnode *dn);
 
 dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
 dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h	Sun Aug 26 11:19:04 2007 -0700
@@ -157,7 +157,7 @@
 int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
     objset_t **osp);
 void dmu_objset_close(objset_t *os);
-int dmu_objset_evict_dbufs(objset_t *os, boolean_t try);
+int dmu_objset_evict_dbufs(objset_t *os);
 int dmu_objset_create(const char *name, dmu_objset_type_t type,
     objset_t *clone_parent,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
@@ -294,6 +294,7 @@
  */
 int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
+int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
 
 /*
  * Obtain the DMU buffer from the specified object which contains the
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h	Sun Aug 26 11:19:04 2007 -0700
@@ -108,7 +108,7 @@
 int dmu_objset_find(char *name, int func(char *, void *), void *arg,
     int flags);
 void dmu_objset_byteswap(void *buf, size_t size);
-int dmu_objset_evict_dbufs(objset_t *os, boolean_t try);
+int dmu_objset_evict_dbufs(objset_t *os);
 
 /* called from dsl */
 void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h	Sun Aug 26 11:19:04 2007 -0700
@@ -64,6 +64,7 @@
 #define	DN_MAX_NBLKPTR	((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
 #define	DN_MAX_BONUSLEN	(DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
 #define	DN_MAX_OBJECT	(1ULL << DN_MAX_OBJECT_SHIFT)
+#define	DN_ZERO_BONUSLEN	(DN_MAX_BONUSLEN + 1)
 
 #define	DNODES_PER_BLOCK_SHIFT	(DNODE_BLOCK_SHIFT - DNODE_SHIFT)
 #define	DNODES_PER_BLOCK	(1ULL << DNODES_PER_BLOCK_SHIFT)
@@ -156,6 +157,7 @@
 	uint64_t dn_maxblkid;
 	uint8_t dn_next_nlevels[TXG_SIZE];
 	uint8_t dn_next_indblkshift[TXG_SIZE];
+	uint16_t dn_next_bonuslen[TXG_SIZE];
 	uint32_t dn_next_blksz[TXG_SIZE];	/* next block size in bytes */
 
 	/* protected by os_lock: */
@@ -197,11 +199,12 @@
     uint64_t object);
 void dnode_special_close(dnode_t *dn);
 
+void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
 int dnode_hold(struct objset_impl *dd, uint64_t object,
     void *ref, dnode_t **dnp);
 int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
     void *ref, dnode_t **dnp);
-void dnode_add_ref(dnode_t *dn, void *ref);
+boolean_t dnode_add_ref(dnode_t *dn, void *ref);
 void dnode_rele(dnode_t *dn, void *ref);
 void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
 void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
@@ -226,7 +229,7 @@
 void dnode_fini(void);
 int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
     uint64_t blkfill, uint64_t txg);
-int dnode_evict_dbufs(dnode_t *dn, boolean_t try);
+void dnode_evict_dbufs(dnode_t *dn);
 
 #ifdef ZFS_DEBUG
 
--- a/usr/src/uts/common/fs/zfs/vdev.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev.c	Sun Aug 26 11:19:04 2007 -0700
@@ -765,8 +765,8 @@
 				error = dmu_bonus_hold(mos, object, FTAG, &db);
 				if (error)
 					return (error);
-				ASSERT3U(db->db_size, ==, sizeof (smo));
-				bcopy(db->db_data, &smo, db->db_size);
+				ASSERT3U(db->db_size, >=, sizeof (smo));
+				bcopy(db->db_data, &smo, sizeof (smo));
 				ASSERT3U(smo.smo_object, ==, object);
 				dmu_buf_rele(db, FTAG);
 			}
@@ -1234,8 +1234,8 @@
 	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
 		return (error);
 
-	ASSERT3U(db->db_size, ==, sizeof (*smo));
-	bcopy(db->db_data, smo, db->db_size);
+	ASSERT3U(db->db_size, >=, sizeof (*smo));
+	bcopy(db->db_data, smo, sizeof (*smo));
 	dmu_buf_rele(db, FTAG);
 
 	mutex_enter(&vd->vdev_dtl_lock);
@@ -1305,8 +1305,8 @@
 
 	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
-	ASSERT3U(db->db_size, ==, sizeof (*smo));
-	bcopy(smo, db->db_data, db->db_size);
+	ASSERT3U(db->db_size, >=, sizeof (*smo));
+	bcopy(smo, db->db_data, sizeof (*smo));
 	dmu_buf_rele(db, FTAG);
 
 	dmu_tx_commit(tx);
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c	Sun Aug 26 11:19:04 2007 -0700
@@ -1132,12 +1132,9 @@
 	}
 
 	/*
-	 * Evict all dbufs so that cached znodes will be freed
+	 * Evict cached data
 	 */
-	if (dmu_objset_evict_dbufs(os, B_TRUE)) {
-		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
-		(void) dmu_objset_evict_dbufs(os, B_FALSE);
-	}
+	(void) dmu_objset_evict_dbufs(os);
 
 	/*
 	 * Finally close the objset
--- a/usr/src/uts/common/os/list.c	Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/os/list.c	Sun Aug 26 11:19:04 2007 -0700
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -113,6 +113,7 @@
 {
 	list_node_t *lold = list_d2l(list, object);
 	ASSERT(!list_empty(list));
+	ASSERT(lold->list_next != NULL);
 	lold->list_prev->list_next = lold->list_next;
 	lold->list_next->list_prev = lold->list_prev;
 	lold->list_next = lold->list_prev = NULL;