6569719 panic dangling dbufs (dn=ffffffff28814d30, dbuf=ffffffff20756008)
6573361 panic turnstile_block, unowned mutex
6584864 $MOS is not properly bounded by pool size
6585265 need bonus resize interface
6587723 BAD TRAP: type=e (#pf Page fault) occurred in module "zfs" due to a NULL pointer dereference
6589799 dangling dbuf after zinject
6594025 panic: dangling dbufs during shutdown
--- a/usr/src/cmd/ztest/ztest.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/cmd/ztest/ztest.c Sun Aug 26 11:19:04 2007 -0700
@@ -1541,7 +1541,7 @@
ASSERT(doi.doi_bonus_type == DMU_OT_PLAIN_OTHER);
ASSERT3S(doi.doi_physical_blks, >=, 0);
- bonuslen = db->db_size;
+ bonuslen = doi.doi_bonus_size;
for (c = 0; c < bonuslen; c++) {
if (((uint8_t *)db->db_data)[c] !=
@@ -1660,7 +1660,7 @@
* Write to both the bonus buffer and the regular data.
*/
VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db));
- ASSERT3U(bonuslen, ==, db->db_size);
+ ASSERT3U(bonuslen, <=, db->db_size);
dmu_object_size_from_db(db, &va_blksize, &va_nblocks);
ASSERT3S(va_nblocks, >=, 0);
@@ -1671,7 +1671,7 @@
* See comments above regarding the contents of
* the bonus buffer and the word at endoff.
*/
- for (c = 0; c < db->db_size; c++)
+ for (c = 0; c < bonuslen; c++)
((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen);
dmu_buf_rele(db, FTAG);
@@ -1948,8 +1948,8 @@
*/
VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db));
- ASSERT3U(db->db_size, ==, sizeof (rbt));
- bcopy(db->db_data, &rbt, db->db_size);
+ ASSERT3U(db->db_size, >=, sizeof (rbt));
+ bcopy(db->db_data, &rbt, sizeof (rbt));
if (rbt.bt_objset != 0) {
ASSERT3U(rbt.bt_objset, ==, dmu_objset_id(os));
ASSERT3U(rbt.bt_object, ==, ZTEST_DIROBJ);
@@ -2041,19 +2041,41 @@
wbt.bt_thread = za->za_instance;
if (off == -1ULL) {
+ dmu_object_info_t doi;
+ char *dboff;
+
wbt.bt_seq = 0;
VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ,
FTAG, &db));
- ASSERT3U(db->db_size, ==, sizeof (wbt));
- bcopy(db->db_data, &rbt, db->db_size);
+ dmu_object_info_from_db(db, &doi);
+ ASSERT3U(doi.doi_bonus_size, >=, sizeof (wbt));
+ dboff = (char *)db->db_data +
+ doi.doi_bonus_size - sizeof (wbt);
+ bcopy(dboff, &rbt, sizeof (wbt));
if (rbt.bt_objset != 0) {
ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset);
ASSERT3U(rbt.bt_object, ==, wbt.bt_object);
ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset);
ASSERT3U(rbt.bt_txg, <=, wbt.bt_txg);
}
+ if (ztest_random(10) == 0) {
+ int newsize = (ztest_random(
+ db->db_size / sizeof (wbt)) + 1) *
+ sizeof (wbt);
+
+ ASSERT3U(newsize, >=, sizeof (wbt));
+ ASSERT3U(newsize, <=, db->db_size);
+ error = dmu_set_bonus(db, newsize, tx);
+ ASSERT3U(error, ==, 0);
+ dboff = (char *)db->db_data + newsize -
+ sizeof (wbt);
+ }
dmu_buf_will_dirty(db, tx);
- bcopy(&wbt, db->db_data, db->db_size);
+ /*
+ * The tag occupies the last sizeof (wbt) bytes of the
+ * bonus data, so copy exactly that many bytes.
+ */
+ bcopy(&wbt, dboff, sizeof (wbt));
dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
continue;
--- a/usr/src/uts/common/fs/zfs/dbuf.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dbuf.c Sun Aug 26 11:19:04 2007 -0700
@@ -307,7 +307,7 @@
}
if (db->db_blkid == DB_BONUS_BLKID) {
ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
@@ -468,13 +468,15 @@
ASSERT(db->db_buf == NULL);
if (db->db_blkid == DB_BONUS_BLKID) {
- ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
+ int bonuslen = db->db_dnode->dn_bonuslen;
+
+ ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
arc_space_consume(DN_MAX_BONUSLEN);
- if (db->db.db_size < DN_MAX_BONUSLEN)
+ if (bonuslen < DN_MAX_BONUSLEN)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
- db->db.db_size);
+ bonuslen);
dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
@@ -781,31 +783,28 @@
}
static int
-dbuf_new_block(dmu_buf_impl_t *db)
+dbuf_block_freeable(dmu_buf_impl_t *db)
{
dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
uint64_t birth_txg = 0;
- /* Don't count meta-objects */
- if (ds == NULL)
- return (FALSE);
-
/*
* We don't need any locking to protect db_blkptr:
* If it's syncing, then db_last_dirty will be set
* so we'll ignore db_blkptr.
*/
ASSERT(MUTEX_HELD(&db->db_mtx));
- /* If we have been dirtied since the last snapshot, its not new */
if (db->db_last_dirty)
birth_txg = db->db_last_dirty->dr_txg;
else if (db->db_blkptr)
birth_txg = db->db_blkptr->blk_birth;
+ /* If we don't exist or are in a snapshot, we can't be freed */
if (birth_txg)
- return (!dsl_dataset_block_freeable(ds, birth_txg));
+ return (ds == NULL ||
+ dsl_dataset_block_freeable(ds, birth_txg));
else
- return (TRUE);
+ return (FALSE);
}
void
@@ -964,6 +963,27 @@
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ /*
+ * Update the accounting.
+ */
+ if (dbuf_block_freeable(db)) {
+ blkptr_t *bp = db->db_blkptr;
+ int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
+ bp_get_dasize(os->os_spa, bp) : db->db.db_size;
+ /*
+ * This is only a guess -- if the dbuf is dirty
+ * in a previous txg, we don't know how much
+ * space it will use on disk yet. We should
+ * really have the struct_rwlock to access
+ * db_blkptr, but since this is just a guess,
+ * it's OK if we get an odd answer.
+ */
+ dnode_willuse_space(dn, -willfree, tx);
+ }
+ dnode_willuse_space(dn, db->db.db_size, tx);
+ }
+
/*
* If this buffer is dirty in an old transaction group we need
* to make a copy of it so that the changes we make in this
@@ -1013,25 +1033,6 @@
db->db_freed_in_flight = FALSE;
}
- if (db->db_blkid != DB_BONUS_BLKID) {
- /*
- * Update the accounting.
- */
- if (!dbuf_new_block(db) && db->db_blkptr) {
- /*
- * This is only a guess -- if the dbuf is dirty
- * in a previous txg, we don't know how much
- * space it will use on disk yet. We should
- * really have the struct_rwlock to access
- * db_blkptr, but since this is just a guess,
- * it's OK if we get an odd answer.
- */
- dnode_willuse_space(dn,
- -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
- }
- dnode_willuse_space(dn, db->db.db_size, tx);
- }
-
/*
* This buffer is now part of this txg
*/
@@ -1297,6 +1298,7 @@
if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
list_remove(&dn->dn_dbufs, db);
dnode_rele(dn, db);
+ db->db_dnode = NULL;
}
if (db->db_buf)
@@ -1397,7 +1399,9 @@
if (blkid == DB_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
- db->db.db_size = dn->dn_bonuslen;
+ db->db.db_size = DN_MAX_BONUSLEN -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DB_BONUS_BLKID;
db->db_state = DB_UNCACHED;
/* the bonus dbuf is not placed in the hash table */
@@ -1471,29 +1475,23 @@
ASSERT(refcount_is_zero(&db->db_holds));
if (db->db_blkid != DB_BONUS_BLKID) {
- dnode_t *dn = db->db_dnode;
- boolean_t need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
-
- if (need_mutex)
- mutex_enter(&dn->dn_dbufs_mtx);
-
/*
* If this dbuf is still on the dn_dbufs list,
* remove it from that list.
*/
- if (list_link_active(&db->db_link)) {
- ASSERT(need_mutex);
+ if (db->db_dnode) {
+ dnode_t *dn = db->db_dnode;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
list_remove(&dn->dn_dbufs, db);
mutex_exit(&dn->dn_dbufs_mtx);
dnode_rele(dn, db);
- } else if (need_mutex) {
- mutex_exit(&dn->dn_dbufs_mtx);
+ db->db_dnode = NULL;
}
dbuf_hash_remove(db);
}
db->db_parent = NULL;
- db->db_dnode = NULL;
db->db_buf = NULL;
ASSERT(!list_link_active(&db->db_link));
@@ -1662,16 +1660,13 @@
return (err ? NULL : db);
}
-dmu_buf_impl_t *
+void
dbuf_create_bonus(dnode_t *dn)
{
- dmu_buf_impl_t *db = dn->dn_bonus;
-
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_bonus == NULL);
- db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
- return (db);
+ dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
}
#pragma weak dmu_buf_add_ref = dbuf_add_ref
@@ -1919,11 +1914,7 @@
*/
if (db->db_blkid == DB_BONUS_BLKID) {
dbuf_dirty_record_t **drp;
- /*
- * Use dn_phys->dn_bonuslen since db.db_size is the length
- * of the bonus buffer in the open transaction rather than
- * the syncing transaction.
- */
+
ASSERT(*datap != NULL);
ASSERT3U(db->db_level, ==, 0);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
--- a/usr/src/uts/common/fs/zfs/dmu.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu.c Sun Aug 26 11:19:04 2007 -0700
@@ -119,6 +119,19 @@
return (DN_MAX_BONUSLEN);
}
+int
+dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+
+ if (dn->dn_bonus != (dmu_buf_impl_t *)db)
+ return (EINVAL);
+ if (newsize < 0 || newsize > db->db_size)
+ return (EINVAL);
+ dnode_setbonuslen(dn, newsize, tx);
+ return (0);
+}
+
/*
* returns ENOENT, EIO, or 0.
*/
@@ -126,27 +139,27 @@
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
- int err, count;
dmu_buf_impl_t *db;
+ int error;
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
+ error = dnode_hold(os->os, object, FTAG, &dn);
+ if (error)
+ return (error);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_bonus == NULL) {
rw_exit(&dn->dn_struct_rwlock);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
+ dbuf_create_bonus(dn);
}
db = dn->dn_bonus;
rw_exit(&dn->dn_struct_rwlock);
- mutex_enter(&db->db_mtx);
- count = refcount_add(&db->db_holds, tag);
- mutex_exit(&db->db_mtx);
- if (count == 1)
- dnode_add_ref(dn, db);
+
+ /* as long as the bonus buf is held, the dnode will be held */
+ if (refcount_add(&db->db_holds, tag) == 1)
+ VERIFY(dnode_add_ref(dn, db));
+
dnode_rele(dn, FTAG);
VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
@@ -388,7 +401,6 @@
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
- int err;
/*
* NB: we could do this block-at-a-time, but it's nice
@@ -397,7 +409,7 @@
err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
TRUE, FTAG, &numbufs, &dbp);
if (err)
- return (err);
+ break;
for (i = 0; i < numbufs; i++) {
int tocpy;
@@ -418,7 +430,7 @@
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
dnode_rele(dn, FTAG);
- return (0);
+ return (err);
}
void
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Sun Aug 26 11:19:04 2007 -0700
@@ -309,7 +309,7 @@
}
int
-dmu_objset_evict_dbufs(objset_t *os, boolean_t try)
+dmu_objset_evict_dbufs(objset_t *os)
{
objset_impl_t *osi = os->os;
dnode_t *dn;
@@ -327,34 +327,25 @@
* skip.
*/
for (dn = list_head(&osi->os_dnodes);
- dn && refcount_is_zero(&dn->dn_holds);
+ dn && !dnode_add_ref(dn, FTAG);
dn = list_next(&osi->os_dnodes, dn))
continue;
- if (dn)
- dnode_add_ref(dn, FTAG);
while (dn) {
dnode_t *next_dn = dn;
do {
next_dn = list_next(&osi->os_dnodes, next_dn);
- } while (next_dn && refcount_is_zero(&next_dn->dn_holds));
- if (next_dn)
- dnode_add_ref(next_dn, FTAG);
+ } while (next_dn && !dnode_add_ref(next_dn, FTAG));
mutex_exit(&osi->os_lock);
- if (dnode_evict_dbufs(dn, try)) {
- dnode_rele(dn, FTAG);
- if (next_dn)
- dnode_rele(next_dn, FTAG);
- return (1);
- }
+ dnode_evict_dbufs(dn);
dnode_rele(dn, FTAG);
mutex_enter(&osi->os_lock);
dn = next_dn;
}
mutex_exit(&osi->os_lock);
- return (0);
+ return (list_head(&osi->os_dnodes) != osi->os_meta_dnode);
}
void
@@ -383,7 +374,7 @@
* nothing can be added to the list at this point.
*/
os.os = osi;
- (void) dmu_objset_evict_dbufs(&os, 0);
+ (void) dmu_objset_evict_dbufs(&os);
ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
--- a/usr/src/uts/common/fs/zfs/dmu_send.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c Sun Aug 26 11:19:04 2007 -0700
@@ -610,13 +610,13 @@
VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
- data = restore_read(ra, P2ROUNDUP(db->db_size, 8));
+ ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
+ data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
if (data == NULL) {
dmu_tx_commit(tx);
return (ra->err);
}
- bcopy(data, db->db_data, db->db_size);
+ bcopy(data, db->db_data, drro->drr_bonuslen);
if (ra->byteswap) {
dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
drro->drr_bonuslen);
--- a/usr/src/uts/common/fs/zfs/dnode.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode.c Sun Aug 26 11:19:04 2007 -0700
@@ -240,6 +240,23 @@
else return (0);
}
+void
+dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
+{
+ ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t));
+ dn->dn_bonuslen = newsize;
+ if (newsize == 0)
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
+ else
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
static void
dnode_setdblksz(dnode_t *dn, int size)
{
@@ -363,6 +380,7 @@
for (i = 0; i < TXG_SIZE; i++) {
ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
+ ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
ASSERT3U(dn->dn_next_blksz[i], ==, 0);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
@@ -390,6 +408,7 @@
dnode_setdirty(dn, tx);
dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
}
@@ -397,7 +416,7 @@
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- int i;
+ int i, old_nblkptr;
dmu_buf_impl_t *db = NULL;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
@@ -414,7 +433,7 @@
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
/* clean up any unreferenced dbufs */
- (void) dnode_evict_dbufs(dn, 0);
+ dnode_evict_dbufs(dn);
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
/*
@@ -437,38 +456,18 @@
}
dnode_setdblksz(dn, blocksize);
dnode_setdirty(dn, tx);
+ dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
rw_exit(&dn->dn_struct_rwlock);
- if (db) {
+ if (db)
dbuf_rele(db, FTAG);
- db = NULL;
- }
/* change type */
dn->dn_type = ot;
- if (dn->dn_bonuslen != bonuslen) {
- /* change bonus size */
- if (bonuslen == 0)
- bonuslen = 1; /* XXX */
- rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
- db = dn->dn_bonus;
- rw_exit(&dn->dn_struct_rwlock);
- if (refcount_add(&db->db_holds, FTAG) == 1)
- dnode_add_ref(dn, db);
- VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
- mutex_enter(&db->db_mtx);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
- ASSERT(db->db.db_data != NULL);
- db->db.db_size = bonuslen;
- mutex_exit(&db->db_mtx);
- (void) dbuf_dirty(db, tx);
- }
-
/* change bonus size and type */
mutex_enter(&dn->dn_mtx);
+ old_nblkptr = dn->dn_nblkptr;
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
@@ -476,12 +475,15 @@
dn->dn_compress = ZIO_COMPRESS_INHERIT;
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
- /*
- * NB: we have to do the dbuf_rele after we've changed the
- * dn_bonuslen, for the sake of dbuf_verify().
- */
- if (db)
- dbuf_rele(db, FTAG);
+ /* XXX - for now, we can't make nblkptr smaller */
+ ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr);
+
+ /* fix up the bonus db_size if dn_nblkptr has changed */
+ if (dn->dn_bonus && dn->dn_nblkptr != old_nblkptr) {
+ dn->dn_bonus->db.db_size =
+ DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
+ }
dn->dn_allocated_txg = tx->tx_txg;
mutex_exit(&dn->dn_mtx);
@@ -646,11 +648,22 @@
return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
}
-void
+/*
+ * Can only add a reference if there is already at least one
+ * reference on the dnode. Returns FALSE if unable to add a
+ * new reference.
+ */
+boolean_t
dnode_add_ref(dnode_t *dn, void *tag)
{
- ASSERT(refcount_count(&dn->dn_holds) > 0);
- (void) refcount_add(&dn->dn_holds, tag);
+ mutex_enter(&dn->dn_mtx);
+ if (refcount_is_zero(&dn->dn_holds)) {
+ mutex_exit(&dn->dn_mtx);
+ return (FALSE);
+ }
+ VERIFY(1 < refcount_add(&dn->dn_holds, tag));
+ mutex_exit(&dn->dn_mtx);
+ return (TRUE);
}
void
@@ -658,7 +671,9 @@
{
uint64_t refs;
+ mutex_enter(&dn->dn_mtx);
refs = refcount_remove(&dn->dn_holds, tag);
+ mutex_exit(&dn->dn_mtx);
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
if (refs == 0 && dn->dn_dbuf)
dbuf_rele(dn->dn_dbuf, dn);
@@ -694,6 +709,7 @@
ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
ASSERT(dn->dn_datablksz != 0);
+ ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
@@ -716,7 +732,7 @@
* dnode will hang around after we finish processing its
* children.
*/
- dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
+ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
(void) dbuf_dirty(dn->dn_dbuf, tx);
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c Sun Aug 26 11:19:04 2007 -0700
@@ -349,8 +349,8 @@
/*
* Try to kick all the dnodes dbufs out of the cache...
*/
-int
-dnode_evict_dbufs(dnode_t *dn, boolean_t try)
+void
+dnode_evict_dbufs(dnode_t *dn)
{
int progress;
int pass = 0;
@@ -397,21 +397,6 @@
ASSERT(pass < 100); /* sanity check */
} while (progress);
- /*
- * This function works fine even if it can't evict everything.
- * If were only asked to try to evict everything then
- * return an error if we can't. Otherwise panic as the caller
- * expects total eviction.
- */
- if (list_head(&dn->dn_dbufs) != NULL) {
- if (try) {
- return (1);
- } else {
- panic("dangling dbufs (dn=%p, dbuf=%p)\n",
- dn, list_head(&dn->dn_dbufs));
- }
- }
-
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
mutex_enter(&dn->dn_bonus->db_mtx);
@@ -419,7 +404,6 @@
dn->dn_bonus = NULL;
}
rw_exit(&dn->dn_struct_rwlock);
- return (0);
}
static void
@@ -459,7 +443,7 @@
ASSERT(dmu_tx_is_syncing(tx));
dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
- (void) dnode_evict_dbufs(dn, 0);
+ dnode_evict_dbufs(dn);
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
/*
@@ -565,6 +549,15 @@
dn->dn_next_blksz[txgoff] = 0;
}
+ if (dn->dn_next_bonuslen[txgoff]) {
+ if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
+ dnp->dn_bonuslen = 0;
+ else
+ dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
+ ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN);
+ dn->dn_next_bonuslen[txgoff] = 0;
+ }
+
if (dn->dn_next_indblkshift[txgoff]) {
ASSERT(dnp->dn_nlevels == 1);
dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
@@ -588,15 +581,16 @@
dnode_sync_free_range(dn,
rp->fr_blkid, rp->fr_nblks, tx);
}
+ /* grab the mutex so we don't race with dnode_block_freed() */
mutex_enter(&dn->dn_mtx);
for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) {
+
free_range_t *last = rp;
rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp);
avl_remove(&dn->dn_ranges[txgoff], last);
kmem_free(last, sizeof (free_range_t));
}
mutex_exit(&dn->dn_mtx);
-
if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
dnode_sync_free(dn, tx);
return;
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c Sun Aug 26 11:19:04 2007 -0700
@@ -691,14 +691,13 @@
* be canceled, using dsl_dir_tempreserve_clear().
*/
static int
-dsl_dir_tempreserve_impl(dsl_dir_t *dd,
- uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx)
+dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize,
+ boolean_t netfree, boolean_t noquota, list_t *tr_list, dmu_tx_t *tx)
{
uint64_t txg = tx->tx_txg;
uint64_t est_used, quota, parent_rsrv;
int edquot = EDQUOT;
int txgidx = txg & TXG_MASK;
- boolean_t ismos;
int i;
struct tempreserve *tr;
@@ -718,7 +717,7 @@
* If this transaction will result in a net free of space, we want
* to let it through.
*/
- if (netfree || dd->dd_phys->dd_quota == 0)
+ if (netfree || noquota || dd->dd_phys->dd_quota == 0)
quota = UINT64_MAX;
else
quota = dd->dd_phys->dd_quota;
@@ -732,8 +731,7 @@
* we're very close to full, this will allow a steady trickle of
* removes to get through.
*/
- ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
- if (dd->dd_parent == NULL || ismos) {
+ if (dd->dd_parent == NULL) {
uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
if (poolsize < quota) {
quota = poolsize;
@@ -773,9 +771,11 @@
list_insert_tail(tr_list, tr);
/* see if it's OK with our parent */
- if (dd->dd_parent && parent_rsrv && !ismos) {
+ if (dd->dd_parent && parent_rsrv) {
+ boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
+
return (dsl_dir_tempreserve_impl(dd->dd_parent,
- parent_rsrv, netfree, tr_list, tx));
+ parent_rsrv, netfree, ismos, tr_list, tx));
} else {
return (0);
}
@@ -800,7 +800,7 @@
ASSERT3S(asize, >=, 0);
ASSERT3S(fsize, >=, 0);
- err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
+ err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, FALSE,
tr_list, tx);
if (err == 0) {
--- a/usr/src/uts/common/fs/zfs/metaslab.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/metaslab.c Sun Aug 26 11:19:04 2007 -0700
@@ -534,8 +534,8 @@
VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(smo, db->db_data, db->db_size);
+ ASSERT3U(db->db_size, >=, sizeof (*smo));
+ bcopy(smo, db->db_data, sizeof (*smo));
dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
--- a/usr/src/uts/common/fs/zfs/spa_misc.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c Sun Aug 26 11:19:04 2007 -0700
@@ -1016,12 +1016,15 @@
if (!spa->spa_deflate)
return (BP_GET_ASIZE(bp));
+ spa_config_enter(spa, RW_READER, FTAG);
for (i = 0; i < SPA_DVAS_PER_BP; i++) {
vdev_t *vd =
vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i]));
- sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> SPA_MINBLOCKSHIFT) *
- vd->vdev_deflate_ratio;
+ if (vd)
+ sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >>
+ SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
}
+ spa_config_exit(spa, FTAG);
return (sz);
}
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Sun Aug 26 11:19:04 2007 -0700
@@ -239,7 +239,7 @@
uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
-dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn);
+void dbuf_create_bonus(struct dnode *dn);
dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Sun Aug 26 11:19:04 2007 -0700
@@ -157,7 +157,7 @@
int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
objset_t **osp);
void dmu_objset_close(objset_t *os);
-int dmu_objset_evict_dbufs(objset_t *os, boolean_t try);
+int dmu_objset_evict_dbufs(objset_t *os);
int dmu_objset_create(const char *name, dmu_objset_type_t type,
objset_t *clone_parent,
void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
@@ -294,6 +294,7 @@
*/
int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
+int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
/*
* Obtain the DMU buffer from the specified object which contains the
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Sun Aug 26 11:19:04 2007 -0700
@@ -108,7 +108,7 @@
int dmu_objset_find(char *name, int func(char *, void *), void *arg,
int flags);
void dmu_objset_byteswap(void *buf, size_t size);
-int dmu_objset_evict_dbufs(objset_t *os, boolean_t try);
+int dmu_objset_evict_dbufs(objset_t *os);
/* called from dsl */
void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx);
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h Sun Aug 26 11:19:04 2007 -0700
@@ -64,6 +64,7 @@
#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
+#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1)
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
@@ -156,6 +157,7 @@
uint64_t dn_maxblkid;
uint8_t dn_next_nlevels[TXG_SIZE];
uint8_t dn_next_indblkshift[TXG_SIZE];
+ uint16_t dn_next_bonuslen[TXG_SIZE];
uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */
/* protected by os_lock: */
@@ -197,11 +199,12 @@
uint64_t object);
void dnode_special_close(dnode_t *dn);
+void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx);
int dnode_hold(struct objset_impl *dd, uint64_t object,
void *ref, dnode_t **dnp);
int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
void *ref, dnode_t **dnp);
-void dnode_add_ref(dnode_t *dn, void *ref);
+boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
@@ -226,7 +229,7 @@
void dnode_fini(void);
int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
uint64_t blkfill, uint64_t txg);
-int dnode_evict_dbufs(dnode_t *dn, boolean_t try);
+void dnode_evict_dbufs(dnode_t *dn);
#ifdef ZFS_DEBUG
--- a/usr/src/uts/common/fs/zfs/vdev.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/vdev.c Sun Aug 26 11:19:04 2007 -0700
@@ -765,8 +765,8 @@
error = dmu_bonus_hold(mos, object, FTAG, &db);
if (error)
return (error);
- ASSERT3U(db->db_size, ==, sizeof (smo));
- bcopy(db->db_data, &smo, db->db_size);
+ ASSERT3U(db->db_size, >=, sizeof (smo));
+ bcopy(db->db_data, &smo, sizeof (smo));
ASSERT3U(smo.smo_object, ==, object);
dmu_buf_rele(db, FTAG);
}
@@ -1234,8 +1234,8 @@
if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
return (error);
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(db->db_data, smo, db->db_size);
+ ASSERT3U(db->db_size, >=, sizeof (*smo));
+ bcopy(db->db_data, smo, sizeof (*smo));
dmu_buf_rele(db, FTAG);
mutex_enter(&vd->vdev_dtl_lock);
@@ -1305,8 +1305,8 @@
VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, ==, sizeof (*smo));
- bcopy(smo, db->db_data, db->db_size);
+ ASSERT3U(db->db_size, >=, sizeof (*smo));
+ bcopy(smo, db->db_data, sizeof (*smo));
dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c Sun Aug 26 11:19:04 2007 -0700
@@ -1132,12 +1132,9 @@
}
/*
- * Evict all dbufs so that cached znodes will be freed
+ * Evict cached data
*/
- if (dmu_objset_evict_dbufs(os, B_TRUE)) {
- txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
- (void) dmu_objset_evict_dbufs(os, B_FALSE);
- }
+ (void) dmu_objset_evict_dbufs(os);
/*
* Finally close the objset
--- a/usr/src/uts/common/os/list.c Fri Aug 24 17:45:12 2007 -0700
+++ b/usr/src/uts/common/os/list.c Sun Aug 26 11:19:04 2007 -0700
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -113,6 +113,7 @@
{
list_node_t *lold = list_d2l(list, object);
ASSERT(!list_empty(list));
+ ASSERT(lold->list_next != NULL);
lold->list_prev->list_next = lold->list_next;
lold->list_next->list_prev = lold->list_prev;
lold->list_next = lold->list_prev = NULL;