# HG changeset patch # User maybee # Date 1188152344 25200 # Node ID 96d96f8de974f2de2b704715629ab5f2771854fc # Parent f73f303e6a069eee0990dbbc17f38f19f15aeea1 6569719 panic dangling dbufs (dn=ffffffff28814d30, dbuf=ffffffff20756008) 6573361 panic turnstile_block, unowned mutex 6584864 $MOS is not properly bounded by pool size 6585265 need bonus resize interface 6587723 BAD TRAP: type=e (#pf Page fault) occurred in module "zfs" due to a NULL pointer dereference 6589799 dangling dbuf after zinject 6594025 panic: dangling dbufs during shutdown diff -r f73f303e6a06 -r 96d96f8de974 usr/src/cmd/ztest/ztest.c --- a/usr/src/cmd/ztest/ztest.c Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/cmd/ztest/ztest.c Sun Aug 26 11:19:04 2007 -0700 @@ -1541,7 +1541,7 @@ ASSERT(doi.doi_bonus_type == DMU_OT_PLAIN_OTHER); ASSERT3S(doi.doi_physical_blks, >=, 0); - bonuslen = db->db_size; + bonuslen = doi.doi_bonus_size; for (c = 0; c < bonuslen; c++) { if (((uint8_t *)db->db_data)[c] != @@ -1660,7 +1660,7 @@ * Write to both the bonus buffer and the regular data. */ VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db)); - ASSERT3U(bonuslen, ==, db->db_size); + ASSERT3U(bonuslen, <=, db->db_size); dmu_object_size_from_db(db, &va_blksize, &va_nblocks); ASSERT3S(va_nblocks, >=, 0); @@ -1671,7 +1671,7 @@ * See comments above regarding the contents of * the bonus buffer and the word at endoff. 
*/ - for (c = 0; c < db->db_size; c++) + for (c = 0; c < bonuslen; c++) ((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen); dmu_buf_rele(db, FTAG); @@ -1948,8 +1948,8 @@ */ VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db)); - ASSERT3U(db->db_size, ==, sizeof (rbt)); - bcopy(db->db_data, &rbt, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (rbt)); + bcopy(db->db_data, &rbt, sizeof (rbt)); if (rbt.bt_objset != 0) { ASSERT3U(rbt.bt_objset, ==, dmu_objset_id(os)); ASSERT3U(rbt.bt_object, ==, ZTEST_DIROBJ); @@ -2041,19 +2041,37 @@ wbt.bt_thread = za->za_instance; if (off == -1ULL) { + dmu_object_info_t doi; + char *off; + wbt.bt_seq = 0; VERIFY(0 == dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db)); - ASSERT3U(db->db_size, ==, sizeof (wbt)); - bcopy(db->db_data, &rbt, db->db_size); + dmu_object_info_from_db(db, &doi); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (wbt)); + off = (char *)db->db_data + + doi.doi_bonus_size - sizeof (wbt); + bcopy(off, &rbt, sizeof (wbt)); if (rbt.bt_objset != 0) { ASSERT3U(rbt.bt_objset, ==, wbt.bt_objset); ASSERT3U(rbt.bt_object, ==, wbt.bt_object); ASSERT3U(rbt.bt_offset, ==, wbt.bt_offset); ASSERT3U(rbt.bt_txg, <=, wbt.bt_txg); } + if (ztest_random(10) == 0) { + int newsize = (ztest_random( + db->db_size / sizeof (wbt)) + 1) * + sizeof (wbt); + + ASSERT3U(newsize, >=, sizeof (wbt)); + ASSERT3U(newsize, <=, db->db_size); + error = dmu_set_bonus(db, newsize, tx); + ASSERT3U(error, ==, 0); + off = (char *)db->db_data + newsize - + sizeof (wbt); + } dmu_buf_will_dirty(db, tx); - bcopy(&wbt, db->db_data, db->db_size); + bcopy(&wbt, off, sizeof (wbt)); /* one tag only: off sits sizeof (wbt) before the end of the in-use bonus data */ dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); continue; diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/dbuf.c --- a/usr/src/uts/common/fs/zfs/dbuf.c Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dbuf.c Sun Aug 26 11:19:04 2007 -0700 @@ -307,7 +307,7 @@ } if (db->db_blkid == DB_BONUS_BLKID) { ASSERT(dn != NULL); - ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); + 
ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); @@ -468,13 +468,15 @@ ASSERT(db->db_buf == NULL); if (db->db_blkid == DB_BONUS_BLKID) { - ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); + int bonuslen = db->db_dnode->dn_bonuslen; + + ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN); - if (db->db.db_size < DN_MAX_BONUSLEN) + if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, - db->db.db_size); + bonuslen); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); @@ -781,31 +783,28 @@ } static int -dbuf_new_block(dmu_buf_impl_t *db) +dbuf_block_freeable(dmu_buf_impl_t *db) { dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; uint64_t birth_txg = 0; - /* Don't count meta-objects */ - if (ds == NULL) - return (FALSE); - /* * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr. */ ASSERT(MUTEX_HELD(&db->db_mtx)); - /* If we have been dirtied since the last snapshot, its not new */ if (db->db_last_dirty) birth_txg = db->db_last_dirty->dr_txg; else if (db->db_blkptr) birth_txg = db->db_blkptr->blk_birth; + /* If we don't exist or are in a snapshot, we can't be freed */ if (birth_txg) - return (!dsl_dataset_block_freeable(ds, birth_txg)); + return (ds == NULL || + dsl_dataset_block_freeable(ds, birth_txg)); else - return (TRUE); + return (FALSE); } void @@ -964,6 +963,27 @@ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + if (db->db_blkid != DB_BONUS_BLKID) { + /* + * Update the accounting. + */ + if (dbuf_block_freeable(db)) { + blkptr_t *bp = db->db_blkptr; + int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? 
+ bp_get_dasize(os->os_spa, bp) : db->db.db_size; + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. + */ + dnode_willuse_space(dn, -willfree, tx); + } + dnode_willuse_space(dn, db->db.db_size, tx); + } + /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this @@ -1013,25 +1033,6 @@ db->db_freed_in_flight = FALSE; } - if (db->db_blkid != DB_BONUS_BLKID) { - /* - * Update the accounting. - */ - if (!dbuf_new_block(db) && db->db_blkptr) { - /* - * This is only a guess -- if the dbuf is dirty - * in a previous txg, we don't know how much - * space it will use on disk yet. We should - * really have the struct_rwlock to access - * db_blkptr, but since this is just a guess, - * it's OK if we get an odd answer. 
- */ - dnode_willuse_space(dn, - -bp_get_dasize(os->os_spa, db->db_blkptr), tx); - } - dnode_willuse_space(dn, db->db.db_size, tx); - } - /* * This buffer is now part of this txg */ @@ -1297,6 +1298,7 @@ if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { list_remove(&dn->dn_dbufs, db); dnode_rele(dn, db); + db->db_dnode = NULL; } if (db->db_buf) @@ -1397,7 +1399,9 @@ if (blkid == DB_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); - db->db.db_size = dn->dn_bonuslen; + db->db.db_size = DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DB_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ @@ -1471,29 +1475,23 @@ ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_blkid != DB_BONUS_BLKID) { - dnode_t *dn = db->db_dnode; - boolean_t need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx); - - if (need_mutex) - mutex_enter(&dn->dn_dbufs_mtx); - /* * If this dbuf is still on the dn_dbufs list, * remove it from that list. */ - if (list_link_active(&db->db_link)) { - ASSERT(need_mutex); + if (db->db_dnode) { + dnode_t *dn = db->db_dnode; + + mutex_enter(&dn->dn_dbufs_mtx); list_remove(&dn->dn_dbufs, db); mutex_exit(&dn->dn_dbufs_mtx); dnode_rele(dn, db); - } else if (need_mutex) { - mutex_exit(&dn->dn_dbufs_mtx); + db->db_dnode = NULL; } dbuf_hash_remove(db); } db->db_parent = NULL; - db->db_dnode = NULL; db->db_buf = NULL; ASSERT(!list_link_active(&db->db_link)); @@ -1662,16 +1660,13 @@ return (err ? 
NULL : db); } -dmu_buf_impl_t * +void dbuf_create_bonus(dnode_t *dn) { - dmu_buf_impl_t *db = dn->dn_bonus; - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); - db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); - return (db); + dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); } #pragma weak dmu_buf_add_ref = dbuf_add_ref @@ -1919,11 +1914,7 @@ */ if (db->db_blkid == DB_BONUS_BLKID) { dbuf_dirty_record_t **drp; - /* - * Use dn_phys->dn_bonuslen since db.db_size is the length - * of the bonus buffer in the open transaction rather than - * the syncing transaction. - */ + ASSERT(*datap != NULL); ASSERT3U(db->db_level, ==, 0); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/dmu.c --- a/usr/src/uts/common/fs/zfs/dmu.c Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu.c Sun Aug 26 11:19:04 2007 -0700 @@ -119,6 +119,19 @@ return (DN_MAX_BONUSLEN); } +int +dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx) +{ + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; + + if (dn->dn_bonus != (dmu_buf_impl_t *)db) + return (EINVAL); + if (newsize < 0 || newsize > db->db_size) + return (EINVAL); + dnode_setbonuslen(dn, newsize, tx); + return (0); +} + /* * returns ENOENT, EIO, or 0. 
*/ @@ -126,27 +139,27 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; - int err, count; dmu_buf_impl_t *db; + int error; - err = dnode_hold(os->os, object, FTAG, &dn); - if (err) - return (err); + error = dnode_hold(os->os, object, FTAG, &dn); + if (error) + return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) - dn->dn_bonus = dbuf_create_bonus(dn); + dbuf_create_bonus(dn); } db = dn->dn_bonus; rw_exit(&dn->dn_struct_rwlock); - mutex_enter(&db->db_mtx); - count = refcount_add(&db->db_holds, tag); - mutex_exit(&db->db_mtx); - if (count == 1) - dnode_add_ref(dn, db); + + /* as long as the bonus buf is held, the dnode will be held */ + if (refcount_add(&db->db_holds, tag) == 1) + VERIFY(dnode_add_ref(dn, db)); + dnode_rele(dn, FTAG); VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); @@ -388,7 +401,6 @@ while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); - int err; /* * NB: we could do this block-at-a-time, but it's nice @@ -397,7 +409,7 @@ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp); if (err) - return (err); + break; for (i = 0; i < numbufs; i++) { int tocpy; @@ -418,7 +430,7 @@ dmu_buf_rele_array(dbp, numbufs, FTAG); } dnode_rele(dn, FTAG); - return (0); + return (err); } void diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/dmu_objset.c --- a/usr/src/uts/common/fs/zfs/dmu_objset.c Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c Sun Aug 26 11:19:04 2007 -0700 @@ -309,7 +309,7 @@ } int -dmu_objset_evict_dbufs(objset_t *os, boolean_t try) +dmu_objset_evict_dbufs(objset_t *os) { objset_impl_t *osi = os->os; dnode_t *dn; @@ -327,34 +327,25 @@ * skip. 
*/ for (dn = list_head(&osi->os_dnodes); - dn && refcount_is_zero(&dn->dn_holds); + dn && !dnode_add_ref(dn, FTAG); dn = list_next(&osi->os_dnodes, dn)) continue; - if (dn) - dnode_add_ref(dn, FTAG); while (dn) { dnode_t *next_dn = dn; do { next_dn = list_next(&osi->os_dnodes, next_dn); - } while (next_dn && refcount_is_zero(&next_dn->dn_holds)); - if (next_dn) - dnode_add_ref(next_dn, FTAG); + } while (next_dn && !dnode_add_ref(next_dn, FTAG)); mutex_exit(&osi->os_lock); - if (dnode_evict_dbufs(dn, try)) { - dnode_rele(dn, FTAG); - if (next_dn) - dnode_rele(next_dn, FTAG); - return (1); - } + dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); mutex_enter(&osi->os_lock); dn = next_dn; } mutex_exit(&osi->os_lock); - return (0); + return (list_head(&osi->os_dnodes) != osi->os_meta_dnode); } void @@ -383,7 +374,7 @@ * nothing can be added to the list at this point. */ os.os = osi; - (void) dmu_objset_evict_dbufs(&os, 0); + (void) dmu_objset_evict_dbufs(&os); ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/dmu_send.c --- a/usr/src/uts/common/fs/zfs/dmu_send.c Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dmu_send.c Sun Aug 26 11:19:04 2007 -0700 @@ -610,13 +610,13 @@ VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, drro->drr_bonuslen); - data = restore_read(ra, P2ROUNDUP(db->db_size, 8)); + ASSERT3U(db->db_size, >=, drro->drr_bonuslen); + data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); if (data == NULL) { dmu_tx_commit(tx); return (ra->err); } - bcopy(data, db->db_data, db->db_size); + bcopy(data, db->db_data, drro->drr_bonuslen); if (ra->byteswap) { dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, drro->drr_bonuslen); diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/dnode.c --- a/usr/src/uts/common/fs/zfs/dnode.c Fri Aug 24 
17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dnode.c Sun Aug 26 11:19:04 2007 -0700 @@ -240,6 +240,23 @@ else return (0); } +void +dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) +{ + ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); + + dnode_setdirty(dn, tx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + ASSERT3U(newsize, <=, DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t)); + dn->dn_bonuslen = newsize; + if (newsize == 0) + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; + else + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + rw_exit(&dn->dn_struct_rwlock); +} + static void dnode_setdblksz(dnode_t *dn, int size) { @@ -363,6 +380,7 @@ for (i = 0; i < TXG_SIZE; i++) { ASSERT3U(dn->dn_next_nlevels[i], ==, 0); ASSERT3U(dn->dn_next_indblkshift[i], ==, 0); + ASSERT3U(dn->dn_next_bonuslen[i], ==, 0); ASSERT3U(dn->dn_next_blksz[i], ==, 0); ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); @@ -390,6 +408,7 @@ dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } @@ -397,7 +416,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - int i; + int i, old_nblkptr; dmu_buf_impl_t *db = NULL; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); @@ -414,7 +433,7 @@ ASSERT(!list_link_active(&dn->dn_dirty_link[i])); /* clean up any unreferenced dbufs */ - (void) dnode_evict_dbufs(dn, 0); + dnode_evict_dbufs(dn); ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); /* @@ -437,38 +456,18 @@ } dnode_setdblksz(dn, blocksize); dnode_setdirty(dn, tx); + dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; rw_exit(&dn->dn_struct_rwlock); - if (db) { + if (db) dbuf_rele(db, FTAG); - db = NULL; - } /* change 
type */ dn->dn_type = ot; - if (dn->dn_bonuslen != bonuslen) { - /* change bonus size */ - if (bonuslen == 0) - bonuslen = 1; /* XXX */ - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (dn->dn_bonus == NULL) - dn->dn_bonus = dbuf_create_bonus(dn); - db = dn->dn_bonus; - rw_exit(&dn->dn_struct_rwlock); - if (refcount_add(&db->db_holds, FTAG) == 1) - dnode_add_ref(dn, db); - VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED)); - mutex_enter(&db->db_mtx); - ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); - ASSERT(db->db.db_data != NULL); - db->db.db_size = bonuslen; - mutex_exit(&db->db_mtx); - (void) dbuf_dirty(db, tx); - } - /* change bonus size and type */ mutex_enter(&dn->dn_mtx); + old_nblkptr = dn->dn_nblkptr; dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); @@ -476,12 +475,15 @@ dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - /* - * NB: we have to do the dbuf_rele after we've changed the - * dn_bonuslen, for the sake of dbuf_verify(). - */ - if (db) - dbuf_rele(db, FTAG); + /* XXX - for now, we can't make nblkptr smaller */ + ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr); + + /* fix up the bonus db_size if dn_nblkptr has changed */ + if (dn->dn_bonus && dn->dn_nblkptr != old_nblkptr) { + dn->dn_bonus->db.db_size = + DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); + } dn->dn_allocated_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); @@ -646,11 +648,22 @@ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); } -void +/* + * Can only add a reference if there is already at least one + * reference on the dnode. Returns FALSE if unable to add a + * new reference. 
+ */ +boolean_t dnode_add_ref(dnode_t *dn, void *tag) { - ASSERT(refcount_count(&dn->dn_holds) > 0); - (void) refcount_add(&dn->dn_holds, tag); + mutex_enter(&dn->dn_mtx); + if (refcount_is_zero(&dn->dn_holds)) { + mutex_exit(&dn->dn_mtx); + return (FALSE); + } + VERIFY(1 < refcount_add(&dn->dn_holds, tag)); + mutex_exit(&dn->dn_mtx); + return (TRUE); } void @@ -658,7 +671,9 @@ { uint64_t refs; + mutex_enter(&dn->dn_mtx); refs = refcount_remove(&dn->dn_holds, tag); + mutex_exit(&dn->dn_mtx); /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ if (refs == 0 && dn->dn_dbuf) dbuf_rele(dn->dn_dbuf, dn); @@ -694,6 +709,7 @@ ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); + ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0); ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", @@ -716,7 +732,7 @@ * dnode will hang around after we finish processing its * children. */ - dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg); + VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); (void) dbuf_dirty(dn->dn_dbuf, tx); diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/dnode_sync.c --- a/usr/src/uts/common/fs/zfs/dnode_sync.c Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c Sun Aug 26 11:19:04 2007 -0700 @@ -349,8 +349,8 @@ /* * Try to kick all the dnodes dbufs out of the cache... */ -int -dnode_evict_dbufs(dnode_t *dn, boolean_t try) +void +dnode_evict_dbufs(dnode_t *dn) { int progress; int pass = 0; @@ -397,21 +397,6 @@ ASSERT(pass < 100); /* sanity check */ } while (progress); - /* - * This function works fine even if it can't evict everything. - * If were only asked to try to evict everything then - * return an error if we can't. Otherwise panic as the caller - * expects total eviction. 
- */ - if (list_head(&dn->dn_dbufs) != NULL) { - if (try) { - return (1); - } else { - panic("dangling dbufs (dn=%p, dbuf=%p)\n", - dn, list_head(&dn->dn_dbufs)); - } - } - rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); @@ -419,7 +404,6 @@ dn->dn_bonus = NULL; } rw_exit(&dn->dn_struct_rwlock); - return (0); } static void @@ -459,7 +443,7 @@ ASSERT(dmu_tx_is_syncing(tx)); dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); - (void) dnode_evict_dbufs(dn, 0); + dnode_evict_dbufs(dn); ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); /* @@ -565,6 +549,15 @@ dn->dn_next_blksz[txgoff] = 0; } + if (dn->dn_next_bonuslen[txgoff]) { + if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) + dnp->dn_bonuslen = 0; + else + dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; + ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN); + dn->dn_next_bonuslen[txgoff] = 0; + } + if (dn->dn_next_indblkshift[txgoff]) { ASSERT(dnp->dn_nlevels == 1); dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; @@ -588,15 +581,16 @@ dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx); } + /* grab the mutex so we don't race with dnode_block_freed() */ mutex_enter(&dn->dn_mtx); for (rp = avl_first(&dn->dn_ranges[txgoff]); rp; ) { + free_range_t *last = rp; rp = AVL_NEXT(&dn->dn_ranges[txgoff], rp); avl_remove(&dn->dn_ranges[txgoff], last); kmem_free(last, sizeof (free_range_t)); } mutex_exit(&dn->dn_mtx); - if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) { dnode_sync_free(dn, tx); return; diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/dsl_dir.c --- a/usr/src/uts/common/fs/zfs/dsl_dir.c Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c Sun Aug 26 11:19:04 2007 -0700 @@ -691,14 +691,13 @@ * be canceled, using dsl_dir_tempreserve_clear(). 
*/ static int -dsl_dir_tempreserve_impl(dsl_dir_t *dd, - uint64_t asize, boolean_t netfree, list_t *tr_list, dmu_tx_t *tx) +dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, + boolean_t netfree, boolean_t noquota, list_t *tr_list, dmu_tx_t *tx) { uint64_t txg = tx->tx_txg; uint64_t est_used, quota, parent_rsrv; int edquot = EDQUOT; int txgidx = txg & TXG_MASK; - boolean_t ismos; int i; struct tempreserve *tr; @@ -718,7 +717,7 @@ * If this transaction will result in a net free of space, we want * to let it through. */ - if (netfree || dd->dd_phys->dd_quota == 0) + if (netfree || noquota || dd->dd_phys->dd_quota == 0) quota = UINT64_MAX; else quota = dd->dd_phys->dd_quota; @@ -732,8 +731,7 @@ * we're very close to full, this will allow a steady trickle of * removes to get through. */ - ismos = (dd->dd_phys->dd_head_dataset_obj == 0); - if (dd->dd_parent == NULL || ismos) { + if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); if (poolsize < quota) { quota = poolsize; @@ -773,9 +771,11 @@ list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ - if (dd->dd_parent && parent_rsrv && !ismos) { + if (dd->dd_parent && parent_rsrv) { + boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0); + return (dsl_dir_tempreserve_impl(dd->dd_parent, - parent_rsrv, netfree, tr_list, tx)); + parent_rsrv, netfree, ismos, tr_list, tx)); } else { return (0); } @@ -800,7 +800,7 @@ ASSERT3S(asize, >=, 0); ASSERT3S(fsize, >=, 0); - err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, + err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, FALSE, tr_list, tx); if (err == 0) { diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/metaslab.c --- a/usr/src/uts/common/fs/zfs/metaslab.c Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/metaslab.c Sun Aug 26 11:19:04 2007 -0700 @@ -534,8 +534,8 @@ VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); - 
ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(smo, db->db_data, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(smo, db->db_data, sizeof (*smo)); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/spa_misc.c --- a/usr/src/uts/common/fs/zfs/spa_misc.c Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/spa_misc.c Sun Aug 26 11:19:04 2007 -0700 @@ -1016,12 +1016,15 @@ if (!spa->spa_deflate) return (BP_GET_ASIZE(bp)); + spa_config_enter(spa, RW_READER, FTAG); for (i = 0; i < SPA_DVAS_PER_BP; i++) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i])); - sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> SPA_MINBLOCKSHIFT) * - vd->vdev_deflate_ratio; + if (vd) + sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> + SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } + spa_config_exit(spa, FTAG); return (sz); } diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/sys/dbuf.h --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h Sun Aug 26 11:19:04 2007 -0700 @@ -239,7 +239,7 @@ uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); -dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn); +void dbuf_create_bonus(struct dnode *dn); dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/sys/dmu.h --- a/usr/src/uts/common/fs/zfs/sys/dmu.h Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h Sun Aug 26 11:19:04 2007 -0700 @@ -157,7 +157,7 @@ int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, objset_t **osp); void dmu_objset_close(objset_t *os); -int dmu_objset_evict_dbufs(objset_t *os, boolean_t try); +int dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const 
char *name, dmu_objset_type_t type, objset_t *clone_parent, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); @@ -294,6 +294,7 @@ */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); +int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); /* * Obtain the DMU buffer from the specified object which contains the diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/sys/dmu_objset.h --- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h Sun Aug 26 11:19:04 2007 -0700 @@ -108,7 +108,7 @@ int dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); -int dmu_objset_evict_dbufs(objset_t *os, boolean_t try); +int dmu_objset_evict_dbufs(objset_t *os); /* called from dsl */ void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx); diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/sys/dnode.h --- a/usr/src/uts/common/fs/zfs/sys/dnode.h Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h Sun Aug 26 11:19:04 2007 -0700 @@ -64,6 +64,7 @@ #define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) #define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) #define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) +#define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1) #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) @@ -156,6 +157,7 @@ uint64_t dn_maxblkid; uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE]; + uint16_t dn_next_bonuslen[TXG_SIZE]; uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ /* protected by os_lock: */ @@ -197,11 +199,12 @@ uint64_t object); void dnode_special_close(dnode_t *dn); +void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); int dnode_hold(struct 
objset_impl *dd, uint64_t object, void *ref, dnode_t **dnp); int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, void *ref, dnode_t **dnp); -void dnode_add_ref(dnode_t *dn, void *ref); +boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); @@ -226,7 +229,7 @@ void dnode_fini(void); int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl, uint64_t blkfill, uint64_t txg); -int dnode_evict_dbufs(dnode_t *dn, boolean_t try); +void dnode_evict_dbufs(dnode_t *dn); #ifdef ZFS_DEBUG diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/vdev.c --- a/usr/src/uts/common/fs/zfs/vdev.c Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/fs/zfs/vdev.c Sun Aug 26 11:19:04 2007 -0700 @@ -765,8 +765,8 @@ error = dmu_bonus_hold(mos, object, FTAG, &db); if (error) return (error); - ASSERT3U(db->db_size, ==, sizeof (smo)); - bcopy(db->db_data, &smo, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (smo)); + bcopy(db->db_data, &smo, sizeof (smo)); ASSERT3U(smo.smo_object, ==, object); dmu_buf_rele(db, FTAG); } @@ -1234,8 +1234,8 @@ if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) return (error); - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(db->db_data, smo, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(db->db_data, smo, sizeof (*smo)); dmu_buf_rele(db, FTAG); mutex_enter(&vd->vdev_dtl_lock); @@ -1305,8 +1305,8 @@ VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); - ASSERT3U(db->db_size, ==, sizeof (*smo)); - bcopy(smo, db->db_data, db->db_size); + ASSERT3U(db->db_size, >=, sizeof (*smo)); + bcopy(smo, db->db_data, sizeof (*smo)); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/fs/zfs/zfs_vfsops.c --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c Fri Aug 24 17:45:12 2007 -0700 +++ 
b/usr/src/uts/common/fs/zfs/zfs_vfsops.c Sun Aug 26 11:19:04 2007 -0700 @@ -1132,12 +1132,9 @@ } /* - * Evict all dbufs so that cached znodes will be freed + * Evict cached data */ - if (dmu_objset_evict_dbufs(os, B_TRUE)) { - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); - (void) dmu_objset_evict_dbufs(os, B_FALSE); - } + (void) dmu_objset_evict_dbufs(os); /* * Finally close the objset diff -r f73f303e6a06 -r 96d96f8de974 usr/src/uts/common/os/list.c --- a/usr/src/uts/common/os/list.c Fri Aug 24 17:45:12 2007 -0700 +++ b/usr/src/uts/common/os/list.c Sun Aug 26 11:19:04 2007 -0700 @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -113,6 +113,7 @@ { list_node_t *lold = list_d2l(list, object); ASSERT(!list_empty(list)); + ASSERT(lold->list_next != NULL); lold->list_prev->list_next = lold->list_next; lold->list_next->list_prev = lold->list_prev; lold->list_next = lold->list_prev = NULL;